aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--doc/blog/using-covid-19-pubseq-part3.org3
-rw-r--r--scripts/uthsc_samples/.gitignore1
-rw-r--r--scripts/uthsc_samples/template.yaml35
-rw-r--r--scripts/uthsc_samples/uthsc_samples.py57
4 files changed, 95 insertions, 1 deletions
diff --git a/doc/blog/using-covid-19-pubseq-part3.org b/doc/blog/using-covid-19-pubseq-part3.org
index f3ba073..d0d6c7f 100644
--- a/doc/blog/using-covid-19-pubseq-part3.org
+++ b/doc/blog/using-covid-19-pubseq-part3.org
@@ -255,7 +255,8 @@ more metadata is yummy when stored in RDF. [[https://yummydata.org/][Yummydata]]
that many of the terms in above example are URIs, such as
host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606. We use
web ontologies for these to make the data less ambiguous and more
-FAIR. Check out the option fields as defined in the schema. If it is not listed
+FAIR. Check out the option fields as defined in the schema. If it is not listed,
+check the [[https://github.com/arvados/bh20-seq-resource/blob/master/semantic_enrichment/labels.ttl][labels.ttl]] file. Also,
a little bit of web searching may be required or [[./contact][contact]] us.
** Run the uploader (CLI)
diff --git a/scripts/uthsc_samples/.gitignore b/scripts/uthsc_samples/.gitignore
new file mode 100644
index 0000000..8786e3f
--- /dev/null
+++ b/scripts/uthsc_samples/.gitignore
@@ -0,0 +1 @@
+yaml
diff --git a/scripts/uthsc_samples/template.yaml b/scripts/uthsc_samples/template.yaml
new file mode 100644
index 0000000..1175ac8
--- /dev/null
+++ b/scripts/uthsc_samples/template.yaml
@@ -0,0 +1,35 @@
+id: placeholder
+
+license:
+ license_type: http://creativecommons.org/licenses/by/4.0/
+ title: "$sample_name - $locationx"
+ attribution_name: "Mariah Taylor, Colleen Jonsson"
+ attribution_url: https://www.uthsc.edu/medicine/molecular-sciences/faculty-directory/jonsson.php
+
+host:
+ host_id: "$sample_id"
+ host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606
+
+sample:
+ sample_id: "$sample_id"
+ specimen_source: [http://purl.obolibrary.org/obo/NCIT_C155831]
+ collection_date: "$collection_date"
+ collection_location: $location
+
+virus:
+ virus_species: http://purl.obolibrary.org/obo/NCBITaxon_2697049
+ virus_strain: "$strain"
+
+technology:
+ sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0008632]
+ sequence_assembly_method: https://bio.tools/BWA#!
+ additional_technology_information: Oxford Nanopore MiniIon RNA long reads
+
+submitter:
+ authors: [Mariah Taylor, Colleen Jonsson]
+ submitter_name: [Mariah Taylor, Colleen B. Jonsson, Pjotr Prins]
+ submitter_address: UTHSC, Memphis, Tennessee 38163, USA
+ originating_lab: Regional Biocontainment Laboratory, Memphis, TN
+ provider_sample_id: $sample_id
+ submitter_sample_id: $sample_id
+ submitter_orcid: [https://orcid.org/0000-0002-2640-7672,https://orcid.org/0000-0002-8021-9162]
diff --git a/scripts/uthsc_samples/uthsc_samples.py b/scripts/uthsc_samples/uthsc_samples.py
new file mode 100644
index 0000000..5c39398
--- /dev/null
+++ b/scripts/uthsc_samples/uthsc_samples.py
@@ -0,0 +1,57 @@
"""Generate one metadata YAML file per sample listed in a spreadsheet.

Each spreadsheet row is rendered through the ``string.Template`` stored in
``template.yaml`` and written to ``yaml/<Sample ID>.yaml``.  All default
paths are relative to the current working directory.

Usage: ``python uthsc_samples.py [SPREADSHEET.xlsx]``
"""
import os
import pandas as pd
from string import Template
from dateutil.parser import parse
import re

import sys

# Metadata in tabular format in a spreadsheet(?!)
xlsx = '../../test/data/10_samples.xlsx'

# Template in a text file
template_yaml = 'template.yaml'

dir_output = 'yaml'

# Wikidata page used when the city has no entry below (Memphis by default).
DEFAULT_LOCATION = "https://www.wikidata.org/wiki/Q16563"

# City name -> Wikidata page for the collection location.
CITY_LOCATIONS = {
    "Pegram": "https://www.wikidata.org/wiki/Q3289517",
    "Alexander": "https://www.wikidata.org/wiki/Q79663",
    "Smithville": "https://www.wikidata.org/wiki/Q2145339",
    "Nashville": "https://www.wikidata.org/wiki/Q23197",
    "Madison": "https://www.wikidata.org/wiki/Q494755",
}

# Compiled once instead of once per row.  The name is escaped so it is
# matched literally, and the trailing \b stops "Madison" from also
# claiming "Madisonville".
_CITY_PATTERNS = [
    (re.compile(re.escape(city) + r"\b"), url)
    for city, url in CITY_LOCATIONS.items()
]


def resolve_location(place):
    """Return the Wikidata URL for *place*.

    *place* is a label that starts with the city name, e.g.
    ``"Nashville, Tennessee, USA"``.  Labels whose leading city is not in
    ``CITY_LOCATIONS`` resolve to ``DEFAULT_LOCATION``.
    """
    for pattern, url in _CITY_PATTERNS:
        if pattern.match(place):
            return url
    return DEFAULT_LOCATION


def render_sample(template, row):
    """Return the template text filled in for one spreadsheet row.

    *template* is a ``string.Template``; *row* is any mapping that provides
    the ``'Sample ID'``, ``'Collection Date'``, ``'City'`` and ``'State'``
    columns.  Raises ``KeyError`` if the template contains a placeholder
    that is not supplied here.
    """
    sample = row['Sample ID']
    place = row['City'] + ", " + row['State'] + ", USA"
    # str() first: the cell may already be a date-like object rather than text.
    collection_date = parse(str(row['Collection Date'])).strftime('%Y-%m-%d')
    return template.substitute(
        sample_id=sample,
        sample_name=sample,
        collection_date=collection_date,
        location=resolve_location(place),
        locationx=place,
        strain=f"SARS-CoV-2/human/USA/{sample}/2020",
    )


def main(xlsx_path=xlsx, template_path=template_yaml, output_dir=dir_output):
    """Write one ``<Sample ID>.yaml`` file into *output_dir* per row."""
    os.makedirs(output_dir, exist_ok=True)

    table = pd.read_excel(xlsx_path)
    print(table)

    # The template does not change between rows, so read it only once.
    with open(template_path, encoding="utf-8") as f:
        template = Template(f.read())

    for _, row in table.iterrows():
        sample = row['Sample ID']
        print(f"Processing sample {sample}...")

        # Render before opening the output so that a bad row does not
        # leave an empty file behind.
        text = render_sample(template, row)
        out_path = os.path.join(output_dir, f"{sample}.yaml")
        with open(out_path, 'w', encoding="utf-8") as fw:
            fw.write(text)


if __name__ == "__main__":
    # An optional first argument overrides the default spreadsheet path.
    main(sys.argv[1] if len(sys.argv) > 1 else xlsx)