From 43d7264dda8061a024befbc9ca0a89d7159b1e40 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Fri, 6 Nov 2020 09:52:32 +0000 Subject: UTHSC upload info --- doc/blog/using-covid-19-pubseq-part3.org | 3 +- scripts/uthsc_samples/.gitignore | 1 + scripts/uthsc_samples/template.yaml | 35 ++++++++++++++++++++ scripts/uthsc_samples/uthsc_samples.py | 57 ++++++++++++++++++++++++++++++++ 4 files changed, 95 insertions(+), 1 deletion(-) create mode 100644 scripts/uthsc_samples/.gitignore create mode 100644 scripts/uthsc_samples/template.yaml create mode 100644 scripts/uthsc_samples/uthsc_samples.py diff --git a/doc/blog/using-covid-19-pubseq-part3.org b/doc/blog/using-covid-19-pubseq-part3.org index f3ba073..d0d6c7f 100644 --- a/doc/blog/using-covid-19-pubseq-part3.org +++ b/doc/blog/using-covid-19-pubseq-part3.org @@ -255,7 +255,8 @@ more metadata is yummy when stored in RDF. [[https://yummydata.org/][Yummydata]] that many of the terms in above example are URIs, such as host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606. We use web ontologies for these to make the data less ambiguous and more -FAIR. Check out the option fields as defined in the schema. If it is not listed +FAIR. Check out the option fields as defined in the schema. If it is not listed, +check the [[https://github.com/arvados/bh20-seq-resource/blob/master/semantic_enrichment/labels.ttl][labels.ttl]] file. Also, a little bit of web searching may be required or [[./contact][contact]] us. 
** Run the uploader (CLI) diff --git a/scripts/uthsc_samples/.gitignore b/scripts/uthsc_samples/.gitignore new file mode 100644 index 0000000..8786e3f --- /dev/null +++ b/scripts/uthsc_samples/.gitignore @@ -0,0 +1 @@ +yaml diff --git a/scripts/uthsc_samples/template.yaml b/scripts/uthsc_samples/template.yaml new file mode 100644 index 0000000..1175ac8 --- /dev/null +++ b/scripts/uthsc_samples/template.yaml @@ -0,0 +1,35 @@ +id: placeholder + +license: + license_type: http://creativecommons.org/licenses/by/4.0/ + title: "$sample_name - $locationx" + attribution_name: "Mariah Taylor, Colleen Jonsson" + attribution_url: https://www.uthsc.edu/medicine/molecular-sciences/faculty-directory/jonsson.php + +host: + host_id: "$sample_id" + host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606 + +sample: + sample_id: "$sample_id" + specimen_source: [http://purl.obolibrary.org/obo/NCIT_C155831] + collection_date: "$collection_date" + collection_location: $location + +virus: + virus_species: http://purl.obolibrary.org/obo/NCBITaxon_2697049 + virus_strain: "$strain" + +technology: + sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0008632] + sequence_assembly_method: https://bio.tools/BWA#! + additional_technology_information: Oxford Nanopore MiniIon RNA long reads + +submitter: + authors: [Mariah Taylor, Colleen Jonsson] + submitter_name: [Mariah Taylor, Colleen B. 
Jonsson, Pjotr Prins] + submitter_address: UTHSC, Memphis, Tennessee 38163, USA + originating_lab: Regional Biocontainment Laboratory, Memphis, TN + provider_sample_id: $sample_id + submitter_sample_id: $sample_id + submitter_orcid: [https://orcid.org/0000-0002-2640-7672,https://orcid.org/0000-0002-8021-9162] diff --git a/scripts/uthsc_samples/uthsc_samples.py b/scripts/uthsc_samples/uthsc_samples.py new file mode 100644 index 0000000..5c39398 --- /dev/null +++ b/scripts/uthsc_samples/uthsc_samples.py @@ -0,0 +1,57 @@
+import os
+import pandas as pd
+from string import Template
+from dateutil.parser import parse
+import re
+
+import sys
+
+# Metadata in tabular format in a spreadsheet(?!)
+xlsx = '../../test/data/10_samples.xlsx'
+
+# Template in a text file
+template_yaml = 'template.yaml'
+
+dir_output = 'yaml'
+
+# Wikidata URI used when no pattern below matches
+default_location = "https://www.wikidata.org/wiki/Q16563" # Memphis by default
+
+# Regex matched at the start of "City, State, USA" -> Wikidata URI; first hit wins
+location_patterns = {
+    re.compile("Pegram"): "https://www.wikidata.org/wiki/Q3289517",
+    re.compile("Alexander"): "https://www.wikidata.org/wiki/Q79663",
+    re.compile("Smithville"): "https://www.wikidata.org/wiki/Q2145339",
+    re.compile("Nashville"): "https://www.wikidata.org/wiki/Q23197",
+    re.compile("Madison"): "https://www.wikidata.org/wiki/Q494755",
+}
+
+os.makedirs(dir_output, exist_ok=True)
+
+# The template is the same for every row: read and parse it once, not per sample
+with open(template_yaml) as f:
+    text = Template(f.read())
+
+table = pd.read_excel(xlsx)
+
+print(table)
+
+# Write one <Sample ID>.yaml metadata file per spreadsheet row
+for index, row in table.iterrows():
+    sample = row['Sample ID']
+    print(f"Processing sample {sample}...")
+
+    collection_date = parse(str(row['Collection Date'])).strftime('%Y-%m-%d')
+    locationx = row['City']+", "+row['State']+", USA"
+    location = next((uri for pattern, uri in location_patterns.items()
+                     if pattern.match(locationx)), default_location)
+    strain = f"SARS-CoV-2/human/USA/{sample}/2020"
+
+    # Open the output last: a row whose date or location fails leaves no empty file
+    with open(os.path.join(dir_output, f"{sample}.yaml"), 'w') as fw:
+        fw.write(text.substitute(sample_id=sample,
+                                 sample_name=sample,
+                                 collection_date=collection_date,
+                                 location=location,
+                                 locationx=locationx,
+                                 strain=strain))
-- cgit v1.2.3