diff options
author | AndreaGuarracino | 2020-09-26 12:12:17 +0200 |
---|---|---|
committer | AndreaGuarracino | 2020-09-26 12:12:17 +0200 |
commit | 7c12e4976337a063301be260cb3954bf4303f5e0 (patch) | |
tree | a25effc5a4afca5aee235e2b9e986abfab10ea12 /scripts/esr_samples | |
parent | 8e9247ab29db30e9bb7c87bef5f127b3b5cec699 (diff) | |
download | bh20-seq-resource-7c12e4976337a063301be260cb3954bf4303f5e0.tar.gz bh20-seq-resource-7c12e4976337a063301be260cb3954bf4303f5e0.tar.lz bh20-seq-resource-7c12e4976337a063301be260cb3954bf4303f5e0.zip |
script for processing the metadata of the ESR samples; moved the delete_entries_on_arvados script into the scripts directory
Diffstat (limited to 'scripts/esr_samples')
-rw-r--r-- | scripts/esr_samples/Pathogen.cl.1.0.xlsx | bin | 0 -> 48488 bytes | |||
-rw-r--r-- | scripts/esr_samples/esr_samples.py | 85 | ||||
-rw-r--r-- | scripts/esr_samples/template.yaml | 29 |
3 files changed, 114 insertions, 0 deletions
diff --git a/scripts/esr_samples/Pathogen.cl.1.0.xlsx b/scripts/esr_samples/Pathogen.cl.1.0.xlsx
Binary files differ
new file mode 100644
index 0000000..c7c6393
--- /dev/null
+++ b/scripts/esr_samples/Pathogen.cl.1.0.xlsx
diff --git a/scripts/esr_samples/esr_samples.py b/scripts/esr_samples/esr_samples.py
new file mode 100644
index 0000000..bd59612
--- /dev/null
+++ b/scripts/esr_samples/esr_samples.py
@@ -0,0 +1,95 @@
+import csv
+import os
+import pandas as pd
+from string import Template
+from dateutil.parser import parse
+
+path_metadata_xlsx = 'Pathogen.cl.1.0.xlsx'
+
+path_template_yaml = 'template.yaml'
+# Removed from the template (for now)
+# license:
+#     license_type: "http://creativecommons.org/licenses/by/4.0/"
+#     title: "SARS-CoV-2 New Zealand"
+#     attribution_name: "ESR"
+#     attribution_url: "https://www.esr.cri.nz/"
+
+dir_dict_ontology_standardization = '../dict_ontology_standardization/'
+
+dir_output = 'yaml'
+suffix = '.consensus'
+
+# Sub-national locations that are looked up under the country-level term
+# 'New Zealand' when they have no dictionary entry of their own.
+geo_loc_names_reported_as_country = frozenset([
+    'New Zealand:Counties Manukau', 'New Zealand:Capital and Coast', 'New Zealand:Southern',
+    'New Zealand:Waikato',
+    'New Zealand:Lakes', 'New Zealand:Nelson Marlborough', 'New Zealand:South Canterbury',
+    'New Zealand:MidCentral',
+    'New Zealand:Tairawhiti', 'New Zealand:Hawkes Bay', 'New Zealand:NA', 'New Zealand:Taranaki',
+])
+
+os.makedirs(dir_output, exist_ok=True)
+
+# Map every term found in the CSV dictionaries to its URI.
+term_to_uri_dict = {}
+
+for name_xxx_csv in os.listdir(dir_dict_ontology_standardization):
+    if not name_xxx_csv.endswith('.csv'):
+        continue
+
+    path_dict_xxx_csv = os.path.join(dir_dict_ontology_standardization, name_xxx_csv)
+    print('Read {}'.format(path_dict_xxx_csv))
+
+    # csv.reader copes with quoted terms that contain commas and with blank lines.
+    with open(path_dict_xxx_csv, newline='') as f:
+        for fields in csv.reader(f):
+            if not fields:
+                continue
+
+            if len(fields) != 2:
+                raise ValueError('{}: expected "term,uri" but found {!r}'.format(path_dict_xxx_csv, fields))
+
+            term, uri = fields
+
+            if term in term_to_uri_dict:
+                print('Warning: in the dictionaries there are more entries for the same term ({}).'.format(term))
+                continue
+
+            term_to_uri_dict[term] = uri
+
+metadata_df = pd.read_excel(path_metadata_xlsx, skiprows=12)
+
+# The template is the same for every sample, so it is read only once.
+with open(path_template_yaml) as f:
+    src = Template(f.read())
+
+# Maybe not the best pandas-way to do this
+for _, row in metadata_df.iterrows():
+    geo_loc_name = row['*geo_loc_name'].replace(': ', ':')
+    if geo_loc_name not in term_to_uri_dict:
+        if geo_loc_name in geo_loc_names_reported_as_country:
+            geo_loc_name = 'New Zealand'
+        else:
+            # Skip only this sample rather than abandoning all the remaining rows.
+            print('Warning: unknown location ({}), sample {} skipped.'.format(geo_loc_name, row['*sample_name']))
+            continue
+
+    country = term_to_uri_dict[geo_loc_name]
+
+    d = {
+        'host_species': term_to_uri_dict[row['*host']],
+        'sample_id': row['*sample_name'],
+        # str() first: read_excel may return date cells as datetime objects, which parse() rejects.
+        'collection_date': parse(str(row['*collection_date'])).strftime('%Y-%m-%d'),
+        'collection_location': country,
+        'specimen_source': term_to_uri_dict[row['*isolation_source']],
+        'virus_species': 'http://purl.obolibrary.org/obo/NCBITaxon_2697049',
+
+        'submitter_sample_id': row['bioproject_accession'],
+    }
+
+    with open(os.path.join(dir_output, '{}{}.yaml'.format(row['*sample_name'], suffix)), 'w') as fw:
+        fw.write(src.substitute(d))
+
+print('{} YAML files created.'.format(len([x for x in os.listdir(dir_output) if x.endswith('.yaml')])))
diff --git a/scripts/esr_samples/template.yaml b/scripts/esr_samples/template.yaml
new file mode 100644
index 0000000..98f08a6
--- /dev/null
+++ b/scripts/esr_samples/template.yaml
@@ -0,0 +1,29 @@
+id: placeholder
+
+host:
+    host_species: "$host_species"
+
+sample:
+    sample_id: "$sample_id"
+    collection_date: "$collection_date"
+    collection_location: "$collection_location"
+    specimen_source: ["$specimen_source"]
+
+virus:
+    virus_species: "$virus_species"
+
+technology:
+    sample_sequencing_technology: ["http://www.ebi.ac.uk/efo/EFO_0008632"]
+    sequence_assembly_method: "https://github.com/ESR-NZ/NZ_SARS-CoV-2_genomics"
+    additional_technology_information: "Artic V3 workflow"
+
+submitter:
+    authors: ["Jemma L Geoghegan", "Xiaoyun Ren", "Matthew Storey", "James Hadfield", "Lauren Jelley", "Sarah Jefferies", "Jill Sherwood", "Shevaun Paine", "Sue Huang", "Jordan Douglas", "Fabio K Mendes", "Andrew Sporle", "Michael G Baker", "David R Murdoch", "Nigel French", "Colin R Simpson", "David Welch", "Alexei J Drummond", "Edward C Holmes", "Sebastian Duchene", "Joep de Ligt"]
+    submitter_name: ["Joep de Ligt"]
+    submitter_address: "PO Box 50348, Porirua 5240, New Zealand"
+    originating_lab: "ESR"
+    submitter_sample_id: "$submitter_sample_id"
+    publication: "https://doi.org/10.1101/2020.08.05.20168930"
+    submitter_orcid: ["https://orcid.org/0000-0003-0970-0153"]
+    additional_submitter_information: "2020-08-20"
+
\ No newline at end of file |