From b3a671f04743dc2bf48049b413d7d1f20d31bbcf Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Tue, 29 Sep 2020 18:46:49 +0200 Subject: esr_samples script refactoring; added a reference of the esr_samples script in the blog as an example of how to parse metadata --- doc/blog/using-covid-19-pubseq-part3.org | 16 +++++++++ .../ncbi_countries.csv | 2 +- scripts/esr_samples/esr_samples.py | 40 ++++++++-------------- 3 files changed, 31 insertions(+), 27 deletions(-) diff --git a/doc/blog/using-covid-19-pubseq-part3.org b/doc/blog/using-covid-19-pubseq-part3.org index 4d70e7c..abc260c 100644 --- a/doc/blog/using-covid-19-pubseq-part3.org +++ b/doc/blog/using-covid-19-pubseq-part3.org @@ -21,6 +21,7 @@ - [[#bulk-sequence-uploader][Bulk sequence uploader]] - [[#run-the-uploader-cli][Run the uploader (CLI)]] - [[#example-uploading-bulk-genbank-sequences][Example: uploading bulk GenBank sequences]] + - [[#example-preparing-metadata][Example: preparing metadata]] * Uploading Data @@ -232,6 +233,7 @@ Guix package manager). The web interface using this exact same script so it should just work (TM). + ** Example: uploading bulk GenBank sequences We also use above script to bulk upload GenBank sequences with a [[https://github.com/arvados/bh20-seq-resource/blob/master/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py][FASTA @@ -250,3 +252,17 @@ ls $dir_fasta_and_yaml/*.yaml | while read path_code_yaml; do bh20-seq-uploader --skip-qc $path_code_yaml $path_code_fasta done #+END_SRC + + +** Example: preparing metadata + +Usually, metadata are available in tabular format, like spreadsheets. As an example, we provide a script +[[https://github.com/arvados/bh20-seq-resource/tree/master/scripts/esr_samples][esr_samples.py]] to show you how to parse +your metadata in YAML files ready for the upload. 
To execute the script, go to the ~bh20-seq-resource/scripts/esr_samples~ directory +and execute + +#+BEGIN_SRC sh +python3 esr_samples.py +#+END_SRC + +You will find the YAML files in the ~yaml~ folder which will be created in the same directory. \ No newline at end of file diff --git a/scripts/dict_ontology_standardization/ncbi_countries.csv b/scripts/dict_ontology_standardization/ncbi_countries.csv index 90d9af3..a710906 100644 --- a/scripts/dict_ontology_standardization/ncbi_countries.csv +++ b/scripts/dict_ontology_standardization/ncbi_countries.csv @@ -280,7 +280,7 @@ Italy:Bologna,http://www.wikidata.org/entity/Q1891 Italy:Cagliari,http://www.wikidata.org/entity/Q1897 Italy:Lazio,http://www.wikidata.org/entity/Q1282 Italy:Milan,http://www.wikidata.org/entity/Q490 -Italy:Lombardia, Milan,http://www.wikidata.org/entity/Q490 +"Italy:Lombardia, Milan",http://www.wikidata.org/entity/Q490 Italy:Palermo,http://www.wikidata.org/entity/Q2656 Italy:Rome,http://www.wikidata.org/entity/Q220 Italy:Turin,http://www.wikidata.org/entity/Q495 diff --git a/scripts/esr_samples/esr_samples.py b/scripts/esr_samples/esr_samples.py index bd59612..06f3d51 100644 --- a/scripts/esr_samples/esr_samples.py +++ b/scripts/esr_samples/esr_samples.py @@ -3,6 +3,12 @@ import pandas as pd from string import Template from dateutil.parser import parse +import sys + +sys.path.append('../') +from utils import check_and_get_ontology_dictionaries + +# Metadata in tabular format path_metadata_xlsx = 'Pathogen.cl.1.0.xlsx' path_template_yaml = 'template.yaml' @@ -13,7 +19,10 @@ path_template_yaml = 'template.yaml' # attribution_name: "ESR" # attribution_url: "https://www.esr.cri.nz/" + +# Read the dictionaries for the ontology dir_dict_ontology_standardization = '../dict_ontology_standardization/' +field_to_term_to_uri_dict = check_and_get_ontology_dictionaries(dir_dict_ontology_standardization) dir_output = 'yaml' suffix = '.consensus' @@ -21,27 +30,6 @@ suffix = '.consensus' if not os.path.exists(dir_output): 
os.makedirs(dir_output) -term_to_uri_dict = {} - -for path_dict_xxx_csv in [os.path.join(dir_dict_ontology_standardization, name_xxx_csv) for name_xxx_csv in - os.listdir(dir_dict_ontology_standardization) if name_xxx_csv.endswith('.csv')]: - print('Read {}'.format(path_dict_xxx_csv)) - - with open(path_dict_xxx_csv) as f: - for line in f: - if len(line.split(',')) > 2: - term, uri = line.strip('\n').split('",') - else: - term, uri = line.strip('\n').split(',') - - term = term.strip('"') - - if term in term_to_uri_dict: - print('Warning: in the dictionaries there are more entries for the same term ({}).'.format(term)) - continue - - term_to_uri_dict[term] = uri - metadata_df = pd.read_excel(path_metadata_xlsx, skiprows=12) # Maybe not the best pandas-way to do this @@ -49,8 +37,8 @@ for index, row in metadata_df.iterrows(): # print(row['*sample_name']) geo_loc_name = row['*geo_loc_name'].replace(': ', ':') - country = '' - if not geo_loc_name in term_to_uri_dict: + + if geo_loc_name not in field_to_term_to_uri_dict['ncbi_countries']: if geo_loc_name in [ 'New Zealand:Counties Manukau', 'New Zealand:Capital and Coast', 'New Zealand:Southern', 'New Zealand:Waikato', @@ -63,14 +51,14 @@ for index, row in metadata_df.iterrows(): print(geo_loc_name) break - country = term_to_uri_dict[geo_loc_name] + country = field_to_term_to_uri_dict['ncbi_countries'][geo_loc_name] d = { - 'host_species': term_to_uri_dict[row['*host']], + 'host_species': field_to_term_to_uri_dict['ncbi_host_species'][row['*host']], 'sample_id': row['*sample_name'], 'collection_date': parse(row['*collection_date']).strftime('%Y-%m-%d'), 'collection_location': country, - 'specimen_source': term_to_uri_dict[row['*isolation_source']], + 'specimen_source': field_to_term_to_uri_dict['ncbi_speciesman_source'][row['*isolation_source']], 'virus_species': 'http://purl.obolibrary.org/obo/NCBITaxon_2697049', 'submitter_sample_id': row['bioproject_accession'], -- cgit v1.2.3