diff options
author | AndreaGuarracino | 2020-09-29 18:46:49 +0200 |
---|---|---|
committer | AndreaGuarracino | 2020-09-29 18:46:49 +0200 |
commit | b3a671f04743dc2bf48049b413d7d1f20d31bbcf (patch) | |
tree | 05a6f0f15564f1da8e858551e644543865700ae8 /scripts | |
parent | c72dab2788d010153d5406f2d5ecbe3824571931 (diff) | |
download | bh20-seq-resource-b3a671f04743dc2bf48049b413d7d1f20d31bbcf.tar.gz bh20-seq-resource-b3a671f04743dc2bf48049b413d7d1f20d31bbcf.tar.lz bh20-seq-resource-b3a671f04743dc2bf48049b413d7d1f20d31bbcf.zip |
esr_samples script refactoring; added a reference to the esr_samples script in the blog as an example of how to parse metadata
Diffstat (limited to 'scripts')
-rw-r--r-- | scripts/dict_ontology_standardization/ncbi_countries.csv | 2 | ||||
-rw-r--r-- | scripts/esr_samples/esr_samples.py | 40 |
2 files changed, 15 insertions, 27 deletions
diff --git a/scripts/dict_ontology_standardization/ncbi_countries.csv b/scripts/dict_ontology_standardization/ncbi_countries.csv index 90d9af3..a710906 100644 --- a/scripts/dict_ontology_standardization/ncbi_countries.csv +++ b/scripts/dict_ontology_standardization/ncbi_countries.csv @@ -280,7 +280,7 @@ Italy:Bologna,http://www.wikidata.org/entity/Q1891 Italy:Cagliari,http://www.wikidata.org/entity/Q1897 Italy:Lazio,http://www.wikidata.org/entity/Q1282 Italy:Milan,http://www.wikidata.org/entity/Q490 -Italy:Lombardia, Milan,http://www.wikidata.org/entity/Q490 +"Italy:Lombardia, Milan",http://www.wikidata.org/entity/Q490 Italy:Palermo,http://www.wikidata.org/entity/Q2656 Italy:Rome,http://www.wikidata.org/entity/Q220 Italy:Turin,http://www.wikidata.org/entity/Q495 diff --git a/scripts/esr_samples/esr_samples.py b/scripts/esr_samples/esr_samples.py index bd59612..06f3d51 100644 --- a/scripts/esr_samples/esr_samples.py +++ b/scripts/esr_samples/esr_samples.py @@ -3,6 +3,12 @@ import pandas as pd from string import Template from dateutil.parser import parse +import sys + +sys.path.append('../') +from utils import check_and_get_ontology_dictionaries + +# Metadata in tabular format path_metadata_xlsx = 'Pathogen.cl.1.0.xlsx' path_template_yaml = 'template.yaml' @@ -13,7 +19,10 @@ path_template_yaml = 'template.yaml' # attribution_name: "ESR" # attribution_url: "https://www.esr.cri.nz/" + +# Read the dictionaries for the ontology dir_dict_ontology_standardization = '../dict_ontology_standardization/' +field_to_term_to_uri_dict = check_and_get_ontology_dictionaries(dir_dict_ontology_standardization) dir_output = 'yaml' suffix = '.consensus' @@ -21,27 +30,6 @@ suffix = '.consensus' if not os.path.exists(dir_output): os.makedirs(dir_output) -term_to_uri_dict = {} - -for path_dict_xxx_csv in [os.path.join(dir_dict_ontology_standardization, name_xxx_csv) for name_xxx_csv in - os.listdir(dir_dict_ontology_standardization) if name_xxx_csv.endswith('.csv')]: - print('Read 
{}'.format(path_dict_xxx_csv)) - - with open(path_dict_xxx_csv) as f: - for line in f: - if len(line.split(',')) > 2: - term, uri = line.strip('\n').split('",') - else: - term, uri = line.strip('\n').split(',') - - term = term.strip('"') - - if term in term_to_uri_dict: - print('Warning: in the dictionaries there are more entries for the same term ({}).'.format(term)) - continue - - term_to_uri_dict[term] = uri - metadata_df = pd.read_excel(path_metadata_xlsx, skiprows=12) # Maybe not the best pandas-way to do this @@ -49,8 +37,8 @@ for index, row in metadata_df.iterrows(): # print(row['*sample_name']) geo_loc_name = row['*geo_loc_name'].replace(': ', ':') - country = '' - if not geo_loc_name in term_to_uri_dict: + + if geo_loc_name not in field_to_term_to_uri_dict['ncbi_countries']: if geo_loc_name in [ 'New Zealand:Counties Manukau', 'New Zealand:Capital and Coast', 'New Zealand:Southern', 'New Zealand:Waikato', @@ -63,14 +51,14 @@ for index, row in metadata_df.iterrows(): print(geo_loc_name) break - country = term_to_uri_dict[geo_loc_name] + country = field_to_term_to_uri_dict['ncbi_countries'][geo_loc_name] d = { - 'host_species': term_to_uri_dict[row['*host']], + 'host_species': field_to_term_to_uri_dict['ncbi_host_species'][row['*host']], 'sample_id': row['*sample_name'], 'collection_date': parse(row['*collection_date']).strftime('%Y-%m-%d'), 'collection_location': country, - 'specimen_source': term_to_uri_dict[row['*isolation_source']], + 'specimen_source': field_to_term_to_uri_dict['ncbi_speciesman_source'][row['*isolation_source']], 'virus_species': 'http://purl.obolibrary.org/obo/NCBITaxon_2697049', 'submitter_sample_id': row['bioproject_accession'], |