esr_samples script refactoring; added a reference to the esr_samples script in the blog as an example of how to parse metadata

author: AndreaGuarracino 2020-09-29 18:46:49 +0200
committer: AndreaGuarracino 2020-09-29 18:46:49 +0200
commit: b3a671f04743dc2bf48049b413d7d1f20d31bbcf (patch)
tree: 05a6f0f15564f1da8e858551e644543865700ae8 /scripts
parent: c72dab2788d010153d5406f2d5ecbe3824571931 (diff)
download: bh20-seq-resource-b3a671f04743dc2bf48049b413d7d1f20d31bbcf.tar.gz
bh20-seq-resource-b3a671f04743dc2bf48049b413d7d1f20d31bbcf.tar.lz
bh20-seq-resource-b3a671f04743dc2bf48049b413d7d1f20d31bbcf.zip
2 files changed, 15 insertions, 27 deletions
diff --git a/scripts/dict_ontology_standardization/ncbi_countries.csv b/scripts/dict_ontology_standardization/ncbi_countries.csv
index 90d9af3..a710906 100644
--- a/scripts/dict_ontology_standardization/ncbi_countries.csv
+++ b/scripts/dict_ontology_standardization/ncbi_countries.csv
@@ -280,7 +280,7 @@ Italy:Bologna,http://www.wikidata.org/entity/Q1891
 Italy:Cagliari,http://www.wikidata.org/entity/Q1897
 Italy:Lazio,http://www.wikidata.org/entity/Q1282
 Italy:Milan,http://www.wikidata.org/entity/Q490
-Italy:Lombardia, Milan,http://www.wikidata.org/entity/Q490
+"Italy:Lombardia, Milan",http://www.wikidata.org/entity/Q490
 Italy:Palermo,http://www.wikidata.org/entity/Q2656
 Italy:Rome,http://www.wikidata.org/entity/Q220
 Italy:Turin,http://www.wikidata.org/entity/Q495
diff --git a/scripts/esr_samples/esr_samples.py b/scripts/esr_samples/esr_samples.py
index bd59612..06f3d51 100644
--- a/scripts/esr_samples/esr_samples.py
+++ b/scripts/esr_samples/esr_samples.py
@@ -3,6 +3,12 @@ import pandas as pd
 from string import Template
 from dateutil.parser import parse
 
+import sys
+
+sys.path.append('../')
+from utils import check_and_get_ontology_dictionaries
+
+# Metadata in tabular format
 path_metadata_xlsx = 'Pathogen.cl.1.0.xlsx'
 
 path_template_yaml = 'template.yaml'
@@ -13,7 +19,10 @@ path_template_yaml = 'template.yaml'
 #    attribution_name: "ESR"
 #    attribution_url: "https://www.esr.cri.nz/"
 
+
+# Read the dictionaries for the ontology
 dir_dict_ontology_standardization = '../dict_ontology_standardization/'
+field_to_term_to_uri_dict = check_and_get_ontology_dictionaries(dir_dict_ontology_standardization)
 
 dir_output = 'yaml'
 suffix = '.consensus'
@@ -21,27 +30,6 @@ suffix = '.consensus'
 if not os.path.exists(dir_output):
     os.makedirs(dir_output)
 
-term_to_uri_dict = {}
-
-for path_dict_xxx_csv in [os.path.join(dir_dict_ontology_standardization, name_xxx_csv) for name_xxx_csv in
-                          os.listdir(dir_dict_ontology_standardization) if name_xxx_csv.endswith('.csv')]:
-    print('Read {}'.format(path_dict_xxx_csv))
-
-    with open(path_dict_xxx_csv) as f:
-        for line in f:
-            if len(line.split(',')) > 2:
-                term, uri = line.strip('\n').split('",')
-            else:
-                term, uri = line.strip('\n').split(',')
-
-            term = term.strip('"')
-
-            if term in term_to_uri_dict:
-                print('Warning: in the dictionaries there are more entries for the same term ({}).'.format(term))
-                continue
-
-            term_to_uri_dict[term] = uri
-
 metadata_df = pd.read_excel(path_metadata_xlsx, skiprows=12)
 
 # Maybe not the best pandas-way to do this
@@ -49,8 +37,8 @@ for index, row in metadata_df.iterrows():
     # print(row['*sample_name'])
 
     geo_loc_name = row['*geo_loc_name'].replace(': ', ':')
-    country = ''
-    if not geo_loc_name in term_to_uri_dict:
+
+    if geo_loc_name not in field_to_term_to_uri_dict['ncbi_countries']:
         if geo_loc_name in [
             'New Zealand:Counties Manukau', 'New Zealand:Capital and Coast', 'New Zealand:Southern',
             'New Zealand:Waikato',
@@ -63,14 +51,14 @@ for index, row in metadata_df.iterrows():
             print(geo_loc_name)
             break
 
-    country = term_to_uri_dict[geo_loc_name]
+    country = field_to_term_to_uri_dict['ncbi_countries'][geo_loc_name]
 
     d = {
-        'host_species': term_to_uri_dict[row['*host']],
+        'host_species': field_to_term_to_uri_dict['ncbi_host_species'][row['*host']],
         'sample_id': row['*sample_name'],
         'collection_date': parse(row['*collection_date']).strftime('%Y-%m-%d'),
         'collection_location': country,
-        'specimen_source': term_to_uri_dict[row['*isolation_source']],
+        'specimen_source': field_to_term_to_uri_dict['ncbi_speciesman_source'][row['*isolation_source']],
         'virus_species': 'http://purl.obolibrary.org/obo/NCBITaxon_2697049',
 
         'submitter_sample_id': row['bioproject_accession'],
author	AndreaGuarracino	2020-09-29 18:46:49 +0200
committer	AndreaGuarracino	2020-09-29 18:46:49 +0200
commit	b3a671f04743dc2bf48049b413d7d1f20d31bbcf (patch)
tree	05a6f0f15564f1da8e858551e644543865700ae8 /scripts
parent	c72dab2788d010153d5406f2d5ecbe3824571931 (diff)
download	bh20-seq-resource-b3a671f04743dc2bf48049b413d7d1f20d31bbcf.tar.gz bh20-seq-resource-b3a671f04743dc2bf48049b413d7d1f20d31bbcf.tar.lz bh20-seq-resource-b3a671f04743dc2bf48049b413d7d1f20d31bbcf.zip