3 files changed, 113 insertions, 96 deletions
diff --git a/scripts/create_sra_metadata/create_sra_metadata.py b/scripts/create_sra_metadata/create_sra_metadata.py
index b872896..d94093e 100644
--- a/scripts/create_sra_metadata/create_sra_metadata.py
+++ b/scripts/create_sra_metadata/create_sra_metadata.py
@@ -14,7 +14,10 @@ from dateutil.parser import parse
 import xml.etree.ElementTree as ET
 import json
 import gzip
+
 import sys
+sys.path.append('../')
+from utils import is_integer, check_and_get_ontology_dictionaries
 
 dir_yaml = 'yaml'
 
@@ -29,14 +32,6 @@ dir_dict_ontology_standardization = args.dict_ontology
 path_sra_study_accessions_txt = 'SRAStudyAccessions.{}.txt'.format(date)
 
 
-def is_integer(string_to_check):
-    try:
-        int(string_to_check)
-        return True
-    except ValueError:
-        return False
-
-
 accession_to_ignore_set = set()
 
 if args.ids_to_ignore:
@@ -64,25 +59,7 @@ if args.ids_to_consider:
         print('There are {} accessions to consider.'.format(len(accession_to_consider_set)))
 
 
-term_to_uri_dict = {}
-
-for path_dict_xxx_csv in [os.path.join(dir_dict_ontology_standardization, name_xxx_csv) for name_xxx_csv in os.listdir(dir_dict_ontology_standardization) if name_xxx_csv.endswith('.csv')]:
-    print('Read {}'.format(path_dict_xxx_csv))
-
-    with open(path_dict_xxx_csv) as f:
-        for line in f:
-            if len(line.split(',')) > 2:
-                term, uri = line.strip('\n').split('",')
-            else:
-                term, uri = line.strip('\n').split(',')
-
-            term = term.strip('"')
-
-            if term in term_to_uri_dict:
-                print('Warning: in the dictionaries there are more entries for the same term ({}).'.format(term))
-                continue
-
-            term_to_uri_dict[term] = uri
+field_to_term_to_uri_dict = check_and_get_ontology_dictionaries(dir_dict_ontology_standardization)
 
 
 if not os.path.exists(dir_yaml):
@@ -151,13 +128,13 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
                 if VALUE_text.lower() in ['homo sapien', 'homosapiens']:
                     VALUE_text = 'Homo sapiens'
 
-                if VALUE_text in term_to_uri_dict:
-                    info_for_yaml_dict['host']['host_species'] = term_to_uri_dict[VALUE_text]
+                if VALUE_text in field_to_term_to_uri_dict['ncbi_host_species']:
+                    info_for_yaml_dict['host']['host_species'] = field_to_term_to_uri_dict['ncbi_host_species'][VALUE_text]
                 else:
                     missing_value_list.append('\t'.join([accession, 'host_species', VALUE_text]))
             elif TAG_text in ['host_health_status', 'host health state']:
-                if VALUE_text in term_to_uri_dict:
-                    info_for_yaml_dict['host']['host_health_status'] = term_to_uri_dict[VALUE_text]
+                if VALUE_text in field_to_term_to_uri_dict['ncbi_host_health_status']:
+                    info_for_yaml_dict['host']['host_health_status'] = field_to_term_to_uri_dict['ncbi_host_health_status'][VALUE_text]
                 elif VALUE_text.strip("'") not in ['missing', 'not collected', 'not provided']:
                     missing_value_list.append('\t'.join([accession, 'host_health_status', VALUE_text]))
             elif TAG_text in ['strain', 'isolate']:
@@ -167,27 +144,27 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
                     if value_to_insert.lower() in ['homo sapien', 'homosapiens']:
                         value_to_insert = 'Homo sapiens'
 
-                    if value_to_insert in term_to_uri_dict:
-                        value_to_insert = term_to_uri_dict[value_to_insert]
+                    if value_to_insert in field_to_term_to_uri_dict['ncbi_host_species']:
+                        value_to_insert = field_to_term_to_uri_dict['ncbi_host_species'][value_to_insert]
 
                     if 'virus_strain' not in info_for_yaml_dict:
                         info_for_yaml_dict['virus']['virus_strain'] = value_to_insert
                     else:
                         info_for_yaml_dict['virus']['virus_strain'] += '; ' + value_to_insert
             elif TAG_text in ['isolation_source', 'isolation source host-associated']:
-                if VALUE_text in term_to_uri_dict:
-                    info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict[VALUE_text]]
+                if VALUE_text in field_to_term_to_uri_dict['ncbi_speciesman_source']:
+                    info_for_yaml_dict['sample']['specimen_source'] = [field_to_term_to_uri_dict['ncbi_speciesman_source'][VALUE_text]]
                 else:
                     if VALUE_text.lower() in ['np/op', 'np/op swab', 'np/np swab', 'nasopharyngeal and oropharyngeal swab', 'nasopharyngeal/oropharyngeal swab', 'combined nasopharyngeal and oropharyngeal swab', 'naso and/or oropharyngeal swab']:
-                        info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal swab'], term_to_uri_dict['oropharyngeal swab']]
+                        info_for_yaml_dict['sample']['specimen_source'] = [field_to_term_to_uri_dict['ncbi_speciesman_source']['nasopharyngeal swab'], field_to_term_to_uri_dict['ncbi_speciesman_source']['oropharyngeal swab']]
                     elif VALUE_text.lower() in ['nasopharyngeal swab/throat swab', 'nasopharyngeal/throat swab', 'nasopharyngeal swab and throat swab', 'nasal swab and throat swab', 'nasopharyngeal aspirate/throat swab', 'Nasopharyngeal/Throat']:
-                        info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal swab'], term_to_uri_dict['throat swab']]
+                        info_for_yaml_dict['sample']['specimen_source'] = [field_to_term_to_uri_dict['ncbi_speciesman_source']['nasopharyngeal swab'], field_to_term_to_uri_dict['ncbi_speciesman_source']['throat swab']]
                     elif VALUE_text.lower() in ['nasopharyngeal aspirate & throat swab', 'nasopharyngeal aspirate and throat swab']:
-                        info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal aspirate'], term_to_uri_dict['throat swab']]
+                        info_for_yaml_dict['sample']['specimen_source'] = [field_to_term_to_uri_dict['ncbi_speciesman_source']['nasopharyngeal aspirate'], field_to_term_to_uri_dict['ncbi_speciesman_source']['throat swab']]
                     elif VALUE_text.lower() in ['nasal swab and throat swab']:
-                        info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasal swab'], term_to_uri_dict['throat swab']]
+                        info_for_yaml_dict['sample']['specimen_source'] = [field_to_term_to_uri_dict['ncbi_speciesman_source']['nasal swab'], field_to_term_to_uri_dict['ncbi_speciesman_source']['throat swab']]
                     elif VALUE_text.lower() in ['nasal-swab and oro-pharyngeal swab']:
-                        info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasal swab'], term_to_uri_dict['oropharyngeal swab']]
+                        info_for_yaml_dict['sample']['specimen_source'] = [field_to_term_to_uri_dict['ncbi_speciesman_source']['nasal swab'], field_to_term_to_uri_dict['ncbi_speciesman_source']['oropharyngeal swab']]
                     elif VALUE_text.strip("'") not in ['missing', 'not collected', 'unknown', 'not provided', 'not applicable', 'N/A']:
                         missing_value_list.append('\t'.join([accession, 'specimen_source', VALUE_text]))
             elif TAG_text in ['host_sex', 'host sex']:
@@ -238,8 +215,8 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
                 if ': ' in VALUE_text:
                     VALUE_text = VALUE_text.replace(': ', ':')
 
-                if VALUE_text in term_to_uri_dict:
-                    info_for_yaml_dict['sample']['collection_location'] = term_to_uri_dict[VALUE_text]
+                if VALUE_text in field_to_term_to_uri_dict['ncbi_countries']:
+                    info_for_yaml_dict['sample']['collection_location'] = field_to_term_to_uri_dict['ncbi_countries'][VALUE_text]
                 elif VALUE_text.lower() not in ['na', 'not applicable']:
                     missing_value_list.append('\t'.join([accession, 'geo_loc_name', VALUE_text]))
             #else:
@@ -255,8 +232,8 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
     INSTRUMENT_MODEL = [x.text for x in EXPERIMENT.find('PLATFORM').iter('INSTRUMENT_MODEL')][0]
 
     if INSTRUMENT_MODEL.lower() != 'unspecified':
-        if INSTRUMENT_MODEL in term_to_uri_dict:
-            info_for_yaml_dict['technology']['sample_sequencing_technology'] = [term_to_uri_dict[INSTRUMENT_MODEL]]
+        if INSTRUMENT_MODEL in field_to_term_to_uri_dict['ncbi_sequencing_technology']:
+            info_for_yaml_dict['technology']['sample_sequencing_technology'] = [field_to_term_to_uri_dict['ncbi_sequencing_technology'][INSTRUMENT_MODEL]]
         else:
             missing_value_list.append('\t'.join([accession, 'sample_sequencing_technology', INSTRUMENT_MODEL]))
     #else:
diff --git a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
index 52aee4e..9b8fedc 100755
--- a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
+++ b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
@@ -16,11 +16,15 @@ import xml.etree.ElementTree as ET
 import json
 import os
 import requests
-import sys
 
 from datetime import date
 from dateutil.parser import parse
 
+import sys
+sys.path.append('../')
+from utils import is_integer, chunks, check_and_get_ontology_dictionaries
+
+
 num_ids_for_request = 100
 
 dir_metadata = 'metadata_from_nuccore'
@@ -30,16 +34,9 @@ dir_dict_ontology_standardization = args.dict_ontology
 today_date = date.today().strftime("%Y.%m.%d")
 path_ncbi_virus_accession = 'sequences.{}.acc'.format(today_date)
 
-def is_integer(string_to_check):
-    try:
-        int(string_to_check)
-        return True
-    except ValueError:
-        return False
 
-def chunks(lst, n):
-    for i in range(0, len(lst), n):
-        yield lst[i:i + n]
+field_to_term_to_uri_dict = check_and_get_ontology_dictionaries(dir_dict_ontology_standardization)
+
 
 if os.path.exists(dir_metadata):
     print("The directory '{}' already exists.".format(dir_metadata))
@@ -123,29 +120,10 @@ if not os.path.exists(dir_metadata):
             )
 
 
-term_to_uri_dict = {}
-
-for path_dict_xxx_csv in [os.path.join(dir_dict_ontology_standardization, name_xxx_csv) for name_xxx_csv in os.listdir(dir_dict_ontology_standardization) if name_xxx_csv.endswith('.csv')]:
-    print('Read {}'.format(path_dict_xxx_csv))
-
-    with open(path_dict_xxx_csv) as f:
-        for line in f:
-            if len(line.split(',')) > 2:
-                term, uri = line.strip('\n').split('",')
-            else:
-                term, uri = line.strip('\n').split(',')
-
-            term = term.strip('"')
-
-            if term in term_to_uri_dict:
-                print('Warning: in the dictionaries there are more entries for the same term ({}).'.format(term))
-                continue
-
-            term_to_uri_dict[term] = uri
-
 if not os.path.exists(dir_fasta_and_yaml):
     os.makedirs(dir_fasta_and_yaml)
 
+
 min_len_to_count = 15000
 num_seq_with_len_ge_X_bp = 0
 
@@ -166,7 +144,7 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
             continue
 
         try:
-            #print(path_metadata_xxx_xml, accession_version)
+            # print(path_metadata_xxx_xml, accession_version)
 
             # A general default-empty yaml could be read from the definitive one
             info_for_yaml_dict = {
@@ -230,8 +208,8 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
                             new_seq_tec_list = []
                             for seq_tec in tech_info_to_parse.split(';'):
                                 seq_tec = seq_tec.strip()
-                                if seq_tec in term_to_uri_dict:
-                                    seq_tec = term_to_uri_dict[seq_tec]
+                                if seq_tec in field_to_term_to_uri_dict['ncbi_sequencing_technology']:
+                                    seq_tec = field_to_term_to_uri_dict['ncbi_sequencing_technology'][seq_tec]
                                     new_seq_tec_list.append(seq_tec)
                                 else:
                                     missing_value_list.append('\t'.join([accession_version, 'sample_sequencing_technology', seq_tec]))
@@ -256,17 +234,17 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
 
                     if GBQualifier_name_text == 'host':
                         GBQualifier_value_text = GBQualifier_value_text.split(';')[0] # For case like Homo sapiens;sex:female
-                        if GBQualifier_value_text in term_to_uri_dict:
+                        if GBQualifier_value_text in field_to_term_to_uri_dict['ncbi_host_species']:
                             # Cases like 'Felis catus; Domestic Shorthair'
-                            info_for_yaml_dict['host']['host_species'] = term_to_uri_dict[GBQualifier_value_text]
+                            info_for_yaml_dict['host']['host_species'] = field_to_term_to_uri_dict['ncbi_host_species'][GBQualifier_value_text]
                         else:
                             GBQualifier_value_text_list = GBQualifier_value_text.split('; ')
 
-                            if GBQualifier_value_text_list[0] in term_to_uri_dict:
-                                info_for_yaml_dict['host']['host_species'] = term_to_uri_dict[GBQualifier_value_text_list[0]]
+                            if GBQualifier_value_text_list[0] in field_to_term_to_uri_dict['ncbi_host_species']:
+                                info_for_yaml_dict['host']['host_species'] = field_to_term_to_uri_dict['ncbi_host_species'][GBQualifier_value_text_list[0]]
                             elif GBQualifier_value_text_list[0] and ('MT215193' in accession_version or 'MT270814' in accession_version):
                                 # Information checked manually from NCBI Virus
-                                info_for_yaml_dict['host']['host_species'] = term_to_uri_dict['Canis lupus familiaris']
+                                info_for_yaml_dict['host']['host_species'] = field_to_term_to_uri_dict['ncbi_host_species']['Canis lupus familiaris']
                             else:
                                 missing_value_list.append('\t'.join([accession_version, 'host_species', GBQualifier_value_text_list[0]]))
 
@@ -295,8 +273,8 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
 
                                 if host_sex in ['male', 'female']:
                                     info_for_yaml_dict['host']['host_sex'] = "http://purl.obolibrary.org/obo/PATO_0000384" if host_sex == 'male' else "http://purl.obolibrary.org/obo/PATO_0000383"
-                                elif GBQualifier_value_text_list[1] in term_to_uri_dict:
-                                    info_for_yaml_dict['host']['host_health_status'] = term_to_uri_dict[GBQualifier_value_text_list[1]]
+                                elif GBQualifier_value_text_list[1] in field_to_term_to_uri_dict['ncbi_host_health_status']:
+                                    info_for_yaml_dict['host']['host_health_status'] = field_to_term_to_uri_dict['ncbi_host_health_status'][GBQualifier_value_text_list[1]]
                                 else:
                                     missing_value_list.append('\t'.join([accession_version, 'host_sex or host_health_status', GBQualifier_value_text_list[1]]))
 
@@ -318,25 +296,25 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
                         else:
                             info_for_yaml_dict['sample']['collector_name'] = GBQualifier_value_text
                     elif GBQualifier_name_text == 'isolation_source':
-                        if GBQualifier_value_text.upper() in term_to_uri_dict:
-                            GBQualifier_value_text = GBQualifier_value_text.upper() # For example, in case of 'usa: wa'
+                        if GBQualifier_value_text.upper() in field_to_term_to_uri_dict['ncbi_speciesman_source']:
+                            GBQualifier_value_text = GBQualifier_value_text.upper()  # For example, in case of 'usa: wa'
 
                         # Little cleaning
                         GBQualifier_value_text = GBQualifier_value_text.strip("/'")
 
-                        if GBQualifier_value_text in term_to_uri_dict:
-                            info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict[GBQualifier_value_text]]
+                        if GBQualifier_value_text in field_to_term_to_uri_dict['ncbi_speciesman_source']:
+                            info_for_yaml_dict['sample']['specimen_source'] = [field_to_term_to_uri_dict['ncbi_speciesman_source'][GBQualifier_value_text]]
                         else:
                             if GBQualifier_value_text.lower() in ['np/op', 'np/op swab', 'np/np swab', 'nasopharyngeal and oropharyngeal swab', 'nasopharyngeal/oropharyngeal swab', 'combined nasopharyngeal and oropharyngeal swab', 'naso and/or oropharyngeal swab']:
-                                info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal swab'], term_to_uri_dict['oropharyngeal swab']]
+                                info_for_yaml_dict['sample']['specimen_source'] = [field_to_term_to_uri_dict['ncbi_speciesman_source']['nasopharyngeal swab'], field_to_term_to_uri_dict['ncbi_speciesman_source']['oropharyngeal swab']]
                             elif GBQualifier_value_text.lower() in ['nasopharyngeal swab/throat swab', 'nasopharyngeal/throat swab', 'nasopharyngeal swab and throat swab', 'nasal swab and throat swab', 'nasopharyngeal aspirate/throat swab', 'Nasopharyngeal/Throat']:
-                                info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal swab'], term_to_uri_dict['throat swab']]
+                                info_for_yaml_dict['sample']['specimen_source'] = [field_to_term_to_uri_dict['ncbi_speciesman_source']['nasopharyngeal swab'], field_to_term_to_uri_dict['ncbi_speciesman_source']['throat swab']]
                             elif GBQualifier_value_text.lower() in ['nasopharyngeal aspirate & throat swab', 'nasopharyngeal aspirate and throat swab']:
-                                info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal aspirate'], term_to_uri_dict['throat swab']]
+                                info_for_yaml_dict['sample']['specimen_source'] = [field_to_term_to_uri_dict['ncbi_speciesman_source']['nasopharyngeal aspirate'], field_to_term_to_uri_dict['ncbi_speciesman_source']['throat swab']]
                             elif GBQualifier_value_text.lower() in ['nasal swab and throat swab']:
-                                info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasal swab'], term_to_uri_dict['throat swab']]
+                                info_for_yaml_dict['sample']['specimen_source'] = [field_to_term_to_uri_dict['ncbi_speciesman_source']['nasal swab'], field_to_term_to_uri_dict['ncbi_speciesman_source']['throat swab']]
                             elif GBQualifier_value_text.lower() in ['nasal-swab and oro-pharyngeal swab']:
-                                info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasal swab'], term_to_uri_dict['oropharyngeal swab']]
+                                info_for_yaml_dict['sample']['specimen_source'] = [field_to_term_to_uri_dict['ncbi_speciesman_source']['nasal swab'], field_to_term_to_uri_dict['ncbi_speciesman_source']['oropharyngeal swab']]
                             else:
                                 missing_value_list.append('\t'.join([accession_version, 'specimen_source', GBQualifier_value_text]))
                     elif GBQualifier_name_text == 'collection_date':
@@ -371,8 +349,8 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
                         if GBQualifier_name_text == 'country' and ': ' in GBQualifier_value_text:
                             GBQualifier_value_text = GBQualifier_value_text.replace(': ', ':')
 
-                        if GBQualifier_value_text in term_to_uri_dict:
-                            info_for_yaml_dict['sample']['collection_location'] = term_to_uri_dict[GBQualifier_value_text]
+                        if GBQualifier_value_text in field_to_term_to_uri_dict['ncbi_countries']:
+                            info_for_yaml_dict['sample']['collection_location'] = field_to_term_to_uri_dict['ncbi_countries'][GBQualifier_value_text]
                         else:
                             missing_value_list.append('\t'.join([accession_version, GBQualifier_name_text, GBQualifier_value_text]))
                     elif GBQualifier_name_text == 'note':
@@ -387,7 +365,7 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
 
             # Check if mandatory fields are missing
             if 'sample_sequencing_technology' not in info_for_yaml_dict['technology']:
-                #print(accession_version, ' - technology not found')
+                # print(accession_version, ' - technology not found')
                 if accession_version not in not_created_accession_dict:
                     not_created_accession_dict[accession_version] = []
                 not_created_accession_dict[accession_version].append('sample_sequencing_technology not found')
diff --git a/scripts/utils.py b/scripts/utils.py
new file mode 100644
index 0000000..3efc67a
--- /dev/null
+++ b/scripts/utils.py
@@ -0,0 +1,62 @@
+import os
+
+def is_integer(string_to_check):
+    try:
+        int(string_to_check)
+        return True
+    except ValueError:
+        return False
+
+def chunks(lst, n):
+    for i in range(0, len(lst), n):
+        yield lst[i:i + n]
+
+def check_and_get_ontology_dictionaries(dir_ontology_dictionaries):
+    # Check duplicated entry looking at all dictionaries
+    field_to_term_to_uri_dict = {}
+
+    path_dict_xxx_csv_list = [os.path.join(dir_ontology_dictionaries, name_xxx_csv) for name_xxx_csv in
+                              os.listdir(dir_ontology_dictionaries) if name_xxx_csv.endswith('.csv')]
+
+    for path_dict_xxx_csv in path_dict_xxx_csv_list:
+        print('Read {}'.format(path_dict_xxx_csv))
+
+        with open(path_dict_xxx_csv) as f:
+            for line in f:
+                if len(line.split(',')) > 2:
+                    term, uri = line.strip('\n').split('",')
+                else:
+                    term, uri = line.strip('\n').split(',')
+
+                term = term.strip('"')
+
+                if term in field_to_term_to_uri_dict:
+                    print('Warning: in the dictionaries there are more entries for the same term ({}).'.format(term))
+                    continue
+
+                field_to_term_to_uri_dict[term] = uri
+
+    # Prepare separated dictionaries (to avoid, for example, that a valid IRI for species is accepted as specimen)
+    field_to_term_to_uri_dict = {}
+
+    for path_dict_xxx_csv in path_dict_xxx_csv_list:
+        field = os.path.basename(path_dict_xxx_csv).split('.')[0]
+
+        field_to_term_to_uri_dict[field] = {}
+
+        with open(path_dict_xxx_csv) as f:
+            for line in f:
+                if len(line.split(',')) > 2:
+                    term, uri = line.strip('\n').split('",')
+                else:
+                    term, uri = line.strip('\n').split(',')
+
+                term = term.strip('"')
+
+                if term in field_to_term_to_uri_dict[field]:
+                    print('Warning: in the {} dictionary there are more entries for the same term ({}).'.format(field, term))
+                    continue
+
+                field_to_term_to_uri_dict[field][term] = uri
+
+    return field_to_term_to_uri_dict
\ No newline at end of file