Merge pull request #102 from AndreaGuarracino/patch-4

Add many new ontology terms; update the GenBank/SRA scripts to be more general
author: Pjotr Prins 2020-08-24 15:55:30 +0100
committer: GitHub 2020-08-24 15:55:30 +0100
commit: 1094ee920a6826439e8be6243bfb6e806ddf7678 (patch)
tree: 346de1941249188f89ec41c722bb9300fe995d60 /scripts/create_sra_metadata/create_sra_metadata.py
parent: 7fabc4f9427856600e237c6cacd710f49b88d45d (diff)
parent: 3b9423891c4e90499a40c1be029ef40160efb557 (diff)
download: bh20-seq-resource-1094ee920a6826439e8be6243bfb6e806ddf7678.tar.gz
bh20-seq-resource-1094ee920a6826439e8be6243bfb6e806ddf7678.tar.lz
bh20-seq-resource-1094ee920a6826439e8be6243bfb6e806ddf7678.zip
1 file changed, 14 insertions, 4 deletions
diff --git a/scripts/create_sra_metadata/create_sra_metadata.py b/scripts/create_sra_metadata/create_sra_metadata.py
index 352a30e..0e1215e 100644
--- a/scripts/create_sra_metadata/create_sra_metadata.py
+++ b/scripts/create_sra_metadata/create_sra_metadata.py
@@ -23,14 +23,19 @@ term_to_uri_dict = {}
 for path_dict_xxx_csv in [os.path.join(dir_dict_ontology_standardization, name_xxx_csv) for name_xxx_csv in os.listdir(dir_dict_ontology_standardization) if name_xxx_csv.endswith('.csv')]:
     print('Read {}'.format(path_dict_xxx_csv))
 
-    with open(path_dict_xxx_csv, 'r') as f:
+    with open(path_dict_xxx_csv) as f:
         for line in f:
             if len(line.split(',')) > 2:
                 term, uri = line.strip('\n').split('",')
-                term = term.strip('"')
             else:
                 term, uri = line.strip('\n').split(',')
 
+            term = term.strip('"')
+
+            if term in term_to_uri_dict:
+                print('Warning: in the dictionaries there are more entries for the same term ({}).'.format(term))
+                continue
+
             term_to_uri_dict[term] = uri
 
 def is_integer(string_to_check):
@@ -123,10 +128,12 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
                 if VALUE_text in term_to_uri_dict:
                     info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict[VALUE_text]]
                 else:
-                    if VALUE_text.lower() in ['np/op', 'np/op swab', 'np/np swab', 'nasopharyngeal and oropharyngeal swab', 'nasopharyngeal/oropharyngeal swab', 'combined nasopharyngeal and oropharyngeal swab']:
+                    if VALUE_text.lower() in ['np/op', 'np/op swab', 'np/np swab', 'nasopharyngeal and oropharyngeal swab', 'nasopharyngeal/oropharyngeal swab', 'combined nasopharyngeal and oropharyngeal swab', 'naso and/or oropharyngeal swab']:
                         info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal swab'], term_to_uri_dict['oropharyngeal swab']]
-                    elif VALUE_text.lower() in ['nasopharyngeal swab/throat swab', 'nasopharyngeal/throat swab', 'nasopharyngeal swab and throat swab', 'nasal swab and throat swab', 'nasopharyngeal aspirate/throat swab']:
+                    elif VALUE_text.lower() in ['nasopharyngeal swab/throat swab', 'nasopharyngeal/throat swab', 'nasopharyngeal swab and throat swab', 'nasal swab and throat swab', 'nasopharyngeal aspirate/throat swab', 'Nasopharyngeal/Throat']:
                         info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal swab'], term_to_uri_dict['throat swab']]
+                    elif VALUE_text.lower() in ['nasopharyngeal aspirate & throat swab', 'nasopharyngeal aspirate and throat swab']:
+                        info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal aspirate'], term_to_uri_dict['throat swab']]
                     elif VALUE_text.lower() in ['nasal swab and throat swab']:
                         info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasal swab'], term_to_uri_dict['throat swab']]
                     elif VALUE_text.lower() in ['nasal-swab and oro-pharyngeal swab']:
@@ -178,6 +185,9 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
                         else:
                             info_for_yaml_dict['sample']['additional_collection_information'] = "The 'collection_date' is estimated (the original date was: {})".format(VALUE_text)
             elif TAG_text == 'geo_loc_name':
+                if ': ' in VALUE_text:
+                    VALUE_text = VALUE_text.replace(': ', ':')
+
                 if VALUE_text in term_to_uri_dict:
                     info_for_yaml_dict['sample']['collection_location'] = term_to_uri_dict[VALUE_text]
                 elif VALUE_text.lower() not in ['na', 'not applicable']:
author	Pjotr Prins	2020-08-24 15:55:30 +0100
committer	GitHub	2020-08-24 15:55:30 +0100
commit	1094ee920a6826439e8be6243bfb6e806ddf7678 (patch)
tree	346de1941249188f89ec41c722bb9300fe995d60 /scripts/create_sra_metadata/create_sra_metadata.py
parent	7fabc4f9427856600e237c6cacd710f49b88d45d (diff)
parent	3b9423891c4e90499a40c1be029ef40160efb557 (diff)
download	bh20-seq-resource-1094ee920a6826439e8be6243bfb6e806ddf7678.tar.gz bh20-seq-resource-1094ee920a6826439e8be6243bfb6e806ddf7678.tar.lz bh20-seq-resource-1094ee920a6826439e8be6243bfb6e806ddf7678.zip