From d6559a65865ebbb14e041893c2dae742fa146143 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Sat, 22 Aug 2020 16:41:02 +0200 Subject: genbank/sra scripts updated to read the dictionaries in a more general way --- scripts/create_sra_metadata/create_sra_metadata.py | 12 ++++++++++-- .../download_genbank_data/from_genbank_to_fasta_and_yaml.py | 8 +++++--- 2 files changed, 15 insertions(+), 5 deletions(-) (limited to 'scripts') diff --git a/scripts/create_sra_metadata/create_sra_metadata.py b/scripts/create_sra_metadata/create_sra_metadata.py index 352a30e..d02fde8 100644 --- a/scripts/create_sra_metadata/create_sra_metadata.py +++ b/scripts/create_sra_metadata/create_sra_metadata.py @@ -23,14 +23,19 @@ term_to_uri_dict = {} for path_dict_xxx_csv in [os.path.join(dir_dict_ontology_standardization, name_xxx_csv) for name_xxx_csv in os.listdir(dir_dict_ontology_standardization) if name_xxx_csv.endswith('.csv')]: print('Read {}'.format(path_dict_xxx_csv)) - with open(path_dict_xxx_csv, 'r') as f: + with open(path_dict_xxx_csv) as f: for line in f: if len(line.split(',')) > 2: term, uri = line.strip('\n').split('",') - term = term.strip('"') else: term, uri = line.strip('\n').split(',') + term = term.strip('"') + + if term in term_to_uri_dict: + print('Warning: in the dictionaries there are more entries for the same term ({}).'.format(term)) + continue + term_to_uri_dict[term] = uri def is_integer(string_to_check): @@ -178,6 +183,9 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET): else: info_for_yaml_dict['sample']['additional_collection_information'] = "The 'collection_date' is estimated (the original date was: {})".format(VALUE_text) elif TAG_text == 'geo_loc_name': + if ': ' in VALUE_text: + VALUE_text = VALUE_text.replace(': ', ':') + if VALUE_text in term_to_uri_dict: info_for_yaml_dict['sample']['collection_location'] = term_to_uri_dict[VALUE_text] elif VALUE_text.lower() not in ['na', 'not applicable']: diff --git a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py index dbebfbb..3c59f8c 100755 --- a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py +++ b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py @@ -121,10 +121,11 @@ for path_dict_xxx_csv in [os.path.join(dir_dict_ontology_standardization, name_x for line in f: if len(line.split(',')) > 2: term, uri = line.strip('\n').split('",') - term = term.strip('"') else: term, uri = line.strip('\n').split(',') + term = term.strip('"') + if term in term_to_uri_dict: print('Warning: in the dictionaries there are more entries for the same term ({}).'.format(term)) continue @@ -243,6 +244,7 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) GBQualifier_name_text = GBQualifier.find('GBQualifier_name').text if GBQualifier_name_text == 'host': + GBQualifier_value_text = GBQualifier_value_text.split(';')[0] # For case like Homo sapiens;sex:female if GBQualifier_value_text in term_to_uri_dict: # Cases like 'Felis catus; Domestic Shorthair' info_for_yaml_dict['host']['host_species'] = term_to_uri_dict[GBQualifier_value_text] @@ -353,8 +355,8 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) info_for_yaml_dict['sample']['collection_date'] = date_to_write elif GBQualifier_name_text in ['lat_lon', 'country']: - if GBQualifier_value_text == 'Hong Kong': - GBQualifier_value_text = 'China: Hong Kong' + if GBQualifier_name_text == 'country' and ': ' in GBQualifier_value_text: + GBQualifier_value_text = GBQualifier_value_text.replace(': ', ':') if GBQualifier_value_text in term_to_uri_dict: info_for_yaml_dict['sample']['collection_location'] = term_to_uri_dict[GBQualifier_value_text] -- cgit v1.2.3