diff options
Diffstat (limited to 'scripts/download_genbank_data')
-rwxr-xr-x | scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py | 14 |
1 file changed, 9 insertions, 5 deletions
diff --git a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
index dbebfbb..f314a1d 100755
--- a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
+++ b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
@@ -121,10 +121,11 @@ for path_dict_xxx_csv in [os.path.join(dir_dict_ontology_standardization, name_x
 for line in f:
     if len(line.split(',')) > 2:
         term, uri = line.strip('\n').split('",')
-        term = term.strip('"')
     else:
         term, uri = line.strip('\n').split(',')

+    term = term.strip('"')
+
     if term in term_to_uri_dict:
         print('Warning: in the dictionaries there are more entries for the same term ({}).'.format(term))
         continue
@@ -243,6 +244,7 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
 GBQualifier_name_text = GBQualifier.find('GBQualifier_name').text

 if GBQualifier_name_text == 'host':
+    GBQualifier_value_text = GBQualifier_value_text.split(';')[0] # For case like Homo sapiens;sex:female
     if GBQualifier_value_text in term_to_uri_dict:
         # Cases like 'Felis catus; Domestic Shorthair'
         info_for_yaml_dict['host']['host_species'] = term_to_uri_dict[GBQualifier_value_text]
@@ -314,10 +316,12 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
 if GBQualifier_value_text in term_to_uri_dict:
     info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict[GBQualifier_value_text]]
 else:
-    if GBQualifier_value_text.lower() in ['np/op', 'np/op swab', 'np/np swab', 'nasopharyngeal and oropharyngeal swab', 'nasopharyngeal/oropharyngeal swab', 'combined nasopharyngeal and oropharyngeal swab']:
+    if GBQualifier_value_text.lower() in ['np/op', 'np/op swab', 'np/np swab', 'nasopharyngeal and oropharyngeal swab', 'nasopharyngeal/oropharyngeal swab', 'combined nasopharyngeal and oropharyngeal swab', 'naso and/or oropharyngeal swab']:
         info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal swab'], term_to_uri_dict['oropharyngeal swab']]
-    elif GBQualifier_value_text.lower() in ['nasopharyngeal swab/throat swab', 'nasopharyngeal/throat swab', 'nasopharyngeal swab and throat swab', 'nasal swab and throat swab', 'nasopharyngeal aspirate/throat swab']:
+    elif GBQualifier_value_text.lower() in ['nasopharyngeal swab/throat swab', 'nasopharyngeal/throat swab', 'nasopharyngeal swab and throat swab', 'nasal swab and throat swab', 'nasopharyngeal aspirate/throat swab', 'Nasopharyngeal/Throat']:
         info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal swab'], term_to_uri_dict['throat swab']]
+    elif GBQualifier_value_text.lower() in ['nasopharyngeal aspirate & throat swab', 'nasopharyngeal aspirate and throat swab']:
+        info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal aspirate'], term_to_uri_dict['throat swab']]
     elif GBQualifier_value_text.lower() in ['nasal swab and throat swab']:
         info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasal swab'], term_to_uri_dict['throat swab']]
     elif GBQualifier_value_text.lower() in ['nasal-swab and oro-pharyngeal swab']:
@@ -353,8 +357,8 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
     info_for_yaml_dict['sample']['collection_date'] = date_to_write

 elif GBQualifier_name_text in ['lat_lon', 'country']:
-    if GBQualifier_value_text == 'Hong Kong':
-        GBQualifier_value_text = 'China: Hong Kong'
+    if GBQualifier_name_text == 'country' and ': ' in GBQualifier_value_text:
+        GBQualifier_value_text = GBQualifier_value_text.replace(': ', ':')

     if GBQualifier_value_text in term_to_uri_dict:
         info_for_yaml_dict['sample']['collection_location'] = term_to_uri_dict[GBQualifier_value_text]