aboutsummaryrefslogtreecommitdiff
path: root/scripts/download_genbank_data
diff options
context:
space:
mode:
author: Pjotr Prins, 2020-08-24 15:55:30 +0100
committer: GitHub, 2020-08-24 15:55:30 +0100
commit: 1094ee920a6826439e8be6243bfb6e806ddf7678 (patch)
tree: 346de1941249188f89ec41c722bb9300fe995d60 /scripts/download_genbank_data
parent: 7fabc4f9427856600e237c6cacd710f49b88d45d (diff)
parent: 3b9423891c4e90499a40c1be029ef40160efb557 (diff)
downloadbh20-seq-resource-1094ee920a6826439e8be6243bfb6e806ddf7678.tar.gz
bh20-seq-resource-1094ee920a6826439e8be6243bfb6e806ddf7678.tar.lz
bh20-seq-resource-1094ee920a6826439e8be6243bfb6e806ddf7678.zip
Merge pull request #102 from AndreaGuarracino/patch-4
a lot of new ontology terms, genbank/sra scripts updated to be more general
Diffstat (limited to 'scripts/download_genbank_data')
-rwxr-xr-x scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py | 14
1 files changed, 9 insertions, 5 deletions
diff --git a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
index dbebfbb..f314a1d 100755
--- a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
+++ b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
@@ -121,10 +121,11 @@ for path_dict_xxx_csv in [os.path.join(dir_dict_ontology_standardization, name_x
for line in f:
if len(line.split(',')) > 2:
term, uri = line.strip('\n').split('",')
- term = term.strip('"')
else:
term, uri = line.strip('\n').split(',')
+ term = term.strip('"')
+
if term in term_to_uri_dict:
print('Warning: in the dictionaries there are more entries for the same term ({}).'.format(term))
continue
@@ -243,6 +244,7 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
GBQualifier_name_text = GBQualifier.find('GBQualifier_name').text
if GBQualifier_name_text == 'host':
+ GBQualifier_value_text = GBQualifier_value_text.split(';')[0] # For case like Homo sapiens;sex:female
if GBQualifier_value_text in term_to_uri_dict:
# Cases like 'Felis catus; Domestic Shorthair'
info_for_yaml_dict['host']['host_species'] = term_to_uri_dict[GBQualifier_value_text]
@@ -314,10 +316,12 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
if GBQualifier_value_text in term_to_uri_dict:
info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict[GBQualifier_value_text]]
else:
- if GBQualifier_value_text.lower() in ['np/op', 'np/op swab', 'np/np swab', 'nasopharyngeal and oropharyngeal swab', 'nasopharyngeal/oropharyngeal swab', 'combined nasopharyngeal and oropharyngeal swab']:
+ if GBQualifier_value_text.lower() in ['np/op', 'np/op swab', 'np/np swab', 'nasopharyngeal and oropharyngeal swab', 'nasopharyngeal/oropharyngeal swab', 'combined nasopharyngeal and oropharyngeal swab', 'naso and/or oropharyngeal swab']:
info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal swab'], term_to_uri_dict['oropharyngeal swab']]
- elif GBQualifier_value_text.lower() in ['nasopharyngeal swab/throat swab', 'nasopharyngeal/throat swab', 'nasopharyngeal swab and throat swab', 'nasal swab and throat swab', 'nasopharyngeal aspirate/throat swab']:
+ elif GBQualifier_value_text.lower() in ['nasopharyngeal swab/throat swab', 'nasopharyngeal/throat swab', 'nasopharyngeal swab and throat swab', 'nasal swab and throat swab', 'nasopharyngeal aspirate/throat swab', 'nasopharyngeal/throat']:  # entries must be lowercase: the value is lowercased before the membership test
info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal swab'], term_to_uri_dict['throat swab']]
+ elif GBQualifier_value_text.lower() in ['nasopharyngeal aspirate & throat swab', 'nasopharyngeal aspirate and throat swab']:
+ info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal aspirate'], term_to_uri_dict['throat swab']]
elif GBQualifier_value_text.lower() in ['nasal swab and throat swab']:
info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasal swab'], term_to_uri_dict['throat swab']]
elif GBQualifier_value_text.lower() in ['nasal-swab and oro-pharyngeal swab']:
@@ -353,8 +357,8 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
info_for_yaml_dict['sample']['collection_date'] = date_to_write
elif GBQualifier_name_text in ['lat_lon', 'country']:
- if GBQualifier_value_text == 'Hong Kong':
- GBQualifier_value_text = 'China: Hong Kong'
+ if GBQualifier_name_text == 'country' and ': ' in GBQualifier_value_text:
+ GBQualifier_value_text = GBQualifier_value_text.replace(': ', ':')
if GBQualifier_value_text in term_to_uri_dict:
info_for_yaml_dict['sample']['collection_location'] = term_to_uri_dict[GBQualifier_value_text]