From bc2e51bc8418876cc826482ece10874b2a61fa03 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Mon, 28 Sep 2020 09:13:50 +0200 Subject: genbank and sra scripts more picky on the ontologies; added utils.py for shared functions --- scripts/create_sra_metadata/create_sra_metadata.py | 65 ++++++----------- .../from_genbank_to_fasta_and_yaml.py | 82 ++++++++-------------- scripts/utils.py | 62 ++++++++++++++++ 3 files changed, 113 insertions(+), 96 deletions(-) create mode 100644 scripts/utils.py diff --git a/scripts/create_sra_metadata/create_sra_metadata.py b/scripts/create_sra_metadata/create_sra_metadata.py index b872896..d94093e 100644 --- a/scripts/create_sra_metadata/create_sra_metadata.py +++ b/scripts/create_sra_metadata/create_sra_metadata.py @@ -14,7 +14,10 @@ from dateutil.parser import parse import xml.etree.ElementTree as ET import json import gzip + import sys +sys.path.append('../') +from utils import is_integer, check_and_get_ontology_dictionaries dir_yaml = 'yaml' @@ -29,14 +32,6 @@ dir_dict_ontology_standardization = args.dict_ontology path_sra_study_accessions_txt = 'SRAStudyAccessions.{}.txt'.format(date) -def is_integer(string_to_check): - try: - int(string_to_check) - return True - except ValueError: - return False - - accession_to_ignore_set = set() if args.ids_to_ignore: @@ -64,25 +59,7 @@ if args.ids_to_consider: print('There are {} accessions to consider.'.format(len(accession_to_consider_set))) -term_to_uri_dict = {} - -for path_dict_xxx_csv in [os.path.join(dir_dict_ontology_standardization, name_xxx_csv) for name_xxx_csv in os.listdir(dir_dict_ontology_standardization) if name_xxx_csv.endswith('.csv')]: - print('Read {}'.format(path_dict_xxx_csv)) - - with open(path_dict_xxx_csv) as f: - for line in f: - if len(line.split(',')) > 2: - term, uri = line.strip('\n').split('",') - else: - term, uri = line.strip('\n').split(',') - - term = term.strip('"') - - if term in term_to_uri_dict: - print('Warning: in the dictionaries there are more entries for the same term ({}).'.format(term)) - continue - - term_to_uri_dict[term] = uri +field_to_term_to_uri_dict = check_and_get_ontology_dictionaries(dir_dict_ontology_standardization) if not os.path.exists(dir_yaml): @@ -151,13 +128,13 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET): if VALUE_text.lower() in ['homo sapien', 'homosapiens']: VALUE_text = 'Homo sapiens' - if VALUE_text in term_to_uri_dict: - info_for_yaml_dict['host']['host_species'] = term_to_uri_dict[VALUE_text] + if VALUE_text in field_to_term_to_uri_dict['ncbi_host_species']: + info_for_yaml_dict['host']['host_species'] = field_to_term_to_uri_dict['ncbi_host_species'][VALUE_text] else: missing_value_list.append('\t'.join([accession, 'host_species', VALUE_text])) elif TAG_text in ['host_health_status', 'host health state']: - if VALUE_text in term_to_uri_dict: - info_for_yaml_dict['host']['host_health_status'] = term_to_uri_dict[VALUE_text] + if VALUE_text in field_to_term_to_uri_dict['ncbi_host_health_status']: + info_for_yaml_dict['host']['host_health_status'] = field_to_term_to_uri_dict['ncbi_host_health_status'][VALUE_text] elif VALUE_text.strip("'") not in ['missing', 'not collected', 'not provided']: missing_value_list.append('\t'.join([accession, 'host_health_status', VALUE_text])) elif TAG_text in ['strain', 'isolate']: @@ -167,27 +144,27 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET): if value_to_insert.lower() in ['homo sapien', 'homosapiens']: value_to_insert = 'Homo sapiens' - if value_to_insert in term_to_uri_dict: - value_to_insert = term_to_uri_dict[value_to_insert] + if value_to_insert in field_to_term_to_uri_dict['ncbi_host_species']: + value_to_insert = field_to_term_to_uri_dict['ncbi_host_species'][value_to_insert] if 'virus_strain' not in info_for_yaml_dict: info_for_yaml_dict['virus']['virus_strain'] = value_to_insert else: info_for_yaml_dict['virus']['virus_strain'] += '; ' + value_to_insert elif TAG_text in ['isolation_source', 'isolation source host-associated']: - if VALUE_text in term_to_uri_dict: - info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict[VALUE_text]] + if VALUE_text in field_to_term_to_uri_dict['ncbi_speciesman_source']: + info_for_yaml_dict['sample']['specimen_source'] = [field_to_term_to_uri_dict['ncbi_speciesman_source'][VALUE_text]] else: if VALUE_text.lower() in ['np/op', 'np/op swab', 'np/np swab', 'nasopharyngeal and oropharyngeal swab', 'nasopharyngeal/oropharyngeal swab', 'combined nasopharyngeal and oropharyngeal swab', 'naso and/or oropharyngeal swab']: - info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal swab'], term_to_uri_dict['oropharyngeal swab']] + info_for_yaml_dict['sample']['specimen_source'] = [field_to_term_to_uri_dict['ncbi_speciesman_source']['nasopharyngeal swab'], field_to_term_to_uri_dict['ncbi_speciesman_source']['oropharyngeal swab']] elif VALUE_text.lower() in ['nasopharyngeal swab/throat swab', 'nasopharyngeal/throat swab', 'nasopharyngeal swab and throat swab', 'nasal swab and throat swab', 'nasopharyngeal aspirate/throat swab', 'Nasopharyngeal/Throat']: - info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal swab'], term_to_uri_dict['throat swab']] + info_for_yaml_dict['sample']['specimen_source'] = [field_to_term_to_uri_dict['ncbi_speciesman_source']['nasopharyngeal swab'], field_to_term_to_uri_dict['ncbi_speciesman_source']['throat swab']] elif VALUE_text.lower() in ['nasopharyngeal aspirate & throat swab', 'nasopharyngeal aspirate and throat swab']: - info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal aspirate'], term_to_uri_dict['throat swab']] + info_for_yaml_dict['sample']['specimen_source'] = [field_to_term_to_uri_dict['ncbi_speciesman_source']['nasopharyngeal aspirate'], field_to_term_to_uri_dict['ncbi_speciesman_source']['throat swab']] elif VALUE_text.lower() in ['nasal swab and throat swab']: - info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasal swab'], term_to_uri_dict['throat swab']] + info_for_yaml_dict['sample']['specimen_source'] = [field_to_term_to_uri_dict['ncbi_speciesman_source']['nasal swab'], field_to_term_to_uri_dict['ncbi_speciesman_source']['throat swab']] elif VALUE_text.lower() in ['nasal-swab and oro-pharyngeal swab']: - info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasal swab'], term_to_uri_dict['oropharyngeal swab']] + info_for_yaml_dict['sample']['specimen_source'] = [field_to_term_to_uri_dict['ncbi_speciesman_source']['nasal swab'], field_to_term_to_uri_dict['ncbi_speciesman_source']['oropharyngeal swab']] elif VALUE_text.strip("'") not in ['missing', 'not collected', 'unknown', 'not provided', 'not applicable', 'N/A']: missing_value_list.append('\t'.join([accession, 'specimen_source', VALUE_text])) elif TAG_text in ['host_sex', 'host sex']: @@ -238,8 +215,8 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET): if ': ' in VALUE_text: VALUE_text = VALUE_text.replace(': ', ':') - if VALUE_text in term_to_uri_dict: - info_for_yaml_dict['sample']['collection_location'] = term_to_uri_dict[VALUE_text] + if VALUE_text in field_to_term_to_uri_dict['ncbi_countries']: + info_for_yaml_dict['sample']['collection_location'] = field_to_term_to_uri_dict['ncbi_countries'][VALUE_text] elif VALUE_text.lower() not in ['na', 'not applicable']: missing_value_list.append('\t'.join([accession, 'geo_loc_name', VALUE_text])) #else: @@ -255,8 +232,8 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET): INSTRUMENT_MODEL = [x.text for x in EXPERIMENT.find('PLATFORM').iter('INSTRUMENT_MODEL')][0] if INSTRUMENT_MODEL.lower() != 'unspecified': - if INSTRUMENT_MODEL in term_to_uri_dict: - info_for_yaml_dict['technology']['sample_sequencing_technology'] = [term_to_uri_dict[INSTRUMENT_MODEL]] + if INSTRUMENT_MODEL in field_to_term_to_uri_dict['ncbi_sequencing_technology']: + info_for_yaml_dict['technology']['sample_sequencing_technology'] = [field_to_term_to_uri_dict['ncbi_sequencing_technology'][INSTRUMENT_MODEL]] else: missing_value_list.append('\t'.join([accession, 'sample_sequencing_technology', INSTRUMENT_MODEL])) #else: diff --git a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py index 52aee4e..9b8fedc 100755 --- a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py +++ b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py @@ -16,11 +16,15 @@ import xml.etree.ElementTree as ET import json import os import requests -import sys from datetime import date from dateutil.parser import parse +import sys +sys.path.append('../') +from utils import is_integer, chunks, check_and_get_ontology_dictionaries + + num_ids_for_request = 100 dir_metadata = 'metadata_from_nuccore' @@ -30,16 +34,9 @@ dir_dict_ontology_standardization = args.dict_ontology today_date = date.today().strftime("%Y.%m.%d") path_ncbi_virus_accession = 'sequences.{}.acc'.format(today_date) -def is_integer(string_to_check): - try: - int(string_to_check) - return True - except ValueError: - return False -def chunks(lst, n): - for i in range(0, len(lst), n): - yield lst[i:i + n] +field_to_term_to_uri_dict = check_and_get_ontology_dictionaries(dir_dict_ontology_standardization) + if os.path.exists(dir_metadata): print("The directory '{}' already exists.".format(dir_metadata)) @@ -123,29 +120,10 @@ if not os.path.exists(dir_metadata): ) -term_to_uri_dict = {} - -for path_dict_xxx_csv in [os.path.join(dir_dict_ontology_standardization, name_xxx_csv) for name_xxx_csv in os.listdir(dir_dict_ontology_standardization) if name_xxx_csv.endswith('.csv')]: - print('Read {}'.format(path_dict_xxx_csv)) - - with open(path_dict_xxx_csv) as f: - for line in f: - if len(line.split(',')) > 2: - term, uri = line.strip('\n').split('",') - else: - term, uri = line.strip('\n').split(',') - - term = term.strip('"') - - if term in term_to_uri_dict: - print('Warning: in the dictionaries there are more entries for the same term ({}).'.format(term)) - continue - - term_to_uri_dict[term] = uri - if not os.path.exists(dir_fasta_and_yaml): os.makedirs(dir_fasta_and_yaml) + min_len_to_count = 15000 num_seq_with_len_ge_X_bp = 0 @@ -166,7 +144,7 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) continue try: - #print(path_metadata_xxx_xml, accession_version) + # print(path_metadata_xxx_xml, accession_version) # A general default-empty yaml could be read from the definitive one info_for_yaml_dict = { @@ -230,8 +208,8 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) new_seq_tec_list = [] for seq_tec in tech_info_to_parse.split(';'): seq_tec = seq_tec.strip() - if seq_tec in term_to_uri_dict: - seq_tec = term_to_uri_dict[seq_tec] + if seq_tec in field_to_term_to_uri_dict['ncbi_sequencing_technology']: + seq_tec = field_to_term_to_uri_dict['ncbi_sequencing_technology'][seq_tec] new_seq_tec_list.append(seq_tec) else: missing_value_list.append('\t'.join([accession_version, 'sample_sequencing_technology', seq_tec])) @@ -256,17 +234,17 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) if GBQualifier_name_text == 'host': GBQualifier_value_text = GBQualifier_value_text.split(';')[0] # For case like Homo sapiens;sex:female - if GBQualifier_value_text in term_to_uri_dict: + if GBQualifier_value_text in field_to_term_to_uri_dict['ncbi_host_species']: # Cases like 'Felis catus; Domestic Shorthair' - info_for_yaml_dict['host']['host_species'] = term_to_uri_dict[GBQualifier_value_text] + info_for_yaml_dict['host']['host_species'] = field_to_term_to_uri_dict['ncbi_host_species'][GBQualifier_value_text] else: GBQualifier_value_text_list = GBQualifier_value_text.split('; ') - if GBQualifier_value_text_list[0] in term_to_uri_dict: - info_for_yaml_dict['host']['host_species'] = term_to_uri_dict[GBQualifier_value_text_list[0]] + if GBQualifier_value_text_list[0] in field_to_term_to_uri_dict['ncbi_host_species']: + info_for_yaml_dict['host']['host_species'] = field_to_term_to_uri_dict['ncbi_host_species'][GBQualifier_value_text_list[0]] elif GBQualifier_value_text_list[0] and ('MT215193' in accession_version or 'MT270814' in accession_version): # Information checked manually from NCBI Virus - info_for_yaml_dict['host']['host_species'] = term_to_uri_dict['Canis lupus familiaris'] + info_for_yaml_dict['host']['host_species'] = field_to_term_to_uri_dict['ncbi_host_species']['Canis lupus familiaris'] else: missing_value_list.append('\t'.join([accession_version, 'host_species', GBQualifier_value_text_list[0]])) @@ -295,8 +273,8 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) if host_sex in ['male', 'female']: info_for_yaml_dict['host']['host_sex'] = "http://purl.obolibrary.org/obo/PATO_0000384" if host_sex == 'male' else "http://purl.obolibrary.org/obo/PATO_0000383" - elif GBQualifier_value_text_list[1] in term_to_uri_dict: - info_for_yaml_dict['host']['host_health_status'] = term_to_uri_dict[GBQualifier_value_text_list[1]] + elif GBQualifier_value_text_list[1] in field_to_term_to_uri_dict['ncbi_host_health_status']: + info_for_yaml_dict['host']['host_health_status'] = field_to_term_to_uri_dict['ncbi_host_health_status'][GBQualifier_value_text_list[1]] else: missing_value_list.append('\t'.join([accession_version, 'host_sex or host_health_status', GBQualifier_value_text_list[1]])) @@ -318,25 +296,25 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) else: info_for_yaml_dict['sample']['collector_name'] = GBQualifier_value_text elif GBQualifier_name_text == 'isolation_source': - if GBQualifier_value_text.upper() in term_to_uri_dict: - GBQualifier_value_text = GBQualifier_value_text.upper() # For example, in case of 'usa: wa' + if GBQualifier_value_text.upper() in field_to_term_to_uri_dict['ncbi_speciesman_source']: + GBQualifier_value_text = GBQualifier_value_text.upper() # For example, in case of 'usa: wa' # Little cleaning GBQualifier_value_text = GBQualifier_value_text.strip("/'") - if GBQualifier_value_text in term_to_uri_dict: - info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict[GBQualifier_value_text]] + if GBQualifier_value_text in field_to_term_to_uri_dict['ncbi_speciesman_source']: + info_for_yaml_dict['sample']['specimen_source'] = [field_to_term_to_uri_dict['ncbi_speciesman_source'][GBQualifier_value_text]] else: if GBQualifier_value_text.lower() in ['np/op', 'np/op swab', 'np/np swab', 'nasopharyngeal and oropharyngeal swab', 'nasopharyngeal/oropharyngeal swab', 'combined nasopharyngeal and oropharyngeal swab', 'naso and/or oropharyngeal swab']: - info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal swab'], term_to_uri_dict['oropharyngeal swab']] + info_for_yaml_dict['sample']['specimen_source'] = [field_to_term_to_uri_dict['ncbi_speciesman_source']['nasopharyngeal swab'], field_to_term_to_uri_dict['ncbi_speciesman_source']['oropharyngeal swab']] elif GBQualifier_value_text.lower() in ['nasopharyngeal swab/throat swab', 'nasopharyngeal/throat swab', 'nasopharyngeal swab and throat swab', 'nasal swab and throat swab', 'nasopharyngeal aspirate/throat swab', 'Nasopharyngeal/Throat']: - info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal swab'], term_to_uri_dict['throat swab']] + info_for_yaml_dict['sample']['specimen_source'] = [field_to_term_to_uri_dict['ncbi_speciesman_source']['nasopharyngeal swab'], field_to_term_to_uri_dict['ncbi_speciesman_source']['throat swab']] elif GBQualifier_value_text.lower() in ['nasopharyngeal aspirate & throat swab', 'nasopharyngeal aspirate and throat swab']: - info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal aspirate'], term_to_uri_dict['throat swab']] + info_for_yaml_dict['sample']['specimen_source'] = [field_to_term_to_uri_dict['ncbi_speciesman_source']['nasopharyngeal aspirate'], field_to_term_to_uri_dict['ncbi_speciesman_source']['throat swab']] elif GBQualifier_value_text.lower() in ['nasal swab and throat swab']: - info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasal swab'], term_to_uri_dict['throat swab']] + info_for_yaml_dict['sample']['specimen_source'] = [field_to_term_to_uri_dict['ncbi_speciesman_source']['nasal swab'], field_to_term_to_uri_dict['ncbi_speciesman_source']['throat swab']] elif GBQualifier_value_text.lower() in ['nasal-swab and oro-pharyngeal swab']: - info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasal swab'], term_to_uri_dict['oropharyngeal swab']] + info_for_yaml_dict['sample']['specimen_source'] = [field_to_term_to_uri_dict['ncbi_speciesman_source']['nasal swab'], field_to_term_to_uri_dict['ncbi_speciesman_source']['oropharyngeal swab']] else: missing_value_list.append('\t'.join([accession_version, 'specimen_source', GBQualifier_value_text])) elif GBQualifier_name_text == 'collection_date': @@ -371,8 +349,8 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) if GBQualifier_name_text == 'country' and ': ' in GBQualifier_value_text: GBQualifier_value_text = GBQualifier_value_text.replace(': ', ':') - if GBQualifier_value_text in term_to_uri_dict: - info_for_yaml_dict['sample']['collection_location'] = term_to_uri_dict[GBQualifier_value_text] + if GBQualifier_value_text in field_to_term_to_uri_dict['ncbi_countries']: + info_for_yaml_dict['sample']['collection_location'] = field_to_term_to_uri_dict['ncbi_countries'][GBQualifier_value_text] else: missing_value_list.append('\t'.join([accession_version, GBQualifier_name_text, GBQualifier_value_text])) elif GBQualifier_name_text == 'note': @@ -387,7 +365,7 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) # Check if mandatory fields are missing if 'sample_sequencing_technology' not in info_for_yaml_dict['technology']: - #print(accession_version, ' - technology not found') + # print(accession_version, ' - technology not found') if accession_version not in not_created_accession_dict: not_created_accession_dict[accession_version] = [] not_created_accession_dict[accession_version].append('sample_sequencing_technology not found') diff --git a/scripts/utils.py b/scripts/utils.py new file mode 100644 index 0000000..3efc67a --- /dev/null +++ b/scripts/utils.py @@ -0,0 +1,62 @@ +import os + +def is_integer(string_to_check): + try: + int(string_to_check) + return True + except ValueError: + return False + +def chunks(lst, n): + for i in range(0, len(lst), n): + yield lst[i:i + n] + +def check_and_get_ontology_dictionaries(dir_ontology_dictionaries): + # Check duplicated entry looking at all dictionaries + field_to_term_to_uri_dict = {} + + path_dict_xxx_csv_list = [os.path.join(dir_ontology_dictionaries, name_xxx_csv) for name_xxx_csv in + os.listdir(dir_ontology_dictionaries) if name_xxx_csv.endswith('.csv')] + + for path_dict_xxx_csv in path_dict_xxx_csv_list: + print('Read {}'.format(path_dict_xxx_csv)) + + with open(path_dict_xxx_csv) as f: + for line in f: + if len(line.split(',')) > 2: + term, uri = line.strip('\n').split('",') + else: + term, uri = line.strip('\n').split(',') + + term = term.strip('"') + + if term in field_to_term_to_uri_dict: + print('Warning: in the dictionaries there are more entries for the same term ({}).'.format(term)) + continue + + field_to_term_to_uri_dict[term] = uri + + # Prepare separated dictionaries (to avoid, for example, that a valid IRI for species is accepted as specimen) + field_to_term_to_uri_dict = {} + + for path_dict_xxx_csv in path_dict_xxx_csv_list: + field = os.path.basename(path_dict_xxx_csv).split('.')[0] + + field_to_term_to_uri_dict[field] = {} + + with open(path_dict_xxx_csv) as f: + for line in f: + if len(line.split(',')) > 2: + term, uri = line.strip('\n').split('",') + else: + term, uri = line.strip('\n').split(',') + + term = term.strip('"') + + if term in field_to_term_to_uri_dict[field]: + print('Warning: in the {} dictionary there are more entries for the same term ({}).'.format(field, term)) + continue + + field_to_term_to_uri_dict[field][term] = uri + + return field_to_term_to_uri_dict \ No newline at end of file -- cgit v1.2.3