From 21ac66de2f92b7a433c6677d7526ee8d9639d999 Mon Sep 17 00:00:00 2001 From: Andrea Guarracino Date: Mon, 22 Jun 2020 22:41:04 +0200 Subject: little fix for specimen_source --- scripts/from_genbank_to_fasta_and_yaml.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/scripts/from_genbank_to_fasta_and_yaml.py b/scripts/from_genbank_to_fasta_and_yaml.py index 87e99d4..6216340 100755 --- a/scripts/from_genbank_to_fasta_and_yaml.py +++ b/scripts/from_genbank_to_fasta_and_yaml.py @@ -305,12 +305,14 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) if GBQualifier_value_text in term_to_uri_dict: info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict[GBQualifier_value_text]] else: - if GBQualifier_value_text.lower() in ['np/op', 'np/op swab', 'np/np swab', 'nasopharyngeal and oropharyngeal swab', 'nasopharyngeal/oropharyngeal swab']: + if GBQualifier_value_text.lower() in ['np/op', 'np/op swab', 'np/np swab', 'nasopharyngeal and oropharyngeal swab', 'nasopharyngeal/oropharyngeal swab', 'combined nasopharyngeal and oropharyngeal swab']: info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal swab'], term_to_uri_dict['oropharyngeal swab']] - elif GBQualifier_value_text in ['nasopharyngeal swab/throat swab', 'nasopharyngeal/throat swab', 'nasopharyngeal swab and throat swab', 'nasal swab and throat swab']: + elif GBQualifier_value_text.lower() in ['nasopharyngeal swab/throat swab', 'nasopharyngeal/throat swab', 'nasopharyngeal swab and throat swab', 'nasal swab and throat swab', 'nasopharyngeal aspirate/throat swab']: info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal swab'], term_to_uri_dict['throat swab']] - elif GBQualifier_value_text in ['nasopharyngeal aspirate/throat swab']: - info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal aspirate'], term_to_uri_dict['throat swab']] + elif GBQualifier_value_text.lower() in ['nasal swab and throat swab']: + info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasal swab'], term_to_uri_dict['throat swab']] + elif GBQualifier_value_text.lower() in ['nasal-swab and oro-pharyngeal swab']: + info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasal swab'], term_to_uri_dict['oropharyngeal swab']] else: missing_value_list.append('\t'.join([accession_version, 'specimen_source', GBQualifier_value_text])) elif GBQualifier_name_text == 'collection_date': -- cgit v1.2.3 From 5bd991fe5f048b2c4b2c23c6149c1bd789d3bdbc Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Mon, 22 Jun 2020 23:23:03 +0200 Subject: added new dictionary entries --- scripts/dict_ontology_standardization/ncbi_countries.csv | 4 ++++ scripts/dict_ontology_standardization/ncbi_host_species.csv | 1 + .../dict_ontology_standardization/ncbi_sequencing_technology.csv | 7 +++++++ 3 files changed, 12 insertions(+) diff --git a/scripts/dict_ontology_standardization/ncbi_countries.csv b/scripts/dict_ontology_standardization/ncbi_countries.csv index 85d4e8a..204f7f2 100644 --- a/scripts/dict_ontology_standardization/ncbi_countries.csv +++ b/scripts/dict_ontology_standardization/ncbi_countries.csv @@ -239,6 +239,7 @@ Solomon Islands,http://www.wikidata.org/entity/Q685 Somalia,http://www.wikidata.org/entity/Q1045 South Africa,http://www.wikidata.org/entity/Q258 South Africa: KwaZulu-Natal,http://www.wikidata.org/entity/Q81725 +South Africa: KZN,http://www.wikidata.org/entity/Q81725 South African Republic,http://www.wikidata.org/entity/Q550374 South Korea,http://www.wikidata.org/entity/Q884 South Sudan,http://www.wikidata.org/entity/Q958 @@ -279,6 +280,7 @@ USA: AR,http://www.wikidata.org/entity/Q1612 USA: AZ,http://www.wikidata.org/entity/Q816 USA: Arizona,http://www.wikidata.org/entity/Q816 USA: CA,http://www.wikidata.org/entity/Q99 +USA:CA,http://www.wikidata.org/entity/Q99 USA: California,http://www.wikidata.org/entity/Q99 USA:California,http://www.wikidata.org/entity/Q99 "USA: CA, San Diego County",http://www.wikidata.org/entity/Q108143 @@ -299,6 +301,7 @@ USA: IN,http://www.wikidata.org/entity/Q1415 USA: KS,http://www.wikidata.org/entity/Q1558 USA: KY,http://www.wikidata.org/entity/Q1603 USA: LA,http://www.wikidata.org/entity/Q1588 +USA:Los Angeles,http://www.wikidata.org/entity/Q65 "USA: New Orleans, LA",http://www.wikidata.org/entity/Q34404 USA: MA,http://www.wikidata.org/entity/Q771 USA: Massachusetts,http://www.wikidata.org/entity/Q771 @@ -330,6 +333,7 @@ USA: PA,http://www.wikidata.org/entity/Q1400 USA: RI,http://www.wikidata.org/entity/Q1387 "USA: San Francisco, CA",http://www.wikidata.org/entity/Q62 USA: SC,http://www.wikidata.org/entity/Q1456 +USA: South Carolina,http://www.wikidata.org/entity/Q1456 USA: SD,http://www.wikidata.org/entity/Q1211 "USA: Snohomish County, WA",http://www.wikidata.org/entity/Q110403 USA: TN,http://www.wikidata.org/entity/Q1509 diff --git a/scripts/dict_ontology_standardization/ncbi_host_species.csv b/scripts/dict_ontology_standardization/ncbi_host_species.csv index 102d458..bc6ac04 100644 --- a/scripts/dict_ontology_standardization/ncbi_host_species.csv +++ b/scripts/dict_ontology_standardization/ncbi_host_species.csv @@ -1,6 +1,7 @@ Homo sapiens,http://purl.obolibrary.org/obo/NCBITaxon_9606 human,http://purl.obolibrary.org/obo/NCBITaxon_9606 Human,http://purl.obolibrary.org/obo/NCBITaxon_9606 +sapiens,http://purl.obolibrary.org/obo/NCBITaxon_9606 Mustela lutreola,http://purl.obolibrary.org/obo/NCBITaxon_9666 Manis javanica,http://purl.obolibrary.org/obo/NCBITaxon_9974 Felis catus,http://purl.obolibrary.org/obo/NCBITaxon_9685 diff --git a/scripts/dict_ontology_standardization/ncbi_sequencing_technology.csv b/scripts/dict_ontology_standardization/ncbi_sequencing_technology.csv index 110e90b..964cbf3 100644 --- a/scripts/dict_ontology_standardization/ncbi_sequencing_technology.csv +++ b/scripts/dict_ontology_standardization/ncbi_sequencing_technology.csv @@ -1,3 +1,9 @@ +Illumina HiSeq 1000,http://www.ebi.ac.uk/efo/EFO_0004204 +Illumina HiSeq 2000,http://www.ebi.ac.uk/efo/EFO_0004203 +Illumina HiSeq 2500,http://www.ebi.ac.uk/efo/EFO_0008565 +Illumina HiSeq 3000,http://www.ebi.ac.uk/efo/EFO_0008564 +Illumina HiSeq 4000,http://www.ebi.ac.uk/efo/EFO_0008563 +Illumina iSeq 100,http://www.ebi.ac.uk/efo/EFO_0008635 Illumian NextSeq 500,http://www.ebi.ac.uk/efo/EFO_0009173 Illumina NextSeq 500,http://www.ebi.ac.uk/efo/EFO_0009173 NextSeq500,http://www.ebi.ac.uk/efo/EFO_0009173 @@ -14,6 +20,7 @@ ONT (Oxford Nanopore Technologies),http://purl.obolibrary.org/obo/NCIT_C146818 Oxford Nanopore Technology,http://purl.obolibrary.org/obo/NCIT_C146818 Oxford Nanopore technologies MinION,http://www.ebi.ac.uk/efo/EFO_0008632 MinION Oxford Nanopore,http://www.ebi.ac.uk/efo/EFO_0008632 +MinION,http://www.ebi.ac.uk/efo/EFO_0008632 Nanopore,http://purl.obolibrary.org/obo/NCIT_C146818 Illumina MiSeq,http://www.ebi.ac.uk/efo/EFO_0004205 Illumina,http://purl.obolibrary.org/obo/OBI_0000759 -- cgit v1.2.3 From 75ad6744aa8edb7de92a427c55405d9331879ad4 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Mon, 22 Jun 2020 23:27:19 +0200 Subject: moved the genbank script in his specific directory --- .../from_genbank_to_fasta_and_yaml.py | 395 +++++ scripts/from_genbank_to_fasta_and_yaml.py | 395 ----- scripts/sequences.acc | 1731 -------------------- 3 files changed, 395 insertions(+), 2126 deletions(-) create mode 100755 scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py delete mode 100755 scripts/from_genbank_to_fasta_and_yaml.py delete mode 100644 scripts/sequences.acc diff --git a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py new file mode 100755 index 0000000..d76f56b --- /dev/null +++ b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py @@ -0,0 +1,395 @@ +#!/usr/bin/env python3 + +import argparse +parser = argparse.ArgumentParser() +parser.add_argument('--skip-request', action='store_true', help='skip metadata and sequence request', required=False) +parser.add_argument('--only-missing-id', action='store_true', help='download only missing id', required=False) +args = parser.parse_args() + +from Bio import Entrez +Entrez.email = 'another_email@gmail.com' + +import xml.etree.ElementTree as ET +import json +import os +import requests +import sys + +from datetime import date +from dateutil.parser import parse + +num_ids_for_request = 100 + +dir_metadata = 'metadata_from_nuccore' +dir_fasta_and_yaml = 'fasta_and_yaml' +dir_dict_ontology_standardization = '../dict_ontology_standardization/' + +today_date = date.today().strftime("%Y.%m.%d") +path_ncbi_virus_accession = 'sequences.{}.acc'.format(today_date) + +def is_integer(string_to_check): + try: + int(string_to_check) + return True + except ValueError: + return False + +def chunks(lst, n): + for i in range(0, len(lst), n): + yield lst[i:i + n] + +if os.path.exists(dir_metadata): + print("The directory '{}' already exists.".format(dir_metadata)) + + if not args.skip_request: + print("\tTo start the request, delete the directory '{}' or specify --skip-request.".format(dir_metadata)) + sys.exit(-1) + + +accession_already_downloaded_set = [] + +if os.path.exists(dir_fasta_and_yaml): + print("The directory '{}' already exists.".format(dir_fasta_and_yaml)) + if not args.only_missing_id: + print("To start the download, delete the directory '{}' or specify --only-missing-id.".format(dir_fasta_and_yaml)) + sys.exit(-1) + + accession_already_downloaded_set = set([x.split('.yaml')[0].split('.')[0] for x in os.listdir(dir_fasta_and_yaml) if x.endswith('.yaml')]) + print('There are {} accession already downloaded.'.format(len(accession_already_downloaded_set))) + + +if not os.path.exists(dir_metadata): + # Take all the ids + id_set = set() + + # Try to search several strings + term_list = ['SARS-CoV-2', 'SARS-CoV2', 'SARS CoV2', 'SARSCoV2', 'txid2697049[Organism]'] + for term in term_list: + tmp_list = Entrez.read( + Entrez.esearch(db='nuccore', term=term, idtype='acc', retmax='10000') + )['IdList'] + + # Remove mRNAs, ncRNAs, Proteins, and predicted models (more information here: https://en.wikipedia.org/wiki/RefSeq) + tmp_list = [x for x in tmp_list if x[:2] not in ['NM', 'NR', 'NP', 'XM', 'XR', 'XP', 'WP']] + + # Remove the version in the id + tmp_list = [x.split('.')[0] for x in tmp_list] + + #tmp_list = tmp_list[0:2] # restricting to small run + new_ids_set = set([x.split('.')[0] for x in tmp_list]) + new_ids = len(new_ids_set.difference(id_set)) + id_set.update(new_ids_set) + + print('Term:', term, '-->', new_ids, 'new IDs from', len(tmp_list), '---> Total unique IDs:', len(id_set)) + + if not os.path.exists(path_ncbi_virus_accession): + r = requests.get('https://www.ncbi.nlm.nih.gov/genomes/VirusVariation/vvsearch2/?q=*:*&fq=%7B!tag=SeqType_s%7DSeqType_s:(%22Nucleotide%22)&fq=VirusLineageId_ss:(2697049)&cmd=download&sort=SourceDB_s%20desc,CreateDate_dt%20desc,id%20asc&dlfmt=acc&fl=id') + with open(path_ncbi_virus_accession, 'w') as fw: + fw.write(r.text) + + with open(path_ncbi_virus_accession) as f: + tmp_list = [line.strip('\n') for line in f] + + new_ids = len(set(tmp_list).difference(id_set)) + id_set.update(tmp_list) + + print('DB: NCBI Virus', today_date, '-->', new_ids, 'new IDs from', len(tmp_list), '---> Total unique IDs:', len(id_set)) + + if len(accession_already_downloaded_set) > 0: + id_set = id_set.difference(accession_already_downloaded_set) + print('There are {} missing IDs to download.'.format(len(id_set))) + + os.makedirs(dir_metadata) + for i, id_x_list in enumerate(chunks(list(id_set), num_ids_for_request)): + path_metadata_xxx_xml = os.path.join(dir_metadata, 'metadata_{}.xml'.format(i)) + print('Requesting {} ids --> {}'.format(len(id_x_list), path_metadata_xxx_xml)) + + with open(path_metadata_xxx_xml, 'w') as fw: + fw.write( + Entrez.efetch(db='nuccore', id=id_x_list, retmode='xml').read() + ) + + +term_to_uri_dict = {} + +for path_dict_xxx_csv in [os.path.join(dir_dict_ontology_standardization, name_xxx_csv) for name_xxx_csv in os.listdir(dir_dict_ontology_standardization) if name_xxx_csv.endswith('.csv')]: + print('Read {}'.format(path_dict_xxx_csv)) + + with open(path_dict_xxx_csv) as f: + for line in f: + if len(line.split(',')) > 2: + term, uri = line.strip('\n').split('",') + term = term.strip('"') + else: + term, uri = line.strip('\n').split(',') + + if term in term_to_uri_dict: + print('Warning: in the dictionaries there are more entries for the same term ({}).'.format(term)) + continue + + term_to_uri_dict[term] = uri + +if not os.path.exists(dir_fasta_and_yaml): + os.makedirs(dir_fasta_and_yaml) + +min_len_to_count = 27500 +num_seq_with_len_ge_X_bp = 0 + +missing_value_list = [] +accession_with_errors_list = [] + +for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) for name_metadata_xxx_xml in os.listdir(dir_metadata) if name_metadata_xxx_xml.endswith('.xml')]: + tree = ET.parse(path_metadata_xxx_xml) + GBSet = tree.getroot() + + for GBSeq in GBSet: + accession_version = GBSeq.find('GBSeq_accession-version').text + + GBSeq_sequence = GBSeq.find('GBSeq_sequence') + if GBSeq_sequence is None: + print(accession_version, ' - sequence not found') + continue + + try: + #print(path_metadata_xxx_xml, accession_version) + + # A general default-empty yaml could be read from the definitive one + info_for_yaml_dict = { + 'id': 'placeholder', + 'host': {}, + 'sample': {}, + 'virus': {}, + 'technology': {}, + 'submitter': {} + } + + + info_for_yaml_dict['sample']['sample_id'] = accession_version + info_for_yaml_dict['sample']['source_database_accession'] = ["http://identifiers.org/insdc/"+accession_version+"#sequence"] #accession is turned into resolvable URL/URI now + + + # submitter info + GBSeq_references = GBSeq.find('GBSeq_references') + if GBSeq_references is not None: + info_for_yaml_dict['submitter']['authors'] = ["{}".format(x.text) for x in GBSeq_references.iter('GBAuthor')] + + GBReference = GBSeq_references.find('GBReference') + if GBReference is not None: + GBReference_journal = GBReference.find('GBReference_journal') + + if GBReference_journal is not None and GBReference_journal.text != 'Unpublished': + if 'Submitted' in GBReference_journal.text: + info_for_yaml_dict['submitter']['submitter_name'] = ["{}".format(GBReference_journal.text.split(') ')[1].split(',')[0].strip())] + info_for_yaml_dict['submitter']['submitter_address'] = ','.join(GBReference_journal.text.split(') ')[1].split(',')[1:]).strip() + else: + info_for_yaml_dict['submitter']['additional_submitter_information'] = GBReference_journal.text + + + GBSeq_comment = GBSeq.find('GBSeq_comment') + if GBSeq_comment is not None and 'Assembly-Data' in GBSeq_comment.text: + prefix_split_string = '##Genome-Assembly' if GBSeq_comment.text.startswith('##Genome-') else '##Assembly' + + GBSeq_comment_text = GBSeq_comment.text.split( + '{}-Data-START## ; '.format(prefix_split_string) + )[1].split(' ; {}-Data-END##'.format(prefix_split_string))[0] + + for info_to_check, field_in_yaml in zip( + ['Assembly Method', 'Coverage', 'Sequencing Technology'], + ['sequence_assembly_method', 'sequencing_coverage', 'sample_sequencing_technology'] + ): + if info_to_check in GBSeq_comment_text: + tech_info_to_parse = GBSeq_comment_text.split('{} :: '.format(info_to_check))[1].split(' ;')[0] + + if field_in_yaml == 'sequencing_coverage': + # A regular expression would be better! + try: + info_for_yaml_dict['technology'][field_in_yaml] = [ + float(tech_info_to_parse.strip('(average)').strip("reads/nt").strip('(average for 6 sequences)').replace(',', '.').strip(' xX>')) + ] + except ValueError: + print(accession_version, "Couldn't make sense of Coverage '%s'" % tech_info_to_parse) + pass + elif field_in_yaml == 'sample_sequencing_technology': + new_seq_tec_list = [] + for seq_tec in tech_info_to_parse.split(';'): + seq_tec = seq_tec.strip() + if seq_tec in term_to_uri_dict: + seq_tec = term_to_uri_dict[seq_tec] + else: + missing_value_list.append('\t'.join([accession_version, 'sample_sequencing_technology', seq_tec])) + + new_seq_tec_list.append(seq_tec) + + info_for_yaml_dict['technology']['sample_sequencing_technology'] = [x for x in new_seq_tec_list] + else: + info_for_yaml_dict['technology'][field_in_yaml] = tech_info_to_parse + + + for GBFeature in GBSeq.iter('GBFeature'): + if GBFeature.find('GBFeature_key').text != 'source': + continue + + for GBQualifier in GBFeature.iter('GBQualifier'): + GBQualifier_value = GBQualifier.find('GBQualifier_value') + if GBQualifier_value is None: + continue + GBQualifier_value_text = GBQualifier_value.text + + GBQualifier_name_text = GBQualifier.find('GBQualifier_name').text + + if GBQualifier_name_text == 'host': + GBQualifier_value_text_list = GBQualifier_value_text.split('; ') + + if GBQualifier_value_text_list[0] in term_to_uri_dict: + info_for_yaml_dict['host']['host_species'] = term_to_uri_dict[GBQualifier_value_text_list[0]] + elif GBQualifier_value_text_list[0] and ('MT215193' in accession_version or 'MT270814' in accession_version): + # Information checked manually from NCBI Virus + info_for_yaml_dict['host']['host_species'] = term_to_uri_dict['Canis lupus familiaris'] + else: + missing_value_list.append('\t'.join([accession_version, 'host_species', GBQualifier_value_text_list[0]])) + + # Possible cases: + # - Homo sapiens --> ['Homo sapiens'] + # - Homo sapiens; female --> ['Homo sapiens', 'female'] + # - Homo sapiens; female 63 --> ['Homo sapiens', 'female 63'] + # - Homo sapiens; female; age 40 --> ['Homo sapiens', 'female', 'age 40'] + # - Homo sapiens; gender: F; age: 61 --> ['Homo sapiens', 'gender: F', 'age: 61'] + # - Homo sapiens; gender: M; age: 68 --> ['Homo sapiens', 'gender: M', 'age: 68'] + # - Homo sapiens; hospitalized patient --> ['Homo sapiens', 'hospitalized patient'] + # - Homo sapiens; male --> ['Homo sapiens', 'male'] + # - Homo sapiens; male; 63 --> ['Homo sapiens', 'male', '63'] + # - Homo sapiens; male; age 29 --> ['Homo sapiens', 'male', 'age 29'] + # - Homo sapiens; symptomatic --> ['Homo sapiens', 'symptomatic'] + if len(GBQualifier_value_text_list) > 1: + host_sex = '' + if 'female' in GBQualifier_value_text_list[1]: + host_sex = 'female' + elif 'male' in GBQualifier_value_text_list[1]: + host_sex = 'male' + elif 'gender' in GBQualifier_value_text_list[1]: + host_sex_one_lecter = GBQualifier_value_text_list[1].split(':')[-1].strip() + if host_sex_one_lecter in ['F', 'M']: + host_sex = 'female' if host_sex_one_lecter == 'F' else 'male' + + if host_sex in ['male', 'female']: + info_for_yaml_dict['host']['host_sex'] = "http://purl.obolibrary.org/obo/PATO_0000384" if host_sex == 'male' else "http://purl.obolibrary.org/obo/PATO_0000383" + elif GBQualifier_value_text_list[1] in term_to_uri_dict: + info_for_yaml_dict['host']['host_health_status'] = term_to_uri_dict[GBQualifier_value_text_list[1]] + else: + missing_value_list.append('\t'.join([accession_version, 'host_sex or host_health_status', GBQualifier_value_text_list[1]])) + + # Host age + host_age = -1 + if len(GBQualifier_value_text_list[1].split(' ')) > 1 and is_integer(GBQualifier_value_text_list[1].split(' ')[-1]): + host_age = int(GBQualifier_value_text_list[1].split(' ')[-1]) + elif len(GBQualifier_value_text_list) > 2 and is_integer(GBQualifier_value_text_list[2].split(' ')[-1]): + host_age = int(GBQualifier_value_text_list[2].split(' ')[-1]) + + if host_age > -1: + info_for_yaml_dict['host']['host_age'] = host_age + info_for_yaml_dict['host']['host_age_unit'] = 'http://purl.obolibrary.org/obo/UO_0000036' + elif len(GBQualifier_value_text_list) > 2: + missing_value_list.append('\t'.join([accession_version, 'host_age', GBQualifier_value_text_list[2]])) + elif GBQualifier_name_text == 'collected_by': + if any([x in GBQualifier_value_text.lower() for x in ['institute', 'hospital', 'city', 'center']]): + info_for_yaml_dict['sample']['collecting_institution'] = GBQualifier_value_text + else: + info_for_yaml_dict['sample']['collector_name'] = GBQualifier_value_text + elif GBQualifier_name_text == 'isolation_source': + if GBQualifier_value_text.upper() in term_to_uri_dict: + GBQualifier_value_text = GBQualifier_value_text.upper() # For example, in case of 'usa: wa' + + # Little cleaning + GBQualifier_value_text = GBQualifier_value_text.strip("/'") + + if GBQualifier_value_text in term_to_uri_dict: + info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict[GBQualifier_value_text]] + else: + if GBQualifier_value_text.lower() in ['np/op', 'np/op swab', 'np/np swab', 'nasopharyngeal and oropharyngeal swab', 'nasopharyngeal/oropharyngeal swab', 'combined nasopharyngeal and oropharyngeal swab']: + info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal swab'], term_to_uri_dict['oropharyngeal swab']] + elif GBQualifier_value_text.lower() in ['nasopharyngeal swab/throat swab', 'nasopharyngeal/throat swab', 'nasopharyngeal swab and throat swab', 'nasal swab and throat swab', 'nasopharyngeal aspirate/throat swab']: + info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal swab'], term_to_uri_dict['throat swab']] + elif GBQualifier_value_text.lower() in ['nasal swab and throat swab']: + info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasal swab'], term_to_uri_dict['throat swab']] + elif GBQualifier_value_text.lower() in ['nasal-swab and oro-pharyngeal swab']: + info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasal swab'], term_to_uri_dict['oropharyngeal swab']] + else: + missing_value_list.append('\t'.join([accession_version, 'specimen_source', GBQualifier_value_text])) + elif GBQualifier_name_text == 'collection_date': + # TO_DO: which format we will use? + date_to_write = GBQualifier_value_text + + if len(GBQualifier_value_text.split('-')) == 1: + if int(GBQualifier_value_text) < 2020: + date_to_write = "{}-12-15".format(GBQualifier_value_text) + else: + date_to_write = "{}-01-15".format(GBQualifier_value_text) + + if 'additional_collection_information' in info_for_yaml_dict['sample']: + info_for_yaml_dict['sample']['additional_collection_information'] += "; The 'collection_date' is estimated (the original date was: {})".format(GBQualifier_value_text) + else: + info_for_yaml_dict['sample']['additional_collection_information'] = "The 'collection_date' is estimated (the original date was: {})".format(GBQualifier_value_text) + elif len(GBQualifier_value_text.split('-')) == 2: + date_to_write += '-15' + + if 'additional_collection_information' in info_for_yaml_dict['sample']: + info_for_yaml_dict['sample']['additional_collection_information'] += "; The 'collection_date' is estimated (the original date was: {})".format(GBQualifier_value_text) + else: + info_for_yaml_dict['sample']['additional_collection_information'] = "The 'collection_date' is estimated (the original date was: {})".format(GBQualifier_value_text) + elif len(GBQualifier_value_text.split('-')) == 3: + GBQualifier_value_text_list = GBQualifier_value_text.split('-') + + if GBQualifier_value_text_list[1].isalpha(): + date_to_write = parse(GBQualifier_value_text).strftime('%Y-%m-%d') + + info_for_yaml_dict['sample']['collection_date'] = date_to_write + elif GBQualifier_name_text in ['lat_lon', 'country']: + if GBQualifier_value_text == 'Hong Kong': + GBQualifier_value_text = 'China: Hong Kong' + + if GBQualifier_value_text in term_to_uri_dict: + info_for_yaml_dict['sample']['collection_location'] = term_to_uri_dict[GBQualifier_value_text] + else: + missing_value_list.append('\t'.join([accession_version, GBQualifier_name_text, GBQualifier_value_text])) + elif GBQualifier_name_text == 'note': + if 'additional_collection_information' in info_for_yaml_dict['sample']: + info_for_yaml_dict['sample']['additional_collection_information'] += '; ' + GBQualifier_value_text + else: + info_for_yaml_dict['sample']['additional_collection_information'] = GBQualifier_value_text + elif GBQualifier_name_text == 'isolate': + info_for_yaml_dict['virus']['virus_strain'] = GBQualifier_value_text + elif GBQualifier_name_text == 'db_xref': + info_for_yaml_dict['virus']['virus_species'] = "http://purl.obolibrary.org/obo/NCBITaxon_"+GBQualifier_value_text.split('taxon:')[1] + + + # Remove technology key if empty! + if (info_for_yaml_dict['technology']=={}): + del info_for_yaml_dict['technology'] + + with open(os.path.join(dir_fasta_and_yaml, '{}.fasta'.format(accession_version)), 'w') as fw: + fw.write('>{}\n{}'.format(accession_version, GBSeq_sequence.text.upper())) + + with open(os.path.join(dir_fasta_and_yaml, '{}.yaml'.format(accession_version)), 'w') as fw: + json.dump(info_for_yaml_dict, fw, indent=2) + + + if(len(GBSeq_sequence.text) >= min_len_to_count): + num_seq_with_len_ge_X_bp += 1 + except: + print("Unexpected error for the ID {}: {}".format(accession_version, sys.exc_info()[0])) + accession_with_errors_list.append(accession_version) + continue + +if len(missing_value_list) > 0: + path_missing_terms_tsv = 'missing_terms.tsv' + print('Written missing terms in {}'.format(path_missing_terms_tsv)) + with open(path_missing_terms_tsv, 'w') as fw: + fw.write('\n'.join(missing_value_list)) + +if len(accession_with_errors_list) > 0: + path_accession_with_errors_tsv = 'accession_with_errors.tsv' + print('Written the accession with errors in {}'.format(path_accession_with_errors_tsv)) + with open(path_accession_with_errors_tsv, 'w') as fw: + fw.write('\n'.join(accession_with_errors_list)) + +print('Num. new sequences with length >= {} bp: {}'.format(min_len_to_count, num_seq_with_len_ge_X_bp)) diff --git a/scripts/from_genbank_to_fasta_and_yaml.py b/scripts/from_genbank_to_fasta_and_yaml.py deleted file mode 100755 index 6216340..0000000 --- a/scripts/from_genbank_to_fasta_and_yaml.py +++ /dev/null @@ -1,395 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -parser = argparse.ArgumentParser() -parser.add_argument('--skip-request', action='store_true', help='skip metadata and sequence request', required=False) -parser.add_argument('--only-missing-id', action='store_true', help='download only missing id', required=False) -args = parser.parse_args() - -from Bio import Entrez -Entrez.email = 'another_email@gmail.com' - -import xml.etree.ElementTree as ET -import json -import os -import requests -import sys - -from datetime import date -from dateutil.parser import parse - -num_ids_for_request = 100 - -dir_metadata = 'metadata_from_nuccore' -dir_fasta_and_yaml = 'fasta_and_yaml' -dir_dict_ontology_standardization = 'dict_ontology_standardization/' - -today_date = date.today().strftime("%Y.%m.%d") -path_ncbi_virus_accession = 'sequences.{}.acc'.format(today_date) - -def is_integer(string_to_check): - try: - int(string_to_check) - return True - except ValueError: - return False - -def chunks(lst, n): - for i in range(0, len(lst), n): - yield lst[i:i + n] - -if os.path.exists(dir_metadata): - print("The directory '{}' already exists.".format(dir_metadata)) - - if not args.skip_request: - print("\tTo start the request, delete the directory '{}' or specify --skip-request.".format(dir_metadata)) - sys.exit(-1) - - -accession_already_downloaded_set = [] - -if os.path.exists(dir_fasta_and_yaml): - print("The directory '{}' already exists.".format(dir_fasta_and_yaml)) - if not args.only_missing_id: - print("To start the download, delete the directory '{}' or specify --only-missing-id.".format(dir_fasta_and_yaml)) - sys.exit(-1) - - accession_already_downloaded_set = set([x.split('.yaml')[0].split('.')[0] for x in os.listdir(dir_fasta_and_yaml) if x.endswith('.yaml')]) - print('There are {} accession already downloaded.'.format(len(accession_already_downloaded_set))) - - -if not os.path.exists(dir_metadata): - # Take all the ids - id_set = set() - - # Try to search several strings - term_list = ['SARS-CoV-2', 'SARS-CoV2', 'SARS CoV2', 'SARSCoV2', 'txid2697049[Organism]'] - for term in term_list: - tmp_list = Entrez.read( - Entrez.esearch(db='nuccore', term=term, idtype='acc', retmax='10000') - )['IdList'] - - # Remove mRNAs, ncRNAs, Proteins, and predicted models (more information here: https://en.wikipedia.org/wiki/RefSeq) - tmp_list = [x for x in tmp_list if x[:2] not in ['NM', 'NR', 'NP', 'XM', 'XR', 'XP', 'WP']] - - # Remove the version in the id - tmp_list = [x.split('.')[0] for x in tmp_list] - - #tmp_list = tmp_list[0:2] # restricting to small run - new_ids_set = set([x.split('.')[0] for x in tmp_list]) - new_ids = len(new_ids_set.difference(id_set)) - id_set.update(new_ids_set) - - print('Term:', term, '-->', new_ids, 'new IDs from', len(tmp_list), '---> Total unique IDs:', len(id_set)) - - if not os.path.exists(path_ncbi_virus_accession): - r = requests.get('https://www.ncbi.nlm.nih.gov/genomes/VirusVariation/vvsearch2/?q=*:*&fq=%7B!tag=SeqType_s%7DSeqType_s:(%22Nucleotide%22)&fq=VirusLineageId_ss:(2697049)&cmd=download&sort=SourceDB_s%20desc,CreateDate_dt%20desc,id%20asc&dlfmt=acc&fl=id') - with open(path_ncbi_virus_accession, 'w') as fw: - fw.write(r.text) - - with open(path_ncbi_virus_accession) as f: - tmp_list = [line.strip('\n') for line in f] - - new_ids = len(set(tmp_list).difference(id_set)) - id_set.update(tmp_list) - - print('DB: NCBI Virus', today_date, '-->', new_ids, 'new IDs from', len(tmp_list), '---> Total unique IDs:', len(id_set)) - - if len(accession_already_downloaded_set) > 0: - id_set = id_set.difference(accession_already_downloaded_set) - print('There are {} missing IDs to download.'.format(len(id_set))) - - os.makedirs(dir_metadata) - for i, id_x_list in enumerate(chunks(list(id_set), num_ids_for_request)): - path_metadata_xxx_xml = os.path.join(dir_metadata, 'metadata_{}.xml'.format(i)) - print('Requesting {} ids --> {}'.format(len(id_x_list), path_metadata_xxx_xml)) - - with open(path_metadata_xxx_xml, 'w') as fw: - fw.write( - Entrez.efetch(db='nuccore', id=id_x_list, retmode='xml').read() - ) - - -term_to_uri_dict = {} - -for path_dict_xxx_csv in [os.path.join(dir_dict_ontology_standardization, name_xxx_csv) for name_xxx_csv in os.listdir(dir_dict_ontology_standardization) if name_xxx_csv.endswith('.csv')]: - print('Read {}'.format(path_dict_xxx_csv)) - - with open(path_dict_xxx_csv) as f: - for line in f: - if len(line.split(',')) > 2: - term, uri = line.strip('\n').split('",') - term = term.strip('"') - else: - term, uri = line.strip('\n').split(',') - - if term in term_to_uri_dict: - print('Warning: in the dictionaries there are more entries for the same term ({}).'.format(term)) - continue - - term_to_uri_dict[term] = uri - -if not os.path.exists(dir_fasta_and_yaml): - os.makedirs(dir_fasta_and_yaml) - -min_len_to_count = 27500 -num_seq_with_len_ge_X_bp = 0 - -missing_value_list = [] -accession_with_errors_list = [] - -for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) for name_metadata_xxx_xml in os.listdir(dir_metadata) if name_metadata_xxx_xml.endswith('.xml')]: - tree = ET.parse(path_metadata_xxx_xml) - GBSet = tree.getroot() - - for GBSeq in GBSet: - accession_version = GBSeq.find('GBSeq_accession-version').text - - GBSeq_sequence = GBSeq.find('GBSeq_sequence') - if GBSeq_sequence is None: - print(accession_version, ' - sequence not found') - continue - - try: - #print(path_metadata_xxx_xml, accession_version) - - # A general default-empty yaml could be read from the definitive one - info_for_yaml_dict = { - 'id': 'placeholder', - 'host': {}, - 'sample': {}, - 'virus': {}, - 'technology': {}, - 'submitter': {} - } - - - info_for_yaml_dict['sample']['sample_id'] = accession_version - info_for_yaml_dict['sample']['source_database_accession'] = ["http://identifiers.org/insdc/"+accession_version+"#sequence"] #accession is turned into resolvable URL/URI now - - - # submitter info - GBSeq_references = GBSeq.find('GBSeq_references') - if GBSeq_references is not None: - info_for_yaml_dict['submitter']['authors'] = ["{}".format(x.text) for x in GBSeq_references.iter('GBAuthor')] - - GBReference = GBSeq_references.find('GBReference') - if GBReference is not None: - GBReference_journal = GBReference.find('GBReference_journal') - - if GBReference_journal is not None and GBReference_journal.text != 'Unpublished': - if 'Submitted' in GBReference_journal.text: - info_for_yaml_dict['submitter']['submitter_name'] = ["{}".format(GBReference_journal.text.split(') ')[1].split(',')[0].strip())] - info_for_yaml_dict['submitter']['submitter_address'] = ','.join(GBReference_journal.text.split(') ')[1].split(',')[1:]).strip() - else: - info_for_yaml_dict['submitter']['additional_submitter_information'] = GBReference_journal.text - - - GBSeq_comment = GBSeq.find('GBSeq_comment') - if GBSeq_comment is not None and 'Assembly-Data' in GBSeq_comment.text: - prefix_split_string = '##Genome-Assembly' if GBSeq_comment.text.startswith('##Genome-') else '##Assembly' - - GBSeq_comment_text = GBSeq_comment.text.split( - '{}-Data-START## ; '.format(prefix_split_string) - )[1].split(' ; {}-Data-END##'.format(prefix_split_string))[0] - - for info_to_check, field_in_yaml in zip( - ['Assembly Method', 'Coverage', 'Sequencing Technology'], - ['sequence_assembly_method', 'sequencing_coverage', 'sample_sequencing_technology'] - ): - if info_to_check in GBSeq_comment_text: - tech_info_to_parse = GBSeq_comment_text.split('{} :: '.format(info_to_check))[1].split(' ;')[0] - - if field_in_yaml == 'sequencing_coverage': - # A regular expression would be better! - try: - info_for_yaml_dict['technology'][field_in_yaml] = [ - float(tech_info_to_parse.strip('(average)').strip("reads/nt").strip('(average for 6 sequences)').replace(',', '.').strip(' xX>')) - ] - except ValueError: - print(accession_version, "Couldn't make sense of Coverage '%s'" % tech_info_to_parse) - pass - elif field_in_yaml == 'sample_sequencing_technology': - new_seq_tec_list = [] - for seq_tec in tech_info_to_parse.split(';'): - seq_tec = seq_tec.strip() - if seq_tec in term_to_uri_dict: - seq_tec = term_to_uri_dict[seq_tec] - else: - missing_value_list.append('\t'.join([accession_version, 'sample_sequencing_technology', seq_tec])) - - new_seq_tec_list.append(seq_tec) - - info_for_yaml_dict['technology']['sample_sequencing_technology'] = [x for x in new_seq_tec_list] - else: - info_for_yaml_dict['technology'][field_in_yaml] = tech_info_to_parse - - - for GBFeature in GBSeq.iter('GBFeature'): - if GBFeature.find('GBFeature_key').text != 'source': - continue - - for GBQualifier in GBFeature.iter('GBQualifier'): - GBQualifier_value = GBQualifier.find('GBQualifier_value') - if GBQualifier_value is None: - continue - GBQualifier_value_text = GBQualifier_value.text - - GBQualifier_name_text = GBQualifier.find('GBQualifier_name').text - - if GBQualifier_name_text == 'host': - GBQualifier_value_text_list = GBQualifier_value_text.split('; ') - - if GBQualifier_value_text_list[0] in term_to_uri_dict: - info_for_yaml_dict['host']['host_species'] = term_to_uri_dict[GBQualifier_value_text_list[0]] - elif GBQualifier_value_text_list[0] and ('MT215193' in accession_version or 'MT270814' in accession_version): - # Information checked manually from NCBI Virus - info_for_yaml_dict['host']['host_species'] = term_to_uri_dict['Canis lupus familiaris'] - else: - missing_value_list.append('\t'.join([accession_version, 'host_species', GBQualifier_value_text_list[0]])) - - # Possible cases: - # - Homo sapiens --> ['Homo sapiens'] - # - Homo sapiens; female --> ['Homo sapiens', 'female'] - # - Homo sapiens; female 63 --> ['Homo sapiens', 'female 63'] - # - Homo sapiens; female; age 40 --> ['Homo sapiens', 'female', 'age 40'] - # - Homo sapiens; gender: F; age: 61 --> ['Homo sapiens', 'gender: F', 'age: 61'] - # - Homo sapiens; gender: M; age: 68 --> ['Homo sapiens', 'gender: M', 'age: 68'] - # - Homo sapiens; hospitalized patient --> ['Homo sapiens', 'hospitalized patient'] - # - Homo sapiens; male --> ['Homo sapiens', 'male'] - # - Homo sapiens; male; 63 --> ['Homo sapiens', 'male', '63'] - # - Homo sapiens; male; age 29 --> ['Homo sapiens', 'male', 'age 29'] - # - Homo sapiens; symptomatic --> ['Homo sapiens', 'symptomatic'] - if len(GBQualifier_value_text_list) > 1: - host_sex = '' - if 'female' in GBQualifier_value_text_list[1]: - host_sex = 'female' - elif 'male' in GBQualifier_value_text_list[1]: - host_sex = 'male' - elif 'gender' in GBQualifier_value_text_list[1]: - host_sex_one_lecter = GBQualifier_value_text_list[1].split(':')[-1].strip() - if host_sex_one_lecter in ['F', 'M']: - host_sex = 'female' if host_sex_one_lecter == 'F' else 'male' - - if host_sex in ['male', 'female']: - info_for_yaml_dict['host']['host_sex'] = "http://purl.obolibrary.org/obo/PATO_0000384" if host_sex == 'male' else "http://purl.obolibrary.org/obo/PATO_0000383" - elif GBQualifier_value_text_list[1] in term_to_uri_dict: - info_for_yaml_dict['host']['host_health_status'] = term_to_uri_dict[GBQualifier_value_text_list[1]] - else: - missing_value_list.append('\t'.join([accession_version, 'host_sex or host_health_status', GBQualifier_value_text_list[1]])) - - # Host age - host_age = -1 - if len(GBQualifier_value_text_list[1].split(' ')) > 1 and is_integer(GBQualifier_value_text_list[1].split(' ')[-1]): - host_age = int(GBQualifier_value_text_list[1].split(' ')[-1]) - elif len(GBQualifier_value_text_list) > 2 and is_integer(GBQualifier_value_text_list[2].split(' ')[-1]): - host_age = int(GBQualifier_value_text_list[2].split(' ')[-1]) - - if host_age > -1: - info_for_yaml_dict['host']['host_age'] = host_age - info_for_yaml_dict['host']['host_age_unit'] = 'http://purl.obolibrary.org/obo/UO_0000036' - elif len(GBQualifier_value_text_list) > 2: - missing_value_list.append('\t'.join([accession_version, 'host_age', GBQualifier_value_text_list[2]])) - elif GBQualifier_name_text == 'collected_by': - if any([x in GBQualifier_value_text.lower() for x in ['institute', 'hospital', 'city', 'center']]): - info_for_yaml_dict['sample']['collecting_institution'] = GBQualifier_value_text - else: - info_for_yaml_dict['sample']['collector_name'] = GBQualifier_value_text - elif GBQualifier_name_text == 'isolation_source': - if GBQualifier_value_text.upper() in term_to_uri_dict: - GBQualifier_value_text = GBQualifier_value_text.upper() # For example, in case of 'usa: wa' - - # Little cleaning - GBQualifier_value_text = GBQualifier_value_text.strip("/'") - - if GBQualifier_value_text in term_to_uri_dict: - info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict[GBQualifier_value_text]] - else: - if GBQualifier_value_text.lower() in ['np/op', 'np/op swab', 'np/np swab', 'nasopharyngeal and oropharyngeal swab', 'nasopharyngeal/oropharyngeal swab', 'combined nasopharyngeal and oropharyngeal swab']: - info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal swab'], term_to_uri_dict['oropharyngeal swab']] - elif GBQualifier_value_text.lower() in ['nasopharyngeal swab/throat swab', 'nasopharyngeal/throat swab', 'nasopharyngeal swab and throat swab', 'nasal swab and throat swab', 'nasopharyngeal aspirate/throat swab']: - info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal swab'], term_to_uri_dict['throat swab']] - elif GBQualifier_value_text.lower() in ['nasal swab and throat swab']: - info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasal swab'], term_to_uri_dict['throat swab']] - elif GBQualifier_value_text.lower() in ['nasal-swab and oro-pharyngeal swab']: - info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasal swab'], term_to_uri_dict['oropharyngeal swab']] - else: - missing_value_list.append('\t'.join([accession_version, 'specimen_source', GBQualifier_value_text])) - elif GBQualifier_name_text == 'collection_date': - # TO_DO: which format we will use? - date_to_write = GBQualifier_value_text - - if len(GBQualifier_value_text.split('-')) == 1: - if int(GBQualifier_value_text) < 2020: - date_to_write = "{}-12-15".format(GBQualifier_value_text) - else: - date_to_write = "{}-01-15".format(GBQualifier_value_text) - - if 'additional_collection_information' in info_for_yaml_dict['sample']: - info_for_yaml_dict['sample']['additional_collection_information'] += "; The 'collection_date' is estimated (the original date was: {})".format(GBQualifier_value_text) - else: - info_for_yaml_dict['sample']['additional_collection_information'] = "The 'collection_date' is estimated (the original date was: {})".format(GBQualifier_value_text) - elif len(GBQualifier_value_text.split('-')) == 2: - date_to_write += '-15' - - if 'additional_collection_information' in info_for_yaml_dict['sample']: - info_for_yaml_dict['sample']['additional_collection_information'] += "; The 'collection_date' is estimated (the original date was: {})".format(GBQualifier_value_text) - else: - info_for_yaml_dict['sample']['additional_collection_information'] = "The 'collection_date' is estimated (the original date was: {})".format(GBQualifier_value_text) - elif len(GBQualifier_value_text.split('-')) == 3: - GBQualifier_value_text_list = GBQualifier_value_text.split('-') - - if GBQualifier_value_text_list[1].isalpha(): - date_to_write = parse(GBQualifier_value_text).strftime('%Y-%m-%d') - - info_for_yaml_dict['sample']['collection_date'] = date_to_write - elif GBQualifier_name_text in ['lat_lon', 'country']: - if GBQualifier_value_text == 'Hong Kong': - GBQualifier_value_text = 'China: Hong Kong' - - if GBQualifier_value_text in term_to_uri_dict: - info_for_yaml_dict['sample']['collection_location'] = term_to_uri_dict[GBQualifier_value_text] - else: - missing_value_list.append('\t'.join([accession_version, GBQualifier_name_text, GBQualifier_value_text])) - elif GBQualifier_name_text == 'note': - if 'additional_collection_information' in info_for_yaml_dict['sample']: - info_for_yaml_dict['sample']['additional_collection_information'] += '; ' + GBQualifier_value_text - else: - info_for_yaml_dict['sample']['additional_collection_information'] = GBQualifier_value_text - elif GBQualifier_name_text == 'isolate': - info_for_yaml_dict['virus']['virus_strain'] = GBQualifier_value_text - elif GBQualifier_name_text == 'db_xref': - info_for_yaml_dict['virus']['virus_species'] = "http://purl.obolibrary.org/obo/NCBITaxon_"+GBQualifier_value_text.split('taxon:')[1] - - - # Remove technology key if empty! - if (info_for_yaml_dict['technology']=={}): - del info_for_yaml_dict['technology'] - - with open(os.path.join(dir_fasta_and_yaml, '{}.fasta'.format(accession_version)), 'w') as fw: - fw.write('>{}\n{}'.format(accession_version, GBSeq_sequence.text.upper())) - - with open(os.path.join(dir_fasta_and_yaml, '{}.yaml'.format(accession_version)), 'w') as fw: - json.dump(info_for_yaml_dict, fw, indent=2) - - - if(len(GBSeq_sequence.text) >= min_len_to_count): - num_seq_with_len_ge_X_bp += 1 - except: - print("Unexpected error for the ID {}: {}".format(accession_version, sys.exc_info()[0])) - accession_with_errors_list.append(accession_version) - continue - -if len(missing_value_list) > 0: - path_missing_terms_tsv = 'missing_terms.tsv' - print('Written missing terms in {}'.format(path_missing_terms_tsv)) - with open(path_missing_terms_tsv, 'w') as fw: - fw.write('\n'.join(missing_value_list)) - -if len(accession_with_errors_list) > 0: - path_accession_with_errors_tsv = 'accession_with_errors.tsv' - print('Written the accession with errors in {}'.format(path_accession_with_errors_tsv)) - with open(path_accession_with_errors_tsv, 'w') as fw: - fw.write('\n'.join(accession_with_errors_list)) - -print('Num. new sequences with length >= {} bp: {}'.format(min_len_to_count, num_seq_with_len_ge_X_bp)) diff --git a/scripts/sequences.acc b/scripts/sequences.acc deleted file mode 100644 index 697d868..0000000 --- a/scripts/sequences.acc +++ /dev/null @@ -1,1731 +0,0 @@ -NC_045512 -MT394528 -MT394529 -MT394530 -MT394531 -MT394864 -MT396241 -MT396242 -MT396243 -MT396244 -MT396245 -MT396246 -MT396247 -MT396248 -MT396266 -MT380726 -MT380727 -MT380728 -MT380729 -MT380730 -MT380731 -MT380732 -MT380733 -MT380734 -MT385414 -MT385415 -MT385416 -MT385417 -MT385418 -MT385419 -MT385420 -MT385421 -MT385422 -MT385423 -MT385424 -MT385425 -MT385426 -MT385427 -MT385428 -MT385429 -MT385430 -MT385431 -MT385432 -MT385433 -MT385434 -MT385435 -MT385436 -MT385437 -MT385438 -MT385439 -MT385440 -MT385441 -MT385442 -MT385443 -MT385444 -MT385445 -MT385446 -MT385447 -MT385448 -MT385449 -MT385450 -MT385451 -MT385452 -MT385453 -MT385454 -MT385455 -MT385456 -MT385457 -MT385458 -MT385459 -MT385460 -MT385461 -MT385462 -MT385463 -MT385464 -MT385465 -MT385466 -MT385467 -MT385468 -MT385469 -MT385470 -MT385471 -MT385472 -MT385473 -MT385474 -MT385475 -MT385476 -MT385477 -MT385478 -MT385479 -MT385480 -MT385481 -MT385482 -MT385483 -MT385484 -MT385485 -MT385486 -MT385487 -MT385488 -MT385489 -MT385490 -MT385491 -MT385492 -MT385493 -MT385494 -MT385495 -MT385496 -MT385497 -MT186683 -MT252677 -MT252678 -MT252679 -MT252680 -MT252681 -MT252682 -MT252683 -MT252684 -MT252685 -MT252686 -MT252687 -MT252688 -MT252689 -MT252690 -MT252691 -MT252692 -MT252693 -MT252694 -MT252695 -MT252696 -MT252697 -MT252698 -MT252699 -MT252700 -MT252701 -MT252702 -MT252703 -MT252704 -MT252705 -MT252706 -MT252707 -MT252708 -MT252709 -MT252710 -MT252711 -MT252712 -MT252713 -MT252715 -MT252716 -MT252717 -MT252719 -MT252721 -MT252723 -MT252725 -MT252726 -MT252728 -MT252729 -MT252730 -MT252733 -MT252734 -MT252735 -MT252736 -MT252737 -MT252738 -MT252739 -MT252740 -MT252741 -MT252742 -MT252745 -MT252746 -MT252747 -MT252748 -MT252749 -MT252756 -MT252757 -MT252758 -MT252761 -MT252763 -MT252764 -MT252765 -MT252766 -MT252767 -MT252768 -MT252769 -MT252770 -MT252771 -MT252772 -MT252773 -MT252774 -MT252775 -MT252778 -MT252779 -MT252780 -MT252781 -MT252782 -MT252783 -MT252784 -MT252785 -MT252787 -MT252788 -MT252792 -MT252793 -MT252794 -MT252795 -MT252797 -MT252798 -MT252799 -MT252800 -MT252801 -MT252802 -MT252803 -MT252804 -MT252805 -MT252806 -MT252807 -MT252808 -MT252809 -MT252810 -MT252811 -MT252821 -MT252822 -MT252823 -MT252824 -MT339043 -MT365033 -MT374101 -MT374102 -MT374103 -MT374104 -MT374105 -MT374106 -MT374107 -MT374108 -MT374109 -MT374110 -MT374111 -MT374112 -MT374113 -MT374114 -MT374115 -MT374116 -MT375428 -MT375429 -MT375430 -MT375431 -MT375432 -MT375433 -MT375434 -MT375435 -MT375436 -MT375437 -MT375438 -MT375439 -MT375440 -MT375441 -MT375442 -MT375443 -MT375444 -MT375445 -MT375446 -MT375447 -MT375448 -MT375449 -MT375450 -MT375451 -MT375452 -MT375453 -MT375454 -MT375455 -MT375456 -MT375457 -MT375458 -MT375459 -MT375460 -MT375461 -MT375462 -MT375463 -MT375464 -MT375465 -MT375466 -MT375467 -MT375468 -MT375469 -MT375470 -MT375471 -MT375472 -MT375473 -MT375474 -MT375475 -MT375476 -MT375477 -MT375478 -MT375479 -MT375480 -MT375481 -MT375482 -MT375483 -MT370516 -MT370517 -MT370518 -MT370831 -MT370832 -MT370833 -MT370834 -MT370835 -MT370836 -MT370837 -MT370838 -MT370839 -MT370840 -MT370841 -MT370842 -MT370843 -MT370844 -MT370845 -MT370846 -MT370847 -MT370848 -MT370849 -MT370850 -MT370851 -MT370852 -MT370853 -MT370854 -MT370855 -MT370856 -MT370857 -MT370858 -MT370859 -MT370860 -MT370861 -MT370862 -MT370863 -MT370864 -MT370865 -MT370866 -MT370867 -MT370868 -MT370869 -MT370870 -MT370871 -MT370872 -MT370873 -MT370874 -MT370875 -MT370876 -MT370877 -MT370878 -MT370879 -MT370880 -MT370881 -MT370882 -MT370883 -MT370884 -MT370885 -MT370886 -MT370887 -MT370888 -MT370889 -MT370890 -MT370891 -MT370892 -MT370893 -MT370894 -MT370895 -MT370896 -MT370897 -MT370898 -MT370899 -MT370900 -MT370901 -MT370902 -MT370903 -MT370904 -MT370905 -MT370906 -MT370907 -MT370908 -MT370909 -MT370910 -MT370911 -MT370912 -MT370913 -MT370914 -MT370915 -MT370916 -MT370917 -MT370918 -MT370919 -MT370920 -MT370921 -MT370922 -MT370923 -MT370924 -MT370925 -MT370926 -MT370927 -MT370928 -MT370929 -MT370930 -MT370931 -MT370932 -MT370933 -MT370934 -MT370935 -MT370936 -MT370937 -MT370938 -MT370939 -MT370940 -MT370941 -MT370942 -MT370943 -MT370944 -MT370945 -MT370946 -MT370947 -MT370948 -MT370949 -MT370950 -MT370951 -MT370952 -MT370953 -MT370954 -MT370955 -MT370956 -MT370957 -MT370958 -MT370959 -MT370960 -MT370961 -MT370962 -MT370963 -MT370964 -MT370965 -MT370966 -MT370967 -MT370968 -MT370969 -MT370970 -MT370971 -MT370972 -MT370973 -MT370974 -MT370975 -MT370976 -MT370977 -MT370978 -MT370979 -MT370980 -MT370981 -MT370982 -MT370983 -MT370984 -MT370985 -MT370986 -MT370987 -MT370988 -MT370989 -MT370990 -MT370991 -MT370992 -MT370993 -MT370994 -MT370995 -MT370996 -MT370997 -MT370998 -MT370999 -MT371000 -MT371001 -MT371002 -MT371003 -MT371004 -MT371005 -MT371006 -MT371007 -MT371008 -MT371009 -MT371010 -MT371011 -MT371012 -MT371013 -MT371014 -MT371015 -MT371016 -MT371017 -MT371018 -MT371019 -MT371020 -MT371021 -MT371022 -MT371023 -MT371024 -MT371025 -MT371026 -MT371027 -MT371028 -MT371029 -MT371030 -MT371031 -MT371032 -MT371033 -MT371034 -MT371035 -MT371036 -MT371037 -MT371038 -MT371047 -MT371048 -MT371049 -MT371050 -MT371568 -MT371569 -MT371570 -MT371571 -MT371572 -MT371573 -MT371574 -MT372480 -MT372481 -MT372482 -MT372483 -7BV2_P -7BV2_T -LC542976 -LC542809 -MT114412 -MT114413 -MT114414 -MT114415 -MT114416 -MT114417 -MT114418 -MT114419 -MT230904 -MT358401 -MT358402 -MT358637 -MT358644 -MT358645 -MT358646 -MT358647 -MT358648 -MT358649 -MT358650 -MT358651 -MT358652 -MT358653 -MT358654 -MT358655 -MT358656 -MT358657 -MT358658 -MT358659 -MT358660 -MT358661 -MT358662 -MT358663 -MT358664 -MT358665 -MT358666 -MT358667 -MT358668 -MT358669 -MT358670 -MT358671 -MT358672 -MT358673 -MT358674 -MT358675 -MT358676 -MT358677 -MT358678 -MT358679 -MT358680 -MT358681 -MT358682 -MT358683 -MT358684 -MT358685 -MT358686 -MT358687 -MT358688 -MT358689 -MT358690 -MT358691 -MT358692 -MT358693 -MT358694 -MT358695 -MT358696 -MT358697 -MT358698 -MT358699 -MT358700 -MT358701 -MT358702 -MT358703 -MT358704 -MT358705 -MT358706 -MT358707 -MT358708 -MT358709 -MT358710 -MT358711 -MT358712 -MT358713 -MT358714 -MT358715 -MT358716 -MT358717 -MT358718 -MT358719 -MT358720 -MT358721 -MT358722 -MT358723 -MT358724 -MT358725 -MT358726 -MT358727 -MT358728 -MT358729 -MT358730 -MT358731 -MT358732 -MT358733 -MT358734 -MT358735 -MT358736 -MT358737 -MT358738 -MT358739 -MT358740 -MT358741 -MT358742 -MT358743 -MT358744 -MT358745 -MT358746 -MT358747 -MT358748 -MT359231 -MT359865 -MT359866 -MT350234 -MT350236 -MT350237 -MT350238 -MT350239 -MT350240 -MT350241 -MT350242 -MT350243 -MT350244 -MT350245 -MT350246 -MT350247 -MT350248 -MT350249 -MT350250 -MT350251 -MT350252 -MT350253 -MT350254 -MT350255 -MT350256 -MT350257 -MT350263 -MT350264 -MT350265 -MT350266 -MT350267 -MT350268 -MT350269 -MT350270 -MT350271 -MT350272 -MT350273 -MT350274 -MT350275 -MT350276 -MT350277 -MT350278 -MT350279 -MT350280 -MT350282 -MT344135 -MT344944 -MT344945 -MT344946 -MT344947 -MT344948 -MT344949 -MT344950 -MT344951 -MT344952 -MT344953 -MT344954 -MT344955 -MT344956 -MT344957 -MT344958 -MT344959 -MT344960 -MT344961 -MT344962 -MT344963 -MT345798 -MT345799 -MT345800 -MT345801 -MT345802 -MT345803 -MT345804 -MT345805 -MT345806 -MT345807 -MT345808 -MT345809 -MT345810 -MT345811 -MT345812 -MT345813 -MT345814 -MT345815 -MT345816 -MT345817 -MT345818 -MT345819 -MT345820 -MT345821 -MT345822 -MT345823 -MT345824 -MT345825 -MT345826 -MT345827 -MT345828 -MT345829 -MT345830 -MT345831 -MT345832 -MT345833 -MT345834 -MT345835 -MT345836 -MT345837 -MT345838 -MT345839 -MT345840 -MT345841 -MT345842 -MT345843 -MT345844 -MT345845 -MT345846 -MT345847 -MT345848 -MT345849 -MT345850 -MT345851 -MT345852 -MT345853 -MT345854 -MT345855 -MT345856 -MT345857 -MT345858 -MT345859 -MT345860 -MT345861 -MT345862 -MT345863 -MT345864 -MT345865 -MT345866 -MT345867 -MT345868 -MT345869 -MT345870 -MT345871 -MT345872 -MT345873 -MT345874 -MT345875 -MT345876 -MT345877 -MT345878 -MT345879 -MT345880 -MT345881 -MT345882 -MT345883 -MT345884 -MT345885 -MT345886 -MT345887 -MT345888 -MT339039 -MT339040 -MT339041 -MT334522 -MT334523 -MT334524 -MT334525 -MT334526 -MT334527 -MT334528 -MT334529 -MT334530 -MT334531 -MT334532 -MT334533 -MT334534 -MT334535 -MT334536 -MT334537 -MT334538 -MT334539 -MT334540 -MT334541 -MT334542 -MT334543 -MT334544 -MT334545 -MT334546 -MT334547 -MT334548 -MT334549 -MT334550 -MT334551 -MT334552 -MT334553 -MT334554 -MT334555 -MT334556 -MT334557 -MT334558 -MT334559 -MT334560 -MT334561 -MT334562 -MT334563 -MT334564 -MT334565 -MT334566 -MT334567 -MT334568 -MT334569 -MT334570 -MT334571 -MT334572 -MT334573 -MT324062 -MT324679 -MT324680 -MT324681 -MT324682 -MT324683 -MT324684 -MT325561 -MT325562 -MT325563 -MT325564 -MT325565 -MT325566 -MT325567 -MT325568 -MT325569 -MT325570 -MT325571 -MT325572 -MT325573 -MT325574 -MT325575 -MT325576 -MT325577 -MT325578 -MT325579 -MT325580 -MT325581 -MT325582 -MT325583 -MT325584 -MT325585 -MT325586 -MT325587 -MT325588 -MT325589 -MT325590 -MT325591 -MT325592 -MT325593 -MT325594 -MT325595 -MT325596 -MT325597 -MT325598 -MT325599 -MT325600 -MT325601 -MT325602 -MT325603 -MT325604 -MT325605 -MT325606 -MT325607 -MT325608 -MT325609 -MT325610 -MT325611 -MT325612 -MT325613 -MT325614 -MT325615 -MT325616 -MT325617 -MT325618 -MT325619 -MT325620 -MT325621 -MT325622 -MT325623 -MT325624 -MT325625 -MT325626 -MT325627 -MT325628 -MT325629 -MT325630 -MT325631 -MT325632 -MT325633 -MT325634 -MT325635 -MT325636 -MT325637 -MT325638 -MT325639 -MT325640 -MT326023 -MT326024 -MT326025 -MT326026 -MT326027 -MT326028 -MT326029 -MT326030 -MT326031 -MT326032 -MT326033 -MT326034 -MT326035 -MT326036 -MT326037 -MT326038 -MT326039 -MT326040 -MT326041 -MT326042 -MT326043 -MT326044 -MT326045 -MT326046 -MT326047 -MT326048 -MT326049 -MT326050 -MT326051 -MT326052 -MT326053 -MT326054 -MT326055 -MT326056 -MT326057 -MT326058 -MT326059 -MT326060 -MT326061 -MT326062 -MT326063 -MT326064 -MT326065 -MT326066 -MT326067 -MT326068 -MT326069 -MT326070 -MT326071 -MT326072 -MT326073 -MT326074 -MT326075 -MT326076 -MT326077 -MT326078 -MT326079 -MT326080 -MT326081 -MT326082 -MT326083 -MT326084 -MT326085 -MT326086 -MT326087 -MT326088 -MT326089 -MT326090 -MT326091 -MT326092 -MT326093 -MT326094 -MT326095 -MT326096 -MT326097 -MT326098 -MT326099 -MT326100 -MT326101 -MT326102 -MT326103 -MT326104 -MT326105 -MT326106 -MT326107 -MT326108 -MT326109 -MT326110 -MT326111 -MT326112 -MT326113 -MT326114 -MT326115 -MT326116 -MT326117 -MT326118 -MT326119 -MT326120 -MT326121 -MT326122 -MT326123 -MT326124 -MT326125 -MT326126 -MT326127 -MT326128 -MT326129 -MT326130 -MT326131 -MT326132 -MT326133 -MT326134 -MT326135 -MT326136 -MT326137 -MT326138 -MT326139 -MT326140 -MT326141 -MT326142 -MT326143 -MT326144 -MT326145 -MT326146 -MT326147 -MT326148 -MT326149 -MT326150 -MT326151 -MT326152 -MT326153 -MT326154 -MT326155 -MT326156 -MT326157 -MT326158 -MT326159 -MT326160 -MT326161 -MT326162 -MT326163 -MT326164 -MT326165 -MT326166 -MT326167 -MT326168 -MT326169 -MT326170 -MT326171 -MT326172 -MT326173 -MT326174 -MT326175 -MT326176 -MT326177 -MT326178 -MT326179 -MT326180 -MT326181 -MT326182 -MT326183 -MT326184 -MT326185 -MT326186 -MT326187 -MT326188 -MT326189 -MT326190 -MT326191 -MT327745 -MT328032 -MT328033 -MT328034 -MT328035 -MT039874 -MT077125 -MT322394 -MT322395 -MT322396 -MT322397 -MT322398 -MT322399 -MT322400 -MT322401 -MT322402 -MT322403 -MT322404 -MT322405 -MT322406 -MT322407 -MT322408 -MT322409 -MT322410 -MT322411 -MT322412 -MT322413 -MT322414 -MT322415 -MT322416 -MT322417 -MT322418 -MT322419 -MT322420 -MT322421 -MT322422 -MT322423 -MT322424 -MT320538 -MT320891 -MT308692 -MT308693 -MT308694 -MT308695 -MT308696 -MT308697 -MT308698 -MT308699 -MT308700 -MT308701 -MT308702 -MT308703 -MT308704 -MT293547 -MT300186 -MT304474 -MT304475 -MT304476 -MT304477 -MT304478 -MT304479 -MT304480 -MT304481 -MT304482 -MT304483 -MT304484 -MT304485 -MT304486 -MT304487 -MT304488 -MT304489 -MT304490 -MT304491 -MT273658 -MT281530 -MT281577 -MT291826 -MT291827 -MT291828 -MT291829 -MT291830 -MT291831 -MT291832 -MT291833 -MT291834 -MT291835 -MT291836 -MT292569 -MT292570 -MT292571 -MT292572 -MT292573 -MT292574 -MT292575 -MT292576 -MT292577 -MT292578 -MT292579 -MT292580 -MT292581 -MT292582 -MT293156 -MT293157 -MT293158 -MT293159 -MT293160 -MT293161 -MT293162 -MT293163 -MT293164 -MT293165 -MT293166 -MT293167 -MT293168 -MT293169 -MT293170 -MT293171 -MT293172 -MT293173 -MT293174 -MT293175 -MT293176 -MT293177 -MT293178 -MT293179 -MT293180 -MT293181 -MT293182 -MT293183 -MT293184 -MT293185 -MT293186 -MT293187 -MT293188 -MT293189 -MT293190 -MT293191 -MT293192 -MT293193 -MT293194 -MT293195 -MT293196 -MT293197 -MT293198 -MT293199 -MT293200 -MT293201 -MT293202 -MT293203 -MT293204 -MT293205 -MT293206 -MT293207 -MT293208 -MT293209 -MT293210 -MT293211 -MT293212 -MT293213 -MT293214 -MT293215 -MT293216 -MT293217 -MT293218 -MT293219 -MT293220 -MT293221 -MT293222 -MT293223 -MT293224 -MT293225 -MT295464 -MT295465 -MT276323 -MT276324 -MT276325 -MT276326 -MT276327 -MT276328 -MT276329 -MT276330 -MT276331 -MT276597 -MT276598 -MT262896 -MT262897 -MT262898 -MT262899 -MT262900 -MT262901 -MT262902 -MT262903 -MT262904 -MT262905 -MT262906 -MT262907 -MT262908 -MT262909 -MT262910 -MT262911 -MT262912 -MT262913 -MT262914 -MT262915 -MT262916 -MT262993 -MT263074 -MT263381 -MT263382 -MT263383 -MT263384 -MT263385 -MT263386 -MT263387 -MT263388 -MT263389 -MT263390 -MT263391 -MT263392 -MT263393 -MT263394 -MT263395 -MT263396 -MT263397 -MT263398 -MT263399 -MT263400 -MT263401 -MT263402 -MT263403 -MT263404 -MT263405 -MT263406 -MT263407 -MT263408 -MT263409 -MT263410 -MT263411 -MT263412 -MT263413 -MT263414 -MT263415 -MT263416 -MT263417 -MT263418 -MT263419 -MT263420 -MT263421 -MT263422 -MT263423 -MT263424 -MT263425 -MT263426 -MT263427 -MT263428 -MT263429 -MT263430 -MT263431 -MT263432 -MT263433 -MT263434 -MT263435 -MT263436 -MT263437 -MT263438 -MT263439 -MT263440 -MT263441 -MT263442 -MT263443 -MT263444 -MT263445 -MT263446 -MT263447 -MT263448 -MT263449 -MT263450 -MT263451 -MT263452 -MT263453 -MT263454 -MT263455 -MT263456 -MT263457 -MT263458 -MT263459 -MT263460 -MT263461 -MT263462 -MT263463 -MT263464 -MT263465 -MT263466 -MT263467 -MT263468 -MT263469 -MT256917 -MT256918 -MT256924 -MT258377 -MT258378 -MT258379 -MT258380 -MT258381 -MT258382 -MT258383 -MT259226 -MT259227 -MT259228 -MT259229 -MT259230 -MT259231 -MT259235 -MT259236 -MT259237 -MT259238 -MT259239 -MT259240 -MT259241 -MT259242 -MT259243 -MT259244 -MT259245 -MT259246 -MT259247 -MT259248 -MT259249 -MT259250 -MT259251 -MT259252 -MT259253 -MT259254 -MT259255 -MT259256 -MT259257 -MT259258 -MT259259 -MT259260 -MT259261 -MT259262 -MT259263 -MT259264 -MT259265 -MT259266 -MT259267 -MT259268 -MT259269 -MT259270 -MT259271 -MT259272 -MT259273 -MT259274 -MT259275 -MT259276 -MT259277 -MT259278 -MT259279 -MT259280 -MT259281 -MT259282 -MT259283 -MT259284 -MT259285 -MT259286 -MT259287 -LC534418 -LC534419 -MT251972 -MT251973 -MT251974 -MT251975 -MT251976 -MT251977 -MT251978 -MT251979 -MT251980 -MT253696 -MT253697 -MT253698 -MT253699 -MT253700 -MT253701 -MT253702 -MT253703 -MT253704 -MT253705 -MT253706 -MT253707 -MT253708 -MT253709 -MT253710 -MT233526 -MT246449 -MT246450 -MT246451 -MT246452 -MT246453 -MT246454 -MT246455 -MT246456 -MT246457 -MT246458 -MT246459 -MT246460 -MT246461 -MT246462 -MT246463 -MT246464 -MT246465 -MT246466 -MT246467 -MT246468 -MT246469 -MT246470 -MT246471 -MT246472 -MT246473 -MT246474 -MT246475 -MT246476 -MT246477 -MT246478 -MT246479 -MT246480 -MT246481 -MT246482 -MT246483 -MT246484 -MT246485 -MT246486 -MT246487 -MT246488 -MT246489 -MT246490 -MT246667 -MT240479 -MT232869 -MT232870 -MT232871 -MT232872 -MT233519 -MT233520 -MT233521 -MT233522 -MT233523 -MT226610 -MT198651 -MT198652 -MT198653 -MT192758 -MT192759 -MT192765 -MT192772 -MT192773 -MT186676 -MT186677 -MT186678 -MT186679 -MT186680 -MT186681 -MT186682 -MT187977 -MT188339 -MT188340 -MT188341 -CADDYA000000000 -MT184907 -MT184908 -MT184909 -MT184910 -MT184911 -MT184912 -MT184913 -MT163712 -MT163714 -MT163715 -MT163716 -MT163717 -MT163718 -MT163719 -MT163720 -MT163721 -MT163737 -MT163738 -MT066156 -MT121215 -MT159705 -MT159706 -MT159707 -MT159708 -MT159709 -MT159710 -MT159711 -MT159712 -MT159713 -MT159714 -MT159715 -MT159716 -MT159717 -MT159718 -MT159719 -MT159720 -MT159721 -MT159722 -MT159778 -MT161607 -LC529905 -MT012098 -MT050493 -MT152824 -MT152900 -MT135041 -MT135042 -MT135043 -MT135044 -MT126808 -MT127113 -MT127114 -MT127115 -MT127116 -LC528232 -LC528233 -MT123290 -MT123291 -MT123292 -MT123293 -MT118835 -MT111895 -MT111896 -MT106052 -MT106053 -MT106054 -MT093571 -MT093631 -MT081059 -MT081060 -MT081061 -MT081062 -MT081063 -MT081064 -MT081065 -MT081066 -MT081067 -MT081068 -MT072667 -MT072668 -MT072688 -MT066157 -MT066158 -MT066159 -MT066175 -MT066176 -LC523807 -LC523808 -LC523809 -MT042773 -MT042774 -MT042775 -MT042776 -MT042777 -MT042778 -MT044257 -MT044258 -MT049951 -MT050414 -MT050415 -MT050416 -MT050417 -MT039873 -MT039887 -MT039888 -MT039890 -LC522350 -MT027062 -MT027063 -MT027064 -MT019529 -MT019530 -MT019531 -MT019532 -MT019533 -MT020781 -MT020880 -MT020881 -LR757995 -LR757996 -LR757997 -LR757998 -MT007544 -MT008022 -MT008023 -MN996527 -MN996528 -MN996529 -MN996530 -MN996531 -MN988668 -MN988669 -MN994467 -MN994468 -MN997409 -MN988713 -MN938384 -MN938385 -MN938386 -MN938387 -MN938388 -MN938389 -MN938390 -MN975262 -MN975263 -MN975264 -MN975265 -MN975266 -MN975267 -MN975268 -MN985325 -MN970003 -MN970004 -MN908947 -- cgit v1.2.3 From 640675e07505dc0f7969480fe0643653e6408ec2 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Mon, 22 Jun 2020 23:29:35 +0200 Subject: added new script to prepare metadata of sra data --- .../SraExperimentPackage.2020.06.08.xml.gz | Bin 0 -> 3227724 bytes scripts/download_sra_data/download_sra_data.py | 232 +++++++++++++++++++++ 2 files changed, 232 insertions(+) create mode 100644 scripts/download_sra_data/SraExperimentPackage.2020.06.08.xml.gz create mode 100644 scripts/download_sra_data/download_sra_data.py diff --git a/scripts/download_sra_data/SraExperimentPackage.2020.06.08.xml.gz b/scripts/download_sra_data/SraExperimentPackage.2020.06.08.xml.gz new file mode 100644 index 0000000..f9cd995 Binary files /dev/null and b/scripts/download_sra_data/SraExperimentPackage.2020.06.08.xml.gz differ diff --git a/scripts/download_sra_data/download_sra_data.py b/scripts/download_sra_data/download_sra_data.py new file mode 100644 index 0000000..9145a43 --- /dev/null +++ b/scripts/download_sra_data/download_sra_data.py @@ -0,0 +1,232 @@ +#!/usr/bin/env python3 + +import os +from dateutil.parser import parse +import xml.etree.ElementTree as ET +import json +import gzip + +dir_yaml = 'yaml' + +date = '2020.06.08' + +# Query on SRA: 'txid2697049[Organism]' (https://www.ncbi.nlm.nih.gov/sra/?term=txid2697049%5BOrganism%5D) -> Send to -> File -> Full XML -> Create File +path_sra_metadata_xml = 'SraExperimentPackage.{}.xml.gz'.format(date) + +dir_dict_ontology_standardization = '../dict_ontology_standardization/' +path_sra_study_accessions_txt = 'SRAStudyAccessions.{}.txt'.format(date) + +term_to_uri_dict = {} + +for path_dict_xxx_csv in [os.path.join(dir_dict_ontology_standardization, name_xxx_csv) for name_xxx_csv in os.listdir(dir_dict_ontology_standardization) if name_xxx_csv.endswith('.csv')]: + print('Read {}'.format(path_dict_xxx_csv)) + + with open(path_dict_xxx_csv, 'r') as f: + for line in f: + if len(line.split(',')) > 2: + term, uri = line.strip('\n').split('",') + term = term.strip('"') + else: + term, uri = line.strip('\n').split(',') + + term_to_uri_dict[term] = uri + +def is_integer(string_to_check): + try: + int(string_to_check) + return True + except ValueError: + return False + +if not os.path.exists(dir_yaml): + os.makedirs(dir_yaml) + +sra_metadata_xml_file = gzip.open(path_sra_metadata_xml, 'r') +tree = ET.parse(sra_metadata_xml_file) +sra_metadata_xml_file.close() + +EXPERIMENT_PACKAGE_SET = tree.getroot() + +missing_value_list = [] + +run_accession_set = set() +run_accession_to_downloadble_file_url_dict = {} + +for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET): + #print(i, EXPERIMENT_PACKAGE) + + # A general default-empty yaml could be read from the definitive one + info_for_yaml_dict = { + 'id': 'placeholder', + 'host': {}, + 'sample': {}, + 'virus': {}, + 'technology': {}, + 'submitter': {} + } + + RUN_SET = EXPERIMENT_PACKAGE.find('RUN_SET') + RUN = RUN_SET.find('RUN') + accession = RUN.attrib['accession'] + run_accession_set.add(accession) + #print(accession) + + info_for_yaml_dict['sample']['sample_id'] = accession + + SRAFiles = RUN.find('SRAFiles') + if SRAFiles is not None: + url = SRAFiles.find('SRAFile').attrib['url'] + if 'sra-download.ncbi.nlm.nih.gov' in url: + run_accession_to_downloadble_file_url_dict[accession] = url + + + SAMPLE = EXPERIMENT_PACKAGE.find('SAMPLE') + SAMPLE_ATTRIBUTE_list = SAMPLE.iter('SAMPLE_ATTRIBUTE') + + for SAMPLE_ATTRIBUTE in SAMPLE_ATTRIBUTE_list: + VALUE = SAMPLE_ATTRIBUTE.find('VALUE') + if VALUE is not None: + TAG_text = SAMPLE_ATTRIBUTE.find('TAG').text + VALUE_text = VALUE.text + + if TAG_text in ['host', 'host scientific name']: + if VALUE_text in term_to_uri_dict: + info_for_yaml_dict['host']['host_species'] = term_to_uri_dict[VALUE_text] + else: + missing_value_list.append('\t'.join([accession, 'host_species', VALUE_text])) + elif TAG_text in ['host_health_status', 'host health state']: + if VALUE_text in term_to_uri_dict: + info_for_yaml_dict['host']['host_health_status'] = term_to_uri_dict[VALUE_text] + elif VALUE_text.strip("'") not in ['missing', 'not collected', 'not provided']: + missing_value_list.append('\t'.join([accession, 'host_health_status', VALUE_text])) + elif TAG_text in ['strain', 'isolate']: + if VALUE_text.lower() not in ['not applicable', 'missing', 'na', 'unknown']: + if 'virus_strain' not in info_for_yaml_dict: + info_for_yaml_dict['virus']['virus_strain'] = VALUE_text + else: + info_for_yaml_dict['virus']['virus_strain'] += '; ' + VALUE_text + elif TAG_text in ['isolation_source', 'isolation source host-associated']: + if VALUE_text in term_to_uri_dict: + info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict[VALUE_text]] + else: + if VALUE_text.lower() in ['np/op', 'np/op swab', 'np/np swab', 'nasopharyngeal and oropharyngeal swab', 'nasopharyngeal/oropharyngeal swab', 'combined nasopharyngeal and oropharyngeal swab']: + info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal swab'], term_to_uri_dict['oropharyngeal swab']] + elif VALUE_text.lower() in ['nasopharyngeal swab/throat swab', 'nasopharyngeal/throat swab', 'nasopharyngeal swab and throat swab', 'nasal swab and throat swab', 'nasopharyngeal aspirate/throat swab']: + info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal swab'], term_to_uri_dict['throat swab']] + elif VALUE_text.lower() in ['nasal swab and throat swab']: + info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasal swab'], term_to_uri_dict['throat swab']] + elif VALUE_text.lower() in ['nasal-swab and oro-pharyngeal swab']: + info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasal swab'], term_to_uri_dict['oropharyngeal swab']] + elif VALUE_text.strip("'") not in ['missing', 'not collected', 'unknown', 'not provided', 'not applicable', 'N/A']: + missing_value_list.append('\t'.join([accession, 'specimen_source', VALUE_text])) + elif TAG_text in ['host_sex', 'host sex']: + if VALUE_text.lower() not in ['missing', 'not provided']: + if VALUE_text in ['male', 'female']: + info_for_yaml_dict['host']['host_sex'] = "http://purl.obolibrary.org/obo/PATO_0000384" if VALUE_text == 'male' else "http://purl.obolibrary.org/obo/PATO_0000383" + else: + missing_value_list.append('\t'.join([accession, 'host_sex', VALUE_text])) + elif TAG_text in ['host_age', 'host age']: + if is_integer(VALUE_text): + info_for_yaml_dict['host']['host_age'] = VALUE_text + info_for_yaml_dict['host']['host_age_unit'] = 'http://purl.obolibrary.org/obo/UO_0000036' + elif TAG_text == 'collected_by': + if VALUE_text.lower() not in ['not available', 'missing']: + name = VALUE_text in ['Dr. Susie Bartlett', 'Ahmed Babiker', 'Aisi Fu', 'Brandi Williamson', 'George Taiaroa', 'Natacha Ogando', 'Tim Dalebout', 'ykut Ozdarendeli'] + + info_for_yaml_dict['sample']['collector_name' if name else 'collecting_institution'] = VALUE_text + elif TAG_text == 'collecting institution': + if VALUE_text.lower() not in ['not provided', 'na']: + info_for_yaml_dict['sample']['collecting_institution'] = VALUE_text + elif TAG_text in ['collection_date', 'collection date']: + if VALUE_text.lower() not in ['not applicable', 'missing', 'na']: + date_to_write = VALUE_text + date_is_estimated = True + + VALUE_text_list = VALUE_text.split('-') + if len(VALUE_text_list) == 3: + date_is_estimated = False + + if VALUE_text_list[1].isalpha(): + date_to_write = parse(VALUE_text).strftime('%Y-%m-%d') + elif len(VALUE_text_list) == 2: + date_to_write = VALUE_text + '-15' + else: + if int(VALUE_text) < 2020: + date_to_write = "{}-12-15".format(VALUE_text) + else: + date_to_write = "{}-01-15".format(VALUE_text) + + info_for_yaml_dict['sample']['collection_date'] = date_to_write + + if date_is_estimated: + if 'additional_collection_information' in info_for_yaml_dict['sample']: + info_for_yaml_dict['sample']['additional_collection_information'] += "; The 'collection_date' is estimated (the original date was: {})".format(VALUE_text) + else: + info_for_yaml_dict['sample']['additional_collection_information'] = "The 'collection_date' is estimated (the original date was: {})".format(VALUE_text) + elif TAG_text == 'geo_loc_name': + if VALUE_text in term_to_uri_dict: + info_for_yaml_dict['sample']['collection_location'] = term_to_uri_dict[VALUE_text] + elif VALUE_text.lower() not in ['na', 'not applicable']: + missing_value_list.append('\t'.join([accession, 'geo_loc_name', VALUE_text])) + #else: + # if TAG_text not in ['lat_lon', 'host_disease', 'BioSampleModel', 'passage_history']: + # print(accession, TAG_text, VALUE_text) + + + taxon_id = SAMPLE.find('SAMPLE_NAME').find('TAXON_ID').text + info_for_yaml_dict['virus']['virus_species'] = "http://purl.obolibrary.org/obo/NCBITaxon_"+taxon_id + + + EXPERIMENT = EXPERIMENT_PACKAGE.find('EXPERIMENT') + INSTRUMENT_MODEL = [x.text for x in EXPERIMENT.find('PLATFORM').iter('INSTRUMENT_MODEL')][0] + if INSTRUMENT_MODEL.lower() != 'unspecified': + if INSTRUMENT_MODEL in term_to_uri_dict: + info_for_yaml_dict['technology']['sample_sequencing_technology'] = [term_to_uri_dict[INSTRUMENT_MODEL]] + else: + missing_value_list.append('\t'.join([accession, 'sample_sequencing_technology', INSTRUMENT_MODEL])) + + LIBRARY_DESCRIPTOR = EXPERIMENT.find('DESIGN').find('LIBRARY_DESCRIPTOR') + if LIBRARY_DESCRIPTOR.text not in ['OTHER']: + info_for_yaml_dict['technology']['additional_technology_information'] = 'LIBRARY_STRATEGY: {};'.format(LIBRARY_DESCRIPTOR.find('LIBRARY_STRATEGY').text) + + + SUBMISSION = EXPERIMENT_PACKAGE.find('SUBMISSION') + info_for_yaml_dict['submitter']['submitter_sample_id'] = SUBMISSION.attrib['accession'] + + if SUBMISSION.attrib['lab_name'].lower() not in ['na']: + info_for_yaml_dict['submitter']['originating_lab'] = SUBMISSION.attrib['lab_name'] + + STUDY = EXPERIMENT_PACKAGE.find('STUDY') + info_for_yaml_dict['submitter']['publication'] = SUBMISSION.attrib['lab_name'] + + + Organization = EXPERIMENT_PACKAGE.find('Organization') + Organization_Name = Organization.find('Name') + info_for_yaml_dict['submitter']['authors'] = [Organization_Name.text] + + Organization_Contact = Organization.find('Contact') + if Organization_Contact is not None: + Organization_Contact_Name = Organization_Contact.find('Name') + info_for_yaml_dict['submitter']['submitter_name'] = [Organization_Contact_Name.find('First').text + ' ' + Organization_Contact_Name.find('Last').text] + info_for_yaml_dict['submitter']['additional_submitter_information'] = Organization_Contact.attrib['email'] + + Organization_Concact_Address = Organization_Contact.find('Address') + if Organization_Concact_Address is not None: + info_for_yaml_dict['submitter']['submitter_address'] = '; '.join([x.text for x in Organization_Concact_Address] + ['Postal code ' + Organization_Concact_Address.attrib['postal_code']]) + + Organization_Address = Organization.find('Address') + if Organization_Address is not None: + info_for_yaml_dict['submitter']['lab_address'] = '; '.join([x.text for x in Organization_Address] + ['Postal code ' + Organization_Address.attrib['postal_code']]) + + if 'collection_date' not in info_for_yaml_dict['sample']: + info_for_yaml_dict['sample']['collection_date'] = '1970-01-01' + info_for_yaml_dict['sample']['additional_collection_information'] = "The real 'collection_date' is missing" + + with open(os.path.join(dir_yaml, '{}.yaml'.format(accession)), 'w') as fw: + json.dump(info_for_yaml_dict, fw, indent=2) + +if len(missing_value_list) > 0: + path_missing_terms_tsv = 'missing_terms.tsv' + print('Written missing terms in {}'.format(path_missing_terms_tsv)) + with open(path_missing_terms_tsv, 'w') as fw: + fw.write('\n'.join(missing_value_list)) \ No newline at end of file -- cgit v1.2.3 From 2d822c049f2b7f3e62a3ac4c19c827428b6f73e2 Mon Sep 17 00:00:00 2001 From: Andrea Guarracino Date: Mon, 22 Jun 2020 23:38:22 +0200 Subject: very little readme for the scripts --- scripts/README.md | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 scripts/README.md diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 0000000..a3ed1b6 --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,6 @@ +### Instructions for download and/or prepare the data and/ or the metadata + +Just go into the `download_genbank_data` or `download_sra_data` directory and execute the python3 script inside. + +- `download_genbank_data/from_genbank_to_fasta_and_yaml.py` downloads the data and the matadata, preparing the FASTA and the YAML files; +- `download_sra_data/download_sra_data.py` creates the metadata in the form of YAML files from the SraExperimentPackage.XXX.xml.gz file in the same directory. -- cgit v1.2.3