author     LLTommy   2020-06-08 19:18:06 +0200
committer  GitHub    2020-06-08 19:18:06 +0200
commit     39eca644ed4a0a86510ae0b8afb8da9207d9750a (patch)
tree       8c4b23c7c016275a57a96a018ea271f6c067824b /scripts
parent     80cfaba31a99d0c34722312c1b1a69a139477510 (diff)
parent     e1447dedb1a2a1a03957e56c812acdedf47d43fb (diff)
Merge pull request #74 from AndreaGuarracino/patch-16
from_genbank_to_fasta_and_yaml script update
Diffstat (limited to 'scripts')
scripts/dict_ontology_standardization/ncbi_countries.csv
scripts/dict_ontology_standardization/ncbi_sequencing_technology.csv
scripts/dict_ontology_standardization/ncbi_speciesman_source.csv
scripts/from_genbank_to_fasta_and_yaml.py
4 files changed, 155 insertions(+), 35 deletions(-)
diff --git a/scripts/dict_ontology_standardization/ncbi_countries.csv b/scripts/dict_ontology_standardization/ncbi_countries.csv
index 6b43137..7e83564 100644
--- a/scripts/dict_ontology_standardization/ncbi_countries.csv
+++ b/scripts/dict_ontology_standardization/ncbi_countries.csv
@@ -111,9 +111,11 @@ France,http://www.wikidata.org/entity/Q142
 Gabon,http://www.wikidata.org/entity/Q1000
 Georgia,http://www.wikidata.org/entity/Q230
 Germany,http://www.wikidata.org/entity/Q183
+Germany: Bavaria,https://www.wikidata.org/wiki/Q980
 Germany: Dusseldorf,https://www.wikidata.org/wiki/Q1718
 Ghana,http://www.wikidata.org/entity/Q117
 Greece,http://www.wikidata.org/entity/Q41
+Greece: Athens,https://www.wikidata.org/wiki/Q1524
 Grenada,http://www.wikidata.org/entity/Q769
 Guatemala,http://www.wikidata.org/entity/Q774
 Guinea,http://www.wikidata.org/entity/Q1006
@@ -125,6 +127,7 @@ Hungary,http://www.wikidata.org/entity/Q28
 Iceland,http://www.wikidata.org/entity/Q189
 Icelandic Commonwealth,http://www.wikidata.org/entity/Q62389
 India,http://www.wikidata.org/entity/Q668
+India: Ahmedabad,http://www.wikidata.org/entity/Q1070
 India: Kerala State,http://www.wikidata.org/entity/Q1186
 India: Rajkot,http://www.wikidata.org/entity/Q1815245
 Indonesia,http://www.wikidata.org/entity/Q252
@@ -136,6 +139,8 @@ Ireland,http://www.wikidata.org/entity/Q27
 Israel,http://www.wikidata.org/entity/Q801
 Italy,http://www.wikidata.org/entity/Q38
 Italy: Cagliari,http://www.wikidata.org/entity/Q1897
+Italy: Lazio,https://www.wikidata.org/wiki/Q1282
+Italy: Palermo,https://www.wikidata.org/wiki/Q2656
 Italy: Rome,http://www.wikidata.org/entity/Q220
 Ivory Coast,http://www.wikidata.org/entity/Q1008
 Jamaica,http://www.wikidata.org/entity/Q766
@@ -272,6 +277,7 @@ USA: DC,http://www.wikidata.org/entity/Q3551781
 USA: DE,http://www.wikidata.org/entity/Q1393
 USA: FL,http://www.wikidata.org/entity/Q812
 USA: GA,http://www.wikidata.org/entity/Q1428
+USA: Georgia,http://www.wikidata.org/entity/Q1428
 USA: HI,http://www.wikidata.org/entity/Q782
 USA: IA,http://www.wikidata.org/entity/Q1546
 USA: ID,http://www.wikidata.org/entity/Q1221
@@ -283,9 +289,11 @@ USA: KY,http://www.wikidata.org/entity/Q1603
 USA: LA,http://www.wikidata.org/entity/Q1588
 "USA: New Orleans, LA",https://www.wikidata.org/wiki/Q34404
 USA: MA,http://www.wikidata.org/entity/Q771
+USA: Massachusetts,http://www.wikidata.org/entity/Q771
 USA: MD,http://www.wikidata.org/entity/Q1391
 USA: ME,http://www.wikidata.org/entity/Q724
 USA: MI,http://www.wikidata.org/entity/Q1166
+USA: Michigan,http://www.wikidata.org/entity/Q1166
 USA: MN,http://www.wikidata.org/entity/Q1527
 USA: MO,http://www.wikidata.org/entity/Q1581
 USA: MS,http://www.wikidata.org/entity/Q1494
@@ -301,6 +309,7 @@ USA: NV,http://www.wikidata.org/entity/Q1227
 USA: NY,http://www.wikidata.org/entity/Q1384
 USA: New York,http://www.wikidata.org/entity/Q1384
 USA: OH,http://www.wikidata.org/entity/Q1397
+USA: Ohio,http://www.wikidata.org/entity/Q1397
 USA: OK,http://www.wikidata.org/entity/Q1649
 USA: OR,http://www.wikidata.org/entity/Q824
 USA: PA,http://www.wikidata.org/entity/Q1400
@@ -313,9 +322,11 @@ USA: TN,http://www.wikidata.org/entity/Q1509
 USA: TX,http://www.wikidata.org/entity/Q1439
 USA: UT,http://www.wikidata.org/entity/Q829
 USA: VA,http://www.wikidata.org/entity/Q1370
+USA: Virginia,http://www.wikidata.org/entity/Q1370
 USA: VT,http://www.wikidata.org/entity/Q16551
 USA: WA,http://www.wikidata.org/entity/Q1223
 USA: WI,http://www.wikidata.org/entity/Q1537
+USA: Wisconsin,http://www.wikidata.org/entity/Q1537
 USA: WV,http://www.wikidata.org/entity/Q1371
 USA: WY,http://www.wikidata.org/entity/Q1214
 Uzbekistan,http://www.wikidata.org/entity/Q265
@@ -328,4 +339,4 @@ Viet Nam: Ho Chi Minh city,http://www.wikidata.org/entity/Q1854
 Vietnam,http://www.wikidata.org/entity/Q881
 Yemen,http://www.wikidata.org/entity/Q805
 Zambia,http://www.wikidata.org/entity/Q953
-Zimbabwe,http://www.wikidata.org/entity/Q954
\ No newline at end of file
+Zimbabwe,http://www.wikidata.org/entity/Q954
diff --git a/scripts/dict_ontology_standardization/ncbi_sequencing_technology.csv b/scripts/dict_ontology_standardization/ncbi_sequencing_technology.csv
index 0c92c61..49cb6b7 100644
--- a/scripts/dict_ontology_standardization/ncbi_sequencing_technology.csv
+++ b/scripts/dict_ontology_standardization/ncbi_sequencing_technology.csv
@@ -1,8 +1,10 @@
 Illumian NextSeq 500,http://www.ebi.ac.uk/efo/EFO_0009173
 Illumina NextSeq 500,http://www.ebi.ac.uk/efo/EFO_0009173
+NextSeq500,http://www.ebi.ac.uk/efo/EFO_0009173
 Nanopore MinION,http://www.ebi.ac.uk/efo/EFO_0008632
 Oxford Nanopore MinION,http://www.ebi.ac.uk/efo/EFO_0008632
 ONT (Oxford Nanopore Technologies),http://purl.obolibrary.org/obo/NCIT_C146818
+Oxford Nanopore Technology,http://purl.obolibrary.org/obo/NCIT_C146818
 Oxford Nanopore technologies MinION,http://www.ebi.ac.uk/efo/EFO_0008632
 MinION Oxford Nanopore,http://www.ebi.ac.uk/efo/EFO_0008632
 Nanopore,http://purl.obolibrary.org/obo/NCIT_C146818
diff --git a/scripts/dict_ontology_standardization/ncbi_speciesman_source.csv b/scripts/dict_ontology_standardization/ncbi_speciesman_source.csv
index 7fa67f8..18b986c 100644
--- a/scripts/dict_ontology_standardization/ncbi_speciesman_source.csv
+++ b/scripts/dict_ontology_standardization/ncbi_speciesman_source.csv
@@ -1,4 +1,5 @@
 nasopharyngeal swab,http://purl.obolibrary.org/obo/NCIT_C155831
+nasopharyngeal swabs,http://purl.obolibrary.org/obo/NCIT_C155831
 nasopharyngeal exudate,http://purl.obolibrary.org/obo/NCIT_C155831
 nasopharyngeal,http://purl.obolibrary.org/obo/NCIT_C155831
 respiratory swab,http://purl.obolibrary.org/obo/NCIT_C155831
@@ -12,10 +13,16 @@ nasopharyngeal (throat) washings,http://purl.obolibrary.org/obo/NCIT_C155831
 oropharyngeal swab,http://purl.obolibrary.org/obo/NCIT_C155835
 throat swab,http://purl.obolibrary.org/obo/NCIT_C155835
 oro-pharyngeal,http://purl.obolibrary.org/obo/NCIT_C155835
+Oropharyngal,http://purl.obolibrary.org/obo/NCIT_C155835
+Oral-pharyngeal,http://purl.obolibrary.org/obo/NCIT_C155835
+Oro-pharyngeal swab,http://purl.obolibrary.org/obo/NCIT_C155835
+Oropharyngeal swab,http://purl.obolibrary.org/obo/NCIT_C155835
+oro pharyngeal swab,http://purl.obolibrary.org/obo/NCIT_C155835
 buccal swab,http://purl.obolibrary.org/obo/NCIT_C155835
 throat washing,http://purl.obolibrary.org/obo/NCIT_C155835
 Throat Swab,http://purl.obolibrary.org/obo/NCIT_C155835
 throat (oropharyngeal) swab,http://purl.obolibrary.org/obo/NCIT_C155835
+Throat (Oropharyngeal) swab,http://purl.obolibrary.org/obo/NCIT_C155835
 bronchoalveolar lavage fluid,http://purl.obolibrary.org/obo/NCIT_C13195
 swab,http://purl.obolibrary.org/obo/NCIT_C13195
 oral swab,http://purl.obolibrary.org/obo/NCIT_C13195
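Note on the dictionaries above: each file is a plain two-column term,URI mapping, and the script folds all of them into a single term_to_uri_dict lookup. A minimal sketch of that loading step, assuming the working directory contains dict_ontology_standardization/ (the csv module is used here so that quoted entries such as "USA: New Orleans, LA" split correctly; the script's own parsing may differ):

import csv
import os

dir_dict_ontology_standardization = 'dict_ontology_standardization/'

# Build one flat term -> ontology URI lookup from all mapping CSVs.
term_to_uri_dict = {}
for name_csv in os.listdir(dir_dict_ontology_standardization):
    if not name_csv.endswith('.csv'):
        continue
    with open(os.path.join(dir_dict_ontology_standardization, name_csv)) as f:
        for row in csv.reader(f):
            if len(row) == 2:
                term, uri = row
                term_to_uri_dict[term] = uri

# A free-text GenBank value now resolves to a stable ontology URI, e.g.
# term_to_uri_dict['nasopharyngeal swabs'] -> http://purl.obolibrary.org/obo/NCIT_C155831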
diff --git a/scripts/from_genbank_to_fasta_and_yaml.py b/scripts/from_genbank_to_fasta_and_yaml.py
index 6f046ea..65adb00 100755
--- a/scripts/from_genbank_to_fasta_and_yaml.py
+++ b/scripts/from_genbank_to_fasta_and_yaml.py
@@ -1,31 +1,68 @@
 #!/usr/bin/env python3
+import argparse
+parser = argparse.ArgumentParser()
+parser.add_argument('--skip-request', action='store_true', help='skip metadata and sequence request', required=False)
+parser.add_argument('--only-missing-id', action='store_true', help='download only missing id', required=False)
+args = parser.parse_args()
+
 from Bio import Entrez
 Entrez.email = 'another_email@gmail.com'
 import xml.etree.ElementTree as ET
 import json
 import os
+import requests
+import sys
 
-from dateutil import parser
+from datetime import date
+from dateutil.parser import parse
 
 num_ids_for_request = 100
 dir_metadata = 'metadata_from_nuccore'
 dir_fasta_and_yaml = 'fasta_and_yaml'
 dir_dict_ontology_standardization = 'dict_ontology_standardization/'
-path_ncbi_virus_accession = 'sequences.acc'
+
+today_date = date.today().strftime("%Y.%m.%d")
+path_ncbi_virus_accession = 'sequences.{}.acc'.format(today_date)
+
+def is_integer(string_to_check):
+    try:
+        int(string_to_check)
+        return True
+    except ValueError:
+        return False
 
 def chunks(lst, n):
     for i in range(0, len(lst), n):
         yield lst[i:i + n]
 
-if not os.path.exists(dir_metadata):
-    os.makedirs(dir_metadata)
+if os.path.exists(dir_metadata):
+    print("The directory '{}' already exists.".format(dir_metadata))
+
+    if not args.skip_request:
+        print("\tTo start the request, delete the directory '{}' or specify --skip-request.".format(dir_metadata))
+        sys.exit(-1)
+
+accession_already_downloaded_set = []
+
+if os.path.exists(dir_fasta_and_yaml):
+    print("The directory '{}' already exists.".format(dir_fasta_and_yaml))
+    if not args.only_missing_id:
+        print("To start the download, delete the directory '{}' or specify --only-missing-id.".format(dir_fasta_and_yaml))
+        sys.exit(-1)
+
+    accession_already_downloaded_set = set([x.split('.yaml')[0].split('.')[0] for x in os.listdir(dir_fasta_and_yaml) if x.endswith('.yaml')])
+    print('There are {} accession already downloaded.'.format(len(accession_already_downloaded_set)))
+
+
+if not os.path.exists(dir_metadata):
     # Take all the ids
     id_set = set()
 
+    # Try to search several strings
     term_list = ['SARS-CoV-2', 'SARS-CoV2', 'SARS CoV2', 'SARSCoV2', 'txid2697049[Organism]']
     for term in term_list:
         tmp_list = Entrez.read(
@@ -38,21 +75,31 @@ if not os.path.exists(dir_metadata):
         # Remove the version in the id
         tmp_list = [x.split('.')[0] for x in tmp_list]
 
-        print(term, len(tmp_list))
         #tmp_list = tmp_list[0:2] # restricting to small run
+        new_ids_set = set([x.split('.')[0] for x in tmp_list])
+        new_ids = len(new_ids_set.difference(id_set))
+        id_set.update(new_ids_set)
+
+        print('Term:', term, '-->', new_ids, 'new IDs from', len(tmp_list), '---> Total unique IDs:', len(id_set))
 
-        id_set.update([x.split('.')[0] for x in tmp_list])
+    if not os.path.exists(path_ncbi_virus_accession):
+        r = requests.get('https://www.ncbi.nlm.nih.gov/genomes/VirusVariation/vvsearch2/?q=*:*&fq=%7B!tag=SeqType_s%7DSeqType_s:(%22Nucleotide%22)&fq=VirusLineageId_ss:(2697049)&cmd=download&sort=SourceDB_s%20desc,CreateDate_dt%20desc,id%20asc&dlfmt=acc&fl=id')
+        with open(path_ncbi_virus_accession, 'w') as fw:
+            fw.write(r.text)
 
-    print(term_list, len(id_set))
+    with open(path_ncbi_virus_accession) as f:
+        tmp_list = [line.strip('\n') for line in f]
 
-    if os.path.exists(path_ncbi_virus_accession):
-        with open(path_ncbi_virus_accession) as f:
-            tmp_list = [line.strip('\n') for line in f]
-        print('NCBI Virus', len(tmp_list))
-        id_set.update(tmp_list)
-        term_list.append('NCBI Virus')
-        print(term_list, len(id_set))
+    new_ids = len(set(tmp_list).difference(id_set))
+    id_set.update(tmp_list)
+    print('DB: NCBI Virus', today_date, '-->', new_ids, 'new IDs from', len(tmp_list), '---> Total unique IDs:', len(id_set))
+
+    if len(accession_already_downloaded_set) > 0:
+        id_set = id_set.difference(accession_already_downloaded_set)
+        print('There are {} missing IDs to download.'.format(len(id_set)))
+
+    os.makedirs(dir_metadata)
 
     for i, id_x_list in enumerate(chunks(list(id_set), num_ids_for_request)):
        path_metadata_xxx_xml = os.path.join(dir_metadata, 'metadata_{}.xml'.format(i))
        print('Requesting {} ids --> {}'.format(len(id_x_list), path_metadata_xxx_xml))
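The two hunks above make the script restartable: the NCBI Virus accession dump is now date-stamped, and accessions that already have a YAML file on disk are subtracted from the collected IDs before any request. A self-contained sketch of that resume step, with made-up accession values for illustration:

import os

dir_fasta_and_yaml = 'fasta_and_yaml'

# 'MT270814.1.yaml' -> 'MT270814': drop the extension, then the version.
accession_already_downloaded_set = set(
    x.split('.yaml')[0].split('.')[0]
    for x in os.listdir(dir_fasta_and_yaml)
    if x.endswith('.yaml')
) if os.path.exists(dir_fasta_and_yaml) else set()

# IDs gathered from the Entrez searches plus the NCBI Virus dump
# (hypothetical example values).
id_set = {'MT270814', 'MT215193', 'MN908947'}

# Only request what is still missing.
id_set = id_set.difference(accession_already_downloaded_set)
print('There are {} missing IDs to download.'.format(len(id_set)))

Judging from the help strings, a typical incremental run would then be python3 from_genbank_to_fasta_and_yaml.py --skip-request --only-missing-id, which reuses both existing directories instead of exiting.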
@@ -79,13 +126,21 @@ for path_dict_xxx_csv in [os.path.join(dir_dict_ontology_standardization, name_x
             term_to_uri_dict[term] = uri
 
 species_to_taxid_dict = {
-    'Homo sapiens': 'http://purl.obolibrary.org/obo/NCBITaxon_9606'
+    'Homo sapiens': 'http://purl.obolibrary.org/obo/NCBITaxon_9606',
+    'Mustela lutreola': 'http://purl.obolibrary.org/obo/NCBITaxon_9666',
+    'Manis javanica': 'http://purl.obolibrary.org/obo/NCBITaxon_9974',
+    'Felis catus': 'http://purl.obolibrary.org/obo/NCBITaxon_9685',
+    'Panthera tigris jacksoni': 'http://purl.obolibrary.org/obo/NCBITaxon_419130',
+    'Canis lupus familiaris': 'http://purl.obolibrary.org/obo/NCBITaxon_9615'
 }
 
 if not os.path.exists(dir_fasta_and_yaml):
     os.makedirs(dir_fasta_and_yaml)
 
+min_len_to_count = 27500
+num_seq_with_len_ge_X_bp = 0
+
 missing_value_list = []
 
 for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) for name_metadata_xxx_xml in os.listdir(dir_metadata) if name_metadata_xxx_xml.endswith('.xml')]:
@@ -100,6 +155,7 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
             print(accession_version, ' - sequence not found')
             continue
 
+        #print(path_metadata_xxx_xml, accession_version)
 
         # A general default-empty yaml could be read from the definitive one
         info_for_yaml_dict = {
@@ -135,7 +191,11 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
         GBSeq_comment = GBSeq.find('GBSeq_comment')
         if GBSeq_comment is not None and 'Assembly-Data' in GBSeq_comment.text:
-            GBSeq_comment_text = GBSeq_comment.text.split('##Assembly-Data-START## ; ')[1].split(' ; ##Assembly-Data-END##')[0]
+            prefix_split_string = '##Genome-Assembly' if GBSeq_comment.text.startswith('##Genome-') else '##Assembly'
+
+            GBSeq_comment_text = GBSeq_comment.text.split(
+                '{}-Data-START## ; '.format(prefix_split_string)
+            )[1].split(' ; {}-Data-END##'.format(prefix_split_string))[0]
 
             for info_to_check, field_in_yaml in zip(
                 ['Assembly Method', 'Coverage', 'Sequencing Technology'],
@@ -186,21 +246,54 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
                     if GBQualifier_value_text_list[0] in species_to_taxid_dict:
                         info_for_yaml_dict['host']['host_species'] = species_to_taxid_dict[GBQualifier_value_text_list[0]]
-
+                    elif GBQualifier_value_text_list[0] and ('MT215193' in accession_version or 'MT270814' in accession_version):
+                        # Information checked manually from NCBI Virus
+                        info_for_yaml_dict['host']['host_species'] = species_to_taxid_dict['Canis lupus familiaris']
+                    else:
+                        missing_value_list.append('\t'.join([accession_version, 'host_species', GBQualifier_value_text_list[0]]))
+
+                    # Possible cases:
+                    # - Homo sapiens --> ['Homo sapiens']
+                    # - Homo sapiens; female --> ['Homo sapiens', 'female']
+                    # - Homo sapiens; female 63 --> ['Homo sapiens', 'female 63']
+                    # - Homo sapiens; female; age 40 --> ['Homo sapiens', 'female', 'age 40']
+                    # - Homo sapiens; gender: F; age: 61 --> ['Homo sapiens', 'gender: F', 'age: 61']
+                    # - Homo sapiens; gender: M; age: 68 --> ['Homo sapiens', 'gender: M', 'age: 68']
+                    # - Homo sapiens; hospitalized patient --> ['Homo sapiens', 'hospitalized patient']
+                    # - Homo sapiens; male --> ['Homo sapiens', 'male']
+                    # - Homo sapiens; male; 63 --> ['Homo sapiens', 'male', '63']
+                    # - Homo sapiens; male; age 29 --> ['Homo sapiens', 'male', 'age 29']
+                    # - Homo sapiens; symptomatic --> ['Homo sapiens', 'symptomatic']
                     if len(GBQualifier_value_text_list) > 1:
-                        if GBQualifier_value_text_list[1] in ['male', 'female']:
-                            if GBQualifier_value_text_list[1]=='male':
-                                info_for_yaml_dict['host']['host_sex'] = "http://purl.obolibrary.org/obo/PATO_0000384"
-                            elif GBQualifier_value_text_list[1]=='female':
-                                info_for_yaml_dict['host']['host_sex'] = "http://purl.obolibrary.org/obo/PATO_0000383"
+                        host_sex = ''
+                        if 'female' in GBQualifier_value_text_list[1]:
+                            host_sex = 'female'
+                        elif 'male' in GBQualifier_value_text_list[1]:
+                            host_sex = 'male'
+                        elif 'gender' in GBQualifier_value_text_list[1]:
+                            host_sex_one_lecter = GBQualifier_value_text_list[1].split(':')[-1].strip()
+                            if host_sex_one_lecter in ['F', 'M']:
+                                host_sex = 'female' if host_sex_one_lecter == 'F' else 'male'
+
+                        if host_sex in ['male', 'female']:
+                            info_for_yaml_dict['host']['host_sex'] = "http://purl.obolibrary.org/obo/PATO_0000384" if host_sex == 'male' else "http://purl.obolibrary.org/obo/PATO_0000383"
                         elif GBQualifier_value_text_list[1] in term_to_uri_dict:
-                            info_for_yaml_dict['host']['host_health_status'] = term_to_uri_dict[GBQualifier_value_text_list[1]]
+                            info_for_yaml_dict['host']['host_health_status'] = term_to_uri_dict[GBQualifier_value_text_list[1]]
                         else:
-                            missing_value_list.append('\t'.join([accession_version, GBQualifier_name_text, GBQualifier_value_text_list[1]]))
+                            missing_value_list.append('\t'.join([accession_version, 'host_sex or host_health_status', GBQualifier_value_text_list[1]]))
 
-                        if 'age' in GBQualifier_value_text:
-                            info_for_yaml_dict['host']['host_age'] = int(GBQualifier_value_text_list[2].split('age ')[1])
+                        # Host age
+                        host_age = -1
+                        if len(GBQualifier_value_text_list[1].split(' ')) > 1 and is_integer(GBQualifier_value_text_list[1].split(' ')[-1]):
+                            host_age = int(GBQualifier_value_text_list[1].split(' ')[-1])
+                        elif len(GBQualifier_value_text_list) > 2 and is_integer(GBQualifier_value_text_list[2].split(' ')[-1]):
+                            host_age = int(GBQualifier_value_text_list[2].split(' ')[-1])
+
+                        if host_age > -1:
+                            info_for_yaml_dict['host']['host_age'] = host_age
                             info_for_yaml_dict['host']['host_age_unit'] = 'http://purl.obolibrary.org/obo/UO_0000036'
+                        elif len(GBQualifier_value_text_list) > 2:
+                            missing_value_list.append('\t'.join([accession_version, 'host_age', GBQualifier_value_text_list[2]]))
 
                 elif GBQualifier_name_text == 'collected_by':
                     if any([x in GBQualifier_value_text.lower() for x in ['institute', 'hospital', 'city', 'center']]):
                         info_for_yaml_dict['sample']['collecting_institution'] = GBQualifier_value_text
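The host hunk above replaces exact matching on 'male'/'female' with substring checks (note that 'female' must be tested first, since 'male' is a substring of 'female') plus a 'gender: F/M' fallback, and derives the age via the new is_integer helper. A standalone sketch of the same extraction, exercised on inputs taken from the enumerated cases in the diff:

def is_integer(string_to_check):
    # Same helper the updated script defines.
    try:
        int(string_to_check)
        return True
    except ValueError:
        return False

def parse_host(value):
    # value is the GenBank 'host' qualifier, e.g. 'Homo sapiens; gender: M; age: 68'.
    parts = value.split('; ')
    host_sex, host_age = '', -1
    if len(parts) > 1:
        if 'female' in parts[1]:          # 'female' first: 'male' is a substring of it
            host_sex = 'female'
        elif 'male' in parts[1]:
            host_sex = 'male'
        elif 'gender' in parts[1]:
            letter = parts[1].split(':')[-1].strip()
            if letter in ['F', 'M']:
                host_sex = 'female' if letter == 'F' else 'male'
        # Age: last token of the second field ('female 63') or the third ('age 29').
        if len(parts[1].split(' ')) > 1 and is_integer(parts[1].split(' ')[-1]):
            host_age = int(parts[1].split(' ')[-1])
        elif len(parts) > 2 and is_integer(parts[2].split(' ')[-1]):
            host_age = int(parts[2].split(' ')[-1])
    return host_sex, host_age

print(parse_host('Homo sapiens; female 63'))           # ('female', 63)
print(parse_host('Homo sapiens; gender: F; age: 61'))  # ('female', 61)
print(parse_host('Homo sapiens; male; age 29'))        # ('male', 29)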
@@ -210,12 +303,15 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
                     if GBQualifier_value_text.upper() in term_to_uri_dict:
                         GBQualifier_value_text = GBQualifier_value_text.upper() # For example, in case of 'usa: wa'
 
+                    # Little cleaning
+                    GBQualifier_value_text = GBQualifier_value_text.strip("/'")
+
                     if GBQualifier_value_text in term_to_uri_dict:
                         info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict[GBQualifier_value_text]]
                     else:
-                        if GBQualifier_value_text in ['NP/OP swab', 'nasopharyngeal and oropharyngeal swab', 'nasopharyngeal/oropharyngeal swab', 'np/np swab', 'np/op']:
+                        if GBQualifier_value_text.lower() in ['np/op', 'np/op swab', 'np/np swab', 'nasopharyngeal and oropharyngeal swab', 'nasopharyngeal/oropharyngeal swab']:
                             info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal swab'], term_to_uri_dict['oropharyngeal swab']]
-                        elif GBQualifier_value_text in ['nasopharyngeal swab/throat swab', 'nasopharyngeal/throat swab']:
+                        elif GBQualifier_value_text in ['nasopharyngeal swab/throat swab', 'nasopharyngeal/throat swab', 'nasopharyngeal swab and throat swab', 'nasal swab and throat swab']:
                             info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal swab'], term_to_uri_dict['throat swab']]
                         elif GBQualifier_value_text in ['nasopharyngeal aspirate/throat swab']:
                             info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal aspirate'], term_to_uri_dict['throat swab']]
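The specimen-source hunk adds a cleanup pass, stripping stray quote and slash characters and matching the combined swab descriptions case-insensitively, so that one free-text value can map to several ontology terms. A trimmed sketch under those assumptions (the two URIs are the NCIT terms from ncbi_speciesman_source.csv above):

# NCIT terms from ncbi_speciesman_source.csv.
URI_NASOPHARYNGEAL = 'http://purl.obolibrary.org/obo/NCIT_C155831'
URI_OROPHARYNGEAL = 'http://purl.obolibrary.org/obo/NCIT_C155835'

def specimen_source_uris(value):
    # Little cleaning, as in the script: drop stray slashes/quotes at the ends.
    value = value.strip("/'")
    if value.lower() in ['np/op', 'np/op swab', 'np/np swab',
                         'nasopharyngeal and oropharyngeal swab',
                         'nasopharyngeal/oropharyngeal swab']:
        return [URI_NASOPHARYNGEAL, URI_OROPHARYNGEAL]
    return []

print(specimen_source_uris("NP/OP swab'"))  # both URIs, despite case and the stray quote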
@@ -227,9 +323,9 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
 
                     if len(GBQualifier_value_text.split('-')) == 1:
                         if int(GBQualifier_value_text) < 2020:
-                            date_to_write = "15 12 {}".format(GBQualifier_value_text)
+                            date_to_write = "{}-12-15".format(GBQualifier_value_text)
                         else:
-                            date_to_write = "15 01 {}".format(GBQualifier_value_text)
+                            date_to_write = "{}-01-15".format(GBQualifier_value_text)
 
                         if 'additional_collection_information' in info_for_yaml_dict['sample']:
                             info_for_yaml_dict['sample']['additional_collection_information'] += "; The 'collection_date' is estimated (the original date was: {})".format(GBQualifier_value_text)
@@ -246,7 +342,7 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
                         GBQualifier_value_text_list = GBQualifier_value_text.split('-')
 
                         if GBQualifier_value_text_list[1].isalpha():
-                            date_to_write = GBQualifier_value_text_list[1] + ' ' + GBQualifier_value_text_list[0] + ' ' + GBQualifier_value_text_list[2]
+                            date_to_write = parse(GBQualifier_value_text).strftime('%Y-%m-%d')
 
                     info_for_yaml_dict['sample']['collection_date'] = date_to_write
                 elif GBQualifier_name_text in ['lat_lon', 'country']:
@@ -254,11 +350,9 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
                         GBQualifier_value_text = 'China: Hong Kong'
 
                     if GBQualifier_value_text in term_to_uri_dict:
-                        GBQualifier_value_text = term_to_uri_dict[GBQualifier_value_text]
+                        info_for_yaml_dict['sample']['collection_location'] = term_to_uri_dict[GBQualifier_value_text]
                     else:
                         missing_value_list.append('\t'.join([accession_version, GBQualifier_name_text, GBQualifier_value_text]))
-
-                    info_for_yaml_dict['sample']['collection_location'] = GBQualifier_value_text
                 elif GBQualifier_name_text == 'note':
                     if 'additional_collection_information' in info_for_yaml_dict['sample']:
                         info_for_yaml_dict['sample']['additional_collection_information'] += '; ' + GBQualifier_value_text
@@ -281,6 +375,12 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
             json.dump(info_for_yaml_dict, fw, indent=2)
 
+        if(len(GBSeq_sequence.text) >= min_len_to_count):
+            num_seq_with_len_ge_X_bp += 1
+
+
 if len(missing_value_list) > 0:
     with open('missing_terms.tsv', 'w') as fw:
         fw.write('\n'.join(missing_value_list))
+
+print('Num. new sequences with length >= {} bp: {}'.format(min_len_to_count, num_seq_with_len_ge_X_bp))
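Finally, collection dates are now normalized to ISO 8601 instead of 'DD MM YYYY' strings: a bare year is widened to an estimated mid-month date, and textual-month values such as '21-Mar-2020' go through dateutil. A minimal sketch of both paths (requires the python-dateutil package, which the updated script already imports):

from dateutil.parser import parse

def normalize_collection_date(value):
    parts = value.split('-')
    if len(parts) == 1:
        # Year only: estimate mid-December before 2020, mid-January from 2020 on.
        return '{}-12-15'.format(value) if int(value) < 2020 else '{}-01-15'.format(value)
    if len(parts) == 3 and parts[1].isalpha():
        # e.g. '21-Mar-2020' -> '2020-03-21'
        return parse(value).strftime('%Y-%m-%d')
    return value  # already ISO-like

print(normalize_collection_date('2019'))         # 2019-12-15
print(normalize_collection_date('21-Mar-2020'))  # 2020-03-21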