From 3c956d5bd1811d56502f073c40ffa4066ffaac3c Mon Sep 17 00:00:00 2001 From: Andrea Guarracino Date: Sun, 31 May 2020 12:36:29 +0200 Subject: Added new countries --- scripts/dict_ontology_standardization/ncbi_countries.csv | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'scripts/dict_ontology_standardization/ncbi_countries.csv') diff --git a/scripts/dict_ontology_standardization/ncbi_countries.csv b/scripts/dict_ontology_standardization/ncbi_countries.csv index 6b43137..6918493 100644 --- a/scripts/dict_ontology_standardization/ncbi_countries.csv +++ b/scripts/dict_ontology_standardization/ncbi_countries.csv @@ -111,9 +111,11 @@ France,http://www.wikidata.org/entity/Q142 Gabon,http://www.wikidata.org/entity/Q1000 Georgia,http://www.wikidata.org/entity/Q230 Germany,http://www.wikidata.org/entity/Q183 +Germany: Bavaria,https://www.wikidata.org/wiki/Q980 Germany: Dusseldorf,https://www.wikidata.org/wiki/Q1718 Ghana,http://www.wikidata.org/entity/Q117 Greece,http://www.wikidata.org/entity/Q41 +Greece: Athens,https://www.wikidata.org/wiki/Q1524 Grenada,http://www.wikidata.org/entity/Q769 Guatemala,http://www.wikidata.org/entity/Q774 Guinea,http://www.wikidata.org/entity/Q1006 @@ -136,6 +138,8 @@ Ireland,http://www.wikidata.org/entity/Q27 Israel,http://www.wikidata.org/entity/Q801 Italy,http://www.wikidata.org/entity/Q38 Italy: Cagliari,http://www.wikidata.org/entity/Q1897 +Italy: Lazio,https://www.wikidata.org/wiki/Q1282 +Italy: Palermo,https://www.wikidata.org/wiki/Q2656 Italy: Rome,http://www.wikidata.org/entity/Q220 Ivory Coast,http://www.wikidata.org/entity/Q1008 Jamaica,http://www.wikidata.org/entity/Q766 @@ -272,6 +276,7 @@ USA: DC,http://www.wikidata.org/entity/Q3551781 USA: DE,http://www.wikidata.org/entity/Q1393 USA: FL,http://www.wikidata.org/entity/Q812 USA: GA,http://www.wikidata.org/entity/Q1428 +USA: Georgia,http://www.wikidata.org/entity/Q1428 USA: HI,http://www.wikidata.org/entity/Q782 USA: IA,http://www.wikidata.org/entity/Q1546 USA: ID,http://www.wikidata.org/entity/Q1221 @@ -286,6 +291,7 @@ USA: MA,http://www.wikidata.org/entity/Q771 USA: MD,http://www.wikidata.org/entity/Q1391 USA: ME,http://www.wikidata.org/entity/Q724 USA: MI,http://www.wikidata.org/entity/Q1166 +USA: Michigan,http://www.wikidata.org/entity/Q1166 USA: MN,http://www.wikidata.org/entity/Q1527 USA: MO,http://www.wikidata.org/entity/Q1581 USA: MS,http://www.wikidata.org/entity/Q1494 @@ -301,6 +307,7 @@ USA: NV,http://www.wikidata.org/entity/Q1227 USA: NY,http://www.wikidata.org/entity/Q1384 USA: New York,http://www.wikidata.org/entity/Q1384 USA: OH,http://www.wikidata.org/entity/Q1397 +USA: Ohio,http://www.wikidata.org/entity/Q1397 USA: OK,http://www.wikidata.org/entity/Q1649 USA: OR,http://www.wikidata.org/entity/Q824 USA: PA,http://www.wikidata.org/entity/Q1400 @@ -316,6 +323,7 @@ USA: VA,http://www.wikidata.org/entity/Q1370 USA: VT,http://www.wikidata.org/entity/Q16551 USA: WA,http://www.wikidata.org/entity/Q1223 USA: WI,http://www.wikidata.org/entity/Q1537 +USA: Wisconsin,http://www.wikidata.org/entity/Q1537 USA: WV,http://www.wikidata.org/entity/Q1371 USA: WY,http://www.wikidata.org/entity/Q1214 Uzbekistan,http://www.wikidata.org/entity/Q265 @@ -328,4 +336,4 @@ Viet Nam: Ho Chi Minh city,http://www.wikidata.org/entity/Q1854 Vietnam,http://www.wikidata.org/entity/Q881 Yemen,http://www.wikidata.org/entity/Q805 Zambia,http://www.wikidata.org/entity/Q953 -Zimbabwe,http://www.wikidata.org/entity/Q954 \ No newline at end of file +Zimbabwe,http://www.wikidata.org/entity/Q954 -- cgit v1.2.3 From e1447dedb1a2a1a03957e56c812acdedf47d43fb Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Sun, 7 Jun 2020 17:41:08 +0200 Subject: the script is more verbose; added other countries --- .../ncbi_countries.csv | 3 ++ scripts/from_genbank_to_fasta_and_yaml.py | 36 ++++++++++++++++++++-- 2 files changed, 36 insertions(+), 3 deletions(-) (limited to 'scripts/dict_ontology_standardization/ncbi_countries.csv') diff --git a/scripts/dict_ontology_standardization/ncbi_countries.csv b/scripts/dict_ontology_standardization/ncbi_countries.csv index 6918493..7e83564 100644 --- a/scripts/dict_ontology_standardization/ncbi_countries.csv +++ b/scripts/dict_ontology_standardization/ncbi_countries.csv @@ -127,6 +127,7 @@ Hungary,http://www.wikidata.org/entity/Q28 Iceland,http://www.wikidata.org/entity/Q189 Icelandic Commonwealth,http://www.wikidata.org/entity/Q62389 India,http://www.wikidata.org/entity/Q668 +India: Ahmedabad,http://www.wikidata.org/entity/Q1070 India: Kerala State,http://www.wikidata.org/entity/Q1186 India: Rajkot,http://www.wikidata.org/entity/Q1815245 Indonesia,http://www.wikidata.org/entity/Q252 @@ -288,6 +289,7 @@ USA: KY,http://www.wikidata.org/entity/Q1603 USA: LA,http://www.wikidata.org/entity/Q1588 "USA: New Orleans, LA",https://www.wikidata.org/wiki/Q34404 USA: MA,http://www.wikidata.org/entity/Q771 +USA: Massachusetts,http://www.wikidata.org/entity/Q771 USA: MD,http://www.wikidata.org/entity/Q1391 USA: ME,http://www.wikidata.org/entity/Q724 USA: MI,http://www.wikidata.org/entity/Q1166 @@ -320,6 +322,7 @@ USA: TN,http://www.wikidata.org/entity/Q1509 USA: TX,http://www.wikidata.org/entity/Q1439 USA: UT,http://www.wikidata.org/entity/Q829 USA: VA,http://www.wikidata.org/entity/Q1370 +USA: Virginia,http://www.wikidata.org/entity/Q1370 USA: VT,http://www.wikidata.org/entity/Q16551 USA: WA,http://www.wikidata.org/entity/Q1223 USA: WI,http://www.wikidata.org/entity/Q1537 diff --git a/scripts/from_genbank_to_fasta_and_yaml.py b/scripts/from_genbank_to_fasta_and_yaml.py index befd64d..65adb00 100755 --- a/scripts/from_genbank_to_fasta_and_yaml.py +++ b/scripts/from_genbank_to_fasta_and_yaml.py @@ -1,5 +1,11 @@ #!/usr/bin/env python3 +import argparse +parser = argparse.ArgumentParser() +parser.add_argument('--skip-request', action='store_true', help='skip metadata and sequence request', required=False) +parser.add_argument('--only-missing-id', action='store_true', help='download only missing id', required=False) +args = parser.parse_args() + from Bio import Entrez Entrez.email = 'another_email@gmail.com' @@ -7,6 +13,7 @@ import xml.etree.ElementTree as ET import json import os import requests +import sys from datetime import date from dateutil.parser import parse @@ -31,9 +38,27 @@ def chunks(lst, n): for i in range(0, len(lst), n): yield lst[i:i + n] -if not os.path.exists(dir_metadata): - os.makedirs(dir_metadata) +if os.path.exists(dir_metadata): + print("The directory '{}' already exists.".format(dir_metadata)) + + if not args.skip_request: + print("\tTo start the request, delete the directory '{}' or specify --skip-request.".format(dir_metadata)) + sys.exit(-1) + + +accession_already_downloaded_set = [] +if os.path.exists(dir_fasta_and_yaml): + print("The directory '{}' already exists.".format(dir_fasta_and_yaml)) + if not args.only_missing_id: + print("To start the download, delete the directory '{}' or specify --only-missing-id.".format(dir_fasta_and_yaml)) + sys.exit(-1) + + accession_already_downloaded_set = set([x.split('.yaml')[0].split('.')[0] for x in os.listdir(dir_fasta_and_yaml) if x.endswith('.yaml')]) + print('There are {} accession already downloaded.'.format(len(accession_already_downloaded_set))) + + +if not os.path.exists(dir_metadata): # Take all the ids id_set = set() @@ -70,6 +95,11 @@ if not os.path.exists(dir_metadata): print('DB: NCBI Virus', today_date, '-->', new_ids, 'new IDs from', len(tmp_list), '---> Total unique IDs:', len(id_set)) + if len(accession_already_downloaded_set) > 0: + id_set = id_set.difference(accession_already_downloaded_set) + print('There are {} missing IDs to download.'.format(len(id_set))) + + os.makedirs(dir_metadata) for i, id_x_list in enumerate(chunks(list(id_set), num_ids_for_request)): path_metadata_xxx_xml = os.path.join(dir_metadata, 'metadata_{}.xml'.format(i)) print('Requesting {} ids --> {}'.format(len(id_x_list), path_metadata_xxx_xml)) @@ -353,4 +383,4 @@ if len(missing_value_list) > 0: with open('missing_terms.tsv', 'w') as fw: fw.write('\n'.join(missing_value_list)) -print('Num. sequences with length >= {} bp: {}'.format(min_len_to_count, num_seq_with_len_ge_X_bp)) +print('Num. new sequences with length >= {} bp: {}'.format(min_len_to_count, num_seq_with_len_ge_X_bp)) -- cgit v1.2.3