added option in the genbank script to ignore (already validated) IDs; code cleaning; typos

author: AndreaGuarracino 2020-08-26 16:56:47 +0200
committer: AndreaGuarracino 2020-08-26 16:56:47 +0200
commit: 9107e59b2e69c4f3f3369175d57631ed878009e7 (patch)
tree: 23603da970d1f40ce382ca1be544e783a71333bd
parent: ecd0df9069ff93d57eaaa62648c490a5df08fe53 (diff)
download: bh20-seq-resource-9107e59b2e69c4f3f3369175d57631ed878009e7.tar.gz
bh20-seq-resource-9107e59b2e69c4f3f3369175d57631ed878009e7.tar.lz
bh20-seq-resource-9107e59b2e69c4f3f3369175d57631ed878009e7.zip
1 files changed, 34 insertions, 24 deletions
diff --git a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
index eefdddb..272b5ba 100755
--- a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
+++ b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
@@ -2,8 +2,9 @@
 
 import argparse
 parser = argparse.ArgumentParser()
+parser.add_argument('--ids-to-ignore', type=str, help='file with ids to ignore in all steps, 1 id per line', required=False)
 parser.add_argument('--skip-request', action='store_true', help='skip metadata and sequence request', required=False)
-parser.add_argument('--only-missing-id', action='store_true', help='download only missing id', required=False)
+parser.add_argument('--only-missing-ids', action='store_true', help='download only missing ids not already downloaded', required=False)
 parser.add_argument('--dict-ontology', type=str, help='where is the ontology',
                     default='../dict_ontology_standardization/',required=False)
 args = parser.parse_args()
@@ -48,17 +49,31 @@ if os.path.exists(dir_metadata):
         sys.exit(-1)
 
 
-accession_already_downloaded_set = []
+accession_to_ignore_set = set()
+
+if args.ids_to_ignore:
+    if not os.path.exists(args.ids_to_ignore):
+        print("\tThe '{}' file doesn't exist.".format(args.ids_to_ignore))
+        sys.exit(-1)
+
+    with open(args.ids_to_ignore) as f:
+        accession_to_ignore_set.update(set([x.split('.')[0] for x in f.read().strip('\n').split('\n')]))
+        print('There are {} accessions to ignore.'.format(len(accession_to_ignore_set)))
+
+
+accession_already_downloaded_set = set()
 
 if os.path.exists(dir_fasta_and_yaml):
     print("The directory '{}' already exists.".format(dir_fasta_and_yaml))
-    if not args.only_missing_id:
-        print("To start the download, delete the directory '{}' or specify --only-missing-id.".format(dir_fasta_and_yaml))
+    if not args.only_missing_ids:
+        print("To start the download, delete the directory '{}' or specify --only-missing-ids.".format(dir_fasta_and_yaml))
         sys.exit(-1)
 
     accession_already_downloaded_set = set([x.split('.yaml')[0].split('.')[0] for x in os.listdir(dir_fasta_and_yaml) if x.endswith('.yaml')])
-    print('There are {} accession already downloaded.'.format(len(accession_already_downloaded_set)))
+    print('There are {} accessions already downloaded.'.format(len(accession_already_downloaded_set)))
+
 
+accession_to_ignore_set.update(accession_already_downloaded_set)
 
 if not os.path.exists(dir_metadata):
     # Take all the ids
@@ -72,13 +87,9 @@ if not os.path.exists(dir_metadata):
         )['IdList']
 
         # Remove mRNAs, ncRNAs, Proteins, and predicted models (more information here: https://en.wikipedia.org/wiki/RefSeq)
-        tmp_list = [x for x in tmp_list if x[:2] not in ['NM', 'NR', 'NP', 'XM', 'XR', 'XP', 'WP']]
-
         # Remove the version in the id
-        tmp_list = [x.split('.')[0] for x in tmp_list]
+        new_ids_set = set([x.split('.')[0] for x in tmp_list if x[:2] not in ['NM', 'NR', 'NP', 'XM', 'XR', 'XP', 'WP']])
 
-        #tmp_list = tmp_list[0:2] # restricting to small run
-        new_ids_set = set([x.split('.')[0] for x in tmp_list])
         new_ids = len(new_ids_set.difference(id_set))
         id_set.update(new_ids_set)
 
@@ -97,9 +108,8 @@ if not os.path.exists(dir_metadata):
 
     print('DB: NCBI Virus', today_date, '-->', new_ids, 'new IDs from', len(tmp_list), '---> Total unique IDs:', len(id_set))
 
-    if len(accession_already_downloaded_set) > 0:
-        id_set = id_set.difference(accession_already_downloaded_set)
-        print('There are {} missing IDs to download.'.format(len(id_set)))
+    id_set = id_set.difference(accession_to_ignore_set)
+    print('There are {} missing IDs to download.'.format(len(id_set)))
 
     os.makedirs(dir_metadata)
     for i, id_x_list in enumerate(chunks(list(id_set), num_ids_for_request)):
@@ -260,17 +270,17 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
                                 missing_value_list.append('\t'.join([accession_version, 'host_species', GBQualifier_value_text_list[0]]))
 
                             # Possible cases:
-                            # - Homo sapiens						--> ['Homo sapiens']
-                            # - Homo sapiens; female				--> ['Homo sapiens', 'female']
-                            # - Homo sapiens; female 63				--> ['Homo sapiens', 'female 63']
-                            # - Homo sapiens; female; age 40		--> ['Homo sapiens', 'female', 'age 40']
-                            # - Homo sapiens; gender: F; age: 61	--> ['Homo sapiens', 'gender: F', 'age: 61']
-                            # - Homo sapiens; gender: M; age: 68	--> ['Homo sapiens', 'gender: M', 'age: 68']
-                            # - Homo sapiens; hospitalized patient	--> ['Homo sapiens', 'hospitalized patient']
-                            # - Homo sapiens; male					--> ['Homo sapiens', 'male']
-                            # - Homo sapiens; male; 63				--> ['Homo sapiens', 'male', '63']
-                            # - Homo sapiens; male; age 29			--> ['Homo sapiens', 'male', 'age 29']
-                            # - Homo sapiens; symptomatic			--> ['Homo sapiens', 'symptomatic']
+                            # - Homo sapiens                        --> ['Homo sapiens']
+                            # - Homo sapiens; female                --> ['Homo sapiens', 'female']
+                            # - Homo sapiens; female 63             --> ['Homo sapiens', 'female 63']
+                            # - Homo sapiens; female; age 40        --> ['Homo sapiens', 'female', 'age 40']
+                            # - Homo sapiens; gender: F; age: 61    --> ['Homo sapiens', 'gender: F', 'age: 61']
+                            # - Homo sapiens; gender: M; age: 68    --> ['Homo sapiens', 'gender: M', 'age: 68']
+                            # - Homo sapiens; hospitalized patient  --> ['Homo sapiens', 'hospitalized patient']
+                            # - Homo sapiens; male                  --> ['Homo sapiens', 'male']
+                            # - Homo sapiens; male; 63              --> ['Homo sapiens', 'male', '63']
+                            # - Homo sapiens; male; age 29          --> ['Homo sapiens', 'male', 'age 29']
+                            # - Homo sapiens; symptomatic           --> ['Homo sapiens', 'symptomatic']
                             if len(GBQualifier_value_text_list) > 1:
                                 host_sex = ''
                                 if 'female' in GBQualifier_value_text_list[1]:
author	AndreaGuarracino	2020-08-26 16:56:47 +0200
committer	AndreaGuarracino	2020-08-26 16:56:47 +0200
commit	9107e59b2e69c4f3f3369175d57631ed878009e7 (patch)
tree	23603da970d1f40ce382ca1be544e783a71333bd
parent	ecd0df9069ff93d57eaaa62648c490a5df08fe53 (diff)
download	bh20-seq-resource-9107e59b2e69c4f3f3369175d57631ed878009e7.tar.gz bh20-seq-resource-9107e59b2e69c4f3f3369175d57631ed878009e7.tar.lz bh20-seq-resource-9107e59b2e69c4f3f3369175d57631ed878009e7.zip