added ids-to-consider option to the NCBI script

author: AndreaGuarracino 2020-11-13 12:41:14 +0100
committer: AndreaGuarracino 2020-11-13 12:41:14 +0100
commit: 9674e9582536d52fef8f6dcde6dade07d0c580e7 (patch)
tree: 33364f157de4de8a5ff092205e31bd945758e1fd
parent: 133cc8b283cd3f27b1bae3863a48e4e351c30f82 (diff)
download: bh20-seq-resource-9674e9582536d52fef8f6dcde6dade07d0c580e7.tar.gz
bh20-seq-resource-9674e9582536d52fef8f6dcde6dade07d0c580e7.tar.lz
bh20-seq-resource-9674e9582536d52fef8f6dcde6dade07d0c580e7.zip
1 file changed, 19 insertions, 0 deletions
diff --git a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
index 9a46474..083122f 100755
--- a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
+++ b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
@@ -3,6 +3,7 @@
 import argparse
 parser = argparse.ArgumentParser()
 parser.add_argument('--ids-to-ignore', type=str, help='file with ids to ignore in all steps, 1 id per line', required=False)
+parser.add_argument('--ids-to-consider', type=str, help='file with ids to consider in all steps, 1 id per line', required=False)
 parser.add_argument('--skip-request', action='store_true', help='skip metadata and sequence request', required=False)
 parser.add_argument('--only-missing-ids', action='store_true', help='download only missing ids not already downloaded', required=False)
 parser.add_argument('--dict-ontology', type=str, help='where is the ontology',
@@ -73,6 +74,21 @@ if os.path.exists(dir_fasta_and_yaml):
 
 accession_to_ignore_set.update(accession_already_downloaded_set)
 
+
+accession_to_consider_set = set()
+# Optional allow-list: filled only when --ids-to-consider is given; stays empty otherwise.
+if args.ids_to_consider:
+    if not os.path.exists(args.ids_to_consider):
+        print("\tThe '{}' file doesn't exist.".format(args.ids_to_consider))
+        sys.exit(-1)
+    # Drop the version suffix; strip whitespace and skip blank lines so '' never becomes an id.
+    with open(args.ids_to_consider) as f:
+        accession_to_consider_set.update(x.strip().split('.')[0] for x in f if x.strip())
+
+    if len(accession_to_consider_set) > 0:
+        print('There are {} accessions to consider.'.format(len(accession_to_consider_set)))
+
+
 if not os.path.exists(dir_metadata):
     # Take all the ids
     id_set = set()
@@ -88,6 +104,9 @@ if not os.path.exists(dir_metadata):
         # Remove the version in the id
         new_ids_set = set([x.split('.')[0] for x in tmp_list if x[:2] not in ['NM', 'NR', 'NP', 'XM', 'XR', 'XP', 'WP']])
 
+        if len(accession_to_consider_set) > 0:
+            new_ids_set = new_ids_set.intersection(accession_to_consider_set)
+
         new_ids = len(new_ids_set.difference(id_set))
         id_set.update(new_ids_set)
author	AndreaGuarracino	2020-11-13 12:41:14 +0100
committer	AndreaGuarracino	2020-11-13 12:41:14 +0100
commit	9674e9582536d52fef8f6dcde6dade07d0c580e7 (patch)
tree	33364f157de4de8a5ff092205e31bd945758e1fd
parent	133cc8b283cd3f27b1bae3863a48e4e351c30f82 (diff)
download	bh20-seq-resource-9674e9582536d52fef8f6dcde6dade07d0c580e7.tar.gz bh20-seq-resource-9674e9582536d52fef8f6dcde6dade07d0c580e7.tar.lz bh20-seq-resource-9674e9582536d52fef8f6dcde6dade07d0c580e7.zip