about summary refs log tree commit diff
path: root/scripts/download_genbank_data
diff options
context:
space:
mode:
author AndreaGuarracino 2020-11-13 12:41:14 +0100
committer AndreaGuarracino 2020-11-13 12:41:14 +0100
commit 9674e9582536d52fef8f6dcde6dade07d0c580e7 (patch)
tree 33364f157de4de8a5ff092205e31bd945758e1fd /scripts/download_genbank_data
parent 133cc8b283cd3f27b1bae3863a48e4e351c30f82 (diff)
download bh20-seq-resource-9674e9582536d52fef8f6dcde6dade07d0c580e7.tar.gz
bh20-seq-resource-9674e9582536d52fef8f6dcde6dade07d0c580e7.tar.lz
bh20-seq-resource-9674e9582536d52fef8f6dcde6dade07d0c580e7.zip
added ids-to-consider option to the NCBI script
Diffstat (limited to 'scripts/download_genbank_data')
-rwxr-xr-x scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py 19
1 files changed, 19 insertions, 0 deletions
diff --git a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
index 9a46474..083122f 100755
--- a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
+++ b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
@@ -3,6 +3,7 @@
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--ids-to-ignore', type=str, help='file with ids to ignore in all steps, 1 id per line', required=False)
+parser.add_argument('--ids-to-consider', type=str, help='file with ids to consider in all steps, 1 id per line', required=False)
parser.add_argument('--skip-request', action='store_true', help='skip metadata and sequence request', required=False)
parser.add_argument('--only-missing-ids', action='store_true', help='download only missing ids not already downloaded', required=False)
parser.add_argument('--dict-ontology', type=str, help='where is the ontology',
@@ -73,6 +74,21 @@ if os.path.exists(dir_fasta_and_yaml):
accession_to_ignore_set.update(accession_already_downloaded_set)
+
+accession_to_consider_set = set()
+
+if args.ids_to_consider:
+ if not os.path.exists(args.ids_to_consider):
+ print("\tThe '{}' file doesn't exist.".format(args.ids_to_consider))
+ sys.exit(-1)
+
+ with open(args.ids_to_consider) as f:
+ accession_to_consider_set.update(set([x.split('.')[0] for x in f.read().strip('\n').split('\n')]))
+
+ if len(accession_to_consider_set) > 0:
+ print('There are {} accessions to consider.'.format(len(accession_to_consider_set)))
+
+
if not os.path.exists(dir_metadata):
# Take all the ids
id_set = set()
@@ -88,6 +104,9 @@ if not os.path.exists(dir_metadata):
# Remove the version in the id
new_ids_set = set([x.split('.')[0] for x in tmp_list if x[:2] not in ['NM', 'NR', 'NP', 'XM', 'XR', 'XP', 'WP']])
+ if len(accession_to_consider_set) > 0:
+ new_ids_set = new_ids_set.intersection(accession_to_consider_set)
+
new_ids = len(new_ids_set.difference(id_set))
id_set.update(new_ids_set)