diff options
author | AndreaGuarracino | 2020-11-13 12:41:14 +0100 |
---|---|---|
committer | AndreaGuarracino | 2020-11-13 12:41:14 +0100 |
commit | 9674e9582536d52fef8f6dcde6dade07d0c580e7 (patch) | |
tree | 33364f157de4de8a5ff092205e31bd945758e1fd /scripts/download_genbank_data | |
parent | 133cc8b283cd3f27b1bae3863a48e4e351c30f82 (diff) | |
download | bh20-seq-resource-9674e9582536d52fef8f6dcde6dade07d0c580e7.tar.gz bh20-seq-resource-9674e9582536d52fef8f6dcde6dade07d0c580e7.tar.lz bh20-seq-resource-9674e9582536d52fef8f6dcde6dade07d0c580e7.zip |
added ids-to-consider option to the NCBI script
Diffstat (limited to 'scripts/download_genbank_data')
-rwxr-xr-x | scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py | 19 |
1 file changed, 19 insertions, 0 deletions
```diff
diff --git a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
index 9a46474..083122f 100755
--- a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
+++ b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
@@ -3,6 +3,7 @@ import argparse
 
 parser = argparse.ArgumentParser()
 parser.add_argument('--ids-to-ignore', type=str, help='file with ids to ignore in all steps, 1 id per line', required=False)
+parser.add_argument('--ids-to-consider', type=str, help='file with ids to consider in all steps, 1 id per line', required=False)
 parser.add_argument('--skip-request', action='store_true', help='skip metadata and sequence request', required=False)
 parser.add_argument('--only-missing-ids', action='store_true', help='download only missing ids not already downloaded', required=False)
 parser.add_argument('--dict-ontology', type=str, help='where is the ontology',
@@ -73,6 +74,21 @@ if os.path.exists(dir_fasta_and_yaml):
 
         accession_to_ignore_set.update(accession_already_downloaded_set)
 
+
+accession_to_consider_set = set()
+
+if args.ids_to_consider:
+    if not os.path.exists(args.ids_to_consider):
+        print("\tThe '{}' file doesn't exist.".format(args.ids_to_consider))
+        sys.exit(-1)
+
+    with open(args.ids_to_consider) as f:
+        accession_to_consider_set.update(set([x.split('.')[0] for x in f.read().strip('\n').split('\n')]))
+
+    if len(accession_to_consider_set) > 0:
+        print('There are {} accessions to consider.'.format(len(accession_to_consider_set)))
+
+
 if not os.path.exists(dir_metadata):
     # Take all the ids
     id_set = set()
@@ -88,6 +104,9 @@ if not os.path.exists(dir_metadata):
         # Remove the version in the id
         new_ids_set = set([x.split('.')[0] for x in tmp_list if x[:2] not in ['NM', 'NR', 'NP', 'XM', 'XR', 'XP', 'WP']])
 
+        if len(accession_to_consider_set) > 0:
+            new_ids_set = new_ids_set.intersection(accession_to_consider_set)
+
         new_ids = len(new_ids_set.difference(id_set))
         id_set.update(new_ids_set)
 
```