aboutsummaryrefslogtreecommitdiff
path: root/workflows/pull-data
diff options
context:
space:
mode:
Diffstat (limited to 'workflows/pull-data')
-rwxr-xr-x workflows/pull-data/genbank/genbank-fetch-ids.py 40
1 files changed, 19 insertions, 21 deletions
diff --git a/workflows/pull-data/genbank/genbank-fetch-ids.py b/workflows/pull-data/genbank/genbank-fetch-ids.py
index 1962daa..cb48cd8 100755
--- a/workflows/pull-data/genbank/genbank-fetch-ids.py
+++ b/workflows/pull-data/genbank/genbank-fetch-ids.py
@@ -6,28 +6,20 @@
#
# See also directory .guix-run and README.md
-BATCH_SIZE=5000
-
import argparse
-import json
-import os
-import requests
import sys
-import xml.etree.ElementTree as ET
-from datetime import date, datetime
-from dateutil.parser import parse
+from datetime import date
+
+from Bio import Entrez
parser = argparse.ArgumentParser()
parser.add_argument('--max', type=int, help='Max queries', required=False)
parser.add_argument('--skip', type=str, help='File with ids to skip, 1 id per line', required=False)
args = parser.parse_args()
-from Bio import Entrez
-Entrez.email = 'another_email@gmail.com' # FIXME
-
-# min_acceptable_collection_date = datetime(2019, 12, 1)
+BATCH_SIZE = 5000
-today_date = date.today().strftime("%Y.%m.%d")
+Entrez.email = 'another_email@gmail.com' # FIXME
skip = set()
if args.skip:
@@ -36,10 +28,11 @@ if args.skip:
for line in content:
skip.add(line.strip())
-print(f"Skip size is {len(skip)}",file=sys.stderr)
+print(f"Skip size is {len(skip)}", file=sys.stderr)
# Try to search several strings
TERMS = ['SARS-CoV-2', 'SARS-CoV2', 'SARS CoV2', 'SARSCoV2', 'txid2697049[Organism]']
+
# Remove mRNAs, ncRNAs, Proteins, and predicted models (more information here: https://en.wikipedia.org/wiki/RefSeq) starting with
PREFIX = ['NM', 'NR', 'NP', 'XM', 'XR', 'XP', 'WP']
@@ -47,22 +40,27 @@ ids = set()
for term in TERMS:
num_read = BATCH_SIZE
retstart = 0
+
while num_read == BATCH_SIZE:
record = Entrez.read(
- Entrez.esearch(db='nuccore', term=term, idtype='acc',
- retstart=retstart, retmax=BATCH_SIZE)
+ Entrez.esearch(db='nuccore', term=term, idtype='acc', retstart=retstart, retmax=BATCH_SIZE)
)
+
idlist = record['IdList']
new_ids = set(idlist)
num_read = len(new_ids)
- print(num_read,":",idlist[0],file=sys.stderr)
retstart += num_read
- new_ids.difference_update(skip) # remove skip ids
+
+ print(num_read, ":", idlist[0], file=sys.stderr)
+
+ new_ids.difference_update(skip) # remove skip ids
new_ids = set([id for id in new_ids if id[:2] not in PREFIX])
- ids.update(new_ids) # add to total set
- print(f"Term: {term} --> #{len(new_ids)} new IDs ---> Total unique IDs #{len(ids)})",file=sys.stderr)
+ ids.update(new_ids) # add to total set
+
+ print(f"Term: {term} --> #{len(new_ids)} new IDs ---> Total unique IDs #{len(ids)}", file=sys.stderr)
+
if args.max and len(ids) > args.max:
- print(f"Stopping past #{args.max} items",file=sys.stderr)
+ print(f"Stopping past #{args.max} items", file=sys.stderr)
break
for id in ids: