diff options
author | AndreaGuarracino | 2021-01-05 17:56:19 +0100 |
---|---|---|
committer | AndreaGuarracino | 2021-01-05 17:56:19 +0100 |
commit | 911ba372cfc4b35c5b52d18a573a636ea78d16d7 (patch) | |
tree | 02388b3da4f2c79ec48e4d541d0c679910713bfb | |
parent | f7666a7766c8138aa690340fc68cb67f709327f3 (diff) | |
download | bh20-seq-resource-911ba372cfc4b35c5b52d18a573a636ea78d16d7.tar.gz bh20-seq-resource-911ba372cfc4b35c5b52d18a573a636ea78d16d7.tar.lz bh20-seq-resource-911ba372cfc4b35c5b52d18a573a636ea78d16d7.zip |
Clean up update-from-genbank.py; remove unused import from genbank-fetch-ids.py
-rwxr-xr-x | workflows/pull-data/genbank/genbank-fetch-ids.py | 1 | ||||
-rwxr-xr-x | workflows/pull-data/genbank/update-from-genbank.py | 25 |
2 files changed, 13 insertions, 13 deletions
diff --git a/workflows/pull-data/genbank/genbank-fetch-ids.py b/workflows/pull-data/genbank/genbank-fetch-ids.py index cb48cd8..e9e7315 100755 --- a/workflows/pull-data/genbank/genbank-fetch-ids.py +++ b/workflows/pull-data/genbank/genbank-fetch-ids.py @@ -8,7 +8,6 @@ import argparse import sys -from datetime import date from Bio import Entrez diff --git a/workflows/pull-data/genbank/update-from-genbank.py b/workflows/pull-data/genbank/update-from-genbank.py index dca5563..95f5a93 100755 --- a/workflows/pull-data/genbank/update-from-genbank.py +++ b/workflows/pull-data/genbank/update-from-genbank.py @@ -14,22 +14,21 @@ import sys from utils import chunks from Bio import Entrez -Entrez.email = 'another_email@gmail.com' # FIXME -BATCH=100 +Entrez.email = 'another_email@gmail.com' # FIXME + +BATCH = 100 parser = argparse.ArgumentParser() -parser.add_argument('--max', type=int, help='Max queries', required=False) parser.add_argument('--ids', type=str, help='File with ids to fetch, 1 id per line', required=True) parser.add_argument('--out', type=str, help='Directory to write to', required=True) +parser.add_argument('--max', type=int, help='Max queries', required=False) args = parser.parse_args() ids = set() with open(args.ids) as f: - content = f.readlines() - for line in content: - ids.add(line.strip()) + ids.update([line.strip() for line in f]) dir = args.out if not os.path.exists(dir): @@ -37,12 +36,14 @@ if not os.path.exists(dir): request_num = BATCH if args.max: - request_num = min(BATCH,args.max) + request_num = min(BATCH, args.max) + +for num_chunk, ids_chunk in enumerate(chunks(list(ids), request_num)): + xmlfn = os.path.join(dir, f"metadata_{num_chunk}.xml.gz") + print(f"Fetching {xmlfn} ({num_chunk * request_num})", file=sys.stderr) -for i, idsx in enumerate(chunks(list(ids), request_num)): - xmlfn = os.path.join(dir, f"metadata_{i}.xml.gz") - print(f"Fetching {xmlfn} ({i*request_num})",file=sys.stderr) with gzip.open(xmlfn, 'w') as f: - 
f.write((Entrez.efetch(db='nuccore', id=idsx, retmode='xml').read()).encode()) - if args.max and i*request_num >= args.max: + f.write(Entrez.efetch(db='nuccore', id=ids_chunk, retmode='xml').read().encode()) + + if args.max and num_chunk * request_num >= args.max: break |