diff options
author | AndreaGuarracino | 2021-01-07 23:50:01 +0100 |
---|---|---|
committer | AndreaGuarracino | 2021-01-07 23:50:01 +0100 |
commit | 4d841d279b2bf73da2ba815d53863c7f2861c956 (patch) | |
tree | 83b9ad136dabacbf7ed54e19b2db6df348bef904 /workflows/pull-data/genbank/update-from-genbank.py | |
parent | 141e619929cee17018417d71111063015e73c366 (diff) | |
parent | c080c3cffedcc0cc99496b5e70fcfdf998978f16 (diff) | |
download | bh20-seq-resource-4d841d279b2bf73da2ba815d53863c7f2861c956.tar.gz bh20-seq-resource-4d841d279b2bf73da2ba815d53863c7f2861c956.tar.lz bh20-seq-resource-4d841d279b2bf73da2ba815d53863c7f2861c956.zip |
Merge branch 'master' into yamlfa2ttl
Diffstat (limited to 'workflows/pull-data/genbank/update-from-genbank.py')
-rwxr-xr-x | workflows/pull-data/genbank/update-from-genbank.py | 25 |
1 files changed, 13 insertions, 12 deletions
```diff
diff --git a/workflows/pull-data/genbank/update-from-genbank.py b/workflows/pull-data/genbank/update-from-genbank.py
index dca5563..95f5a93 100755
--- a/workflows/pull-data/genbank/update-from-genbank.py
+++ b/workflows/pull-data/genbank/update-from-genbank.py
@@ -14,22 +14,21 @@ import sys
 from utils import chunks
 
 from Bio import Entrez
-Entrez.email = 'another_email@gmail.com' # FIXME
 
-BATCH=100
+Entrez.email = 'another_email@gmail.com' # FIXME
+
+BATCH = 100
 
 parser = argparse.ArgumentParser()
-parser.add_argument('--max', type=int, help='Max queries', required=False)
 parser.add_argument('--ids', type=str, help='File with ids to fetch, 1 id per line', required=True)
 parser.add_argument('--out', type=str, help='Directory to write to', required=True)
+parser.add_argument('--max', type=int, help='Max queries', required=False)
 args = parser.parse_args()
 
 
 ids = set()
 with open(args.ids) as f:
-    content = f.readlines()
-    for line in content:
-        ids.add(line.strip())
+    ids.update([line.strip() for line in f])
 
 dir = args.out
 if not os.path.exists(dir):
@@ -37,12 +36,14 @@ if not os.path.exists(dir):
 
 request_num = BATCH
 if args.max:
-    request_num = min(BATCH,args.max)
+    request_num = min(BATCH, args.max)
+
+for num_chunk, ids_chunk in enumerate(chunks(list(ids), request_num)):
+    xmlfn = os.path.join(dir, f"metadata_{num_chunk}.xml.gz")
+    print(f"Fetching {xmlfn} ({num_chunk * request_num})", file=sys.stderr)
 
-for i, idsx in enumerate(chunks(list(ids), request_num)):
-    xmlfn = os.path.join(dir, f"metadata_{i}.xml.gz")
-    print(f"Fetching {xmlfn} ({i*request_num})",file=sys.stderr)
     with gzip.open(xmlfn, 'w') as f:
-        f.write((Entrez.efetch(db='nuccore', id=idsx, retmode='xml').read()).encode())
-    if args.max and i*request_num >= args.max:
+        f.write(Entrez.efetch(db='nuccore', id=ids_chunk, retmode='xml').read().encode())
+
+    if args.max and num_chunk * request_num >= args.max:
         break
```