about | summary | refs | log | tree | commit | diff
path: root/workflows/pull-data/genbank
diff options
context:
space:
mode:
author Pjotr Prins 2021-01-01 16:24:37 +0000
committer Pjotr Prins 2021-01-01 16:24:37 +0000
commit 3f059ebde6fe6888e62f4fc232d05fb3a322b011 (patch)
tree 3702eff9a11381947716954f529b92cce9399b58 /workflows/pull-data/genbank
parent ee01616a8c5ab5449325599dfaea32341f049784 (diff)
download bh20-seq-resource-3f059ebde6fe6888e62f4fc232d05fb3a322b011.tar.gz
bh20-seq-resource-3f059ebde6fe6888e62f4fc232d05fb3a322b011.tar.lz
bh20-seq-resource-3f059ebde6fe6888e62f4fc232d05fb3a322b011.zip
gzip output
Diffstat (limited to 'workflows/pull-data/genbank')
-rwxr-xr-x workflows/pull-data/genbank/update-from-genbank.py 7
1 files changed, 4 insertions, 3 deletions
diff --git a/workflows/pull-data/genbank/update-from-genbank.py b/workflows/pull-data/genbank/update-from-genbank.py
index 6d6d90c..d92f87a 100755
--- a/workflows/pull-data/genbank/update-from-genbank.py
+++ b/workflows/pull-data/genbank/update-from-genbank.py
@@ -8,6 +8,7 @@
# See also directory .guix-run and README.md
import argparse
+import gzip
import os
import sys
from utils import chunks
@@ -35,9 +36,9 @@ if not os.path.exists(dir):
request_num = min(BATCH,args.max)
for i, idsx in enumerate(chunks(list(ids), request_num)):
- xmlfn = os.path.join(dir, f"metadata_{i}.xml")
+ xmlfn = os.path.join(dir, f"metadata_{i}.xml.gz")
print(f"Fetching {xmlfn} ({i*request_num})",file=sys.stderr)
- with open(xmlfn, 'w') as f:
- f.write(Entrez.efetch(db='nuccore', id=idsx, retmode='xml').read())
+ with gzip.open(xmlfn, 'w') as f:
+ f.write((Entrez.efetch(db='nuccore', id=idsx, retmode='xml').read()).encode())
if i*request_num >= args.max:
break