From 3f059ebde6fe6888e62f4fc232d05fb3a322b011 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Fri, 1 Jan 2021 16:24:37 +0000
Subject: gzip output

---
 workflows/pull-data/genbank/update-from-genbank.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/workflows/pull-data/genbank/update-from-genbank.py b/workflows/pull-data/genbank/update-from-genbank.py
index 6d6d90c..d92f87a 100755
--- a/workflows/pull-data/genbank/update-from-genbank.py
+++ b/workflows/pull-data/genbank/update-from-genbank.py
@@ -8,6 +8,7 @@
 # See also directory .guix-run and README.md
 
 import argparse
+import gzip
 import os
 import sys
 from utils import chunks
@@ -35,9 +36,9 @@ if not os.path.exists(dir):
 
 request_num = min(BATCH,args.max)
 for i, idsx in enumerate(chunks(list(ids), request_num)):
-    xmlfn = os.path.join(dir, f"metadata_{i}.xml")
+    xmlfn = os.path.join(dir, f"metadata_{i}.xml.gz")
     print(f"Fetching {xmlfn} ({i*request_num})",file=sys.stderr)
-    with open(xmlfn, 'w') as f:
-        f.write(Entrez.efetch(db='nuccore', id=idsx, retmode='xml').read())
+    with gzip.open(xmlfn, 'w') as f:
+        f.write((Entrez.efetch(db='nuccore', id=idsx, retmode='xml').read()).encode())
     if i*request_num >= args.max:
         break
-- 
cgit v1.2.3