author    AndreaGuarracino    2021-01-05 17:56:19 +0100
committer AndreaGuarracino    2021-01-05 17:56:19 +0100
commit    911ba372cfc4b35c5b52d18a573a636ea78d16d7 (patch)
tree      02388b3da4f2c79ec48e4d541d0c679910713bfb
parent    f7666a7766c8138aa690340fc68cb67f709327f3 (diff)
cleaning update-from-genbank.py; removed unused import from genbank-fetch-ids.py
-rwxr-xr-x  workflows/pull-data/genbank/genbank-fetch-ids.py   |  1
-rwxr-xr-x  workflows/pull-data/genbank/update-from-genbank.py | 25
2 files changed, 13 insertions(+), 13 deletions(-)
diff --git a/workflows/pull-data/genbank/genbank-fetch-ids.py b/workflows/pull-data/genbank/genbank-fetch-ids.py
index cb48cd8..e9e7315 100755
--- a/workflows/pull-data/genbank/genbank-fetch-ids.py
+++ b/workflows/pull-data/genbank/genbank-fetch-ids.py
@@ -8,7 +8,6 @@
import argparse
import sys
-from datetime import date
from Bio import Entrez
diff --git a/workflows/pull-data/genbank/update-from-genbank.py b/workflows/pull-data/genbank/update-from-genbank.py
index dca5563..95f5a93 100755
--- a/workflows/pull-data/genbank/update-from-genbank.py
+++ b/workflows/pull-data/genbank/update-from-genbank.py
@@ -14,22 +14,21 @@ import sys
from utils import chunks
from Bio import Entrez
-Entrez.email = 'another_email@gmail.com' # FIXME
-BATCH=100
+Entrez.email = 'another_email@gmail.com'  # FIXME
+
+BATCH = 100
parser = argparse.ArgumentParser()
-parser.add_argument('--max', type=int, help='Max queries', required=False)
parser.add_argument('--ids', type=str, help='File with ids to fetch, 1 id per line', required=True)
parser.add_argument('--out', type=str, help='Directory to write to', required=True)
+parser.add_argument('--max', type=int, help='Max queries', required=False)
args = parser.parse_args()
ids = set()
with open(args.ids) as f:
-    content = f.readlines()
-    for line in content:
-        ids.add(line.strip())
+    ids.update([line.strip() for line in f])
dir = args.out
if not os.path.exists(dir):
@@ -37,12 +36,14 @@ if not os.path.exists(dir):
request_num = BATCH
if args.max:
-    request_num = min(BATCH,args.max)
+    request_num = min(BATCH, args.max)
+
+for num_chunk, ids_chunk in enumerate(chunks(list(ids), request_num)):
+    xmlfn = os.path.join(dir, f"metadata_{num_chunk}.xml.gz")
+    print(f"Fetching {xmlfn} ({num_chunk * request_num})", file=sys.stderr)
-for i, idsx in enumerate(chunks(list(ids), request_num)):
-    xmlfn = os.path.join(dir, f"metadata_{i}.xml.gz")
-    print(f"Fetching {xmlfn} ({i*request_num})",file=sys.stderr)
    with gzip.open(xmlfn, 'w') as f:
-        f.write((Entrez.efetch(db='nuccore', id=idsx, retmode='xml').read()).encode())
-    if args.max and i*request_num >= args.max:
+        f.write(Entrez.efetch(db='nuccore', id=ids_chunk, retmode='xml').read().encode())
+
+    if args.max and num_chunk * request_num >= args.max:
        break
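
After this change, update-from-genbank.py reads the id list into a set, then fetches GenBank nuccore records in batches of up to 100 ids and writes each batch to its own gzipped XML file. Below is a minimal standalone sketch of that batching pattern; the chunks() helper lives in the repo's utils module and is not part of this diff, so an assumed equivalent is defined inline, and fetch_metadata() is a hypothetical wrapper added here only for illustration.

import gzip
import os
import sys

from Bio import Entrez

Entrez.email = 'another_email@gmail.com'  # FIXME: use a real contact address for NCBI

BATCH = 100


def chunks(lst, n):
    # Assumed equivalent of utils.chunks: yield successive n-sized slices of lst.
    for i in range(0, len(lst), n):
        yield lst[i:i + n]


def fetch_metadata(ids, out_dir, batch=BATCH, max_queries=None):
    # Hypothetical wrapper around the loop in update-from-genbank.py:
    # fetch nuccore records in batches and write one gzipped XML file per batch.
    request_num = batch if max_queries is None else min(batch, max_queries)
    for num_chunk, ids_chunk in enumerate(chunks(list(ids), request_num)):
        xmlfn = os.path.join(out_dir, f"metadata_{num_chunk}.xml.gz")
        print(f"Fetching {xmlfn} ({num_chunk * request_num})", file=sys.stderr)
        with gzip.open(xmlfn, 'w') as f:
            f.write(Entrez.efetch(db='nuccore', id=ids_chunk, retmode='xml').read().encode())
        if max_queries and num_chunk * request_num >= max_queries:
            break

Writing each batch to a separate metadata_N.xml.gz file means an interrupted run can be resumed or individual chunks re-fetched without re-downloading everything.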