genbank: minor fixes

author: Pjotr Prins 2021-01-01 10:59:56 -0600
committer: Pjotr Prins 2021-01-01 10:59:56 -0600
commit: a029c2329e748874bee88317e44d3f47547f71d8 (patch)
tree: 2ef6d412ff5ba3aae61c74898e7b2738ba4e6bc3 /workflows/pull-data/genbank
parent: 3f059ebde6fe6888e62f4fc232d05fb3a322b011 (diff)
download: bh20-seq-resource-a029c2329e748874bee88317e44d3f47547f71d8.tar.gz
bh20-seq-resource-a029c2329e748874bee88317e44d3f47547f71d8.tar.lz
bh20-seq-resource-a029c2329e748874bee88317e44d3f47547f71d8.zip
2 files changed, 8 insertions, 5 deletions
diff --git a/workflows/pull-data/genbank/README.md b/workflows/pull-data/genbank/README.md
index 479ddc9..5597234 100644
--- a/workflows/pull-data/genbank/README.md
+++ b/workflows/pull-data/genbank/README.md
@@ -2,11 +2,11 @@
 
 ```sh
 # --- get list of IDs already in PubSeq
-sparql-fetch-ids > pubseq_ids.txt
+./sparql-fetch-ids > pubseq_ids.txt
 # --- get list of missing genbank IDs
-genbank-fetch-ids.py --skip pubseq_ids.txt > genbank_ids.txt
+./genbank-fetch-ids.py --skip pubseq_ids.txt > genbank_ids.txt
 # --- fetch XML
-update-from-genbank.py --ids genbank_ids.txt --out ~/tmp/genbank
+python3 update-from-genbank.py --ids genbank_ids.txt --out ~/tmp/genbank
 # --- Transform to YAML and FASTA
 transform-genbank-xml2yamlfa --dir ~/tmp/genbank id --outdir ~/tmp/pubseq
 ```
diff --git a/workflows/pull-data/genbank/update-from-genbank.py b/workflows/pull-data/genbank/update-from-genbank.py
index d92f87a..3faea39 100755
--- a/workflows/pull-data/genbank/update-from-genbank.py
+++ b/workflows/pull-data/genbank/update-from-genbank.py
@@ -34,11 +34,14 @@ dir = args.out
 if not os.path.exists(dir):
     raise Exception(f"Directory {dir} does not exist")
 
-request_num = min(BATCH,args.max)
+request_num = BATCH
+if args.max:
+  request_num = min(BATCH,args.max)
+
 for i, idsx in enumerate(chunks(list(ids), request_num)):
     xmlfn = os.path.join(dir, f"metadata_{i}.xml.gz")
     print(f"Fetching {xmlfn} ({i*request_num})",file=sys.stderr)
     with gzip.open(xmlfn, 'w') as f:
         f.write((Entrez.efetch(db='nuccore', id=idsx, retmode='xml').read()).encode())
-    if i*request_num >= args.max:
+    if args.max and i*request_num >= args.max:
         break
author	Pjotr Prins	2021-01-01 10:59:56 -0600
committer	Pjotr Prins	2021-01-01 10:59:56 -0600
commit	a029c2329e748874bee88317e44d3f47547f71d8 (patch)
tree	2ef6d412ff5ba3aae61c74898e7b2738ba4e6bc3 /workflows/pull-data/genbank
parent	3f059ebde6fe6888e62f4fc232d05fb3a322b011 (diff)
download	bh20-seq-resource-a029c2329e748874bee88317e44d3f47547f71d8.tar.gz bh20-seq-resource-a029c2329e748874bee88317e44d3f47547f71d8.tar.lz bh20-seq-resource-a029c2329e748874bee88317e44d3f47547f71d8.zip