diff options
author | Pjotr Prins | 2021-01-01 10:59:56 -0600 |
---|---|---|
committer | Pjotr Prins | 2021-01-01 10:59:56 -0600 |
commit | a029c2329e748874bee88317e44d3f47547f71d8 (patch) | |
tree | 2ef6d412ff5ba3aae61c74898e7b2738ba4e6bc3 /workflows/pull-data/genbank | |
parent | 3f059ebde6fe6888e62f4fc232d05fb3a322b011 (diff) | |
download | bh20-seq-resource-a029c2329e748874bee88317e44d3f47547f71d8.tar.gz bh20-seq-resource-a029c2329e748874bee88317e44d3f47547f71d8.tar.lz bh20-seq-resource-a029c2329e748874bee88317e44d3f47547f71d8.zip |
genbank: minor fixes
Diffstat (limited to 'workflows/pull-data/genbank')
-rw-r--r-- | workflows/pull-data/genbank/README.md | 6 | ||||
-rwxr-xr-x | workflows/pull-data/genbank/update-from-genbank.py | 7 |
2 files changed, 8 insertions, 5 deletions
````diff
diff --git a/workflows/pull-data/genbank/README.md b/workflows/pull-data/genbank/README.md
index 479ddc9..5597234 100644
--- a/workflows/pull-data/genbank/README.md
+++ b/workflows/pull-data/genbank/README.md
@@ -2,11 +2,11 @@
 
 ```sh
 # --- get list of IDs already in PubSeq
-sparql-fetch-ids > pubseq_ids.txt
+./sparql-fetch-ids > pubseq_ids.txt
 # --- get list of missing genbank IDs
-genbank-fetch-ids.py --skip pubseq_ids.txt > genbank_ids.txt
+./genbank-fetch-ids.py --skip pubseq_ids.txt > genbank_ids.txt
 # --- fetch XML
-update-from-genbank.py --ids genbank_ids.txt --out ~/tmp/genbank
+python3 update-from-genbank.py --ids genbank_ids.txt --out ~/tmp/genbank
 # --- Transform to YAML and FASTA
 transform-genbank-xml2yamlfa --dir ~/tmp/genbank id --outdir ~/tmp/pubseq
 ```
diff --git a/workflows/pull-data/genbank/update-from-genbank.py b/workflows/pull-data/genbank/update-from-genbank.py
index d92f87a..3faea39 100755
--- a/workflows/pull-data/genbank/update-from-genbank.py
+++ b/workflows/pull-data/genbank/update-from-genbank.py
@@ -34,11 +34,14 @@ dir = args.out
 if not os.path.exists(dir):
     raise Exception(f"Directory {dir} does not exist")
 
-request_num = min(BATCH,args.max)
+request_num = BATCH
+if args.max:
+    request_num = min(BATCH,args.max)
+
 for i, idsx in enumerate(chunks(list(ids), request_num)):
     xmlfn = os.path.join(dir, f"metadata_{i}.xml.gz")
     print(f"Fetching {xmlfn} ({i*request_num})",file=sys.stderr)
     with gzip.open(xmlfn, 'w') as f:
         f.write((Entrez.efetch(db='nuccore', id=idsx, retmode='xml').read()).encode())
-    if i*request_num >= args.max:
+    if args.max and i*request_num >= args.max:
         break
````