diff options
author | AndreaGuarracino | 2021-01-07 23:50:01 +0100 |
---|---|---|
committer | AndreaGuarracino | 2021-01-07 23:50:01 +0100 |
commit | 4d841d279b2bf73da2ba815d53863c7f2861c956 (patch) | |
tree | 83b9ad136dabacbf7ed54e19b2db6df348bef904 /workflows/tools | |
parent | 141e619929cee17018417d71111063015e73c366 (diff) | |
parent | c080c3cffedcc0cc99496b5e70fcfdf998978f16 (diff) | |
download | bh20-seq-resource-4d841d279b2bf73da2ba815d53863c7f2861c956.tar.gz bh20-seq-resource-4d841d279b2bf73da2ba815d53863c7f2861c956.tar.lz bh20-seq-resource-4d841d279b2bf73da2ba815d53863c7f2861c956.zip |
Merge branch 'master' into yamlfa2ttl
Diffstat (limited to 'workflows/tools')
-rwxr-xr-x | workflows/tools/pubseq-fetch-data.py | 55 | ||||
-rwxr-xr-x | workflows/tools/pubseq-fetch-ids (renamed from workflows/tools/sparql-fetch-ids) | 2 |
2 files changed, 56 insertions, 1 deletions
diff --git a/workflows/tools/pubseq-fetch-data.py b/workflows/tools/pubseq-fetch-data.py new file mode 100755 index 0000000..ef4edde --- /dev/null +++ b/workflows/tools/pubseq-fetch-data.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python3 + +import argparse +import json +import os +import requests +import sys +import time + +parser = argparse.ArgumentParser(description=""" + +Fetch metadata (JSON) from PubSeq and optionally the FASTA files. IDs +can be passed in on the command line or in a file. + +""") +parser.add_argument('--fasta', action='store_true', help='Also fetch FASTA records') +parser.add_argument('--out', type=str, help='Directory to write to', +required=True) +parser.add_argument('--ids', type=str, help='File with ids', required=False) +parser.add_argument('id', nargs='*', help='id(s)') +args = parser.parse_args() + +dir = args.out +if not os.path.exists(dir): + raise Exception(f"Directory {dir} does not exist") + +ids = args.id +if (len(ids)==0): + print(f"Reading {args.ids}") + with open(args.ids) as f: + ids = [ l.strip() for l in f.readlines() ] + +for id in ids: + print(id) + jsonfn = dir+"/"+id+".json" + if not os.path.exists(jsonfn): + count = 0 + r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json") + while not r: + count += 1 + if count>10: raise Exception(f"Can not find record for {id}") + time.sleep(15) + r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json") + m_url = r.json()[0]['metadata'] + mr = requests.get(m_url) + with open(dir+"/"+id+".json","w") as outf: + outf.write(mr.text) + if args.fasta: + fastafn = dir+"/"+id+".fa" + if os.path.exists(fastafn): continue + fa_url = r.json()[0]['fasta'] + fr = requests.get(fa_url) + with open(fastafn,"w") as outf: + outf.write(fr.text) + diff --git a/workflows/tools/sparql-fetch-ids b/workflows/tools/pubseq-fetch-ids index 19b2d82..f5920ec 100755 --- a/workflows/tools/sparql-fetch-ids +++ b/workflows/tools/pubseq-fetch-ids @@ -2,7 +2,7 @@ # # Use a SPARQL query to fetch all IDs in the PubSeq database # -# sparql-fetch-ids > pubseq_ids.txt +# pubseq-fetch-ids > pubseq_ids.txt # # Note: requires Ruby 3.x. Older Ruby gives a syntax error |