#!/usr/bin/env python3
"""Fetch PubSeq sample metadata (JSON), and optionally FASTA, into a directory.

For every sample id the script asks the PubSeq API for the sample record,
follows the record's 'metadata' URL (and its 'fasta' URL when --fasta is
given) and stores the downloaded text as <out>/<id>.json and <out>/<id>.fa.
Files that already exist are left alone, so an interrupted run can simply be
started again.

History of workflows/tools/pubseq-fetch-data.py (author: Pjotr Prins):
  1187fa71  2021-01-05  fetches original metadata from PubSeq/Arvados
  2ceeccd5  2021-01-05  fetch: do a straight dump of the original record
  ced9613a  2021-01-05  Add option for fetching fasta
  17cd8caa  2021-01-05  Fetching fixes
  c31835f7  2021-01-06  Pubseq fetch: sometimes a request times out.
                        So repeat with intervals.
"""

import argparse
import json
import os
import sys
import time

DESCRIPTION = """

Fetch metadata (JSON) from PubSeq and optionally the FASTA files. IDs
can be passed in on the command line or in a file.

"""

SAMPLE_URL = "http://covid19.genenetwork.org/api/sample/{}.json"
MAX_RETRIES = 10       # extra attempts after the first failed request
RETRY_DELAY = 15       # seconds to wait between attempts
REQUEST_TIMEOUT = 60   # seconds before a single request is abandoned


class FetchError(Exception):
    """Raised when a resource can not be downloaded after all retries."""


def build_parser():
    """Return the command line parser (options: --fasta, --out, --ids, id...)."""
    parser = argparse.ArgumentParser(description=DESCRIPTION)
    parser.add_argument('--fasta', action='store_true',
                        help='Also fetch FASTA records')
    parser.add_argument('--out', type=str, help='Directory to write to',
                        required=True)
    parser.add_argument('--ids', type=str, help='File with ids',
                        required=False)
    parser.add_argument('id', nargs='*', help='id(s)')
    return parser


def read_ids(path):
    """Return the ids listed one per line in *path*, skipping blank lines."""
    with open(path) as f:
        stripped = (line.strip() for line in f)
        # A blank line would otherwise become the id "" and trigger a
        # request for ".../api/sample/.json".
        return [sample_id for sample_id in stripped if sample_id]


def get_with_retry(url, get, error_message,
                   retries=MAX_RETRIES, delay=RETRY_DELAY):
    """Fetch *url* with *get*, retrying on failure.

    *get* is called as ``get(url, timeout=REQUEST_TIMEOUT)`` and must return
    an object that is truthy on success (as ``requests.Response`` is for
    status codes below 400).  Both a falsy response and a transport error
    count as a failed attempt.

    Returns the first successful response.  Raises FetchError carrying
    *error_message* once the initial attempt and *retries* further attempts
    have all failed.
    """
    last_error = None
    for attempt in range(retries + 1):
        if attempt:
            time.sleep(delay)
        try:
            response = get(url, timeout=REQUEST_TIMEOUT)
        except OSError as err:
            # requests.RequestException (timeouts, connection errors, ...)
            # derives from IOError, i.e. OSError.
            last_error = err
            continue
        if response:
            return response
    raise FetchError(error_message) from last_error


def save_text(path, text):
    """Write *text* to *path* via a temporary file and an atomic rename.

    The presence of *path* is what marks a sample as already fetched, so a
    half-written file must never appear under the final name.
    """
    partial = path + ".part"
    with open(partial, "w", encoding="utf-8") as outf:
        outf.write(text)
    os.replace(partial, path)


def fetch_sample(sample_id, out_dir, get, want_fasta=False):
    """Download the metadata (and optionally FASTA) of one sample.

    Writes <out_dir>/<sample_id>.json and, when *want_fasta* is true,
    <out_dir>/<sample_id>.fa.  Each file is only fetched when it does not
    exist yet; if nothing is missing no request is made at all.

    Raises FetchError when the sample record or one of its files can not be
    retrieved.
    """
    json_path = os.path.join(out_dir, sample_id + ".json")
    fasta_path = os.path.join(out_dir, sample_id + ".fa")
    need_json = not os.path.exists(json_path)
    need_fasta = want_fasta and not os.path.exists(fasta_path)
    if not (need_json or need_fasta):
        return

    not_found = f"Can not find record for {sample_id}"
    response = get_with_retry(SAMPLE_URL.format(sample_id), get, not_found)
    try:
        record = response.json()[0]
    except (ValueError, LookupError, TypeError) as err:
        # Body is not JSON, or not a non-empty list of records.
        raise FetchError(not_found) from err

    if need_json:
        meta = get_with_retry(record['metadata'], get,
                              f"Can not fetch metadata for {sample_id}")
        save_text(json_path, meta.text)
    if need_fasta:
        fasta = get_with_retry(record['fasta'], get,
                               f"Can not fetch FASTA for {sample_id}")
        save_text(fasta_path, fasta.text)


def main(argv=None):
    """Parse the command line and fetch every requested sample."""
    # Third-party dependency; imported here so the helpers above can be
    # imported (and tested) on a machine without it.
    import requests

    parser = build_parser()
    args = parser.parse_args(argv)

    out_dir = args.out
    if not os.path.isdir(out_dir):
        raise Exception(f"Directory {out_dir} does not exist")

    ids = args.id
    if not ids:
        if args.ids is None:
            parser.error("no ids given: pass id(s) as arguments or use --ids")
        print(f"Reading {args.ids}")
        ids = read_ids(args.ids)

    for sample_id in ids:
        print(sample_id)
        fetch_sample(sample_id, out_dir, requests.get, want_fasta=args.fasta)


if __name__ == "__main__":
    sys.exit(main())