From 1187fa716cacde2b50566b67b5d619b8f12894f9 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Tue, 5 Jan 2021 12:07:39 +0000
Subject: Fetch original metadata from PubSeq/Arvados

---
 workflows/tools/pubseq-fetch-data.py | 41 ++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100755 workflows/tools/pubseq-fetch-data.py

(limited to 'workflows/tools/pubseq-fetch-data.py')

diff --git a/workflows/tools/pubseq-fetch-data.py b/workflows/tools/pubseq-fetch-data.py
new file mode 100755
index 0000000..c22d754
--- /dev/null
+++ b/workflows/tools/pubseq-fetch-data.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python3
+
+import argparse
+import json
+import os
+import requests
+import sys
+
+parser = argparse.ArgumentParser(description="""
+
+Fetch metadata (JSON) from PubSeq and optionally the FASTA files.  IDs
+can be passed in on the command line or in a file.
+
+""")
+parser.add_argument('--out', type=str, help='Directory to write to',
+required=True)
+parser.add_argument('--ids', type=str, help='File with ids', required=False)
+parser.add_argument('id', nargs='*', help='id(s)')
+args = parser.parse_args()
+
+dir = args.out
+if not os.path.exists(dir):
+    raise Exception(f"Directory {dir} does not exist")
+
+ids = args.id
+if (len(ids)==0):
+    print(f"Reading {args.ids}")
+    with open(args.ids) as f:
+        ids = [ l.strip() for l in f.readlines() ]
+
+for id in ids[0:2]:
+    print(id)
+    r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json")
+    if r:
+        m_url = r.json()[0]['metadata']
+        mr = requests.get(m_url)
+        meta = mr.json()
+        with open(dir+"/"+id+".json","w") as outf:
+            json.dump(meta, outf, indent=4)
+    else:
+        raise Exception(f"Can not find record for {id}")
-- 
cgit 1.4.1


From 2ceeccd5e5158362548b868390e9d411f73cd9ff Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Tue, 5 Jan 2021 12:29:41 +0000
Subject: fetch: do a straight dump of the original record

---
 workflows/tools/pubseq-fetch-data.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'workflows/tools/pubseq-fetch-data.py')

diff --git a/workflows/tools/pubseq-fetch-data.py b/workflows/tools/pubseq-fetch-data.py
index c22d754..3f5e6cf 100755
--- a/workflows/tools/pubseq-fetch-data.py
+++ b/workflows/tools/pubseq-fetch-data.py
@@ -28,14 +28,13 @@ if (len(ids)==0):
     with open(args.ids) as f:
         ids = [ l.strip() for l in f.readlines() ]
 
-for id in ids[0:2]:
+for id in ids:
     print(id)
     r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json")
     if r:
         m_url = r.json()[0]['metadata']
         mr = requests.get(m_url)
-        meta = mr.json()
         with open(dir+"/"+id+".json","w") as outf:
-            json.dump(meta, outf, indent=4)
+            outf.write(mr.text)
     else:
         raise Exception(f"Can not find record for {id}")
-- 
cgit 1.4.1


From ced9613aa1c18c6a68056d1898b69865beac9ac2 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Tue, 5 Jan 2021 12:35:05 +0000
Subject: Add option for fetching fasta

---
 workflows/tools/pubseq-fetch-data.py | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'workflows/tools/pubseq-fetch-data.py')

diff --git a/workflows/tools/pubseq-fetch-data.py b/workflows/tools/pubseq-fetch-data.py
index 3f5e6cf..23c4dea 100755
--- a/workflows/tools/pubseq-fetch-data.py
+++ b/workflows/tools/pubseq-fetch-data.py
@@ -12,6 +12,7 @@ Fetch metadata (JSON) from PubSeq and optionally the FASTA files.  IDs
 can be passed in on the command line or in a file.
 
 """)
+parser.add_argument('--fasta', action='store_true', help='Also fetch FASTA records')
 parser.add_argument('--out', type=str, help='Directory to write to',
 required=True)
 parser.add_argument('--ids', type=str, help='File with ids', required=False)
@@ -36,5 +37,10 @@ for id in ids:
         mr = requests.get(m_url)
         with open(dir+"/"+id+".json","w") as outf:
             outf.write(mr.text)
+        if args.fasta:
+            fa_url = r.json()[0]['fasta']
+            fr = requests.get(fa_url)
+            with open(dir+"/"+id+".fa","w") as outf:
+                outf.write(fr.text)
     else:
         raise Exception(f"Can not find record for {id}")
-- 
cgit 1.4.1


From 17cd8caa85991784f205109f2b64b255726a0e80 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Tue, 5 Jan 2021 07:13:15 -0600
Subject: Fetching fixes

---
 workflows/tools/pubseq-fetch-data.py | 30 +++++++++++++++++-------------
 1 file changed, 17 insertions(+), 13 deletions(-)

(limited to 'workflows/tools/pubseq-fetch-data.py')

diff --git a/workflows/tools/pubseq-fetch-data.py b/workflows/tools/pubseq-fetch-data.py
index 23c4dea..2119fdf 100755
--- a/workflows/tools/pubseq-fetch-data.py
+++ b/workflows/tools/pubseq-fetch-data.py
@@ -31,16 +31,20 @@ if (len(ids)==0):
 
 for id in ids:
     print(id)
-    r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json")
-    if r:
-        m_url = r.json()[0]['metadata']
-        mr = requests.get(m_url)
-        with open(dir+"/"+id+".json","w") as outf:
-            outf.write(mr.text)
-        if args.fasta:
-            fa_url = r.json()[0]['fasta']
-            fr = requests.get(fa_url)
-            with open(dir+"/"+id+".fa","w") as outf:
-                outf.write(fr.text)
-    else:
-        raise Exception(f"Can not find record for {id}")
+    jsonfn = dir+"/"+id+".json"
+    if not os.path.exists(jsonfn):
+        r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json")
+        if r:
+            m_url = r.json()[0]['metadata']
+            mr = requests.get(m_url)
+            with open(dir+"/"+id+".json","w") as outf:
+                outf.write(mr.text)
+            if args.fasta:
+                fastafn = dir+"/"+id+".fa"
+                if os.path.exists(fastafn): continue
+                fa_url = r.json()[0]['fasta']
+                fr = requests.get(fa_url)
+                with open(fastafn,"w") as outf:
+                    outf.write(fr.text)
+        else:
+            raise Exception(f"Can not find record for {id}")
-- 
cgit 1.4.1


From c31835f787f3ae36e26bad0a1803f8557f8084e7 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Wed, 6 Jan 2021 02:33:35 -0600
Subject: Pubseq fetch: a request sometimes times out, so retry up to
 10 times at 15-second intervals.

---
 workflows/tools/pubseq-fetch-data.py | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

(limited to 'workflows/tools/pubseq-fetch-data.py')

diff --git a/workflows/tools/pubseq-fetch-data.py b/workflows/tools/pubseq-fetch-data.py
index 2119fdf..ef4edde 100755
--- a/workflows/tools/pubseq-fetch-data.py
+++ b/workflows/tools/pubseq-fetch-data.py
@@ -5,6 +5,7 @@ import json
 import os
 import requests
 import sys
+import time
 
 parser = argparse.ArgumentParser(description="""
 
@@ -33,18 +34,22 @@ for id in ids:
     print(id)
     jsonfn = dir+"/"+id+".json"
     if not os.path.exists(jsonfn):
+        count = 0
         r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json")
-        if r:
-            m_url = r.json()[0]['metadata']
-            mr = requests.get(m_url)
-            with open(dir+"/"+id+".json","w") as outf:
-                outf.write(mr.text)
-            if args.fasta:
-                fastafn = dir+"/"+id+".fa"
-                if os.path.exists(fastafn): continue
-                fa_url = r.json()[0]['fasta']
-                fr = requests.get(fa_url)
-                with open(fastafn,"w") as outf:
-                    outf.write(fr.text)
-        else:
-            raise Exception(f"Can not find record for {id}")
+        while not r:
+            count += 1
+            if count>10: raise Exception(f"Can not find record for {id}")
+            time.sleep(15)
+            r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json")
+        m_url = r.json()[0]['metadata']
+        mr = requests.get(m_url)
+        with open(dir+"/"+id+".json","w") as outf:
+            outf.write(mr.text)
+        if args.fasta:
+            fastafn = dir+"/"+id+".fa"
+            if os.path.exists(fastafn): continue
+            fa_url = r.json()[0]['fasta']
+            fr = requests.get(fa_url)
+            with open(fastafn,"w") as outf:
+                outf.write(fr.text)
+
-- 
cgit 1.4.1