Merge branch 'master' into yamlfa2ttl

author: AndreaGuarracino 2021-01-07 23:50:01 +0100
committer: AndreaGuarracino 2021-01-07 23:50:01 +0100
commit: 4d841d279b2bf73da2ba815d53863c7f2861c956 (patch)
tree: 83b9ad136dabacbf7ed54e19b2db6df348bef904 /workflows/tools
parent: 141e619929cee17018417d71111063015e73c366 (diff)
parent: c080c3cffedcc0cc99496b5e70fcfdf998978f16 (diff)
download: bh20-seq-resource-4d841d279b2bf73da2ba815d53863c7f2861c956.tar.gz
bh20-seq-resource-4d841d279b2bf73da2ba815d53863c7f2861c956.tar.lz
bh20-seq-resource-4d841d279b2bf73da2ba815d53863c7f2861c956.zip
2 files changed, 56 insertions, 1 deletion
diff --git a/workflows/tools/pubseq-fetch-data.py b/workflows/tools/pubseq-fetch-data.py
new file mode 100755
index 0000000..ef4edde
--- /dev/null
+++ b/workflows/tools/pubseq-fetch-data.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+
+import argparse
+import json
+import os
+import requests
+import sys
+import time
+
+parser = argparse.ArgumentParser(description="""
+
+Fetch metadata (JSON) from PubSeq and optionally the FASTA files.  IDs
+can be passed in on the command line or in a file.
+
+""")
+parser.add_argument('--fasta', action='store_true', help='Also fetch FASTA records')
+parser.add_argument('--out', type=str, help='Directory to write to',
+required=True)
+parser.add_argument('--ids', type=str, help='File with ids', required=False)
+parser.add_argument('id', nargs='*', help='id(s)')
+args = parser.parse_args()
+
+dir = args.out
+if not os.path.exists(dir):
+    raise Exception(f"Directory {dir} does not exist")
+
+ids = args.id
+if (len(ids)==0):
+    print(f"Reading {args.ids}")
+    with open(args.ids) as f:
+        ids = [ l.strip() for l in f.readlines() ]
+
+for id in ids:
+    print(id)
+    jsonfn = dir+"/"+id+".json"
+    if not os.path.exists(jsonfn):
+        count = 0
+        r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json")
+        while not r:
+            count += 1
+            if count>10: raise Exception(f"Can not find record for {id}")
+            time.sleep(15)
+            r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json")
+        m_url = r.json()[0]['metadata']
+        mr = requests.get(m_url)
+        with open(dir+"/"+id+".json","w") as outf:
+            outf.write(mr.text)
+        if args.fasta:
+            fastafn = dir+"/"+id+".fa"
+            if os.path.exists(fastafn): continue
+            fa_url = r.json()[0]['fasta']
+            fr = requests.get(fa_url)
+            with open(fastafn,"w") as outf:
+                outf.write(fr.text)
+
diff --git a/workflows/tools/sparql-fetch-ids b/workflows/tools/pubseq-fetch-ids
index 19b2d82..f5920ec 100755
--- a/workflows/tools/sparql-fetch-ids
+++ b/workflows/tools/pubseq-fetch-ids
@@ -2,7 +2,7 @@
 #
 # Use a SPARQL query to fetch all IDs in the PubSeq database
 #
-#   sparql-fetch-ids > pubseq_ids.txt
+#   pubseq-fetch-ids > pubseq_ids.txt
 #
 # Note: requires Ruby 3.x. Older Ruby gives a syntax error
author	AndreaGuarracino	2021-01-07 23:50:01 +0100
committer	AndreaGuarracino	2021-01-07 23:50:01 +0100
commit	4d841d279b2bf73da2ba815d53863c7f2861c956 (patch)
tree	83b9ad136dabacbf7ed54e19b2db6df348bef904 /workflows/tools
parent	141e619929cee17018417d71111063015e73c366 (diff)
parent	c080c3cffedcc0cc99496b5e70fcfdf998978f16 (diff)
download	bh20-seq-resource-4d841d279b2bf73da2ba815d53863c7f2861c956.tar.gz bh20-seq-resource-4d841d279b2bf73da2ba815d53863c7f2861c956.tar.lz bh20-seq-resource-4d841d279b2bf73da2ba815d53863c7f2861c956.zip