about summary refs log tree commit diff
path: root/workflows/pull-data/genbank/sparql-fetch-ids
diff options
context:
space:
mode:
author Pjotr Prins 2020-12-31 14:25:33 +0000
committer Pjotr Prins 2020-12-31 14:25:33 +0000
commit 5216cf3149024052a3e87f4491d2bb7d9b06a237 (patch)
tree 4ef4481c5c34ad8bd3442ca9ba6f8b869a117447 /workflows/pull-data/genbank/sparql-fetch-ids
parent 3541089aa8af5d229e669eb38d3735cd2b0b8a05 (diff)
download bh20-seq-resource-5216cf3149024052a3e87f4491d2bb7d9b06a237.tar.gz
bh20-seq-resource-5216cf3149024052a3e87f4491d2bb7d9b06a237.tar.lz
bh20-seq-resource-5216cf3149024052a3e87f4491d2bb7d9b06a237.zip
Improve SPARQL query and comments
Diffstat (limited to 'workflows/pull-data/genbank/sparql-fetch-ids')
-rwxr-xr-x workflows/pull-data/genbank/sparql-fetch-ids 18
1 files changed, 13 insertions, 5 deletions
diff --git a/workflows/pull-data/genbank/sparql-fetch-ids b/workflows/pull-data/genbank/sparql-fetch-ids
index 9a8b8ee..683044c 100755
--- a/workflows/pull-data/genbank/sparql-fetch-ids
+++ b/workflows/pull-data/genbank/sparql-fetch-ids
@@ -1,4 +1,9 @@
 #!/usr/bin/env ruby
+#
+# Use a SPARQL query to fetch all IDs in the PubSeq database
+#
+#   sparql-fetch-ids > pubseq_ids.txt
+#
 
 require 'net/http'
 require 'json'
@@ -13,7 +18,8 @@ prefix schema: <https://schema.org/>
 PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
 "
 
-# Build a SPARQL query, submit and return results. Apply transform lambda
+# Build a SPARQL query, submit and return results. Apply transform
+# lambda when passed in
 
 def sparql q, transform = nil
   q = SPARQL_HEADER+q
@@ -42,16 +48,18 @@ start = 0
 num = MAX
 begin
   query = "
-select distinct ?id where {
+SELECT DISTINCT ?id
+FROM <http://covid-19.genenetwork.org/graph/metadata.ttl>
+WHERE {
 
-?arvid <http://biohackathon.org/bh20-seq-schema/original_fasta_label> ?id .
+  ?arvid <http://biohackathon.org/bh20-seq-schema/original_fasta_label> ?id .
 
-} limit #{num} offset #{start}
+} LIMIT #{num} OFFSET #{start}
 "
   list = sparql(query, lambda { |rec| rec[:id] })
   list.each do | l |
     print(l,"\n")
   end
+  $stderr.print("#{start}-#{start+list.size}:#{list.first}\n") # show progress
   start += num
-  $stderr.print(start,":",list.first,"\n")
 end while list.size == MAX