aboutsummaryrefslogtreecommitdiff
path: root/workflows/tools/pubseq-fetch-ids
diff options
context:
space:
mode:
Diffstat (limited to 'workflows/tools/pubseq-fetch-ids')
-rwxr-xr-xworkflows/tools/pubseq-fetch-ids67
1 files changed, 67 insertions, 0 deletions
diff --git a/workflows/tools/pubseq-fetch-ids b/workflows/tools/pubseq-fetch-ids
new file mode 100755
index 0000000..f5920ec
--- /dev/null
+++ b/workflows/tools/pubseq-fetch-ids
@@ -0,0 +1,67 @@
+#!/usr/bin/env ruby
+#
+# Use a SPARQL query to fetch all IDs in the PubSeq database
+#
+# pubseq-fetch-ids > pubseq_ids.txt
+#
+# Note: requires Ruby 3.0 or newer (the script uses rightward pattern matching, "=>"). Older Ruby gives a syntax error
+
+require 'net/http'
+require 'json'
+require 'ostruct'
+require 'erb'
+require 'pp'
+
+MAX=5_000
+
+SPARQL_HEADER="
+prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+prefix dc: <http://purl.org/dc/terms/>
+prefix schema: <https://schema.org/>
+PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
+"
+
+# Build a SPARQL query, submit and return results. Apply transform
+# lambda when passed in
+def sparql query, transform = nil
+ api_url = "http://sparql.genenetwork.org/sparql/?default-graph-uri=&format=application%2Fsparql-results%2Bjson&timeout=0&debug=on&run=+Run+Query+&query=#{ERB::Util.url_encode(SPARQL_HEADER + query)}"
+
+ response = Net::HTTP.get_response(URI.parse(api_url))
+ data = JSON.parse(response.body,symbolize_names: true)
+ data => { head: { vars: }, results: { bindings: results} }
+ vars = vars.map { |v| v.to_sym }
+ results.map { |rec|
+ # return results after transforming to a Hash and applying the
+ # optional transform lambda. Note the transform can not only
+ # reduce results, or create an array, but also may transform into
+ # an OpenStruct.
+ res = {}
+ vars.each { |name| res[name] = rec[name][:value] }
+ if transform
+ transform.call(res)
+ else
+ res
+ end
+ }
+end
+
+start = 0
+num = MAX
+begin
+ query = "
+SELECT DISTINCT ?id
+FROM <http://covid-19.genenetwork.org/graph/metadata.ttl>
+WHERE {
+
+ ?arvid <http://biohackathon.org/bh20-seq-schema/original_fasta_label> ?id .
+
+} LIMIT #{num} OFFSET #{start}
+"
+ list = sparql(query, lambda { |rec| rec[:id] })
+ list.each do | l |
+ print(l,"\n")
+ end
+ $stderr.print("#{start}-#{start+list.size}:#{list.first}\n") # show progress
+ start += num
+end while list.size == MAX