diff options
Diffstat (limited to 'workflows/pubseq/pubseq-fetch-ids')
-rwxr-xr-x | workflows/pubseq/pubseq-fetch-ids | 67 |
1 files changed, 67 insertions, 0 deletions
diff --git a/workflows/pubseq/pubseq-fetch-ids b/workflows/pubseq/pubseq-fetch-ids new file mode 100755 index 0000000..f5920ec --- /dev/null +++ b/workflows/pubseq/pubseq-fetch-ids @@ -0,0 +1,67 @@ +#!/usr/bin/env ruby +# +# Use a SPARQL query to fetch all IDs in the PubSeq database +# +# pubseq-fetch-ids > pubseq_ids.txt +# +# Note: requires Ruby 3.x. Older Ruby gives a syntax error + +require 'net/http' +require 'json' +require 'ostruct' +require 'erb' +require 'pp' + +MAX=5_000 + +SPARQL_HEADER=" +prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> +prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> +prefix dc: <http://purl.org/dc/terms/> +prefix schema: <https://schema.org/> +PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/> +" + +# Build a SPARQL query, submit and return results. Apply transform +# lambda when passed in +def sparql query, transform = nil + api_url = "http://sparql.genenetwork.org/sparql/?default-graph-uri=&format=application%2Fsparql-results%2Bjson&timeout=0&debug=on&run=+Run+Query+&query=#{ERB::Util.url_encode(SPARQL_HEADER + query)}" + + response = Net::HTTP.get_response(URI.parse(api_url)) + data = JSON.parse(response.body,symbolize_names: true) + data => { head: { vars: }, results: { bindings: results} } + vars = vars.map { |v| v.to_sym } + results.map { |rec| + # return results after transforming to a Hash and applying the + # optional transform lambda. Note the transform can not only + # reduce results, or create an array, but also may transform into + # an OpenStruct. + res = {} + vars.each { |name| res[name] = rec[name][:value] } + if transform + transform.call(res) + else + res + end + } +end + +start = 0 +num = MAX +begin + query = " +SELECT DISTINCT ?id +FROM <http://covid-19.genenetwork.org/graph/metadata.ttl> +WHERE { + + ?arvid <http://biohackathon.org/bh20-seq-schema/original_fasta_label> ?id . + +} LIMIT #{num} OFFSET #{start} +" + list = sparql(query, lambda { |rec| rec[:id] }) + list.each do | l | + print(l,"\n") + end + $stderr.print("#{start}-#{start+list.size}:#{list.first}\n") # show progress + start += num +end while list.size == MAX |