#!/usr/bin/env ruby
# frozen_string_literal: true
#
# Use a SPARQL query to fetch all IDs in the PubSeq database
#
#   pubseq-fetch-ids > pubseq_ids.txt
#
# IDs are written to stdout, one per line; paging progress goes to stderr.
#
# Note: requires Ruby 3.x. Older Ruby gives a syntax error (the rightward
# pattern match `data => {...}` in #sparql).

require 'net/http'
require 'json'
require 'ostruct'
require 'erb'
require 'pp'

# Number of rows requested per query (used as the SPARQL LIMIT and as the
# OFFSET step of the paging loop at the bottom of this file).
MAX = 5_000

# Prefix declarations prepended to every query sent by #sparql.
# NOTE(review): the prefixes below carry no IRIs (`prefix rdfs: <...>`); they
# look like they were lost somewhere upstream -- confirm against the intended
# namespaces before relying on prefixed names in queries.
SPARQL_HEADER = " prefix rdfs: prefix rdf: prefix dc: prefix schema: PREFIX pubseq: "

# Build a SPARQL query, submit it to the GeneNetwork SPARQL endpoint and
# return the results as an Array with one entry per result row.
#
# query     - String with the query body; SPARQL_HEADER is prepended and the
#             whole text is URL-encoded into the request.
# transform - optional callable (e.g. a lambda). When given it is called with
#             each row Hash and its return value replaces that row, so it can
#             reduce a row to a single value, build an Array, or wrap the row
#             in an OpenStruct.
#
# Each row is a Hash keyed by the query's variable names (as Symbols) holding
# the binding's :value. A variable that is absent from a row maps to nil.
#
# Raises RuntimeError when the endpoint does not answer with a 2xx status,
# JSON::ParserError when the body is not JSON, and NoMatchingPatternKeyError
# when the JSON lacks the head/vars or results/bindings members.
def sparql(query, transform = nil)
  api_url = 'http://sparql.genenetwork.org/sparql/?default-graph-uri=' \
            '&format=application%2Fsparql-results%2Bjson&timeout=0&debug=on' \
            "&run=+Run+Query+&query=#{ERB::Util.url_encode(SPARQL_HEADER + query)}"
  response = Net::HTTP.get_response(URI.parse(api_url))
  # Fail with the HTTP status instead of a confusing JSON/pattern error later.
  unless response.is_a?(Net::HTTPSuccess)
    raise "SPARQL request failed: HTTP #{response.code} #{response.message}"
  end

  data = JSON.parse(response.body, symbolize_names: true)
  data => { head: { vars: }, results: { bindings: } }
  # symbolize_names only converts Hash keys; the variable names in head.vars
  # are Array elements and still Strings at this point.
  names = vars.map(&:to_sym)

  bindings.map do |row|
    # dig tolerates rows that omit a variable (yields nil rather than raising).
    record = names.to_h { |name| [name, row.dig(name, :value)] }
    transform ? transform.call(record) : record
  end
end

# Page through all IDs, MAX rows at a time, until a page comes back that is
# not completely full.
# NOTE(review): the query has an empty FROM clause and no predicate between
# ?arvid and ?id -- the IRIs appear to be missing; confirm the intended graph
# and predicate. Also, paging with LIMIT/OFFSET but without ORDER BY relies on
# the endpoint returning rows in a stable order -- verify.
start = 0
num = MAX
loop do
  query = " SELECT DISTINCT ?id FROM WHERE { ?arvid ?id . } LIMIT #{num} OFFSET #{start} "
  list = sparql(query, ->(rec) { rec[:id] })
  list.each { |id| print(id, "\n") }
  $stderr.print("#{start}-#{start + list.size}:#{list.first}\n") # show progress
  start += num
  break unless list.size == MAX
end