#!/usr/bin/env ruby
#
# Use a SPARQL query to fetch all IDs in the PubSeq database
#
#   sparql-fetch-ids > pubseq_ids.txt
#
# IDs are written to stdout, one per line; paging progress is written
# to stderr so it does not end up in the redirected output.
#

require 'net/http'
require 'json'
require 'ostruct'
require 'erb'

# Prefix declarations that are prepended to every query.
#
# NOTE(review): the prefix names below are not followed by an <IRI>;
# they look like they were stripped somewhere along the way - confirm
# against the upstream copy before running.
SPARQL_HEADER=" prefix rdfs: prefix rdf: prefix dc: prefix schema: PREFIX pubseq: ".freeze

# Build a SPARQL query, submit it and return the results.
#
# q         - query text; SPARQL_HEADER is prepended before sending
# transform - optional lambda that is called with each result Hash
#
# Returns an Array with one entry per result binding. Each binding is
# first turned into a Hash that maps every variable listed in the
# response head (as a Symbol) to its 'value' string, or to nil when the
# binding does not contain that variable. When a transform is passed in,
# the entry is whatever the transform returns for that Hash, so it can
# reduce the record, build an array or create e.g. an OpenStruct.
#
# Raises RuntimeError when the server does not answer with a 2xx status
# and JSON::ParserError when the body is not valid JSON.
def sparql(q, transform = nil)
  q = SPARQL_HEADER + q
  api_url = "http://sparql.genenetwork.org/sparql/?default-graph-uri=&format=application%2Fsparql-results%2Bjson&timeout=0&debug=on&run=+Run+Query+&query=#{ERB::Util.url_encode(q)}"

  response = Net::HTTP.get_response(URI.parse(api_url))
  # Fail with the HTTP status instead of an unrelated JSON parse error
  # when the endpoint answers with an error page.
  unless response.is_a?(Net::HTTPSuccess)
    raise "SPARQL request failed: HTTP #{response.code} #{response.message}"
  end

  data = JSON.parse(response.body)
  vars = data['head']['vars']

  data['results']['bindings'].map do |rec|
    res = vars.each_with_object({}) do |name, fields|
      # A binding may leave out variables that are unbound for that
      # solution; map those to nil rather than calling [] on nil.
      bound = rec[name]
      fields[name.to_sym] = bound && bound['value']
    end
    transform ? transform.call(res) : res
  end
end

# Number of IDs requested per query (used as LIMIT and as OFFSET step).
MAX=5_000

start = 0

# Page through the result set until a page comes back that is not full.
#
# NOTE(review): the query below shows no graph after FROM and no
# predicate between ?arvid and ?id; presumably <IRI> terms were lost -
# confirm against the upstream copy.
# NOTE(review): there is no ORDER BY, so the pages rely on the endpoint
# returning rows in the same order for every request - TODO confirm.
loop do
  query = " SELECT DISTINCT ?id FROM WHERE { ?arvid ?id . } LIMIT #{MAX} OFFSET #{start} "
  ids = sparql(query, lambda { |rec| rec[:id] })

  ids.each { |id| print(id, "\n") }
  $stderr.print("#{start}-#{start+ids.size}:#{ids.first}\n") # show progress

  start += MAX
  break unless ids.size == MAX
end