#!/usr/bin/env ruby
# frozen_string_literal: true

# Page through a SPARQL endpoint and print every distinct ?id it returns,
# one per line, on stdout. After each page a progress line of the form
# "<next offset>:<first id of the page>" is written to stderr.
#
# Provenance: workflows/pull-data/genbank/sparql-fetch-ids, added by commit
# 3541089aa8af5d229e669eb38d3735cd2b0b8a05 ("genbank: sparql-fetch-ids",
# Pjotr Prins, Thu, 31 Dec 2020 14:17:42 +0000).

require 'net/http'
require 'json'
require 'ostruct'
require 'erb'

# Prefix declarations prepended to every query sent through #sparql.
#
# NOTE(review): no namespace IRI follows any of these prefixes. They look like
# they were stripped (as <...> markup) when the patch was rendered; as written
# these are not valid SPARQL prefix declarations. Restore the IRIs from the
# upstream file before running - they are deliberately not guessed here.
SPARQL_HEADER = "
prefix rdfs:
prefix rdf:
prefix dc:
prefix schema:
PREFIX pubseq:
"

# Endpoint plus the fixed request parameters; the URL-encoded query text is
# appended as the final "query" parameter.
SPARQL_ENDPOINT = 'http://sparql.genenetwork.org/sparql/' \
                  '?default-graph-uri=&format=application%2Fsparql-results%2Bjson' \
                  '&timeout=0&debug=on&run=+Run+Query+'

# Build a SPARQL query, submit it and return the results.
#
# @param q [String] query body; SPARQL_HEADER is prepended before sending
# @param transform [#call, nil] optional callable applied to every row
# @return [Array] one entry per result binding: a Hash mapping each selected
#   variable name (as a Symbol) to its value, or whatever +transform+ returns
#   for that Hash. A variable missing from a binding maps to nil.
# @raise [RuntimeError] when the endpoint answers with a non-2xx status
# @raise [JSON::ParserError] when the response body is not valid JSON
# @raise [KeyError] when the JSON lacks head/vars or results/bindings
def sparql(q, transform = nil)
  full_query = SPARQL_HEADER + q
  api_url = "#{SPARQL_ENDPOINT}&query=#{ERB::Util.url_encode(full_query)}"

  response = Net::HTTP.get_response(URI.parse(api_url))
  # Fail with the HTTP status instead of a confusing JSON parse error when the
  # endpoint returns an error page.
  unless response.is_a?(Net::HTTPSuccess)
    raise "SPARQL endpoint returned HTTP #{response.code} #{response.message}"
  end

  data = JSON.parse(response.body)
  vars = data.fetch('head').fetch('vars')
  bindings = data.fetch('results').fetch('bindings')

  bindings.map do |binding|
    row = vars.each_with_object({}) do |name, acc|
      # The SPARQL JSON results format omits unbound variables from a
      # binding, so use dig to get nil rather than a NoMethodError.
      acc[name.to_sym] = binding.dig(name, 'value')
    end
    transform ? transform.call(row) : row
  end
end

# Page size; a page shorter than this ends the loop below.
MAX = 5_000

offset = 0
loop do
  # NOTE(review): the triple pattern below has only a subject and an object;
  # the predicate IRI appears to have been lost the same way as the prefix
  # IRIs above. Restore it from the upstream file.
  # NOTE(review): there is no ORDER BY, so paging relies on the endpoint
  # returning rows in a stable order across requests - confirm this holds.
  query = "
select distinct ?id where {

?arvid ?id .

} limit #{MAX} offset #{offset}
"
  page = sparql(query, ->(rec) { rec[:id] })
  page.each { |id| print(id, "\n") }

  offset += MAX
  $stderr.print("#{offset}:#{page.first}\n")

  # A page that is not completely full is the last one.
  break unless page.size == MAX
end