From 8a7e79d6daa06da4d8ca2a391bae0a00124a2ed3 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Thu, 28 Jan 2021 18:45:52 +0000
Subject: Moving tools out of submodules (sorry!)

---
 workflows/pubseq/normalize-yamlfa.py   |  97 ++++++++++++++++++++++++++++
 workflows/pubseq/normalize/README.md   |  14 ++++
 workflows/pubseq/normalize/__init__.py |   0
 workflows/pubseq/normalize/mapping.py  | 102 +++++++++++++++++++++++++++++
 workflows/pubseq/pubseq-fetch-data.py  |  55 ++++++++++++++++
 workflows/pubseq/pubseq-fetch-ids      |  67 +++++++++++++++++++
 workflows/tools/normalize-yamlfa.py    |  97 ----------------------------
 workflows/tools/normalize/README.md    |  14 ----
 workflows/tools/normalize/__init__.py  |   0
 workflows/tools/normalize/mapping.py   | 102 -----------------------------
 workflows/tools/pubseq-fetch-data.py   |  55 ----------------
 workflows/tools/pubseq-fetch-ids       |  67 -------------------
 12 files changed, 335 insertions(+), 335 deletions(-)
 create mode 100755 workflows/pubseq/normalize-yamlfa.py
 create mode 100644 workflows/pubseq/normalize/README.md
 create mode 100644 workflows/pubseq/normalize/__init__.py
 create mode 100644 workflows/pubseq/normalize/mapping.py
 create mode 100755 workflows/pubseq/pubseq-fetch-data.py
 create mode 100755 workflows/pubseq/pubseq-fetch-ids
 delete mode 100755 workflows/tools/normalize-yamlfa.py
 delete mode 100644 workflows/tools/normalize/README.md
 delete mode 100644 workflows/tools/normalize/__init__.py
 delete mode 100644 workflows/tools/normalize/mapping.py
 delete mode 100755 workflows/tools/pubseq-fetch-data.py
 delete mode 100755 workflows/tools/pubseq-fetch-ids

diff --git a/workflows/pubseq/normalize-yamlfa.py b/workflows/pubseq/normalize-yamlfa.py
new file mode 100755
index 0000000..55a8848
--- /dev/null
+++ b/workflows/pubseq/normalize-yamlfa.py
@@ -0,0 +1,97 @@
+# --- Normalize data
+# normalize-yamlfa.py [--yaml] --state ~/tmp/pubseq/state.json [id(s)]
+#
+# Example:
+#
+# python3 ./workflows/pubseq/normalize-yamlfa.py -s ~/tmp/yamlfa/state.json --species ncbi_host_species.csv --specimen specimen.csv --validate
+
+import argparse
+import json
+import os
+import sys
+import types
+import normalize.mapping as mapping
+
+parser = argparse.ArgumentParser(description="""
+
+Normalize parameters in PubSeq JSON/YAML files. All entries in the
+directory are parsed using the state.json file. It is possible
+to select a subset of IDs.
+
+This tool has two modes of operation. It can validate with the
+--validate switch, which stops at the first warning and does no
+rewriting. This mode is typically used for troubleshooting.
+
+The other mode is --rewrite, which rewrites the JSON files after
+making a backup (.bak) of the original. This mode updates files and
+won't stop - it is used for (automated) uploads.
+
+""")
+
+parser.add_argument('-s','--state', type=str, help='State file (JSON) as produced by transform2yamlfa', required=True)
+parser.add_argument('--species', type=str, help='Species mapping file')
+parser.add_argument('--specimen', type=str, help='Optional specimen mapping file')
+parser.add_argument('--validate', action='store_true', help='Validation mode - stops on warning')
+parser.add_argument('--rewrite', action='store_true', help='Rewrite mode - updates files')
+parser.add_argument('--yaml', action='store_true', help='Input YAML instead of JSON')
+parser.add_argument('id', nargs='*', help='optional id(s)')
+
+args = parser.parse_args()
+
+with open(args.state) as jsonf:
+    data = json.load(jsonf)
+
+dir = os.path.dirname(args.state)
+do_validate = args.validate
+do_rewrite = args.rewrite
+
+ids = args.id
+if not len(ids):
+    ids = list(data.keys())
+
+species = {}
+if args.species:
+    with open(args.species) as f:
+        for line in f:
+            name,uri = line.strip().split(',')
+            species[name] = uri
+else:
+    print("WARNING: no species mapping",file=sys.stderr)
+specimen = {}
+if args.specimen:
+    with open(args.specimen) as f:
+        for line in f:
+            name,uri = line.strip().split(',')
+            specimen[name] = uri
+else:
+    print("WARNING: no specimen mapping",file=sys.stderr)
+
+for id in ids:
+    if args.yaml:
+        raise Exception("YAML not yet supported")
+    fn = f"{dir}/{id}.json"
+    print(f"Reading {fn}",file=sys.stderr)
+    with open(fn) as f:
+        rec = types.SimpleNamespace(**json.load(f))
+        if do_validate:
+            print(rec)
+        rec.host,warning = mapping.host_species(rec.host,species)
+        if warning:
+            print("WARNING "+warning,file=sys.stderr)
+            rec.warnings.append(warning)
+        rec.sample,warning = mapping.specimen_source(rec.sample,specimen)
+        if warning:
+            print("WARNING "+warning,file=sys.stderr)
+            rec.warnings.append(warning)
+        print(rec)
+        if do_validate and warning:
+            print("bailing out in validation mode",file=sys.stderr)
+            sys.exit(2)
+        if do_rewrite:
+            if not os.path.exists(fn+".bak"): # make backup the first time
+                os.rename(fn,fn+".bak")
+            with open(fn, 'w') as outfile:
+                print(f"  Writing {fn}")
+                json.dump(rec.__dict__, outfile, indent=2)
+        else:
+            print(rec)
diff --git a/workflows/pubseq/normalize/README.md b/workflows/pubseq/normalize/README.md
new file mode 100644
index 0000000..b780a68
--- /dev/null
+++ b/workflows/pubseq/normalize/README.md
@@ -0,0 +1,14 @@
+# Normalization steps
+
+This library contains generic logic to normalize (string) data and
+transform strings to URIs. It should be applicable to data from
+any source (GenBank, ENA etc).
+
+Important: missing data should be missing or None! Do not fill
+in data by 'guessing'.
+
+When data is malformed a warning should be logged and added to the
+warning list. Functions should be small enough to return only 1
+warning!
+
+Pjotr Prins (c) 2021
diff --git a/workflows/pubseq/normalize/__init__.py b/workflows/pubseq/normalize/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/workflows/pubseq/normalize/mapping.py b/workflows/pubseq/normalize/mapping.py
new file mode 100644
index 0000000..3ed09c2
--- /dev/null
+++ b/workflows/pubseq/normalize/mapping.py
@@ -0,0 +1,102 @@
+# Normalization steps
+#
+# This library contains generic logic to normalize (string) data and
+# transform strings to URIs. It should be applicable to data from
+# any source (GenBank, ENA etc).
+#
+# Important: missing data should be missing or None! Do not fill
+# in data by 'guessing'.
+#
+# When data is malformed a warning should be logged and added to the
+# warning list. Functions should be small enough to return only 1
+# warning!
+#
+# Pjotr Prins (c) 2021
+
+import re
+import types
+
+def host_species(host,mapping):
+    Homo_sapiens = "http://purl.obolibrary.org/obo/NCBITaxon_9606"
+
+    SPECIES_TERMS = { # since Python 3.7 dict is ordered! Note that re is allowed
+        "human": Homo_sapiens,
+        "sapiens": Homo_sapiens,
+        "Mustela lutreola": "http://purl.obolibrary.org/obo/NCBITaxon_9666",
+        "Manis javanica": "http://purl.obolibrary.org/obo/NCBITaxon_9974",
+        "Felis catus": "http://purl.obolibrary.org/obo/NCBITaxon_9685",
+        "Panthera tigris": "http://purl.obolibrary.org/obo/NCBITaxon_419130",
+        "Canis lupus": "http://purl.obolibrary.org/obo/NCBITaxon_9615",
+        # Mink:
+        "vison": "http://purl.obolibrary.org/obo/NCBITaxon_452646"
+    }
+
+    warning = None
+    host = types.SimpleNamespace(**host)
+    if not 'obolibrary' in host.host_species:
+        key = host.host_species
+        host.host_species = None
+        if key in mapping:
+            host.host_species = mapping[key]
+        else:
+            for term in SPECIES_TERMS:
+                p = re.compile(".*?"+term,re.IGNORECASE)
+                m = p.match(key)
+                if m: host.host_species = SPECIES_TERMS[term]
+        if not host.host_species:
+            warning = f"No URI mapping for host_species <{key}>"
+    if host.host_species == Unknown or host.host_species == None:
+        del(host.host_species)
+    return host.__dict__,warning
+
+Unknown = "Not found" # So as not to create a warning
+
+def specimen_source(sample,mapping):
+    Oronasopharynx = "http://purl.obolibrary.org/obo/NCIT_C155835"
+    Oropharyngeal = "http://purl.obolibrary.org/obo/NCIT_C155835"
+    Nasopharyngeal = "http://purl.obolibrary.org/obo/NCIT_C155831"
+    Bronchoalveolar_Lavage_Fluid = "http://purl.obolibrary.org/obo/NCIT_C13195"
+    Saliva = "http://purl.obolibrary.org/obo/NCIT_C13275"
+    Nasal_Swab = Nasopharyngeal # "http://purl.obolibrary.org/obo/NCIT_C132119"
+    Frozen_Food = "https://www.wikidata.org/wiki/Q751728"
+    Bronchoalveolar_Lavage = "http://purl.obolibrary.org/obo/NCIT_C13195" # nb. no trailing comma - that would turn this into a tuple
+    Biospecimen = "http://purl.obolibrary.org/obo/NCIT_C70699"
+    SPECIMEN_TERMS = { # since Python 3.7 dict is ordered! Note that re is allowed
+        "Oronasopharynx": Oronasopharynx,
+        "orophar": Oropharyngeal,
+        "pharyngeal": Nasopharyngeal,
+        r"\snares": Nasal_Swab,
+        "saliva": Saliva,
+        "swab": Nasal_Swab,
+        "broncho": Bronchoalveolar_Lavage,
+        "seafood": Frozen_Food,
+        "packaging": Frozen_Food,
+        "specimen": Biospecimen,
+        "patient": Biospecimen,
+        "uknown": Unknown, # sic - also match this common typo in source data
+        "unknown": Unknown
+    }
+    warning = None
+    sample = types.SimpleNamespace(**sample)
+    try:
+        if sample.specimen_source:
+            keys = sample.specimen_source
+            sample.specimen_source = []
+            for key in keys:
+                if 'obolibrary' in key:
+                    sample.specimen_source.append(key)
+                    continue
+                if key in mapping:
+                    sample.specimen_source.append(mapping[key])
+                else:
+                    for term in SPECIMEN_TERMS:
+                        p = re.compile(".*?"+term,re.IGNORECASE)
+                        m = p.match(key)
+                        if m: sample.specimen_source = [SPECIMEN_TERMS[term]]
+            if len(sample.specimen_source)==0:
+                warning = f"No URI mapping for specimen_source <{key}>"
+        if sample.specimen_source == Unknown or sample.specimen_source == None:
+            del(sample.specimen_source)
+    except AttributeError:
+        pass
+    return sample.__dict__,warning
diff --git a/workflows/pubseq/pubseq-fetch-data.py b/workflows/pubseq/pubseq-fetch-data.py
new file mode 100755
index 0000000..ef4edde
--- /dev/null
+++ b/workflows/pubseq/pubseq-fetch-data.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+
+import argparse
+import json
+import os
+import requests
+import sys
+import time
+
+parser = argparse.ArgumentParser(description="""
+
+Fetch metadata (JSON) from PubSeq and optionally the FASTA files. IDs
+can be passed in on the command line or in a file.
+
+""")
+parser.add_argument('--fasta', action='store_true', help='Also fetch FASTA records')
+parser.add_argument('--out', type=str, help='Directory to write to',
+required=True)
+parser.add_argument('--ids', type=str, help='File with ids', required=False)
+parser.add_argument('id', nargs='*', help='id(s)')
+args = parser.parse_args()
+
+dir = args.out
+if not os.path.exists(dir):
+    raise Exception(f"Directory {dir} does not exist")
+
+ids = args.id
+if (len(ids)==0):
+    print(f"Reading {args.ids}")
+    with open(args.ids) as f:
+        ids = [ l.strip() for l in f.readlines() ]
+
+for id in ids:
+    print(id)
+    jsonfn = dir+"/"+id+".json"
+    if not os.path.exists(jsonfn):
+        count = 0
+        r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json")
+        while not r: # retry on an error response, up to ten times
+            count += 1
+            if count>10: raise Exception(f"Can not find record for {id}")
+            time.sleep(15)
+            r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json")
+        m_url = r.json()[0]['metadata']
+        mr = requests.get(m_url)
+        with open(dir+"/"+id+".json","w") as outf:
+            outf.write(mr.text)
+        if args.fasta:
+            fastafn = dir+"/"+id+".fa"
+            if os.path.exists(fastafn): continue
+            fa_url = r.json()[0]['fasta']
+            fr = requests.get(fa_url)
+            with open(fastafn,"w") as outf:
+                outf.write(fr.text)
+
diff --git a/workflows/pubseq/pubseq-fetch-ids b/workflows/pubseq/pubseq-fetch-ids
new file mode 100755
index 0000000..f5920ec
--- /dev/null
+++ b/workflows/pubseq/pubseq-fetch-ids
@@ -0,0 +1,67 @@
+#!/usr/bin/env ruby
+#
+# Use a SPARQL query to fetch all IDs in the PubSeq database
+#
+#   pubseq-fetch-ids > pubseq_ids.txt
+#
+# Note: requires Ruby 3.x. Older Ruby gives a syntax error
+
+require 'net/http'
+require 'json'
+require 'ostruct'
+require 'erb'
+require 'pp'
+
+MAX=5_000
+
+SPARQL_HEADER="
+prefix rdfs:
+prefix rdf:
+prefix dc:
+prefix schema:
+PREFIX pubseq:
+"
+
+# Build a SPARQL query, submit and return results. Apply transform
+# lambda when passed in
+def sparql query, transform = nil
+  api_url = "http://sparql.genenetwork.org/sparql/?default-graph-uri=&format=application%2Fsparql-results%2Bjson&timeout=0&debug=on&run=+Run+Query+&query=#{ERB::Util.url_encode(SPARQL_HEADER + query)}"
+
+  response = Net::HTTP.get_response(URI.parse(api_url))
+  data = JSON.parse(response.body,symbolize_names: true)
+  data => { head: { vars: }, results: { bindings: results} }
+  vars = vars.map { |v| v.to_sym }
+  results.map { |rec|
+    # return results after transforming to a Hash and applying the
+    # optional transform lambda. Note the transform can not only
+    # reduce results, or create an array, but also may transform into
+    # an OpenStruct.
+    res = {}
+    vars.each { |name| res[name] = rec[name][:value] }
+    if transform
+      transform.call(res)
+    else
+      res
+    end
+  }
+end
+
+start = 0
+num = MAX
+begin
+  query = "
+SELECT DISTINCT ?id
+FROM
+WHERE {
+
+  ?arvid ?id .
+
+} LIMIT #{num} OFFSET #{start}
+"
+  list = sparql(query, lambda { |rec| rec[:id] })
+  list.each do | l |
+    print(l,"\n")
+  end
+  $stderr.print("#{start}-#{start+list.size}:#{list.first}\n") # show progress
+  start += num
+end while list.size == MAX
diff --git a/workflows/tools/normalize-yamlfa.py b/workflows/tools/normalize-yamlfa.py
deleted file mode 100755
index 55a8848..0000000
--- a/workflows/tools/normalize-yamlfa.py
+++ /dev/null
@@ -1,97 +0,0 @@
-# --- Normalize data
-# normalize-yamlfa.py [--yaml] --in ~/tmp/pubseq/state.json file(s)
-#
-# Example:
-#
-# python3 ./workflows/tools/normalize-yamlfa.py -s ~/tmp/yamlfa/state.json --species ncbi_host_species.csv --specimen specimen.csv --validate
-
-import argparse
-import json
-import os
-import sys
-import types
-import normalize.mapping as mapping
-
-parser = argparse.ArgumentParser(description="""
-
-Normalize parameters in PubSeq JSON/YAML files. All entries in
-directory are parsed using the state.json file. It is possible
-to select a subset of IDs.
-
-This tool has two modes of operation. It can validate with the
---validate switch which stops at a warning and does no rewriting.
-This mode is typically used in troubleshooting.
-
-The other mode is --rewrite which rewrites the JSON files after
-making a backup (.bak) of the original. This mode updates files and
-won't stop - it is used for (automated) uploads.
-
-""")
-
-parser.add_argument('-s','--state', type=str, help='State file (JSON) as produced by transform2yamlfa', required=True)
-parser.add_argument('--species', type=str, help='Species mapping file')
-parser.add_argument('--specimen', type=str, help='Optional specimen mapping file')
-parser.add_argument('--validate', action='store_true', help='Validation mode - stops on warning')
-parser.add_argument('--rewrite', action='store_true', help='Rewrite mode - updates files')
-parser.add_argument('--yaml', action='store_true', help='Input YAML instead of JSON')
-parser.add_argument('id', nargs='*', help='optional id(s)')
-
-args = parser.parse_args()
-
-with open(args.state) as jsonf:
-    data = json.load(jsonf)
-
-dir = os.path.dirname(args.state)
-do_validate = args.validate
-do_rewrite = args.rewrite
-
-ids = args.id
-if not len(ids):
-    ids = list(data.keys())
-
-species = {}
-if args.species:
-    with open(args.species) as f:
-        for line in f:
-            name,uri = line.strip().split(',')
-            species[name] = uri
-else:
-    print("WARNING: no species mapping",file=sys.stderr)
-specimen = {}
-if args.specimen:
-    with open(args.specimen) as f:
-        for line in f:
-            name,uri = line.strip().split(',')
-            specimen[name] = uri
-else:
-    print("WARNING: no specimen mapping",file=sys.stderr)
-
-for id in ids:
-    if args.yaml:
-        raise Exception("YAML not yet supported")
-    fn = f"{dir}/{id}.json"
-    print(f"Reading {fn}",file=sys.stderr)
-    with open(fn) as f:
-        rec = types.SimpleNamespace(**json.load(f))
-        if do_validate:
-            print(rec)
-        rec.host,warning = mapping.host_species(rec.host,species)
-        if warning:
-            print("WARNING "+warning,file=sys.stderr)
-            rec.warnings.append(warning)
-        rec.sample,warning = mapping.specimen_source(rec.sample,specimen)
-        if warning:
-            print("WARNING "+warning,file=sys.stderr)
-            rec.warnings.append(warning)
-        print(rec)
-        if do_validate and warning:
-            print("bailing out in validation mode",file=sys.stderr)
-            sys.exit(2)
-        if do_rewrite:
-            if not os.path.exists(fn+".bak"): # make backup the first time
-                os.rename(fn,fn+".bak")
-            with open(fn, 'w') as outfile:
-                print(f"  Writing {fn}")
-                json.dump(rec.__dict__, outfile, indent=2)
-        else:
-            print(rec)
diff --git a/workflows/tools/normalize/README.md b/workflows/tools/normalize/README.md
deleted file mode 100644
index b780a68..0000000
--- a/workflows/tools/normalize/README.md
+++ /dev/null
@@ -1,14 +0,0 @@
-# Normalization steps
-
-This library contains generic logic to normalize (string) data and
-transforms strings to URIs. It should be applicable to data from
-any source (GenBank, ENA etc).
-
-Important: missing data should be missing or None! Do not fill
-in data by 'guessing'.
-
-When data is malformed a warning should be logged and added to the
-warning list. Functions should be small enough to return only 1
-warning!
-
-Pjotr Prins (c) 2021
diff --git a/workflows/tools/normalize/__init__.py b/workflows/tools/normalize/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/workflows/tools/normalize/mapping.py b/workflows/tools/normalize/mapping.py
deleted file mode 100644
index 3ed09c2..0000000
--- a/workflows/tools/normalize/mapping.py
+++ /dev/null
@@ -1,102 +0,0 @@
-# Normalization steps
-#
-# This library contains generic logic to normalize (string) data and
-# transforms strings to URIs. It should be applicable to data from
-# any source (GenBank, ENA etc).
-#
-# Important: missing data should be missing or None! Do not fill
-# in data by 'guessing'.
-#
-# When data is malformed a warning should be logged and added to the
-# warning list. Functions should be small enough to return only 1
-# warning!
-#
-# Pjotr Prins (c) 2021
-
-import re
-import types
-
-def host_species(host,mapping):
-    Homo_sapiens = "http://purl.obolibrary.org/obo/NCBITaxon_9606"
-
-    SPECIES_TERMS = { # since Python 3.7 dict is ordered! Note that re is allowed
-        "human": Homo_sapiens,
-        "sapiens": Homo_sapiens,
-        "Mustela lutreola": "http://purl.obolibrary.org/obo/NCBITaxon_9666",
-        "Manis javanica": "http://purl.obolibrary.org/obo/NCBITaxon_9974",
-        "Felis catus": "http://purl.obolibrary.org/obo/NCBITaxon_9685",
-        "Panthera tigris": "http://purl.obolibrary.org/obo/NCBITaxon_419130",
-        "Canis lupus": "http://purl.obolibrary.org/obo/NCBITaxon_9615",
-        # Mink:
-        "vison": "http://purl.obolibrary.org/obo/NCBITaxon_452646"
-    }
-
-    warning = None
-    host = types.SimpleNamespace(**host)
-    if not 'obolibrary' in host.host_species:
-        key = host.host_species
-        host.host_species = None
-        if key in mapping:
-            host.host_species = mapping[key]
-        else:
-            for term in SPECIES_TERMS:
-                p = re.compile(".*?"+term,re.IGNORECASE)
-                m = p.match(key)
-                if m: host.host_species = SPECIES_TERMS[term]
-        if not host.host_species:
-            warning = f"No URI mapping for host_species <{key}>"
-    if host.host_species == Unknown or host.host_species == None:
-        del(host.host_species)
-    return host.__dict__,warning
-
-Unknown = "Not found" # So as not to create a warning
-
-def specimen_source(sample,mapping):
-    Oronasopharynx = "http://purl.obolibrary.org/obo/NCIT_C155835"
-    Oropharyngeal = "http://purl.obolibrary.org/obo/NCIT_C155835"
-    Nasopharyngeal = "http://purl.obolibrary.org/obo/NCIT_C155831"
-    Bronchoalveolar_Lavage_Fluid = "http://purl.obolibrary.org/obo/NCIT_C13195"
-    Saliva = "http://purl.obolibrary.org/obo/NCIT_C13275"
-    Nasal_Swab = Nasopharyngeal # "http://purl.obolibrary.org/obo/NCIT_C132119"
-    Frozen_Food = "https://www.wikidata.org/wiki/Q751728"
-    Bronchoalveolar_Lavage = "http://purl.obolibrary.org/obo/NCIT_C13195",
-    Biospecimen = "http://purl.obolibrary.org/obo/NCIT_C70699"
-    SPECIMEN_TERMS = { # since Python 3.7 dict is ordered! Note that re is allowed
-        "Oronasopharynx": Oronasopharynx,
-        "orophar": Oropharyngeal,
-        "pharyngeal": Nasopharyngeal,
-        "\snares": Nasal_Swab,
-        "saliva": Saliva,
-        "swab": Nasal_Swab,
-        "broncho": Bronchoalveolar_Lavage,
-        "seafood": Frozen_Food,
-        "packaging": Frozen_Food,
-        "specimen": Biospecimen,
-        "patient": Biospecimen,
-        "uknown": Unknown,
-        "unknown": Unknown
-    }
-    warning = None
-    sample = types.SimpleNamespace(**sample)
-    try:
-        if sample.specimen_source:
-            keys = sample.specimen_source
-            sample.specimen_source = []
-            for key in keys:
-                if 'obolibrary' in key:
-                    sample.specimen_source.append(key)
-                    continue
-                if key in mapping:
-                    sample.specimen_source.append(mapping[key])
-                else:
-                    for term in SPECIMEN_TERMS:
-                        p = re.compile(".*?"+term,re.IGNORECASE)
-                        m = p.match(key)
-                        if m: sample.specimen_source = [SPECIMEN_TERMS[term]]
-            if len(sample.specimen_source)==0:
-                warning = f"No URI mapping for specimen_source <{key}>"
-        if sample.specimen_source == Unknown or sample.specimen_source == None:
-            del(sample.specimen_source)
-    except AttributeError:
-        pass
-    return sample.__dict__,warning
diff --git a/workflows/tools/pubseq-fetch-data.py b/workflows/tools/pubseq-fetch-data.py
deleted file mode 100755
index ef4edde..0000000
--- a/workflows/tools/pubseq-fetch-data.py
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-import json
-import os
-import requests
-import sys
-import time
-
-parser = argparse.ArgumentParser(description="""
-
-Fetch metadata (JSON) from PubSeq and optionally the FASTA files. IDs
-can be passed in on the command line or in a file.
-
-""")
-parser.add_argument('--fasta', action='store_true', help='Also fetch FASTA records')
-parser.add_argument('--out', type=str, help='Directory to write to',
-required=True)
-parser.add_argument('--ids', type=str, help='File with ids', required=False)
-parser.add_argument('id', nargs='*', help='id(s)')
-args = parser.parse_args()
-
-dir = args.out
-if not os.path.exists(dir):
-    raise Exception(f"Directory {dir} does not exist")
-
-ids = args.id
-if (len(ids)==0):
-    print(f"Reading {args.ids}")
-    with open(args.ids) as f:
-        ids = [ l.strip() for l in f.readlines() ]
-
-for id in ids:
-    print(id)
-    jsonfn = dir+"/"+id+".json"
-    if not os.path.exists(jsonfn):
-        count = 0
-        r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json")
-        while not r:
-            count += 1
-            if count>10: raise Exception(f"Can not find record for {id}")
-            time.sleep(15)
-            r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json")
-        m_url = r.json()[0]['metadata']
-        mr = requests.get(m_url)
-        with open(dir+"/"+id+".json","w") as outf:
-            outf.write(mr.text)
-        if args.fasta:
-            fastafn = dir+"/"+id+".fa"
-            if os.path.exists(fastafn): continue
-            fa_url = r.json()[0]['fasta']
-            fr = requests.get(fa_url)
-            with open(fastafn,"w") as outf:
-                outf.write(fr.text)
-
diff --git a/workflows/tools/pubseq-fetch-ids b/workflows/tools/pubseq-fetch-ids
deleted file mode 100755
index f5920ec..0000000
--- a/workflows/tools/pubseq-fetch-ids
+++ /dev/null
@@ -1,67 +0,0 @@
-#!/usr/bin/env ruby
-#
-# Use a SPARQL query to fetch all IDs in the PubSeq database
-#
-#   pubseq-fetch-ids > pubseq_ids.txt
-#
-# Note: requires Ruby 3.x. Older Ruby gives a syntax error
-
-require 'net/http'
-require 'json'
-require 'ostruct'
-require 'erb'
-require 'pp'
-
-MAX=5_000
-
-SPARQL_HEADER="
-prefix rdfs:
-prefix rdf:
-prefix dc:
-prefix schema:
-PREFIX pubseq:
-"
-
-# Build a SPARQL query, submit and return results. Apply transform
-# lambda when passed in
-def sparql query, transform = nil
-  api_url = "http://sparql.genenetwork.org/sparql/?default-graph-uri=&format=application%2Fsparql-results%2Bjson&timeout=0&debug=on&run=+Run+Query+&query=#{ERB::Util.url_encode(SPARQL_HEADER + query)}"
-
-  response = Net::HTTP.get_response(URI.parse(api_url))
-  data = JSON.parse(response.body,symbolize_names: true)
-  data => { head: { vars: }, results: { bindings: results} }
-  vars = vars.map { |v| v.to_sym }
-  results.map { |rec|
-    # return results after transforming to a Hash and applying the
-    # optional transform lambda. Note the transform can not only
-    # reduce results, or create an array, but also may transform into
-    # an OpenStruct.
-    res = {}
-    vars.each { |name| res[name] = rec[name][:value] }
-    if transform
-      transform.call(res)
-    else
-      res
-    end
-  }
-end
-
-start = 0
-num = MAX
-begin
-  query = "
-SELECT DISTINCT ?id
-FROM
-WHERE {
-
-  ?arvid ?id .
-
-} LIMIT #{num} OFFSET #{start}
-"
-  list = sparql(query, lambda { |rec| rec[:id] })
-  list.each do | l |
-    print(l,"\n")
-  end
-  $stderr.print("#{start}-#{start+list.size}:#{list.first}\n") # show progress
-  start += num
-end while list.size == MAX
--
cgit v1.2.3
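
Editor's note - a minimal usage sketch for the normalize.mapping API that
this commit moves into workflows/pubseq. The record dicts and the species
table below are hypothetical; the name,uri pairs mirror the two-column CSV
that normalize-yamlfa.py loads. Run from workflows/pubseq so the normalize
package is importable:

    import normalize.mapping as mapping

    # species mapping as normalize-yamlfa.py would load it from
    # ncbi_host_species.csv (one name,uri pair per line)
    species = {"Homo sapiens": "http://purl.obolibrary.org/obo/NCBITaxon_9606"}

    # "human" is not a key in the CSV mapping, so host_species falls back
    # to its built-in SPECIES_TERMS regex table
    host, warning = mapping.host_species({"host_species": "human"}, species)
    print(host)     # {'host_species': 'http://purl.obolibrary.org/obo/NCBITaxon_9606'}
    print(warning)  # None

    # a host that maps nowhere yields a warning and the field is dropped
    host, warning = mapping.host_species({"host_species": "dromedary"}, species)
    print(host)     # {}
    print(warning)  # No URI mapping for host_species <dromedary>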
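
Similarly, the REST round trip that pubseq-fetch-data.py performs can be
reduced to its core when debugging failed fetches. The sample id is a
placeholder; the two-step fetch (sample record first, then the metadata and
FASTA URLs it contains) follows the script above:

    import requests

    id = "MT326090.1"  # hypothetical PubSeq sample id
    r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json")
    rec = r.json()[0]  # the API returns a list of matching records
    metadata = requests.get(rec['metadata']).text  # JSON metadata document
    fasta = requests.get(rec['fasta']).text        # FASTA sequence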