From 1c4e055b8a9dc53b7fdbdf12d4b0a7e877fbc2ef Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Mon, 4 Jan 2021 08:58:38 +0000 Subject: Started on normalization --- workflows/pull-data/genbank/.gitignore | 1 + workflows/pull-data/genbank/README.md | 19 ++++- workflows/pull-data/genbank/genbank.py | 4 +- workflows/pull-data/genbank/sparql-fetch-ids | 67 --------------- .../genbank/transform-genbank-xml2yamlfa.py | 16 +++- workflows/tools | 1 - workflows/tools/normalize-yamlfa.py | 97 ++++++++++++++++++++++ workflows/tools/normalize/README.md | 14 ++++ workflows/tools/normalize/__init__.py | 0 workflows/tools/normalize/mapping.py | 43 ++++++++++ workflows/tools/sparql-fetch-ids | 67 +++++++++++++++ 11 files changed, 252 insertions(+), 77 deletions(-) delete mode 100755 workflows/pull-data/genbank/sparql-fetch-ids delete mode 160000 workflows/tools create mode 100755 workflows/tools/normalize-yamlfa.py create mode 100644 workflows/tools/normalize/README.md create mode 100644 workflows/tools/normalize/__init__.py create mode 100644 workflows/tools/normalize/mapping.py create mode 100755 workflows/tools/sparql-fetch-ids (limited to 'workflows') diff --git a/workflows/pull-data/genbank/.gitignore b/workflows/pull-data/genbank/.gitignore index 69b8a57..8bfdb5b 100644 --- a/workflows/pull-data/genbank/.gitignore +++ b/workflows/pull-data/genbank/.gitignore @@ -1,3 +1,4 @@ fasta_and_yaml/ *.tsv *.acc +*.txt diff --git a/workflows/pull-data/genbank/README.md b/workflows/pull-data/genbank/README.md index b5bac84..d7cc15f 100644 --- a/workflows/pull-data/genbank/README.md +++ b/workflows/pull-data/genbank/README.md @@ -1,14 +1,25 @@ -# pipeline +# GenBank + +This directory contains the tools to pull and transform +GenBank data. + +# Workflows + +## Prepare new GenBank data for upload + +The following workflow sends GenBank data into PubSeq ```sh # --- get list of IDs already in PubSeq -./sparql-fetch-ids > pubseq_ids.txt +../../tools/sparql-fetch-ids > pubseq_ids.txt # --- get list of missing genbank IDs ./genbank-fetch-ids.py --skip pubseq_ids.txt > genbank_ids.txt # --- fetch XML python3 update-from-genbank.py --ids genbank_ids.txt --out ~/tmp/genbank -# --- Transform to YAML and FASTA -python3 transform-genbank-xml2yamlfa --out ~/tmp/pubseq file(s) +# --- Transform to YAML/JSON and FASTA +python3 transform-genbank-xml2yamlfa.py --out ~/tmp/pubseq file(s) +# --- Normalize data +../../tools/normalize-yamlfa.py --in ~/tmp/pubseq/state.json file(s) ``` # TODO diff --git a/workflows/pull-data/genbank/genbank.py b/workflows/pull-data/genbank/genbank.py index 26cb5e7..85d615c 100644 --- a/workflows/pull-data/genbank/genbank.py +++ b/workflows/pull-data/genbank/genbank.py @@ -1,4 +1,6 @@ # Genbank XML parser +# +# Pjotr Prins (c) 2021 from collections import namedtuple import dateutil @@ -59,7 +61,7 @@ Example of an output JSON: def get_metadata(id, gbseq): """This is a minimal data parser from genbank XML records. Inference on, for example geo location, is not allowed in this function and - happens downstream. + happens downstream (in normalize). That is to keep the parsing simple. diff --git a/workflows/pull-data/genbank/sparql-fetch-ids b/workflows/pull-data/genbank/sparql-fetch-ids deleted file mode 100755 index 19b2d82..0000000 --- a/workflows/pull-data/genbank/sparql-fetch-ids +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/env ruby -# -# Use a SPARQL query to fetch all IDs in the PubSeq database -# -# sparql-fetch-ids > pubseq_ids.txt -# -# Note: requires Ruby 3.x. Older Ruby gives a syntax error - -require 'net/http' -require 'json' -require 'ostruct' -require 'erb' -require 'pp' - -MAX=5_000 - -SPARQL_HEADER=" -prefix rdfs: -prefix rdf: -prefix dc: -prefix schema: -PREFIX pubseq: -" - -# Build a SPARQL query, submit and return results. Apply transform -# lambda when passed in -def sparql query, transform = nil - api_url = "http://sparql.genenetwork.org/sparql/?default-graph-uri=&format=application%2Fsparql-results%2Bjson&timeout=0&debug=on&run=+Run+Query+&query=#{ERB::Util.url_encode(SPARQL_HEADER + query)}" - - response = Net::HTTP.get_response(URI.parse(api_url)) - data = JSON.parse(response.body,symbolize_names: true) - data => { head: { vars: }, results: { bindings: results} } - vars = vars.map { |v| v.to_sym } - results.map { |rec| - # return results after transforming to a Hash and applying the - # optional transform lambda. Note the transform can not only - # reduce results, or create an array, but also may transform into - # an OpenStruct. - res = {} - vars.each { |name| res[name] = rec[name][:value] } - if transform - transform.call(res) - else - res - end - } -end - -start = 0 -num = MAX -begin - query = " -SELECT DISTINCT ?id -FROM -WHERE { - - ?arvid ?id . - -} LIMIT #{num} OFFSET #{start} -" - list = sparql(query, lambda { |rec| rec[:id] }) - list.each do | l | - print(l,"\n") - end - $stderr.print("#{start}-#{start+list.size}:#{list.first}\n") # show progress - start += num -end while list.size == MAX diff --git a/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py b/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py index ebdf17e..9414864 100755 --- a/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py +++ b/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py @@ -1,18 +1,17 @@ #!/usr/bin/env python3 # -# Create a single YAML/FASTA from genbank XML +# Create a single YAML/FASTA for each genbank entry in GenBank XML file # # transform-genbank-xml2yamlfa --out ~/tmp/pubseq file(s) # # Also writes a validation file in the outdir named state.json -# -# Where --in can be a file or a directory # ---------------------------------------------------------------------- # See also directory .guix-run and README.md import argparse import gzip +import json import os import sys import types @@ -47,6 +46,12 @@ for xmlfn in args.files: try: valid,meta = genbank.get_metadata(id,gb) if valid: + # --- write JSON + jsonfn = basename + ".json" + with open(jsonfn, 'w') as outfile: + print(f" writing {jsonfn}") + json.dump(meta, outfile, indent=4) + # --- write FASTA fa = basename+".fa" seq = genbank.get_sequence(id,gb) print(f" writing {fa}") @@ -66,4 +71,7 @@ for xmlfn in args.files: state['warnings'] = meta['warnings'] states[id] = state -print(states) +statefn = dir + '/state.json' +with open(statefn, 'w') as outfile: + print(f" Writing {statefn}") + json.dump(states, outfile, indent=4) diff --git a/workflows/tools b/workflows/tools deleted file mode 160000 index c67c011..0000000 --- a/workflows/tools +++ /dev/null @@ -1 +0,0 @@ -Subproject commit c67c011765bea798a24485cbe0a1c6c592436521 diff --git a/workflows/tools/normalize-yamlfa.py b/workflows/tools/normalize-yamlfa.py new file mode 100755 index 0000000..e3f92c0 --- /dev/null +++ b/workflows/tools/normalize-yamlfa.py @@ -0,0 +1,97 @@ +# --- Normalize data +# normalize-yamlfa.py [--yaml] --in ~/tmp/pubseq/state.json file(s) +# +# Example: +# +# python3 ./workflows/tools/normalize-yamlfa.py -s ~/tmp/yamlfa/state.json MW241349 --species ./scripts/dict_ontology_standardization/ncbi_host_species.csv + +import argparse +import json +import os +import sys +import types +import normalize.mapping as mapping + +parser = argparse.ArgumentParser(description=""" + +Normalize parameters in PubSeq JSON/YAML files. All entries in +directory are parsed using the state.json file. It is possible +to select a subset of IDs. + +This tool has two modes of operation. It can validate with the +`--validate` switch which stops at a warning and does no rewriting. +This mode is typically used in troubleshooting. + +The other mode is `--rewrite` which rewrites the JSON files after +making a backup (.bak) of the original. This mode updates files and +won't stop - it is used for (automated) uploads. + +""") + +parser.add_argument('-s','--state', type=str, help='State file (JSON) as produced by transform2yamlfa', required=True) +parser.add_argument('--species', type=str, help='Species mapping file') +parser.add_argument('--specimen', type=str, help='Specimen mapping file') +parser.add_argument('--validate', action='store_true', help='Validation mode - stops on warning') +parser.add_argument('--rewrite', action='store_true', help='Rewrite mode - updates files') +parser.add_argument('--yaml', action='store_true', help='Input YAML instead of JSON') +parser.add_argument('id', nargs='*', help='optional id(s)') + +args = parser.parse_args() + +with open(args.state) as jsonf: + data = json.load(jsonf) + +dir = os.path.dirname(args.state) +do_validate = args.validate +do_rewrite = args.rewrite + +ids = args.id +if not len(ids): + ids = list(data.keys()) + +species = {} +if args.species: + with open(args.species) as f: + for line in f: + name,uri = line.strip().split(',') + species[name] = uri +else: + print("WARNING: no species mapping",file=sys.stderr) +specimen = {} +if args.specimen: + with open(args.specimen) as f: + for line in f: + name,uri = line.strip().split(',') + specimen[name] = uri +else: + print("WARNING: no specimen mapping",file=sys.stderr) + +for id in ids: + if args.yaml: + raise Exception("YAML not yet supported") + fn = f"{dir}/{id}.json" + print(f"Reading {fn}",file=sys.stderr) + with open(fn) as f: + rec = types.SimpleNamespace(**json.load(f)) + if do_validate: + print(rec) + rec.host,warning = mapping.host_species(rec.host,species) + if warning: + print("WARNING "+warning,file=sys.stderr) + rec.warnings.append(warning) + rec.sample,warning = mapping.specimen_source(rec.sample,specimen) + if warning: + print("WARNING "+warning,file=sys.stderr) + rec.warnings.append(warning) + print(rec) + if do_validate and warning: + print("bailing out in validation mode",file=sys.stderr) + sys.exit(2) + if do_rewrite: + if not os.path.exists(fn+".bak"): # make backup the first time + os.rename(fn,fn+".bak") + with open(fn, 'w') as outfile: + print(f" Writing {fn}") + json.dump(rec.__dict__, outfile, indent=4) + else: + print(rec) diff --git a/workflows/tools/normalize/README.md b/workflows/tools/normalize/README.md new file mode 100644 index 0000000..b780a68 --- /dev/null +++ b/workflows/tools/normalize/README.md @@ -0,0 +1,14 @@ +# Normalization steps + +This library contains generic logic to normalize (string) data and +transforms strings to URIs. It should be applicable to data from +any source (GenBank, ENA etc). + +Important: missing data should be missing or None! Do not fill +in data by 'guessing'. + +When data is malformed a warning should be logged and added to the +warning list. Functions should be small enough to return only 1 +warning! + +Pjotr Prins (c) 2021 diff --git a/workflows/tools/normalize/__init__.py b/workflows/tools/normalize/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/workflows/tools/normalize/mapping.py b/workflows/tools/normalize/mapping.py new file mode 100644 index 0000000..1d52b03 --- /dev/null +++ b/workflows/tools/normalize/mapping.py @@ -0,0 +1,43 @@ +# Normalization steps +# +# This library contains generic logic to normalize (string) data and +# transforms strings to URIs. It should be applicable to data from +# any source (GenBank, ENA etc). +# +# Important: missing data should be missing or None! Do not fill +# in data by 'guessing'. +# +# When data is malformed a warning should be logged and added to the +# warning list. Functions should be small enough to return only 1 +# warning! +# +# Pjotr Prins (c) 2021 + +import types + +def host_species(host,mapping): + warning = None + host = types.SimpleNamespace(**host) + if not 'obolibrary' in host.host_species: + key = host.host_species + if key in mapping: + host.host_species = mapping[key] + else: + warning = f"No URI mapping for host_species <{key}>" + return host.__dict__,warning + +def specimen_source(sample,mapping): + warning = None + sample = types.SimpleNamespace(**sample) + try: + if sample.specimen_source and not 'obolibrary' in sample.specimen_source: + key = sample.specimen_source + if key in mapping: + sample.specimen_source = mapping[key] + else: + sample.specimen_source = None + warning = f"No URI mapping for specimen_source <{key}>" + except AttributeError: + pass + if not sample.specimen_source: del(sample.specimen_source) + return sample.__dict__,warning diff --git a/workflows/tools/sparql-fetch-ids b/workflows/tools/sparql-fetch-ids new file mode 100755 index 0000000..19b2d82 --- /dev/null +++ b/workflows/tools/sparql-fetch-ids @@ -0,0 +1,67 @@ +#!/usr/bin/env ruby +# +# Use a SPARQL query to fetch all IDs in the PubSeq database +# +# sparql-fetch-ids > pubseq_ids.txt +# +# Note: requires Ruby 3.x. Older Ruby gives a syntax error + +require 'net/http' +require 'json' +require 'ostruct' +require 'erb' +require 'pp' + +MAX=5_000 + +SPARQL_HEADER=" +prefix rdfs: +prefix rdf: +prefix dc: +prefix schema: +PREFIX pubseq: +" + +# Build a SPARQL query, submit and return results. Apply transform +# lambda when passed in +def sparql query, transform = nil + api_url = "http://sparql.genenetwork.org/sparql/?default-graph-uri=&format=application%2Fsparql-results%2Bjson&timeout=0&debug=on&run=+Run+Query+&query=#{ERB::Util.url_encode(SPARQL_HEADER + query)}" + + response = Net::HTTP.get_response(URI.parse(api_url)) + data = JSON.parse(response.body,symbolize_names: true) + data => { head: { vars: }, results: { bindings: results} } + vars = vars.map { |v| v.to_sym } + results.map { |rec| + # return results after transforming to a Hash and applying the + # optional transform lambda. Note the transform can not only + # reduce results, or create an array, but also may transform into + # an OpenStruct. + res = {} + vars.each { |name| res[name] = rec[name][:value] } + if transform + transform.call(res) + else + res + end + } +end + +start = 0 +num = MAX +begin + query = " +SELECT DISTINCT ?id +FROM +WHERE { + + ?arvid ?id . + +} LIMIT #{num} OFFSET #{start} +" + list = sparql(query, lambda { |rec| rec[:id] }) + list.each do | l | + print(l,"\n") + end + $stderr.print("#{start}-#{start+list.size}:#{list.first}\n") # show progress + start += num +end while list.size == MAX -- cgit v1.2.3