From 1c4e055b8a9dc53b7fdbdf12d4b0a7e877fbc2ef Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Mon, 4 Jan 2021 08:58:38 +0000 Subject: Started on normalization --- workflows/pull-data/genbank/.gitignore | 1 + workflows/pull-data/genbank/README.md | 19 ++++-- workflows/pull-data/genbank/genbank.py | 4 +- workflows/pull-data/genbank/sparql-fetch-ids | 67 ---------------------- .../genbank/transform-genbank-xml2yamlfa.py | 16 ++++-- 5 files changed, 31 insertions(+), 76 deletions(-) delete mode 100755 workflows/pull-data/genbank/sparql-fetch-ids (limited to 'workflows/pull-data') diff --git a/workflows/pull-data/genbank/.gitignore b/workflows/pull-data/genbank/.gitignore index 69b8a57..8bfdb5b 100644 --- a/workflows/pull-data/genbank/.gitignore +++ b/workflows/pull-data/genbank/.gitignore @@ -1,3 +1,4 @@ fasta_and_yaml/ *.tsv *.acc +*.txt diff --git a/workflows/pull-data/genbank/README.md b/workflows/pull-data/genbank/README.md index b5bac84..d7cc15f 100644 --- a/workflows/pull-data/genbank/README.md +++ b/workflows/pull-data/genbank/README.md @@ -1,14 +1,25 @@ -# pipeline +# GenBank + +This directory contains the tools to pull and transform +GenBank data. + +# Workflows + +## Prepare new GenBank data for upload + +The following workflow sends GenBank data into PubSeq ```sh # --- get list of IDs already in PubSeq -./sparql-fetch-ids > pubseq_ids.txt +../../tools/sparql-fetch-ids > pubseq_ids.txt # --- get list of missing genbank IDs ./genbank-fetch-ids.py --skip pubseq_ids.txt > genbank_ids.txt # --- fetch XML python3 update-from-genbank.py --ids genbank_ids.txt --out ~/tmp/genbank -# --- Transform to YAML and FASTA -python3 transform-genbank-xml2yamlfa --out ~/tmp/pubseq file(s) +# --- Transform to YAML/JSON and FASTA +python3 transform-genbank-xml2yamlfa.py --out ~/tmp/pubseq file(s) +# --- Normalize data +../../tools/normalize-yamlfa.py --in ~/tmp/pubseq/state.json file(s) ``` # TODO diff --git a/workflows/pull-data/genbank/genbank.py b/workflows/pull-data/genbank/genbank.py index 26cb5e7..85d615c 100644 --- a/workflows/pull-data/genbank/genbank.py +++ b/workflows/pull-data/genbank/genbank.py @@ -1,4 +1,6 @@ # Genbank XML parser +# +# Pjotr Prins (c) 2021 from collections import namedtuple import dateutil @@ -59,7 +61,7 @@ Example of an output JSON: def get_metadata(id, gbseq): """This is a minimal data parser from genbank XML records. Inference on, for example geo location, is not allowed in this function and - happens downstream. + happens downstream (in normalize). That is to keep the parsing simple. diff --git a/workflows/pull-data/genbank/sparql-fetch-ids b/workflows/pull-data/genbank/sparql-fetch-ids deleted file mode 100755 index 19b2d82..0000000 --- a/workflows/pull-data/genbank/sparql-fetch-ids +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/env ruby -# -# Use a SPARQL query to fetch all IDs in the PubSeq database -# -# sparql-fetch-ids > pubseq_ids.txt -# -# Note: requires Ruby 3.x. Older Ruby gives a syntax error - -require 'net/http' -require 'json' -require 'ostruct' -require 'erb' -require 'pp' - -MAX=5_000 - -SPARQL_HEADER=" -prefix rdfs: -prefix rdf: -prefix dc: -prefix schema: -PREFIX pubseq: -" - -# Build a SPARQL query, submit and return results. Apply transform -# lambda when passed in -def sparql query, transform = nil - api_url = "http://sparql.genenetwork.org/sparql/?default-graph-uri=&format=application%2Fsparql-results%2Bjson&timeout=0&debug=on&run=+Run+Query+&query=#{ERB::Util.url_encode(SPARQL_HEADER + query)}" - - response = Net::HTTP.get_response(URI.parse(api_url)) - data = JSON.parse(response.body,symbolize_names: true) - data => { head: { vars: }, results: { bindings: results} } - vars = vars.map { |v| v.to_sym } - results.map { |rec| - # return results after transforming to a Hash and applying the - # optional transform lambda. Note the transform can not only - # reduce results, or create an array, but also may transform into - # an OpenStruct. - res = {} - vars.each { |name| res[name] = rec[name][:value] } - if transform - transform.call(res) - else - res - end - } -end - -start = 0 -num = MAX -begin - query = " -SELECT DISTINCT ?id -FROM -WHERE { - - ?arvid ?id . - -} LIMIT #{num} OFFSET #{start} -" - list = sparql(query, lambda { |rec| rec[:id] }) - list.each do | l | - print(l,"\n") - end - $stderr.print("#{start}-#{start+list.size}:#{list.first}\n") # show progress - start += num -end while list.size == MAX diff --git a/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py b/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py index ebdf17e..9414864 100755 --- a/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py +++ b/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py @@ -1,18 +1,17 @@ #!/usr/bin/env python3 # -# Create a single YAML/FASTA from genbank XML +# Create a single YAML/FASTA for each genbank entry in GenBank XML file # # transform-genbank-xml2yamlfa --out ~/tmp/pubseq file(s) # # Also writes a validation file in the outdir named state.json -# -# Where --in can be a file or a directory # ---------------------------------------------------------------------- # See also directory .guix-run and README.md import argparse import gzip +import json import os import sys import types @@ -47,6 +46,12 @@ for xmlfn in args.files: try: valid,meta = genbank.get_metadata(id,gb) if valid: + # --- write JSON + jsonfn = basename + ".json" + with open(jsonfn, 'w') as outfile: + print(f" writing {jsonfn}") + json.dump(meta, outfile, indent=4) + # --- write FASTA fa = basename+".fa" seq = genbank.get_sequence(id,gb) print(f" writing {fa}") @@ -66,4 +71,7 @@ for xmlfn in args.files: state['warnings'] = meta['warnings'] states[id] = state -print(states) +statefn = dir + '/state.json' +with open(statefn, 'w') as outfile: + print(f" Writing {statefn}") + json.dump(states, outfile, indent=4) -- cgit v1.2.3