aboutsummaryrefslogtreecommitdiff
path: root/workflows/pull-data
diff options
context:
space:
mode:
authorPjotr Prins2021-01-04 08:58:38 +0000
committerPjotr Prins2021-01-04 08:58:38 +0000
commit1c4e055b8a9dc53b7fdbdf12d4b0a7e877fbc2ef (patch)
tree34cc42ef12b81c05be8a57ca2a973b97e52f8461 /workflows/pull-data
parentba4161b1660c3a67090dd3715e9862906fb1cc5f (diff)
downloadbh20-seq-resource-1c4e055b8a9dc53b7fdbdf12d4b0a7e877fbc2ef.tar.gz
bh20-seq-resource-1c4e055b8a9dc53b7fdbdf12d4b0a7e877fbc2ef.tar.lz
bh20-seq-resource-1c4e055b8a9dc53b7fdbdf12d4b0a7e877fbc2ef.zip
Started on normalization
Diffstat (limited to 'workflows/pull-data')
-rw-r--r--workflows/pull-data/genbank/.gitignore1
-rw-r--r--workflows/pull-data/genbank/README.md19
-rw-r--r--workflows/pull-data/genbank/genbank.py4
-rwxr-xr-xworkflows/pull-data/genbank/sparql-fetch-ids67
-rwxr-xr-xworkflows/pull-data/genbank/transform-genbank-xml2yamlfa.py16
5 files changed, 31 insertions, 76 deletions
diff --git a/workflows/pull-data/genbank/.gitignore b/workflows/pull-data/genbank/.gitignore
index 69b8a57..8bfdb5b 100644
--- a/workflows/pull-data/genbank/.gitignore
+++ b/workflows/pull-data/genbank/.gitignore
@@ -1,3 +1,4 @@
fasta_and_yaml/
*.tsv
*.acc
+*.txt
diff --git a/workflows/pull-data/genbank/README.md b/workflows/pull-data/genbank/README.md
index b5bac84..d7cc15f 100644
--- a/workflows/pull-data/genbank/README.md
+++ b/workflows/pull-data/genbank/README.md
@@ -1,14 +1,25 @@
-# pipeline
+# GenBank
+
+This directory contains the tools to pull and transform
+GenBank data.
+
+# Workflows
+
+## Prepare new GenBank data for upload
+
+The following workflow sends GenBank data into PubSeq
```sh
# --- get list of IDs already in PubSeq
-./sparql-fetch-ids > pubseq_ids.txt
+../../tools/sparql-fetch-ids > pubseq_ids.txt
# --- get list of missing genbank IDs
./genbank-fetch-ids.py --skip pubseq_ids.txt > genbank_ids.txt
# --- fetch XML
python3 update-from-genbank.py --ids genbank_ids.txt --out ~/tmp/genbank
-# --- Transform to YAML and FASTA
-python3 transform-genbank-xml2yamlfa --out ~/tmp/pubseq file(s)
+# --- Transform to YAML/JSON and FASTA
+python3 transform-genbank-xml2yamlfa.py --out ~/tmp/pubseq file(s)
+# --- Normalize data
+../../tools/normalize-yamlfa.py --in ~/tmp/pubseq/state.json file(s)
```
# TODO
diff --git a/workflows/pull-data/genbank/genbank.py b/workflows/pull-data/genbank/genbank.py
index 26cb5e7..85d615c 100644
--- a/workflows/pull-data/genbank/genbank.py
+++ b/workflows/pull-data/genbank/genbank.py
@@ -1,4 +1,6 @@
# Genbank XML parser
+#
+# Pjotr Prins (c) 2021
from collections import namedtuple
import dateutil
@@ -59,7 +61,7 @@ Example of an output JSON:
def get_metadata(id, gbseq):
"""This is a minimal data parser from genbank XML records. Inference
on, for example geo location, is not allowed in this function and
- happens downstream.
+ happens downstream (in normalize).
That is to keep the parsing simple.
diff --git a/workflows/pull-data/genbank/sparql-fetch-ids b/workflows/pull-data/genbank/sparql-fetch-ids
deleted file mode 100755
index 19b2d82..0000000
--- a/workflows/pull-data/genbank/sparql-fetch-ids
+++ /dev/null
@@ -1,67 +0,0 @@
-#!/usr/bin/env ruby
-#
-# Use a SPARQL query to fetch all IDs in the PubSeq database
-#
-# sparql-fetch-ids > pubseq_ids.txt
-#
-# Note: requires Ruby 3.x. Older Ruby gives a syntax error
-
-require 'net/http'
-require 'json'
-require 'ostruct'
-require 'erb'
-require 'pp'
-
-MAX=5_000
-
-SPARQL_HEADER="
-prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
-prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
-prefix dc: <http://purl.org/dc/terms/>
-prefix schema: <https://schema.org/>
-PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
-"
-
-# Build a SPARQL query, submit and return results. Apply transform
-# lambda when passed in
-def sparql query, transform = nil
- api_url = "http://sparql.genenetwork.org/sparql/?default-graph-uri=&format=application%2Fsparql-results%2Bjson&timeout=0&debug=on&run=+Run+Query+&query=#{ERB::Util.url_encode(SPARQL_HEADER + query)}"
-
- response = Net::HTTP.get_response(URI.parse(api_url))
- data = JSON.parse(response.body,symbolize_names: true)
- data => { head: { vars: }, results: { bindings: results} }
- vars = vars.map { |v| v.to_sym }
- results.map { |rec|
- # return results after transforming to a Hash and applying the
- # optional transform lambda. Note the transform can not only
- # reduce results, or create an array, but also may transform into
- # an OpenStruct.
- res = {}
- vars.each { |name| res[name] = rec[name][:value] }
- if transform
- transform.call(res)
- else
- res
- end
- }
-end
-
-start = 0
-num = MAX
-begin
- query = "
-SELECT DISTINCT ?id
-FROM <http://covid-19.genenetwork.org/graph/metadata.ttl>
-WHERE {
-
- ?arvid <http://biohackathon.org/bh20-seq-schema/original_fasta_label> ?id .
-
-} LIMIT #{num} OFFSET #{start}
-"
- list = sparql(query, lambda { |rec| rec[:id] })
- list.each do | l |
- print(l,"\n")
- end
- $stderr.print("#{start}-#{start+list.size}:#{list.first}\n") # show progress
- start += num
-end while list.size == MAX
diff --git a/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py b/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py
index ebdf17e..9414864 100755
--- a/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py
+++ b/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py
@@ -1,18 +1,17 @@
#!/usr/bin/env python3
#
-# Create a single YAML/FASTA from genbank XML
+# Create a single YAML/FASTA for each genbank entry in GenBank XML file
#
# transform-genbank-xml2yamlfa --out ~/tmp/pubseq file(s)
#
# Also writes a validation file in the outdir named state.json
-#
-# Where --in can be a file or a directory
# ----------------------------------------------------------------------
# See also directory .guix-run and README.md
import argparse
import gzip
+import json
import os
import sys
import types
@@ -47,6 +46,12 @@ for xmlfn in args.files:
try:
valid,meta = genbank.get_metadata(id,gb)
if valid:
+ # --- write JSON
+ jsonfn = basename + ".json"
+ with open(jsonfn, 'w') as outfile:
+ print(f" writing {jsonfn}")
+ json.dump(meta, outfile, indent=4)
+ # --- write FASTA
fa = basename+".fa"
seq = genbank.get_sequence(id,gb)
print(f" writing {fa}")
@@ -66,4 +71,7 @@ for xmlfn in args.files:
state['warnings'] = meta['warnings']
states[id] = state
-print(states)
+statefn = dir + '/state.json'
+with open(statefn, 'w') as outfile:
+ print(f" Writing {statefn}")
+ json.dump(states, outfile, indent=4)