author    Peter Amstutz  2020-11-11 14:53:04 -0500
committer Peter Amstutz  2020-11-11 17:04:11 -0500
commit    85c50a6ae14dab51eb4287404955e0508e2df1b9 (patch)
tree      f7c571f4f025e1430d786667f02c6bec7d06d7aa /workflows/pangenome-generate
parent    391e73f5d71baceb35db973dba335dd362a73cbd (diff)
Make collect-seqs skip bad inputs.
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz@curii.com>
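
The fix follows a standard error-isolation pattern: each collection is
processed inside its own try/except, so a malformed input is logged with
its traceback and skipped instead of aborting the whole run. A minimal
sketch of the pattern, assuming a hypothetical process_collection() that
stands in for the real per-item work (the sample items are likewise
illustrative, not taken from the script):

import logging

def process_collection(item):
    # Stand-in for the real work: read metadata.yaml, validate it against
    # the schema, and relabel sequence.fasta. Raises on a bad input.
    if "portable_data_hash" not in item:
        raise ValueError("collection has no portable_data_hash")

items = [
    {"uuid": "item-1", "portable_data_hash": "abc123+59"},
    {"uuid": "item-2"},  # deliberately bad input
]

for item in items:
    try:
        process_collection(item)
    except Exception:
        # logging.exception records the message plus the full traceback;
        # the loop then continues with the next item.
        logging.exception("Error processing collection %s" % item["uuid"])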
Diffstat (limited to 'workflows/pangenome-generate')
-rw-r--r--  workflows/pangenome-generate/collect-seqs.py  55
1 file changed, 30 insertions(+), 25 deletions(-)
diff --git a/workflows/pangenome-generate/collect-seqs.py b/workflows/pangenome-generate/collect-seqs.py
index 225a61f..cb5bc33 100644
--- a/workflows/pangenome-generate/collect-seqs.py
+++ b/workflows/pangenome-generate/collect-seqs.py
@@ -2,6 +2,7 @@ import sys
import arvados
import json
import shutil
+import logging
import arvados.collection
import ruamel.yaml
import schema_salad.schema
@@ -37,34 +38,38 @@ if len(sys.argv) > 3:
for item in validated:
pdh = item["portable_data_hash"]
uuid = item["uuid"]
- with arvados.collection.CollectionReader(pdh, api_client=api, keep_client=keepclient) as col:
- with col.open("sequence.fasta", "rt") as fa:
- subject = "http://covid19.genenetwork.org/resource/%s" % uuid
- label = fa.readline().strip()
- merged_metadata.write("<%s> <http://biohackathon.org/bh20-seq-schema/original_fasta_label> \"%s\" .\n" % (subject, label[1:].replace('"', '\\"')))
- merged_metadata.write("<%s> <http://biohackathon.org/bh20-seq-schema/collection_pdh> \"%s\" .\n" % (subject, pdh))
- merged_metadata.write("<%s> <http://biohackathon.org/bh20-seq-schema/collection_version> \"%s\" .\n" % (subject, item["version"]))
- skip = (subject in blacklist or label[1:] in blacklist)
- if skip:
- merged_metadata.write("<%s> <http://biohackathon.org/bh20-seq-schema/excluded_from_graph> \"true\"^^<http://www.w3.org/2001/XMLSchema#boolean> .\n" % subject)
- if not skip:
- relabeled_fasta.write(">"+subject+"\n")
- data = fa.read(8096)
- while data:
+ try:
+ subject = "http://covid19.genenetwork.org/resource/%s" % uuid
+ with arvados.collection.CollectionReader(pdh, api_client=api, keep_client=keepclient) as col:
+ with col.open("metadata.yaml", "rt") as md:
+ metadata_content = ruamel.yaml.round_trip_load(md)
+ metadata_content["id"] = subject
+ add_lc_filename(metadata_content, metadata_content["id"])
+ doc, metadata = schema_salad.schema.load_and_validate(document_loader, avsc_names, metadata_content, False, False)
+ g = schema_salad.jsonld_context.makerdf(subject, doc, document_loader.ctx)
+
+ with col.open("sequence.fasta", "rt") as fa:
+ label = fa.readline().strip()
+ merged_metadata.write("<%s> <http://biohackathon.org/bh20-seq-schema/original_fasta_label> \"%s\" .\n" % (subject, label[1:].replace('"', '\\"')))
+ merged_metadata.write("<%s> <http://biohackathon.org/bh20-seq-schema/collection_pdh> \"%s\" .\n" % (subject, pdh))
+ merged_metadata.write("<%s> <http://biohackathon.org/bh20-seq-schema/collection_version> \"%s\" .\n" % (subject, item["version"]))
+ skip = (subject in blacklist or label[1:] in blacklist)
+ if skip:
+ merged_metadata.write("<%s> <http://biohackathon.org/bh20-seq-schema/excluded_from_graph> \"true\"^^<http://www.w3.org/2001/XMLSchema#boolean> .\n" % subject)
if not skip:
- relabeled_fasta.write(data)
- endswithnewline = data.endswith("\n")
+ relabeled_fasta.write(">"+subject+"\n")
data = fa.read(8096)
- if not skip and not endswithnewline:
- relabeled_fasta.write("\n")
+ while data:
+ if not skip:
+ relabeled_fasta.write(data)
+ endswithnewline = data.endswith("\n")
+ data = fa.read(8096)
+ if not skip and not endswithnewline:
+ relabeled_fasta.write("\n")
- with col.open("metadata.yaml", "rt") as md:
- metadata_content = ruamel.yaml.round_trip_load(md)
- metadata_content["id"] = subject
- add_lc_filename(metadata_content, metadata_content["id"])
- doc, metadata = schema_salad.schema.load_and_validate(document_loader, avsc_names, metadata_content, False, False)
- g = schema_salad.jsonld_context.makerdf(subject, doc, document_loader.ctx)
- merged_metadata.write(g.serialize(format="ntriples").decode("utf-8"))
+ merged_metadata.write(g.serialize(format="ntriples").decode("utf-8"))
+ except Exception as e:
+ logging.exception("Error processing collection %s" % uuid)
shutil.rmtree(".cache")
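
Note that besides adding the try/except, the patch reorders the per-item
steps: metadata.yaml is now parsed and schema-validated before
sequence.fasta is relabeled, so an item with bad metadata raises before
any of its triples or sequence data have been written. A rough sketch of
the resulting flow, with load_and_validate_metadata() and relabel_fasta()
as hypothetical stand-ins for the schema-salad validation and the chunked
FASTA copy:

import io

def load_and_validate_metadata(item, subject):
    # Stand-in for the ruamel.yaml load + schema_salad validation; raises
    # on malformed metadata, otherwise returns serialized N-Triples.
    if "metadata" not in item:
        raise ValueError("missing metadata.yaml")
    return '<%s> <http://example.org/validated> "true" .\n' % subject

def relabel_fasta(item, subject, relabeled_fasta):
    # Stand-in for the chunked copy of sequence.fasta under the new label.
    relabeled_fasta.write(">%s\n%s\n" % (subject, item["sequence"]))

def process_item(item, merged_metadata, relabeled_fasta):
    subject = "http://covid19.genenetwork.org/resource/%s" % item["uuid"]
    # 1. Validate first: bad metadata fails the item before any output.
    triples = load_and_validate_metadata(item, subject)
    # 2. Only then emit the relabeled sequence.
    relabel_fasta(item, subject, relabeled_fasta)
    # 3. Serialize the validated metadata graph last.
    merged_metadata.write(triples)

merged_metadata, relabeled_fasta = io.StringIO(), io.StringIO()
process_item({"uuid": "item-1", "metadata": {}, "sequence": "ACGT"},
             merged_metadata, relabeled_fasta)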