From 85c50a6ae14dab51eb4287404955e0508e2df1b9 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Wed, 11 Nov 2020 14:53:04 -0500 Subject: Make collect-seqs skip bad inputs. Arvados-DCO-1.1-Signed-off-by: Peter Amstutz --- workflows/pangenome-generate/collect-seqs.py | 55 +++++++++++++++------------- 1 file changed, 30 insertions(+), 25 deletions(-) diff --git a/workflows/pangenome-generate/collect-seqs.py b/workflows/pangenome-generate/collect-seqs.py index 225a61f..cb5bc33 100644 --- a/workflows/pangenome-generate/collect-seqs.py +++ b/workflows/pangenome-generate/collect-seqs.py @@ -2,6 +2,7 @@ import sys import arvados import json import shutil +import logging import arvados.collection import ruamel.yaml import schema_salad.schema @@ -37,34 +38,38 @@ if len(sys.argv) > 3: for item in validated: pdh = item["portable_data_hash"] uuid = item["uuid"] - with arvados.collection.CollectionReader(pdh, api_client=api, keep_client=keepclient) as col: - with col.open("sequence.fasta", "rt") as fa: - subject = "http://covid19.genenetwork.org/resource/%s" % uuid - label = fa.readline().strip() - merged_metadata.write("<%s> \"%s\" .\n" % (subject, label[1:].replace('"', '\\"'))) - merged_metadata.write("<%s> \"%s\" .\n" % (subject, pdh)) - merged_metadata.write("<%s> \"%s\" .\n" % (subject, item["version"])) - skip = (subject in blacklist or label[1:] in blacklist) - if skip: - merged_metadata.write("<%s> \"true\"^^ .\n" % subject) - if not skip: - relabeled_fasta.write(">"+subject+"\n") - data = fa.read(8096) - while data: + try: + subject = "http://covid19.genenetwork.org/resource/%s" % uuid + with arvados.collection.CollectionReader(pdh, api_client=api, keep_client=keepclient) as col: + with col.open("metadata.yaml", "rt") as md: + metadata_content = ruamel.yaml.round_trip_load(md) + metadata_content["id"] = subject + add_lc_filename(metadata_content, metadata_content["id"]) + doc, metadata = schema_salad.schema.load_and_validate(document_loader, avsc_names, metadata_content, False, False) + g = schema_salad.jsonld_context.makerdf(subject, doc, document_loader.ctx) + + with col.open("sequence.fasta", "rt") as fa: + label = fa.readline().strip() + merged_metadata.write("<%s> \"%s\" .\n" % (subject, label[1:].replace('"', '\\"'))) + merged_metadata.write("<%s> \"%s\" .\n" % (subject, pdh)) + merged_metadata.write("<%s> \"%s\" .\n" % (subject, item["version"])) + skip = (subject in blacklist or label[1:] in blacklist) + if skip: + merged_metadata.write("<%s> \"true\"^^ .\n" % subject) if not skip: - relabeled_fasta.write(data) - endswithnewline = data.endswith("\n") + relabeled_fasta.write(">"+subject+"\n") data = fa.read(8096) - if not skip and not endswithnewline: - relabeled_fasta.write("\n") + while data: + if not skip: + relabeled_fasta.write(data) + endswithnewline = data.endswith("\n") + data = fa.read(8096) + if not skip and not endswithnewline: + relabeled_fasta.write("\n") - with col.open("metadata.yaml", "rt") as md: - metadata_content = ruamel.yaml.round_trip_load(md) - metadata_content["id"] = subject - add_lc_filename(metadata_content, metadata_content["id"]) - doc, metadata = schema_salad.schema.load_and_validate(document_loader, avsc_names, metadata_content, False, False) - g = schema_salad.jsonld_context.makerdf(subject, doc, document_loader.ctx) - merged_metadata.write(g.serialize(format="ntriples").decode("utf-8")) + merged_metadata.write(g.serialize(format="ntriples").decode("utf-8")) + except Exception as e: + logging.exception("Error processing collection %s" % uuid) shutil.rmtree(".cache") -- cgit v1.2.3