| | | |
|---|---|---|
| author | Peter Amstutz | 2020-08-19 15:19:07 -0400 |
| committer | Peter Amstutz | 2020-08-19 16:31:38 -0400 |
| commit | b3d2ccf951903ac0b7d717357fb1cccca26fbd15 (patch) | |
| tree | 92adcacbd208546bf3f063e6c25765e660972244 /workflows/pangenome-generate/collect-seqs.py | |
| parent | 592c921a3223c03d8a22f7a852641ac5d753fb31 (diff) | |
Consolidate steps to scale graph generation workflow
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz@curii.com>
Diffstat (limited to 'workflows/pangenome-generate/collect-seqs.py')
-rw-r--r-- | workflows/pangenome-generate/collect-seqs.py | 67
1 file changed, 67 insertions, 0 deletions
diff --git a/workflows/pangenome-generate/collect-seqs.py b/workflows/pangenome-generate/collect-seqs.py
new file mode 100644
index 0000000..af4a0dc
--- /dev/null
+++ b/workflows/pangenome-generate/collect-seqs.py
@@ -0,0 +1,67 @@
+import sys
+import arvados
+import json
+import shutil
+import arvados.collection
+import ruamel.yaml
+import schema_salad.schema
+import schema_salad.jsonld_context
+from schema_salad.sourceline import add_lc_filename
+
+api = arvados.api()
+keepclient = arvados.keep.KeepClient(api_client=api)
+
+validated = arvados.util.list_all(api.collections().list, filters=[
+    ["owner_uuid", "=", sys.argv[1]],
+    ["properties.status", "=", "validated"]])
+
+validated.sort(key=lambda v: v["portable_data_hash"])
+
+relabeled_fasta = open("relabeledSeqs.fasta", "wt")
+merged_metadata = open("mergedMetadata.ttl", "wt")
+
+metadataSchema = sys.argv[2]
+
+blacklist = set()
+if len(sys.argv) > 3:
+    with open(sys.argv[3]) as bl:
+        for l in bl:
+            blacklist.add(l.strip())
+
+(document_loader,
+ avsc_names,
+ schema_metadata,
+ metaschema_loader) = schema_salad.schema.load_schema(metadataSchema)
+
+
+for item in validated:
+    pdh = item["portable_data_hash"]
+    with arvados.collection.CollectionReader(pdh, api_client=api, keep_client=keepclient) as col:
+        with col.open("sequence.fasta", "rt") as fa:
+            subject = "http://collections.lugli.arvadosapi.com/c=%s/sequence.fasta" % pdh
+            label = fa.readline().strip()
+            merged_metadata.write("<%s> <http://biohackathon.org/bh20-seq-schema/original_fasta_label> \"%s\" .\n" % (subject, label[1:].replace('"', '\\"')))
+            skip = (subject in blacklist or label[1:] in blacklist)
+            if skip:
+                merged_metadata.write("<%s> <http://biohackathon.org/bh20-seq-schema/excluded_from_graph> \"true\"^^<http://www.w3.org/2001/XMLSchema#boolean> .\n" % subject)
+            if not skip:
+                relabeled_fasta.write(">"+subject+"\n")
+            data = fa.read(8096)
+            while data:
+                if not skip:
+                    relabeled_fasta.write(data)
+                endswithnewline = data.endswith("\n")
+                data = fa.read(8096)
+            if not skip and not endswithnewline:
+                relabeled_fasta.write("\n")
+
+        with col.open("metadata.yaml", "rt") as md:
+            metadata_content = ruamel.yaml.round_trip_load(md)
+            metadata_content["id"] = subject
+            add_lc_filename(metadata_content, metadata_content["id"])
+            doc, metadata = schema_salad.schema.load_and_validate(document_loader, avsc_names, metadata_content, False, False)
+            g = schema_salad.jsonld_context.makerdf(subject, doc, document_loader.ctx)
+            merged_metadata.write(g.serialize(format="ntriples").decode("utf-8"))
+
+
+shutil.rmtree(".cache")
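
For reference, the script takes the owner project UUID of the validated uploads as its first argument, the path to the metadata schema as its second, and an optional blacklist file as its third (see the `sys.argv` uses above). The trickiest part is the relabeling loop, which swaps each FASTA header for the collection's Keep URL and streams the sequence body through in 8096-byte chunks. Below is a minimal standalone sketch of that loop; the function name and file paths are illustrative and not part of the commit.

```python
# Minimal sketch of the relabeling loop in collect-seqs.py.
# The function name and paths are illustrative, not part of the commit.

def relabel_fasta(src_path, dst_path, subject, skip=False):
    """Append one FASTA record to dst_path, replacing its header with `subject`.

    When `skip` is True (a blacklisted sequence), the record is read but
    nothing is written, mirroring the behaviour in the commit.
    """
    with open(src_path, "rt") as fa, open(dst_path, "at") as out:
        fa.readline()                      # discard the original ">label" header
        if not skip:
            out.write(">" + subject + "\n")
        endswithnewline = True
        data = fa.read(8096)               # stream in chunks rather than slurping the file
        while data:
            if not skip:
                out.write(data)
            endswithnewline = data.endswith("\n")
            data = fa.read(8096)
        if not skip and not endswithnewline:
            out.write("\n")                # keep records newline-separated

# Example call (hypothetical portable data hash in the URL):
relabel_fasta("sequence.fasta", "relabeledSeqs.fasta",
              "http://collections.lugli.arvadosapi.com/c=<pdh>/sequence.fasta")
```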
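
The metadata half of the loop validates each collection's metadata.yaml with schema-salad and appends the result to mergedMetadata.ttl as N-Triples. The sketch below isolates that step using the same schema-salad calls as the diff; the schema path, metadata file, and subject URL are placeholders, and the `.decode()` assumes the older rdflib in use at the time, where `Graph.serialize()` returns bytes.

```python
# Sketch of the metadata validation/serialization step, reusing the
# schema-salad calls from the commit; paths and the subject URL are placeholders.
import ruamel.yaml
import schema_salad.schema
import schema_salad.jsonld_context
from schema_salad.sourceline import add_lc_filename

(document_loader, avsc_names,
 schema_metadata, metaschema_loader) = schema_salad.schema.load_schema("bh20seq-schema.yml")

with open("metadata.yaml", "rt") as md:
    metadata_content = ruamel.yaml.round_trip_load(md)

subject = "http://collections.lugli.arvadosapi.com/c=<pdh>/sequence.fasta"
metadata_content["id"] = subject
add_lc_filename(metadata_content, subject)

# Raises a validation error if metadata.yaml does not match the schema;
# the two positional False flags are passed exactly as in the commit.
doc, metadata = schema_salad.schema.load_and_validate(
    document_loader, avsc_names, metadata_content, False, False)

# Convert the validated document to an rdflib graph and emit N-Triples.
g = schema_salad.jsonld_context.makerdf(subject, doc, document_loader.ctx)
print(g.serialize(format="ntriples").decode("utf-8"))
```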