From b3d2ccf951903ac0b7d717357fb1cccca26fbd15 Mon Sep 17 00:00:00 2001
From: Peter Amstutz
Date: Wed, 19 Aug 2020 15:19:07 -0400
Subject: Consolidate steps to scale graph generation workflow

Arvados-DCO-1.1-Signed-off-by: Peter Amstutz
---
 workflows/pangenome-generate/arv-main.cwl          | 47 +++++++++++
 workflows/pangenome-generate/collect-seqs.cwl      | 42 ++++++++++
 workflows/pangenome-generate/collect-seqs.py       | 91 ++++++++++++++++++++++
 workflows/pangenome-generate/dups2metadata.cwl     | 19 +++++
 workflows/pangenome-generate/dups2metadata.py      | 42 ++++++++++
 .../pangenome-generate/pangenome-generate_spoa.cwl | 25 ++----
 .../sort_fasta_by_quality_and_len.cwl              |  4 +
 workflows/pangenome-generate/spoa.cwl              |  4 +-
 8 files changed, 253 insertions(+), 21 deletions(-)
 create mode 100644 workflows/pangenome-generate/arv-main.cwl
 create mode 100644 workflows/pangenome-generate/collect-seqs.cwl
 create mode 100644 workflows/pangenome-generate/collect-seqs.py
 create mode 100644 workflows/pangenome-generate/dups2metadata.cwl
 create mode 100644 workflows/pangenome-generate/dups2metadata.py

(limited to 'workflows/pangenome-generate')

diff --git a/workflows/pangenome-generate/arv-main.cwl b/workflows/pangenome-generate/arv-main.cwl
new file mode 100644
index 0000000..176cfe7
--- /dev/null
+++ b/workflows/pangenome-generate/arv-main.cwl
@@ -0,0 +1,47 @@
+cwlVersion: v1.1
+class: Workflow
+requirements:
+  SubworkflowFeatureRequirement: {}
+inputs:
+  src_project: string
+  metadataSchema: File
+  exclude: File?
+outputs:
+  odgiGraph:
+    type: File
+    outputSource: pangenome-generate/odgiGraph
+  odgiPNG:
+    type: File
+    outputSource: pangenome-generate/odgiPNG
+  spoaGFA:
+    type: File
+    outputSource: pangenome-generate/spoaGFA
+  odgiRDF:
+    type: File
+    outputSource: pangenome-generate/odgiRDF
+  readsMergeDedup:
+    type: File
+    outputSource: pangenome-generate/readsMergeDedup
+  mergedMetadata:
+    type: File
+    outputSource: pangenome-generate/mergedMetadata
+  indexed_paths:
+    type: File
+    outputSource: pangenome-generate/indexed_paths
+  colinear_components:
+    type: Directory
+    outputSource: pangenome-generate/colinear_components
+steps:
+  collect-seqs:
+    run: collect-seqs.cwl
+    in:
+      src_project: src_project
+      schema: metadataSchema
+      exclude: exclude
+    out: [relabeledSeqs, mergedMetadata]
+  pangenome-generate:
+    run: pangenome-generate_spoa.cwl
+    in:
+      seqs: collect-seqs/relabeledSeqs
+      metadata: collect-seqs/mergedMetadata
+    out: [odgiGraph, odgiPNG, spoaGFA, odgiRDF, readsMergeDedup, mergedMetadata, indexed_paths, colinear_components]
diff --git a/workflows/pangenome-generate/collect-seqs.cwl b/workflows/pangenome-generate/collect-seqs.cwl
new file mode 100644
index 0000000..3511df1
--- /dev/null
+++ b/workflows/pangenome-generate/collect-seqs.cwl
@@ -0,0 +1,42 @@
+cwlVersion: v1.1
+class: CommandLineTool
+$namespaces:
+  arv: "http://arvados.org/cwl#"
+  cwltool: "http://commonwl.org/cwltool#"
+requirements:
+  arv:APIRequirement: {}
+  arv:RuntimeConstraints:
+    outputDirType: keep_output_dir
+  DockerRequirement:
+    dockerPull: arvados/jobs:2.0.3
+  WorkReuse:
+    enableReuse: false
+  ResourceRequirement:
+    coresMin: 1
+    ramMin: 1024
+baseCommand: python3
+inputs:
+  script:
+    type: File
+    default:
+      class: File
+      location: collect-seqs.py
+    inputBinding: {position: 1}
+  src_project:
+    type: string
+    inputBinding: {position: 2}
+  schema:
+    type: File
+    inputBinding: {position: 3}
+  exclude:
+    type: File?
+    inputBinding: {position: 4}
+outputs:
+  relabeledSeqs:
+    type: File
+    outputBinding:
+      glob: relabeledSeqs.fasta
+  mergedMetadata:
+    type: File
+    outputBinding:
+      glob: mergedMetadata.ttl
diff --git a/workflows/pangenome-generate/collect-seqs.py b/workflows/pangenome-generate/collect-seqs.py
new file mode 100644
index 0000000..af4a0dc
--- /dev/null
+++ b/workflows/pangenome-generate/collect-seqs.py
@@ -0,0 +1,91 @@
+import sys
+import arvados
+import json
+import shutil
+import arvados.collection
+import ruamel.yaml
+import schema_salad.schema
+import schema_salad.jsonld_context
+from schema_salad.sourceline import add_lc_filename
+
+# Usage: collect-seqs.py SRC_PROJECT METADATA_SCHEMA [EXCLUDE_FILE]
+#
+# Reads every collection owned by SRC_PROJECT whose "status" property is
+# "validated" and writes relabeledSeqs.fasta and mergedMetadata.ttl to the
+# current directory.
+
+api = arvados.api()
+keepclient = arvados.keep.KeepClient(api_client=api)
+
+validated = arvados.util.list_all(api.collections().list, filters=[
+    ["owner_uuid", "=", sys.argv[1]],
+    ["properties.status", "=", "validated"]])
+
+# Process the collections in portable data hash order.
+validated.sort(key=lambda v: v["portable_data_hash"])
+
+metadataSchema = sys.argv[2]
+
+# Subjects or original FASTA labels listed in the optional exclude file are
+# left out of relabeledSeqs.fasta; their metadata is still written.
+blacklist = set()
+if len(sys.argv) > 3:
+    with open(sys.argv[3]) as bl:
+        for line in bl:
+            entry = line.strip()
+            # Ignore blank lines so that an empty entry can never match an
+            # empty FASTA label.
+            if entry:
+                blacklist.add(entry)
+
+(document_loader,
+ avsc_names,
+ schema_metadata,
+ metaschema_loader) = schema_salad.schema.load_schema(metadataSchema)
+
+# NOTE(review): the two triple templates written to merged_metadata below
+# have no predicate IRI, and the "true" literal has no datatype IRI after
+# "^^", so the lines they produce are not valid N-Triples.  Presumably the
+# IRI terms were lost from these string literals -- confirm against the
+# metadata schema and restore them.
+
+# The context manager flushes and closes both outputs even when a collection
+# fails to load or validate part-way through.
+with open("relabeledSeqs.fasta", "wt") as relabeled_fasta, \
+     open("mergedMetadata.ttl", "wt") as merged_metadata:
+    for item in validated:
+        pdh = item["portable_data_hash"]
+        with arvados.collection.CollectionReader(pdh, api_client=api, keep_client=keepclient) as col:
+            with col.open("sequence.fasta", "rt") as fa:
+                subject = "http://collections.lugli.arvadosapi.com/c=%s/sequence.fasta" % pdh
+                label = fa.readline().strip()
+                merged_metadata.write("<%s> \"%s\" .\n" % (subject, label[1:].replace('"', '\\"')))
+                skip = (subject in blacklist or label[1:] in blacklist)
+                if skip:
+                    merged_metadata.write("<%s> \"true\"^^ .\n" % subject)
+                if not skip:
+                    relabeled_fasta.write(">"+subject+"\n")
+                # Reset for every collection: when no sequence data follows
+                # the header line there is nothing left to terminate, and the
+                # flag must not leak in from the previous collection.
+                endswithnewline = True
+                data = fa.read(8096)
+                while data:
+                    if not skip:
+                        relabeled_fasta.write(data)
+                    endswithnewline = data.endswith("\n")
+                    data = fa.read(8096)
+                if not skip and not endswithnewline:
+                    relabeled_fasta.write("\n")
+
+            with col.open("metadata.yaml", "rt") as md:
+                metadata_content = ruamel.yaml.round_trip_load(md)
+                metadata_content["id"] = subject
+                add_lc_filename(metadata_content, metadata_content["id"])
+                doc, metadata = schema_salad.schema.load_and_validate(document_loader, avsc_names, metadata_content, False, False)
+                g = schema_salad.jsonld_context.makerdf(subject, doc, document_loader.ctx)
+                merged_metadata.write(g.serialize(format="ntriples").decode("utf-8"))
+
+# Clean up .cache from the working directory; a missing directory is not an
+# error.
+shutil.rmtree(".cache", ignore_errors=True)
diff --git a/workflows/pangenome-generate/dups2metadata.cwl b/workflows/pangenome-generate/dups2metadata.cwl
new file mode 100644
index 0000000..cf54675
--- /dev/null
+++ b/workflows/pangenome-generate/dups2metadata.cwl
@@ -0,0 +1,19 @@
+cwlVersion: v1.1
+class: CommandLineTool
+baseCommand: python
+inputs:
+  script:
+    type: File
+    default:
+      class: File
+      location: dups2metadata.py
+    inputBinding: {position: 1}
+  metadata:
+    type: File
+    inputBinding: {position: 2}
+  dups:
+    type: File?
+    inputBinding: {position: 3}
+stdout: mergedmetadata.ttl
+outputs:
+  merged: stdout
diff --git a/workflows/pangenome-generate/dups2metadata.py b/workflows/pangenome-generate/dups2metadata.py
new file mode 100644
index 0000000..9bda10a
--- /dev/null
+++ b/workflows/pangenome-generate/dups2metadata.py
@@ -0,0 +1,42 @@
+import logging
+import re
+import sys
+
+# Usage: dups2metadata.py METADATA [DUPS]
+#
+# Copies METADATA to stdout, then prints one line for every duplicate listed
+# in the optional DUPS file.
+
+# Copy the metadata through unchanged.  Each line already carries its own
+# newline, so write it as-is rather than print() it, which would insert a
+# blank line after every statement.
+last = "\n"
+with open(sys.argv[1], "rt") as md:
+    for d in md:
+        sys.stdout.write(d)
+        last = d
+if not last.endswith("\n"):
+    # Keep whatever is printed next from being glued onto an unterminated
+    # final line.
+    sys.stdout.write("\n")
+
+if len(sys.argv) < 3:
+    sys.exit(0)
+
+# NOTE(review): the template printed below has only two terms, so its output
+# is not a complete N-Triples statement.  Presumably a predicate IRI was lost
+# from the string literal -- confirm and restore it.
+
+# A DUPS line is a number, a tab, and a comma-separated list of names; every
+# name after the first is paired with the first one.
+with open(sys.argv[2], "rt") as sameseqs:
+    for d in sameseqs:
+        logging.warning(d)
+        g = re.match(r"\d+\t(.*)", d)
+        if g is None:
+            # Blank or malformed line: nothing to emit.
+            continue
+        logging.warning("%s", g.group(1))
+        sp = g.group(1).split(",")
+        for n in sp[1:]:
+            print("<%s> <%s> ." % (n.strip(), sp[0].strip()))
diff --git a/workflows/pangenome-generate/pangenome-generate_spoa.cwl b/workflows/pangenome-generate/pangenome-generate_spoa.cwl
index 8b34ff8..33bf64e 100644
--- a/workflows/pangenome-generate/pangenome-generate_spoa.cwl
+++ b/workflows/pangenome-generate/pangenome-generate_spoa.cwl
@@ -5,11 +5,8 @@ requirements:
   ScatterFeatureRequirement: {}
   StepInputExpressionRequirement: {}
 inputs:
-  inputReads: File[]
-  metadata: File[]
-  metadataSchema: File
-  subjects: string[]
-  exclude: File?
+  seqs: File
+  metadata: File
   bin_widths:
     type: int[]
     default: [ 1, 4, 16, 64, 256, 1000, 4000, 16000]
@@ -36,7 +33,7 @@ outputs:
     outputSource: dedup_and_sort_by_quality_and_len/reads_dedupped_sorted_by_quality_and_len
   mergedMetadata:
     type: File
-    outputSource: mergeMetadata/merged
+    outputSource: dups2metadata/merged
   indexed_paths:
     type: File
     outputSource: index_paths/indexed_paths
@@ -44,15 +41,8 @@ outputs:
     type: Directory
     outputSource: segment_components/colinear_components
 steps:
-  relabel:
-    in:
-      readsFA: inputReads
-      subjects: subjects
-      exclude: exclude
-    out: [relabeledSeqs, originalLabels]
-    run: relabel-seqs.cwl
   dedup_and_sort_by_quality_and_len:
-    in: {reads: relabel/relabeledSeqs}
+    in: {reads: seqs}
     out: [reads_dedupped_sorted_by_quality_and_len, dups]
     run: sort_fasta_by_quality_and_len.cwl
   induceGraph:
@@ -81,15 +71,12 @@ steps:
     in: {odgi: buildGraph/odgiGraph}
     out: [rdf]
     run: odgi_to_rdf.cwl
-  mergeMetadata:
+  dups2metadata:
     in:
       metadata: metadata
-      metadataSchema: metadataSchema
-      subjects: subjects
       dups: dedup_and_sort_by_quality_and_len/dups
-      originalLabels: relabel/originalLabels
     out: [merged]
-    run: merge-metadata.cwl
+    run: dups2metadata.cwl
   bin_paths:
     run: ../tools/odgi/odgi_bin.cwl
     in:
diff --git a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl
index 59f027e..f8da5d3 100644
--- a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl
+++ b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl
@@ -1,5 +1,9 @@
 cwlVersion: v1.1
 class: CommandLineTool
+hints:
+  ResourceRequirement:
+    coresMin: 1
+    ramMin: 1024
 inputs:
   readsFA:
     type: File
diff --git a/workflows/pangenome-generate/spoa.cwl b/workflows/pangenome-generate/spoa.cwl
index 1e390d8..132633c 100644
--- a/workflows/pangenome-generate/spoa.cwl
+++ b/workflows/pangenome-generate/spoa.cwl
@@ -2,8 +2,7 @@ cwlVersion: v1.1
 class: CommandLineTool
 inputs:
   readsFA: File
-stdout: $(inputs.readsFA.nameroot).g6.gfa
-script:
+  script:
     type: File
     default: {class: File, location: relabel-seqs.py}
 outputs:
@@ -20,6 +19,7 @@ hints:
     ramMin: $(15 * 1024)
     outdirMin: $(Math.ceil(inputs.readsFA.size/(1024*1024*1024) + 20))
 baseCommand: spoa
+stdout: $(inputs.readsFA.nameroot).g6.gfa
 arguments: [
   $(inputs.readsFA),
   -G,
-- 
cgit v1.2.3