From d1e8809a15ae74d0b847abb80f9f63f53078e1d6 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Wed, 19 Aug 2020 15:15:57 -0400 Subject: Scaling pangenome generation Arvados-DCO-1.1-Signed-off-by: Peter Amstutz --- workflows/pangenome-generate/arv-main.cwl | 20 ++-- workflows/pangenome-generate/dups2metadata.py | 7 +- .../odgi-build-from-spoa-gfa.cwl | 16 +-- .../pangenome-generate/pangenome-generate_spoa.cwl | 110 ++++++++++++--------- .../sort_fasta_by_quality_and_len.cwl | 3 + workflows/pangenome-generate/spoa.cwl | 3 +- workflows/tools | 2 +- 7 files changed, 87 insertions(+), 74 deletions(-) (limited to 'workflows') diff --git a/workflows/pangenome-generate/arv-main.cwl b/workflows/pangenome-generate/arv-main.cwl index 176cfe7..8d7f83b 100644 --- a/workflows/pangenome-generate/arv-main.cwl +++ b/workflows/pangenome-generate/arv-main.cwl @@ -10,9 +10,9 @@ outputs: odgiGraph: type: File outputSource: pangenome-generate/odgiGraph - odgiPNG: - type: File - outputSource: pangenome-generate/odgiPNG +# odgiPNG: +# type: File +# outputSource: pangenome-generate/odgiPNG spoaGFA: type: File outputSource: pangenome-generate/spoaGFA @@ -25,12 +25,12 @@ outputs: mergedMetadata: type: File outputSource: pangenome-generate/mergedMetadata - indexed_paths: - type: File - outputSource: pangenome-generate/indexed_paths - colinear_components: - type: Directory - outputSource: pangenome-generate/colinear_components +# indexed_paths: +# type: File +# outputSource: pangenome-generate/indexed_paths +# colinear_components: +# type: Directory +# outputSource: pangenome-generate/colinear_components steps: collect-seqs: run: collect-seqs.cwl @@ -45,4 +45,4 @@ steps: seqs: collect-seqs/relabeledSeqs metadata: collect-seqs/mergedMetadata exclude: exclude - out: [odgiGraph, odgiPNG, spoaGFA, odgiRDF, readsMergeDedup, mergedMetadata, indexed_paths, colinear_components] + out: [odgiGraph, spoaGFA, odgiRDF, readsMergeDedup, mergedMetadata] diff --git a/workflows/pangenome-generate/dups2metadata.py b/workflows/pangenome-generate/dups2metadata.py index 9bda10a..89e7236 100644 --- a/workflows/pangenome-generate/dups2metadata.py +++ b/workflows/pangenome-generate/dups2metadata.py @@ -1,17 +1,16 @@ import sys +import re md = open(sys.argv[1], "rt") for d in md: - print(d) + sys.stdout.write(d) if len(sys.argv) < 3: exit(0) sameseqs = open(sys.argv[2], "rt") for d in sameseqs: - logging.warn(d) g = re.match(r"\d+\t(.*)", d) - logging.warn("%s", g.group(1)) sp = g.group(1).split(",") for n in sp[1:]: - print("<%s> <%s> ." % (n.strip(), sp[0].strip())) + sys.stdout.write("<%s> <%s> .\n" % (n.strip(), sp[0].strip())) diff --git a/workflows/pangenome-generate/odgi-build-from-spoa-gfa.cwl b/workflows/pangenome-generate/odgi-build-from-spoa-gfa.cwl index 2459ce7..1eadc88 100644 --- a/workflows/pangenome-generate/odgi-build-from-spoa-gfa.cwl +++ b/workflows/pangenome-generate/odgi-build-from-spoa-gfa.cwl @@ -9,21 +9,21 @@ outputs: glob: $(inputs.inputGFA.nameroot).unchop.sorted.odgi requirements: InlineJavascriptRequirement: {} - ShellCommandRequirement: {} hints: DockerRequirement: - dockerPull: "quay.io/biocontainers/odgi:v0.3--py37h8b12597_0" + dockerPull: "odgi-bash-binutils:latest" ResourceRequirement: coresMin: 4 ramMin: $(7 * 1024) outdirMin: $(Math.ceil((inputs.inputGFA.size/(1024*1024*1024)+1) * 2)) InitialWorkDirRequirement: + # Will fail if input file is not writable (odgi bug) listing: - entry: $(inputs.inputGFA) writable: true -arguments: [odgi, build, -g, $(inputs.inputGFA), -o, -, - {shellQuote: false, valueFrom: "|"}, - odgi, unchop, -i, -, -o, -, - {shellQuote: false, valueFrom: "|"}, - odgi, sort, -i, -, -p, s, -o, $(inputs.inputGFA.nameroot).unchop.sorted.odgi - ] +arguments: + - "sh" + - "-c" + - >- + odgi build -g '$(inputs.inputGFA.path)' -o - | odgi unchop -i - -o - | + odgi sort -i - -p s -o $(inputs.inputGFA.nameroot).unchop.sorted.odgi diff --git a/workflows/pangenome-generate/pangenome-generate_spoa.cwl b/workflows/pangenome-generate/pangenome-generate_spoa.cwl index 33bf64e..ed12254 100644 --- a/workflows/pangenome-generate/pangenome-generate_spoa.cwl +++ b/workflows/pangenome-generate/pangenome-generate_spoa.cwl @@ -19,9 +19,9 @@ outputs: odgiGraph: type: File outputSource: buildGraph/odgiGraph - odgiPNG: - type: File - outputSource: vizGraph/graph_image +# odgiPNG: +# type: File +# outputSource: vizGraph/graph_image spoaGFA: type: File outputSource: induceGraph/spoaGFA @@ -34,16 +34,16 @@ outputs: mergedMetadata: type: File outputSource: dups2metadata/merged - indexed_paths: - type: File - outputSource: index_paths/indexed_paths - colinear_components: - type: Directory - outputSource: segment_components/colinear_components +# indexed_paths: +# type: File +# outputSource: index_paths/indexed_paths +# colinear_components: +# type: Directory +# outputSource: segment_components/colinear_components steps: dedup_and_sort_by_quality_and_len: in: {reads: seqs} - out: [reads_dedupped_sorted_by_quality_and_len, dups] + out: [sortedReadsFA, dups] run: sort_fasta_by_quality_and_len.cwl induceGraph: in: @@ -54,19 +54,23 @@ steps: in: {inputGFA: induceGraph/spoaGFA} out: [odgiGraph] run: odgi-build-from-spoa-gfa.cwl - vizGraph: - in: - sparse_graph_index: buildGraph/odgiGraph - width: - default: 50000 - height: - default: 500 - path_per_row: - default: true - path_height: - default: 4 - out: [graph_image] - run: ../tools/odgi/odgi_viz.cwl + # vizGraph: + # in: + # sparse_graph_index: buildGraph/odgiGraph + # width: + # default: 50000 + # height: + # default: 500 + # path_per_row: + # default: true + # path_height: + # default: 4 + # out: [graph_image] + # requirements: + # ResourceRequirement: + # ramMin: $(15 * 1024) + # outdirMin: 10 + # run: ../tools/odgi/odgi_viz.cwl odgi2rdf: in: {odgi: buildGraph/odgiGraph} out: [rdf] @@ -77,29 +81,37 @@ steps: dups: dedup_and_sort_by_quality_and_len/dups out: [merged] run: dups2metadata.cwl - bin_paths: - run: ../tools/odgi/odgi_bin.cwl - in: - sparse_graph_index: buildGraph/odgiGraph - bin_width: bin_widths - scatter: bin_width - out: [ bins, pangenome_sequence ] - index_paths: - label: Create path index - run: ../tools/odgi/odgi_pathindex.cwl - in: - sparse_graph_index: buildGraph/odgiGraph - out: [ indexed_paths ] - segment_components: - label: Run component segmentation - run: ../tools/graph-genome-segmentation/component_segmentation.cwl - in: - bins: bin_paths/bins - cells_per_file: cells_per_file - pangenome_sequence: - source: bin_paths/pangenome_sequence - valueFrom: $(self[0]) - # the bin_paths step is scattered over the bin_width array, but always using the same sparse_graph_index - # the pangenome_sequence that is extracted is exactly the same for the same sparse_graph_index - # regardless of bin_width, so we take the first pangenome_sequence as input for this step - out: [ colinear_components ] + # bin_paths: + # requirements: + # ResourceRequirement: + # ramMin: 3000 + # outdirMin: 10 + # run: ../tools/odgi/odgi_bin.cwl + # in: + # sparse_graph_index: buildGraph/odgiGraph + # bin_width: bin_widths + # scatter: bin_width + # out: [ bins, pangenome_sequence ] + # index_paths: + # label: Create path index + # requirements: + # ResourceRequirement: + # ramMin: 3000 + # outdirMin: 10 + # run: ../tools/odgi/odgi_pathindex.cwl + # in: + # sparse_graph_index: buildGraph/odgiGraph + # out: [ indexed_paths ] + # segment_components: + # label: Run component segmentation + # run: ../tools/graph-genome-segmentation/component_segmentation.cwl + # in: + # bins: bin_paths/bins + # cells_per_file: cells_per_file + # pangenome_sequence: + # source: bin_paths/pangenome_sequence + # valueFrom: $(self[0]) + # # the bin_paths step is scattered over the bin_width array, but always using the same sparse_graph_index + # # the pangenome_sequence that is extracted is exactly the same for the same sparse_graph_index + # # regardless of bin_width, so we take the first pangenome_sequence as input for this step + # out: [ colinear_components ] diff --git a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl index f8da5d3..9d9b31d 100644 --- a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl +++ b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl @@ -16,6 +16,9 @@ stdout: $(inputs.readsFA.nameroot).sorted_by_quality_and_len.fasta outputs: sortedReadsFA: type: stdout + dups: + type: File + outputBinding: {glob: dups.txt} requirements: InlineJavascriptRequirement: {} ShellCommandRequirement: {} diff --git a/workflows/pangenome-generate/spoa.cwl b/workflows/pangenome-generate/spoa.cwl index 132633c..150227d 100644 --- a/workflows/pangenome-generate/spoa.cwl +++ b/workflows/pangenome-generate/spoa.cwl @@ -10,10 +10,9 @@ outputs: type: stdout requirements: InlineJavascriptRequirement: {} - ShellCommandRequirement: {} hints: DockerRequirement: - dockerPull: "quay.io/biocontainers/spoa:3.0.2--hc9558a2_0" + dockerPull: "quay.io/biocontainers/spoa:3.4.0--hc9558a2_0" ResourceRequirement: coresMin: 1 ramMin: $(15 * 1024) diff --git a/workflows/tools b/workflows/tools index 61ffac1..c67c011 160000 --- a/workflows/tools +++ b/workflows/tools @@ -1 +1 @@ -Subproject commit 61ffac1862822f08dc20b6f8e2f22634b986b0bc +Subproject commit c67c011765bea798a24485cbe0a1c6c592436521 -- cgit v1.2.3