about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Peter Amstutz 2020-08-19 15:15:57 -0400
committer Peter Amstutz 2020-08-19 16:31:38 -0400
commit d1e8809a15ae74d0b847abb80f9f63f53078e1d6 (patch)
tree 77a62801b9e8928889d85dadccf1ffce85b2e0cb
parent b3d2ccf951903ac0b7d717357fb1cccca26fbd15 (diff)
download bh20-seq-resource-d1e8809a15ae74d0b847abb80f9f63f53078e1d6.tar.gz
bh20-seq-resource-d1e8809a15ae74d0b847abb80f9f63f53078e1d6.tar.lz
bh20-seq-resource-d1e8809a15ae74d0b847abb80f9f63f53078e1d6.zip
Scaling pangenome generation
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz@curii.com>
-rw-r--r-- workflows/pangenome-generate/arv-main.cwl 20
-rw-r--r-- workflows/pangenome-generate/dups2metadata.py 7
-rw-r--r-- workflows/pangenome-generate/odgi-build-from-spoa-gfa.cwl 16
-rw-r--r-- workflows/pangenome-generate/pangenome-generate_spoa.cwl 110
-rw-r--r-- workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl 3
-rw-r--r-- workflows/pangenome-generate/spoa.cwl 3
m--------- workflows/tools 0
7 files changed, 86 insertions, 73 deletions
diff --git a/workflows/pangenome-generate/arv-main.cwl b/workflows/pangenome-generate/arv-main.cwl
index 176cfe7..8d7f83b 100644
--- a/workflows/pangenome-generate/arv-main.cwl
+++ b/workflows/pangenome-generate/arv-main.cwl
@@ -10,9 +10,9 @@ outputs:
odgiGraph:
type: File
outputSource: pangenome-generate/odgiGraph
- odgiPNG:
- type: File
- outputSource: pangenome-generate/odgiPNG
+# odgiPNG:
+# type: File
+# outputSource: pangenome-generate/odgiPNG
spoaGFA:
type: File
outputSource: pangenome-generate/spoaGFA
@@ -25,12 +25,12 @@ outputs:
mergedMetadata:
type: File
outputSource: pangenome-generate/mergedMetadata
- indexed_paths:
- type: File
- outputSource: pangenome-generate/indexed_paths
- colinear_components:
- type: Directory
- outputSource: pangenome-generate/colinear_components
+# indexed_paths:
+# type: File
+# outputSource: pangenome-generate/indexed_paths
+# colinear_components:
+# type: Directory
+# outputSource: pangenome-generate/colinear_components
steps:
collect-seqs:
run: collect-seqs.cwl
@@ -45,4 +45,4 @@ steps:
seqs: collect-seqs/relabeledSeqs
metadata: collect-seqs/mergedMetadata
exclude: exclude
- out: [odgiGraph, odgiPNG, spoaGFA, odgiRDF, readsMergeDedup, mergedMetadata, indexed_paths, colinear_components]
+ out: [odgiGraph, spoaGFA, odgiRDF, readsMergeDedup, mergedMetadata]
diff --git a/workflows/pangenome-generate/dups2metadata.py b/workflows/pangenome-generate/dups2metadata.py
index 9bda10a..89e7236 100644
--- a/workflows/pangenome-generate/dups2metadata.py
+++ b/workflows/pangenome-generate/dups2metadata.py
@@ -1,17 +1,16 @@
import sys
+import re
md = open(sys.argv[1], "rt")
for d in md:
- print(d)
+ sys.stdout.write(d)
if len(sys.argv) < 3:
exit(0)
sameseqs = open(sys.argv[2], "rt")
for d in sameseqs:
- logging.warn(d)
g = re.match(r"\d+\t(.*)", d)
- logging.warn("%s", g.group(1))
sp = g.group(1).split(",")
for n in sp[1:]:
- print("<%s> <http://biohackathon.org/bh20-seq-schema/has_duplicate_sequence> <%s> ." % (n.strip(), sp[0].strip()))
+ sys.stdout.write("<%s> <http://biohackathon.org/bh20-seq-schema/has_duplicate_sequence> <%s> .\n" % (n.strip(), sp[0].strip()))
diff --git a/workflows/pangenome-generate/odgi-build-from-spoa-gfa.cwl b/workflows/pangenome-generate/odgi-build-from-spoa-gfa.cwl
index 2459ce7..1eadc88 100644
--- a/workflows/pangenome-generate/odgi-build-from-spoa-gfa.cwl
+++ b/workflows/pangenome-generate/odgi-build-from-spoa-gfa.cwl
@@ -9,21 +9,21 @@ outputs:
glob: $(inputs.inputGFA.nameroot).unchop.sorted.odgi
requirements:
InlineJavascriptRequirement: {}
- ShellCommandRequirement: {}
hints:
DockerRequirement:
- dockerPull: "quay.io/biocontainers/odgi:v0.3--py37h8b12597_0"
+ dockerPull: "odgi-bash-binutils:latest"
ResourceRequirement:
coresMin: 4
ramMin: $(7 * 1024)
outdirMin: $(Math.ceil((inputs.inputGFA.size/(1024*1024*1024)+1) * 2))
InitialWorkDirRequirement:
+ # Will fail if input file is not writable (odgi bug)
listing:
- entry: $(inputs.inputGFA)
writable: true
-arguments: [odgi, build, -g, $(inputs.inputGFA), -o, -,
- {shellQuote: false, valueFrom: "|"},
- odgi, unchop, -i, -, -o, -,
- {shellQuote: false, valueFrom: "|"},
- odgi, sort, -i, -, -p, s, -o, $(inputs.inputGFA.nameroot).unchop.sorted.odgi
- ]
+arguments:
+ - "sh"
+ - "-c"
+ - >-
+ odgi build -g '$(inputs.inputGFA.path)' -o - | odgi unchop -i - -o - |
+ odgi sort -i - -p s -o $(inputs.inputGFA.nameroot).unchop.sorted.odgi
diff --git a/workflows/pangenome-generate/pangenome-generate_spoa.cwl b/workflows/pangenome-generate/pangenome-generate_spoa.cwl
index 33bf64e..ed12254 100644
--- a/workflows/pangenome-generate/pangenome-generate_spoa.cwl
+++ b/workflows/pangenome-generate/pangenome-generate_spoa.cwl
@@ -19,9 +19,9 @@ outputs:
odgiGraph:
type: File
outputSource: buildGraph/odgiGraph
- odgiPNG:
- type: File
- outputSource: vizGraph/graph_image
+# odgiPNG:
+# type: File
+# outputSource: vizGraph/graph_image
spoaGFA:
type: File
outputSource: induceGraph/spoaGFA
@@ -34,16 +34,16 @@ outputs:
mergedMetadata:
type: File
outputSource: dups2metadata/merged
- indexed_paths:
- type: File
- outputSource: index_paths/indexed_paths
- colinear_components:
- type: Directory
- outputSource: segment_components/colinear_components
+# indexed_paths:
+# type: File
+# outputSource: index_paths/indexed_paths
+# colinear_components:
+# type: Directory
+# outputSource: segment_components/colinear_components
steps:
dedup_and_sort_by_quality_and_len:
in: {reads: seqs}
- out: [reads_dedupped_sorted_by_quality_and_len, dups]
+ out: [sortedReadsFA, dups]
run: sort_fasta_by_quality_and_len.cwl
induceGraph:
in:
@@ -54,19 +54,23 @@ steps:
in: {inputGFA: induceGraph/spoaGFA}
out: [odgiGraph]
run: odgi-build-from-spoa-gfa.cwl
- vizGraph:
- in:
- sparse_graph_index: buildGraph/odgiGraph
- width:
- default: 50000
- height:
- default: 500
- path_per_row:
- default: true
- path_height:
- default: 4
- out: [graph_image]
- run: ../tools/odgi/odgi_viz.cwl
+ # vizGraph:
+ # in:
+ # sparse_graph_index: buildGraph/odgiGraph
+ # width:
+ # default: 50000
+ # height:
+ # default: 500
+ # path_per_row:
+ # default: true
+ # path_height:
+ # default: 4
+ # out: [graph_image]
+ # requirements:
+ # ResourceRequirement:
+ # ramMin: $(15 * 1024)
+ # outdirMin: 10
+ # run: ../tools/odgi/odgi_viz.cwl
odgi2rdf:
in: {odgi: buildGraph/odgiGraph}
out: [rdf]
@@ -77,29 +81,37 @@ steps:
dups: dedup_and_sort_by_quality_and_len/dups
out: [merged]
run: dups2metadata.cwl
- bin_paths:
- run: ../tools/odgi/odgi_bin.cwl
- in:
- sparse_graph_index: buildGraph/odgiGraph
- bin_width: bin_widths
- scatter: bin_width
- out: [ bins, pangenome_sequence ]
- index_paths:
- label: Create path index
- run: ../tools/odgi/odgi_pathindex.cwl
- in:
- sparse_graph_index: buildGraph/odgiGraph
- out: [ indexed_paths ]
- segment_components:
- label: Run component segmentation
- run: ../tools/graph-genome-segmentation/component_segmentation.cwl
- in:
- bins: bin_paths/bins
- cells_per_file: cells_per_file
- pangenome_sequence:
- source: bin_paths/pangenome_sequence
- valueFrom: $(self[0])
- # the bin_paths step is scattered over the bin_width array, but always using the same sparse_graph_index
- # the pangenome_sequence that is extracted is exactly the same for the same sparse_graph_index
- # regardless of bin_width, so we take the first pangenome_sequence as input for this step
- out: [ colinear_components ]
+ # bin_paths:
+ # requirements:
+ # ResourceRequirement:
+ # ramMin: 3000
+ # outdirMin: 10
+ # run: ../tools/odgi/odgi_bin.cwl
+ # in:
+ # sparse_graph_index: buildGraph/odgiGraph
+ # bin_width: bin_widths
+ # scatter: bin_width
+ # out: [ bins, pangenome_sequence ]
+ # index_paths:
+ # label: Create path index
+ # requirements:
+ # ResourceRequirement:
+ # ramMin: 3000
+ # outdirMin: 10
+ # run: ../tools/odgi/odgi_pathindex.cwl
+ # in:
+ # sparse_graph_index: buildGraph/odgiGraph
+ # out: [ indexed_paths ]
+ # segment_components:
+ # label: Run component segmentation
+ # run: ../tools/graph-genome-segmentation/component_segmentation.cwl
+ # in:
+ # bins: bin_paths/bins
+ # cells_per_file: cells_per_file
+ # pangenome_sequence:
+ # source: bin_paths/pangenome_sequence
+ # valueFrom: $(self[0])
+ # # the bin_paths step is scattered over the bin_width array, but always using the same sparse_graph_index
+ # # the pangenome_sequence that is extracted is exactly the same for the same sparse_graph_index
+ # # regardless of bin_width, so we take the first pangenome_sequence as input for this step
+ # out: [ colinear_components ]
diff --git a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl
index f8da5d3..9d9b31d 100644
--- a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl
+++ b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl
@@ -16,6 +16,9 @@ stdout: $(inputs.readsFA.nameroot).sorted_by_quality_and_len.fasta
outputs:
sortedReadsFA:
type: stdout
+ dups:
+ type: File
+ outputBinding: {glob: dups.txt}
requirements:
InlineJavascriptRequirement: {}
ShellCommandRequirement: {}
diff --git a/workflows/pangenome-generate/spoa.cwl b/workflows/pangenome-generate/spoa.cwl
index 132633c..150227d 100644
--- a/workflows/pangenome-generate/spoa.cwl
+++ b/workflows/pangenome-generate/spoa.cwl
@@ -10,10 +10,9 @@ outputs:
type: stdout
requirements:
InlineJavascriptRequirement: {}
- ShellCommandRequirement: {}
hints:
DockerRequirement:
- dockerPull: "quay.io/biocontainers/spoa:3.0.2--hc9558a2_0"
+ dockerPull: "quay.io/biocontainers/spoa:3.4.0--hc9558a2_0"
ResourceRequirement:
coresMin: 1
ramMin: $(15 * 1024)
diff --git a/workflows/tools b/workflows/tools
-Subproject 61ffac1862822f08dc20b6f8e2f22634b986b0b
+Subproject c67c011765bea798a24485cbe0a1c6c59243652