about summary refs log tree commit diff
diff options
context:
space:
mode:
author Pjotr Prins 2020-05-12 13:55:55 -0500
committer Pjotr Prins 2020-05-12 13:55:55 -0500
commit 8e9042879db7fe061f3d18737f4102b5fd0b8c65 (patch)
tree 978645b86e805c218451beaf194bfd97f2fbd9d8
parent 40ca03045bbc4f6fb1258acb6f42a60ee5532e0d (diff)
parent 55bab826fda6c6bc3190476f8e2c61e51ef092d6 (diff)
download bh20-seq-resource-8e9042879db7fe061f3d18737f4102b5fd0b8c65.tar.gz
bh20-seq-resource-8e9042879db7fe061f3d18737f4102b5fd0b8c65.tar.lz
bh20-seq-resource-8e9042879db7fe061f3d18737f4102b5fd0b8c65.zip
Merge branch 'master' of github.com:arvados/bh20-seq-resource
-rw-r--r-- .gitmodules 3
-rw-r--r-- bh20sequploader/bh20seq-schema.yml 2
-rw-r--r-- bh20sequploader/bh20seq-shex.rdf 2
-rw-r--r-- workflows/pangenome-generate/minimap2.cwl 23
-rw-r--r-- workflows/pangenome-generate/odgi-viz.cwl 25
-rw-r--r-- workflows/pangenome-generate/odgi_to_rdf.cwl 9
-rw-r--r-- workflows/pangenome-generate/pangenome-generate.cwl 83
-rw-r--r-- workflows/pangenome-generate/seqkit-rmdup.cwl 37
m--------- workflows/tools 0
9 files changed, 78 insertions, 106 deletions
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..c7d7f99
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "workflows/tools"]
+ path = workflows/tools
+ url = https://github.com/common-workflow-library/bio-cwl-tools.git
diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml
index 99e1a11..8a16bd3 100644
--- a/bh20sequploader/bh20seq-schema.yml
+++ b/bh20sequploader/bh20seq-schema.yml
@@ -25,7 +25,7 @@ $graph:
jsonldPredicate:
_id: http://semanticscience.org/resource/SIO_000115
host_sex:
- doc: Sex of the host as defined in PATO, expect male () or female ()
+ doc: Sex of the host as defined in PATO, expect Male (http://purl.obolibrary.org/obo/PATO_0000384), Female (http://purl.obolibrary.org/obo/PATO_0000383) or Intersex (http://purl.obolibrary.org/obo/PATO_0001340)
type: string?
jsonldPredicate:
_id: http://purl.obolibrary.org/obo/PATO_0000047
diff --git a/bh20sequploader/bh20seq-shex.rdf b/bh20sequploader/bh20seq-shex.rdf
index cdf2296..a017805 100644
--- a/bh20sequploader/bh20seq-shex.rdf
+++ b/bh20sequploader/bh20seq-shex.rdf
@@ -20,7 +20,7 @@ PREFIX wikidata: <http://www.wikidata.org/entity/>
:hostShape {
efo:EFO_0000532 [ obo:NCBITaxon_~ ] ;
sio:SIO_000115 xsd:string ?;
- obo:PATO_0000047 [ obo:PATO_0000384 obo:PATO_0000383 ] ?;
+ obo:PATO_0000047 [ obo:PATO_0000384 obo:PATO_0000383 obo:PATO_0001340] ?;
obo:PATO_0000011 xsd:integer ?;
obo:NCIT_C42574 [ obo:UO_~ ] ?;
obo:NCIT_C25688 [obo:NCIT_C115935 obo:NCIT_C3833 obo:NCIT_C25269 obo:GENEPIO_0002020 obo:GENEPIO_0001849 obo:NCIT_C28554 obo:NCIT_C37987 ] ? ;
diff --git a/workflows/pangenome-generate/minimap2.cwl b/workflows/pangenome-generate/minimap2.cwl
deleted file mode 100644
index bf8eb4c..0000000
--- a/workflows/pangenome-generate/minimap2.cwl
+++ /dev/null
@@ -1,23 +0,0 @@
-cwlVersion: v1.1
-class: CommandLineTool
-inputs:
- readsFA: File
-outputs:
- readsPAF: stdout
-requirements:
- InlineJavascriptRequirement: {}
-hints:
- DockerRequirement:
- dockerPull: "quay.io/biocontainers/minimap2:2.17--h8b12597_1"
- ResourceRequirement:
- coresMin: 8
- coresMax: 32
- ramMin: $(15 * 1024)
- outdirMin: $(Math.ceil(inputs.readsFA.size/(1024*1024*1024) + 20))
-stdout: $(inputs.readsFA.nameroot).paf
-baseCommand: minimap2
-arguments: [-cx, asm20,
- -w, "1",
- -t, $(runtime.cores),
- $(inputs.readsFA),
- $(inputs.readsFA)]
diff --git a/workflows/pangenome-generate/odgi-viz.cwl b/workflows/pangenome-generate/odgi-viz.cwl
deleted file mode 100644
index d440fcb..0000000
--- a/workflows/pangenome-generate/odgi-viz.cwl
+++ /dev/null
@@ -1,25 +0,0 @@
-cwlVersion: v1.1
-class: CommandLineTool
-inputs:
- inputODGI: File
-outputs:
- odgiPNG:
- type: File
- outputBinding:
- glob: $(inputs.inputODGI.nameroot).png
-requirements:
- InlineJavascriptRequirement: {}
-hints:
- DockerRequirement:
- dockerPull: "quay.io/biocontainers/odgi:v0.3--py37h8b12597_0"
- ResourceRequirement:
- coresMin: 4
- ramMin: $(7 * 1024)
- outdirMin: 1
-baseCommand: [odgi, viz]
-arguments: [-i, $(inputs.inputODGI),
- -o, $(inputs.inputODGI.nameroot).png,
- -x, "50000",
- -y, "500",
- -R,
- -P, "4"]
diff --git a/workflows/pangenome-generate/odgi_to_rdf.cwl b/workflows/pangenome-generate/odgi_to_rdf.cwl
index e6a279b..9a2e912 100644
--- a/workflows/pangenome-generate/odgi_to_rdf.cwl
+++ b/workflows/pangenome-generate/odgi_to_rdf.cwl
@@ -10,10 +10,8 @@ requirements:
ResourceRequirement:
ramMin: $((2 * 1024) + 1)
inputs:
- - id: odgi
- type: File
- - id: output_name
- type: string?
+ odgi: File
+ output_name: string?
stdout: $(inputs.output_name || inputs.odgi.nameroot+'.ttl.xz')
@@ -23,5 +21,4 @@ arguments:
xz, --stdout]
outputs:
- - id: rdf
- type: stdout
+ rdf: stdout
diff --git a/workflows/pangenome-generate/pangenome-generate.cwl b/workflows/pangenome-generate/pangenome-generate.cwl
index 6794e2d..1d49904 100644
--- a/workflows/pangenome-generate/pangenome-generate.cwl
+++ b/workflows/pangenome-generate/pangenome-generate.cwl
@@ -1,17 +1,29 @@
+#!/usr/bin/env cwl-runner
cwlVersion: v1.1
class: Workflow
+requirements:
+ ScatterFeatureRequirement: {}
+ StepInputExpressionRequirement: {}
inputs:
inputReads: File[]
metadata: File[]
metadataSchema: File
subjects: string[]
+ bin_widths:
+ type: int[]
+ default: [ 1, 4, 16, 64, 256, 1000, 4000, 16000]
+ doc: width of each bin in basepairs along the graph vector
+ cells_per_file:
+ type: int
+ default: 100
+ doc: Cells per file on component_segmentation
outputs:
odgiGraph:
type: File
outputSource: buildGraph/odgiGraph
odgiPNG:
type: File
- outputSource: vizGraph/odgiPNG
+ outputSource: vizGraph/graph_image
seqwishGFA:
type: File
outputSource: induceGraph/seqwishGFA
@@ -20,10 +32,16 @@ outputs:
outputSource: odgi2rdf/rdf
readsMergeDedup:
type: File
- outputSource: dedup/readsMergeDedup
+ outputSource: dedup/reads_dedup
mergedMetadata:
type: File
outputSource: mergeMetadata/merged
+ indexed_paths:
+ type: File
+ outputSource: index_paths/indexed_paths
+ colinear_components:
+ type: Directory
+ outputSource: segment_components/colinear_components
steps:
relabel:
in:
@@ -32,17 +50,21 @@ steps:
out: [relabeledSeqs, originalLabels]
run: relabel-seqs.cwl
dedup:
- in: {readsFA: relabel/relabeledSeqs}
- out: [readsMergeDedup, dups]
- run: seqkit-rmdup.cwl
+ in: {reads: relabel/relabeledSeqs}
+ out: [reads_dedup, dups]
+ run: ../tools/seqkit/seqkit_rmdup.cwl
overlapReads:
- in: {readsFA: dedup/readsMergeDedup}
- out: [readsPAF]
- run: minimap2.cwl
+ in:
+ target: dedup/reads_dedup
+ query: dedup/reads_dedup
+ outputCIGAR:
+ default: true
+ out: [alignments]
+ run: ../tools/minimap2/minimap2_paf.cwl
induceGraph:
in:
- readsFA: dedup/readsMergeDedup
- readsPAF: overlapReads/readsPAF
+ readsFA: dedup/reads_dedup
+ readsPAF: overlapReads/alignments
out: [seqwishGFA]
run: seqwish.cwl
buildGraph:
@@ -50,9 +72,18 @@ steps:
out: [odgiGraph]
run: odgi-build.cwl
vizGraph:
- in: {inputODGI: buildGraph/odgiGraph}
- out: [odgiPNG]
- run: odgi-viz.cwl
+ in:
+ sparse_graph_index: buildGraph/odgiGraph
+ width:
+ default: 50000
+ height:
+ default: 500
+ path_per_row:
+ default: true
+ path_height:
+ default: 4
+ out: [graph_image]
+ run: ../tools/odgi/odgi_viz.cwl
odgi2rdf:
in: {odgi: buildGraph/odgiGraph}
out: [rdf]
@@ -66,3 +97,29 @@ steps:
originalLabels: relabel/originalLabels
out: [merged]
run: merge-metadata.cwl
+ bin_paths:
+ run: ../tools/odgi/odgi_bin.cwl
+ in:
+ sparse_graph_index: buildGraph/odgiGraph
+ bin_width: bin_widths
+ scatter: bin_width
+ out: [ bins, pangenome_sequence ]
+ index_paths:
+ label: Create path index
+ run : ../tools/odgi/odgi_pathindex.cwl
+ in:
+ sparse_graph_index: buildGraph/odgiGraph
+ out: [ indexed_paths ]
+ segment_components:
+ label: Run component segmentation
+ run: ../tools/graph-genome-segmentation/component_segmentation.cwl
+ in:
+ bins: bin_paths/bins
+ cells_per_file: cells_per_file
+ pangenome_sequence:
+ source: bin_paths/pangenome_sequence
+ valueFrom: $(self[0])
+ # the bin_paths step is scattered over the bin_width array, but always using the same sparse_graph_index
+ # the pangenome_sequence that is extracted is exactly the same for the same sparse_graph_index
+ # regardless of bin_width, so we take the first pangenome_sequence as input for this step
+ out: [ colinear_components ]
diff --git a/workflows/pangenome-generate/seqkit-rmdup.cwl b/workflows/pangenome-generate/seqkit-rmdup.cwl
deleted file mode 100644
index 071fa66..0000000
--- a/workflows/pangenome-generate/seqkit-rmdup.cwl
+++ /dev/null
@@ -1,37 +0,0 @@
-cwlVersion: v1.1
-class: CommandLineTool
-inputs:
- readsFA: File
-outputs:
- readsMergeDedup:
- type: File
- outputBinding:
- glob: readsMergeDedup.fasta
- dups:
- type: File?
- outputBinding:
- glob: dups.txt
-requirements:
- InlineJavascriptRequirement: {}
-hints:
- DockerRequirement:
- dockerPull: "quay.io/biocontainers/seqkit:0.7.1--0"
- ResourceRequirement:
- coresMin: 8
- coresMax: 32
- ramMin: $(7 * 1024)
- outdirMin: |
- ${
- var sum = 0;
- for (var i = 0; i < inputs.readsFA.length; i++) {
- sum += inputs.readsFA[i].size;
- }
- return (sum/(1024*1024*1024)+1) + 20;
- }
-baseCommand: seqkit
-arguments: [rmdup,
- --by-seq,
- --ignore-case,
- --dup-num-file, dups.txt,
- -o, readsMergeDedup.fasta,
- $(inputs.readsFA)]
diff --git a/workflows/tools b/workflows/tools
new file mode 160000
+Subproject 659e174d0d42ed6b9afd79d9e6f68e225c526d1