aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPjotr Prins2021-01-26 16:22:19 +0000
committerGitHub2021-01-26 16:22:19 +0000
commita54122023ef2bde4ef4fc39355b3491678bbf773 (patch)
tree150bef4a574486db9e7c16d87be89ad219d84bb1
parentb112b3203e34ea61dfdf802bce5036f938eaa774 (diff)
parent4842117f530aba08b4a253aee533a8d4802b1c12 (diff)
downloadbh20-seq-resource-a54122023ef2bde4ef4fc39355b3491678bbf773.tar.gz
bh20-seq-resource-a54122023ef2bde4ef4fc39355b3491678bbf773.tar.lz
bh20-seq-resource-a54122023ef2bde4ef4fc39355b3491678bbf773.zip
Merge pull request #124 from urbanslug/master
Add phylogeny workflow
-rw-r--r--workflows/phylogeny/README.md38
-rw-r--r--workflows/phylogeny/align.cwl61
-rw-r--r--workflows/phylogeny/augur.cwl36
-rw-r--r--workflows/phylogeny/awk-coverage.cwl23
-rw-r--r--workflows/phylogeny/clado-job.yml78
-rw-r--r--workflows/phylogeny/coverage.cwl36
-rw-r--r--workflows/phylogeny/metadata.cwl33
-rw-r--r--workflows/phylogeny/newick.cwl49
-rw-r--r--workflows/phylogeny/phylogeny.cwl118
9 files changed, 472 insertions, 0 deletions
diff --git a/workflows/phylogeny/README.md b/workflows/phylogeny/README.md
new file mode 100644
index 0000000..6ecc047
--- /dev/null
+++ b/workflows/phylogeny/README.md
@@ -0,0 +1,38 @@
+A workflow to generate a phylogeny that can be visualized using [auspice](https://github.com/urbanslug/auspice).
+Expects a multi-fasta file path at [pggb_fasta][1] and generates a tree in `json` format.
+
+#### Dependencies
+
+Depends on:
+ - [pggb](https://github.com/pangenome/pggb/blob/master/pggb)
+ * [wfmash](https://github.com/ekg/wfmash)
+ * [seqwish](https://github.com/ekg/seqwish)
+ * [smoothxg](https://github.com/pangenome/smoothxg)
+ * [odgi](https://github.com/vgteam/odgi)
+
+ - [taxophages](https://github.com/urbanslug/taxophages/)
+ * Clone and run with `python main.py ...`
+
+ - [augur](https://github.com/nextstrain/augur)
+
+
+#### Running
+
+Expects taxophages to be cloned in the parent directory, but you can update the [main_py_script][2] path to point to wherever it is.
+
+Run the phylogeny workflow with the command below after specifying your path to [pggb_fasta][1].
+
+```bash
+# R_PACKAGES: directory holding R packages; needed if they were installed with install.packages on the server, e.g. https://github.com/urbanslug/taxophages/blob/master/scripts/deps.R
+# TAXOPHAGES_ENV: helps taxophages figure out where it is being run. AUGUR_RECURSION_LIMIT: augur isn't used to working with so many nested values.
+R_PACKAGES="${HOME}/RLibraries" TAXOPHAGES_ENV=server AUGUR_RECURSION_LIMIT=30000 \
+cwltool --preserve-entire-environment --no-container phylogeny.cwl clado-job.yml
+```
+
+Alternatively run any workflow with
+```
+cwltool --no-container <workflow>.cwl clado-job.yml
+```
+
+[1]: clado-job.yml#L8
+[2]: clado-job.yml#L28
diff --git a/workflows/phylogeny/align.cwl b/workflows/phylogeny/align.cwl
new file mode 100644
index 0000000..cb2484e
--- /dev/null
+++ b/workflows/phylogeny/align.cwl
@@ -0,0 +1,61 @@
+#!/usr/bin/env cwl-runner
+
+cwlVersion: v1.1
+
+class: CommandLineTool
+baseCommand: pggb
+
+inputs:
+ threads:
+ type: int
+ inputBinding:
+ position: 1
+ prefix: -t
+
+ pggb_wfmash:
+ type: boolean
+ inputBinding:
+ position: 2
+ prefix: --wfmash
+
+ pggb_fasta:
+ type: File
+ inputBinding:
+ position: 3
+ prefix: -i
+
+ pggb_mash_k_mer:
+ type: int
+ inputBinding:
+ position: 4
+ prefix: -K
+
+ pggb_map_percent_identity:
+ type: int
+ inputBinding:
+ position: 5
+ prefix: -p
+
+ pggb_num_secondary_mappings:
+ type: int
+ inputBinding:
+ position: 6
+ prefix: -n
+
+ pggb_segment_length:
+ type: int
+ inputBinding:
+ position: 7
+ prefix: -s
+
+ pggb_output_dir:
+ type: string
+ inputBinding:
+ position: 8
+ prefix: -o
+
+outputs:
+ pggb_odgi_graph:
+ type: File
+ outputBinding:
+ glob: '*.smooth.og'
\ No newline at end of file
diff --git a/workflows/phylogeny/augur.cwl b/workflows/phylogeny/augur.cwl
new file mode 100644
index 0000000..4676333
--- /dev/null
+++ b/workflows/phylogeny/augur.cwl
@@ -0,0 +1,36 @@
+#!/usr/bin/env cwl-runner
+
+cwlVersion: v1.1
+
+class: CommandLineTool
+baseCommand: bash
+
+requirements:
+ InitialWorkDirRequirement:
+ listing:
+ - $(inputs.dataDir)
+
+inputs:
+ nextstrain_bash_script:
+ type: File
+ inputBinding:
+ position: 1
+
+ newick_tree_2:
+ type: File
+ inputBinding:
+ position: 2
+
+ metadata_newick:
+ type: File
+ inputBinding:
+ position: 3
+
+ dataDir:
+ type: Directory
+
+outputs:
+ newick_json:
+ type: File
+ outputBinding:
+ glob: 'covid.json'
\ No newline at end of file
diff --git a/workflows/phylogeny/awk-coverage.cwl b/workflows/phylogeny/awk-coverage.cwl
new file mode 100644
index 0000000..f7a357f
--- /dev/null
+++ b/workflows/phylogeny/awk-coverage.cwl
@@ -0,0 +1,23 @@
+#!/usr/bin/env cwl-runner
+
+cwlVersion: v1.1
+class: CommandLineTool
+
+baseCommand: awk
+
+inputs:
+ consensus_regex:
+ type: string
+ inputBinding:
+ position: 1
+
+ coverage_tsv:
+ type: File
+ inputBinding:
+ position: 2
+
+outputs:
+ awk_coverage_matrix:
+ type: stdout
+
+stdout: coverage.no_consensus.tsv
\ No newline at end of file
diff --git a/workflows/phylogeny/clado-job.yml b/workflows/phylogeny/clado-job.yml
new file mode 100644
index 0000000..f8204c7
--- /dev/null
+++ b/workflows/phylogeny/clado-job.yml
@@ -0,0 +1,78 @@
+message: Hello world!
+
+threads: 16
+
+pggb_wfmash: true
+pggb_fasta:
+ class: File
+ path: ../data/qc/relabeledSeqs.sorted.qc.100sample.fasta
+pggb_mash_k_mer: 19
+pggb_map_percent_identity: 95
+pggb_num_secondary_mappings: 10000
+pggb_segment_length: 5000
+pggb_output_dir: "."
+
+odgi_paths: paths
+odgi_graph:
+ class: File
+ path: ./relabeledSeqs.sorted.qc.100sample.fasta.pggb-W-s5000-l15000-p95-n10000-a0-K19-k19-w10000-j5000-e5000-I0-R0.smooth.og
+haplotypes: true
+
+consensus_regex: '!/^Consensus/'
+coverage_tsv:
+ class: File
+ path: ./coverage.tsv
+
+main_py_script:
+ class: File
+ path: ../main.py
+metadata: get-metadata
+coverage_matrix:
+ class: File
+ path: ./coverage.no_consensus.tsv
+coverage_matrix_with_metadata: ./coverage.metadata.tsv
+
+clado-rsvd: clado-rsvd
+cladogram_matrix:
+ class: File
+ path: ./coverage.metadata.tsv
+reduced_matrix: ./coverage.reduced.tsv
+svg_figure: 30k_700cm.svg
+
+newick: gen-newick
+newick_dimensions: 100
+newick_coverage_matrix:
+ class: File
+ path: ./coverage.metadata.tsv
+newick_metadata: ./metadata.tsv
+newick_tree: ./tree.workflow.nwk
+
+nextstrain_R_script:
+ class: File
+ path: ../taxophages/viz/nextstrain.R
+
+coverage_matrix_with_metadata_2:
+ class: File
+ path: ../data/5k/covmatrix.5k.metadata.tsv
+
+metadata_only: ./metadata.tsv
+# newick_tree: already set earlier in this file (./tree.workflow.nwk); a YAML mapping must not repeat a key
+distance_matrix: distance_matrix.workflow.tsv
+rsvd_dimensions: "1000"
+filter_unknowns: "TRUE"
+
+nextstrain_bash_script:
+ class: File
+ path: ../scripts/nextstrain.sh
+
+newick_tree_2:
+ class: File
+ path: ./tree.workflow.nwk
+
+metadata_newick:
+ class: File
+ path: ./metadata.tsv
+
+dataDir:
+ class: Directory
+ path: ../config
diff --git a/workflows/phylogeny/coverage.cwl b/workflows/phylogeny/coverage.cwl
new file mode 100644
index 0000000..ed706ca
--- /dev/null
+++ b/workflows/phylogeny/coverage.cwl
@@ -0,0 +1,36 @@
+#!/usr/bin/env cwl-runner
+
+cwlVersion: v1.1
+
+class: CommandLineTool
+baseCommand: odgi
+
+inputs:
+ odgi_paths:
+ type: string
+ inputBinding:
+ position: 1
+
+ odgi_graph:
+ type: File
+ inputBinding:
+ position: 2
+ prefix: -i
+
+ haplotypes:
+ type: boolean
+ inputBinding:
+ position: 4
+ prefix: -H
+
+ threads:
+ type: int
+ inputBinding:
+ position: 5
+ prefix: -t
+
+outputs:
+ coverage_matrix:
+ type: stdout
+
+stdout: coverage.tsv
\ No newline at end of file
diff --git a/workflows/phylogeny/metadata.cwl b/workflows/phylogeny/metadata.cwl
new file mode 100644
index 0000000..4ce6e17
--- /dev/null
+++ b/workflows/phylogeny/metadata.cwl
@@ -0,0 +1,33 @@
+#!/usr/bin/env cwl-runner
+
+cwlVersion: v1.1
+
+class: CommandLineTool
+baseCommand: python
+
+inputs:
+ main_py_script:
+ type: File
+ inputBinding:
+ position: 1
+
+ metadata:
+ type: string
+ inputBinding:
+ position: 2
+
+ coverage_matrix:
+ type: File
+ inputBinding:
+ position: 3
+
+ coverage_matrix_with_metadata:
+ type: string
+ inputBinding:
+ position: 4
+
+outputs:
+ coverage_matrix_with_metadata_out:
+ type: File
+ outputBinding:
+ glob: '*.metadata.tsv'
\ No newline at end of file
diff --git a/workflows/phylogeny/newick.cwl b/workflows/phylogeny/newick.cwl
new file mode 100644
index 0000000..e1e78f7
--- /dev/null
+++ b/workflows/phylogeny/newick.cwl
@@ -0,0 +1,49 @@
+#!/usr/bin/env cwl-runner
+
+cwlVersion: v1.1
+
+class: CommandLineTool
+baseCommand: python
+
+inputs:
+ main_py_script:
+ type: File
+ inputBinding:
+ position: 1
+
+ newick:
+ type: string
+ inputBinding:
+ position: 2
+
+ newick_dimensions:
+ type: int
+ inputBinding:
+ position: 3
+ prefix: -d
+
+ newick_coverage_matrix:
+ type: File
+ inputBinding:
+ position: 3
+
+ newick_metadata:
+ type: string
+ inputBinding:
+ position: 4
+
+ newick_tree:
+ type: string
+ inputBinding:
+ position: 5
+
+outputs:
+ metadata_out:
+ type: File
+ outputBinding:
+ glob: 'metadata.tsv'
+
+ newick_tree_out:
+ type: File
+ outputBinding:
+ glob: '*.nwk'
\ No newline at end of file
diff --git a/workflows/phylogeny/phylogeny.cwl b/workflows/phylogeny/phylogeny.cwl
new file mode 100644
index 0000000..7ae3ab7
--- /dev/null
+++ b/workflows/phylogeny/phylogeny.cwl
@@ -0,0 +1,118 @@
+#!/usr/bin/env cwl-runner
+
+cwlVersion: v1.1
+class: Workflow
+
+#############################################
+
+inputs:
+ # Workflow-level inputs, grouped by the step that consumes them; each name is declared only once.
+ # align
+ threads: int
+ pggb_wfmash: boolean
+ pggb_fasta: File
+ pggb_mash_k_mer: int
+ pggb_map_percent_identity: int
+ pggb_num_secondary_mappings: int
+ pggb_segment_length: int
+ pggb_output_dir: string
+
+ # extract coverage vector
+ odgi_paths: string
+ odgi_graph: File
+ haplotypes: boolean
+ # threads: declared once under "align"; the odgi step binds that same input
+
+ # remove consensus paths
+ consensus_regex: string
+ coverage_tsv: File
+
+ # Get metadata
+ main_py_script: File
+ metadata: string
+ coverage_matrix: File
+ coverage_matrix_with_metadata: string
+
+ # Generate newick tree
+ # main_py_script: declared once under "Get metadata"; the newick step binds that same input
+ newick: string
+ newick_dimensions: int
+ newick_coverage_matrix: File
+ newick_metadata: string
+ newick_tree: string
+
+ # Generate augur JSON file
+ nextstrain_bash_script: File
+ newick_tree_2: File
+ metadata_newick: File
+ dataDir: Directory
+
+
+#############################################
+
+outputs:
+ augur_json:
+ type: File
+ outputSource: augur/newick_json
+
+#############################################
+
+steps:
+ align:
+ run: align.cwl
+ in:
+ threads: threads
+ pggb_wfmash: pggb_wfmash
+ pggb_fasta: pggb_fasta
+ pggb_mash_k_mer: pggb_mash_k_mer
+ pggb_map_percent_identity: pggb_map_percent_identity
+ pggb_num_secondary_mappings: pggb_num_secondary_mappings
+ pggb_segment_length: pggb_segment_length
+ pggb_output_dir: pggb_output_dir
+ out: [pggb_odgi_graph]
+
+ odgi:
+ run: coverage.cwl
+ in:
+ odgi_paths: odgi_paths
+ odgi_graph: align/pggb_odgi_graph
+ haplotypes: haplotypes
+ threads: threads
+ out: [coverage_matrix]
+
+ awk:
+ run: awk-coverage.cwl
+ in:
+ consensus_regex: consensus_regex
+ coverage_tsv: odgi/coverage_matrix
+ out: [awk_coverage_matrix]
+
+ metadata:
+ run: metadata.cwl
+ in:
+ main_py_script: main_py_script
+ metadata: metadata
+ coverage_matrix: awk/awk_coverage_matrix
+ coverage_matrix_with_metadata: coverage_matrix_with_metadata
+ out: [coverage_matrix_with_metadata_out]
+
+ newick:
+ run: newick.cwl
+ in:
+ main_py_script: main_py_script
+ newick: newick
+ newick_dimensions: newick_dimensions
+ newick_coverage_matrix: metadata/coverage_matrix_with_metadata_out
+ newick_metadata: newick_metadata
+ newick_tree: newick_tree
+ out: [metadata_out, newick_tree_out]
+
+ augur:
+ run: augur.cwl
+ in:
+ nextstrain_bash_script: nextstrain_bash_script
+ newick_tree_2: newick/newick_tree_out
+ metadata_newick: newick/metadata_out
+ dataDir: dataDir
+
+ out: [newick_json]