From 4842117f530aba08b4a253aee533a8d4802b1c12 Mon Sep 17 00:00:00 2001
From: Njagi Mwaniki
Date: Tue, 26 Jan 2021 13:44:01 +0300
Subject: Add phylogeny workflow

Signed-off-by: Njagi Mwaniki
---
 workflows/phylogeny/README.md        |  38 +++++++++++
 workflows/phylogeny/align.cwl        |  61 ++++++++++++++++++
 workflows/phylogeny/augur.cwl        |  36 +++++++++++
 workflows/phylogeny/awk-coverage.cwl |  23 +++++++
 workflows/phylogeny/clado-job.yml    |  78 +++++++++++++++++++++++
 workflows/phylogeny/coverage.cwl     |  36 +++++++++++
 workflows/phylogeny/metadata.cwl     |  33 ++++++++++
 workflows/phylogeny/newick.cwl       |  49 +++++++++++++++
 workflows/phylogeny/phylogeny.cwl    | 118 +++++++++++++++++++++++++++++++++++
 9 files changed, 472 insertions(+)
 create mode 100644 workflows/phylogeny/README.md
 create mode 100644 workflows/phylogeny/align.cwl
 create mode 100644 workflows/phylogeny/augur.cwl
 create mode 100644 workflows/phylogeny/awk-coverage.cwl
 create mode 100644 workflows/phylogeny/clado-job.yml
 create mode 100644 workflows/phylogeny/coverage.cwl
 create mode 100644 workflows/phylogeny/metadata.cwl
 create mode 100644 workflows/phylogeny/newick.cwl
 create mode 100644 workflows/phylogeny/phylogeny.cwl

diff --git a/workflows/phylogeny/README.md b/workflows/phylogeny/README.md
new file mode 100644
index 0000000..6ecc047
--- /dev/null
+++ b/workflows/phylogeny/README.md
@@ -0,0 +1,38 @@
+A workflow to generate a phylogeny that can be visualized using [auspice](https://github.com/urbanslug/auspice).
+Expects a multi-fasta file path at [pggb_fasta][1] and generates a tree in `json` format.
+
+#### Dependencies
+
+Depends on:
+ - [pggb](https://github.com/pangenome/pggb/blob/master/pggb)
+   * [wfmash](https://github.com/ekg/wfmash)
+   * [seqwish](https://github.com/ekg/seqwish)
+   * [smoothxg](https://github.com/pangenome/smoothxg)
+   * [odgi](https://github.com/vgteam/odgi)
+
+ - [taxophages](https://github.com/urbanslug/taxophages/)
+   * Clone and run with `python main.py ...`
+
+ - [augur](https://github.com/nextstrain/augur)
+
+
+#### Running
+
+Expects that taxophages is cloned in the parent directory, but you can update the path [main_py_script][2] to wherever it is.
+
+Run the phylogeny workflow with the commands below after specifying your path to [pggb_fasta][1].
+
+```bash
+export R_PACKAGES="${HOME}/RLibraries"  # a directory holding R packages. Needed if R packages were installed using install.packages on the server, e.g. https://github.com/urbanslug/taxophages/blob/master/scripts/deps.R
+export TAXOPHAGES_ENV=server            # helps taxophages figure out where it is being run
+export AUGUR_RECURSION_LIMIT=30000      # augur isn't used to working with so many nested values
+cwltool --preserve-entire-environment --no-container phylogeny.cwl clado-job.yml
+```
+
+Alternatively run any workflow with
+```
+cwltool --no-container <workflow>.cwl clado-job.yml
+```
+
+[1]: clado-job.yml#L8
+[2]: clado-job.yml#L28
diff --git a/workflows/phylogeny/align.cwl b/workflows/phylogeny/align.cwl
new file mode 100644
index 0000000..cb2484e
--- /dev/null
+++ b/workflows/phylogeny/align.cwl
@@ -0,0 +1,61 @@
+#!/usr/bin/env cwl-runner
+
+cwlVersion: v1.1
+
+class: CommandLineTool
+baseCommand: pggb
+
+inputs:
+  threads:
+    type: int
+    inputBinding:
+      position: 1
+      prefix: -t
+
+  pggb_wfmash:
+    type: boolean
+    inputBinding:
+      position: 2
+      prefix: --wfmash
+
+  pggb_fasta:
+    type: File
+    inputBinding:
+      position: 3
+      prefix: -i
+
+  pggb_mash_k_mer:
+    type: int
+    inputBinding:
+      position: 4
+      prefix: -K
+
+  pggb_map_percent_identity:
+    type: int
+    inputBinding:
+      position: 5
+      prefix: -p
+
+  pggb_num_secondary_mappings:
+    type: int
+    inputBinding:
+      position: 6
+      prefix: -n
+
+  pggb_segment_length:
+    type: int
+    inputBinding:
+      position: 7
+      prefix: -s
+
+  pggb_output_dir:
+    type: string
+    inputBinding:
+      position: 8
+      prefix: -o
+
+outputs:
+  pggb_odgi_graph:
+    type: File
+    outputBinding:
+      glob: '*.smooth.og'
\ No newline at end of file
diff --git a/workflows/phylogeny/augur.cwl b/workflows/phylogeny/augur.cwl
new file mode 100644
index 0000000..4676333
--- /dev/null
+++ b/workflows/phylogeny/augur.cwl
@@ -0,0 +1,36 @@
+#!/usr/bin/env cwl-runner
+
+cwlVersion: v1.1
+
+class: CommandLineTool
+baseCommand: bash
+
+requirements:
+  InitialWorkDirRequirement:
+    listing:
+      - $(inputs.dataDir)  # stage the dataDir input into the working directory
+
+inputs:
+  nextstrain_bash_script:
+    type: File
+    inputBinding:
+      position: 1
+
+  newick_tree_2:
+    type: File
+    inputBinding:
+      position: 2
+
+  metadata_newick:
+    type: File
+    inputBinding:
+      position: 3
+
+  dataDir:  # no inputBinding: never placed on the command line, only staged above
+    type: Directory
+
+outputs:
+  newick_json:
+    type: File
+    outputBinding:
+      glob: 'covid.json'
\ No newline at end of file
diff --git a/workflows/phylogeny/awk-coverage.cwl b/workflows/phylogeny/awk-coverage.cwl
new file mode 100644
index 0000000..f7a357f
--- /dev/null
+++ b/workflows/phylogeny/awk-coverage.cwl
@@ -0,0 +1,23 @@
+#!/usr/bin/env cwl-runner
+
+cwlVersion: v1.1
+class: CommandLineTool
+
+baseCommand: awk
+
+inputs:
+  consensus_regex:
+    type: string
+    inputBinding:
+      position: 1
+
+  coverage_tsv:
+    type: File
+    inputBinding:
+      position: 2
+
+outputs:
+  awk_coverage_matrix:
+    type: stdout
+
+stdout: coverage.no_consensus.tsv
\ No newline at end of file
diff --git a/workflows/phylogeny/clado-job.yml b/workflows/phylogeny/clado-job.yml
new file mode 100644
index 0000000..f8204c7
--- /dev/null
+++ b/workflows/phylogeny/clado-job.yml
@@ -0,0 +1,78 @@
+message: Hello world!
+
+threads: 16
+
+pggb_wfmash: true
+pggb_fasta:
+  class: File
+  path: ../data/qc/relabeledSeqs.sorted.qc.100sample.fasta
+pggb_mash_k_mer: 19
+pggb_map_percent_identity: 95
+pggb_num_secondary_mappings: 10000
+pggb_segment_length: 5000
+pggb_output_dir: "."
+
+odgi_paths: paths
+odgi_graph:
+  class: File
+  path: ./relabeledSeqs.sorted.qc.100sample.fasta.pggb-W-s5000-l15000-p95-n10000-a0-K19-k19-w10000-j5000-e5000-I0-R0.smooth.og
+haplotypes: true
+
+consensus_regex: '!/^Consensus/'
+coverage_tsv:
+  class: File
+  path: ./coverage.tsv
+
+main_py_script:
+  class: File
+  path: ../main.py
+metadata: get-metadata
+coverage_matrix:
+  class: File
+  path: ./coverage.no_consensus.tsv
+coverage_matrix_with_metadata: ./coverage.metadata.tsv
+
+clado-rsvd: clado-rsvd
+cladogram_matrix:
+  class: File
+  path: ./coverage.metadata.tsv
+reduced_matrix: ./coverage.reduced.tsv
+svg_figure: 30k_700cm.svg
+
+newick: gen-newick
+newick_dimensions: 100
+newick_coverage_matrix:
+  class: File
+  path: ./coverage.metadata.tsv
+newick_metadata: ./metadata.tsv
+newick_tree: ./tree.workflow.nwk
+
+nextstrain_R_script:
+  class: File
+  path: ../taxophages/viz/nextstrain.R
+
+coverage_matrix_with_metadata_2:
+  class: File
+  path: ../data/5k/covmatrix.5k.metadata.tsv
+
+metadata_only: ./metadata.tsv
+# newick_tree: tree.workflow.nwk  (duplicate key; already set to the same file in the newick section)
+distance_matrix: distance_matrix.workflow.tsv
+rsvd_dimensions: "1000"
+filter_unknowns: "TRUE"
+
+nextstrain_bash_script:
+  class: File
+  path: ../scripts/nextstrain.sh
+
+newick_tree_2:
+  class: File
+  path: ./tree.workflow.nwk
+
+metadata_newick:
+  class: File
+  path: ./metadata.tsv
+
+dataDir:
+  class: Directory
+  path: ../config
diff --git a/workflows/phylogeny/coverage.cwl b/workflows/phylogeny/coverage.cwl
new file mode 100644
index 0000000..ed706ca
--- /dev/null
+++ b/workflows/phylogeny/coverage.cwl
@@ -0,0 +1,36 @@
+#!/usr/bin/env cwl-runner
+
+cwlVersion: v1.1
+
+class: CommandLineTool
+baseCommand: odgi
+
+inputs:
+  odgi_paths:
+    type: string
+    inputBinding:
+      position: 1
+
+  odgi_graph:
+    type: File
+    inputBinding:
+      position: 2
+      prefix: -i
+
+  haplotypes:
+    type: boolean
+    inputBinding:
+      position: 4
+      prefix: -H
+
+  threads:
+    type: int
+    inputBinding:
+      position: 5
+      prefix: -t
+
+outputs:
+  coverage_matrix:
+    type: stdout
+
+stdout: coverage.tsv
\ No newline at end of file
diff --git a/workflows/phylogeny/metadata.cwl b/workflows/phylogeny/metadata.cwl
new file mode 100644
index 0000000..4ce6e17
--- /dev/null
+++ b/workflows/phylogeny/metadata.cwl
@@ -0,0 +1,33 @@
+#!/usr/bin/env cwl-runner
+
+cwlVersion: v1.1
+
+class: CommandLineTool
+baseCommand: python
+
+inputs:
+  main_py_script:
+    type: File
+    inputBinding:
+      position: 1
+
+  metadata:
+    type: string
+    inputBinding:
+      position: 2
+
+  coverage_matrix:
+    type: File
+    inputBinding:
+      position: 3
+
+  coverage_matrix_with_metadata:
+    type: string
+    inputBinding:
+      position: 4
+
+outputs:
+  coverage_matrix_with_metadata_out:
+    type: File
+    outputBinding:
+      glob: '*.metadata.tsv'
\ No newline at end of file
diff --git a/workflows/phylogeny/newick.cwl b/workflows/phylogeny/newick.cwl
new file mode 100644
index 0000000..e1e78f7
--- /dev/null
+++ b/workflows/phylogeny/newick.cwl
@@ -0,0 +1,49 @@
+#!/usr/bin/env cwl-runner
+
+cwlVersion: v1.1
+
+class: CommandLineTool
+baseCommand: python
+
+inputs:
+  main_py_script:
+    type: File
+    inputBinding:
+      position: 1
+
+  newick:
+    type: string
+    inputBinding:
+      position: 2
+
+  newick_dimensions:
+    type: int
+    inputBinding:
+      position: 4  # was 3, tied with newick_coverage_matrix; ties sort by input name, so the matrix already came first
+      prefix: -d
+
+  newick_coverage_matrix:
+    type: File
+    inputBinding:
+      position: 3
+
+  newick_metadata:
+    type: string
+    inputBinding:
+      position: 5  # shifted by one to keep positions unique; relative order unchanged
+
+  newick_tree:
+    type: string
+    inputBinding:
+      position: 6  # shifted by one to keep positions unique; relative order unchanged
+
+outputs:
+  metadata_out:
+    type: File
+    outputBinding:
+      glob: 'metadata.tsv'
+
+  newick_tree_out:
+    type: File
+    outputBinding:
+      glob: '*.nwk'
\ No newline at end of file
diff --git a/workflows/phylogeny/phylogeny.cwl b/workflows/phylogeny/phylogeny.cwl
new file mode 100644
index 0000000..7ae3ab7
--- /dev/null
+++ b/workflows/phylogeny/phylogeny.cwl
@@ -0,0 +1,118 @@
+#!/usr/bin/env cwl-runner
+
+cwlVersion: v1.1
+class: Workflow
+
+#############################################
+
+inputs:
+
+  # align
+  threads: int
+  pggb_wfmash: boolean
+  pggb_fasta: File
+  pggb_mash_k_mer: int
+  pggb_map_percent_identity: int
+  pggb_num_secondary_mappings: int
+  pggb_segment_length: int
+  pggb_output_dir: string
+
+  # extract coverage vector
+  odgi_paths: string
+  odgi_graph: File  # NOTE(review): no step reads this input; 'odgi' takes align/pggb_odgi_graph
+  haplotypes: boolean
+  # threads: int  (duplicate key; declared once under "align" and shared by the align and odgi steps)
+
+  # remove consensus paths
+  consensus_regex: string
+  coverage_tsv: File  # NOTE(review): no step reads this input; 'awk' takes odgi/coverage_matrix
+
+  # Get metadata
+  main_py_script: File
+  metadata: string
+  coverage_matrix: File  # NOTE(review): no step reads this input; 'metadata' takes awk/awk_coverage_matrix
+  coverage_matrix_with_metadata: string
+
+  # Generate newick tree
+  # main_py_script: File  (duplicate key; declared once under "Get metadata" and shared by the metadata and newick steps)
+  newick: string
+  newick_dimensions: int
+  newick_coverage_matrix: File  # NOTE(review): no step reads this input; 'newick' takes metadata/coverage_matrix_with_metadata_out
+  newick_metadata: string
+  newick_tree: string
+
+  # Generate augur JSON file
+  nextstrain_bash_script: File
+  newick_tree_2: File  # NOTE(review): no step reads this input; 'augur' takes newick/newick_tree_out
+  metadata_newick: File  # NOTE(review): no step reads this input; 'augur' takes newick/metadata_out
+  dataDir: Directory
+
+
+#############################################
+
+outputs:
+  augur_json:
+    type: File
+    outputSource: augur/newick_json
+
+#############################################
+
+steps:
+  align:
+    run: align.cwl
+    in:
+      threads: threads
+      pggb_wfmash: pggb_wfmash
+      pggb_fasta: pggb_fasta
+      pggb_mash_k_mer: pggb_mash_k_mer
+      pggb_map_percent_identity: pggb_map_percent_identity
+      pggb_num_secondary_mappings: pggb_num_secondary_mappings
+      pggb_segment_length: pggb_segment_length
+      pggb_output_dir: pggb_output_dir
+    out: [pggb_odgi_graph]
+
+  odgi:
+    run: coverage.cwl
+    in:
+      odgi_paths: odgi_paths
+      odgi_graph: align/pggb_odgi_graph
+      haplotypes: haplotypes
+      threads: threads
+    out: [coverage_matrix]
+
+  awk:
+    run: awk-coverage.cwl
+    in:
+      consensus_regex: consensus_regex
+      coverage_tsv: odgi/coverage_matrix
+    out: [awk_coverage_matrix]
+
+  metadata:
+    run: metadata.cwl
+    in:
+      main_py_script: main_py_script
+      metadata: metadata
+      coverage_matrix: awk/awk_coverage_matrix
+      coverage_matrix_with_metadata: coverage_matrix_with_metadata
+    out: [coverage_matrix_with_metadata_out]
+
+  newick:
+    run: newick.cwl
+    in:
+      main_py_script: main_py_script
+      newick: newick
+      newick_dimensions: newick_dimensions
+      newick_coverage_matrix: metadata/coverage_matrix_with_metadata_out
+      newick_metadata: newick_metadata
+      newick_tree: newick_tree
+    out: [metadata_out, newick_tree_out]
+
+  augur:
+    run: augur.cwl
+    in:
+      nextstrain_bash_script: nextstrain_bash_script
+      newick_tree_2: newick/newick_tree_out
+      metadata_newick: newick/metadata_out
+      dataDir: dataDir
+
+    out: [newick_json]
-- 
cgit v1.2.3