about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--workflows/phylogeny/README.md38
-rw-r--r--workflows/phylogeny/align.cwl61
-rw-r--r--workflows/phylogeny/augur.cwl36
-rw-r--r--workflows/phylogeny/awk-coverage.cwl23
-rw-r--r--workflows/phylogeny/clado-job.yml78
-rw-r--r--workflows/phylogeny/coverage.cwl36
-rw-r--r--workflows/phylogeny/metadata.cwl33
-rw-r--r--workflows/phylogeny/newick.cwl49
-rw-r--r--workflows/phylogeny/phylogeny.cwl118
9 files changed, 472 insertions, 0 deletions
diff --git a/workflows/phylogeny/README.md b/workflows/phylogeny/README.md
new file mode 100644
index 0000000..6ecc047
--- /dev/null
+++ b/workflows/phylogeny/README.md
@@ -0,0 +1,38 @@
+A workflow to generate a phylogeny that can be visualized using [auspice](https://github.com/urbanslug/auspice).  
+Expects a multi-fasta file path at [pggb_fasta][1] and generates a tree in `json` format.
+
+#### Dependencies
+
+Depends on:
+ - [pggb](https://github.com/pangenome/pggb/blob/master/pggb)
+   * [wfmash](https://github.com/ekg/wfmash)
+   * [seqwish](https://github.com/ekg/seqwish)
+   * [smoothxg](https://github.com/pangenome/smoothxg)
+   * [odgi](https://github.com/vgteam/odgi)
+
+ - [taxophages](https://github.com/urbanslug/taxophages/)
+   * Clone and run with `python main.py ...`
+
+ - [augur](https://github.com/nextstrain/augur)
+
+
+#### Running
+
+Expects taxophages to be cloned in the parent directory (so that its `main.py` is at `../main.py`), but you can update the path [main_py_script][2] to wherever it is.
+
+Run the phylogeny workflow with the command below after specifying your path to [pggb_fasta][1].
+
+```bash
+# R_PACKAGES: a directory holding R packages. Needed if R packages were installed using install.packages on the server, e.g. https://github.com/urbanslug/taxophages/blob/master/scripts/deps.R
+# TAXOPHAGES_ENV: helps taxophages figure out where it is being run
+# AUGUR_RECURSION_LIMIT: augur isn't used to working with so many nested values
+R_PACKAGES="${HOME}/RLibraries" TAXOPHAGES_ENV=server AUGUR_RECURSION_LIMIT=30000 cwltool --preserve-entire-environment --no-container phylogeny.cwl clado-job.yml
+```
+
+Alternatively run any workflow with
+```
+cwltool --no-container <workflow>.cwl clado-job.yml
+```
+
+[1]: clado-job.yml#L8
+[2]: clado-job.yml#L28
diff --git a/workflows/phylogeny/align.cwl b/workflows/phylogeny/align.cwl
new file mode 100644
index 0000000..cb2484e
--- /dev/null
+++ b/workflows/phylogeny/align.cwl
@@ -0,0 +1,61 @@
+#!/usr/bin/env cwl-runner
+
+cwlVersion: v1.1
+
+class: CommandLineTool  # one pggb invocation; arguments are ordered by `position`
+baseCommand: pggb
+
+inputs:
+  threads:  # bound as -t <int>
+    type: int
+    inputBinding:
+      prefix: "-t"
+      position: 1
+
+  pggb_wfmash:  # boolean switch bound to --wfmash
+    type: boolean
+    inputBinding:
+      prefix: "--wfmash"
+      position: 2
+
+  pggb_fasta:  # bound as -i <file>
+    type: File
+    inputBinding:
+      prefix: "-i"
+      position: 3
+
+  pggb_mash_k_mer:  # bound as -K <int>
+    type: int
+    inputBinding:
+      prefix: "-K"
+      position: 4
+
+  pggb_map_percent_identity:  # bound as -p <int>
+    type: int
+    inputBinding:
+      prefix: "-p"
+      position: 5
+
+  pggb_num_secondary_mappings:  # bound as -n <int>
+    type: int
+    inputBinding:
+      prefix: "-n"
+      position: 6
+
+  pggb_segment_length:  # bound as -s <int>
+    type: int
+    inputBinding:
+      prefix: "-s"
+      position: 7
+
+  pggb_output_dir:  # bound as -o <string>
+    type: string
+    inputBinding:
+      prefix: "-o"
+      position: 8
+
+outputs:
+  pggb_odgi_graph:  # the file matched by the *.smooth.og glob
+    type: File
+    outputBinding:
+      glob: "*.smooth.og"
\ No newline at end of file
diff --git a/workflows/phylogeny/augur.cwl b/workflows/phylogeny/augur.cwl
new file mode 100644
index 0000000..4676333
--- /dev/null
+++ b/workflows/phylogeny/augur.cwl
@@ -0,0 +1,36 @@
+#!/usr/bin/env cwl-runner
+
+cwlVersion: v1.1
+
+class: CommandLineTool  # runs: bash <script> <newick tree> <metadata>
+baseCommand: bash
+
+requirements:
+  InitialWorkDirRequirement:
+    listing:  # dataDir has no inputBinding; it is only listed here
+      - $(inputs.dataDir)
+
+inputs:
+  nextstrain_bash_script:  # first positional argument to bash
+    type: File
+    inputBinding:
+      position: 1
+
+  newick_tree_2:  # second positional argument
+    type: File
+    inputBinding:
+      position: 2
+
+  metadata_newick:  # third positional argument
+    type: File
+    inputBinding:
+      position: 3
+
+  dataDir:  # referenced by the InitialWorkDirRequirement listing above
+    type: Directory
+
+outputs:
+  newick_json:  # collected by the literal file name covid.json
+    type: File
+    outputBinding:
+      glob: "covid.json"
\ No newline at end of file
diff --git a/workflows/phylogeny/awk-coverage.cwl b/workflows/phylogeny/awk-coverage.cwl
new file mode 100644
index 0000000..f7a357f
--- /dev/null
+++ b/workflows/phylogeny/awk-coverage.cwl
@@ -0,0 +1,23 @@
+#!/usr/bin/env cwl-runner
+
+cwlVersion: v1.1
+class: CommandLineTool  # runs: awk <consensus_regex> <coverage_tsv> > coverage.no_consensus.tsv
+
+baseCommand: awk
+
+inputs:
+  consensus_regex:  # the awk program text, passed as the first argument
+    type: string
+    inputBinding:
+      position: 1
+
+  coverage_tsv:  # the file awk reads, passed as the second argument
+    type: File
+    inputBinding:
+      position: 2
+
+outputs:
+  awk_coverage_matrix:  # whatever awk prints, captured via `stdout` below
+    type: stdout
+
+stdout: coverage.no_consensus.tsv
\ No newline at end of file
diff --git a/workflows/phylogeny/clado-job.yml b/workflows/phylogeny/clado-job.yml
new file mode 100644
index 0000000..f8204c7
--- /dev/null
+++ b/workflows/phylogeny/clado-job.yml
@@ -0,0 +1,78 @@
+message: Hello world!  # NOTE(review): not declared as an input by any CWL file in this change
+
+threads: 16  # align.cwl and coverage.cwl both bind this as -t
+
+pggb_wfmash: true  # align.cwl (pggb) inputs start here
+pggb_fasta:
+  class: File
+  path: ../data/qc/relabeledSeqs.sorted.qc.100sample.fasta
+pggb_mash_k_mer: 19
+pggb_map_percent_identity: 95
+pggb_num_secondary_mappings: 10000
+pggb_segment_length: 5000
+pggb_output_dir: "."
+
+odgi_paths: paths  # coverage.cwl (odgi) inputs; this is the first argument after `odgi`
+odgi_graph:
+  class: File
+  path: ./relabeledSeqs.sorted.qc.100sample.fasta.pggb-W-s5000-l15000-p95-n10000-a0-K19-k19-w10000-j5000-e5000-I0-R0.smooth.og
+haplotypes: true
+
+consensus_regex: '!/^Consensus/'  # awk-coverage.cwl inputs; quoted because it starts with `!`
+coverage_tsv:
+  class: File
+  path: ./coverage.tsv
+
+main_py_script:  # metadata.cwl and newick.cwl both run this script with python
+  class: File
+  path: ../main.py
+metadata: get-metadata
+coverage_matrix:
+  class: File
+  path: ./coverage.no_consensus.tsv
+coverage_matrix_with_metadata: ./coverage.metadata.tsv
+
+clado-rsvd: clado-rsvd  # NOTE(review): this group is not read by any CWL file in this change
+cladogram_matrix:
+  class: File
+  path: ./coverage.metadata.tsv
+reduced_matrix: ./coverage.reduced.tsv
+svg_figure: 30k_700cm.svg
+
+newick: gen-newick  # newick.cwl inputs
+newick_dimensions: 100
+newick_coverage_matrix:
+  class: File
+  path: ./coverage.metadata.tsv
+newick_metadata: ./metadata.tsv
+newick_tree: ./tree.workflow.nwk  # single definition; a second one further down was a duplicate key
+
+nextstrain_R_script:  # NOTE(review): not read by any CWL file in this change
+  class: File
+  path: ../taxophages/viz/nextstrain.R
+
+coverage_matrix_with_metadata_2:
+  class: File
+  path: ../data/5k/covmatrix.5k.metadata.tsv
+
+metadata_only: ./metadata.tsv
+# newick_tree was repeated here (value tree.workflow.nwk); duplicate keys are invalid YAML, so it is set once above
+distance_matrix: distance_matrix.workflow.tsv
+rsvd_dimensions: "1000"
+filter_unknowns: "TRUE"
+
+nextstrain_bash_script:  # augur.cwl inputs: script, tree, metadata and dataDir
+  class: File
+  path: ../scripts/nextstrain.sh
+
+newick_tree_2:
+  class: File
+  path: ./tree.workflow.nwk
+
+metadata_newick:
+  class: File
+  path: ./metadata.tsv
+
+dataDir:
+  class: Directory
+  path: ../config
diff --git a/workflows/phylogeny/coverage.cwl b/workflows/phylogeny/coverage.cwl
new file mode 100644
index 0000000..ed706ca
--- /dev/null
+++ b/workflows/phylogeny/coverage.cwl
@@ -0,0 +1,36 @@
+#!/usr/bin/env cwl-runner
+
+cwlVersion: v1.1
+
+class: CommandLineTool  # runs: odgi <odgi_paths> -i <graph> [-H] -t <n> > coverage.tsv
+baseCommand: odgi
+
+inputs:
+  odgi_paths:  # plain string placed directly after `odgi`
+    type: string
+    inputBinding:
+      position: 1
+
+  odgi_graph:  # bound as -i <file>
+    type: File
+    inputBinding:
+      prefix: "-i"
+      position: 2
+
+  haplotypes:  # boolean switch bound to -H (position 3 is unused)
+    type: boolean
+    inputBinding:
+      prefix: "-H"
+      position: 4
+
+  threads:  # bound as -t <int>
+    type: int
+    inputBinding:
+      prefix: "-t"
+      position: 5
+
+outputs:
+  coverage_matrix:  # standard output of the command, saved under the name below
+    type: stdout
+
+stdout: coverage.tsv
\ No newline at end of file
diff --git a/workflows/phylogeny/metadata.cwl b/workflows/phylogeny/metadata.cwl
new file mode 100644
index 0000000..4ce6e17
--- /dev/null
+++ b/workflows/phylogeny/metadata.cwl
@@ -0,0 +1,33 @@
+#!/usr/bin/env cwl-runner
+
+cwlVersion: v1.1
+
+class: CommandLineTool  # runs: python <script> <metadata> <matrix> <output name>
+baseCommand: python
+
+inputs:
+  main_py_script:  # script handed to python as its first argument
+    type: File
+    inputBinding:
+      position: 1
+
+  metadata:  # string argument following the script path
+    type: string
+    inputBinding:
+      position: 2
+
+  coverage_matrix:  # input file argument
+    type: File
+    inputBinding:
+      position: 3
+
+  coverage_matrix_with_metadata:  # passed as a string, not a File
+    type: string
+    inputBinding:
+      position: 4
+
+outputs:
+  coverage_matrix_with_metadata_out:  # picked up by glob rather than by the name passed in
+    type: File
+    outputBinding:
+      glob: "*.metadata.tsv"
\ No newline at end of file
diff --git a/workflows/phylogeny/newick.cwl b/workflows/phylogeny/newick.cwl
new file mode 100644
index 0000000..e1e78f7
--- /dev/null
+++ b/workflows/phylogeny/newick.cwl
@@ -0,0 +1,49 @@
+#!/usr/bin/env cwl-runner
+
+cwlVersion: v1.1
+
+class: CommandLineTool  # runs python on main_py_script with the arguments bound below
+baseCommand: python
+
+inputs:
+  main_py_script:  # script handed to python as its first argument
+    type: File
+    inputBinding:
+      position: 1
+
+  newick:  # string argument following the script path
+    type: string
+    inputBinding:
+      position: 2
+
+  newick_dimensions:  # NOTE(review): same position as newick_coverage_matrix; confirm the intended order
+    type: int
+    inputBinding:
+      prefix: "-d"
+      position: 3
+
+  newick_coverage_matrix:  # input file argument (also position 3)
+    type: File
+    inputBinding:
+      position: 3
+
+  newick_metadata:  # passed as a string, not a File
+    type: string
+    inputBinding:
+      position: 4
+
+  newick_tree:  # passed as a string, not a File
+    type: string
+    inputBinding:
+      position: 5
+
+outputs:
+  metadata_out:  # collected by the literal file name metadata.tsv
+    type: File
+    outputBinding:
+      glob: "metadata.tsv"
+
+  newick_tree_out:  # collected by extension, independent of the newick_tree value
+    type: File
+    outputBinding:
+      glob: "*.nwk"
\ No newline at end of file
diff --git a/workflows/phylogeny/phylogeny.cwl b/workflows/phylogeny/phylogeny.cwl
new file mode 100644
index 0000000..7ae3ab7
--- /dev/null
+++ b/workflows/phylogeny/phylogeny.cwl
@@ -0,0 +1,118 @@
+#!/usr/bin/env cwl-runner
+
+cwlVersion: v1.1
+class: Workflow  # step chain: align -> odgi -> awk -> metadata -> newick -> augur
+
+#############################################
+
+inputs:
+
+  # align
+  threads: int
+  pggb_wfmash: boolean
+  pggb_fasta: File
+  pggb_mash_k_mer: int
+  pggb_map_percent_identity: int
+  pggb_num_secondary_mappings: int
+  pggb_segment_length: int
+  pggb_output_dir: string
+
+  # extract coverage vector
+  odgi_paths: string
+  odgi_graph: File  # NOTE(review): no step reads this; odgi takes align/pggb_odgi_graph
+  haplotypes: boolean
+  # threads: declared once under "align" above; repeating the key here was a duplicate
+
+  # remove consensus paths
+  consensus_regex: string
+  coverage_tsv: File  # NOTE(review): no step reads this; awk takes odgi/coverage_matrix
+
+  # Get metadata
+  main_py_script: File
+  metadata: string
+  coverage_matrix: File  # NOTE(review): no step reads this; metadata takes awk/awk_coverage_matrix
+  coverage_matrix_with_metadata: string
+
+  # Generate newick tree
+  # main_py_script: declared once under "Get metadata" above; repeating the key here was a duplicate
+  newick: string
+  newick_dimensions: int
+  newick_coverage_matrix: File  # NOTE(review): no step reads this; newick takes the metadata step output
+  newick_metadata: string
+  newick_tree: string
+
+  # Generate augur JSON file
+  nextstrain_bash_script: File
+  newick_tree_2: File  # NOTE(review): no step reads this; augur takes newick/newick_tree_out
+  metadata_newick: File  # NOTE(review): no step reads this; augur takes newick/metadata_out
+  dataDir: Directory
+
+
+#############################################
+
+outputs:
+  augur_json:  # the only workflow output: the JSON file collected by the augur step
+    type: File
+    outputSource: augur/newick_json
+
+#############################################
+
+steps:
+  align:
+    run: align.cwl
+    in:
+      threads: threads
+      pggb_wfmash: pggb_wfmash
+      pggb_fasta: pggb_fasta
+      pggb_mash_k_mer: pggb_mash_k_mer
+      pggb_map_percent_identity: pggb_map_percent_identity
+      pggb_num_secondary_mappings: pggb_num_secondary_mappings
+      pggb_segment_length: pggb_segment_length
+      pggb_output_dir: pggb_output_dir
+    out: [pggb_odgi_graph]
+
+  odgi:
+    run: coverage.cwl
+    in:
+      odgi_paths: odgi_paths
+      odgi_graph: align/pggb_odgi_graph
+      haplotypes: haplotypes
+      threads: threads
+    out: [coverage_matrix]
+
+  awk:
+    run: awk-coverage.cwl
+    in:
+      consensus_regex: consensus_regex
+      coverage_tsv: odgi/coverage_matrix
+    out: [awk_coverage_matrix]
+
+  metadata:
+    run: metadata.cwl
+    in:
+      main_py_script: main_py_script
+      metadata: metadata
+      coverage_matrix: awk/awk_coverage_matrix
+      coverage_matrix_with_metadata: coverage_matrix_with_metadata
+    out: [coverage_matrix_with_metadata_out]
+
+  newick:
+    run: newick.cwl
+    in:
+      main_py_script: main_py_script
+      newick: newick
+      newick_dimensions: newick_dimensions
+      newick_coverage_matrix: metadata/coverage_matrix_with_metadata_out
+      newick_metadata: newick_metadata
+      newick_tree: newick_tree
+    out: [metadata_out, newick_tree_out]
+
+  augur:
+    run: augur.cwl
+    in:
+      nextstrain_bash_script: nextstrain_bash_script
+      newick_tree_2: newick/newick_tree_out
+      metadata_newick: newick/metadata_out
+      dataDir: dataDir
+
+    out: [newick_json]