diff options
| author | Arun Isaac | 2026-03-23 23:14:34 +0000 |
|---|---|---|
| committer | Arun Isaac | 2026-03-23 23:28:23 +0000 |
| commit | 68a9f303269d1c1784f4b94e3fc8d25ee8055883 (patch) | |
| tree | 88c578ce2fca3b7f9e5ea5ad3cfc69d2a9406267 /pggb.scm | |
| download | pggb.cwl-68a9f303269d1c1784f4b94e3fc8d25ee8055883.tar.gz pggb.cwl-68a9f303269d1c1784f4b94e3fc8d25ee8055883.tar.lz pggb.cwl-68a9f303269d1c1784f4b94e3fc8d25ee8055883.zip | |
Diffstat (limited to 'pggb.scm')
| -rw-r--r-- | pggb.scm | 506 |
1 files changed, 506 insertions, 0 deletions
;;; pggb.cwl --- CWL port of pggb
;;; Copyright © 2026 Arun Isaac <arunisaac@systemreboot.net>
;;;
;;; This file is part of pggb.cwl.
;;;
;;; pggb.cwl is free software: you can redistribute it and/or modify it
;;; under the terms of the GNU General Public License as published by
;;; the Free Software Foundation, either version 3 of the License, or
;;; (at your option) any later version.
;;;
;;; pggb.cwl is distributed in the hope that it will be useful, but
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;;; General Public License for more details.
;;;
;;; You should have received a copy of the GNU General Public License
;;; along with pggb.cwl. If not, see <https://www.gnu.org/licenses/>.

;;;
;;; This is a port of pggb 0.7.4 to CWL.
;;;

;; Compress FILE with gzip. The compressed stream is written to
;; standard output and captured as "<basename of FILE>.gz".
(define gzip
  (command #:inputs (file #:type File)
           #:run "gzip" "--to-stdout" file
           #:stdout "$(inputs.file.basename).gz"
           #:outputs (compressed-file #:type File
                                      #:binding ((glob . "$(inputs.file.basename).gz")))
           #:other ((hints (SoftwareRequirement
                            (packages . #(((package . "gzip")))))))))

;; Compress FASTA with bgzip using THREADS threads. Standard output is
;; captured as "<basename of FASTA>.gz"; the output additionally
;; declares a ".gzi" secondary file.
(define bgzip
  (command #:inputs (fasta #:type File) (threads #:type int)
           #:run "bgzip" ("-@" threads) "--stdout" fasta
           #:outputs (compressed-fasta #:type stdout
                                       #:other ((secondaryFiles . #(".gzi"))))
           #:stdout "$(inputs.fasta.basename).gz"
           #:other ((hints (ResourceRequirement
                            (coresMin . "$(inputs.threads)"))
                           (SoftwareRequirement
                            (packages . #(((package . "htslib")))))))))

;; Run "samtools faidx" on the staged COMPRESSED_FASTA. The output is
;; the file of the same basename, with ".fai" and ".gzi" declared as
;; its secondary files.
(define samtools-faidx
  (command #:inputs (compressed_fasta #:type File
                                      #:stage? #t)
           #:run "samtools" "faidx" "$(inputs.compressed_fasta.basename)"
           #:outputs (indexed-fasta #:type File
                                    #:binding ((glob . "$(inputs.compressed_fasta.basename)"))
                                    #:other ((secondaryFiles . #(".fai" ".gzi"))))
           #:other ((hints (SoftwareRequirement
                            (packages . #(((package . "samtools")))))))))

;; Run wfmash with --approx-map on SEQUENCES. Standard output is
;; captured as "<nameroot of SEQUENCES>.paf". BLOCK_LENGTH and
;; SPARSIFY_MAPPINGS may be null, in which case the inline expressions
;; below compute the value passed to wfmash.
(define wfmash-approximate
  (command #:inputs
           (sequences #:type File
                      #:other ((secondaryFiles . #(".fai" ".gzi"))))
           (number_of_haplotypes #:type int)
           (segment_length #:type int)
           (block_length #:type #(int null))
           (map-percent-identity #:type int)
           (num-mappings-for-segment #:type int)
           (no-split #:type boolean)
           (sparsify_mappings #:type #(float null))
           (kmer-size #:type int)
           (kmer-threshold #:type float)
           (exclude-delimiter #:type string)
           (hg-filter-ani-diff #:type float)
           (no-merge #:type boolean)
           (threads #:type int)
           #:run
           "wfmash"
           ("--segment-length" segment_length)
           ;; When no block length is given, use 5 times the segment
           ;; length.
           ("--block-length" "$(inputs.block_length === null ? 5*inputs.segment_length : inputs.block_length)")
           ("--map-pct-id" map-percent-identity)
           ("--num-mappings-for-segment" num-mappings-for-segment)
           ("--no-split" no-split)
           ;; Set sparse mapping using giant component heuristic: we
           ;; keep 10*log(n)/n mappings if this is less than 1,
           ;; otherwise we keep all.
           ("--sparsify-mappings" "$(inputs.sparsify_mappings === null ? Math.min(10*Math.log(inputs.number_of_haplotypes)/inputs.number_of_haplotypes, 1) : inputs.sparsify_mappings)")
           ("--kmer" kmer-size)
           ("--kmer-threshold" kmer-threshold)
           "--skip-self"
           ("--skip-prefix" exclude-delimiter)
           ("--threads" threads)
           "--lower-triangular"
           ("--hg-filter-ani-diff" hg-filter-ani-diff)
           "--approx-map"
           ("--no-merge" no-merge)
           sequences
           #:outputs (approximate-pairwise-alignment #:type stdout)
           #:stdout "$(inputs.sequences.nameroot).paf"
           #:other ((hints (ResourceRequirement
                            (coresMin . "$(inputs.threads)"))
                           (SoftwareRequirement
                            (packages . #(((package . "wfmash")))))))))

;; Run split_approx_mappings_in_chunks on the staged
;; PAIRWISE_ALIGNMENT with CHUNKS as its second argument. The outputs
;; are the files matching "<basename>.chunk_*.paf".
(define split-pairwise-alignment
  (command #:inputs (pairwise_alignment #:stage? #t) (chunks #:type int)
           #:run "split_approx_mappings_in_chunks" pairwise_alignment chunks
           #:outputs (pairwise-alignment-chunks #:type (array File)
                                                #:binding ((glob . "$(inputs.pairwise_alignment.basename).chunk_*.paf")))
           #:other ((hints (SoftwareRequirement
                            (packages . #(((package . "wfmash")))))))))

;; Run wfmash on SEQUENCES with the mappings in INPUT-PAF passed via
;; --input-paf. The options mirror wfmash-approximate, except that
;; --approx-map is not passed. Standard output is captured as
;; "<nameroot of SEQUENCES>.paf".
(define wfmash
  (command #:inputs
           (sequences #:type File
                      #:other ((secondaryFiles . #(".fai" ".gzi"))))
           (number_of_haplotypes #:type int)
           (segment_length #:type int)
           (block_length #:type #(int null))
           (map-percent-identity #:type int)
           (num-mappings-for-segment #:type int)
           (no-split #:type boolean)
           (sparsify_mappings #:type #(float null))
           (kmer-size #:type int)
           (kmer-threshold #:type float)
           (exclude-delimiter #:type string)
           (hg-filter-ani-diff #:type float)
           (no-merge #:type boolean)
           (input-paf #:type File)
           (threads #:type int)
           #:run
           "wfmash"
           ("--segment-length" segment_length)
           ;; When no block length is given, use 5 times the segment
           ;; length.
           ("--block-length" "$(inputs.block_length === null ? 5*inputs.segment_length : inputs.block_length)")
           ("--map-pct-id" map-percent-identity)
           ("--num-mappings-for-segment" num-mappings-for-segment)
           ("--no-split" no-split)
           ;; Set sparse mapping using giant component heuristic: we
           ;; keep 10*log(n)/n mappings if this is less than 1,
           ;; otherwise we keep all.
           ("--sparsify-mappings" "$(inputs.sparsify_mappings === null ? Math.min(10*Math.log(inputs.number_of_haplotypes)/inputs.number_of_haplotypes, 1) : inputs.sparsify_mappings)")
           ("--kmer" kmer-size)
           ("--kmer-threshold" kmer-threshold)
           "--skip-self"
           ("--skip-prefix" exclude-delimiter)
           ("--threads" threads)
           "--lower-triangular"
           ("--hg-filter-ani-diff" hg-filter-ani-diff)
           ("--no-merge" no-merge)
           ("--input-paf" input-paf)
           sequences
           #:outputs (pairwise-alignment #:type stdout)
           #:stdout "$(inputs.sequences.nameroot).paf"
           #:other ((hints (ResourceRequirement
                            (coresMin . "$(inputs.threads)"))
                           (SoftwareRequirement
                            (packages . #(((package . "wfmash")))))))))

;; Run seqwish on SEQUENCES and the comma-separated
;; PAIRWISE-ALIGNMENTS. The graph is written to
;; "<nameroot of SEQUENCES>.gfa".
(define seqwish
  (command #:inputs
           (sequences #:type File)
           (pairwise-alignments #:type (array File))
           (min-match-length #:type int)
           (sparse-factor #:type float)
           (transclose-batch #:type string)
           (threads #:type int)
           #:run
           "seqwish"
           ("--seqs" sequences)
           ("--paf-alns" (array pairwise-alignments #:separator ","))
           ("--min-match-len" min-match-length)
           ("--sparse-factor" sparse-factor)
           ("--transclose-batch" transclose-batch)
           ("--threads" threads)
           ("-g" "$(inputs.sequences.nameroot).gfa")
           #:outputs (variation-graph #:type File
                                      #:binding ((glob . "$(inputs.sequences.nameroot).gfa")))
           #:other ((hints (ResourceRequirement
                            (coresMin . "$(inputs.threads)"))
                           (SoftwareRequirement
                            (packages . #(((package . "seqwish")))))))))

;; Run smoothxg on GRAPH. The smoothed graph is written to
;; "<nameroot of GRAPH>.gfa" and the multiple sequence alignment to
;; "<nameroot of GRAPH>.maf".
(define smoothxg
  (command #:inputs
           (graph #:type File)
           (number_of_haplotypes #:type int)
           (consensus-prefix #:type string)
           (map_percent_identity #:type int)
           (maximum-path-jump #:type string)
           (maximum-edge-jump #:type string)
           (abpoa #:type boolean)
           (global-poa #:type boolean)
           (poa-target-lengths #:type (array string))
           (poa_params #:type string)
           (poa-padding-ratio #:type float)
           (poa_maximum_pad_depth #:type int)
           (threads #:type int)
           #:run
           "smoothxg"
           ("--gfa-in" graph)
           ("--n-haps" number_of_haplotypes)
           ("--chop-to" "100")
           ;; The percent identity is passed on as a fraction.
           ("--block-id-min" "$(inputs.map_percent_identity / 100)")
           ("--block-ratio-min" "0")
           ("--path-jump-max" maximum-path-jump)
           ("--edge-jump-max" maximum-edge-jump)
           ("--poa-length-targets" (array poa-target-lengths
                                          #:separator ","))
           ;; The presets asm5, asm10, asm15 and asm20 are expanded to
           ;; explicit parameter strings; any other value is passed
           ;; through unchanged.
           ("--poa-params" "${switch (inputs.poa_params) { case \"asm5\": return \"1,19,39,3,81,1\"; case \"asm10\": return \"1,9,16,2,41,1\"; case \"asm15\": return \"1,7,11,2,33,1\"; case \"asm20\": return \"1,4,6,2,26,1\"; default: return inputs.poa_params }}")
           ("--poa-padding-ratio" poa-padding-ratio)
           ("--max-block-depth-adaptive-poa-padding" "$(inputs.poa_maximum_pad_depth * inputs.number_of_haplotypes)")
           ("--min-block-depth-split" "0")
           ("--min-block-depth-mash" "0")
           ("--abpoa" abpoa)
           ("--change-alignment-mode" global-poa)
           ("--write-msa-in-maf-format" "$(inputs.graph.nameroot).maf")
           ("--consensus-prefix" consensus-prefix)
           ;; Consensus paths are temporarily disabled in pggb.
           "--vanish-consensus"
           ("--threads" threads)
           ("--smoothed-out" "$(inputs.graph.nameroot).gfa")
           #:outputs
           (smoothed-graph #:type File
                           #:binding ((glob . "$(inputs.graph.nameroot).gfa")))
           (multiple-sequence-alignment #:type File
                                        #:binding ((glob . "$(inputs.graph.nameroot).maf")))
           #:other ((hints (ResourceRequirement
                            (coresMin . "$(inputs.threads)"))
                           (SoftwareRequirement
                            (packages . #(((package . "smoothxg")))))))))

;; Run gfaffix on GRAPH. The refined graph is written to
;; "<nameroot of GRAPH>.fix.gfa" and standard output is captured as
;; "<nameroot of GRAPH>.fix.affixes.tsv".
(define gfaffix
  (command #:inputs (graph #:type File)
           #:run "gfaffix" graph ("--output_refined" "$(inputs.graph.nameroot).fix.gfa")
           #:stdout "$(inputs.graph.nameroot).fix.affixes.tsv"
           #:outputs
           (fixed-graph #:type File
                        #:binding ((glob . "$(inputs.graph.nameroot).fix.gfa")))
           (fix-affixes #:type stdout)
           #:other ((hints (SoftwareRequirement
                            (packages . #(((package . "gfaffix")))))))))

;; Run "odgi build" on the GFA file GRAPH with THREADS threads,
;; writing "<nameroot of GRAPH>.og".
(define odgi-build
  (command #:inputs graph (threads #:type int)
           ;; Unlike pggb, we preserve the graph segment IDs; we do
           ;; not optimize them away using --optimize.
           #:run "odgi" "build" ("-t" threads) ("--gfa" graph) ("--out" "$(inputs.graph.nameroot).og")
           #:outputs (odgi-graph #:binding ((glob . "$(inputs.graph.nameroot).og")))
           #:other ((hints (ResourceRequirement
                            (coresMin . "$(inputs.threads)"))
                           (SoftwareRequirement
                            (packages . #(((package . "odgi")))))))))

;; Run "odgi view --to-gfa" on GRAPH. Standard output is captured as
;; "<nameroot of GRAPH>.gfa".
(define odgi-view
  (command #:inputs graph
           #:run "odgi" "view" "--to-gfa" ("--idx" graph)
           #:stdout "$(inputs.graph.nameroot).gfa"
           #:outputs (gfa-graph #:type stdout)
           #:other ((hints (SoftwareRequirement
                            (packages . #(((package . "odgi")))))))))

;; Run "odgi unchop" on GRAPH with THREADS threads, writing
;; "<nameroot of GRAPH>.og".
(define odgi-unchop
  (command #:inputs graph (threads #:type int)
           #:run "odgi" "unchop" ("-t" threads) ("--idx" graph) ("--out" "$(inputs.graph.nameroot).og")
           #:outputs (unchopped-graph #:binding ((glob . "$(inputs.graph.nameroot).og")))
           #:other ((hints (ResourceRequirement
                            (coresMin . "$(inputs.threads)"))
                           (SoftwareRequirement
                            (packages . #(((package . "odgi")))))))))

;; Run "odgi sort" with the "Ygs" pipeline on GRAPH with THREADS
;; threads, writing "<nameroot of GRAPH>.og".
(define odgi-sort
  (command #:inputs graph (threads #:type int)
           #:run "odgi" "sort" ("--pipeline" "Ygs") ("-t" threads) ("--idx" graph) ("--out" "$(inputs.graph.nameroot).og")
           #:outputs (sorted-graph #:binding ((glob . "$(inputs.graph.nameroot).og")))
           #:other ((hints (ResourceRequirement
                            (coresMin . "$(inputs.threads)"))
                           (SoftwareRequirement
                            (packages . #(((package . "odgi")))))))))

;; Run "odgi stats" on GRAPH with --multiqc and the other flags below.
;; Standard output is captured as "<nameroot of GRAPH>.stats.yaml".
(define odgi-stats
  (command #:inputs graph
           #:run "odgi" "stats" ("--idx" graph) "--multiqc" "--sum-path-nodes-distances" "--no-gap-links" "--penalize-different-orientation" "--mean-links-length"
           #:stdout "$(inputs.graph.nameroot).stats.yaml"
           #:outputs (stats #:type stdout)
           #:other ((hints (SoftwareRequirement
                            (packages . #(((package . "odgi")))))))))

;; Run "odgi viz" on GRAPH, writing "<nameroot of GRAPH>.png". The
;; boolean inputs select the rendering mode and all default to #f.
(define odgi-viz
  (command #:inputs
           graph
           (width #:type int #:default 1500)
           (height #:type int #:default 500)
           (path-height #:type int #:default 10)
           (ignore-prefix #:type string)
           (white-to-black #:type boolean #:default #f)
           (change-darkness #:type boolean #:default #f)
           (color-by-mean-depth #:type boolean #:default #f)
           (color-by-mean-inversion-rate #:type boolean #:default #f)
           (compressed-mode #:type boolean #:default #f)
           (color-by-uncalled-bases #:type boolean #:default #f)
           #:run
           "odgi" "viz"
           ("--width" width)
           ("--height" height)
           ;; Pass the declared path-height input through to odgi viz.
           ("--path-height" path-height)
           ("--ignore-prefix" ignore-prefix)
           ("--white-to-black" white-to-black)
           ("--change-darkness" change-darkness)
           ("--color-by-mean-depth" color-by-mean-depth)
           ("--color-by-mean-inversion-rate" color-by-mean-inversion-rate)
           ("--compressed-mode" compressed-mode)
           ("--color-by-uncalled-bases" color-by-uncalled-bases)
           ("--idx" graph)
           ("--out" "$(inputs.graph.nameroot).png")
           #:outputs (visualization #:binding ((glob . "$(inputs.graph.nameroot).png")))
           #:other ((hints (SoftwareRequirement
                            (packages . #(((package . "odgi")))))))))

;; Run "odgi layout" on GRAPH with THREADS threads, writing
;; "<nameroot of GRAPH>.lay".
(define odgi-layout
  (command #:inputs graph (threads #:type int)
           #:run "odgi" "layout" ("--threads" threads) ("--idx" graph) ("--out" "$(inputs.graph.nameroot).lay")
           #:outputs (layout #:binding ((glob . "$(inputs.graph.nameroot).lay")))
           #:other ((hints (ResourceRequirement
                            (coresMin . "$(inputs.threads)"))
                           (SoftwareRequirement
                            (packages . #(((package . "odgi")))))))))

;; Run "odgi draw" on GRAPH using the coordinates in LAYOUT, writing
;; "<nameroot of GRAPH>.png".
(define odgi-draw
  (command #:inputs
           graph
           layout
           (height #:type int #:default 1000)
           (color-paths #:type boolean #:default #f)
           (line-width #:type int #:default 10)
           #:run
           "odgi" "draw"
           ("--idx" graph)
           ("--coords-in" layout)
           ("--png" "$(inputs.graph.nameroot).png")
           ("--png-height" height)
           ("--color-paths" color-paths)
           ("--line-width" line-width)
           #:outputs (visualization-2d #:binding ((glob . "$(inputs.graph.nameroot).png")))
           #:other ((hints (SoftwareRequirement
                            (packages . #(((package . "odgi")))))))))

;; Pipe GRAPH through gfaffix, odgi-build, odgi-unchop and odgi-sort,
;; and expose the sorted graph as reduced-graph.
(define reduce-redundancy
  (workflow (graph (threads #:type int))
    (pipe (gfaffix #:graph graph)
          (odgi-build #:graph fixed-graph
                      #:threads threads)
          (odgi-unchop #:graph odgi-graph
                       #:threads threads)
          (odgi-sort #:graph unchopped-graph
                     #:threads threads)
          (rename #:reduced-graph sorted-graph))))

;; Run odgi-viz on GRAPH six times, each as a separately named step
;; with a different rendering mode, and expose each image under its
;; own output name.
(define visualize
  (workflow (graph (ignore-prefix #:type string))
    (tee (pipe (odgi-viz (default-visualize)
                         #:graph graph
                         #:ignore-prefix ignore-prefix)
               (rename #:default-visualization visualization))
         (pipe (odgi-viz (visualize-by-nucleotide-position)
                         #:graph graph
                         #:ignore-prefix ignore-prefix
                         #:white-to-black #t
                         #:change-darkness #t)
               (rename #:visualization-by-nucleotide-position visualization))
         (pipe (odgi-viz (visualize-by-mean-depth)
                         #:graph graph
                         #:ignore-prefix ignore-prefix
                         #:color-by-mean-depth #t)
               (rename #:visualization-by-mean-depth visualization))
         (pipe (odgi-viz (visualize-by-mean-inversion-rate)
                         #:graph graph
                         #:ignore-prefix ignore-prefix
                         #:color-by-mean-inversion-rate #t)
               (rename #:visualization-by-mean-inversion-rate visualization))
         (pipe (odgi-viz (visualize-with-compressed-mode)
                         #:graph graph
                         #:ignore-prefix ignore-prefix
                         #:compressed-mode #t)
               (rename #:compressed-mode-visualization visualization))
         (pipe (odgi-viz (visualize-by-uncalled-bases)
                         #:graph graph
                         #:ignore-prefix ignore-prefix
                         #:color-by-uncalled-bases #t)
               (rename #:visualization-by-uncalled-bases visualization)))))

;; Compute a layout of GRAPH with odgi-layout, then run odgi-draw
;; twice on it: once with the defaults, and once with #:color-paths #t
;; and #:line-width 20.
(define visualize-2d
  (workflow (graph (threads #:type int))
    (pipe (odgi-layout #:graph graph
                       #:threads threads)
          (tee (pipe (odgi-draw (default-visualize-2d)
                                #:graph graph
                                #:layout layout)
                     (rename #:default-visualization-2d visualization-2d))
               (pipe (odgi-draw (visualize-2d-with-paths)
                                #:graph graph
                                #:layout layout
                                #:color-paths #t
                                #:line-width 20)
                     (rename #:visualization-2d-with-paths visualization-2d))))))

;; Top-level workflow. It bgzips and indexes SEQUENCES, computes
;; approximate wfmash mappings and splits them into WFMASH-CHUNKS
;; chunks, scatters wfmash over those chunks, and builds the graph
;; with seqwish. Two branches then run on that graph: one smooths it,
;; reduces redundancy and produces stats, visualizations and a GFA;
;; the other builds an odgi graph from the unsmoothed graph and
;; computes its stats.
(workflow ((sequences #:type File)
           (number-of-haplotypes #:type int
                                 #:label "number of haplotypes")
           ;; wfmash
           (wfmash-chunks #:type int
                          #:default 1
                          #:label "number of jobs to split wfmash alignment into")
           (segment-length #:type int
                           #:default 1000
                           #:label "segment seed length for mapping")
           (block-length #:type #(int null)
                         #:label "keep merged mappings supported by homologies of this total length (default: 5*segment_length)")
           (map-percent-identity #:type int
                                 #:default 90
                                 #:label "percent identity in the mashmap step")
           (num-mappings-for-segment #:type int
                                     #:default 1
                                     #:label "number of mappings to retain for each query/reference pair")
           (no-split #:type boolean
                     #:default #f
                     #:label "disable splitting of input sequences during mapping")
           (sparsify-mappings #:type #(float null)
                              #:label "keep this fraction of mappings")
           (kmer-size #:type int
                      #:default 15
                      #:label "kmer size")
           (kmer-threshold #:type float
                           #:default 0.001
                           #:label "ignore the top % most-frequent kmers")
           (exclude-delimiter #:type string
                              #:default "#"
                              #:label "skip mappings when the query and target have the same prefix before the last occurrence of the given character C (default assumes PanSN-spec)")
           (hg-filter-ani-diff #:type float
                               #:default 0.0
                               #:label "filter out mappings unlikely to be this ANI less than the best mapping")
           (no-merge #:type boolean
                     #:default #f
                     #:label "don't merge consecutive segment-level mappings")
           ;; seqwish
           (min-match-length #:type int
                             #:default 23
                             #:label "filter exact matches below this length")
           (sparse-factor #:type float
                          #:default 0
                          #:label "sparsify input matches, keeping this fraction that minimizes a hash function")
           (transclose-batch #:type string
                             #:default "1M"
                             #:label "number of base pairs to use for transitive closure batch")
           ;; smoothxg
           (consensus-prefix #:type string
                             #:default "Consensus_"
                             #:label "prefix of consensus path names")
           (maximum-path-jump #:type string
                              #:default "0"
                              #:label "maximum path jump to include in block")
           (maximum-edge-jump #:type string
                              #:default "0"
                              #:label "maximum edge jump before breaking")
           (abpoa #:type boolean
                  #:default #f
                  #:label "run abPOA instead of SPOA for smoothxg")
           (global-poa #:type boolean
                       #:default #f
                       #:label "run global POA alignment")
           (poa-target-lengths #:type (array string)
                               #:default #("700" "1100")
                               #:label "POA target lengths; blocks are split when paths go over this length")
           (poa-params #:type string
                       #:default "asm20"
                       #:label "POA score parameters")
           (poa-padding-ratio #:type float
                              #:default 0.001
                              #:label "flanking sequence length fraction to pad end of each sequence with")
           (poa-maximum-pad-depth #:type int
                                  #:default 100
                                  #:label "depth/haplotype at which we don't pad the POA problem")
           (threads #:type int
                    #:label "number of threads"))
  (pipe (bgzip #:fasta sequences
               #:threads threads)
        (samtools-faidx #:compressed_fasta compressed-fasta)
        (tee (pipe (wfmash-approximate #:sequences indexed-fasta
                                       #:number_of_haplotypes number-of-haplotypes
                                       #:segment_length segment-length
                                       #:block_length block-length
                                       #:map-percent-identity map-percent-identity
                                       #:num-mappings-for-segment num-mappings-for-segment
                                       #:no-split no-split
                                       #:sparsify_mappings sparsify-mappings
                                       #:kmer-size kmer-size
                                       #:kmer-threshold kmer-threshold
                                       #:exclude-delimiter exclude-delimiter
                                       #:hg-filter-ani-diff hg-filter-ani-diff
                                       #:no-merge no-merge
                                       #:threads threads)
                   (split-pairwise-alignment #:pairwise_alignment approximate-pairwise-alignment
                                             #:chunks wfmash-chunks))
             (identity))
        ;; One wfmash job per chunk of approximate mappings.
        (scatter (wfmash #:sequences indexed-fasta
                         #:number_of_haplotypes number-of-haplotypes
                         #:segment_length segment-length
                         #:block_length block-length
                         #:map-percent-identity map-percent-identity
                         #:num-mappings-for-segment num-mappings-for-segment
                         #:no-split no-split
                         #:sparsify_mappings sparsify-mappings
                         #:kmer-size kmer-size
                         #:kmer-threshold kmer-threshold
                         #:exclude-delimiter exclude-delimiter
                         #:hg-filter-ani-diff hg-filter-ani-diff
                         #:no-merge no-merge
                         #:threads threads)
                 #:input-paf pairwise-alignment-chunks)
        (seqwish #:sequences sequences
                 #:pairwise-alignments pairwise-alignment
                 #:min-match-length min-match-length
                 #:sparse-factor sparse-factor
                 #:transclose-batch transclose-batch
                 #:threads threads)
        (rename #:unsmoothed-pangenome variation-graph)
        (tee (pipe (smoothxg #:graph unsmoothed-pangenome
                             #:number_of_haplotypes number-of-haplotypes
                             #:consensus-prefix consensus-prefix
                             #:map_percent_identity map-percent-identity
                             #:maximum-path-jump maximum-path-jump
                             #:maximum-edge-jump maximum-edge-jump
                             #:abpoa abpoa
                             #:global-poa global-poa
                             #:poa-target-lengths poa-target-lengths
                             #:poa_params poa-params
                             #:poa-padding-ratio poa-padding-ratio
                             #:poa_maximum_pad_depth poa-maximum-pad-depth
                             #:threads threads)
                   (reduce-redundancy #:graph smoothed-graph
                                      #:threads threads)
                   (tee (pipe (odgi-stats (stats-for-graph)
                                          #:graph reduced-graph)
                              (rename #:graph-stats stats))
                        (visualize #:graph reduced-graph
                                   #:ignore-prefix consensus-prefix)
                        (visualize-2d #:graph reduced-graph
                                      #:threads threads)
                        (pipe (odgi-view #:graph reduced-graph)
                              (rename #:pangenome gfa-graph))))
             (pipe (odgi-build (build-unsmoothed-odgi-graph)
                               #:graph unsmoothed-pangenome
                               #:threads threads)
                   (odgi-stats (stats-for-unsmoothed-graph)
                               #:graph odgi-graph)
                   (rename #:unsmoothed-graph-stats stats))
             (identity))))
