;;; pggb.cwl --- CWL port of pggb
;;; Copyright © 2026 Arun Isaac
;;;
;;; This file is part of pggb.cwl.
;;;
;;; pggb.cwl is free software: you can redistribute it and/or modify it
;;; under the terms of the GNU General Public License as published by
;;; the Free Software Foundation, either version 3 of the License, or
;;; (at your option) any later version.
;;;
;;; pggb.cwl is distributed in the hope that it will be useful, but
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;;; General Public License for more details.
;;;
;;; You should have received a copy of the GNU General Public License
;;; along with pggb.cwl.  If not, see <https://www.gnu.org/licenses/>.

;;;
;;; This is a port of pggb 0.7.4 to CWL.
;;;

;; Compress FILE with gzip.  The compressed stream is captured from
;; stdout into a file named after the input with ".gz" appended.
(define gzip
  (command #:inputs (file #:type File)
           #:run "gzip" "--to-stdout" file
           #:stdout "$(inputs.file.basename).gz"
           #:outputs (compressed-file
                      #:type File
                      #:binding ((glob . "$(inputs.file.basename).gz")))
           #:other ((hints
                     (SoftwareRequirement
                      (packages . #(((package . "gzip")))))))))

;; Compress FASTA with bgzip using THREADS threads.  The compressed
;; stream is captured from stdout into "<basename>.gz".
(define bgzip
  (command #:inputs (fasta #:type File) (threads #:type int)
           #:run "bgzip" ("-@" threads) "--stdout" fasta
           #:outputs (compressed-fasta
                      #:type stdout
                      #:other ((secondaryFiles . #(".gzi"))))
           #:stdout "$(inputs.fasta.basename).gz"
           #:other ((hints
                     (ResourceRequirement
                      (coresMin . "$(inputs.threads)"))
                     (SoftwareRequirement
                      (packages . #(((package . "htslib")))))))))

;; Run "samtools faidx" on COMPRESSED_FASTA.  The input is staged
;; into the working directory so that it can be collected again as
;; the output, together with its ".fai" and ".gzi" secondary files.
(define samtools-faidx
  (command #:inputs (compressed_fasta #:type File #:stage? #t)
           #:run "samtools" "faidx" "$(inputs.compressed_fasta.basename)"
           #:outputs (indexed-fasta
                      #:type File
                      #:binding ((glob . "$(inputs.compressed_fasta.basename)"))
                      #:other ((secondaryFiles . #(".fai" ".gzi"))))
           #:other ((hints
                     (SoftwareRequirement
                      (packages . #(((package . "samtools")))))))))

;; Run wfmash with --approx-map on SEQUENCES.  The PAF written to
;; stdout is captured into "<nameroot>.paf".  A null BLOCK_LENGTH is
;; replaced by 5*SEGMENT_LENGTH.
(define wfmash-approximate
  (command #:inputs
           (sequences #:type File
                      #:other ((secondaryFiles . #(".fai" ".gzi"))))
           (number_of_haplotypes #:type int)
           (segment_length #:type int)
           (block_length #:type #(int null))
           (map-percent-identity #:type int)
           (num-mappings-for-segment #:type int)
           (no-split #:type boolean)
           (sparsify_mappings #:type #(float null))
           (kmer-size #:type int)
           (kmer-threshold #:type float)
           (exclude-delimiter #:type string)
           (hg-filter-ani-diff #:type float)
           (no-merge #:type boolean)
           (threads #:type int)
           #:run "wfmash"
           ("--segment-length" segment_length)
           ("--block-length" "$(inputs.block_length === null ? 5*inputs.segment_length : inputs.block_length)")
           ("--map-pct-id" map-percent-identity)
           ("--num-mappings-for-segment" num-mappings-for-segment)
           ("--no-split" no-split)
           ;; Set sparse mapping using giant component heuristic: we
           ;; keep 10*log(n)/n mappings if this is less than 1,
           ;; otherwise we keep all.
           ("--sparsify-mappings" "$(inputs.sparsify_mappings === null ? Math.min(10*Math.log(inputs.number_of_haplotypes)/inputs.number_of_haplotypes, 1) : inputs.sparsify_mappings)")
           ("--kmer" kmer-size)
           ("--kmer-threshold" kmer-threshold)
           "--skip-self"
           ("--skip-prefix" exclude-delimiter)
           ("--threads" threads)
           "--lower-triangular"
           ("--hg-filter-ani-diff" hg-filter-ani-diff)
           "--approx-map"
           ("--no-merge" no-merge)
           sequences
           #:outputs (approximate-pairwise-alignment #:type stdout)
           #:stdout "$(inputs.sequences.nameroot).paf"
           #:other ((hints
                     (ResourceRequirement
                      (coresMin . "$(inputs.threads)"))
                     (SoftwareRequirement
                      (packages . #(((package . "wfmash")))))))))

;; Run split_approx_mappings_in_chunks on PAIRWISE_ALIGNMENT with
;; CHUNKS as its second argument.  The input is staged into the
;; working directory; files matching "<basename>.chunk_*.paf" are
;; collected as the output array.
(define split-pairwise-alignment
  (command #:inputs
           (pairwise_alignment #:stage? #t)
           (chunks #:type int)
           #:run "split_approx_mappings_in_chunks" pairwise_alignment chunks
           #:outputs (pairwise-alignment-chunks
                      #:type (array File)
                      #:binding ((glob . "$(inputs.pairwise_alignment.basename).chunk_*.paf")))
           #:other ((hints
                     (SoftwareRequirement
                      (packages . #(((package . "wfmash")))))))))

;; Run wfmash on SEQUENCES with INPUT-PAF passed via --input-paf.
;; Takes the same parameters as wfmash-approximate, but does not pass
;; --approx-map.  The PAF written to stdout is captured into
;; "<nameroot>.paf".
(define wfmash
  (command #:inputs
           (sequences #:type File
                      #:other ((secondaryFiles . #(".fai" ".gzi"))))
           (number_of_haplotypes #:type int)
           (segment_length #:type int)
           (block_length #:type #(int null))
           (map-percent-identity #:type int)
           (num-mappings-for-segment #:type int)
           (no-split #:type boolean)
           (sparsify_mappings #:type #(float null))
           (kmer-size #:type int)
           (kmer-threshold #:type float)
           (exclude-delimiter #:type string)
           (hg-filter-ani-diff #:type float)
           (no-merge #:type boolean)
           (input-paf #:type File)
           (threads #:type int)
           #:run "wfmash"
           ("--segment-length" segment_length)
           ("--block-length" "$(inputs.block_length === null ? 5*inputs.segment_length : inputs.block_length)")
           ("--map-pct-id" map-percent-identity)
           ("--num-mappings-for-segment" num-mappings-for-segment)
           ("--no-split" no-split)
           ;; Set sparse mapping using giant component heuristic: we
           ;; keep 10*log(n)/n mappings if this is less than 1,
           ;; otherwise we keep all.
           ("--sparsify-mappings" "$(inputs.sparsify_mappings === null ? Math.min(10*Math.log(inputs.number_of_haplotypes)/inputs.number_of_haplotypes, 1) : inputs.sparsify_mappings)")
           ("--kmer" kmer-size)
           ("--kmer-threshold" kmer-threshold)
           "--skip-self"
           ("--skip-prefix" exclude-delimiter)
           ("--threads" threads)
           "--lower-triangular"
           ("--hg-filter-ani-diff" hg-filter-ani-diff)
           ("--no-merge" no-merge)
           ("--input-paf" input-paf)
           sequences
           #:outputs (pairwise-alignment #:type stdout)
           #:stdout "$(inputs.sequences.nameroot).paf"
           #:other ((hints
                     (ResourceRequirement
                      (coresMin . "$(inputs.threads)"))
                     (SoftwareRequirement
                      (packages . #(((package . "wfmash")))))))))

;; Run seqwish on SEQUENCES and the comma-separated
;; PAIRWISE-ALIGNMENTS, writing the graph to "<nameroot>.gfa".
(define seqwish
  (command #:inputs
           (sequences #:type File)
           (pairwise-alignments #:type (array File))
           (min-match-length #:type int)
           (sparse-factor #:type float)
           (transclose-batch #:type string)
           (threads #:type int)
           #:run "seqwish"
           ("--seqs" sequences)
           ("--paf-alns" (array pairwise-alignments #:separator ","))
           ("--min-match-len" min-match-length)
           ("--sparse-factor" sparse-factor)
           ("--transclose-batch" transclose-batch)
           ("--threads" threads)
           ("-g" "$(inputs.sequences.nameroot).gfa")
           #:outputs (variation-graph
                      #:type File
                      #:binding ((glob . "$(inputs.sequences.nameroot).gfa")))
           #:other ((hints
                     (ResourceRequirement
                      (coresMin . "$(inputs.threads)"))
                     (SoftwareRequirement
                      (packages . #(((package . "seqwish")))))))))

;; Run smoothxg on GRAPH, writing the smoothed graph to
;; "<nameroot>.gfa" and the multiple sequence alignment to
;; "<nameroot>.maf".  POA_PARAMS may be one of the presets asm5,
;; asm10, asm15 or asm20, which are expanded to score strings; any
;; other value is passed through unchanged.
(define smoothxg
  (command #:inputs
           (graph #:type File)
           (number_of_haplotypes #:type int)
           (consensus-prefix #:type string)
           (map_percent_identity #:type int)
           (maximum-path-jump #:type string)
           (maximum-edge-jump #:type string)
           (abpoa #:type boolean)
           (global-poa #:type boolean)
           (poa-target-lengths #:type (array string))
           (poa_params #:type string)
           (poa-padding-ratio #:type float)
           (poa_maximum_pad_depth #:type int)
           (threads #:type int)
           #:run "smoothxg"
           ("--gfa-in" graph)
           ("--n-haps" number_of_haplotypes)
           ("--chop-to" "100")
           ("--block-id-min" "$(inputs.map_percent_identity / 100)")
           ("--block-ratio-min" "0")
           ("--path-jump-max" maximum-path-jump)
           ("--edge-jump-max" maximum-edge-jump)
           ("--poa-length-targets" (array poa-target-lengths #:separator ","))
           ("--poa-params" "${switch (inputs.poa_params) { case \"asm5\": return \"1,19,39,3,81,1\"; case \"asm10\": return \"1,9,16,2,41,1\"; case \"asm15\": return \"1,7,11,2,33,1\"; case \"asm20\": return \"1,4,6,2,26,1\"; default: return inputs.poa_params }}")
           ("--poa-padding-ratio" poa-padding-ratio)
           ("--max-block-depth-adaptive-poa-padding" "$(inputs.poa_maximum_pad_depth * inputs.number_of_haplotypes)")
           ("--min-block-depth-split" "0")
           ("--min-block-depth-mash" "0")
           ("--abpoa" abpoa)
           ("--change-alignment-mode" global-poa)
           ("--write-msa-in-maf-format" "$(inputs.graph.nameroot).maf")
           ("--consensus-prefix" consensus-prefix)
           ;; Consensus paths are temporarily disabled in pggb.
           "--vanish-consensus"
           ("--threads" threads)
           ("--smoothed-out" "$(inputs.graph.nameroot).gfa")
           #:outputs
           (smoothed-graph
            #:type File
            #:binding ((glob . "$(inputs.graph.nameroot).gfa")))
           (multiple-sequence-alignment
            #:type File
            #:binding ((glob . "$(inputs.graph.nameroot).maf")))
           #:other ((hints
                     (ResourceRequirement
                      (coresMin . "$(inputs.threads)"))
                     (SoftwareRequirement
                      (packages . #(((package . "smoothxg")))))))))

;; Run gfaffix on GRAPH.  The refined graph is written to
;; "<nameroot>.fix.gfa" and stdout is captured into
;; "<nameroot>.fix.affixes.tsv".
(define gfaffix
  (command #:inputs (graph #:type File)
           #:run "gfaffix" graph
           ("--output_refined" "$(inputs.graph.nameroot).fix.gfa")
           #:stdout "$(inputs.graph.nameroot).fix.affixes.tsv"
           #:outputs
           (fixed-graph
            #:type File
            #:binding ((glob . "$(inputs.graph.nameroot).fix.gfa")))
           (fix-affixes #:type stdout)
           #:other ((hints
                     (SoftwareRequirement
                      (packages . #(((package . "gfaffix")))))))))

;; Convert the GFA GRAPH into an odgi graph "<nameroot>.og" using
;; THREADS threads.
(define odgi-build
  (command #:inputs graph (threads #:type int)
           ;; Unlike pggb, we preserve the graph segment IDs; we do
           ;; not optimize them away using --optimize.
           #:run "odgi" "build" ("-t" threads) ("--gfa" graph)
           ("--out" "$(inputs.graph.nameroot).og")
           #:outputs (odgi-graph
                      #:binding ((glob . "$(inputs.graph.nameroot).og")))
           ;; Request as many cores as threads, like the other
           ;; multi-threaded commands in this file.
           #:other ((hints
                     (ResourceRequirement
                      (coresMin . "$(inputs.threads)"))
                     (SoftwareRequirement
                      (packages . #(((package . "odgi")))))))))

;; Convert the odgi GRAPH back to GFA.  stdout is captured into
;; "<nameroot>.gfa".
(define odgi-view
  (command #:inputs graph
           #:run "odgi" "view" "--to-gfa" ("--idx" graph)
           #:stdout "$(inputs.graph.nameroot).gfa"
           #:outputs (gfa-graph #:type stdout)
           #:other ((hints
                     (SoftwareRequirement
                      (packages . #(((package . "odgi")))))))))

;; Run "odgi unchop" on GRAPH using THREADS threads, writing
;; "<nameroot>.og".
(define odgi-unchop
  (command #:inputs graph (threads #:type int)
           #:run "odgi" "unchop" ("-t" threads) ("--idx" graph)
           ("--out" "$(inputs.graph.nameroot).og")
           #:outputs (unchopped-graph
                      #:binding ((glob . "$(inputs.graph.nameroot).og")))
           ;; Request as many cores as threads, like the other
           ;; multi-threaded commands in this file.
           #:other ((hints
                     (ResourceRequirement
                      (coresMin . "$(inputs.threads)"))
                     (SoftwareRequirement
                      (packages . #(((package . "odgi")))))))))

;; Run "odgi sort" with the "Ygs" pipeline on GRAPH using THREADS
;; threads, writing "<nameroot>.og".
(define odgi-sort
  (command #:inputs graph (threads #:type int)
           #:run "odgi" "sort" ("--pipeline" "Ygs") ("-t" threads)
           ("--idx" graph)
           ("--out" "$(inputs.graph.nameroot).og")
           #:outputs (sorted-graph
                      #:binding ((glob . "$(inputs.graph.nameroot).og")))
           ;; Request as many cores as threads, like the other
           ;; multi-threaded commands in this file.
           #:other ((hints
                     (ResourceRequirement
                      (coresMin . "$(inputs.threads)"))
                     (SoftwareRequirement
                      (packages . #(((package . "odgi")))))))))

;; Run "odgi stats" on GRAPH with MultiQC-oriented options.  stdout
;; is captured into "<nameroot>.stats.yaml".
(define odgi-stats
  (command #:inputs graph
           #:run "odgi" "stats" ("--idx" graph)
           "--multiqc"
           "--sum-path-nodes-distances"
           "--no-gap-links"
           "--penalize-different-orientation"
           "--mean-links-length"
           #:stdout "$(inputs.graph.nameroot).stats.yaml"
           #:outputs (stats #:type stdout)
           #:other ((hints
                     (SoftwareRequirement
                      (packages . #(((package . "odgi")))))))))

;; Render GRAPH to "<nameroot>.png" with "odgi viz".  The boolean
;; inputs select the colouring/rendering mode and all default to #f.
(define odgi-viz
  (command #:inputs
           graph
           (width #:type int #:default 1500)
           (height #:type int #:default 500)
           (path-height #:type int #:default 10)
           (ignore-prefix #:type string)
           (white-to-black #:type boolean #:default #f)
           (change-darkness #:type boolean #:default #f)
           (color-by-mean-depth #:type boolean #:default #f)
           (color-by-mean-inversion-rate #:type boolean #:default #f)
           (compressed-mode #:type boolean #:default #f)
           (color-by-uncalled-bases #:type boolean #:default #f)
           #:run "odgi" "viz"
           ("--width" width)
           ("--height" height)
           ;; Forward path-height so that the declared input (and its
           ;; default) actually reaches odgi.
           ("--path-height" path-height)
           ("--ignore-prefix" ignore-prefix)
           ("--white-to-black" white-to-black)
           ("--change-darkness" change-darkness)
           ("--color-by-mean-depth" color-by-mean-depth)
           ("--color-by-mean-inversion-rate" color-by-mean-inversion-rate)
           ("--compressed-mode" compressed-mode)
           ("--color-by-uncalled-bases" color-by-uncalled-bases)
           ("--idx" graph)
           ("--out" "$(inputs.graph.nameroot).png")
           #:outputs (visualization
                      #:binding ((glob . "$(inputs.graph.nameroot).png")))
           #:other ((hints
                     (SoftwareRequirement
                      (packages . #(((package . "odgi")))))))))

;; Run "odgi layout" on GRAPH using THREADS threads, writing
;; "<nameroot>.lay".
(define odgi-layout
  (command #:inputs graph (threads #:type int)
           #:run "odgi" "layout" ("--threads" threads) ("--idx" graph)
           ("--out" "$(inputs.graph.nameroot).lay")
           #:outputs (layout
                      #:binding ((glob . "$(inputs.graph.nameroot).lay")))
           ;; Request as many cores as threads, like the other
           ;; multi-threaded commands in this file.
           #:other ((hints
                     (ResourceRequirement
                      (coresMin . "$(inputs.threads)"))
                     (SoftwareRequirement
                      (packages . #(((package . "odgi")))))))))

;; Draw GRAPH using the coordinates in LAYOUT with "odgi draw",
;; writing "<nameroot>.png".
(define odgi-draw
  (command #:inputs
           graph
           layout
           (height #:type int #:default 1000)
           (color-paths #:type boolean #:default #f)
           (line-width #:type int #:default 10)
           #:run "odgi" "draw"
           ("--idx" graph)
           ("--coords-in" layout)
           ("--png" "$(inputs.graph.nameroot).png")
           ("--png-height" height)
           ("--color-paths" color-paths)
           ("--line-width" line-width)
           #:outputs (visualization-2d
                      #:binding ((glob . "$(inputs.graph.nameroot).png")))
           #:other ((hints
                     (SoftwareRequirement
                      (packages . #(((package . "odgi")))))))))

;; Chain gfaffix, odgi-build, odgi-unchop and odgi-sort on GRAPH and
;; expose the sorted result as reduced-graph.
(define reduce-redundancy
  (workflow (graph (threads #:type int))
    (pipe (gfaffix #:graph graph)
          (odgi-build #:graph fixed-graph #:threads threads)
          (odgi-unchop #:graph odgi-graph #:threads threads)
          (odgi-sort #:graph unchopped-graph #:threads threads)
          (rename #:reduced-graph sorted-graph))))

;; Render GRAPH with odgi-viz in six different modes.  Each branch
;; renames its output so that the branches do not clash.
(define visualize
  (workflow (graph (ignore-prefix #:type string))
    (tee (pipe (odgi-viz (default-visualize)
                         #:graph graph
                         #:ignore-prefix ignore-prefix)
               (rename #:default-visualization visualization))
         (pipe (odgi-viz (visualize-by-nucleotide-position)
                         #:graph graph
                         #:ignore-prefix ignore-prefix
                         #:white-to-black #t
                         #:change-darkness #t)
               (rename #:visualization-by-nucleotide-position visualization))
         (pipe (odgi-viz (visualize-by-mean-depth)
                         #:graph graph
                         #:ignore-prefix ignore-prefix
                         #:color-by-mean-depth #t)
               (rename #:visualization-by-mean-depth visualization))
         (pipe (odgi-viz (visualize-by-mean-inversion-rate)
                         #:graph graph
                         #:ignore-prefix ignore-prefix
                         #:color-by-mean-inversion-rate #t)
               (rename #:visualization-by-mean-inversion-rate visualization))
         (pipe (odgi-viz (visualize-with-compressed-mode)
                         #:graph graph
                         #:ignore-prefix ignore-prefix
                         #:compressed-mode #t)
               (rename #:compressed-mode-visualization visualization))
         (pipe (odgi-viz (visualize-by-uncalled-bases)
                         #:graph graph
                         #:ignore-prefix ignore-prefix
                         #:color-by-uncalled-bases #t)
               (rename #:visualization-by-uncalled-bases visualization)))))

;; Compute a layout of GRAPH once, then draw it twice: with the
;; default settings, and with coloured paths and a line width of 20.
(define visualize-2d
  (workflow (graph (threads #:type int))
    (pipe (odgi-layout #:graph graph #:threads threads)
          (tee (pipe (odgi-draw (default-visualize-2d)
                                #:graph graph
                                #:layout layout)
                     (rename #:default-visualization-2d visualization-2d))
               (pipe (odgi-draw (visualize-2d-with-paths)
                                #:graph graph
                                #:layout layout
                                #:color-paths #t
                                #:line-width 20)
                     (rename #:visualization-2d-with-paths visualization-2d))))))

;; Top-level workflow: compress and index the input sequences, map
;; them approximately with wfmash, split the mappings into chunks and
;; align each chunk, induce the graph with seqwish, then smooth,
;; reduce, summarize and visualize it.  Statistics are also computed
;; for the unsmoothed graph.
(workflow ((sequences #:type File)
           (number-of-haplotypes #:type int
                                 #:label "number of haplotypes")
           ;; wfmash
           (wfmash-chunks #:type int
                          #:default 1
                          #:label "number of jobs to split wfmash alignment into")
           (segment-length #:type int
                           #:default 1000
                           #:label "segment seed length for mapping")
           (block-length #:type #(int null)
                         #:label "keep merged mappings supported by homologies of this total length (default: 5*segment_length)")
           (map-percent-identity #:type int
                                 #:default 90
                                 #:label "percent identity in the mashmap step")
           (num-mappings-for-segment #:type int
                                     #:default 1
                                     #:label "number of mappings to retain for each query/reference pair")
           (no-split #:type boolean
                     #:default #f
                     #:label "disable splitting of input sequences during mapping")
           (sparsify-mappings #:type #(float null)
                              #:label "keep this fraction of mappings")
           (kmer-size #:type int
                      #:default 15
                      #:label "kmer size")
           (kmer-threshold #:type float
                           #:default 0.001
                           #:label "ignore the top % most-frequent kmers")
           (exclude-delimiter #:type string
                              #:default "#"
                              #:label "skip mappings when the query and target have the same prefix before the last occurrence of the given character C (default assumes PanSN-spec)")
           (hg-filter-ani-diff #:type float
                               #:default 0.0
                               #:label "filter out mappings unlikely to be this ANI less than the best mapping")
           (no-merge #:type boolean
                     #:default #f
                     #:label "don't merge consecutive segment-level mappings")
           ;; seqwish
           (min-match-length #:type int
                             #:default 23
                             #:label "filter exact matches below this length")
           (sparse-factor #:type float
                          #:default 0
                          #:label "sparsify input matches, keeping this fraction that minimizes a hash function")
           (transclose-batch #:type string
                             #:default "1M"
                             #:label "number of base pairs to use for transitive closure batch")
           ;; smoothxg
           (consensus-prefix #:type string
                             #:default "Consensus_"
                             #:label "prefix of consensus path names")
           (maximum-path-jump #:type string
                              #:default "0"
                              #:label "maximum path jump to include in block")
           (maximum-edge-jump #:type string
                              #:default "0"
                              #:label "maximum edge jump before breaking")
           (abpoa #:type boolean
                  #:default #f
                  #:label "run abPOA instead of SPOA for smoothxg")
           (global-poa #:type boolean
                       #:default #f
                       #:label "run global POA alignment")
           (poa-target-lengths #:type (array string)
                               #:default #("700" "1100")
                               #:label "POA target lengths; blocks are split when paths go over this length")
           (poa-params #:type string
                       #:default "asm20"
                       #:label "POA score parameters")
           (poa-padding-ratio #:type float
                              #:default 0.001
                              #:label "flanking sequence length fraction to pad end of each sequence with")
           (poa-maximum-pad-depth #:type int
                                  #:default 100
                                  #:label "depth/haplotype at which we don't pad the POA problem")
           (threads #:type int
                    #:label "number of threads"))
  (pipe (bgzip #:fasta sequences #:threads threads)
        (samtools-faidx #:compressed_fasta compressed-fasta)
        (tee (pipe (wfmash-approximate
                    #:sequences indexed-fasta
                    #:number_of_haplotypes number-of-haplotypes
                    #:segment_length segment-length
                    #:block_length block-length
                    #:map-percent-identity map-percent-identity
                    #:num-mappings-for-segment num-mappings-for-segment
                    #:no-split no-split
                    #:sparsify_mappings sparsify-mappings
                    #:kmer-size kmer-size
                    #:kmer-threshold kmer-threshold
                    #:exclude-delimiter exclude-delimiter
                    #:hg-filter-ani-diff hg-filter-ani-diff
                    #:no-merge no-merge
                    #:threads threads)
                   (split-pairwise-alignment
                    #:pairwise_alignment approximate-pairwise-alignment
                    #:chunks wfmash-chunks))
             (identity))
        ;; Align each chunk of approximate mappings in its own job.
        (scatter (wfmash
                  #:sequences indexed-fasta
                  #:number_of_haplotypes number-of-haplotypes
                  #:segment_length segment-length
                  #:block_length block-length
                  #:map-percent-identity map-percent-identity
                  #:num-mappings-for-segment num-mappings-for-segment
                  #:no-split no-split
                  #:sparsify_mappings sparsify-mappings
                  #:kmer-size kmer-size
                  #:kmer-threshold kmer-threshold
                  #:exclude-delimiter exclude-delimiter
                  #:hg-filter-ani-diff hg-filter-ani-diff
                  #:no-merge no-merge
                  #:threads threads)
                 #:input-paf pairwise-alignment-chunks)
        (seqwish #:sequences sequences
                 #:pairwise-alignments pairwise-alignment
                 #:min-match-length min-match-length
                 #:sparse-factor sparse-factor
                 #:transclose-batch transclose-batch
                 #:threads threads)
        (rename #:unsmoothed-pangenome variation-graph)
        (tee (pipe (smoothxg
                    #:graph unsmoothed-pangenome
                    #:number_of_haplotypes number-of-haplotypes
                    #:consensus-prefix consensus-prefix
                    #:map_percent_identity map-percent-identity
                    #:maximum-path-jump maximum-path-jump
                    #:maximum-edge-jump maximum-edge-jump
                    #:abpoa abpoa
                    #:global-poa global-poa
                    #:poa-target-lengths poa-target-lengths
                    #:poa_params poa-params
                    #:poa-padding-ratio poa-padding-ratio
                    #:poa_maximum_pad_depth poa-maximum-pad-depth
                    #:threads threads)
                   (reduce-redundancy #:graph smoothed-graph
                                      #:threads threads)
                   (tee (pipe (odgi-stats (stats-for-graph)
                                          #:graph reduced-graph)
                              (rename #:graph-stats stats))
                        (visualize #:graph reduced-graph
                                   #:ignore-prefix consensus-prefix)
                        (visualize-2d #:graph reduced-graph
                                      #:threads threads)
                        (pipe (odgi-view #:graph reduced-graph)
                              (rename #:pangenome gfa-graph))))
             (pipe (odgi-build (build-unsmoothed-odgi-graph)
                               #:graph unsmoothed-pangenome
                               #:threads threads)
                   (odgi-stats (stats-for-unsmoothed-graph)
                               #:graph odgi-graph)
                   (rename #:unsmoothed-graph-stats stats))
             (identity))))