;; An implementation of the pangenome graph builder shell script at ;; https://github.com/pangenome/pggb in ccwl (define wfmash (command #:inputs (input-fasta #:label "input FASTA/FASTQ file") (segment-length #:type int) (block-length #:type int) (no-merge-segments #:type boolean?) (no-split #:type boolean?) (map-pct-id #:type float) (n-secondary #:type int) (mash-kmer #:type int #:default 16) (threads #:type int #:default 1) #:run "wfmash" ("-s" segment-length) ("-l" block-length) ("-M" no-merge-segments) ("-N" no-split) ("-p" map-pct-id) ("-n" n-secondary) ("-k" mash-kmer) ("-t" threads) input-fasta input-fasta #:outputs (mapper-out #:type stdout))) (define seqwish (command #:inputs (input-fasta #:label "input FASTA/FASTQ file") mapper-out (min-match-len #:type int #:default 19) (transclose-batch #:type int #:default 1000000) (threads #:type int #:default 1) #:run "seqwish" "-t" threads "-s" input-fasta "-p" mapper-out "-k" min-match-len "-g" "seqwish.gfa" "-B" transclose-batch "-P" #:outputs (seqwish-out #:binding '((glob . "seqwish.gfa"))))) (define smoothxg (command #:inputs seqwish-out (block-weight-max #:type int #:default 10000) (block-id-min #:type int #:default 0) (ratio-contain #:type int #:default 0) (path-jump-max #:type int #:default 5000) (edge-jump-max #:type int #:default 5000) (poa-length-max #:type int #:default 10000) (poa-params #:type string #:default "1,4,6,2,26,1") (consensus-spec #:type string #:default "10,100,1000,10000") (threads #:type int #:default 1) #:run "smoothxg" ("-t" threads) ("-g" seqwish-out) ("-w" block-weight-max) "-M" "-J" "0.7" "-K" "-G" "150" ("-I" block-id-min) ("-j" path-jump-max) ("-e" edge-jump-max) ("-l" poa-length-max) ("-p" poa-params) #:outputs (smoothxg-graph #:binding '((glob . "smoothxg_out.gfa"))) (smoothxg-consensus #:binding '((glob . "smoothxg_out.consensus@10__y_0_1000000.gfa"))) (smoothxg-alignment #:binding '((glob . "smoothxg_out.maf"))))) (define odgi-build (command #:inputs (threads #:type int #:default 1) smoothxg-graph #:run "odgi" "build" ("-t" threads) "-P" ("-g" smoothxg-graph) "-o" "odgi_out.og" #:outputs (odgi-out #:binding '((glob . "odgi_out.og"))))) (workflow ((segment-length #:type int #:label "segment length for mapping") (block-length #:type int #:label "minimum block length filter for mapping") (no-merge-segments #:type boolean? #:label "do not merge successive mappings") (no-split #:type boolean? #:label "disable splitting of input sequences during mapping") (map-pct-id #:type float #:label "percent identity in the wfmash or edyeet mashmap step") (n-secondary #:type int #:label "number of secondary mappings to retain in 'map' filter mode") (mash-kmer #:type int #:label "kmer size for mashmap" #:default 16) (input-fasta #:label "input FASTA/FASTQ file") (min-match-len #:type int #:label "ignore exact matches below this length" #:default 19) (transclose-batch #:type int #:label "number of bp to use for transitive closure batch" #:default 1000000) (block-weight-max #:type int #:label "maximum seed sequence in block" #:default 10000) (block-id-min #:type int #:label "split blocks into groups connected by this identity threshold" #:default 0) (ratio-contain #:type int #:label "minimum short length / long length ratio to compare sequences for the containment metric in the clustering" #:default 0) (path-jump-max #:type int #:label "maximum path jump to include in block" #:default 5000) (edge-jump-max #:type int #:label "maximum edge jump before breaking" #:default 5000) (poa-length-max #:type int #:label "maximum sequence length to put into POA" #:default 10000) (poa-params #:type string #:label "score parameters for POA in the form of match,mismatch,gap1,ext1,gap2,ext2" #:default "1,4,6,2,26,1") (consensus-spec #:type string #:label "consensus graph specification: write the consensus graph to BASENAME.cons_[spec].gfa; where each spec contains at least a min_len parameter (which defines the length of divergences from consensus paths to preserve in the output), optionally a file containing reference paths to preserve in the output, a flag (y/n) indicating whether we should also use the POA consensus paths, a minimum coverage of consensus paths to retain (min_cov), and a maximum allele length (max_len, defaults to 1e6); implies -a; example: cons,100,1000:refs1.txt:n,1000:refs2.txt:y:2.3:1000000,10000" #:default "10,100,1000,10000") (threads #:type int #:label "number of compute threads to use in parallel steps" #:default 1)) (pipe (wfmash #:input-fasta input-fasta #:segment-length segment-length #:block-length block-length #:no-merge-segments no-merge-segments #:no-split no-split #:map-pct-id map-pct-id #:n-secondary n-secondary #:mash-kmer mash-kmer #:threads threads) (seqwish #:input-fasta input-fasta #:mapper-out mapper-out #:min-match-len min-match-len #:transclose-batch transclose-batch #:threads threads) (smoothxg #:seqwish-out seqwish-out #:block-weight-max block-weight-max #:block-id-min block-id-min #:ratio-contain ratio-contain #:path-jump-max path-jump-max #:edge-jump-max edge-jump-max #:poa-length-max poa-length-max #:poa-params poa-params #:consensus-spec consensus-spec #:threads threads) (odgi-build #:smoothxg-graph smoothxg-graph #:threads threads)))