From 1b2312ffd5f30e9c0ff2e7dfb8bdead0c15052a3 Mon Sep 17 00:00:00 2001
From: Arun Isaac
Date: Thu, 9 Dec 2021 22:31:48 +0530
Subject: contrib: Add pggb workflow.

* contrib/pggb.scm: New file.
---
Reviewer revision: line breaks restored; smoothxg now passes -R, -m and
-o so that its declared outputs are actually written; explanatory
comments added.  The blob id on the index line below is that of the
original file and no longer matches this content.

 contrib/pggb.scm | 149 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 149 insertions(+)
 create mode 100644 contrib/pggb.scm

diff --git a/contrib/pggb.scm b/contrib/pggb.scm
new file mode 100644
index 0000000..1d73d2a
--- /dev/null
+++ b/contrib/pggb.scm
@@ -0,0 +1,149 @@
+;; An implementation of the pangenome graph builder shell script at
+;; https://github.com/pangenome/pggb in ccwl
+
+;; Run wfmash on the input sequences and capture its standard output
+;; as mapper-out.  input-fasta is passed twice as the two positional
+;; arguments (presumably target and query of an all-vs-all mapping).
+(define wfmash
+  (command #:inputs
+           (input-fasta #:label "input FASTA/FASTQ file")
+           (segment-length #:type int)
+           (block-length #:type int)
+           (no-merge-segments #:type boolean?)
+           (no-split #:type boolean?)
+           (map-pct-id #:type float)
+           (n-secondary #:type int)
+           (mash-kmer #:type int #:default 16)
+           (threads #:type int #:default 1)
+           #:run "wfmash" ("-s" segment-length) ("-l" block-length) ("-M" no-merge-segments)
+           ("-N" no-split) ("-p" map-pct-id) ("-n" n-secondary) ("-k" mash-kmer) ("-t" threads)
+           input-fasta input-fasta
+           #:outputs (mapper-out #:type stdout)))
+
+;; Run seqwish on the input sequences and the mappings in mapper-out,
+;; writing the graph to seqwish.gfa, which is collected as seqwish-out.
+(define seqwish
+  (command #:inputs
+           (input-fasta #:label "input FASTA/FASTQ file")
+           mapper-out
+           (min-match-len #:type int #:default 19)
+           (transclose-batch #:type int #:default 1000000)
+           (threads #:type int #:default 1)
+           #:run "seqwish" "-t" threads "-s" input-fasta "-p" mapper-out "-k" min-match-len
+           "-g" "seqwish.gfa" "-B" transclose-batch "-P"
+           #:outputs (seqwish-out #:binding '((glob . "seqwish.gfa")))))
+
+;; Run smoothxg on the seqwish graph.  The smoothed graph (-o) and the
+;; MAF alignment (-m) are written to the file names that the output
+;; globs below collect, and ratio-contain is forwarded as -R.
+;;
+;; NOTE(review): consensus-spec is still not forwarded to smoothxg.
+;; The smoothxg-consensus glob presumably needs a -C argument whose
+;; basename is "smoothxg_out.consensus" -- confirm against smoothxg.
+(define smoothxg
+  (command #:inputs
+           seqwish-out
+           (block-weight-max #:type int #:default 10000)
+           (block-id-min #:type int #:default 0)
+           (ratio-contain #:type int #:default 0)
+           (path-jump-max #:type int #:default 5000)
+           (edge-jump-max #:type int #:default 5000)
+           (poa-length-max #:type int #:default 10000)
+           (poa-params #:type string #:default "1,4,6,2,26,1")
+           (consensus-spec #:type string #:default "10,100,1000,10000")
+           (threads #:type int #:default 1)
+           #:run "smoothxg" ("-t" threads) ("-g" seqwish-out) ("-w" block-weight-max) "-M"
+           "-J" "0.7" "-K" "-G" "150" ("-I" block-id-min) ("-R" ratio-contain)
+           ("-j" path-jump-max) ("-e" edge-jump-max) ("-l" poa-length-max) ("-p" poa-params)
+           "-m" "smoothxg_out.maf" "-o" "smoothxg_out.gfa"
+           #:outputs
+           (smoothxg-graph #:binding '((glob . "smoothxg_out.gfa")))
+           (smoothxg-consensus #:binding '((glob . "smoothxg_out.consensus@10__y_0_1000000.gfa")))
+           (smoothxg-alignment #:binding '((glob . "smoothxg_out.maf")))))
+
+;; Run "odgi build" on the smoothed graph, writing odgi_out.og, which
+;; is collected as odgi-out.
+(define odgi-build
+  (command #:inputs
+           (threads #:type int #:default 1)
+           smoothxg-graph
+           #:run "odgi" "build" ("-t" threads) "-P" ("-g" smoothxg-graph) "-o" "odgi_out.og"
+           #:outputs (odgi-out #:binding '((glob . "odgi_out.og")))))
+
+;; The pggb workflow: pipe wfmash, seqwish, smoothxg and odgi-build
+;; together, each step consuming the output of the one before it.
+(workflow ((segment-length #:type int
+                           #:label "segment length for mapping")
+           (block-length #:type int
+                         #:label "minimum block length filter for mapping")
+           (no-merge-segments #:type boolean?
+                              #:label "do not merge successive mappings")
+           (no-split #:type boolean?
+                     #:label "disable splitting of input sequences during mapping")
+           (map-pct-id #:type float
+                       #:label "percent identity in the wfmash or edyeet mashmap step")
+           (n-secondary #:type int
+                        #:label "number of secondary mappings to retain in 'map' filter mode")
+           (mash-kmer #:type int
+                      #:label "kmer size for mashmap"
+                      #:default 16)
+           (input-fasta #:label "input FASTA/FASTQ file")
+           (min-match-len #:type int
+                          #:label "ignore exact matches below this length"
+                          #:default 19)
+           (transclose-batch #:type int
+                             #:label "number of bp to use for transitive closure batch"
+                             #:default 1000000)
+           (block-weight-max #:type int
+                             #:label "maximum seed sequence in block"
+                             #:default 10000)
+           (block-id-min #:type int
+                         #:label "split blocks into groups connected by this identity threshold"
+                         #:default 0)
+           (ratio-contain #:type int
+                          #:label "minimum short length / long length ratio to compare sequences for the containment metric in the clustering"
+                          #:default 0)
+           (path-jump-max #:type int
+                          #:label "maximum path jump to include in block"
+                          #:default 5000)
+           (edge-jump-max #:type int
+                          #:label "maximum edge jump before breaking"
+                          #:default 5000)
+           (poa-length-max #:type int
+                           #:label "maximum sequence length to put into POA"
+                           #:default 10000)
+           (poa-params #:type string
+                       #:label "score parameters for POA in the form of match,mismatch,gap1,ext1,gap2,ext2"
+                       #:default "1,4,6,2,26,1")
+           (consensus-spec #:type string
+                           #:label "consensus graph specification: write the consensus graph to BASENAME.cons_[spec].gfa; where each spec contains at least a min_len parameter (which defines the length of divergences from consensus paths to preserve in the output), optionally a file containing reference paths to preserve in the output, a flag (y/n) indicating whether we should also use the POA consensus paths, a minimum coverage of consensus paths to retain (min_cov), and a maximum allele length (max_len, defaults to 1e6); implies -a; example: cons,100,1000:refs1.txt:n,1000:refs2.txt:y:2.3:1000000,10000"
+                           #:default "10,100,1000,10000")
+           (threads #:type int
+                    #:label "number of compute threads to use in parallel steps"
+                    #:default 1))
+  (pipe (wfmash #:input-fasta input-fasta
+                #:segment-length segment-length
+                #:block-length block-length
+                #:no-merge-segments no-merge-segments
+                #:no-split no-split
+                #:map-pct-id map-pct-id
+                #:n-secondary n-secondary
+                #:mash-kmer mash-kmer
+                #:threads threads)
+        (seqwish #:input-fasta input-fasta
+                 #:mapper-out mapper-out
+                 #:min-match-len min-match-len
+                 #:transclose-batch transclose-batch
+                 #:threads threads)
+        (smoothxg #:seqwish-out seqwish-out
+                  #:block-weight-max block-weight-max
+                  #:block-id-min block-id-min
+                  #:ratio-contain ratio-contain
+                  #:path-jump-max path-jump-max
+                  #:edge-jump-max edge-jump-max
+                  #:poa-length-max poa-length-max
+                  #:poa-params poa-params
+                  #:consensus-spec consensus-spec
+                  #:threads threads)
+        (odgi-build #:smoothxg-graph smoothxg-graph
+                    #:threads threads)))
-- 
cgit v1.2.3