summaryrefslogtreecommitdiff
path: root/doc/ccwl.skb
blob: 57fa225e19e51da6964fd0b841f42c2fd70e11a8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
;;; ccwl --- Concise Common Workflow Language
;;; Copyright © 2021 Arun Isaac <arunisaac@systemreboot.net>
;;;
;;; This file is part of ccwl.
;;;
;;; ccwl is free software: you can redistribute it and/or modify it
;;; under the terms of the GNU General Public License as published by
;;; the Free Software Foundation, either version 3 of the License, or
;;; (at your option) any later version.
;;;
;;; ccwl is distributed in the hope that it will be useful, but
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;;; General Public License for more details.
;;;
;;; You should have received a copy of the GNU General Public License
;;; along with ccwl.  If not, see <https://www.gnu.org/licenses/>.

(use-modules (skribilo lib)
             (doc skribilo))

(document :title [Concise Common Workflow Language]
  (chapter :title [Introduction]
    (p [,(abbr :short "CWL" :long "Common Workflow
Language") is an open standard for describing analysis workflows and
tools in a way that makes them portable and scalable across a variety
of software and hardware environments.])
    (p [,(abbr :short "ccwl" :long "Concise Common
Workflow Language") is a concise syntax to express CWL workflows. It
is implemented as an ,(abbr :short "EDSL" :long "Embedded Domain
Specific Language") in the Scheme programming language, a minimalist
dialect of the Lisp family of programming languages.])
    (p [ccwl is a compiler to generate CWL workflows
from concise descriptions in ccwl. In the future, ccwl will also have
a runtime whereby users can interactively execute workflows while
developing them.]))
  (chapter :title [Tutorial]
    (p [This tutorial will introduce you to writing
workflows in ccwl. Some knowledge of CWL is assumed. To learn about
CWL, please see the ,(ref :url "https://www.commonwl.org/user_guide/"
:text "Common Workflow Language User Guide").])

    (section :title [Important concepts]
      (p [The CWL and ccwl workflow languages
are statically typed programming languages where functions accept
multiple named inputs and return multiple named outputs. Let's break
down what that means.])
      (subsection :title [Static typing]
        (p [In CWL, the type of arguments accepted by a function and
the type of outputs returned by that function are specified explicitly
by the programmer, and are known at compile time even before the code
has been run. Hence, we say that it is statically typed.]))
      (subsection :title [Positional arguments and named arguments]
        (p [In many languages, the order of arguments passed to a
function is significant. The position of each argument determines
which formal argument it gets mapped to. For example, passing
positional arguments in Scheme looks like ,(code "(foo 1
2)"). However, in a language that supports named arguments (say,
Scheme or Python), the order of arguments is not significant. Each
argument explicitly names the formal argument it gets mapped to. For
example, in Scheme, passing named arguments may look like ,(code "(foo
#:bar 1 #:baz 2)") and is equivalent to ,(code "(foo #:baz 2 #:bar
1)"). Likewise, in Python, passing named arguments looks like
,(code "foo(bar=1, baz=2)") and is equivalent to ,(code "foo(baz=2,
bar=1)").]))
      (subsection :title [Multiple function arguments and return values]
        (p [In most languages, functions accept multiple input
arguments but only return a single output value. However, in CWL, a
function can return multiple output values as well. These multiple
outputs are unordered and are each addressed by a unique name.])))

    (section :title [First example]
      (p [As is tradition, let us start with a simple ,(emph "Hello
World") workflow in ccwl. This workflow accepts a string input and
prints that string.])

      (scheme-source "doc/hello-world.scm")

      (p [The first form in this code defines the ,(code "print")
command. This form is the equivalent of defining a
,(code "CommandLineTool") class workflow in CWL. The arguments after
,(code "#:inputs") define the inputs to the workflow. The arguments
after ,(code "#:run") specify the command that will be run. The input
,(code "(message #:type 'string)") defines a ,(code "string") type
input named ,(code "message"). The command defined in the
,(code "#:run") argument is the command itself followed by a list of
command arguments. One of the arguments references the input
,(code "message"). Notice how the command definition is very close to
a shell command, only that it is slightly annotated with inputs and
their types.])

      (p [The second form describes the actual workflow and is the
equivalent of defining a ,(code "Workflow") class workflow in CWL. The
form ,(code "((message #:type string))") specifies the inputs of the
workflow. In this case, there is only one input---,(code "message") of
type ,(code "string"). The body of the workflow specifies the commands
that will be executed. The body of this workflow executes only a
single command---the ,(code "print") command---passing the
,(code "message") input of the workflow as the ,(code "message") input
to the ,(code "print") command.])

      (p [If this workflow is written to a file
,(file "hello-world.scm"), we may compile it to CWL by running])

      (prog :line #f [$ ccwl compile hello-world.scm])

      (p [This prints a big chunk of generated CWL to standard
output. We have achieved quite a lot of concision already! We write
the generated CWL to a file and execute it using ,(command "cwltool")
as follows. The expected output is also shown.])

      (prog :line #f (source :file "doc/hello-world.out")))

    (section :title [Capturing the standard output stream of a command]
      (p [Let us return to the ,(emph "Hello World") example in the
previous section. But now, let us capture the standard output of the
,(code "print") command in an output object. The ccwl code is the same
as earlier with only the addition of an ,(code "stdout") type output
object to the command definition.])

      (scheme-source "doc/capture-stdout.scm")

      (p [Let's write this code to a file
,(file "capture-stdout.scm"), generate CWL, write the generated CWL to
,(file "capture-stdout.cwl"), and run it using ,(code "cwltool"). We
might expect something like the output below. Notice how the standard
output of the ,(code "print") command has been captured in the file
,(file "51fe79d15e7790a9ded795304220d7a44aa84b48").])

      (prog :line #f (source :file "doc/capture-stdout.out")))

    (section :title [Capturing output files]
      (p [In the previous section, we captured the standard output
stream of a command. But, how do we capture any output files created
by a command? Let us see.])

      (p [Consider a tar archive ,(file "hello.tar") containing a file
,(file "hello.txt").])

      (prog :line #f (source :file "doc/hello.tar.out"))

      (p [Let us write a workflow to extract the file
,(file "hello.txt") from the archive. Everything in the following
workflow except the ,(code "#:binding") parameter will already be
familiar to you. The ,(code "#:binding") parameter sets the
,(code "outputBinding") field in the generated CWL. In the example
below, we set the ,(code "glob") field to look for a file named
,(file "hello.txt").])

      (scheme-source "doc/capture-output-file.scm")

      (p [Writing this workflow to ,(file "capture-output-file.scm"),
compiling and running it gives us the following output. Notice that
the file ,(file "hello.txt") has been captured and is now present in
our current working directory.])

      (prog :line #f (source :file "doc/capture-output-file.out"))

      (p [The above workflow is not awfully flexible. The name of the
file to extract is hardcoded into the workflow. Let us modify the
workflow to accept the name of the file to extract. We introduce
,(code "extractfile"), a ,(code "string") type input that is passed to
,(command "tar") and is referenced in the ,(code "glob") field.])

      (scheme-source "doc/capture-output-file-with-parameter-reference.scm")

      (p [Compiling and running this workflow gives us the following
output.])

      (prog :line #f (source :file "doc/capture-output-file-with-parameter-reference.out")))

    (section :title [Passing input into the standard input stream of a command]
      (p [Some commands read input from their standard input
stream. Let us do that from ccwl. The workflow below reports the size
of a file by passing it into the standard input of
,(command "wc"). Notice the additional ,(code "#:stdin") keyword that
references the input ,(code "file").])

      (scheme-source "doc/pass-stdin.scm")

      (p [Compiling and running this workflow gives us the following
output. Notice the file ,(file "hello.txt") passed into the standard
input of ,(command "wc"), and the file size reported in bytes.])

      (prog :line #f (source :file "doc/pass-stdin.out")))

    (section :title [Workflow with multiple steps]
      (p [Till now, we have only written trivial workflows with a
single command. If we were only interested in executing single
commands, we would hardly need a workflow language! So, in this
section, let us write our first multi-step workflow and learn how to
connect steps together in an arbitrary topology.])

      (subsection :title [pipe]
        (p [First, the simplest of topologies---a linear chain
representing sequential execution of steps. The following workflow
decompresses a compressed C source file, compiles and then executes
it.])

        (scheme-source "doc/decompress-compile-run.scm")

        (p [Notice the
,(source-ref "ccwl/ccwl.scm" "\\(\\(pipe" (code "pipe")) form in the
body of the workflow. The ,(code "pipe") form specifies a list of
steps to be executed sequentially. The inputs coming into
,(code "pipe") are passed into the first step. Thereafter, the outputs
of each step are passed as inputs into the next. Note that this has
nothing to do with the Unix pipe. The inputs/outputs passed between
steps are general CWL inputs/outputs. They need not be the standard
stdin and stdout streams.])

        (image :file "doc/decompress-compile-run.png")

        (p [Writing this workflow to
,(file "decompress-compile-run.scm"), compiling and running it with
the compressed C source file ,(file "hello.c.gz") gives us the
following output.])

        (prog :line #f (source :file "doc/decompress-compile-run.out"))

        (p [The steps run in succession, and the stdout of the
compiled executable is in
,(file "c32c587f7afbdf87cf991c14a43edecf09cd93bf"). Success!]))

      (subsection :title [tee]
        (p [Next, the tee topology. The following workflow computes
three different checksums of a given input file.])

        (scheme-source "doc/checksum.scm")

        (p [Notice the
,(source-ref "ccwl/ccwl.scm" "\\(\\(tee" (code "tee")) form in the
body of the workflow. The ,(code "tee") form specifies a list of steps
that are independent of each other. The inputs coming into
,(code "tee") are passed into every step contained in the body of the
,(code "tee"). The outputs of each step are collected together and
unioned as the output of the ,(code "tee").])

        (image :file "doc/checksum.png")

        (p [Writing this workflow to ,(file "checksum.scm"), compiling
and running it with some file ,(file "hello.txt") gives us the
following output.])

        (prog :line #f (source :file "doc/checksum.out"))

        (p [The MD5, SHA1 and SHA256 checksums are in the files
,(file "112be1054505027982e64d56b0879049c12737c6"),
,(file "d2f19c786fcd3feb329004c8747803fba581a02d") and
,(file "0d2eaa5619c14b43326101200d0f27b0d8a1a4b1") respectively.]))))

  (chapter :title [Contributing]
    (p [ccwl is developed on GitHub at ,(ref
:url "https://github.com/arunisaac/ccwl"). Feedback, suggestions,
feature requests, bug reports and pull requests are all
welcome. Unclear and unspecific error messages are considered a
bug. Do report them!])))