summaryrefslogtreecommitdiff
path: root/doc/ccwl.skb
blob: cb78fd52036184e0534e4c4a25cb80cfd31b4762 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
;;; ccwl --- Concise Common Workflow Language
;;; Copyright © 2021, 2023 Arun Isaac <arunisaac@systemreboot.net>
;;;
;;; This file is part of ccwl.
;;;
;;; ccwl is free software: you can redistribute it and/or modify it
;;; under the terms of the GNU General Public License as published by
;;; the Free Software Foundation, either version 3 of the License, or
;;; (at your option) any later version.
;;;
;;; ccwl is distributed in the hope that it will be useful, but
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;;; General Public License for more details.
;;;
;;; You should have received a copy of the GNU General Public License
;;; along with ccwl.  If not, see <https://www.gnu.org/licenses/>.

(use-modules (skribilo lib)
             (doc skribilo))

(document :title [Concise Common Workflow Language]
  (toc)
  (chapter :title [Introduction]
           :ident "chapter-introduction"
    (p [,(abbr :short "CWL" :long "Common Workflow
Language") is an open standard for describing analysis workflows and
tools in a way that makes them portable and scalable across a variety
of software and hardware environments.])
    (p [,(abbr :short "ccwl" :long "Concise Common
Workflow Language") is a concise syntax to express CWL workflows. It
is implemented as an ,(abbr :short "EDSL" :long "Embedded Domain
Specific Language") in the Scheme programming language, a minimalist
dialect of the Lisp family of programming languages.])
    (p [ccwl is a compiler to generate CWL workflows
from concise descriptions in ccwl. In the future, ccwl will also have
a runtime whereby users can interactively execute workflows while
developing them.]))
  (chapter :title [Tutorial]
           :ident "chapter-tutorial"
    (p [This tutorial will introduce you to writing
workflows in ccwl. Some knowledge of CWL is assumed. To learn about
CWL, please see the ,(ref :url "https://www.commonwl.org/user_guide/"
:text "Common Workflow Language User Guide").])

    (section :title [Important concepts]
             :ident "section-important-concepts"
      (p [The CWL and ccwl workflow languages
are statically typed programming languages where functions accept
multiple named inputs and return multiple named outputs. Let's break
down what that means.])
      (subsection :title [Static typing]
        (p [In CWL, the type of arguments accepted by a function and
the type of outputs returned by that function are specified explicitly
by the programmer, and are known at compile time even before the code
has been run. Hence, we say that it is statically typed.]))
      (subsection :title [Positional arguments and named arguments]
        (p [In many languages, the order of arguments passed to a
function is significant. The position of each argument determines
which formal argument it gets mapped to. For example, passing
positional arguments in Scheme looks like ,(code "(foo 1
2)"). However, in a language that supports named arguments (say,
Scheme or Python), the order of arguments is not significant. Each
argument explicitly names the formal argument it gets mapped to. For
example, in Scheme, passing named arguments may look like ,(code "(foo
#:bar 1 #:baz 2)") and is equivalent to ,(code "(foo #:baz 2 #:bar
1)"). Likewise, in Python, passing named arguments looks like
,(code "foo(bar=1, baz=2)") and is equivalent to ,(code "foo(baz=2,
bar=1)").]))
      (subsection :title [Multiple function arguments and return values]
        (p [In most languages, functions accept multiple input
arguments but only return a single output value. However, in CWL, a
function can return multiple output values as well. These multiple
outputs are unordered and are each addressed by a unique name.])))

    (section :title [First example]
             :ident "section-first-example"
      (p [As is tradition, let us start with a simple ,(emph "Hello
World") workflow in ccwl. This workflow accepts a string input and
prints that string.])

      (scheme-source "doc/hello-world.scm")

      (p [The first form in this code defines the ,(code "print")
command. This form is the equivalent of defining a
,(code "CommandLineTool") class workflow in CWL. The arguments after
,(code "#:inputs") define the inputs to the workflow. The arguments
after ,(code "#:run") specify the command that will be run. The input
,(code "(message #:type 'string)") defines a ,(code "string") type
input named ,(code "message"). The command defined in the
,(code "#:run") argument is the command itself followed by a list of
command arguments. One of the arguments references the input
,(code "message"). Notice how the command definition is very close to
a shell command, only that it is slightly annotated with inputs and
their types.])

      (p [The second form describes the actual workflow and is the
equivalent of defining a ,(code "Workflow") class workflow in CWL. The
form ,(code "((message #:type string))") specifies the inputs of the
workflow. In this case, there is only one input---,(code "message") of
type ,(code "string"). The body of the workflow specifies the commands
that will be executed. The body of this workflow executes only a
single command---the ,(code "print") command---passing the
,(code "message") input of the workflow as the ,(code "message") input
to the ,(code "print") command.])

      (p [If this workflow is written to a file
,(file "hello-world.scm"), we may compile it to CWL by running])

      (prog :line #f [$ ccwl compile hello-world.scm])

      (p [This prints a big chunk of generated CWL to standard
output. We have achieved quite a lot of concision already! We write
the generated CWL to a file and execute it using ,(command "cwltool")
as follows. The expected output is also shown.])

      (prog :line #f (source :file "doc/hello-world.out")))

    (section :title [Capturing the standard output stream of a command]
             :ident "section-capturing-stdout"
      (p [Let us return to the ,(emph "Hello World") example in the
previous section. But now, let us capture the standard output of the
,(code "print") command in an output object. The ccwl code is the same
as earlier with the addition of an ,(code "stdout") type output object
and an ,(code "#:stdout") parameter specifying the name of the file to
capture standard output in.])

      (scheme-source "doc/capture-stdout.scm")

      (p [Let's write this code to a file
,(file "capture-stdout.scm"), generate CWL, write the generated CWL to
,(file "capture-stdout.cwl"), and run it using ,(code "cwltool"). We
might expect something like the output below. Notice how the standard
output of the ,(code "print") command has been captured in the file
,(file "printed-message-output.txt").])

      (prog :line #f (source :file "doc/capture-stdout.out")))

    (section :title [Capturing output files]
             :ident "section-capturing-output-files"
      (p [In the previous section, we captured the standard output
stream of a command. But, how do we capture any output files created
by a command? Let us see.])

      (p [Consider a tar archive ,(file "hello.tar") containing a file
,(file "hello.txt").])

      (prog :line #f (source :file "doc/hello.tar.out"))

      (p [Let us write a workflow to extract the file
,(file "hello.txt") from the archive. Everything in the following
workflow except the ,(code "#:binding") parameter will already be
familiar to you. The ,(code "#:binding") parameter sets the
,(code "outputBinding") field in the generated CWL. In the example
below, we set the ,(code "glob") field to look for a file named
,(file "hello.txt").])

      (scheme-source "doc/capture-output-file.scm")

      (p [Writing this workflow to ,(file "capture-output-file.scm"),
compiling and running it gives us the following output. Notice that
the file ,(file "hello.txt") has been captured and is now present in
our current working directory.])

      (prog :line #f (source :file "doc/capture-output-file.out"))

      (p [The above workflow is not awfully flexible. The name of the
file to extract is hardcoded into the workflow. Let us modify the
workflow to accept the name of the file to extract. We introduce
,(code "extractfile"), a ,(code "string") type input that is passed to
,(command "tar") and is referenced in the ,(code "glob") field.])

      (scheme-source "doc/capture-output-file-with-parameter-reference.scm")

      (p [Compiling and running this workflow gives us the following
output.])

      (prog :line #f (source :file "doc/capture-output-file-with-parameter-reference.out")))

    (section :title [Passing input into the standard input stream of a command]
             :ident "section-passing-input-into-stdin"
      (p [Some commands read input from their standard input
stream. Let us do that from ccwl. The workflow below reports the size
of a file by passing it into the standard input of
,(command "wc"). Notice the additional ,(code "#:stdin") keyword that
references the input ,(code "file").])

      (scheme-source "doc/pass-stdin.scm")

      (p [Compiling and running this workflow gives us the following
output. Notice the file ,(file "hello.txt") passed into the standard
input of ,(command "wc"), and the file size reported in bytes.])

      (prog :line #f (source :file "doc/pass-stdin.out")))

    (section :title [Workflow with multiple steps]
             :ident "section-workflow-with-multiple-steps"
      (p [Till now, we have only written trivial workflows with a
single command. If we were only interested in executing single
commands, we would hardly need a workflow language! So, in this
section, let us write our first multi-step workflow and learn how to
connect steps together in an arbitrary topology.])

      (subsection :title [pipe]
        (p [First, the simplest of topologies---a linear chain
representing sequential execution of steps. The following workflow
decompresses a compressed C source file, compiles and then executes
it.])

        (scheme-source "doc/decompress-compile-run.scm")

        (p [Notice the
,(source-ref "ccwl/ccwl.scm" "\\(\\(pipe" (code "pipe")) form in the
body of the workflow. The ,(code "pipe") form specifies a list of
steps to be executed sequentially. The inputs coming into
,(code "pipe") are passed into the first step. Thereafter, the outputs
of each step are passed as inputs into the next. Note that this has
nothing to do with the Unix pipe. The inputs/outputs passed between
steps are general CWL inputs/outputs. They need not be the standard
stdin and stdout streams.])

        (image :file "doc/decompress-compile-run.png")

        (p [Writing this workflow to
,(file "decompress-compile-run.scm"), compiling and running it with
the compressed C source file ,(file "hello.c.gz") gives us the
following output.])

        (prog :line #f (source :file "doc/decompress-compile-run.out"))

        (p [The steps run in succession, and the stdout of the
compiled executable is in ,(file "run-output.txt"). Success!]))

      (subsection :title [tee]
        (p [Next, the tee topology. The following workflow computes
three different checksums of a given input file.])

        (scheme-source "doc/checksum.scm")

        (p [Notice the
,(source-ref "ccwl/ccwl.scm" "\\(\\(tee" (code "tee")) form in the
body of the workflow. The ,(code "tee") form specifies a list of steps
that are independent of each other. The inputs coming into
,(code "tee") are passed into every step contained in the body of the
,(code "tee"). The outputs of each step are collected together and
unioned as the output of the ,(code "tee").])

        (image :file "doc/checksum.png")

        (p [Writing this workflow to ,(file "checksum.scm"), compiling
and running it with some file ,(file "hello.txt") gives us the
following output.])

        (prog :line #f (source :file "doc/checksum.out"))

        (p [The MD5, SHA1 and SHA256 checksums are in the files ,(file
"md5"), ,(file "sha1") and ,(file "sha256") respectively.])))

    (section :title [Let's write a spell check workflow]
             :ident "section-spell-check-workflow"
      (p [Finally, let's put together a complex workflow to understand
how everything fits together. The workflow we will be attempting is a
spell check workflow inspired by the founders of Unix,(footnote
["UNIX: Making Computers Easier to Use" has a ,(ref
:url "https://www.youtube.com/watch?v=XvDZLjaCJuw&t=315"
:text "section where Brian Kernighan writes a spell check system using
pipes")]) and by dgsh,(footnote [dgsh, a shell supporting general
directed graph pipelines, has a ,(ref
:url "https://www.spinellis.gr/sw/dgsh/#spell-highlight" :text "spell
check example").]). The workflow is pictured below. Let's start by
coding each of the steps required by the workflow.])

      (image :file "doc/spell-check.png")

      (p [The first command, ,(code "split-words"), splits up the
input text into words, one per line. It does this by invoking the
,(command "tr") command to replace anything that is not an alphabetic
character with a newline. In addition, it uses the
,(code "--squeeze-repeats") flag to prevent blank lines from appearing
in its output. Notice that no type is specified for the input
,(code "text"). When no type is specified, ccwl assumes a
,(code "File") type.]
         (scheme-source-form "doc/spell-check.scm" "\\(define split-words"))

      (p [We want our spell check to be case-insensitive. So, we
downcase all words. This is achieved using another invocation of the
,(command "tr") command.]
         (scheme-source-form "doc/spell-check.scm" "\\(define downcase"))

      (p [For easy comparison against a dictionary, we want both our
words and our dictionary sorted and deduplicated. We achieve this by
invoking the ,(command "sort") command with the ,(code "--unique")
flag.]
         (scheme-source-form "doc/spell-check.scm" "\\(define sort"))

      (p [Finally, we compare the sorted word list with the sorted
dictionary to identify the misspellings. We do this using the
,(command "comm") command.]
         (scheme-source-form "doc/spell-check.scm"
                             "\\(define find-misspellings"))

      (p [Now, let's wire up the workflow. First, we assemble the
,(code "split-words")-,(code "downcase")-,(code "sort-words") arm of
the workflow. This arm is just a linear chain that can be assembled
using ,(code "pipe"). We will need to invoke the ,(code "sort")
command twice in our workflow. To distinguish the two invocations, CWL
requires us to specify a unique step id for each invocation. We do
this using the second element, ,(code "(sort-words)"). To avoid name
conflicts, we also need to rename the output of the ,(code "sort")
command. The last step, ,(source-ref "ccwl/ccwl.scm"
"\\(\\(rename new-key old-key\\)" (code "rename")), a special ccwl
construct, is used to achieve this. In this case, it renames the
,(code "sorted") output of the ,(code "sort") command into ,(code
"sorted-words").]
         (scheme-source "doc/spell-check-workflow-1.scm"))

      (p [Next, we assemble the ,(code "sort-dictionary") arm of the
workflow. This arm is just a single step. Then, we connect up both the
arms using a ,(code "tee"). Here too, we have a step id and renaming
of intermediate inputs/outputs.]
         (scheme-source "doc/spell-check-workflow-2.scm"))

      (p [And finally, we use the outputs of both the arms of the
workflow together in the ,(code "find-misspellings") step.]
         (scheme-source-form "doc/spell-check.scm" "\\(workflow"))

      (p [The complete workflow is as follows.]
         (scheme-source "doc/spell-check.scm"))

      (p [When compiled and run with a text file and a dictionary, the
misspelt words appear at the output.]
         (prog :line #f (source :file "doc/spell-check.out")))))

  (chapter :title [Cookbook]
           :ident "chapter-cookbook"
    (section :title [Stage input files]
             :ident "section-stage-input-files"
      (p [When running command-line tools, CWL normally has separate
directories for input and output files. But, some command-line tools
expect their input and output files to be in the same directory, and
this may not sit well with them. In such situations, we can tell CWL
to ,(emph "stage") the input file into the output directory. We may
express this in ccwl using the ,(code "#:stage?") parameter to the
inputs to be staged. Here is a rather concocted example.]
         (scheme-source "doc/staging-input-files.scm")))
    (section :title [Array types]
             :ident "section-array-types"
      (p [ccwl supports array types using the following syntax.]
         (scheme-source "doc/array-types.scm"))
      (p [Nested array types are also supported.]
         (scheme-source "doc/nested-array-types.scm")))
    (section :title [Array input item separators]
      :ident "section-array-input-item-separators"
      (p [Occasionally, it is required to serialize array type inputs
by separating them with a specific item separator. This can be
achieved by explicitly specifying a separator in the ,(code [#:run])
argument of ,(code [command]). For example, to use comma as the item
separator, you could do]
         (scheme-source "doc/array-input-item-separators.scm")
         [If ,(code "[foo, bar, aal, vel]") is passed in as ,(code
[messages]), then the command invoked is ,(samp "echo
foo,bar,aal,vel").]))
    (section :title [Scatter/gather]
             :ident "section-scatter-gather"
      (p [ccwl supports CWL's dotproduct scatter/gather feature using
the following syntax. Here, the ,(code [other-messages]) input to the
workflow is an array of strings that is scattered over the ,(code
[print]) step. Each run of the ,(code [print]) step gets an element of
,(code [other-messages]) as its ,(code [other-message]) argument.]
         (scheme-source "doc/scatter-gather.scm")))
    (section :title [Reuse external CWL workflows]
             :ident "section-reuse-external-cwl-workflows"
      (p [Even though you may be a ccwl convert (hurrah!), others may
not be. And, you might have to work with CWL workflows written by
others. ccwl permits easy reuse of external CWL workflows, and free
mixing with ccwl commands. Here is a workflow to find the string
length of a message, where one of the commands, ,(code "echo"), is
defined as an external CWL workflow. External CWL workflows are
referenced in ccwl using ,(code "cwl-workflow"). The identifiers and
types of the inputs/outputs are read from the YAML specification of
the external CWL workflow.]
         (scheme-source "doc/external-cwl-workflow.scm")
         [,(file "echo.cwl") is defined as]
         ;; TODO: Syntax highlight doc/echo.cwl.
         (prog :line #f (source :file "doc/echo.cwl"))))
    (section :title [The ,(code [identity]) construct]
             :ident "identity-construct"
      (p [Sometimes, it is helpful for a step to simply copy all input
keys forward to the output. This is what the ,(code [identity])
construct is for. An example follows.]
         (scheme-source "doc/identity-construct.scm")
         (image :file "doc/identity-construct.png"))))

  (chapter :title [Contributing]
           :ident "chapter-contributing"
    (p [ccwl is developed on GitHub at ,(ref
:url "https://github.com/arunisaac/ccwl"). Feedback, suggestions,
feature requests, bug reports and pull requests are all
welcome. Unclear and unspecific error messages are considered a
bug. Do report them!])))