aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorArun Isaac2024-09-06 14:26:48 +0100
committerArun Isaac2024-09-06 14:26:48 +0100
commitadd78e89ce9ec0eb309b900e0199454db1b6420f (patch)
treeee43543db6f9400a4a9c65088dbde1add5037283
parentde5b44f066985fd1dadcc81d02919f213bdaf5c6 (diff)
downloadravanan-add78e89ce9ec0eb309b900e0199454db1b6420f.tar.gz
ravanan-add78e89ce9ec0eb309b900e0199454db1b6420f.tar.lz
ravanan-add78e89ce9ec0eb309b900e0199454db1b6420f.zip
command-line-tool: Refactor scheduler into separate named functions.
* ravanan/command-line-tool.scm (command-line-tool-scheduler): Refactor scheduler into separate named functions.
-rw-r--r--ravanan/command-line-tool.scm110
1 files changed, 63 insertions, 47 deletions
diff --git a/ravanan/command-line-tool.scm b/ravanan/command-line-tool.scm
index efa953e..c943881 100644
--- a/ravanan/command-line-tool.scm
+++ b/ravanan/command-line-tool.scm
@@ -939,53 +939,69 @@ named @var{name} with @var{inputs} using tools from Guix manifest
(define* (command-line-tool-scheduler manifest scratch store batch-system
#:key guix-daemon-socket
slurm-api-endpoint slurm-jwt)
- (scheduler (match-lambda*
- ((proc inputs)
- (let ((name (scheduler-proc-name proc))
- (cwl (scheduler-proc-cwl proc)))
- (run-command-line-tool name
- manifest
- cwl
- inputs
- scratch
- store
- batch-system
- #:guix-daemon-socket guix-daemon-socket
- #:slurm-api-endpoint slurm-api-endpoint
- #:slurm-jwt slurm-jwt))))
- (lambda (state)
- (guard (c ((job-failure? c)
- (let ((script (job-failure-script c)))
- (error
- "~a failed; logs at ~a and ~a~%"
- script
- (script->store-stdout-file script store)
- (script->store-stderr-file script store)))))
- (cond
- ((single-machine-job-state? state)
- (if (single-machine-job-state-success? state)
- 'completed
- (raise-exception
- (job-failure (single-machine-job-state-script state)))))
- ((slurm-job-state? state)
- (case (job-state (slurm-job-state-job-id state)
- #:api-endpoint slurm-api-endpoint
- #:jwt slurm-jwt)
- ((failed)
- (raise-exception
- (job-failure (slurm-job-state-script state))))
- (else => identity))))))
+ (define (schedule proc inputs)
+ "Schedule @var{proc} with inputs from the @var{inputs} association list. Return a
+job state object."
+ (let ((name (scheduler-proc-name proc))
+ (cwl (scheduler-proc-cwl proc)))
+ (run-command-line-tool name
+ manifest
+ cwl
+ inputs
+ scratch
+ store
+ batch-system
+ #:guix-daemon-socket guix-daemon-socket
+ #:slurm-api-endpoint slurm-api-endpoint
+ #:slurm-jwt slurm-jwt)))
+
+ (define (poll state)
+ "Return current status of job @var{state} object---one of the symbols
+@code{pending} or @code{completed}. Raise an exception and exit if job has
+failed."
+ (guard (c ((job-failure? c)
+ (let ((script (job-failure-script c)))
+ (error
+ "~a failed; logs at ~a and ~a~%"
+ script
+ (script->store-stdout-file script store)
+ (script->store-stderr-file script store)))))
+ (cond
+ ;; Single machine jobs are run synchronously. So, they return success or
+ ;; failure immediately.
+ ((single-machine-job-state? state)
+ (if (single-machine-job-state-success? state)
+ 'completed
+ (raise-exception
+ (job-failure (single-machine-job-state-script state)))))
+ ;; Poll slurm for job state.
+ ((slurm-job-state? state)
+ (case (job-state (slurm-job-state-job-id state)
+ #:api-endpoint slurm-api-endpoint
+ #:jwt slurm-jwt)
+ ((failed)
+ (raise-exception
+ (job-failure (slurm-job-state-script state))))
+ (else => identity))))))
+
+ (define (capture-output state)
+ "Return output of completed job @var{state}."
+ (let ((script ((case batch-system
+ ((single-machine) single-machine-job-state-script)
+ ((slurm-api) slurm-job-state-script))
+ state)))
+ (format (current-error-port)
+ "~a completed; logs at ~a and ~a~%"
+ script
+ (script->store-stdout-file script store)
+ (script->store-stderr-file script store))
+ (capture-command-line-tool-output script store)))
+
+ (scheduler schedule
+ poll
(case batch-system
+ ;; Single machine jobs are run synchronously. So, there is no
+ ;; need to wait to poll them.
((single-machine) 0)
((slurm-api) %job-poll-interval))
- (lambda (state)
- (let ((script ((case batch-system
- ((single-machine) single-machine-job-state-script)
- ((slurm-api) slurm-job-state-script))
- state)))
- (format (current-error-port)
- "~a completed; logs at ~a and ~a~%"
- script
- (script->store-stdout-file script store)
- (script->store-stderr-file script store))
- (capture-command-line-tool-output script store)))))
+ capture-output))