From 5d06ccc6b27dd213871ff8cede2d44aed2cae373 Mon Sep 17 00:00:00 2001 From: Arun Isaac Date: Wed, 9 Oct 2024 22:50:24 +0100 Subject: batch-system: Introduce record type. A record type gives us an easy way to group and pass along parameters required by the slurm-api batch system. * ravanan/batch-system.scm: New file. * bin/ravanan: Import (ravanan batch-system). (main): Create object for batch-system argument. Do not pass #:slurm-api-endpoint and #:slurm-jwt arguments. * ravanan/command-line-tool.scm: Import (ravanan batch-system). (command-line-tool-supported-requirements): Check for slurm-api batch system using slurm-api-batch-system?. (run-command-line-tool): Check for slurm-api batch system using slurm-api-batch-system?. Do not accept #:slurm-api-endpoint and #:slurm-jwt arguments. * ravanan/workflow.scm: Import (ravanan batch-system). (workflow-scheduler): Do not accept #:slurm-api-endpoint and #:slurm-jwt arguments. Pass batch-system to job-state-status. (run-workflow): Check for slurm-api batch system using slurm-api-batch-system?. Do not accept #:slurm-api-endpoint and #:slurm-jwt arguments. * ravanan/job-state.scm: Import (ravanan batch-system). (job-state-status): Do not accept #:slurm-api-endpoint and #:slurm-jwt arguments. Accept batch-system argument. 
--- bin/ravanan | 14 +++++++++----- ravanan/batch-system.scm | 30 ++++++++++++++++++++++++++++++ ravanan/command-line-tool.scm | 31 +++++++++++++++---------------- ravanan/job-state.scm | 19 ++++++++----------- ravanan/workflow.scm | 42 +++++++++++++++--------------------------- 5 files changed, 77 insertions(+), 59 deletions(-) create mode 100644 ravanan/batch-system.scm diff --git a/bin/ravanan b/bin/ravanan index cc9005d..5807a8e 100755 --- a/bin/ravanan +++ b/bin/ravanan @@ -27,6 +27,7 @@ exec guile --no-auto-compile -e main -s "$0" "$@" (ice-9 match) (web uri) (json) + (ravanan batch-system) (ravanan reader) (ravanan utils) (ravanan workflow)) @@ -145,11 +146,14 @@ files that have the token in the @verbatim{SLURM_JWT=token} format." (if (file-name-absolute? (assq-ref args 'store)) (assq-ref args 'store) (canonicalize-path (assq-ref args 'store))) - (assq-ref args 'batch-system) - #:guix-daemon-socket (assq-ref args 'guix-daemon-socket) - #:slurm-api-endpoint (assq-ref args 'slurm-api-endpoint) - #:slurm-jwt (and (assq-ref args 'slurm-jwt) - (read-jwt (assq-ref args 'slurm-jwt)))) + (case (assq-ref args 'batch-system) + ((single-machine) 'single-machine) + ((slurm-api) + (slurm-api-batch-system + (assq-ref args 'slurm-api-endpoint) + (and (assq-ref args 'slurm-jwt) + (read-jwt (assq-ref args 'slurm-jwt)))))) + #:guix-daemon-socket (assq-ref args 'guix-daemon-socket)) (current-output-port) #:pretty #t) (newline (current-output-port)))))))) diff --git a/ravanan/batch-system.scm b/ravanan/batch-system.scm new file mode 100644 index 0000000..726bff8 --- /dev/null +++ b/ravanan/batch-system.scm @@ -0,0 +1,30 @@ +;;; ravanan --- High-reproducibility CWL runner powered by Guix +;;; Copyright © 2024 Arun Isaac +;;; +;;; This file is part of ravanan. 
+;;; +;;; ravanan is free software: you can redistribute it and/or modify it +;;; under the terms of the GNU General Public License as published by +;;; the Free Software Foundation, either version 3 of the License, or +;;; (at your option) any later version. +;;; +;;; ravanan is distributed in the hope that it will be useful, but +;;; WITHOUT ANY WARRANTY; without even the implied warranty of +;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;;; General Public License for more details. +;;; +;;; You should have received a copy of the GNU General Public License +;;; along with ravanan. If not, see <https://www.gnu.org/licenses/>. + +(define-module (ravanan batch-system) + #:use-module (srfi srfi-9 gnu) + #:export (slurm-api-batch-system + slurm-api-batch-system? + slurm-api-batch-system-endpoint + slurm-api-batch-system-jwt)) + +(define-immutable-record-type <slurm-api-batch-system> + (slurm-api-batch-system endpoint jwt) + slurm-api-batch-system? + (endpoint slurm-api-batch-system-endpoint) + (jwt slurm-api-batch-system-jwt)) diff --git a/ravanan/command-line-tool.scm b/ravanan/command-line-tool.scm index 6225595..4254df9 100644 --- a/ravanan/command-line-tool.scm +++ b/ravanan/command-line-tool.scm @@ -35,6 +35,7 @@ #:use-module (guix search-paths) #:use-module (guix store) #:use-module (json) + #:use-module (ravanan batch-system) #:use-module (ravanan javascript) #:use-module (ravanan job-state) #:use-module (ravanan reader) @@ -75,11 +76,11 @@ "ResourceRequirement")) (define (command-line-tool-supported-requirements batch-system) - (case batch-system - ((single-machine) + (cond + ((eq? batch-system 'single-machine) (delete "ResourceRequirement" %command-line-tool-supported-requirements)) - ((slurm-api) + ((slurm-api-batch-system? batch-system) %command-line-tool-supported-requirements) (else (assertion-violation batch-system "Unknown batch system")))) @@ -370,14 +371,12 @@ path." 
(define* (run-command-line-tool name manifest-file cwl inputs scratch store batch-system - #:key guix-daemon-socket - slurm-api-endpoint slurm-jwt) + #:key guix-daemon-socket) "Run @code{CommandLineTool} class workflow @var{cwl} named @var{name} with @var{inputs} using tools from Guix manifest in @var{manifest-file}. -@var{scratch}, @var{store}, @var{batch-system}, @var{guix-daemon-socket}, -@var{slurm-api-endpoint} and @var{slurm-jwt} are the same as in -@code{run-workflow} from @code{(ravanan workflow)}." +@var{scratch}, @var{store}, @var{batch-system} and @var{guix-daemon-socket} are +the same as in @code{run-workflow} from @code{(ravanan workflow)}." ;; TODO: Write to the store atomically. (let* ((script (build-command-line-tool-script name manifest-file cwl inputs @@ -422,8 +421,8 @@ path." (when (file-exists? store-files-directory) (delete-file-recursively store-files-directory)) (mkdir store-files-directory) - (case batch-system - ((single-machine) + (cond + ((eq? batch-system 'single-machine) (setenv "WORKFLOW_OUTPUT_DIRECTORY" store-files-directory) (setenv "WORKFLOW_OUTPUT_DATA_FILE" store-data-file) (format (current-error-port) @@ -434,7 +433,7 @@ path." (lambda () (with-error-to-file stderr-file (cut system* script))))))) - ((slurm-api) + ((slurm-api-batch-system? batch-system) (format (current-error-port) "Submitting job ~a~%" script) @@ -447,8 +446,8 @@ path." cpus name script - #:api-endpoint slurm-api-endpoint - #:jwt slurm-jwt))) + #:api-endpoint (slurm-api-batch-system-endpoint batch-system) + #:jwt (slurm-api-batch-system-jwt batch-system)))) (format (current-error-port) "~a submitted as job ID ~a~%" script @@ -610,10 +609,10 @@ named @var{name} with @var{inputs} using tools from Guix manifest in "listing"))) (define (cores batch-system) - (case batch-system - ((slurm-api) + (cond + ((slurm-api-batch-system? batch-system) #~(string->number (getenv "SLURM_CPUS_ON_NODE"))) - ((single-machine) + ((eq? 
batch-system 'single-machine) #~(total-processor-count)) (else (assertion-violation batch-system "Unknown batch system")))) diff --git a/ravanan/job-state.scm b/ravanan/job-state.scm index 88ed4b7..834a5bd 100644 --- a/ravanan/job-state.scm +++ b/ravanan/job-state.scm @@ -26,6 +26,7 @@ (define-module (ravanan job-state) #:use-module (srfi srfi-9 gnu) + #:use-module (ravanan batch-system) #:use-module (ravanan slurm-api) #:use-module (ravanan work vectors) #:export (single-machine-job-state @@ -54,12 +55,10 @@ slurm-job-state-script)) state)) -(define* (job-state-status state #:key slurm-api-endpoint slurm-jwt) - "Return current status and updated state of job with @var{state}. The status is -one of the symbols @code{completed}, @code{failed} or @code{pending}. - -@var{slurm-api-endpoint} and @var{slurm-jwt} are the same as in -@code{run-workflow} from @code{(ravanan workflow)}." +(define* (job-state-status state batch-system) + "Return current status and updated state of job with @var{state} on +@var{batch-system}. The status is one of the symbols @code{completed}, +@code{failed} or @code{pending}." (values (cond ;; Single machine jobs are run synchronously. So, they return success ;; or failure immediately. @@ -70,15 +69,13 @@ one of the symbols @code{completed}, @code{failed} or @code{pending}. ;; Poll slurm for job state. ((slurm-job-state? state) (job-state (slurm-job-state-job-id state) - #:api-endpoint slurm-api-endpoint - #:jwt slurm-jwt)) + #:api-endpoint (slurm-api-batch-system-endpoint batch-system) + #:jwt (slurm-api-batch-system-jwt batch-system))) ;; For vector states, poll each state element and return 'completed ;; only if all state elements have completed. ((vector? 
state) (or (vector-every (lambda (state-element) - (case (job-state-status state-element - #:slurm-api-endpoint slurm-api-endpoint - #:slurm-jwt slurm-jwt) + (case (job-state-status state-element batch-system) ((completed) => identity) (else #f))) state) diff --git a/ravanan/workflow.scm b/ravanan/workflow.scm index 95dfb73..61a1297 100644 --- a/ravanan/workflow.scm +++ b/ravanan/workflow.scm @@ -27,6 +27,7 @@ #:use-module (ice-9 filesystem) #:use-module (ice-9 match) #:use-module (web uri) + #:use-module (ravanan batch-system) #:use-module (ravanan command-line-tool) #:use-module (ravanan job-state) #:use-module (ravanan propnet) @@ -246,8 +247,7 @@ propagator." scheduler)) (define* (workflow-scheduler manifest-file scratch store batch-system - #:key guix-daemon-socket - slurm-api-endpoint slurm-jwt) + #:key guix-daemon-socket) (define (schedule proc inputs scheduler) "Schedule @var{proc} with inputs from the @var{inputs} association list. Return a job state object. @var{proc} may either be a @code{} object or a @@ -296,9 +296,7 @@ job state object. @var{proc} may either be a @code{} object or a scratch store batch-system - #:guix-daemon-socket guix-daemon-socket - #:slurm-api-endpoint slurm-api-endpoint - #:slurm-jwt slurm-jwt) + #:guix-daemon-socket guix-daemon-socket) (assoc-ref* cwl "outputs"))) ((string=? class "ExpressionTool") (error "Workflow class not implemented yet" class)) @@ -335,8 +333,7 @@ exit if job has failed." ((command-line-tool-state? state) (let ((status updated-job-state (job-state-status (command-line-tool-state-job-state state) - #:slurm-api-endpoint slurm-api-endpoint - #:slurm-jwt slurm-jwt))) + batch-system))) (values (case status ((failed) (raise-exception (job-failure @@ -572,29 +569,18 @@ error out." 
(define* (run-workflow name manifest-file cwl inputs scratch store batch-system - #:key guix-daemon-socket - slurm-api-endpoint slurm-jwt) + #:key guix-daemon-socket) "Run a workflow @var{cwl} named @var{name} with @var{inputs} using tools from Guix manifest in @var{manifest-file}. -@var{scratch} is the path to the scratch area on all worker nodes. The -scratch area need not be shared. @var{store} is the path to the shared -ravanan store. @var{batch-system} is a symbol representing one of the -supported batch systems (either @code{'single-machine} or -@code{'slurm-api}). +@var{scratch} is the path to the scratch area on all worker nodes. The scratch +area need not be shared. @var{store} is the path to the shared ravanan store. +@var{batch-system} is an object representing one of the supported batch systems. -@var{guix-daemon-socket} is the Guix daemon socket to connect to. - -@var{slurm-api-endpoint}, a @code{} object, is the slurm API -endpoint to connect to. @var{slurm-jwt}, a string, is the JWT token to -authenticate to the slurm API with. @var{slurm-api-endpoint} and -@var{slurm-jwt} are only used when @var{batch-system} is -@code{'slurm-api}." +@var{guix-daemon-socket} is the Guix daemon socket to connect to." (let ((scheduler (workflow-scheduler manifest-file scratch store batch-system - #:guix-daemon-socket guix-daemon-socket - #:slurm-api-endpoint slurm-api-endpoint - #:slurm-jwt slurm-jwt))) + #:guix-daemon-socket guix-daemon-socket))) (let loop ((state ((scheduler-schedule scheduler) (scheduler-proc name cwl %nothing %nothing) inputs @@ -605,11 +591,13 @@ authenticate to the slurm API with. @var{slurm-api-endpoint} and (begin ;; Pause before looping and polling again so we don't bother the ;; job server too often. - (sleep (case batch-system + (sleep (cond ;; Single machine jobs are run synchronously. So, there ;; is no need to wait to poll them. - ((single-machine) 0) - ((slurm-api) %job-poll-interval))) + ((eq? 
batch-system 'single-machine) + 0) + ((slurm-api-batch-system? batch-system) + %job-poll-interval))) (loop state)) ;; Capture outputs. ((scheduler-capture-output scheduler) state)))))) -- cgit v1.2.3