aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorArun Isaac2025-01-26 11:53:53 +0000
committerArun Isaac2025-01-26 11:56:50 +0000
commitcdb781e4e89f4d2909b5b07f93c39450e6c1f319 (patch)
tree56110ec08882ffff83646eb3b7cc0fd15f46e310
parent46252b0457bfccc6394c3c521f48185869f741a9 (diff)
downloadravanan-cdb781e4e89f4d2909b5b07f93c39450e6c1f319.tar.gz
ravanan-cdb781e4e89f4d2909b5b07f93c39450e6c1f319.tar.lz
ravanan-cdb781e4e89f4d2909b5b07f93c39450e6c1f319.zip
slurm-api: Handle all slurm job state codes.
* ravanan/slurm-api.scm: Import assertion-violation from (rnrs base). (slurm-state-codes->job-state): New function. (job-state): Use slurm-state-codes->job-state. * tests/slurm-api.scm: New file.
-rw-r--r--ravanan/slurm-api.scm60
-rw-r--r--tests/slurm-api.scm48
2 files changed, 88 insertions, 20 deletions
diff --git a/ravanan/slurm-api.scm b/ravanan/slurm-api.scm
index 430b3c7..88f1de0 100644
--- a/ravanan/slurm-api.scm
+++ b/ravanan/slurm-api.scm
@@ -17,6 +17,7 @@
;;; along with ravanan. If not, see <https://www.gnu.org/licenses/>.
(define-module (ravanan slurm-api)
+ #:use-module ((rnrs base) #:select (assertion-violation))
#:use-module (srfi srfi-1)
#:use-module (srfi srfi-26)
#:use-module (srfi srfi-71)
@@ -116,6 +117,34 @@ state-monadic value."
"job ~a assigned id ~a" name job-id)
(state-return job-id)))))
+(define (slurm-state-codes->job-state state-codes)
+ "Convert vector of @var{state-codes} from slurm into a @code{pending},
+@code{failed} or @code{completed} symbol."
+ ;; Slurm job state codes are documented at
+ ;; https://slurm.schedmd.com/job_state_codes.html
+ (let* ((base-state-codes (list 'boot_fail 'cancelled 'completed 'deadline
+ 'failed 'node_fail 'out_of_memory 'pending
+ 'preempted 'running 'suspended 'timeout))
+ (base-states state-flags (partition (cut memq <> base-state-codes)
+ (map (compose string->symbol string-downcase)
+ (vector->list state-codes)))))
+ ;; TODO: Capture and report fine-grained job states, not merely 'failed,
+ ;; 'pending and 'completed.
+ (match base-states
+ (((or 'pending 'preempted 'running 'suspended))
+ 'pending)
+ (((or 'boot_fail 'cancelled 'deadline 'failed 'node_fail 'out_of_memory))
+ 'failed)
+ (('completed)
+ ;; If job has an additional COMPLETING flag, wait for it to fully
+ ;; complete. Mark it as pending until then.
+ (if (memq 'completing state-flags)
+ 'pending
+ 'completed))
+ (_
+ (assertion-violation base-states
+ "Multiple base states reported by slurm")))))
+
(define* (job-state job-id #:key api-endpoint jwt)
"Query the state of slurm @var{job-id} via @var{api-endpoint}
authenticating using @var{jwt}. Return value is one of the symbols
@@ -131,26 +160,17 @@ monad."
(match (json-ref response "errors")
(#()
(state-return
- (match (json-ref (find (lambda (job)
- (= (json-ref job "job_id")
- job-id))
- (vector->list (json-ref response "jobs")))
- "job_state")
- (#(job-state)
- (trace 'slurm-api
- "slurmctld reports ~a state for job ~a" job-state job-id)
- (let ((job-state
- (case (string->symbol (string-downcase job-state))
- ;; slurm returns a PENDING state when the job has not yet
- ;; been scheduled on a compute node, and RUNNING once it
- ;; has been scheduled and is running.
- ((pending running) 'pending)
- ((completed) 'completed)
- ((failed) 'failed)
- (else (error "Unknown slurm job state" job-state)))))
- (trace 'slurm-api
- "return ~a state for job ~a" job-state job-id)
- job-state)))))
+ (let ((state-codes (json-ref (find (lambda (job)
+ (= (json-ref job "job_id")
+ job-id))
+ (vector->list (json-ref response "jobs")))
+ "job_state")))
+ (trace 'slurm-api
+ "slurmctld reports ~a state for job ~a" state-codes job-id)
+ (let ((job-state (slurm-state-codes->job-state state-codes)))
+ (trace 'slurm-api
+ "return ~a state for job ~a" job-state job-id)
+ job-state))))
(#(errors ...)
;; Check in slurmdbd if job has been completed and purged from
;; slurmctld's active memory.
diff --git a/tests/slurm-api.scm b/tests/slurm-api.scm
new file mode 100644
index 0000000..347c5c3
--- /dev/null
+++ b/tests/slurm-api.scm
@@ -0,0 +1,48 @@
+;;; ravanan --- High-reproducibility CWL runner powered by Guix
+;;; Copyright © 2025 Arun Isaac <arunisaac@systemreboot.net>
+;;;
+;;; This file is part of ravanan.
+;;;
+;;; ravanan is free software: you can redistribute it and/or modify it
+;;; under the terms of the GNU General Public License as published by
+;;; the Free Software Foundation, either version 3 of the License, or
+;;; (at your option) any later version.
+;;;
+;;; ravanan is distributed in the hope that it will be useful, but
+;;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;;; General Public License for more details.
+;;;
+;;; You should have received a copy of the GNU General Public License
+;;; along with ravanan. If not, see <https://www.gnu.org/licenses/>.
+
+(use-modules (srfi srfi-64)
+ (ravanan slurm-api))
+
+(define slurm-state-codes->job-state
+ (@@ (ravanan slurm-api)
+ slurm-state-codes->job-state))
+
+(test-begin "slurm-api")
+
+(test-equal "Handle PENDING job state"
+ 'pending
+ (slurm-state-codes->job-state #("PENDING")))
+
+(test-equal "Handle RUNNING job state"
+ 'pending
+ (slurm-state-codes->job-state #("RUNNING")))
+
+(test-equal "Handle FAILED job state"
+ 'failed
+ (slurm-state-codes->job-state #("FAILED")))
+
+(test-equal "Handle COMPLETED job state"
+ 'completed
+ (slurm-state-codes->job-state #("COMPLETED")))
+
+(test-equal "Handle COMPLETED job state with additional COMPLETING flag"
+ 'pending
+ (slurm-state-codes->job-state #("COMPLETED" "COMPLETING")))
+
+(test-end "slurm-api")