summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--ravanan/slurm-api.scm60
-rw-r--r--tests/slurm-api.scm48
2 files changed, 88 insertions, 20 deletions
diff --git a/ravanan/slurm-api.scm b/ravanan/slurm-api.scm
index 430b3c7..88f1de0 100644
--- a/ravanan/slurm-api.scm
+++ b/ravanan/slurm-api.scm
@@ -17,6 +17,7 @@
 ;;; along with ravanan.  If not, see <https://www.gnu.org/licenses/>.
 
 (define-module (ravanan slurm-api)
+  #:use-module ((rnrs base) #:select (assertion-violation))
   #:use-module (srfi srfi-1)
   #:use-module (srfi srfi-26)
   #:use-module (srfi srfi-71)
@@ -116,6 +117,34 @@ state-monadic value."
               "job ~a assigned id ~a" name job-id)
        (state-return job-id)))))
 
+(define (slurm-state-codes->job-state state-codes)
+  "Convert vector of @var{state-codes} from slurm into a @code{pending},
+@code{failed} or @code{completed} symbol."
+  ;; Slurm job state codes are documented at
+  ;; https://slurm.schedmd.com/job_state_codes.html
+  (let* ((base-state-codes (list 'boot_fail 'cancelled 'completed 'deadline
+                                 'failed 'node_fail 'out_of_memory 'pending
+                                 'preempted 'running 'suspended 'timeout))
+         (base-states state-flags (partition (cut memq <> base-state-codes)
+                                             (map (compose string->symbol string-downcase)
+                                                  (vector->list state-codes)))))
+    ;; TODO: Capture and report fine-grained job states, not merely 'failed,
+    ;; 'pending and 'completed.
+    (match base-states
+      (((or 'pending 'preempted 'running 'suspended))
+       'pending)
+      (((or 'boot_fail 'cancelled 'deadline 'failed 'node_fail 'out_of_memory))
+       'failed)
+      (('completed)
+       ;; If job has an additional COMPLETING flag, wait for it to fully
+       ;; complete. Mark it as pending until then.
+       (if (memq 'completing state-flags)
+           'pending
+           'completed))
+      (_
+       (assertion-violation base-states
+                            "Multiple base states reported by slurm")))))
+
 (define* (job-state job-id #:key api-endpoint jwt)
   "Query the state of slurm @var{job-id} via @var{api-endpoint}
 authenticating using @var{jwt}. Return value is one of the symbols
@@ -131,26 +160,17 @@ monad."
      (match (json-ref response "errors")
        (#()
         (state-return
-         (match (json-ref (find (lambda (job)
-                                  (= (json-ref job "job_id")
-                                     job-id))
-                                (vector->list (json-ref response "jobs")))
-                          "job_state")
-           (#(job-state)
-            (trace 'slurm-api
-                   "slurmctld reports ~a state for job ~a" job-state job-id)
-            (let ((job-state
-                   (case (string->symbol (string-downcase job-state))
-                     ;; slurm returns a PENDING state when the job has not yet
-                     ;; been scheduled on a compute node, and RUNNING once it
-                     ;; has been scheduled and is running.
-                     ((pending running) 'pending)
-                     ((completed) 'completed)
-                     ((failed) 'failed)
-                     (else (error "Unknown slurm job state" job-state)))))
-              (trace 'slurm-api
-                     "return ~a state for job ~a" job-state job-id)
-              job-state)))))
+         (let ((state-codes (json-ref (find (lambda (job)
+                                              (= (json-ref job "job_id")
+                                                 job-id))
+                                            (vector->list (json-ref response "jobs")))
+                                      "job_state")))
+           (trace 'slurm-api
+                  "slurmctld reports ~a state for job ~a" state-codes job-id)
+           (let ((job-state (slurm-state-codes->job-state state-codes)))
+             (trace 'slurm-api
+                    "return ~a state for job ~a" job-state job-id)
+             job-state))))
        (#(errors ...)
         ;; Check in slurmdbd if job has been completed and purged from
         ;; slurmctld's active memory.
diff --git a/tests/slurm-api.scm b/tests/slurm-api.scm
new file mode 100644
index 0000000..347c5c3
--- /dev/null
+++ b/tests/slurm-api.scm
@@ -0,0 +1,48 @@
+;;; ravanan --- High-reproducibility CWL runner powered by Guix
+;;; Copyright © 2025 Arun Isaac <arunisaac@systemreboot.net>
+;;;
+;;; This file is part of ravanan.
+;;;
+;;; ravanan is free software: you can redistribute it and/or modify it
+;;; under the terms of the GNU General Public License as published by
+;;; the Free Software Foundation, either version 3 of the License, or
+;;; (at your option) any later version.
+;;;
+;;; ravanan is distributed in the hope that it will be useful, but
+;;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;;; General Public License for more details.
+;;;
+;;; You should have received a copy of the GNU General Public License
+;;; along with ravanan.  If not, see <https://www.gnu.org/licenses/>.
+
+(use-modules (srfi srfi-64)
+             (ravanan slurm-api))
+
+(define slurm-state-codes->job-state
+  (@@ (ravanan slurm-api)
+      slurm-state-codes->job-state))
+
+(test-begin "slurm-api")
+
+(test-equal "Handle PENDING job state"
+  'pending
+  (slurm-state-codes->job-state #("PENDING")))
+
+(test-equal "Handle RUNNING job state"
+  'pending
+  (slurm-state-codes->job-state #("RUNNING")))
+
+(test-equal "Handle FAILED job state"
+  'failed
+  (slurm-state-codes->job-state #("FAILED")))
+
+(test-equal "Handle COMPLETED job state"
+  'completed
+  (slurm-state-codes->job-state #("COMPLETED")))
+
+(test-equal "Handle COMPLETED job state with additional COMPLETING flag"
+  'pending
+  (slurm-state-codes->job-state #("COMPLETED" "COMPLETING")))
+
+(test-end "slurm-api")