summary refs log tree commit diff
diff options
context:
space:
mode:
author: Arun Isaac, 2024-10-27 02:54:14 +0000
committer: Arun Isaac, 2024-11-06 00:37:10 +0000
commit: dcdfae66509895ec6513b1d0ada365e89abf869f (patch)
tree: b90b66cccf7b80c2073dcffb5773f43e0b821071
parent: 2f1fb017a9330fe5c11719a297140ec6896c3c6c (diff)
download: ravanan-dcdfae66509895ec6513b1d0ada365e89abf869f.tar.gz
ravanan-dcdfae66509895ec6513b1d0ada365e89abf869f.tar.lz
ravanan-dcdfae66509895ec6513b1d0ada365e89abf869f.zip
slurm-api: Find state of purged jobs.
* ravanan/slurm-api.scm (job-state): Query slurmdb for jobs that have
been purged from slurmctld's active memory.
-rw-r--r-- ravanan/slurm-api.scm 59
1 file changed, 45 insertions, 14 deletions
diff --git a/ravanan/slurm-api.scm b/ravanan/slurm-api.scm
index 8e7abd7..ea56775 100644
--- a/ravanan/slurm-api.scm
+++ b/ravanan/slurm-api.scm
@@ -108,17 +108,48 @@ FAQ}) to request for the job."
   "Query the state of slurm @var{job-id} via @var{api-endpoint}
 authenticating using @var{jwt}. Return value is one of the symbols
 @code{pending}, @code{failed} and @code{completed}."
-  ;; TODO: What if job has been "archived"? Then, look up archived
-  ;; jobs too.
-  (let ((response (check-api-error
-                   (slurm-http-get api-endpoint
-                                   jwt
-                                   (string-append "/slurm/v0.0.41/job/"
-                                                  (number->string job-id))))))
-    (match (json-ref (find (lambda (job)
-                             (= (json-ref job "job_id")
-                                job-id))
-                           (vector->list (json-ref response "jobs")))
-                     "job_state")
-      (#(job-state)
-       (string->symbol (string-downcase job-state))))))
+  (let ((response (slurm-http-get api-endpoint
+                                  jwt
+                                  (string-append "/slurm/v0.0.41/job/"
+                                                 (number->string job-id)))))
+    (match (json-ref response "errors")
+      (#()
+       (match (json-ref (find (lambda (job)
+                                (= (json-ref job "job_id")
+                                   job-id))
+                              (vector->list (json-ref response "jobs")))
+                        "job_state")
+         (#(job-state)
+          (string->symbol (string-downcase job-state)))))
+      (#(errors ...)
+       ;; Check in slurmdbd if job has been completed and purged from
+       ;; slurmctld's active memory.
+       (match (find (lambda (error)
+                      (= (json-ref error "error_number")
+                         ;; Error number 2017 (Invalid job id specified) may
+                         ;; have occurred because the job has completed, has
+                         ;; exceeded MinJobAge (as set in slurm.conf) and has
+                         ;; therefore been purged from slurmctld's active
+                         ;; memory.
+                         2017))
+                    errors)
+         (error-2017
+          (let ((response
+                 (check-api-error
+                  (slurm-http-get api-endpoint
+                                  jwt
+                                  (string-append "/slurmdb/v0.0.41/job/"
+                                                 (number->string job-id))))))
+            (match (json-ref (find (lambda (job)
+                                     (= (json-ref job "job_id")
+                                        job-id))
+                                   (vector->list (json-ref response "jobs")))
+                             "exit_code" "status")
+              (#(job-state)
+               ;; job-state is either "SUCCESS" or "ERROR".
+               (if (eq? (string->symbol (string-downcase job-state))
+                        'success)
+                   'success
+                   'failed)))))
+         (#f
+          (check-api-error response)))))))