summary refs log tree commit diff
diff options
context:
space:
mode:
authorArun Isaac2025-01-24 20:18:39 +0000
committerArun Isaac2025-01-24 20:47:30 +0000
commit518d81b2c6c2ab15c7b67c25fdf401e461530725 (patch)
tree95aab545a771eb9c98dd09aca7a8c25040c62976
parent31cd85c914c0b2247ed796eaf34fe6f6c5c61a85 (diff)
downloadravanan-518d81b2c6c2ab15c7b67c25fdf401e461530725.tar.gz
ravanan-518d81b2c6c2ab15c7b67c25fdf401e461530725.tar.lz
ravanan-518d81b2c6c2ab15c7b67c25fdf401e461530725.zip
slurm-api: Support tracing.
* ravanan/slurm-api.scm: Import (ravanan verbosity).
(submit-job, job-state): Add traces.
* bin/ravanan (%options): Support slurm-api in --trace option.
(print-usage): Document it.
-rwxr-xr-xbin/ravanan5
-rw-r--r--ravanan/slurm-api.scm133
2 files changed, 85 insertions, 53 deletions
diff --git a/bin/ravanan b/bin/ravanan
index 71f7017..cbd4e72 100755
--- a/bin/ravanan
+++ b/bin/ravanan
@@ -80,7 +80,7 @@ exec guile --no-auto-compile -e main -s "$0" "$@"
                          result)))
         (option (list "trace") #t #f
                 (lambda (opt name arg result)
-                  (let ((accepted-values (list)))
+                  (let ((accepted-values (list "slurm-api")))
                     (if (member arg accepted-values)
                         (assoc-set result
                           (cons 'traces
@@ -129,7 +129,8 @@ Slurm API batch system options:
 Debugging options:
 
   --trace=SUBSYSTEM              enable tracing on subsystem;
-                                 repeat to trace multiple subsystems"
+                                 repeat to trace multiple subsystems
+                                 (accepted values: slurm-api)"
           program))
 
 (define (read-jwt file)
diff --git a/ravanan/slurm-api.scm b/ravanan/slurm-api.scm
index 32dc3c7..2719b5f 100644
--- a/ravanan/slurm-api.scm
+++ b/ravanan/slurm-api.scm
@@ -25,6 +25,7 @@
   #:use-module (web client)
   #:use-module (web uri)
   #:use-module (json)
+  #:use-module (ravanan verbosity)
   #:use-module (ravanan work monads)
   #:use-module (ravanan work utils)
   #:export (submit-job
@@ -102,62 +103,92 @@ state-monadic value."
                 `(("nice" . ,nice))
                 '())))
 
-  (state-let* ((json (slurm-http-post api-endpoint
-                                      jwt
-                                      "/slurm/v0.0.41/job/submit"
-                                      `(("jobs" . #(,job-spec))))))
-    (check-api-error json)
-    (state-return (json-ref json "job_id"))))
+  (state-begin
+   (state-return (trace 'slurm-api
+                        "submitting script ~a as job ~a" script name))
+   (state-let* ((json (slurm-http-post api-endpoint
+                                       jwt
+                                       "/slurm/v0.0.41/job/submit"
+                                       `(("jobs" . #(,job-spec))))))
+     (check-api-error json)
+     (let ((job-id (json-ref json "job_id")))
+       (trace 'slurm-api
+              "job ~a assigned id ~a" name job-id)
+       (state-return job-id)))))
 
 (define* (job-state job-id #:key api-endpoint jwt)
   "Query the state of slurm @var{job-id} via @var{api-endpoint}
 authenticating using @var{jwt}. Return value is one of the symbols
 @code{pending}, @code{failed} and @code{completed} encapsulated in the state
 monad."
-  (state-let* ((response (slurm-http-get api-endpoint
+  (state-begin
+   (state-return (trace 'slurm-api
+                        "polling job id ~a" job-id))
+   (state-let* ((response (slurm-http-get api-endpoint
+                                          jwt
+                                          (string-append "/slurm/v0.0.41/job/"
+                                                         (number->string job-id)))))
+     (match (json-ref response "errors")
+       (#()
+        (state-return
+         (match (json-ref (find (lambda (job)
+                                  (= (json-ref job "job_id")
+                                     job-id))
+                                (vector->list (json-ref response "jobs")))
+                          "job_state")
+           (#(job-state)
+            (trace 'slurm-api
+                   "slurmctld reports ~a state for job ~a" job-state job-id)
+            (let ((job-state (string->symbol (string-downcase job-state))))
+              (trace 'slurm-api
+                     "return ~a state for job ~a" job-state job-id)
+              job-state)))))
+       (#(errors ...)
+        ;; Check in slurmdbd if job has been completed and purged from
+        ;; slurmctld's active memory.
+        (match (find (lambda (error)
+                       (= (json-ref error "error_number")
+                          ;; Error number 2017 (Invalid job id specified) may
+                          ;; have occurred because the job has completed, has
+                          ;; exceeded MinJobAge (as set in slurm.conf) and has
+                          ;; therefore been purged from slurmctld's active
+                          ;; memory.
+                          2017))
+                     errors)
+          (error-2017
+           (trace 'slurm-api
+                  (string-append
+                   "error number 2017 (invalid job id specified) received"
+                   " for job ~a; it is not in slurmctld's active memory;"
+                   " checking in slurmdbd")
+                  job-id)
+           (state-let* ((response
+                         (slurm-http-get api-endpoint
                                          jwt
-                                         (string-append "/slurm/v0.0.41/job/"
+                                         (string-append "/slurmdb/v0.0.41/job/"
                                                         (number->string job-id)))))
-    (match (json-ref response "errors")
-      (#()
-       (state-return
-        (match (json-ref (find (lambda (job)
-                                 (= (json-ref job "job_id")
-                                    job-id))
-                               (vector->list (json-ref response "jobs")))
-                         "job_state")
-          (#(job-state)
-           (string->symbol (string-downcase job-state))))))
-      (#(errors ...)
-       ;; Check in slurmdbd if job has been completed and purged from
-       ;; slurmctld's active memory.
-       (match (find (lambda (error)
-                      (= (json-ref error "error_number")
-                         ;; Error number 2017 (Invalid job id specified) may
-                         ;; have occurred because the job has completed, has
-                         ;; exceeded MinJobAge (as set in slurm.conf) and has
-                         ;; therefore been purged from slurmctld's active
-                         ;; memory.
-                         2017))
-                    errors)
-         (error-2017
-          (state-let* ((response
-                        (slurm-http-get api-endpoint
-                                        jwt
-                                        (string-append "/slurmdb/v0.0.41/job/"
-                                                       (number->string job-id)))))
-            (check-api-error response)
-            (state-return
-             (match (json-ref (find (lambda (job)
-                                      (= (json-ref job "job_id")
-                                         job-id))
-                                    (vector->list (json-ref response "jobs")))
-                              "exit_code" "status")
-               (#(job-state)
-                ;; job-state is either "SUCCESS" or "ERROR".
-                (if (eq? (string->symbol (string-downcase job-state))
-                         'success)
-                    'success
-                    'failed))))))
-         (#f
-          (state-return (check-api-error response))))))))
+             (check-api-error response)
+             (state-return
+              (match (json-ref (find (lambda (job)
+                                       (= (json-ref job "job_id")
+                                          job-id))
+                                     (vector->list (json-ref response "jobs")))
+                               "exit_code" "status")
+                (#(job-state)
+                 (trace 'slurm-api
+                        "slurmdbd reports ~a state for job ~a"
+                        job-state job-id)
+                 ;; job-state is either "SUCCESS" or "ERROR".
+                 (let ((job-state (if (eq? (string->symbol (string-downcase job-state))
+                                           'success)
+                                      'success
+                                      'failed)))
+                   (trace 'slurm-api
+                          "return ~a state for job ~a" job-state job-id)
+                   job-state))))))
+          (#f
+           (trace 'slurm-api
+                  (string-append
+                   "slurmdbd does not report error number 2017 for job ~a;"
+                   " don't know what to do"))
+           (state-return (check-api-error response)))))))))