diff options
author | Arun Isaac | 2025-01-24 20:18:39 +0000 |
---|---|---|
committer | Arun Isaac | 2025-01-24 20:47:30 +0000 |
commit | 518d81b2c6c2ab15c7b67c25fdf401e461530725 (patch) | |
tree | 95aab545a771eb9c98dd09aca7a8c25040c62976 | |
parent | 31cd85c914c0b2247ed796eaf34fe6f6c5c61a85 (diff) | |
download | ravanan-518d81b2c6c2ab15c7b67c25fdf401e461530725.tar.gz ravanan-518d81b2c6c2ab15c7b67c25fdf401e461530725.tar.lz ravanan-518d81b2c6c2ab15c7b67c25fdf401e461530725.zip |
slurm-api: Support tracing.
* ravanan/slurm-api.scm: Import (ravanan verbosity).
(submit-job, job-state): Add traces.
* bin/ravanan (%options): Support slurm-api in --trace option.
(print-usage): Document it.
-rwxr-xr-x | bin/ravanan | 5 | ||||
-rw-r--r-- | ravanan/slurm-api.scm | 133 |
2 files changed, 85 insertions, 53 deletions
diff --git a/bin/ravanan b/bin/ravanan index 71f7017..cbd4e72 100755 --- a/bin/ravanan +++ b/bin/ravanan @@ -80,7 +80,7 @@ exec guile --no-auto-compile -e main -s "$0" "$@" result))) (option (list "trace") #t #f (lambda (opt name arg result) - (let ((accepted-values (list))) + (let ((accepted-values (list "slurm-api"))) (if (member arg accepted-values) (assoc-set result (cons 'traces @@ -129,7 +129,8 @@ Slurm API batch system options: Debugging options: --trace=SUBSYSTEM enable tracing on subsystem; - repeat to trace multiple subsystems" + repeat to trace multiple subsystems + (accepted values: slurm-api)" program)) (define (read-jwt file) diff --git a/ravanan/slurm-api.scm b/ravanan/slurm-api.scm index 32dc3c7..2719b5f 100644 --- a/ravanan/slurm-api.scm +++ b/ravanan/slurm-api.scm @@ -25,6 +25,7 @@ #:use-module (web client) #:use-module (web uri) #:use-module (json) + #:use-module (ravanan verbosity) #:use-module (ravanan work monads) #:use-module (ravanan work utils) #:export (submit-job @@ -102,62 +103,92 @@ state-monadic value." `(("nice" . ,nice)) '()))) - (state-let* ((json (slurm-http-post api-endpoint - jwt - "/slurm/v0.0.41/job/submit" - `(("jobs" . #(,job-spec)))))) - (check-api-error json) - (state-return (json-ref json "job_id")))) + (state-begin + (state-return (trace 'slurm-api + "submitting script ~a as job ~a" script name)) + (state-let* ((json (slurm-http-post api-endpoint + jwt + "/slurm/v0.0.41/job/submit" + `(("jobs" . #(,job-spec)))))) + (check-api-error json) + (let ((job-id (json-ref json "job_id"))) + (trace 'slurm-api + "job ~a assigned id ~a" name job-id) + (state-return job-id))))) (define* (job-state job-id #:key api-endpoint jwt) "Query the state of slurm @var{job-id} via @var{api-endpoint} authenticating using @var{jwt}. Return value is one of the symbols @code{pending}, @code{failed} and @code{completed} encapsulated in the state monad." - (state-let* ((response (slurm-http-get api-endpoint + (state-begin + (state-return (trace 'slurm-api + "polling job id ~a" job-id)) + (state-let* ((response (slurm-http-get api-endpoint + jwt + (string-append "/slurm/v0.0.41/job/" + (number->string job-id))))) + (match (json-ref response "errors") + (#() + (state-return + (match (json-ref (find (lambda (job) + (= (json-ref job "job_id") + job-id)) + (vector->list (json-ref response "jobs"))) + "job_state") + (#(job-state) + (trace 'slurm-api + "slurmctld reports ~a state for job ~a" job-state job-id) + (let ((job-state (string->symbol (string-downcase job-state)))) + (trace 'slurm-api + "return ~a state for job ~a" job-state job-id) + job-state))))) + (#(errors ...) + ;; Check in slurmdbd if job has been completed and purged from + ;; slurmctld's active memory. + (match (find (lambda (error) + (= (json-ref error "error_number") + ;; Error number 2017 (Invalid job id specified) may + ;; have occurred because the job has completed, has + ;; exceeded MinJobAge (as set in slurm.conf) and has + ;; therefore been purged from slurmctld's active + ;; memory. + 2017)) + errors) + (error-2017 + (trace 'slurm-api + (string-append + "error number 2017 (invalid job id specified) received" + " for job ~a; it is not in slurmctld's active memory;" + " checking in slurmdbd") + job-id) + (state-let* ((response + (slurm-http-get api-endpoint jwt - (string-append "/slurm/v0.0.41/job/" + (string-append "/slurmdb/v0.0.41/job/" (number->string job-id))))) - (match (json-ref response "errors") - (#() - (state-return - (match (json-ref (find (lambda (job) - (= (json-ref job "job_id") - job-id)) - (vector->list (json-ref response "jobs"))) - "job_state") - (#(job-state) - (string->symbol (string-downcase job-state)))))) - (#(errors ...) - ;; Check in slurmdbd if job has been completed and purged from - ;; slurmctld's active memory. - (match (find (lambda (error) - (= (json-ref error "error_number") - ;; Error number 2017 (Invalid job id specified) may - ;; have occurred because the job has completed, has - ;; exceeded MinJobAge (as set in slurm.conf) and has - ;; therefore been purged from slurmctld's active - ;; memory. - 2017)) - errors) - (error-2017 - (state-let* ((response - (slurm-http-get api-endpoint - jwt - (string-append "/slurmdb/v0.0.41/job/" - (number->string job-id))))) - (check-api-error response) - (state-return - (match (json-ref (find (lambda (job) - (= (json-ref job "job_id") - job-id)) - (vector->list (json-ref response "jobs"))) - "exit_code" "status") - (#(job-state) - ;; job-state is either "SUCCESS" or "ERROR". - (if (eq? (string->symbol (string-downcase job-state)) - 'success) - 'success - 'failed)))))) - (#f - (state-return (check-api-error response)))))))) + (check-api-error response) + (state-return + (match (json-ref (find (lambda (job) + (= (json-ref job "job_id") + job-id)) + (vector->list (json-ref response "jobs"))) + "exit_code" "status") + (#(job-state) + (trace 'slurm-api + "slurmdbd reports ~a state for job ~a" + job-state job-id) + ;; job-state is either "SUCCESS" or "ERROR". + (let ((job-state (if (eq? (string->symbol (string-downcase job-state)) + 'success) + 'success + 'failed))) + (trace 'slurm-api + "return ~a state for job ~a" job-state job-id) + job-state)))))) + (#f + (trace 'slurm-api + (string-append + "slurmdbd does not report error number 2017 for job ~a;" + " don't know what to do")) + (state-return (check-api-error response))))))))) |