about summary refs log tree commit diff
diff options
context:
space:
mode:
author Arun Isaac 2024-10-27 02:54:14 +0000
committer Arun Isaac 2024-11-06 00:37:10 +0000
commit dcdfae66509895ec6513b1d0ada365e89abf869f (patch)
tree b90b66cccf7b80c2073dcffb5773f43e0b821071
parent 2f1fb017a9330fe5c11719a297140ec6896c3c6c (diff)
download ravanan-dcdfae66509895ec6513b1d0ada365e89abf869f.tar.gz
ravanan-dcdfae66509895ec6513b1d0ada365e89abf869f.tar.lz
ravanan-dcdfae66509895ec6513b1d0ada365e89abf869f.zip
slurm-api: Find state of purged jobs.
* ravanan/slurm-api.scm (job-state): Query slurmdb for jobs that have been purged from slurmctld's active memory.
-rw-r--r-- ravanan/slurm-api.scm 59
1 file changed, 45 insertions, 14 deletions
diff --git a/ravanan/slurm-api.scm b/ravanan/slurm-api.scm
index 8e7abd7..ea56775 100644
--- a/ravanan/slurm-api.scm
+++ b/ravanan/slurm-api.scm
@@ -108,17 +108,48 @@ FAQ}) to request for the job."
"Query the state of slurm @var{job-id} via @var{api-endpoint}
authenticating using @var{jwt}. Return value is one of the symbols
@code{pending}, @code{failed} and @code{completed}."
- ;; TODO: What if job has been "archived"? Then, look up archived
- ;; jobs too.
- (let ((response (check-api-error
- (slurm-http-get api-endpoint
- jwt
- (string-append "/slurm/v0.0.41/job/"
- (number->string job-id))))))
- (match (json-ref (find (lambda (job)
- (= (json-ref job "job_id")
- job-id))
- (vector->list (json-ref response "jobs")))
- "job_state")
- (#(job-state)
- (string->symbol (string-downcase job-state))))))
+ (let ((response (slurm-http-get api-endpoint
+ jwt
+ (string-append "/slurm/v0.0.41/job/"
+ (number->string job-id)))))
+ (match (json-ref response "errors")
+ (#()
+ (match (json-ref (find (lambda (job)
+ (= (json-ref job "job_id")
+ job-id))
+ (vector->list (json-ref response "jobs")))
+ "job_state")
+ (#(job-state)
+ (string->symbol (string-downcase job-state)))))
+ (#(errors ...)
+ ;; Check in slurmdbd if job has been completed and purged from
+ ;; slurmctld's active memory.
+ (match (find (lambda (error)
+ (= (json-ref error "error_number")
+ ;; Error number 2017 (Invalid job id specified) may
+ ;; have occurred because the job has completed, has
+ ;; exceeded MinJobAge (as set in slurm.conf) and has
+ ;; therefore been purged from slurmctld's active
+ ;; memory.
+ 2017))
+ errors)
+ (error-2017
+ (let ((response
+ (check-api-error
+ (slurm-http-get api-endpoint
+ jwt
+ (string-append "/slurmdb/v0.0.41/job/"
+ (number->string job-id))))))
+ (match (json-ref (find (lambda (job)
+ (= (json-ref job "job_id")
+ job-id))
+ (vector->list (json-ref response "jobs")))
+ "exit_code" "status")
+ (#(job-state)
+ ;; job-state is either "SUCCESS" or "ERROR".
+ (if (eq? (string->symbol (string-downcase job-state))
+ 'success)
+ 'success
+ 'failed)))))
+ (#f
+ (check-api-error response)))))))