@@ -9,11 +9,17 @@ cleanup() {
99 if [ -n " ${tail_pid:- } " ]; then
1010 kill " ${tail_pid} " 2> /dev/null || true
1111 fi
12- # Cancel the SLURM job if the monitor is exiting due to an error
13- # (e.g., the CI runner is being killed). Don't cancel on success.
12+ # Cancel the SLURM job only if it is still active in the scheduler.
13+ # If the job already left the queue (squeue returns empty), it has finished
14+ # and run_monitored_slurm_job.sh will recover via sacct — don't cancel it.
1415 if [ " ${monitor_success:- 0} " -ne 1 ] && [ -n " ${job_id:- } " ]; then
15- echo " Monitor exiting abnormally — cancelling SLURM job $job_id "
16- scancel " $job_id " 2> /dev/null || true
16+ active_state=$( squeue -j " $job_id " -h -o ' %T' 2> /dev/null | head -n1 | tr -d ' ' || echo " " )
17+ if [ -n " $active_state " ]; then
18+ echo " Monitor exiting abnormally — cancelling SLURM job $job_id (state: $active_state )"
19+ scancel " $job_id " 2> /dev/null || true
20+ else
21+ echo " Monitor exiting abnormally — SLURM job $job_id already left queue, not cancelling"
22+ fi
1723 fi
1824}
1925trap cleanup EXIT
@@ -46,6 +52,15 @@ get_job_state() {
4652 # Fallback to sacct (works for completed/historical jobs)
4753 if command -v sacct > /dev/null 2>&1 ; then
4854 state=$( sacct -j " $jid " -n -X -P -o State 2> /dev/null | head -n1 | cut -d' |' -f1 || true)
55+ # When a job is preempted+requeued, sacct -X reports PREEMPTED for the
56+ # original attempt while the requeued run may have completed. Check all
57+ # records (without -X) for a terminal state that supersedes PREEMPTED.
58+ if [ " $state " = " PREEMPTED" ]; then
59+ requeue_state=$( sacct -j " $jid " -n -P -o State 2> /dev/null | grep -v PREEMPTED | head -n1 | cut -d' |' -f1 || true)
60+ if [ -n " $requeue_state " ]; then
61+ state=" $requeue_state "
62+ fi
63+ fi
4964 if [ -n " $state " ]; then
5065 echo " $state "
5166 return
@@ -56,9 +71,11 @@ get_job_state() {
5671}
5772
5873# Check if a state is terminal (job is done, for better or worse)
74+ # PREEMPTED is intentionally excluded: with --requeue the job restarts under
75+ # the same job ID and we must keep monitoring rather than exiting early.
5976is_terminal_state () {
6077 case " $1 " in
61- COMPLETED|FAILED|CANCELLED|CANCELLED+|TIMEOUT|OUT_OF_MEMORY|NODE_FAIL|BOOT_FAIL|DEADLINE|PREEMPTED| REVOKED)
78+ COMPLETED|FAILED|CANCELLED|CANCELLED+|TIMEOUT|OUT_OF_MEMORY|NODE_FAIL|BOOT_FAIL|DEADLINE|REVOKED)
6279 return 0 ;;
6380 * )
6481 return 1 ;;
@@ -74,7 +91,7 @@ while [ ! -f "$output_file" ]; do
7491 state=$( get_job_state " $job_id " )
7592
7693 case " $state " in
77- PENDING|CONFIGURING)
94+ PENDING|CONFIGURING|PREEMPTED )
7895 unknown_count=0
7996 sleep 5
8097 ;;
0 commit comments