From 7cf2a7126214be3d5ea9886d5a25a86d8b0ea5f1 Mon Sep 17 00:00:00 2001 From: functionstackx <47992694+functionstackx@users.noreply.github.com> Date: Sun, 17 May 2026 21:24:07 -0400 Subject: [PATCH] runners(mi300x): pin salloc to known-good nodes Three of the nine mi300x compute nodes are currently unusable: - chi-mi300x-033, chi-mi300x-037: down (Not responding) - chi-mi300x-049: drained for persistent /nvme_home disk-full (kept down by a watchdog re-applying State=DOWN every 10s) Without a node filter, salloc sometimes lands a job on a node that's about to be drained or that has a half-extracted enroot dir, causing 'pyxis: failed to create container filesystem (No space left on device)' / 'srun: Node failure' / 'manifest unknown'-style errors visible in PRs #1426 and #1403. Add an explicit --exclude of the 3 unusable nodes so salloc can only choose among the 6 healthy ones (same goal as the known-good B300 pinning in runners/launch_b300-nv.sh:336). --exclude is used instead of --nodelist because Slurm's --nodelist requires the allocation to contain every listed host, which would make each job grab all 6 healthy nodes at once. The hostlist is quoted so the shell never treats [..] as a glob. --- runners/launch_mi300x-amds.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/runners/launch_mi300x-amds.sh b/runners/launch_mi300x-amds.sh index 20addccf4..4f085d0ad 100644 --- a/runners/launch_mi300x-amds.sh +++ b/runners/launch_mi300x-amds.sh @@ -9,7 +9,10 @@ LOCK_FILE="${SQUASH_FILE}.lock" set -x -JOB_ID=$(salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=256 --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') +# Exclude the unusable mi300x nodes (not --nodelist: that demands ALL listed hosts): +# chi-mi300x-033, chi-mi300x-037: down (Not responding) +# chi-mi300x-049: drained (persistent /nvme_home disk-full) +JOB_ID=$(salloc --partition="$PARTITION" --exclude='chi-mi300x-[033,037,049].ord.vultr.cpe.ice.amd.com' --gres="gpu:$TP" --cpus-per-task=256 --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') if [ -z "$JOB_ID" ]; then echo "ERROR: salloc failed to allocate a job"