From 7cf2a7126214be3d5ea9886d5a25a86d8b0ea5f1 Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Sun, 17 May 2026 21:24:07 -0400
Subject: [PATCH] runners(mi300x): pin salloc to known-good nodes

Three of the nine mi300x compute nodes are currently unusable:
  - chi-mi300x-033, chi-mi300x-037: down (Not responding)
  - chi-mi300x-049: drained for persistent /nvme_home disk-full
    (kept down by a watchdog re-applying State=DOWN every 10s)

Without a nodelist filter, salloc sometimes lands a job on a node
that's about to be drained or that has a half-extracted enroot dir,
causing 'pyxis: failed to create container filesystem (No space left
on device)' / 'srun: Node failure' / 'manifest unknown'-style errors
visible in PRs #1426 and #1403.

Add an explicit --nodelist of the 6 healthy nodes (mirroring how
runners/launch_b300-nv.sh:336 pins to the known-good B300 set).
---
 runners/launch_mi300x-amds.sh | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/runners/launch_mi300x-amds.sh b/runners/launch_mi300x-amds.sh
index 20addccf4..4f085d0ad 100644
--- a/runners/launch_mi300x-amds.sh
+++ b/runners/launch_mi300x-amds.sh
@@ -9,7 +9,10 @@ LOCK_FILE="${SQUASH_FILE}.lock"
 
 set -x
 
-JOB_ID=$(salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=256 --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+')
+# Exclude the unusable mi300x nodes (033, 037: down, not responding; 049: drained, persistent /nvme_home disk-full).
+# --exclude is used instead of --nodelist because salloc --nodelist allocates ALL listed hosts to the one job.
+# NOTE(review): assumes the excluded nodes share the FQDN suffix of the healthy ones -- confirm with sinfo.
+JOB_ID=$(salloc --partition="$PARTITION" --exclude='chi-mi300x-[033,037,049].ord.vultr.cpe.ice.amd.com' --gres="gpu:$TP" --cpus-per-task=256 --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+')
 
 if [ -z "$JOB_ID" ]; then
     echo "ERROR: salloc failed to allocate a job"