From 5ab200296a052e1d6cdbdaf9cafcaffc2fc1f50c Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 18 Jun 2026 17:33:32 -0500
Subject: [PATCH 1/2] perf: update MI300X MiniMax-M3 image and enable FP8 KV cache

---
 .github/configs/amd-master.yaml                           | 8 +++-----
 .../single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh     | 7 +++----
 perf-changelog.yaml                                       | 8 ++++++++
 runners/launch_mi300x-amds.sh                             | 3 ++-
 4 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 3d50247d7..cee4df4cb 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2873,12 +2873,10 @@ minimaxm3-fp4-mi355x-atom:
       search-space:
       - { tp: 4, conc-start: 1, conc-end: 128 }
 
-# MiniMax-M3 MXFP8 MI300X day-zero recipe. Reuse the dedicated ROCm image and
-# MI355X serving shape, but retain the default BF16 KV cache because this
-# checkpoint lacks calibrated ROCm FP8 attention scales. Use the TP8-only H100
-# search space: TP8 for latency and TP8+EP8 (TEP) at high concurrency.
+# MiniMax-M3 MXFP8 MI300X recipe. Use the TP8-only H100 search space: TP8 for
+# latency and TP8+EP8 (TEP) at high concurrency.
 minimaxm3-fp8-mi300x-vllm:
-  image: vllm/vllm-openai-rocm:minimax-m3
+  image: vllm/vllm-openai-rocm:nightly-b53b1c7ffe7aebdafd0876350f30e51d1226c92a
   model: MiniMaxAI/MiniMax-M3-MXFP8
   model-prefix: minimaxm3
   runner: mi300x
diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh
index f2cdaf284..8566c5185 100755
--- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh
@@ -1,10 +1,8 @@
 #!/usr/bin/env bash
 
 # MiniMax-M3 MXFP8 MI300X (gfx942) single-node vLLM recipe.
-# Reuses the dedicated ROCm image and the MI355X serving shape. Block size 128
-# is mandatory for MSA sparse attention. Keep the default BF16 KV cache on
-# gfx942: the checkpoint has no calibrated q/prob scales for ROCm FP8
-# attention, and vLLM's fallback scale of 1.0 corrupts model accuracy.
+# Block size 128 is mandatory for MSA sparse attention. Use FP8 KV cache to
+# reduce memory pressure and increase the available concurrency headroom.
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
@@ -55,6 +53,7 @@ set -x
 vllm serve "$MODEL" --port "$PORT" \
     "${PARALLEL_ARGS[@]}" \
     --block-size 128 \
+    --kv-cache-dtype fp8 \
     --no-enable-prefix-caching \
     --language-model-only \
     --max-model-len "$MAX_MODEL_LEN" \
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 06a81eaf1..336ca120c 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3950,3 +3950,11 @@
     - "Update ISL=8192 search-space: TP8-only from conc=4-64, DPA from conc=128-1024 (previously conc=1-64 and DPA conc=64-512)"
     - "Update Applied TBO on high concurrencies"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1717
+
+- config-keys:
+    - minimaxm3-fp8-mi300x-vllm
+  description:
+    - "Update the MI300X MiniMax-M3 vLLM image to vllm/vllm-openai-rocm:nightly-b53b1c7ffe7aebdafd0876350f30e51d1226c92a"
+    - "Enable FP8 KV cache (--kv-cache-dtype fp8) in place of the default BF16 KV cache"
+    - "Exclude unprovisioned chi-mi300x-121 from Slurm allocation"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1837
diff --git a/runners/launch_mi300x-amds.sh b/runners/launch_mi300x-amds.sh
index b0c1e22c8..fe1843bf5 100644
--- a/runners/launch_mi300x-amds.sh
+++ b/runners/launch_mi300x-amds.sh
@@ -15,7 +15,8 @@ set -x
 
 # Exclude known-bad nodes; let Slurm pick from anything else:
 #   chi-mi300x-049: persistent /nvme_home disk-full
-JOB_ID=$(salloc --partition=$PARTITION --exclude=chi-mi300x-049 --gres=gpu:$TP --cpus-per-task=256 --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+')
+#   chi-mi300x-121: missing required Enroot and RAID storage provisioning
+JOB_ID=$(salloc --partition=$PARTITION --exclude=chi-mi300x-049,chi-mi300x-121 --gres=gpu:$TP --cpus-per-task=256 --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+')
 
 if [ -z "$JOB_ID" ]; then
     echo "ERROR: salloc failed to allocate a job"

From 771e633deb450de86ec27ea8ed9a2edb9448c2dc Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 18 Jun 2026 19:28:14 -0500
Subject: [PATCH 2/2] chore: validate PR #1837 changelog before reuse
 [skip-sweep]