From 5ab200296a052e1d6cdbdaf9cafcaffc2fc1f50c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 18 Jun 2026 17:33:32 -0500 Subject: [PATCH 1/2] perf: update MI300X MiniMax-M3 image and FP8 KV cache --- .github/configs/amd-master.yaml | 8 +++----- .../single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh | 7 +++---- perf-changelog.yaml | 8 ++++++++ runners/launch_mi300x-amds.sh | 3 ++- 4 files changed, 16 insertions(+), 10 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 3d50247d7..cee4df4cb 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2873,12 +2873,10 @@ minimaxm3-fp4-mi355x-atom: search-space: - { tp: 4, conc-start: 1, conc-end: 128 } -# MiniMax-M3 MXFP8 MI300X day-zero recipe. Reuse the dedicated ROCm image and -# MI355X serving shape, but retain the default BF16 KV cache because this -# checkpoint lacks calibrated ROCm FP8 attention scales. Use the TP8-only H100 -# search space: TP8 for latency and TP8+EP8 (TEP) at high concurrency. +# MiniMax-M3 MXFP8 MI300X recipe. Use the TP8-only H100 search space: TP8 for +# latency and TP8+EP8 (TEP) at high concurrency. minimaxm3-fp8-mi300x-vllm: - image: vllm/vllm-openai-rocm:minimax-m3 + image: vllm/vllm-openai-rocm:nightly-b53b1c7ffe7aebdafd0876350f30e51d1226c92a model: MiniMaxAI/MiniMax-M3-MXFP8 model-prefix: minimaxm3 runner: mi300x diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh index f2cdaf284..8566c5185 100755 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh @@ -1,10 +1,8 @@ #!/usr/bin/env bash # MiniMax-M3 MXFP8 MI300X (gfx942) single-node vLLM recipe. -# Reuses the dedicated ROCm image and the MI355X serving shape. Block size 128 -# is mandatory for MSA sparse attention. Keep the default BF16 KV cache on -# gfx942: the checkpoint has no calibrated q/prob scales for ROCm FP8 -# attention, and vLLM's fallback scale of 1.0 corrupts model accuracy. +# Block size 128 is mandatory for MSA sparse attention. Use FP8 KV cache to +# reduce memory pressure and increase the available concurrency headroom. source "$(dirname "$0")/../../benchmark_lib.sh" @@ -55,6 +53,7 @@ set -x vllm serve "$MODEL" --port "$PORT" \ "${PARALLEL_ARGS[@]}" \ --block-size 128 \ + --kv-cache-dtype fp8 \ --no-enable-prefix-caching \ --language-model-only \ --max-model-len "$MAX_MODEL_LEN" \ diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 06a81eaf1..336ca120c 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3950,3 +3950,11 @@ - "Update ISL=8192 search-space: TP8-only from conc=4-64, DPA from conc=128-1024 (previously conc=1-64 and DPA conc=64-512)" - "Update Applied TBO on high concurrencies" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1717 + +- config-keys: + - minimaxm3-fp8-mi300x-vllm + description: + - "Update the MI300X MiniMax-M3 vLLM image to vllm/vllm-openai-rocm:nightly-b53b1c7ffe7aebdafd0876350f30e51d1226c92a" + - "Use FP8 KV cache" + - "Exclude unprovisioned chi-mi300x-121 from Slurm allocation" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1837 diff --git a/runners/launch_mi300x-amds.sh b/runners/launch_mi300x-amds.sh index b0c1e22c8..fe1843bf5 100644 --- a/runners/launch_mi300x-amds.sh +++ b/runners/launch_mi300x-amds.sh @@ -15,7 +15,8 @@ set -x # Exclude known-bad nodes; let Slurm pick from anything else: # chi-mi300x-049: persistent /nvme_home disk-full -JOB_ID=$(salloc --partition=$PARTITION --exclude=chi-mi300x-049 --gres=gpu:$TP --cpus-per-task=256 --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') +# chi-mi300x-121: missing required Enroot and RAID storage provisioning +JOB_ID=$(salloc --partition=$PARTITION --exclude=chi-mi300x-049,chi-mi300x-121 --gres=gpu:$TP --cpus-per-task=256 --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') if [ -z "$JOB_ID" ]; then echo "ERROR: salloc failed to allocate a job" From 771e633deb450de86ec27ea8ed9a2edb9448c2dc Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 18 Jun 2026 19:28:14 -0500 Subject: [PATCH 2/2] chore: validate PR #1837 changelog before reuse [skip-sweep]