SemiAnalysisAI · chunfangamd · Jun 26, 2026 · Jun 26, 2026 · claude · Jun 26, 2026
@@ -2525,7 +2525,7 @@ dsv4-fp4-mi355x-atom-disagg:
 # https://github.com/vllm-project/recipes/commit/2a3728ed9892debfd767a72a58ebc90b33f186e5
 # MXFP8 runs from TP=4 on gfx950; block size 128 is mandatory for MSA.
 minimaxm3-fp8-mi355x-vllm:
-  image: vllm/vllm-openai-rocm:minimax-m3
+  image: vllm/vllm-openai-rocm:nightly-3f5a1e1733200760169ff31ebe60a271072b199e
   model: MiniMaxAI/MiniMax-M3-MXFP8
   model-prefix: minimaxm3
   runner: mi355x

diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh
@@ -31,6 +31,7 @@ fi
 SERVER_LOG=/workspace/server.log
 export VLLM_ENGINE_READY_TIMEOUT_S=3600
 export VLLM_USE_BREAKABLE_CUDAGRAPH=0
+export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT6
 
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context

diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh
@@ -61,6 +61,7 @@ export VLLM_ENGINE_READY_TIMEOUT_S=3600
 # Run with CUDA graphs (no --enforce-eager): VLLM_USE_BREAKABLE_CUDAGRAPH=0
 # avoids the M3-decode breakable-cudagraph path that previously forced eager.
 export VLLM_USE_BREAKABLE_CUDAGRAPH=0
+export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT6
 
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -4280,3 +4280,12 @@
     - "Update the MiniMax-M3 MXFP8 MI355X vLLM EAGLE3 benchmark image from vllm/vllm-openai-rocm:minimax-m3 to vllm/vllm-openai-rocm:nightly-3f5a1e1733200760169ff31ebe60a271072b199e."
     - "Benchmark configuration, EAGLE3 draft model, serving flags, and search space are unchanged."
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1941
+
+- config-keys:
+    - minimaxm3-fp8-mi355x-vllm
+    - minimaxm3-fp8-mi355x-vllm-mtp
+  description:
+    - "Update the MiniMax-M3 MXFP8 MI355X vLLM benchmark image from vllm/vllm-openai-rocm:minimax-m3 to vllm/vllm-openai-rocm:nightly-3f5a1e1733200760169ff31ebe60a271072b199e, which includes the gfx950 MXFP8 MoE/linear tuning for MiniMax-M3 (vllm-project/vllm#45725)."
+    - "Export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT6 in the standard and EAGLE3 (MTP) benchmark scripts to use INT6 quick all-reduce on CDNA4/gfx950, reducing TP all-reduce cost for the MXFP8 workload."
+    - "Benchmark serving flags and search space are otherwise unchanged."
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1946