From fc9924d200ae08d83d9ed867edc881c4c4955292 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Fri, 26 Jun 2026 20:13:44 +0000 Subject: [PATCH 1/7] [AMD] Update MiniMax-M3 FP8 MI355X vLLM image and enable INT6 quick-reduce --- benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh | 1 + benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh | 1 + 2 files changed, 2 insertions(+) diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh index 9ec86f517..23c1a2f7f 100755 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh @@ -31,6 +31,7 @@ fi SERVER_LOG=/workspace/server.log export VLLM_ENGINE_READY_TIMEOUT_S=3600 export VLLM_USE_BREAKABLE_CUDAGRAPH=0 +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT6 if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh index 757d54786..87c07a35a 100644 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh @@ -61,6 +61,7 @@ export VLLM_ENGINE_READY_TIMEOUT_S=3600 # Run with CUDA graphs (no --enforce-eager): VLLM_USE_BREAKABLE_CUDAGRAPH=0 # avoids the M3-decode breakable-cudagraph path that previously forced eager. 
export VLLM_USE_BREAKABLE_CUDAGRAPH=0 +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT6 if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context From 97fc8d6dc60732f0ce2fbc96bdae800b7df02336 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Fri, 26 Jun 2026 20:20:56 +0000 Subject: [PATCH 2/7] [AMD] Update changelog --- perf-changelog.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 3ef8c37db..653c8b6bb 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4323,3 +4323,12 @@ - "Enable AITER MoE on MiniMax-M3 MXFP4 MI355X single-node vLLM STP: export VLLM_ROCM_USE_AITER=1, VLLM_ROCM_USE_AITER_MOE=1, and VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1; pass --moe-backend aiter." - "Pin vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1 (from nightly-3f5a1e1733200760169ff31ebe60a271072b199e) for AITER MoE and shared-expert fusion support (vllm-project/vllm#46419, vllm-project/vllm#46545)." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1954 + +- config-keys: + - minimaxm3-fp8-mi355x-vllm + - minimaxm3-fp8-mi355x-vllm-mtp + description: + - "Update the MiniMax-M3 MXFP8 MI355X vLLM benchmark image from vllm/vllm-openai-rocm:minimax-m3 to vllm/vllm-openai-rocm:nightly-3f5a1e1733200760169ff31ebe60a271072b199e, which includes the gfx950 mxfp8 MoE/linear tuning for MiniMax-M3 (vllm-project/vllm#45725)." + - "Export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT6 in the standard and EAGLE3 (MTP) bench scripts to use INT6 quick all-reduce on CDNA4/gfx950, reducing TP all-reduce cost for the mxfp8 workload." + - "Benchmark serving flags and search space are otherwise unchanged." 
+ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1946 From 5c4ab6825a41df95d0a4625272f9edcdf7132ed5 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Mon, 29 Jun 2026 08:35:56 +0000 Subject: [PATCH 3/7] [AMD] Retune MiniMax-M3 FP8 MI355X vLLM search space --- .github/configs/amd-master.yaml | 28 +++++++++++----------------- perf-changelog.yaml | 5 ++++- 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index cbfc09f81..c1ae8226e 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2537,19 +2537,14 @@ minimaxm3-fp8-mi355x-vllm: - isl: 1024 osl: 1024 search-space: - - { tp: 8, conc-start: 1, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 1, conc-end: 512 } - - { tp: 4, conc-start: 1, conc-end: 64 } + - { tp: 8, conc-start: 1, conc-end: 32 } + - { tp: 4, conc-start: 4, conc-end: 32 } - { tp: 4, ep: 4, conc-start: 64, conc-end: 512 } - - { tp: 2, ep: 2, conc-start: 16, conc-end: 128 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, conc-start: 1, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 1, conc-end: 512 } - - { tp: 4, conc-start: 1, conc-end: 128 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 512 } + - { tp: 8, conc-start: 1, conc-end: 2 } + - { tp: 4, conc-start: 2, conc-end: 128 } # EAGLE3 speculative-decoding (spec-decoding: mtp) variant of # minimaxm3-fp8-mi355x-vllm, pairing MiniMaxAI/MiniMax-M3-MXFP8 with the @@ -2574,18 +2569,17 @@ minimaxm3-fp8-mi355x-vllm-mtp: - isl: 1024 osl: 1024 search-space: - - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp } + - { tp: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp } - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp } - - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp } - - { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp } - - { tp: 8, ep: 8, dp-attn: 
true, conc-start: 256, conc-end: 512, spec-decoding: mtp } + - { tp: 4, conc-start: 1, conc-end: 2, spec-decoding: mtp } + - { tp: 4, conc-start: 32, conc-end: 64, spec-decoding: mtp } + - { tp: 4, ep: 4, conc-start: 128, conc-end: 512, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: - - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp } - - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp } - - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp } + - { tp: 8, conc-start: 4, conc-end: 16, spec-decoding: mtp } + - { tp: 4, conc-start: 16, conc-end: 128, spec-decoding: mtp } + - { tp: 8, conc-start: 1, conc-end: 1, spec-decoding: mtp } # MiniMax-M3 MXFP4 MI355X vLLM disaggregated (prefill/decode) config. minimaxm3-fp4-mi355x-vllm-disagg: diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 653c8b6bb..f63f35604 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4330,5 +4330,8 @@ description: - "Update the MiniMax-M3 MXFP8 MI355X vLLM benchmark image from vllm/vllm-openai-rocm:minimax-m3 to vllm/vllm-openai-rocm:nightly-3f5a1e1733200760169ff31ebe60a271072b199e, which includes the gfx950 mxfp8 MoE/linear tuning for MiniMax-M3 (vllm-project/vllm#45725)." - "Export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT6 in the standard and EAGLE3 (MTP) bench scripts to use INT6 quick all-reduce on CDNA4/gfx950, reducing TP all-reduce cost for the mxfp8 workload." - - "Benchmark serving flags and search space are otherwise unchanged." + - "Retune the TP/EP search space to the best layout per concurrency band and drop redundant points (full TP8/EP8, TP2/EP2, DP-attention)." + - "minimaxm3-fp8-mi355x-vllm: 1k1k sweeps TP8 (conc 1-32), TP4 (conc 4-32), TP4/EP4 (conc 64-512); 8k1k sweeps TP8 (conc 1-2), TP4 (conc 2-128)." 
+ - "minimaxm3-fp8-mi355x-vllm-mtp: 1k1k sweeps TP8 (conc 4-32), TP8/EP8 (conc 1-256), TP4 (conc 1-2 and 32-64), TP4/EP4 (conc 128-512); 8k1k sweeps TP8 (conc 1 and 4-16), TP4 (conc 16-128)." + - "Serving flags are otherwise unchanged." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1946 From 87d4da15f822f1d536f80a40bd387ecdfd33657c Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Mon, 29 Jun 2026 12:35:27 +0000 Subject: [PATCH 4/7] [AMD] Bump MiniMax-M3 FP8 MI355X image and enable AITER fused experts Pin minimaxm3-fp8-mi355x-vllm{,-mtp} to nightly-4559c43a, which bakes in fused shared-experts MoE (vllm-project/vllm#46545) and the AITER flydsl MoE backend (#46184). Align both bench scripts with vllm-project/recipes#581 by exporting VLLM_ROCM_USE_AITER=1 and VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1 alongside the existing INT6 quick-reduce; no --moe-backend override, so AITER is auto-selected. --- .github/configs/amd-master.yaml | 8 ++++---- .../single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh | 5 +++++ .../single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh | 5 +++++ perf-changelog.yaml | 4 ++-- 4 files changed, 16 insertions(+), 6 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index c1ae8226e..d583badaa 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2525,7 +2525,7 @@ dsv4-fp4-mi355x-atom-disagg: # https://github.com/vllm-project/recipes/commit/2a3728ed9892debfd767a72a58ebc90b33f186e5 # MXFP8 runs from TP=4 on gfx950; block size 128 is mandatory for MSA. 
minimaxm3-fp8-mi355x-vllm: - image: vllm/vllm-openai-rocm:nightly-3f5a1e1733200760169ff31ebe60a271072b199e + image: vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1 model: MiniMaxAI/MiniMax-M3-MXFP8 model-prefix: minimaxm3 runner: mi355x @@ -2538,7 +2538,7 @@ minimaxm3-fp8-mi355x-vllm: osl: 1024 search-space: - { tp: 8, conc-start: 1, conc-end: 32 } - - { tp: 4, conc-start: 4, conc-end: 32 } + - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 4, ep: 4, conc-start: 64, conc-end: 512 } - isl: 8192 osl: 1024 @@ -2557,7 +2557,7 @@ minimaxm3-fp8-mi355x-vllm: # acceptance dilutes in big batches, and the draft weights + draft KV shave # headroom — tp2-ep2 is dropped since its KV headroom was already thin. minimaxm3-fp8-mi355x-vllm-mtp: - image: vllm/vllm-openai-rocm:nightly-3f5a1e1733200760169ff31ebe60a271072b199e + image: vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1 model: MiniMaxAI/MiniMax-M3-MXFP8 model-prefix: minimaxm3 runner: mi355x @@ -2578,7 +2578,7 @@ minimaxm3-fp8-mi355x-vllm-mtp: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 16, spec-decoding: mtp } - - { tp: 4, conc-start: 16, conc-end: 128, spec-decoding: mtp } + - { tp: 4, conc-start: 2, conc-end: 128, spec-decoding: mtp } - { tp: 8, conc-start: 1, conc-end: 1, spec-decoding: mtp } # MiniMax-M3 MXFP4 MI355X vLLM disaggregated (prefill/decode) config. diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh index 23c1a2f7f..12fc571eb 100755 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh @@ -31,6 +31,11 @@ fi SERVER_LOG=/workspace/server.log export VLLM_ENGINE_READY_TIMEOUT_S=3600 export VLLM_USE_BREAKABLE_CUDAGRAPH=0 +# MI355X mxfp8 recipe (vllm-project/recipes#581): AITER kernels + fused +# shared-experts MoE (needs vllm-project/vllm#46545) plus INT6 quick all-reduce. 
+# No --moe-backend override here, so AITER is the auto-selected MoE backend. +export VLLM_ROCM_USE_AITER=1 +export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT6 if [ "${EVAL_ONLY}" = "true" ]; then diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh index 87c07a35a..1f1c196ba 100644 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh @@ -61,6 +61,11 @@ export VLLM_ENGINE_READY_TIMEOUT_S=3600 # Run with CUDA graphs (no --enforce-eager): VLLM_USE_BREAKABLE_CUDAGRAPH=0 # avoids the M3-decode breakable-cudagraph path that previously forced eager. export VLLM_USE_BREAKABLE_CUDAGRAPH=0 +# MI355X mxfp8 recipe (vllm-project/recipes#581): AITER kernels + fused +# shared-experts MoE (needs vllm-project/vllm#46545) plus INT6 quick all-reduce. +# No --moe-backend override here, so AITER is the auto-selected MoE backend. +export VLLM_ROCM_USE_AITER=1 +export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT6 if [ "${EVAL_ONLY}" = "true" ]; then diff --git a/perf-changelog.yaml b/perf-changelog.yaml index f63f35604..1455ffad4 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4328,8 +4328,8 @@ - minimaxm3-fp8-mi355x-vllm - minimaxm3-fp8-mi355x-vllm-mtp description: - - "Update the MiniMax-M3 MXFP8 MI355X vLLM benchmark image from vllm/vllm-openai-rocm:minimax-m3 to vllm/vllm-openai-rocm:nightly-3f5a1e1733200760169ff31ebe60a271072b199e, which includes the gfx950 mxfp8 MoE/linear tuning for MiniMax-M3 (vllm-project/vllm#45725)." - - "Export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT6 in the standard and EAGLE3 (MTP) bench scripts to use INT6 quick all-reduce on CDNA4/gfx950, reducing TP all-reduce cost for the mxfp8 workload." 
+ - "Update the MiniMax-M3 MXFP8 MI355X vLLM benchmark image from vllm/vllm-openai-rocm:minimax-m3 to vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1, which includes the gfx950 mxfp8 MoE/linear tuning (vllm-project/vllm#45725), fused shared-experts MoE for the mxfp8 model (#46545), and the AITER flydsl MoE backend (#46184)." + - "Align the standard and EAGLE3 (MTP) bench scripts with vllm-project/recipes#581: export VLLM_ROCM_USE_AITER=1, VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1, and VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT6 (AITER kernels + fused shared-experts MoE + INT6 quick all-reduce; no --moe-backend override, so AITER is the auto-selected MoE backend)." - "Retune the TP/EP search space to the best layout per concurrency band and drop redundant points (full TP8/EP8, TP2/EP2, DP-attention)." - "minimaxm3-fp8-mi355x-vllm: 1k1k sweeps TP8 (conc 1-32), TP4 (conc 4-32), TP4/EP4 (conc 64-512); 8k1k sweeps TP8 (conc 1-2), TP4 (conc 2-128)." - "minimaxm3-fp8-mi355x-vllm-mtp: 1k1k sweeps TP8 (conc 4-32), TP8/EP8 (conc 1-256), TP4 (conc 1-2 and 32-64), TP4/EP4 (conc 128-512); 8k1k sweeps TP8 (conc 1 and 4-16), TP4 (conc 16-128)." 
From 70521c16eaa5ddd5d65a3fc4bd1861df8e5da24b Mon Sep 17 00:00:00 2001 From: Hongxia Yang Date: Tue, 30 Jun 2026 00:06:27 +0000 Subject: [PATCH 5/7] aiter master flag and ep Signed-off-by: Hongxia Yang --- .../fixed_seq_len/minimaxm3_fp8_mi355x.sh | 21 +++++++++++++++---- .../fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh | 21 +++++++++++++++---- 2 files changed, 34 insertions(+), 8 deletions(-) diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh index 12fc571eb..89c136c27 100755 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh @@ -31,10 +31,12 @@ fi SERVER_LOG=/workspace/server.log export VLLM_ENGINE_READY_TIMEOUT_S=3600 export VLLM_USE_BREAKABLE_CUDAGRAPH=0 -# MI355X mxfp8 recipe (vllm-project/recipes#581): AITER kernels + fused -# shared-experts MoE (needs vllm-project/vllm#46545) plus INT6 quick all-reduce. -# No --moe-backend override here, so AITER is the auto-selected MoE backend. -export VLLM_ROCM_USE_AITER=1 +# MI355X mxfp8 recipe (vllm-project/recipes#581): INT6 quick all-reduce plus +# the router-append shared-experts MoE fusion (vllm-project/vllm#46545). The +# fusion checks this env directly and runs on both the aiter and native MXFP8 +# MoE paths (it is independent of the AITER master switch, and self-disables +# under expert parallelism inside the model), so enable it unconditionally. +# (The AITER master switch itself is set below, gated on expert parallelism.) export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT6 @@ -53,6 +55,17 @@ elif [ "$EP_SIZE" -gt 1 ]; then PARALLEL_ARGS+=(--enable-expert-parallel) fi +# Gate the AITER master switch on expert parallelism. With EP, the aiter fused +# MoE path is the auto-selected backend (no --moe-backend override). 
With EP +# disabled (TP-only) the AITER master switch produces degenerate MiniMax-M3 +# output, so leave it off and fall back to the native MXFP8 path (the +# shared-experts fusion set above still applies — it is master-independent). +if printf '%s\n' "${PARALLEL_ARGS[@]}" | grep -qxF -- '--enable-expert-parallel'; then + export VLLM_ROCM_USE_AITER=1 +else + export VLLM_ROCM_USE_AITER=0 +fi + start_gpu_monitor set -x diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh index 1f1c196ba..50a7d6d9f 100644 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh @@ -61,10 +61,12 @@ export VLLM_ENGINE_READY_TIMEOUT_S=3600 # Run with CUDA graphs (no --enforce-eager): VLLM_USE_BREAKABLE_CUDAGRAPH=0 # avoids the M3-decode breakable-cudagraph path that previously forced eager. export VLLM_USE_BREAKABLE_CUDAGRAPH=0 -# MI355X mxfp8 recipe (vllm-project/recipes#581): AITER kernels + fused -# shared-experts MoE (needs vllm-project/vllm#46545) plus INT6 quick all-reduce. -# No --moe-backend override here, so AITER is the auto-selected MoE backend. -export VLLM_ROCM_USE_AITER=1 +# MI355X mxfp8 recipe (vllm-project/recipes#581): INT6 quick all-reduce plus +# the router-append shared-experts MoE fusion (vllm-project/vllm#46545). The +# fusion checks this env directly and runs on both the aiter and native MXFP8 +# MoE paths (it is independent of the AITER master switch, and self-disables +# under expert parallelism inside the model), so enable it unconditionally. +# (The AITER master switch itself is set below, gated on expert parallelism.) export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT6 @@ -83,6 +85,17 @@ elif [ "$EP_SIZE" -gt 1 ]; then PARALLEL_ARGS+=(--enable-expert-parallel) fi +# Gate the AITER master switch on expert parallelism. 
With EP, the aiter fused +# MoE path is the auto-selected backend (no --moe-backend override). With EP +# disabled (TP-only) the AITER master switch produces degenerate MiniMax-M3 +# output, so leave it off and fall back to the native MXFP8 path (the +# shared-experts fusion set above still applies — it is master-independent). +if printf '%s\n' "${PARALLEL_ARGS[@]}" | grep -qxF -- '--enable-expert-parallel'; then + export VLLM_ROCM_USE_AITER=1 +else + export VLLM_ROCM_USE_AITER=0 +fi + # use 3 speculative tokens for all configs for now NUM_SPEC_TOKENS=3 From 5c8fa81dd374e6aef0a98458d3b8eea2cac2e82e Mon Sep 17 00:00:00 2001 From: Hongxia Yang Date: Tue, 30 Jun 2026 00:15:32 +0000 Subject: [PATCH 6/7] [AMD] Gate AITER master switch on EP for MiniMax-M3 MXFP8 recipes Set VLLM_ROCM_USE_AITER on only for expert-parallel (EP/DP-attention) runs, where the AITER fused MoE is the auto-selected backend. TP-only runs leave it off and use the native MXFP8 path (the master switch otherwise produces degenerate MiniMax-M3 output). Keep VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1 unconditional: the router-append shared-experts fusion checks the env directly (independent of the master switch) and self-disables under EP inside the model. Co-authored-by: Claude --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 1455ffad4..7be8600fb 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4329,7 +4329,7 @@ - minimaxm3-fp8-mi355x-vllm-mtp description: - "Update the MiniMax-M3 MXFP8 MI355X vLLM benchmark image from vllm/vllm-openai-rocm:minimax-m3 to vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1, which includes the gfx950 mxfp8 MoE/linear tuning (vllm-project/vllm#45725), fused shared-experts MoE for the mxfp8 model (#46545), and the AITER flydsl MoE backend (#46184)." 
- - "Align the standard and EAGLE3 (MTP) bench scripts with vllm-project/recipes#581: export VLLM_ROCM_USE_AITER=1, VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1, and VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT6 (AITER kernels + fused shared-experts MoE + INT6 quick all-reduce; no --moe-backend override, so AITER is the auto-selected MoE backend)." + - "Align the standard and EAGLE3 (MTP) bench scripts with vllm-project/recipes#581: gate VLLM_ROCM_USE_AITER on expert parallelism (on for EP/DP-attention runs, where the AITER fused MoE is the auto-selected backend; off for TP-only runs, which fall back to the native MXFP8 path since the master switch otherwise yields degenerate MiniMax-M3 output), export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1 unconditionally (the router-append shared-experts fusion is independent of the master switch and self-disables under EP), and export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT6 (INT6 quick all-reduce)." - "Retune the TP/EP search space to the best layout per concurrency band and drop redundant points (full TP8/EP8, TP2/EP2, DP-attention)." - "minimaxm3-fp8-mi355x-vllm: 1k1k sweeps TP8 (conc 1-32), TP4 (conc 4-32), TP4/EP4 (conc 64-512); 8k1k sweeps TP8 (conc 1-2), TP4 (conc 2-128)." - "minimaxm3-fp8-mi355x-vllm-mtp: 1k1k sweeps TP8 (conc 4-32), TP8/EP8 (conc 1-256), TP4 (conc 1-2 and 32-64), TP4/EP4 (conc 128-512); 8k1k sweeps TP8 (conc 1 and 4-16), TP4 (conc 16-128)." From a06a4993b43ec32543a257fb58c51105816f23cf Mon Sep 17 00:00:00 2001 From: functionstackx <47992694+functionstackx@users.noreply.github.com> Date: Tue, 30 Jun 2026 03:27:09 -0400 Subject: [PATCH 7/7] [AMD] Drop conc=512 from FP8 MI355X vLLM MTP tp4/ep4 1k1k sweep The minimaxm3-fp8-mi355x-vllm-mtp tp=4 ep=4 (dp-attn=false) 1k1k point was failing at concurrency 512; lower conc-end 512 -> 256 so it sweeps 128/256. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/configs/amd-master.yaml | 2 +- perf-changelog.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index d583badaa..a437f4ecd 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2573,7 +2573,7 @@ minimaxm3-fp8-mi355x-vllm-mtp: - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp } - { tp: 4, conc-start: 1, conc-end: 2, spec-decoding: mtp } - { tp: 4, conc-start: 32, conc-end: 64, spec-decoding: mtp } - - { tp: 4, ep: 4, conc-start: 128, conc-end: 512, spec-decoding: mtp } + - { tp: 4, ep: 4, conc-start: 128, conc-end: 256, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 7be8600fb..a112c6349 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4332,6 +4332,6 @@ - "Align the standard and EAGLE3 (MTP) bench scripts with vllm-project/recipes#581: gate VLLM_ROCM_USE_AITER on expert parallelism (on for EP/DP-attention runs, where the AITER fused MoE is the auto-selected backend; off for TP-only runs, which fall back to the native MXFP8 path since the master switch otherwise yields degenerate MiniMax-M3 output), export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1 unconditionally (the router-append shared-experts fusion is independent of the master switch and self-disables under EP), and export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT6 (INT6 quick all-reduce)." - "Retune the TP/EP search space to the best layout per concurrency band and drop redundant points (full TP8/EP8, TP2/EP2, DP-attention)." - "minimaxm3-fp8-mi355x-vllm: 1k1k sweeps TP8 (conc 1-32), TP4 (conc 4-32), TP4/EP4 (conc 64-512); 8k1k sweeps TP8 (conc 1-2), TP4 (conc 2-128)." 
- "minimaxm3-fp8-mi355x-vllm-mtp: 1k1k sweeps TP8 (conc 4-32), TP8/EP8 (conc 1-256), TP4 (conc 1-2 and 32-64), TP4/EP4 (conc 128-512); 8k1k sweeps TP8 (conc 1 and 4-16), TP4 (conc 16-128)." + - "minimaxm3-fp8-mi355x-vllm-mtp: 1k1k sweeps TP8 (conc 4-32), TP8/EP8 (conc 1-256), TP4 (conc 1-2 and 32-64), TP4/EP4 (conc 128-256); 8k1k sweeps TP8 (conc 1 and 4-16), TP4 (conc 2-128)." - "Serving flags are otherwise unchanged." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1946