diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 58469dcc4..9386d9c27 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2666,7 +2666,7 @@ minimaxm3-fp4-mi355x-vllm:
 # tokens. Search space mirrors the MI355X MXFP8 MTP entry, trimming the base
 # FP4 sweep at extreme concurrency where speculative decoding loses value.
 minimaxm3-fp4-mi355x-vllm-mtp:
-  image: vllm/vllm-openai-rocm:nightly-3f5a1e1733200760169ff31ebe60a271072b199e
+  image: vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1
   model: amd/MiniMax-M3-MXFP4
   model-prefix: minimaxm3
   runner: mi355x
diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh
index 96a560493..8a15b8c89 100755
--- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh
@@ -36,6 +36,18 @@ fi
 SERVER_LOG=/workspace/server.log
 export VLLM_ENGINE_READY_TIMEOUT_S=3600
 export VLLM_USE_BREAKABLE_CUDAGRAPH=0
+# Use AITER MoE for the MXFP4 experts, matching minimaxm3_fp4_mi355x_vllm.sh.
+# This is required for ALL configs including expert parallelism: with EP enabled
+# and moe_backend=auto, the AITER MXFP4 backend is skipped and selection falls
+# back to Mxfp4MoeBackend.EMULATION, which triggers a first-time build of the
+# Quark hw-emulation C++ kernel (kernel_ext, 9 ROCm arches) on every worker at
+# warmup. Concurrent EP workers deadlock on the shared torch_extensions build
+# lock, hanging engine-core for hours. Forcing --moe-backend aiter selects the
+# AITER_MXFP4_MXFP4 backend instead (verified working under TP4+EP4 with EAGLE3
+# spec decoding), avoiding the emulation build entirely.
+export VLLM_ROCM_USE_AITER=1
+export VLLM_ROCM_USE_AITER_MOE=1
+export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1
 
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context
@@ -65,6 +77,7 @@ vllm serve "$MODEL" --port "$PORT" \
     --language-model-only \
     --max-model-len "$MAX_MODEL_LEN" \
     --attention-backend TRITON_ATTN \
+    --moe-backend aiter \
     --speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS}" \
     --tool-call-parser minimax_m3 \
     --enable-auto-tool-choice \
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 068e37a7e..450226250 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -4344,6 +4344,14 @@
     - "Reuse the existing MXFP8 B300 topology and concurrency matrix across 15 srt-slurm recipes, while dropping the FP8-only Marlin override from TP4 decode"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1931
 
+- config-keys:
+    - minimaxm3-fp4-mi355x-vllm-mtp
+  description:
+    - "Enable AITER MoE on MiniMax-M3 MXFP4 MI355X single-node vLLM MTP (EAGLE3), mirroring the STP recipe: export VLLM_ROCM_USE_AITER=1, VLLM_ROCM_USE_AITER_MOE=1, VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1; pass --moe-backend aiter unconditionally (including expert parallelism)."
+    - "Fixes the ~8h engine-core startup hang on EP configs: with moe_backend=auto, EP fell back to Mxfp4MoeBackend.EMULATION, which deadlocked all expert-parallel workers building the Quark hw-emulation C++ kernel into a shared torch_extensions dir. Forcing --moe-backend aiter selects AITER_MXFP4_MXFP4 (no emulation build)."
+    - "Pin vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1 (from nightly-3f5a1e1733200760169ff31ebe60a271072b199e), matching the STP recipe."
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1964
+
 - config-keys:
     - minimaxm3-fp4-b300-dynamo-vllm
   description: