diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index f6166699a..0d7000868 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2537,19 +2537,14 @@ minimaxm3-fp8-mi355x-vllm: - isl: 1024 osl: 1024 search-space: - - { tp: 8, conc-start: 1, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 1, conc-end: 512 } - - { tp: 4, conc-start: 1, conc-end: 64 } + - { tp: 8, conc-start: 1, conc-end: 32 } + - { tp: 4, conc-start: 4, conc-end: 32 } - { tp: 4, ep: 4, conc-start: 64, conc-end: 512 } - - { tp: 2, ep: 2, conc-start: 16, conc-end: 128 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, conc-start: 1, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 1, conc-end: 512 } - - { tp: 4, conc-start: 1, conc-end: 128 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 512 } + - { tp: 8, conc-start: 1, conc-end: 2 } + - { tp: 4, conc-start: 2, conc-end: 128 } # EAGLE3 speculative-decoding (spec-decoding: mtp) variant of # minimaxm3-fp8-mi355x-vllm, pairing MiniMaxAI/MiniMax-M3-MXFP8 with the @@ -2574,18 +2569,17 @@ minimaxm3-fp8-mi355x-vllm-mtp: - isl: 1024 osl: 1024 search-space: - - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp } + - { tp: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp } - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp } - - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp } - - { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: mtp } + - { tp: 4, conc-start: 1, conc-end: 2, spec-decoding: mtp } + - { tp: 4, conc-start: 32, conc-end: 64, spec-decoding: mtp } + - { tp: 4, ep: 4, conc-start: 128, conc-end: 512, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: - - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp } - - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, 
spec-decoding: mtp } - - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp } + - { tp: 8, conc-start: 4, conc-end: 16, spec-decoding: mtp } + - { tp: 4, conc-start: 16, conc-end: 128, spec-decoding: mtp } + - { tp: 8, conc-start: 1, conc-end: 1, spec-decoding: mtp } # MiniMax-M3 MXFP4 MI355X vLLM disaggregated (prefill/decode) config. minimaxm3-fp4-mi355x-vllm-disagg: diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh index 9ec86f517..23c1a2f7f 100755 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh @@ -31,6 +31,7 @@ fi SERVER_LOG=/workspace/server.log export VLLM_ENGINE_READY_TIMEOUT_S=3600 export VLLM_USE_BREAKABLE_CUDAGRAPH=0 +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT6 if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh index 757d54786..87c07a35a 100644 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh @@ -61,6 +61,7 @@ export VLLM_ENGINE_READY_TIMEOUT_S=3600 # Run with CUDA graphs (no --enforce-eager): VLLM_USE_BREAKABLE_CUDAGRAPH=0 # avoids the M3-decode breakable-cudagraph path that previously forced eager. export VLLM_USE_BREAKABLE_CUDAGRAPH=0 +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT6 if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 1cbadb492..7bf15ff2b 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4296,6 +4296,18 @@ - "Benchmark configuration, EAGLE3 draft model, serving flags, and search space are unchanged." 
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1941 +- config-keys: + - minimaxm3-fp8-mi355x-vllm + - minimaxm3-fp8-mi355x-vllm-mtp + description: + - "Update the MiniMax-M3 MXFP8 MI355X vLLM benchmark image from vllm/vllm-openai-rocm:minimax-m3 to vllm/vllm-openai-rocm:nightly-3f5a1e1733200760169ff31ebe60a271072b199e, which includes the gfx950 mxfp8 MoE/linear tuning for MiniMax-M3 (vllm-project/vllm#45725)." + - "Export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT6 in the standard and EAGLE3 (MTP) bench scripts to use INT6 quick all-reduce on CDNA4/gfx950, reducing TP all-reduce cost for the mxfp8 workload." + - "Retune the TP/EP search space to the best layout per concurrency band and drop redundant points (TP2/EP2 and DP-attention everywhere; full TP8/EP8 everywhere except the MTP 1k1k sweep, which keeps it)." + - "minimaxm3-fp8-mi355x-vllm: 1k1k sweeps TP8 (conc 1-32), TP4 (conc 4-32), TP4/EP4 (conc 64-512); 8k1k sweeps TP8 (conc 1-2), TP4 (conc 2-128)." + - "minimaxm3-fp8-mi355x-vllm-mtp: 1k1k sweeps TP8 (conc 4-32), TP8/EP8 (conc 1-256), TP4 (conc 1-2 and 32-64), TP4/EP4 (conc 128-512); 8k1k sweeps TP8 (conc 1 and 4-16), TP4 (conc 16-128)." + - "Serving flags are otherwise unchanged." + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1946 + - config-keys: - minimaxm3-fp8-mi355x-vllm description: