diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh index 11607ae27..757d54786 100644 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh @@ -5,8 +5,9 @@ # minimaxm3_fp8_mi355x.sh. Adds the Inferact/MiniMax-M3-EAGLE3 draft head via # --speculative-config with 3 speculative tokens. Everything else mirrors the # non-MTP recipe: MXFP8 from TP=4 on gfx950, mandatory --block-size 128, -# --language-model-only for the text-only benchmark, FP8 KV cache, -# --attention-backend TRITON_ATTN, and --enforce-eager. +# --language-model-only for the text-only benchmark, FP8 KV cache, and +# --attention-backend TRITON_ATTN. Runs with CUDA graphs (no --enforce-eager); +# VLLM_USE_BREAKABLE_CUDAGRAPH=0 avoids the M3-decode breakable-cudagraph path. # # Unlike the CUDA recipes, the drafter needs no attention_backend override: # the FlashInfer "page size 128 requires GQA/MQA" limitation that forced @@ -57,6 +58,9 @@ fi SERVER_LOG=/workspace/server.log export VLLM_ENGINE_READY_TIMEOUT_S=3600 +# Run with CUDA graphs (no --enforce-eager): VLLM_USE_BREAKABLE_CUDAGRAPH=0 +# avoids the M3-decode breakable-cudagraph path that previously forced eager. 
+export VLLM_USE_BREAKABLE_CUDAGRAPH=0 if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context @@ -168,11 +172,11 @@ set -x vllm serve "$MODEL" --port "$PORT" \ "${PARALLEL_ARGS[@]}" \ --block-size 128 \ + --no-enable-prefix-caching \ --language-model-only \ --max-model-len "$MAX_MODEL_LEN" \ --kv-cache-dtype fp8 \ --attention-backend TRITON_ATTN \ - --enforce-eager \ --speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS}" \ --tool-call-parser minimax_m3 \ --reasoning-parser minimax_m3 \ diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 4222b1035..f33d8d49d 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3783,3 +3783,10 @@ - "H200-aligned layouts and concurrency ranges: TP4 and TP8 latency, TP4/TP8 expert parallelism, and TP8 data-parallel attention across 1k1k and 8k1k" - "Route the MI325X Hugging Face cache and runtime compiler caches to node-local storage, and mount ROCm GPU devices explicitly" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1748 + +- config-keys: + - minimaxm3-fp8-mi355x-vllm-mtp + description: + - "Run the MiniMax-M3 MXFP8 MI355X EAGLE3 MTP recipe with CUDA graphs instead of --enforce-eager" + - "Drop --enforce-eager and set VLLM_USE_BREAKABLE_CUDAGRAPH=0, which avoids the M3-decode breakable-cudagraph path that previously forced eager execution (the non-MTP MI355X recipe already got this in #1754); also add --no-enable-prefix-caching to the vllm serve invocation" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1755