diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh index b2bf882ff..9ec86f517 100755 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh @@ -30,6 +30,7 @@ fi SERVER_LOG=/workspace/server.log export VLLM_ENGINE_READY_TIMEOUT_S=3600 +export VLLM_USE_BREAKABLE_CUDAGRAPH=0 if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context @@ -52,11 +53,11 @@ set -x vllm serve "$MODEL" --port "$PORT" \ "${PARALLEL_ARGS[@]}" \ --block-size 128 \ + --no-enable-prefix-caching \ --language-model-only \ --max-model-len "$MAX_MODEL_LEN" \ --kv-cache-dtype fp8 \ --attention-backend TRITON_ATTN \ - --enforce-eager \ --tool-call-parser minimax_m3 \ --reasoning-parser minimax_m3 \ --enable-auto-tool-choice > "$SERVER_LOG" 2>&1 & diff --git a/perf-changelog.yaml b/perf-changelog.yaml index c578ea7c4..420b2f48b 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3768,3 +3768,9 @@ - "Run the MiniMax-M3 MXFP8 MI300X EAGLE3 MTP recipe with CUDA graphs instead of --enforce-eager" - "Drop --enforce-eager and set VLLM_USE_BREAKABLE_CUDAGRAPH=0 (matching the non-MTP MI300X recipe, #1750), which avoids the M3-decode breakable-cudagraph path that previously forced eager execution" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1756 + +- config-keys: + - minimaxm3-fp8-mi355x-vllm + description: + - "Enable CUDA graphs for MiniMax-M3 MXFP8 on MI355X, set VLLM_USE_BREAKABLE_CUDAGRAPH=0, and disable prefix caching" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1754