From 9257ae18519942eef1c0abb4a16854407c3cdded Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Sun, 14 Jun 2026 01:47:24 -0500 Subject: [PATCH 1/2] perf: enable MI355X MiniMax M3 CUDA graphs --- .../single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh | 2 +- perf-changelog.yaml | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh index b2bf882ff..9a70f5249 100755 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh @@ -30,6 +30,7 @@ fi SERVER_LOG=/workspace/server.log export VLLM_ENGINE_READY_TIMEOUT_S=3600 +export VLLM_USE_BREAKABLE_CUDAGRAPH=0 if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context @@ -56,7 +57,6 @@ vllm serve "$MODEL" --port "$PORT" \ --max-model-len "$MAX_MODEL_LEN" \ --kv-cache-dtype fp8 \ --attention-backend TRITON_ATTN \ - --enforce-eager \ --tool-call-parser minimax_m3 \ --reasoning-parser minimax_m3 \ --enable-auto-tool-choice > "$SERVER_LOG" 2>&1 & diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 36ddee83f..8d4dc8503 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3761,3 +3761,9 @@ description: - "Enable CUDA graphs for MiniMax-M3 MXFP8 on MI300X and set VLLM_USE_BREAKABLE_CUDAGRAPH=0 per AMD guidance" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1750 + +- config-keys: + - minimaxm3-fp8-mi355x-vllm + description: + - "Enable CUDA graphs for MiniMax-M3 MXFP8 on MI355X and set VLLM_USE_BREAKABLE_CUDAGRAPH=0" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/TBD From 9e2258bd2be534c19b0d7469a7e477e9c382d6f2 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Sun, 14 Jun 2026 01:47:55 -0500 Subject: [PATCH 2/2] fix: disable MI355X MiniMax M3 prefix caching --- benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh | 1 + perf-changelog.yaml | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh index 9a70f5249..9ec86f517 100755 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh @@ -53,6 +53,7 @@ set -x vllm serve "$MODEL" --port "$PORT" \ "${PARALLEL_ARGS[@]}" \ --block-size 128 \ + --no-enable-prefix-caching \ --language-model-only \ --max-model-len "$MAX_MODEL_LEN" \ --kv-cache-dtype fp8 \ diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 8d4dc8503..35abea4ce 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3765,5 +3765,5 @@ - config-keys: - minimaxm3-fp8-mi355x-vllm description: - - "Enable CUDA graphs for MiniMax-M3 MXFP8 on MI355X and set VLLM_USE_BREAKABLE_CUDAGRAPH=0" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/TBD + - "Enable CUDA graphs for MiniMax-M3 MXFP8 on MI355X, set VLLM_USE_BREAKABLE_CUDAGRAPH=0, and disable prefix caching" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1754