SemiAnalysisAI · cquil11 · Jun 14, 2026 · Jun 14, 2026 · Jun 14, 2026 · Jun 14, 2026
diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh
@@ -30,6 +30,7 @@ fi
 
 SERVER_LOG=/workspace/server.log
 export VLLM_ENGINE_READY_TIMEOUT_S=3600
+export VLLM_USE_BREAKABLE_CUDAGRAPH=0
 
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context
@@ -52,11 +53,11 @@ set -x
 vllm serve "$MODEL" --port "$PORT" \
     "${PARALLEL_ARGS[@]}" \
     --block-size 128 \
+    --no-enable-prefix-caching \
     --language-model-only \
     --max-model-len "$MAX_MODEL_LEN" \
     --kv-cache-dtype fp8 \
     --attention-backend TRITON_ATTN \
-    --enforce-eager \
     --tool-call-parser minimax_m3 \
     --reasoning-parser minimax_m3 \
     --enable-auto-tool-choice > "$SERVER_LOG" 2>&1 &

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -3768,3 +3768,9 @@
     - "Run the MiniMax-M3 MXFP8 MI300X EAGLE3 MTP recipe with CUDA graphs instead of --enforce-eager"
     - "Drop --enforce-eager and set VLLM_USE_BREAKABLE_CUDAGRAPH=0 (matching the non-MTP MI300X recipe, #1750), which avoids the M3-decode breakable-cudagraph path that previously forced eager execution"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1756
+
+- config-keys:
+    - minimaxm3-fp8-mi355x-vllm
+  description:
+    - "Enable CUDA graphs for MiniMax-M3 MXFP8 on MI355X by dropping --enforce-eager, set VLLM_USE_BREAKABLE_CUDAGRAPH=0, and disable prefix caching with --no-enable-prefix-caching"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1754