SemiAnalysisAI · functionstackx · Jun 14, 2026 · Jun 14, 2026 · cursor · Jun 14, 2026
diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh
@@ -5,8 +5,9 @@
 # minimaxm3_fp8_mi355x.sh. Adds the Inferact/MiniMax-M3-EAGLE3 draft head via
 # --speculative-config with 3 speculative tokens. Everything else mirrors the
 # non-MTP recipe: MXFP8 from TP=4 on gfx950, mandatory --block-size 128,
-# --language-model-only for the text-only benchmark, FP8 KV cache,
-# --attention-backend TRITON_ATTN, and --enforce-eager.
+# --language-model-only for the text-only benchmark, FP8 KV cache, and
+# --attention-backend TRITON_ATTN. Runs with CUDA graphs (no --enforce-eager);
+# VLLM_USE_BREAKABLE_CUDAGRAPH=0 avoids the M3-decode breakable-cudagraph path.
 #
 # Unlike the CUDA recipes, the drafter needs no attention_backend override:
 # the FlashInfer "page size 128 requires GQA/MQA" limitation that forced
@@ -57,6 +58,9 @@ fi
 
 SERVER_LOG=/workspace/server.log
 export VLLM_ENGINE_READY_TIMEOUT_S=3600
+# Run with CUDA graphs (no --enforce-eager). VLLM_USE_BREAKABLE_CUDAGRAPH=0 avoids
+# the M3-decode breakable-cudagraph path that previously forced eager execution.
+export VLLM_USE_BREAKABLE_CUDAGRAPH=0
 
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context
@@ -168,11 +172,11 @@ set -x
 vllm serve "$MODEL" --port "$PORT" \
     "${PARALLEL_ARGS[@]}" \
     --block-size 128 \
+    --no-enable-prefix-caching \
     --language-model-only \
     --max-model-len "$MAX_MODEL_LEN" \
     --kv-cache-dtype fp8 \
     --attention-backend TRITON_ATTN \
-    --enforce-eager \
     --speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS}" \
     --tool-call-parser minimax_m3 \
     --reasoning-parser minimax_m3 \

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -3783,3 +3783,10 @@
     - "H200-aligned layouts and concurrency ranges: TP4 and TP8 latency, TP4/TP8 expert parallelism, and TP8 data-parallel attention across 1k1k and 8k1k"
     - "Route the MI325X Hugging Face cache and runtime compiler caches to node-local storage, and mount ROCm GPU devices explicitly"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1748
+
+- config-keys:
+    - minimaxm3-fp8-mi355x-vllm-mtp
+  description:
+    - "Run the MiniMax-M3 MXFP8 MI355X EAGLE3 MTP recipe with CUDA graphs instead of --enforce-eager"
+    - "Drop --enforce-eager and set VLLM_USE_BREAKABLE_CUDAGRAPH=0, which avoids the M3-decode breakable-cudagraph path that previously forced eager execution (the non-MTP MI355X recipe already got this in #1754); also pass --no-enable-prefix-caching to vllm serve"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1755