SemiAnalysisAI · functionstackx · Jun 16, 2026 · Jun 15, 2026 · Jun 15, 2026 · Jun 16, 2026
@@ -12676,13 +12676,16 @@ minimaxm3-fp8-b200-vllm-mtp:
       - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp }
       - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
       - { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp }
+      - { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 512, spec-decoding: mtp }
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: mtp }
     - isl: 8192
       osl: 1024
       search-space:
       - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp }
       - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp }
       - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
+      - { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp }
+      - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 128, spec-decoding: mtp }
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }
 
 # EAGLE3 speculative-decoding (spec-decoding: mtp) variant of
@@ -12710,13 +12713,16 @@ minimaxm3-fp8-b300-vllm-mtp:
       - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp }
       - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
       - { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp }
+      - { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 512, spec-decoding: mtp }
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: mtp }
     - isl: 8192
       osl: 1024
       search-space:
       - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp }
       - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp }
       - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
+      - { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp }
+      - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 128, spec-decoding: mtp }
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }
 
 # EAGLE3 speculative-decoding (spec-decoding: mtp) variant of

diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200_mtp.sh
@@ -61,6 +61,7 @@ SERVER_LOG=/workspace/server.log
 # 444 GB of MXFP8 weights off shared FS; engine startup can exceed the
 # default 600s readiness window.
 export VLLM_ENGINE_READY_TIMEOUT_S=3600
+export VLLM_FLOAT32_MATMUL_PRECISION=high
 
 if [ "${DP_ATTENTION}" = "true" ]; then
   PARALLEL_ARGS="--tensor-parallel-size=1 --data-parallel-size=$TP --enable-expert-parallel"
@@ -73,14 +74,6 @@ fi
 # use 3 speculative tokens for all configs for now
 NUM_SPEC_TOKENS=3
 
-# Fixed-seq-len runs don't need graphs past the decode step's token count:
-# with spec decoding every running request contributes 1 + NUM_SPEC_TOKENS
-# tokens per step, so capture up to the next power of two >=
-# CONC * (1 + NUM_SPEC_TOKENS), capped at vLLM's 2048 ceiling.
-CAPTURE_SIZE=4
-while (( CAPTURE_SIZE < CONC * (1 + NUM_SPEC_TOKENS) )); do CAPTURE_SIZE=$((CAPTURE_SIZE * 2)); done
-(( CAPTURE_SIZE > 2048 )) && CAPTURE_SIZE=2048
-
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context
     MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
@@ -95,7 +88,7 @@ $PARALLEL_ARGS \
 --max-model-len $MAX_MODEL_LEN \
 --block-size 128 \
 --language-model-only \
---max-cudagraph-capture-size $CAPTURE_SIZE \
+--max-cudagraph-capture-size 2048 \
 --max-num-batched-tokens "$((ISL * 2 ))" \
 --speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL_PATH\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS, \"attention_backend\": \"FLASH_ATTN\"}" \
 --stream-interval 20 --no-enable-prefix-caching \

diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300_mtp.sh
@@ -62,6 +62,7 @@ SERVER_LOG=/workspace/server.log
 # 444 GB of MXFP8 weights off shared FS; engine startup can exceed the
 # default 600s readiness window.
 export VLLM_ENGINE_READY_TIMEOUT_S=3600
+export VLLM_FLOAT32_MATMUL_PRECISION=high
 
 if [ "${DP_ATTENTION}" = "true" ]; then
   PARALLEL_ARGS="--tensor-parallel-size=1 --data-parallel-size=$TP --enable-expert-parallel"
@@ -74,14 +75,6 @@ fi
 # use 3 speculative tokens for all configs for now
 NUM_SPEC_TOKENS=3
 
-# Fixed-seq-len runs don't need graphs past the decode step's token count:
-# with spec decoding every running request contributes 1 + NUM_SPEC_TOKENS
-# tokens per step, so capture up to the next power of two >=
-# CONC * (1 + NUM_SPEC_TOKENS), capped at vLLM's 2048 ceiling.
-CAPTURE_SIZE=4
-while (( CAPTURE_SIZE < CONC * (1 + NUM_SPEC_TOKENS) )); do CAPTURE_SIZE=$((CAPTURE_SIZE * 2)); done
-(( CAPTURE_SIZE > 2048 )) && CAPTURE_SIZE=2048
-
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context
     MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
@@ -96,7 +89,7 @@ $PARALLEL_ARGS \
 --max-model-len $MAX_MODEL_LEN \
 --block-size 128 \
 --language-model-only \
---max-cudagraph-capture-size $CAPTURE_SIZE \
+--max-cudagraph-capture-size 2048 \
 --max-num-batched-tokens "$((ISL * 2 ))" \
 --speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL_PATH\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS, \"attention_backend\": \"FLASH_ATTN\"}" \
 --stream-interval 20 --no-enable-prefix-caching \

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -3838,6 +3838,7 @@
     - "Expand isl=8192 coverage: add TP4 dp-attn sweep (conc 32–2048) and TP4 TP-only sweep (conc 1–32)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1762
 
+
 - config-keys:
     - dsv4-fp4-gb300-dynamo-trt
     - dsv4-fp4-gb300-dynamo-trt-mtp
@@ -3871,3 +3872,11 @@
     - "Image: vllm/vllm-openai-rocm:nightly"
     - "Add more sweep points"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1785
+
+- config-keys:
+    - minimaxm3-fp8-b200-vllm-mtp
+    - minimaxm3-fp8-b300-vllm-mtp
+  description:
+    - "Align MiniMax-M3 B200/B300 EAGLE3 MTP serving with the MiniMax-M2.5 FP8 serving settings: set VLLM_FLOAT32_MATMUL_PRECISION=high and pass a fixed --max-cudagraph-capture-size of 2048, replacing the per-concurrency computed capture size."
+    - "Add TP4+EP4 MTP coverage: DP-attention rows for 1k1k/8k1k and the missing non-DP-attention row for 8k1k."
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1784