Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12676,13 +12676,16 @@ minimaxm3-fp8-b200-vllm-mtp:
- { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp }
- { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
- { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp }
- { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 512, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: mtp }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp }
- { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp }
- { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
- { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp }
- { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 128, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }

# EAGLE3 speculative-decoding (spec-decoding: mtp) variant of
Expand Down Expand Up @@ -12710,13 +12713,16 @@ minimaxm3-fp8-b300-vllm-mtp:
- { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp }
- { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
- { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp }
- { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 512, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: mtp }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp }
- { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp }
- { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
- { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp }
- { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 128, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }

# EAGLE3 speculative-decoding (spec-decoding: mtp) variant of
Expand Down
11 changes: 2 additions & 9 deletions benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200_mtp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ SERVER_LOG=/workspace/server.log
# Loading 444 GB of MXFP8 weights from the shared filesystem can push engine
# startup past the default 600s readiness window, so raise the timeout.
export VLLM_ENGINE_READY_TIMEOUT_S=3600
export VLLM_FLOAT32_MATMUL_PRECISION=high

if [ "${DP_ATTENTION}" = "true" ]; then
PARALLEL_ARGS="--tensor-parallel-size=1 --data-parallel-size=$TP --enable-expert-parallel"
Expand All @@ -73,14 +74,6 @@ fi
# Use 3 speculative tokens for all configs for now.
NUM_SPEC_TOKENS=3

# Fixed-seq-len runs don't need graphs past the decode step's token count:
# with spec decoding every running request contributes 1 + NUM_SPEC_TOKENS
# tokens per step, so capture up to the next power of two >=
# CONC * (1 + NUM_SPEC_TOKENS), capped at vLLM's 2048 ceiling.
CAPTURE_SIZE=4
while (( CAPTURE_SIZE < CONC * (1 + NUM_SPEC_TOKENS) )); do CAPTURE_SIZE=$((CAPTURE_SIZE * 2)); done
(( CAPTURE_SIZE > 2048 )) && CAPTURE_SIZE=2048

if [ "${EVAL_ONLY}" = "true" ]; then
setup_eval_context
MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
Expand All @@ -95,7 +88,7 @@ $PARALLEL_ARGS \
--max-model-len $MAX_MODEL_LEN \
--block-size 128 \
--language-model-only \
--max-cudagraph-capture-size $CAPTURE_SIZE \
--max-cudagraph-capture-size 2048 \
--max-num-batched-tokens "$((ISL * 2 ))" \
--speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL_PATH\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS, \"attention_backend\": \"FLASH_ATTN\"}" \
--stream-interval 20 --no-enable-prefix-caching \
Expand Down
11 changes: 2 additions & 9 deletions benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300_mtp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ SERVER_LOG=/workspace/server.log
# Loading 444 GB of MXFP8 weights from the shared filesystem can push engine
# startup past the default 600s readiness window, so raise the timeout.
export VLLM_ENGINE_READY_TIMEOUT_S=3600
export VLLM_FLOAT32_MATMUL_PRECISION=high

if [ "${DP_ATTENTION}" = "true" ]; then
PARALLEL_ARGS="--tensor-parallel-size=1 --data-parallel-size=$TP --enable-expert-parallel"
Expand All @@ -74,14 +75,6 @@ fi
# Use 3 speculative tokens for all configs for now.
NUM_SPEC_TOKENS=3

# Fixed-seq-len runs don't need graphs past the decode step's token count:
# with spec decoding every running request contributes 1 + NUM_SPEC_TOKENS
# tokens per step, so capture up to the next power of two >=
# CONC * (1 + NUM_SPEC_TOKENS), capped at vLLM's 2048 ceiling.
CAPTURE_SIZE=4
while (( CAPTURE_SIZE < CONC * (1 + NUM_SPEC_TOKENS) )); do CAPTURE_SIZE=$((CAPTURE_SIZE * 2)); done
(( CAPTURE_SIZE > 2048 )) && CAPTURE_SIZE=2048

if [ "${EVAL_ONLY}" = "true" ]; then
setup_eval_context
MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
Expand All @@ -96,7 +89,7 @@ $PARALLEL_ARGS \
--max-model-len $MAX_MODEL_LEN \
--block-size 128 \
--language-model-only \
--max-cudagraph-capture-size $CAPTURE_SIZE \
--max-cudagraph-capture-size 2048 \
--max-num-batched-tokens "$((ISL * 2 ))" \
--speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL_PATH\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS, \"attention_backend\": \"FLASH_ATTN\"}" \
--stream-interval 20 --no-enable-prefix-caching \
Expand Down
9 changes: 9 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3838,6 +3838,7 @@
- "Expand isl=8192 coverage: add TP4 dp-attn sweep (conc 32–2048) and TP4 TP-only sweep (conc 1–32)"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1762


- config-keys:
- dsv4-fp4-gb300-dynamo-trt
- dsv4-fp4-gb300-dynamo-trt-mtp
Expand Down Expand Up @@ -3871,3 +3872,11 @@
- "Image: vllm/vllm-openai-rocm:nightly"
- "Add more sweep points"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1785

- config-keys:
- minimaxm3-fp8-b200-vllm-mtp
- minimaxm3-fp8-b300-vllm-mtp
description:
- "Align MiniMax-M3 B200/B300 EAGLE3 MTP serving with the MiniMax-M2.5 FP8 serving settings by setting VLLM_FLOAT32_MATMUL_PRECISION=high and using max cudagraph capture size 2048."
- "Add TP4+EP4 MTP coverage: DP-attention rows for 1k1k/8k1k and the missing non-DP-attention row for 8k1k."
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1784