diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index f6c9735ab..486edd6ad 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -12676,6 +12676,7 @@ minimaxm3-fp8-b200-vllm-mtp: - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp } - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp } - { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 512, spec-decoding: mtp } - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: mtp } - isl: 8192 osl: 1024 @@ -12683,6 +12684,8 @@ minimaxm3-fp8-b200-vllm-mtp: - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp } - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp } - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp } + - { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 128, spec-decoding: mtp } - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp } # EAGLE3 speculative-decoding (spec-decoding: mtp) variant of @@ -12710,6 +12713,7 @@ minimaxm3-fp8-b300-vllm-mtp: - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp } - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp } - { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 512, spec-decoding: mtp } - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: mtp } - isl: 8192 osl: 1024 @@ -12717,6 +12721,8 @@ minimaxm3-fp8-b300-vllm-mtp: - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp } - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp } - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp } + - { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 128, spec-decoding: mtp } - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp } # EAGLE3 speculative-decoding (spec-decoding: mtp) variant of diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200_mtp.sh index 42e3b9e25..dde0e69c0 100644 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200_mtp.sh @@ -61,6 +61,7 @@ SERVER_LOG=/workspace/server.log # 444 GB of MXFP8 weights off shared FS; engine startup can exceed the # default 600s readiness window. export VLLM_ENGINE_READY_TIMEOUT_S=3600 +export VLLM_FLOAT32_MATMUL_PRECISION=high if [ "${DP_ATTENTION}" = "true" ]; then PARALLEL_ARGS="--tensor-parallel-size=1 --data-parallel-size=$TP --enable-expert-parallel" @@ -73,14 +74,6 @@ fi # use 3 speculative tokens for all configs for now NUM_SPEC_TOKENS=3 -# Fixed-seq-len runs don't need graphs past the decode step's token count: -# with spec decoding every running request contributes 1 + NUM_SPEC_TOKENS -# tokens per step, so capture up to the next power of two >= -# CONC * (1 + NUM_SPEC_TOKENS), capped at vLLM's 2048 ceiling. -CAPTURE_SIZE=4 -while (( CAPTURE_SIZE < CONC * (1 + NUM_SPEC_TOKENS) )); do CAPTURE_SIZE=$((CAPTURE_SIZE * 2)); done -(( CAPTURE_SIZE > 2048 )) && CAPTURE_SIZE=2048 - if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" @@ -95,7 +88,7 @@ $PARALLEL_ARGS \ --max-model-len $MAX_MODEL_LEN \ --block-size 128 \ --language-model-only \ ---max-cudagraph-capture-size $CAPTURE_SIZE \ +--max-cudagraph-capture-size 2048 \ --max-num-batched-tokens "$((ISL * 2 ))" \ --speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL_PATH\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS, \"attention_backend\": \"FLASH_ATTN\"}" \ --stream-interval 20 --no-enable-prefix-caching \ diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300_mtp.sh index b832f1f39..a93da87b0 100644 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300_mtp.sh @@ -62,6 +62,7 @@ SERVER_LOG=/workspace/server.log # 444 GB of MXFP8 weights off shared FS; engine startup can exceed the # default 600s readiness window. export VLLM_ENGINE_READY_TIMEOUT_S=3600 +export VLLM_FLOAT32_MATMUL_PRECISION=high if [ "${DP_ATTENTION}" = "true" ]; then PARALLEL_ARGS="--tensor-parallel-size=1 --data-parallel-size=$TP --enable-expert-parallel" @@ -74,14 +75,6 @@ fi # use 3 speculative tokens for all configs for now NUM_SPEC_TOKENS=3 -# Fixed-seq-len runs don't need graphs past the decode step's token count: -# with spec decoding every running request contributes 1 + NUM_SPEC_TOKENS -# tokens per step, so capture up to the next power of two >= -# CONC * (1 + NUM_SPEC_TOKENS), capped at vLLM's 2048 ceiling. -CAPTURE_SIZE=4 -while (( CAPTURE_SIZE < CONC * (1 + NUM_SPEC_TOKENS) )); do CAPTURE_SIZE=$((CAPTURE_SIZE * 2)); done -(( CAPTURE_SIZE > 2048 )) && CAPTURE_SIZE=2048 - if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" @@ -96,7 +89,7 @@ $PARALLEL_ARGS \ --max-model-len $MAX_MODEL_LEN \ --block-size 128 \ --language-model-only \ ---max-cudagraph-capture-size $CAPTURE_SIZE \ +--max-cudagraph-capture-size 2048 \ --max-num-batched-tokens "$((ISL * 2 ))" \ --speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL_PATH\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS, \"attention_backend\": \"FLASH_ATTN\"}" \ --stream-interval 20 --no-enable-prefix-caching \ diff --git a/perf-changelog.yaml b/perf-changelog.yaml index a677cf010..9747ca860 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3838,6 +3838,7 @@ - "Expand isl=8192 coverage: add TP4 dp-attn sweep (conc 32–2048) and TP4 TP-only sweep (conc 1–32)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1762 + - config-keys: - dsv4-fp4-gb300-dynamo-trt - dsv4-fp4-gb300-dynamo-trt-mtp @@ -3871,3 +3872,11 @@ - "Image: vllm/vllm-openai-rocm:nightly" - "Add more sweep points" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1785 + +- config-keys: + - minimaxm3-fp8-b200-vllm-mtp + - minimaxm3-fp8-b300-vllm-mtp + description: + - "Align MiniMax-M3 B200/B300 EAGLE3 MTP serving with the MiniMax-M2.5 FP8 serving settings by setting VLLM_FLOAT32_MATMUL_PRECISION=high and using max cudagraph capture size 2048." + - "Add TP4+EP4 MTP coverage: DP-attention rows for 1k1k/8k1k and the missing non-DP-attention row for 8k1k." + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1784