Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11850,3 +11850,31 @@ minimaxm3-fp8-b300-vllm-mtp:
- { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp }
- { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }

# MiniMax-M3 MXFP8 B200 MTP (EAGLE3 spec decoding) — recipe spec_decoding
# feature: Inferact/MiniMax-M3-EAGLE3 draft head, num_speculative_tokens=3,
# attention_backend=FLASH_ATTN (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
# Spec decoding accelerates the latency end (compute headroom to speculate), so
# the sweep focuses on low/mid concurrency on the latency layouts. The draft
# head is pre-staged beside the main weights on b200-dgxc (see launch_b200-dgxc.sh).
minimaxm3-fp8-b200-vllm-mtp:
  image: vllm/vllm-openai:minimax-m3
  model: MiniMaxAI/MiniMax-M3-MXFP8
  model-prefix: minimaxm3
  runner: b200-dgxc
  precision: fp8
  framework: vllm
  multinode: false
  scenarios:
    fixed-seq-len:
    # Short-context point (isl 1024 / osl 1024): sweeps TP8, TP4 and TP8+EP8.
    - isl: 1024
      osl: 1024
      search-space:
      - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp }
      - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
      - { tp: 8, ep: 8, conc-start: 1, conc-end: 128, spec-decoding: mtp }
    # Long-input point (isl 8192 / osl 1024): TP8 and TP4 only, with the
    # concurrency ceiling halved to 32.
    - isl: 8192
      osl: 1024
      search-space:
      - { tp: 8, conc-start: 1, conc-end: 32, spec-decoding: mtp }
      - { tp: 4, conc-start: 1, conc-end: 32, spec-decoding: mtp }
124 changes: 124 additions & 0 deletions benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200_mtp.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
#!/usr/bin/env bash

# MiniMax-M3 MXFP8 B200 single-node vLLM MTP (EAGLE3) variant of
# minimaxm3_fp8_b200.sh (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
# Adds the recipe's spec_decoding feature: EAGLE3 speculative decoding with the
# Inferact/MiniMax-M3-EAGLE3 draft head (num_speculative_tokens=3,
# attention_backend=FLASH_ATTN). EAGLE acceptance collapses on raw random
# prompts, so the benchmark routes prompts through chat-formatted encoding via
# --use-chat-template (required for all *_mtp.sh per AGENTS.md).

# Load the shared library that sits two directories above this script.
source "$(dirname "$0")/../../benchmark_lib.sh"

# Hand check_env_vars the settings this script relies on.
# NOTE(review): presumably it aborts when one of them is missing; confirm in
# benchmark_lib.sh.
check_env_vars MODEL TP EP_SIZE DP_ATTENTION \
  CONC ISL OSL MAX_MODEL_LEN \
  RANDOM_RANGE_RATIO RESULT_FILENAME

# When a Slurm job id is present, log the job and the node it landed on.
[[ -z "$SLURM_JOB_ID" ]] || echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"

nvidia-smi

# launch_b200-dgxc.sh rewrites MODEL to the pre-downloaded
# /lustre/fsw/gharunners/models/MiniMax-M3-MXFP8 path, so an absolute MODEL
# needs no download. A bare HF id (b200-cw / b200-nb runners) is fetched here.
case "$MODEL" in
  /*) ;;
  *) hf download "$MODEL" ;;
esac

# Pick the EAGLE3 draft head, in priority order:
#   1. DRAFT_MODEL_PATH, which launch_b200-dgxc.sh exports after mounting the
#      head beside the main weights;
#   2. a MiniMax-M3-EAGLE3 directory that is a sibling of an absolute MODEL;
#   3. the Inferact/MiniMax-M3-EAGLE3 HF id, for stand-alone runs.
if [[ -n "${DRAFT_MODEL_PATH:-}" ]]; then
  DRAFT_MODEL="$DRAFT_MODEL_PATH"
elif [[ "$MODEL" == /* && -d "${MODEL%/*}/MiniMax-M3-EAGLE3" ]]; then
  DRAFT_MODEL="${MODEL%/*}/MiniMax-M3-EAGLE3"
else
  DRAFT_MODEL="Inferact/MiniMax-M3-EAGLE3"
fi

# Only a non-absolute value (an HF id) needs to be downloaded.
case "$DRAFT_MODEL" in
  /*) ;;
  *) hf download "$DRAFT_MODEL" ;;
esac
echo "EAGLE3 draft head: $DRAFT_MODEL"

# File that receives the vLLM server's stdout/stderr.
SERVER_LOG=/workspace/server.log

# Loading 444 GB of MXFP8 weights plus the EAGLE3 draft head from the shared
# filesystem can take longer than the default 600s readiness window.
export VLLM_ENGINE_READY_TIMEOUT_S=3600

# Map the requested layout onto vLLM parallelism flags:
#   DP_ATTENTION=true -> TP=1 with TP-many data-parallel ranks, experts sharded
#   EP_SIZE > 1       -> plain tensor parallelism, experts sharded
#   otherwise         -> plain tensor parallelism only
case "${DP_ATTENTION}" in
  true)
    PARALLEL_ARGS="--tensor-parallel-size=1 --data-parallel-size=$TP --enable-expert-parallel"
    ;;
  *)
    PARALLEL_ARGS="--tensor-parallel-size=$TP"
    if [ "$EP_SIZE" -gt 1 ]; then
      PARALLEL_ARGS+=" --enable-expert-parallel"
    fi
    ;;
esac

# With speculative decoding each decode step verifies CONC*(1+num_spec)
# tokens, so CUDA graphs must cover that many tokens, not just CONC.
# Starting from 4, double until the target is reached (the next power of two
# at or above it), then clamp to vLLM's 2048 ceiling.
NUM_SPEC_TOKENS=3
TARGET=$(( CONC * (NUM_SPEC_TOKENS + 1) ))
for (( CAPTURE_SIZE = 4; CAPTURE_SIZE < TARGET; CAPTURE_SIZE *= 2 )); do
  :
done
if (( CAPTURE_SIZE > 2048 )); then
  CAPTURE_SIZE=2048
fi

# JSON payload for --speculative-config; the draft head path/id and the
# speculative token count are substituted verbatim.
printf -v SPEC_CONFIG \
  '{"method": "eagle3", "model": "%s", "num_speculative_tokens": %s, "attention_backend": "FLASH_ATTN"}' \
  "$DRAFT_MODEL" "$NUM_SPEC_TOKENS"

# Eval-only runs replace the benchmark context length with the eval one.
# NOTE(review): EVAL_MAX_MODEL_LEN is presumably populated by
# setup_eval_context; confirm in benchmark_lib.sh.
if [[ "${EVAL_ONLY}" == "true" ]]; then
  setup_eval_context
  MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
fi

# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor

set -x
# Launch the server in the background with all output sent to SERVER_LOG.
# Every expansion is quoted so a model path or log path containing whitespace
# or glob characters reaches vLLM (or the redirect) as one word. The single
# exception is PARALLEL_ARGS: it is a space-separated string of flags and
# must be word-split into separate arguments.
# shellcheck disable=SC2086
vllm serve "$MODEL" --port "$PORT" \
  $PARALLEL_ARGS \
  --gpu-memory-utilization 0.90 \
  --max-model-len "$MAX_MODEL_LEN" \
  --block-size 128 \
  --language-model-only \
  --speculative-config "$SPEC_CONFIG" \
  --max-cudagraph-capture-size "$CAPTURE_SIZE" \
  --max-num-batched-tokens "$((ISL * 2 ))" \
  --stream-interval 20 --no-enable-prefix-caching \
  --trust-remote-code > "$SERVER_LOG" 2>&1 &

# PID of the background server, handed to the readiness check below.
SERVER_PID=$!

# Wait for server to be ready
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

# Benchmark client arguments. --use-chat-template is essential here: the
# EAGLE3 draft head expects chat-formatted inputs, and benchmarking raw
# prompts silently regresses the acceptance rate.
bench_args=(
  --model "$MODEL"
  --port "$PORT"
  --backend vllm
  --input-len "$ISL"
  --output-len "$OSL"
  --random-range-ratio "$RANDOM_RANGE_RATIO"
  --num-prompts "$((CONC * 10))"   # ten prompts per concurrent slot
  --max-concurrency "$CONC"
  --result-filename "$RESULT_FILENAME"
  --result-dir /workspace/
  --trust-remote-code
  --use-chat-template
)
run_benchmark_serving "${bench_args[@]}"

# The accuracy evaluation is opt-in and runs after the throughput pass.
if [[ "${RUN_EVAL}" == "true" ]]; then
  run_eval --framework lm-eval --port "$PORT"
  append_lm_eval_summary
fi

# Stop GPU monitoring
stop_gpu_monitor
set +x
10 changes: 10 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3677,3 +3677,13 @@
- "Image: vllm/vllm-openai:minimax-m3 (dedicated day-zero image; M3 not in a stable release yet)."
- "Sweeps TP4/TP8, TP+EP (TEP), and DP-attention+EP (DEP) per https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3; H100 is TP8-only (MXFP8 weights ~427 GB)."
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1731

- config-keys:
    - minimaxm3-fp8-b200-vllm-mtp
  description:
    - "MiniMax-M3 MXFP8 B200 MTP (EAGLE3 speculative decoding) variant of minimaxm3-fp8-b200-vllm"
    - "Spec config (recipe spec_decoding feature): method eagle3, draft head Inferact/MiniMax-M3-EAGLE3, num_speculative_tokens=3, attention_backend FLASH_ATTN"
    - "Image: vllm/vllm-openai:minimax-m3; --block-size 128, --language-model-only; benchmark uses --use-chat-template (EAGLE acceptance collapses on raw random prompts)"
    - "Latency-end sweep (TP8, TP4, TP8+EP8) at low/mid concurrency where spec decoding has compute headroom to help"
    - "EAGLE3 draft head pre-staged beside the main weights and bind-mounted on b200-dgxc (launch_b200-dgxc.sh)"
  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1736
9 changes: 8 additions & 1 deletion runners/launch_b200-dgxc.sh
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,13 @@ elif [[ $MODEL_PREFIX == "minimaxm3" && $PRECISION == "fp8" ]]; then
# tree (root-owned); it lives in the sa-shared-writable gharunners tree.
export MODEL_PATH="/lustre/fsw/gharunners/models/MiniMax-M3-MXFP8"
export SRT_SLURM_MODEL_PREFIX="minimax-m3-mxfp8"
# The MTP variant needs the EAGLE3 draft head, which is pre-staged next to
# the main weights. When that directory exists, export its location and add
# it to EXTRA_MOUNTS as a same-path bind mount so the *_mtp.sh script finds
# it as the sibling of MODEL_PATH. The non-MTP script ignores both values.
eagle3_head="/lustre/fsw/gharunners/models/MiniMax-M3-EAGLE3"
if [[ -d "$eagle3_head" ]]; then
  export DRAFT_MODEL_PATH="$eagle3_head"
  if [[ -n "${EXTRA_MOUNTS:-}" ]]; then
    # Append to an existing comma-separated mount list.
    EXTRA_MOUNTS="$EXTRA_MOUNTS,$DRAFT_MODEL_PATH:$DRAFT_MODEL_PATH"
  else
    EXTRA_MOUNTS="$DRAFT_MODEL_PATH:$DRAFT_MODEL_PATH"
  fi
fi
else
echo "Unsupported model prefix/precision: $MODEL_PREFIX/$PRECISION"
echo "Available models under /lustre/fsw/models:"
Expand Down Expand Up @@ -448,7 +455,7 @@ else

srun --jobid=$JOB_ID \
--container-image=$SQUASH_FILE \
--container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$MODEL_PATH:$MODEL_PATH,$AIPERF_MMAP_CACHE_HOST_PATH:/aiperf_mmap_cache \
--container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$MODEL_PATH:$MODEL_PATH,$AIPERF_MMAP_CACHE_HOST_PATH:/aiperf_mmap_cache${EXTRA_MOUNTS:+,$EXTRA_MOUNTS} \
--no-container-mount-home \
--container-workdir=$CONTAINER_MOUNT_DIR \
--no-container-entrypoint --export=ALL,PORT=8888,AIPERF_DATASET_MMAP_CACHE_DIR=/aiperf_mmap_cache \
Expand Down
Loading