Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11678,3 +11678,40 @@ minimaxm2.5-fp8-gb300-dynamo-vllm:
tp: 4
ep: 4
dp-attn: true

# MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
# 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint
# (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX tensor
# cores on Blackwell. M3 support has not shipped in a stable vLLM release;
# the dedicated vllm/vllm-openai:minimax-m3 image is built from the m3_release
# branch (vllm-project/vllm#45381). --block-size 128 is mandatory (MSA
# sparse/index cache alignment). Weights are NOT SRE-staged; b300 falls back
# to writable /data/models (see launch_b300-nv.sh MODEL_PATH split).
minimaxm3-fp8-b300-vllm:
  image: vllm/vllm-openai:minimax-m3
  model: MiniMaxAI/MiniMax-M3-MXFP8
  model-prefix: minimaxm3
  runner: b300
  precision: fp8
  framework: vllm
  multinode: false
  scenarios:
    fixed-seq-len:
      # 1k1k: 1024 input tokens / 1024 output tokens.
      - isl: 1024
        osl: 1024
        search-space:
          - { tp: 8, conc-start: 1, conc-end: 64 }
          - { tp: 8, ep: 8, conc-start: 1, conc-end: 512 }
          - { tp: 4, conc-start: 1, conc-end: 64 }
          - { tp: 4, ep: 4, conc-start: 64, conc-end: 512 }
          # tp2 fits MXFP8 weights (~222 GB/GPU of 288) but KV headroom is thin;
          # 1k1k only, drop if it OOMs at the high end.
          - { tp: 2, ep: 2, conc-start: 16, conc-end: 128 }
          - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 }
      # 8k1k: 8192 input tokens / 1024 output tokens. No tp2 or tp4+ep4
      # layouts in this scenario.
      - isl: 8192
        osl: 1024
        search-space:
          - { tp: 8, conc-start: 1, conc-end: 64 }
          - { tp: 8, ep: 8, conc-start: 1, conc-end: 512 }
          - { tp: 4, conc-start: 1, conc-end: 128 }
          - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 512 }
109 changes: 109 additions & 0 deletions benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
#!/usr/bin/env bash

# MiniMax-M3 MXFP8 B300 single-node vLLM recipe
# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Same shape as the B200
# script, but follows the b300 launcher's MODEL/MODEL_PATH split:
# launch_b300-nv.sh keeps MODEL as the HF id and points MODEL_PATH at
# /data/models/<basename> (writable NFS) for models not in the SRE-staged
# /scratch/models list — MiniMax-M3 is not staged. --block-size 128 is
# mandatory (MSA sparse/index cache); the benchmark is text-only, so
# --language-model-only frees the vision encoder's VRAM.

source "$(dirname "$0")/../../benchmark_lib.sh"

# Inputs this recipe reads from the environment. check_env_vars is not
# defined in this file (presumably provided by benchmark_lib.sh and expected
# to reject missing variables -- confirm against the library).
required_env=(
    MODEL
    TP EP_SIZE DP_ATTENTION
    CONC ISL OSL
    MAX_MODEL_LEN
    RANDOM_RANGE_RATIO
    RESULT_FILENAME
)
check_env_vars "${required_env[@]}"

# Resolve where the server loads weights from; MODEL_PATH is what
# `vllm serve` is launched with further down.
#   * MODEL_PATH unset/empty (stand-alone runs): download into the default
#     Hugging Face cache and serve by HF id.
#   * MODEL_PATH set but missing or empty: populate it with `hf download`,
#     which creates the directory when needed.
#   * MODEL_PATH set and non-empty: used as-is, no download is attempted.
# NOTE(review): a non-empty directory is trusted as complete, so an
# interrupted earlier download is not resumed here -- confirm that is intended.
if [[ -z "${MODEL_PATH:-}" ]]; then
    hf download "$MODEL"
    export MODEL_PATH="$MODEL"
elif [[ ! -d "$MODEL_PATH" ]] || [[ -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
    hf download "$MODEL" --local-dir "$MODEL_PATH"
fi

# Under Slurm, record which job and node this run landed on.
[[ -z "$SLURM_JOB_ID" ]] || echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"

# Print the GPU inventory/state before the server starts.
nvidia-smi

# File the vLLM server's stdout/stderr is redirected to; the readiness wait
# is pointed at the same file.
SERVER_LOG=/workspace/server.log

# PORT is consumed by `vllm serve`, wait_for_server_ready,
# run_benchmark_serving and run_eval, but it is not among the variables
# validated by check_env_vars. Give it a fallback so a stand-alone run does
# not expand to an empty --port argument; a PORT already present in the
# environment always wins.
PORT="${PORT:-8888}"

# 444 GB of MXFP8 weights off shared FS; engine startup can exceed the
# default 600s readiness window.
export VLLM_ENGINE_READY_TIMEOUT_S=3600

# Translate the requested layout into vLLM parallelism flags.
#   default          : plain tensor parallelism across TP GPUs
#   EP_SIZE > 1      : same, with expert parallelism enabled
#   DP_ATTENTION=true: TP collapses to 1 and the TP value becomes the
#                      data-parallel size, with expert parallelism enabled
PARALLEL_ARGS="--tensor-parallel-size=$TP"
if [ "${DP_ATTENTION}" = "true" ]; then
    PARALLEL_ARGS="--tensor-parallel-size=1 --data-parallel-size=$TP --enable-expert-parallel"
elif [ "$EP_SIZE" -gt 1 ]; then
    PARALLEL_ARGS="$PARALLEL_ARGS --enable-expert-parallel"
fi

# CUDA graphs larger than the request concurrency are wasted on a
# fixed-seq-len run. Start at 4 and keep doubling until the size covers CONC,
# then clamp to 2048 (vLLM's ceiling).
CAPTURE_SIZE=4
until (( CAPTURE_SIZE >= CONC )); do
    CAPTURE_SIZE=$(( CAPTURE_SIZE << 1 ))
done
if (( CAPTURE_SIZE > 2048 )); then
    CAPTURE_SIZE=2048
fi

# Eval-only runs replace the benchmark context length with the eval one.
# setup_eval_context is defined outside this file; presumably it is what
# populates EVAL_MAX_MODEL_LEN -- confirm in benchmark_lib.sh.
if [[ "${EVAL_ONLY}" == "true" ]]; then
    setup_eval_context
    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
fi

# Begin GPU telemetry (power, temperature, clocks every second).
start_gpu_monitor

# Trace every command from here on so the exact server invocation is logged.
set -x

# Launch the vLLM server in the background, with stdout and stderr captured
# in SERVER_LOG.
#   * Weights are loaded from MODEL_PATH while --served-model-name keeps the
#     HF id, so the benchmark client can keep addressing the model as $MODEL.
#   * --block-size 128 is mandatory for this model (MSA sparse/index cache).
#   * --language-model-only: the benchmark is text-only, so the vision
#     encoder's VRAM is freed.
#   * --max-num-batched-tokens is twice the input sequence length.
# Scalar expansions are quoted so an empty value surfaces as an explicit
# empty argument instead of silently shifting the following flag into its
# place. $PARALLEL_ARGS is left unquoted on purpose: it carries several
# space-separated flags that must be word-split.
vllm serve "$MODEL_PATH" --served-model-name "$MODEL" --host 0.0.0.0 --port "$PORT" \
    $PARALLEL_ARGS \
    --gpu-memory-utilization 0.90 \
    --max-model-len "$MAX_MODEL_LEN" \
    --block-size 128 \
    --language-model-only \
    --max-cudagraph-capture-size "$CAPTURE_SIZE" \
    --max-num-batched-tokens "$((ISL * 2))" \
    --stream-interval 20 --no-enable-prefix-caching \
    --trust-remote-code > "$SERVER_LOG" 2>&1 &

# PID of the background server, handed to the readiness wait.
SERVER_PID=$!

# Hold until the background server is ready. The helper is given the port,
# the server log and the server PID.
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

# Throughput run: random prompts at ISL/OSL, ten prompts per concurrency
# slot, capped at CONC requests in flight. Results go to /workspace/.
bench_args=(
    --model "$MODEL"
    --port "$PORT"
    --backend vllm
    --input-len "$ISL"
    --output-len "$OSL"
    --random-range-ratio "$RANDOM_RANGE_RATIO"
    --num-prompts "$((CONC * 10))"
    --max-concurrency "$CONC"
    --result-filename "$RESULT_FILENAME"
    --result-dir /workspace/
    --trust-remote-code
)
run_benchmark_serving "${bench_args[@]}"

# Optional accuracy pass after the throughput run, gated on RUN_EVAL=true:
# run lm-eval against the same server, then append its summary.
if [[ "${RUN_EVAL}" == "true" ]]; then
    run_eval --framework lm-eval --port "$PORT"
    append_lm_eval_summary
fi

# End GPU telemetry and switch command tracing back off.
stop_gpu_monitor
set +x
10 changes: 10 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3636,3 +3636,13 @@
- "Add MiniMax-M2.5 FP4 (NVFP4) B300 TensorRT-LLM benchmark (model: nvidia/MiniMax-M2.5-NVFP4)"
- "Image: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc18"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1712

# Changelog entry for the MiniMax-M3 MXFP8 B300 vLLM config.
- config-keys:
    - minimaxm3-fp8-b300-vllm
  description:
    - "Initial submission: MiniMax-M3 MXFP8 day-zero single-node vLLM benchmark on B300 (model: MiniMaxAI/MiniMax-M3-MXFP8, 427B total / 26B active MoE with MSA sparse attention)"
    - "Image: vllm/vllm-openai:minimax-m3 (already the cu130 build; M3 support is unmerged upstream — vllm-project/vllm#45381)"
    - "--block-size 128 is mandatory (MSA sparse/index cache alignment); --language-model-only skips the vision encoder for text-only throughput; conc-scaled --max-cudagraph-capture-size"
    - "Layouts: TP8 and TP4 (latency), TP4+EP4 / TP8+EP8 (TEP throughput), tp2-ep2, TP8+EP8 dp-attn (DEP) across 1k1k and 8k1k"
    - "Serves from the launch_b300-nv.sh MODEL/MODEL_PATH split (model not in the SRE-staged /scratch/models list -> writable /data/models)"
  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1724
Loading