Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 66 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4581,6 +4581,40 @@ minimaxm2.5-fp8-h100-vllm-agentic:
- { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 5, 6, 7, 8, 10, 12, 16] }
- { tp: 4, ep: 4, offloading: cpu, conc-list: [5, 6, 7, 8, 10, 12] }

# Day-zero MiniMax-M3 recipe (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
# M3 support has not shipped in a stable vLLM release; the dedicated
# vllm/vllm-openai:minimax-m3 image is the supported path. MXFP8 variant
# (NVIDIA-quantized, ~427 GB weights) is the lowest precision available —
# BF16 (~854 GB) does not fit 8x H100 (640 GB) at all, so H100 is TP8-only:
# weights alone take ~56 GB of each 80 GB GPU, leaving no room below TP8.
# dp-attn: true would map to the recipe's "DP8 + Expert Parallel" serve mode
# (vLLM --data-parallel-size 8 --enable-expert-parallel), but that layout is
# not swept on H100 — see the note in the search-space below.
minimaxm3-fp8-h100-vllm:
  image: vllm/vllm-openai:minimax-m3
  model: MiniMaxAI/MiniMax-M3-MXFP8
  model-prefix: minimaxm3
  runner: h100
  precision: fp8
  framework: vllm
  multinode: false
  scenarios:
    fixed-seq-len:
      - isl: 1024
        osl: 1024
        search-space:
          # DEP (dp-attn) omitted on H100: each DP rank replicates the ~20 GB
          # BF16-dequantized attention/dense/embedding weights next to its
          # ~52 GB expert shard, and KV-cache init fails at high conc (sweep
          # 27441767143, conc 256/512: "No available memory for the cache
          # blocks"). TEP8 covers the high-concurrency regime instead.
          # Plain TP8 for conc 4-128, then TEP8 (tp + ep) at conc 256.
          - { tp: 8, conc-start: 4, conc-end: 128 }
          - { tp: 8, ep: 8, conc-start: 256, conc-end: 256 }
      - isl: 8192
        osl: 1024
        search-space:
          # Same split for the long-prompt case: plain TP8 up to conc 64,
          # TEP8 for conc 128-256. No dp-attn rows here either.
          - { tp: 8, conc-start: 4, conc-end: 64 }
          - { tp: 8, ep: 8, conc-start: 128, conc-end: 256 }

dsr1-fp8-h100-dynamo-sglang:
image: lmsysorg/sglang:v0.5.8-cu130
model: deepseek-ai/DeepSeek-R1-0528
Expand Down Expand Up @@ -4807,6 +4841,38 @@ minimaxm2.5-fp8-h200-vllm-agentic:
- { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 28, 32, 36, 48] }
- { tp: 4, offloading: cpu, conc-list: [24, 28, 32, 36, 40, 48] }

# Day-zero MiniMax-M3 recipe (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
# Dedicated vllm/vllm-openai:minimax-m3 image (no stable release has M3 yet).
# MXFP8 variant (~427 GB weights) is the lowest precision available; on
# 8x H200 (1128 GB) it leaves ample KV headroom where BF16 is a tight fit.
# TP4 (~112 GB weights/GPU) is memory-tight — swept only at low/mid conc.
# dp-attn: true maps to the recipe's "DP8 + Expert Parallel" serve mode.
minimaxm3-fp8-h200-vllm:
  image: vllm/vllm-openai:minimax-m3
  model: MiniMaxAI/MiniMax-M3-MXFP8
  model-prefix: minimaxm3
  runner: h200
  precision: fp8
  framework: vllm
  multinode: false
  scenarios:
    fixed-seq-len:
      - isl: 1024
        osl: 1024
        search-space:
          # Rows without ep are plain tensor parallel, rows with ep are
          # TP + expert parallel (TEP), and rows with dp-attn: true are the
          # recipe's "DP8 + Expert Parallel" mode (DEP).
          # TP4 is memory-tight, so it is swept only at low/mid concurrency.
          - { tp: 4, conc-start: 4, conc-end: 64 }
          - { tp: 4, ep: 4, conc-start: 128, conc-end: 256 }
          - { tp: 8, conc-start: 4, conc-end: 128 }
          - { tp: 8, ep: 8, conc-start: 256, conc-end: 512 }
          - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 }
      - isl: 8192
        osl: 1024
        search-space:
          # Long-prompt case: TP4 stops at conc 32 and has no TEP4 row;
          # TEP8 runs only at conc 256 and DEP8 covers conc 256-512.
          - { tp: 4, conc-start: 4, conc-end: 32 }
          - { tp: 8, conc-start: 4, conc-end: 128 }
          - { tp: 8, ep: 8, conc-start: 256, conc-end: 256 }
          - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512 }

dsr1-fp4-gb200-dynamo-trt:
image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2
model: nvidia/DeepSeek-R1-0528-NVFP4-v2
Expand Down
131 changes: 131 additions & 0 deletions benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_h100.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
#!/usr/bin/env bash

# MiniMax-M3 MXFP8 H100 single-node vLLM recipe
# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). 427B/26B-active MoE with MSA
# sparse attention. --block-size 128 is mandatory (MSA sparse_block_size is
# 128; the default 16 misaligns sparse indexing). The benchmark is text-only,
# so --language-model-only skips the vision encoder and frees VRAM for KV.
# dp-attn=true maps to DP×EP (DEP) per the recipe's "DP8 + Expert Parallel"
# layout; ep>1 maps to TP+EP (TEP). Hopper has no native MX tensor cores, so
# the MXFP8 MoE runs through vLLM's Hopper-compatible backends (Marlin /
# DeepGEMM) selected by the mxfp8 oracle in the minimax-m3 image.

source "$(dirname "$0")/../../benchmark_lib.sh"

# Inputs this script consumes from the environment. check_env_vars comes from
# benchmark_lib.sh; presumably it aborts when one of them is missing — confirm
# against the library.
check_env_vars MODEL TP EP_SIZE DP_ATTENTION CONC ISL OSL \
    MAX_MODEL_LEN RANDOM_RANGE_RATIO RESULT_FILENAME

# Under Slurm, log which job and node this run landed on.
[[ -z "$SLURM_JOB_ID" ]] || echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"

# Record the GPU state in the job log before the server starts.
nvidia-smi

# The shared HF cache sits on a network filesystem. When sibling nodes pull
# the same ~444 GB day-zero checkpoint at once, huggingface_hub's WeakFileLock
# can fail with "[Errno 116] Stale file handle". The download resumes, so it
# is retried up to five times with a 60 s pause between attempts. Afterwards
# the server is launched with HF_HUB_OFFLINE=1 so vllm's snapshot_download
# does a lock-free local-cache read instead of contending on the lock files.
# A MODEL that starts with "/" skips both the download and the offline prefix.
SERVE_OFFLINE=()
if [[ "$MODEL" != /* ]]; then
    attempt=1
    until hf download "$MODEL"; do
        if (( attempt >= 5 )); then
            echo "hf download failed after $attempt attempts" >&2
            exit 1
        fi
        echo "hf download attempt $attempt failed; retrying in 60s" >&2
        sleep 60
        attempt=$(( attempt + 1 ))
    done
    SERVE_OFFLINE=(env HF_HUB_OFFLINE=1)
fi

# The vllm serve launch redirects its stdout/stderr to this file.
SERVER_LOG=/workspace/server.log

export PYTHONNOUSERSITE=1
# Reading ~444 GB of MXFP8 weights from the shared filesystem can take longer
# than the default 600s engine readiness window, so widen it to an hour.
export VLLM_ENGINE_READY_TIMEOUT_S=3600

# Parallel layout:
#   DP_ATTENTION=true -> DP x EP (DEP): TP=1 with TP-many data-parallel ranks
#   EP_SIZE > 1       -> TP + EP (TEP)
#   otherwise         -> plain tensor parallel
case "${DP_ATTENTION}" in
    true)
        PARALLEL_ARGS="--tensor-parallel-size=1 --data-parallel-size=$TP --enable-expert-parallel"
        ;;
    *)
        PARALLEL_ARGS="--tensor-parallel-size=$TP"
        if [ "$EP_SIZE" -gt 1 ]; then
            PARALLEL_ARGS="$PARALLEL_ARGS --enable-expert-parallel"
        fi
        ;;
esac

# CUDA-graph capture bound and GPU memory budget.
#
# Default: fixed-seq-len runs don't need graphs past the request concurrency,
# so the bound is CONC (the per-DP-rank batch is CONC/DP, but ragged arrival
# makes the full CONC bound safer).
#
# DEP on H100 is weights-bound: every DP rank replicates the ~20 GB
# BF16-dequantized attention/dense/embedding weights next to its ~52 GB
# expert shard, and at gmu 0.90 KV-cache init fails outright at high conc
# (sweep 27441767143, conc 256: "No available memory for the cache blocks").
# To claw back headroom, DEP raises gpu-memory-utilization and bounds the
# decode graphs by 2x the per-rank batch share instead of the full CONC.
GMU=0.90
capture_bound=$CONC
if [ "${DP_ATTENTION}" = "true" ]; then
    GMU=0.94
    # Twice the ceiling of CONC / TP.
    PER_RANK_BOUND=$(( 2 * ((CONC + TP - 1) / TP) ))
    capture_bound=$PER_RANK_BOUND
fi

# Smallest power of two (starting from 4) that reaches the bound, capped at
# vLLM's 2048 ceiling.
CAPTURE_SIZE=4
until (( CAPTURE_SIZE >= capture_bound )); do
    CAPTURE_SIZE=$(( CAPTURE_SIZE * 2 ))
done
if (( CAPTURE_SIZE > 2048 )); then
    CAPTURE_SIZE=2048
fi

# Eval-only runs serve with the eval context's length limit instead of the
# benchmark one. setup_eval_context comes from benchmark_lib.sh; presumably it
# is what sets EVAL_MAX_MODEL_LEN — confirm against the library.
if [ "${EVAL_ONLY}" = "true" ]; then
    setup_eval_context
    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
fi
# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor

set -x
# Launch the server in the background, with all output going to SERVER_LOG.
# SERVE_OFFLINE is either empty or an "env HF_HUB_OFFLINE=1" command prefix.
# PARALLEL_ARGS is a space-separated flag string and is deliberately left
# unquoted so it splits into separate arguments. Every other expansion is
# quoted so an empty or space-containing value cannot shift or split the
# argument list.
# --block-size 128 matches the MSA sparse_block_size; --language-model-only
# skips the vision encoder because the benchmark is text-only.
"${SERVE_OFFLINE[@]}" vllm serve "$MODEL" --port "$PORT" \
    $PARALLEL_ARGS \
    --gpu-memory-utilization "$GMU" \
    --max-model-len "$MAX_MODEL_LEN" \
    --block-size 128 \
    --language-model-only \
    --max-cudagraph-capture-size "$CAPTURE_SIZE" \
    --max-num-batched-tokens "$((ISL * 2 ))" \
    --stream-interval 20 --no-enable-prefix-caching \
    --trust-remote-code > "$SERVER_LOG" 2>&1 &

# PID of the backgrounded server, handed to the readiness wait below.
SERVER_PID=$!

# Wait until the backgrounded server reports ready; the helper is given the
# port, the log file and the server PID.
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

# Throughput run: 10 prompts per unit of concurrency, results written under
# /workspace/.
bench_args=(
    --model "$MODEL"
    --port "$PORT"
    --backend vllm
    --input-len "$ISL"
    --output-len "$OSL"
    --random-range-ratio "$RANDOM_RANGE_RATIO"
    --num-prompts "$((CONC * 10))"
    --max-concurrency "$CONC"
    --result-filename "$RESULT_FILENAME"
    --result-dir /workspace/
    --trust-remote-code
)
run_benchmark_serving "${bench_args[@]}"

# The accuracy evaluation runs after throughput, and only when RUN_EVAL=true.
[ "${RUN_EVAL}" != "true" ] || {
    run_eval --framework lm-eval --port "$PORT"
    append_lm_eval_summary
}

# Stop GPU monitoring and turn command tracing back off.
stop_gpu_monitor
set +x
116 changes: 116 additions & 0 deletions benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_h200.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
#!/usr/bin/env bash

# MiniMax-M3 MXFP8 H200 single-node vLLM recipe
# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). 427B/26B-active MoE with MSA
# sparse attention. --block-size 128 is mandatory (MSA sparse_block_size is
# 128; the default 16 misaligns sparse indexing). The benchmark is text-only,
# so --language-model-only skips the vision encoder and frees VRAM for KV.
# dp-attn=true maps to DP×EP (DEP) per the recipe's "DP8 + Expert Parallel"
# layout; ep>1 maps to TP+EP (TEP). Hopper has no native MX tensor cores, so
# the MXFP8 MoE runs through vLLM's Hopper-compatible backends (Marlin /
# DeepGEMM) selected by the mxfp8 oracle in the minimax-m3 image.

source "$(dirname "$0")/../../benchmark_lib.sh"

# Inputs this script consumes from the environment. check_env_vars comes from
# benchmark_lib.sh; presumably it aborts when one of them is missing — confirm
# against the library.
check_env_vars MODEL TP EP_SIZE DP_ATTENTION CONC ISL OSL \
    MAX_MODEL_LEN RANDOM_RANGE_RATIO RESULT_FILENAME

# Under Slurm, log which job and node this run landed on.
[[ -z "$SLURM_JOB_ID" ]] || echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"

# Record the GPU state in the job log before the server starts.
nvidia-smi

# The shared HF cache sits on a network filesystem. When sibling nodes pull
# the same ~444 GB day-zero checkpoint at once, huggingface_hub's WeakFileLock
# can fail with "[Errno 116] Stale file handle". The download resumes, so it
# is retried up to five times with a 60 s pause between attempts. Afterwards
# the server is launched with HF_HUB_OFFLINE=1 so vllm's snapshot_download
# does a lock-free local-cache read instead of contending on the lock files.
# A MODEL that starts with "/" skips both the download and the offline prefix.
SERVE_OFFLINE=()
if [[ "$MODEL" != /* ]]; then
    attempt=1
    until hf download "$MODEL"; do
        if (( attempt >= 5 )); then
            echo "hf download failed after $attempt attempts" >&2
            exit 1
        fi
        echo "hf download attempt $attempt failed; retrying in 60s" >&2
        sleep 60
        attempt=$(( attempt + 1 ))
    done
    SERVE_OFFLINE=(env HF_HUB_OFFLINE=1)
fi

# The vllm serve launch redirects its stdout/stderr to this file.
SERVER_LOG=/workspace/server.log

export PYTHONNOUSERSITE=1
# Reading ~444 GB of MXFP8 weights from the shared filesystem can take longer
# than the default 600s engine readiness window, so widen it to an hour.
export VLLM_ENGINE_READY_TIMEOUT_S=3600

# Parallel layout:
#   DP_ATTENTION=true -> DP x EP (DEP): TP=1 with TP-many data-parallel ranks
#   EP_SIZE > 1       -> TP + EP (TEP)
#   otherwise         -> plain tensor parallel
case "${DP_ATTENTION}" in
    true)
        PARALLEL_ARGS="--tensor-parallel-size=1 --data-parallel-size=$TP --enable-expert-parallel"
        ;;
    *)
        PARALLEL_ARGS="--tensor-parallel-size=$TP"
        if [ "$EP_SIZE" -gt 1 ]; then
            PARALLEL_ARGS="$PARALLEL_ARGS --enable-expert-parallel"
        fi
        ;;
esac

# CUDA-graph capture bound: fixed-seq-len runs don't need graphs past the
# request concurrency. Take the smallest power of two (starting from 4) that
# reaches CONC — the per-DP-rank batch is CONC/DP, but ragged arrival makes
# the full CONC bound safer — and cap it at vLLM's 2048 ceiling.
CAPTURE_SIZE=4
until (( CAPTURE_SIZE >= CONC )); do
    CAPTURE_SIZE=$(( CAPTURE_SIZE + CAPTURE_SIZE ))
done
if (( CAPTURE_SIZE > 2048 )); then
    CAPTURE_SIZE=2048
fi

# Eval-only runs serve with the eval context's length limit instead of the
# benchmark one. setup_eval_context comes from benchmark_lib.sh; presumably it
# is what sets EVAL_MAX_MODEL_LEN — confirm against the library.
if [ "${EVAL_ONLY}" = "true" ]; then
    setup_eval_context
    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
fi
# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor

set -x
# Launch the server in the background, with all output going to SERVER_LOG.
# SERVE_OFFLINE is either empty or an "env HF_HUB_OFFLINE=1" command prefix.
# PARALLEL_ARGS is a space-separated flag string and is deliberately left
# unquoted so it splits into separate arguments. Every other expansion is
# quoted so an empty or space-containing value cannot shift or split the
# argument list.
# --block-size 128 matches the MSA sparse_block_size; --language-model-only
# skips the vision encoder because the benchmark is text-only.
"${SERVE_OFFLINE[@]}" vllm serve "$MODEL" --port "$PORT" \
    $PARALLEL_ARGS \
    --gpu-memory-utilization 0.90 \
    --max-model-len "$MAX_MODEL_LEN" \
    --block-size 128 \
    --language-model-only \
    --max-cudagraph-capture-size "$CAPTURE_SIZE" \
    --max-num-batched-tokens "$((ISL * 2 ))" \
    --stream-interval 20 --no-enable-prefix-caching \
    --trust-remote-code > "$SERVER_LOG" 2>&1 &

# PID of the backgrounded server, handed to the readiness wait below.
SERVER_PID=$!

# Wait until the backgrounded server reports ready; the helper is given the
# port, the log file and the server PID.
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

# Throughput run: 10 prompts per unit of concurrency, results written under
# /workspace/.
bench_args=(
    --model "$MODEL"
    --port "$PORT"
    --backend vllm
    --input-len "$ISL"
    --output-len "$OSL"
    --random-range-ratio "$RANDOM_RANGE_RATIO"
    --num-prompts "$((CONC * 10))"
    --max-concurrency "$CONC"
    --result-filename "$RESULT_FILENAME"
    --result-dir /workspace/
    --trust-remote-code
)
run_benchmark_serving "${bench_args[@]}"

# The accuracy evaluation runs after throughput, and only when RUN_EVAL=true.
[ "${RUN_EVAL}" != "true" ] || {
    run_eval --framework lm-eval --port "$PORT"
    append_lm_eval_summary
}

# Stop GPU monitoring and turn command tracing back off.
stop_gpu_monitor
set +x
11 changes: 11 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3563,6 +3563,7 @@
- "Add atom-disagg sweep: 2P1D DPA+TP8 (conc 256-2048) and 1P1D TP8 (conc 4-256) at isl=8192/osl=1024"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1683


- config-keys:
- dsv4-fp4-b300-trt
- dsv4-fp4-b300-trt-mtp
Expand Down Expand Up @@ -3611,6 +3612,7 @@
- "Expand the TP sweep (included TP=4) for 8k/1k configuration for conc=4 to 64"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1692


- config-keys:
- dsv4-fp4-gb200-dynamo-sglang
description:
Expand Down Expand Up @@ -3666,3 +3668,12 @@
- "Drafter pinned to FLASH_ATTN via speculative-config attention_backend: the EAGLE3 head is MHA and FlashInfer only supports the mandatory page size 128 through its GQA-only trtllm-gen kernel"
- "Layouts: TP8 / TP4 (latency), TP8+EP8 / TP4+EP4 (TEP), TP8+EP8 dp-attn (DEP) across 1k1k and 8k1k — non-MTP search space trimmed at the extreme-concurrency end, tp2-ep2 dropped (draft weights + draft KV headroom)"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1733

# Changelog entry for the MiniMax-M3 MXFP8 H200/H100 vLLM configs.
- config-keys:
    - minimaxm3-fp8-h200-vllm
    - minimaxm3-fp8-h100-vllm
  description:
    - "Day-zero MiniMax-M3 MXFP8 single-node recipes for H200 and H100 (vLLM)."
    - "Image: vllm/vllm-openai:minimax-m3 (dedicated day-zero image; M3 not in a stable release yet)."
    - "Sweeps TP4/TP8, TP+EP (TEP), and DP-attention+EP (DEP) per https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3; H100 is TP8-only (MXFP8 weights ~427 GB) and omits DEP (KV-cache init fails at high concurrency)."
  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1731
Loading