SemiAnalysisAI · functionstackx · Jun 13, 2026 · Jun 12, 2026 · Jun 12, 2026 · Jun 12, 2026
@@ -4581,6 +4581,40 @@ minimaxm2.5-fp8-h100-vllm-agentic:
       - { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 5, 6, 7, 8, 10, 12, 16] }
       - { tp: 4, ep: 4, offloading: cpu,  conc-list: [5, 6, 7, 8, 10, 12] }
 
+# Day-zero MiniMax-M3 recipe (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
+# M3 support has not shipped in a stable vLLM release; the dedicated
+# vllm/vllm-openai:minimax-m3 image is the supported path. MXFP8 variant
+# (NVIDIA-quantized, ~427 GB weights) is the lowest precision available —
+# BF16 (~854 GB) does not fit 8x H100 (640 GB) at all, so H100 is TP8-only:
+# weights alone take ~56 GB of each 80 GB GPU, leaving no room below TP8.
+# dp-attn: true would map to the recipe's "DP8 + Expert Parallel" mode (vLLM
+# --data-parallel-size 8 --enable-expert-parallel); it is not swept on H100.
+minimaxm3-fp8-h100-vllm:
+  image: vllm/vllm-openai:minimax-m3
+  model: MiniMaxAI/MiniMax-M3-MXFP8
+  model-prefix: minimaxm3
+  runner: h100
+  precision: fp8  # MXFP8 checkpoint, filed under the fp8 precision label
+  framework: vllm
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      # DEP (dp-attn) omitted on H100: each DP rank replicates the ~20 GB
+      # BF16-dequantized attention/dense/embedding weights next to its
+      # ~52 GB expert shard, and KV-cache init fails at high conc (sweep
+      # 27441767143, conc 256/512: "No available memory for the cache
+      # blocks"). TEP8 covers the high-concurrency regime instead.
+      - { tp: 8, conc-start: 4, conc-end: 128 }  # plain TP8, low/mid concurrency
+      - { tp: 8, ep: 8, conc-start: 256, conc-end: 256 }  # TEP8 (TP8 + EP8), conc 256 only
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 64 }  # plain TP8
+      - { tp: 8, ep: 8, conc-start: 128, conc-end: 256 }  # TEP8 (TP8 + EP8)
+
 dsr1-fp8-h100-dynamo-sglang:
   image: lmsysorg/sglang:v0.5.8-cu130
   model: deepseek-ai/DeepSeek-R1-0528
@@ -4807,6 +4841,38 @@ minimaxm2.5-fp8-h200-vllm-agentic:
       - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 28, 32, 36, 48] }
       - { tp: 4, offloading: cpu,  conc-list: [24, 28, 32, 36, 40, 48] }
 
+# Day-zero MiniMax-M3 recipe (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
+# Dedicated vllm/vllm-openai:minimax-m3 image (no stable release has M3 yet).
+# MXFP8 variant (~427 GB weights) is the lowest precision available; on
+# 8x H200 (1128 GB) it leaves ample KV headroom where BF16 is a tight fit.
+# TP4 (~112 GB weights/GPU) is memory-tight — swept only at low/mid conc.
+# dp-attn: true maps to the recipe's "DP8 + Expert Parallel" serve mode.
+minimaxm3-fp8-h200-vllm:
+  image: vllm/vllm-openai:minimax-m3
+  model: MiniMaxAI/MiniMax-M3-MXFP8
+  model-prefix: minimaxm3
+  runner: h200
+  precision: fp8  # MXFP8 checkpoint, filed under the fp8 precision label
+  framework: vllm
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 4, conc-start: 4, conc-end: 64 }  # plain TP4
+      - { tp: 4, ep: 4, conc-start: 128, conc-end: 256 }  # TEP4 (TP4 + EP4)
+      - { tp: 8, conc-start: 4, conc-end: 128 }  # plain TP8
+      - { tp: 8, ep: 8, conc-start: 256, conc-end: 512 }  # TEP8 (TP8 + EP8)
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 }  # DEP8 (DP8 + expert parallel)
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 4, conc-start: 4, conc-end: 32 }  # plain TP4
+      - { tp: 8, conc-start: 4, conc-end: 128 }  # plain TP8
+      - { tp: 8, ep: 8, conc-start: 256, conc-end: 256 }  # TEP8, conc 256 only
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512 }  # DEP8 (DP8 + expert parallel)
+
 dsr1-fp4-gb200-dynamo-trt:
   image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2
   model: nvidia/DeepSeek-R1-0528-NVFP4-v2

diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_h100.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_h100.sh
@@ -0,0 +1,135 @@
+#!/usr/bin/env bash
+
+# MiniMax-M3 MXFP8 H100 single-node vLLM recipe
+# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). 427B/26B-active MoE with MSA
+# sparse attention. --block-size 128 is mandatory (MSA sparse_block_size is
+# 128; the default 16 misaligns sparse indexing). The benchmark is text-only,
+# so --language-model-only skips the vision encoder and frees VRAM for KV.
+# dp-attn=true maps to DP×EP (DEP) per the recipe's "DP8 + Expert Parallel"
+# layout; ep>1 maps to TP+EP (TEP). Hopper has no native MX tensor cores, so
+# the MXFP8 MoE runs through vLLM's Hopper-compatible backends (Marlin /
+# DeepGEMM) selected by the mxfp8 oracle in the minimax-m3 image.
+
+source "$(dirname "$0")/../../benchmark_lib.sh"
+
+# PORT is required as well: it is handed to vllm serve, wait_for_server_ready
+# and run_benchmark_serving below, so it is validated with the other inputs.
+check_env_vars \
+    MODEL \
+    PORT \
+    TP \
+    EP_SIZE \
+    DP_ATTENTION \
+    CONC \
+    ISL \
+    OSL \
+    MAX_MODEL_LEN \
+    RANDOM_RANGE_RATIO \
+    RESULT_FILENAME
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+nvidia-smi
+
+# The shared HF cache lives on a network FS; concurrent day-zero downloads of
+# the same ~444 GB checkpoint from sibling nodes hit huggingface_hub's
+# WeakFileLock "[Errno 116] Stale file handle" race. Retry the download (it
+# resumes), then serve with HF_HUB_OFFLINE=1 so vllm's snapshot_download does
+# a lock-free local-cache read instead of re-contending the lock files.
+SERVE_OFFLINE=()
+if [[ "$MODEL" != /* ]]; then
+  for attempt in 1 2 3 4 5; do
+    hf download "$MODEL" && break
+    if [ "$attempt" = 5 ]; then echo "hf download failed after $attempt attempts" >&2; exit 1; fi
+    echo "hf download attempt $attempt failed; retrying in 60s" >&2
+    sleep 60
+  done
+  SERVE_OFFLINE=(env HF_HUB_OFFLINE=1)
+fi
+
+SERVER_LOG=/workspace/server.log
+
+export PYTHONNOUSERSITE=1
+# ~444 GB of MXFP8 weights off shared FS; engine startup can exceed the
+# default 600s readiness window.
+export VLLM_ENGINE_READY_TIMEOUT_S=3600
+
+# Parallel layout flags, kept in an array so each flag stays one argv word.
+if [ "${DP_ATTENTION}" = "true" ]; then
+  PARALLEL_ARGS=(--tensor-parallel-size=1 "--data-parallel-size=$TP" --enable-expert-parallel)
+elif [ "$EP_SIZE" -gt 1 ]; then
+  PARALLEL_ARGS=("--tensor-parallel-size=$TP" --enable-expert-parallel)
+else
+  PARALLEL_ARGS=("--tensor-parallel-size=$TP")
+fi
+
+# Fixed-seq-len runs don't need graphs past the request concurrency: capture
+# up to the next power of two >= CONC (per-DP-rank batch is CONC/DP but ragged
+# arrival makes the full CONC bound safer), capped at vLLM's 2048 ceiling.
+CAPTURE_SIZE=4
+while (( CAPTURE_SIZE < CONC )); do CAPTURE_SIZE=$((CAPTURE_SIZE * 2)); done
+(( CAPTURE_SIZE > 2048 )) && CAPTURE_SIZE=2048
+
+# H100 DEP is weights-bound: every DP rank replicates the ~20 GB
+# BF16-dequantized attention/dense/embedding weights next to its ~52 GB
+# expert shard, and at gmu 0.90 KV-cache init fails outright at high conc
+# (sweep 27441767143, conc 256: "No available memory for the cache blocks").
+# Claw back headroom: higher gpu-memory-utilization, and decode graphs
+# capped at 2x the per-rank batch share instead of the full CONC bound.
+GMU=0.90
+if [ "${DP_ATTENTION}" = "true" ]; then
+  GMU=0.94
+  PER_RANK_BOUND=$(( 2 * ((CONC + TP - 1) / TP) ))
+  CAPTURE_SIZE=4
+  while (( CAPTURE_SIZE < PER_RANK_BOUND )); do CAPTURE_SIZE=$((CAPTURE_SIZE * 2)); done
+  (( CAPTURE_SIZE > 2048 )) && CAPTURE_SIZE=2048
+fi
+
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+fi
+# Start GPU monitoring (power, temperature, clocks every second)
+start_gpu_monitor
+
+set -x
+"${SERVE_OFFLINE[@]}" vllm serve "$MODEL" --port "$PORT" \
+"${PARALLEL_ARGS[@]}" \
+--gpu-memory-utilization "$GMU" \
+--max-model-len "$MAX_MODEL_LEN" \
+--block-size 128 \
+--language-model-only \
+--max-cudagraph-capture-size "$CAPTURE_SIZE" \
+--max-num-batched-tokens "$((ISL * 2 ))" \
+--stream-interval 20 --no-enable-prefix-caching \
+--trust-remote-code > "$SERVER_LOG" 2>&1 &
+
+SERVER_PID=$!
+
+# Wait for server to be ready
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+run_benchmark_serving \
+    --model "$MODEL" \
+    --port "$PORT" \
+    --backend vllm \
+    --input-len "$ISL" \
+    --output-len "$OSL" \
+    --random-range-ratio "$RANDOM_RANGE_RATIO" \
+    --num-prompts "$((CONC * 10))" \
+    --max-concurrency "$CONC" \
+    --result-filename "$RESULT_FILENAME" \
+    --result-dir /workspace/ \
+    --trust-remote-code
+
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+    run_eval --framework lm-eval --port "$PORT"
+    append_lm_eval_summary
+fi
+
+# Stop GPU monitoring
+stop_gpu_monitor
+set +x
diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_h200.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_h200.sh
@@ -0,0 +1,120 @@
+#!/usr/bin/env bash
+
+# MiniMax-M3 MXFP8 H200 single-node vLLM recipe
+# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). 427B/26B-active MoE with MSA
+# sparse attention. --block-size 128 is mandatory (MSA sparse_block_size is
+# 128; the default 16 misaligns sparse indexing). The benchmark is text-only,
+# so --language-model-only skips the vision encoder and frees VRAM for KV.
+# dp-attn=true maps to DP×EP (DEP) per the recipe's "DP8 + Expert Parallel"
+# layout; ep>1 maps to TP+EP (TEP). Hopper has no native MX tensor cores, so
+# the MXFP8 MoE runs through vLLM's Hopper-compatible backends (Marlin /
+# DeepGEMM) selected by the mxfp8 oracle in the minimax-m3 image.
+
+source "$(dirname "$0")/../../benchmark_lib.sh"
+
+# PORT is required as well: it is handed to vllm serve, wait_for_server_ready
+# and run_benchmark_serving below, so it is validated with the other inputs.
+check_env_vars \
+    MODEL \
+    PORT \
+    TP \
+    EP_SIZE \
+    DP_ATTENTION \
+    CONC \
+    ISL \
+    OSL \
+    MAX_MODEL_LEN \
+    RANDOM_RANGE_RATIO \
+    RESULT_FILENAME
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+nvidia-smi
+
+# The shared HF cache lives on a network FS; concurrent day-zero downloads of
+# the same ~444 GB checkpoint from sibling nodes hit huggingface_hub's
+# WeakFileLock "[Errno 116] Stale file handle" race. Retry the download (it
+# resumes), then serve with HF_HUB_OFFLINE=1 so vllm's snapshot_download does
+# a lock-free local-cache read instead of re-contending the lock files.
+SERVE_OFFLINE=()
+if [[ "$MODEL" != /* ]]; then
+  for attempt in 1 2 3 4 5; do
+    hf download "$MODEL" && break
+    if [ "$attempt" = 5 ]; then echo "hf download failed after $attempt attempts" >&2; exit 1; fi
+    echo "hf download attempt $attempt failed; retrying in 60s" >&2
+    sleep 60
+  done
+  SERVE_OFFLINE=(env HF_HUB_OFFLINE=1)
+fi
+
+SERVER_LOG=/workspace/server.log
+
+export PYTHONNOUSERSITE=1
+# ~444 GB of MXFP8 weights off shared FS; engine startup can exceed the
+# default 600s readiness window.
+export VLLM_ENGINE_READY_TIMEOUT_S=3600
+
+# Parallel layout flags, kept in an array so each flag stays one argv word.
+if [ "${DP_ATTENTION}" = "true" ]; then
+  PARALLEL_ARGS=(--tensor-parallel-size=1 "--data-parallel-size=$TP" --enable-expert-parallel)
+elif [ "$EP_SIZE" -gt 1 ]; then
+  PARALLEL_ARGS=("--tensor-parallel-size=$TP" --enable-expert-parallel)
+else
+  PARALLEL_ARGS=("--tensor-parallel-size=$TP")
+fi
+
+# Fixed-seq-len runs don't need graphs past the request concurrency: capture
+# up to the next power of two >= CONC (per-DP-rank batch is CONC/DP but ragged
+# arrival makes the full CONC bound safer), capped at vLLM's 2048 ceiling.
+CAPTURE_SIZE=4
+while (( CAPTURE_SIZE < CONC )); do CAPTURE_SIZE=$((CAPTURE_SIZE * 2)); done
+(( CAPTURE_SIZE > 2048 )) && CAPTURE_SIZE=2048
+
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+fi
+# Start GPU monitoring (power, temperature, clocks every second)
+start_gpu_monitor
+
+set -x
+"${SERVE_OFFLINE[@]}" vllm serve "$MODEL" --port "$PORT" \
+"${PARALLEL_ARGS[@]}" \
+--gpu-memory-utilization 0.90 \
+--max-model-len "$MAX_MODEL_LEN" \
+--block-size 128 \
+--language-model-only \
+--max-cudagraph-capture-size "$CAPTURE_SIZE" \
+--max-num-batched-tokens "$((ISL * 2 ))" \
+--stream-interval 20 --no-enable-prefix-caching \
+--trust-remote-code > "$SERVER_LOG" 2>&1 &
+
+SERVER_PID=$!
+
+# Wait for server to be ready
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+run_benchmark_serving \
+    --model "$MODEL" \
+    --port "$PORT" \
+    --backend vllm \
+    --input-len "$ISL" \
+    --output-len "$OSL" \
+    --random-range-ratio "$RANDOM_RANGE_RATIO" \
+    --num-prompts "$((CONC * 10))" \
+    --max-concurrency "$CONC" \
+    --result-filename "$RESULT_FILENAME" \
+    --result-dir /workspace/ \
+    --trust-remote-code
+
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+    run_eval --framework lm-eval --port "$PORT"
+    append_lm_eval_summary
+fi
+
+# Stop GPU monitoring
+stop_gpu_monitor
+set +x
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -3563,6 +3563,7 @@
     - "Add atom-disagg sweep: 2P1D DPA+TP8 (conc 256-2048) and 1P1D TP8 (conc 4-256) at isl=8192/osl=1024"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1683
 
+
 - config-keys:
     - dsv4-fp4-b300-trt
     - dsv4-fp4-b300-trt-mtp
@@ -3611,6 +3612,7 @@
     - "Expand the TP sweep (included TP=4) for 8k/1k configuration for conc=4 to 64"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1692
 
+
 - config-keys:
     - dsv4-fp4-gb200-dynamo-sglang
   description:
@@ -3666,3 +3668,12 @@
     - "Drafter pinned to FLASH_ATTN via speculative-config attention_backend: the EAGLE3 head is MHA and FlashInfer only supports the mandatory page size 128 through its GQA-only trtllm-gen kernel"
     - "Layouts: TP8 / TP4 (latency), TP8+EP8 / TP4+EP4 (TEP), TP8+EP8 dp-attn (DEP) across 1k1k and 8k1k — non-MTP search space trimmed at the extreme-concurrency end, tp2-ep2 dropped (draft weights + draft KV headroom)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1733
+
+- config-keys:
+    - minimaxm3-fp8-h200-vllm
+    - minimaxm3-fp8-h100-vllm
+  description:
+    - "Day-zero MiniMax-M3 MXFP8 single-node recipes for H200 and H100 (vLLM)."
+    - "Image: vllm/vllm-openai:minimax-m3 (dedicated day-zero image; M3 not in a stable release yet)."
+    - "Sweeps TP4/TP8, TP+EP (TEP), and DP-attention+EP (DEP) on H200 per https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3; H100 is TP8-only (MXFP8 weights ~427 GB) and sweeps TP8/TEP8 without DEP (KV-cache init fails at high concurrency)."
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1731