Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2810,3 +2810,39 @@ minimaxm3-fp8-mi355x-vllm:
- { tp: 8, ep: 8, conc-start: 1, conc-end: 512 }
- { tp: 4, conc-start: 1, conc-end: 128 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 512 }

# EAGLE3 speculative-decoding (spec-decoding: mtp) variant of
# minimaxm3-fp8-mi355x-vllm, pairing MiniMaxAI/MiniMax-M3-MXFP8 with the
# Inferact/MiniMax-M3-EAGLE3 draft head (3 speculative tokens). No
# attention_backend override is needed — the server runs on TRITON_ATTN, so
# the FlashInfer page-128/MHA limitation that forced FLASH_ATTN on Blackwell
# does not apply here. Search space mirrors the non-MTP entry trimmed at the
# extreme-concurrency end, identical to the minimaxm3-fp8-b300-vllm-mtp /
# b200-vllm-mtp precedent: spec decode pays off at low/mid concurrency while
# acceptance dilutes in big batches, and the draft weights + draft KV shave
# headroom — tp2-ep2 is dropped since its KV headroom was already thin.
# Single-node vLLM benchmark entry for the MI355X runner; every search-space
# row below carries spec-decoding: mtp.
minimaxm3-fp8-mi355x-vllm-mtp:
# NOTE(review): the header of the paired recipe script
# (minimaxm3_fp8_mi355x_mtp.sh) records a known blocker dated 2026-06-13:
# engine init on this image fails because the ROCm build lacks EAGLE3 target
# support for MiniMax-M3. Confirm the image has been rebuilt before expecting
# results from this entry.
image: vllm/vllm-openai-rocm:minimax-m3
model: MiniMaxAI/MiniMax-M3-MXFP8
model-prefix: minimaxm3
runner: mi355x
precision: fp8
framework: vllm
multinode: false
scenarios:
fixed-seq-len:
# 1k input / 1k output.
- isl: 1024
osl: 1024
search-space:
# Pure tensor-parallel rows (the changelog labels these "latency").
- { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp }
# Tensor + expert parallel (the changelog labels these "TEP").
- { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp }
- { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
- { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp }
# dp-attn row (the changelog labels this "DEP"); covers the highest
# concurrencies in this scenario.
- { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: mtp }
# 8k input / 1k output. Compared with the 1k/1k list above there is no
# tp: 4, ep: 4 row and the dp-attn row spans 128-256 instead of 256-512.
- isl: 8192
osl: 1024
search-space:
- { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp }
- { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp }
- { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }
124 changes: 124 additions & 0 deletions benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
#!/usr/bin/env bash

# MiniMax-M3 MXFP8 MI355X (gfx950) single-node vLLM recipe with EAGLE3
# speculative decoding — the spec-decoding=mtp variant of
# minimaxm3_fp8_mi355x.sh. Adds the Inferact/MiniMax-M3-EAGLE3 draft head via
# --speculative-config with 3 speculative tokens. Everything else mirrors the
# non-MTP recipe: MXFP8 from TP=4 on gfx950, mandatory --block-size 128,
# --language-model-only for the text-only benchmark, FP8 KV cache,
# --attention-backend TRITON_ATTN, and --enforce-eager.
#
# Unlike the CUDA recipes, the drafter needs no attention_backend override:
# the FlashInfer "page size 128 requires GQA/MQA" limitation that forced
# FLASH_ATTN for the EAGLE3 MHA head on Blackwell is FlashInfer/CUDA-specific.
# Here the whole server runs on TRITON_ATTN (set globally below), which serves
# the MHA draft fine.
#
# KNOWN BLOCKER (2026-06-13): this recipe does NOT yet run on the current
# vllm/vllm-openai-rocm:minimax-m3 image. Engine init fails with
# "RuntimeError: Model does not support EAGLE3 interface but
# aux_hidden_state_outputs was requested" — the ROCm build's
# MiniMaxM3SparseForConditionalGeneration class does not implement vLLM's
# SupportsEagle3 aux-hidden-state hook. The CUDA minimax-m3 image (a newer
# vLLM commit) does, which is why the B300/B200/H100/H200 EAGLE3 recipes pass.
# Confirmed independent of --trust-remote-code (sweeps 27472217773 /
# 27472704212). The recipe is otherwise correct and should pass once the ROCm
# image is rebuilt with MiniMax-M3 EAGLE3 target support.

# Load the shared benchmark helpers, located two directories above this script.
recipe_dir="$(dirname "$0")"
source "${recipe_dir}/../../benchmark_lib.sh"

# Inputs the caller must provide; check_env_vars comes from benchmark_lib.sh
# (presumably it aborts when any name is unset — confirm there).
# NOTE(review): PORT is consumed further down but is not validated here;
# presumably the launcher or benchmark_lib.sh supplies it — confirm.
# EVAL_ONLY and RUN_EVAL are read later as optional switches.
required_env=(
  MODEL
  TP
  EP_SIZE
  DP_ATTENTION
  CONC
  ISL
  OSL
  MAX_MODEL_LEN
  RANDOM_RANGE_RATIO
  RESULT_FILENAME
)
check_env_vars "${required_env[@]}"

# Hugging Face id of the EAGLE3 draft head paired with the target model.
DRAFT_MODEL="Inferact/MiniMax-M3-EAGLE3"

# When launched under Slurm, log which job and node this run landed on.
if [[ -n "$SLURM_JOB_ID" ]]; then
  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

# MODEL stays a bare HF id on the mi355x single-node runner (weights are
# pre-staged in the mounted NFS HF cache, so this is a fast cache hit). An
# absolute path means the target weights are already on disk, so there is
# nothing to download for it.
if [[ "$MODEL" != /* ]]; then
  hf download "$MODEL"
fi

# The EAGLE3 draft is always referenced by its HF id and is not pre-staged,
# so fetch it into the HF cache regardless of how MODEL was supplied.
# (Previously this was skipped whenever MODEL was an absolute path.)
hf download "$DRAFT_MODEL"

# Mirror a non-empty ROCR_VISIBLE_DEVICES into HIP_VISIBLE_DEVICES so both
# variables name the same device set.
if [[ -n "${ROCR_VISIBLE_DEVICES}" ]]; then
  HIP_VISIBLE_DEVICES="${ROCR_VISIBLE_DEVICES}"
  export HIP_VISIBLE_DEVICES
fi

# Exported for the vLLM process; the name suggests an engine-ready timeout in
# seconds (3600 = one hour) — confirm against the vLLM build in the image.
VLLM_ENGINE_READY_TIMEOUT_S=3600
export VLLM_ENGINE_READY_TIMEOUT_S

# File that receives the server's stdout/stderr.
SERVER_LOG=/workspace/server.log

# Eval-only runs need extra context prepared by the benchmark_lib.sh helper.
case "${EVAL_ONLY}" in
  true) setup_eval_context ;;
esac

# Translate the layout knobs (TP, EP_SIZE, DP_ATTENTION) into vLLM
# parallelism flags, collected in PARALLEL_ARGS.
case "${DP_ATTENTION}" in
  true)
    # DP-attention layout: tensor-parallel size drops to 1, the TP value is
    # reused as the data-parallel size, and expert parallelism is always on.
    PARALLEL_ARGS=(
      --tensor-parallel-size 1
      --data-parallel-size "$TP"
      --enable-expert-parallel
    )
    ;;
  *)
    # Plain tensor parallelism; expert parallelism is added only when the
    # requested EP size exceeds 1.
    PARALLEL_ARGS=(--tensor-parallel-size "$TP")
    if [ "$EP_SIZE" -gt 1 ]; then
      PARALLEL_ARGS+=(--enable-expert-parallel)
    fi
    ;;
esac

# Speculation depth; the same value is used for every layout for now.
NUM_SPEC_TOKENS=3

start_gpu_monitor

# JSON handed to --speculative-config: EAGLE3 method, the draft head declared
# earlier in this script, and the speculation depth above.
printf -v spec_config_json \
  '{"method": "eagle3", "model": "%s", "num_speculative_tokens": %s}' \
  "$DRAFT_MODEL" "$NUM_SPEC_TOKENS"

# Full argument vector for the server, assembled before tracing is switched
# on so the xtrace output still shows a single expanded `vllm serve` line.
vllm_serve_argv=(
  serve "$MODEL"
  --port "$PORT"
  "${PARALLEL_ARGS[@]}"
  --block-size 128
  --language-model-only
  --max-model-len "$MAX_MODEL_LEN"
  --kv-cache-dtype fp8
  --attention-backend TRITON_ATTN
  --enforce-eager
  --speculative-config "$spec_config_json"
  --tool-call-parser minimax_m3
  --reasoning-parser minimax_m3
  --enable-auto-tool-choice
)

set -x
# Run the server in the background with stdout and stderr captured in the log.
vllm "${vllm_serve_argv[@]}" > "$SERVER_LOG" 2>&1 &

# Keep the PID so the readiness helper can watch the server process.
SERVER_PID=$!
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

# Python packages installed before the benchmark client is started.
pip install -q datasets pandas

# Spec-decode acceptance rate degrades on raw random tokens, so — like the
# other MTP recipes — prompts go through the chat template. The request count
# is ten times the concurrency level.
bench_argv=(
  --model "$MODEL"
  --port "$PORT"
  --backend vllm
  --input-len "$ISL"
  --output-len "$OSL"
  --random-range-ratio "$RANDOM_RANGE_RATIO"
  --num-prompts "$((CONC * 10))"
  --max-concurrency "$CONC"
  --result-filename "$RESULT_FILENAME"
  --result-dir /workspace/
  --trust-remote-code
  --use-chat-template
)
run_benchmark_serving "${bench_argv[@]}"

# Optional accuracy pass against the same server, followed by its summary.
if [[ "${RUN_EVAL}" == "true" ]]; then
  run_eval --framework lm-eval --port "$PORT"
  append_lm_eval_summary
fi

# Teardown: stop GPU monitoring, then turn command tracing back off.
stop_gpu_monitor
set +x
10 changes: 10 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3717,3 +3717,13 @@
- "B300-parity layouts and concurrency ranges: TP8, TP8+EP8, TP4, TP4+EP4, TP2+EP2, and TP8+EP8 dp-attn (DEP) across 1k1k and 8k1k"
- "launch_mi355x-amds.sh routes M3 weights to NFS /it-share/hf-hub-cache instead of node-local /var/lib NVMe"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1725

- config-keys:
- minimaxm3-fp8-mi355x-vllm-mtp
description:
- "Initial submission: MiniMax-M3 MXFP8 MI355X (gfx950) vLLM benchmark with EAGLE3 speculative decoding (target: MiniMaxAI/MiniMax-M3-MXFP8, draft: Inferact/MiniMax-M3-EAGLE3, 3 speculative tokens) — spec-decoding=mtp variant of the MI355X day-zero recipe"
- "Image: vllm/vllm-openai-rocm:minimax-m3 (same day-zero ROCm build as the non-MTP entry)"
- "Serve shape follows minimaxm3-fp8-mi355x-vllm (--block-size 128, --language-model-only, --kv-cache-dtype fp8, --attention-backend TRITON_ATTN, --enforce-eager, minimax_m3 parsers); prompts routed through the chat template for realistic acceptance"
- "No attention_backend override on the drafter: the server runs on TRITON_ATTN, so the FlashInfer page-128/MHA limitation that forced FLASH_ATTN on the CUDA recipes does not apply on ROCm"
- "Layouts: TP8 / TP4 (latency), TP8+EP8 / TP4+EP4 (TEP), TP8+EP8 dp-attn (DEP) across 1k1k and 8k1k — non-MTP search space trimmed at the extreme-concurrency end, tp2-ep2 dropped, mirroring the minimaxm3-fp8-b300-vllm-mtp search space"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1742
Loading