Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2587,6 +2587,32 @@ minimaxm3-fp8-mi355x-vllm-mtp:
- { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }

# vLLM single-node recipe for MiniMax-M3 (MXFP4) on MI355X.
# Serves amd/MiniMax-M3-MXFP4 through the AITER MoE backend. MSA needs a block
# size of 128, and the KV cache is not switched to fp8 because the checkpoint
# carries no calibrated KV scales.
# Derived from the mxfp4 variant of the upstream recipe:
# https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3
# Sweep layout: TP4 runs the full concurrency range, while TP8 is restricted to
# the low-concurrency / low-latency end.
minimaxm3-fp4-mi355x-vllm:
  image: rocm/vllm-dev:vllm-0.23.1-rocm723-mi35x-mori-0625
  model: amd/MiniMax-M3-MXFP4
  model-prefix: minimaxm3
  runner: mi355x
  precision: fp4
  framework: vllm
  multinode: false
  scenarios:
    fixed-seq-len:
      # 1024 in / 1024 out
      - isl: 1024
        osl: 1024
        search-space:
          - { tp: 8, conc-start: 1, conc-end: 32 }
          - { tp: 4, conc-start: 1, conc-end: 256 }
      # 8192 in / 1024 out
      - isl: 8192
        osl: 1024
        search-space:
          - { tp: 8, conc-start: 1, conc-end: 32 }
          - { tp: 4, conc-start: 1, conc-end: 256 }

# MiniMax-M3 MXFP4 MI355X atom recipe:
# https://github.com/ROCm/ATOM/blob/5d42d49f9e4292e5b61475917e92e7ec1b1dacb7/recipes/MiniMax-M3.md
# block size 128 is mandatory for MSA. TP4 on a single gfx950 node, per the recipe.
Expand Down
91 changes: 91 additions & 0 deletions benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
#!/usr/bin/env bash

# MiniMax-M3 MXFP4 MI355X (gfx950) single-node vLLM recipe.
# amd/MiniMax-M3-MXFP4 served through the AITER MoE backend. Requires block size
# 128 (MSA sparse attention). KV cache is left at the default dtype: this
# checkpoint ships no calibrated FP8 KV scales, so --kv-cache-dtype fp8 would
# fall back to an uncalibrated scale of 1.0 (accuracy risk).
#
# Flow: validate inputs, start `vllm serve` in the background, wait until it is
# ready, run the serving benchmark, then optionally run lm-eval.
#
# Required environment: see the check_env_vars list below.
# Optional environment: SLURM_JOB_ID / SLURMD_NODENAME (logging only),
# ROCR_VISIBLE_DEVICES, EVAL_ONLY, RUN_EVAL.

# Helper functions used below (check_env_vars, setup_eval_context,
# start_gpu_monitor, wait_for_server_ready, run_benchmark_serving, run_eval,
# append_lm_eval_summary, stop_gpu_monitor) are not defined in this script;
# presumably they all come from benchmark_lib.sh -- confirm.
source "$(dirname "$0")/../../benchmark_lib.sh"

# PORT is listed with the other inputs because it is handed to the server, the
# readiness probe, the benchmark client and the eval run. Without it an unset
# PORT would only surface later as `vllm serve --port ""`.
# NOTE(review): assumes check_env_vars rejects unset variables -- confirm in
# benchmark_lib.sh.
check_env_vars \
    MODEL \
    PORT \
    TP \
    EP_SIZE \
    DP_ATTENTION \
    CONC \
    ISL \
    OSL \
    MAX_MODEL_LEN \
    RANDOM_RANGE_RATIO \
    RESULT_FILENAME

if [[ -n "$SLURM_JOB_ID" ]]; then
    echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

# A MODEL value that starts with "/" is treated as a local path and is not
# downloaded; anything else is fetched with `hf download`.
if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi

# Mirror the ROCr device selection into the HIP variable when one is provided.
if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
    export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
fi

SERVER_LOG=/workspace/server.log
export VLLM_ENGINE_READY_TIMEOUT_S=3600
export VLLM_USE_BREAKABLE_CUDAGRAPH=0
export VLLM_ROCM_USE_AITER=1
export VLLM_ROCM_USE_AITER_MOE=1

if [ "${EVAL_ONLY}" = "true" ]; then
    setup_eval_context
fi

# Parallelism flags:
#   default          -> tensor parallel across TP GPUs
#   DP_ATTENTION     -> TP fixed to 1, data-parallel size TP, expert parallel on
#   EP_SIZE > 1 only -> tensor parallel plus expert parallel
PARALLEL_ARGS=(--tensor-parallel-size "$TP")
if [ "${DP_ATTENTION}" = "true" ]; then
    PARALLEL_ARGS=(
        --tensor-parallel-size 1
        --data-parallel-size "$TP"
        --enable-expert-parallel
    )
elif [ "$EP_SIZE" -gt 1 ]; then
    PARALLEL_ARGS+=(--enable-expert-parallel)
fi

start_gpu_monitor

set -x
# The server runs in the background; stdout and stderr both go to SERVER_LOG.
vllm serve "$MODEL" --port "$PORT" \
    "${PARALLEL_ARGS[@]}" \
    --block-size 128 \
    --no-enable-prefix-caching \
    --language-model-only \
    --max-model-len "$MAX_MODEL_LEN" \
    --moe-backend aiter \
    --attention-backend TRITON_ATTN \
    --tool-call-parser minimax_m3 \
    --reasoning-parser minimax_m3 \
    --enable-auto-tool-choice > "$SERVER_LOG" 2>&1 &

SERVER_PID=$!
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

# Ten prompts are issued per unit of requested concurrency.
run_benchmark_serving \
    --model "$MODEL" \
    --port "$PORT" \
    --backend vllm \
    --input-len "$ISL" \
    --output-len "$OSL" \
    --random-range-ratio "$RANDOM_RANGE_RATIO" \
    --num-prompts "$((CONC * 10))" \
    --max-concurrency "$CONC" \
    --result-filename "$RESULT_FILENAME" \
    --result-dir /workspace/ \
    --trust-remote-code

if [ "${RUN_EVAL}" = "true" ]; then
    run_eval --framework lm-eval --port "$PORT"
    append_lm_eval_summary
fi

stop_gpu_monitor
set +x
6 changes: 6 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4190,3 +4190,9 @@
- "Update the DeepSeek-V4-Pro B200 disaggregated Dynamo-vLLM benchmark to the vllm/vllm-openai:v0.23.0 image"
- "Lower max-num-batched-tokens to 16384 and gpu-memory-utilization to 0.9 on the high-throughput and max-throughput recipes to avoid OOM"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1899

- config-keys:
    - minimaxm3-fp4-mi355x-vllm
  description:
    - "Initial submission: MiniMax-M3 MXFP4 single-node on MI355X with vLLM (AITER MoE, TRITON_ATTN MSA, block-size 128, no fp8 KV cache). Pure TP: TP8 low-concurrency (conc 1-32) + TP4 full sweep (conc 1-256), at 1k/1k and 8k/1k."
  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1936