Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
e0b81f4
[AMD] server_atom: improve config print and cleanup
seungrokj Jun 19, 2026
2ecfe19
update perf-changelog for dsv4-fp4-mi355x-atom-disagg-mtp
seungrokj Jun 19, 2026
68ff385
[AMD] fix DECODE_MTP_SIZE and BENCH_REQUEST_RATE propagation in atom-…
seungrokj Jun 19, 2026
f0c64d8
[AMD] server_atom: pass SPEC_ARGS to prefill server
seungrokj Jun 19, 2026
53722ef
[AMD] amd-master: fix comment for 1P1D TP8+DPA+TBO+MTP1 config
seungrokj Jun 19, 2026
09f0d18
[AMD] dsv4_atom-disagg: remove DECODE_MTP_SIZE from check_env_vars
seungrokj Jun 19, 2026
7643da7
[AMD] bench: use --dsv4 flag for DeepSeek-V4-Pro MTP benchmarks
seungrokj Jun 19, 2026
f9c69d3
[AMD] server_atom: export IS_MTP=true when SPEC_DECODING=mtp for benc…
seungrokj Jun 19, 2026
290eb53
[AMD] server_atom: fix hf-overrides JSON quoting
seungrokj Jun 19, 2026
82ce90f
fix: inline --hf-overrides to avoid eval word-splitting, remove OPT_ARGS
seungrokj Jun 19, 2026
af235c9
refactor: extract --hf-overrides into HF_OVERRIDES_ARG variable
seungrokj Jun 19, 2026
e264c4e
fix: enable --hf-overrides only for DeepSeek-V4-Pro
seungrokj Jun 19, 2026
2cea307
fix: add HF_OVERRIDES_ARG to INFO config print block
seungrokj Jun 19, 2026
74c7a5a
fix: replace broken-quote array splice with ${ARRAY[*]} in CMD strings
seungrokj Jun 19, 2026
95a730e
fix: remove ${CUDAGRAPH_OPT} from decode CMD
seungrokj Jun 19, 2026
4d2cf04
feat: add MiniMax-M3 ATOM disagg CI script and server_atom.sh support
seungrokj Jun 19, 2026
14eb7f2
feat: add minimaxm3-fp4-mi355x-atom-disagg recipe and AITER_QUICK_RED…
seungrokj Jun 19, 2026
c2cce71
feat: export AITER_QUICK_REDUCE_QUANTIZATION=INT4 for non-DSv4 models
seungrokj Jun 19, 2026
aeb73dc
fix: server_atom.sh and minimaxm3 disagg cleanup
seungrokj Jun 19, 2026
697f26f
fix: dsv4_fp4_mi355x_atom-disagg cleanup
seungrokj Jun 19, 2026
f2b89c6
fix: set BLOCK_SIZE=128 for MiniMax-M3 in minimaxm3_fp4_mi355x_atom-d…
seungrokj Jun 19, 2026
48b3daf
fix: use KV_CACHE_DTYPE=fp8 for MiniMax-M3 disagg (matches atom serve…
seungrokj Jun 19, 2026
d06a44c
feat: update minimaxm3-fp4-mi355x-atom-disagg search space and disabl…
seungrokj Jun 19, 2026
38e82c3
feat: add MiniMax-M3-MXFP4/MXFP8 to models_atom.yaml; set KV_CACHE_DT…
seungrokj Jun 19, 2026
69e4be7
fix: set mi355x-disagg runner and add dynamic cudagraph sizes for dec…
seungrokj Jun 19, 2026
4b57fab
fix: gate ATOM_MOE_GU_ITLV and AITER_BF16_FP8_MOE_BOUND on DeepSeek-V…
seungrokj Jun 19, 2026
1e0bb1e
fix: preserve empty KV_CACHE_DTYPE to skip --kv-cache-dtype flag
seungrokj Jun 20, 2026
a933e35
fix: use KV_CACHE_DTYPE=auto for minimaxm3 disagg to skip --kv-cache-…
seungrokj Jun 20, 2026
41f23a1
fix: align minimaxm3 disagg settings with slurm reference script
seungrokj Jun 20, 2026
ae82c60
fix: rename minimaxm3-fp4-mi355x-atom-disagg to minimaxm3-fp8 and rem…
seungrokj Jun 20, 2026
bcbda4f
feat: add minimaxm3_fp8_mi355x_atom-disagg multi-node benchmark script
seungrokj Jun 20, 2026
5e82263
benchmarks: rename minimaxm3 to dsv4 atom-disagg script and generaliz…
seungrokj Jun 21, 2026
c0a813b
fix: bump minimaxm3-fp8-mi355x-atom-disagg image and pin MAX_MODEL_LEN
seungrokj Jun 23, 2026
a2e7439
Merge branch 'main' into amd/atom_mesh_0619_m3_fp8
seungrokj Jun 25, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 51 additions & 0 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2631,6 +2631,57 @@ minimaxm3-fp8-mi355x-atom-mtp:
search-space:
- { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp }

minimaxm3-fp8-mi355x-atom-disagg:
image: rocm/atom-dev:MiniMax-M3-20260622
model: amd/MiniMax-M3-MXFP8
model-prefix: minimaxm3
runner: mi355x-disagg
precision: fp8
framework: atom-disagg
multinode: true
disagg: true
scenarios:
Comment thread
cursor[bot] marked this conversation as resolved.
fixed-seq-len:
- isl: 8192
osl: 1024
search-space:
# 1P1D TP4
- conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ]
prefill:
num-worker: 1
tp: 4
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 1
tp: 4
ep: 1
dp-attn: false
additional-settings:
Comment thread
chunfangamd marked this conversation as resolved.
- "DECODE_NODES=1"
# Same 1P1D TP4 layout, at ISL 1024 / OSL 1024
- isl: 1024
osl: 1024
search-space:
# 1P1D TP4
- conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ]
prefill:
num-worker: 1
tp: 4
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 1
tp: 4
ep: 1
dp-attn: false
additional-settings:
- "DECODE_NODES=1"

# MiniMax-M3 MXFP8 MI300X day-zero recipe. Reuse the dedicated ROCm image and
# MI355X serving shape, but retain the default BF16 KV cache because this
# checkpoint lacks calibrated ROCm FP8 attention scales. Use the TP8-only H100
Expand Down
6 changes: 5 additions & 1 deletion benchmarks/multi_node/amd_utils/bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,11 @@ for max_concurrency in "${chosen_concurrencies[@]}"; do
extra_flags="--trust-remote-code --tokenizer $MODEL_PATH"
else
if [ "$IS_MTP" = "true" ]; then
extra_flags="--use-chat-template"
# MTP benchmark flags: DeepSeek-V4-Pro is driven with --dsv4, every other
# model with --use-chat-template.
case "$MODEL_NAME" in
  DeepSeek-V4-Pro) extra_flags="--dsv4" ;;
  *) extra_flags="--use-chat-template" ;;
esac
fi
fi

Expand Down
10 changes: 6 additions & 4 deletions benchmarks/multi_node/amd_utils/env_atom.sh
Original file line number Diff line number Diff line change
Expand Up @@ -46,16 +46,18 @@ export LOGLEVEL=WARNING
# mooncake RDMA KV transfer library path
export LD_LIBRARY_PATH=/opt/venv/lib/python3.10/site-packages/mooncake:/opt/rocm/lib:${LD_LIBRARY_PATH:-}

# ATOM MoE gather/scatter interleave optimization
export ATOM_MOE_GU_ITLV=1

# ATOM_HOST_IP is set per-node in server_atom.sh (= host_ip, used as handshake IP)

# aiter logging (WARNING to reduce noise; use DEBUG for troubleshooting)
export AITER_LOG_LEVEL=WARNING

# Disable bf16->fp8 MoE bound (matches reference script)
export AITER_BF16_FP8_MOE_BOUND=0
# DeepSeek-V4-Pro only: enable the ATOM MoE gather/scatter interleave
# optimization and disable the bf16->fp8 MoE bound. Other models leave both
# variables untouched.
case "$MODEL_NAME" in
  DeepSeek-V4-Pro)
    export ATOM_MOE_GU_ITLV=1 AITER_BF16_FP8_MOE_BOUND=0
    ;;
esac

# Clear stale ATOM cache on startup (server_atom.sh handles this via rm -rf)
# No env var needed; documented here for reference.
Expand Down
5 changes: 4 additions & 1 deletion benchmarks/multi_node/amd_utils/job.slurm
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,7 @@ DOCKER_ENV_COMMON=(
-e BENCH_RANDOM_RANGE_RATIO=\$BENCH_RANDOM_RANGE_RATIO
-e BENCH_NUM_PROMPTS_MULTIPLIER=\$BENCH_NUM_PROMPTS_MULTIPLIER
-e BENCH_MAX_CONCURRENCY=\$BENCH_MAX_CONCURRENCY
-e BENCH_REQUEST_RATE=\$BENCH_REQUEST_RATE
-e TQDM_MININTERVAL=\$TQDM_MININTERVAL
-e DRY_RUN=\$DRY_RUN
-e BENCHMARK_LOGS_DIR=/benchmark_logs
Expand Down Expand Up @@ -411,10 +412,12 @@ elif [[ "$ENGINE" == "atom-disagg" ]]; then
-e DECODE_PORT=${DECODE_PORT:-8020}
-e ROUTER_PORT=${ROUTER_PORT:-30000}
-e HANDSHAKE_PORT=${HANDSHAKE_PORT:-6301}
-e MEM_FRACTION=${MEM_FRACTION:-0.85}
-e MEM_FRAC_STATIC=${MEM_FRAC_STATIC:-0.85}
-e KV_CACHE_DTYPE=${KV_CACHE_DTYPE:-fp8}
-e BLOCK_SIZE=${BLOCK_SIZE:-16}
-e MAX_NUM_SEQS=${MAX_NUM_SEQS:-256}
-e MAX_MODEL_LEN=${MAX_MODEL_LEN:-}
-e MAX_NUM_BATCHED_TOKENS=${MAX_NUM_BATCHED_TOKENS:-}
-e EXTRA_SERVER_ARGS=\${EXTRA_SERVER_ARGS:-}
-e IBDEVICES=${IBDEVICES:-}
)
Expand Down
10 changes: 10 additions & 0 deletions benchmarks/multi_node/amd_utils/models_atom.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,13 @@ DeepSeek-V4-Pro:
base_flags: ""
mtp_flags: ""
dp_flags: ""

MiniMax-M3-MXFP4:
base_flags: ""
mtp_flags: ""
dp_flags: ""

MiniMax-M3-MXFP8:
base_flags: ""
mtp_flags: ""
dp_flags: ""
115 changes: 96 additions & 19 deletions benchmarks/multi_node/amd_utils/server_atom.sh
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -35,17 +35,23 @@ DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}"
DECODE_ENABLE_EP="${DECODE_ENABLE_EP}"
DECODE_ENABLE_DP="${DECODE_ENABLE_DP}"

# MTP (speculative decoding): SPEC_DECODING defaults to empty, and the number
# of speculative tokens defaults to 1.
: "${SPEC_DECODING:=}"
: "${DECODE_MTP_SIZE:=1}"

# ATOM server ports (different from SGLang which uses 8000 for all)
: "${PREFILL_PORT:=8010}"
: "${DECODE_PORT:=8020}"
: "${ROUTER_PORT:=8000}"
: "${HANDSHAKE_PORT:=6301}"

# ATOM server tuning (from reference script defaults)
MEM_FRACTION="${MEM_FRACTION:-0.85}"
# Server tuning defaults. An empty KV_CACHE_DTYPE is replaced by fp8 here;
# MAX_MODEL_LEN, MAX_NUM_BATCHED_TOKENS and EXTRA_SERVER_ARGS default to empty.
: "${MEM_FRAC_STATIC:=0.85}"
: "${KV_CACHE_DTYPE:=fp8}"
: "${BLOCK_SIZE:=16}"
: "${MAX_NUM_SEQS:=256}"
: "${MAX_MODEL_LEN:=}"
: "${MAX_NUM_BATCHED_TOKENS:=}"
: "${EXTRA_SERVER_ARGS:=}"

# Benchmark Configuration
Expand Down Expand Up @@ -100,34 +106,91 @@ for i in $(seq 0 $((yD - 1))); do
DECODE_ARGS="$DECODE_ARGS --decode http://${IP_ARRAY[$idx]}:${DECODE_PORT}"
done

echo "Prefill IPs : ${PREFILL_IPS[*]}"
echo "Decode IPs : ${DECODE_IPS[*]}"

PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP}"
PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP}"
DECODE_ENABLE_EP="${DECODE_ENABLE_EP}"
DECODE_ENABLE_DP="${DECODE_ENABLE_DP}"

# Parallel args
PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE") #TP
if [ "$PREFILL_ENABLE_DP" = "true" ]; then
if [ "$PREFILL_ENABLE_EP" -gt 1 ]; then #DPA+EP
PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-expert-parallel --enable-dp-attention )
else #DPA+TP
PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-dp-attention )
else #TP+DPA+TBO
if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then
PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-dp-attention --enable-tbo )
export GPU_MAX_HW_QUEUES=5
export ATOM_CPU_AFFINITY=1
else #TP+DPA
PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-dp-attention )
fi
fi
fi

# (srok), split DPA & TBO cases
# Decode baseline is plain TP. It must be sized by DECODE_TP_SIZE (not the
# prefill TP degree): this value is used as-is whenever DECODE_ENABLE_DP is
# not "true", and every override branch below also uses DECODE_TP_SIZE.
DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE") #TP
if [ "$DECODE_ENABLE_DP" = "true" ]; then
if [ "$DECODE_ENABLE_EP" -gt 1 ]; then #DPA+EP
DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-expert-parallel --enable-dp-attention )
else #DPA+TP
DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-dp-attention )
else #TP+DPA+TBO
if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then
DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-dp-attention --enable-tbo )
export GPU_MAX_HW_QUEUES=5
export ATOM_CPU_AFFINITY=1
else #TP+DPA
DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-dp-attention )
fi
fi
fi

echo "Prefill Parallel args : ${PREFILL_PARALLEL_ARGS[*]}"
echo "Decode Parallel args : ${DECODE_PARALLEL_ARGS[*]}"
# Speculative-decoding flags: stays empty unless SPEC_DECODING is "mtp", in
# which case DECODE_MTP_SIZE is passed as the number of speculative tokens.
SPEC_ARGS=()
case "$SPEC_DECODING" in
  mtp)
    SPEC_ARGS+=(--method mtp --num-speculative-tokens "$DECODE_MTP_SIZE")
    ;;
esac

# --hf-overrides is emitted for DeepSeek-V4-Pro only. The JSON is wrapped in
# literal single quotes so it is preserved through eval; all other models get
# an empty string.
case "$MODEL_NAME" in
  DeepSeek-V4-Pro)
    HF_OVERRIDES_ARG="--hf-overrides '{\"use_index_cache\":true,\"index_topk_freq\":4}'"
    ;;
  *)
    HF_OVERRIDES_ARG=""
    ;;
esac

# KV cache dtype flag: omitted when KV_CACHE_DTYPE is empty or "auto",
# otherwise passed through as --kv_cache_dtype <value>.
case "$KV_CACHE_DTYPE" in
  "" | auto)
    KV_CACHE_ARG=""
    ;;
  *)
    KV_CACHE_ARG="--kv_cache_dtype ${KV_CACHE_DTYPE}"
    ;;
esac

# Optional length caps: each flag is appended (with a leading space) only when
# its variable is non-empty, so the result is empty when neither is set.
MODEL_LEN_ARGS=""
[[ -z "$MAX_MODEL_LEN" ]] ||
  MODEL_LEN_ARGS+=" --max-model-len ${MAX_MODEL_LEN}"
[[ -z "$MAX_NUM_BATCHED_TOKENS" ]] ||
  MODEL_LEN_ARGS+=" --max-num-batched-tokens ${MAX_NUM_BATCHED_TOKENS}"

# Every model except DeepSeek-V4-Pro exports INT4 quick-reduce quantization.
[[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]] ||
  export AITER_QUICK_REDUCE_QUANTIZATION=INT4

# Print the resolved launch configuration. The MTP line reports the actual
# SPEC_DECODING value ("none" when it is empty) instead of always claiming
# method=mtp, so non-MTP runs are not mislabelled in the log.
cat <<INFO
=== Configuration ===
PREFILL : ${PREFILL_IPS[*]} (TP=${PREFILL_TP_SIZE}, EP=${PREFILL_ENABLE_EP:-false}, DP=${PREFILL_ENABLE_DP:-false}, port=${PREFILL_PORT})
DECODE : ${DECODE_IPS[*]} (TP=${DECODE_TP_SIZE}, EP=${DECODE_ENABLE_EP:-false}, DP=${DECODE_ENABLE_DP:-false}, port=${DECODE_PORT})
ROUTER : port=${ROUTER_PORT}
MODEL : ${MODEL_NAME}
BACKEND : atom (PD mooncake KV transfer)
MTP : method=${SPEC_DECODING:-none} num_speculative_tokens=${DECODE_MTP_SIZE}
xP/yD : ${xP} / ${yD}
KV cache : dtype=${KV_CACHE_DTYPE:-auto} block_size=${BLOCK_SIZE} mem_frac=${MEM_FRAC_STATIC}
Model len: max_model_len=${MAX_MODEL_LEN:-unset} max_num_batched_tokens=${MAX_NUM_BATCHED_TOKENS:-unset}
Prefill args : ${PREFILL_PARALLEL_ARGS[*]}
Decode args : ${DECODE_PARALLEL_ARGS[*]}
Spec args : ${SPEC_ARGS[*]}
Opt args : ${HF_OVERRIDES_ARG}
=====================
INFO

# =============================================================================
# Node Role Assignment
Expand All @@ -153,12 +216,15 @@ if [ "$NODE_RANK" -eq 0 ]; then
--model ${MODEL_DIR}/${MODEL_NAME} \
--host 0.0.0.0 --server-port ${PREFILL_PORT} \
--trust-remote-code \
"${PREFILL_PARALLEL_ARGS[@]}" \
--kv_cache_dtype ${KV_CACHE_DTYPE} \
${PREFILL_PARALLEL_ARGS[*]} \
${SPEC_ARGS[*]} \
${KV_CACHE_ARG} \
--block-size ${BLOCK_SIZE} \
--gpu-memory-utilization ${MEM_FRACTION} \
--gpu-memory-utilization ${MEM_FRAC_STATIC} \
--max-num-seqs ${MAX_NUM_SEQS} \
${MODEL_LEN_ARGS} \
--no-enable_prefix_caching \
${HF_OVERRIDES_ARG} \
--kv-transfer-config '{\"kv_role\":\"kv_producer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \
${EXTRA_SERVER_ARGS}"

Expand Down Expand Up @@ -248,6 +314,11 @@ if [ "$NODE_RANK" -eq 0 ]; then

cd $ATOM_WS_PATH

# Export IS_MTP for the benchmark script: "true" only when SPEC_DECODING is
# "mtp", "false" otherwise.
if [ "$SPEC_DECODING" = "mtp" ]; then
  IS_MTP="true"
else
  IS_MTP="false"
fi
export IS_MTP

BENCH_CMD="bash $ATOM_WS_PATH/bench.sh ${xP} ${yD} $((PREFILL_TP_SIZE*xP)) $((DECODE_TP_SIZE*yD)) \
$MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \
${BENCH_OUTPUT_LEN} \"${BENCH_MAX_CONCURRENCY}\" ${BENCH_REQUEST_RATE} \
Expand Down Expand Up @@ -367,12 +438,15 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then
--model ${MODEL_DIR}/${MODEL_NAME} \
--host 0.0.0.0 --server-port ${PREFILL_PORT} \
--trust-remote-code \
"${PREFILL_PARALLEL_ARGS[@]}" \
--kv_cache_dtype ${KV_CACHE_DTYPE} \
${PREFILL_PARALLEL_ARGS[*]} \
${SPEC_ARGS[*]} \
${KV_CACHE_ARG} \
--block-size ${BLOCK_SIZE} \
--gpu-memory-utilization ${MEM_FRACTION} \
--gpu-memory-utilization ${MEM_FRAC_STATIC} \
--max-num-seqs ${MAX_NUM_SEQS} \
${MODEL_LEN_ARGS} \
--no-enable_prefix_caching \
${HF_OVERRIDES_ARG} \
--kv-transfer-config '{\"kv_role\":\"kv_producer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \
${EXTRA_SERVER_ARGS}"

Expand Down Expand Up @@ -449,12 +523,15 @@ else
--model ${MODEL_DIR}/${MODEL_NAME} \
--host 0.0.0.0 --server-port ${DECODE_PORT} \
--trust-remote-code \
"${DECODE_PARALLEL_ARGS[@]}" \
--kv_cache_dtype ${KV_CACHE_DTYPE} \
${DECODE_PARALLEL_ARGS[*]} \
Comment thread
cursor[bot] marked this conversation as resolved.
${SPEC_ARGS[*]} \
${KV_CACHE_ARG} \
--block-size ${BLOCK_SIZE} \
--gpu-memory-utilization ${MEM_FRACTION} \
--gpu-memory-utilization ${MEM_FRAC_STATIC} \
--max-num-seqs ${DECODE_MAX_NUM_SEQS} \
${MODEL_LEN_ARGS} \
--no-enable_prefix_caching \
${HF_OVERRIDES_ARG} \
--kv-transfer-config '{\"kv_role\":\"kv_consumer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \
--cudagraph-capture-sizes "${CUDAGRAPH_SIZES}" \
${EXTRA_SERVER_ARGS}"
Expand Down
Loading