Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
204 changes: 204 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12363,6 +12363,210 @@ minimaxm2.5-fp8-b300-dynamo-vllm:
ep: 4
dp-attn: true

# MiniMax-M3 B300 disaggregated vLLM recipes sourced from
# NVIDIA/srt-slurm#223 at 5caabe364e1ef531fab9926c75e32ae8927b1553.
# The upstream recipes use DEP2 prefill workers and a mix of TEP8, DEP8,
# and DEP4 decode workers across the submitted 1k1k/8k1k frontier.
# Benchmark matrix entry: MiniMax-M3 (MXFP8 checkpoint) on B300, served
# disaggregated (separate prefill/decode workers) via the dynamo-vllm framework.
# Each search-space item pairs a concurrency list with one prefill/decode
# topology and the recipe file handed over through CONFIG_FILE.
# NOTE(review): nesting was reconstructed from flattened text -- verify the
# indentation depth against the sibling entries in this file.
minimaxm3-fp8-b300-dynamo-vllm:
  image: vllm/vllm-openai:minimax-m3
  model: MiniMaxAI/MiniMax-M3-MXFP8
  model-prefix: minimaxm3
  runner: b300
  precision: fp8
  framework: dynamo-vllm
  multinode: true
  disagg: true
  scenarios:
    fixed-seq-len:
      # ---- 1k input / 1k output ----
      - isl: 1024
        osl: 1024
        search-space:
          # 1 prefill worker, 1 decode worker (recipe: dep2-tep8)
          - conc-list: [4, 16, 64, 128, 4096]
            prefill:
              num-worker: 1
              tp: 2
              ep: 1
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tep8-1k1k.yaml"
            decode:
              num-worker: 1
              tp: 8
              ep: 8
              dp-attn: false
          # 1 prefill worker, 2 decode workers (recipe: dep2-dep4)
          - conc-list: [2048]
            prefill:
              num-worker: 1
              tp: 2
              ep: 1
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/1p2d-dep2-dep4-1k1k.yaml"
            decode:
              num-worker: 2
              tp: 4
              ep: 1
              dp-attn: true
          # 2 prefill workers, 1 decode worker (recipe: dep2-dep8)
          - conc-list: [512, 4096]
            prefill:
              num-worker: 2
              tp: 2
              ep: 1
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-dep8-1k1k.yaml"
            decode:
              num-worker: 1
              tp: 8
              ep: 1
              dp-attn: true
          # 2 prefill workers, 1 decode worker (recipe: dep2-tep8)
          - conc-list: [32]
            prefill:
              num-worker: 2
              tp: 2
              ep: 1
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-tep8-1k1k.yaml"
            decode:
              num-worker: 1
              tp: 8
              ep: 8
              dp-attn: false
          # 2 prefill workers, 2 decode workers (recipe: dep2-tep8)
          - conc-list: [16]
            prefill:
              num-worker: 2
              tp: 2
              ep: 1
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml"
            decode:
              num-worker: 2
              tp: 8
              ep: 8
              dp-attn: false
          # 3 prefill workers, 2 decode workers (recipe: dep2-tep8)
          - conc-list: [4]
            prefill:
              num-worker: 3
              tp: 2
              ep: 1
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/3p2d-dep2-tep8-1k1k.yaml"
            decode:
              num-worker: 2
              tp: 8
              ep: 8
              dp-attn: false
      # ---- 8k input / 1k output ----
      - isl: 8192
        osl: 1024
        search-space:
          # 1 prefill worker, 2 decode workers (recipe: dep2-dep8)
          - conc-list: [128]
            prefill:
              num-worker: 1
              tp: 2
              ep: 1
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml"
            decode:
              num-worker: 2
              tp: 8
              ep: 1
              dp-attn: true
          # 2 prefill workers, 2 decode workers (recipe: dep2-dep8)
          - conc-list: [256, 512]
            prefill:
              num-worker: 2
              tp: 2
              ep: 1
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml"
            decode:
              num-worker: 2
              tp: 8
              ep: 1
              dp-attn: true
          # 2 prefill workers, 2 decode workers (recipe: dep2-tep8)
          - conc-list: [16]
            prefill:
              num-worker: 2
              tp: 2
              ep: 1
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml"
            decode:
              num-worker: 2
              tp: 8
              ep: 8
              dp-attn: false
          # 3 prefill workers, 2 decode workers (recipe: dep2-dep8)
          - conc-list: [512]
            prefill:
              num-worker: 3
              tp: 2
              ep: 1
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml"
            decode:
              num-worker: 2
              tp: 8
              ep: 1
              dp-attn: true
          # 3 prefill workers, 2 decode workers (recipe: dep2-tep8)
          - conc-list: [32]
            prefill:
              num-worker: 3
              tp: 2
              ep: 1
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml"
            decode:
              num-worker: 2
              tp: 8
              ep: 8
              dp-attn: false
          # 4 prefill workers, 2 decode workers (recipe: dep2-dep8)
          - conc-list: [4096]
            prefill:
              num-worker: 4
              tp: 2
              ep: 1
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml"
            decode:
              num-worker: 2
              tp: 8
              ep: 1
              dp-attn: true
          # 4 prefill workers, 3 decode workers (recipe: dep2-dep4)
          - conc-list: [4096]
            prefill:
              num-worker: 4
              tp: 2
              ep: 1
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml"
            decode:
              num-worker: 3
              tp: 4
              ep: 1
              dp-attn: true
          # 5 prefill workers, 2 decode workers (recipe: dep2-tep8)
          - conc-list: [4, 64]
            prefill:
              num-worker: 5
              tp: 2
              ep: 1
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml"
            decode:
              num-worker: 2
              tp: 8
              ep: 8
              dp-attn: false

minimaxm2.5-fp8-gb300-dynamo-vllm:
image: vllm/vllm-openai:v0.20.1
model: MiniMaxAI/MiniMax-M2.5
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# Disaggregated vLLM recipe for MiniMax-M3 MXFP8 on B300:
# 1 prefill worker (2 GPUs) + 1 decode worker (8 GPUs), 1k-in / 1k-out.
# NOTE(review): nesting was reconstructed from flattened text; the placement of
# the *_environment and vllm_config sections under `backend` should be
# confirmed against the recipe schema.
name: "minimax-m3-vllm-disagg-b300-1p1d-fp8-dep2-tep8-1k1k"

model:
  path: "MiniMaxAI/MiniMax-M3-MXFP8"
  container: "vllm/vllm-openai:minimax-m3"
  precision: "fp8"

resources:
  gpu_type: "b300"
  gpus_per_node: 8
  prefill_nodes: 1
  decode_nodes: 1
  prefill_workers: 1
  decode_workers: 1
  gpus_per_prefill: 2
  gpus_per_decode: 8

dynamo:
  install: true
  # Quoted so the version is always read as a string, never as a number.
  version: "1.3.0.dev20260614"

frontend:
  type: dynamo
  enable_multiple_frontends: false

backend:
  type: vllm
  connector: null

  prefill_environment:
    UCX_NET_DEVICES: "all"
    UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp"
    VLLM_FLOAT32_MATMUL_PRECISION: "high"

  decode_environment:
    UCX_NET_DEVICES: "all"
    UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp"
    VLLM_FLOAT32_MATMUL_PRECISION: "high"

  vllm_config:
    # Prefill: TP1 x DP2 = 2 GPUs, matching gpus_per_prefill.
    prefill:
      tensor-parallel-size: 1
      data-parallel-size: 2
      data-parallel-rpc-port: 13345
      trust-remote-code: true
      no-enable-prefix-caching: true
      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
      block-size: 128
      gpu-memory-utilization: 0.90
      max-model-len: 2304
      language-model-only: true
      stream-interval: 32
      max-cudagraph-capture-size: 2048
      max-num-batched-tokens: 2048

    # Decode: TP8 with expert parallelism enabled = 8 GPUs, matching
    # gpus_per_decode.
    decode:
      tensor-parallel-size: 8
      enable-expert-parallel: true
      trust-remote-code: true
      no-enable-prefix-caching: true
      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
      block-size: 128
      gpu-memory-utilization: 0.90
      max-model-len: 2304
      language-model-only: true
      stream-interval: 32
      max-num-seqs: 4096
      max-num-batched-tokens: 16384
      # Was 8196 in the imported recipe; flagged in review as a likely typo
      # for 8192 (2^13). NOTE(review): confirm with upstream before merging.
      max-cudagraph-capture-size: 8192

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 WARNING: max-cudagraph-capture-size: 8196 is likely a typo for 8192 (2^13)
Why it matters: 8196 is not a power of 2. Every other recipe in this repo uses power-of-2 values (e.g. 2048). vLLM uses this as a batch-size threshold for CUDA graph capture — a non-power-of-2 value is unusual and likely unintentional. This same value appears across all 6 decode configs in this PR.
Fix: Since these are imported verbatim from NVIDIA/srt-slurm#223, you may want to confirm with upstream whether 8196 is intentional or should be 8192. If it's an upstream typo, it's worth fixing here too.

Suggested change
max-cudagraph-capture-size: 8196
max-cudagraph-capture-size: 8192


# Worker readiness polling limits (attempt count and spacing in seconds).
health_check:
  max_attempts: 360
  interval_seconds: 10

# sa-bench load settings for the 1k-in / 1k-out run.
# NOTE(review): "concurrencies" looks like an x-separated list of concurrency
# levels -- confirm the separator against the benchmark runner.
benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  concurrencies: "4x16x64x128x4096"
  req_rate: "inf"
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# Disaggregated vLLM recipe for MiniMax-M3 MXFP8 on B300:
# 1 prefill worker (2 GPUs) + 2 decode workers (4 GPUs each), 1k-in / 1k-out.
# NOTE(review): nesting was reconstructed from flattened text; the placement of
# the *_environment and vllm_config sections under `backend` should be
# confirmed against the recipe schema.
name: "minimax-m3-vllm-disagg-b300-1p2d-fp8-dep2-dep4-1k1k"

model:
  path: "MiniMaxAI/MiniMax-M3-MXFP8"
  container: "vllm/vllm-openai:minimax-m3"
  precision: "fp8"

resources:
  gpu_type: "b300"
  gpus_per_node: 8
  prefill_nodes: 1
  decode_nodes: 1
  prefill_workers: 1
  decode_workers: 2
  gpus_per_prefill: 2
  gpus_per_decode: 4

dynamo:
  install: true
  # Quoted so the version is always read as a string, never as a number.
  version: "1.3.0.dev20260614"

frontend:
  type: dynamo
  enable_multiple_frontends: false

backend:
  type: vllm
  connector: null

  prefill_environment:
    UCX_NET_DEVICES: "all"
    UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp"
    VLLM_FLOAT32_MATMUL_PRECISION: "high"

  decode_environment:
    UCX_NET_DEVICES: "all"
    UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp"
    VLLM_FLOAT32_MATMUL_PRECISION: "high"

  vllm_config:
    # Prefill: TP1 x DP2 = 2 GPUs, matching gpus_per_prefill.
    prefill:
      tensor-parallel-size: 1
      data-parallel-size: 2
      data-parallel-rpc-port: 13345
      trust-remote-code: true
      no-enable-prefix-caching: true
      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
      block-size: 128
      gpu-memory-utilization: 0.90
      max-model-len: 2304
      language-model-only: true
      stream-interval: 32
      max-cudagraph-capture-size: 2048
      max-num-batched-tokens: 2048

    # Decode: TP1 x DP4 = 4 GPUs per worker, matching gpus_per_decode.
    decode:
      tensor-parallel-size: 1
      data-parallel-size: 4
      data-parallel-rpc-port: 13345
      trust-remote-code: true
      no-enable-prefix-caching: true
      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
      block-size: 128
      gpu-memory-utilization: 0.90
      max-model-len: 2304
      language-model-only: true
      stream-interval: 32
      max-num-seqs: 1024
      max-num-batched-tokens: 16384
      # Was 8196 in the imported recipe; flagged in review as a likely typo
      # for 8192 (2^13). NOTE(review): confirm with upstream before merging.
      max-cudagraph-capture-size: 8192

# Worker readiness polling limits (attempt count and spacing in seconds).
health_check:
  max_attempts: 360
  interval_seconds: 10

# sa-bench load settings for the 1k-in / 1k-out run at a single concurrency.
benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  concurrencies: "2048"
  req_rate: "inf"
Loading
Loading