Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
226 changes: 226 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12817,6 +12817,232 @@ minimaxm2.5-fp8-b300-dynamo-vllm:
ep: 4
dp-attn: true

minimaxm3-fp8-b300-dynamo-vllm:
image: vllm/vllm-openai:minimax-m3
model: MiniMaxAI/MiniMax-M3-MXFP8
model-prefix: minimaxm3
runner: b300
precision: fp8
framework: dynamo-vllm
multinode: true
disagg: true
scenarios:
fixed-seq-len:
- isl: 1024
osl: 1024
search-space:
- conc-list: [4, 16, 64, 128, 4096]
prefill:
num-worker: 1
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tep8-1k1k.yaml"
decode:
num-worker: 1
tp: 8
ep: 8
dp-attn: false
- conc-list: [1, 4, 8, 16]
prefill:
num-worker: 1
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp8-marlin-1k1k.yaml"
decode:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
- conc-list: [2048]
prefill:
num-worker: 1
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/1p2d-dep2-dep4-1k1k.yaml"
decode:
num-worker: 2
tp: 4
ep: 4
dp-attn: true
- conc-list: [512, 4096]
prefill:
num-worker: 2
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-dep8-1k1k.yaml"
decode:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
- conc-list: [32]
prefill:
num-worker: 2
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-tep8-1k1k.yaml"
decode:
num-worker: 1
tp: 8
ep: 8
dp-attn: false
- conc-list: [16]
prefill:
num-worker: 2
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: false
- conc-list: [4]
prefill:
num-worker: 3
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/3p2d-dep2-tep8-1k1k.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: false
- isl: 8192
osl: 1024
search-space:
- conc-list: [128]
prefill:
num-worker: 1
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: true
- conc-list: [256, 512]
prefill:
num-worker: 2
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: true
- conc-list: [16]
prefill:
num-worker: 2
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: false
- conc-list: [512]
prefill:
num-worker: 3
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: true
- conc-list: [32]
prefill:
num-worker: 3
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: false
- conc-list: [4096]
prefill:
num-worker: 4
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: true
- conc-list: [4096]
prefill:
num-worker: 4
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml"
decode:
num-worker: 3
tp: 4
ep: 4
dp-attn: true
- conc-list: [4, 64]
prefill:
num-worker: 5
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: false
- conc-list: [1, 4, 8, 16]
prefill:
num-worker: 5
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tp8-marlin-8k1k.yaml"
decode:
num-worker: 2
tp: 8
ep: 1
dp-attn: false

minimaxm2.5-fp8-gb300-dynamo-vllm:
image: vllm/vllm-openai:v0.20.1
model: MiniMaxAI/MiniMax-M2.5
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
name: "minimax-m3-vllm-disagg-b300-1p1d-fp8-dep2-tep8-1k1k"

model:
path: "MiniMaxAI/MiniMax-M3-MXFP8"
container: "vllm/vllm-openai:minimax-m3"
precision: "fp8"

resources:
gpu_type: "b300"
gpus_per_node: 8
prefill_nodes: 1
decode_nodes: 1
prefill_workers: 1
decode_workers: 1
gpus_per_prefill: 2
gpus_per_decode: 8

dynamo:
install: true
version: 1.3.0.dev20260614

frontend:
type: dynamo
enable_multiple_frontends: false

backend:
type: vllm
connector: null

prefill_environment:
UCX_TLS: "cuda_copy,rc"
VLLM_FLOAT32_MATMUL_PRECISION: "high"

decode_environment:
UCX_TLS: "cuda_copy,rc"
VLLM_FLOAT32_MATMUL_PRECISION: "high"

vllm_config:
prefill:
tensor-parallel-size: 1
data-parallel-size: 2
data-parallel-rpc-port: 13345
enable-expert-parallel: true
trust-remote-code: true
no-enable-prefix-caching: true
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
block-size: 128
gpu-memory-utilization: 0.90
max-model-len: 2304
language-model-only: true
stream-interval: 32
max-cudagraph-capture-size: 2048
max-num-batched-tokens: 2048

decode:
tensor-parallel-size: 8
enable-expert-parallel: true
trust-remote-code: true
no-enable-prefix-caching: true
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
block-size: 128
gpu-memory-utilization: 0.90
max-model-len: 2304
language-model-only: true
stream-interval: 32
max-num-seqs: 4096
max-num-batched-tokens: 16384
max-cudagraph-capture-size: 8196

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wrong cudagraph capture size

Medium Severity

All six new MiniMax M3 B300 decode blocks set max-cudagraph-capture-size to 8196, while prefill uses 2048 and decode sets max-num-seqs to 4096. That value is not used elsewhere in the repo and sits four above the usual power-of-two 8192 paired with 4096-sequence decode configs, so CUDA graph capture may not align with intended batch sizes.

Additional Locations (2)
Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit 54b829a. Configure here.


health_check:
max_attempts: 360
interval_seconds: 10

benchmark:
type: "sa-bench"
isl: 1024
osl: 1024
concurrencies: "4x16x64x128x4096"
req_rate: "inf"
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
name: "minimax-m3-vllm-disagg-b300-1p1d-fp8-dep2-tp8-marlin-1k1k"

model:
path: "MiniMaxAI/MiniMax-M3-MXFP8"
container: "vllm/vllm-openai:minimax-m3"
precision: "fp8"

resources:
gpu_type: "b300"
gpus_per_node: 8
prefill_nodes: 1
decode_nodes: 1
prefill_workers: 1
decode_workers: 1
gpus_per_prefill: 2
gpus_per_decode: 8

dynamo:
install: true
version: 1.3.0.dev20260614

frontend:
type: dynamo
enable_multiple_frontends: false

backend:
type: vllm
connector: null

prefill_environment:
UCX_TLS: "cuda_copy,rc"
VLLM_FLOAT32_MATMUL_PRECISION: "high"

decode_environment:
UCX_TLS: "cuda_copy,rc"
VLLM_FLOAT32_MATMUL_PRECISION: "high"

vllm_config:
prefill:
tensor-parallel-size: 1
data-parallel-size: 2
data-parallel-rpc-port: 13345
enable-expert-parallel: true
trust-remote-code: true
no-enable-prefix-caching: true
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
block-size: 128
gpu-memory-utilization: 0.90
max-model-len: 2304
language-model-only: true
stream-interval: 32
max-cudagraph-capture-size: 2048
max-num-batched-tokens: 2048

decode:
tensor-parallel-size: 8
enable-expert-parallel: false
moe-backend: marlin
trust-remote-code: true
no-enable-prefix-caching: true
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
block-size: 128
gpu-memory-utilization: 0.90
max-model-len: 2304
language-model-only: true
stream-interval: 32
max-num-seqs: 4096
max-num-batched-tokens: 16384
max-cudagraph-capture-size: 8196

health_check:
max_attempts: 360
interval_seconds: 10

benchmark:
type: "sa-bench"
isl: 1024
osl: 1024
concurrencies: "1x4x8x16"
req_rate: "inf"
Loading
Loading