Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
965b046
feat: MiniMax-M3 MXFP8 full sweep config for GB300
Oseltamivir Jun 13, 2026
e3fa89f
chore: update perf-changelog pr-link to #1735
Oseltamivir Jun 13, 2026
b915c89
Merge branch 'main' into feat/minimax-m3-gb300-sweep
Oseltamivir Jun 13, 2026
afc3f92
Update runner name in nvidia-master.yaml
Oseltamivir Jun 13, 2026
99a075b
fix: add sbatch_directives mem=0 + cpus-per-task=72 to M3 GB300 recipes
Oseltamivir Jun 13, 2026
26e2005
fix: run M3 GB300 workers cache-only (HF_HUB_OFFLINE=1) to avoid fetc…
Oseltamivir Jun 13, 2026
ce76bd7
Merge branch 'main' into feat/minimax-m3-gb300-sweep
Oseltamivir Jun 13, 2026
b660ddd
fix: re-pin utils/aiperf to live cjq/agentx-v0.3 tip (ff2b646c)
Oseltamivir Jun 13, 2026
7ea8b0b
Merge branch 'main' into feat/minimax-m3-gb300-sweep
Oseltamivir Jun 13, 2026
ef7c650
MiniMax-M3 GB300: disagg-only sweep + multi-node-NVLink KV transfer
Oseltamivir Jun 13, 2026
c94bf9f
Merge branch 'main' into feat/minimax-m3-gb300-sweep
Oseltamivir Jun 13, 2026
7fd8904
M3 GB300: add 8k1k disagg sweep; drop unschedulable collocated-1n
Oseltamivir Jun 13, 2026
5df0669
M3 GB300: add rack-saturating balanced-ratio TP-ep1 max-throughput di…
Oseltamivir Jun 14, 2026
88e99ce
Merge remote-tracking branch 'origin/main' into feat/minimax-m3-gb300…
Oseltamivir Jun 14, 2026
62fe18d
M3 GB300: replace dep16dec with 1P4D TP4-ep1; add prefill-heavy 10P7D…
Oseltamivir Jun 14, 2026
f4c6384
Merge branch 'main' into feat/minimax-m3-gb300-sweep
Oseltamivir Jun 14, 2026
1d71f49
[Klaud Cold]minimaxm3-fp8-mi300x-vllm-mtp: day-zero MiniMax-M3 EAGLE3…
functionstackx Jun 14, 2026
2bf5851
[AMD] perf: enable MiniMax M3 CUDA graphs on MI300X (#1750)
cquil11 Jun 14, 2026
fd922a6
[Klaud Cold] minimaxm3-fp8-mi300x-vllm-mtp: run with CUDA graphs (dro…
functionstackx Jun 14, 2026
805dc1c
M3 GB300: drop dominated configs, restore 1P1D full range
Oseltamivir Jun 14, 2026
a8d3eb5
Merge remote-tracking branch 'origin/main' into feat/minimax-m3-gb300…
Oseltamivir Jun 14, 2026
1be9cfe
M3 GB300 disagg: add DSV4-level decode optimizations
Oseltamivir Jun 14, 2026
e24318c
Switch GB300 M3 recipes to nightly-aarch64 + add Marlin MoE for TP-on…
Oseltamivir Jun 19, 2026
2a97ca2
Merge remote-tracking branch 'origin/main' into feat/minimax-m3-gb300…
Oseltamivir Jun 19, 2026
d4deb1e
fix: switch GB300 M3 runner from gb300-cw to gb300-nv
Oseltamivir Jun 19, 2026
0370cf4
fix: add minimaxm3-fp8 to gb300-nv launcher + switch recipes to alias…
Oseltamivir Jun 19, 2026
aa3df42
feat: redesign GB300 M3 recipes — DEP8 prefill, TEP8/TP8/DEP8 decode
Oseltamivir Jun 19, 2026
6bb84b0
feat: TEP4 prefill + B300-optimal decode for GB300 M3 disagg
Oseltamivir Jun 19, 2026
8e49bf3
Merge branch 'main' into feat/minimax-m3-gb300-sweep
Oseltamivir Jun 19, 2026
e5bc74e
feat: adapt NV B300 PR #1863 disagg configs for GB300 M3 sweep
Oseltamivir Jun 20, 2026
9707d9a
Merge branch 'main' into feat/minimax-m3-gb300-sweep
Oseltamivir Jun 20, 2026
cad3e01
fix: reduce GB300 DEP CUDA graph capture sizes
Oseltamivir Jun 20, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
246 changes: 246 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13030,6 +13030,252 @@ minimaxm2.5-fp8-gb300-dynamo-vllm:
ep: 4
dp-attn: true

# MiniMax-M3 GB300 disaggregated sweep — adapted from NV B300 PR #1863.
# Every prefill worker is DEP2 (TP1 DP2 EP, 2 GPUs/worker). Decode workers are
# TP4+Marlin, TEP8, DEP8, or DEP4. GB300 NVL72 has 4 GPUs/node. The 4p3d
# config (3 decode workers) is skipped. kv-cache-dtype=fp8 is added, and the
# recipes must set srun_options mem=0.
minimaxm3-fp8-gb300-dynamo-vllm:
# Reading guide for this entry (derived from the labels and values below):
#   NpMd = N prefill workers and M decode workers (the num-worker fields).
#   Kn   = total node count stated by each entry's label, at 4 GPUs/node.
#   DEPx = tp: x, ep: x, dp-attn: true.
#   TEPx = tp: x, ep: x, dp-attn: false.
#   TPx  = tp: x, ep: 1, dp-attn: false.
# CONFIG_FILE names the per-topology recipe file; presumably it is resolved by
# the multinode benchmark launcher — TODO confirm against the runner script.
image: vllm/vllm-openai:nightly-aarch64
model: MiniMaxAI/MiniMax-M3-MXFP8
model-prefix: minimaxm3
runner: gb300-nv
precision: fp8
framework: dynamo-vllm
multinode: true
disagg: true
scenarios:
fixed-seq-len:
# 1k1k scenario: 1024 input tokens, 1024 output tokens.
- isl: 1024
osl: 1024
search-space:
# 1p1d DEP2+TEP8, 3n: conc 4,16,64,128,4096
# Nodes: prefill 1 x 2 GPUs -> 1 node; decode 1 x 8 GPUs -> 2 nodes.
- conc-list: [4, 16, 64, 128, 4096]
prefill:
num-worker: 1
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-dep2-tep8-3n.yaml"
decode:
num-worker: 1
tp: 8
ep: 8
dp-attn: false

# 1p1d DEP2+TP4 Marlin, 2n: conc 1,4,8,16
# Nodes: prefill 1 x 2 GPUs -> 1 node; decode 1 x 4 GPUs -> 1 node.
- conc-list: [1, 4, 8, 16]
prefill:
num-worker: 1
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-dep2-tp4-marlin-2n.yaml"
decode:
num-worker: 1
tp: 4
ep: 1
dp-attn: false

# 1p2d DEP2+DEP4, 3n: conc 2048
# Nodes: prefill 1 x 2 GPUs -> 1 node; decode 2 x 4 GPUs -> 2 nodes.
- conc-list: [2048]
prefill:
num-worker: 1
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p2d-dep2-dep4-3n.yaml"
decode:
num-worker: 2
tp: 4
ep: 4
dp-attn: true

# 2p1d DEP2+DEP8, 3n: conc 512,4096
# Nodes: prefill 2 x 2 GPUs -> 1 node; decode 1 x 8 GPUs -> 2 nodes.
- conc-list: [512, 4096]
prefill:
num-worker: 2
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-2p1d-dep2-dep8-3n.yaml"
decode:
num-worker: 1
tp: 8
ep: 8
dp-attn: true

# 2p1d DEP2+TEP8, 3n: conc 32
# Nodes: prefill 2 x 2 GPUs -> 1 node; decode 1 x 8 GPUs -> 2 nodes.
- conc-list: [32]
prefill:
num-worker: 2
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-2p1d-dep2-tep8-3n.yaml"
decode:
num-worker: 1
tp: 8
ep: 8
dp-attn: false

# 2p2d DEP2+TEP8, 5n: conc 16
# Nodes: prefill 2 x 2 GPUs -> 1 node; decode 2 x 8 GPUs -> 4 nodes.
- conc-list: [16]
prefill:
num-worker: 2
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-2p2d-dep2-tep8-5n.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: false

# 3p2d DEP2+TEP8, 6n: conc 4
# Nodes: prefill 3 x 2 GPUs -> 2 nodes; decode 2 x 8 GPUs -> 4 nodes.
- conc-list: [4]
prefill:
num-worker: 3
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-3p2d-dep2-tep8-6n.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: false

# 8k1k scenario: 8192 input tokens, 1024 output tokens.
# Labels follow the pattern "NpMd <prefill>+<decode>, Kn: conc ...", where
# N/M are the prefill/decode num-worker values and K is the node count at
# 4 GPUs/node (2 GPUs per DEP2 prefill worker).
- isl: 8192
osl: 1024
search-space:
# 1p1d DEP2+TP4 Marlin, 2n: conc 1,4,8,16
# Nodes: prefill 1 x 2 GPUs -> 1 node; decode 1 x 4 GPUs -> 1 node.
- conc-list: [1, 4, 8, 16]
prefill:
num-worker: 1
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-dep2-tp4-marlin-2n.yaml"
decode:
num-worker: 1
tp: 4
ep: 1
dp-attn: false

# 1p2d DEP2+DEP8, 5n: conc 128
# Nodes: prefill 1 x 2 GPUs -> 1 node; decode 2 x 8 GPUs -> 4 nodes.
- conc-list: [128]
prefill:
num-worker: 1
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p2d-dep2-dep8-5n.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: true

# 2p2d DEP2+DEP8, 5n: conc 256,512
# Nodes: prefill 2 x 2 GPUs -> 1 node; decode 2 x 8 GPUs -> 4 nodes.
- conc-list: [256, 512]
prefill:
num-worker: 2
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-2p2d-dep2-dep8-5n.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: true

# 2p2d DEP2+TEP8, 5n: conc 16
# Nodes: prefill 2 x 2 GPUs -> 1 node; decode 2 x 8 GPUs -> 4 nodes.
- conc-list: [16]
prefill:
num-worker: 2
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-2p2d-dep2-tep8-5n.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: false

# 3p2d DEP2+DEP8, 6n: conc 512
# Nodes: prefill 3 x 2 GPUs -> 2 nodes; decode 2 x 8 GPUs -> 4 nodes.
- conc-list: [512]
prefill:
num-worker: 3
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-3p2d-dep2-dep8-6n.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: true

# 3p2d DEP2+TEP8, 6n: conc 32
# Nodes: prefill 3 x 2 GPUs -> 2 nodes; decode 2 x 8 GPUs -> 4 nodes.
- conc-list: [32]
prefill:
num-worker: 3
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-3p2d-dep2-tep8-6n.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: false

# 4p2d DEP2+DEP8, 6n: conc 4096
# Nodes: prefill 4 x 2 GPUs -> 2 nodes; decode 2 x 8 GPUs -> 4 nodes.
- conc-list: [4096]
prefill:
num-worker: 4
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-4p2d-dep2-dep8-6n.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: true

# 5p2d DEP2+TEP8, 7n: conc 4,64
# Nodes: prefill 5 x 2 GPUs -> 3 nodes; decode 2 x 8 GPUs -> 4 nodes.
- conc-list: [4, 64]
prefill:
num-worker: 5
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-5p2d-dep2-tep8-7n.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: false

qwen3.5-fp4-b200-trt:
image: nvcr.io#nvidia/tensorrt-llm/release:1.3.0rc18
model: nvidia/Qwen3.5-397B-A17B-NVFP4
Expand Down
5 changes: 5 additions & 0 deletions .github/workflows/benchmark-multinode-tmpl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,11 @@ on:

env:
RANDOM_RANGE_RATIO: 0.8
# Day-zero models referenced by `hf:` ids are downloaded from the Hugging Face
# Hub inside the Slurm job (srtctl pre-download + dynamo hub fetch). Anonymous
# requests get rate-limited (HTTP 429) when several workers pull a 444 GB
# snapshot at once; sbatch/srun inherit this env, so the token reaches the workers.
HF_TOKEN: ${{ secrets.INFERENCEX_OFFICIAL_RO_HF_TOKEN }}
EXP_NAME: ${{ inputs.exp-name }}
IMAGE: ${{ inputs.image }}
MODEL_PREFIX: ${{ inputs.model-prefix }}
Expand Down
22 changes: 22 additions & 0 deletions KLAUD_DEBUG.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,28 @@ and waits for the PR checks automatically.

Seen on: #1395 (kimik2.5-fp4-b200-vllm — needed env var), #1403 (gptoss-fp4-mi300x-vllm — needed 0.90), #1461 (dsv4-fp8-h200-vllm — needed 0.90).

### 2.1 DEP CUDA-graph capture OOM on GB300

**Symptom:** TP1 + data/expert-parallel decode workers load successfully and
allocate the KV cache, then fail in `breakable_cudagraph.py` at
`torch.cuda.graph.capture_end()` with `CUDA error: out of memory`. Large GB300
VRAM does not prevent this because vLLM fills the configured memory budget with
KV cache before capturing hundreds of persistent graphs.

**Root cause:** `max-num-seqs` and `max-cudagraph-capture-size` were sized from
global benchmark concurrency instead of per-DP-rank concurrency. MiniMax-M3
DEP4/DEP8 recipes requested capture sizes of 4096-8192 and up to 4096 sequences,
creating 358-806 graphs per GPU.

**First-line tuning:** keep `gpu-memory-utilization: 0.90`, but size graph limits
to the per-rank load. For the GB300 MiniMax-M3 sweep, use
`max-num-seqs: 512` and `max-cudagraph-capture-size: 2048` on DEP decoders.
This matches the single-node GB300 recipe and still covers the largest
per-rank load in the sweep: 512 requests per DP rank (for example, concurrency
4096 on a single DEP8 decode worker is 4096 / 8 = 512). If capture still OOMs,
lower decode `gpu-memory-utilization` to `0.85`.

Seen on: #1735 (MiniMax-M3 MXFP8 GB300 dynamo-vLLM).

---

## 3. Custom DSV4 image → generic v0.5.12 OOMs
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
name: "minimax-m3-vllm-disagg-gb300-1p1d-dep2-tep8-fp8-1k1k"

# Topology: 1 prefill worker running DEP2 (TP1 DP2 EP, 2 GPUs/worker) plus
# 1 decode worker running TEP8 (TP8+EP8). Total = 3 nodes: 1 prefill node +
# 2 decode nodes at 4 GPUs/node. Adapted from NV B300 PR #1863.

model:
path: "minimax-m3-mxfp8"
container: "vllm/vllm-openai:nightly-aarch64"
precision: "fp8"

dynamo:
install: true
wheel: "1.2.0.dev20260526"

# Slurm treats a memory request of 0 as "all memory on the node"; it is set
# for both the batch allocation and the srun steps.
sbatch_directives:
mem: "0"
cpus-per-task: "72"
srun_options:
mem: "0"

slurm:
time_limit: "8:00:00"

# 720 attempts x 10 s interval = 7200 s (2 h); presumably the maximum time to
# wait for the workers to become healthy — TODO confirm how the launcher uses it.
health_check:
max_attempts: 720
interval_seconds: 10

# 1 prefill worker x 2 GPUs fits in 1 node of 4 GPUs; 1 decode worker x 8 GPUs
# spans 2 nodes of 4 GPUs.
resources:
gpu_type: "gb300"
gpus_per_node: 4
prefill_nodes: 1
decode_nodes: 2
prefill_workers: 1
decode_workers: 1
gpus_per_prefill: 2
gpus_per_decode: 8

frontend:
type: dynamo
enable_multiple_frontends: false

backend:
type: vllm
connector: null

# Prefill and decode workers get the same environment. The MNNVL/NVLS settings
# presumably enable multi-node NVLink transports — verify against the NCCL and
# UCX documentation for the versions in the container.
prefill_environment:
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
VLLM_FLOAT32_MATMUL_PRECISION: "high"
VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
UCX_CUDA_IPC_ENABLE_MNNVL: "y"
NCCL_CUMEM_ENABLE: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_NVLS_ENABLE: "1"

decode_environment:
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
VLLM_FLOAT32_MATMUL_PRECISION: "high"
VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
UCX_CUDA_IPC_ENABLE_MNNVL: "y"
NCCL_CUMEM_ENABLE: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_NVLS_ENABLE: "1"

vllm_config:
# Prefill worker: tensor-parallel-size 1 x data-parallel-size 2 = 2 GPUs,
# matching gpus_per_prefill, with expert parallelism enabled (DEP2).
prefill:
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
tensor-parallel-size: 1
data-parallel-size: 2
data-parallel-rpc-port: 13345
enable-expert-parallel: true
trust-remote-code: true
no-enable-prefix-caching: true
block-size: 128
gpu-memory-utilization: 0.90
# 2304 leaves headroom over the benchmark's isl 1024 + osl 1024 = 2048 tokens.
max-model-len: 2304
language-model-only: true
kv-cache-dtype: fp8
stream-interval: 32
max-cudagraph-capture-size: 2048
max-num-batched-tokens: 2048

# Decode worker: tensor-parallel-size 8 with expert parallelism (TEP8), no
# data parallelism, so this single engine serves the full benchmark concurrency.
decode:
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
tensor-parallel-size: 8
enable-expert-parallel: true
trust-remote-code: true
no-enable-prefix-caching: true
block-size: 128
gpu-memory-utilization: 0.90
max-model-len: 2304
language-model-only: true
kv-cache-dtype: fp8
stream-interval: 32
# Equals the largest benchmark concurrency for this recipe (4096).
max-num-seqs: 4096
max-num-batched-tokens: 16384
# NOTE(review): capture size is 2x max-num-seqs. Confirm that CUDA-graph
# capture fits in memory at gpu-memory-utilization 0.90 on GB300; if capture
# OOMs, lower this value or the memory utilization.
max-cudagraph-capture-size: 8192

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TEP8 cudagraph limits too high

High Severity

Four 1k1k TEP8 decode recipes still set max-num-seqs: 4096 and max-cudagraph-capture-size: 8192, while the same change documents GB300 MiniMax-M3 graph capture OOM from those magnitudes and caps DEP decoders at 512/2048. Decode startup can hit CUDA OOM during graph capture before benchmarks run.

Additional Locations (2)
Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit cad3e01. Configure here.


# Benchmark sweep for this recipe: 1024 input tokens / 1024 output tokens.
benchmark:
type: "sa-bench"
isl: 1024
osl: 1024
# 'x'-separated concurrency list; the values match the conc-list
# [4, 16, 64, 128, 4096] of the 1k1k 1p1d DEP2+TEP8 entry in nvidia-master.yaml.
concurrencies: "4x16x64x128x4096"
# NOTE(review): "inf" presumably means no request-rate cap — confirm with sa-bench.
req_rate: "inf"
Loading