SemiAnalysisAI · Oseltamivir · Jun 20, 2026 · Jun 13, 2026 · Jun 13, 2026 · Jun 13, 2026
@@ -13030,6 +13030,252 @@ minimaxm2.5-fp8-gb300-dynamo-vllm:
           ep: 4
           dp-attn: true
 
+# MiniMax-M3 GB300 disagg sweep — adapted from NV B300 PR #1863.
+# All prefill DEP2 (TP1 DP2 EP, 2 GPU/worker). Decode: TP4+Marlin, TEP8,
+# DEP8, DEP4. 4 GPU/node (GB300 NVL72). 4p3d (3 decode workers) skipped.
+# kv-cache-dtype=fp8 added. srun_options mem=0 required.
+minimaxm3-fp8-gb300-dynamo-vllm:
+  image: vllm/vllm-openai:nightly-aarch64
+  model: MiniMaxAI/MiniMax-M3-MXFP8
+  model-prefix: minimaxm3
+  runner: gb300-nv
+  precision: fp8
+  framework: dynamo-vllm
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      # 1p1d DEP2+TEP8, 3n: conc 4,16,64,128,4096
+      - conc-list: [4, 16, 64, 128, 4096]
+        prefill:
+          num-worker: 1
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-dep2-tep8-3n.yaml"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: false
+
+      # 1p1d DEP2+TP4 Marlin, 2n: conc 1,4,8,16
+      - conc-list: [1, 4, 8, 16]
+        prefill:
+          num-worker: 1
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-dep2-tp4-marlin-2n.yaml"
+        decode:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+
+      # 1p2d DEP2+DEP4, 3n: conc 2048
+      - conc-list: [2048]
+        prefill:
+          num-worker: 1
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p2d-dep2-dep4-3n.yaml"
+        decode:
+          num-worker: 2
+          tp: 4
+          ep: 4
+          dp-attn: true
+
+      # 2p1d DEP2+DEP8, 3n: conc 512,4096
+      - conc-list: [512, 4096]
+        prefill:
+          num-worker: 2
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-2p1d-dep2-dep8-3n.yaml"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+
+      # 2p1d DEP2+TEP8, 3n: conc 32
+      - conc-list: [32]
+        prefill:
+          num-worker: 2
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-2p1d-dep2-tep8-3n.yaml"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: false
+
+      # 2p2d DEP2+TEP8, 5n: conc 16
+      - conc-list: [16]
+        prefill:
+          num-worker: 2
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-2p2d-dep2-tep8-5n.yaml"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+
+      # 3p2d DEP2+TEP8, 6n: conc 4
+      - conc-list: [4]
+        prefill:
+          num-worker: 3
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-3p2d-dep2-tep8-6n.yaml"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+
+    - isl: 8192
+      osl: 1024
+      search-space:
+      # 1p1d DEP2+TP4 Marlin, 2n: conc 1,4,8,16
+      - conc-list: [1, 4, 8, 16]
+        prefill:
+          num-worker: 1
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-dep2-tp4-marlin-2n.yaml"
+        decode:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+
+      # 1p2d DEP2+DEP8, 5n: conc 128
+      - conc-list: [128]
+        prefill:
+          num-worker: 1
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p2d-dep2-dep8-5n.yaml"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: true
+
+      # 2p2d DEP2+DEP8, 5n: conc 256,512
+      - conc-list: [256, 512]
+        prefill:
+          num-worker: 2
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-2p2d-dep2-dep8-5n.yaml"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: true
+
+      # 2p2d DEP2+TEP8, 5n: conc 16
+      - conc-list: [16]
+        prefill:
+          num-worker: 2
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-2p2d-dep2-tep8-5n.yaml"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+
+      # 3p2d DEP2+DEP8, 6n: conc 512
+      - conc-list: [512]
+        prefill:
+          num-worker: 3
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-3p2d-dep2-dep8-6n.yaml"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: true
+
+      # 3p2d DEP2+TEP8, 6n: conc 32
+      - conc-list: [32]
+        prefill:
+          num-worker: 3
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-3p2d-dep2-tep8-6n.yaml"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+
+      # 4p2d DEP2+DEP8, 6n: conc 4096
+      - conc-list: [4096]
+        prefill:
+          num-worker: 4
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-4p2d-dep2-dep8-6n.yaml"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: true
+
+      # 5p2d DEP2+TEP8, 7n: conc 4,64
+      - conc-list: [4, 64]
+        prefill:
+          num-worker: 5
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-5p2d-dep2-tep8-7n.yaml"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+
 qwen3.5-fp4-b200-trt:
   image: nvcr.io#nvidia/tensorrt-llm/release:1.3.0rc18
   model: nvidia/Qwen3.5-397B-A17B-NVFP4

diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml
@@ -123,6 +123,11 @@ on:
 
 env:
   RANDOM_RANGE_RATIO: 0.8
+  # Day-zero models resolved via hf: ids download from the Hub inside the
+  # slurm job (srtctl pre-download + dynamo hub fetch). Anonymous requests
+  # get 429-rate-limited when several workers pull a 444 GB snapshot at
+  # once; sbatch/srun inherit this env so the token reaches the workers.
+  HF_TOKEN: ${{ secrets.INFERENCEX_OFFICIAL_RO_HF_TOKEN }}
   EXP_NAME: ${{ inputs.exp-name }}
   IMAGE: ${{ inputs.image }}
   MODEL_PREFIX: ${{ inputs.model-prefix }}

diff --git a/KLAUD_DEBUG.md b/KLAUD_DEBUG.md
@@ -56,6 +56,28 @@ and waits for the PR checks automatically.
 
 Seen on: #1395 (kimik2.5-fp4-b200-vllm — needed env var), #1403 (gptoss-fp4-mi300x-vllm — needed 0.90), #1461 (dsv4-fp8-h200-vllm — needed 0.90).
 
+### 2.1 DEP CUDA-graph capture OOM on GB300
+
+**Symptom:** TP1 + data/expert-parallel decode workers load successfully and
+allocate the KV cache, then fail in `breakable_cudagraph.py` at
+`torch.cuda.graph.capture_end()` with `CUDA error: out of memory`. Large GB300
+VRAM does not prevent this because vLLM fills the configured memory budget with
+KV cache before capturing hundreds of persistent graphs.
+
+**Root cause:** `max-num-seqs` and `max-cudagraph-capture-size` were sized from
+global benchmark concurrency instead of per-DP-rank concurrency. MiniMax-M3
+DEP4/DEP8 recipes requested capture sizes of 4096-8192 and up to 4096 sequences,
+creating 358-806 graphs per GPU.
+
+**First-line tuning:** keep `gpu-memory-utilization: 0.90`, but size graph limits
+to the per-rank load. For the GB300 MiniMax-M3 sweep, use
+`max-num-seqs: 512` and `max-cudagraph-capture-size: 2048` on DEP decoders.
+This matches the single-node GB300 recipe and still covers the largest 512
+requests per DP rank. If capture still OOMs, lower decode
+`gpu-memory-utilization` to `0.85`.
+
+Seen on: #1735 (MiniMax-M3 MXFP8 GB300 dynamo-vLLM).
+
 ---
 
 ## 3. Custom DSV4 image → generic v0.5.12 OOMs

diff --git a/...node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-dep2-tep8-3n.yaml b/...node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-dep2-tep8-3n.yaml
@@ -0,0 +1,103 @@
+name: "minimax-m3-vllm-disagg-gb300-1p1d-dep2-tep8-fp8-1k1k"
+
+# 1P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 1D TEP8 (TP8+EP8)
+# decode = 3 nodes (1P + 2D). Adapted from NV B300 PR #1863.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:nightly-aarch64"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+sbatch_directives:
+  mem: "0"
+  cpus-per-task: "72"
+srun_options:
+  mem: "0"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  decode_nodes: 2
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 2
+  gpus_per_decode: 8
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1
+      data-parallel-size: 2
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 2304
+      language-model-only: true
+      kv-cache-dtype: fp8
+      stream-interval: 32
+      max-cudagraph-capture-size: 2048
+      max-num-batched-tokens: 2048
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 8
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 2304
+      language-model-only: true
+      kv-cache-dtype: fp8
+      stream-interval: 32
+      max-num-seqs: 4096
+      max-num-batched-tokens: 16384
+      max-cudagraph-capture-size: 8192
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "4x16x64x128x4096"
+  req_rate: "inf"