diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 96a68b589..41e694489 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -12035,10 +12035,9 @@ minimaxm3-fp8-b300-dynamo-vllm: ep: 8 dp-attn: false -# MiniMax-M3 GB300 disagg sweep — adapted from NV B300 PR #1863. -# All prefill DEP2 (TP1 DP2 EP, 2 GPU/worker). Decode: TP4+Marlin, TEP8, -# DEP8, DEP4. 4 GPU/node (GB300 NVL72). 4p3d (3 decode workers) skipped. -# kv-cache-dtype=fp8 added. srun_options mem=0 required. +# MiniMax-M3 GB300 disagg sweep — refreshed recipe set (no Marlin variants). +# All prefill DEP2 (TP1 DP2 EP, 2 GPU/worker). Decode: DEP4, TEP8, DEP8, TEP4. +# 4 GPU/node (GB300 NVL72). kv-cache-dtype=fp8. srun_options mem=0 required. minimaxm3-fp8-gb300-dynamo-vllm: image: vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223 model: MiniMaxAI/MiniMax-M3-MXFP8 @@ -12053,155 +12052,155 @@ minimaxm3-fp8-gb300-dynamo-vllm: - isl: 1024 osl: 1024 search-space: - # 1p1d DEP2+TEP8, 3n: conc 4,16,64,128,4096 - - conc-list: [4, 16, 64, 128, 4096] + # 1p1d DEP2+DEP4, 2n: conc 8192 + - conc-list: [8192] prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-dep2-tep8-3n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/1p1d-dep2-dep4-1k1k.yaml" decode: num-worker: 1 - tp: 8 - ep: 8 - dp-attn: false + tp: 4 + ep: 4 + dp-attn: true - # 1p1d DEP2+TP4 Marlin, 2n: conc 1,4,8,16 - - conc-list: [1, 4, 8, 16] + # 1p2d DEP2+TEP8, 5n: conc 4,16,64,128,256 + - conc-list: [4, 16, 64, 128, 256] prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-dep2-tp4-marlin-2n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/1p2d-dep2-tep8-1k1k.yaml" decode: - num-worker: 1 - tp: 4 - ep: 1 + num-worker: 2 + tp: 8 + ep: 8 dp-attn: false - # 1p2d DEP2+DEP4, 3n: conc 2048 - - conc-list: [2048] + # 2p2d DEP2+TEP8, 5n: conc 32 + - conc-list: [32] prefill: - num-worker: 1 + num-worker: 2 tp: 2 ep: 2 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p2d-dep2-dep4-3n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml" decode: num-worker: 2 - tp: 4 - ep: 4 - dp-attn: true + tp: 8 + ep: 8 + dp-attn: false - # 2p1d DEP2+DEP8, 3n: conc 512,4096 - - conc-list: [512, 4096] + # 2p3d DEP2+DEP4, 4n: conc 8192 + - conc-list: [8192] prefill: num-worker: 2 tp: 2 ep: 2 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-2p1d-dep2-dep8-3n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/2p3d-dep2-dep4-1k1k.yaml" decode: - num-worker: 1 - tp: 8 - ep: 8 + num-worker: 3 + tp: 4 + ep: 4 dp-attn: true - # 2p1d DEP2+TEP8, 3n: conc 32 - - conc-list: [32] + # 2p4d DEP2+DEP4, 5n: conc 8192 + - conc-list: [8192] prefill: num-worker: 2 tp: 2 ep: 2 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-2p1d-dep2-tep8-3n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/2p4d-dep2-dep4-1k1k.yaml" decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: false + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true - # 2p2d DEP2+TEP8, 5n: conc 16 - - conc-list: [16] + # 4p2d DEP2+DEP8, 6n: conc 1024,4096 + - conc-list: [1024, 4096] prefill: - num-worker: 2 + num-worker: 4 tp: 2 ep: 2 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-2p2d-dep2-tep8-5n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/4p2d-dep2-dep8-1k1k.yaml" decode: num-worker: 2 tp: 8 ep: 8 - dp-attn: false + dp-attn: true - # 3p2d DEP2+TEP8, 6n: conc 4 - - conc-list: [4] + - isl: 8192 + osl: 1024 + search-space: + # 1p1d DEP2+DEP8, 3n: conc 256 + - conc-list: [256] prefill: - num-worker: 3 + num-worker: 1 tp: 2 ep: 2 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-3p2d-dep2-tep8-6n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p1d-dep2-dep8-8k1k.yaml" decode: - num-worker: 2 + num-worker: 1 tp: 8 ep: 8 - dp-attn: false + dp-attn: true - - isl: 8192 - osl: 1024 - search-space: - # 1p1d DEP2+TP4 Marlin, 2n: conc 1,4,8,16 - - conc-list: [1, 4, 8, 16] + # 1p1d DEP2+TEP8, 3n: conc 128 + - conc-list: [128] prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-dep2-tp4-marlin-2n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p1d-dep2-tep8-8k1k.yaml" decode: num-worker: 1 - tp: 4 - ep: 1 + tp: 8 + ep: 8 dp-attn: false - # 1p2d DEP2+DEP8, 5n: conc 128 - - conc-list: [128] + # 1p2d DEP2+TEP8, 5n: conc 32,64,128 + - conc-list: [32, 64, 128] prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p2d-dep2-dep8-5n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p2d-dep2-tep8-8k1k.yaml" decode: num-worker: 2 tp: 8 ep: 8 - dp-attn: true + dp-attn: false - # 2p2d DEP2+DEP8, 5n: conc 256,512 - - conc-list: [256, 512] + # 2p1d DEP2+DEP8, 3n: conc 512 + - conc-list: [512] prefill: num-worker: 2 tp: 2 ep: 2 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-2p2d-dep2-dep8-5n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/2p1d-dep2-dep8-8k1k.yaml" decode: - num-worker: 2 + num-worker: 1 tp: 8 ep: 8 dp-attn: true @@ -12214,72 +12213,72 @@ minimaxm3-fp8-gb300-dynamo-vllm: ep: 2 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-2p2d-dep2-tep8-5n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml" decode: num-worker: 2 tp: 8 ep: 8 dp-attn: false - # 3p2d DEP2+DEP8, 6n: conc 512 - - conc-list: [512] + # 2p4d DEP2+TEP4, 5n: conc 4 + - conc-list: [4] prefill: - num-worker: 3 + num-worker: 2 tp: 2 ep: 2 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-3p2d-dep2-dep8-6n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/2p4d-dep2-tep4-8k1k.yaml" decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: false - # 3p2d DEP2+TEP8, 6n: conc 32 - - conc-list: [32] + # 3p1d DEP2+DEP8, 4n: conc 1024 + - conc-list: [1024] prefill: num-worker: 3 tp: 2 ep: 2 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-3p2d-dep2-tep8-6n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/3p1d-dep2-dep8-8k1k.yaml" decode: - num-worker: 2 + num-worker: 1 tp: 8 ep: 8 - dp-attn: false + dp-attn: true - # 4p2d DEP2+DEP8, 6n: conc 4096 - - conc-list: [4096] + # 3p2d DEP2+DEP8, 6n: conc 512 + - conc-list: [512] prefill: - num-worker: 4 + num-worker: 3 tp: 2 ep: 2 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-4p2d-dep2-dep8-6n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml" decode: num-worker: 2 tp: 8 ep: 8 dp-attn: true - # 5p2d DEP2+TEP8, 7n: conc 4,64 - - conc-list: [4, 64] + # 6p1d DEP2+DEP8, 5n: conc 2048 + - conc-list: [2048] prefill: - num-worker: 5 + num-worker: 6 tp: 2 ep: 2 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-5p2d-dep2-tep8-7n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/6p1d-dep2-dep8-8k1k.yaml" decode: - num-worker: 2 + num-worker: 1 tp: 8 ep: 8 - dp-attn: false + dp-attn: true qwen3.5-fp4-b200-trt: image: nvcr.io#nvidia/tensorrt-llm/release:1.3.0rc18 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p2d-dep2-dep4-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/1p1d-dep2-dep4-1k1k.yaml similarity index 76% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p2d-dep2-dep4-3n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/1p1d-dep2-dep4-1k1k.yaml index af5315c76..f57d7af09 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p2d-dep2-dep4-3n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/1p1d-dep2-dep4-1k1k.yaml @@ -1,37 +1,37 @@ -name: "minimax-m3-vllm-disagg-gb300-1p2d-dep2-dep4-fp8-1k1k" +name: "minimax-m3-vllm-disagg-gb300-1p1d-dep2-dep4-fp8-1k1k" -# 1P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D DEP4 (TP1 DP4 EP) -# decode = 3 nodes (1P + 2D). Adapted from NV B300 PR #1863. +# 1P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 1D DEP4 decode (TP1 DP4 EP, 4 GPU/worker = 1 node each). +# GB300 has 4 GPUs/node. Adapted from NV B300 PR #1863. +# Nodes: 1 prefill + 1 decode (+ head/infra). model: path: "minimax-m3-mxfp8" container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" + dynamo: install: true - wheel: "1.2.0.dev20260526" + version: 1.3.0.dev20260614 + +health_check: + max_attempts: 720 + interval_seconds: 10 sbatch_directives: mem: "0" cpus-per-task: "72" + srun_options: mem: "0" -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 720 - interval_seconds: 10 - resources: gpu_type: "gb300" gpus_per_node: 4 prefill_nodes: 1 - decode_nodes: 2 + decode_nodes: 1 prefill_workers: 1 - decode_workers: 2 + decode_workers: 1 gpus_per_prefill: 2 gpus_per_decode: 4 @@ -64,6 +64,7 @@ backend: vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 @@ -75,12 +76,13 @@ backend: max-model-len: 2304 language-model-only: true kv-cache-dtype: fp8 - stream-interval: 32 + stream-interval: 100 max-cudagraph-capture-size: 2048 max-num-batched-tokens: 2048 decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 1 data-parallel-size: 4 data-parallel-rpc-port: 13345 @@ -92,8 +94,8 @@ backend: max-model-len: 2304 language-model-only: true kv-cache-dtype: fp8 - stream-interval: 32 - max-num-seqs: 512 + stream-interval: 100 + max-num-seqs: 1024 max-num-batched-tokens: 16384 max-cudagraph-capture-size: 2048 @@ -101,5 +103,5 @@ benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "2048" + concurrencies: "8192" req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-dep2-tep8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/1p2d-dep2-tep8-1k1k.yaml similarity index 77% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-dep2-tep8-3n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/1p2d-dep2-tep8-1k1k.yaml index 4b00b5660..b4f457654 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-dep2-tep8-3n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/1p2d-dep2-tep8-1k1k.yaml @@ -1,37 +1,37 @@ -name: "minimax-m3-vllm-disagg-gb300-1p1d-dep2-tep8-fp8-1k1k" +name: "minimax-m3-vllm-disagg-gb300-1p2d-dep2-tep8-fp8-1k1k" -# 1P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 1D TEP8 (TP8+EP8) -# decode = 3 nodes (1P + 2D). Adapted from NV B300 PR #1863. +# 1P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D TEP8 decode (TP8 EP, 8 GPU/worker = 2 nodes each). +# GB300 has 4 GPUs/node. Adapted from NV B300 PR #1863. +# Nodes: 1 prefill + 4 decode (+ head/infra). model: path: "minimax-m3-mxfp8" container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" + dynamo: install: true - wheel: "1.2.0.dev20260526" + version: 1.3.0.dev20260614 + +health_check: + max_attempts: 720 + interval_seconds: 10 sbatch_directives: mem: "0" cpus-per-task: "72" + srun_options: mem: "0" -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 720 - interval_seconds: 10 - resources: gpu_type: "gb300" gpus_per_node: 4 prefill_nodes: 1 - decode_nodes: 2 + decode_nodes: 4 prefill_workers: 1 - decode_workers: 1 + decode_workers: 2 gpus_per_prefill: 2 gpus_per_decode: 8 @@ -64,6 +64,7 @@ backend: vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 @@ -81,6 +82,7 @@ backend: decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 8 enable-expert-parallel: true trust-remote-code: true @@ -93,11 +95,11 @@ backend: stream-interval: 32 max-num-seqs: 4096 max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 8192 + max-cudagraph-capture-size: 8196 benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "4x16x64x128x4096" + concurrencies: "4x16x64x128x256" req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-2p2d-dep2-tep8-5n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml similarity index 81% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-2p2d-dep2-tep8-5n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml index 5babf0835..6bba9ea86 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-2p2d-dep2-tep8-5n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml @@ -1,30 +1,30 @@ name: "minimax-m3-vllm-disagg-gb300-2p2d-dep2-tep8-fp8-1k1k" -# 2P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D TEP8 (TP8+EP8) -# decode = 5 nodes (1P + 4D). Adapted from NV B300 PR #1863. +# 2P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D TEP8 decode (TP8 EP, 8 GPU/worker = 2 nodes each). +# GB300 has 4 GPUs/node. Adapted from NV B300 PR #1863. +# Nodes: 1 prefill + 4 decode (+ head/infra). model: path: "minimax-m3-mxfp8" container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" + dynamo: install: true - wheel: "1.2.0.dev20260526" + version: 1.3.0.dev20260614 + +health_check: + max_attempts: 720 + interval_seconds: 10 sbatch_directives: mem: "0" cpus-per-task: "72" + srun_options: mem: "0" -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 720 - interval_seconds: 10 - resources: gpu_type: "gb300" gpus_per_node: 4 @@ -64,6 +64,7 @@ backend: vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 @@ -81,6 +82,7 @@ backend: decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 8 enable-expert-parallel: true trust-remote-code: true @@ -93,11 +95,11 @@ backend: stream-interval: 32 max-num-seqs: 4096 max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 8192 + max-cudagraph-capture-size: 8196 benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "16" + concurrencies: "32" req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-dep2-tp4-marlin-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/2p3d-dep2-dep4-1k1k.yaml similarity index 71% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-dep2-tp4-marlin-2n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/2p3d-dep2-dep4-1k1k.yaml index 26fa89b94..de852e427 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-dep2-tp4-marlin-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/2p3d-dep2-dep4-1k1k.yaml @@ -1,37 +1,37 @@ -name: "minimax-m3-vllm-disagg-gb300-1p1d-dep2-tp4-marlin-fp8-1k1k" +name: "minimax-m3-vllm-disagg-gb300-2p3d-dep2-dep4-fp8-1k1k" -# 1P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 1D TP4 Marlin -# decode = 2 nodes (1P + 1D). Adapted from NV B300 PR #1863. +# 2P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 3D DEP4 decode (TP1 DP4 EP, 4 GPU/worker = 1 node each). +# GB300 has 4 GPUs/node. Adapted from NV B300 PR #1863. +# Nodes: 1 prefill + 3 decode (+ head/infra). model: path: "minimax-m3-mxfp8" container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" + dynamo: install: true - wheel: "1.2.0.dev20260526" + version: 1.3.0.dev20260614 + +health_check: + max_attempts: 720 + interval_seconds: 10 sbatch_directives: mem: "0" cpus-per-task: "72" + srun_options: mem: "0" -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 720 - interval_seconds: 10 - resources: gpu_type: "gb300" gpus_per_node: 4 prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 + decode_nodes: 3 + prefill_workers: 2 + decode_workers: 3 gpus_per_prefill: 2 gpus_per_decode: 4 @@ -64,6 +64,7 @@ backend: vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 @@ -75,15 +76,17 @@ backend: max-model-len: 2304 language-model-only: true kv-cache-dtype: fp8 - stream-interval: 32 + stream-interval: 100 max-cudagraph-capture-size: 2048 max-num-batched-tokens: 2048 decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 - enable-expert-parallel: false - moe-backend: marlin + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + tensor-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true trust-remote-code: true no-enable-prefix-caching: true block-size: 128 @@ -91,8 +94,8 @@ backend: max-model-len: 2304 language-model-only: true kv-cache-dtype: fp8 - stream-interval: 32 - max-num-seqs: 4096 + stream-interval: 100 + max-num-seqs: 1024 max-num-batched-tokens: 16384 max-cudagraph-capture-size: 2048 @@ -100,5 +103,5 @@ benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "1x4x8x16" + concurrencies: "8192" req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/2p4d-dep2-dep4-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/2p4d-dep2-dep4-1k1k.yaml new file mode 100644 index 000000000..8f7b7b140 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/2p4d-dep2-dep4-1k1k.yaml @@ -0,0 +1,107 @@ +name: "minimax-m3-vllm-disagg-gb300-2p4d-dep2-dep4-fp8-1k1k" + +# 2P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 4D DEP4 decode (TP1 DP4 EP, 4 GPU/worker = 1 node each). +# GB300 has 4 GPUs/node. Adapted from NV B300 PR #1863. +# Nodes: 1 prefill + 4 decode (+ head/infra). + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" + precision: "fp8" + + +dynamo: + install: true + version: 1.3.0.dev20260614 + +health_check: + max_attempts: 720 + interval_seconds: 10 + +sbatch_directives: + mem: "0" + cpus-per-task: "72" + +srun_options: + mem: "0" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 4 + prefill_workers: 2 + decode_workers: 4 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + kv-cache-dtype: fp8 + stream-interval: 100 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + tensor-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + kv-cache-dtype: fp8 + stream-interval: 100 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 2048 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "8192" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-2p1d-dep2-dep8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/4p2d-dep2-dep8-1k1k.yaml similarity index 75% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-2p1d-dep2-dep8-3n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/4p2d-dep2-dep8-1k1k.yaml index 7cc5f50c4..f6cf6a59f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-2p1d-dep2-dep8-3n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/4p2d-dep2-dep8-1k1k.yaml @@ -1,37 +1,37 @@ -name: "minimax-m3-vllm-disagg-gb300-2p1d-dep2-dep8-fp8-1k1k" +name: "minimax-m3-vllm-disagg-gb300-4p2d-dep2-dep8-fp8-1k1k" -# 2P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 1D DEP8 (TP1 DP8 EP) -# decode = 3 nodes (1P + 2D). Adapted from NV B300 PR #1863. +# 4P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D DEP8 decode (TP1 DP8 EP, 8 GPU/worker = 2 nodes each). +# GB300 has 4 GPUs/node. Adapted from NV B300 PR #1863. +# Nodes: 2 prefill + 4 decode (+ head/infra). model: path: "minimax-m3-mxfp8" container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" + dynamo: install: true - wheel: "1.2.0.dev20260526" + version: 1.3.0.dev20260614 + +health_check: + max_attempts: 720 + interval_seconds: 10 sbatch_directives: mem: "0" cpus-per-task: "72" + srun_options: mem: "0" -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 720 - interval_seconds: 10 - resources: gpu_type: "gb300" gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 2 - prefill_workers: 2 - decode_workers: 1 + prefill_nodes: 2 + decode_nodes: 4 + prefill_workers: 4 + decode_workers: 2 gpus_per_prefill: 2 gpus_per_decode: 8 @@ -64,6 +64,7 @@ backend: vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 @@ -75,12 +76,13 @@ backend: max-model-len: 2304 language-model-only: true kv-cache-dtype: fp8 - stream-interval: 32 + stream-interval: 100 max-cudagraph-capture-size: 2048 max-num-batched-tokens: 2048 decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 1 data-parallel-size: 8 data-parallel-rpc-port: 13345 @@ -92,8 +94,8 @@ backend: max-model-len: 2304 language-model-only: true kv-cache-dtype: fp8 - stream-interval: 32 - max-num-seqs: 512 + stream-interval: 100 + max-num-seqs: 4096 max-num-batched-tokens: 16384 max-cudagraph-capture-size: 2048 @@ -101,5 +103,5 @@ benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "512x4096" + concurrencies: "1024x4096" req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-2p1d-dep2-tep8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-2p1d-dep2-tep8-3n.yaml deleted file mode 100644 index 0c4f3498c..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-2p1d-dep2-tep8-3n.yaml +++ /dev/null @@ -1,103 +0,0 @@ -name: "minimax-m3-vllm-disagg-gb300-2p1d-dep2-tep8-fp8-1k1k" - -# 2P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 1D TEP8 (TP8+EP8) -# decode = 3 nodes (1P + 2D). Adapted from NV B300 PR #1863. - -model: - path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -sbatch_directives: - mem: "0" - cpus-per-task: "72" -srun_options: - mem: "0" - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 720 - interval_seconds: 10 - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 2 - prefill_workers: 2 - decode_workers: 1 - gpus_per_prefill: 2 - gpus_per_decode: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 1 - data-parallel-size: 2 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - trust-remote-code: true - no-enable-prefix-caching: true - block-size: 128 - gpu-memory-utilization: 0.90 - max-model-len: 2304 - language-model-only: true - kv-cache-dtype: fp8 - stream-interval: 32 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 8 - enable-expert-parallel: true - trust-remote-code: true - no-enable-prefix-caching: true - block-size: 128 - gpu-memory-utilization: 0.90 - max-model-len: 2304 - language-model-only: true - kv-cache-dtype: fp8 - stream-interval: 32 - max-num-seqs: 4096 - max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 8192 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "32" - req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-3p2d-dep2-tep8-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-3p2d-dep2-tep8-6n.yaml deleted file mode 100644 index d4176055a..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-3p2d-dep2-tep8-6n.yaml +++ /dev/null @@ -1,103 +0,0 @@ -name: "minimax-m3-vllm-disagg-gb300-3p2d-dep2-tep8-fp8-1k1k" - -# 3P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D TEP8 (TP8+EP8) -# decode = 6 nodes (2P + 4D). Adapted from NV B300 PR #1863. - -model: - path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -sbatch_directives: - mem: "0" - cpus-per-task: "72" -srun_options: - mem: "0" - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 720 - interval_seconds: 10 - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 4 - prefill_workers: 3 - decode_workers: 2 - gpus_per_prefill: 2 - gpus_per_decode: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 1 - data-parallel-size: 2 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - trust-remote-code: true - no-enable-prefix-caching: true - block-size: 128 - gpu-memory-utilization: 0.90 - max-model-len: 2304 - language-model-only: true - kv-cache-dtype: fp8 - stream-interval: 32 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 8 - enable-expert-parallel: true - trust-remote-code: true - no-enable-prefix-caching: true - block-size: 128 - gpu-memory-utilization: 0.90 - max-model-len: 2304 - language-model-only: true - kv-cache-dtype: fp8 - stream-interval: 32 - max-num-seqs: 4096 - max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 8192 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4" - req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p2d-dep2-dep8-5n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p1d-dep2-dep8-8k1k.yaml similarity index 76% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p2d-dep2-dep8-5n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p1d-dep2-dep8-8k1k.yaml index b56b65b26..d990d661b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p2d-dep2-dep8-5n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p1d-dep2-dep8-8k1k.yaml @@ -1,37 +1,37 @@ -name: "minimax-m3-vllm-disagg-gb300-1p2d-dep2-dep8-fp8-8k1k" +name: "minimax-m3-vllm-disagg-gb300-1p1d-dep2-dep8-fp8-8k1k" -# 1P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D DEP8 (TP1 DP8 EP) -# decode = 5 nodes (1P + 4D). Adapted from NV B300 PR #1863. +# 1P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 1D DEP8 decode (TP1 DP8 EP, 8 GPU/worker = 2 nodes each). +# GB300 has 4 GPUs/node. Adapted from NV B300 PR #1863. +# Nodes: 1 prefill + 2 decode (+ head/infra). model: path: "minimax-m3-mxfp8" container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" + dynamo: install: true - wheel: "1.2.0.dev20260526" + version: 1.3.0.dev20260614 + +health_check: + max_attempts: 720 + interval_seconds: 10 sbatch_directives: mem: "0" cpus-per-task: "72" + srun_options: mem: "0" -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 720 - interval_seconds: 10 - resources: gpu_type: "gb300" gpus_per_node: 4 prefill_nodes: 1 - decode_nodes: 4 + decode_nodes: 2 prefill_workers: 1 - decode_workers: 2 + decode_workers: 1 gpus_per_prefill: 2 gpus_per_decode: 8 @@ -64,6 +64,7 @@ backend: vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 @@ -75,12 +76,13 @@ backend: max-model-len: 9472 language-model-only: true kv-cache-dtype: fp8 - stream-interval: 32 + stream-interval: 100 max-cudagraph-capture-size: 2048 max-num-batched-tokens: 16384 decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 1 data-parallel-size: 8 data-parallel-rpc-port: 13345 @@ -92,8 +94,8 @@ backend: max-model-len: 9472 language-model-only: true kv-cache-dtype: fp8 - stream-interval: 32 - max-num-seqs: 512 + stream-interval: 100 + max-num-seqs: 1024 max-num-batched-tokens: 16384 max-cudagraph-capture-size: 2048 @@ -101,5 +103,5 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "128" + concurrencies: "256" req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-3p2d-dep2-tep8-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p1d-dep2-tep8-8k1k.yaml similarity index 77% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-3p2d-dep2-tep8-6n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p1d-dep2-tep8-8k1k.yaml index 35950dc32..d46133924 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-3p2d-dep2-tep8-6n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p1d-dep2-tep8-8k1k.yaml @@ -1,37 +1,37 @@ -name: "minimax-m3-vllm-disagg-gb300-3p2d-dep2-tep8-fp8-8k1k" +name: "minimax-m3-vllm-disagg-gb300-1p1d-dep2-tep8-fp8-8k1k" -# 3P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D TEP8 (TP8+EP8) -# decode = 6 nodes (2P + 4D). Adapted from NV B300 PR #1863. +# 1P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 1D TEP8 decode (TP8 EP, 8 GPU/worker = 2 nodes each). +# GB300 has 4 GPUs/node. Adapted from NV B300 PR #1863. +# Nodes: 1 prefill + 2 decode (+ head/infra). model: path: "minimax-m3-mxfp8" container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" + dynamo: install: true - wheel: "1.2.0.dev20260526" + version: 1.3.0.dev20260614 + +health_check: + max_attempts: 720 + interval_seconds: 10 sbatch_directives: mem: "0" cpus-per-task: "72" + srun_options: mem: "0" -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 720 - interval_seconds: 10 - resources: gpu_type: "gb300" gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 4 - prefill_workers: 3 - decode_workers: 2 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 gpus_per_prefill: 2 gpus_per_decode: 8 @@ -64,6 +64,7 @@ backend: vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 @@ -81,6 +82,7 @@ backend: decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 8 enable-expert-parallel: true trust-remote-code: true @@ -99,5 +101,5 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "32" + concurrencies: "128" req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-5p2d-dep2-tep8-7n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p2d-dep2-tep8-8k1k.yaml similarity index 78% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-5p2d-dep2-tep8-7n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p2d-dep2-tep8-8k1k.yaml index dbc9c5c9a..e8c606e27 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-5p2d-dep2-tep8-7n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p2d-dep2-tep8-8k1k.yaml @@ -1,36 +1,36 @@ -name: "minimax-m3-vllm-disagg-gb300-5p2d-dep2-tep8-fp8-8k1k" +name: "minimax-m3-vllm-disagg-gb300-1p2d-dep2-tep8-fp8-8k1k" -# 5P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D TEP8 (TP8+EP8) -# decode = 7 nodes (3P + 4D). Adapted from NV B300 PR #1863. +# 1P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D TEP8 decode (TP8 EP, 8 GPU/worker = 2 nodes each). +# GB300 has 4 GPUs/node. Adapted from NV B300 PR #1863. +# Nodes: 1 prefill + 4 decode (+ head/infra). model: path: "minimax-m3-mxfp8" container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" + dynamo: install: true - wheel: "1.2.0.dev20260526" + version: 1.3.0.dev20260614 + +health_check: + max_attempts: 720 + interval_seconds: 10 sbatch_directives: mem: "0" cpus-per-task: "72" + srun_options: mem: "0" -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 720 - interval_seconds: 10 - resources: gpu_type: "gb300" gpus_per_node: 4 - prefill_nodes: 3 + prefill_nodes: 1 decode_nodes: 4 - prefill_workers: 5 + prefill_workers: 1 decode_workers: 2 gpus_per_prefill: 2 gpus_per_decode: 8 @@ -64,6 +64,7 @@ backend: vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 @@ -81,6 +82,7 @@ backend: decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 8 enable-expert-parallel: true trust-remote-code: true @@ -99,5 +101,5 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "4x64" + concurrencies: "32x64x128" req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-2p2d-dep2-dep8-5n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/2p1d-dep2-dep8-8k1k.yaml similarity index 76% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-2p2d-dep2-dep8-5n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/2p1d-dep2-dep8-8k1k.yaml index 7beba3420..02c3be14a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-2p2d-dep2-dep8-5n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/2p1d-dep2-dep8-8k1k.yaml @@ -1,37 +1,37 @@ -name: "minimax-m3-vllm-disagg-gb300-2p2d-dep2-dep8-fp8-8k1k" +name: "minimax-m3-vllm-disagg-gb300-2p1d-dep2-dep8-fp8-8k1k" -# 2P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D DEP8 (TP1 DP8 EP) -# decode = 5 nodes (1P + 4D). Adapted from NV B300 PR #1863. +# 2P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 1D DEP8 decode (TP1 DP8 EP, 8 GPU/worker = 2 nodes each). +# GB300 has 4 GPUs/node. Adapted from NV B300 PR #1863. +# Nodes: 1 prefill + 2 decode (+ head/infra). model: path: "minimax-m3-mxfp8" container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" + dynamo: install: true - wheel: "1.2.0.dev20260526" + version: 1.3.0.dev20260614 + +health_check: + max_attempts: 720 + interval_seconds: 10 sbatch_directives: mem: "0" cpus-per-task: "72" + srun_options: mem: "0" -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 720 - interval_seconds: 10 - resources: gpu_type: "gb300" gpus_per_node: 4 prefill_nodes: 1 - decode_nodes: 4 + decode_nodes: 2 prefill_workers: 2 - decode_workers: 2 + decode_workers: 1 gpus_per_prefill: 2 gpus_per_decode: 8 @@ -64,6 +64,7 @@ backend: vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 @@ -75,12 +76,13 @@ backend: max-model-len: 9472 language-model-only: true kv-cache-dtype: fp8 - stream-interval: 32 + stream-interval: 100 max-cudagraph-capture-size: 2048 max-num-batched-tokens: 16384 decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 1 data-parallel-size: 8 data-parallel-rpc-port: 13345 @@ -92,8 +94,8 @@ backend: max-model-len: 9472 language-model-only: true kv-cache-dtype: fp8 - stream-interval: 32 - max-num-seqs: 512 + stream-interval: 100 + max-num-seqs: 1024 max-num-batched-tokens: 16384 max-cudagraph-capture-size: 2048 @@ -101,5 +103,5 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "256x512" + concurrencies: "512" req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-2p2d-dep2-tep8-5n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml similarity index 83% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-2p2d-dep2-tep8-5n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml index 1ea678ace..304650d6c 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-2p2d-dep2-tep8-5n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml @@ -1,30 +1,30 @@ name: "minimax-m3-vllm-disagg-gb300-2p2d-dep2-tep8-fp8-8k1k" -# 2P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D TEP8 (TP8+EP8) -# decode = 5 nodes (1P + 4D). Adapted from NV B300 PR #1863. +# 2P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D TEP8 decode (TP8 EP, 8 GPU/worker = 2 nodes each). +# GB300 has 4 GPUs/node. Adapted from NV B300 PR #1863. +# Nodes: 1 prefill + 4 decode (+ head/infra). model: path: "minimax-m3-mxfp8" container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" + dynamo: install: true - wheel: "1.2.0.dev20260526" + version: 1.3.0.dev20260614 + +health_check: + max_attempts: 720 + interval_seconds: 10 sbatch_directives: mem: "0" cpus-per-task: "72" + srun_options: mem: "0" -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 720 - interval_seconds: 10 - resources: gpu_type: "gb300" gpus_per_node: 4 @@ -64,6 +64,7 @@ backend: vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 @@ -81,6 +82,7 @@ backend: decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 8 enable-expert-parallel: true trust-remote-code: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/2p4d-dep2-tep4-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/2p4d-dep2-tep4-8k1k.yaml new file mode 100644 index 000000000..efea8bfac --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/2p4d-dep2-tep4-8k1k.yaml @@ -0,0 +1,105 @@ +name: "minimax-m3-vllm-disagg-gb300-2p4d-dep2-tep4-fp8-8k1k" + +# 2P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 4D TEP4 decode (TP4 EP, 4 GPU/worker = 1 node each). +# GB300 has 4 GPUs/node. Adapted from NV B300 PR #1863. +# Nodes: 1 prefill + 4 decode (+ head/infra). + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" + precision: "fp8" + + +dynamo: + install: true + version: 1.3.0.dev20260614 + +health_check: + max_attempts: 720 + interval_seconds: 10 + +sbatch_directives: + mem: "0" + cpus-per-task: "72" + +srun_options: + mem: "0" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 4 + prefill_workers: 2 + decode_workers: 4 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + kv-cache-dtype: fp8 + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + tensor-parallel-size: 4 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + kv-cache-dtype: fp8 + stream-interval: 32 + max-num-seqs: 512 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-4p2d-dep2-dep8-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/3p1d-dep2-dep8-8k1k.yaml similarity index 75% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-4p2d-dep2-dep8-6n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/3p1d-dep2-dep8-8k1k.yaml index 1526cd7ad..97e1ec88c 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-4p2d-dep2-dep8-6n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/3p1d-dep2-dep8-8k1k.yaml @@ -1,37 +1,37 @@ -name: "minimax-m3-vllm-disagg-gb300-4p2d-dep2-dep8-fp8-8k1k" +name: "minimax-m3-vllm-disagg-gb300-3p1d-dep2-dep8-fp8-8k1k" -# 4P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D DEP8 (TP1 DP8 EP) -# decode = 6 nodes (2P + 4D). Adapted from NV B300 PR #1863. +# 3P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 1D DEP8 decode (TP1 DP8 EP, 8 GPU/worker = 2 nodes each). +# GB300 has 4 GPUs/node. Adapted from NV B300 PR #1863. +# Nodes: 2 prefill + 2 decode (+ head/infra). model: path: "minimax-m3-mxfp8" container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" + dynamo: install: true - wheel: "1.2.0.dev20260526" + version: 1.3.0.dev20260614 + +health_check: + max_attempts: 720 + interval_seconds: 10 sbatch_directives: mem: "0" cpus-per-task: "72" + srun_options: mem: "0" -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 720 - interval_seconds: 10 - resources: gpu_type: "gb300" gpus_per_node: 4 prefill_nodes: 2 - decode_nodes: 4 - prefill_workers: 4 - decode_workers: 2 + decode_nodes: 2 + prefill_workers: 3 + decode_workers: 1 gpus_per_prefill: 2 gpus_per_decode: 8 @@ -64,6 +64,7 @@ backend: vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 @@ -75,12 +76,13 @@ backend: max-model-len: 9472 language-model-only: true kv-cache-dtype: fp8 - stream-interval: 32 + stream-interval: 100 max-cudagraph-capture-size: 2048 max-num-batched-tokens: 16384 decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 1 data-parallel-size: 8 data-parallel-rpc-port: 13345 @@ -92,8 +94,8 @@ backend: max-model-len: 9472 language-model-only: true kv-cache-dtype: fp8 - stream-interval: 32 - max-num-seqs: 512 + stream-interval: 100 + max-num-seqs: 1024 max-num-batched-tokens: 16384 max-cudagraph-capture-size: 2048 @@ -101,5 +103,5 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "4096" + concurrencies: "1024" req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-3p2d-dep2-dep8-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml similarity index 81% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-3p2d-dep2-dep8-6n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml index f4e000a5f..745b2fad4 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-3p2d-dep2-dep8-6n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml @@ -1,30 +1,30 @@ name: "minimax-m3-vllm-disagg-gb300-3p2d-dep2-dep8-fp8-8k1k" -# 3P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D DEP8 (TP1 DP8 EP) -# decode = 6 nodes (2P + 4D). Adapted from NV B300 PR #1863. +# 3P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D DEP8 decode (TP1 DP8 EP, 8 GPU/worker = 2 nodes each). +# GB300 has 4 GPUs/node. Adapted from NV B300 PR #1863. +# Nodes: 2 prefill + 4 decode (+ head/infra). model: path: "minimax-m3-mxfp8" container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" + dynamo: install: true - wheel: "1.2.0.dev20260526" + version: 1.3.0.dev20260614 + +health_check: + max_attempts: 720 + interval_seconds: 10 sbatch_directives: mem: "0" cpus-per-task: "72" + srun_options: mem: "0" -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 720 - interval_seconds: 10 - resources: gpu_type: "gb300" gpus_per_node: 4 @@ -64,6 +64,7 @@ backend: vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 @@ -75,12 +76,13 @@ backend: max-model-len: 9472 language-model-only: true kv-cache-dtype: fp8 - stream-interval: 32 + stream-interval: 100 max-cudagraph-capture-size: 2048 max-num-batched-tokens: 16384 decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 1 data-parallel-size: 8 data-parallel-rpc-port: 13345 @@ -92,8 +94,8 @@ backend: max-model-len: 9472 language-model-only: true kv-cache-dtype: fp8 - stream-interval: 32 - max-num-seqs: 512 + stream-interval: 100 + max-num-seqs: 1024 max-num-batched-tokens: 16384 max-cudagraph-capture-size: 2048 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/6p1d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/6p1d-dep2-dep8-8k1k.yaml new file mode 100644 index 000000000..9be5cc177 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/6p1d-dep2-dep8-8k1k.yaml @@ -0,0 +1,107 @@ +name: "minimax-m3-vllm-disagg-gb300-6p1d-dep2-dep8-fp8-8k1k" + +# 6P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 1D DEP8 decode (TP1 DP8 EP, 8 GPU/worker = 2 nodes each). +# GB300 has 4 GPUs/node. Adapted from NV B300 PR #1863. +# Nodes: 3 prefill + 2 decode (+ head/infra). + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" + precision: "fp8" + + +dynamo: + install: true + version: 1.3.0.dev20260614 + +health_check: + max_attempts: 720 + interval_seconds: 10 + +sbatch_directives: + mem: "0" + cpus-per-task: "72" + +srun_options: + mem: "0" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 3 + decode_nodes: 2 + prefill_workers: 6 + decode_workers: 1 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + kv-cache-dtype: fp8 + stream-interval: 100 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + tensor-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + kv-cache-dtype: fp8 + stream-interval: 100 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 2048 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2048" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-dep2-tp4-marlin-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-dep2-tp4-marlin-2n.yaml deleted file mode 100644 index 4ee41241e..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-dep2-tp4-marlin-2n.yaml +++ /dev/null @@ -1,104 +0,0 @@ -name: "minimax-m3-vllm-disagg-gb300-1p1d-dep2-tp4-marlin-fp8-8k1k" - -# 1P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 1D TP4 Marlin -# decode = 2 nodes (1P + 1D). Adapted from NV B300 PR #1863. - -model: - path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -sbatch_directives: - mem: "0" - cpus-per-task: "72" -srun_options: - mem: "0" - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 720 - interval_seconds: 10 - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 2 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 1 - data-parallel-size: 2 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - trust-remote-code: true - no-enable-prefix-caching: true - block-size: 128 - gpu-memory-utilization: 0.90 - max-model-len: 9472 - language-model-only: true - kv-cache-dtype: fp8 - stream-interval: 32 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 16384 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 - enable-expert-parallel: false - moe-backend: marlin - trust-remote-code: true - no-enable-prefix-caching: true - block-size: 128 - gpu-memory-utilization: 0.90 - max-model-len: 9472 - language-model-only: true - kv-cache-dtype: fp8 - stream-interval: 32 - max-num-seqs: 1024 - max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 2048 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "1x4x8x16" - req_rate: "inf" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index b3c7d283a..0ce7ceecd 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4222,6 +4222,15 @@ - "Initial submission: MiniMax-M3 MXFP4 disagg (prefill/decode) on MI355X with vLLM over the MoRI-IO KV connector (8k/1k)." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1914 +- config-keys: + - minimaxm3-fp8-gb300-dynamo-vllm + description: + - "Refresh GB300 MiniMax-M3 disagg recipe set: replace disagg-gb300-* files with new naming convention; drop TP4+Marlin variants." + - "1k/1k topologies (6 shapes): 1p1d-dep2-dep4 (conc 8192), 1p2d-dep2-tep8 (conc 4,16,64,128,256), 2p2d-dep2-tep8 (conc 32), 2p3d-dep2-dep4 (conc 8192), 2p4d-dep2-dep4 (conc 8192), 4p2d-dep2-dep8 (conc 1024,4096)." + - "8k/1k topologies (9 shapes): 1p1d-dep2-dep8 (conc 256), 1p1d-dep2-tep8 (conc 128), 1p2d-dep2-tep8 (conc 32,64,128), 2p1d-dep2-dep8 (conc 512), 2p2d-dep2-tep8 (conc 16), 2p4d-dep2-tep4 (conc 4), 3p1d-dep2-dep8 (conc 1024), 3p2d-dep2-dep8 (conc 512), 6p1d-dep2-dep8 (conc 2048)." + - "Image unchanged: vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223." + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1925 + - config-keys: - minimaxm3-fp4-b300-vllm description: diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh index 689451443..676615500 100644 --- a/runners/launch_gb300-nv.sh +++ b/runners/launch_gb300-nv.sh @@ -182,7 +182,7 @@ elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm2.5" && $PRECIS elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm3" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" - git checkout main + git checkout sa-submission-q2-2026 mkdir -p recipes/vllm/minimax-m3-gb300-fp8 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8" recipes/vllm/minimax-m3-gb300-fp8 SRTCTL_SETUP_SCRIPT="minimax-m3-gb300-vllm-fixes.sh"