From de20fa83058126b9b5dd3ad3805c3183fd3c07f6 Mon Sep 17 00:00:00 2001 From: jasonlizhengjian Date: Mon, 15 Jun 2026 13:25:40 -0700 Subject: [PATCH 1/5] Add MiniMax M3 B300 Dynamo vLLM recipes --- .github/configs/nvidia-master.yaml | 93 +++++++++++++++++++ .../b300-fp8/1k1k/1p1d-dep2-tep8-1k1k.yaml | 81 ++++++++++++++++ .../b300-fp8/1k1k/1p2d-dep2-dep4-1k1k.yaml | 83 +++++++++++++++++ .../b300-fp8/1k1k/2p1d-dep2-dep8-1k1k.yaml | 83 +++++++++++++++++ .../b300-fp8/1k1k/2p1d-dep2-tep8-1k1k.yaml | 81 ++++++++++++++++ .../b300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml | 81 ++++++++++++++++ .../b300-fp8/1k1k/3p2d-dep2-tep8-1k1k.yaml | 81 ++++++++++++++++ perf-changelog.yaml | 7 ++ runners/launch_b300-nv.sh | 11 ++- 9 files changed, 600 insertions(+), 1 deletion(-) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tep8-1k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p2d-dep2-dep4-1k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-dep8-1k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-tep8-1k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/3p2d-dep2-tep8-1k1k.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index c8fe1ea97..c38d213a6 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -12363,6 +12363,99 @@ minimaxm2.5-fp8-b300-dynamo-vllm: ep: 4 dp-attn: true +minimaxm3-fp8-b300-dynamo-vllm: + image: vllm/vllm-openai:minimax-m3 + model: MiniMaxAI/MiniMax-M3-MXFP8 + model-prefix: minimaxm3 + runner: b300 + precision: fp8 + framework: dynamo-vllm + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - conc-list: [4, 16, 64, 128, 4096] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tep8-1k1k.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [2048] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/1p2d-dep2-dep4-1k1k.yaml" + decode: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + - conc-list: [512, 4096] + prefill: + num-worker: 2 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-dep8-1k1k.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [32] + prefill: + num-worker: 2 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-tep8-1k1k.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [16] + prefill: + num-worker: 2 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [4] + prefill: + num-worker: 3 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/3p2d-dep2-tep8-1k1k.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + minimaxm2.5-fp8-gb300-dynamo-vllm: image: vllm/vllm-openai:v0.20.1 model: MiniMaxAI/MiniMax-M2.5 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tep8-1k1k.yaml new file mode 100644 index 000000000..e1c908228 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tep8-1k1k.yaml @@ -0,0 +1,81 @@ +name: "minimax-m3-vllm-disagg-b300-1p1d-fp8-dep2-tep8-1k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-num-seqs: 4096 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 8196 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4x16x64x128x4096" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p2d-dep2-dep4-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p2d-dep2-dep4-1k1k.yaml new file mode 100644 index 000000000..625927baf --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p2d-dep2-dep4-1k1k.yaml @@ -0,0 +1,83 @@ +name: "minimax-m3-vllm-disagg-b300-1p2d-fp8-dep2-dep4-1k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + tensor-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 8196 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "2048" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-dep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-dep8-1k1k.yaml new file mode 100644 index 000000000..951cdfc94 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-dep8-1k1k.yaml @@ -0,0 +1,83 @@ +name: "minimax-m3-vllm-disagg-b300-2p1d-fp8-dep2-dep8-1k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + tensor-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-num-seqs: 4096 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 8196 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "512x4096" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-tep8-1k1k.yaml new file mode 100644 index 000000000..495fc9b78 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-tep8-1k1k.yaml @@ -0,0 +1,81 @@ +name: "minimax-m3-vllm-disagg-b300-2p1d-fp8-dep2-tep8-1k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-num-seqs: 4096 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 8196 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "32" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml new file mode 100644 index 000000000..58c533504 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml @@ -0,0 +1,81 @@ +name: "minimax-m3-vllm-disagg-b300-2p2d-fp8-dep2-tep8-1k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 2 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-num-seqs: 4096 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 8196 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "16" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/3p2d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/3p2d-dep2-tep8-1k1k.yaml new file mode 100644 index 000000000..165072cc5 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/3p2d-dep2-tep8-1k1k.yaml @@ -0,0 +1,81 @@ +name: "minimax-m3-vllm-disagg-b300-3p2d-fp8-dep2-tep8-1k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 3 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-num-seqs: 4096 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 8196 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4" + req_rate: "inf" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 67a91d5d2..02be97989 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3887,3 +3887,10 @@ - "Align MiniMax-M3 B300 vLLM fixed-sequence serving with MiniMax-M2.5 FP8 settings by setting VLLM_FLOAT32_MATMUL_PRECISION=high and restoring max cudagraph capture size 2048." - "Add TP4+EP4 coverage for MiniMax-M3 B300: DP-attention rows for 1k1k/8k1k and the missing non-DP-attention row for 8k1k." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1781 + +- config-keys: + - minimaxm3-fp8-b300-dynamo-vllm + description: + - "Add MiniMax-M3 MXFP8 B300 disaggregated vLLM benchmarks via Dynamo for 1k1k STP." + - "Add six local srt-slurm recipes under benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k and wire the B300 launcher to overlay them into srt-slurm." + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index a941860c0..c32516c9e 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -45,8 +45,11 @@ elif [[ $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp4" && $FRAMEWORK == " elif [[ $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp8" && $FRAMEWORK == "dynamo-vllm" ]]; then export MODEL_PATH="/data/models/MiniMax-M2.5" export SRT_SLURM_MODEL_PREFIX="minimax-m2.5-fp8" +elif [[ $MODEL_PREFIX == "minimaxm3" && $PRECISION == "fp8" && $FRAMEWORK == "dynamo-vllm" ]]; then + export MODEL_PATH="/data/models/MiniMax-M3-MXFP8" + export SRT_SLURM_MODEL_PREFIX="MiniMaxAI/MiniMax-M3-MXFP8" else - echo "Unsupported model: $MODEL_PREFIX-$PRECISION. Supported models are: dsr1-fp4, dsr1-fp8, dsv4-fp4 with dynamo-vllm, minimaxm2.5-fp4 with dynamo-vllm, minimaxm2.5-fp8 with dynamo-vllm" + echo "Unsupported model: $MODEL_PREFIX-$PRECISION. Supported models are: dsr1-fp4, dsr1-fp8, dsv4-fp4 with dynamo-vllm, minimaxm2.5-fp4 with dynamo-vllm, minimaxm2.5-fp8 with dynamo-vllm, minimaxm3-fp8 with dynamo-vllm" exit 1 fi @@ -79,6 +82,12 @@ elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm2.5" && $PRECIS git checkout main mkdir -p recipes/vllm/minimax-m2.5-fp8 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8" recipes/vllm/minimax-m2.5-fp8 +elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm3" && $PRECISION == "fp8" ]]; then + git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" + cd "$SRT_REPO_DIR" || exit 1 + git checkout main + mkdir -p recipes/vllm/minimax-m3 + cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3" recipes/vllm/minimax-m3 else git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" || exit 1 From 131639e227ff487bab2c42c7cc0e7915f1e26cf1 Mon Sep 17 00:00:00 2001 From: jasonlizhengjian Date: Mon, 15 Jun 2026 13:26:46 -0700 Subject: [PATCH 2/5] Update MiniMax M3 B300 changelog link --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 02be97989..363ab062c 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3893,4 +3893,4 @@ description: - "Add MiniMax-M3 MXFP8 B300 disaggregated vLLM benchmarks via Dynamo for 1k1k STP." - "Add six local srt-slurm recipes under benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k and wire the B300 launcher to overlay them into srt-slurm." - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1787 From a2d9824a7c7df059d7eb56c253325b8c4c468310 Mon Sep 17 00:00:00 2001 From: jasonlizhengjian Date: Tue, 16 Jun 2026 10:58:41 -0700 Subject: [PATCH 3/5] Add MiniMax M3 B300 8k Dynamo vLLM recipes --- .github/configs/nvidia-master.yaml | 133 ++++++++++++++++++ .../1k1k/1p1d-dep2-tp8-marlin-1k1k.yaml | 82 +++++++++++ .../b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml | 81 +++++++++++ .../b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml | 81 +++++++++++ .../b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml | 80 +++++++++++ .../b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml | 81 +++++++++++ .../b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml | 80 +++++++++++ .../b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml | 81 +++++++++++ .../b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml | 81 +++++++++++ .../b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml | 80 +++++++++++ .../8k1k/5p2d-dep2-tp8-marlin-8k1k.yaml | 81 +++++++++++ perf-changelog.yaml | 5 +- 12 files changed, 944 insertions(+), 2 deletions(-) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp8-marlin-1k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tp8-marlin-8k1k.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index c38d213a6..65068f8e1 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -12390,6 +12390,19 @@ minimaxm3-fp8-b300-dynamo-vllm: tp: 8 ep: 8 dp-attn: false + - conc-list: [1, 4, 8, 16] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp8-marlin-1k1k.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false - conc-list: [2048] prefill: num-worker: 1 @@ -12455,6 +12468,126 @@ minimaxm3-fp8-b300-dynamo-vllm: tp: 8 ep: 8 dp-attn: false + - isl: 8192 + osl: 1024 + search-space: + - conc-list: [128] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [256, 512] + prefill: + num-worker: 2 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [16] + prefill: + num-worker: 2 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [512] + prefill: + num-worker: 3 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [32] + prefill: + num-worker: 3 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [4096] + prefill: + num-worker: 4 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [4096] + prefill: + num-worker: 4 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml" + decode: + num-worker: 3 + tp: 4 + ep: 4 + dp-attn: true + - conc-list: [4, 64] + prefill: + num-worker: 5 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [1, 4, 8, 16] + prefill: + num-worker: 5 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tp8-marlin-8k1k.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false minimaxm2.5-fp8-gb300-dynamo-vllm: image: vllm/vllm-openai:v0.20.1 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp8-marlin-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp8-marlin-1k1k.yaml new file mode 100644 index 000000000..452d748b4 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp8-marlin-1k1k.yaml @@ -0,0 +1,82 @@ +name: "minimax-m3-vllm-disagg-b300-1p1d-fp8-dep2-tp8-marlin-1k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + tensor-parallel-size: 8 + enable-expert-parallel: false + moe-backend: marlin + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-num-seqs: 4096 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 8196 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1x4x8x16" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml new file mode 100644 index 000000000..cb9256e25 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml @@ -0,0 +1,81 @@ +name: "minimax-m3-vllm-disagg-b300-1p2d-fp8-dep2-dep8-8k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + tensor-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "128" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml new file mode 100644 index 000000000..7a805df7f --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml @@ -0,0 +1,81 @@ +name: "minimax-m3-vllm-disagg-b300-2p2d-fp8-dep2-dep8-8k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 2 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + tensor-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml new file mode 100644 index 000000000..8dc4c5c06 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml @@ -0,0 +1,80 @@ +name: "minimax-m3-vllm-disagg-b300-2p2d-fp8-dep2-tep8-8k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 2 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "16" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml new file mode 100644 index 000000000..8ff86e9d9 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml @@ -0,0 +1,81 @@ +name: "minimax-m3-vllm-disagg-b300-3p2d-fp8-dep2-dep8-8k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 3 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + tensor-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "512" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml new file mode 100644 index 000000000..a90bfcf32 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml @@ -0,0 +1,80 @@ +name: "minimax-m3-vllm-disagg-b300-3p2d-fp8-dep2-tep8-8k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 3 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "32" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml new file mode 100644 index 000000000..8feb79b71 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml @@ -0,0 +1,81 @@ +name: "minimax-m3-vllm-disagg-b300-4p2d-fp8-dep2-dep8-8k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 4 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + tensor-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml new file mode 100644 index 000000000..5639c2eda --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml @@ -0,0 +1,81 @@ +name: "minimax-m3-vllm-disagg-b300-4p3d-fp8-dep2-dep4-8k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 4 + decode_workers: 3 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + tensor-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 512 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml new file mode 100644 index 000000000..1f517d815 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml @@ -0,0 +1,80 @@ +name: "minimax-m3-vllm-disagg-b300-5p2d-fp8-dep2-tep8-8k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 5 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x64" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tp8-marlin-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tp8-marlin-8k1k.yaml new file mode 100644 index 000000000..cf1840dfe --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tp8-marlin-8k1k.yaml @@ -0,0 +1,81 @@ +name: "minimax-m3-vllm-disagg-b300-5p2d-fp8-dep2-tp8-marlin-8k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 5 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + tensor-parallel-size: 8 + enable-expert-parallel: false + moe-backend: marlin + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x4x8x16" + req_rate: "inf" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 363ab062c..30dc99c25 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3891,6 +3891,7 @@ - config-keys: - minimaxm3-fp8-b300-dynamo-vllm description: - - "Add MiniMax-M3 MXFP8 B300 disaggregated vLLM benchmarks via Dynamo for 1k1k STP." - - "Add six local srt-slurm recipes under benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k and wire the B300 launcher to overlay them into srt-slurm." + - "Add MiniMax-M3 MXFP8 B300 disaggregated vLLM benchmarks via Dynamo for 1k1k and 8k1k STP." + - "Add local srt-slurm recipes under benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8 and wire the B300 launcher to overlay them into srt-slurm." + - "Add TP8 decode variants with expert parallelism disabled and the Marlin MoE backend for selected low-concurrency 1k1k and 8k1k shapes." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1787 From 7dbec24d0a08688409052288ae7d937d66fad59c Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Wed, 17 Jun 2026 22:00:07 -0700 Subject: [PATCH 4/5] [NV][MiniMax-M3]point to sa-submission-q2-2026 branch instead (#1822) --- runners/launch_b300-nv.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index c32516c9e..f2a83e4b3 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -85,7 +85,7 @@ elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm2.5" && $PRECIS elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm3" && $PRECISION == "fp8" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" || exit 1 - git checkout main + git checkout sa-submission-q2-2026 mkdir -p recipes/vllm/minimax-m3 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3" recipes/vllm/minimax-m3 else From 8abe295684115d25208c1635fbc6cac23b971584 Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Wed, 17 Jun 2026 22:40:09 -0700 Subject: [PATCH 5/5] [NV][MiniMax-M3]fix UCX_ settings to fix nixl handshake failure (#1823) * point to sa-submission-q2-2026 branch instead * fix UCX_* settings to fix nixl handshake failure --- .../vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tep8-1k1k.yaml | 6 ++---- .../minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp8-marlin-1k1k.yaml | 6 ++---- .../vllm/minimax-m3/b300-fp8/1k1k/1p2d-dep2-dep4-1k1k.yaml | 6 ++---- .../vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-dep8-1k1k.yaml | 6 ++---- .../vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-tep8-1k1k.yaml | 6 ++---- .../vllm/minimax-m3/b300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml | 6 ++---- .../vllm/minimax-m3/b300-fp8/1k1k/3p2d-dep2-tep8-1k1k.yaml | 6 ++---- .../vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml | 6 ++---- .../vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml | 6 ++---- .../vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml | 6 ++---- .../vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml | 6 ++---- .../vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml | 6 ++---- .../vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml | 6 ++---- .../vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml | 6 ++---- .../vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml | 6 ++---- .../minimax-m3/b300-fp8/8k1k/5p2d-dep2-tp8-marlin-8k1k.yaml | 6 ++---- 16 files changed, 32 insertions(+), 64 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tep8-1k1k.yaml index e1c908228..e76827af3 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tep8-1k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tep8-1k1k.yaml @@ -28,13 +28,11 @@ backend: connector: null prefill_environment: - UCX_NET_DEVICES: "all" - UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + UCX_TLS: "cuda_copy,rc" VLLM_FLOAT32_MATMUL_PRECISION: "high" decode_environment: - UCX_NET_DEVICES: "all" - UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + UCX_TLS: "cuda_copy,rc" VLLM_FLOAT32_MATMUL_PRECISION: "high" vllm_config: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp8-marlin-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp8-marlin-1k1k.yaml index 452d748b4..c8362cb32 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp8-marlin-1k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp8-marlin-1k1k.yaml @@ -28,13 +28,11 @@ backend: connector: null prefill_environment: - UCX_NET_DEVICES: "all" - UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + UCX_TLS: "cuda_copy,rc" VLLM_FLOAT32_MATMUL_PRECISION: "high" decode_environment: - UCX_NET_DEVICES: "all" - UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + UCX_TLS: "cuda_copy,rc" VLLM_FLOAT32_MATMUL_PRECISION: "high" vllm_config: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p2d-dep2-dep4-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p2d-dep2-dep4-1k1k.yaml index 625927baf..349a125bb 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p2d-dep2-dep4-1k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p2d-dep2-dep4-1k1k.yaml @@ -28,13 +28,11 @@ backend: connector: null prefill_environment: - UCX_NET_DEVICES: "all" - UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + UCX_TLS: "cuda_copy,rc" VLLM_FLOAT32_MATMUL_PRECISION: "high" decode_environment: - UCX_NET_DEVICES: "all" - UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + UCX_TLS: "cuda_copy,rc" VLLM_FLOAT32_MATMUL_PRECISION: "high" vllm_config: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-dep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-dep8-1k1k.yaml index 951cdfc94..0f790c79b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-dep8-1k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-dep8-1k1k.yaml @@ -28,13 +28,11 @@ backend: connector: null prefill_environment: - UCX_NET_DEVICES: "all" - UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + UCX_TLS: "cuda_copy,rc" VLLM_FLOAT32_MATMUL_PRECISION: "high" decode_environment: - UCX_NET_DEVICES: "all" - UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + UCX_TLS: "cuda_copy,rc" VLLM_FLOAT32_MATMUL_PRECISION: "high" vllm_config: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-tep8-1k1k.yaml index 495fc9b78..1372ff29a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-tep8-1k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-tep8-1k1k.yaml @@ -28,13 +28,11 @@ backend: connector: null prefill_environment: - UCX_NET_DEVICES: "all" - UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + UCX_TLS: "cuda_copy,rc" VLLM_FLOAT32_MATMUL_PRECISION: "high" decode_environment: - UCX_NET_DEVICES: "all" - UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + UCX_TLS: "cuda_copy,rc" VLLM_FLOAT32_MATMUL_PRECISION: "high" vllm_config: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml index 58c533504..4447d971b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml @@ -28,13 +28,11 @@ backend: connector: null prefill_environment: - UCX_NET_DEVICES: "all" - UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + UCX_TLS: "cuda_copy,rc" VLLM_FLOAT32_MATMUL_PRECISION: "high" decode_environment: - UCX_NET_DEVICES: "all" - UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + UCX_TLS: "cuda_copy,rc" VLLM_FLOAT32_MATMUL_PRECISION: "high" vllm_config: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/3p2d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/3p2d-dep2-tep8-1k1k.yaml index 165072cc5..b03d644d2 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/3p2d-dep2-tep8-1k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/3p2d-dep2-tep8-1k1k.yaml @@ -28,13 +28,11 @@ backend: connector: null prefill_environment: - UCX_NET_DEVICES: "all" - UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + UCX_TLS: "cuda_copy,rc" VLLM_FLOAT32_MATMUL_PRECISION: "high" decode_environment: - UCX_NET_DEVICES: "all" - UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + UCX_TLS: "cuda_copy,rc" VLLM_FLOAT32_MATMUL_PRECISION: "high" vllm_config: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml index cb9256e25..890014563 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml @@ -29,13 +29,11 @@ backend: prefill_environment: VLLM_FLOAT32_MATMUL_PRECISION: high - UCX_NET_DEVICES: "all" - UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + UCX_TLS: "cuda_copy,rc" decode_environment: VLLM_FLOAT32_MATMUL_PRECISION: high - UCX_NET_DEVICES: "all" - UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + UCX_TLS: "cuda_copy,rc" vllm_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml index 7a805df7f..6d9ecc425 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml @@ -29,13 +29,11 @@ backend: prefill_environment: VLLM_FLOAT32_MATMUL_PRECISION: high - UCX_NET_DEVICES: "all" - UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + UCX_TLS: "cuda_copy,rc" decode_environment: VLLM_FLOAT32_MATMUL_PRECISION: high - UCX_NET_DEVICES: "all" - UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + UCX_TLS: "cuda_copy,rc" vllm_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml index 8dc4c5c06..90d816592 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml @@ -29,13 +29,11 @@ backend: prefill_environment: VLLM_FLOAT32_MATMUL_PRECISION: high - UCX_NET_DEVICES: "all" - UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + UCX_TLS: "cuda_copy,rc" decode_environment: VLLM_FLOAT32_MATMUL_PRECISION: high - UCX_NET_DEVICES: "all" - UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + UCX_TLS: "cuda_copy,rc" vllm_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml index 8ff86e9d9..84d580452 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml @@ -29,13 +29,11 @@ backend: prefill_environment: VLLM_FLOAT32_MATMUL_PRECISION: high - UCX_NET_DEVICES: "all" - UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + UCX_TLS: "cuda_copy,rc" decode_environment: VLLM_FLOAT32_MATMUL_PRECISION: high - UCX_NET_DEVICES: "all" - UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + UCX_TLS: "cuda_copy,rc" vllm_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml index a90bfcf32..f272b21bc 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml @@ -29,13 +29,11 @@ backend: prefill_environment: VLLM_FLOAT32_MATMUL_PRECISION: high - UCX_NET_DEVICES: "all" - UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + UCX_TLS: "cuda_copy,rc" decode_environment: VLLM_FLOAT32_MATMUL_PRECISION: high - UCX_NET_DEVICES: "all" - UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + UCX_TLS: "cuda_copy,rc" vllm_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml index 8feb79b71..b087b0926 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml @@ -29,13 +29,11 @@ backend: prefill_environment: VLLM_FLOAT32_MATMUL_PRECISION: high - UCX_NET_DEVICES: "all" - UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + UCX_TLS: "cuda_copy,rc" decode_environment: VLLM_FLOAT32_MATMUL_PRECISION: high - UCX_NET_DEVICES: "all" - UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + UCX_TLS: "cuda_copy,rc" vllm_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml index 5639c2eda..94c36243e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml @@ -29,13 +29,11 @@ backend: prefill_environment: VLLM_FLOAT32_MATMUL_PRECISION: high - UCX_NET_DEVICES: "all" - UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + UCX_TLS: "cuda_copy,rc" decode_environment: VLLM_FLOAT32_MATMUL_PRECISION: high - UCX_NET_DEVICES: "all" - UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + UCX_TLS: "cuda_copy,rc" vllm_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml index 1f517d815..94f546ec2 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml @@ -29,13 +29,11 @@ backend: prefill_environment: VLLM_FLOAT32_MATMUL_PRECISION: high - UCX_NET_DEVICES: "all" - UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + UCX_TLS: "cuda_copy,rc" decode_environment: VLLM_FLOAT32_MATMUL_PRECISION: high - UCX_NET_DEVICES: "all" - UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + UCX_TLS: "cuda_copy,rc" vllm_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tp8-marlin-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tp8-marlin-8k1k.yaml index cf1840dfe..e77e77600 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tp8-marlin-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tp8-marlin-8k1k.yaml @@ -29,13 +29,11 @@ backend: prefill_environment: VLLM_FLOAT32_MATMUL_PRECISION: high - UCX_NET_DEVICES: "all" - UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + UCX_TLS: "cuda_copy,rc" decode_environment: VLLM_FLOAT32_MATMUL_PRECISION: high - UCX_NET_DEVICES: "all" - UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + UCX_TLS: "cuda_copy,rc" vllm_config: prefill: