diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index c8fe1ea97..9bc22043d 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -12363,6 +12363,210 @@ minimaxm2.5-fp8-b300-dynamo-vllm: ep: 4 dp-attn: true +# MiniMax-M3 B300 disaggregated vLLM recipes sourced from +# NVIDIA/srt-slurm#223 at 5caabe364e1ef531fab9926c75e32ae8927b1553. +# The upstream recipes use DEP2 prefill workers and a mix of TEP8, DEP8, +# and DEP4 decode workers across the submitted 1k1k/8k1k frontier. +minimaxm3-fp8-b300-dynamo-vllm: + image: vllm/vllm-openai:minimax-m3 + model: MiniMaxAI/MiniMax-M3-MXFP8 + model-prefix: minimaxm3 + runner: b300 + precision: fp8 + framework: dynamo-vllm + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - conc-list: [4, 16, 64, 128, 4096] + prefill: + num-worker: 1 + tp: 2 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tep8-1k1k.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [2048] + prefill: + num-worker: 1 + tp: 2 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/1p2d-dep2-dep4-1k1k.yaml" + decode: + num-worker: 2 + tp: 4 + ep: 1 + dp-attn: true + - conc-list: [512, 4096] + prefill: + num-worker: 2 + tp: 2 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-dep8-1k1k.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + - conc-list: [32] + prefill: + num-worker: 2 + tp: 2 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-tep8-1k1k.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [16] + prefill: + num-worker: 2 + tp: 2 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [4] + prefill: + num-worker: 3 + tp: 2 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/3p2d-dep2-tep8-1k1k.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + - isl: 8192 + osl: 1024 + search-space: + - conc-list: [128] + prefill: + num-worker: 1 + tp: 2 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: true + - conc-list: [256, 512] + prefill: + num-worker: 2 + tp: 2 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: true + - conc-list: [16] + prefill: + num-worker: 2 + tp: 2 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [512] + prefill: + num-worker: 3 + tp: 2 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: true + - conc-list: [32] + prefill: + num-worker: 3 + tp: 2 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [4096] + prefill: + num-worker: 4 + tp: 2 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: true + - conc-list: [4096] + prefill: + num-worker: 4 + tp: 2 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml" + decode: + num-worker: 3 + tp: 4 + ep: 1 + dp-attn: true + - conc-list: [4, 64] + prefill: + num-worker: 5 + tp: 2 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + minimaxm2.5-fp8-gb300-dynamo-vllm: image: vllm/vllm-openai:v0.20.1 model: MiniMaxAI/MiniMax-M2.5 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tep8-1k1k.yaml new file mode 100644 index 000000000..7f93d1aa1 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tep8-1k1k.yaml @@ -0,0 +1,80 @@ +name: "minimax-m3-vllm-disagg-b300-1p1d-fp8-dep2-tep8-1k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-num-seqs: 4096 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 8196 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4x16x64x128x4096" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p2d-dep2-dep4-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p2d-dep2-dep4-1k1k.yaml new file mode 100644 index 000000000..8dc458eab --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p2d-dep2-dep4-1k1k.yaml @@ -0,0 +1,81 @@ +name: "minimax-m3-vllm-disagg-b300-1p2d-fp8-dep2-dep4-1k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + tensor-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 8196 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "2048" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-dep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-dep8-1k1k.yaml new file mode 100644 index 000000000..afe1c3e49 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-dep8-1k1k.yaml @@ -0,0 +1,81 @@ +name: "minimax-m3-vllm-disagg-b300-2p1d-fp8-dep2-dep8-1k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + tensor-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-num-seqs: 4096 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 8196 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "512x4096" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-tep8-1k1k.yaml new file mode 100644 index 000000000..7a2d87740 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-tep8-1k1k.yaml @@ -0,0 +1,80 @@ +name: "minimax-m3-vllm-disagg-b300-2p1d-fp8-dep2-tep8-1k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-num-seqs: 4096 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 8196 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "32" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml new file mode 100644 index 000000000..33435b328 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml @@ -0,0 +1,80 @@ +name: "minimax-m3-vllm-disagg-b300-2p2d-fp8-dep2-tep8-1k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 2 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-num-seqs: 4096 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 8196 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "16" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/3p2d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/3p2d-dep2-tep8-1k1k.yaml new file mode 100644 index 000000000..ac98b9cc8 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/3p2d-dep2-tep8-1k1k.yaml @@ -0,0 +1,80 @@ +name: "minimax-m3-vllm-disagg-b300-3p2d-fp8-dep2-tep8-1k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 3 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-num-seqs: 4096 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 8196 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml new file mode 100644 index 000000000..cb9256e25 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml @@ -0,0 +1,81 @@ +name: "minimax-m3-vllm-disagg-b300-1p2d-fp8-dep2-dep8-8k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + tensor-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "128" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml new file mode 100644 index 000000000..7a805df7f --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml @@ -0,0 +1,81 @@ +name: "minimax-m3-vllm-disagg-b300-2p2d-fp8-dep2-dep8-8k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 2 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + tensor-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml new file mode 100644 index 000000000..8dc4c5c06 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml @@ -0,0 +1,80 @@ +name: "minimax-m3-vllm-disagg-b300-2p2d-fp8-dep2-tep8-8k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 2 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "16" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml new file mode 100644 index 000000000..8ff86e9d9 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml @@ -0,0 +1,81 @@ +name: "minimax-m3-vllm-disagg-b300-3p2d-fp8-dep2-dep8-8k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 3 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + tensor-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "512" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml new file mode 100644 index 000000000..a90bfcf32 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml @@ -0,0 +1,80 @@ +name: "minimax-m3-vllm-disagg-b300-3p2d-fp8-dep2-tep8-8k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 3 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "32" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml new file mode 100644 index 000000000..8feb79b71 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml @@ -0,0 +1,81 @@ +name: "minimax-m3-vllm-disagg-b300-4p2d-fp8-dep2-dep8-8k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 4 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + tensor-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml new file mode 100644 index 000000000..5639c2eda --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml @@ -0,0 +1,81 @@ +name: "minimax-m3-vllm-disagg-b300-4p3d-fp8-dep2-dep4-8k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 4 + decode_workers: 3 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + tensor-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 512 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml new file mode 100644 index 000000000..1f517d815 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml @@ -0,0 +1,80 @@ +name: "minimax-m3-vllm-disagg-b300-5p2d-fp8-dep2-tep8-8k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 5 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_NET_DEVICES: "all" + UCX_TLS: "rc,cuda_ipc,cuda_copy,sm,self,tcp" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x64" + req_rate: "inf" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 67a91d5d2..842a98517 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3887,3 +3887,12 @@ - "Align MiniMax-M3 B300 vLLM fixed-sequence serving with MiniMax-M2.5 FP8 settings by setting VLLM_FLOAT32_MATMUL_PRECISION=high and restoring max cudagraph capture size 2048." - "Add TP4+EP4 coverage for MiniMax-M3 B300: DP-attention rows for 1k1k/8k1k and the missing non-DP-attention row for 8k1k." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1781 + +- config-keys: + - minimaxm3-fp8-b300-dynamo-vllm + description: + - "Add MiniMax-M3 MXFP8 B300 Dynamo vLLM disaggregated 1k1k and 8k1k benchmark coverage" + - "Import fourteen STP recipes from NVIDIA/srt-slurm PR #223 at 5caabe364e1ef531fab9926c75e32ae8927b1553" + - "Cover DEP2 prefill with TEP8, DEP8, and DEP4 decode topologies across submitted concurrencies 4 through 4096" + - "Image: vllm/vllm-openai:minimax-m3; model: MiniMaxAI/MiniMax-M3-MXFP8" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1788 diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index a941860c0..f2a83e4b3 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -45,8 +45,11 @@ elif [[ $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp4" && $FRAMEWORK == " elif [[ $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp8" && $FRAMEWORK == "dynamo-vllm" ]]; then export MODEL_PATH="/data/models/MiniMax-M2.5" export SRT_SLURM_MODEL_PREFIX="minimax-m2.5-fp8" +elif [[ $MODEL_PREFIX == "minimaxm3" && $PRECISION == "fp8" && $FRAMEWORK == "dynamo-vllm" ]]; then + export MODEL_PATH="/data/models/MiniMax-M3-MXFP8" + export SRT_SLURM_MODEL_PREFIX="MiniMaxAI/MiniMax-M3-MXFP8" else - echo "Unsupported model: $MODEL_PREFIX-$PRECISION. Supported models are: dsr1-fp4, dsr1-fp8, dsv4-fp4 with dynamo-vllm, minimaxm2.5-fp4 with dynamo-vllm, minimaxm2.5-fp8 with dynamo-vllm" + echo "Unsupported model: $MODEL_PREFIX-$PRECISION. Supported models are: dsr1-fp4, dsr1-fp8, dsv4-fp4 with dynamo-vllm, minimaxm2.5-fp4 with dynamo-vllm, minimaxm2.5-fp8 with dynamo-vllm, minimaxm3-fp8 with dynamo-vllm" exit 1 fi @@ -79,6 +82,12 @@ elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm2.5" && $PRECIS git checkout main mkdir -p recipes/vllm/minimax-m2.5-fp8 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8" recipes/vllm/minimax-m2.5-fp8 +elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm3" && $PRECISION == "fp8" ]]; then + git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" + cd "$SRT_REPO_DIR" || exit 1 + git checkout sa-submission-q2-2026 + mkdir -p recipes/vllm/minimax-m3 + cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3" recipes/vllm/minimax-m3 else git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" || exit 1