diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index dd57a6e93..8bbc86f23 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -8891,7 +8891,7 @@ kimik2.5-fp4-gb300-dynamo-trt: dp-attn: true kimik2.5-fp4-gb200-dynamo-vllm: - image: vllm/vllm-openai:v0.18.0-cu130 + image: vllm/vllm-openai:v0.21.0 model: nvidia/Kimi-K2.5-NVFP4 model-prefix: kimik2.5 runner: gb200 @@ -8904,91 +8904,111 @@ kimik2.5-fp4-gb200-dynamo-vllm: - isl: 1024 osl: 1024 search-space: - - conc-list: [256, 512, 1024, 2048, 3072, 4096] + - conc-list: [4096, 12288] prefill: num-worker: 1 - tp: 4 + tp: 1 ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml - - "CONFIG_FILE=recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml" + - "CONFIG_FILE=recipes/vllm/kimi-k2.5-fp4/1k1k/disagg-gb200-1p1d-dep4-dep8.yaml" decode: num-worker: 1 - tp: 16 - ep: 16 + tp: 1 + ep: 8 dp-attn: true - - conc-list: [4, 8, 16, 32, 64, 128] + - conc-list: [4, 8, 32, 128] prefill: num-worker: 1 - tp: 4 + tp: 1 ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml - - "CONFIG_FILE=recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml" + - "CONFIG_FILE=recipes/vllm/kimi-k2.5-fp4/1k1k/disagg-gb200-1p4d-dep4-tp8.yaml" decode: num-worker: 4 - tp: 4 - ep: 4 + tp: 8 + ep: 1 dp-attn: false + - conc-list: [4096, 6144] + prefill: + num-worker: 1 + tp: 1 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/kimi-k2.5-fp4/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml" + decode: + num-worker: 1 + tp: 1 + ep: 16 + dp-attn: true - isl: 8192 osl: 1024 search-space: - - conc-list: [4, 8, 16, 32, 128] + - conc-list: [128] prefill: num-worker: 1 - tp: 4 + tp: 1 ep: 4 dp-attn: true additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml - - "CONFIG_FILE=recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml" + - "CONFIG_FILE=recipes/vllm/kimi-k2.5-fp4/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml" decode: num-worker: 4 tp: 4 ep: 4 dp-attn: false - - conc-list: [512, 1024] + - conc-list: [4, 8, 16, 32, 256] + prefill: + num-worker: 1 + tp: 1 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/kimi-k2.5-fp4/8k1k/disagg-gb200-1p4d-dep4-tp8.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 1 + dp-attn: false + - conc-list: [1024] prefill: num-worker: 3 - tp: 4 + tp: 1 ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-3p1d-dep4-dep16.yaml - - "CONFIG_FILE=recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-3p1d-dep4-dep16.yaml" + - "CONFIG_FILE=recipes/vllm/kimi-k2.5-fp4/8k1k/disagg-gb200-3p1d-dep4-dep16.yaml" decode: num-worker: 1 - tp: 16 + tp: 1 ep: 16 dp-attn: true - - conc-list: [2048] + - conc-list: [3072] prefill: - num-worker: 5 - tp: 4 + num-worker: 6 + tp: 1 ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml - - "CONFIG_FILE=recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml" + - "CONFIG_FILE=recipes/vllm/kimi-k2.5-fp4/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml" decode: num-worker: 1 - tp: 8 - ep: 8 + tp: 1 + ep: 16 dp-attn: true - - conc-list: [3072, 4096] + - conc-list: [6144] prefill: - num-worker: 6 - tp: 4 + num-worker: 8 + tp: 1 ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml - - "CONFIG_FILE=recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml" + - 
"CONFIG_FILE=recipes/vllm/kimi-k2.5-fp4/8k1k/disagg-gb200-8p1d-dep4-dep16.yaml" decode: num-worker: 1 - tp: 16 + tp: 1 ep: 16 dp-attn: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5-fp4/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5-fp4/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml new file mode 100644 index 000000000..0ee4f3e7b --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5-fp4/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml @@ -0,0 +1,100 @@ +name: "vllm-disagg-gb200-1p1d-dep4-dep16" + +model: + path: "kimi-k2.5-nvfp4" + container: "vllm/vllm-openai:v0.21.0" + precision: "fp4" + +dynamo: + version: 1.2.1 + install: true + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 4 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 4 + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + decode_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 3072 + max-num-seqs: 1024 + enforce-eager: true + compilation-config: '{"custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + max-num-batched-tokens: 16384 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + 
no-enable-prefix-caching: true + no-enable-chunked-prefill: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}' + all2all-backend: "flashinfer_nvlink_one_sided" + gpu-memory-utilization: 0.9 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 3072 + max-num-seqs: 256 + max-num-batched-tokens: 10240 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + async-scheduling: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + all2all-backend: "flashinfer_nvlink_one_sided" + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + gpu-memory-utilization: 0.9 + stream-interval: 128 + max-cudagraph-capture-size: 384 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4096x6144" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5-fp4/1k1k/disagg-gb200-1p1d-dep4-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5-fp4/1k1k/disagg-gb200-1p1d-dep4-dep8.yaml new file mode 100644 index 000000000..40683d1b8 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5-fp4/1k1k/disagg-gb200-1p1d-dep4-dep8.yaml @@ -0,0 +1,100 @@ +name: "vllm-disagg-gb200-1p1d-dep4-dep8" + +model: + path: "kimi-k2.5-nvfp4" + container: "vllm/vllm-openai:v0.21.0" + precision: "fp4" + +dynamo: + version: 1.2.1 + install: true + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + 
gpus_per_prefill: 4 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 4 + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + decode_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 3072 + max-num-seqs: 1024 + enforce-eager: true + compilation-config: '{"custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + max-num-batched-tokens: 16384 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}' + all2all-backend: "flashinfer_nvlink_one_sided" + gpu-memory-utilization: 0.93 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 3072 + max-num-seqs: 1024 + max-num-batched-tokens: 10240 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + async-scheduling: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + 
all2all-backend: "flashinfer_nvlink_one_sided" + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + gpu-memory-utilization: 0.9 + stream-interval: 128 + max-cudagraph-capture-size: 1024 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4096x12288" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5-fp4/1k1k/disagg-gb200-1p4d-dep4-tp8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5-fp4/1k1k/disagg-gb200-1p4d-dep4-tp8.yaml new file mode 100644 index 000000000..bda68b920 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5-fp4/1k1k/disagg-gb200-1p4d-dep4-tp8.yaml @@ -0,0 +1,96 @@ +name: "kimi-vllm-disagg-gb200-1p4d-dep4-tp8" + +model: + path: "kimi-k2.5-nvfp4" + container: "vllm/vllm-openai:v0.21.0" + precision: "fp4" + +dynamo: + version: 1.2.1 + install: true + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 8 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 4 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + decode_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "0" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 3072 + max-num-seqs: 1024 + enforce-eager: true + 
compilation-config: '{"custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + max-num-batched-tokens: 16384 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}' + all2all-backend: "flashinfer_nvlink_one_sided" + gpu-memory-utilization: 0.93 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 8 + pipeline-parallel-size: 1 + enable-expert-parallel: false + max-model-len: 3072 + max-num-seqs: 1024 + max-num-batched-tokens: 10240 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + async-scheduling: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + max-cudagraph-capture-size: 1024 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4x8x32x128" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5-fp4/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5-fp4/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml new file mode 100644 index 000000000..429945c05 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5-fp4/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml @@ -0,0 +1,96 @@ +name: "kimi-vllm-disagg-gb200-1p4d-dep4-tep4" + +model: + path: "kimi-k2.5-nvfp4" + container: "vllm/vllm-openai:v0.21.0" + precision: "fp4" + +dynamo: + version: 1.2.1 + 
install: true + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 4 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + decode_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 10240 + max-num-seqs: 64 + enforce-eager: true + compilation-config: '{"custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + max-num-batched-tokens: 16384 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}' + all2all-backend: "flashinfer_nvlink_one_sided" + gpu-memory-utilization: 0.93 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 10240 + max-num-seqs: 16 + max-num-batched-tokens: 10240 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + async-scheduling: 
true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + max-cudagraph-capture-size: 16 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "128" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5-fp4/8k1k/disagg-gb200-1p4d-dep4-tp8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5-fp4/8k1k/disagg-gb200-1p4d-dep4-tp8.yaml new file mode 100644 index 000000000..55165aeb3 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5-fp4/8k1k/disagg-gb200-1p4d-dep4-tp8.yaml @@ -0,0 +1,96 @@ +name: "kimi-vllm-disagg-gb200-1p4d-dep4-tp8" + +model: + path: "kimi-k2.5-nvfp4" + container: "vllm/vllm-openai:v0.21.0" + precision: "fp4" + +dynamo: + version: 1.2.1 + install: true + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 8 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 4 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + decode_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 10240 + max-num-seqs: 64 + enforce-eager: true + 
compilation-config: '{"custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + max-num-batched-tokens: 16384 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}' + all2all-backend: "flashinfer_nvlink_one_sided" + gpu-memory-utilization: 0.93 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 8 + pipeline-parallel-size: 1 + enable-expert-parallel: false + max-model-len: 10240 + max-num-seqs: 16 + max-num-batched-tokens: 10240 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + async-scheduling: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + max-cudagraph-capture-size: 16 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x8x16x32x256" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5-fp4/8k1k/disagg-gb200-3p1d-dep4-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5-fp4/8k1k/disagg-gb200-3p1d-dep4-dep16.yaml new file mode 100644 index 000000000..b4d1e8c82 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5-fp4/8k1k/disagg-gb200-3p1d-dep4-dep16.yaml @@ -0,0 +1,99 @@ +name: "kimi-vllm-disagg-gb200-3p1d-dep4-dep16" + +model: + path: "kimi-k2.5-nvfp4" + container: "vllm/vllm-openai:v0.21.0" + precision: "fp4" + +dynamo: + version: 1.2.1 + 
install: true + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 3 + decode_nodes: 4 + prefill_workers: 3 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + decode_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 10240 + max-num-seqs: 64 + enforce-eager: true + compilation-config: '{"custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + max-num-batched-tokens: 16384 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}' + all2all-backend: "flashinfer_nvlink_one_sided" + gpu-memory-utilization: 0.93 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 10240 + max-num-seqs: 256 + max-num-batched-tokens: 10240 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: 
true + no-enable-chunked-prefill: true + async-scheduling: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + all2all-backend: "flashinfer_nvlink_one_sided" + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + max-cudagraph-capture-size: 256 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1024" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5-fp4/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5-fp4/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml new file mode 100644 index 000000000..1f4de8df1 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5-fp4/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml @@ -0,0 +1,99 @@ +name: "kimi-vllm-disagg-gb200-6p1d-dep4-dep16" + +model: + path: "kimi-k2.5-nvfp4" + container: "vllm/vllm-openai:v0.21.0" + precision: "fp4" + +dynamo: + version: 1.2.1 + install: true + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 6 + decode_nodes: 4 + prefill_workers: 6 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + decode_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + 
data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 10240 + max-num-seqs: 64 + enforce-eager: true + compilation-config: '{"custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + max-num-batched-tokens: 16384 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}' + all2all-backend: "flashinfer_nvlink_one_sided" + gpu-memory-utilization: 0.93 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 10240 + max-num-seqs: 512 + max-num-batched-tokens: 10240 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + async-scheduling: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + all2all-backend: "flashinfer_nvlink_one_sided" + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + max-cudagraph-capture-size: 512 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "3072" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5-fp4/8k1k/disagg-gb200-8p1d-dep4-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5-fp4/8k1k/disagg-gb200-8p1d-dep4-dep16.yaml new file mode 100644 index 000000000..45a7214b2 --- /dev/null +++ 
b/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5-fp4/8k1k/disagg-gb200-8p1d-dep4-dep16.yaml @@ -0,0 +1,100 @@ +name: "kimi-vllm-disagg-gb200-8p1d-dep4-dep16" + +model: + path: "kimi-k2.5-nvfp4" + container: "vllm/vllm-openai:v0.21.0" + precision: "fp4" + +dynamo: + version: 1.2.1 + install: true + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 8 + decode_nodes: 4 + prefill_workers: 8 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + decode_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 10240 + max-num-seqs: 64 + enforce-eager: true + compilation-config: '{"custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + max-num-batched-tokens: 16384 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}' + all2all-backend: "flashinfer_nvlink_one_sided" + gpu-memory-utilization: 0.93 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: 
"fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 10240 + max-num-seqs: 256 + max-num-batched-tokens: 10240 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + async-scheduling: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + all2all-backend: "flashinfer_nvlink_one_sided" + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + max-cudagraph-capture-size: 512 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "6144" + req_rate: "inf" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 20a0baf95..eb47ba6ae 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4006,3 +4006,11 @@ - "STP-only: 14 configs for ISL1K/OSL1K (conc 8–8192), 11 configs for ISL8K/OSL1K (conc 4–2253)" - "Topology range: 1–8 prefill workers (TP4/EP4), decode 1d-dep8 to 1d-dep32 and 4d/5d-tep8/tep4" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1796 + +- config-keys: + - kimik2.5-fp4-gb200-dynamo-vllm + description: + - "Refresh GB200 dynamo-vllm disagg sweep: bump image to vllm/vllm-openai:v0.21.0, dynamo 1.2.1, switch to checked-in kimi-k2.5-fp4 recipes" + - "1k/1k: 1p1d-dep4-dep8 (conc 4096,12288), 1p4d-dep4-tp8 (conc 4-128), 1p1d-dep4-dep16 (conc 4096,6144)" + - "8k/1k: 1p4d-dep4-tep4 (conc 128), 1p4d-dep4-tp8 (conc 4-256), 3p1d-dep4-dep16 (conc 1024), 6p1d-dep4-dep16 (conc 3072), 8p1d-dep4-dep16 (conc 6144)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1862 diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 36c8af203..5eae86bb3 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -81,7 
+81,7 @@ NGINX_IMAGE="nginx:1.27.4" # squash dir on a path that's also visible to compute nodes. Falls # back to the legacy sa-shared path so other configs are untouched. SQUASH_DIR="/mnt/lustre01/users-public/sa-shared" -if [[ $MODEL_PREFIX == "minimaxm2.5" ]]; then - echo "=== cluster diagnostic (minimax sweep) ===" +if [[ $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "kimik2.5" ]]; then + echo "=== cluster diagnostic ($MODEL_PREFIX sweep) ===" echo "USER=$(id -un) UID=$(id -u) GID=$(id -g) GROUPS=$(id -Gn)" echo "HOME=$HOME" @@ -202,7 +202,7 @@ SRT_REPO_DIR="srt-slurm" # cross-mounted to compute nodes. Put the srt-slurm workspace and staged # InferenceX checkout on a writable shared-FS path that compute can see. # Per-run-unique paths avoid races between parallel sweep jobs. -if [[ $MODEL_PREFIX == "minimaxm2.5" ]]; then +if [[ $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "kimik2.5" ]]; then SHARED_BASE="" for cand in \ /mnt/lustre01/users-public/sa-shared/gha-runs \ @@ -269,6 +269,12 @@ elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm2.5" ]]; then echo "Unsupported minimaxm2.5 precision for GB200 dynamo-vllm: $PRECISION" >&2 exit 1 fi +elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "kimik2.5" && $PRECISION == "fp4" ]]; then + git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" || exit 1 + cd "$SRT_REPO_DIR" || exit 1 + git checkout "${SRT_SLURM_REF:-main}" || exit 1 # SRT_SLURM_REF: optional branch/tag/SHA to pin the srt-slurm checkout; defaults to main + mkdir -p recipes/vllm/kimi-k2.5-fp4 || exit 1 + cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5-fp4" recipes/vllm/kimi-k2.5-fp4 || exit 1 elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" @@ -292,7 +298,7 @@ source $HOME/.local/bin/env # under a head-node-only path, .venv/bin/python3 becomes a broken # symlink on compute. Pin the venv to /usr/bin/python3 — a system # path that exists at the same location on both head and compute. 
-if [[ $MODEL_PREFIX == "minimaxm2.5" && -x /usr/bin/python3 ]]; then +if [[ ($MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "kimik2.5") && -x /usr/bin/python3 ]]; then uv venv --seed --python /usr/bin/python3 else uv venv --seed @@ -312,7 +318,7 @@ SRTCTL_ROOT="${GITHUB_WORKSPACE}/srt-slurm" # Minimax on watchtower: SRT_REPO_DIR was moved to a shared-FS path # above so srtctl's outputs/ directory (which lives under # SRTCTL_ROOT) is visible to compute nodes. -if [[ $MODEL_PREFIX == "minimaxm2.5" ]]; then +if [[ $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "kimik2.5" ]]; then SRTCTL_ROOT="$SRT_REPO_DIR" fi echo "Creating srtslurm.yaml configuration..." @@ -354,7 +360,7 @@ export INFMAX_WORKSPACE="$GITHUB_WORKSPACE" # can't see. Stage the relevant subset to shared FS and repoint # INFMAX_WORKSPACE there. rsync excludes the srt-slurm clone (already # on shared FS) and .git (not needed in container) for speed. -if [[ $MODEL_PREFIX == "minimaxm2.5" ]]; then +if [[ $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "kimik2.5" ]]; then SHARED_INFMAX_WORKSPACE="${SHARED_BASE}/infmax-workspace-${RUN_KEY}" mkdir -p "$SHARED_INFMAX_WORKSPACE" || exit 1 rsync -a --delete \