diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 8bbc86f23..c99f59634 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9375,6 +9375,114 @@ dsv4-fp4-gb200-dynamo-sglang: ep: 12 dp-attn: true +qwen3.5-fp8-gb200-dynamo-sglang: + image: lmsysorg/sglang:nightly-dev-cu13-20260608-303757cc + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + runner: gb200 + precision: fp8 + framework: dynamo-sglang + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # 1P1D STP: TP4 prefill + TP4 decode (pure tensor parallel). 2 nodes (1+1). + - spec-decoding: "none" + conc-list: [1, 2, 4, 8, 16, 32, 64] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/sglang/qwen3.5/gb200-fp8/1k1k/1p1d-tp4-tp4.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + # 1P1D wide-EP: prefill DEP4 + decode DEP16. 5 nodes (1+4). + - spec-decoding: "none" + conc-list: [512, 1024, 2048] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/qwen3.5/gb200-fp8/1k1k/1p1d-dep4-dep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # 2P1D wide-EP: 2 prefill DEP4 + decode DEP16. 6 nodes (2+4). + - spec-decoding: "none" + conc-list: [4096] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/qwen3.5/gb200-fp8/1k1k/2p1d-dep4-dep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - isl: 8192 + osl: 1024 + search-space: + # 1P1D STP: TP4 prefill + TP4 decode (pure tensor parallel). 2 nodes (1+1). 
+ - spec-decoding: "none" + conc-list: [1, 2, 4, 8, 16, 32, 64, 128] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/sglang/qwen3.5/gb200-fp8/8k1k/1p1d-tp4-tp4.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + # 4P1D wide-EP: 4 prefill DEP4 + decode DEP16. 8 nodes (4+4). + - spec-decoding: "none" + conc-list: [1024] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/qwen3.5/gb200-fp8/8k1k/4p1d-dep4-dep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # 8P1D wide-EP: 8 prefill DEP4 + decode DEP16. 12 nodes (8+4). + - spec-decoding: "none" + conc-list: [2048, 4096] + prefill: + num-worker: 8 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/qwen3.5/gb200-fp8/8k1k/8p1d-dep4-dep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + dsv4-fp4-b300-dynamo-vllm: image: vllm/vllm-openai:v0.20.1 model: deepseek-ai/DeepSeek-V4-Pro diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/qwen3.5/gb200-fp8/1k1k/1p1d-dep4-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/qwen3.5/gb200-fp8/1k1k/1p1d-dep4-dep16.yaml new file mode 100644 index 000000000..c110e1599 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/qwen3.5/gb200-fp8/1k1k/1p1d-dep4-dep16.yaml @@ -0,0 +1,155 @@ +name: "qwen3.5-1p1d-dep4-dep16" + +setup_script: rebuild-deepep.sh + +dynamo: + hash: 46520ca59afe992fb5ef61b3197b2316f8df9b2b + install: true + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 1 + nginx_container: nginx + +model: + path: "qwen3.5-fp8" + container: "lmsysorg/sglang:nightly-dev-cu13-20260608-303757cc" + precision: "fp8" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 4 + prefill_workers: 1 + decode_workers: 1 + +backend: + type: sglang + + prefill_environment: + 
SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NO_COLOR: "1" + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "3600" + TORCH_NCCL_WATCHDOG_TIMEOUT_SEC: "3600" + TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: "3600" + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + MC_FORCE_MNNVL: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_HEALTH_STARTING_OK: "1" + SGLANG_ENABLE_HEALTH_ENDPOINT_GENERATION: "0" + + decode_environment: + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NO_COLOR: "1" + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "3600" + TORCH_NCCL_WATCHDOG_TIMEOUT_SEC: "3600" + TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: "3600" + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + MC_FORCE_MNNVL: "1" + MC_TE_METRIC: "true" + SGLANG_DG_CACHE_DIR: "/tmp/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512" + SGLANG_HEALTH_CHECK_TIMEOUT: "1800" + SGLANG_HEALTH_STARTING_OK: "1" + SGLANG_ENABLE_HEALTH_ENDPOINT_GENERATION: "0" + + sglang_config: + prefill: + served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" + model-path: "/model/" + quantization: "fp8" + kv-cache-dtype: "fp8_e4m3" + trust-remote-code: true + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + 
enable-dp-attention: true
+      enable-dp-lm-head: true
+
+      mamba-scheduler-strategy: "no_buffer"
+      mamba-track-interval: 2048
+      mamba-ssm-dtype: "bfloat16"
+      disaggregation-mode: "prefill"
+      disable-radix-cache: true
+      disaggregation-bootstrap-port: 31000
+      mem-fraction-static: 0.8
+      chunked-prefill-size: 65536
+      load-balance-method: "round_robin"
+      watchdog-timeout: 1000000
+      disable-cuda-graph: true
+      log-level: "info"
+      page-size: 64
+      attention-backend: "trtllm_mha"
+      moe-runner-backend: "flashinfer_trtllm"
+
+    decode:  # decode worker: TP16 / DP16 / EP16 with DP attention (see keys below)
+      served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8"
+      model-path: "/model/"
+      trust-remote-code: true
+      quantization: "fp8"
+      kv-cache-dtype: "fp8_e4m3"
+
+      tensor-parallel-size: 16
+      data-parallel-size: 16
+      expert-parallel-size: 16
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      prefill-round-robin-balance: true
+
+      mamba-scheduler-strategy: "no_buffer"
+      mamba-track-interval: 128
+      mamba-ssm-dtype: "bfloat16"
+
+      disaggregation-mode: "decode"
+      disable-radix-cache: true
+      disaggregation-bootstrap-port: 31000
+
+      chunked-prefill-size: 4096
+      context-length: 4096
+      mem-fraction-static: 0.80
+      max-mamba-cache-size: 2048
+      max-running-requests: 2048
+      cuda-graph-max-bs: 128
+      watchdog-timeout: 1000000
+
+      page-size: 64
+      attention-backend: "trtllm_mha"
+      moe-runner-backend: "deep_gemm"
+      moe-a2a-backend: "deepep"
+      deepep-mode: "low_latency"
+      ep-dispatch-algorithm: "static"
+      eplb-algorithm: "deepseek"
+
+      decode-log-interval: 1
+      stream-interval: 50
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  req_rate: "inf"
+  random_range_ratio: 0.8
+  concurrencies: "512x1024x2048"  # same points as conc-list [512, 1024, 2048] for this recipe in nvidia-master.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/qwen3.5/gb200-fp8/1k1k/1p1d-tp4-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/qwen3.5/gb200-fp8/1k1k/1p1d-tp4-tp4.yaml
new file mode 100644
index 000000000..143f339d0
--- /dev/null
+++ 
b/benchmarks/multi_node/srt-slurm-recipes/sglang/qwen3.5/gb200-fp8/1k1k/1p1d-tp4-tp4.yaml @@ -0,0 +1,123 @@ +name: "qwen3.5-1p1d-tp4-tp4" + +sbatch_directives: + mem: "0" + +dynamo: + hash: 46520ca59afe992fb5ef61b3197b2316f8df9b2b + install: true + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 1 + nginx_container: nginx + +model: + path: "qwen3.5-fp8" + container: "lmsysorg/sglang:nightly-dev-cu13-20260608-303757cc" + precision: "fp8" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + +backend: + type: sglang + + prefill_environment: + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "3600" + TORCH_NCCL_WATCHDOG_TIMEOUT_SEC: "3600" + TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: "3600" + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + + decode_environment: + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "3600" + TORCH_NCCL_WATCHDOG_TIMEOUT_SEC: "3600" + TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: "3600" + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_HEALTH_CHECK_TIMEOUT: "3600" + 
SGLANG_HEALTH_STARTING_OK: "1"
+    SGLANG_ENABLE_HEALTH_ENDPOINT_GENERATION: "0"
+    MC_FORCE_MNNVL: "1"
+    MC_TE_METRIC: "true"
+    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1"
+
+  sglang_config:
+    prefill:
+      served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8"
+      model-path: "/model/"
+      quantization: "fp8"
+      kv-cache-dtype: "fp8_e4m3"
+      trust-remote-code: true
+      attention-backend: "trtllm_mha"
+      tensor-parallel-size: 4
+      mamba-ssm-dtype: "bfloat16"
+      moe-runner-backend: "flashinfer_trtllm"
+      disable-radix-cache: true
+      max-running-requests: 1024
+      mem-fraction-static: 0.8
+      chunked-prefill-size: 16384
+      max-prefill-tokens: 16384
+      context-length: 4096
+      cuda-graph-max-bs: 1024
+      decode-log-interval: 1
+      stream-interval: 50
+      disaggregation-mode: "prefill"
+
+    decode:  # same settings as prefill above; only disaggregation-mode differs
+      served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8"
+      model-path: "/model/"
+      quantization: "fp8"
+      kv-cache-dtype: "fp8_e4m3"
+      trust-remote-code: true
+      attention-backend: "trtllm_mha"
+      tensor-parallel-size: 4
+      mamba-ssm-dtype: "bfloat16"
+      moe-runner-backend: "flashinfer_trtllm"
+      disable-radix-cache: true
+      max-running-requests: 1024
+      mem-fraction-static: 0.8
+      chunked-prefill-size: 16384
+      max-prefill-tokens: 16384
+      context-length: 4096
+      cuda-graph-max-bs: 1024
+      decode-log-interval: 1
+      stream-interval: 50
+      disaggregation-mode: "decode"
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  req_rate: "inf"
+  num_prompts_mult: 10
+  num_warmup_mult: 1
+  random_range_ratio: 0.8
+  concurrencies: "1x2x4x8x16x32x64"  # same points as conc-list [1, 2, 4, 8, 16, 32, 64] for this recipe in nvidia-master.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/qwen3.5/gb200-fp8/1k1k/2p1d-dep4-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/qwen3.5/gb200-fp8/1k1k/2p1d-dep4-dep16.yaml
new file mode 100644
index 000000000..7b4ae6a03
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/qwen3.5/gb200-fp8/1k1k/2p1d-dep4-dep16.yaml
@@ -0,0 +1,155 @@
+name: "qwen3.5-2p1d-dep4-dep16"
+
+setup_script: rebuild-deepep.sh
+
+dynamo:
+  hash: 
46520ca59afe992fb5ef61b3197b2316f8df9b2b + install: true + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 2 + nginx_container: nginx + +model: + path: "qwen3.5-fp8" + container: "lmsysorg/sglang:nightly-dev-cu13-20260608-303757cc" + precision: "fp8" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 4 + prefill_workers: 2 + decode_workers: 1 + +backend: + type: sglang + + prefill_environment: + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NO_COLOR: "1" + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "3600" + TORCH_NCCL_WATCHDOG_TIMEOUT_SEC: "3600" + TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: "3600" + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + MC_FORCE_MNNVL: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_HEALTH_STARTING_OK: "1" + SGLANG_ENABLE_HEALTH_ENDPOINT_GENERATION: "0" + + decode_environment: + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NO_COLOR: "1" + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "3600" + TORCH_NCCL_WATCHDOG_TIMEOUT_SEC: "3600" + TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: "3600" + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + MC_FORCE_MNNVL: "1" + MC_TE_METRIC: "true" + SGLANG_DG_CACHE_DIR: "/tmp/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + 
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512" + SGLANG_HEALTH_CHECK_TIMEOUT: "1800" + SGLANG_HEALTH_STARTING_OK: "1" + SGLANG_ENABLE_HEALTH_ENDPOINT_GENERATION: "0" + + sglang_config: + prefill: + served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" + model-path: "/model/" + quantization: "fp8" + kv-cache-dtype: "fp8_e4m3" + trust-remote-code: true + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + enable-dp-attention: true + enable-dp-lm-head: true + + mamba-scheduler-strategy: "no_buffer" + mamba-track-interval: 2048 + mamba-ssm-dtype: "bfloat16" + disaggregation-mode: "prefill" + disable-radix-cache: true + disaggregation-bootstrap-port: 31000 + mem-fraction-static: 0.8 + chunked-prefill-size: 65536 + load-balance-method: "round_robin" + watchdog-timeout: 1000000 + disable-cuda-graph: true + log-level: "info" + page-size: 64 + attention-backend: "trtllm_mha" + moe-runner-backend: "flashinfer_trtllm" + + decode: + served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" + model-path: "/model/" + trust-remote-code: true + quantization: "fp8" + kv-cache-dtype: "fp8_e4m3" + + tensor-parallel-size: 16 + data-parallel-size: 16 + expert-parallel-size: 16 + enable-dp-attention: true + enable-dp-lm-head: true + prefill-round-robin-balance: true + + mamba-scheduler-strategy: "no_buffer" + mamba-track-interval: 128 + mamba-ssm-dtype: "bfloat16" + + disaggregation-mode: "decode" + disable-radix-cache: true + disaggregation-bootstrap-port: 31000 + + chunked-prefill-size: 4096 + context-length: 8192 + mem-fraction-static: 0.75 + max-mamba-cache-size: 4096 + max-running-requests: 4096 + cuda-graph-max-bs: 256 + watchdog-timeout: 1000000 + + page-size: 64 + attention-backend: "trtllm_mha" + moe-runner-backend: "deep_gemm" + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + eplb-algorithm: "deepseek" + + decode-log-interval: 1 + stream-interval: 50 + +benchmark: + 
type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  req_rate: "inf"
+  random_range_ratio: 0.8
+  concurrencies: "4096"  # single point; same as conc-list [4096] for this recipe in nvidia-master.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/qwen3.5/gb200-fp8/8k1k/1p1d-tp4-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/qwen3.5/gb200-fp8/8k1k/1p1d-tp4-tp4.yaml
new file mode 100644
index 000000000..d869b247a
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/qwen3.5/gb200-fp8/8k1k/1p1d-tp4-tp4.yaml
@@ -0,0 +1,121 @@
+name: "qwen3.5-1p1d-tp4-tp4"  # NOTE(review): same name as 1k1k/1p1d-tp4-tp4.yaml; confirm recipe names need not be unique
+
+sbatch_directives:
+  mem: "0"
+
+dynamo:
+  hash: 46520ca59afe992fb5ef61b3197b2316f8df9b2b
+  install: true
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 1
+  nginx_container: nginx
+
+model:
+  path: "qwen3.5-fp8"
+  container: "lmsysorg/sglang:nightly-dev-cu13-20260608-303757cc"
+  precision: "fp8"
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  decode_nodes: 1
+  prefill_workers: 1
+  decode_workers: 1
+
+backend:
+  type: sglang
+
+  prefill_environment:
+    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "3600"
+    TORCH_NCCL_WATCHDOG_TIMEOUT_SEC: "3600"
+    TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: "3600"
+    PYTHONUNBUFFERED: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache"
+    FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    MC_FORCE_MNNVL: "1"
+
+  decode_environment:
+    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "3600"
+    TORCH_NCCL_WATCHDOG_TIMEOUT_SEC: "3600"
+    TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: "3600"
+    PYTHONUNBUFFERED: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache"
+    
FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_HEALTH_CHECK_TIMEOUT: "3600" + SGLANG_HEALTH_STARTING_OK: "1" + SGLANG_ENABLE_HEALTH_ENDPOINT_GENERATION: "0" + MC_FORCE_MNNVL: "1" + MC_TE_METRIC: "true" + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" + + sglang_config: + prefill: + served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" + model-path: "/model/" + quantization: "fp8" + kv-cache-dtype: "fp8_e4m3" + trust-remote-code: true + attention-backend: "trtllm_mha" + tensor-parallel-size: 4 + mamba-ssm-dtype: "bfloat16" + moe-runner-backend: "flashinfer_trtllm" + disable-radix-cache: true + max-running-requests: 1024 + mem-fraction-static: 0.8 + chunked-prefill-size: 16384 + max-prefill-tokens: 16384 + context-length: 16384 + cuda-graph-max-bs: 1024 + decode-log-interval: 1 + stream-interval: 50 + disaggregation-mode: "prefill" + + decode: + served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" + model-path: "/model/" + quantization: "fp8" + kv-cache-dtype: "fp8_e4m3" + trust-remote-code: true + attention-backend: "trtllm_mha" + tensor-parallel-size: 4 + mamba-ssm-dtype: "bfloat16" + moe-runner-backend: "flashinfer_trtllm" + disable-radix-cache: true + max-running-requests: 1024 + mem-fraction-static: 0.8 + chunked-prefill-size: 16384 + max-prefill-tokens: 16384 + context-length: 16384 + cuda-graph-max-bs: 1024 + decode-log-interval: 1 + stream-interval: 50 + disaggregation-mode: "decode" + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + req_rate: "inf" + random_range_ratio: 0.8 + concurrencies: "1x2x4x8x16x32x64x128" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/qwen3.5/gb200-fp8/8k1k/4p1d-dep4-dep16.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/sglang/qwen3.5/gb200-fp8/8k1k/4p1d-dep4-dep16.yaml new file mode 100644 index 000000000..0eb6c5881 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/qwen3.5/gb200-fp8/8k1k/4p1d-dep4-dep16.yaml @@ -0,0 +1,160 @@ +name: "qwen3.5-4p1d-dep4-dep16" + +setup_script: rebuild-deepep.sh + +sbatch_directives: + mem: "0" + +dynamo: + hash: 46520ca59afe992fb5ef61b3197b2316f8df9b2b + install: true + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 3 + nginx_container: nginx + +model: + path: "qwen3.5-fp8" + container: "lmsysorg/sglang:nightly-dev-cu13-20260608-303757cc" + precision: "fp8" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 4 + decode_nodes: 4 + prefill_workers: 4 + decode_workers: 1 + +backend: + type: sglang + + prefill_environment: + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NO_COLOR: "1" + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "3600" + TORCH_NCCL_WATCHDOG_TIMEOUT_SEC: "3600" + TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: "3600" + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + MC_FORCE_MNNVL: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_HEALTH_STARTING_OK: "1" + SGLANG_ENABLE_HEALTH_ENDPOINT_GENERATION: "0" + + decode_environment: + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NO_COLOR: "1" + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "3600" + TORCH_NCCL_WATCHDOG_TIMEOUT_SEC: "3600" + TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: "3600" + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + MC_FORCE_MNNVL: "1" + MC_TE_METRIC: "true" + SGLANG_DG_CACHE_DIR: "/tmp/deepgemm-cache" + 
FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512" + SGLANG_HEALTH_CHECK_TIMEOUT: "1800" + SGLANG_HEALTH_STARTING_OK: "1" + SGLANG_ENABLE_HEALTH_ENDPOINT_GENERATION: "0" + + sglang_config: + prefill: + served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" + model-path: "/model/" + quantization: "fp8" + kv-cache-dtype: "fp8_e4m3" + trust-remote-code: true + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + enable-dp-attention: true + enable-dp-lm-head: true + + mamba-scheduler-strategy: "no_buffer" + mamba-track-interval: 2048 + mamba-ssm-dtype: "bfloat16" + disaggregation-mode: "prefill" + disable-radix-cache: true + disaggregation-bootstrap-port: 31000 + mem-fraction-static: 0.8 + chunked-prefill-size: 65536 + load-balance-method: "round_robin" + watchdog-timeout: 1000000 + disable-cuda-graph: true + log-level: "info" + page-size: 64 + attention-backend: "trtllm_mha" + moe-runner-backend: "flashinfer_trtllm" + + decode: + served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" + model-path: "/model/" + trust-remote-code: true + quantization: "fp8" + kv-cache-dtype: "fp8_e4m3" + + tensor-parallel-size: 16 + data-parallel-size: 16 + expert-parallel-size: 16 + enable-dp-attention: true + enable-dp-lm-head: true + prefill-round-robin-balance: true + + mamba-scheduler-strategy: "no_buffer" + mamba-track-interval: 128 + mamba-ssm-dtype: "bfloat16" + + disaggregation-mode: "decode" + disable-radix-cache: true + disaggregation-bootstrap-port: 31000 + + chunked-prefill-size: 4096 + context-length: 16384 + mem-fraction-static: 0.80 + 
max-mamba-cache-size: 2048 + max-running-requests: 2048 + cuda-graph-max-bs: 128 + watchdog-timeout: 1000000 + + page-size: 64 + attention-backend: "trtllm_mha" + moe-runner-backend: "deep_gemm" + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + eplb-algorithm: "deepseek" + + decode-log-interval: 1 + stream-interval: 50 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + req_rate: "inf" + num_prompts_mult: 20 + num_warmup_mult: 2 + random_range_ratio: 0.8 + concurrencies: "1024" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/qwen3.5/gb200-fp8/8k1k/8p1d-dep4-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/qwen3.5/gb200-fp8/8k1k/8p1d-dep4-dep16.yaml new file mode 100644 index 000000000..58f2604a7 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/qwen3.5/gb200-fp8/8k1k/8p1d-dep4-dep16.yaml @@ -0,0 +1,161 @@ +name: "qwen3.5-8p1d-dep4-dep16" + +setup_script: rebuild-deepep.sh + +sbatch_directives: + mem: "0" + +infra: + etcd_nats_dedicated_node: true + +dynamo: + hash: 46520ca59afe992fb5ef61b3197b2316f8df9b2b + install: true + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 4 + nginx_container: nginx + +model: + path: "qwen3.5-fp8" + container: "lmsysorg/sglang:nightly-dev-cu13-20260608-303757cc" + precision: "fp8" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 8 + decode_nodes: 4 + prefill_workers: 8 + decode_workers: 1 + +backend: + type: sglang + + prefill_environment: + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NO_COLOR: "1" + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "3600" + TORCH_NCCL_WATCHDOG_TIMEOUT_SEC: "3600" + TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: "3600" + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + MC_FORCE_MNNVL: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + 
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_HEALTH_STARTING_OK: "1" + SGLANG_ENABLE_HEALTH_ENDPOINT_GENERATION: "0" + + decode_environment: + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NO_COLOR: "1" + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "3600" + TORCH_NCCL_WATCHDOG_TIMEOUT_SEC: "3600" + TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: "3600" + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + MC_FORCE_MNNVL: "1" + MC_TE_METRIC: "true" + SGLANG_DG_CACHE_DIR: "/tmp/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512" + SGLANG_HEALTH_CHECK_TIMEOUT: "1800" + SGLANG_HEALTH_STARTING_OK: "1" + SGLANG_ENABLE_HEALTH_ENDPOINT_GENERATION: "0" + + sglang_config: + prefill: + served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8" + model-path: "/model/" + quantization: "fp8" + kv-cache-dtype: "fp8_e4m3" + trust-remote-code: true + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + enable-dp-attention: true + enable-dp-lm-head: true + + mamba-scheduler-strategy: "no_buffer" + mamba-track-interval: 2048 + mamba-ssm-dtype: "bfloat16" + disaggregation-mode: "prefill" + disable-radix-cache: true + disaggregation-bootstrap-port: 31000 + mem-fraction-static: 0.8 + chunked-prefill-size: 65536 + load-balance-method: "round_robin" + watchdog-timeout: 1000000 + disable-cuda-graph: true + log-level: "info" + page-size: 64 + 
attention-backend: "trtllm_mha"
+      moe-runner-backend: "flashinfer_trtllm"
+
+    decode:  # decode worker: TP16 / DP16 / EP16 with DP attention (see keys below)
+      served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8"
+      model-path: "/model/"
+      trust-remote-code: true
+      quantization: "fp8"
+      kv-cache-dtype: "fp8_e4m3"
+
+      tensor-parallel-size: 16
+      data-parallel-size: 16
+      expert-parallel-size: 16
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      prefill-round-robin-balance: true
+
+      mamba-scheduler-strategy: "no_buffer"
+      mamba-track-interval: 128
+      mamba-ssm-dtype: "bfloat16"
+
+      disaggregation-mode: "decode"
+      disable-radix-cache: true
+      disaggregation-bootstrap-port: 31000
+
+      chunked-prefill-size: 4096
+      context-length: 16384
+      mem-fraction-static: 0.80
+      max-mamba-cache-size: 2048
+      max-running-requests: 2048
+      cuda-graph-max-bs: 128
+      watchdog-timeout: 1000000
+
+      page-size: 64
+      attention-backend: "trtllm_mha"
+      moe-runner-backend: "deep_gemm"
+      moe-a2a-backend: "deepep"
+      deepep-mode: "low_latency"
+      ep-dispatch-algorithm: "static"
+      eplb-algorithm: "deepseek"
+
+      decode-log-interval: 1
+      stream-interval: 50
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  req_rate: "inf"
+  random_range_ratio: 0.8
+  concurrencies: "2048x4096"  # NOTE(review): 4096 exceeds decode max-running-requests (2048) above; confirm this is intended
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 18a1203d5..12cb29600 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -4028,3 +4028,11 @@
     - "Recover the failed official ingest for PR #1796 from validated sweep run 27663808752 (attempt 2)"
     - "Artifact-only recovery: reuse 23 fixed-sequence rows and 2 eval rows without rerunning benchmarks"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1884
+
+- config-keys:
+    - qwen3.5-fp8-gb200-dynamo-sglang
+  description:
+    - "Add Qwen3.5-397B-A17B-FP8 GB200 disaggregated multinode SGLang benchmarks via Dynamo"
+    - "Image: lmsysorg/sglang:nightly-dev-cu13-20260608-303757cc"
+    - "6 topologies across 1k/1k and 8k/1k: 1P1D TP4 STP + wide-EP (DEP4 prefill / DEP16 decode) from 1P1D up to 8P1D, recipes under 
benchmarks/multi_node/srt-slurm-recipes/sglang/qwen3.5/gb200-fp8/"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1810
diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh
index 5eae86bb3..4017b1fd2 100755
--- a/runners/launch_gb200-nv.sh
+++ b/runners/launch_gb200-nv.sh
@@ -21,6 +21,9 @@ if [[ $FRAMEWORK == "dynamo-sglang" ]]; then
         # DSV4 sglang recipes.
         export MODEL_PATH="/mnt/lustre01/models/deepseek-v4-pro"
         export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro"
+    elif [[ $MODEL_PREFIX == "qwen3.5" && $PRECISION == "fp8" ]]; then
+        export MODEL_PATH="/mnt/lustre01/models/Qwen3.5-397B-A17B-FP8"
+        export SRT_SLURM_MODEL_PREFIX="qwen3.5-fp8"
     else
         export MODEL_PATH=$MODEL
     fi
@@ -255,6 +258,11 @@ elif [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "dsv4" ]]; then
     cd "$SRT_REPO_DIR"
     mkdir -p recipes/sglang/deepseek-v4
     cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4" recipes/sglang/deepseek-v4
+elif [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "qwen3.5" ]]; then
+    git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" || exit 1  # fail fast: the steps below assume the checkout exists
+    cd "$SRT_REPO_DIR" || exit 1  # otherwise mkdir/cp would run relative to the wrong directory
+    mkdir -p recipes/sglang/qwen3.5
+    cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/qwen3.5" recipes/sglang/qwen3.5
 elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm2.5" ]]; then
     git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" || exit 1
     cd "$SRT_REPO_DIR" || exit 1