Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 108 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9375,6 +9375,114 @@ dsv4-fp4-gb200-dynamo-sglang:
ep: 12
dp-attn: true

# Qwen3.5-397B-A17B (FP8) on GB200: multinode, disaggregated prefill/decode
# using the dynamo-sglang framework. Every search-space entry points at its
# recipe through the CONFIG_FILE item in prefill.additional-settings.
qwen3.5-fp8-gb200-dynamo-sglang:
  image: lmsysorg/sglang:nightly-dev-cu13-20260608-303757cc
  model: Qwen/Qwen3.5-397B-A17B-FP8
  model-prefix: qwen3.5
  runner: gb200
  precision: fp8
  framework: dynamo-sglang
  multinode: true
  disagg: true
  scenarios:
    fixed-seq-len:
      # ---------------- ISL 1024 / OSL 1024 ----------------
      - isl: 1024
        osl: 1024
        search-space:
          # STP 1P1D, tensor parallel only: one TP4 prefill worker and one
          # TP4 decode worker. 2 nodes total (1 prefill + 1 decode).
          - spec-decoding: "none"
            conc-list: [1, 2, 4, 8, 16, 32, 64]
            prefill:
              num-worker: 1
              tp: 4
              ep: 1
              dp-attn: false
              additional-settings:
                - "CONFIG_FILE=recipes/sglang/qwen3.5/gb200-fp8/1k1k/1p1d-tp4-tp4.yaml"
            decode:
              num-worker: 1
              tp: 4
              ep: 1
              dp-attn: false
          # Wide-EP 1P1D: one DEP4 prefill worker feeding one DEP16 decode
          # worker. 5 nodes total (1 prefill + 4 decode).
          - spec-decoding: "none"
            conc-list: [512, 1024, 2048]
            prefill:
              num-worker: 1
              tp: 4
              ep: 4
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/sglang/qwen3.5/gb200-fp8/1k1k/1p1d-dep4-dep16.yaml"
            decode:
              num-worker: 1
              tp: 16
              ep: 16
              dp-attn: true
          # Wide-EP 2P1D: two DEP4 prefill workers feeding one DEP16 decode
          # worker. 6 nodes total (2 prefill + 4 decode).
          - spec-decoding: "none"
            conc-list: [4096]
            prefill:
              num-worker: 2
              tp: 4
              ep: 4
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/sglang/qwen3.5/gb200-fp8/1k1k/2p1d-dep4-dep16.yaml"
            decode:
              num-worker: 1
              tp: 16
              ep: 16
              dp-attn: true
      # ---------------- ISL 8192 / OSL 1024 ----------------
      - isl: 8192
        osl: 1024
        search-space:
          # STP 1P1D, tensor parallel only: one TP4 prefill worker and one
          # TP4 decode worker. 2 nodes total (1 prefill + 1 decode).
          - spec-decoding: "none"
            conc-list: [1, 2, 4, 8, 16, 32, 64, 128]
            prefill:
              num-worker: 1
              tp: 4
              ep: 1
              dp-attn: false
              additional-settings:
                - "CONFIG_FILE=recipes/sglang/qwen3.5/gb200-fp8/8k1k/1p1d-tp4-tp4.yaml"
            decode:
              num-worker: 1
              tp: 4
              ep: 1
              dp-attn: false
          # Wide-EP 4P1D: four DEP4 prefill workers feeding one DEP16 decode
          # worker. 8 nodes total (4 prefill + 4 decode).
          - spec-decoding: "none"
            conc-list: [1024]
            prefill:
              num-worker: 4
              tp: 4
              ep: 4
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/sglang/qwen3.5/gb200-fp8/8k1k/4p1d-dep4-dep16.yaml"
            decode:
              num-worker: 1
              tp: 16
              ep: 16
              dp-attn: true
          # Wide-EP 8P1D: eight DEP4 prefill workers feeding one DEP16 decode
          # worker. 12 nodes total (8 prefill + 4 decode).
          - spec-decoding: "none"
            conc-list: [2048, 4096]
            prefill:
              num-worker: 8
              tp: 4
              ep: 4
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/sglang/qwen3.5/gb200-fp8/8k1k/8p1d-dep4-dep16.yaml"
            decode:
              num-worker: 1
              tp: 16
              ep: 16
              dp-attn: true

dsv4-fp4-b300-dynamo-vllm:
image: vllm/vllm-openai:v0.20.1
model: deepseek-ai/DeepSeek-V4-Pro
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
# Qwen3.5 FP8 on GB200, disaggregated 1P1D with expert parallelism:
#   prefill: 1 worker on 1 node  (TP4  / DP4  / EP4,  DP attention enabled)
#   decode:  1 worker on 4 nodes (TP16 / DP16 / EP16, DP attention enabled)
# Driven by sa-bench at ISL 1024 / OSL 1024.
name: "qwen3.5-1p1d-dep4-dep16"

setup_script: rebuild-deepep.sh

dynamo:
  hash: 46520ca59afe992fb5ef61b3197b2316f8df9b2b
  install: true

frontend:
  type: dynamo
  enable_multiple_frontends: true
  num_additional_frontends: 1
  nginx_container: nginx

model:
  path: "qwen3.5-fp8"
  container: "lmsysorg/sglang:nightly-dev-cu13-20260608-303757cc"
  precision: "fp8"

# 5 nodes x 4 GPUs: one node for prefill, four for decode.
resources:
  gpu_type: "gb200"
  gpus_per_node: 4
  prefill_nodes: 1
  decode_nodes: 4
  prefill_workers: 1
  decode_workers: 1

backend:
  type: sglang

  # Environment variables for the prefill worker. All values are strings.
  prefill_environment:
    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
    NO_COLOR: "1"
    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "3600"
    TORCH_NCCL_WATCHDOG_TIMEOUT_SEC: "3600"
    TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: "3600"
    PYTHONUNBUFFERED: "1"
    NCCL_MNNVL_ENABLE: "1"
    NCCL_CUMEM_ENABLE: "1"
    MC_FORCE_MNNVL: "1"
    SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache"
    FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache"
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
    SGLANG_HEALTH_STARTING_OK: "1"
    SGLANG_ENABLE_HEALTH_ENDPOINT_GENERATION: "0"

  # Environment variables for the decode worker. Compared with prefill it
  # additionally sets MC_TE_METRIC, SGLANG_DECODE_BOOTSTRAP_TIMEOUT,
  # SGLANG_HACK_SEQ_BOOTSTRAP_ROOM,
  # SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK and
  # SGLANG_HEALTH_CHECK_TIMEOUT.
  decode_environment:
    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
    NO_COLOR: "1"
    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "3600"
    TORCH_NCCL_WATCHDOG_TIMEOUT_SEC: "3600"
    TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: "3600"
    PYTHONUNBUFFERED: "1"
    NCCL_MNNVL_ENABLE: "1"
    NCCL_CUMEM_ENABLE: "1"
    MC_FORCE_MNNVL: "1"
    MC_TE_METRIC: "true"
    # NOTE(review): prefill points this at /configs/deepgemm-cache while
    # decode uses /tmp -- confirm the difference is intentional.
    SGLANG_DG_CACHE_DIR: "/tmp/deepgemm-cache"
    FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache"
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1"
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512"
    SGLANG_HEALTH_CHECK_TIMEOUT: "1800"
    SGLANG_HEALTH_STARTING_OK: "1"
    SGLANG_ENABLE_HEALTH_ENDPOINT_GENERATION: "0"

  # SGLang settings, given separately for each worker role.
  sglang_config:
    prefill:
      # Model and quantization
      served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8"
      model-path: "/model/"
      quantization: "fp8"
      kv-cache-dtype: "fp8_e4m3"
      trust-remote-code: true

      # Parallel layout: TP4 / DP4 / EP4 with DP attention
      tensor-parallel-size: 4
      data-parallel-size: 4
      expert-parallel-size: 4
      enable-dp-attention: true
      enable-dp-lm-head: true

      # Mamba state, disaggregation role, memory and kernel backends
      mamba-scheduler-strategy: "no_buffer"
      mamba-track-interval: 2048
      mamba-ssm-dtype: "bfloat16"
      disaggregation-mode: "prefill"
      disable-radix-cache: true
      disaggregation-bootstrap-port: 31000
      mem-fraction-static: 0.8
      chunked-prefill-size: 65536
      load-balance-method: "round_robin"
      watchdog-timeout: 1000000
      disable-cuda-graph: true
      log-level: "info"
      page-size: 64
      attention-backend: "trtllm_mha"
      moe-runner-backend: "flashinfer_trtllm"

    decode:
      # Model and quantization
      served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8"
      model-path: "/model/"
      trust-remote-code: true
      quantization: "fp8"
      kv-cache-dtype: "fp8_e4m3"

      # Parallel layout: TP16 / DP16 / EP16 with DP attention
      tensor-parallel-size: 16
      data-parallel-size: 16
      expert-parallel-size: 16
      enable-dp-attention: true
      enable-dp-lm-head: true
      prefill-round-robin-balance: true

      # Mamba state
      mamba-scheduler-strategy: "no_buffer"
      mamba-track-interval: 128
      mamba-ssm-dtype: "bfloat16"

      # Disaggregation role; bootstrap port is the same value as prefill
      disaggregation-mode: "decode"
      disable-radix-cache: true
      disaggregation-bootstrap-port: 31000

      # Capacity limits
      chunked-prefill-size: 4096
      context-length: 4096
      mem-fraction-static: 0.80
      max-mamba-cache-size: 2048
      max-running-requests: 2048
      cuda-graph-max-bs: 128
      watchdog-timeout: 1000000

      # Attention and MoE backends (DeepEP all-to-all in low_latency mode)
      page-size: 64
      attention-backend: "trtllm_mha"
      moe-runner-backend: "deep_gemm"
      moe-a2a-backend: "deepep"
      deepep-mode: "low_latency"
      ep-dispatch-algorithm: "static"
      eplb-algorithm: "deepseek"

      # Logging / streaming cadence
      decode-log-interval: 1
      stream-interval: 50

# sa-bench run. `concurrencies` is an "x"-separated list; it carries the same
# values as the conc-list of the nvidia-master.yaml entry that references this
# recipe -- presumably the two should be kept in sync.
benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  req_rate: "inf"
  random_range_ratio: 0.8
  concurrencies: "512x1024x2048"
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
# Qwen3.5 FP8 on GB200, disaggregated 1P1D with tensor parallelism only:
#   prefill: 1 worker on 1 node (TP4)
#   decode:  1 worker on 1 node (TP4)
# Driven by sa-bench at ISL 1024 / OSL 1024.
name: "qwen3.5-1p1d-tp4-tp4"

sbatch_directives:
  mem: "0"

dynamo:
  hash: 46520ca59afe992fb5ef61b3197b2316f8df9b2b
  install: true

frontend:
  type: dynamo
  enable_multiple_frontends: true
  num_additional_frontends: 1
  nginx_container: nginx

model:
  path: "qwen3.5-fp8"
  container: "lmsysorg/sglang:nightly-dev-cu13-20260608-303757cc"
  precision: "fp8"

# 2 nodes x 4 GPUs: one node per worker role.
resources:
  gpu_type: "gb200"
  gpus_per_node: 4
  prefill_nodes: 1
  decode_nodes: 1
  prefill_workers: 1
  decode_workers: 1

backend:
  type: sglang

  # Environment variables for the prefill worker. All values are strings.
  prefill_environment:
    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "3600"
    TORCH_NCCL_WATCHDOG_TIMEOUT_SEC: "3600"
    TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: "3600"
    PYTHONUNBUFFERED: "1"
    NCCL_MNNVL_ENABLE: "1"
    NCCL_CUMEM_ENABLE: "1"
    SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache"
    FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache"
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
    MC_FORCE_MNNVL: "1"

  # Environment variables for the decode worker. Beyond the prefill set it
  # adds SGLANG_DECODE_BOOTSTRAP_TIMEOUT, the SGLANG_HEALTH_* entries,
  # SGLANG_ENABLE_HEALTH_ENDPOINT_GENERATION, MC_TE_METRIC and
  # SGLANG_HACK_SEQ_BOOTSTRAP_ROOM.
  decode_environment:
    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "3600"
    TORCH_NCCL_WATCHDOG_TIMEOUT_SEC: "3600"
    TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: "3600"
    PYTHONUNBUFFERED: "1"
    NCCL_MNNVL_ENABLE: "1"
    NCCL_CUMEM_ENABLE: "1"
    SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache"
    FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache"
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
    SGLANG_HEALTH_CHECK_TIMEOUT: "3600"
    SGLANG_HEALTH_STARTING_OK: "1"
    SGLANG_ENABLE_HEALTH_ENDPOINT_GENERATION: "0"
    MC_FORCE_MNNVL: "1"
    MC_TE_METRIC: "true"
    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1"

  # SGLang settings per worker role. The two maps below are identical except
  # for disaggregation-mode.
  sglang_config:
    prefill:
      served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8"
      model-path: "/model/"
      quantization: "fp8"
      kv-cache-dtype: "fp8_e4m3"
      trust-remote-code: true
      attention-backend: "trtllm_mha"
      tensor-parallel-size: 4
      mamba-ssm-dtype: "bfloat16"
      moe-runner-backend: "flashinfer_trtllm"
      disable-radix-cache: true
      max-running-requests: 1024
      mem-fraction-static: 0.8
      chunked-prefill-size: 16384
      max-prefill-tokens: 16384
      context-length: 4096
      cuda-graph-max-bs: 1024
      decode-log-interval: 1
      stream-interval: 50
      disaggregation-mode: "prefill"

    decode:
      served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8"
      model-path: "/model/"
      quantization: "fp8"
      kv-cache-dtype: "fp8_e4m3"
      trust-remote-code: true
      attention-backend: "trtllm_mha"
      tensor-parallel-size: 4
      mamba-ssm-dtype: "bfloat16"
      moe-runner-backend: "flashinfer_trtllm"
      disable-radix-cache: true
      max-running-requests: 1024
      mem-fraction-static: 0.8
      chunked-prefill-size: 16384
      max-prefill-tokens: 16384
      context-length: 4096
      cuda-graph-max-bs: 1024
      decode-log-interval: 1
      stream-interval: 50
      disaggregation-mode: "decode"

# sa-bench run. `concurrencies` is an "x"-separated list; it carries the same
# values as the conc-list of the nvidia-master.yaml entry that references this
# recipe -- presumably the two should be kept in sync.
benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  req_rate: "inf"
  num_prompts_mult: 10
  num_warmup_mult: 1
  random_range_ratio: 0.8
  concurrencies: "1x2x4x8x16x32x64"
Loading