diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index d86a23541..521ba6636 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2390,6 +2390,306 @@ glm5-fp4-b300-sglang-mtp: - { tp: 8, ep: 1, conc-start: 4, conc-end: 4, spec-decoding: mtp } - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } +glm5-fp4-gb300-dynamo-trt: + image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.3.0-dev.1-cuda13 + model: nvidia/GLM-5-NVFP4 + model-prefix: glm5 + runner: gb300-nv + precision: fp4 + framework: dynamo-trt + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # STP configurations + - conc-list: [ 4 ] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen4tep8_batch1_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen4tep8_batch1_eplb0_mtp0.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [ 5 ] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen5tep4_batch1_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen5tep4_batch1_eplb0_mtp0.yaml" + decode: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [ 24 ] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen4tep8_batch4_eplb0_mtp0.yaml + - 
"CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen4tep8_batch4_eplb0_mtp0.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [ 92 ] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen4tep8_batch16_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen4tep8_batch16_eplb0_mtp0.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [ 105 ] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen5tep4_batch16_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen5tep4_batch16_eplb0_mtp0.yaml" + decode: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [ 336 ] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen4tep8_batch64_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen4tep8_batch64_eplb0_mtp0.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [ 666 ] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen1dep32_batch16_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx1dep2_gen1dep32_batch16_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + 
dp-attn: true + - conc-list: [ 2253 ] + prefill: + num-worker: 2 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx2dep2_gen1dep16_batch128_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx2dep2_gen1dep16_batch128_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - conc-list: [ 2253 ] + prefill: + num-worker: 3 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx3dep2_gen1dep32_batch64_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx3dep2_gen1dep32_batch64_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - conc-list: [ 4301 ] + prefill: + num-worker: 3 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx3dep2_gen1dep16_batch256_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx3dep2_gen1dep16_batch256_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - conc-list: [ 8192 ] + prefill: + num-worker: 4 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep16_batch512_eplb256_mtp0.yaml + - "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL1K_OSL1K/STP/ctx4dep2_gen1dep16_batch512_eplb256_mtp0.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - isl: 8192 + osl: 1024 + search-space: + # STP configurations + - conc-list: [ 10 ] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + 
dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen5tep4_batch2_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen5tep4_batch2_eplb0_mtp0.yaml" + decode: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [ 25 ] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen5tep4_batch4_eplb0_mtp0.yaml" + decode: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [ 50 ] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen5tep4_batch8_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen5tep4_batch8_eplb0_mtp0.yaml" + decode: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [ 100 ] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen5tep4_batch16_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx1dep2_gen5tep4_batch16_eplb0_mtp0.yaml" + decode: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [ 308 ] + prefill: + num-worker: 3 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx3dep2_gen1dep32_batch8_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx3dep2_gen1dep32_batch8_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - conc-list: [ 615 ] + prefill: + num-worker: 6 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx6dep2_gen1dep32_batch16_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx6dep2_gen1dep32_batch16_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - conc-list: [ 1076 ] + prefill: + num-worker: 9 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx9dep2_gen1dep16_batch64_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx9dep2_gen1dep16_batch64_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - conc-list: [ 1229 ] + prefill: + num-worker: 11 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx11dep2_gen1dep32_batch32_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx11dep2_gen1dep32_batch32_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - conc-list: [ 2151 ] + prefill: + num-worker: 15 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx15dep2_gen1dep16_batch128_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb300_nvfp4/ISL8K_OSL1K/STP/ctx15dep2_gen1dep16_batch128_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + qwen3.5-fp8-b200-sglang-mtp: image: lmsysorg/sglang:v0.5.12-cu130 model: Qwen/Qwen3.5-397B-A17B-FP8 diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 1f88b47aa..dec0d280c 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3910,7 +3910,7 @@ - "Use the Marlin MoE backend for MiniMax-M3 B200/B300 TP-only vLLM configurations by adding --moe-backend marlin when expert parallelism is disabled." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1809 - - config-keys: +- config-keys: - dsr1-fp8-gb300-dynamo-trt description: - "Fix gsm8k accuracy at 88% instead of 95% for a single point." @@ -3919,3 +3919,11 @@ - "Also update all configs for DSR1 TRTLLM FP8 to reflect latest released image usage" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1767 +- config-keys: + - glm5-fp4-gb300-dynamo-trt + description: + - "Add GLM-5 NVFP4 GB300 disaggregated TRT-LLM (STP, non-MTP) benchmarks via Dynamo (20 STP configs: 11 for 1K/1K, 9 for 8K/1K)" + - "Container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.3.0-dev.1-cuda13" + - "Recipes sourced from NVIDIA/srt-slurm branch sa-submission-q2-2026 (gb300_nvfp4 STP recipes)" + - "Runner script launch_gb300-nv.sh: added dynamo-trt-specific glm5-fp4 case with SERVED_MODEL_NAME and SRT_SLURM_MODEL_PREFIX=nvidia/GLM-5-NVFP4" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1798 \ No newline at end of file diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh index d21c91d10..5629065f7 100644 --- a/runners/launch_gb300-nv.sh +++ b/runners/launch_gb300-nv.sh @@ -39,6 +39,10 @@ elif [[ $MODEL_PREFIX == "dsv4" && $PRECISION
== "fp4" ]]; then # symlink on the runner pod that points at the NFS copy. export MODEL_PATH=/scratch/models/DeepSeek-V4-Pro export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro" +elif [[ $MODEL_PREFIX == "glm5" && $PRECISION == "fp4" && $FRAMEWORK == "dynamo-trt" ]]; then + export SERVED_MODEL_NAME="glm-5-nvfp4" + export MODEL_PATH=/scratch/models/GLM-5-NVFP4 + export SRT_SLURM_MODEL_PREFIX="nvidia/GLM-5-NVFP4" elif [[ $MODEL_PREFIX == "glm5" && $PRECISION == "fp4" ]]; then export MODEL_PATH=/scratch/models/GLM-5-NVFP4 export SRT_SLURM_MODEL_PREFIX="glm-5-fp4"