From fa2ebe5a2c6a611ee84bf71f68c77bb6eb8ccabb Mon Sep 17 00:00:00 2001 From: Xin Li Date: Sun, 14 Jun 2026 21:33:25 -0400 Subject: [PATCH 1/8] dsr1-fp8-gb300-dynamo-trt: pin image to tensorrtllm-runtime:1.3.0-dev.1-cuda13 --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 5f7d16d60..8574f755f 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -6587,7 +6587,7 @@ dsr1-fp4-gb300-dynamo-sglang: dp-attn: true dsr1-fp8-gb300-dynamo-trt: - image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2 + image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.3.0-dev.1-cuda13 model: deepseek-ai/DeepSeek-R1-0528 model-prefix: dsr1 runner: gb300 From f6f41361eb389fd0119ca2c842caec26b3c4a851 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Sun, 14 Jun 2026 21:36:08 -0400 Subject: [PATCH 2/8] dsr1-fp8-gb300-dynamo-trt: pin image to tensorrtllm-runtime:1.3.0-dev.1-cuda13, fix gsm8k accuracy --- perf-changelog.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 86254474d..707c04b00 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3814,3 +3814,9 @@ description: - "Extend MiniMax-M3 MXFP8 H100/H200 non-MTP sweeps to concurrency 1 on the latency rows (H100: TP8; H200: TP4 and TP8) and add full TEP coverage from conc 1 to 256 (H100: TP8+EP8; H200: TP4+EP4 and TP8+EP8, incl. a new TP4+EP4 row for 8k1k). H200 TP8+EP8 upper bound moves 512->256 (high concurrency stays covered by the TP8+EP8 dp-attn DEP rows). 
DEP rows unchanged" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1761 + +- config-keys: + - dsr1-fp8-gb300-dynamo-trt + description: + - "Fix gsm8k accuracy at 88% instead of 95% for a single point" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/ From 4c442b64964db8cffd5acab099977f608cf96f2a Mon Sep 17 00:00:00 2001 From: Xin Li Date: Sun, 14 Jun 2026 23:58:19 -0400 Subject: [PATCH 3/8] perf changelog update --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 707c04b00..7b8b281f9 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3819,4 +3819,4 @@ - dsr1-fp8-gb300-dynamo-trt description: - "Fix gsm8k accuracy at 88% instead of 95% for a single point" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/ + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1767 From d3b807ccdf6ae0d0295fcf04e6facd2a209fc799 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Mon, 15 Jun 2026 00:07:47 -0400 Subject: [PATCH 4/8] change runner --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 8574f755f..857972476 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -6590,7 +6590,7 @@ dsr1-fp8-gb300-dynamo-trt: image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.3.0-dev.1-cuda13 model: deepseek-ai/DeepSeek-R1-0528 model-prefix: dsr1 - runner: gb300 + runner: gb300-nv precision: fp8 framework: dynamo-trt multinode: true From 3a9d26c63306dd02b7f0d5854646cb69600dedbb Mon Sep 17 00:00:00 2001 From: Xin Li Date: Mon, 15 Jun 2026 22:37:34 -0400 Subject: [PATCH 5/8] perf change log --- perf-changelog.yaml | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index f647d3b8f..fdb9b5f94 100644 --- 
a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3841,18 +3841,6 @@ - "Switch fixed-seq-len search space from TP8 to TP4 for both isl=1024 and isl=8192 scenarios" - "Expand isl=8192 coverage: add TP4 dp-attn sweep (conc 32–2048) and TP4 TP-only sweep (conc 1–32)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1762 - -- config-keys: - - dsv4-fp4-gb300-dynamo-trt - - dsv4-fp4-gb300-dynamo-trt-mtp - description: - - "Add DeepSeek-V4-Pro MXFP4 GB300 disaggregated TRT-LLM benchmarks via Dynamo (27 STP + 27 MTP configs)" - - "New configs: dsv4-fp4-gb300-dynamo-trt (STP) and dsv4-fp4-gb300-dynamo-trt-mtp (MTP)" - - "Covers ISL 1024/OSL 1024 (14 STP + 14 MTP) and ISL 8192/OSL 1024 (13 STP + 13 MTP)" - - "Container: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:1.3.0-deepseek-v4-dev.1" - - "Recipes sourced from NVIDIA/srt-slurm branch sa-submission-q2-2026" - - "Runner script updated to support dsv4 model prefix with dynamo-trt framework on GB300" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1689 - config-keys: - minimaxm3-fp8-b200-vllm @@ -3867,3 +3855,16 @@ - "Align MiniMax-M3 B200 vLLM fixed-sequence serving with MiniMax-M2.5 FP8 B200 settings by setting VLLM_FLOAT32_MATMUL_PRECISION=high and restoring max cudagraph capture size 2048." - "Add TP4+EP4 coverage for MiniMax-M3 B200: DP-attention rows for 1k1k/8k1k and the missing non-DP-attention row for 8k1k." 
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1779 + +- config-keys: + - dsv4-fp4-gb300-dynamo-trt + - dsv4-fp4-gb300-dynamo-trt-mtp + description: + - "Add DeepSeek-V4-Pro MXFP4 GB300 disaggregated TRT-LLM benchmarks via Dynamo (27 STP + 27 MTP configs)" + - "New configs: dsv4-fp4-gb300-dynamo-trt (STP) and dsv4-fp4-gb300-dynamo-trt-mtp (MTP)" + - "Covers ISL 1024/OSL 1024 (14 STP + 14 MTP) and ISL 8192/OSL 1024 (13 STP + 13 MTP)" + - "Container: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:1.3.0-deepseek-v4-dev.1" + - "Recipes sourced from NVIDIA/srt-slurm branch sa-submission-q2-2026" + - "Runner script updated to support dsv4 model prefix with dynamo-trt framework on GB300" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1689 + From 7c4c6454911f5465ddc9e426802fb4c35b5ac5ca Mon Sep 17 00:00:00 2001 From: Xin Li Date: Mon, 15 Jun 2026 22:39:12 -0400 Subject: [PATCH 6/8] fix perf change log --- perf-changelog.yaml | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index fdb9b5f94..9cec83a85 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3832,15 +3832,23 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1761 - config-keys: - - dsr1-fp8-gb300-dynamo-trt - description: - - "Fix gsm8k accuracy at 88% instead of 95% for a single point" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1767 - dsv4-fp4-mi355x-sglang description: - "Switch fixed-seq-len search space from TP8 to TP4 for both isl=1024 and isl=8192 scenarios" - "Expand isl=8192 coverage: add TP4 dp-attn sweep (conc 32–2048) and TP4 TP-only sweep (conc 1–32)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1762 + +- config-keys: + - dsv4-fp4-gb300-dynamo-trt + - dsv4-fp4-gb300-dynamo-trt-mtp + description: + - "Add DeepSeek-V4-Pro MXFP4 GB300 disaggregated TRT-LLM benchmarks via Dynamo (27 STP + 27 MTP configs)" + - "New configs: 
dsv4-fp4-gb300-dynamo-trt (STP) and dsv4-fp4-gb300-dynamo-trt-mtp (MTP)" + - "Covers ISL 1024/OSL 1024 (14 STP + 14 MTP) and ISL 8192/OSL 1024 (13 STP + 13 MTP)" + - "Container: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:1.3.0-deepseek-v4-dev.1" + - "Recipes sourced from NVIDIA/srt-slurm branch sa-submission-q2-2026" + - "Runner script updated to support dsv4 model prefix with dynamo-trt framework on GB300" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1689 - config-keys: - minimaxm3-fp8-b200-vllm @@ -3857,14 +3865,7 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1779 - config-keys: - - dsv4-fp4-gb300-dynamo-trt - - dsv4-fp4-gb300-dynamo-trt-mtp + - dsr1-fp8-gb300-dynamo-trt description: - - "Add DeepSeek-V4-Pro MXFP4 GB300 disaggregated TRT-LLM benchmarks via Dynamo (27 STP + 27 MTP configs)" - - "New configs: dsv4-fp4-gb300-dynamo-trt (STP) and dsv4-fp4-gb300-dynamo-trt-mtp (MTP)" - - "Covers ISL 1024/OSL 1024 (14 STP + 14 MTP) and ISL 8192/OSL 1024 (13 STP + 13 MTP)" - - "Container: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:1.3.0-deepseek-v4-dev.1" - - "Recipes sourced from NVIDIA/srt-slurm branch sa-submission-q2-2026" - - "Runner script updated to support dsv4 model prefix with dynamo-trt framework on GB300" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1689 - + - "Fix gsm8k accuracy at 88% instead of 95% for a single point" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1767 From 728eb321dd4b1decd81b2d460cb39aa369a0c9c8 Mon Sep 17 00:00:00 2001 From: Xin Li <119016172+xinli-sw@users.noreply.github.com> Date: Tue, 16 Jun 2026 01:06:06 -0400 Subject: [PATCH 7/8] Enhance gsm8k accuracy fix description in changelog Updated description for gsm8k accuracy fix to include config updates. 
--- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 9cec83a85..33fd0451b 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3867,5 +3867,5 @@ - config-keys: - dsr1-fp8-gb300-dynamo-trt description: - - "Fix gsm8k accuracy at 88% instead of 95% for a single point" + - "Fix gsm8k accuracy at 88% instead of 95% for a single point, also update all configs for DSR1 TRTLLM FP8 to reflect latest released image usage" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1767 From f867bd42585cca316b31a058c9276fad6bdbab00 Mon Sep 17 00:00:00 2001 From: Xin Li <119016172+xinli-sw@users.noreply.github.com> Date: Wed, 17 Jun 2026 00:02:52 -0400 Subject: [PATCH 8/8] Refine accuracy fix description for DSR1 TRTLLM Updated description for gsm8k accuracy fix and config updates. --- perf-changelog.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 33fd0451b..208d37dea 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3867,5 +3867,8 @@ - config-keys: - dsr1-fp8-gb300-dynamo-trt description: - - "Fix gsm8k accuracy at 88% instead of 95% for a single point, also update all configs for DSR1 TRTLLM FP8 to reflect latest released image usage" + - "Fix gsm8k accuracy at 88% instead of 95% for a single point." + - "In the previous submission, there was a numeric issue causing accuracy degradation and a performance anomaly at some MTP points at certain concurrencies." + - "This issue is now fixed in the latest TRTLLM release." + - "Also update all configs for DSR1 TRTLLM FP8 to reflect the latest released image usage." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1767