From 778120f390f9837247720656c532a332aced127f Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Sun, 17 May 2026 19:46:03 -0400
Subject: [PATCH 1/7] [Klaud Cold] Update dsv4-fp8-h200-vllm (+mtp) vLLM image
 to v0.21.0

---
 .github/configs/nvidia-master.yaml | 4 ++--
 perf-changelog.yaml                | 7 +++++++
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 37dd5af3f..ec5cab79a 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2844,7 +2844,7 @@ dsr1-fp8-h200-sglang:
 # Uses the cu129 image. H200 has no FP4 path, so the FP4 indexer cache
 # flag is omitted. Max-model-len is pinned at 800k per the recipe.
 dsv4-fp8-h200-vllm:
-  image: vllm/vllm-openai:deepseekv4-cu129
+  image: vllm/vllm-openai:v0.21.0
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: h200
@@ -2868,7 +2868,7 @@ dsv4-fp8-h200-vllm:
 # (the non-MTP entry above is still on the deepseekv4-cu129 tag) and adds
 # --speculative-config '{"method":"mtp","num_speculative_tokens":2}'.
 dsv4-fp8-h200-vllm-mtp:
-  image: vllm/vllm-openai:v0.20.1@sha256:9eff9734a30b6713a8566217d36f8277630fd2d31cec7f0a0292835901a23aa4
+  image: vllm/vllm-openai:v0.21.0
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: h200
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 13a695453..f3d3257a8 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -2629,3 +2629,10 @@
   description:
     - "Update vLLM ROCm image from v0.18.0 to v0.21.0"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1404
+
+- config-keys:
+    - dsv4-fp8-h200-vllm
+    - dsv4-fp8-h200-vllm-mtp
+  description:
+    - "Update vLLM image (deepseekv4-cu129 custom + v0.20.1@sha256) to v0.21.0"
+  pr-link: PLACEHOLDER

From c43b9949269ca816bf9da5f76f179a3d72a500fd Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Sun, 17 May 2026 19:46:07 -0400
Subject: [PATCH 2/7] chore: fill pr-link for #1461

---
 perf-changelog.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index f3d3257a8..45e7e3e1f 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -2635,4 +2635,4 @@
     - dsv4-fp8-h200-vllm-mtp
   description:
     - "Update vLLM image (deepseekv4-cu129 custom + v0.20.1@sha256) to v0.21.0"
-  pr-link: PLACEHOLDER
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1461

From edc51bcd87f5dfe21d00fd8d291b70c8db1c2be9 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Wed, 20 May 2026 11:14:55 -0700
Subject: [PATCH 3/7] fix(dsv4_fp8_h200): force DeepSeek V4 FP8 quantization

---
 benchmarks/single_node/dsv4_fp8_h200.sh     | 1 +
 benchmarks/single_node/dsv4_fp8_h200_mtp.sh | 1 +
 2 files changed, 2 insertions(+)

diff --git a/benchmarks/single_node/dsv4_fp8_h200.sh b/benchmarks/single_node/dsv4_fp8_h200.sh
index 938e95b51..9d2061b24 100644
--- a/benchmarks/single_node/dsv4_fp8_h200.sh
+++ b/benchmarks/single_node/dsv4_fp8_h200.sh
@@ -62,6 +62,7 @@ vllm serve $MODEL --host 0.0.0.0 --port $PORT \
 "${PARALLEL_ARGS[@]}" \
 "${EP_ARGS[@]}" \
 $MAX_MODEL_LEN_ARG \
+--quantization deepseek_v4_fp8 \
 --gpu-memory-utilization 0.90 \
 --max-num-seqs 512 \
 --max-num-batched-tokens 512 \
diff --git a/benchmarks/single_node/dsv4_fp8_h200_mtp.sh b/benchmarks/single_node/dsv4_fp8_h200_mtp.sh
index cc6838487..e0465aa87 100755
--- a/benchmarks/single_node/dsv4_fp8_h200_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp8_h200_mtp.sh
@@ -70,6 +70,7 @@ vllm serve $MODEL --host 0.0.0.0 --port $PORT \
 "${PARALLEL_ARGS[@]}" \
 "${EP_ARGS[@]}" \
 $MAX_MODEL_LEN_ARG \
+--quantization deepseek_v4_fp8 \
 --gpu-memory-utilization 0.90 \
 --max-num-seqs 512 \
 --max-num-batched-tokens 512 \

From 0f731257daf395ce2e75c815223ebb52a6d12378 Mon Sep 17 00:00:00 2001
From: Bryan Shan <58582368+Oseltamivir@users.noreply.github.com>
Date: Wed, 20 May 2026 11:34:06 -0700
Subject: [PATCH 4/7] Resolve merge-conflict markers in perf-changelog.yaml

---
 perf-changelog.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 974d49266..202fda682 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -2663,14 +2663,14 @@
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1458
 
 - config-keys:
-<<<<<<< update-dsv4-fp8-h200-vllm-v0.21.0
     - dsv4-fp8-h200-vllm
     - dsv4-fp8-h200-vllm-mtp
   description:
     - "Update vLLM image to v0.21.0 (from custom deepseekv4-cu129 / v0.20.1@sha256-pinned)"
     - "Lower --gpu-memory-utilization from 0.95 to 0.90 in dsv4_fp8_h200.sh and dsv4_fp8_h200_mtp.sh — v0.21.0 uses more memory at load time, OOM'd on GPU 2 at 0.95"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1461
-=======
+
+- config-keys:
     - dsr1-fp8-mi325x-sglang
   description:
     - "Update SGLang image from v0.5.9-rocm700-mi30x to v0.5.12-rocm700-mi30x"

From 293b997053270a81cc1218e32465d815ece2b1e7 Mon Sep 17 00:00:00 2001
From: Bryan Shan <58582368+Oseltamivir@users.noreply.github.com>
Date: Wed, 20 May 2026 11:54:25 -0700
Subject: [PATCH 5/7] Remove leftover conflict marker from perf-changelog.yaml

---
 perf-changelog.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 202fda682..89f5af860 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3051,4 +3051,3 @@
   description:
     - "Update SGLang image from v0.5.11-cu130 (5d old) to v0.5.12-cu130"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1475
->>>>>>> main

From 80109e00dfecc60887cb2b444b74afa19ede7da3 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Wed, 20 May 2026 13:09:56 -0700
Subject: [PATCH 6/7] fix(dsv4_fp8_h200): restore gpu-memory-utilization 0.95

---
 benchmarks/single_node/dsv4_fp8_h200.sh     | 2 +-
 benchmarks/single_node/dsv4_fp8_h200_mtp.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmarks/single_node/dsv4_fp8_h200.sh b/benchmarks/single_node/dsv4_fp8_h200.sh
index 9d2061b24..18e5dacc5 100644
--- a/benchmarks/single_node/dsv4_fp8_h200.sh
+++ b/benchmarks/single_node/dsv4_fp8_h200.sh
@@ -63,7 +63,7 @@ vllm serve $MODEL --host 0.0.0.0 --port $PORT \
 "${EP_ARGS[@]}" \
 $MAX_MODEL_LEN_ARG \
 --quantization deepseek_v4_fp8 \
---gpu-memory-utilization 0.90 \
+--gpu-memory-utilization 0.95 \
 --max-num-seqs 512 \
 --max-num-batched-tokens 512 \
 --no-enable-flashinfer-autotune \
diff --git a/benchmarks/single_node/dsv4_fp8_h200_mtp.sh b/benchmarks/single_node/dsv4_fp8_h200_mtp.sh
index e0465aa87..0446ac6d9 100755
--- a/benchmarks/single_node/dsv4_fp8_h200_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp8_h200_mtp.sh
@@ -71,7 +71,7 @@ vllm serve $MODEL --host 0.0.0.0 --port $PORT \
 "${EP_ARGS[@]}" \
 $MAX_MODEL_LEN_ARG \
 --quantization deepseek_v4_fp8 \
---gpu-memory-utilization 0.90 \
+--gpu-memory-utilization 0.95 \
 --max-num-seqs 512 \
 --max-num-batched-tokens 512 \
 --no-enable-flashinfer-autotune \

From e5f395ef27c506cef285b0be306e23ff11a1f192 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Wed, 20 May 2026 14:29:23 -0700
Subject: [PATCH 7/7] fix(dsv4_fp8_h200): use sweep max model length

---
 benchmarks/single_node/dsv4_fp8_h200.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/benchmarks/single_node/dsv4_fp8_h200.sh b/benchmarks/single_node/dsv4_fp8_h200.sh
index 18e5dacc5..51e4a72d2 100644
--- a/benchmarks/single_node/dsv4_fp8_h200.sh
+++ b/benchmarks/single_node/dsv4_fp8_h200.sh
@@ -13,6 +13,7 @@ check_env_vars \
     CONC \
     ISL \
     OSL \
+    MAX_MODEL_LEN \
     RANDOM_RANGE_RATIO \
     RESULT_FILENAME
 
@@ -35,7 +36,7 @@ if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context
     MAX_MODEL_LEN_ARG="--max-model-len $EVAL_MAX_MODEL_LEN"
 else
-    MAX_MODEL_LEN_ARG="--max-model-len 800000"
+    MAX_MODEL_LEN_ARG="--max-model-len $MAX_MODEL_LEN"
 fi
 
 # DP_ATTENTION=true runs DP-attention with expert parallel (DP size = TP);