From 778120f390f9837247720656c532a332aced127f Mon Sep 17 00:00:00 2001 From: functionstackx <47992694+functionstackx@users.noreply.github.com> Date: Sun, 17 May 2026 19:46:03 -0400 Subject: [PATCH 1/7] [Klaud Cold] Update dsv4-fp8-h200-vllm (+mtp) vLLM image to v0.21.0 --- .github/configs/nvidia-master.yaml | 4 ++-- perf-changelog.yaml | 7 +++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 37dd5af3f..ec5cab79a 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2844,7 +2844,7 @@ dsr1-fp8-h200-sglang: # Uses the cu129 image. H200 has no FP4 path, so the FP4 indexer cache # flag is omitted. Max-model-len is pinned at 800k per the recipe. dsv4-fp8-h200-vllm: - image: vllm/vllm-openai:deepseekv4-cu129 + image: vllm/vllm-openai:v0.21.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: h200 @@ -2868,7 +2868,7 @@ dsv4-fp8-h200-vllm: # (the non-MTP entry above is still on the deepseekv4-cu129 tag) and adds # --speculative-config '{"method":"mtp","num_speculative_tokens":2}'. dsv4-fp8-h200-vllm-mtp: - image: vllm/vllm-openai:v0.20.1@sha256:9eff9734a30b6713a8566217d36f8277630fd2d31cec7f0a0292835901a23aa4 + image: vllm/vllm-openai:v0.21.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: h200 diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 13a695453..f3d3257a8 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2629,3 +2629,10 @@ description: - "Update vLLM ROCm image from v0.18.0 to v0.21.0" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1404 + +- config-keys: + - dsv4-fp8-h200-vllm + - dsv4-fp8-h200-vllm-mtp + description: + - "Update vLLM image (deepseekv4-cu129 custom + v0.20.1@sha256) to v0.21.0" + pr-link: PLACEHOLDER From c43b9949269ca816bf9da5f76f179a3d72a500fd Mon Sep 17 00:00:00 2001 From: functionstackx <47992694+functionstackx@users.noreply.github.com> Date: Sun, 17 May 2026 19:46:07 -0400 Subject: [PATCH 2/7] chore: fill pr-link for #1461 --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index f3d3257a8..45e7e3e1f 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2635,4 +2635,4 @@ - dsv4-fp8-h200-vllm-mtp description: - "Update vLLM image (deepseekv4-cu129 custom + v0.20.1@sha256) to v0.21.0" - pr-link: PLACEHOLDER + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1461 From edc51bcd87f5dfe21d00fd8d291b70c8db1c2be9 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Wed, 20 May 2026 11:14:55 -0700 Subject: [PATCH 3/7] fix(dsv4_fp8_h200): force DeepSeek V4 FP8 quantization --- benchmarks/single_node/dsv4_fp8_h200.sh | 1 + benchmarks/single_node/dsv4_fp8_h200_mtp.sh | 1 + 2 files changed, 2 insertions(+) diff --git a/benchmarks/single_node/dsv4_fp8_h200.sh b/benchmarks/single_node/dsv4_fp8_h200.sh index 938e95b51..9d2061b24 100644 --- a/benchmarks/single_node/dsv4_fp8_h200.sh +++ b/benchmarks/single_node/dsv4_fp8_h200.sh @@ -62,6 +62,7 @@ vllm serve $MODEL --host 0.0.0.0 --port $PORT \ "${PARALLEL_ARGS[@]}" \ "${EP_ARGS[@]}" \ $MAX_MODEL_LEN_ARG \ +--quantization deepseek_v4_fp8 \ --gpu-memory-utilization 0.90 \ --max-num-seqs 512 \ --max-num-batched-tokens 512 \ diff --git a/benchmarks/single_node/dsv4_fp8_h200_mtp.sh b/benchmarks/single_node/dsv4_fp8_h200_mtp.sh index cc6838487..e0465aa87 100755 --- a/benchmarks/single_node/dsv4_fp8_h200_mtp.sh +++ b/benchmarks/single_node/dsv4_fp8_h200_mtp.sh @@ -70,6 +70,7 @@ vllm serve $MODEL --host 0.0.0.0 --port $PORT \ "${PARALLEL_ARGS[@]}" \ "${EP_ARGS[@]}" \ $MAX_MODEL_LEN_ARG \ +--quantization deepseek_v4_fp8 \ --gpu-memory-utilization 0.90 \ --max-num-seqs 512 \ --max-num-batched-tokens 512 \ From 0f731257daf395ce2e75c815223ebb52a6d12378 Mon Sep 17 00:00:00 2001 From: Bryan Shan <58582368+Oseltamivir@users.noreply.github.com> Date: Wed, 20 May 2026 11:34:06 -0700 Subject: [PATCH 4/7] Update perf-changelog.yaml --- perf-changelog.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 974d49266..202fda682 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2663,14 +2663,14 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1458 - config-keys: -<<<<<<< update-dsv4-fp8-h200-vllm-v0.21.0 - dsv4-fp8-h200-vllm - dsv4-fp8-h200-vllm-mtp description: - "Update vLLM image to v0.21.0 (from custom deepseekv4-cu129 / v0.20.1@sha256-pinned)" - "Lower --gpu-memory-utilization from 0.95 to 0.90 in dsv4_fp8_h200.sh and dsv4_fp8_h200_mtp.sh — v0.21.0 uses more memory at load time, OOM'd on GPU 2 at 0.95" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1461 -======= + +- config-keys: - dsr1-fp8-mi325x-sglang description: - "Update SGLang image from v0.5.9-rocm700-mi30x to v0.5.12-rocm700-mi30x" From 293b997053270a81cc1218e32465d815ece2b1e7 Mon Sep 17 00:00:00 2001 From: Bryan Shan <58582368+Oseltamivir@users.noreply.github.com> Date: Wed, 20 May 2026 11:54:25 -0700 Subject: [PATCH 5/7] Update perf-changelog.yaml --- perf-changelog.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 202fda682..89f5af860 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3051,4 +3051,3 @@ description: - "Update SGLang image from v0.5.11-cu130 (5d old) to v0.5.12-cu130" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1475 ->>>>>>> main From 80109e00dfecc60887cb2b444b74afa19ede7da3 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Wed, 20 May 2026 13:09:56 -0700 Subject: [PATCH 6/7] back to 0.95 --- benchmarks/single_node/dsv4_fp8_h200.sh | 2 +- benchmarks/single_node/dsv4_fp8_h200_mtp.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp8_h200.sh b/benchmarks/single_node/dsv4_fp8_h200.sh index 9d2061b24..18e5dacc5 100644 --- a/benchmarks/single_node/dsv4_fp8_h200.sh +++ b/benchmarks/single_node/dsv4_fp8_h200.sh @@ -63,7 +63,7 @@ vllm serve $MODEL --host 0.0.0.0 --port $PORT \ "${EP_ARGS[@]}" \ $MAX_MODEL_LEN_ARG \ --quantization deepseek_v4_fp8 \ ---gpu-memory-utilization 0.90 \ +--gpu-memory-utilization 0.95 \ --max-num-seqs 512 \ --max-num-batched-tokens 512 \ --no-enable-flashinfer-autotune \ diff --git a/benchmarks/single_node/dsv4_fp8_h200_mtp.sh b/benchmarks/single_node/dsv4_fp8_h200_mtp.sh index e0465aa87..0446ac6d9 100755 --- a/benchmarks/single_node/dsv4_fp8_h200_mtp.sh +++ b/benchmarks/single_node/dsv4_fp8_h200_mtp.sh @@ -71,7 +71,7 @@ vllm serve $MODEL --host 0.0.0.0 --port $PORT \ "${EP_ARGS[@]}" \ $MAX_MODEL_LEN_ARG \ --quantization deepseek_v4_fp8 \ ---gpu-memory-utilization 0.90 \ +--gpu-memory-utilization 0.95 \ --max-num-seqs 512 \ --max-num-batched-tokens 512 \ --no-enable-flashinfer-autotune \ From e5f395ef27c506cef285b0be306e23ff11a1f192 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Wed, 20 May 2026 14:29:23 -0700 Subject: [PATCH 7/7] fix(dsv4_fp8_h200): use sweep max model length --- benchmarks/single_node/dsv4_fp8_h200.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/single_node/dsv4_fp8_h200.sh b/benchmarks/single_node/dsv4_fp8_h200.sh index 18e5dacc5..51e4a72d2 100644 --- a/benchmarks/single_node/dsv4_fp8_h200.sh +++ b/benchmarks/single_node/dsv4_fp8_h200.sh @@ -13,6 +13,7 @@ check_env_vars \ CONC \ ISL \ OSL \ + MAX_MODEL_LEN \ RANDOM_RANGE_RATIO \ RESULT_FILENAME @@ -35,7 +36,7 @@ if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context MAX_MODEL_LEN_ARG="--max-model-len $EVAL_MAX_MODEL_LEN" else - MAX_MODEL_LEN_ARG="--max-model-len 800000" + MAX_MODEL_LEN_ARG="--max-model-len $MAX_MODEL_LEN" fi # DP_ATTENTION=true runs DP-attention with expert parallel (DP size = TP);