From 1730de5e0160e1d6b0c7ed23740574f983679acf Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Fri, 1 May 2026 15:11:05 +0000 Subject: [PATCH 1/4] [AMD] dsv4-fp8-mi355x-sglang - bump to c924543 daily image - enable TileLang attn/indexer + cuda graph --- .github/configs/amd-master.yaml | 2 +- benchmarks/single_node/dsv4_fp8_mi355x.sh | 6 +++--- perf-changelog.yaml | 12 ++++++++++++ 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 893210ef6..63b9236bf 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1519,7 +1519,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=1" dsv4-fp8-mi355x-sglang: - image: rocm/sgl-dev:deepseek-v4-mi35x + image: rocm/sgl-dev:rocm720-mi35x-c924543-20260430-DSv4 model: sgl-project/DeepSeek-V4-Pro-FP8 model-prefix: dsv4 runner: mi355x diff --git a/benchmarks/single_node/dsv4_fp8_mi355x.sh b/benchmarks/single_node/dsv4_fp8_mi355x.sh index 971b18b6a..8fe26d778 100755 --- a/benchmarks/single_node/dsv4_fp8_mi355x.sh +++ b/benchmarks/single_node/dsv4_fp8_mi355x.sh @@ -39,13 +39,14 @@ else: print(f"No patch needed: model_type is {config.get('model_type')!r}") PYEOF -# DSv4-specific SGLang env vars (from sgl-project/sglang#23608) +export SGLANG_REASONING_EFFORT=max export SGLANG_OPT_USE_FUSED_COMPRESS=false export SGLANG_OPT_USE_OLD_COMPRESSOR=true export SGLANG_OPT_USE_TILELANG_SWA_PREPARE=false export SGLANG_OPT_USE_JIT_KERNEL_FUSED_TOPK=false export SGLANG_OPT_USE_FUSED_HASH_TOPK=false -export SGLANG_HACK_FLASHMLA_BACKEND=torch +export SGLANG_HACK_FLASHMLA_BACKEND=tilelang +export SGLANG_OPT_USE_TILELANG_INDEXER=true export SGLANG_OPT_DEEPGEMM_HC_PRENORM=false export SGLANG_OPT_USE_TILELANG_MHC_PRE=false export SGLANG_OPT_USE_TILELANG_MHC_POST=false @@ -85,7 +86,6 @@ python3 -m sglang.launch_server \ --page-size 256 \ --chunked-prefill-size 8192 \ --disable-shared-experts-fusion \ - --disable-cuda-graph \ 
--tool-call-parser deepseekv4 \ --reasoning-parser deepseek-v4 \ --watchdog-timeout 1800 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 0403c2385..211c346db 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2069,3 +2069,15 @@ - "Recipes cover 8k/1k aggregate TP8 low-latency conc=1, low-latency bridge 1P DEP8 + 4D TP8 no-offload conc=16/32/64, mid 1P/1D DEP8 MegaMOE conc=128, and high-throughput 2P/1D DEP8 MegaMOE conc=1024" - "All recipes enable FP4 indexer cache and speculative-config mtp with num_speculative_tokens=2" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1242 + +- config-keys: + - dsv4-fp8-mi355x-sglang + description: + - "Bump dsv4-fp8-mi355x-sglang image to rocm/sgl-dev:rocm720-mi35x-c924543-20260430-DSv4 (sgl-project/sglang amd/deepseek_v4 integration through 6/N + ENV-set commit c924543)" + - "Switch SGLANG_HACK_FLASHMLA_BACKEND from torch to tilelang (sgl-project/sglang#24033, FlashMLA 101->2 kernels per call)" + - "Add SGLANG_OPT_USE_TILELANG_INDEXER=true (sgl-project/sglang#24050, fp8 paged-MQA-logits indexer 12->1 kernels per call)" + - "Drop --disable-cuda-graph from sglang.launch_server (CUDA graph for DSv4 on ROCm/HIP enabled by sgl-project/sglang#23832)" + - "Keep SGLANG_TOPK_TRANSFORM_512_TORCH=1 for now: sgl-project/sglang#24143 (topk512 native ROCm kernel) merged 4-30 21:31 UTC, after the c924543 image was built (4-30 08:26 UTC); will flip to 0 once a newer daily image lands" + - "Keep SGLANG_DSV4_FP4_EXPERTS=false and SGLANG_FORCE_TRITON_MOE_FP8=1: required for sgl-project/DeepSeek-V4-Pro-FP8 (FP4 path asserts intermediate_size_per_partition==2048 in fp8.py; swiglu_limit clamp lives in fused_moe_triton)" + - "Expected speedup over the previous PR #23608 day-0 torch-fallback recipe: ~5.4-5.8x at conc 1-8 (matches the '+ indexer tilelang attn' tier in the AMD DSv4-Flash-FP8 reference table)" + pr-link: 
https://github.com/SemiAnalysisAI/InferenceX/pull/Placeholder From d6dd2f7a6d87da521fd9f2c67dd20352a70833f4 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Fri, 1 May 2026 15:14:49 +0000 Subject: [PATCH 2/4] Update Perf Changelog --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 211c346db..53b0b0ae9 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2080,4 +2080,4 @@ - "Keep SGLANG_TOPK_TRANSFORM_512_TORCH=1 for now: sgl-project/sglang#24143 (topk512 native ROCm kernel) merged 4-30 21:31 UTC, after the c924543 image was built (4-30 08:26 UTC); will flip to 0 once a newer daily image lands" - "Keep SGLANG_DSV4_FP4_EXPERTS=false and SGLANG_FORCE_TRITON_MOE_FP8=1: required for sgl-project/DeepSeek-V4-Pro-FP8 (FP4 path asserts intermediate_size_per_partition==2048 in fp8.py; swiglu_limit clamp lives in fused_moe_triton)" - "Expected speedup over the previous PR #23608 day-0 torch-fallback recipe: ~5.4-5.8x at conc 1-8 (matches the '+ indexer tilelang attn' tier in the AMD DSv4-Flash-FP8 reference table)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/Placeholder + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1255 From 21bcdc51b299a6577593bf3a50bbc056d8a72359 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 30 Apr 2026 23:19:00 -0700 Subject: [PATCH 3/4] Fix DSv4 eval prompt encoding --- benchmarks/benchmark_lib.sh | 85 +++++++++++++++++++++++++++++++++---- 1 file changed, 77 insertions(+), 8 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 4c0c8642e..3a9c5252d 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -538,7 +538,7 @@ _patch_lm_eval() { patch_dir="$(mktemp -d)" cat > "$patch_dir/sitecustomize.py" <<'PY' # --- Patch LocalChatCompletion.parse_generations to handle empty content with reasoning_content --- -import re, sys, unicodedata, json +import os, re, sys, 
unicodedata, json from lm_eval.filters import extraction as ex from lm_eval.models.openai_completions import LocalChatCompletion as _LCC @@ -565,7 +565,7 @@ def _le_parse_generations(outputs, **kwargs): # Keep staticmethod semantics _LCC.parse_generations = staticmethod(_le_parse_generations) -# --- Patch TemplateAPI.apply_chat_template to avoid injecting "type": "text" for TRT --- +# --- Patch TemplateAPI.apply_chat_template --- try: from lm_eval.models import api_models as _api_models _TemplateAPI = _api_models.TemplateAPI @@ -576,6 +576,56 @@ except Exception: if _TemplateAPI is not None and _JsonChatStr is not None: _orig_apply_chat_template = _TemplateAPI.apply_chat_template + _dsv4_encode_messages = None + + def _content_to_text(content): + if isinstance(content, str): + return content + if isinstance(content, list): + parts = [] + for item in content: + if isinstance(item, dict): + parts.append(str(item.get("text", item.get("content", "")))) + else: + parts.append(str(item)) + return "\n".join(part for part in parts if part) + if content is None: + return "" + return str(content) + + def _load_dsv4_encoder(): + global _dsv4_encode_messages + if _dsv4_encode_messages is not None: + return _dsv4_encode_messages + + roots = [ + os.environ.get("INFMAX_WORKSPACE"), + os.environ.get("GITHUB_WORKSPACE"), + os.getcwd(), + "/workspace", + "/infmax-workspace", + ] + for root in roots: + if not root: + continue + candidate = os.path.join(root, "utils", "bench_serving") + if os.path.exists(os.path.join(candidate, "encoding_dsv4.py")) and candidate not in sys.path: + sys.path.insert(0, candidate) + + from encoding_dsv4 import encode_messages + + _dsv4_encode_messages = encode_messages + return _dsv4_encode_messages + + def _apply_dsv4_chat_template(chat_history): + encode_messages = _load_dsv4_encoder() + messages = [] + for item in chat_history: + normalized = {**item} + normalized.pop("type", None) + normalized["content"] = _content_to_text(normalized.get("content")) + 
messages.append(normalized) + return encode_messages(messages, thinking_mode="thinking") def _patched_apply_chat_template( self, @@ -583,6 +633,8 @@ if _TemplateAPI is not None and _JsonChatStr is not None: add_generation_prompt: bool = True, ): """Applies a chat template to a list of chat history between user and model.""" + if os.environ.get("EVAL_DSV4_CHAT_TEMPLATE") == "1": + return _apply_dsv4_chat_template(chat_history) if self.tokenizer_backend == "huggingface" and self.tokenized_requests: return self.tokenizer.apply_chat_template( chat_history, @@ -687,13 +739,30 @@ run_lm_eval() { esac done - _install_lm_eval_deps - _patch_lm_eval - local openai_server_base="http://0.0.0.0:${port}" local openai_chat_base="${openai_server_base}/v1/chat/completions" + local openai_completions_base="${openai_server_base}/v1/completions" export OPENAI_API_KEY=${OPENAI_API_KEY:-EMPTY} - MODEL_NAME=${MODEL_NAME:-$MODEL} # Prefer MODEL_NAME, else MODEL + export MODEL_NAME="${MODEL_NAME:-$MODEL}" # Prefer MODEL_NAME, else MODEL + + local lm_eval_model="local-chat-completions" + local lm_eval_base_url="$openai_chat_base" + local lm_eval_eos_string="${EVAL_EOS_STRING:-}" + local lm_eval_tokenizer_args="tokenized_requests=False" + + if [[ "${MODEL_PREFIX:-}" == "dsv4" || "${MODEL_NAME:-}" == *"DeepSeek-V4"* || "${MODEL:-}" == *"DeepSeek-V4"* ]]; then + export EVAL_DSV4_CHAT_TEMPLATE=1 + lm_eval_model="local-completions" + lm_eval_base_url="$openai_completions_base" + lm_eval_eos_string="${EVAL_EOS_STRING:-<|end▁of▁sentence|>}" + lm_eval_tokenizer_args="tokenizer_backend=None,tokenized_requests=False" + echo "Using DeepSeek-V4 eval prompt encoding via utils/bench_serving/encoding_dsv4.py" + else + unset EVAL_DSV4_CHAT_TEMPLATE + fi + + _install_lm_eval_deps + _patch_lm_eval # Cap output tokens: must fit within context window (leave room for input), # and avoid excessive KV cache reservation per request on TRT. 
@@ -706,11 +775,11 @@ run_lm_eval() { # Export for append_lm_eval_summary to pick up export EVAL_RESULT_DIR="$results_dir" set -x - python3 -m lm_eval --model local-chat-completions --apply_chat_template \ + python3 -m lm_eval --model "${lm_eval_model}" --apply_chat_template \ --tasks "${tasks_dir}" \ --output_path "${results_dir}" \ --log_samples \ - --model_args "model=${MODEL_NAME},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=,max_retries=5,num_concurrent=${concurrent_requests},timeout=1800,tokenized_requests=False,max_length=${eval_context_len}" \ + --model_args "model=${MODEL_NAME},base_url=${lm_eval_base_url},api_key=${OPENAI_API_KEY},eos_string=${lm_eval_eos_string},max_retries=5,num_concurrent=${concurrent_requests},timeout=1800,${lm_eval_tokenizer_args},max_length=${eval_context_len}" \ --gen_kwargs "max_tokens=${max_output_tokens},temperature=${temperature},top_p=${top_p}" local eval_exit=$? set +x From 7176f553351d87e0d62f5f688f8a507d94d2018d Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Sun, 3 May 2026 10:15:15 +0000 Subject: [PATCH 4/4] [AMD] dsv4-fp8-mi355x-sglang: bump to a8410de daily - enable topk512 + fused compress decode kernels --- .github/configs/amd-master.yaml | 2 +- benchmarks/single_node/dsv4_fp8_mi355x.sh | 4 ++-- perf-changelog.yaml | 14 ++++++++------ 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index eaf504f63..9d1f1e506 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1569,7 +1569,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=1" dsv4-fp8-mi355x-sglang: - image: rocm/sgl-dev:rocm720-mi35x-c924543-20260430-DSv4 + image: rocm/sgl-dev:rocm720-mi35x-a8410de-20260502-DSv4 model: sgl-project/DeepSeek-V4-Pro-FP8 model-prefix: dsv4 runner: mi355x diff --git a/benchmarks/single_node/dsv4_fp8_mi355x.sh b/benchmarks/single_node/dsv4_fp8_mi355x.sh index 8fe26d778..1775e69dc 100755 --- 
a/benchmarks/single_node/dsv4_fp8_mi355x.sh +++ b/benchmarks/single_node/dsv4_fp8_mi355x.sh @@ -40,7 +40,7 @@ else: PYEOF export SGLANG_REASONING_EFFORT=max -export SGLANG_OPT_USE_FUSED_COMPRESS=false +export SGLANG_OPT_USE_FUSED_COMPRESS=true export SGLANG_OPT_USE_OLD_COMPRESSOR=true export SGLANG_OPT_USE_TILELANG_SWA_PREPARE=false export SGLANG_OPT_USE_JIT_KERNEL_FUSED_TOPK=false @@ -53,7 +53,7 @@ export SGLANG_OPT_USE_TILELANG_MHC_POST=false export SGLANG_ENABLE_THINKING=1 export SGLANG_USE_AITER=1 export SGLANG_USE_ROCM700A=1 -export SGLANG_TOPK_TRANSFORM_512_TORCH=1 +export SGLANG_TOPK_TRANSFORM_512_TORCH=0 export SGLANG_FP8_PAGED_MQA_LOGITS_TORCH=1 export SGLANG_DSV4_FP4_EXPERTS=false export SGLANG_OPT_DPSK_V4_RADIX=0 diff --git a/perf-changelog.yaml b/perf-changelog.yaml index a959aff1b..b3c691e03 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2073,13 +2073,15 @@ - config-keys: - dsv4-fp8-mi355x-sglang description: - - "Bump dsv4-fp8-mi355x-sglang image to rocm/sgl-dev:rocm720-mi35x-c924543-20260430-DSv4 (sgl-project/sglang amd/deepseek_v4 integration through 6/N + ENV-set commit c924543)" - - "Switch SGLANG_HACK_FLASHMLA_BACKEND from torch to tilelang (sgl-project/sglang#24033, FlashMLA 101->2 kernels per call)" - - "Add SGLANG_OPT_USE_TILELANG_INDEXER=true (sgl-project/sglang#24050, fp8 paged-MQA-logits indexer 12->1 kernels per call)" - - "Drop --disable-cuda-graph from sglang.launch_server (CUDA graph for DSv4 on ROCm/HIP enabled by sgl-project/sglang#23832)" - - "Keep SGLANG_TOPK_TRANSFORM_512_TORCH=1 for now: sgl-project/sglang#24143 (topk512 native ROCm kernel) merged 4-30 21:31 UTC, after the c924543 image was built (4-30 08:26 UTC); will flip to 0 once a newer daily image lands" + - "Bump dsv4-fp8-mi355x-sglang image rocm/sgl-dev:deepseek-v4-mi35x (PR #23608 day-0) -> rocm/sgl-dev:rocm720-mi35x-a8410de-20260502-DSv4 (sgl-project/sglang amd/deepseek_v4 integration through 8/N): cuda graph (#23832), TileLang FlashMLA attn (#24033, 
101->2 kernels per call), TileLang indexer attn (#24050, 12->1 kernels per call), native ROCm topk512transform kernel (#24143, ~30->1 launches per call), fused compress decode kernel (#24249, compressor c4/c128 decode fused)" + - "Switch SGLANG_HACK_FLASHMLA_BACKEND torch -> tilelang (sgl-project/sglang#24033)" + - "Add SGLANG_OPT_USE_TILELANG_INDEXER=true (sgl-project/sglang#24050)" + - "Drop --disable-cuda-graph from sglang.launch_server (sgl-project/sglang#23832)" + - "Set SGLANG_TOPK_TRANSFORM_512_TORCH=0 to use the native ROCm topk512 kernel from sgl-project/sglang#24143" + - "Set SGLANG_OPT_USE_FUSED_COMPRESS=true to use the fused compress decode kernel from sgl-project/sglang#24249" - "Keep SGLANG_DSV4_FP4_EXPERTS=false and SGLANG_FORCE_TRITON_MOE_FP8=1: required for sgl-project/DeepSeek-V4-Pro-FP8 (FP4 path asserts intermediate_size_per_partition==2048 in fp8.py; swiglu_limit clamp lives in fused_moe_triton)" - - "Expected speedup over the previous PR #23608 day-0 torch-fallback recipe: ~5.4-5.8x at conc 1-8 (matches the '+ indexer tilelang attn' tier in the AMD DSv4-Flash-FP8 reference table)" + - "Expected throughput speedup over the PR #23608 day-0 torch-fallback recipe: ~6.5-6.8x at conc 1-8 (matches the full '+ topk512transform kernel' tier in the AMD DSv4-Flash-FP8 reference table)" + - "Eval fix (accuracy): cherry-pick `Fix DSv4 eval prompt encoding` from the DSV4-ATOM topic branch (sgl-project/sglang#23608 follow-up, commit 5128a68b). Patches benchmarks/benchmark_lib.sh::run_lm_eval to detect DSv4 (MODEL_PREFIX==dsv4 or MODEL/MODEL_NAME contains 'DeepSeek-V4') and (a) switch the lm-eval model adapter from local-chat-completions to local-completions, (b) point base_url at /v1/completions, (c) default the lm-eval eos_string to <|end▁of▁sentence|> unless EVAL_EOS_STRING is already set, (d) set EVAL_DSV4_CHAT_TEMPLATE=1 so the patched lm-eval TemplateAPI.apply_chat_template renders messages locally via utils/bench_serving/encoding_dsv4.py instead of the missing server-side template. 
Required because sgl-project/DeepSeek-V4-Pro-FP8 ships tokenizer_config.json without chat_template, causing /v1/chat/completions to return HTTP 400 on every request (`No HuggingFace chat template found` in server log; verified empirically on a8410de: SGLang serving_chat.py / template_manager.py / encoding_dsv4.py are byte-identical to c924543 and have no DSv4 auto-wiring)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1255 - config-keys: