Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1569,7 +1569,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
- "DECODE_MTP_SIZE=1"

dsv4-fp8-mi355x-sglang:
image: rocm/sgl-dev:deepseek-v4-mi35x
image: rocm/sgl-dev:rocm720-mi35x-a8410de-20260502-DSv4
model: sgl-project/DeepSeek-V4-Pro-FP8
Comment thread
chunfangamd marked this conversation as resolved.
model-prefix: dsv4
runner: mi355x
Expand Down
85 changes: 77 additions & 8 deletions benchmarks/benchmark_lib.sh
Original file line number Diff line number Diff line change
Expand Up @@ -548,7 +548,7 @@ _patch_lm_eval() {
patch_dir="$(mktemp -d)"
cat > "$patch_dir/sitecustomize.py" <<'PY'
# --- Patch LocalChatCompletion.parse_generations to handle empty content with reasoning_content ---
import re, sys, unicodedata, json
import os, re, sys, unicodedata, json
from lm_eval.filters import extraction as ex
from lm_eval.models.openai_completions import LocalChatCompletion as _LCC

Expand All @@ -575,7 +575,7 @@ def _le_parse_generations(outputs, **kwargs):
# Keep staticmethod semantics
_LCC.parse_generations = staticmethod(_le_parse_generations)

# --- Patch TemplateAPI.apply_chat_template to avoid injecting "type": "text" for TRT ---
# --- Patch TemplateAPI.apply_chat_template ---
try:
from lm_eval.models import api_models as _api_models
_TemplateAPI = _api_models.TemplateAPI
Expand All @@ -586,13 +586,65 @@ except Exception:

if _TemplateAPI is not None and _JsonChatStr is not None:
_orig_apply_chat_template = _TemplateAPI.apply_chat_template
_dsv4_encode_messages = None

def _content_to_text(content):
    """Flatten a chat message ``content`` value into a plain string.

    Args:
        content: A string, a list of content parts (dicts carrying a "text"
            or "content" entry, or arbitrary values), ``None``, or any other
            object.

    Returns:
        * ``content`` unchanged when it is already a string;
        * the non-empty parts joined by newlines when it is a list;
        * ``""`` when it is ``None``;
        * ``str(content)`` for anything else.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        parts = []
        for item in content:
            if isinstance(item, dict):
                # Prefer "text"; fall back to "content" when "text" is
                # missing or explicitly None.
                value = item.get("text")
                if value is None:
                    value = item.get("content")
            else:
                value = item
            # A None part carries no text. Stringifying it would inject the
            # literal word "None" into the prompt, so drop it instead.
            if value is None:
                continue
            parts.append(str(value))
        return "\n".join(part for part in parts if part)
    return str(content)

def _load_dsv4_encoder():
    """Return ``encoding_dsv4.encode_messages``, importing it on first use.

    The imported callable is cached in the module-level
    ``_dsv4_encode_messages`` so later calls skip the search.

    Candidate workspace roots are tried in priority order: the
    ``INFMAX_WORKSPACE`` and ``GITHUB_WORKSPACE`` environment variables, the
    current working directory, then ``/workspace`` and ``/infmax-workspace``.
    The first root that contains ``utils/bench_serving/encoding_dsv4.py`` is
    put at the front of ``sys.path`` and the search stops there.

    Returns:
        The ``encode_messages`` callable from ``encoding_dsv4``.

    Raises:
        ImportError: if ``encoding_dsv4`` cannot be imported (no root
            contained it and it is not otherwise on ``sys.path``).
    """
    global _dsv4_encode_messages
    if _dsv4_encode_messages is not None:
        return _dsv4_encode_messages

    roots = [
        os.environ.get("INFMAX_WORKSPACE"),
        os.environ.get("GITHUB_WORKSPACE"),
        os.getcwd(),
        "/workspace",
        "/infmax-workspace",
    ]
    for root in roots:
        if not root:
            continue
        candidate = os.path.join(root, "utils", "bench_serving")
        if not os.path.exists(os.path.join(candidate, "encoding_dsv4.py")):
            continue
        if candidate not in sys.path:
            sys.path.insert(0, candidate)
        # Stop at the first hit. Inserting every match at index 0 would let
        # the lowest-priority root end up first on sys.path and shadow the
        # explicitly configured workspace.
        break

    from encoding_dsv4 import encode_messages

    _dsv4_encode_messages = encode_messages
    return _dsv4_encode_messages

def _apply_dsv4_chat_template(chat_history):
    """Render ``chat_history`` with the locally loaded DSv4 message encoder.

    Each message is shallow-copied, its "type" key (if any) is dropped, and
    its "content" is flattened to plain text before the list is handed to
    ``encode_messages`` with ``thinking_mode="thinking"``.

    Args:
        chat_history: iterable of message mappings.

    Returns:
        Whatever ``encode_messages`` returns for the normalized messages.
    """
    # Resolve the encoder first so an import failure surfaces before any
    # message is processed.
    encoder = _load_dsv4_encoder()

    def _normalize(entry):
        cleaned = dict(entry)
        cleaned.pop("type", None)
        cleaned["content"] = _content_to_text(cleaned.get("content"))
        return cleaned

    return encoder(
        [_normalize(entry) for entry in chat_history],
        thinking_mode="thinking",
    )

def _patched_apply_chat_template(
self,
chat_history,
add_generation_prompt: bool = True,
):
"""Applies a chat template to a list of chat history between user and model."""
if os.environ.get("EVAL_DSV4_CHAT_TEMPLATE") == "1":
return _apply_dsv4_chat_template(chat_history)
if self.tokenizer_backend == "huggingface" and self.tokenized_requests:
return self.tokenizer.apply_chat_template(
chat_history,
Expand Down Expand Up @@ -697,13 +749,30 @@ run_lm_eval() {
esac
done

_install_lm_eval_deps
_patch_lm_eval

local openai_server_base="http://0.0.0.0:${port}"
local openai_chat_base="${openai_server_base}/v1/chat/completions"
local openai_completions_base="${openai_server_base}/v1/completions"
export OPENAI_API_KEY=${OPENAI_API_KEY:-EMPTY}
MODEL_NAME=${MODEL_NAME:-$MODEL} # Prefer MODEL_NAME, else MODEL
export MODEL_NAME="${MODEL_NAME:-$MODEL}" # Prefer MODEL_NAME, else MODEL

local lm_eval_model="local-chat-completions"
local lm_eval_base_url="$openai_chat_base"
local lm_eval_eos_string="${EVAL_EOS_STRING:-</s>}"
local lm_eval_tokenizer_args="tokenized_requests=False"

if [[ "${MODEL_PREFIX:-}" == "dsv4" || "${MODEL_NAME:-}" == *"DeepSeek-V4"* || "${MODEL:-}" == *"DeepSeek-V4"* ]]; then
export EVAL_DSV4_CHAT_TEMPLATE=1
lm_eval_model="local-completions"
lm_eval_base_url="$openai_completions_base"
lm_eval_eos_string="${EVAL_EOS_STRING:-<|end▁of▁sentence|>}"
lm_eval_tokenizer_args="tokenizer_backend=None,tokenized_requests=False"
echo "Using DeepSeek-V4 eval prompt encoding via utils/bench_serving/encoding_dsv4.py"
else
unset EVAL_DSV4_CHAT_TEMPLATE
fi

_install_lm_eval_deps
_patch_lm_eval

# Cap output tokens: must fit within context window (leave room for input),
# and avoid excessive KV cache reservation per request on TRT.
Expand All @@ -716,11 +785,11 @@ run_lm_eval() {
# Export for append_lm_eval_summary to pick up
export EVAL_RESULT_DIR="$results_dir"
set -x
python3 -m lm_eval --model local-chat-completions --apply_chat_template \
python3 -m lm_eval --model "${lm_eval_model}" --apply_chat_template \
--tasks "${tasks_dir}" \
--output_path "${results_dir}" \
--log_samples \
--model_args "model=${MODEL_NAME},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=</s>,max_retries=5,num_concurrent=${concurrent_requests},timeout=1800,tokenized_requests=False,max_length=${eval_context_len}" \
--model_args "model=${MODEL_NAME},base_url=${lm_eval_base_url},api_key=${OPENAI_API_KEY},eos_string=${lm_eval_eos_string},max_retries=5,num_concurrent=${concurrent_requests},timeout=1800,${lm_eval_tokenizer_args},max_length=${eval_context_len}" \
--gen_kwargs "max_tokens=${max_output_tokens},temperature=${temperature},top_p=${top_p}"
local eval_exit=$?
set +x
Expand Down
10 changes: 5 additions & 5 deletions benchmarks/single_node/dsv4_fp8_mi355x.sh
Original file line number Diff line number Diff line change
Expand Up @@ -39,20 +39,21 @@ else:
print(f"No patch needed: model_type is {config.get('model_type')!r}")
PYEOF

# DSv4-specific SGLang env vars (from sgl-project/sglang#23608)
export SGLANG_OPT_USE_FUSED_COMPRESS=false
export SGLANG_REASONING_EFFORT=max
export SGLANG_OPT_USE_FUSED_COMPRESS=true
export SGLANG_OPT_USE_OLD_COMPRESSOR=true
export SGLANG_OPT_USE_TILELANG_SWA_PREPARE=false
export SGLANG_OPT_USE_JIT_KERNEL_FUSED_TOPK=false
export SGLANG_OPT_USE_FUSED_HASH_TOPK=false
export SGLANG_HACK_FLASHMLA_BACKEND=torch
export SGLANG_HACK_FLASHMLA_BACKEND=tilelang
export SGLANG_OPT_USE_TILELANG_INDEXER=true
export SGLANG_OPT_DEEPGEMM_HC_PRENORM=false
export SGLANG_OPT_USE_TILELANG_MHC_PRE=false
export SGLANG_OPT_USE_TILELANG_MHC_POST=false
export SGLANG_ENABLE_THINKING=1
export SGLANG_USE_AITER=1
export SGLANG_USE_ROCM700A=1
export SGLANG_TOPK_TRANSFORM_512_TORCH=1
export SGLANG_TOPK_TRANSFORM_512_TORCH=0
export SGLANG_FP8_PAGED_MQA_LOGITS_TORCH=1
export SGLANG_DSV4_FP4_EXPERTS=false
export SGLANG_OPT_DPSK_V4_RADIX=0
Expand Down Expand Up @@ -85,7 +86,6 @@ python3 -m sglang.launch_server \
--page-size 256 \
--chunked-prefill-size 8192 \
--disable-shared-experts-fusion \
--disable-cuda-graph \
--tool-call-parser deepseekv4 \
--reasoning-parser deepseek-v4 \
--watchdog-timeout 1800 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
Expand Down
14 changes: 14 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2082,6 +2082,20 @@
- "All recipes enable FP4 indexer cache and speculative-config mtp with num_speculative_tokens=2"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1242

- config-keys:
- dsv4-fp8-mi355x-sglang
description:
- "Bump dsv4-fp8-mi355x-sglang image rocm/sgl-dev:deepseek-v4-mi35x (PR #23608 day-0) -> rocm/sgl-dev:rocm720-mi35x-a8410de-20260502-DSv4 (sgl-project/sglang amd/deepseek_v4 integration through 8/N): cuda graph (#23832), TileLang FlashMLA attn (#24033, 101->2 kernels per call), TileLang indexer attn (#24050, 12->1 kernels per call), native ROCm topk512transform kernel (#24143, ~30->1 launches per call), fused compress decode kernel (#24249, compressor c4/c128 decode fused)"
- "Switch SGLANG_HACK_FLASHMLA_BACKEND torch -> tilelang (sgl-project/sglang#24033)"
- "Add SGLANG_OPT_USE_TILELANG_INDEXER=true (sgl-project/sglang#24050)"
- "Drop --disable-cuda-graph from sglang.launch_server (sgl-project/sglang#23832)"
- "Set SGLANG_TOPK_TRANSFORM_512_TORCH=0 to use the native ROCm topk512 kernel from sgl-project/sglang#24143"
- "Set SGLANG_OPT_USE_FUSED_COMPRESS=true to use the fused compress decode kernel from sgl-project/sglang#24249"
- "Keep SGLANG_DSV4_FP4_EXPERTS=false and SGLANG_FORCE_TRITON_MOE_FP8=1: required for sgl-project/DeepSeek-V4-Pro-FP8 (FP4 path asserts intermediate_size_per_partition==2048 in fp8.py; swiglu_limit clamp lives in fused_moe_triton)"
- "Expected throughput speedup over the PR #23608 day-0 torch-fallback recipe: ~6.5-6.8x at conc 1-8 (matches the screenshot's full '+ topk512transform kernel' tier in the AMD DSv4-Flash-FP8 reference table)"
- "Eval fix (accuracy): cherry-pick `Fix DSv4 eval prompt encoding` from the DSV4-ATOM topic branch (sgl-project/sglang#23608 follow-up, commit 5128a68b). Patches benchmarks/benchmark_lib.sh::run_lm_eval to detect DSv4 (MODEL_PREFIX==dsv4 or MODEL/MODEL_NAME contains 'DeepSeek-V4') and (a) switch the lm-eval model adapter from local-chat-completions to local-completions, (b) point base_url at /v1/completions, (c) set EVAL_EOS_STRING to <|end▁of▁sentence|>, (d) set EVAL_DSV4_CHAT_TEMPLATE=1 so the patched lm-eval TemplateAPI.apply_chat_template renders messages locally via utils/bench_serving/encoding_dsv4.py instead of the missing server-side template. Required because sgl-project/DeepSeek-V4-Pro-FP8 ships tokenizer_config.json without chat_template, causing /v1/chat/completions to return HTTP 400 on every request (`No HuggingFace chat template found` in server log; verified empirically on a8410de: SGLang serving_chat.py / template_manager.py / encoding_dsv4.py are byte-identical to c924543 and have no DSv4 auto-wiring)"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1255

- config-keys:
- glm5-fp8-mi355x-sglang-mtp
description:
Expand Down
Loading