Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1776,6 +1776,27 @@ dsv4-fp4-b200-trt:
- { tp: 8, conc-start: 1, conc-end: 32 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 1024 }

# DeepSeek-V4-Pro FP4 on B200 with TensorRT-LLM and MTP speculative decoding.
# Every search-space entry sets spec-decoding: mtp.
dsv4-fp4-b200-trt-mtp:
  # Registry host and repository path are separated by "/".
  image: ghcr.io/semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715
  model: deepseek-ai/DeepSeek-V4-Pro
  model-prefix: dsv4
  runner: b200-dsv4
  precision: fp4
  framework: trt
  multinode: false
  scenarios:
    fixed-seq-len:
    - isl: 1024
      osl: 1024
      search-space:
      - { tp: 8, conc-start: 1, conc-end: 32, spec-decoding: mtp }
      - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 512, spec-decoding: mtp }
    - isl: 8192
      osl: 1024
      search-space:
      - { tp: 8, conc-start: 1, conc-end: 32, spec-decoding: mtp }
      - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 128, spec-decoding: mtp }

# MTP variant of dsv4-fp4-b200-vllm. Mirrors the base search space and adds
# --speculative-config '{"method":"mtp","num_speculative_tokens":2}'.
dsv4-fp4-b200-vllm-mtp:
Expand Down
159 changes: 159 additions & 0 deletions benchmarks/single_node/dsv4_fp4_b200_trt_mtp.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
#!/usr/bin/env bash

# DeepSeek-V4-Pro B200 TensorRT-LLM MTP variant. The configured image already
# contains the DeepSeek-V4 TRTLLM build; this path only toggles speculative MTP.

# Pull in the shared benchmark helpers that sit one directory above this script.
source "$(dirname "$0")/../benchmark_lib.sh"

# Inputs this script expects from the caller's environment.
check_env_vars MODEL TP CONC ISL OSL MAX_MODEL_LEN RANDOM_RANGE_RATIO \
    RESULT_FILENAME DP_ATTENTION EP_SIZE

# Only announce the Slurm placement when a job id is present.
[[ -z "$SLURM_JOB_ID" ]] || echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"

echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"

# Both launch toggles default to "1" when unset or empty, and are exported.
: "${TRTLLM_DSV4_USE_MPIRUN:=1}"
: "${TRTLLM_DSV4_SANITIZE_SLURM_MPI_ENV:=1}"
export TRTLLM_DSV4_USE_MPIRUN TRTLLM_DSV4_SANITIZE_SLURM_MPI_ENV

# Remove Slurm, PMI/PMIx and Open MPI (OMPI_*/ORTE_*) variables from this
# shell's environment ahead of the TensorRT-LLM launch.
#
# Does nothing unless TRTLLM_DSV4_SANITIZE_SLURM_MPI_ENV is exactly "1".
# Always returns 0 when disabled.
sanitize_slurm_mpi_env_for_trtllm() {
    if [[ "${TRTLLM_DSV4_SANITIZE_SLURM_MPI_ENV:-0}" != "1" ]]; then
        return 0
    fi

    echo "Sanitizing Slurm/PMI environment for TensorRT-LLM launch"

    # Iterate exported variable *names* rather than parsing `env` output:
    # a value containing a newline would otherwise have its continuation line
    # mistaken for a NAME=value pair and handed to `unset` as a bogus name.
    local name
    while IFS= read -r name; do
        case "$name" in
            SLURM_*|PMIX*|PMI*|OMPI_*|ORTE_*)
                unset "$name"
                ;;
        esac
    done < <(compgen -e)
}

# Drop launcher variables first (a no-op unless the sanitize toggle is "1").
sanitize_slurm_mpi_env_for_trtllm

# NCCL_NVLS_ENABLE defaults to 0 when unset or empty; callers may override it.
: "${NCCL_NVLS_ENABLE:=0}"
export NCCL_NVLS_ENABLE
echo "NCCL_NVLS_ENABLE: $NCCL_NVLS_ENABLE"

# A MODEL that starts with "/" is used as-is; anything else is passed to
# `hf download`.
case "$MODEL" in
    /*) ;;
    *) hf download "$MODEL" ;;
esac

nvidia-smi

SERVER_LOG="${PWD}/server.log"
PORT="${PORT:-8888}"
EXTRA_CONFIG_FILE="dsv4-fp4-trt-mtp.yml"

MOE_BACKEND="TRTLLM"
# Value written to num_nextn_predict_layers in the generated config.
MTP="${TRTLLM_DSV4_MTP_NUM_NEXTN_LAYERS:-2}"
# Batch size follows the requested concurrency but never drops below 16.
MAX_BATCH_SIZE=$(( CONC < 16 ? 16 : CONC ))
CUDA_GRAPH_MAX_BATCH_SIZE="${MAX_BATCH_SIZE}"
KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.50}"

# Optional attention_dp_config section. It stays empty unless DP_ATTENTION is
# "true". The value begins with a newline so that, when expanded directly after
# the enable_attention_dp value in the heredoc below, it starts on its own line.
ATTENTION_DP_CONFIG=""
if [[ "$DP_ATTENTION" == "true" ]]; then
ATTENTION_DP_CONFIG="
attention_dp_config:
batching_wait_iters: 0
enable_balance: true
timeout_iters: 60"
fi

# Render the YAML options file later handed to trtllm-serve via --config.
# The heredoc delimiter is unquoted, so the $VARIABLES below are expanded.
# NOTE(review): the generated YAML depends on the heredoc's own indentation to
# nest keys under their parent sections - keep that whitespace intact.
cat > "$EXTRA_CONFIG_FILE" << EOF
cuda_graph_config:
enable_padding: true
max_batch_size: $CUDA_GRAPH_MAX_BATCH_SIZE
enable_attention_dp: $DP_ATTENTION$ATTENTION_DP_CONFIG
print_iter_log: true
kv_cache_config:
tokens_per_block: 128
dtype: fp8
free_gpu_memory_fraction: $KV_CACHE_FREE_MEM_FRACTION
enable_block_reuse: false
stream_interval: 10
num_postprocess_workers: 4
moe_config:
backend: $MOE_BACKEND
speculative_config:
decoding_type: MTP
num_nextn_predict_layers: $MTP
EOF

# Print the rendered file so the effective settings appear in the job output.
echo "Generated config file contents:"
cat "$EXTRA_CONFIG_FILE"

# Sequence length and per-iteration token budget both have a floor of 8192.
MAX_MODEL_LEN=$(( MAX_MODEL_LEN < 8192 ? 8192 : MAX_MODEL_LEN ))
# Budget: one full prompt+response, (MTP + 1) tokens per batch slot, plus 256.
MAX_NUM_TOKENS=$(( ISL + OSL + (MTP + 1) * MAX_BATCH_SIZE + 256 ))
(( MAX_NUM_TOKENS >= 8192 )) || MAX_NUM_TOKENS=8192

# In eval-only runs both limits are replaced by EVAL_MAX_MODEL_LEN, which is
# read right after setup_eval_context is called.
if [[ "${EVAL_ONLY}" == "true" ]]; then
    setup_eval_context
    MAX_MODEL_LEN="${EVAL_MAX_MODEL_LEN}"
    MAX_NUM_TOKENS="${EVAL_MAX_MODEL_LEN}"
fi

# GPU metrics are written next to the other run artifacts.
start_gpu_monitor --output "${PWD}/gpu_metrics.csv"

# Trace commands from here until the matching `set +x` at the end of the script.
set -x

# Server command line, held in an array so it can be run either directly or
# as the payload of mpirun without re-quoting.
SERVE_CMD=(
    trtllm-serve "$MODEL"
    --host 0.0.0.0
    --port "$PORT"
    --trust_remote_code
    --backend pytorch
    --max_batch_size "$MAX_BATCH_SIZE"
    --max_seq_len "$MAX_MODEL_LEN"
    --max_num_tokens "$MAX_NUM_TOKENS"
    --tp_size "$TP"
    --ep_size "$EP_SIZE"
    --custom_tokenizer deepseek_v4
    --config "$EXTRA_CONFIG_FILE"
)

# Start the server in the background with stdout and stderr sent to SERVER_LOG.
# Only an explicit "0" bypasses mpirun; any other value (or unset) uses it.
case "${TRTLLM_DSV4_USE_MPIRUN:-1}" in
    0)
        "${SERVE_CMD[@]}" > "$SERVER_LOG" 2>&1 &
        ;;
    *)
        mpirun -n 1 --oversubscribe --allow-run-as-root \
            "${SERVE_CMD[@]}" \
            > "$SERVER_LOG" 2>&1 &
        ;;
esac

# PID of whichever background job was just started.
SERVER_PID=$!

wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

# Arguments for the serving benchmark. The prompt count is ten times the
# requested concurrency.
BENCH_ARGS=(
    --model "$MODEL"
    --port "$PORT"
    --backend openai
    --input-len "$ISL"
    --output-len "$OSL"
    --random-range-ratio "$RANDOM_RANGE_RATIO"
    --num-prompts "$(( CONC * 10 ))"
    --max-concurrency "$CONC"
    --result-filename "$RESULT_FILENAME"
    --result-dir "$PWD/"
    --trust-remote-code
    --use-chat-template
    --dsv4
    --server-pid "$SERVER_PID"
)
run_benchmark_serving "${BENCH_ARGS[@]}"

# Optional lm-eval pass against the same server, followed by its summary.
if [[ "${RUN_EVAL}" == "true" ]]; then
    run_eval --framework lm-eval --port "$PORT"
    append_lm_eval_summary
fi

stop_gpu_monitor
set +x
8 changes: 8 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2283,3 +2283,11 @@
- "8k/1k TP8/EP8: keep block-size=32 and shuffled KV cache disabled; disable AITER MoE (VLLM_ROCM_USE_AITER_MOE=0); disable async scheduling"
- "8k/1k non-TP8/EP8: disable async scheduling through c64; switch to block-size=16 with shuffled KV cache at c64 and above"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1276

- config-keys:
- dsv4-fp4-b200-trt-mtp
description:
- "Add DeepSeek-V4-Pro FP4 B200 TensorRT-LLM MTP coverage using ghcr.io/semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715"
- "Mirror the B200 TRT STP search space with spec-decoding: mtp and TensorRT-LLM MTP num_nextn_predict_layers=2"
- "Benchmark serving uses the DeepSeek-V4 chat template for MTP acceptance-rate correctness"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1294
Loading