SemiAnalysisAI · seungrokj · Jun 18, 2026 · Jun 12, 2026 · Jun 12, 2026 · Jun 12, 2026
@@ -2261,15 +2261,8 @@ dsv4-fp4-mi355x-vllm-mtp:
       search-space:
       - { tp: 8, conc-start: 4, conc-end: 512, spec-decoding: mtp }
 
-# Day-0 single-sequence marker for DeepSeek-V4 on ATOM (ROCm/ATOM#650).
-# PR1 of the ATOM DSv4 series still uses torch sparse-attention fallbacks
-# that OOM once warmup/prefill batches multiple requests; keep CONC=1 until
-# the AITER sparse-attention kernel / multi-request path lands upstream.
-# --enforce-eager and ATOM_USE_TRITON_MOE=1 are required on gfx950. Image is
-# the standard atom0.1.2.post MI355X base (matching qwen3.5-fp8-mi355x-atom);
-# the DSv4 PR is overlaid at runtime by dsv4_fp4_mi355x_atom.sh at a pinned SHA.
 dsv4-fp4-mi355x-atom:
-  image: rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.3
+  image: rocm/atom-dev:nightly_202606161823
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: mi355x
@@ -2281,13 +2274,20 @@ dsv4-fp4-mi355x-atom:
     - isl: 1024
       osl: 1024
       search-space:
+      # conc 1-64: TP8
+      # conc 64-512: DPA
+      # conc 1024-2048: DPA + TBO (--enable-tbo)
       - { tp: 8, ep: 1, conc-start: 1, conc-end: 64 }
-      - { tp: 8, ep: 1, dp-attn: true, conc-start: 64, conc-end: 1024 }
+      - { tp: 8, ep: 1, dp-attn: true, conc-start: 64, conc-end: 2048 }
     - isl: 8192
       osl: 1024
       search-space:
-      - { tp: 8, ep: 1, conc-start: 1, conc-end: 64 }
-      - { tp: 8, ep: 1, dp-attn: true, conc-start: 64, conc-end: 512 }
+      # conc 1-64: TP8 (TP4 is also swept at conc 8-64)
+      # conc 128: DPA
+      # conc 256-2048: DPA + TBO (--enable-tbo)
+      - { tp: 4, ep: 1, conc-list: [8, 16, 32, 64] }
+      - { tp: 8, ep: 1, conc-list: [1, 2, 4, 8, 16, 32, 64] }
+      - { tp: 8, ep: 1, dp-attn: true, conc-start: 128, conc-end: 2048 }
 
 dsv4-fp4-mi355x-atom-mtp:
   image: rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.3

diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
@@ -22,31 +22,51 @@ echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTIO
 SERVER_LOG=/workspace/server.log
 
 PARALLEL_ARGS=(-tp "$TP") #TP
+CUDAGRAPH_SIZES='[1, 2, 4, 8, 16, 32, 48, 64, 128, 256, 512]'
 if [ "$DP_ATTENTION" = "true" ]; then
     if [ "$EP_SIZE" -gt 1 ]; then #DP+EP
         PARALLEL_ARGS=(-tp "$TP" --enable-expert-parallel --enable-dp-attention )
-    else #DP+TP
-        PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention )
+    else #DPA+TP
+        #DPA+TP+TBO
+        if [ "$ISL" -eq 1024 ] && [ "$OSL" -eq 1024 ] && [ "$CONC" -ge 1024 ]; then
+            PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention --enable-tbo)
+            export GPU_MAX_HW_QUEUES=5
+        elif [ "$ISL" -eq 8192 ] && [ "$OSL" -eq 1024 ] && [ "$CONC" -ge 256 ]; then
+            PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention --enable-tbo)
+            export GPU_MAX_HW_QUEUES=5
+        else
+            PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention )
+        fi
     fi
 fi 
 
+BENCHMARK_MAX_MODEL_LEN="$MAX_MODEL_LEN"
+
+if [ "${EVAL_ONLY}" = "true" ]; then
+    EVAL_MAX_MODEL_LEN=$(compute_eval_context_length "$MODEL" "$BENCHMARK_MAX_MODEL_LEN")
+    export EVAL_MAX_MODEL_LEN
+fi
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
 set -x
 export ATOM_DISABLE_MMAP=true
 export AITER_BF16_FP8_MOE_BOUND=0
 export ATOM_MOE_GU_ITLV=1
-# TODO: add --no-enable_chunked_prefill, when dsv4 prefix caching is supported 
-#https://github.com/ROCm/ATOM/commit/7df93a181da4d3c3250c2441c7d5e2745a03d0cd#diff-61b1ba0b8b74523530d2d5cdc739d4f3a23a43bedf69015a5235844d46e9373bL1127
+MEM_FRAC_STATIC=0.9
+OPT_ARGS=(--hf-overrides '{"use_index_cache": true, "index_topk_freq": 4}')
+
 python3 -m atom.entrypoints.openai_server \
     --model $MODEL \
     --server-port $PORT \
     "${PARALLEL_ARGS[@]}" \
     --kv_cache_dtype fp8 \
     --trust-remote-code \
-    --gpu-memory-utilization 0.85 \
-    > $SERVER_LOG 2>&1 &
+    --gpu-memory-utilization $MEM_FRAC_STATIC \
+    --no-enable_prefix_caching \
+    --cudagraph-capture-sizes "${CUDAGRAPH_SIZES}" \
+    "${OPT_ARGS[@]}" \
+    > "$SERVER_LOG" 2>&1 &
 
 SERVER_PID=$!
 

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -3926,4 +3926,12 @@
     - "Container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.3.0-dev.1-cuda13"
     - "Recipes sourced from NVIDIA/srt-slurm branch sa-submission-q2-2026 (gb300_nvfp4 STP recipes)"
     - "Runner script launch_gb300-nv.sh: added dynamo-trt-specific glm5-fp4 case with SERVED_MODEL_NAME and SRT_SLURM_MODEL_PREFIX=nvidia/GLM-5-NVFP4"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1798
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1798
+
+- config-keys:
+    - dsv4-fp4-mi355x-atom
+  description:
+    - "Update image to rocm/atom-dev:nightly_202606161823"
+    - "Update ISL=8192 search-space: add TP4 at conc=8-64, keep TP8 at conc=1-64, DPA from conc=128-2048 (previously DPA conc=64-512); extend ISL=1024 DPA to conc=2048 (previously 1024)"
+    - "Apply TBO (--enable-tbo) with DPA at high concurrency: conc>=1024 for ISL=1024, conc>=256 for ISL=8192"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1717