From d9298e8bd1c5c7c74251f5b60c1666e0fe5fb252 Mon Sep 17 00:00:00 2001
From: Kedar Potdar <kepotdar@nvidia.com>
Date: Sun, 7 Sep 2025 17:10:42 -0700
Subject: [PATCH 1/7] first commit

---
 .github/workflows/70b-tmpl.yml    | 138 +++++++++++++++---------------
 .github/workflows/dsr1-tmpl.yml   |  70 ++++++++++-----
 benchmarks/dsr1_b200_trt_slurm.sh |  71 +++++++++++++++
 debug_hw.py                       |  18 ++++
 runners/launch_b200-nv.sh         |   4 +-
 utils/plot_perf.py                | 103 ++++++++++++++++------
 6 files changed, 286 insertions(+), 118 deletions(-)
 create mode 100644 benchmarks/dsr1_b200_trt_slurm.sh
 create mode 100644 debug_hw.py

diff --git a/.github/workflows/70b-tmpl.yml b/.github/workflows/70b-tmpl.yml
index 2d820463b..b6fe04a3a 100644
--- a/.github/workflows/70b-tmpl.yml
+++ b/.github/workflows/70b-tmpl.yml
@@ -30,22 +30,22 @@ jobs:
       - name: Find the latest Docker image
         run: echo "Hardcoding image tags for now."
 
-  bmk-h100:
-    needs: find-latest-image
-    uses: ./.github/workflows/benchmark-tmpl.yml
-    secrets: inherit
-    with:
-      exp-name: ${{ inputs.exp-name }}
-      isl: ${{ inputs.isl }}
-      osl: ${{ inputs.osl }}
-      max-model-len: ${{ inputs.max-model-len }}
-      random-range-ratio: ${{ inputs.random-range-ratio }}
-      runner: h100
-      image: 'kedarpotdar147/vllm0.1:latest'
-      model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
-      tp-list: '[2, 4, 8]'
-      timeout: ${{ inputs.timeout }}
-      framework: 'vllm'
+  # bmk-h100:
+  #   needs: find-latest-image
+  #   uses: ./.github/workflows/benchmark-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: ${{ inputs.exp-name }}
+  #     isl: ${{ inputs.isl }}
+  #     osl: ${{ inputs.osl }}
+  #     max-model-len: ${{ inputs.max-model-len }}
+  #     random-range-ratio: ${{ inputs.random-range-ratio }}
+  #     runner: h100
+  #     image: 'kedarpotdar147/vllm0.1:latest'
+  #     model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
+  #     tp-list: '[2, 4, 8]'
+  #     timeout: ${{ inputs.timeout }}
+  #     framework: 'vllm'
 
   bmk-h200:
     needs: find-latest-image
@@ -60,7 +60,7 @@ jobs:
       runner: h200
       image: 'kedarpotdar147/vllm0.1:latest'
       model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
-      tp-list: '[1, 2, 4, 8]'
+      tp-list: '[2]'
       timeout: ${{ inputs.timeout }}
       framework: 'vllm'
 
@@ -77,26 +77,26 @@ jobs:
       runner: h200-trt
       image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2'
       model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
-      tp-list: '[1, 2, 4, 8]'  
+      tp-list: '[2]'  
       timeout: ${{ inputs.timeout }}
       framework: 'trt'
 
-  bmk-b200:
-    needs: find-latest-image
-    uses: ./.github/workflows/benchmark-tmpl.yml
-    secrets: inherit
-    with:
-      exp-name: ${{ inputs.exp-name }}
-      isl: ${{ inputs.isl }}
-      osl: ${{ inputs.osl }}
-      max-model-len: ${{ inputs.max-model-len }}
-      random-range-ratio: ${{ inputs.random-range-ratio }}
-      runner: b200
-      image: 'kedarpotdar147/vllm:05'
-      model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
-      tp-list: '[1, 2]'  
-      timeout: ${{ inputs.timeout }}
-      framework: 'vllm'
+  # bmk-b200:
+  #   needs: find-latest-image
+  #   uses: ./.github/workflows/benchmark-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: ${{ inputs.exp-name }}
+  #     isl: ${{ inputs.isl }}
+  #     osl: ${{ inputs.osl }}
+  #     max-model-len: ${{ inputs.max-model-len }}
+  #     random-range-ratio: ${{ inputs.random-range-ratio }}
+  #     runner: b200
+  #     image: 'kedarpotdar147/vllm:05'
+  #     model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
+  #     tp-list: '[1, 2]'  
+  #     timeout: ${{ inputs.timeout }}
+  #     framework: 'vllm'
 
   bmk-b200-trt:
     needs: find-latest-image
@@ -109,45 +109,45 @@ jobs:
       max-model-len: ${{ inputs.max-model-len }}
       random-range-ratio: ${{ inputs.random-range-ratio }}
       runner: b200-trt
-      image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2'
+      image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post1'
       model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
-      tp-list: '[1, 2]'  
+      tp-list: '[2]'  
       timeout: ${{ inputs.timeout }}
       framework: 'trt'
 
-  bmk-mi300x:
-    needs: find-latest-image
-    uses: ./.github/workflows/benchmark-tmpl.yml
-    secrets: inherit
-    with:
-      exp-name: ${{ inputs.exp-name }}
-      isl: ${{ inputs.isl }}
-      osl: ${{ inputs.osl }}
-      max-model-len: ${{ inputs.max-model-len }}
-      random-range-ratio: ${{ inputs.random-range-ratio }}
-      runner: mi300x
-      image: 'rocm/vllm-dev:nightly_official_0729_rc1_20250718'
-      model: 'amd/Llama-3.3-70B-Instruct-FP8-KV'
-      tp-list: '[1, 2, 4, 8]'
-      timeout: ${{ inputs.timeout }}
-      framework: 'vllm'
+  # bmk-mi300x:
+  #   needs: find-latest-image
+  #   uses: ./.github/workflows/benchmark-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: ${{ inputs.exp-name }}
+  #     isl: ${{ inputs.isl }}
+  #     osl: ${{ inputs.osl }}
+  #     max-model-len: ${{ inputs.max-model-len }}
+  #     random-range-ratio: ${{ inputs.random-range-ratio }}
+  #     runner: mi300x
+  #     image: 'rocm/vllm-dev:nightly_official_0729_rc1_20250718'
+  #     model: 'amd/Llama-3.3-70B-Instruct-FP8-KV'
+  #     tp-list: '[1, 2, 4, 8]'
+  #     timeout: ${{ inputs.timeout }}
+  #     framework: 'vllm'
 
-  bmk-mi325x:
-    needs: find-latest-image
-    uses: ./.github/workflows/benchmark-tmpl.yml
-    secrets: inherit
-    with:
-      exp-name: ${{ inputs.exp-name }}
-      isl: ${{ inputs.isl }}
-      osl: ${{ inputs.osl }}
-      max-model-len: ${{ inputs.max-model-len }}
-      random-range-ratio: ${{ inputs.random-range-ratio }}
-      runner: mi325x
-      image: 'rocm/vllm-dev:nightly_official_0729_rc1_20250718'
-      model: 'amd/Llama-3.3-70B-Instruct-FP8-KV'
-      tp-list: '[1, 2, 4, 8]'
-      timeout: ${{ inputs.timeout }}
-      framework: 'vllm'
+  # bmk-mi325x:
+  #   needs: find-latest-image
+  #   uses: ./.github/workflows/benchmark-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: ${{ inputs.exp-name }}
+  #     isl: ${{ inputs.isl }}
+  #     osl: ${{ inputs.osl }}
+  #     max-model-len: ${{ inputs.max-model-len }}
+  #     random-range-ratio: ${{ inputs.random-range-ratio }}
+  #     runner: mi325x
+  #     image: 'rocm/vllm-dev:nightly_official_0729_rc1_20250718'
+  #     model: 'amd/Llama-3.3-70B-Instruct-FP8-KV'
+  #     tp-list: '[1, 2, 4, 8]'
+  #     timeout: ${{ inputs.timeout }}
+  #     framework: 'vllm'
 
   # bmk-mi355x:
   #   needs: find-latest-image
@@ -167,7 +167,7 @@ jobs:
   #     framework: 'vllm'
 
   collect-results:
-    needs: [bmk-h100, bmk-h200, bmk-h200-trt, bmk-b200-trt, bmk-mi300x, bmk-mi325x] 
+    needs: [ bmk-h200, bmk-h200-trt]  
     if: ${{ always() && !cancelled() }}
     uses: ./.github/workflows/collect-results.yml
     secrets: inherit
diff --git a/.github/workflows/dsr1-tmpl.yml b/.github/workflows/dsr1-tmpl.yml
index 856e501b9..eff277bd1 100644
--- a/.github/workflows/dsr1-tmpl.yml
+++ b/.github/workflows/dsr1-tmpl.yml
@@ -22,6 +22,10 @@ on:
         required: false
         type: number
         default: 180
+      precision:
+        required: false
+        type: string
+        default: fp8
 
 jobs:
   find-latest-image:
@@ -46,6 +50,7 @@ jobs:
       tp-list: '[8]'
       timeout: ${{ inputs.timeout }}
       framework: 'sglang'
+      precision: ${{ inputs.precision }}
 
   bmk-b200:
     needs: find-latest-image
@@ -63,8 +68,9 @@ jobs:
       tp-list: '[8]'
       timeout: ${{ inputs.timeout }}
       framework: 'sglang'
+      precision: ${{ inputs.precision }}
 
-  bmk-mi300x:
+  bmk-b200-trt-fp4:
     needs: find-latest-image
     uses: ./.github/workflows/benchmark-tmpl.yml
     secrets: inherit
@@ -74,29 +80,49 @@ jobs:
       osl: ${{ inputs.osl }}
       max-model-len: ${{ inputs.max-model-len }}
       random-range-ratio: ${{ inputs.random-range-ratio }}
-      runner: mi300x
-      image: 'lmsysorg/sglang:v0.4.9.post2-rocm630-mi30x'
-      model: 'deepseek-ai/DeepSeek-R1-0528'
+      runner: b200-trt
+      image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post1'
+      model: 'nvidia/DeepSeek-R1-0528-FP4'
       tp-list: '[8]'
       timeout: ${{ inputs.timeout }}
-      framework: 'sglang'
+      framework: 'trt'
+      precision: fp4
 
-  bmk-mi325x:
-    needs: find-latest-image
-    uses: ./.github/workflows/benchmark-tmpl.yml
-    secrets: inherit
-    with:
-      exp-name: ${{ inputs.exp-name }}
-      isl: ${{ inputs.isl }}
-      osl: ${{ inputs.osl }}
-      max-model-len: ${{ inputs.max-model-len }}
-      random-range-ratio: ${{ inputs.random-range-ratio }}
-      runner: mi325x
-      image: 'lmsysorg/sglang:v0.4.9.post2-rocm630-mi30x'
-      model: 'deepseek-ai/DeepSeek-R1-0528'
-      tp-list: '[8]'
-      timeout: ${{ inputs.timeout }}
-      framework: 'sglang'
+  # bmk-mi300x:
+  #   needs: find-latest-image
+  #   uses: ./.github/workflows/benchmark-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: ${{ inputs.exp-name }}
+  #     isl: ${{ inputs.isl }}
+  #     osl: ${{ inputs.osl }}
+  #     max-model-len: ${{ inputs.max-model-len }}
+  #     random-range-ratio: ${{ inputs.random-range-ratio }}
+  #     runner: mi300x
+  #     image: 'lmsysorg/sglang:v0.4.9.post2-rocm630-mi30x'
+  #     model: 'deepseek-ai/DeepSeek-R1-0528'
+  #     tp-list: '[8]'
+  #     timeout: ${{ inputs.timeout }}
+  #     framework: 'sglang'
+  #     precision: ${{ inputs.precision }}
+
+  # bmk-mi325x:
+  #   needs: find-latest-image
+  #   uses: ./.github/workflows/benchmark-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: ${{ inputs.exp-name }}
+  #     isl: ${{ inputs.isl }}
+  #     osl: ${{ inputs.osl }}
+  #     max-model-len: ${{ inputs.max-model-len }}
+  #     random-range-ratio: ${{ inputs.random-range-ratio }}
+  #     runner: mi325x
+  #     image: 'lmsysorg/sglang:v0.4.9.post2-rocm630-mi30x'
+  #     model: 'deepseek-ai/DeepSeek-R1-0528'
+  #     tp-list: '[8]'
+  #     timeout: ${{ inputs.timeout }}
+  #     framework: 'sglang'
+  #     precision: ${{ inputs.precision }}
 
   # bmk-mi355x:
   #   needs: find-latest-image
@@ -116,7 +142,7 @@ jobs:
   #     framework: 'sglang'
 
   collect-results:
-    needs: [bmk-h200, bmk-b200, bmk-mi300x, bmk-mi325x] 
+    needs: [bmk-h200, bmk-b200, bmk-b200-trt-fp4, ] 
     if: ${{ always() && !cancelled() }}
     uses: ./.github/workflows/collect-results.yml
     secrets: inherit
diff --git a/benchmarks/dsr1_b200_trt_slurm.sh b/benchmarks/dsr1_b200_trt_slurm.sh
new file mode 100644
index 000000000..c5662ac10
--- /dev/null
+++ b/benchmarks/dsr1_b200_trt_slurm.sh
@@ -0,0 +1,71 @@
+#!/usr/bin/env bash
+
+# === Required Env Vars === 
+# HF_TOKEN
+# HF_HUB_CACHE
+# IMAGE
+# MODEL
+# ISL
+# OSL
+# MAX_MODEL_LEN
+# RANDOM_RANGE_RATIO
+# TP
+# CONC
+# RESULT_FILENAME
+# PORT_OFFSET
+
+echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+
+hf download $MODEL
+SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
+PORT=$(( 8888 + $PORT_OFFSET ))
+
+
+set -x
+
+cat > dsr1-fp4-config.yml << 'EOF'
+enable_attention_dp: false 
+cuda_graph_config: 
+  enable_padding: true 
+  max_batch_size: 256 
+kv_cache_config: 
+  dtype: fp8 
+  enable_block_reuse: false 
+stream_interval: 10
+speculative_config:
+  decoding_type: MTP
+  num_nextn_predict_layers: 3
+moe_config:
+  backend: TRTLLM
+EOF
+
+# Launch TRT-LLM server
+mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --ep_size 8 --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens $MAX_MODEL_LEN --extra_llm_api_options dsr1-fp4-config.yml --port $PORT > $SERVER_LOG 2>&1 &
+
+
+set +x
+while IFS= read -r line; do
+    printf '%s\n' "$line"
+    if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then
+        sleep 5
+        tail -n100 $SERVER_LOG
+        echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME"
+        exit 1
+    fi
+    if [[ "$line" == *"Application startup complete"* ]]; then
+        break
+    fi
+done < <(tail -F -n0 "$SERVER_LOG")
+
+set -x
+git clone https://github.com/kimbochen/bench_serving.git
+python3 bench_serving/benchmark_serving.py \
+--model $MODEL --backend openai \
+--base-url http://0.0.0.0:$PORT \
+--dataset-name random \
+--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \
+--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \
+--request-rate inf --ignore-eos --use-chat-template \
+--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \
+--result-dir /workspace/ \
+--result-filename $RESULT_FILENAME.json
diff --git a/debug_hw.py b/debug_hw.py
new file mode 100644
index 000000000..2fdb69e1c
--- /dev/null
+++ b/debug_hw.py
@@ -0,0 +1,18 @@
+#!/usr/bin/env python3
+import json
+from pathlib import Path
+
+# Test what hw values are actually in the results
+results_dir = Path("results")
+if results_dir.exists():
+    print("Found results directory")
+    for result_path in results_dir.rglob("*.json"):
+        print(f"\nFile: {result_path}")
+        with open(result_path) as f:
+            result = json.load(f)
+        print(f"  hw: '{result.get('hw', 'MISSING')}'")
+        print(f"  framework: '{result.get('framework', 'MISSING')}'")
+        print(f"  precision: '{result.get('precision', 'MISSING')}'")
+        print(f"  model: '{result.get('model', 'MISSING')}'")
+else:
+    print("No results directory found")
diff --git a/runners/launch_b200-nv.sh b/runners/launch_b200-nv.sh
index 933481d72..518f57d9f 100644
--- a/runners/launch_b200-nv.sh
+++ b/runners/launch_b200-nv.sh
@@ -11,9 +11,9 @@ fi
 PARTITION="dgx-b200"
 # Use framework-specific SQSH file
 if [ "$FRAMEWORK" = "trt" ]; then
-    SQUASH_FILE="/raid/image_${MODEL_CODE}_b200_trt-0903.sqsh"
+    SQUASH_FILE="/raid/image_${MODEL_CODE}_b200_trt-0907.sqsh"
 else
-    SQUASH_FILE="/raid/image_${MODEL_CODE}_b200-0903.sqsh"
+    SQUASH_FILE="/raid/image_${MODEL_CODE}_b200-0907.sqsh"
 fi
 
 salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell
diff --git a/utils/plot_perf.py b/utils/plot_perf.py
index 87f7a1ec7..1e108f83e 100644
--- a/utils/plot_perf.py
+++ b/utils/plot_perf.py
@@ -33,10 +33,21 @@ def plot_tput_vs_e2el(precision_filter=None):
         filtered_results = [r for r in results if r.get('precision', 'fp8') == precision_filter]
 
     for hw_label, color in hw_color.items():
-        xs = [result['median_e2el'] for result in filtered_results if result['hw'] == hw_label]
-        ys = [result['tput_per_gpu'] for result in filtered_results if result['hw'] == hw_label]
-        if xs and ys:
-            ax.scatter(xs, ys, label=hw_label.upper(), color=color)
+        # Separate fp8 and fp4 results for this hardware
+        fp8_results = [r for r in filtered_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp8']
+        fp4_results = [r for r in filtered_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp4']
+        
+        # Plot fp8 results with circles
+        if fp8_results:
+            xs_fp8 = [r['median_e2el'] for r in fp8_results]
+            ys_fp8 = [r['tput_per_gpu'] for r in fp8_results]
+            ax.scatter(xs_fp8, ys_fp8, label=f"{hw_label.upper()} (fp8)", color=color, marker='o', s=60)
+        
+        # Plot fp4 results with squares
+        if fp4_results:
+            xs_fp4 = [r['median_e2el'] for r in fp4_results]
+            ys_fp4 = [r['tput_per_gpu'] for r in fp4_results]
+            ax.scatter(xs_fp4, ys_fp4, label=f"{hw_label.upper()} (fp4)", color=color, marker='s', s=60)
 
     for result in filtered_results:
         x, y = result['median_e2el'], result['tput_per_gpu']
@@ -61,10 +72,21 @@ def plot_tput_vs_intvty(precision_filter=None):
         filtered_results = [r for r in results if r.get('precision', 'fp8') == precision_filter]
 
     for hw_label, color in hw_color.items():
-        xs = [result['median_intvty'] for result in filtered_results if result['hw'] == hw_label]
-        ys = [result['tput_per_gpu'] for result in filtered_results if result['hw'] == hw_label]
-        if xs and ys:
-            ax.scatter(xs, ys, label=hw_label.upper(), color=color)
+        # Separate fp8 and fp4 results for this hardware
+        fp8_results = [r for r in filtered_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp8']
+        fp4_results = [r for r in filtered_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp4']
+        
+        # Plot fp8 results with circles
+        if fp8_results:
+            xs_fp8 = [r['median_intvty'] for r in fp8_results]
+            ys_fp8 = [r['tput_per_gpu'] for r in fp8_results]
+            ax.scatter(xs_fp8, ys_fp8, label=f"{hw_label.upper()} (fp8)", color=color, marker='o', s=60)
+        
+        # Plot fp4 results with squares
+        if fp4_results:
+            xs_fp4 = [r['median_intvty'] for r in fp4_results]
+            ys_fp4 = [r['tput_per_gpu'] for r in fp4_results]
+            ax.scatter(xs_fp4, ys_fp4, label=f"{hw_label.upper()} (fp4)", color=color, marker='s', s=60)
 
     for result in filtered_results:
         x, y = result['median_intvty'], result['tput_per_gpu']
@@ -84,10 +106,21 @@ def plot_tput_vs_e2el_for_model(model_results, model_name):
     fig, ax = plt.subplots()
     
     for hw_label, color in hw_color.items():
-        xs = [result['median_e2el'] for result in model_results if result['hw'] == hw_label]
-        ys = [result['tput_per_gpu'] for result in model_results if result['hw'] == hw_label]
-        if xs and ys:
-            ax.scatter(xs, ys, label=hw_label.upper(), color=color)
+        # Separate fp8 and fp4 results for this hardware
+        fp8_results = [r for r in model_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp8']
+        fp4_results = [r for r in model_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp4']
+        
+        # Plot fp8 results with circles
+        if fp8_results:
+            xs_fp8 = [r['median_e2el'] for r in fp8_results]
+            ys_fp8 = [r['tput_per_gpu'] for r in fp8_results]
+            ax.scatter(xs_fp8, ys_fp8, label=f"{hw_label.upper()} (fp8)", color=color, marker='o', s=60)
+        
+        # Plot fp4 results with squares
+        if fp4_results:
+            xs_fp4 = [r['median_e2el'] for r in fp4_results]
+            ys_fp4 = [r['tput_per_gpu'] for r in fp4_results]
+            ax.scatter(xs_fp4, ys_fp4, label=f"{hw_label.upper()} (fp4)", color=color, marker='s', s=60)
 
     for result in model_results:
         x, y = result['median_e2el'], result['tput_per_gpu']
@@ -109,10 +142,21 @@ def plot_tput_vs_intvty_for_model(model_results, model_name):
     fig, ax = plt.subplots()
     
     for hw_label, color in hw_color.items():
-        xs = [result['median_intvty'] for result in model_results if result['hw'] == hw_label]
-        ys = [result['tput_per_gpu'] for result in model_results if result['hw'] == hw_label]
-        if xs and ys:
-            ax.scatter(xs, ys, label=hw_label.upper(), color=color)
+        # Separate fp8 and fp4 results for this hardware
+        fp8_results = [r for r in model_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp8']
+        fp4_results = [r for r in model_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp4']
+        
+        # Plot fp8 results with circles
+        if fp8_results:
+            xs_fp8 = [r['median_intvty'] for r in fp8_results]
+            ys_fp8 = [r['tput_per_gpu'] for r in fp8_results]
+            ax.scatter(xs_fp8, ys_fp8, label=f"{hw_label.upper()} (fp8)", color=color, marker='o', s=60)
+        
+        # Plot fp4 results with squares
+        if fp4_results:
+            xs_fp4 = [r['median_intvty'] for r in fp4_results]
+            ys_fp4 = [r['tput_per_gpu'] for r in fp4_results]
+            ax.scatter(xs_fp4, ys_fp4, label=f"{hw_label.upper()} (fp4)", color=color, marker='s', s=60)
 
     for result in model_results:
         x, y = result['median_intvty'], result['tput_per_gpu']
@@ -131,13 +175,22 @@ def plot_tput_vs_intvty_for_model(model_results, model_name):
 
 
 # Create one plot per model showing all frameworks and hardware
-# Group results by model (70b, dsr1, etc.)
-models = set(r.get('model', 'unknown') for r in results)
-
-for model in models:
-    # Filter results for this model
-    model_results = [r for r in results if r.get('model', 'unknown') == model]
+# Group results by model family (70b, dsr1, etc.) instead of full model name
+def get_model_family(model_name):
+    if '70b' in model_name.lower() or 'llama-3.3-70b' in model_name.lower():
+        return '70b'
+    elif 'dsr1' in model_name.lower() or 'deepseek-r1' in model_name.lower():
+        return 'dsr1'
+    else:
+        # Fallback to first part of model name
+        return model_name.split('/')[-1].split('-')[0] if '/' in model_name else model_name
+
+model_families = set(get_model_family(r.get('model', 'unknown')) for r in results)
+
+for model_family in model_families:
+    # Filter results for this model family
+    model_results = [r for r in results if get_model_family(r.get('model', 'unknown')) == model_family]
     
-    # Create plots for this model
-    plot_tput_vs_e2el_for_model(model_results, model)
-    plot_tput_vs_intvty_for_model(model_results, model)
+    # Create plots for this model family
+    plot_tput_vs_e2el_for_model(model_results, model_family)
+    plot_tput_vs_intvty_for_model(model_results, model_family)

From 4966b0aa1cbba698cc4c266b1bb0a8cee9113ea3 Mon Sep 17 00:00:00 2001
From: Kedar Potdar <kepotdar@nvidia.com>
Date: Sun, 7 Sep 2025 17:12:16 -0700
Subject: [PATCH 2/7] remove 8k tests

---
 .github/workflows/workflow-scheduler.yml | 74 ++++++++++++------------
 1 file changed, 37 insertions(+), 37 deletions(-)

diff --git a/.github/workflows/workflow-scheduler.yml b/.github/workflows/workflow-scheduler.yml
index 70fee5f5a..eab4510ff 100644
--- a/.github/workflows/workflow-scheduler.yml
+++ b/.github/workflows/workflow-scheduler.yml
@@ -30,43 +30,43 @@ jobs:
       max-model-len: 2048
       random-range-ratio: 0.8
 
-  _70b-8k1k:
-    uses: ./.github/workflows/70b-tmpl.yml
-    secrets: inherit
-    with:
-      exp-name: '70b_8k1k'
-      isl: 8192
-      osl: 1024
-      max-model-len: 9216
-      random-range-ratio: 0.8
+  # _70b-8k1k:
+  #   uses: ./.github/workflows/70b-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: '70b_8k1k'
+  #     isl: 8192
+  #     osl: 1024
+  #     max-model-len: 9216
+  #     random-range-ratio: 0.8
   
-  dsr1-8k1k:
-    uses: ./.github/workflows/dsr1-tmpl.yml
-    secrets: inherit
-    with:
-      exp-name: 'dsr1_8k1k'
-      isl: 8192
-      osl: 1024
-      max-model-len: 9216
-      random-range-ratio: 0.8
+  # dsr1-8k1k:
+  #   uses: ./.github/workflows/dsr1-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: 'dsr1_8k1k'
+  #     isl: 8192
+  #     osl: 1024
+  #     max-model-len: 9216
+  #     random-range-ratio: 0.8
 
-  _70b-1k8k:
-    uses: ./.github/workflows/70b-tmpl.yml
-    secrets: inherit
-    with:
-      exp-name: '70b_1k8k'
-      isl: 1024
-      osl: 8192
-      max-model-len: 9216
-      random-range-ratio: 0.8
-      timeout: 240
+  # _70b-1k8k:
+  #   uses: ./.github/workflows/70b-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: '70b_1k8k'
+  #     isl: 1024
+  #     osl: 8192
+  #     max-model-len: 9216
+  #     random-range-ratio: 0.8
+  #     timeout: 240
 
-  dsr1-1k8k:
-    uses: ./.github/workflows/dsr1-tmpl.yml
-    secrets: inherit
-    with:
-      exp-name: 'dsr1_1k8k'
-      isl: 1024
-      osl: 8192
-      max-model-len: 9216
-      random-range-ratio: 0.8
+  # dsr1-1k8k:
+  #   uses: ./.github/workflows/dsr1-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: 'dsr1_1k8k'
+  #     isl: 1024
+  #     osl: 8192
+  #     max-model-len: 9216
+  #     random-range-ratio: 0.8

From bc8716751ab1df7a66804a74004c02a0a4d957a1 Mon Sep 17 00:00:00 2001
From: Kedar Potdar <kepotdar@nvidia.com>
Date: Sun, 7 Sep 2025 17:15:10 -0700
Subject: [PATCH 3/7] remove concurrency lock

---
 .github/workflows/workflow-scheduler.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/workflow-scheduler.yml b/.github/workflows/workflow-scheduler.yml
index eab4510ff..e6fcd95b3 100644
--- a/.github/workflows/workflow-scheduler.yml
+++ b/.github/workflows/workflow-scheduler.yml
@@ -1,13 +1,13 @@
 name: Workflow Scheduler
 
-concurrency:
-  group: benchmark-lock
-  cancel-in-progress: false
+# concurrency:
+#   group: benchmark-lock-v2
+#   cancel-in-progress: false
 
 on:
   workflow_dispatch:
-  schedule:
-    - cron: '0 5 * * *'
+  # schedule:
+  #   - cron: '0 5 * * *'
 
 jobs:
   _70b-1k1k:

From b8e567cf618803905aba99d7e2208c53acd0991f Mon Sep 17 00:00:00 2001
From: Kedar Potdar <kepotdar@nvidia.com>
Date: Sun, 7 Sep 2025 17:15:25 -0700
Subject: [PATCH 4/7] typo in concurrency lock

---
 .github/workflows/workflow-scheduler.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/workflow-scheduler.yml b/.github/workflows/workflow-scheduler.yml
index e6fcd95b3..9011c06bf 100644
--- a/.github/workflows/workflow-scheduler.yml
+++ b/.github/workflows/workflow-scheduler.yml
@@ -1,7 +1,7 @@
 name: Workflow Scheduler
 
 # concurrency:
-#   group: benchmark-lock-v2
+#   group: benchmark-lock
 #   cancel-in-progress: false
 
 on:

From 759c0f6078d7e7102d3790b6fad1476a95997950 Mon Sep 17 00:00:00 2001
From: Kedar Potdar <kepotdar@nvidia.com>
Date: Sun, 7 Sep 2025 19:08:04 -0700
Subject: [PATCH 5/7] enable other tests

---
 .github/workflows/70b-tmpl.yml           | 132 +++++++++++------------
 .github/workflows/dsr1-tmpl.yml          |  71 ++++++------
 .github/workflows/workflow-scheduler.yml |  84 +++++++--------
 3 files changed, 144 insertions(+), 143 deletions(-)

diff --git a/.github/workflows/70b-tmpl.yml b/.github/workflows/70b-tmpl.yml
index b6fe04a3a..250a50ad0 100644
--- a/.github/workflows/70b-tmpl.yml
+++ b/.github/workflows/70b-tmpl.yml
@@ -30,22 +30,22 @@ jobs:
       - name: Find the latest Docker image
         run: echo "Hardcoding image tags for now."
 
-  # bmk-h100:
-  #   needs: find-latest-image
-  #   uses: ./.github/workflows/benchmark-tmpl.yml
-  #   secrets: inherit
-  #   with:
-  #     exp-name: ${{ inputs.exp-name }}
-  #     isl: ${{ inputs.isl }}
-  #     osl: ${{ inputs.osl }}
-  #     max-model-len: ${{ inputs.max-model-len }}
-  #     random-range-ratio: ${{ inputs.random-range-ratio }}
-  #     runner: h100
-  #     image: 'kedarpotdar147/vllm0.1:latest'
-  #     model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
-  #     tp-list: '[2, 4, 8]'
-  #     timeout: ${{ inputs.timeout }}
-  #     framework: 'vllm'
+  bmk-h100:
+    needs: find-latest-image
+    uses: ./.github/workflows/benchmark-tmpl.yml
+    secrets: inherit
+    with:
+      exp-name: ${{ inputs.exp-name }}
+      isl: ${{ inputs.isl }}
+      osl: ${{ inputs.osl }}
+      max-model-len: ${{ inputs.max-model-len }}
+      random-range-ratio: ${{ inputs.random-range-ratio }}
+      runner: h100
+      image: 'kedarpotdar147/vllm0.1:latest'
+      model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
+      tp-list: '[2, 4, 8]'
+      timeout: ${{ inputs.timeout }}
+      framework: 'vllm'
 
   bmk-h200:
     needs: find-latest-image
@@ -81,22 +81,22 @@ jobs:
       timeout: ${{ inputs.timeout }}
       framework: 'trt'
 
-  # bmk-b200:
-  #   needs: find-latest-image
-  #   uses: ./.github/workflows/benchmark-tmpl.yml
-  #   secrets: inherit
-  #   with:
-  #     exp-name: ${{ inputs.exp-name }}
-  #     isl: ${{ inputs.isl }}
-  #     osl: ${{ inputs.osl }}
-  #     max-model-len: ${{ inputs.max-model-len }}
-  #     random-range-ratio: ${{ inputs.random-range-ratio }}
-  #     runner: b200
-  #     image: 'kedarpotdar147/vllm:05'
-  #     model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
-  #     tp-list: '[1, 2]'  
-  #     timeout: ${{ inputs.timeout }}
-  #     framework: 'vllm'
+  bmk-b200:
+    needs: find-latest-image
+    uses: ./.github/workflows/benchmark-tmpl.yml
+    secrets: inherit
+    with:
+      exp-name: ${{ inputs.exp-name }}
+      isl: ${{ inputs.isl }}
+      osl: ${{ inputs.osl }}
+      max-model-len: ${{ inputs.max-model-len }}
+      random-range-ratio: ${{ inputs.random-range-ratio }}
+      runner: b200
+      image: 'kedarpotdar147/vllm:05'
+      model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
+      tp-list: '[1, 2]'  
+      timeout: ${{ inputs.timeout }}
+      framework: 'vllm'
 
   bmk-b200-trt:
     needs: find-latest-image
@@ -111,43 +111,43 @@ jobs:
       runner: b200-trt
       image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post1'
       model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
-      tp-list: '[2]'  
+      tp-list: '[1,2]'  
       timeout: ${{ inputs.timeout }}
       framework: 'trt'
 
-  # bmk-mi300x:
-  #   needs: find-latest-image
-  #   uses: ./.github/workflows/benchmark-tmpl.yml
-  #   secrets: inherit
-  #   with:
-  #     exp-name: ${{ inputs.exp-name }}
-  #     isl: ${{ inputs.isl }}
-  #     osl: ${{ inputs.osl }}
-  #     max-model-len: ${{ inputs.max-model-len }}
-  #     random-range-ratio: ${{ inputs.random-range-ratio }}
-  #     runner: mi300x
-  #     image: 'rocm/vllm-dev:nightly_official_0729_rc1_20250718'
-  #     model: 'amd/Llama-3.3-70B-Instruct-FP8-KV'
-  #     tp-list: '[1, 2, 4, 8]'
-  #     timeout: ${{ inputs.timeout }}
-  #     framework: 'vllm'
+  bmk-mi300x:
+    needs: find-latest-image
+    uses: ./.github/workflows/benchmark-tmpl.yml
+    secrets: inherit
+    with:
+      exp-name: ${{ inputs.exp-name }}
+      isl: ${{ inputs.isl }}
+      osl: ${{ inputs.osl }}
+      max-model-len: ${{ inputs.max-model-len }}
+      random-range-ratio: ${{ inputs.random-range-ratio }}
+      runner: mi300x
+      image: 'rocm/vllm-dev:nightly_official_0729_rc1_20250718'
+      model: 'amd/Llama-3.3-70B-Instruct-FP8-KV'
+      tp-list: '[1, 2, 4, 8]'
+      timeout: ${{ inputs.timeout }}
+      framework: 'vllm'
 
-  # bmk-mi325x:
-  #   needs: find-latest-image
-  #   uses: ./.github/workflows/benchmark-tmpl.yml
-  #   secrets: inherit
-  #   with:
-  #     exp-name: ${{ inputs.exp-name }}
-  #     isl: ${{ inputs.isl }}
-  #     osl: ${{ inputs.osl }}
-  #     max-model-len: ${{ inputs.max-model-len }}
-  #     random-range-ratio: ${{ inputs.random-range-ratio }}
-  #     runner: mi325x
-  #     image: 'rocm/vllm-dev:nightly_official_0729_rc1_20250718'
-  #     model: 'amd/Llama-3.3-70B-Instruct-FP8-KV'
-  #     tp-list: '[1, 2, 4, 8]'
-  #     timeout: ${{ inputs.timeout }}
-  #     framework: 'vllm'
+  bmk-mi325x:
+    needs: find-latest-image
+    uses: ./.github/workflows/benchmark-tmpl.yml
+    secrets: inherit
+    with:
+      exp-name: ${{ inputs.exp-name }}
+      isl: ${{ inputs.isl }}
+      osl: ${{ inputs.osl }}
+      max-model-len: ${{ inputs.max-model-len }}
+      random-range-ratio: ${{ inputs.random-range-ratio }}
+      runner: mi325x
+      image: 'rocm/vllm-dev:nightly_official_0729_rc1_20250718'
+      model: 'amd/Llama-3.3-70B-Instruct-FP8-KV'
+      tp-list: '[1, 2, 4, 8]'
+      timeout: ${{ inputs.timeout }}
+      framework: 'vllm'
 
   # bmk-mi355x:
   #   needs: find-latest-image
@@ -167,7 +167,7 @@ jobs:
   #     framework: 'vllm'
 
   collect-results:
-    needs: [ bmk-h200, bmk-h200-trt]  
+    needs: [ bmk-h200, bmk-h200-trt, bmk-b200, bmk-b200-trt, bmk-mi300x, bmk-mi325x]  
     if: ${{ always() && !cancelled() }}
     uses: ./.github/workflows/collect-results.yml
     secrets: inherit
diff --git a/.github/workflows/dsr1-tmpl.yml b/.github/workflows/dsr1-tmpl.yml
index eff277bd1..7ede786ba 100644
--- a/.github/workflows/dsr1-tmpl.yml
+++ b/.github/workflows/dsr1-tmpl.yml
@@ -88,41 +88,41 @@ jobs:
       framework: 'trt'
       precision: fp4
 
-  # bmk-mi300x:
-  #   needs: find-latest-image
-  #   uses: ./.github/workflows/benchmark-tmpl.yml
-  #   secrets: inherit
-  #   with:
-  #     exp-name: ${{ inputs.exp-name }}
-  #     isl: ${{ inputs.isl }}
-  #     osl: ${{ inputs.osl }}
-  #     max-model-len: ${{ inputs.max-model-len }}
-  #     random-range-ratio: ${{ inputs.random-range-ratio }}
-  #     runner: mi300x
-  #     image: 'lmsysorg/sglang:v0.4.9.post2-rocm630-mi30x'
-  #     model: 'deepseek-ai/DeepSeek-R1-0528'
-  #     tp-list: '[8]'
-  #     timeout: ${{ inputs.timeout }}
-  #     framework: 'sglang'
-  #     precision: ${{ inputs.precision }}
+  bmk-mi300x:
+    needs: find-latest-image
+    uses: ./.github/workflows/benchmark-tmpl.yml
+    secrets: inherit
+    with:
+      exp-name: ${{ inputs.exp-name }}
+      isl: ${{ inputs.isl }}
+      osl: ${{ inputs.osl }}
+      max-model-len: ${{ inputs.max-model-len }}
+      random-range-ratio: ${{ inputs.random-range-ratio }}
+      runner: mi300x
+      image: 'lmsysorg/sglang:v0.4.9.post2-rocm630-mi30x'
+      model: 'deepseek-ai/DeepSeek-R1-0528'
+      tp-list: '[8]'
+      timeout: ${{ inputs.timeout }}
+      framework: 'sglang'
+      precision: ${{ inputs.precision }}
 
-  # bmk-mi325x:
-  #   needs: find-latest-image
-  #   uses: ./.github/workflows/benchmark-tmpl.yml
-  #   secrets: inherit
-  #   with:
-  #     exp-name: ${{ inputs.exp-name }}
-  #     isl: ${{ inputs.isl }}
-  #     osl: ${{ inputs.osl }}
-  #     max-model-len: ${{ inputs.max-model-len }}
-  #     random-range-ratio: ${{ inputs.random-range-ratio }}
-  #     runner: mi325x
-  #     image: 'lmsysorg/sglang:v0.4.9.post2-rocm630-mi30x'
-  #     model: 'deepseek-ai/DeepSeek-R1-0528'
-  #     tp-list: '[8]'
-  #     timeout: ${{ inputs.timeout }}
-  #     framework: 'sglang'
-  #     precision: ${{ inputs.precision }}
+  bmk-mi325x:
+    needs: find-latest-image
+    uses: ./.github/workflows/benchmark-tmpl.yml
+    secrets: inherit
+    with:
+      exp-name: ${{ inputs.exp-name }}
+      isl: ${{ inputs.isl }}
+      osl: ${{ inputs.osl }}
+      max-model-len: ${{ inputs.max-model-len }}
+      random-range-ratio: ${{ inputs.random-range-ratio }}
+      runner: mi325x
+      image: 'lmsysorg/sglang:v0.4.9.post2-rocm630-mi30x'
+      model: 'deepseek-ai/DeepSeek-R1-0528'
+      tp-list: '[8]'
+      timeout: ${{ inputs.timeout }}
+      framework: 'sglang'
+      precision: ${{ inputs.precision }}
 
   # bmk-mi355x:
   #   needs: find-latest-image
@@ -140,9 +140,10 @@ jobs:
   #     tp-list: '[8]'
   #     timeout: ${{ inputs.timeout }}
   #     framework: 'sglang'
+  #     precision: ${{ inputs.precision }}
 
   collect-results:
-    needs: [bmk-h200, bmk-b200, bmk-b200-trt-fp4, ] 
+    needs: [bmk-h200, bmk-b200, bmk-b200-trt-fp4, bmk-mi300x, bmk-mi325x]
     if: ${{ always() && !cancelled() }}
     uses: ./.github/workflows/collect-results.yml
     secrets: inherit
diff --git a/.github/workflows/workflow-scheduler.yml b/.github/workflows/workflow-scheduler.yml
index 9011c06bf..70fee5f5a 100644
--- a/.github/workflows/workflow-scheduler.yml
+++ b/.github/workflows/workflow-scheduler.yml
@@ -1,13 +1,13 @@
 name: Workflow Scheduler
 
-# concurrency:
-#   group: benchmark-lock
-#   cancel-in-progress: false
+concurrency:
+  group: benchmark-lock
+  cancel-in-progress: false
 
 on:
   workflow_dispatch:
-  # schedule:
-  #   - cron: '0 5 * * *'
+  schedule:
+    - cron: '0 5 * * *'
 
 jobs:
   _70b-1k1k:
@@ -30,43 +30,43 @@ jobs:
       max-model-len: 2048
       random-range-ratio: 0.8
 
-  # _70b-8k1k:
-  #   uses: ./.github/workflows/70b-tmpl.yml
-  #   secrets: inherit
-  #   with:
-  #     exp-name: '70b_8k1k'
-  #     isl: 8192
-  #     osl: 1024
-  #     max-model-len: 9216
-  #     random-range-ratio: 0.8
+  _70b-8k1k:
+    uses: ./.github/workflows/70b-tmpl.yml
+    secrets: inherit
+    with:
+      exp-name: '70b_8k1k'
+      isl: 8192
+      osl: 1024
+      max-model-len: 9216
+      random-range-ratio: 0.8
   
-  # dsr1-8k1k:
-  #   uses: ./.github/workflows/dsr1-tmpl.yml
-  #   secrets: inherit
-  #   with:
-  #     exp-name: 'dsr1_8k1k'
-  #     isl: 8192
-  #     osl: 1024
-  #     max-model-len: 9216
-  #     random-range-ratio: 0.8
+  dsr1-8k1k:
+    uses: ./.github/workflows/dsr1-tmpl.yml
+    secrets: inherit
+    with:
+      exp-name: 'dsr1_8k1k'
+      isl: 8192
+      osl: 1024
+      max-model-len: 9216
+      random-range-ratio: 0.8
 
-  # _70b-1k8k:
-  #   uses: ./.github/workflows/70b-tmpl.yml
-  #   secrets: inherit
-  #   with:
-  #     exp-name: '70b_1k8k'
-  #     isl: 1024
-  #     osl: 8192
-  #     max-model-len: 9216
-  #     random-range-ratio: 0.8
-  #     timeout: 240
+  _70b-1k8k:
+    uses: ./.github/workflows/70b-tmpl.yml
+    secrets: inherit
+    with:
+      exp-name: '70b_1k8k'
+      isl: 1024
+      osl: 8192
+      max-model-len: 9216
+      random-range-ratio: 0.8
+      timeout: 240
 
-  # dsr1-1k8k:
-  #   uses: ./.github/workflows/dsr1-tmpl.yml
-  #   secrets: inherit
-  #   with:
-  #     exp-name: 'dsr1_1k8k'
-  #     isl: 1024
-  #     osl: 8192
-  #     max-model-len: 9216
-  #     random-range-ratio: 0.8
+  dsr1-1k8k:
+    uses: ./.github/workflows/dsr1-tmpl.yml
+    secrets: inherit
+    with:
+      exp-name: 'dsr1_1k8k'
+      isl: 1024
+      osl: 8192
+      max-model-len: 9216
+      random-range-ratio: 0.8

From 294fe5f2712d98b7ebc529e7f1027a7d63613472 Mon Sep 17 00:00:00 2001
From: Kedar Potdar <kepotdar@nvidia.com>
Date: Sun, 7 Sep 2025 19:55:39 -0700
Subject: [PATCH 6/7] remove MTP speculative decoding and --use-chat-template

---
 benchmarks/dsr1_b200_trt_slurm.sh | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/benchmarks/dsr1_b200_trt_slurm.sh b/benchmarks/dsr1_b200_trt_slurm.sh
index c5662ac10..417313212 100644
--- a/benchmarks/dsr1_b200_trt_slurm.sh
+++ b/benchmarks/dsr1_b200_trt_slurm.sh
@@ -32,9 +32,6 @@ kv_cache_config:
   dtype: fp8 
   enable_block_reuse: false 
 stream_interval: 10
-speculative_config:
-  decoding_type: MTP
-  num_nextn_predict_layers: 3
 moe_config:
   backend: TRTLLM
 EOF
@@ -65,7 +62,7 @@ python3 bench_serving/benchmark_serving.py \
 --dataset-name random \
 --random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \
 --num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \
---request-rate inf --ignore-eos --use-chat-template \
+--request-rate inf --ignore-eos \
 --save-result --percentile-metrics 'ttft,tpot,itl,e2el' \
 --result-dir /workspace/ \
 --result-filename $RESULT_FILENAME.json

From a5df32918fba0605d3b10b0261e7e36c1b5124d3 Mon Sep 17 00:00:00 2001
From: Kedar Potdar <kepotdar@nvidia.com>
Date: Sun, 7 Sep 2025 21:30:47 -0700
Subject: [PATCH 7/7] set tp-list to [1, 2, 4, 8] for h200, h200-trt and b200

---
 .github/workflows/70b-tmpl.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/70b-tmpl.yml b/.github/workflows/70b-tmpl.yml
index 250a50ad0..968f8e1cc 100644
--- a/.github/workflows/70b-tmpl.yml
+++ b/.github/workflows/70b-tmpl.yml
@@ -60,7 +60,7 @@ jobs:
       runner: h200
       image: 'kedarpotdar147/vllm0.1:latest'
       model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
-      tp-list: '[2]'
+      tp-list: '[1, 2, 4, 8]'
       timeout: ${{ inputs.timeout }}
       framework: 'vllm'
 
@@ -77,7 +77,7 @@ jobs:
       runner: h200-trt
       image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2'
       model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
-      tp-list: '[2]'  
+      tp-list: '[1, 2, 4, 8]'
       timeout: ${{ inputs.timeout }}
       framework: 'trt'
 
@@ -94,7 +94,7 @@ jobs:
       runner: b200
       image: 'kedarpotdar147/vllm:05'
       model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
-      tp-list: '[1, 2]'  
+      tp-list: '[1, 2, 4, 8]'
       timeout: ${{ inputs.timeout }}
       framework: 'vllm'