Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .github/workflows/70b-tmpl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ jobs:
runner: b200
image: 'kedarpotdar147/vllm:05'
model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
tp-list: '[1, 2]'
tp-list: '[1, 2, 4, 8]'
timeout: ${{ inputs.timeout }}
framework: 'vllm'

Expand All @@ -109,9 +109,9 @@ jobs:
max-model-len: ${{ inputs.max-model-len }}
random-range-ratio: ${{ inputs.random-range-ratio }}
runner: b200-trt
image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2'
image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post1'
model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
tp-list: '[1, 2]'
tp-list: '[1,2]'
timeout: ${{ inputs.timeout }}
framework: 'trt'

Expand Down Expand Up @@ -167,7 +167,7 @@ jobs:
# framework: 'vllm'

collect-results:
needs: [bmk-h100, bmk-h200, bmk-h200-trt, bmk-b200-trt, bmk-mi300x, bmk-mi325x]
needs: [ bmk-h200, bmk-h200-trt, bmk-b200, bmk-b200-trt, bmk-mi300x, bmk-mi325x]
if: ${{ always() && !cancelled() }}
uses: ./.github/workflows/collect-results.yml
secrets: inherit
Expand Down
29 changes: 28 additions & 1 deletion .github/workflows/dsr1-tmpl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ on:
required: false
type: number
default: 180
# Optional precision label (string); defaults to fp8 when the caller does not
# set it. Jobs in this workflow forward it as `${{ inputs.precision }}`.
precision:
required: false
type: string
default: fp8

jobs:
find-latest-image:
Expand All @@ -46,6 +50,7 @@ jobs:
tp-list: '[8]'
timeout: ${{ inputs.timeout }}
framework: 'sglang'
precision: ${{ inputs.precision }}

bmk-b200:
needs: find-latest-image
Expand All @@ -63,6 +68,25 @@ jobs:
tp-list: '[8]'
timeout: ${{ inputs.timeout }}
framework: 'sglang'
precision: ${{ inputs.precision }}

# Benchmark job for the FP4 DeepSeek-R1 checkpoint using the 'trt' framework
# on the b200-trt runner. Runs the shared benchmark template after
# find-latest-image has finished.
bmk-b200-trt-fp4:
needs: find-latest-image
uses: ./.github/workflows/benchmark-tmpl.yml
secrets: inherit
with:
# Workload shape and experiment name are passed straight through from the
# workflow inputs.
exp-name: ${{ inputs.exp-name }}
isl: ${{ inputs.isl }}
osl: ${{ inputs.osl }}
max-model-len: ${{ inputs.max-model-len }}
random-range-ratio: ${{ inputs.random-range-ratio }}
runner: b200-trt
# NOTE(review): the '#' after the registry host looks deliberate (it matches
# the image string used for the 70b TRT job) - confirm the runner expects it.
image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post1'
model: 'nvidia/DeepSeek-R1-0528-FP4'
# Only a single tensor-parallel size (8) is benchmarked here.
tp-list: '[8]'
timeout: ${{ inputs.timeout }}
framework: 'trt'
# Pinned to fp4 to match the FP4 model above rather than forwarding
# inputs.precision.
precision: fp4

bmk-mi300x:
needs: find-latest-image
Expand All @@ -80,6 +104,7 @@ jobs:
tp-list: '[8]'
timeout: ${{ inputs.timeout }}
framework: 'sglang'
precision: ${{ inputs.precision }}

bmk-mi325x:
needs: find-latest-image
Expand All @@ -97,6 +122,7 @@ jobs:
tp-list: '[8]'
timeout: ${{ inputs.timeout }}
framework: 'sglang'
precision: ${{ inputs.precision }}

# bmk-mi355x:
# needs: find-latest-image
Expand All @@ -114,9 +140,10 @@ jobs:
# tp-list: '[8]'
# timeout: ${{ inputs.timeout }}
# framework: 'sglang'
# precision: ${{ inputs.precision }}

collect-results:
needs: [bmk-h200, bmk-b200, bmk-mi300x, bmk-mi325x]
needs: [bmk-h200, bmk-b200, bmk-b200-trt-fp4, bmk-mi300x, bmk-mi325x]
if: ${{ always() && !cancelled() }}
uses: ./.github/workflows/collect-results.yml
secrets: inherit
Expand Down
68 changes: 68 additions & 0 deletions benchmarks/dsr1_b200_trt_slurm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#!/usr/bin/env bash
#
# Serve a model with TensorRT-LLM (trtllm-serve) inside a Slurm job, wait for
# the server to report startup, then run the random-dataset serving benchmark
# against it and save the result as JSON under /workspace/.

# === Required Env Vars ===
# HF_TOKEN
# HF_HUB_CACHE
# IMAGE
# MODEL
# ISL
# OSL
# MAX_MODEL_LEN
# RANDOM_RANGE_RATIO
# TP
# CONC
# RESULT_FILENAME
# PORT_OFFSET

echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"

hf download "$MODEL"
SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
# PORT_OFFSET is added to the base port 8888 to pick the serving port.
PORT=$(( 8888 + PORT_OFFSET ))

set -x

# Extra LLM API options handed to trtllm-serve. The delimiter is quoted so the
# YAML is written verbatim, with no shell expansion.
cat > dsr1-fp4-config.yml << 'EOF'
enable_attention_dp: false
cuda_graph_config:
    enable_padding: true
    max_batch_size: 256
kv_cache_config:
    dtype: fp8
    enable_block_reuse: false
stream_interval: 10
moe_config:
    backend: TRTLLM
EOF

# Launch TRT-LLM server in the background; all output goes to $SERVER_LOG.
# NOTE(review): --ep_size is hard-coded to 8 while --tp_size follows $TP;
# confirm this is intended for TP values other than 8.
mpirun -n 1 --oversubscribe --allow-run-as-root \
    trtllm-serve "$MODEL" \
    --tp_size "$TP" --ep_size 8 \
    --trust_remote_code \
    --max_seq_len "$MAX_MODEL_LEN" --max_num_tokens "$MAX_MODEL_LEN" \
    --extra_llm_api_options dsr1-fp4-config.yml \
    --port "$PORT" > "$SERVER_LOG" 2>&1 &

set +x
# Follow the server log line by line: echo each line, abort the job on the
# first line containing "error" (any letter case), and move on once the server
# prints its startup-complete message.
while IFS= read -r line; do
    printf '%s\n' "$line"
    if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then
        # Presumably gives the server time to flush the rest of its error
        # output before the tail of the log is dumped.
        sleep 5
        tail -n100 "$SERVER_LOG"
        echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME"
        exit 1
    fi
    if [[ "$line" == *"Application startup complete"* ]]; then
        break
    fi
done < <(tail -F -n0 "$SERVER_LOG")

set -x
# Reuse an existing checkout instead of letting `git clone` fail on it.
if [[ ! -d bench_serving ]]; then
    git clone https://github.com/kimbochen/bench_serving.git
fi
python3 bench_serving/benchmark_serving.py \
    --model "$MODEL" --backend openai \
    --base-url "http://0.0.0.0:$PORT" \
    --dataset-name random \
    --random-input-len "$ISL" --random-output-len "$OSL" --random-range-ratio "$RANDOM_RANGE_RATIO" \
    --num-prompts "$(( CONC * 10 ))" --max-concurrency "$CONC" \
    --request-rate inf --ignore-eos \
    --save-result --percentile-metrics 'ttft,tpot,itl,e2el' \
    --result-dir /workspace/ \
    --result-filename "$RESULT_FILENAME.json"
18 changes: 18 additions & 0 deletions debug_hw.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/usr/bin/env python3
import json
from pathlib import Path

# Debug aid: for every JSON file under ./results, print the hw, framework,
# precision and model fields it contains ('MISSING' when a field is absent).
results_dir = Path("results")
if not results_dir.exists():
    print("No results directory found")
else:
    print("Found results directory")
    for result_path in results_dir.rglob("*.json"):
        print(f"\nFile: {result_path}")
        with open(result_path) as f:
            result = json.load(f)
        # One line per inspected field, in a fixed order.
        for field in ("hw", "framework", "precision", "model"):
            print(f" {field}: '{result.get(field, 'MISSING')}'")
4 changes: 2 additions & 2 deletions runners/launch_b200-nv.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@ fi
PARTITION="dgx-b200"
# Use framework-specific SQSH file
if [ "$FRAMEWORK" = "trt" ]; then
SQUASH_FILE="/raid/image_${MODEL_CODE}_b200_trt-0903.sqsh"
SQUASH_FILE="/raid/image_${MODEL_CODE}_b200_trt-0907.sqsh"
else
SQUASH_FILE="/raid/image_${MODEL_CODE}_b200-0903.sqsh"
SQUASH_FILE="/raid/image_${MODEL_CODE}_b200-0907.sqsh"
fi

salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell
Expand Down
103 changes: 78 additions & 25 deletions utils/plot_perf.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,21 @@ def plot_tput_vs_e2el(precision_filter=None):
filtered_results = [r for r in results if r.get('precision', 'fp8') == precision_filter]

for hw_label, color in hw_color.items():
xs = [result['median_e2el'] for result in filtered_results if result['hw'] == hw_label]
ys = [result['tput_per_gpu'] for result in filtered_results if result['hw'] == hw_label]
if xs and ys:
ax.scatter(xs, ys, label=hw_label.upper(), color=color)
# Separate fp8 and fp4 results for this hardware
fp8_results = [r for r in filtered_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp8']
fp4_results = [r for r in filtered_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp4']

# Plot fp8 results with circles
if fp8_results:
xs_fp8 = [r['median_e2el'] for r in fp8_results]
ys_fp8 = [r['tput_per_gpu'] for r in fp8_results]
ax.scatter(xs_fp8, ys_fp8, label=f"{hw_label.upper()} (fp8)", color=color, marker='o', s=60)

# Plot fp4 results with squares
if fp4_results:
xs_fp4 = [r['median_e2el'] for r in fp4_results]
ys_fp4 = [r['tput_per_gpu'] for r in fp4_results]
ax.scatter(xs_fp4, ys_fp4, label=f"{hw_label.upper()} (fp4)", color=color, marker='s', s=60)

for result in filtered_results:
x, y = result['median_e2el'], result['tput_per_gpu']
Expand All @@ -61,10 +72,21 @@ def plot_tput_vs_intvty(precision_filter=None):
filtered_results = [r for r in results if r.get('precision', 'fp8') == precision_filter]

for hw_label, color in hw_color.items():
xs = [result['median_intvty'] for result in filtered_results if result['hw'] == hw_label]
ys = [result['tput_per_gpu'] for result in filtered_results if result['hw'] == hw_label]
if xs and ys:
ax.scatter(xs, ys, label=hw_label.upper(), color=color)
# Separate fp8 and fp4 results for this hardware
fp8_results = [r for r in filtered_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp8']
fp4_results = [r for r in filtered_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp4']

# Plot fp8 results with circles
if fp8_results:
xs_fp8 = [r['median_intvty'] for r in fp8_results]
ys_fp8 = [r['tput_per_gpu'] for r in fp8_results]
ax.scatter(xs_fp8, ys_fp8, label=f"{hw_label.upper()} (fp8)", color=color, marker='o', s=60)

# Plot fp4 results with squares
if fp4_results:
xs_fp4 = [r['median_intvty'] for r in fp4_results]
ys_fp4 = [r['tput_per_gpu'] for r in fp4_results]
ax.scatter(xs_fp4, ys_fp4, label=f"{hw_label.upper()} (fp4)", color=color, marker='s', s=60)

for result in filtered_results:
x, y = result['median_intvty'], result['tput_per_gpu']
Expand All @@ -84,10 +106,21 @@ def plot_tput_vs_e2el_for_model(model_results, model_name):
fig, ax = plt.subplots()

for hw_label, color in hw_color.items():
xs = [result['median_e2el'] for result in model_results if result['hw'] == hw_label]
ys = [result['tput_per_gpu'] for result in model_results if result['hw'] == hw_label]
if xs and ys:
ax.scatter(xs, ys, label=hw_label.upper(), color=color)
# Separate fp8 and fp4 results for this hardware
fp8_results = [r for r in model_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp8']
fp4_results = [r for r in model_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp4']

# Plot fp8 results with circles
if fp8_results:
xs_fp8 = [r['median_e2el'] for r in fp8_results]
ys_fp8 = [r['tput_per_gpu'] for r in fp8_results]
ax.scatter(xs_fp8, ys_fp8, label=f"{hw_label.upper()} (fp8)", color=color, marker='o', s=60)

# Plot fp4 results with squares
if fp4_results:
xs_fp4 = [r['median_e2el'] for r in fp4_results]
ys_fp4 = [r['tput_per_gpu'] for r in fp4_results]
ax.scatter(xs_fp4, ys_fp4, label=f"{hw_label.upper()} (fp4)", color=color, marker='s', s=60)

for result in model_results:
x, y = result['median_e2el'], result['tput_per_gpu']
Expand All @@ -109,10 +142,21 @@ def plot_tput_vs_intvty_for_model(model_results, model_name):
fig, ax = plt.subplots()

for hw_label, color in hw_color.items():
xs = [result['median_intvty'] for result in model_results if result['hw'] == hw_label]
ys = [result['tput_per_gpu'] for result in model_results if result['hw'] == hw_label]
if xs and ys:
ax.scatter(xs, ys, label=hw_label.upper(), color=color)
# Separate fp8 and fp4 results for this hardware
fp8_results = [r for r in model_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp8']
fp4_results = [r for r in model_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp4']

# Plot fp8 results with circles
if fp8_results:
xs_fp8 = [r['median_intvty'] for r in fp8_results]
ys_fp8 = [r['tput_per_gpu'] for r in fp8_results]
ax.scatter(xs_fp8, ys_fp8, label=f"{hw_label.upper()} (fp8)", color=color, marker='o', s=60)

# Plot fp4 results with squares
if fp4_results:
xs_fp4 = [r['median_intvty'] for r in fp4_results]
ys_fp4 = [r['tput_per_gpu'] for r in fp4_results]
ax.scatter(xs_fp4, ys_fp4, label=f"{hw_label.upper()} (fp4)", color=color, marker='s', s=60)

for result in model_results:
x, y = result['median_intvty'], result['tput_per_gpu']
Expand All @@ -131,13 +175,22 @@ def plot_tput_vs_intvty_for_model(model_results, model_name):


# Create one plot per model showing all frameworks and hardware
# Group results by model (70b, dsr1, etc.)
models = set(r.get('model', 'unknown') for r in results)

for model in models:
# Filter results for this model
model_results = [r for r in results if r.get('model', 'unknown') == model]
# Group results by model family (70b, dsr1, etc.) instead of full model name
def get_model_family(model_name):
    """Collapse a full model identifier into a short family tag.

    Args:
        model_name: Model identifier string, e.g. 'org/Name-Variant'.

    Returns:
        '70b' if the lower-cased name contains '70b', 'dsr1' if it contains
        'dsr1' or 'deepseek-r1'; otherwise the text before the first '-' of
        the last '/'-separated component, or the name unchanged when it
        contains no '/'.
    """
    lowered = model_name.lower()
    # 'llama-3.3-70b' always contains '70b', so one substring test covers both.
    if '70b' in lowered:
        return '70b'
    if any(tag in lowered for tag in ('dsr1', 'deepseek-r1')):
        return 'dsr1'
    # Names without an organisation prefix are returned untouched.
    if '/' not in model_name:
        return model_name
    basename = model_name.rsplit('/', 1)[-1]
    return basename.split('-', 1)[0]

model_families = set(get_model_family(r.get('model', 'unknown')) for r in results)

for model_family in model_families:
# Filter results for this model family
model_results = [r for r in results if get_model_family(r.get('model', 'unknown')) == model_family]

# Create plots for this model
plot_tput_vs_e2el_for_model(model_results, model)
plot_tput_vs_intvty_for_model(model_results, model)
# Create plots for this model family
plot_tput_vs_e2el_for_model(model_results, model_family)
plot_tput_vs_intvty_for_model(model_results, model_family)