From d9298e8bd1c5c7c74251f5b60c1666e0fe5fb252 Mon Sep 17 00:00:00 2001 From: Kedar Potdar Date: Sun, 7 Sep 2025 17:10:42 -0700 Subject: [PATCH 1/7] first commit --- .github/workflows/70b-tmpl.yml | 138 +++++++++++++++--------------- .github/workflows/dsr1-tmpl.yml | 70 ++++++++++----- benchmarks/dsr1_b200_trt_slurm.sh | 71 +++++++++++++++ debug_hw.py | 18 ++++ runners/launch_b200-nv.sh | 4 +- utils/plot_perf.py | 103 ++++++++++++++++------ 6 files changed, 286 insertions(+), 118 deletions(-) create mode 100644 benchmarks/dsr1_b200_trt_slurm.sh create mode 100644 debug_hw.py diff --git a/.github/workflows/70b-tmpl.yml b/.github/workflows/70b-tmpl.yml index 2d820463b..b6fe04a3a 100644 --- a/.github/workflows/70b-tmpl.yml +++ b/.github/workflows/70b-tmpl.yml @@ -30,22 +30,22 @@ jobs: - name: Find the latest Docker image run: echo "Hardcoding image tags for now." - bmk-h100: - needs: find-latest-image - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - runner: h100 - image: 'kedarpotdar147/vllm0.1:latest' - model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - tp-list: '[2, 4, 8]' - timeout: ${{ inputs.timeout }} - framework: 'vllm' + # bmk-h100: + # needs: find-latest-image + # uses: ./.github/workflows/benchmark-tmpl.yml + # secrets: inherit + # with: + # exp-name: ${{ inputs.exp-name }} + # isl: ${{ inputs.isl }} + # osl: ${{ inputs.osl }} + # max-model-len: ${{ inputs.max-model-len }} + # random-range-ratio: ${{ inputs.random-range-ratio }} + # runner: h100 + # image: 'kedarpotdar147/vllm0.1:latest' + # model: 'nvidia/Llama-3.3-70B-Instruct-FP8' + # tp-list: '[2, 4, 8]' + # timeout: ${{ inputs.timeout }} + # framework: 'vllm' bmk-h200: needs: find-latest-image @@ -60,7 +60,7 @@ jobs: runner: h200 image: 'kedarpotdar147/vllm0.1:latest' model: 
'nvidia/Llama-3.3-70B-Instruct-FP8' - tp-list: '[1, 2, 4, 8]' + tp-list: '[2]' timeout: ${{ inputs.timeout }} framework: 'vllm' @@ -77,26 +77,26 @@ jobs: runner: h200-trt image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2' model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - tp-list: '[1, 2, 4, 8]' + tp-list: '[2]' timeout: ${{ inputs.timeout }} framework: 'trt' - bmk-b200: - needs: find-latest-image - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - runner: b200 - image: 'kedarpotdar147/vllm:05' - model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - tp-list: '[1, 2]' - timeout: ${{ inputs.timeout }} - framework: 'vllm' + # bmk-b200: + # needs: find-latest-image + # uses: ./.github/workflows/benchmark-tmpl.yml + # secrets: inherit + # with: + # exp-name: ${{ inputs.exp-name }} + # isl: ${{ inputs.isl }} + # osl: ${{ inputs.osl }} + # max-model-len: ${{ inputs.max-model-len }} + # random-range-ratio: ${{ inputs.random-range-ratio }} + # runner: b200 + # image: 'kedarpotdar147/vllm:05' + # model: 'nvidia/Llama-3.3-70B-Instruct-FP8' + # tp-list: '[1, 2]' + # timeout: ${{ inputs.timeout }} + # framework: 'vllm' bmk-b200-trt: needs: find-latest-image @@ -109,45 +109,45 @@ jobs: max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} runner: b200-trt - image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2' + image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post1' model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - tp-list: '[1, 2]' + tp-list: '[2]' timeout: ${{ inputs.timeout }} framework: 'trt' - bmk-mi300x: - needs: find-latest-image - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - 
random-range-ratio: ${{ inputs.random-range-ratio }} - runner: mi300x - image: 'rocm/vllm-dev:nightly_official_0729_rc1_20250718' - model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' - tp-list: '[1, 2, 4, 8]' - timeout: ${{ inputs.timeout }} - framework: 'vllm' + # bmk-mi300x: + # needs: find-latest-image + # uses: ./.github/workflows/benchmark-tmpl.yml + # secrets: inherit + # with: + # exp-name: ${{ inputs.exp-name }} + # isl: ${{ inputs.isl }} + # osl: ${{ inputs.osl }} + # max-model-len: ${{ inputs.max-model-len }} + # random-range-ratio: ${{ inputs.random-range-ratio }} + # runner: mi300x + # image: 'rocm/vllm-dev:nightly_official_0729_rc1_20250718' + # model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' + # tp-list: '[1, 2, 4, 8]' + # timeout: ${{ inputs.timeout }} + # framework: 'vllm' - bmk-mi325x: - needs: find-latest-image - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - runner: mi325x - image: 'rocm/vllm-dev:nightly_official_0729_rc1_20250718' - model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' - tp-list: '[1, 2, 4, 8]' - timeout: ${{ inputs.timeout }} - framework: 'vllm' + # bmk-mi325x: + # needs: find-latest-image + # uses: ./.github/workflows/benchmark-tmpl.yml + # secrets: inherit + # with: + # exp-name: ${{ inputs.exp-name }} + # isl: ${{ inputs.isl }} + # osl: ${{ inputs.osl }} + # max-model-len: ${{ inputs.max-model-len }} + # random-range-ratio: ${{ inputs.random-range-ratio }} + # runner: mi325x + # image: 'rocm/vllm-dev:nightly_official_0729_rc1_20250718' + # model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' + # tp-list: '[1, 2, 4, 8]' + # timeout: ${{ inputs.timeout }} + # framework: 'vllm' # bmk-mi355x: # needs: find-latest-image @@ -167,7 +167,7 @@ jobs: # framework: 'vllm' collect-results: - needs: [bmk-h100, bmk-h200, bmk-h200-trt, bmk-b200-trt, bmk-mi300x, 
bmk-mi325x] + needs: [ bmk-h200, bmk-h200-trt] if: ${{ always() && !cancelled() }} uses: ./.github/workflows/collect-results.yml secrets: inherit diff --git a/.github/workflows/dsr1-tmpl.yml b/.github/workflows/dsr1-tmpl.yml index 856e501b9..eff277bd1 100644 --- a/.github/workflows/dsr1-tmpl.yml +++ b/.github/workflows/dsr1-tmpl.yml @@ -22,6 +22,10 @@ on: required: false type: number default: 180 + precision: + required: false + type: string + default: fp8 jobs: find-latest-image: @@ -46,6 +50,7 @@ jobs: tp-list: '[8]' timeout: ${{ inputs.timeout }} framework: 'sglang' + precision: ${{ inputs.precision }} bmk-b200: needs: find-latest-image @@ -63,8 +68,9 @@ jobs: tp-list: '[8]' timeout: ${{ inputs.timeout }} framework: 'sglang' + precision: ${{ inputs.precision }} - bmk-mi300x: + bmk-b200-trt-fp4: needs: find-latest-image uses: ./.github/workflows/benchmark-tmpl.yml secrets: inherit @@ -74,29 +80,49 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - runner: mi300x - image: 'lmsysorg/sglang:v0.4.9.post2-rocm630-mi30x' - model: 'deepseek-ai/DeepSeek-R1-0528' + runner: b200-trt + image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post1' + model: 'nvidia/DeepSeek-R1-0528-FP4' tp-list: '[8]' timeout: ${{ inputs.timeout }} - framework: 'sglang' + framework: 'trt' + precision: fp4 - bmk-mi325x: - needs: find-latest-image - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - runner: mi325x - image: 'lmsysorg/sglang:v0.4.9.post2-rocm630-mi30x' - model: 'deepseek-ai/DeepSeek-R1-0528' - tp-list: '[8]' - timeout: ${{ inputs.timeout }} - framework: 'sglang' + # bmk-mi300x: + # needs: find-latest-image + # uses: ./.github/workflows/benchmark-tmpl.yml + # secrets: inherit + # with: + # exp-name: 
${{ inputs.exp-name }} + # isl: ${{ inputs.isl }} + # osl: ${{ inputs.osl }} + # max-model-len: ${{ inputs.max-model-len }} + # random-range-ratio: ${{ inputs.random-range-ratio }} + # runner: mi300x + # image: 'lmsysorg/sglang:v0.4.9.post2-rocm630-mi30x' + # model: 'deepseek-ai/DeepSeek-R1-0528' + # tp-list: '[8]' + # timeout: ${{ inputs.timeout }} + # framework: 'sglang' + # precision: ${{ inputs.precision }} + + # bmk-mi325x: + # needs: find-latest-image + # uses: ./.github/workflows/benchmark-tmpl.yml + # secrets: inherit + # with: + # exp-name: ${{ inputs.exp-name }} + # isl: ${{ inputs.isl }} + # osl: ${{ inputs.osl }} + # max-model-len: ${{ inputs.max-model-len }} + # random-range-ratio: ${{ inputs.random-range-ratio }} + # runner: mi325x + # image: 'lmsysorg/sglang:v0.4.9.post2-rocm630-mi30x' + # model: 'deepseek-ai/DeepSeek-R1-0528' + # tp-list: '[8]' + # timeout: ${{ inputs.timeout }} + # framework: 'sglang' + # precision: ${{ inputs.precision }} # bmk-mi355x: # needs: find-latest-image @@ -116,7 +142,7 @@ jobs: # framework: 'sglang' collect-results: - needs: [bmk-h200, bmk-b200, bmk-mi300x, bmk-mi325x] + needs: [bmk-h200, bmk-b200, bmk-b200-trt-fp4, ] if: ${{ always() && !cancelled() }} uses: ./.github/workflows/collect-results.yml secrets: inherit diff --git a/benchmarks/dsr1_b200_trt_slurm.sh b/benchmarks/dsr1_b200_trt_slurm.sh new file mode 100644 index 000000000..c5662ac10 --- /dev/null +++ b/benchmarks/dsr1_b200_trt_slurm.sh @@ -0,0 +1,71 @@ +#!/usr/bin/env bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT_OFFSET + +echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" + +hf download $MODEL +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +PORT=$(( 8888 + $PORT_OFFSET )) + + +set -x + +cat > dsr1-fp4-config.yml << 'EOF' +enable_attention_dp: false +cuda_graph_config: + enable_padding: true + max_batch_size: 256 +kv_cache_config: + 
dtype: fp8 + enable_block_reuse: false +stream_interval: 10 +speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +moe_config: + backend: TRTLLM +EOF + +# Launch TRT-LLM server +mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --ep_size 8 --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens $MAX_MODEL_LEN --extra_llm_api_options dsr1-fp4-config.yml --port $PORT > $SERVER_LOG 2>&1 & + + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then + sleep 5 + tail -n100 $SERVER_LOG + echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME" + exit 1 + fi + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +set -x +git clone https://github.com/kimbochen/bench_serving.git +python3 bench_serving/benchmark_serving.py \ +--model $MODEL --backend openai \ +--base-url http://0.0.0.0:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--request-rate inf --ignore-eos --use-chat-template \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ \ +--result-filename $RESULT_FILENAME.json diff --git a/debug_hw.py b/debug_hw.py new file mode 100644 index 000000000..2fdb69e1c --- /dev/null +++ b/debug_hw.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python3 +import json +from pathlib import Path + +# Test what hw values are actually in the results +results_dir = Path("results") +if results_dir.exists(): + print("Found results directory") + for result_path in results_dir.rglob("*.json"): + print(f"\nFile: {result_path}") + with open(result_path) as f: + result = json.load(f) + print(f" hw: '{result.get('hw', 'MISSING')}'") + print(f" framework: '{result.get('framework', 'MISSING')}'") + print(f" precision: '{result.get('precision', 'MISSING')}'") + print(f" model: 
'{result.get('model', 'MISSING')}'") +else: + print("No results directory found") diff --git a/runners/launch_b200-nv.sh b/runners/launch_b200-nv.sh index 933481d72..518f57d9f 100644 --- a/runners/launch_b200-nv.sh +++ b/runners/launch_b200-nv.sh @@ -11,9 +11,9 @@ fi PARTITION="dgx-b200" # Use framework-specific SQSH file if [ "$FRAMEWORK" = "trt" ]; then - SQUASH_FILE="/raid/image_${MODEL_CODE}_b200_trt-0903.sqsh" + SQUASH_FILE="/raid/image_${MODEL_CODE}_b200_trt-0907.sqsh" else - SQUASH_FILE="/raid/image_${MODEL_CODE}_b200-0903.sqsh" + SQUASH_FILE="/raid/image_${MODEL_CODE}_b200-0907.sqsh" fi salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell diff --git a/utils/plot_perf.py b/utils/plot_perf.py index 87f7a1ec7..1e108f83e 100644 --- a/utils/plot_perf.py +++ b/utils/plot_perf.py @@ -33,10 +33,21 @@ def plot_tput_vs_e2el(precision_filter=None): filtered_results = [r for r in results if r.get('precision', 'fp8') == precision_filter] for hw_label, color in hw_color.items(): - xs = [result['median_e2el'] for result in filtered_results if result['hw'] == hw_label] - ys = [result['tput_per_gpu'] for result in filtered_results if result['hw'] == hw_label] - if xs and ys: - ax.scatter(xs, ys, label=hw_label.upper(), color=color) + # Separate fp8 and fp4 results for this hardware + fp8_results = [r for r in filtered_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp8'] + fp4_results = [r for r in filtered_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp4'] + + # Plot fp8 results with circles + if fp8_results: + xs_fp8 = [r['median_e2el'] for r in fp8_results] + ys_fp8 = [r['tput_per_gpu'] for r in fp8_results] + ax.scatter(xs_fp8, ys_fp8, label=f"{hw_label.upper()} (fp8)", color=color, marker='o', s=60) + + # Plot fp4 results with squares + if fp4_results: + xs_fp4 = [r['median_e2el'] for r in fp4_results] + ys_fp4 = [r['tput_per_gpu'] for r in fp4_results] + ax.scatter(xs_fp4, ys_fp4, 
label=f"{hw_label.upper()} (fp4)", color=color, marker='s', s=60) for result in filtered_results: x, y = result['median_e2el'], result['tput_per_gpu'] @@ -61,10 +72,21 @@ def plot_tput_vs_intvty(precision_filter=None): filtered_results = [r for r in results if r.get('precision', 'fp8') == precision_filter] for hw_label, color in hw_color.items(): - xs = [result['median_intvty'] for result in filtered_results if result['hw'] == hw_label] - ys = [result['tput_per_gpu'] for result in filtered_results if result['hw'] == hw_label] - if xs and ys: - ax.scatter(xs, ys, label=hw_label.upper(), color=color) + # Separate fp8 and fp4 results for this hardware + fp8_results = [r for r in filtered_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp8'] + fp4_results = [r for r in filtered_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp4'] + + # Plot fp8 results with circles + if fp8_results: + xs_fp8 = [r['median_intvty'] for r in fp8_results] + ys_fp8 = [r['tput_per_gpu'] for r in fp8_results] + ax.scatter(xs_fp8, ys_fp8, label=f"{hw_label.upper()} (fp8)", color=color, marker='o', s=60) + + # Plot fp4 results with squares + if fp4_results: + xs_fp4 = [r['median_intvty'] for r in fp4_results] + ys_fp4 = [r['tput_per_gpu'] for r in fp4_results] + ax.scatter(xs_fp4, ys_fp4, label=f"{hw_label.upper()} (fp4)", color=color, marker='s', s=60) for result in filtered_results: x, y = result['median_intvty'], result['tput_per_gpu'] @@ -84,10 +106,21 @@ def plot_tput_vs_e2el_for_model(model_results, model_name): fig, ax = plt.subplots() for hw_label, color in hw_color.items(): - xs = [result['median_e2el'] for result in model_results if result['hw'] == hw_label] - ys = [result['tput_per_gpu'] for result in model_results if result['hw'] == hw_label] - if xs and ys: - ax.scatter(xs, ys, label=hw_label.upper(), color=color) + # Separate fp8 and fp4 results for this hardware + fp8_results = [r for r in model_results if r['hw'] == hw_label and 
r.get('precision', 'fp8') == 'fp8'] + fp4_results = [r for r in model_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp4'] + + # Plot fp8 results with circles + if fp8_results: + xs_fp8 = [r['median_e2el'] for r in fp8_results] + ys_fp8 = [r['tput_per_gpu'] for r in fp8_results] + ax.scatter(xs_fp8, ys_fp8, label=f"{hw_label.upper()} (fp8)", color=color, marker='o', s=60) + + # Plot fp4 results with squares + if fp4_results: + xs_fp4 = [r['median_e2el'] for r in fp4_results] + ys_fp4 = [r['tput_per_gpu'] for r in fp4_results] + ax.scatter(xs_fp4, ys_fp4, label=f"{hw_label.upper()} (fp4)", color=color, marker='s', s=60) for result in model_results: x, y = result['median_e2el'], result['tput_per_gpu'] @@ -109,10 +142,21 @@ def plot_tput_vs_intvty_for_model(model_results, model_name): fig, ax = plt.subplots() for hw_label, color in hw_color.items(): - xs = [result['median_intvty'] for result in model_results if result['hw'] == hw_label] - ys = [result['tput_per_gpu'] for result in model_results if result['hw'] == hw_label] - if xs and ys: - ax.scatter(xs, ys, label=hw_label.upper(), color=color) + # Separate fp8 and fp4 results for this hardware + fp8_results = [r for r in model_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp8'] + fp4_results = [r for r in model_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp4'] + + # Plot fp8 results with circles + if fp8_results: + xs_fp8 = [r['median_intvty'] for r in fp8_results] + ys_fp8 = [r['tput_per_gpu'] for r in fp8_results] + ax.scatter(xs_fp8, ys_fp8, label=f"{hw_label.upper()} (fp8)", color=color, marker='o', s=60) + + # Plot fp4 results with squares + if fp4_results: + xs_fp4 = [r['median_intvty'] for r in fp4_results] + ys_fp4 = [r['tput_per_gpu'] for r in fp4_results] + ax.scatter(xs_fp4, ys_fp4, label=f"{hw_label.upper()} (fp4)", color=color, marker='s', s=60) for result in model_results: x, y = result['median_intvty'], result['tput_per_gpu'] @@ -131,13 +175,22 
@@ def plot_tput_vs_intvty_for_model(model_results, model_name): # Create one plot per model showing all frameworks and hardware -# Group results by model (70b, dsr1, etc.) -models = set(r.get('model', 'unknown') for r in results) - -for model in models: - # Filter results for this model - model_results = [r for r in results if r.get('model', 'unknown') == model] +# Group results by model family (70b, dsr1, etc.) instead of full model name +def get_model_family(model_name): + if '70b' in model_name.lower() or 'llama-3.3-70b' in model_name.lower(): + return '70b' + elif 'dsr1' in model_name.lower() or 'deepseek-r1' in model_name.lower(): + return 'dsr1' + else: + # Fallback to first part of model name + return model_name.split('/')[-1].split('-')[0] if '/' in model_name else model_name + +model_families = set(get_model_family(r.get('model', 'unknown')) for r in results) + +for model_family in model_families: + # Filter results for this model family + model_results = [r for r in results if get_model_family(r.get('model', 'unknown')) == model_family] - # Create plots for this model - plot_tput_vs_e2el_for_model(model_results, model) - plot_tput_vs_intvty_for_model(model_results, model) + # Create plots for this model family + plot_tput_vs_e2el_for_model(model_results, model_family) + plot_tput_vs_intvty_for_model(model_results, model_family) From 4966b0aa1cbba698cc4c266b1bb0a8cee9113ea3 Mon Sep 17 00:00:00 2001 From: Kedar Potdar Date: Sun, 7 Sep 2025 17:12:16 -0700 Subject: [PATCH 2/7] remove 8k tests --- .github/workflows/workflow-scheduler.yml | 74 ++++++++++++------------ 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/.github/workflows/workflow-scheduler.yml b/.github/workflows/workflow-scheduler.yml index 70fee5f5a..eab4510ff 100644 --- a/.github/workflows/workflow-scheduler.yml +++ b/.github/workflows/workflow-scheduler.yml @@ -30,43 +30,43 @@ jobs: max-model-len: 2048 random-range-ratio: 0.8 - _70b-8k1k: - uses: ./.github/workflows/70b-tmpl.yml 
- secrets: inherit - with: - exp-name: '70b_8k1k' - isl: 8192 - osl: 1024 - max-model-len: 9216 - random-range-ratio: 0.8 + # _70b-8k1k: + # uses: ./.github/workflows/70b-tmpl.yml + # secrets: inherit + # with: + # exp-name: '70b_8k1k' + # isl: 8192 + # osl: 1024 + # max-model-len: 9216 + # random-range-ratio: 0.8 - dsr1-8k1k: - uses: ./.github/workflows/dsr1-tmpl.yml - secrets: inherit - with: - exp-name: 'dsr1_8k1k' - isl: 8192 - osl: 1024 - max-model-len: 9216 - random-range-ratio: 0.8 + # dsr1-8k1k: + # uses: ./.github/workflows/dsr1-tmpl.yml + # secrets: inherit + # with: + # exp-name: 'dsr1_8k1k' + # isl: 8192 + # osl: 1024 + # max-model-len: 9216 + # random-range-ratio: 0.8 - _70b-1k8k: - uses: ./.github/workflows/70b-tmpl.yml - secrets: inherit - with: - exp-name: '70b_1k8k' - isl: 1024 - osl: 8192 - max-model-len: 9216 - random-range-ratio: 0.8 - timeout: 240 + # _70b-1k8k: + # uses: ./.github/workflows/70b-tmpl.yml + # secrets: inherit + # with: + # exp-name: '70b_1k8k' + # isl: 1024 + # osl: 8192 + # max-model-len: 9216 + # random-range-ratio: 0.8 + # timeout: 240 - dsr1-1k8k: - uses: ./.github/workflows/dsr1-tmpl.yml - secrets: inherit - with: - exp-name: 'dsr1_1k8k' - isl: 1024 - osl: 8192 - max-model-len: 9216 - random-range-ratio: 0.8 + # dsr1-1k8k: + # uses: ./.github/workflows/dsr1-tmpl.yml + # secrets: inherit + # with: + # exp-name: 'dsr1_1k8k' + # isl: 1024 + # osl: 8192 + # max-model-len: 9216 + # random-range-ratio: 0.8 From bc8716751ab1df7a66804a74004c02a0a4d957a1 Mon Sep 17 00:00:00 2001 From: Kedar Potdar Date: Sun, 7 Sep 2025 17:15:10 -0700 Subject: [PATCH 3/7] remove concurrency lock --- .github/workflows/workflow-scheduler.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/workflow-scheduler.yml b/.github/workflows/workflow-scheduler.yml index eab4510ff..e6fcd95b3 100644 --- a/.github/workflows/workflow-scheduler.yml +++ b/.github/workflows/workflow-scheduler.yml @@ -1,13 +1,13 @@ name: 
Workflow Scheduler -concurrency: - group: benchmark-lock - cancel-in-progress: false +# concurrency: +# group: benchmark-lock-v2 +# cancel-in-progress: false on: workflow_dispatch: - schedule: - - cron: '0 5 * * *' + # schedule: + # - cron: '0 5 * * *' jobs: _70b-1k1k: From b8e567cf618803905aba99d7e2208c53acd0991f Mon Sep 17 00:00:00 2001 From: Kedar Potdar Date: Sun, 7 Sep 2025 17:15:25 -0700 Subject: [PATCH 4/7] typo in concurrency lock --- .github/workflows/workflow-scheduler.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/workflow-scheduler.yml b/.github/workflows/workflow-scheduler.yml index e6fcd95b3..9011c06bf 100644 --- a/.github/workflows/workflow-scheduler.yml +++ b/.github/workflows/workflow-scheduler.yml @@ -1,7 +1,7 @@ name: Workflow Scheduler # concurrency: -# group: benchmark-lock-v2 +# group: benchmark-lock # cancel-in-progress: false on: From 759c0f6078d7e7102d3790b6fad1476a95997950 Mon Sep 17 00:00:00 2001 From: Kedar Potdar Date: Sun, 7 Sep 2025 19:08:04 -0700 Subject: [PATCH 5/7] enable other tests --- .github/workflows/70b-tmpl.yml | 132 +++++++++++------------ .github/workflows/dsr1-tmpl.yml | 71 ++++++------ .github/workflows/workflow-scheduler.yml | 84 +++++++-------- 3 files changed, 144 insertions(+), 143 deletions(-) diff --git a/.github/workflows/70b-tmpl.yml b/.github/workflows/70b-tmpl.yml index b6fe04a3a..250a50ad0 100644 --- a/.github/workflows/70b-tmpl.yml +++ b/.github/workflows/70b-tmpl.yml @@ -30,22 +30,22 @@ jobs: - name: Find the latest Docker image run: echo "Hardcoding image tags for now." 
- # bmk-h100: - # needs: find-latest-image - # uses: ./.github/workflows/benchmark-tmpl.yml - # secrets: inherit - # with: - # exp-name: ${{ inputs.exp-name }} - # isl: ${{ inputs.isl }} - # osl: ${{ inputs.osl }} - # max-model-len: ${{ inputs.max-model-len }} - # random-range-ratio: ${{ inputs.random-range-ratio }} - # runner: h100 - # image: 'kedarpotdar147/vllm0.1:latest' - # model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - # tp-list: '[2, 4, 8]' - # timeout: ${{ inputs.timeout }} - # framework: 'vllm' + bmk-h100: + needs: find-latest-image + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + runner: h100 + image: 'kedarpotdar147/vllm0.1:latest' + model: 'nvidia/Llama-3.3-70B-Instruct-FP8' + tp-list: '[2, 4, 8]' + timeout: ${{ inputs.timeout }} + framework: 'vllm' bmk-h200: needs: find-latest-image @@ -81,22 +81,22 @@ jobs: timeout: ${{ inputs.timeout }} framework: 'trt' - # bmk-b200: - # needs: find-latest-image - # uses: ./.github/workflows/benchmark-tmpl.yml - # secrets: inherit - # with: - # exp-name: ${{ inputs.exp-name }} - # isl: ${{ inputs.isl }} - # osl: ${{ inputs.osl }} - # max-model-len: ${{ inputs.max-model-len }} - # random-range-ratio: ${{ inputs.random-range-ratio }} - # runner: b200 - # image: 'kedarpotdar147/vllm:05' - # model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - # tp-list: '[1, 2]' - # timeout: ${{ inputs.timeout }} - # framework: 'vllm' + bmk-b200: + needs: find-latest-image + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + runner: b200 + image: 'kedarpotdar147/vllm:05' + model: 'nvidia/Llama-3.3-70B-Instruct-FP8' + tp-list: '[1, 2]' 
+ timeout: ${{ inputs.timeout }} + framework: 'vllm' bmk-b200-trt: needs: find-latest-image @@ -111,43 +111,43 @@ jobs: runner: b200-trt image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post1' model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - tp-list: '[2]' + tp-list: '[1,2]' timeout: ${{ inputs.timeout }} framework: 'trt' - # bmk-mi300x: - # needs: find-latest-image - # uses: ./.github/workflows/benchmark-tmpl.yml - # secrets: inherit - # with: - # exp-name: ${{ inputs.exp-name }} - # isl: ${{ inputs.isl }} - # osl: ${{ inputs.osl }} - # max-model-len: ${{ inputs.max-model-len }} - # random-range-ratio: ${{ inputs.random-range-ratio }} - # runner: mi300x - # image: 'rocm/vllm-dev:nightly_official_0729_rc1_20250718' - # model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' - # tp-list: '[1, 2, 4, 8]' - # timeout: ${{ inputs.timeout }} - # framework: 'vllm' + bmk-mi300x: + needs: find-latest-image + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + runner: mi300x + image: 'rocm/vllm-dev:nightly_official_0729_rc1_20250718' + model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' + tp-list: '[1, 2, 4, 8]' + timeout: ${{ inputs.timeout }} + framework: 'vllm' - # bmk-mi325x: - # needs: find-latest-image - # uses: ./.github/workflows/benchmark-tmpl.yml - # secrets: inherit - # with: - # exp-name: ${{ inputs.exp-name }} - # isl: ${{ inputs.isl }} - # osl: ${{ inputs.osl }} - # max-model-len: ${{ inputs.max-model-len }} - # random-range-ratio: ${{ inputs.random-range-ratio }} - # runner: mi325x - # image: 'rocm/vllm-dev:nightly_official_0729_rc1_20250718' - # model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' - # tp-list: '[1, 2, 4, 8]' - # timeout: ${{ inputs.timeout }} - # framework: 'vllm' + bmk-mi325x: + needs: find-latest-image + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + 
with: + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + runner: mi325x + image: 'rocm/vllm-dev:nightly_official_0729_rc1_20250718' + model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' + tp-list: '[1, 2, 4, 8]' + timeout: ${{ inputs.timeout }} + framework: 'vllm' # bmk-mi355x: # needs: find-latest-image @@ -167,7 +167,7 @@ jobs: # framework: 'vllm' collect-results: - needs: [ bmk-h200, bmk-h200-trt] + needs: [ bmk-h200, bmk-h200-trt, bmk-b200, bmk-b200-trt, bmk-mi300x, bmk-mi325x] if: ${{ always() && !cancelled() }} uses: ./.github/workflows/collect-results.yml secrets: inherit diff --git a/.github/workflows/dsr1-tmpl.yml b/.github/workflows/dsr1-tmpl.yml index eff277bd1..7ede786ba 100644 --- a/.github/workflows/dsr1-tmpl.yml +++ b/.github/workflows/dsr1-tmpl.yml @@ -88,41 +88,41 @@ jobs: framework: 'trt' precision: fp4 - # bmk-mi300x: - # needs: find-latest-image - # uses: ./.github/workflows/benchmark-tmpl.yml - # secrets: inherit - # with: - # exp-name: ${{ inputs.exp-name }} - # isl: ${{ inputs.isl }} - # osl: ${{ inputs.osl }} - # max-model-len: ${{ inputs.max-model-len }} - # random-range-ratio: ${{ inputs.random-range-ratio }} - # runner: mi300x - # image: 'lmsysorg/sglang:v0.4.9.post2-rocm630-mi30x' - # model: 'deepseek-ai/DeepSeek-R1-0528' - # tp-list: '[8]' - # timeout: ${{ inputs.timeout }} - # framework: 'sglang' - # precision: ${{ inputs.precision }} + bmk-mi300x: + needs: find-latest-image + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + runner: mi300x + image: 'lmsysorg/sglang:v0.4.9.post2-rocm630-mi30x' + model: 'deepseek-ai/DeepSeek-R1-0528' + tp-list: '[8]' + timeout: ${{ inputs.timeout }} + framework: 'sglang' + 
precision: ${{ inputs.precision }} - # bmk-mi325x: - # needs: find-latest-image - # uses: ./.github/workflows/benchmark-tmpl.yml - # secrets: inherit - # with: - # exp-name: ${{ inputs.exp-name }} - # isl: ${{ inputs.isl }} - # osl: ${{ inputs.osl }} - # max-model-len: ${{ inputs.max-model-len }} - # random-range-ratio: ${{ inputs.random-range-ratio }} - # runner: mi325x - # image: 'lmsysorg/sglang:v0.4.9.post2-rocm630-mi30x' - # model: 'deepseek-ai/DeepSeek-R1-0528' - # tp-list: '[8]' - # timeout: ${{ inputs.timeout }} - # framework: 'sglang' - # precision: ${{ inputs.precision }} + bmk-mi325x: + needs: find-latest-image + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + runner: mi325x + image: 'lmsysorg/sglang:v0.4.9.post2-rocm630-mi30x' + model: 'deepseek-ai/DeepSeek-R1-0528' + tp-list: '[8]' + timeout: ${{ inputs.timeout }} + framework: 'sglang' + precision: ${{ inputs.precision }} # bmk-mi355x: # needs: find-latest-image @@ -140,9 +140,10 @@ jobs: # tp-list: '[8]' # timeout: ${{ inputs.timeout }} # framework: 'sglang' + # precision: ${{ inputs.precision }} collect-results: - needs: [bmk-h200, bmk-b200, bmk-b200-trt-fp4, ] + needs: [bmk-h200, bmk-b200, bmk-b200-trt-fp4, bmk-mi300x, bmk-mi325x] if: ${{ always() && !cancelled() }} uses: ./.github/workflows/collect-results.yml secrets: inherit diff --git a/.github/workflows/workflow-scheduler.yml b/.github/workflows/workflow-scheduler.yml index 9011c06bf..70fee5f5a 100644 --- a/.github/workflows/workflow-scheduler.yml +++ b/.github/workflows/workflow-scheduler.yml @@ -1,13 +1,13 @@ name: Workflow Scheduler -# concurrency: -# group: benchmark-lock -# cancel-in-progress: false +concurrency: + group: benchmark-lock + cancel-in-progress: false on: workflow_dispatch: - # schedule: - # - cron: '0 5 * * *' + 
schedule: + - cron: '0 5 * * *' jobs: _70b-1k1k: @@ -30,43 +30,43 @@ jobs: max-model-len: 2048 random-range-ratio: 0.8 - # _70b-8k1k: - # uses: ./.github/workflows/70b-tmpl.yml - # secrets: inherit - # with: - # exp-name: '70b_8k1k' - # isl: 8192 - # osl: 1024 - # max-model-len: 9216 - # random-range-ratio: 0.8 + _70b-8k1k: + uses: ./.github/workflows/70b-tmpl.yml + secrets: inherit + with: + exp-name: '70b_8k1k' + isl: 8192 + osl: 1024 + max-model-len: 9216 + random-range-ratio: 0.8 - # dsr1-8k1k: - # uses: ./.github/workflows/dsr1-tmpl.yml - # secrets: inherit - # with: - # exp-name: 'dsr1_8k1k' - # isl: 8192 - # osl: 1024 - # max-model-len: 9216 - # random-range-ratio: 0.8 + dsr1-8k1k: + uses: ./.github/workflows/dsr1-tmpl.yml + secrets: inherit + with: + exp-name: 'dsr1_8k1k' + isl: 8192 + osl: 1024 + max-model-len: 9216 + random-range-ratio: 0.8 - # _70b-1k8k: - # uses: ./.github/workflows/70b-tmpl.yml - # secrets: inherit - # with: - # exp-name: '70b_1k8k' - # isl: 1024 - # osl: 8192 - # max-model-len: 9216 - # random-range-ratio: 0.8 - # timeout: 240 + _70b-1k8k: + uses: ./.github/workflows/70b-tmpl.yml + secrets: inherit + with: + exp-name: '70b_1k8k' + isl: 1024 + osl: 8192 + max-model-len: 9216 + random-range-ratio: 0.8 + timeout: 240 - # dsr1-1k8k: - # uses: ./.github/workflows/dsr1-tmpl.yml - # secrets: inherit - # with: - # exp-name: 'dsr1_1k8k' - # isl: 1024 - # osl: 8192 - # max-model-len: 9216 - # random-range-ratio: 0.8 + dsr1-1k8k: + uses: ./.github/workflows/dsr1-tmpl.yml + secrets: inherit + with: + exp-name: 'dsr1_1k8k' + isl: 1024 + osl: 8192 + max-model-len: 9216 + random-range-ratio: 0.8 From 294fe5f2712d98b7ebc529e7f1027a7d63613472 Mon Sep 17 00:00:00 2001 From: Kedar Potdar Date: Sun, 7 Sep 2025 19:55:39 -0700 Subject: [PATCH 6/7] remove MTP --- benchmarks/dsr1_b200_trt_slurm.sh | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/benchmarks/dsr1_b200_trt_slurm.sh b/benchmarks/dsr1_b200_trt_slurm.sh index 
c5662ac10..417313212 100644 --- a/benchmarks/dsr1_b200_trt_slurm.sh +++ b/benchmarks/dsr1_b200_trt_slurm.sh @@ -32,9 +32,6 @@ kv_cache_config: dtype: fp8 enable_block_reuse: false stream_interval: 10 -speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 moe_config: backend: TRTLLM EOF @@ -65,7 +62,7 @@ python3 bench_serving/benchmark_serving.py \ --dataset-name random \ --random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ --num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos --use-chat-template \ +--request-rate inf --ignore-eos \ --save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ --result-dir /workspace/ \ --result-filename $RESULT_FILENAME.json From a5df32918fba0605d3b10b0261e7e36c1b5124d3 Mon Sep 17 00:00:00 2001 From: Kedar Potdar Date: Sun, 7 Sep 2025 21:30:47 -0700 Subject: [PATCH 7/7] update tp-list to include full list --- .github/workflows/70b-tmpl.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/70b-tmpl.yml b/.github/workflows/70b-tmpl.yml index 250a50ad0..968f8e1cc 100644 --- a/.github/workflows/70b-tmpl.yml +++ b/.github/workflows/70b-tmpl.yml @@ -60,7 +60,7 @@ jobs: runner: h200 image: 'kedarpotdar147/vllm0.1:latest' model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - tp-list: '[2]' + tp-list: '[1, 2, 4, 8]' timeout: ${{ inputs.timeout }} framework: 'vllm' @@ -77,7 +77,7 @@ jobs: runner: h200-trt image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2' model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - tp-list: '[2]' + tp-list: '[1, 2, 4, 8]' timeout: ${{ inputs.timeout }} framework: 'trt' @@ -94,7 +94,7 @@ jobs: runner: b200 image: 'kedarpotdar147/vllm:05' model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - tp-list: '[1, 2]' + tp-list: '[1, 2, 4, 8]' timeout: ${{ inputs.timeout }} framework: 'vllm'