diff --git a/.github/workflows/70b-tmpl.yml b/.github/workflows/70b-tmpl.yml index 2d820463b..968f8e1cc 100644 --- a/.github/workflows/70b-tmpl.yml +++ b/.github/workflows/70b-tmpl.yml @@ -94,7 +94,7 @@ jobs: runner: b200 image: 'kedarpotdar147/vllm:05' model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - tp-list: '[1, 2]' + tp-list: '[1, 2, 4, 8]' timeout: ${{ inputs.timeout }} framework: 'vllm' @@ -109,9 +109,9 @@ jobs: max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} runner: b200-trt - image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2' + image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post1' model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - tp-list: '[1, 2]' + tp-list: '[1,2]' timeout: ${{ inputs.timeout }} framework: 'trt' @@ -167,7 +167,7 @@ jobs: # framework: 'vllm' collect-results: - needs: [bmk-h100, bmk-h200, bmk-h200-trt, bmk-b200-trt, bmk-mi300x, bmk-mi325x] + needs: [ bmk-h200, bmk-h200-trt, bmk-b200, bmk-b200-trt, bmk-mi300x, bmk-mi325x] if: ${{ always() && !cancelled() }} uses: ./.github/workflows/collect-results.yml secrets: inherit diff --git a/.github/workflows/dsr1-tmpl.yml b/.github/workflows/dsr1-tmpl.yml index 856e501b9..7ede786ba 100644 --- a/.github/workflows/dsr1-tmpl.yml +++ b/.github/workflows/dsr1-tmpl.yml @@ -22,6 +22,10 @@ on: required: false type: number default: 180 + precision: + required: false + type: string + default: fp8 jobs: find-latest-image: @@ -46,6 +50,7 @@ jobs: tp-list: '[8]' timeout: ${{ inputs.timeout }} framework: 'sglang' + precision: ${{ inputs.precision }} bmk-b200: needs: find-latest-image @@ -63,6 +68,25 @@ jobs: tp-list: '[8]' timeout: ${{ inputs.timeout }} framework: 'sglang' + precision: ${{ inputs.precision }} + + bmk-b200-trt-fp4: + needs: find-latest-image + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + runner: b200-trt + image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post1' + model: 'nvidia/DeepSeek-R1-0528-FP4' + tp-list: '[8]' + timeout: ${{ inputs.timeout }} + framework: 'trt' + precision: fp4 bmk-mi300x: needs: find-latest-image @@ -80,6 +104,7 @@ jobs: tp-list: '[8]' timeout: ${{ inputs.timeout }} framework: 'sglang' + precision: ${{ inputs.precision }} bmk-mi325x: needs: find-latest-image @@ -97,6 +122,7 @@ jobs: tp-list: '[8]' timeout: ${{ inputs.timeout }} framework: 'sglang' + precision: ${{ inputs.precision }} # bmk-mi355x: # needs: find-latest-image @@ -114,9 +140,10 @@ jobs: # tp-list: '[8]' # timeout: ${{ inputs.timeout }} # framework: 'sglang' + # precision: ${{ inputs.precision }} collect-results: - needs: [bmk-h200, bmk-b200, bmk-mi300x, bmk-mi325x] + needs: [bmk-h200, bmk-b200, bmk-b200-trt-fp4, bmk-mi300x, bmk-mi325x] if: ${{ always() && !cancelled() }} uses: ./.github/workflows/collect-results.yml secrets: inherit diff --git a/benchmarks/dsr1_b200_trt_slurm.sh b/benchmarks/dsr1_b200_trt_slurm.sh new file mode 100644 index 000000000..417313212 --- /dev/null +++ b/benchmarks/dsr1_b200_trt_slurm.sh @@ -0,0 +1,68 @@ +#!/usr/bin/env bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT_OFFSET + +echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" + +hf download $MODEL +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +PORT=$(( 8888 + $PORT_OFFSET )) + + +set -x + +cat > dsr1-fp4-config.yml << 'EOF' +enable_attention_dp: false +cuda_graph_config: + enable_padding: true + max_batch_size: 256 +kv_cache_config: + dtype: fp8 + enable_block_reuse: false +stream_interval: 10 +moe_config: + backend: TRTLLM +EOF + +# Launch TRT-LLM server +mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --ep_size 8 --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens $MAX_MODEL_LEN --extra_llm_api_options dsr1-fp4-config.yml --port $PORT > $SERVER_LOG 2>&1 & + + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then + sleep 5 + tail -n100 $SERVER_LOG + echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME" + exit 1 + fi + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +set -x +git clone https://github.com/kimbochen/bench_serving.git +python3 bench_serving/benchmark_serving.py \ +--model $MODEL --backend openai \ +--base-url http://0.0.0.0:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ \ +--result-filename $RESULT_FILENAME.json diff --git a/debug_hw.py b/debug_hw.py new file mode 100644 index 000000000..2fdb69e1c --- /dev/null +++ b/debug_hw.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python3 +import json +from pathlib import Path + +# Test what hw values are actually in the results +results_dir = Path("results") +if results_dir.exists(): + print("Found results directory") + for result_path in results_dir.rglob("*.json"): + print(f"\nFile: {result_path}") + with open(result_path) as f: + result = json.load(f) + print(f" hw: '{result.get('hw', 'MISSING')}'") + print(f" framework: '{result.get('framework', 'MISSING')}'") + print(f" precision: '{result.get('precision', 'MISSING')}'") + print(f" model: '{result.get('model', 'MISSING')}'") +else: + print("No results directory found") diff --git a/runners/launch_b200-nv.sh b/runners/launch_b200-nv.sh index 933481d72..518f57d9f 100644 --- a/runners/launch_b200-nv.sh +++ b/runners/launch_b200-nv.sh @@ -11,9 +11,9 @@ fi PARTITION="dgx-b200" # Use framework-specific SQSH file if [ "$FRAMEWORK" = "trt" ]; then - SQUASH_FILE="/raid/image_${MODEL_CODE}_b200_trt-0903.sqsh" + SQUASH_FILE="/raid/image_${MODEL_CODE}_b200_trt-0907.sqsh" else - SQUASH_FILE="/raid/image_${MODEL_CODE}_b200-0903.sqsh" + SQUASH_FILE="/raid/image_${MODEL_CODE}_b200-0907.sqsh" fi salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell diff --git a/utils/plot_perf.py b/utils/plot_perf.py index 87f7a1ec7..1e108f83e 100644 --- a/utils/plot_perf.py +++ b/utils/plot_perf.py @@ -33,10 +33,21 @@ def plot_tput_vs_e2el(precision_filter=None): filtered_results = [r for r in results if r.get('precision', 'fp8') == precision_filter] for hw_label, color in hw_color.items(): - xs = [result['median_e2el'] for result in filtered_results if result['hw'] == hw_label] - ys = [result['tput_per_gpu'] for result in filtered_results if result['hw'] == hw_label] - if xs and ys: - ax.scatter(xs, ys, label=hw_label.upper(), color=color) + # Separate fp8 and fp4 results for this hardware + fp8_results = [r for r in filtered_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp8'] + fp4_results = [r for r in filtered_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp4'] + + # Plot fp8 results with circles + if fp8_results: + xs_fp8 = [r['median_e2el'] for r in fp8_results] + ys_fp8 = [r['tput_per_gpu'] for r in fp8_results] + ax.scatter(xs_fp8, ys_fp8, label=f"{hw_label.upper()} (fp8)", color=color, marker='o', s=60) + + # Plot fp4 results with squares + if fp4_results: + xs_fp4 = [r['median_e2el'] for r in fp4_results] + ys_fp4 = [r['tput_per_gpu'] for r in fp4_results] + ax.scatter(xs_fp4, ys_fp4, label=f"{hw_label.upper()} (fp4)", color=color, marker='s', s=60) for result in filtered_results: x, y = result['median_e2el'], result['tput_per_gpu'] @@ -61,10 +72,21 @@ def plot_tput_vs_intvty(precision_filter=None): filtered_results = [r for r in results if r.get('precision', 'fp8') == precision_filter] for hw_label, color in hw_color.items(): - xs = [result['median_intvty'] for result in filtered_results if result['hw'] == hw_label] - ys = [result['tput_per_gpu'] for result in filtered_results if result['hw'] == hw_label] - if xs and ys: - ax.scatter(xs, ys, label=hw_label.upper(), color=color) + # Separate fp8 and fp4 results for this hardware + fp8_results = [r for r in filtered_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp8'] + fp4_results = [r for r in filtered_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp4'] + + # Plot fp8 results with circles + if fp8_results: + xs_fp8 = [r['median_intvty'] for r in fp8_results] + ys_fp8 = [r['tput_per_gpu'] for r in fp8_results] + ax.scatter(xs_fp8, ys_fp8, label=f"{hw_label.upper()} (fp8)", color=color, marker='o', s=60) + + # Plot fp4 results with squares + if fp4_results: + xs_fp4 = [r['median_intvty'] for r in fp4_results] + ys_fp4 = [r['tput_per_gpu'] for r in fp4_results] + ax.scatter(xs_fp4, ys_fp4, label=f"{hw_label.upper()} (fp4)", color=color, marker='s', s=60) for result in filtered_results: x, y = result['median_intvty'], result['tput_per_gpu'] @@ -84,10 +106,21 @@ def plot_tput_vs_e2el_for_model(model_results, model_name): fig, ax = plt.subplots() for hw_label, color in hw_color.items(): - xs = [result['median_e2el'] for result in model_results if result['hw'] == hw_label] - ys = [result['tput_per_gpu'] for result in model_results if result['hw'] == hw_label] - if xs and ys: - ax.scatter(xs, ys, label=hw_label.upper(), color=color) + # Separate fp8 and fp4 results for this hardware + fp8_results = [r for r in model_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp8'] + fp4_results = [r for r in model_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp4'] + + # Plot fp8 results with circles + if fp8_results: + xs_fp8 = [r['median_e2el'] for r in fp8_results] + ys_fp8 = [r['tput_per_gpu'] for r in fp8_results] + ax.scatter(xs_fp8, ys_fp8, label=f"{hw_label.upper()} (fp8)", color=color, marker='o', s=60) + + # Plot fp4 results with squares + if fp4_results: + xs_fp4 = [r['median_e2el'] for r in fp4_results] + ys_fp4 = [r['tput_per_gpu'] for r in fp4_results] + ax.scatter(xs_fp4, ys_fp4, label=f"{hw_label.upper()} (fp4)", color=color, marker='s', s=60) for result in model_results: x, y = result['median_e2el'], result['tput_per_gpu'] @@ -109,10 +142,21 @@ def plot_tput_vs_intvty_for_model(model_results, model_name): fig, ax = plt.subplots() for hw_label, color in hw_color.items(): - xs = [result['median_intvty'] for result in model_results if result['hw'] == hw_label] - ys = [result['tput_per_gpu'] for result in model_results if result['hw'] == hw_label] - if xs and ys: - ax.scatter(xs, ys, label=hw_label.upper(), color=color) + # Separate fp8 and fp4 results for this hardware + fp8_results = [r for r in model_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp8'] + fp4_results = [r for r in model_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp4'] + + # Plot fp8 results with circles + if fp8_results: + xs_fp8 = [r['median_intvty'] for r in fp8_results] + ys_fp8 = [r['tput_per_gpu'] for r in fp8_results] + ax.scatter(xs_fp8, ys_fp8, label=f"{hw_label.upper()} (fp8)", color=color, marker='o', s=60) + + # Plot fp4 results with squares + if fp4_results: + xs_fp4 = [r['median_intvty'] for r in fp4_results] + ys_fp4 = [r['tput_per_gpu'] for r in fp4_results] + ax.scatter(xs_fp4, ys_fp4, label=f"{hw_label.upper()} (fp4)", color=color, marker='s', s=60) for result in model_results: x, y = result['median_intvty'], result['tput_per_gpu'] @@ -131,13 +175,22 @@ def plot_tput_vs_intvty_for_model(model_results, model_name): # Create one plot per model showing all frameworks and hardware -# Group results by model (70b, dsr1, etc.) -models = set(r.get('model', 'unknown') for r in results) - -for model in models: - # Filter results for this model - model_results = [r for r in results if r.get('model', 'unknown') == model] +# Group results by model family (70b, dsr1, etc.) instead of full model name +def get_model_family(model_name): + if '70b' in model_name.lower() or 'llama-3.3-70b' in model_name.lower(): + return '70b' + elif 'dsr1' in model_name.lower() or 'deepseek-r1' in model_name.lower(): + return 'dsr1' + else: + # Fallback to first part of model name + return model_name.split('/')[-1].split('-')[0] if '/' in model_name else model_name + +model_families = set(get_model_family(r.get('model', 'unknown')) for r in results) + +for model_family in model_families: + # Filter results for this model family + model_results = [r for r in results if get_model_family(r.get('model', 'unknown')) == model_family] - # Create plots for this model - plot_tput_vs_e2el_for_model(model_results, model) - plot_tput_vs_intvty_for_model(model_results, model) + # Create plots for this model family + plot_tput_vs_e2el_for_model(model_results, model_family) + plot_tput_vs_intvty_for_model(model_results, model_family)