From a48ffcb298a85054d051d6a36d2c16322ec2d2bc Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Tue, 28 Apr 2026 23:50:59 +0800 Subject: [PATCH 1/9] sglang dpskv4 hopper --- .github/configs/nvidia-master.yaml | 19 +++++ .../single_node/dsv4_fp4_h200_sglang.sh | 73 +++++++++++++++++++ 2 files changed, 92 insertions(+) create mode 100644 benchmarks/single_node/dsv4_fp4_h200_sglang.sh diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 38d1101f3..e8a0faab6 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2604,6 +2604,25 @@ dsv4-fp8-h200-vllm: search-space: - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 64 } +# DeepSeek-V4-Pro H200 single-node with SGLang (Marlin FP4, TP-only). +dsv4-fp4-h200-sglang: + image: lmsysorg/sglang:deepseek-v4-hopper + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: h200 + precision: fp4 + framework: sglang + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 1, conc-end: 32 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 1, conc-end: 32 } + # DeepSeek-V4-Pro B300 single-node aggregate recipe from the submitted B300 # pareto sweep. The single-node schema has no explicit data-parallel-size # field, so dp-attn=true is used as the existing vLLM script switch for DP4 diff --git a/benchmarks/single_node/dsv4_fp4_h200_sglang.sh b/benchmarks/single_node/dsv4_fp4_h200_sglang.sh new file mode 100644 index 000000000..a7e822596 --- /dev/null +++ b/benchmarks/single_node/dsv4_fp4_h200_sglang.sh @@ -0,0 +1,73 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +hf download "$MODEL" + +nvidia-smi + +SERVER_LOG="$PWD/server.log" +PORT=${PORT:-8888} + +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" + +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi + +start_gpu_monitor --output "$PWD/gpu_metrics.csv" + +set -x +PYTHONNOUSERSITE=1 sglang serve \ + --model-path $MODEL \ + --host 0.0.0.0 \ + --port $PORT \ + --trust-remote-code \ + --tp $TP \ + --moe-runner-backend marlin \ + --chunked-prefill-size 4096 \ + --disable-flashinfer-autotune \ + --mem-fraction-static 0.88 \ + --max-running-requests "$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))" \ + $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 & + +SERVER_PID=$! + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $((CONC * 10)) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir "$PWD/" + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +stop_gpu_monitor +set +x From a479062bf5baa474e5e36ed142122f32506b60da Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Wed, 29 Apr 2026 00:32:50 +0800 Subject: [PATCH 2/9] h200 runner: support framework-tagged script names --- runners/launch_h200-dgxc-slurm.sh | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh index 199f5e4ae..7fbbf1cb6 100755 --- a/runners/launch_h200-dgxc-slurm.sh +++ b/runners/launch_h200-dgxc-slurm.sh @@ -292,13 +292,24 @@ else fi " + # Prefer a framework-tagged script (e.g. dsv4_fp4_h200_sglang.sh) so models + # with multiple inference engines can coexist; fall back to the historical + # name without an engine suffix for scripts that haven't been retagged yet. + SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') + BENCH_BASE="benchmarks/single_node/${SCENARIO_SUBDIR}${EXP_NAME%%_*}_${PRECISION}_h200" + BENCH_SCRIPT="${BENCH_BASE}_${FRAMEWORK}${SPEC_SUFFIX}.sh" + if [[ ! -f "$BENCH_SCRIPT" ]]; then + LEGACY_FW_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') + BENCH_SCRIPT="${BENCH_BASE}${LEGACY_FW_SUFFIX}${SPEC_SUFFIX}.sh" + fi + srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --no-container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888 \ - bash benchmarks/single_node/${SCENARIO_SUBDIR}${EXP_NAME%%_*}_${PRECISION}_h200$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt')$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp').sh + bash $BENCH_SCRIPT scancel $JOB_ID From 31da83572b7ab4c03701eb9ad43dbbf960a00365 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Wed, 29 Apr 2026 00:40:52 +0800 Subject: [PATCH 3/9] h200 runners: fix script path and /workspace mount conflict --- runners/launch_h200-cw.sh | 19 +++++++++++++++---- runners/launch_h200-dgxc-slurm.sh | 13 ++++++++----- runners/launch_h200-nb.sh | 20 ++++++++++++++++---- 3 files changed, 39 insertions(+), 13 deletions(-) diff --git a/runners/launch_h200-cw.sh b/runners/launch_h200-cw.sh index 1486c4fa6..4be548c87 100644 --- a/runners/launch_h200-cw.sh +++ b/runners/launch_h200-cw.sh @@ -4,8 +4,13 @@ export HF_HUB_CACHE_MOUNT="/mnt/vast/gharunner/hf-hub-cache" export PORT=8888 MODEL_CODE="${EXP_NAME%%_*}" -FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') +BENCH_BASE="benchmarks/single_node/${SCENARIO_SUBDIR}${MODEL_CODE}_${PRECISION}_h200" +BENCH_SCRIPT="${BENCH_BASE}_${FRAMEWORK}${SPEC_SUFFIX}.sh" +if [[ ! -f "$BENCH_SCRIPT" ]]; then + LEGACY_FW_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') + BENCH_SCRIPT="${BENCH_BASE}${LEGACY_FW_SUFFIX}${SPEC_SUFFIX}.sh" +fi PARTITION="h200" SQUASH_FILE="/mnt/vast/gharunner/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" @@ -38,13 +43,19 @@ else CONTAINER_IMAGE=$(realpath $SQUASH_FILE) fi +if [[ "$IMAGE" == *deepseek-v4-hopper* ]]; then + CONTAINER_MOUNT_DIR=/ix +else + CONTAINER_MOUNT_DIR=/workspace +fi + srun --jobid=$JOB_ID \ --container-image=$CONTAINER_IMAGE \ ---container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ +--container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --container-mount-home \ ---container-workdir=/workspace/ \ +--container-workdir=$CONTAINER_MOUNT_DIR/ \ --no-container-entrypoint --export=ALL \ -bash benchmarks/single_node/${SCENARIO_SUBDIR}${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh +bash $BENCH_SCRIPT rmdir $SAGEMAKER_SHM_PATH scancel $JOB_ID diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh index 7fbbf1cb6..7dd04a6d7 100755 --- a/runners/launch_h200-dgxc-slurm.sh +++ b/runners/launch_h200-dgxc-slurm.sh @@ -292,9 +292,6 @@ else fi " - # Prefer a framework-tagged script (e.g. dsv4_fp4_h200_sglang.sh) so models - # with multiple inference engines can coexist; fall back to the historical - # name without an engine suffix for scripts that haven't been retagged yet. SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') BENCH_BASE="benchmarks/single_node/${SCENARIO_SUBDIR}${EXP_NAME%%_*}_${PRECISION}_h200" BENCH_SCRIPT="${BENCH_BASE}_${FRAMEWORK}${SPEC_SUFFIX}.sh" @@ -303,11 +300,17 @@ else BENCH_SCRIPT="${BENCH_BASE}${LEGACY_FW_SUFFIX}${SPEC_SUFFIX}.sh" fi + if [[ "$IMAGE" == *deepseek-v4-hopper* ]]; then + CONTAINER_MOUNT_DIR=/ix + else + CONTAINER_MOUNT_DIR=/workspace + fi + srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ - --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ + --container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --no-container-mount-home \ - --container-workdir=/workspace/ \ + --container-workdir=$CONTAINER_MOUNT_DIR/ \ --no-container-entrypoint --export=ALL,PORT=8888 \ bash $BENCH_SCRIPT diff --git a/runners/launch_h200-nb.sh b/runners/launch_h200-nb.sh index 158c30792..de2505158 100644 --- a/runners/launch_h200-nb.sh +++ b/runners/launch_h200-nb.sh @@ -4,19 +4,31 @@ export HF_HUB_CACHE_MOUNT="/mnt/data/gharunners/hf-hub-cache/" export PORT=8888 MODEL_CODE="${EXP_NAME%%_*}" -FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') +BENCH_BASE="benchmarks/single_node/${SCENARIO_SUBDIR}${MODEL_CODE}_${PRECISION}_h200" +BENCH_SCRIPT="${BENCH_BASE}_${FRAMEWORK}${SPEC_SUFFIX}.sh" +if [[ ! -f "$BENCH_SCRIPT" ]]; then + LEGACY_FW_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') + BENCH_SCRIPT="${BENCH_BASE}${LEGACY_FW_SUFFIX}${SPEC_SUFFIX}.sh" +fi + +if [[ "$IMAGE" == *deepseek-v4-hopper* ]]; then + CONTAINER_MOUNT_DIR=/ix +else + CONTAINER_MOUNT_DIR=/workspace +fi + PARTITION="main" set -x srun --partition=$PARTITION --gres=gpu:$TP --exclusive --job-name="$RUNNER_NAME" \ --container-image=$IMAGE \ --container-name=$(echo "$IMAGE" | sed 's/[\/:@#]/_/g')-${USER} \ ---container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ +--container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --container-remap-root \ --container-writable \ --container-mount-home \ ---container-workdir=/workspace/ \ +--container-workdir=$CONTAINER_MOUNT_DIR/ \ --no-container-entrypoint --export=ALL \ -bash benchmarks/single_node/${SCENARIO_SUBDIR}${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh +bash $BENCH_SCRIPT From 49661cef02f65325699946cafb619a1f077653f0 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Wed, 29 Apr 2026 00:52:17 +0800 Subject: [PATCH 4/9] pin deepseek-v4-hopper image digest --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index e8a0faab6..b7993a0b0 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2606,7 +2606,7 @@ dsv4-fp8-h200-vllm: # DeepSeek-V4-Pro H200 single-node with SGLang (Marlin FP4, TP-only). dsv4-fp4-h200-sglang: - image: lmsysorg/sglang:deepseek-v4-hopper + image: lmsysorg/sglang:deepseek-v4-hopper@sha256:7f19c6dc092e47a10fac2e41f47eab78970280d06648b8e50d312a82f0ae722f model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: h200 From a5f403e553e44754f93acecd040c9771d57b2294 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Wed, 29 Apr 2026 12:18:13 +0800 Subject: [PATCH 5/9] fp4 -> fp8 --- .github/configs/nvidia-master.yaml | 6 +++--- .../{dsv4_fp4_h200_sglang.sh => dsv4_fp8_h200_sglang.sh} | 0 2 files changed, 3 insertions(+), 3 deletions(-) rename benchmarks/single_node/{dsv4_fp4_h200_sglang.sh => dsv4_fp8_h200_sglang.sh} (100%) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index b7993a0b0..1d793f0f0 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2604,13 +2604,13 @@ dsv4-fp8-h200-vllm: search-space: - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 64 } -# DeepSeek-V4-Pro H200 single-node with SGLang (Marlin FP4, TP-only). -dsv4-fp4-h200-sglang: +# DeepSeek-V4-Pro H200 single-node with SGLang (Marlin FP8, TP-only). +dsv4-fp8-h200-sglang: image: lmsysorg/sglang:deepseek-v4-hopper@sha256:7f19c6dc092e47a10fac2e41f47eab78970280d06648b8e50d312a82f0ae722f model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: h200 - precision: fp4 + precision: fp8 framework: sglang multinode: false seq-len-configs: diff --git a/benchmarks/single_node/dsv4_fp4_h200_sglang.sh b/benchmarks/single_node/dsv4_fp8_h200_sglang.sh similarity index 100% rename from benchmarks/single_node/dsv4_fp4_h200_sglang.sh rename to benchmarks/single_node/dsv4_fp8_h200_sglang.sh From d58ca0682ef78d8dd80700ef279638df85303622 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Wed, 29 Apr 2026 12:19:26 +0800 Subject: [PATCH 6/9] conc: drop 2, add 64 --- .github/configs/nvidia-master.yaml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 1d793f0f0..9f44c2d64 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2617,11 +2617,13 @@ dsv4-fp8-h200-sglang: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 1, conc-end: 32 } + - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 1, conc-end: 32 } + - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } # DeepSeek-V4-Pro B300 single-node aggregate recipe from the submitted B300 # pareto sweep. The single-node schema has no explicit data-parallel-size From 189a71d8e780076a647b5a13f006b1f306b19fe6 Mon Sep 17 00:00:00 2001 From: functionstackx <47992694+functionstackx@users.noreply.github.com> Date: Sat, 2 May 2026 16:10:18 -0400 Subject: [PATCH 7/9] dsv4-fp8-h200-sglang: disable radix cache and migrate to scenarios.fixed-seq-len schema Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/nvidia-master.yaml | 23 ++++++++++--------- .../single_node/dsv4_fp8_h200_sglang.sh | 1 + 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 9f44c2d64..8c9fc0c7a 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2613,17 +2613,18 @@ dsv4-fp8-h200-sglang: precision: fp8 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } # DeepSeek-V4-Pro B300 single-node aggregate recipe from the submitted B300 # pareto sweep. The single-node schema has no explicit data-parallel-size diff --git a/benchmarks/single_node/dsv4_fp8_h200_sglang.sh b/benchmarks/single_node/dsv4_fp8_h200_sglang.sh index a7e822596..f2fc8541e 100644 --- a/benchmarks/single_node/dsv4_fp8_h200_sglang.sh +++ b/benchmarks/single_node/dsv4_fp8_h200_sglang.sh @@ -42,6 +42,7 @@ PYTHONNOUSERSITE=1 sglang serve \ --moe-runner-backend marlin \ --chunked-prefill-size 4096 \ --disable-flashinfer-autotune \ + --disable-radix-cache \ --mem-fraction-static 0.88 \ --max-running-requests "$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))" \ $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 & From 0fa05d309529ee12f68732b08f1be9a3e2c4ba85 Mon Sep 17 00:00:00 2001 From: functionstackx <47992694+functionstackx@users.noreply.github.com> Date: Sat, 2 May 2026 16:24:17 -0400 Subject: [PATCH 8/9] perf-changelog: add dsv4-fp8-h200-sglang entry Co-Authored-By: Claude Opus 4.7 (1M context) --- perf-changelog.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 089ce36f2..6a4f08be5 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2103,3 +2103,13 @@ - "Sweep DP attention on/off and EP=8 via DP_ATTENTION and EP_SIZE matrix env vars" - "Ship a DeepSeek-V4 thinking-mode chat template so eval /v1/chat/completions works (the canonical checkpoint ships no chat_template)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1244 + +- config-keys: + - dsv4-fp8-h200-sglang + description: + - "Add DeepSeek-V4-Pro FP8 H200 single-node SGLang recipe (Marlin MoE backend, TP=8, EP=1)" + - "Image: lmsysorg/sglang:deepseek-v4-hopper pinned by digest" + - "Server flags: --moe-runner-backend marlin, --chunked-prefill-size 4096, --disable-flashinfer-autotune, --disable-radix-cache, --mem-fraction-static 0.88" + - "Search space: TP=8 EP=1, conc 1 and 4-64 for both 1k1k and 8k1k" + - "Pinned to the h200-dgxc runner pool (new runners.yaml group); launch_h200-dgxc-slurm.sh extended to support framework-tagged script names and mount /ix instead of /workspace for the deepseek-v4-hopper image" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1264 From 66a2446442057fd394207dfe59e72cfca3a08e94 Mon Sep 17 00:00:00 2001 From: functionstackx <47992694+functionstackx@users.noreply.github.com> Date: Sat, 2 May 2026 17:18:46 -0400 Subject: [PATCH 9/9] dsv4-fp8-h200-sglang: pin to h200-dgxc runner pool Revert the cw/nb launcher generalizations from the previous PR-1212 import since the deepseek-v4-hopper image only needs the /ix mount layout from launch_h200-dgxc-slurm.sh. Restrict the recipe to that runner pool by adding a new h200-dgxc group in runners.yaml (parallel to the existing gb300-cw / h200-multinode precedents) and setting runner: h200-dgxc. All 14 h200-dgxc-slurm_* runners already carry the h200-dgxc label in their GH Actions config, so runs-on: h200-dgxc dispatches cleanly. The runner.name prefix is still h200-dgxc-slurm, which matches the existing launch_h200-dgxc-slurm.sh. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/nvidia-master.yaml | 4 +++- .github/configs/runners.yaml | 15 +++++++++++++++ runners/launch_h200-cw.sh | 19 ++++--------------- runners/launch_h200-nb.sh | 20 ++++---------------- 4 files changed, 26 insertions(+), 32 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 8c9fc0c7a..4ce7b823e 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2605,11 +2605,13 @@ dsv4-fp8-h200-vllm: - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 64 } # DeepSeek-V4-Pro H200 single-node with SGLang (Marlin FP8, TP-only). +# Pinned to the h200-dgxc-slurm runner pool because the deepseek-v4-hopper +# image needs the /ix mount layout that only launch_h200-dgxc-slurm.sh sets up. dsv4-fp8-h200-sglang: image: lmsysorg/sglang:deepseek-v4-hopper@sha256:7f19c6dc092e47a10fac2e41f47eab78970280d06648b8e50d312a82f0ae722f model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 - runner: h200 + runner: h200-dgxc precision: fp8 framework: sglang multinode: false diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml index f574c629c..48a7173d4 100644 --- a/.github/configs/runners.yaml +++ b/.github/configs/runners.yaml @@ -49,6 +49,21 @@ h200-multinode: - 'h200-dgxc-slurm_11' - 'h200-dgxc-slurm_12' - 'h200-dgxc-slurm_13' +h200-dgxc: +- 'h200-dgxc-slurm_0' +- 'h200-dgxc-slurm_1' +- 'h200-dgxc-slurm_2' +- 'h200-dgxc-slurm_3' +- 'h200-dgxc-slurm_4' +- 'h200-dgxc-slurm_5' +- 'h200-dgxc-slurm_6' +- 'h200-dgxc-slurm_7' +- 'h200-dgxc-slurm_8' +- 'h200-dgxc-slurm_9' +- 'h200-dgxc-slurm_10' +- 'h200-dgxc-slurm_11' +- 'h200-dgxc-slurm_12' +- 'h200-dgxc-slurm_13' b200: - 'b200-cw_00' - 'b200-cw_01' diff --git a/runners/launch_h200-cw.sh b/runners/launch_h200-cw.sh index 4be548c87..1486c4fa6 100644 --- a/runners/launch_h200-cw.sh +++ b/runners/launch_h200-cw.sh @@ -4,13 +4,8 @@ export HF_HUB_CACHE_MOUNT="/mnt/vast/gharunner/hf-hub-cache" export PORT=8888 MODEL_CODE="${EXP_NAME%%_*}" +FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') -BENCH_BASE="benchmarks/single_node/${SCENARIO_SUBDIR}${MODEL_CODE}_${PRECISION}_h200" -BENCH_SCRIPT="${BENCH_BASE}_${FRAMEWORK}${SPEC_SUFFIX}.sh" -if [[ ! -f "$BENCH_SCRIPT" ]]; then - LEGACY_FW_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') - BENCH_SCRIPT="${BENCH_BASE}${LEGACY_FW_SUFFIX}${SPEC_SUFFIX}.sh" -fi PARTITION="h200" SQUASH_FILE="/mnt/vast/gharunner/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" @@ -43,19 +38,13 @@ else CONTAINER_IMAGE=$(realpath $SQUASH_FILE) fi -if [[ "$IMAGE" == *deepseek-v4-hopper* ]]; then - CONTAINER_MOUNT_DIR=/ix -else - CONTAINER_MOUNT_DIR=/workspace -fi - srun --jobid=$JOB_ID \ --container-image=$CONTAINER_IMAGE \ ---container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ +--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --container-mount-home \ ---container-workdir=$CONTAINER_MOUNT_DIR/ \ +--container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ -bash $BENCH_SCRIPT +bash benchmarks/single_node/${SCENARIO_SUBDIR}${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh rmdir $SAGEMAKER_SHM_PATH scancel $JOB_ID diff --git a/runners/launch_h200-nb.sh b/runners/launch_h200-nb.sh index de2505158..158c30792 100644 --- a/runners/launch_h200-nb.sh +++ b/runners/launch_h200-nb.sh @@ -4,31 +4,19 @@ export HF_HUB_CACHE_MOUNT="/mnt/data/gharunners/hf-hub-cache/" export PORT=8888 MODEL_CODE="${EXP_NAME%%_*}" +FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') -BENCH_BASE="benchmarks/single_node/${SCENARIO_SUBDIR}${MODEL_CODE}_${PRECISION}_h200" -BENCH_SCRIPT="${BENCH_BASE}_${FRAMEWORK}${SPEC_SUFFIX}.sh" -if [[ ! -f "$BENCH_SCRIPT" ]]; then - LEGACY_FW_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') - BENCH_SCRIPT="${BENCH_BASE}${LEGACY_FW_SUFFIX}${SPEC_SUFFIX}.sh" -fi - -if [[ "$IMAGE" == *deepseek-v4-hopper* ]]; then - CONTAINER_MOUNT_DIR=/ix -else - CONTAINER_MOUNT_DIR=/workspace -fi - PARTITION="main" set -x srun --partition=$PARTITION --gres=gpu:$TP --exclusive --job-name="$RUNNER_NAME" \ --container-image=$IMAGE \ --container-name=$(echo "$IMAGE" | sed 's/[\/:@#]/_/g')-${USER} \ ---container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ +--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --container-remap-root \ --container-writable \ --container-mount-home \ ---container-workdir=$CONTAINER_MOUNT_DIR/ \ +--container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ -bash $BENCH_SCRIPT +bash benchmarks/single_node/${SCENARIO_SUBDIR}${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh