SemiAnalysisAI · Copilot · Nov 4, 2025 · Nov 4, 2025 · Nov 4, 2025 · Nov 4, 2025
diff --git a/.github/workflows/gsm8k-benchmark.yml b/.github/workflows/gsm8k-benchmark.yml
@@ -0,0 +1,115 @@
+# Manually triggered workflow: runs the GSM8k evaluation on the runner chosen
+# by the "runner" input and uploads the result JSON as an artifact.
+name: GSM8k Benchmark
+
+on:
+    workflow_dispatch:
+        inputs:
+            model:
+                description: "Model to evaluate (e.g., meta-llama/Llama-3.1-8B-Instruct)"
+                required: true
+                type: string
+                default: "meta-llama/Llama-3.1-8B-Instruct"
+            runner:
+                description: "Runner to use (e.g., h100-cr, h200-cw, b200-nv, mi300x-amd, mi325x-amd, mi355x-amd)"
+                required: true
+                type: string
+                default: "h100-cr"
+            image:
+                description: "Docker image to use"
+                required: true
+                type: string
+                default: "vllm/vllm-openai:latest"
+            framework:
+                description: "Framework to use (vllm, sglang)"
+                required: true
+                type: string
+                default: "vllm"
+            precision:
+                description: "Precision (fp16, fp8, fp4)"
+                required: true
+                type: string
+                default: "fp16"
+            tp:
+                description: "Tensor parallelism size"
+                required: true
+                type: string
+                default: "1"
+            num-fewshot:
+                description: "Number of few-shot examples"
+                required: false
+                type: string
+                default: "5"
+            limit:
+                description: "Limit number of examples (empty for all)"
+                required: false
+                type: string
+                default: ""
+
+# Workflow inputs and secrets exposed to every step as environment variables.
+env:
+    HF_TOKEN: ${{ secrets.HF_TOKEN }}
+    HF_HUB_CACHE: '/mnt/hf_hub_cache/'
+    MODEL: ${{ inputs.model }}
+    FRAMEWORK: ${{ inputs.framework }}
+    PRECISION: ${{ inputs.precision }}
+    TP: ${{ inputs.tp }}
+    NUM_FEWSHOT: ${{ inputs.num-fewshot }}
+    LIMIT: ${{ inputs.limit }}
+    IMAGE: ${{ inputs.image }}
+
+jobs:
+    gsm8k-eval:
+        runs-on: ${{ inputs.runner }}
+        timeout-minutes: 180
+        name: 'GSM8k - ${{ inputs.model }} - ${{ inputs.framework }} - ${{ inputs.precision }} - tp=${{ inputs.tp }}'
+        permissions:
+            contents: read
+        steps:
+            # Remove leftover containers and Slurm jobs before starting, and
+            # wait until both are actually gone.
+            - name: Resource cleanup
+              run: |
+                  if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then
+                      echo "[Docker] Cleaning up resources ..."
+                      docker ps -aq | xargs -r docker rm -f
+                      docker network prune -f
+                      while [ -n "$(docker ps -aq)" ]; do
+                          docker ps -a
+                          sleep 5
+                      done
+                  fi
+                  if command -v squeue >/dev/null 2>&1; then
+                      echo "[Slurm] Cleaning up resources ..."
+                      scancel -u "$USER"
+                      while [ -n "$(squeue -u "$USER" --noheader --format='%i')" ]; do
+                          squeue -u "$USER"
+                          sleep 5
+                      done
+                  fi
+
+            - uses: actions/checkout@v4
+              with:
+                  token: ${{ secrets.REPO_PAT }}
+                  fetch-depth: 0
+
+            - name: Launch GSM8k evaluation
+              env:
+                  RUNNER_NAME: ${{ runner.name }}
+                  RESULT_FILENAME: gsm8k_${{ env.FRAMEWORK }}_${{ env.PRECISION }}_tp${{ env.TP }}_${{ runner.name }}
+              run: |
+                  # Derive the GPU type from the runner name: everything before the first
+                  # '-' or '_' (e.g., 'h100-cr' or 'h100_cr' -> 'h100').
+                  bash "./runners/launch_gsm8k_${RUNNER_NAME%%[-_]*}.sh"
+                  if [ -f "$RESULT_FILENAME.json" ]; then
+                      echo "RESULT_FILENAME=${RESULT_FILENAME}" >> "$GITHUB_ENV"
+                  else
+                      echo "Run failed: GSM8k result $RESULT_FILENAME.json not found." >&2
+                      exit 1
+                  fi
+
+            - name: Upload result
+              uses: actions/upload-artifact@v4
+              with:
+                  name: ${{ env.RESULT_FILENAME }}
+                  path: ${{ env.RESULT_FILENAME }}.json
diff --git a/benchmarks/gsm8k_sglang_docker.sh b/benchmarks/gsm8k_sglang_docker.sh
@@ -0,0 +1,127 @@
+#!/usr/bin/env bash
+
+# Runs the GSM8k benchmark with lm-eval against a locally started SGLang server.
+#
+# === Required Env Vars ===
+# HF_TOKEN
+# HF_HUB_CACHE
+# MODEL
+# TP
+# NUM_FEWSHOT (optional, defaults to 5)
+# LIMIT (optional, empty for all examples)
+# RESULT_FILENAME (optional, for output filename)
+# PORT (optional, defaults to 8000)
+# OUTPUT_DIR (optional, defaults to /workspace)
+# MAX_RETRIES (optional, health-check attempts 5 s apart, defaults to 60)
+# NUM_CONCURRENT (optional, parallel requests sent by lm-eval, defaults to 32)
+
+set -euo pipefail
+
+# Fail early with a clear message when a mandatory setting is missing
+: "${MODEL:?MODEL must be set}"
+: "${TP:?TP must be set}"
+
+# Set defaults
+NUM_FEWSHOT=${NUM_FEWSHOT:-5}
+PORT=${PORT:-8000}
+OUTPUT_DIR=${OUTPUT_DIR:-/workspace}
+MAX_RETRIES=${MAX_RETRIES:-60}
+NUM_CONCURRENT=${NUM_CONCURRENT:-32}
+LIMIT=${LIMIT:-}
+RESULT_FILENAME=${RESULT_FILENAME:-}
+
+SERVER_PID=""
+TEMP_OUTPUT_DIR=""
+
+# Stop the server and delete the scratch directory on every exit path, so a
+# failed evaluation does not leave the server holding the GPUs.
+cleanup() {
+    if [ -n "$SERVER_PID" ]; then
+        echo "Shutting down server..."
+        kill "$SERVER_PID" 2>/dev/null || true
+        wait "$SERVER_PID" 2>/dev/null || true
+    fi
+    if [ -n "$TEMP_OUTPUT_DIR" ]; then
+        rm -rf "$TEMP_OUTPUT_DIR"
+    fi
+}
+trap cleanup EXIT
+
+# Start SGLang server in background
+echo "Starting SGLang server for model: $MODEL"
+export PYTHONNOUSERSITE=1
+
+python3 -m sglang.launch_server \
+    --model-path "$MODEL" \
+    --host 0.0.0.0 \
+    --port "$PORT" \
+    --tp-size "$TP" \
+    --mem-fraction-static 0.9 &
+
+SERVER_PID=$!
+
+# Wait for server to be ready
+echo "Waiting for server to start..."
+RETRY_COUNT=0
+# -f makes curl treat HTTP error statuses as "not ready yet"
+while ! curl -sf "http://localhost:$PORT/health" > /dev/null; do
+    # Stop polling as soon as the server process itself has died
+    if ! kill -0 "$SERVER_PID" 2>/dev/null; then
+        echo "Server process exited before becoming ready" >&2
+        exit 1
+    fi
+    RETRY_COUNT=$((RETRY_COUNT + 1))
+    if [ "$RETRY_COUNT" -ge "$MAX_RETRIES" ]; then
+        echo "Server failed to start within timeout" >&2
+        exit 1
+    fi
+    sleep 5
+done
+
+echo "Server started successfully"
+
+# Run lm-eval with GSM8k
+echo "Running GSM8k evaluation..."
+
+# Create temporary output directory
+TEMP_OUTPUT_DIR=$(mktemp -d)
+
+# Build lm-eval command as an array so MODEL/LIMIT are never re-parsed by the
+# shell. The "local-completions" model type sends requests to the running
+# OpenAI-compatible server; the "vllm" model type would instead load the model
+# in-process and does not accept a base_url.
+# NOTE(review): local-completions needs the lm-eval API extras installed in the
+# image - confirm.
+LMEVAL_CMD=(
+    lm_eval --model local-completions
+    --model_args "model=$MODEL,base_url=http://localhost:$PORT/v1/completions,num_concurrent=$NUM_CONCURRENT,max_retries=3,tokenized_requests=False"
+    --tasks gsm8k
+    --num_fewshot "$NUM_FEWSHOT"
+    --output_path "$TEMP_OUTPUT_DIR"
+)
+
+# Add limit if specified
+if [ -n "$LIMIT" ]; then
+    LMEVAL_CMD+=(--limit "$LIMIT")
+fi
+
+# Run evaluation
+"${LMEVAL_CMD[@]}"
+
+# Copy results to expected location
+if [ -n "$RESULT_FILENAME" ]; then
+    # Depending on the lm-eval version the file is results.json or
+    # <model>/results_<timestamp>.json, so search the output directory.
+    RESULT_SRC=$(find "$TEMP_OUTPUT_DIR" -type f -name 'results*.json' -print -quit)
+    if [ -n "$RESULT_SRC" ]; then
+        cp "$RESULT_SRC" "$OUTPUT_DIR/$RESULT_FILENAME.json"
+        echo "Results saved to $OUTPUT_DIR/$RESULT_FILENAME.json"
+    else
+        echo "Error: lm-eval output not found" >&2
+        ls -laR "$TEMP_OUTPUT_DIR" >&2
+        exit 1
+    fi
+fi
+
+# Server shutdown happens in the EXIT trap
+echo "GSM8k evaluation completed"
diff --git a/benchmarks/gsm8k_vllm_docker.sh b/benchmarks/gsm8k_vllm_docker.sh
@@ -0,0 +1,125 @@
+#!/usr/bin/env bash
+
+# Runs the GSM8k benchmark with lm-eval against a locally started vLLM server.
+#
+# === Required Env Vars ===
+# HF_TOKEN
+# HF_HUB_CACHE
+# MODEL
+# TP
+# NUM_FEWSHOT (optional, defaults to 5)
+# LIMIT (optional, empty for all examples)
+# RESULT_FILENAME (optional, for output filename)
+# PORT (optional, defaults to 8000)
+# OUTPUT_DIR (optional, defaults to /workspace)
+# MAX_RETRIES (optional, health-check attempts 5 s apart, defaults to 60)
+# NUM_CONCURRENT (optional, parallel requests sent by lm-eval, defaults to 32)
+
+set -euo pipefail
+
+# Fail early with a clear message when a mandatory setting is missing
+: "${MODEL:?MODEL must be set}"
+: "${TP:?TP must be set}"
+
+# Set defaults
+NUM_FEWSHOT=${NUM_FEWSHOT:-5}
+PORT=${PORT:-8000}
+OUTPUT_DIR=${OUTPUT_DIR:-/workspace}
+MAX_RETRIES=${MAX_RETRIES:-60}
+NUM_CONCURRENT=${NUM_CONCURRENT:-32}
+LIMIT=${LIMIT:-}
+RESULT_FILENAME=${RESULT_FILENAME:-}
+
+SERVER_PID=""
+TEMP_OUTPUT_DIR=""
+
+# Stop the server and delete the scratch directory on every exit path, so a
+# failed evaluation does not leave the server holding the GPUs.
+cleanup() {
+    if [ -n "$SERVER_PID" ]; then
+        echo "Shutting down server..."
+        kill "$SERVER_PID" 2>/dev/null || true
+        wait "$SERVER_PID" 2>/dev/null || true
+    fi
+    if [ -n "$TEMP_OUTPUT_DIR" ]; then
+        rm -rf "$TEMP_OUTPUT_DIR"
+    fi
+}
+trap cleanup EXIT
+
+# Start vLLM server in background
+echo "Starting vLLM server for model: $MODEL"
+export PYTHONNOUSERSITE=1
+
+vllm serve "$MODEL" --host=0.0.0.0 --port="$PORT" \
+    --gpu-memory-utilization=0.9 \
+    --tensor-parallel-size="$TP" \
+    --disable-log-requests &
+
+SERVER_PID=$!
+
+# Wait for server to be ready
+echo "Waiting for server to start..."
+RETRY_COUNT=0
+# -f makes curl treat HTTP error statuses as "not ready yet"
+while ! curl -sf "http://localhost:$PORT/health" > /dev/null; do
+    # Stop polling as soon as the server process itself has died
+    if ! kill -0 "$SERVER_PID" 2>/dev/null; then
+        echo "Server process exited before becoming ready" >&2
+        exit 1
+    fi
+    RETRY_COUNT=$((RETRY_COUNT + 1))
+    if [ "$RETRY_COUNT" -ge "$MAX_RETRIES" ]; then
+        echo "Server failed to start within timeout" >&2
+        exit 1
+    fi
+    sleep 5
+done
+
+echo "Server started successfully"
+
+# Run lm-eval with GSM8k
+echo "Running GSM8k evaluation..."
+
+# Create temporary output directory
+TEMP_OUTPUT_DIR=$(mktemp -d)
+
+# Build lm-eval command as an array so MODEL/LIMIT are never re-parsed by the
+# shell. The "local-completions" model type sends requests to the running
+# OpenAI-compatible server; the "vllm" model type would instead load the model
+# in-process and does not accept a base_url.
+# NOTE(review): local-completions needs the lm-eval API extras installed in the
+# image - confirm.
+LMEVAL_CMD=(
+    lm_eval --model local-completions
+    --model_args "model=$MODEL,base_url=http://localhost:$PORT/v1/completions,num_concurrent=$NUM_CONCURRENT,max_retries=3,tokenized_requests=False"
+    --tasks gsm8k
+    --num_fewshot "$NUM_FEWSHOT"
+    --output_path "$TEMP_OUTPUT_DIR"
+)
+
+# Add limit if specified
+if [ -n "$LIMIT" ]; then
+    LMEVAL_CMD+=(--limit "$LIMIT")
+fi
+
+# Run evaluation
+"${LMEVAL_CMD[@]}"
+
+# Copy results to expected location
+if [ -n "$RESULT_FILENAME" ]; then
+    # Depending on the lm-eval version the file is results.json or
+    # <model>/results_<timestamp>.json, so search the output directory.
+    RESULT_SRC=$(find "$TEMP_OUTPUT_DIR" -type f -name 'results*.json' -print -quit)
+    if [ -n "$RESULT_SRC" ]; then
+        cp "$RESULT_SRC" "$OUTPUT_DIR/$RESULT_FILENAME.json"
+        echo "Results saved to $OUTPUT_DIR/$RESULT_FILENAME.json"
+    else
+        echo "Error: lm-eval output not found" >&2
+        ls -laR "$TEMP_OUTPUT_DIR" >&2
+        exit 1
+    fi
+fi
+
+# Server shutdown happens in the EXIT trap
+echo "GSM8k evaluation completed"
diff --git a/docs/GSM8K_BENCHMARK.md b/docs/GSM8K_BENCHMARK.md
@@ -0,0 +1,93 @@
+# GSM8k Benchmark CI
+
+This document describes the CI workflow for running GSM8k (Grade School Math 8K) benchmarks using lm-evaluation-harness.
+
+## Overview
+
+GSM8k is a dataset of 8.5K high-quality linguistically diverse grade school math word problems. This benchmark evaluates the mathematical reasoning capabilities of language models.
+
+## Workflow
+
+The GSM8k benchmark CI is implemented in `.github/workflows/gsm8k-benchmark.yml` and can be triggered manually via workflow_dispatch.
+
+### Parameters
+
+- **model**: The model to evaluate (e.g., `meta-llama/Llama-3.1-8B-Instruct`)
+- **runner**: The GPU runner to use (e.g., `h100-cr`, `h200-cw`, `b200-nv`, `mi300x-amd`, `mi325x-amd`, `mi355x-amd`)
+- **image**: Docker image to use (e.g., `vllm/vllm-openai:latest`)
+- **framework**: Inference framework (`vllm` or `sglang`)
+- **precision**: Model precision (`fp16`, `fp8`, `fp4`)
+- **tp**: Tensor parallelism size
+- **num-fewshot**: Number of few-shot examples (default: 5)
+- **limit**: Optional limit on number of examples to evaluate
+
+## Architecture
+
+### Benchmark Scripts
+
+Located in `benchmarks/`:
+- `gsm8k_vllm_docker.sh`: Runs GSM8k evaluation using vLLM
+- `gsm8k_sglang_docker.sh`: Runs GSM8k evaluation using SGLang
+
+These scripts:
+1. Start an inference server (vLLM or SGLang)
+2. Wait for the server to be ready
+3. Run lm-eval with GSM8k task
+4. Save the results and shut down the server
+
+### Runner Scripts
+
+Located in `runners/`:
+- `launch_gsm8k_h100.sh`: Launch on H100 GPUs
+- `launch_gsm8k_h200.sh`: Launch on H200 GPUs
+- `launch_gsm8k_b200.sh`: Launch on B200 GPUs
+- `launch_gsm8k_mi300x.sh`: Launch on MI300X GPUs
+- `launch_gsm8k_mi325x.sh`: Launch on MI325X GPUs
+- `launch_gsm8k_mi355x.sh`: Launch on MI355X GPUs
+
+These scripts handle:
+- Docker container setup
+- GPU configuration (CUDA/ROCm)
+- Volume mounts
+- Environment variable passing
+
+## Usage
+
+### Via GitHub Actions UI
+
+1. Go to Actions tab
+2. Select "GSM8k Benchmark" workflow
+3. Click "Run workflow"
+4. Fill in the parameters
+5. Click "Run workflow"
+
+### Example
+
+To evaluate Llama-3.1-8B-Instruct on H100 with vLLM:
+- model: `meta-llama/Llama-3.1-8B-Instruct`
+- runner: `h100-cr`
+- image: `vllm/vllm-openai:latest`
+- framework: `vllm`
+- precision: `fp16`
+- tp: `1`
+
+## Results
+
+Results are saved as JSON files and uploaded as GitHub Actions artifacts. The output includes:
+- Accuracy metrics
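+- Per-example results (only if lm-eval is invoked with `--log_samples`, which the benchmark scripts do not currently pass)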
+- Model and configuration metadata
+
+## Dependencies
+
+The benchmark requires:
+- `lm-eval[vllm]`: Language Model Evaluation Harness with vLLM support
+- A running inference server (vLLM or SGLang)
+- Access to the model on HuggingFace Hub
+
+## References
+
+- [GSM8k Paper](https://arxiv.org/abs/2110.14168)
+- [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness)
+- [vLLM](https://github.com/vllm-project/vllm)
+- [SGLang](https://github.com/sgl-project/sglang)