diff --git a/.github/workflows/gsm8k-benchmark.yml b/.github/workflows/gsm8k-benchmark.yml
new file mode 100644
index 000000000..4800fa873
--- /dev/null
+++ b/.github/workflows/gsm8k-benchmark.yml
@@ -0,0 +1,121 @@
+# Manually triggered GSM8k accuracy evaluation on a self-hosted GPU runner.
+# The inputs are exported as job-wide env vars that are read by
+# runners/launch_gsm8k_<gpu>.sh and benchmarks/gsm8k_<framework>_docker.sh.
+name: GSM8k Benchmark
+
+on:
+  workflow_dispatch:
+    inputs:
+      model:
+        description: "Model to evaluate (e.g., meta-llama/Llama-3.1-8B-Instruct)"
+        required: true
+        type: string
+        default: "meta-llama/Llama-3.1-8B-Instruct"
+      runner:
+        description: "Runner to use (e.g., h100-cr, h200-cw, b200-nv, mi300x-amd, mi325x-amd, mi355x-amd)"
+        required: true
+        type: string
+        default: "h100-cr"
+      image:
+        description: "Docker image to use"
+        required: true
+        type: string
+        default: "vllm/vllm-openai:latest"
+      framework:
+        description: "Framework to use (vllm, sglang)"
+        required: true
+        type: string
+        default: "vllm"
+      precision:
+        description: "Precision (fp16, fp8, fp4)"
+        required: true
+        type: string
+        default: "fp16"
+      tp:
+        description: "Tensor parallelism size"
+        required: true
+        type: string
+        default: "1"
+      num-fewshot:
+        description: "Number of few-shot examples"
+        required: false
+        type: string
+        default: "5"
+      limit:
+        description: "Limit number of examples (empty for all)"
+        required: false
+        type: string
+        default: ""
+
+env:
+  HF_TOKEN: ${{ secrets.HF_TOKEN }}
+  HF_HUB_CACHE: '/mnt/hf_hub_cache/'
+  MODEL: ${{ inputs.model }}
+  FRAMEWORK: ${{ inputs.framework }}
+  PRECISION: ${{ inputs.precision }}
+  TP: ${{ inputs.tp }}
+  NUM_FEWSHOT: ${{ inputs.num-fewshot }}
+  LIMIT: ${{ inputs.limit }}
+  IMAGE: ${{ inputs.image }}
+
+jobs:
+  gsm8k-eval:
+    runs-on: ${{ inputs.runner }}
+    timeout-minutes: 180
+    name: 'GSM8k - ${{ inputs.model }} - ${{ inputs.framework }} - ${{ inputs.precision }} - tp=${{ inputs.tp }}'
+    permissions:
+      contents: read
+    steps:
+      # Remove all containers and cancel the user's Slurm jobs before starting.
+      - name: Resource cleanup
+        run: |
+          if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then
+            echo "[Docker] Cleaning up resources ..."
+            docker ps -aq | xargs -r docker rm -f
+            docker network prune -f
+            while [ -n "$(docker ps -aq)" ]; do
+              docker ps -a
+              sleep 5
+            done
+          fi
+          if command -v squeue >/dev/null 2>&1; then
+            echo "[Slurm] Cleaning up resources ..."
+            scancel -u "$USER"
+            while [ -n "$(squeue -u "$USER" --noheader --format='%i')" ]; do
+              squeue -u "$USER"
+              sleep 5
+            done
+          fi
+
+      - uses: actions/checkout@v4
+        with:
+          token: ${{ secrets.REPO_PAT }}
+          fetch-depth: 0
+
+      - name: Launch GSM8k evaluation
+        env:
+          RUNNER_NAME: ${{ runner.name }}
+          RESULT_FILENAME: gsm8k_${{ env.FRAMEWORK }}_${{ env.PRECISION }}_tp${{ env.TP }}_${{ runner.name }}
+        run: |
+          # Launchers are named after the GPU type only (e.g. launch_gsm8k_h100.sh),
+          # so drop everything from the first '-' or '_' of the runner name.
+          gpu_type="${RUNNER_NAME%%[-_]*}"
+          launcher="./runners/launch_gsm8k_${gpu_type}.sh"
+          if [ ! -f "$launcher" ]; then
+            echo "No launcher script for runner '$RUNNER_NAME' (expected $launcher)." >&2
+            exit 1
+          fi
+          bash "$launcher"
+          if [ -f "$RESULT_FILENAME.json" ]; then
+            echo "RESULT_FILENAME=${RESULT_FILENAME}" >> "$GITHUB_ENV"
+          else
+            echo "Run failed: GSM8k result $RESULT_FILENAME.json not found." >&2
+            exit 1
+          fi
+
+      - name: Upload result
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ env.RESULT_FILENAME }}
+          path: ${{ env.RESULT_FILENAME }}.json
+          if-no-files-found: error
diff --git a/benchmarks/gsm8k_sglang_docker.sh b/benchmarks/gsm8k_sglang_docker.sh
new file mode 100755
index 000000000..91b77c836
--- /dev/null
+++ b/benchmarks/gsm8k_sglang_docker.sh
@@ -0,0 +1,129 @@
+#!/usr/bin/env bash
+#
+# Runs the GSM8k benchmark against a locally launched SGLang server.
+#
+# Flow: start the server in the background, poll /health until it answers,
+# run lm-eval through the server's OpenAI-compatible completions endpoint,
+# copy the results JSON to $OUTPUT_DIR and shut the server down on exit.
+#
+# === Required Env Vars ===
+# MODEL
+# TP
+#
+# === Optional Env Vars ===
+# HF_TOKEN         (needed for gated models)
+# HF_HUB_CACHE
+# NUM_FEWSHOT      (defaults to 5)
+# LIMIT            (empty for all examples)
+# RESULT_FILENAME  (output file name without .json; nothing is copied if empty)
+# PORT             (defaults to 8000)
+# OUTPUT_DIR       (defaults to /workspace)
+# NUM_CONCURRENT   (parallel requests issued by lm-eval, defaults to 32)
+
+set -euo pipefail
+
+: "${MODEL:?MODEL must be set}"
+: "${TP:?TP must be set}"
+
+# Set defaults
+NUM_FEWSHOT=${NUM_FEWSHOT:-5}
+LIMIT=${LIMIT:-}
+RESULT_FILENAME=${RESULT_FILENAME:-}
+PORT=${PORT:-8000}
+OUTPUT_DIR=${OUTPUT_DIR:-/workspace}
+NUM_CONCURRENT=${NUM_CONCURRENT:-32}
+
+SERVER_PID=""
+TEMP_OUTPUT_DIR=""
+
+# Stop the server and remove scratch files however the script exits, so a
+# failed evaluation never leaves a server behind that keeps the GPUs busy.
+cleanup() {
+    if [ -n "$SERVER_PID" ]; then
+        echo "Shutting down server..."
+        kill "$SERVER_PID" 2>/dev/null || true
+        wait "$SERVER_PID" 2>/dev/null || true
+    fi
+    if [ -n "$TEMP_OUTPUT_DIR" ]; then
+        rm -rf "$TEMP_OUTPUT_DIR"
+    fi
+}
+trap cleanup EXIT
+
+# Start SGLang server in background
+echo "Starting SGLang server for model: $MODEL"
+export PYTHONNOUSERSITE=1
+
+python3 -m sglang.launch_server \
+    --model-path "$MODEL" \
+    --host 0.0.0.0 \
+    --port "$PORT" \
+    --tp-size "$TP" \
+    --mem-fraction-static 0.9 &
+
+SERVER_PID=$!
+
+# Wait for server to be ready (at most MAX_RETRIES polls, 5 seconds apart)
+echo "Waiting for server to start..."
+MAX_RETRIES=60
+RETRY_COUNT=0
+while ! curl -sf "http://localhost:$PORT/health" > /dev/null; do
+    # Fail fast if the server process died instead of polling until the timeout.
+    if ! kill -0 "$SERVER_PID" 2>/dev/null; then
+        echo "Server process exited before becoming healthy" >&2
+        exit 1
+    fi
+    RETRY_COUNT=$((RETRY_COUNT + 1))
+    if [ "$RETRY_COUNT" -ge "$MAX_RETRIES" ]; then
+        echo "Server failed to start within timeout" >&2
+        exit 1
+    fi
+    sleep 5
+done
+
+echo "Server started successfully"
+
+# Run lm-eval with GSM8k
+echo "Running GSM8k evaluation..."
+
+# Create temporary output directory
+TEMP_OUTPUT_DIR=$(mktemp -d)
+
+# Build the lm-eval command as an array so values are passed verbatim and no
+# eval is needed. The server is queried through its OpenAI-compatible
+# completions endpoint; lm-eval's in-process "vllm" backend would instead try
+# to load a second copy of the model next to the running server.
+LMEVAL_CMD=(
+    lm_eval
+    --model local-completions
+    --model_args "model=$MODEL,base_url=http://localhost:$PORT/v1/completions,num_concurrent=$NUM_CONCURRENT,max_retries=3,tokenized_requests=False"
+    --tasks gsm8k
+    --num_fewshot "$NUM_FEWSHOT"
+    --batch_size 1
+    --output_path "$TEMP_OUTPUT_DIR"
+)
+
+# Add limit if specified
+if [ -n "$LIMIT" ]; then
+    LMEVAL_CMD+=(--limit "$LIMIT")
+fi
+
+# Run evaluation
+"${LMEVAL_CMD[@]}"
+
+# Copy results to expected location
+if [ -n "$RESULT_FILENAME" ]; then
+    # Depending on its version, lm-eval writes results.json or
+    # <model-name>/results_<timestamp>.json below the output path.
+    RESULT_FILE=$(find "$TEMP_OUTPUT_DIR" -type f -name 'results*.json' | sort | tail -n 1)
+    if [ -n "$RESULT_FILE" ]; then
+        cp "$RESULT_FILE" "$OUTPUT_DIR/$RESULT_FILENAME.json"
+        echo "Results saved to $OUTPUT_DIR/$RESULT_FILENAME.json"
+    else
+        echo "Error: lm-eval output not found" >&2
+        ls -laR "$TEMP_OUTPUT_DIR" >&2
+        exit 1
+    fi
+fi
+
+echo "GSM8k evaluation completed"
diff --git a/benchmarks/gsm8k_vllm_docker.sh b/benchmarks/gsm8k_vllm_docker.sh
new file mode 100755
index 000000000..37fabb3b2
--- /dev/null
+++ b/benchmarks/gsm8k_vllm_docker.sh
@@ -0,0 +1,127 @@
+#!/usr/bin/env bash
+#
+# Runs the GSM8k benchmark against a locally launched vLLM server.
+#
+# Flow: start the server in the background, poll /health until it answers,
+# run lm-eval through the server's OpenAI-compatible completions endpoint,
+# copy the results JSON to $OUTPUT_DIR and shut the server down on exit.
+#
+# === Required Env Vars ===
+# MODEL
+# TP
+#
+# === Optional Env Vars ===
+# HF_TOKEN         (needed for gated models)
+# HF_HUB_CACHE
+# NUM_FEWSHOT      (defaults to 5)
+# LIMIT            (empty for all examples)
+# RESULT_FILENAME  (output file name without .json; nothing is copied if empty)
+# PORT             (defaults to 8000)
+# OUTPUT_DIR       (defaults to /workspace)
+# NUM_CONCURRENT   (parallel requests issued by lm-eval, defaults to 32)
+
+set -euo pipefail
+
+: "${MODEL:?MODEL must be set}"
+: "${TP:?TP must be set}"
+
+# Set defaults
+NUM_FEWSHOT=${NUM_FEWSHOT:-5}
+LIMIT=${LIMIT:-}
+RESULT_FILENAME=${RESULT_FILENAME:-}
+PORT=${PORT:-8000}
+OUTPUT_DIR=${OUTPUT_DIR:-/workspace}
+NUM_CONCURRENT=${NUM_CONCURRENT:-32}
+
+SERVER_PID=""
+TEMP_OUTPUT_DIR=""
+
+# Stop the server and remove scratch files however the script exits, so a
+# failed evaluation never leaves a server behind that keeps the GPUs busy.
+cleanup() {
+    if [ -n "$SERVER_PID" ]; then
+        echo "Shutting down server..."
+        kill "$SERVER_PID" 2>/dev/null || true
+        wait "$SERVER_PID" 2>/dev/null || true
+    fi
+    if [ -n "$TEMP_OUTPUT_DIR" ]; then
+        rm -rf "$TEMP_OUTPUT_DIR"
+    fi
+}
+trap cleanup EXIT
+
+# Start vLLM server in background
+echo "Starting vLLM server for model: $MODEL"
+export PYTHONNOUSERSITE=1
+
+vllm serve "$MODEL" --host=0.0.0.0 --port="$PORT" \
+    --gpu-memory-utilization=0.9 \
+    --tensor-parallel-size="$TP" \
+    --disable-log-requests &
+
+SERVER_PID=$!
+
+# Wait for server to be ready (at most MAX_RETRIES polls, 5 seconds apart)
+echo "Waiting for server to start..."
+MAX_RETRIES=60
+RETRY_COUNT=0
+while ! curl -sf "http://localhost:$PORT/health" > /dev/null; do
+    # Fail fast if the server process died instead of polling until the timeout.
+    if ! kill -0 "$SERVER_PID" 2>/dev/null; then
+        echo "Server process exited before becoming healthy" >&2
+        exit 1
+    fi
+    RETRY_COUNT=$((RETRY_COUNT + 1))
+    if [ "$RETRY_COUNT" -ge "$MAX_RETRIES" ]; then
+        echo "Server failed to start within timeout" >&2
+        exit 1
+    fi
+    sleep 5
+done
+
+echo "Server started successfully"
+
+# Run lm-eval with GSM8k
+echo "Running GSM8k evaluation..."
+
+# Create temporary output directory
+TEMP_OUTPUT_DIR=$(mktemp -d)
+
+# Build the lm-eval command as an array so values are passed verbatim and no
+# eval is needed. The server is queried through its OpenAI-compatible
+# completions endpoint; lm-eval's in-process "vllm" backend would instead try
+# to load a second copy of the model next to the running server.
+LMEVAL_CMD=(
+    lm_eval
+    --model local-completions
+    --model_args "model=$MODEL,base_url=http://localhost:$PORT/v1/completions,num_concurrent=$NUM_CONCURRENT,max_retries=3,tokenized_requests=False"
+    --tasks gsm8k
+    --num_fewshot "$NUM_FEWSHOT"
+    --batch_size 1
+    --output_path "$TEMP_OUTPUT_DIR"
+)
+
+# Add limit if specified
+if [ -n "$LIMIT" ]; then
+    LMEVAL_CMD+=(--limit "$LIMIT")
+fi
+
+# Run evaluation
+"${LMEVAL_CMD[@]}"
+
+# Copy results to expected location
+if [ -n "$RESULT_FILENAME" ]; then
+    # Depending on its version, lm-eval writes results.json or
+    # <model-name>/results_<timestamp>.json below the output path.
+    RESULT_FILE=$(find "$TEMP_OUTPUT_DIR" -type f -name 'results*.json' | sort | tail -n 1)
+    if [ -n "$RESULT_FILE" ]; then
+        cp "$RESULT_FILE" "$OUTPUT_DIR/$RESULT_FILENAME.json"
+        echo "Results saved to $OUTPUT_DIR/$RESULT_FILENAME.json"
+    else
+        echo "Error: lm-eval output not found" >&2
+        ls -laR "$TEMP_OUTPUT_DIR" >&2
+        exit 1
+    fi
+fi
+
+echo "GSM8k evaluation completed"
diff --git a/docs/GSM8K_BENCHMARK.md b/docs/GSM8K_BENCHMARK.md
new file mode 100644
index 000000000..ca6d98a06
--- /dev/null
+++ b/docs/GSM8K_BENCHMARK.md
@@ -0,0 +1,95 @@
+# GSM8k Benchmark CI
+
+This document describes the CI workflow for running GSM8k (Grade School Math 8K) benchmarks using lm-evaluation-harness.
+
+## Overview
+
+GSM8k is a dataset of 8.5K high-quality linguistically diverse grade school math word problems. This benchmark evaluates the mathematical reasoning capabilities of language models.
+
+## Workflow
+
+The GSM8k benchmark CI is implemented in `.github/workflows/gsm8k-benchmark.yml` and can be triggered manually via workflow_dispatch.
+
+### Parameters
+
+- **model**: The model to evaluate (e.g., `meta-llama/Llama-3.1-8B-Instruct`)
+- **runner**: The GPU runner to use (e.g., `h100-cr`, `h200-cw`, `b200-nv`, `mi300x-amd`, `mi325x-amd`, `mi355x-amd`)
+- **image**: Docker image to use (e.g., `vllm/vllm-openai:latest`)
+- **framework**: Inference framework (`vllm` or `sglang`)
+- **precision**: Model precision (`fp16`, `fp8`, `fp4`); currently only used to label the job and the result file
+- **tp**: Tensor parallelism size
+- **num-fewshot**: Number of few-shot examples (default: 5)
+- **limit**: Optional limit on number of examples to evaluate
+
+## Architecture
+
+### Benchmark Scripts
+
+Located in `benchmarks/`:
+- `gsm8k_vllm_docker.sh`: Runs GSM8k evaluation using vLLM
+- `gsm8k_sglang_docker.sh`: Runs GSM8k evaluation using SGLang
+
+These scripts:
+1. Start an inference server (vLLM or SGLang)
+2. Wait for the server to be ready
+3. Run lm-eval with the GSM8k task against the server's OpenAI-compatible completions endpoint
+4. Save the results and shut down the server (also when the evaluation fails)
+
+### Runner Scripts
+
+Located in `runners/`:
+- `launch_gsm8k_h100.sh`: Launch on H100 GPUs
+- `launch_gsm8k_h200.sh`: Launch on H200 GPUs
+- `launch_gsm8k_b200.sh`: Launch on B200 GPUs
+- `launch_gsm8k_mi300x.sh`: Launch on MI300X GPUs
+- `launch_gsm8k_mi325x.sh`: Launch on MI325X GPUs
+- `launch_gsm8k_mi355x.sh`: Launch on MI355X GPUs
+
+The workflow picks the launcher from the runner name by dropping everything from the first `-` or `_`, so `h100-cr` maps to `launch_gsm8k_h100.sh`.
+
+These scripts handle:
+- Docker container setup
+- GPU configuration (CUDA/ROCm)
+- Volume mounts
+- Environment variable passing
+
+## Usage
+
+### Via GitHub Actions UI
+
+1. Go to Actions tab
+2. Select "GSM8k Benchmark" workflow
+3. Click "Run workflow"
+4. Fill in the parameters
+5. Click "Run workflow"
+
+### Example
+
+To evaluate Llama-3.1-8B-Instruct on H100 with vLLM:
+- model: `meta-llama/Llama-3.1-8B-Instruct`
+- runner: `h100-cr`
+- image: `vllm/vllm-openai:latest`
+- framework: `vllm`
+- precision: `fp16`
+- tp: `1`
+
+## Results
+
+Results are saved as JSON files and uploaded as GitHub Actions artifacts. The output includes:
+- Accuracy metrics
+- Task configuration (few-shot count, sample limit)
+- Model and configuration metadata
+
+## Dependencies
+
+The benchmark requires:
+- `lm-eval[api]`: Language Model Evaluation Harness with its API-client extra (installed inside the container by the runner scripts)
+- A running inference server (vLLM or SGLang)
+- Access to the model on HuggingFace Hub
+
+## References
+
+- [GSM8k Paper](https://arxiv.org/abs/2110.14168)
+- [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness)
+- [vLLM](https://github.com/vllm-project/vllm)
+- [SGLang](https://github.com/sgl-project/sglang)
diff --git a/runners/launch_gsm8k_b200.sh b/runners/launch_gsm8k_b200.sh
new file mode 100755
index 000000000..ab9ac3f04
--- /dev/null
+++ b/runners/launch_gsm8k_b200.sh
@@ -0,0 +1,40 @@
+#!/usr/bin/env bash
+#
+# Launches the GSM8k evaluation in a Docker container on a B200 (NVIDIA) node.
+#
+# IMAGE, FRAMEWORK, HF_HUB_CACHE and GITHUB_WORKSPACE must be set by the caller
+# (the gsm8k-benchmark workflow); HF_TOKEN, MODEL, TP, NUM_FEWSHOT, LIMIT and
+# RESULT_FILENAME are forwarded to the container as-is.
+
+# Abort on the first failure so the success message below is only printed
+# when the container actually exited cleanly.
+set -euo pipefail
+
+: "${IMAGE:?IMAGE must be set}"
+: "${FRAMEWORK:?FRAMEWORK must be set}"
+: "${HF_HUB_CACHE:?HF_HUB_CACHE must be set}"
+: "${GITHUB_WORKSPACE:?GITHUB_WORKSPACE must be set}"
+
+HF_HUB_CACHE_MOUNT="/mnt/hf_hub_cache/"
+PORT=8000
+
+container_name="gsm8k-eval"
+
+echo "Starting GSM8k evaluation container..."
+
+# lm-eval only talks to the server over HTTP, so the light-weight "api" extra
+# is enough; the "vllm" extra could install a different vLLM build into the image.
+set -x
+docker run --rm --network=host --name="$container_name" \
+    --runtime=nvidia --gpus=all --ipc=host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \
+    -v "$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE" \
+    -v "$GITHUB_WORKSPACE:/workspace/" -w /workspace/ \
+    -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e NUM_FEWSHOT -e LIMIT -e PORT="$PORT" -e RESULT_FILENAME \
+    -e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
+    --entrypoint=/bin/bash \
+    "$IMAGE" \
+    -lc "pip install -q 'lm-eval[api]' && bash /workspace/benchmarks/gsm8k_${FRAMEWORK}_docker.sh"
+
+set +x
+
+echo "GSM8k evaluation completed successfully"
diff --git a/runners/launch_gsm8k_h100.sh b/runners/launch_gsm8k_h100.sh
new file mode 100755
index 000000000..c8bfa30b4
--- /dev/null
+++ b/runners/launch_gsm8k_h100.sh
@@ -0,0 +1,40 @@
+#!/usr/bin/env bash
+#
+# Launches the GSM8k evaluation in a Docker container on an H100 (NVIDIA) node.
+#
+# IMAGE, FRAMEWORK, HF_HUB_CACHE and GITHUB_WORKSPACE must be set by the caller
+# (the gsm8k-benchmark workflow); HF_TOKEN, MODEL, TP, NUM_FEWSHOT, LIMIT and
+# RESULT_FILENAME are forwarded to the container as-is.
+
+# Abort on the first failure so the success message below is only printed
+# when the container actually exited cleanly.
+set -euo pipefail
+
+: "${IMAGE:?IMAGE must be set}"
+: "${FRAMEWORK:?FRAMEWORK must be set}"
+: "${HF_HUB_CACHE:?HF_HUB_CACHE must be set}"
+: "${GITHUB_WORKSPACE:?GITHUB_WORKSPACE must be set}"
+
+HF_HUB_CACHE_MOUNT="/home/ubuntu/hf_hub_cache/"
+PORT=8000
+
+container_name="gsm8k-eval"
+
+echo "Starting GSM8k evaluation container..."
+
+# lm-eval only talks to the server over HTTP, so the light-weight "api" extra
+# is enough; the "vllm" extra could install a different vLLM build into the image.
+set -x
+docker run --rm --network=host --name="$container_name" \
+    --runtime=nvidia --gpus=all --ipc=host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \
+    -v "$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE" \
+    -v "$GITHUB_WORKSPACE:/workspace/" -w /workspace/ \
+    -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e NUM_FEWSHOT -e LIMIT -e PORT="$PORT" -e RESULT_FILENAME \
+    -e TORCH_CUDA_ARCH_LIST="9.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
+    --entrypoint=/bin/bash \
+    "$IMAGE" \
+    -lc "pip install -q 'lm-eval[api]' && bash /workspace/benchmarks/gsm8k_${FRAMEWORK}_docker.sh"
+
+set +x
+
+echo "GSM8k evaluation completed successfully"
diff --git a/runners/launch_gsm8k_h200.sh b/runners/launch_gsm8k_h200.sh
new file mode 100755
index 000000000..c8bfa30b4
--- /dev/null
+++ b/runners/launch_gsm8k_h200.sh
@@ -0,0 +1,40 @@
+#!/usr/bin/env bash
+#
+# Launches the GSM8k evaluation in a Docker container on an H200 (NVIDIA) node.
+#
+# IMAGE, FRAMEWORK, HF_HUB_CACHE and GITHUB_WORKSPACE must be set by the caller
+# (the gsm8k-benchmark workflow); HF_TOKEN, MODEL, TP, NUM_FEWSHOT, LIMIT and
+# RESULT_FILENAME are forwarded to the container as-is.
+
+# Abort on the first failure so the success message below is only printed
+# when the container actually exited cleanly.
+set -euo pipefail
+
+: "${IMAGE:?IMAGE must be set}"
+: "${FRAMEWORK:?FRAMEWORK must be set}"
+: "${HF_HUB_CACHE:?HF_HUB_CACHE must be set}"
+: "${GITHUB_WORKSPACE:?GITHUB_WORKSPACE must be set}"
+
+HF_HUB_CACHE_MOUNT="/home/ubuntu/hf_hub_cache/"
+PORT=8000
+
+container_name="gsm8k-eval"
+
+echo "Starting GSM8k evaluation container..."
+
+# lm-eval only talks to the server over HTTP, so the light-weight "api" extra
+# is enough; the "vllm" extra could install a different vLLM build into the image.
+set -x
+docker run --rm --network=host --name="$container_name" \
+    --runtime=nvidia --gpus=all --ipc=host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \
+    -v "$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE" \
+    -v "$GITHUB_WORKSPACE:/workspace/" -w /workspace/ \
+    -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e NUM_FEWSHOT -e LIMIT -e PORT="$PORT" -e RESULT_FILENAME \
+    -e TORCH_CUDA_ARCH_LIST="9.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
+    --entrypoint=/bin/bash \
+    "$IMAGE" \
+    -lc "pip install -q 'lm-eval[api]' && bash /workspace/benchmarks/gsm8k_${FRAMEWORK}_docker.sh"
+
+set +x
+
+echo "GSM8k evaluation completed successfully"
diff --git a/runners/launch_gsm8k_mi300x.sh b/runners/launch_gsm8k_mi300x.sh
new file mode 100755
index 000000000..d9af2dbf1
--- /dev/null
+++ b/runners/launch_gsm8k_mi300x.sh
@@ -0,0 +1,40 @@
+#!/usr/bin/env bash
+#
+# Launches the GSM8k evaluation in a Docker container on an MI300X (AMD) node.
+#
+# IMAGE, FRAMEWORK, HF_HUB_CACHE and GITHUB_WORKSPACE must be set by the caller
+# (the gsm8k-benchmark workflow); HF_TOKEN, MODEL, TP, NUM_FEWSHOT, LIMIT and
+# RESULT_FILENAME are forwarded to the container as-is.
+
+# Abort on the first failure so the success message below is only printed
+# when the container actually exited cleanly.
+set -euo pipefail
+
+: "${IMAGE:?IMAGE must be set}"
+: "${FRAMEWORK:?FRAMEWORK must be set}"
+: "${HF_HUB_CACHE:?HF_HUB_CACHE must be set}"
+: "${GITHUB_WORKSPACE:?GITHUB_WORKSPACE must be set}"
+
+HF_HUB_CACHE_MOUNT="/mnt/hf_hub_cache/"
+PORT=8000
+
+container_name="gsm8k-eval"
+
+echo "Starting GSM8k evaluation container..."
+
+# lm-eval only talks to the server over HTTP, so the light-weight "api" extra
+# is enough; the "vllm" extra could install a different vLLM build into the image.
+set -x
+docker run --rm --network=host --name="$container_name" \
+    --device=/dev/kfd --device=/dev/dri --ipc=host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \
+    -v "$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE" \
+    -v "$GITHUB_WORKSPACE:/workspace/" -w /workspace/ \
+    -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e NUM_FEWSHOT -e LIMIT -e PORT="$PORT" -e RESULT_FILENAME \
+    -e ROCR_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
+    --entrypoint=/bin/bash \
+    "$IMAGE" \
+    -lc "pip install -q 'lm-eval[api]' && bash /workspace/benchmarks/gsm8k_${FRAMEWORK}_docker.sh"
+
+set +x
+
+echo "GSM8k evaluation completed successfully"
diff --git a/runners/launch_gsm8k_mi325x.sh b/runners/launch_gsm8k_mi325x.sh
new file mode 100755
index 000000000..d9af2dbf1
--- /dev/null
+++ b/runners/launch_gsm8k_mi325x.sh
@@ -0,0 +1,40 @@
+#!/usr/bin/env bash
+#
+# Launches the GSM8k evaluation in a Docker container on an MI325X (AMD) node.
+#
+# IMAGE, FRAMEWORK, HF_HUB_CACHE and GITHUB_WORKSPACE must be set by the caller
+# (the gsm8k-benchmark workflow); HF_TOKEN, MODEL, TP, NUM_FEWSHOT, LIMIT and
+# RESULT_FILENAME are forwarded to the container as-is.
+
+# Abort on the first failure so the success message below is only printed
+# when the container actually exited cleanly.
+set -euo pipefail
+
+: "${IMAGE:?IMAGE must be set}"
+: "${FRAMEWORK:?FRAMEWORK must be set}"
+: "${HF_HUB_CACHE:?HF_HUB_CACHE must be set}"
+: "${GITHUB_WORKSPACE:?GITHUB_WORKSPACE must be set}"
+
+HF_HUB_CACHE_MOUNT="/mnt/hf_hub_cache/"
+PORT=8000
+
+container_name="gsm8k-eval"
+
+echo "Starting GSM8k evaluation container..."
+
+# lm-eval only talks to the server over HTTP, so the light-weight "api" extra
+# is enough; the "vllm" extra could install a different vLLM build into the image.
+set -x
+docker run --rm --network=host --name="$container_name" \
+    --device=/dev/kfd --device=/dev/dri --ipc=host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \
+    -v "$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE" \
+    -v "$GITHUB_WORKSPACE:/workspace/" -w /workspace/ \
+    -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e NUM_FEWSHOT -e LIMIT -e PORT="$PORT" -e RESULT_FILENAME \
+    -e ROCR_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
+    --entrypoint=/bin/bash \
+    "$IMAGE" \
+    -lc "pip install -q 'lm-eval[api]' && bash /workspace/benchmarks/gsm8k_${FRAMEWORK}_docker.sh"
+
+set +x
+
+echo "GSM8k evaluation completed successfully"
diff --git a/runners/launch_gsm8k_mi355x.sh b/runners/launch_gsm8k_mi355x.sh
new file mode 100755
index 000000000..d9af2dbf1
--- /dev/null
+++ b/runners/launch_gsm8k_mi355x.sh
@@ -0,0 +1,40 @@
+#!/usr/bin/env bash
+#
+# Launches the GSM8k evaluation in a Docker container on an MI355X (AMD) node.
+#
+# IMAGE, FRAMEWORK, HF_HUB_CACHE and GITHUB_WORKSPACE must be set by the caller
+# (the gsm8k-benchmark workflow); HF_TOKEN, MODEL, TP, NUM_FEWSHOT, LIMIT and
+# RESULT_FILENAME are forwarded to the container as-is.
+
+# Abort on the first failure so the success message below is only printed
+# when the container actually exited cleanly.
+set -euo pipefail
+
+: "${IMAGE:?IMAGE must be set}"
+: "${FRAMEWORK:?FRAMEWORK must be set}"
+: "${HF_HUB_CACHE:?HF_HUB_CACHE must be set}"
+: "${GITHUB_WORKSPACE:?GITHUB_WORKSPACE must be set}"
+
+HF_HUB_CACHE_MOUNT="/mnt/hf_hub_cache/"
+PORT=8000
+
+container_name="gsm8k-eval"
+
+echo "Starting GSM8k evaluation container..."
+
+# lm-eval only talks to the server over HTTP, so the light-weight "api" extra
+# is enough; the "vllm" extra could install a different vLLM build into the image.
+set -x
+docker run --rm --network=host --name="$container_name" \
+    --device=/dev/kfd --device=/dev/dri --ipc=host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \
+    -v "$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE" \
+    -v "$GITHUB_WORKSPACE:/workspace/" -w /workspace/ \
+    -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e NUM_FEWSHOT -e LIMIT -e PORT="$PORT" -e RESULT_FILENAME \
+    -e ROCR_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
+    --entrypoint=/bin/bash \
+    "$IMAGE" \
+    -lc "pip install -q 'lm-eval[api]' && bash /workspace/benchmarks/gsm8k_${FRAMEWORK}_docker.sh"
+
+set +x
+
+echo "GSM8k evaluation completed successfully"