diff --git a/.github/scripts/monitor_slurm_job.sh b/.github/scripts/monitor_slurm_job.sh index ba7587ec70..1142e97057 100755 --- a/.github/scripts/monitor_slurm_job.sh +++ b/.github/scripts/monitor_slurm_job.sh @@ -9,11 +9,17 @@ cleanup() { if [ -n "${tail_pid:-}" ]; then kill "${tail_pid}" 2>/dev/null || true fi - # Cancel the SLURM job if the monitor is exiting due to an error - # (e.g., the CI runner is being killed). Don't cancel on success. + # Cancel the SLURM job only if it is still active in the scheduler. + # If the job already left the queue (squeue returns empty), it has finished + # and run_monitored_slurm_job.sh will recover via sacct — don't cancel it. if [ "${monitor_success:-0}" -ne 1 ] && [ -n "${job_id:-}" ]; then - echo "Monitor exiting abnormally — cancelling SLURM job $job_id" - scancel "$job_id" 2>/dev/null || true + active_state=$(squeue -j "$job_id" -h -o '%T' 2>/dev/null | head -n1 | tr -d ' ' || echo "") + if [ -n "$active_state" ]; then + echo "Monitor exiting abnormally — cancelling SLURM job $job_id (state: $active_state)" + scancel "$job_id" 2>/dev/null || true + else + echo "Monitor exiting abnormally — SLURM job $job_id already left queue, not cancelling" + fi fi } trap cleanup EXIT @@ -56,9 +62,11 @@ get_job_state() { } # Check if a state is terminal (job is done, for better or worse) +# PREEMPTED is intentionally excluded: with --requeue the job restarts under +# the same job ID and we must keep monitoring rather than exiting early. is_terminal_state() { case "$1" in - COMPLETED|FAILED|CANCELLED|CANCELLED+|TIMEOUT|OUT_OF_MEMORY|NODE_FAIL|BOOT_FAIL|DEADLINE|PREEMPTED|REVOKED) + COMPLETED|FAILED|CANCELLED|CANCELLED+|TIMEOUT|OUT_OF_MEMORY|NODE_FAIL|BOOT_FAIL|DEADLINE|REVOKED) return 0 ;; *) return 1 ;; @@ -74,7 +82,7 @@ while [ ! 
-f "$output_file" ]; do state=$(get_job_state "$job_id") case "$state" in - PENDING|CONFIGURING) + PENDING|CONFIGURING|PREEMPTED) unknown_count=0 sleep 5 ;; diff --git a/.github/scripts/prebuild-case-optimization.sh b/.github/scripts/prebuild-case-optimization.sh index 87f26fdb5f..130f523c07 100755 --- a/.github/scripts/prebuild-case-optimization.sh +++ b/.github/scripts/prebuild-case-optimization.sh @@ -21,6 +21,8 @@ case "$cluster" in *) echo "ERROR: Unknown cluster '$cluster'"; exit 1 ;; esac +rm -rf build + . ./mfc.sh load -c "$flag" -m g source .github/scripts/gpu-opts.sh diff --git a/.github/scripts/retry-build.sh b/.github/scripts/retry-build.sh index b82a2e5d8d..38ac08b217 100755 --- a/.github/scripts/retry-build.sh +++ b/.github/scripts/retry-build.sh @@ -1,30 +1,13 @@ #!/bin/bash -# Provides retry_build(): 3-attempt loop with configurable cleanup. -# Set RETRY_CLEAN_CMD to override cleanup (default: rm -rf build/staging build/install build/lock.yaml). +# Provides retry_build(): 2-attempt loop. +# On failure of attempt 1, nukes the entire build directory before attempt 2. # Set RETRY_VALIDATE_CMD to run a post-build validation; failure triggers a retry. # Usage: source .github/scripts/retry-build.sh # retry_build ./mfc.sh build -j 8 --gpu acc -# Try normal cleanup; if it fails, escalate to cache nuke. -_retry_clean() { - local clean_cmd="$1" - if eval "$clean_cmd" 2>/dev/null; then - return 0 - fi - echo " Normal cleanup failed." - if type _cache_nuke > /dev/null 2>&1; then - echo " Escalating to NFS cache nuke..." - _cache_nuke - else - echo " _cache_nuke not available, best-effort rm." 
- rm -rf build/staging build/install build/lock.yaml 2>/dev/null || true - fi -} - retry_build() { - local clean_cmd="${RETRY_CLEAN_CMD:-rm -rf build/staging build/install build/lock.yaml}" local validate_cmd="${RETRY_VALIDATE_CMD:-}" - local max_attempts=3 + local max_attempts=2 local attempt=1 while [ $attempt -le $max_attempts ]; do echo "Build attempt $attempt of $max_attempts..." @@ -33,8 +16,8 @@ retry_build() { if ! eval "$validate_cmd"; then echo "Post-build validation failed on attempt $attempt." if [ $attempt -lt $max_attempts ]; then - echo "Cleaning and retrying in 5s..." - _retry_clean "$clean_cmd" + echo " Nuking build directory before retry..." + rm -rf build 2>/dev/null || true sleep 5 attempt=$((attempt + 1)) continue @@ -48,8 +31,8 @@ retry_build() { return 0 fi if [ $attempt -lt $max_attempts ]; then - echo "Build failed on attempt $attempt. Retrying in 30s..." - _retry_clean "$clean_cmd" + echo " Build failed — nuking build directory before retry..." + rm -rf build 2>/dev/null || true sleep 30 else echo "Build failed after $max_attempts attempts." diff --git a/.github/scripts/run_monitored_slurm_job.sh b/.github/scripts/run_monitored_slurm_job.sh index 905520c45e..6fb9e254ec 100644 --- a/.github/scripts/run_monitored_slurm_job.sh +++ b/.github/scripts/run_monitored_slurm_job.sh @@ -25,8 +25,10 @@ if [ "$monitor_exit" -ne 0 ]; then echo "Monitor exited with code $monitor_exit; re-checking SLURM job $job_id final state..." 
# Give the SLURM epilog time to finalize if the job just finished sleep 30 - final_state=$(sacct -j "$job_id" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 | tr -d ' ' || echo "UNKNOWN") - final_exit=$(sacct -j "$job_id" --format=ExitCode --noheader --parsable2 2>/dev/null | head -n1 | tr -d ' ' || echo "") + final_state=$(sacct -j "$job_id" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 | tr -d ' ' || true) + final_state="${final_state:-UNKNOWN}" + final_exit=$(sacct -j "$job_id" -X --format=ExitCode --noheader --parsable2 2>/dev/null | head -n1 | tr -d ' ' || true) + final_exit="${final_exit:-}" echo "Final SLURM state=$final_state exit=$final_exit" if [ "$final_state" = "COMPLETED" ] && [ "$final_exit" = "0:0" ]; then echo "SLURM job $job_id completed successfully despite monitor failure — continuing." diff --git a/.github/scripts/run_parallel_benchmarks.sh b/.github/scripts/run_parallel_benchmarks.sh index be9b5c5a94..8c562b911e 100755 --- a/.github/scripts/run_parallel_benchmarks.sh +++ b/.github/scripts/run_parallel_benchmarks.sh @@ -20,6 +20,31 @@ echo "==========================================" echo "Starting parallel benchmark jobs..." echo "==========================================" +# For Phoenix GPU benchmarks, select a consistent GPU partition before launching +# both parallel jobs so PR and master always land on the same GPU type. +if [ "$device" = "gpu" ] && [ "$cluster" = "phoenix" ]; then + echo "Selecting Phoenix GPU partition for benchmark consistency..." + # Prefer older/smaller partitions first (rtx6000, l40s, v100) to leave + # large modern nodes (h200, h100, a100) free for production workloads. + # rtx6000 has the most nodes and gives the most consistent baselines. + BENCH_GPU_PARTITION="" + for part in gpu-rtx6000 gpu-l40s gpu-v100 gpu-h200 gpu-h100 gpu-a100; do + # || true: grep -c exits 1 on zero matches (or when sinfo returns no output + # for an unknown partition); suppress so set -euo pipefail doesn't abort. 
+ idle=$(sinfo -p "$part" --noheader -o "%t" 2>/dev/null | grep -cE "^(idle|mix)" || true) + if [ "${idle:-0}" -gt 0 ]; then + BENCH_GPU_PARTITION="$part" + echo "Selected GPU partition: $BENCH_GPU_PARTITION ($idle idle/mix nodes)" + break + fi + done + if [ -z "$BENCH_GPU_PARTITION" ]; then + echo "WARNING: No idle GPU partition found; falling back to gpu-rtx6000 (may queue)" + BENCH_GPU_PARTITION="gpu-rtx6000" + fi + export BENCH_GPU_PARTITION +fi + # Run both jobs with monitoring using dedicated script from PR # Use stdbuf for line-buffered output and prefix each line for clarity (set -o pipefail; stdbuf -oL -eL bash "${SCRIPT_DIR}/submit_and_monitor_bench.sh" pr "$device" "$interface" "$cluster" 2>&1 | while IFS= read -r line; do echo "[PR] $line"; done) & @@ -40,6 +65,8 @@ wait "$pr_pid" pr_exit=$? if [ "$pr_exit" -ne 0 ]; then echo "PR job exited with code: $pr_exit" + echo "Last 50 lines of PR job log:" + tail -n 50 "pr/bench-${device}-${interface}.out" 2>/dev/null || echo " Could not read PR log" else echo "PR job completed successfully" fi @@ -48,6 +75,8 @@ wait "$master_pid" master_exit=$? if [ "$master_exit" -ne 0 ]; then echo "Master job exited with code: $master_exit" + echo "Last 50 lines of master job log:" + tail -n 50 "master/bench-${device}-${interface}.out" 2>/dev/null || echo " Could not read master log" else echo "Master job completed successfully" fi diff --git a/.github/scripts/setup-build-cache.sh b/.github/scripts/setup-build-cache.sh deleted file mode 100755 index 7e47175f6e..0000000000 --- a/.github/scripts/setup-build-cache.sh +++ /dev/null @@ -1,101 +0,0 @@ -#!/bin/bash -# Sets up a persistent build cache for self-hosted CI runners. -# Creates a symlink: ./build -> //build -# -# Each runner gets its own cache keyed by (cluster, device, interface, runner). -# This avoids cross-runner path issues entirely — CMake's absolute paths are -# always correct because the same runner always uses the same workspace path. 
-# -# Usage: source .github/scripts/setup-build-cache.sh - -_cache_cluster="${1:?Usage: setup-build-cache.sh }" -_cache_device="${2:?}" -_cache_interface="${3:-none}" -_cache_runner="${RUNNER_NAME:?RUNNER_NAME not set}" - -# Select cache root based on cluster (each HPC system has its own persistent storage). -case "$_cache_cluster" in - phoenix) - _cache_root="/storage/coda1/d-coc/0/sbryngelson3/.mfc-ci-cache" ;; - frontier|frontier_amd) - _cache_root="/lustre/orion/cfd154/scratch/sbryngelson/.mfc-ci-cache" ;; - *) - echo "=== Build Cache Setup ===" - echo " No cache root configured for cluster '$_cache_cluster' — skipping." - echo "=========================" - return 0 2>/dev/null || exit 0 ;; -esac - -_cache_key="${_cache_cluster}-${_cache_device}-${_cache_interface}-${_cache_runner}" -_cache_base="${_cache_root}/${_cache_key}/build" - -# Check if the cache directory is healthy (readable, writable, no stale handles). -_cache_healthy() { - local dir="$1" - if ! ls "$dir" > /dev/null 2>&1; then - echo " Health check FAILED: cannot list $dir" - return 1 - fi - if [ -e "$dir/lock.yaml" ] && ! stat "$dir/lock.yaml" > /dev/null 2>&1; then - echo " Health check FAILED: cannot stat $dir/lock.yaml" - return 1 - fi - local probe="$dir/.nfs_probe.$$" - if ! touch "$probe" 2>/dev/null || ! rm -f "$probe" 2>/dev/null; then - echo " Health check FAILED: cannot write/remove probe in $dir" - rm -f "$probe" 2>/dev/null - return 1 - fi - return 0 -} - -# Nuclear recovery: rename stale cache out of the way and create a fresh one. -# Uses mv (operates on parent directory entry) which works even when children -# have stale file handles that prevent rm -rf from succeeding. 
-_cache_nuke() { - local base="${1:-$_cache_base}" - local stale_name="${base}.stale.$(date +%s)" - echo " NFS cache nuke: parking stale dir -> $stale_name" - if mv "$base" "$stale_name" 2>/dev/null; then - echo " NFS cache nuke: renamed successfully" - else - echo " NFS cache nuke: mv failed, trying rm -rf as fallback" - rm -rf "$base" 2>/dev/null || true - fi - mkdir -p "$base" - echo " NFS cache nuke: fresh cache created at $base" -} - -mkdir -p "$_cache_base" -_cache_dir="$(cd "$_cache_base" && pwd -P)" - -echo "=== Build Cache Setup ===" -echo " Cache key: $_cache_key" -echo " Cache dir: $_cache_dir" - -# Pre-flight: detect stale NFS handles before wasting a build attempt. -if ! _cache_healthy "$_cache_dir"; then - echo " Stale NFS cache detected — nuking and recreating." - _cache_nuke "$_cache_base" - _cache_dir="$(cd "$_cache_base" && pwd -P)" -fi - -# Replace any existing build/ (real dir or stale symlink) with a symlink -# to our runner-specific cache directory. -# Use unlink for symlinks to avoid rm -rf following the link and deleting -# the shared cache contents (which another runner may be using). -if [ -L "build" ]; then - unlink "build" -elif [ -e "build" ]; then - rm -rf "build" -fi - -ln -s "$_cache_dir" "build" - -echo " Symlink: build -> $_cache_dir" - -# Garbage-collect stale cache dirs parked by _cache_nuke more than 7 days ago. -_cache_parent="$(dirname "$_cache_base")" -find "$_cache_parent" -maxdepth 1 -name "*.stale.*" -mtime +7 -exec rm -rf {} + 2>/dev/null || true - -echo "=========================" diff --git a/.github/scripts/submit_and_monitor_bench.sh b/.github/scripts/submit_and_monitor_bench.sh index c081c8692a..e0a6eb7384 100755 --- a/.github/scripts/submit_and_monitor_bench.sh +++ b/.github/scripts/submit_and_monitor_bench.sh @@ -14,12 +14,18 @@ device="$2" interface="$3" cluster="$4" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + echo "[$dir] Submitting benchmark for $device-$interface on $cluster..." 
cd "$dir" -# Submit and monitor job (submit.sh auto-detects bench mode from script name) -bash .github/workflows/$cluster/submit.sh \ - .github/workflows/$cluster/bench.sh "$device" "$interface" +# Always use the PR's submit.sh so both master and PR builds benefit from the +# run_monitored_slurm_job.sh SIGKILL recovery wrapper. The bench script is +# still resolved relative to the current directory (master/ or pr/) so the +# correct branch code is benchmarked. SLURM_SUBMIT_DIR ensures the job runs +# in the right directory regardless of which submit.sh is invoked. +PR_SUBMIT="${SCRIPT_DIR}/../workflows/${cluster}/submit.sh" +bash "$PR_SUBMIT" .github/workflows/$cluster/bench.sh "$device" "$interface" # Verify the YAML output file was created job_slug="bench-$device-$interface" diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index b45fc45e40..8a1c848493 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -85,6 +85,7 @@ jobs: device: gpu interface: omp build_script: "bash .github/workflows/frontier_amd/build.sh gpu omp bench" + continue-on-error: ${{ matrix.cluster == 'frontier' || matrix.cluster == 'frontier_amd' }} runs-on: group: ${{ matrix.group }} labels: ${{ matrix.labels }} @@ -106,7 +107,7 @@ jobs: if: matrix.build_script != '' uses: nick-fields/retry@v3 with: - max_attempts: 3 + max_attempts: 2 retry_wait_seconds: 60 timeout_minutes: 150 command: | @@ -118,13 +119,20 @@ jobs: wait $pid2; e2=$? [ $e1 -eq 0 ] && [ $e2 -eq 0 ] on_retry_command: | - (cd pr && ./mfc.sh clean) & - (cd master && ./mfc.sh clean) & - wait + rm -rf pr/build master/build - name: Bench (Master v. PR) run: bash pr/.github/scripts/run_parallel_benchmarks.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }} + - name: Cancel SLURM Jobs + if: cancelled() + run: | + find . 
-name "*.slurm_job_id" | while read -r f; do + job_id=$(cat "$f") + echo "Cancelling SLURM job $job_id" + scancel "$job_id" 2>/dev/null || true + done + - name: Generate & Post Comment if: always() run: | @@ -137,6 +145,29 @@ jobs: cat pr/bench-${{ matrix.device }}-${{ matrix.interface }}.* 2>/dev/null || true cat master/bench-${{ matrix.device }}-${{ matrix.interface }}.* 2>/dev/null || true + - name: Print Per-Case Logs + if: always() + run: | + passed=() failed=() + for out in pr/build/benchmarks/*/*.out master/build/benchmarks/*/*.out; do + [ -f "$out" ] || continue + [ -f "${out%.out}.yaml" ] && passed+=("$out") || failed+=("$out") + done + + echo "=== Per-Case Summary: ${#failed[@]} failed, ${#passed[@]} passed ===" + for out in "${failed[@]}"; do echo " [FAILED] $out"; done + for out in "${passed[@]}"; do echo " [PASSED] $out"; done + + if [ ${#failed[@]} -gt 0 ]; then + echo "" + echo "=== Failed Case Logs ===" + for out in "${failed[@]}"; do + echo "--- $out ---" + cat "$out" + echo "" + done + fi + # All other runners (non-Phoenix) just run without special env - name: Archive Logs (Frontier) if: always() && matrix.cluster != 'phoenix' diff --git a/.github/workflows/frontier/bench.sh b/.github/workflows/frontier/bench.sh index b60f8541a2..b896feb17c 100644 --- a/.github/workflows/frontier/bench.sh +++ b/.github/workflows/frontier/bench.sh @@ -2,8 +2,11 @@ source .github/scripts/bench-preamble.sh +# Cap parallel jobs at 64 to avoid overwhelming MPI daemons on large nodes. +n_jobs=$(( $(nproc) > 64 ? 
64 : $(nproc) )) + if [ "$job_device" = "gpu" ]; then ./mfc.sh bench --mem 4 -j $n_ranks -o "$job_slug.yaml" -- -c $job_cluster $device_opts -n $n_ranks else - ./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c $job_cluster $device_opts -n $n_ranks + ./mfc.sh bench --mem 1 -j $n_jobs -o "$job_slug.yaml" -- -c $job_cluster $device_opts -n $n_ranks fi diff --git a/.github/workflows/phoenix/bench.sh b/.github/workflows/phoenix/bench.sh index 10a38d0eea..abaf76f33d 100644 --- a/.github/workflows/phoenix/bench.sh +++ b/.github/workflows/phoenix/bench.sh @@ -2,6 +2,10 @@ source .github/scripts/bench-preamble.sh +# Cap parallel jobs at 64 to avoid overwhelming MPI daemons on large nodes +# (GNR nodes have 192 cores but nproc is too aggressive for build/bench). +n_jobs=$(( $(nproc) > 64 ? 64 : $(nproc) )) + tmpbuild=/storage/project/r-sbryngelson3-0/sbryngelson3/mytmp_build currentdir=$tmpbuild/run-$(( RANDOM % 900 )) mkdir -p $tmpbuild @@ -18,9 +22,9 @@ fi rm -rf build source .github/scripts/retry-build.sh -RETRY_CLEAN_CMD="./mfc.sh clean" retry_build ./mfc.sh build -j $(nproc) $build_opts || exit 1 +retry_build ./mfc.sh build -j $n_jobs $build_opts || exit 1 -./mfc.sh bench $bench_opts -j $(nproc) -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks +./mfc.sh bench $bench_opts -j $n_jobs -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks sleep 10 rm -rf "$currentdir" || true diff --git a/.github/workflows/phoenix/submit-job.sh b/.github/workflows/phoenix/submit-job.sh new file mode 100755 index 0000000000..caa6bd2175 --- /dev/null +++ b/.github/workflows/phoenix/submit-job.sh @@ -0,0 +1,123 @@ +#!/bin/bash +# Submit a SLURM job without waiting for it to complete. +# Writes the job ID to ${job_slug}.slurm_job_id so a separate monitor step can wait. +# Safe to rerun: a job already recorded for this slug is cancelled if still active, then a fresh job is submitted. 
+# +# Usage: submit-job.sh [script.sh] [cpu|gpu] [none|acc|omp] + +set -euo pipefail + +# Ignore SIGHUP to survive login node session drops +trap '' HUP + +usage() { + echo "Usage: $0 [script.sh] [cpu|gpu] [none|acc|omp]" +} + +if [ "$#" -lt 3 ] || [ -z "$1" ]; then # all three args are expanded below; under set -u a missing one would abort before usage is shown + usage + exit 1 +fi + +sbatch_script_contents=$(cat "$1") + +# Detect job type from submitted script basename +script_basename="$(basename "$1" .sh)" +case "$script_basename" in + bench*) job_type="bench" ;; + *) job_type="test" ;; +esac + +sbatch_cpu_opts="\ +#SBATCH -p cpu-small # partition +#SBATCH --ntasks-per-node=24 # Number of cores per node required +#SBATCH --mem-per-cpu=2G # Memory per core\ +" + +if [ "$job_type" = "bench" ]; then + bench_partition="${BENCH_GPU_PARTITION:-gpu-rtx6000}" + echo "Submitting bench GPU job to partition: $bench_partition (BENCH_GPU_PARTITION=${BENCH_GPU_PARTITION:-})" + sbatch_gpu_opts="\ +#SBATCH -p $bench_partition +#SBATCH --ntasks-per-node=4 # Number of cores per node required +#SBATCH -G2\ +" + sbatch_time="#SBATCH -t 04:00:00" +else + sbatch_gpu_opts="\ +#SBATCH -p gpu-v100,gpu-a100,gpu-h100,gpu-l40s,gpu-h200 +#SBATCH --ntasks-per-node=4 # Number of cores per node required +#SBATCH -G2\ +" + sbatch_time="#SBATCH -t 03:00:00" +fi + +if [ "$2" = "cpu" ]; then + sbatch_device_opts="$sbatch_cpu_opts" +elif [ "$2" = "gpu" ]; then + sbatch_device_opts="$sbatch_gpu_opts" +else + usage + exit 1 +fi + +job_slug="$(basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g')-$2-$3" +output_file="$job_slug.out" +id_file="${job_slug}.slurm_job_id" + +# On rerun, cancel any existing job for this slug and submit a fresh one. +# If the job is still live (RUNNING/PENDING), scancel it first as a safety net +# in case the "Cancel SLURM Jobs" step did not fire (e.g. runner was SIGKILL'd). 
+if [ -f "$id_file" ]; then + existing_id=$(cat "$id_file") + state=$(sacct -j "$existing_id" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 | tr -d ' ' || true) + case "${state:-UNKNOWN}" in + RUNNING|PENDING|REQUEUED|COMPLETING) + echo "Cancelling stale SLURM job $existing_id (state=$state) before resubmission" + scancel "$existing_id" 2>/dev/null || true + ;; + *) + echo "Stale job $existing_id (state=${state:-UNKNOWN}) — submitting fresh" + ;; + esac + rm -f "$id_file" +fi + +submit_output=$(sbatch < "$id_file" +echo "Job ID written to $id_file" diff --git a/.github/workflows/phoenix/submit.sh b/.github/workflows/phoenix/submit.sh index 786489d1c4..0c009bd001 100755 --- a/.github/workflows/phoenix/submit.sh +++ b/.github/workflows/phoenix/submit.sh @@ -1,6 +1,10 @@ #!/bin/bash +# Submit a SLURM job and wait for it to complete. +# Delegates submission (with idempotency) to submit-job.sh, then monitors. +# +# Usage: submit.sh [script.sh] [cpu|gpu] [none|acc|omp] -set -e +set -euo pipefail # Ignore SIGHUP to survive login node session drops trap '' HUP @@ -9,90 +13,22 @@ usage() { echo "Usage: $0 [script.sh] [cpu|gpu] [none|acc|omp]" } -if [ ! 
-z "$1" ]; then - sbatch_script_contents=`cat $1` -else +if [ -z "${1:-}" ]; then usage exit 1 fi -# Detect job type from submitted script basename -script_basename="$(basename "$1" .sh)" -case "$script_basename" in - bench*) job_type="bench" ;; - *) job_type="test" ;; -esac - -sbatch_cpu_opts="\ -#SBATCH -p cpu-small # partition -#SBATCH --ntasks-per-node=24 # Number of cores per node required -#SBATCH --mem-per-cpu=2G # Memory per core\ -" - -if [ "$job_type" = "bench" ]; then - sbatch_gpu_opts="\ -#SBATCH -CL40S -#SBATCH --ntasks-per-node=4 # Number of cores per node required -#SBATCH -G2\ -" - sbatch_time="#SBATCH -t 04:00:00" -else - sbatch_gpu_opts="\ -#SBATCH -p gpu-v100,gpu-a100,gpu-h100,gpu-l40s -#SBATCH --ntasks-per-node=4 # Number of cores per node required -#SBATCH -G2\ -" - sbatch_time="#SBATCH -t 03:00:00" -fi +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -if [ "$2" = "cpu" ]; then - sbatch_device_opts="$sbatch_cpu_opts" -elif [ "$2" = "gpu" ]; then - sbatch_device_opts="$sbatch_gpu_opts" -else - usage - exit 1 -fi +# Submit (submit-job.sh cancels any still-active job recorded for this slug, then submits a fresh one) +bash "$SCRIPT_DIR/submit-job.sh" "$@" -job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2-$3" +# Derive the same job slug and file paths as submit-job.sh. +# NOTE: this sed pipeline must stay identical to the one in submit-job.sh — +# if they diverge the id-file will not be found and the monitor will fail. 
+job_slug="$(basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g')-$2-$3" output_file="$job_slug.out" +id_file="${job_slug}.slurm_job_id" -submit_output=$(sbatch < /tmp/env_after diff /tmp/env_before /tmp/env_after | grep '^>' | sed 's/^> //' >> $GITHUB_ENV - - name: Set up Python 3.14 - uses: actions/setup-python@v5 - with: - python-version: '3.14' - - name: Build run: | - /bin/bash mfc.sh test -v --dry-run -j $(nproc) --${{ matrix.debug }} --${{ matrix.mpi }} --${{ matrix.precision }} $TEST_ALL + /bin/bash mfc.sh test -v --dry-run -j $(nproc) --${{ matrix.debug }} --${{ matrix.mpi }} $PRECISION $TEST_ALL env: TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }} + PRECISION: ${{ matrix.precision != '' && format('--{0}', matrix.precision) || '' }} - name: Test run: | @@ -159,7 +155,10 @@ jobs: name: "${{ matrix.cluster_name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }}${{ matrix.shard != '' && format(' [{0}]', matrix.shard) || '' }})" if: github.repository == 'MFlowCode/MFC' && needs.file-changes.outputs.checkall == 'true' && github.event.pull_request.draft != true needs: [lint-gate, file-changes] - continue-on-error: false + # Frontier CCE compiler is periodically broken by toolchain updates (e.g. + # cpe/25.03 introduced an IPA SIGSEGV in CCE 19.0.0). Allow Frontier to + # fail without blocking PR merges; Phoenix remains a hard gate. + continue-on-error: ${{ matrix.runner == 'frontier' }} timeout-minutes: 480 strategy: matrix: @@ -237,7 +236,9 @@ jobs: - name: Clone uses: actions/checkout@v4 with: - clean: true + # clean: false preserves .slurm_job_id files across reruns so + # submit-job.sh can detect and cancel stale SLURM jobs on retry. 
+ clean: false - name: Build if: matrix.cluster != 'phoenix' @@ -249,7 +250,18 @@ jobs: command: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }} on_retry_command: rm -rf build + - name: Submit SLURM Test Job + if: matrix.cluster == 'phoenix' + run: bash .github/workflows/phoenix/submit-job.sh .github/workflows/phoenix/test.sh ${{ matrix.device }} ${{ matrix.interface }} + + - name: Monitor SLURM Test Job + if: matrix.cluster == 'phoenix' + run: | + slug="test-${{ matrix.device }}-${{ matrix.interface }}" + bash .github/scripts/run_monitored_slurm_job.sh "$(cat ${slug}.slurm_job_id)" "${slug}.out" + - name: Test + if: matrix.cluster != 'phoenix' run: bash .github/workflows/${{ matrix.cluster }}/submit.sh .github/workflows/${{ matrix.cluster }}/test.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.shard }} - name: Cancel SLURM Jobs @@ -287,7 +299,8 @@ jobs: name: "Case Opt | ${{ matrix.cluster_name }} (${{ matrix.device }}-${{ matrix.interface }})" if: github.repository == 'MFlowCode/MFC' && needs.file-changes.outputs.checkall == 'true' && github.event.pull_request.draft != true needs: [lint-gate, file-changes] - continue-on-error: false + # Frontier is non-blocking for the same reason as the self job above. 
+ continue-on-error: ${{ matrix.runner == 'frontier' }} timeout-minutes: 480 strategy: matrix: @@ -324,7 +337,7 @@ jobs: - name: Clone uses: actions/checkout@v4 with: - clean: true + clean: false - name: Pre-Build (SLURM) if: matrix.cluster == 'phoenix' @@ -334,7 +347,18 @@ jobs: if: matrix.cluster != 'phoenix' run: bash .github/scripts/prebuild-case-optimization.sh ${{ matrix.cluster }} ${{ matrix.device }} ${{ matrix.interface }} + - name: Submit Case-Optimization Tests + if: matrix.cluster == 'phoenix' + run: bash .github/workflows/phoenix/submit-job.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }} + + - name: Monitor Case-Optimization Tests + if: matrix.cluster == 'phoenix' + run: | + slug="run-case-optimization-${{ matrix.device }}-${{ matrix.interface }}" + bash .github/scripts/run_monitored_slurm_job.sh "$(cat ${slug}.slurm_job_id)" "${slug}.out" + - name: Run Case-Optimization Tests + if: matrix.cluster != 'phoenix' run: bash .github/workflows/${{ matrix.cluster }}/submit.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }} - name: Cancel SLURM Jobs diff --git a/benchmarks/5eq_rk3_weno3_hllc/case.py b/benchmarks/5eq_rk3_weno3_hllc/case.py index 5ecc327e8f..fa09426ffe 100644 --- a/benchmarks/5eq_rk3_weno3_hllc/case.py +++ b/benchmarks/5eq_rk3_weno3_hllc/case.py @@ -191,8 +191,8 @@ "cyl_coord": "F", "dt": dt, "t_step_start": 0, - "t_step_stop": ARGS["steps"] if ARGS["steps"] is not None else int(7 * (5 * size + 5)), - "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(7 * (5 * size + 5)), + "t_step_stop": ARGS["steps"] if ARGS["steps"] is not None else int(2 * (5 * size + 5)), + "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(2 * (5 * size + 5)), # Simulation Algorithm Parameters "num_patches": 3, "model_eqns": 2, diff --git a/benchmarks/hypo_hll/case.py b/benchmarks/hypo_hll/case.py index 1663a507aa..f8d0928a01 100644 --- 
a/benchmarks/hypo_hll/case.py +++ b/benchmarks/hypo_hll/case.py @@ -44,8 +44,8 @@ "p": Nz, "dt": 1e-8, "t_step_start": 0, - "t_step_stop": ARGS["steps"] if ARGS["steps"] is not None else int(7 * (5 * size + 5)), - "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(7 * (5 * size + 5)), + "t_step_stop": ARGS["steps"] if ARGS["steps"] is not None else int(2 * (5 * size + 5)), + "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(2 * (5 * size + 5)), # Simulation Algorithm Parameters "num_patches": 2, "model_eqns": 2, diff --git a/benchmarks/ibm/case.py b/benchmarks/ibm/case.py index e16cb620b7..303cf7fcaf 100644 --- a/benchmarks/ibm/case.py +++ b/benchmarks/ibm/case.py @@ -48,8 +48,8 @@ "p": Nz, "dt": mydt, "t_step_start": 0, - "t_step_stop": ARGS["steps"] if ARGS["steps"] is not None else int(7 * (5 * size + 5)), - "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(7 * (5 * size + 5)), + "t_step_stop": ARGS["steps"] if ARGS["steps"] is not None else int(2 * (5 * size + 5)), + "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(2 * (5 * size + 5)), # Simulation Algorithm Parameters "num_patches": 1, "model_eqns": 2, diff --git a/benchmarks/igr/case.py b/benchmarks/igr/case.py index 469bff1fa9..4ceed76257 100644 --- a/benchmarks/igr/case.py +++ b/benchmarks/igr/case.py @@ -63,8 +63,8 @@ "cyl_coord": "F", "dt": dt, "t_step_start": 0, - "t_step_stop": ARGS["steps"] if ARGS["steps"] is not None else int(7 * (5 * size + 5)), - "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(7 * (5 * size + 5)), + "t_step_stop": ARGS["steps"] if ARGS["steps"] is not None else int(2 * (5 * size + 5)), + "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(2 * (5 * size + 5)), # Simulation Algorithm Parameters "num_patches": 1, "model_eqns": 2, diff --git a/benchmarks/viscous_weno5_sgb_acoustic/case.py b/benchmarks/viscous_weno5_sgb_acoustic/case.py index 9f1351b0c1..83bdc43e9c 100644 --- 
a/benchmarks/viscous_weno5_sgb_acoustic/case.py +++ b/benchmarks/viscous_weno5_sgb_acoustic/case.py @@ -94,8 +94,8 @@ "p": Nz, "dt": dt, "t_step_start": 0, - "t_step_stop": ARGS["steps"] if ARGS["steps"] is not None else int(6 * (5 * size + 5)), - "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(6 * (5 * size + 5)), + "t_step_stop": ARGS["steps"] if ARGS["steps"] is not None else int(2 * (5 * size + 5)), + "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(2 * (5 * size + 5)), # Simulation Algorithm Parameters "num_patches": 2, "model_eqns": 2, diff --git a/toolchain/mfc/bench.py b/toolchain/mfc/bench.py index 74f7469482..58b90e965b 100644 --- a/toolchain/mfc/bench.py +++ b/toolchain/mfc/bench.py @@ -228,8 +228,7 @@ def diff(): grind_time_value = lhs_summary[target.name]["grind"] / rhs_summary[target.name]["grind"] speedups[i] += f" & Grind: {grind_time_value:.2f}" if grind_time_value < 0.95: - cons.print(f"[bold red]Error[/bold red]: Benchmarking failed since grind time speedup for {target.name} below acceptable threshold (<0.95) - Case: {slug}") - err = 1 + cons.print(f"[bold yellow]Warning[/bold yellow]: Grind time speedup for {target.name} below threshold (<0.95) - Case: {slug}") except Exception as e: cons.print( f"[bold red]ERROR[/bold red]: Failed to compute speedup for {target.name} in {slug}: {e}\n" diff --git a/toolchain/mfc/build.py b/toolchain/mfc/build.py index 6430f7ad35..08ff6d7510 100644 --- a/toolchain/mfc/build.py +++ b/toolchain/mfc/build.py @@ -1,6 +1,7 @@ import os, typing, hashlib, dataclasses, subprocess, re, time, sys, threading, queue from rich.panel import Panel +from rich.text import Text from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn, TimeElapsedColumn, TaskProgressColumn from .case import Case @@ -273,14 +274,14 @@ def _show_build_error(result: subprocess.CompletedProcess, stage: str): stdout_text = result.stdout if isinstance(result.stdout, str) else 
result.stdout.decode('utf-8', errors='replace') stdout_text = stdout_text.strip() if stdout_text: - cons.raw.print(Panel(stdout_text, title="Output", border_style="yellow")) + cons.raw.print(Panel(Text(stdout_text), title="Output", border_style="yellow")) # Show stderr if available if result.stderr: stderr_text = result.stderr if isinstance(result.stderr, str) else result.stderr.decode('utf-8', errors='replace') stderr_text = stderr_text.strip() if stderr_text: - cons.raw.print(Panel(stderr_text, title="Errors", border_style="red")) + cons.raw.print(Panel(Text(stderr_text), title="Errors", border_style="red")) cons.print()