Skip to content

Commit 0a92045

Browse files
authored
Merge branch 'master' into fix/muscl-thinc-overwrite
2 parents 70bd2a4 + 93e3d09 commit 0a92045

File tree

354 files changed

+21334
-12498
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

354 files changed

+21334
-12498
lines changed

.github/file-filter.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ yml: &yml
2525
- '.github/workflows/phoenix/**'
2626
- '.github/workflows/frontier/**'
2727
- '.github/workflows/frontier_amd/**'
28+
- '.github/scripts/**'
2829
- '.github/workflows/bench.yml'
2930
- '.github/workflows/test.yml'
3031
- '.github/workflows/formatting.yml'

.github/scripts/bench-preamble.sh

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
#!/bin/bash
# Shared preamble for benchmark scripts: detects GPUs, sets build/device opts.
# Sets: $gpu_opts, $build_opts, $device_opts, $n_ranks, $ngpus, $gpu_ids
# Usage: source .github/scripts/bench-preamble.sh

source .github/scripts/detect-gpus.sh
source .github/scripts/gpu-opts.sh

# CPU defaults: fixed rank count, no device flags.
n_ranks=12
build_opts="$gpu_opts"
device_opts=""

# GPU jobs run one rank per detected GPU and forward the device IDs.
if [[ "$job_device" == "gpu" ]]; then
    n_ranks=$ngpus
    device_opts="$gpu_opts -g $gpu_ids"
fi
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
#!/usr/bin/env python3

"""Validate case-optimization output: check D/*.dat for NaN/Inf via the packer."""

import math
import sys
import os

# Exactly one argument is expected: the case directory (or a file inside it).
if len(sys.argv) != 2:
    print(f"Usage: {sys.argv[0]} <case_directory>", file=sys.stderr)
    sys.exit(1)

# Allow importing from the repo root
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..'))

from toolchain.mfc.packer.pack import compile as pack_compile

# Accept a path to a file within the case directory for convenience.
case_dir = sys.argv[1]
if os.path.isfile(case_dir):
    case_dir = os.path.dirname(case_dir)

pack, err = pack_compile(case_dir)
if err is not None:
    print(f"ERROR: {err}")
    sys.exit(1)

if not pack.entries:
    print(f"ERROR: No data found in {case_dir}/D/")
    sys.exit(1)

if pack.has_bad_values():
    print("ERROR: NaN or Inf detected in output:")
    # Report only the first offending value per file to keep logs short.
    for entry_name, entry in pack.entries.items():
        for idx, value in enumerate(entry.doubles):
            if not math.isfinite(value):
                kind = 'NaN' if math.isnan(value) else 'Inf'
                print(f" {kind} at index {idx} in {entry_name}")
                break
    sys.exit(1)

total = sum(len(e.doubles) for e in pack.entries.values())
print(f"OK: {len(pack.entries)} files, {total} values — no NaN/Inf found")

.github/scripts/clean-build.sh

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
#!/bin/bash
# Provides clean_build(): renames build/ aside and deletes it in the background.
# mv is a metadata-only operation that succeeds even with stale NFS file handles,
# unlike rm -rf which fails on ESTALE. The background delete is best-effort and
# scoped to this job's PID to avoid races with concurrent matrix jobs.
#
# Usage: source .github/scripts/clean-build.sh
#        clean_build

clean_build() {
    local stale_dir="build.stale.$$"
    # Sweep up leftovers from previous runs before parking a new one.
    rm -rf build.stale.* 2>/dev/null || true
    # Park build/ under a PID-unique name (mv survives stale NFS handles)...
    mv build "$stale_dir" 2>/dev/null || true
    # ...then reap it asynchronously; disown so it outlives this shell.
    rm -rf "$stale_dir" 2>/dev/null & disown
}

.github/scripts/detect-gpus.sh

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
#!/bin/bash
# Detects GPUs (NVIDIA or AMD), sets $ngpus and $gpu_ids.
# Usage: source .github/scripts/detect-gpus.sh

# Default to a GPU-less host; branches below override on detection.
ngpus=0
gpu_ids=""

if command -v nvidia-smi >/dev/null 2>&1; then
    # One line per GPU from `nvidia-smi -L`; IDs are simply 0..n-1.
    ngpus=$(nvidia-smi -L | wc -l)
    gpu_ids=$(seq -s ' ' 0 $((ngpus - 1)))
elif command -v rocm-smi >/dev/null 2>&1; then
    # Extract numeric device IDs from rocm-smi, then count them.
    gpu_ids=$(rocm-smi --showid | awk '{print $1}' | grep -Eo '[0-9]+' | uniq | tr '\n' ' ')
    ngpus=$(echo "$gpu_ids" | wc -w)
fi

.github/scripts/gpu-opts.sh

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
#!/bin/bash
# Sets $gpu_opts from $job_device and $job_interface.
# Usage: source .github/scripts/gpu-opts.sh

gpu_opts=""
if [ "$job_device" = "gpu" ]; then
    # Pick the offload backend flag from the interface; bare --gpu otherwise.
    case "$job_interface" in
        omp) gpu_opts="--gpu mp"  ;;
        acc) gpu_opts="--gpu acc" ;;
        *)   gpu_opts="--gpu"     ;;
    esac
fi

.github/scripts/monitor_slurm_job.sh

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,17 @@ cleanup() {
99
if [ -n "${tail_pid:-}" ]; then
1010
kill "${tail_pid}" 2>/dev/null || true
1111
fi
12-
# Cancel the SLURM job if the monitor is exiting due to an error
13-
# (e.g., the CI runner is being killed). Don't cancel on success.
12+
# Cancel the SLURM job only if it is still active in the scheduler.
13+
# If the job already left the queue (squeue returns empty), it has finished
14+
# and run_monitored_slurm_job.sh will recover via sacct — don't cancel it.
1415
if [ "${monitor_success:-0}" -ne 1 ] && [ -n "${job_id:-}" ]; then
15-
echo "Monitor exiting abnormally — cancelling SLURM job $job_id"
16-
scancel "$job_id" 2>/dev/null || true
16+
active_state=$(squeue -j "$job_id" -h -o '%T' 2>/dev/null | head -n1 | tr -d ' ' || echo "")
17+
if [ -n "$active_state" ]; then
18+
echo "Monitor exiting abnormally — cancelling SLURM job $job_id (state: $active_state)"
19+
scancel "$job_id" 2>/dev/null || true
20+
else
21+
echo "Monitor exiting abnormally — SLURM job $job_id already left queue, not cancelling"
22+
fi
1723
fi
1824
}
1925
trap cleanup EXIT
@@ -46,6 +52,15 @@ get_job_state() {
4652
# Fallback to sacct (works for completed/historical jobs)
4753
if command -v sacct >/dev/null 2>&1; then
4854
state=$(sacct -j "$jid" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 || true)
55+
# When a job is preempted+requeued, sacct -X reports PREEMPTED for the
56+
# original attempt while the requeued run may have completed. Check all
57+
# records (without -X) for a terminal state that supersedes PREEMPTED.
58+
if [ "$state" = "PREEMPTED" ]; then
59+
requeue_state=$(sacct -j "$jid" -n -P -o State 2>/dev/null | grep -v PREEMPTED | head -n1 | cut -d'|' -f1 || true)
60+
if [ -n "$requeue_state" ]; then
61+
state="$requeue_state"
62+
fi
63+
fi
4964
if [ -n "$state" ]; then
5065
echo "$state"
5166
return
@@ -56,9 +71,11 @@ get_job_state() {
5671
}
5772

5873
# Check if a state is terminal (job is done, for better or worse)
74+
# PREEMPTED is intentionally excluded: with --requeue the job restarts under
75+
# the same job ID and we must keep monitoring rather than exiting early.
5976
is_terminal_state() {
6077
case "$1" in
61-
COMPLETED|FAILED|CANCELLED|CANCELLED+|TIMEOUT|OUT_OF_MEMORY|NODE_FAIL|BOOT_FAIL|DEADLINE|PREEMPTED|REVOKED)
78+
COMPLETED|FAILED|CANCELLED|CANCELLED+|TIMEOUT|OUT_OF_MEMORY|NODE_FAIL|BOOT_FAIL|DEADLINE|REVOKED)
6279
return 0 ;;
6380
*)
6481
return 1 ;;
@@ -74,7 +91,7 @@ while [ ! -f "$output_file" ]; do
7491
state=$(get_job_state "$job_id")
7592

7693
case "$state" in
77-
PENDING|CONFIGURING)
94+
PENDING|CONFIGURING|PREEMPTED)
7895
unknown_count=0
7996
sleep 5
8097
;;
.github/scripts/prebuild-case-optimization.sh

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
#!/bin/bash

# Pre-builds all benchmark cases with --case-optimization.
# No GPU hardware needed — compilation only.
# Can run in two modes:
#   1. Direct (Frontier login nodes): pass cluster/device/interface as args
#   2. Inside SLURM (Phoenix): uses $job_device/$job_interface from submit-slurm-job.sh
# Usage: bash prebuild-case-optimization.sh [<cluster> <device> <interface>]

set -e

# Positional args (direct invocation) win over env vars (SLURM path).
cluster="${1:-${job_cluster:-phoenix}}"
job_device="${2:-$job_device}"
job_interface="${3:-$job_interface}"

# Map the cluster name to the module flag understood by `./mfc.sh load`.
case "$cluster" in
    phoenix)      flag="p"    ;;
    frontier)     flag="f"    ;;
    frontier_amd) flag="famd" ;;
    *) echo "ERROR: Unknown cluster '$cluster'"; exit 1 ;;
esac

source .github/scripts/clean-build.sh
clean_build

. ./mfc.sh load -c "$flag" -m g

# This is always a GPU build, so derive flags straight from the interface
# instead of gpu-opts.sh: $job_device may legitimately be "cpu" when the job
# was submitted to a CPU SLURM partition (no GPU hardware needed to compile).
case "$job_interface" in
    acc) gpu_opts="--gpu acc" ;;
    omp) gpu_opts="--gpu mp"  ;;
    *)   echo "ERROR: prebuild requires gpu interface (acc or omp)"; exit 1 ;;
esac

# $gpu_opts is intentionally unquoted: it holds multiple words.
for case_file in benchmarks/*/case.py; do
    echo "=== Pre-building: $case_file ==="
    ./mfc.sh build -i "$case_file" --case-optimization $gpu_opts -j 8
done

.github/scripts/retry-build.sh

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
#!/bin/bash
# Provides retry_build(): 2-attempt loop.
# On failure of attempt 1, nukes the entire build directory before attempt 2.
# If RETRY_VALIDATE_CMD is set, runs it after a successful build; a non-zero
# exit triggers the same nuke-and-retry, catching e.g. SIGILL from binaries
# compiled on a different CPU architecture.
# Usage: source .github/scripts/retry-build.sh
#        retry_build ./mfc.sh build -j 8 --gpu acc
#        RETRY_VALIDATE_CMD='./syscheck' retry_build ./mfc.sh build -j 8

retry_build() {
    local max_attempts=2
    local validate_cmd="${RETRY_VALIDATE_CMD:-}"
    local attempt

    for (( attempt = 1; attempt <= max_attempts; attempt++ )); do
        echo "Build attempt $attempt of $max_attempts..."

        # Build failure: nuke and retry (longer backoff), or give up.
        if ! "$@"; then
            if (( attempt < max_attempts )); then
                echo " Build failed — nuking build directory before retry..."
                rm -rf build 2>/dev/null || true
                sleep 30
                continue
            fi
            echo "Build failed after $max_attempts attempts."
            return 1
        fi

        # Optional post-build validation: same nuke-and-retry on failure.
        if [ -n "$validate_cmd" ] && ! eval "$validate_cmd"; then
            echo "Post-build validation failed on attempt $attempt."
            if (( attempt < max_attempts )); then
                echo " Nuking build directory before retry..."
                rm -rf build 2>/dev/null || true
                sleep 5
                continue
            fi
            echo "Validation still failing after $max_attempts attempts."
            return 1
        fi

        echo "Build succeeded on attempt $attempt."
        return 0
    done
}

.github/scripts/retry-sbatch.sh

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
#!/bin/bash
# Provides retry_sbatch(): submits a job script string via sbatch with retries.
# Only retries on known transient SLURM/infrastructure errors (socket timeouts,
# connection failures). Hard failures (bad account, invalid partition, QOS
# violations) are not retried.
#
# Usage: source .github/scripts/retry-sbatch.sh
#        job_id=$(retry_sbatch "$script_contents")

retry_sbatch() {
    local contents="$1"
    local max_tries=3
    local try output jid last_err=""

    for (( try = 1; try <= max_tries; try++ )); do
        echo "sbatch attempt $try of $max_tries..." >&2
        # Feed the script via stdin; capture stderr so we can classify errors.
        output=$(printf '%s\n' "$contents" | sbatch 2>&1) || true

        # Success is detected by parsing the job ID out of sbatch's reply.
        jid=$(echo "$output" | grep -oE 'Submitted batch job ([0-9]+)' | grep -oE '[0-9]+$')
        if [ -n "$jid" ]; then
            echo "$jid"
            return 0
        fi

        last_err="$output"
        echo "sbatch failed: $output" >&2

        # Bail out immediately on anything that doesn't look transient.
        if ! echo "$output" | grep -qiE "timed out|connection refused|connection reset|temporarily unavailable"; then
            echo "Non-transient sbatch failure — not retrying." >&2
            return 1
        fi

        if (( try < max_tries )); then
            echo "Transient error — retrying in 30s..." >&2
            sleep 30
        fi
    done

    echo "sbatch failed after $max_tries attempts. Last error: $last_err" >&2
    return 1
}

0 commit comments

Comments
 (0)