Skip to content

Commit 0a92045

Browse files
authored
Merge branch 'master' into fix/muscl-thinc-overwrite
2 parents 70bd2a4 + 93e3d09 commit 0a92045

File tree

354 files changed

+21334
-12498
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

354 files changed

+21334
-12498
lines changed

.github/file-filter.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ yml: &yml
2525
- '.github/workflows/phoenix/**'
2626
- '.github/workflows/frontier/**'
2727
- '.github/workflows/frontier_amd/**'
28+
- '.github/scripts/**'
2829
- '.github/workflows/bench.yml'
2930
- '.github/workflows/test.yml'
3031
- '.github/workflows/formatting.yml'

.github/scripts/bench-preamble.sh

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
#!/bin/bash
# Shared preamble for benchmark scripts: detects GPUs, sets build/device opts.
# Sets: $gpu_opts, $build_opts, $device_opts, $n_ranks, $ngpus, $gpu_ids
# Usage: source .github/scripts/bench-preamble.sh

source .github/scripts/detect-gpus.sh
source .github/scripts/gpu-opts.sh

# CPU defaults: fixed rank count, no device flags.
n_ranks=12
build_opts="$gpu_opts"
device_opts=""

# GPU jobs run one rank per detected GPU and forward the device IDs.
if [[ "$job_device" == "gpu" ]]; then
    n_ranks=$ngpus
    device_opts="$gpu_opts -g $gpu_ids"
fi
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
#!/usr/bin/env python3

"""Validate case-optimization output: check D/*.dat for NaN/Inf via the packer."""

import math
import sys
import os

# Exactly one argument is expected: the case directory (or a file inside it).
if len(sys.argv) != 2:
    print(f"Usage: {sys.argv[0]} <case_directory>", file=sys.stderr)
    sys.exit(1)

# Allow importing from the repo root
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..'))

from toolchain.mfc.packer.pack import compile as pack_compile

# Accept a path to a file within the case directory for convenience.
case_dir = sys.argv[1]
if os.path.isfile(case_dir):
    case_dir = os.path.dirname(case_dir)

pack, err = pack_compile(case_dir)
if err is not None:
    print(f"ERROR: {err}")
    sys.exit(1)

if not pack.entries:
    print(f"ERROR: No data found in {case_dir}/D/")
    sys.exit(1)

if pack.has_bad_values():
    print("ERROR: NaN or Inf detected in output:")
    # Report only the first offending value per file to keep logs short.
    for entry_name, entry in pack.entries.items():
        for idx, value in enumerate(entry.doubles):
            if not math.isfinite(value):
                kind = 'NaN' if math.isnan(value) else 'Inf'
                print(f" {kind} at index {idx} in {entry_name}")
                break
    sys.exit(1)

total = sum(len(e.doubles) for e in pack.entries.values())
print(f"OK: {len(pack.entries)} files, {total} values — no NaN/Inf found")

.github/scripts/clean-build.sh

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
#!/bin/bash
# Provides clean_build(): renames build/ aside and deletes it in the background.
# mv is a metadata-only operation that succeeds even with stale NFS file handles,
# unlike rm -rf which fails on ESTALE. The background delete is best-effort and
# scoped to this job's PID to avoid races with concurrent matrix jobs.
#
# Usage: source .github/scripts/clean-build.sh
#        clean_build

clean_build() {
    local stale_dir="build.stale.$$"
    # Sweep up leftovers from previous runs before parking a new one.
    rm -rf build.stale.* 2>/dev/null || true
    # Park build/ under a PID-unique name (mv survives stale NFS handles)...
    mv build "$stale_dir" 2>/dev/null || true
    # ...then reap it asynchronously; disown so it outlives this shell.
    rm -rf "$stale_dir" 2>/dev/null & disown
}

.github/scripts/detect-gpus.sh

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
#!/bin/bash
# Detects GPUs (NVIDIA or AMD), sets $ngpus and $gpu_ids.
# Usage: source .github/scripts/detect-gpus.sh

# Default to a GPU-less host; branches below override on detection.
ngpus=0
gpu_ids=""

if command -v nvidia-smi >/dev/null 2>&1; then
    # One line per GPU from `nvidia-smi -L`; IDs are simply 0..n-1.
    ngpus=$(nvidia-smi -L | wc -l)
    gpu_ids=$(seq -s ' ' 0 $((ngpus - 1)))
elif command -v rocm-smi >/dev/null 2>&1; then
    # Extract numeric device IDs from rocm-smi, then count them.
    gpu_ids=$(rocm-smi --showid | awk '{print $1}' | grep -Eo '[0-9]+' | uniq | tr '\n' ' ')
    ngpus=$(echo "$gpu_ids" | wc -w)
fi

.github/scripts/gpu-opts.sh

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
#!/bin/bash
# Sets $gpu_opts from $job_device and $job_interface.
# Usage: source .github/scripts/gpu-opts.sh

gpu_opts=""
if [ "$job_device" = "gpu" ]; then
    # Pick the offload backend flag from the interface; bare --gpu otherwise.
    case "$job_interface" in
        omp) gpu_opts="--gpu mp"  ;;
        acc) gpu_opts="--gpu acc" ;;
        *)   gpu_opts="--gpu"     ;;
    esac
fi

.github/scripts/monitor_slurm_job.sh

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,17 @@ cleanup() {
99
if [ -n "${tail_pid:-}" ]; then
1010
kill "${tail_pid}" 2>/dev/null || true
1111
fi
12-
# Cancel the SLURM job if the monitor is exiting due to an error
13-
# (e.g., the CI runner is being killed). Don't cancel on success.
12+
# Cancel the SLURM job only if it is still active in the scheduler.
13+
# If the job already left the queue (squeue returns empty), it has finished
14+
# and run_monitored_slurm_job.sh will recover via sacct — don't cancel it.
1415
if [ "${monitor_success:-0}" -ne 1 ] && [ -n "${job_id:-}" ]; then
15-
echo "Monitor exiting abnormally — cancelling SLURM job $job_id"
16-
scancel "$job_id" 2>/dev/null || true
16+
active_state=$(squeue -j "$job_id" -h -o '%T' 2>/dev/null | head -n1 | tr -d ' ' || echo "")
17+
if [ -n "$active_state" ]; then
18+
echo "Monitor exiting abnormally — cancelling SLURM job $job_id (state: $active_state)"
19+
scancel "$job_id" 2>/dev/null || true
20+
else
21+
echo "Monitor exiting abnormally — SLURM job $job_id already left queue, not cancelling"
22+
fi
1723
fi
1824
}
1925
trap cleanup EXIT
@@ -46,6 +52,15 @@ get_job_state() {
4652
# Fallback to sacct (works for completed/historical jobs)
4753
if command -v sacct >/dev/null 2>&1; then
4854
state=$(sacct -j "$jid" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 || true)
55+
# When a job is preempted+requeued, sacct -X reports PREEMPTED for the
56+
# original attempt while the requeued run may have completed. Check all
57+
# records (without -X) for a terminal state that supersedes PREEMPTED.
58+
if [ "$state" = "PREEMPTED" ]; then
59+
requeue_state=$(sacct -j "$jid" -n -P -o State 2>/dev/null | grep -v PREEMPTED | head -n1 | cut -d'|' -f1 || true)
60+
if [ -n "$requeue_state" ]; then
61+
state="$requeue_state"
62+
fi
63+
fi
4964
if [ -n "$state" ]; then
5065
echo "$state"
5166
return
@@ -56,9 +71,11 @@ get_job_state() {
5671
}
5772

5873
# Check if a state is terminal (job is done, for better or worse)
74+
# PREEMPTED is intentionally excluded: with --requeue the job restarts under
75+
# the same job ID and we must keep monitoring rather than exiting early.
5976
is_terminal_state() {
6077
case "$1" in
61-
COMPLETED|FAILED|CANCELLED|CANCELLED+|TIMEOUT|OUT_OF_MEMORY|NODE_FAIL|BOOT_FAIL|DEADLINE|PREEMPTED|REVOKED)
78+
COMPLETED|FAILED|CANCELLED|CANCELLED+|TIMEOUT|OUT_OF_MEMORY|NODE_FAIL|BOOT_FAIL|DEADLINE|REVOKED)
6279
return 0 ;;
6380
*)
6481
return 1 ;;
@@ -74,7 +91,7 @@ while [ ! -f "$output_file" ]; do
7491
state=$(get_job_state "$job_id")
7592

7693
case "$state" in
77-
PENDING|CONFIGURING)
94+
PENDING|CONFIGURING|PREEMPTED)
7895
unknown_count=0
7996
sleep 5
8097
;;
.github/scripts/prebuild-case-optimization.sh

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
#!/bin/bash

# Pre-builds all benchmark cases with --case-optimization.
# No GPU hardware needed — compilation only.
# Can run in two modes:
#   1. Direct (Frontier login nodes): pass cluster/device/interface as args
#   2. Inside SLURM (Phoenix): uses $job_device/$job_interface from submit-slurm-job.sh
# Usage: bash prebuild-case-optimization.sh [<cluster> <device> <interface>]

set -e

# Positional args (direct invocation) win over env vars (SLURM path).
cluster="${1:-${job_cluster:-phoenix}}"
job_device="${2:-$job_device}"
job_interface="${3:-$job_interface}"

# Map the cluster name to the module flag understood by `./mfc.sh load`.
case "$cluster" in
    phoenix)      flag="p"    ;;
    frontier)     flag="f"    ;;
    frontier_amd) flag="famd" ;;
    *) echo "ERROR: Unknown cluster '$cluster'"; exit 1 ;;
esac

source .github/scripts/clean-build.sh
clean_build

. ./mfc.sh load -c "$flag" -m g

# This is always a GPU build, so derive flags straight from the interface
# instead of gpu-opts.sh: $job_device may legitimately be "cpu" when the job
# was submitted to a CPU SLURM partition (no GPU hardware needed to compile).
case "$job_interface" in
    acc) gpu_opts="--gpu acc" ;;
    omp) gpu_opts="--gpu mp"  ;;
    *)   echo "ERROR: prebuild requires gpu interface (acc or omp)"; exit 1 ;;
esac

# $gpu_opts is intentionally unquoted: it holds multiple words.
for case_file in benchmarks/*/case.py; do
    echo "=== Pre-building: $case_file ==="
    ./mfc.sh build -i "$case_file" --case-optimization $gpu_opts -j 8
done

.github/scripts/retry-build.sh

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
#!/bin/bash
# Provides retry_build(): 2-attempt loop.
# On failure of attempt 1, nukes the entire build directory before attempt 2.
# If RETRY_VALIDATE_CMD is set, runs it after a successful build; a non-zero
# exit triggers the same nuke-and-retry, catching e.g. SIGILL from binaries
# compiled on a different CPU architecture.
# Usage: source .github/scripts/retry-build.sh
#        retry_build ./mfc.sh build -j 8 --gpu acc
#        RETRY_VALIDATE_CMD='./syscheck' retry_build ./mfc.sh build -j 8

retry_build() {
    local max_attempts=2
    local validate_cmd="${RETRY_VALIDATE_CMD:-}"
    local attempt

    for (( attempt = 1; attempt <= max_attempts; attempt++ )); do
        echo "Build attempt $attempt of $max_attempts..."

        # Build failure: nuke and retry (longer backoff), or give up.
        if ! "$@"; then
            if (( attempt < max_attempts )); then
                echo " Build failed — nuking build directory before retry..."
                rm -rf build 2>/dev/null || true
                sleep 30
                continue
            fi
            echo "Build failed after $max_attempts attempts."
            return 1
        fi

        # Optional post-build validation: same nuke-and-retry on failure.
        if [ -n "$validate_cmd" ] && ! eval "$validate_cmd"; then
            echo "Post-build validation failed on attempt $attempt."
            if (( attempt < max_attempts )); then
                echo " Nuking build directory before retry..."
                rm -rf build 2>/dev/null || true
                sleep 5
                continue
            fi
            echo "Validation still failing after $max_attempts attempts."
            return 1
        fi

        echo "Build succeeded on attempt $attempt."
        return 0
    done
}

.github/scripts/retry-sbatch.sh

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
#!/bin/bash
# Provides retry_sbatch(): submits a job script string via sbatch with retries.
# Only retries on known transient SLURM/infrastructure errors (socket timeouts,
# connection failures). Hard failures (bad account, invalid partition, QOS
# violations) are not retried.
#
# Usage: source .github/scripts/retry-sbatch.sh
#        job_id=$(retry_sbatch "$script_contents")

retry_sbatch() {
    local contents="$1"
    local max_tries=3
    local try output jid last_err=""

    for (( try = 1; try <= max_tries; try++ )); do
        echo "sbatch attempt $try of $max_tries..." >&2
        # Feed the script via stdin; capture stderr so we can classify errors.
        output=$(printf '%s\n' "$contents" | sbatch 2>&1) || true

        # Success is detected by parsing the job ID out of sbatch's reply.
        jid=$(echo "$output" | grep -oE 'Submitted batch job ([0-9]+)' | grep -oE '[0-9]+$')
        if [ -n "$jid" ]; then
            echo "$jid"
            return 0
        fi

        last_err="$output"
        echo "sbatch failed: $output" >&2

        # Bail out immediately on anything that doesn't look transient.
        if ! echo "$output" | grep -qiE "timed out|connection refused|connection reset|temporarily unavailable"; then
            echo "Non-transient sbatch failure — not retrying." >&2
            return 1
        fi

        if (( try < max_tries )); then
            echo "Transient error — retrying in 30s..." >&2
            sleep 30
        fi
    done

    echo "sbatch failed after $max_tries attempts. Last error: $last_err" >&2
    return 1
}

0 commit comments

Comments
 (0)