Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 89 additions & 0 deletions .github/scripts/setup-build-cache.sh
Comment thread
sbryngelson marked this conversation as resolved.
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
#!/bin/bash
Comment thread
sbryngelson marked this conversation as resolved.
# Sets up a persistent build cache for self-hosted CI runners.
# Creates a symlink: ./build -> <resolved scratch path>/.mfc-ci-cache/<key>/build
#
# This ensures that every run of the same config (cluster/device/interface) finds
# cached build artifacts regardless of which runner instance picks up the job.
#
# Concurrent safety: uses flock to serialize access per cache directory. If
# multiple PRs trigger the same config simultaneously, the second job waits
# for the first to finish (up to 1 hour), then gets a warm cache. If the lock
# times out, falls back to a local build (same as no caching).
#
# Usage: source .github/scripts/setup-build-cache.sh <cluster> <device> <interface>

# Positional arguments. Cluster ($1) and device ($2) are mandatory: the
# ${N:?} expansions abort with an error when either is missing or empty.
# The interface ($3) is optional and falls back to "none".
: "${1:?Usage: setup-build-cache.sh <cluster> <device> <interface>}"
_cache_cluster="$1"
: "${2:?}"
_cache_device="$2"
_cache_interface="${3:-none}"

# One cache directory per cluster/device/interface combination, kept under
# $HOME/scratch/.mfc-ci-cache.
printf -v _cache_key '%s-%s-%s' \
    "$_cache_cluster" "$_cache_device" "$_cache_interface"
_cache_base="${HOME}/scratch/.mfc-ci-cache/${_cache_key}/build"

# Create the cache dir, then resolve to a physical path (no symlinks).
# $HOME/scratch is typically a symlink to a scratch filesystem — resolving
# it ensures the build symlink target remains valid even if intermediate
# symlinks change.
#
# If the directory cannot be created or resolved (for example the scratch
# filesystem is unavailable), _cache_dir would be empty and every path derived
# from it (lock file, workspace marker, symlink target) would be resolved
# against "/". In that case fall back to a plain local build directory, the
# same outcome as running without a cache.
_cache_dir=""
if mkdir -p "$_cache_base"; then
    _cache_dir="$(cd "$_cache_base" && pwd -P)"
fi
if [ -z "$_cache_dir" ]; then
    echo " WARNING: Cannot create or resolve cache dir $_cache_base, building locally without cache"
    _cache_locked=false
    # Make sure ./build is a real local directory, not a link into a cache.
    if [ -L "build" ]; then
        rm -f "build"
    fi
    mkdir -p "build"
    # 'return' ends a sourced script. When the script is executed directly,
    # 'return' fails, so exit instead of running the cache setup below.
    return 0 2>/dev/null || exit 0
fi
Comment thread
sbryngelson marked this conversation as resolved.
Outdated

# Report which cache this job resolved to.
printf '%s\n' \
    "=== Build Cache Setup ===" \
    " Cache key: $_cache_key" \
    " Cache dir: $_cache_dir"

# Acquire an exclusive lock on the cache directory to prevent concurrent
# builds from corrupting it. The lock is fd-based (flock on fd 9), so it
# auto-releases when the calling process exits — no stale locks.
#
# Timeout: 1 hour. If another build holds the lock, we wait. This is fine
# because the waiting job will get a warm cache when it finally acquires.
# If the lock can't be acquired after 1 hour, something is wrong — fall
# back to a local build in the workspace.
#
# _cache_locked is set to true only when this shell holds the lock on fd 9.
_cache_locked=false
_lock_file="$_cache_dir/.cache.lock"
# Only call flock once fd 9 really refers to the lock file. If the open
# fails, fd 9 is closed or still whatever the caller had there, and locking
# it would be meaningless.
if exec 9>"$_lock_file"; then
    echo " Acquiring cache lock..."
    if flock --timeout 3600 9; then
        _cache_locked=true
        echo " Cache lock acquired"
    else
        echo " WARNING: Cache lock timeout (1h), building locally without cache"
        exec 9>&-
    fi
else
    echo " WARNING: Cannot open cache lock file $_lock_file, building locally without cache"
fi

if [ "$_cache_locked" != true ]; then
    # Remove any existing symlink to the shared cache so we don't write
    # into it without the lock. Then create a real local directory.
    if [ -L "build" ]; then
        rm -f "build"
    fi
    mkdir -p "build"
    echo "========================="
    # 'return' ends a sourced script. When the script is executed directly,
    # 'return' fails; exit rather than fall through, because the code that
    # follows would link ./build into the shared cache without the lock.
    return 0 2>/dev/null || exit 0
fi
Comment thread
sbryngelson marked this conversation as resolved.
Outdated

# Clear whatever currently occupies ./build (a real directory, a live symlink,
# or a dangling one) and point it at the cache. rm -rf applied to a symlink
# deletes only the link itself, so the cache contents are never touched.
[[ ! -e build && ! -L build ]] || rm -rf build

ln -s "$_cache_dir" build
Comment thread
sbryngelson marked this conversation as resolved.
Comment thread
sbryngelson marked this conversation as resolved.
Comment thread
sbryngelson marked this conversation as resolved.
Comment thread
sbryngelson marked this conversation as resolved.
Outdated
Comment thread
sbryngelson marked this conversation as resolved.

# Handle cross-runner workspace path changes.
# CMakeCache.txt stores absolute paths from whichever runner instance
# originally configured the build. If we're on a different runner, sed-replace
# the old workspace path with the current one so CMake can do incremental builds.
#
# The marker file $_cache_dir/.workspace_path records the workspace that last
# used this cache; it is rewritten with the current directory on every run.
_workspace_marker="$_cache_dir/.workspace_path"
_new_workspace="$(pwd)"
if [ -f "$_workspace_marker" ]; then
    _old_workspace="$(cat "$_workspace_marker")"
    # An empty marker would produce an empty sed pattern (a sed error), so
    # only rewrite when a real, different previous path was recorded.
    if [ -n "$_old_workspace" ] && [ "$_old_workspace" != "$_new_workspace" ]; then
        echo " Workspace path changed: $_old_workspace -> $_new_workspace"
        echo " Updating cached CMake paths..."
        # Paths are data, not sed syntax. Escape regex metacharacters in the
        # old path (otherwise "." matches any character) and escape "\", "&"
        # and the "|" delimiter in the new path (otherwise "&" re-inserts the
        # matched text).
        _old_workspace_re="$(printf '%s\n' "$_old_workspace" | sed -e 's/[[\.*^$|]/\\&/g')"
        _new_workspace_sub="$(printf '%s\n' "$_new_workspace" | sed -e 's/[\&|]/\\&/g')"
        # Deliberately best effort: staging/ may not exist yet on a cold
        # cache, so errors are silenced and never fail the job.
        find "$_cache_dir/staging" -type f \
            \( -name "CMakeCache.txt" -o -name "*.cmake" \
            -o -name "*.make" -o -name "Makefile" \
            -o -name "build.ninja" \) \
            -exec sed -i "s|${_old_workspace_re}|${_new_workspace_sub}|g" {} + 2>/dev/null || true
    fi
fi
printf '%s\n' "$_new_workspace" > "$_workspace_marker"

# Closing summary: where ./build now points, followed by the footer rule.
printf ' Symlink: build -> %s\n' "$_cache_dir"
printf '%s\n' "========================="
8 changes: 8 additions & 0 deletions .github/workflows/coverage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,14 @@ jobs:
- name: Checkouts
uses: actions/checkout@v4

- name: Restore Build Cache
uses: actions/cache@v4
with:
path: build
Comment thread
sbryngelson marked this conversation as resolved.
key: mfc-coverage-${{ hashFiles('CMakeLists.txt', 'toolchain/dependencies/**', 'toolchain/cmake/**', 'src/**/*.fpp', 'src/**/*.f90') }}
restore-keys: |
mfc-coverage-

- name: Setup Ubuntu
run: |
sudo apt update -y
Expand Down
7 changes: 5 additions & 2 deletions .github/workflows/frontier/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ fi

. ./mfc.sh load -c f -m g

# Set up persistent build cache
source .github/scripts/setup-build-cache.sh frontier "$job_device" "$job_interface"

max_attempts=3
attempt=1
while [ $attempt -le $max_attempts ]; do
Expand Down Expand Up @@ -45,8 +48,8 @@ while [ $attempt -le $max_attempts ]; do
fi
Comment thread
sbryngelson marked this conversation as resolved.

if [ $attempt -lt $max_attempts ]; then
echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
./mfc.sh clean
echo "Build failed on attempt $attempt. Clearing staging/install and retrying in 30s..."
rm -rf build/staging build/lock.yaml
Comment thread
cubic-dev-ai[bot] marked this conversation as resolved.
Outdated
sleep 30
fi
attempt=$((attempt + 1))
Expand Down
7 changes: 5 additions & 2 deletions .github/workflows/frontier_amd/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ fi

. ./mfc.sh load -c famd -m g

# Set up persistent build cache
source .github/scripts/setup-build-cache.sh frontier_amd "$job_device" "$job_interface"

max_attempts=3
attempt=1
while [ $attempt -le $max_attempts ]; do
Expand Down Expand Up @@ -45,8 +48,8 @@ while [ $attempt -le $max_attempts ]; do
fi
Comment thread
sbryngelson marked this conversation as resolved.

if [ $attempt -lt $max_attempts ]; then
echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
./mfc.sh clean
echo "Build failed on attempt $attempt. Clearing staging/install and retrying in 30s..."
rm -rf build/staging build/lock.yaml
Comment thread
sbryngelson marked this conversation as resolved.
Outdated
Comment thread
sbryngelson marked this conversation as resolved.
Outdated
sleep 30
fi
attempt=$((attempt + 1))
Expand Down
14 changes: 12 additions & 2 deletions .github/workflows/phoenix/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ if [ "$job_device" = "gpu" ]; then
fi
fi

# Set up persistent build cache
source .github/scripts/setup-build-cache.sh phoenix "$job_device" "$job_interface"

max_attempts=3
attempt=1
while [ $attempt -le $max_attempts ]; do
Expand All @@ -20,8 +23,8 @@ while [ $attempt -le $max_attempts ]; do
fi

if [ $attempt -lt $max_attempts ]; then
echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
./mfc.sh clean
echo "Build failed on attempt $attempt. Clearing staging/install and retrying in 30s..."
rm -rf build/staging build/lock.yaml
sleep 30
else
echo "Build failed after $max_attempts attempts."
Expand All @@ -30,6 +33,13 @@ while [ $attempt -le $max_attempts ]; do
attempt=$((attempt + 1))
done

# Drop the build-cache lock (held on fd 9) before the test phase starts.
# Tests only read installed binaries and can run for hours, so holding the
# lock any longer would needlessly block other builds. Nothing happens when
# the setup script did not acquire the lock.
case "${_cache_locked:-false}" in
    true)
        exec 9>&-
        printf '%s\n' "Released build cache lock before tests"
        ;;
esac
Comment thread
sbryngelson marked this conversation as resolved.
Outdated

n_test_threads=8

if [ "$job_device" = "gpu" ]; then
Expand Down
8 changes: 8 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,14 @@ jobs:
- name: Clone
uses: actions/checkout@v4

- name: Restore Build Cache
uses: actions/cache@v4
with:
path: build
key: mfc-build-${{ matrix.os }}-${{ matrix.mpi }}-${{ matrix.debug }}-${{ matrix.precision }}-${{ matrix.intel }}-${{ hashFiles('CMakeLists.txt', 'toolchain/dependencies/**', 'toolchain/cmake/**', 'src/**/*.fpp', 'src/**/*.f90') }}
restore-keys: |
mfc-build-${{ matrix.os }}-${{ matrix.mpi }}-${{ matrix.debug }}-${{ matrix.precision }}-${{ matrix.intel }}-

- name: Setup MacOS
if: matrix.os == 'macos'
run: |
Expand Down
Loading