Skip to content

Commit cad1e27

Browse files
authored
Merge branch 'master' into MovingBubblesFresh-clean
2 parents: 98bcbf9 + afc724e; commit: cad1e27

File tree

13 files changed, +150 / -94 lines changed

13 files changed, +150 / -94 lines changed

.github/scripts/run_case_optimization.sh

Lines changed: 25 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -13,13 +13,6 @@ if [ "$job_device" = "gpu" ] && [ "$ngpus" -eq 0 ]; then
1313
ngpus=1
1414
fi
1515

16-
# Verify the venv Python interpreter exists (created by ./mfc.sh build)
17-
if [ ! -x build/venv/bin/python3 ]; then
18-
echo "ERROR: build/venv/bin/python3 not found."
19-
echo "The MFC build venv may not have been created. Was the pre-build step successful?"
20-
exit 1
21-
fi
22-
2316
benchmarks=(
2417
benchmarks/5eq_rk3_weno3_hllc/case.py
2518
benchmarks/viscous_weno5_sgb_acoustic/case.py
@@ -28,6 +21,30 @@ benchmarks=(
2821
benchmarks/igr/case.py
2922
)
3023

24+
# For Frontier/Frontier AMD: deps were fetched on the login node via --deps-only;
25+
# build case-optimized binaries here on the compute node before running.
26+
# For Phoenix: prebuild-case-optimization.sh already built everything in a prior SLURM job.
27+
#
28+
# Clean stale MFC target staging before building. On self-hosted CI runners,
29+
# corrupted intermediate files from a prior failed build (e.g. CCE optcg crash)
30+
# can persist and poison subsequent builds. Each case-opt config gets its own
31+
# hash-named staging dir, but install dirs and other artifacts may be stale.
32+
if [ "$job_cluster" != "phoenix" ]; then
33+
# Clean stale MFC target dirs (hash-named) from prior builds, but
34+
# preserve dependency dirs (hipfort, fftw, etc.) since the compute
35+
# node has no internet to re-fetch them.
36+
echo "=== Cleaning stale MFC target staging/install ==="
37+
find build/staging -maxdepth 1 -regex '.*/[0-9a-f]+' -type d -exec rm -rf {} + 2>/dev/null || true
38+
find build/install -maxdepth 1 -regex '.*/[0-9a-f]+' -type d -exec rm -rf {} + 2>/dev/null || true
39+
40+
echo "=== Building case-optimized binaries on compute node ==="
41+
for case in "${benchmarks[@]}"; do
42+
echo "--- Building: $case ---"
43+
./mfc.sh build -i "$case" --case-optimization $gpu_opts -j 8
44+
done
45+
echo "=== All case-optimized binaries built ==="
46+
fi
47+
3148
passed=0
3249
failed=0
3350
failed_cases=""
@@ -44,7 +61,7 @@ for case in "${benchmarks[@]}"; do
4461
rm -rf "$case_dir/D" "$case_dir/p_all" "$case_dir/restart_data"
4562

4663
# Build + run with --case-optimization, small grid, 10 timesteps
47-
if ./mfc.sh run "$case" --case-optimization $gpu_opts -n "$ngpus" -j 8 -- --gbpp 1 --steps 10; then
64+
if ./mfc.sh run "$case" --case-optimization $gpu_opts -n "$ngpus" -j 8 -c "$job_cluster" -- --gbpp 1 --steps 10; then
4865
# Validate output
4966
if build/venv/bin/python3 .github/scripts/check_case_optimization_output.py "$case_dir"; then
5067
echo "PASS: $case_name"

.github/workflows/bench.yml

Lines changed: 9 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -68,42 +68,47 @@ jobs:
6868
flag: f
6969
device: gpu
7070
interface: acc
71-
build_script: "bash .github/workflows/frontier/build.sh gpu acc bench"
71+
build_script: "bash .github/workflows/frontier/build.sh gpu acc"
7272
- cluster: frontier
7373
name: Oak Ridge | Frontier (CCE)
7474
group: phoenix
7575
labels: frontier
7676
flag: f
7777
device: gpu
7878
interface: omp
79-
build_script: "bash .github/workflows/frontier/build.sh gpu omp bench"
79+
build_script: "bash .github/workflows/frontier/build.sh gpu omp"
8080
- cluster: frontier_amd
8181
name: Oak Ridge | Frontier (AMD)
8282
group: phoenix
8383
labels: frontier
8484
flag: famd
8585
device: gpu
8686
interface: omp
87-
build_script: "bash .github/workflows/frontier_amd/build.sh gpu omp bench"
87+
build_script: "bash .github/workflows/frontier_amd/build.sh gpu omp"
8888
continue-on-error: ${{ matrix.cluster == 'frontier' || matrix.cluster == 'frontier_amd' }}
8989
runs-on:
9090
group: ${{ matrix.group }}
9191
labels: ${{ matrix.labels }}
9292
timeout-minutes: 480
9393
steps:
94+
- name: Clean stale output files
95+
run: rm -f *.out
96+
9497
- name: Clone - PR
9598
uses: actions/checkout@v4
9699
with:
97100
path: pr
101+
clean: false
98102

99103
- name: Clone - Master
100104
uses: actions/checkout@v4
101105
with:
102106
repository: MFlowCode/MFC
103107
ref: master
104108
path: master
109+
clean: false
105110

106-
- name: Setup & Build
111+
- name: Fetch Dependencies
107112
if: matrix.build_script != ''
108113
timeout-minutes: 150
109114
run: |

.github/workflows/common/bench.sh

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -21,18 +21,18 @@ if [ "$job_cluster" = "phoenix" ]; then
2121
trap 'rm -rf "$currentdir" || true' EXIT
2222
fi
2323

24-
# --- Build (if not pre-built on login node) ---
25-
# Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node.
24+
# --- Build ---
25+
# Phoenix builds everything inside SLURM (no login-node build step).
26+
# Frontier/Frontier AMD: deps already fetched on login node via --deps-only;
27+
# source code is built here on the compute node.
2628
# Phoenix: always nuke stale builds (heterogeneous compute nodes → ISA mismatch risk).
2729
if [ "$job_cluster" = "phoenix" ]; then
2830
source .github/scripts/clean-build.sh
2931
clean_build
3032
fi
3133

32-
if [ ! -d "build" ]; then
33-
source .github/scripts/retry-build.sh
34-
retry_build ./mfc.sh build -j $n_jobs $build_opts || exit 1
35-
fi
34+
source .github/scripts/retry-build.sh
35+
retry_build ./mfc.sh build -j $n_jobs $build_opts || exit 1
3636

3737
# --- Bench cluster flag ---
3838
if [ "$job_cluster" = "phoenix" ]; then

.github/workflows/common/build.sh

Lines changed: 43 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,43 @@
1+
#!/bin/bash
2+
# Build-only script for all clusters.
3+
# Runs inside a SLURM job via submit-slurm-job.sh.
4+
# Builds MFC without running tests (--dry-run).
5+
# Expects env vars: $job_device, $job_interface, $job_shard, $job_cluster
6+
7+
set -euo pipefail
8+
9+
source .github/scripts/gpu-opts.sh
10+
build_opts="$gpu_opts"
11+
12+
# --- Phoenix TMPDIR setup ---
13+
if [ "$job_cluster" = "phoenix" ]; then
14+
tmpbuild=/storage/project/r-sbryngelson3-0/sbryngelson3/mytmp_build
15+
currentdir=$tmpbuild/run-$(( RANDOM % 9000 ))
16+
mkdir -p $tmpbuild
17+
mkdir -p $currentdir
18+
export TMPDIR=$currentdir
19+
trap 'rm -rf "$currentdir" || true' EXIT
20+
fi
21+
22+
# --- Build ---
23+
# Phoenix builds everything inside SLURM (no login-node build step).
24+
# Frontier/Frontier AMD: deps already fetched on login node via --deps-only;
25+
# source code is built here on the compute node.
26+
# Phoenix: always start fresh to avoid SIGILL from stale binaries compiled
27+
# on a different microarchitecture.
28+
if [ "$job_cluster" = "phoenix" ]; then
29+
source .github/scripts/clean-build.sh
30+
clean_build
31+
fi
32+
33+
source .github/scripts/retry-build.sh
34+
35+
# Phoenix: smoke-test the syscheck binary to catch architecture mismatches
36+
# (SIGILL from binaries compiled on a different compute node).
37+
validate_cmd=""
38+
if [ "$job_cluster" = "phoenix" ]; then
39+
validate_cmd='syscheck_bin=$(find build/install -name syscheck -type f 2>/dev/null | head -1); [ -z "$syscheck_bin" ] || "$syscheck_bin" > /dev/null 2>&1'
40+
fi
41+
42+
RETRY_VALIDATE_CMD="$validate_cmd" \
43+
retry_build ./mfc.sh test -v --dry-run -j 8 $build_opts || exit 1

.github/workflows/common/test.sh

Lines changed: 3 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
#!/bin/bash
2-
# Unified test script for all clusters.
2+
# Test-only script for all clusters.
33
# Runs inside a SLURM job via submit-slurm-job.sh.
4+
# Assumes MFC is already built (by a prior build.sh SLURM job).
45
# Expects env vars: $job_device, $job_interface, $job_shard, $job_cluster
56

67
set -euo pipefail
@@ -9,9 +10,6 @@ source .github/scripts/gpu-opts.sh
910
build_opts="$gpu_opts"
1011

1112
# --- Phoenix TMPDIR setup ---
12-
# Phoenix compute nodes have a small /tmp. With 8 parallel test threads each
13-
# spawning MPI processes, it fills up and ORTE session dir creation fails.
14-
# Redirect TMPDIR to project storage, same as bench.sh.
1513
if [ "$job_cluster" = "phoenix" ]; then
1614
tmpbuild=/storage/project/r-sbryngelson3-0/sbryngelson3/mytmp_build
1715
currentdir=$tmpbuild/run-$(( RANDOM % 9000 ))
@@ -21,29 +19,6 @@ if [ "$job_cluster" = "phoenix" ]; then
2119
trap 'rm -rf "$currentdir" || true' EXIT
2220
fi
2321

24-
# --- Build (if not pre-built on login node) ---
25-
# Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node.
26-
# Phoenix builds inside SLURM on heterogeneous compute nodes — always start fresh
27-
# to avoid SIGILL from stale binaries compiled on a different microarchitecture.
28-
if [ "$job_cluster" = "phoenix" ]; then
29-
source .github/scripts/clean-build.sh
30-
clean_build
31-
fi
32-
33-
if [ ! -d "build" ]; then
34-
source .github/scripts/retry-build.sh
35-
36-
# Phoenix: smoke-test the syscheck binary to catch architecture mismatches
37-
# (SIGILL from binaries compiled on a different compute node).
38-
validate_cmd=""
39-
if [ "$job_cluster" = "phoenix" ]; then
40-
validate_cmd='syscheck_bin=$(find build/install -name syscheck -type f 2>/dev/null | head -1); [ -z "$syscheck_bin" ] || "$syscheck_bin" > /dev/null 2>&1'
41-
fi
42-
43-
RETRY_VALIDATE_CMD="$validate_cmd" \
44-
retry_build ./mfc.sh test -v --dry-run -j 8 $build_opts || exit 1
45-
fi
46-
4722
# --- GPU detection and thread count ---
4823
device_opts=""
4924
rdma_opts=""
@@ -88,4 +63,4 @@ if [ "${GITHUB_EVENT_NAME:-}" = "pull_request" ]; then
8863
prune_flag="--only-changes"
8964
fi
9065

91-
./mfc.sh test -v --max-attempts 3 $prune_flag -a -j $n_test_threads $rdma_opts $device_opts $build_opts $shard_opts -- -c $job_cluster
66+
./mfc.sh test -v --max-attempts 3 --no-build $prune_flag -a -j $n_test_threads $rdma_opts $device_opts $build_opts $shard_opts -- -c $job_cluster

.github/workflows/frontier/build.sh

Lines changed: 1 addition & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,6 @@ esac
1414

1515
job_device=$1
1616
job_interface=$2
17-
run_bench=$3
1817
source .github/scripts/gpu-opts.sh
1918
build_opts="$gpu_opts"
2019

@@ -24,8 +23,4 @@ source .github/scripts/clean-build.sh
2423
clean_build
2524

2625
source .github/scripts/retry-build.sh
27-
if [ "$run_bench" == "bench" ]; then
28-
retry_build ./mfc.sh build -j 8 $build_opts || exit 1
29-
else
30-
retry_build ./mfc.sh test -v -a --dry-run $([ "$cluster_name" = "frontier" ] && echo "--rdma-mpi") -j 8 $build_opts || exit 1
31-
fi
26+
retry_build ./mfc.sh build --deps-only -j 8 $build_opts || exit 1

.github/workflows/test.yml

Lines changed: 24 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -225,7 +225,7 @@ jobs:
225225
if: matrix.os == 'macos'
226226
run: |
227227
brew update
228-
brew upgrade
228+
brew upgrade || true
229229
brew install coreutils python fftw hdf5 gcc@15 boost open-mpi lapack
230230
echo "FC=gfortran-15" >> $GITHUB_ENV
231231
echo "BOOST_INCLUDE=/opt/homebrew/include/" >> $GITHUB_ENV
@@ -400,11 +400,14 @@ jobs:
400400
echo "Coverage cache: none available — full test suite will run"
401401
fi
402402
403-
- name: Build (login node)
403+
- name: Fetch Dependencies
404404
if: matrix.cluster != 'phoenix'
405405
timeout-minutes: 60
406406
run: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}
407407

408+
- name: Build
409+
run: bash .github/scripts/submit-slurm-job.sh .github/workflows/common/build.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }} ${{ matrix.shard }}
410+
408411
- name: Test
409412
run: bash .github/scripts/submit-slurm-job.sh .github/workflows/common/test.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }} ${{ matrix.shard }}
410413

@@ -421,23 +424,29 @@ jobs:
421424
if: always()
422425
id: log
423426
run: |
424-
SLUG="test-${{ matrix.device }}-${{ matrix.interface }}"
427+
SHARD_SUFFIX=""
425428
SHARD="${{ matrix.shard }}"
426429
if [ -n "$SHARD" ]; then
427-
SLUG="${SLUG}-$(echo "$SHARD" | sed 's|/|-of-|')"
430+
SHARD_SUFFIX="-$(echo "$SHARD" | sed 's|/|-of-|')"
428431
fi
429-
echo "slug=${SLUG}" >> "$GITHUB_OUTPUT"
432+
echo "build_slug=build-${{ matrix.device }}-${{ matrix.interface }}${SHARD_SUFFIX}" >> "$GITHUB_OUTPUT"
433+
echo "test_slug=test-${{ matrix.device }}-${{ matrix.interface }}${SHARD_SUFFIX}" >> "$GITHUB_OUTPUT"
430434
431435
- name: Print Logs
432436
if: always()
433-
run: cat ${{ steps.log.outputs.slug }}.out
437+
run: |
438+
for f in ${{ steps.log.outputs.build_slug }}.out ${{ steps.log.outputs.test_slug }}.out; do
439+
[ -f "$f" ] && echo "=== $f ===" && cat "$f"
440+
done
434441
435442
- name: Archive Logs
436443
uses: actions/upload-artifact@v4
437444
if: matrix.cluster != 'phoenix'
438445
with:
439-
name: logs-${{ strategy.job-index }}-${{ steps.log.outputs.slug }}
440-
path: ${{ steps.log.outputs.slug }}.out
446+
name: logs-${{ strategy.job-index }}-${{ steps.log.outputs.test_slug }}
447+
path: |
448+
${{ steps.log.outputs.build_slug }}.out
449+
${{ steps.log.outputs.test_slug }}.out
441450
442451
case-optimization:
443452
name: "Case Opt | ${{ matrix.cluster_name }} (${{ matrix.device }}-${{ matrix.interface }})"
@@ -486,15 +495,20 @@ jobs:
486495
- name: Clean stale output files
487496
run: rm -f *.out
488497

498+
- name: Fetch Dependencies
499+
if: matrix.cluster != 'phoenix'
500+
run: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}
501+
489502
- name: Pre-Build (SLURM)
490503
if: matrix.cluster == 'phoenix'
491504
run: bash .github/scripts/submit-slurm-job.sh .github/scripts/prebuild-case-optimization.sh cpu ${{ matrix.interface }} ${{ matrix.cluster }}
492505

493-
- name: Pre-Build (login node)
506+
- name: Build & Run Case-Optimization Tests
494507
if: matrix.cluster != 'phoenix'
495-
run: bash .github/scripts/prebuild-case-optimization.sh ${{ matrix.cluster }} ${{ matrix.device }} ${{ matrix.interface }}
508+
run: bash .github/scripts/submit-slurm-job.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}
496509

497510
- name: Run Case-Optimization Tests
511+
if: matrix.cluster == 'phoenix'
498512
run: bash .github/scripts/submit-slurm-job.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}
499513

500514
- name: Cancel SLURM Jobs

CMakeLists.txt

Lines changed: 6 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -176,7 +176,7 @@ if (CMAKE_Fortran_COMPILER_ID STREQUAL "GNU")
176176
endif()
177177
elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
178178
add_compile_options(
179-
"SHELL:-M 296,878,1391,1069,5025"
179+
"SHELL:-M 296,878,1391,1069,990,5025,7208,7212,7242"
180180
"SHELL:-h static" "SHELL:-h keepfiles"
181181
"SHELL:-h acc_model=auto_async_none"
182182
"SHELL: -h acc_model=no_fast_addr"
@@ -190,9 +190,9 @@ elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
190190
add_compile_options(
191191
"SHELL:-h acc_model=auto_async_none"
192192
"SHELL: -h acc_model=no_fast_addr"
193-
"SHELL: -K trap=fp" "SHELL: -G2"
193+
"SHELL: -K trap=fp" "SHELL: -g" "SHELL: -O0"
194194
)
195-
add_link_options("SHELL: -K trap=fp" "SHELL: -G2")
195+
add_link_options("SHELL: -K trap=fp" "SHELL: -g" "SHELL: -O0")
196196
endif()
197197

198198
elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Flang")
@@ -665,10 +665,9 @@ if (MFC_SIMULATION)
665665
# Disabling IPA per-file avoids the crashes while preserving IPA for
666666
# the rest of simulation (needed for thermochem INLINEALWAYS inlining).
667667
# Applied to Cray+OpenACC and Cray CPU, but NOT Cray+OpenMP: on OpenMP,
668-
# m_thermochem uses !DIR$ INLINEALWAYS (requires IPA), so disabling IPA
669-
# for these files breaks thermochem on-device calls. On OpenACC the
670-
# pyrometheus patch emits !$acc routine seq instead (no IPA needed).
671-
# See PR #1286.
668+
# thermochem relies on IPA inlining. CCE 19.0.0 IPA workaround: disable
669+
# interprocedural analysis for files that trigger compiler SIGSEGV during
670+
# IPA (Bug 3: m_phase_change, Bug 4: m_bubbles_EL). See PR #1286.
672671
if (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray" AND NOT MFC_OpenMP)
673672
set_source_files_properties(
674673
"${CMAKE_BINARY_DIR}/fypp/simulation/m_bubbles_EL.fpp.f90"

docs/index.html

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -55,7 +55,7 @@
5555
// Aerodynamics
5656
{ name: "Flow over an airfoil (vorticity)", image: "res/simulations/g.png", computer: "Delta", computerUrl: "https://www.ncsa.illinois.edu/research/project-highlights/delta/", accelerators: "128 A100s", walltime: "19h", source: "https://www.youtube.com/watch?v=FvAgnBW59cY" },
5757
{ name: "Pitching airfoil (3D)", image: "res/simulations/m.png", computer: "Phoenix", computerUrl: "https://www.pace.gatech.edu/", accelerators: "1 A100", walltime: "5h", source: "https://www.youtube.com/watch?v=2XH-9MumDHU" },
58-
{ name: "Flow over a corgi (pressure)", image: "res/simulations/u.png", computer: "HiPerGator", computerUrl: "https://www.rc.ufl.edu/about/hipergator/", accelerators: "4 B200s", walltime: "1h", source: "https://www.youtube.com/watch?v=IFNK3psPf3g" },
58+
{ name: "Mach 0.3 flow over a corgi (2M STL)", image: "res/simulations/u.png", computer: "HiPerGator", computerUrl: "https://www.rc.ufl.edu/about/hipergator/", accelerators: "2 GPUs", walltime: "80s", source: "https://www.youtube.com/watch?v=O8dSRqHLp_o" },
5959
// Shock-droplet
6060
{ name: "Shedding water droplet", image: "res/simulations/a.png", computer: "Summit", computerUrl: "https://www.olcf.ornl.gov/summit/", accelerators: "960 V100s", walltime: "4h", source: "https://www.youtube.com/watch?v=Gjj-qZkXcrg" },
6161
// Biomedical & acoustics

docs/res/simulations/u.png

1010 KB
Loading

0 commit comments

Comments
 (0)