Skip to content

Commit ce98373

Browse files
authored
Consolidate CI infrastructure and add NFS-resilient build cache (#1285)
1 parent 472a0de commit ce98373

File tree

27 files changed

+478
-419
lines changed

27 files changed

+478
-419
lines changed

.github/file-filter.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ yml: &yml
2525
- '.github/workflows/phoenix/**'
2626
- '.github/workflows/frontier/**'
2727
- '.github/workflows/frontier_amd/**'
28+
- '.github/scripts/**'
2829
- '.github/workflows/bench.yml'
2930
- '.github/workflows/test.yml'
3031
- '.github/workflows/formatting.yml'

.github/scripts/bench-preamble.sh

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
#!/bin/bash
# Shared preamble for benchmark scripts: detects GPUs, sets build/device opts.
# Sets: $gpu_opts, $build_opts, $device_opts, $n_ranks, $ngpus, $gpu_ids
# Usage: source .github/scripts/bench-preamble.sh

source .github/scripts/detect-gpus.sh
source .github/scripts/gpu-opts.sh

# CPU defaults; the GPU branch below overrides rank count and device flags.
build_opts="$gpu_opts"
device_opts=""
n_ranks=12
if [ "$job_device" = "gpu" ]; then
    # One MPI rank per detected GPU, and pin ranks to the detected GPU ids.
    n_ranks=$ngpus
    device_opts="$gpu_opts -g $gpu_ids"
fi
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
#!/usr/bin/env python3

"""Validate case-optimization output: check D/*.dat for NaN/Inf via the packer."""

import math
import sys
import os

# Require exactly one argument: the case directory (or a file inside it).
if len(sys.argv) != 2:
    print(f"Usage: {sys.argv[0]} <case_directory>", file=sys.stderr)
    sys.exit(1)

# Allow importing from the repo root
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..'))

from toolchain.mfc.packer.pack import compile as pack_compile

# Accept a file path as a convenience and fall back to its directory.
target = sys.argv[1]
case_dir = os.path.dirname(target) if os.path.isfile(target) else target

pack, err = pack_compile(case_dir)
if err is not None:
    print(f"ERROR: {err}")
    sys.exit(1)

if not pack.entries:
    print(f"ERROR: No data found in {case_dir}/D/")
    sys.exit(1)

if pack.has_bad_values():
    print("ERROR: NaN or Inf detected in output:")
    # Report only the first offending value per output file to keep logs short.
    for name, entry in pack.entries.items():
        for idx, value in enumerate(entry.doubles):
            if math.isnan(value) or math.isinf(value):
                kind = 'NaN' if math.isnan(value) else 'Inf'
                print(f" {kind} at index {idx} in {name}")
                break
    sys.exit(1)

value_count = sum(len(e.doubles) for e in pack.entries.values())
print(f"OK: {len(pack.entries)} files, {value_count} values — no NaN/Inf found")

.github/scripts/detect-gpus.sh

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
#!/bin/bash
# Detects GPUs (NVIDIA or AMD), sets $ngpus and $gpu_ids.
# Usage: source .github/scripts/detect-gpus.sh

# Defaults when no GPU tooling is present: zero GPUs, empty id list.
ngpus=0
gpu_ids=""
if command -v nvidia-smi > /dev/null 2>&1; then
    # `nvidia-smi -L` prints one line per GPU; NVIDIA ids are contiguous from 0.
    ngpus=$(nvidia-smi -L | wc -l)
    gpu_ids=$(seq -s ' ' 0 $((ngpus - 1)))
elif command -v rocm-smi > /dev/null 2>&1; then
    # AMD ids may be sparse, so extract them from `rocm-smi --showid` and count.
    gpu_ids=$(rocm-smi --showid | awk '{print $1}' | grep -Eo '[0-9]+' | uniq | tr '\n' ' ')
    ngpus=$(echo "$gpu_ids" | wc -w)
fi

.github/scripts/gpu-opts.sh

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
#!/bin/bash
# Sets $gpu_opts from $job_device and $job_interface.
# Usage: source .github/scripts/gpu-opts.sh

# Empty on CPU jobs. On GPU jobs the interface selects the offload backend
# flag passed to mfc.sh ("mp" for OpenMP offload, "acc" for OpenACC).
gpu_opts=""
if [ "$job_device" = "gpu" ]; then
    case "$job_interface" in
        omp) gpu_opts="--gpu mp"  ;;
        acc) gpu_opts="--gpu acc" ;;
        *)   gpu_opts="--gpu"     ;;
    esac
fi
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
#!/bin/bash

# Pre-builds all benchmark cases with --case-optimization.
# Can run in two modes:
#   1. Direct (Frontier login nodes): pass cluster/device/interface as args
#   2. Inside SLURM (Phoenix): uses $job_device/$job_interface from submit.sh
# Usage: bash prebuild-case-optimization.sh [<cluster> <device> <interface>]

set -e

# Positional args take precedence; otherwise fall back to env vars
# exported by submit.sh (SLURM path).
cluster="${1:-${job_cluster:-phoenix}}"
job_device="${2:-$job_device}"
job_interface="${3:-$job_interface}"

# Map the cluster name onto the module flag understood by `./mfc.sh load -c`.
case "$cluster" in
    phoenix)      flag="p"    ;;
    frontier)     flag="f"    ;;
    frontier_amd) flag="famd" ;;
    *) echo "ERROR: Unknown cluster '$cluster'"; exit 1 ;;
esac

. ./mfc.sh load -c "$flag" -m g
source .github/scripts/gpu-opts.sh

# $gpu_opts is deliberately unquoted: it may expand to multiple flags.
for case_file in benchmarks/*/case.py; do
    echo "=== Pre-building: $case_file ==="
    ./mfc.sh build -i "$case_file" --case-optimization $gpu_opts -j 8
done

.github/scripts/retry-build.sh

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
#!/bin/bash
# Provides retry_build(): 3-attempt loop with configurable cleanup.
# Set RETRY_CLEAN_CMD to override cleanup (default: rm -rf build/staging build/install build/lock.yaml).
# Set RETRY_VALIDATE_CMD to run a post-build validation; failure triggers a retry.
# Usage: source .github/scripts/retry-build.sh
#        retry_build ./mfc.sh build -j 8 --gpu acc

# Try normal cleanup; if it fails, escalate to cache nuke.
_retry_clean() {
    local cleanup="$1"
    # Happy path: the configured cleanup command succeeds.
    if eval "$cleanup" 2>/dev/null; then
        return 0
    fi
    echo " Normal cleanup failed."
    if ! type _cache_nuke > /dev/null 2>&1; then
        # _cache_nuke is only defined when setup-build-cache.sh was sourced.
        echo " _cache_nuke not available, best-effort rm."
        rm -rf build/staging build/install build/lock.yaml 2>/dev/null || true
        return
    fi
    echo " Escalating to NFS cache nuke..."
    _cache_nuke
}

# retry_build <command...> — run the build command up to 3 times, cleaning
# between attempts; optional RETRY_VALIDATE_CMD gates success.
retry_build() {
    local clean_cmd="${RETRY_CLEAN_CMD:-rm -rf build/staging build/install build/lock.yaml}"
    local validate_cmd="${RETRY_VALIDATE_CMD:-}"
    local max_attempts=3
    local attempt
    for (( attempt = 1; attempt <= max_attempts; attempt++ )); do
        echo "Build attempt $attempt of $max_attempts..."
        if ! "$@"; then
            # The build command itself failed.
            if [ "$attempt" -ge "$max_attempts" ]; then
                echo "Build failed after $max_attempts attempts."
                return 1
            fi
            echo "Build failed on attempt $attempt. Retrying in 30s..."
            _retry_clean "$clean_cmd"
            sleep 30
            continue
        fi
        # Build succeeded; run the optional post-build validation.
        if [ -n "$validate_cmd" ] && ! eval "$validate_cmd"; then
            echo "Post-build validation failed on attempt $attempt."
            if [ "$attempt" -ge "$max_attempts" ]; then
                echo "Validation still failing after $max_attempts attempts."
                return 1
            fi
            echo "Cleaning and retrying in 5s..."
            _retry_clean "$clean_cmd"
            sleep 5
            continue
        fi
        echo "Build succeeded on attempt $attempt."
        return 0
    done
}
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
#!/bin/bash
# Runs ./mfc.sh test with all provided arguments, then retries a small number
# of sporadic failures (up to 5). Exits non-zero on real failures.
# Usage: bash .github/scripts/run-tests-with-retry.sh [mfc test args...]

# Collect flags that must carry over to the retry invocation (the retry builds
# its own argument list with --only, so passthrough flags are captured here).
PASSTHROUGH=""
for arg in "$@"; do
    if [ "$arg" = "--test-all" ]; then
        PASSTHROUGH="$PASSTHROUGH --test-all"
    fi
done

rm -f tests/failed_uuids.txt
TEST_EXIT=0
/bin/bash mfc.sh test "$@" || TEST_EXIT=$?

if [ -s tests/failed_uuids.txt ]; then
    # Some tests failed and recorded their UUIDs — retry only if few enough
    # that the failures look sporadic rather than systematic.
    NUM_FAILED=$(wc -l < tests/failed_uuids.txt)
    if [ "$NUM_FAILED" -gt 5 ]; then
        echo "Too many failures ($NUM_FAILED) to retry — likely a real issue."
        exit 1
    fi
    FAILED=$(tr '\n' ' ' < tests/failed_uuids.txt)
    echo ""
    echo "=== Retrying $NUM_FAILED failed test(s): $FAILED ==="
    echo ""
    # $FAILED and $PASSTHROUGH are intentionally unquoted word lists.
    /bin/bash mfc.sh test -v --max-attempts 3 -j "$(nproc)" --only $FAILED $PASSTHROUGH || exit $?
elif [ "$TEST_EXIT" -ne 0 ]; then
    # No per-test failure file, but the run itself failed — propagate.
    exit $TEST_EXIT
fi
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
#!/bin/bash

# Case-optimization CI test script.
# Runs inside a SLURM job — expects $job_device and $job_interface from submit.sh.
# Builds and runs a fixed set of benchmark cases with --case-optimization,
# then validates their output for NaN/Inf values.

set -e

source .github/scripts/detect-gpus.sh
source .github/scripts/gpu-opts.sh

# Fall back to a single rank when detection finds no GPUs. Previously this
# fallback applied only when $job_device=gpu, which let CPU runs on GPU-less
# runners pass "-n 0" (an invalid rank count) to ./mfc.sh run.
if [ "$ngpus" -eq 0 ]; then
    ngpus=1
fi

# Verify the venv Python interpreter exists (created by ./mfc.sh build)
if [ ! -x build/venv/bin/python3 ]; then
    echo "ERROR: build/venv/bin/python3 not found."
    echo "The MFC build venv may not have been created. Was the pre-build step successful?"
    exit 1
fi

# Benchmark cases exercised by this test.
benchmarks=(
    benchmarks/5eq_rk3_weno3_hllc/case.py
    benchmarks/viscous_weno5_sgb_acoustic/case.py
    benchmarks/hypo_hll/case.py
    benchmarks/ibm/case.py
    benchmarks/igr/case.py
)

passed=0
failed=0
failed_cases=""

for case in "${benchmarks[@]}"; do
    case_dir="$(dirname "$case")"
    case_name="$(basename "$case_dir")"
    echo ""
    echo "========================================"
    echo "Case-optimization test: $case_name"
    echo "========================================"

    # Clean any previous output
    rm -rf "$case_dir/D" "$case_dir/p_all" "$case_dir/restart_data"

    # Build + run with --case-optimization, small grid, 10 timesteps.
    # $gpu_opts is intentionally unquoted (may expand to several flags).
    if ./mfc.sh run "$case" --case-optimization $gpu_opts -n "$ngpus" -j "$(nproc)" -- --gbpp 1 --steps 10; then
        # Validate the produced D/*.dat files for NaN/Inf.
        if build/venv/bin/python3 .github/scripts/check_case_optimization_output.py "$case_dir"; then
            echo "PASS: $case_name"
            passed=$((passed + 1))
        else
            echo "FAIL: $case_name (validation error)"
            failed=$((failed + 1))
            failed_cases="$failed_cases $case_name"
        fi
    else
        echo "FAIL: $case_name (build or run error)"
        failed=$((failed + 1))
        failed_cases="$failed_cases $case_name"
    fi

    # Clean up output between cases
    rm -rf "$case_dir/D" "$case_dir/p_all" "$case_dir/restart_data"
done

echo ""
echo "========================================"
echo "Case-optimization summary: $passed passed, $failed failed"
if [ "$failed" -gt 0 ]; then
    echo "Failed cases:$failed_cases"
fi
echo "========================================"

if [ "$failed" -eq 0 ]; then
    exit 0
fi
exit 1

.github/scripts/setup-build-cache.sh

Lines changed: 64 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/bin/bash
22
# Sets up a persistent build cache for self-hosted CI runners.
3-
# Creates a symlink: ./build -> /storage/coda1/d-coc/0/sbryngelson3/.mfc-ci-cache/<key>/build
3+
# Creates a symlink: ./build -> <cache_root>/<key>/build
44
#
55
# Each runner gets its own cache keyed by (cluster, device, interface, runner).
66
# This avoids cross-runner path issues entirely — CMake's absolute paths are
@@ -13,8 +13,58 @@ _cache_device="${2:?}"
1313
_cache_interface="${3:-none}"
1414
_cache_runner="${RUNNER_NAME:?RUNNER_NAME not set}"
1515

16+
# Select cache root based on cluster (each HPC system has its own persistent storage).
17+
case "$_cache_cluster" in
18+
phoenix)
19+
_cache_root="/storage/coda1/d-coc/0/sbryngelson3/.mfc-ci-cache" ;;
20+
frontier|frontier_amd)
21+
_cache_root="/lustre/orion/cfd154/scratch/sbryngelson/.mfc-ci-cache" ;;
22+
*)
23+
echo "=== Build Cache Setup ==="
24+
echo " No cache root configured for cluster '$_cache_cluster' — skipping."
25+
echo "========================="
26+
return 0 2>/dev/null || exit 0 ;;
27+
esac
28+
1629
_cache_key="${_cache_cluster}-${_cache_device}-${_cache_interface}-${_cache_runner}"
17-
_cache_base="/storage/coda1/d-coc/0/sbryngelson3/.mfc-ci-cache/${_cache_key}/build"
30+
_cache_base="${_cache_root}/${_cache_key}/build"
31+
32+
# Check if the cache directory is healthy (readable, writable, no stale handles).
_cache_healthy() {
    local target="$1"
    # A failed `ls` is the classic symptom of a stale NFS directory handle.
    if ! ls "$target" > /dev/null 2>&1; then
        echo " Health check FAILED: cannot list $target"
        return 1
    fi
    # lock.yaml can exist per readdir yet be unstat-able when its handle is stale.
    if [ -e "$target/lock.yaml" ] && ! stat "$target/lock.yaml" > /dev/null 2>&1; then
        echo " Health check FAILED: cannot stat $target/lock.yaml"
        return 1
    fi
    # Probe write+delete to confirm the directory is actually writable.
    local probe_file="$target/.nfs_probe.$$"
    if ! touch "$probe_file" 2>/dev/null || ! rm -f "$probe_file" 2>/dev/null; then
        echo " Health check FAILED: cannot write/remove probe in $target"
        rm -f "$probe_file" 2>/dev/null
        return 1
    fi
    return 0
}
51+
52+
# Nuclear recovery: rename stale cache out of the way and create a fresh one.
53+
# Uses mv (operates on parent directory entry) which works even when children
54+
# have stale file handles that prevent rm -rf from succeeding.
55+
_cache_nuke() {
56+
local base="${1:-$_cache_base}"
57+
local stale_name="${base}.stale.$(date +%s)"
58+
echo " NFS cache nuke: parking stale dir -> $stale_name"
59+
if mv "$base" "$stale_name" 2>/dev/null; then
60+
echo " NFS cache nuke: renamed successfully"
61+
else
62+
echo " NFS cache nuke: mv failed, trying rm -rf as fallback"
63+
rm -rf "$base" 2>/dev/null || true
64+
fi
65+
mkdir -p "$base"
66+
echo " NFS cache nuke: fresh cache created at $base"
67+
}
1868

1969
mkdir -p "$_cache_base"
2070
_cache_dir="$(cd "$_cache_base" && pwd -P)"
@@ -23,6 +73,13 @@ echo "=== Build Cache Setup ==="
2373
echo " Cache key: $_cache_key"
2474
echo " Cache dir: $_cache_dir"
2575

76+
# Pre-flight: detect stale NFS handles before wasting a build attempt.
77+
if ! _cache_healthy "$_cache_dir"; then
78+
echo " Stale NFS cache detected — nuking and recreating."
79+
_cache_nuke "$_cache_base"
80+
_cache_dir="$(cd "$_cache_base" && pwd -P)"
81+
fi
82+
2683
# Replace any existing build/ (real dir or stale symlink) with a symlink
2784
# to our runner-specific cache directory.
2885
# Use unlink for symlinks to avoid rm -rf following the link and deleting
@@ -36,4 +93,9 @@ fi
3693
ln -s "$_cache_dir" "build"
3794

3895
echo " Symlink: build -> $_cache_dir"
96+
97+
# Garbage-collect stale cache dirs parked by _cache_nuke more than 7 days ago.
98+
_cache_parent="$(dirname "$_cache_base")"
99+
find "$_cache_parent" -maxdepth 1 -name "*.stale.*" -mtime +7 -exec rm -rf {} + 2>/dev/null || true
100+
39101
echo "========================="

0 commit comments

Comments
 (0)