11#! /bin/bash
22# Sets up a persistent build cache for self-hosted CI runners.
# Creates a symlink: ./build -> <cache_root>/<key>/build
44#
55# Each runner gets its own cache keyed by (cluster, device, interface, runner).
66# This avoids cross-runner path issues entirely — CMake's absolute paths are
@@ -13,8 +13,58 @@ _cache_device="${2:?}"
# Third positional argument: interface name; "none" when omitted or empty.
_cache_interface="${3:-none}"
# The runner name is part of the cache key; abort if the CI did not export it.
_cache_runner="${RUNNER_NAME:?RUNNER_NAME not set}"

# Select cache root based on cluster (each HPC system has its own persistent storage).
case "$_cache_cluster" in
    phoenix)
        _cache_root="/storage/coda1/d-coc/0/sbryngelson3/.mfc-ci-cache" ;;
    frontier | frontier_amd)
        _cache_root="/lustre/orion/cfd154/scratch/sbryngelson/.mfc-ci-cache" ;;
    *)
        echo "=== Build Cache Setup ==="
        echo "  No cache root configured for cluster '$_cache_cluster' — skipping."
        echo "========================="
        # `return` succeeds when this file is sourced; otherwise fall back to `exit`.
        return 0 2>/dev/null || exit 0 ;;
esac

# One cache per (cluster, device, interface, runner) combination.
_cache_key="${_cache_cluster}-${_cache_device}-${_cache_interface}-${_cache_runner}"
_cache_base="${_cache_root}/${_cache_key}/build"
31+
# Check if the cache directory is healthy (readable, writable, no stale handles).
# Arguments: $1 - directory to probe
# Outputs:   on failure, one "Health check FAILED: ..." line on stdout naming
#            the first check that failed
# Returns:   0 if every check passed, 1 otherwise
_cache_healthy() {
    local dir="$1"
    local listing

    # 1. The directory must be listable.
    if ! listing="$(ls -A -- "$dir" 2>/dev/null)"; then
        echo "  Health check FAILED: cannot list $dir"
        return 1
    fi

    # 2. If the listing contains lock.yaml, it must be stat-able.
    # Presence is taken from the directory listing instead of `[ -e ]`:
    # `-e` itself stats the file, so on a stale handle it would report
    # "missing" and this check would be silently skipped.
    if grep -qxF -- 'lock.yaml' <<<"$listing" \
        && ! stat -- "$dir/lock.yaml" >/dev/null 2>&1; then
        echo "  Health check FAILED: cannot stat $dir/lock.yaml"
        return 1
    fi

    # 3. The directory must accept creating and removing a file.
    # The probe name embeds the shell PID ($$).
    local probe="$dir/.nfs_probe.$$"
    if ! touch -- "$probe" 2>/dev/null || ! rm -f -- "$probe" 2>/dev/null; then
        echo "  Health check FAILED: cannot write/remove probe in $dir"
        rm -f -- "$probe" 2>/dev/null   # best-effort cleanup of a leftover probe
        return 1
    fi
    return 0
}
51+
# Nuclear recovery: rename stale cache out of the way and create a fresh one.
# Uses mv (operates on parent directory entry) which works even when children
# have stale file handles that prevent rm -rf from succeeding.
# Globals:   _cache_base (read) - used when no argument is given
# Arguments: $1 - cache directory to replace (optional)
# Outputs:   progress lines on stdout
# Returns:   0 once a directory exists at the cache path, 1 if no path was
#            given or the directory could not be created
_cache_nuke() {
    local base="${1:-$_cache_base}"
    if [ -z "$base" ]; then
        echo "  NFS cache nuke: no cache directory given"
        return 1
    fi

    local stamp
    stamp="$(date +%s)"
    local stale_name="${base}.stale.${stamp}"
    # If the parking name already exists (two nukes within the same second),
    # mv would move the cache *inside* that directory instead of renaming it.
    # Pick an unused name; it still matches the "*.stale.*" pattern.
    local n=0
    while [ -e "$stale_name" ] || [ -L "$stale_name" ]; do
        n=$((n + 1))
        stale_name="${base}.stale.${stamp}.${n}"
    done

    echo "  NFS cache nuke: parking stale dir -> $stale_name"
    if mv -- "$base" "$stale_name" 2>/dev/null; then
        echo "  NFS cache nuke: renamed successfully"
    else
        echo "  NFS cache nuke: mv failed, trying rm -rf as fallback"
        # Best effort: failures are tolerated here and surfaced by mkdir below.
        rm -rf -- "${base:?}" 2>/dev/null || true
    fi

    if ! mkdir -p -- "$base"; then
        echo "  NFS cache nuke: could not create $base"
        return 1
    fi
    echo "  NFS cache nuke: fresh cache created at $base"
}
1868
# Make sure the cache directory exists, then record its physical path
# (pwd -P resolves symlinks). _cache_dir ends up empty if the cd fails.
mkdir -p "$_cache_base"
_cache_dir="$(cd "$_cache_base" && pwd -P)"
@@ -23,6 +73,13 @@ echo "=== Build Cache Setup ==="
echo "  Cache key: $_cache_key"
echo "  Cache dir: $_cache_dir"

# Pre-flight: detect stale NFS handles before wasting a build attempt.
if ! _cache_healthy "$_cache_dir"; then
    echo "  Stale NFS cache detected — nuking and recreating."
    _cache_nuke "$_cache_base"
    # The cache was recreated, so resolve its physical path again.
    _cache_dir="$(cd "$_cache_base" && pwd -P)"
fi
82+
2683# Replace any existing build/ (real dir or stale symlink) with a symlink
2784# to our runner-specific cache directory.
2885# Use unlink for symlinks to avoid rm -rf following the link and deleting
# Point ./build at the runner-specific cache directory.
ln -s "$_cache_dir" "build"

echo "  Symlink: build -> $_cache_dir"

# Garbage-collect stale cache dirs parked by _cache_nuke more than 7 days ago.
# -mindepth 1 keeps the search root itself from ever being a deletion candidate.
# Errors are deliberately ignored: this cleanup is best effort.
_cache_parent="$(dirname "$_cache_base")"
find "$_cache_parent" -mindepth 1 -maxdepth 1 -name "*.stale.*" -mtime +7 -exec rm -rf {} + 2>/dev/null || true

echo "========================="
0 commit comments