Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/file-filter.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ yml: &yml
- '.github/workflows/phoenix/**'
- '.github/workflows/frontier/**'
- '.github/workflows/frontier_amd/**'
- '.github/scripts/**'
- '.github/workflows/bench.yml'
- '.github/workflows/test.yml'
- '.github/workflows/formatting.yml'
Expand Down
15 changes: 15 additions & 0 deletions .github/scripts/bench-preamble.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/bin/bash
# Shared preamble for benchmark scripts: detects GPUs, sets build/device opts.
# Sets: $gpu_opts, $build_opts, $device_opts, $n_ranks, $ngpus, $gpu_ids
# Usage: source .github/scripts/bench-preamble.sh

source .github/scripts/detect-gpus.sh
source .github/scripts/gpu-opts.sh

# CPU default: 12 ranks; overridden below when running on GPUs.
n_ranks=12
build_opts="$gpu_opts"
device_opts=""
if [ "$job_device" = "gpu" ]; then
    # Guard against failed GPU detection so we never request 0 ranks
    # (consistent with run_case_optimization.sh, which applies the same default).
    if [ "${ngpus:-0}" -eq 0 ]; then
        ngpus=1
    fi
    n_ranks=$ngpus
    device_opts="$gpu_opts -g $gpu_ids"
fi
42 changes: 42 additions & 0 deletions .github/scripts/check_case_optimization_output.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/usr/bin/env python3

"""Validate case-optimization output: check D/*.dat for NaN/Inf via the packer."""

import math
import os
import sys

if len(sys.argv) != 2:
    print(f"Usage: {sys.argv[0]} <case_directory>", file=sys.stderr)
    sys.exit(1)

# Allow importing from the repo root
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..'))

from toolchain.mfc.packer.pack import compile as pack_compile

# Accept either the case directory itself or any file inside it.
case_dir = sys.argv[1]
if os.path.isfile(case_dir):
    case_dir = os.path.dirname(case_dir)

pack, err = pack_compile(case_dir)
if err is not None:
    print(f"ERROR: {err}")
    sys.exit(1)

if not pack.entries:
    print(f"ERROR: No data found in {case_dir}/D/")
    sys.exit(1)

if pack.has_bad_values():
    # Report only the first offending value per file, then fail.
    print("ERROR: NaN or Inf detected in output:")
    for name, entry in pack.entries.items():
        first_bad = next(
            (
                (i, val)
                for i, val in enumerate(entry.doubles)
                if math.isnan(val) or math.isinf(val)
            ),
            None,
        )
        if first_bad is not None:
            i, val = first_bad
            label = 'NaN' if math.isnan(val) else 'Inf'
            print(f" {label} at index {i} in {name}")
    sys.exit(1)

total = sum(len(e.doubles) for e in pack.entries.values())
print(f"OK: {len(pack.entries)} files, {total} values — no NaN/Inf found")
13 changes: 13 additions & 0 deletions .github/scripts/detect-gpus.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash
# Detects GPUs (NVIDIA or AMD), sets $ngpus and $gpu_ids.
# Usage: source .github/scripts/detect-gpus.sh

ngpus=0
gpu_ids=""
if command -v nvidia-smi &>/dev/null; then
    # Count only real "GPU N: ..." lines. A bare `wc -l` would also count the
    # diagnostic text nvidia-smi prints when the driver is unavailable,
    # reporting phantom GPUs. `|| true` keeps grep's exit-1-on-zero-matches
    # from tripping `set -e` in scripts that source this file.
    ngpus=$(nvidia-smi -L 2>/dev/null | grep -c '^GPU' || true)
    # seq with start > end prints nothing, so ngpus=0 yields an empty list.
    gpu_ids=$(seq -s ' ' 0 $((ngpus - 1)))
elif command -v rocm-smi &>/dev/null; then
    gpu_ids=$(rocm-smi --showid 2>/dev/null | awk '{print $1}' | grep -Eo '[0-9]+' | uniq | tr '\n' ' ')
    ngpus=$(echo "$gpu_ids" | wc -w)
fi
13 changes: 13 additions & 0 deletions .github/scripts/gpu-opts.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash
# Sets $gpu_opts from $job_device and $job_interface.
# Usage: source .github/scripts/gpu-opts.sh

# Empty string means a CPU build: no GPU flags at all.
gpu_opts=""
if [ "$job_device" = "gpu" ]; then
    gpu_opts="--gpu"
    # Append the programming-model selector when one is requested.
    case "$job_interface" in
        omp) gpu_opts="$gpu_opts mp" ;;
        acc) gpu_opts="$gpu_opts acc" ;;
    esac
fi
30 changes: 30 additions & 0 deletions .github/scripts/prebuild-case-optimization.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/bin/bash

# Pre-builds all benchmark cases with --case-optimization.
# Can run in two modes:
#   1. Direct (Frontier login nodes): pass cluster/device/interface as args
#   2. Inside SLURM (Phoenix): uses $job_device/$job_interface from submit.sh
# Usage: bash prebuild-case-optimization.sh [<cluster> <device> <interface>]

set -e

# Support both positional args (direct invocation) and env vars (SLURM via submit.sh)
cluster="${1:-${job_cluster:-phoenix}}"
job_device="${2:-$job_device}"
job_interface="${3:-$job_interface}"

# Derive module flag from cluster name
case "$cluster" in
    phoenix)      flag="p" ;;
    frontier)     flag="f" ;;
    frontier_amd) flag="famd" ;;
    *) echo "ERROR: Unknown cluster '$cluster'"; exit 1 ;;
esac

. ./mfc.sh load -c "$flag" -m g
source .github/scripts/gpu-opts.sh   # sets $gpu_opts from $job_device/$job_interface

for case in benchmarks/*/case.py; do
    # If the glob matched nothing, bash leaves the literal pattern in $case;
    # skip it instead of handing a bogus path to mfc.sh.
    [ -e "$case" ] || continue
    echo "=== Pre-building: $case ==="
    # $gpu_opts is intentionally unquoted: it expands to zero or more flags.
    ./mfc.sh build -i "$case" --case-optimization $gpu_opts -j 8
done
60 changes: 60 additions & 0 deletions .github/scripts/retry-build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#!/bin/bash
# Provides retry_build(): 3-attempt loop with configurable cleanup.
# Set RETRY_CLEAN_CMD to override cleanup (default: rm -rf build/staging build/install build/lock.yaml).
# Set RETRY_VALIDATE_CMD to run a post-build validation; failure triggers a retry.
# Usage: source .github/scripts/retry-build.sh
# retry_build ./mfc.sh build -j 8 --gpu acc

# Try normal cleanup; if it fails, escalate to cache nuke.
_retry_clean() {
    local cmd="$1"
    # Happy path: the configured cleanup command succeeds silently.
    eval "$cmd" 2>/dev/null && return 0
    echo " Normal cleanup failed."
    # Without the NFS-aware helper, fall back to a best-effort removal.
    if ! type _cache_nuke > /dev/null 2>&1; then
        echo " _cache_nuke not available, best-effort rm."
        rm -rf build/staging build/install build/lock.yaml 2>/dev/null || true
        return 0
    fi
    echo " Escalating to NFS cache nuke..."
    _cache_nuke
}

# Run the given build command with up to three attempts. Between attempts the
# configured cleanup runs; an optional validation command gates success.
retry_build() {
    local clean_cmd="${RETRY_CLEAN_CMD:-rm -rf build/staging build/install build/lock.yaml}"
    local validate_cmd="${RETRY_VALIDATE_CMD:-}"
    local max_attempts=3
    local attempt
    for (( attempt = 1; attempt <= max_attempts; attempt++ )); do
        echo "Build attempt $attempt of $max_attempts..."
        # Guard clause: a failed build either retries (30s backoff) or gives up.
        if ! "$@"; then
            if [ "$attempt" -lt "$max_attempts" ]; then
                echo "Build failed on attempt $attempt. Retrying in 30s..."
                _retry_clean "$clean_cmd"
                sleep 30
                continue
            fi
            echo "Build failed after $max_attempts attempts."
            return 1
        fi
        # Build succeeded; run the optional post-build validation.
        if [ -n "$validate_cmd" ] && ! eval "$validate_cmd"; then
            echo "Post-build validation failed on attempt $attempt."
            if [ "$attempt" -lt "$max_attempts" ]; then
                echo "Cleaning and retrying in 5s..."
                _retry_clean "$clean_cmd"
                sleep 5
                continue
            fi
            echo "Validation still failing after $max_attempts attempts."
            return 1
        fi
        echo "Build succeeded on attempt $attempt."
        return 0
    done
}
34 changes: 34 additions & 0 deletions .github/scripts/run-tests-with-retry.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/bin/bash
# Runs ./mfc.sh test with all provided arguments, then retries a small number
# of sporadic failures (up to 5). Exits non-zero on real failures.
# Usage: bash .github/scripts/run-tests-with-retry.sh [mfc test args...]

# Collect flags that must carry over to the retry invocation (retries build
# their own argument list with --only, so passthrough flags are gathered here).
passthrough=""
for a in "$@"; do
    [ "$a" = "--test-all" ] && passthrough="$passthrough --test-all"
done

rm -f tests/failed_uuids.txt
test_exit=0
/bin/bash mfc.sh test "$@" || test_exit=$?

# Retry only if a small number of tests failed (sporadic failures)
if [ -s tests/failed_uuids.txt ]; then
    num_failed=$(wc -l < tests/failed_uuids.txt)
    if [ "$num_failed" -gt 5 ]; then
        echo "Too many failures ($num_failed) to retry — likely a real issue."
        exit 1
    fi
    failed=$(tr '\n' ' ' < tests/failed_uuids.txt)
    echo ""
    echo "=== Retrying $num_failed failed test(s): $failed ==="
    echo ""
    # $failed and $passthrough are intentionally unquoted: word lists.
    /bin/bash mfc.sh test -v --max-attempts 3 -j "$(nproc)" --only $failed $passthrough || exit $?
elif [ "$test_exit" -ne 0 ]; then
    exit $test_exit
fi
75 changes: 75 additions & 0 deletions .github/scripts/run_case_optimization.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#!/bin/bash

# Case-optimization CI test script.
# Runs inside a SLURM job — expects $job_device and $job_interface from submit.sh.
# Builds and runs a fixed list of benchmark cases with --case-optimization,
# validates their output for NaN/Inf, and exits 0 only if every case passes.

set -e

# detect-gpus.sh sets $ngpus/$gpu_ids; gpu-opts.sh sets $gpu_opts.
source .github/scripts/detect-gpus.sh
source .github/scripts/gpu-opts.sh

# Default to 1 GPU if detection found none but we're in GPU mode
# NOTE(review): in CPU mode $ngpus can remain 0, so `-n "$ngpus"` below would
# request zero ranks — confirm this script only ever runs with job_device=gpu.
if [ "$job_device" = "gpu" ] && [ "$ngpus" -eq 0 ]; then
ngpus=1
fi

# Verify the venv Python interpreter exists (created by ./mfc.sh build)
if [ ! -x build/venv/bin/python3 ]; then
echo "ERROR: build/venv/bin/python3 not found."
echo "The MFC build venv may not have been created. Was the pre-build step successful?"
exit 1
fi

# Benchmark cases exercised by this test (paths relative to the repo root).
benchmarks=(
benchmarks/5eq_rk3_weno3_hllc/case.py
benchmarks/viscous_weno5_sgb_acoustic/case.py
benchmarks/hypo_hll/case.py
benchmarks/ibm/case.py
benchmarks/igr/case.py
)

# Per-case tallies reported in the summary at the end.
passed=0
failed=0
failed_cases=""

for case in "${benchmarks[@]}"; do
case_dir="$(dirname "$case")"
case_name="$(basename "$case_dir")"
echo ""
echo "========================================"
echo "Case-optimization test: $case_name"
echo "========================================"

# Clean any previous output
rm -rf "$case_dir/D" "$case_dir/p_all" "$case_dir/restart_data"

# Build + run with --case-optimization, small grid, 10 timesteps.
# $gpu_opts is intentionally unquoted: it expands to zero or more flags.
# The `if` wrappers keep run/validation failures from aborting under set -e.
if ./mfc.sh run "$case" --case-optimization $gpu_opts -n "$ngpus" -j "$(nproc)" -- --gbpp 1 --steps 10; then
# Validate output (scans D/*.dat for NaN/Inf via the packer)
if build/venv/bin/python3 .github/scripts/check_case_optimization_output.py "$case_dir"; then
echo "PASS: $case_name"
passed=$((passed + 1))
else
echo "FAIL: $case_name (validation error)"
failed=$((failed + 1))
failed_cases="$failed_cases $case_name"
fi
else
echo "FAIL: $case_name (build or run error)"
failed=$((failed + 1))
failed_cases="$failed_cases $case_name"
fi

# Clean up output between cases
rm -rf "$case_dir/D" "$case_dir/p_all" "$case_dir/restart_data"
done

echo ""
echo "========================================"
echo "Case-optimization summary: $passed passed, $failed failed"
if [ $failed -gt 0 ]; then
echo "Failed cases:$failed_cases"
fi
echo "========================================"

# Final status: success only with zero failures (a && b || c is safe here
# because `exit` runs in both branches).
[ $failed -eq 0 ] && exit 0 || exit 1
66 changes: 64 additions & 2 deletions .github/scripts/setup-build-cache.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash
# Sets up a persistent build cache for self-hosted CI runners.
# Creates a symlink: ./build -> /storage/coda1/d-coc/0/sbryngelson3/.mfc-ci-cache/<key>/build
# Creates a symlink: ./build -> <cache_root>/<key>/build
#
# Each runner gets its own cache keyed by (cluster, device, interface, runner).
# This avoids cross-runner path issues entirely — CMake's absolute paths are
Expand All @@ -13,8 +13,58 @@ _cache_device="${2:?}"
_cache_interface="${3:-none}"
_cache_runner="${RUNNER_NAME:?RUNNER_NAME not set}"

# Select cache root based on cluster (each HPC system has its own persistent storage).
case "$_cache_cluster" in
phoenix)
_cache_root="/storage/coda1/d-coc/0/sbryngelson3/.mfc-ci-cache" ;;
frontier|frontier_amd)
_cache_root="/lustre/orion/cfd154/scratch/sbryngelson/.mfc-ci-cache" ;;
*)
echo "=== Build Cache Setup ==="
echo " No cache root configured for cluster '$_cache_cluster' — skipping."
echo "========================="
# `return` succeeds when this script is sourced; when executed directly,
# `return` fails (silenced) and the `exit 0` fallback runs instead.
return 0 2>/dev/null || exit 0 ;;
esac

_cache_key="${_cache_cluster}-${_cache_device}-${_cache_interface}-${_cache_runner}"
_cache_base="/storage/coda1/d-coc/0/sbryngelson3/.mfc-ci-cache/${_cache_key}/build"
_cache_base="${_cache_root}/${_cache_key}/build"

# Check if the cache directory is healthy (readable, writable, no stale handles).
_cache_healthy() {
    local dir="$1" probe
    probe="$dir/.nfs_probe.$$"
    # Listing can fail on stale NFS mounts even when the path still resolves.
    if ! ls "$dir" > /dev/null 2>&1; then
        echo " Health check FAILED: cannot list $dir"
        return 1
    fi
    # A lock file that exists but cannot be stat'ed indicates a stale handle.
    if [ -e "$dir/lock.yaml" ]; then
        if ! stat "$dir/lock.yaml" > /dev/null 2>&1; then
            echo " Health check FAILED: cannot stat $dir/lock.yaml"
            return 1
        fi
    fi
    # Round-trip a throwaway file to prove the directory is writable.
    if touch "$probe" 2>/dev/null && rm -f "$probe" 2>/dev/null; then
        return 0
    fi
    echo " Health check FAILED: cannot write/remove probe in $dir"
    rm -f "$probe" 2>/dev/null
    return 1
}

# Nuclear recovery: rename stale cache out of the way and create a fresh one.
# Uses mv (operates on parent directory entry) which works even when children
# have stale file handles that prevent rm -rf from succeeding.
_cache_nuke() {
    local base="${1:-$_cache_base}"
    local stale_name
    stale_name="${base}.stale.$(date +%s)"
    echo " NFS cache nuke: parking stale dir -> $stale_name"
    if ! mv "$base" "$stale_name" 2>/dev/null; then
        echo " NFS cache nuke: mv failed, trying rm -rf as fallback"
        rm -rf "$base" 2>/dev/null || true
    else
        echo " NFS cache nuke: renamed successfully"
    fi
    mkdir -p "$base"
    echo " NFS cache nuke: fresh cache created at $base"
}

mkdir -p "$_cache_base"
_cache_dir="$(cd "$_cache_base" && pwd -P)"
Expand All @@ -23,6 +73,13 @@ echo "=== Build Cache Setup ==="
echo " Cache key: $_cache_key"
echo " Cache dir: $_cache_dir"

# Pre-flight: detect stale NFS handles before wasting a build attempt.
if ! _cache_healthy "$_cache_dir"; then
echo " Stale NFS cache detected — nuking and recreating."
_cache_nuke "$_cache_base"
_cache_dir="$(cd "$_cache_base" && pwd -P)"
fi

# Replace any existing build/ (real dir or stale symlink) with a symlink
# to our runner-specific cache directory.
# Use unlink for symlinks to avoid rm -rf following the link and deleting
Expand All @@ -36,4 +93,9 @@ fi
ln -s "$_cache_dir" "build"

echo " Symlink: build -> $_cache_dir"

# Garbage-collect stale cache dirs parked by _cache_nuke more than 7 days ago.
_cache_parent="$(dirname "$_cache_base")"
find "$_cache_parent" -maxdepth 1 -name "*.stale.*" -mtime +7 -exec rm -rf {} + 2>/dev/null || true

echo "========================="
Loading
Loading