Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/file-filter.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ yml: &yml
- '.github/workflows/phoenix/**'
- '.github/workflows/frontier/**'
- '.github/workflows/frontier_amd/**'
- '.github/scripts/**'
- '.github/workflows/bench.yml'
- '.github/workflows/test.yml'
- '.github/workflows/formatting.yml'
Expand Down
15 changes: 15 additions & 0 deletions .github/scripts/bench-preamble.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/bin/bash
# Shared preamble for benchmark scripts: detects GPUs, sets build/device opts.
# Sets: $gpu_opts, $build_opts, $device_opts, $n_ranks, $ngpus, $gpu_ids
# Usage: source .github/scripts/bench-preamble.sh

source .github/scripts/detect-gpus.sh
source .github/scripts/gpu-opts.sh

# CPU default: 12 ranks; overridden below when running on GPUs.
n_ranks=12
build_opts="$gpu_opts"
device_opts=""
if [ "$job_device" = "gpu" ]; then
    # Guard against failed GPU detection so we never request 0 ranks
    # (consistent with run_case_optimization.sh, which applies the same default).
    if [ "${ngpus:-0}" -eq 0 ]; then
        ngpus=1
    fi
    n_ranks=$ngpus
    device_opts="$gpu_opts -g $gpu_ids"
fi
42 changes: 42 additions & 0 deletions .github/scripts/check_case_optimization_output.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/usr/bin/env python3

"""Validate case-optimization output: check D/*.dat for NaN/Inf via the packer."""

import math
import os
import sys

if len(sys.argv) != 2:
    print(f"Usage: {sys.argv[0]} <case_directory>", file=sys.stderr)
    sys.exit(1)

# Allow importing from the repo root
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..'))

from toolchain.mfc.packer.pack import compile as pack_compile

# Accept either the case directory itself or any file inside it.
case_dir = sys.argv[1]
if os.path.isfile(case_dir):
    case_dir = os.path.dirname(case_dir)

pack, err = pack_compile(case_dir)
if err is not None:
    print(f"ERROR: {err}")
    sys.exit(1)

if not pack.entries:
    print(f"ERROR: No data found in {case_dir}/D/")
    sys.exit(1)

if pack.has_bad_values():
    # Report only the first offending value per file, then fail.
    print("ERROR: NaN or Inf detected in output:")
    for name, entry in pack.entries.items():
        first_bad = next(
            (
                (i, val)
                for i, val in enumerate(entry.doubles)
                if math.isnan(val) or math.isinf(val)
            ),
            None,
        )
        if first_bad is not None:
            i, val = first_bad
            label = 'NaN' if math.isnan(val) else 'Inf'
            print(f" {label} at index {i} in {name}")
    sys.exit(1)

total = sum(len(e.doubles) for e in pack.entries.values())
print(f"OK: {len(pack.entries)} files, {total} values — no NaN/Inf found")
13 changes: 13 additions & 0 deletions .github/scripts/detect-gpus.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash
# Detects GPUs (NVIDIA or AMD), sets $ngpus and $gpu_ids.
# Usage: source .github/scripts/detect-gpus.sh

ngpus=0
gpu_ids=""
if command -v nvidia-smi &>/dev/null; then
    # Count only real "GPU N: ..." lines. A bare `wc -l` would also count the
    # diagnostic text nvidia-smi prints when the driver is unavailable,
    # reporting phantom GPUs. `|| true` keeps grep's exit-1-on-zero-matches
    # from tripping `set -e` in scripts that source this file.
    ngpus=$(nvidia-smi -L 2>/dev/null | grep -c '^GPU' || true)
    # seq with start > end prints nothing, so ngpus=0 yields an empty list.
    gpu_ids=$(seq -s ' ' 0 $((ngpus - 1)))
elif command -v rocm-smi &>/dev/null; then
    gpu_ids=$(rocm-smi --showid 2>/dev/null | awk '{print $1}' | grep -Eo '[0-9]+' | uniq | tr '\n' ' ')
    ngpus=$(echo "$gpu_ids" | wc -w)
fi
13 changes: 13 additions & 0 deletions .github/scripts/gpu-opts.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash
# Sets $gpu_opts from $job_device and $job_interface.
# Usage: source .github/scripts/gpu-opts.sh

# Empty string means a CPU build: no GPU flags at all.
gpu_opts=""
if [ "$job_device" = "gpu" ]; then
    gpu_opts="--gpu"
    # Append the programming-model selector when one is requested.
    case "$job_interface" in
        omp) gpu_opts="$gpu_opts mp" ;;
        acc) gpu_opts="$gpu_opts acc" ;;
    esac
fi
30 changes: 30 additions & 0 deletions .github/scripts/prebuild-case-optimization.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/bin/bash

# Pre-builds all benchmark cases with --case-optimization.
# Can run in two modes:
#   1. Direct (Frontier login nodes): pass cluster/device/interface as args
#   2. Inside SLURM (Phoenix): uses $job_device/$job_interface from submit.sh
# Usage: bash prebuild-case-optimization.sh [<cluster> <device> <interface>]

set -e

# Support both positional args (direct invocation) and env vars (SLURM via submit.sh)
cluster="${1:-${job_cluster:-phoenix}}"
job_device="${2:-$job_device}"
job_interface="${3:-$job_interface}"

# Derive module flag from cluster name
case "$cluster" in
    phoenix)      flag="p" ;;
    frontier)     flag="f" ;;
    frontier_amd) flag="famd" ;;
    *) echo "ERROR: Unknown cluster '$cluster'"; exit 1 ;;
esac

. ./mfc.sh load -c "$flag" -m g
source .github/scripts/gpu-opts.sh   # sets $gpu_opts from $job_device/$job_interface

for case in benchmarks/*/case.py; do
    # If the glob matched nothing, bash leaves the literal pattern in $case;
    # skip it instead of handing a bogus path to mfc.sh.
    [ -e "$case" ] || continue
    echo "=== Pre-building: $case ==="
    # $gpu_opts is intentionally unquoted: it expands to zero or more flags.
    ./mfc.sh build -i "$case" --case-optimization $gpu_opts -j 8
done
60 changes: 60 additions & 0 deletions .github/scripts/retry-build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#!/bin/bash
# Provides retry_build(): 3-attempt loop with configurable cleanup.
# Set RETRY_CLEAN_CMD to override cleanup (default: rm -rf build/staging build/install build/lock.yaml).
# Set RETRY_VALIDATE_CMD to run a post-build validation; failure triggers a retry.
# Usage: source .github/scripts/retry-build.sh
# retry_build ./mfc.sh build -j 8 --gpu acc

# Try normal cleanup; if it fails, escalate to cache nuke.
_retry_clean() {
    local cmd="$1"
    # Happy path: the configured cleanup command succeeds silently.
    eval "$cmd" 2>/dev/null && return 0
    echo " Normal cleanup failed."
    # Without the NFS-aware helper, fall back to a best-effort removal.
    if ! type _cache_nuke > /dev/null 2>&1; then
        echo " _cache_nuke not available, best-effort rm."
        rm -rf build/staging build/install build/lock.yaml 2>/dev/null || true
        return 0
    fi
    echo " Escalating to NFS cache nuke..."
    _cache_nuke
}

# Run the given build command with up to three attempts. Between attempts the
# configured cleanup runs; an optional validation command gates success.
retry_build() {
    local clean_cmd="${RETRY_CLEAN_CMD:-rm -rf build/staging build/install build/lock.yaml}"
    local validate_cmd="${RETRY_VALIDATE_CMD:-}"
    local max_attempts=3
    local attempt
    for (( attempt = 1; attempt <= max_attempts; attempt++ )); do
        echo "Build attempt $attempt of $max_attempts..."
        # Guard clause: a failed build either retries (30s backoff) or gives up.
        if ! "$@"; then
            if [ "$attempt" -lt "$max_attempts" ]; then
                echo "Build failed on attempt $attempt. Retrying in 30s..."
                _retry_clean "$clean_cmd"
                sleep 30
                continue
            fi
            echo "Build failed after $max_attempts attempts."
            return 1
        fi
        # Build succeeded; run the optional post-build validation.
        if [ -n "$validate_cmd" ] && ! eval "$validate_cmd"; then
            echo "Post-build validation failed on attempt $attempt."
            if [ "$attempt" -lt "$max_attempts" ]; then
                echo "Cleaning and retrying in 5s..."
                _retry_clean "$clean_cmd"
                sleep 5
                continue
            fi
            echo "Validation still failing after $max_attempts attempts."
            return 1
        fi
        echo "Build succeeded on attempt $attempt."
        return 0
    done
}
34 changes: 34 additions & 0 deletions .github/scripts/run-tests-with-retry.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/bin/bash
# Runs ./mfc.sh test with all provided arguments, then retries a small number
# of sporadic failures (up to 5). Exits non-zero on real failures.
# Usage: bash .github/scripts/run-tests-with-retry.sh [mfc test args...]

# Collect flags that must carry over to the retry invocation (retries build
# their own argument list with --only, so passthrough flags are gathered here).
passthrough=""
for a in "$@"; do
    [ "$a" = "--test-all" ] && passthrough="$passthrough --test-all"
done

rm -f tests/failed_uuids.txt
test_exit=0
/bin/bash mfc.sh test "$@" || test_exit=$?

# Retry only if a small number of tests failed (sporadic failures)
if [ -s tests/failed_uuids.txt ]; then
    num_failed=$(wc -l < tests/failed_uuids.txt)
    if [ "$num_failed" -gt 5 ]; then
        echo "Too many failures ($num_failed) to retry — likely a real issue."
        exit 1
    fi
    failed=$(tr '\n' ' ' < tests/failed_uuids.txt)
    echo ""
    echo "=== Retrying $num_failed failed test(s): $failed ==="
    echo ""
    # $failed and $passthrough are intentionally unquoted: word lists.
    /bin/bash mfc.sh test -v --max-attempts 3 -j "$(nproc)" --only $failed $passthrough || exit $?
elif [ "$test_exit" -ne 0 ]; then
    exit $test_exit
fi
75 changes: 75 additions & 0 deletions .github/scripts/run_case_optimization.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#!/bin/bash

# Case-optimization CI test script.
# Runs inside a SLURM job — expects $job_device and $job_interface from submit.sh.
# Builds and runs a fixed list of benchmark cases with --case-optimization,
# validates their output for NaN/Inf, and exits 0 only if every case passes.

set -e

# detect-gpus.sh sets $ngpus/$gpu_ids; gpu-opts.sh sets $gpu_opts.
source .github/scripts/detect-gpus.sh
source .github/scripts/gpu-opts.sh

# Default to 1 GPU if detection found none but we're in GPU mode
# NOTE(review): in CPU mode $ngpus can remain 0, so `-n "$ngpus"` below would
# request zero ranks — confirm this script only ever runs with job_device=gpu.
if [ "$job_device" = "gpu" ] && [ "$ngpus" -eq 0 ]; then
ngpus=1
fi

# Verify the venv Python interpreter exists (created by ./mfc.sh build)
if [ ! -x build/venv/bin/python3 ]; then
echo "ERROR: build/venv/bin/python3 not found."
echo "The MFC build venv may not have been created. Was the pre-build step successful?"
exit 1
fi

# Benchmark cases exercised by this test (paths relative to the repo root).
benchmarks=(
benchmarks/5eq_rk3_weno3_hllc/case.py
benchmarks/viscous_weno5_sgb_acoustic/case.py
benchmarks/hypo_hll/case.py
benchmarks/ibm/case.py
benchmarks/igr/case.py
)

# Per-case tallies reported in the summary at the end.
passed=0
failed=0
failed_cases=""

for case in "${benchmarks[@]}"; do
case_dir="$(dirname "$case")"
case_name="$(basename "$case_dir")"
echo ""
echo "========================================"
echo "Case-optimization test: $case_name"
echo "========================================"

# Clean any previous output
rm -rf "$case_dir/D" "$case_dir/p_all" "$case_dir/restart_data"

# Build + run with --case-optimization, small grid, 10 timesteps.
# $gpu_opts is intentionally unquoted: it expands to zero or more flags.
# The `if` wrappers keep run/validation failures from aborting under set -e.
if ./mfc.sh run "$case" --case-optimization $gpu_opts -n "$ngpus" -j "$(nproc)" -- --gbpp 1 --steps 10; then
# Validate output (scans D/*.dat for NaN/Inf via the packer)
if build/venv/bin/python3 .github/scripts/check_case_optimization_output.py "$case_dir"; then
echo "PASS: $case_name"
passed=$((passed + 1))
else
echo "FAIL: $case_name (validation error)"
failed=$((failed + 1))
failed_cases="$failed_cases $case_name"
fi
else
echo "FAIL: $case_name (build or run error)"
failed=$((failed + 1))
failed_cases="$failed_cases $case_name"
fi

# Clean up output between cases
rm -rf "$case_dir/D" "$case_dir/p_all" "$case_dir/restart_data"
done

echo ""
echo "========================================"
echo "Case-optimization summary: $passed passed, $failed failed"
if [ $failed -gt 0 ]; then
echo "Failed cases:$failed_cases"
fi
echo "========================================"

# Final status: success only with zero failures (a && b || c is safe here
# because `exit` runs in both branches).
[ $failed -eq 0 ] && exit 0 || exit 1
66 changes: 64 additions & 2 deletions .github/scripts/setup-build-cache.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash
# Sets up a persistent build cache for self-hosted CI runners.
# Creates a symlink: ./build -> /storage/coda1/d-coc/0/sbryngelson3/.mfc-ci-cache/<key>/build
# Creates a symlink: ./build -> <cache_root>/<key>/build
#
# Each runner gets its own cache keyed by (cluster, device, interface, runner).
# This avoids cross-runner path issues entirely — CMake's absolute paths are
Expand All @@ -13,8 +13,58 @@ _cache_device="${2:?}"
_cache_interface="${3:-none}"
_cache_runner="${RUNNER_NAME:?RUNNER_NAME not set}"

# Select cache root based on cluster (each HPC system has its own persistent storage).
case "$_cache_cluster" in
phoenix)
_cache_root="/storage/coda1/d-coc/0/sbryngelson3/.mfc-ci-cache" ;;
frontier|frontier_amd)
_cache_root="/lustre/orion/cfd154/scratch/sbryngelson/.mfc-ci-cache" ;;
*)
echo "=== Build Cache Setup ==="
echo " No cache root configured for cluster '$_cache_cluster' — skipping."
echo "========================="
# `return` succeeds when this script is sourced; when executed directly,
# `return` fails (silenced) and the `exit 0` fallback runs instead.
return 0 2>/dev/null || exit 0 ;;
esac

_cache_key="${_cache_cluster}-${_cache_device}-${_cache_interface}-${_cache_runner}"
_cache_base="/storage/coda1/d-coc/0/sbryngelson3/.mfc-ci-cache/${_cache_key}/build"
_cache_base="${_cache_root}/${_cache_key}/build"

# Check if the cache directory is healthy (readable, writable, no stale handles).
_cache_healthy() {
    local dir="$1" probe
    probe="$dir/.nfs_probe.$$"
    # Listing can fail on stale NFS mounts even when the path still resolves.
    if ! ls "$dir" > /dev/null 2>&1; then
        echo " Health check FAILED: cannot list $dir"
        return 1
    fi
    # A lock file that exists but cannot be stat'ed indicates a stale handle.
    if [ -e "$dir/lock.yaml" ]; then
        if ! stat "$dir/lock.yaml" > /dev/null 2>&1; then
            echo " Health check FAILED: cannot stat $dir/lock.yaml"
            return 1
        fi
    fi
    # Round-trip a throwaway file to prove the directory is writable.
    if touch "$probe" 2>/dev/null && rm -f "$probe" 2>/dev/null; then
        return 0
    fi
    echo " Health check FAILED: cannot write/remove probe in $dir"
    rm -f "$probe" 2>/dev/null
    return 1
}

# Nuclear recovery: rename stale cache out of the way and create a fresh one.
# Uses mv (operates on parent directory entry) which works even when children
# have stale file handles that prevent rm -rf from succeeding.
_cache_nuke() {
    local base="${1:-$_cache_base}"
    local stale_name
    stale_name="${base}.stale.$(date +%s)"
    echo " NFS cache nuke: parking stale dir -> $stale_name"
    if ! mv "$base" "$stale_name" 2>/dev/null; then
        echo " NFS cache nuke: mv failed, trying rm -rf as fallback"
        rm -rf "$base" 2>/dev/null || true
    else
        echo " NFS cache nuke: renamed successfully"
    fi
    mkdir -p "$base"
    echo " NFS cache nuke: fresh cache created at $base"
}

mkdir -p "$_cache_base"
_cache_dir="$(cd "$_cache_base" && pwd -P)"
Expand All @@ -23,6 +73,13 @@ echo "=== Build Cache Setup ==="
echo " Cache key: $_cache_key"
echo " Cache dir: $_cache_dir"

# Pre-flight: detect stale NFS handles before wasting a build attempt.
if ! _cache_healthy "$_cache_dir"; then
echo " Stale NFS cache detected — nuking and recreating."
_cache_nuke "$_cache_base"
_cache_dir="$(cd "$_cache_base" && pwd -P)"
fi

# Replace any existing build/ (real dir or stale symlink) with a symlink
# to our runner-specific cache directory.
# Use unlink for symlinks to avoid rm -rf following the link and deleting
Expand All @@ -36,4 +93,9 @@ fi
ln -s "$_cache_dir" "build"

echo " Symlink: build -> $_cache_dir"

# Garbage-collect stale cache dirs parked by _cache_nuke more than 7 days ago.
_cache_parent="$(dirname "$_cache_base")"
find "$_cache_parent" -maxdepth 1 -name "*.stale.*" -mtime +7 -exec rm -rf {} + 2>/dev/null || true

echo "========================="
Loading
Loading