diff --git a/.github/file-filter.yml b/.github/file-filter.yml index a2910c89af..c0e7477cf2 100644 --- a/.github/file-filter.yml +++ b/.github/file-filter.yml @@ -25,6 +25,7 @@ yml: &yml - '.github/workflows/phoenix/**' - '.github/workflows/frontier/**' - '.github/workflows/frontier_amd/**' + - '.github/scripts/**' - '.github/workflows/bench.yml' - '.github/workflows/test.yml' - '.github/workflows/formatting.yml' diff --git a/.github/scripts/bench-preamble.sh b/.github/scripts/bench-preamble.sh new file mode 100644 index 0000000000..da9fc1faa3 --- /dev/null +++ b/.github/scripts/bench-preamble.sh @@ -0,0 +1,15 @@ +#!/bin/bash +# Shared preamble for benchmark scripts: detects GPUs, sets build/device opts. +# Sets: $gpu_opts, $build_opts, $device_opts, $n_ranks, $ngpus, $gpu_ids +# Usage: source .github/scripts/bench-preamble.sh + +source .github/scripts/detect-gpus.sh +source .github/scripts/gpu-opts.sh + +n_ranks=12 +build_opts="$gpu_opts" +device_opts="" +if [ "$job_device" = "gpu" ]; then + n_ranks=$ngpus + device_opts="$gpu_opts -g $gpu_ids" +fi diff --git a/.github/scripts/check_case_optimization_output.py b/.github/scripts/check_case_optimization_output.py new file mode 100644 index 0000000000..e64d0e125d --- /dev/null +++ b/.github/scripts/check_case_optimization_output.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 + +"""Validate case-optimization output: check D/*.dat for NaN/Inf via the packer.""" + +import math +import sys +import os + +if len(sys.argv) != 2: + print(f"Usage: {sys.argv[0]} <case_dir>", file=sys.stderr) + sys.exit(1) + +# Allow importing from the repo root +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..')) + +from toolchain.mfc.packer.pack import compile as pack_compile + +case_dir = sys.argv[1] +if os.path.isfile(case_dir): + case_dir = os.path.dirname(case_dir) + +pack, err = pack_compile(case_dir) +if err is not None: + print(f"ERROR: {err}") + sys.exit(1) + +if not pack.entries: + print(f"ERROR: No data found in 
{case_dir}/D/") + sys.exit(1) + +if pack.has_bad_values(): + print("ERROR: NaN or Inf detected in output:") + for name, entry in pack.entries.items(): + for i, val in enumerate(entry.doubles): + if math.isnan(val) or math.isinf(val): + label = 'NaN' if math.isnan(val) else 'Inf' + print(f" {label} at index {i} in {name}") + break + sys.exit(1) + +total = sum(len(e.doubles) for e in pack.entries.values()) +print(f"OK: {len(pack.entries)} files, {total} values — no NaN/Inf found") diff --git a/.github/scripts/detect-gpus.sh b/.github/scripts/detect-gpus.sh new file mode 100755 index 0000000000..f3b0983f39 --- /dev/null +++ b/.github/scripts/detect-gpus.sh @@ -0,0 +1,13 @@ +#!/bin/bash +# Detects GPUs (NVIDIA or AMD), sets $ngpus and $gpu_ids. +# Usage: source .github/scripts/detect-gpus.sh + +ngpus=0 +gpu_ids="" +if command -v nvidia-smi &>/dev/null; then + ngpus=$(nvidia-smi -L | wc -l) + gpu_ids=$(seq -s ' ' 0 $((ngpus - 1))) +elif command -v rocm-smi &>/dev/null; then + gpu_ids=$(rocm-smi --showid | awk '{print $1}' | grep -Eo '[0-9]+' | uniq | tr '\n' ' ') + ngpus=$(echo "$gpu_ids" | wc -w) +fi diff --git a/.github/scripts/gpu-opts.sh b/.github/scripts/gpu-opts.sh new file mode 100755 index 0000000000..1a9dba80de --- /dev/null +++ b/.github/scripts/gpu-opts.sh @@ -0,0 +1,13 @@ +#!/bin/bash +# Sets $gpu_opts from $job_device and $job_interface. +# Usage: source .github/scripts/gpu-opts.sh + +gpu_opts="" +if [ "$job_device" = "gpu" ]; then + gpu_opts="--gpu" + if [ "$job_interface" = "omp" ]; then + gpu_opts+=" mp" + elif [ "$job_interface" = "acc" ]; then + gpu_opts+=" acc" + fi +fi diff --git a/.github/scripts/prebuild-case-optimization.sh b/.github/scripts/prebuild-case-optimization.sh new file mode 100755 index 0000000000..87f26fdb5f --- /dev/null +++ b/.github/scripts/prebuild-case-optimization.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +# Pre-builds all benchmark cases with --case-optimization. +# Can run in two modes: +# 1. 
Direct (Frontier login nodes): pass cluster/device/interface as args +# 2. Inside SLURM (Phoenix): uses $job_device/$job_interface from submit.sh +# Usage: bash prebuild-case-optimization.sh [<cluster> <device> <interface>] + +set -e + +# Support both positional args (direct invocation) and env vars (SLURM via submit.sh) +cluster="${1:-${job_cluster:-phoenix}}" +job_device="${2:-$job_device}" +job_interface="${3:-$job_interface}" + +# Derive module flag from cluster name +case "$cluster" in + phoenix) flag="p" ;; + frontier) flag="f" ;; + frontier_amd) flag="famd" ;; + *) echo "ERROR: Unknown cluster '$cluster'"; exit 1 ;; +esac + +. ./mfc.sh load -c "$flag" -m g +source .github/scripts/gpu-opts.sh + +for case in benchmarks/*/case.py; do + echo "=== Pre-building: $case ===" + ./mfc.sh build -i "$case" --case-optimization $gpu_opts -j 8 +done diff --git a/.github/scripts/retry-build.sh b/.github/scripts/retry-build.sh new file mode 100755 index 0000000000..b82a2e5d8d --- /dev/null +++ b/.github/scripts/retry-build.sh @@ -0,0 +1,60 @@ +#!/bin/bash +# Provides retry_build(): 3-attempt loop with configurable cleanup. +# Set RETRY_CLEAN_CMD to override cleanup (default: rm -rf build/staging build/install build/lock.yaml). +# Set RETRY_VALIDATE_CMD to run a post-build validation; failure triggers a retry. +# Usage: source .github/scripts/retry-build.sh +# retry_build ./mfc.sh build -j 8 --gpu acc + +# Try normal cleanup; if it fails, escalate to cache nuke. +_retry_clean() { + local clean_cmd="$1" + if eval "$clean_cmd" 2>/dev/null; then + return 0 + fi + echo " Normal cleanup failed." + if type _cache_nuke > /dev/null 2>&1; then + echo " Escalating to NFS cache nuke..." + _cache_nuke + else + echo " _cache_nuke not available, best-effort rm." 
+ rm -rf build/staging build/install build/lock.yaml 2>/dev/null || true + fi +} + +retry_build() { + local clean_cmd="${RETRY_CLEAN_CMD:-rm -rf build/staging build/install build/lock.yaml}" + local validate_cmd="${RETRY_VALIDATE_CMD:-}" + local max_attempts=3 + local attempt=1 + while [ $attempt -le $max_attempts ]; do + echo "Build attempt $attempt of $max_attempts..." + if "$@"; then + if [ -n "$validate_cmd" ]; then + if ! eval "$validate_cmd"; then + echo "Post-build validation failed on attempt $attempt." + if [ $attempt -lt $max_attempts ]; then + echo "Cleaning and retrying in 5s..." + _retry_clean "$clean_cmd" + sleep 5 + attempt=$((attempt + 1)) + continue + else + echo "Validation still failing after $max_attempts attempts." + return 1 + fi + fi + fi + echo "Build succeeded on attempt $attempt." + return 0 + fi + if [ $attempt -lt $max_attempts ]; then + echo "Build failed on attempt $attempt. Retrying in 30s..." + _retry_clean "$clean_cmd" + sleep 30 + else + echo "Build failed after $max_attempts attempts." + return 1 + fi + attempt=$((attempt + 1)) + done +} diff --git a/.github/scripts/run-tests-with-retry.sh b/.github/scripts/run-tests-with-retry.sh new file mode 100755 index 0000000000..18f1d05d0b --- /dev/null +++ b/.github/scripts/run-tests-with-retry.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# Runs ./mfc.sh test with all provided arguments, then retries a small number +# of sporadic failures (up to 5). Exits non-zero on real failures. +# Usage: bash .github/scripts/run-tests-with-retry.sh [mfc test args...] + +# Extract flags that should carry over to retries (retries build their own +# argument list with --only, so we capture passthrough flags here). +PASSTHROUGH="" +for arg in "$@"; do + case "$arg" in + --test-all) PASSTHROUGH="$PASSTHROUGH --test-all" ;; + esac +done + +rm -f tests/failed_uuids.txt +TEST_EXIT=0 +/bin/bash mfc.sh test "$@" || TEST_EXIT=$? 
+ +# Retry only if a small number of tests failed (sporadic failures) +if [ -s tests/failed_uuids.txt ]; then + NUM_FAILED=$(wc -l < tests/failed_uuids.txt) + if [ "$NUM_FAILED" -le 5 ]; then + FAILED=$(tr '\n' ' ' < tests/failed_uuids.txt) + echo "" + echo "=== Retrying $NUM_FAILED failed test(s): $FAILED ===" + echo "" + /bin/bash mfc.sh test -v --max-attempts 3 -j "$(nproc)" --only $FAILED $PASSTHROUGH || exit $? + else + echo "Too many failures ($NUM_FAILED) to retry — likely a real issue." + exit 1 + fi +elif [ "$TEST_EXIT" -ne 0 ]; then + exit $TEST_EXIT +fi diff --git a/.github/scripts/run_case_optimization.sh b/.github/scripts/run_case_optimization.sh new file mode 100755 index 0000000000..167505ece3 --- /dev/null +++ b/.github/scripts/run_case_optimization.sh @@ -0,0 +1,75 @@ +#!/bin/bash + +# Case-optimization CI test script. +# Runs inside a SLURM job — expects $job_device and $job_interface from submit.sh. + +set -e + +source .github/scripts/detect-gpus.sh +source .github/scripts/gpu-opts.sh + +# Default to 1 GPU if detection found none but we're in GPU mode +if [ "$job_device" = "gpu" ] && [ "$ngpus" -eq 0 ]; then + ngpus=1 +fi + +# Verify the venv Python interpreter exists (created by ./mfc.sh build) +if [ ! -x build/venv/bin/python3 ]; then + echo "ERROR: build/venv/bin/python3 not found." + echo "The MFC build venv may not have been created. Was the pre-build step successful?" 
+ exit 1 +fi + +benchmarks=( + benchmarks/5eq_rk3_weno3_hllc/case.py + benchmarks/viscous_weno5_sgb_acoustic/case.py + benchmarks/hypo_hll/case.py + benchmarks/ibm/case.py + benchmarks/igr/case.py +) + +passed=0 +failed=0 +failed_cases="" + +for case in "${benchmarks[@]}"; do + case_dir="$(dirname "$case")" + case_name="$(basename "$case_dir")" + echo "" + echo "========================================" + echo "Case-optimization test: $case_name" + echo "========================================" + + # Clean any previous output + rm -rf "$case_dir/D" "$case_dir/p_all" "$case_dir/restart_data" + + # Build + run with --case-optimization, small grid, 10 timesteps + if ./mfc.sh run "$case" --case-optimization $gpu_opts -n "$ngpus" -j "$(nproc)" -- --gbpp 1 --steps 10; then + # Validate output + if build/venv/bin/python3 .github/scripts/check_case_optimization_output.py "$case_dir"; then + echo "PASS: $case_name" + passed=$((passed + 1)) + else + echo "FAIL: $case_name (validation error)" + failed=$((failed + 1)) + failed_cases="$failed_cases $case_name" + fi + else + echo "FAIL: $case_name (build or run error)" + failed=$((failed + 1)) + failed_cases="$failed_cases $case_name" + fi + + # Clean up output between cases + rm -rf "$case_dir/D" "$case_dir/p_all" "$case_dir/restart_data" +done + +echo "" +echo "========================================" +echo "Case-optimization summary: $passed passed, $failed failed" +if [ $failed -gt 0 ]; then + echo "Failed cases:$failed_cases" +fi +echo "========================================" + +[ $failed -eq 0 ] && exit 0 || exit 1 diff --git a/.github/scripts/setup-build-cache.sh b/.github/scripts/setup-build-cache.sh index 7da3912c38..7e47175f6e 100755 --- a/.github/scripts/setup-build-cache.sh +++ b/.github/scripts/setup-build-cache.sh @@ -1,6 +1,6 @@ #!/bin/bash # Sets up a persistent build cache for self-hosted CI runners. 
-# Creates a symlink: ./build -> /storage/coda1/d-coc/0/sbryngelson3/.mfc-ci-cache/<key>/build +# Creates a symlink: ./build -> <cache_root>/<key>/build # # Each runner gets its own cache keyed by (cluster, device, interface, runner). # This avoids cross-runner path issues entirely — CMake's absolute paths are @@ -13,8 +13,58 @@ _cache_device="${2:?}" _cache_interface="${3:-none}" _cache_runner="${RUNNER_NAME:?RUNNER_NAME not set}" +# Select cache root based on cluster (each HPC system has its own persistent storage). +case "$_cache_cluster" in + phoenix) + _cache_root="/storage/coda1/d-coc/0/sbryngelson3/.mfc-ci-cache" ;; + frontier|frontier_amd) + _cache_root="/lustre/orion/cfd154/scratch/sbryngelson/.mfc-ci-cache" ;; + *) + echo "=== Build Cache Setup ===" + echo " No cache root configured for cluster '$_cache_cluster' — skipping." + echo "=========================" + return 0 2>/dev/null || exit 0 ;; +esac + _cache_key="${_cache_cluster}-${_cache_device}-${_cache_interface}-${_cache_runner}" -_cache_base="/storage/coda1/d-coc/0/sbryngelson3/.mfc-ci-cache/${_cache_key}/build" +_cache_base="${_cache_root}/${_cache_key}/build" + +# Check if the cache directory is healthy (readable, writable, no stale handles). +_cache_healthy() { + local dir="$1" + if ! ls "$dir" > /dev/null 2>&1; then + echo " Health check FAILED: cannot list $dir" + return 1 + fi + if [ -e "$dir/lock.yaml" ] && ! stat "$dir/lock.yaml" > /dev/null 2>&1; then + echo " Health check FAILED: cannot stat $dir/lock.yaml" + return 1 + fi + local probe="$dir/.nfs_probe.$$" + if ! touch "$probe" 2>/dev/null || ! rm -f "$probe" 2>/dev/null; then + echo " Health check FAILED: cannot write/remove probe in $dir" + rm -f "$probe" 2>/dev/null + return 1 + fi + return 0 +} + +# Nuclear recovery: rename stale cache out of the way and create a fresh one. +# Uses mv (operates on parent directory entry) which works even when children +# have stale file handles that prevent rm -rf from succeeding. 
+_cache_nuke() { + local base="${1:-$_cache_base}" + local stale_name="${base}.stale.$(date +%s)" + echo " NFS cache nuke: parking stale dir -> $stale_name" + if mv "$base" "$stale_name" 2>/dev/null; then + echo " NFS cache nuke: renamed successfully" + else + echo " NFS cache nuke: mv failed, trying rm -rf as fallback" + rm -rf "$base" 2>/dev/null || true + fi + mkdir -p "$base" + echo " NFS cache nuke: fresh cache created at $base" +} mkdir -p "$_cache_base" _cache_dir="$(cd "$_cache_base" && pwd -P)" @@ -23,6 +73,13 @@ echo "=== Build Cache Setup ===" echo " Cache key: $_cache_key" echo " Cache dir: $_cache_dir" +# Pre-flight: detect stale NFS handles before wasting a build attempt. +if ! _cache_healthy "$_cache_dir"; then + echo " Stale NFS cache detected — nuking and recreating." + _cache_nuke "$_cache_base" + _cache_dir="$(cd "$_cache_base" && pwd -P)" +fi + # Replace any existing build/ (real dir or stale symlink) with a symlink # to our runner-specific cache directory. # Use unlink for symlinks to avoid rm -rf following the link and deleting @@ -36,4 +93,9 @@ fi ln -s "$_cache_dir" "build" echo " Symlink: build -> $_cache_dir" + +# Garbage-collect stale cache dirs parked by _cache_nuke more than 7 days ago. 
+_cache_parent="$(dirname "$_cache_base")" +find "$_cache_parent" -maxdepth 1 -name "*.stale.*" -mtime +7 -exec rm -rf {} + 2>/dev/null || true + echo "=========================" diff --git a/.github/workflows/frontier/bench.sh b/.github/workflows/frontier/bench.sh index 38f6349417..b60f8541a2 100644 --- a/.github/workflows/frontier/bench.sh +++ b/.github/workflows/frontier/bench.sh @@ -1,19 +1,6 @@ #!/bin/bash -n_ranks=12 -device_opts="" -if [ "$job_device" = "gpu" ]; then - gpus=$(rocm-smi --showid | awk '{print $1}' | grep -Eo '[0-9]+' | uniq | tr '\n' ' ') - n_ranks=$(echo "$gpus" | wc -w) # number of GPUs on node - gpu_ids=$(echo "$gpus" | tr ' ' '\n' | tr '\n' ' ' | sed 's/ $//') # GPU IDs from rocm-smi - device_opts+="--gpu" - if [ "$job_interface" = "acc" ]; then - device_opts+=" acc" - elif [ "$job_interface" = "omp" ]; then - device_opts+=" mp" - fi - device_opts+=" -g $gpu_ids" -fi +source .github/scripts/bench-preamble.sh if [ "$job_device" = "gpu" ]; then ./mfc.sh bench --mem 4 -j $n_ranks -o "$job_slug.yaml" -- -c $job_cluster $device_opts -n $n_ranks diff --git a/.github/workflows/frontier/build.sh b/.github/workflows/frontier/build.sh index a235f90d84..88446ad2a0 100644 --- a/.github/workflows/frontier/build.sh +++ b/.github/workflows/frontier/build.sh @@ -15,15 +15,8 @@ esac job_device=$1 job_interface=$2 run_bench=$3 -build_opts="" -if [ "$job_device" = "gpu" ]; then - build_opts+="--gpu" - if [ "$job_interface" = "acc" ]; then - build_opts+=" acc" - elif [ "$job_interface" = "omp" ]; then - build_opts+=" mp" - fi -fi +source .github/scripts/gpu-opts.sh +build_opts="$gpu_opts" . ./mfc.sh load -c $compiler_flag -m $([ "$job_device" = "gpu" ] && echo "g" || echo "c") @@ -32,36 +25,9 @@ if [ "$run_bench" != "bench" ]; then source .github/scripts/setup-build-cache.sh "$cluster_name" "$job_device" "$job_interface" fi -max_attempts=3 -attempt=1 -while [ $attempt -le $max_attempts ]; do - echo "Build attempt $attempt of $max_attempts..." 
- if [ "$run_bench" == "bench" ]; then - if ./mfc.sh build -j 8 $build_opts; then - build_cmd_ok=true - else - build_cmd_ok=false - fi - else - if ./mfc.sh test -v -a --dry-run $([ "$cluster_name" = "frontier" ] && echo "--rdma-mpi") -j 8 $build_opts; then - build_cmd_ok=true - else - build_cmd_ok=false - fi - fi - - if [ "$build_cmd_ok" = true ]; then - echo "Build succeeded on attempt $attempt." - exit 0 - fi - - if [ $attempt -lt $max_attempts ]; then - echo "Build failed on attempt $attempt. Clearing cache and retrying in 30s..." - rm -rf build/staging build/install build/lock.yaml - sleep 30 - fi - attempt=$((attempt + 1)) -done - -echo "Build failed after $max_attempts attempts." -exit 1 +source .github/scripts/retry-build.sh +if [ "$run_bench" == "bench" ]; then + retry_build ./mfc.sh build -j 8 $build_opts || exit 1 +else + retry_build ./mfc.sh test -v -a --dry-run $([ "$cluster_name" = "frontier" ] && echo "--rdma-mpi") -j 8 $build_opts || exit 1 +fi diff --git a/.github/workflows/frontier/test.sh b/.github/workflows/frontier/test.sh index f2c0591b3b..78797ab8ec 100644 --- a/.github/workflows/frontier/test.sh +++ b/.github/workflows/frontier/test.sh @@ -1,17 +1,8 @@ #!/bin/bash -gpus=`rocm-smi --showid | awk '{print $1}' | grep -Eo '[0-9]+' | uniq | tr '\n' ' '` -ngpus=`echo "$gpus" | tr -d '[:space:]' | wc -c` - -device_opts="" -if [ "$job_device" = "gpu" ]; then - device_opts+="--gpu" - if [ "$job_interface" = "acc" ]; then - device_opts+=" acc" - elif [ "$job_interface" = "omp" ]; then - device_opts+=" mp" - fi -fi +source .github/scripts/detect-gpus.sh +source .github/scripts/gpu-opts.sh +device_opts="$gpu_opts" shard_opts="" if [ -n "$job_shard" ]; then diff --git a/.github/workflows/frontier_amd/bench.sh b/.github/workflows/frontier_amd/bench.sh deleted file mode 100644 index 38f6349417..0000000000 --- a/.github/workflows/frontier_amd/bench.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash - -n_ranks=12 -device_opts="" -if [ "$job_device" = "gpu" ]; 
then - gpus=$(rocm-smi --showid | awk '{print $1}' | grep -Eo '[0-9]+' | uniq | tr '\n' ' ') - n_ranks=$(echo "$gpus" | wc -w) # number of GPUs on node - gpu_ids=$(echo "$gpus" | tr ' ' '\n' | tr '\n' ' ' | sed 's/ $//') # GPU IDs from rocm-smi - device_opts+="--gpu" - if [ "$job_interface" = "acc" ]; then - device_opts+=" acc" - elif [ "$job_interface" = "omp" ]; then - device_opts+=" mp" - fi - device_opts+=" -g $gpu_ids" -fi - -if [ "$job_device" = "gpu" ]; then - ./mfc.sh bench --mem 4 -j $n_ranks -o "$job_slug.yaml" -- -c $job_cluster $device_opts -n $n_ranks -else - ./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c $job_cluster $device_opts -n $n_ranks -fi diff --git a/.github/workflows/frontier_amd/bench.sh b/.github/workflows/frontier_amd/bench.sh new file mode 120000 index 0000000000..2ac24c7604 --- /dev/null +++ b/.github/workflows/frontier_amd/bench.sh @@ -0,0 +1 @@ +../frontier/bench.sh \ No newline at end of file diff --git a/.github/workflows/frontier_amd/build.sh b/.github/workflows/frontier_amd/build.sh deleted file mode 100644 index a235f90d84..0000000000 --- a/.github/workflows/frontier_amd/build.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/bin/bash - -# Ignore SIGHUP to survive login node session drops -trap '' HUP - -# Determine compiler flag from directory name -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -cluster_name="$(basename "$SCRIPT_DIR")" -case "$cluster_name" in - frontier) compiler_flag="f" ;; - frontier_amd) compiler_flag="famd" ;; - *) echo "ERROR: Unknown cluster '$cluster_name'"; exit 1 ;; -esac - -job_device=$1 -job_interface=$2 -run_bench=$3 -build_opts="" -if [ "$job_device" = "gpu" ]; then - build_opts+="--gpu" - if [ "$job_interface" = "acc" ]; then - build_opts+=" acc" - elif [ "$job_interface" = "omp" ]; then - build_opts+=" mp" - fi -fi - -. 
./mfc.sh load -c $compiler_flag -m $([ "$job_device" = "gpu" ] && echo "g" || echo "c") - -# Only set up build cache for test suite, not benchmarks -if [ "$run_bench" != "bench" ]; then - source .github/scripts/setup-build-cache.sh "$cluster_name" "$job_device" "$job_interface" -fi - -max_attempts=3 -attempt=1 -while [ $attempt -le $max_attempts ]; do - echo "Build attempt $attempt of $max_attempts..." - if [ "$run_bench" == "bench" ]; then - if ./mfc.sh build -j 8 $build_opts; then - build_cmd_ok=true - else - build_cmd_ok=false - fi - else - if ./mfc.sh test -v -a --dry-run $([ "$cluster_name" = "frontier" ] && echo "--rdma-mpi") -j 8 $build_opts; then - build_cmd_ok=true - else - build_cmd_ok=false - fi - fi - - if [ "$build_cmd_ok" = true ]; then - echo "Build succeeded on attempt $attempt." - exit 0 - fi - - if [ $attempt -lt $max_attempts ]; then - echo "Build failed on attempt $attempt. Clearing cache and retrying in 30s..." - rm -rf build/staging build/install build/lock.yaml - sleep 30 - fi - attempt=$((attempt + 1)) -done - -echo "Build failed after $max_attempts attempts." 
-exit 1 diff --git a/.github/workflows/frontier_amd/build.sh b/.github/workflows/frontier_amd/build.sh new file mode 120000 index 0000000000..40fec10411 --- /dev/null +++ b/.github/workflows/frontier_amd/build.sh @@ -0,0 +1 @@ +../frontier/build.sh \ No newline at end of file diff --git a/.github/workflows/frontier_amd/submit.sh b/.github/workflows/frontier_amd/submit.sh deleted file mode 100644 index 16d4f0d73c..0000000000 --- a/.github/workflows/frontier_amd/submit.sh +++ /dev/null @@ -1,106 +0,0 @@ -#!/bin/bash - -set -e - -# Ignore SIGHUP to survive login node session drops -trap '' HUP - -# Determine compiler flag from directory name -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -cluster_name="$(basename "$SCRIPT_DIR")" -case "$cluster_name" in - frontier) compiler_flag="f" ;; - frontier_amd) compiler_flag="famd" ;; - *) echo "ERROR: Unknown cluster '$cluster_name'"; exit 1 ;; -esac - -usage() { - echo "Usage: $0 [script.sh] [cpu|gpu] [none|acc|omp] [shard]" -} - -if [ ! 
-z "$1" ]; then - sbatch_script_contents=`cat $1` -else - usage - exit 1 -fi - -# Detect job type from submitted script basename -script_basename="$(basename "$1" .sh)" -case "$script_basename" in - bench*) job_type="bench" ;; - *) job_type="test" ;; -esac - -if [ "$2" = "cpu" ]; then - sbatch_device_opts="\ -#SBATCH -n 32 # Number of cores required" -elif [ "$2" = "gpu" ]; then - sbatch_device_opts="\ -#SBATCH -n 8 # Number of cores required" -else - usage - exit 1 -fi - -# Select SBATCH params based on job type -if [ "$job_type" = "bench" ]; then - sbatch_account="#SBATCH -A ENG160" - sbatch_time="#SBATCH -t 05:59:00" - sbatch_partition="#SBATCH -p extended" - sbatch_extra="" -else - sbatch_account="#SBATCH -A CFD154" - sbatch_time="#SBATCH -t 01:59:00" - sbatch_partition="#SBATCH -p batch" - sbatch_extra="#SBATCH --qos=normal" -fi - -shard_suffix="" -if [ -n "$4" ]; then - shard_suffix="-$(echo "$4" | sed 's|/|-of-|')" -fi -job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2-$3${shard_suffix}" -output_file="$job_slug.out" - -submit_output=$(sbatch </dev/null | head -1) - if [ -n "$syscheck_bin" ] && ! "$syscheck_bin" > /dev/null 2>&1; then - echo "WARNING: syscheck binary crashed — cached install is stale." - if [ $attempt -lt $max_attempts ]; then - echo "Clearing cache and rebuilding..." - rm -rf build/staging build/install build/lock.yaml - sleep 5 - attempt=$((attempt + 1)) - continue - else - echo "ERROR: syscheck still failing after $max_attempts attempts." - exit 1 - fi - fi - - break - fi - - if [ $attempt -lt $max_attempts ]; then - echo "Build failed on attempt $attempt. Clearing cache and retrying in 30s..." - rm -rf build/staging build/install build/lock.yaml - sleep 30 - else - echo "Build failed after $max_attempts attempts." - exit 1 - fi - attempt=$((attempt + 1)) -done +# Build with retry; smoke-test cached binaries to catch architecture mismatches +# (SIGILL from binaries compiled on a different compute node). 
+source .github/scripts/retry-build.sh +RETRY_VALIDATE_CMD='syscheck_bin=$(find build/install -name syscheck -type f 2>/dev/null | head -1); [ -z "$syscheck_bin" ] || "$syscheck_bin" > /dev/null 2>&1' \ + retry_build ./mfc.sh test -v --dry-run -j 8 $build_opts || exit 1 n_test_threads=8 if [ "$job_device" = "gpu" ]; then - gpu_count=$(nvidia-smi -L | wc -l) # number of GPUs on node - gpu_ids=$(seq -s ' ' 0 $(($gpu_count-1))) # 0,1,2,...,gpu_count-1 + source .github/scripts/detect-gpus.sh device_opts="-g $gpu_ids" - n_test_threads=`expr $gpu_count \* 2` + n_test_threads=$((ngpus * 2)) fi ./mfc.sh test -v --max-attempts 3 -a -j $n_test_threads $device_opts -- -c phoenix diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 5c46e91427..fe549ac10f 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -156,27 +156,7 @@ jobs: PRECISION: ${{ matrix.precision != '' && format('--{0}', matrix.precision) || '' }} - name: Test - run: | - rm -f tests/failed_uuids.txt - TEST_EXIT=0 - /bin/bash mfc.sh test -v --max-attempts 3 -j "$(nproc)" $TEST_ALL $TEST_PCT || TEST_EXIT=$? - - # Retry only if a small number of tests failed (sporadic failures) - if [ -s tests/failed_uuids.txt ]; then - NUM_FAILED=$(wc -l < tests/failed_uuids.txt) - if [ "$NUM_FAILED" -le 5 ]; then - FAILED=$(tr '\n' ' ' < tests/failed_uuids.txt) - echo "" - echo "=== Retrying $NUM_FAILED failed test(s): $FAILED ===" - echo "" - /bin/bash mfc.sh test -v --max-attempts 3 -j "$(nproc)" --only $FAILED $TEST_ALL || exit $? - else - echo "Too many failures ($NUM_FAILED) to retry — likely a real issue." 
- exit 1 - fi - elif [ "$TEST_EXIT" -ne 0 ]; then - exit $TEST_EXIT - fi + run: bash .github/scripts/run-tests-with-retry.sh -v --max-attempts 3 -j "$(nproc)" $TEST_ALL $TEST_PCT env: TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }} TEST_PCT: ${{ matrix.debug == 'debug' && '-% 20' || '' }} @@ -299,3 +279,80 @@ jobs: with: name: logs-${{ strategy.job-index }}-${{ steps.log.outputs.slug }} path: ${{ steps.log.outputs.slug }}.out + + case-optimization: + name: "Case Opt | ${{ matrix.cluster_name }} (${{ matrix.device }}-${{ matrix.interface }})" + if: github.repository == 'MFlowCode/MFC' && needs.file-changes.outputs.checkall == 'true' && github.event.pull_request.draft != true + needs: [lint-gate, file-changes] + continue-on-error: false + timeout-minutes: 480 + strategy: + matrix: + include: + - runner: 'gt' + cluster: 'phoenix' + cluster_name: 'Georgia Tech | Phoenix' + device: 'gpu' + interface: 'acc' + - runner: 'gt' + cluster: 'phoenix' + cluster_name: 'Georgia Tech | Phoenix' + device: 'gpu' + interface: 'omp' + - runner: 'frontier' + cluster: 'frontier' + cluster_name: 'Oak Ridge | Frontier' + device: 'gpu' + interface: 'acc' + - runner: 'frontier' + cluster: 'frontier' + cluster_name: 'Oak Ridge | Frontier' + device: 'gpu' + interface: 'omp' + - runner: 'frontier' + cluster: 'frontier_amd' + cluster_name: 'Oak Ridge | Frontier (AMD)' + device: 'gpu' + interface: 'omp' + runs-on: + group: phoenix + labels: ${{ matrix.runner }} + steps: + - name: Clone + uses: actions/checkout@v4 + with: + clean: false + + - name: Pre-Build (SLURM) + if: matrix.cluster == 'phoenix' + run: bash .github/workflows/phoenix/submit.sh .github/scripts/prebuild-case-optimization.sh ${{ matrix.device }} ${{ matrix.interface }} + + - name: Pre-Build (login node) + if: matrix.cluster != 'phoenix' + uses: nick-fields/retry@ce71cc2ab81d554ebbe88c79ab5975992d79ba08 # v3 + with: + max_attempts: 3 + retry_wait_seconds: 60 + timeout_minutes: 120 + command: bash 
.github/scripts/prebuild-case-optimization.sh ${{ matrix.cluster }} ${{ matrix.device }} ${{ matrix.interface }} + on_retry_command: ./mfc.sh clean + + - name: Run Case-Optimization Tests + run: bash .github/workflows/${{ matrix.cluster }}/submit.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }} + + - name: Print Logs + if: always() + run: | + for f in prebuild-case-optimization-${{ matrix.device }}-${{ matrix.interface }}.out \ + run-case-optimization-${{ matrix.device }}-${{ matrix.interface }}.out; do + if [ -f "$f" ]; then echo "=== $f ==="; cat "$f"; fi + done + + - name: Archive Logs + uses: actions/upload-artifact@v4 + if: always() + with: + name: case-opt-${{ strategy.job-index }}-${{ matrix.cluster }}-${{ matrix.interface }} + path: | + prebuild-case-optimization-${{ matrix.device }}-${{ matrix.interface }}.out + run-case-optimization-${{ matrix.device }}-${{ matrix.interface }}.out diff --git a/benchmarks/5eq_rk3_weno3_hllc/case.py b/benchmarks/5eq_rk3_weno3_hllc/case.py index 16047664ff..5ecc327e8f 100644 --- a/benchmarks/5eq_rk3_weno3_hllc/case.py +++ b/benchmarks/5eq_rk3_weno3_hllc/case.py @@ -14,6 +14,7 @@ parser.add_argument("--mfc", type=json.loads, default="{}", metavar="DICT", help="MFC's toolchain's internal state.") parser.add_argument("--gbpp", type=int, metavar="MEM", default=16, help="Adjusts the problem size per rank to fit into [MEM] GB of GPU memory per GPU.") +parser.add_argument("--steps", type=int, default=None, help="Override t_step_stop/t_step_save.") ARGS = vars(parser.parse_args()) DICT = ARGS["mfc"] @@ -190,8 +191,8 @@ "cyl_coord": "F", "dt": dt, "t_step_start": 0, - "t_step_stop": int(7 * (5 * size + 5)), - "t_step_save": int(7 * (5 * size + 5)), + "t_step_stop": ARGS["steps"] if ARGS["steps"] is not None else int(7 * (5 * size + 5)), + "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(7 * (5 * size + 5)), # Simulation Algorithm Parameters "num_patches": 3, "model_eqns": 2, @@ -218,7 
+219,7 @@ "format": 1, "precision": 2, "prim_vars_wrt": "T", - "parallel_io": "T", + "parallel_io": "F", # I will use 1 for WATER properties, and 2 for AIR properties # Patch 1: Background (AIR - 2) "patch_icpp(1)%geometry": 9, diff --git a/benchmarks/hypo_hll/case.py b/benchmarks/hypo_hll/case.py index c53eb243c5..1663a507aa 100644 --- a/benchmarks/hypo_hll/case.py +++ b/benchmarks/hypo_hll/case.py @@ -12,6 +12,7 @@ parser.add_argument("--mfc", type=json.loads, default="{}", metavar="DICT", help="MFC's toolchain's internal state.") parser.add_argument("--gbpp", type=int, metavar="MEM", default=16, help="Adjusts the problem size per rank to fit into [MEM] GB of GPU memory per GPU.") +parser.add_argument("--steps", type=int, default=None, help="Override t_step_stop/t_step_save.") ARGS = vars(parser.parse_args()) DICT = ARGS["mfc"] @@ -43,8 +44,8 @@ "p": Nz, "dt": 1e-8, "t_step_start": 0, - "t_step_stop": int(7 * (5 * size + 5)), - "t_step_save": int(7 * (5 * size + 5)), + "t_step_stop": ARGS["steps"] if ARGS["steps"] is not None else int(7 * (5 * size + 5)), + "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(7 * (5 * size + 5)), # Simulation Algorithm Parameters "num_patches": 2, "model_eqns": 2, diff --git a/benchmarks/ibm/case.py b/benchmarks/ibm/case.py index 8df1683628..e16cb620b7 100644 --- a/benchmarks/ibm/case.py +++ b/benchmarks/ibm/case.py @@ -11,6 +11,7 @@ parser.add_argument("--mfc", type=json.loads, default="{}", metavar="DICT", help="MFC's toolchain's internal state.") parser.add_argument("--gbpp", type=int, metavar="MEM", default=16, help="Adjusts the problem size per rank to fit into [MEM] GB of GPU memory per GPU.") +parser.add_argument("--steps", type=int, default=None, help="Override t_step_stop/t_step_save.") ARGS = vars(parser.parse_args()) DICT = ARGS["mfc"] @@ -47,8 +48,8 @@ "p": Nz, "dt": mydt, "t_step_start": 0, - "t_step_stop": int(7 * (5 * size + 5)), - "t_step_save": int(7 * (5 * size + 5)), + "t_step_stop": ARGS["steps"] 
if ARGS["steps"] is not None else int(7 * (5 * size + 5)), + "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(7 * (5 * size + 5)), # Simulation Algorithm Parameters "num_patches": 1, "model_eqns": 2, @@ -81,7 +82,7 @@ "format": 1, "precision": 2, "prim_vars_wrt": "T", - "parallel_io": "T", + "parallel_io": "F", # Patch 1 L "patch_icpp(1)%geometry": 9, "patch_icpp(1)%x_centroid": 0.5, diff --git a/benchmarks/igr/case.py b/benchmarks/igr/case.py index abc8f5315f..469bff1fa9 100644 --- a/benchmarks/igr/case.py +++ b/benchmarks/igr/case.py @@ -13,6 +13,7 @@ parser.add_argument("--mfc", type=json.loads, default="{}", metavar="DICT", help="MFC's toolchain's internal state.") parser.add_argument("--gbpp", type=int, metavar="MEM", default=16, help="Adjusts the problem size per rank to fit into [MEM] GB of GPU memory per GPU.") +parser.add_argument("--steps", type=int, default=None, help="Override t_step_stop/t_step_save.") ARGS = vars(parser.parse_args()) DICT = ARGS["mfc"] @@ -62,8 +63,8 @@ "cyl_coord": "F", "dt": dt, "t_step_start": 0, - "t_step_stop": int(7 * (5 * size + 5)), - "t_step_save": int(7 * (5 * size + 5)), + "t_step_stop": ARGS["steps"] if ARGS["steps"] is not None else int(7 * (5 * size + 5)), + "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(7 * (5 * size + 5)), # Simulation Algorithm Parameters "num_patches": 1, "model_eqns": 2, @@ -91,7 +92,7 @@ "omega_wrt(3)": "T", "qm_wrt": "T", "fd_order": 4, - "parallel_io": "T", + "parallel_io": "F", # Patch 1: Background (AIR - 2) "patch_icpp(1)%geometry": 9, "patch_icpp(1)%x_centroid": 0, diff --git a/benchmarks/viscous_weno5_sgb_acoustic/case.py b/benchmarks/viscous_weno5_sgb_acoustic/case.py index cd4508abe6..9f1351b0c1 100644 --- a/benchmarks/viscous_weno5_sgb_acoustic/case.py +++ b/benchmarks/viscous_weno5_sgb_acoustic/case.py @@ -16,6 +16,7 @@ parser.add_argument("--mfc", type=json.loads, default="{}", metavar="DICT", help="MFC's toolchain's internal state.") 
parser.add_argument("--gbpp", type=int, metavar="MEM", default=16, help="Adjusts the problem size per rank to fit into [MEM] GB of GPU memory per GPU.") +parser.add_argument("--steps", type=int, default=None, help="Override t_step_stop/t_step_save.") ARGS = vars(parser.parse_args()) DICT = ARGS["mfc"] @@ -93,8 +94,8 @@ "p": Nz, "dt": dt, "t_step_start": 0, - "t_step_stop": int(6 * (5 * size + 5)), - "t_step_save": int(6 * (5 * size + 5)), + "t_step_stop": ARGS["steps"] if ARGS["steps"] is not None else int(6 * (5 * size + 5)), + "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(6 * (5 * size + 5)), # Simulation Algorithm Parameters "num_patches": 2, "model_eqns": 2, @@ -124,7 +125,7 @@ "format": 1, "precision": 2, "prim_vars_wrt": "T", - "parallel_io": "T", + "parallel_io": "F", # Patch 1 _ Background "patch_icpp(1)%geometry": 9, "patch_icpp(1)%x_centroid": 0.0, diff --git a/toolchain/mfc/packer/pack.py b/toolchain/mfc/packer/pack.py index 98b91de74b..1ad1a76834 100644 --- a/toolchain/mfc/packer/pack.py +++ b/toolchain/mfc/packer/pack.py @@ -20,8 +20,8 @@ def __repr__(self) -> str: return f"{self.filepath} {' '.join([ str(d) for d in self.doubles ])}" -# This class maps to the data contained in the entirety of D/: it is tush a list -# of PackEntry classes. +# This class maps to the data contained in the entirety of D/: a dictionary +# of PackEntry instances keyed by filepath. 
class Pack: entries: typing.Dict[str, PackEntry] @@ -87,10 +87,10 @@ def save(self, filepath: str): common.file_write(f"{filepath.rstrip('.txt')}-metadata.txt", metadata) - def has_NaNs(self) -> bool: + def has_bad_values(self) -> bool: for entry in self.entries.values(): for double in entry.doubles: - if math.isnan(double): + if math.isnan(double) or math.isinf(double): return True return False diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index 049af9e560..2193e677b4 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -419,8 +419,8 @@ def _handle_case(case: TestCase, devices: typing.Set[int]): if err is not None: raise MFCException(f"Test {case}: {err}") - if pack.has_NaNs(): - raise MFCException(f"Test {case}: NaNs detected in the case.") + if pack.has_bad_values(): + raise MFCException(f"Test {case}: NaN or Inf detected in the case.") golden_filepath = os.path.join(case.get_dirpath(), "golden.txt") if ARG("generate"):