Skip to content

Commit 28fc258

Browse files
authored
Add test sharding, proactive clean, and retry logic for self-hosted CI (#1171)
1 parent 35b2134 commit 28fc258

File tree

24 files changed

+338
-299
lines changed

24 files changed

+338
-299
lines changed

.github/scripts/submit_and_monitor_bench.sh

Lines changed: 14 additions & 37 deletions
Original file line number | Diff line number | Diff line change
@@ -14,50 +14,27 @@ device="$2"
1414
interface="$3"
1515
cluster="$4"
1616

17-
# Get the directory where this script lives
18-
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
19-
2017
echo "[$dir] Submitting benchmark for $device-$interface on $cluster..."
2118
cd "$dir"
2219

23-
# Submit job
24-
submit_output=$(bash .github/workflows/$cluster/submit-bench.sh \
25-
.github/workflows/$cluster/bench.sh "$device" "$interface" 2>&1)
26-
27-
job_id=$(echo "$submit_output" | sed -n 's/.*Submitted batch job \([0-9][0-9]*\).*/\1/p')
28-
job_slug="bench-$device-$interface"
29-
output_file="${job_slug}.out"
30-
31-
if [ -z "$job_id" ]; then
32-
echo "[$dir] ERROR: Failed to submit job"
33-
echo "$submit_output"
34-
exit 1
35-
fi
36-
37-
echo "[$dir] Job ID: $job_id, monitoring output file: $output_file"
38-
39-
# Use the monitoring script from PR (where this script lives)
40-
monitor_exit=0
41-
bash "${SCRIPT_DIR}/monitor_slurm_job.sh" "$job_id" "$output_file" || monitor_exit=$?
42-
if [ "$monitor_exit" -ne 0 ]; then
43-
echo "[$dir] WARNING: SLURM job exited with code $monitor_exit"
44-
else
45-
echo "[$dir] Monitoring complete for job $job_id"
46-
fi
20+
# Submit and monitor job (submit.sh auto-detects bench mode from script name)
21+
bash .github/workflows/$cluster/submit.sh \
22+
.github/workflows/$cluster/bench.sh "$device" "$interface"
4723

4824
# Verify the YAML output file was created
25+
job_slug="bench-$device-$interface"
4926
yaml_file="${job_slug}.yaml"
5027
if [ ! -f "$yaml_file" ]; then
51-
echo "[$dir] ERROR: Expected output file not found: $yaml_file"
52-
echo "[$dir] Directory contents:"
53-
ls -la *.yaml 2>/dev/null || echo " No YAML files found"
54-
echo ""
55-
echo "[$dir] Last 100 lines of job output ($output_file):"
56-
echo "----------------------------------------"
57-
tail -n 100 "$output_file" 2>/dev/null || echo " Could not read output file"
58-
echo "----------------------------------------"
59-
exit 1
28+
echo "[$dir] ERROR: Expected output file not found: $yaml_file"
29+
echo "[$dir] Directory contents:"
30+
ls -la *.yaml 2>/dev/null || echo " No YAML files found"
31+
echo ""
32+
output_file="${job_slug}.out"
33+
echo "[$dir] Last 100 lines of job output ($output_file):"
34+
echo "----------------------------------------"
35+
tail -n 100 "$output_file" 2>/dev/null || echo " Could not read output file"
36+
echo "----------------------------------------"
37+
exit 1
6038
fi
6139

6240
echo "[$dir] Verified output file exists: $yaml_file ($(stat -f%z "$yaml_file" 2>/dev/null || stat -c%s "$yaml_file" 2>/dev/null) bytes)"
63-

.github/workflows/bench.yml

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,9 @@ concurrency:
1313
jobs:
1414
file-changes:
1515
name: Detect File Changes
16+
if: >
17+
github.event_name != 'pull_request_review' ||
18+
github.event.review.user.type != 'Bot'
1619
runs-on: 'ubuntu-latest'
1720
outputs:
1821
checkall: ${{ steps.changes.outputs.checkall }}

.github/workflows/frontier/bench.sh

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@ if [ "$job_device" = "gpu" ]; then
1616
fi
1717

1818
if [ "$job_device" = "gpu" ]; then
19-
./mfc.sh bench --mem 12 -j $n_ranks -o "$job_slug.yaml" -- -c frontier $device_opts -n $n_ranks
19+
./mfc.sh bench --mem 4 -j $n_ranks -o "$job_slug.yaml" -- -c $job_cluster $device_opts -n $n_ranks
2020
else
21-
./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c frontier $device_opts -n $n_ranks
21+
./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c $job_cluster $device_opts -n $n_ranks
2222
fi

.github/workflows/frontier/build.sh

Lines changed: 17 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,15 @@
33
# Ignore SIGHUP to survive login node session drops
44
trap '' HUP
55

6+
# Determine compiler flag from directory name
7+
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
8+
cluster_name="$(basename "$SCRIPT_DIR")"
9+
case "$cluster_name" in
10+
frontier) compiler_flag="f" ;;
11+
frontier_amd) compiler_flag="famd" ;;
12+
*) echo "ERROR: Unknown cluster '$cluster_name'"; exit 1 ;;
13+
esac
14+
615
job_device=$1
716
job_interface=$2
817
run_bench=$3
@@ -16,28 +25,25 @@ if [ "$job_device" = "gpu" ]; then
1625
fi
1726
fi
1827

19-
. ./mfc.sh load -c f -m g
28+
. ./mfc.sh load -c $compiler_flag -m g
2029

2130
# Only set up build cache for test suite, not benchmarks
2231
if [ "$run_bench" != "bench" ]; then
23-
source .github/scripts/setup-build-cache.sh frontier "$job_device" "$job_interface"
32+
source .github/scripts/setup-build-cache.sh "$cluster_name" "$job_device" "$job_interface"
2433
fi
2534

2635
max_attempts=3
2736
attempt=1
2837
while [ $attempt -le $max_attempts ]; do
2938
echo "Build attempt $attempt of $max_attempts..."
3039
if [ "$run_bench" == "bench" ]; then
31-
build_cmd_ok=true
32-
for dir in benchmarks/*/; do
33-
dirname=$(basename "$dir")
34-
if ! ./mfc.sh run -v "$dir/case.py" --case-optimization -j 8 --dry-run $build_opts; then
35-
build_cmd_ok=false
36-
break
37-
fi
38-
done
40+
if ./mfc.sh build -j 8 $build_opts; then
41+
build_cmd_ok=true
42+
else
43+
build_cmd_ok=false
44+
fi
3945
else
40-
if ./mfc.sh test -v -a --dry-run --rdma-mpi -j 8 $build_opts; then
46+
if ./mfc.sh test -v -a --dry-run $([ "$cluster_name" = "frontier" ] && echo "--rdma-mpi") -j 8 $build_opts; then
4147
build_cmd_ok=true
4248
else
4349
build_cmd_ok=false

.github/workflows/frontier/submit-bench.sh

Lines changed: 0 additions & 54 deletions
This file was deleted.

.github/workflows/frontier/submit.sh

Lines changed: 41 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -5,8 +5,17 @@ set -e
55
# Ignore SIGHUP to survive login node session drops
66
trap '' HUP
77

8+
# Determine compiler flag from directory name
9+
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
10+
cluster_name="$(basename "$SCRIPT_DIR")"
11+
case "$cluster_name" in
12+
frontier) compiler_flag="f" ;;
13+
frontier_amd) compiler_flag="famd" ;;
14+
*) echo "ERROR: Unknown cluster '$cluster_name'"; exit 1 ;;
15+
esac
16+
817
usage() {
9-
echo "Usage: $0 [script.sh] [cpu|gpu]"
18+
echo "Usage: $0 [script.sh] [cpu|gpu] [none|acc|omp] [shard]"
1019
}
1120

1221
if [ ! -z "$1" ]; then
@@ -16,6 +25,13 @@ else
1625
exit 1
1726
fi
1827

28+
# Detect job type from submitted script basename
29+
script_basename="$(basename "$1" .sh)"
30+
case "$script_basename" in
31+
bench*) job_type="bench" ;;
32+
*) job_type="test" ;;
33+
esac
34+
1935
if [ "$2" = "cpu" ]; then
2036
sbatch_device_opts="\
2137
#SBATCH -n 32 # Number of cores required"
@@ -27,19 +43,36 @@ else
2743
exit 1
2844
fi
2945

46+
# Select SBATCH params based on job type
47+
if [ "$job_type" = "bench" ]; then
48+
sbatch_account="#SBATCH -A ENG160"
49+
sbatch_time="#SBATCH -t 05:59:00"
50+
sbatch_partition="#SBATCH -p extended"
51+
sbatch_extra=""
52+
else
53+
sbatch_account="#SBATCH -A CFD154"
54+
sbatch_time="#SBATCH -t 01:59:00"
55+
sbatch_partition="#SBATCH -p batch"
56+
sbatch_extra="#SBATCH --qos=normal"
57+
fi
3058

31-
job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2-$3"
59+
shard_suffix=""
60+
if [ -n "$4" ]; then
61+
shard_suffix="-$(echo "$4" | sed 's|/|-of-|')"
62+
fi
63+
job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2-$3${shard_suffix}"
3264
output_file="$job_slug.out"
3365

3466
submit_output=$(sbatch <<EOT
3567
#!/bin/bash
3668
#SBATCH -J MFC-$job_slug # Job name
37-
#SBATCH -A ENG160 # charge account
69+
$sbatch_account
3870
#SBATCH -N 1 # Number of nodes required
3971
$sbatch_device_opts
40-
#SBATCH -t 05:59:00 # Duration of the job (Ex: 15 mins)
72+
$sbatch_time
4173
#SBATCH -o$output_file # Combined output and error messages file
42-
#SBATCH -p extended # Extended partition for shorter queues
74+
$sbatch_partition
75+
$sbatch_extra
4376
4477
set -e
4578
set -x
@@ -50,8 +83,10 @@ echo "Running in $(pwd):"
5083
job_slug="$job_slug"
5184
job_device="$2"
5285
job_interface="$3"
86+
job_shard="$4"
87+
job_cluster="$cluster_name"
5388
54-
. ./mfc.sh load -c f -m $([ "$2" = "gpu" ] && echo "g" || echo "c")
89+
. ./mfc.sh load -c $compiler_flag -m $([ "$2" = "gpu" ] && echo "g" || echo "c")
5590
5691
$sbatch_script_contents
5792
@@ -68,5 +103,4 @@ fi
68103
echo "Submitted batch job $job_id"
69104

70105
# Use resilient monitoring instead of sbatch -W
71-
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
72106
bash "$SCRIPT_DIR/../../scripts/monitor_slurm_job.sh" "$job_id" "$output_file"

.github/workflows/frontier/test.sh

Lines changed: 11 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -13,8 +13,17 @@ if [ "$job_device" = "gpu" ]; then
1313
fi
1414
fi
1515

16+
shard_opts=""
17+
if [ -n "$job_shard" ]; then
18+
shard_opts="--shard $job_shard"
19+
fi
20+
1621
if [ "$job_device" = "gpu" ]; then
17-
./mfc.sh test -v -a --rdma-mpi --max-attempts 3 -j $ngpus $device_opts -- -c frontier
22+
rdma_opts=""
23+
if [ "$job_cluster" = "frontier" ]; then
24+
rdma_opts="--rdma-mpi"
25+
fi
26+
./mfc.sh test -v -a $rdma_opts --max-attempts 3 -j $ngpus $device_opts $shard_opts -- -c $job_cluster
1827
else
19-
./mfc.sh test -v -a --max-attempts 3 -j 32 --no-gpu -- -c frontier
28+
./mfc.sh test -v -a --max-attempts 3 -j 32 --no-gpu $shard_opts -- -c $job_cluster
2029
fi

.github/workflows/frontier_amd/bench.sh

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@ if [ "$job_device" = "gpu" ]; then
1616
fi
1717

1818
if [ "$job_device" = "gpu" ]; then
19-
./mfc.sh bench --mem 12 -j $n_ranks -o "$job_slug.yaml" -- -c frontier_amd $device_opts -n $n_ranks
19+
./mfc.sh bench --mem 4 -j $n_ranks -o "$job_slug.yaml" -- -c $job_cluster $device_opts -n $n_ranks
2020
else
21-
./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c frontier_amd $device_opts -n $n_ranks
21+
./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c $job_cluster $device_opts -n $n_ranks
2222
fi

.github/workflows/frontier_amd/build.sh

Lines changed: 17 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,15 @@
33
# Ignore SIGHUP to survive login node session drops
44
trap '' HUP
55

6+
# Determine compiler flag from directory name
7+
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
8+
cluster_name="$(basename "$SCRIPT_DIR")"
9+
case "$cluster_name" in
10+
frontier) compiler_flag="f" ;;
11+
frontier_amd) compiler_flag="famd" ;;
12+
*) echo "ERROR: Unknown cluster '$cluster_name'"; exit 1 ;;
13+
esac
14+
615
job_device=$1
716
job_interface=$2
817
run_bench=$3
@@ -16,28 +25,25 @@ if [ "$job_device" = "gpu" ]; then
1625
fi
1726
fi
1827

19-
. ./mfc.sh load -c famd -m g
28+
. ./mfc.sh load -c $compiler_flag -m g
2029

2130
# Only set up build cache for test suite, not benchmarks
2231
if [ "$run_bench" != "bench" ]; then
23-
source .github/scripts/setup-build-cache.sh frontier_amd "$job_device" "$job_interface"
32+
source .github/scripts/setup-build-cache.sh "$cluster_name" "$job_device" "$job_interface"
2433
fi
2534

2635
max_attempts=3
2736
attempt=1
2837
while [ $attempt -le $max_attempts ]; do
2938
echo "Build attempt $attempt of $max_attempts..."
3039
if [ "$run_bench" == "bench" ]; then
31-
build_cmd_ok=true
32-
for dir in benchmarks/*/; do
33-
dirname=$(basename "$dir")
34-
if ! ./mfc.sh run -v "$dir/case.py" --case-optimization -j 8 --dry-run $build_opts; then
35-
build_cmd_ok=false
36-
break
37-
fi
38-
done
40+
if ./mfc.sh build -j 8 $build_opts; then
41+
build_cmd_ok=true
42+
else
43+
build_cmd_ok=false
44+
fi
3945
else
40-
if ./mfc.sh test -v -a --dry-run -j 8 $build_opts; then
46+
if ./mfc.sh test -v -a --dry-run $([ "$cluster_name" = "frontier" ] && echo "--rdma-mpi") -j 8 $build_opts; then
4147
build_cmd_ok=true
4248
else
4349
build_cmd_ok=false

0 commit comments

Comments
 (0)