Skip to content

Commit 0572ddb

Browse files
sbryngelson and claude committed
Add gcov-based test pruning with file-level coverage cache
- File-level gcov coverage cache maps 555 test UUIDs to exercised .fpp source files (gzip JSON, 11KB, committed to repo)
- --only-changes flag prunes tests by intersecting PR-changed files against coverage cache; --build-coverage-cache builds the cache
- New rebuild-cache CI job runs on Phoenix via SLURM when cases.py or the Fortran dependency graph changes (on both PRs and master pushes)
- Dep-change detection greps PR/push diffs for added use/include statements that would invalidate the coverage cache
- Conservative fallbacks: missing cache runs all, missing sim coverage includes test, ALWAYS_RUN_ALL files trigger full suite
- Remove continue-on-error from github CI job (fixes auto-cancellation)
- TEMP: duplicate use in m_bubbles.fpp + remove CMakeLists.txt from ALWAYS_RUN_ALL to test the full cache rebuild pipeline in CI
- 53 unit tests cover core coverage logic

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 4ee892c commit 0572ddb

File tree

15 files changed

+1610
-29
lines changed

15 files changed

+1610
-29
lines changed

.github/file-filter.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,3 +36,6 @@ checkall: &checkall
3636
- *tests
3737
- *scripts
3838
- *yml
39+
40+
cases_py:
41+
- 'toolchain/mfc/test/cases.py'

.github/workflows/frontier/test.sh

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,18 @@ if [ -n "$job_shard" ]; then
1818
shard_opts="--shard $job_shard"
1919
fi
2020

21+
# Only prune tests on PRs; master pushes must run the full suite.
22+
prune_flag=""
23+
if [ "$GITHUB_EVENT_NAME" = "pull_request" ]; then
24+
prune_flag="--only-changes"
25+
fi
26+
2127
if [ "$job_device" = "gpu" ]; then
2228
rdma_opts=""
2329
if [ "$job_cluster" = "frontier" ]; then
2430
rdma_opts="--rdma-mpi"
2531
fi
26-
./mfc.sh test -v -a $rdma_opts --max-attempts 3 -j $ngpus $device_opts $shard_opts -- -c $job_cluster
32+
./mfc.sh test -v -a $rdma_opts --max-attempts 3 $prune_flag -j $ngpus $device_opts $shard_opts -- -c $job_cluster
2733
else
28-
./mfc.sh test -v -a --max-attempts 3 -j 32 --no-gpu $shard_opts -- -c $job_cluster
34+
./mfc.sh test -v -a --max-attempts 3 $prune_flag -j 32 --no-gpu $shard_opts -- -c $job_cluster
2935
fi

.github/workflows/frontier_amd/test.sh

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,18 @@ if [ -n "$job_shard" ]; then
1818
shard_opts="--shard $job_shard"
1919
fi
2020

21+
# Only prune tests on PRs; master pushes must run the full suite.
22+
prune_flag=""
23+
if [ "$GITHUB_EVENT_NAME" = "pull_request" ]; then
24+
prune_flag="--only-changes"
25+
fi
26+
2127
if [ "$job_device" = "gpu" ]; then
2228
rdma_opts=""
2329
if [ "$job_cluster" = "frontier" ]; then
2430
rdma_opts="--rdma-mpi"
2531
fi
26-
./mfc.sh test -v -a $rdma_opts --max-attempts 3 -j $ngpus $device_opts $shard_opts -- -c $job_cluster
32+
./mfc.sh test -v -a $rdma_opts --max-attempts 3 $prune_flag -j $ngpus $device_opts $shard_opts -- -c $job_cluster
2733
else
28-
./mfc.sh test -v -a --max-attempts 3 -j 32 --no-gpu $shard_opts -- -c $job_cluster
34+
./mfc.sh test -v -a --max-attempts 3 $prune_flag -j 32 --no-gpu $shard_opts -- -c $job_cluster
2935
fi

.github/workflows/phoenix/rebuild-cache.sh

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
#!/bin/bash
2+
set -e
3+
4+
# Number of parallel jobs: use SLURM allocation or default to 24.
5+
# Cap at 64 to avoid overwhelming MPI's ORTE daemons with concurrent launches.
6+
NJOBS="${SLURM_CPUS_ON_NODE:-24}"
7+
if [ "$NJOBS" -gt 64 ]; then NJOBS=64; fi
8+
9+
# Clean stale build artifacts: the self-hosted runner may have a cached
10+
# GPU build (e.g. --gpu mp) whose CMake flags are incompatible with gcov.
11+
./mfc.sh clean
12+
13+
# Build MFC with gcov coverage instrumentation (CPU-only, gfortran).
14+
# -j 8 for compilation (memory-heavy; more cores don't help much).
15+
./mfc.sh build --gcov -j 8
16+
17+
# Run all tests in parallel, collecting per-test coverage data.
18+
# Each test gets an isolated GCOV_PREFIX directory so .gcda files
19+
# don't collide. Coverage is collected per-test after all tests finish.
20+
# --gcov is required so the internal build step preserves instrumentation.
21+
./mfc.sh test --build-coverage-cache --gcov -j "$NJOBS"

.github/workflows/phoenix/submit.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,9 @@ case "$script_basename" in
2424
esac
2525

2626
sbatch_cpu_opts="\
27-
#SBATCH -p cpu-small # partition
28-
#SBATCH --ntasks-per-node=24 # Number of cores per node required
29-
#SBATCH --mem-per-cpu=2G # Memory per core\
27+
#SBATCH -p cpu-gnr # partition (full Granite Rapids node)
28+
#SBATCH --exclusive # exclusive access to all cores
29+
#SBATCH -C graniterapids # constrain to GNR architecture\
3030
"
3131

3232
if [ "$job_type" = "bench" ]; then

.github/workflows/phoenix/test.sh

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,9 @@ while [ $attempt -le $max_attempts ]; do
5151
attempt=$((attempt + 1))
5252
done
5353

54-
n_test_threads=8
54+
# Use up to 64 parallel test threads on CPU (GNR nodes have 192 cores).
55+
# Cap at 64 to avoid overwhelming MPI's ORTE daemons with concurrent launches.
56+
n_test_threads=$(( SLURM_CPUS_ON_NODE > 64 ? 64 : ${SLURM_CPUS_ON_NODE:-8} ))
5557

5658
if [ "$job_device" = "gpu" ]; then
5759
gpu_count=$(nvidia-smi -L | wc -l) # number of GPUs on node
@@ -60,4 +62,10 @@ if [ "$job_device" = "gpu" ]; then
6062
n_test_threads=`expr $gpu_count \* 2`
6163
fi
6264

63-
./mfc.sh test -v --max-attempts 3 -a -j $n_test_threads $device_opts -- -c phoenix
65+
# Only prune tests on PRs; master pushes must run the full suite.
66+
prune_flag=""
67+
if [ "$GITHUB_EVENT_NAME" = "pull_request" ]; then
68+
prune_flag="--only-changes"
69+
fi
70+
71+
./mfc.sh test -v --max-attempts 3 $prune_flag -a -j $n_test_threads $device_opts -- -c phoenix

.github/workflows/test.yml

Lines changed: 122 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -56,22 +56,106 @@ jobs:
5656
file-changes:
5757
name: Detect File Changes
5858
runs-on: 'ubuntu-latest'
59-
outputs:
59+
outputs:
6060
checkall: ${{ steps.changes.outputs.checkall }}
61+
cases_py: ${{ steps.changes.outputs.cases_py }}
62+
dep_changed: ${{ steps.dep-check.outputs.dep_changed }}
6163
steps:
6264
- name: Clone
6365
uses: actions/checkout@v4
6466

6567
- name: Detect Changes
6668
uses: dorny/paths-filter@v3
6769
id: changes
68-
with:
70+
with:
6971
filters: ".github/file-filter.yml"
7072

73+
- name: Check for Fortran dependency changes
74+
id: dep-check
75+
env:
76+
GH_TOKEN: ${{ github.token }}
77+
run: |
78+
# Detect added use/include statements (removals are not matched) that change the
79+
# Fortran dependency graph, which would make the coverage cache stale.
80+
if [ "${{ github.event_name }}" = "pull_request" ]; then
81+
DIFF=$(gh pr diff ${{ github.event.pull_request.number }})
82+
elif [ "${{ github.event_name }}" = "push" ]; then
83+
DIFF=$(git diff ${{ github.event.before }}..${{ github.event.after }} 2>/dev/null || echo "")
84+
else
85+
DIFF=""
86+
fi
87+
if echo "$DIFF" | \
88+
grep -qP '^\+\s*(use[\s,]+\w|#:include\s|include\s+['"'"'"])'; then
89+
echo "dep_changed=true" >> "$GITHUB_OUTPUT"
90+
echo "Fortran dependency change detected — will rebuild coverage cache."
91+
else
92+
echo "dep_changed=false" >> "$GITHUB_OUTPUT"
93+
fi
94+
95+
rebuild-cache:
96+
name: Rebuild Coverage Cache
97+
needs: [lint-gate, file-changes]
98+
if: >-
99+
github.repository == 'MFlowCode/MFC' &&
100+
(
101+
(github.event_name == 'pull_request' &&
102+
(needs.file-changes.outputs.cases_py == 'true' ||
103+
needs.file-changes.outputs.dep_changed == 'true')) ||
104+
(github.event_name == 'push' &&
105+
(needs.file-changes.outputs.cases_py == 'true' ||
106+
needs.file-changes.outputs.dep_changed == 'true')) ||
107+
github.event_name == 'workflow_dispatch'
108+
)
109+
timeout-minutes: 240
110+
runs-on:
111+
group: phoenix
112+
labels: gt
113+
permissions:
114+
contents: write
115+
steps:
116+
- name: Clone
117+
uses: actions/checkout@v4
118+
with:
119+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
120+
clean: false
121+
122+
- name: Rebuild Cache via SLURM
123+
run: bash .github/workflows/phoenix/submit.sh .github/workflows/phoenix/rebuild-cache.sh cpu none
124+
125+
- name: Print Logs
126+
if: always()
127+
run: cat rebuild-cache-cpu-none.out
128+
129+
- name: Upload Cache Artifact
130+
if: github.event_name == 'pull_request'
131+
uses: actions/upload-artifact@v4
132+
with:
133+
name: coverage-cache
134+
path: toolchain/mfc/test/test_coverage_cache.json.gz
135+
retention-days: 1
136+
137+
- name: Commit Cache to Master
138+
if: github.event_name == 'push' || github.event_name == 'workflow_dispatch'
139+
run: |
140+
git config user.name "github-actions[bot]"
141+
git config user.email "github-actions[bot]@users.noreply.github.com"
142+
git add toolchain/mfc/test/test_coverage_cache.json.gz
143+
if git diff --cached --quiet; then
144+
echo "Coverage cache unchanged."
145+
else
146+
git commit -m "Regenerate gcov coverage cache [skip ci]"
147+
git push
148+
fi
149+
71150
github:
72151
name: Github
73-
if: needs.file-changes.outputs.checkall == 'true'
74-
needs: [lint-gate, file-changes]
152+
needs: [lint-gate, file-changes, rebuild-cache]
153+
if: >-
154+
always() &&
155+
needs.lint-gate.result == 'success' &&
156+
needs.file-changes.result == 'success' &&
157+
needs.rebuild-cache.result != 'cancelled' &&
158+
needs.file-changes.outputs.checkall == 'true'
75159
strategy:
76160
matrix:
77161
os: ['ubuntu', 'macos']
@@ -91,13 +175,26 @@ jobs:
91175
intel: false
92176

93177
fail-fast: false
94-
continue-on-error: true
95178
runs-on: ${{ matrix.os }}-latest
96179

97180
steps:
98181
- name: Clone
99182
uses: actions/checkout@v4
100183

184+
- name: Fetch master for coverage diff
185+
run: |
186+
git fetch origin master:master --depth=1
187+
git fetch --deepen=200
188+
continue-on-error: true
189+
190+
- name: Download Coverage Cache
191+
if: needs.rebuild-cache.result == 'success'
192+
uses: actions/download-artifact@v4
193+
with:
194+
name: coverage-cache
195+
path: toolchain/mfc/test
196+
continue-on-error: true
197+
101198
- name: Setup MacOS
102199
if: matrix.os == 'macos'
103200
run: |
@@ -159,7 +256,7 @@ jobs:
159256
run: |
160257
rm -f tests/failed_uuids.txt
161258
TEST_EXIT=0
162-
/bin/bash mfc.sh test -v --max-attempts 3 -j "$(nproc)" $TEST_ALL $TEST_PCT || TEST_EXIT=$?
259+
/bin/bash mfc.sh test -v --max-attempts 3 -j "$(nproc)" $ONLY_CHANGES $TEST_ALL $TEST_PCT || TEST_EXIT=$?
163260
164261
# Retry only if a small number of tests failed (sporadic failures)
165262
if [ -s tests/failed_uuids.txt ]; then
@@ -180,11 +277,19 @@ jobs:
180277
env:
181278
TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }}
182279
TEST_PCT: ${{ matrix.debug == 'debug' && '-% 20' || '' }}
280+
ONLY_CHANGES: ${{ github.event_name == 'pull_request' && '--only-changes' || '' }}
183281

184282
self:
185283
name: "${{ matrix.cluster_name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }}${{ matrix.shard != '' && format(' [{0}]', matrix.shard) || '' }})"
186-
if: github.repository == 'MFlowCode/MFC' && needs.file-changes.outputs.checkall == 'true' && github.event.pull_request.draft != true
187-
needs: [lint-gate, file-changes]
284+
needs: [lint-gate, file-changes, rebuild-cache]
285+
if: >-
286+
always() &&
287+
needs.lint-gate.result == 'success' &&
288+
needs.file-changes.result == 'success' &&
289+
needs.rebuild-cache.result != 'cancelled' &&
290+
github.repository == 'MFlowCode/MFC' &&
291+
needs.file-changes.outputs.checkall == 'true' &&
292+
github.event.pull_request.draft != true
188293
continue-on-error: false
189294
timeout-minutes: 480
190295
strategy:
@@ -265,6 +370,14 @@ jobs:
265370
with:
266371
clean: false
267372

373+
- name: Download Coverage Cache
374+
if: needs.rebuild-cache.result == 'success'
375+
uses: actions/download-artifact@v4
376+
with:
377+
name: coverage-cache
378+
path: toolchain/mfc/test
379+
continue-on-error: true
380+
268381
- name: Build
269382
if: matrix.cluster != 'phoenix'
270383
uses: nick-fields/retry@ce71cc2ab81d554ebbe88c79ab5975992d79ba08 # v3
@@ -299,3 +412,4 @@ jobs:
299412
with:
300413
name: logs-${{ strategy.job-index }}-${{ steps.log.outputs.slug }}
301414
path: ${{ steps.log.outputs.slug }}.out
415+

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@ __pycache__
2222
# Auto-generated version file
2323
toolchain/mfc/_version.py
2424

25+
# Raw coverage cache — legacy, not tracked (the .json.gz version IS committed)
26+
toolchain/mfc/test/test_coverage_cache.json
27+
2528
# Auto-generated toolchain files (regenerate with: ./mfc.sh generate)
2629
toolchain/completions/mfc.bash
2730
toolchain/completions/_mfc

CMakeLists.txt

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -131,13 +131,20 @@ if (CMAKE_Fortran_COMPILER_ID STREQUAL "GNU")
131131
add_compile_options(
132132
$<$<COMPILE_LANGUAGE:Fortran>:-fprofile-arcs>
133133
$<$<COMPILE_LANGUAGE:Fortran>:-ftest-coverage>
134-
$<$<COMPILE_LANGUAGE:Fortran>:-O1>
135-
)
134+
)
136135

137136
add_link_options(
138137
$<$<COMPILE_LANGUAGE:Fortran>:-lgcov>
139138
$<$<COMPILE_LANGUAGE:Fortran>:--coverage>
140139
)
140+
141+
# Override Release -O3 with -O1 for gcov: coverage instrumentation is
142+
# inaccurate at -O3, and aggressive codegen (e.g. AVX-512 FP16 on
143+
# Granite Rapids) can emit instructions that older assemblers reject.
144+
set(CMAKE_Fortran_FLAGS_RELEASE "-O1 -DNDEBUG" CACHE STRING "" FORCE)
145+
146+
# Use gfortran5 line markers so gcov can map coverage to .fpp sources.
147+
set(FYPP_GCOV_OPTS "--line-marker-format=gfortran5")
141148
endif()
142149

143150
if (CMAKE_BUILD_TYPE STREQUAL "Debug")
@@ -224,18 +231,25 @@ endif()
224231

225232
if (CMAKE_BUILD_TYPE STREQUAL "Release")
226233
# Processor tuning: Check if we can target the host's native CPU's ISA.
227-
CHECK_FORTRAN_COMPILER_FLAG("-march=native" SUPPORTS_MARCH_NATIVE)
228-
if (SUPPORTS_MARCH_NATIVE)
229-
add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-march=native>)
230-
else()
231-
CHECK_FORTRAN_COMPILER_FLAG("-mcpu=native" SUPPORTS_MCPU_NATIVE)
232-
if (SUPPORTS_MCPU_NATIVE)
233-
add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-mcpu=native>)
234+
# Skip for gcov builds — -march=native on newer CPUs (e.g. Granite Rapids)
235+
# can emit instructions the system assembler doesn't support.
236+
if (NOT MFC_GCov)
237+
CHECK_FORTRAN_COMPILER_FLAG("-march=native" SUPPORTS_MARCH_NATIVE)
238+
if (SUPPORTS_MARCH_NATIVE)
239+
add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-march=native>)
240+
else()
241+
CHECK_FORTRAN_COMPILER_FLAG("-mcpu=native" SUPPORTS_MCPU_NATIVE)
242+
if (SUPPORTS_MCPU_NATIVE)
243+
add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-mcpu=native>)
244+
endif()
234245
endif()
235246
endif()
236247

237-
# Enable LTO/IPO if supported
238-
if (CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC")
248+
# Enable LTO/IPO if supported (skip for gcov — LTO interferes with coverage
249+
# instrumentation and can trigger assembler errors on newer architectures).
250+
if (MFC_GCov)
251+
message(STATUS "LTO/IPO disabled for gcov build")
252+
elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC")
239253
if (MFC_Unified)
240254
message(STATUS "LTO/IPO is not available with NVHPC using Unified Memory")
241255
elseif (CMAKE_Fortran_COMPILER_VERSION VERSION_GREATER "24.11" AND CMAKE_Fortran_COMPILER_VERSION VERSION_LESS "25.9")
@@ -381,6 +395,7 @@ macro(HANDLE_SOURCES target useCommon)
381395
--no-folding
382396
--line-length=999
383397
--line-numbering-mode=nocontlines
398+
${FYPP_GCOV_OPTS}
384399
"${fpp}" "${f90}"
385400
DEPENDS "${fpp};${${target}_incs}"
386401
COMMENT "Preprocessing (Fypp) ${fpp_filename}"

src/simulation/m_bubbles.fpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ module m_bubbles
1616
use m_variables_conversion !< State variables type conversion procedures
1717

1818
use m_helper_basic !< Functions to compare floating point numbers
19+
use m_helper_basic
1920

2021
implicit none
2122

0 commit comments

Comments
 (0)