From 97c108e27a698dcc1fee030a4ee5ec10062f8ba3 Mon Sep 17 00:00:00 2001
From: Luis Machado
Date: Thu, 25 Jun 2026 04:28:04 -0500
Subject: [PATCH 1/2] CI: add GPU hang diagnostics and run only gdb.rocm/simple.exp

Add a pre-test step that dumps dmesg, amd-smi, and rocm-smi output to
help diagnose GPU hangs. Every command in the step is best-effort: on
failure it prints an "... unavailable" line instead of failing the job,
and its stderr is kept in the log so the reason for the failure is
visible.

Supporting changes:
  - add --cap-add=SYSLOG to the test container options, so that dmesg
    has a chance of reading the kernel log from inside the container;
  - pass -DTHEROCK_ENABLE_CORE_AMDSMI=ON in extra_cmake_options.

Restrict the rocgdb test run to a single test file by invoking
test_rocgdb.py with --tests gdb.rocm/simple.exp --max-failed-retries 0.

Co-Authored-By: Claude Sonnet 4
---
Reviewer notes (this section is not part of the commit message):
  * The diagnostic commands use 2>&1 rather than 2>/dev/null; with stderr
    discarded, a missing binary and a tool that ran but failed both show
    up only as "... unavailable".
  * NOTE(review): the -DTHEROCK_ENABLE_ALL / -DTHEROCK_BUILD_TESTING lines
    are removed and re-added with identical visible text. Presumably a
    whitespace-only cleanup; confirm, or drop that churn from this patch.
  * NOTE(review): indentation and blank context lines were reconstructed
    and the "index" blob ids predate the edits above; regenerate the
    series with git format-patch before applying.

 .github/workflows/therock-ci.yml            |  5 +-
 .github/workflows/therock-test-packages.yml | 52 ++++++++++++++++++++-
 2 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/therock-ci.yml b/.github/workflows/therock-ci.yml
index 75dd291b60b..cb858e70ce0 100644
--- a/.github/workflows/therock-ci.yml
+++ b/.github/workflows/therock-ci.yml
@@ -53,9 +53,10 @@ jobs:
       amdgpu_families: ${{ matrix.amdgpu_family }}
       artifact_group: ${{ matrix.amdgpu_family }}
       extra_cmake_options: >
-        -DTHEROCK_ENABLE_ALL=OFF
-        -DTHEROCK_BUILD_TESTING=ON
+        -DTHEROCK_ENABLE_ALL=OFF
+        -DTHEROCK_BUILD_TESTING=ON
         -DTHEROCK_ENABLE_DEBUG_TOOLS=ON
+        -DTHEROCK_ENABLE_CORE_AMDSMI=ON
         -DTHEROCK_SHARED_PYTHON_EXECUTABLES=/opt/python-shared/cp310-cp310/bin/python3;/opt/python-shared/cp311-cp311/bin/python3;/opt/python-shared/cp312-cp312/bin/python3;/opt/python-shared/cp313-cp313/bin/python3;/opt/python-shared/cp314-cp314/bin/python3
         -DTHEROCK_DIST_PYTHON_EXECUTABLES=/opt/python/cp310-cp310/bin/python;/opt/python/cp311-cp311/bin/python;/opt/python/cp312-cp312/bin/python;/opt/python/cp313-cp313/bin/python
         -DTHEROCK_USE_EXTERNAL_ROCGDB=ON
diff --git a/.github/workflows/therock-test-packages.yml b/.github/workflows/therock-test-packages.yml
index d60c118fb8e..44f52ff18df 100644
--- a/.github/workflows/therock-test-packages.yml
+++ b/.github/workflows/therock-test-packages.yml
@@ -38,7 +38,7 @@ jobs:
 
     container:
       image: ghcr.io/rocm/no_rocm_image_ubuntu24_04_rocgdb@sha256:7063e922b4b9145c92f20011674571f1c97b8fad6faaeb0b7d2d165b0bd9ae8b
-      options: "--cap-add=SYS_PTRACE --ipc host --group-add video --device /dev/kfd --device /dev/dri --group-add 110 --ulimit memlock=-1:-1 --security-opt seccomp=unconfined --env-file /etc/podinfo/gha-gpu-isolation-settings --user 0:0"
+      options: "--cap-add=SYS_PTRACE --cap-add=SYSLOG --ipc host --group-add video --device /dev/kfd --device /dev/dri --group-add 110 --ulimit memlock=-1:-1 --security-opt seccomp=unconfined --env-file /etc/podinfo/gha-gpu-isolation-settings --user 0:0"
 
     defaults:
       run:
@@ -65,6 +65,54 @@ jobs:
           FETCH_ARTIFACT_ARGS: "--debug-tools --tests"
           IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }}
 
+      - name: GPU hang diagnostics
+        env:
+          AMD_SMI: ${{ env.OUTPUT_ARTIFACTS_DIR }}/bin/amd-smi
+          ROCM_SMI: ${{ env.OUTPUT_ARTIFACTS_DIR }}/bin/rocm-smi
+        run: |
+          echo "==== dmesg ===="
+          dmesg || echo "dmesg unavailable"
+
+          echo ""
+          echo "==== amd-smi static ===="
+          "$AMD_SMI" static 2>&1 || echo "amd-smi static unavailable"
+
+          echo ""
+          echo "==== amd-smi process ===="
+          "$AMD_SMI" process 2>&1 || echo "amd-smi process unavailable"
+
+          echo ""
+          echo "==== amd-smi metric ===="
+          "$AMD_SMI" metric 2>&1 || echo "amd-smi metric unavailable"
+
+          echo ""
+          echo "==== amd-smi topology ===="
+          "$AMD_SMI" topology 2>&1 || echo "amd-smi topology unavailable"
+
+          echo ""
+          echo "==== rocm-smi ===="
+          "$ROCM_SMI" 2>&1 || echo "rocm-smi unavailable"
+
+          echo ""
+          echo "==== rocm-smi --showpids ===="
+          "$ROCM_SMI" --showpids 2>&1 || echo "rocm-smi --showpids unavailable"
+
+          echo ""
+          echo "==== rocm-smi --showmeminfo vram ===="
+          "$ROCM_SMI" --showmeminfo vram 2>&1 || echo "rocm-smi --showmeminfo unavailable"
+
+          echo ""
+          echo "==== rocm-smi --showuse ===="
+          "$ROCM_SMI" --showuse 2>&1 || echo "rocm-smi --showuse unavailable"
+
+          echo ""
+          echo "==== rocm-smi --showclkfrq ===="
+          "$ROCM_SMI" --showclkfrq 2>&1 || echo "rocm-smi --showclkfrq unavailable"
+
+          echo ""
+          echo "==== rocm-smi --showerrors ===="
+          "$ROCM_SMI" --showerrors 2>&1 || echo "rocm-smi --showerrors unavailable"
+
       - name: Run rocgdb tests
         run: |
-          python ${{ env.OUTPUT_ARTIFACTS_DIR }}/tests/rocgdb/test_rocgdb.py
+          python ${{ env.OUTPUT_ARTIFACTS_DIR }}/tests/rocgdb/test_rocgdb.py --tests gdb.rocm/simple.exp --max-failed-retries 0

From 74000a8fc7159d8f799a00365d64ffe63427848d Mon Sep 17 00:00:00 2001
From: Luis Machado
Date: Fri, 26 Jun 2026 15:44:27 -0500
Subject: [PATCH 2/2] CI: reset GPU and dump dmesg before rocgdb tests

Before the rocgdb tests, run "amd-smi reset --gpureset --gpu <entry>"
for every comma-separated entry in ROCR_VISIBLE_DEVICES. A failed reset
only prints a warning, and the step is skipped with a message when the
variable is unset or empty.

Afterwards print the last 400 lines of dmesg. The "dmesg unavailable"
fallback is attached to dmesg itself rather than to the whole pipeline,
so it is printed whether or not the step shell runs with pipefail.
---
Reviewer notes (this section is not part of the commit message):
  * NOTE(review): context lines match patch 1/2 as edited in this series
    (2>&1); the "index" blob ids predate that edit, so regenerate the
    series with git format-patch before applying.

 .github/workflows/therock-test-packages.yml | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/.github/workflows/therock-test-packages.yml b/.github/workflows/therock-test-packages.yml
index 44f52ff18df..116312456c8 100644
--- a/.github/workflows/therock-test-packages.yml
+++ b/.github/workflows/therock-test-packages.yml
@@ -113,6 +113,25 @@ jobs:
           echo "==== rocm-smi --showerrors ===="
           "$ROCM_SMI" --showerrors 2>&1 || echo "rocm-smi --showerrors unavailable"
 
+      - name: Reset GPU(s) in ROCR_VISIBLE_DEVICES
+        env:
+          AMD_SMI: ${{ env.OUTPUT_ARTIFACTS_DIR }}/bin/amd-smi
+        run: |
+          if [ -z "${ROCR_VISIBLE_DEVICES}" ]; then
+            echo "ROCR_VISIBLE_DEVICES is not set; skipping GPU reset"
+          else
+            echo "Resetting GPU(s): ${ROCR_VISIBLE_DEVICES}"
+            IFS=',' read -ra GPUS <<< "${ROCR_VISIBLE_DEVICES}"
+            for gpu in "${GPUS[@]}"; do
+              echo " amd-smi reset --gpureset --gpu ${gpu}"
+              "$AMD_SMI" reset --gpureset --gpu "${gpu}" \
+                || echo "WARNING: amd-smi reset failed for GPU ${gpu}"
+            done
+          fi
+
+      - name: Post-reset dmesg
+        run: '(dmesg || echo "dmesg unavailable") | tail -n 400'
+
       - name: Run rocgdb tests
         run: |
           python ${{ env.OUTPUT_ARTIFACTS_DIR }}/tests/rocgdb/test_rocgdb.py --tests gdb.rocm/simple.exp --max-failed-retries 0