Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions .github/workflows/therock-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,10 @@ jobs:
amdgpu_families: ${{ matrix.amdgpu_family }}
artifact_group: ${{ matrix.amdgpu_family }}
extra_cmake_options: >
-DTHEROCK_ENABLE_ALL=OFF
-DTHEROCK_BUILD_TESTING=ON
-DTHEROCK_ENABLE_ALL=OFF
-DTHEROCK_BUILD_TESTING=ON
-DTHEROCK_ENABLE_DEBUG_TOOLS=ON
-DTHEROCK_ENABLE_CORE_AMDSMI=ON
-DTHEROCK_SHARED_PYTHON_EXECUTABLES=/opt/python-shared/cp310-cp310/bin/python3;/opt/python-shared/cp311-cp311/bin/python3;/opt/python-shared/cp312-cp312/bin/python3;/opt/python-shared/cp313-cp313/bin/python3;/opt/python-shared/cp314-cp314/bin/python3
-DTHEROCK_DIST_PYTHON_EXECUTABLES=/opt/python/cp310-cp310/bin/python;/opt/python/cp311-cp311/bin/python;/opt/python/cp312-cp312/bin/python;/opt/python/cp313-cp313/bin/python
-DTHEROCK_USE_EXTERNAL_ROCGDB=ON
Expand Down
71 changes: 69 additions & 2 deletions .github/workflows/therock-test-packages.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ jobs:

container:
# The image reference carries an explicit sha256 digest.
image: ghcr.io/rocm/no_rocm_image_ubuntu24_04_rocgdb@sha256:7063e922b4b9145c92f20011674571f1c97b8fad6faaeb0b7d2d165b0bd9ae8b
# NOTE(review): two `options:` lines are present; they differ only by
# `--cap-add=SYSLOG`. This looks like the before/after pair of a rendered
# diff rather than a real duplicate key - confirm against the workflow file.
# NOTE(review): SYSLOG is presumably added so that the `dmesg` calls in the
# diagnostic steps can read the kernel log inside the container - confirm.
options: "--cap-add=SYS_PTRACE --ipc host --group-add video --device /dev/kfd --device /dev/dri --group-add 110 --ulimit memlock=-1:-1 --security-opt seccomp=unconfined --env-file /etc/podinfo/gha-gpu-isolation-settings --user 0:0"
options: "--cap-add=SYS_PTRACE --cap-add=SYSLOG --ipc host --group-add video --device /dev/kfd --device /dev/dri --group-add 110 --ulimit memlock=-1:-1 --security-opt seccomp=unconfined --env-file /etc/podinfo/gha-gpu-isolation-settings --user 0:0"

defaults:
run:
Expand All @@ -65,6 +65,73 @@ jobs:
FETCH_ARTIFACT_ARGS: "--debug-tools --tests"
IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }}

- name: GPU hang diagnostics
  env:
    AMD_SMI: ${{ env.OUTPUT_ARTIFACTS_DIR }}/bin/amd-smi
    ROCM_SMI: ${{ env.OUTPUT_ARTIFACTS_DIR }}/bin/rocm-smi
  run: |
    # Best-effort snapshot of the kernel log and GPU state. Every probe is
    # allowed to fail: a failing tool only prints an "... unavailable" note.

    # report <title> <fallback-label> <command...>
    # Prints a blank line and a banner, runs the command with stderr
    # discarded, and prints "<fallback-label> unavailable" if it fails.
    report() {
      title="$1"
      label="$2"
      shift 2
      echo ""
      echo "==== ${title} ===="
      "$@" 2>/dev/null || echo "${label} unavailable"
    }

    # Kernel log comes first: no leading blank line, stderr stays visible.
    echo "==== dmesg ===="
    dmesg || echo "dmesg unavailable"

    for sub in static process metric topology; do
      report "amd-smi ${sub}" "amd-smi ${sub}" "$AMD_SMI" "${sub}"
    done

    report "rocm-smi" "rocm-smi" "$ROCM_SMI"
    report "rocm-smi --showpids" "rocm-smi --showpids" "$ROCM_SMI" --showpids
    # Note: the fallback text for this probe does not include "vram".
    report "rocm-smi --showmeminfo vram" "rocm-smi --showmeminfo" \
      "$ROCM_SMI" --showmeminfo vram

    for flag in --showuse --showclkfrq --showerrors; do
      report "rocm-smi ${flag}" "rocm-smi ${flag}" "$ROCM_SMI" "${flag}"
    done

- name: Reset GPU(s) in ROCR_VISIBLE_DEVICES
  env:
    AMD_SMI: ${{ env.OUTPUT_ARTIFACTS_DIR }}/bin/amd-smi
  run: |
    # Issue one `amd-smi reset --gpureset` per entry of the comma-separated
    # ROCR_VISIBLE_DEVICES list; do nothing when the variable is empty/unset.
    if [ -n "${ROCR_VISIBLE_DEVICES}" ]; then
      echo "Resetting GPU(s): ${ROCR_VISIBLE_DEVICES}"
      # Split on commas into one array element per device.
      IFS=',' read -ra device_ids <<< "${ROCR_VISIBLE_DEVICES}"
      for device in "${device_ids[@]}"; do
        echo " amd-smi reset --gpureset --gpu ${device}"
        # A failed reset is reported but does not fail the step.
        if ! "$AMD_SMI" reset --gpureset --gpu "${device}"; then
          echo "WARNING: amd-smi reset failed for GPU ${device}"
        fi
      done
    else
      echo "ROCR_VISIBLE_DEVICES is not set; skipping GPU reset"
    fi

- name: Post-reset dmesg
  run: |
    # Print the last 400 lines of dmesg output; if the pipeline reports a
    # failure, print a note instead of failing the step.
    # NOTE(review): whether a dmesg failure is visible through the pipe
    # depends on the shell running with pipefail - confirm the job's shell.
    if ! dmesg | tail -n 400; then
      echo "dmesg unavailable"
    fi

- name: Run rocgdb tests
run: |
# Invokes the rocgdb test driver shipped under OUTPUT_ARTIFACTS_DIR.
# NOTE(review): the two commands below differ only by the trailing
# `--tests gdb.rocm/simple.exp --max-failed-retries 0` arguments and look
# like the removed/added pair of a rendered diff - presumably only the
# second one is live in the workflow; confirm against the actual file.
python ${{ env.OUTPUT_ARTIFACTS_DIR }}/tests/rocgdb/test_rocgdb.py
python ${{ env.OUTPUT_ARTIFACTS_DIR }}/tests/rocgdb/test_rocgdb.py --tests gdb.rocm/simple.exp --max-failed-retries 0
Loading