Skip to content

Added CI for ROCm 7.0 #2

Added CI for ROCm 7.0

Added CI for ROCm 7.0 #2

Workflow file for this run

# CI workflow that runs the Iris development tests.
name: Iris Development Tests

# Triggers: pushes to main, pull requests targeting main, and manual runs.
on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
  workflow_dispatch:

# One active run per workflow and PR head branch (or ref). A newer run cancels
# the in-progress one unless the run is on refs/heads/main.
concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

jobs:
build-apptainer-image:
  # Builds (or reuses) the Apptainer image for each ROCm version in the matrix.
  # The image and the SHA-256 of its definition file are kept under ~/apptainer
  # so a later run can skip the build when the definition file is unchanged.
  runs-on: [self-hosted, mi3008x]
  timeout-minutes: 90
  strategy:
    # Build every ROCm version independently; with the default fail-fast a
    # failure in one version cancels the other while it is still building.
    fail-fast: false
    matrix:
      rocm_version: ["6.3.1", "7.0"]
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4
    - name: Setup Apptainer
      run: |
        # Only go through apt and the PPA when Apptainer is missing, so a
        # routine run does not depend on the package mirrors being reachable.
        if command -v apptainer >/dev/null 2>&1; then
          echo "Apptainer already installed: $(apptainer --version)"
          exit 0
        fi
        apt-get update && apt-get install -y software-properties-common
        add-apt-repository -y ppa:apptainer/ppa
        apt-get update && apt-get install -y apptainer
    - name: Build Iris Apptainer container
      env:
        # Passed through the environment instead of being pasted into the
        # script text by expression expansion.
        ROCM_VERSION: ${{ matrix.rocm_version }}
      run: |
        # pipefail makes a failing sha256sum abort the step instead of being
        # hidden behind awk's exit status.
        set -euo pipefail

        # Create persistent Apptainer directory
        IMAGE_DIR="${HOME}/apptainer"
        mkdir -p "${IMAGE_DIR}"

        DEF_FILE="apptainer/iris-rocm${ROCM_VERSION}.def"
        IMAGE="${IMAGE_DIR}/iris-dev-rocm${ROCM_VERSION}.sif"
        HASH_FILE="${IMAGE}.sha256"

        # Compute hash of the definition file
        CURRENT_HASH=$(sha256sum "$DEF_FILE" | awk '{print $1}')

        # Check if we need to rebuild
        REBUILD=false
        if [ ! -f "${IMAGE}" ]; then
          echo "Apptainer image not found. Building new image for ROCm ${ROCM_VERSION}..."
          REBUILD=true
        elif [ ! -f "$HASH_FILE" ]; then
          echo "Hash file not found. Rebuilding image for ROCm ${ROCM_VERSION}..."
          REBUILD=true
        else
          STORED_HASH=$(cat "$HASH_FILE")
          if [ "$CURRENT_HASH" != "$STORED_HASH" ]; then
            echo "Definition file has changed. Rebuilding image for ROCm ${ROCM_VERSION}..."
            echo " Previous hash: $STORED_HASH"
            echo " Current hash: $CURRENT_HASH"
            REBUILD=true
          else
            echo "Using existing Apptainer image for ROCm ${ROCM_VERSION} (hash: $CURRENT_HASH)"
          fi
        fi

        # Build if needed
        if [ "$REBUILD" = true ]; then
          # Build into a scratch file in the same directory and rename it into
          # place only after a successful build. A failed or interrupted build
          # therefore never overwrites the existing image, and the stored hash
          # keeps describing the image that is actually on disk.
          TMP_IMAGE="${IMAGE}.tmp.$$"
          trap 'rm -f "${TMP_IMAGE}"' EXIT
          apptainer build --force "${TMP_IMAGE}" "$DEF_FILE"
          mv -f "${TMP_IMAGE}" "${IMAGE}"
          echo "$CURRENT_HASH" > "$HASH_FILE"
          echo "Successfully built and stored hash: $CURRENT_HASH"
        fi
test-1-2-4-ranks:
  # Runs the 1-, 2- and 4-rank test suites concurrently, each in its own
  # container with its own overlay image and its own set of visible GPUs.
  name: Test 1/2/4 Ranks (Parallel) - ROCm ${{ matrix.rocm_version }}
  needs: build-apptainer-image
  runs-on: [self-hosted, mi3008x]
  timeout-minutes: 20
  strategy:
    # Report results for every ROCm version instead of cancelling the other
    # version as soon as one of them fails.
    fail-fast: false
    matrix:
      rocm_version: ["6.3.1", "7.0"]
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4
      with:
        fetch-depth: 0
    - name: Run 1, 2, 4 rank tests in parallel
      env:
        ROCM_VERSION: ${{ matrix.rocm_version }}
      run: |
        # Run tests in parallel with different GPU assignments
        # Note: Each test gets 2+ GPUs even if it only uses some of them.
        # This allows tests like test_empty_device_handling to verify that
        # allocating on a different device correctly raises an error.
        IMAGE="${HOME}/apptainer/iris-dev-rocm${ROCM_VERSION}.sif"

        # Create unique overlay images for isolation
        OVERLAY_1="/tmp/iris_overlay_$(whoami)_1rank_$(date +%s%N).img"
        OVERLAY_2="/tmp/iris_overlay_$(whoami)_2rank_$(date +%s%N).img"
        OVERLAY_4="/tmp/iris_overlay_$(whoami)_4rank_$(date +%s%N).img"

        # Safety net: remove the overlays however the step ends, so an early
        # failure (for example in `overlay create`) does not leave 1 GiB
        # files behind in /tmp.
        trap 'rm -f "${OVERLAY_1}" "${OVERLAY_2}" "${OVERLAY_4}"' EXIT

        # Run the test suite for one rank count inside its own container.
        #   $1 - rank count handed to run_tests.sh
        #   $2 - GPU list exported as HIP_VISIBLE_DEVICES
        #   $3 - overlay image passed to --overlay
        # `set -e` in the inner script makes a failed `pip install` fail the
        # run instead of going on to the tests.
        # NOTE(review): all three containers bind the same checkout and run
        # `pip install -e .` at the same time; confirm the editable install
        # does not race on files written into the workspace.
        run_ranks() {
          apptainer exec --overlay "$3" --no-home --cleanenv \
            --env HIP_VISIBLE_DEVICES="$2" \
            --bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
            "${IMAGE}" bash -c "
              set -e
              pip install -e .
              bash .github/scripts/run_tests.sh $1
            "
        }

        echo "::group::Creating overlay images"
        apptainer overlay create --size 1024 --create-dir /var/cache/iris "${OVERLAY_1}"
        apptainer overlay create --size 1024 --create-dir /var/cache/iris "${OVERLAY_2}"
        apptainer overlay create --size 1024 --create-dir /var/cache/iris "${OVERLAY_4}"
        echo "::endgroup::"

        echo "::group::Starting parallel tests"
        echo "Starting 1-rank test on GPUs 0,1..."
        run_ranks 1 "0,1" "${OVERLAY_1}" &
        PID1=$!
        echo "Starting 2-rank test on GPUs 2,3..."
        run_ranks 2 "2,3" "${OVERLAY_2}" &
        PID2=$!
        echo "Starting 4-rank test on GPUs 4,5,6,7..."
        run_ranks 4 "4,5,6,7" "${OVERLAY_4}" &
        PID4=$!
        echo "::endgroup::"

        # Wait for all parallel tests and track failures
        echo "::group::Waiting for parallel tests to complete"
        FAIL=0
        FAILED_TESTS=""
        wait $PID1 || { echo "::error::1-rank test FAILED"; FAILED_TESTS="$FAILED_TESTS 1-rank"; FAIL=1; }
        wait $PID2 || { echo "::error::2-rank test FAILED"; FAILED_TESTS="$FAILED_TESTS 2-rank"; FAIL=1; }
        wait $PID4 || { echo "::error::4-rank test FAILED"; FAILED_TESTS="$FAILED_TESTS 4-rank"; FAIL=1; }
        echo "::endgroup::"

        # Cleanup overlay images (the EXIT trap repeats this harmlessly)
        echo "::group::Cleaning up overlay images"
        rm -f "${OVERLAY_1}" "${OVERLAY_2}" "${OVERLAY_4}"
        echo "::endgroup::"

        if [ $FAIL -eq 1 ]; then
          echo "::error::Parallel tests failed:$FAILED_TESTS"
          exit 1
        fi
        echo "✅ All parallel tests (1, 2, 4 ranks) passed!"
test-8-ranks:
  # Runs the 8-rank test suite in a single container that sees GPUs 0-7.
  name: Test 8 Ranks - ROCm ${{ matrix.rocm_version }}
  needs: build-apptainer-image
  runs-on: [self-hosted, mi3008x]
  timeout-minutes: 30
  strategy:
    # Report results for every ROCm version instead of cancelling the other
    # version as soon as one of them fails.
    fail-fast: false
    matrix:
      rocm_version: ["6.3.1", "7.0"]
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4
      with:
        fetch-depth: 0
    - name: Run 8-rank test
      env:
        ROCM_VERSION: ${{ matrix.rocm_version }}
      run: |
        IMAGE="${HOME}/apptainer/iris-dev-rocm${ROCM_VERSION}.sif"

        # Create unique overlay image for isolation
        OVERLAY_8="/tmp/iris_overlay_$(whoami)_8rank_$(date +%s%N).img"

        # Safety net: remove the overlay however the step ends, so an early
        # exit does not leave the 1 GiB file behind in /tmp.
        trap 'rm -f "${OVERLAY_8}"' EXIT

        echo "::group::Creating overlay image"
        apptainer overlay create --size 1024 --create-dir /var/cache/iris "${OVERLAY_8}"
        echo "::endgroup::"

        echo "::group::Running 8-rank test on all GPUs"
        # Capture the exit status instead of letting the shell's errexit abort
        # here, so the log group is closed and the overlay is cleaned up
        # before the failure is reported. `set -e` in the inner script makes a
        # failed `pip install` fail the run instead of going on to the tests.
        STATUS=0
        apptainer exec --overlay "${OVERLAY_8}" --no-home --cleanenv \
          --env HIP_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
          --bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
          "${IMAGE}" bash -c "
            set -e
            pip install -e .
            bash .github/scripts/run_tests.sh 8
          " || STATUS=$?
        echo "::endgroup::"

        # Cleanup overlay image (the EXIT trap repeats this harmlessly)
        echo "::group::Cleaning up overlay image"
        rm -f "${OVERLAY_8}"
        echo "::endgroup::"

        if [ "${STATUS}" -ne 0 ]; then
          echo "::error::8-rank test FAILED"
          exit "${STATUS}"
        fi
        echo "✅ 8-rank test passed!"