Added CI for ROCm 7.0 #2
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Workflow identity, triggers and run-deduplication policy.
name: Iris Development Tests

# Runs for pushes to main, pull requests targeting main, and manual dispatch.
on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
  workflow_dispatch:

# One concurrency group per workflow and PR head branch (or ref when there is
# no head branch). A newer run cancels an in-progress one unless the ref is
# main.
concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

jobs:
| build-apptainer-image: | |
| runs-on: [self-hosted, mi3008x] | |
| timeout-minutes: 90 | |
| strategy: | |
| matrix: | |
| rocm_version: ["6.3.1", "7.0"] | |
| steps: | |
| - name: Checkout repository | |
| uses: actions/checkout@v4 | |
| - name: Setup Apptainer | |
| run: | | |
| apt-get update && apt-get install -y software-properties-common | |
| add-apt-repository -y ppa:apptainer/ppa | |
| apt-get update && apt-get install -y apptainer | |
| - name: Build Iris Apptainer container | |
| run: | | |
| # Create persistent Apptainer directory | |
| mkdir -p ~/apptainer | |
| # Compute hash of the definition file | |
| DEF_FILE="apptainer/iris-rocm${{ matrix.rocm_version }}.def" | |
| CURRENT_HASH=$(sha256sum "$DEF_FILE" | awk '{print $1}') | |
| HASH_FILE=~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif.sha256 | |
| # Check if we need to rebuild | |
| REBUILD=false | |
| if [ ! -f ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif ]; then | |
| echo "Apptainer image not found. Building new image for ROCm ${{ matrix.rocm_version }}..." | |
| REBUILD=true | |
| elif [ ! -f "$HASH_FILE" ]; then | |
| echo "Hash file not found. Rebuilding image for ROCm ${{ matrix.rocm_version }}..." | |
| REBUILD=true | |
| else | |
| STORED_HASH=$(cat "$HASH_FILE") | |
| if [ "$CURRENT_HASH" != "$STORED_HASH" ]; then | |
| echo "Definition file has changed. Rebuilding image for ROCm ${{ matrix.rocm_version }}..." | |
| echo " Previous hash: $STORED_HASH" | |
| echo " Current hash: $CURRENT_HASH" | |
| REBUILD=true | |
| else | |
| echo "Using existing Apptainer image for ROCm ${{ matrix.rocm_version }} (hash: $CURRENT_HASH)" | |
| fi | |
| fi | |
| # Build if needed | |
| if [ "$REBUILD" = true ]; then | |
| apptainer build --force ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif "$DEF_FILE" | |
| echo "$CURRENT_HASH" > "$HASH_FILE" | |
| echo "Successfully built and stored hash: $CURRENT_HASH" | |
| fi | |
| test-1-2-4-ranks: | |
| name: Test 1/2/4 Ranks (Parallel) - ROCm ${{ matrix.rocm_version }} | |
| needs: build-apptainer-image | |
| runs-on: [self-hosted, mi3008x] | |
| timeout-minutes: 20 | |
| strategy: | |
| matrix: | |
| rocm_version: ["6.3.1", "7.0"] | |
| steps: | |
| - name: Checkout repository | |
| uses: actions/checkout@v4 | |
| with: | |
| fetch-depth: 0 | |
| - name: Run 1, 2, 4 rank tests in parallel | |
| run: | | |
| # Run tests in parallel with different GPU assignments | |
| # Note: Each test gets 2+ GPUs even if it only uses some of them. | |
| # This allows tests like test_empty_device_handling to verify that | |
| # allocating on a different device correctly raises an error. | |
| # Create unique overlay images for isolation | |
| OVERLAY_1="/tmp/iris_overlay_$(whoami)_1rank_$(date +%s%N).img" | |
| OVERLAY_2="/tmp/iris_overlay_$(whoami)_2rank_$(date +%s%N).img" | |
| OVERLAY_4="/tmp/iris_overlay_$(whoami)_4rank_$(date +%s%N).img" | |
| echo "::group::Creating overlay images" | |
| apptainer overlay create --size 1024 --create-dir /var/cache/iris "${OVERLAY_1}" | |
| apptainer overlay create --size 1024 --create-dir /var/cache/iris "${OVERLAY_2}" | |
| apptainer overlay create --size 1024 --create-dir /var/cache/iris "${OVERLAY_4}" | |
| echo "::endgroup::" | |
| echo "::group::Starting parallel tests" | |
| echo "Starting 1-rank test on GPUs 0,1..." | |
| apptainer exec --overlay "${OVERLAY_1}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="0,1" \ | |
| --bind "${PWD}:/iris_workspace" --cwd /iris_workspace \ | |
| ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif bash -c " | |
| pip install -e . | |
| bash .github/scripts/run_tests.sh 1 | |
| " & | |
| PID1=$! | |
| echo "Starting 2-rank test on GPUs 2,3..." | |
| apptainer exec --overlay "${OVERLAY_2}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="2,3" \ | |
| --bind "${PWD}:/iris_workspace" --cwd /iris_workspace \ | |
| ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif bash -c " | |
| pip install -e . | |
| bash .github/scripts/run_tests.sh 2 | |
| " & | |
| PID2=$! | |
| echo "Starting 4-rank test on GPUs 4,5,6,7..." | |
| apptainer exec --overlay "${OVERLAY_4}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="4,5,6,7" \ | |
| --bind "${PWD}:/iris_workspace" --cwd /iris_workspace \ | |
| ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif bash -c " | |
| pip install -e . | |
| bash .github/scripts/run_tests.sh 4 | |
| " & | |
| PID4=$! | |
| echo "::endgroup::" | |
| # Wait for all parallel tests and track failures | |
| echo "::group::Waiting for parallel tests to complete" | |
| FAIL=0 | |
| FAILED_TESTS="" | |
| wait $PID1 || { echo "::error::1-rank test FAILED"; FAILED_TESTS="$FAILED_TESTS 1-rank"; FAIL=1; } | |
| wait $PID2 || { echo "::error::2-rank test FAILED"; FAILED_TESTS="$FAILED_TESTS 2-rank"; FAIL=1; } | |
| wait $PID4 || { echo "::error::4-rank test FAILED"; FAILED_TESTS="$FAILED_TESTS 4-rank"; FAIL=1; } | |
| echo "::endgroup::" | |
| # Cleanup overlay images | |
| echo "::group::Cleaning up overlay images" | |
| rm -f "${OVERLAY_1}" "${OVERLAY_2}" "${OVERLAY_4}" | |
| echo "::endgroup::" | |
| if [ $FAIL -eq 1 ]; then | |
| echo "::error::Parallel tests failed:$FAILED_TESTS" | |
| exit 1 | |
| fi | |
| echo "✅ All parallel tests (1, 2, 4 ranks) passed!" | |
| test-8-ranks: | |
| name: Test 8 Ranks - ROCm ${{ matrix.rocm_version }} | |
| needs: build-apptainer-image | |
| runs-on: [self-hosted, mi3008x] | |
| timeout-minutes: 30 | |
| strategy: | |
| matrix: | |
| rocm_version: ["6.3.1", "7.0"] | |
| steps: | |
| - name: Checkout repository | |
| uses: actions/checkout@v4 | |
| with: | |
| fetch-depth: 0 | |
| - name: Run 8-rank test | |
| run: | | |
| # Create unique overlay image for isolation | |
| OVERLAY_8="/tmp/iris_overlay_$(whoami)_8rank_$(date +%s%N).img" | |
| echo "::group::Creating overlay image" | |
| apptainer overlay create --size 1024 --create-dir /var/cache/iris "${OVERLAY_8}" | |
| echo "::endgroup::" | |
| echo "::group::Running 8-rank test on all GPUs" | |
| apptainer exec --overlay "${OVERLAY_8}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ | |
| --bind "${PWD}:/iris_workspace" --cwd /iris_workspace \ | |
| ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif bash -c " | |
| pip install -e . | |
| bash .github/scripts/run_tests.sh 8 | |
| " | |
| echo "::endgroup::" | |
| # Cleanup overlay image | |
| echo "::group::Cleaning up overlay image" | |
| rm -f "${OVERLAY_8}" | |
| echo "::endgroup::" | |
| echo "✅ 8-rank test passed!" |