# Implement flock-based bitmap GPU allocator for CI parallelization (#632)
# Viewer notice: this file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters.
# Performance regression workflow for Iris.
#
# Job flow:
#   1. build-container-image - ensures a container runtime is present and
#      builds the Iris container via .github/scripts/container_build.sh.
#   2. performance-test      - runs one benchmark per matrix entry via
#      .github/scripts/run_perf_benchmark.sh.
name: Iris Performance Regression Test

# Runs on pushes and pull requests targeting main, and on manual dispatch.
on:
  push:
    branches: [main]
  pull_request:
    branches: [main]
  workflow_dispatch:

# One concurrency group per workflow + branch/PR. An in-progress run is
# cancelled by a newer one unless the ref is main.
concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

env:
  # Repository variable DOCKER_IMAGE_NAME wins; otherwise use the default tag.
  DOCKER_IMAGE_NAME: ${{ vars.DOCKER_IMAGE_NAME || 'iris-dev-triton-aafec41' }}

jobs:
  build-container-image:
    runs-on: [self-hosted, mi3xx]
    timeout-minutes: 20
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      # Installs Apptainer only when neither Apptainer nor Docker is on PATH.
      # NOTE(review): apt-get / add-apt-repository are invoked without sudo,
      # which presumably relies on the runner user being root - confirm.
      - name: Setup Apptainer (if not available)
        run: |
          if ! command -v apptainer &> /dev/null && ! command -v docker &> /dev/null; then
            echo "Neither Apptainer nor Docker found, installing Apptainer..."
            apt-get update && apt-get install -y software-properties-common
            add-apt-repository -y ppa:apptainer/ppa
            apt-get update && apt-get install -y apptainer
          else
            echo "Container runtime already available"
          fi

      - name: Build Iris container
        run: |
          # Use the universal container build script
          bash .github/scripts/container_build.sh

  performance-test:
    name: ${{ matrix.example_name }}
    needs: build-container-image
    runs-on: [self-hosted, mi3xx]
    timeout-minutes: 30
    strategy:
      # Let every benchmark finish even if another matrix entry fails.
      fail-fast: false
      matrix:
        # Performance baselines measured on AMD Instinct MI325X (8 GPUs)
        #
        # Per-entry fields:
        #   example_name     - display name used for the job and step.
        #   example_path     - first argument to run_perf_benchmark.sh.
        #   tflops_threshold - second argument to run_perf_benchmark.sh;
        #                      presumably the minimum acceptable TFLOPs
        #                      (enforced by that script, not shown here).
        #   benchmark_args   - remaining arguments, forwarded as-is.
        include:
          # Disabled https://github.com/ROCm/iris/issues/238
          # - example_name: "GEMM All-Scatter WG Specialization"
          #   example_path: "10_gemm_all_scatter_wg_specialization"
          #   tflops_threshold: 1600  # Actual: ~2182 TFLOPs
          #   benchmark_args: "-m 16384 -n 16384 -k 16384 --BLK_M 128 --BLK_N 128 --BLK_K 64 --gsize_m 6 --gemm_sms 256"

          - example_name: "GEMM All-Scatter"
            example_path: "07_gemm_all_scatter"
            tflops_threshold: 1000  # Actual: ~1407 TFLOPs
            benchmark_args: "-m 16384 -n 16384 -k 16384 --BLK_M 256 --BLK_N 64 --BLK_K 64 --gsize_m 6 --gemm_sms 256"

          - example_name: "GEMM All-Scatter Producer-Consumer"
            example_path: "11_gemm_all_scatter_producer_consumer"
            tflops_threshold: 1600  # Actual: ~2190 TFLOPs
            benchmark_args: "-m 16384 -n 16384 -k 16384 --BLK_M 128 --BLK_N 128 --BLK_K 64 --gsize_m 6 --gemm_sms 256 --comm_sms 48"

          - example_name: "GEMM All-Scatter Bulk Synchronous"
            example_path: "12_gemm_all_scatter_bulk_synchronous"
            tflops_threshold: 900  # Actual: ~1262 TFLOPs
            benchmark_args: "-m 16384 -n 16384 -k 16384 --BLK_M 128 --BLK_N 128 --BLK_K 64 --gsize_m 6 --gemm_sms 256"
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      # benchmark_args is left unquoted on purpose so the shell splits it
      # into individual flags; the path and threshold are passed as single
      # quoted arguments.
      - name: Run ${{ matrix.example_name }} Benchmark (8 ranks)
        run: |
          set -e

          echo "::group::Running performance benchmark"
          bash .github/scripts/run_perf_benchmark.sh \
            "${{ matrix.example_path }}" \
            "${{ matrix.tflops_threshold }}" \
            ${{ matrix.benchmark_args }}
          echo "::endgroup::"