Implement flock-based bitmap GPU allocator for CI parallelization #630

Workflow file for this run

.github/workflows/iris-performance-regression-test.yml at 6b51e89

	# Performance regression workflow: builds the Iris container, then runs
	# the benchmark matrix defined under `jobs`.
	name: Iris Performance Regression Test

	# Runs on pushes and pull requests targeting main, and on manual dispatch.
	on:
	  push:
	    branches: [main]
	  pull_request:
	    branches: [main]
	  workflow_dispatch:

	# One active run per workflow and branch/PR. `head_ref` is only set for
	# pull_request events, so other events fall back to `ref`. A newer run
	# cancels an in-progress one everywhere except on main.
	concurrency:
	  group: ${{ github.workflow }}-${{ github.head_ref || github.ref }}
	  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

	env:
	  # Taken from the DOCKER_IMAGE_NAME configuration variable when it is set;
	  # otherwise the pinned default below is used.
	  # NOTE(review): presumably read by the .github/scripts helpers - confirm.
	  DOCKER_IMAGE_NAME: ${{ vars.DOCKER_IMAGE_NAME || 'iris-dev-triton-aafec41' }}

	jobs:
	  # Builds the container image before any benchmark job starts
	  # (performance-test declares `needs: build-container-image`).
	  build-container-image:
	    runs-on: [self-hosted, mi3xx]
	    timeout-minutes: 20

	    steps:
	      - name: Checkout repository
	        uses: actions/checkout@v4

	      # Installs Apptainer only when neither apptainer nor docker is on PATH.
	      - name: Setup Apptainer (if not available)
	        run: |
	          if ! command -v apptainer &> /dev/null && ! command -v docker &> /dev/null; then
	            echo "Neither Apptainer nor Docker found, installing Apptainer..."
	            apt-get update && apt-get install -y software-properties-common
	            add-apt-repository -y ppa:apptainer/ppa
	            apt-get update && apt-get install -y apptainer
	          else
	            echo "Container runtime already available"
	          fi

	      - name: Build Iris container
	        run: |
	          # Use the universal container build script
	          bash .github/scripts/container_build.sh

	  # One job per matrix entry; each entry supplies the example path, the
	  # TFLOPs threshold and the benchmark arguments passed to the script.
	  performance-test:
	    name: ${{ matrix.example_name }}
	    needs: build-container-image
	    runs-on: [self-hosted, mi3xx]
	    timeout-minutes: 30
	    strategy:
	      # Let the remaining examples finish even if one of them fails.
	      fail-fast: false
	      matrix:
	        # Performance baselines measured on AMD Instinct MI325X (8 GPUs)
	        include:
	          # Disabled https://github.com/ROCm/iris/issues/238
	          # - example_name: "GEMM All-Scatter WG Specialization"
	          #   example_path: "10_gemm_all_scatter_wg_specialization"
	          #   tflops_threshold: 1600  # Actual: ~2182 TFLOPs
	          #   benchmark_args: "-m 16384 -n 16384 -k 16384 --BLK_M 128 --BLK_N 128 --BLK_K 64 --gsize_m 6 --gemm_sms 256"

	          - example_name: "GEMM All-Scatter"
	            example_path: "07_gemm_all_scatter"
	            tflops_threshold: 1000  # Actual: ~1407 TFLOPs
	            benchmark_args: "-m 16384 -n 16384 -k 16384 --BLK_M 256 --BLK_N 64 --BLK_K 64 --gsize_m 6 --gemm_sms 256"

	          - example_name: "GEMM All-Scatter Producer-Consumer"
	            example_path: "11_gemm_all_scatter_producer_consumer"
	            tflops_threshold: 1600  # Actual: ~2190 TFLOPs
	            benchmark_args: "-m 16384 -n 16384 -k 16384 --BLK_M 128 --BLK_N 128 --BLK_K 64 --gsize_m 6 --gemm_sms 256 --comm_sms 48"

	          - example_name: "GEMM All-Scatter Bulk Synchronous"
	            example_path: "12_gemm_all_scatter_bulk_synchronous"
	            tflops_threshold: 900  # Actual: ~1262 TFLOPs
	            benchmark_args: "-m 16384 -n 16384 -k 16384 --BLK_M 128 --BLK_N 128 --BLK_K 64 --gsize_m 6 --gemm_sms 256"

	    steps:
	      - name: Checkout repository
	        uses: actions/checkout@v4

	      - name: Cleanup lingering ports before tests
	        run: |
	          bash .github/scripts/cleanup_ports.sh

	      - name: Run ${{ matrix.example_name }} Benchmark (8 ranks)
	        run: |
	          set -e

	          echo "::group::Running performance benchmark"
	          # benchmark_args is deliberately left unquoted so the shell splits
	          # it into separate arguments for the script.
	          bash .github/scripts/run_perf_benchmark.sh \
	            "${{ matrix.example_path }}" \
	            "${{ matrix.tflops_threshold }}" \
	            ${{ matrix.benchmark_args }}
	          echo "::endgroup::"

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Implement flock-based bitmap GPU allocator for CI parallelization #630

Workflow file

Implement flock-based bitmap GPU allocator for CI parallelization #630

Uh oh!

Workflow file for this run