Skip to content

Conversation

@christiangnrd
Copy link
Member

@christiangnrd christiangnrd commented Dec 23, 2025

Depends on #666

@github-actions
Copy link
Contributor

github-actions bot commented Dec 23, 2025

Your PR requires formatting changes to meet the project's style guidelines.
Please consider running Runic (git runic main) to apply these changes.

Click here to view the suggested changes.
diff --git a/src/intrinsics.jl b/src/intrinsics.jl
index d462bcc0..5bc96f40 100644
--- a/src/intrinsics.jl
+++ b/src/intrinsics.jl
@@ -297,7 +297,7 @@ kernel on the host.
 
     Backends must also implement the on-device kernel launch functionality.
 """
-struct Kernel{B,Kern}
+struct Kernel{B, Kern}
     backend::B
     kern::Kern
 end
@@ -438,7 +438,7 @@ There are a few keyword arguments that influence the behavior of `KI.@kernel`:
 """
 macro kernel(backend, ex...)
     call = ex[end]
-    kwargs = map(ex[1:(end-1)]) do kwarg
+    kwargs = map(ex[1:(end - 1)]) do kwarg
         if kwarg isa Symbol
             :($kwarg = $kwarg)
         elseif Meta.isexpr(kwarg, :(=))
diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl
index fe70e96f..d3b4fd86 100644
--- a/src/pocl/backend.jl
+++ b/src/pocl/backend.jl
@@ -274,7 +274,7 @@ end
     sub_group_barrier(POCL.LOCAL_MEM_FENCE | POCL.GLOBAL_MEM_FENCE)
 end
 
-@device_override function KI.shfl_down(val::T, offset::Integer) where T
+@device_override function KI.shfl_down(val::T, offset::Integer) where {T}
     sub_group_shuffle(val, get_sub_group_local_id() + offset)
 end
 
diff --git a/src/pocl/compiler/compilation.jl b/src/pocl/compiler/compilation.jl
index 123a8282..e97878da 100644
--- a/src/pocl/compiler/compilation.jl
+++ b/src/pocl/compiler/compilation.jl
@@ -27,11 +27,15 @@ GPUCompiler.isintrinsic(job::OpenCLCompilerJob, fn::String) =
     in(fn, known_intrinsics) ||
     contains(fn, "__spirv_")
 
-function GPUCompiler.finish_module!(@nospecialize(job::OpenCLCompilerJob),
-                                    mod::LLVM.Module, entry::LLVM.Function)
-    entry = invoke(GPUCompiler.finish_module!,
-                   Tuple{CompilerJob{SPIRVCompilerTarget}, LLVM.Module, LLVM.Function},
-                   job, mod, entry)
+function GPUCompiler.finish_module!(
+        @nospecialize(job::OpenCLCompilerJob),
+        mod::LLVM.Module, entry::LLVM.Function
+    )
+    entry = invoke(
+        GPUCompiler.finish_module!,
+        Tuple{CompilerJob{SPIRVCompilerTarget}, LLVM.Module, LLVM.Function},
+        job, mod, entry
+    )
 
     # Set the subgroup size
     metadata(entry)["intel_reqd_sub_group_size"] = MDNode([ConstantInt(Int32(job.config.params.sub_group_size))])
@@ -70,7 +74,7 @@ end
 
     # create GPUCompiler objects
     target = SPIRVCompilerTarget(; supports_fp16, supports_fp64, kwargs...)
-    params = OpenCLCompilerParams(; sub_group_size=32)
+    params = OpenCLCompilerParams(; sub_group_size = 32)
     return CompilerConfig(target, params; kernel, name, always_inline)
 end
 
diff --git a/test/intrinsics.jl b/test/intrinsics.jl
index d15d5daf..18627113 100644
--- a/test/intrinsics.jl
+++ b/test/intrinsics.jl
@@ -12,12 +12,14 @@ function test_intrinsics_kernel(results)
     i = KI.get_global_id().x
 
     if i <= length(results)
-        @inbounds results[i] = KernelData(KI.get_global_size().x,
-                                          KI.get_global_id().x,
-                                          KI.get_local_size().x,
-                                          KI.get_local_id().x,
-                                          KI.get_num_groups().x,
-                                          KI.get_group_id().x)
+        @inbounds results[i] = KernelData(
+            KI.get_global_size().x,
+            KI.get_global_id().x,
+            KI.get_local_size().x,
+            KI.get_local_id().x,
+            KI.get_num_groups().x,
+            KI.get_group_id().x
+        )
     end
     return
 end
@@ -32,16 +34,18 @@ function test_subgroup_kernel(results)
     i = KI.get_global_id().x
 
     if i <= length(results)
-        @inbounds results[i] = SubgroupData(KI.get_sub_group_size(),
-                                            KI.get_max_sub_group_size(),
-                                            KI.get_num_sub_groups(),
-                                            KI.get_sub_group_id(),
-                                            KI.get_sub_group_local_id())
+        @inbounds results[i] = SubgroupData(
+            KI.get_sub_group_size(),
+            KI.get_max_sub_group_size(),
+            KI.get_num_sub_groups(),
+            KI.get_sub_group_id(),
+            KI.get_sub_group_local_id()
+        )
     end
     return
 end
 
-function shfl_down_test_kernel(a, b, ::Val{N}) where N
+function shfl_down_test_kernel(a, b, ::Val{N}) where {N}
     idx = KI.get_sub_group_local_id()
 
     val = a[idx]
@@ -205,7 +209,7 @@ function intrinsics_testsuite(backend, AT)
                 dev_a = AT(a)
                 dev_b = AT(zeros(T, N))
 
-                KI.@kernel backend() workgroupsize=N shfl_down_test_kernel(dev_a, dev_b, Val(N))
+                KI.@kernel backend() workgroupsize = N shfl_down_test_kernel(dev_a, dev_b, Val(N))
 
                 b = Array(dev_b)
                 @test sum(a) ≈ b[1]
diff --git a/test/localmem.jl b/test/localmem.jl
index 7894ee61..c1d63c7a 100644
--- a/test/localmem.jl
+++ b/test/localmem.jl
@@ -57,7 +57,7 @@ end
     lmem1 = @localmem Int (N,) # Ok iff groupsize is static
     lmem2 = @localmem Int (N,) # Ok iff groupsize is static
     @inbounds begin
-        lmem1[i] = i-1
+        lmem1[i] = i - 1
         lmem2[i] = 1
         @synchronize
         A[I] = lmem1[N2 - i + 1] + lmem2[N2 - i + 1]

@christiangnrd christiangnrd mentioned this pull request Jan 2, 2026
@christiangnrd christiangnrd marked this pull request as draft January 3, 2026 19:47

offset = 0x00000001
while offset < N
val += KI.shfl_down(val, offset)
Copy link

@Hamiltonian-Action Hamiltonian-Action Jan 4, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please do entertain this comment for a bit. It is mentioned above that

"""
When writing kernels using this function, it should be
assumed that it is not synchronized.
"""

so this particular test can — on an implementation that perfectly conforms to the documentation above — theoretically produce a race condition, and hence output a value that is generally not equal to the sum of the array elements.

On one hand, this can be resolved by moving the subgroup barrier to the start of the loop. On the other hand, the only GPU vendor whose name starts with N, and whose architecture I am somewhat intimately familiar with, has, as far as I am aware, officially deprecated support for most (all?) non-synchronising warp/wavefront/subgroup-level intrinsics. If other platforms offer equally strong guarantees as their default, would it be better/simpler to go with that route instead?
I am generally ambivalent about either route, and I understand the hideous hellhole that is GPU compute's lax standardisation and vendor-specific features, but I figured I may as well put this out there.

@christiangnrd christiangnrd force-pushed the subgroups branch 5 times, most recently from f08198b to befca17 Compare January 7, 2026 15:49
@codecov
Copy link

codecov bot commented Jan 7, 2026

Codecov Report

❌ Patch coverage is 59.18367% with 20 lines in your changes missing coverage. Please review.
✅ Project coverage is 52.42%. Comparing base (03d4ab7) to head (6343fd2).

Files with missing lines Patch % Lines
src/pocl/backend.jl 42.30% 15 Missing ⚠️
src/intrinsics.jl 0.00% 3 Missing ⚠️
src/KernelAbstractions.jl 0.00% 1 Missing ⚠️
src/pocl/compiler/compilation.jl 92.30% 1 Missing ⚠️
Additional details and impacted files
@@            Coverage Diff             @@
##             main     #668      +/-   ##
==========================================
+ Coverage   52.27%   52.42%   +0.14%     
==========================================
  Files          22       22              
  Lines        1689     1734      +45     
==========================================
+ Hits          883      909      +26     
- Misses        806      825      +19     

☔ View full report in Codecov by Sentry.
📢 Have feedback on the report? Share it here.

🚀 New features to boost your workflow:
  • ❄️ Test Analytics: Detect flaky tests, report on failures, and find test suite problems.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants