Skip to content

Conversation

@christiangnrd
Copy link
Member

@christiangnrd christiangnrd commented Dec 23, 2025

Depends on #666

@github-actions
Copy link
Contributor

github-actions bot commented Dec 23, 2025

Your PR requires formatting changes to meet the project's style guidelines.
Please consider running Runic (git runic main) to apply these changes.

Click here to view the suggested changes.
diff --git a/src/intrinsics.jl b/src/intrinsics.jl
index d462bcc0..5bc96f40 100644
--- a/src/intrinsics.jl
+++ b/src/intrinsics.jl
@@ -297,7 +297,7 @@ kernel on the host.
 
     Backends must also implement the on-device kernel launch functionality.
 """
-struct Kernel{B,Kern}
+struct Kernel{B, Kern}
     backend::B
     kern::Kern
 end
@@ -438,7 +438,7 @@ There are a few keyword arguments that influence the behavior of `KI.@kernel`:
 """
 macro kernel(backend, ex...)
     call = ex[end]
-    kwargs = map(ex[1:(end-1)]) do kwarg
+    kwargs = map(ex[1:(end - 1)]) do kwarg
         if kwarg isa Symbol
             :($kwarg = $kwarg)
         elseif Meta.isexpr(kwarg, :(=))
diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl
index fe70e96f..d3b4fd86 100644
--- a/src/pocl/backend.jl
+++ b/src/pocl/backend.jl
@@ -274,7 +274,7 @@ end
     sub_group_barrier(POCL.LOCAL_MEM_FENCE | POCL.GLOBAL_MEM_FENCE)
 end
 
-@device_override function KI.shfl_down(val::T, offset::Integer) where T
+@device_override function KI.shfl_down(val::T, offset::Integer) where {T}
     sub_group_shuffle(val, get_sub_group_local_id() + offset)
 end
 
diff --git a/src/pocl/compiler/compilation.jl b/src/pocl/compiler/compilation.jl
index 123a8282..e97878da 100644
--- a/src/pocl/compiler/compilation.jl
+++ b/src/pocl/compiler/compilation.jl
@@ -27,11 +27,15 @@ GPUCompiler.isintrinsic(job::OpenCLCompilerJob, fn::String) =
     in(fn, known_intrinsics) ||
     contains(fn, "__spirv_")
 
-function GPUCompiler.finish_module!(@nospecialize(job::OpenCLCompilerJob),
-                                    mod::LLVM.Module, entry::LLVM.Function)
-    entry = invoke(GPUCompiler.finish_module!,
-                   Tuple{CompilerJob{SPIRVCompilerTarget}, LLVM.Module, LLVM.Function},
-                   job, mod, entry)
+function GPUCompiler.finish_module!(
+        @nospecialize(job::OpenCLCompilerJob),
+        mod::LLVM.Module, entry::LLVM.Function
+    )
+    entry = invoke(
+        GPUCompiler.finish_module!,
+        Tuple{CompilerJob{SPIRVCompilerTarget}, LLVM.Module, LLVM.Function},
+        job, mod, entry
+    )
 
     # Set the subgroup size
     metadata(entry)["intel_reqd_sub_group_size"] = MDNode([ConstantInt(Int32(job.config.params.sub_group_size))])
@@ -70,7 +74,7 @@ end
 
     # create GPUCompiler objects
     target = SPIRVCompilerTarget(; supports_fp16, supports_fp64, kwargs...)
-    params = OpenCLCompilerParams(; sub_group_size=32)
+    params = OpenCLCompilerParams(; sub_group_size = 32)
     return CompilerConfig(target, params; kernel, name, always_inline)
 end
 
diff --git a/test/intrinsics.jl b/test/intrinsics.jl
index d15d5daf..18627113 100644
--- a/test/intrinsics.jl
+++ b/test/intrinsics.jl
@@ -12,12 +12,14 @@ function test_intrinsics_kernel(results)
     i = KI.get_global_id().x
 
     if i <= length(results)
-        @inbounds results[i] = KernelData(KI.get_global_size().x,
-                                          KI.get_global_id().x,
-                                          KI.get_local_size().x,
-                                          KI.get_local_id().x,
-                                          KI.get_num_groups().x,
-                                          KI.get_group_id().x)
+        @inbounds results[i] = KernelData(
+            KI.get_global_size().x,
+            KI.get_global_id().x,
+            KI.get_local_size().x,
+            KI.get_local_id().x,
+            KI.get_num_groups().x,
+            KI.get_group_id().x
+        )
     end
     return
 end
@@ -32,16 +34,18 @@ function test_subgroup_kernel(results)
     i = KI.get_global_id().x
 
     if i <= length(results)
-        @inbounds results[i] = SubgroupData(KI.get_sub_group_size(),
-                                            KI.get_max_sub_group_size(),
-                                            KI.get_num_sub_groups(),
-                                            KI.get_sub_group_id(),
-                                            KI.get_sub_group_local_id())
+        @inbounds results[i] = SubgroupData(
+            KI.get_sub_group_size(),
+            KI.get_max_sub_group_size(),
+            KI.get_num_sub_groups(),
+            KI.get_sub_group_id(),
+            KI.get_sub_group_local_id()
+        )
     end
     return
 end
 
-function shfl_down_test_kernel(a, b, ::Val{N}) where N
+function shfl_down_test_kernel(a, b, ::Val{N}) where {N}
     idx = KI.get_sub_group_local_id()
 
     val = a[idx]
@@ -205,7 +209,7 @@ function intrinsics_testsuite(backend, AT)
                 dev_a = AT(a)
                 dev_b = AT(zeros(T, N))
 
-                KI.@kernel backend() workgroupsize=N shfl_down_test_kernel(dev_a, dev_b, Val(N))
+                KI.@kernel backend() workgroupsize = N shfl_down_test_kernel(dev_a, dev_b, Val(N))
 
                 b = Array(dev_b)
                 @test sum(a) ≈ b[1]
diff --git a/test/localmem.jl b/test/localmem.jl
index 7894ee61..c1d63c7a 100644
--- a/test/localmem.jl
+++ b/test/localmem.jl
@@ -57,7 +57,7 @@ end
     lmem1 = @localmem Int (N,) # Ok iff groupsize is static
     lmem2 = @localmem Int (N,) # Ok iff groupsize is static
     @inbounds begin
-        lmem1[i] = i-1
+        lmem1[i] = i - 1
         lmem2[i] = 1
         @synchronize
         A[I] = lmem1[N2 - i + 1] + lmem2[N2 - i + 1]

@christiangnrd christiangnrd mentioned this pull request Jan 2, 2026
@christiangnrd christiangnrd marked this pull request as draft January 3, 2026 19:47

offset = 0x00000001
while offset < N
val += KI.shfl_down(val, offset)
Copy link

@Hamiltonian-Action Hamiltonian-Action Jan 4, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please do entertain this comment for a bit. It is mentioned above that

"""
When writing kernels using this function, it should be
assumed that it is not synchronized.
"""

so this particular test can — on an implementation that perfectly conforms to the documentation above — theoretically produce a race condition, and hence output a value that is generally not equal to the sum of the array elements.

On one hand, this can be resolved by moving the subgroup barrier to the start of the loop. On the other hand, the only GPU vendor whose name starts with N, and whose architecture I am somewhat intimately familiar with, has, as far as I am aware, officially deprecated support for most (all?) non-synchronising warp/wavefront/subgroup-level intrinsics. If other platforms offer equally strong guarantees as their default, would it be better/simpler to go with that route instead?
I am generally ambivalent about either route, and I understand the hideous hellhole that is GPU compute's lax standardisation and vendor-specific features, but I figured I may as well put this out there.

@christiangnrd christiangnrd force-pushed the subgroups branch 5 times, most recently from f08198b to befca17 Compare January 7, 2026 15:49
@codecov
Copy link

codecov bot commented Jan 7, 2026

Codecov Report

❌ Patch coverage is 59.18367% with 20 lines in your changes missing coverage. Please review.
✅ Project coverage is 52.42%. Comparing base (03d4ab7) to head (6343fd2).

Files with missing lines Patch % Lines
src/pocl/backend.jl 42.30% 15 Missing ⚠️
src/intrinsics.jl 0.00% 3 Missing ⚠️
src/KernelAbstractions.jl 0.00% 1 Missing ⚠️
src/pocl/compiler/compilation.jl 92.30% 1 Missing ⚠️
Additional details and impacted files
@@            Coverage Diff             @@
##             main     #668      +/-   ##
==========================================
+ Coverage   52.27%   52.42%   +0.14%     
==========================================
  Files          22       22              
  Lines        1689     1734      +45     
==========================================
+ Hits          883      909      +26     
- Misses        806      825      +19     

☔ View full report in Codecov by Sentry.
📢 Have feedback on the report? Share it here.

🚀 New features to boost your workflow:
  • ❄️ Test Analytics: Detect flaky tests, report on failures, and find test suite problems.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants