KI Sub-groups #668

base: main

Conversation
Your PR requires formatting changes to meet the project's style guidelines. The suggested changes:

diff --git a/src/intrinsics.jl b/src/intrinsics.jl
index d462bcc0..5bc96f40 100644
--- a/src/intrinsics.jl
+++ b/src/intrinsics.jl
@@ -297,7 +297,7 @@ kernel on the host.
Backends must also implement the on-device kernel launch functionality.
"""
-struct Kernel{B,Kern}
+struct Kernel{B, Kern}
backend::B
kern::Kern
end
@@ -438,7 +438,7 @@ There are a few keyword arguments that influence the behavior of `KI.@kernel`:
"""
macro kernel(backend, ex...)
call = ex[end]
- kwargs = map(ex[1:(end-1)]) do kwarg
+ kwargs = map(ex[1:(end - 1)]) do kwarg
if kwarg isa Symbol
:($kwarg = $kwarg)
elseif Meta.isexpr(kwarg, :(=))
diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl
index fe70e96f..d3b4fd86 100644
--- a/src/pocl/backend.jl
+++ b/src/pocl/backend.jl
@@ -274,7 +274,7 @@ end
sub_group_barrier(POCL.LOCAL_MEM_FENCE | POCL.GLOBAL_MEM_FENCE)
end
-@device_override function KI.shfl_down(val::T, offset::Integer) where T
+@device_override function KI.shfl_down(val::T, offset::Integer) where {T}
sub_group_shuffle(val, get_sub_group_local_id() + offset)
end
diff --git a/src/pocl/compiler/compilation.jl b/src/pocl/compiler/compilation.jl
index 123a8282..e97878da 100644
--- a/src/pocl/compiler/compilation.jl
+++ b/src/pocl/compiler/compilation.jl
@@ -27,11 +27,15 @@ GPUCompiler.isintrinsic(job::OpenCLCompilerJob, fn::String) =
in(fn, known_intrinsics) ||
contains(fn, "__spirv_")
-function GPUCompiler.finish_module!(@nospecialize(job::OpenCLCompilerJob),
- mod::LLVM.Module, entry::LLVM.Function)
- entry = invoke(GPUCompiler.finish_module!,
- Tuple{CompilerJob{SPIRVCompilerTarget}, LLVM.Module, LLVM.Function},
- job, mod, entry)
+function GPUCompiler.finish_module!(
+ @nospecialize(job::OpenCLCompilerJob),
+ mod::LLVM.Module, entry::LLVM.Function
+ )
+ entry = invoke(
+ GPUCompiler.finish_module!,
+ Tuple{CompilerJob{SPIRVCompilerTarget}, LLVM.Module, LLVM.Function},
+ job, mod, entry
+ )
# Set the subgroup size
metadata(entry)["intel_reqd_sub_group_size"] = MDNode([ConstantInt(Int32(job.config.params.sub_group_size))])
@@ -70,7 +74,7 @@ end
# create GPUCompiler objects
target = SPIRVCompilerTarget(; supports_fp16, supports_fp64, kwargs...)
- params = OpenCLCompilerParams(; sub_group_size=32)
+ params = OpenCLCompilerParams(; sub_group_size = 32)
return CompilerConfig(target, params; kernel, name, always_inline)
end
diff --git a/test/intrinsics.jl b/test/intrinsics.jl
index d15d5daf..18627113 100644
--- a/test/intrinsics.jl
+++ b/test/intrinsics.jl
@@ -12,12 +12,14 @@ function test_intrinsics_kernel(results)
i = KI.get_global_id().x
if i <= length(results)
- @inbounds results[i] = KernelData(KI.get_global_size().x,
- KI.get_global_id().x,
- KI.get_local_size().x,
- KI.get_local_id().x,
- KI.get_num_groups().x,
- KI.get_group_id().x)
+ @inbounds results[i] = KernelData(
+ KI.get_global_size().x,
+ KI.get_global_id().x,
+ KI.get_local_size().x,
+ KI.get_local_id().x,
+ KI.get_num_groups().x,
+ KI.get_group_id().x
+ )
end
return
end
@@ -32,16 +34,18 @@ function test_subgroup_kernel(results)
i = KI.get_global_id().x
if i <= length(results)
- @inbounds results[i] = SubgroupData(KI.get_sub_group_size(),
- KI.get_max_sub_group_size(),
- KI.get_num_sub_groups(),
- KI.get_sub_group_id(),
- KI.get_sub_group_local_id())
+ @inbounds results[i] = SubgroupData(
+ KI.get_sub_group_size(),
+ KI.get_max_sub_group_size(),
+ KI.get_num_sub_groups(),
+ KI.get_sub_group_id(),
+ KI.get_sub_group_local_id()
+ )
end
return
end
-function shfl_down_test_kernel(a, b, ::Val{N}) where N
+function shfl_down_test_kernel(a, b, ::Val{N}) where {N}
idx = KI.get_sub_group_local_id()
val = a[idx]
@@ -205,7 +209,7 @@ function intrinsics_testsuite(backend, AT)
dev_a = AT(a)
dev_b = AT(zeros(T, N))
- KI.@kernel backend() workgroupsize=N shfl_down_test_kernel(dev_a, dev_b, Val(N))
+ KI.@kernel backend() workgroupsize = N shfl_down_test_kernel(dev_a, dev_b, Val(N))
b = Array(dev_b)
@test sum(a) ≈ b[1]
diff --git a/test/localmem.jl b/test/localmem.jl
index 7894ee61..c1d63c7a 100644
--- a/test/localmem.jl
+++ b/test/localmem.jl
@@ -57,7 +57,7 @@ end
lmem1 = @localmem Int (N,) # Ok iff groupsize is static
lmem2 = @localmem Int (N,) # Ok iff groupsize is static
@inbounds begin
- lmem1[i] = i-1
+ lmem1[i] = i - 1
lmem2[i] = 1
@synchronize
A[I] = lmem1[N2 - i + 1] + lmem2[N2 - i + 1]
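To make the diff above easier to follow, here is a minimal sketch of how the reworked tests drive the new sub-group intrinsics: a device kernel built from the `KI.get_*` queries, launched with the `KI.@kernel backend() workgroupsize = ...` form used in test/intrinsics.jl. The `backend` and `AT` names stand in for whatever backend constructor and device array type the test suite is parameterised over; they are assumptions, not part of this PR.

```julia
# Hypothetical stand-ins: `backend()` constructs the KI backend under test and
# `AT` is its device array type, as in the test suite's parameters.

# Device kernel: each work-item records its sub-group id and its lane id
# within that sub-group, using the intrinsics added in this PR.
function subgroup_ids_kernel(out)
    i = KI.get_global_id().x
    if i <= length(out)
        @inbounds out[i] = (KI.get_sub_group_id(), KI.get_sub_group_local_id())
    end
    return
end

# Host side: launch with an explicit work-group size, mirroring the
# `KI.@kernel backend() workgroupsize = N ...` calls in the tests.
N = 64
dev_out = AT(fill((0, 0), N))
KI.@kernel backend() workgroupsize = N subgroup_ids_kernel(dev_out)
ids = Array(dev_out)
```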
Review comment on the reduction loop in shfl_down_test_kernel:

    offset = 0x00000001
    while offset < N
        val += KI.shfl_down(val, offset)
Please do entertain this comment for a bit. It is mentioned above that

"""
When writing kernels using this function, it should be
assumed that it is not synchronized.
"""

so this particular test can, on an implementation that perfectly conforms to that documentation, theoretically produce a race condition and hence output a value that is not, in general, equal to the sum of the array elements.

On one hand, this can be resolved by moving the sub-group barrier to the start of the loop. On the other hand, the only GPU vendor whose name starts with N, and whose architecture I am somewhat intimately familiar with, has, as far as I am aware, officially deprecated support for most (all?) non-synchronising warp/wavefront/sub-group-level intrinsics. If other platforms offer equally strong guarantees by default, would it be better/simpler to go with that route instead?

I am generally ambivalent towards either route, and I understand the hideous hellhole that is GPU compute's lax standardisation and vendor-specific feature sets, but I figured I may as well just put this out there.
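For concreteness, a minimal sketch of the first option above: a sub-group barrier at the start of each iteration, so that every lane has finished the previous iteration's update of `val` before any lane reads it via the shuffle. `KI.sub_group_barrier()` is an assumed KI-level intrinsic mirroring the POCL `sub_group_barrier` override in the diff (the real name may differ), and the loop tail that is not visible in the quoted snippet is filled in from the usual shuffle-down reduction pattern.

```julia
# Sketch only: a synchronised variant of shfl_down_test_kernel.
# KI.sub_group_barrier() is an assumed KI-level intrinsic; the doubling of
# `offset` and lane 1 writing the result are guesses at the usual pattern,
# not copied from the PR.
function shfl_down_test_kernel_synced(a, b, ::Val{N}) where {N}
    idx = KI.get_sub_group_local_id()
    val = a[idx]
    offset = 0x00000001
    while offset < N
        # Every lane must finish the previous iteration's update of `val`
        # before any lane reads a neighbour's `val` through the shuffle.
        KI.sub_group_barrier()
        val += KI.shfl_down(val, offset)
        offset <<= 1
    end
    if idx == 1
        @inbounds b[1] = val
    end
    return
end
```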
Commit notes:
- Co-Authored-By: Anton Smirnov <[email protected]>
- SPIRVIntrinsics 0.5.7 with extra subgroups support not released yet
- Makes it easier to know if the right code is running in CI
Codecov Report

❌ Patch coverage is

@@            Coverage Diff             @@
##             main     #668      +/-   ##
==========================================
+ Coverage   52.27%   52.42%   +0.14%
==========================================
  Files          22       22
  Lines        1689     1734      +45
==========================================
+ Hits          883      909      +26
- Misses        806      825      +19

View full report in Codecov by Sentry.
Depends on #666