Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
50 commits
Select commit Hold shift + click to select a range
90c9f6a
Trial support for thread-block clusters
kshyatt Jul 26, 2025
3625427
Merge branch 'ksh/clusters' into eschnett/clusters
eschnett Jan 13, 2026
35f34e9
Rename :clusters -> :clustersize
eschnett Jan 13, 2026
5051195
Correct passing cluster dimension to C
eschnett Jan 14, 2026
b289d1f
Define CuDim3 early
eschnett Jan 14, 2026
506dc1e
Correct attr definition
eschnett Jan 14, 2026
a7a0721
Correct argument types
eschnett Jan 14, 2026
a9c645f
Convert launch config to reference
eschnett Jan 14, 2026
ddaa13f
Correct getting pointers from objects
eschnett Jan 14, 2026
1447b43
CUStream vs. CuStream
eschnett Jan 14, 2026
ebe7180
Add clustersize tests
eschnett Jan 14, 2026
6d3fcc8
Add/correct cluster index/size intrinsics
eschnett Jan 14, 2026
87c7b40
Export new cluster intrinsics
eschnett Jan 14, 2026
39cacc4
Correct blockIdxInCluster
eschnett Jan 14, 2026
c639670
Omit run-time capability check
eschnett Jan 14, 2026
af4a1f4
Undo changes to autogenerated files
eschnett Jan 14, 2026
c7cb54c
Test thread block clusters only for capability >=9.0
eschnett Jan 14, 2026
6301e01
map_shared_rank: New function
eschnett Jan 14, 2026
5996c0a
Changes to address spaces
eschnett Jan 15, 2026
b5a9cad
Debug Julia 1.13
eschnett Jan 15, 2026
3be2035
Debug Julia 1.13
eschnett Jan 15, 2026
c779f7f
Avoid `@device_functions`
eschnett Jan 15, 2026
1e64e99
More Julia 1.13 work-arounds
eschnett Jan 15, 2026
d434e01
Use unsafe_convert instead of pointer
eschnett Jan 15, 2026
041fd0d
CuDistributedSharedArray: New function
eschnett Jan 15, 2026
2294d91
Correct llvmcall
eschnett Jan 15, 2026
04ad606
Correct llvmcall
eschnett Jan 15, 2026
e6ad91b
Correct llvmcall
eschnett Jan 15, 2026
b0a403f
Correct llvmcall
eschnett Jan 15, 2026
9797ff0
Correct llvmcall
eschnett Jan 15, 2026
6d5b3c8
Correct llvmcall
eschnett Jan 15, 2026
dcfe221
Correct llvmcall
eschnett Jan 15, 2026
cb21e0e
Correct llvmcall
eschnett Jan 15, 2026
8cd82fd
Correct llvmcall
eschnett Jan 15, 2026
48424c6
Do not use address space 7
eschnett Jan 15, 2026
11bef6d
Correct blockIdxInCluster and gridClusterDim
eschnett Jan 15, 2026
46bd1f0
Correct gridClusterDim
eschnett Jan 15, 2026
cf149ec
Correct CuDistributedSharedArray
eschnett Jan 15, 2026
5762eea
Correct CuDistributedSharedArray
eschnett Jan 15, 2026
ce87af2
Correct CuDistributedSharedArray
eschnett Jan 15, 2026
3d975be
Correct clusterDim and gridClusterDim
eschnett Jan 16, 2026
287a30d
CI: Disable most builds
eschnett Jan 16, 2026
6db286a
Add cluster sync test
eschnett Jan 16, 2026
3f411a8
libnvml is now a String
eschnett Jan 16, 2026
3704073
Loosen wrong bounds on clusterIdx and gridClusterDim
eschnett Jan 26, 2026
b18e747
Disable indexing range metadata for debugging
eschnett Jan 26, 2026
66276ee
Correct range metadata
eschnett Jan 26, 2026
b40b9ae
Correct range metadata
eschnett Jan 26, 2026
88e5662
Correct range metadata
eschnett Jan 26, 2026
443833e
Diagnose more launch failures
eschnett Jan 27, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 16 additions & 14 deletions .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,11 @@ steps:
matrix:
setup:
julia:
- "1.10"
- "1.11"
- "1.12"
- "nightly"
#TODO - "1.10"
#TODO - "1.11"
#TODO - "1.12"
- "1.13"
#TODO - "nightly"
adjustments:
- with:
julia: "nightly"
Expand Down Expand Up @@ -68,16 +69,17 @@ steps:
matrix:
setup:
cuda:
- "13.0"
- "12.9"
- "12.8"
- "12.6"
- "12.5"
- "12.4"
- "12.3"
- "12.2"
- "12.1"
- "12.0"
- "13.1"
#TODO - "13.0"
#TODO - "12.9"
#TODO - "12.8"
#TODO - "12.6"
#TODO - "12.5"
#TODO - "12.4"
#TODO - "12.3"
#TODO - "12.2"
#TODO - "12.1"
#TODO - "12.0"
adjustments:
- with:
cuda: "13.0"
Expand Down
37 changes: 29 additions & 8 deletions lib/cudadrv/execution.jl
Original file line number Diff line number Diff line change
Expand Up @@ -44,22 +44,26 @@ end

"""
launch(f::CuFunction, args...; blocks::CuDim=1, threads::CuDim=1,
cooperative=false, shmem=0, stream=stream())
clustersize::CuDim=1, cooperative=false, shmem=0, stream=stream())

Low-level call to launch a CUDA function `f` on the GPU, using `blocks` and `threads` as
the grid and block configuration, respectively. Dynamic shared memory is allocated according
to `shmem`, and the kernel is launched on stream `stream`.
to `shmem`, and the kernel is launched on stream `stream`. If `clustersize > 1` and compute
capability is `>= 9.0`, [thread block clusters](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#thread-block-clusters)
are launched. If `clustersize > 1` and compute capability is `< 9.0`, an error is thrown, as
thread block clusters are not supported.

Arguments to a kernel should either be bitstype, in which case they will be copied to the
internal kernel parameter buffer, or a pointer to device memory.

This is a low-level call, prefer to use [`cudacall`](@ref) instead.
"""
function launch(f::CuFunction, args::Vararg{Any,N}; blocks::CuDim=1, threads::CuDim=1,
cooperative::Bool=false, shmem::Integer=0,
clustersize::CuDim=1, cooperative::Bool=false, shmem::Integer=0,
stream::CuStream=stream()) where {N}
blockdim = CuDim3(blocks)
threaddim = CuDim3(threads)
clusterdim = CuDim3(clustersize)

try
pack_arguments(args...) do kernelParams
Expand All @@ -68,29 +72,46 @@ function launch(f::CuFunction, args::Vararg{Any,N}; blocks::CuDim=1, threads::Cu
blockdim.x, blockdim.y, blockdim.z,
threaddim.x, threaddim.y, threaddim.z,
shmem, stream, kernelParams)
else
elseif clusterdim.x == 1 && clusterdim.y == 1 && clusterdim.z == 1
cuLaunchKernel(f,
blockdim.x, blockdim.y, blockdim.z,
threaddim.x, threaddim.y, threaddim.z,
shmem, stream, kernelParams, C_NULL)
else
attr_ref = Ref{CUDA.CUlaunchAttribute}()
GC.@preserve attr_ref stream begin
attr = Base.unsafe_convert(Ptr{CUDA.CUlaunchAttribute}, attr_ref)
attr.id = CUDA.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
attr.value.clusterDim.x = clusterdim.x
attr.value.clusterDim.y = clusterdim.y
attr.value.clusterDim.z = clusterdim.z
config = Ref(CUlaunchConfig(blockdim.x, blockdim.y, blockdim.z,
threaddim.x, threaddim.y, threaddim.z,
shmem, stream.handle, attr, 1))
cuLaunchKernelEx(config, f, kernelParams, C_NULL)
end
end
end
catch err
diagnose_launch_failure(f, err; blockdim, threaddim, shmem)
diagnose_launch_failure(f, err; blockdim, threaddim, clusterdim, shmem)
end
end

@noinline function diagnose_launch_failure(f::CuFunction, err; blockdim, threaddim, shmem)
@noinline function diagnose_launch_failure(f::CuFunction, err; blockdim, threaddim, clusterdim, shmem)
if !isa(err, CuError) || !in(err.code, [ERROR_INVALID_VALUE,
ERROR_LAUNCH_OUT_OF_RESOURCES])
rethrow()
end

# essentials
(blockdim.x>0 && blockdim.y>0 && blockdim.z>0) ||
error("Grid dimensions should be non-null")
error("Grid dimensions $blockdim are not positive")
(threaddim.x>0 && threaddim.y>0 && threaddim.z>0) ||
error("Block dimensions should be non-null")
error("Block dimensions $threaddim are not positive")
(clusterdim.x>0 && clusterdim.y>0 && clusterdim.z>0) ||
error("Cluster dimensions $clusterdim are not positive")
(blockdim.x % clusterdim.x == 0 && blockdim.y % clusterdim.y == 0 && blockdim.z % clusterdim.z == 0) ||
error("Block dimensions $blockdim are not multiples of the cluster dimensions $clusterdim")

# check device limits
dev = device()
Expand Down
35 changes: 18 additions & 17 deletions lib/nvml/NVML.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,22 +12,23 @@ import Libdl

export has_nvml

function libnvml()
@memoize begin
if Sys.iswindows()
# the NVSMI dir isn't added to PATH by the installer
nvsmi = joinpath(ENV["ProgramFiles"], "NVIDIA Corporation", "NVSMI")
if isdir(nvsmi)
joinpath(nvsmi, "nvml.dll")
else
# let's just hope for the best
"nvml"
end
else
"libnvidia-ml.so.1"
end
end::String
end
#TODO function libnvml()
#TODO @memoize begin
#TODO if Sys.iswindows()
#TODO # the NVSMI dir isn't added to PATH by the installer
#TODO nvsmi = joinpath(ENV["ProgramFiles"], "NVIDIA Corporation", "NVSMI")
#TODO if isdir(nvsmi)
#TODO joinpath(nvsmi, "nvml.dll")
#TODO else
#TODO # let's just hope for the best
#TODO "nvml"
#TODO end
#TODO else
#TODO "libnvidia-ml.so.1"
#TODO end
#TODO end::String
#TODO end
const libnvml::String = "libnvidia-ml.so.1"

function has_nvml()
@memoize begin
Expand All @@ -37,7 +38,7 @@ function has_nvml()
return false
end

if Libdl.dlopen(libnvml(); throw_error=false) === nothing
if Libdl.dlopen(libnvml; throw_error=false) === nothing
return false
end

Expand Down
Loading