Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,14 @@ demumble_jll = "1e29f10c-031c-5a83-9565-69cddfc27673"
[weakdeps]
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
SparseMatricesCSR = "a0a7dd2c-ebf4-11e9-1f05-cf50bc540ca1"
SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"

[extensions]
ChainRulesCoreExt = "ChainRulesCore"
EnzymeCoreExt = "EnzymeCore"
FillArraysExt = "FillArrays"
SparseMatricesCSRExt = "SparseMatricesCSR"
SpecialFunctionsExt = "SpecialFunctions"

Expand All @@ -63,6 +65,7 @@ Crayons = "4"
DataFrames = "1.5"
EnzymeCore = "0.8.2"
ExprTools = "0.1"
FillArrays = "1"
GPUArrays = "11.3.1"
GPUCompiler = "1.4"
GPUToolbox = "0.3, 1"
Expand Down Expand Up @@ -92,5 +95,6 @@ julia = "1.10"
[extras]
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
SparseMatricesCSR = "a0a7dd2c-ebf4-11e9-1f05-cf50bc540ca1"
SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
69 changes: 69 additions & 0 deletions ext/FillArraysExt.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
module FillArraysExt

using CUDA
using CUDA.CUSPARSE
using LinearAlgebra
using SparseArrays
using FillArrays

# kron between CuSparseMatrixCOO and Diagonal{T, AbstractFill}
# This is optimized for FillArrays since the diagonal is a constant value
# kron(A, B) where B = Diagonal(Fill(v, nB)): every diagonal entry of B is the
# same scalar, so the result is A's sparsity pattern expanded block-diagonally
# and scaled by that single value — no per-entry diagonal lookup is required.
#
# Returns a CuSparseMatrixCOO of element type promote_type(T1, T2) and size
# (mA*mB, nA*nB). An empty COO matrix is returned when A has no stored
# entries or the fill value is zero.
function LinearAlgebra.kron(A::CUSPARSE.CuSparseMatrixCOO{T1, Ti}, B::Diagonal{T2, <:FillArrays.AbstractFill{T2}}) where {Ti, T1, T2}
    T = promote_type(T1, T2)
    mA, nA = size(A)
    mB, nB = size(B)
    out_shape = (mA * mB, nA * nB)
    Annz = Int64(A.nnz)

    # The constant diagonal value (a Diagonal is square, so mB == nB).
    fill_value = FillArrays.getindex_value(B.diag)

    # Short-circuit: result has no stored entries.
    if Annz == 0 || iszero(fill_value)
        return CUSPARSE.CuSparseMatrixCOO(CUDA.CuVector{Ti}(undef, 0), CUDA.CuVector{Ti}(undef, 0), CUDA.CuVector{T}(undef, 0), out_shape)
    end

    # Each stored entry (i, j) of A expands to the nB entries
    # ((i-1)*mB + k, (j-1)*nB + k) for k = 1:nB, each scaled by fill_value.
    row = (A.rowInd .- 1) .* mB
    row = repeat(row, inner = nB)
    col = (A.colInd .- 1) .* nB
    col = repeat(col, inner = nB)
    data = repeat(convert(CUDA.CuVector{T}, A.nzVal), inner = nB)

    # Build the within-block offsets once and reuse them for both rows and
    # columns (previously the same host vector was uploaded to the device twice).
    offsets = CUDA.CuVector(repeat(0:nB-1, outer = Annz))
    row .+= offsets .+ 1
    col .+= offsets .+ 1

    # Scale by the constant diagonal value; broadcasting promotes to T.
    data .*= fill_value

    CUSPARSE.sparse(row, col, data, out_shape..., fmt = :coo)
end

# kron between Diagonal{T, AbstractFill} and CuSparseMatrixCOO
# kron(A, B) where A = Diagonal(Fill(v, nA)): the result is nA copies of
# v .* B laid out block-diagonally.
#
# Returns a CuSparseMatrixCOO of element type promote_type(T1, T2) and size
# (mA*mB, nA*nB). An empty COO matrix is returned when B has no stored
# entries or the fill value is zero.
function LinearAlgebra.kron(A::Diagonal{T1, <:FillArrays.AbstractFill{T1}}, B::CUSPARSE.CuSparseMatrixCOO{T2, Ti}) where {Ti, T1, T2}
    T = promote_type(T1, T2)
    mA, nA = size(A)
    mB, nB = size(B)
    out_shape = (mA * mB, nA * nB)
    Bnnz = Int64(B.nnz)

    # The constant diagonal value (a Diagonal is square, so mA == nA).
    fill_value = FillArrays.getindex_value(A.diag)

    # Short-circuit: result has no stored entries.
    if Bnnz == 0 || iszero(fill_value)
        return CUSPARSE.CuSparseMatrixCOO(CUDA.CuVector{Ti}(undef, 0), CUDA.CuVector{Ti}(undef, 0), CUDA.CuVector{T}(undef, 0), out_shape)
    end

    # Diagonal entry k (k = 1:nA) contributes a copy of B shifted by
    # ((k-1)*mB, (k-1)*nB) and scaled by fill_value.
    row = (0:nA-1) .* mB
    row = CUDA.CuVector(repeat(row, inner = Bnnz))
    col = (0:nA-1) .* nB
    col = CUDA.CuVector(repeat(col, inner = Bnnz))

    # Replicate B's values once (converted to T) and scale by the constant
    # diagonal value in the same broadcast — avoids the previous
    # CUDA.fill-then-multiply pair of kernels and the extra temporary.
    data = repeat(convert(CUDA.CuVector{T}, B.nzVal), outer = nA) .* fill_value

    row .+= repeat(B.rowInd .- 1, outer = nA) .+ 1
    col .+= repeat(B.colInd .- 1, outer = nA) .+ 1

    CUSPARSE.sparse(row, col, data, out_shape..., fmt = :coo)
end

end # extension module
46 changes: 32 additions & 14 deletions lib/cusparse/linalg.jl
Original file line number Diff line number Diff line change
Expand Up @@ -58,47 +58,65 @@ function LinearAlgebra.kron(A::CuSparseMatrixCOO{T, Ti}, B::CuSparseMatrixCOO{T,
sparse(row, col, data, out_shape..., fmt = :coo)
end

function LinearAlgebra.kron(A::CuSparseMatrixCOO{T, Ti}, B::Diagonal) where {Ti, T}
function LinearAlgebra.kron(A::CuSparseMatrixCOO{T1, Ti}, B::Diagonal{T2, <:CuVector{T2}}) where {Ti, T1, T2}
T = promote_type(T1, T2)
mA,nA = size(A)
mB,nB = size(B)
out_shape = (mA * mB, nA * nB)
Annz = Int64(A.nnz)
Bnnz = nB

# Find non-zero diagonal elements
nz_mask = map(!iszero, B.diag)
nz_indices = findall(nz_mask)
Bnnz = length(nz_indices)

if Annz == 0 || Bnnz == 0
return CuSparseMatrixCOO(CuVector{Ti}(undef, 0), CuVector{Ti}(undef, 0), CuVector{T}(undef, 0), out_shape)
end

# Only process non-zero diagonal elements
nz_diag = B.diag[nz_indices]
nz_offsets = CuVector(nz_indices .- 1)

row = (A.rowInd .- 1) .* mB
row = repeat(row, inner = Bnnz)
col = (A.colInd .- 1) .* nB
col = repeat(col, inner = Bnnz)
data = repeat(A.nzVal, inner = Bnnz)
data = repeat(convert(CuVector{T}, A.nzVal), inner = Bnnz)

row .+= CuVector(repeat(0:nB-1, outer = Annz)) .+ 1
col .+= CuVector(repeat(0:nB-1, outer = Annz)) .+ 1
row .+= repeat(nz_offsets, outer = Annz) .+ 1
col .+= repeat(nz_offsets, outer = Annz) .+ 1

data .*= repeat(CUDA.ones(T, nB), outer = Annz)
data .*= repeat(nz_diag, outer = Annz)

sparse(row, col, data, out_shape..., fmt = :coo)
end

function LinearAlgebra.kron(A::Diagonal, B::CuSparseMatrixCOO{T, Ti}) where {Ti, T}
function LinearAlgebra.kron(A::Diagonal{T1, <:CuVector{T1}}, B::CuSparseMatrixCOO{T2, Ti}) where {Ti, T1, T2}
T = promote_type(T1, T2)
mA,nA = size(A)
mB,nB = size(B)
out_shape = (mA * mB, nA * nB)
Annz = nA
Bnnz = Int64(B.nnz)

if Annz == 0 || Bnnz == 0
# Find non-zero diagonal elements
nz_mask = map(!iszero, A.diag)
nz_indices = findall(nz_mask)
Annz = length(nz_indices)

if Bnnz == 0 || Annz == 0
return CuSparseMatrixCOO(CuVector{Ti}(undef, 0), CuVector{Ti}(undef, 0), CuVector{T}(undef, 0), out_shape)
end

row = (0:nA-1) .* mB
row = CuVector(repeat(row, inner = Bnnz))
col = (0:nA-1) .* nB
col = CuVector(repeat(col, inner = Bnnz))
data = repeat(CUDA.ones(T, nA), inner = Bnnz)
# Only process non-zero diagonal elements
nz_diag = A.diag[nz_indices]
nz_offsets = CuVector(nz_indices .- 1)

row = nz_offsets .* mB
row = repeat(row, inner = Bnnz)
col = nz_offsets .* nB
col = repeat(col, inner = Bnnz)
data = repeat(convert(CuVector{T}, nz_diag), inner = Bnnz)

row .+= repeat(B.rowInd .- 1, outer = Annz) .+ 1
col .+= repeat(B.colInd .- 1, outer = Annz) .+ 1
Expand Down
3 changes: 2 additions & 1 deletion test/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,12 @@ DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55"
InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59"
JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
NVTX = "5da4648a-3479-48b8-97b9-01cb529c0a1f"
Expand Down
52 changes: 47 additions & 5 deletions test/libraries/cusparse/linalg.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@ m = 10
A = sprand(T, m, m, 0.2)
B = sprand(T, m, m, 0.3)
ZA = spzeros(T, m, m)
C = I(div(m, 2))
C = Diagonal(rand(T, div(m, 2)))

dC = adapt(CuArray, C)
@testset "type = $typ" for typ in [CuSparseMatrixCSR, CuSparseMatrixCSC]
dA = typ(A)
dB = typ(B)
Expand Down Expand Up @@ -35,12 +37,52 @@ m = 10
end
end
@testset "kronecker product with I opa = $opa" for opa in (identity, transpose, adjoint)
@test collect(kron(opa(dA), C)) ≈ kron(opa(A), C)
@test collect(kron(C, opa(dA))) ≈ kron(C, opa(A))
@test collect(kron(opa(dZA), C)) ≈ kron(opa(ZA), C)
@test collect(kron(C, opa(dZA))) ≈ kron(C, opa(ZA))
@test collect(kron(opa(dA), dC)) ≈ kron(opa(A), C)
@test collect(kron(dC, opa(dA))) ≈ kron(C, opa(A))
@test collect(kron(opa(dZA), dC)) ≈ kron(opa(ZA), C)
@test collect(kron(dC, opa(dZA))) ≈ kron(C, opa(ZA))
end
end

# Test type promotion for kron with Diagonal
@testset "Type promotion - T1 = $T1, T2 = $T2" for (T1, T2) in [(Float64, Float32), (ComplexF64, Float64)]
A_T1 = sprand(T1, m, m, 0.2)
dA_T1 = CuSparseMatrixCSR(A_T1)
dC_T2 = Diagonal(CUDA.rand(T2, div(m, 2)))
C_T2 = collect(dC_T2)

# Test kron(sparse_T1, diagonal_T2)
result_gpu = kron(dA_T1, dC_T2)
result_cpu = kron(A_T1, C_T2)
@test eltype(result_gpu) == promote_type(T1, T2)
@test collect(result_gpu) ≈ result_cpu

# Test kron(diagonal_T2, sparse_T1)
result_gpu = kron(dC_T2, dA_T1)
result_cpu = kron(C_T2, A_T1)
@test eltype(result_gpu) == promote_type(T1, T2)
@test collect(result_gpu) ≈ result_cpu
end

# Test kron with zero diagonal matrices
@testset "kron with zero Diagonal" begin
A = sprand(T, m, m, 0.2)
dA = CuSparseMatrixCSR(A)
dC_zeros = Diagonal(CUDA.zeros(T, div(m, 2)))
C_zeros = collect(dC_zeros)

# Test kron(sparse, zero_diagonal)
result_gpu = kron(dA, dC_zeros)
result_cpu = kron(A, C_zeros)
@test SparseMatrixCSC(result_gpu) ≈ result_cpu
@test nnz(result_gpu) == 0

# Test kron(zero_diagonal, sparse)
result_gpu = kron(dC_zeros, dA)
result_cpu = kron(C_zeros, A)
@test SparseMatrixCSC(result_gpu) ≈ result_cpu
@test nnz(result_gpu) == 0
end
end

@testset "Reshape $typ (100,100) -> (20, 500) and droptol" for
Expand Down
102 changes: 102 additions & 0 deletions test/libraries/fillarrays.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
using CUDA.CUSPARSE
using LinearAlgebra, SparseArrays
using FillArrays

# Kronecker products between CUSPARSE COO-backed matrices and Diagonal
# matrices whose diagonal is a FillArrays type (Ones / Fill / Zeros),
# exercising the FillArraysExt package extension.
@testset "FillArrays - Kronecker product" begin
    @testset "T = $T" for T in [Float32, Float64, ComplexF32, ComplexF64]
        m, n = 100, 80
        A = sprand(T, m, n, 0.2)

        # All-ones diagonal: kron must reproduce the dense CPU reference.
        @testset "Ones - size $diag_size" for diag_size in [5, 10]
            D = Diagonal(Ones{T}(diag_size))
            gpuA = CuSparseMatrixCSR(A)

            # sparse ⊗ diagonal
            @test collect(kron(gpuA, D)) ≈ kron(A, collect(D))
            # diagonal ⊗ sparse
            @test collect(kron(D, gpuA)) ≈ kron(collect(D), A)
        end

        # Constant (non-unit) diagonal values, including a negative one.
        @testset "Fill - value $val, size $diag_size" for val in [T(2.5), T(-1.0)], diag_size in [5, 10]
            D = Diagonal(Fill(val, diag_size))
            gpuA = CuSparseMatrixCSR(A)

            # sparse ⊗ diagonal
            @test collect(kron(gpuA, D)) ≈ kron(A, collect(D))
            # diagonal ⊗ sparse
            @test collect(kron(D, gpuA)) ≈ kron(collect(D), A)
        end

        # All-zero diagonal: the product must be structurally empty.
        @testset "Zeros - size $diag_size" for diag_size in [5, 10]
            D = Diagonal(Zeros{T}(diag_size))
            gpuA = CuSparseMatrixCSR(A)

            # sparse ⊗ zero diagonal
            got = kron(gpuA, D)
            @test SparseMatrixCSC(got) ≈ kron(A, collect(D))
            @test nnz(got) == 0

            # zero diagonal ⊗ sparse
            got = kron(D, gpuA)
            @test SparseMatrixCSC(got) ≈ kron(collect(D), A)
            @test nnz(got) == 0
        end

        # Lazy transpose/adjoint wrappers around the sparse operand.
        @testset "Transpose/Adjoint with Fill" begin
            diag_size = 5
            val = T(3.0)
            D = Diagonal(Fill(val, diag_size))

            @testset "opa = $opa" for opa in (identity, transpose, adjoint)
                gpuA = CuSparseMatrixCSR(A)

                # opa(sparse) ⊗ diagonal
                @test collect(kron(opa(gpuA), D)) ≈ kron(opa(A), collect(D))
                # diagonal ⊗ opa(sparse)
                @test collect(kron(D, opa(gpuA))) ≈ kron(collect(D), opa(A))
            end
        end

        # Mixed element types: the result eltype must follow promote_type.
        @testset "Type promotion" begin
            @testset "T1 = $T1, T2 = $T2" for (T1, T2) in [(Float64, Float32), (ComplexF64, Float64), (ComplexF32, Float32)]
                S = sprand(T1, m, n, 0.2)
                gpuS = CuSparseMatrixCSR(S)
                D = Diagonal(Fill(T2(2.5), 5))

                # sparse_T1 ⊗ diagonal_T2
                got = kron(gpuS, D)
                @test eltype(got) == promote_type(T1, T2)
                @test collect(got) ≈ kron(S, collect(D))

                # diagonal_T2 ⊗ sparse_T1
                got = kron(D, gpuS)
                @test eltype(got) == promote_type(T1, T2)
                @test collect(got) ≈ kron(collect(D), S)
            end
        end
    end
end