diff --git a/Project.toml b/Project.toml index 3862036fdb..c3a41f2ea9 100644 --- a/Project.toml +++ b/Project.toml @@ -40,12 +40,14 @@ demumble_jll = "1e29f10c-031c-5a83-9565-69cddfc27673" [weakdeps] ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869" +FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b" SparseMatricesCSR = "a0a7dd2c-ebf4-11e9-1f05-cf50bc540ca1" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" [extensions] ChainRulesCoreExt = "ChainRulesCore" EnzymeCoreExt = "EnzymeCore" +FillArraysExt = "FillArrays" SparseMatricesCSRExt = "SparseMatricesCSR" SpecialFunctionsExt = "SpecialFunctions" @@ -63,6 +65,7 @@ Crayons = "4" DataFrames = "1.5" EnzymeCore = "0.8.2" ExprTools = "0.1" +FillArrays = "1" GPUArrays = "11.3.1" GPUCompiler = "1.4" GPUToolbox = "0.3, 1" @@ -92,5 +95,6 @@ julia = "1.10" [extras] ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869" +FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b" SparseMatricesCSR = "a0a7dd2c-ebf4-11e9-1f05-cf50bc540ca1" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" diff --git a/ext/FillArraysExt.jl b/ext/FillArraysExt.jl new file mode 100644 index 0000000000..865faebf5a --- /dev/null +++ b/ext/FillArraysExt.jl @@ -0,0 +1,69 @@ +module FillArraysExt + +using CUDA +using CUDA.CUSPARSE +using LinearAlgebra +using SparseArrays +using FillArrays + +# kron between CuSparseMatrixCOO and Diagonal{T, AbstractFill} +# This is optimized for FillArrays since the diagonal is a constant value +function LinearAlgebra.kron(A::CUSPARSE.CuSparseMatrixCOO{T1, Ti}, B::Diagonal{T2, <:FillArrays.AbstractFill{T2}}) where {Ti, T1, T2} + T = promote_type(T1, T2) + mA, nA = size(A) + mB, nB = size(B) + out_shape = (mA * mB, nA * nB) + Annz = Int64(A.nnz) + + # Get the fill value from the diagonal + fill_value = FillArrays.getindex_value(B.diag) + + if Annz == 0 || iszero(fill_value) + return CUSPARSE.CuSparseMatrixCOO(CUDA.CuVector{Ti}(undef, 0), CUDA.CuVector{Ti}(undef, 0), CUDA.CuVector{T}(undef, 0), out_shape) + end + + row = (A.rowInd .- 1) .* mB + row = repeat(row, inner = nB) + col = (A.colInd .- 1) .* nB + col = repeat(col, inner = nB) + data = repeat(convert(CUDA.CuVector{T}, A.nzVal), inner = nB) + + row .+= CUDA.CuVector(repeat(0:nB-1, outer = Annz)) .+ 1 + col .+= CUDA.CuVector(repeat(0:nB-1, outer = Annz)) .+ 1 + + # Multiply by the fill value (already promoted type T) + data .*= fill_value + + CUSPARSE.sparse(row, col, data, out_shape..., fmt = :coo) +end + +# kron between Diagonal{T, AbstractFill} and CuSparseMatrixCOO +function LinearAlgebra.kron(A::Diagonal{T1, <:FillArrays.AbstractFill{T1}}, B::CUSPARSE.CuSparseMatrixCOO{T2, Ti}) where {Ti, T1, T2} + T = promote_type(T1, T2) + mA, nA = size(A) + mB, nB = size(B) + out_shape = (mA * mB, nA * nB) + Bnnz = Int64(B.nnz) + + # Get the fill value from the diagonal + fill_value = FillArrays.getindex_value(A.diag) + + if Bnnz == 0 || iszero(fill_value) + return CUSPARSE.CuSparseMatrixCOO(CUDA.CuVector{Ti}(undef, 0), CUDA.CuVector{Ti}(undef, 0), CUDA.CuVector{T}(undef, 0), out_shape) + end + + row = (0:nA-1) .* mB + row = CUDA.CuVector(repeat(row, inner = Bnnz)) + col = (0:nA-1) .* nB + col = CUDA.CuVector(repeat(col, inner = Bnnz)) + data = CUDA.fill(T(fill_value), nA * Bnnz) + + row .+= repeat(B.rowInd .- 1, outer = nA) .+ 1 + col .+= repeat(B.colInd .- 1, outer = nA) .+ 1 + + data .*= repeat(B.nzVal, outer = nA) + + CUSPARSE.sparse(row, col, data, out_shape..., fmt = :coo) +end + +end # extension module diff --git a/lib/cusparse/linalg.jl b/lib/cusparse/linalg.jl index ebcde249be..189d7caa26 100644 --- a/lib/cusparse/linalg.jl +++ b/lib/cusparse/linalg.jl @@ -58,47 +58,65 @@ function LinearAlgebra.kron(A::CuSparseMatrixCOO{T, Ti}, B::CuSparseMatrixCOO{T, sparse(row, col, data, out_shape..., fmt = :coo) end -function LinearAlgebra.kron(A::CuSparseMatrixCOO{T, Ti}, B::Diagonal) where {Ti, T} +function LinearAlgebra.kron(A::CuSparseMatrixCOO{T1, Ti}, B::Diagonal{T2, <:CuVector{T2}}) where {Ti, T1, T2} + T = promote_type(T1, T2) mA,nA = size(A) mB,nB = size(B) out_shape = (mA * mB, nA * nB) Annz = Int64(A.nnz) - Bnnz = nB + + # Find non-zero diagonal elements + nz_mask = map(!iszero, B.diag) + nz_indices = findall(nz_mask) + Bnnz = length(nz_indices) if Annz == 0 || Bnnz == 0 return CuSparseMatrixCOO(CuVector{Ti}(undef, 0), CuVector{Ti}(undef, 0), CuVector{T}(undef, 0), out_shape) end + # Only process non-zero diagonal elements + nz_diag = B.diag[nz_indices] + nz_offsets = CuVector(nz_indices .- 1) + row = (A.rowInd .- 1) .* mB row = repeat(row, inner = Bnnz) col = (A.colInd .- 1) .* nB col = repeat(col, inner = Bnnz) - data = repeat(A.nzVal, inner = Bnnz) + data = repeat(convert(CuVector{T}, A.nzVal), inner = Bnnz) - row .+= CuVector(repeat(0:nB-1, outer = Annz)) .+ 1 - col .+= CuVector(repeat(0:nB-1, outer = Annz)) .+ 1 + row .+= repeat(nz_offsets, outer = Annz) .+ 1 + col .+= repeat(nz_offsets, outer = Annz) .+ 1 - data .*= repeat(CUDA.ones(T, nB), outer = Annz) + data .*= repeat(nz_diag, outer = Annz) sparse(row, col, data, out_shape..., fmt = :coo) end -function LinearAlgebra.kron(A::Diagonal, B::CuSparseMatrixCOO{T, Ti}) where {Ti, T} +function LinearAlgebra.kron(A::Diagonal{T1, <:CuVector{T1}}, B::CuSparseMatrixCOO{T2, Ti}) where {Ti, T1, T2} + T = promote_type(T1, T2) mA,nA = size(A) mB,nB = size(B) out_shape = (mA * mB, nA * nB) - Annz = nA Bnnz = Int64(B.nnz) - if Annz == 0 || Bnnz == 0 + # Find non-zero diagonal elements + nz_mask = map(!iszero, A.diag) + nz_indices = findall(nz_mask) + Annz = length(nz_indices) + + if Bnnz == 0 || Annz == 0 return CuSparseMatrixCOO(CuVector{Ti}(undef, 0), CuVector{Ti}(undef, 0), CuVector{T}(undef, 0), out_shape) end - row = (0:nA-1) .* mB - row = CuVector(repeat(row, inner = Bnnz)) - col = (0:nA-1) .* nB - col = CuVector(repeat(col, inner = Bnnz)) - data = repeat(CUDA.ones(T, nA), inner = Bnnz) + # Only process non-zero diagonal elements + nz_diag = A.diag[nz_indices] + nz_offsets = CuVector(nz_indices .- 1) + + row = nz_offsets .* mB + row = repeat(row, inner = Bnnz) + col = nz_offsets .* nB + col = repeat(col, inner = Bnnz) + data = repeat(convert(CuVector{T}, nz_diag), inner = Bnnz) row .+= repeat(B.rowInd .- 1, outer = Annz) .+ 1 col .+= repeat(B.colInd .- 1, outer = Annz) .+ 1 diff --git a/test/Project.toml b/test/Project.toml index 861e9cc219..1da3b60af0 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -9,11 +9,12 @@ DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" +FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b" GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55" InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" -JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819" Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59" +JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" NVTX = "5da4648a-3479-48b8-97b9-01cb529c0a1f" diff --git a/test/libraries/cusparse/linalg.jl b/test/libraries/cusparse/linalg.jl index b4232a8ef6..6c63080033 100644 --- a/test/libraries/cusparse/linalg.jl +++ b/test/libraries/cusparse/linalg.jl @@ -6,7 +6,9 @@ m = 10 A = sprand(T, m, m, 0.2) B = sprand(T, m, m, 0.3) ZA = spzeros(T, m, m) - C = I(div(m, 2)) + C = Diagonal(rand(T, div(m, 2))) + + dC = adapt(CuArray, C) @testset "type = $typ" for typ in [CuSparseMatrixCSR, CuSparseMatrixCSC] dA = typ(A) dB = typ(B) @@ -35,12 +37,52 @@ m = 10 end end @testset "kronecker product with I opa = $opa" for opa in (identity, transpose, adjoint) - @test collect(kron(opa(dA), C)) ≈ kron(opa(A), C) - @test collect(kron(C, opa(dA))) ≈ kron(C, opa(A)) - @test collect(kron(opa(dZA), C)) ≈ kron(opa(ZA), C) - @test collect(kron(C, opa(dZA))) ≈ kron(C, opa(ZA)) + @test collect(kron(opa(dA), dC)) ≈ kron(opa(A), C) + @test collect(kron(dC, opa(dA))) ≈ kron(C, opa(A)) + @test collect(kron(opa(dZA), dC)) ≈ kron(opa(ZA), C) + @test collect(kron(dC, opa(dZA))) ≈ kron(C, opa(ZA)) end end + + # Test type promotion for kron with Diagonal + @testset "Type promotion - T1 = $T1, T2 = $T2" for (T1, T2) in [(Float64, Float32), (ComplexF64, Float64)] + A_T1 = sprand(T1, m, m, 0.2) + dA_T1 = CuSparseMatrixCSR(A_T1) + dC_T2 = Diagonal(CUDA.rand(T2, div(m, 2))) + C_T2 = collect(dC_T2) + + # Test kron(sparse_T1, diagonal_T2) + result_gpu = kron(dA_T1, dC_T2) + result_cpu = kron(A_T1, C_T2) + @test eltype(result_gpu) == promote_type(T1, T2) + @test collect(result_gpu) ≈ result_cpu + + # Test kron(diagonal_T2, sparse_T1) + result_gpu = kron(dC_T2, dA_T1) + result_cpu = kron(C_T2, A_T1) + @test eltype(result_gpu) == promote_type(T1, T2) + @test collect(result_gpu) ≈ result_cpu + end + + # Test kron with zero diagonal matrices + @testset "kron with zero Diagonal" begin + A = sprand(T, m, m, 0.2) + dA = CuSparseMatrixCSR(A) + dC_zeros = Diagonal(CUDA.zeros(T, div(m, 2))) + C_zeros = collect(dC_zeros) + + # Test kron(sparse, zero_diagonal) + result_gpu = kron(dA, dC_zeros) + result_cpu = kron(A, C_zeros) + @test SparseMatrixCSC(result_gpu) ≈ result_cpu + @test nnz(result_gpu) == 0 + + # Test kron(zero_diagonal, sparse) + result_gpu = kron(dC_zeros, dA) + result_cpu = kron(C_zeros, A) + @test SparseMatrixCSC(result_gpu) ≈ result_cpu + @test nnz(result_gpu) == 0 + end end @testset "Reshape $typ (100,100) -> (20, 500) and droptol" for diff --git a/test/libraries/fillarrays.jl b/test/libraries/fillarrays.jl new file mode 100644 index 0000000000..e4d97c0c8b --- /dev/null +++ b/test/libraries/fillarrays.jl @@ -0,0 +1,102 @@ +using CUDA.CUSPARSE +using LinearAlgebra, SparseArrays +using FillArrays + +@testset "FillArrays - Kronecker product" begin + @testset "T = $T" for T in [Float32, Float64, ComplexF32, ComplexF64] + m, n = 100, 80 + A = sprand(T, m, n, 0.2) + + # Test with Ones + @testset "Ones - size $diag_size" for diag_size in [5, 10] + C_ones = Diagonal(Ones{T}(diag_size)) + dA = CuSparseMatrixCSR(A) + + # Test kron(sparse, diagonal) + result_gpu = collect(kron(dA, C_ones)) + result_cpu = kron(A, collect(C_ones)) + @test result_gpu ≈ result_cpu + + # Test kron(diagonal, sparse) + result_gpu = collect(kron(C_ones, dA)) + result_cpu = kron(collect(C_ones), A) + @test result_gpu ≈ result_cpu + end + + # Test with Fill (constant value) + @testset "Fill - value $val, size $diag_size" for val in [T(2.5), T(-1.0)], diag_size in [5, 10] + C_fill = Diagonal(Fill(val, diag_size)) + dA = CuSparseMatrixCSR(A) + + # Test kron(sparse, diagonal) + result_gpu = collect(kron(dA, C_fill)) + result_cpu = kron(A, collect(C_fill)) + @test result_gpu ≈ result_cpu + + # Test kron(diagonal, sparse) + result_gpu = collect(kron(C_fill, dA)) + result_cpu = kron(collect(C_fill), A) + @test result_gpu ≈ result_cpu + end + + # Test with Zeros + @testset "Zeros - size $diag_size" for diag_size in [5, 10] + C_zeros = Diagonal(Zeros{T}(diag_size)) + dA = CuSparseMatrixCSR(A) + + # Test kron(sparse, diagonal) + result_gpu = kron(dA, C_zeros) + result_cpu = kron(A, collect(C_zeros)) + @test SparseMatrixCSC(result_gpu) ≈ result_cpu + @test nnz(result_gpu) == 0 + + # Test kron(diagonal, sparse) + result_gpu = kron(C_zeros, dA) + result_cpu = kron(collect(C_zeros), A) + @test SparseMatrixCSC(result_gpu) ≈ result_cpu + @test nnz(result_gpu) == 0 + end + + # Test with transpose and adjoint wrappers + @testset "Transpose/Adjoint with Fill" begin + diag_size = 5 + val = T(3.0) + C_fill = Diagonal(Fill(val, diag_size)) + + @testset "opa = $opa" for opa in (identity, transpose, adjoint) + dA = CuSparseMatrixCSR(A) + + # Test kron(opa(sparse), diagonal) + result_gpu = collect(kron(opa(dA), C_fill)) + result_cpu = kron(opa(A), collect(C_fill)) + @test result_gpu ≈ result_cpu + + # Test kron(diagonal, opa(sparse)) + result_gpu = collect(kron(C_fill, opa(dA))) + result_cpu = kron(collect(C_fill), opa(A)) + @test result_gpu ≈ result_cpu + end + end + + # Test type promotion + @testset "Type promotion" begin + @testset "T1 = $T1, T2 = $T2" for (T1, T2) in [(Float64, Float32), (ComplexF64, Float64), (ComplexF32, Float32)] + A_T1 = sprand(T1, m, n, 0.2) + dA_T1 = CuSparseMatrixCSR(A_T1) + C_fill_T2 = Diagonal(Fill(T2(2.5), 5)) + + # Test kron(sparse_T1, diagonal_T2) + result_gpu = kron(dA_T1, C_fill_T2) + result_cpu = kron(A_T1, collect(C_fill_T2)) + @test eltype(result_gpu) == promote_type(T1, T2) + @test collect(result_gpu) ≈ result_cpu + + # Test kron(diagonal_T2, sparse_T1) + result_gpu = kron(C_fill_T2, dA_T1) + result_cpu = kron(collect(C_fill_T2), A_T1) + @test eltype(result_gpu) == promote_type(T1, T2) + @test collect(result_gpu) ≈ result_cpu + end + end + end +end