Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Clean CUDA operators #108

Merged
merged 1 commit into from
Oct 3, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
219 changes: 114 additions & 105 deletions ext/CombinatorialSpacesCUDAExt.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
module CombinatorialSpacesCUDAExt

using CombinatorialSpaces
using CombinatorialSpaces.DiscreteExteriorCalculus: DiscreteHodge
using GeometryBasics
using CUDA
using CUDA.CUSPARSE
Expand All @@ -12,176 +13,184 @@ Point3D = Point3{Float64}
import CombinatorialSpaces: dec_wedge_product, dec_c_wedge_product, dec_c_wedge_product!, dec_p_wedge_product,
dec_boundary, dec_differential, dec_dual_derivative, dec_hodge_star, dec_inv_hodge_star

""" dec_wedge_product(::Type{Tuple{0,0}}, sd::HasDeltaSet, ::Type{Val{:CUDA}})
# Wedge Product
#--------------

Wedge Primal 0, 0 for CUDA
""" dec_wedge_product(::Type{Tuple{j,k}}, sd::HasDeltaSet, ::Type{Val{:CUDA}})

Return a function that computes the wedge product between a primal j-form and a primal k-form via CUDA.

See also: [`dec_p_wedge_product`](@ref), [`dec_c_wedge_product`](@ref), [`dec_c_wedge_product!`](@ref).
"""
function dec_wedge_product(::Type{Tuple{j,k}}, sd::HasDeltaSet, ::Type{Val{:CUDA}}) where {j,k}
error("Unsupported combination of degrees $j and $k. Ensure that their sum is not greater than the degree of the complex.")
end

function dec_wedge_product(::Type{Tuple{0,0}}, sd::HasDeltaSet, ::Type{Val{:CUDA}})
(f, g) -> f .* g
end

""" function dec_wedge_product(::Type{Tuple{k,0}}, sd::HasDeltaSet, ::Type{Val{:CUDA}})

Wedge Primal K, 0 for CUDA
"""
function dec_wedge_product(::Type{Tuple{k,0}}, sd::HasDeltaSet, ::Type{Val{:CUDA}}) where {k}
val_pack = dec_p_wedge_product(Tuple{0,k}, sd, Val{:CUDA})
(α, g) -> dec_c_wedge_product(Tuple{0,k}, g, α, val_pack, Val{:CUDA})
wedge_cache = dec_p_wedge_product(Tuple{0,k}, sd, Val{:CUDA})
(α, g) -> dec_c_wedge_product(Tuple{0,k}, g, α, wedge_cache, Val{:CUDA})
end

""" function dec_wedge_product(::Type{Tuple{0,k}}, sd::HasDeltaSet, ::Type{Val{:CUDA}})

Wedge Primal 0, K for CUDA
"""
function dec_wedge_product(::Type{Tuple{0,k}}, sd::HasDeltaSet, ::Type{Val{:CUDA}}) where {k}
val_pack = dec_p_wedge_product(Tuple{0,k}, sd, Val{:CUDA})
(f, β) -> dec_c_wedge_product(Tuple{0,k}, f, β, val_pack, Val{:CUDA})
wedge_cache = dec_p_wedge_product(Tuple{0,k}, sd, Val{:CUDA})
(f, β) -> dec_c_wedge_product(Tuple{0,k}, f, β, wedge_cache, Val{:CUDA})
end

""" function dec_wedge_product(::Type{Tuple{1,1}}, sd::HasDeltaSet2D, ::Type{Val{:CUDA}})

Wedge Primal 1, 1 for CUDA
"""
function dec_wedge_product(::Type{Tuple{1,1}}, sd::HasDeltaSet2D, ::Type{Val{:CUDA}})
val_pack = dec_p_wedge_product(Tuple{1,1}, sd, Val{:CUDA})
(α, β) -> dec_c_wedge_product(Tuple{1,1}, α, β, val_pack, Val{:CUDA})
wedge_cache = dec_p_wedge_product(Tuple{1,1}, sd, Val{:CUDA})
(α, β) -> dec_c_wedge_product(Tuple{1,1}, α, β, wedge_cache, Val{:CUDA})
end

""" function dec_p_wedge_product(::Type{Tuple{m,n}}, sd, ::Type{Val{:CUDA}})

Preallocation for CUDA Wedges
"""
#Preallocate values to compute a wedge product via CUDA.
function dec_p_wedge_product(::Type{Tuple{m,n}}, sd, ::Type{Val{:CUDA}}) where {m,n}
val_pack = dec_p_wedge_product(Tuple{m,n}, sd)
(CuArray.(val_pack[1:end-1])..., val_pack[end])
end

""" function dec_c_wedge_product(::Type{Tuple{m,n}}, α, β, val_pack, ::Type{Val{:CUDA}})

Computation for CUDA Wedges
"""
function dec_c_wedge_product(::Type{Tuple{m,n}}, α, β, val_pack, ::Type{Val{:CUDA}}) where {m,n}
wedge_terms = CUDA.zeros(eltype(α), last(last(val_pack)))
return dec_c_wedge_product!(Tuple{m,n}, wedge_terms, α, β, val_pack, Val{:CUDA})
end

function dec_c_wedge_product!(::Type{Tuple{0,1}}, wedge_terms, f, α, val_pack, ::Type{Val{:CUDA}})
num_threads = CUDA.max_block_size.x
num_blocks = min(ceil(Int, length(wedge_terms) / num_threads), CUDA.max_grid_size.x)

@cuda threads=num_threads blocks=num_blocks dec_cu_ker_c_wedge_product_01!(wedge_terms, f, α, val_pack[1])
return wedge_terms
wedge_cache = dec_p_wedge_product(Tuple{m,n}, sd)
(CuArray.(wedge_cache[1:end-1])..., wedge_cache[end])
end

function dec_c_wedge_product!(::Type{Tuple{0,2}}, wedge_terms, f, α, val_pack, ::Type{Val{:CUDA}})
num_threads = CUDA.max_block_size.x
num_blocks = min(ceil(Int, length(wedge_terms) / num_threads), CUDA.max_grid_size.x)

@cuda threads=num_threads blocks=num_blocks dec_cu_ker_c_wedge_product_02!(wedge_terms, f, α, val_pack[1], val_pack[2])
return wedge_terms
# Compute with a preallocated wedge product via CUDA.
function dec_c_wedge_product(::Type{Tuple{m,n}}, α, β, wedge_cache, ::Type{Val{:CUDA}}) where {m,n}
res = CUDA.zeros(eltype(α), last(last(wedge_cache)))
dec_c_wedge_product!(Tuple{m,n}, res, α, β, wedge_cache, Val{:CUDA})
end

function dec_c_wedge_product!(::Type{Tuple{1,1}}, wedge_terms, f, α, val_pack, ::Type{Val{:CUDA}})
num_threads = CUDA.max_block_size.x
num_blocks = min(ceil(Int, length(wedge_terms) / num_threads), CUDA.max_grid_size.x)
# Compute with a preallocated wedge product via CUDA in-place.
function dec_c_wedge_product!(::Type{Tuple{j,k}}, res, α, β, wedge_cache, ::Type{Val{:CUDA}}) where {j,k}
# Manually dispatch, since CUDA.jl kernels cannot.
kernel = if (j,k) == (0,1)
dec_cu_ker_c_wedge_product_01!
elseif (j,k) == (0,2)
dec_cu_ker_c_wedge_product_02!
elseif (j,k) == (1,1)
dec_cu_ker_c_wedge_product_11!
else
lukem12345 marked this conversation as resolved.
Show resolved Hide resolved
error("Unsupported combination of degrees $j and $k. Ensure that their sum is not greater than the degree of the complex.")
end

@cuda threads=num_threads blocks=num_blocks dec_cu_ker_c_wedge_product_11!(wedge_terms, f, α, val_pack[1], val_pack[2])
return wedge_terms
n_threads = CUDA.max_block_size.x
n_blocks = min(ceil(Int, length(res) / n_threads), CUDA.max_grid_size.x)
@cuda threads=n_threads blocks=n_blocks kernel(res, α, β, wedge_cache)
res
end

function dec_cu_ker_c_wedge_product_01!(wedge_terms::CuDeviceArray{T}, f, α, primal_vertices) where T
function dec_cu_ker_c_wedge_product_01!(res::CuDeviceArray{T}, f, α, wedge_cache) where T
p = wedge_cache[1]
index = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x
stride = gridDim().x * blockDim().x
i = index
@inbounds while i <= Int32(length(wedge_terms))
wedge_terms[i] = T(0.5) * α[i] * (f[primal_vertices[i, Int32(1)]] + f[primal_vertices[i, Int32(2)]])
@inbounds while i <= Int32(length(res))
res[i] = T(0.5) * α[i] * (f[p[i, Int32(1)]] + f[p[i, Int32(2)]])
i += stride
end
return nothing
nothing
end

function dec_cu_ker_c_wedge_product_02!(wedge_terms::CuDeviceArray{T}, f, α, pv, coeffs) where T
index = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x
function dec_cu_ker_c_wedge_product_02!(res, f, α, wedge_cache)
p, c = wedge_cache[1], wedge_cache[2]
i = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x
stride = gridDim().x * blockDim().x
i = index

@inbounds while i <= Int32(length(wedge_terms))
wedge_terms[i] = α[i] * (coeffs[Int32(1), i] * f[pv[Int32(1), i]] + coeffs[Int32(2), i] * f[pv[Int32(2), i]]
+ coeffs[Int32(3), i] * f[pv[Int32(3), i]] + coeffs[Int32(4), i] * f[pv[Int32(4), i]]
+ coeffs[Int32(5), i] * f[pv[Int32(5), i]] + coeffs[Int32(6), i] * f[pv[Int32(6), i]])
@inbounds while i <= Int32(length(res))
c1, c2, c3, c4, c5, c6 = c[Int32(1),i], c[Int32(2),i], c[Int32(3),i], c[Int32(4),i], c[Int32(5),i], c[Int32(6),i]
p1, p2, p3, p4, p5, p6 = p[Int32(1),i], p[Int32(2),i], p[Int32(3),i], p[Int32(4),i], p[Int32(5),i], p[Int32(6),i]
res[i] = α[i] * (c1*f[p1] + c2*f[p2] + c3*f[p3] + c4*f[p4] + c5*f[p5] + c6*f[p6])
i += stride
end
return nothing
nothing
end
lukem12345 marked this conversation as resolved.
Show resolved Hide resolved

function dec_cu_ker_c_wedge_product_11!(wedge_terms, α, β, e, coeffs)
index = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x
function dec_cu_ker_c_wedge_product_11!(res, α, β, wedge_cache)
e, c = wedge_cache[1], wedge_cache[2]
i = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x
stride = gridDim().x * blockDim().x
i = index

@inbounds while i <= Int32(length(wedge_terms))
@inbounds while i <= Int32(length(res))
e0, e1, e2 = e[Int32(1), i], e[Int32(2), i], e[Int32(3), i]
c1, c2, c3 = c[Int32(1), i], c[Int32(2), i], c[Int32(3), i]
ae0, ae1, ae2 = α[e0], α[e1], α[e2]
be0, be1, be2 = β[e0], β[e1], β[e2]

c1, c2, c3 = coeffs[Int32(1), i], coeffs[Int32(2), i], coeffs[Int32(3), i]

wedge_terms[i] = (c1 * (ae2 * be1 - ae1 * be2)
+ c2 * (ae2 * be0 - ae0 * be2)
+ c3 * (ae1 * be0 - ae0 * be1))
res[i] =
(c1 * (ae2 * be1 - ae1 * be2) +
c2 * (ae2 * be0 - ae0 * be2) +
c3 * (ae1 * be0 - ae0 * be1))
i += stride
end

return nothing
nothing
end
lukem12345 marked this conversation as resolved.
Show resolved Hide resolved

""" dec_boundary(n::Int, sd::EmbeddedDeltaDualComplex, ::Type{Val{:CUDA}})
# Boundary and Co-boundary
#-------------------------

""" dec_boundary(n::Int, sd::HasDeltaSet, ::Type{Val{:CUDA}})

Boundary matrix as a sparse CUDA matrix
Compute a boundary matrix as a sparse CUDA matrix.
"""
dec_boundary(n::Int, sd::EmbeddedDeltaDualComplex1D{Bool, float_type, _p} where _p, ::Type{Val{:CUDA}}) where float_type = CuSparseMatrixCSC{float_type}(dec_boundary(n, sd))
dec_boundary(n::Int, sd::HasDeltaSet, ::Type{Val{:CUDA}}) =
CuSparseMatrixCSC(dec_boundary(n, sd))

""" dec_dual_derivative(n::Int, sd::EmbeddedDeltaDualComplex, ::Type{Val{:CUDA}})
dec_boundary(n::Int, sd::EmbeddedDeltaDualComplex1D{Bool, float_type, _p} where _p, ::Type{Val{:CUDA}}) where float_type =
CuSparseMatrixCSC{float_type}(dec_boundary(n, sd))
dec_boundary(n::Int, sd::EmbeddedDeltaDualComplex2D{Bool, float_type, _p} where _p, ::Type{Val{:CUDA}}) where float_type =
CuSparseMatrixCSC{float_type}(dec_boundary(n, sd))

Dual derivative matrix as a sparse CUDA matrix
""" dec_dual_derivative(n::Int, sd::EmbeddedDeltaDualComplex1D, ::Type{Val{:CUDA}})

Compute a dual derivative matrix as a sparse CUDA matrix.
"""
dec_dual_derivative(n::Int, sd::EmbeddedDeltaDualComplex1D{Bool, float_type, _p} where _p, ::Type{Val{:CUDA}}) where float_type = CuSparseMatrixCSC{float_type}(dec_dual_derivative(n, sd))
dec_dual_derivative(n::Int, sd::HasDeltaSet, ::Type{Val{:CUDA}}) =
CuSparseMatrixCSC(dec_dual_derivative(n, sd))

dec_dual_derivative(n::Int, sd::EmbeddedDeltaDualComplex1D{Bool, float_type, _p} where _p, ::Type{Val{:CUDA}}) where float_type =
CuSparseMatrixCSC{float_type}(dec_dual_derivative(n, sd))
dec_dual_derivative(n::Int, sd::EmbeddedDeltaDualComplex2D{Bool, float_type, _p} where _p, ::Type{Val{:CUDA}}) where float_type =
CuSparseMatrixCSC{float_type}(dec_dual_derivative(n, sd))

""" dec_differential(n::Int, sd::EmbeddedDeltaDualComplex, ::Type{Val{:CUDA}})

Exterior derivative matrix as a sparse CUDA matrix
""" dec_differential(n::Int, sd::HasDeltaSet, ::Type{Val{:CUDA}})

Compute an exterior derivative matrix as a sparse CUDA matrix.
"""
dec_differential(n::Int, sd::EmbeddedDeltaDualComplex1D{Bool, float_type, _p} where _p, ::Type{Val{:CUDA}}) where float_type = CuSparseMatrixCSC{float_type}(dec_differential(n, sd))
dec_differential(n::Int, sd::HasDeltaSet, ::Type{Val{:CUDA}}) =
CuSparseMatrixCSC(dec_differential(n, sd))

dec_boundary(n::Int, sd::EmbeddedDeltaDualComplex2D{Bool, float_type, _p} where _p, ::Type{Val{:CUDA}}) where float_type = CuSparseMatrixCSC{float_type}(dec_boundary(n, sd))
dec_dual_derivative(n::Int, sd::EmbeddedDeltaDualComplex2D{Bool, float_type, _p} where _p, ::Type{Val{:CUDA}}) where float_type = CuSparseMatrixCSC{float_type}(dec_dual_derivative(n, sd))
dec_differential(n::Int, sd::EmbeddedDeltaDualComplex2D{Bool, float_type, _p} where _p, ::Type{Val{:CUDA}}) where float_type = CuSparseMatrixCSC{float_type}(dec_differential(n, sd))
dec_differential(n::Int, sd::EmbeddedDeltaDualComplex1D{Bool, float_type, _p} where _p, ::Type{Val{:CUDA}}) where float_type =
CuSparseMatrixCSC{float_type}(dec_differential(n, sd))
dec_differential(n::Int, sd::EmbeddedDeltaDualComplex2D{Bool, float_type, _p} where _p, ::Type{Val{:CUDA}}) where float_type =
CuSparseMatrixCSC{float_type}(dec_differential(n, sd))

dec_boundary(n::Int, sd::HasDeltaSet, ::Type{Val{:CUDA}}) = CuSparseMatrixCSC(dec_boundary(n, sd))
dec_dual_derivative(n::Int, sd::HasDeltaSet, ::Type{Val{:CUDA}}) = CuSparseMatrixCSC(dec_dual_derivative(n, sd))
dec_differential(n::Int, sd::HasDeltaSet, ::Type{Val{:CUDA}}) = CuSparseMatrixCSC(dec_differential(n, sd))
# Hodge Star
#-----------

""" dec_hodge_star(n::Int, sd::HasDeltaSet, ::HodgeType, ::Type{Val{:CUDA}})
""" dec_hodge_star(n::Int, sd::HasDeltaSet, h::DiscreteHodge, ::Type{Val{:CUDA}})

Hodge matrices as appropriate diagonal or sparse CUDA matrices
Compute a Hodge star as a diagonal or generic sparse CUDA matrix.
"""
dec_hodge_star(n::Int, sd::HasDeltaSet, ::DiagonalHodge, ::Type{Val{:CUDA}}) = CuArray(dec_hodge_star(n, sd, DiagonalHodge()))
dec_hodge_star(n::Int, sd::HasDeltaSet, ::GeometricHodge, ::Type{Val{:CUDA}}) = dec_hodge_star(Val{n}, sd, GeometricHodge(), Val{:CUDA})
dec_hodge_star(::Type{Val{n}}, sd::HasDeltaSet, ::GeometricHodge, ::Type{Val{:CUDA}}) where n = CuArray(dec_hodge_star(Val{n}, sd, GeometricHodge()))
dec_hodge_star(::Type{Val{1}}, sd::EmbeddedDeltaDualComplex2D, ::GeometricHodge, ::Type{Val{:CUDA}}) = CuSparseMatrixCSC(dec_hodge_star(Val{1}, sd, GeometricHodge()))
dec_hodge_star(n::Int, sd::HasDeltaSet, h::DiscreteHodge, ::Type{Val{:CUDA}}) =
dec_hodge_star(Val{n}, sd, h, Val{:CUDA})

dec_hodge_star(::Type{Val{n}}, sd::HasDeltaSet, h::DiscreteHodge, ::Type{Val{:CUDA}}) where n =
CuArray(dec_hodge_star(Val{n}, sd, h))

""" dec_inv_hodge_star(n::Int, sd::HasDeltaSet, ::HodgeType, ::Type{Val{:CUDA}})
dec_hodge_star(::Type{Val{1}}, sd::EmbeddedDeltaDualComplex2D, h::GeometricHodge, ::Type{Val{:CUDA}}) =
CuSparseMatrixCSC(dec_hodge_star(Val{1}, sd, h))

Inverse Hodge matrices as diagonal matrices
""" dec_inv_hodge_star(n::Int, sd::HasDeltaSet, h::DiscreteHodge, ::Type{Val{:CUDA}})

Compute an inverse Hodge star matrix as a diagonal CUDA matrix.
"""
dec_inv_hodge_star(n::Int, sd::HasDeltaSet, ::DiagonalHodge, ::Type{Val{:CUDA}}) = CuArray(dec_inv_hodge_star(n, sd, DiagonalHodge()))
dec_inv_hodge_star(n::Int, sd::HasDeltaSet, ::GeometricHodge, ::Type{Val{:CUDA}}) = dec_inv_hodge_star(Val{n}, sd, GeometricHodge(), Val{:CUDA})
dec_inv_hodge_star(::Type{Val{n}}, sd::HasDeltaSet, ::GeometricHodge, ::Type{Val{:CUDA}}) where n = CuArray(dec_inv_hodge_star(Val{n}, sd, GeometricHodge()))
dec_inv_hodge_star(n::Int, sd::HasDeltaSet, h::DiscreteHodge, ::Type{Val{:CUDA}}) =
dec_inv_hodge_star(Val{n}, sd, h, Val{:CUDA})

dec_inv_hodge_star(::Type{Val{n}}, sd::HasDeltaSet, h::DiscreteHodge, ::Type{Val{:CUDA}}) where n =
CuArray(dec_inv_hodge_star(Val{n}, sd, h))

""" function dec_inv_hodge_star(::Type{Val{1}}, sd::EmbeddedDeltaDualComplex2D, ::GeometricHodge, ::Type{Val{:CUDA}})

Inverse Geometric Hodge for Primal 1 implemented as a GMRES solver. This returns a function that
will compute the result.
Return a function that computes the inverse geometric Hodge star of a primal 1-form via a GMRES solver.
"""
function dec_inv_hodge_star(::Type{Val{1}}, sd::EmbeddedDeltaDualComplex2D, ::GeometricHodge, ::Type{Val{:CUDA}})
hdg = -1 * dec_hodge_star(1, sd, GeometricHodge(), Val{:CUDA})
Expand Down
Loading