From 034a52c4acc940155c269220d4f6f4775a818bb5 Mon Sep 17 00:00:00 2001
From: Kai Xu
Date: Wed, 17 Apr 2019 18:03:52 +0100
Subject: [PATCH 1/3] port code from CuGammaFuns.jl

---
 Project.toml         |  1 +
 src/CuArrays.jl      |  3 +-
 src/broadcast.jl     |  5 +++
 src/forwarddiff.jl   | 11 +++++
 src/special/gamma.jl | 65 ++++++++++++++++++++++++++++++
 test/runtests.jl     |  1 +
 test/special.jl      | 96 ++++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 181 insertions(+), 1 deletion(-)
 create mode 100644 src/special/gamma.jl
 create mode 100644 test/special.jl

diff --git a/Project.toml b/Project.toml
index 90aa006b..d7392faa 100644
--- a/Project.toml
+++ b/Project.toml
@@ -10,6 +10,7 @@ CUDAdrv = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde"
CUDAnative = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17"
GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
diff --git a/src/CuArrays.jl b/src/CuArrays.jl
index dff71611..ab8446b4 100644
--- a/src/CuArrays.jl
+++ b/src/CuArrays.jl
@@ -8,7 +8,7 @@ using GPUArrays

export CuArray, CuVector, CuMatrix, CuVecOrMat, cu, cuzeros, cuones, cufill

-import LinearAlgebra
+import LinearAlgebra, SpecialFunctions

using Adapt

@@ -33,6 +33,7 @@ include("array.jl")
include("subarray.jl")
include("utils.jl")
include("indexing.jl")
+include("special/gamma.jl")
include("broadcast.jl")
include("matmul.jl")
include("mapreduce.jl")
diff --git a/src/broadcast.jl b/src/broadcast.jl
index 1cfe2c60..dc8ae2b5 100644
--- a/src/broadcast.jl
+++ b/src/broadcast.jl
@@ -37,6 +37,11 @@ for f in libdevice
  @eval cufunc(::typeof(Base.$f)) = CUDAnative.$f
end

+cufunc(::typeof(SpecialFunctions.lbeta)) = CuArrays.lbeta
+cufunc(::typeof(SpecialFunctions.lgamma)) = CuArrays.lgamma
+cufunc(::typeof(SpecialFunctions.digamma)) = CuArrays.digamma
+cufunc(::typeof(SpecialFunctions.trigamma)) = CuArrays.trigamma
+
using MacroTools

const _cufuncs = copy(libdevice)
diff --git a/src/forwarddiff.jl b/src/forwarddiff.jl
index 6331f1cc..95cf1ebc 100644
--- a/src/forwarddiff.jl
+++ b/src/forwarddiff.jl
@@ -13,3 +13,14 @@ end
ForwardDiff.DiffRules.DEFINED_DIFFRULES[(:CUDAnative, :tanh, 1)] = x ->
  replace_device(:(1-tanh(x)^2))
eval(ForwardDiff.unary_dual_definition(:CUDAnative, :tanh))
+
+# Gamma functions
+
+ForwardDiff.DiffRules.@define_diffrule CuArrays.lgamma(a) = :(CuArrays.digamma($a))
+eval(ForwardDiff.unary_dual_definition(:CuArrays, :lgamma))
+
+ForwardDiff.DiffRules.@define_diffrule CuArrays.digamma(a) = :(CuArrays.trigamma($a))
+eval(ForwardDiff.unary_dual_definition(:CuArrays, :digamma))
+
+ForwardDiff.DiffRules.@define_diffrule CuArrays.lbeta(a, b) = :(CuArrays.digamma($a) - CuArrays.digamma($a + $b)), :(CuArrays.digamma($b) - CuArrays.digamma($a + $b))
+eval(ForwardDiff.binary_dual_definition(:CuArrays, :lbeta))
diff --git a/src/special/gamma.jl b/src/special/gamma.jl
new file mode 100644
index 00000000..38f8deed
--- /dev/null
+++ b/src/special/gamma.jl
@@ -0,0 +1,65 @@
+# This file is heavily adapted from https://github.com/JuliaMath/SpecialFunctions.jl.
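+#
+# These are scalar definitions written against CUDAnative intrinsics so that
+# they can run inside GPU kernels. The `cufunc` entries added to
+# src/broadcast.jl above map the SpecialFunctions names onto them, the intent
+# being that e.g. `SpecialFunctions.digamma.(xs)` on a CuArray `xs` lowers to
+# `CuArrays.digamma.(xs)` (test/special.jl below compares these against the
+# SpecialFunctions CPU implementations).
+#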
+# License is MIT: http://julialang.org/license + +function lgamma(x) + return CUDAnative.lgamma(x) +end + +function digamma(x) + if x <= 0 # reflection formula + ψ = -π / CUDAnative.tan(π * x) + x = 1 - x + else + ψ = zero(x) + end + if x < 7 + # shift using recurrence formula + ν = one(x) + n = 7 - CUDAnative.floor(x) + while ν <= n - 1 + ψ -= inv(x + ν) + ν += one(x) + end + ψ -= inv(x) + x += n + end + t = inv(x) + ψ += CUDAnative.log(x) - 0.5 * t + t *= t # 1/z^2 + # the coefficients here are Float64(bernoulli[2:9] .// (2*(1:8))) + ψ -= t * @evalpoly(t,0.08333333333333333,-0.008333333333333333,0.003968253968253968,-0.004166666666666667,0.007575757575757576,-0.021092796092796094,0.08333333333333333,-0.4432598039215686) + return ψ +end + +function _trigamma(x) + ψ = zero(x) + if x < 8 + # shift using recurrence formula + n = 8 - CUDAnative.floor(x) + ψ += inv(x)^2 + ν = one(x) + while ν <= n - 1 + ψ += inv(x + ν)^2 + ν += one(x) + end + x += n + end + t = inv(x) + w = t * t # 1/z^2 + ψ += t + 0.5 * w + # the coefficients here are Float64(bernoulli[2:9]) + ψ += t * w * @evalpoly(w,0.16666666666666666,-0.03333333333333333,0.023809523809523808,-0.03333333333333333,0.07575757575757576,-0.2531135531135531,1.1666666666666667,-7.092156862745098) + return ψ +end + +function trigamma(x) + if x <= 0 # reflection formula + return (π / CUDAnative.sin(π * x))^2 - _trigamma(1 - x) + else + return _trigamma(x) + end +end + +function lbeta(x, y) + return CUDAnative.lgamma(x) + CUDAnative.lgamma(y) - CUDAnative.lgamma(x + y) +end diff --git a/test/runtests.jl b/test/runtests.jl index acb829f3..8fdb5b89 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -24,6 +24,7 @@ include("solver.jl") include("fft.jl") include("rand.jl") include("sparse_solver.jl") +include("special.jl") CuArrays.pool_status() CuArrays.pool_timings() diff --git a/test/special.jl b/test/special.jl new file mode 100644 index 00000000..5be974c9 --- /dev/null +++ b/test/special.jl @@ -0,0 +1,96 @@ +using Test +import SpecialFunctions +using Flux: Tracker +using CuArrays + +n = 1000 + +xs_lgamma = randn(Float32, n); xs_lgamma_cu = cu(xs_lgamma) +xs_digamma = randn(Float32, n); xs_digamma_cu = cu(xs_digamma) +xs_trigamma = randn(Float32, n); xs_trigamma_cu = cu(xs_trigamma) +xs_lbeta_tuple = (randn(Float32, n), randn(Float32, n)) +xs_lbeta_tuple = map(xs -> abs.(xs), xs_lbeta_tuple); xs_lbeta_cu_tuple = map(cu, xs_lbeta_tuple) + +catgrads(grads) = cat(map(ta -> ta.data, grads)...; dims=1) +g∑fx(f, xs) = catgrads(Tracker.gradient(_xs -> sum(f.(_xs)), xs)) +g∑fx(f, xs, ys) = catgrads(Tracker.gradient((_xs, _ys) -> sum(f.(_xs, _ys)), xs, ys)) + +results = Dict() +@testset "Forward evaluation" begin + fn = :lgamma + @testset "$fn" begin + lgamma_val_cpu = @time SpecialFunctions.lgamma.(xs_lgamma) + lgamma_val_gpu = @time CuArrays.lgamma.(xs_lgamma_cu) + lgamma_val_gpu = Array(lgamma_val_gpu) + for i = 1:n + @test lgamma_val_cpu[i] ≈ lgamma_val_gpu[i] + end + results[fn] = (lgamma_val_cpu, lgamma_val_gpu) + end + + fn = :digamma + @testset "$fn" begin + digamma_val_cpu = @time SpecialFunctions.digamma.(xs_digamma) + digamma_val_gpu = @time CuArrays.digamma.(xs_digamma_cu) + digamma_val_gpu = Array(digamma_val_gpu) + for i = 1:n + @test digamma_val_cpu[i] ≈ digamma_val_gpu[i] + end + results[fn] = (digamma_val_cpu, digamma_val_gpu) + end + + fn = :trigamma + @testset "$fn" begin + trigamma_val_cpu = @time SpecialFunctions.trigamma.(xs_trigamma) + trigamma_val_gpu = @time CuArrays.trigamma.(xs_trigamma_cu) + trigamma_val_gpu = 
Array(trigamma_val_gpu) + for i = 1:n + @test trigamma_val_cpu[i] ≈ trigamma_val_gpu[i] + end + results[fn] = (trigamma_val_cpu, trigamma_val_gpu) + end + + fn = :lbeta + @testset "$fn" begin + lbeta_val_cpu = @time SpecialFunctions.lbeta.(xs_lbeta_tuple...) + lbeta_val_gpu = @time CuArrays.lbeta.(xs_lbeta_cu_tuple...) + lbeta_val_gpu = Array(lbeta_val_gpu) + for i = 1:n + @test lbeta_val_cpu[i] ≈ lbeta_val_gpu[i] + end + results[fn] = (lbeta_val_cpu, lbeta_val_gpu) + end + +end + +@testset "Gradient evaluation" begin + fn = :lgamma + @testset "$fn" begin + lgamma_grad_cpu = @time g∑fx(SpecialFunctions.lgamma, xs_lgamma) + lgamma_grad_gpu = @time g∑fx(CuArrays.lgamma, xs_lgamma_cu) + lgamma_grad_gpu = Array(lgamma_grad_gpu) + for i = 1:n + @test lgamma_grad_cpu[i] ≈ lgamma_grad_gpu[i] + end + end + + fn = :digamma + @testset "$fn" begin + digamma_grad_cpu = @time g∑fx(SpecialFunctions.digamma, xs_digamma) + digamma_grad_gpu = @time g∑fx(CuArrays.digamma, xs_digamma_cu) + digamma_grad_gpu = Array(digamma_grad_gpu) + for i = 1:n + @test digamma_grad_cpu[i] ≈ digamma_grad_gpu[i] + end + end + + fn = :lbeta + @testset "$fn" begin + lbeta_grad_cpu = @time g∑fx(SpecialFunctions.lbeta, xs_lbeta_tuple...) + lbeta_grad_gpu = @time g∑fx(CuArrays.lbeta, xs_lbeta_cu_tuple...) + lbeta_grad_gpu = Array(lbeta_grad_gpu) + for i = 1:n + @test lbeta_grad_cpu[i] ≈ lbeta_grad_gpu[i] + end + end +end From 87733ac74dfa465a633724f96081cf9de376c10a Mon Sep 17 00:00:00 2001 From: Kai Xu Date: Tue, 9 Jul 2019 02:09:39 +0100 Subject: [PATCH 2/3] merge with master --- .github/FUNDING.yml | 1 + .gitlab-ci.yml | 1 - Project.toml | 8 ++-- README.md | 9 ++-- deps/build.jl | 14 +++++- docs/src/tutorials/intro.jl | 8 ++-- src/CuArrays.jl | 28 ++++++------ src/array.jl | 69 ++++++++++++++++++++++++++--- src/blas/highlevel.jl | 4 +- src/blas/wrappers.jl | 8 ++-- src/broadcast.jl | 48 +++++++++++++++++++- src/deprecated.jl | 4 ++ src/fft/CUFFT.jl | 2 - src/fft/libcufft.jl | 8 ++++ src/fft/wrappers.jl | 4 ++ src/forwarddiff.jl | 87 ++++++++++++++++++++++++++++++++++--- src/rand/CURAND.jl | 13 ++++-- src/rand/highlevel.jl | 18 +++----- src/solver/highlevel.jl | 2 +- src/solver/sparse.jl | 2 +- src/sparse/array.jl | 2 +- src/sparse/util.jl | 46 ++++++++++---------- src/sparse/wrappers.jl | 52 +++++++++++----------- test/base.jl | 76 +++++++++++++++++++++++++++----- test/blas.jl | 32 ++++++++++---- test/fft.jl | 13 +++--- test/forwarddiff.jl | 68 +++++++++++++++++++++++++++++ test/rand.jl | 23 ++++------ test/runtests.jl | 8 ++-- test/solver.jl | 15 +++---- test/sparse.jl | 83 ++++++++++++++++++++++++++--------- test/sparse_solver.jl | 9 ++-- 32 files changed, 569 insertions(+), 196 deletions(-) create mode 100644 .github/FUNDING.yml create mode 100644 test/forwarddiff.jl diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 00000000..cc27b731 --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1 @@ +custom: https://numfocus.salsalabs.org/donate-to-julia/index.html diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 8fe77a3c..5b92b9a2 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,6 +1,5 @@ variables: CI_IMAGE_TAG: 'cuda' - CI_DEV_PKGS: 'CUDAapi GPUArrays CUDAnative NNlib CUDAdrv' JULIA_NUM_THREADS: '4' include: diff --git a/Project.toml b/Project.toml index d7392faa..e59dd9ca 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "CuArrays" uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae" -version = "1.0.2" +version = "2.0.0" [deps] AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c" 
@@ -31,7 +31,7 @@ test = ["Test", "FFTW", "ForwardDiff"] julia = "1.0" CUDAnative = "2.0" CUDAdrv = "3.0" -CUDAapi = "0.5.3, 0.6" -NNlib = "0.5, 0.6" -GPUArrays = "0.7" +CUDAapi = "0.5.3, 0.6, 1.0" +NNlib = "0.6" +GPUArrays = "0.7.1" Adapt = "0.4" diff --git a/README.md b/README.md index 40a8fabf..e3a33d63 100644 --- a/README.md +++ b/README.md @@ -23,10 +23,11 @@ arrays: ## Installation -CuArrays should work **out-of-the-box** on Julia 1.0. You only need to have a -proper set-up of CUDA, meaning the rest of the Julia CUDA stack should work -(notably CUDAapi.jl, CUDAdrv.jl and CUDAnative.jl). If you encounter any issues -with CuArrays.jl, please make sure those other packages are working as expected. +CuArrays should work **out-of-the-box** on stable releases of Julia 1.x. You +only need to have a proper set-up of CUDA, meaning the rest of the Julia CUDA +stack should work (notably CUDAapi.jl, CUDAdrv.jl and CUDAnative.jl). If you +encounter any issues with CuArrays.jl, please make sure those other packages are +working as expected. Some parts of CuArrays.jl depend on **optional libraries**, such as [cuDNN](https://developer.nvidia.com/cudnn). The build process should notify diff --git a/deps/build.jl b/deps/build.jl index 467845d6..debdb0cf 100644 --- a/deps/build.jl +++ b/deps/build.jl @@ -41,11 +41,21 @@ function main() toolkit = find_toolkit() - for name in ("cublas", "cusparse", "cusolver", "cufft", "curand", "cudnn") + # required libraries that are part of the CUDA toolkit + for name in ("cublas", "cusparse", "cusolver", "cufft", "curand") lib = Symbol("lib$name") config[lib] = find_cuda_library(name, toolkit) if config[lib] == nothing - build_warning("Could not find library '$name'") + build_error("Could not find library '$name' (it should be part of the CUDA toolkit)") + end + end + + # optional libraries + for name in ("cudnn", ) + lib = Symbol("lib$name") + config[lib] = find_cuda_library(name, toolkit) + if config[lib] == nothing + build_warning("Could not find optional library '$name'") end end diff --git a/docs/src/tutorials/intro.jl b/docs/src/tutorials/intro.jl index da0ae42c..4605b937 100644 --- a/docs/src/tutorials/intro.jl +++ b/docs/src/tutorials/intro.jl @@ -114,8 +114,8 @@ using BenchmarkTools using CuArrays -x_d = cufill(1.0f0, N) # a vector stored on the GPU filled with 1.0 (Float32) -y_d = cufill(2.0f0, N) # a vector stored on the GPU filled with 2.0 +x_d = CuArrays.fill(1.0f0, N) # a vector stored on the GPU filled with 1.0 (Float32) +y_d = CuArrays.fill(2.0f0, N) # a vector stored on the GPU filled with 2.0 # Here the `d` means "device," in contrast with "host". Now let's do the increment: @@ -220,8 +220,8 @@ CUDAdrv.@profile bench_gpu1!(y_d, x_d) # You can see that 100% of the time was spent in `ptxcall_gpu_add1__1`, the name of the # kernel that `CUDAnative` assigned when compiling `gpu_add1!` for these inputs. (Had you -# created arrays of multiple data types, e.g., `xu_d = cufill(0x01, N)`, you might have -# also seen `ptxcall_gpu_add1__2` and so on. Like the rest of Julia, you can define a +# created arrays of multiple data types, e.g., `xu_d = CuArrays.fill(0x01, N)`, you might +# have also seen `ptxcall_gpu_add1__2` and so on. Like the rest of Julia, you can define a # single method and it will be specialized at compile time for the particular data types # you're using.) 
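A minimal sketch of the constructor renaming this patch performs (the old
names keep working through the deprecations in src/deprecated.jl below; the
new names are left unexported since they would otherwise clash with
Base.zeros/ones/fill, so they must be qualified):

    using CuArrays

    a = CuArrays.zeros(Float32, 2, 2)  # was cuzeros(Float32, 2, 2)
    b = CuArrays.ones(2, 2)            # was cuones(2, 2); eltype defaults to Float32
    c = CuArrays.fill(1f0, 2, 2)       # was cufill(1f0, 2, 2)
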
diff --git a/src/CuArrays.jl b/src/CuArrays.jl index ab8446b4..528bd722 100644 --- a/src/CuArrays.jl +++ b/src/CuArrays.jl @@ -1,12 +1,10 @@ -__precompile__() - module CuArrays using CUDAdrv, CUDAnative using GPUArrays -export CuArray, CuVector, CuMatrix, CuVecOrMat, cu, cuzeros, cuones, cufill +export CuArray, CuVector, CuMatrix, CuVecOrMat, cu import LinearAlgebra, SpecialFunctions @@ -45,12 +43,12 @@ include("gpuarray_interface.jl") # of CuArrays and/or CUDAnative only use a single context), so keep track of the active one. const active_context = Ref{CuContext}() -libcublas !== nothing && include("blas/CUBLAS.jl") -libcusparse !== nothing && include("sparse/CUSPARSE.jl") -libcusolver !== nothing && include("solver/CUSOLVER.jl") -libcufft !== nothing && include("fft/CUFFT.jl") -libcurand !== nothing && include("rand/CURAND.jl") -libcudnn !== nothing && include("dnn/CUDNN.jl") +include("blas/CUBLAS.jl") +include("sparse/CUSPARSE.jl") +include("solver/CUSOLVER.jl") +include("fft/CUFFT.jl") +include("rand/CURAND.jl") +libcudnn !== nothing && include("dnn/CUDNN.jl") include("nnlib.jl") @@ -84,11 +82,13 @@ function __init__() active_context[] = ctx # wipe the active handles - isdefined(CuArrays, :CUBLAS) && (CUBLAS._handle[] = C_NULL; CUBLAS._xt_handle[] = C_NULL) - isdefined(CuArrays, :CUSOLVER) && (CUSOLVER._dense_handle[] = C_NULL; CUSOLVER._sparse_handle[] = C_NULL) - isdefined(CuArrays, :CUSPARSE) && (CUSPARSE._handle[] = C_NULL) - isdefined(CuArrays, :CURAND) && (CURAND._generator[] = nothing) - isdefined(CuArrays, :CUDNN) && (CUDNN._handle[] = C_NULL) + CUBLAS._handle[] = C_NULL + CUBLAS._xt_handle[] = C_NULL + CUSOLVER._dense_handle[] = C_NULL + CUSOLVER._sparse_handle[] = C_NULL + CUSPARSE._handle[] = C_NULL + CURAND._generator[] = nothing + isdefined(CuArrays, :CUDNN) && (CUDNN._handle[] = C_NULL) end push!(CUDAnative.device!_listeners, callback) diff --git a/src/array.jl b/src/array.jl index ccbee3ce..cd9d8a15 100644 --- a/src/array.jl +++ b/src/array.jl @@ -214,12 +214,12 @@ end cu(xs) = adapt(CuArray{Float32}, xs) Base.getindex(::typeof(cu), xs...) = CuArray([xs...]) -cuzeros(T::Type, dims...) = fill!(CuArray{T}(undef, dims...), 0) -cuones(T::Type, dims...) = fill!(CuArray{T}(undef, dims...), 1) -cuzeros(dims...) = cuzeros(Float32, dims...) -cuones(dims...) = cuones(Float32, dims...) -cufill(v, dims...) = fill!(CuArray{typeof(v)}(undef, dims...), v) -cufill(v, dims::Dims) = fill!(CuArray{typeof(v)}(undef, dims...), v) +zeros(T::Type, dims...) = fill!(CuArray{T}(undef, dims...), 0) +ones(T::Type, dims...) = fill!(CuArray{T}(undef, dims...), 1) +zeros(dims...) = CuArrays.zeros(Float32, dims...) +ones(dims...) = CuArrays.ones(Float32, dims...) +fill(v, dims...) 
= fill!(CuArray{typeof(v)}(undef, dims...), v) +fill(v, dims::Dims) = fill!(CuArray{typeof(v)}(undef, dims...), v) # optimized implementation of `fill!` for types that are directly supported by memset const MemsetTypes = Dict(1=>UInt8, 2=>UInt16, 4=>UInt32) @@ -270,3 +270,60 @@ function LinearAlgebra.triu!(A::CuMatrix{T}, d::Integer = 0) where T @cuda blocks=blk threads=thr kernel!(A, d) return A end + + +## reversing + +function _reverse(input::CuVector{T}, output::CuVector{T}) where {T} + @assert length(input) == length(output) + + nthreads = 256 + nblocks = ceil(Int, length(input) / nthreads) + shmem = nthreads * sizeof(T) + + function kernel(input::CuDeviceVector{T}, output::CuDeviceVector{T}) where {T} + shared = @cuDynamicSharedMem(T, blockDim().x) + + # load one element per thread from device memory and buffer it in reversed order + + offset_in = blockDim().x * (blockIdx().x - 1) + index_in = offset_in + threadIdx().x + + if index_in <= length(input) + index_shared = blockDim().x - threadIdx().x + 1 + @inbounds shared[index_shared] = input[index_in] + end + + sync_threads() + + # write back in forward order, but to the reversed block offset as before + + offset_out = length(output) - blockDim().x * blockIdx().x + index_out = offset_out + threadIdx().x + + if 1 <= index_out <= length(output) + index_shared = threadIdx().x + @inbounds output[index_out] = shared[index_shared] + end + + return + end + + @cuda threads=nthreads blocks=nblocks shmem=shmem kernel(input, output) + + return +end + +function Base.reverse!(v::CuVector, start=1, stop=length(v)) + v′ = view(v, start:stop) + _reverse(v′, v′) + return v +end + +function Base.reverse(v::CuVector, start=1, stop=length(v)) + v′ = similar(v) + start > 1 && copyto!(v′, 1, v, 1, start-1) + _reverse(view(v, start:stop), view(v′, start:stop)) + stop < length(v) && copyto!(v′, stop+1, v, stop+1) + return v′ +end diff --git a/src/blas/highlevel.jl b/src/blas/highlevel.jl index c6754122..14aae87d 100644 --- a/src/blas/highlevel.jl +++ b/src/blas/highlevel.jl @@ -30,7 +30,7 @@ function LinearAlgebra.BLAS.dotc(DX::CuArray{T}, DY::CuArray{T}) where T<:Union{ end function LinearAlgebra.BLAS.dot(DX::CuArray{T}, DY::CuArray{T}) where T<:Union{ComplexF32,ComplexF64} - dotc(DX, DY) + BLAS.dotc(DX, DY) end function LinearAlgebra.BLAS.dotu(DX::CuArray{T}, DY::CuArray{T}) where T<:Union{ComplexF32,ComplexF64} @@ -43,7 +43,7 @@ LinearAlgebra.norm(x::CublasArray) = nrm2(x) LinearAlgebra.BLAS.asum(x::CublasArray) = asum(length(x), x, 1) function LinearAlgebra.axpy!(alpha::Number, x::CuArray{T}, y::CuArray{T}) where T<:CublasFloat - length(x)==length(y) || throw(DimensionMismatch("")) + length(x)==length(y) || throw(DimensionMismatch("axpy arguments have lengths $(length(x)) and $(length(y))")) axpy!(length(x), convert(T,alpha), x, 1, y, 1) end diff --git a/src/blas/wrappers.jl b/src/blas/wrappers.jl index 6360f960..b78dda5d 100644 --- a/src/blas/wrappers.jl +++ b/src/blas/wrappers.jl @@ -1473,7 +1473,7 @@ for (fname, elty) in unsafe_free!(Aptrs) if !Pivot - pivotArray = CuArray(zeros(Cint, (n, length(A)))) + pivotArray = CuArrays.zeros(Cint, (n, length(A))) end pivotArray, info, A end @@ -1513,7 +1513,7 @@ for (fname, elty) in ldc = max(1,stride(C[1],2)) Aptrs = device_batch(A) Cptrs = device_batch(C) - info = CuArray(zeros(Cint,length(A))) + info = CuArrays.zeros(Cint,length(A)) $fname(handle(), n, Aptrs, lda, pivotArray, Cptrs, ldc, info, length(A)) unsafe_free!(Cptrs) unsafe_free!(Aptrs) @@ -1552,7 +1552,7 @@ for (fname, elty) in ldc = 
max(1,stride(C[1],2)) Aptrs = device_batch(A) Cptrs = device_batch(C) - info = CuArray(zeros(Cint,length(A))) + info = CuArrays.zeros(Cint,length(A)) $fname(handle(), n, Aptrs, lda, Cptrs, ldc, info, length(A)) unsafe_free!(Cptrs) unsafe_free!(Aptrs) @@ -1638,7 +1638,7 @@ for (fname, elty) in Aptrs = device_batch(A) Cptrs = device_batch(C) info = zero(Cint) - infoarray = CuArray(zeros(Cint, length(A))) + infoarray = CuArrays.zeros(Cint, length(A)) $fname(handle(), cutrans, m, n, nrhs, Aptrs, lda, Cptrs, ldc, [info], infoarray, length(A)) unsafe_free!(Cptrs) unsafe_free!(Aptrs) diff --git a/src/broadcast.jl b/src/broadcast.jl index dc8ae2b5..b4e56979 100644 --- a/src/broadcast.jl +++ b/src/broadcast.jl @@ -6,6 +6,9 @@ function Base.similar(bc::Broadcasted{ArrayStyle{CuArray}}, ::Type{T}) where T similar(CuArray{T}, axes(bc)) end +function Base.similar(bc::Broadcasted{ArrayStyle{CuArray}}, ::Type{T}, dims...) where {T} + similar(CuArray{T}, dims...) +end # replace base functions with libdevice alternatives # TODO: do this with Cassette.jl @@ -42,15 +45,56 @@ cufunc(::typeof(SpecialFunctions.lgamma)) = CuArrays.lgamma cufunc(::typeof(SpecialFunctions.digamma)) = CuArrays.digamma cufunc(::typeof(SpecialFunctions.trigamma)) = CuArrays.trigamma +#broadcast ^ +culiteral_pow(::typeof(^), x::Union{Float32, Float64}, ::Val{0}) = one(x) +culiteral_pow(::typeof(^), x::Union{Float32, Float64}, ::Val{1}) = x +culiteral_pow(::typeof(^), x::Union{Float32, Float64}, ::Val{2}) = x*x +culiteral_pow(::typeof(^), x::Union{Float32, Float64}, ::Val{3}) = x*x*x +culiteral_pow(::typeof(^), x::Union{Float32, Float64}, ::Val{p}) where p = CUDAnative.pow(x, Int32(p)) + +cufunc(::typeof(Base.literal_pow)) = culiteral_pow +cufunc(::typeof(Base.:(^))) = CUDAnative.pow + using MacroTools -const _cufuncs = copy(libdevice) +const _cufuncs = [copy(libdevice); :^] cufuncs() = (global _cufuncs; _cufuncs) +_cuint(x::Int) = Int32(x) +_cuint(x::Expr) = x.head == :call && x.args[1] == :Int32 && x.args[2] isa Int ? Int32(x.args[2]) : x +_cuint(x) = x + +function _cupowliteral(x::Expr) + if x.head == :call && x.args[1] == :(CuArrays.cufunc(^)) && x.args[3] isa Int32 + num = x.args[3] + if 0 <= num <= 3 + sym = gensym(:x) + new_x = Expr(:block, :($sym = $(x.args[2]))) + + if iszero(num) + push!(new_x.args, :(one($sym))) + else + unroll = Expr(:call, :*) + for x = one(num):num + push!(unroll.args, sym) + end + push!(new_x.args, unroll) + end + + x = new_x + end + end + x +end +_cupowliteral(x) = x + function replace_device(ex) global _cufuncs MacroTools.postwalk(ex) do x - x in _cufuncs ? :(CuArrays.cufunc($x)) : x + x = x in _cufuncs ? 
:(CuArrays.cufunc($x)) : x + x = _cuint(x) + x = _cupowliteral(x) + x end end diff --git a/src/deprecated.jl b/src/deprecated.jl index a11d769d..02ce70ec 100644 --- a/src/deprecated.jl +++ b/src/deprecated.jl @@ -4,3 +4,7 @@ import Base: @deprecate_binding @deprecate_binding BLAS CUBLAS @deprecate_binding FFT CUFFT + +@deprecate cuzeros CuArrays.zeros +@deprecate cuones CuArrays.ones +@deprecate cufill CuArrays.fill diff --git a/src/fft/CUFFT.jl b/src/fft/CUFFT.jl index fa17f12d..f8c0a447 100644 --- a/src/fft/CUFFT.jl +++ b/src/fft/CUFFT.jl @@ -2,8 +2,6 @@ module CUFFT import CUDAapi -import CUDAdrv: CuPtr, PtrOrCuPtr - using ..CuArrays using ..CuArrays: libcufft, configured diff --git a/src/fft/libcufft.jl b/src/fft/libcufft.jl index 1ef2b1e2..1ac34333 100644 --- a/src/fft/libcufft.jl +++ b/src/fft/libcufft.jl @@ -1,5 +1,7 @@ # low-level wrappers of the CUFFT library +import CUDAdrv: CuPtr, PtrOrCuPtr, CuStream_t + cufftGetVersion() = ccall((:cufftGetVersion,libcufft), Cint, ()) function cufftGetProperty(property::CUDAapi.libraryPropertyType) @@ -75,3 +77,9 @@ function cufftExecD2Z(plan, idata, odata) (cufftHandle_t, CuPtr{cufftDoubleReal}, CuPtr{cufftDoubleComplex}), plan, idata, odata) end + +function cufftSetStream(plan, stream) + @check ccall((:cufftSetStream,libcufft), cufftStatus_t, + (cufftHandle_t, CuStream_t), + plan, stream) +end diff --git a/src/fft/wrappers.jl b/src/fft/wrappers.jl index e9bc060c..ee103c12 100644 --- a/src/fft/wrappers.jl +++ b/src/fft/wrappers.jl @@ -1,5 +1,7 @@ # wrappers of the low-level CUBLAS functionality +import CUDAdrv: CuStream + # Note: we don't implement padded storage dimensions function _mkplan(xtype, xdims, region) nrank = length(region) @@ -126,6 +128,8 @@ convert(::Type{cufftHandle_t}, p::CuFFTPlan) = p.plan destroy_plan(plan::CuFFTPlan) = cufftDestroy(plan) +set_stream(plan::CuFFTPlan, stream::CuStream) = cufftSetStream(plan, stream) + function assert_applicable(p::CuFFTPlan{T,K}, X::CuArray{T}) where {T,K} (size(X) == p.sz) || throw(ArgumentError("CuFFT plan applied to wrong-size input")) diff --git a/src/forwarddiff.jl b/src/forwarddiff.jl index 95cf1ebc..c6366ff3 100644 --- a/src/forwarddiff.jl +++ b/src/forwarddiff.jl @@ -1,8 +1,10 @@ # ForwardDiff integration +byhand = [:lgamma, :digamma, :lbeta, :exp2, :log2, :exp10, :log10, :abs] + for f in libdevice if haskey(ForwardDiff.DiffRules.DEFINED_DIFFRULES, (:Base,f,1)) - f == :tanh && continue + f ∈ byhand && continue diffrule = ForwardDiff.DiffRules.DEFINED_DIFFRULES[(:Base,f,1)] ForwardDiff.DiffRules.DEFINED_DIFFRULES[(:CUDAnative,f,1)] = (args...) 
-> replace_device(diffrule(args...)) @@ -10,17 +12,88 @@ for f in libdevice end end -ForwardDiff.DiffRules.DEFINED_DIFFRULES[(:CUDAnative, :tanh, 1)] = x -> - replace_device(:(1-tanh(x)^2)) -eval(ForwardDiff.unary_dual_definition(:CUDAnative, :tanh)) - -# Gamma functions - +# byhand: lgamma ForwardDiff.DiffRules.@define_diffrule CuArrays.lgamma(a) = :(CuArrays.digamma($a)) eval(ForwardDiff.unary_dual_definition(:CuArrays, :lgamma)) +# byhand: digamma ForwardDiff.DiffRules.@define_diffrule CuArrays.digamma(a) = :(CuArrays.trigamma($a)) eval(ForwardDiff.unary_dual_definition(:CuArrays, :digamma)) +# byhand: lbeta ForwardDiff.DiffRules.@define_diffrule CuArrays.lbeta(a, b) = :(CuArrays.digamma($a) - CuArrays.digamma($a + $b)), :(CuArrays.digamma($b) - CuArrays.digamma($a + $b)) eval(ForwardDiff.binary_dual_definition(:CuArrays, :lbeta)) + +# byhand: exp2 +ForwardDiff.DiffRules.DEFINED_DIFFRULES[(:CUDAnative, :exp2, 1)] = x -> + :((CuArrays.cufunc(exp2))(x) * (CuArrays.cufunc(log))(oftype(x, 2))) +eval(ForwardDiff.unary_dual_definition(:CUDAnative, :exp2)) + +# byhand: log2 +ForwardDiff.DiffRules.DEFINED_DIFFRULES[(:CUDAnative, :log2, 1)] = x -> + :(inv(x) / (CuArrays.cufunc(log))(oftype(x, 2))) +eval(ForwardDiff.unary_dual_definition(:CUDAnative, :log2)) + +# byhand: exp10 +ForwardDiff.DiffRules.DEFINED_DIFFRULES[(:CUDAnative, :exp10, 1)] = x -> + :((CuArrays.cufunc(exp10))(x) * (CuArrays.cufunc(log))(oftype(x, 10))) +eval(ForwardDiff.unary_dual_definition(:CUDAnative, :exp10)) + +# byhand: log10 +ForwardDiff.DiffRules.DEFINED_DIFFRULES[(:CUDAnative, :log10, 1)] = x -> + :(inv(x) / (CuArrays.cufunc(log))(oftype(x, 10))) +eval(ForwardDiff.unary_dual_definition(:CUDAnative, :log10)) + +# byhand: abs +ForwardDiff.DiffRules.DEFINED_DIFFRULES[(:CUDAnative, :abs, 1)] = x -> + :(signbit(x) ? -one(x) : one(x)) +eval(ForwardDiff.unary_dual_definition(:CUDAnative, :abs)) + + +ForwardDiff.DiffRules.DEFINED_DIFFRULES[(:CUDAnative, :pow, 2)] = (x, y) -> + replace_device.(ForwardDiff.DiffRules.DEFINED_DIFFRULES[(:Base, :^, 2)](x, y)) + +@eval begin + ForwardDiff.@define_binary_dual_op( + CUDAnative.pow, + begin + vx = ForwardDiff.value(x) + vy = ForwardDiff.value(y) + expv = (CUDAnative.pow)(vx, vy) + + powval = vy * CUDAnative.pow(vx , vy - Int32(1)) + + py = ForwardDiff.partials(y) + px = ForwardDiff.partials(x) + + cond = all(py.values) do x + x == zero(x) + end + + if cond + logval = one(expv) + else + logval = expv * CUDAnative.log(vx) + end + + new_partials = powval * px + logval * py + return ForwardDiff.Dual{Txy}(expv, new_partials) + end, + begin + v = ForwardDiff.value(x) + expv = (CUDAnative.pow)(v, y) + if y == zero(y) + new_partials = zero(ForwardDiff.partials(x)) + else + new_partials = ForwardDiff.partials(x) * y * (CUDAnative.pow)(v, y - Int32(1)) + end + return ForwardDiff.Dual{Tx}(expv, new_partials) + end, + begin + v = ForwardDiff.value(y) + expv = (CUDAnative.pow)(x, v) + deriv = expv*CUDAnative.log(x) + return ForwardDiff.Dual{Ty}(expv, deriv * ForwardDiff.partials(y)) + end + ) +end diff --git a/src/rand/CURAND.jl b/src/rand/CURAND.jl index 3b28fd5e..6f2a59cb 100644 --- a/src/rand/CURAND.jl +++ b/src/rand/CURAND.jl @@ -10,10 +10,7 @@ using GPUArrays using Random -export curand, - curandn, - curand_logn, rand_logn!, rand_logn, - curand_poisson, rand_poisson!, rand_poisson +export rand_logn!, rand_poisson! 
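+# NOTE: the unexported generic names are re-bound as `CuArrays.rand`,
+# `CuArrays.randn`, `CuArrays.rand_logn` and `CuArrays.rand_poisson` at the
+# top level (see the `const` bindings below); `curand`/`curandn` are
+# deprecated in favour of those.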
include("libcurand_types.jl") include("error.jl") @@ -44,3 +41,11 @@ version() = VersionNumber(curandGetProperty(CUDAapi.MAJOR_VERSION), curandGetProperty(CUDAapi.PATCH_LEVEL)) end + +const rand = CURAND.rand +const randn = CURAND.randn +const rand_logn = CURAND.rand_logn +const rand_poisson = CURAND.rand_poisson + +@deprecate curand CuArrays.rand +@deprecate curandn CuArrays.randn diff --git a/src/rand/highlevel.jl b/src/rand/highlevel.jl index b57137d0..fdec33f7 100644 --- a/src/rand/highlevel.jl +++ b/src/rand/highlevel.jl @@ -77,19 +77,13 @@ rand_poisson!(A::CuArray; kwargs...) = rand_poisson!(poisson_rng(A), A; kwargs.. rand_logn(A::CuArray; kwargs...) = rand_logn!(logn_rng(A), A; kwargs...) rand_poisson(A::CuArray; kwargs...) = rand_poisson!(poisson_rng(A), A; kwargs...) - -# need to prefix with `cu` to disambiguate from Random functions that return an Array -# TODO: `@gpu rand` with Cassette -curand(::Type{X}, args...; kwargs...) where {X} = rand!(CuArray{X}(undef, args...); kwargs...) -curandn(::Type{X}, args...; kwargs...) where {X} = randn!(CuArray{X}(undef, args...); kwargs...) -curand_logn(::Type{X}, args...; kwargs...) where {X} = rand_logn!(CuArray{X}(undef, args...); kwargs...) -curand_poisson(::Type{X}, args...; kwargs...) where {X} = rand_poisson!(CuArray{X}(undef, args...); kwargs...) - +rand(::Type{X}, args...; kwargs...) where {X} = rand!(CuArray{X}(undef, args...); kwargs...) +randn(::Type{X}, args...; kwargs...) where {X} = randn!(CuArray{X}(undef, args...); kwargs...) rand_logn(::Type{X}, args...; kwargs...) where {X} = rand_logn!(CuArray{X}(undef, args...); kwargs...) rand_poisson(::Type{X}, args...; kwargs...) where {X} = rand_poisson!(CuArray{X}(undef, args...); kwargs...) # specify default types -curand(args...; kwargs...) where {X} = curand(Float32, args...; kwargs...) -curandn(args...; kwargs...) where {X} = curandn(Float32, args...; kwargs...) -curand_logn(args...; kwargs...) where {X} = curand_logn(Float32, args...; kwargs...) -curand_poisson(args...; kwargs...) where {X} = curand_poisson(Cuint, args...; kwargs...) +rand(args...; kwargs...) where {X} = rand(Float32, args...; kwargs...) +randn(args...; kwargs...) where {X} = randn(Float32, args...; kwargs...) +rand_logn(args...; kwargs...) where {X} = rand_logn(Float32, args...; kwargs...) +rand_poisson(args...; kwargs...) where {X} = rand_poisson(Cuint, args...; kwargs...) diff --git a/src/solver/highlevel.jl b/src/solver/highlevel.jl index a45a69b2..69934bc8 100644 --- a/src/solver/highlevel.jl +++ b/src/solver/highlevel.jl @@ -19,7 +19,7 @@ LinearAlgebra.qr!(A::CuMatrix{T}) where T = CuQR(geqrf!(A::CuMatrix{T})...) Base.size(A::CuQR) = size(A.factors) Base.size(A::CuQRPackedQ, dim::Integer) = 0 < dim ? (dim <= 2 ? 
size(A.factors, 1) : 1) : throw(BoundsError()) CuArrays.CuMatrix(A::CuQRPackedQ) = orgqr!(copy(A.factors), A.τ) -CuArrays.CuArray(A::CuQRPackedQ) = convert(CuMatrix, A) +CuArrays.CuArray(A::CuQRPackedQ) = CuMatrix(A) Base.Matrix(A::CuQRPackedQ) = Matrix(CuMatrix(A)) function Base.getproperty(A::CuQR, d::Symbol) diff --git a/src/solver/sparse.jl b/src/solver/sparse.jl index 52697af7..fc0190ca 100644 --- a/src/solver/sparse.jl +++ b/src/solver/sparse.jl @@ -209,7 +209,7 @@ for (fname, elty, relty) in ((:cusolverSpScsreigvsi, :Float32, :Float32), cudesca = cusparseMatDescr_t(CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_LOWER, CUSPARSE_DIAG_TYPE_NON_UNIT, cuinda) rcudesca = Ref{cusparseMatDescr_t}(cudesca) x = copy(x_0) - μ = cuzeros($elty,1) + μ = CuArrays.zeros($elty,1) @check ccall(($(string(fname)),libcusolver), cusolverStatus_t, (cusolverSpHandle_t, Cint, Cint, Ptr{cusparseMatDescr_t}, CuPtr{$elty}, CuPtr{Cint}, diff --git a/src/sparse/array.jl b/src/sparse/array.jl index 0d273e32..b70829eb 100644 --- a/src/sparse/array.jl +++ b/src/sparse/array.jl @@ -206,7 +206,7 @@ CuSparseMatrixBSR(rowPtr::CuArray, colVal::CuArray, nzVal::CuArray{T}, blockDim, CuSparseVector(Vec::SparseVector) = CuSparseVector(Vec.nzind, Vec.nzval, size(Vec)[1]) CuSparseMatrixCSC(Vec::SparseVector) = CuSparseMatrixCSC([1], Vec.nzind, Vec.nzval, size(Vec)) -CuSparseVector(Mat::SparseMatrixCSC) = size(Mat,2) == 1 ? CuSparseVector(Mat.rowval, Mat.nzval, size(Mat)[1]) : throw(ArgumentError()) +CuSparseVector(Mat::SparseMatrixCSC) = size(Mat,2) == 1 ? CuSparseVector(Mat.rowval, Mat.nzval, size(Mat)[1]) : throw(ArgumentError("The input argument must have a single column")) CuSparseMatrixCSC(Mat::SparseMatrixCSC) = CuSparseMatrixCSC(Mat.colptr, Mat.rowval, Mat.nzval, size(Mat)) CuSparseMatrixCSR(Mat::SparseMatrixCSC) = switch2csr(CuSparseMatrixCSC(Mat)) diff --git a/src/sparse/util.jl b/src/sparse/util.jl index 0b68c952..a8d0aa58 100644 --- a/src/sparse/util.jl +++ b/src/sparse/util.jl @@ -151,9 +151,9 @@ for (fname,elty) in ((:cusparseScsr2csc, :Float32), function switch2csc(csr::CuSparseMatrixCSR{$elty},inda::SparseChar='O') cuind = cusparseindex(inda) m,n = csr.dims - colPtr = cuzeros(Cint, n+1) - rowVal = cuzeros(Cint, csr.nnz) - nzVal = cuzeros($elty, csr.nnz) + colPtr = CuArrays.zeros(Cint, n+1) + rowVal = CuArrays.zeros(Cint, csr.nnz) + nzVal = CuArrays.zeros($elty, csr.nnz) @check ccall(($(string(fname)),libcusparse), cusparseStatus_t, (cusparseHandle_t, Cint, Cint, Cint, CuPtr{$elty}, CuPtr{Cint}, CuPtr{Cint}, CuPtr{$elty}, CuPtr{Cint}, @@ -167,9 +167,9 @@ for (fname,elty) in ((:cusparseScsr2csc, :Float32), function switch2csr(csc::CuSparseMatrixCSC{$elty},inda::SparseChar='O') cuind = cusparseindex(inda) m,n = csc.dims - rowPtr = cuzeros(Cint,m+1) - colVal = cuzeros(Cint,csc.nnz) - nzVal = cuzeros($elty,csc.nnz) + rowPtr = CuArrays.zeros(Cint,m+1) + colVal = CuArrays.zeros(Cint,csc.nnz) + nzVal = CuArrays.zeros($elty,csc.nnz) @check ccall(($(string(fname)),libcusparse), cusparseStatus_t, (cusparseHandle_t, Cint, Cint, Cint, CuPtr{$elty}, CuPtr{Cint}, CuPtr{Cint}, CuPtr{$elty}, CuPtr{Cint}, @@ -200,7 +200,7 @@ for (fname,elty) in ((:cusparseScsr2bsr, :Float32), nnz = Ref{Cint}(1) mb = div((m + blockDim - 1),blockDim) nb = div((n + blockDim - 1),blockDim) - bsrRowPtr = cuzeros(Cint,mb + 1) + bsrRowPtr = CuArrays.zeros(Cint,mb + 1) cudesca = cusparseMatDescr_t(CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_LOWER, CUSPARSE_DIAG_TYPE_NON_UNIT, cuinda) cudescc = cusparseMatDescr_t(CUSPARSE_MATRIX_TYPE_GENERAL, 
CUSPARSE_FILL_MODE_LOWER, CUSPARSE_DIAG_TYPE_NON_UNIT, cuindc) @check ccall((:cusparseXcsr2bsrNnz,libcusparse), cusparseStatus_t, @@ -210,8 +210,8 @@ for (fname,elty) in ((:cusparseScsr2bsr, :Float32), CuPtr{Cint}, Ptr{Cint}), handle(), cudir, m, n, Ref(cudesca), csr.rowPtr, csr.colVal, blockDim, Ref(cudescc), bsrRowPtr, nnz) - bsrNzVal = cuzeros($elty, nnz[] * blockDim * blockDim ) - bsrColInd = cuzeros(Cint, nnz[]) + bsrNzVal = CuArrays.zeros($elty, nnz[] * blockDim * blockDim ) + bsrColInd = CuArrays.zeros(Cint, nnz[]) @check ccall(($(string(fname)),libcusparse), cusparseStatus_t, (cusparseHandle_t, cusparseDirection_t, Cint, Cint, Ptr{cusparseMatDescr_t}, CuPtr{$elty}, @@ -250,9 +250,9 @@ for (fname,elty) in ((:cusparseSbsr2csr, :Float32), nnz = bsr.nnz * bsr.blockDim * bsr.blockDim cudesca = cusparseMatDescr_t(CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_LOWER, CUSPARSE_DIAG_TYPE_NON_UNIT, cuinda) cudescc = cusparseMatDescr_t(CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_LOWER, CUSPARSE_DIAG_TYPE_NON_UNIT, cuindc) - csrRowPtr = cuzeros(Cint, m + 1) - csrColInd = cuzeros(Cint, nnz) - csrNzVal = cuzeros($elty, nnz) + csrRowPtr = CuArrays.zeros(Cint, m + 1) + csrColInd = CuArrays.zeros(Cint, nnz) + csrNzVal = CuArrays.zeros($elty, nnz) @check ccall(($(string(fname)),libcusparse), cusparseStatus_t, (cusparseHandle_t, cusparseDirection_t, Cint, Cint, Ptr{cusparseMatDescr_t}, CuPtr{$elty}, @@ -280,7 +280,7 @@ for (cname,rname,elty) in ((:cusparseScsc2dense, :cusparseScsr2dense, :Float32), function Base.Array(csr::CuSparseMatrixCSR{$elty},ind::SparseChar='O') cuind = cusparseindex(ind) m,n = csr.dims - denseA = cuzeros($elty,m,n) + denseA = CuArrays.zeros($elty,m,n) lda = max(1,stride(denseA,2)) cudesc = cusparseMatDescr_t(CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_LOWER, CUSPARSE_DIAG_TYPE_NON_UNIT, cuind) @check ccall(($(string(rname)),libcusparse), cusparseStatus_t, @@ -294,7 +294,7 @@ for (cname,rname,elty) in ((:cusparseScsc2dense, :cusparseScsr2dense, :Float32), function Base.Array(csc::CuSparseMatrixCSC{$elty},ind::SparseChar='O') cuind = cusparseindex(ind) m,n = csc.dims - denseA = cuzeros($elty,m,n) + denseA = CuArrays.zeros($elty,m,n) lda = max(1,stride(denseA,2)) cudesc = cusparseMatDescr_t(CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_LOWER, CUSPARSE_DIAG_TYPE_NON_UNIT, cuind) @check ccall(($(string(cname)),libcusparse), cusparseStatus_t, @@ -328,7 +328,7 @@ for (nname,cname,rname,hname,elty) in ((:cusparseSnnz, :cusparseSdense2csc, :cus m,n = size(A) lda = max(1,stride(A,2)) cudesc = cusparseMatDescr_t(CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_LOWER, CUSPARSE_DIAG_TYPE_NON_UNIT, cuind) - nnzRowCol = cuzeros(Cint, fmt == 'R' ? m : n) + nnzRowCol = CuArrays.zeros(Cint, fmt == 'R' ? 
m : n) nnzTotal = Ref{Cint}(1) @check ccall(($(string(nname)),libcusparse), cusparseStatus_t, (cusparseHandle_t, cusparseDirection_t, @@ -336,10 +336,10 @@ for (nname,cname,rname,hname,elty) in ((:cusparseSnnz, :cusparseSdense2csc, :cus Cint, CuPtr{Cint}, Ptr{Cint}), handle(), cudir, m, n, Ref(cudesc), A, lda, nnzRowCol, nnzTotal) - nzVal = cuzeros($elty,nnzTotal[]) + nzVal = CuArrays.zeros($elty,nnzTotal[]) if(fmt == 'R') - rowPtr = cuzeros(Cint,m+1) - colInd = cuzeros(Cint,nnzTotal[]) + rowPtr = CuArrays.zeros(Cint,m+1) + colInd = CuArrays.zeros(Cint,nnzTotal[]) @check ccall(($(string(rname)),libcusparse), cusparseStatus_t, (cusparseHandle_t, Cint, Cint, Ptr{cusparseMatDescr_t}, CuPtr{$elty}, @@ -349,8 +349,8 @@ for (nname,cname,rname,hname,elty) in ((:cusparseSnnz, :cusparseSdense2csc, :cus return CuSparseMatrixCSR(rowPtr,colInd,nzVal,nnzTotal[],size(A)) end if(fmt == 'C') - colPtr = cuzeros(Cint,n+1) - rowInd = cuzeros(Cint,nnzTotal[]) + colPtr = CuArrays.zeros(Cint,n+1) + rowInd = CuArrays.zeros(Cint,nnzTotal[]) @check ccall(($(string(cname)),libcusparse), cusparseStatus_t, (cusparseHandle_t, Cint, Cint, Ptr{cusparseMatDescr_t}, CuPtr{$elty}, @@ -432,9 +432,9 @@ for (rname,cname,elty) in ((:cusparseShyb2csr, :cusparseShyb2csc, :Float32), cuinda = cusparseindex(inda) m,n = hyb.dims cudesca = cusparseMatDescr_t(CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_LOWER, CUSPARSE_DIAG_TYPE_NON_UNIT, cuinda) - csrRowPtr = cuzeros(Cint, m + 1) - csrColInd = cuzeros(Cint, hyb.nnz) - csrNzVal = cuzeros($elty, hyb.nnz) + csrRowPtr = CuArrays.zeros(Cint, m + 1) + csrColInd = CuArrays.zeros(Cint, hyb.nnz) + csrNzVal = CuArrays.zeros($elty, hyb.nnz) @check ccall(($(string(rname)),libcusparse), cusparseStatus_t, (cusparseHandle_t, Ptr{cusparseMatDescr_t}, cusparseHybMat_t, CuPtr{$elty}, CuPtr{Cint}, diff --git a/src/sparse/wrappers.jl b/src/sparse/wrappers.jl index e85439ec..6ec2bbe1 100644 --- a/src/sparse/wrappers.jl +++ b/src/sparse/wrappers.jl @@ -204,7 +204,7 @@ for (fname,elty) in ((:cusparseSsctr, :Float32), end function sctr(X::CuSparseVector{$elty}, index::SparseChar) - sctr!(X, cuzeros($elty, X.dims[1]),index) + sctr!(X, CuArrays.zeros($elty, X.dims[1]),index) end end end @@ -374,7 +374,7 @@ for (bname,aname,sname,elty) in ((:cusparseSbsrsv2_bufferSize, :cusparseSbsrsv2_ handle(), cudir, cutransa, mb, A.nnz, Ref(cudesc), A.nzVal, A.rowPtr, A.colVal, A.blockDim, info[1], bufSize) - buffer = cuzeros(UInt8, bufSize[]) + buffer = CuArrays.zeros(UInt8, bufSize[]) @check ccall(($(string(aname)),libcusparse), cusparseStatus_t, (cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, Cint, Cint, @@ -648,7 +648,7 @@ for (bname,aname,sname,elty) in ((:cusparseScsrsv2_bufferSize, :cusparseScsrsv2_ handle(), cutransa, m, A.nnz, Ref(cudesc), A.nzVal, A.rowPtr, A.colVal, info[1], bufSize) - buffer = cuzeros(UInt8, bufSize[]) + buffer = CuArrays.zeros(UInt8, bufSize[]) @check ccall(($(string(aname)),libcusparse), cusparseStatus_t, (cusparseHandle_t, cusparseOperation_t, Cint, Cint, Ptr{cusparseMatDescr_t}, CuPtr{$elty}, CuPtr{Cint}, @@ -720,7 +720,7 @@ for (bname,aname,sname,elty) in ((:cusparseScsrsv2_bufferSize, :cusparseScsrsv2_ handle(), cutransa, m, A.nnz, Ref(cudesc), A.nzVal, A.colPtr, A.rowVal, info[1], bufSize) - buffer = cuzeros(UInt8, bufSize[]) + buffer = CuArrays.zeros(UInt8, bufSize[]) @check ccall(($(string(aname)),libcusparse), cusparseStatus_t, (cusparseHandle_t, cusparseOperation_t, Cint, Cint, Ptr{cusparseMatDescr_t}, CuPtr{$elty}, CuPtr{Cint}, @@ -822,13 +822,13 @@ for elty in 
(:Float32, :Float64, :ComplexF32, :ComplexF64) A::Union{CuSparseMatrix{$elty},CompressedSparse{$elty}}, X::CuVector{$elty}, index::SparseChar) - mv(transa,alpha,A,X,zero($elty),cuzeros($elty,size(A)[1]),index) + mv(transa,alpha,A,X,zero($elty),CuArrays.zeros($elty,size(A)[1]),index) end function mv(transa::SparseChar, A::Union{CuSparseMatrix{$elty},CompressedSparse{$elty}}, X::CuVector{$elty}, index::SparseChar) - mv(transa,one($elty),A,X,zero($elty),cuzeros($elty,size(A)[1]),index) + mv(transa,one($elty),A,X,zero($elty),CuArrays.zeros($elty,size(A)[1]),index) end end end @@ -1132,21 +1132,21 @@ for elty in (:Float32, :Float64, :ComplexF32, :ComplexF64) B::CuMatrix{$elty}, index::SparseChar) m = transa == 'N' ? size(A)[1] : size(A)[2] - mm!(transa,alpha,A,B,zero($elty),cuzeros($elty,(m,size(B)[2])),index) + mm!(transa,alpha,A,B,zero($elty),CuArrays.zeros($elty,(m,size(B)[2])),index) end function mm(transa::SparseChar, A::CuSparseMatrix{$elty}, B::CuMatrix{$elty}, index::SparseChar) m = transa == 'N' ? size(A)[1] : size(A)[2] - mm!(transa,one($elty),A,B,zero($elty),cuzeros($elty,(m,size(B)[2])),index) + mm!(transa,one($elty),A,B,zero($elty),CuArrays.zeros($elty,(m,size(B)[2])),index) end function mm(transa::SparseChar, A::HermOrSym, B::CuMatrix{$elty}, index::SparseChar) m = transa == 'N' ? size(A)[1] : size(A)[2] - mm!(transa,one($elty),A.data,B,zero($elty),cuzeros($elty,(m,size(B)[2])),index) + mm!(transa,one($elty),A.data,B,zero($elty),CuArrays.zeros($elty,(m,size(B)[2])),index) end end end @@ -1270,7 +1270,7 @@ for elty in (:Float32,:Float64,:ComplexF32,:ComplexF64) index::SparseChar) m = transa == 'N' ? size(A)[1] : size(A)[2] n = transb == 'N' ? size(B)[2] : size(B)[1] - mm2(transa,transb,alpha,A,B,zero($elty),cuzeros($elty,(m,n)),index) + mm2(transa,transb,alpha,A,B,zero($elty),CuArrays.zeros($elty,(m,n)),index) end function mm2(transa::SparseChar, transb::SparseChar, @@ -1279,7 +1279,7 @@ for elty in (:Float32,:Float64,:ComplexF32,:ComplexF64) index::SparseChar) m = transa == 'N' ? size(A)[1] : size(A)[2] n = transb == 'N' ? 
size(B)[2] : size(B)[1] - mm2(transa,transb,one($elty),A,B,zero($elty),cuzeros($elty,(m,n)),index) + mm2(transa,transb,one($elty),A,B,zero($elty),CuArrays.zeros($elty,(m,n)),index) end end end @@ -1533,7 +1533,7 @@ for (bname,aname,sname,elty) in ((:cusparseSbsrsm2_bufferSize, :cusparseSbsrsm2_ cudir, cutransa, cutransxy, mb, nX, A.nnz, Ref(cudesc), A.nzVal, A.rowPtr, A.colVal, A.blockDim, info[1], bufSize) - buffer = cuzeros(UInt8, bufSize[]) + buffer = CuArrays.zeros(UInt8, bufSize[]) @check ccall(($(string(aname)),libcusparse), cusparseStatus_t, (cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, cusparseOperation_t, Cint, @@ -1609,7 +1609,7 @@ for (fname,elty) in ((:cusparseScsrgeam, :Float32), throw(DimensionMismatch("")) end nnzC = Ref{Cint}(1) - rowPtrC = cuzeros(Cint,mA+1) + rowPtrC = CuArrays.zeros(Cint,mA+1) @check ccall((:cusparseXcsrgeamNnz,libcusparse), cusparseStatus_t, (cusparseHandle_t, Cint, Cint, Ptr{cusparseMatDescr_t}, Cint, CuPtr{Cint}, @@ -1619,7 +1619,7 @@ for (fname,elty) in ((:cusparseScsrgeam, :Float32), A.nnz, A.rowPtr, A.colVal, Ref(cudescb), B.nnz, B.rowPtr, B.colVal, Ref(cudescc), rowPtrC, nnzC) nnz = nnzC[] - C = CuSparseMatrixCSR(rowPtrC, cuzeros(Cint,nnz), cuzeros($elty,nnz), nnz, A.dims) + C = CuSparseMatrixCSR(rowPtrC, CuArrays.zeros(Cint,nnz), CuArrays.zeros($elty,nnz), nnz, A.dims) @check ccall(($(string(fname)),libcusparse), cusparseStatus_t, (cusparseHandle_t, Cint, Cint, Ptr{$elty}, Ptr{cusparseMatDescr_t}, Cint, CuPtr{$elty}, @@ -1652,7 +1652,7 @@ for (fname,elty) in ((:cusparseScsrgeam, :Float32), throw(DimensionMismatch("A and B must have same dimensions!")) end nnzC = Ref{Cint}(1) - rowPtrC = cuzeros(Cint, mA+1) + rowPtrC = CuArrays.zeros(Cint, mA+1) @check ccall((:cusparseXcsrgeamNnz,libcusparse), cusparseStatus_t, (cusparseHandle_t, Cint, Cint, Ptr{cusparseMatDescr_t}, Cint, CuPtr{Cint}, @@ -1662,7 +1662,7 @@ for (fname,elty) in ((:cusparseScsrgeam, :Float32), A.nnz, A.colPtr, A.rowVal, Ref(cudescb), B.nnz, B.colPtr, B.rowVal, Ref(cudescc), rowPtrC, nnzC) nnz = nnzC[] - C = CuSparseMatrixCSC(rowPtrC, cuzeros(Cint, nnz), cuzeros($elty, nnz), nnz, A.dims) + C = CuSparseMatrixCSC(rowPtrC, CuArrays.zeros(Cint, nnz), CuArrays.zeros($elty, nnz), nnz, A.dims) @check ccall(($(string(fname)),libcusparse), cusparseStatus_t, (cusparseHandle_t, Cint, Cint, Ptr{$elty}, Ptr{cusparseMatDescr_t}, Cint, CuPtr{$elty}, @@ -1754,7 +1754,7 @@ for (fname,elty) in ((:cusparseScsrgemm, :Float32), throw(DimensionMismatch("Interior dimension of A, $k, and B, $kB, must match")) end nnzC = Ref{Cint}(1) - rowPtrC = CuArray(zeros(Cint,m + 1)) + rowPtrC = CuArrays.zeros(Cint,m + 1) @check ccall((:cusparseXcsrgemmNnz,libcusparse), cusparseStatus_t, (cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, Cint, Cint, Cint, @@ -1766,7 +1766,7 @@ for (fname,elty) in ((:cusparseScsrgemm, :Float32), Ref(cudescb), B.nnz, B.rowPtr, B.colVal, Ref(cudescc), rowPtrC, nnzC) nnz = nnzC[] - C = CuSparseMatrixCSR(rowPtrC, CuArray(zeros(Cint,nnz)), CuArray(zeros($elty,nnz)), nnz, (m,n)) + C = CuSparseMatrixCSR(rowPtrC, CuArrays.zeros(Cint,nnz), CuArrays.zeros($elty,nnz), nnz, (m,n)) @check ccall(($(string(fname)),libcusparse), cusparseStatus_t, (cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, Cint, Cint, Cint, @@ -1819,7 +1819,7 @@ for (fname,elty) in ((:cusparseScsrgemm, :Float32), throw(DimensionMismatch("Interior dimension of A, $k, and B, $kB, must match")) end nnzC = Ref{Cint}(1) - colPtrC = CuArray(zeros(Cint,n + 1)) + colPtrC = CuArrays.zeros(Cint,n + 1) 
@check ccall((:cusparseXcsrgemmNnz,libcusparse), cusparseStatus_t, (cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, Cint, Cint, Cint, @@ -1831,7 +1831,7 @@ for (fname,elty) in ((:cusparseScsrgemm, :Float32), Ref(cudescb), B.nnz, B.colPtr, B.rowVal, Ref(cudescc), colPtrC, nnzC) nnz = nnzC[] - C = CuSparseMatrixCSC(colPtrC, CuArray(zeros(Cint,nnz)), CuArray(zeros($elty,nnz)), nnz, (m,n)) + C = CuSparseMatrixCSC(colPtrC, CuArrays.zeros(Cint,nnz), CuArrays.zeros($elty,nnz), nnz, (m,n)) @check ccall(($(string(fname)),libcusparse), cusparseStatus_t, (cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, Cint, Cint, Cint, @@ -1939,7 +1939,7 @@ for (bname,aname,sname,elty) in ((:cusparseScsric02_bufferSize, :cusparseScsric0 CuPtr{Cint}, csric02Info_t, Ptr{Cint}), handle(), m, A.nnz, Ref(cudesc), A.nzVal, A.rowPtr, A.colVal, info[1], bufSize) - buffer = CuArray(zeros(UInt8, bufSize[])) + buffer = CuArrays.zeros(UInt8, bufSize[]) @check ccall(($(string(aname)),libcusparse), cusparseStatus_t, (cusparseHandle_t, Cint, Cint, Ptr{cusparseMatDescr_t}, CuPtr{$elty}, CuPtr{Cint}, @@ -1990,7 +1990,7 @@ for (bname,aname,sname,elty) in ((:cusparseScsric02_bufferSize, :cusparseScsric0 CuPtr{Cint}, csric02Info_t, Ptr{Cint}), handle(), m, A.nnz, Ref(cudesc), A.nzVal, A.colPtr, A.rowVal, info[1], bufSize) - buffer = CuArray(zeros(UInt8, bufSize[])) + buffer = CuArrays.zeros(UInt8, bufSize[]) @check ccall(($(string(aname)),libcusparse), cusparseStatus_t, (cusparseHandle_t, Cint, Cint, Ptr{cusparseMatDescr_t}, CuPtr{$elty}, CuPtr{Cint}, @@ -2096,7 +2096,7 @@ for (bname,aname,sname,elty) in ((:cusparseScsrilu02_bufferSize, :cusparseScsril CuPtr{Cint}, csrilu02Info_t, Ptr{Cint}), handle(), m, A.nnz, Ref(cudesc), A.nzVal, A.rowPtr, A.colVal, info[1], bufSize) - buffer = CuArray(zeros(UInt8, bufSize[])) + buffer = CuArrays.zeros(UInt8, bufSize[]) @check ccall(($(string(aname)),libcusparse), cusparseStatus_t, (cusparseHandle_t, Cint, Cint, Ptr{cusparseMatDescr_t}, CuPtr{$elty}, CuPtr{Cint}, @@ -2147,7 +2147,7 @@ for (bname,aname,sname,elty) in ((:cusparseScsrilu02_bufferSize, :cusparseScsril CuPtr{Cint}, csrilu02Info_t, Ptr{Cint}), handle(), m, A.nnz, Ref(cudesc), A.nzVal, A.colPtr, A.rowVal, info[1], bufSize) - buffer = CuArray(zeros(UInt8, bufSize[])) + buffer = CuArrays.zeros(UInt8, bufSize[]) @check ccall(($(string(aname)),libcusparse), cusparseStatus_t, (cusparseHandle_t, Cint, Cint, Ptr{cusparseMatDescr_t}, CuPtr{$elty}, CuPtr{Cint}, @@ -2201,7 +2201,7 @@ for (bname,aname,sname,elty) in ((:cusparseSbsric02_bufferSize, :cusparseSbsric0 Ptr{Cint}), handle(), cudir, mb, A.nnz, Ref(cudesc), A.nzVal, A.rowPtr, A.colVal, A.blockDim, info[1], bufSize) - buffer = CuArray(zeros(UInt8, bufSize[])) + buffer = CuArrays.zeros(UInt8, bufSize[]) @check ccall(($(string(aname)),libcusparse), cusparseStatus_t, (cusparseHandle_t, cusparseDirection_t, Cint, Cint, Ptr{cusparseMatDescr_t}, CuPtr{$elty}, @@ -2257,7 +2257,7 @@ for (bname,aname,sname,elty) in ((:cusparseSbsrilu02_bufferSize, :cusparseSbsril Ptr{Cint}), handle(), cudir, mb, A.nnz, Ref(cudesc), A.nzVal, A.rowPtr, A.colVal, A.blockDim, info[1], bufSize) - buffer = CuArray(zeros(UInt8, bufSize[])) + buffer = CuArrays.zeros(UInt8, bufSize[]) @check ccall(($(string(aname)),libcusparse), cusparseStatus_t, (cusparseHandle_t, cusparseDirection_t, Cint, Cint, Ptr{cusparseMatDescr_t}, CuPtr{$elty}, diff --git a/test/base.jl b/test/base.jl index 0bf8da98..c7fd2676 100644 --- a/test/base.jl +++ b/test/base.jl @@ -50,11 +50,11 @@ end @test 
Base.unsafe_wrap(CuArray{Nothing}, CU_NULL, (1,2)) == CuArray{Nothing,2}(buf, (1,2)) @test Base.unsafe_wrap(CuArray{Nothing,2}, CU_NULL, (1,2)) == CuArray{Nothing,2}(buf, (1,2)) - @test collect(cuzeros(2, 2)) == zeros(Float32, 2, 2) - @test collect(cuones(2, 2)) == ones(Float32, 2, 2) + @test collect(CuArrays.zeros(2, 2)) == zeros(Float32, 2, 2) + @test collect(CuArrays.ones(2, 2)) == ones(Float32, 2, 2) - @test collect(cufill(0, 2, 2)) == zeros(Float32, 2, 2) - @test collect(cufill(1, 2, 2)) == ones(Float32, 2, 2) + @test collect(CuArrays.fill(0, 2, 2)) == zeros(Float32, 2, 2) + @test collect(CuArrays.fill(1, 2, 2)) == ones(Float32, 2, 2) end @testset "Adapt" begin @@ -70,6 +70,14 @@ end @test testf((x) -> sin.(x), rand(2, 3)) @test testf((x) -> log.(x) .+ 1, rand(2, 3)) @test testf((x) -> 2x, rand(2, 3)) + @test testf((x) -> x .^ 0, rand(2, 3)) + @test testf((x) -> x .^ 1, rand(2, 3)) + @test testf((x) -> x .^ 2, rand(2, 3)) + @test testf((x) -> x .^ 3, rand(2, 3)) + @test testf((x) -> x .^ 5, rand(2, 3)) + @test testf((x) -> (z = Int32(5); x .^ z), rand(2, 3)) + @test testf((x) -> (z = Float64(π); x .^ z), rand(2, 3)) + @test testf((x) -> (z = Float32(π); x .^ z), rand(Float32, 2, 3)) @test testf((x, y) -> x .+ y, rand(2, 3), rand(1, 3)) @test testf((z, x, y) -> z .= x .+ y, rand(2, 3), rand(2, 3), rand(2)) @test (CuArray{Ptr{Cvoid}}(undef, 1) .= C_NULL) == CuArray([C_NULL]) @@ -82,20 +90,25 @@ end end @testset "Cufunc" begin - gelu(x) = oftype(x, 0.5) * x * (1 + tanh(oftype(x, √(2/π))*(x + oftype(x, 0.044715) * x^3))) + gelu1(x) = oftype(x, 0.5) * x * (1 + tanh(oftype(x, √(2/π))*(x + oftype(x, 0.044715) * x^3))) sig(x) = one(x) / (one(x) + exp(-x)) - f(x) = gelu(log(x)) * sig(x) * tanh(x) + f(x) = gelu1(log(x)) * sig(x) * tanh(x) + g(x) = x^7 - 2 * x^f(x^2) + 3 - CuArrays.@cufunc gelu(x) = oftype(x, 0.5) * x * (1 + tanh(oftype(x, √(2/π))*(x + oftype(x, 0.044715) * x^3))) + + CuArrays.@cufunc gelu1(x) = oftype(x, 0.5) * x * (1 + tanh(oftype(x, √(2/π))*(x + oftype(x, 0.044715) * x^3))) CuArrays.@cufunc sig(x) = one(x) / (one(x) + exp(-x)) - CuArrays.@cufunc f(x) = gelu(log(x)) * sig(x) * tanh(x) + CuArrays.@cufunc f(x) = gelu1(log(x)) * sig(x) * tanh(x) + CuArrays.@cufunc g(x) = x^7 - 2 * x^f(x^2) + 3 - @test :gelu ∈ CuArrays.cufuncs() + @test :gelu1 ∈ CuArrays.cufuncs() @test :sig ∈ CuArrays.cufuncs() @test :f ∈ CuArrays.cufuncs() - @test testf((x) -> gelu.(x), rand(3,3)) + @test :g ∈ CuArrays.cufuncs() + @test testf((x) -> gelu1.(x), rand(3,3)) @test testf((x) -> sig.(x), rand(3,3)) @test testf((x) -> f.(x), rand(3,3)) + @test testf((x) -> g.(x), rand(3,3)) end # https://github.com/JuliaGPU/CUDAnative.jl/issues/223 @@ -231,3 +244,46 @@ end @test testf(x -> filter(y->y .> 0.5, x), rand(2,2)) @test testf(x -> filter(y->y .> 0.5, x), rand(2,2,2)) end + +@testset "generic fallbacks" begin + a = rand(Int8, 3, 3) + b = rand(Int8, 3, 3) + d_a = CuArray{Int8}(a) + d_b = CuArray{Int8}(b) + d_c = d_a*d_b + @test collect(d_c) == a*b + a = rand(Complex{Int8}, 3, 3) + b = rand(Complex{Int8}, 3, 3) + d_a = CuArray{Complex{Int8}}(a) + d_b = CuArray{Complex{Int8}}(b) + d_c = d_a'*d_b + @test collect(d_c) == a'*b + d_c = d_a*d_b' + @test collect(d_c) == a*b' + d_c = d_a'*d_b' + @test collect(d_c) == a'*b' + d_c = transpose(d_a)*d_b' + @test collect(d_c) == transpose(a)*b' + d_c = d_a'*transpose(d_b) + @test collect(d_c) == a'*transpose(b) + d_c = transpose(d_a)*d_b + @test collect(d_c) == transpose(a)*b + d_c = d_a*transpose(d_b) + @test collect(d_c) == a*transpose(b) + d_c = 
transpose(d_a)*transpose(d_b) + @test collect(d_c) == transpose(a)*transpose(b) + d_c = rmul!(copy(d_a), Complex{Int8}(2, 2)) + @test collect(d_c) == a*Complex{Int8}(2, 2) + d_c = lmul!(Complex{Int8}(2, 2), copy(d_a)) + @test collect(d_c) == Complex{Int8}(2, 2)*a +end + +@testset "reverse" begin + @test testf(x->reverse(x), rand(1000)) + @test testf(x->reverse(x, 10), rand(1000)) + @test testf(x->reverse(x, 10, 90), rand(1000)) + + @test testf(x->reverse!(x), rand(1000)) + @test testf(x->reverse!(x, 10), rand(1000)) + @test testf(x->reverse!(x, 10, 90), rand(1000)) +end diff --git a/test/blas.jl b/test/blas.jl index 58fcde3c..9c20d7c0 100644 --- a/test/blas.jl +++ b/test/blas.jl @@ -2,12 +2,7 @@ using LinearAlgebra @testset "CUBLAS" begin -if !isdefined(CuArrays, :CUBLAS) -@warn "Not testing CUBLAS" -else using CuArrays.CUBLAS -@info "Testing CUBLAS $(CUBLAS.version())" - using CuArrays.CUBLAS: band, bandex m = 20 @@ -27,7 +22,7 @@ CUBLAS.cublasSetMathMode(CUBLAS.CUBLAS_DEFAULT_MATH) ################# @testset "Level 1 with element type $T" for T in [Float32, Float64, ComplexF32, ComplexF64] - A = CuArray(rand(T, m)) + A = CuArrays.rand(T, m) B = CuArray{T}(undef, m) CuArrays.CUBLAS.blascopy!(m,A,1,B,1) @test Array(A) == Array(B) @@ -43,6 +38,15 @@ CUBLAS.cublasSetMathMode(CUBLAS.CUBLAS_DEFAULT_MATH) if T <: Real @test testf(argmin, rand(T, m)) @test testf(argmax, rand(T, m)) + else + @test testf(BLAS.dotu, rand(T, m), rand(T, m)) + x = rand(T, m) + y = rand(T, m) + dx = CuArray(x) + dy = CuArray(y) + dz = BLAS.dot(dx, dy) + z = BLAS.dotc(x, y) + @test dz ≈ z end end # level 1 testset @@ -55,6 +59,16 @@ end # level 1 testset @test testf(*, rand(elty, m, n), rand(elty, n)) @test testf(*, transpose(rand(elty, m, n)), rand(elty, m)) @test testf(*, rand(elty, m, n)', rand(elty, m)) + x = rand(elty, m) + A = rand(elty, m, m + 1 ) + y = rand(elty, m) + dx = CuArray(x) + dA = CuArray(A) + dy = CuArray(y) + @test_throws DimensionMismatch mul!(dy, dA, dx) + A = rand(elty, m + 1, m ) + dA = CuArray(A) + @test_throws DimensionMismatch mul!(dy, dA, dx) end @testset "banded methods" begin # bands @@ -365,6 +379,8 @@ end # level 1 testset # compare @test C1 ≈ h_C1 @test C2 ≈ h_C2 + @test_throws ArgumentError mul!(dhA, dhA, dsA) + @test_throws DimensionMismatch mul!(d_C1, d_A, dsA) end @testset "gemm" begin @@ -1105,6 +1121,6 @@ end # level 1 testset @test C ≈ h_C end end # extensions - end # elty - end # if CUBLAS +end # elty + end # cublas testset diff --git a/test/fft.jl b/test/fft.jl index bc6b042c..90b960f3 100644 --- a/test/fft.jl +++ b/test/fft.jl @@ -1,16 +1,12 @@ @testset "CUFFT" begin -if !isdefined(CuArrays, :CUFFT) -@warn "Not testing CUFFT" -else using CuArrays.CUFFT -@info "Testing CUFFT $(CUFFT.version())" + +using FFTW # notes: # plan_bfft does not need separate testing since it is used by plan_ifft -using FFTW - N1 = 8 N2 = 32 N3 = 64 @@ -278,6 +274,11 @@ end end # testset int FFT +@testset "streams" begin + X = rand(N1) + d_X = CuArray(X) + p = plan_fft(d_X) + CUFFT.set_stream(p, CUDAdrv.CuDefaultStream()) end end diff --git a/test/forwarddiff.jl b/test/forwarddiff.jl new file mode 100644 index 00000000..de3e17c9 --- /dev/null +++ b/test/forwarddiff.jl @@ -0,0 +1,68 @@ +@testset "ForwardDiff" begin + using ForwardDiff + + @info "Testing ForwardDiff integration" + + function test_derivative(f, x::T) where T + buf = CuArray(zeros(T)) + + function kernel(a, x) + a[] = ForwardDiff.derivative(f, x) + return + end + CUDAnative.@cuda kernel(buf, x) + return buf[] + end + + testf(cuf, f, x) = 
test_derivative(cuf, x) ≈ ForwardDiff.derivative(f, x) + + + @testset "UNARY" begin + fs = filter(x->x[1] ==:CUDAnative && x[3] == 1, keys(ForwardDiff.DiffRules.DEFINED_DIFFRULES)) + + + nonneg = [:log, :log1p, :log10, :log2, :sqrt, :acosh] + + for (m, fn, _) ∈ fs + cuf = @eval $m.$fn + f = @eval $fn + + x32 = rand(Float32) + x64 = rand(Float64) + nx32 = -x32 + nx64 = -x64 + + if fn == :acosh + x32 += 1 + x64 += 1 + end + + @test testf(cuf, f, x32) + @test testf(cuf, f, x64) + + if fn ∉ nonneg + @test testf(cuf, f, nx32) + @test testf(cuf, f, nx64) + end + end + end + + @testset "POW" begin + x32 = rand(Float32) + x64 = rand(Float64) + y32 = rand(Float32) + y64 = rand(Float64) + y = Int32(7) + + @test testf(x->CUDAnative.pow(x, y), x->x^y, x32) + @test testf(x->CUDAnative.pow(x, y), x->x^y, x64) + @test testf(x->CUDAnative.pow(x, y32), x->x^y32, x32) + @test testf(x->CUDAnative.pow(x, y64), x->x^y64, x64) + + @test testf(y->CUDAnative.pow(x32, y), y->x32^y, y32) + @test testf(y->CUDAnative.pow(x64, y), y->x64^y, y64) + + @test testf(x->CUDAnative.pow(x, x), x->x^x, x32) + @test testf(x->CUDAnative.pow(x, x), x->x^x, x64) + end +end diff --git a/test/rand.jl b/test/rand.jl index 29382f30..83260d12 100644 --- a/test/rand.jl +++ b/test/rand.jl @@ -1,10 +1,6 @@ @testset "CURAND" begin -if !isdefined(CuArrays, :CURAND) -@warn "Not testing CURAND" -else using CuArrays.CURAND -@info "Testing CURAND $(CURAND.version())" CURAND.seed!() @@ -19,26 +15,27 @@ for (f,T) in ((rand!,Float32), end # out-of-place, with implicit type -for (f,T) in ((curand,Float32), (curandn,Float32), (curand_logn,Float32), - (curand_poisson,Cuint),(rand,Float64), (randn,Float64)), +for (f,T) in ((CuArrays.rand,Float32), (CuArrays.randn,Float32), + (CuArrays.rand_logn,Float32), (CuArrays.rand_poisson,Cuint), + (rand,Float64), (randn,Float64)), args in ((2,), (2, 2)) A = f(args...) @test eltype(A) == T end # out-of-place, with type specified -for (f,T) in ((curand,Float32), (curandn,Float32), (curand_logn,Float32), - (curand,Float64), (curandn,Float64), (curand_logn,Float64), - (curand_poisson,Cuint), (rand,Float32), (randn,Float32), - (rand_logn,Float32), (rand,Float64), (randn,Float64), - (rand_logn,Float64), (rand_poisson,Cuint)), +for (f,T) in ((CuArrays.rand,Float32), (CuArrays.randn,Float32), (CuArrays.rand_logn,Float32), + (CuArrays.rand,Float64), (CuArrays.randn,Float64), (CuArrays.rand_logn,Float64), + (CuArrays.rand_poisson,Cuint), + (rand,Float32), (randn,Float32), + (rand,Float64), (randn,Float64)), args in ((T, 2), (T, 2, 2), (T, (2, 2))) A = f(args...) @test eltype(A) == T end # unsupported types that fall back to GPUArrays -for (f,T) in ((curand,Int64),), +for (f,T) in ((CuArrays.rand,Int64),), args in ((T, 2), (T, 2, 2), (T, (2, 2))) A = f(args...) 
@test eltype(A) == T @@ -54,5 +51,3 @@ end @test_throws ErrorException rand_poisson!(CuArray{Float64}(undef, 10)) end - -end diff --git a/test/runtests.jl b/test/runtests.jl index 8fdb5b89..2c027334 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -6,6 +6,7 @@ using Random Random.seed!(1) using CuArrays +using CUDAnative using GPUArrays import GPUArrays: allowscalar, @allowscalar @@ -17,14 +18,15 @@ allowscalar(false) @testset "CuArrays" begin include("base.jl") -include("dnn.jl") include("blas.jl") +include("rand.jl") +include("fft.jl") include("sparse.jl") include("solver.jl") -include("fft.jl") -include("rand.jl") include("sparse_solver.jl") include("special.jl") +include("dnn.jl") +include("forwarddiff.jl") CuArrays.pool_status() CuArrays.pool_timings() diff --git a/test/solver.jl b/test/solver.jl index 544c692f..a1f7285e 100644 --- a/test/solver.jl +++ b/test/solver.jl @@ -1,10 +1,6 @@ @testset "CUSOLVER" begin -if !isdefined(CuArrays, :CUSOLVER) -@warn "Not testing CUSOLVER" -else using CuArrays.CUSOLVER -@info "Testing CUSOLVER $(CUSOLVER.version())" using LinearAlgebra @@ -33,7 +29,7 @@ k = 1 @test F.L ≈ collect(d_F.L) @test F\(A'B) ≈ collect(d_F\(d_A'd_B)) - @test_throws DimensionMismatch LinearAlgebra.LAPACK.potrs!('U',d_A,CuArray(rand(elty,m,m))) + @test_throws DimensionMismatch LinearAlgebra.LAPACK.potrs!('U',d_A,CuArrays.rand(elty,m,m)) A = rand(elty,m,n) d_A = CuArray(A) @@ -233,7 +229,7 @@ k = 1 end h_W = collect(d_W) @test Eig.values ≈ h_W - d_B = CuArray(rand(elty, m+1, m+1)) + d_B = CuArrays.rand(elty, m+1, m+1) if( elty <: Complex ) @test_throws DimensionMismatch CUSOLVER.hegvd!(1, 'N','U', d_A, d_B) else @@ -299,7 +295,7 @@ k = 1 end # Check that constant propagation works _svd(A) = svd(A, CUSOLVER.QRAlgorithm) - @inferred _svd(CuArrays.CURAND.curand(Float32, 4, 4)) + @inferred _svd(CuArrays.rand(Float32, 4, 4)) @testset "qr" begin @@ -311,6 +307,8 @@ k = 1 d_RR = d_F.Q'*d_A @test d_RR[1:n,:] ≈ d_F.R atol=tol*norm(A) @test norm(d_RR[n+1:end,:]) < tol*norm(A) + @test size(d_F) == size(A) + @test size(d_F.Q, 1) == size(A, 1) A = rand(elty, n, m) d_A = CuArray(A) d_F = qr(d_A) @@ -320,6 +318,7 @@ k = 1 h_q, h_r = qr(d_A) q, r = qr(A) @test Array(h_q) ≈ Array(q) + @test collect(CuArray(h_q)) ≈ Array(q) @test Array(h_r) ≈ Array(r) A = rand(elty, n, m) d_A = CuArray(A) @@ -332,5 +331,3 @@ k = 1 end end - -end diff --git a/test/sparse.jl b/test/sparse.jl index 2843b6be..6c67cf27 100644 --- a/test/sparse.jl +++ b/test/sparse.jl @@ -1,10 +1,6 @@ @testset "CUSPARSE" begin -if !isdefined(CuArrays, :CUSPARSE) -@warn "Not testing CUSPARSE" -else using CuArrays.CUSPARSE -@info "Testing CUSPARSE $(CUSPARSE.version())" using LinearAlgebra using SparseArrays @@ -15,6 +11,13 @@ k = 10 blockdim = 5 @testset "util" begin + x = sprand(m,0.2) + d_x = CuSparseVector(x) + @test length(d_x) == m + @test size(d_x) == (m, 1) + @test size(d_x,1) == m + @test size(d_x,2) == 1 + @test ndims(d_x) == 1 x = sprand(m,n,0.2) d_x = CuSparseMatrixCSC(x) @test length(d_x) == m*n @@ -22,7 +25,11 @@ blockdim = 5 @test size(d_x,1) == m @test size(d_x,2) == n @test size(d_x,3) == 1 + @test ndims(d_x) == 2 + @test !issymmetric(d_x) + @test !ishermitian(d_x) @test_throws ArgumentError size(d_x,0) + @test_throws ArgumentError CUSPARSE.CuSparseVector(x) y = sprand(k,n,0.2) d_y = CuSparseMatrixCSC(y) @test_throws ArgumentError copyto!(d_y,d_x) @@ -269,7 +276,7 @@ end h_Y = collect(d_X) Y = A\(alpha * X) @test Y ≈ h_Y - d_X = CuArray(rand(elty,n,n)) + d_X = CuArrays.rand(elty,n,n) @test_throws 
DimensionMismatch CUSPARSE.bsrsm2!('N','N',alpha,d_A,d_X,'O') @test_throws DimensionMismatch CUSPARSE.bsrsm2!('N','T',alpha,d_A,d_X,'O') A = sparse(rand(elty,m,n)) @@ -311,7 +318,7 @@ end h_Y = collect(d_X) Y = A\(alpha * X) @test Y ≈ h_Y - d_X = CuArray(rand(elty,n)) + d_X = CuArrays.rand(elty,n) @test_throws DimensionMismatch CUSPARSE.sv2!('N','U',alpha,d_A,d_X,'O') A = sparse(rand(elty,m,n)) d_A = CuSparseMatrixCSR(A) @@ -322,6 +329,7 @@ end @testset "bsrsv2" begin A = rand(elty,m,m) A = triu(A) + Al = tril(A) X = rand(elty,m) alpha = rand(elty) d_X = CuArray(X) @@ -331,9 +339,24 @@ end h_Y = collect(d_Y) Y = A\(alpha * X) @test Y ≈ h_Y - #=d_Y = UpperTriangular(d_A)\d_X + d_Y = UpperTriangular(d_A)\d_X + h_Y = collect(d_Y) + @test h_Y ≈ A\X + #=d_Y = UpperTriangular(d_A)'\d_X + h_Y = collect(d_Y) + @test h_Y ≈ A'\X=# + d_Y = transpose(UpperTriangular(d_A))\d_X + h_Y = collect(d_Y) + @test h_Y ≈ transpose(A)\X + d_Y = LowerTriangular(d_A)\d_X h_Y = collect(d_Y) - @test h_Y ≈ A\X=# + @test h_Y ≈ Al\X + #=d_Y = LowerTriangular(d_A)'\d_X + h_Y = collect(d_Y) + @test h_Y ≈ A'\X=# + d_Y = transpose(LowerTriangular(d_A))\d_X + h_Y = collect(d_Y) + @test h_Y ≈ transpose(Al)\X A = sparse(rand(elty,m,n)) d_A = CuSparseMatrixCSR(A) d_A = CUSPARSE.switch2bsr(d_A, convert(Cint,5)) @@ -513,15 +536,15 @@ end h_y = collect(d_y) y = A\X @test y ≈ h_y - d_y = UpperTriangular(d_A)'\d_X + #=d_y = UpperTriangular(d_A)'\d_X h_y = collect(d_y) - y = A'\X + y = A'\X=# @test y ≈ h_y d_y = transpose(UpperTriangular(d_A))\d_X h_y = collect(d_y) y = transpose(A)\X @test y ≈ h_y - d_X = CuArray(rand(elty,n,n)) + d_X = CuArrays.rand(elty,n,n) @test_throws DimensionMismatch CUSPARSE.sm_solve('N','U',alpha,d_A,d_X,info,'O') A = sparse(rand(elty,m,n)) d_A = CuSparseMatrixCSR(A) @@ -554,7 +577,7 @@ end h_y = collect(d_y) y = transpose(A)\X @test y ≈ h_y - d_X = CuArray(rand(elty,n,n)) + d_X = CuArrays.rand(elty,n,n) @test_throws DimensionMismatch CUSPARSE.sm_solve('N','U',alpha,d_A,d_X,info,'O') A = sparse(rand(elty,m,n)) d_A = CuSparseMatrixCSC(A) @@ -611,6 +634,7 @@ end @testset "csrsv2" begin A = rand(elty,m,m) A = triu(A) + Al = tril(A) X = rand(elty,m) alpha = rand(elty) d_X = CuArray(X) @@ -627,10 +651,22 @@ end h_y = collect(d_y) y = transpose(A)\X @test y ≈ h_y - d_y = UpperTriangular(d_A)'\d_X + #=d_y = UpperTriangular(d_A)'\d_X h_y = collect(d_y) y = A'\X + @test y ≈ h_y=# + d_y = LowerTriangular(d_A)\d_X + h_y = collect(d_y) + y = Al\X + @test y ≈ h_y + d_y = transpose(LowerTriangular(d_A))\d_X + h_y = collect(d_y) + y = transpose(Al)\X @test y ≈ h_y + #=d_y = LowerTriangular(d_A)'\d_X + h_y = collect(d_y) + y = A'\X + @test y ≈ h_y=# A = sparse(rand(elty,m,n)) d_A = CuSparseMatrixCSR(A) @test_throws DimensionMismatch CUSPARSE.sv2('N','U',alpha,d_A,d_X,'O') @@ -639,6 +675,7 @@ end @testset "cscsv2" begin A = rand(elty,m,m) A = triu(A) + Al = tril(A) X = rand(elty,m) alpha = rand(elty) d_X = CuArray(X) @@ -659,6 +696,14 @@ end h_y = collect(d_y) y = transpose(A)\X @test y ≈ h_y + d_y = LowerTriangular(d_A)\d_X + h_y = collect(d_y) + y = Al\X + @test y ≈ h_y + d_y = transpose(LowerTriangular(d_A))\d_X + h_y = collect(d_y) + y = transpose(Al)\X + @test y ≈ h_y #=d_y = UpperTriangular(d_A)'\d_X h_y = collect(d_y) y = A'\X @@ -1030,7 +1075,7 @@ end h_y = collect(d_y) @test h_y ≈ A\x=# - d_x = CuArray(rand(elty,n)) + d_x = CuArrays.rand(elty,n) info = CUSPARSE.sv_analysis('N','T','U',d_A,'O') @test_throws DimensionMismatch CUSPARSE.sv_solve('N','U',alpha,d_A,d_x,info,'O') A = sparse(rand(elty,m,n)) @@ -1137,7 
+1182,7 @@ end
h_C = collect(d_C)
D = transpose(A) * B
@test D ≈ h_C
- d_B = CuArray(rand(elty,k,n))
+ d_B = CuArrays.rand(elty,k,n)
@test_throws DimensionMismatch CUSPARSE.mm!('T',alpha,d_A,d_B,beta,d_C,'O')
@test_throws DimensionMismatch CUSPARSE.mm!('N',alpha,d_A,d_B,beta,d_B,'O')
end
@@ -1153,7 +1198,7 @@ end
h_C = collect(d_C)
D = transpose(A) * B
@test D ≈ h_C
- d_B = CuArray(rand(elty,k,n))
+ d_B = CuArrays.rand(elty,k,n)
@test_throws DimensionMismatch CUSPARSE.mm!('T',alpha,d_A,d_B,beta,d_C,'O')
@test_throws DimensionMismatch CUSPARSE.mm!('N',alpha,d_A,d_B,beta,d_B,'O')
end
@@ -1175,7 +1220,7 @@ end
h_C = collect(d_C)
D = alpha * A * B + beta * C
@test D ≈ h_C
- d_B = CuArray(rand(elty,k,n))
+ d_B = CuArrays.rand(elty,k,n)
@test_throws DimensionMismatch CUSPARSE.mm!('T',alpha,d_A,d_B,beta,d_C,'O')
@test_throws DimensionMismatch CUSPARSE.mm!('N',alpha,d_A,d_B,beta,d_B,'O')
end
@@ -1187,7 +1232,7 @@ end
h_C = collect(d_C)
D = alpha * A * B + beta * C
@test D ≈ h_C
- d_B = CuArray(rand(elty,k,n))
+ d_B = CuArrays.rand(elty,k,n)
@test_throws DimensionMismatch CUSPARSE.mm!('T',alpha,d_A,d_B,beta,d_C,'O')
@test_throws DimensionMismatch CUSPARSE.mm!('N',alpha,d_A,d_B,beta,d_B,'O')
end
@@ -1913,5 +1958,3 @@ end
end
end
-
-end
diff --git a/test/sparse_solver.jl b/test/sparse_solver.jl
index 090800dd..82c9a7a3 100644
--- a/test/sparse_solver.jl
+++ b/test/sparse_solver.jl
@@ -1,8 +1,7 @@
@testset "CUSPARSE + CUSOLVER" begin
-if isdefined(CuArrays, :CUSPARSE) && isdefined(CuArrays, :CUSOLVER)
-using CuArrays.CUSOLVER
using CuArrays.CUSPARSE
+using CuArrays.CUSOLVER
using LinearAlgebra
using SparseArrays
@@ -84,7 +83,7 @@ k = 1
A = A + A'
d_A = CuSparseMatrixCSR(A)
evs = eigvals(Array(A))
- x_0 = CuArray(rand(elty,n))
+ x_0 = CuArrays.rand(elty,n)
μ,x = CUSOLVER.csreigvsi(d_A,convert(elty,evs[1]),x_0,convert(real(elty),1e-6),convert(Cint,1000),'O')
@test μ ≈ evs[1]
A = sparse(rand(elty,m,n))
@@ -92,7 +91,7 @@ k = 1
@test_throws DimensionMismatch CUSOLVER.csreigvsi(d_A,convert(elty,evs[1]),x_0,convert(real(elty),1e-6),convert(Cint,1000),'O')
A = sparse(rand(elty,n,n))
d_A = CuSparseMatrixCSR(A)
- x_0 = CuArray(rand(elty,m))
+ x_0 = CuArrays.rand(elty,m)
@test_throws DimensionMismatch CUSOLVER.csreigvsi(d_A,convert(elty,evs[1]),x_0,convert(real(elty),1e-6),convert(Cint,1000),'O')
end
@testset "csreigs" begin
@@ -126,5 +125,3 @@ k = 1
end
end
-
-end
From 61c66f421636f34d8a77001fd3c422b7e77a947b Mon Sep 17 00:00:00 2001
From: Kai Xu
Date: Tue, 9 Jul 2019 02:28:31 +0100
Subject: [PATCH 3/3] bugfix: reorder gamma diff rules and define lbeta via the lgamma port
---
 src/forwarddiff.jl | 29 ++++++++++++++---------------
 src/special/gamma.jl | 4 +---
 2 files changed, 15 insertions(+), 18 deletions(-)
diff --git a/src/forwarddiff.jl b/src/forwarddiff.jl
index c6366ff3..5da13f8f 100644
--- a/src/forwarddiff.jl
+++ b/src/forwarddiff.jl
@@ -1,6 +1,6 @@
# ForwardDiff integration
-byhand = [:lgamma, :digamma, :lbeta, :exp2, :log2, :exp10, :log10, :abs]
+byhand = [:exp2, :log2, :exp10, :log10, :abs]
for f in libdevice
if haskey(ForwardDiff.DiffRules.DEFINED_DIFFRULES, (:Base,f,1))
@@ -12,18 +12,6 @@ for f in libdevice
end
end
-# byhand: lgamma
-ForwardDiff.DiffRules.@define_diffrule CuArrays.lgamma(a) = :(CuArrays.digamma($a))
-eval(ForwardDiff.unary_dual_definition(:CuArrays, :lgamma))
-
-# byhand: digamma
-ForwardDiff.DiffRules.@define_diffrule CuArrays.digamma(a) = :(CuArrays.trigamma($a))
-eval(ForwardDiff.unary_dual_definition(:CuArrays, :digamma))
-
-# byhand: lbeta
-ForwardDiff.DiffRules.@define_diffrule CuArrays.lbeta(a, b) = :(CuArrays.digamma($a) - 
CuArrays.digamma($a + $b)), :(CuArrays.digamma($b) - CuArrays.digamma($a + $b)) -eval(ForwardDiff.binary_dual_definition(:CuArrays, :lbeta)) - # byhand: exp2 ForwardDiff.DiffRules.DEFINED_DIFFRULES[(:CUDAnative, :exp2, 1)] = x -> :((CuArrays.cufunc(exp2))(x) * (CuArrays.cufunc(log))(oftype(x, 2))) @@ -49,9 +37,20 @@ ForwardDiff.DiffRules.DEFINED_DIFFRULES[(:CUDAnative, :abs, 1)] = x -> :(signbit(x) ? -one(x) : one(x)) eval(ForwardDiff.unary_dual_definition(:CUDAnative, :abs)) +# byhand: lgamma +ForwardDiff.DiffRules.@define_diffrule CuArrays.lgamma(a) = :(CuArrays.digamma($a)) +eval(ForwardDiff.unary_dual_definition(:CuArrays, :lgamma)) + +# byhand: digamma +ForwardDiff.DiffRules.@define_diffrule CuArrays.digamma(a) = :(CuArrays.trigamma($a)) +eval(ForwardDiff.unary_dual_definition(:CuArrays, :digamma)) + +# byhand: lbeta +ForwardDiff.DiffRules.@define_diffrule CuArrays.lbeta(a, b) = :(CuArrays.digamma($a) - CuArrays.digamma($a + $b)), :(CuArrays.digamma($b) - CuArrays.digamma($a + $b)) +eval(ForwardDiff.binary_dual_definition(:CuArrays, :lbeta)) -ForwardDiff.DiffRules.DEFINED_DIFFRULES[(:CUDAnative, :pow, 2)] = (x, y) -> - replace_device.(ForwardDiff.DiffRules.DEFINED_DIFFRULES[(:Base, :^, 2)](x, y)) +ForwardDiff.DiffRules.DEFINED_DIFFRULES[(:CUDAnative, :pow, 2)] = + (x, y) -> replace_device.(ForwardDiff.DiffRules.DEFINED_DIFFRULES[(:Base, :^, 2)](x, y)) @eval begin ForwardDiff.@define_binary_dual_op( diff --git a/src/special/gamma.jl b/src/special/gamma.jl index 38f8deed..e0090f88 100644 --- a/src/special/gamma.jl +++ b/src/special/gamma.jl @@ -60,6 +60,4 @@ function trigamma(x) end end -function lbeta(x, y) - return CUDAnative.lgamma(x) + CUDAnative.lgamma(y) - CUDAnative.lgamma(x + y) -end +lbeta(x, y) = lgamma(x) + lgamma(y) - lgamma(x + y)
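
Taken together, the series makes the gamma family usable end to end: the ports in src/special/gamma.jl evaluate inside kernels, and the rules in src/forwarddiff.jl push ForwardDiff duals through them. Below is a minimal sketch of both halves, assuming the three patches are applied on a CUDA-capable setup of this era (CuArrays + CUDAnative); the names xs, buf, and lgamma_grad_kernel, the length 1024, and the probe point 2f0 are illustrative choices, not anything the patches define.

    using CuArrays, CUDAnative, SpecialFunctions, ForwardDiff

    # Forward evaluation: the GPU port should agree with SpecialFunctions on the CPU.
    xs = CuArrays.rand(Float32, 1024) .+ 1f0   # keep arguments positive
    @assert collect(CuArrays.lgamma.(xs)) ≈ SpecialFunctions.lgamma.(collect(xs))

    # Differentiation inside a kernel: the dual definition for CuArrays.lgamma
    # computes the derivative via CuArrays.digamma, per the diff rule above.
    function lgamma_grad_kernel(out)
        out[] = ForwardDiff.derivative(CuArrays.lgamma, 2f0)
        return
    end
    buf = CuArray(zeros(Float32))   # 0-dim device buffer, as in test/forwarddiff.jl
    CUDAnative.@cuda lgamma_grad_kernel(buf)
    @assert buf[] ≈ SpecialFunctions.digamma(2f0)

Since the last hunk redefines lbeta as a composition of the local lgamma port rather than calling CUDAnative.lgamma directly, lbeta inherits the same device path, while its ForwardDiff partials come from the digamma-based binary rule registered just above.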