From 03a8b69e100c571b258ec97a12f4dc9d4fbce41d Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 20 Dec 2019 08:23:08 +0100 Subject: [PATCH 1/3] Rework library handles for multithreading. --- Manifest.toml | 12 ++++++--- src/CuArrays.jl | 26 -------------------- src/blas/CUBLAS.jl | 56 ++++++++++++++++++++++++++++-------------- src/blas/error.jl | 35 ++++++++++++++++++++------ src/dnn/CUDNN.jl | 27 +++++++++++++------- src/dnn/error.jl | 34 ++++++++++++++++++++----- src/dnn/filter.jl | 2 +- src/fft/CUFFT.jl | 2 ++ src/fft/error.jl | 32 +++++++++++++++++++----- src/rand/CURAND.jl | 26 +++++++++++++------- src/rand/error.jl | 32 +++++++++++++++++++----- src/solver/CUSOLVER.jl | 47 +++++++++++++++++++++++------------ src/solver/error.jl | 34 +++++++++++++++++++------ src/sparse/CUSPARSE.jl | 27 +++++++++++++------- src/sparse/error.jl | 34 ++++++++++++++++++++----- src/tensor/CUTENSOR.jl | 29 ++++++++++++++++------ src/tensor/error.jl | 32 ++++++++++++++++++++---- src/utils.jl | 10 ++++++++ 18 files changed, 354 insertions(+), 143 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index da7fb34d..2d2f7153 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -22,19 +22,25 @@ version = "0.2.0" [[CUDAapi]] deps = ["Libdl", "Logging"] -git-tree-sha1 = "6eee47385c81ed3b3f716b745697869c712c2df3" +git-tree-sha1 = "ca1c7f639c5f6326919ee2834fa0dffb5002ff60" +repo-rev = "master" +repo-url = "https://github.com/JuliaGPU/CUDAapi.jl.git" uuid = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3" version = "2.0.0" [[CUDAdrv]] deps = ["CEnum", "CUDAapi", "Printf"] -git-tree-sha1 = "0f39fddace3324707469ace7fbcbc7b28d5cf921" +git-tree-sha1 = "5a9dd9ec20a5a2c78c784c31361cf5a813c7a9c2" +repo-rev = "master" +repo-url = "https://github.com/JuliaGPU/CUDAdrv.jl.git" uuid = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde" version = "4.0.4" [[CUDAnative]] deps = ["Adapt", "CEnum", "CUDAapi", "CUDAdrv", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "Printf", "TimerOutputs"] -git-tree-sha1 = 
"a67b38619d1fa131027bac1c4a81f0012254d1fd" +git-tree-sha1 = "1a04a76171016f68f4790e9643524a3ac31f3d32" +repo-rev = "master" +repo-url = "https://github.com/JuliaGPU/CUDAnative.jl.git" uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17" version = "2.6.0" diff --git a/src/CuArrays.jl b/src/CuArrays.jl index 9599dbd8..75f7907d 100644 --- a/src/CuArrays.jl +++ b/src/CuArrays.jl @@ -32,10 +32,6 @@ include("linalg.jl") include("gpuarray_interface.jl") -# many libraries need to be initialized per-device (per-context, really, but we assume users -# of CuArrays and/or CUDAnative only use a single context), so keep track of the active one. -const active_context = Ref{CuContext}() - include("blas/CUBLAS.jl") include("sparse/CUSPARSE.jl") include("solver/CUSOLVER.jl") @@ -112,28 +108,6 @@ function __init__() # package integrations @require ForwardDiff="f6369f11-7733-5829-9624-2563aa707210" include("forwarddiff.jl") - # update the active context when we switch devices - callback = (::CuDevice, ctx::CuContext) -> begin - active_context[] = ctx - - # wipe the active handles - CUBLAS._handle[] = C_NULL - CUBLAS._xt_handle[] = C_NULL - CUSOLVER._dense_handle[] = C_NULL - CUSOLVER._sparse_handle[] = C_NULL - CUSPARSE._handle[] = C_NULL - CURAND._generator[] = nothing - CUDNN._handle[] = C_NULL - CUTENSOR._handle[] = nothing - end - push!(CUDAnative.device!_listeners, callback) - - # a device might be active already - existing_ctx = CUDAdrv.CuCurrentContext() - if existing_ctx !== nothing - active_context[] = existing_ctx - end - __init_memory__() __initialized__[] = true diff --git a/src/blas/CUBLAS.jl b/src/blas/CUBLAS.jl index 091f214a..e83a2361 100644 --- a/src/blas/CUBLAS.jl +++ b/src/blas/CUBLAS.jl @@ -5,10 +5,10 @@ using CUDAapi using CUDAdrv using CUDAdrv: CUstream -import CUDAnative +using CUDAnative using ..CuArrays -using ..CuArrays: active_context, unsafe_free! +using ..CuArrays: unsafe_free! 
using LinearAlgebra using CEnum @@ -27,17 +27,18 @@ include("wrappers.jl") # high-level integrations include("linalg.jl") -const _handles = Dict{CuContext,cublasHandle_t}() -const _xt_handles = Dict{CuContext,cublasXtHandle_t}() -const _handle = Ref{cublasHandle_t}(C_NULL) -const _xt_handle = Ref{cublasXtHandle_t}(C_NULL) +const created_handles = IdDict{CuContext,cublasHandle_t}() +const created_xt_handles = IdDict{CuContext,cublasXtHandle_t}() +const active_handles = Vector{Union{Nothing,cublasHandle_t}}() +const active_xt_handles = Vector{Union{Nothing,cublasXtHandle_t}}() function handle() - if _handle[] == C_NULL - CUDAnative.maybe_initialize("CUBLAS") - _handle[] = get!(_handles, active_context[]) do - context = active_context[] + tid = Threads.threadid() + if @inbounds active_handles[tid] === nothing + context = CuGetContext() + active_handles[tid] = get!(created_handles, context) do handle = cublasCreate_v2() + atexit(()->CUDAdrv.isvalid(context) && cublasDestroy_v2(handle)) # enable tensor math mode if our device supports it, and fast math is enabled dev = CUDAdrv.device(context) @@ -45,27 +46,44 @@ function handle() cublasSetMathMode(CUBLAS_TENSOR_OP_MATH, handle) end - atexit(()->CUDAdrv.isvalid(context) && cublasDestroy_v2(handle)) handle end end - - return _handle[] + @inbounds active_handles[tid] end function xt_handle() - if _xt_handle[] == C_NULL - @assert isassigned(active_context) # some other call should have initialized CUDA - _xt_handle[] = get!(_xt_handles, active_context[]) do - context = active_context[] + tid = Threads.threadid() + if @inbounds active_xt_handles[tid] === nothing + CUDAnative.maybe_initialize("cublasXtGetHandle") + context = CuCurrentContext() + active_xt_handles[tid] = get!(created_xt_handles, context) do handle = cublasXtCreate() + atexit(()->CUDAdrv.isvalid(context) && cublasXtDestroy(handle)) + + # select the devices + # TODO: this is weird, since we typically use a single device per thread/context devs = convert.(Cint, 
CUDAdrv.devices()) cublasXtDeviceSelect(handle, length(devs), devs) - atexit(()->CUDAdrv.isvalid(context) && cublasXtDestroy(handle)) + handle end end - return _xt_handle[] + @inbounds active_xt_handles[tid] +end + +function __init__() + resize!(active_handles, Threads.nthreads()) + fill!(active_handles, nothing) + + resize!(active_xt_handles, Threads.nthreads()) + fill!(active_xt_handles, nothing) + + CUDAnative.atcontextswitch() do tid, ctx, dev + # we don't eagerly initialize handles, but do so lazily when requested + active_handles[tid] = nothing + active_xt_handles[tid] = nothing + end end end diff --git a/src/blas/error.jl b/src/blas/error.jl index 2b45cb1d..68ea81cd 100644 --- a/src/blas/error.jl +++ b/src/blas/error.jl @@ -37,13 +37,34 @@ function status_message(status) end end -macro check(blas_func) + +## API call wrapper + +# API calls that are allowed without a functional context +const preinit_apicalls = Set{Symbol}([ + :cublasGetVersion, + :cublasGetProperty, + :cublasGetCudartVersion +]) + +# outlined functionality to avoid GC frame allocation +@noinline function throw_api_error(res) + throw(CuError(res)) +end + +macro check(ex) + fun = Symbol(decode_ccall_function(ex)) + init = if !in(fun, preinit_apicalls) + :(CUDAnative.maybe_initialize($(QuoteNode(fun)))) + end quote - local err::cublasStatus_t - err = $(esc(blas_func::Expr)) - if err != CUBLAS_STATUS_SUCCESS - throw(CUBLASError(err)) + $init + + res = $(esc(ex)) + if res != CUBLAS_STATUS_SUCCESS + throw_api_error(res) end - err + + return end -end \ No newline at end of file +end diff --git a/src/dnn/CUDNN.jl b/src/dnn/CUDNN.jl index 79f80290..0d5996d8 100644 --- a/src/dnn/CUDNN.jl +++ b/src/dnn/CUDNN.jl @@ -6,12 +6,12 @@ using CUDAapi: libraryPropertyType using CUDAdrv using CUDAdrv: CUstream -import CUDAnative +using CUDAnative using CEnum using ..CuArrays -using ..CuArrays: active_context, @argout, @workspace +using ..CuArrays: @argout, @workspace import ..CuArrays.unsafe_free! 
import NNlib @@ -41,21 +41,30 @@ include("nnlib.jl") include("compat.jl") -const _handles = Dict{CuContext,cudnnHandle_t}() -const _handle = Ref{cudnnHandle_t}(C_NULL) +const created_handles = IdDict{CuContext,cudnnHandle_t}() +const active_handles = Vector{Union{Nothing,cudnnHandle_t}}() function handle() - if _handle[] == C_NULL - CUDAnative.maybe_initialize("CUDNN") - _handle[] = get!(_handles, active_context[]) do - context = active_context[] + tid = Threads.threadid() + if @inbounds active_handles[tid] === nothing + context = CuGetContext() + active_handles[tid] = get!(created_handles, context) do handle = cudnnCreate() atexit(()->CUDAdrv.isvalid(context) && cudnnDestroy(handle)) handle end end + @inbounds active_handles[tid] +end + +function __init__() + resize!(active_handles, Threads.nthreads()) + fill!(active_handles, nothing) - return _handle[] + CUDAnative.atcontextswitch() do tid, ctx, dev + # we don't eagerly initialize handles, but do so lazily when requested + active_handles[tid] = nothing + end end end diff --git a/src/dnn/error.jl b/src/dnn/error.jl index c8c8d00d..1d716580 100644 --- a/src/dnn/error.jl +++ b/src/dnn/error.jl @@ -11,13 +11,35 @@ function CUDNNError(status::cudnnStatus_t) return CUDNNError(status, msg) end -macro check(dnn_func) + +## API call wrapper + +# API calls that are allowed without a functional context +const preinit_apicalls = Set{Symbol}([ + :cudnnGetVersion, + :cudnnGetProperty, + :cudnnGetCudartVersion, + :cudnnGetErrorString, +]) + +# outlined functionality to avoid GC frame allocation +@noinline function throw_api_error(res) + throw(CUDNNError(res)) +end + +macro check(ex) + fun = Symbol(decode_ccall_function(ex)) + init = if !in(fun, preinit_apicalls) + :(CUDAnative.maybe_initialize($(QuoteNode(fun)))) + end quote - local err::cudnnStatus_t - err = $(esc(dnn_func)) - if err != CUDNN_STATUS_SUCCESS - throw(CUDNNError(err)) + $init + + res = $(esc(ex)) + if res != CUDNN_STATUS_SUCCESS + throw_api_error(res) end - err + 
+ return end end diff --git a/src/dnn/filter.jl b/src/dnn/filter.jl index 5bd8df9d..abf2bb16 100644 --- a/src/dnn/filter.jl +++ b/src/dnn/filter.jl @@ -10,7 +10,7 @@ Base.unsafe_convert(::Type{cudnnFilterDescriptor_t}, fd::FilterDesc) = fd.ptr function createFilterDesc() d = Ref{cudnnFilterDescriptor_t}() - @check cudnnCreateFilterDescriptor(d) + cudnnCreateFilterDescriptor(d) return d[] end diff --git a/src/fft/CUFFT.jl b/src/fft/CUFFT.jl index ec5bf930..339eb041 100644 --- a/src/fft/CUFFT.jl +++ b/src/fft/CUFFT.jl @@ -8,6 +8,8 @@ import ..CuArrays: unsafe_free! using CUDAdrv using CUDAdrv: CUstream +using CUDAnative + using CEnum const libcufft = Ref("libcufft") diff --git a/src/fft/error.jl b/src/fft/error.jl index 8e117813..792ecb1f 100644 --- a/src/fft/error.jl +++ b/src/fft/error.jl @@ -51,13 +51,33 @@ function status_message(status) end end -macro check(fft_func) + +## API call wrapper + +# API calls that are allowed without a functional context +const preinit_apicalls = Set{Symbol}([ + :cufftGetVersion, + :cufftGetProperty, +]) + +# outlined functionality to avoid GC frame allocation +@noinline function throw_api_error(res) + throw(CUFFTError(res)) +end + +macro check(ex) + fun = Symbol(decode_ccall_function(ex)) + init = if !in(fun, preinit_apicalls) + :(CUDAnative.maybe_initialize($(QuoteNode(fun)))) + end quote - local err::cufftResult - err = $(esc(fft_func::Expr)) - if err != CUFFT_SUCCESS - throw(CUFFTError(err)) + $init + + res = $(esc(ex)) + if res != CUFFT_SUCCESS + throw_api_error(res) end - err + + return end end diff --git a/src/rand/CURAND.jl b/src/rand/CURAND.jl index 7c973e5f..7857d137 100644 --- a/src/rand/CURAND.jl +++ b/src/rand/CURAND.jl @@ -1,14 +1,13 @@ module CURAND using ..CuArrays -using ..CuArrays: active_context using CUDAapi using CUDAdrv using CUDAdrv: CUstream -import CUDAnative +using CUDAnative using CEnum @@ -25,19 +24,28 @@ include("wrappers.jl") # high-level integrations include("random.jl") -const _generators = 
Dict{CuContext,RNG}() -const _generator = Ref{Union{Nothing,RNG}}(nothing) +const created_generators = IdDict{CuContext,RNG}() +const active_generators = Vector{Union{Nothing,RNG}}() function generator() - if _generator[] == nothing - CUDAnative.maybe_initialize("CURAND") - _generator[] = get!(_generators, active_context[]) do - context = active_context[] + tid = Threads.threadid() + if @inbounds active_generators[tid] === nothing + context = CuGetContext() + active_generators[tid] = get!(created_generators, context) do RNG() end end + @inbounds active_generators[tid] +end + +function __init__() + resize!(active_generators, Threads.nthreads()) + fill!(active_generators, nothing) - return _generator[]::RNG + CUDAnative.atcontextswitch() do tid, ctx, dev + # we don't eagerly initialize handles, but do so lazily when requested + active_generators[tid] = nothing + end end end diff --git a/src/rand/error.jl b/src/rand/error.jl index 31726fa5..f6ce0a6f 100644 --- a/src/rand/error.jl +++ b/src/rand/error.jl @@ -43,13 +43,33 @@ function status_message(status) end end -macro check(func) + +## API call wrapper + +# API calls that are allowed without a functional context +const preinit_apicalls = Set{Symbol}([ + :curandGetVersion, + :curandGetProperty +]) + +# outlined functionality to avoid GC frame allocation +@noinline function throw_api_error(res) + throw(CURANDError(res)) +end + +macro check(ex) + fun = Symbol(decode_ccall_function(ex)) + init = if !in(fun, preinit_apicalls) + :(CUDAnative.maybe_initialize($(QuoteNode(fun)))) + end quote - local err::curandStatus_t - err = $(esc(func::Expr)) - if err != CURAND_STATUS_SUCCESS - throw(CURANDError(err)) + $init + + res = $(esc(ex)) + if res != CURAND_STATUS_SUCCESS + throw_api_error(res) end - err + + return end end diff --git a/src/solver/CUSOLVER.jl b/src/solver/CUSOLVER.jl index db89bc55..7cd940ef 100644 --- a/src/solver/CUSOLVER.jl +++ b/src/solver/CUSOLVER.jl @@ -1,7 +1,7 @@ module CUSOLVER using ..CuArrays -using 
..CuArrays: active_context, _getindex, unsafe_free!, @argout, @workspace +using ..CuArrays: _getindex, unsafe_free!, @argout, @workspace using ..CUBLAS: cublasFillMode_t, cublasOperation_t, cublasSideMode_t, cublasDiagType_t using ..CUSPARSE: cusparseMatDescr_t @@ -11,7 +11,7 @@ using CUDAapi using CUDAdrv using CUDAdrv: CUstream -import CUDAnative +using CUDAnative using CEnum @@ -29,35 +29,50 @@ include("wrappers.jl") # high-level integrations include("linalg.jl") -const _dense_handles = Dict{CuContext,cusolverDnHandle_t}() -const _dense_handle = Ref{cusolverDnHandle_t}(C_NULL) -const _sparse_handles = Dict{CuContext,cusolverSpHandle_t}() -const _sparse_handle = Ref{cusolverSpHandle_t}(C_NULL) +const created_dense_handles = IdDict{CuContext,cusolverDnHandle_t}() +const created_sparse_handles = IdDict{CuContext,cusolverSpHandle_t}() +const active_dense_handles = Vector{Union{Nothing,cusolverDnHandle_t}}() +const active_sparse_handles = Vector{Union{Nothing,cusolverSpHandle_t}}() function dense_handle() - if _dense_handle[] == C_NULL - CUDAnative.maybe_initialize("CUSOLVER") - _dense_handle[] = get!(_dense_handles, active_context[]) do - context = active_context[] + tid = Threads.threadid() + if @inbounds active_dense_handles[tid] === nothing + context = CuGetContext() + active_dense_handles[tid] = get!(created_dense_handles, context) do handle = cusolverDnCreate() atexit(()->CUDAdrv.isvalid(context) && cusolverDnDestroy(handle)) handle end end - return _dense_handle[] + @inbounds active_dense_handles[tid] end function sparse_handle() - if _sparse_handle[] == C_NULL - CUDAnative.maybe_initialize("CUSOLVER") - _sparse_handle[] = get!(_sparse_handles, active_context[]) do - context = active_context[] + tid = Threads.threadid() + if @inbounds active_sparse_handles[tid] === nothing + CUDAnative.maybe_initialize("cublasXtGetHandle") + context = CuCurrentContext() + active_sparse_handles[tid] = get!(created_sparse_handles, context) do handle = cusolverSpCreate() 
atexit(()->CUDAdrv.isvalid(context) && cusolverSpDestroy(handle)) handle end end - return _sparse_handle[] + @inbounds active_sparse_handles[tid] +end + +function __init__() + resize!(active_dense_handles, Threads.nthreads()) + fill!(active_dense_handles, nothing) + + resize!(active_sparse_handles, Threads.nthreads()) + fill!(active_sparse_handles, nothing) + + CUDAnative.atcontextswitch() do tid, ctx, dev + # we don't eagerly initialize handles, but do so lazily when requested + active_dense_handles[tid] = nothing + active_sparse_handles[tid] = nothing + end end end diff --git a/src/solver/error.jl b/src/solver/error.jl index 4afeeb9f..ac96fc53 100644 --- a/src/solver/error.jl +++ b/src/solver/error.jl @@ -33,13 +33,33 @@ function status_message(status) end end -macro check(solver_func) + +## API call wrapper + +# API calls that are allowed without a functional context +const preinit_apicalls = Set{Symbol}([ + :cusolverGetVersion, + :cusolverGetProperty, +]) + +# outlined functionality to avoid GC frame allocation +@noinline function throw_api_error(res) + throw(CUSOLVERError(res)) +end + +macro check(ex) + fun = Symbol(decode_ccall_function(ex)) + init = if !in(fun, preinit_apicalls) + :(CUDAnative.maybe_initialize($(QuoteNode(fun)))) + end quote - local err::cusolverStatus_t - err = $(esc(solver_func::Expr)) - if err != CUSOLVER_STATUS_SUCCESS - throw(CUSOLVERError(err)) + $init + + res = $(esc(ex)) + if res != CUSOLVER_STATUS_SUCCESS + throw_api_error(res) end - err + + return end -end \ No newline at end of file +end diff --git a/src/sparse/CUSPARSE.jl b/src/sparse/CUSPARSE.jl index 1dbaafd4..4575a058 100644 --- a/src/sparse/CUSPARSE.jl +++ b/src/sparse/CUSPARSE.jl @@ -1,14 +1,14 @@ module CUSPARSE using ..CuArrays -using ..CuArrays: active_context, unsafe_free!, @argout, @workspace +using ..CuArrays: unsafe_free!, @argout, @workspace using CUDAapi using CUDAdrv using CUDAdrv: CUstream -import CUDAnative +using CUDAnative using CEnum @@ -29,21 +29,30 @@ 
include("wrappers.jl") # high-level integrations include("interfaces.jl") -const _handles = Dict{CuContext,cusparseHandle_t}() -const _handle = Ref{cusparseHandle_t}() +const created_handles = IdDict{CuContext,cusparseHandle_t}() +const active_handles = Vector{Union{Nothing,cusparseHandle_t}}() function handle() - if _handle[] == C_NULL - CUDAnative.maybe_initialize("CUSPARSE") - _handle[] = get!(_handles, active_context[]) do - context = active_context[] + tid = Threads.threadid() + if @inbounds active_handles[tid] === nothing + context = CuGetContext() + active_handles[tid] = get!(created_handles, context) do handle = cusparseCreate() atexit(()->CUDAdrv.isvalid(context) && cusparseDestroy(handle)) handle end end + @inbounds active_handles[tid] +end + +function __init__() + resize!(active_handles, Threads.nthreads()) + fill!(active_handles, nothing) - return _handle[] + CUDAnative.atcontextswitch() do tid, ctx, dev + # we don't eagerly initialize handles, but do so lazily when requested + active_handles[tid] = nothing + end end end diff --git a/src/sparse/error.jl b/src/sparse/error.jl index 1befeae1..01464b45 100644 --- a/src/sparse/error.jl +++ b/src/sparse/error.jl @@ -41,13 +41,35 @@ function status_message( status ) end end -macro check(sparse_func) + +## API call wrapper + +# API calls that are allowed without a functional context +const preinit_apicalls = Set{Symbol}([ + :cusparseGetVersion, + :cusparseGetProperty, + :cusparseGetErrorName, + :cusparseGetErrorString, +]) + +# outlined functionality to avoid GC frame allocation +@noinline function throw_api_error(res) + throw(CUSPARSEError(res)) +end + +macro check(ex) + fun = Symbol(decode_ccall_function(ex)) + init = if !in(fun, preinit_apicalls) + :(CUDAnative.maybe_initialize($(QuoteNode(fun)))) + end quote - local err = $(esc(sparse_func::Expr)) - if err != CUSPARSE_STATUS_SUCCESS - throw(CUSPARSEError(cusparseStatus_t(err))) + $init + + res = $(esc(ex)) + if res != CUSPARSE_STATUS_SUCCESS + 
throw_api_error(res) end - err + + return end end - diff --git a/src/tensor/CUTENSOR.jl b/src/tensor/CUTENSOR.jl index c052c3bf..44a2c1f1 100644 --- a/src/tensor/CUTENSOR.jl +++ b/src/tensor/CUTENSOR.jl @@ -1,14 +1,17 @@ module CUTENSOR using ..CuArrays -using ..CuArrays: active_context, @argout, @workspace +using ..CuArrays: @argout, @workspace using CUDAapi using CUDAdrv using CUDAdrv: CUstream +using CUDAnative + using CEnum + const cudaDataType_t = cudaDataType const libcutensor = Ref("libcutensor") @@ -25,20 +28,30 @@ include("wrappers.jl") # high-level integrations include("interfaces.jl") -const _handles = Dict{CuContext,Ref{cutensorHandle_t}}() -const _handle = Ref{Union{Ref{cutensorHandle_t},Nothing}}(nothing) +const created_handles = IdDict{CuContext,Ref{cutensorHandle_t}}() +const active_handles = Vector{Union{Nothing,Ref{cutensorHandle_t}}}() function handle() - if _handle[] == nothing - @assert isassigned(active_context) # some other call should have initialized CUDA - _handle[] = get!(_handles, active_context[]) do - context = active_context[] + tid = Threads.threadid() + if @inbounds active_handles[tid] === nothing + context = CuGetContext() + active_handles[tid] = get!(created_handles, context) do handle = Ref{cutensorHandle_t}() cutensorInit(handle) handle end end - return _handle[] + @inbounds active_handles[tid] +end + +function __init__() + resize!(active_handles, Threads.nthreads()) + fill!(active_handles, nothing) + + CUDAnative.atcontextswitch() do tid, ctx, dev + # we don't eagerly initialize handles, but do so lazily when requested + active_handles[tid] = nothing + end end end diff --git a/src/tensor/error.jl b/src/tensor/error.jl index 734b029f..e433780b 100644 --- a/src/tensor/error.jl +++ b/src/tensor/error.jl @@ -45,12 +45,34 @@ function statusmessage( status ) end end -macro check(tensor_func) + +## API call wrapper + +# API calls that are allowed without a functional context +const preinit_apicalls = Set{Symbol}([ + 
:cutensorGetVersion, + :cutensorGetCudartVersion, + :cutensorGetErrorString, +]) + +# outlined functionality to avoid GC frame allocation +@noinline function throw_api_error(res) + throw(CUTENSORError(res)) +end + +macro check(ex) + fun = Symbol(decode_ccall_function(ex)) + init = if !in(fun, preinit_apicalls) + :(CUDAnative.maybe_initialize($(QuoteNode(fun)))) + end quote - local err = $(esc(tensor_func::Expr)) - if err != CUTENSOR_STATUS_SUCCESS - throw(CUTENSORError(cutensorStatus_t(err))) + $init + + res = $(esc(ex)) + if res != CUTENSOR_STATUS_SUCCESS + throw_api_error(res) end - err + + return end end diff --git a/src/utils.jl b/src/utils.jl index 469466f2..5520b64d 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -199,3 +199,13 @@ macro workspace(ex...) end end end + +if VERSION <= v"1.1" + # JuliaLang/julia#30187-like functionality, but only for CuContext dicts to avoid clashes + function Base.get!(default::Base.Callable, d::IdDict{CuContext,V}, @nospecialize(key)) where {V} + if !haskey(d, key) + d[key] = default() + end + return d[key] + end +end From 9ebe74e3ed0a8a76ef088d3914220ba826acf3f6 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 20 Dec 2019 09:51:31 +0100 Subject: [PATCH 2/3] Fix RNG finalization. --- src/rand/random.jl | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/rand/random.jl b/src/rand/random.jl index daf13898..2dcf256c 100644 --- a/src/rand/random.jl +++ b/src/rand/random.jl @@ -14,19 +14,27 @@ export rand_logn!, rand_poisson! 
mutable struct RNG <: Random.AbstractRNG - ptr::curandGenerator_t + handle::curandGenerator_t + ctx::CuContext typ::Int function RNG(typ=CURAND_RNG_PSEUDO_DEFAULT) - ptr = Ref{curandGenerator_t}() - @allocates curandCreateGenerator(ptr, typ) - obj = new(ptr[], typ) - finalizer(curandDestroyGenerator, obj) + handle_ref = Ref{curandGenerator_t}() + @allocates curandCreateGenerator(handle_ref, typ) + + obj = new(handle_ref[], CuCurrentContext(), typ) + finalizer(unsafe_destroy!, obj) return obj end end -Base.unsafe_convert(::Type{curandGenerator_t}, rng::RNG) = rng.ptr +function unsafe_destroy!(rng::RNG) + if CUDAdrv.isvalid(rng.ctx) + curandDestroyGenerator(rng) + end +end + +Base.unsafe_convert(::Type{curandGenerator_t}, rng::RNG) = rng.handle ## seeding From 8635a2faa7050b81de13b44fe6081876a896cf9d Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 20 Dec 2019 11:57:17 +0100 Subject: [PATCH 3/3] Adapt to initialization API changes. --- Manifest.toml | 4 ++-- src/blas/CUBLAS.jl | 17 ++++++++--------- src/blas/error.jl | 2 +- src/dnn/CUDNN.jl | 8 ++++---- src/dnn/error.jl | 2 +- src/fft/error.jl | 2 +- src/memory.jl | 4 +--- src/rand/CURAND.jl | 6 +++--- src/rand/error.jl | 2 +- src/solver/CUSOLVER.jl | 15 +++++++-------- src/solver/error.jl | 2 +- src/sparse/CUSPARSE.jl | 8 ++++---- src/sparse/error.jl | 2 +- src/tensor/CUTENSOR.jl | 6 +++--- src/tensor/error.jl | 2 +- 15 files changed, 39 insertions(+), 43 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index 2d2f7153..f9626fee 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -30,7 +30,7 @@ version = "2.0.0" [[CUDAdrv]] deps = ["CEnum", "CUDAapi", "Printf"] -git-tree-sha1 = "5a9dd9ec20a5a2c78c784c31361cf5a813c7a9c2" +git-tree-sha1 = "5c2cf00a78503e1f71409cecf3d64508fb33f17f" repo-rev = "master" repo-url = "https://github.com/JuliaGPU/CUDAdrv.jl.git" uuid = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde" @@ -38,7 +38,7 @@ version = "4.0.4" [[CUDAnative]] deps = ["Adapt", "CEnum", "CUDAapi", "CUDAdrv", 
"DataStructures", "InteractiveUtils", "LLVM", "Libdl", "Printf", "TimerOutputs"] -git-tree-sha1 = "1a04a76171016f68f4790e9643524a3ac31f3d32" +git-tree-sha1 = "8b1a585344fee94bdb95ac44653fd057d74e32e6" repo-rev = "master" repo-url = "https://github.com/JuliaGPU/CUDAnative.jl.git" uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17" diff --git a/src/blas/CUBLAS.jl b/src/blas/CUBLAS.jl index e83a2361..2b2fd4b3 100644 --- a/src/blas/CUBLAS.jl +++ b/src/blas/CUBLAS.jl @@ -35,13 +35,13 @@ const active_xt_handles = Vector{Union{Nothing,cublasXtHandle_t}}() function handle() tid = Threads.threadid() if @inbounds active_handles[tid] === nothing - context = CuGetContext() - active_handles[tid] = get!(created_handles, context) do + ctx = context() + active_handles[tid] = get!(created_handles, ctx) do handle = cublasCreate_v2() - atexit(()->CUDAdrv.isvalid(context) && cublasDestroy_v2(handle)) + atexit(()->CUDAdrv.isvalid(ctx) && cublasDestroy_v2(handle)) # enable tensor math mode if our device supports it, and fast math is enabled - dev = CUDAdrv.device(context) + dev = CUDAdrv.device() if Base.JLOptions().fast_math == 1 && CUDAdrv.capability(dev) >= v"7.0" && version() >= v"9" cublasSetMathMode(CUBLAS_TENSOR_OP_MATH, handle) end @@ -55,11 +55,10 @@ end function xt_handle() tid = Threads.threadid() if @inbounds active_xt_handles[tid] === nothing - CUDAnative.maybe_initialize("cublasXtGetHandle") - context = CuCurrentContext() - active_xt_handles[tid] = get!(created_xt_handles, context) do + ctx = context() + active_xt_handles[tid] = get!(created_xt_handles, ctx) do handle = cublasXtCreate() - atexit(()->CUDAdrv.isvalid(context) && cublasXtDestroy(handle)) + atexit(()->CUDAdrv.isvalid(ctx) && cublasXtDestroy(handle)) # select the devices # TODO: this is weird, since we typically use a single device per thread/context @@ -79,7 +78,7 @@ function __init__() resize!(active_xt_handles, Threads.nthreads()) fill!(active_xt_handles, nothing) - CUDAnative.atcontextswitch() do tid, ctx, dev + 
CUDAnative.atcontextswitch() do tid, ctx # we don't eagerly initialize handles, but do so lazily when requested active_handles[tid] = nothing active_xt_handles[tid] = nothing diff --git a/src/blas/error.jl b/src/blas/error.jl index 68ea81cd..8fd091de 100644 --- a/src/blas/error.jl +++ b/src/blas/error.jl @@ -55,7 +55,7 @@ end macro check(ex) fun = Symbol(decode_ccall_function(ex)) init = if !in(fun, preinit_apicalls) - :(CUDAnative.maybe_initialize($(QuoteNode(fun)))) + :(CUDAnative.maybe_initialize()) end quote $init diff --git a/src/dnn/CUDNN.jl b/src/dnn/CUDNN.jl index 0d5996d8..4e933e92 100644 --- a/src/dnn/CUDNN.jl +++ b/src/dnn/CUDNN.jl @@ -47,10 +47,10 @@ const active_handles = Vector{Union{Nothing,cudnnHandle_t}}() function handle() tid = Threads.threadid() if @inbounds active_handles[tid] === nothing - context = CuGetContext() - active_handles[tid] = get!(created_handles, context) do + ctx = context() + active_handles[tid] = get!(created_handles, ctx) do handle = cudnnCreate() - atexit(()->CUDAdrv.isvalid(context) && cudnnDestroy(handle)) + atexit(()->CUDAdrv.isvalid(ctx) && cudnnDestroy(handle)) handle end end @@ -61,7 +61,7 @@ function __init__() resize!(active_handles, Threads.nthreads()) fill!(active_handles, nothing) - CUDAnative.atcontextswitch() do tid, ctx, dev + CUDAnative.atcontextswitch() do tid, ctx # we don't eagerly initialize handles, but do so lazily when requested active_handles[tid] = nothing end diff --git a/src/dnn/error.jl b/src/dnn/error.jl index 1d716580..9527181a 100644 --- a/src/dnn/error.jl +++ b/src/dnn/error.jl @@ -30,7 +30,7 @@ end macro check(ex) fun = Symbol(decode_ccall_function(ex)) init = if !in(fun, preinit_apicalls) - :(CUDAnative.maybe_initialize($(QuoteNode(fun)))) + :(CUDAnative.maybe_initialize()) end quote $init diff --git a/src/fft/error.jl b/src/fft/error.jl index 792ecb1f..4dc4ff31 100644 --- a/src/fft/error.jl +++ b/src/fft/error.jl @@ -68,7 +68,7 @@ end macro check(ex) fun = Symbol(decode_ccall_function(ex)) 
init = if !in(fun, preinit_apicalls) - :(CUDAnative.maybe_initialize($(QuoteNode(fun)))) + :(CUDAnative.maybe_initialize()) end quote $init diff --git a/src/memory.jl b/src/memory.jl index 3ad27678..a8573956 100644 --- a/src/memory.jl +++ b/src/memory.jl @@ -280,9 +280,7 @@ synchronized right before and after executing `ex` to exclude any external effec macro time(ex) quote # @time might surround an application, so be sure to initialize CUDA before that - # FIXME: this should be done in CUDAdrv (`synchronize(ctx=CuCurrentOrNewContext()`) - # but the CUDA initialization mechanics are part of CUDAnative.jl - CUDAnative.maybe_initialize("@time") + CUDAnative.maybe_initialize() # coarse synchronization to exclude effects from previously-executed code CUDAdrv.synchronize() diff --git a/src/rand/CURAND.jl b/src/rand/CURAND.jl index 7857d137..6c3fec4a 100644 --- a/src/rand/CURAND.jl +++ b/src/rand/CURAND.jl @@ -30,8 +30,8 @@ const active_generators = Vector{Union{Nothing,RNG}}() function generator() tid = Threads.threadid() if @inbounds active_generators[tid] === nothing - context = CuGetContext() - active_generators[tid] = get!(created_generators, context) do + ctx = context() + active_generators[tid] = get!(created_generators, ctx) do RNG() end end @@ -42,7 +42,7 @@ function __init__() resize!(active_generators, Threads.nthreads()) fill!(active_generators, nothing) - CUDAnative.atcontextswitch() do tid, ctx, dev + CUDAnative.atcontextswitch() do tid, ctx # we don't eagerly initialize handles, but do so lazily when requested active_generators[tid] = nothing end diff --git a/src/rand/error.jl b/src/rand/error.jl index f6ce0a6f..a43af9d5 100644 --- a/src/rand/error.jl +++ b/src/rand/error.jl @@ -60,7 +60,7 @@ end macro check(ex) fun = Symbol(decode_ccall_function(ex)) init = if !in(fun, preinit_apicalls) - :(CUDAnative.maybe_initialize($(QuoteNode(fun)))) + :(CUDAnative.maybe_initialize()) end quote $init diff --git a/src/solver/CUSOLVER.jl b/src/solver/CUSOLVER.jl index 
7cd940ef..e0e27a0e 100644 --- a/src/solver/CUSOLVER.jl +++ b/src/solver/CUSOLVER.jl @@ -37,10 +37,10 @@ const active_sparse_handles = Vector{Union{Nothing,cusolverSpHandle_t}}() function dense_handle() tid = Threads.threadid() if @inbounds active_dense_handles[tid] === nothing - context = CuGetContext() - active_dense_handles[tid] = get!(created_dense_handles, context) do + ctx = context() + active_dense_handles[tid] = get!(created_dense_handles, ctx) do handle = cusolverDnCreate() - atexit(()->CUDAdrv.isvalid(context) && cusolverDnDestroy(handle)) + atexit(()->CUDAdrv.isvalid(ctx) && cusolverDnDestroy(handle)) handle end end @@ -50,11 +50,10 @@ end function sparse_handle() tid = Threads.threadid() if @inbounds active_sparse_handles[tid] === nothing - CUDAnative.maybe_initialize("cublasXtGetHandle") - context = CuCurrentContext() - active_sparse_handles[tid] = get!(created_sparse_handles, context) do + ctx = context() + active_sparse_handles[tid] = get!(created_sparse_handles, ctx) do handle = cusolverSpCreate() - atexit(()->CUDAdrv.isvalid(context) && cusolverSpDestroy(handle)) + atexit(()->CUDAdrv.isvalid(ctx) && cusolverSpDestroy(handle)) handle end end @@ -68,7 +67,7 @@ function __init__() resize!(active_sparse_handles, Threads.nthreads()) fill!(active_sparse_handles, nothing) - CUDAnative.atcontextswitch() do tid, ctx, dev + CUDAnative.atcontextswitch() do tid, ctx # we don't eagerly initialize handles, but do so lazily when requested active_dense_handles[tid] = nothing active_sparse_handles[tid] = nothing diff --git a/src/solver/error.jl b/src/solver/error.jl index ac96fc53..35937d6f 100644 --- a/src/solver/error.jl +++ b/src/solver/error.jl @@ -50,7 +50,7 @@ end macro check(ex) fun = Symbol(decode_ccall_function(ex)) init = if !in(fun, preinit_apicalls) - :(CUDAnative.maybe_initialize($(QuoteNode(fun)))) + :(CUDAnative.maybe_initialize()) end quote $init diff --git a/src/sparse/CUSPARSE.jl b/src/sparse/CUSPARSE.jl index 4575a058..48fb74df 100644 --- 
a/src/sparse/CUSPARSE.jl +++ b/src/sparse/CUSPARSE.jl @@ -35,10 +35,10 @@ const active_handles = Vector{Union{Nothing,cusparseHandle_t}}() function handle() tid = Threads.threadid() if @inbounds active_handles[tid] === nothing - context = CuGetContext() - active_handles[tid] = get!(created_handles, context) do + ctx = context() + active_handles[tid] = get!(created_handles, ctx) do handle = cusparseCreate() - atexit(()->CUDAdrv.isvalid(context) && cusparseDestroy(handle)) + atexit(()->CUDAdrv.isvalid(ctx) && cusparseDestroy(handle)) handle end end @@ -49,7 +49,7 @@ function __init__() resize!(active_handles, Threads.nthreads()) fill!(active_handles, nothing) - CUDAnative.atcontextswitch() do tid, ctx, dev + CUDAnative.atcontextswitch() do tid, ctx # we don't eagerly initialize handles, but do so lazily when requested active_handles[tid] = nothing end diff --git a/src/sparse/error.jl b/src/sparse/error.jl index 01464b45..25c6d9ea 100644 --- a/src/sparse/error.jl +++ b/src/sparse/error.jl @@ -60,7 +60,7 @@ end macro check(ex) fun = Symbol(decode_ccall_function(ex)) init = if !in(fun, preinit_apicalls) - :(CUDAnative.maybe_initialize($(QuoteNode(fun)))) + :(CUDAnative.maybe_initialize()) end quote $init diff --git a/src/tensor/CUTENSOR.jl b/src/tensor/CUTENSOR.jl index 44a2c1f1..574d1881 100644 --- a/src/tensor/CUTENSOR.jl +++ b/src/tensor/CUTENSOR.jl @@ -34,8 +34,8 @@ const active_handles = Vector{Union{Nothing,Ref{cutensorHandle_t}}}() function handle() tid = Threads.threadid() if @inbounds active_handles[tid] === nothing - context = CuGetContext() - active_handles[tid] = get!(created_handles, context) do + ctx = context() + active_handles[tid] = get!(created_handles, ctx) do handle = Ref{cutensorHandle_t}() cutensorInit(handle) handle @@ -48,7 +48,7 @@ function __init__() resize!(active_handles, Threads.nthreads()) fill!(active_handles, nothing) - CUDAnative.atcontextswitch() do tid, ctx, dev + CUDAnative.atcontextswitch() do tid, ctx # we don't eagerly initialize 
handles, but do so lazily when requested active_handles[tid] = nothing end diff --git a/src/tensor/error.jl b/src/tensor/error.jl index e433780b..7108f976 100644 --- a/src/tensor/error.jl +++ b/src/tensor/error.jl @@ -63,7 +63,7 @@ end macro check(ex) fun = Symbol(decode_ccall_function(ex)) init = if !in(fun, preinit_apicalls) - :(CUDAnative.maybe_initialize($(QuoteNode(fun)))) + :(CUDAnative.maybe_initialize()) end quote $init