From 03a8b69e100c571b258ec97a12f4dc9d4fbce41d Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 20 Dec 2019 08:23:08 +0100 Subject: [PATCH 1/3] Rework library handles for multithreading. --- Manifest.toml | 12 ++++++--- src/CuArrays.jl | 26 -------------------- src/blas/CUBLAS.jl | 56 ++++++++++++++++++++++++++++-------------- src/blas/error.jl | 35 ++++++++++++++++++++------ src/dnn/CUDNN.jl | 27 +++++++++++++------- src/dnn/error.jl | 34 ++++++++++++++++++++----- src/dnn/filter.jl | 2 +- src/fft/CUFFT.jl | 2 ++ src/fft/error.jl | 32 +++++++++++++++++++----- src/rand/CURAND.jl | 26 +++++++++++++------- src/rand/error.jl | 32 +++++++++++++++++++----- src/solver/CUSOLVER.jl | 47 +++++++++++++++++++++++------------ src/solver/error.jl | 34 +++++++++++++++++++------ src/sparse/CUSPARSE.jl | 27 +++++++++++++------- src/sparse/error.jl | 34 ++++++++++++++++++++----- src/tensor/CUTENSOR.jl | 29 ++++++++++++++++------ src/tensor/error.jl | 32 ++++++++++++++++++++---- src/utils.jl | 10 ++++++++ 18 files changed, 354 insertions(+), 143 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index da7fb34d..2d2f7153 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -22,19 +22,25 @@ version = "0.2.0" [[CUDAapi]] deps = ["Libdl", "Logging"] -git-tree-sha1 = "6eee47385c81ed3b3f716b745697869c712c2df3" +git-tree-sha1 = "ca1c7f639c5f6326919ee2834fa0dffb5002ff60" +repo-rev = "master" +repo-url = "https://github.com/JuliaGPU/CUDAapi.jl.git" uuid = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3" version = "2.0.0" [[CUDAdrv]] deps = ["CEnum", "CUDAapi", "Printf"] -git-tree-sha1 = "0f39fddace3324707469ace7fbcbc7b28d5cf921" +git-tree-sha1 = "5a9dd9ec20a5a2c78c784c31361cf5a813c7a9c2" +repo-rev = "master" +repo-url = "https://github.com/JuliaGPU/CUDAdrv.jl.git" uuid = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde" version = "4.0.4" [[CUDAnative]] deps = ["Adapt", "CEnum", "CUDAapi", "CUDAdrv", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "Printf", "TimerOutputs"] -git-tree-sha1 = 
"a67b38619d1fa131027bac1c4a81f0012254d1fd" +git-tree-sha1 = "1a04a76171016f68f4790e9643524a3ac31f3d32" +repo-rev = "master" +repo-url = "https://github.com/JuliaGPU/CUDAnative.jl.git" uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17" version = "2.6.0" diff --git a/src/CuArrays.jl b/src/CuArrays.jl index 9599dbd8..75f7907d 100644 --- a/src/CuArrays.jl +++ b/src/CuArrays.jl @@ -32,10 +32,6 @@ include("linalg.jl") include("gpuarray_interface.jl") -# many libraries need to be initialized per-device (per-context, really, but we assume users -# of CuArrays and/or CUDAnative only use a single context), so keep track of the active one. -const active_context = Ref{CuContext}() - include("blas/CUBLAS.jl") include("sparse/CUSPARSE.jl") include("solver/CUSOLVER.jl") @@ -112,28 +108,6 @@ function __init__() # package integrations @require ForwardDiff="f6369f11-7733-5829-9624-2563aa707210" include("forwarddiff.jl") - # update the active context when we switch devices - callback = (::CuDevice, ctx::CuContext) -> begin - active_context[] = ctx - - # wipe the active handles - CUBLAS._handle[] = C_NULL - CUBLAS._xt_handle[] = C_NULL - CUSOLVER._dense_handle[] = C_NULL - CUSOLVER._sparse_handle[] = C_NULL - CUSPARSE._handle[] = C_NULL - CURAND._generator[] = nothing - CUDNN._handle[] = C_NULL - CUTENSOR._handle[] = nothing - end - push!(CUDAnative.device!_listeners, callback) - - # a device might be active already - existing_ctx = CUDAdrv.CuCurrentContext() - if existing_ctx !== nothing - active_context[] = existing_ctx - end - __init_memory__() __initialized__[] = true diff --git a/src/blas/CUBLAS.jl b/src/blas/CUBLAS.jl index 091f214a..e83a2361 100644 --- a/src/blas/CUBLAS.jl +++ b/src/blas/CUBLAS.jl @@ -5,10 +5,10 @@ using CUDAapi using CUDAdrv using CUDAdrv: CUstream -import CUDAnative +using CUDAnative using ..CuArrays -using ..CuArrays: active_context, unsafe_free! +using ..CuArrays: unsafe_free! 
using LinearAlgebra using CEnum @@ -27,17 +27,18 @@ include("wrappers.jl") # high-level integrations include("linalg.jl") -const _handles = Dict{CuContext,cublasHandle_t}() -const _xt_handles = Dict{CuContext,cublasXtHandle_t}() -const _handle = Ref{cublasHandle_t}(C_NULL) -const _xt_handle = Ref{cublasXtHandle_t}(C_NULL) +const created_handles = IdDict{CuContext,cublasHandle_t}() +const created_xt_handles = IdDict{CuContext,cublasXtHandle_t}() +const active_handles = Vector{Union{Nothing,cublasHandle_t}}() +const active_xt_handles = Vector{Union{Nothing,cublasXtHandle_t}}() function handle() - if _handle[] == C_NULL - CUDAnative.maybe_initialize("CUBLAS") - _handle[] = get!(_handles, active_context[]) do - context = active_context[] + tid = Threads.threadid() + if @inbounds active_handles[tid] === nothing + context = CuGetContext() + active_handles[tid] = get!(created_handles, context) do handle = cublasCreate_v2() + atexit(()->CUDAdrv.isvalid(context) && cublasDestroy_v2(handle)) # enable tensor math mode if our device supports it, and fast math is enabled dev = CUDAdrv.device(context) @@ -45,27 +46,44 @@ function handle() cublasSetMathMode(CUBLAS_TENSOR_OP_MATH, handle) end - atexit(()->CUDAdrv.isvalid(context) && cublasDestroy_v2(handle)) handle end end - - return _handle[] + @inbounds active_handles[tid] end function xt_handle() - if _xt_handle[] == C_NULL - @assert isassigned(active_context) # some other call should have initialized CUDA - _xt_handle[] = get!(_xt_handles, active_context[]) do - context = active_context[] + tid = Threads.threadid() + if @inbounds active_xt_handles[tid] === nothing + CUDAnative.maybe_initialize("cublasXtGetHandle") + context = CuCurrentContext() + active_xt_handles[tid] = get!(created_xt_handles, context) do handle = cublasXtCreate() + atexit(()->CUDAdrv.isvalid(context) && cublasXtDestroy(handle)) + + # select the devices + # TODO: this is weird, since we typically use a single device per thread/context devs = convert.(Cint, 
CUDAdrv.devices()) cublasXtDeviceSelect(handle, length(devs), devs) - atexit(()->CUDAdrv.isvalid(context) && cublasXtDestroy(handle)) + handle end end - return _xt_handle[] + @inbounds active_xt_handles[tid] +end + +function __init__() + resize!(active_handles, Threads.nthreads()) + fill!(active_handles, nothing) + + resize!(active_xt_handles, Threads.nthreads()) + fill!(active_xt_handles, nothing) + + CUDAnative.atcontextswitch() do tid, ctx, dev + # we don't eagerly initialize handles, but do so lazily when requested + active_handles[tid] = nothing + active_xt_handles[tid] = nothing + end end end diff --git a/src/blas/error.jl b/src/blas/error.jl index 2b45cb1d..68ea81cd 100644 --- a/src/blas/error.jl +++ b/src/blas/error.jl @@ -37,13 +37,34 @@ function status_message(status) end end -macro check(blas_func) + +## API call wrapper + +# API calls that are allowed without a functional context +const preinit_apicalls = Set{Symbol}([ + :cublasGetVersion, + :cublasGetProperty, + :cublasGetCudartVersion +]) + +# outlined functionality to avoid GC frame allocation +@noinline function throw_api_error(res) + throw(CuError(res)) +end + +macro check(ex) + fun = Symbol(decode_ccall_function(ex)) + init = if !in(fun, preinit_apicalls) + :(CUDAnative.maybe_initialize($(QuoteNode(fun)))) + end quote - local err::cublasStatus_t - err = $(esc(blas_func::Expr)) - if err != CUBLAS_STATUS_SUCCESS - throw(CUBLASError(err)) + $init + + res = $(esc(ex)) + if res != CUBLAS_STATUS_SUCCESS + throw_api_error(res) end - err + + return end -end \ No newline at end of file +end diff --git a/src/dnn/CUDNN.jl b/src/dnn/CUDNN.jl index 79f80290..0d5996d8 100644 --- a/src/dnn/CUDNN.jl +++ b/src/dnn/CUDNN.jl @@ -6,12 +6,12 @@ using CUDAapi: libraryPropertyType using CUDAdrv using CUDAdrv: CUstream -import CUDAnative +using CUDAnative using CEnum using ..CuArrays -using ..CuArrays: active_context, @argout, @workspace +using ..CuArrays: @argout, @workspace import ..CuArrays.unsafe_free! 
import NNlib @@ -41,21 +41,30 @@ include("nnlib.jl") include("compat.jl") -const _handles = Dict{CuContext,cudnnHandle_t}() -const _handle = Ref{cudnnHandle_t}(C_NULL) +const created_handles = IdDict{CuContext,cudnnHandle_t}() +const active_handles = Vector{Union{Nothing,cudnnHandle_t}}() function handle() - if _handle[] == C_NULL - CUDAnative.maybe_initialize("CUDNN") - _handle[] = get!(_handles, active_context[]) do - context = active_context[] + tid = Threads.threadid() + if @inbounds active_handles[tid] === nothing + context = CuGetContext() + active_handles[tid] = get!(created_handles, context) do handle = cudnnCreate() atexit(()->CUDAdrv.isvalid(context) && cudnnDestroy(handle)) handle end end + @inbounds active_handles[tid] +end + +function __init__() + resize!(active_handles, Threads.nthreads()) + fill!(active_handles, nothing) - return _handle[] + CUDAnative.atcontextswitch() do tid, ctx, dev + # we don't eagerly initialize handles, but do so lazily when requested + active_handles[tid] = nothing + end end end diff --git a/src/dnn/error.jl b/src/dnn/error.jl index c8c8d00d..1d716580 100644 --- a/src/dnn/error.jl +++ b/src/dnn/error.jl @@ -11,13 +11,35 @@ function CUDNNError(status::cudnnStatus_t) return CUDNNError(status, msg) end -macro check(dnn_func) + +## API call wrapper + +# API calls that are allowed without a functional context +const preinit_apicalls = Set{Symbol}([ + :cudnnGetVersion, + :cudnnGetProperty, + :cudnnGetCudartVersion, + :cudnnGetErrorString, +]) + +# outlined functionality to avoid GC frame allocation +@noinline function throw_api_error(res) + throw(CUDNNError(res)) +end + +macro check(ex) + fun = Symbol(decode_ccall_function(ex)) + init = if !in(fun, preinit_apicalls) + :(CUDAnative.maybe_initialize($(QuoteNode(fun)))) + end quote - local err::cudnnStatus_t - err = $(esc(dnn_func)) - if err != CUDNN_STATUS_SUCCESS - throw(CUDNNError(err)) + $init + + res = $(esc(ex)) + if res != CUDNN_STATUS_SUCCESS + throw_api_error(res) end - err + 
+ return end end diff --git a/src/dnn/filter.jl b/src/dnn/filter.jl index 5bd8df9d..abf2bb16 100644 --- a/src/dnn/filter.jl +++ b/src/dnn/filter.jl @@ -10,7 +10,7 @@ Base.unsafe_convert(::Type{cudnnFilterDescriptor_t}, fd::FilterDesc) = fd.ptr function createFilterDesc() d = Ref{cudnnFilterDescriptor_t}() - @check cudnnCreateFilterDescriptor(d) + cudnnCreateFilterDescriptor(d) return d[] end diff --git a/src/fft/CUFFT.jl b/src/fft/CUFFT.jl index ec5bf930..339eb041 100644 --- a/src/fft/CUFFT.jl +++ b/src/fft/CUFFT.jl @@ -8,6 +8,8 @@ import ..CuArrays: unsafe_free! using CUDAdrv using CUDAdrv: CUstream +using CUDAnative + using CEnum const libcufft = Ref("libcufft") diff --git a/src/fft/error.jl b/src/fft/error.jl index 8e117813..792ecb1f 100644 --- a/src/fft/error.jl +++ b/src/fft/error.jl @@ -51,13 +51,33 @@ function status_message(status) end end -macro check(fft_func) + +## API call wrapper + +# API calls that are allowed without a functional context +const preinit_apicalls = Set{Symbol}([ + :cufftGetVersion, + :cufftGetProperty, +]) + +# outlined functionality to avoid GC frame allocation +@noinline function throw_api_error(res) + throw(CUFFTError(res)) +end + +macro check(ex) + fun = Symbol(decode_ccall_function(ex)) + init = if !in(fun, preinit_apicalls) + :(CUDAnative.maybe_initialize($(QuoteNode(fun)))) + end quote - local err::cufftResult - err = $(esc(fft_func::Expr)) - if err != CUFFT_SUCCESS - throw(CUFFTError(err)) + $init + + res = $(esc(ex)) + if res != CUFFT_SUCCESS + throw_api_error(res) end - err + + return end end diff --git a/src/rand/CURAND.jl b/src/rand/CURAND.jl index 7c973e5f..7857d137 100644 --- a/src/rand/CURAND.jl +++ b/src/rand/CURAND.jl @@ -1,14 +1,13 @@ module CURAND using ..CuArrays -using ..CuArrays: active_context using CUDAapi using CUDAdrv using CUDAdrv: CUstream -import CUDAnative +using CUDAnative using CEnum @@ -25,19 +24,28 @@ include("wrappers.jl") # high-level integrations include("random.jl") -const _generators = 
Dict{CuContext,RNG}() -const _generator = Ref{Union{Nothing,RNG}}(nothing) +const created_generators = IdDict{CuContext,RNG}() +const active_generators = Vector{Union{Nothing,RNG}}() function generator() - if _generator[] == nothing - CUDAnative.maybe_initialize("CURAND") - _generator[] = get!(_generators, active_context[]) do - context = active_context[] + tid = Threads.threadid() + if @inbounds active_generators[tid] === nothing + context = CuGetContext() + active_generators[tid] = get!(created_generators, context) do RNG() end end + @inbounds active_generators[tid] +end + +function __init__() + resize!(active_generators, Threads.nthreads()) + fill!(active_generators, nothing) - return _generator[]::RNG + CUDAnative.atcontextswitch() do tid, ctx, dev + # we don't eagerly initialize handles, but do so lazily when requested + active_generators[tid] = nothing + end end end diff --git a/src/rand/error.jl b/src/rand/error.jl index 31726fa5..f6ce0a6f 100644 --- a/src/rand/error.jl +++ b/src/rand/error.jl @@ -43,13 +43,33 @@ function status_message(status) end end -macro check(func) + +## API call wrapper + +# API calls that are allowed without a functional context +const preinit_apicalls = Set{Symbol}([ + :curandGetVersion, + :curandGetProperty +]) + +# outlined functionality to avoid GC frame allocation +@noinline function throw_api_error(res) + throw(CURANDError(res)) +end + +macro check(ex) + fun = Symbol(decode_ccall_function(ex)) + init = if !in(fun, preinit_apicalls) + :(CUDAnative.maybe_initialize($(QuoteNode(fun)))) + end quote - local err::curandStatus_t - err = $(esc(func::Expr)) - if err != CURAND_STATUS_SUCCESS - throw(CURANDError(err)) + $init + + res = $(esc(ex)) + if res != CURAND_STATUS_SUCCESS + throw_api_error(res) end - err + + return end end diff --git a/src/solver/CUSOLVER.jl b/src/solver/CUSOLVER.jl index db89bc55..7cd940ef 100644 --- a/src/solver/CUSOLVER.jl +++ b/src/solver/CUSOLVER.jl @@ -1,7 +1,7 @@ module CUSOLVER using ..CuArrays -using 
..CuArrays: active_context, _getindex, unsafe_free!, @argout, @workspace +using ..CuArrays: _getindex, unsafe_free!, @argout, @workspace using ..CUBLAS: cublasFillMode_t, cublasOperation_t, cublasSideMode_t, cublasDiagType_t using ..CUSPARSE: cusparseMatDescr_t @@ -11,7 +11,7 @@ using CUDAapi using CUDAdrv using CUDAdrv: CUstream -import CUDAnative +using CUDAnative using CEnum @@ -29,35 +29,50 @@ include("wrappers.jl") # high-level integrations include("linalg.jl") -const _dense_handles = Dict{CuContext,cusolverDnHandle_t}() -const _dense_handle = Ref{cusolverDnHandle_t}(C_NULL) -const _sparse_handles = Dict{CuContext,cusolverSpHandle_t}() -const _sparse_handle = Ref{cusolverSpHandle_t}(C_NULL) +const created_dense_handles = IdDict{CuContext,cusolverDnHandle_t}() +const created_sparse_handles = IdDict{CuContext,cusolverSpHandle_t}() +const active_dense_handles = Vector{Union{Nothing,cusolverDnHandle_t}}() +const active_sparse_handles = Vector{Union{Nothing,cusolverSpHandle_t}}() function dense_handle() - if _dense_handle[] == C_NULL - CUDAnative.maybe_initialize("CUSOLVER") - _dense_handle[] = get!(_dense_handles, active_context[]) do - context = active_context[] + tid = Threads.threadid() + if @inbounds active_dense_handles[tid] === nothing + context = CuGetContext() + active_dense_handles[tid] = get!(created_dense_handles, context) do handle = cusolverDnCreate() atexit(()->CUDAdrv.isvalid(context) && cusolverDnDestroy(handle)) handle end end - return _dense_handle[] + @inbounds active_dense_handles[tid] end function sparse_handle() - if _sparse_handle[] == C_NULL - CUDAnative.maybe_initialize("CUSOLVER") - _sparse_handle[] = get!(_sparse_handles, active_context[]) do - context = active_context[] + tid = Threads.threadid() + if @inbounds active_sparse_handles[tid] === nothing + CUDAnative.maybe_initialize("cublasXtGetHandle") + context = CuCurrentContext() + active_sparse_handles[tid] = get!(created_sparse_handles, context) do handle = cusolverSpCreate() 
atexit(()->CUDAdrv.isvalid(context) && cusolverSpDestroy(handle)) handle end end - return _sparse_handle[] + @inbounds active_sparse_handles[tid] +end + +function __init__() + resize!(active_dense_handles, Threads.nthreads()) + fill!(active_dense_handles, nothing) + + resize!(active_sparse_handles, Threads.nthreads()) + fill!(active_sparse_handles, nothing) + + CUDAnative.atcontextswitch() do tid, ctx, dev + # we don't eagerly initialize handles, but do so lazily when requested + active_dense_handles[tid] = nothing + active_sparse_handles[tid] = nothing + end end end diff --git a/src/solver/error.jl b/src/solver/error.jl index 4afeeb9f..ac96fc53 100644 --- a/src/solver/error.jl +++ b/src/solver/error.jl @@ -33,13 +33,33 @@ function status_message(status) end end -macro check(solver_func) + +## API call wrapper + +# API calls that are allowed without a functional context +const preinit_apicalls = Set{Symbol}([ + :cusolverGetVersion, + :cusolverGetProperty, +]) + +# outlined functionality to avoid GC frame allocation +@noinline function throw_api_error(res) + throw(CUSOLVERError(res)) +end + +macro check(ex) + fun = Symbol(decode_ccall_function(ex)) + init = if !in(fun, preinit_apicalls) + :(CUDAnative.maybe_initialize($(QuoteNode(fun)))) + end quote - local err::cusolverStatus_t - err = $(esc(solver_func::Expr)) - if err != CUSOLVER_STATUS_SUCCESS - throw(CUSOLVERError(err)) + $init + + res = $(esc(ex)) + if res != CUSOLVER_STATUS_SUCCESS + throw_api_error(res) end - err + + return end -end \ No newline at end of file +end diff --git a/src/sparse/CUSPARSE.jl b/src/sparse/CUSPARSE.jl index 1dbaafd4..4575a058 100644 --- a/src/sparse/CUSPARSE.jl +++ b/src/sparse/CUSPARSE.jl @@ -1,14 +1,14 @@ module CUSPARSE using ..CuArrays -using ..CuArrays: active_context, unsafe_free!, @argout, @workspace +using ..CuArrays: unsafe_free!, @argout, @workspace using CUDAapi using CUDAdrv using CUDAdrv: CUstream -import CUDAnative +using CUDAnative using CEnum @@ -29,21 +29,30 @@ 
include("wrappers.jl") # high-level integrations include("interfaces.jl") -const _handles = Dict{CuContext,cusparseHandle_t}() -const _handle = Ref{cusparseHandle_t}() +const created_handles = IdDict{CuContext,cusparseHandle_t}() +const active_handles = Vector{Union{Nothing,cusparseHandle_t}}() function handle() - if _handle[] == C_NULL - CUDAnative.maybe_initialize("CUSPARSE") - _handle[] = get!(_handles, active_context[]) do - context = active_context[] + tid = Threads.threadid() + if @inbounds active_handles[tid] === nothing + context = CuGetContext() + active_handles[tid] = get!(created_handles, context) do handle = cusparseCreate() atexit(()->CUDAdrv.isvalid(context) && cusparseDestroy(handle)) handle end end + @inbounds active_handles[tid] +end + +function __init__() + resize!(active_handles, Threads.nthreads()) + fill!(active_handles, nothing) - return _handle[] + CUDAnative.atcontextswitch() do tid, ctx, dev + # we don't eagerly initialize handles, but do so lazily when requested + active_handles[tid] = nothing + end end end diff --git a/src/sparse/error.jl b/src/sparse/error.jl index 1befeae1..01464b45 100644 --- a/src/sparse/error.jl +++ b/src/sparse/error.jl @@ -41,13 +41,35 @@ function status_message( status ) end end -macro check(sparse_func) + +## API call wrapper + +# API calls that are allowed without a functional context +const preinit_apicalls = Set{Symbol}([ + :cusparseGetVersion, + :cusparseGetProperty, + :cusparseGetErrorName, + :cusparseGetErrorString, +]) + +# outlined functionality to avoid GC frame allocation +@noinline function throw_api_error(res) + throw(CUSPARSEError(res)) +end + +macro check(ex) + fun = Symbol(decode_ccall_function(ex)) + init = if !in(fun, preinit_apicalls) + :(CUDAnative.maybe_initialize($(QuoteNode(fun)))) + end quote - local err = $(esc(sparse_func::Expr)) - if err != CUSPARSE_STATUS_SUCCESS - throw(CUSPARSEError(cusparseStatus_t(err))) + $init + + res = $(esc(ex)) + if res != CUSPARSE_STATUS_SUCCESS + 
throw_api_error(res) end - err + + return end end - diff --git a/src/tensor/CUTENSOR.jl b/src/tensor/CUTENSOR.jl index c052c3bf..44a2c1f1 100644 --- a/src/tensor/CUTENSOR.jl +++ b/src/tensor/CUTENSOR.jl @@ -1,14 +1,17 @@ module CUTENSOR using ..CuArrays -using ..CuArrays: active_context, @argout, @workspace +using ..CuArrays: @argout, @workspace using CUDAapi using CUDAdrv using CUDAdrv: CUstream +using CUDAnative + using CEnum + const cudaDataType_t = cudaDataType const libcutensor = Ref("libcutensor") @@ -25,20 +28,30 @@ include("wrappers.jl") # high-level integrations include("interfaces.jl") -const _handles = Dict{CuContext,Ref{cutensorHandle_t}}() -const _handle = Ref{Union{Ref{cutensorHandle_t},Nothing}}(nothing) +const created_handles = IdDict{CuContext,Ref{cutensorHandle_t}}() +const active_handles = Vector{Union{Nothing,Ref{cutensorHandle_t}}}() function handle() - if _handle[] == nothing - @assert isassigned(active_context) # some other call should have initialized CUDA - _handle[] = get!(_handles, active_context[]) do - context = active_context[] + tid = Threads.threadid() + if @inbounds active_handles[tid] === nothing + context = CuGetContext() + active_handles[tid] = get!(created_handles, context) do handle = Ref{cutensorHandle_t}() cutensorInit(handle) handle end end - return _handle[] + @inbounds active_handles[tid] +end + +function __init__() + resize!(active_handles, Threads.nthreads()) + fill!(active_handles, nothing) + + CUDAnative.atcontextswitch() do tid, ctx, dev + # we don't eagerly initialize handles, but do so lazily when requested + active_handles[tid] = nothing + end end end diff --git a/src/tensor/error.jl b/src/tensor/error.jl index 734b029f..e433780b 100644 --- a/src/tensor/error.jl +++ b/src/tensor/error.jl @@ -45,12 +45,34 @@ function statusmessage( status ) end end -macro check(tensor_func) + +## API call wrapper + +# API calls that are allowed without a functional context +const preinit_apicalls = Set{Symbol}([ + 
:cutensorGetVersion, + :cutensorGetCudartVersion, + :cutensorGetErrorString, +]) + +# outlined functionality to avoid GC frame allocation +@noinline function throw_api_error(res) + throw(CUTENSORError(res)) +end + +macro check(ex) + fun = Symbol(decode_ccall_function(ex)) + init = if !in(fun, preinit_apicalls) + :(CUDAnative.maybe_initialize($(QuoteNode(fun)))) + end quote - local err = $(esc(tensor_func::Expr)) - if err != CUTENSOR_STATUS_SUCCESS - throw(CUTENSORError(cutensorStatus_t(err))) + $init + + res = $(esc(ex)) + if res != CUTENSOR_STATUS_SUCCESS + throw_api_error(res) end - err + + return end end diff --git a/src/utils.jl b/src/utils.jl index 469466f2..5520b64d 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -199,3 +199,13 @@ macro workspace(ex...) end end end + +if VERSION <= v"1.1" + # JuliaLang/julia#30187-like functionality, but only for CuContext dicts to avoid clashes + function Base.get!(default::Base.Callable, d::IdDict{CuContext,V}, @nospecialize(key)) where {V} + if !haskey(d, key) + d[key] = default() + end + return d[key] + end +end From 9ebe74e3ed0a8a76ef088d3914220ba826acf3f6 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 20 Dec 2019 09:51:31 +0100 Subject: [PATCH 2/3] Fix RNG finalization. --- src/rand/random.jl | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/rand/random.jl b/src/rand/random.jl index daf13898..2dcf256c 100644 --- a/src/rand/random.jl +++ b/src/rand/random.jl @@ -14,19 +14,27 @@ export rand_logn!, rand_poisson! 
mutable struct RNG <: Random.AbstractRNG - ptr::curandGenerator_t + handle::curandGenerator_t + ctx::CuContext typ::Int function RNG(typ=CURAND_RNG_PSEUDO_DEFAULT) - ptr = Ref{curandGenerator_t}() - @allocates curandCreateGenerator(ptr, typ) - obj = new(ptr[], typ) - finalizer(curandDestroyGenerator, obj) + handle_ref = Ref{curandGenerator_t}() + @allocates curandCreateGenerator(handle_ref, typ) + + obj = new(handle_ref[], CuCurrentContext(), typ) + finalizer(unsafe_destroy!, obj) return obj end end -Base.unsafe_convert(::Type{curandGenerator_t}, rng::RNG) = rng.ptr +function unsafe_destroy!(rng::RNG) + if CUDAdrv.isvalid(rng.ctx) + curandDestroyGenerator(rng) + end +end + +Base.unsafe_convert(::Type{curandGenerator_t}, rng::RNG) = rng.handle ## seeding From 8635a2faa7050b81de13b44fe6081876a896cf9d Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 20 Dec 2019 11:57:17 +0100 Subject: [PATCH 3/3] Adapt to initialization API changes. --- Manifest.toml | 4 ++-- src/blas/CUBLAS.jl | 17 ++++++++--------- src/blas/error.jl | 2 +- src/dnn/CUDNN.jl | 8 ++++---- src/dnn/error.jl | 2 +- src/fft/error.jl | 2 +- src/memory.jl | 4 +--- src/rand/CURAND.jl | 6 +++--- src/rand/error.jl | 2 +- src/solver/CUSOLVER.jl | 15 +++++++-------- src/solver/error.jl | 2 +- src/sparse/CUSPARSE.jl | 8 ++++---- src/sparse/error.jl | 2 +- src/tensor/CUTENSOR.jl | 6 +++--- src/tensor/error.jl | 2 +- 15 files changed, 39 insertions(+), 43 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index 2d2f7153..f9626fee 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -30,7 +30,7 @@ version = "2.0.0" [[CUDAdrv]] deps = ["CEnum", "CUDAapi", "Printf"] -git-tree-sha1 = "5a9dd9ec20a5a2c78c784c31361cf5a813c7a9c2" +git-tree-sha1 = "5c2cf00a78503e1f71409cecf3d64508fb33f17f" repo-rev = "master" repo-url = "https://github.com/JuliaGPU/CUDAdrv.jl.git" uuid = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde" @@ -38,7 +38,7 @@ version = "4.0.4" [[CUDAnative]] deps = ["Adapt", "CEnum", "CUDAapi", "CUDAdrv", 
"DataStructures", "InteractiveUtils", "LLVM", "Libdl", "Printf", "TimerOutputs"] -git-tree-sha1 = "1a04a76171016f68f4790e9643524a3ac31f3d32" +git-tree-sha1 = "8b1a585344fee94bdb95ac44653fd057d74e32e6" repo-rev = "master" repo-url = "https://github.com/JuliaGPU/CUDAnative.jl.git" uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17" diff --git a/src/blas/CUBLAS.jl b/src/blas/CUBLAS.jl index e83a2361..2b2fd4b3 100644 --- a/src/blas/CUBLAS.jl +++ b/src/blas/CUBLAS.jl @@ -35,13 +35,13 @@ const active_xt_handles = Vector{Union{Nothing,cublasXtHandle_t}}() function handle() tid = Threads.threadid() if @inbounds active_handles[tid] === nothing - context = CuGetContext() - active_handles[tid] = get!(created_handles, context) do + ctx = context() + active_handles[tid] = get!(created_handles, ctx) do handle = cublasCreate_v2() - atexit(()->CUDAdrv.isvalid(context) && cublasDestroy_v2(handle)) + atexit(()->CUDAdrv.isvalid(ctx) && cublasDestroy_v2(handle)) # enable tensor math mode if our device supports it, and fast math is enabled - dev = CUDAdrv.device(context) + dev = CUDAdrv.device() if Base.JLOptions().fast_math == 1 && CUDAdrv.capability(dev) >= v"7.0" && version() >= v"9" cublasSetMathMode(CUBLAS_TENSOR_OP_MATH, handle) end @@ -55,11 +55,10 @@ end function xt_handle() tid = Threads.threadid() if @inbounds active_xt_handles[tid] === nothing - CUDAnative.maybe_initialize("cublasXtGetHandle") - context = CuCurrentContext() - active_xt_handles[tid] = get!(created_xt_handles, context) do + ctx = context() + active_xt_handles[tid] = get!(created_xt_handles, ctx) do handle = cublasXtCreate() - atexit(()->CUDAdrv.isvalid(context) && cublasXtDestroy(handle)) + atexit(()->CUDAdrv.isvalid(ctx) && cublasXtDestroy(handle)) # select the devices # TODO: this is weird, since we typically use a single device per thread/context @@ -79,7 +78,7 @@ function __init__() resize!(active_xt_handles, Threads.nthreads()) fill!(active_xt_handles, nothing) - CUDAnative.atcontextswitch() do tid, ctx, dev + 
CUDAnative.atcontextswitch() do tid, ctx # we don't eagerly initialize handles, but do so lazily when requested active_handles[tid] = nothing active_xt_handles[tid] = nothing diff --git a/src/blas/error.jl b/src/blas/error.jl index 68ea81cd..8fd091de 100644 --- a/src/blas/error.jl +++ b/src/blas/error.jl @@ -55,7 +55,7 @@ end macro check(ex) fun = Symbol(decode_ccall_function(ex)) init = if !in(fun, preinit_apicalls) - :(CUDAnative.maybe_initialize($(QuoteNode(fun)))) + :(CUDAnative.maybe_initialize()) end quote $init diff --git a/src/dnn/CUDNN.jl b/src/dnn/CUDNN.jl index 0d5996d8..4e933e92 100644 --- a/src/dnn/CUDNN.jl +++ b/src/dnn/CUDNN.jl @@ -47,10 +47,10 @@ const active_handles = Vector{Union{Nothing,cudnnHandle_t}}() function handle() tid = Threads.threadid() if @inbounds active_handles[tid] === nothing - context = CuGetContext() - active_handles[tid] = get!(created_handles, context) do + ctx = context() + active_handles[tid] = get!(created_handles, ctx) do handle = cudnnCreate() - atexit(()->CUDAdrv.isvalid(context) && cudnnDestroy(handle)) + atexit(()->CUDAdrv.isvalid(ctx) && cudnnDestroy(handle)) handle end end @@ -61,7 +61,7 @@ function __init__() resize!(active_handles, Threads.nthreads()) fill!(active_handles, nothing) - CUDAnative.atcontextswitch() do tid, ctx, dev + CUDAnative.atcontextswitch() do tid, ctx # we don't eagerly initialize handles, but do so lazily when requested active_handles[tid] = nothing end diff --git a/src/dnn/error.jl b/src/dnn/error.jl index 1d716580..9527181a 100644 --- a/src/dnn/error.jl +++ b/src/dnn/error.jl @@ -30,7 +30,7 @@ end macro check(ex) fun = Symbol(decode_ccall_function(ex)) init = if !in(fun, preinit_apicalls) - :(CUDAnative.maybe_initialize($(QuoteNode(fun)))) + :(CUDAnative.maybe_initialize()) end quote $init diff --git a/src/fft/error.jl b/src/fft/error.jl index 792ecb1f..4dc4ff31 100644 --- a/src/fft/error.jl +++ b/src/fft/error.jl @@ -68,7 +68,7 @@ end macro check(ex) fun = Symbol(decode_ccall_function(ex)) 
init = if !in(fun, preinit_apicalls) - :(CUDAnative.maybe_initialize($(QuoteNode(fun)))) + :(CUDAnative.maybe_initialize()) end quote $init diff --git a/src/memory.jl b/src/memory.jl index 3ad27678..a8573956 100644 --- a/src/memory.jl +++ b/src/memory.jl @@ -280,9 +280,7 @@ synchronized right before and after executing `ex` to exclude any external effec macro time(ex) quote # @time might surround an application, so be sure to initialize CUDA before that - # FIXME: this should be done in CUDAdrv (`synchronize(ctx=CuCurrentOrNewContext()`) - # but the CUDA initialization mechanics are part of CUDAnative.jl - CUDAnative.maybe_initialize("@time") + CUDAnative.maybe_initialize() # coarse synchronization to exclude effects from previously-executed code CUDAdrv.synchronize() diff --git a/src/rand/CURAND.jl b/src/rand/CURAND.jl index 7857d137..6c3fec4a 100644 --- a/src/rand/CURAND.jl +++ b/src/rand/CURAND.jl @@ -30,8 +30,8 @@ const active_generators = Vector{Union{Nothing,RNG}}() function generator() tid = Threads.threadid() if @inbounds active_generators[tid] === nothing - context = CuGetContext() - active_generators[tid] = get!(created_generators, context) do + ctx = context() + active_generators[tid] = get!(created_generators, ctx) do RNG() end end @@ -42,7 +42,7 @@ function __init__() resize!(active_generators, Threads.nthreads()) fill!(active_generators, nothing) - CUDAnative.atcontextswitch() do tid, ctx, dev + CUDAnative.atcontextswitch() do tid, ctx # we don't eagerly initialize handles, but do so lazily when requested active_generators[tid] = nothing end diff --git a/src/rand/error.jl b/src/rand/error.jl index f6ce0a6f..a43af9d5 100644 --- a/src/rand/error.jl +++ b/src/rand/error.jl @@ -60,7 +60,7 @@ end macro check(ex) fun = Symbol(decode_ccall_function(ex)) init = if !in(fun, preinit_apicalls) - :(CUDAnative.maybe_initialize($(QuoteNode(fun)))) + :(CUDAnative.maybe_initialize()) end quote $init diff --git a/src/solver/CUSOLVER.jl b/src/solver/CUSOLVER.jl index 
7cd940ef..e0e27a0e 100644 --- a/src/solver/CUSOLVER.jl +++ b/src/solver/CUSOLVER.jl @@ -37,10 +37,10 @@ const active_sparse_handles = Vector{Union{Nothing,cusolverSpHandle_t}}() function dense_handle() tid = Threads.threadid() if @inbounds active_dense_handles[tid] === nothing - context = CuGetContext() - active_dense_handles[tid] = get!(created_dense_handles, context) do + ctx = context() + active_dense_handles[tid] = get!(created_dense_handles, ctx) do handle = cusolverDnCreate() - atexit(()->CUDAdrv.isvalid(context) && cusolverDnDestroy(handle)) + atexit(()->CUDAdrv.isvalid(ctx) && cusolverDnDestroy(handle)) handle end end @@ -50,11 +50,10 @@ end function sparse_handle() tid = Threads.threadid() if @inbounds active_sparse_handles[tid] === nothing - CUDAnative.maybe_initialize("cublasXtGetHandle") - context = CuCurrentContext() - active_sparse_handles[tid] = get!(created_sparse_handles, context) do + ctx = context() + active_sparse_handles[tid] = get!(created_sparse_handles, ctx) do handle = cusolverSpCreate() - atexit(()->CUDAdrv.isvalid(context) && cusolverSpDestroy(handle)) + atexit(()->CUDAdrv.isvalid(ctx) && cusolverSpDestroy(handle)) handle end end @@ -68,7 +67,7 @@ function __init__() resize!(active_sparse_handles, Threads.nthreads()) fill!(active_sparse_handles, nothing) - CUDAnative.atcontextswitch() do tid, ctx, dev + CUDAnative.atcontextswitch() do tid, ctx # we don't eagerly initialize handles, but do so lazily when requested active_dense_handles[tid] = nothing active_sparse_handles[tid] = nothing diff --git a/src/solver/error.jl b/src/solver/error.jl index ac96fc53..35937d6f 100644 --- a/src/solver/error.jl +++ b/src/solver/error.jl @@ -50,7 +50,7 @@ end macro check(ex) fun = Symbol(decode_ccall_function(ex)) init = if !in(fun, preinit_apicalls) - :(CUDAnative.maybe_initialize($(QuoteNode(fun)))) + :(CUDAnative.maybe_initialize()) end quote $init diff --git a/src/sparse/CUSPARSE.jl b/src/sparse/CUSPARSE.jl index 4575a058..48fb74df 100644 --- 
a/src/sparse/CUSPARSE.jl +++ b/src/sparse/CUSPARSE.jl @@ -35,10 +35,10 @@ const active_handles = Vector{Union{Nothing,cusparseHandle_t}}() function handle() tid = Threads.threadid() if @inbounds active_handles[tid] === nothing - context = CuGetContext() - active_handles[tid] = get!(created_handles, context) do + ctx = context() + active_handles[tid] = get!(created_handles, ctx) do handle = cusparseCreate() - atexit(()->CUDAdrv.isvalid(context) && cusparseDestroy(handle)) + atexit(()->CUDAdrv.isvalid(ctx) && cusparseDestroy(handle)) handle end end @@ -49,7 +49,7 @@ function __init__() resize!(active_handles, Threads.nthreads()) fill!(active_handles, nothing) - CUDAnative.atcontextswitch() do tid, ctx, dev + CUDAnative.atcontextswitch() do tid, ctx # we don't eagerly initialize handles, but do so lazily when requested active_handles[tid] = nothing end diff --git a/src/sparse/error.jl b/src/sparse/error.jl index 01464b45..25c6d9ea 100644 --- a/src/sparse/error.jl +++ b/src/sparse/error.jl @@ -60,7 +60,7 @@ end macro check(ex) fun = Symbol(decode_ccall_function(ex)) init = if !in(fun, preinit_apicalls) - :(CUDAnative.maybe_initialize($(QuoteNode(fun)))) + :(CUDAnative.maybe_initialize()) end quote $init diff --git a/src/tensor/CUTENSOR.jl b/src/tensor/CUTENSOR.jl index 44a2c1f1..574d1881 100644 --- a/src/tensor/CUTENSOR.jl +++ b/src/tensor/CUTENSOR.jl @@ -34,8 +34,8 @@ const active_handles = Vector{Union{Nothing,Ref{cutensorHandle_t}}}() function handle() tid = Threads.threadid() if @inbounds active_handles[tid] === nothing - context = CuGetContext() - active_handles[tid] = get!(created_handles, context) do + ctx = context() + active_handles[tid] = get!(created_handles, ctx) do handle = Ref{cutensorHandle_t}() cutensorInit(handle) handle @@ -48,7 +48,7 @@ function __init__() resize!(active_handles, Threads.nthreads()) fill!(active_handles, nothing) - CUDAnative.atcontextswitch() do tid, ctx, dev + CUDAnative.atcontextswitch() do tid, ctx # we don't eagerly initialize 
handles, but do so lazily when requested active_handles[tid] = nothing end diff --git a/src/tensor/error.jl b/src/tensor/error.jl index e433780b..7108f976 100644 --- a/src/tensor/error.jl +++ b/src/tensor/error.jl @@ -63,7 +63,7 @@ end macro check(ex) fun = Symbol(decode_ccall_function(ex)) init = if !in(fun, preinit_apicalls) - :(CUDAnative.maybe_initialize($(QuoteNode(fun)))) + :(CUDAnative.maybe_initialize()) end quote $init