diff --git a/include/clad/Differentiator/BuiltinDerivatives.h b/include/clad/Differentiator/BuiltinDerivatives.h index 557274a56..3467486cf 100644 --- a/include/clad/Differentiator/BuiltinDerivatives.h +++ b/include/clad/Differentiator/BuiltinDerivatives.h @@ -82,6 +82,55 @@ ValueAndPushforward cudaDeviceSynchronize_pushforward() __attribute__((host)) { return {cudaDeviceSynchronize(), 0}; } + +template +__global__ void atomicAdd_kernel(T* destPtr, T* srcPtr, size_t N) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; + i += blockDim.x * gridDim.x) + atomicAdd(&destPtr[i], srcPtr[i]); +} + +template +void cudaMemcpy_pullback(T* destPtr, T* srcPtr, size_t count, + cudaMemcpyKind kind, T* d_destPtr, T* d_srcPtr, + size_t* d_count, cudaMemcpyKind* d_kind) + __attribute__((host)) { + T* aux_destPtr = nullptr; + if (kind == cudaMemcpyDeviceToHost) { + *d_kind = cudaMemcpyHostToDevice; + cudaMalloc(&aux_destPtr, count); + } else if (kind == cudaMemcpyHostToDevice) { + *d_kind = cudaMemcpyDeviceToHost; + aux_destPtr = (T*)malloc(count); + } + cudaDeviceSynchronize(); // needed in case user uses another stream for + // kernel execution besides the default one + cudaMemcpy(aux_destPtr, d_destPtr, count, *d_kind); + size_t N = count / sizeof(T); + if (kind == cudaMemcpyDeviceToHost) { + // d_kind is host to device, so d_srcPtr is a device pointer + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, 0); + size_t maxThreads = deviceProp.maxThreadsPerBlock; + size_t maxBlocks = deviceProp.maxGridSize[0]; + + size_t numThreads = std::min(maxThreads, N); + size_t numBlocks = std::min(maxBlocks, (N + numThreads - 1) / numThreads); + custom_derivatives::atomicAdd_kernel<<>>( + d_srcPtr, aux_destPtr, N); + cudaDeviceSynchronize(); // needed in case the user uses another stream for + // kernel execution besides the default one, so we + // need to make sure the data are updated before + // continuing with the rest of the code + cudaFree(aux_destPtr); + } else if (kind == cudaMemcpyHostToDevice) { + // d_kind is device to host, so d_srcPtr is a host pointer + for (size_t i = 0; i < N; i++) + d_srcPtr[i] += aux_destPtr[i]; + free(aux_destPtr); + } +} + #endif CUDA_HOST_DEVICE inline ValueAndPushforward diff --git a/include/clad/Differentiator/DerivativeBuilder.h b/include/clad/Differentiator/DerivativeBuilder.h index 9ac7165bf..5e9d54ac2 100644 --- a/include/clad/Differentiator/DerivativeBuilder.h +++ b/include/clad/Differentiator/DerivativeBuilder.h @@ -118,7 +118,8 @@ namespace clad { clang::Expr* BuildCallToCustomDerivativeOrNumericalDiff( const std::string& Name, llvm::SmallVectorImpl& CallArgs, clang::Scope* S, clang::DeclContext* originalFnDC, - bool forCustomDerv = true, bool namespaceShouldExist = true); + bool forCustomDerv = true, bool namespaceShouldExist = true, + clang::Expr* CUDAExecConfig = nullptr); bool noOverloadExists(clang::Expr* UnresolvedLookup, llvm::MutableArrayRef ARargs); /// Shorthand to issues a warning or error. diff --git a/include/clad/Differentiator/ReverseModeVisitor.h b/include/clad/Differentiator/ReverseModeVisitor.h index d65871ec4..c9fd1296f 100644 --- a/include/clad/Differentiator/ReverseModeVisitor.h +++ b/include/clad/Differentiator/ReverseModeVisitor.h @@ -354,7 +354,8 @@ namespace clad { clang::Expr* dfdx, llvm::SmallVectorImpl& PreCallStmts, llvm::SmallVectorImpl& PostCallStmts, llvm::SmallVectorImpl& args, - llvm::SmallVectorImpl& outputArgs); + llvm::SmallVectorImpl& outputArgs, + clang::Expr* CUDAExecConfig = nullptr); public: ReverseModeVisitor(DerivativeBuilder& builder, const DiffRequest& request); diff --git a/include/clad/Differentiator/VisitorBase.h b/include/clad/Differentiator/VisitorBase.h index dba1540a2..210f82112 100644 --- a/include/clad/Differentiator/VisitorBase.h +++ b/include/clad/Differentiator/VisitorBase.h @@ -603,7 +603,8 @@ namespace clad { /// \returns The derivative function call. clang::Expr* GetSingleArgCentralDiffCall( clang::Expr* targetFuncCall, clang::Expr* targetArg, unsigned targetPos, - unsigned numArgs, llvm::SmallVectorImpl& args); + unsigned numArgs, llvm::SmallVectorImpl& args, + clang::Expr* CUDAExecConfig = nullptr); /// Emits diagnostic messages on differentiation (or lack thereof) for /// call expressions. diff --git a/lib/Differentiator/CladUtils.cpp b/lib/Differentiator/CladUtils.cpp index 39e98f12f..350eeea07 100644 --- a/lib/Differentiator/CladUtils.cpp +++ b/lib/Differentiator/CladUtils.cpp @@ -679,7 +679,8 @@ namespace clad { } bool IsMemoryFunction(const clang::FunctionDecl* FD) { - + if (FD->getNameAsString() == "cudaMalloc") + return true; #if CLANG_VERSION_MAJOR > 12 if (FD->getBuiltinID() == Builtin::BImalloc) return true; @@ -703,6 +704,8 @@ namespace clad { } bool IsMemoryDeallocationFunction(const clang::FunctionDecl* FD) { + if (FD->getNameAsString() == "cudaFree") + return true; #if CLANG_VERSION_MAJOR > 12 return FD->getBuiltinID() == Builtin::ID::BIfree; #else diff --git a/lib/Differentiator/DerivativeBuilder.cpp b/lib/Differentiator/DerivativeBuilder.cpp index e4ba3f99a..ada7153c6 100644 --- a/lib/Differentiator/DerivativeBuilder.cpp +++ b/lib/Differentiator/DerivativeBuilder.cpp @@ -246,7 +246,8 @@ static void registerDerivative(FunctionDecl* derivedFD, Sema& semaRef) { Expr* DerivativeBuilder::BuildCallToCustomDerivativeOrNumericalDiff( const std::string& Name, llvm::SmallVectorImpl& CallArgs, clang::Scope* S, clang::DeclContext* originalFnDC, - bool forCustomDerv /*=true*/, bool namespaceShouldExist /*=true*/) { + bool forCustomDerv /*=true*/, bool namespaceShouldExist /*=true*/, + Expr* CUDAExecConfig /*=nullptr*/) { CXXScopeSpec SS; LookupResult R = LookupCustomDerivativeOrNumericalDiff( Name, originalFnDC, SS, forCustomDerv, namespaceShouldExist); @@ -265,8 +266,10 @@ static void registerDerivative(FunctionDecl* derivedFD, Sema& semaRef) { if (noOverloadExists(UnresolvedLookup, MARargs)) return nullptr; - OverloadedFn = - m_Sema.ActOnCallExpr(S, UnresolvedLookup, Loc, MARargs, Loc).get(); + OverloadedFn = m_Sema + .ActOnCallExpr(S, UnresolvedLookup, Loc, MARargs, Loc, + CUDAExecConfig) + .get(); // Add the custom derivative to the set of derivatives. // This is required in case the definition of the custom derivative diff --git a/lib/Differentiator/ReverseModeVisitor.cpp b/lib/Differentiator/ReverseModeVisitor.cpp index 8995a2a67..04f626286 100644 --- a/lib/Differentiator/ReverseModeVisitor.cpp +++ b/lib/Differentiator/ReverseModeVisitor.cpp @@ -471,8 +471,9 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, if (m_ExternalSource) m_ExternalSource->ActAfterCreatingDerivedFnParams(params); - // if the function is a global kernel, all its parameters reside in the - // global memory of the GPU + // if the function is a global kernel, all the adjoint parameters reside in + // the global memory of the GPU. To facilitate the process, all the params + // of the kernel are added to the set. if (m_DiffReq->hasAttr()) for (auto* param : params) m_CUDAGlobalArgs.emplace(param); @@ -631,7 +632,12 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, if (!m_DiffReq.CUDAGlobalArgsIndexes.empty()) for (auto index : m_DiffReq.CUDAGlobalArgsIndexes) m_CUDAGlobalArgs.emplace(m_Derivative->getParamDecl(index)); - + // if the function is a global kernel, all the adjoint parameters reside in + // the global memory of the GPU. To facilitate the process, all the params + // of the kernel are added to the set. + else if (m_DiffReq->hasAttr()) + for (auto* param : params) + m_CUDAGlobalArgs.emplace(param); m_Derivative->setBody(nullptr); if (!m_DiffReq.DeclarationOnly) { @@ -1667,6 +1673,10 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, return StmtDiff(Clone(CE)); } + Expr* CUDAExecConfig = nullptr; + if (const auto* KCE = dyn_cast(CE)) + CUDAExecConfig = Clone(KCE->getConfig()); + // If the function is non_differentiable, return zero derivative. if (clad::utils::hasNonDifferentiableAttribute(CE)) { // Calling the function without computing derivatives @@ -1675,10 +1685,11 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, ClonedArgs.push_back(Clone(CE->getArg(i))); SourceLocation validLoc = clad::utils::GetValidSLoc(m_Sema); - Expr* Call = m_Sema - .ActOnCallExpr(getCurrentScope(), Clone(CE->getCallee()), - validLoc, ClonedArgs, validLoc) - .get(); + Expr* Call = + m_Sema + .ActOnCallExpr(getCurrentScope(), Clone(CE->getCallee()), + validLoc, ClonedArgs, validLoc, CUDAExecConfig) + .get(); // Creating a zero derivative auto* zero = ConstantFolder::synthesizeLiteral(m_Context.IntTy, m_Context, /*val=*/0); @@ -1825,7 +1836,8 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, Expr* call = m_Sema .ActOnCallExpr(getCurrentScope(), Clone(CE->getCallee()), Loc, - llvm::MutableArrayRef(CallArgs), Loc) + llvm::MutableArrayRef(CallArgs), Loc, + CUDAExecConfig) .get(); return call; } @@ -1940,7 +1952,9 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, OverloadedDerivedFn = m_Builder.BuildCallToCustomDerivativeOrNumericalDiff( customPushforward, pushforwardCallArgs, getCurrentScope(), - const_cast(FD->getDeclContext())); + const_cast(FD->getDeclContext()), + /*forCustomDerv=*/true, /*namespaceShouldExist=*/true, + CUDAExecConfig); if (OverloadedDerivedFn) asGrad = false; } @@ -2041,7 +2055,9 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, OverloadedDerivedFn = m_Builder.BuildCallToCustomDerivativeOrNumericalDiff( customPullback, pullbackCallArgs, getCurrentScope(), - const_cast(FD->getDeclContext())); + const_cast(FD->getDeclContext()), + /*forCustomDerv=*/true, /*namespaceShouldExist=*/true, + CUDAExecConfig); if (baseDiff.getExpr()) pullbackCallArgs.erase(pullbackCallArgs.begin()); } @@ -2057,10 +2073,11 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, CXXScopeSpec(), m_Derivative->getNameInfo(), m_Derivative) .get(); - OverloadedDerivedFn = m_Sema - .ActOnCallExpr(getCurrentScope(), selfRef, - Loc, pullbackCallArgs, Loc) - .get(); + OverloadedDerivedFn = + m_Sema + .ActOnCallExpr(getCurrentScope(), selfRef, Loc, + pullbackCallArgs, Loc, CUDAExecConfig) + .get(); } else { if (m_ExternalSource) m_ExternalSource->ActBeforeDifferentiatingCallExpr( @@ -2112,14 +2129,14 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, OverloadedDerivedFn = GetSingleArgCentralDiffCall( Clone(CE->getCallee()), DerivedCallArgs[0], /*targetPos=*/0, - /*numArgs=*/1, DerivedCallArgs); + /*numArgs=*/1, DerivedCallArgs, CUDAExecConfig); asGrad = !OverloadedDerivedFn; } else { auto CEType = getNonConstType(CE->getType(), m_Context, m_Sema); OverloadedDerivedFn = GetMultiArgCentralDiffCall( Clone(CE->getCallee()), CEType.getCanonicalType(), CE->getNumArgs(), dfdx(), PreCallStmts, PostCallStmts, - DerivedCallArgs, CallArgDx); + DerivedCallArgs, CallArgDx, CUDAExecConfig); } CallExprDiffDiagnostics(FD, CE->getBeginLoc()); if (!OverloadedDerivedFn) { @@ -2137,7 +2154,7 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, OverloadedDerivedFn = m_Sema .ActOnCallExpr(getCurrentScope(), BuildDeclRef(pullbackFD), - Loc, pullbackCallArgs, Loc) + Loc, pullbackCallArgs, Loc, CUDAExecConfig) .get(); } } @@ -2250,7 +2267,7 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, call = m_Sema .ActOnCallExpr(getCurrentScope(), BuildDeclRef(calleeFnForwPassFD), Loc, - CallArgs, Loc) + CallArgs, Loc, CUDAExecConfig) .get(); } auto* callRes = StoreAndRef(call); @@ -2285,7 +2302,7 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, call = m_Sema .ActOnCallExpr(getCurrentScope(), Clone(CE->getCallee()), Loc, - CallArgs, Loc) + CallArgs, Loc, CUDAExecConfig) .get(); return StmtDiff(call); } @@ -2295,7 +2312,8 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, llvm::SmallVectorImpl& PreCallStmts, llvm::SmallVectorImpl& PostCallStmts, llvm::SmallVectorImpl& args, - llvm::SmallVectorImpl& outputArgs) { + llvm::SmallVectorImpl& outputArgs, + Expr* CUDAExecConfig /*=nullptr*/) { int printErrorInf = m_Builder.shouldPrintNumDiffErrs(); llvm::SmallVector NumDiffArgs = {}; NumDiffArgs.push_back(targetFuncCall); @@ -2336,7 +2354,7 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, Name, NumDiffArgs, getCurrentScope(), /*OriginalFnDC=*/nullptr, /*forCustomDerv=*/false, - /*namespaceShouldExist=*/false); + /*namespaceShouldExist=*/false, CUDAExecConfig); } StmtDiff ReverseModeVisitor::VisitUnaryOperator(const UnaryOperator* UnOp) { diff --git a/lib/Differentiator/VisitorBase.cpp b/lib/Differentiator/VisitorBase.cpp index 6cd582270..e8fce3628 100644 --- a/lib/Differentiator/VisitorBase.cpp +++ b/lib/Differentiator/VisitorBase.cpp @@ -765,7 +765,8 @@ namespace clad { Expr* VisitorBase::GetSingleArgCentralDiffCall( Expr* targetFuncCall, Expr* targetArg, unsigned targetPos, - unsigned numArgs, llvm::SmallVectorImpl& args) { + unsigned numArgs, llvm::SmallVectorImpl& args, + Expr* CUDAExecConfig /*=nullptr*/) { QualType argType = targetArg->getType(); int printErrorInf = m_Builder.shouldPrintNumDiffErrs(); bool isSupported = argType->isArithmeticType(); @@ -788,7 +789,7 @@ namespace clad { Name, NumDiffArgs, getCurrentScope(), /*OriginalFnDC=*/nullptr, /*forCustomDerv=*/false, - /*namespaceShouldExist=*/false); + /*namespaceShouldExist=*/false, CUDAExecConfig); } void VisitorBase::CallExprDiffDiagnostics(const clang::FunctionDecl* FD, diff --git a/test/CUDA/GradientKernels.cu b/test/CUDA/GradientKernels.cu index a60604fa6..92f69d92c 100644 --- a/test/CUDA/GradientKernels.cu +++ b/test/CUDA/GradientKernels.cu @@ -412,6 +412,96 @@ __global__ void kernel_with_nested_device_call(double *out, double *in, double v //CHECK-NEXT: } //CHECK-NEXT:} +__global__ void kernel_call(double *a, double *b) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + a[index] = *b; +} + +void fn(double *out, double *in) { + kernel_call<<<1, 10>>>(out, in); +} + +// CHECK: void fn_grad(double *out, double *in, double *_d_out, double *_d_in) { +//CHECK-NEXT: kernel_call<<<1, 10>>>(out, in); +//CHECK-NEXT: kernel_call_pullback<<<1, 10>>>(out, in, _d_out, _d_in); +//CHECK-NEXT: } + +double fn_memory(double *out, double *in) { + double *in_dev = nullptr; + cudaMalloc(&in_dev, 10 * sizeof(double)); + cudaMemcpy(in_dev, in, 10 * sizeof(double), cudaMemcpyHostToDevice); + kernel_call<<<1, 10>>>(out, in_dev); + cudaDeviceSynchronize(); + double *out_host = (double *)malloc(10 * sizeof(double)); + cudaMemcpy(out_host, out, 10 * sizeof(double), cudaMemcpyDeviceToHost); + double res = 0; + for (int i=0; i < 10; ++i) { + res += out_host[i]; + } + free(out_host); + cudaFree(out); + cudaFree(in_dev); + return res; +} + +// CHECK: void fn_memory_grad(double *out, double *in, double *_d_out, double *_d_in) { +//CHECK-NEXT: int _d_i = 0; +//CHECK-NEXT: int i = 0; +//CHECK-NEXT: clad::tape _t1 = {}; +//CHECK-NEXT: double *_d_in_dev = nullptr; +//CHECK-NEXT: double *in_dev = nullptr; +//CHECK-NEXT: cudaMalloc(&_d_in_dev, 10 * sizeof(double)); +//CHECK-NEXT: cudaMalloc(&in_dev, 10 * sizeof(double)); +//CHECK-NEXT: cudaMemcpy(in_dev, in, 10 * sizeof(double), cudaMemcpyHostToDevice); +//CHECK-NEXT: kernel_call<<<1, 10>>>(out, in_dev); +//CHECK-NEXT: cudaDeviceSynchronize(); +//CHECK-NEXT: double *_d_out_host = (double *)malloc(10 * sizeof(double)); +//CHECK-NEXT: memset(_d_out_host, 0, 10 * sizeof(double)); +//CHECK-NEXT: double *out_host = (double *)malloc(10 * sizeof(double)); +//CHECK-NEXT: cudaMemcpy(out_host, out, 10 * sizeof(double), cudaMemcpyDeviceToHost); +//CHECK-NEXT: double _d_res = 0.; +//CHECK-NEXT: double res = 0; +//CHECK-NEXT: unsigned long _t0 = 0UL; +//CHECK-NEXT: for (i = 0; ; ++i) { +//CHECK-NEXT: { +//CHECK-NEXT: if (!(i < 10)) +//CHECK-NEXT: break; +//CHECK-NEXT: } +//CHECK-NEXT: _t0++; +//CHECK-NEXT: clad::push(_t1, res); +//CHECK-NEXT: res += out_host[i]; +//CHECK-NEXT: } +//CHECK-NEXT: _d_res += 1; +//CHECK-NEXT: for (;; _t0--) { +//CHECK-NEXT: { +//CHECK-NEXT: if (!_t0) +//CHECK-NEXT: break; +//CHECK-NEXT: } +//CHECK-NEXT: --i; +//CHECK-NEXT: { +//CHECK-NEXT: res = clad::pop(_t1); +//CHECK-NEXT: double _r_d0 = _d_res; +//CHECK-NEXT: _d_out_host[i] += _r_d0; +//CHECK-NEXT: } +//CHECK-NEXT: } +//CHECK-NEXT: { +//CHECK-NEXT: unsigned long _r2 = 0UL; +//CHECK-NEXT: cudaMemcpyKind _r3 = static_cast(0U); +//CHECK-NEXT: clad::custom_derivatives::cudaMemcpy_pullback(out_host, out, 10 * sizeof(double), cudaMemcpyDeviceToHost, _d_out_host, _d_out, &_r2, &_r3); +//CHECK-NEXT: } +//CHECK-NEXT: kernel_call_pullback<<<1, 10>>>(out, in_dev, _d_out, _d_in_dev); +//CHECK-NEXT: { +//CHECK-NEXT: unsigned long _r0 = 0UL; +//CHECK-NEXT: cudaMemcpyKind _r1 = static_cast(0U); +//CHECK-NEXT: clad::custom_derivatives::cudaMemcpy_pullback(in_dev, in, 10 * sizeof(double), cudaMemcpyHostToDevice, _d_in_dev, _d_in, &_r0, &_r1); +//CHECK-NEXT: } +//CHECK-NEXT: free(out_host); +//CHECK-NEXT: free(_d_out_host); +//CHECK-NEXT: cudaFree(out); +//CHECK-NEXT: cudaFree(in_dev); +//CHECK-NEXT: cudaFree(_d_in_dev); +//CHECK-NEXT:} + // CHECK: __attribute__((device)) void device_fn_pullback_1(double in, double val, double _d_y, double *_d_in, double *_d_val) { //CHECK-NEXT: { //CHECK-NEXT: *_d_in += _d_y; @@ -489,7 +579,6 @@ __global__ void kernel_with_nested_device_call(double *out, double *in, double v else { \ test.execute_kernel(grid, block, x, dx); \ } \ - cudaDeviceSynchronize(); \ int *res = (int*)malloc(N * sizeof(int)); \ cudaMemcpy(res, dx, N * sizeof(int), cudaMemcpyDeviceToHost); \ for (int i = 0; i < (N - 1); i++) { \ @@ -527,7 +616,6 @@ __global__ void kernel_with_nested_device_call(double *out, double *in, double v } \ int *res = (int*)malloc(N * sizeof(int)); \ cudaMemcpy(res, dx, N * sizeof(int), cudaMemcpyDeviceToHost); \ - cudaDeviceSynchronize(); \ for (int i = 0; i < (N - 1); i++) { \ printf("%d, ", res[i]); \ } \ @@ -562,7 +650,6 @@ __global__ void kernel_with_nested_device_call(double *out, double *in, double v } \ int *res = (int*)malloc(N * sizeof(int)); \ cudaMemcpy(res, dx, N * sizeof(int), cudaMemcpyDeviceToHost); \ - cudaDeviceSynchronize(); \ for (int i = 0; i < (N - 1); i++) { \ printf("%d, ", res[i]); \ } \ @@ -597,7 +684,6 @@ __global__ void kernel_with_nested_device_call(double *out, double *in, double v } \ double *res = (double*)malloc(N * sizeof(double)); \ cudaMemcpy(res, dx, N * sizeof(double), cudaMemcpyDeviceToHost); \ - cudaDeviceSynchronize(); \ for (int i = 0; i < (N - 1); i++) { \ printf("%0.2f, ", res[i]); \ } \ @@ -609,22 +695,12 @@ __global__ void kernel_with_nested_device_call(double *out, double *in, double v #define INIT(x, y, val, dx, dy, d_val) \ { \ - double *fives = (double*)malloc(10 * sizeof(double)); \ - for(int i = 0; i < 10; i++) { \ - fives[i] = 5; \ - } \ - double *zeros = (double*)malloc(10 * sizeof(double)); \ - for(int i = 0; i < 10; i++) { \ - zeros[i] = 0; \ - } \ cudaMemcpy(x, fives, 10 * sizeof(double), cudaMemcpyHostToDevice); \ cudaMemcpy(y, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice); \ cudaMemcpy(val, fives, sizeof(double), cudaMemcpyHostToDevice); \ cudaMemcpy(dx, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice); \ cudaMemcpy(dy, fives, 10 * sizeof(double), cudaMemcpyHostToDevice); \ cudaMemcpy(d_val, zeros, sizeof(double), cudaMemcpyHostToDevice); \ - free(fives); \ - free(zeros); \ } int main(void) { @@ -644,7 +720,6 @@ int main(void) { cudaFree(a); cudaFree(d_a); - int *dummy_in, *dummy_out, *d_out, *d_in; cudaMalloc(&dummy_in, 10 * sizeof(int)); cudaMalloc(&dummy_out, 10 * sizeof(int)); @@ -671,18 +746,22 @@ int main(void) { TEST_2_D(add_kernel_7, dim3(1), dim3(5, 1, 1), 0, false, "a, b", dummy_out_double, dummy_in_double, d_out_double, d_in_double, 10); // CHECK-EXEC: 50.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 - double *val; + double *val, *d_val; cudaMalloc(&val, sizeof(double)); - double *d_val; cudaMalloc(&d_val, sizeof(double)); + + double *fives = (double*)malloc(10 * sizeof(double)); + double *zeros = (double*)malloc(10 * sizeof(double)); + for(int i = 0; i < 10; i++) { fives[i] = 5; zeros[i] = 0; } INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val); auto test_device = clad::gradient(kernel_with_device_call, "out, val"); test_device.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_val); double *res = (double*)malloc(10 * sizeof(double)); - cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost); - cudaDeviceSynchronize(); + cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost); // no need for synchronization before or after, + // as the cudaMemcpy call is queued after the kernel call + // on the default stream and the cudaMemcpy call is blocking printf("%0.2f\n", *res); // CHECK-EXEC: 50.00 INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val); @@ -690,7 +769,6 @@ int main(void) { auto test_device_2 = clad::gradient(kernel_with_device_call_2, "out, val"); test_device_2.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_val); cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost); - cudaDeviceSynchronize(); printf("%0.2f\n", *res); // CHECK-EXEC: 50.00 INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val); @@ -698,7 +776,6 @@ int main(void) { auto check_dup = clad::gradient(dup_kernel_with_device_call_2, "out, val"); // check that the pullback function is not regenerated check_dup.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_val); cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost); - cudaDeviceSynchronize(); printf("%s\n", cudaGetErrorString(cudaGetLastError())); // CHECK-EXEC: no error printf("%0.2f\n", *res); // CHECK-EXEC: 50.00 @@ -706,7 +783,6 @@ int main(void) { auto test_device_3 = clad::gradient(kernel_with_device_call_2, "out, in"); test_device_3.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_in_double); - cudaDeviceSynchronize(); cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost); printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 5.00, 5.00, 5.00 @@ -714,28 +790,39 @@ int main(void) { auto test_device_4 = clad::gradient(kernel_with_device_call_3); test_device_4.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, val, d_out_double, d_in_double, d_val); - cudaDeviceSynchronize(); cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost); - cudaDeviceSynchronize(); printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 5.00, 5.00, 5.00 cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost); printf("%0.2f\n", *res); // CHECK-EXEC: 50.00 INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val); + auto test_kernel_call = clad::gradient(fn); + test_kernel_call.execute(dummy_out_double, dummy_in_double, d_out_double, d_in_double); + cudaMemcpy(res, d_in_double, sizeof(double), cudaMemcpyDeviceToHost); + printf("%0.2f\n", *res); // CHECK-EXEC: 50.00 + + INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val); + auto nested_device = clad::gradient(kernel_with_nested_device_call, "out, in"); nested_device.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_in_double); - cudaDeviceSynchronize(); cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost); printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 5.00, 5.00, 5.00 + INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val); + + auto test_memory = clad::gradient(fn_memory); + test_memory.execute(dummy_out_double, fives, d_out_double, zeros); + printf("%0.2f, %0.2f, %0.2f\n", zeros[0], zeros[1], zeros[2]); // CHECK-EXEC: 60.00, 0.00, 0.00 + free(res); - cudaFree(dummy_in_double); - cudaFree(dummy_out_double); + free(fives); + free(zeros); cudaFree(d_out_double); cudaFree(d_in_double); cudaFree(val); cudaFree(d_val); + cudaFree(dummy_in_double); return 0; }