From f70b8b57e45670a1f4ae540478f6ce7b101d189d Mon Sep 17 00:00:00 2001 From: Christina Koutsou Date: Tue, 15 Oct 2024 19:27:27 +0000 Subject: [PATCH 01/11] Add support of kernel pullback functions --- .../clad/Differentiator/BuiltinDerivatives.h | 12 ++ .../clad/Differentiator/DerivativeBuilder.h | 3 +- .../clad/Differentiator/ReverseModeVisitor.h | 3 +- include/clad/Differentiator/VisitorBase.h | 3 +- lib/Differentiator/CladUtils.cpp | 5 +- lib/Differentiator/DerivativeBuilder.cpp | 9 +- lib/Differentiator/ReverseModeVisitor.cpp | 57 +++++---- lib/Differentiator/VisitorBase.cpp | 5 +- test/CUDA/GradientKernels.cu | 113 +++++++++++++++--- 9 files changed, 165 insertions(+), 45 deletions(-) diff --git a/include/clad/Differentiator/BuiltinDerivatives.h b/include/clad/Differentiator/BuiltinDerivatives.h index 557274a56..1d3f96aba 100644 --- a/include/clad/Differentiator/BuiltinDerivatives.h +++ b/include/clad/Differentiator/BuiltinDerivatives.h @@ -82,6 +82,18 @@ ValueAndPushforward cudaDeviceSynchronize_pushforward() __attribute__((host)) { return {cudaDeviceSynchronize(), 0}; } + +void cudaMemcpy_pullback(void* destPtr, void* srcPtr, size_t count, + cudaMemcpyKind kind, void* d_destPtr, void* d_srcPtr, + size_t* d_count, cudaMemcpyKind* d_kind) + __attribute__((host)) { + if (kind == cudaMemcpyDeviceToHost) + *d_kind = cudaMemcpyHostToDevice; + else if (kind == cudaMemcpyHostToDevice) + *d_kind = cudaMemcpyDeviceToHost; + cudaMemcpy(d_srcPtr, d_destPtr, count, *d_kind); +} + #endif CUDA_HOST_DEVICE inline ValueAndPushforward diff --git a/include/clad/Differentiator/DerivativeBuilder.h b/include/clad/Differentiator/DerivativeBuilder.h index 9ac7165bf..5e9d54ac2 100644 --- a/include/clad/Differentiator/DerivativeBuilder.h +++ b/include/clad/Differentiator/DerivativeBuilder.h @@ -118,7 +118,8 @@ namespace clad { clang::Expr* BuildCallToCustomDerivativeOrNumericalDiff( const std::string& Name, llvm::SmallVectorImpl& CallArgs, clang::Scope* S, clang::DeclContext* originalFnDC, - bool forCustomDerv = true, bool namespaceShouldExist = true); + bool forCustomDerv = true, bool namespaceShouldExist = true, + clang::Expr* CUDAExecConfig = nullptr); bool noOverloadExists(clang::Expr* UnresolvedLookup, llvm::MutableArrayRef ARargs); /// Shorthand to issues a warning or error. diff --git a/include/clad/Differentiator/ReverseModeVisitor.h b/include/clad/Differentiator/ReverseModeVisitor.h index d65871ec4..c9fd1296f 100644 --- a/include/clad/Differentiator/ReverseModeVisitor.h +++ b/include/clad/Differentiator/ReverseModeVisitor.h @@ -354,7 +354,8 @@ namespace clad { clang::Expr* dfdx, llvm::SmallVectorImpl& PreCallStmts, llvm::SmallVectorImpl& PostCallStmts, llvm::SmallVectorImpl& args, - llvm::SmallVectorImpl& outputArgs); + llvm::SmallVectorImpl& outputArgs, + clang::Expr* CUDAExecConfig = nullptr); public: ReverseModeVisitor(DerivativeBuilder& builder, const DiffRequest& request); diff --git a/include/clad/Differentiator/VisitorBase.h b/include/clad/Differentiator/VisitorBase.h index dba1540a2..210f82112 100644 --- a/include/clad/Differentiator/VisitorBase.h +++ b/include/clad/Differentiator/VisitorBase.h @@ -603,7 +603,8 @@ namespace clad { /// \returns The derivative function call. 
clang::Expr* GetSingleArgCentralDiffCall( clang::Expr* targetFuncCall, clang::Expr* targetArg, unsigned targetPos, - unsigned numArgs, llvm::SmallVectorImpl& args); + unsigned numArgs, llvm::SmallVectorImpl& args, + clang::Expr* CUDAExecConfig = nullptr); /// Emits diagnostic messages on differentiation (or lack thereof) for /// call expressions. diff --git a/lib/Differentiator/CladUtils.cpp b/lib/Differentiator/CladUtils.cpp index 39e98f12f..350eeea07 100644 --- a/lib/Differentiator/CladUtils.cpp +++ b/lib/Differentiator/CladUtils.cpp @@ -679,7 +679,8 @@ namespace clad { } bool IsMemoryFunction(const clang::FunctionDecl* FD) { - + if (FD->getNameAsString() == "cudaMalloc") + return true; #if CLANG_VERSION_MAJOR > 12 if (FD->getBuiltinID() == Builtin::BImalloc) return true; @@ -703,6 +704,8 @@ namespace clad { } bool IsMemoryDeallocationFunction(const clang::FunctionDecl* FD) { + if (FD->getNameAsString() == "cudaFree") + return true; #if CLANG_VERSION_MAJOR > 12 return FD->getBuiltinID() == Builtin::ID::BIfree; #else diff --git a/lib/Differentiator/DerivativeBuilder.cpp b/lib/Differentiator/DerivativeBuilder.cpp index e4ba3f99a..ada7153c6 100644 --- a/lib/Differentiator/DerivativeBuilder.cpp +++ b/lib/Differentiator/DerivativeBuilder.cpp @@ -246,7 +246,8 @@ static void registerDerivative(FunctionDecl* derivedFD, Sema& semaRef) { Expr* DerivativeBuilder::BuildCallToCustomDerivativeOrNumericalDiff( const std::string& Name, llvm::SmallVectorImpl& CallArgs, clang::Scope* S, clang::DeclContext* originalFnDC, - bool forCustomDerv /*=true*/, bool namespaceShouldExist /*=true*/) { + bool forCustomDerv /*=true*/, bool namespaceShouldExist /*=true*/, + Expr* CUDAExecConfig /*=nullptr*/) { CXXScopeSpec SS; LookupResult R = LookupCustomDerivativeOrNumericalDiff( Name, originalFnDC, SS, forCustomDerv, namespaceShouldExist); @@ -265,8 +266,10 @@ static void registerDerivative(FunctionDecl* derivedFD, Sema& semaRef) { if (noOverloadExists(UnresolvedLookup, MARargs)) return nullptr; - OverloadedFn = - m_Sema.ActOnCallExpr(S, UnresolvedLookup, Loc, MARargs, Loc).get(); + OverloadedFn = m_Sema + .ActOnCallExpr(S, UnresolvedLookup, Loc, MARargs, Loc, + CUDAExecConfig) + .get(); // Add the custom derivative to the set of derivatives. // This is required in case the definition of the custom derivative diff --git a/lib/Differentiator/ReverseModeVisitor.cpp b/lib/Differentiator/ReverseModeVisitor.cpp index 8995a2a67..67d4c22bc 100644 --- a/lib/Differentiator/ReverseModeVisitor.cpp +++ b/lib/Differentiator/ReverseModeVisitor.cpp @@ -471,8 +471,9 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, if (m_ExternalSource) m_ExternalSource->ActAfterCreatingDerivedFnParams(params); - // if the function is a global kernel, all its parameters reside in the - // global memory of the GPU + // if the function is a global kernel, all the adjoint parameters reside in + // the global memory of the GPU. To facilitate the process, all the params + // of the kernel are added to the set. 
if (m_DiffReq->hasAttr()) for (auto* param : params) m_CUDAGlobalArgs.emplace(param); @@ -631,7 +632,11 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, if (!m_DiffReq.CUDAGlobalArgsIndexes.empty()) for (auto index : m_DiffReq.CUDAGlobalArgsIndexes) m_CUDAGlobalArgs.emplace(m_Derivative->getParamDecl(index)); - + // If the function is a global kernel, all its parameters reside in the + // global memory of the GPU + else if (m_DiffReq->hasAttr()) + for (auto param : params) + m_CUDAGlobalArgs.emplace(param); m_Derivative->setBody(nullptr); if (!m_DiffReq.DeclarationOnly) { @@ -1667,6 +1672,10 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, return StmtDiff(Clone(CE)); } + Expr* CUDAExecConfig = nullptr; + if (auto KCE = dyn_cast(CE)) + CUDAExecConfig = Clone(KCE->getConfig()); + // If the function is non_differentiable, return zero derivative. if (clad::utils::hasNonDifferentiableAttribute(CE)) { // Calling the function without computing derivatives @@ -1675,10 +1684,11 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, ClonedArgs.push_back(Clone(CE->getArg(i))); SourceLocation validLoc = clad::utils::GetValidSLoc(m_Sema); - Expr* Call = m_Sema - .ActOnCallExpr(getCurrentScope(), Clone(CE->getCallee()), - validLoc, ClonedArgs, validLoc) - .get(); + Expr* Call = + m_Sema + .ActOnCallExpr(getCurrentScope(), Clone(CE->getCallee()), + validLoc, ClonedArgs, validLoc, CUDAExecConfig) + .get(); // Creating a zero derivative auto* zero = ConstantFolder::synthesizeLiteral(m_Context.IntTy, m_Context, /*val=*/0); @@ -1825,7 +1835,8 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, Expr* call = m_Sema .ActOnCallExpr(getCurrentScope(), Clone(CE->getCallee()), Loc, - llvm::MutableArrayRef(CallArgs), Loc) + llvm::MutableArrayRef(CallArgs), Loc, + CUDAExecConfig) .get(); return call; } @@ -1940,7 +1951,8 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, OverloadedDerivedFn = m_Builder.BuildCallToCustomDerivativeOrNumericalDiff( customPushforward, pushforwardCallArgs, getCurrentScope(), - const_cast(FD->getDeclContext())); + const_cast(FD->getDeclContext()), true, true, + CUDAExecConfig); if (OverloadedDerivedFn) asGrad = false; } @@ -2041,7 +2053,8 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, OverloadedDerivedFn = m_Builder.BuildCallToCustomDerivativeOrNumericalDiff( customPullback, pullbackCallArgs, getCurrentScope(), - const_cast(FD->getDeclContext())); + const_cast(FD->getDeclContext()), true, true, + CUDAExecConfig); if (baseDiff.getExpr()) pullbackCallArgs.erase(pullbackCallArgs.begin()); } @@ -2057,10 +2070,11 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, CXXScopeSpec(), m_Derivative->getNameInfo(), m_Derivative) .get(); - OverloadedDerivedFn = m_Sema - .ActOnCallExpr(getCurrentScope(), selfRef, - Loc, pullbackCallArgs, Loc) - .get(); + OverloadedDerivedFn = + m_Sema + .ActOnCallExpr(getCurrentScope(), selfRef, Loc, + pullbackCallArgs, Loc, CUDAExecConfig) + .get(); } else { if (m_ExternalSource) m_ExternalSource->ActBeforeDifferentiatingCallExpr( @@ -2112,14 +2126,14 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, OverloadedDerivedFn = GetSingleArgCentralDiffCall( Clone(CE->getCallee()), DerivedCallArgs[0], /*targetPos=*/0, - /*numArgs=*/1, DerivedCallArgs); + /*numArgs=*/1, DerivedCallArgs, CUDAExecConfig); asGrad = !OverloadedDerivedFn; } else { auto CEType = getNonConstType(CE->getType(), m_Context, m_Sema); OverloadedDerivedFn = 
GetMultiArgCentralDiffCall( Clone(CE->getCallee()), CEType.getCanonicalType(), CE->getNumArgs(), dfdx(), PreCallStmts, PostCallStmts, - DerivedCallArgs, CallArgDx); + DerivedCallArgs, CallArgDx, CUDAExecConfig); } CallExprDiffDiagnostics(FD, CE->getBeginLoc()); if (!OverloadedDerivedFn) { @@ -2137,7 +2151,7 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, OverloadedDerivedFn = m_Sema .ActOnCallExpr(getCurrentScope(), BuildDeclRef(pullbackFD), - Loc, pullbackCallArgs, Loc) + Loc, pullbackCallArgs, Loc, CUDAExecConfig) .get(); } } @@ -2250,7 +2264,7 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, call = m_Sema .ActOnCallExpr(getCurrentScope(), BuildDeclRef(calleeFnForwPassFD), Loc, - CallArgs, Loc) + CallArgs, Loc, CUDAExecConfig) .get(); } auto* callRes = StoreAndRef(call); @@ -2285,7 +2299,7 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, call = m_Sema .ActOnCallExpr(getCurrentScope(), Clone(CE->getCallee()), Loc, - CallArgs, Loc) + CallArgs, Loc, CUDAExecConfig) .get(); return StmtDiff(call); } @@ -2295,7 +2309,8 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, llvm::SmallVectorImpl& PreCallStmts, llvm::SmallVectorImpl& PostCallStmts, llvm::SmallVectorImpl& args, - llvm::SmallVectorImpl& outputArgs) { + llvm::SmallVectorImpl& outputArgs, + Expr* CUDAExecConfig /*=nullptr*/) { int printErrorInf = m_Builder.shouldPrintNumDiffErrs(); llvm::SmallVector NumDiffArgs = {}; NumDiffArgs.push_back(targetFuncCall); @@ -2336,7 +2351,7 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, Name, NumDiffArgs, getCurrentScope(), /*OriginalFnDC=*/nullptr, /*forCustomDerv=*/false, - /*namespaceShouldExist=*/false); + /*namespaceShouldExist=*/false, CUDAExecConfig); } StmtDiff ReverseModeVisitor::VisitUnaryOperator(const UnaryOperator* UnOp) { diff --git a/lib/Differentiator/VisitorBase.cpp b/lib/Differentiator/VisitorBase.cpp index 6cd582270..e8fce3628 100644 --- a/lib/Differentiator/VisitorBase.cpp +++ b/lib/Differentiator/VisitorBase.cpp @@ -765,7 +765,8 @@ namespace clad { Expr* VisitorBase::GetSingleArgCentralDiffCall( Expr* targetFuncCall, Expr* targetArg, unsigned targetPos, - unsigned numArgs, llvm::SmallVectorImpl& args) { + unsigned numArgs, llvm::SmallVectorImpl& args, + Expr* CUDAExecConfig /*=nullptr*/) { QualType argType = targetArg->getType(); int printErrorInf = m_Builder.shouldPrintNumDiffErrs(); bool isSupported = argType->isArithmeticType(); @@ -788,7 +789,7 @@ namespace clad { Name, NumDiffArgs, getCurrentScope(), /*OriginalFnDC=*/nullptr, /*forCustomDerv=*/false, - /*namespaceShouldExist=*/false); + /*namespaceShouldExist=*/false, CUDAExecConfig); } void VisitorBase::CallExprDiffDiagnostics(const clang::FunctionDecl* FD, diff --git a/test/CUDA/GradientKernels.cu b/test/CUDA/GradientKernels.cu index a60604fa6..87a8daa28 100644 --- a/test/CUDA/GradientKernels.cu +++ b/test/CUDA/GradientKernels.cu @@ -412,6 +412,81 @@ __global__ void kernel_with_nested_device_call(double *out, double *in, double v //CHECK-NEXT: } //CHECK-NEXT:} +__global__ void kernel_call(double *a, double *b) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + a[index] = *b; +} + +void fn(double *out, double *in) { + kernel_call<<<1, 10>>>(out, in); +} + +// CHECK: void fn_grad(double *out, double *in, double *_d_out, double *_d_in) { +//CHECK-NEXT: kernel_call<<<1, 10>>>(out, in); +//CHECK-NEXT: kernel_call_pullback<<<1, 10>>>(out, in, _d_out, _d_in); +//CHECK-NEXT: } + +double fn_memory(double *out, double *in) { + 
kernel_call<<<1, 10>>>(out, in); + cudaDeviceSynchronize(); + double *out_host = (double*)malloc(10 * sizeof(double)); + cudaMemcpy(out_host, out, 10 * sizeof(double), cudaMemcpyDeviceToHost); + double res = 0; + for (int i=0; i < 10; ++i) { + res += out_host[i]; + } + free(out_host); + cudaFree(out); + cudaFree(in); + return res; +} + +// CHECK: void fn_memory_grad(double *out, double *in, double *_d_out, double *_d_in) { +//CHECK-NEXT: int _d_i = 0; +//CHECK-NEXT: int i = 0; +//CHECK-NEXT: clad::tape _t1 = {}; +//CHECK-NEXT: kernel_call<<<1, 10>>>(out, in); +//CHECK-NEXT: cudaDeviceSynchronize(); +//CHECK-NEXT: double *_d_out_host = (double *)malloc(10 * sizeof(double)); +//CHECK-NEXT: double *out_host = (double *)malloc(10 * sizeof(double)); +//CHECK-NEXT: cudaMemcpy(out_host, out, 10 * sizeof(double), cudaMemcpyDeviceToHost); +//CHECK-NEXT: double _d_res = 0.; +//CHECK-NEXT: double res = 0; +//CHECK-NEXT: unsigned long _t0 = 0UL; +//CHECK-NEXT: for (i = 0; ; ++i) { +//CHECK-NEXT: { +//CHECK-NEXT: if (!(i < 10)) +//CHECK-NEXT: break; +//CHECK-NEXT: } +//CHECK-NEXT: _t0++; +//CHECK-NEXT: clad::push(_t1, res); +//CHECK-NEXT: res += out_host[i]; +//CHECK-NEXT: } +//CHECK-NEXT: _d_res += 1; +//CHECK-NEXT: for (;; _t0--) { +//CHECK-NEXT: { +//CHECK-NEXT: if (!_t0) +//CHECK-NEXT: break; +//CHECK-NEXT: } +//CHECK-NEXT: --i; +//CHECK-NEXT: { +//CHECK-NEXT: res = clad::pop(_t1); +//CHECK-NEXT: double _r_d0 = _d_res; +//CHECK-NEXT: _d_out_host[i] += _r_d0; +//CHECK-NEXT: } +//CHECK-NEXT: } +//CHECK-NEXT: { +//CHECK-NEXT: unsigned long _r0 = 0UL; +//CHECK-NEXT: cudaMemcpyKind _r1 = static_cast(0U); +//CHECK-NEXT: clad::custom_derivatives::cudaMemcpy_pullback(out_host, out, 10 * sizeof(double), cudaMemcpyDeviceToHost, _d_out_host, _d_out, &_r0, &_r1); +//CHECK-NEXT: } +//CHECK-NEXT: kernel_call_pullback<<<1, 10>>>(out, in, _d_out, _d_in); +//CHECK-NEXT: free(out_host); +//CHECK-NEXT: free(_d_out_host); +//CHECK-NEXT: cudaFree(out); +//CHECK-NEXT: cudaFree(in); +//CHECK-NEXT:} + // CHECK: __attribute__((device)) void device_fn_pullback_1(double in, double val, double _d_y, double *_d_in, double *_d_val) { //CHECK-NEXT: { //CHECK-NEXT: *_d_in += _d_y; @@ -609,22 +684,12 @@ __global__ void kernel_with_nested_device_call(double *out, double *in, double v #define INIT(x, y, val, dx, dy, d_val) \ { \ - double *fives = (double*)malloc(10 * sizeof(double)); \ - for(int i = 0; i < 10; i++) { \ - fives[i] = 5; \ - } \ - double *zeros = (double*)malloc(10 * sizeof(double)); \ - for(int i = 0; i < 10; i++) { \ - zeros[i] = 0; \ - } \ cudaMemcpy(x, fives, 10 * sizeof(double), cudaMemcpyHostToDevice); \ cudaMemcpy(y, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice); \ cudaMemcpy(val, fives, sizeof(double), cudaMemcpyHostToDevice); \ cudaMemcpy(dx, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice); \ cudaMemcpy(dy, fives, 10 * sizeof(double), cudaMemcpyHostToDevice); \ cudaMemcpy(d_val, zeros, sizeof(double), cudaMemcpyHostToDevice); \ - free(fives); \ - free(zeros); \ } int main(void) { @@ -644,7 +709,6 @@ int main(void) { cudaFree(a); cudaFree(d_a); - int *dummy_in, *dummy_out, *d_out, *d_in; cudaMalloc(&dummy_in, 10 * sizeof(int)); cudaMalloc(&dummy_out, 10 * sizeof(int)); @@ -671,10 +735,13 @@ int main(void) { TEST_2_D(add_kernel_7, dim3(1), dim3(5, 1, 1), 0, false, "a, b", dummy_out_double, dummy_in_double, d_out_double, d_in_double, 10); // CHECK-EXEC: 50.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 - double *val; + double *val, *d_val; cudaMalloc(&val, sizeof(double)); - double *d_val; 
cudaMalloc(&d_val, sizeof(double)); + + double *fives = (double*)malloc(10 * sizeof(double)); + double *zeros = (double*)malloc(10 * sizeof(double)); + for(int i = 0; i < 10; i++) { fives[i] = 5; zeros[i] = 0; } INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val); @@ -723,15 +790,31 @@ int main(void) { INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val); + auto test_kernel_call = clad::gradient(fn); + test_kernel_call.execute(dummy_out_double, dummy_in_double, d_out_double, d_in_double); + cudaDeviceSynchronize(); + cudaMemcpy(res, d_in_double, sizeof(double), cudaMemcpyDeviceToHost); + printf("%0.2f\n", *res); // CHECK-EXEC: 50.00 + + INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val); + auto nested_device = clad::gradient(kernel_with_nested_device_call, "out, in"); nested_device.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_in_double); cudaDeviceSynchronize(); cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost); printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 5.00, 5.00, 5.00 + INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val); + + auto test_memory = clad::gradient(fn_memory); + test_memory.execute(dummy_out_double, dummy_in_double, d_out_double, d_in_double); + cudaDeviceSynchronize(); + cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost); + printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 10.00, 0.00, 0.00 + free(res); - cudaFree(dummy_in_double); - cudaFree(dummy_out_double); + free(fives); + free(zeros); cudaFree(d_out_double); cudaFree(d_in_double); cudaFree(val); From dd9ff383a6d3fa6f0a5933a73aa5122fa896e7cd Mon Sep 17 00:00:00 2001 From: kchristin Date: Fri, 18 Oct 2024 20:20:41 +0300 Subject: [PATCH 02/11] Fix suggestion of clang-tidy about const KCE --- lib/Differentiator/ReverseModeVisitor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Differentiator/ReverseModeVisitor.cpp b/lib/Differentiator/ReverseModeVisitor.cpp index 67d4c22bc..12e1a23a9 100644 --- a/lib/Differentiator/ReverseModeVisitor.cpp +++ b/lib/Differentiator/ReverseModeVisitor.cpp @@ -1673,7 +1673,7 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, } Expr* CUDAExecConfig = nullptr; - if (auto KCE = dyn_cast(CE)) + if (const auto KCE = dyn_cast(CE)) CUDAExecConfig = Clone(KCE->getConfig()); // If the function is non_differentiable, return zero derivative. From 0fa65728effbfe12daf7a0ad2dd02b2115cf4bf8 Mon Sep 17 00:00:00 2001 From: kchristin Date: Fri, 18 Oct 2024 20:21:29 +0300 Subject: [PATCH 03/11] Fix suggestion of clang-tidy about const KCE --- lib/Differentiator/ReverseModeVisitor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Differentiator/ReverseModeVisitor.cpp b/lib/Differentiator/ReverseModeVisitor.cpp index 12e1a23a9..708a56c00 100644 --- a/lib/Differentiator/ReverseModeVisitor.cpp +++ b/lib/Differentiator/ReverseModeVisitor.cpp @@ -1673,7 +1673,7 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, } Expr* CUDAExecConfig = nullptr; - if (const auto KCE = dyn_cast(CE)) + if (const auto* KCE = dyn_cast(CE)) CUDAExecConfig = Clone(KCE->getConfig()); // If the function is non_differentiable, return zero derivative. 
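Patch 04 below makes the generated cudaMemcpy adjoint accumulate rather than overwrite. In reverse mode, an element-wise copy dst[i] = src[i] sends the adjoint of every copied element back to its source, so d_src must be incremented ("plus-assign"). A minimal host-only sketch of that rule, with illustrative names; the actual implementation added in the patch is templated, works on device buffers, and uses an atomicAdd kernel for the device-to-host case:

// Pullback rule for dst[i] = src[i]: the destination adjoints are
// accumulated into the source adjoints, never assigned over them.
void memcpy_pullback_sketch(const double* d_dst, double* d_src, size_t n) {
  for (size_t i = 0; i < n; ++i)
    d_src[i] += d_dst[i];
}
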
From cb37e89012acc36b40d30d2e6497ee69296a5ab4 Mon Sep 17 00:00:00 2001
From: kchristin
Date: Mon, 21 Oct 2024 18:24:02 +0300
Subject: [PATCH 04/11] Make cudaMemcpy a plus-assign op

---
 .../clad/Differentiator/BuiltinDerivatives.h  | 45 ++++++++++++++++---
 test/CUDA/GradientKernels.cu                  |  2 +-
 2 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/include/clad/Differentiator/BuiltinDerivatives.h b/include/clad/Differentiator/BuiltinDerivatives.h
index 1d3f96aba..df5ebbac8 100644
--- a/include/clad/Differentiator/BuiltinDerivatives.h
+++ b/include/clad/Differentiator/BuiltinDerivatives.h
@@ -83,15 +83,48 @@ ValueAndPushforward<cudaError_t, cudaError_t> cudaDeviceSynchronize_pushforward()
   return {cudaDeviceSynchronize(), 0};
 }
 
-void cudaMemcpy_pullback(void* destPtr, void* srcPtr, size_t count,
-                         cudaMemcpyKind kind, void* d_destPtr, void* d_srcPtr,
-                         size_t* d_count, cudaMemcpyKind* d_kind)
+template <typename T>
+__global__ void atomicAdd_kernel(T* destPtr, T* srcPtr, size_t N) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
+       i += blockDim.x * gridDim.x)
+    atomicAdd(&destPtr[i], srcPtr[i]);
+}
+
+template <typename T>
+void cudaMemcpy_pullback(T* destPtr, T* srcPtr, size_t count,
+                         cudaMemcpyKind kind, T* d_destPtr, T* d_srcPtr,
+                         size_t* d_count, cudaMemcpyKind* d_kind)
     __attribute__((host)) {
-  if (kind == cudaMemcpyDeviceToHost)
+  T* aux_destPtr;
+  if (kind == cudaMemcpyDeviceToHost) {
     *d_kind = cudaMemcpyHostToDevice;
-  else if (kind == cudaMemcpyHostToDevice)
+    cudaMalloc(&aux_destPtr, count);
+  } else if (kind == cudaMemcpyHostToDevice) {
     *d_kind = cudaMemcpyDeviceToHost;
-  cudaMemcpy(d_srcPtr, d_destPtr, count, *d_kind);
+    aux_destPtr = (T*)malloc(count);
+  }
+  cudaMemcpy(aux_destPtr, d_destPtr, count, *d_kind);
+  size_t N = count / sizeof(T);
+  if (kind == cudaMemcpyDeviceToHost) {
+    // d_kind is host to device, so d_srcPtr is a device pointer
+    cudaDeviceProp deviceProp;
+    cudaGetDeviceProperties(&deviceProp, 0);
+    size_t maxThreads = deviceProp.maxThreadsPerBlock;
+    size_t maxBlocks = deviceProp.maxGridSize[0];
+
+    size_t numThreads = std::min(maxThreads, N);
+    size_t numBlocks = std::min(maxBlocks, (N + numThreads - 1) / numThreads);
+    custom_derivatives::atomicAdd_kernel<<<numBlocks, numThreads>>>(
+        d_srcPtr, aux_destPtr, N);
+    cudaDeviceSynchronize();
+    cudaFree(aux_destPtr);
+  } else if (kind == cudaMemcpyHostToDevice) {
+    // d_kind is device to host, so d_srcPtr is a host pointer
+    for (size_t i = 0; i < N; ++i) {
+      d_srcPtr[i] += aux_destPtr[i];
+    }
+    free(aux_destPtr);
+  }
 }
 
 #endif
diff --git a/test/CUDA/GradientKernels.cu b/test/CUDA/GradientKernels.cu
index 87a8daa28..f16d96017 100644
--- a/test/CUDA/GradientKernels.cu
+++ b/test/CUDA/GradientKernels.cu
@@ -810,7 +810,7 @@ int main(void) {
   test_memory.execute(dummy_out_double, dummy_in_double, d_out_double, d_in_double);
   cudaDeviceSynchronize();
   cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost);
-  printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 10.00, 0.00, 0.00
+  printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 60.00, 0.00, 0.00

From 176aebac01751b69c3507c84f3943adc52d66c06 Mon Sep 17 00:00:00 2001
From: kchristin
Date: Mon, 21 Oct 2024 18:27:05 +0300
Subject: [PATCH 05/11] Fix suggestions and format

---
 include/clad/Differentiator/BuiltinDerivatives.h | 12 ++++++------
 lib/Differentiator/ReverseModeVisitor.cpp        |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/include/clad/Differentiator/BuiltinDerivatives.h b/include/clad/Differentiator/BuiltinDerivatives.h
index df5ebbac8..24fde4f8e 100644 --- a/include/clad/Differentiator/BuiltinDerivatives.h +++ b/include/clad/Differentiator/BuiltinDerivatives.h @@ -92,8 +92,8 @@ __global__ void atomicAdd_kernel(T* destPtr, T* srcPtr, size_t N) { template void cudaMemcpy_pullback(T* destPtr, T* srcPtr, size_t count, - cudaMemcpyKind kind, T* d_destPtr, T* d_srcPtr, - size_t* d_count, cudaMemcpyKind* d_kind) + cudaMemcpyKind kind, T* d_destPtr, T* d_srcPtr, + size_t* d_count, cudaMemcpyKind* d_kind) __attribute__((host)) { T* aux_destPtr; if (kind == cudaMemcpyDeviceToHost) { @@ -111,18 +111,18 @@ void cudaMemcpy_pullback(T* destPtr, T* srcPtr, size_t count, cudaGetDeviceProperties(&deviceProp, 0); size_t maxThreads = deviceProp.maxThreadsPerBlock; size_t maxBlocks = deviceProp.maxGridSize[0]; - + size_t numThreads = std::min(maxThreads, N); size_t numBlocks = std::min(maxBlocks, (N + numThreads - 1) / numThreads); custom_derivatives::atomicAdd_kernel<<>>( d_srcPtr, aux_destPtr, N); - cudaDeviceSynchronize(); + cudaDeviceSynchronize(); // needed in case user uses another stream than the + // default one cudaFree(aux_destPtr); } else if (kind == cudaMemcpyHostToDevice) { // d_kind is device to host, so d_srcPtr is a host pointer - for (size_t i = 0; i < N; ++i) { + for (size_t i = 0; i < N; ++i) d_srcPtr[i] += aux_destPtr[i]; - } free(aux_destPtr); } } diff --git a/lib/Differentiator/ReverseModeVisitor.cpp b/lib/Differentiator/ReverseModeVisitor.cpp index 708a56c00..33cdc631c 100644 --- a/lib/Differentiator/ReverseModeVisitor.cpp +++ b/lib/Differentiator/ReverseModeVisitor.cpp @@ -635,7 +635,7 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, // If the function is a global kernel, all its parameters reside in the // global memory of the GPU else if (m_DiffReq->hasAttr()) - for (auto param : params) + for (auto* param : params) m_CUDAGlobalArgs.emplace(param); m_Derivative->setBody(nullptr); From cc0fc393003a6336b0d288f8bfe6acc29b789381 Mon Sep 17 00:00:00 2001 From: kchristin Date: Mon, 21 Oct 2024 18:44:31 +0300 Subject: [PATCH 06/11] Lose unnecessary sync calls to GPU in tests and add them in cudaMemcpy_pullback to mimic its blocking behavior --- .../clad/Differentiator/BuiltinDerivatives.h | 14 ++++-- lib/Differentiator/ReverseModeVisitor.cpp | 5 +- test/CUDA/GradientKernels.cu | 47 ++++++++++--------- 3 files changed, 37 insertions(+), 29 deletions(-) diff --git a/include/clad/Differentiator/BuiltinDerivatives.h b/include/clad/Differentiator/BuiltinDerivatives.h index 24fde4f8e..3467486cf 100644 --- a/include/clad/Differentiator/BuiltinDerivatives.h +++ b/include/clad/Differentiator/BuiltinDerivatives.h @@ -85,7 +85,7 @@ ValueAndPushforward cudaDeviceSynchronize_pushforward() template __global__ void atomicAdd_kernel(T* destPtr, T* srcPtr, size_t N) { - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) atomicAdd(&destPtr[i], srcPtr[i]); } @@ -95,7 +95,7 @@ void cudaMemcpy_pullback(T* destPtr, T* srcPtr, size_t count, cudaMemcpyKind kind, T* d_destPtr, T* d_srcPtr, size_t* d_count, cudaMemcpyKind* d_kind) __attribute__((host)) { - T* aux_destPtr; + T* aux_destPtr = nullptr; if (kind == cudaMemcpyDeviceToHost) { *d_kind = cudaMemcpyHostToDevice; cudaMalloc(&aux_destPtr, count); @@ -103,6 +103,8 @@ void cudaMemcpy_pullback(T* destPtr, T* srcPtr, size_t count, *d_kind = cudaMemcpyDeviceToHost; aux_destPtr = (T*)malloc(count); } + cudaDeviceSynchronize(); // needed in case user uses another 
stream for + // kernel execution besides the default one cudaMemcpy(aux_destPtr, d_destPtr, count, *d_kind); size_t N = count / sizeof(T); if (kind == cudaMemcpyDeviceToHost) { @@ -116,12 +118,14 @@ void cudaMemcpy_pullback(T* destPtr, T* srcPtr, size_t count, size_t numBlocks = std::min(maxBlocks, (N + numThreads - 1) / numThreads); custom_derivatives::atomicAdd_kernel<<>>( d_srcPtr, aux_destPtr, N); - cudaDeviceSynchronize(); // needed in case user uses another stream than the - // default one + cudaDeviceSynchronize(); // needed in case the user uses another stream for + // kernel execution besides the default one, so we + // need to make sure the data are updated before + // continuing with the rest of the code cudaFree(aux_destPtr); } else if (kind == cudaMemcpyHostToDevice) { // d_kind is device to host, so d_srcPtr is a host pointer - for (size_t i = 0; i < N; ++i) + for (size_t i = 0; i < N; i++) d_srcPtr[i] += aux_destPtr[i]; free(aux_destPtr); } diff --git a/lib/Differentiator/ReverseModeVisitor.cpp b/lib/Differentiator/ReverseModeVisitor.cpp index 33cdc631c..ef41f42b2 100644 --- a/lib/Differentiator/ReverseModeVisitor.cpp +++ b/lib/Differentiator/ReverseModeVisitor.cpp @@ -632,8 +632,9 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, if (!m_DiffReq.CUDAGlobalArgsIndexes.empty()) for (auto index : m_DiffReq.CUDAGlobalArgsIndexes) m_CUDAGlobalArgs.emplace(m_Derivative->getParamDecl(index)); - // If the function is a global kernel, all its parameters reside in the - // global memory of the GPU + // if the function is a global kernel, all the adjoint parameters reside in + // the global memory of the GPU. To facilitate the process, all the params + // of the kernel are added to the set. else if (m_DiffReq->hasAttr()) for (auto* param : params) m_CUDAGlobalArgs.emplace(param); diff --git a/test/CUDA/GradientKernels.cu b/test/CUDA/GradientKernels.cu index f16d96017..05b90265b 100644 --- a/test/CUDA/GradientKernels.cu +++ b/test/CUDA/GradientKernels.cu @@ -427,9 +427,12 @@ void fn(double *out, double *in) { //CHECK-NEXT: } double fn_memory(double *out, double *in) { - kernel_call<<<1, 10>>>(out, in); + double *in_dev = nullptr; + cudaMalloc(&in_dev, 10 * sizeof(double)); + cudaMemcpy(in_dev, in, 10 * sizeof(double), cudaMemcpyHostToDevice); + kernel_call<<<1, 10>>>(out, in_dev); cudaDeviceSynchronize(); - double *out_host = (double*)malloc(10 * sizeof(double)); + double *out_host = (double *)malloc(10 * sizeof(double)); cudaMemcpy(out_host, out, 10 * sizeof(double), cudaMemcpyDeviceToHost); double res = 0; for (int i=0; i < 10; ++i) { @@ -437,7 +440,7 @@ double fn_memory(double *out, double *in) { } free(out_host); cudaFree(out); - cudaFree(in); + cudaFree(in_dev); return res; } @@ -445,7 +448,12 @@ double fn_memory(double *out, double *in) { //CHECK-NEXT: int _d_i = 0; //CHECK-NEXT: int i = 0; //CHECK-NEXT: clad::tape _t1 = {}; -//CHECK-NEXT: kernel_call<<<1, 10>>>(out, in); +//CHECK-NEXT: double *_d_in_dev = nullptr; +//CHECK-NEXT: double *in_dev = nullptr; +//CHECK-NEXT: cudaMalloc(&_d_in_dev, 10 * sizeof(double)); +//CHECK-NEXT: cudaMalloc(&in_dev, 10 * sizeof(double)); +//CHECK-NEXT: cudaMemcpy(in_dev, in, 10 * sizeof(double), cudaMemcpyHostToDevice); +//CHECK-NEXT: kernel_call<<<1, 10>>>(out, in_dev); //CHECK-NEXT: cudaDeviceSynchronize(); //CHECK-NEXT: double *_d_out_host = (double *)malloc(10 * sizeof(double)); //CHECK-NEXT: double *out_host = (double *)malloc(10 * sizeof(double)); @@ -481,10 +489,16 @@ double fn_memory(double *out, double *in) { 
//CHECK-NEXT: clad::custom_derivatives::cudaMemcpy_pullback(out_host, out, 10 * sizeof(double), cudaMemcpyDeviceToHost, _d_out_host, _d_out, &_r0, &_r1); //CHECK-NEXT: } //CHECK-NEXT: kernel_call_pullback<<<1, 10>>>(out, in, _d_out, _d_in); +//CHECK-NEXT: { +//CHECK-NEXT: unsigned long _r0 = 0UL; +//CHECK-NEXT: cudaMemcpyKind _r1 = static_cast(0U); +//CHECK-NEXT: clad::custom_derivatives::cudaMemcpy_pullback(in_dev, in, 10 * sizeof(double), cudaMemcpyHostToDevice, _d_in_dev, _d_in, &_r0, &_r1); +//CHECK-NEXT: } //CHECK-NEXT: free(out_host); //CHECK-NEXT: free(_d_out_host); //CHECK-NEXT: cudaFree(out); -//CHECK-NEXT: cudaFree(in); +//CHECK-NEXT: cudaFree(in_dev); +//CHECK-NEXT: cudaFree(_d_in_dev); //CHECK-NEXT:} // CHECK: __attribute__((device)) void device_fn_pullback_1(double in, double val, double _d_y, double *_d_in, double *_d_val) { @@ -564,7 +578,6 @@ double fn_memory(double *out, double *in) { else { \ test.execute_kernel(grid, block, x, dx); \ } \ - cudaDeviceSynchronize(); \ int *res = (int*)malloc(N * sizeof(int)); \ cudaMemcpy(res, dx, N * sizeof(int), cudaMemcpyDeviceToHost); \ for (int i = 0; i < (N - 1); i++) { \ @@ -602,7 +615,6 @@ double fn_memory(double *out, double *in) { } \ int *res = (int*)malloc(N * sizeof(int)); \ cudaMemcpy(res, dx, N * sizeof(int), cudaMemcpyDeviceToHost); \ - cudaDeviceSynchronize(); \ for (int i = 0; i < (N - 1); i++) { \ printf("%d, ", res[i]); \ } \ @@ -637,7 +649,6 @@ double fn_memory(double *out, double *in) { } \ int *res = (int*)malloc(N * sizeof(int)); \ cudaMemcpy(res, dx, N * sizeof(int), cudaMemcpyDeviceToHost); \ - cudaDeviceSynchronize(); \ for (int i = 0; i < (N - 1); i++) { \ printf("%d, ", res[i]); \ } \ @@ -672,7 +683,6 @@ double fn_memory(double *out, double *in) { } \ double *res = (double*)malloc(N * sizeof(double)); \ cudaMemcpy(res, dx, N * sizeof(double), cudaMemcpyDeviceToHost); \ - cudaDeviceSynchronize(); \ for (int i = 0; i < (N - 1); i++) { \ printf("%0.2f, ", res[i]); \ } \ @@ -748,8 +758,9 @@ int main(void) { auto test_device = clad::gradient(kernel_with_device_call, "out, val"); test_device.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_val); double *res = (double*)malloc(10 * sizeof(double)); - cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost); - cudaDeviceSynchronize(); + cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost); // no need for synchronization before or after, + // as the cudaMemcpy call is queued after the kernel call + // on the default stream and the cudaMemcpy call is blocking printf("%0.2f\n", *res); // CHECK-EXEC: 50.00 INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val); @@ -757,7 +768,6 @@ int main(void) { auto test_device_2 = clad::gradient(kernel_with_device_call_2, "out, val"); test_device_2.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_val); cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost); - cudaDeviceSynchronize(); printf("%0.2f\n", *res); // CHECK-EXEC: 50.00 INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val); @@ -765,7 +775,6 @@ int main(void) { auto check_dup = clad::gradient(dup_kernel_with_device_call_2, "out, val"); // check that the pullback function is not regenerated check_dup.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_val); cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost); - cudaDeviceSynchronize(); printf("%s\n", 
cudaGetErrorString(cudaGetLastError())); // CHECK-EXEC: no error printf("%0.2f\n", *res); // CHECK-EXEC: 50.00 @@ -773,7 +782,6 @@ int main(void) { auto test_device_3 = clad::gradient(kernel_with_device_call_2, "out, in"); test_device_3.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_in_double); - cudaDeviceSynchronize(); cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost); printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 5.00, 5.00, 5.00 @@ -781,9 +789,7 @@ int main(void) { auto test_device_4 = clad::gradient(kernel_with_device_call_3); test_device_4.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, val, d_out_double, d_in_double, d_val); - cudaDeviceSynchronize(); cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost); - cudaDeviceSynchronize(); printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 5.00, 5.00, 5.00 cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost); printf("%0.2f\n", *res); // CHECK-EXEC: 50.00 @@ -792,7 +798,6 @@ int main(void) { auto test_kernel_call = clad::gradient(fn); test_kernel_call.execute(dummy_out_double, dummy_in_double, d_out_double, d_in_double); - cudaDeviceSynchronize(); cudaMemcpy(res, d_in_double, sizeof(double), cudaMemcpyDeviceToHost); printf("%0.2f\n", *res); // CHECK-EXEC: 50.00 @@ -800,17 +805,14 @@ int main(void) { auto nested_device = clad::gradient(kernel_with_nested_device_call, "out, in"); nested_device.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_in_double); - cudaDeviceSynchronize(); cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost); printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 5.00, 5.00, 5.00 INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val); auto test_memory = clad::gradient(fn_memory); - test_memory.execute(dummy_out_double, dummy_in_double, d_out_double, d_in_double); - cudaDeviceSynchronize(); - cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost); - printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 60.00, 0.00, 0.00 + test_memory.execute(dummy_out_double, fives, d_out_double, zeros); + printf("%0.2f, %0.2f, %0.2f\n", zeros[0], zeros[1], zeros[2]); // CHECK-EXEC: 60.00, 0.00, 0.00 free(res); free(fives); @@ -819,6 +821,7 @@ int main(void) { cudaFree(d_in_double); cudaFree(val); cudaFree(d_val); + cudaFree(dummy_in_double); return 0; } From be2c90f2611907777f32ca5e72fe10fc64a6cd5b Mon Sep 17 00:00:00 2001 From: kchristin Date: Wed, 23 Oct 2024 01:54:40 +0300 Subject: [PATCH 07/11] Fix cuda tests --- test/CUDA/GradientKernels.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/CUDA/GradientKernels.cu b/test/CUDA/GradientKernels.cu index 05b90265b..f0a2725ce 100644 --- a/test/CUDA/GradientKernels.cu +++ b/test/CUDA/GradientKernels.cu @@ -484,8 +484,8 @@ double fn_memory(double *out, double *in) { //CHECK-NEXT: } //CHECK-NEXT: } //CHECK-NEXT: { -//CHECK-NEXT: unsigned long _r0 = 0UL; -//CHECK-NEXT: cudaMemcpyKind _r1 = static_cast(0U); +//CHECK-NEXT: unsigned long _r2 = 0UL; +//CHECK-NEXT: cudaMemcpyKind _r3 = static_cast(0U); //CHECK-NEXT: clad::custom_derivatives::cudaMemcpy_pullback(out_host, out, 10 * sizeof(double), cudaMemcpyDeviceToHost, _d_out_host, _d_out, &_r0, &_r1); //CHECK-NEXT: } //CHECK-NEXT: kernel_call_pullback<<<1, 10>>>(out, in, _d_out, _d_in); From 
d63e0351a2d68a87b96a8e74a575a1546019d234 Mon Sep 17 00:00:00 2001 From: kchristin Date: Wed, 23 Oct 2024 01:59:42 +0300 Subject: [PATCH 08/11] Fix cuda tests 2 --- test/CUDA/GradientKernels.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/CUDA/GradientKernels.cu b/test/CUDA/GradientKernels.cu index f0a2725ce..46c5b5df9 100644 --- a/test/CUDA/GradientKernels.cu +++ b/test/CUDA/GradientKernels.cu @@ -486,7 +486,7 @@ double fn_memory(double *out, double *in) { //CHECK-NEXT: { //CHECK-NEXT: unsigned long _r2 = 0UL; //CHECK-NEXT: cudaMemcpyKind _r3 = static_cast(0U); -//CHECK-NEXT: clad::custom_derivatives::cudaMemcpy_pullback(out_host, out, 10 * sizeof(double), cudaMemcpyDeviceToHost, _d_out_host, _d_out, &_r0, &_r1); +//CHECK-NEXT: clad::custom_derivatives::cudaMemcpy_pullback(out_host, out, 10 * sizeof(double), cudaMemcpyDeviceToHost, _d_out_host, _d_out, &_r2, &_r3); //CHECK-NEXT: } //CHECK-NEXT: kernel_call_pullback<<<1, 10>>>(out, in, _d_out, _d_in); //CHECK-NEXT: { From 6a3f79a20dfeeae16d513c20ec1e3073418b3d97 Mon Sep 17 00:00:00 2001 From: kchristin Date: Wed, 23 Oct 2024 02:11:12 +0300 Subject: [PATCH 09/11] Fix cuda tests 3 --- test/CUDA/GradientKernels.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/CUDA/GradientKernels.cu b/test/CUDA/GradientKernels.cu index 46c5b5df9..d4385851e 100644 --- a/test/CUDA/GradientKernels.cu +++ b/test/CUDA/GradientKernels.cu @@ -488,7 +488,7 @@ double fn_memory(double *out, double *in) { //CHECK-NEXT: cudaMemcpyKind _r3 = static_cast(0U); //CHECK-NEXT: clad::custom_derivatives::cudaMemcpy_pullback(out_host, out, 10 * sizeof(double), cudaMemcpyDeviceToHost, _d_out_host, _d_out, &_r2, &_r3); //CHECK-NEXT: } -//CHECK-NEXT: kernel_call_pullback<<<1, 10>>>(out, in, _d_out, _d_in); +//CHECK-NEXT: kernel_call_pullback<<<1, 10>>>(out, in_dev, _d_out, _d_in_dev); //CHECK-NEXT: { //CHECK-NEXT: unsigned long _r0 = 0UL; //CHECK-NEXT: cudaMemcpyKind _r1 = static_cast(0U); From 0d234920670f0eb5ccb4305602e3bc2a97b2fc42 Mon Sep 17 00:00:00 2001 From: kchristin Date: Sun, 27 Oct 2024 22:05:20 +0200 Subject: [PATCH 10/11] Fix tests after master rebase --- test/CUDA/GradientKernels.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/test/CUDA/GradientKernels.cu b/test/CUDA/GradientKernels.cu index d4385851e..92f69d92c 100644 --- a/test/CUDA/GradientKernels.cu +++ b/test/CUDA/GradientKernels.cu @@ -456,6 +456,7 @@ double fn_memory(double *out, double *in) { //CHECK-NEXT: kernel_call<<<1, 10>>>(out, in_dev); //CHECK-NEXT: cudaDeviceSynchronize(); //CHECK-NEXT: double *_d_out_host = (double *)malloc(10 * sizeof(double)); +//CHECK-NEXT: memset(_d_out_host, 0, 10 * sizeof(double)); //CHECK-NEXT: double *out_host = (double *)malloc(10 * sizeof(double)); //CHECK-NEXT: cudaMemcpy(out_host, out, 10 * sizeof(double), cudaMemcpyDeviceToHost); //CHECK-NEXT: double _d_res = 0.; From 5f8b54bee7ac0ab87b586709cad3e4f1767af8d6 Mon Sep 17 00:00:00 2001 From: kchristin Date: Mon, 28 Oct 2024 12:16:38 +0200 Subject: [PATCH 11/11] Specify true args in BuildCallToCustomDerivativeOrNumericalDiff --- lib/Differentiator/ReverseModeVisitor.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/Differentiator/ReverseModeVisitor.cpp b/lib/Differentiator/ReverseModeVisitor.cpp index ef41f42b2..04f626286 100644 --- a/lib/Differentiator/ReverseModeVisitor.cpp +++ b/lib/Differentiator/ReverseModeVisitor.cpp @@ -1952,7 +1952,8 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, OverloadedDerivedFn = 
          m_Builder.BuildCallToCustomDerivativeOrNumericalDiff(
              customPushforward, pushforwardCallArgs, getCurrentScope(),
-             const_cast<DeclContext*>(FD->getDeclContext()), true, true,
+             const_cast<DeclContext*>(FD->getDeclContext()),
+             /*forCustomDerv=*/true, /*namespaceShouldExist=*/true,
              CUDAExecConfig);
       if (OverloadedDerivedFn)
         asGrad = false;
@@ -2054,7 +2055,8 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
       OverloadedDerivedFn =
           m_Builder.BuildCallToCustomDerivativeOrNumericalDiff(
              customPullback, pullbackCallArgs, getCurrentScope(),
-             const_cast<DeclContext*>(FD->getDeclContext()), true, true,
+             const_cast<DeclContext*>(FD->getDeclContext()),
+             /*forCustomDerv=*/true, /*namespaceShouldExist=*/true,
              CUDAExecConfig);
       if (baseDiff.getExpr())
         pullbackCallArgs.erase(pullbackCallArgs.begin());
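
Taken together, the series teaches clad::gradient to handle host functions that launch CUDA kernels and manage device memory through cudaMalloc, cudaMemcpy and cudaFree. A condensed usage sketch modeled on the fn/kernel_call test added in patch 01; the buffer size of 10, the seeding of d_out and the main() scaffolding are illustrative rather than taken from the patch:

#include "clad/Differentiator/Differentiator.h"
#include <cstdio>

__global__ void kernel_call(double *a, double *b) {
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  a[index] = *b;
}

// Host function containing a kernel launch; its gradient re-runs the kernel
// and then launches the generated kernel_call_pullback with the same config.
void fn(double *out, double *in) { kernel_call<<<1, 10>>>(out, in); }

int main() {
  double *out, *in, *d_out, *d_in;
  cudaMalloc(&out, 10 * sizeof(double));
  cudaMalloc(&in, 10 * sizeof(double));
  cudaMalloc(&d_out, 10 * sizeof(double));
  cudaMalloc(&d_in, 10 * sizeof(double));
  // ... fill `in`, zero `d_in`, and seed `d_out` (e.g. with ones) ...

  auto grad = clad::gradient(fn);
  grad.execute(out, in, d_out, d_in); // runs fn, then the kernel pullback

  double host_d_in;
  cudaMemcpy(&host_d_in, d_in, sizeof(double), cudaMemcpyDeviceToHost);
  // d_in[0] accumulates the ten seeded output adjoints (10.00 for a seed of ones)
  printf("%0.2f\n", host_d_in);

  cudaFree(out); cudaFree(in); cudaFree(d_out); cudaFree(d_in);
  return 0;
}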