From f70b8b57e45670a1f4ae540478f6ce7b101d189d Mon Sep 17 00:00:00 2001 From: Christina Koutsou Date: Tue, 15 Oct 2024 19:27:27 +0000 Subject: [PATCH 01/11] Add support of kernel pullback functions --- .../clad/Differentiator/BuiltinDerivatives.h | 12 ++ .../clad/Differentiator/DerivativeBuilder.h | 3 +- .../clad/Differentiator/ReverseModeVisitor.h | 3 +- include/clad/Differentiator/VisitorBase.h | 3 +- lib/Differentiator/CladUtils.cpp | 5 +- lib/Differentiator/DerivativeBuilder.cpp | 9 +- lib/Differentiator/ReverseModeVisitor.cpp | 57 +++++---- lib/Differentiator/VisitorBase.cpp | 5 +- test/CUDA/GradientKernels.cu | 113 +++++++++++++++--- 9 files changed, 165 insertions(+), 45 deletions(-) diff --git a/include/clad/Differentiator/BuiltinDerivatives.h b/include/clad/Differentiator/BuiltinDerivatives.h index 557274a56..1d3f96aba 100644 --- a/include/clad/Differentiator/BuiltinDerivatives.h +++ b/include/clad/Differentiator/BuiltinDerivatives.h @@ -82,6 +82,18 @@ ValueAndPushforward cudaDeviceSynchronize_pushforward() __attribute__((host)) { return {cudaDeviceSynchronize(), 0}; } + +void cudaMemcpy_pullback(void* destPtr, void* srcPtr, size_t count, + cudaMemcpyKind kind, void* d_destPtr, void* d_srcPtr, + size_t* d_count, cudaMemcpyKind* d_kind) + __attribute__((host)) { + if (kind == cudaMemcpyDeviceToHost) + *d_kind = cudaMemcpyHostToDevice; + else if (kind == cudaMemcpyHostToDevice) + *d_kind = cudaMemcpyDeviceToHost; + cudaMemcpy(d_srcPtr, d_destPtr, count, *d_kind); +} + #endif CUDA_HOST_DEVICE inline ValueAndPushforward diff --git a/include/clad/Differentiator/DerivativeBuilder.h b/include/clad/Differentiator/DerivativeBuilder.h index 9ac7165bf..5e9d54ac2 100644 --- a/include/clad/Differentiator/DerivativeBuilder.h +++ b/include/clad/Differentiator/DerivativeBuilder.h @@ -118,7 +118,8 @@ namespace clad { clang::Expr* BuildCallToCustomDerivativeOrNumericalDiff( const std::string& Name, llvm::SmallVectorImpl& CallArgs, clang::Scope* S, clang::DeclContext* originalFnDC, - bool forCustomDerv = true, bool namespaceShouldExist = true); + bool forCustomDerv = true, bool namespaceShouldExist = true, + clang::Expr* CUDAExecConfig = nullptr); bool noOverloadExists(clang::Expr* UnresolvedLookup, llvm::MutableArrayRef ARargs); /// Shorthand to issues a warning or error. diff --git a/include/clad/Differentiator/ReverseModeVisitor.h b/include/clad/Differentiator/ReverseModeVisitor.h index d65871ec4..c9fd1296f 100644 --- a/include/clad/Differentiator/ReverseModeVisitor.h +++ b/include/clad/Differentiator/ReverseModeVisitor.h @@ -354,7 +354,8 @@ namespace clad { clang::Expr* dfdx, llvm::SmallVectorImpl& PreCallStmts, llvm::SmallVectorImpl& PostCallStmts, llvm::SmallVectorImpl& args, - llvm::SmallVectorImpl& outputArgs); + llvm::SmallVectorImpl& outputArgs, + clang::Expr* CUDAExecConfig = nullptr); public: ReverseModeVisitor(DerivativeBuilder& builder, const DiffRequest& request); diff --git a/include/clad/Differentiator/VisitorBase.h b/include/clad/Differentiator/VisitorBase.h index dba1540a2..210f82112 100644 --- a/include/clad/Differentiator/VisitorBase.h +++ b/include/clad/Differentiator/VisitorBase.h @@ -603,7 +603,8 @@ namespace clad { /// \returns The derivative function call. 
clang::Expr* GetSingleArgCentralDiffCall( clang::Expr* targetFuncCall, clang::Expr* targetArg, unsigned targetPos, - unsigned numArgs, llvm::SmallVectorImpl& args); + unsigned numArgs, llvm::SmallVectorImpl& args, + clang::Expr* CUDAExecConfig = nullptr); /// Emits diagnostic messages on differentiation (or lack thereof) for /// call expressions. diff --git a/lib/Differentiator/CladUtils.cpp b/lib/Differentiator/CladUtils.cpp index 39e98f12f..350eeea07 100644 --- a/lib/Differentiator/CladUtils.cpp +++ b/lib/Differentiator/CladUtils.cpp @@ -679,7 +679,8 @@ namespace clad { } bool IsMemoryFunction(const clang::FunctionDecl* FD) { - + if (FD->getNameAsString() == "cudaMalloc") + return true; #if CLANG_VERSION_MAJOR > 12 if (FD->getBuiltinID() == Builtin::BImalloc) return true; @@ -703,6 +704,8 @@ namespace clad { } bool IsMemoryDeallocationFunction(const clang::FunctionDecl* FD) { + if (FD->getNameAsString() == "cudaFree") + return true; #if CLANG_VERSION_MAJOR > 12 return FD->getBuiltinID() == Builtin::ID::BIfree; #else diff --git a/lib/Differentiator/DerivativeBuilder.cpp b/lib/Differentiator/DerivativeBuilder.cpp index e4ba3f99a..ada7153c6 100644 --- a/lib/Differentiator/DerivativeBuilder.cpp +++ b/lib/Differentiator/DerivativeBuilder.cpp @@ -246,7 +246,8 @@ static void registerDerivative(FunctionDecl* derivedFD, Sema& semaRef) { Expr* DerivativeBuilder::BuildCallToCustomDerivativeOrNumericalDiff( const std::string& Name, llvm::SmallVectorImpl& CallArgs, clang::Scope* S, clang::DeclContext* originalFnDC, - bool forCustomDerv /*=true*/, bool namespaceShouldExist /*=true*/) { + bool forCustomDerv /*=true*/, bool namespaceShouldExist /*=true*/, + Expr* CUDAExecConfig /*=nullptr*/) { CXXScopeSpec SS; LookupResult R = LookupCustomDerivativeOrNumericalDiff( Name, originalFnDC, SS, forCustomDerv, namespaceShouldExist); @@ -265,8 +266,10 @@ static void registerDerivative(FunctionDecl* derivedFD, Sema& semaRef) { if (noOverloadExists(UnresolvedLookup, MARargs)) return nullptr; - OverloadedFn = - m_Sema.ActOnCallExpr(S, UnresolvedLookup, Loc, MARargs, Loc).get(); + OverloadedFn = m_Sema + .ActOnCallExpr(S, UnresolvedLookup, Loc, MARargs, Loc, + CUDAExecConfig) + .get(); // Add the custom derivative to the set of derivatives. // This is required in case the definition of the custom derivative diff --git a/lib/Differentiator/ReverseModeVisitor.cpp b/lib/Differentiator/ReverseModeVisitor.cpp index 8995a2a67..67d4c22bc 100644 --- a/lib/Differentiator/ReverseModeVisitor.cpp +++ b/lib/Differentiator/ReverseModeVisitor.cpp @@ -471,8 +471,9 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, if (m_ExternalSource) m_ExternalSource->ActAfterCreatingDerivedFnParams(params); - // if the function is a global kernel, all its parameters reside in the - // global memory of the GPU + // if the function is a global kernel, all the adjoint parameters reside in + // the global memory of the GPU. To facilitate the process, all the params + // of the kernel are added to the set. 
if (m_DiffReq->hasAttr()) for (auto* param : params) m_CUDAGlobalArgs.emplace(param); @@ -631,7 +632,11 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, if (!m_DiffReq.CUDAGlobalArgsIndexes.empty()) for (auto index : m_DiffReq.CUDAGlobalArgsIndexes) m_CUDAGlobalArgs.emplace(m_Derivative->getParamDecl(index)); - + // If the function is a global kernel, all its parameters reside in the + // global memory of the GPU + else if (m_DiffReq->hasAttr()) + for (auto param : params) + m_CUDAGlobalArgs.emplace(param); m_Derivative->setBody(nullptr); if (!m_DiffReq.DeclarationOnly) { @@ -1667,6 +1672,10 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, return StmtDiff(Clone(CE)); } + Expr* CUDAExecConfig = nullptr; + if (auto KCE = dyn_cast(CE)) + CUDAExecConfig = Clone(KCE->getConfig()); + // If the function is non_differentiable, return zero derivative. if (clad::utils::hasNonDifferentiableAttribute(CE)) { // Calling the function without computing derivatives @@ -1675,10 +1684,11 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, ClonedArgs.push_back(Clone(CE->getArg(i))); SourceLocation validLoc = clad::utils::GetValidSLoc(m_Sema); - Expr* Call = m_Sema - .ActOnCallExpr(getCurrentScope(), Clone(CE->getCallee()), - validLoc, ClonedArgs, validLoc) - .get(); + Expr* Call = + m_Sema + .ActOnCallExpr(getCurrentScope(), Clone(CE->getCallee()), + validLoc, ClonedArgs, validLoc, CUDAExecConfig) + .get(); // Creating a zero derivative auto* zero = ConstantFolder::synthesizeLiteral(m_Context.IntTy, m_Context, /*val=*/0); @@ -1825,7 +1835,8 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, Expr* call = m_Sema .ActOnCallExpr(getCurrentScope(), Clone(CE->getCallee()), Loc, - llvm::MutableArrayRef(CallArgs), Loc) + llvm::MutableArrayRef(CallArgs), Loc, + CUDAExecConfig) .get(); return call; } @@ -1940,7 +1951,8 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, OverloadedDerivedFn = m_Builder.BuildCallToCustomDerivativeOrNumericalDiff( customPushforward, pushforwardCallArgs, getCurrentScope(), - const_cast(FD->getDeclContext())); + const_cast(FD->getDeclContext()), true, true, + CUDAExecConfig); if (OverloadedDerivedFn) asGrad = false; } @@ -2041,7 +2053,8 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, OverloadedDerivedFn = m_Builder.BuildCallToCustomDerivativeOrNumericalDiff( customPullback, pullbackCallArgs, getCurrentScope(), - const_cast(FD->getDeclContext())); + const_cast(FD->getDeclContext()), true, true, + CUDAExecConfig); if (baseDiff.getExpr()) pullbackCallArgs.erase(pullbackCallArgs.begin()); } @@ -2057,10 +2070,11 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, CXXScopeSpec(), m_Derivative->getNameInfo(), m_Derivative) .get(); - OverloadedDerivedFn = m_Sema - .ActOnCallExpr(getCurrentScope(), selfRef, - Loc, pullbackCallArgs, Loc) - .get(); + OverloadedDerivedFn = + m_Sema + .ActOnCallExpr(getCurrentScope(), selfRef, Loc, + pullbackCallArgs, Loc, CUDAExecConfig) + .get(); } else { if (m_ExternalSource) m_ExternalSource->ActBeforeDifferentiatingCallExpr( @@ -2112,14 +2126,14 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, OverloadedDerivedFn = GetSingleArgCentralDiffCall( Clone(CE->getCallee()), DerivedCallArgs[0], /*targetPos=*/0, - /*numArgs=*/1, DerivedCallArgs); + /*numArgs=*/1, DerivedCallArgs, CUDAExecConfig); asGrad = !OverloadedDerivedFn; } else { auto CEType = getNonConstType(CE->getType(), m_Context, m_Sema); OverloadedDerivedFn = 
GetMultiArgCentralDiffCall( Clone(CE->getCallee()), CEType.getCanonicalType(), CE->getNumArgs(), dfdx(), PreCallStmts, PostCallStmts, - DerivedCallArgs, CallArgDx); + DerivedCallArgs, CallArgDx, CUDAExecConfig); } CallExprDiffDiagnostics(FD, CE->getBeginLoc()); if (!OverloadedDerivedFn) { @@ -2137,7 +2151,7 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, OverloadedDerivedFn = m_Sema .ActOnCallExpr(getCurrentScope(), BuildDeclRef(pullbackFD), - Loc, pullbackCallArgs, Loc) + Loc, pullbackCallArgs, Loc, CUDAExecConfig) .get(); } } @@ -2250,7 +2264,7 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, call = m_Sema .ActOnCallExpr(getCurrentScope(), BuildDeclRef(calleeFnForwPassFD), Loc, - CallArgs, Loc) + CallArgs, Loc, CUDAExecConfig) .get(); } auto* callRes = StoreAndRef(call); @@ -2285,7 +2299,7 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, call = m_Sema .ActOnCallExpr(getCurrentScope(), Clone(CE->getCallee()), Loc, - CallArgs, Loc) + CallArgs, Loc, CUDAExecConfig) .get(); return StmtDiff(call); } @@ -2295,7 +2309,8 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, llvm::SmallVectorImpl& PreCallStmts, llvm::SmallVectorImpl& PostCallStmts, llvm::SmallVectorImpl& args, - llvm::SmallVectorImpl& outputArgs) { + llvm::SmallVectorImpl& outputArgs, + Expr* CUDAExecConfig /*=nullptr*/) { int printErrorInf = m_Builder.shouldPrintNumDiffErrs(); llvm::SmallVector NumDiffArgs = {}; NumDiffArgs.push_back(targetFuncCall); @@ -2336,7 +2351,7 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, Name, NumDiffArgs, getCurrentScope(), /*OriginalFnDC=*/nullptr, /*forCustomDerv=*/false, - /*namespaceShouldExist=*/false); + /*namespaceShouldExist=*/false, CUDAExecConfig); } StmtDiff ReverseModeVisitor::VisitUnaryOperator(const UnaryOperator* UnOp) { diff --git a/lib/Differentiator/VisitorBase.cpp b/lib/Differentiator/VisitorBase.cpp index 6cd582270..e8fce3628 100644 --- a/lib/Differentiator/VisitorBase.cpp +++ b/lib/Differentiator/VisitorBase.cpp @@ -765,7 +765,8 @@ namespace clad { Expr* VisitorBase::GetSingleArgCentralDiffCall( Expr* targetFuncCall, Expr* targetArg, unsigned targetPos, - unsigned numArgs, llvm::SmallVectorImpl& args) { + unsigned numArgs, llvm::SmallVectorImpl& args, + Expr* CUDAExecConfig /*=nullptr*/) { QualType argType = targetArg->getType(); int printErrorInf = m_Builder.shouldPrintNumDiffErrs(); bool isSupported = argType->isArithmeticType(); @@ -788,7 +789,7 @@ namespace clad { Name, NumDiffArgs, getCurrentScope(), /*OriginalFnDC=*/nullptr, /*forCustomDerv=*/false, - /*namespaceShouldExist=*/false); + /*namespaceShouldExist=*/false, CUDAExecConfig); } void VisitorBase::CallExprDiffDiagnostics(const clang::FunctionDecl* FD, diff --git a/test/CUDA/GradientKernels.cu b/test/CUDA/GradientKernels.cu index a60604fa6..87a8daa28 100644 --- a/test/CUDA/GradientKernels.cu +++ b/test/CUDA/GradientKernels.cu @@ -412,6 +412,81 @@ __global__ void kernel_with_nested_device_call(double *out, double *in, double v //CHECK-NEXT: } //CHECK-NEXT:} +__global__ void kernel_call(double *a, double *b) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + a[index] = *b; +} + +void fn(double *out, double *in) { + kernel_call<<<1, 10>>>(out, in); +} + +// CHECK: void fn_grad(double *out, double *in, double *_d_out, double *_d_in) { +//CHECK-NEXT: kernel_call<<<1, 10>>>(out, in); +//CHECK-NEXT: kernel_call_pullback<<<1, 10>>>(out, in, _d_out, _d_in); +//CHECK-NEXT: } + +double fn_memory(double *out, double *in) { + 
kernel_call<<<1, 10>>>(out, in); + cudaDeviceSynchronize(); + double *out_host = (double*)malloc(10 * sizeof(double)); + cudaMemcpy(out_host, out, 10 * sizeof(double), cudaMemcpyDeviceToHost); + double res = 0; + for (int i=0; i < 10; ++i) { + res += out_host[i]; + } + free(out_host); + cudaFree(out); + cudaFree(in); + return res; +} + +// CHECK: void fn_memory_grad(double *out, double *in, double *_d_out, double *_d_in) { +//CHECK-NEXT: int _d_i = 0; +//CHECK-NEXT: int i = 0; +//CHECK-NEXT: clad::tape _t1 = {}; +//CHECK-NEXT: kernel_call<<<1, 10>>>(out, in); +//CHECK-NEXT: cudaDeviceSynchronize(); +//CHECK-NEXT: double *_d_out_host = (double *)malloc(10 * sizeof(double)); +//CHECK-NEXT: double *out_host = (double *)malloc(10 * sizeof(double)); +//CHECK-NEXT: cudaMemcpy(out_host, out, 10 * sizeof(double), cudaMemcpyDeviceToHost); +//CHECK-NEXT: double _d_res = 0.; +//CHECK-NEXT: double res = 0; +//CHECK-NEXT: unsigned long _t0 = 0UL; +//CHECK-NEXT: for (i = 0; ; ++i) { +//CHECK-NEXT: { +//CHECK-NEXT: if (!(i < 10)) +//CHECK-NEXT: break; +//CHECK-NEXT: } +//CHECK-NEXT: _t0++; +//CHECK-NEXT: clad::push(_t1, res); +//CHECK-NEXT: res += out_host[i]; +//CHECK-NEXT: } +//CHECK-NEXT: _d_res += 1; +//CHECK-NEXT: for (;; _t0--) { +//CHECK-NEXT: { +//CHECK-NEXT: if (!_t0) +//CHECK-NEXT: break; +//CHECK-NEXT: } +//CHECK-NEXT: --i; +//CHECK-NEXT: { +//CHECK-NEXT: res = clad::pop(_t1); +//CHECK-NEXT: double _r_d0 = _d_res; +//CHECK-NEXT: _d_out_host[i] += _r_d0; +//CHECK-NEXT: } +//CHECK-NEXT: } +//CHECK-NEXT: { +//CHECK-NEXT: unsigned long _r0 = 0UL; +//CHECK-NEXT: cudaMemcpyKind _r1 = static_cast(0U); +//CHECK-NEXT: clad::custom_derivatives::cudaMemcpy_pullback(out_host, out, 10 * sizeof(double), cudaMemcpyDeviceToHost, _d_out_host, _d_out, &_r0, &_r1); +//CHECK-NEXT: } +//CHECK-NEXT: kernel_call_pullback<<<1, 10>>>(out, in, _d_out, _d_in); +//CHECK-NEXT: free(out_host); +//CHECK-NEXT: free(_d_out_host); +//CHECK-NEXT: cudaFree(out); +//CHECK-NEXT: cudaFree(in); +//CHECK-NEXT:} + // CHECK: __attribute__((device)) void device_fn_pullback_1(double in, double val, double _d_y, double *_d_in, double *_d_val) { //CHECK-NEXT: { //CHECK-NEXT: *_d_in += _d_y; @@ -609,22 +684,12 @@ __global__ void kernel_with_nested_device_call(double *out, double *in, double v #define INIT(x, y, val, dx, dy, d_val) \ { \ - double *fives = (double*)malloc(10 * sizeof(double)); \ - for(int i = 0; i < 10; i++) { \ - fives[i] = 5; \ - } \ - double *zeros = (double*)malloc(10 * sizeof(double)); \ - for(int i = 0; i < 10; i++) { \ - zeros[i] = 0; \ - } \ cudaMemcpy(x, fives, 10 * sizeof(double), cudaMemcpyHostToDevice); \ cudaMemcpy(y, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice); \ cudaMemcpy(val, fives, sizeof(double), cudaMemcpyHostToDevice); \ cudaMemcpy(dx, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice); \ cudaMemcpy(dy, fives, 10 * sizeof(double), cudaMemcpyHostToDevice); \ cudaMemcpy(d_val, zeros, sizeof(double), cudaMemcpyHostToDevice); \ - free(fives); \ - free(zeros); \ } int main(void) { @@ -644,7 +709,6 @@ int main(void) { cudaFree(a); cudaFree(d_a); - int *dummy_in, *dummy_out, *d_out, *d_in; cudaMalloc(&dummy_in, 10 * sizeof(int)); cudaMalloc(&dummy_out, 10 * sizeof(int)); @@ -671,10 +735,13 @@ int main(void) { TEST_2_D(add_kernel_7, dim3(1), dim3(5, 1, 1), 0, false, "a, b", dummy_out_double, dummy_in_double, d_out_double, d_in_double, 10); // CHECK-EXEC: 50.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 - double *val; + double *val, *d_val; cudaMalloc(&val, sizeof(double)); - double *d_val; 
cudaMalloc(&d_val, sizeof(double)); + + double *fives = (double*)malloc(10 * sizeof(double)); + double *zeros = (double*)malloc(10 * sizeof(double)); + for(int i = 0; i < 10; i++) { fives[i] = 5; zeros[i] = 0; } INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val); @@ -723,15 +790,31 @@ int main(void) { INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val); + auto test_kernel_call = clad::gradient(fn); + test_kernel_call.execute(dummy_out_double, dummy_in_double, d_out_double, d_in_double); + cudaDeviceSynchronize(); + cudaMemcpy(res, d_in_double, sizeof(double), cudaMemcpyDeviceToHost); + printf("%0.2f\n", *res); // CHECK-EXEC: 50.00 + + INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val); + auto nested_device = clad::gradient(kernel_with_nested_device_call, "out, in"); nested_device.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_in_double); cudaDeviceSynchronize(); cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost); printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 5.00, 5.00, 5.00 + INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val); + + auto test_memory = clad::gradient(fn_memory); + test_memory.execute(dummy_out_double, dummy_in_double, d_out_double, d_in_double); + cudaDeviceSynchronize(); + cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost); + printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 10.00, 0.00, 0.00 + free(res); - cudaFree(dummy_in_double); - cudaFree(dummy_out_double); + free(fives); + free(zeros); cudaFree(d_out_double); cudaFree(d_in_double); cudaFree(val); From dd9ff383a6d3fa6f0a5933a73aa5122fa896e7cd Mon Sep 17 00:00:00 2001 From: kchristin Date: Fri, 18 Oct 2024 20:20:41 +0300 Subject: [PATCH 02/11] Fix suggestion of clang-tidy about const KCE --- lib/Differentiator/ReverseModeVisitor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Differentiator/ReverseModeVisitor.cpp b/lib/Differentiator/ReverseModeVisitor.cpp index 67d4c22bc..12e1a23a9 100644 --- a/lib/Differentiator/ReverseModeVisitor.cpp +++ b/lib/Differentiator/ReverseModeVisitor.cpp @@ -1673,7 +1673,7 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, } Expr* CUDAExecConfig = nullptr; - if (auto KCE = dyn_cast(CE)) + if (const auto KCE = dyn_cast(CE)) CUDAExecConfig = Clone(KCE->getConfig()); // If the function is non_differentiable, return zero derivative. From 0fa65728effbfe12daf7a0ad2dd02b2115cf4bf8 Mon Sep 17 00:00:00 2001 From: kchristin Date: Fri, 18 Oct 2024 20:21:29 +0300 Subject: [PATCH 03/11] Fix suggestion of clang-tidy about const KCE --- lib/Differentiator/ReverseModeVisitor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Differentiator/ReverseModeVisitor.cpp b/lib/Differentiator/ReverseModeVisitor.cpp index 12e1a23a9..708a56c00 100644 --- a/lib/Differentiator/ReverseModeVisitor.cpp +++ b/lib/Differentiator/ReverseModeVisitor.cpp @@ -1673,7 +1673,7 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, } Expr* CUDAExecConfig = nullptr; - if (const auto KCE = dyn_cast(CE)) + if (const auto* KCE = dyn_cast(CE)) CUDAExecConfig = Clone(KCE->getConfig()); // If the function is non_differentiable, return zero derivative. 
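Patch 04 below makes the generated cudaMemcpy adjoint accumulate rather than overwrite. In reverse mode, an element-wise copy dst[i] = src[i] sends the adjoint of every copied element back to its source, so d_src must be incremented ("plus-assign"). A minimal host-only sketch of that rule, with illustrative names; the actual implementation added in the patch is templated, works on device buffers, and uses an atomicAdd kernel for the device-to-host case:

// Pullback rule for dst[i] = src[i]: the destination adjoints are
// accumulated into the source adjoints, never assigned over them.
void memcpy_pullback_sketch(const double* d_dst, double* d_src, size_t n) {
  for (size_t i = 0; i < n; ++i)
    d_src[i] += d_dst[i];
}
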
From cb37e89012acc36b40d30d2e6497ee69296a5ab4 Mon Sep 17 00:00:00 2001
From: kchristin
Date: Mon, 21 Oct 2024 18:24:02 +0300
Subject: [PATCH 04/11] Make cudaMemcpy a plus-assign op

---
 .../clad/Differentiator/BuiltinDerivatives.h  | 45 ++++++++++++++++---
 test/CUDA/GradientKernels.cu                  |  2 +-
 2 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/include/clad/Differentiator/BuiltinDerivatives.h b/include/clad/Differentiator/BuiltinDerivatives.h
index 1d3f96aba..df5ebbac8 100644
--- a/include/clad/Differentiator/BuiltinDerivatives.h
+++ b/include/clad/Differentiator/BuiltinDerivatives.h
@@ -83,15 +83,48 @@ ValueAndPushforward<cudaError_t, cudaError_t> cudaDeviceSynchronize_pushforward()
   return {cudaDeviceSynchronize(), 0};
 }
 
-void cudaMemcpy_pullback(void* destPtr, void* srcPtr, size_t count,
-                         cudaMemcpyKind kind, void* d_destPtr, void* d_srcPtr,
-                         size_t* d_count, cudaMemcpyKind* d_kind)
+template <typename T>
+__global__ void atomicAdd_kernel(T* destPtr, T* srcPtr, size_t N) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
+       i += blockDim.x * gridDim.x)
+    atomicAdd(&destPtr[i], srcPtr[i]);
+}
+
+template <typename T>
+void cudaMemcpy_pullback(T* destPtr, T* srcPtr, size_t count,
+                         cudaMemcpyKind kind, T* d_destPtr, T* d_srcPtr,
+                         size_t* d_count, cudaMemcpyKind* d_kind)
     __attribute__((host)) {
-  if (kind == cudaMemcpyDeviceToHost)
+  T* aux_destPtr;
+  if (kind == cudaMemcpyDeviceToHost) {
     *d_kind = cudaMemcpyHostToDevice;
-  else if (kind == cudaMemcpyHostToDevice)
+    cudaMalloc(&aux_destPtr, count);
+  } else if (kind == cudaMemcpyHostToDevice) {
     *d_kind = cudaMemcpyDeviceToHost;
-  cudaMemcpy(d_srcPtr, d_destPtr, count, *d_kind);
+    aux_destPtr = (T*)malloc(count);
+  }
+  cudaMemcpy(aux_destPtr, d_destPtr, count, *d_kind);
+  size_t N = count / sizeof(T);
+  if (kind == cudaMemcpyDeviceToHost) {
+    // d_kind is host to device, so d_srcPtr is a device pointer
+    cudaDeviceProp deviceProp;
+    cudaGetDeviceProperties(&deviceProp, 0);
+    size_t maxThreads = deviceProp.maxThreadsPerBlock;
+    size_t maxBlocks = deviceProp.maxGridSize[0];
+
+    size_t numThreads = std::min(maxThreads, N);
+    size_t numBlocks = std::min(maxBlocks, (N + numThreads - 1) / numThreads);
+    custom_derivatives::atomicAdd_kernel<<<numBlocks, numThreads>>>(
+        d_srcPtr, aux_destPtr, N);
+    cudaDeviceSynchronize();
+    cudaFree(aux_destPtr);
+  } else if (kind == cudaMemcpyHostToDevice) {
+    // d_kind is device to host, so d_srcPtr is a host pointer
+    for (size_t i = 0; i < N; ++i) {
+      d_srcPtr[i] += aux_destPtr[i];
+    }
+    free(aux_destPtr);
+  }
 }
 
 #endif
diff --git a/test/CUDA/GradientKernels.cu b/test/CUDA/GradientKernels.cu
index 87a8daa28..f16d96017 100644
--- a/test/CUDA/GradientKernels.cu
+++ b/test/CUDA/GradientKernels.cu
@@ -810,7 +810,7 @@ int main(void) {
   test_memory.execute(dummy_out_double, dummy_in_double, d_out_double, d_in_double);
   cudaDeviceSynchronize();
   cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost);
-  printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 10.00, 0.00, 0.00
+  printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 60.00, 0.00, 0.00

From 176aebac01751b69c3507c84f3943adc52d66c06 Mon Sep 17 00:00:00 2001
From: kchristin
Date: Mon, 21 Oct 2024 18:27:05 +0300
Subject: [PATCH 05/11] Fix suggestions and format

---
 include/clad/Differentiator/BuiltinDerivatives.h | 12 ++++++------
 lib/Differentiator/ReverseModeVisitor.cpp        |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/include/clad/Differentiator/BuiltinDerivatives.h b/include/clad/Differentiator/BuiltinDerivatives.h
index df5ebbac8..24fde4f8e 100644 --- a/include/clad/Differentiator/BuiltinDerivatives.h +++ b/include/clad/Differentiator/BuiltinDerivatives.h @@ -92,8 +92,8 @@ __global__ void atomicAdd_kernel(T* destPtr, T* srcPtr, size_t N) { template void cudaMemcpy_pullback(T* destPtr, T* srcPtr, size_t count, - cudaMemcpyKind kind, T* d_destPtr, T* d_srcPtr, - size_t* d_count, cudaMemcpyKind* d_kind) + cudaMemcpyKind kind, T* d_destPtr, T* d_srcPtr, + size_t* d_count, cudaMemcpyKind* d_kind) __attribute__((host)) { T* aux_destPtr; if (kind == cudaMemcpyDeviceToHost) { @@ -111,18 +111,18 @@ void cudaMemcpy_pullback(T* destPtr, T* srcPtr, size_t count, cudaGetDeviceProperties(&deviceProp, 0); size_t maxThreads = deviceProp.maxThreadsPerBlock; size_t maxBlocks = deviceProp.maxGridSize[0]; - + size_t numThreads = std::min(maxThreads, N); size_t numBlocks = std::min(maxBlocks, (N + numThreads - 1) / numThreads); custom_derivatives::atomicAdd_kernel<<>>( d_srcPtr, aux_destPtr, N); - cudaDeviceSynchronize(); + cudaDeviceSynchronize(); // needed in case user uses another stream than the + // default one cudaFree(aux_destPtr); } else if (kind == cudaMemcpyHostToDevice) { // d_kind is device to host, so d_srcPtr is a host pointer - for (size_t i = 0; i < N; ++i) { + for (size_t i = 0; i < N; ++i) d_srcPtr[i] += aux_destPtr[i]; - } free(aux_destPtr); } } diff --git a/lib/Differentiator/ReverseModeVisitor.cpp b/lib/Differentiator/ReverseModeVisitor.cpp index 708a56c00..33cdc631c 100644 --- a/lib/Differentiator/ReverseModeVisitor.cpp +++ b/lib/Differentiator/ReverseModeVisitor.cpp @@ -635,7 +635,7 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, // If the function is a global kernel, all its parameters reside in the // global memory of the GPU else if (m_DiffReq->hasAttr()) - for (auto param : params) + for (auto* param : params) m_CUDAGlobalArgs.emplace(param); m_Derivative->setBody(nullptr); From cc0fc393003a6336b0d288f8bfe6acc29b789381 Mon Sep 17 00:00:00 2001 From: kchristin Date: Mon, 21 Oct 2024 18:44:31 +0300 Subject: [PATCH 06/11] Lose unnecessary sync calls to GPU in tests and add them in cudaMemcpy_pullback to mimic its blocking behavior --- .../clad/Differentiator/BuiltinDerivatives.h | 14 ++++-- lib/Differentiator/ReverseModeVisitor.cpp | 5 +- test/CUDA/GradientKernels.cu | 47 ++++++++++--------- 3 files changed, 37 insertions(+), 29 deletions(-) diff --git a/include/clad/Differentiator/BuiltinDerivatives.h b/include/clad/Differentiator/BuiltinDerivatives.h index 24fde4f8e..3467486cf 100644 --- a/include/clad/Differentiator/BuiltinDerivatives.h +++ b/include/clad/Differentiator/BuiltinDerivatives.h @@ -85,7 +85,7 @@ ValueAndPushforward cudaDeviceSynchronize_pushforward() template __global__ void atomicAdd_kernel(T* destPtr, T* srcPtr, size_t N) { - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) atomicAdd(&destPtr[i], srcPtr[i]); } @@ -95,7 +95,7 @@ void cudaMemcpy_pullback(T* destPtr, T* srcPtr, size_t count, cudaMemcpyKind kind, T* d_destPtr, T* d_srcPtr, size_t* d_count, cudaMemcpyKind* d_kind) __attribute__((host)) { - T* aux_destPtr; + T* aux_destPtr = nullptr; if (kind == cudaMemcpyDeviceToHost) { *d_kind = cudaMemcpyHostToDevice; cudaMalloc(&aux_destPtr, count); @@ -103,6 +103,8 @@ void cudaMemcpy_pullback(T* destPtr, T* srcPtr, size_t count, *d_kind = cudaMemcpyDeviceToHost; aux_destPtr = (T*)malloc(count); } + cudaDeviceSynchronize(); // needed in case user uses another 
stream for + // kernel execution besides the default one cudaMemcpy(aux_destPtr, d_destPtr, count, *d_kind); size_t N = count / sizeof(T); if (kind == cudaMemcpyDeviceToHost) { @@ -116,12 +118,14 @@ void cudaMemcpy_pullback(T* destPtr, T* srcPtr, size_t count, size_t numBlocks = std::min(maxBlocks, (N + numThreads - 1) / numThreads); custom_derivatives::atomicAdd_kernel<<>>( d_srcPtr, aux_destPtr, N); - cudaDeviceSynchronize(); // needed in case user uses another stream than the - // default one + cudaDeviceSynchronize(); // needed in case the user uses another stream for + // kernel execution besides the default one, so we + // need to make sure the data are updated before + // continuing with the rest of the code cudaFree(aux_destPtr); } else if (kind == cudaMemcpyHostToDevice) { // d_kind is device to host, so d_srcPtr is a host pointer - for (size_t i = 0; i < N; ++i) + for (size_t i = 0; i < N; i++) d_srcPtr[i] += aux_destPtr[i]; free(aux_destPtr); } diff --git a/lib/Differentiator/ReverseModeVisitor.cpp b/lib/Differentiator/ReverseModeVisitor.cpp index 33cdc631c..ef41f42b2 100644 --- a/lib/Differentiator/ReverseModeVisitor.cpp +++ b/lib/Differentiator/ReverseModeVisitor.cpp @@ -632,8 +632,9 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, if (!m_DiffReq.CUDAGlobalArgsIndexes.empty()) for (auto index : m_DiffReq.CUDAGlobalArgsIndexes) m_CUDAGlobalArgs.emplace(m_Derivative->getParamDecl(index)); - // If the function is a global kernel, all its parameters reside in the - // global memory of the GPU + // if the function is a global kernel, all the adjoint parameters reside in + // the global memory of the GPU. To facilitate the process, all the params + // of the kernel are added to the set. else if (m_DiffReq->hasAttr()) for (auto* param : params) m_CUDAGlobalArgs.emplace(param); diff --git a/test/CUDA/GradientKernels.cu b/test/CUDA/GradientKernels.cu index f16d96017..05b90265b 100644 --- a/test/CUDA/GradientKernels.cu +++ b/test/CUDA/GradientKernels.cu @@ -427,9 +427,12 @@ void fn(double *out, double *in) { //CHECK-NEXT: } double fn_memory(double *out, double *in) { - kernel_call<<<1, 10>>>(out, in); + double *in_dev = nullptr; + cudaMalloc(&in_dev, 10 * sizeof(double)); + cudaMemcpy(in_dev, in, 10 * sizeof(double), cudaMemcpyHostToDevice); + kernel_call<<<1, 10>>>(out, in_dev); cudaDeviceSynchronize(); - double *out_host = (double*)malloc(10 * sizeof(double)); + double *out_host = (double *)malloc(10 * sizeof(double)); cudaMemcpy(out_host, out, 10 * sizeof(double), cudaMemcpyDeviceToHost); double res = 0; for (int i=0; i < 10; ++i) { @@ -437,7 +440,7 @@ double fn_memory(double *out, double *in) { } free(out_host); cudaFree(out); - cudaFree(in); + cudaFree(in_dev); return res; } @@ -445,7 +448,12 @@ double fn_memory(double *out, double *in) { //CHECK-NEXT: int _d_i = 0; //CHECK-NEXT: int i = 0; //CHECK-NEXT: clad::tape _t1 = {}; -//CHECK-NEXT: kernel_call<<<1, 10>>>(out, in); +//CHECK-NEXT: double *_d_in_dev = nullptr; +//CHECK-NEXT: double *in_dev = nullptr; +//CHECK-NEXT: cudaMalloc(&_d_in_dev, 10 * sizeof(double)); +//CHECK-NEXT: cudaMalloc(&in_dev, 10 * sizeof(double)); +//CHECK-NEXT: cudaMemcpy(in_dev, in, 10 * sizeof(double), cudaMemcpyHostToDevice); +//CHECK-NEXT: kernel_call<<<1, 10>>>(out, in_dev); //CHECK-NEXT: cudaDeviceSynchronize(); //CHECK-NEXT: double *_d_out_host = (double *)malloc(10 * sizeof(double)); //CHECK-NEXT: double *out_host = (double *)malloc(10 * sizeof(double)); @@ -481,10 +489,16 @@ double fn_memory(double *out, double *in) { 
//CHECK-NEXT: clad::custom_derivatives::cudaMemcpy_pullback(out_host, out, 10 * sizeof(double), cudaMemcpyDeviceToHost, _d_out_host, _d_out, &_r0, &_r1); //CHECK-NEXT: } //CHECK-NEXT: kernel_call_pullback<<<1, 10>>>(out, in, _d_out, _d_in); +//CHECK-NEXT: { +//CHECK-NEXT: unsigned long _r0 = 0UL; +//CHECK-NEXT: cudaMemcpyKind _r1 = static_cast(0U); +//CHECK-NEXT: clad::custom_derivatives::cudaMemcpy_pullback(in_dev, in, 10 * sizeof(double), cudaMemcpyHostToDevice, _d_in_dev, _d_in, &_r0, &_r1); +//CHECK-NEXT: } //CHECK-NEXT: free(out_host); //CHECK-NEXT: free(_d_out_host); //CHECK-NEXT: cudaFree(out); -//CHECK-NEXT: cudaFree(in); +//CHECK-NEXT: cudaFree(in_dev); +//CHECK-NEXT: cudaFree(_d_in_dev); //CHECK-NEXT:} // CHECK: __attribute__((device)) void device_fn_pullback_1(double in, double val, double _d_y, double *_d_in, double *_d_val) { @@ -564,7 +578,6 @@ double fn_memory(double *out, double *in) { else { \ test.execute_kernel(grid, block, x, dx); \ } \ - cudaDeviceSynchronize(); \ int *res = (int*)malloc(N * sizeof(int)); \ cudaMemcpy(res, dx, N * sizeof(int), cudaMemcpyDeviceToHost); \ for (int i = 0; i < (N - 1); i++) { \ @@ -602,7 +615,6 @@ double fn_memory(double *out, double *in) { } \ int *res = (int*)malloc(N * sizeof(int)); \ cudaMemcpy(res, dx, N * sizeof(int), cudaMemcpyDeviceToHost); \ - cudaDeviceSynchronize(); \ for (int i = 0; i < (N - 1); i++) { \ printf("%d, ", res[i]); \ } \ @@ -637,7 +649,6 @@ double fn_memory(double *out, double *in) { } \ int *res = (int*)malloc(N * sizeof(int)); \ cudaMemcpy(res, dx, N * sizeof(int), cudaMemcpyDeviceToHost); \ - cudaDeviceSynchronize(); \ for (int i = 0; i < (N - 1); i++) { \ printf("%d, ", res[i]); \ } \ @@ -672,7 +683,6 @@ double fn_memory(double *out, double *in) { } \ double *res = (double*)malloc(N * sizeof(double)); \ cudaMemcpy(res, dx, N * sizeof(double), cudaMemcpyDeviceToHost); \ - cudaDeviceSynchronize(); \ for (int i = 0; i < (N - 1); i++) { \ printf("%0.2f, ", res[i]); \ } \ @@ -748,8 +758,9 @@ int main(void) { auto test_device = clad::gradient(kernel_with_device_call, "out, val"); test_device.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_val); double *res = (double*)malloc(10 * sizeof(double)); - cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost); - cudaDeviceSynchronize(); + cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost); // no need for synchronization before or after, + // as the cudaMemcpy call is queued after the kernel call + // on the default stream and the cudaMemcpy call is blocking printf("%0.2f\n", *res); // CHECK-EXEC: 50.00 INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val); @@ -757,7 +768,6 @@ int main(void) { auto test_device_2 = clad::gradient(kernel_with_device_call_2, "out, val"); test_device_2.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_val); cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost); - cudaDeviceSynchronize(); printf("%0.2f\n", *res); // CHECK-EXEC: 50.00 INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val); @@ -765,7 +775,6 @@ int main(void) { auto check_dup = clad::gradient(dup_kernel_with_device_call_2, "out, val"); // check that the pullback function is not regenerated check_dup.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_val); cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost); - cudaDeviceSynchronize(); printf("%s\n", 
cudaGetErrorString(cudaGetLastError())); // CHECK-EXEC: no error printf("%0.2f\n", *res); // CHECK-EXEC: 50.00 @@ -773,7 +782,6 @@ int main(void) { auto test_device_3 = clad::gradient(kernel_with_device_call_2, "out, in"); test_device_3.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_in_double); - cudaDeviceSynchronize(); cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost); printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 5.00, 5.00, 5.00 @@ -781,9 +789,7 @@ int main(void) { auto test_device_4 = clad::gradient(kernel_with_device_call_3); test_device_4.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, val, d_out_double, d_in_double, d_val); - cudaDeviceSynchronize(); cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost); - cudaDeviceSynchronize(); printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 5.00, 5.00, 5.00 cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost); printf("%0.2f\n", *res); // CHECK-EXEC: 50.00 @@ -792,7 +798,6 @@ int main(void) { auto test_kernel_call = clad::gradient(fn); test_kernel_call.execute(dummy_out_double, dummy_in_double, d_out_double, d_in_double); - cudaDeviceSynchronize(); cudaMemcpy(res, d_in_double, sizeof(double), cudaMemcpyDeviceToHost); printf("%0.2f\n", *res); // CHECK-EXEC: 50.00 @@ -800,17 +805,14 @@ int main(void) { auto nested_device = clad::gradient(kernel_with_nested_device_call, "out, in"); nested_device.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_in_double); - cudaDeviceSynchronize(); cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost); printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 5.00, 5.00, 5.00 INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val); auto test_memory = clad::gradient(fn_memory); - test_memory.execute(dummy_out_double, dummy_in_double, d_out_double, d_in_double); - cudaDeviceSynchronize(); - cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost); - printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 60.00, 0.00, 0.00 + test_memory.execute(dummy_out_double, fives, d_out_double, zeros); + printf("%0.2f, %0.2f, %0.2f\n", zeros[0], zeros[1], zeros[2]); // CHECK-EXEC: 60.00, 0.00, 0.00 free(res); free(fives); @@ -819,6 +821,7 @@ int main(void) { cudaFree(d_in_double); cudaFree(val); cudaFree(d_val); + cudaFree(dummy_in_double); return 0; } From be2c90f2611907777f32ca5e72fe10fc64a6cd5b Mon Sep 17 00:00:00 2001 From: kchristin Date: Wed, 23 Oct 2024 01:54:40 +0300 Subject: [PATCH 07/11] Fix cuda tests --- test/CUDA/GradientKernels.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/CUDA/GradientKernels.cu b/test/CUDA/GradientKernels.cu index 05b90265b..f0a2725ce 100644 --- a/test/CUDA/GradientKernels.cu +++ b/test/CUDA/GradientKernels.cu @@ -484,8 +484,8 @@ double fn_memory(double *out, double *in) { //CHECK-NEXT: } //CHECK-NEXT: } //CHECK-NEXT: { -//CHECK-NEXT: unsigned long _r0 = 0UL; -//CHECK-NEXT: cudaMemcpyKind _r1 = static_cast(0U); +//CHECK-NEXT: unsigned long _r2 = 0UL; +//CHECK-NEXT: cudaMemcpyKind _r3 = static_cast(0U); //CHECK-NEXT: clad::custom_derivatives::cudaMemcpy_pullback(out_host, out, 10 * sizeof(double), cudaMemcpyDeviceToHost, _d_out_host, _d_out, &_r0, &_r1); //CHECK-NEXT: } //CHECK-NEXT: kernel_call_pullback<<<1, 10>>>(out, in, _d_out, _d_in); From 
d63e0351a2d68a87b96a8e74a575a1546019d234 Mon Sep 17 00:00:00 2001 From: kchristin Date: Wed, 23 Oct 2024 01:59:42 +0300 Subject: [PATCH 08/11] Fix cuda tests 2 --- test/CUDA/GradientKernels.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/CUDA/GradientKernels.cu b/test/CUDA/GradientKernels.cu index f0a2725ce..46c5b5df9 100644 --- a/test/CUDA/GradientKernels.cu +++ b/test/CUDA/GradientKernels.cu @@ -486,7 +486,7 @@ double fn_memory(double *out, double *in) { //CHECK-NEXT: { //CHECK-NEXT: unsigned long _r2 = 0UL; //CHECK-NEXT: cudaMemcpyKind _r3 = static_cast(0U); -//CHECK-NEXT: clad::custom_derivatives::cudaMemcpy_pullback(out_host, out, 10 * sizeof(double), cudaMemcpyDeviceToHost, _d_out_host, _d_out, &_r0, &_r1); +//CHECK-NEXT: clad::custom_derivatives::cudaMemcpy_pullback(out_host, out, 10 * sizeof(double), cudaMemcpyDeviceToHost, _d_out_host, _d_out, &_r2, &_r3); //CHECK-NEXT: } //CHECK-NEXT: kernel_call_pullback<<<1, 10>>>(out, in, _d_out, _d_in); //CHECK-NEXT: { From 6a3f79a20dfeeae16d513c20ec1e3073418b3d97 Mon Sep 17 00:00:00 2001 From: kchristin Date: Wed, 23 Oct 2024 02:11:12 +0300 Subject: [PATCH 09/11] Fix cuda tests 3 --- test/CUDA/GradientKernels.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/CUDA/GradientKernels.cu b/test/CUDA/GradientKernels.cu index 46c5b5df9..d4385851e 100644 --- a/test/CUDA/GradientKernels.cu +++ b/test/CUDA/GradientKernels.cu @@ -488,7 +488,7 @@ double fn_memory(double *out, double *in) { //CHECK-NEXT: cudaMemcpyKind _r3 = static_cast(0U); //CHECK-NEXT: clad::custom_derivatives::cudaMemcpy_pullback(out_host, out, 10 * sizeof(double), cudaMemcpyDeviceToHost, _d_out_host, _d_out, &_r2, &_r3); //CHECK-NEXT: } -//CHECK-NEXT: kernel_call_pullback<<<1, 10>>>(out, in, _d_out, _d_in); +//CHECK-NEXT: kernel_call_pullback<<<1, 10>>>(out, in_dev, _d_out, _d_in_dev); //CHECK-NEXT: { //CHECK-NEXT: unsigned long _r0 = 0UL; //CHECK-NEXT: cudaMemcpyKind _r1 = static_cast(0U); From 0d234920670f0eb5ccb4305602e3bc2a97b2fc42 Mon Sep 17 00:00:00 2001 From: kchristin Date: Sun, 27 Oct 2024 22:05:20 +0200 Subject: [PATCH 10/11] Fix tests after master rebase --- test/CUDA/GradientKernels.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/test/CUDA/GradientKernels.cu b/test/CUDA/GradientKernels.cu index d4385851e..92f69d92c 100644 --- a/test/CUDA/GradientKernels.cu +++ b/test/CUDA/GradientKernels.cu @@ -456,6 +456,7 @@ double fn_memory(double *out, double *in) { //CHECK-NEXT: kernel_call<<<1, 10>>>(out, in_dev); //CHECK-NEXT: cudaDeviceSynchronize(); //CHECK-NEXT: double *_d_out_host = (double *)malloc(10 * sizeof(double)); +//CHECK-NEXT: memset(_d_out_host, 0, 10 * sizeof(double)); //CHECK-NEXT: double *out_host = (double *)malloc(10 * sizeof(double)); //CHECK-NEXT: cudaMemcpy(out_host, out, 10 * sizeof(double), cudaMemcpyDeviceToHost); //CHECK-NEXT: double _d_res = 0.; From 5f8b54bee7ac0ab87b586709cad3e4f1767af8d6 Mon Sep 17 00:00:00 2001 From: kchristin Date: Mon, 28 Oct 2024 12:16:38 +0200 Subject: [PATCH 11/11] Specify true args in BuildCallToCustomDerivativeOrNumericalDiff --- lib/Differentiator/ReverseModeVisitor.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/Differentiator/ReverseModeVisitor.cpp b/lib/Differentiator/ReverseModeVisitor.cpp index ef41f42b2..04f626286 100644 --- a/lib/Differentiator/ReverseModeVisitor.cpp +++ b/lib/Differentiator/ReverseModeVisitor.cpp @@ -1952,7 +1952,8 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, OverloadedDerivedFn = 
          m_Builder.BuildCallToCustomDerivativeOrNumericalDiff(
              customPushforward, pushforwardCallArgs, getCurrentScope(),
-             const_cast<DeclContext*>(FD->getDeclContext()), true, true,
+             const_cast<DeclContext*>(FD->getDeclContext()),
+             /*forCustomDerv=*/true, /*namespaceShouldExist=*/true,
              CUDAExecConfig);
       if (OverloadedDerivedFn)
         asGrad = false;
@@ -2054,7 +2055,8 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
       OverloadedDerivedFn =
           m_Builder.BuildCallToCustomDerivativeOrNumericalDiff(
              customPullback, pullbackCallArgs, getCurrentScope(),
-             const_cast<DeclContext*>(FD->getDeclContext()), true, true,
+             const_cast<DeclContext*>(FD->getDeclContext()),
+             /*forCustomDerv=*/true, /*namespaceShouldExist=*/true,
              CUDAExecConfig);
       if (baseDiff.getExpr())
         pullbackCallArgs.erase(pullbackCallArgs.begin());
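
Taken together, the series teaches clad::gradient to handle host functions that launch CUDA kernels and manage device memory through cudaMalloc, cudaMemcpy and cudaFree. A condensed usage sketch modeled on the fn/kernel_call test added in patch 01; the buffer size of 10, the seeding of d_out and the main() scaffolding are illustrative rather than taken from the patch:

#include "clad/Differentiator/Differentiator.h"
#include <cstdio>

__global__ void kernel_call(double *a, double *b) {
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  a[index] = *b;
}

// Host function containing a kernel launch; its gradient re-runs the kernel
// and then launches the generated kernel_call_pullback with the same config.
void fn(double *out, double *in) { kernel_call<<<1, 10>>>(out, in); }

int main() {
  double *out, *in, *d_out, *d_in;
  cudaMalloc(&out, 10 * sizeof(double));
  cudaMalloc(&in, 10 * sizeof(double));
  cudaMalloc(&d_out, 10 * sizeof(double));
  cudaMalloc(&d_in, 10 * sizeof(double));
  // ... fill `in`, zero `d_in`, and seed `d_out` (e.g. with ones) ...

  auto grad = clad::gradient(fn);
  grad.execute(out, in, d_out, d_in); // runs fn, then the kernel pullback

  double host_d_in;
  cudaMemcpy(&host_d_in, d_in, sizeof(double), cudaMemcpyDeviceToHost);
  // d_in[0] accumulates the ten seeded output adjoints (10.00 for a seed of ones)
  printf("%0.2f\n", host_d_in);

  cudaFree(out); cudaFree(in); cudaFree(d_out); cudaFree(d_in);
  return 0;
}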