From cc0fc393003a6336b0d288f8bfe6acc29b789381 Mon Sep 17 00:00:00 2001
From: kchristin
Date: Mon, 21 Oct 2024 18:44:31 +0300
Subject: [PATCH] Remove unnecessary sync calls to GPU in tests and add them
 in cudaMemcpy_pullback to mimic its blocking behavior

---
 .../clad/Differentiator/BuiltinDerivatives.h | 14 ++++--
 lib/Differentiator/ReverseModeVisitor.cpp    |  5 +-
 test/CUDA/GradientKernels.cu                 | 47 ++++++++++---------
 3 files changed, 37 insertions(+), 29 deletions(-)

diff --git a/include/clad/Differentiator/BuiltinDerivatives.h b/include/clad/Differentiator/BuiltinDerivatives.h
index 24fde4f8e..3467486cf 100644
--- a/include/clad/Differentiator/BuiltinDerivatives.h
+++ b/include/clad/Differentiator/BuiltinDerivatives.h
@@ -85,7 +85,7 @@ ValueAndPushforward<cudaError_t, cudaError_t> cudaDeviceSynchronize_pushforward()
 
 template <typename T>
 __global__ void atomicAdd_kernel(T* destPtr, T* srcPtr, size_t N) {
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
+  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
        i += blockDim.x * gridDim.x)
     atomicAdd(&destPtr[i], srcPtr[i]);
 }
@@ -95,7 +95,7 @@ void cudaMemcpy_pullback(T* destPtr, T* srcPtr, size_t count,
                          cudaMemcpyKind kind, T* d_destPtr, T* d_srcPtr,
                          size_t* d_count, cudaMemcpyKind* d_kind)
     __attribute__((host)) {
-  T* aux_destPtr;
+  T* aux_destPtr = nullptr;
   if (kind == cudaMemcpyDeviceToHost) {
     *d_kind = cudaMemcpyHostToDevice;
     cudaMalloc(&aux_destPtr, count);
@@ -103,6 +103,8 @@ void cudaMemcpy_pullback(T* destPtr, T* srcPtr, size_t count,
     *d_kind = cudaMemcpyDeviceToHost;
     aux_destPtr = (T*)malloc(count);
   }
+  cudaDeviceSynchronize(); // needed in case the user uses another stream for
+                           // kernel execution besides the default one
   cudaMemcpy(aux_destPtr, d_destPtr, count, *d_kind);
   size_t N = count / sizeof(T);
   if (kind == cudaMemcpyDeviceToHost) {
@@ -116,12 +118,14 @@ void cudaMemcpy_pullback(T* destPtr, T* srcPtr, size_t count,
     size_t numBlocks = std::min(maxBlocks, (N + numThreads - 1) / numThreads);
     custom_derivatives::atomicAdd_kernel<<<numBlocks, numThreads>>>(
         d_srcPtr, aux_destPtr, N);
-    cudaDeviceSynchronize(); // needed in case user uses another stream than the
-                             // default one
+    cudaDeviceSynchronize(); // needed in case the user uses another stream
+                             // for kernel execution besides the default one;
+                             // makes sure the data are updated before the
+                             // host continues with the rest of the code
     cudaFree(aux_destPtr);
   } else if (kind == cudaMemcpyHostToDevice) {
     // d_kind is device to host, so d_srcPtr is a host pointer
-    for (size_t i = 0; i < N; ++i)
+    for (size_t i = 0; i < N; i++)
       d_srcPtr[i] += aux_destPtr[i];
     free(aux_destPtr);
   }
diff --git a/lib/Differentiator/ReverseModeVisitor.cpp b/lib/Differentiator/ReverseModeVisitor.cpp
index 33cdc631c..ef41f42b2 100644
--- a/lib/Differentiator/ReverseModeVisitor.cpp
+++ b/lib/Differentiator/ReverseModeVisitor.cpp
@@ -632,8 +632,9 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
     if (!m_DiffReq.CUDAGlobalArgsIndexes.empty())
       for (auto index : m_DiffReq.CUDAGlobalArgsIndexes)
         m_CUDAGlobalArgs.emplace(m_Derivative->getParamDecl(index));
-    // If the function is a global kernel, all its parameters reside in the
-    // global memory of the GPU
+    // If the function is a global kernel, all the adjoint parameters reside
+    // in the global memory of the GPU. To simplify the handling, all of the
+    // kernel's parameters are added to the set.
    else if (m_DiffReq->hasAttr<clang::CUDAGlobalAttr>())
       for (auto* param : params)
         m_CUDAGlobalArgs.emplace(param);
diff --git a/test/CUDA/GradientKernels.cu b/test/CUDA/GradientKernels.cu
index f16d96017..05b90265b 100644
--- a/test/CUDA/GradientKernels.cu
+++ b/test/CUDA/GradientKernels.cu
@@ -427,9 +427,12 @@ void fn(double *out, double *in) {
 //CHECK-NEXT: }
 
 double fn_memory(double *out, double *in) {
-  kernel_call<<<1, 10>>>(out, in);
+  double *in_dev = nullptr;
+  cudaMalloc(&in_dev, 10 * sizeof(double));
+  cudaMemcpy(in_dev, in, 10 * sizeof(double), cudaMemcpyHostToDevice);
+  kernel_call<<<1, 10>>>(out, in_dev);
   cudaDeviceSynchronize();
-  double *out_host = (double*)malloc(10 * sizeof(double));
+  double *out_host = (double *)malloc(10 * sizeof(double));
   cudaMemcpy(out_host, out, 10 * sizeof(double), cudaMemcpyDeviceToHost);
   double res = 0;
   for (int i=0; i < 10; ++i) {
@@ -437,7 +440,7 @@ double fn_memory(double *out, double *in) {
   }
   free(out_host);
   cudaFree(out);
-  cudaFree(in);
+  cudaFree(in_dev);
   return res;
 }
 
@@ -445,7 +448,12 @@
 //CHECK-NEXT: int _d_i = 0;
 //CHECK-NEXT: int i = 0;
 //CHECK-NEXT: clad::tape _t1 = {};
-//CHECK-NEXT: kernel_call<<<1, 10>>>(out, in);
+//CHECK-NEXT: double *_d_in_dev = nullptr;
+//CHECK-NEXT: double *in_dev = nullptr;
+//CHECK-NEXT: cudaMalloc(&_d_in_dev, 10 * sizeof(double));
+//CHECK-NEXT: cudaMalloc(&in_dev, 10 * sizeof(double));
+//CHECK-NEXT: cudaMemcpy(in_dev, in, 10 * sizeof(double), cudaMemcpyHostToDevice);
+//CHECK-NEXT: kernel_call<<<1, 10>>>(out, in_dev);
 //CHECK-NEXT: cudaDeviceSynchronize();
 //CHECK-NEXT: double *_d_out_host = (double *)malloc(10 * sizeof(double));
 //CHECK-NEXT: double *out_host = (double *)malloc(10 * sizeof(double));
@@ -481,10 +489,16 @@ double fn_memory(double *out, double *in) {
 //CHECK-NEXT: clad::custom_derivatives::cudaMemcpy_pullback(out_host, out, 10 * sizeof(double), cudaMemcpyDeviceToHost, _d_out_host, _d_out, &_r0, &_r1);
 //CHECK-NEXT: }
 //CHECK-NEXT: kernel_call_pullback<<<1, 10>>>(out, in, _d_out, _d_in);
+//CHECK-NEXT: {
+//CHECK-NEXT: unsigned long _r0 = 0UL;
+//CHECK-NEXT: cudaMemcpyKind _r1 = static_cast<cudaMemcpyKind>(0U);
+//CHECK-NEXT: clad::custom_derivatives::cudaMemcpy_pullback(in_dev, in, 10 * sizeof(double), cudaMemcpyHostToDevice, _d_in_dev, _d_in, &_r0, &_r1);
+//CHECK-NEXT: }
 //CHECK-NEXT: free(out_host);
 //CHECK-NEXT: free(_d_out_host);
 //CHECK-NEXT: cudaFree(out);
-//CHECK-NEXT: cudaFree(in);
+//CHECK-NEXT: cudaFree(in_dev);
+//CHECK-NEXT: cudaFree(_d_in_dev);
 //CHECK-NEXT:}
 
 // CHECK: __attribute__((device)) void device_fn_pullback_1(double in, double val, double _d_y, double *_d_in, double *_d_val) {
@@ -564,7 +578,6 @@ double fn_memory(double *out, double *in) {
     else { \
       test.execute_kernel(grid, block, x, dx); \
     } \
-    cudaDeviceSynchronize(); \
     int *res = (int*)malloc(N * sizeof(int)); \
     cudaMemcpy(res, dx, N * sizeof(int), cudaMemcpyDeviceToHost); \
     for (int i = 0; i < (N - 1); i++) { \
@@ -602,7 +615,6 @@ double fn_memory(double *out, double *in) {
     } \
     int *res = (int*)malloc(N * sizeof(int)); \
     cudaMemcpy(res, dx, N * sizeof(int), cudaMemcpyDeviceToHost); \
-    cudaDeviceSynchronize(); \
     for (int i = 0; i < (N - 1); i++) { \
       printf("%d, ", res[i]); \
     } \
@@ -637,7 +649,6 @@ double fn_memory(double *out, double *in) {
     } \
     int *res = (int*)malloc(N * sizeof(int)); \
     cudaMemcpy(res, dx, N * sizeof(int), cudaMemcpyDeviceToHost); \
-    cudaDeviceSynchronize(); \
     for (int i = 0; i < (N - 1); i++) { \
       printf("%d, ", res[i]); \
     } \
@@ -672,7 +683,6 @@ double fn_memory(double *out, double *in) {
    } \
     double *res = (double*)malloc(N * sizeof(double)); \
     cudaMemcpy(res, dx, N * sizeof(double), cudaMemcpyDeviceToHost); \
-    cudaDeviceSynchronize(); \
     for (int i = 0; i < (N - 1); i++) { \
       printf("%0.2f, ", res[i]); \
     } \
@@ -748,8 +758,9 @@ int main(void) {
 
   auto test_device = clad::gradient(kernel_with_device_call, "out, val");
   test_device.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_val);
   double *res = (double*)malloc(10 * sizeof(double));
-  cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost);
-  cudaDeviceSynchronize();
+  cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost); // no need for synchronization before or after:
+                                                                  // the cudaMemcpy call is queued after the kernel call
+                                                                  // on the default stream, and cudaMemcpy itself is blocking
   printf("%0.2f\n", *res); // CHECK-EXEC: 50.00
 
   INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val);
@@ -757,7 +768,6 @@ int main(void) {
   auto test_device_2 = clad::gradient(kernel_with_device_call_2, "out, val");
   test_device_2.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_val);
   cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost);
-  cudaDeviceSynchronize();
   printf("%0.2f\n", *res); // CHECK-EXEC: 50.00
 
   INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val);
@@ -765,7 +775,6 @@ int main(void) {
   auto check_dup = clad::gradient(dup_kernel_with_device_call_2, "out, val"); // check that the pullback function is not regenerated
   check_dup.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_val);
   cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost);
-  cudaDeviceSynchronize();
   printf("%s\n", cudaGetErrorString(cudaGetLastError())); // CHECK-EXEC: no error
   printf("%0.2f\n", *res); // CHECK-EXEC: 50.00
 
@@ -773,7 +782,6 @@ int main(void) {
   auto test_device_3 = clad::gradient(kernel_with_device_call_2, "out, in");
   test_device_3.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_in_double);
-  cudaDeviceSynchronize();
   cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost);
   printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 5.00, 5.00, 5.00
 
@@ -781,9 +789,7 @@ int main(void) {
   auto test_device_4 = clad::gradient(kernel_with_device_call_3);
   test_device_4.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, val, d_out_double, d_in_double, d_val);
-  cudaDeviceSynchronize();
   cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost);
-  cudaDeviceSynchronize();
   printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 5.00, 5.00, 5.00
   cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost);
   printf("%0.2f\n", *res); // CHECK-EXEC: 50.00
@@ -792,7 +798,6 @@ int main(void) {
   auto test_kernel_call = clad::gradient(fn);
   test_kernel_call.execute(dummy_out_double, dummy_in_double, d_out_double, d_in_double);
-  cudaDeviceSynchronize();
   cudaMemcpy(res, d_in_double, sizeof(double), cudaMemcpyDeviceToHost);
   printf("%0.2f\n", *res); // CHECK-EXEC: 50.00
 
@@ -800,17 +805,14 @@ int main(void) {
   auto nested_device = clad::gradient(kernel_with_nested_device_call, "out, in");
   nested_device.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_in_double);
-  cudaDeviceSynchronize();
   cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost);
   printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 5.00, 5.00, 5.00
 
   INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val);
 
   auto test_memory = clad::gradient(fn_memory);
-  test_memory.execute(dummy_out_double, dummy_in_double, d_out_double, d_in_double);
-  cudaDeviceSynchronize();
-  cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost);
-  printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 60.00, 0.00, 0.00
+  test_memory.execute(dummy_out_double, fives, d_out_double, zeros);
+  printf("%0.2f, %0.2f, %0.2f\n", zeros[0], zeros[1], zeros[2]); // CHECK-EXEC: 60.00, 0.00, 0.00
 
   free(res);
   free(fives);
@@ -819,6 +821,7 @@ int main(void) {
   cudaFree(d_in_double);
   cudaFree(val);
   cudaFree(d_val);
+  cudaFree(dummy_in_double);
 
   return 0;
 }