From cc0fc393003a6336b0d288f8bfe6acc29b789381 Mon Sep 17 00:00:00 2001
From: kchristin
Date: Mon, 21 Oct 2024 18:44:31 +0300
Subject: [PATCH] Remove unnecessary sync calls to GPU in tests and add them
 in cudaMemcpy_pullback to mimic its blocking behavior

---
 .../clad/Differentiator/BuiltinDerivatives.h | 14 ++++--
 lib/Differentiator/ReverseModeVisitor.cpp    |  5 +-
 test/CUDA/GradientKernels.cu                 | 47 ++++++++++---------
 3 files changed, 37 insertions(+), 29 deletions(-)

diff --git a/include/clad/Differentiator/BuiltinDerivatives.h b/include/clad/Differentiator/BuiltinDerivatives.h
index 24fde4f8e..3467486cf 100644
--- a/include/clad/Differentiator/BuiltinDerivatives.h
+++ b/include/clad/Differentiator/BuiltinDerivatives.h
@@ -85,7 +85,7 @@ ValueAndPushforward<cudaError_t, cudaError_t> cudaDeviceSynchronize_pushforward()
 
 template <typename T>
 __global__ void atomicAdd_kernel(T* destPtr, T* srcPtr, size_t N) {
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
+  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
        i += blockDim.x * gridDim.x)
     atomicAdd(&destPtr[i], srcPtr[i]);
 }
@@ -95,7 +95,7 @@ void cudaMemcpy_pullback(T* destPtr, T* srcPtr, size_t count,
                          cudaMemcpyKind kind, T* d_destPtr, T* d_srcPtr,
                          size_t* d_count, cudaMemcpyKind* d_kind)
     __attribute__((host)) {
-  T* aux_destPtr;
+  T* aux_destPtr = nullptr;
   if (kind == cudaMemcpyDeviceToHost) {
     *d_kind = cudaMemcpyHostToDevice;
     cudaMalloc(&aux_destPtr, count);
@@ -103,6 +103,8 @@ void cudaMemcpy_pullback(T* destPtr, T* srcPtr, size_t count,
     *d_kind = cudaMemcpyDeviceToHost;
     aux_destPtr = (T*)malloc(count);
   }
+  cudaDeviceSynchronize(); // needed in case the user uses another stream for
+                           // kernel execution besides the default one
   cudaMemcpy(aux_destPtr, d_destPtr, count, *d_kind);
   size_t N = count / sizeof(T);
   if (kind == cudaMemcpyDeviceToHost) {
@@ -116,12 +118,14 @@ void cudaMemcpy_pullback(T* destPtr, T* srcPtr, size_t count,
     size_t numBlocks = std::min(maxBlocks, (N + numThreads - 1) / numThreads);
     custom_derivatives::atomicAdd_kernel<<<numBlocks, numThreads>>>(
         d_srcPtr, aux_destPtr, N);
-    cudaDeviceSynchronize(); // needed in case user uses another stream than the
-                             // default one
+    cudaDeviceSynchronize(); // needed in case the user uses another stream
+                             // for kernel execution besides the default one;
+                             // makes sure the data are updated before the
+                             // host continues with the rest of the code
     cudaFree(aux_destPtr);
   } else if (kind == cudaMemcpyHostToDevice) {
     // d_kind is device to host, so d_srcPtr is a host pointer
-    for (size_t i = 0; i < N; ++i)
+    for (size_t i = 0; i < N; i++)
       d_srcPtr[i] += aux_destPtr[i];
     free(aux_destPtr);
   }
diff --git a/lib/Differentiator/ReverseModeVisitor.cpp b/lib/Differentiator/ReverseModeVisitor.cpp
index 33cdc631c..ef41f42b2 100644
--- a/lib/Differentiator/ReverseModeVisitor.cpp
+++ b/lib/Differentiator/ReverseModeVisitor.cpp
@@ -632,8 +632,9 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
     if (!m_DiffReq.CUDAGlobalArgsIndexes.empty())
       for (auto index : m_DiffReq.CUDAGlobalArgsIndexes)
         m_CUDAGlobalArgs.emplace(m_Derivative->getParamDecl(index));
-    // If the function is a global kernel, all its parameters reside in the
-    // global memory of the GPU
+    // If the function is a global kernel, all the adjoint parameters reside
+    // in the global memory of the GPU. To simplify the handling, all of the
+    // kernel's parameters are added to the set.
    else if (m_DiffReq->hasAttr<clang::CUDAGlobalAttr>())
       for (auto* param : params)
         m_CUDAGlobalArgs.emplace(param);
diff --git a/test/CUDA/GradientKernels.cu b/test/CUDA/GradientKernels.cu
index f16d96017..05b90265b 100644
--- a/test/CUDA/GradientKernels.cu
+++ b/test/CUDA/GradientKernels.cu
@@ -427,9 +427,12 @@ void fn(double *out, double *in) {
 //CHECK-NEXT: }
 
 double fn_memory(double *out, double *in) {
-  kernel_call<<<1, 10>>>(out, in);
+  double *in_dev = nullptr;
+  cudaMalloc(&in_dev, 10 * sizeof(double));
+  cudaMemcpy(in_dev, in, 10 * sizeof(double), cudaMemcpyHostToDevice);
+  kernel_call<<<1, 10>>>(out, in_dev);
   cudaDeviceSynchronize();
-  double *out_host = (double*)malloc(10 * sizeof(double));
+  double *out_host = (double *)malloc(10 * sizeof(double));
   cudaMemcpy(out_host, out, 10 * sizeof(double), cudaMemcpyDeviceToHost);
   double res = 0;
   for (int i=0; i < 10; ++i) {
@@ -437,7 +440,7 @@ double fn_memory(double *out, double *in) {
   }
   free(out_host);
   cudaFree(out);
-  cudaFree(in);
+  cudaFree(in_dev);
   return res;
 }
 
@@ -445,7 +448,12 @@
 //CHECK-NEXT: int _d_i = 0;
 //CHECK-NEXT: int i = 0;
 //CHECK-NEXT: clad::tape _t1 = {};
-//CHECK-NEXT: kernel_call<<<1, 10>>>(out, in);
+//CHECK-NEXT: double *_d_in_dev = nullptr;
+//CHECK-NEXT: double *in_dev = nullptr;
+//CHECK-NEXT: cudaMalloc(&_d_in_dev, 10 * sizeof(double));
+//CHECK-NEXT: cudaMalloc(&in_dev, 10 * sizeof(double));
+//CHECK-NEXT: cudaMemcpy(in_dev, in, 10 * sizeof(double), cudaMemcpyHostToDevice);
+//CHECK-NEXT: kernel_call<<<1, 10>>>(out, in_dev);
 //CHECK-NEXT: cudaDeviceSynchronize();
 //CHECK-NEXT: double *_d_out_host = (double *)malloc(10 * sizeof(double));
 //CHECK-NEXT: double *out_host = (double *)malloc(10 * sizeof(double));
@@ -481,10 +489,16 @@ double fn_memory(double *out, double *in) {
 //CHECK-NEXT: clad::custom_derivatives::cudaMemcpy_pullback(out_host, out, 10 * sizeof(double), cudaMemcpyDeviceToHost, _d_out_host, _d_out, &_r0, &_r1);
 //CHECK-NEXT: }
 //CHECK-NEXT: kernel_call_pullback<<<1, 10>>>(out, in, _d_out, _d_in);
+//CHECK-NEXT: {
+//CHECK-NEXT: unsigned long _r0 = 0UL;
+//CHECK-NEXT: cudaMemcpyKind _r1 = static_cast<cudaMemcpyKind>(0U);
+//CHECK-NEXT: clad::custom_derivatives::cudaMemcpy_pullback(in_dev, in, 10 * sizeof(double), cudaMemcpyHostToDevice, _d_in_dev, _d_in, &_r0, &_r1);
+//CHECK-NEXT: }
 //CHECK-NEXT: free(out_host);
 //CHECK-NEXT: free(_d_out_host);
 //CHECK-NEXT: cudaFree(out);
-//CHECK-NEXT: cudaFree(in);
+//CHECK-NEXT: cudaFree(in_dev);
+//CHECK-NEXT: cudaFree(_d_in_dev);
 //CHECK-NEXT:}
 
 // CHECK: __attribute__((device)) void device_fn_pullback_1(double in, double val, double _d_y, double *_d_in, double *_d_val) {
@@ -564,7 +578,6 @@ double fn_memory(double *out, double *in) {
     else { \
       test.execute_kernel(grid, block, x, dx); \
     } \
-    cudaDeviceSynchronize(); \
     int *res = (int*)malloc(N * sizeof(int)); \
     cudaMemcpy(res, dx, N * sizeof(int), cudaMemcpyDeviceToHost); \
     for (int i = 0; i < (N - 1); i++) { \
@@ -602,7 +615,6 @@ double fn_memory(double *out, double *in) {
     } \
     int *res = (int*)malloc(N * sizeof(int)); \
     cudaMemcpy(res, dx, N * sizeof(int), cudaMemcpyDeviceToHost); \
-    cudaDeviceSynchronize(); \
     for (int i = 0; i < (N - 1); i++) { \
       printf("%d, ", res[i]); \
     } \
@@ -637,7 +649,6 @@ double fn_memory(double *out, double *in) {
     } \
     int *res = (int*)malloc(N * sizeof(int)); \
     cudaMemcpy(res, dx, N * sizeof(int), cudaMemcpyDeviceToHost); \
-    cudaDeviceSynchronize(); \
     for (int i = 0; i < (N - 1); i++) { \
       printf("%d, ", res[i]); \
     } \
@@ -672,7 +683,6 @@ double fn_memory(double *out, double *in) {
    } \
     double *res = (double*)malloc(N * sizeof(double)); \
     cudaMemcpy(res, dx, N * sizeof(double), cudaMemcpyDeviceToHost); \
-    cudaDeviceSynchronize(); \
     for (int i = 0; i < (N - 1); i++) { \
       printf("%0.2f, ", res[i]); \
     } \
@@ -748,8 +758,9 @@ int main(void) {
 
   auto test_device = clad::gradient(kernel_with_device_call, "out, val");
   test_device.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_val);
   double *res = (double*)malloc(10 * sizeof(double));
-  cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost);
-  cudaDeviceSynchronize();
+  cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost); // no need for synchronization before or after:
+                                                                  // the cudaMemcpy call is queued after the kernel call
+                                                                  // on the default stream, and cudaMemcpy itself is blocking
   printf("%0.2f\n", *res); // CHECK-EXEC: 50.00
 
   INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val);
@@ -757,7 +768,6 @@ int main(void) {
   auto test_device_2 = clad::gradient(kernel_with_device_call_2, "out, val");
   test_device_2.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_val);
   cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost);
-  cudaDeviceSynchronize();
   printf("%0.2f\n", *res); // CHECK-EXEC: 50.00
 
   INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val);
@@ -765,7 +775,6 @@ int main(void) {
   auto check_dup = clad::gradient(dup_kernel_with_device_call_2, "out, val"); // check that the pullback function is not regenerated
   check_dup.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_val);
   cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost);
-  cudaDeviceSynchronize();
   printf("%s\n", cudaGetErrorString(cudaGetLastError())); // CHECK-EXEC: no error
   printf("%0.2f\n", *res); // CHECK-EXEC: 50.00
 
@@ -773,7 +782,6 @@ int main(void) {
   auto test_device_3 = clad::gradient(kernel_with_device_call_2, "out, in");
   test_device_3.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_in_double);
-  cudaDeviceSynchronize();
   cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost);
   printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 5.00, 5.00, 5.00
 
@@ -781,9 +789,7 @@ int main(void) {
   auto test_device_4 = clad::gradient(kernel_with_device_call_3);
   test_device_4.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, val, d_out_double, d_in_double, d_val);
-  cudaDeviceSynchronize();
   cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost);
-  cudaDeviceSynchronize();
   printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 5.00, 5.00, 5.00
   cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost);
   printf("%0.2f\n", *res); // CHECK-EXEC: 50.00
@@ -792,7 +798,6 @@ int main(void) {
   auto test_kernel_call = clad::gradient(fn);
   test_kernel_call.execute(dummy_out_double, dummy_in_double, d_out_double, d_in_double);
-  cudaDeviceSynchronize();
   cudaMemcpy(res, d_in_double, sizeof(double), cudaMemcpyDeviceToHost);
   printf("%0.2f\n", *res); // CHECK-EXEC: 50.00
 
@@ -800,17 +805,14 @@ int main(void) {
   auto nested_device = clad::gradient(kernel_with_nested_device_call, "out, in");
   nested_device.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_in_double);
-  cudaDeviceSynchronize();
   cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost);
   printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 5.00, 5.00, 5.00
 
   INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val);
 
   auto test_memory = clad::gradient(fn_memory);
-  test_memory.execute(dummy_out_double, dummy_in_double, d_out_double, d_in_double);
-  cudaDeviceSynchronize();
-  cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost);
-  printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 60.00, 0.00, 0.00
+  test_memory.execute(dummy_out_double, fives, d_out_double, zeros);
+  printf("%0.2f, %0.2f, %0.2f\n", zeros[0], zeros[1], zeros[2]); // CHECK-EXEC: 60.00, 0.00, 0.00
 
   free(res);
   free(fives);
@@ -819,6 +821,7 @@ int main(void) {
   cudaFree(d_in_double);
   cudaFree(val);
   cudaFree(d_val);
+  cudaFree(dummy_in_double);
 
   return 0;
 }