From ef0f784e68f22f5229d8c08f302df2983df6d185 Mon Sep 17 00:00:00 2001
From: kchristin
Date: Mon, 21 Oct 2024 18:24:02 +0300
Subject: [PATCH] Make cudaMemcpy a plus-assign op

---
 .../clad/Differentiator/BuiltinDerivatives.h  | 45 ++++++++++++++++---
 test/CUDA/GradientKernels.cu                  |  2 +-
 2 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/include/clad/Differentiator/BuiltinDerivatives.h b/include/clad/Differentiator/BuiltinDerivatives.h
index 1d3f96aba..df5ebbac8 100644
--- a/include/clad/Differentiator/BuiltinDerivatives.h
+++ b/include/clad/Differentiator/BuiltinDerivatives.h
@@ -83,15 +83,48 @@ ValueAndPushforward<cudaError_t, cudaError_t> cudaDeviceSynchronize_pushforward()
   return {cudaDeviceSynchronize(), 0};
 }
 
-void cudaMemcpy_pullback(void* destPtr, void* srcPtr, size_t count,
-                         cudaMemcpyKind kind, void* d_destPtr, void* d_srcPtr,
-                         size_t* d_count, cudaMemcpyKind* d_kind)
+template <typename T>
+__global__ void atomicAdd_kernel(T* destPtr, T* srcPtr, size_t N) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
+       i += blockDim.x * gridDim.x)
+    atomicAdd(&destPtr[i], srcPtr[i]);
+}
+
+template <typename T>
+void cudaMemcpy_pullback(T* destPtr, T* srcPtr, size_t count,
+                         cudaMemcpyKind kind, T* d_destPtr, T* d_srcPtr,
+                         size_t* d_count, cudaMemcpyKind* d_kind)
     __attribute__((host)) {
-  if (kind == cudaMemcpyDeviceToHost)
+  T* aux_destPtr;
+  if (kind == cudaMemcpyDeviceToHost) {
     *d_kind = cudaMemcpyHostToDevice;
-  else if (kind == cudaMemcpyHostToDevice)
+    cudaMalloc(&aux_destPtr, count);
+  } else if (kind == cudaMemcpyHostToDevice) {
     *d_kind = cudaMemcpyDeviceToHost;
-  cudaMemcpy(d_srcPtr, d_destPtr, count, *d_kind);
+    aux_destPtr = (T*)malloc(count);
+  }
+  cudaMemcpy(aux_destPtr, d_destPtr, count, *d_kind);
+  size_t N = count / sizeof(T);
+  if (kind == cudaMemcpyDeviceToHost) {
+    // d_kind is host to device, so d_srcPtr is a device pointer
+    cudaDeviceProp deviceProp;
+    cudaGetDeviceProperties(&deviceProp, 0);
+    size_t maxThreads = deviceProp.maxThreadsPerBlock;
+    size_t maxBlocks = deviceProp.maxGridSize[0];
+
+    size_t numThreads = std::min(maxThreads, N);
+    size_t numBlocks = std::min(maxBlocks, (N + numThreads - 1) / numThreads);
+    custom_derivatives::atomicAdd_kernel<<<numBlocks, numThreads>>>(
+        d_srcPtr, aux_destPtr, N);
+    cudaDeviceSynchronize();
+    cudaFree(aux_destPtr);
+  } else if (kind == cudaMemcpyHostToDevice) {
+    // d_kind is device to host, so d_srcPtr is a host pointer
+    for (size_t i = 0; i < N; ++i) {
+      d_srcPtr[i] += aux_destPtr[i];
+    }
+    free(aux_destPtr);
+  }
 }
 
 #endif
diff --git a/test/CUDA/GradientKernels.cu b/test/CUDA/GradientKernels.cu
index 87a8daa28..f16d96017 100644
--- a/test/CUDA/GradientKernels.cu
+++ b/test/CUDA/GradientKernels.cu
@@ -810,7 +810,7 @@ int main(void) {
   test_memory.execute(dummy_out_double, dummy_in_double, d_out_double, d_in_double);
   cudaDeviceSynchronize();
   cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost);
-  printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 10.00, 0.00, 0.00
+  printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 60.00, 0.00, 0.00
   free(res);
   free(fives);