
Commit

Lose unnecessary sync calls to GPU in tests and add them in cudaMemcpy_pullback to mimic its blocking behavior
kchristin22 authored and vgvassilev committed Oct 27, 2024
1 parent 176aeba commit cc0fc39
Showing 3 changed files with 37 additions and 29 deletions.
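
The rationale, in short: a plain cudaMemcpy issued on the default stream blocks the host and already waits for work previously enqueued on that stream, so the tests' explicit cudaDeviceSynchronize() calls right before such copies were redundant; inside cudaMemcpy_pullback, however, the user may have launched kernels on a different stream, so the pullback now synchronizes the whole device before touching the adjoint buffers. The sketch below only illustrates that distinction and is not part of the commit; the scale kernel, buffer contents, and stream flags are invented for illustration.

// Hedged sketch, not part of the commit: contrasts the two situations.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void scale(double *x, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    x[i] *= 2.0;
}

int main() {
  const int n = 10;
  double host[n] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
  double *dev = nullptr;
  cudaMalloc(&dev, n * sizeof(double));
  cudaMemcpy(dev, host, n * sizeof(double), cudaMemcpyHostToDevice);

  // Case 1: kernel on the default stream. The following cudaMemcpy is queued
  // after it on the same stream and blocks the host until the copy is done,
  // so an extra cudaDeviceSynchronize() before it adds nothing. This is the
  // pattern the commit removes from the tests.
  scale<<<1, n>>>(dev, n);
  cudaMemcpy(host, dev, n * sizeof(double), cudaMemcpyDeviceToHost);

  // Case 2: kernel on a user-created, non-blocking stream. A default-stream
  // copy is not ordered after that kernel, so a device-wide synchronization
  // is the conservative way to observe its writes. This is the situation
  // cudaMemcpy_pullback now guards against.
  cudaStream_t s;
  cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking);
  scale<<<1, n, 0, s>>>(dev, n);
  cudaDeviceSynchronize(); // make the stream's writes visible before copying
  cudaMemcpy(host, dev, n * sizeof(double), cudaMemcpyDeviceToHost);

  printf("%0.2f\n", host[0]); // 4.00 after being doubled twice
  cudaStreamDestroy(s);
  cudaFree(dev);
  return 0;
}
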
14 changes: 9 additions & 5 deletions include/clad/Differentiator/BuiltinDerivatives.h
@@ -85,7 +85,7 @@ ValueAndPushforward<int, int> cudaDeviceSynchronize_pushforward()

template <typename T>
__global__ void atomicAdd_kernel(T* destPtr, T* srcPtr, size_t N) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
i += blockDim.x * gridDim.x)
atomicAdd(&destPtr[i], srcPtr[i]);
}
@@ -95,14 +95,16 @@ void cudaMemcpy_pullback(T* destPtr, T* srcPtr, size_t count,
cudaMemcpyKind kind, T* d_destPtr, T* d_srcPtr,
size_t* d_count, cudaMemcpyKind* d_kind)
__attribute__((host)) {
T* aux_destPtr;
T* aux_destPtr = nullptr;
if (kind == cudaMemcpyDeviceToHost) {
*d_kind = cudaMemcpyHostToDevice;
cudaMalloc(&aux_destPtr, count);
} else if (kind == cudaMemcpyHostToDevice) {
*d_kind = cudaMemcpyDeviceToHost;
aux_destPtr = (T*)malloc(count);
}
cudaDeviceSynchronize(); // needed in case user uses another stream for
// kernel execution besides the default one
cudaMemcpy(aux_destPtr, d_destPtr, count, *d_kind);
size_t N = count / sizeof(T);
if (kind == cudaMemcpyDeviceToHost) {
@@ -116,12 +118,14 @@ void cudaMemcpy_pullback(T* destPtr, T* srcPtr, size_t count,
size_t numBlocks = std::min(maxBlocks, (N + numThreads - 1) / numThreads);
custom_derivatives::atomicAdd_kernel<<<numBlocks, numThreads>>>(
d_srcPtr, aux_destPtr, N);
cudaDeviceSynchronize(); // needed in case user uses another stream than the
// default one
cudaDeviceSynchronize(); // needed in case the user uses another stream for
// kernel execution besides the default one, so we
// need to make sure the data are updated before
// continuing with the rest of the code
cudaFree(aux_destPtr);
} else if (kind == cudaMemcpyHostToDevice) {
// d_kind is device to host, so d_srcPtr is a host pointer
for (size_t i = 0; i < N; ++i)
for (size_t i = 0; i < N; i++)
d_srcPtr[i] += aux_destPtr[i];
free(aux_destPtr);
}
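
For context on the accumulation step of cudaMemcpy_pullback: when the original copy was device-to-host, the destination's adjoint must be added into the source's adjoint on the device, which is what the grid-stride atomicAdd_kernel above does; switching its loop index from int to size_t keeps the comparison against the size_t count well-defined for large buffers. Below is a hedged, standalone sketch of the same pattern, not the clad header itself; the accumulate kernel name, block/thread counts, and buffer size are illustrative.

#include <algorithm>
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

// Same grid-stride accumulation shape as atomicAdd_kernel; atomicAdd on
// double requires a GPU of compute capability 6.0 or newer.
__global__ void accumulate(double *dst, const double *src, size_t N) {
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
       i += blockDim.x * gridDim.x)
    atomicAdd(&dst[i], src[i]);
}

int main() {
  const size_t N = 1000;
  std::vector<double> ones(N, 1.0), out(N, 0.0);
  double *d_dst = nullptr, *d_src = nullptr;
  cudaMalloc(&d_dst, N * sizeof(double));
  cudaMalloc(&d_src, N * sizeof(double));
  cudaMemcpy(d_dst, out.data(), N * sizeof(double), cudaMemcpyHostToDevice);
  cudaMemcpy(d_src, ones.data(), N * sizeof(double), cudaMemcpyHostToDevice);

  const size_t numThreads = 256;
  const size_t maxBlocks = 128; // illustrative cap, mirroring the std::min above
  size_t numBlocks = std::min(maxBlocks, (N + numThreads - 1) / numThreads);
  accumulate<<<numBlocks, numThreads>>>(d_dst, d_src, N);

  // The device-to-host copy below runs on the default stream after the kernel
  // and blocks the host, so no explicit synchronization is needed here; the
  // pullback only adds one because the user's kernels may run on another stream.
  cudaMemcpy(out.data(), d_dst, N * sizeof(double), cudaMemcpyDeviceToHost);
  printf("%0.2f %0.2f\n", out.front(), out.back()); // 1.00 1.00
  cudaFree(d_dst);
  cudaFree(d_src);
  return 0;
}
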
5 changes: 3 additions & 2 deletions lib/Differentiator/ReverseModeVisitor.cpp
@@ -632,8 +632,9 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
if (!m_DiffReq.CUDAGlobalArgsIndexes.empty())
for (auto index : m_DiffReq.CUDAGlobalArgsIndexes)
m_CUDAGlobalArgs.emplace(m_Derivative->getParamDecl(index));
// If the function is a global kernel, all its parameters reside in the
// global memory of the GPU
// if the function is a global kernel, all the adjoint parameters reside in
// the global memory of the GPU. To facilitate the process, all the params
// of the kernel are added to the set.
else if (m_DiffReq->hasAttr<clang::CUDAGlobalAttr>())
for (auto* param : params)
m_CUDAGlobalArgs.emplace(param);
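
The ReverseModeVisitor change records that, for a global kernel, every parameter is a pointer into GPU global memory, so all of the kernel's parameters are added to m_CUDAGlobalArgs and their adjoints are handled as device buffers. Below is a hedged sketch of what that means on the user side, modeled on the patterns in GradientKernels.cu; the add_kernel function and the seeding values are invented for illustration, and the exact generated signature may differ.

#include <cstdio>
#include "clad/Differentiator/Differentiator.h"

// A trivial kernel: every parameter is a pointer into GPU global memory.
__global__ void add_kernel(double *out, double *in) {
  out[threadIdx.x] += in[threadIdx.x];
}

int main() {
  const int N = 10;
  double ones[N], zeros[N] = {0};
  for (int i = 0; i < N; ++i)
    ones[i] = 1.0;

  // All buffers, including the adjoints, are device allocations.
  double *out = nullptr, *in = nullptr, *d_out = nullptr, *d_in = nullptr;
  cudaMalloc(&out, N * sizeof(double));
  cudaMalloc(&in, N * sizeof(double));
  cudaMalloc(&d_out, N * sizeof(double));
  cudaMalloc(&d_in, N * sizeof(double));
  cudaMemcpy(out, zeros, N * sizeof(double), cudaMemcpyHostToDevice);
  cudaMemcpy(in, ones, N * sizeof(double), cudaMemcpyHostToDevice);
  cudaMemcpy(d_out, ones, N * sizeof(double), cudaMemcpyHostToDevice); // seed
  cudaMemcpy(d_in, zeros, N * sizeof(double), cudaMemcpyHostToDevice);

  // The generated gradient of a __global__ kernel is itself a kernel, so it
  // is launched through execute_kernel with explicit grid and block sizes.
  auto grad = clad::gradient(add_kernel, "out, in");
  grad.execute_kernel(dim3(1), dim3(N), out, in, d_out, d_in);

  double res[N];
  cudaMemcpy(res, d_in, N * sizeof(double), cudaMemcpyDeviceToHost); // blocking
  printf("%0.2f\n", res[0]); // d out[i] / d in[i] = 1, so 1.00 is expected
  cudaFree(out);
  cudaFree(in);
  cudaFree(d_out);
  cudaFree(d_in);
  return 0;
}
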
47 changes: 25 additions & 22 deletions test/CUDA/GradientKernels.cu
@@ -427,25 +427,33 @@ void fn(double *out, double *in) {
//CHECK-NEXT: }

double fn_memory(double *out, double *in) {
kernel_call<<<1, 10>>>(out, in);
double *in_dev = nullptr;
cudaMalloc(&in_dev, 10 * sizeof(double));
cudaMemcpy(in_dev, in, 10 * sizeof(double), cudaMemcpyHostToDevice);
kernel_call<<<1, 10>>>(out, in_dev);
cudaDeviceSynchronize();
double *out_host = (double*)malloc(10 * sizeof(double));
double *out_host = (double *)malloc(10 * sizeof(double));
cudaMemcpy(out_host, out, 10 * sizeof(double), cudaMemcpyDeviceToHost);
double res = 0;
for (int i=0; i < 10; ++i) {
res += out_host[i];
}
free(out_host);
cudaFree(out);
cudaFree(in);
cudaFree(in_dev);
return res;
}

// CHECK: void fn_memory_grad(double *out, double *in, double *_d_out, double *_d_in) {
//CHECK-NEXT: int _d_i = 0;
//CHECK-NEXT: int i = 0;
//CHECK-NEXT: clad::tape<double> _t1 = {};
//CHECK-NEXT: kernel_call<<<1, 10>>>(out, in);
//CHECK-NEXT: double *_d_in_dev = nullptr;
//CHECK-NEXT: double *in_dev = nullptr;
//CHECK-NEXT: cudaMalloc(&_d_in_dev, 10 * sizeof(double));
//CHECK-NEXT: cudaMalloc(&in_dev, 10 * sizeof(double));
//CHECK-NEXT: cudaMemcpy(in_dev, in, 10 * sizeof(double), cudaMemcpyHostToDevice);
//CHECK-NEXT: kernel_call<<<1, 10>>>(out, in_dev);
//CHECK-NEXT: cudaDeviceSynchronize();
//CHECK-NEXT: double *_d_out_host = (double *)malloc(10 * sizeof(double));
//CHECK-NEXT: double *out_host = (double *)malloc(10 * sizeof(double));
@@ -481,10 +489,16 @@ double fn_memory(double *out, double *in) {
//CHECK-NEXT: clad::custom_derivatives::cudaMemcpy_pullback(out_host, out, 10 * sizeof(double), cudaMemcpyDeviceToHost, _d_out_host, _d_out, &_r0, &_r1);
//CHECK-NEXT: }
//CHECK-NEXT: kernel_call_pullback<<<1, 10>>>(out, in, _d_out, _d_in);
//CHECK-NEXT: {
//CHECK-NEXT: unsigned long _r0 = 0UL;
//CHECK-NEXT: cudaMemcpyKind _r1 = static_cast<cudaMemcpyKind>(0U);
//CHECK-NEXT: clad::custom_derivatives::cudaMemcpy_pullback(in_dev, in, 10 * sizeof(double), cudaMemcpyHostToDevice, _d_in_dev, _d_in, &_r0, &_r1);
//CHECK-NEXT: }
//CHECK-NEXT: free(out_host);
//CHECK-NEXT: free(_d_out_host);
//CHECK-NEXT: cudaFree(out);
//CHECK-NEXT: cudaFree(in);
//CHECK-NEXT: cudaFree(in_dev);
//CHECK-NEXT: cudaFree(_d_in_dev);
//CHECK-NEXT:}

// CHECK: __attribute__((device)) void device_fn_pullback_1(double in, double val, double _d_y, double *_d_in, double *_d_val) {
Expand Down Expand Up @@ -564,7 +578,6 @@ double fn_memory(double *out, double *in) {
else { \
test.execute_kernel(grid, block, x, dx); \
} \
cudaDeviceSynchronize(); \
int *res = (int*)malloc(N * sizeof(int)); \
cudaMemcpy(res, dx, N * sizeof(int), cudaMemcpyDeviceToHost); \
for (int i = 0; i < (N - 1); i++) { \
@@ -602,7 +615,6 @@ double fn_memory(double *out, double *in) {
} \
int *res = (int*)malloc(N * sizeof(int)); \
cudaMemcpy(res, dx, N * sizeof(int), cudaMemcpyDeviceToHost); \
cudaDeviceSynchronize(); \
for (int i = 0; i < (N - 1); i++) { \
printf("%d, ", res[i]); \
} \
@@ -637,7 +649,6 @@ double fn_memory(double *out, double *in) {
} \
int *res = (int*)malloc(N * sizeof(int)); \
cudaMemcpy(res, dx, N * sizeof(int), cudaMemcpyDeviceToHost); \
cudaDeviceSynchronize(); \
for (int i = 0; i < (N - 1); i++) { \
printf("%d, ", res[i]); \
} \
@@ -672,7 +683,6 @@ double fn_memory(double *out, double *in) {
} \
double *res = (double*)malloc(N * sizeof(double)); \
cudaMemcpy(res, dx, N * sizeof(double), cudaMemcpyDeviceToHost); \
cudaDeviceSynchronize(); \
for (int i = 0; i < (N - 1); i++) { \
printf("%0.2f, ", res[i]); \
} \
@@ -748,42 +758,38 @@ int main(void) {
auto test_device = clad::gradient(kernel_with_device_call, "out, val");
test_device.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_val);
double *res = (double*)malloc(10 * sizeof(double));
cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost); // no need for synchronization before or after,
// as the cudaMemcpy call is queued after the kernel call
// on the default stream and the cudaMemcpy call is blocking
printf("%0.2f\n", *res); // CHECK-EXEC: 50.00

INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val);

auto test_device_2 = clad::gradient(kernel_with_device_call_2, "out, val");
test_device_2.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_val);
cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
printf("%0.2f\n", *res); // CHECK-EXEC: 50.00

INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val);

auto check_dup = clad::gradient(dup_kernel_with_device_call_2, "out, val"); // check that the pullback function is not regenerated
check_dup.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_val);
cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
printf("%s\n", cudaGetErrorString(cudaGetLastError())); // CHECK-EXEC: no error
printf("%0.2f\n", *res); // CHECK-EXEC: 50.00

INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val);

auto test_device_3 = clad::gradient(kernel_with_device_call_2, "out, in");
test_device_3.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_in_double);
cudaDeviceSynchronize();
cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost);
printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 5.00, 5.00, 5.00

INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val);

auto test_device_4 = clad::gradient(kernel_with_device_call_3);
test_device_4.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, val, d_out_double, d_in_double, d_val);
cudaDeviceSynchronize();
cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 5.00, 5.00, 5.00
cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost);
printf("%0.2f\n", *res); // CHECK-EXEC: 50.00
@@ -792,25 +798,21 @@

auto test_kernel_call = clad::gradient(fn);
test_kernel_call.execute(dummy_out_double, dummy_in_double, d_out_double, d_in_double);
cudaDeviceSynchronize();
cudaMemcpy(res, d_in_double, sizeof(double), cudaMemcpyDeviceToHost);
printf("%0.2f\n", *res); // CHECK-EXEC: 50.00

INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val);

auto nested_device = clad::gradient(kernel_with_nested_device_call, "out, in");
nested_device.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_in_double);
cudaDeviceSynchronize();
cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost);
printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 5.00, 5.00, 5.00

INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val);

auto test_memory = clad::gradient(fn_memory);
test_memory.execute(dummy_out_double, dummy_in_double, d_out_double, d_in_double);
cudaDeviceSynchronize();
cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost);
printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 60.00, 0.00, 0.00
test_memory.execute(dummy_out_double, fives, d_out_double, zeros);
printf("%0.2f, %0.2f, %0.2f\n", zeros[0], zeros[1], zeros[2]); // CHECK-EXEC: 60.00, 0.00, 0.00

free(res);
free(fives);
@@ -819,6 +821,7 @@
cudaFree(d_in_double);
cudaFree(val);
cudaFree(d_val);
cudaFree(dummy_in_double);

return 0;
}
