Add tests
kchristin22 committed Oct 10, 2024
1 parent dbbe9a7 commit 00503d3
Showing 1 changed file with 221 additions and 26 deletions.
247 changes: 221 additions & 26 deletions test/CUDA/GradientKernels.cu
@@ -292,12 +292,12 @@ __device__ double device_fn(double in, double val) {
return in + val;
}

__global__ void device_pullback(double *in, double *out, double val) {
__global__ void kernel_with_device_call(double *out, double *in, double val) {
int index = threadIdx.x;
out[index] = device_fn(in[index], val);
}

// CHECK: void device_pullback_grad_1_2(double *in, double *out, double val, double *_d_out, double *_d_val) {
// CHECK: void kernel_with_device_call_grad_0_2(double *out, double *in, double val, double *_d_out, double *_d_val) {
//CHECK-NEXT: int _d_index = 0;
//CHECK-NEXT: int index0 = threadIdx.x;
//CHECK-NEXT: double _t0 = out[index0];
@@ -313,6 +313,100 @@ __global__ void device_pullback(double *in, double *out, double val) {
//CHECK-NEXT: }
//CHECK-NEXT:}
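
// [Editor's note, hedged] The string passed to clad::gradient selects which
// parameters are differentiated, and the generated overload encodes their
// positional indices in its name: "out, val" are parameters 0 and 2 of the
// kernel, hence `_grad_0_2` above. A minimal usage sketch, mirroring the
// execute_kernel calls in main() below (variable names taken from there):
//
//   auto grad = clad::gradient(kernel_with_device_call, "out, val");
//   grad.execute_kernel(dim3(1), dim3(10, 1, 1), y, x, 5, dy, d_val);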

__device__ double device_fn_2(double *in, double val) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
return in[index] + val;
}

__global__ void kernel_with_device_call_2(double *out, double *in, double val) {
int index = threadIdx.x;
out[index] = device_fn_2(in, val);
}

// CHECK: void kernel_with_device_call_2_grad_0_2(double *out, double *in, double val, double *_d_out, double *_d_val) {
//CHECK-NEXT: int _d_index = 0;
//CHECK-NEXT: int index0 = threadIdx.x;
//CHECK-NEXT: double _t0 = out[index0];
//CHECK-NEXT: out[index0] = device_fn_2(in, val);
//CHECK-NEXT: {
//CHECK-NEXT: out[index0] = _t0;
//CHECK-NEXT: double _r_d0 = _d_out[index0];
//CHECK-NEXT: _d_out[index0] = 0.;
//CHECK-NEXT: double _r0 = 0.;
//CHECK-NEXT: device_fn_2_pullback(in, val, _r_d0, &_r0);
//CHECK-NEXT: atomicAdd(_d_val, _r0);
//CHECK-NEXT: }
//CHECK-NEXT:}

// CHECK: void kernel_with_device_call_2_grad_0_1(double *out, double *in, double val, double *_d_out, double *_d_in) {
//CHECK-NEXT: double _d_val = 0.;
//CHECK-NEXT: int _d_index = 0;
//CHECK-NEXT: int index0 = threadIdx.x;
//CHECK-NEXT: double _t0 = out[index0];
//CHECK-NEXT: out[index0] = device_fn_2(in, val);
//CHECK-NEXT: {
//CHECK-NEXT: out[index0] = _t0;
//CHECK-NEXT: double _r_d0 = _d_out[index0];
//CHECK-NEXT: _d_out[index0] = 0.;
//CHECK-NEXT: double _r0 = 0.;
//CHECK-NEXT: device_fn_2_pullback(in, val, _r_d0, _d_in, &_r0);
//CHECK-NEXT: _d_val += _r0;
//CHECK-NEXT: }
//CHECK-NEXT:}
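
// [Editor's note, hedged] The two overloads above differ in how the adjoint
// of the scalar `val` is accumulated: in `_grad_0_2`, `_d_val` is a single
// device location shared by all threads, so the update must be atomic
// (`atomicAdd(_d_val, _r0)`); in `_grad_0_1`, `_d_val` is a thread-local
// double, so a plain `_d_val += _r0` is race-free.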

__device__ double device_fn_3(double *in, double *val) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
return in[index] + *val;
}

__global__ void kernel_with_device_call_3(double *out, double *in, double *val) {
int index = threadIdx.x;
out[index] = device_fn_3(in, val);
}

// CHECK: void kernel_with_device_call_3_grad(double *out, double *in, double *val, double *_d_out, double *_d_in, double *_d_val) {
//CHECK-NEXT: int _d_index = 0;
//CHECK-NEXT: int index0 = threadIdx.x;
//CHECK-NEXT: double _t0 = out[index0];
//CHECK-NEXT: out[index0] = device_fn_3(in, val);
//CHECK-NEXT: {
//CHECK-NEXT: out[index0] = _t0;
//CHECK-NEXT: double _r_d0 = _d_out[index0];
//CHECK-NEXT: _d_out[index0] = 0.;
//CHECK-NEXT: device_fn_3_pullback(in, val, _r_d0, _d_in, _d_val);
//CHECK-NEXT: }
//CHECK-NEXT:}
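
// [Editor's note, hedged] With `val` now a pointer parameter, its adjoint is
// itself a pointer that is threaded straight into the pullback; the
// accumulation happens inside `device_fn_3_pullback` (defined further down)
// via `atomicAdd(_d_val, _d_y)`, since every thread contributes to the same
// pointee.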

__device__ double device_fn_4(double *in, double val) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
return in[index] + val;
}

__device__ double device_with_device_call(double *in, double val) {
return device_fn_4(in, val);
}

__global__ void kernel_with_nested_device_call(double *out, double *in, double val) {
int index = threadIdx.x;
out[index] = device_with_device_call(in, val);
}

// CHECK: void kernel_with_nested_device_call_grad_0_1(double *out, double *in, double val, double *_d_out, double *_d_in) {
//CHECK-NEXT: double _d_val = 0.;
//CHECK-NEXT: int _d_index = 0;
//CHECK-NEXT: int index0 = threadIdx.x;
//CHECK-NEXT: double _t0 = out[index0];
//CHECK-NEXT: out[index0] = device_with_device_call(in, val);
//CHECK-NEXT: {
//CHECK-NEXT: out[index0] = _t0;
//CHECK-NEXT: double _r_d0 = _d_out[index0];
//CHECK-NEXT: _d_out[index0] = 0.;
//CHECK-NEXT: double _r0 = 0.;
//CHECK-NEXT: device_with_device_call_pullback(in, val, _r_d0, _d_in, &_r0);
//CHECK-NEXT: _d_val += _r0;
//CHECK-NEXT: }
//CHECK-NEXT:}
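
// [Editor's note, hedged] For the nested call, clad chains pullbacks: the
// kernel gradient calls `device_with_device_call_pullback`, which in turn
// calls `device_fn_4_pullback` (both emitted below), propagating the seed
// `_d_y` through each level of the call tree.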

__global__ void kernel_call(double *a, double *b) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
a[index] = *b;
Expand All @@ -322,9 +416,12 @@ void fn(double *out, double *in) {
kernel_call<<<1, 10>>>(out, in);
}

// CHECK: void fn_grad(double *out, double *in, double *_d_out, double *_d_in) {
//CHECK-NEXT: kernel_call<<<1, 10>>>(out, in);
//CHECK-NEXT: kernel_call_pullback<<<1, 10>>>(out, in, _d_out, _d_in);
//CHECK-NEXT: }
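
// [Editor's note, hedged] For a host function that only launches a kernel,
// the generated gradient replays the forward launch and then launches the
// kernel's pullback with the same execution configuration, passing the
// adjoint buffers as extra arguments.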

double fn_memory(double *out, double *in) {
cudaMalloc(&in, 10 * sizeof(double));
cudaMalloc(&out, 10 * sizeof(double));
kernel_call<<<1, 10>>>(out, in);
double *out_host = (double*)malloc(10 * sizeof(double));
cudaMemcpy(out_host, out, 10 * sizeof(double), cudaMemcpyDeviceToHost);
@@ -341,10 +438,6 @@ double fn_memory(double *out, double *in) {
//CHECK-NEXT: int _d_i = 0;
//CHECK-NEXT: int i = 0;
//CHECK-NEXT: clad::tape<double> _t1 = {};
//CHECK-NEXT: cudaMalloc(&_d_in, 10 * sizeof(double));
//CHECK-NEXT: cudaMalloc(&in, 10 * sizeof(double));
//CHECK-NEXT: cudaMalloc(&_d_out, 10 * sizeof(double));
//CHECK-NEXT: cudaMalloc(&out, 10 * sizeof(double));
//CHECK-NEXT: kernel_call<<<1, 10>>>(out, in);
//CHECK-NEXT: double *_d_out_host = (double *)malloc(10 * sizeof(double));
//CHECK-NEXT: double *out_host = (double *)malloc(10 * sizeof(double));
@@ -376,26 +469,51 @@ double fn_memory(double *out, double *in) {
//CHECK-NEXT: }
//CHECK-NEXT: {
//CHECK-NEXT: unsigned long _r0 = 0UL;
//CHECK-NEXT: cudaMemcpyKind _r1 = 0U;
//CHECK-NEXT: cudaMemcpyKind _r1 = static_cast<cudaMemcpyKind>(0U);
//CHECK-NEXT: clad::custom_derivatives::cudaMemcpy_pullback(out_host, out, 10 * sizeof(double), cudaMemcpyDeviceToHost, _d_out_host, _d_out, &_r0, &_r1);
//CHECK-NEXT: }
//CHECK-NEXT: kernel_call_pullback<<<1, 10>>>(out, in, _d_out, _d_in);
//CHECK-NEXT: cudaFree(out);
//CHECK-NEXT: cudaFree(in);
//CHECK-NEXT:}
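
// [Editor's note, hedged] `fn_memory_grad` ends by launching
// `kernel_call_pullback` with the forward configuration and then releasing
// the device buffers with `cudaFree(out)` / `cudaFree(in)`.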

// CHECK: void fn_grad(double *out, double *in, double *_d_out, double *_d_in) {
//CHECK-NEXT: kernel_call<<<1, 10>>>(out, in);
//CHECK-NEXT: kernel_call_pullback<<<1, 10>>>(out, in, _d_out, _d_in);
//CHECK-NEXT: }

// CHECK: __attribute__((device)) void device_fn_pullback(double in, double val, double _d_y, double *_d_in, double *_d_val) {
//CHECK-NEXT: {
//CHECK-NEXT: *_d_in += _d_y;
//CHECK-NEXT: *_d_val += _d_y;
//CHECK-NEXT: }
//CHECK-NEXT:}

// CHECK: __attribute__((device)) void device_fn_2_pullback(double *in, double val, double _d_y, double *_d_val) {
//CHECK-NEXT: unsigned int _t1 = blockIdx.x;
//CHECK-NEXT: unsigned int _t0 = blockDim.x;
//CHECK-NEXT: int _d_index = 0;
//CHECK-NEXT: int index0 = threadIdx.x + _t1 * _t0;
//CHECK-NEXT: *_d_val += _d_y;
//CHECK-NEXT:}

// CHECK: __attribute__((device)) void device_fn_2_pullback(double *in, double val, double _d_y, double *_d_in, double *_d_val) {
//CHECK-NEXT: unsigned int _t1 = blockIdx.x;
//CHECK-NEXT: unsigned int _t0 = blockDim.x;
//CHECK-NEXT: int _d_index = 0;
//CHECK-NEXT: int index0 = threadIdx.x + _t1 * _t0;
//CHECK-NEXT: {
//CHECK-NEXT: atomicAdd(&_d_in[index0], _d_y);
//CHECK-NEXT: *_d_val += _d_y;
//CHECK-NEXT: }
//CHECK-NEXT:}

// CHECK: __attribute__((device)) void device_fn_3_pullback(double *in, double *val, double _d_y, double *_d_in, double *_d_val) {
//CHECK-NEXT: unsigned int _t1 = blockIdx.x;
//CHECK-NEXT: unsigned int _t0 = blockDim.x;
//CHECK-NEXT: int _d_index = 0;
//CHECK-NEXT: int index0 = threadIdx.x + _t1 * _t0;
//CHECK-NEXT: {
//CHECK-NEXT: atomicAdd(&_d_in[index0], _d_y);
//CHECK-NEXT: atomicAdd(_d_val, _d_y);
//CHECK-NEXT: }
//CHECK-NEXT:}

// CHECK: __attribute__((global)) void kernel_call_pullback(double *a, double *b, double *_d_a, double *_d_b) {
//CHECK-NEXT: unsigned int _t1 = blockIdx.x;
//CHECK-NEXT: unsigned int _t0 = blockDim.x;
Expand All @@ -411,6 +529,25 @@ double fn_memory(double *out, double *in) {
//CHECK-NEXT: }
//CHECK-NEXT:}

// CHECK: __attribute__((device)) void device_with_device_call_pullback(double *in, double val, double _d_y, double *_d_in, double *_d_val) {
//CHECK-NEXT: {
//CHECK-NEXT: double _r0 = 0.;
//CHECK-NEXT: device_fn_4_pullback(in, val, _d_y, _d_in, &_r0);
//CHECK-NEXT: *_d_val += _r0;
//CHECK-NEXT: }
//CHECK-NEXT:}

// CHECK: __attribute__((device)) void device_fn_4_pullback(double *in, double val, double _d_y, double *_d_in, double *_d_val) {
//CHECK-NEXT: unsigned int _t1 = blockIdx.x;
//CHECK-NEXT: unsigned int _t0 = blockDim.x;
//CHECK-NEXT: int _d_index = 0;
//CHECK-NEXT: int index0 = threadIdx.x + _t1 * _t0;
//CHECK-NEXT: {
//CHECK-NEXT: atomicAdd(&_d_in[index0], _d_y);
//CHECK-NEXT: *_d_val += _d_y;
//CHECK-NEXT: }
//CHECK-NEXT:}
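
// [Editor's note, hedged] The pullbacks above share a common shape: the
// original arguments, then the incoming seed `_d_y` (the adjoint of the
// return value), then one adjoint out-parameter per requested input. When a
// parameter is not requested (e.g. `in` in the first `device_fn_2_pullback`
// overload), clad emits a narrower overload without that adjoint.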

#define TEST(F, grid, block, shared_mem, use_stream, x, dx, N) \
{ \
int *fives = (int*)malloc(N * sizeof(int)); \
@@ -470,6 +607,7 @@ double fn_memory(double *out, double *in) {
} \
int *res = (int*)malloc(N * sizeof(int)); \
cudaMemcpy(res, dx, N * sizeof(int), cudaMemcpyDeviceToHost); \
cudaDeviceSynchronize(); \
for (int i = 0; i < (N - 1); i++) { \
printf("%d, ", res[i]); \
} \
@@ -504,6 +642,7 @@ double fn_memory(double *out, double *in) {
} \
int *res = (int*)malloc(N * sizeof(int)); \
cudaMemcpy(res, dx, N * sizeof(int), cudaMemcpyDeviceToHost); \
cudaDeviceSynchronize(); \
for (int i = 0; i < (N - 1); i++) { \
printf("%d, ", res[i]); \
} \
@@ -538,6 +677,7 @@ double fn_memory(double *out, double *in) {
} \
double *res = (double*)malloc(N * sizeof(double)); \
cudaMemcpy(res, dx, N * sizeof(double), cudaMemcpyDeviceToHost); \
cudaDeviceSynchronize(); \
for (int i = 0; i < (N - 1); i++) { \
printf("%0.2f, ", res[i]); \
} \
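
// [Editor's note, hedged] The `cudaDeviceSynchronize()` placed after each
// device-to-host copy in the TEST macros guarantees that all preceding
// device work has completed and that any asynchronous launch error has
// surfaced before the host reads and prints `res`.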
@@ -619,41 +759,96 @@ int main(void) {
cudaMemcpy(dy, fives, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_val, zeros, sizeof(double), cudaMemcpyHostToDevice);

device_pullback<<<1, 10>>>(y, x, 5);
auto test_device = clad::gradient(kernel_with_device_call, "out, val");
test_device.execute_kernel(dim3(1), dim3(10, 1, 1), y, x, 5, dy, d_val);
double *res = (double*)malloc(10 * sizeof(double));
cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
printf("%s\n", cudaGetErrorString(cudaGetLastError())); // CHECK-EXEC: no error
printf("%0.2f\n", *res); // CHECK-EXEC: 50.00

auto test_device = clad::gradient(device_pullback, "out, val");
test_device.execute_kernel(dim3(1), dim3(10, 1, 1), x, y, 5, dy, d_val);
cudaDeviceSynchronize();
printf("%s\n", cudaGetErrorString(cudaGetLastError())); // CHECK-EXEC: no error
double *res = (double*)malloc(sizeof(double));
cudaMemcpy(x, fives, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(y, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(dx, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(dy, fives, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_val, zeros, sizeof(double), cudaMemcpyHostToDevice);

auto test_device_2 = clad::gradient(kernel_with_device_call_2, "out, val");
test_device_2.execute_kernel(dim3(1), dim3(10, 1, 1), y, x, 5, dy, d_val);
cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
printf("%0.2f\n", *res); // CHECK-EXEC: 50.00

cudaMemcpy(x, fives, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(y, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(dx, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(dy, fives, 10 * sizeof(double), cudaMemcpyHostToDevice);

auto test_device_3 = clad::gradient(kernel_with_device_call_2, "out, in");
test_device_3.execute_kernel(dim3(1), dim3(10, 1, 1), y, x, 5, dy, dx);
cudaDeviceSynchronize();
cudaMemcpy(res, dx, 10 * sizeof(double), cudaMemcpyDeviceToHost);
printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 5.00, 5.00, 5.00

cudaMemcpy(x, fives, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(y, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(dx, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(dy, fives, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_val, zeros, sizeof(double), cudaMemcpyHostToDevice);

double *val;
cudaMalloc(&val, sizeof(double));
cudaMemcpy(val, fives, sizeof(double), cudaMemcpyHostToDevice);

auto test_device_4 = clad::gradient(kernel_with_device_call_3);
test_device_4.execute_kernel(dim3(1), dim3(10, 1, 1), y, x, val, dy, dx, d_val);
cudaDeviceSynchronize();
cudaMemcpy(res, dx, 10 * sizeof(double), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 5.00, 5.00, 5.00
cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost);
printf("%0.2f\n", *res); // CHECK-EXEC: 50.00

cudaMemcpy(x, fives, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(y, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(dx, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(dy, fives, 10 * sizeof(double), cudaMemcpyHostToDevice);

auto test_kernel_call = clad::gradient(fn);
test_kernel_call.execute(y, x, dy, dx);
cudaDeviceSynchronize();
printf("%s\n", cudaGetErrorString(cudaGetLastError())); // CHECK-EXEC: no error
cudaMemcpy(res, dx, sizeof(double), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
printf("%0.2f\n", *res); // CHECK-EXEC: 50.00

cudaMemcpy(x, fives, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(y, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(dx, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(dy, fives, 10 * sizeof(double), cudaMemcpyHostToDevice);

auto nested_device = clad::gradient(kernel_with_nested_device_call, "out, in");
nested_device.execute_kernel(dim3(1), dim3(10, 1, 1), y, x, 5, dy, dx);
cudaDeviceSynchronize();
cudaMemcpy(res, dx, 10 * sizeof(double), cudaMemcpyDeviceToHost);
printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 5.00, 5.00, 5.00

cudaMemcpy(x, fives, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(y, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(dx, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(dy, fives, 10 * sizeof(double), cudaMemcpyHostToDevice);

auto test_memory = clad::gradient(fn_memory);
test_memory.execute(y, x, dy, dx);
cudaDeviceSynchronize();
printf("%s\n", cudaGetErrorString(cudaGetLastError())); // CHECK-EXEC: no error
cudaMemcpy(res, dx, 10 * sizeof(double), cudaMemcpyDeviceToHost);
printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 50.00, 0.00, 0.00

free(fives);
free(zeros);
free(res);
cudaFree(x);
cudaFree(y);
cudaFree(dx);
cudaFree(dy);
cudaFree(d_val);

auto test_memory = clad::gradient(fn_memory);

return 0;
}
