From 00503d3c92e7a9a3d93e0e7dce9ac32bc4e921bf Mon Sep 17 00:00:00 2001 From: kchristin Date: Fri, 11 Oct 2024 01:16:16 +0300 Subject: [PATCH] Add tests --- test/CUDA/GradientKernels.cu | 247 +++++++++++++++++++++++++++++++---- 1 file changed, 221 insertions(+), 26 deletions(-) diff --git a/test/CUDA/GradientKernels.cu b/test/CUDA/GradientKernels.cu index 20f10854c..725032cbf 100644 --- a/test/CUDA/GradientKernels.cu +++ b/test/CUDA/GradientKernels.cu @@ -292,12 +292,12 @@ __device__ double device_fn(double in, double val) { return in + val; } -__global__ void device_pullback(double *in, double *out, double val) { +__global__ void kernel_with_device_call(double *out, double *in, double val) { int index = threadIdx.x; out[index] = device_fn(in[index], val); } -// CHECK: void device_pullback_grad_1_2(double *in, double *out, double val, double *_d_out, double *_d_val) { +// CHECK: void kernel_with_device_call_grad_0_2(double *out, double *in, double val, double *_d_out, double *_d_val) { //CHECK-NEXT: int _d_index = 0; //CHECK-NEXT: int index0 = threadIdx.x; //CHECK-NEXT: double _t0 = out[index0]; @@ -313,6 +313,100 @@ __global__ void device_pullback(double *in, double *out, double val) { //CHECK-NEXT: } //CHECK-NEXT:} +__device__ double device_fn_2(double *in, double val) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + return in[index] + val; +} + +__global__ void kernel_with_device_call_2(double *out, double *in, double val) { + int index = threadIdx.x; + out[index] = device_fn_2(in, val); +} + +// CHECK: void kernel_with_device_call_2_grad_0_2(double *out, double *in, double val, double *_d_out, double *_d_val) { +//CHECK-NEXT: int _d_index = 0; +//CHECK-NEXT: int index0 = threadIdx.x; +//CHECK-NEXT: double _t0 = out[index0]; +//CHECK-NEXT: out[index0] = device_fn_2(in, val); +//CHECK-NEXT: { +//CHECK-NEXT: out[index0] = _t0; +//CHECK-NEXT: double _r_d0 = _d_out[index0]; +//CHECK-NEXT: _d_out[index0] = 0.; +//CHECK-NEXT: double _r0 = 0.; +//CHECK-NEXT: 
device_fn_2_pullback(in, val, _r_d0, &_r0); +//CHECK-NEXT: atomicAdd(_d_val, _r0); +//CHECK-NEXT: } +//CHECK-NEXT:} + +// CHECK: void kernel_with_device_call_2_grad_0_1(double *out, double *in, double val, double *_d_out, double *_d_in) { +//CHECK-NEXT: double _d_val = 0.; +//CHECK-NEXT: int _d_index = 0; +//CHECK-NEXT: int index0 = threadIdx.x; +//CHECK-NEXT: double _t0 = out[index0]; +//CHECK-NEXT: out[index0] = device_fn_2(in, val); +//CHECK-NEXT: { +//CHECK-NEXT: out[index0] = _t0; +//CHECK-NEXT: double _r_d0 = _d_out[index0]; +//CHECK-NEXT: _d_out[index0] = 0.; +//CHECK-NEXT: double _r0 = 0.; +//CHECK-NEXT: device_fn_2_pullback(in, val, _r_d0, _d_in, &_r0); +//CHECK-NEXT: _d_val += _r0; +//CHECK-NEXT: } +//CHECK-NEXT:} + +__device__ double device_fn_3(double *in, double *val) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + return in[index] + *val; +} + +__global__ void kernel_with_device_call_3(double *out, double *in, double *val) { + int index = threadIdx.x; + out[index] = device_fn_3(in, val); +} + +// CHECK: void kernel_with_device_call_3_grad(double *out, double *in, double *val, double *_d_out, double *_d_in, double *_d_val) { +//CHECK-NEXT: int _d_index = 0; +//CHECK-NEXT: int index0 = threadIdx.x; +//CHECK-NEXT: double _t0 = out[index0]; +//CHECK-NEXT: out[index0] = device_fn_3(in, val); +//CHECK-NEXT: { +//CHECK-NEXT: out[index0] = _t0; +//CHECK-NEXT: double _r_d0 = _d_out[index0]; +//CHECK-NEXT: _d_out[index0] = 0.; +//CHECK-NEXT: device_fn_3_pullback(in, val, _r_d0, _d_in, _d_val); +//CHECK-NEXT: } +//CHECK-NEXT:} + +__device__ double device_fn_4(double *in, double val) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + return in[index] + val; +} + +__device__ double device_with_device_call(double *in, double val) { + return device_fn_4(in, val); +} + +__global__ void kernel_with_nested_device_call(double *out, double *in, double val) { + int index = threadIdx.x; + out[index] = device_with_device_call(in, val); +} + +// CHECK: void 
kernel_with_nested_device_call_grad_0_1(double *out, double *in, double val, double *_d_out, double *_d_in) { +//CHECK-NEXT: double _d_val = 0.; +//CHECK-NEXT: int _d_index = 0; +//CHECK-NEXT: int index0 = threadIdx.x; +//CHECK-NEXT: double _t0 = out[index0]; +//CHECK-NEXT: out[index0] = device_with_device_call(in, val); +//CHECK-NEXT: { +//CHECK-NEXT: out[index0] = _t0; +//CHECK-NEXT: double _r_d0 = _d_out[index0]; +//CHECK-NEXT: _d_out[index0] = 0.; +//CHECK-NEXT: double _r0 = 0.; +//CHECK-NEXT: device_with_device_call_pullback(in, val, _r_d0, _d_in, &_r0); +//CHECK-NEXT: _d_val += _r0; +//CHECK-NEXT: } +//CHECK-NEXT:} + __global__ void kernel_call(double *a, double *b) { int index = threadIdx.x + blockIdx.x * blockDim.x; a[index] = *b; @@ -322,9 +416,12 @@ void fn(double *out, double *in) { kernel_call<<<1, 10>>>(out, in); } +// CHECK: void fn_grad(double *out, double *in, double *_d_out, double *_d_in) { +//CHECK-NEXT: kernel_call<<<1, 10>>>(out, in); +//CHECK-NEXT: kernel_call_pullback<<<1, 10>>>(out, in, _d_out, _d_in); +//CHECK-NEXT: } + double fn_memory(double *out, double *in) { - cudaMalloc(&in, 10 * sizeof(double)); - cudaMalloc(&out, 10 * sizeof(double)); kernel_call<<<1, 10>>>(out, in); double *out_host = (double*)malloc(10 * sizeof(double)); cudaMemcpy(out_host, out, 10 * sizeof(double), cudaMemcpyDeviceToHost); @@ -341,10 +438,6 @@ double fn_memory(double *out, double *in) { //CHECK-NEXT: int _d_i = 0; //CHECK-NEXT: int i = 0; //CHECK-NEXT: clad::tape _t1 = {}; -//CHECK-NEXT: cudaMalloc(&_d_in, 10 * sizeof(double)); -//CHECK-NEXT: cudaMalloc(&in, 10 * sizeof(double)); -//CHECK-NEXT: cudaMalloc(&_d_out, 10 * sizeof(double)); -//CHECK-NEXT: cudaMalloc(&out, 10 * sizeof(double)); //CHECK-NEXT: kernel_call<<<1, 10>>>(out, in); //CHECK-NEXT: double *_d_out_host = (double *)malloc(10 * sizeof(double)); //CHECK-NEXT: double *out_host = (double *)malloc(10 * sizeof(double)); @@ -376,7 +469,7 @@ double fn_memory(double *out, double *in) { //CHECK-NEXT: } 
//CHECK-NEXT: {
//CHECK-NEXT: unsigned long _r0 = 0UL;
-//CHECK-NEXT: cudaMemcpyKind _r1 = 0U;
+//CHECK-NEXT: cudaMemcpyKind _r1 = static_cast<cudaMemcpyKind>(0U);
//CHECK-NEXT: clad::custom_derivatives::cudaMemcpy_pullback(out_host, out, 10 * sizeof(double), cudaMemcpyDeviceToHost, _d_out_host, _d_out, &_r0, &_r1);
//CHECK-NEXT: }
//CHECK-NEXT: kernel_call_pullback<<<1, 10>>>(out, in, _d_out, _d_in);
@@ -384,11 +477,6 @@ double fn_memory(double *out, double *in) {
//CHECK-NEXT: cudaFree(in);
//CHECK-NEXT:}
-// CHECK: void fn_grad(double *out, double *in, double *_d_out, double *_d_in) {
-//CHECK-NEXT: kernel_call<<<1, 10>>>(out, in);
-//CHECK-NEXT: kernel_call_pullback<<<1, 10>>>(out, in, _d_out, _d_in);
-//CHECK-NEXT: }
-
// CHECK: __attribute__((device)) void device_fn_pullback(double in, double val, double _d_y, double *_d_in, double *_d_val) {
//CHECK-NEXT: {
//CHECK-NEXT: *_d_in += _d_y;
//CHECK-NEXT: *_d_val += _d_y;
//CHECK-NEXT: }
//CHECK-NEXT:}
@@ -396,6 +484,36 @@ double fn_memory(double *out, double *in) {
//CHECK-NEXT: }
//CHECK-NEXT:}
+// CHECK: __attribute__((device)) void device_fn_2_pullback(double *in, double val, double _d_y, double *_d_val) {
+//CHECK-NEXT: unsigned int _t1 = blockIdx.x;
+//CHECK-NEXT: unsigned int _t0 = blockDim.x;
+//CHECK-NEXT: int _d_index = 0;
+//CHECK-NEXT: int index0 = threadIdx.x + _t1 * _t0;
+//CHECK-NEXT: *_d_val += _d_y;
+//CHECK-NEXT:}
+
+// CHECK: __attribute__((device)) void device_fn_2_pullback(double *in, double val, double _d_y, double *_d_in, double *_d_val) {
+//CHECK-NEXT: unsigned int _t1 = blockIdx.x;
+//CHECK-NEXT: unsigned int _t0 = blockDim.x;
+//CHECK-NEXT: int _d_index = 0;
+//CHECK-NEXT: int index0 = threadIdx.x + _t1 * _t0;
+//CHECK-NEXT: {
+//CHECK-NEXT: atomicAdd(&_d_in[index0], _d_y);
+//CHECK-NEXT: *_d_val += _d_y;
+//CHECK-NEXT: }
+//CHECK-NEXT:}
+
+// CHECK: __attribute__((device)) void device_fn_3_pullback(double *in, double *val, double _d_y, double *_d_in, double *_d_val) {
+//CHECK-NEXT: unsigned int _t1 = blockIdx.x;
+//CHECK-NEXT: unsigned int _t0 = blockDim.x;
+//CHECK-NEXT: int _d_index = 0; +//CHECK-NEXT: int index0 = threadIdx.x + _t1 * _t0; +//CHECK-NEXT: { +//CHECK-NEXT: atomicAdd(&_d_in[index0], _d_y); +//CHECK-NEXT: atomicAdd(_d_val, _d_y); +//CHECK-NEXT: } +//CHECK-NEXT:} + // CHECK: __attribute__((global)) void kernel_call_pullback(double *a, double *b, double *_d_a, double *_d_b) { //CHECK-NEXT: unsigned int _t1 = blockIdx.x; //CHECK-NEXT: unsigned int _t0 = blockDim.x; @@ -411,6 +529,25 @@ double fn_memory(double *out, double *in) { //CHECK-NEXT: } //CHECK-NEXT:} +// CHECK: __attribute__((device)) void device_with_device_call_pullback(double *in, double val, double _d_y, double *_d_in, double *_d_val) { +//CHECK-NEXT: { +//CHECK-NEXT: double _r0 = 0.; +//CHECK-NEXT: device_fn_4_pullback(in, val, _d_y, _d_in, &_r0); +//CHECK-NEXT: *_d_val += _r0; +//CHECK-NEXT: } +//CHECK-NEXT:} + +// CHECK: __attribute__((device)) void device_fn_4_pullback(double *in, double val, double _d_y, double *_d_in, double *_d_val) { +//CHECK-NEXT: unsigned int _t1 = blockIdx.x; +//CHECK-NEXT: unsigned int _t0 = blockDim.x; +//CHECK-NEXT: int _d_index = 0; +//CHECK-NEXT: int index0 = threadIdx.x + _t1 * _t0; +//CHECK-NEXT: { +//CHECK-NEXT: atomicAdd(&_d_in[index0], _d_y); +//CHECK-NEXT: *_d_val += _d_y; +//CHECK-NEXT: } +//CHECK-NEXT:} + #define TEST(F, grid, block, shared_mem, use_stream, x, dx, N) \ { \ int *fives = (int*)malloc(N * sizeof(int)); \ @@ -470,6 +607,7 @@ double fn_memory(double *out, double *in) { } \ int *res = (int*)malloc(N * sizeof(int)); \ cudaMemcpy(res, dx, N * sizeof(int), cudaMemcpyDeviceToHost); \ + cudaDeviceSynchronize(); \ for (int i = 0; i < (N - 1); i++) { \ printf("%d, ", res[i]); \ } \ @@ -504,6 +642,7 @@ double fn_memory(double *out, double *in) { } \ int *res = (int*)malloc(N * sizeof(int)); \ cudaMemcpy(res, dx, N * sizeof(int), cudaMemcpyDeviceToHost); \ + cudaDeviceSynchronize(); \ for (int i = 0; i < (N - 1); i++) { \ printf("%d, ", res[i]); \ } \ @@ -538,6 +677,7 @@ double fn_memory(double *out, 
double *in) { } \ double *res = (double*)malloc(N * sizeof(double)); \ cudaMemcpy(res, dx, N * sizeof(double), cudaMemcpyDeviceToHost); \ + cudaDeviceSynchronize(); \ for (int i = 0; i < (N - 1); i++) { \ printf("%0.2f, ", res[i]); \ } \ @@ -619,41 +759,96 @@ int main(void) { cudaMemcpy(dy, fives, 10 * sizeof(double), cudaMemcpyHostToDevice); cudaMemcpy(d_val, zeros, sizeof(double), cudaMemcpyHostToDevice); - device_pullback<<<1, 10>>>(y, x, 5); + auto test_device = clad::gradient(kernel_with_device_call, "out, val"); + test_device.execute_kernel(dim3(1), dim3(10, 1, 1), y, x, 5, dy, d_val); + double *res = (double*)malloc(10 * sizeof(double)); + cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost); cudaDeviceSynchronize(); - printf("%s\n", cudaGetErrorString(cudaGetLastError())); // CHECK-EXEC: no error + printf("%0.2f\n", *res); // CHECK-EXEC: 50.00 - auto test_device = clad::gradient(device_pullback, "out, val"); - test_device.execute_kernel(dim3(1), dim3(10, 1, 1), x, y, 5, dy, d_val); - cudaDeviceSynchronize(); - printf("%s\n", cudaGetErrorString(cudaGetLastError())); // CHECK-EXEC: no error - double *res = (double*)malloc(sizeof(double)); + cudaMemcpy(x, fives, 10 * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(y, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dx, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dy, fives, 10 * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(d_val, zeros, sizeof(double), cudaMemcpyHostToDevice); + + auto test_device_2 = clad::gradient(kernel_with_device_call_2, "out, val"); + test_device_2.execute_kernel(dim3(1), dim3(10, 1, 1), y, x, 5, dy, d_val); cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); printf("%0.2f\n", *res); // CHECK-EXEC: 50.00 + cudaMemcpy(x, fives, 10 * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(y, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dx, zeros, 10 * sizeof(double), 
cudaMemcpyHostToDevice); + cudaMemcpy(dy, fives, 10 * sizeof(double), cudaMemcpyHostToDevice); + + auto test_device_3 = clad::gradient(kernel_with_device_call_2, "out, in"); + test_device_3.execute_kernel(dim3(1), dim3(10, 1, 1), y, x, 5, dy, dx); + cudaDeviceSynchronize(); + cudaMemcpy(res, dx, 10 * sizeof(double), cudaMemcpyDeviceToHost); + printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 5.00, 5.00, 5.00 + cudaMemcpy(x, fives, 10 * sizeof(double), cudaMemcpyHostToDevice); cudaMemcpy(y, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice); cudaMemcpy(dx, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice); cudaMemcpy(dy, fives, 10 * sizeof(double), cudaMemcpyHostToDevice); cudaMemcpy(d_val, zeros, sizeof(double), cudaMemcpyHostToDevice); + double *val; + cudaMalloc(&val, sizeof(double)); + cudaMemcpy(val, fives, sizeof(double), cudaMemcpyHostToDevice); + + auto test_device_4 = clad::gradient(kernel_with_device_call_3); + test_device_4.execute_kernel(dim3(1), dim3(10, 1, 1), y, x, val, dy, dx, d_val); + cudaDeviceSynchronize(); + cudaMemcpy(res, dx, 10 * sizeof(double), cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 5.00, 5.00, 5.00 + cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost); + printf("%0.2f\n", *res); // CHECK-EXEC: 50.00 + + cudaMemcpy(x, fives, 10 * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(y, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dx, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dy, fives, 10 * sizeof(double), cudaMemcpyHostToDevice); + auto test_kernel_call = clad::gradient(fn); test_kernel_call.execute(y, x, dy, dx); cudaDeviceSynchronize(); - printf("%s\n", cudaGetErrorString(cudaGetLastError())); // CHECK-EXEC: no error cudaMemcpy(res, dx, sizeof(double), cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); printf("%0.2f\n", *res); // CHECK-EXEC: 50.00 + cudaMemcpy(x, fives, 10 
* sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(y, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dx, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dy, fives, 10 * sizeof(double), cudaMemcpyHostToDevice); + + auto nested_device = clad::gradient(kernel_with_nested_device_call, "out, in"); + nested_device.execute_kernel(dim3(1), dim3(10, 1, 1), y, x, 5, dy, dx); + cudaDeviceSynchronize(); + cudaMemcpy(res, dx, 10 * sizeof(double), cudaMemcpyDeviceToHost); + printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 5.00, 5.00, 5.00 + + cudaMemcpy(x, fives, 10 * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(y, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dx, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dy, fives, 10 * sizeof(double), cudaMemcpyHostToDevice); + + auto test_memory = clad::gradient(fn_memory); + test_memory.execute(y, x, dy, dx); + cudaDeviceSynchronize(); + printf("%s\n", cudaGetErrorString(cudaGetLastError())); // CHECK-EXEC: no error + cudaMemcpy(res, dx, 10 * sizeof(double), cudaMemcpyDeviceToHost); + printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 50.00, 0.00, 0.00 + free(fives); free(zeros); free(res); - cudaFree(x); - cudaFree(y); cudaFree(dx); cudaFree(dy); cudaFree(d_val); - auto test_memory = clad::gradient(fn_memory); - return 0; }