diff --git a/test/CUDA/GradientKernels.cu b/test/CUDA/GradientKernels.cu
index 0a00fcd74..ca8296382 100644
--- a/test/CUDA/GradientKernels.cu
+++ b/test/CUDA/GradientKernels.cu
@@ -84,87 +84,108 @@ __global__ void add_kernel_3(int *out, int *in) {
 //CHECK-NEXT: }
 //CHECK-NEXT:}
 
-int main(void) {
-  int *a = (int*)malloc(sizeof(int));
-  *a = 2;
-  int *d_a;
-  cudaMalloc(&d_a, sizeof(int));
-  cudaMemcpy(d_a, a, sizeof(int), cudaMemcpyHostToDevice);
-
-  int *asquare = (int*)malloc(sizeof(int));
-  *asquare = 1;
-  int *d_square;
-  cudaMalloc(&d_square, sizeof(int));
-  cudaMemcpy(d_square, asquare, sizeof(int), cudaMemcpyHostToDevice);
+#define TEST(F, grid, block, shared_mem, use_stream, x, dx, N) \
+  { \
+    int *fives = (int*)malloc(N * sizeof(int)); \
+    for(int i = 0; i < N; i++) { \
+      fives[i] = 5; \
+    } \
+    int *ones = (int*)malloc(N * sizeof(int)); \
+    for(int i = 0; i < N; i++) { \
+      ones[i] = 1; \
+    } \
+    cudaMemcpy(x, fives, N * sizeof(int), cudaMemcpyHostToDevice); \
+    cudaMemcpy(dx, ones, N * sizeof(int), cudaMemcpyHostToDevice); \
+    auto test = clad::gradient(F); \
+    if constexpr (use_stream) { \
+      cudaStream_t cudaStream; \
+      cudaStreamCreate(&cudaStream); \
+      test.execute_kernel(grid, block, shared_mem, cudaStream, x, dx); \
+    } \
+    else { \
+      test.execute_kernel(grid, block, x, dx); \
+    } \
+    cudaDeviceSynchronize(); \
+    int *res = (int*)malloc(N * sizeof(int)); \
+    cudaMemcpy(res, dx, N * sizeof(int), cudaMemcpyDeviceToHost); \
+    for (int i = 0; i < (N - 1); i++) { \
+      printf("%d, ", res[i]); \
+    } \
+    printf("%d\n", res[N-1]); \
+    free(fives); \
+    free(ones); \
+    free(res); \
+  }
 
-  auto test = clad::gradient(kernel);
-  dim3 grid(1);
-  dim3 block(1);
-  cudaStream_t cudaStream;
-  cudaStreamCreate(&cudaStream);
-  test.execute_kernel(grid, block, 0, cudaStream, d_a, d_square);
-  cudaDeviceSynchronize();
+#define TEST_2(F, grid, block, shared_mem, use_stream, args, y, x, dy, dx, N) \
+  { \
+    int *fives = (int*)malloc(N * sizeof(int)); \
+    for(int i = 0; i < N; i++) { \
+      fives[i] = 5; \
+    } \
+    int *zeros = (int*)malloc(N * sizeof(int)); \
+    for(int i = 0; i < N; i++) { \
+      zeros[i] = 0; \
+    } \
+    cudaMemcpy(x, fives, N * sizeof(int), cudaMemcpyHostToDevice); \
+    cudaMemcpy(y, zeros, N * sizeof(int), cudaMemcpyHostToDevice); \
+    cudaMemcpy(dy, fives, N * sizeof(int), cudaMemcpyHostToDevice); \
+    cudaMemcpy(dx, zeros, N * sizeof(int), cudaMemcpyHostToDevice); \
+    auto test = clad::gradient(F, args); \
+    if constexpr (use_stream) { \
+      cudaStream_t cudaStream; \
+      cudaStreamCreate(&cudaStream); \
+      test.execute_kernel(grid, block, shared_mem, cudaStream, y, x, dy, dx); \
+    } \
+    else { \
+      test.execute_kernel(grid, block, y, x, dy, dx); \
+    } \
+    cudaDeviceSynchronize(); \
+    int *res = (int*)malloc(N * sizeof(int)); \
+    cudaMemcpy(res, dx, N * sizeof(int), cudaMemcpyDeviceToHost); \
+    for (int i = 0; i < (N - 1); i++) { \
+      printf("%d, ", res[i]); \
+    } \
+    printf("%d\n", res[N-1]); \
+    free(fives); \
+    free(zeros); \
+    free(res); \
+  }
 
-  cudaMemcpy(asquare, d_square, sizeof(int), cudaMemcpyDeviceToHost);
-  cudaMemcpy(a, d_a, sizeof(int), cudaMemcpyDeviceToHost);
-  printf("a = %d, d(a^2)/da = %d\n", *a, *asquare); // CHECK-EXEC: a = 2, d(a^2)/da = 4
 
-  auto error = clad::gradient(fake_kernel);
-  error.execute_kernel(grid, block, d_a, d_square); // CHECK-EXEC: Use execute() for non-global CUDA kernels
+int main(void) {
+  int *a, *d_a;
+  cudaMalloc(&a, sizeof(int));
+  cudaMalloc(&d_a, sizeof(int));
 
-  test.execute(d_a, d_square); // CHECK-EXEC: Use execute_kernel() for global CUDA kernels
+  TEST(kernel, dim3(1), dim3(1), 0, false, a, d_a, 1); // CHECK-EXEC: 10
+  TEST(kernel, dim3(1), dim3(1), 0, true, a, d_a, 1); // CHECK-EXEC: 10
 
-  cudaMemset(d_a, 5, 1); // first byte is set to 5
-  cudaMemset(d_square, 1, 1);
+  auto error = clad::gradient(fake_kernel);
+  error.execute_kernel(dim3(1), dim3(1), a, d_a); // CHECK-EXEC: Use execute() for non-global CUDA kernels
 
-  test.execute_kernel(grid, block, d_a, d_square);
-  cudaDeviceSynchronize();
+  auto test = clad::gradient(kernel);
+  test.execute(a, d_a); // CHECK-EXEC: Use execute_kernel() for global CUDA kernels
 
-  cudaMemcpy(asquare, d_square, sizeof(int), cudaMemcpyDeviceToHost);
-  cudaMemcpy(a, d_a, sizeof(int), cudaMemcpyDeviceToHost);
-  printf("a = %d, d(a^2)/da = %d\n", *a, *asquare); // CHECK-EXEC: a = 5, d(a^2)/da = 10
+  cudaFree(a);
+  cudaFree(d_a);
 
-  int *dummy_in, *dummy_out;
-  cudaMalloc(&dummy_in, sizeof(int));
-  cudaMalloc(&dummy_out, sizeof(int));
-  int *out = (int*)malloc(5 * sizeof(int));
-  for(int i = 0; i < 5; i++) {
-    out[i] = 5;
-  }
-  int *d_out;
+  int *dummy_in, *dummy_out, *d_out, *d_in;
+  cudaMalloc(&dummy_in, 5 * sizeof(int));
+  cudaMalloc(&dummy_out, 5 * sizeof(int));
   cudaMalloc(&d_out, 5 * sizeof(int));
-  cudaMemcpy(d_out, out, 5 * sizeof(int), cudaMemcpyHostToDevice);
-
-  int *d_in;
   cudaMalloc(&d_in, 5 * sizeof(int));
 
-  auto add = clad::gradient(add_kernel, "in, out");
-  add.execute_kernel(dim3(1), dim3(5, 1, 1), dummy_out, dummy_in, d_out, d_in);
-  cudaDeviceSynchronize();
-
-  int *res = (int*)malloc(5 * sizeof(int));
-  cudaMemcpy(res, d_in, 5 * sizeof(int), cudaMemcpyDeviceToHost);
-  printf("%d, %d, %d, %d, %d\n", res[0], res[1], res[2], res[3], res[4]); // CHECK-EXEC: 5, 5, 5, 5, 5
-
-  cudaMemset(d_in, 0, 5 * sizeof(int));
-  auto add_2 = clad::gradient(add_kernel_2, "in, out");
-  add_2.execute_kernel(dim3(1), dim3(5, 1, 1), dummy_out, dummy_in, d_out, d_in);
-  cudaDeviceSynchronize();
-
-  cudaMemcpy(res, d_in, 5 * sizeof(int), cudaMemcpyDeviceToHost);
-  printf("%d, %d, %d, %d, %d\n", res[0], res[1], res[2], res[3], res[4]); // CHECK-EXEC: 5, 5, 5, 5, 5
-
-
-  cudaMemset(d_in, 0, 5 * sizeof(int));
-  auto add_3 = clad::gradient(add_kernel_3, "in, out");
-  add_3.execute_kernel(dim3(5), dim3(1), dummy_out, dummy_in, d_out, d_in);
-  cudaDeviceSynchronize();
-
-  cudaMemcpy(res, d_in, 5 * sizeof(int), cudaMemcpyDeviceToHost);
-  printf("%d, %d, %d, %d, %d\n", res[0], res[1], res[2], res[3], res[4]); // CHECK-EXEC: 5, 5, 5, 5, 5
+  TEST_2(add_kernel, dim3(1), dim3(5, 1, 1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5
+  TEST_2(add_kernel_2, dim3(1), dim3(5, 1, 1), 0, true, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5
+  TEST_2(add_kernel_3, dim3(5, 1, 1), dim3(1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5
 
+  cudaFree(dummy_in);
+  cudaFree(dummy_out);
+  cudaFree(d_out);
+  cudaFree(d_in);
   return 0;
 }
\ No newline at end of file