From 2e5560ecdb61d871c57dccf33cbf1b959ca73c29 Mon Sep 17 00:00:00 2001 From: Christina Koutsou <74819775+kchristin22@users.noreply.github.com> Date: Tue, 24 Sep 2024 18:10:05 +0300 Subject: [PATCH] Add support of CUDA builtins (#1092) Added support of CUDA grid configuration builtin variables. Builtins tested: threadIdx, blockIdx, blockDim, gridDim, warpSize --- .../clad/Differentiator/ReverseModeVisitor.h | 1 + lib/Differentiator/ReverseModeVisitor.cpp | 6 + test/CUDA/GradientKernels.cu | 328 ++++++++++++++++-- 3 files changed, 305 insertions(+), 30 deletions(-) diff --git a/include/clad/Differentiator/ReverseModeVisitor.h b/include/clad/Differentiator/ReverseModeVisitor.h index 40feb6723..ad9981bb1 100644 --- a/include/clad/Differentiator/ReverseModeVisitor.h +++ b/include/clad/Differentiator/ReverseModeVisitor.h @@ -396,6 +396,7 @@ namespace clad { StmtDiff VisitImplicitValueInitExpr(const clang::ImplicitValueInitExpr* IVIE); StmtDiff VisitCStyleCastExpr(const clang::CStyleCastExpr* CSCE); + StmtDiff VisitPseudoObjectExpr(const clang::PseudoObjectExpr* POE); StmtDiff VisitInitListExpr(const clang::InitListExpr* ILE); StmtDiff VisitIntegerLiteral(const clang::IntegerLiteral* IL); StmtDiff VisitMemberExpr(const clang::MemberExpr* ME); diff --git a/lib/Differentiator/ReverseModeVisitor.cpp b/lib/Differentiator/ReverseModeVisitor.cpp index 0bc96501a..4d600caf7 100644 --- a/lib/Differentiator/ReverseModeVisitor.cpp +++ b/lib/Differentiator/ReverseModeVisitor.cpp @@ -3272,6 +3272,12 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, return {castExpr, castExprDiff}; } + StmtDiff + ReverseModeVisitor::VisitPseudoObjectExpr(const PseudoObjectExpr* POE) { + // Used for CUDA Builtins + return {Clone(POE), Clone(POE)}; + } + StmtDiff ReverseModeVisitor::VisitMemberExpr(const MemberExpr* ME) { auto baseDiff = VisitWithExplicitNoDfDx(ME->getBase()); auto* field = ME->getMemberDecl(); diff --git a/test/CUDA/GradientKernels.cu b/test/CUDA/GradientKernels.cu index e67c74926..01da8a299 100644 --- a/test/CUDA/GradientKernels.cu +++ b/test/CUDA/GradientKernels.cu @@ -33,46 +33,314 @@ void fake_kernel(int *a) { *a *= *a; } +__global__ void add_kernel(int *out, int *in) { + int index = threadIdx.x; + out[index] += in[index]; +} + +// CHECK: void add_kernel_grad(int *out, int *in, int *_d_out, int *_d_in) { +//CHECK-NEXT: int _d_index = 0; +//CHECK-NEXT: int index0 = threadIdx.x; +//CHECK-NEXT: int _t0 = out[index0]; +//CHECK-NEXT: out[index0] += in[index0]; +//CHECK-NEXT: { +//CHECK-NEXT: out[index0] = _t0; +//CHECK-NEXT: int _r_d0 = _d_out[index0]; +//CHECK-NEXT: _d_in[index0] += _r_d0; +//CHECK-NEXT: } +//CHECK-NEXT: } + +__global__ void add_kernel_2(int *out, int *in) { + out[threadIdx.x] += in[threadIdx.x]; +} + +// CHECK: void add_kernel_2_grad(int *out, int *in, int *_d_out, int *_d_in) { +//CHECK-NEXT: int _t0 = out[threadIdx.x]; +//CHECK-NEXT: out[threadIdx.x] += in[threadIdx.x]; +//CHECK-NEXT: { +//CHECK-NEXT: out[threadIdx.x] = _t0; +//CHECK-NEXT: int _r_d0 = _d_out[threadIdx.x]; +//CHECK-NEXT: _d_in[threadIdx.x] += _r_d0; +//CHECK-NEXT: } +//CHECK-NEXT: } + +__global__ void add_kernel_3(int *out, int *in) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + out[index] += in[index]; +} + +// CHECK: void add_kernel_3_grad(int *out, int *in, int *_d_out, int *_d_in) { +//CHECK-NEXT: unsigned int _t1 = blockIdx.x; +//CHECK-NEXT: unsigned int _t0 = blockDim.x; +//CHECK-NEXT: int _d_index = 0; +//CHECK-NEXT: int index0 = threadIdx.x + _t1 * _t0; +//CHECK-NEXT: int _t2 = out[index0]; +//CHECK-NEXT: out[index0] += in[index0]; +//CHECK-NEXT: { +//CHECK-NEXT: out[index0] = _t2; +//CHECK-NEXT: int _r_d0 = _d_out[index0]; +//CHECK-NEXT: _d_in[index0] += _r_d0; +//CHECK-NEXT: } +//CHECK-NEXT:} + +__global__ void add_kernel_4(int *out, int *in) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index < 5) { + int sum = 0; + // Each thread sums elements in steps of warpSize + for (int i = index; i < 5; i += warpSize) { + sum += in[i]; + } + out[index] = sum; + } +} + +// CHECK: void add_kernel_4_grad(int *out, int *in, int *_d_out, int *_d_in) { +//CHECK-NEXT: bool _cond0; +//CHECK-NEXT: int _d_sum = 0; +//CHECK-NEXT: int sum = 0; +//CHECK-NEXT: unsigned long _t2; +//CHECK-NEXT: int _d_i = 0; +//CHECK-NEXT: int i = 0; +//CHECK-NEXT: clad::tape _t3 = {}; +//CHECK-NEXT: clad::tape _t4 = {}; +//CHECK-NEXT: int _t5; +//CHECK-NEXT: unsigned int _t1 = blockIdx.x; +//CHECK-NEXT: unsigned int _t0 = blockDim.x; +//CHECK-NEXT: int _d_index = 0; +//CHECK-NEXT: int index0 = threadIdx.x + _t1 * _t0; +//CHECK-NEXT: { +//CHECK-NEXT: _cond0 = index0 < 5; +//CHECK-NEXT: if (_cond0) { +//CHECK-NEXT: sum = 0; +//CHECK-NEXT: _t2 = 0UL; +//CHECK-NEXT: for (i = index0; ; clad::push(_t3, i) , (i += warpSize)) { +//CHECK-NEXT: { +//CHECK-NEXT: if (!(i < 5)) +//CHECK-NEXT: break; +//CHECK-NEXT: } +//CHECK-NEXT: _t2++; +//CHECK-NEXT: clad::push(_t4, sum); +//CHECK-NEXT: sum += in[i]; +//CHECK-NEXT: } +//CHECK-NEXT: _t5 = out[index0]; +//CHECK-NEXT: out[index0] = sum; +//CHECK-NEXT: } +//CHECK-NEXT: } +//CHECK-NEXT: if (_cond0) { +//CHECK-NEXT: { +//CHECK-NEXT: out[index0] = _t5; +//CHECK-NEXT: int _r_d2 = _d_out[index0]; +//CHECK-NEXT: _d_out[index0] = 0; +//CHECK-NEXT: _d_sum += _r_d2; +//CHECK-NEXT: } +//CHECK-NEXT: { +//CHECK-NEXT: for (;; _t2--) { +//CHECK-NEXT: { +//CHECK-NEXT: if (!_t2) +//CHECK-NEXT: break; +//CHECK-NEXT: } +//CHECK-NEXT: { +//CHECK-NEXT: i = clad::pop(_t3); +//CHECK-NEXT: int _r_d0 = _d_i; +//CHECK-NEXT: } +//CHECK-NEXT: { +//CHECK-NEXT: sum = clad::pop(_t4); +//CHECK-NEXT: int _r_d1 = _d_sum; +//CHECK-NEXT: _d_in[i] += _r_d1; +//CHECK-NEXT: } +//CHECK-NEXT: } +//CHECK-NEXT: _d_index += _d_i; +//CHECK-NEXT: } +//CHECK-NEXT: } +//CHECK-NEXT:} + +__global__ void add_kernel_5(int *out, int *in) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index < 5) { + int sum = 0; + // Calculate the total number of threads in the grid + int totalThreads = blockDim.x * gridDim.x; + // Each thread sums elements in steps of the total number of threads in the grid + for (int i = index; i < 5; i += totalThreads) { + sum += in[i]; + } + out[index] = sum; + } +} + +// CHECK: void add_kernel_5_grad(int *out, int *in, int *_d_out, int *_d_in) { +//CHECK-NEXT: bool _cond0; +//CHECK-NEXT: int _d_sum = 0; +//CHECK-NEXT: int sum = 0; +//CHECK-NEXT: unsigned int _t2; +//CHECK-NEXT: unsigned int _t3; +//CHECK-NEXT: int _d_totalThreads = 0; +//CHECK-NEXT: int totalThreads = 0; +//CHECK-NEXT: unsigned long _t4; +//CHECK-NEXT: int _d_i = 0; +//CHECK-NEXT: int i = 0; +//CHECK-NEXT: clad::tape _t5 = {}; +//CHECK-NEXT: clad::tape _t6 = {}; +//CHECK-NEXT: int _t7; +//CHECK-NEXT: unsigned int _t1 = blockIdx.x; +//CHECK-NEXT: unsigned int _t0 = blockDim.x; +//CHECK-NEXT: int _d_index = 0; +//CHECK-NEXT: int index0 = threadIdx.x + _t1 * _t0; +//CHECK-NEXT: { +//CHECK-NEXT: _cond0 = index0 < 5; +//CHECK-NEXT: if (_cond0) { +//CHECK-NEXT: sum = 0; +//CHECK-NEXT: _t3 = blockDim.x; +//CHECK-NEXT: _t2 = gridDim.x; +//CHECK-NEXT: totalThreads = _t3 * _t2; +//CHECK-NEXT: _t4 = 0UL; +//CHECK-NEXT: for (i = index0; ; clad::push(_t5, i) , (i += totalThreads)) { +//CHECK-NEXT: { +//CHECK-NEXT: if (!(i < 5)) +//CHECK-NEXT: break; +//CHECK-NEXT: } +//CHECK-NEXT: _t4++; +//CHECK-NEXT: clad::push(_t6, sum); +//CHECK-NEXT: sum += in[i]; +//CHECK-NEXT: } +//CHECK-NEXT: _t7 = out[index0]; +//CHECK-NEXT: out[index0] = sum; +//CHECK-NEXT: } +//CHECK-NEXT: } +//CHECK-NEXT: if (_cond0) { +//CHECK-NEXT: { +//CHECK-NEXT: out[index0] = _t7; +//CHECK-NEXT: int _r_d2 = _d_out[index0]; +//CHECK-NEXT: _d_out[index0] = 0; +//CHECK-NEXT: _d_sum += _r_d2; +//CHECK-NEXT: } +//CHECK-NEXT: { +//CHECK-NEXT: for (;; _t4--) { +//CHECK-NEXT: { +//CHECK-NEXT: if (!_t4) +//CHECK-NEXT: break; +//CHECK-NEXT: } +//CHECK-NEXT: { +//CHECK-NEXT: i = clad::pop(_t5); +//CHECK-NEXT: int _r_d0 = _d_i; +//CHECK-NEXT: _d_totalThreads += _r_d0; +//CHECK-NEXT: } +//CHECK-NEXT: { +//CHECK-NEXT: sum = clad::pop(_t6); +//CHECK-NEXT: int _r_d1 = _d_sum; +//CHECK-NEXT: _d_in[i] += _r_d1; +//CHECK-NEXT: } +//CHECK-NEXT: } +//CHECK-NEXT: _d_index += _d_i; +//CHECK-NEXT: } +//CHECK-NEXT: } +//CHECK-NEXT:} + +#define TEST(F, grid, block, shared_mem, use_stream, x, dx, N) \ + { \ + int *fives = (int*)malloc(N * sizeof(int)); \ + for(int i = 0; i < N; i++) { \ + fives[i] = 5; \ + } \ + int *ones = (int*)malloc(N * sizeof(int)); \ + for(int i = 0; i < N; i++) { \ + ones[i] = 1; \ + } \ + cudaMemcpy(x, fives, N * sizeof(int), cudaMemcpyHostToDevice); \ + cudaMemcpy(dx, ones, N * sizeof(int), cudaMemcpyHostToDevice); \ + auto test = clad::gradient(F); \ + if constexpr (use_stream) { \ + cudaStream_t cudaStream; \ + cudaStreamCreate(&cudaStream); \ + test.execute_kernel(grid, block, shared_mem, cudaStream, x, dx); \ + } \ + else { \ + test.execute_kernel(grid, block, x, dx); \ + } \ + cudaDeviceSynchronize(); \ + int *res = (int*)malloc(N * sizeof(int)); \ + cudaMemcpy(res, dx, N * sizeof(int), cudaMemcpyDeviceToHost); \ + for (int i = 0; i < (N - 1); i++) { \ + printf("%d, ", res[i]); \ + } \ + printf("%d\n", res[N-1]); \ + free(fives); \ + free(ones); \ + free(res); \ + } + + +#define TEST_2(F, grid, block, shared_mem, use_stream, args, y, x, dy, dx, N) \ + { \ + int *fives = (int*)malloc(N * sizeof(int)); \ + for(int i = 0; i < N; i++) { \ + fives[i] = 5; \ + } \ + int *zeros = (int*)malloc(N * sizeof(int)); \ + for(int i = 0; i < N; i++) { \ + zeros[i] = 0; \ + } \ + cudaMemcpy(x, fives, N * sizeof(int), cudaMemcpyHostToDevice); \ + cudaMemcpy(y, zeros, N * sizeof(int), cudaMemcpyHostToDevice); \ + cudaMemcpy(dy, fives, N * sizeof(int), cudaMemcpyHostToDevice); \ + cudaMemcpy(dx, zeros, N * sizeof(int), cudaMemcpyHostToDevice); \ + auto test = clad::gradient(F, args); \ + if constexpr (use_stream) { \ + cudaStream_t cudaStream; \ + cudaStreamCreate(&cudaStream); \ + test.execute_kernel(grid, block, shared_mem, cudaStream, y, x, dy, dx); \ + } \ + else { \ + test.execute_kernel(grid, block, y, x, dy, dx); \ + } \ + cudaDeviceSynchronize(); \ + int *res = (int*)malloc(N * sizeof(int)); \ + cudaMemcpy(res, dx, N * sizeof(int), cudaMemcpyDeviceToHost); \ + for (int i = 0; i < (N - 1); i++) { \ + printf("%d, ", res[i]); \ + } \ + printf("%d\n", res[N-1]); \ + free(fives); \ + free(zeros); \ + free(res); \ + } + + int main(void) { - int *a = (int*)malloc(sizeof(int)); - *a = 2; - int *d_a; + int *a, *d_a; + cudaMalloc(&a, sizeof(int)); cudaMalloc(&d_a, sizeof(int)); - cudaMemcpy(d_a, a, sizeof(int), cudaMemcpyHostToDevice); - - int *asquare = (int*)malloc(sizeof(int)); - *asquare = 1; - int *d_square; - cudaMalloc(&d_square, sizeof(int)); - cudaMemcpy(d_square, asquare, sizeof(int), cudaMemcpyHostToDevice); - auto test = clad::gradient(kernel); - dim3 grid(1); - dim3 block(1); - cudaStream_t cudaStream; - cudaStreamCreate(&cudaStream); - test.execute_kernel(grid, block, 0, cudaStream, d_a, d_square); + TEST(kernel, dim3(1), dim3(1), 0, false, a, d_a, 1); // CHECK-EXEC: 10 + TEST(kernel, dim3(1), dim3(1), 0, true, a, d_a, 1); // CHECK-EXEC: 10 - cudaDeviceSynchronize(); + auto error = clad::gradient(fake_kernel); + error.execute_kernel(dim3(1), dim3(1), a, d_a); // CHECK-EXEC: Use execute() for non-global CUDA kernels - cudaMemcpy(asquare, d_square, sizeof(int), cudaMemcpyDeviceToHost); - cudaMemcpy(a, d_a, sizeof(int), cudaMemcpyDeviceToHost); - printf("a = %d, a^2 = %d\n", *a, *asquare); // CHECK-EXEC: a = 2, a^2 = 4 + auto test = clad::gradient(kernel); + test.execute(a, d_a); // CHECK-EXEC: Use execute_kernel() for global CUDA kernels - auto error = clad::gradient(fake_kernel); - error.execute_kernel(grid, block, d_a, d_square); // CHECK-EXEC: Use execute() for non-global CUDA kernels + cudaFree(a); + cudaFree(d_a); - test.execute(d_a, d_square); // CHECK-EXEC: Use execute_kernel() for global CUDA kernels - cudaMemset(d_a, 5, 1); // first byte is set to 5 - cudaMemset(d_square, 1, 1); + int *dummy_in, *dummy_out, *d_out, *d_in; + cudaMalloc(&dummy_in, 5 * sizeof(int)); + cudaMalloc(&dummy_out, 5 * sizeof(int)); + cudaMalloc(&d_out, 5 * sizeof(int)); + cudaMalloc(&d_in, 5 * sizeof(int)); - test.execute_kernel(grid, block, d_a, d_square); - cudaDeviceSynchronize(); + TEST_2(add_kernel, dim3(1), dim3(5, 1, 1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5 + TEST_2(add_kernel_2, dim3(1), dim3(5, 1, 1), 0, true, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5 + TEST_2(add_kernel_3, dim3(5, 1, 1), dim3(1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5 + TEST_2(add_kernel_4, dim3(1), dim3(5, 1, 1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5 + TEST_2(add_kernel_5, dim3(2, 1, 1), dim3(1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5 - cudaMemcpy(asquare, d_square, sizeof(int), cudaMemcpyDeviceToHost); - cudaMemcpy(a, d_a, sizeof(int), cudaMemcpyDeviceToHost); - printf("a = %d, a^2 = %d\n", *a, *asquare); // CHECK-EXEC: a = 5, a^2 = 10 + cudaFree(dummy_in); + cudaFree(dummy_out); + cudaFree(d_out); + cudaFree(d_in); return 0; }