From 4ac4f77d19f15a4a56b8190cfc321b0a8b5518ae Mon Sep 17 00:00:00 2001 From: kchristin Date: Mon, 23 Sep 2024 17:59:44 +0300 Subject: [PATCH] [cuda] Make add-assign op atomic to avoid write race conditions. --- .../clad/Differentiator/ReverseModeVisitor.h | 14 ++ lib/Differentiator/ReverseModeVisitor.cpp | 63 +++++- test/CMakeLists.txt | 2 +- test/CUDA/GradientKernels.cu | 186 +++++++++++++++--- 4 files changed, 233 insertions(+), 32 deletions(-) diff --git a/include/clad/Differentiator/ReverseModeVisitor.h b/include/clad/Differentiator/ReverseModeVisitor.h index ad9981bb1..d765cb372 100644 --- a/include/clad/Differentiator/ReverseModeVisitor.h +++ b/include/clad/Differentiator/ReverseModeVisitor.h @@ -433,6 +433,20 @@ namespace clad { StmtDiff VisitNullStmt(const clang::NullStmt* NS) { return StmtDiff{Clone(NS), Clone(NS)}; }; + + /// Helper function that checks whether the function to be derived + /// is meant to be executed only by the GPU + bool shouldUseCudaAtomicOps(); + + /// Add call to cuda::atomicAdd for the given LHS and RHS expressions. + /// + /// \param[in] LHS The left-hand side expression. + /// + /// \param[in] RHS The right-hand side expression. + /// + /// \returns The atomicAdd call expression. + clang::Expr* BuildCallToCudaAtomicAdd(clang::Expr* LHS, clang::Expr* RHS); + static DeclDiff DifferentiateStaticAssertDecl(const clang::StaticAssertDecl* SAD); diff --git a/lib/Differentiator/ReverseModeVisitor.cpp b/lib/Differentiator/ReverseModeVisitor.cpp index 4d600caf7..789aa1144 100644 --- a/lib/Differentiator/ReverseModeVisitor.cpp +++ b/lib/Differentiator/ReverseModeVisitor.cpp @@ -104,6 +104,45 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, return CladTapeResult{*this, PushExpr, PopExpr, TapeRef}; } + bool ReverseModeVisitor::shouldUseCudaAtomicOps() { + return m_DiffReq->hasAttr() || + (m_DiffReq->hasAttr() && + !m_DiffReq->hasAttr()); + } + + clang::Expr* ReverseModeVisitor::BuildCallToCudaAtomicAdd(clang::Expr* LHS, + clang::Expr* RHS) { + DeclarationName atomicAddId = &m_Context.Idents.get("atomicAdd"); + LookupResult lookupResult(m_Sema, atomicAddId, SourceLocation(), + Sema::LookupOrdinaryName); + m_Sema.LookupQualifiedName(lookupResult, + m_Context.getTranslationUnitDecl()); + + CXXScopeSpec SS; + Expr* UnresolvedLookup = + m_Sema.BuildDeclarationNameExpr(SS, lookupResult, /*ADL=*/true).get(); + + Expr* finalLHS = LHS; + if (isa(LHS)) + finalLHS = BuildOp(UnaryOperatorKind::UO_AddrOf, LHS); + llvm::SmallVector atomicArgs = {finalLHS, RHS}; + + assert(!m_Builder.noOverloadExists(UnresolvedLookup, atomicArgs) && + "atomicAdd function not found"); + + Expr* atomicAddCall = + m_Sema + .ActOnCallExpr( + getCurrentScope(), + /*Fn=*/UnresolvedLookup, + /*LParenLoc=*/noLoc, + /*ArgExprs=*/llvm::MutableArrayRef(atomicArgs), + /*RParenLoc=*/m_DiffReq->getLocation()) + .get(); + + return atomicAddCall; + } + ReverseModeVisitor::ReverseModeVisitor(DerivativeBuilder& builder, const DiffRequest& request) : VisitorBase(builder, request), m_Result(nullptr) {} @@ -1485,9 +1524,15 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, BuildArraySubscript(target, forwSweepDerivativeIndices); // Create the (target += dfdx) statement. if (dfdx()) { - auto* add_assign = BuildOp(BO_AddAssign, result, dfdx()); - // Add it to the body statements. - addToCurrentBlock(add_assign, direction::reverse); + if (shouldUseCudaAtomicOps()) { + Expr* atomicCall = BuildCallToCudaAtomicAdd(result, dfdx()); + // Add it to the body statements. + addToCurrentBlock(atomicCall, direction::reverse); + } else { + auto* add_assign = BuildOp(BO_AddAssign, result, dfdx()); + // Add it to the body statements. + addToCurrentBlock(add_assign, direction::reverse); + } } if (m_ExternalSource) m_ExternalSource->ActAfterProcessingArraySubscriptExpr(valueForRevSweep); @@ -2279,9 +2324,15 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, derivedE = BuildOp(UnaryOperatorKind::UO_Deref, diff_dx); // Create the (target += dfdx) statement. if (dfdx()) { - auto* add_assign = BuildOp(BO_AddAssign, derivedE, dfdx()); - // Add it to the body statements. - addToCurrentBlock(add_assign, direction::reverse); + if (shouldUseCudaAtomicOps()) { + Expr* atomicCall = BuildCallToCudaAtomicAdd(diff_dx, dfdx()); + // Add it to the body statements. + addToCurrentBlock(atomicCall, direction::reverse); + } else { + auto* add_assign = BuildOp(BO_AddAssign, derivedE, dfdx()); + // Add it to the body statements. + addToCurrentBlock(add_assign, direction::reverse); + } } } return {cloneE, derivedE, derivedE}; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index a1d17f51d..0264fc4d3 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -29,7 +29,7 @@ if(CUDAToolkit_FOUND) get_filename_component(CUDA_ROOT "${CUDAToolkit_BIN_DIR}" DIRECTORY ABSOLUTE) get_filename_component(CUDA_LIBDIR "${CUDA_cudart_static_LIBRARY}" DIRECTORY) - set(LIBOMPTARGET_DEP_CUDA_ARCH "sm_50") + set(LIBOMPTARGET_DEP_CUDA_ARCH "sm_60") if(TARGET nvptx-arch) get_property(LIBOMPTARGET_NVPTX_ARCH TARGET nvptx-arch PROPERTY LOCATION) diff --git a/test/CUDA/GradientKernels.cu b/test/CUDA/GradientKernels.cu index 01da8a299..171341a7e 100644 --- a/test/CUDA/GradientKernels.cu +++ b/test/CUDA/GradientKernels.cu @@ -25,7 +25,7 @@ __global__ void kernel(int *a) { //CHECK-NEXT: int _r_d0 = *_d_a; //CHECK-NEXT: *_d_a = 0; //CHECK-NEXT: *_d_a += _r_d0 * *a; -//CHECK-NEXT: *_d_a += *a * _r_d0; +//CHECK-NEXT: atomicAdd(_d_a, *a * _r_d0); //CHECK-NEXT: } //CHECK-NEXT: } @@ -46,7 +46,7 @@ __global__ void add_kernel(int *out, int *in) { //CHECK-NEXT: { //CHECK-NEXT: out[index0] = _t0; //CHECK-NEXT: int _r_d0 = _d_out[index0]; -//CHECK-NEXT: _d_in[index0] += _r_d0; +//CHECK-NEXT: atomicAdd(&_d_in[index0], _r_d0); //CHECK-NEXT: } //CHECK-NEXT: } @@ -60,7 +60,7 @@ __global__ void add_kernel_2(int *out, int *in) { //CHECK-NEXT: { //CHECK-NEXT: out[threadIdx.x] = _t0; //CHECK-NEXT: int _r_d0 = _d_out[threadIdx.x]; -//CHECK-NEXT: _d_in[threadIdx.x] += _r_d0; +//CHECK-NEXT: atomicAdd(&_d_in[threadIdx.x], _r_d0); //CHECK-NEXT: } //CHECK-NEXT: } @@ -79,23 +79,24 @@ __global__ void add_kernel_3(int *out, int *in) { //CHECK-NEXT: { //CHECK-NEXT: out[index0] = _t2; //CHECK-NEXT: int _r_d0 = _d_out[index0]; -//CHECK-NEXT: _d_in[index0] += _r_d0; +//CHECK-NEXT: atomicAdd(&_d_in[index0], _r_d0); //CHECK-NEXT: } //CHECK-NEXT:} -__global__ void add_kernel_4(int *out, int *in) { +__global__ void add_kernel_4(int *out, int *in, int N) { int index = threadIdx.x + blockIdx.x * blockDim.x; - if (index < 5) { + if (index < N) { int sum = 0; // Each thread sums elements in steps of warpSize - for (int i = index; i < 5; i += warpSize) { + for (int i = index; i < N; i += warpSize) { sum += in[i]; } out[index] = sum; } } -// CHECK: void add_kernel_4_grad(int *out, int *in, int *_d_out, int *_d_in) { +// CHECK: void add_kernel_4_grad_0_1(int *out, int *in, int N, int *_d_out, int *_d_in) { +//CHECK-NEXT: int _d_N = 0; //CHECK-NEXT: bool _cond0; //CHECK-NEXT: int _d_sum = 0; //CHECK-NEXT: int sum = 0; @@ -110,13 +111,13 @@ __global__ void add_kernel_4(int *out, int *in) { //CHECK-NEXT: int _d_index = 0; //CHECK-NEXT: int index0 = threadIdx.x + _t1 * _t0; //CHECK-NEXT: { -//CHECK-NEXT: _cond0 = index0 < 5; +//CHECK-NEXT: _cond0 = index0 < N; //CHECK-NEXT: if (_cond0) { //CHECK-NEXT: sum = 0; //CHECK-NEXT: _t2 = 0UL; //CHECK-NEXT: for (i = index0; ; clad::push(_t3, i) , (i += warpSize)) { //CHECK-NEXT: { -//CHECK-NEXT: if (!(i < 5)) +//CHECK-NEXT: if (!(i < N)) //CHECK-NEXT: break; //CHECK-NEXT: } //CHECK-NEXT: _t2++; @@ -147,7 +148,7 @@ __global__ void add_kernel_4(int *out, int *in) { //CHECK-NEXT: { //CHECK-NEXT: sum = clad::pop(_t4); //CHECK-NEXT: int _r_d1 = _d_sum; -//CHECK-NEXT: _d_in[i] += _r_d1; +//CHECK-NEXT: atomicAdd(&_d_in[i], _r_d1); //CHECK-NEXT: } //CHECK-NEXT: } //CHECK-NEXT: _d_index += _d_i; @@ -155,23 +156,24 @@ __global__ void add_kernel_4(int *out, int *in) { //CHECK-NEXT: } //CHECK-NEXT:} -__global__ void add_kernel_5(int *out, int *in) { +__global__ void add_kernel_5(int *out, int *in, int N) { int index = threadIdx.x + blockIdx.x * blockDim.x; - if (index < 5) { + if (index < N) { int sum = 0; // Calculate the total number of threads in the grid int totalThreads = blockDim.x * gridDim.x; // Each thread sums elements in steps of the total number of threads in the grid - for (int i = index; i < 5; i += totalThreads) { + for (int i = index; i < N; i += totalThreads) { sum += in[i]; } out[index] = sum; } } -// CHECK: void add_kernel_5_grad(int *out, int *in, int *_d_out, int *_d_in) { +// CHECK: void add_kernel_5_grad_0_1(int *out, int *in, int N, int *_d_out, int *_d_in) { +//CHECK-NEXT: int _d_N = 0; //CHECK-NEXT: bool _cond0; -//CHECK-NEXT: int _d_sum = 0; +//CHECK-NEXT: int _d_sum = 0; //CHECK-NEXT: int sum = 0; //CHECK-NEXT: unsigned int _t2; //CHECK-NEXT: unsigned int _t3; @@ -188,7 +190,7 @@ __global__ void add_kernel_5(int *out, int *in) { //CHECK-NEXT: int _d_index = 0; //CHECK-NEXT: int index0 = threadIdx.x + _t1 * _t0; //CHECK-NEXT: { -//CHECK-NEXT: _cond0 = index0 < 5; +//CHECK-NEXT: _cond0 = index0 < N; //CHECK-NEXT: if (_cond0) { //CHECK-NEXT: sum = 0; //CHECK-NEXT: _t3 = blockDim.x; @@ -197,7 +199,7 @@ __global__ void add_kernel_5(int *out, int *in) { //CHECK-NEXT: _t4 = 0UL; //CHECK-NEXT: for (i = index0; ; clad::push(_t5, i) , (i += totalThreads)) { //CHECK-NEXT: { -//CHECK-NEXT: if (!(i < 5)) +//CHECK-NEXT: if (!(i < N)) //CHECK-NEXT: break; //CHECK-NEXT: } //CHECK-NEXT: _t4++; @@ -229,7 +231,7 @@ __global__ void add_kernel_5(int *out, int *in) { //CHECK-NEXT: { //CHECK-NEXT: sum = clad::pop(_t6); //CHECK-NEXT: int _r_d1 = _d_sum; -//CHECK-NEXT: _d_in[i] += _r_d1; +//CHECK-NEXT: atomicAdd(&_d_in[i], _r_d1); //CHECK-NEXT: } //CHECK-NEXT: } //CHECK-NEXT: _d_index += _d_i; @@ -237,6 +239,55 @@ __global__ void add_kernel_5(int *out, int *in) { //CHECK-NEXT: } //CHECK-NEXT:} +__global__ void add_kernel_6(int *a, int *b) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + a[index] = *b; +} + +// CHECK: void add_kernel_6_grad(int *a, int *b, int *_d_a, int *_d_b) { +//CHECK-NEXT: unsigned int _t1 = blockIdx.x; +//CHECK-NEXT: unsigned int _t0 = blockDim.x; +//CHECK-NEXT: int _d_index = 0; +//CHECK-NEXT: int index0 = threadIdx.x + _t1 * _t0; +//CHECK-NEXT: int _t2 = a[index0]; +//CHECK-NEXT: a[index0] = *b; +//CHECK-NEXT: { +//CHECK-NEXT: a[index0] = _t2; +//CHECK-NEXT: int _r_d0 = _d_a[index0]; +//CHECK-NEXT: _d_a[index0] = 0; +//CHECK-NEXT: atomicAdd(_d_b, _r_d0); +//CHECK-NEXT: } +//CHECK-NEXT:} + +__global__ void add_kernel_7(double *a, double *b) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + a[2 * index] = b[0]; + a[2 * index + 1] = b[0]; +} + +// CHECK: void add_kernel_7_grad(double *a, double *b, double *_d_a, double *_d_b) { +//CHECK-NEXT: unsigned int _t1 = blockIdx.x; +//CHECK-NEXT: unsigned int _t0 = blockDim.x; +//CHECK-NEXT: int _d_index = 0; +//CHECK-NEXT: int index0 = threadIdx.x + _t1 * _t0; +//CHECK-NEXT: double _t2 = a[2 * index0]; +//CHECK-NEXT: a[2 * index0] = b[0]; +//CHECK-NEXT: double _t3 = a[2 * index0 + 1]; +//CHECK-NEXT: a[2 * index0 + 1] = b[0]; +//CHECK-NEXT: { +//CHECK-NEXT: a[2 * index0 + 1] = _t3; +//CHECK-NEXT: double _r_d1 = _d_a[2 * index0 + 1]; +//CHECK-NEXT: _d_a[2 * index0 + 1] = 0.; +//CHECK-NEXT: atomicAdd(&_d_b[0], _r_d1); +//CHECK-NEXT: } +//CHECK-NEXT: { +//CHECK-NEXT: a[2 * index0] = _t2; +//CHECK-NEXT: double _r_d0 = _d_a[2 * index0]; +//CHECK-NEXT: _d_a[2 * index0] = 0.; +//CHECK-NEXT: atomicAdd(&_d_b[0], _r_d0); +//CHECK-NEXT: } +//CHECK-NEXT:} + #define TEST(F, grid, block, shared_mem, use_stream, x, dx, N) \ { \ int *fives = (int*)malloc(N * sizeof(int)); \ @@ -306,6 +357,76 @@ __global__ void add_kernel_5(int *out, int *in) { free(res); \ } +#define TEST_2_N(F, grid, block, shared_mem, use_stream, args, y, x, dy, dx, N) \ + { \ + int *fives = (int*)malloc(N * sizeof(int)); \ + for(int i = 0; i < N; i++) { \ + fives[i] = 5; \ + } \ + int *zeros = (int*)malloc(N * sizeof(int)); \ + for(int i = 0; i < N; i++) { \ + zeros[i] = 0; \ + } \ + cudaMemcpy(x, fives, N * sizeof(int), cudaMemcpyHostToDevice); \ + cudaMemcpy(y, zeros, N * sizeof(int), cudaMemcpyHostToDevice); \ + cudaMemcpy(dy, fives, N * sizeof(int), cudaMemcpyHostToDevice); \ + cudaMemcpy(dx, zeros, N * sizeof(int), cudaMemcpyHostToDevice); \ + auto test = clad::gradient(F, args); \ + if constexpr (use_stream) { \ + cudaStream_t cudaStream; \ + cudaStreamCreate(&cudaStream); \ + test.execute_kernel(grid, block, shared_mem, cudaStream, y, x, N, dy, dx); \ + } \ + else { \ + test.execute_kernel(grid, block, y, x, N, dy, dx); \ + } \ + cudaDeviceSynchronize(); \ + int *res = (int*)malloc(N * sizeof(int)); \ + cudaMemcpy(res, dx, N * sizeof(int), cudaMemcpyDeviceToHost); \ + for (int i = 0; i < (N - 1); i++) { \ + printf("%d, ", res[i]); \ + } \ + printf("%d\n", res[N-1]); \ + free(fives); \ + free(zeros); \ + free(res); \ + } + +#define TEST_2_D(F, grid, block, shared_mem, use_stream, args, y, x, dy, dx, N) \ + { \ + double *fives = (double*)malloc(N * sizeof(double)); \ + for(int i = 0; i < N; i++) { \ + fives[i] = 5; \ + } \ + double *zeros = (double*)malloc(N * sizeof(double)); \ + for(int i = 0; i < N; i++) { \ + zeros[i] = 0; \ + } \ + cudaMemcpy(x, fives, N * sizeof(double), cudaMemcpyHostToDevice); \ + cudaMemcpy(y, zeros, N * sizeof(double), cudaMemcpyHostToDevice); \ + cudaMemcpy(dy, fives, N * sizeof(double), cudaMemcpyHostToDevice); \ + cudaMemcpy(dx, zeros, N * sizeof(double), cudaMemcpyHostToDevice); \ + auto test = clad::gradient(F, args); \ + if constexpr (use_stream) { \ + cudaStream_t cudaStream; \ + cudaStreamCreate(&cudaStream); \ + test.execute_kernel(grid, block, shared_mem, cudaStream, y, x, dy, dx); \ + } \ + else { \ + test.execute_kernel(grid, block, y, x, dy, dx); \ + } \ + cudaDeviceSynchronize(); \ + double *res = (double*)malloc(N * sizeof(double)); \ + cudaMemcpy(res, dx, N * sizeof(double), cudaMemcpyDeviceToHost); \ + for (int i = 0; i < (N - 1); i++) { \ + printf("%0.2f, ", res[i]); \ + } \ + printf("%0.2f\n", res[N-1]); \ + free(fives); \ + free(zeros); \ + free(res); \ + } + int main(void) { int *a, *d_a; @@ -326,21 +447,36 @@ int main(void) { int *dummy_in, *dummy_out, *d_out, *d_in; - cudaMalloc(&dummy_in, 5 * sizeof(int)); - cudaMalloc(&dummy_out, 5 * sizeof(int)); - cudaMalloc(&d_out, 5 * sizeof(int)); - cudaMalloc(&d_in, 5 * sizeof(int)); + cudaMalloc(&dummy_in, 10 * sizeof(int)); + cudaMalloc(&dummy_out, 10 * sizeof(int)); + cudaMalloc(&d_out, 10 * sizeof(int)); + cudaMalloc(&d_in, 10 * sizeof(int)); TEST_2(add_kernel, dim3(1), dim3(5, 1, 1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5 TEST_2(add_kernel_2, dim3(1), dim3(5, 1, 1), 0, true, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5 TEST_2(add_kernel_3, dim3(5, 1, 1), dim3(1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5 - TEST_2(add_kernel_4, dim3(1), dim3(5, 1, 1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5 - TEST_2(add_kernel_5, dim3(2, 1, 1), dim3(1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5 + TEST_2_N(add_kernel_4, dim3(1), dim3(5, 1, 1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5 + TEST_2_N(add_kernel_5, dim3(2, 1, 1), dim3(1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5 + TEST_2(add_kernel_6, dim3(1), dim3(5, 1, 1), 0, false, "a, b", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 25, 0, 0, 0, 0 cudaFree(dummy_in); cudaFree(dummy_out); cudaFree(d_out); cudaFree(d_in); + double *dummy_in_double, *dummy_out_double, *d_out_double, *d_in_double; + cudaMalloc(&dummy_in_double, 10 * sizeof(double)); + cudaMalloc(&dummy_out_double, 10 * sizeof(double)); + cudaMalloc(&d_out_double, 10 * sizeof(double)); + cudaMalloc(&d_in_double, 10 * sizeof(double)); + + TEST_2_D(add_kernel_7, dim3(1), dim3(5, 1, 1), 0, false, "a, b", dummy_out_double, dummy_in_double, d_out_double, d_in_double, 10); // CHECK-EXEC: 50.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 + + cudaFree(dummy_in_double); + cudaFree(dummy_out_double); + cudaFree(d_out_double); + cudaFree(d_in_double); + + return 0; }