Skip to content

Commit

Permalink
[cuda] Make add-assign op atomic to avoid write race conditions.
Browse files Browse the repository at this point in the history
  • Loading branch information
kchristin22 authored and vgvassilev committed Oct 5, 2024
1 parent 844d9a3 commit 4c7bea2
Show file tree
Hide file tree
Showing 4 changed files with 233 additions and 32 deletions.
14 changes: 14 additions & 0 deletions include/clad/Differentiator/ReverseModeVisitor.h
Original file line number Diff line number Diff line change
Expand Up @@ -433,6 +433,20 @@ namespace clad {
/// Visits a null statement. The returned StmtDiff is built from two
/// separate clones of \p NS.
StmtDiff VisitNullStmt(const clang::NullStmt* NS) {
  auto* firstClone = Clone(NS);
  auto* secondClone = Clone(NS);
  return StmtDiff{firstClone, secondClone};
}

/// Helper function that checks whether the function to be derived
/// is meant to be executed only by the GPU.
///
/// \returns true if the function to be derived carries
/// clang::CUDAGlobalAttr, or carries clang::CUDADeviceAttr without also
/// carrying clang::CUDAHostAttr; false otherwise.
bool shouldUseCudaAtomicOps();

/// Build a call to `atomicAdd` for the given LHS and RHS expressions.
/// The callee is found by looking up the name `atomicAdd` in the
/// translation unit scope.
///
/// \param[in] LHS The left-hand side expression, i.e. the accumulation
/// target. If it is an array subscript expression, its address is taken
/// before it is passed as the first argument; any other expression is
/// passed unchanged (presumably it must already be a pointer -- confirm
/// against callers).
///
/// \param[in] RHS The right-hand side expression, passed as the second
/// argument (the value to add).
///
/// \returns The atomicAdd call expression.
clang::Expr* BuildCallToCudaAtomicAdd(clang::Expr* LHS, clang::Expr* RHS);

static DeclDiff<clang::StaticAssertDecl>
DifferentiateStaticAssertDecl(const clang::StaticAssertDecl* SAD);

Expand Down
63 changes: 57 additions & 6 deletions lib/Differentiator/ReverseModeVisitor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,45 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
return CladTapeResult{*this, PushExpr, PopExpr, TapeRef};
}

// Decides whether derivative accumulation should go through CUDA atomics,
// based solely on the CUDA attributes of the function being differentiated.
bool ReverseModeVisitor::shouldUseCudaAtomicOps() {
  // A function marked with CUDAGlobalAttr always qualifies.
  if (m_DiffReq->hasAttr<clang::CUDAGlobalAttr>())
    return true;
  // Without CUDADeviceAttr there is nothing left that could qualify.
  if (!m_DiffReq->hasAttr<clang::CUDADeviceAttr>())
    return false;
  // Device functions qualify only when they are not host functions as well.
  return !m_DiffReq->hasAttr<clang::CUDAHostAttr>();
}

// Builds the call expression `atomicAdd(<target>, RHS)`.
//
// `atomicAdd` is looked up by name at translation-unit scope. When LHS is an
// array subscript expression its address is passed as the first argument;
// every other LHS is forwarded unchanged. Returns whatever Sema produces for
// the call.
clang::Expr* ReverseModeVisitor::BuildCallToCudaAtomicAdd(clang::Expr* LHS,
                                                          clang::Expr* RHS) {
  // Collect the declarations named `atomicAdd` in the translation unit.
  DeclarationName fnName(&m_Context.Idents.get("atomicAdd"));
  LookupResult candidates(m_Sema, fnName, SourceLocation(),
                          Sema::LookupOrdinaryName);
  m_Sema.LookupQualifiedName(candidates, m_Context.getTranslationUnitDecl());

  // Wrap the lookup result in an expression usable as the callee.
  CXXScopeSpec emptySpec;
  Expr* callee =
      m_Sema.BuildDeclarationNameExpr(emptySpec, candidates, /*ADL=*/true)
          .get();

  // Array elements are passed by address; anything else goes through as is.
  Expr* target = isa<ArraySubscriptExpr>(LHS)
                     ? BuildOp(UnaryOperatorKind::UO_AddrOf, LHS)
                     : LHS;
  llvm::SmallVector<Expr*, 2> callArgs;
  callArgs.push_back(target);
  callArgs.push_back(RHS);

  assert(!m_Builder.noOverloadExists(callee, callArgs) &&
         "atomicAdd function not found");

  return m_Sema
      .ActOnCallExpr(getCurrentScope(),
                     /*Fn=*/callee,
                     /*LParenLoc=*/noLoc,
                     /*ArgExprs=*/llvm::MutableArrayRef<Expr*>(callArgs),
                     /*RParenLoc=*/m_DiffReq->getLocation())
      .get();
}

// Constructs the visitor: forwards the derivative builder and the
// differentiation request to the VisitorBase base class and initializes
// m_Result to nullptr.
ReverseModeVisitor::ReverseModeVisitor(DerivativeBuilder& builder,
const DiffRequest& request)
: VisitorBase(builder, request), m_Result(nullptr) {}
Expand Down Expand Up @@ -1485,9 +1524,15 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
BuildArraySubscript(target, forwSweepDerivativeIndices);
// Create the (target += dfdx) statement.
if (dfdx()) {
auto* add_assign = BuildOp(BO_AddAssign, result, dfdx());
// Add it to the body statements.
addToCurrentBlock(add_assign, direction::reverse);
if (shouldUseCudaAtomicOps()) {
Expr* atomicCall = BuildCallToCudaAtomicAdd(result, dfdx());
// Add it to the body statements.
addToCurrentBlock(atomicCall, direction::reverse);
} else {
auto* add_assign = BuildOp(BO_AddAssign, result, dfdx());
// Add it to the body statements.
addToCurrentBlock(add_assign, direction::reverse);
}
}
if (m_ExternalSource)
m_ExternalSource->ActAfterProcessingArraySubscriptExpr(valueForRevSweep);
Expand Down Expand Up @@ -2279,9 +2324,15 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
derivedE = BuildOp(UnaryOperatorKind::UO_Deref, diff_dx);
// Create the (target += dfdx) statement.
if (dfdx()) {
auto* add_assign = BuildOp(BO_AddAssign, derivedE, dfdx());
// Add it to the body statements.
addToCurrentBlock(add_assign, direction::reverse);
if (shouldUseCudaAtomicOps()) {
Expr* atomicCall = BuildCallToCudaAtomicAdd(diff_dx, dfdx());
// Add it to the body statements.
addToCurrentBlock(atomicCall, direction::reverse);
} else {
auto* add_assign = BuildOp(BO_AddAssign, derivedE, dfdx());
// Add it to the body statements.
addToCurrentBlock(add_assign, direction::reverse);
}
}
}
return {cloneE, derivedE, derivedE};
Expand Down
2 changes: 1 addition & 1 deletion test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ if(CUDAToolkit_FOUND)
get_filename_component(CUDA_ROOT "${CUDAToolkit_BIN_DIR}" DIRECTORY ABSOLUTE)
get_filename_component(CUDA_LIBDIR "${CUDA_cudart_static_LIBRARY}" DIRECTORY)

set(LIBOMPTARGET_DEP_CUDA_ARCH "sm_50")
set(LIBOMPTARGET_DEP_CUDA_ARCH "sm_60")

if(TARGET nvptx-arch)
get_property(LIBOMPTARGET_NVPTX_ARCH TARGET nvptx-arch PROPERTY LOCATION)
Expand Down
Loading

0 comments on commit 4c7bea2

Please sign in to comment.