Skip to content

Commit

Permalink
Fix _r local vars being passed to non-ref cuda kernel pullbacks (#1133)
Browse files Browse the repository at this point in the history
  • Loading branch information
kchristin22 authored Nov 12, 2024
1 parent fa89545 commit d3292eb
Show file tree
Hide file tree
Showing 9 changed files with 282 additions and 20 deletions.
2 changes: 1 addition & 1 deletion include/clad/Differentiator/BuiltinDerivatives.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ __global__ void atomicAdd_kernel(T* destPtr, T* srcPtr, size_t N) {
}

template <typename T>
void cudaMemcpy_pullback(T* destPtr, T* srcPtr, size_t count,
void cudaMemcpy_pullback(T* destPtr, const T* srcPtr, size_t count,
cudaMemcpyKind kind, T* d_destPtr, T* d_srcPtr,
size_t* d_count, cudaMemcpyKind* d_kind)
__attribute__((host)) {
Expand Down
10 changes: 10 additions & 0 deletions include/clad/Differentiator/CladUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,16 @@ namespace clad {
clang::DeclContext* DC1,
clang::DeclContext* DC2);

/// Finds the qualified name `name` in the declaration context `DC`.
///
/// \param[in] name
/// \param[in] S
/// \param[in] DC
/// \returns lookup result.
clang::LookupResult LookupQualifiedName(llvm::StringRef name,
clang::Sema& S,
clang::DeclContext* DC = nullptr);

/// Finds namespace 'namespc` under the declaration context `DC` or the
/// translation unit declaration if `DC` is null.
///
Expand Down
21 changes: 20 additions & 1 deletion include/clad/Differentiator/Compatibility.h
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ static inline IfStmt* IfStmt_Create(const ASTContext &Ctx,
#endif
}

// Compatibility helper function for creation CallExpr.
// Compatibility helper function for creation CallExpr and CUDAKernelCallExpr.
// Clang 12 and above use one extra param.

#if CLANG_VERSION_MAJOR < 12
Expand All @@ -188,13 +188,32 @@ static inline CallExpr* CallExpr_Create(const ASTContext &Ctx, Expr *Fn, ArrayRe
{
return CallExpr::Create(Ctx, Fn, Args, Ty, VK, RParenLoc, MinNumArgs, UsesADL);
}

static inline CUDAKernelCallExpr*
CUDAKernelCallExpr_Create(const ASTContext& Ctx, Expr* Fn, CallExpr* Config,
ArrayRef<Expr*> Args, QualType Ty, ExprValueKind VK,
SourceLocation RParenLoc, unsigned MinNumArgs = 0,
CallExpr::ADLCallKind UsesADL = CallExpr::NotADL) {
return CUDAKernelCallExpr::Create(Ctx, Fn, Config, Args, Ty, VK, RParenLoc,
MinNumArgs);
}
#elif CLANG_VERSION_MAJOR >= 12
static inline CallExpr* CallExpr_Create(const ASTContext &Ctx, Expr *Fn, ArrayRef< Expr *> Args,
QualType Ty, ExprValueKind VK, SourceLocation RParenLoc, FPOptionsOverride FPFeatures,
unsigned MinNumArgs = 0, CallExpr::ADLCallKind UsesADL = CallExpr::NotADL)
{
return CallExpr::Create(Ctx, Fn, Args, Ty, VK, RParenLoc, FPFeatures, MinNumArgs, UsesADL);
}

static inline CUDAKernelCallExpr*
CUDAKernelCallExpr_Create(const ASTContext& Ctx, Expr* Fn, CallExpr* Config,
ArrayRef<Expr*> Args, QualType Ty, ExprValueKind VK,
SourceLocation RParenLoc,
FPOptionsOverride FPFeatures, unsigned MinNumArgs = 0,
CallExpr::ADLCallKind UsesADL = CallExpr::NotADL) {
return CUDAKernelCallExpr::Create(Ctx, Fn, Config, Args, Ty, VK, RParenLoc,
FPFeatures, MinNumArgs);
}
#endif

// Clang 12 and above use one extra param.
Expand Down
2 changes: 1 addition & 1 deletion include/clad/Differentiator/Differentiator.h
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ CUDA_HOST_DEVICE T push(tape<T>& to, ArgsT... val) {
#if defined(__CUDACC__) && !defined(__CUDA_ARCH__)
if (CUDAkernel) {
constexpr size_t totalArgs = sizeof...(args) + sizeof...(Rest);
std::array<void*, totalArgs> argPtrs = {static_cast<void*>(&args)...,
std::array<void*, totalArgs> argPtrs = {(void*)(&args)...,
static_cast<Rest>(nullptr)...};

void* null_param = nullptr;
Expand Down
1 change: 1 addition & 0 deletions include/clad/Differentiator/StmtClone.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ namespace utils {
DECLARE_CLONE_FN(ExtVectorElementExpr)
DECLARE_CLONE_FN(UnaryExprOrTypeTraitExpr)
DECLARE_CLONE_FN(CallExpr)
DECLARE_CLONE_FN(CUDAKernelCallExpr)
DECLARE_CLONE_FN(ShuffleVectorExpr)
DECLARE_CLONE_FN(ExprWithCleanups)
DECLARE_CLONE_FN(CXXOperatorCallExpr)
Expand Down
12 changes: 12 additions & 0 deletions lib/Differentiator/CladUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,18 @@ namespace clad {
return DC->getPrimaryContext();
}

LookupResult LookupQualifiedName(llvm::StringRef name, clang::Sema& S,
clang::DeclContext* DC) {
ASTContext& C = S.getASTContext();
DeclarationName declName = &C.Idents.get(name);
LookupResult Result(S, declName, SourceLocation(),
Sema::LookupOrdinaryName);
if (!DC)
DC = C.getTranslationUnitDecl();
S.LookupQualifiedName(Result, DC);
return Result;
}

NamespaceDecl* LookupNSD(Sema& S, llvm::StringRef namespc, bool shouldExist,
DeclContext* DC) {
ASTContext& C = S.getASTContext();
Expand Down
74 changes: 72 additions & 2 deletions lib/Differentiator/ReverseModeVisitor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1911,7 +1911,73 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
QualType dArgTy = getNonConstType(arg->getType(), m_Context, m_Sema);
VarDecl* dArgDecl = BuildVarDecl(dArgTy, "_r", getZeroInit(dArgTy));
PreCallStmts.push_back(BuildDeclStmt(dArgDecl));
CallArgDx.push_back(BuildDeclRef(dArgDecl));
DeclRefExpr* dArgRef = BuildDeclRef(dArgDecl);
if (isa<CUDAKernelCallExpr>(CE)) {
// Create variables to be allocated and initialized on the device, and
// then be passed to the kernel pullback.
//
// These need to be pointers because cudaMalloc expects a
// pointer-to-pointer as an arg.
// The memory addresses they point to are initialized to zero through
// cudaMemset.
// After the pullback call, their values will be copied back to the
// corresponding _r variables on the host and the device variables
// will be freed.
//
// Example of the generated code:
//
// double _r0 = 0;
// double* _r1 = nullptr;
// cudaMalloc(&_r1, sizeof(double));
// cudaMemset(_r1, 0, 8);
// kernel_pullback<<<...>>>(..., _r1);
// cudaMemcpy(&_r0, _r1, 8, cudaMemcpyDeviceToHost);
// cudaFree(_r1);

// Create a literal for the size of the type
Expr* sizeLiteral = ConstantFolder::synthesizeLiteral(
m_Context.IntTy, m_Context, m_Context.getTypeSize(dArgTy) / 8);
dArgTy = m_Context.getPointerType(dArgTy);
VarDecl* dArgDeclCUDA =
BuildVarDecl(dArgTy, "_r", getZeroInit(dArgTy));

// Create the cudaMemcpyDeviceToHost argument
LookupResult deviceToHostResult =
utils::LookupQualifiedName("cudaMemcpyDeviceToHost", m_Sema);
if (deviceToHostResult.empty()) {
diag(DiagnosticsEngine::Error, CE->getEndLoc(),
"Failed to create cudaMemcpy call; cudaMemcpyDeviceToHost not "
"found. Creating kernel pullback aborted.");
return StmtDiff(Clone(CE));
}
CXXScopeSpec SS;
Expr* deviceToHostExpr =
m_Sema
.BuildDeclarationNameExpr(SS, deviceToHostResult,
/*ADL=*/false)
.get();

// Add calls to cudaMalloc, cudaMemset, cudaMemcpy, and cudaFree
PreCallStmts.push_back(BuildDeclStmt(dArgDeclCUDA));
Expr* refOp = BuildOp(UO_AddrOf, BuildDeclRef(dArgDeclCUDA));
llvm::SmallVector<Expr*, 3> mallocArgs = {refOp, sizeLiteral};
PreCallStmts.push_back(GetFunctionCall("cudaMalloc", "", mallocArgs));
llvm::SmallVector<Expr*, 3> memsetArgs = {
BuildDeclRef(dArgDeclCUDA), getZeroInit(m_Context.IntTy),
sizeLiteral};
PreCallStmts.push_back(GetFunctionCall("cudaMemset", "", memsetArgs));
llvm::SmallVector<Expr*, 4> cudaMemcpyArgs = {
BuildOp(UO_AddrOf, dArgRef), BuildDeclRef(dArgDeclCUDA),
sizeLiteral, deviceToHostExpr};
PostCallStmts.push_back(
GetFunctionCall("cudaMemcpy", "", cudaMemcpyArgs));
llvm::SmallVector<Expr*, 3> freeArgs = {BuildDeclRef(dArgDeclCUDA)};
PostCallStmts.push_back(GetFunctionCall("cudaFree", "", freeArgs));

// Update arg to be passed to pullback call
dArgRef = BuildDeclRef(dArgDeclCUDA);
}
CallArgDx.push_back(dArgRef);
// Visit using uninitialized reference.
argDiff = Visit(arg, BuildDeclRef(dArgDecl));
}
Expand Down Expand Up @@ -2040,7 +2106,8 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
Expr* gradArgExpr = nullptr;
QualType paramTy = FD->getParamDecl(idx)->getType();
if (!argDerivative || utils::isArrayOrPointerType(paramTy) ||
isCladArrayType(argDerivative->getType()))
isCladArrayType(argDerivative->getType()) ||
isa<CUDAKernelCallExpr>(CE))
gradArgExpr = argDerivative;
else
gradArgExpr =
Expand Down Expand Up @@ -2228,6 +2295,9 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
m_ExternalSource->ActBeforeFinalizingVisitCallExpr(
CE, OverloadedDerivedFn, DerivedCallArgs, CallArgDx, asGrad);

if (isa<CUDAKernelCallExpr>(CE))
return StmtDiff(Clone(CE));

Expr* call = nullptr;

QualType returnType = FD->getReturnType();
Expand Down
41 changes: 26 additions & 15 deletions lib/Differentiator/StmtClone.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -313,13 +313,30 @@ Stmt* StmtClone::VisitUnaryExprOrTypeTraitExpr(UnaryExprOrTypeTraitExpr* Node) {
}

Stmt* StmtClone::VisitCallExpr(CallExpr* Node) {
llvm::SmallVector<Expr*, 4> clonedArgs;
for (Expr* arg : Node->arguments())
clonedArgs.push_back(Clone(arg));

CallExpr* result = clad_compat::CallExpr_Create(
Ctx, Clone(Node->getCallee()), llvm::ArrayRef<Expr*>(),
Ctx, Clone(Node->getCallee()), clonedArgs, CloneType(Node->getType()),
Node->getValueKind(),
Node->getRParenLoc() CLAD_COMPAT_CLANG8_CallExpr_ExtraParams);

// Copy Value and Type dependent
clad_compat::ExprSetDeps(result, Node);

return result;
}

Stmt* StmtClone::VisitCUDAKernelCallExpr(CUDAKernelCallExpr* Node) {
llvm::SmallVector<Expr*, 4> clonedArgs;
for (Expr* arg : Node->arguments())
clonedArgs.push_back(Clone(arg));

CUDAKernelCallExpr* result = clad_compat::CUDAKernelCallExpr_Create(
Ctx, Clone(Node->getCallee()), Clone(Node->getConfig()), clonedArgs,
CloneType(Node->getType()), Node->getValueKind(),
Node->getRParenLoc() CLAD_COMPAT_CLANG8_CallExpr_ExtraParams);
result->setNumArgsUnsafe(Node->getNumArgs());
for (unsigned i = 0, e = Node->getNumArgs(); i < e; ++i)
result->setArg(i, Clone(Node->getArg(i)));

// Copy Value and Type dependent
clad_compat::ExprSetDeps(result, Node);
Expand Down Expand Up @@ -352,28 +369,22 @@ Stmt* StmtClone::VisitCXXOperatorCallExpr(CXXOperatorCallExpr* Node) {
Node->getFPFeatures()
CLAD_COMPAT_CLANG11_CXXOperatorCallExpr_Create_ExtraParamsUse);

//### result->setNumArgs(Ctx, Node->getNumArgs());
result->setNumArgsUnsafe(Node->getNumArgs());
for (unsigned i = 0, e = Node->getNumArgs(); i < e; ++i)
result->setArg(i, Clone(Node->getArg(i)));

// Copy Value and Type dependent
clad_compat::ExprSetDeps(result, Node);

return result;
}

Stmt* StmtClone::VisitCXXMemberCallExpr(CXXMemberCallExpr * Node) {
llvm::SmallVector<Expr*, 4> clonedArgs;
for (Expr* arg : Node->arguments())
clonedArgs.push_back(Clone(arg));

CXXMemberCallExpr* result = clad_compat::CXXMemberCallExpr_Create(
Ctx, Clone(Node->getCallee()), {}, CloneType(Node->getType()),
Ctx, Clone(Node->getCallee()), clonedArgs, CloneType(Node->getType()),
Node->getValueKind(),
Node->getRParenLoc()
/*FP*/ CLAD_COMPAT_CLANG12_CastExpr_GetFPO(Node));
// ### result->setNumArgs(Ctx, Node->getNumArgs());
result->setNumArgsUnsafe(Node->getNumArgs());

for (unsigned i = 0, e = Node->getNumArgs(); i < e; ++i)
result->setArg(i, Clone(Node->getArg(i)));

// Copy Value and Type dependent
clad_compat::ExprSetDeps(result, Node);
Expand Down
Loading

0 comments on commit d3292eb

Please sign in to comment.