Skip to content

Commit

Permalink
Don't skip cuda host functions when they have const args
Browse files Browse the repository at this point in the history
  • Loading branch information
kchristin22 committed Nov 3, 2024
1 parent 130e779 commit 8244bbd
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 23 deletions.
2 changes: 1 addition & 1 deletion include/clad/Differentiator/BuiltinDerivatives.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ __global__ void atomicAdd_kernel(T* destPtr, T* srcPtr, size_t N) {
}

template <typename T>
void cudaMemcpy_pullback(T* destPtr, T* srcPtr, size_t count,
void cudaMemcpy_pullback(T* destPtr, const T* srcPtr, const size_t count,
cudaMemcpyKind kind, T* d_destPtr, T* d_srcPtr,
size_t* d_count, cudaMemcpyKind* d_kind)
__attribute__((host)) {
Expand Down
6 changes: 4 additions & 2 deletions lib/Differentiator/ReverseModeVisitor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1821,7 +1821,8 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
// If all arguments are constant literals, then this does not contribute to
// the gradient.
// FIXME: revert this when this is integrated in the activity analysis pass.
if (!isa<CXXMemberCallExpr>(CE) && !isa<CXXOperatorCallExpr>(CE)) {
if (!isa<CXXMemberCallExpr>(CE) && !isa<CXXOperatorCallExpr>(CE) &&
CE->getCallReturnType(m_Context).getAsString() != "cudaError_t") {
bool allArgsAreConstantLiterals = true;
for (const Expr* arg : CE->arguments()) {
// if it's of type MaterializeTemporaryExpr, then check its
Expand All @@ -1845,7 +1846,8 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
// derived function. In the case of member functions, `implicit`
// this object is always passed by reference.
if (!dfdx() && !utils::HasAnyReferenceOrPointerArgument(FD) &&
!isa<CXXMemberCallExpr>(CE) && !isa<CXXOperatorCallExpr>(CE)) {
!isa<CXXMemberCallExpr>(CE) && !isa<CXXOperatorCallExpr>(CE) &&
CE->getCallReturnType(m_Context).getAsString() != "cudaError_t") {
for (const Expr* Arg : CE->arguments()) {
StmtDiff ArgDiff = Visit(Arg, dfdx());
CallArgs.push_back(ArgDiff.getExpr());
Expand Down
87 changes: 67 additions & 20 deletions test/CUDA/tensor_demo.cu
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
// RUN: %cladclang_cuda -I%S/../../include --cuda-path=%cudapath \
// RUN: --cuda-gpu-arch=%cudaarch %cudaldflags -otensor_demo.out %s
// RUN: ./tensor_demo.out | %filecheck_exec %s


#include "clad/Differentiator/Differentiator.h"

typedef unsigned long long int size_type;

__device__ void computeStartStep(size_type& A_start, size_type& A_step, size_type& B_start, size_type& B_step, const int idx, const size_type A_dim[3], const size_type B_dim[3], const int contractDims[2]) {
__device__ void computeStartStep(size_type& A_start, size_type& A_step, size_type& B_start, size_type& B_step, const int idx, const size_type A_dim[3], const size_type B_dim[3], const int contractDimA, const int contractDimB) {
size_type A_a, A_b, A_c, B_d, B_e, B_f;
int contractDimA = contractDims[0];
int contractDimB = contractDims[1];

switch (contractDimA) {
case 0:
Expand Down Expand Up @@ -50,18 +53,16 @@ __device__ void computeStartStep(size_type& A_start, size_type& A_step, size_typ
}
}

__global__ void tensorContraction3D(float* C, const float *A, const float *B, const size_type *A_dim, const size_type *B_dim, const int contractDims[2]) {
__global__ void tensorContraction3D(float* C, const float *A, const float *B, const size_type *A_dim, const size_type *B_dim, const int contractDimA, const int contractDimB) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int contractDimA = contractDims[0];
int contractDimB = contractDims[1];

// Each thread computes one element of the output tensor
int totalElements = A_dim[(contractDimA + 1) % 3] * A_dim[(contractDimA + 2) % 3] * B_dim[(contractDimB + 1) % 3] * B_dim[(contractDimB + 2) % 3];
if (idx < totalElements) {
size_type A_start, B_start, A_step, B_step;
size_type A_a, A_b, A_c, B_d, B_e, B_f;

computeStartStep(A_start, A_step, B_start, B_step, idx, A_dim, B_dim, contractDims);
computeStartStep(A_start, A_step, B_start, B_step, idx, A_dim, B_dim, contractDimA, contractDimB);

float sum = 0.0f;
for (int i = 0; i < A_dim[contractDimA]; i++) { // A_dim[contractDimA] == B_dim[contractDimB]
Expand All @@ -72,12 +73,12 @@ __global__ void tensorContraction3D(float* C, const float *A, const float *B, co
}
}

void launchTensorContraction3D(float* C, float* A, float* B, const size_type D1, const size_type D2, const size_type D3, const size_type D4, const size_type D5) {
void launchTensorContraction3D(float* C, const float* A, const float* B, const size_type D1, const size_type D2, const size_type D3, const size_type D4, const size_type D5) {
float *d_A = nullptr, *d_B = nullptr, *d_C = nullptr;

size_type A_size = D1 * D2 * D3 * sizeof(float);
size_type B_size = D3 * D4 * D5 * sizeof(float);
size_type C_size = D1 * D2 * D4 * D5 * sizeof(float);
const size_type A_size = D1 * D2 * D3 * sizeof(float);
const size_type B_size = D3 * D4 * D5 * sizeof(float);
const size_type C_size = D1 * D2 * D4 * D5 * sizeof(float);

// Allocate device memory and copy data from host to device
cudaMalloc(&d_A, A_size);
Expand All @@ -86,22 +87,17 @@ void launchTensorContraction3D(float* C, float* A, float* B, const size_type D1,
cudaMemcpy(d_A, A, A_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, B, B_size, cudaMemcpyHostToDevice);

size_type A_dim[3] = {D1, D2, D3};
size_type B_dim[3] = {D3, D4, D5};
const size_type A_dim[3] = {D1, D2, D3};
const size_type B_dim[3] = {D3, D4, D5};

size_type *d_A_dim = nullptr, *d_B_dim = nullptr;
cudaMalloc(&d_A_dim, 3 * sizeof(size_type));
cudaMalloc(&d_B_dim, 3 * sizeof(size_type));
cudaMemcpy(d_A_dim, A_dim, 3 * sizeof(size_type), cudaMemcpyHostToDevice);
cudaMemcpy(d_B_dim, B_dim, 3 * sizeof(size_type), cudaMemcpyHostToDevice);

int contractDims[2] = {2, 0};
int *d_contractDims = nullptr;
cudaMalloc(&d_contractDims, 2 * sizeof(int));
cudaMemcpy(d_contractDims, contractDims, 2 * sizeof(int), cudaMemcpyHostToDevice);

// Launch the kernel
tensorContraction3D<<<1, 256>>>(d_C, d_A, d_B, d_A_dim, d_B_dim, d_contractDims);
tensorContraction3D<<<1, 256>>>(d_C, d_A, d_B, d_A_dim, d_B_dim, /*contractDimA=*/2, /*contractDimB=*/0);

// Copy the result from device to host
cudaMemcpy(C, d_C, C_size, cudaMemcpyDeviceToHost);
Expand All @@ -112,7 +108,6 @@ void launchTensorContraction3D(float* C, float* A, float* B, const size_type D1,
cudaFree(d_C);
cudaFree(d_A_dim);
cudaFree(d_B_dim);
cudaFree(d_contractDims);
}

int main() {
Expand Down Expand Up @@ -195,3 +190,55 @@ int main() {

return 0;
}

// CHECK-EXEC: Result C:
// CHECK-NEXT: 130 140
// CHECK-NEXT: 150 160
// CHECK-NEXT: 170 180
// CHECK-NEXT:
// CHECK-NEXT: 290 316
// CHECK-NEXT: 342 368
// CHECK-NEXT: 394 420
// CHECK-NEXT:
// CHECK-NEXT: 450 492
// CHECK-NEXT: 534 576
// CHECK-NEXT: 618 660
// CHECK-NEXT:
// CHECK-NEXT:
// CHECK-NEXT: 610 668
// CHECK-NEXT: 726 784
// CHECK-NEXT: 842 900
// CHECK-NEXT:
// CHECK-NEXT: 770 844
// CHECK-NEXT: 918 992
// CHECK-NEXT: 1066 1140
// CHECK-NEXT:
// CHECK-NEXT: 930 1020
// CHECK-NEXT: 1110 1200
// CHECK-NEXT: 1290 1380

// CHECK-EXEC: Result C_grad w.r.t. A:
// CHECK-NEXT: 21 57 93 129
// CHECK-NEXT: 21 57 93 129
// CHECK-NEXT: 21 57 93 129
// CHECK-NEXT:
// CHECK-NEXT: 21 57 93 129
// CHECK-NEXT: 21 57 93 129
// CHECK-NEXT: 21 57 93 129
// CHECK-NEXT:
// CHECK-EXEC: Result C_grad w.r.t. B:
// CHECK-NEXT: 66 66
// CHECK-NEXT: 66 66
// CHECK-NEXT: 66 66
// CHECK-NEXT:
// CHECK-NEXT: 72 72
// CHECK-NEXT: 72 72
// CHECK-NEXT: 72 72
// CHECK-NEXT:
// CHECK-NEXT: 78 78
// CHECK-NEXT: 78 78
// CHECK-NEXT: 78 78
// CHECK-NEXT:
// CHECK-NEXT: 84 84
// CHECK-NEXT: 84 84
// CHECK-NEXT: 84 84

0 comments on commit 8244bbd

Please sign in to comment.