Add support for CUDA builtins and tests
Added support for the CUDA grid-configuration builtin variables. Builtins tested: threadIdx, blockIdx, blockDim, gridDim, warpSize.
kchristin22 committed Sep 15, 2024
1 parent 3ca7de7 commit aaf593c
Showing 3 changed files with 149 additions and 30 deletions.
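
For context, the pattern these tests exercise: clad differentiates a __global__ kernel whose indexing uses the CUDA grid builtins, and the generated gradient kernel is launched through execute_kernel. A minimal sketch mirroring add_kernel from the test file below (buffer initialization and error checking omitted; the launch shape is illustrative):

#include "clad/Differentiator/Differentiator.h"

__global__ void add_kernel(int *out, int *in) {
  int index = threadIdx.x; // grid builtin, handled by the new VisitPseudoObjectExpr
  out[index] += in[index];
}

int main() {
  int *in, *out, *d_in, *d_out;
  cudaMalloc(&in, 5 * sizeof(int));
  cudaMalloc(&out, 5 * sizeof(int));
  cudaMalloc(&d_in, 5 * sizeof(int));
  cudaMalloc(&d_out, 5 * sizeof(int));
  // ... copy inputs into `in` and seed `d_out` with ones ...

  auto grad = clad::gradient(add_kernel);     // generates add_kernel_grad
  grad.execute_kernel(dim3(1), dim3(5, 1, 1), // grid and block dims come first
                      out, in, d_out, d_in);
  cudaDeviceSynchronize();                    // d_in now holds the accumulated gradient
  return 0;
}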
1 change: 1 addition & 0 deletions include/clad/Differentiator/ReverseModeVisitor.h
@@ -396,6 +396,7 @@ namespace clad {
StmtDiff
VisitImplicitValueInitExpr(const clang::ImplicitValueInitExpr* IVIE);
StmtDiff VisitCStyleCastExpr(const clang::CStyleCastExpr* CSCE);
StmtDiff VisitPseudoObjectExpr(const clang::PseudoObjectExpr* POE);
StmtDiff VisitInitListExpr(const clang::InitListExpr* ILE);
StmtDiff VisitIntegerLiteral(const clang::IntegerLiteral* IL);
StmtDiff VisitMemberExpr(const clang::MemberExpr* ME);
6 changes: 6 additions & 0 deletions lib/Differentiator/ReverseModeVisitor.cpp
@@ -3272,6 +3272,12 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
return {castExpr, castExprDiff};
}

StmtDiff
ReverseModeVisitor::VisitPseudoObjectExpr(const PseudoObjectExpr* POE) {
// Used for CUDA Builtins
return {Clone(POE), Clone(POE)};
}
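
A note on the one-line body above: in Clang's CUDA headers (__clang_cuda_builtin_vars.h), threadIdx, blockIdx, blockDim, and gridDim are property-style builtin objects, so an access such as threadIdx.x reaches clad as a PseudoObjectExpr whose semantic form calls a builtin getter. The value is a constant with respect to the kernel's differentiated parameters, so no adjoint needs to be recorded: the visitor returns a clone of the expression for both the forward and the reverse sweep. Roughly (AST shape as reported by clang -ast-dump; the exact wrappers vary across Clang versions):

// `threadIdx.x` in device code dumps approximately as:
//   PseudoObjectExpr 'unsigned int'
//   |-MemberExpr .x                  <- syntactic form
//   `-CallExpr __fetch_builtin_x()   <- semantic form: builtin getter
// Being independent of the active inputs, it is returned unchanged:
//   return {Clone(POE), Clone(POE)};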

StmtDiff ReverseModeVisitor::VisitMemberExpr(const MemberExpr* ME) {
auto baseDiff = VisitWithExplicitNoDfDx(ME->getBase());
auto* field = ME->getMemberDecl();
172 changes: 142 additions & 30 deletions test/CUDA/GradientKernels.cu
@@ -33,46 +33,158 @@ void fake_kernel(int *a) {
*a *= *a;
}

__global__ void add_kernel(int *out, int *in) {
int index = threadIdx.x;
out[index] += in[index];
}

// CHECK: void add_kernel_grad(int *out, int *in, int *_d_out, int *_d_in) {
//CHECK-NEXT: int _d_index = 0;
//CHECK-NEXT: int index0 = threadIdx.x;
//CHECK-NEXT: int _t0 = out[index0];
//CHECK-NEXT: out[index0] += in[index0];
//CHECK-NEXT: {
//CHECK-NEXT: out[index0] = _t0;
//CHECK-NEXT: int _r_d0 = _d_out[index0];
//CHECK-NEXT: _d_in[index0] += _r_d0;
//CHECK-NEXT: }
//CHECK-NEXT: }

__global__ void add_kernel_2(int *out, int *in) {
out[threadIdx.x] += in[threadIdx.x];
}

// CHECK: void add_kernel_2_grad(int *out, int *in, int *_d_out, int *_d_in) {
//CHECK-NEXT: int _t0 = out[threadIdx.x];
//CHECK-NEXT: out[threadIdx.x] += in[threadIdx.x];
//CHECK-NEXT: {
//CHECK-NEXT: out[threadIdx.x] = _t0;
//CHECK-NEXT: int _r_d0 = _d_out[threadIdx.x];
//CHECK-NEXT: _d_in[threadIdx.x] += _r_d0;
//CHECK-NEXT: }
//CHECK-NEXT: }

__global__ void add_kernel_3(int *out, int *in) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
out[index] += in[index];
}

// CHECK: void add_kernel_3_grad(int *out, int *in, int *_d_out, int *_d_in) {
//CHECK-NEXT: unsigned int _t1 = blockIdx.x;
//CHECK-NEXT: unsigned int _t0 = blockDim.x;
//CHECK-NEXT: int _d_index = 0;
//CHECK-NEXT: int index0 = threadIdx.x + _t1 * _t0;
//CHECK-NEXT: int _t2 = out[index0];
//CHECK-NEXT: out[index0] += in[index0];
//CHECK-NEXT: {
//CHECK-NEXT: out[index0] = _t2;
//CHECK-NEXT: int _r_d0 = _d_out[index0];
//CHECK-NEXT: _d_in[index0] += _r_d0;
//CHECK-NEXT: }
//CHECK-NEXT:}
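
The generated bodies above follow the standard reverse-mode pattern for out[index] += in[index]: the forward sweep saves the value about to be overwritten (_t0/_t2), and the reverse sweep restores it and propagates the adjoint, _d_in[index] += _d_out[index] (for y += x, reverse mode yields x_adj += y_adj, with y_adj unchanged). The builtin reads themselves contribute no adjoint; in add_kernel_3, blockIdx.x and blockDim.x are merely cached into _t1 and _t0 so the forward sweep computes index0 from stable temporaries.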

#define TEST(F, grid, block, shared_mem, use_stream, x, dx, N) \
{ \
int *fives = (int*)malloc(N * sizeof(int)); \
for(int i = 0; i < N; i++) { \
fives[i] = 5; \
} \
int *ones = (int*)malloc(N * sizeof(int)); \
for(int i = 0; i < N; i++) { \
ones[i] = 1; \
} \
cudaMemcpy(x, fives, N * sizeof(int), cudaMemcpyHostToDevice); \
cudaMemcpy(dx, ones, N * sizeof(int), cudaMemcpyHostToDevice); \
auto test = clad::gradient(F); \
if constexpr (use_stream) { \
cudaStream_t cudaStream; \
cudaStreamCreate(&cudaStream); \
test.execute_kernel(grid, block, shared_mem, cudaStream, x, dx); \
} \
else { \
test.execute_kernel(grid, block, x, dx); \
} \
cudaDeviceSynchronize(); \
int *res = (int*)malloc(N * sizeof(int)); \
cudaMemcpy(res, dx, N * sizeof(int), cudaMemcpyDeviceToHost); \
for (int i = 0; i < (N - 1); i++) { \
printf("%d, ", res[i]); \
} \
printf("%d\n", res[N-1]); \
free(fives); \
free(ones); \
free(res); \
}


#define TEST_2(F, grid, block, shared_mem, use_stream, args, y, x, dy, dx, N) \
{ \
int *fives = (int*)malloc(N * sizeof(int)); \
for(int i = 0; i < N; i++) { \
fives[i] = 5; \
} \
int *zeros = (int*)malloc(N * sizeof(int)); \
for(int i = 0; i < N; i++) { \
zeros[i] = 0; \
} \
cudaMemcpy(x, fives, N * sizeof(int), cudaMemcpyHostToDevice); \
cudaMemcpy(y, zeros, N * sizeof(int), cudaMemcpyHostToDevice); \
cudaMemcpy(dy, fives, N * sizeof(int), cudaMemcpyHostToDevice); \
cudaMemcpy(dx, zeros, N * sizeof(int), cudaMemcpyHostToDevice); \
auto test = clad::gradient(F, args); \
if constexpr (use_stream) { \
cudaStream_t cudaStream; \
cudaStreamCreate(&cudaStream); \
test.execute_kernel(grid, block, shared_mem, cudaStream, y, x, dy, dx); \
} \
else { \
test.execute_kernel(grid, block, y, x, dy, dx); \
} \
cudaDeviceSynchronize(); \
int *res = (int*)malloc(N * sizeof(int)); \
cudaMemcpy(res, dx, N * sizeof(int), cudaMemcpyDeviceToHost); \
for (int i = 0; i < (N - 1); i++) { \
printf("%d, ", res[i]); \
} \
printf("%d\n", res[N-1]); \
free(fives); \
free(zeros); \
free(res); \
}
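
Both driver macros stage the device buffers, request the gradient, launch it — optionally on a freshly created stream via the execute_kernel overload that also takes the shared-memory size and stream — and print the adjoint accumulated in dx. Conceptually, the first TEST_2 invocation in main below boils down to (illustrative expansion, setup and printing elided):

auto grad = clad::gradient(add_kernel, "in, out"); // differentiate w.r.t. both params
grad.execute_kernel(dim3(1), dim3(5, 1, 1),        // grid, then block
                    dummy_out, dummy_in, d_out, d_in);
cudaDeviceSynchronize();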


int main(void) {
int *a, *d_a;
cudaMalloc(&a, sizeof(int));
cudaMalloc(&d_a, sizeof(int));

TEST(kernel, dim3(1), dim3(1), 0, false, a, d_a, 1); // CHECK-EXEC: 10
TEST(kernel, dim3(1), dim3(1), 0, true, a, d_a, 1); // CHECK-EXEC: 10

auto error = clad::gradient(fake_kernel);
error.execute_kernel(dim3(1), dim3(1), a, d_a); // CHECK-EXEC: Use execute() for non-global CUDA kernels

auto test = clad::gradient(kernel);
test.execute(a, d_a); // CHECK-EXEC: Use execute_kernel() for global CUDA kernels

cudaFree(a);
cudaFree(d_a);

int *dummy_in, *dummy_out, *d_out, *d_in;
cudaMalloc(&dummy_in, 5 * sizeof(int));
cudaMalloc(&dummy_out, 5 * sizeof(int));
cudaMalloc(&d_out, 5 * sizeof(int));
cudaMalloc(&d_in, 5 * sizeof(int));

TEST_2(add_kernel, dim3(1), dim3(5, 1, 1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5
TEST_2(add_kernel_2, dim3(1), dim3(5, 1, 1), 0, true, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5
TEST_2(add_kernel_3, dim3(5, 1, 1), dim3(1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5
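
The expected 5, 5, 5, 5, 5 follows directly from the macro's seeding: d_out (the output adjoint, dy) is filled with fives, d_in (dx) with zeros, and each generated gradient performs _d_in[index] += _d_out[index], so every element of d_in ends up as 5 regardless of which builtin combination computes index.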

cudaFree(dummy_in);
cudaFree(dummy_out);
cudaFree(d_out);
cudaFree(d_in);

return 0;
}
