Add support of CUDA builtins (#1092)
Added support for CUDA grid-configuration builtin variables. Builtins tested: threadIdx, blockIdx, blockDim, gridDim, warpSize
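In practice this lets clad differentiate kernels whose indexing uses the grid builtins. A minimal usage sketch, mirroring the tests added below (device-buffer allocation and error checking omitted):

__global__ void add_kernel(int *out, int *in) {
  out[threadIdx.x] += in[threadIdx.x];
}

// Differentiate with respect to both pointer parameters, then launch the
// generated gradient kernel on 5 threads; out, in, d_out and d_in are
// device buffers allocated by the caller.
auto grad = clad::gradient(add_kernel, "in, out");
grad.execute_kernel(dim3(1), dim3(5, 1, 1), out, in, d_out, d_in);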
kchristin22 authored Sep 24, 2024
1 parent e2b8e35 commit 2e5560e
Showing 3 changed files with 305 additions and 30 deletions.
1 change: 1 addition & 0 deletions include/clad/Differentiator/ReverseModeVisitor.h
@@ -396,6 +396,7 @@ namespace clad {
StmtDiff
VisitImplicitValueInitExpr(const clang::ImplicitValueInitExpr* IVIE);
StmtDiff VisitCStyleCastExpr(const clang::CStyleCastExpr* CSCE);
StmtDiff VisitPseudoObjectExpr(const clang::PseudoObjectExpr* POE);
StmtDiff VisitInitListExpr(const clang::InitListExpr* ILE);
StmtDiff VisitIntegerLiteral(const clang::IntegerLiteral* IL);
StmtDiff VisitMemberExpr(const clang::MemberExpr* ME);
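On the declaration side this one method is all that is required: ReverseModeVisitor is a clang::ConstStmtVisitor, so Clang's double dispatch routes every PseudoObjectExpr to the new VisitPseudoObjectExpr. A minimal standalone sketch of that dispatch pattern, using a hypothetical ToyVisitor rather than clad's actual class:

#include "clang/AST/StmtVisitor.h"

// ConstStmtVisitor dispatches on the dynamic class of the node: when
// Visit(E) is handed a PseudoObjectExpr, it calls VisitPseudoObjectExpr(E).
class ToyVisitor : public clang::ConstStmtVisitor<ToyVisitor, void> {
public:
  void VisitPseudoObjectExpr(const clang::PseudoObjectExpr *POE) {
    // Reached for expressions such as threadIdx.x in CUDA sources.
  }
};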
6 changes: 6 additions & 0 deletions lib/Differentiator/ReverseModeVisitor.cpp
@@ -3272,6 +3272,12 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
return {castExpr, castExprDiff};
}

StmtDiff
ReverseModeVisitor::VisitPseudoObjectExpr(const PseudoObjectExpr* POE) {
// CUDA builtin variables (e.g. threadIdx.x) reach here as
// PseudoObjectExprs. They are constant with respect to the differentiated
// parameters, so clone the expression for both the primal and the adjoint
// slot of the StmtDiff.
return {Clone(POE), Clone(POE)};
}

StmtDiff ReverseModeVisitor::VisitMemberExpr(const MemberExpr* ME) {
auto baseDiff = VisitWithExplicitNoDfDx(ME->getBase());
auto* field = ME->getMemberDecl();
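Why one PseudoObjectExpr visitor covers the dotted builtins: Clang declares threadIdx, blockIdx, blockDim and gridDim as objects whose x/y/z members are MS-style properties backed by getter functions, and every property access is wrapped in a PseudoObjectExpr. A sketch, abridged from Clang's __clang_cuda_builtin_vars.h (the declarations below come from that header, not from this commit):

struct __cuda_builtin_threadIdx_t {
  // Getter that reads the tid.x special register through an NVVM intrinsic.
  static __attribute__((device)) unsigned int __fetch_builtin_x(void);
  // `threadIdx.x` parses as an MS property access, which Clang represents
  // as a PseudoObjectExpr wrapping a call to the getter above.
  __declspec(property(get = __fetch_builtin_x)) unsigned int x;
  // y and z are declared analogously; blockIdx, blockDim and gridDim
  // follow the same pattern. warpSize, by contrast, is a plain variable.
};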
328 changes: 298 additions & 30 deletions test/CUDA/GradientKernels.cu
@@ -33,46 +33,314 @@ void fake_kernel(int *a) {
*a *= *a;
}

__global__ void add_kernel(int *out, int *in) {
int index = threadIdx.x;
out[index] += in[index];
}

// CHECK: void add_kernel_grad(int *out, int *in, int *_d_out, int *_d_in) {
//CHECK-NEXT: int _d_index = 0;
//CHECK-NEXT: int index0 = threadIdx.x;
//CHECK-NEXT: int _t0 = out[index0];
//CHECK-NEXT: out[index0] += in[index0];
//CHECK-NEXT: {
//CHECK-NEXT: out[index0] = _t0;
//CHECK-NEXT: int _r_d0 = _d_out[index0];
//CHECK-NEXT: _d_in[index0] += _r_d0;
//CHECK-NEXT: }
//CHECK-NEXT: }

__global__ void add_kernel_2(int *out, int *in) {
out[threadIdx.x] += in[threadIdx.x];
}

// CHECK: void add_kernel_2_grad(int *out, int *in, int *_d_out, int *_d_in) {
//CHECK-NEXT: int _t0 = out[threadIdx.x];
//CHECK-NEXT: out[threadIdx.x] += in[threadIdx.x];
//CHECK-NEXT: {
//CHECK-NEXT: out[threadIdx.x] = _t0;
//CHECK-NEXT: int _r_d0 = _d_out[threadIdx.x];
//CHECK-NEXT: _d_in[threadIdx.x] += _r_d0;
//CHECK-NEXT: }
//CHECK-NEXT: }

__global__ void add_kernel_3(int *out, int *in) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
out[index] += in[index];
}

// CHECK: void add_kernel_3_grad(int *out, int *in, int *_d_out, int *_d_in) {
//CHECK-NEXT: unsigned int _t1 = blockIdx.x;
//CHECK-NEXT: unsigned int _t0 = blockDim.x;
//CHECK-NEXT: int _d_index = 0;
//CHECK-NEXT: int index0 = threadIdx.x + _t1 * _t0;
//CHECK-NEXT: int _t2 = out[index0];
//CHECK-NEXT: out[index0] += in[index0];
//CHECK-NEXT: {
//CHECK-NEXT: out[index0] = _t2;
//CHECK-NEXT: int _r_d0 = _d_out[index0];
//CHECK-NEXT: _d_in[index0] += _r_d0;
//CHECK-NEXT: }
//CHECK-NEXT:}

__global__ void add_kernel_4(int *out, int *in) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
if (index < 5) {
int sum = 0;
// Each thread sums elements in steps of warpSize
for (int i = index; i < 5; i += warpSize) {
sum += in[i];
}
out[index] = sum;
}
}

// CHECK: void add_kernel_4_grad(int *out, int *in, int *_d_out, int *_d_in) {
//CHECK-NEXT: bool _cond0;
//CHECK-NEXT: int _d_sum = 0;
//CHECK-NEXT: int sum = 0;
//CHECK-NEXT: unsigned long _t2;
//CHECK-NEXT: int _d_i = 0;
//CHECK-NEXT: int i = 0;
//CHECK-NEXT: clad::tape<int> _t3 = {};
//CHECK-NEXT: clad::tape<int> _t4 = {};
//CHECK-NEXT: int _t5;
//CHECK-NEXT: unsigned int _t1 = blockIdx.x;
//CHECK-NEXT: unsigned int _t0 = blockDim.x;
//CHECK-NEXT: int _d_index = 0;
//CHECK-NEXT: int index0 = threadIdx.x + _t1 * _t0;
//CHECK-NEXT: {
//CHECK-NEXT: _cond0 = index0 < 5;
//CHECK-NEXT: if (_cond0) {
//CHECK-NEXT: sum = 0;
//CHECK-NEXT: _t2 = 0UL;
//CHECK-NEXT: for (i = index0; ; clad::push(_t3, i) , (i += warpSize)) {
//CHECK-NEXT: {
//CHECK-NEXT: if (!(i < 5))
//CHECK-NEXT: break;
//CHECK-NEXT: }
//CHECK-NEXT: _t2++;
//CHECK-NEXT: clad::push(_t4, sum);
//CHECK-NEXT: sum += in[i];
//CHECK-NEXT: }
//CHECK-NEXT: _t5 = out[index0];
//CHECK-NEXT: out[index0] = sum;
//CHECK-NEXT: }
//CHECK-NEXT: }
//CHECK-NEXT: if (_cond0) {
//CHECK-NEXT: {
//CHECK-NEXT: out[index0] = _t5;
//CHECK-NEXT: int _r_d2 = _d_out[index0];
//CHECK-NEXT: _d_out[index0] = 0;
//CHECK-NEXT: _d_sum += _r_d2;
//CHECK-NEXT: }
//CHECK-NEXT: {
//CHECK-NEXT: for (;; _t2--) {
//CHECK-NEXT: {
//CHECK-NEXT: if (!_t2)
//CHECK-NEXT: break;
//CHECK-NEXT: }
//CHECK-NEXT: {
//CHECK-NEXT: i = clad::pop(_t3);
//CHECK-NEXT: int _r_d0 = _d_i;
//CHECK-NEXT: }
//CHECK-NEXT: {
//CHECK-NEXT: sum = clad::pop(_t4);
//CHECK-NEXT: int _r_d1 = _d_sum;
//CHECK-NEXT: _d_in[i] += _r_d1;
//CHECK-NEXT: }
//CHECK-NEXT: }
//CHECK-NEXT: _d_index += _d_i;
//CHECK-NEXT: }
//CHECK-NEXT: }
//CHECK-NEXT:}

__global__ void add_kernel_5(int *out, int *in) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
if (index < 5) {
int sum = 0;
// Calculate the total number of threads in the grid
int totalThreads = blockDim.x * gridDim.x;
// Each thread sums elements in steps of the total number of threads in the grid
for (int i = index; i < 5; i += totalThreads) {
sum += in[i];
}
out[index] = sum;
}
}

// CHECK: void add_kernel_5_grad(int *out, int *in, int *_d_out, int *_d_in) {
//CHECK-NEXT: bool _cond0;
//CHECK-NEXT: int _d_sum = 0;
//CHECK-NEXT: int sum = 0;
//CHECK-NEXT: unsigned int _t2;
//CHECK-NEXT: unsigned int _t3;
//CHECK-NEXT: int _d_totalThreads = 0;
//CHECK-NEXT: int totalThreads = 0;
//CHECK-NEXT: unsigned long _t4;
//CHECK-NEXT: int _d_i = 0;
//CHECK-NEXT: int i = 0;
//CHECK-NEXT: clad::tape<int> _t5 = {};
//CHECK-NEXT: clad::tape<int> _t6 = {};
//CHECK-NEXT: int _t7;
//CHECK-NEXT: unsigned int _t1 = blockIdx.x;
//CHECK-NEXT: unsigned int _t0 = blockDim.x;
//CHECK-NEXT: int _d_index = 0;
//CHECK-NEXT: int index0 = threadIdx.x + _t1 * _t0;
//CHECK-NEXT: {
//CHECK-NEXT: _cond0 = index0 < 5;
//CHECK-NEXT: if (_cond0) {
//CHECK-NEXT: sum = 0;
//CHECK-NEXT: _t3 = blockDim.x;
//CHECK-NEXT: _t2 = gridDim.x;
//CHECK-NEXT: totalThreads = _t3 * _t2;
//CHECK-NEXT: _t4 = 0UL;
//CHECK-NEXT: for (i = index0; ; clad::push(_t5, i) , (i += totalThreads)) {
//CHECK-NEXT: {
//CHECK-NEXT: if (!(i < 5))
//CHECK-NEXT: break;
//CHECK-NEXT: }
//CHECK-NEXT: _t4++;
//CHECK-NEXT: clad::push(_t6, sum);
//CHECK-NEXT: sum += in[i];
//CHECK-NEXT: }
//CHECK-NEXT: _t7 = out[index0];
//CHECK-NEXT: out[index0] = sum;
//CHECK-NEXT: }
//CHECK-NEXT: }
//CHECK-NEXT: if (_cond0) {
//CHECK-NEXT: {
//CHECK-NEXT: out[index0] = _t7;
//CHECK-NEXT: int _r_d2 = _d_out[index0];
//CHECK-NEXT: _d_out[index0] = 0;
//CHECK-NEXT: _d_sum += _r_d2;
//CHECK-NEXT: }
//CHECK-NEXT: {
//CHECK-NEXT: for (;; _t4--) {
//CHECK-NEXT: {
//CHECK-NEXT: if (!_t4)
//CHECK-NEXT: break;
//CHECK-NEXT: }
//CHECK-NEXT: {
//CHECK-NEXT: i = clad::pop(_t5);
//CHECK-NEXT: int _r_d0 = _d_i;
//CHECK-NEXT: _d_totalThreads += _r_d0;
//CHECK-NEXT: }
//CHECK-NEXT: {
//CHECK-NEXT: sum = clad::pop(_t6);
//CHECK-NEXT: int _r_d1 = _d_sum;
//CHECK-NEXT: _d_in[i] += _r_d1;
//CHECK-NEXT: }
//CHECK-NEXT: }
//CHECK-NEXT: _d_index += _d_i;
//CHECK-NEXT: }
//CHECK-NEXT: }
//CHECK-NEXT:}

#define TEST(F, grid, block, shared_mem, use_stream, x, dx, N) \
{ \
int *fives = (int*)malloc(N * sizeof(int)); \
for(int i = 0; i < N; i++) { \
fives[i] = 5; \
} \
int *ones = (int*)malloc(N * sizeof(int)); \
for(int i = 0; i < N; i++) { \
ones[i] = 1; \
} \
cudaMemcpy(x, fives, N * sizeof(int), cudaMemcpyHostToDevice); \
cudaMemcpy(dx, ones, N * sizeof(int), cudaMemcpyHostToDevice); \
auto test = clad::gradient(F); \
if constexpr (use_stream) { \
cudaStream_t cudaStream; \
cudaStreamCreate(&cudaStream); \
test.execute_kernel(grid, block, shared_mem, cudaStream, x, dx); \
} \
else { \
test.execute_kernel(grid, block, x, dx); \
} \
cudaDeviceSynchronize(); \
int *res = (int*)malloc(N * sizeof(int)); \
cudaMemcpy(res, dx, N * sizeof(int), cudaMemcpyDeviceToHost); \
for (int i = 0; i < (N - 1); i++) { \
printf("%d, ", res[i]); \
} \
printf("%d\n", res[N-1]); \
free(fives); \
free(ones); \
free(res); \
}


#define TEST_2(F, grid, block, shared_mem, use_stream, args, y, x, dy, dx, N) \
{ \
int *fives = (int*)malloc(N * sizeof(int)); \
for(int i = 0; i < N; i++) { \
fives[i] = 5; \
} \
int *zeros = (int*)malloc(N * sizeof(int)); \
for(int i = 0; i < N; i++) { \
zeros[i] = 0; \
} \
cudaMemcpy(x, fives, N * sizeof(int), cudaMemcpyHostToDevice); \
cudaMemcpy(y, zeros, N * sizeof(int), cudaMemcpyHostToDevice); \
cudaMemcpy(dy, fives, N * sizeof(int), cudaMemcpyHostToDevice); \
cudaMemcpy(dx, zeros, N * sizeof(int), cudaMemcpyHostToDevice); \
auto test = clad::gradient(F, args); \
if constexpr (use_stream) { \
cudaStream_t cudaStream; \
cudaStreamCreate(&cudaStream); \
test.execute_kernel(grid, block, shared_mem, cudaStream, y, x, dy, dx); \
} \
else { \
test.execute_kernel(grid, block, y, x, dy, dx); \
} \
cudaDeviceSynchronize(); \
int *res = (int*)malloc(N * sizeof(int)); \
cudaMemcpy(res, dx, N * sizeof(int), cudaMemcpyDeviceToHost); \
for (int i = 0; i < (N - 1); i++) { \
printf("%d, ", res[i]); \
} \
printf("%d\n", res[N-1]); \
free(fives); \
free(zeros); \
free(res); \
}
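Both harness macros follow the same shape: seed the primal and adjoint device buffers via cudaMemcpy, request the gradient with clad::gradient, select between the two execute_kernel overloads (with or without an explicit shared-memory size and stream) through if constexpr, synchronize, and print the accumulated derivative buffer for FileCheck to match.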


int main(void) {
-  int *a = (int*)malloc(sizeof(int));
-  *a = 2;
-  int *d_a;
+  int *a, *d_a;
+  cudaMalloc(&a, sizeof(int));
   cudaMalloc(&d_a, sizeof(int));
-  cudaMemcpy(d_a, a, sizeof(int), cudaMemcpyHostToDevice);

-  int *asquare = (int*)malloc(sizeof(int));
-  *asquare = 1;
-  int *d_square;
-  cudaMalloc(&d_square, sizeof(int));
-  cudaMemcpy(d_square, asquare, sizeof(int), cudaMemcpyHostToDevice);
-
-  auto test = clad::gradient(kernel);
-  dim3 grid(1);
-  dim3 block(1);
-  cudaStream_t cudaStream;
-  cudaStreamCreate(&cudaStream);
-  test.execute_kernel(grid, block, 0, cudaStream, d_a, d_square);
+  TEST(kernel, dim3(1), dim3(1), 0, false, a, d_a, 1); // CHECK-EXEC: 10
+  TEST(kernel, dim3(1), dim3(1), 0, true, a, d_a, 1); // CHECK-EXEC: 10

-  cudaDeviceSynchronize();
+  auto error = clad::gradient(fake_kernel);
+  error.execute_kernel(dim3(1), dim3(1), a, d_a); // CHECK-EXEC: Use execute() for non-global CUDA kernels

-  cudaMemcpy(asquare, d_square, sizeof(int), cudaMemcpyDeviceToHost);
-  cudaMemcpy(a, d_a, sizeof(int), cudaMemcpyDeviceToHost);
-  printf("a = %d, a^2 = %d\n", *a, *asquare); // CHECK-EXEC: a = 2, a^2 = 4
+  auto test = clad::gradient(kernel);
+  test.execute(a, d_a); // CHECK-EXEC: Use execute_kernel() for global CUDA kernels

-  auto error = clad::gradient(fake_kernel);
-  error.execute_kernel(grid, block, d_a, d_square); // CHECK-EXEC: Use execute() for non-global CUDA kernels
+  cudaFree(a);
+  cudaFree(d_a);

-  test.execute(d_a, d_square); // CHECK-EXEC: Use execute_kernel() for global CUDA kernels

-  cudaMemset(d_a, 5, 1); // first byte is set to 5
-  cudaMemset(d_square, 1, 1);
+  int *dummy_in, *dummy_out, *d_out, *d_in;
+  cudaMalloc(&dummy_in, 5 * sizeof(int));
+  cudaMalloc(&dummy_out, 5 * sizeof(int));
+  cudaMalloc(&d_out, 5 * sizeof(int));
+  cudaMalloc(&d_in, 5 * sizeof(int));

-  test.execute_kernel(grid, block, d_a, d_square);
-  cudaDeviceSynchronize();
+  TEST_2(add_kernel, dim3(1), dim3(5, 1, 1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5
+  TEST_2(add_kernel_2, dim3(1), dim3(5, 1, 1), 0, true, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5
+  TEST_2(add_kernel_3, dim3(5, 1, 1), dim3(1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5
+  TEST_2(add_kernel_4, dim3(1), dim3(5, 1, 1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5
+  TEST_2(add_kernel_5, dim3(2, 1, 1), dim3(1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5

-  cudaMemcpy(asquare, d_square, sizeof(int), cudaMemcpyDeviceToHost);
-  cudaMemcpy(a, d_a, sizeof(int), cudaMemcpyDeviceToHost);
-  printf("a = %d, a^2 = %d\n", *a, *asquare); // CHECK-EXEC: a = 5, a^2 = 10
+  cudaFree(dummy_in);
+  cudaFree(dummy_out);
+  cudaFree(d_out);
+  cudaFree(d_in);

   return 0;
}
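A quick sanity check of the expected values: kernel squares its argument (the removed a^2 checks above reflect this), so with the input seeded to 5 and the adjoint seeded to 1 by TEST, reverse mode yields d(a*a)/da = 2 * 5 = 10. The add_kernel variants all reduce to out[i] += in[i], whose pullback is _d_in[i] += _d_out[i]; TEST_2 seeds _d_out with 5 and _d_in with 0, so each printed entry is 5.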
