Skip to content

Commit

Permalink
Fix test and create test functions
Browse files Browse the repository at this point in the history
  • Loading branch information
kchristin22 committed Sep 13, 2024
1 parent 3288ffe commit 2824439
Showing 1 changed file with 88 additions and 67 deletions.
155 changes: 88 additions & 67 deletions test/CUDA/GradientKernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -84,87 +84,108 @@ __global__ void add_kernel_3(int *out, int *in) {
//CHECK-NEXT: }
//CHECK-NEXT:}

int main(void) {
int *a = (int*)malloc(sizeof(int));
*a = 2;
int *d_a;
cudaMalloc(&d_a, sizeof(int));
cudaMemcpy(d_a, a, sizeof(int), cudaMemcpyHostToDevice);

int *asquare = (int*)malloc(sizeof(int));
*asquare = 1;
int *d_square;
cudaMalloc(&d_square, sizeof(int));
cudaMemcpy(d_square, asquare, sizeof(int), cudaMemcpyHostToDevice);
// Runs the reverse-mode gradient of a single-pointer-pair kernel and prints
// the derivative buffer.
//   F          - the __global__ kernel to differentiate via clad::gradient
//   grid/block - launch configuration forwarded to execute_kernel
//   shared_mem - dynamic shared-memory bytes (forwarded only on the stream path)
//   use_stream - compile-time flag: launch on a freshly created non-default stream
//   x, dx      - device buffers of N ints; x is seeded with 5s, dx with 1s
//   N          - element count of x and dx
// Prints the N values read back from dx as "v, v, ..., v\n".
#define TEST(F, grid, block, shared_mem, use_stream, x, dx, N) \
{ \
  int *fives = (int*)malloc(N * sizeof(int)); \
  for(int i = 0; i < N; i++) { \
    fives[i] = 5; \
  } \
  int *ones = (int*)malloc(N * sizeof(int)); \
  for(int i = 0; i < N; i++) { \
    ones[i] = 1; \
  } \
  cudaMemcpy(x, fives, N * sizeof(int), cudaMemcpyHostToDevice); \
  cudaMemcpy(dx, ones, N * sizeof(int), cudaMemcpyHostToDevice); \
  auto test = clad::gradient(F); \
  if constexpr (use_stream) { \
    cudaStream_t cudaStream; \
    cudaStreamCreate(&cudaStream); \
    test.execute_kernel(grid, block, shared_mem, cudaStream, x, dx); \
    /* fix: release the stream; destruction is deferred by the runtime */ \
    /* until the stream's pending work completes, so this is safe here  */ \
    cudaStreamDestroy(cudaStream); \
  } \
  else { \
    test.execute_kernel(grid, block, x, dx); \
  } \
  cudaDeviceSynchronize(); \
  int *res = (int*)malloc(N * sizeof(int)); \
  cudaMemcpy(res, dx, N * sizeof(int), cudaMemcpyDeviceToHost); \
  for (int i = 0; i < (N - 1); i++) { \
    printf("%d, ", res[i]); \
  } \
  printf("%d\n", res[N-1]); \
  free(fives); \
  free(ones); \
  free(res); \
}

auto test = clad::gradient(kernel);
dim3 grid(1);
dim3 block(1);
cudaStream_t cudaStream;
cudaStreamCreate(&cudaStream);
test.execute_kernel(grid, block, 0, cudaStream, d_a, d_square);

cudaDeviceSynchronize();
// Runs the reverse-mode gradient of a two-pointer-pair kernel (out/in style)
// and prints the derivative buffer of the input.
//   F          - the __global__ kernel to differentiate
//   grid/block - launch configuration forwarded to execute_kernel
//   shared_mem - dynamic shared-memory bytes (forwarded only on the stream path)
//   use_stream - compile-time flag: launch on a freshly created non-default stream
//   args       - the clad::gradient argument string (e.g. "in, out")
//   y, x       - device output/input buffers of N ints; x seeded with 5s, y with 0s
//   dy, dx     - device adjoint buffers; dy seeded with 5s, dx with 0s
//   N          - element count of each buffer
// Prints the N values read back from dx as "v, v, ..., v\n".
#define TEST_2(F, grid, block, shared_mem, use_stream, args, y, x, dy, dx, N) \
{ \
  int *fives = (int*)malloc(N * sizeof(int)); \
  for(int i = 0; i < N; i++) { \
    fives[i] = 5; \
  } \
  int *zeros = (int*)malloc(N * sizeof(int)); \
  for(int i = 0; i < N; i++) { \
    zeros[i] = 0; \
  } \
  cudaMemcpy(x, fives, N * sizeof(int), cudaMemcpyHostToDevice); \
  cudaMemcpy(y, zeros, N * sizeof(int), cudaMemcpyHostToDevice); \
  cudaMemcpy(dy, fives, N * sizeof(int), cudaMemcpyHostToDevice); \
  cudaMemcpy(dx, zeros, N * sizeof(int), cudaMemcpyHostToDevice); \
  auto test = clad::gradient(F, args); \
  if constexpr (use_stream) { \
    cudaStream_t cudaStream; \
    cudaStreamCreate(&cudaStream); \
    test.execute_kernel(grid, block, shared_mem, cudaStream, y, x, dy, dx); \
    /* fix: release the stream; destruction is deferred by the runtime */ \
    /* until the stream's pending work completes, so this is safe here  */ \
    cudaStreamDestroy(cudaStream); \
  } \
  else { \
    test.execute_kernel(grid, block, y, x, dy, dx); \
  } \
  cudaDeviceSynchronize(); \
  int *res = (int*)malloc(N * sizeof(int)); \
  cudaMemcpy(res, dx, N * sizeof(int), cudaMemcpyDeviceToHost); \
  for (int i = 0; i < (N - 1); i++) { \
    printf("%d, ", res[i]); \
  } \
  printf("%d\n", res[N-1]); \
  free(fives); \
  free(zeros); \
  free(res); \
}

cudaMemcpy(asquare, d_square, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(a, d_a, sizeof(int), cudaMemcpyDeviceToHost);
printf("a = %d, d(a^2)/da = %d\n", *a, *asquare); // CHECK-EXEC: a = 2, d(a^2)/da = 4

auto error = clad::gradient(fake_kernel);
error.execute_kernel(grid, block, d_a, d_square); // CHECK-EXEC: Use execute() for non-global CUDA kernels
int main(void) {
int *a, *d_a;
cudaMalloc(&a, sizeof(int));
cudaMalloc(&d_a, sizeof(int));

test.execute(d_a, d_square); // CHECK-EXEC: Use execute_kernel() for global CUDA kernels
TEST(kernel, dim3(1), dim3(1), 0, false, a, d_a, 1); // CHECK-EXEC: 10
TEST(kernel, dim3(1), dim3(1), 0, true, a, d_a, 1); // CHECK-EXEC: 10

cudaMemset(d_a, 5, 1); // first byte is set to 5
cudaMemset(d_square, 1, 1);
auto error = clad::gradient(fake_kernel);
error.execute_kernel(dim3(1), dim3(1), a, d_a); // CHECK-EXEC: Use execute() for non-global CUDA kernels

test.execute_kernel(grid, block, d_a, d_square);
cudaDeviceSynchronize();
auto test = clad::gradient(kernel);
test.execute(a, d_a); // CHECK-EXEC: Use execute_kernel() for global CUDA kernels

cudaMemcpy(asquare, d_square, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(a, d_a, sizeof(int), cudaMemcpyDeviceToHost);
printf("a = %d, d(a^2)/da = %d\n", *a, *asquare); // CHECK-EXEC: a = 5, d(a^2)/da = 10
cudaFree(a);
cudaFree(d_a);

int *dummy_in, *dummy_out;
cudaMalloc(&dummy_in, sizeof(int));
cudaMalloc(&dummy_out, sizeof(int));

int *out = (int*)malloc(5 * sizeof(int));
for(int i = 0; i < 5; i++) {
out[i] = 5;
}
int *d_out;
int *dummy_in, *dummy_out, *d_out, *d_in;
cudaMalloc(&dummy_in, 5 * sizeof(int));
cudaMalloc(&dummy_out, 5 * sizeof(int));
cudaMalloc(&d_out, 5 * sizeof(int));
cudaMemcpy(d_out, out, 5 * sizeof(int), cudaMemcpyHostToDevice);

int *d_in;
cudaMalloc(&d_in, 5 * sizeof(int));

auto add = clad::gradient(add_kernel, "in, out");
add.execute_kernel(dim3(1), dim3(5, 1, 1), dummy_out, dummy_in, d_out, d_in);
cudaDeviceSynchronize();

int *res = (int*)malloc(5 * sizeof(int));
cudaMemcpy(res, d_in, 5 * sizeof(int), cudaMemcpyDeviceToHost);
printf("%d, %d, %d, %d, %d\n", res[0], res[1], res[2], res[3], res[4]); // CHECK-EXEC: 5, 5, 5, 5, 5

cudaMemset(d_in, 0, 5 * sizeof(int));
auto add_2 = clad::gradient(add_kernel_2, "in, out");
add_2.execute_kernel(dim3(1), dim3(5, 1, 1), dummy_out, dummy_in, d_out, d_in);
cudaDeviceSynchronize();

cudaMemcpy(res, d_in, 5 * sizeof(int), cudaMemcpyDeviceToHost);
printf("%d, %d, %d, %d, %d\n", res[0], res[1], res[2], res[3], res[4]); // CHECK-EXEC: 5, 5, 5, 5, 5


cudaMemset(d_in, 0, 5 * sizeof(int));
auto add_3 = clad::gradient(add_kernel_3, "in, out");
add_3.execute_kernel(dim3(5), dim3(1), dummy_out, dummy_in, d_out, d_in);
cudaDeviceSynchronize();

cudaMemcpy(res, d_in, 5 * sizeof(int), cudaMemcpyDeviceToHost);
printf("%d, %d, %d, %d, %d\n", res[0], res[1], res[2], res[3], res[4]); // CHECK-EXEC: 5, 5, 5, 5, 5
TEST_2(add_kernel, dim3(1), dim3(5, 1, 1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5
TEST_2(add_kernel_2, dim3(1), dim3(5, 1, 1), 0, true, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5
TEST_2(add_kernel_3, dim3(5, 1, 1), dim3(1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5

cudaFree(dummy_in);
cudaFree(dummy_out);
cudaFree(d_out);
cudaFree(d_in);

return 0;
}

0 comments on commit 2824439

Please sign in to comment.