Skip to content

Commit

Permalink
Add init function in cuda gradient tests
Browse files Browse the repository at this point in the history
  • Loading branch information
kchristin22 committed Oct 11, 2024
1 parent 2090b78 commit e911a65
Showing 1 changed file with 49 additions and 72 deletions.
121 changes: 49 additions & 72 deletions test/CUDA/GradientKernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -687,6 +687,25 @@ double fn_memory(double *out, double *in) {
free(res); \
}

/*
 * INIT(x, y, val, dx, dy, d_val)
 *
 * Resets the buffers used by the gradient tests to a known state by copying
 * host data to them with cudaMemcpy(..., cudaMemcpyHostToDevice):
 *   x     <- ten doubles, all 5
 *   y     <- ten doubles, all 0
 *   val   <- one double, 5
 *   dx    <- ten doubles, all 0
 *   dy    <- ten doubles, all 5
 *   d_val <- one double, 0
 *
 * x, y, dx and dy must therefore have room for at least 10 doubles, and val
 * and d_val for at least one. The cudaMemcpy return codes are not inspected.
 *
 * Implementation notes:
 *   - The body is wrapped in do { ... } while (0) so that `INIT(...);` is a
 *     single statement and is safe inside an unbraced if/else.
 *   - The host staging data lives in small automatic arrays instead of
 *     malloc'ed memory, so there is no allocation that can fail or leak.
 *   - Locals carry an init_..._ prefix/suffix so they cannot capture an
 *     argument that happens to be named `fives`, `zeros` or `i`.
 *   - Every macro argument is parenthesized before use.
 */
#define INIT(x, y, val, dx, dy, d_val)                                         \
  do {                                                                         \
    double init_fives_[10];                                                    \
    double init_zeros_[10];                                                    \
    for (int init_i_ = 0; init_i_ < 10; ++init_i_) {                           \
      init_fives_[init_i_] = 5;                                                \
      init_zeros_[init_i_] = 0;                                                \
    }                                                                          \
    cudaMemcpy((x), init_fives_, 10 * sizeof(double),                          \
               cudaMemcpyHostToDevice);                                        \
    cudaMemcpy((y), init_zeros_, 10 * sizeof(double),                          \
               cudaMemcpyHostToDevice);                                        \
    cudaMemcpy((val), init_fives_, sizeof(double), cudaMemcpyHostToDevice);    \
    cudaMemcpy((dx), init_zeros_, 10 * sizeof(double),                         \
               cudaMemcpyHostToDevice);                                        \
    cudaMemcpy((dy), init_fives_, 10 * sizeof(double),                         \
               cudaMemcpyHostToDevice);                                        \
    cudaMemcpy((d_val), init_zeros_, sizeof(double), cudaMemcpyHostToDevice);  \
  } while (0)

int main(void) {
int *a, *d_a;
Expand Down Expand Up @@ -732,121 +751,79 @@ int main(void) {

TEST_2_D(add_kernel_7, dim3(1), dim3(5, 1, 1), 0, false, "a, b", dummy_out_double, dummy_in_double, d_out_double, d_in_double, 10); // CHECK-EXEC: 50.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00

cudaFree(dummy_in_double);
cudaFree(dummy_out_double);
cudaFree(d_out_double);
cudaFree(d_in_double);

double *fives = (double*)malloc(10 * sizeof(double));
for(int i = 0; i < 10; i++) {
fives[i] = 5;
}
double *zeros = (double*)malloc(10 * sizeof(double));
for(int i = 0; i < 10; i++) {
zeros[i] = 0;
}

double *x, *y, *dx, *dy, *d_val;
cudaMalloc(&x, 10 * sizeof(double));
cudaMalloc(&y, 10 * sizeof(double));
cudaMalloc(&dx, 10 * sizeof(double));
cudaMalloc(&dy, 10 * sizeof(double));
double *val;
cudaMalloc(&val, sizeof(double));
double *d_val;
cudaMalloc(&d_val, sizeof(double));

cudaMemcpy(x, fives, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(y, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(dx, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(dy, fives, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_val, zeros, sizeof(double), cudaMemcpyHostToDevice);

INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val);

auto test_device = clad::gradient(kernel_with_device_call, "out, val");
test_device.execute_kernel(dim3(1), dim3(10, 1, 1), y, x, 5, dy, d_val);
test_device.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_val);
double *res = (double*)malloc(10 * sizeof(double));
cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
printf("%0.2f\n", *res); // CHECK-EXEC: 50.00

cudaMemcpy(x, fives, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(y, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(dx, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(dy, fives, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_val, zeros, sizeof(double), cudaMemcpyHostToDevice);
INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val);

auto test_device_2 = clad::gradient(kernel_with_device_call_2, "out, val");
test_device_2.execute_kernel(dim3(1), dim3(10, 1, 1), y, x, 5, dy, d_val);
test_device_2.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_val);
cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
printf("%0.2f\n", *res); // CHECK-EXEC: 50.00

cudaMemcpy(x, fives, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(y, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(dx, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(dy, fives, 10 * sizeof(double), cudaMemcpyHostToDevice);
INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val);

auto test_device_3 = clad::gradient(kernel_with_device_call_2, "out, in");
test_device_3.execute_kernel(dim3(1), dim3(10, 1, 1), y, x, 5, dy, dx);
test_device_3.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_in_double);
cudaDeviceSynchronize();
cudaMemcpy(res, dx, 10 * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost);
printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 5.00, 5.00, 5.00

cudaMemcpy(x, fives, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(y, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(dx, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(dy, fives, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_val, zeros, sizeof(double), cudaMemcpyHostToDevice);

double *val;
cudaMalloc(&val, sizeof(double));
cudaMemcpy(val, fives, sizeof(double), cudaMemcpyHostToDevice);
INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val);

auto test_device_4 = clad::gradient(kernel_with_device_call_3);
test_device_4.execute_kernel(dim3(1), dim3(10, 1, 1), y, x, val, dy, dx, d_val);
test_device_4.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, val, d_out_double, d_in_double, d_val);
cudaDeviceSynchronize();
cudaMemcpy(res, dx, 10 * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 5.00, 5.00, 5.00
cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost);
printf("%0.2f\n", *res); // CHECK-EXEC: 50.00

cudaMemcpy(x, fives, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(y, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(dx, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(dy, fives, 10 * sizeof(double), cudaMemcpyHostToDevice);
INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val);

auto test_kernel_call = clad::gradient(fn);
test_kernel_call.execute(y, x, dy, dx);
test_kernel_call.execute(dummy_out_double, dummy_in_double, d_out_double, d_in_double);
cudaDeviceSynchronize();
cudaMemcpy(res, dx, sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(res, d_in_double, sizeof(double), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
printf("%0.2f\n", *res); // CHECK-EXEC: 50.00

cudaMemcpy(x, fives, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(y, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(dx, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(dy, fives, 10 * sizeof(double), cudaMemcpyHostToDevice);
INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val);

auto nested_device = clad::gradient(kernel_with_nested_device_call, "out, in");
nested_device.execute_kernel(dim3(1), dim3(10, 1, 1), y, x, 5, dy, dx);
nested_device.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_in_double);
cudaDeviceSynchronize();
cudaMemcpy(res, dx, 10 * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost);
printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 5.00, 5.00, 5.00

cudaMemcpy(x, fives, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(y, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(dx, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(dy, fives, 10 * sizeof(double), cudaMemcpyHostToDevice);
INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val);

auto test_memory = clad::gradient(fn_memory);
test_memory.execute(y, x, dy, dx);
test_memory.execute(dummy_out_double, dummy_in_double, d_out_double, d_in_double);
cudaDeviceSynchronize();
cudaMemcpy(res, dx, 10 * sizeof(double), cudaMemcpyDeviceToHost);
printf("%s\n", cudaGetErrorString(cudaGetLastError())); // CHECK-EXEC: no error
cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost);
printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 50.00, 0.00, 0.00

free(fives);
free(zeros);

free(res);
cudaFree(dx);
cudaFree(dy);
cudaFree(dummy_in_double);
cudaFree(dummy_out_double);
cudaFree(d_out_double);
cudaFree(d_in_double);
cudaFree(val);
cudaFree(d_val);

return 0;
Expand Down

0 comments on commit e911a65

Please sign in to comment.