From fcc86693c4362a1a2604e6a820bc66e1b0350bb3 Mon Sep 17 00:00:00 2001 From: Brodey Newman Date: Wed, 18 Dec 2024 06:17:51 +0000 Subject: [PATCH] chore: unified pointer ex --- deploy/Dockerfile.unified | 3 +- deploy/start.sh | 2 +- test/unified_2.cu | 171 -------------------------------------- test/unified_pointer.cu | 71 ++++++++++++++++ 4 files changed, 74 insertions(+), 173 deletions(-) delete mode 100644 test/unified_2.cu create mode 100644 test/unified_pointer.cu diff --git a/deploy/Dockerfile.unified b/deploy/Dockerfile.unified index d16804b..16b64b1 100644 --- a/deploy/Dockerfile.unified +++ b/deploy/Dockerfile.unified @@ -23,11 +23,12 @@ RUN apt-get install -y cuda-toolkit-12-2 ENV PATH=/usr/local/cuda-12.2/bin:${PATH} ENV LD_LIBRARY_PATH=/usr/local/cuda-12.2/lib64 -ENV SCUDA_SERVER=100.118.7.128 +ENV SCUDA_SERVER=71.183.65.76 ENV libscuda_path=/usr/local/lib/libscuda.so COPY ./libscuda.so /usr/local/lib/libscuda.so COPY unified.o unified.o +COPY unified_pointer.o unified_pointer.o COPY start.sh /start.sh RUN chmod +x /start.sh diff --git a/deploy/start.sh b/deploy/start.sh index 091b4cf..d24f11e 100644 --- a/deploy/start.sh +++ b/deploy/start.sh @@ -13,7 +13,7 @@ elif [[ "$1" == "cublas" ]]; then elif [[ "$1" == "unified" ]]; then echo "Running cublas example..." - LD_PRELOAD="$libscuda_path" /unified.o + LD_PRELOAD="$libscuda_path" /unified_pointer.o else echo "Unknown option: $1. Please specify one of: torch | cublas | unified ." fi \ No newline at end of file diff --git a/test/unified_2.cu b/test/unified_2.cu deleted file mode 100644 index 9642075..0000000 --- a/test/unified_2.cu +++ /dev/null @@ -1,171 +0,0 @@ -// #include -// #include - -// struct Operation { -// float *x; -// float *y; -// int n; -// }; - -// // CUDA kernel to add elements of two arrays -// __global__ void add(Operation *op) { -// int index = blockIdx.x * blockDim.x + threadIdx.x; -// int stride = blockDim.x * gridDim.x; - -// printf("The X is: %x\n", op->x[0]); -// printf("The Y is: %x\n", op->y[0]); -// for (int i = index; i < op->n; i += stride) -// { -// op->y[i] = op->x[i] + op->y[i]; -// printf("The value is: %f\n", op->y[i]); -// } -// } - -// int main(void) { -// Operation host_op; // Host structure -// Operation *device_op; // Device structure - -// // Initialize array size -// host_op.n = 100; - -// // Allocate memory for device operation struct -// cudaMalloc(&device_op, sizeof(Operation)); - -// // Allocate memory for x and y arrays on the device -// cudaMalloc(&host_op.x, host_op.n * sizeof(float)); -// cudaMalloc(&host_op.y, host_op.n * sizeof(float)); - -// // Initialize x and y arrays on the host -// float *host_x = new float[host_op.n]; -// float *host_y = new float[host_op.n]; -// for (int i = 0; i < host_op.n; i++) { -// host_x[i] = 1.0f; -// host_y[i] = 2.0f; -// } - -// // Copy x and y arrays from host to device -// cudaMemcpy(host_op.x, host_x, host_op.n * sizeof(float), cudaMemcpyHostToDevice); -// cudaMemcpy(host_op.y, host_y, host_op.n * sizeof(float), cudaMemcpyHostToDevice); - -// std::cout << "BEFORE COPY DEVICE :" << &host_op.x << std::endl; -// std::cout << "BEFORE COPY DEVICE :" << &host_op.y << std::endl; - -// // Copy host operation struct to device -// cudaMemcpy(device_op, &host_op, sizeof(Operation), cudaMemcpyHostToDevice); - -// std::cout << "AFTER POINTER DEVICE :" << &device_op << std::endl; -// std::cout << "AFTER POINTER HOST :" << &host_op << std::endl; -// std::cout << "AFTER COPY DEVICE :" << &device_op->x << std::endl; -// std::cout << "AFTER COPY DEVICE :" << &device_op->y << std::endl; - -// // Launch kernel -// int blockSize = 256; -// int numBlocks = (host_op.n + blockSize - 1) / blockSize; -// add<<>>(device_op); - -// // Wait for GPU to finish before accessing results -// cudaDeviceSynchronize(); - -// // Copy results from device to host -// cudaMemcpy(host_y, host_op.y, host_op.n * sizeof(float), cudaMemcpyDeviceToHost); - -// // Log results for debugging -// std::cout << "Results (y = x + y):" << std::endl; -// for (int i = 0; i < host_op.n; i++) { -// std::cout << "y[" << i << "] = " << host_y[i] << " (expected: 3.0)" << std::endl; -// } - -// // Check for errors (all values should be 3.0f) -// float maxError = 0.0f; -// for (int i = 0; i < host_op.n; i++) { -// maxError = fmax(maxError, fabs(host_y[i] - 3.0f)); -// } - -// // Free device memory -// cudaFree(host_op.x); -// cudaFree(host_op.y); -// cudaFree(device_op); - -// // Free host memory -// delete[] host_x; -// delete[] host_y; - -// return 0; -// } - - - -// // ******UNIFIED MEMORY EXAMPLE BELOW******* - - -#include -#include - -struct Operation { - float *x; - float *y; - int n; -}; - -// CUDA kernel to add elements of two arrays -__global__ void add(Operation *op) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - - printf("The X is: %x\n", op->x[0]); - printf("The Y is: %x\n", op->y[0]); - for (int i = index; i < op->n; i += stride) - { - op->y[i] = op->x[i] + op->y[i]; - printf("The value is: %f\n", op->y[i]); - } -} - -int main(void) { - Operation *op; - - // Allocate Unified Memory -- accessible from CPU or GPU - cudaMallocManaged(&op, sizeof(Operation)); - op->n = 100; - - cudaMallocManaged(&op->x, op->n * sizeof(float)); - cudaMallocManaged(&op->y, op->n * sizeof(float)); - - // initialize x and y arrays on the host - for (int i = 0; i < op->n; i++) { - op->x[i] = 1.0f; - op->y[i] = 2.0f; - } - - // Launch kernel on n elements on the GPU - int blockSize = 256; - int numBlocks = (op->n + blockSize - 1) / blockSize; - - std::cout << "numBlocks: " << numBlocks << std::endl; - std::cout << "N: " << op->n << std::endl; - - add<<>>(op); - - // Wait for GPU to finish before accessing on host - cudaDeviceSynchronize(); - - // Log results for debugging - std::cout << "Results (y = x + y):" << std::endl; - for (int i = 0; i < op->n; i++) { - std::cout << "y[" << i << "] = " << op->y[i] << " (expected: 3.0)" << std::endl; - } - - // Check for errors (all values should be 3.0f) - float maxError = 0.0f; - for (int i = 0; i < op->n; i++) { - maxError = fmax(maxError, fabs(op->y[i] - 3.0f)); - } - std::cout << "Max error: " << maxError << std::endl; - - // Free memory - cudaFree(op->x); - cudaFree(op->y); - cudaFree(op); - - return 0; -} diff --git a/test/unified_pointer.cu b/test/unified_pointer.cu new file mode 100644 index 0000000..cf672aa --- /dev/null +++ b/test/unified_pointer.cu @@ -0,0 +1,71 @@ +#include +#include + +struct Operation { + float *x; + float *y; + int n; +}; + +// CUDA kernel to add elements of two arrays +__global__ void add(Operation *op) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + + printf("The X is: %x\n", op->x[0]); + printf("The Y is: %x\n", op->y[0]); + for (int i = index; i < op->n; i += stride) + { + op->y[i] = op->x[i] + op->y[i]; + printf("The value is: %f\n", op->y[i]); + } +} + +int main(void) { + Operation *op; + + // Allocate Unified Memory -- accessible from CPU or GPU + cudaMallocManaged(&op, sizeof(Operation)); + op->n = 100; + + cudaMallocManaged(&op->x, op->n * sizeof(float)); + cudaMallocManaged(&op->y, op->n * sizeof(float)); + + // initialize x and y arrays on the host + for (int i = 0; i < op->n; i++) { + op->x[i] = 1.0f; + op->y[i] = 2.0f; + } + + // Launch kernel on n elements on the GPU + int blockSize = 256; + int numBlocks = (op->n + blockSize - 1) / blockSize; + + std::cout << "numBlocks: " << numBlocks << std::endl; + std::cout << "N: " << op->n << std::endl; + + add<<>>(op); + + // Wait for GPU to finish before accessing on host + cudaDeviceSynchronize(); + + // Log results for debugging + std::cout << "Results (y = x + y):" << std::endl; + for (int i = 0; i < op->n; i++) { + std::cout << "y[" << i << "] = " << op->y[i] << " (expected: 3.0)" << std::endl; + } + + // Check for errors (all values should be 3.0f) + float maxError = 0.0f; + for (int i = 0; i < op->n; i++) { + maxError = fmax(maxError, fabs(op->y[i] - 3.0f)); + } + std::cout << "Max error: " << maxError << std::endl; + + // Free memory + cudaFree(op->x); + cudaFree(op->y); + cudaFree(op); + + return 0; +}