From fcc86693c4362a1a2604e6a820bc66e1b0350bb3 Mon Sep 17 00:00:00 2001
From: Brodey Newman <brodeynewman@gmail.com>
Date: Wed, 18 Dec 2024 06:17:51 +0000
Subject: [PATCH] chore: unified pointer ex

---
 deploy/Dockerfile.unified |   3 +-
 deploy/start.sh           |   2 +-
 test/unified_2.cu         | 171 --------------------------------------
 test/unified_pointer.cu   |  71 ++++++++++++++++
 4 files changed, 74 insertions(+), 173 deletions(-)
 delete mode 100644 test/unified_2.cu
 create mode 100644 test/unified_pointer.cu
diff --git a/deploy/Dockerfile.unified b/deploy/Dockerfile.unified
index d16804b..16b64b1 100644
--- a/deploy/Dockerfile.unified
+++ b/deploy/Dockerfile.unified
@@ -23,11 +23,12 @@ RUN apt-get install -y cuda-toolkit-12-2
 ENV PATH=/usr/local/cuda-12.2/bin:${PATH}
 ENV LD_LIBRARY_PATH=/usr/local/cuda-12.2/lib64
 
-ENV SCUDA_SERVER=100.118.7.128
+ENV SCUDA_SERVER=71.183.65.76
 ENV libscuda_path=/usr/local/lib/libscuda.so
 
 COPY ./libscuda.so /usr/local/lib/libscuda.so
 COPY unified.o unified.o
+COPY unified_pointer.o unified_pointer.o
 
 COPY start.sh /start.sh
 RUN chmod +x /start.sh
diff --git a/deploy/start.sh b/deploy/start.sh
index 091b4cf..d24f11e 100644
--- a/deploy/start.sh
+++ b/deploy/start.sh
@@ -13,7 +13,7 @@ elif [[ "$1" == "cublas" ]]; then
 elif [[ "$1" == "unified" ]]; then
     echo "Running cublas example..."
 
-    LD_PRELOAD="$libscuda_path" /unified.o
+    LD_PRELOAD="$libscuda_path" /unified_pointer.o
 else
     echo "Unknown option: $1. Please specify one of: torch | cublas | unified ."
 fi
\ No newline at end of file
diff --git a/test/unified_2.cu b/test/unified_2.cu
deleted file mode 100644
index 9642075..0000000
--- a/test/unified_2.cu
+++ /dev/null
@@ -1,171 +0,0 @@
-// #include <iostream>
-// #include <math.h>
-
-// struct Operation {
-//     float *x;
-//     float *y;
-//     int n;
-// };
-
-// // CUDA kernel to add elements of two arrays
-// __global__ void add(Operation *op) {
-//     int index = blockIdx.x * blockDim.x + threadIdx.x;
-//     int stride = blockDim.x * gridDim.x;
-
-//     printf("The X is: %x\n", op->x[0]);
-//     printf("The Y is: %x\n", op->y[0]);
-//     for (int i = index; i < op->n; i += stride)
-//     {
-//       op->y[i] = op->x[i] + op->y[i];
-//       printf("The value is: %f\n", op->y[i]);
-//     }
-// }
-
-// int main(void) {
-//     Operation host_op; // Host structure
-//     Operation *device_op; // Device structure
-
-//     // Initialize array size
-//     host_op.n = 100;
-
-//     // Allocate memory for device operation struct
-//     cudaMalloc(&device_op, sizeof(Operation));
-
-//     // Allocate memory for x and y arrays on the device
-//     cudaMalloc(&host_op.x, host_op.n * sizeof(float));
-//     cudaMalloc(&host_op.y, host_op.n * sizeof(float));
-
-//     // Initialize x and y arrays on the host
-//     float *host_x = new float[host_op.n];
-//     float *host_y = new float[host_op.n];
-//     for (int i = 0; i < host_op.n; i++) {
-//         host_x[i] = 1.0f;
-//         host_y[i] = 2.0f;
-//     }
-
-//     // Copy x and y arrays from host to device
-//     cudaMemcpy(host_op.x, host_x, host_op.n * sizeof(float), cudaMemcpyHostToDevice);
-//     cudaMemcpy(host_op.y, host_y, host_op.n * sizeof(float), cudaMemcpyHostToDevice);
-
-//     std::cout << "BEFORE COPY DEVICE :" << &host_op.x << std::endl;
-//     std::cout << "BEFORE COPY DEVICE :" << &host_op.y << std::endl;
-
-//     // Copy host operation struct to device
-//     cudaMemcpy(device_op, &host_op, sizeof(Operation), cudaMemcpyHostToDevice);
-
-//     std::cout << "AFTER POINTER DEVICE :" << &device_op << std::endl;
-//     std::cout << "AFTER POINTER HOST :" << &host_op << std::endl;
-//     std::cout << "AFTER COPY DEVICE :" << &device_op->x << std::endl;
-//     std::cout << "AFTER COPY DEVICE :" << &device_op->y << std::endl;
-
-//     // Launch kernel
-//     int blockSize = 256;
-//     int numBlocks = (host_op.n + blockSize - 1) / blockSize;
-//     add<<<numBlocks, blockSize>>>(device_op);
-
-//     // Wait for GPU to finish before accessing results
-//     cudaDeviceSynchronize();
-
-//     // Copy results from device to host
-//     cudaMemcpy(host_y, host_op.y, host_op.n * sizeof(float), cudaMemcpyDeviceToHost);
-
-//     // Log results for debugging
-//     std::cout << "Results (y = x + y):" << std::endl;
-//     for (int i = 0; i < host_op.n; i++) {
-//         std::cout << "y[" << i << "] = " << host_y[i] << " (expected: 3.0)" << std::endl;
-//     }
-
-//     // Check for errors (all values should be 3.0f)
-//     float maxError = 0.0f;
-//     for (int i = 0; i < host_op.n; i++) {
-//         maxError = fmax(maxError, fabs(host_y[i] - 3.0f));
-//     }
-
-//     // Free device memory
-//     cudaFree(host_op.x);
-//     cudaFree(host_op.y);
-//     cudaFree(device_op);
-
-//     // Free host memory
-//     delete[] host_x;
-//     delete[] host_y;
-
-//     return 0;
-// }
-
-
-
-// // ******UNIFIED MEMORY EXAMPLE BELOW*******
-
-
-#include <iostream>
-#include <math.h>
-
-struct Operation {
-    float *x;
-    float *y;
-    int n;
-};
-
-// CUDA kernel to add elements of two arrays
-__global__ void add(Operation *op) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    int stride = blockDim.x * gridDim.x;
-
-    printf("The X is: %x\n", op->x[0]);
-    printf("The Y is: %x\n", op->y[0]);
-    for (int i = index; i < op->n; i += stride)
-    {
-      op->y[i] = op->x[i] + op->y[i];
-      printf("The value is: %f\n", op->y[i]);
-    }
-}
-
-int main(void) {
-    Operation *op;
-
-    // Allocate Unified Memory -- accessible from CPU or GPU
-    cudaMallocManaged(&op, sizeof(Operation));
-    op->n = 100;
-
-    cudaMallocManaged(&op->x, op->n * sizeof(float));
-    cudaMallocManaged(&op->y, op->n * sizeof(float));
-
-    // initialize x and y arrays on the host
-    for (int i = 0; i < op->n; i++) {
-        op->x[i] = 1.0f;
-        op->y[i] = 2.0f;
-    }
-
-    // Launch kernel on n elements on the GPU
-    int blockSize = 256;
-    int numBlocks = (op->n + blockSize - 1) / blockSize;
-
-    std::cout << "numBlocks: " << numBlocks << std::endl;
-    std::cout << "N: " << op->n << std::endl;
-
-    add<<<numBlocks, blockSize>>>(op);
-
-    // Wait for GPU to finish before accessing on host
-    cudaDeviceSynchronize();
-
-    // Log results for debugging
-    std::cout << "Results (y = x + y):" << std::endl;
-    for (int i = 0; i < op->n; i++) {
-        std::cout << "y[" << i << "] = " << op->y[i] << " (expected: 3.0)" << std::endl;
-    }
-
-    // Check for errors (all values should be 3.0f)
-    float maxError = 0.0f;
-    for (int i = 0; i < op->n; i++) {
-        maxError = fmax(maxError, fabs(op->y[i] - 3.0f));
-    }
-    std::cout << "Max error: " << maxError << std::endl;
-
-    // Free memory
-    cudaFree(op->x);
-    cudaFree(op->y);
-    cudaFree(op);
-
-    return 0;
-}
diff --git a/test/unified_pointer.cu b/test/unified_pointer.cu
new file mode 100644
index 0000000..cf672aa
--- /dev/null
+++ b/test/unified_pointer.cu
@@ -0,0 +1,71 @@
+#include <iostream>
+#include <math.h>
+
+struct Operation {  // operands handed to add(): the kernel computes y[i] = x[i] + y[i]
+    float *x;       // first addend array; only read by the kernel
+    float *y;       // second addend array; overwritten in place with the sums
+    int n;          // element count: loop bound in the kernel, allocation length in main
+};
+
+// Grid-stride add: y[i] = x[i] + y[i] for i in [0, op->n). op, op->x and op->y
+// are dereferenced on the device; every thread also emits debug printf output.
+__global__ void add(Operation *op) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    int stride = blockDim.x * gridDim.x;
+    // %f rather than %x: a float vararg is promoted to double, which %x misreads
+    printf("The X is: %f\n", op->x[0]);
+    printf("The Y is: %f\n", op->y[0]);
+    for (int i = index; i < op->n; i += stride) {
+      op->y[i] = op->x[i] + op->y[i];
+      printf("The value is: %f\n", op->y[i]);
+    }
+}
+
+// Print a diagnostic if a CUDA runtime call failed; true means success.
+static bool cudaOk(cudaError_t status, const char *what) {
+    if (status != cudaSuccess)
+        std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
+    return status == cudaSuccess;
+}
+
+// Runs add() on managed memory; returns 0 only if no checked CUDA call failed and all y[i] == 3.0f.
+int main(void) {
+    // Allocate Unified Memory -- accessible from CPU or GPU; op is only used if this worked
+    Operation *op = nullptr;
+    if (!cudaOk(cudaMallocManaged(&op, sizeof(Operation)), "alloc op")) return 1;
+    op->n = 100;
+    op->x = op->y = nullptr;  // keeps the cudaFree calls below valid on early failure
+    float maxError = -1.0f;   // stays negative if verification never runs
+    if (cudaOk(cudaMallocManaged(&op->x, op->n * sizeof(float)), "alloc x") &&
+        cudaOk(cudaMallocManaged(&op->y, op->n * sizeof(float)), "alloc y")) {
+        // initialize x and y arrays on the host
+        for (int i = 0; i < op->n; i++) {
+            op->x[i] = 1.0f;
+            op->y[i] = 2.0f;
+        }
+        // Launch kernel on n elements on the GPU
+        int blockSize = 256;
+        int numBlocks = (op->n + blockSize - 1) / blockSize;
+        std::cout << "numBlocks: " << numBlocks << std::endl;
+        std::cout << "N: " << op->n << std::endl;
+        add<<<numBlocks, blockSize>>>(op);
+
+        // Launch errors surface via cudaGetLastError, execution errors at the sync
+        if (cudaOk(cudaGetLastError(), "add launch") &&
+            cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize")) {
+            // Log results and track the worst deviation (all values should be 3.0f)
+            std::cout << "Results (y = x + y):" << std::endl;
+            maxError = 0.0f;
+            for (int i = 0; i < op->n; i++) {
+                std::cout << "y[" << i << "] = " << op->y[i] << " (expected: 3.0)" << std::endl;
+                maxError = fmax(maxError, fabs(op->y[i] - 3.0f));
+            }
+            std::cout << "Max error: " << maxError << std::endl;
+        }
+    }
+    // Free memory
+    cudaFree(op->x);
+    cudaFree(op->y);
+    cudaFree(op);
+    return maxError == 0.0f ? 0 : 1;
+}