chore: unified pointer ex

kevmo314 · Dec 18, 2024 · fcc8669 · fcc8669
1 parent a6683b4
commit fcc8669
Show file tree

Hide file tree

Showing 4 changed files with 74 additions and 173 deletions.
diff --git a/deploy/Dockerfile.unified b/deploy/Dockerfile.unified
@@ -23,11 +23,12 @@ RUN apt-get install -y cuda-toolkit-12-2
 ENV PATH=/usr/local/cuda-12.2/bin:${PATH}
 ENV LD_LIBRARY_PATH=/usr/local/cuda-12.2/lib64
 
-ENV SCUDA_SERVER=100.118.7.128
+ENV SCUDA_SERVER=71.183.65.76
 ENV libscuda_path=/usr/local/lib/libscuda.so
 
 COPY ./libscuda.so /usr/local/lib/libscuda.so
 COPY unified.o unified.o
+COPY unified_pointer.o unified_pointer.o
 
 COPY start.sh /start.sh
 RUN chmod +x /start.sh

diff --git a/deploy/start.sh b/deploy/start.sh
@@ -13,7 +13,7 @@ elif [[ "$1" == "cublas" ]]; then
 elif [[ "$1" == "unified" ]]; then
     echo "Running cublas example..."
 
-    LD_PRELOAD="$libscuda_path" /unified.o
+    LD_PRELOAD="$libscuda_path" /unified_pointer.o
 else
     echo "Unknown option: $1. Please specify one of: torch | cublas | unified ."
 fi
diff --git a/test/unified_2.cu b/test/unified_2.cu
diff --git a/test/unified_pointer.cu b/test/unified_pointer.cu
@@ -0,0 +1,71 @@
+#include <iostream>
+#include <math.h>
+
+struct Operation { // Bundles the operands of add() so the kernel receives a single pointer.
+    float *x; // First input array; add() only reads it.
+    float *y; // Second input array; add() overwrites y[i] with x[i] + y[i].
+    int n; // Number of elements add() processes in x and y.
+};
+
+// CUDA kernel to add elements of two arrays: y[i] = x[i] + y[i] for every
+// i in [0, op->n).
+//
+// op, op->x and op->y are all dereferenced on the device, so they must point
+// to device-accessible memory. The loop is a grid-stride loop: each thread
+// starts at its global index and advances by the total number of launched
+// threads, so any 1D launch configuration covers all n elements.
+//
+// The printf calls are debug tracing only.
+__global__ void add(Operation *op) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    int stride = blockDim.x * gridDim.x;
+
+    // Print the first elements once, from global thread 0 only. Thread 0 is
+    // also the only thread that writes y[0], so reading it here (before the
+    // loop below) cannot race with that write. The n > 0 guard keeps x[0] and
+    // y[0] from being read when the arrays are empty.
+    // The format is %f because the arguments are floats (promoted to double
+    // by the variadic call); %x would reinterpret them as unsigned integers.
+    if (index == 0 && op->n > 0) {
+        printf("The X is: %f\n", op->x[0]);
+        printf("The Y is: %f\n", op->y[0]);
+    }
+
+    for (int i = index; i < op->n; i += stride) {
+        op->y[i] = op->x[i] + op->y[i];
+        printf("The value is: %f\n", op->y[i]);
+    }
+}
+
+// Reports a failed CUDA runtime call on stderr.
+// Returns true when status is cudaSuccess, false otherwise.
+static bool cudaOk(cudaError_t status, const char *what) {
+    if (status == cudaSuccess) {
+        return true;
+    }
+    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
+    return false;
+}
+
+// Fills op's arrays on the host, runs add() on the GPU and prints the results.
+// Expects op->x, op->y and op->n to be fully set up by the caller.
+// Returns false if the kernel launch or the synchronization fails.
+static bool runAdd(Operation *op) {
+    // initialize x and y arrays on the host
+    for (int i = 0; i < op->n; i++) {
+        op->x[i] = 1.0f;
+        op->y[i] = 2.0f;
+    }
+
+    // Launch kernel on n elements on the GPU
+    int blockSize = 256;
+    int numBlocks = (op->n + blockSize - 1) / blockSize;
+
+    std::cout << "numBlocks: " << numBlocks << std::endl;
+    std::cout << "N: " << op->n << std::endl;
+
+    add<<<numBlocks, blockSize>>>(op);
+
+    // A kernel launch returns nothing; launch-configuration errors are only
+    // visible through cudaGetLastError().
+    if (!cudaOk(cudaGetLastError(), "add kernel launch")) {
+        return false;
+    }
+
+    // Wait for GPU to finish before accessing on host. Faults raised while
+    // the kernel was running are reported by this call.
+    if (!cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize")) {
+        return false;
+    }
+
+    // Log results for debugging
+    std::cout << "Results (y = x + y):" << std::endl;
+    for (int i = 0; i < op->n; i++) {
+        std::cout << "y[" << i << "] = " << op->y[i] << " (expected: 3.0)" << std::endl;
+    }
+
+    // Check for errors (all values should be 3.0f)
+    float maxError = 0.0f;
+    for (int i = 0; i < op->n; i++) {
+        maxError = fmax(maxError, fabs(op->y[i] - 3.0f));
+    }
+    std::cout << "Max error: " << maxError << std::endl;
+
+    return true;
+}
+
+// Host driver: builds an Operation in unified memory, runs add() over 100
+// elements and prints every result plus the maximum deviation from 3.0.
+// Returns 0 when every checked CUDA call succeeded, 1 otherwise.
+int main(void) {
+    const int n = 100;
+
+    // Host-side copies of every allocation. Cleanup goes through these so it
+    // never has to dereference managed memory after a device-side failure.
+    Operation *op = nullptr;
+    float *x = nullptr;
+    float *y = nullptr;
+
+    // Allocate Unified Memory -- accessible from CPU or GPU
+    bool ok = cudaOk(cudaMallocManaged(&op, sizeof(Operation)), "cudaMallocManaged(op)") &&
+              cudaOk(cudaMallocManaged(&x, n * sizeof(float)), "cudaMallocManaged(x)") &&
+              cudaOk(cudaMallocManaged(&y, n * sizeof(float)), "cudaMallocManaged(y)");
+
+    if (ok) {
+        op->x = x;
+        op->y = y;
+        op->n = n;
+        ok = runAdd(op);
+    }
+
+    // Free memory. Pointers that were never allocated are still null, and
+    // cudaFree performs no operation on a null pointer.
+    cudaFree(x);
+    cudaFree(y);
+    cudaFree(op);
+
+    return ok ? 0 : 1;
+}