From 062360ca1ff2531fc68e3d80ce256079fbef75fd Mon Sep 17 00:00:00 2001
From: Brodey Newman <brodeynewman@gmail.com>
Date: Mon, 16 Dec 2024 17:04:39 +0000
Subject: [PATCH] chore: build

---
 client.cpp                |  12 +--
 codegen/gen_server.cpp    |   2 +-
 codegen/manual_client.cpp |  17 ++--
 codegen/manual_server.cpp |   2 +
 local.sh                  |   1 +
 test/unified_2.cu         | 163 ++++++++++++++++++++++++++++++++++++++
 6 files changed, 178 insertions(+), 19 deletions(-)
 create mode 100644 test/unified_2.cu

diff --git a/client.cpp b/client.cpp
index b341a23..c8a3887 100644
--- a/client.cpp
+++ b/client.cpp
@@ -277,12 +277,9 @@ void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind)
 {
     for (int i = 0; i < conns[index].mem_idx; i++) {
         if (kind == cudaMemcpyHostToDevice) {
-            void *dev_ptr = conns[index].unified_mem_pointers[i][0];
-            void *host_ptr = conns[index].unified_mem_pointers[i][1];
-
             size_t size = reinterpret_cast<size_t>(conns[index].unified_mem_pointers[i][2]);
 
-            cudaError_t res = cudaMemcpy(dev_ptr, host_ptr, size, cudaMemcpyHostToDevice);
+            cudaError_t res = cudaMemcpy(conns[index].unified_mem_pointers[i][0], conns[index].unified_mem_pointers[i][1], size, cudaMemcpyHostToDevice);
 
             if (res != cudaSuccess) {
                 std::cerr << "cudaMemcpy failed for index " << i
@@ -291,12 +288,9 @@ void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind)
                 std::cout << "Successfully copied " << size << " bytes for index " << i << std::endl;
             }
         }  else {
-            void *dev_ptr = conns[index].unified_mem_pointers[i][0];
-            void *host_ptr = conns[index].unified_mem_pointers[i][1];
-
             size_t size = reinterpret_cast<size_t>(conns[index].unified_mem_pointers[i][2]);
 
-            cudaError_t res = cudaMemcpy(host_ptr, dev_ptr, size, cudaMemcpyDeviceToHost);
+            cudaError_t res = cudaMemcpy(conns[index].unified_mem_pointers[i][1], conns[index].unified_mem_pointers[i][0], size, cudaMemcpyDeviceToHost);
 
             if (res != cudaSuccess) {
                 std::cerr << "cudaMemcpy failed for index " << i
@@ -315,8 +309,6 @@ void* maybe_free_unified_mem(const int index, void *ptr)
         void *target_free_ptr = conns[index].unified_mem_pointers[i][0];
 
         if (dev_ptr == ptr) {
-            std::cout << "freeing dynamic pointer " << target_free_ptr << std::endl;
-
             return target_free_ptr;
         }
     }
diff --git a/codegen/gen_server.cpp b/codegen/gen_server.cpp
index 1613728..f5ddb9e 100644
--- a/codegen/gen_server.cpp
+++ b/codegen/gen_server.cpp
@@ -19546,7 +19546,7 @@ int handle_cudaFree(void *conn)
         rpc_read(conn, &devPtr, sizeof(void*)) < 0 ||
         false)
         goto ERROR_0;
-        
+            
     std::cout << "freeing... " << devPtr << std::endl;
     request_id = rpc_end_request(conn);
     if (request_id < 0)
diff --git a/codegen/manual_client.cpp b/codegen/manual_client.cpp
index 04ca4ca..c2356d2 100755
--- a/codegen/manual_client.cpp
+++ b/codegen/manual_client.cpp
@@ -422,20 +422,23 @@ cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void
         // convert each to a void pointer so that we can map it back to...
         // our origional host pointers.
         void *arg_ptr = *reinterpret_cast<void**>(args[i]);
-        void* maybe_ptr = maybe_get_cached_arg_ptr(0, arg_ptr);
+        void *maybe_ptr = maybe_get_cached_arg_ptr(0, arg_ptr);
 
+        // Hardconding 24 bytes for now for the unified memory case. Will remove before merge!
         if (maybe_ptr != 0)
         {
-            std::cout << "writing dynamic pointer" << std::endl;
-            if (rpc_write(0, &f->arg_sizes[i], sizeof(int)) < 0 ||
-                rpc_write(0, maybe_ptr, f->arg_sizes[i]) < 0)
+            int size = 24;
+            std::cout << "writing dynamic pointer " << maybe_ptr << std::endl;
+            if (rpc_write(0, &size, sizeof(int)) < 0 ||
+                rpc_write(0, maybe_ptr, size) < 0)
                 return cudaErrorDevicesUnavailable;
         }
         else
         {
+            int size = 24;
             std::cout << "writing original pointer" << std::endl;
-            if (rpc_write(0, &f->arg_sizes[i], sizeof(int)) < 0 ||
-                rpc_write(0, args[i], f->arg_sizes[i]) < 0)
+            if (rpc_write(0, &size, sizeof(int)) < 0 ||
+                rpc_write(0, args[i], size) < 0)
                 return cudaErrorDevicesUnavailable;
         }
     }
@@ -540,8 +543,6 @@ void parse_ptx_string(void *fatCubin, const char *ptx_string, unsigned long long
                         if (type_size == 0)
                             continue;
                         arg_size = type_size;
-
-                        std::cout << "arg size: " << arg_size << std::endl;
                     }
                     else if (ptx_string[i] == '[')
                     {
diff --git a/codegen/manual_server.cpp b/codegen/manual_server.cpp
index cd2b0fa..c9a8130 100755
--- a/codegen/manual_server.cpp
+++ b/codegen/manual_server.cpp
@@ -264,6 +264,8 @@ int handle_cudaLaunchKernel(void *conn)
 
     result = cudaLaunchKernel(func, gridDim, blockDim, args, sharedMem, stream);
 
+    std::cout << "Launch kern result: " << result << std::endl;
+
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_end_response(conn, &result) < 0)
         goto ERROR_1;
diff --git a/local.sh b/local.sh
index 9ec1903..097a239 100755
--- a/local.sh
+++ b/local.sh
@@ -28,6 +28,7 @@ build() {
   nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn ./test/cudnn.cu -o cudnn.o
   nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/cublas_batched.cu -o cublas_batched.o
   nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified.cu -o unified.o
+  nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified_2.cu -o unified_2.o
 
   if [ ! -f "$libscuda_path" ]; then
     echo "libscuda.so not found. build may have failed."
diff --git a/test/unified_2.cu b/test/unified_2.cu
new file mode 100644
index 0000000..41a58c6
--- /dev/null
+++ b/test/unified_2.cu
@@ -0,0 +1,163 @@
+#include <iostream>
+#include <math.h>
+
+struct Operation {
+    float *x;
+    float *y;
+    int n;
+};
+
+// CUDA kernel to add elements of two arrays
+__global__ void add(Operation *op) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    int stride = blockDim.x * gridDim.x;
+
+    printf("The X is: %x\n", op->x[0]);
+    printf("The Y is: %x\n", op->y[0]);
+    for (int i = index; i < op->n; i += stride)
+    {
+      op->y[i] = op->x[i] + op->y[i];
+      printf("The value is: %f\n", op->y[i]);
+    }
+}
+
+int main(void) {
+    Operation host_op; // Host structure
+    Operation *device_op; // Device structure
+
+    // Initialize array size
+    host_op.n = 100;
+
+    // Allocate memory for device operation struct
+    cudaMalloc(&device_op, sizeof(Operation));
+
+    // Allocate memory for x and y arrays on the device
+    cudaMalloc(&host_op.x, host_op.n * sizeof(float));
+    cudaMalloc(&host_op.y, host_op.n * sizeof(float));
+
+    // Initialize x and y arrays on the host
+    float *host_x = new float[host_op.n];
+    float *host_y = new float[host_op.n];
+    for (int i = 0; i < host_op.n; i++) {
+        host_x[i] = 1.0f;
+        host_y[i] = 2.0f;
+    }
+
+    // Copy x and y arrays from host to device
+    cudaMemcpy(host_op.x, host_x, host_op.n * sizeof(float), cudaMemcpyHostToDevice);
+    cudaMemcpy(host_op.y, host_y, host_op.n * sizeof(float), cudaMemcpyHostToDevice);
+
+    // Copy host operation struct to device
+    cudaMemcpy(device_op, &host_op, sizeof(Operation), cudaMemcpyHostToDevice);
+
+    // Launch kernel
+    int blockSize = 256;
+    int numBlocks = (host_op.n + blockSize - 1) / blockSize;
+    add<<<numBlocks, blockSize>>>(device_op);
+
+    // Wait for GPU to finish before accessing results
+    cudaDeviceSynchronize();
+
+    // Copy results from device to host
+    cudaMemcpy(host_y, host_op.y, host_op.n * sizeof(float), cudaMemcpyDeviceToHost);
+
+    // Log results for debugging
+    std::cout << "Results (y = x + y):" << std::endl;
+    for (int i = 0; i < host_op.n; i++) {
+        std::cout << "y[" << i << "] = " << host_y[i] << " (expected: 3.0)" << std::endl;
+    }
+
+    // Check for errors (all values should be 3.0f)
+    float maxError = 0.0f;
+    for (int i = 0; i < host_op.n; i++) {
+        maxError = fmax(maxError, fabs(host_y[i] - 3.0f));
+    }
+
+    // Free device memory
+    cudaFree(host_op.x);
+    cudaFree(host_op.y);
+    cudaFree(device_op);
+
+    // Free host memory
+    delete[] host_x;
+    delete[] host_y;
+
+    return 0;
+}
+
+
+
+// // ******UNIFIED MEMORY EXAMPLE BELOW*******
+
+
+// #include <iostream>
+// #include <math.h>
+
+// struct Operation {
+//     float *x;
+//     float *y;
+//     int n;
+// };
+
+// // CUDA kernel to add elements of two arrays
+// __global__ void add(Operation *op) {
+//     int index = blockIdx.x * blockDim.x + threadIdx.x;
+//     int stride = blockDim.x * gridDim.x;
+
+//     printf("The X is: %x\n", op->x[0]);
+//     printf("The Y is: %x\n", op->y[0]);
+//     for (int i = index; i < op->n; i += stride)
+//     {
+//       op->y[i] = op->x[i] + op->y[i];
+//       printf("The value is: %f\n", op->y[i]);
+//     }
+// }
+
+// int main(void) {
+//     Operation *op;
+
+//     // Allocate Unified Memory -- accessible from CPU or GPU
+//     cudaMallocManaged(&op, sizeof(Operation));
+//     op->n = 100;
+
+//     cudaMallocManaged(&op->x, op->n * sizeof(float));
+//     cudaMallocManaged(&op->y, op->n * sizeof(float));
+
+//     // initialize x and y arrays on the host
+//     for (int i = 0; i < op->n; i++) {
+//         op->x[i] = 1.0f;
+//         op->y[i] = 2.0f;
+//     }
+
+//     // Launch kernel on n elements on the GPU
+//     int blockSize = 256;
+//     int numBlocks = (op->n + blockSize - 1) / blockSize;
+
+//     std::cout << "numBlocks: " << numBlocks << std::endl;
+//     std::cout << "N: " << op->n << std::endl;
+
+//     add<<<numBlocks, blockSize>>>(op);
+
+//     // Wait for GPU to finish before accessing on host
+//     cudaDeviceSynchronize();
+
+//     // Log results for debugging
+//     std::cout << "Results (y = x + y):" << std::endl;
+//     for (int i = 0; i < op->n; i++) {
+//         std::cout << "y[" << i << "] = " << op->y[i] << " (expected: 3.0)" << std::endl;
+//     }
+
+//     // Check for errors (all values should be 3.0f)
+//     float maxError = 0.0f;
+//     for (int i = 0; i < op->n; i++) {
+//         maxError = fmax(maxError, fabs(op->y[i] - 3.0f));
+//     }
+//     std::cout << "Max error: " << maxError << std::endl;
+
+//     // Free memory
+//     cudaFree(op->x);
+//     cudaFree(op->y);
+//     cudaFree(op);
+
+//     return 0;
+// }