Skip to content

Commit

Permalink
chore: capture device pointers
Browse files Browse the repository at this point in the history
  • Loading branch information
brodeynewman committed Dec 8, 2024
1 parent aeef059 commit 3d29fc8
Show file tree
Hide file tree
Showing 6 changed files with 190 additions and 17 deletions.
85 changes: 85 additions & 0 deletions client.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <cuda.h>
#include <sys/uio.h>
#include <netinet/tcp.h>
#include <cuda_runtime.h>

#include <unordered_map>

Expand All @@ -34,6 +35,9 @@ typedef struct
pthread_cond_t read_cond;
struct iovec write_iov[128];
int write_iov_count = 0;

void ***unified_mem_pointers;
int mem_idx = 0;
} conn_t;

pthread_mutex_t conn_mutex;
Expand Down Expand Up @@ -220,6 +224,87 @@ int rpc_read(const int index, void *data, size_t size)
return n;
}

// Register a managed-memory mapping on connection `index`: host shadow
// buffer `ptr` mirrors device buffer `dev_ptr`, `size` bytes long.
//
// Mappings live in conns[index].unified_mem_pointers, one row of three
// void* slots per entry:
//   [0] device pointer, [1] host pointer, [2] size (stored in a void*).
// The table starts with room for 5 rows and is re-allocated one row at a
// time once full (there is no separate capacity field; capacity tracks
// mem_idx + 1 past the initial 5).
//
// NOTE(review): rows and the table itself are never freed anywhere —
// the bookkeeping leaks for the lifetime of the connection; TODO.
void allocate_unified_mem_pointer(const int index, void *dev_ptr, void *ptr, size_t size)
{
  if (conns[index].mem_idx == 0) {
    // First mapping on this connection: create the table and row 0.
    // This branch chain is exclusive — previously row 0 was allocated
    // here AND again below, leaking the first allocation.
    conns[index].unified_mem_pointers = new void **[5];
    conns[index].unified_mem_pointers[0] = new void *[3];
  } else if (conns[index].mem_idx >= 5) {
    // Table is full: grow it by one row, moving the old row pointers over.
    int new_capacity = conns[index].mem_idx + 1;
    void ***new_arr = new void **[new_capacity];

    for (int i = 0; i < conns[index].mem_idx; ++i) {
      new_arr[i] = conns[index].unified_mem_pointers[i];
    }
    new_arr[conns[index].mem_idx] = new void *[3];

    delete[] conns[index].unified_mem_pointers;
    conns[index].unified_mem_pointers = new_arr;
  } else {
    // Room left in the initial table: just allocate the new row.
    conns[index].unified_mem_pointers[conns[index].mem_idx] = new void *[3];
  }

  void **entry = conns[index].unified_mem_pointers[conns[index].mem_idx];
  entry[0] = dev_ptr;
  entry[1] = ptr;
  entry[2] = reinterpret_cast<void *>(size);

  conns[index].mem_idx++;
}

// Push every registered host shadow buffer on connection `index` to its
// device counterpart (host -> device). Intended to run before a kernel
// launch so the device sees the latest host-side writes to "managed"
// allocations (see allocate_unified_mem_pointer for the row layout).
//
// Failures are reported per entry on stderr and the remaining entries are
// still attempted, so one bad mapping does not block the rest.
void cuda_memcpy_unified_ptrs(const int index)
{
  for (int i = 0; i < conns[index].mem_idx; i++) {
    void *dev_ptr = conns[index].unified_mem_pointers[i][0];
    void *host_ptr = conns[index].unified_mem_pointers[i][1];
    // Size was stashed in a void* slot; recover the integer value.
    size_t size = reinterpret_cast<size_t>(conns[index].unified_mem_pointers[i][2]);

    cudaError_t res = cudaMemcpy(dev_ptr, host_ptr, size, cudaMemcpyHostToDevice);
    if (res != cudaSuccess) {
      std::cerr << "cudaMemcpy failed for index " << i
                << ": " << cudaGetErrorString(res) << std::endl;
    }
  }
}

// Look up `ptr` — a host shadow pointer previously handed out by
// cudaMallocManaged — in connection `index`'s mapping table and return the
// matching device pointer, or nullptr when `ptr` is not a managed
// allocation.
//
// Despite the name, nothing is freed here: the caller uses the returned
// device pointer to issue the real cudaFree RPC.
// NOTE(review): the table row and the host buffer are left in place, so a
// "freed" mapping is still visited by cuda_memcpy_unified_ptrs — TODO.
void* maybe_free_unified_mem(const int index, void *ptr)
{
  for (int i = 0; i < conns[index].mem_idx; i++) {
    // Row layout: [0] device pointer, [1] host pointer.
    void *host_ptr = conns[index].unified_mem_pointers[i][1];
    void *device_ptr = conns[index].unified_mem_pointers[i][0];

    if (host_ptr == ptr) {
      return device_ptr;
    }
  }

  // Fix: the original fell off the end of a non-void function here
  // (undefined behaviour); report "not found" explicitly instead.
  return nullptr;
}

int rpc_end_response(const int index, void *result)
{
if (read(conns[index].connfd, result, sizeof(int)) < 0 ||
Expand Down
52 changes: 37 additions & 15 deletions codegen/gen_client.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

#include <cstring>
#include <string>
#include <iostream>
#include <unordered_map>

#include "gen_api.h"
Expand All @@ -20,6 +21,8 @@ extern int rpc_wait_for_response(const int index);
extern int rpc_read(const int index, void *data, const std::size_t size);
extern int rpc_end_response(const int index, void *return_value);
extern int rpc_close();
extern void* maybe_free_unified_mem(const int index, void *ptr);
extern void allocate_unified_mem_pointer(const int index, void *dev_ptr, void *ptr, size_t size);

nvmlReturn_t nvmlInit_v2()
{
Expand Down Expand Up @@ -9114,16 +9117,19 @@ cudaError_t cudaOccupancyMaxActiveClusters(int* numClusters, const void* func, c

cudaError_t cudaMallocManaged(void** devPtr, size_t size, unsigned int flags)
{
    // Emulate managed memory over the RPC transport: allocate a host-side
    // shadow buffer plus a remote device buffer, register the pair, and
    // hand the HOST pointer back to the caller. Host writes are shipped to
    // the device later by cuda_memcpy_unified_ptrs (before kernel launch).
    //
    // NOTE(review): `flags` (cudaMemAttachGlobal/Host) is ignored — TODO
    // confirm callers never rely on attach semantics.
    void* host_alloc = malloc(size);
    if (host_alloc == nullptr)
        return cudaErrorMemoryAllocation;

    // Fix: the original ignored cudaMalloc's status and fell off the end
    // of the function without returning a value (undefined behaviour).
    void* d_a = nullptr;
    cudaError_t err = cudaMalloc(&d_a, size);
    if (err != cudaSuccess) {
        free(host_alloc);
        return err;
    }

    // Record the (device, host, size) triple for later synchronization
    // and for pointer translation in cudaFree.
    allocate_unified_mem_pointer(0, d_a, host_alloc, size);

    *devPtr = host_alloc;
    return cudaSuccess;
}

cudaError_t cudaMalloc(void** devPtr, size_t size)
Expand All @@ -9135,6 +9141,8 @@ cudaError_t cudaMalloc(void** devPtr, size_t size)
rpc_read(0, devPtr, sizeof(void*)) < 0 ||
rpc_end_response(0, &return_value) < 0)
return cudaErrorDevicesUnavailable;

std::cout << "done calling cudaMalloc... " << devPtr << std::endl;
return return_value;
}

Expand Down Expand Up @@ -9186,11 +9194,25 @@ cudaError_t cudaMallocArray(cudaArray_t* array, const struct cudaChannelFormatDe
cudaError_t cudaFree(void* devPtr)
{
    // Free a device allocation over RPC. If `devPtr` is actually a host
    // shadow pointer from our cudaMallocManaged emulation, translate it to
    // the real device pointer first; otherwise send it through unchanged.
    //
    // Fix: the original duplicated the entire RPC sequence in both
    // branches (differing only in which pointer was written); resolve the
    // pointer once and issue a single request.
    cudaError_t return_value;

    void* managed_dev_ptr = maybe_free_unified_mem(0, devPtr);
    void* target = (managed_dev_ptr != nullptr) ? managed_dev_ptr : devPtr;

    if (rpc_start_request(0, RPC_cudaFree) < 0 ||
        rpc_write(0, &target, sizeof(void*)) < 0 ||
        rpc_wait_for_response(0) < 0 ||
        rpc_end_response(0, &return_value) < 0)
        return cudaErrorDevicesUnavailable;

    return return_value;
}

Expand Down
6 changes: 5 additions & 1 deletion codegen/gen_server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19394,6 +19394,7 @@ int handle_cudaMallocManaged(void *conn)
unsigned int flags;
int request_id;
cudaError_t scuda_intercept_result;
std::cout << "calling cudaMallocManaged" << std::endl;
if (
rpc_read(conn, &devPtr, sizeof(void*)) < 0 ||
rpc_read(conn, &size, sizeof(size_t)) < 0 ||
Expand Down Expand Up @@ -19432,6 +19433,8 @@ int handle_cudaMalloc(void *conn)
goto ERROR_0;
scuda_intercept_result = cudaMalloc(&devPtr, size);

std::cout << "ADDRESS : " << &devPtr << std::endl;

if (rpc_start_response(conn, request_id) < 0 ||
rpc_write(conn, &devPtr, sizeof(void*)) < 0 ||
rpc_end_response(conn, &scuda_intercept_result) < 0)
Expand Down Expand Up @@ -19543,7 +19546,8 @@ int handle_cudaFree(void *conn)
rpc_read(conn, &devPtr, sizeof(void*)) < 0 ||
false)
goto ERROR_0;


std::cout << "freeing... " << devPtr << std::endl;
request_id = rpc_end_request(conn);
if (request_id < 0)
goto ERROR_0;
Expand Down
3 changes: 3 additions & 0 deletions codegen/manual_client.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ extern int rpc_wait_for_response(const int index);
extern int rpc_read(const int index, void *data, const std::size_t size);
extern int rpc_end_response(const int index, void *return_value);
extern int rpc_close();
void cuda_memcpy_unified_ptrs(const int index);

#define MAX_FUNCTION_NAME 1024
#define MAX_ARGS 128
Expand Down Expand Up @@ -372,6 +373,8 @@ cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void
{
cudaError_t return_value;

cuda_memcpy_unified_ptrs(0);

// Start the RPC request
int request_id = rpc_start_request(0, RPC_cudaLaunchKernel);
if (request_id < 0)
Expand Down
2 changes: 1 addition & 1 deletion local.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ build() {

nvcc --cudart=shared -lnvidia-ml -lcuda ./test/vector_add.cu -o vector.o
nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn ./test/cudnn.cu -o cudnn.o

nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/cublas_batched.cu -o cublas_batched.o
nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified.cu -o unified.o

if [ ! -f "$libscuda_path" ]; then
echo "libscuda.so not found. build may have failed."
Expand Down
59 changes: 59 additions & 0 deletions test/unified.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#include <cuda_runtime.h>
#include <iostream>

// CUDA Kernel to add elements of two arrays
// CUDA kernel: element-wise sum c[idx] = a[idx] + b[idx] for idx < size.
// One thread per element; launch with at least `size` total threads.
__global__ void addKernel(int *a, int *b, int *c, int size) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < size) {
        // Fix: the body multiplied (a * b) while the kernel's name, its
        // header comment, and the caller's "a[i] + b[i] =" output all
        // describe addition — align the computation with the contract.
        c[idx] = a[idx] + b[idx];
    }
}

// Smoke test for the managed-memory path: allocate three arrays with
// cudaMallocManaged, initialize inputs on the CPU, run addKernel, and
// print the results after synchronizing.
int main() {
    // Element count and byte size of each array.
    const int size = 10;
    const size_t bytes = size * sizeof(int);

    // Allocate through the managed-memory path under test.
    // Fix: the original ignored the allocation results entirely.
    int *a = nullptr, *b = nullptr, *c = nullptr;
    if (cudaMallocManaged(&a, bytes) != cudaSuccess ||
        cudaMallocManaged(&b, bytes) != cudaSuccess ||
        cudaMallocManaged(&c, bytes) != cudaSuccess) {
        std::cerr << "cudaMallocManaged failed" << std::endl;
        return 1;
    }

    // Initialize inputs on the CPU (managed pointers are host-writable).
    for (int i = 0; i < size; ++i) {
        a[i] = i;
        b[i] = i * 2;
    }

    // Ceil-division launch config; one block suffices for 10 elements.
    const int threadsPerBlock = 256;
    const int blocks = (size + threadsPerBlock - 1) / threadsPerBlock;

    std::cout << "launching kernel..." << std::endl;

    addKernel<<<blocks, threadsPerBlock>>>(a, b, c, size);

    // Fix: surface launch-configuration errors and in-kernel faults that
    // the original silently ignored.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        std::cerr << "kernel launch failed: " << cudaGetErrorString(err) << std::endl;
        return 1;
    }
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        std::cerr << "kernel execution failed: " << cudaGetErrorString(err) << std::endl;
        return 1;
    }

    // Host reads of `c` are valid after the synchronize above.
    std::cout << "Results:\n";
    for (int i = 0; i < size; ++i) {
        std::cout << "a[" << i << "] + b[" << i << "] = " << c[i] << "\n";
    }

    // Release the managed allocations.
    cudaFree(a);
    cudaFree(b);
    cudaFree(c);

    return 0;
}

0 comments on commit 3d29fc8

Please sign in to comment.