chore: unified working example

kevmo314 · Dec 13, 2024 · 5b890da · 5b890da
1 parent 3d29fc8
commit 5b890da
Show file tree

Hide file tree

Showing 4 changed files with 132 additions and 65 deletions.
diff --git a/client.cpp b/client.cpp
@@ -226,33 +226,31 @@ int rpc_read(const int index, void *data, size_t size)
 
 void allocate_unified_mem_pointer(const int index, void *dev_ptr, void *ptr, size_t size)
 {
-    std::cout << "Allocating device ptr: " << dev_ptr 
-              << " and host ptr: " << ptr 
-              << " with size: " << size << std::endl;
-
+    // Initialize unified_mem_pointers if not already initialized
     if (conns[index].mem_idx == 0) {
-        conns[index].unified_mem_pointers = new void **[5];
-        conns[index].unified_mem_pointers[0] = new void *[3];
+        conns[index].unified_mem_pointers = new void **[5]; // Initial capacity of 5
+        for (int i = 0; i < 5; ++i) {
+            conns[index].unified_mem_pointers[i] = nullptr;
+        }
     }
 
-    if (conns[index].mem_idx >= 5) {
-        int new_capacity = conns[index].mem_idx + 1;
+    // we need to handle resize here at some point
+    // // Resize array if current capacity is exceeded
+    // if (conns[index].mem_idx >= 5) {
+    //     int current_capacity = 5 + conns[index].mem_idx;
+    //     int new_capacity = conns[index].mem_idx + 5;
 
-        void ***new_arr = new void **[new_capacity];
+    //     void ***new_arr = new void **[new_capacity];
+    //     for (int i = 0; i < new_capacity; ++i) {
+    //         new_arr[i] = (i < conns[index].mem_idx) ? conns[index].unified_mem_pointers[i] : nullptr;
+    //     }
 
-        for (int i = 0; i < conns[index].mem_idx; ++i) {
-            new_arr[i] = conns[index].unified_mem_pointers[i];
-        }
+    //     delete[] conns[index].unified_mem_pointers;
+    //     conns[index].unified_mem_pointers = new_arr;
+    // }
 
-        for (int i = conns[index].mem_idx; i < new_capacity; ++i) {
-            new_arr[i] = new void *[3];
-        }
-
-        delete[] conns[index].unified_mem_pointers;
-        conns[index].unified_mem_pointers = new_arr;
-    } else {
-        conns[index].unified_mem_pointers[conns[index].mem_idx] = new void *[3];
-    }
+    // allocate new space for pointer mapping
+    conns[index].unified_mem_pointers[conns[index].mem_idx] = new void *[3];
 
     conns[index].unified_mem_pointers[conns[index].mem_idx][0] = dev_ptr;
     conns[index].unified_mem_pointers[conns[index].mem_idx][1] = ptr;
@@ -261,29 +259,45 @@ void allocate_unified_mem_pointer(const int index, void *dev_ptr, void *ptr, siz
     conns[index].mem_idx++;
 }
 
-void cuda_memcpy_unified_ptrs(const int index)
+void* maybe_get_cached_arg_ptr(const int index, int arg_index, void* arg)
 {
-    std::cout << "copying memory..." << std::endl;
+    if (arg_index >= conns[index].mem_idx)
+        return nullptr;
+
+   return &conns[index].unified_mem_pointers[arg_index][0];
+}
 
+void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind)
+{
     for (int i = 0; i < conns[index].mem_idx; i++) {
-        void *dev_ptr = conns[index].unified_mem_pointers[i][0];
-        void *host_ptr = conns[index].unified_mem_pointers[i][1];
-        size_t size = reinterpret_cast<size_t>(conns[index].unified_mem_pointers[i][2]);
+        if (kind == cudaMemcpyHostToDevice) {
+            void *dev_ptr = conns[index].unified_mem_pointers[i][0];
+            void *host_ptr = conns[index].unified_mem_pointers[i][1];
 
-        std::cout << "Index " << i << " Parameters:\n"
-                  << "  Device Pointer (dev_ptr): " << dev_ptr << "\n"
-                  << "  Host Pointer (host_ptr): " << host_ptr << "\n"
-                  << "  Size (bytes): " << size << "\n";
+            size_t size = reinterpret_cast<size_t>(conns[index].unified_mem_pointers[i][2]);
 
-        cudaError_t res = cudaMemcpy(dev_ptr, host_ptr, size, cudaMemcpyHostToDevice);
+            cudaError_t res = cudaMemcpy(dev_ptr, host_ptr, size, cudaMemcpyHostToDevice);
+
+            if (res != cudaSuccess) {
+                std::cerr << "cudaMemcpy failed for index " << i
+                        << ": " << cudaGetErrorString(res) << std::endl;
+            } else {
+                std::cout << "Successfully copied " << size << " bytes for index " << i << std::endl;
+            }
+        }  else {
+            void *dev_ptr = conns[index].unified_mem_pointers[i][0];
+            void *host_ptr = conns[index].unified_mem_pointers[i][1];
 
-        std::cout << "result: " << res << std::endl;
+            size_t size = reinterpret_cast<size_t>(conns[index].unified_mem_pointers[i][2]);
 
-        if (res != cudaSuccess) {
-            std::cerr << "cudaMemcpy failed for index " << i
-                      << ": " << cudaGetErrorString(res) << std::endl;
-        } else {
-            std::cout << "Successfully copied " << size << " bytes for index " << i << std::endl;
+            cudaError_t res = cudaMemcpy(host_ptr, dev_ptr, size, cudaMemcpyDeviceToHost);
+
+            if (res != cudaSuccess) {
+                std::cerr << "cudaMemcpy failed for index " << i
+                        << ": " << cudaGetErrorString(res) << std::endl;
+            } else {
+                std::cout << "Successfully copied " << size << " bytes for index " << i << std::endl;
+            }
         }
     }
 }
@@ -294,12 +308,9 @@ void* maybe_free_unified_mem(const int index, void *ptr)
         void *dev_ptr = conns[index].unified_mem_pointers[i][1];
         void *target_free_ptr = conns[index].unified_mem_pointers[i][0];
 
-        std::cout << "comparing pointers: " << dev_ptr << " ptr " << ptr << std::endl;
-
         if (dev_ptr == ptr) {
-            std::cout << "freeing pointer " << target_free_ptr << std::endl;
+            std::cout << "freeing dynamic pointer " << target_free_ptr << std::endl;
 
-            // mem addresses are the same, free
             return target_free_ptr;
         }
     }

diff --git a/codegen/gen_client.cpp b/codegen/gen_client.cpp
@@ -9117,19 +9117,20 @@ cudaError_t cudaOccupancyMaxActiveClusters(int* numClusters, const void* func, c
 
 cudaError_t cudaMallocManaged(void** devPtr, size_t size, unsigned int flags)
 {
-    std::cout << "CALLING MALLOC WITH DEV PTR: " << devPtr << std::endl;
+    void* host_alloc = new void*[size];
+    void* d_a;
 
-    void* host_alloc = (void*) malloc(size);
-    void*d_a;
-    cudaMalloc((void **)&d_a, size);
-    std::cout << "AFTER DEVICE PTR: " << d_a << std::endl;
+    cudaError_t err = cudaMalloc((void **)&d_a, size);
+    if (err != cudaSuccess) {
+        std::cerr << "cudaMalloc failed: " << cudaGetErrorString(err) << std::endl;
+        return err;
+    }
 
     allocate_unified_mem_pointer(0, d_a, host_alloc, size);
 
-    std::cout << "done allocate_unified_mem_pointer" << std::endl;
     *devPtr = host_alloc;
 
-    std::cout << "DONE MALLOC" << std::endl;
+    return cudaSuccess;
 }
 
 cudaError_t cudaMalloc(void** devPtr, size_t size)
@@ -9141,8 +9142,6 @@ cudaError_t cudaMalloc(void** devPtr, size_t size)
         rpc_read(0, devPtr, sizeof(void*)) < 0 ||
         rpc_end_response(0, &return_value) < 0)
         return cudaErrorDevicesUnavailable;
-
-    std::cout << "done calling cudaMalloc... " << devPtr << std::endl;
     return return_value;
 }
 

diff --git a/codegen/manual_client.cpp b/codegen/manual_client.cpp
@@ -24,7 +24,6 @@ extern int rpc_wait_for_response(const int index);
 extern int rpc_read(const int index, void *data, const std::size_t size);
 extern int rpc_end_response(const int index, void *return_value);
 extern int rpc_close();
-void cuda_memcpy_unified_ptrs(const int index);
 
 #define MAX_FUNCTION_NAME 1024
 #define MAX_ARGS 128
@@ -373,7 +372,7 @@ cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void
 {
     cudaError_t return_value;
 
-    cuda_memcpy_unified_ptrs(0);
+    cuda_memcpy_unified_ptrs(0, cudaMemcpyHostToDevice);
 
     // Start the RPC request
     int request_id = rpc_start_request(0, RPC_cudaLaunchKernel);
@@ -418,9 +417,22 @@ cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void
 
     for (int i = 0; i < f->arg_count; ++i)
     {
-        if (rpc_write(0, &f->arg_sizes[i], sizeof(int)) < 0 ||
-            rpc_write(0, args[i], f->arg_sizes[i]) < 0)
-            return cudaErrorDevicesUnavailable;
+        void* maybe_ptr = maybe_get_cached_arg_ptr(0, i, args[i]);
+
+        if (maybe_ptr != 0)
+        {
+            std::cout << "writing dynamic pointer" << std::endl;
+            if (rpc_write(0, &f->arg_sizes[i], sizeof(int)) < 0 ||
+                rpc_write(0, maybe_ptr, f->arg_sizes[i]) < 0)
+                return cudaErrorDevicesUnavailable;
+        }
+        else
+        {
+            std::cout << "writing original pointer" << std::endl;
+            if (rpc_write(0, &f->arg_sizes[i], sizeof(int)) < 0 ||
+                rpc_write(0, args[i], f->arg_sizes[i]) < 0)
+                return cudaErrorDevicesUnavailable;
+        }
     }
 
     if (rpc_wait_for_response(0) < 0)
@@ -433,6 +445,8 @@ cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void
         return cudaErrorDevicesUnavailable;
     }
 
+    cuda_memcpy_unified_ptrs(0, cudaMemcpyDeviceToHost);
+
     return return_value;
 }
 

diff --git a/test/unified.cu b/test/unified.cu
@@ -2,30 +2,70 @@
 #include <iostream>
 
 // CUDA Kernel to add elements of two arrays
-__global__ void addKernel(int *a, int *b, int *c, int size) {
+// __global__ void addKernel(int *a, int *b, int *c, int size) {
+//     int idx = threadIdx.x + blockIdx.x * blockDim.x;
+//     if (idx < size) {
+//         c[idx] = a[idx] * b[idx];
+//     }
+// }
+
+__global__ void mulKernel(int *a, int *c, int size) {
     int idx = threadIdx.x + blockIdx.x * blockDim.x;
     if (idx < size) {
-        c[idx] = a[idx] * b[idx];
+        c[idx] = a[idx] * 5;
     }
 }
 
 int main() {
+    // const int size = 10;
+    // const int bytes = size * sizeof(int);
+    // int *d_a;
+    // int *d_c;
+
+    // // host pointers
+    // int *a = new int[size];
+    // int *c = new int[size];
+    // // cudaMalloc(&a, bytes);
+    // cudaMalloc(&d_a, bytes);
+    // cudaMalloc(&d_c, bytes);
+    // for (int i = 0; i < size; ++i) {
+    //     a[i] = i;
+    // }
+
+    // cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice);
+    // cudaMemcpy(d_c, c, bytes, cudaMemcpyHostToDevice);
+
+    // const int threadsPerBlock = 256;
+    // std::cout << "ARG SIZE::: " << bytes << std::endl;
+    // const int blocks = (size + threadsPerBlock - 1) / threadsPerBlock;
+
+    // mulKernel<<<blocks, threadsPerBlock>>>(d_a, d_c, size);
+    // cudaMemcpy(a, d_a, bytes, cudaMemcpyDeviceToHost);
+    // cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost);
+
+    // std::cout << "Results:\n";
+    // for (int i = 0; i < size; ++i) {
+    //     std::cout << "a[" << i << "] + b[" << i << "] = " << c[i] << "\n";
+    // }
+
+    // cudaFree(a);
+    // cudaFree(c);
+
+
+
+
     // Define array size
     const int size = 10;
     const int bytes = size * sizeof(int);
 
-    std::cout << "HELLO" << std::endl;
-
     // Unified memory allocation
-    int *a, *b, *c;
+    int *a, *c;
     cudaMallocManaged(&a, bytes);
-    cudaMallocManaged(&b, bytes);
     cudaMallocManaged(&c, bytes);
 
     // Initialize arrays on the CPU
     for (int i = 0; i < size; ++i) {
         a[i] = i;
-        b[i] = i * 2;
     }
 
     // Define kernel launch parameters
@@ -34,12 +74,16 @@ int main() {
 
     std::cout << "launching kernel..." << std::endl;
 
+    for (int i = 0; i < size; ++i) {
+        std::cout << "a[" << i << "] + b[" << i << "] = " << a[i] << "\n";
+    }
+
     std::cout << "pointer a: " << a << std::endl;
-    std::cout << "pointer b: " << b << std::endl;
     std::cout << "pointer c: " << c << std::endl;
+    std::cout << "size c: " << size << std::endl;
 
     // Launch the kernel
-    addKernel<<<blocks, threadsPerBlock>>>(a, b, c, size);
+    mulKernel<<<blocks, threadsPerBlock>>>(a, c, size);
 
     // Wait for GPU to finish
     cudaDeviceSynchronize();
@@ -52,7 +96,6 @@ int main() {
 
     // Free unified memory
     cudaFree(a);
-    cudaFree(b);
     cudaFree(c);
 
     return 0;