Skip to content

Commit

Permalink
chore: mmap
Browse files Browse the repository at this point in the history
  • Loading branch information
brodeynewman committed Dec 18, 2024
1 parent 062360c commit a6683b4
Show file tree
Hide file tree
Showing 8 changed files with 312 additions and 173 deletions.
104 changes: 78 additions & 26 deletions client.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,13 @@

#include <unordered_map>

#include <setjmp.h>
#include <signal.h>
#include <csignal>
#include <cstdlib>
#include <cstring>
#include <sys/mman.h>

#include "codegen/gen_client.h"

typedef struct
Expand All @@ -46,8 +53,67 @@ int nconns = 0;

const char *DEFAULT_PORT = "14833";

static int init = 0;
static jmp_buf catch_segfault;
static void* faulting_address = nullptr;

// Attach a host-side mapping to an already-tracked unified allocation.
// Entry layout (see allocate_unified_mem_pointer):
//   [0] = device pointer, [1] = allocation size, [2] = host mapping.
// Called from the SIGSEGV handler once it has mmap'd host memory for a
// faulting device-pointer access.
void add_host_mem_to_devptr_mapping(const int index, void *dev_ptr, void *host_ptr)
{
    for (int i = 0; i < conns[index].mem_idx; i++)
    {
        // slot 0 is the device pointer; slot 2 receives the host mapping
        if (conns[index].unified_mem_pointers[i][0] == dev_ptr)
        {
            conns[index].unified_mem_pointers[i][2] = host_ptr;
            break; // device pointers are unique; no need to scan further
        }
    }
}

// SIGSEGV handler backing the fake unified-memory scheme: the client hands
// raw device pointers to the application, so the first host-side touch of
// one faults here. We map an anonymous page at the faulting address and
// record it as the host mirror for that device pointer; on return the
// kernel retries the faulting instruction against the new mapping.
// NOTE(review): std::cout/perror are not async-signal-safe; acceptable for
// a debug path, but consider write(2) if this fires in production.
static void segfault(int sig, siginfo_t* info, void* unused) {
    faulting_address = info->si_addr;
    std::cout << "Caught segfault at address: " << faulting_address << std::endl;

    size_t page_size = sysconf(_SC_PAGESIZE);
    void* aligned_address = (void*)((uintptr_t)faulting_address & ~(page_size - 1));

    // MAP_FIXED is required for correctness: without it the address is only
    // a hint, and if the kernel places the page elsewhere the retried
    // instruction faults again and we re-enter this handler forever.
    // PROT_EXEC dropped — this is a data page, keep it non-executable.
    void* allocated = mmap(aligned_address, page_size, PROT_READ | PROT_WRITE,
                           MAP_FIXED | MAP_SHARED | MAP_ANONYMOUS, -1, 0);
    if (allocated == MAP_FAILED) {
        perror("Failed to allocate memory at faulting address");
        _exit(1);
    }

    add_host_mem_to_devptr_mapping(0, faulting_address, allocated);

    std::cout << "Allocated and registered memory at address: " << allocated << std::endl;
}

// Install the SIGSEGV handler exactly once per process; later calls are
// no-ops. Exits the process if the handler cannot be registered, since the
// unified-memory emulation cannot work without it.
static void set_segfault_handlers() {
    if (init > 0) {
        return;
    }

    struct sigaction action = {}; // zero-initialized, like the memset it replaces
    action.sa_sigaction = segfault;
    action.sa_flags = SA_SIGINFO; // we need siginfo_t to read the fault address

    if (sigaction(SIGSEGV, &action, nullptr) != 0) {
        perror("sigaction");
        exit(EXIT_FAILURE);
    }

    std::cout << "Segfault handler installed." << std::endl;

    init = 1;
}

int rpc_open()
{
set_segfault_handlers();

sigsetjmp(catch_segfault, 1);

if (pthread_mutex_lock(&conn_mutex) < 0)
return -1;

Expand Down Expand Up @@ -224,37 +290,22 @@ int rpc_read(const int index, void *data, size_t size)
return n;
}

void allocate_unified_mem_pointer(const int index, void *dev_ptr, void *ptr, size_t size)
void allocate_unified_mem_pointer(const int index, void *dev_ptr, size_t size)
{
// Initialize unified_mem_pointers if not already initialized
if (conns[index].mem_idx == 0) {
conns[index].unified_mem_pointers = new void **[5]; // Initial capacity of 5
conns[index].unified_mem_pointers = new void **[10]; // Initial capacity of 5
for (int i = 0; i < 5; ++i) {
conns[index].unified_mem_pointers[i] = nullptr;
}
}

// we need to handle resize here at some point
// // Resize array if current capacity is exceeded
// if (conns[index].mem_idx >= 5) {
// int current_capacity = 5 + conns[index].mem_idx;
// int new_capacity = conns[index].mem_idx + 5;

// void ***new_arr = new void **[new_capacity];
// for (int i = 0; i < new_capacity; ++i) {
// new_arr[i] = (i < conns[index].mem_idx) ? conns[index].unified_mem_pointers[i] : nullptr;
// }

// delete[] conns[index].unified_mem_pointers;
// conns[index].unified_mem_pointers = new_arr;
// }

// allocate new space for pointer mapping
conns[index].unified_mem_pointers[conns[index].mem_idx] = new void *[3];

conns[index].unified_mem_pointers[conns[index].mem_idx][0] = dev_ptr;
conns[index].unified_mem_pointers[conns[index].mem_idx][1] = ptr;
conns[index].unified_mem_pointers[conns[index].mem_idx][2] = reinterpret_cast<void*>(size);
conns[index].unified_mem_pointers[conns[index].mem_idx][1] = reinterpret_cast<void*>(size);
conns[index].unified_mem_pointers[conns[index].mem_idx][2] = nullptr;

conns[index].mem_idx++;
}
Expand All @@ -277,9 +328,9 @@ void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind)
{
for (int i = 0; i < conns[index].mem_idx; i++) {
if (kind == cudaMemcpyHostToDevice) {
size_t size = reinterpret_cast<size_t>(conns[index].unified_mem_pointers[i][2]);
size_t size = reinterpret_cast<size_t>(conns[index].unified_mem_pointers[i][1]);

cudaError_t res = cudaMemcpy(conns[index].unified_mem_pointers[i][0], conns[index].unified_mem_pointers[i][1], size, cudaMemcpyHostToDevice);
cudaError_t res = cudaMemcpy(conns[index].unified_mem_pointers[i][0], conns[index].unified_mem_pointers[i][0], size, cudaMemcpyHostToDevice);

if (res != cudaSuccess) {
std::cerr << "cudaMemcpy failed for index " << i
Expand All @@ -288,9 +339,9 @@ void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind)
std::cout << "Successfully copied " << size << " bytes for index " << i << std::endl;
}
} else {
size_t size = reinterpret_cast<size_t>(conns[index].unified_mem_pointers[i][2]);
size_t size = reinterpret_cast<size_t>(conns[index].unified_mem_pointers[i][1]);

cudaError_t res = cudaMemcpy(conns[index].unified_mem_pointers[i][1], conns[index].unified_mem_pointers[i][0], size, cudaMemcpyDeviceToHost);
cudaError_t res = cudaMemcpy(conns[index].unified_mem_pointers[i][0], conns[index].unified_mem_pointers[i][0], size, cudaMemcpyDeviceToHost);

if (res != cudaSuccess) {
std::cerr << "cudaMemcpy failed for index " << i
Expand All @@ -305,11 +356,12 @@ void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind)
// If `ptr` is a tracked unified-memory device pointer, unmap its host-side
// mirror and return the device pointer so the caller can free the remote
// allocation. Returns nullptr when `ptr` is not a unified allocation.
// NOTE(review): munmap assumes the handler mapped host memory at exactly the
// device-pointer address; if the pointer was never touched host-side there
// is no mapping and munmap simply fails harmlessly — confirm this is intended.
void* maybe_free_unified_mem(const int index, void *ptr)
{
    for (int i = 0; i < conns[index].mem_idx; i++) {
        void *dev_ptr = conns[index].unified_mem_pointers[i][0];
        size_t size = reinterpret_cast<size_t>(conns[index].unified_mem_pointers[i][1]);

        if (dev_ptr == ptr) {
            munmap(dev_ptr, size);
            return dev_ptr;
        }
    }

    // Previously fell off the end with no return value (undefined behavior);
    // report "not a unified allocation" explicitly.
    return nullptr;
}
Expand Down
13 changes: 7 additions & 6 deletions codegen/gen_client.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ extern int rpc_read(const int index, void *data, const std::size_t size);
extern int rpc_end_response(const int index, void *return_value);
extern int rpc_close();
extern void* maybe_free_unified_mem(const int index, void *ptr);
extern void allocate_unified_mem_pointer(const int index, void *dev_ptr, void *ptr, size_t size);
extern void allocate_unified_mem_pointer(const int index, void *dev_ptr, size_t size);

nvmlReturn_t nvmlInit_v2()
{
Expand Down Expand Up @@ -9117,18 +9117,19 @@ cudaError_t cudaOccupancyMaxActiveClusters(int* numClusters, const void* func, c

// Interposed cudaMallocManaged: allocate device memory only and hand the raw
// device pointer back to the application. The first host-side access of that
// pointer faults, and the SIGSEGV handler (client.cpp) maps host memory at
// the faulting address to back it.
// NOTE(review): `flags` (cudaMemAttachGlobal/cudaMemAttachHost) is currently
// ignored — confirm callers never rely on attach-host semantics.
cudaError_t cudaMallocManaged(void** devPtr, size_t size, unsigned int flags)
{
    void* d_mem = nullptr;

    cudaError_t err = cudaMalloc((void**)&d_mem, size);
    if (err != cudaSuccess) {
        std::cerr << "cudaMalloc failed: " << cudaGetErrorString(err) << std::endl;
        return err;
    }

    std::cout << "allocated unified device mem " << d_mem << std::endl;

    // Record the allocation so memcpy/free interposers and the segfault
    // handler can find it later.
    allocate_unified_mem_pointer(0, d_mem, size);

    *devPtr = d_mem;

    return cudaSuccess;
}
Expand Down
27 changes: 5 additions & 22 deletions codegen/manual_client.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -419,28 +419,9 @@ cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void

for (int i = 0; i < f->arg_count; ++i)
{
// convert each to a void pointer so that we can map it back to...
// our origional host pointers.
void *arg_ptr = *reinterpret_cast<void**>(args[i]);
void *maybe_ptr = maybe_get_cached_arg_ptr(0, arg_ptr);

// Hardconding 24 bytes for now for the unified memory case. Will remove before merge!
if (maybe_ptr != 0)
{
int size = 24;
std::cout << "writing dynamic pointer " << maybe_ptr << std::endl;
if (rpc_write(0, &size, sizeof(int)) < 0 ||
rpc_write(0, maybe_ptr, size) < 0)
return cudaErrorDevicesUnavailable;
}
else
{
int size = 24;
std::cout << "writing original pointer" << std::endl;
if (rpc_write(0, &size, sizeof(int)) < 0 ||
rpc_write(0, args[i], size) < 0)
return cudaErrorDevicesUnavailable;
}
if (rpc_write(0, &f->arg_sizes[i], sizeof(int)) < 0 ||
rpc_write(0, args[i], f->arg_sizes[i]) < 0)
return cudaErrorDevicesUnavailable;
}

if (rpc_wait_for_response(0) < 0)
Expand Down Expand Up @@ -763,6 +744,8 @@ extern "C"
{
void *return_value;

std::cout << "calling __cudaRegisterVar" << std::endl;

// Start the RPC request
int request_id = rpc_start_request(0, RPC___cudaRegisterVar);
if (request_id < 0)
Expand Down
36 changes: 36 additions & 0 deletions deploy/Dockerfile.unified
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Runtime image for the scuda "unified memory" example client.
# Base is Ubuntu 24.04, but the CUDA toolkit is pulled from NVIDIA's
# Ubuntu 22.04 (jammy) repository — hence the extra jammy apt source below.
FROM ubuntu:24.04

# Base build/runtime tooling. The jammy repo is added solely to obtain
# libtinfo5, which the CUDA 12.2 toolchain still links against and which
# Ubuntu 24.04 no longer ships.
RUN apt-get update && apt-get install -y \
build-essential \
wget \
curl \
python3 \
python3-pip \
gnupg \
software-properties-common && \
add-apt-repository 'deb http://archive.ubuntu.com/ubuntu jammy main universe' && \
apt-get update && \
apt-get install -y libtinfo5 && \
rm -rf /var/lib/apt/lists/*

# Register NVIDIA's CUDA apt repository (keyring for the ubuntu2204 feed).
RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb && \
dpkg -i cuda-keyring_1.0-1_all.deb && \
rm cuda-keyring_1.0-1_all.deb && \
apt-get update

RUN apt-get install -y cuda-toolkit-12-2

ENV PATH=/usr/local/cuda-12.2/bin:${PATH}
ENV LD_LIBRARY_PATH=/usr/local/cuda-12.2/lib64

# SCUDA_SERVER: address of the remote GPU server the interposer connects to.
# libscuda_path: consumed by start.sh as the LD_PRELOAD target.
ENV SCUDA_SERVER=100.118.7.128
ENV libscuda_path=/usr/local/lib/libscuda.so

# unified.o is the pre-built test binary (compiled by local.sh from
# test/unified.cu); it lands at /unified.o since the workdir is /.
COPY ./libscuda.so /usr/local/lib/libscuda.so
COPY unified.o unified.o

COPY start.sh /start.sh
RUN chmod +x /start.sh
RUN chmod +x /unified.o

# start.sh dispatches on its first argument; "unified" runs /unified.o
# with the scuda interposer preloaded.
CMD ["/bin/bash", "/start.sh", "unified"]
6 changes: 5 additions & 1 deletion deploy/start.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ elif [[ "$1" == "cublas" ]]; then
echo "Running cublas example..."

LD_PRELOAD="$libscuda_path" /matrixMulCUBLAS
elif [[ "$1" == "unified" ]]; then
echo "Running cublas example..."

LD_PRELOAD="$libscuda_path" /unified.o
else
echo "Unknown option: $1. Please specify 'torch' or 'cublas'."
echo "Unknown option: $1. Please specify one of: torch | cublas | unified ."
fi
1 change: 1 addition & 0 deletions local.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ build() {
nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/cublas_batched.cu -o cublas_batched.o
nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified.cu -o unified.o
nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified_2.cu -o unified_2.o
nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified_linked.cu -o unified_linked.o

if [ ! -f "$libscuda_path" ]; then
echo "libscuda.so not found. build may have failed."
Expand Down
Loading

0 comments on commit a6683b4

Please sign in to comment.