diff --git a/client.cpp b/client.cpp index 7ebf37f..235d574 100644 --- a/client.cpp +++ b/client.cpp @@ -43,8 +43,7 @@ typedef struct struct iovec write_iov[128]; int write_iov_count = 0; - void ***unified_mem_pointers; - int mem_idx = 0; + std::unordered_map<void*, size_t> unified_devices; } conn_t; pthread_mutex_t conn_mutex; @@ -57,35 +56,44 @@ static int init = 0; static jmp_buf catch_segfault; static void* faulting_address = nullptr; -void add_host_mem_to_devptr_mapping(const int index, void *dev_ptr, void *host_ptr) -{ - for (int i = 0; i < conns[index].mem_idx; i++) - { - // index 1 is host pointer - if (conns[index].unified_mem_pointers[i][0] == dev_ptr) - { - conns[index].unified_mem_pointers[i][2] = host_ptr; - } - } -} - static void segfault(int sig, siginfo_t* info, void* unused) { faulting_address = info->si_addr; + std::cout << "Caught segfault at address: " << faulting_address << std::endl; - size_t page_size = sysconf(_SC_PAGESIZE); - void* aligned_address = (void*)((uintptr_t)faulting_address & ~(page_size - 1)); + for (const auto & [ ptr, sz ] : conns[0].unified_devices) + { + if (faulting_address >= (ptr) && faulting_address <= (ptr + sz)) + { + // ensure we assign memory as close to the faulting address as possible... + // by masking via the allocated unified memory size. 
+ void* aligned_address = (void*)((uintptr_t)faulting_address & ~(sz - 1)); + + // Allocate memory at the faulting address + void* allocated = mmap(aligned_address, sz, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (allocated == MAP_FAILED) { + perror("Failed to allocate memory at faulting address"); + _exit(1); + } + + std::cout << "allocated dynamic memory at address: " << allocated << std::endl; - // Allocate memory at the faulting address - void* allocated = mmap(aligned_address, page_size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_SHARED | MAP_ANONYMOUS, -1, 0); - if (allocated == MAP_FAILED) { - perror("Failed to allocate memory at faulting address"); - _exit(1); + return; + } } - add_host_mem_to_devptr_mapping(0, faulting_address, allocated); + // raise our original segfault handler + struct sigaction sa; + sa.sa_handler = SIG_DFL; + sigemptyset(&sa.sa_mask); + sa.sa_flags = 0; + + if (sigaction(SIGSEGV, &sa, nullptr) == -1) { + perror("Failed to reset SIGSEGV handler"); + _exit(EXIT_FAILURE); + } - std::cout << "Allocated and registered memory at address: " << allocated << std::endl; + raise(SIGSEGV); } static void set_segfault_handlers() { @@ -292,62 +300,33 @@ int rpc_read(const int index, void *data, size_t size) void allocate_unified_mem_pointer(const int index, void *dev_ptr, size_t size) { - // Initialize unified_mem_pointers if not already initialized - if (conns[index].mem_idx == 0) { - conns[index].unified_mem_pointers = new void **[10]; // Initial capacity of 5 - for (int i = 0; i < 5; ++i) { - conns[index].unified_mem_pointers[i] = nullptr; - } - } - // allocate new space for pointer mapping - conns[index].unified_mem_pointers[conns[index].mem_idx] = new void *[3]; - - conns[index].unified_mem_pointers[conns[index].mem_idx][0] = dev_ptr; - conns[index].unified_mem_pointers[conns[index].mem_idx][1] = reinterpret_cast(size); - conns[index].unified_mem_pointers[conns[index].mem_idx][2] = nullptr; - - 
conns[index].mem_idx++; -} - -void* maybe_get_cached_arg_ptr(const int index, void* arg_ptr) -{ - for (int i = 0; i < conns[index].mem_idx; i++) - { - // index 1 is host pointer - if (conns[index].unified_mem_pointers[i][1] == arg_ptr) - { - return &conns[index].unified_mem_pointers[i][0]; - } - } - - return nullptr; + conns[index].unified_devices.insert({ dev_ptr, size }); } void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind) { - for (int i = 0; i < conns[index].mem_idx; i++) { + for (const auto & [ ptr, sz ] : conns[index].unified_devices) { if (kind == cudaMemcpyHostToDevice) { - size_t size = reinterpret_cast(conns[index].unified_mem_pointers[i][1]); - - cudaError_t res = cudaMemcpy(conns[index].unified_mem_pointers[i][0], conns[index].unified_mem_pointers[i][0], size, cudaMemcpyHostToDevice); + size_t size = reinterpret_cast(sz); + // ptr is the same on both host/device + cudaError_t res = cudaMemcpy(ptr, ptr, size, cudaMemcpyHostToDevice); if (res != cudaSuccess) { - std::cerr << "cudaMemcpy failed for index " << i - << ": " << cudaGetErrorString(res) << std::endl; + std::cerr << "cudaMemcpy failed :" << cudaGetErrorString(res) << std::endl; } else { - std::cout << "Successfully copied " << size << " bytes for index " << i << std::endl; + std::cout << "Successfully copied " << size << " bytes" << std::endl; } - } else { - size_t size = reinterpret_cast(conns[index].unified_mem_pointers[i][1]); + } else { + size_t size = reinterpret_cast(sz); - cudaError_t res = cudaMemcpy(conns[index].unified_mem_pointers[i][0], conns[index].unified_mem_pointers[i][0], size, cudaMemcpyDeviceToHost); + // ptr is the same on both host/device + cudaError_t res = cudaMemcpy(ptr, ptr, size, cudaMemcpyDeviceToHost); if (res != cudaSuccess) { - std::cerr << "cudaMemcpy failed for index " << i - << ": " << cudaGetErrorString(res) << std::endl; + std::cerr << "cudaMemcpy failed :" << cudaGetErrorString(res) << std::endl; } else { - std::cout << "Successfully copied " 
<< size << " bytes for index " << i << std::endl; + std::cout << "Successfully copied " << size << " bytes" << std::endl; } } } @@ -355,11 +334,12 @@ void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind) void* maybe_free_unified_mem(const int index, void *ptr) { - for (int i = 0; i < conns[index].mem_idx; i++) { - void *dev_ptr = conns[index].unified_mem_pointers[i][0]; - size_t size = reinterpret_cast<size_t>(conns[index].unified_mem_pointers[i][1]); + for (const auto & [ dev_ptr, sz ] : conns[index].unified_devices) { + size_t size = reinterpret_cast<size_t>(sz); if (dev_ptr == ptr) { + std::cout << "mem-unmapping device ptr: " << dev_ptr << " size " << size << std::endl; + munmap(dev_ptr, size); return dev_ptr; } diff --git a/codegen/annotations.h b/codegen/annotations.h index ddaf798..42fafff 100644 --- a/codegen/annotations.h +++ b/codegen/annotations.h @@ -4357,6 +4357,7 @@ cudaError_t cudaOccupancyMaxPotentialClusterSize(int *clusterSize, const void *f */ cudaError_t cudaOccupancyMaxActiveClusters(int *numClusters, const void *func, const cudaLaunchConfig_t *launchConfig); /** + * @disabled * @param devPtr SEND_RECV * @param size SEND_ONLY * @param flags SEND_ONLY @@ -4388,6 +4389,7 @@ cudaError_t cudaMallocPitch(void **devPtr, size_t *pitch, size_t width, size_t h */ cudaError_t cudaMallocArray(cudaArray_t *array, const struct cudaChannelFormatDesc *desc, size_t width, size_t height, unsigned int flags); /** + * @disabled * @param devPtr SEND_ONLY */ cudaError_t cudaFree(void *devPtr); diff --git a/codegen/codegen.py b/codegen/codegen.py index 65bc104..14d7b84 100644 --- a/codegen/codegen.py +++ b/codegen/codegen.py @@ -69,9 +69,11 @@ # a list of manually implemented cuda/nvml functions. # these are automatically appended to each file; operation order is maintained as well. 
MANUAL_IMPLEMENTATIONS = [ + "cudaFree", "cudaMemcpy", "cudaMemcpyAsync", "cudaLaunchKernel", + "cudaMallocManaged", ] @dataclass diff --git a/codegen/gen_client.cpp b/codegen/gen_client.cpp index 455d079..03e538b 100644 --- a/codegen/gen_client.cpp +++ b/codegen/gen_client.cpp @@ -6,7 +6,6 @@ #include #include -#include #include #include "gen_api.h" @@ -21,8 +20,6 @@ extern int rpc_wait_for_response(const int index); extern int rpc_read(const int index, void *data, const std::size_t size); extern int rpc_end_response(const int index, void *return_value); extern int rpc_close(); -extern void* maybe_free_unified_mem(const int index, void *ptr); -extern void allocate_unified_mem_pointer(const int index, void *dev_ptr, size_t size); nvmlReturn_t nvmlInit_v2() { @@ -9115,25 +9112,6 @@ cudaError_t cudaOccupancyMaxActiveClusters(int* numClusters, const void* func, c return return_value; } -cudaError_t cudaMallocManaged(void** devPtr, size_t size, unsigned int flags) -{ - void* d_mem; - - cudaError_t err = cudaMalloc((void**)&d_mem, size); - if (err != cudaSuccess) { - std::cerr << "cudaMalloc failed: " << cudaGetErrorString(err) << std::endl; - return err; - } - - std::cout << "allocated unified device mem " << d_mem << std::endl; - - allocate_unified_mem_pointer(0, d_mem, size); - - *devPtr = d_mem; - - return cudaSuccess; -} - cudaError_t cudaMalloc(void** devPtr, size_t size) { cudaError_t return_value; @@ -9191,31 +9169,6 @@ cudaError_t cudaMallocArray(cudaArray_t* array, const struct cudaChannelFormatDe return return_value; } -cudaError_t cudaFree(void* devPtr) -{ - cudaError_t return_value; - void *maybe_ptr = maybe_free_unified_mem(0, devPtr); - - if (maybe_ptr != nullptr) { - std::cout << "POITNER FOUND!! 
" << maybe_ptr << std::endl; - - if (rpc_start_request(0, RPC_cudaFree) < 0 || - rpc_write(0, &maybe_ptr, sizeof(void*)) < 0 || - rpc_wait_for_response(0) < 0 || - rpc_end_response(0, &return_value) < 0) - return cudaErrorDevicesUnavailable; - } else { - std::cout << "no poitner found..." << std::endl; - if (rpc_start_request(0, RPC_cudaFree) < 0 || - rpc_write(0, &devPtr, sizeof(void*)) < 0 || - rpc_wait_for_response(0) < 0 || - rpc_end_response(0, &return_value) < 0) - return cudaErrorDevicesUnavailable; - } - - return return_value; -} - cudaError_t cudaFreeHost(void* ptr) { cudaError_t return_value; @@ -22176,12 +22129,10 @@ std::unordered_map functionMap = { {"cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", (void *)cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags}, {"cudaOccupancyMaxPotentialClusterSize", (void *)cudaOccupancyMaxPotentialClusterSize}, {"cudaOccupancyMaxActiveClusters", (void *)cudaOccupancyMaxActiveClusters}, - {"cudaMallocManaged", (void *)cudaMallocManaged}, {"cudaMalloc", (void *)cudaMalloc}, {"cudaMallocHost", (void *)cudaMallocHost}, {"cudaMallocPitch", (void *)cudaMallocPitch}, {"cudaMallocArray", (void *)cudaMallocArray}, - {"cudaFree", (void *)cudaFree}, {"cudaFreeHost", (void *)cudaFreeHost}, {"cudaFreeArray", (void *)cudaFreeArray}, {"cudaFreeMipmappedArray", (void *)cudaFreeMipmappedArray}, @@ -22874,9 +22825,11 @@ std::unordered_map functionMap = { {"cuMemFreeAsync_ptsz", (void *)cuMemFreeAsync}, {"cuMemAllocAsync_ptsz", (void *)cuMemAllocAsync}, {"cuMemAllocFromPoolAsync_ptsz", (void *)cuMemAllocFromPoolAsync}, + {"cudaFree", (void *)cudaFree}, {"cudaMemcpy", (void *)cudaMemcpy}, {"cudaMemcpyAsync", (void *)cudaMemcpyAsync}, {"cudaLaunchKernel", (void *)cudaLaunchKernel}, + {"cudaMallocManaged", (void *)cudaMallocManaged}, }; void *get_function_pointer(const char *name) diff --git a/codegen/gen_server.cpp b/codegen/gen_server.cpp index f5ddb9e..9a6cb8b 100644 --- a/codegen/gen_server.cpp +++ b/codegen/gen_server.cpp 
@@ -19387,36 +19387,6 @@ int handle_cudaOccupancyMaxActiveClusters(void *conn) return -1; } -int handle_cudaMallocManaged(void *conn) -{ - void* devPtr; - size_t size; - unsigned int flags; - int request_id; - cudaError_t scuda_intercept_result; - std::cout << "calling cudaMallocManaged" << std::endl; - if ( - rpc_read(conn, &devPtr, sizeof(void*)) < 0 || - rpc_read(conn, &size, sizeof(size_t)) < 0 || - rpc_read(conn, &flags, sizeof(unsigned int)) < 0 || - false) - goto ERROR_0; - - request_id = rpc_end_request(conn); - if (request_id < 0) - goto ERROR_0; - scuda_intercept_result = cudaMallocManaged(&devPtr, size, flags); - - if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &devPtr, sizeof(void*)) < 0 || - rpc_end_response(conn, &scuda_intercept_result) < 0) - goto ERROR_0; - - return 0; -ERROR_0: - return -1; -} - int handle_cudaMalloc(void *conn) { void* devPtr; @@ -19433,8 +19403,6 @@ int handle_cudaMalloc(void *conn) goto ERROR_0; scuda_intercept_result = cudaMalloc(&devPtr, size); - std::cout << "ADDRESS : " << &devPtr << std::endl; - if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &devPtr, sizeof(void*)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) @@ -19537,31 +19505,6 @@ int handle_cudaMallocArray(void *conn) return -1; } -int handle_cudaFree(void *conn) -{ - void* devPtr; - int request_id; - cudaError_t scuda_intercept_result; - if ( - rpc_read(conn, &devPtr, sizeof(void*)) < 0 || - false) - goto ERROR_0; - - std::cout << "freeing... 
" << devPtr << std::endl; - request_id = rpc_end_request(conn); - if (request_id < 0) - goto ERROR_0; - scuda_intercept_result = cudaFree(devPtr); - - if (rpc_start_response(conn, request_id) < 0 || - rpc_end_response(conn, &scuda_intercept_result) < 0) - goto ERROR_0; - - return 0; -ERROR_0: - return -1; -} - int handle_cudaFreeHost(void *conn) { void* ptr; diff --git a/codegen/manual_client.cpp b/codegen/manual_client.cpp index 2b921ca..13ccba6 100755 --- a/codegen/manual_client.cpp +++ b/codegen/manual_client.cpp @@ -24,8 +24,9 @@ extern int rpc_wait_for_response(const int index); extern int rpc_read(const int index, void *data, const std::size_t size); extern int rpc_end_response(const int index, void *return_value); extern int rpc_close(); -void* maybe_get_cached_arg_ptr(const int index, void* arg_ptr); void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind); +extern void* maybe_free_unified_mem(const int index, void *ptr); +extern void allocate_unified_mem_pointer(const int index, void *dev_ptr, size_t size); #define MAX_FUNCTION_NAME 1024 #define MAX_ARGS 128 @@ -833,3 +834,44 @@ extern "C" } } } + +cudaError_t cudaFree(void* devPtr) +{ + cudaError_t return_value; + void *maybe_ptr = maybe_free_unified_mem(0, devPtr); + + if (maybe_ptr != nullptr) { + if (rpc_start_request(0, RPC_cudaFree) < 0 || + rpc_write(0, &maybe_ptr, sizeof(void*)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return cudaErrorDevicesUnavailable; + } else { + if (rpc_start_request(0, RPC_cudaFree) < 0 || + rpc_write(0, &devPtr, sizeof(void*)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return cudaErrorDevicesUnavailable; + } + + return return_value; +} + +cudaError_t cudaMallocManaged(void** devPtr, size_t size, unsigned int flags) +{ + void* d_mem; + + cudaError_t err = cudaMalloc((void**)&d_mem, size); + if (err != cudaSuccess) { + std::cerr << "cudaMalloc failed: " << cudaGetErrorString(err) 
<< std::endl; + return err; + } + + std::cout << "allocated unified device mem " << d_mem << " size: " << size << std::endl; + + allocate_unified_mem_pointer(0, d_mem, size); + + *devPtr = d_mem; + + return cudaSuccess; +} diff --git a/codegen/manual_client.h b/codegen/manual_client.h index 1666ce3..a10bfcb 100644 --- a/codegen/manual_client.h +++ b/codegen/manual_client.h @@ -3,6 +3,8 @@ #include #include +cudaError_t cudaFree(void* devPtr); +cudaError_t cudaMallocManaged(void** devPtr, size_t size, unsigned int flags); cudaError_t cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind); cudaError_t cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream); cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream); diff --git a/local.sh b/local.sh index aeed4bf..ad68a88 100755 --- a/local.sh +++ b/local.sh @@ -28,7 +28,7 @@ build() { nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn ./test/cudnn.cu -o cudnn.o nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/cublas_batched.cu -o cublas_batched.o nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified.cu -o unified.o - nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified_2.cu -o unified_2.o + nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified_pointer.cu -o unified_pointer.o nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified_linked.cu -o unified_linked.o if [ ! 
-f "$libscuda_path" ]; then diff --git a/test/unified_pointer.cu b/test/unified_pointer.cu index cf672aa..1e39104 100644 --- a/test/unified_pointer.cu +++ b/test/unified_pointer.cu @@ -17,7 +17,6 @@ __global__ void add(Operation *op) { for (int i = index; i < op->n; i += stride) { op->y[i] = op->x[i] + op->y[i]; - printf("The value is: %f\n", op->y[i]); } } @@ -42,7 +41,7 @@ int main(void) { int numBlocks = (op->n + blockSize - 1) / blockSize; std::cout << "numBlocks: " << numBlocks << std::endl; - std::cout << "N: " << op->n << std::endl; + std::cout << "X: " << &op->x << std::endl; add<<<numBlocks, blockSize>>>(op);