From a6683b43ae2041f0033ab62b8685ada58d3bddf7 Mon Sep 17 00:00:00 2001
From: Brodey Newman <brodeynewman@gmail.com>
Date: Wed, 18 Dec 2024 05:37:34 +0000
Subject: [PATCH] chore: mmap

---
 client.cpp                | 104 ++++++++++++----
 codegen/gen_client.cpp    |  13 +-
 codegen/manual_client.cpp |  27 +----
 deploy/Dockerfile.unified |  36 ++++++
 deploy/start.sh           |   6 +-
 local.sh                  |   1 +
 test/unified_2.cu         | 244 ++++++++++++++++++++------------------
 test/unified_linked.cu    |  54 +++++++++
 8 files changed, 312 insertions(+), 173 deletions(-)
 create mode 100644 deploy/Dockerfile.unified
 create mode 100644 test/unified_linked.cu
diff --git a/client.cpp b/client.cpp
index c8a3887..7ebf37f 100644
--- a/client.cpp
+++ b/client.cpp
@@ -22,6 +22,13 @@
 
 #include <unordered_map>
 
+#include <setjmp.h>
+#include <signal.h>
+#include <csignal>
+#include <cstdlib>
+#include <cstring>
+#include <sys/mman.h>
+
 #include "codegen/gen_client.h"
 
 typedef struct
@@ -46,8 +53,67 @@ int nconns = 0;
 
 const char *DEFAULT_PORT = "14833";
 
+static int init = 0;
+static jmp_buf catch_segfault;
+static void* faulting_address = nullptr;
+
+void add_host_mem_to_devptr_mapping(const int index, void *dev_ptr, void *host_ptr)
+{
+    for (int i = 0; i < conns[index].mem_idx; i++)
+    {
+        // index 1 is host pointer
+        if (conns[index].unified_mem_pointers[i][0] == dev_ptr)
+        {
+            conns[index].unified_mem_pointers[i][2] = host_ptr;
+        }
+    }
+}
+
+static void segfault(int sig, siginfo_t* info, void* unused) {
+    faulting_address = info->si_addr; 
+    std::cout << "Caught segfault at address: " << faulting_address << std::endl;
+
+    size_t page_size = sysconf(_SC_PAGESIZE);
+    void* aligned_address = (void*)((uintptr_t)faulting_address & ~(page_size - 1));
+
+    // Allocate memory at the faulting address
+    void* allocated = mmap(aligned_address, page_size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+    if (allocated == MAP_FAILED) {
+        perror("Failed to allocate memory at faulting address");
+        _exit(1);
+    }
+
+    add_host_mem_to_devptr_mapping(0, faulting_address, allocated);
+
+    std::cout << "Allocated and registered memory at address: " << allocated << std::endl;
+}
+
+static void set_segfault_handlers() {
+    if (init > 0) {
+        return;
+    }
+
+    struct sigaction sa;
+    memset(&sa, 0, sizeof(sa));
+    sa.sa_flags = SA_SIGINFO;
+    sa.sa_sigaction = segfault;
+
+    if (sigaction(SIGSEGV, &sa, NULL) == -1) {
+        perror("sigaction");
+        exit(EXIT_FAILURE);
+    }
+
+    std::cout << "Segfault handler installed." << std::endl;
+
+    init = 1;
+}
+
 int rpc_open()
 {
+    set_segfault_handlers();
+
+    sigsetjmp(catch_segfault, 1);
+
     if (pthread_mutex_lock(&conn_mutex) < 0)
         return -1;
 
@@ -224,37 +290,22 @@ int rpc_read(const int index, void *data, size_t size)
     return n;
 }
 
-void allocate_unified_mem_pointer(const int index, void *dev_ptr, void *ptr, size_t size)
+void allocate_unified_mem_pointer(const int index, void *dev_ptr, size_t size)
 {
     // Initialize unified_mem_pointers if not already initialized
     if (conns[index].mem_idx == 0) {
-        conns[index].unified_mem_pointers = new void **[5]; // Initial capacity of 5
+        conns[index].unified_mem_pointers = new void **[10]; // Initial capacity of 5
         for (int i = 0; i < 5; ++i) {
             conns[index].unified_mem_pointers[i] = nullptr;
         }
     }
 
-    // we need to handle resize here at some point
-    // // Resize array if current capacity is exceeded
-    // if (conns[index].mem_idx >= 5) {
-    //     int current_capacity = 5 + conns[index].mem_idx;
-    //     int new_capacity = conns[index].mem_idx + 5;
-
-    //     void ***new_arr = new void **[new_capacity];
-    //     for (int i = 0; i < new_capacity; ++i) {
-    //         new_arr[i] = (i < conns[index].mem_idx) ? conns[index].unified_mem_pointers[i] : nullptr;
-    //     }
-
-    //     delete[] conns[index].unified_mem_pointers;
-    //     conns[index].unified_mem_pointers = new_arr;
-    // }
-
     // allocate new space for pointer mapping
     conns[index].unified_mem_pointers[conns[index].mem_idx] = new void *[3];
 
     conns[index].unified_mem_pointers[conns[index].mem_idx][0] = dev_ptr;
-    conns[index].unified_mem_pointers[conns[index].mem_idx][1] = ptr;
-    conns[index].unified_mem_pointers[conns[index].mem_idx][2] = reinterpret_cast<void*>(size);
+    conns[index].unified_mem_pointers[conns[index].mem_idx][1] = reinterpret_cast<void*>(size);
+    conns[index].unified_mem_pointers[conns[index].mem_idx][2] = nullptr;
 
     conns[index].mem_idx++;
 }
@@ -277,9 +328,9 @@ void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind)
 {
     for (int i = 0; i < conns[index].mem_idx; i++) {
         if (kind == cudaMemcpyHostToDevice) {
-            size_t size = reinterpret_cast<size_t>(conns[index].unified_mem_pointers[i][2]);
+            size_t size = reinterpret_cast<size_t>(conns[index].unified_mem_pointers[i][1]);
 
-            cudaError_t res = cudaMemcpy(conns[index].unified_mem_pointers[i][0], conns[index].unified_mem_pointers[i][1], size, cudaMemcpyHostToDevice);
+            cudaError_t res = cudaMemcpy(conns[index].unified_mem_pointers[i][0], conns[index].unified_mem_pointers[i][0], size, cudaMemcpyHostToDevice);
 
             if (res != cudaSuccess) {
                 std::cerr << "cudaMemcpy failed for index " << i
@@ -288,9 +339,9 @@ void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind)
                 std::cout << "Successfully copied " << size << " bytes for index " << i << std::endl;
             }
         }  else {
-            size_t size = reinterpret_cast<size_t>(conns[index].unified_mem_pointers[i][2]);
+            size_t size = reinterpret_cast<size_t>(conns[index].unified_mem_pointers[i][1]);
 
-            cudaError_t res = cudaMemcpy(conns[index].unified_mem_pointers[i][1], conns[index].unified_mem_pointers[i][0], size, cudaMemcpyDeviceToHost);
+            cudaError_t res = cudaMemcpy(conns[index].unified_mem_pointers[i][0], conns[index].unified_mem_pointers[i][0], size, cudaMemcpyDeviceToHost);
 
             if (res != cudaSuccess) {
                 std::cerr << "cudaMemcpy failed for index " << i
@@ -305,11 +356,12 @@ void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind)
 void* maybe_free_unified_mem(const int index, void *ptr)
 {
     for (int i = 0; i < conns[index].mem_idx; i++) {
-        void *dev_ptr = conns[index].unified_mem_pointers[i][1];
-        void *target_free_ptr = conns[index].unified_mem_pointers[i][0];
+        void *dev_ptr = conns[index].unified_mem_pointers[i][0];
+        size_t size = reinterpret_cast<size_t>(conns[index].unified_mem_pointers[i][1]);
 
         if (dev_ptr == ptr) {
-            return target_free_ptr;
+            munmap(dev_ptr, size);
+            return dev_ptr;
         }
     }
 }
diff --git a/codegen/gen_client.cpp b/codegen/gen_client.cpp
index fef7890..455d079 100644
--- a/codegen/gen_client.cpp
+++ b/codegen/gen_client.cpp
@@ -22,7 +22,7 @@ extern int rpc_read(const int index, void *data, const std::size_t size);
 extern int rpc_end_response(const int index, void *return_value);
 extern int rpc_close();
 extern void* maybe_free_unified_mem(const int index, void *ptr);
-extern void allocate_unified_mem_pointer(const int index, void *dev_ptr, void *ptr, size_t size);
+extern void allocate_unified_mem_pointer(const int index, void *dev_ptr, size_t size);
 
 nvmlReturn_t nvmlInit_v2()
 {
@@ -9117,18 +9117,19 @@ cudaError_t cudaOccupancyMaxActiveClusters(int* numClusters, const void* func, c
 
 cudaError_t cudaMallocManaged(void** devPtr, size_t size, unsigned int flags)
 {
-    void* host_alloc = new void*[size];
-    void* d_a;
+    void* d_mem;
 
-    cudaError_t err = cudaMalloc((void **)&d_a, size);
+    cudaError_t err = cudaMalloc((void**)&d_mem, size);
     if (err != cudaSuccess) {
         std::cerr << "cudaMalloc failed: " << cudaGetErrorString(err) << std::endl;
         return err;
     }
 
-    allocate_unified_mem_pointer(0, d_a, host_alloc, size);
+    std::cout << "allocated unified device mem " << d_mem << std::endl;
 
-    *devPtr = host_alloc;
+    allocate_unified_mem_pointer(0, d_mem, size);
+
+    *devPtr = d_mem;
 
     return cudaSuccess;
 }
diff --git a/codegen/manual_client.cpp b/codegen/manual_client.cpp
index c2356d2..2b921ca 100755
--- a/codegen/manual_client.cpp
+++ b/codegen/manual_client.cpp
@@ -419,28 +419,9 @@ cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void
 
     for (int i = 0; i < f->arg_count; ++i)
     {
-        // convert each to a void pointer so that we can map it back to...
-        // our origional host pointers.
-        void *arg_ptr = *reinterpret_cast<void**>(args[i]);
-        void *maybe_ptr = maybe_get_cached_arg_ptr(0, arg_ptr);
-
-        // Hardconding 24 bytes for now for the unified memory case. Will remove before merge!
-        if (maybe_ptr != 0)
-        {
-            int size = 24;
-            std::cout << "writing dynamic pointer " << maybe_ptr << std::endl;
-            if (rpc_write(0, &size, sizeof(int)) < 0 ||
-                rpc_write(0, maybe_ptr, size) < 0)
-                return cudaErrorDevicesUnavailable;
-        }
-        else
-        {
-            int size = 24;
-            std::cout << "writing original pointer" << std::endl;
-            if (rpc_write(0, &size, sizeof(int)) < 0 ||
-                rpc_write(0, args[i], size) < 0)
-                return cudaErrorDevicesUnavailable;
-        }
+        if (rpc_write(0, &f->arg_sizes[i], sizeof(int)) < 0 ||
+            rpc_write(0, args[i], f->arg_sizes[i]) < 0)
+            return cudaErrorDevicesUnavailable;
     }
 
     if (rpc_wait_for_response(0) < 0)
@@ -763,6 +744,8 @@ extern "C"
     {
         void *return_value;
 
+        std::cout << "calling __cudaRegisterVar" << std::endl;
+
         // Start the RPC request
         int request_id = rpc_start_request(0, RPC___cudaRegisterVar);
         if (request_id < 0)
diff --git a/deploy/Dockerfile.unified b/deploy/Dockerfile.unified
new file mode 100644
index 0000000..d16804b
--- /dev/null
+++ b/deploy/Dockerfile.unified
@@ -0,0 +1,36 @@
+FROM ubuntu:24.04
+
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    wget \
+    curl \
+    python3 \
+    python3-pip \
+    gnupg \
+    software-properties-common && \
+    add-apt-repository 'deb http://archive.ubuntu.com/ubuntu jammy main universe' && \
+    apt-get update && \
+    apt-get install -y libtinfo5 && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb && \
+    dpkg -i cuda-keyring_1.0-1_all.deb && \
+    rm cuda-keyring_1.0-1_all.deb && \
+    apt-get update
+
+RUN apt-get install -y cuda-toolkit-12-2
+
+ENV PATH=/usr/local/cuda-12.2/bin:${PATH}
+ENV LD_LIBRARY_PATH=/usr/local/cuda-12.2/lib64
+
+ENV SCUDA_SERVER=100.118.7.128
+ENV libscuda_path=/usr/local/lib/libscuda.so
+
+COPY ./libscuda.so /usr/local/lib/libscuda.so
+COPY unified.o unified.o
+
+COPY start.sh /start.sh
+RUN chmod +x /start.sh
+RUN chmod +x /unified.o
+
+CMD ["/bin/bash", "/start.sh", "unified"]
\ No newline at end of file
diff --git a/deploy/start.sh b/deploy/start.sh
index aa66e39..091b4cf 100644
--- a/deploy/start.sh
+++ b/deploy/start.sh
@@ -10,6 +10,10 @@ elif [[ "$1" == "cublas" ]]; then
     echo "Running cublas example..."
 
     LD_PRELOAD="$libscuda_path" /matrixMulCUBLAS
+elif [[ "$1" == "unified" ]]; then
+    echo "Running cublas example..."
+
+    LD_PRELOAD="$libscuda_path" /unified.o
 else
-    echo "Unknown option: $1. Please specify 'torch' or 'cublas'."
+    echo "Unknown option: $1. Please specify one of: torch | cublas | unified ."
 fi
\ No newline at end of file
diff --git a/local.sh b/local.sh
index 097a239..aeed4bf 100755
--- a/local.sh
+++ b/local.sh
@@ -29,6 +29,7 @@ build() {
   nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/cublas_batched.cu -o cublas_batched.o
   nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified.cu -o unified.o
   nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified_2.cu -o unified_2.o
+  nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified_linked.cu -o unified_linked.o
 
   if [ ! -f "$libscuda_path" ]; then
     echo "libscuda.so not found. build may have failed."
diff --git a/test/unified_2.cu b/test/unified_2.cu
index 41a58c6..9642075 100644
--- a/test/unified_2.cu
+++ b/test/unified_2.cu
@@ -1,95 +1,3 @@
-#include <iostream>
-#include <math.h>
-
-struct Operation {
-    float *x;
-    float *y;
-    int n;
-};
-
-// CUDA kernel to add elements of two arrays
-__global__ void add(Operation *op) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    int stride = blockDim.x * gridDim.x;
-
-    printf("The X is: %x\n", op->x[0]);
-    printf("The Y is: %x\n", op->y[0]);
-    for (int i = index; i < op->n; i += stride)
-    {
-      op->y[i] = op->x[i] + op->y[i];
-      printf("The value is: %f\n", op->y[i]);
-    }
-}
-
-int main(void) {
-    Operation host_op; // Host structure
-    Operation *device_op; // Device structure
-
-    // Initialize array size
-    host_op.n = 100;
-
-    // Allocate memory for device operation struct
-    cudaMalloc(&device_op, sizeof(Operation));
-
-    // Allocate memory for x and y arrays on the device
-    cudaMalloc(&host_op.x, host_op.n * sizeof(float));
-    cudaMalloc(&host_op.y, host_op.n * sizeof(float));
-
-    // Initialize x and y arrays on the host
-    float *host_x = new float[host_op.n];
-    float *host_y = new float[host_op.n];
-    for (int i = 0; i < host_op.n; i++) {
-        host_x[i] = 1.0f;
-        host_y[i] = 2.0f;
-    }
-
-    // Copy x and y arrays from host to device
-    cudaMemcpy(host_op.x, host_x, host_op.n * sizeof(float), cudaMemcpyHostToDevice);
-    cudaMemcpy(host_op.y, host_y, host_op.n * sizeof(float), cudaMemcpyHostToDevice);
-
-    // Copy host operation struct to device
-    cudaMemcpy(device_op, &host_op, sizeof(Operation), cudaMemcpyHostToDevice);
-
-    // Launch kernel
-    int blockSize = 256;
-    int numBlocks = (host_op.n + blockSize - 1) / blockSize;
-    add<<<numBlocks, blockSize>>>(device_op);
-
-    // Wait for GPU to finish before accessing results
-    cudaDeviceSynchronize();
-
-    // Copy results from device to host
-    cudaMemcpy(host_y, host_op.y, host_op.n * sizeof(float), cudaMemcpyDeviceToHost);
-
-    // Log results for debugging
-    std::cout << "Results (y = x + y):" << std::endl;
-    for (int i = 0; i < host_op.n; i++) {
-        std::cout << "y[" << i << "] = " << host_y[i] << " (expected: 3.0)" << std::endl;
-    }
-
-    // Check for errors (all values should be 3.0f)
-    float maxError = 0.0f;
-    for (int i = 0; i < host_op.n; i++) {
-        maxError = fmax(maxError, fabs(host_y[i] - 3.0f));
-    }
-
-    // Free device memory
-    cudaFree(host_op.x);
-    cudaFree(host_op.y);
-    cudaFree(device_op);
-
-    // Free host memory
-    delete[] host_x;
-    delete[] host_y;
-
-    return 0;
-}
-
-
-
-// // ******UNIFIED MEMORY EXAMPLE BELOW*******
-
-
 // #include <iostream>
 // #include <math.h>
 
@@ -114,50 +22,150 @@ int main(void) {
 // }
 
 // int main(void) {
-//     Operation *op;
+//     Operation host_op; // Host structure
+//     Operation *device_op; // Device structure
+
+//     // Initialize array size
+//     host_op.n = 100;
 
-//     // Allocate Unified Memory -- accessible from CPU or GPU
-//     cudaMallocManaged(&op, sizeof(Operation));
-//     op->n = 100;
+//     // Allocate memory for device operation struct
+//     cudaMalloc(&device_op, sizeof(Operation));
 
-//     cudaMallocManaged(&op->x, op->n * sizeof(float));
-//     cudaMallocManaged(&op->y, op->n * sizeof(float));
+//     // Allocate memory for x and y arrays on the device
+//     cudaMalloc(&host_op.x, host_op.n * sizeof(float));
+//     cudaMalloc(&host_op.y, host_op.n * sizeof(float));
 
-//     // initialize x and y arrays on the host
-//     for (int i = 0; i < op->n; i++) {
-//         op->x[i] = 1.0f;
-//         op->y[i] = 2.0f;
+//     // Initialize x and y arrays on the host
+//     float *host_x = new float[host_op.n];
+//     float *host_y = new float[host_op.n];
+//     for (int i = 0; i < host_op.n; i++) {
+//         host_x[i] = 1.0f;
+//         host_y[i] = 2.0f;
 //     }
 
-//     // Launch kernel on n elements on the GPU
-//     int blockSize = 256;
-//     int numBlocks = (op->n + blockSize - 1) / blockSize;
+//     // Copy x and y arrays from host to device
+//     cudaMemcpy(host_op.x, host_x, host_op.n * sizeof(float), cudaMemcpyHostToDevice);
+//     cudaMemcpy(host_op.y, host_y, host_op.n * sizeof(float), cudaMemcpyHostToDevice);
 
-//     std::cout << "numBlocks: " << numBlocks << std::endl;
-//     std::cout << "N: " << op->n << std::endl;
+//     std::cout << "BEFORE COPY DEVICE :" << &host_op.x << std::endl;
+//     std::cout << "BEFORE COPY DEVICE :" << &host_op.y << std::endl;
 
-//     add<<<numBlocks, blockSize>>>(op);
+//     // Copy host operation struct to device
+//     cudaMemcpy(device_op, &host_op, sizeof(Operation), cudaMemcpyHostToDevice);
 
-//     // Wait for GPU to finish before accessing on host
+//     std::cout << "AFTER POINTER DEVICE :" << &device_op << std::endl;
+//     std::cout << "AFTER POINTER HOST :" << &host_op << std::endl;
+//     std::cout << "AFTER COPY DEVICE :" << &device_op->x << std::endl;
+//     std::cout << "AFTER COPY DEVICE :" << &device_op->y << std::endl;
+
+//     // Launch kernel
+//     int blockSize = 256;
+//     int numBlocks = (host_op.n + blockSize - 1) / blockSize;
+//     add<<<numBlocks, blockSize>>>(device_op);
+
+//     // Wait for GPU to finish before accessing results
 //     cudaDeviceSynchronize();
 
+//     // Copy results from device to host
+//     cudaMemcpy(host_y, host_op.y, host_op.n * sizeof(float), cudaMemcpyDeviceToHost);
+
 //     // Log results for debugging
 //     std::cout << "Results (y = x + y):" << std::endl;
-//     for (int i = 0; i < op->n; i++) {
-//         std::cout << "y[" << i << "] = " << op->y[i] << " (expected: 3.0)" << std::endl;
+//     for (int i = 0; i < host_op.n; i++) {
+//         std::cout << "y[" << i << "] = " << host_y[i] << " (expected: 3.0)" << std::endl;
 //     }
 
 //     // Check for errors (all values should be 3.0f)
 //     float maxError = 0.0f;
-//     for (int i = 0; i < op->n; i++) {
-//         maxError = fmax(maxError, fabs(op->y[i] - 3.0f));
+//     for (int i = 0; i < host_op.n; i++) {
+//         maxError = fmax(maxError, fabs(host_y[i] - 3.0f));
 //     }
-//     std::cout << "Max error: " << maxError << std::endl;
 
-//     // Free memory
-//     cudaFree(op->x);
-//     cudaFree(op->y);
-//     cudaFree(op);
+//     // Free device memory
+//     cudaFree(host_op.x);
+//     cudaFree(host_op.y);
+//     cudaFree(device_op);
+
+//     // Free host memory
+//     delete[] host_x;
+//     delete[] host_y;
 
 //     return 0;
 // }
+
+
+
+// // ******UNIFIED MEMORY EXAMPLE BELOW*******
+
+
+#include <iostream>
+#include <math.h>
+
+struct Operation {
+    float *x;
+    float *y;
+    int n;
+};
+
+// CUDA kernel to add elements of two arrays
+__global__ void add(Operation *op) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    int stride = blockDim.x * gridDim.x;
+
+    printf("The X is: %x\n", op->x[0]);
+    printf("The Y is: %x\n", op->y[0]);
+    for (int i = index; i < op->n; i += stride)
+    {
+      op->y[i] = op->x[i] + op->y[i];
+      printf("The value is: %f\n", op->y[i]);
+    }
+}
+
+int main(void) {
+    Operation *op;
+
+    // Allocate Unified Memory -- accessible from CPU or GPU
+    cudaMallocManaged(&op, sizeof(Operation));
+    op->n = 100;
+
+    cudaMallocManaged(&op->x, op->n * sizeof(float));
+    cudaMallocManaged(&op->y, op->n * sizeof(float));
+
+    // initialize x and y arrays on the host
+    for (int i = 0; i < op->n; i++) {
+        op->x[i] = 1.0f;
+        op->y[i] = 2.0f;
+    }
+
+    // Launch kernel on n elements on the GPU
+    int blockSize = 256;
+    int numBlocks = (op->n + blockSize - 1) / blockSize;
+
+    std::cout << "numBlocks: " << numBlocks << std::endl;
+    std::cout << "N: " << op->n << std::endl;
+
+    add<<<numBlocks, blockSize>>>(op);
+
+    // Wait for GPU to finish before accessing on host
+    cudaDeviceSynchronize();
+
+    // Log results for debugging
+    std::cout << "Results (y = x + y):" << std::endl;
+    for (int i = 0; i < op->n; i++) {
+        std::cout << "y[" << i << "] = " << op->y[i] << " (expected: 3.0)" << std::endl;
+    }
+
+    // Check for errors (all values should be 3.0f)
+    float maxError = 0.0f;
+    for (int i = 0; i < op->n; i++) {
+        maxError = fmax(maxError, fabs(op->y[i] - 3.0f));
+    }
+    std::cout << "Max error: " << maxError << std::endl;
+
+    // Free memory
+    cudaFree(op->x);
+    cudaFree(op->y);
+    cudaFree(op);
+
+    return 0;
+}
diff --git a/test/unified_linked.cu b/test/unified_linked.cu
new file mode 100644
index 0000000..3bda6f2
--- /dev/null
+++ b/test/unified_linked.cu
@@ -0,0 +1,54 @@
+#include <cstdio>
+#include <cstdlib>
+// error checking macro
+#define cudaCheckErrors(msg) \
+    do { \
+        cudaError_t __err = cudaGetLastError(); \
+        if (__err != cudaSuccess) { \
+            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
+                msg, cudaGetErrorString(__err), \
+                __FILE__, __LINE__); \
+            fprintf(stderr, "*** FAILED - ABORTING\n"); \
+            exit(1); \
+        } \
+    } while (0)
+
+struct list_elem {
+  int key;
+  list_elem *next;
+};
+
+template <typename T>
+void alloc_bytes(T &ptr, size_t num_bytes){
+
+  cudaMallocManaged(&ptr, num_bytes);
+}
+
+__host__ __device__
+void print_element(list_elem *list, int ele_num){
+  list_elem *elem = list;
+  for (int i = 0; i < ele_num; i++)
+    elem = elem->next;
+  printf("key = %d\n", elem->key);
+}
+
+__global__ void gpu_print_element(list_elem *list, int ele_num){
+  print_element(list, ele_num);
+}
+
+const int num_elem = 5;
+const int ele = 3;
+int main(){
+
+  list_elem *list_base, *list;
+  alloc_bytes(list_base, sizeof(list_elem));
+  list = list_base;
+  for (int i = 0; i < num_elem; i++){
+    list->key = i;
+    alloc_bytes(list->next, sizeof(list_elem));
+    list = list->next;}
+  print_element(list_base, ele);
+  gpu_print_element<<<1,1>>>(list_base, ele);
+  cudaDeviceSynchronize();
+  cudaCheckErrors("cuda error!");
+}
\ No newline at end of file