From a6683b43ae2041f0033ab62b8685ada58d3bddf7 Mon Sep 17 00:00:00 2001 From: Brodey Newman Date: Wed, 18 Dec 2024 05:37:34 +0000 Subject: [PATCH] chore: mmap --- client.cpp | 104 ++++++++++++---- codegen/gen_client.cpp | 13 +- codegen/manual_client.cpp | 27 +---- deploy/Dockerfile.unified | 36 ++++++ deploy/start.sh | 6 +- local.sh | 1 + test/unified_2.cu | 244 ++++++++++++++++++++------------------ test/unified_linked.cu | 54 +++++++++ 8 files changed, 312 insertions(+), 173 deletions(-) create mode 100644 deploy/Dockerfile.unified create mode 100644 test/unified_linked.cu diff --git a/client.cpp b/client.cpp index c8a3887..7ebf37f 100644 --- a/client.cpp +++ b/client.cpp @@ -22,6 +22,13 @@ #include +#include +#include +#include +#include +#include +#include + #include "codegen/gen_client.h" typedef struct @@ -46,8 +53,67 @@ int nconns = 0; const char *DEFAULT_PORT = "14833"; +static int init = 0; +static jmp_buf catch_segfault; +static void* faulting_address = nullptr; + +void add_host_mem_to_devptr_mapping(const int index, void *dev_ptr, void *host_ptr) +{ + for (int i = 0; i < conns[index].mem_idx; i++) + { + // index 1 is host pointer + if (conns[index].unified_mem_pointers[i][0] == dev_ptr) + { + conns[index].unified_mem_pointers[i][2] = host_ptr; + } + } +} + +static void segfault(int sig, siginfo_t* info, void* unused) { + faulting_address = info->si_addr; + std::cout << "Caught segfault at address: " << faulting_address << std::endl; + + size_t page_size = sysconf(_SC_PAGESIZE); + void* aligned_address = (void*)((uintptr_t)faulting_address & ~(page_size - 1)); + + // Allocate memory at the faulting address + void* allocated = mmap(aligned_address, page_size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (allocated == MAP_FAILED) { + perror("Failed to allocate memory at faulting address"); + _exit(1); + } + + add_host_mem_to_devptr_mapping(0, faulting_address, allocated); + + std::cout << "Allocated and registered memory at address: " << allocated << std::endl; +} + +static void set_segfault_handlers() { + if (init > 0) { + return; + } + + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_flags = SA_SIGINFO; + sa.sa_sigaction = segfault; + + if (sigaction(SIGSEGV, &sa, NULL) == -1) { + perror("sigaction"); + exit(EXIT_FAILURE); + } + + std::cout << "Segfault handler installed." << std::endl; + + init = 1; +} + int rpc_open() { + set_segfault_handlers(); + + sigsetjmp(catch_segfault, 1); + if (pthread_mutex_lock(&conn_mutex) < 0) return -1; @@ -224,37 +290,22 @@ int rpc_read(const int index, void *data, size_t size) return n; } -void allocate_unified_mem_pointer(const int index, void *dev_ptr, void *ptr, size_t size) +void allocate_unified_mem_pointer(const int index, void *dev_ptr, size_t size) { // Initialize unified_mem_pointers if not already initialized if (conns[index].mem_idx == 0) { - conns[index].unified_mem_pointers = new void **[5]; // Initial capacity of 5 + conns[index].unified_mem_pointers = new void **[10]; // Initial capacity of 5 for (int i = 0; i < 5; ++i) { conns[index].unified_mem_pointers[i] = nullptr; } } - // we need to handle resize here at some point - // // Resize array if current capacity is exceeded - // if (conns[index].mem_idx >= 5) { - // int current_capacity = 5 + conns[index].mem_idx; - // int new_capacity = conns[index].mem_idx + 5; - - // void ***new_arr = new void **[new_capacity]; - // for (int i = 0; i < new_capacity; ++i) { - // new_arr[i] = (i < conns[index].mem_idx) ? conns[index].unified_mem_pointers[i] : nullptr; - // } - - // delete[] conns[index].unified_mem_pointers; - // conns[index].unified_mem_pointers = new_arr; - // } - // allocate new space for pointer mapping conns[index].unified_mem_pointers[conns[index].mem_idx] = new void *[3]; conns[index].unified_mem_pointers[conns[index].mem_idx][0] = dev_ptr; - conns[index].unified_mem_pointers[conns[index].mem_idx][1] = ptr; - conns[index].unified_mem_pointers[conns[index].mem_idx][2] = reinterpret_cast(size); + conns[index].unified_mem_pointers[conns[index].mem_idx][1] = reinterpret_cast(size); + conns[index].unified_mem_pointers[conns[index].mem_idx][2] = nullptr; conns[index].mem_idx++; } @@ -277,9 +328,9 @@ void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind) { for (int i = 0; i < conns[index].mem_idx; i++) { if (kind == cudaMemcpyHostToDevice) { - size_t size = reinterpret_cast(conns[index].unified_mem_pointers[i][2]); + size_t size = reinterpret_cast(conns[index].unified_mem_pointers[i][1]); - cudaError_t res = cudaMemcpy(conns[index].unified_mem_pointers[i][0], conns[index].unified_mem_pointers[i][1], size, cudaMemcpyHostToDevice); + cudaError_t res = cudaMemcpy(conns[index].unified_mem_pointers[i][0], conns[index].unified_mem_pointers[i][0], size, cudaMemcpyHostToDevice); if (res != cudaSuccess) { std::cerr << "cudaMemcpy failed for index " << i @@ -288,9 +339,9 @@ void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind) std::cout << "Successfully copied " << size << " bytes for index " << i << std::endl; } } else { - size_t size = reinterpret_cast(conns[index].unified_mem_pointers[i][2]); + size_t size = reinterpret_cast(conns[index].unified_mem_pointers[i][1]); - cudaError_t res = cudaMemcpy(conns[index].unified_mem_pointers[i][1], conns[index].unified_mem_pointers[i][0], size, cudaMemcpyDeviceToHost); + cudaError_t res = cudaMemcpy(conns[index].unified_mem_pointers[i][0], conns[index].unified_mem_pointers[i][0], size, cudaMemcpyDeviceToHost); if (res != cudaSuccess) { std::cerr << "cudaMemcpy failed for index " << i @@ -305,11 +356,12 @@ void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind) void* maybe_free_unified_mem(const int index, void *ptr) { for (int i = 0; i < conns[index].mem_idx; i++) { - void *dev_ptr = conns[index].unified_mem_pointers[i][1]; - void *target_free_ptr = conns[index].unified_mem_pointers[i][0]; + void *dev_ptr = conns[index].unified_mem_pointers[i][0]; + size_t size = reinterpret_cast(conns[index].unified_mem_pointers[i][1]); if (dev_ptr == ptr) { - return target_free_ptr; + munmap(dev_ptr, size); + return dev_ptr; } } } diff --git a/codegen/gen_client.cpp b/codegen/gen_client.cpp index fef7890..455d079 100644 --- a/codegen/gen_client.cpp +++ b/codegen/gen_client.cpp @@ -22,7 +22,7 @@ extern int rpc_read(const int index, void *data, const std::size_t size); extern int rpc_end_response(const int index, void *return_value); extern int rpc_close(); extern void* maybe_free_unified_mem(const int index, void *ptr); -extern void allocate_unified_mem_pointer(const int index, void *dev_ptr, void *ptr, size_t size); +extern void allocate_unified_mem_pointer(const int index, void *dev_ptr, size_t size); nvmlReturn_t nvmlInit_v2() { @@ -9117,18 +9117,19 @@ cudaError_t cudaOccupancyMaxActiveClusters(int* numClusters, const void* func, c cudaError_t cudaMallocManaged(void** devPtr, size_t size, unsigned int flags) { - void* host_alloc = new void*[size]; - void* d_a; + void* d_mem; - cudaError_t err = cudaMalloc((void **)&d_a, size); + cudaError_t err = cudaMalloc((void**)&d_mem, size); if (err != cudaSuccess) { std::cerr << "cudaMalloc failed: " << cudaGetErrorString(err) << std::endl; return err; } - allocate_unified_mem_pointer(0, d_a, host_alloc, size); + std::cout << "allocated unified device mem " << d_mem << std::endl; - *devPtr = host_alloc; + allocate_unified_mem_pointer(0, d_mem, size); + + *devPtr = d_mem; return cudaSuccess; } diff --git a/codegen/manual_client.cpp b/codegen/manual_client.cpp index c2356d2..2b921ca 100755 --- a/codegen/manual_client.cpp +++ b/codegen/manual_client.cpp @@ -419,28 +419,9 @@ cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void for (int i = 0; i < f->arg_count; ++i) { - // convert each to a void pointer so that we can map it back to... - // our origional host pointers. - void *arg_ptr = *reinterpret_cast(args[i]); - void *maybe_ptr = maybe_get_cached_arg_ptr(0, arg_ptr); - - // Hardconding 24 bytes for now for the unified memory case. Will remove before merge! - if (maybe_ptr != 0) - { - int size = 24; - std::cout << "writing dynamic pointer " << maybe_ptr << std::endl; - if (rpc_write(0, &size, sizeof(int)) < 0 || - rpc_write(0, maybe_ptr, size) < 0) - return cudaErrorDevicesUnavailable; - } - else - { - int size = 24; - std::cout << "writing original pointer" << std::endl; - if (rpc_write(0, &size, sizeof(int)) < 0 || - rpc_write(0, args[i], size) < 0) - return cudaErrorDevicesUnavailable; - } + if (rpc_write(0, &f->arg_sizes[i], sizeof(int)) < 0 || + rpc_write(0, args[i], f->arg_sizes[i]) < 0) + return cudaErrorDevicesUnavailable; } if (rpc_wait_for_response(0) < 0) @@ -763,6 +744,8 @@ extern "C" { void *return_value; + std::cout << "calling __cudaRegisterVar" << std::endl; + // Start the RPC request int request_id = rpc_start_request(0, RPC___cudaRegisterVar); if (request_id < 0) diff --git a/deploy/Dockerfile.unified b/deploy/Dockerfile.unified new file mode 100644 index 0000000..d16804b --- /dev/null +++ b/deploy/Dockerfile.unified @@ -0,0 +1,36 @@ +FROM ubuntu:24.04 + +RUN apt-get update && apt-get install -y \ + build-essential \ + wget \ + curl \ + python3 \ + python3-pip \ + gnupg \ + software-properties-common && \ + add-apt-repository 'deb http://archive.ubuntu.com/ubuntu jammy main universe' && \ + apt-get update && \ + apt-get install -y libtinfo5 && \ + rm -rf /var/lib/apt/lists/* + +RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb && \ + dpkg -i cuda-keyring_1.0-1_all.deb && \ + rm cuda-keyring_1.0-1_all.deb && \ + apt-get update + +RUN apt-get install -y cuda-toolkit-12-2 + +ENV PATH=/usr/local/cuda-12.2/bin:${PATH} +ENV LD_LIBRARY_PATH=/usr/local/cuda-12.2/lib64 + +ENV SCUDA_SERVER=100.118.7.128 +ENV libscuda_path=/usr/local/lib/libscuda.so + +COPY ./libscuda.so /usr/local/lib/libscuda.so +COPY unified.o unified.o + +COPY start.sh /start.sh +RUN chmod +x /start.sh +RUN chmod +x /unified.o + +CMD ["/bin/bash", "/start.sh", "unified"] \ No newline at end of file diff --git a/deploy/start.sh b/deploy/start.sh index aa66e39..091b4cf 100644 --- a/deploy/start.sh +++ b/deploy/start.sh @@ -10,6 +10,10 @@ elif [[ "$1" == "cublas" ]]; then echo "Running cublas example..." LD_PRELOAD="$libscuda_path" /matrixMulCUBLAS +elif [[ "$1" == "unified" ]]; then + echo "Running cublas example..." + + LD_PRELOAD="$libscuda_path" /unified.o else - echo "Unknown option: $1. Please specify 'torch' or 'cublas'." + echo "Unknown option: $1. Please specify one of: torch | cublas | unified ." fi \ No newline at end of file diff --git a/local.sh b/local.sh index 097a239..aeed4bf 100755 --- a/local.sh +++ b/local.sh @@ -29,6 +29,7 @@ build() { nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/cublas_batched.cu -o cublas_batched.o nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified.cu -o unified.o nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified_2.cu -o unified_2.o + nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified_linked.cu -o unified_linked.o if [ ! -f "$libscuda_path" ]; then echo "libscuda.so not found. build may have failed." diff --git a/test/unified_2.cu b/test/unified_2.cu index 41a58c6..9642075 100644 --- a/test/unified_2.cu +++ b/test/unified_2.cu @@ -1,95 +1,3 @@ -#include -#include - -struct Operation { - float *x; - float *y; - int n; -}; - -// CUDA kernel to add elements of two arrays -__global__ void add(Operation *op) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - - printf("The X is: %x\n", op->x[0]); - printf("The Y is: %x\n", op->y[0]); - for (int i = index; i < op->n; i += stride) - { - op->y[i] = op->x[i] + op->y[i]; - printf("The value is: %f\n", op->y[i]); - } -} - -int main(void) { - Operation host_op; // Host structure - Operation *device_op; // Device structure - - // Initialize array size - host_op.n = 100; - - // Allocate memory for device operation struct - cudaMalloc(&device_op, sizeof(Operation)); - - // Allocate memory for x and y arrays on the device - cudaMalloc(&host_op.x, host_op.n * sizeof(float)); - cudaMalloc(&host_op.y, host_op.n * sizeof(float)); - - // Initialize x and y arrays on the host - float *host_x = new float[host_op.n]; - float *host_y = new float[host_op.n]; - for (int i = 0; i < host_op.n; i++) { - host_x[i] = 1.0f; - host_y[i] = 2.0f; - } - - // Copy x and y arrays from host to device - cudaMemcpy(host_op.x, host_x, host_op.n * sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(host_op.y, host_y, host_op.n * sizeof(float), cudaMemcpyHostToDevice); - - // Copy host operation struct to device - cudaMemcpy(device_op, &host_op, sizeof(Operation), cudaMemcpyHostToDevice); - - // Launch kernel - int blockSize = 256; - int numBlocks = (host_op.n + blockSize - 1) / blockSize; - add<<>>(device_op); - - // Wait for GPU to finish before accessing results - cudaDeviceSynchronize(); - - // Copy results from device to host - cudaMemcpy(host_y, host_op.y, host_op.n * sizeof(float), cudaMemcpyDeviceToHost); - - // Log results for debugging - std::cout << "Results (y = x + y):" << std::endl; - for (int i = 0; i < host_op.n; i++) { - std::cout << "y[" << i << "] = " << host_y[i] << " (expected: 3.0)" << std::endl; - } - - // Check for errors (all values should be 3.0f) - float maxError = 0.0f; - for (int i = 0; i < host_op.n; i++) { - maxError = fmax(maxError, fabs(host_y[i] - 3.0f)); - } - - // Free device memory - cudaFree(host_op.x); - cudaFree(host_op.y); - cudaFree(device_op); - - // Free host memory - delete[] host_x; - delete[] host_y; - - return 0; -} - - - -// // ******UNIFIED MEMORY EXAMPLE BELOW******* - - // #include // #include @@ -114,50 +22,150 @@ int main(void) { // } // int main(void) { -// Operation *op; +// Operation host_op; // Host structure +// Operation *device_op; // Device structure + +// // Initialize array size +// host_op.n = 100; -// // Allocate Unified Memory -- accessible from CPU or GPU -// cudaMallocManaged(&op, sizeof(Operation)); -// op->n = 100; +// // Allocate memory for device operation struct +// cudaMalloc(&device_op, sizeof(Operation)); -// cudaMallocManaged(&op->x, op->n * sizeof(float)); -// cudaMallocManaged(&op->y, op->n * sizeof(float)); +// // Allocate memory for x and y arrays on the device +// cudaMalloc(&host_op.x, host_op.n * sizeof(float)); +// cudaMalloc(&host_op.y, host_op.n * sizeof(float)); -// // initialize x and y arrays on the host -// for (int i = 0; i < op->n; i++) { -// op->x[i] = 1.0f; -// op->y[i] = 2.0f; +// // Initialize x and y arrays on the host +// float *host_x = new float[host_op.n]; +// float *host_y = new float[host_op.n]; +// for (int i = 0; i < host_op.n; i++) { +// host_x[i] = 1.0f; +// host_y[i] = 2.0f; // } -// // Launch kernel on n elements on the GPU -// int blockSize = 256; -// int numBlocks = (op->n + blockSize - 1) / blockSize; +// // Copy x and y arrays from host to device +// cudaMemcpy(host_op.x, host_x, host_op.n * sizeof(float), cudaMemcpyHostToDevice); +// cudaMemcpy(host_op.y, host_y, host_op.n * sizeof(float), cudaMemcpyHostToDevice); -// std::cout << "numBlocks: " << numBlocks << std::endl; -// std::cout << "N: " << op->n << std::endl; +// std::cout << "BEFORE COPY DEVICE :" << &host_op.x << std::endl; +// std::cout << "BEFORE COPY DEVICE :" << &host_op.y << std::endl; -// add<<>>(op); +// // Copy host operation struct to device +// cudaMemcpy(device_op, &host_op, sizeof(Operation), cudaMemcpyHostToDevice); -// // Wait for GPU to finish before accessing on host +// std::cout << "AFTER POINTER DEVICE :" << &device_op << std::endl; +// std::cout << "AFTER POINTER HOST :" << &host_op << std::endl; +// std::cout << "AFTER COPY DEVICE :" << &device_op->x << std::endl; +// std::cout << "AFTER COPY DEVICE :" << &device_op->y << std::endl; + +// // Launch kernel +// int blockSize = 256; +// int numBlocks = (host_op.n + blockSize - 1) / blockSize; +// add<<>>(device_op); + +// // Wait for GPU to finish before accessing results // cudaDeviceSynchronize(); +// // Copy results from device to host +// cudaMemcpy(host_y, host_op.y, host_op.n * sizeof(float), cudaMemcpyDeviceToHost); + // // Log results for debugging // std::cout << "Results (y = x + y):" << std::endl; -// for (int i = 0; i < op->n; i++) { -// std::cout << "y[" << i << "] = " << op->y[i] << " (expected: 3.0)" << std::endl; +// for (int i = 0; i < host_op.n; i++) { +// std::cout << "y[" << i << "] = " << host_y[i] << " (expected: 3.0)" << std::endl; // } // // Check for errors (all values should be 3.0f) // float maxError = 0.0f; -// for (int i = 0; i < op->n; i++) { -// maxError = fmax(maxError, fabs(op->y[i] - 3.0f)); +// for (int i = 0; i < host_op.n; i++) { +// maxError = fmax(maxError, fabs(host_y[i] - 3.0f)); // } -// std::cout << "Max error: " << maxError << std::endl; -// // Free memory -// cudaFree(op->x); -// cudaFree(op->y); -// cudaFree(op); +// // Free device memory +// cudaFree(host_op.x); +// cudaFree(host_op.y); +// cudaFree(device_op); + +// // Free host memory +// delete[] host_x; +// delete[] host_y; // return 0; // } + + + +// // ******UNIFIED MEMORY EXAMPLE BELOW******* + + +#include +#include + +struct Operation { + float *x; + float *y; + int n; +}; + +// CUDA kernel to add elements of two arrays +__global__ void add(Operation *op) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + + printf("The X is: %x\n", op->x[0]); + printf("The Y is: %x\n", op->y[0]); + for (int i = index; i < op->n; i += stride) + { + op->y[i] = op->x[i] + op->y[i]; + printf("The value is: %f\n", op->y[i]); + } +} + +int main(void) { + Operation *op; + + // Allocate Unified Memory -- accessible from CPU or GPU + cudaMallocManaged(&op, sizeof(Operation)); + op->n = 100; + + cudaMallocManaged(&op->x, op->n * sizeof(float)); + cudaMallocManaged(&op->y, op->n * sizeof(float)); + + // initialize x and y arrays on the host + for (int i = 0; i < op->n; i++) { + op->x[i] = 1.0f; + op->y[i] = 2.0f; + } + + // Launch kernel on n elements on the GPU + int blockSize = 256; + int numBlocks = (op->n + blockSize - 1) / blockSize; + + std::cout << "numBlocks: " << numBlocks << std::endl; + std::cout << "N: " << op->n << std::endl; + + add<<>>(op); + + // Wait for GPU to finish before accessing on host + cudaDeviceSynchronize(); + + // Log results for debugging + std::cout << "Results (y = x + y):" << std::endl; + for (int i = 0; i < op->n; i++) { + std::cout << "y[" << i << "] = " << op->y[i] << " (expected: 3.0)" << std::endl; + } + + // Check for errors (all values should be 3.0f) + float maxError = 0.0f; + for (int i = 0; i < op->n; i++) { + maxError = fmax(maxError, fabs(op->y[i] - 3.0f)); + } + std::cout << "Max error: " << maxError << std::endl; + + // Free memory + cudaFree(op->x); + cudaFree(op->y); + cudaFree(op); + + return 0; +} diff --git a/test/unified_linked.cu b/test/unified_linked.cu new file mode 100644 index 0000000..3bda6f2 --- /dev/null +++ b/test/unified_linked.cu @@ -0,0 +1,54 @@ +#include +#include +// error checking macro +#define cudaCheckErrors(msg) \ + do { \ + cudaError_t __err = cudaGetLastError(); \ + if (__err != cudaSuccess) { \ + fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \ + msg, cudaGetErrorString(__err), \ + __FILE__, __LINE__); \ + fprintf(stderr, "*** FAILED - ABORTING\n"); \ + exit(1); \ + } \ + } while (0) + +struct list_elem { + int key; + list_elem *next; +}; + +template +void alloc_bytes(T &ptr, size_t num_bytes){ + + cudaMallocManaged(&ptr, num_bytes); +} + +__host__ __device__ +void print_element(list_elem *list, int ele_num){ + list_elem *elem = list; + for (int i = 0; i < ele_num; i++) + elem = elem->next; + printf("key = %d\n", elem->key); +} + +__global__ void gpu_print_element(list_elem *list, int ele_num){ + print_element(list, ele_num); +} + +const int num_elem = 5; +const int ele = 3; +int main(){ + + list_elem *list_base, *list; + alloc_bytes(list_base, sizeof(list_elem)); + list = list_base; + for (int i = 0; i < num_elem; i++){ + list->key = i; + alloc_bytes(list->next, sizeof(list_elem)); + list = list->next;} + print_element(list_base, ele); + gpu_print_element<<<1,1>>>(list_base, ele); + cudaDeviceSynchronize(); + cudaCheckErrors("cuda error!"); +} \ No newline at end of file