diff --git a/client.cpp b/client.cpp index 7ebf37f..235d574 100644 --- a/client.cpp +++ b/client.cpp @@ -43,8 +43,7 @@ typedef struct struct iovec write_iov[128]; int write_iov_count = 0; - void ***unified_mem_pointers; - int mem_idx = 0; + std::unordered_map<void*, size_t> unified_devices; } conn_t; pthread_mutex_t conn_mutex; @@ -57,35 +56,44 @@ static int init = 0; static jmp_buf catch_segfault; static void* faulting_address = nullptr; -void add_host_mem_to_devptr_mapping(const int index, void *dev_ptr, void *host_ptr) -{ - for (int i = 0; i < conns[index].mem_idx; i++) - { - // index 1 is host pointer - if (conns[index].unified_mem_pointers[i][0] == dev_ptr) - { - conns[index].unified_mem_pointers[i][2] = host_ptr; - } - } -} - static void segfault(int sig, siginfo_t* info, void* unused) { faulting_address = info->si_addr; + std::cout << "Caught segfault at address: " << faulting_address << std::endl; - size_t page_size = sysconf(_SC_PAGESIZE); - void* aligned_address = (void*)((uintptr_t)faulting_address & ~(page_size - 1)); + for (const auto & [ ptr, sz ] : conns[0].unified_devices) + { + if (faulting_address >= (ptr) && faulting_address <= (ptr + sz)) + { + // ensure we assign memory as close to the faulting address as possible... + // by masking via the allocated unified memory size. 
+ void* aligned_address = (void*)((uintptr_t)faulting_address & ~(sz - 1)); + + // Allocate memory at the faulting address + void* allocated = mmap(aligned_address, sz, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (allocated == MAP_FAILED) { + perror("Failed to allocate memory at faulting address"); + _exit(1); + } + + std::cout << "allocated dynamic memory at address: " << allocated << std::endl; - // Allocate memory at the faulting address - void* allocated = mmap(aligned_address, page_size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_SHARED | MAP_ANONYMOUS, -1, 0); - if (allocated == MAP_FAILED) { - perror("Failed to allocate memory at faulting address"); - _exit(1); + return; + } } - add_host_mem_to_devptr_mapping(0, faulting_address, allocated); + // raise our original segfault handler + struct sigaction sa; + sa.sa_handler = SIG_DFL; + sigemptyset(&sa.sa_mask); + sa.sa_flags = 0; + + if (sigaction(SIGSEGV, &sa, nullptr) == -1) { + perror("Failed to reset SIGSEGV handler"); + _exit(EXIT_FAILURE); + } - std::cout << "Allocated and registered memory at address: " << allocated << std::endl; + raise(SIGSEGV); } static void set_segfault_handlers() { @@ -292,62 +300,33 @@ int rpc_read(const int index, void *data, size_t size) void allocate_unified_mem_pointer(const int index, void *dev_ptr, size_t size) { - // Initialize unified_mem_pointers if not already initialized - if (conns[index].mem_idx == 0) { - conns[index].unified_mem_pointers = new void **[10]; // Initial capacity of 5 - for (int i = 0; i < 5; ++i) { - conns[index].unified_mem_pointers[i] = nullptr; - } - } - // allocate new space for pointer mapping - conns[index].unified_mem_pointers[conns[index].mem_idx] = new void *[3]; - - conns[index].unified_mem_pointers[conns[index].mem_idx][0] = dev_ptr; - conns[index].unified_mem_pointers[conns[index].mem_idx][1] = reinterpret_cast(size); - conns[index].unified_mem_pointers[conns[index].mem_idx][2] = nullptr; - - 
conns[index].mem_idx++; -} - -void* maybe_get_cached_arg_ptr(const int index, void* arg_ptr) -{ - for (int i = 0; i < conns[index].mem_idx; i++) - { - // index 1 is host pointer - if (conns[index].unified_mem_pointers[i][1] == arg_ptr) - { - return &conns[index].unified_mem_pointers[i][0]; - } - } - - return nullptr; + conns[index].unified_devices.insert({ dev_ptr, size }); } void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind) { - for (int i = 0; i < conns[index].mem_idx; i++) { + for (const auto & [ ptr, sz ] : conns[index].unified_devices) { if (kind == cudaMemcpyHostToDevice) { - size_t size = reinterpret_cast(conns[index].unified_mem_pointers[i][1]); - - cudaError_t res = cudaMemcpy(conns[index].unified_mem_pointers[i][0], conns[index].unified_mem_pointers[i][0], size, cudaMemcpyHostToDevice); + size_t size = reinterpret_cast(sz); + // ptr is the same on both host/device + cudaError_t res = cudaMemcpy(ptr, ptr, size, cudaMemcpyHostToDevice); if (res != cudaSuccess) { - std::cerr << "cudaMemcpy failed for index " << i - << ": " << cudaGetErrorString(res) << std::endl; + std::cerr << "cudaMemcpy failed :" << cudaGetErrorString(res) << std::endl; } else { - std::cout << "Successfully copied " << size << " bytes for index " << i << std::endl; + std::cout << "Successfully copied " << size << " bytes" << std::endl; } - } else { - size_t size = reinterpret_cast(conns[index].unified_mem_pointers[i][1]); + } else { + size_t size = reinterpret_cast(sz); - cudaError_t res = cudaMemcpy(conns[index].unified_mem_pointers[i][0], conns[index].unified_mem_pointers[i][0], size, cudaMemcpyDeviceToHost); + // ptr is the same on both host/device + cudaError_t res = cudaMemcpy(ptr, ptr, size, cudaMemcpyDeviceToHost); if (res != cudaSuccess) { - std::cerr << "cudaMemcpy failed for index " << i - << ": " << cudaGetErrorString(res) << std::endl; + std::cerr << "cudaMemcpy failed :" << cudaGetErrorString(res) << std::endl; } else { - std::cout << "Successfully copied " 
<< size << " bytes for index " << i << std::endl; + std::cout << "Successfully copied " << size << " bytes" << std::endl; } } } @@ -355,11 +334,12 @@ void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind) void* maybe_free_unified_mem(const int index, void *ptr) { - for (int i = 0; i < conns[index].mem_idx; i++) { - void *dev_ptr = conns[index].unified_mem_pointers[i][0]; - size_t size = reinterpret_cast<size_t>(conns[index].unified_mem_pointers[i][1]); + for (const auto & [ dev_ptr, sz ] : conns[index].unified_devices) { + size_t size = reinterpret_cast<size_t>(sz); if (dev_ptr == ptr) { + std::cout << "mem-unmapping device ptr: " << dev_ptr << " size " << size << std::endl; + munmap(dev_ptr, size); return dev_ptr; } diff --git a/codegen/annotations.h b/codegen/annotations.h index ddaf798..42fafff 100644 --- a/codegen/annotations.h +++ b/codegen/annotations.h @@ -4357,6 +4357,7 @@ cudaError_t cudaOccupancyMaxPotentialClusterSize(int *clusterSize, const void *f */ cudaError_t cudaOccupancyMaxActiveClusters(int *numClusters, const void *func, const cudaLaunchConfig_t *launchConfig); /** + * @disabled * @param devPtr SEND_RECV * @param size SEND_ONLY * @param flags SEND_ONLY @@ -4388,6 +4389,7 @@ cudaError_t cudaMallocPitch(void **devPtr, size_t *pitch, size_t width, size_t h */ cudaError_t cudaMallocArray(cudaArray_t *array, const struct cudaChannelFormatDesc *desc, size_t width, size_t height, unsigned int flags); /** + * @disabled * @param devPtr SEND_ONLY */ cudaError_t cudaFree(void *devPtr); diff --git a/codegen/codegen.py b/codegen/codegen.py index 65bc104..14d7b84 100644 --- a/codegen/codegen.py +++ b/codegen/codegen.py @@ -69,9 +69,11 @@ # a list of manually implemented cuda/nvml functions. # these are automatically appended to each file; operation order is maintained as well. 
MANUAL_IMPLEMENTATIONS = [ + "cudaFree", "cudaMemcpy", "cudaMemcpyAsync", "cudaLaunchKernel", + "cudaMallocManaged", ] @dataclass diff --git a/codegen/gen_client.cpp b/codegen/gen_client.cpp index 455d079..03e538b 100644 --- a/codegen/gen_client.cpp +++ b/codegen/gen_client.cpp @@ -6,7 +6,6 @@ #include #include -#include #include #include "gen_api.h" @@ -21,8 +20,6 @@ extern int rpc_wait_for_response(const int index); extern int rpc_read(const int index, void *data, const std::size_t size); extern int rpc_end_response(const int index, void *return_value); extern int rpc_close(); -extern void* maybe_free_unified_mem(const int index, void *ptr); -extern void allocate_unified_mem_pointer(const int index, void *dev_ptr, size_t size); nvmlReturn_t nvmlInit_v2() { @@ -9115,25 +9112,6 @@ cudaError_t cudaOccupancyMaxActiveClusters(int* numClusters, const void* func, c return return_value; } -cudaError_t cudaMallocManaged(void** devPtr, size_t size, unsigned int flags) -{ - void* d_mem; - - cudaError_t err = cudaMalloc((void**)&d_mem, size); - if (err != cudaSuccess) { - std::cerr << "cudaMalloc failed: " << cudaGetErrorString(err) << std::endl; - return err; - } - - std::cout << "allocated unified device mem " << d_mem << std::endl; - - allocate_unified_mem_pointer(0, d_mem, size); - - *devPtr = d_mem; - - return cudaSuccess; -} - cudaError_t cudaMalloc(void** devPtr, size_t size) { cudaError_t return_value; @@ -9191,31 +9169,6 @@ cudaError_t cudaMallocArray(cudaArray_t* array, const struct cudaChannelFormatDe return return_value; } -cudaError_t cudaFree(void* devPtr) -{ - cudaError_t return_value; - void *maybe_ptr = maybe_free_unified_mem(0, devPtr); - - if (maybe_ptr != nullptr) { - std::cout << "POITNER FOUND!! 
" << maybe_ptr << std::endl; - - if (rpc_start_request(0, RPC_cudaFree) < 0 || - rpc_write(0, &maybe_ptr, sizeof(void*)) < 0 || - rpc_wait_for_response(0) < 0 || - rpc_end_response(0, &return_value) < 0) - return cudaErrorDevicesUnavailable; - } else { - std::cout << "no poitner found..." << std::endl; - if (rpc_start_request(0, RPC_cudaFree) < 0 || - rpc_write(0, &devPtr, sizeof(void*)) < 0 || - rpc_wait_for_response(0) < 0 || - rpc_end_response(0, &return_value) < 0) - return cudaErrorDevicesUnavailable; - } - - return return_value; -} - cudaError_t cudaFreeHost(void* ptr) { cudaError_t return_value; @@ -22176,12 +22129,10 @@ std::unordered_map functionMap = { {"cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", (void *)cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags}, {"cudaOccupancyMaxPotentialClusterSize", (void *)cudaOccupancyMaxPotentialClusterSize}, {"cudaOccupancyMaxActiveClusters", (void *)cudaOccupancyMaxActiveClusters}, - {"cudaMallocManaged", (void *)cudaMallocManaged}, {"cudaMalloc", (void *)cudaMalloc}, {"cudaMallocHost", (void *)cudaMallocHost}, {"cudaMallocPitch", (void *)cudaMallocPitch}, {"cudaMallocArray", (void *)cudaMallocArray}, - {"cudaFree", (void *)cudaFree}, {"cudaFreeHost", (void *)cudaFreeHost}, {"cudaFreeArray", (void *)cudaFreeArray}, {"cudaFreeMipmappedArray", (void *)cudaFreeMipmappedArray}, @@ -22874,9 +22825,11 @@ std::unordered_map functionMap = { {"cuMemFreeAsync_ptsz", (void *)cuMemFreeAsync}, {"cuMemAllocAsync_ptsz", (void *)cuMemAllocAsync}, {"cuMemAllocFromPoolAsync_ptsz", (void *)cuMemAllocFromPoolAsync}, + {"cudaFree", (void *)cudaFree}, {"cudaMemcpy", (void *)cudaMemcpy}, {"cudaMemcpyAsync", (void *)cudaMemcpyAsync}, {"cudaLaunchKernel", (void *)cudaLaunchKernel}, + {"cudaMallocManaged", (void *)cudaMallocManaged}, }; void *get_function_pointer(const char *name) diff --git a/codegen/gen_server.cpp b/codegen/gen_server.cpp index f5ddb9e..9a6cb8b 100644 --- a/codegen/gen_server.cpp +++ b/codegen/gen_server.cpp 
@@ -19387,36 +19387,6 @@ int handle_cudaOccupancyMaxActiveClusters(void *conn) return -1; } -int handle_cudaMallocManaged(void *conn) -{ - void* devPtr; - size_t size; - unsigned int flags; - int request_id; - cudaError_t scuda_intercept_result; - std::cout << "calling cudaMallocManaged" << std::endl; - if ( - rpc_read(conn, &devPtr, sizeof(void*)) < 0 || - rpc_read(conn, &size, sizeof(size_t)) < 0 || - rpc_read(conn, &flags, sizeof(unsigned int)) < 0 || - false) - goto ERROR_0; - - request_id = rpc_end_request(conn); - if (request_id < 0) - goto ERROR_0; - scuda_intercept_result = cudaMallocManaged(&devPtr, size, flags); - - if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &devPtr, sizeof(void*)) < 0 || - rpc_end_response(conn, &scuda_intercept_result) < 0) - goto ERROR_0; - - return 0; -ERROR_0: - return -1; -} - int handle_cudaMalloc(void *conn) { void* devPtr; @@ -19433,8 +19403,6 @@ int handle_cudaMalloc(void *conn) goto ERROR_0; scuda_intercept_result = cudaMalloc(&devPtr, size); - std::cout << "ADDRESS : " << &devPtr << std::endl; - if (rpc_start_response(conn, request_id) < 0 || rpc_write(conn, &devPtr, sizeof(void*)) < 0 || rpc_end_response(conn, &scuda_intercept_result) < 0) @@ -19537,31 +19505,6 @@ int handle_cudaMallocArray(void *conn) return -1; } -int handle_cudaFree(void *conn) -{ - void* devPtr; - int request_id; - cudaError_t scuda_intercept_result; - if ( - rpc_read(conn, &devPtr, sizeof(void*)) < 0 || - false) - goto ERROR_0; - - std::cout << "freeing... 
" << devPtr << std::endl; - request_id = rpc_end_request(conn); - if (request_id < 0) - goto ERROR_0; - scuda_intercept_result = cudaFree(devPtr); - - if (rpc_start_response(conn, request_id) < 0 || - rpc_end_response(conn, &scuda_intercept_result) < 0) - goto ERROR_0; - - return 0; -ERROR_0: - return -1; -} - int handle_cudaFreeHost(void *conn) { void* ptr; diff --git a/codegen/manual_client.cpp b/codegen/manual_client.cpp index 2b921ca..13ccba6 100755 --- a/codegen/manual_client.cpp +++ b/codegen/manual_client.cpp @@ -24,8 +24,9 @@ extern int rpc_wait_for_response(const int index); extern int rpc_read(const int index, void *data, const std::size_t size); extern int rpc_end_response(const int index, void *return_value); extern int rpc_close(); -void* maybe_get_cached_arg_ptr(const int index, void* arg_ptr); void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind); +extern void* maybe_free_unified_mem(const int index, void *ptr); +extern void allocate_unified_mem_pointer(const int index, void *dev_ptr, size_t size); #define MAX_FUNCTION_NAME 1024 #define MAX_ARGS 128 @@ -833,3 +834,44 @@ extern "C" } } } + +cudaError_t cudaFree(void* devPtr) +{ + cudaError_t return_value; + void *maybe_ptr = maybe_free_unified_mem(0, devPtr); + + if (maybe_ptr != nullptr) { + if (rpc_start_request(0, RPC_cudaFree) < 0 || + rpc_write(0, &maybe_ptr, sizeof(void*)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return cudaErrorDevicesUnavailable; + } else { + if (rpc_start_request(0, RPC_cudaFree) < 0 || + rpc_write(0, &devPtr, sizeof(void*)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return cudaErrorDevicesUnavailable; + } + + return return_value; +} + +cudaError_t cudaMallocManaged(void** devPtr, size_t size, unsigned int flags) +{ + void* d_mem; + + cudaError_t err = cudaMalloc((void**)&d_mem, size); + if (err != cudaSuccess) { + std::cerr << "cudaMalloc failed: " << cudaGetErrorString(err) 
<< std::endl; + return err; + } + + std::cout << "allocated unified device mem " << d_mem << " size: " << size << std::endl; + + allocate_unified_mem_pointer(0, d_mem, size); + + *devPtr = d_mem; + + return cudaSuccess; +} diff --git a/codegen/manual_client.h b/codegen/manual_client.h index 1666ce3..a10bfcb 100644 --- a/codegen/manual_client.h +++ b/codegen/manual_client.h @@ -3,6 +3,8 @@ #include #include +cudaError_t cudaFree(void* devPtr); +cudaError_t cudaMallocManaged(void** devPtr, size_t size, unsigned int flags); cudaError_t cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind); cudaError_t cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream); cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream); diff --git a/local.sh b/local.sh index aeed4bf..ad68a88 100755 --- a/local.sh +++ b/local.sh @@ -28,7 +28,7 @@ build() { nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn ./test/cudnn.cu -o cudnn.o nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/cublas_batched.cu -o cublas_batched.o nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified.cu -o unified.o - nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified_2.cu -o unified_2.o + nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified_pointer.cu -o unified_pointer.o nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified_linked.cu -o unified_linked.o if [ ! 
-f "$libscuda_path" ]; then diff --git a/test/unified_pointer.cu b/test/unified_pointer.cu index cf672aa..1e39104 100644 --- a/test/unified_pointer.cu +++ b/test/unified_pointer.cu @@ -17,7 +17,6 @@ __global__ void add(Operation *op) { for (int i = index; i < op->n; i += stride) { op->y[i] = op->x[i] + op->y[i]; - printf("The value is: %f\n", op->y[i]); } } @@ -42,7 +41,7 @@ int main(void) { int numBlocks = (op->n + blockSize - 1) / blockSize; std::cout << "numBlocks: " << numBlocks << std::endl; - std::cout << "N: " << op->n << std::endl; + std::cout << "X: " << &op->x << std::endl; add<<<numBlocks, blockSize>>>(op);