diff --git a/client.cpp b/client.cpp
index 3d5bff2..f63dbd2 100644
--- a/client.cpp
+++ b/client.cpp
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include <iostream>
 #include
@@ -34,6 +35,9 @@ typedef struct
     pthread_cond_t read_cond;
     struct iovec write_iov[128];
     int write_iov_count = 0;
+
+    void ***unified_mem_pointers;
+    int mem_idx = 0;
 } conn_t;
 
 pthread_mutex_t conn_mutex;
@@ -220,6 +224,87 @@ int rpc_read(const int index, void *data, size_t size)
     return n;
 }
 
+void allocate_unified_mem_pointer(const int index, void *dev_ptr, void *ptr, size_t size)
+{
+    std::cout << "Allocating device ptr: " << dev_ptr
+              << " and host ptr: " << ptr
+              << " with size: " << size << std::endl;
+
+    if (conns[index].mem_idx == 0) {
+        conns[index].unified_mem_pointers = new void **[5];
+    }
+
+    if (conns[index].mem_idx >= 5) {
+        int new_capacity = conns[index].mem_idx + 1;
+
+        void ***new_arr = new void **[new_capacity];
+
+        for (int i = 0; i < conns[index].mem_idx; ++i) {
+            new_arr[i] = conns[index].unified_mem_pointers[i];
+        }
+
+        for (int i = conns[index].mem_idx; i < new_capacity; ++i) {
+            new_arr[i] = new void *[3];
+        }
+
+        delete[] conns[index].unified_mem_pointers;
+        conns[index].unified_mem_pointers = new_arr;
+    } else {
+        conns[index].unified_mem_pointers[conns[index].mem_idx] = new void *[3];
+    }
+
+    conns[index].unified_mem_pointers[conns[index].mem_idx][0] = dev_ptr;
+    conns[index].unified_mem_pointers[conns[index].mem_idx][1] = ptr;
+    conns[index].unified_mem_pointers[conns[index].mem_idx][2] = reinterpret_cast<void *>(size);
+
+    conns[index].mem_idx++;
+}
+
+void cuda_memcpy_unified_ptrs(const int index)
+{
+    std::cout << "copying memory..." << std::endl;
+
+    for (int i = 0; i < conns[index].mem_idx; i++) {
+        void *dev_ptr = conns[index].unified_mem_pointers[i][0];
+        void *host_ptr = conns[index].unified_mem_pointers[i][1];
+        size_t size = reinterpret_cast<size_t>(conns[index].unified_mem_pointers[i][2]);
+
+        std::cout << "Index " << i << " Parameters:\n"
+                  << "  Device Pointer (dev_ptr): " << dev_ptr << "\n"
+                  << "  Host Pointer (host_ptr): " << host_ptr << "\n"
+                  << "  Size (bytes): " << size << "\n";
+
+        cudaError_t res = cudaMemcpy(dev_ptr, host_ptr, size, cudaMemcpyHostToDevice);
+
+        if (res != cudaSuccess) {
+            std::cerr << "cudaMemcpy failed for index " << i
+                      << ": " << cudaGetErrorString(res) << std::endl;
+        } else {
+            std::cout << "Successfully copied " << size << " bytes for index " << i << std::endl;
+        }
+    }
+}
+
+void* maybe_free_unified_mem(const int index, void *ptr)
+{
+    for (int i = 0; i < conns[index].mem_idx; i++) {
+        void *host_ptr = conns[index].unified_mem_pointers[i][1];
+        void *dev_ptr = conns[index].unified_mem_pointers[i][0];
+
+        std::cout << "comparing pointers: " << host_ptr << " ptr " << ptr << std::endl;
+
+        if (host_ptr == ptr) {
+            std::cout << "freeing pointer " << dev_ptr << std::endl;
+
+            // host addresses match; hand back the device twin so it can be freed remotely
+            return dev_ptr;
+        }
+    }
+
+    return nullptr;
+}
+
 int rpc_end_response(const int index, void *result)
 {
     if (read(conns[index].connfd, result, sizeof(int)) < 0 ||
diff --git a/codegen/gen_client.cpp b/codegen/gen_client.cpp
index b608b62..4b1bf7a 100644
--- a/codegen/gen_client.cpp
+++ b/codegen/gen_client.cpp
@@ -6,6 +6,7 @@
 #include
 #include
+#include <iostream>
 #include
 #include
 
 #include "gen_api.h"
@@ -20,6 +21,8 @@
 extern int rpc_wait_for_response(const int index);
 extern int rpc_read(const int index, void *data, const std::size_t size);
 extern int rpc_end_response(const int index, void *return_value);
 extern int rpc_close();
+extern void* maybe_free_unified_mem(const int index, void *ptr);
+extern void allocate_unified_mem_pointer(const int index, void *dev_ptr, void *ptr, size_t size);
 
 nvmlReturn_t nvmlInit_v2()
 {
@@ -9114,16 +9117,19 @@ cudaError_t cudaOccupancyMaxActiveClusters(int* numClusters, const void* func, c
 
 cudaError_t cudaMallocManaged(void** devPtr, size_t size, unsigned int flags)
 {
-    cudaError_t return_value;
-    if (rpc_start_request(0, RPC_cudaMallocManaged) < 0 ||
-        rpc_write(0, devPtr, sizeof(void*)) < 0 ||
-        rpc_write(0, &size, sizeof(size_t)) < 0 ||
-        rpc_write(0, &flags, sizeof(unsigned int)) < 0 ||
-        rpc_wait_for_response(0) < 0 ||
-        rpc_read(0, devPtr, sizeof(void*)) < 0 ||
-        rpc_end_response(0, &return_value) < 0)
-        return cudaErrorDevicesUnavailable;
-    return return_value;
+    std::cout << "CALLING MALLOC WITH DEV PTR: " << devPtr << std::endl;
+
+    void *host_alloc = malloc(size);
+
+    void *d_a;
+    cudaError_t err = cudaMalloc((void **)&d_a, size);
+    if (err != cudaSuccess)
+        return err;
+    std::cout << "AFTER DEVICE PTR: " << d_a << std::endl;
+
+    allocate_unified_mem_pointer(0, d_a, host_alloc, size);
+
+    std::cout << "done allocate_unified_mem_pointer" << std::endl;
+    *devPtr = host_alloc;
+
+    std::cout << "DONE MALLOC" << std::endl;
+    return cudaSuccess;
 }
 
 cudaError_t cudaMalloc(void** devPtr, size_t size)
@@ -9135,6 +9141,8 @@ cudaError_t cudaMalloc(void** devPtr, size_t size)
         rpc_read(0, devPtr, sizeof(void*)) < 0 ||
         rpc_end_response(0, &return_value) < 0)
         return cudaErrorDevicesUnavailable;
+
+    std::cout << "done calling cudaMalloc... " << devPtr << std::endl;
     return return_value;
 }
 
@@ -9186,11 +9194,25 @@ cudaError_t cudaMallocArray(cudaArray_t* array, const struct cudaChannelFormatDe
 
 cudaError_t cudaFree(void* devPtr)
 {
     cudaError_t return_value;
-    if (rpc_start_request(0, RPC_cudaFree) < 0 ||
-        rpc_write(0, &devPtr, sizeof(void*)) < 0 ||
-        rpc_wait_for_response(0) < 0 ||
-        rpc_end_response(0, &return_value) < 0)
-        return cudaErrorDevicesUnavailable;
+    void *maybe_ptr = maybe_free_unified_mem(0, devPtr);
+
+    if (maybe_ptr != nullptr) {
+        std::cout << "POINTER FOUND! " << maybe_ptr << std::endl;
+
+        if (rpc_start_request(0, RPC_cudaFree) < 0 ||
+            rpc_write(0, &maybe_ptr, sizeof(void*)) < 0 ||
+            rpc_wait_for_response(0) < 0 ||
+            rpc_end_response(0, &return_value) < 0)
+            return cudaErrorDevicesUnavailable;
+    } else {
+        std::cout << "no pointer found..." << std::endl;
+
+        if (rpc_start_request(0, RPC_cudaFree) < 0 ||
+            rpc_write(0, &devPtr, sizeof(void*)) < 0 ||
+            rpc_wait_for_response(0) < 0 ||
+            rpc_end_response(0, &return_value) < 0)
+            return cudaErrorDevicesUnavailable;
+    }
+
     return return_value;
 }
diff --git a/codegen/gen_server.cpp b/codegen/gen_server.cpp
index ab04e00..1613728 100644
--- a/codegen/gen_server.cpp
+++ b/codegen/gen_server.cpp
@@ -19394,6 +19394,7 @@ int handle_cudaMallocManaged(void *conn)
     unsigned int flags;
     int request_id;
     cudaError_t scuda_intercept_result;
+    std::cout << "calling cudaMallocManaged" << std::endl;
     if (
         rpc_read(conn, &devPtr, sizeof(void*)) < 0 ||
         rpc_read(conn, &size, sizeof(size_t)) < 0 ||
@@ -19432,6 +19433,8 @@ int handle_cudaMalloc(void *conn)
         goto ERROR_0;
 
     scuda_intercept_result = cudaMalloc(&devPtr, size);
+    std::cout << "ADDRESS: " << &devPtr << std::endl;
+
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_write(conn, &devPtr, sizeof(void*)) < 0 ||
         rpc_end_response(conn, &scuda_intercept_result) < 0)
@@ -19543,7 +19546,8 @@ int handle_cudaFree(void *conn)
         rpc_read(conn, &devPtr, sizeof(void*)) < 0 ||
         false)
         goto ERROR_0;
-
+
+    std::cout << "freeing... " << devPtr << std::endl;
     request_id = rpc_end_request(conn);
     if (request_id < 0)
         goto ERROR_0;
diff --git a/codegen/manual_client.cpp b/codegen/manual_client.cpp
index 2e1bb59..255be3b 100755
--- a/codegen/manual_client.cpp
+++ b/codegen/manual_client.cpp
@@ -24,6 +24,7 @@ extern int rpc_wait_for_response(const int index);
 extern int rpc_read(const int index, void *data, const std::size_t size);
 extern int rpc_end_response(const int index, void *return_value);
 extern int rpc_close();
+extern void cuda_memcpy_unified_ptrs(const int index);
 
 #define MAX_FUNCTION_NAME 1024
 #define MAX_ARGS 128
@@ -372,6 +373,8 @@ cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void
 {
     cudaError_t return_value;
 
+    cuda_memcpy_unified_ptrs(0);
+
     // Start the RPC request
     int request_id = rpc_start_request(0, RPC_cudaLaunchKernel);
     if (request_id < 0)
diff --git a/local.sh b/local.sh
index caa9c70..9ec1903 100755
--- a/local.sh
+++ b/local.sh
@@ -26,8 +26,8 @@ build() {
   nvcc --cudart=shared -lnvidia-ml -lcuda ./test/vector_add.cu -o vector.o
   nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn ./test/cudnn.cu -o cudnn.o
-  nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/cublas_batched.cu -o cublas_batched.o
+  nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified.cu -o unified.o
 
   if [ ! -f "$libscuda_path" ]; then
     echo "libscuda.so not found. build may have failed."
diff --git a/test/unified.cu b/test/unified.cu
new file mode 100644
index 0000000..c4d2b9e
--- /dev/null
+++ b/test/unified.cu
@@ -0,0 +1,59 @@
+#include <iostream>
+#include <cuda_runtime.h>
+
+// CUDA kernel to multiply elements of two arrays
+__global__ void multiplyKernel(int *a, int *b, int *c, int size) {
+    int idx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (idx < size) {
+        c[idx] = a[idx] * b[idx];
+    }
+}
+
+int main() {
+    // Define array size
+    const int size = 10;
+    const int bytes = size * sizeof(int);
+
+    std::cout << "HELLO" << std::endl;
+
+    // Unified memory allocation
+    int *a, *b, *c;
+    cudaMallocManaged(&a, bytes);
+    cudaMallocManaged(&b, bytes);
+    cudaMallocManaged(&c, bytes);
+
+    // Initialize arrays on the CPU
+    for (int i = 0; i < size; ++i) {
+        a[i] = i;
+        b[i] = i * 2;
+    }
+
+    // Define kernel launch parameters
+    const int threadsPerBlock = 256;
+    const int blocks = (size + threadsPerBlock - 1) / threadsPerBlock;
+
+    std::cout << "launching kernel..." << std::endl;
+
+    std::cout << "pointer a: " << a << std::endl;
+    std::cout << "pointer b: " << b << std::endl;
+    std::cout << "pointer c: " << c << std::endl;
+
+    // Launch the kernel
+    multiplyKernel<<<blocks, threadsPerBlock>>>(a, b, c, size);
+
+    // Wait for the GPU to finish
+    cudaDeviceSynchronize();
+
+    // Display results
+    std::cout << "Results:\n";
+    for (int i = 0; i < size; ++i) {
+        std::cout << "a[" << i << "] * b[" << i << "] = " << c[i] << "\n";
+    }
+
+    // Free unified memory
+    cudaFree(a);
+    cudaFree(b);
+    cudaFree(c);
+
+    return 0;
+}