diff --git a/client.cpp b/client.cpp index f63dbd2..11d1d53 100644 --- a/client.cpp +++ b/client.cpp @@ -226,33 +226,31 @@ int rpc_read(const int index, void *data, size_t size) void allocate_unified_mem_pointer(const int index, void *dev_ptr, void *ptr, size_t size) { - std::cout << "Allocating device ptr: " << dev_ptr - << " and host ptr: " << ptr - << " with size: " << size << std::endl; - + // Initialize unified_mem_pointers if not already initialized if (conns[index].mem_idx == 0) { - conns[index].unified_mem_pointers = new void **[5]; - conns[index].unified_mem_pointers[0] = new void *[3]; + conns[index].unified_mem_pointers = new void **[5]; // Initial capacity of 5 + for (int i = 0; i < 5; ++i) { + conns[index].unified_mem_pointers[i] = nullptr; + } } - if (conns[index].mem_idx >= 5) { - int new_capacity = conns[index].mem_idx + 1; + // we need to handle resize here at some point + // // Resize array if current capacity is exceeded + // if (conns[index].mem_idx >= 5) { + // int current_capacity = 5 + conns[index].mem_idx; + // int new_capacity = conns[index].mem_idx + 5; - void ***new_arr = new void **[new_capacity]; + // void ***new_arr = new void **[new_capacity]; + // for (int i = 0; i < new_capacity; ++i) { + // new_arr[i] = (i < conns[index].mem_idx) ? 
conns[index].unified_mem_pointers[i] : nullptr; + // } - for (int i = 0; i < conns[index].mem_idx; ++i) { - new_arr[i] = conns[index].unified_mem_pointers[i]; - } + // delete[] conns[index].unified_mem_pointers; + // conns[index].unified_mem_pointers = new_arr; + // } - for (int i = conns[index].mem_idx; i < new_capacity; ++i) { - new_arr[i] = new void *[3]; - } - - delete[] conns[index].unified_mem_pointers; - conns[index].unified_mem_pointers = new_arr; - } else { - conns[index].unified_mem_pointers[conns[index].mem_idx] = new void *[3]; - } + // allocate new space for pointer mapping + conns[index].unified_mem_pointers[conns[index].mem_idx] = new void *[3]; conns[index].unified_mem_pointers[conns[index].mem_idx][0] = dev_ptr; conns[index].unified_mem_pointers[conns[index].mem_idx][1] = ptr; @@ -261,29 +259,45 @@ void allocate_unified_mem_pointer(const int index, void *dev_ptr, void *ptr, siz conns[index].mem_idx++; } -void cuda_memcpy_unified_ptrs(const int index) +void* maybe_get_cached_arg_ptr(const int index, int arg_index, void* arg) { - std::cout << "copying memory..." 
<< std::endl; + if (arg_index >= conns[index].mem_idx) + return nullptr; + + return &conns[index].unified_mem_pointers[arg_index][0]; +} +void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind) +{ for (int i = 0; i < conns[index].mem_idx; i++) { - void *dev_ptr = conns[index].unified_mem_pointers[i][0]; - void *host_ptr = conns[index].unified_mem_pointers[i][1]; - size_t size = reinterpret_cast<size_t>(conns[index].unified_mem_pointers[i][2]); + if (kind == cudaMemcpyHostToDevice) { + void *dev_ptr = conns[index].unified_mem_pointers[i][0]; + void *host_ptr = conns[index].unified_mem_pointers[i][1]; - std::cout << "Index " << i << " Parameters:\n" - << " Device Pointer (dev_ptr): " << dev_ptr << "\n" - << " Host Pointer (host_ptr): " << host_ptr << "\n" - << " Size (bytes): " << size << "\n"; + size_t size = reinterpret_cast<size_t>(conns[index].unified_mem_pointers[i][2]); - cudaError_t res = cudaMemcpy(dev_ptr, host_ptr, size, cudaMemcpyHostToDevice); + cudaError_t res = cudaMemcpy(dev_ptr, host_ptr, size, cudaMemcpyHostToDevice); + + if (res != cudaSuccess) { + std::cerr << "cudaMemcpy failed for index " << i + << ": " << cudaGetErrorString(res) << std::endl; + } else { + std::cout << "Successfully copied " << size << " bytes for index " << i << std::endl; + } + } else { + void *dev_ptr = conns[index].unified_mem_pointers[i][0]; + void *host_ptr = conns[index].unified_mem_pointers[i][1]; - std::cout << "result: " << res << std::endl; + size_t size = reinterpret_cast<size_t>(conns[index].unified_mem_pointers[i][2]); - if (res != cudaSuccess) { - std::cerr << "cudaMemcpy failed for index " << i - << ": " << cudaGetErrorString(res) << std::endl; - } else { - std::cout << "Successfully copied " << size << " bytes for index " << i << std::endl; + cudaError_t res = cudaMemcpy(host_ptr, dev_ptr, size, cudaMemcpyDeviceToHost); + + if (res != cudaSuccess) { + std::cerr << "cudaMemcpy failed for index " << i + << ": " << cudaGetErrorString(res) << std::endl; + } else { + std::cout << 
"Successfully copied " << size << " bytes for index " << i << std::endl; + } } } } @@ -294,12 +308,9 @@ void* maybe_free_unified_mem(const int index, void *ptr) void *dev_ptr = conns[index].unified_mem_pointers[i][1]; void *target_free_ptr = conns[index].unified_mem_pointers[i][0]; - std::cout << "comparing pointers: " << dev_ptr << " ptr " << ptr << std::endl; - if (dev_ptr == ptr) { - std::cout << "freeing pointer " << target_free_ptr << std::endl; + std::cout << "freeing dynamic pointer " << target_free_ptr << std::endl; - // mem addresses are the same, free return target_free_ptr; } } diff --git a/codegen/gen_client.cpp b/codegen/gen_client.cpp index 4b1bf7a..fef7890 100644 --- a/codegen/gen_client.cpp +++ b/codegen/gen_client.cpp @@ -9117,19 +9117,20 @@ cudaError_t cudaOccupancyMaxActiveClusters(int* numClusters, const void* func, c cudaError_t cudaMallocManaged(void** devPtr, size_t size, unsigned int flags) { - std::cout << "CALLING MALLOC WITH DEV PTR: " << devPtr << std::endl; + void* host_alloc = new void*[size]; + void* d_a; - void* host_alloc = (void*) malloc(size); - void*d_a; - cudaMalloc((void **)&d_a, size); - std::cout << "AFTER DEVICE PTR: " << d_a << std::endl; + cudaError_t err = cudaMalloc((void **)&d_a, size); + if (err != cudaSuccess) { + std::cerr << "cudaMalloc failed: " << cudaGetErrorString(err) << std::endl; + return err; + } allocate_unified_mem_pointer(0, d_a, host_alloc, size); - std::cout << "done allocate_unified_mem_pointer" << std::endl; *devPtr = host_alloc; - std::cout << "DONE MALLOC" << std::endl; + return cudaSuccess; } cudaError_t cudaMalloc(void** devPtr, size_t size) @@ -9141,8 +9142,6 @@ cudaError_t cudaMalloc(void** devPtr, size_t size) rpc_read(0, devPtr, sizeof(void*)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; - - std::cout << "done calling cudaMalloc... 
" << devPtr << std::endl; return return_value; } diff --git a/codegen/manual_client.cpp b/codegen/manual_client.cpp index 255be3b..4dce023 100755 --- a/codegen/manual_client.cpp +++ b/codegen/manual_client.cpp @@ -24,7 +24,6 @@ extern int rpc_wait_for_response(const int index); extern int rpc_read(const int index, void *data, const std::size_t size); extern int rpc_end_response(const int index, void *return_value); extern int rpc_close(); -void cuda_memcpy_unified_ptrs(const int index); #define MAX_FUNCTION_NAME 1024 #define MAX_ARGS 128 @@ -373,7 +372,7 @@ cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void { cudaError_t return_value; - cuda_memcpy_unified_ptrs(0); + cuda_memcpy_unified_ptrs(0, cudaMemcpyHostToDevice); // Start the RPC request int request_id = rpc_start_request(0, RPC_cudaLaunchKernel); @@ -418,9 +417,22 @@ cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void for (int i = 0; i < f->arg_count; ++i) { - if (rpc_write(0, &f->arg_sizes[i], sizeof(int)) < 0 || - rpc_write(0, args[i], f->arg_sizes[i]) < 0) - return cudaErrorDevicesUnavailable; + void* maybe_ptr = maybe_get_cached_arg_ptr(0, i, args[i]); + + if (maybe_ptr != 0) + { + std::cout << "writing dynamic pointer" << std::endl; + if (rpc_write(0, &f->arg_sizes[i], sizeof(int)) < 0 || + rpc_write(0, maybe_ptr, f->arg_sizes[i]) < 0) + return cudaErrorDevicesUnavailable; + } + else + { + std::cout << "writing original pointer" << std::endl; + if (rpc_write(0, &f->arg_sizes[i], sizeof(int)) < 0 || + rpc_write(0, args[i], f->arg_sizes[i]) < 0) + return cudaErrorDevicesUnavailable; + } } if (rpc_wait_for_response(0) < 0) @@ -433,6 +445,8 @@ cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void return cudaErrorDevicesUnavailable; } + cuda_memcpy_unified_ptrs(0, cudaMemcpyDeviceToHost); + return return_value; } diff --git a/test/unified.cu b/test/unified.cu index c4d2b9e..0dfecea 100644 --- a/test/unified.cu +++ 
b/test/unified.cu @@ -2,30 +2,70 @@ #include <iostream> // CUDA Kernel to add elements of two arrays -__global__ void addKernel(int *a, int *b, int *c, int size) { +// __global__ void addKernel(int *a, int *b, int *c, int size) { +// int idx = threadIdx.x + blockIdx.x * blockDim.x; +// if (idx < size) { +// c[idx] = a[idx] * b[idx]; +// } +// } + +__global__ void mulKernel(int *a, int *c, int size) { int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < size) { - c[idx] = a[idx] * b[idx]; + c[idx] = a[idx] * 5; } } int main() { + // const int size = 10; + // const int bytes = size * sizeof(int); + // int *d_a; + // int *d_c; + + // // host pointers + // int *a = new int[size]; + // int *c = new int[size]; + // // cudaMalloc(&a, bytes); + // cudaMalloc(&d_a, bytes); + // cudaMalloc(&d_c, bytes); + // for (int i = 0; i < size; ++i) { + // a[i] = i; + // } + + // cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice); + // cudaMemcpy(d_c, c, bytes, cudaMemcpyHostToDevice); + + // const int threadsPerBlock = 256; + // std::cout << "ARG SIZE::: " << bytes << std::endl; + // const int blocks = (size + threadsPerBlock - 1) / threadsPerBlock; + + // mulKernel<<<blocks, threadsPerBlock>>>(d_a, d_c, size); + // cudaMemcpy(a, d_a, bytes, cudaMemcpyDeviceToHost); + // cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost); + + // std::cout << "Results:\n"; + // for (int i = 0; i < size; ++i) { + // std::cout << "a[" << i << "] + b[" << i << "] = " << c[i] << "\n"; + // } + + // cudaFree(a); + // cudaFree(c); + + + + // Define array size const int size = 10; const int bytes = size * sizeof(int); - std::cout << "HELLO" << std::endl; - // Unified memory allocation - int *a, *b, *c; + int *a, *c; cudaMallocManaged(&a, bytes); - cudaMallocManaged(&b, bytes); cudaMallocManaged(&c, bytes); // Initialize arrays on the CPU for (int i = 0; i < size; ++i) { a[i] = i; - b[i] = i * 2; } // Define kernel launch parameters @@ -34,12 +74,16 @@ int main() { std::cout << "launching kernel..." 
<< std::endl; + for (int i = 0; i < size; ++i) { + std::cout << "a[" << i << "] + b[" << i << "] = " << a[i] << "\n"; + } + std::cout << "pointer a: " << a << std::endl; - std::cout << "pointer b: " << b << std::endl; std::cout << "pointer c: " << c << std::endl; + std::cout << "size c: " << size << std::endl; // Launch the kernel - addKernel<<<blocks, threadsPerBlock>>>(a, b, c, size); + mulKernel<<<blocks, threadsPerBlock>>>(a, c, size); // Wait for GPU to finish cudaDeviceSynchronize(); @@ -52,7 +96,6 @@ int main() { // Free unified memory cudaFree(a); - cudaFree(b); cudaFree(c); return 0;