From 062360ca1ff2531fc68e3d80ce256079fbef75fd Mon Sep 17 00:00:00 2001 From: Brodey Newman Date: Mon, 16 Dec 2024 17:04:39 +0000 Subject: [PATCH] chore: build --- client.cpp | 12 +-- codegen/gen_server.cpp | 2 +- codegen/manual_client.cpp | 17 ++-- codegen/manual_server.cpp | 2 + local.sh | 1 + test/unified_2.cu | 163 ++++++++++++++++++++++++++++++++++++++ 6 files changed, 178 insertions(+), 19 deletions(-) create mode 100644 test/unified_2.cu diff --git a/client.cpp b/client.cpp index b341a23..c8a3887 100644 --- a/client.cpp +++ b/client.cpp @@ -277,12 +277,9 @@ void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind) { for (int i = 0; i < conns[index].mem_idx; i++) { if (kind == cudaMemcpyHostToDevice) { - void *dev_ptr = conns[index].unified_mem_pointers[i][0]; - void *host_ptr = conns[index].unified_mem_pointers[i][1]; - size_t size = reinterpret_cast(conns[index].unified_mem_pointers[i][2]); - cudaError_t res = cudaMemcpy(dev_ptr, host_ptr, size, cudaMemcpyHostToDevice); + cudaError_t res = cudaMemcpy(conns[index].unified_mem_pointers[i][0], conns[index].unified_mem_pointers[i][1], size, cudaMemcpyHostToDevice); if (res != cudaSuccess) { std::cerr << "cudaMemcpy failed for index " << i @@ -291,12 +288,9 @@ void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind) std::cout << "Successfully copied " << size << " bytes for index " << i << std::endl; } } else { - void *dev_ptr = conns[index].unified_mem_pointers[i][0]; - void *host_ptr = conns[index].unified_mem_pointers[i][1]; - size_t size = reinterpret_cast(conns[index].unified_mem_pointers[i][2]); - cudaError_t res = cudaMemcpy(host_ptr, dev_ptr, size, cudaMemcpyDeviceToHost); + cudaError_t res = cudaMemcpy(conns[index].unified_mem_pointers[i][1], conns[index].unified_mem_pointers[i][0], size, cudaMemcpyDeviceToHost); if (res != cudaSuccess) { std::cerr << "cudaMemcpy failed for index " << i @@ -315,8 +309,6 @@ void* maybe_free_unified_mem(const int index, void *ptr) void *target_free_ptr = conns[index].unified_mem_pointers[i][0]; if (dev_ptr == ptr) { - std::cout << "freeing dynamic pointer " << target_free_ptr << std::endl; - return target_free_ptr; } } diff --git a/codegen/gen_server.cpp b/codegen/gen_server.cpp index 1613728..f5ddb9e 100644 --- a/codegen/gen_server.cpp +++ b/codegen/gen_server.cpp @@ -19546,7 +19546,7 @@ int handle_cudaFree(void *conn) rpc_read(conn, &devPtr, sizeof(void*)) < 0 || false) goto ERROR_0; - + std::cout << "freeing... " << devPtr << std::endl; request_id = rpc_end_request(conn); if (request_id < 0) diff --git a/codegen/manual_client.cpp b/codegen/manual_client.cpp index 04ca4ca..c2356d2 100755 --- a/codegen/manual_client.cpp +++ b/codegen/manual_client.cpp @@ -422,20 +422,23 @@ cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void // convert each to a void pointer so that we can map it back to... // our origional host pointers. void *arg_ptr = *reinterpret_cast(args[i]); - void* maybe_ptr = maybe_get_cached_arg_ptr(0, arg_ptr); + void *maybe_ptr = maybe_get_cached_arg_ptr(0, arg_ptr); + // Hardconding 24 bytes for now for the unified memory case. Will remove before merge! if (maybe_ptr != 0) { - std::cout << "writing dynamic pointer" << std::endl; - if (rpc_write(0, &f->arg_sizes[i], sizeof(int)) < 0 || - rpc_write(0, maybe_ptr, f->arg_sizes[i]) < 0) + int size = 24; + std::cout << "writing dynamic pointer " << maybe_ptr << std::endl; + if (rpc_write(0, &size, sizeof(int)) < 0 || + rpc_write(0, maybe_ptr, size) < 0) return cudaErrorDevicesUnavailable; } else { + int size = 24; std::cout << "writing original pointer" << std::endl; - if (rpc_write(0, &f->arg_sizes[i], sizeof(int)) < 0 || - rpc_write(0, args[i], f->arg_sizes[i]) < 0) + if (rpc_write(0, &size, sizeof(int)) < 0 || + rpc_write(0, args[i], size) < 0) return cudaErrorDevicesUnavailable; } } @@ -540,8 +543,6 @@ void parse_ptx_string(void *fatCubin, const char *ptx_string, unsigned long long if (type_size == 0) continue; arg_size = type_size; - - std::cout << "arg size: " << arg_size << std::endl; } else if (ptx_string[i] == '[') { diff --git a/codegen/manual_server.cpp b/codegen/manual_server.cpp index cd2b0fa..c9a8130 100755 --- a/codegen/manual_server.cpp +++ b/codegen/manual_server.cpp @@ -264,6 +264,8 @@ int handle_cudaLaunchKernel(void *conn) result = cudaLaunchKernel(func, gridDim, blockDim, args, sharedMem, stream); + std::cout << "Launch kern result: " << result << std::endl; + if (rpc_start_response(conn, request_id) < 0 || rpc_end_response(conn, &result) < 0) goto ERROR_1; diff --git a/local.sh b/local.sh index 9ec1903..097a239 100755 --- a/local.sh +++ b/local.sh @@ -28,6 +28,7 @@ build() { nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn ./test/cudnn.cu -o cudnn.o nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/cublas_batched.cu -o cublas_batched.o nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified.cu -o unified.o + nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified_2.cu -o unified_2.o if [ ! -f "$libscuda_path" ]; then echo "libscuda.so not found. build may have failed." diff --git a/test/unified_2.cu b/test/unified_2.cu new file mode 100644 index 0000000..41a58c6 --- /dev/null +++ b/test/unified_2.cu @@ -0,0 +1,163 @@ +#include +#include + +struct Operation { + float *x; + float *y; + int n; +}; + +// CUDA kernel to add elements of two arrays +__global__ void add(Operation *op) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + + printf("The X is: %x\n", op->x[0]); + printf("The Y is: %x\n", op->y[0]); + for (int i = index; i < op->n; i += stride) + { + op->y[i] = op->x[i] + op->y[i]; + printf("The value is: %f\n", op->y[i]); + } +} + +int main(void) { + Operation host_op; // Host structure + Operation *device_op; // Device structure + + // Initialize array size + host_op.n = 100; + + // Allocate memory for device operation struct + cudaMalloc(&device_op, sizeof(Operation)); + + // Allocate memory for x and y arrays on the device + cudaMalloc(&host_op.x, host_op.n * sizeof(float)); + cudaMalloc(&host_op.y, host_op.n * sizeof(float)); + + // Initialize x and y arrays on the host + float *host_x = new float[host_op.n]; + float *host_y = new float[host_op.n]; + for (int i = 0; i < host_op.n; i++) { + host_x[i] = 1.0f; + host_y[i] = 2.0f; + } + + // Copy x and y arrays from host to device + cudaMemcpy(host_op.x, host_x, host_op.n * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(host_op.y, host_y, host_op.n * sizeof(float), cudaMemcpyHostToDevice); + + // Copy host operation struct to device + cudaMemcpy(device_op, &host_op, sizeof(Operation), cudaMemcpyHostToDevice); + + // Launch kernel + int blockSize = 256; + int numBlocks = (host_op.n + blockSize - 1) / blockSize; + add<<>>(device_op); + + // Wait for GPU to finish before accessing results + cudaDeviceSynchronize(); + + // Copy results from device to host + cudaMemcpy(host_y, host_op.y, host_op.n * sizeof(float), cudaMemcpyDeviceToHost); + + // Log results for debugging + std::cout << "Results (y = x + y):" << std::endl; + for (int i = 0; i < host_op.n; i++) { + std::cout << "y[" << i << "] = " << host_y[i] << " (expected: 3.0)" << std::endl; + } + + // Check for errors (all values should be 3.0f) + float maxError = 0.0f; + for (int i = 0; i < host_op.n; i++) { + maxError = fmax(maxError, fabs(host_y[i] - 3.0f)); + } + + // Free device memory + cudaFree(host_op.x); + cudaFree(host_op.y); + cudaFree(device_op); + + // Free host memory + delete[] host_x; + delete[] host_y; + + return 0; +} + + + +// // ******UNIFIED MEMORY EXAMPLE BELOW******* + + +// #include +// #include + +// struct Operation { +// float *x; +// float *y; +// int n; +// }; + +// // CUDA kernel to add elements of two arrays +// __global__ void add(Operation *op) { +// int index = blockIdx.x * blockDim.x + threadIdx.x; +// int stride = blockDim.x * gridDim.x; + +// printf("The X is: %x\n", op->x[0]); +// printf("The Y is: %x\n", op->y[0]); +// for (int i = index; i < op->n; i += stride) +// { +// op->y[i] = op->x[i] + op->y[i]; +// printf("The value is: %f\n", op->y[i]); +// } +// } + +// int main(void) { +// Operation *op; + +// // Allocate Unified Memory -- accessible from CPU or GPU +// cudaMallocManaged(&op, sizeof(Operation)); +// op->n = 100; + +// cudaMallocManaged(&op->x, op->n * sizeof(float)); +// cudaMallocManaged(&op->y, op->n * sizeof(float)); + +// // initialize x and y arrays on the host +// for (int i = 0; i < op->n; i++) { +// op->x[i] = 1.0f; +// op->y[i] = 2.0f; +// } + +// // Launch kernel on n elements on the GPU +// int blockSize = 256; +// int numBlocks = (op->n + blockSize - 1) / blockSize; + +// std::cout << "numBlocks: " << numBlocks << std::endl; +// std::cout << "N: " << op->n << std::endl; + +// add<<>>(op); + +// // Wait for GPU to finish before accessing on host +// cudaDeviceSynchronize(); + +// // Log results for debugging +// std::cout << "Results (y = x + y):" << std::endl; +// for (int i = 0; i < op->n; i++) { +// std::cout << "y[" << i << "] = " << op->y[i] << " (expected: 3.0)" << std::endl; +// } + +// // Check for errors (all values should be 3.0f) +// float maxError = 0.0f; +// for (int i = 0; i < op->n; i++) { +// maxError = fmax(maxError, fabs(op->y[i] - 3.0f)); +// } +// std::cout << "Max error: " << maxError << std::endl; + +// // Free memory +// cudaFree(op->x); +// cudaFree(op->y); +// cudaFree(op); + +// return 0; +// }