Skip to content

Commit

Permalink
chore: build
Browse files Browse the repository at this point in the history
  • Loading branch information
brodeynewman committed Dec 16, 2024
1 parent e6ec613 commit 062360c
Show file tree
Hide file tree
Showing 6 changed files with 178 additions and 19 deletions.
12 changes: 2 additions & 10 deletions client.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -277,12 +277,9 @@ void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind)
{
for (int i = 0; i < conns[index].mem_idx; i++) {
if (kind == cudaMemcpyHostToDevice) {
void *dev_ptr = conns[index].unified_mem_pointers[i][0];
void *host_ptr = conns[index].unified_mem_pointers[i][1];

size_t size = reinterpret_cast<size_t>(conns[index].unified_mem_pointers[i][2]);

cudaError_t res = cudaMemcpy(dev_ptr, host_ptr, size, cudaMemcpyHostToDevice);
cudaError_t res = cudaMemcpy(conns[index].unified_mem_pointers[i][0], conns[index].unified_mem_pointers[i][1], size, cudaMemcpyHostToDevice);

if (res != cudaSuccess) {
std::cerr << "cudaMemcpy failed for index " << i
Expand All @@ -291,12 +288,9 @@ void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind)
std::cout << "Successfully copied " << size << " bytes for index " << i << std::endl;
}
} else {
void *dev_ptr = conns[index].unified_mem_pointers[i][0];
void *host_ptr = conns[index].unified_mem_pointers[i][1];

size_t size = reinterpret_cast<size_t>(conns[index].unified_mem_pointers[i][2]);

cudaError_t res = cudaMemcpy(host_ptr, dev_ptr, size, cudaMemcpyDeviceToHost);
cudaError_t res = cudaMemcpy(conns[index].unified_mem_pointers[i][1], conns[index].unified_mem_pointers[i][0], size, cudaMemcpyDeviceToHost);

if (res != cudaSuccess) {
std::cerr << "cudaMemcpy failed for index " << i
Expand All @@ -315,8 +309,6 @@ void* maybe_free_unified_mem(const int index, void *ptr)
void *target_free_ptr = conns[index].unified_mem_pointers[i][0];

if (dev_ptr == ptr) {
std::cout << "freeing dynamic pointer " << target_free_ptr << std::endl;

return target_free_ptr;
}
}
Expand Down
2 changes: 1 addition & 1 deletion codegen/gen_server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19546,7 +19546,7 @@ int handle_cudaFree(void *conn)
rpc_read(conn, &devPtr, sizeof(void*)) < 0 ||
false)
goto ERROR_0;

std::cout << "freeing... " << devPtr << std::endl;
request_id = rpc_end_request(conn);
if (request_id < 0)
Expand Down
17 changes: 9 additions & 8 deletions codegen/manual_client.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -422,20 +422,23 @@ cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void
// convert each to a void pointer so that we can map it back to...
// our original host pointers.
void *arg_ptr = *reinterpret_cast<void**>(args[i]);
void* maybe_ptr = maybe_get_cached_arg_ptr(0, arg_ptr);
void *maybe_ptr = maybe_get_cached_arg_ptr(0, arg_ptr);

// Hardcoding 24 bytes for now for the unified memory case. Will remove before merge!
if (maybe_ptr != 0)
{
std::cout << "writing dynamic pointer" << std::endl;
if (rpc_write(0, &f->arg_sizes[i], sizeof(int)) < 0 ||
rpc_write(0, maybe_ptr, f->arg_sizes[i]) < 0)
int size = 24;
std::cout << "writing dynamic pointer " << maybe_ptr << std::endl;
if (rpc_write(0, &size, sizeof(int)) < 0 ||
rpc_write(0, maybe_ptr, size) < 0)
return cudaErrorDevicesUnavailable;
}
else
{
int size = 24;
std::cout << "writing original pointer" << std::endl;
if (rpc_write(0, &f->arg_sizes[i], sizeof(int)) < 0 ||
rpc_write(0, args[i], f->arg_sizes[i]) < 0)
if (rpc_write(0, &size, sizeof(int)) < 0 ||
rpc_write(0, args[i], size) < 0)
return cudaErrorDevicesUnavailable;
}
}
Expand Down Expand Up @@ -540,8 +543,6 @@ void parse_ptx_string(void *fatCubin, const char *ptx_string, unsigned long long
if (type_size == 0)
continue;
arg_size = type_size;

std::cout << "arg size: " << arg_size << std::endl;
}
else if (ptx_string[i] == '[')
{
Expand Down
2 changes: 2 additions & 0 deletions codegen/manual_server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,8 @@ int handle_cudaLaunchKernel(void *conn)

result = cudaLaunchKernel(func, gridDim, blockDim, args, sharedMem, stream);

std::cout << "Launch kern result: " << result << std::endl;

if (rpc_start_response(conn, request_id) < 0 ||
rpc_end_response(conn, &result) < 0)
goto ERROR_1;
Expand Down
1 change: 1 addition & 0 deletions local.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ build() {
nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn ./test/cudnn.cu -o cudnn.o
nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/cublas_batched.cu -o cublas_batched.o
nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified.cu -o unified.o
nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified_2.cu -o unified_2.o

if [ ! -f "$libscuda_path" ]; then
echo "libscuda.so not found. build may have failed."
Expand Down
163 changes: 163 additions & 0 deletions test/unified_2.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
#include <cstdlib>
#include <iostream>
#include <math.h>

// Parameter bundle for the add() kernel. In this test the struct itself and
// both arrays live in device memory (allocated with cudaMalloc in main), so
// x and y are device pointers when the kernel dereferences them.
struct Operation {
float *x; // first operand array, length n (device memory in this test)
float *y; // second operand array, length n; overwritten with x[i] + y[i]
int n;    // number of elements in x and y
};

// CUDA kernel to add elements of two arrays carried inside a device-resident
// Operation struct: computes op->y[i] = op->x[i] + op->y[i] for i in [0, op->n).
// Uses a grid-stride loop, so any grid/block configuration covers all elements.
// The printf calls are debug output only (serialized and slow).
__global__ void add(Operation *op) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    // Bug fix: these printed floats with %x, which is undefined behavior —
    // a float argument is promoted to double while %x expects an unsigned int.
    printf("The X is: %f\n", op->x[0]);
    printf("The Y is: %f\n", op->y[0]);
    for (int i = index; i < op->n; i += stride)
    {
        op->y[i] = op->x[i] + op->y[i];
        printf("The value is: %f\n", op->y[i]);
    }
}

// Abort with a descriptive message if a CUDA runtime call failed, so that
// errors are never silently ignored (a sticky error would otherwise make
// every later call fail mysteriously).
static void checkCuda(cudaError_t err, const char *what) {
    if (err != cudaSuccess) {
        std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Host driver: builds a device-resident Operation struct whose x/y arrays are
// initialized from the host, runs add() on the GPU, copies the result back,
// and reports the maximum deviation from the expected value 3.0f.
int main(void) {
    Operation host_op;    // Host copy of the struct; its x/y hold device pointers.
    Operation *device_op; // Device copy of the struct handed to the kernel.

    // Initialize array size
    host_op.n = 100;

    // Allocate the device struct and the two device arrays it points at.
    checkCuda(cudaMalloc(&device_op, sizeof(Operation)), "cudaMalloc(device_op)");
    checkCuda(cudaMalloc(&host_op.x, host_op.n * sizeof(float)), "cudaMalloc(x)");
    checkCuda(cudaMalloc(&host_op.y, host_op.n * sizeof(float)), "cudaMalloc(y)");

    // Initialize x and y arrays on the host
    float *host_x = new float[host_op.n];
    float *host_y = new float[host_op.n];
    for (int i = 0; i < host_op.n; i++) {
        host_x[i] = 1.0f;
        host_y[i] = 2.0f;
    }

    // Copy x and y arrays from host to device, then the struct itself
    // (which carries the device pointers just filled in).
    checkCuda(cudaMemcpy(host_op.x, host_x, host_op.n * sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy(x H2D)");
    checkCuda(cudaMemcpy(host_op.y, host_y, host_op.n * sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy(y H2D)");
    checkCuda(cudaMemcpy(device_op, &host_op, sizeof(Operation), cudaMemcpyHostToDevice), "cudaMemcpy(op H2D)");

    // Launch kernel (grid-stride loop in add() tolerates any configuration).
    int blockSize = 256;
    int numBlocks = (host_op.n + blockSize - 1) / blockSize;
    add<<<numBlocks, blockSize>>>(device_op);
    // Launch-configuration errors only surface via cudaGetLastError();
    // in-kernel faults surface at the synchronize below.
    checkCuda(cudaGetLastError(), "kernel launch");
    checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    // Copy results from device to host
    checkCuda(cudaMemcpy(host_y, host_op.y, host_op.n * sizeof(float), cudaMemcpyDeviceToHost), "cudaMemcpy(y D2H)");

    // Log results for debugging
    std::cout << "Results (y = x + y):" << std::endl;
    for (int i = 0; i < host_op.n; i++) {
        std::cout << "y[" << i << "] = " << host_y[i] << " (expected: 3.0)" << std::endl;
    }

    // Check for errors (all values should be 3.0f)
    float maxError = 0.0f;
    for (int i = 0; i < host_op.n; i++) {
        maxError = fmax(maxError, fabs(host_y[i] - 3.0f));
    }
    // Bug fix: maxError was computed but never reported; print it as the
    // commented-out unified-memory reference version does.
    std::cout << "Max error: " << maxError << std::endl;

    // Free device memory
    checkCuda(cudaFree(host_op.x), "cudaFree(x)");
    checkCuda(cudaFree(host_op.y), "cudaFree(y)");
    checkCuda(cudaFree(device_op), "cudaFree(device_op)");

    // Free host memory
    delete[] host_x;
    delete[] host_y;

    return 0;
}



// // ******UNIFIED MEMORY EXAMPLE BELOW*******


// #include <iostream>
// #include <math.h>

// struct Operation {
// float *x;
// float *y;
// int n;
// };

// // CUDA kernel to add elements of two arrays
// __global__ void add(Operation *op) {
// int index = blockIdx.x * blockDim.x + threadIdx.x;
// int stride = blockDim.x * gridDim.x;

// printf("The X is: %x\n", op->x[0]);
// printf("The Y is: %x\n", op->y[0]);
// for (int i = index; i < op->n; i += stride)
// {
// op->y[i] = op->x[i] + op->y[i];
// printf("The value is: %f\n", op->y[i]);
// }
// }

// int main(void) {
// Operation *op;

// // Allocate Unified Memory -- accessible from CPU or GPU
// cudaMallocManaged(&op, sizeof(Operation));
// op->n = 100;

// cudaMallocManaged(&op->x, op->n * sizeof(float));
// cudaMallocManaged(&op->y, op->n * sizeof(float));

// // initialize x and y arrays on the host
// for (int i = 0; i < op->n; i++) {
// op->x[i] = 1.0f;
// op->y[i] = 2.0f;
// }

// // Launch kernel on n elements on the GPU
// int blockSize = 256;
// int numBlocks = (op->n + blockSize - 1) / blockSize;

// std::cout << "numBlocks: " << numBlocks << std::endl;
// std::cout << "N: " << op->n << std::endl;

// add<<<numBlocks, blockSize>>>(op);

// // Wait for GPU to finish before accessing on host
// cudaDeviceSynchronize();

// // Log results for debugging
// std::cout << "Results (y = x + y):" << std::endl;
// for (int i = 0; i < op->n; i++) {
// std::cout << "y[" << i << "] = " << op->y[i] << " (expected: 3.0)" << std::endl;
// }

// // Check for errors (all values should be 3.0f)
// float maxError = 0.0f;
// for (int i = 0; i < op->n; i++) {
// maxError = fmax(maxError, fabs(op->y[i] - 3.0f));
// }
// std::cout << "Max error: " << maxError << std::endl;

// // Free memory
// cudaFree(op->x);
// cudaFree(op->y);
// cudaFree(op);

// return 0;
// }

0 comments on commit 062360c

Please sign in to comment.