Skip to content

Commit

Permalink
chore: unified working example
Browse files Browse the repository at this point in the history
  • Loading branch information
brodeynewman committed Dec 13, 2024
1 parent 3d29fc8 commit 5b890da
Show file tree
Hide file tree
Showing 4 changed files with 132 additions and 65 deletions.
93 changes: 52 additions & 41 deletions client.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -226,33 +226,31 @@ int rpc_read(const int index, void *data, size_t size)

void allocate_unified_mem_pointer(const int index, void *dev_ptr, void *ptr, size_t size)
{
std::cout << "Allocating device ptr: " << dev_ptr
<< " and host ptr: " << ptr
<< " with size: " << size << std::endl;

// Initialize unified_mem_pointers if not already initialized
if (conns[index].mem_idx == 0) {
conns[index].unified_mem_pointers = new void **[5];
conns[index].unified_mem_pointers[0] = new void *[3];
conns[index].unified_mem_pointers = new void **[5]; // Initial capacity of 5
for (int i = 0; i < 5; ++i) {
conns[index].unified_mem_pointers[i] = nullptr;
}
}

if (conns[index].mem_idx >= 5) {
int new_capacity = conns[index].mem_idx + 1;
// we need to handle resize here at some point
// // Resize array if current capacity is exceeded
// if (conns[index].mem_idx >= 5) {
// int current_capacity = 5 + conns[index].mem_idx;
// int new_capacity = conns[index].mem_idx + 5;

void ***new_arr = new void **[new_capacity];
// void ***new_arr = new void **[new_capacity];
// for (int i = 0; i < new_capacity; ++i) {
// new_arr[i] = (i < conns[index].mem_idx) ? conns[index].unified_mem_pointers[i] : nullptr;
// }

for (int i = 0; i < conns[index].mem_idx; ++i) {
new_arr[i] = conns[index].unified_mem_pointers[i];
}
// delete[] conns[index].unified_mem_pointers;
// conns[index].unified_mem_pointers = new_arr;
// }

for (int i = conns[index].mem_idx; i < new_capacity; ++i) {
new_arr[i] = new void *[3];
}

delete[] conns[index].unified_mem_pointers;
conns[index].unified_mem_pointers = new_arr;
} else {
conns[index].unified_mem_pointers[conns[index].mem_idx] = new void *[3];
}
// allocate new space for pointer mapping
conns[index].unified_mem_pointers[conns[index].mem_idx] = new void *[3];

conns[index].unified_mem_pointers[conns[index].mem_idx][0] = dev_ptr;
conns[index].unified_mem_pointers[conns[index].mem_idx][1] = ptr;
Expand All @@ -261,29 +259,45 @@ void allocate_unified_mem_pointer(const int index, void *dev_ptr, void *ptr, siz
conns[index].mem_idx++;
}

void cuda_memcpy_unified_ptrs(const int index)
void* maybe_get_cached_arg_ptr(const int index, int arg_index, void* arg)
{
std::cout << "copying memory..." << std::endl;
if (arg_index >= conns[index].mem_idx)
return nullptr;

return &conns[index].unified_mem_pointers[arg_index][0];
}

void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind)
{
for (int i = 0; i < conns[index].mem_idx; i++) {
void *dev_ptr = conns[index].unified_mem_pointers[i][0];
void *host_ptr = conns[index].unified_mem_pointers[i][1];
size_t size = reinterpret_cast<size_t>(conns[index].unified_mem_pointers[i][2]);
if (kind == cudaMemcpyHostToDevice) {
void *dev_ptr = conns[index].unified_mem_pointers[i][0];
void *host_ptr = conns[index].unified_mem_pointers[i][1];

std::cout << "Index " << i << " Parameters:\n"
<< " Device Pointer (dev_ptr): " << dev_ptr << "\n"
<< " Host Pointer (host_ptr): " << host_ptr << "\n"
<< " Size (bytes): " << size << "\n";
size_t size = reinterpret_cast<size_t>(conns[index].unified_mem_pointers[i][2]);

cudaError_t res = cudaMemcpy(dev_ptr, host_ptr, size, cudaMemcpyHostToDevice);
cudaError_t res = cudaMemcpy(dev_ptr, host_ptr, size, cudaMemcpyHostToDevice);

if (res != cudaSuccess) {
std::cerr << "cudaMemcpy failed for index " << i
<< ": " << cudaGetErrorString(res) << std::endl;
} else {
std::cout << "Successfully copied " << size << " bytes for index " << i << std::endl;
}
} else {
void *dev_ptr = conns[index].unified_mem_pointers[i][0];
void *host_ptr = conns[index].unified_mem_pointers[i][1];

std::cout << "result: " << res << std::endl;
size_t size = reinterpret_cast<size_t>(conns[index].unified_mem_pointers[i][2]);

if (res != cudaSuccess) {
std::cerr << "cudaMemcpy failed for index " << i
<< ": " << cudaGetErrorString(res) << std::endl;
} else {
std::cout << "Successfully copied " << size << " bytes for index " << i << std::endl;
cudaError_t res = cudaMemcpy(host_ptr, dev_ptr, size, cudaMemcpyDeviceToHost);

if (res != cudaSuccess) {
std::cerr << "cudaMemcpy failed for index " << i
<< ": " << cudaGetErrorString(res) << std::endl;
} else {
std::cout << "Successfully copied " << size << " bytes for index " << i << std::endl;
}
}
}
}
Expand All @@ -294,12 +308,9 @@ void* maybe_free_unified_mem(const int index, void *ptr)
void *dev_ptr = conns[index].unified_mem_pointers[i][1];
void *target_free_ptr = conns[index].unified_mem_pointers[i][0];

std::cout << "comparing pointers: " << dev_ptr << " ptr " << ptr << std::endl;

if (dev_ptr == ptr) {
std::cout << "freeing pointer " << target_free_ptr << std::endl;
std::cout << "freeing dynamic pointer " << target_free_ptr << std::endl;

// mem addresses are the same, free
return target_free_ptr;
}
}
Expand Down
17 changes: 8 additions & 9 deletions codegen/gen_client.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9117,19 +9117,20 @@ cudaError_t cudaOccupancyMaxActiveClusters(int* numClusters, const void* func, c

cudaError_t cudaMallocManaged(void** devPtr, size_t size, unsigned int flags)
{
std::cout << "CALLING MALLOC WITH DEV PTR: " << devPtr << std::endl;
void* host_alloc = new void*[size];
void* d_a;

void* host_alloc = (void*) malloc(size);
void*d_a;
cudaMalloc((void **)&d_a, size);
std::cout << "AFTER DEVICE PTR: " << d_a << std::endl;
cudaError_t err = cudaMalloc((void **)&d_a, size);
if (err != cudaSuccess) {
std::cerr << "cudaMalloc failed: " << cudaGetErrorString(err) << std::endl;
return err;
}

allocate_unified_mem_pointer(0, d_a, host_alloc, size);

std::cout << "done allocate_unified_mem_pointer" << std::endl;
*devPtr = host_alloc;

std::cout << "DONE MALLOC" << std::endl;
return cudaSuccess;
}

cudaError_t cudaMalloc(void** devPtr, size_t size)
Expand All @@ -9141,8 +9142,6 @@ cudaError_t cudaMalloc(void** devPtr, size_t size)
rpc_read(0, devPtr, sizeof(void*)) < 0 ||
rpc_end_response(0, &return_value) < 0)
return cudaErrorDevicesUnavailable;

std::cout << "done calling cudaMalloc... " << devPtr << std::endl;
return return_value;
}

Expand Down
24 changes: 19 additions & 5 deletions codegen/manual_client.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ extern int rpc_wait_for_response(const int index);
extern int rpc_read(const int index, void *data, const std::size_t size);
extern int rpc_end_response(const int index, void *return_value);
extern int rpc_close();
void cuda_memcpy_unified_ptrs(const int index);

#define MAX_FUNCTION_NAME 1024
#define MAX_ARGS 128
Expand Down Expand Up @@ -373,7 +372,7 @@ cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void
{
cudaError_t return_value;

cuda_memcpy_unified_ptrs(0);
cuda_memcpy_unified_ptrs(0, cudaMemcpyHostToDevice);

// Start the RPC request
int request_id = rpc_start_request(0, RPC_cudaLaunchKernel);
Expand Down Expand Up @@ -418,9 +417,22 @@ cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void

for (int i = 0; i < f->arg_count; ++i)
{
if (rpc_write(0, &f->arg_sizes[i], sizeof(int)) < 0 ||
rpc_write(0, args[i], f->arg_sizes[i]) < 0)
return cudaErrorDevicesUnavailable;
void* maybe_ptr = maybe_get_cached_arg_ptr(0, i, args[i]);

if (maybe_ptr != 0)
{
std::cout << "writing dynamic pointer" << std::endl;
if (rpc_write(0, &f->arg_sizes[i], sizeof(int)) < 0 ||
rpc_write(0, maybe_ptr, f->arg_sizes[i]) < 0)
return cudaErrorDevicesUnavailable;
}
else
{
std::cout << "writing original pointer" << std::endl;
if (rpc_write(0, &f->arg_sizes[i], sizeof(int)) < 0 ||
rpc_write(0, args[i], f->arg_sizes[i]) < 0)
return cudaErrorDevicesUnavailable;
}
}

if (rpc_wait_for_response(0) < 0)
Expand All @@ -433,6 +445,8 @@ cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void
return cudaErrorDevicesUnavailable;
}

cuda_memcpy_unified_ptrs(0, cudaMemcpyDeviceToHost);

return return_value;
}

Expand Down
63 changes: 53 additions & 10 deletions test/unified.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2,30 +2,70 @@
#include <iostream>

// CUDA Kernel to add elements of two arrays
__global__ void addKernel(int *a, int *b, int *c, int size) {
// __global__ void addKernel(int *a, int *b, int *c, int size) {
// int idx = threadIdx.x + blockIdx.x * blockDim.x;
// if (idx < size) {
// c[idx] = a[idx] * b[idx];
// }
// }

__global__ void mulKernel(int *a, int *c, int size) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < size) {
c[idx] = a[idx] * b[idx];
c[idx] = a[idx] * 5;
}
}

int main() {
// const int size = 10;
// const int bytes = size * sizeof(int);
// int *d_a;
// int *d_c;

// // host pointers
// int *a = new int[size];
// int *c = new int[size];
// // cudaMalloc(&a, bytes);
// cudaMalloc(&d_a, bytes);
// cudaMalloc(&d_c, bytes);
// for (int i = 0; i < size; ++i) {
// a[i] = i;
// }

// cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice);
// cudaMemcpy(d_c, c, bytes, cudaMemcpyHostToDevice);

// const int threadsPerBlock = 256;
// std::cout << "ARG SIZE::: " << bytes << std::endl;
// const int blocks = (size + threadsPerBlock - 1) / threadsPerBlock;

// mulKernel<<<blocks, threadsPerBlock>>>(d_a, d_c, size);
// cudaMemcpy(a, d_a, bytes, cudaMemcpyDeviceToHost);
// cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost);

// std::cout << "Results:\n";
// for (int i = 0; i < size; ++i) {
// std::cout << "a[" << i << "] + b[" << i << "] = " << c[i] << "\n";
// }

// cudaFree(a);
// cudaFree(c);




// Define array size
const int size = 10;
const int bytes = size * sizeof(int);

std::cout << "HELLO" << std::endl;

// Unified memory allocation
int *a, *b, *c;
int *a, *c;
cudaMallocManaged(&a, bytes);
cudaMallocManaged(&b, bytes);
cudaMallocManaged(&c, bytes);

// Initialize arrays on the CPU
for (int i = 0; i < size; ++i) {
a[i] = i;
b[i] = i * 2;
}

// Define kernel launch parameters
Expand All @@ -34,12 +74,16 @@ int main() {

std::cout << "launching kernel..." << std::endl;

for (int i = 0; i < size; ++i) {
std::cout << "a[" << i << "] + b[" << i << "] = " << a[i] << "\n";
}

std::cout << "pointer a: " << a << std::endl;
std::cout << "pointer b: " << b << std::endl;
std::cout << "pointer c: " << c << std::endl;
std::cout << "size c: " << size << std::endl;

// Launch the kernel
addKernel<<<blocks, threadsPerBlock>>>(a, b, c, size);
mulKernel<<<blocks, threadsPerBlock>>>(a, c, size);

// Wait for GPU to finish
cudaDeviceSynchronize();
Expand All @@ -52,7 +96,6 @@ int main() {

// Free unified memory
cudaFree(a);
cudaFree(b);
cudaFree(c);

return 0;
Expand Down

0 comments on commit 5b890da

Please sign in to comment.