Skip to content

Commit

Permalink
chore: capture device pointers
Browse files Browse the repository at this point in the history
  • Loading branch information
brodeynewman committed Dec 8, 2024
1 parent aeef059 commit 3d29fc8
Show file tree
Hide file tree
Showing 6 changed files with 190 additions and 17 deletions.
85 changes: 85 additions & 0 deletions client.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <cuda.h>
#include <sys/uio.h>
#include <netinet/tcp.h>
#include <cuda_runtime.h>

#include <unordered_map>

Expand All @@ -34,6 +35,9 @@ typedef struct
pthread_cond_t read_cond;
struct iovec write_iov[128];
int write_iov_count = 0;

void ***unified_mem_pointers;
int mem_idx = 0;
} conn_t;

pthread_mutex_t conn_mutex;
Expand Down Expand Up @@ -220,6 +224,87 @@ int rpc_read(const int index, void *data, size_t size)
return n;
}

// Register a managed-memory mapping on connection `index`: host shadow
// buffer `ptr` mirrors device buffer `dev_ptr`, `size` bytes long.
//
// Mappings live in conns[index].unified_mem_pointers, one row of three
// void* slots per entry:
//   [0] device pointer, [1] host pointer, [2] size (stored in a void*).
// The table starts with room for 5 rows and is re-allocated one row at a
// time once full (there is no separate capacity field; capacity tracks
// mem_idx + 1 past the initial 5).
//
// NOTE(review): rows and the table itself are never freed anywhere —
// the bookkeeping leaks for the lifetime of the connection; TODO.
void allocate_unified_mem_pointer(const int index, void *dev_ptr, void *ptr, size_t size)
{
  if (conns[index].mem_idx == 0) {
    // First mapping on this connection: create the table and row 0.
    // This branch chain is exclusive — previously row 0 was allocated
    // here AND again below, leaking the first allocation.
    conns[index].unified_mem_pointers = new void **[5];
    conns[index].unified_mem_pointers[0] = new void *[3];
  } else if (conns[index].mem_idx >= 5) {
    // Table is full: grow it by one row, moving the old row pointers over.
    int new_capacity = conns[index].mem_idx + 1;
    void ***new_arr = new void **[new_capacity];

    for (int i = 0; i < conns[index].mem_idx; ++i) {
      new_arr[i] = conns[index].unified_mem_pointers[i];
    }
    new_arr[conns[index].mem_idx] = new void *[3];

    delete[] conns[index].unified_mem_pointers;
    conns[index].unified_mem_pointers = new_arr;
  } else {
    // Room left in the initial table: just allocate the new row.
    conns[index].unified_mem_pointers[conns[index].mem_idx] = new void *[3];
  }

  void **entry = conns[index].unified_mem_pointers[conns[index].mem_idx];
  entry[0] = dev_ptr;
  entry[1] = ptr;
  entry[2] = reinterpret_cast<void *>(size);

  conns[index].mem_idx++;
}

// Push every registered host shadow buffer on connection `index` to its
// device counterpart (host -> device). Intended to run before a kernel
// launch so the device sees the latest host-side writes to "managed"
// allocations (see allocate_unified_mem_pointer for the row layout).
//
// Failures are reported per entry on stderr and the remaining entries are
// still attempted, so one bad mapping does not block the rest.
void cuda_memcpy_unified_ptrs(const int index)
{
  for (int i = 0; i < conns[index].mem_idx; i++) {
    void *dev_ptr = conns[index].unified_mem_pointers[i][0];
    void *host_ptr = conns[index].unified_mem_pointers[i][1];
    // Size was stashed in a void* slot; recover the integer value.
    size_t size = reinterpret_cast<size_t>(conns[index].unified_mem_pointers[i][2]);

    cudaError_t res = cudaMemcpy(dev_ptr, host_ptr, size, cudaMemcpyHostToDevice);
    if (res != cudaSuccess) {
      std::cerr << "cudaMemcpy failed for index " << i
                << ": " << cudaGetErrorString(res) << std::endl;
    }
  }
}

// Look up `ptr` — a host shadow pointer previously handed out by
// cudaMallocManaged — in connection `index`'s mapping table and return the
// matching device pointer, or nullptr when `ptr` is not a managed
// allocation.
//
// Despite the name, nothing is freed here: the caller uses the returned
// device pointer to issue the real cudaFree RPC.
// NOTE(review): the table row and the host buffer are left in place, so a
// "freed" mapping is still visited by cuda_memcpy_unified_ptrs — TODO.
void* maybe_free_unified_mem(const int index, void *ptr)
{
  for (int i = 0; i < conns[index].mem_idx; i++) {
    // Row layout: [0] device pointer, [1] host pointer.
    void *host_ptr = conns[index].unified_mem_pointers[i][1];
    void *device_ptr = conns[index].unified_mem_pointers[i][0];

    if (host_ptr == ptr) {
      return device_ptr;
    }
  }

  // Fix: the original fell off the end of a non-void function here
  // (undefined behaviour); report "not found" explicitly instead.
  return nullptr;
}

int rpc_end_response(const int index, void *result)
{
if (read(conns[index].connfd, result, sizeof(int)) < 0 ||
Expand Down
52 changes: 37 additions & 15 deletions codegen/gen_client.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

#include <cstring>
#include <string>
#include <iostream>
#include <unordered_map>

#include "gen_api.h"
Expand All @@ -20,6 +21,8 @@ extern int rpc_wait_for_response(const int index);
extern int rpc_read(const int index, void *data, const std::size_t size);
extern int rpc_end_response(const int index, void *return_value);
extern int rpc_close();
extern void* maybe_free_unified_mem(const int index, void *ptr);
extern void allocate_unified_mem_pointer(const int index, void *dev_ptr, void *ptr, size_t size);

nvmlReturn_t nvmlInit_v2()
{
Expand Down Expand Up @@ -9114,16 +9117,19 @@ cudaError_t cudaOccupancyMaxActiveClusters(int* numClusters, const void* func, c

cudaError_t cudaMallocManaged(void** devPtr, size_t size, unsigned int flags)
{
    // Emulate managed memory over the RPC transport: allocate a host-side
    // shadow buffer plus a remote device buffer, register the pair, and
    // hand the HOST pointer back to the caller. Host writes are shipped to
    // the device later by cuda_memcpy_unified_ptrs (before kernel launch).
    //
    // NOTE(review): `flags` (cudaMemAttachGlobal/Host) is ignored — TODO
    // confirm callers never rely on attach semantics.
    void* host_alloc = malloc(size);
    if (host_alloc == nullptr)
        return cudaErrorMemoryAllocation;

    // Fix: the original ignored cudaMalloc's status and fell off the end
    // of the function without returning a value (undefined behaviour).
    void* d_a = nullptr;
    cudaError_t err = cudaMalloc(&d_a, size);
    if (err != cudaSuccess) {
        free(host_alloc);
        return err;
    }

    // Record the (device, host, size) triple for later synchronization
    // and for pointer translation in cudaFree.
    allocate_unified_mem_pointer(0, d_a, host_alloc, size);

    *devPtr = host_alloc;
    return cudaSuccess;
}

cudaError_t cudaMalloc(void** devPtr, size_t size)
Expand All @@ -9135,6 +9141,8 @@ cudaError_t cudaMalloc(void** devPtr, size_t size)
rpc_read(0, devPtr, sizeof(void*)) < 0 ||
rpc_end_response(0, &return_value) < 0)
return cudaErrorDevicesUnavailable;

std::cout << "done calling cudaMalloc... " << devPtr << std::endl;
return return_value;
}

Expand Down Expand Up @@ -9186,11 +9194,25 @@ cudaError_t cudaMallocArray(cudaArray_t* array, const struct cudaChannelFormatDe
cudaError_t cudaFree(void* devPtr)
{
    // Free a device allocation over RPC. If `devPtr` is actually a host
    // shadow pointer from our cudaMallocManaged emulation, translate it to
    // the real device pointer first; otherwise send it through unchanged.
    //
    // Fix: the original duplicated the entire RPC sequence in both
    // branches (differing only in which pointer was written); resolve the
    // pointer once and issue a single request.
    cudaError_t return_value;

    void* managed_dev_ptr = maybe_free_unified_mem(0, devPtr);
    void* target = (managed_dev_ptr != nullptr) ? managed_dev_ptr : devPtr;

    if (rpc_start_request(0, RPC_cudaFree) < 0 ||
        rpc_write(0, &target, sizeof(void*)) < 0 ||
        rpc_wait_for_response(0) < 0 ||
        rpc_end_response(0, &return_value) < 0)
        return cudaErrorDevicesUnavailable;

    return return_value;
}

Expand Down
6 changes: 5 additions & 1 deletion codegen/gen_server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19394,6 +19394,7 @@ int handle_cudaMallocManaged(void *conn)
unsigned int flags;
int request_id;
cudaError_t scuda_intercept_result;
std::cout << "calling cudaMallocManaged" << std::endl;
if (
rpc_read(conn, &devPtr, sizeof(void*)) < 0 ||
rpc_read(conn, &size, sizeof(size_t)) < 0 ||
Expand Down Expand Up @@ -19432,6 +19433,8 @@ int handle_cudaMalloc(void *conn)
goto ERROR_0;
scuda_intercept_result = cudaMalloc(&devPtr, size);

std::cout << "ADDRESS : " << &devPtr << std::endl;

if (rpc_start_response(conn, request_id) < 0 ||
rpc_write(conn, &devPtr, sizeof(void*)) < 0 ||
rpc_end_response(conn, &scuda_intercept_result) < 0)
Expand Down Expand Up @@ -19543,7 +19546,8 @@ int handle_cudaFree(void *conn)
rpc_read(conn, &devPtr, sizeof(void*)) < 0 ||
false)
goto ERROR_0;


std::cout << "freeing... " << devPtr << std::endl;
request_id = rpc_end_request(conn);
if (request_id < 0)
goto ERROR_0;
Expand Down
3 changes: 3 additions & 0 deletions codegen/manual_client.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ extern int rpc_wait_for_response(const int index);
extern int rpc_read(const int index, void *data, const std::size_t size);
extern int rpc_end_response(const int index, void *return_value);
extern int rpc_close();
void cuda_memcpy_unified_ptrs(const int index);

#define MAX_FUNCTION_NAME 1024
#define MAX_ARGS 128
Expand Down Expand Up @@ -372,6 +373,8 @@ cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void
{
cudaError_t return_value;

cuda_memcpy_unified_ptrs(0);

// Start the RPC request
int request_id = rpc_start_request(0, RPC_cudaLaunchKernel);
if (request_id < 0)
Expand Down
2 changes: 1 addition & 1 deletion local.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ build() {

nvcc --cudart=shared -lnvidia-ml -lcuda ./test/vector_add.cu -o vector.o
nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn ./test/cudnn.cu -o cudnn.o

nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/cublas_batched.cu -o cublas_batched.o
nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified.cu -o unified.o

if [ ! -f "$libscuda_path" ]; then
echo "libscuda.so not found. build may have failed."
Expand Down
59 changes: 59 additions & 0 deletions test/unified.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#include <cuda_runtime.h>
#include <iostream>

// CUDA Kernel to add elements of two arrays
// CUDA kernel: element-wise sum c[idx] = a[idx] + b[idx] for idx < size.
// One thread per element; launch with at least `size` total threads.
__global__ void addKernel(int *a, int *b, int *c, int size) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < size) {
        // Fix: the body multiplied (a * b) while the kernel's name, its
        // header comment, and the caller's "a[i] + b[i] =" output all
        // describe addition — align the computation with the contract.
        c[idx] = a[idx] + b[idx];
    }
}

// Smoke test for the managed-memory path: allocate three arrays with
// cudaMallocManaged, initialize inputs on the CPU, run addKernel, and
// print the results after synchronizing.
int main() {
    // Element count and byte size of each array.
    const int size = 10;
    const size_t bytes = size * sizeof(int);

    // Allocate through the managed-memory path under test.
    // Fix: the original ignored the allocation results entirely.
    int *a = nullptr, *b = nullptr, *c = nullptr;
    if (cudaMallocManaged(&a, bytes) != cudaSuccess ||
        cudaMallocManaged(&b, bytes) != cudaSuccess ||
        cudaMallocManaged(&c, bytes) != cudaSuccess) {
        std::cerr << "cudaMallocManaged failed" << std::endl;
        return 1;
    }

    // Initialize inputs on the CPU (managed pointers are host-writable).
    for (int i = 0; i < size; ++i) {
        a[i] = i;
        b[i] = i * 2;
    }

    // Ceil-division launch config; one block suffices for 10 elements.
    const int threadsPerBlock = 256;
    const int blocks = (size + threadsPerBlock - 1) / threadsPerBlock;

    std::cout << "launching kernel..." << std::endl;

    addKernel<<<blocks, threadsPerBlock>>>(a, b, c, size);

    // Fix: surface launch-configuration errors and in-kernel faults that
    // the original silently ignored.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        std::cerr << "kernel launch failed: " << cudaGetErrorString(err) << std::endl;
        return 1;
    }
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        std::cerr << "kernel execution failed: " << cudaGetErrorString(err) << std::endl;
        return 1;
    }

    // Host reads of `c` are valid after the synchronize above.
    std::cout << "Results:\n";
    for (int i = 0; i < size; ++i) {
        std::cout << "a[" << i << "] + b[" << i << "] = " << c[i] << "\n";
    }

    // Release the managed allocations.
    cudaFree(a);
    cudaFree(b);
    cudaFree(c);

    return 0;
}

0 comments on commit 3d29fc8

Please sign in to comment.