
Commit

chore: cleanup
brodeynewman committed Dec 19, 2024
1 parent fcc8669 commit a90da39
Showing 9 changed files with 101 additions and 178 deletions.
116 changes: 48 additions & 68 deletions client.cpp
@@ -43,8 +43,7 @@ typedef struct
struct iovec write_iov[128];
int write_iov_count = 0;

void ***unified_mem_pointers;
int mem_idx = 0;
std::unordered_map<void*, size_t> unified_devices;
} conn_t;

pthread_mutex_t conn_mutex;
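
The per-connection state above now tracks unified allocations in a single `std::unordered_map<void*, size_t>`, keyed by the allocation pointer with the allocation size as the value, replacing the removed `void ***unified_mem_pointers` / `mem_idx` triple-pointer table. A minimal standalone sketch of that bookkeeping pattern, illustrative only and not code from this commit:

```cpp
#include <cstddef>
#include <unordered_map>

// Illustrative sketch: track unified allocations as pointer -> size,
// mirroring the unified_devices member introduced above.
static std::unordered_map<void*, std::size_t> tracked;

void track(void* ptr, std::size_t size) { tracked.insert({ptr, size}); }
bool is_tracked(void* ptr)              { return tracked.count(ptr) != 0; }
void untrack(void* ptr)                 { tracked.erase(ptr); }
```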
@@ -57,35 +56,44 @@ static int init = 0;
static jmp_buf catch_segfault;
static void* faulting_address = nullptr;

void add_host_mem_to_devptr_mapping(const int index, void *dev_ptr, void *host_ptr)
{
for (int i = 0; i < conns[index].mem_idx; i++)
{
// index 1 is host pointer
if (conns[index].unified_mem_pointers[i][0] == dev_ptr)
{
conns[index].unified_mem_pointers[i][2] = host_ptr;
}
}
}

static void segfault(int sig, siginfo_t* info, void* unused) {
faulting_address = info->si_addr;

std::cout << "Caught segfault at address: " << faulting_address << std::endl;

size_t page_size = sysconf(_SC_PAGESIZE);
void* aligned_address = (void*)((uintptr_t)faulting_address & ~(page_size - 1));
for (const auto & [ ptr, sz ] : conns[0].unified_devices)
{
if (faulting_address >= (ptr) && faulting_address <= (ptr + sz))
{
// ensure we assign memory as close to the faulting address as possible...
// by masking via the allocated unified memory size.
void* aligned_address = (void*)((uintptr_t)faulting_address & ~(sz - 1));

// Allocate memory at the faulting address
void* allocated = mmap(aligned_address, sz, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
if (allocated == MAP_FAILED) {
perror("Failed to allocate memory at faulting address");
_exit(1);
}

std::cout << "allocated dynamic memory at address: " << allocated << std::endl;

// Allocate memory at the faulting address
void* allocated = mmap(aligned_address, page_size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
if (allocated == MAP_FAILED) {
perror("Failed to allocate memory at faulting address");
_exit(1);
return;
}
}

add_host_mem_to_devptr_mapping(0, faulting_address, allocated);
// raise our original segfault handler
struct sigaction sa;
sa.sa_handler = SIG_DFL;
sigemptyset(&sa.sa_mask);
sa.sa_flags = 0;

if (sigaction(SIGSEGV, &sa, nullptr) == -1) {
perror("Failed to reset SIGSEGV handler");
_exit(EXIT_FAILURE);
}

std::cout << "Allocated and registered memory at address: " << allocated << std::endl;
raise(SIGSEGV);
}
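
The handler above is installed by `set_segfault_handlers()`, whose body is collapsed in this diff. For orientation, here is a minimal sketch of how a three-argument SIGSEGV handler with access to `si_addr` is typically registered; this is an assumption about the setup, not code from this commit:

```cpp
#include <csignal>
#include <cstdio>
#include <cstring>

// Sketch only: register a SIGSEGV handler that receives siginfo_t,
// which is what gives segfault() access to info->si_addr above.
static void install_segfault_handler(void (*handler)(int, siginfo_t*, void*)) {
    struct sigaction sa;
    std::memset(&sa, 0, sizeof(sa));
    sa.sa_sigaction = handler;   // three-argument handler form
    sigemptyset(&sa.sa_mask);
    sa.sa_flags = SA_SIGINFO;    // deliver siginfo_t with the faulting address
    if (sigaction(SIGSEGV, &sa, nullptr) == -1) {
        perror("sigaction(SIGSEGV)");
    }
}
```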

static void set_segfault_handlers() {
@@ -292,74 +300,46 @@ int rpc_read(const int index, void *data, size_t size)

void allocate_unified_mem_pointer(const int index, void *dev_ptr, size_t size)
{
// Initialize unified_mem_pointers if not already initialized
if (conns[index].mem_idx == 0) {
conns[index].unified_mem_pointers = new void **[10]; // Initial capacity of 5
for (int i = 0; i < 5; ++i) {
conns[index].unified_mem_pointers[i] = nullptr;
}
}

// allocate new space for pointer mapping
conns[index].unified_mem_pointers[conns[index].mem_idx] = new void *[3];

conns[index].unified_mem_pointers[conns[index].mem_idx][0] = dev_ptr;
conns[index].unified_mem_pointers[conns[index].mem_idx][1] = reinterpret_cast<void*>(size);
conns[index].unified_mem_pointers[conns[index].mem_idx][2] = nullptr;

conns[index].mem_idx++;
}

void* maybe_get_cached_arg_ptr(const int index, void* arg_ptr)
{
for (int i = 0; i < conns[index].mem_idx; i++)
{
// index 1 is host pointer
if (conns[index].unified_mem_pointers[i][1] == arg_ptr)
{
return &conns[index].unified_mem_pointers[i][0];
}
}

return nullptr;
conns[index].unified_devices.insert({ dev_ptr, size });
}

void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind)
{
for (int i = 0; i < conns[index].mem_idx; i++) {
for (const auto & [ ptr, sz ] : conns[index].unified_devices) {
if (kind == cudaMemcpyHostToDevice) {
size_t size = reinterpret_cast<size_t>(conns[index].unified_mem_pointers[i][1]);

cudaError_t res = cudaMemcpy(conns[index].unified_mem_pointers[i][0], conns[index].unified_mem_pointers[i][0], size, cudaMemcpyHostToDevice);
size_t size = reinterpret_cast<size_t>(sz);

// ptr is the same on both host/device
cudaError_t res = cudaMemcpy(ptr, ptr, size, cudaMemcpyHostToDevice);
if (res != cudaSuccess) {
std::cerr << "cudaMemcpy failed for index " << i
<< ": " << cudaGetErrorString(res) << std::endl;
std::cerr << "cudaMemcpy failed :" << cudaGetErrorString(res) << std::endl;
} else {
std::cout << "Successfully copied " << size << " bytes for index " << i << std::endl;
std::cout << "Successfully copied " << size << " bytes" << std::endl;
}
} else {
size_t size = reinterpret_cast<size_t>(conns[index].unified_mem_pointers[i][1]);
} else {
size_t size = reinterpret_cast<size_t>(sz);

cudaError_t res = cudaMemcpy(conns[index].unified_mem_pointers[i][0], conns[index].unified_mem_pointers[i][0], size, cudaMemcpyDeviceToHost);
// ptr is the same on both host/device
cudaError_t res = cudaMemcpy(ptr, ptr, size, cudaMemcpyDeviceToHost);

if (res != cudaSuccess) {
std::cerr << "cudaMemcpy failed for index " << i
<< ": " << cudaGetErrorString(res) << std::endl;
std::cerr << "cudaMemcpy failed :" << cudaGetErrorString(res) << std::endl;
} else {
std::cout << "Successfully copied " << size << " bytes for index " << i << std::endl;
std::cout << "Successfully copied " << size << " bytes" << std::endl;
}
}
}
}
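
With the map in place, `cuda_memcpy_unified_ptrs` copies every tracked allocation in one direction. A plausible call pattern brackets device work with a host-to-device sync before and a device-to-host sync after; the wrapper below is hypothetical and not part of this commit:

```cpp
// Hypothetical helper (assumption): relies on cuda_memcpy_unified_ptrs() defined above.
void sync_unified_around_device_work(const int index) {
    cuda_memcpy_unified_ptrs(index, cudaMemcpyHostToDevice);  // push host writes to the device
    // ... launch kernels or issue RPCs that consume the managed buffers here ...
    cuda_memcpy_unified_ptrs(index, cudaMemcpyDeviceToHost);  // pull results back to the host
}
```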

void* maybe_free_unified_mem(const int index, void *ptr)
{
for (int i = 0; i < conns[index].mem_idx; i++) {
void *dev_ptr = conns[index].unified_mem_pointers[i][0];
size_t size = reinterpret_cast<size_t>(conns[index].unified_mem_pointers[i][1]);
for (const auto & [ dev_ptr, sz ] : conns[index].unified_devices) {
size_t size = reinterpret_cast<size_t>(sz);

if (dev_ptr == ptr) {
std::cout << "mem-unampping device ptr: " << dev_ptr << " size " << size << std::endl;

munmap(dev_ptr, size);
return dev_ptr;
}
2 changes: 2 additions & 0 deletions codegen/annotations.h
@@ -4357,6 +4357,7 @@ cudaError_t cudaOccupancyMaxPotentialClusterSize(int *clusterSize, const void *f
*/
cudaError_t cudaOccupancyMaxActiveClusters(int *numClusters, const void *func, const cudaLaunchConfig_t *launchConfig);
/**
* @disabled
* @param devPtr SEND_RECV
* @param size SEND_ONLY
* @param flags SEND_ONLY
@@ -4388,6 +4389,7 @@ cudaError_t cudaMallocPitch(void **devPtr, size_t *pitch, size_t width, size_t h
*/
cudaError_t cudaMallocArray(cudaArray_t *array, const struct cudaChannelFormatDesc *desc, size_t width, size_t height, unsigned int flags);
/**
* @disabled
* @param devPtr SEND_ONLY
*/
cudaError_t cudaFree(void *devPtr);
2 changes: 2 additions & 0 deletions codegen/codegen.py
@@ -69,9 +69,11 @@
# a list of manually implemented cuda/nvml functions.
# these are automatically appended to each file; operation order is maintained as well.
MANUAL_IMPLEMENTATIONS = [
"cudaFree",
"cudaMemcpy",
"cudaMemcpyAsync",
"cudaLaunchKernel",
"cudaMallocManaged",
]

@dataclass
51 changes: 2 additions & 49 deletions codegen/gen_client.cpp
@@ -6,7 +6,6 @@

#include <cstring>
#include <string>
#include <iostream>
#include <unordered_map>

#include "gen_api.h"
@@ -21,8 +20,6 @@ extern int rpc_wait_for_response(const int index);
extern int rpc_read(const int index, void *data, const std::size_t size);
extern int rpc_end_response(const int index, void *return_value);
extern int rpc_close();
extern void* maybe_free_unified_mem(const int index, void *ptr);
extern void allocate_unified_mem_pointer(const int index, void *dev_ptr, size_t size);

nvmlReturn_t nvmlInit_v2()
{
@@ -9115,25 +9112,6 @@ cudaError_t cudaOccupancyMaxActiveClusters(int* numClusters, const void* func, c
return return_value;
}

cudaError_t cudaMallocManaged(void** devPtr, size_t size, unsigned int flags)
{
void* d_mem;

cudaError_t err = cudaMalloc((void**)&d_mem, size);
if (err != cudaSuccess) {
std::cerr << "cudaMalloc failed: " << cudaGetErrorString(err) << std::endl;
return err;
}

std::cout << "allocated unified device mem " << d_mem << std::endl;

allocate_unified_mem_pointer(0, d_mem, size);

*devPtr = d_mem;

return cudaSuccess;
}

cudaError_t cudaMalloc(void** devPtr, size_t size)
{
cudaError_t return_value;
@@ -9191,31 +9169,6 @@ cudaError_t cudaMallocArray(cudaArray_t* array, const struct cudaChannelFormatDe
return return_value;
}

cudaError_t cudaFree(void* devPtr)
{
cudaError_t return_value;
void *maybe_ptr = maybe_free_unified_mem(0, devPtr);

if (maybe_ptr != nullptr) {
std::cout << "POITNER FOUND!! " << maybe_ptr << std::endl;

if (rpc_start_request(0, RPC_cudaFree) < 0 ||
rpc_write(0, &maybe_ptr, sizeof(void*)) < 0 ||
rpc_wait_for_response(0) < 0 ||
rpc_end_response(0, &return_value) < 0)
return cudaErrorDevicesUnavailable;
} else {
std::cout << "no poitner found..." << std::endl;
if (rpc_start_request(0, RPC_cudaFree) < 0 ||
rpc_write(0, &devPtr, sizeof(void*)) < 0 ||
rpc_wait_for_response(0) < 0 ||
rpc_end_response(0, &return_value) < 0)
return cudaErrorDevicesUnavailable;
}

return return_value;
}
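
The generated `cudaFree` above is removed because `cudaFree` now appears in `MANUAL_IMPLEMENTATIONS`, so codegen no longer emits it in this file. The relocated manual version is not part of this diff; the sketch below is a rough guess at what it plausibly looks like, assuming it keeps the removed logic of consulting `maybe_free_unified_mem` before forwarding the RPC:

```cpp
// Sketch only (assumption): the actual manual implementation may differ.
cudaError_t cudaFree(void* devPtr)
{
    cudaError_t return_value;

    // Unmap the host-side mirror if this was a tracked unified allocation.
    void* maybe_ptr = maybe_free_unified_mem(0, devPtr);
    void* target = (maybe_ptr != nullptr) ? maybe_ptr : devPtr;

    if (rpc_start_request(0, RPC_cudaFree) < 0 ||
        rpc_write(0, &target, sizeof(void*)) < 0 ||
        rpc_wait_for_response(0) < 0 ||
        rpc_end_response(0, &return_value) < 0)
        return cudaErrorDevicesUnavailable;

    return return_value;
}
```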

cudaError_t cudaFreeHost(void* ptr)
{
cudaError_t return_value;
@@ -22176,12 +22129,10 @@ std::unordered_map<std::string, void *> functionMap = {
{"cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", (void *)cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags},
{"cudaOccupancyMaxPotentialClusterSize", (void *)cudaOccupancyMaxPotentialClusterSize},
{"cudaOccupancyMaxActiveClusters", (void *)cudaOccupancyMaxActiveClusters},
{"cudaMallocManaged", (void *)cudaMallocManaged},
{"cudaMalloc", (void *)cudaMalloc},
{"cudaMallocHost", (void *)cudaMallocHost},
{"cudaMallocPitch", (void *)cudaMallocPitch},
{"cudaMallocArray", (void *)cudaMallocArray},
{"cudaFree", (void *)cudaFree},
{"cudaFreeHost", (void *)cudaFreeHost},
{"cudaFreeArray", (void *)cudaFreeArray},
{"cudaFreeMipmappedArray", (void *)cudaFreeMipmappedArray},
@@ -22874,9 +22825,11 @@ std::unordered_map<std::string, void *> functionMap = {
{"cuMemFreeAsync_ptsz", (void *)cuMemFreeAsync},
{"cuMemAllocAsync_ptsz", (void *)cuMemAllocAsync},
{"cuMemAllocFromPoolAsync_ptsz", (void *)cuMemAllocFromPoolAsync},
{"cudaFree", (void *)cudaFree},
{"cudaMemcpy", (void *)cudaMemcpy},
{"cudaMemcpyAsync", (void *)cudaMemcpyAsync},
{"cudaLaunchKernel", (void *)cudaLaunchKernel},
{"cudaMallocManaged", (void *)cudaMallocManaged},
};

void *get_function_pointer(const char *name)
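
The body of `get_function_pointer` is collapsed in this view; a minimal sketch of the lookup it presumably performs over `functionMap`, offered as an assumption rather than code taken from this commit:

```cpp
// Sketch only: resolve an intercepted symbol name to its local stub.
void *get_function_pointer(const char *name)
{
    auto it = functionMap.find(name);
    if (it == functionMap.end())
        return nullptr;
    return it->second;
}
```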
57 changes: 0 additions & 57 deletions codegen/gen_server.cpp
@@ -19387,36 +19387,6 @@ int handle_cudaOccupancyMaxActiveClusters(void *conn)
return -1;
}

int handle_cudaMallocManaged(void *conn)
{
void* devPtr;
size_t size;
unsigned int flags;
int request_id;
cudaError_t scuda_intercept_result;
std::cout << "calling cudaMallocManaged" << std::endl;
if (
rpc_read(conn, &devPtr, sizeof(void*)) < 0 ||
rpc_read(conn, &size, sizeof(size_t)) < 0 ||
rpc_read(conn, &flags, sizeof(unsigned int)) < 0 ||
false)
goto ERROR_0;

request_id = rpc_end_request(conn);
if (request_id < 0)
goto ERROR_0;
scuda_intercept_result = cudaMallocManaged(&devPtr, size, flags);

if (rpc_start_response(conn, request_id) < 0 ||
rpc_write(conn, &devPtr, sizeof(void*)) < 0 ||
rpc_end_response(conn, &scuda_intercept_result) < 0)
goto ERROR_0;

return 0;
ERROR_0:
return -1;
}

int handle_cudaMalloc(void *conn)
{
void* devPtr;
@@ -19433,8 +19403,6 @@ int handle_cudaMalloc(void *conn)
goto ERROR_0;
scuda_intercept_result = cudaMalloc(&devPtr, size);

std::cout << "ADDRESS : " << &devPtr << std::endl;

if (rpc_start_response(conn, request_id) < 0 ||
rpc_write(conn, &devPtr, sizeof(void*)) < 0 ||
rpc_end_response(conn, &scuda_intercept_result) < 0)
@@ -19537,31 +19505,6 @@ int handle_cudaMallocArray(void *conn)
return -1;
}

int handle_cudaFree(void *conn)
{
void* devPtr;
int request_id;
cudaError_t scuda_intercept_result;
if (
rpc_read(conn, &devPtr, sizeof(void*)) < 0 ||
false)
goto ERROR_0;

std::cout << "freeing... " << devPtr << std::endl;
request_id = rpc_end_request(conn);
if (request_id < 0)
goto ERROR_0;
scuda_intercept_result = cudaFree(devPtr);

if (rpc_start_response(conn, request_id) < 0 ||
rpc_end_response(conn, &scuda_intercept_result) < 0)
goto ERROR_0;

return 0;
ERROR_0:
return -1;
}

int handle_cudaFreeHost(void *conn)
{
void* ptr;
(The remaining 4 changed files are not shown in this view.)