Skip to content

Commit

Permalink
Brodey | unified memory (#62)
Browse files Browse the repository at this point in the history
  • Loading branch information
brodeynewman authored Dec 19, 2024
1 parent af6f363 commit 6073932
Show file tree
Hide file tree
Showing 15 changed files with 467 additions and 85 deletions.
111 changes: 111 additions & 0 deletions client.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,17 @@
#include <cuda.h>
#include <sys/uio.h>
#include <netinet/tcp.h>
#include <cuda_runtime.h>

#include <unordered_map>

#include <setjmp.h>
#include <signal.h>
#include <csignal>
#include <cstdlib>
#include <cstring>
#include <sys/mman.h>

#include "codegen/gen_client.h"

typedef struct
Expand All @@ -34,6 +42,8 @@ typedef struct
pthread_cond_t read_cond;
struct iovec write_iov[128];
int write_iov_count = 0;

std::unordered_map<void*, size_t> unified_devices;
} conn_t;

pthread_mutex_t conn_mutex;
Expand All @@ -42,8 +52,76 @@ int nconns = 0;

const char *DEFAULT_PORT = "14833";

static int init = 0;
static jmp_buf catch_segfault;
static void* faulting_address = nullptr;

static void segfault(int sig, siginfo_t* info, void* unused) {
faulting_address = info->si_addr;

std::cout << "Caught segfault at address: " << faulting_address << std::endl;

for (const auto & [ ptr, sz ] : conns[0].unified_devices)
{
if (ptr <= faulting_address && faulting_address < (ptr + sz))
{
// ensure we assign memory as close to the faulting address as possible...
// by masking via the allocated unified memory size.
void* aligned_address = (void*)((uintptr_t)faulting_address & ~(sz - 1));

// Allocate memory at the faulting address
void* allocated = mmap(aligned_address, sz, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
if (allocated == MAP_FAILED) {
perror("Failed to allocate memory at faulting address");
_exit(1);
}

std::cout << "allocated dynamic memory at address: " << allocated << std::endl;

return;
}
}

// raise our original segfault handler
struct sigaction sa;
sa.sa_handler = SIG_DFL;
sigemptyset(&sa.sa_mask);
sa.sa_flags = 0;

if (sigaction(SIGSEGV, &sa, nullptr) == -1) {
perror("Failed to reset SIGSEGV handler");
_exit(EXIT_FAILURE);
}

raise(SIGSEGV);
}

static void set_segfault_handlers() {
if (init > 0) {
return;
}

struct sigaction sa;
memset(&sa, 0, sizeof(sa));
sa.sa_flags = SA_SIGINFO;
sa.sa_sigaction = segfault;

if (sigaction(SIGSEGV, &sa, NULL) == -1) {
perror("sigaction");
exit(EXIT_FAILURE);
}

std::cout << "Segfault handler installed." << std::endl;

init = 1;
}

int rpc_open()
{
set_segfault_handlers();

sigsetjmp(catch_segfault, 1);

if (pthread_mutex_lock(&conn_mutex) < 0)
return -1;

Expand Down Expand Up @@ -220,6 +298,39 @@ int rpc_read(const int index, void *data, size_t size)
return n;
}

void allocate_unified_mem_pointer(const int index, void *dev_ptr, size_t size)
{
// allocate new space for pointer mapping
conns[index].unified_devices.insert({ dev_ptr, size });
}

void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind)
{
for (const auto & [ ptr, sz ] : conns[index].unified_devices) {
size_t size = reinterpret_cast<size_t>(sz);

// ptr is the same on both host/device
cudaError_t res = cudaMemcpy(ptr, ptr, size, kind);
if (res != cudaSuccess) {
std::cerr << "cudaMemcpy failed :" << cudaGetErrorString(res) << std::endl;
} else {
std::cout << "Successfully copied " << size << " bytes" << std::endl;
}
}
}

void maybe_free_unified_mem(const int index, void *ptr)
{
for (const auto & [ dev_ptr, sz ] : conns[index].unified_devices) {
size_t size = reinterpret_cast<size_t>(sz);

if (dev_ptr == ptr) {
munmap(dev_ptr, size);
return;
}
}
}

int rpc_end_response(const int index, void *result)
{
if (read(conns[index].connfd, result, sizeof(int)) < 0 ||
Expand Down
2 changes: 2 additions & 0 deletions codegen/annotations.h
Original file line number Diff line number Diff line change
Expand Up @@ -4357,6 +4357,7 @@ cudaError_t cudaOccupancyMaxPotentialClusterSize(int *clusterSize, const void *f
*/
cudaError_t cudaOccupancyMaxActiveClusters(int *numClusters, const void *func, const cudaLaunchConfig_t *launchConfig);
/**
* @disabled
* @param devPtr SEND_RECV
* @param size SEND_ONLY
* @param flags SEND_ONLY
Expand Down Expand Up @@ -4388,6 +4389,7 @@ cudaError_t cudaMallocPitch(void **devPtr, size_t *pitch, size_t width, size_t h
*/
cudaError_t cudaMallocArray(cudaArray_t *array, const struct cudaChannelFormatDesc *desc, size_t width, size_t height, unsigned int flags);
/**
* @disabled
* @param devPtr SEND_ONLY
*/
cudaError_t cudaFree(void *devPtr);
Expand Down
2 changes: 2 additions & 0 deletions codegen/codegen.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,11 @@
# a list of manually implemented cuda/nvml functions.
# these are automatically appended to each file; operation order is maintained as well.
MANUAL_IMPLEMENTATIONS = [
"cudaFree",
"cudaMemcpy",
"cudaMemcpyAsync",
"cudaLaunchKernel",
"cudaMallocManaged",
]

@dataclass
Expand Down
29 changes: 2 additions & 27 deletions codegen/gen_client.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9112,20 +9112,6 @@ cudaError_t cudaOccupancyMaxActiveClusters(int* numClusters, const void* func, c
return return_value;
}

cudaError_t cudaMallocManaged(void** devPtr, size_t size, unsigned int flags)
{
cudaError_t return_value;
if (rpc_start_request(0, RPC_cudaMallocManaged) < 0 ||
rpc_write(0, devPtr, sizeof(void*)) < 0 ||
rpc_write(0, &size, sizeof(size_t)) < 0 ||
rpc_write(0, &flags, sizeof(unsigned int)) < 0 ||
rpc_wait_for_response(0) < 0 ||
rpc_read(0, devPtr, sizeof(void*)) < 0 ||
rpc_end_response(0, &return_value) < 0)
return cudaErrorDevicesUnavailable;
return return_value;
}

cudaError_t cudaMalloc(void** devPtr, size_t size)
{
cudaError_t return_value;
Expand Down Expand Up @@ -9183,17 +9169,6 @@ cudaError_t cudaMallocArray(cudaArray_t* array, const struct cudaChannelFormatDe
return return_value;
}

cudaError_t cudaFree(void* devPtr)
{
cudaError_t return_value;
if (rpc_start_request(0, RPC_cudaFree) < 0 ||
rpc_write(0, &devPtr, sizeof(void*)) < 0 ||
rpc_wait_for_response(0) < 0 ||
rpc_end_response(0, &return_value) < 0)
return cudaErrorDevicesUnavailable;
return return_value;
}

cudaError_t cudaFreeHost(void* ptr)
{
cudaError_t return_value;
Expand Down Expand Up @@ -22154,12 +22129,10 @@ std::unordered_map<std::string, void *> functionMap = {
{"cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", (void *)cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags},
{"cudaOccupancyMaxPotentialClusterSize", (void *)cudaOccupancyMaxPotentialClusterSize},
{"cudaOccupancyMaxActiveClusters", (void *)cudaOccupancyMaxActiveClusters},
{"cudaMallocManaged", (void *)cudaMallocManaged},
{"cudaMalloc", (void *)cudaMalloc},
{"cudaMallocHost", (void *)cudaMallocHost},
{"cudaMallocPitch", (void *)cudaMallocPitch},
{"cudaMallocArray", (void *)cudaMallocArray},
{"cudaFree", (void *)cudaFree},
{"cudaFreeHost", (void *)cudaFreeHost},
{"cudaFreeArray", (void *)cudaFreeArray},
{"cudaFreeMipmappedArray", (void *)cudaFreeMipmappedArray},
Expand Down Expand Up @@ -22852,9 +22825,11 @@ std::unordered_map<std::string, void *> functionMap = {
{"cuMemFreeAsync_ptsz", (void *)cuMemFreeAsync},
{"cuMemAllocAsync_ptsz", (void *)cuMemAllocAsync},
{"cuMemAllocFromPoolAsync_ptsz", (void *)cuMemAllocFromPoolAsync},
{"cudaFree", (void *)cudaFree},
{"cudaMemcpy", (void *)cudaMemcpy},
{"cudaMemcpyAsync", (void *)cudaMemcpyAsync},
{"cudaLaunchKernel", (void *)cudaLaunchKernel},
{"cudaMallocManaged", (void *)cudaMallocManaged},
};

void *get_function_pointer(const char *name)
Expand Down
53 changes: 0 additions & 53 deletions codegen/gen_server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19387,35 +19387,6 @@ int handle_cudaOccupancyMaxActiveClusters(void *conn)
return -1;
}

int handle_cudaMallocManaged(void *conn)
{
void* devPtr;
size_t size;
unsigned int flags;
int request_id;
cudaError_t scuda_intercept_result;
if (
rpc_read(conn, &devPtr, sizeof(void*)) < 0 ||
rpc_read(conn, &size, sizeof(size_t)) < 0 ||
rpc_read(conn, &flags, sizeof(unsigned int)) < 0 ||
false)
goto ERROR_0;

request_id = rpc_end_request(conn);
if (request_id < 0)
goto ERROR_0;
scuda_intercept_result = cudaMallocManaged(&devPtr, size, flags);

if (rpc_start_response(conn, request_id) < 0 ||
rpc_write(conn, &devPtr, sizeof(void*)) < 0 ||
rpc_end_response(conn, &scuda_intercept_result) < 0)
goto ERROR_0;

return 0;
ERROR_0:
return -1;
}

int handle_cudaMalloc(void *conn)
{
void* devPtr;
Expand Down Expand Up @@ -19534,30 +19505,6 @@ int handle_cudaMallocArray(void *conn)
return -1;
}

int handle_cudaFree(void *conn)
{
void* devPtr;
int request_id;
cudaError_t scuda_intercept_result;
if (
rpc_read(conn, &devPtr, sizeof(void*)) < 0 ||
false)
goto ERROR_0;

request_id = rpc_end_request(conn);
if (request_id < 0)
goto ERROR_0;
scuda_intercept_result = cudaFree(devPtr);

if (rpc_start_response(conn, request_id) < 0 ||
rpc_end_response(conn, &scuda_intercept_result) < 0)
goto ERROR_0;

return 0;
ERROR_0:
return -1;
}

int handle_cudaFreeHost(void *conn)
{
void* ptr;
Expand Down
44 changes: 42 additions & 2 deletions codegen/manual_client.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ extern int rpc_wait_for_response(const int index);
extern int rpc_read(const int index, void *data, const std::size_t size);
extern int rpc_end_response(const int index, void *return_value);
extern int rpc_close();
void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind);
void* maybe_free_unified_mem(const int index, void *ptr);
extern void allocate_unified_mem_pointer(const int index, void *dev_ptr, size_t size);

#define MAX_FUNCTION_NAME 1024
#define MAX_ARGS 128
Expand Down Expand Up @@ -337,6 +340,8 @@ cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void
{
cudaError_t return_value;

cuda_memcpy_unified_ptrs(0, cudaMemcpyHostToDevice);

// Start the RPC request
int request_id = rpc_start_request(0, RPC_cudaLaunchKernel);
if (request_id < 0)
Expand Down Expand Up @@ -395,6 +400,8 @@ cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void
return cudaErrorDevicesUnavailable;
}

cuda_memcpy_unified_ptrs(0, cudaMemcpyDeviceToHost);

return return_value;
}

Expand Down Expand Up @@ -483,8 +490,6 @@ void parse_ptx_string(void *fatCubin, const char *ptx_string, unsigned long long
if (type_size == 0)
continue;
arg_size = type_size;

std::cout << "arg size: " << arg_size << std::endl;
}
else if (ptx_string[i] == '[')
{
Expand Down Expand Up @@ -705,6 +710,8 @@ extern "C"
{
void *return_value;

std::cout << "calling __cudaRegisterVar" << std::endl;

// Start the RPC request
int request_id = rpc_start_request(0, RPC___cudaRegisterVar);
if (request_id < 0)
Expand Down Expand Up @@ -792,3 +799,36 @@ extern "C"
}
}
}

cudaError_t cudaFree(void* devPtr)
{
cudaError_t return_value;
maybe_free_unified_mem(0, devPtr);

if (rpc_start_request(0, RPC_cudaFree) < 0 ||
rpc_write(0, &devPtr, sizeof(void*)) < 0 ||
rpc_wait_for_response(0) < 0 ||
rpc_end_response(0, &return_value) < 0)
return cudaErrorDevicesUnavailable;

return return_value;
}

cudaError_t cudaMallocManaged(void** devPtr, size_t size, unsigned int flags)
{
void* d_mem;

cudaError_t err = cudaMalloc((void**)&d_mem, size);
if (err != cudaSuccess) {
std::cerr << "cudaMalloc failed: " << cudaGetErrorString(err) << std::endl;
return err;
}

std::cout << "allocated unified device mem " << d_mem << " size: " << size << std::endl;

allocate_unified_mem_pointer(0, d_mem, size);

*devPtr = d_mem;

return cudaSuccess;
}
2 changes: 2 additions & 0 deletions codegen/manual_client.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
#include <cublas_v2.h>
#include <cuda_runtime_api.h>

cudaError_t cudaFree(void* devPtr);
cudaError_t cudaMallocManaged(void** devPtr, size_t size, unsigned int flags);
cudaError_t cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind);
cudaError_t cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream);
Expand Down
Loading

0 comments on commit 6073932

Please sign in to comment.