Skip to content

Commit

Permalink
chore: mmap
Browse files Browse the repository at this point in the history
  • Loading branch information
brodeynewman committed Dec 18, 2024
1 parent 062360c commit a6683b4
Show file tree
Hide file tree
Showing 8 changed files with 312 additions and 173 deletions.
104 changes: 78 additions & 26 deletions client.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,13 @@

#include <unordered_map>

#include <setjmp.h>
#include <signal.h>
#include <csignal>
#include <cstdlib>
#include <cstring>
#include <sys/mman.h>

#include "codegen/gen_client.h"

typedef struct
Expand All @@ -46,8 +53,67 @@ int nconns = 0;

const char *DEFAULT_PORT = "14833";

static int init = 0;
static jmp_buf catch_segfault;
static void* faulting_address = nullptr;

// Attach a host-side mapping to an already-tracked unified allocation.
// Entry layout (see allocate_unified_mem_pointer):
//   [0] = device pointer, [1] = allocation size, [2] = host mapping.
// Called from the SIGSEGV handler once it has mmap'd host memory for a
// faulting device-pointer access.
void add_host_mem_to_devptr_mapping(const int index, void *dev_ptr, void *host_ptr)
{
    for (int i = 0; i < conns[index].mem_idx; i++)
    {
        // slot 0 is the device pointer; slot 2 receives the host mapping
        if (conns[index].unified_mem_pointers[i][0] == dev_ptr)
        {
            conns[index].unified_mem_pointers[i][2] = host_ptr;
            break; // device pointers are unique; no need to scan further
        }
    }
}

// SIGSEGV handler backing the fake unified-memory scheme: the client hands
// raw device pointers to the application, so the first host-side touch of
// one faults here. We map an anonymous page at the faulting address and
// record it as the host mirror for that device pointer; on return the
// kernel retries the faulting instruction against the new mapping.
// NOTE(review): std::cout/perror are not async-signal-safe; acceptable for
// a debug path, but consider write(2) if this fires in production.
static void segfault(int sig, siginfo_t* info, void* unused) {
    faulting_address = info->si_addr;
    std::cout << "Caught segfault at address: " << faulting_address << std::endl;

    size_t page_size = sysconf(_SC_PAGESIZE);
    void* aligned_address = (void*)((uintptr_t)faulting_address & ~(page_size - 1));

    // MAP_FIXED is required for correctness: without it the address is only
    // a hint, and if the kernel places the page elsewhere the retried
    // instruction faults again and we re-enter this handler forever.
    // PROT_EXEC dropped — this is a data page, keep it non-executable.
    void* allocated = mmap(aligned_address, page_size, PROT_READ | PROT_WRITE,
                           MAP_FIXED | MAP_SHARED | MAP_ANONYMOUS, -1, 0);
    if (allocated == MAP_FAILED) {
        perror("Failed to allocate memory at faulting address");
        _exit(1);
    }

    add_host_mem_to_devptr_mapping(0, faulting_address, allocated);

    std::cout << "Allocated and registered memory at address: " << allocated << std::endl;
}

// Install the SIGSEGV handler exactly once per process; later calls are
// no-ops. Exits the process if the handler cannot be registered, since the
// unified-memory emulation cannot work without it.
static void set_segfault_handlers() {
    if (init > 0) {
        return;
    }

    struct sigaction action = {}; // zero-initialized, like the memset it replaces
    action.sa_sigaction = segfault;
    action.sa_flags = SA_SIGINFO; // we need siginfo_t to read the fault address

    if (sigaction(SIGSEGV, &action, nullptr) != 0) {
        perror("sigaction");
        exit(EXIT_FAILURE);
    }

    std::cout << "Segfault handler installed." << std::endl;

    init = 1;
}

int rpc_open()
{
set_segfault_handlers();

sigsetjmp(catch_segfault, 1);

if (pthread_mutex_lock(&conn_mutex) < 0)
return -1;

Expand Down Expand Up @@ -224,37 +290,22 @@ int rpc_read(const int index, void *data, size_t size)
return n;
}

void allocate_unified_mem_pointer(const int index, void *dev_ptr, void *ptr, size_t size)
void allocate_unified_mem_pointer(const int index, void *dev_ptr, size_t size)
{
// Initialize unified_mem_pointers if not already initialized
if (conns[index].mem_idx == 0) {
conns[index].unified_mem_pointers = new void **[5]; // Initial capacity of 5
conns[index].unified_mem_pointers = new void **[10]; // Initial capacity of 5
for (int i = 0; i < 5; ++i) {
conns[index].unified_mem_pointers[i] = nullptr;
}
}

// we need to handle resize here at some point
// // Resize array if current capacity is exceeded
// if (conns[index].mem_idx >= 5) {
// int current_capacity = 5 + conns[index].mem_idx;
// int new_capacity = conns[index].mem_idx + 5;

// void ***new_arr = new void **[new_capacity];
// for (int i = 0; i < new_capacity; ++i) {
// new_arr[i] = (i < conns[index].mem_idx) ? conns[index].unified_mem_pointers[i] : nullptr;
// }

// delete[] conns[index].unified_mem_pointers;
// conns[index].unified_mem_pointers = new_arr;
// }

// allocate new space for pointer mapping
conns[index].unified_mem_pointers[conns[index].mem_idx] = new void *[3];

conns[index].unified_mem_pointers[conns[index].mem_idx][0] = dev_ptr;
conns[index].unified_mem_pointers[conns[index].mem_idx][1] = ptr;
conns[index].unified_mem_pointers[conns[index].mem_idx][2] = reinterpret_cast<void*>(size);
conns[index].unified_mem_pointers[conns[index].mem_idx][1] = reinterpret_cast<void*>(size);
conns[index].unified_mem_pointers[conns[index].mem_idx][2] = nullptr;

conns[index].mem_idx++;
}
Expand All @@ -277,9 +328,9 @@ void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind)
{
for (int i = 0; i < conns[index].mem_idx; i++) {
if (kind == cudaMemcpyHostToDevice) {
size_t size = reinterpret_cast<size_t>(conns[index].unified_mem_pointers[i][2]);
size_t size = reinterpret_cast<size_t>(conns[index].unified_mem_pointers[i][1]);

cudaError_t res = cudaMemcpy(conns[index].unified_mem_pointers[i][0], conns[index].unified_mem_pointers[i][1], size, cudaMemcpyHostToDevice);
cudaError_t res = cudaMemcpy(conns[index].unified_mem_pointers[i][0], conns[index].unified_mem_pointers[i][0], size, cudaMemcpyHostToDevice);

if (res != cudaSuccess) {
std::cerr << "cudaMemcpy failed for index " << i
Expand All @@ -288,9 +339,9 @@ void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind)
std::cout << "Successfully copied " << size << " bytes for index " << i << std::endl;
}
} else {
size_t size = reinterpret_cast<size_t>(conns[index].unified_mem_pointers[i][2]);
size_t size = reinterpret_cast<size_t>(conns[index].unified_mem_pointers[i][1]);

cudaError_t res = cudaMemcpy(conns[index].unified_mem_pointers[i][1], conns[index].unified_mem_pointers[i][0], size, cudaMemcpyDeviceToHost);
cudaError_t res = cudaMemcpy(conns[index].unified_mem_pointers[i][0], conns[index].unified_mem_pointers[i][0], size, cudaMemcpyDeviceToHost);

if (res != cudaSuccess) {
std::cerr << "cudaMemcpy failed for index " << i
Expand All @@ -305,11 +356,12 @@ void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind)
// If `ptr` is a tracked unified-memory device pointer, unmap its host-side
// mirror and return the device pointer so the caller can free the remote
// allocation. Returns nullptr when `ptr` is not a unified allocation.
// NOTE(review): munmap assumes the handler mapped host memory at exactly the
// device-pointer address; if the pointer was never touched host-side there
// is no mapping and munmap simply fails harmlessly — confirm this is intended.
void* maybe_free_unified_mem(const int index, void *ptr)
{
    for (int i = 0; i < conns[index].mem_idx; i++) {
        void *dev_ptr = conns[index].unified_mem_pointers[i][0];
        size_t size = reinterpret_cast<size_t>(conns[index].unified_mem_pointers[i][1]);

        if (dev_ptr == ptr) {
            munmap(dev_ptr, size);
            return dev_ptr;
        }
    }

    // Previously fell off the end with no return value (undefined behavior);
    // report "not a unified allocation" explicitly.
    return nullptr;
}
Expand Down
13 changes: 7 additions & 6 deletions codegen/gen_client.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ extern int rpc_read(const int index, void *data, const std::size_t size);
extern int rpc_end_response(const int index, void *return_value);
extern int rpc_close();
extern void* maybe_free_unified_mem(const int index, void *ptr);
extern void allocate_unified_mem_pointer(const int index, void *dev_ptr, void *ptr, size_t size);
extern void allocate_unified_mem_pointer(const int index, void *dev_ptr, size_t size);

nvmlReturn_t nvmlInit_v2()
{
Expand Down Expand Up @@ -9117,18 +9117,19 @@ cudaError_t cudaOccupancyMaxActiveClusters(int* numClusters, const void* func, c

// Interposed cudaMallocManaged: allocate device memory only and hand the raw
// device pointer back to the application. The first host-side access of that
// pointer faults, and the SIGSEGV handler (client.cpp) maps host memory at
// the faulting address to back it.
// NOTE(review): `flags` (cudaMemAttachGlobal/cudaMemAttachHost) is currently
// ignored — confirm callers never rely on attach-host semantics.
cudaError_t cudaMallocManaged(void** devPtr, size_t size, unsigned int flags)
{
    void* d_mem = nullptr;

    cudaError_t err = cudaMalloc((void**)&d_mem, size);
    if (err != cudaSuccess) {
        std::cerr << "cudaMalloc failed: " << cudaGetErrorString(err) << std::endl;
        return err;
    }

    std::cout << "allocated unified device mem " << d_mem << std::endl;

    // Record the allocation so memcpy/free interposers and the segfault
    // handler can find it later.
    allocate_unified_mem_pointer(0, d_mem, size);

    *devPtr = d_mem;

    return cudaSuccess;
}
Expand Down
27 changes: 5 additions & 22 deletions codegen/manual_client.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -419,28 +419,9 @@ cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void

for (int i = 0; i < f->arg_count; ++i)
{
// convert each to a void pointer so that we can map it back to...
// our origional host pointers.
void *arg_ptr = *reinterpret_cast<void**>(args[i]);
void *maybe_ptr = maybe_get_cached_arg_ptr(0, arg_ptr);

// Hardconding 24 bytes for now for the unified memory case. Will remove before merge!
if (maybe_ptr != 0)
{
int size = 24;
std::cout << "writing dynamic pointer " << maybe_ptr << std::endl;
if (rpc_write(0, &size, sizeof(int)) < 0 ||
rpc_write(0, maybe_ptr, size) < 0)
return cudaErrorDevicesUnavailable;
}
else
{
int size = 24;
std::cout << "writing original pointer" << std::endl;
if (rpc_write(0, &size, sizeof(int)) < 0 ||
rpc_write(0, args[i], size) < 0)
return cudaErrorDevicesUnavailable;
}
if (rpc_write(0, &f->arg_sizes[i], sizeof(int)) < 0 ||
rpc_write(0, args[i], f->arg_sizes[i]) < 0)
return cudaErrorDevicesUnavailable;
}

if (rpc_wait_for_response(0) < 0)
Expand Down Expand Up @@ -763,6 +744,8 @@ extern "C"
{
void *return_value;

std::cout << "calling __cudaRegisterVar" << std::endl;

// Start the RPC request
int request_id = rpc_start_request(0, RPC___cudaRegisterVar);
if (request_id < 0)
Expand Down
36 changes: 36 additions & 0 deletions deploy/Dockerfile.unified
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Runtime image for the scuda "unified memory" example client.
# Base is Ubuntu 24.04, but the CUDA toolkit is pulled from NVIDIA's
# Ubuntu 22.04 (jammy) repository — hence the extra jammy apt source below.
FROM ubuntu:24.04

# Base build/runtime tooling. The jammy repo is added solely to obtain
# libtinfo5, which the CUDA 12.2 toolchain still links against and which
# Ubuntu 24.04 no longer ships.
RUN apt-get update && apt-get install -y \
build-essential \
wget \
curl \
python3 \
python3-pip \
gnupg \
software-properties-common && \
add-apt-repository 'deb http://archive.ubuntu.com/ubuntu jammy main universe' && \
apt-get update && \
apt-get install -y libtinfo5 && \
rm -rf /var/lib/apt/lists/*

# Register NVIDIA's CUDA apt repository (keyring for the ubuntu2204 feed).
RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb && \
dpkg -i cuda-keyring_1.0-1_all.deb && \
rm cuda-keyring_1.0-1_all.deb && \
apt-get update

RUN apt-get install -y cuda-toolkit-12-2

ENV PATH=/usr/local/cuda-12.2/bin:${PATH}
ENV LD_LIBRARY_PATH=/usr/local/cuda-12.2/lib64

# SCUDA_SERVER: address of the remote GPU server the interposer connects to.
# libscuda_path: consumed by start.sh as the LD_PRELOAD target.
ENV SCUDA_SERVER=100.118.7.128
ENV libscuda_path=/usr/local/lib/libscuda.so

# unified.o is the pre-built test binary (compiled by local.sh from
# test/unified.cu); it lands at /unified.o since the workdir is /.
COPY ./libscuda.so /usr/local/lib/libscuda.so
COPY unified.o unified.o

COPY start.sh /start.sh
RUN chmod +x /start.sh
RUN chmod +x /unified.o

# start.sh dispatches on its first argument; "unified" runs /unified.o
# with the scuda interposer preloaded.
CMD ["/bin/bash", "/start.sh", "unified"]
6 changes: 5 additions & 1 deletion deploy/start.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ elif [[ "$1" == "cublas" ]]; then
echo "Running cublas example..."

LD_PRELOAD="$libscuda_path" /matrixMulCUBLAS
elif [[ "$1" == "unified" ]]; then
echo "Running cublas example..."

LD_PRELOAD="$libscuda_path" /unified.o
else
echo "Unknown option: $1. Please specify 'torch' or 'cublas'."
echo "Unknown option: $1. Please specify one of: torch | cublas | unified ."
fi
1 change: 1 addition & 0 deletions local.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ build() {
nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/cublas_batched.cu -o cublas_batched.o
nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified.cu -o unified.o
nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified_2.cu -o unified_2.o
nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified_linked.cu -o unified_linked.o

if [ ! -f "$libscuda_path" ]; then
echo "libscuda.so not found. build may have failed."
Expand Down
Loading

0 comments on commit a6683b4

Please sign in to comment.