Skip to content

Commit

Permalink
chore: unified pointer ex
Browse files Browse the repository at this point in the history
  • Loading branch information
brodeynewman committed Dec 18, 2024
1 parent a6683b4 commit fcc8669
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 173 deletions.
3 changes: 2 additions & 1 deletion deploy/Dockerfile.unified
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,12 @@ RUN apt-get install -y cuda-toolkit-12-2
ENV PATH=/usr/local/cuda-12.2/bin:${PATH}
ENV LD_LIBRARY_PATH=/usr/local/cuda-12.2/lib64

ENV SCUDA_SERVER=100.118.7.128
ENV SCUDA_SERVER=71.183.65.76
ENV libscuda_path=/usr/local/lib/libscuda.so

COPY ./libscuda.so /usr/local/lib/libscuda.so
COPY unified.o unified.o
COPY unified_pointer.o unified_pointer.o

COPY start.sh /start.sh
RUN chmod +x /start.sh
Expand Down
2 changes: 1 addition & 1 deletion deploy/start.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ elif [[ "$1" == "cublas" ]]; then
elif [[ "$1" == "unified" ]]; then
echo "Running cublas example..."

LD_PRELOAD="$libscuda_path" /unified.o
LD_PRELOAD="$libscuda_path" /unified_pointer.o
else
echo "Unknown option: $1. Please specify one of: torch | cublas | unified ."
fi
171 changes: 0 additions & 171 deletions test/unified_2.cu

This file was deleted.

71 changes: 71 additions & 0 deletions test/unified_pointer.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#include <iostream>
#include <math.h>

// Bundle of arguments handed to the add() kernel through a single pointer.
// add() reads x[i] and updates y[i] for every i in [0, n).
struct Operation {
    float *x;  // first operand buffer; only read by add()
    float *y;  // second operand buffer; add() stores x[i] + y[i] back into it
    int n;     // loop bound add() uses when indexing x and y
};

// CUDA kernel: element-wise in-place add, y[i] = x[i] + y[i] for i in [0, op->n).
//
// op: pointer to an Operation. The struct itself and its x / y buffers are
//     dereferenced in device code, so all three must be device-accessible
//     (main() in this file allocates them with cudaMallocManaged).
//
// Launch layout: 1-D grid of 1-D blocks. Each thread starts at its global
// index and advances by the total number of launched threads, so any
// grid/block size covers all n elements. No shared memory is used.
//
// Debug output: every launched thread prints x[0] and y[0], and every
// processed element is printed after the add. Device printf is slow and is
// here for debugging only.
__global__ void add(Operation *op) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    // Use %f: the arguments are floats (promoted to double), so the previous
    // %x specifier printed a meaningless integer instead of the value.
    // The n > 0 guard avoids reading element 0 of an empty buffer.
    // Note: the thread with index 0 writes y[0] below without synchronization,
    // so other threads may print y[0] either before or after that update.
    if (op->n > 0) {
        printf("The X is: %f\n", op->x[0]);
        printf("The Y is: %f\n", op->y[0]);
    }

    for (int i = index; i < op->n; i += stride)
    {
        op->y[i] = op->x[i] + op->y[i];
        printf("The value is: %f\n", op->y[i]);
    }
}

// Reports a failed CUDA runtime call on stderr; returns true only for cudaSuccess.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << "CUDA error during " << what << ": "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

// Frees the buffers owned by op and then op itself; null pointers are skipped.
static void releaseOperation(Operation *op) {
    if (op == NULL) {
        return;
    }
    if (op->x != NULL) {
        cudaFree(op->x);
    }
    if (op->y != NULL) {
        cudaFree(op->y);
    }
    cudaFree(op);
}

// Test driver: builds an Operation in unified (managed) memory, runs add() on
// it, prints every result and the maximum deviation from the expected 3.0.
//
// Returns 0 when every CUDA call succeeds and all results match, 1 otherwise.
int main(void) {
    Operation *op = NULL;

    // Allocate Unified Memory -- accessible from CPU or GPU
    if (!cudaOk(cudaMallocManaged(&op, sizeof(Operation)), "cudaMallocManaged(op)")) {
        return 1;
    }
    // Start from null buffers so a partial allocation can be released safely.
    op->x = NULL;
    op->y = NULL;
    op->n = 100;

    if (!cudaOk(cudaMallocManaged(&op->x, op->n * sizeof(float)), "cudaMallocManaged(op->x)") ||
        !cudaOk(cudaMallocManaged(&op->y, op->n * sizeof(float)), "cudaMallocManaged(op->y)")) {
        releaseOperation(op);
        return 1;
    }

    // initialize x and y arrays on the host
    for (int i = 0; i < op->n; i++) {
        op->x[i] = 1.0f;
        op->y[i] = 2.0f;
    }

    // Launch kernel on n elements on the GPU
    int blockSize = 256;
    int numBlocks = (op->n + blockSize - 1) / blockSize;  // ceil(n / blockSize)

    std::cout << "numBlocks: " << numBlocks << std::endl;
    std::cout << "N: " << op->n << std::endl;

    add<<<numBlocks, blockSize>>>(op);

    // A kernel launch returns nothing; configuration errors only show up here.
    if (!cudaOk(cudaGetLastError(), "add kernel launch")) {
        releaseOperation(op);
        return 1;
    }

    // Wait for GPU to finish before accessing on host. Faults raised inside the
    // kernel are reported by this call. After such a fault the managed buffers
    // are not touched again; process exit reclaims them.
    if (!cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize")) {
        return 1;
    }

    // Log results for debugging
    std::cout << "Results (y = x + y):" << std::endl;
    for (int i = 0; i < op->n; i++) {
        std::cout << "y[" << i << "] = " << op->y[i] << " (expected: 3.0)" << std::endl;
    }

    // Check for errors (all values should be 3.0f)
    float maxError = 0.0f;
    for (int i = 0; i < op->n; i++) {
        maxError = fmax(maxError, fabs(op->y[i] - 3.0f));
    }
    std::cout << "Max error: " << maxError << std::endl;

    // Free memory
    releaseOperation(op);

    // 1.0f + 2.0f is exactly representable, so anything beyond a tiny tolerance
    // means the kernel did not produce the expected sums.
    return (maxError <= 1e-5f) ? 0 : 1;
}

0 comments on commit fcc8669

Please sign in to comment.