
Commit

chore: unified demo
brodeynewman committed Dec 20, 2024
1 parent 5fb85af commit 1eca2d2
Showing 6 changed files with 140 additions and 26 deletions.
22 changes: 4 additions & 18 deletions README.md
@@ -3,30 +3,16 @@
 SCUDA is a GPU over IP bridge allowing GPUs on remote machines to be attached
 to CPU-only machines.
 
-## Demos
+## Demo
 
-### CUBLAS Matrix Multiplication
+### CUBLAS Matrix Multiplication using Unified Memory
 
 The below demo displays a NVIDIA GeForce RTX 4090 running on a remote machine (right pane).
 Left pane is a Mac running a docker container with nvidia utils installed.
 
-The docker container runs this [matrixMulCUBLAS](https://github.com/zchee/cuda-sample/blob/master/0_Simple/matrixMulCUBLAS/matrixMulCUBLAS.cpp) example.
-
-You can view the docker image used [here](./deploy/Dockerfile.cublas-test).
-
-https://github.com/user-attachments/assets/4bf130c5-5544-442f-b1a5-6216255ab499
-
-### Simple torch example
-
-The below demo displays a NVIDIA GeForce RTX 4090 running on a remote machine (right pane).
-Left pane is a Mac running a docker container with nvidia utils installed.
-
-The docker container runs `python3 -c "import torch; print(torch.cuda.is_available())"` to check if cuda is available.
-
-You can view the docker image used [here](./deploy/Dockerfile.torch-test).
-
-https://github.com/user-attachments/assets/035950bb-3cc1-4c73-9ad5-b00871a159ec
+The docker container runs this [matrixMulCUBLAS](./deploy/cublas_unified.o) example. This example not only uses cuBLAS, but also takes advantage of unified memory.
 
+You can view the docker image used [here](./deploy/Dockerfile.unified).
 
 ## Local development
 
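For context on the new demo's key ingredient: with unified memory, `cudaMallocManaged` returns one pointer that is valid on both the host and the device, so a program can initialize inputs and read results without explicit `cudaMemcpy` calls. A minimal standalone sketch of the idea (illustrative only; not part of this commit):

    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void scale(double *x, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) x[i] *= 2.0; // the device writes through the managed pointer
    }

    int main()
    {
        const int n = 4;
        double *x = nullptr;
        cudaMallocManaged(&x, n * sizeof(double)); // one allocation, visible to host and device
        for (int i = 0; i < n; i++) x[i] = i;      // the host initializes it directly
        scale<<<1, n>>>(x, n);
        cudaDeviceSynchronize();                   // wait before touching x on the host again
        for (int i = 0; i < n; i++) printf("%.1f\n", x[i]); // the host reads results directly
        cudaFree(x);
        return 0;
    }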
3 changes: 3 additions & 0 deletions codegen/gen_client.cpp
@@ -19,6 +19,7 @@ extern int rpc_end_request(const int index);
 extern int rpc_wait_for_response(const int index);
 extern int rpc_read(const int index, void *data, const std::size_t size);
 extern int rpc_end_response(const int index, void *return_value);
+void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind);
 extern int rpc_close();
 
 nvmlReturn_t nvmlInit_v2()
@@ -18581,6 +18582,7 @@ cublasStatus_t cublasSgemmBatched_64(cublasHandle_t handle, cublasOperation_t tr
 
 cublasStatus_t cublasDgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const double* alpha, const double* const Aarray[], int lda, const double* const Barray[], int ldb, const double* beta, double* const Carray[], int ldc, int batchCount)
 {
+    cuda_memcpy_unified_ptrs(0, cudaMemcpyHostToDevice);
     cublasStatus_t return_value;
     if (rpc_start_request(0, RPC_cublasDgemmBatched) < 0 ||
         rpc_write(0, &batchCount, sizeof(int)) < 0 ||
@@ -18603,6 +18605,7 @@ cublasStatus_t cublasDgemmBatched(cublasHandle_t handle, cublasOperation_t trans
         rpc_wait_for_response(0) < 0 ||
         rpc_end_response(0, &return_value) < 0)
         return CUBLAS_STATUS_NOT_INITIALIZED;
+    cuda_memcpy_unified_ptrs(0, cudaMemcpyDeviceToHost);
     return return_value;
 }
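These three additions are the crux of unified-memory support over the bridge: before the remote call, `cuda_memcpy_unified_ptrs` pushes the client's view of every managed allocation to the server, and after the response it pulls the results back. A simplified sketch of how such a hook could be shaped, assuming the client records each `cudaMallocManaged` allocation in a registry (`managed_allocations`, `RPC_cudaMemcpy`, and the exact rpc_* usage below are assumptions for illustration, not SCUDA's actual code):

    #include <cstddef>
    #include <vector>
    #include <cuda_runtime.h>

    struct ManagedAlloc { void *ptr; std::size_t size; };
    static std::vector<ManagedAlloc> managed_allocations; // assumed: filled by a cudaMallocManaged hook

    void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind)
    {
        for (auto &a : managed_allocations) {
            if (rpc_start_request(index, RPC_cudaMemcpy) < 0)
                return;
            if (kind == cudaMemcpyHostToDevice) {
                // Push the host's copy of the block before the remote call runs.
                if (rpc_write(index, &a.ptr, sizeof(void *)) < 0 ||
                    rpc_write(index, &a.size, sizeof(std::size_t)) < 0 ||
                    rpc_write(index, a.ptr, a.size) < 0 ||
                    rpc_end_request(index) < 0)
                    return;
            } else {
                // Pull the device's copy of the block back into host memory.
                if (rpc_write(index, &a.ptr, sizeof(void *)) < 0 ||
                    rpc_write(index, &a.size, sizeof(std::size_t)) < 0 ||
                    rpc_wait_for_response(index) < 0 ||
                    rpc_read(index, a.ptr, a.size) < 0)
                    return;
            }
        }
    }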
1 change: 1 addition & 0 deletions deploy/Dockerfile.unified
@@ -29,6 +29,7 @@ ENV libscuda_path=/usr/local/lib/libscuda.so
 COPY ./libscuda.so /usr/local/lib/libscuda.so
 COPY unified.o unified.o
 COPY unified_pointer.o unified_pointer.o
+COPY cublas_unified.o cublas_unified.o
 
 COPY start.sh /start.sh
 RUN chmod +x /start.sh
2 changes: 1 addition & 1 deletion deploy/start.sh
@@ -13,7 +13,7 @@ elif [[ "$1" == "cublas" ]]; then
 elif [[ "$1" == "unified" ]]; then
   echo "Running cublas example..."
 
-  LD_PRELOAD="$libscuda_path" /unified_pointer.o
+  LD_PRELOAD="$libscuda_path" /cublas_unified.o
 else
   echo "Unknown option: $1. Please specify one of: torch | cublas | unified ."
 fi
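The `LD_PRELOAD` line is what makes this work on a CPU-only machine: libscuda.so is loaded ahead of the real CUDA libraries, so the unmodified `cublas_unified.o` binary's CUDA and cuBLAS calls are intercepted and forwarded to the remote GPU.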
22 changes: 15 additions & 7 deletions local.sh
@@ -24,13 +24,6 @@ build() {
 
   echo "building vector file for test..."
 
-  nvcc --cudart=shared -lnvidia-ml -lcuda ./test/vector_add.cu -o vector.o
-  nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn ./test/cudnn.cu -o cudnn.o
-  nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/cublas_batched.cu -o cublas_batched.o
-  nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified.cu -o unified.o
-  nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified_pointer.cu -o unified_pointer.o
-  nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified_linked.cu -o unified_linked.o
-
   if [ ! -f "$libscuda_path" ]; then
     echo "libscuda.so not found. build may have failed."
     exit 1
@@ -231,6 +224,18 @@ test() {
   done
 }
 
+build_tests() {
+  build
+
+  nvcc --cudart=shared -lnvidia-ml -lcuda ./test/vector_add.cu -o vector.o
+  nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn ./test/cudnn.cu -o cudnn.o
+  nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/cublas_batched.cu -o cublas_batched.o
+  nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified.cu -o unified.o
+  nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified_pointer.cu -o unified_pointer.o
+  nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified_linked.cu -o unified_linked.o
+  nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/cublas_unified.cu -o cublas_unified.o
+}
+
 run() {
   build
 
@@ -244,6 +249,9 @@ case "$1" in
   build)
     build
     ;;
+  build_tests)
+    build_tests
+    ;;
   run)
     run
     ;;
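With this split, `./local.sh build` only produces libscuda.so; compiling the sample binaries (including the new `cublas_unified.o`) is now a separate `./local.sh build_tests` step, which runs `build` first.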
116 changes: 116 additions & 0 deletions test/cublas_unified.cu
@@ -0,0 +1,116 @@
#include <algorithm> // std::copy, used below for host-side initialization
#include <cstdio>
#include <cstdlib>
#include <vector>

#include <cublas_v2.h>
#include <cuda_runtime.h>

#include "cublas_utils.h"

using data_type = double;

int main(int argc, char *argv[])
{
    cublasHandle_t cublasH = NULL;
    cudaStream_t stream = NULL;

    const int m = 2;
    const int n = 2;
    const int k = 2;
    const int lda = 2;
    const int ldb = 2;
    const int ldc = 2;
    const int batch_count = 2;

    const std::vector<std::vector<data_type>> A_array = {{1.0, 3.0, 2.0, 4.0},
                                                         {5.0, 7.0, 6.0, 8.0}};
    const std::vector<std::vector<data_type>> B_array = {{5.0, 7.0, 6.0, 8.0},
                                                         {9.0, 11.0, 10.0, 12.0}};
    std::vector<std::vector<data_type>> C_array(batch_count, std::vector<data_type>(m * n));

    const data_type alpha = 1.0;
    const data_type beta = 0.0;

    data_type **d_A_array = nullptr;
    data_type **d_B_array = nullptr;
    data_type **d_C_array = nullptr;

    std::vector<data_type *> d_A(batch_count, nullptr);
    std::vector<data_type *> d_B(batch_count, nullptr);
    std::vector<data_type *> d_C(batch_count, nullptr);

    cublasOperation_t transa = CUBLAS_OP_N;
    cublasOperation_t transb = CUBLAS_OP_N;

    printf("A[0]\n");
    print_matrix(m, k, A_array[0].data(), lda);
    printf("=====\n");

    printf("A[1]\n");
    print_matrix(m, k, A_array[1].data(), lda);
    printf("=====\n");

    printf("B[0]\n");
    print_matrix(k, n, B_array[0].data(), ldb);
    printf("=====\n");

    printf("B[1]\n");
    print_matrix(k, n, B_array[1].data(), ldb);
    printf("=====\n");

    /* Step 1: Create cuBLAS handle, bind a stream */
    CUBLAS_CHECK(cublasCreate(&cublasH));

    CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    CUBLAS_CHECK(cublasSetStream(cublasH, stream));

    /* Step 2: Allocate unified memory */
    CUDA_CHECK(cudaMallocManaged(&d_A_array, sizeof(data_type *) * batch_count));
    CUDA_CHECK(cudaMallocManaged(&d_B_array, sizeof(data_type *) * batch_count));
    CUDA_CHECK(cudaMallocManaged(&d_C_array, sizeof(data_type *) * batch_count));

    for (int i = 0; i < batch_count; i++) {
        CUDA_CHECK(cudaMallocManaged(&d_A[i], sizeof(data_type) * A_array[i].size()));
        CUDA_CHECK(cudaMallocManaged(&d_B[i], sizeof(data_type) * B_array[i].size()));
        CUDA_CHECK(cudaMallocManaged(&d_C[i], sizeof(data_type) * C_array[i].size()));

        // Copy data to unified memory (host-side initialization is sufficient)
        std::copy(A_array[i].begin(), A_array[i].end(), d_A[i]);
        std::copy(B_array[i].begin(), B_array[i].end(), d_B[i]);

        d_A_array[i] = d_A[i];
        d_B_array[i] = d_B[i];
        d_C_array[i] = d_C[i];
    }

    /* Step 3: Compute */
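    /* The managed buffers are handed to cuBLAS directly: the same pointers are
       valid on host and device, and cuBLAS reads them as column-major data. */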
    CUBLAS_CHECK(cublasDgemmBatched(cublasH, transa, transb, m, n, k, &alpha, d_A_array, lda,
                                    d_B_array, ldb, &beta, d_C_array, ldc, batch_count));

    CUDA_CHECK(cudaStreamSynchronize(stream));

    /* Step 4: Verify results */
    printf("C[0]\n");
    print_matrix(m, n, d_C[0], ldc);
    printf("=====\n");

    printf("C[1]\n");
    print_matrix(m, n, d_C[1], ldc);
    printf("=====\n");

    /* Free resources */
    CUDA_CHECK(cudaFree(d_A_array));
    CUDA_CHECK(cudaFree(d_B_array));
    CUDA_CHECK(cudaFree(d_C_array));
    for (int i = 0; i < batch_count; i++) {
        CUDA_CHECK(cudaFree(d_A[i]));
        CUDA_CHECK(cudaFree(d_B[i]));
        CUDA_CHECK(cudaFree(d_C[i]));
    }

    CUBLAS_CHECK(cublasDestroy(cublasH));
    CUDA_CHECK(cudaStreamDestroy(stream));
    CUDA_CHECK(cudaDeviceReset());

    return EXIT_SUCCESS;
}
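For reference, the inputs are column-major, so A[0] = [[1, 2], [3, 4]] and B[0] = [[5, 6], [7, 8]], and the batched GEMM should print C[0] = [[19, 22], [43, 50]] and C[1] = [[111, 122], [151, 166]]. Seeing those values confirms that the managed buffers made the round trip to the remote GPU and back.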
