Skip to content

Commit

Permalink
Add tensor contraction as a demo
Browse files Browse the repository at this point in the history
  • Loading branch information
kchristin22 committed Nov 13, 2024
1 parent 91959f6 commit 93c6929
Showing 1 changed file with 78 additions and 22 deletions.
100 changes: 78 additions & 22 deletions test/CUDA/tensor_demo.cu → demos/CUDA/TensorContraction.cu
Original file line number Diff line number Diff line change
@@ -1,11 +1,25 @@
//--------------------------------------------------------------------*- C++ -*-
// clad - The C++ Clang-based Automatic Differentiator
//
// A demo, describing how to calculate a tensor contraction and its gradient.
//
// author: Christina Koutsou <christinakoutsou22-at-gmail.com>
//----------------------------------------------------------------------------//

// RUN: /path/to/clang -std=c++17 -Xclang -add-plugin -Xclang clad -Xclang -load -Xclang \
// /path/to/libclad.so -I/path/to/clad/include -I/path/to/cuda/include \
// TensorContraction.cu -o TensorContraction \
// --cuda-path=path/to/cuda --cuda-gpu-arch=%cuda_arch \
// -L/path/to/cuda/lib64 -lcudart_static -ldl -lrt -pthread -lm -lstdc++
// RUN: ./TensorContraction

#include "cuda_runtime_api.h"
#include "clad/Differentiator/Differentiator.h"

typedef unsigned long long int size_type;

__device__ void computeStartStep(size_type& A_start, size_type& A_step, size_type& B_start, size_type& B_step, const int idx, const size_type A_dim[3], const size_type B_dim[3], const int contractDims[2]) {
__device__ void computeStartStep(size_type& A_start, size_type& A_step, size_type& B_start, size_type& B_step, const int idx, const size_type A_dim[3], const size_type B_dim[3], const int contractDimA, const int contractDimB) {
size_type A_a, A_b, A_c, B_d, B_e, B_f;
int contractDimA = contractDims[0];
int contractDimB = contractDims[1];

switch (contractDimA) {
case 0:
Expand Down Expand Up @@ -50,18 +64,16 @@ __device__ void computeStartStep(size_type& A_start, size_type& A_step, size_typ
}
}

__global__ void tensorContraction3D(float* C, const float *A, const float *B, const size_type *A_dim, const size_type *B_dim, const int contractDims[2]) {
__global__ void tensorContraction3D(float* C, const float *A, const float *B, const size_type *A_dim, const size_type *B_dim, const int contractDimA, const int contractDimB) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int contractDimA = contractDims[0];
int contractDimB = contractDims[1];

// Each thread computes one element of the output tensor
int totalElements = A_dim[(contractDimA + 1) % 3] * A_dim[(contractDimA + 2) % 3] * B_dim[(contractDimB + 1) % 3] * B_dim[(contractDimB + 2) % 3];
if (idx < totalElements) {
size_type A_start, B_start, A_step, B_step;
size_type A_a, A_b, A_c, B_d, B_e, B_f;

computeStartStep(A_start, A_step, B_start, B_step, idx, A_dim, B_dim, contractDims);
computeStartStep(A_start, A_step, B_start, B_step, idx, A_dim, B_dim, contractDimA, contractDimB);

float sum = 0.0f;
for (int i = 0; i < A_dim[contractDimA]; i++) { // A_dim[contractDimA] == B_dim[contractDimB]
Expand All @@ -72,12 +84,12 @@ __global__ void tensorContraction3D(float* C, const float *A, const float *B, co
}
}

void launchTensorContraction3D(float* C, float* A, float* B, const size_type D1, const size_type D2, const size_type D3, const size_type D4, const size_type D5) {
void launchTensorContraction3D(float* C, const float* A, const float* B, const size_type D1, const size_type D2, const size_type D3, const size_type D4, const size_type D5) {
float *d_A = nullptr, *d_B = nullptr, *d_C = nullptr;

size_type A_size = D1 * D2 * D3 * sizeof(float);
size_type B_size = D3 * D4 * D5 * sizeof(float);
size_type C_size = D1 * D2 * D4 * D5 * sizeof(float);
const size_type A_size = D1 * D2 * D3 * sizeof(float);
const size_type B_size = D3 * D4 * D5 * sizeof(float);
const size_type C_size = D1 * D2 * D4 * D5 * sizeof(float);

// Allocate device memory and copy data from host to device
cudaMalloc(&d_A, A_size);
Expand All @@ -86,22 +98,17 @@ void launchTensorContraction3D(float* C, float* A, float* B, const size_type D1,
cudaMemcpy(d_A, A, A_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, B, B_size, cudaMemcpyHostToDevice);

size_type A_dim[3] = {D1, D2, D3};
size_type B_dim[3] = {D3, D4, D5};
const size_type A_dim[3] = {D1, D2, D3};
const size_type B_dim[3] = {D3, D4, D5};

size_type *d_A_dim = nullptr, *d_B_dim = nullptr;
cudaMalloc(&d_A_dim, 3 * sizeof(size_type));
cudaMalloc(&d_B_dim, 3 * sizeof(size_type));
cudaMemcpy(d_A_dim, A_dim, 3 * sizeof(size_type), cudaMemcpyHostToDevice);
cudaMemcpy(d_B_dim, B_dim, 3 * sizeof(size_type), cudaMemcpyHostToDevice);

int contractDims[2] = {2, 0};
int *d_contractDims = nullptr;
cudaMalloc(&d_contractDims, 2 * sizeof(int));
cudaMemcpy(d_contractDims, contractDims, 2 * sizeof(int), cudaMemcpyHostToDevice);

// Launch the kernel
tensorContraction3D<<<1, 256>>>(d_C, d_A, d_B, d_A_dim, d_B_dim, d_contractDims);
tensorContraction3D<<<1, 256>>>(d_C, d_A, d_B, d_A_dim, d_B_dim, /*contractDimA=*/2, /*contractDimB=*/0);

// Copy the result from device to host
cudaMemcpy(C, d_C, C_size, cudaMemcpyDeviceToHost);
Expand All @@ -112,7 +119,6 @@ void launchTensorContraction3D(float* C, float* A, float* B, const size_type D1,
cudaFree(d_C);
cudaFree(d_A_dim);
cudaFree(d_B_dim);
cudaFree(d_contractDims);
}

int main() {
Expand All @@ -132,8 +138,6 @@ int main() {

float C[D1][D2][D4][D5] = {0}; // Result tensor

launchTensorContraction3D(&C[0][0][0][0], &A[0][0][0], &B[0][0][0], D1, D2, D3, D4, D5);

// Compute the gradient
auto tensor_grad = clad::gradient(launchTensorContraction3D, "C, A, B");

Expand Down Expand Up @@ -195,3 +199,55 @@ int main() {

return 0;
}

// CHECK-EXEC: Result C:
// CHECK-NEXT: 130 140
// CHECK-NEXT: 150 160
// CHECK-NEXT: 170 180
// CHECK-NEXT:
// CHECK-NEXT: 290 316
// CHECK-NEXT: 342 368
// CHECK-NEXT: 394 420
// CHECK-NEXT:
// CHECK-NEXT: 450 492
// CHECK-NEXT: 534 576
// CHECK-NEXT: 618 660
// CHECK-NEXT:
// CHECK-NEXT:
// CHECK-NEXT: 610 668
// CHECK-NEXT: 726 784
// CHECK-NEXT: 842 900
// CHECK-NEXT:
// CHECK-NEXT: 770 844
// CHECK-NEXT: 918 992
// CHECK-NEXT: 1066 1140
// CHECK-NEXT:
// CHECK-NEXT: 930 1020
// CHECK-NEXT: 1110 1200
// CHECK-NEXT: 1290 1380

// CHECK-EXEC: Result C_grad w.r.t. A:
// CHECK-NEXT: 21 57 93 129
// CHECK-NEXT: 21 57 93 129
// CHECK-NEXT: 21 57 93 129
// CHECK-NEXT:
// CHECK-NEXT: 21 57 93 129
// CHECK-NEXT: 21 57 93 129
// CHECK-NEXT: 21 57 93 129
// CHECK-NEXT:
// CHECK-EXEC: Result C_grad w.r.t. B:
// CHECK-NEXT: 66 66
// CHECK-NEXT: 66 66
// CHECK-NEXT: 66 66
// CHECK-NEXT:
// CHECK-NEXT: 72 72
// CHECK-NEXT: 72 72
// CHECK-NEXT: 72 72
// CHECK-NEXT:
// CHECK-NEXT: 78 78
// CHECK-NEXT: 78 78
// CHECK-NEXT: 78 78
// CHECK-NEXT:
// CHECK-NEXT: 84 84
// CHECK-NEXT: 84 84
// CHECK-NEXT: 84 84

0 comments on commit 93c6929

Please sign in to comment.