From 3fee8ec292a800f0f07b94a6a88f14f287a8a8e3 Mon Sep 17 00:00:00 2001
From: "-T.K.-"
Date: Sun, 9 Jun 2024 01:15:03 -0700
Subject: [PATCH] ADD: add abs, neg, multiply, maximum, minimum, and matrixnorm operators

NN_mul now takes two tensors; the previous tensor-by-scalar behavior
moves to the new NN_multiply.
---
 nn/CMakeLists.txt | 12 +++
 nn/inc/nn.h | 6 ++
 nn/inc/nn_abs.h | 26 +++++++
 nn/inc/nn_matrixnorm.h | 21 +++++
 nn/inc/nn_maximum.h | 25 ++++++
 nn/inc/nn_min.h | 1 +
 nn/inc/nn_minimum.h | 24 ++++++
 nn/inc/nn_mul.h | 15 ++--
 nn/inc/nn_multiply.h | 24 ++++++
 nn/inc/nn_neg.h | 25 ++++++
 nn/inc/nn_tensor.h | 63 +++++++++++++++
 nn/src/abs/nn_abs.c | 14 ++++
 nn/src/abs/nn_abs_rvv.c | 25 ++++++
 nn/src/matrixnorm/nn_matrixnorm.c | 16 ++++
 nn/src/matrixnorm/nn_matrixnorm_rvv.c | 28 +++++++
 nn/src/maximum/nn_maximum.c | 19 +++++
 nn/src/maximum/nn_maximum_rvv.c | 31 ++++++++
 nn/src/minimum/nn_minimum.c | 19 +++++
 nn/src/minimum/nn_minimum_rvv.c | 31 ++++++++
 nn/src/mul/nn_mul.c | 15 ++--
 nn/src/mul/nn_mul_rvv.c | 29 ++++---
 nn/src/multiply/nn_multiply.c | 14 ++++
 nn/src/multiply/nn_multiply_rvv.c | 28 +++++++
 nn/src/neg/nn_neg.c | 14 ++++
 nn/src/neg/nn_neg_rvv.c | 25 ++++++
 nn/src/nn_tensor.c | 24 ++----
 test/src/main.c | 111 ++++++++++++++++++++++++++-
 27 files changed, 640 insertions(+), 45 deletions(-)
 create mode 100644 nn/inc/nn_abs.h
 create mode 100644 nn/inc/nn_matrixnorm.h
 create mode 100644 nn/inc/nn_maximum.h
 create mode 100644 nn/inc/nn_minimum.h
 create mode 100644 nn/inc/nn_multiply.h
 create mode 100644 nn/inc/nn_neg.h
 create mode 100644 nn/src/abs/nn_abs.c
 create mode 100644 nn/src/abs/nn_abs_rvv.c
 create mode 100644 nn/src/matrixnorm/nn_matrixnorm.c
 create mode 100644 nn/src/matrixnorm/nn_matrixnorm_rvv.c
 create mode 100644 nn/src/maximum/nn_maximum.c
 create mode 100644 nn/src/maximum/nn_maximum_rvv.c
 create mode 100644 nn/src/minimum/nn_minimum.c
 create mode 100644 nn/src/minimum/nn_minimum_rvv.c
 create mode 100644 nn/src/multiply/nn_multiply.c
 create mode 100644 nn/src/multiply/nn_multiply_rvv.c
 create mode 100644 nn/src/neg/nn_neg.c
 create mode 100644 nn/src/neg/nn_neg_rvv.c

diff --git a/nn/CMakeLists.txt b/nn/CMakeLists.txt
index a23aebc..8d2fa97 100644
--- a/nn/CMakeLists.txt
+++ b/nn/CMakeLists.txt
@@ -8,16 +8,22 @@ set(INCLUDES
 set(SOURCES
   src/nn_tensor.c
   src/nn_print.c
+  src/abs/nn_abs.c
   src/add/nn_add.c
   src/batchnorm2d/nn_batchnorm2d.c
   src/conv2d/nn_conv2d.c
   src/copy/nn_copy.c
   src/linear/nn_linear.c
   src/matmul/nn_matmul.c
+  src/matrixnorm/nn_matrixnorm.c
   src/max/nn_max.c
+  src/maximum/nn_maximum.c
   src/maxpool2d/nn_maxpool2d.c
   src/min/nn_min.c
+  src/minimum/nn_minimum.c
   src/mul/nn_mul.c
+  src/multiply/nn_multiply.c
+  src/neg/nn_neg.c
   src/relu/nn_relu.c
   src/relu6/nn_relu6.c
   src/sub/nn_sub.c
@@ -26,12 +32,18 @@ set(SOURCES
 )
 
 set(RVV_SOURCE
+  src/abs/nn_abs_rvv.c
   src/add/nn_add_rvv.c
   src/linear/nn_linear_rvv.c
   src/matmul/nn_matmul_rvv.c
+  src/matrixnorm/nn_matrixnorm_rvv.c
   src/max/nn_max_rvv.c
+  src/maximum/nn_maximum_rvv.c
   src/min/nn_min_rvv.c
+  src/minimum/nn_minimum_rvv.c
   src/mul/nn_mul_rvv.c
+  src/multiply/nn_multiply_rvv.c
+  src/neg/nn_neg_rvv.c
   src/sub/nn_sub_rvv.c
 )
diff --git a/nn/inc/nn.h b/nn/inc/nn.h
index 69e0f5c..774064f 100644
--- a/nn/inc/nn.h
+++ b/nn/inc/nn.h
@@ -5,15 +5,21 @@
 
 #include "nn_tensor.h"
 #include "nn_print.h"
+#include "nn_abs.h"
 #include "nn_add.h"
 #include "nn_batchnorm2d.h"
 #include "nn_conv2d.h"
 #include "nn_copy.h"
 #include "nn_linear.h"
 #include "nn_matmul.h"
+#include "nn_matrixnorm.h"
 #include "nn_max.h"
+#include "nn_maximum.h"
 #include "nn_min.h"
+#include "nn_minimum.h"
 #include "nn_mul.h"
+#include "nn_multiply.h"
+#include "nn_neg.h"
 #include "nn_relu.h"
 #include "nn_relu6.h"
 #include "nn_sub.h"
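
For orientation before the per-file diffs, a minimal usage sketch of the added API (the _F32 kernels; NN_rand/NN_tensor and the tensor lifecycle follow the test code at the end of this patch, and the 4x6 shape is arbitrary):

    #include "nn.h"

    void operators_demo(void) {
      Tensor *a = NN_rand(2, (size_t[]){4, 6}, DTYPE_F32);
      Tensor *b = NN_rand(2, (size_t[]){4, 6}, DTYPE_F32);
      Tensor *out = NN_tensor(2, (size_t[]){4, 6}, DTYPE_F32, NULL);

      NN_abs_F32(out, a);                 // out_i = |a_i|
      NN_neg_F32(out, a);                 // out_i = -a_i
      NN_maximum_F32(out, a, b);          // element-wise maximum
      NN_minimum_F32(out, a, b);          // element-wise minimum
      NN_mul_F32(out, a, b);              // element-wise product
      NN_multiply_F32(out, a, 2.0f);      // multiply by a scalar
      float norm = NN_matrixNorm_F32(a);  // Frobenius norm
      (void)norm;

      NN_freeTensorData(a);   NN_deleteTensor(a);
      NN_freeTensorData(b);   NN_deleteTensor(b);
      NN_freeTensorData(out); NN_deleteTensor(out);
    }
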
diff --git a/nn/inc/nn_abs.h b/nn/inc/nn_abs.h
new file mode 100644
index 0000000..5cf911e
--- /dev/null
+++ b/nn/inc/nn_abs.h
@@ -0,0 +1,26 @@
+#ifndef __NN_ABS_H
+#define __NN_ABS_H
+
+#include <assert.h>
+#include <math.h>
+
+#include "nn_tensor.h"
+
+
+/**
+ * Computes the absolute value of each element in input.
+ *
+ * out_i = |input_i|
+ *
+ * @param out: the output tensor
+ * @param input: the input tensor
+ */
+void NN_abs(Tensor *out, Tensor *input);
+
+void NN_abs_F32(Tensor *out, Tensor *input);
+
+
+void NN_abs_F32_RVV(Tensor *out, Tensor *input);
+
+
+#endif // __NN_ABS_H
diff --git a/nn/inc/nn_matrixnorm.h b/nn/inc/nn_matrixnorm.h
new file mode 100644
index 0000000..2fe3d4b
--- /dev/null
+++ b/nn/inc/nn_matrixnorm.h
@@ -0,0 +1,21 @@
+#ifndef __NN_MATRIXNORM_H
+#define __NN_MATRIXNORM_H
+
+#include <assert.h>
+#include <math.h>
+
+#include "nn_tensor.h"
+
+
+/**
+ * Computes the Frobenius norm of a matrix.
+ *
+ * @param tensor: the input tensor of shape (m, n)
+ */
+float NN_matrixNorm_F32(Tensor *tensor);
+
+
+float NN_matrixNorm_F32_RVV(Tensor *tensor);
+
+
+#endif // __NN_MATRIXNORM_H
diff --git a/nn/inc/nn_maximum.h b/nn/inc/nn_maximum.h
new file mode 100644
index 0000000..fa210e8
--- /dev/null
+++ b/nn/inc/nn_maximum.h
@@ -0,0 +1,25 @@
+#ifndef __NN_MAXIMUM_H
+#define __NN_MAXIMUM_H
+
+#include <assert.h>
+#include <math.h>
+
+#include "nn_tensor.h"
+
+
+/**
+ * Computes the element-wise maximum of two tensors.
+ *
+ * @param out: the output tensor
+ * @param a: the first input tensor
+ * @param b: the second input tensor
+ */
+void NN_maximum(Tensor *out, Tensor *a, Tensor *b);
+
+void NN_maximum_F32(Tensor *out, Tensor *a, Tensor *b);
+
+
+void NN_maximum_F32_RVV(Tensor *out, Tensor *a, Tensor *b);
+
+
+#endif // __NN_MAXIMUM_H
diff --git a/nn/inc/nn_min.h b/nn/inc/nn_min.h
index ddb625e..165c27b 100644
--- a/nn/inc/nn_min.h
+++ b/nn/inc/nn_min.h
@@ -16,6 +16,7 @@ float NN_min(Tensor *tensor);
 
 float NN_min_F32(Tensor *tensor);
 
+
 float NN_min_F32_RVV(Tensor *tensor);
 
 
diff --git a/nn/inc/nn_minimum.h b/nn/inc/nn_minimum.h
new file mode 100644
index 0000000..7bb0808
--- /dev/null
+++ b/nn/inc/nn_minimum.h
@@ -0,0 +1,24 @@
+#ifndef __NN_MINIMUM_H
+#define __NN_MINIMUM_H
+
+#include <assert.h>
+
+#include "nn_tensor.h"
+
+
+/**
+ * Computes the element-wise minimum of two tensors.
+ *
+ * @param out: the output tensor
+ * @param a: the first input tensor
+ * @param b: the second input tensor
+ */
+void NN_minimum(Tensor *out, Tensor *a, Tensor *b);
+
+void NN_minimum_F32(Tensor *out, Tensor *a, Tensor *b);
+
+
+void NN_minimum_F32_RVV(Tensor *out, Tensor *a, Tensor *b);
+
+
+#endif // __NN_MINIMUM_H
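
NN_maximum and NN_minimum compose into an element-wise clamp. A small sketch (clamp_F32 is a hypothetical helper, not part of this patch; it relies on the kernels tolerating out aliasing an input, which the element-wise loops below do):

    // clamp each x_i into [lo_i, hi_i] (hypothetical helper)
    void clamp_F32(Tensor *out, Tensor *x, Tensor *lo, Tensor *hi) {
      NN_maximum_F32(out, x, lo);    // out_i = max(x_i, lo_i)
      NN_minimum_F32(out, out, hi);  // out_i = min(out_i, hi_i)
    }
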
diff --git a/nn/inc/nn_mul.h b/nn/inc/nn_mul.h
index 00e6e8b..278cffc 100644
--- a/nn/inc/nn_mul.h
+++ b/nn/inc/nn_mul.h
@@ -8,17 +8,20 @@
 
 
 /**
- * Returns the element-wise multiplication of the input tensor with a scalar.
+ * Returns the element-wise multiplication of two tensors.
+ *
+ * out_i = a_i * b_i
  *
  * @param out: the output tensor
- * @param in: the input tensor
- * @param scalar: scalar value
+ * @param a: the first input tensor
+ * @param b: the second input tensor
  */
-void NN_mul(Tensor *out, Tensor *in, float scalar);
+void NN_mul(Tensor *out, Tensor *a, Tensor *b);
+
+void NN_mul_F32(Tensor *out, Tensor *a, Tensor *b);
 
-void NN_mul_F32(Tensor *out, Tensor *in, float scalar);
 
-void NN_mul_F32_RVV(Tensor *out, Tensor *in, float scalar);
+void NN_mul_F32_RVV(Tensor *out, Tensor *a, Tensor *b);
 
 
 #endif // __NN_MUL_H
diff --git a/nn/inc/nn_multiply.h b/nn/inc/nn_multiply.h
new file mode 100644
index 0000000..e63166b
--- /dev/null
+++ b/nn/inc/nn_multiply.h
@@ -0,0 +1,24 @@
+#ifndef __NN_MULTIPLY_H
+#define __NN_MULTIPLY_H
+
+#include <assert.h>
+#include <math.h>
+
+#include "nn_tensor.h"
+
+
+/**
+ * Multiplies each element of the input tensor by a scalar.
+ *
+ * @param out: the output tensor
+ * @param in: the input tensor
+ * @param scalar: scalar value
+ */
+void NN_multiply(Tensor *out, Tensor *in, float scalar);
+
+void NN_multiply_F32(Tensor *out, Tensor *in, float scalar);
+
+void NN_multiply_F32_RVV(Tensor *out, Tensor *in, float scalar);
+
+
+#endif // __NN_MULTIPLY_H
diff --git a/nn/inc/nn_neg.h b/nn/inc/nn_neg.h
new file mode 100644
index 0000000..9fba627
--- /dev/null
+++ b/nn/inc/nn_neg.h
@@ -0,0 +1,25 @@
+#ifndef __NN_NEG_H
+#define __NN_NEG_H
+
+#include <assert.h>
+
+#include "nn_tensor.h"
+
+
+/**
+ * Returns a tensor with the negative of the elements of input.
+ *
+ * out_i = -input_i
+ *
+ * @param out: the output tensor
+ * @param input: the input tensor
+ */
+void NN_neg(Tensor *out, Tensor *input);
+
+void NN_neg_F32(Tensor *out, Tensor *input);
+
+
+void NN_neg_F32_RVV(Tensor *out, Tensor *input);
+
+
+#endif // __NN_NEG_H
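
The split above is the one to keep in mind: NN_mul is now the element-wise (Hadamard) product of two tensors, while the old tensor-by-scalar behavior lives on as NN_multiply:

    NN_mul_F32(out, a, b);           // out_i = a_i * b_i
    NN_multiply_F32(out, a, 10.0f);  // out_i = a_i * 10.0f
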
diff --git a/nn/inc/nn_tensor.h b/nn/inc/nn_tensor.h
index bc64f31..c9d805d 100644
--- a/nn/inc/nn_tensor.h
+++ b/nn/inc/nn_tensor.h
@@ -5,6 +5,7 @@
 #include <stddef.h>
 #include <stdint.h>
 #include <stdlib.h>
+#include <assert.h>
 
 #define MAX_DIMS 4
 
@@ -74,28 +75,57 @@ static inline const char *NN_getDataTypeName(DataType dtype) {
   }
 }
 
+/**
+ * Returns whether the tensor is a scalar
+ *
+ * A scalar is a 1D tensor with a single element, i.e., shape = (1,)
+ *
+ * @param tensor: the target tensor
+ */
 static inline uint8_t NN_isScalar(Tensor *tensor) {
   return tensor->ndim == 1 && tensor->shape[0] == 1;
 }
 
+/**
+ * Returns whether the tensor is a vector
+ *
+ * @param tensor: the target tensor
+ */
 static inline uint8_t NN_isVector(Tensor *tensor) {
   return tensor->ndim == 1;
 }
 
+/**
+ * Returns whether the tensor is a matrix
+ *
+ * @param tensor: the target tensor
+ */
 static inline uint8_t NN_isMatrix(Tensor *tensor) {
   return tensor->ndim == 2;
 }
 
+/**
+ * Returns whether the tensor is a 3D tensor
+ *
+ * @param tensor: the target tensor
+ */
 static inline uint8_t NN_is3D(Tensor *tensor) {
   return tensor->ndim == 3;
 }
 
+/**
+ * Returns whether the tensor is a 4D tensor
+ *
+ * @param tensor: the target tensor
+ */
 static inline uint8_t NN_is4D(Tensor *tensor) {
   return tensor->ndim == 4;
 }
 
 /**
  * Frees the memory allocated for the tensor data
+ *
+ * @param tensor: the target tensor
  */
 static inline void NN_freeTensorData(Tensor *tensor) {
   free(tensor->data);
@@ -103,11 +133,44 @@ static inline void NN_freeTensorData(Tensor *tensor) {
 }
 
 /**
  * Frees the memory allocated for the tensor
+ *
+ * @param tensor: the target tensor
  */
 static inline void NN_deleteTensor(Tensor *tensor) {
   free(tensor);
 }
+/**
+ * Fills the tensor with the specified value.
+ *
+ * @param tensor: the target tensor
+ * @param value: scalar value
+ */
+static inline void NN_fill_F32(Tensor *tensor, float value) {
+  assert(tensor->dtype == DTYPE_F32);
+
+  for (size_t i = 0; i < tensor->size; i += 1) {
+    ((float *)tensor->data)[i] = value;
+  }
+}
+
+static inline void NN_fill_I32(Tensor *tensor, int32_t value) {
+  assert(tensor->dtype == DTYPE_I32);
+
+  for (size_t i = 0; i < tensor->size; i += 1) {
+    ((int32_t *)tensor->data)[i] = value;
+  }
+}
+
+static inline void NN_fill_I8(Tensor *tensor, int8_t value) {
+  assert(tensor->dtype == DTYPE_I8);
+
+  for (size_t i = 0; i < tensor->size; i += 1) {
+    ((int8_t *)tensor->data)[i] = value;
+  }
+}
+
+
 /**
  * Initialize a given tensor
  *
diff --git a/nn/src/abs/nn_abs.c b/nn/src/abs/nn_abs.c
new file mode 100644
index 0000000..bdd4d25
--- /dev/null
+++ b/nn/src/abs/nn_abs.c
@@ -0,0 +1,14 @@
+
+#include "nn_abs.h"
+
+
+void NN_abs_F32(Tensor *out, Tensor *input) {
+  assert(out->ndim == input->ndim);
+  assert(input->dtype == DTYPE_F32);
+  assert(out->dtype == DTYPE_F32);
+  assert(out->size == input->size);
+
+  for (size_t i = 0; i < out->size; i += 1) {
+    ((float *)out->data)[i] = fabsf(((float *)input->data)[i]);
+  }
+}
diff --git a/nn/src/abs/nn_abs_rvv.c b/nn/src/abs/nn_abs_rvv.c
new file mode 100644
index 0000000..360fcd4
--- /dev/null
+++ b/nn/src/abs/nn_abs_rvv.c
@@ -0,0 +1,25 @@
+
+#include "nn_abs.h"
+#include "riscv_vector.h"
+
+void NN_abs_F32_RVV(Tensor *out, Tensor *input) {
+  assert(out->ndim == input->ndim);
+  assert(input->dtype == DTYPE_F32);
+  assert(out->dtype == DTYPE_F32);
+  assert(out->size == input->size);
+
+  float *ptr_out = out->data;
+  float *ptr_in = input->data;
+
+  size_t n = out->size;
+  while (n > 0) {
+    size_t vl = __riscv_vsetvl_e32m1(n);
+    vfloat32m1_t vec_in = __riscv_vle32_v_f32m1(ptr_in, vl);
+    vfloat32m1_t vec_out = __riscv_vfabs_v_f32m1(vec_in, vl);
+    __riscv_vse32_v_f32m1(ptr_out, vec_out, vl);
+    ptr_in += vl;
+    ptr_out += vl;
+    n -= vl;
+  }
+}
+
diff --git a/nn/src/matrixnorm/nn_matrixnorm.c b/nn/src/matrixnorm/nn_matrixnorm.c
new file mode 100644
index 0000000..8b8d077
--- /dev/null
+++ b/nn/src/matrixnorm/nn_matrixnorm.c
@@ -0,0 +1,16 @@
+
+#include "nn_matrixnorm.h"
+
+
+float NN_matrixNorm_F32(Tensor *tensor) {
+  assert(tensor->ndim == 2);
+  assert(tensor->dtype == DTYPE_F32);
+
+  float sum = 0;
+  for (size_t i = 0; i < tensor->shape[0]; i += 1) {
+    for (size_t j = 0; j < tensor->shape[1]; j += 1) {
+      sum += pow(((float *)tensor->data)[i * tensor->shape[1] + j], 2);
+    }
+  }
+  return sqrt(sum);
+}
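
A quick sanity check of the scalar reference: for the 1x2 matrix [3 4], the Frobenius norm is sqrt(3^2 + 4^2) = 5. A sketch, assuming NN_tensor wraps caller-provided storage when the data pointer is non-NULL (the tests later in this patch only pass NULL):

    float data[] = {3.0f, 4.0f};
    Tensor *t = NN_tensor(2, (size_t[]){1, 2}, DTYPE_F32, data);  // assumed to wrap `data`
    float norm = NN_matrixNorm_F32(t);  // sqrt(9 + 16) = 5.0f
    NN_deleteTensor(t);  // `data` is stack-owned, so no NN_freeTensorData here
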
diff --git a/nn/src/matrixnorm/nn_matrixnorm_rvv.c b/nn/src/matrixnorm/nn_matrixnorm_rvv.c
new file mode 100644
index 0000000..169f76d
--- /dev/null
+++ b/nn/src/matrixnorm/nn_matrixnorm_rvv.c
@@ -0,0 +1,28 @@
+
+#include "nn_matrixnorm.h"
+#include "riscv_vector.h"
+
+
+float NN_matrixNorm_F32_RVV(Tensor *tensor) {
+  assert(tensor->ndim == 2);
+  assert(tensor->dtype == DTYPE_F32);
+
+  float *ptr = tensor->data;
+
+  size_t vlmax = __riscv_vsetvlmax_e32m1();
+  vfloat32m1_t vec_zero = __riscv_vfmv_v_f_f32m1(0, vlmax);
+  vfloat32m1_t vec_accumulate = __riscv_vfmv_v_f_f32m1(0, vlmax);
+
+  size_t n = tensor->shape[0] * tensor->shape[1];
+  while (n > 0) {
+    size_t vl = __riscv_vsetvl_e32m1(n);
+    vfloat32m1_t vec_a = __riscv_vle32_v_f32m1(ptr, vl);
+    vec_accumulate = __riscv_vfmacc_vv_f32m1(vec_accumulate, vec_a, vec_a, vl);
+    ptr += vl;
+    n -= vl;
+  }
+  vfloat32m1_t vec_sum = __riscv_vfredusum_vs_f32m1_f32m1(vec_accumulate, vec_zero, vlmax);
+  float sum = __riscv_vfmv_f_s_f32m1_f32(vec_sum);
+
+  return sqrt(sum);
+}
diff --git a/nn/src/maximum/nn_maximum.c b/nn/src/maximum/nn_maximum.c
new file mode 100644
index 0000000..b6e1f6d
--- /dev/null
+++ b/nn/src/maximum/nn_maximum.c
@@ -0,0 +1,19 @@
+
+#include "nn_maximum.h"
+
+
+void NN_maximum_F32(Tensor *out, Tensor *a, Tensor *b) {
+  assert(b->ndim == a->ndim);
+  assert(out->ndim == a->ndim);
+  assert(a->dtype == DTYPE_F32);
+  assert(b->dtype == DTYPE_F32);
+  assert(out->dtype == DTYPE_F32);
+  assert(b->size == a->size);
+  assert(out->size == a->size);
+
+  for (size_t i = 0; i < out->size; i += 1) {
+    float a_val = ((float *)a->data)[i];
+    float b_val = ((float *)b->data)[i];
+    ((float *)out->data)[i] = a_val > b_val ? a_val : b_val;
+  }
+}
diff --git a/nn/src/maximum/nn_maximum_rvv.c b/nn/src/maximum/nn_maximum_rvv.c
new file mode 100644
index 0000000..c0e0c0f
--- /dev/null
+++ b/nn/src/maximum/nn_maximum_rvv.c
@@ -0,0 +1,31 @@
+
+#include "nn_maximum.h"
+#include "riscv_vector.h"
+
+void NN_maximum_F32_RVV(Tensor *out, Tensor *a, Tensor *b) {
+  assert(b->ndim == a->ndim);
+  assert(out->ndim == a->ndim);
+  assert(a->dtype == DTYPE_F32);
+  assert(b->dtype == DTYPE_F32);
+  assert(out->dtype == DTYPE_F32);
+  assert(b->size == a->size);
+  assert(out->size == a->size);
+
+  float *ptr_out = out->data;
+  float *ptr_a = a->data;
+  float *ptr_b = b->data;
+
+  size_t n = out->size;
+  while (n > 0) {
+    size_t vl = __riscv_vsetvl_e32m1(n);
+    vfloat32m1_t vec_a = __riscv_vle32_v_f32m1(ptr_a, vl);
+    vfloat32m1_t vec_b = __riscv_vle32_v_f32m1(ptr_b, vl);
+    vfloat32m1_t vec_out = __riscv_vfmax_vv_f32m1(vec_a, vec_b, vl);
+    __riscv_vse32_v_f32m1(ptr_out, vec_out, vl);
+    ptr_a += vl;
+    ptr_b += vl;
+    ptr_out += vl;
+    n -= vl;
+  }
+}
+
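
Every RVV kernel in this patch uses the same strip-mining loop: __riscv_vsetvl_e32m1 returns how many lanes the hardware will process on this trip (at most n, bounded by VLEN/32), and the pointers advance by that amount until the tail is consumed. The skeleton, extracted for reference:

    while (n > 0) {
      size_t vl = __riscv_vsetvl_e32m1(n);              // lanes this iteration
      vfloat32m1_t v = __riscv_vle32_v_f32m1(src, vl);  // unit-stride load of vl floats
      // ... one vector op on v ...
      __riscv_vse32_v_f32m1(dst, v, vl);                // store vl results
      src += vl;
      dst += vl;
      n -= vl;
    }
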
diff --git a/nn/src/minimum/nn_minimum.c b/nn/src/minimum/nn_minimum.c
new file mode 100644
index 0000000..15f55b0
--- /dev/null
+++ b/nn/src/minimum/nn_minimum.c
@@ -0,0 +1,19 @@
+
+#include "nn_minimum.h"
+
+
+void NN_minimum_F32(Tensor *out, Tensor *a, Tensor *b) {
+  assert(b->ndim == a->ndim);
+  assert(out->ndim == a->ndim);
+  assert(a->dtype == DTYPE_F32);
+  assert(b->dtype == DTYPE_F32);
+  assert(out->dtype == DTYPE_F32);
+  assert(b->size == a->size);
+  assert(out->size == a->size);
+
+  for (size_t i = 0; i < out->size; i += 1) {
+    float a_val = ((float *)a->data)[i];
+    float b_val = ((float *)b->data)[i];
+    ((float *)out->data)[i] = a_val < b_val ? a_val : b_val;
+  }
+}
diff --git a/nn/src/minimum/nn_minimum_rvv.c b/nn/src/minimum/nn_minimum_rvv.c
new file mode 100644
index 0000000..4fade23
--- /dev/null
+++ b/nn/src/minimum/nn_minimum_rvv.c
@@ -0,0 +1,31 @@
+
+#include "nn_minimum.h"
+#include "riscv_vector.h"
+
+void NN_minimum_F32_RVV(Tensor *out, Tensor *a, Tensor *b) {
+  assert(b->ndim == a->ndim);
+  assert(out->ndim == a->ndim);
+  assert(a->dtype == DTYPE_F32);
+  assert(b->dtype == DTYPE_F32);
+  assert(out->dtype == DTYPE_F32);
+  assert(b->size == a->size);
+  assert(out->size == a->size);
+
+  float *ptr_out = out->data;
+  float *ptr_a = a->data;
+  float *ptr_b = b->data;
+
+  size_t n = out->size;
+  while (n > 0) {
+    size_t vl = __riscv_vsetvl_e32m1(n);
+    vfloat32m1_t vec_a = __riscv_vle32_v_f32m1(ptr_a, vl);
+    vfloat32m1_t vec_b = __riscv_vle32_v_f32m1(ptr_b, vl);
+    vfloat32m1_t vec_out = __riscv_vfmin_vv_f32m1(vec_a, vec_b, vl);
+    __riscv_vse32_v_f32m1(ptr_out, vec_out, vl);
+    ptr_a += vl;
+    ptr_b += vl;
+    ptr_out += vl;
+    n -= vl;
+  }
+}
+
diff --git a/nn/src/mul/nn_mul.c b/nn/src/mul/nn_mul.c
index 528770e..ee4d24f 100644
--- a/nn/src/mul/nn_mul.c
+++ b/nn/src/mul/nn_mul.c
@@ -1,14 +1,17 @@
 
 #include "nn_mul.h"
 
 
-void NN_mul_F32(Tensor *out, Tensor *in, float coeff) {
-  assert(out->ndim == in->ndim);
-  assert(in->dtype == DTYPE_F32);
+void NN_mul_F32(Tensor *out, Tensor *a, Tensor *b) {
+  assert(b->ndim == a->ndim);
+  assert(out->ndim == a->ndim);
+  assert(a->dtype == DTYPE_F32);
+  assert(b->dtype == DTYPE_F32);
   assert(out->dtype == DTYPE_F32);
-  assert(out->size == in->size);
+  assert(b->size == a->size);
+  assert(out->size == a->size);
 
-  for (size_t i = 0; i < in->size; i += 1) {
-    ((float *)out->data)[i] = ((float *)in->data)[i] * coeff;
+  for (size_t i = 0; i < out->size; i += 1) {
+    ((float *)out->data)[i] = ((float *)a->data)[i] * ((float *)b->data)[i];
   }
 }
diff --git a/nn/src/mul/nn_mul_rvv.c b/nn/src/mul/nn_mul_rvv.c
index ff7c190..9e21de1 100644
--- a/nn/src/mul/nn_mul_rvv.c
+++ b/nn/src/mul/nn_mul_rvv.c
@@ -2,26 +2,33 @@
 #include "nn_mul.h"
 #include "riscv_vector.h"
 
-void NN_mul_F32_RVV(Tensor *out, Tensor *in, float scalar) {
-  assert(out->ndim == in->ndim);
-  assert(in->dtype == DTYPE_F32);
+void NN_mul_F32_RVV(Tensor *out, Tensor *a, Tensor *b) {
+  assert(b->ndim == a->ndim);
+  assert(out->ndim == a->ndim);
+  assert(a->dtype == DTYPE_F32);
+  assert(b->dtype == DTYPE_F32);
   assert(out->dtype == DTYPE_F32);
+  assert(b->size == a->size);
+  assert(out->size == a->size);
 
-  float *in_ptr = in->data;
   float *out_ptr = out->data;
+  float *a_ptr = a->data;
+  float *b_ptr = b->data;
 
   // TODO: currently only support 2dim
-  assert(in->ndim == 2);
-  assert(out->shape[0] == in->shape[0]);
-  assert(out->shape[1] == in->shape[1]);
+  assert(a->ndim == 2);
+  assert(out->shape[0] == a->shape[0]);
+  assert(out->shape[1] == a->shape[1]);
 
-  size_t n = in->shape[0] * in->shape[1];
+  size_t n = out->shape[0] * out->shape[1];
   while (n > 0) {
     size_t vl = __riscv_vsetvl_e32m1(n);
-    vfloat32m1_t vec_in = __riscv_vle32_v_f32m1(in_ptr, vl);
-    vfloat32m1_t vec_out = __riscv_vfmul_vf_f32m1(vec_in, scalar, vl);
+    vfloat32m1_t vec_a = __riscv_vle32_v_f32m1(a_ptr, vl);
+    vfloat32m1_t vec_b = __riscv_vle32_v_f32m1(b_ptr, vl);
+    vfloat32m1_t vec_out = __riscv_vfmul_vv_f32m1(vec_a, vec_b, vl);
     __riscv_vse32_v_f32m1(out_ptr, vec_out, vl);
-    in_ptr += vl;
+    a_ptr += vl;
+    b_ptr += vl;
     out_ptr += vl;
     n -= vl;
   }
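
The tensor-tensor and tensor-scalar kernels differ only in which multiply intrinsic they pick; excerpted from the kernel above and from nn_multiply_rvv.c below:

    // nn_mul_rvv.c (tensor * tensor): vector-vector form
    vec_out = __riscv_vfmul_vv_f32m1(vec_a, vec_b, vl);
    // nn_multiply_rvv.c (tensor * scalar): vector-scalar form
    vec_out = __riscv_vfmul_vf_f32m1(vec_in, scalar, vl);
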
diff --git a/nn/src/multiply/nn_multiply.c b/nn/src/multiply/nn_multiply.c
new file mode 100644
index 0000000..bd727dd
--- /dev/null
+++ b/nn/src/multiply/nn_multiply.c
@@ -0,0 +1,14 @@
+
+#include "nn_multiply.h"
+
+void NN_multiply_F32(Tensor *out, Tensor *in, float coeff) {
+  assert(out->ndim == in->ndim);
+  assert(in->dtype == DTYPE_F32);
+  assert(out->dtype == DTYPE_F32);
+  assert(out->size == in->size);
+
+  for (size_t i = 0; i < out->size; i += 1) {
+    ((float *)out->data)[i] = ((float *)in->data)[i] * coeff;
+  }
+}
+
diff --git a/nn/src/multiply/nn_multiply_rvv.c b/nn/src/multiply/nn_multiply_rvv.c
new file mode 100644
index 0000000..eced806
--- /dev/null
+++ b/nn/src/multiply/nn_multiply_rvv.c
@@ -0,0 +1,28 @@
+
+#include "nn_multiply.h"
+#include "riscv_vector.h"
+
+void NN_multiply_F32_RVV(Tensor *out, Tensor *in, float scalar) {
+  assert(out->ndim == in->ndim);
+  assert(in->dtype == DTYPE_F32);
+  assert(out->dtype == DTYPE_F32);
+
+  float *in_ptr = in->data;
+  float *out_ptr = out->data;
+
+  // TODO: currently only support 2dim
+  assert(in->ndim == 2);
+  assert(out->shape[0] == in->shape[0]);
+  assert(out->shape[1] == in->shape[1]);
+
+  size_t n = out->shape[0] * out->shape[1];
+  while (n > 0) {
+    size_t vl = __riscv_vsetvl_e32m1(n);
+    vfloat32m1_t vec_in = __riscv_vle32_v_f32m1(in_ptr, vl);
+    vfloat32m1_t vec_out = __riscv_vfmul_vf_f32m1(vec_in, scalar, vl);
+    __riscv_vse32_v_f32m1(out_ptr, vec_out, vl);
+    in_ptr += vl;
+    out_ptr += vl;
+    n -= vl;
+  }
+}
diff --git a/nn/src/neg/nn_neg.c b/nn/src/neg/nn_neg.c
new file mode 100644
index 0000000..4546c38
--- /dev/null
+++ b/nn/src/neg/nn_neg.c
@@ -0,0 +1,14 @@
+
+#include "nn_neg.h"
+
+
+void NN_neg_F32(Tensor *out, Tensor *input) {
+  assert(out->ndim == input->ndim);
+  assert(input->dtype == DTYPE_F32);
+  assert(out->dtype == DTYPE_F32);
+  assert(out->size == input->size);
+
+  for (size_t i = 0; i < out->size; i += 1) {
+    ((float *)out->data)[i] = -((float *)input->data)[i];
+  }
+}
diff --git a/nn/src/neg/nn_neg_rvv.c b/nn/src/neg/nn_neg_rvv.c
new file mode 100644
index 0000000..cf86aca
--- /dev/null
+++ b/nn/src/neg/nn_neg_rvv.c
@@ -0,0 +1,25 @@
+
+#include "nn_neg.h"
+#include "riscv_vector.h"
+
+void NN_neg_F32_RVV(Tensor *out, Tensor *input) {
+  assert(out->ndim == input->ndim);
+  assert(input->dtype == DTYPE_F32);
+  assert(out->dtype == DTYPE_F32);
+  assert(out->size == input->size);
+
+  float *ptr_out = out->data;
+  float *ptr_in = input->data;
+
+  size_t n = out->size;
+  while (n > 0) {
+    size_t vl = __riscv_vsetvl_e32m1(n);
+    vfloat32m1_t vec_in = __riscv_vle32_v_f32m1(ptr_in, vl);
+    vfloat32m1_t vec_out = __riscv_vfneg_v_f32m1(vec_in, vl);
+    __riscv_vse32_v_f32m1(ptr_out, vec_out, vl);
+    ptr_in += vl;
+    ptr_out += vl;
+    n -= vl;
+  }
+}
+
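
NN_zeros and NN_ones are refactored below on top of the NN_fill_* helpers introduced in nn_tensor.h, so each dtype has a single fill loop. The helpers also work on their own, for example:

    Tensor *t = NN_tensor(2, (size_t[]){4, 6}, DTYPE_F32, NULL);
    NN_fill_F32(t, 1.5f);  // every element becomes 1.5f
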
diff --git a/nn/src/nn_tensor.c b/nn/src/nn_tensor.c
index 5bbee81..49176c5 100644
--- a/nn/src/nn_tensor.c
+++ b/nn/src/nn_tensor.c
@@ -38,19 +38,13 @@ Tensor *NN_zeros(size_t ndim, size_t *shape, DataType dtype) {
 
   switch (dtype) {
     case DTYPE_I8:
-      for (size_t i = 0; i < t->size; i += 1) {
-        ((int8_t *)t->data)[i] = 0;
-      }
+      NN_fill_I8(t, 0);
       break;
     case DTYPE_I32:
-      for (size_t i = 0; i < t->size; i += 1) {
-        ((int32_t *)t->data)[i] = 0;
-      }
+      NN_fill_I32(t, 0);
       break;
     case DTYPE_F32:
-      for (size_t i = 0; i < t->size; i += 1) {
-        ((float *)t->data)[i] = 0;
-      }
+      NN_fill_F32(t, 0);
       break;
     default:
       printf("[WARNING] Unsupported data type: %d\n", dtype);
@@ -64,19 +58,13 @@ Tensor *NN_ones(size_t ndim, size_t *shape, DataType dtype) {
 
   switch (dtype) {
     case DTYPE_I8:
-      for (size_t i = 0; i < t->size; i += 1) {
-        ((int8_t *)t->data)[i] = 1;
-      }
+      NN_fill_I8(t, 1);
       break;
     case DTYPE_I32:
-      for (size_t i = 0; i < t->size; i += 1) {
-        ((int32_t *)t->data)[i] = 1;
-      }
+      NN_fill_I32(t, 1);
       break;
     case DTYPE_F32:
-      for (size_t i = 0; i < t->size; i += 1) {
-        ((float *)t->data)[i] = 1;
-      }
+      NN_fill_F32(t, 1);
       break;
     default:
       printf("[WARNING] Unsupported data type: %d\n", dtype);
diff --git a/test/src/main.c b/test/src/main.c
index 98cd596..8ca5cb8 100644
--- a/test/src/main.c
+++ b/test/src/main.c
@@ -144,9 +144,9 @@ int main() {
   Tensor *D = NN_tensor(2, (size_t[]){M, N}, DTYPE_F32, NULL);
 
   printf("mulf:\t\t");
-  NN_mul_F32(C, A, 10.0f);
+  NN_multiply_F32(C, A, 10.0f);
   cycles = READ_CSR("mcycle");
-  NN_mul_F32_RVV(D, A, 10.0f);
+  NN_multiply_F32_RVV(D, A, 10.0f);
   cycles = READ_CSR("mcycle") - cycles;
   printf("%s (%lu)\n", compare_2d(C->data, D->data, M, N) ? "PASS" : "FAIL", cycles);
 
@@ -210,7 +210,24 @@ int main() {
 
   // matneg
   {
+    Tensor *A = NN_rand(2, (size_t[]){M, N}, DTYPE_F32);
+    Tensor *golden = NN_tensor(2, (size_t[]){M, N}, DTYPE_F32, NULL);
+    Tensor *actual = NN_tensor(2, (size_t[]){M, N}, DTYPE_F32, NULL);
 
+    printf("cwiseneg:\t");
+    NN_neg_F32(golden, A);
+    cycles = READ_CSR("mcycle");
+    NN_neg_F32_RVV(actual, A);
+    cycles = READ_CSR("mcycle") - cycles;
+    printf("%s (%lu)\n", compare_2d(golden->data, actual->data, M, N) ? "PASS" : "FAIL", cycles);
+
+    NN_freeTensorData(A);
+    NN_deleteTensor(A);
+
+    NN_freeTensorData(golden);
+    NN_deleteTensor(golden);
+    NN_freeTensorData(actual);
+    NN_deleteTensor(actual);
   }
 
   // matcopy
@@ -220,22 +237,99 @@ int main() {
   }
 
   // cwiseabs
   {
+    Tensor *A = NN_rand(2, (size_t[]){M, N}, DTYPE_F32);
+    Tensor *golden = NN_tensor(2, (size_t[]){M, N}, DTYPE_F32, NULL);
+    Tensor *actual = NN_tensor(2, (size_t[]){M, N}, DTYPE_F32, NULL);
 
+    printf("cwiseabs:\t");
+    NN_abs_F32(golden, A);
+    cycles = READ_CSR("mcycle");
+    NN_abs_F32_RVV(actual, A);
+    cycles = READ_CSR("mcycle") - cycles;
+    printf("%s (%lu)\n", compare_2d(golden->data, actual->data, M, N) ? "PASS" : "FAIL", cycles);
+
+    NN_freeTensorData(A);
+    NN_deleteTensor(A);
+
+    NN_freeTensorData(golden);
+    NN_deleteTensor(golden);
+    NN_freeTensorData(actual);
+    NN_deleteTensor(actual);
   }
 
   // cwisemin
   {
+    Tensor *A = NN_rand(2, (size_t[]){M, N}, DTYPE_F32);
+    Tensor *B = NN_rand(2, (size_t[]){M, N}, DTYPE_F32);
+    Tensor *golden = NN_tensor(2, (size_t[]){M, N}, DTYPE_F32, NULL);
+    Tensor *actual = NN_tensor(2, (size_t[]){M, N}, DTYPE_F32, NULL);
 
+    printf("cwiseminimum:\t");
+    NN_minimum_F32(golden, A, B);
+    cycles = READ_CSR("mcycle");
+    NN_minimum_F32_RVV(actual, A, B);
+    cycles = READ_CSR("mcycle") - cycles;
+    printf("%s (%lu)\n", compare_2d(golden->data, actual->data, M, N) ? "PASS" : "FAIL", cycles);
+
+    NN_freeTensorData(A);
+    NN_deleteTensor(A);
+    NN_freeTensorData(B);
+    NN_deleteTensor(B);
+
+    NN_freeTensorData(golden);
+    NN_deleteTensor(golden);
+    NN_freeTensorData(actual);
+    NN_deleteTensor(actual);
   }
 
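
These tests lean on the compare_2d and float_eq helpers defined earlier in main.c, outside this patch. compare_2d presumably does an element-wise epsilon comparison along these lines (a hypothetical sketch, not the actual definition):

    // hypothetical stand-in for the helper defined elsewhere in main.c
    static uint8_t compare_2d(float *golden, float *actual, size_t h, size_t w) {
      for (size_t i = 0; i < h * w; i += 1) {
        if (fabsf(golden[i] - actual[i]) > 1e-6f) {
          return 0;  // FAIL
        }
      }
      return 1;  // PASS
    }
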
"PASS" : "FAIL", cycles); + NN_freeTensorData(A); + NN_deleteTensor(A); + NN_freeTensorData(B); + NN_deleteTensor(B); + + NN_freeTensorData(golden); + NN_deleteTensor(golden); + NN_freeTensorData(actual); + NN_deleteTensor(actual); } // cwisemul { + Tensor *A = NN_rand(2, (size_t[]){M, N}, DTYPE_F32); + Tensor *B = NN_rand(2, (size_t[]){M, N}, DTYPE_F32); + Tensor *golden = NN_tensor(2, (size_t[]){M, N}, DTYPE_F32, NULL); + Tensor *actual = NN_tensor(2, (size_t[]){M, N}, DTYPE_F32, NULL); + printf("matadd:\t\t"); + NN_mul_F32(golden, A, B); + cycles = READ_CSR("mcycle"); + NN_mul_F32_RVV(actual, A, B); + cycles = READ_CSR("mcycle") - cycles; + printf("%s (%lu)\n", compare_2d(golden->data, actual->data, N, M) ? "PASS" : "FAIL", cycles); + + NN_freeTensorData(A); + NN_deleteTensor(A); + NN_freeTensorData(B); + NN_deleteTensor(B); + + NN_freeTensorData(golden); + NN_deleteTensor(golden); + NN_freeTensorData(actual); + NN_deleteTensor(actual); } // matset @@ -248,9 +342,15 @@ int main() { } - // matNOrm + // matnorm { - + Tensor *A = NN_rand(2, (size_t[]){M, N}, DTYPE_F32); + printf("matnorm:\t"); + float norm_cpu = NN_matrixNorm_F32(A); + cycles = READ_CSR("mcycle"); + float norm_actual = NN_matrixNorm_F32_RVV(A); + cycles = READ_CSR("mcycle") - cycles; + printf("%s (%lu)\n", float_eq(norm_cpu, norm_actual, 1e-6) ? "PASS" : "FAIL", cycles); } // transpose