From 3fee8ec292a800f0f07b94a6a88f14f287a8a8e3 Mon Sep 17 00:00:00 2001
From: "-T.K.-"
Date: Sun, 9 Jun 2024 01:15:03 -0700
Subject: [PATCH] ADD: add abs, neg, multiply, maximum, minimum, and matrixnorm operators

NN_mul now takes two tensors; the previous tensor-by-scalar behavior
moves to the new NN_multiply.
---
 nn/CMakeLists.txt | 12 +++
 nn/inc/nn.h | 6 ++
 nn/inc/nn_abs.h | 26 +++++++
 nn/inc/nn_matrixnorm.h | 21 +++++
 nn/inc/nn_maximum.h | 25 ++++++
 nn/inc/nn_min.h | 1 +
 nn/inc/nn_minimum.h | 24 ++++++
 nn/inc/nn_mul.h | 15 ++--
 nn/inc/nn_multiply.h | 24 ++++++
 nn/inc/nn_neg.h | 25 ++++++
 nn/inc/nn_tensor.h | 63 +++++++++++++++
 nn/src/abs/nn_abs.c | 14 ++++
 nn/src/abs/nn_abs_rvv.c | 25 ++++++
 nn/src/matrixnorm/nn_matrixnorm.c | 16 ++++
 nn/src/matrixnorm/nn_matrixnorm_rvv.c | 28 +++++++
 nn/src/maximum/nn_maximum.c | 19 +++++
 nn/src/maximum/nn_maximum_rvv.c | 31 ++++++++
 nn/src/minimum/nn_minimum.c | 19 +++++
 nn/src/minimum/nn_minimum_rvv.c | 31 ++++++++
 nn/src/mul/nn_mul.c | 15 ++--
 nn/src/mul/nn_mul_rvv.c | 29 ++++---
 nn/src/multiply/nn_multiply.c | 14 ++++
 nn/src/multiply/nn_multiply_rvv.c | 28 +++++++
 nn/src/neg/nn_neg.c | 14 ++++
 nn/src/neg/nn_neg_rvv.c | 25 ++++++
 nn/src/nn_tensor.c | 24 ++----
 test/src/main.c | 111 ++++++++++++++++++++++++++-
 27 files changed, 640 insertions(+), 45 deletions(-)
 create mode 100644 nn/inc/nn_abs.h
 create mode 100644 nn/inc/nn_matrixnorm.h
 create mode 100644 nn/inc/nn_maximum.h
 create mode 100644 nn/inc/nn_minimum.h
 create mode 100644 nn/inc/nn_multiply.h
 create mode 100644 nn/inc/nn_neg.h
 create mode 100644 nn/src/abs/nn_abs.c
 create mode 100644 nn/src/abs/nn_abs_rvv.c
 create mode 100644 nn/src/matrixnorm/nn_matrixnorm.c
 create mode 100644 nn/src/matrixnorm/nn_matrixnorm_rvv.c
 create mode 100644 nn/src/maximum/nn_maximum.c
 create mode 100644 nn/src/maximum/nn_maximum_rvv.c
 create mode 100644 nn/src/minimum/nn_minimum.c
 create mode 100644 nn/src/minimum/nn_minimum_rvv.c
 create mode 100644 nn/src/multiply/nn_multiply.c
 create mode 100644 nn/src/multiply/nn_multiply_rvv.c
 create mode 100644 nn/src/neg/nn_neg.c
 create mode 100644 nn/src/neg/nn_neg_rvv.c

diff --git a/nn/CMakeLists.txt b/nn/CMakeLists.txt
index a23aebc..8d2fa97 100644
--- a/nn/CMakeLists.txt
+++ b/nn/CMakeLists.txt
@@ -8,16 +8,22 @@ set(INCLUDES
 set(SOURCES
   src/nn_tensor.c
   src/nn_print.c
+  src/abs/nn_abs.c
   src/add/nn_add.c
   src/batchnorm2d/nn_batchnorm2d.c
   src/conv2d/nn_conv2d.c
   src/copy/nn_copy.c
   src/linear/nn_linear.c
   src/matmul/nn_matmul.c
+  src/matrixnorm/nn_matrixnorm.c
   src/max/nn_max.c
+  src/maximum/nn_maximum.c
   src/maxpool2d/nn_maxpool2d.c
   src/min/nn_min.c
+  src/minimum/nn_minimum.c
   src/mul/nn_mul.c
+  src/multiply/nn_multiply.c
+  src/neg/nn_neg.c
   src/relu/nn_relu.c
   src/relu6/nn_relu6.c
   src/sub/nn_sub.c
@@ -26,12 +32,18 @@ set(SOURCES
 )
 
 set(RVV_SOURCE
+  src/abs/nn_abs_rvv.c
   src/add/nn_add_rvv.c
   src/linear/nn_linear_rvv.c
   src/matmul/nn_matmul_rvv.c
+  src/matrixnorm/nn_matrixnorm_rvv.c
   src/max/nn_max_rvv.c
+  src/maximum/nn_maximum_rvv.c
   src/min/nn_min_rvv.c
+  src/minimum/nn_minimum_rvv.c
   src/mul/nn_mul_rvv.c
+  src/multiply/nn_multiply_rvv.c
+  src/neg/nn_neg_rvv.c
   src/sub/nn_sub_rvv.c
 )
diff --git a/nn/inc/nn.h b/nn/inc/nn.h
index 69e0f5c..774064f 100644
--- a/nn/inc/nn.h
+++ b/nn/inc/nn.h
@@ -5,15 +5,21 @@
 
 #include "nn_tensor.h"
 #include "nn_print.h"
+#include "nn_abs.h"
 #include "nn_add.h"
 #include "nn_batchnorm2d.h"
 #include "nn_conv2d.h"
 #include "nn_copy.h"
 #include "nn_linear.h"
 #include "nn_matmul.h"
+#include "nn_matrixnorm.h"
 #include "nn_max.h"
+#include "nn_maximum.h"
 #include "nn_min.h"
+#include "nn_minimum.h"
 #include "nn_mul.h"
+#include "nn_multiply.h"
+#include "nn_neg.h"
 #include "nn_relu.h"
 #include "nn_relu6.h"
 #include "nn_sub.h"
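
For orientation before the per-file diffs, a minimal usage sketch of the added API (the _F32 kernels; NN_rand/NN_tensor and the tensor lifecycle follow the test code at the end of this patch, and the 4x6 shape is arbitrary):

    #include "nn.h"

    void operators_demo(void) {
      Tensor *a = NN_rand(2, (size_t[]){4, 6}, DTYPE_F32);
      Tensor *b = NN_rand(2, (size_t[]){4, 6}, DTYPE_F32);
      Tensor *out = NN_tensor(2, (size_t[]){4, 6}, DTYPE_F32, NULL);

      NN_abs_F32(out, a);                 // out_i = |a_i|
      NN_neg_F32(out, a);                 // out_i = -a_i
      NN_maximum_F32(out, a, b);          // element-wise maximum
      NN_minimum_F32(out, a, b);          // element-wise minimum
      NN_mul_F32(out, a, b);              // element-wise product
      NN_multiply_F32(out, a, 2.0f);      // multiply by a scalar
      float norm = NN_matrixNorm_F32(a);  // Frobenius norm
      (void)norm;

      NN_freeTensorData(a);   NN_deleteTensor(a);
      NN_freeTensorData(b);   NN_deleteTensor(b);
      NN_freeTensorData(out); NN_deleteTensor(out);
    }
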
diff --git a/nn/inc/nn_abs.h b/nn/inc/nn_abs.h
new file mode 100644
index 0000000..5cf911e
--- /dev/null
+++ b/nn/inc/nn_abs.h
@@ -0,0 +1,26 @@
+#ifndef __NN_ABS_H
+#define __NN_ABS_H
+
+#include <assert.h>
+#include <math.h>
+
+#include "nn_tensor.h"
+
+
+/**
+ * Computes the absolute value of each element in input.
+ *
+ * out_i = |input_i|
+ *
+ * @param out: the output tensor
+ * @param input: the input tensor
+ */
+void NN_abs(Tensor *out, Tensor *input);
+
+void NN_abs_F32(Tensor *out, Tensor *input);
+
+
+void NN_abs_F32_RVV(Tensor *out, Tensor *input);
+
+
+#endif // __NN_ABS_H
diff --git a/nn/inc/nn_matrixnorm.h b/nn/inc/nn_matrixnorm.h
new file mode 100644
index 0000000..2fe3d4b
--- /dev/null
+++ b/nn/inc/nn_matrixnorm.h
@@ -0,0 +1,21 @@
+#ifndef __NN_MATRIXNORM_H
+#define __NN_MATRIXNORM_H
+
+#include <assert.h>
+#include <math.h>
+
+#include "nn_tensor.h"
+
+
+/**
+ * Computes the Frobenius norm of a matrix.
+ *
+ * @param tensor: the input tensor of shape (m, n)
+ */
+float NN_matrixNorm_F32(Tensor *tensor);
+
+
+float NN_matrixNorm_F32_RVV(Tensor *tensor);
+
+
+#endif // __NN_MATRIXNORM_H
diff --git a/nn/inc/nn_maximum.h b/nn/inc/nn_maximum.h
new file mode 100644
index 0000000..fa210e8
--- /dev/null
+++ b/nn/inc/nn_maximum.h
@@ -0,0 +1,25 @@
+#ifndef __NN_MAXIMUM_H
+#define __NN_MAXIMUM_H
+
+#include <assert.h>
+#include <math.h>
+
+#include "nn_tensor.h"
+
+
+/**
+ * Computes the element-wise maximum of two tensors.
+ *
+ * @param out: the output tensor
+ * @param a: the first input tensor
+ * @param b: the second input tensor
+ */
+void NN_maximum(Tensor *out, Tensor *a, Tensor *b);
+
+void NN_maximum_F32(Tensor *out, Tensor *a, Tensor *b);
+
+
+void NN_maximum_F32_RVV(Tensor *out, Tensor *a, Tensor *b);
+
+
+#endif // __NN_MAXIMUM_H
diff --git a/nn/inc/nn_min.h b/nn/inc/nn_min.h
index ddb625e..165c27b 100644
--- a/nn/inc/nn_min.h
+++ b/nn/inc/nn_min.h
@@ -16,6 +16,7 @@ float NN_min(Tensor *tensor);
 
 float NN_min_F32(Tensor *tensor);
 
+
 float NN_min_F32_RVV(Tensor *tensor);
 
 
diff --git a/nn/inc/nn_minimum.h b/nn/inc/nn_minimum.h
new file mode 100644
index 0000000..7bb0808
--- /dev/null
+++ b/nn/inc/nn_minimum.h
@@ -0,0 +1,24 @@
+#ifndef __NN_MINIMUM_H
+#define __NN_MINIMUM_H
+
+#include <assert.h>
+
+#include "nn_tensor.h"
+
+
+/**
+ * Computes the element-wise minimum of two tensors.
+ *
+ * @param out: the output tensor
+ * @param a: the first input tensor
+ * @param b: the second input tensor
+ */
+void NN_minimum(Tensor *out, Tensor *a, Tensor *b);
+
+void NN_minimum_F32(Tensor *out, Tensor *a, Tensor *b);
+
+
+void NN_minimum_F32_RVV(Tensor *out, Tensor *a, Tensor *b);
+
+
+#endif // __NN_MINIMUM_H
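
NN_maximum and NN_minimum compose into an element-wise clamp. A small sketch (clamp_F32 is a hypothetical helper, not part of this patch; it relies on the kernels tolerating out aliasing an input, which the element-wise loops below do):

    // clamp each x_i into [lo_i, hi_i] (hypothetical helper)
    void clamp_F32(Tensor *out, Tensor *x, Tensor *lo, Tensor *hi) {
      NN_maximum_F32(out, x, lo);    // out_i = max(x_i, lo_i)
      NN_minimum_F32(out, out, hi);  // out_i = min(out_i, hi_i)
    }
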
diff --git a/nn/inc/nn_mul.h b/nn/inc/nn_mul.h
index 00e6e8b..278cffc 100644
--- a/nn/inc/nn_mul.h
+++ b/nn/inc/nn_mul.h
@@ -8,17 +8,20 @@
 
 
 /**
- * Returns the element-wise multiplication of the input tensor with a scalar.
+ * Returns the element-wise multiplication of two tensors.
+ *
+ * out_i = a_i * b_i
  *
  * @param out: the output tensor
- * @param in: the input tensor
- * @param scalar: scalar value
+ * @param a: the first input tensor
+ * @param b: the second input tensor
  */
-void NN_mul(Tensor *out, Tensor *in, float scalar);
+void NN_mul(Tensor *out, Tensor *a, Tensor *b);
+
+void NN_mul_F32(Tensor *out, Tensor *a, Tensor *b);
 
-void NN_mul_F32(Tensor *out, Tensor *in, float scalar);
 
-void NN_mul_F32_RVV(Tensor *out, Tensor *in, float scalar);
+void NN_mul_F32_RVV(Tensor *out, Tensor *a, Tensor *b);
 
 
 #endif // __NN_MUL_H
diff --git a/nn/inc/nn_multiply.h b/nn/inc/nn_multiply.h
new file mode 100644
index 0000000..e63166b
--- /dev/null
+++ b/nn/inc/nn_multiply.h
@@ -0,0 +1,24 @@
+#ifndef __NN_MULTIPLY_H
+#define __NN_MULTIPLY_H
+
+#include <assert.h>
+#include <math.h>
+
+#include "nn_tensor.h"
+
+
+/**
+ * Multiplies each element of the input tensor by a scalar.
+ *
+ * @param out: the output tensor
+ * @param in: the input tensor
+ * @param scalar: scalar value
+ */
+void NN_multiply(Tensor *out, Tensor *in, float scalar);
+
+void NN_multiply_F32(Tensor *out, Tensor *in, float scalar);
+
+void NN_multiply_F32_RVV(Tensor *out, Tensor *in, float scalar);
+
+
+#endif // __NN_MULTIPLY_H
diff --git a/nn/inc/nn_neg.h b/nn/inc/nn_neg.h
new file mode 100644
index 0000000..9fba627
--- /dev/null
+++ b/nn/inc/nn_neg.h
@@ -0,0 +1,25 @@
+#ifndef __NN_NEG_H
+#define __NN_NEG_H
+
+#include <assert.h>
+
+#include "nn_tensor.h"
+
+
+/**
+ * Returns a tensor with the negative of the elements of input.
+ *
+ * out_i = -input_i
+ *
+ * @param out: the output tensor
+ * @param input: the input tensor
+ */
+void NN_neg(Tensor *out, Tensor *input);
+
+void NN_neg_F32(Tensor *out, Tensor *input);
+
+
+void NN_neg_F32_RVV(Tensor *out, Tensor *input);
+
+
+#endif // __NN_NEG_H
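
The split above is the one to keep in mind: NN_mul is now the element-wise (Hadamard) product of two tensors, while the old tensor-by-scalar behavior lives on as NN_multiply:

    NN_mul_F32(out, a, b);           // out_i = a_i * b_i
    NN_multiply_F32(out, a, 10.0f);  // out_i = a_i * 10.0f
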
diff --git a/nn/inc/nn_tensor.h b/nn/inc/nn_tensor.h
index bc64f31..c9d805d 100644
--- a/nn/inc/nn_tensor.h
+++ b/nn/inc/nn_tensor.h
@@ -5,6 +5,7 @@
 #include <stddef.h>
 #include <stdint.h>
 #include <stdlib.h>
+#include <assert.h>
 
 #define MAX_DIMS 4
 
@@ -74,28 +75,57 @@ static inline const char *NN_getDataTypeName(DataType dtype) {
   }
 }
 
+/**
+ * Returns whether the tensor is a scalar
+ *
+ * A scalar is a 1D tensor with a single element, i.e., shape = (1,)
+ *
+ * @param tensor: the target tensor
+ */
 static inline uint8_t NN_isScalar(Tensor *tensor) {
   return tensor->ndim == 1 && tensor->shape[0] == 1;
 }
 
+/**
+ * Returns whether the tensor is a vector
+ *
+ * @param tensor: the target tensor
+ */
 static inline uint8_t NN_isVector(Tensor *tensor) {
   return tensor->ndim == 1;
 }
 
+/**
+ * Returns whether the tensor is a matrix
+ *
+ * @param tensor: the target tensor
+ */
 static inline uint8_t NN_isMatrix(Tensor *tensor) {
   return tensor->ndim == 2;
 }
 
+/**
+ * Returns whether the tensor is a 3D tensor
+ *
+ * @param tensor: the target tensor
+ */
 static inline uint8_t NN_is3D(Tensor *tensor) {
   return tensor->ndim == 3;
 }
 
+/**
+ * Returns whether the tensor is a 4D tensor
+ *
+ * @param tensor: the target tensor
+ */
 static inline uint8_t NN_is4D(Tensor *tensor) {
   return tensor->ndim == 4;
 }
 
 /**
  * Frees the memory allocated for the tensor data
+ *
+ * @param tensor: the target tensor
  */
 static inline void NN_freeTensorData(Tensor *tensor) {
   free(tensor->data);
@@ -103,11 +133,44 @@ static inline void NN_freeTensorData(Tensor *tensor) {
 }
 
 /**
  * Frees the memory allocated for the tensor
+ *
+ * @param tensor: the target tensor
  */
 static inline void NN_deleteTensor(Tensor *tensor) {
   free(tensor);
 }
+/**
+ * Fills the tensor with the specified value.
+ *
+ * @param tensor: the target tensor
+ * @param value: scalar value
+ */
+static inline void NN_fill_F32(Tensor *tensor, float value) {
+  assert(tensor->dtype == DTYPE_F32);
+
+  for (size_t i = 0; i < tensor->size; i += 1) {
+    ((float *)tensor->data)[i] = value;
+  }
+}
+
+static inline void NN_fill_I32(Tensor *tensor, int32_t value) {
+  assert(tensor->dtype == DTYPE_I32);
+
+  for (size_t i = 0; i < tensor->size; i += 1) {
+    ((int32_t *)tensor->data)[i] = value;
+  }
+}
+
+static inline void NN_fill_I8(Tensor *tensor, int8_t value) {
+  assert(tensor->dtype == DTYPE_I8);
+
+  for (size_t i = 0; i < tensor->size; i += 1) {
+    ((int8_t *)tensor->data)[i] = value;
+  }
+}
+
+
 /**
  * Initialize a given tensor
  *
diff --git a/nn/src/abs/nn_abs.c b/nn/src/abs/nn_abs.c
new file mode 100644
index 0000000..bdd4d25
--- /dev/null
+++ b/nn/src/abs/nn_abs.c
@@ -0,0 +1,14 @@
+
+#include "nn_abs.h"
+
+
+void NN_abs_F32(Tensor *out, Tensor *input) {
+  assert(out->ndim == input->ndim);
+  assert(input->dtype == DTYPE_F32);
+  assert(out->dtype == DTYPE_F32);
+  assert(out->size == input->size);
+
+  for (size_t i = 0; i < out->size; i += 1) {
+    ((float *)out->data)[i] = fabsf(((float *)input->data)[i]);
+  }
+}
diff --git a/nn/src/abs/nn_abs_rvv.c b/nn/src/abs/nn_abs_rvv.c
new file mode 100644
index 0000000..360fcd4
--- /dev/null
+++ b/nn/src/abs/nn_abs_rvv.c
@@ -0,0 +1,25 @@
+
+#include "nn_abs.h"
+#include "riscv_vector.h"
+
+void NN_abs_F32_RVV(Tensor *out, Tensor *input) {
+  assert(out->ndim == input->ndim);
+  assert(input->dtype == DTYPE_F32);
+  assert(out->dtype == DTYPE_F32);
+  assert(out->size == input->size);
+
+  float *ptr_out = out->data;
+  float *ptr_in = input->data;
+
+  size_t n = out->size;
+  while (n > 0) {
+    size_t vl = __riscv_vsetvl_e32m1(n);
+    vfloat32m1_t vec_in = __riscv_vle32_v_f32m1(ptr_in, vl);
+    vfloat32m1_t vec_out = __riscv_vfabs_v_f32m1(vec_in, vl);
+    __riscv_vse32_v_f32m1(ptr_out, vec_out, vl);
+    ptr_in += vl;
+    ptr_out += vl;
+    n -= vl;
+  }
+}
+
diff --git a/nn/src/matrixnorm/nn_matrixnorm.c b/nn/src/matrixnorm/nn_matrixnorm.c
new file mode 100644
index 0000000..8b8d077
--- /dev/null
+++ b/nn/src/matrixnorm/nn_matrixnorm.c
@@ -0,0 +1,16 @@
+
+#include "nn_matrixnorm.h"
+
+
+float NN_matrixNorm_F32(Tensor *tensor) {
+  assert(tensor->ndim == 2);
+  assert(tensor->dtype == DTYPE_F32);
+
+  float sum = 0;
+  for (size_t i = 0; i < tensor->shape[0]; i += 1) {
+    for (size_t j = 0; j < tensor->shape[1]; j += 1) {
+      sum += pow(((float *)tensor->data)[i * tensor->shape[1] + j], 2);
+    }
+  }
+  return sqrt(sum);
+}
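
A quick sanity check of the scalar reference: for the 1x2 matrix [3 4], the Frobenius norm is sqrt(3^2 + 4^2) = 5. A sketch, assuming NN_tensor wraps caller-provided storage when the data pointer is non-NULL (the tests later in this patch only pass NULL):

    float data[] = {3.0f, 4.0f};
    Tensor *t = NN_tensor(2, (size_t[]){1, 2}, DTYPE_F32, data);  // assumed to wrap `data`
    float norm = NN_matrixNorm_F32(t);  // sqrt(9 + 16) = 5.0f
    NN_deleteTensor(t);  // `data` is stack-owned, so no NN_freeTensorData here
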
diff --git a/nn/src/matrixnorm/nn_matrixnorm_rvv.c b/nn/src/matrixnorm/nn_matrixnorm_rvv.c
new file mode 100644
index 0000000..169f76d
--- /dev/null
+++ b/nn/src/matrixnorm/nn_matrixnorm_rvv.c
@@ -0,0 +1,28 @@
+
+#include "nn_matrixnorm.h"
+#include "riscv_vector.h"
+
+
+float NN_matrixNorm_F32_RVV(Tensor *tensor) {
+  assert(tensor->ndim == 2);
+  assert(tensor->dtype == DTYPE_F32);
+
+  float *ptr = tensor->data;
+
+  size_t vlmax = __riscv_vsetvlmax_e32m1();
+  vfloat32m1_t vec_zero = __riscv_vfmv_v_f_f32m1(0, vlmax);
+  vfloat32m1_t vec_accumulate = __riscv_vfmv_v_f_f32m1(0, vlmax);
+
+  size_t n = tensor->shape[0] * tensor->shape[1];
+  while (n > 0) {
+    size_t vl = __riscv_vsetvl_e32m1(n);
+    vfloat32m1_t vec_a = __riscv_vle32_v_f32m1(ptr, vl);
+    vec_accumulate = __riscv_vfmacc_vv_f32m1(vec_accumulate, vec_a, vec_a, vl);
+    ptr += vl;
+    n -= vl;
+  }
+  vfloat32m1_t vec_sum = __riscv_vfredusum_vs_f32m1_f32m1(vec_accumulate, vec_zero, vlmax);
+  float sum = __riscv_vfmv_f_s_f32m1_f32(vec_sum);
+
+  return sqrt(sum);
+}
diff --git a/nn/src/maximum/nn_maximum.c b/nn/src/maximum/nn_maximum.c
new file mode 100644
index 0000000..b6e1f6d
--- /dev/null
+++ b/nn/src/maximum/nn_maximum.c
@@ -0,0 +1,19 @@
+
+#include "nn_maximum.h"
+
+
+void NN_maximum_F32(Tensor *out, Tensor *a, Tensor *b) {
+  assert(b->ndim == a->ndim);
+  assert(out->ndim == a->ndim);
+  assert(a->dtype == DTYPE_F32);
+  assert(b->dtype == DTYPE_F32);
+  assert(out->dtype == DTYPE_F32);
+  assert(b->size == a->size);
+  assert(out->size == a->size);
+
+  for (size_t i = 0; i < out->size; i += 1) {
+    float a_val = ((float *)a->data)[i];
+    float b_val = ((float *)b->data)[i];
+    ((float *)out->data)[i] = a_val > b_val ? a_val : b_val;
+  }
+}
diff --git a/nn/src/maximum/nn_maximum_rvv.c b/nn/src/maximum/nn_maximum_rvv.c
new file mode 100644
index 0000000..c0e0c0f
--- /dev/null
+++ b/nn/src/maximum/nn_maximum_rvv.c
@@ -0,0 +1,31 @@
+
+#include "nn_maximum.h"
+#include "riscv_vector.h"
+
+void NN_maximum_F32_RVV(Tensor *out, Tensor *a, Tensor *b) {
+  assert(b->ndim == a->ndim);
+  assert(out->ndim == a->ndim);
+  assert(a->dtype == DTYPE_F32);
+  assert(b->dtype == DTYPE_F32);
+  assert(out->dtype == DTYPE_F32);
+  assert(b->size == a->size);
+  assert(out->size == a->size);
+
+  float *ptr_out = out->data;
+  float *ptr_a = a->data;
+  float *ptr_b = b->data;
+
+  size_t n = out->size;
+  while (n > 0) {
+    size_t vl = __riscv_vsetvl_e32m1(n);
+    vfloat32m1_t vec_a = __riscv_vle32_v_f32m1(ptr_a, vl);
+    vfloat32m1_t vec_b = __riscv_vle32_v_f32m1(ptr_b, vl);
+    vfloat32m1_t vec_out = __riscv_vfmax_vv_f32m1(vec_a, vec_b, vl);
+    __riscv_vse32_v_f32m1(ptr_out, vec_out, vl);
+    ptr_a += vl;
+    ptr_b += vl;
+    ptr_out += vl;
+    n -= vl;
+  }
+}
+
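
Every RVV kernel in this patch uses the same strip-mining loop: __riscv_vsetvl_e32m1 returns how many lanes the hardware will process on this trip (at most n, bounded by VLEN/32), and the pointers advance by that amount until the tail is consumed. The skeleton, extracted for reference:

    while (n > 0) {
      size_t vl = __riscv_vsetvl_e32m1(n);              // lanes this iteration
      vfloat32m1_t v = __riscv_vle32_v_f32m1(src, vl);  // unit-stride load of vl floats
      // ... one vector op on v ...
      __riscv_vse32_v_f32m1(dst, v, vl);                // store vl results
      src += vl;
      dst += vl;
      n -= vl;
    }
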
diff --git a/nn/src/minimum/nn_minimum.c b/nn/src/minimum/nn_minimum.c
new file mode 100644
index 0000000..15f55b0
--- /dev/null
+++ b/nn/src/minimum/nn_minimum.c
@@ -0,0 +1,19 @@
+
+#include "nn_minimum.h"
+
+
+void NN_minimum_F32(Tensor *out, Tensor *a, Tensor *b) {
+  assert(b->ndim == a->ndim);
+  assert(out->ndim == a->ndim);
+  assert(a->dtype == DTYPE_F32);
+  assert(b->dtype == DTYPE_F32);
+  assert(out->dtype == DTYPE_F32);
+  assert(b->size == a->size);
+  assert(out->size == a->size);
+
+  for (size_t i = 0; i < out->size; i += 1) {
+    float a_val = ((float *)a->data)[i];
+    float b_val = ((float *)b->data)[i];
+    ((float *)out->data)[i] = a_val < b_val ? a_val : b_val;
+  }
+}
diff --git a/nn/src/minimum/nn_minimum_rvv.c b/nn/src/minimum/nn_minimum_rvv.c
new file mode 100644
index 0000000..4fade23
--- /dev/null
+++ b/nn/src/minimum/nn_minimum_rvv.c
@@ -0,0 +1,31 @@
+
+#include "nn_minimum.h"
+#include "riscv_vector.h"
+
+void NN_minimum_F32_RVV(Tensor *out, Tensor *a, Tensor *b) {
+  assert(b->ndim == a->ndim);
+  assert(out->ndim == a->ndim);
+  assert(a->dtype == DTYPE_F32);
+  assert(b->dtype == DTYPE_F32);
+  assert(out->dtype == DTYPE_F32);
+  assert(b->size == a->size);
+  assert(out->size == a->size);
+
+  float *ptr_out = out->data;
+  float *ptr_a = a->data;
+  float *ptr_b = b->data;
+
+  size_t n = out->size;
+  while (n > 0) {
+    size_t vl = __riscv_vsetvl_e32m1(n);
+    vfloat32m1_t vec_a = __riscv_vle32_v_f32m1(ptr_a, vl);
+    vfloat32m1_t vec_b = __riscv_vle32_v_f32m1(ptr_b, vl);
+    vfloat32m1_t vec_out = __riscv_vfmin_vv_f32m1(vec_a, vec_b, vl);
+    __riscv_vse32_v_f32m1(ptr_out, vec_out, vl);
+    ptr_a += vl;
+    ptr_b += vl;
+    ptr_out += vl;
+    n -= vl;
+  }
+}
+
diff --git a/nn/src/mul/nn_mul.c b/nn/src/mul/nn_mul.c
index 528770e..ee4d24f 100644
--- a/nn/src/mul/nn_mul.c
+++ b/nn/src/mul/nn_mul.c
@@ -1,14 +1,17 @@
 
 #include "nn_mul.h"
 
 
-void NN_mul_F32(Tensor *out, Tensor *in, float coeff) {
-  assert(out->ndim == in->ndim);
-  assert(in->dtype == DTYPE_F32);
+void NN_mul_F32(Tensor *out, Tensor *a, Tensor *b) {
+  assert(b->ndim == a->ndim);
+  assert(out->ndim == a->ndim);
+  assert(a->dtype == DTYPE_F32);
+  assert(b->dtype == DTYPE_F32);
   assert(out->dtype == DTYPE_F32);
-  assert(out->size == in->size);
+  assert(b->size == a->size);
+  assert(out->size == a->size);
 
-  for (size_t i = 0; i < in->size; i += 1) {
-    ((float *)out->data)[i] = ((float *)in->data)[i] * coeff;
+  for (size_t i = 0; i < out->size; i += 1) {
+    ((float *)out->data)[i] = ((float *)a->data)[i] * ((float *)b->data)[i];
   }
 }
diff --git a/nn/src/mul/nn_mul_rvv.c b/nn/src/mul/nn_mul_rvv.c
index ff7c190..9e21de1 100644
--- a/nn/src/mul/nn_mul_rvv.c
+++ b/nn/src/mul/nn_mul_rvv.c
@@ -2,26 +2,33 @@
 #include "nn_mul.h"
 #include "riscv_vector.h"
 
-void NN_mul_F32_RVV(Tensor *out, Tensor *in, float scalar) {
-  assert(out->ndim == in->ndim);
-  assert(in->dtype == DTYPE_F32);
+void NN_mul_F32_RVV(Tensor *out, Tensor *a, Tensor *b) {
+  assert(b->ndim == a->ndim);
+  assert(out->ndim == a->ndim);
+  assert(a->dtype == DTYPE_F32);
+  assert(b->dtype == DTYPE_F32);
   assert(out->dtype == DTYPE_F32);
+  assert(b->size == a->size);
+  assert(out->size == a->size);
 
-  float *in_ptr = in->data;
   float *out_ptr = out->data;
+  float *a_ptr = a->data;
+  float *b_ptr = b->data;
 
   // TODO: currently only support 2dim
-  assert(in->ndim == 2);
-  assert(out->shape[0] == in->shape[0]);
-  assert(out->shape[1] == in->shape[1]);
+  assert(a->ndim == 2);
+  assert(out->shape[0] == a->shape[0]);
+  assert(out->shape[1] == a->shape[1]);
 
-  size_t n = in->shape[0] * in->shape[1];
+  size_t n = out->shape[0] * out->shape[1];
   while (n > 0) {
     size_t vl = __riscv_vsetvl_e32m1(n);
-    vfloat32m1_t vec_in = __riscv_vle32_v_f32m1(in_ptr, vl);
-    vfloat32m1_t vec_out = __riscv_vfmul_vf_f32m1(vec_in, scalar, vl);
+    vfloat32m1_t vec_a = __riscv_vle32_v_f32m1(a_ptr, vl);
+    vfloat32m1_t vec_b = __riscv_vle32_v_f32m1(b_ptr, vl);
+    vfloat32m1_t vec_out = __riscv_vfmul_vv_f32m1(vec_a, vec_b, vl);
     __riscv_vse32_v_f32m1(out_ptr, vec_out, vl);
-    in_ptr += vl;
+    a_ptr += vl;
+    b_ptr += vl;
     out_ptr += vl;
     n -= vl;
   }
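
The tensor-tensor and tensor-scalar kernels differ only in which multiply intrinsic they pick; excerpted from the kernel above and from nn_multiply_rvv.c below:

    // nn_mul_rvv.c (tensor * tensor): vector-vector form
    vec_out = __riscv_vfmul_vv_f32m1(vec_a, vec_b, vl);
    // nn_multiply_rvv.c (tensor * scalar): vector-scalar form
    vec_out = __riscv_vfmul_vf_f32m1(vec_in, scalar, vl);
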
diff --git a/nn/src/multiply/nn_multiply.c b/nn/src/multiply/nn_multiply.c
new file mode 100644
index 0000000..bd727dd
--- /dev/null
+++ b/nn/src/multiply/nn_multiply.c
@@ -0,0 +1,14 @@
+
+#include "nn_multiply.h"
+
+void NN_multiply_F32(Tensor *out, Tensor *in, float coeff) {
+  assert(out->ndim == in->ndim);
+  assert(in->dtype == DTYPE_F32);
+  assert(out->dtype == DTYPE_F32);
+  assert(out->size == in->size);
+
+  for (size_t i = 0; i < out->size; i += 1) {
+    ((float *)out->data)[i] = ((float *)in->data)[i] * coeff;
+  }
+}
+
diff --git a/nn/src/multiply/nn_multiply_rvv.c b/nn/src/multiply/nn_multiply_rvv.c
new file mode 100644
index 0000000..eced806
--- /dev/null
+++ b/nn/src/multiply/nn_multiply_rvv.c
@@ -0,0 +1,28 @@
+
+#include "nn_multiply.h"
+#include "riscv_vector.h"
+
+void NN_multiply_F32_RVV(Tensor *out, Tensor *in, float scalar) {
+  assert(out->ndim == in->ndim);
+  assert(in->dtype == DTYPE_F32);
+  assert(out->dtype == DTYPE_F32);
+
+  float *in_ptr = in->data;
+  float *out_ptr = out->data;
+
+  // TODO: currently only support 2dim
+  assert(in->ndim == 2);
+  assert(out->shape[0] == in->shape[0]);
+  assert(out->shape[1] == in->shape[1]);
+
+  size_t n = out->shape[0] * out->shape[1];
+  while (n > 0) {
+    size_t vl = __riscv_vsetvl_e32m1(n);
+    vfloat32m1_t vec_in = __riscv_vle32_v_f32m1(in_ptr, vl);
+    vfloat32m1_t vec_out = __riscv_vfmul_vf_f32m1(vec_in, scalar, vl);
+    __riscv_vse32_v_f32m1(out_ptr, vec_out, vl);
+    in_ptr += vl;
+    out_ptr += vl;
+    n -= vl;
+  }
+}
diff --git a/nn/src/neg/nn_neg.c b/nn/src/neg/nn_neg.c
new file mode 100644
index 0000000..4546c38
--- /dev/null
+++ b/nn/src/neg/nn_neg.c
@@ -0,0 +1,14 @@
+
+#include "nn_neg.h"
+
+
+void NN_neg_F32(Tensor *out, Tensor *input) {
+  assert(out->ndim == input->ndim);
+  assert(input->dtype == DTYPE_F32);
+  assert(out->dtype == DTYPE_F32);
+  assert(out->size == input->size);
+
+  for (size_t i = 0; i < out->size; i += 1) {
+    ((float *)out->data)[i] = -((float *)input->data)[i];
+  }
+}
diff --git a/nn/src/neg/nn_neg_rvv.c b/nn/src/neg/nn_neg_rvv.c
new file mode 100644
index 0000000..cf86aca
--- /dev/null
+++ b/nn/src/neg/nn_neg_rvv.c
@@ -0,0 +1,25 @@
+
+#include "nn_neg.h"
+#include "riscv_vector.h"
+
+void NN_neg_F32_RVV(Tensor *out, Tensor *input) {
+  assert(out->ndim == input->ndim);
+  assert(input->dtype == DTYPE_F32);
+  assert(out->dtype == DTYPE_F32);
+  assert(out->size == input->size);
+
+  float *ptr_out = out->data;
+  float *ptr_in = input->data;
+
+  size_t n = out->size;
+  while (n > 0) {
+    size_t vl = __riscv_vsetvl_e32m1(n);
+    vfloat32m1_t vec_in = __riscv_vle32_v_f32m1(ptr_in, vl);
+    vfloat32m1_t vec_out = __riscv_vfneg_v_f32m1(vec_in, vl);
+    __riscv_vse32_v_f32m1(ptr_out, vec_out, vl);
+    ptr_in += vl;
+    ptr_out += vl;
+    n -= vl;
+  }
+}
+
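
NN_zeros and NN_ones are refactored below on top of the NN_fill_* helpers introduced in nn_tensor.h, so each dtype has a single fill loop. The helpers also work on their own, for example:

    Tensor *t = NN_tensor(2, (size_t[]){4, 6}, DTYPE_F32, NULL);
    NN_fill_F32(t, 1.5f);  // every element becomes 1.5f
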
diff --git a/nn/src/nn_tensor.c b/nn/src/nn_tensor.c
index 5bbee81..49176c5 100644
--- a/nn/src/nn_tensor.c
+++ b/nn/src/nn_tensor.c
@@ -38,19 +38,13 @@ Tensor *NN_zeros(size_t ndim, size_t *shape, DataType dtype) {
 
   switch (dtype) {
     case DTYPE_I8:
-      for (size_t i = 0; i < t->size; i += 1) {
-        ((int8_t *)t->data)[i] = 0;
-      }
+      NN_fill_I8(t, 0);
       break;
     case DTYPE_I32:
-      for (size_t i = 0; i < t->size; i += 1) {
-        ((int32_t *)t->data)[i] = 0;
-      }
+      NN_fill_I32(t, 0);
       break;
     case DTYPE_F32:
-      for (size_t i = 0; i < t->size; i += 1) {
-        ((float *)t->data)[i] = 0;
-      }
+      NN_fill_F32(t, 0);
       break;
     default:
       printf("[WARNING] Unsupported data type: %d\n", dtype);
@@ -64,19 +58,13 @@ Tensor *NN_ones(size_t ndim, size_t *shape, DataType dtype) {
 
   switch (dtype) {
     case DTYPE_I8:
-      for (size_t i = 0; i < t->size; i += 1) {
-        ((int8_t *)t->data)[i] = 1;
-      }
+      NN_fill_I8(t, 1);
       break;
     case DTYPE_I32:
-      for (size_t i = 0; i < t->size; i += 1) {
-        ((int32_t *)t->data)[i] = 1;
-      }
+      NN_fill_I32(t, 1);
       break;
     case DTYPE_F32:
-      for (size_t i = 0; i < t->size; i += 1) {
-        ((float *)t->data)[i] = 1;
-      }
+      NN_fill_F32(t, 1);
       break;
     default:
       printf("[WARNING] Unsupported data type: %d\n", dtype);
diff --git a/test/src/main.c b/test/src/main.c
index 98cd596..8ca5cb8 100644
--- a/test/src/main.c
+++ b/test/src/main.c
@@ -144,9 +144,9 @@ int main() {
   Tensor *D = NN_tensor(2, (size_t[]){M, N}, DTYPE_F32, NULL);
 
   printf("mulf:\t\t");
-  NN_mul_F32(C, A, 10.0f);
+  NN_multiply_F32(C, A, 10.0f);
   cycles = READ_CSR("mcycle");
-  NN_mul_F32_RVV(D, A, 10.0f);
+  NN_multiply_F32_RVV(D, A, 10.0f);
   cycles = READ_CSR("mcycle") - cycles;
   printf("%s (%lu)\n", compare_2d(C->data, D->data, M, N) ? "PASS" : "FAIL", cycles);
 
@@ -210,7 +210,24 @@ int main() {
 
   // matneg
   {
+    Tensor *A = NN_rand(2, (size_t[]){M, N}, DTYPE_F32);
+    Tensor *golden = NN_tensor(2, (size_t[]){M, N}, DTYPE_F32, NULL);
+    Tensor *actual = NN_tensor(2, (size_t[]){M, N}, DTYPE_F32, NULL);
 
+    printf("cwiseneg:\t");
+    NN_neg_F32(golden, A);
+    cycles = READ_CSR("mcycle");
+    NN_neg_F32_RVV(actual, A);
+    cycles = READ_CSR("mcycle") - cycles;
+    printf("%s (%lu)\n", compare_2d(golden->data, actual->data, M, N) ? "PASS" : "FAIL", cycles);
+
+    NN_freeTensorData(A);
+    NN_deleteTensor(A);
+
+    NN_freeTensorData(golden);
+    NN_deleteTensor(golden);
+    NN_freeTensorData(actual);
+    NN_deleteTensor(actual);
   }
 
   // matcopy
@@ -220,22 +237,99 @@ int main() {
   }
 
   // cwiseabs
   {
+    Tensor *A = NN_rand(2, (size_t[]){M, N}, DTYPE_F32);
+    Tensor *golden = NN_tensor(2, (size_t[]){M, N}, DTYPE_F32, NULL);
+    Tensor *actual = NN_tensor(2, (size_t[]){M, N}, DTYPE_F32, NULL);
 
+    printf("cwiseabs:\t");
+    NN_abs_F32(golden, A);
+    cycles = READ_CSR("mcycle");
+    NN_abs_F32_RVV(actual, A);
+    cycles = READ_CSR("mcycle") - cycles;
+    printf("%s (%lu)\n", compare_2d(golden->data, actual->data, M, N) ? "PASS" : "FAIL", cycles);
+
+    NN_freeTensorData(A);
+    NN_deleteTensor(A);
+
+    NN_freeTensorData(golden);
+    NN_deleteTensor(golden);
+    NN_freeTensorData(actual);
+    NN_deleteTensor(actual);
   }
 
   // cwisemin
   {
+    Tensor *A = NN_rand(2, (size_t[]){M, N}, DTYPE_F32);
+    Tensor *B = NN_rand(2, (size_t[]){M, N}, DTYPE_F32);
+    Tensor *golden = NN_tensor(2, (size_t[]){M, N}, DTYPE_F32, NULL);
+    Tensor *actual = NN_tensor(2, (size_t[]){M, N}, DTYPE_F32, NULL);
 
+    printf("cwiseminimum:\t");
+    NN_minimum_F32(golden, A, B);
+    cycles = READ_CSR("mcycle");
+    NN_minimum_F32_RVV(actual, A, B);
+    cycles = READ_CSR("mcycle") - cycles;
+    printf("%s (%lu)\n", compare_2d(golden->data, actual->data, M, N) ? "PASS" : "FAIL", cycles);
+
+    NN_freeTensorData(A);
+    NN_deleteTensor(A);
+    NN_freeTensorData(B);
+    NN_deleteTensor(B);
+
+    NN_freeTensorData(golden);
+    NN_deleteTensor(golden);
+    NN_freeTensorData(actual);
+    NN_deleteTensor(actual);
   }
 
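
These tests lean on the compare_2d and float_eq helpers defined earlier in main.c, outside this patch. compare_2d presumably does an element-wise epsilon comparison along these lines (a hypothetical sketch, not the actual definition):

    // hypothetical stand-in for the helper defined elsewhere in main.c
    static uint8_t compare_2d(float *golden, float *actual, size_t h, size_t w) {
      for (size_t i = 0; i < h * w; i += 1) {
        if (fabsf(golden[i] - actual[i]) > 1e-6f) {
          return 0;  // FAIL
        }
      }
      return 1;  // PASS
    }
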
"PASS" : "FAIL", cycles); + NN_freeTensorData(A); + NN_deleteTensor(A); + NN_freeTensorData(B); + NN_deleteTensor(B); + + NN_freeTensorData(golden); + NN_deleteTensor(golden); + NN_freeTensorData(actual); + NN_deleteTensor(actual); } // cwisemul { + Tensor *A = NN_rand(2, (size_t[]){M, N}, DTYPE_F32); + Tensor *B = NN_rand(2, (size_t[]){M, N}, DTYPE_F32); + Tensor *golden = NN_tensor(2, (size_t[]){M, N}, DTYPE_F32, NULL); + Tensor *actual = NN_tensor(2, (size_t[]){M, N}, DTYPE_F32, NULL); + printf("matadd:\t\t"); + NN_mul_F32(golden, A, B); + cycles = READ_CSR("mcycle"); + NN_mul_F32_RVV(actual, A, B); + cycles = READ_CSR("mcycle") - cycles; + printf("%s (%lu)\n", compare_2d(golden->data, actual->data, N, M) ? "PASS" : "FAIL", cycles); + + NN_freeTensorData(A); + NN_deleteTensor(A); + NN_freeTensorData(B); + NN_deleteTensor(B); + + NN_freeTensorData(golden); + NN_deleteTensor(golden); + NN_freeTensorData(actual); + NN_deleteTensor(actual); } // matset @@ -248,9 +342,15 @@ int main() { } - // matNOrm + // matnorm { - + Tensor *A = NN_rand(2, (size_t[]){M, N}, DTYPE_F32); + printf("matnorm:\t"); + float norm_cpu = NN_matrixNorm_F32(A); + cycles = READ_CSR("mcycle"); + float norm_actual = NN_matrixNorm_F32_RVV(A); + cycles = READ_CSR("mcycle") - cycles; + printf("%s (%lu)\n", float_eq(norm_cpu, norm_actual, 1e-6) ? "PASS" : "FAIL", cycles); } // transpose