diff --git a/nn/CMakeLists.txt b/nn/CMakeLists.txt
index 5a7bab7..d522071 100644
--- a/nn/CMakeLists.txt
+++ b/nn/CMakeLists.txt
@@ -7,12 +7,22 @@ set(INCLUDES
 set(SOURCES
   src/nn_tensor.c
   src/nn_print.c
+  src/add/nn_add.c
+  src/add/nn_add_rvv.c
+
+  src/copy/nn_copy.c
-  src/matmul/nn_matmul.c
-  src/add/nn_add_rvv.c
+  src/matmul/nn_matmul.c
+  src/matmul/nn_matmul_eaglex.c
   src/matmul/nn_matmul_rvv.c
+
+  src/max/nn_max.c
+  src/max/nn_max_rvv.c
+
+  src/min/nn_min.c
+  src/min/nn_min_rvv.c
+
 )
 
 add_library(nn ${SOURCES})
diff --git a/nn/inc/nn.h b/nn/inc/nn.h
index 1a8a0b3..f63a6c3 100644
--- a/nn/inc/nn.h
+++ b/nn/inc/nn.h
@@ -8,6 +8,8 @@
 #include "nn_add.h"
 #include "nn_copy.h"
 #include "nn_matmul.h"
+#include "nn_max.h"
+#include "nn_min.h"
 
 
 // http://elm-chan.org/junk/32bit/binclude.html
diff --git a/nn/inc/nn_linear.h b/nn/inc/nn_linear.h
index 4521b6b..552c463 100644
--- a/nn/inc/nn_linear.h
+++ b/nn/inc/nn_linear.h
@@ -14,8 +14,4 @@
  */
 void NN_linear_F32(Tensor *y, Tensor *x, Tensor *w, Tensor *b);
 
-void NN_linear_I32(Tensor *y, Tensor *x, Tensor *w, Tensor *b);
-
-void NN_linear_I8_I8_I8_I32(Tensor *y, Tensor *x, Tensor *w, Tensor *b);
-
 #endif // __NN_LINEAR_H
diff --git a/nn/inc/nn_matmul.h b/nn/inc/nn_matmul.h
index 4ae4279..0511fce 100644
--- a/nn/inc/nn_matmul.h
+++ b/nn/inc/nn_matmul.h
@@ -26,5 +26,7 @@
 void NN_matmul_I32(Tensor *out, Tensor *a, Tensor *b);
 
 void NN_matmul_F32_RVV(Tensor *out, Tensor *a, Tensor *b);
 
+void NN_matmul_I8_I8_I32_EAGLEX(Tensor *out, Tensor *a, Tensor *b);
+
 #endif // __NN_MATMUL_H
diff --git a/nn/inc/nn_max.h b/nn/inc/nn_max.h
new file mode 100644
index 0000000..17946d6
--- /dev/null
+++ b/nn/inc/nn_max.h
@@ -0,0 +1,27 @@
+#ifndef __NN_MAX_H
+#define __NN_MAX_H
+
+#include <assert.h>
+#include <float.h>
+
+#include "nn_tensor.h"
+
+
+/**
+ * Returns the maximum value of all elements in the input tensor.
+ *
+ * @param t: input tensor
+ */
+float NN_max(Tensor *t);
+
+float NN_max_F32(Tensor *t);
+
+float NN_max_F32_RVV(Tensor *t);
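+
+// Usage sketch (illustrative only; assumes NN_max dispatches on t->dtype):
+//   Tensor *t = NN_rand(2, (size_t[]){8, 8}, DTYPE_F32);
+//   float m = NN_max(t);             // generic entry point
+//   float m_rvv = NN_max_F32_RVV(t); // explicit RVV path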
+
+
+#endif // __NN_MAX_H
diff --git a/nn/inc/nn_min.h b/nn/inc/nn_min.h
new file mode 100644
index 0000000..d668275
--- /dev/null
+++ b/nn/inc/nn_min.h
@@ -0,0 +1,22 @@
+#ifndef __NN_MIN_H
+#define __NN_MIN_H
+
+#include <assert.h>
+#include <float.h>
+
+#include "nn_tensor.h"
+
+
+/**
+ * Returns the minimum value of all elements in the input tensor.
+ *
+ * @param t: input tensor
+ */
+float NN_min(Tensor *t);
+
+float NN_min_F32(Tensor *t);
+
+float NN_min_F32_RVV(Tensor *t);
+
+
+#endif // __NN_MIN_H
diff --git a/nn/inc/nn_tensor.h b/nn/inc/nn_tensor.h
index 200173c..5b8027a 100644
--- a/nn/inc/nn_tensor.h
+++ b/nn/inc/nn_tensor.h
@@ -77,35 +77,80 @@ static inline const char *NN_getDataTypeName(DataType dtype) {
   }
 }
 
+/**
+ * Frees the memory allocated for the tensor data
+ */
 static inline void NN_freeTensorData(Tensor *t) {
   free(t->data);
 }
 
+/**
+ * Frees the memory allocated for the tensor
+ */
 static inline void NN_deleteTensor(Tensor *t) {
   free(t);
 }
 
-
 /**
- * Initialize a tensor
+ * Initialize a given tensor
  *
  * @param ndim: number of dimensions
  * @param shape: shape of tensor
- * @param dtype: DataType
+ * @param dtype: data type
  * @param data: pointer to data, if NULL, the data will be allocated
  */
 void NN_initTensor(Tensor *t, size_t ndim, size_t *shape, DataType dtype, void *data);
 
+/**
+ * Create a new tensor
+ *
+ * @param ndim: number of dimensions
+ * @param shape: shape of tensor
+ * @param dtype: data type
+ * @param data: pointer to data, if NULL, the data will be allocated
+ * @return Tensor
+ */
 Tensor *NN_tensor(size_t ndim, size_t *shape, DataType dtype, void *data);
 
+/**
+ * Returns a tensor filled with the scalar value 0.
+ *
+ * @param ndim: number of dimensions
+ * @param shape: shape of tensor
+ * @param dtype: data type
+ * @return Tensor
+ */
 Tensor *NN_zeros(size_t ndim, size_t *shape, DataType dtype);
 
+/**
+ * Returns a tensor filled with the scalar value 1.
+ *
+ * @param ndim: number of dimensions
+ * @param shape: shape of tensor
+ * @param dtype: data type
+ * @return Tensor
+ */
 Tensor *NN_ones(size_t ndim, size_t *shape, DataType dtype);
 
+/**
+ * Returns a tensor filled with random numbers from a uniform distribution.
+ *
+ * The range of the random number is dependent on the data type:
+ * - For Float32, the range is [0, 1]
+ * - For Int8, the range is [0, 255]
+ * - For Int32, the range is [0, RAND_MAX]
+ *
+ * @param ndim: number of dimensions
+ * @param shape: shape of tensor
+ * @param dtype: data type
+ * @return Tensor
+ */
 Tensor *NN_rand(size_t ndim, size_t *shape, DataType dtype);
 
 /**
- * Convert tensor data type
+ * Returns this tensor cast to the given data type.
+ *
+ * This is a no-op if the tensor is already of the correct type.
  *
  * @param t: input tensor
  * @param dtype: target data type
diff --git a/nn/inc/rv.h b/nn/inc/rv.h
new file mode 100644
index 0000000..8169899
--- /dev/null
+++ b/nn/inc/rv.h
@@ -0,0 +1,100 @@
+/**
+ * @file rv.h
+ * @brief RISC-V Definitions
+ *
+ * This header file provides common definitions and operations for RISC-V core programming.
+ * It includes memory register attributes, bit operation definitions, RISC-V specific definitions,
+ * and common enumerations for state and status values.
+ *
+ * The memory register attributes define volatile permissions for read-only, write-only, and read/write access.
+ * The bit operation definitions provide macros for setting, clearing, reading, and writing specific bits in a register.
+ * The RISC-V specific definitions include macros for reading and writing control and status registers (CSRs),
+ * as well as operations to swap, set, and clear specific bits in a CSR.
+ * The common definitions include enumerations for state values (such as RESET and SET) and status values (such as OK and ERROR).
+ *
+ * @note This file should be included to access RISC-V core-specific definitions and perform common operations.
+ *
+ * @author -T.K.-
+ * @date 2023-05-20
+ */
+
+#ifndef __RV_H
+#define __RV_H
+
+#include <stdint.h>
+#include <stddef.h>
+
+
+/* ================ Memory register attributes ================ */
+#ifdef __cplusplus
+  #define __I   volatile        /** Defines "read only" permissions */
+#else
+  #define __I   volatile const  /** Defines "read only" permissions */
+#endif
+#define __O     volatile        /** Defines "write only" permissions */
+#define __IO    volatile        /** Defines "read / write" permissions */
+
+/* following defines should be used for structure members */
+#define __IM    volatile const  /** Defines "read only" structure member permissions */
+#define __OM    volatile        /** Defines "write only" structure member permissions */
+#define __IOM   volatile        /** Defines "read / write" structure member permissions */
+
+
+/* ================ Bit Operation definitions ================ */
+#define SET_BITS(REG, BIT)                  ((REG) |= (BIT))
+#define CLEAR_BITS(REG, BIT)                ((REG) &= ~(BIT))
+#define READ_BITS(REG, BIT)                 ((REG) & (BIT))
+#define WRITE_BITS(REG, CLEARMASK, SETMASK) ((REG) = (((REG) & (~(CLEARMASK))) | (SETMASK)))
+
+
+/* ================ RISC-V specific definitions ================ */
+#define READ_CSR(REG) ({                          \
+  unsigned long __tmp;                            \
+  asm volatile ("csrr %0, " REG : "=r"(__tmp));   \
+  __tmp; })
+
+#define WRITE_CSR(REG, VAL) ({                    \
+  asm volatile ("csrw " REG ", %0" :: "rK"(VAL)); })
+
+#define SWAP_CSR(REG, VAL) ({                     \
+  unsigned long __tmp;                            \
+  asm volatile ("csrrw %0, " REG ", %1" : "=r"(__tmp) : "rK"(VAL)); \
+  __tmp; })
+
+#define SET_CSR_BITS(REG, BIT) ({                 \
+  unsigned long __tmp;                            \
+  asm volatile ("csrrs %0, " REG ", %1" : "=r"(__tmp) : "rK"(BIT)); \
+  __tmp; })
+
+#define CLEAR_CSR_BITS(REG, BIT) ({               \
+  unsigned long __tmp;                            \
+  asm volatile ("csrrc %0, " REG ", %1" : "=r"(__tmp) : "rK"(BIT)); \
+  __tmp; })
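+
+/* Usage sketch (assumes the named CSR, e.g. mcycle, is readable at the
+ * current privilege level):
+ *   unsigned long start   = READ_CSR("mcycle");
+ *   unsigned long elapsed = READ_CSR("mcycle") - start;
+ */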
+
+
+/* ================ Common definitions ================ */
+typedef enum {
+  RESET = 0UL,
+  SET = !RESET,
+
+  DISABLE = RESET,
+  ENABLE = SET,
+
+  LOW = RESET,
+  HIGH = SET,
+} State;
+
+typedef enum {
+  OK = 0U,
+  ERROR,
+  BUSY,
+  TIMEOUT
+} Status;
+
+#endif /* __RV_H */
diff --git a/nn/src/add/nn_add_rvv.c b/nn/src/add/nn_add_rvv.c
index 49d4d97..d82b1e2 100644
--- a/nn/src/add/nn_add_rvv.c
+++ b/nn/src/add/nn_add_rvv.c
@@ -15,14 +15,18 @@ void NN_add_F32_RVV(Tensor *out, Tensor *a, Tensor *b) {
   float *a_data = (float *)a->data;
   float *b_data = (float *)b->data;
 
-  int k = out->shape[0] * out->shape[1];
-  int l = 0;
-  for (size_t vl; k > 0; k -= vl, l += vl) {
+  // TODO: add broadcasting support
+
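+  // Strip-mining: vsetvl returns vl <= k on each pass, so tails shorter than
+  // a full vector are handled by the same loop without a scalar cleanup.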
+  size_t i = 0;
+  size_t vl = 0;
+  for (size_t k = out->shape[0] * out->shape[1]; k > 0; k -= vl, i += vl) {
     vl = __riscv_vsetvl_e32m1(k);
-    vfloat32m1_t vec_a = __riscv_vle32_v_f32m1(a_data + l, vl);
-    vfloat32m1_t vec_b = __riscv_vle32_v_f32m1(b_data + l, vl);
+    vfloat32m1_t vec_a = __riscv_vle32_v_f32m1(a_data + i, vl);
+    vfloat32m1_t vec_b = __riscv_vle32_v_f32m1(b_data + i, vl);
     vfloat32m1_t vec_c = __riscv_vfadd_vv_f32m1(vec_a, vec_b, vl);
-    __riscv_vse32_v_f32m1(out_data + l, vec_c, vl);
+    __riscv_vse32_v_f32m1(out_data + i, vec_c, vl);
   }
 }
diff --git a/nn/src/linear/nn_linear.c b/nn/src/linear/nn_linear.c
index 45879c2..e503c92 100644
--- a/nn/src/linear/nn_linear.c
+++ b/nn/src/linear/nn_linear.c
@@ -14,27 +14,3 @@ void NN_linear_F32(Tensor *y, Tensor *x, Tensor *w, Tensor *b) {
   NN_matmul_F32(y, x, w);
   NN_add_F32(y, y, b);
 }
-
-void NN_linear_I32(Tensor *y, Tensor *x, Tensor *w, Tensor *b) {
-  assert(x->shape[1] == w->shape[0]);
-  assert(y->shape[0] == x->shape[0]);
-  assert(y->shape[1] == w->shape[1]);
-  assert(b->shape[0] == w->shape[1]);
-  assert(x->dtype == DTYPE_I32);
-  assert(w->dtype == DTYPE_I32);
-  assert(b->dtype == DTYPE_I32);
-  assert(y->dtype == DTYPE_I32);
-
-  NN_matmul_I32(y, x, w);
-  NN_add_I32(y, y, b);
-}
-
-void NN_linear_I8_I8_I8_I32(Tensor *y, Tensor *x, Tensor *w, Tensor *b) {
-  assert(x->dtype == DTYPE_I8);
-  assert(w->dtype == DTYPE_I8);
-  assert(b->dtype == DTYPE_I8);
-  assert(y->dtype == DTYPE_I32);
-
-  NN_matmul_I8_I8_I32(y, x, w);
-  NN_add_I32_I8_I32(y, y, b);
-}
diff --git a/nn/src/matmul/nn_matmul_eaglex.c b/nn/src/matmul/nn_matmul_eaglex.c
new file mode 100644
index 0000000..4711e83
--- /dev/null
+++ b/nn/src/matmul/nn_matmul_eaglex.c
@@ -0,0 +1,7 @@
+
+#include "nn_matmul.h"
+
+void NN_matmul_I8_I8_I32_EAGLEX(Tensor *out, Tensor *a, Tensor *b) {
+  // TODO: port to here
+}
+
diff --git a/nn/src/matmul/nn_matmul_rvv.c b/nn/src/matmul/nn_matmul_rvv.c
index d37ab61..4cb4f10 100644
--- a/nn/src/matmul/nn_matmul_rvv.c
+++ b/nn/src/matmul/nn_matmul_rvv.c
@@ -19,6 +19,7 @@ void NN_matmul_F32_RVV(Tensor *out, Tensor *a, Tensor *b) {
       float *ptr_a = (float *)a->data + i * a->shape[1];
       float *ptr_b = (float *)b->data + j;
       vfloat32m1_t vec_s = __riscv_vfmv_v_f_f32m1(0, vlmax);
+      size_t vl = 0;
 
       for (int k = a->shape[1]; k > 0; k -= vl, ptr_a += vl, ptr_b += vl) {
         vl = __riscv_vsetvl_e32m1(k);
diff --git a/nn/src/max/nn_max.c b/nn/src/max/nn_max.c
new file mode 100644
index 0000000..03cf1aa
--- /dev/null
+++ b/nn/src/max/nn_max.c
@@ -0,0 +1,20 @@
+
+#include "nn_max.h"
+
+
+float NN_max_F32(Tensor *t) {
+  assert(t->dtype == DTYPE_F32);
+
+  float max = -FLT_MAX;
+  float *t_data = (float *)t->data;
+
+  for (size_t i = 0; i < t->shape[0]; i += 1) {
+    for (size_t j = 0; j < t->shape[1]; j += 1) {
+      if (t_data[i * t->shape[1] + j] > max) {
+        max = t_data[i * t->shape[1] + j];
+      }
+    }
+  }
+
+  return max;
+}
diff --git a/nn/src/max/nn_max_rvv.c b/nn/src/max/nn_max_rvv.c
new file mode 100644
index 0000000..accb948
--- /dev/null
+++ b/nn/src/max/nn_max_rvv.c
@@ -0,0 +1,24 @@
+
+#include "nn_max.h"
+#include "riscv_vector.h"
+
+float NN_max_F32_RVV(Tensor *t) {
+  assert(t->dtype == DTYPE_F32);
+
+  float max = -FLT_MAX;
+  float *t_data = (float *)t->data;
+
+  vfloat32m1_t vec_max = __riscv_vfmv_s_f_f32m1(max, 1);
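+  // vfredmax folds each loaded chunk into element 0 of the accumulator;
+  // seeding it with -FLT_MAX keeps the first reduction step correct.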
+  size_t i = 0;
+  size_t vl = 0;
+  for (size_t k = t->shape[0] * t->shape[1]; k > 0; k -= vl, i += vl) {
+    vl = __riscv_vsetvl_e32m1(k);
+    vfloat32m1_t vec_t = __riscv_vle32_v_f32m1(t_data + i, vl);
+    vec_max = __riscv_vfredmax_vs_f32m1_f32m1(vec_t, vec_max, vl);
+  }
+  max = __riscv_vfmv_f_s_f32m1_f32(vec_max);
+  return max;
+}
+
diff --git a/nn/src/min/nn_min.c b/nn/src/min/nn_min.c
new file mode 100644
index 0000000..d61e727
--- /dev/null
+++ b/nn/src/min/nn_min.c
@@ -0,0 +1,20 @@
+
+#include "nn_min.h"
+
+
+float NN_min_F32(Tensor *t) {
+  assert(t->dtype == DTYPE_F32);
+
+  float min = FLT_MAX;
+  float *t_data = (float *)t->data;
+
+  for (size_t i = 0; i < t->shape[0]; i += 1) {
+    for (size_t j = 0; j < t->shape[1]; j += 1) {
+      if (t_data[i * t->shape[1] + j] < min) {
+        min = t_data[i * t->shape[1] + j];
+      }
+    }
+  }
+
+  return min;
+}
diff --git a/nn/src/min/nn_min_rvv.c b/nn/src/min/nn_min_rvv.c
new file mode 100644
index 0000000..4fe80ea
--- /dev/null
+++ b/nn/src/min/nn_min_rvv.c
@@ -0,0 +1,22 @@
+
+#include "nn_min.h"
+#include "riscv_vector.h"
+
+float NN_min_F32_RVV(Tensor *t) {
+  assert(t->dtype == DTYPE_F32);
+
+  float min = FLT_MAX;
+  float *t_data = (float *)t->data;
+
+  vfloat32m1_t vec_min = __riscv_vfmv_s_f_f32m1(min, 1);
+  size_t i = 0;
+  size_t vl = 0;
+  for (size_t k = t->shape[0] * t->shape[1]; k > 0; k -= vl, i += vl) {
+    vl = __riscv_vsetvl_e32m1(k);
+    vfloat32m1_t vec_t = __riscv_vle32_v_f32m1(t_data + i, vl);
+    vec_min = __riscv_vfredmin_vs_f32m1_f32m1(vec_t, vec_min, vl);
+  }
+  min = __riscv_vfmv_f_s_f32m1_f32(vec_min);
+  return min;
+}
+
diff --git a/nn/src/nn_tensor.c b/nn/src/nn_tensor.c
index 0ec6eac..ef9227f 100644
--- a/nn/src/nn_tensor.c
+++ b/nn/src/nn_tensor.c
@@ -98,7 +98,7 @@ Tensor *NN_rand(size_t ndim, size_t *shape, DataType dtype) {
   switch (dtype) {
     case DTYPE_I8:
       for (size_t i = 0; i < t->size; i += 1) {
-        ((int8_t *)t->data)[i] = rand() % 256 - 128;
+        ((int8_t *)t->data)[i] = rand() % 256;
       }
       break;
     case DTYPE_I32:
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 32cbe7b..edf38d1 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,3 +1,7 @@
+
+# the CMake files are still very hacky
+# TODO: organize the build system properly
+
 cmake_minimum_required(VERSION 3.15)
 
 set(PROJECT_NAME "example")
@@ -23,8 +27,8 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_BUILD_TYPE Debug)
 set(BUILD_SHARED_LIBS OFF)
 set(CMAKE_SHARED_LIBRARY_LINK_CXX_FLAGS "")
-set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3")
-set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Og")
+set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O2")
+set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O2")
 
 set(STATIC_LINKING TRUE)
 
@@ -49,11 +53,15 @@ set(CMAKE_SIZE "${TOOLCHAIN_PREFIX}-size")
 set(CMAKE_STRIP "${TOOLCHAIN_PREFIX}-ld")
 
 
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcmodel=medany -march=rv64gcv_zfh -mabi=lp64d -fno-common -fno-builtin-printf")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Og -mcmodel=medany -march=rv64gcv_zfh -mabi=lp64d -fno-common -fno-builtin-printf")
 set(CMAKE_C_FLAGS ${CMAKE_CXX_FLAGS})
 set(CMAKE_EXE_LINKER_FLAGS "-static -lm -lstdc++ -Wl,-Map=output.map -L${LIBGLOSS_DIR} -specs=${SPECS_FILE} -specs=${WRAP_SPECS_FILE} -T ${CMAKE_SOURCE_DIR}/htif.ld")
 
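+# -u forces _printf_float to be linked in, so printf can format floats
+# (newlib builds its float formatting support as a separately linked object).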
"pass" : "fail", cycles); // NN_printf(golden); // NN_printf(actual); @@ -71,18 +72,62 @@ int main() { NN_deleteTensor(actual); } + // matvec + { + + + } + + // max and min + { + Tensor *A = NN_rand(2, (size_t[]){M, N}, DTYPE_F32); + + printf("max:\t\t"); + float max_cpu = NN_max_F32(A); + cycles = READ_CSR("mcycle"); + float max_actual = NN_max_F32_RVV(A); + cycles = READ_CSR("mcycle") - cycles; + printf("%s (%lu)\n", float_eq(max_cpu, max_actual, 1e-6) ? "pass" : "fail", cycles); + + printf("min:\t\t"); + float min_cpu = NN_min_F32(A); + cycles = READ_CSR("mcycle"); + float min_actual = NN_min_F32_RVV(A); + cycles = READ_CSR("mcycle") - cycles; + printf("%s (%lu)\n", float_eq(min_cpu, min_actual, 1e-6) ? "pass" : "fail", cycles); + + NN_printf(A); + printf("max:"); + NN_printFloat(max_cpu, 6); + printf("\n"); + + NN_freeTensorData(A); + NN_deleteTensor(A); + } + + // matmulf + { + + } + + // matsub + { + + } + + // matadd { Tensor *A = NN_rand(2, (size_t[]){M, N}, DTYPE_F32); Tensor *B = NN_rand(2, (size_t[]){M, N}, DTYPE_F32); Tensor *golden = NN_tensor(2, (size_t[]){M, N}, DTYPE_F32, NULL); Tensor *actual = NN_tensor(2, (size_t[]){M, N}, DTYPE_F32, NULL); - printf("matadd: "); + printf("matadd:\t\t"); NN_add_F32(golden, A, B); // start = read_cycles(); NN_add_F32_RVV(actual, A, B); // total = read_cycles() - start; - printf("%s (%lu)\n", compare_2d(golden->data, actual->data, N, M) ? "pass" : "fail", total); + printf("%s (%lu)\n", compare_2d(golden->data, actual->data, N, M) ? "pass" : "fail", cycles); // NN_printf(A); // NN_printf(B); @@ -100,6 +145,56 @@ int main() { NN_deleteTensor(actual); } + // matneg + { + + } + + // matcopy + { + + } + + // cwiseabs + { + + } + + // cwisemin + { + + } + + // cwisemax + { + + } + + // cwisemul + { + + } + + // matset + { + + } + + // matsetv + { + + } + + // matnorm + { + + } + + // transpose + { + + } + return 0; } \ No newline at end of file