diff --git a/example/fast-depth/main.c b/example/fast-depth/main.c
index bbda924..978ca5d 100644
--- a/example/fast-depth/main.c
+++ b/example/fast-depth/main.c
@@ -86,7 +86,7 @@ int main() {
   init(model);
 
   printf("setting input data...\n");
-  // NN_fill_F32(&model->x, 0.0);
+  // NN_fill(&model->x, 0.0);
   memcpy((uint8_t *)model->x.data, (uint8_t *)model_input_data, (size_t)model_input_end - (size_t)model_input_start);
 
   // cycles = READ_CSR("mcycle");
diff --git a/nn/CMakeLists.txt b/nn/CMakeLists.txt
index ebd481d..ea5780b 100644
--- a/nn/CMakeLists.txt
+++ b/nn/CMakeLists.txt
@@ -6,7 +6,7 @@ option(RVV "Use RISCV vector extension implementation" OFF)
 add_library(nn 
   src/nn_tensor.c
   src/nn_print.c
-  src/abs/nn_abs.c
+  src/nn_abs.c
   src/nn_add.c
   src/batchnorm2d/nn_batchnorm2d.c
   src/conv2d/nn_conv2d.c
@@ -16,11 +16,11 @@ add_library(nn
   src/nn_linear.c
   src/nn_matmul.c
   src/matrixnorm/nn_matrixnorm.c
-  src/max/nn_max.c
-  src/maximum/nn_maximum.c
+  src/nn_max.c
+  src/nn_maximum.c
   src/maxpool2d/nn_maxpool2d.c
-  src/min/nn_min.c
-  src/minimum/nn_minimum.c
+  src/nn_min.c
+  src/nn_minimum.c
   src/mul/nn_mul.c
   src/neg/nn_neg.c
   src/relu/nn_relu.c
diff --git a/nn/inc/kernel/abs.h b/nn/inc/kernel/abs.h
new file mode 100644
index 0000000..afc8699
--- /dev/null
+++ b/nn/inc/kernel/abs.h
@@ -0,0 +1,56 @@
+#ifndef __NN__ABS_H
+#define __NN__ABS_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include <math.h>
+
+#ifdef AVX
+  #include <immintrin.h>
+#endif
+
+#ifdef RVV
+  #include <riscv_vector.h>
+#endif
+
+static inline void NN__abs_F32(size_t n, float *y, float *x) {
+  #if defined(AVX)
+    // Mask to clear the sign bit
+    __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
+
+    size_t vl = 8;
+
+    while (n > 0) {
+      size_t count = n < vl ? n : vl;
+
+      // Load input values into an AVX register
+      __m256 vec_x = _mm256_loadu_ps(x);
+      
+      // Compute the absolute values
+      __m256 vec_y = _mm256_and_ps(vec_x, mask);
+      
+      // Store the result
+      _mm256_storeu_ps(y, vec_y);
+      
+      x += count;
+      y += count;
+      n -= count;
+    }
+  #elif defined(RVV)
+    while (n > 0) {
+      size_t vl = __riscv_vsetvl_e32m1(n);
+      vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x, vl);
+      vfloat32m1_t vec_y = __riscv_vfabs_v_f32m1(vec_x, vl);
+      __riscv_vse32_v_f32m1(y, vec_y, vl);
+      x += vl;
+      y += vl;
+      n -= vl;
+    }
+  #else
+    for (size_t i = 0; i < n; i += 1) {
+      y[i] = fabsf(x[i]);
+    }
+  #endif
+}
+
+#endif // __NN__ABS_H
diff --git a/nn/inc/kernel/max.h b/nn/inc/kernel/max.h
new file mode 100644
index 0000000..4c33d84
--- /dev/null
+++ b/nn/inc/kernel/max.h
@@ -0,0 +1,35 @@
+#ifndef __NN__MAX_H
+#define __NN__MAX_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include <float.h>
+
+#ifdef RVV
+  #include <riscv_vector.h>
+#endif
+
+static inline void NN__max_F32(size_t n, float *s, float *x) {
+  float max = -FLT_MAX;
+  
+  #ifdef RVV
+    vfloat32m1_t vec_max = __riscv_vfmv_s_f_f32m1(max, 1);
+    while (n > 0) {
+      size_t vl = __riscv_vsetvl_e32m1(n);
+      vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x, vl);
+      vec_max = __riscv_vfredmax_vs_f32m1_f32m1(vec_x, vec_max, vl);
+      x += vl;
+      n -= vl;
+    }
+    max = __riscv_vfmv_f_s_f32m1_f32(vec_max);
+  #else
+    for (size_t i = 0; i < n; i += 1) {
+      float val = x[i];
+      max = val > max ? val : max;
+    }
+  #endif
+
+  *s = max;
+}
+
+#endif // __NN__MAX_H
diff --git a/nn/inc/kernel/min.h b/nn/inc/kernel/min.h
new file mode 100644
index 0000000..44f2b0f
--- /dev/null
+++ b/nn/inc/kernel/min.h
@@ -0,0 +1,35 @@
+#ifndef __NN__MIN_H
+#define __NN__MIN_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include <float.h>
+
+#ifdef RVV
+  #include <riscv_vector.h>
+#endif
+
+static inline void NN__min_F32(size_t n, float *s, float *x) {
+  float min = FLT_MAX;
+  
+  #ifdef RVV
+    vfloat32m1_t vec_min = __riscv_vfmv_s_f_f32m1(min, 1);
+    while (n > 0) {
+      size_t vl = __riscv_vsetvl_e32m1(n);
+      vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x, vl);
+      vec_min = __riscv_vfredmin_vs_f32m1_f32m1(vec_x, vec_min, vl);
+      x += vl;
+      n -= vl;
+    }
+    min = __riscv_vfmv_f_s_f32m1_f32(vec_min);
+  #else
+    for (size_t i = 0; i < n; i += 1) {
+      float val = x[i];
+      min = val < min ? val : min;
+    }
+  #endif
+
+  *s = min;
+}
+
+#endif // __NN__MIN_H
diff --git a/nn/inc/nn_abs.h b/nn/inc/nn_abs.h
index 0d78fc8..2f047a3 100644
--- a/nn/inc/nn_abs.h
+++ b/nn/inc/nn_abs.h
@@ -2,9 +2,9 @@
 #define __NN_ABS_H
 
 #include <assert.h>
-#include <math.h>
 
 #include "nn_tensor.h"
+#include "kernel/abs.h"
 
 
 /**
@@ -17,12 +17,5 @@
  */
 void NN_abs(Tensor *out, Tensor *input);
 
-void NN_abs_F32(Tensor *out, Tensor *input);
-
-
-void NN_abs_F32_AVX(Tensor *out, Tensor *input);
-
-void NN_abs_F32_RVV(Tensor *out, Tensor *input);
-
 
 #endif // __NN_ABS_H
diff --git a/nn/inc/nn_add.h b/nn/inc/nn_add.h
index 6087bf8..9d488e0 100644
--- a/nn/inc/nn_add.h
+++ b/nn/inc/nn_add.h
@@ -32,7 +32,7 @@ void NN_add(Tensor *out, Tensor *a, Tensor *b);
  * @param in: the input tensor
  * @param scalar: scalar value
  */
-void NN_add1_F32(Tensor *out, Tensor *in, float scalar);
+void NN_add1(Tensor *out, Tensor *in, float scalar);
 
 /**
  * Element-wise addition
@@ -52,7 +52,7 @@ void NN_addInplace(Tensor *b, Tensor *a);
  * @param b: the target tensor
  * @param scalar: scalar value
  */
-void NN_addInplace1_F32(Tensor *b, float scalar);
+void NN_addInplace1(Tensor *b, float scalar);
 
 void NN_add_1D(Tensor *out, Tensor *a, Tensor *b);
 
diff --git a/nn/inc/nn_fill.h b/nn/inc/nn_fill.h
index 8e534c7..51ea231 100644
--- a/nn/inc/nn_fill.h
+++ b/nn/inc/nn_fill.h
@@ -13,11 +13,7 @@
  * @param tensor: the input tensor
  * @param value: scalar value
  */
-void NN_fill_F32(Tensor *tensor, float value);
-
-void NN_fill_I32(Tensor *tensor, int32_t value);
-
-void NN_fill_I8(Tensor *tensor, int8_t value);
+void NN_fill(Tensor *tensor, float value);
 
 Tensor *NN_zeros(size_t ndim, const size_t *shape, DataType dtype);
 
diff --git a/nn/inc/nn_math.h b/nn/inc/nn_math.h
index 41b49c6..77c8ba0 100644
--- a/nn/inc/nn_math.h
+++ b/nn/inc/nn_math.h
@@ -12,7 +12,7 @@
 // fundamental operations
 //
 
-inline static void NN__cpy_F32 (const int n, float *y, const float *x)                  { for (int i = 0; i < n; i += 1) y[i]  = x[i];        }
+
 inline static void NN__neg_F32 (const int n, float *y, const float *x)                  { for (int i = 0; i < n; i += 1) y[i]  = -x[i];       }
 inline static void NN__mul_F32 (const int n, float *z, const float *x, const float *y)  { for (int i = 0; i < n; i += 1) z[i]  = x[i]*y[i];   }
 inline static void NN__div_F32 (const int n, float *z, const float *x, const float *y)  { for (int i = 0; i < n; i += 1) z[i]  = x[i]/y[i];   }
@@ -349,7 +349,7 @@ inline static void NN__div_F32 (const int n, float *z, const float *x, const flo
 // inline static void NN__sqr_F32  (const int n, float *y, const float *x) { for (int i = 0; i < n; i += 1) y[i] = x[i]*x[i];   }
 // inline static void NN__sqrt_F32 (const int n, float *y, const float *x) { for (int i = 0; i < n; i += 1) y[i] = sqrtf(x[i]); }
 // inline static void NN__log_F32  (const int n, float *y, const float *x) { for (int i = 0; i < n; i += 1) y[i] = logf(x[i]);   }
-// inline static void NN__abs_F32  (const int n, float *y, const float *x) { for (int i = 0; i < n; i += 1) y[i] = fabsf(x[i]); }
+
 // inline static void NN__sgn_F32  (const int n, float *y, const float *x) { for (int i = 0; i < n; i += 1) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
 // inline static void NN__step_F32 (const int n, float *y, const float *x) { for (int i = 0; i < n; i += 1) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
 // inline static void NN__tanh_F32 (const int n, float *y, const float *x) { for (int i = 0; i < n; i += 1) y[i] = tanhf(x[i]);  }
@@ -754,18 +754,6 @@ inline static void NN__div_F32 (const int n, float *z, const float *x, const flo
 //     *s = sum;
 // }
 
-// inline static void NN__max_F32(const int n, float *s, const float *x) {
-// #ifndef GGML_USE_ACCELERATE
-//     float max = -INFINITY;
-//     for (int i = 0; i < n; i += 1) {
-//         max = MAX(max, x[i]);
-//     }
-//     *s = max;
-// #else
-//     vDSP_maxv(x, 1, s, n);
-// #endif
-// }
-
 // inline static void NN__norm_inv_F32(const int n, float *s, const float *x) {
 //     NN__norm_F32(n, s, x);
 //     *s = 1.f/(*s);
diff --git a/nn/inc/nn_max.h b/nn/inc/nn_max.h
index 4d1de86..4cde93e 100644
--- a/nn/inc/nn_max.h
+++ b/nn/inc/nn_max.h
@@ -5,6 +5,7 @@
 #include <float.h>
 
 #include "nn_tensor.h"
+#include "kernel/max.h"
 
 
 /**
@@ -14,9 +15,5 @@
  */
 float NN_max(Tensor *tensor);
 
-float NN_max_F32(Tensor *tensor);
-
-float NN_max_F32_RVV(Tensor *tensor);
-
 
 #endif // __NN_MAX_H
diff --git a/nn/inc/nn_maximum.h b/nn/inc/nn_maximum.h
index fa210e8..c7158dd 100644
--- a/nn/inc/nn_maximum.h
+++ b/nn/inc/nn_maximum.h
@@ -2,7 +2,9 @@
 #define __NN_MAXIMUM_H
 
 #include <assert.h>
-#include <float.h>
+#ifdef RVV
+  #include <riscv_vector.h>
+#endif
 
 #include "nn_tensor.h"
 
@@ -16,10 +18,5 @@
  */
 void NN_maximum(Tensor *out, Tensor *a, Tensor *b);
 
-void NN_maximum_F32(Tensor *out, Tensor *a, Tensor *b);
-
-
-void NN_maximum_F32_RVV(Tensor *out, Tensor *a, Tensor *b);
-
 
 #endif // __NN_MAXIMUM_H
diff --git a/nn/inc/nn_min.h b/nn/inc/nn_min.h
index 165c27b..b25db8d 100644
--- a/nn/inc/nn_min.h
+++ b/nn/inc/nn_min.h
@@ -5,6 +5,7 @@
 #include <float.h>
 
 #include "nn_tensor.h"
+#include "kernel/min.h"
 
 
 /**
@@ -14,10 +15,5 @@
  */
 float NN_min(Tensor *tensor);
 
-float NN_min_F32(Tensor *tensor);
-
-
-float NN_min_F32_RVV(Tensor *tensor);
-
 
 #endif // __NN_MIN_H
diff --git a/nn/inc/nn_minimum.h b/nn/inc/nn_minimum.h
index 7bb0808..04e13ce 100644
--- a/nn/inc/nn_minimum.h
+++ b/nn/inc/nn_minimum.h
@@ -2,6 +2,9 @@
 #define __NN_MINIMUM_H
 
 #include <assert.h>
+#ifdef RVV
+  #include <riscv_vector.h>
+#endif
 
 #include "nn_tensor.h"
 
@@ -15,10 +18,5 @@
  */
 void NN_minimum(Tensor *out, Tensor *a, Tensor *b);
 
-void NN_minimum_F32(Tensor *out, Tensor *a, Tensor *b);
-
-
-void NN_minimum_F32_RVV(Tensor *out, Tensor *a, Tensor *b);
-
 
 #endif // __NN_MINIMUM_H
diff --git a/nn/inc/nn_print.h b/nn/inc/nn_print.h
index 31c4cbc..cea4297 100644
--- a/nn/inc/nn_print.h
+++ b/nn/inc/nn_print.h
@@ -1,6 +1,8 @@
 #ifndef __NN_PRINT_H
 #define __NN_PRINT_H
 
+#include <math.h>
+
 #include "nn_tensor.h"
 
 
diff --git a/nn/src/abs/nn_abs.c b/nn/src/abs/nn_abs.c
deleted file mode 100644
index bdd4d25..0000000
--- a/nn/src/abs/nn_abs.c
+++ /dev/null
@@ -1,14 +0,0 @@
-
-#include "nn_abs.h"
-
-
-void NN_abs_F32(Tensor *out, Tensor *input) {
-  assert(out->ndim == input->ndim);
-  assert(input->dtype == DTYPE_F32);
-  assert(out->dtype == DTYPE_F32);
-  assert(out->size == input->size);
-  
-  for (size_t i = 0; i < out->size; i += 1) {
-    ((float *)out->data)[i] = fabs(((float *)input->data)[i]);
-  }
-}
diff --git a/nn/src/abs/nn_abs_avx.c b/nn/src/abs/nn_abs_avx.c
deleted file mode 100644
index 8092943..0000000
--- a/nn/src/abs/nn_abs_avx.c
+++ /dev/null
@@ -1,36 +0,0 @@
-
-#include "nn_abs.h"
-#include <immintrin.h>
-
-void NN_abs_F32_AVX(Tensor *out, Tensor *input) {
-  assert(out->ndim == input->ndim);
-  assert(input->dtype == DTYPE_F32);
-  assert(out->dtype == DTYPE_F32);
-  assert(out->size == input->size);
-
-  float *ptr_out = out->data;
-  float *ptr_in = input->data;
-  
-  size_t n = out->shape[0] * out->shape[1];
-  size_t vl = 8;
-
-  __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));  // Mask to clear the sign bit
-
-  while (n > 0) {
-    size_t count = n < vl ? n : vl;
-
-    // Load input values into an AVX register
-    __m256 vec_in = _mm256_loadu_ps(ptr_in);
-    
-    // Compute the absolute values
-    __m256 vec_out = _mm256_and_ps(vec_in, mask);
-    
-    // Store the result
-    _mm256_storeu_ps(ptr_out, vec_out);
-    
-    ptr_in += count;
-    ptr_out += count;
-    n -= count;
-  }
-}
-
diff --git a/nn/src/abs/nn_abs_rvv.c b/nn/src/abs/nn_abs_rvv.c
deleted file mode 100644
index 360fcd4..0000000
--- a/nn/src/abs/nn_abs_rvv.c
+++ /dev/null
@@ -1,25 +0,0 @@
-
-#include "nn_abs.h"
-#include "riscv_vector.h"
-
-void NN_abs_F32_RVV(Tensor *out, Tensor *input) {
-  assert(out->ndim == input->ndim);
-  assert(input->dtype == DTYPE_F32);
-  assert(out->dtype == DTYPE_F32);
-  assert(out->size == input->size);
-
-  float *ptr_out = out->data;
-  float *ptr_in = input->data;
-  
-  size_t n = out->shape[0] * out->shape[1];
-  while (n > 0) {
-    size_t vl = __riscv_vsetvl_e32m1(n);
-    vfloat32m1_t vec_in = __riscv_vle32_v_f32m1(ptr_in, vl);
-    vfloat32m1_t vec_out = __riscv_vfabs_v_f32m1(vec_in, vl);
-    __riscv_vse32_v_f32m1(ptr_out, vec_out, vl);
-    ptr_in += vl;
-    ptr_out += vl;
-    n -= vl;
-  }
-}
-
diff --git a/nn/src/max/nn_max.c b/nn/src/max/nn_max.c
deleted file mode 100644
index 5d6cb13..0000000
--- a/nn/src/max/nn_max.c
+++ /dev/null
@@ -1,18 +0,0 @@
-
-#include "nn_max.h"
-
-
-float NN_max_F32(Tensor *tensor) {
-  assert(tensor->dtype == DTYPE_F32);
-  
-  float max = -FLT_MAX;
-  
-  for (size_t i = 0; i < tensor->size; i += 1) {
-    float val = ((float *)tensor->data)[i];
-    if (val > max) {
-      max = val;
-    }
-  }
-  
-  return max;
-}
diff --git a/nn/src/max/nn_max_rvv.c b/nn/src/max/nn_max_rvv.c
deleted file mode 100644
index 5fe1c54..0000000
--- a/nn/src/max/nn_max_rvv.c
+++ /dev/null
@@ -1,25 +0,0 @@
-
-#include "nn_max.h"
-#include "riscv_vector.h"
-
-float NN_max_F32_RVV(Tensor *tensor) {
-  assert(tensor->dtype == DTYPE_F32);
-
-  float max = -FLT_MAX;
-  float *ptr = tensor->data;
-
-  vfloat32m1_t vec_max = __riscv_vfmv_s_f_f32m1(max, 1);
-
-  size_t n = tensor->shape[0] * tensor->shape[1];
-  while (n > 0) {
-    size_t vl = __riscv_vsetvl_e32m1(n);
-    vfloat32m1_t vec_data = __riscv_vle32_v_f32m1(ptr, vl);
-    vec_max = __riscv_vfredmax_vs_f32m1_f32m1(vec_data, vec_max, vl);
-    ptr += vl;
-    n -= vl;
-  }
-  max = __riscv_vfmv_f_s_f32m1_f32(vec_max);
-
-  return max;
-}
-
diff --git a/nn/src/maximum/nn_maximum.c b/nn/src/maximum/nn_maximum.c
deleted file mode 100644
index b6e1f6d..0000000
--- a/nn/src/maximum/nn_maximum.c
+++ /dev/null
@@ -1,19 +0,0 @@
-
-#include "nn_maximum.h"
-
-
-void NN_maximum_F32(Tensor *out, Tensor *a, Tensor *b) {
-  assert(b->ndim == a->ndim);
-  assert(out->ndim == a->ndim);
-  assert(a->dtype == DTYPE_F32);
-  assert(b->dtype == DTYPE_F32);
-  assert(out->dtype == DTYPE_F32);
-  assert(b->size == a->size);
-  assert(out->size == a->size);
-  
-  for (size_t i = 0; i < out->size; i += 1) {
-    float a_val = ((float *)a->data)[i];
-    float b_val = ((float *)b->data)[i];
-    ((float *)out->data)[i] = a_val > b_val ? a_val : b_val;
-  }
-}
diff --git a/nn/src/maximum/nn_maximum_rvv.c b/nn/src/maximum/nn_maximum_rvv.c
deleted file mode 100644
index c0e0c0f..0000000
--- a/nn/src/maximum/nn_maximum_rvv.c
+++ /dev/null
@@ -1,31 +0,0 @@
-
-#include "nn_maximum.h"
-#include "riscv_vector.h"
-
-void NN_maximum_F32_RVV(Tensor *out, Tensor *a, Tensor *b) {
-  assert(b->ndim == a->ndim);
-  assert(out->ndim == a->ndim);
-  assert(a->dtype == DTYPE_F32);
-  assert(b->dtype == DTYPE_F32);
-  assert(out->dtype == DTYPE_F32);
-  assert(b->size == a->size);
-  assert(out->size == a->size);
-
-  float *ptr_out = out->data;
-  float *ptr_a = a->data;
-  float *ptr_b = b->data;
-
-  size_t n = out->shape[0] * out->shape[1];
-  while (n > 0) {
-    size_t vl = __riscv_vsetvl_e32m1(n);
-    vfloat32m1_t vec_a = __riscv_vle32_v_f32m1(ptr_a, vl);
-    vfloat32m1_t vec_b = __riscv_vle32_v_f32m1(ptr_b, vl);
-    vfloat32m1_t vec_out = __riscv_vfmax_vv_f32m1(vec_a, vec_b, vl);
-    __riscv_vse32_v_f32m1(ptr_out, vec_out, vl);
-    ptr_a += vl;
-    ptr_b += vl;
-    ptr_out += vl;
-    n -= vl;
-  }
-}
-
diff --git a/nn/src/min/nn_min.c b/nn/src/min/nn_min.c
deleted file mode 100644
index ad2e637..0000000
--- a/nn/src/min/nn_min.c
+++ /dev/null
@@ -1,18 +0,0 @@
-
-#include "nn_min.h"
-
-
-float NN_min_F32(Tensor *tensor) {
-  assert(tensor->dtype == DTYPE_F32);
-  
-  float min = FLT_MAX;
-  
-  for (size_t i = 0; i < tensor->size; i += 1) {
-    float val = ((float *)tensor->data)[i];
-    if (val < min) {
-      min = val;
-    }
-  }
-  
-  return min;
-}
diff --git a/nn/src/min/nn_min_rvv.c b/nn/src/min/nn_min_rvv.c
deleted file mode 100644
index f39ec03..0000000
--- a/nn/src/min/nn_min_rvv.c
+++ /dev/null
@@ -1,25 +0,0 @@
-
-#include "nn_min.h"
-#include "riscv_vector.h"
-
-float NN_min_F32_RVV(Tensor *tensor) {
-  assert(tensor->dtype == DTYPE_F32);
-
-  float min = FLT_MAX;
-  float *ptr = tensor->data;
-
-  vfloat32m1_t vec_min = __riscv_vfmv_s_f_f32m1(min, 1);
-
-  size_t n = tensor->shape[0] * tensor->shape[1];
-  while (n > 0) {
-    size_t vl = __riscv_vsetvl_e32m1(n);
-    vfloat32m1_t vec_data = __riscv_vle32_v_f32m1(ptr, vl);
-    vec_min = __riscv_vfredmin_vs_f32m1_f32m1(vec_data, vec_min, vl);
-    ptr += vl;
-    n -= vl;
-  }
-  min = __riscv_vfmv_f_s_f32m1_f32(vec_min);
-
-  return min;
-}
-
diff --git a/nn/src/minimum/nn_minimum.c b/nn/src/minimum/nn_minimum.c
deleted file mode 100644
index 15f55b0..0000000
--- a/nn/src/minimum/nn_minimum.c
+++ /dev/null
@@ -1,19 +0,0 @@
-
-#include "nn_minimum.h"
-
-
-void NN_minimum_F32(Tensor *out, Tensor *a, Tensor *b) {
-  assert(b->ndim == a->ndim);
-  assert(out->ndim == a->ndim);
-  assert(a->dtype == DTYPE_F32);
-  assert(b->dtype == DTYPE_F32);
-  assert(out->dtype == DTYPE_F32);
-  assert(b->size == a->size);
-  assert(out->size == a->size);
-  
-  for (size_t i = 0; i < out->size; i += 1) {
-    float a_val = ((float *)a->data)[i];
-    float b_val = ((float *)b->data)[i];
-    ((float *)out->data)[i] = a_val < b_val ? a_val : b_val;
-  }
-}
diff --git a/nn/src/minimum/nn_minimum_rvv.c b/nn/src/minimum/nn_minimum_rvv.c
deleted file mode 100644
index 4fade23..0000000
--- a/nn/src/minimum/nn_minimum_rvv.c
+++ /dev/null
@@ -1,31 +0,0 @@
-
-#include "nn_minimum.h"
-#include "riscv_vector.h"
-
-void NN_minimum_F32_RVV(Tensor *out, Tensor *a, Tensor *b) {
-  assert(b->ndim == a->ndim);
-  assert(out->ndim == a->ndim);
-  assert(a->dtype == DTYPE_F32);
-  assert(b->dtype == DTYPE_F32);
-  assert(out->dtype == DTYPE_F32);
-  assert(b->size == a->size);
-  assert(out->size == a->size);
-
-  float *ptr_out = out->data;
-  float *ptr_a = a->data;
-  float *ptr_b = b->data;
-
-  size_t n = out->shape[0] * out->shape[1];
-  while (n > 0) {
-    size_t vl = __riscv_vsetvl_e32m1(n);
-    vfloat32m1_t vec_a = __riscv_vle32_v_f32m1(ptr_a, vl);
-    vfloat32m1_t vec_b = __riscv_vle32_v_f32m1(ptr_b, vl);
-    vfloat32m1_t vec_out = __riscv_vfmin_vv_f32m1(vec_a, vec_b, vl);
-    __riscv_vse32_v_f32m1(ptr_out, vec_out, vl);
-    ptr_a += vl;
-    ptr_b += vl;
-    ptr_out += vl;
-    n -= vl;
-  }
-}
-
diff --git a/nn/src/nn_abs.c b/nn/src/nn_abs.c
new file mode 100644
index 0000000..250a235
--- /dev/null
+++ b/nn/src/nn_abs.c
@@ -0,0 +1,20 @@
+
+#include "nn_abs.h"
+
+
+void NN_abs(Tensor *out, Tensor *in) {
+  assert(out->ndim == in->ndim);
+  assert(out->dtype == in->dtype);
+  assert(out->size == in->size);
+  
+  switch (out->dtype) {
+    case DTYPE_F32:
+      NN_abs_F32(out, in);
+      return;
+
+    default:
+  }
+  printf("[ERROR] Unsupported operation of tensor with dtype %s = |%s|\n", 
+    NN_getDataTypeName(out->dtype), NN_getDataTypeName(in->dtype)
+  );
+}
diff --git a/nn/src/nn_add.c b/nn/src/nn_add.c
index 7ea1c90..b802577 100644
--- a/nn/src/nn_add.c
+++ b/nn/src/nn_add.c
@@ -29,9 +29,21 @@ void NN_add(Tensor *out, Tensor *a, Tensor *b) {
   }
 }
 
-void NN_add1_F32(Tensor *out, Tensor *a, float b) {
-  assert(out->dtype == DTYPE_F32 && a->dtype == DTYPE_F32);
-  NN__add1_F32(out->size, (float *)out->data, (float *)a->data, b);
+void NN_add1(Tensor *out, Tensor *a, float b) {
+  assert(out->ndim == a->ndim);
+  assert(out->dtype == a->dtype);
+  assert(out->size == a->size);
+
+  switch (out->dtype) {
+    case DTYPE_F32:
+      NN__add1_F32(out->size, (float *)out->data, (float *)a->data, b);
+      return;
+
+    default:
+  }
+  printf("[ERROR] Unsupported operation between tensor with dtype %s += %s\n", 
+    NN_getDataTypeName(out->dtype), NN_getDataTypeName(a->dtype)
+  );
 }
 
 void NN_addInplace(Tensor *b, Tensor *a) {
@@ -49,13 +61,21 @@ void NN_addInplace(Tensor *b, Tensor *a) {
   }
 
   printf("[ERROR] Unsupported operation between tensor with dtype %s += %s\n", 
-    NN_getDataTypeName(a->dtype), NN_getDataTypeName(b->dtype)
+    NN_getDataTypeName(b->dtype), NN_getDataTypeName(a->dtype)
   );
 }
 
-void NN_addInplace1_F32(Tensor *b, float scalar) {
-  assert(b->dtype == DTYPE_F32);
-  NN__acc1_F32(b->size, (float *)b->data, scalar);
+void NN_addInplace1(Tensor *b, float scalar) {
+  switch (b->dtype) {
+    case DTYPE_F32:
+      NN__acc1_F32(b->size, (float *)b->data, scalar);
+      return;
+    default:
+  }
+
+  printf("[ERROR] Unsupported operation between tensor with dtype %s += float\n", 
+    NN_getDataTypeName(b->dtype)
+  );
 }
 
 void NN_add_1D(Tensor *out, Tensor *a, Tensor *b) {
@@ -71,8 +91,8 @@ void NN_add_1D(Tensor *out, Tensor *a, Tensor *b) {
     return;
   }
 
-  printf("[ERROR] Unsupported operation between tensor with dtype %s + %s -> %s\n", 
-    NN_getDataTypeName(a->dtype), NN_getDataTypeName(b->dtype), NN_getDataTypeName(out->dtype)
+  printf("[ERROR] Unsupported operation between tensor with dtype %s = %s + %s\n", 
+    NN_getDataTypeName(out->dtype), NN_getDataTypeName(a->dtype), NN_getDataTypeName(b->dtype)
   );
 }
 
@@ -108,8 +128,8 @@ void NN_add_2D(Tensor *out, Tensor *a, Tensor *b) {
     return;
   }
 
-  printf("[ERROR] Unsupported operation between tensor with dtype %s + %s -> %s\n", 
-    NN_getDataTypeName(a->dtype), NN_getDataTypeName(b->dtype), NN_getDataTypeName(out->dtype)
+  printf("[ERROR] Unsupported operation between tensor with dtype %s = %s + %s\n", 
+    NN_getDataTypeName(out->dtype), NN_getDataTypeName(a->dtype), NN_getDataTypeName(b->dtype)
   );
 }
 
@@ -149,8 +169,8 @@ void NN_add_3D(Tensor *out, Tensor *a, Tensor *b) {
     }
   }
   
-  printf("[ERROR] Unsupported operation between tensor with dtype %s + %s -> %s\n", 
-    NN_getDataTypeName(a->dtype), NN_getDataTypeName(b->dtype), NN_getDataTypeName(out->dtype)
+  printf("[ERROR] Unsupported operation between tensor with dtype %s = %s + %s\n", 
+    NN_getDataTypeName(out->dtype), NN_getDataTypeName(a->dtype), NN_getDataTypeName(b->dtype)
   );
 }
 
@@ -196,8 +216,8 @@ void NN_add_4D(Tensor *out, Tensor *a, Tensor *b) {
     }
   }
 
-  printf("[ERROR] Unsupported operation between tensor with dtype %s + %s -> %s\n", 
-    NN_getDataTypeName(a->dtype), NN_getDataTypeName(b->dtype), NN_getDataTypeName(out->dtype)
+  printf("[ERROR] Unsupported operation between tensor with dtype %s = %s + %s\n", 
+    NN_getDataTypeName(out->dtype), NN_getDataTypeName(a->dtype), NN_getDataTypeName(b->dtype)
   );
 }
 
diff --git a/nn/src/nn_fill.c b/nn/src/nn_fill.c
index e063d4a..354fdbb 100644
--- a/nn/src/nn_fill.c
+++ b/nn/src/nn_fill.c
@@ -2,40 +2,26 @@
 #include "nn_fill.h"
 
 
-void NN_fill_F32(Tensor *tensor, float value) {
-  assert(tensor->dtype == DTYPE_F32);
-  
-  NN__fill_F32(tensor->size, (float *)tensor->data, value);
-}
-
-void NN_fill_I32(Tensor *tensor, int32_t value) {
-  assert(tensor->dtype == DTYPE_I32);
-  
-  NN__fill_I32(tensor->size, (int32_t *)tensor->data, value);
-}
-
-void NN_fill_I8(Tensor *tensor, int8_t value) {
-  assert(tensor->dtype == DTYPE_I8);
-  
-  NN__fill_I8(tensor->size, (int8_t *)tensor->data, value);
-}
-
-Tensor *NN_zeros(size_t ndim, const size_t *shape, DataType dtype) {
-  Tensor *t = NN_tensor(ndim, shape, dtype, NULL);
-
-  switch (dtype) {
+void NN_fill(Tensor *tensor, float value) {
+  switch (tensor->dtype) {
     case DTYPE_I8:
-      NN_fill_I8(t, 0);
-      break;
+      NN__fill_I8(tensor->size, (int8_t *)tensor->data, (int8_t)value);
+      return;
     case DTYPE_I32:
-      NN_fill_I32(t, 0);
-      break;
+      NN__fill_I32(tensor->size, (int32_t *)tensor->data, (int32_t)value);
+      return;
     case DTYPE_F32:
-      NN_fill_F32(t, 0);
-      break;
+      NN__fill_F32(tensor->size, (float *)tensor->data, value);
+      return;
     default:
-      printf("[WARNING] Unsupported data type: %d\n", dtype);
+      printf("[ERROR] Unsupported operation fill to tensor with dtype: %d\n", tensor->dtype);
   }
+}
+
+Tensor *NN_zeros(size_t ndim, const size_t *shape, DataType dtype) {
+  Tensor *t = NN_tensor(ndim, shape, dtype, NULL);
+
+  NN_fill(t, 0);
 
   return t;
 }
@@ -43,19 +29,7 @@ Tensor *NN_zeros(size_t ndim, const size_t *shape, DataType dtype) {
 Tensor *NN_ones(size_t ndim, const size_t *shape, DataType dtype) {
   Tensor *t = NN_tensor(ndim, shape, dtype, NULL);
 
-  switch (dtype) {
-    case DTYPE_I8:
-      NN_fill_I8(t, 1);
-      break;
-    case DTYPE_I32:
-      NN_fill_I32(t, 1);
-      break;
-    case DTYPE_F32:
-      NN_fill_F32(t, 1);
-      break;
-    default:
-      printf("[WARNING] Unsupported data type: %d\n", dtype);
-  }
+  NN_fill(t, 1);
 
   return t;
 }
@@ -80,7 +54,7 @@ Tensor *NN_rand(size_t ndim, const size_t *shape, DataType dtype) {
       }
       break;
     default:
-      printf("[WARNING] Unsupported data type: %d\n", dtype);
+      printf("[ERROR] Unsupported data type: %d\n", dtype);
   }
 
   return t;
diff --git a/nn/src/nn_matmul.c b/nn/src/nn_matmul.c
index d819ed4..0d98d76 100644
--- a/nn/src/nn_matmul.c
+++ b/nn/src/nn_matmul.c
@@ -6,7 +6,9 @@ void NN_matmul(Tensor *out, Tensor *a, Tensor *b) {
     NN_matmul_F32(out, a, b);
     return;
   }
-  printf("Unsupported operation: %s @ %s -> %s\n", NN_getDataTypeName(a->dtype), NN_getDataTypeName(b->dtype), NN_getDataTypeName(out->dtype));
+  printf("Unsupported operation: %s = %s @ %s\n", 
+    NN_getDataTypeName(out->dtype), NN_getDataTypeName(a->dtype), NN_getDataTypeName(b->dtype)
+  );
 }
 
 void NN_matmul_F32(Tensor *out, Tensor *a, Tensor *b) {
@@ -36,7 +38,9 @@ void NN_matmulT(Tensor *out, Tensor *a, Tensor *b) {
     NN_matmulT_F32(out, a, b);
     return;
   }
-  printf("Unsupported operation: %s @ %s -> %s\n", NN_getDataTypeName(a->dtype), NN_getDataTypeName(b->dtype), NN_getDataTypeName(out->dtype));
+  printf("Unsupported operation: %s = %s @ %s\n", 
+    NN_getDataTypeName(out->dtype), NN_getDataTypeName(a->dtype), NN_getDataTypeName(b->dtype)
+  );
 }
 
 void NN_matmulT_F32(Tensor *out, Tensor *a, Tensor *b) {
diff --git a/nn/src/nn_max.c b/nn/src/nn_max.c
new file mode 100644
index 0000000..928d377
--- /dev/null
+++ b/nn/src/nn_max.c
@@ -0,0 +1,20 @@
+
+#include "nn_max.h"
+
+
+float NN_max(Tensor *tensor) {
+  float max;
+
+  switch (tensor->dtype) {
+    case DTYPE_F32:
+      NN__max_F32(tensor->size, &max, (float *)tensor->data);
+      break;
+    
+    default:
+      printf("[ERROR] Unsupported operation of tensor with dtype max(%s)\n", 
+        NN_getDataTypeName(tensor->dtype)
+      );
+  }
+  
+  return max;
+}
diff --git a/nn/src/nn_maximum.c b/nn/src/nn_maximum.c
new file mode 100644
index 0000000..349a709
--- /dev/null
+++ b/nn/src/nn_maximum.c
@@ -0,0 +1,48 @@
+
+#include "nn_maximum.h"
+
+
+void NN_maximum(Tensor *out, Tensor *a, Tensor *b) {
+  assert(b->ndim == a->ndim);
+  assert(out->ndim == a->ndim);
+  assert(a->dtype == DTYPE_F32);
+  assert(b->dtype == DTYPE_F32);
+  assert(out->dtype == DTYPE_F32);
+  assert(b->size == a->size);
+  assert(out->size == a->size);
+
+  switch (out->dtype) {
+    case DTYPE_F32:
+      #ifdef RVV
+        float *ptr_out = out->data;
+        float *ptr_a = a->data;
+        float *ptr_b = b->data;
+
+        size_t n = out->size;
+        while (n > 0) {
+          size_t vl = __riscv_vsetvl_e32m1(n);
+          vfloat32m1_t vec_a = __riscv_vle32_v_f32m1(ptr_a, vl);
+          vfloat32m1_t vec_b = __riscv_vle32_v_f32m1(ptr_b, vl);
+          vfloat32m1_t vec_out = __riscv_vfmax_vv_f32m1(vec_a, vec_b, vl);
+          __riscv_vse32_v_f32m1(ptr_out, vec_out, vl);
+          ptr_a += vl;
+          ptr_b += vl;
+          ptr_out += vl;
+          n -= vl;
+        }
+      #else
+        for (size_t i = 0; i < out->size; i += 1) {
+          float a_val = ((float *)a->data)[i];
+          float b_val = ((float *)b->data)[i];
+          ((float *)out->data)[i] = a_val > b_val ? a_val : b_val;
+        }
+      #endif
+      return;
+
+    default:
+  }
+  
+  printf("[ERROR] Unsupported operation between tensor with dtype %s = max(%s, %s)\n", 
+    NN_getDataTypeName(out->dtype), NN_getDataTypeName(a->dtype), NN_getDataTypeName(b->dtype)
+  );
+}
diff --git a/nn/src/nn_min.c b/nn/src/nn_min.c
new file mode 100644
index 0000000..c25e7cf
--- /dev/null
+++ b/nn/src/nn_min.c
@@ -0,0 +1,20 @@
+
+#include "nn_min.h"
+
+
+float NN_min(Tensor *tensor) {
+  float min;
+
+  switch (tensor->dtype) {
+    case DTYPE_F32:
+      NN__min_F32(tensor->size, &min, (float *)tensor->data);
+      break;
+    
+    default:
+      printf("[ERROR] Unsupported operation of tensor with dtype min(%s)\n", 
+        NN_getDataTypeName(tensor->dtype)
+      );
+  }
+  
+  return min;
+}
diff --git a/nn/src/nn_minimum.c b/nn/src/nn_minimum.c
new file mode 100644
index 0000000..afc3216
--- /dev/null
+++ b/nn/src/nn_minimum.c
@@ -0,0 +1,48 @@
+
+#include "nn_minimum.h"
+
+
+void NN_minimum(Tensor *out, Tensor *a, Tensor *b) {
+  assert(b->ndim == a->ndim);
+  assert(out->ndim == a->ndim);
+  assert(a->dtype == DTYPE_F32);
+  assert(b->dtype == DTYPE_F32);
+  assert(out->dtype == DTYPE_F32);
+  assert(b->size == a->size);
+  assert(out->size == a->size);
+
+  switch (out->dtype) {
+    case DTYPE_F32:
+      #ifdef RVV
+        float *ptr_out = out->data;
+        float *ptr_a = a->data;
+        float *ptr_b = b->data;
+
+        size_t n = out->size;
+        while (n > 0) {
+          size_t vl = __riscv_vsetvl_e32m1(n);
+          vfloat32m1_t vec_a = __riscv_vle32_v_f32m1(ptr_a, vl);
+          vfloat32m1_t vec_b = __riscv_vle32_v_f32m1(ptr_b, vl);
+          vfloat32m1_t vec_out = __riscv_vfmin_vv_f32m1(vec_a, vec_b, vl);
+          __riscv_vse32_v_f32m1(ptr_out, vec_out, vl);
+          ptr_a += vl;
+          ptr_b += vl;
+          ptr_out += vl;
+          n -= vl;
+        }
+      #else
+        for (size_t i = 0; i < out->size; i += 1) {
+          float a_val = ((float *)a->data)[i];
+          float b_val = ((float *)b->data)[i];
+          ((float *)out->data)[i] = a_val < b_val ? a_val : b_val;
+        }
+      #endif
+      return;
+
+    default:
+  }
+  
+  printf("[ERROR] Unsupported operation between tensor with dtype %s = max(%s, %s)\n", 
+    NN_getDataTypeName(out->dtype), NN_getDataTypeName(a->dtype), NN_getDataTypeName(b->dtype)
+  );
+}
diff --git a/nn/src/nn_sub.c b/nn/src/nn_sub.c
index 0847086..b322b03 100644
--- a/nn/src/nn_sub.c
+++ b/nn/src/nn_sub.c
@@ -24,7 +24,7 @@ void NN_sub(Tensor *out, Tensor *a, Tensor *b) {
       return;
     
     default:
-      printf("[ERROR] Unsupported tensor dimension: %d\n", out->ndim);
+      printf("[ERROR] Unsupported tensor dimension: %lu\n", out->ndim);
       return;
   }
 }
@@ -42,8 +42,8 @@ void NN_sub_1D(Tensor *out, Tensor *a, Tensor *b) {
     return;
   }
 
-  printf("[ERROR] Unsupported operation between tensor with dtype %s + %s -> %s\n", 
-    NN_getDataTypeName(a->dtype), NN_getDataTypeName(b->dtype), NN_getDataTypeName(out->dtype)
+  printf("[ERROR] Unsupported operation between tensor with dtype %s = %s - %s\n", 
+    NN_getDataTypeName(out->dtype), NN_getDataTypeName(a->dtype), NN_getDataTypeName(b->dtype)
   );
 }
 
@@ -79,8 +79,8 @@ void NN_sub_2D(Tensor *out, Tensor *a, Tensor *b) {
     return;
   }
 
-  printf("[ERROR] Unsupported operation between tensor with dtype %s + %s -> %s\n", 
-    NN_getDataTypeName(a->dtype), NN_getDataTypeName(b->dtype), NN_getDataTypeName(out->dtype)
+  printf("[ERROR] Unsupported operation between tensor with dtype %s = %s - %s\n", 
+    NN_getDataTypeName(out->dtype), NN_getDataTypeName(a->dtype), NN_getDataTypeName(b->dtype)
   );
 }
 
@@ -109,8 +109,8 @@ void NN_sub_3D(Tensor *out, Tensor *a, Tensor *b) {
     }
   }
   
-  printf("[ERROR] Unsupported operation between tensor with dtype %s + %s -> %s\n", 
-    NN_getDataTypeName(a->dtype), NN_getDataTypeName(b->dtype), NN_getDataTypeName(out->dtype)
+  printf("[ERROR] Unsupported operation between tensor with dtype %s = %s - %s\n", 
+    NN_getDataTypeName(out->dtype), NN_getDataTypeName(a->dtype), NN_getDataTypeName(b->dtype)
   );
 }
 
@@ -144,8 +144,8 @@ void NN_sub_4D(Tensor *out, Tensor *a, Tensor *b) {
     }
   }
 
-  printf("[ERROR] Unsupported operation between tensor with dtype %s + %s -> %s\n", 
-    NN_getDataTypeName(a->dtype), NN_getDataTypeName(b->dtype), NN_getDataTypeName(out->dtype)
+  printf("[ERROR] Unsupported operation between tensor with dtype %s = %s - %s\n", 
+    NN_getDataTypeName(out->dtype), NN_getDataTypeName(a->dtype), NN_getDataTypeName(b->dtype)
   );
 }
 
diff --git a/tests/src/generate_test.py b/tests/src/generate_test.py
index 288be30..864ed43 100644
--- a/tests/src/generate_test.py
+++ b/tests/src/generate_test.py
@@ -56,6 +56,33 @@
 }
 """
 
+def generateImmTestPattern(op, function, inputs, detail=""):
+    result = function(*[value for name, value in inputs]).item()
+
+    test_template = env.from_string("""
+  {
+    printf("{{ (op + ":").ljust(24) }}");
+{% for name, value in inputs %}{% if (type(value) == torch.Tensor and name != "actual") %}
+    Tensor *{{ name }} = NN_tensor({{ len(value.shape) }}, (size_t[]){ {{ value.shape | join(", ") }} }, DTYPE_F32, (float[]){ {{ value.numpy().flatten() | join(", ") }} });
+{% elif str(type(value)) == "<class 'float'>" %}
+    float {{ name }} = {{ value }};{% endif %}{% endfor %}
+    
+    float golden = {{ result }};
+    float actual;
+
+    cycles = READ_CSR("mcycle");
+    actual = NN_{{ op }}({{ inputs[0][0] }});
+    cycles = READ_CSR("mcycle") - cycles;
+    printf("%s  (%lu cycles)\\n", float_eq(golden, actual, 1e-6) ? "PASS" : "FAIL", cycles);
+
+{% for name, value in inputs %}{% if (type(value) == torch.Tensor and name != "actual") %}
+    NN_deleteTensor({{ name }});{% endif %}{% endfor %}
+    
+  }
+""")
+    
+    return test_template.render(op=op, inputs=inputs, result=result, detail=detail)
+
 
 def generateTestPattern(op, function, inputs, detail=""):
     result = function(*[value for name, value in inputs])
@@ -99,15 +126,15 @@ def generateTestPattern(op, function, inputs, detail=""):
         "add",
         lambda a, b: a + b,
         (
-            ("a", torch.rand((3, 3))),
-            ("b", torch.rand((3, 3))),
+            ("a", torch.rand((7, 7))),
+            ("b", torch.rand((7, 7))),
         ),
     ),
     generateTestPattern(
-        "add1_F32",
+        "add1",
         lambda a, b: a + b,
         (
-            ("a", torch.rand((3, 3))),
+            ("a", torch.rand((7, 7))),
             ("v", random.random()),
         ),
     ),
@@ -115,23 +142,23 @@ def generateTestPattern(op, function, inputs, detail=""):
         "sub",
         lambda a, b: a - b,
         (
-            ("a", torch.rand((3, 3))),
-            ("b", torch.rand((3, 3))),
+            ("a", torch.rand((7, 7))),
+            ("b", torch.rand((7, 7))),
         ),
     ),
     generateTestPattern(
         "addInplace",
         lambda a, b: a + b,
         (
-            ("actual", torch.zeros((3, 3))),
-            ("b", torch.rand((3, 3))),
+            ("actual", torch.zeros((7, 7))),
+            ("b", torch.rand((7, 7))),
         )
     ),
     generateTestPattern(
-        "fill_F32",
+        "fill",
         lambda a, v: a.fill_(v),
         (
-            ("actual", torch.zeros((3, 3))),
+            ("actual", torch.zeros((7, 7))),
             ("v", random.random()),
         ),
     ),
@@ -139,18 +166,49 @@ def generateTestPattern(op, function, inputs, detail=""):
         "matmulT",
         lambda a, b: a @ b.T,
         (
-            ("a", torch.rand((6, 3))),
-            ("b", torch.rand((5, 3))),
+            ("a", torch.rand((6, 7))),
+            ("b", torch.rand((5, 7))),
         ),
     ),
     generateTestPattern(
         "matmul",
         lambda a, b: a @ b,
         (
-            ("a", torch.rand((6, 3))),
-            ("b", torch.rand((3, 5))),
+            ("a", torch.rand((6, 7))),
+            ("b", torch.rand((7, 5))),
         ),
     ),
+    generateTestPattern(
+        "maximum",
+        lambda a, b: torch.maximum(a, b),
+        (
+            ("a", torch.rand((7, 7))),
+            ("b", torch.rand((7, 7))),
+        ),
+    ),
+    generateTestPattern(
+        "minimum",
+        lambda a, b: torch.minimum(a, b),
+        (
+            ("a", torch.rand((7, 7))),
+            ("b", torch.rand((7, 7))),
+        ),
+    ),
+    generateImmTestPattern(
+        "max",
+        lambda a: torch.max(a),
+        (
+            ("a", torch.rand((7, 7))),
+        ),
+    ),
+    generateImmTestPattern(
+        "min",
+        lambda a: torch.min(a),
+        (
+            ("a", torch.rand((7, 7))),
+        ),
+    ),
+    
     ]
 ))
 
diff --git a/tests/src/main.c b/tests/src/main.c
index 2f054a6..c4543ba 100644
--- a/tests/src/main.c
+++ b/tests/src/main.c
@@ -41,13 +41,13 @@ int main() {
   {
     printf("add:                    ");
 
-    Tensor *a = NN_tensor(2, (size_t[]){ 3, 3 }, DTYPE_F32, (float[]){ 0.4962566, 0.7682218, 0.08847743, 0.13203049, 0.30742282, 0.6340787, 0.4900934, 0.89644474, 0.45562798 });
+    Tensor *a = NN_tensor(2, (size_t[]){ 7, 7 }, DTYPE_F32, (float[]){ 0.4962566, 0.7682218, 0.08847743, 0.13203049, 0.30742282, 0.6340787, 0.4900934, 0.89644474, 0.45562798, 0.6323063, 0.34889346, 0.4017173, 0.022325754, 0.16885895, 0.29388845, 0.5185218, 0.6976676, 0.8000114, 0.16102946, 0.28226858, 0.68160856, 0.915194, 0.3970999, 0.8741559, 0.41940832, 0.55290705, 0.9527381, 0.03616482, 0.18523103, 0.37341738, 0.30510002, 0.9320004, 0.17591017, 0.26983356, 0.15067977, 0.031719506, 0.20812976, 0.929799, 0.7231092, 0.7423363, 0.5262958, 0.24365824, 0.58459234, 0.03315264, 0.13871688, 0.242235, 0.81546897, 0.7931606, 0.27825248 });
 
-    Tensor *b = NN_tensor(2, (size_t[]){ 3, 3 }, DTYPE_F32, (float[]){ 0.6323063, 0.34889346, 0.4017173, 0.022325754, 0.16885895, 0.29388845, 0.5185218, 0.6976676, 0.8000114 });
+    Tensor *b = NN_tensor(2, (size_t[]){ 7, 7 }, DTYPE_F32, (float[]){ 0.4819588, 0.81978035, 0.99706656, 0.6984411, 0.5675464, 0.83524317, 0.20559883, 0.593172, 0.112347245, 0.15345693, 0.24170822, 0.7262365, 0.7010802, 0.20382375, 0.65105355, 0.774486, 0.43689132, 0.5190908, 0.61585236, 0.8101883, 0.98009706, 0.11468822, 0.31676513, 0.69650495, 0.9142747, 0.93510365, 0.9411784, 0.5995073, 0.06520867, 0.54599625, 0.18719733, 0.034022927, 0.94424623, 0.8801799, 0.0012360215, 0.593586, 0.41577, 0.41771942, 0.27112156, 0.6922781, 0.20384824, 0.68329567, 0.75285405, 0.8579358, 0.6869556, 0.005132377, 0.17565155, 0.7496575, 0.6046507 });
 
     
-    Tensor *golden = NN_tensor(2, (size_t[]){ 3, 3 }, DTYPE_F32, (float[]){ 1.1285629, 1.1171153, 0.49019474, 0.15435624, 0.47628176, 0.92796713, 1.0086153, 1.5941124, 1.2556393 });
-    Tensor *actual = NN_zeros(2, (size_t[]){ 3, 3 }, DTYPE_F32);
+    Tensor *golden = NN_tensor(2, (size_t[]){ 7, 7 }, DTYPE_F32, (float[]){ 0.9782154, 1.5880022, 1.085544, 0.8304716, 0.87496924, 1.4693218, 0.69569224, 1.4896168, 0.5679752, 0.7857632, 0.5906017, 1.1279538, 0.72340596, 0.3726827, 0.944942, 1.2930079, 1.1345589, 1.3191022, 0.7768818, 1.0924568, 1.6617056, 1.0298822, 0.71386504, 1.5706608, 1.333683, 1.4880106, 1.8939165, 0.6356721, 0.2504397, 0.9194136, 0.49229735, 0.9660233, 1.1201564, 1.1500134, 0.15191579, 0.62530553, 0.62389976, 1.3475184, 0.99423075, 1.4346144, 0.730144, 0.9269539, 1.3374465, 0.8910884, 0.82567245, 0.24736738, 0.9911205, 1.5428181, 0.88290316 });
+    Tensor *actual = NN_zeros(2, (size_t[]){ 7, 7 }, DTYPE_F32);
 
     cycles = READ_CSR("mcycle");
     NN_add(actual, a, b);
@@ -63,17 +63,17 @@ int main() {
   }
 
   {
-    printf("add1_F32:               ");
+    printf("add1:                   ");
 
-    Tensor *a = NN_tensor(2, (size_t[]){ 3, 3 }, DTYPE_F32, (float[]){ 0.16102946, 0.28226858, 0.68160856, 0.915194, 0.3970999, 0.8741559, 0.41940832, 0.55290705, 0.9527381 });
+    Tensor *a = NN_tensor(2, (size_t[]){ 7, 7 }, DTYPE_F32, (float[]){ 0.10995799, 0.21209025, 0.97037464, 0.83690894, 0.28198743, 0.3741576, 0.023700953, 0.49101293, 0.123470545, 0.11432165, 0.4724502, 0.5750725, 0.29523486, 0.7966888, 0.19573045, 0.95368505, 0.84264994, 0.07835853, 0.37555784, 0.5225613, 0.57295054, 0.61858714, 0.69621414, 0.5299501, 0.25603563, 0.7365945, 0.02037555, 0.20364666, 0.37483507, 0.25644332, 0.32508332, 0.09018916, 0.39364243, 0.6068782, 0.17426711, 0.47434032, 0.8579254, 0.44859987, 0.5138961, 0.45686555, 0.6011907, 0.81791973, 0.9736231, 0.81752795, 0.97470677, 0.46383917, 0.050839245, 0.2629614, 0.8404526 });
 
     float v = 0.8444218515250481;
     
-    Tensor *golden = NN_tensor(2, (size_t[]){ 3, 3 }, DTYPE_F32, (float[]){ 1.0054513, 1.1266904, 1.5260304, 1.7596159, 1.2415218, 1.7185777, 1.2638302, 1.3973289, 1.7971599 });
-    Tensor *actual = NN_zeros(2, (size_t[]){ 3, 3 }, DTYPE_F32);
+    Tensor *golden = NN_tensor(2, (size_t[]){ 7, 7 }, DTYPE_F32, (float[]){ 0.95437986, 1.0565121, 1.8147964, 1.6813308, 1.1264093, 1.2185795, 0.8681228, 1.3354348, 0.9678924, 0.9587435, 1.3168721, 1.4194944, 1.1396568, 1.6411107, 1.0401523, 1.7981069, 1.6870718, 0.9227804, 1.2199798, 1.3669832, 1.4173725, 1.463009, 1.5406361, 1.374372, 1.1004574, 1.5810163, 0.8647974, 1.0480685, 1.2192569, 1.1008651, 1.1695051, 0.934611, 1.2380643, 1.4513001, 1.0186889, 1.3187622, 1.7023473, 1.2930217, 1.358318, 1.3012874, 1.4456125, 1.6623416, 1.8180449, 1.6619499, 1.8191286, 1.308261, 0.8952611, 1.1073833, 1.6848745 });
+    Tensor *actual = NN_zeros(2, (size_t[]){ 7, 7 }, DTYPE_F32);
 
     cycles = READ_CSR("mcycle");
-    NN_add1_F32(actual, a, v);
+    NN_add1(actual, a, v);
     cycles = READ_CSR("mcycle") - cycles;
     printf("%s  (%lu cycles)\n", compare(golden, actual) ? "PASS" : "FAIL", cycles);
 
@@ -87,13 +87,13 @@ int main() {
   {
     printf("sub:                    ");
 
-    Tensor *a = NN_tensor(2, (size_t[]){ 3, 3 }, DTYPE_F32, (float[]){ 0.03616482, 0.18523103, 0.37341738, 0.30510002, 0.9320004, 0.17591017, 0.26983356, 0.15067977, 0.031719506 });
+    Tensor *a = NN_tensor(2, (size_t[]){ 7, 7 }, DTYPE_F32, (float[]){ 0.49675876, 0.25147682, 0.11684412, 0.032073975, 0.0779959, 0.39858162, 0.774203, 0.77032053, 0.017784059, 0.811891, 0.10874528, 0.39429486, 0.29726368, 0.40369236, 0.40182865, 0.051325023, 0.068281054, 0.42176026, 0.5064661, 0.27286255, 0.6883496, 0.049970806, 0.46625638, 0.9397097, 0.296054, 0.95150155, 0.6810769, 0.048769534, 0.8163487, 0.44230276, 0.27679658, 0.89982665, 0.09595239, 0.55365247, 0.39531565, 0.8570563, 0.63957226, 0.7402527, 0.6765795, 0.37976265, 0.39484727, 0.08795929, 0.77092206, 0.89698905, 0.8421124, 0.14731085, 0.52229995, 0.14753294, 0.22475791 });
 
-    Tensor *b = NN_tensor(2, (size_t[]){ 3, 3 }, DTYPE_F32, (float[]){ 0.20812976, 0.929799, 0.7231092, 0.7423363, 0.5262958, 0.24365824, 0.58459234, 0.03315264, 0.13871688 });
+    Tensor *b = NN_tensor(2, (size_t[]){ 7, 7 }, DTYPE_F32, (float[]){ 0.20864725, 0.6708725, 0.20204341, 0.4890914, 0.52103406, 0.8223115, 0.122039974, 0.15674388, 0.20966923, 0.8499667, 0.3202675, 0.92174435, 0.6808038, 0.563313, 0.496278, 0.40115923, 0.5627332, 0.38582766, 0.49648678, 0.5637965, 0.10889745, 0.23793429, 0.90374637, 0.09422666, 0.4640969, 0.99461937, 0.6806185, 0.5141565, 0.066695035, 0.74768895, 0.14385962, 0.35806787, 0.33224183, 0.4259563, 0.50546914, 0.91240376, 0.5624194, 0.9478464, 0.8058562, 0.18389302, 0.72425205, 0.14655197, 0.28808743, 0.64706135, 0.66509604, 0.875114, 0.33904207, 0.50080043, 0.7574118 });
 
     
-    Tensor *golden = NN_tensor(2, (size_t[]){ 3, 3 }, DTYPE_F32, (float[]){ -0.17196494, -0.744568, -0.3496918, -0.43723625, 0.40570462, -0.06774807, -0.31475878, 0.11752713, -0.10699737 });
-    Tensor *actual = NN_zeros(2, (size_t[]){ 3, 3 }, DTYPE_F32);
+    Tensor *golden = NN_tensor(2, (size_t[]){ 7, 7 }, DTYPE_F32, (float[]){ 0.2881115, -0.4193957, -0.0851993, -0.45701742, -0.44303817, -0.4237299, 0.652163, 0.61357665, -0.19188517, -0.038075686, -0.21152222, -0.5274495, -0.3835401, -0.15962064, -0.09444934, -0.3498342, -0.49445212, 0.0359326, 0.009979308, -0.29093397, 0.57945216, -0.18796349, -0.43749, 0.84548306, -0.1680429, -0.04311782, 0.00045835972, -0.465387, 0.74965364, -0.3053862, 0.13293695, 0.5417588, -0.23628944, 0.12769616, -0.110153496, -0.055347443, 0.07715285, -0.20759374, -0.12927675, 0.19586962, -0.32940477, -0.058592677, 0.48283464, 0.2499277, 0.17701638, -0.7278032, 0.18325788, -0.3532675, -0.53265387 });
+    Tensor *actual = NN_zeros(2, (size_t[]){ 7, 7 }, DTYPE_F32);
 
     cycles = READ_CSR("mcycle");
     NN_sub(actual, a, b);
@@ -111,11 +111,11 @@ int main() {
   {
     printf("addInplace:             ");
 
-    Tensor *b = NN_tensor(2, (size_t[]){ 3, 3 }, DTYPE_F32, (float[]){ 0.242235, 0.81546897, 0.7931606, 0.27825248, 0.4819588, 0.81978035, 0.99706656, 0.6984411, 0.5675464 });
+    Tensor *b = NN_tensor(2, (size_t[]){ 7, 7 }, DTYPE_F32, (float[]){ 0.016453922, 0.8614903, 0.08653879, 0.50689125, 0.41499162, 0.23666352, 0.5660855, 0.91345936, 0.35384023, 0.20315295, 0.31508058, 0.0044258237, 0.725697, 0.25986814, 0.16632986, 0.21194929, 0.787478, 0.76478684, 0.8837609, 0.68136156, 0.33302015, 0.36027592, 0.647715, 0.91101736, 0.6359461, 0.26342732, 0.2649613, 0.02726549, 0.608024, 0.21940875, 0.054212093, 0.93843824, 0.1752944, 0.44311923, 0.64324677, 0.51592916, 0.16355914, 0.09583914, 0.8985412, 0.58141935, 0.91481227, 0.3323797, 0.6472777, 0.3856619, 0.47776443, 0.1954779, 0.66910046, 0.65808296, 0.4896857 });
 
     
-    Tensor *golden = NN_tensor(2, (size_t[]){ 3, 3 }, DTYPE_F32, (float[]){ 0.242235, 0.81546897, 0.7931606, 0.27825248, 0.4819588, 0.81978035, 0.99706656, 0.6984411, 0.5675464 });
-    Tensor *actual = NN_zeros(2, (size_t[]){ 3, 3 }, DTYPE_F32);
+    Tensor *golden = NN_tensor(2, (size_t[]){ 7, 7 }, DTYPE_F32, (float[]){ 0.016453922, 0.8614903, 0.08653879, 0.50689125, 0.41499162, 0.23666352, 0.5660855, 0.91345936, 0.35384023, 0.20315295, 0.31508058, 0.0044258237, 0.725697, 0.25986814, 0.16632986, 0.21194929, 0.787478, 0.76478684, 0.8837609, 0.68136156, 0.33302015, 0.36027592, 0.647715, 0.91101736, 0.6359461, 0.26342732, 0.2649613, 0.02726549, 0.608024, 0.21940875, 0.054212093, 0.93843824, 0.1752944, 0.44311923, 0.64324677, 0.51592916, 0.16355914, 0.09583914, 0.8985412, 0.58141935, 0.91481227, 0.3323797, 0.6472777, 0.3856619, 0.47776443, 0.1954779, 0.66910046, 0.65808296, 0.4896857 });
+    Tensor *actual = NN_zeros(2, (size_t[]){ 7, 7 }, DTYPE_F32);
 
     cycles = READ_CSR("mcycle");
     NN_addInplace(actual, b);
@@ -130,15 +130,15 @@ int main() {
   }
 
   {
-    printf("fill_F32:               ");
+    printf("fill:                   ");
 
     float v = 0.7579544029403025;
     
-    Tensor *golden = NN_tensor(2, (size_t[]){ 3, 3 }, DTYPE_F32, (float[]){ 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544 });
-    Tensor *actual = NN_zeros(2, (size_t[]){ 3, 3 }, DTYPE_F32);
+    Tensor *golden = NN_tensor(2, (size_t[]){ 7, 7 }, DTYPE_F32, (float[]){ 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544, 0.7579544 });
+    Tensor *actual = NN_zeros(2, (size_t[]){ 7, 7 }, DTYPE_F32);
 
     cycles = READ_CSR("mcycle");
-    NN_fill_F32(actual, v);
+    NN_fill(actual, v);
     cycles = READ_CSR("mcycle") - cycles;
     printf("%s  (%lu cycles)\n", compare(golden, actual) ? "PASS" : "FAIL", cycles);
 
@@ -151,12 +151,12 @@ int main() {
   {
     printf("matmulT:                ");
 
-    Tensor *a = NN_tensor(2, (size_t[]){ 6, 3 }, DTYPE_F32, (float[]){ 0.83524317, 0.20559883, 0.593172, 0.112347245, 0.15345693, 0.24170822, 0.7262365, 0.7010802, 0.20382375, 0.65105355, 0.774486, 0.43689132, 0.5190908, 0.61585236, 0.8101883, 0.98009706, 0.11468822, 0.31676513 });
+    Tensor *a = NN_tensor(2, (size_t[]){ 6, 7 }, DTYPE_F32, (float[]){ 0.38754892, 0.1917851, 0.8457724, 0.12778795, 0.70483273, 0.33187324, 0.258766, 0.58982253, 0.24027151, 0.6152024, 0.5981904, 0.12875527, 0.5832493, 0.7129646, 0.6979155, 0.43706065, 0.09010619, 0.42292297, 0.67365384, 0.31756145, 0.68979055, 0.8329813, 0.2389242, 0.5049309, 0.7067495, 0.5391889, 0.54176575, 0.5624327, 0.10692614, 0.5392941, 0.8462349, 0.9505569, 0.79387546, 0.5670015, 0.7335071, 0.25676018, 0.08565581, 0.07003945, 0.99880487, 0.8173947, 0.15438312, 0.6956213 });
 
-    Tensor *b = NN_tensor(2, (size_t[]){ 5, 3 }, DTYPE_F32, (float[]){ 0.69650495, 0.9142747, 0.93510365, 0.9411784, 0.5995073, 0.06520867, 0.54599625, 0.18719733, 0.034022927, 0.94424623, 0.8801799, 0.0012360215, 0.593586, 0.41577, 0.41771942 });
+    Tensor *b = NN_tensor(2, (size_t[]){ 5, 7 }, DTYPE_F32, (float[]){ 0.8775838, 0.9998074, 0.93719745, 0.8873769, 0.38537037, 0.32452917, 0.9105244, 0.7801898, 0.19911051, 0.9495086, 0.7415793, 0.77256775, 0.18661183, 0.6434499, 0.32471877, 0.8906783, 0.4100297, 0.69465625, 0.5888109, 0.7127341, 0.33008623, 0.7437857, 0.15076452, 0.6129275, 0.16170406, 0.006731212, 0.09847212, 0.89473504, 0.7705178, 0.96910787, 0.9005606, 0.053477287, 0.15878445, 0.4192087, 0.17528385 });
 
     
-    Tensor *golden = NN_tensor(2, (size_t[]){ 6, 5 }, DTYPE_F32, (float[]){ 1.3244021, 0.94805074, 0.51470864, 0.9703723, 0.82904994, 0.44457445, 0.21349882, 0.098291524, 0.2414519, 0.23145676, 1.3374035, 1.1171119, 0.5346975, 1.3030747, 0.8077131, 1.5700936, 1.1055566, 0.5153189, 1.2969818, 0.8909623, 1.6822176, 0.91059625, 0.42627248, 1.0332117, 0.90260935, 1.0837072, 1.0118583, 0.56737596, 1.0267907, 0.7617748 });
+    Tensor *golden = NN_tensor(2, (size_t[]){ 6, 5 }, DTYPE_F32, (float[]){ 2.0528438, 2.0113466, 1.4691894, 1.1251802, 1.5493748, 2.7533019, 2.20283, 1.8001729, 1.6449451, 1.6632531, 2.4999304, 2.0542672, 1.797321, 1.3615949, 1.4260775, 2.9659772, 2.58056, 2.0705404, 1.6395576, 1.7771993, 3.4274392, 2.8903294, 2.6360347, 1.5506982, 1.9102607, 2.261407, 2.1324728, 1.703151, 1.0514295, 0.7137757 });
     Tensor *actual = NN_zeros(2, (size_t[]){ 6, 5 }, DTYPE_F32);
 
     cycles = READ_CSR("mcycle");
@@ -175,12 +175,12 @@ int main() {
   {
     printf("matmul:                 ");
 
-    Tensor *a = NN_tensor(2, (size_t[]){ 6, 3 }, DTYPE_F32, (float[]){ 0.27112156, 0.6922781, 0.20384824, 0.68329567, 0.75285405, 0.8579358, 0.6869556, 0.005132377, 0.17565155, 0.7496575, 0.6046507, 0.10995799, 0.21209025, 0.97037464, 0.83690894, 0.28198743, 0.3741576, 0.023700953 });
+    Tensor *a = NN_tensor(2, (size_t[]){ 6, 7 }, DTYPE_F32, (float[]){ 0.84719825, 0.121996105, 0.25604928, 0.016954303, 0.21612722, 0.91123873, 0.90938, 0.85791886, 0.88606364, 0.94459325, 0.3719685, 0.72000104, 0.9454652, 0.6654094, 0.9998382, 0.75933146, 0.81082416, 0.32500392, 0.73991376, 0.5574533, 0.38059133, 0.21814507, 0.21944171, 0.11525959, 0.83566517, 0.8554656, 0.44309366, 0.210657, 0.88645273, 0.81974447, 0.537167, 0.26393235, 0.9595239, 0.70447034, 0.12042731, 0.97854143, 0.8796869, 0.31775457, 0.78107727, 0.21590549, 0.42164284, 0.9245506 });
 
-    Tensor *b = NN_tensor(2, (size_t[]){ 3, 5 }, DTYPE_F32, (float[]){ 0.49101293, 0.123470545, 0.11432165, 0.4724502, 0.5750725, 0.29523486, 0.7966888, 0.19573045, 0.95368505, 0.84264994, 0.07835853, 0.37555784, 0.5225613, 0.57295054, 0.61858714 });
+    Tensor *b = NN_tensor(2, (size_t[]){ 7, 5 }, DTYPE_F32, (float[]){ 0.52065957, 0.14639091, 0.33288354, 0.36427742, 0.4035356, 0.5478503, 0.9624148, 0.5267702, 0.19128, 0.52562714, 0.7397436, 0.7480201, 0.04303074, 0.41052878, 0.12842774, 0.2866572, 0.6801467, 0.1449349, 0.68586344, 0.92438906, 0.5327942, 0.16675615, 0.32085752, 0.60918206, 0.11884099, 0.74840516, 0.04606521, 0.01935333, 0.014169693, 0.39856833, 0.83621645, 0.026760519, 0.91559356, 0.29998857, 0.64644206 });
 
     
-    Tensor *golden = NN_tensor(2, (size_t[]){ 6, 5 }, DTYPE_F32, (float[]){ 0.35348204, 0.6615625, 0.27301818, 0.90510166, 0.86536056, 0.6250023, 1.0063617, 0.673796, 1.5323635, 1.5580451, 0.3525831, 0.15487501, 0.17132716, 0.4300866, 0.5080299, 0.5552216, 0.61557466, 0.26151043, 0.99382263, 1.0086348, 0.45620644, 1.1135812, 0.6515146, 1.5051413, 1.4573545, 0.250781, 0.3418054, 0.11785651, 0.503633, 0.49210823 });
+    Tensor *golden = NN_tensor(2, (size_t[]){ 6, 5 }, DTYPE_F32, (float[]){ 2.259774, 0.54684687, 1.2793616, 0.8660709, 1.4312925, 3.3851278, 2.1193447, 1.705457, 1.776533, 2.1696422, 2.7592213, 1.8639714, 1.4114801, 1.6380526, 1.7633034, 1.522168, 1.0664225, 0.7902231, 1.332517, 1.4051012, 2.5228224, 1.6957082, 1.220037, 1.5118933, 1.5742182, 2.6540992, 1.8389752, 1.8399594, 1.6057416, 2.4114716 });
     Tensor *actual = NN_zeros(2, (size_t[]){ 6, 5 }, DTYPE_F32);
 
     cycles = READ_CSR("mcycle");
@@ -196,4 +196,90 @@ int main() {
     NN_deleteTensor(actual);
   }
 
+  {
+    printf("maximum:                ");
+
+    Tensor *a = NN_tensor(2, (size_t[]){ 7, 7 }, DTYPE_F32, (float[]){ 0.52280146, 0.049140453, 0.9146645, 0.7692217, 0.99699783, 0.7526061, 0.1699655, 0.9172919, 0.5268722, 0.73710823, 0.09908545, 0.35618675, 0.009061217, 0.30525374, 0.6078656, 0.10741913, 0.6593821, 0.7684034, 0.56965464, 0.16545832, 0.11234015, 0.3457417, 0.7194791, 0.9931982, 0.7875145, 0.44369537, 0.6753082, 0.009468555, 0.07294935, 0.73330396, 0.2167924, 0.74054784, 0.14703393, 0.25234455, 0.08815551, 0.76092035, 0.44905245, 0.88480055, 0.8094361, 0.7766713, 0.51607805, 0.345411, 0.39128417, 0.5664503, 0.74785477, 0.14970505, 0.91963893, 0.44563496, 0.08102721 });
+
+    Tensor *b = NN_tensor(2, (size_t[]){ 7, 7 }, DTYPE_F32, (float[]){ 0.22947109, 0.94240886, 0.9572636, 0.036860168, 0.85264915, 0.7505796, 0.79595923, 0.9232646, 0.23052484, 0.6578879, 0.7046166, 0.35225332, 0.66732657, 0.3561433, 0.80913067, 0.3612727, 0.31360215, 0.6258745, 0.6773468, 0.25571418, 0.54419917, 0.78976786, 0.45025164, 0.65216696, 0.3794065, 0.6752498, 0.1378029, 0.2059856, 0.24620473, 0.95950544, 0.36545795, 0.49863482, 0.25775224, 0.99914503, 0.9883351, 0.122906685, 0.09466505, 0.12100351, 0.49758863, 0.37254804, 0.17272717, 0.32066393, 0.59446543, 0.23875463, 0.61079127, 0.38534206, 0.25771832, 0.56869274, 0.9111291 });
+
+    
+    Tensor *golden = NN_tensor(2, (size_t[]){ 7, 7 }, DTYPE_F32, (float[]){ 0.52280146, 0.94240886, 0.9572636, 0.7692217, 0.99699783, 0.7526061, 0.79595923, 0.9232646, 0.5268722, 0.73710823, 0.7046166, 0.35618675, 0.66732657, 0.3561433, 0.80913067, 0.3612727, 0.6593821, 0.7684034, 0.6773468, 0.25571418, 0.54419917, 0.78976786, 0.7194791, 0.9931982, 0.7875145, 0.6752498, 0.6753082, 0.2059856, 0.24620473, 0.95950544, 0.36545795, 0.74054784, 0.25775224, 0.99914503, 0.9883351, 0.76092035, 0.44905245, 0.88480055, 0.8094361, 0.7766713, 0.51607805, 0.345411, 0.59446543, 0.5664503, 0.74785477, 0.38534206, 0.91963893, 0.56869274, 0.9111291 });
+    Tensor *actual = NN_zeros(2, (size_t[]){ 7, 7 }, DTYPE_F32);
+
+    cycles = READ_CSR("mcycle");
+    NN_maximum(actual, a, b);
+    cycles = READ_CSR("mcycle") - cycles;
+    printf("%s  (%lu cycles)\n", compare(golden, actual) ? "PASS" : "FAIL", cycles);
+
+
+    NN_deleteTensor(a);
+    NN_deleteTensor(b);
+    NN_deleteTensor(golden);
+    NN_freeTensorData(actual);
+    NN_deleteTensor(actual);
+  }
+
+  {
+    printf("minimum:                ");
+
+    Tensor *a = NN_tensor(2, (size_t[]){ 7, 7 }, DTYPE_F32, (float[]){ 0.16196036, 0.5232172, 0.31561613, 0.99065316, 0.025618374, 0.0206694, 0.9926925, 0.18365502, 0.5958617, 0.45684695, 0.3946715, 0.3883261, 0.8177203, 0.5238985, 0.013192713, 0.20481992, 0.32954985, 0.7516082, 0.17643315, 0.9714598, 0.38863534, 0.410219, 0.891779, 0.75130385, 0.92406017, 0.7892222, 0.34832305, 0.1682638, 0.46279848, 0.9138188, 0.3321901, 0.036315024, 0.7049642, 0.9867357, 0.3576584, 0.08598822, 0.046470165, 0.6252997, 0.46214014, 0.24750638, 0.60106593, 0.6898794, 0.8976595, 0.8881911, 0.42515814, 0.059116423, 0.048188448, 0.9668448, 0.7210276 });
+
+    Tensor *b = NN_tensor(2, (size_t[]){ 7, 7 }, DTYPE_F32, (float[]){ 0.7179537, 0.06738949, 0.96300787, 0.97367156, 0.95143014, 0.07820749, 0.3113383, 0.1561181, 0.9734828, 0.28516, 0.27172273, 0.76195645, 0.26870382, 0.25373894, 0.45626426, 0.45194024, 0.11051077, 0.91683406, 0.27943915, 0.67735744, 0.9348918, 0.7521582, 0.57078993, 0.9254285, 0.5672131, 0.2686717, 0.97299975, 0.61834025, 0.012159586, 0.3576542, 0.15941626, 0.9383765, 0.41742706, 0.044237554, 0.46856833, 0.81400645, 0.6299002, 0.6581022, 0.5464366, 0.68640935, 0.378174, 0.3010999, 0.032645762, 0.12333155, 0.71670127, 0.20394331, 0.57173324, 0.6595957, 0.53540194 });
+
+    
+    Tensor *golden = NN_tensor(2, (size_t[]){ 7, 7 }, DTYPE_F32, (float[]){ 0.16196036, 0.06738949, 0.31561613, 0.97367156, 0.025618374, 0.0206694, 0.3113383, 0.1561181, 0.5958617, 0.28516, 0.27172273, 0.3883261, 0.26870382, 0.25373894, 0.013192713, 0.20481992, 0.11051077, 0.7516082, 0.17643315, 0.67735744, 0.38863534, 0.410219, 0.57078993, 0.75130385, 0.5672131, 0.2686717, 0.34832305, 0.1682638, 0.012159586, 0.3576542, 0.15941626, 0.036315024, 0.41742706, 0.044237554, 0.3576584, 0.08598822, 0.046470165, 0.6252997, 0.46214014, 0.24750638, 0.378174, 0.3010999, 0.032645762, 0.12333155, 0.42515814, 0.059116423, 0.048188448, 0.6595957, 0.53540194 });
+    Tensor *actual = NN_zeros(2, (size_t[]){ 7, 7 }, DTYPE_F32);
+
+    cycles = READ_CSR("mcycle");
+    NN_minimum(actual, a, b);
+    cycles = READ_CSR("mcycle") - cycles;
+    printf("%s  (%lu cycles)\n", compare(golden, actual) ? "PASS" : "FAIL", cycles);
+
+
+    NN_deleteTensor(a);
+    NN_deleteTensor(b);
+    NN_deleteTensor(golden);
+    NN_freeTensorData(actual);
+    NN_deleteTensor(actual);
+  }
+
+  {
+    printf("max:                    ");
+
+    Tensor *a = NN_tensor(2, (size_t[]){ 7, 7 }, DTYPE_F32, (float[]){ 0.17582512, 0.9781642, 0.20925027, 0.9112503, 0.10224587, 0.37972575, 0.7719844, 0.29570967, 0.9200215, 0.15592176, 0.080114245, 0.27454042, 0.5808252, 0.96037793, 0.26129955, 0.6788141, 0.37464648, 0.39156884, 0.8676517, 0.112507045, 0.55310667, 0.9702046, 0.4312939, 0.88821906, 0.3460216, 0.9024811, 0.016334832, 0.42793816, 0.4121768, 0.6620425, 0.6961637, 0.88390845, 0.425507, 0.48017246, 0.8424056, 0.36471343, 0.9383168, 0.16709393, 0.44589508, 0.47314453, 0.72310495, 0.84183806, 0.4207481, 0.0857597, 0.7477461, 0.6495659, 0.70084965, 0.19156617, 0.8217978 });
+
+    
+    float golden = 0.9781641960144043;
+    float actual;
+
+    cycles = READ_CSR("mcycle");
+    actual = NN_max(a);
+    cycles = READ_CSR("mcycle") - cycles;
+    printf("%s  (%lu cycles)\n", float_eq(golden, actual, 1e-6) ? "PASS" : "FAIL", cycles);
+
+
+    NN_deleteTensor(a);
+    
+  }
+
+  {
+    printf("min:                    ");
+
+    Tensor *a = NN_tensor(2, (size_t[]){ 7, 7 }, DTYPE_F32, (float[]){ 0.9735775, 0.5433857, 0.032975793, 0.85099494, 0.12927437, 0.61493605, 0.5726589, 0.26598173, 0.6740978, 0.052783668, 0.61387974, 0.18302453, 0.44593316, 0.5643144, 0.9259722, 0.26143986, 0.82031804, 0.4364831, 0.2625361, 0.06460017, 0.04124081, 0.98830533, 0.37530023, 0.5249744, 0.63555616, 0.8398661, 0.92673707, 0.9055086, 0.12955844, 0.4198916, 0.20413119, 0.21432412, 0.6186035, 0.969324, 0.099448025, 0.80260223, 0.24076664, 0.40261286, 0.89688545, 0.38691485, 0.5455279, 0.15048373, 0.92562044, 0.43536508, 0.13430476, 0.64640516, 0.14449131, 0.10324633, 0.5304596 });
+
+    
+    float golden = 0.03297579288482666;
+    float actual;
+
+    cycles = READ_CSR("mcycle");
+    actual = NN_min(a);
+    cycles = READ_CSR("mcycle") - cycles;
+    printf("%s  (%lu cycles)\n", float_eq(golden, actual, 1e-6) ? "PASS" : "FAIL", cycles);
+
+
+    NN_deleteTensor(a);
+    
+  }
+
 }
\ No newline at end of file
diff --git a/tests/src/main_fp16.c b/tests/src/main_fp16.c
new file mode 100644
index 0000000..3b1b414
--- /dev/null
+++ b/tests/src/main_fp16.c
@@ -0,0 +1,67 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <rv.h>
+
+#include "nn.h"
+// #include "riscv_vector.h"
+
+
+void print_bits_half(float16_t x) {
+  for(int i = 15; i >= 0; i -= 1) {
+    printf("%d", ((x>>i)&1));
+    if(i == 15 || i == 10)
+      printf(" ");
+    if(i == 10)
+      printf("      ");
+  }
+  printf("\n");
+}
+void print_bits(float x) {
+  uint32_t b = *(uint32_t*)&x;
+  for(int i = 31; i >= 0; i -= 1) {
+    printf("%d", ((b>>i)&1));
+    if(i == 31 || i == 23)
+      printf(" ");
+    if(i == 23)
+      printf("   ");
+  }
+  printf("\n");
+}
+
+uint8_t compareResult(float golden, float actual) {
+  float diff = fabs(golden - actual);
+  if (diff > 1e-2) {
+    printf("FAILED ");
+    printf("golden: ");
+    NN_printFloat(golden, 6);
+    printf("\n");
+    printf("actual: ");
+    NN_printFloat(actual, 6);
+    printf("\n");
+    return 1;
+  }
+  printf("PASSED\n");
+  return 0;
+}
+
+int main() {
+  // for (size_t i = 0; i < 100; i += 1) {
+    // float x = rand() / (float)RAND_MAX * 1000.0f;
+
+    float x = (float)(0x47ca9334);
+
+    float16_t x_compressed = NN_floatToHalf(x);
+    float x_decompressed = NN_halfToFloat(x_compressed);
+    
+    print_bits(x);
+    print_bits_half(x_compressed);
+    print_bits(x_decompressed);
+
+    printf("%f\t", x);
+    printf("%f\n", x_decompressed);
+
+    compareResult(x, x_decompressed);
+  // }
+  return 0;
+}
\ No newline at end of file