diff --git a/modules/cpu/src/runtime/cmodel/include/layernorm.h b/modules/cpu/src/runtime/cmodel/include/layernorm.h
index b646a2b7b8..80ac9522fe 100644
--- a/modules/cpu/src/runtime/cmodel/include/layernorm.h
+++ b/modules/cpu/src/runtime/cmodel/include/layernorm.h
@@ -5,6 +5,7 @@
 #include
 #endif
 
+using namespace nncase::runtime::cpu;
 
 namespace kernels {
 namespace {
@@ -26,8 +27,8 @@ void layernorm_naive_impl(const T *input, const T *sum, T *sum_sqr, T *output,
         if (rms_norm) {
             mean = 0;
         }
-        auto sigma =
-            std::sqrt(sum_sqr[o_offset] / norm_size - mean * mean + eps);
+        auto sigma = nncase_mt.float_unary_sqrt(sum_sqr[o_offset] / norm_size -
+                                                mean * mean + eps);
 
         auto input_offset = offset(input_stride, input_index);
         auto in_offset = offset(gamma_strides, input_index.subspan(axis));
@@ -142,8 +143,8 @@ template <class T>
 void layernorm(const T *input, T *sum, T *sum_sqr, T *output, T *gamma,
                T *beta, dims_t input_dims, strides_t input_strides,
                strides_t output_strides, strides_t sum_strides,
-               strides_t gamma_strides, T eps, int32_t axis,
-               int32_t norm_size, bool rms_norm = false) {
+               strides_t gamma_strides, T eps, int32_t axis, int32_t norm_size,
+               bool rms_norm = false) {
 #ifdef __riscv_vector
     return layernorm_rvv_impl(
         input, sum, sum_sqr, gamma, beta,
diff --git a/modules/cpu/src/runtime/cmodel/include/matmul.h b/modules/cpu/src/runtime/cmodel/include/matmul.h
index 9aeb902448..5c8aa42cd4 100644
--- a/modules/cpu/src/runtime/cmodel/include/matmul.h
+++ b/modules/cpu/src/runtime/cmodel/include/matmul.h
@@ -35,9 +35,9 @@ void contiguous_matmul_impl(const T *input_a, const T *input_b, T *output,
     auto b_unit_size = new_b_shape[3] * new_b_shape[4];
     auto out_unit_size = new_a_shape[3] * new_b_shape[4];
 
-    auto dim0 = std::max(new_a_shape[0], new_b_shape[0]);
-    auto dim1 = std::max(new_a_shape[1], new_b_shape[1]);
-    auto dim2 = std::max(new_a_shape[2], new_b_shape[2]);
+    auto dim0 = new_a_shape[0] > new_b_shape[0] ? new_a_shape[0] : new_b_shape[0];
+    auto dim1 = new_a_shape[1] > new_b_shape[1] ? new_a_shape[1] : new_b_shape[1];
+    auto dim2 = new_a_shape[2] > new_b_shape[2] ? new_a_shape[2] : new_b_shape[2];
     auto ah_size = a_unit_size * new_a_shape[2];
     auto bh_size = b_unit_size * new_b_shape[2];
     auto oh_size = out_unit_size * dim2;
diff --git a/modules/cpu/src/runtime/cmodel/include/reduce.h b/modules/cpu/src/runtime/cmodel/include/reduce.h
index 1ca7592bbf..79ec08d853 100644
--- a/modules/cpu/src/runtime/cmodel/include/reduce.h
+++ b/modules/cpu/src/runtime/cmodel/include/reduce.h
@@ -37,8 +37,11 @@ template <class TShape>
 size_t get_reduce_block_size(const TShape &in_shape, const TShape &axis) {
     size_t size = 1;
     for (size_t i = 0; i < in_shape.size(); i++) {
-        if (std::find(axis.begin(), axis.end(), i) != axis.end()) {
-            size *= in_shape[i];
+        for (size_t j = 0; j < axis.size(); j++) {
+            if (i == axis[j]) {
+                size *= in_shape[i];
+                break;
+            }
         }
     }
 
@@ -143,14 +146,14 @@ void reduce(reduce_op_t op, const T *init_value, const T *input, T *output,
             gsl::span<const size_t> out_strides, bool keep_dims) noexcept {
     auto out_shape = get_reduced_shape(in_shape, axis, keep_dims);
     switch (op) {
-        REDUCE_IMPL(reduce_op_t::mean, std::plus<T>(),
+        REDUCE_IMPL(reduce_op_t::mean, [](T a, T b) { return a + b; },
                     [block_size = (T)get_reduce_block_size(in_shape, axis)](
                         T v) { return v / block_size; });
         REDUCE_IMPL_NO_POST(reduce_op_t::min,
-                            [](T a, T b) { return std::min(a, b); });
+                            [](T a, T b) { return a > b ? b : a; });
         REDUCE_IMPL_NO_POST(reduce_op_t::max,
-                            [](T a, T b) { return std::max(a, b); });
-        REDUCE_IMPL_NO_POST(reduce_op_t::sum, std::plus<T>());
+                            [](T a, T b) { return a > b ? a : b; });
+        REDUCE_IMPL_NO_POST(reduce_op_t::sum, [](T a, T b) { return a + b; });
         REDUCE_IMPL_NO_POST(reduce_op_t::sum_sqr,
                             [](T a, T b) { return a + (b * b); });
     case reduce_op_t::prod:
diff --git a/modules/cpu/src/runtime/cmodel/include/runtime_utils.h b/modules/cpu/src/runtime/cmodel/include/runtime_utils.h
index 3ca7b63f10..e6216187b4 100644
--- a/modules/cpu/src/runtime/cmodel/include/runtime_utils.h
+++ b/modules/cpu/src/runtime/cmodel/include/runtime_utils.h
@@ -169,7 +169,14 @@ inline dims_t get_reduced_offset(gsl::span<const size_t> in_offset,
     dims_t off;
     off.reserve(in_offset.size() - (keep_dims ? 0 : axis.size()));
     for (size_t i = 0; i < in_offset.size(); i++) {
-        if (std::find(axis.begin(), axis.end(), i) == axis.end()) {
+        bool found = false;
+        for (size_t j = 0; j < axis.size(); j++) {
+            if (i == axis[j]) {
+                found = true;
+                break;
+            }
+        }
+        if (!found) {
             off.push_back(in_offset[i]);
         } else {
             if (keep_dims)
@@ -199,7 +206,14 @@ inline dims_t get_reduced_shape(gsl::span<const size_t> in_shape,
     dims_t shape;
     shape.reserve(in_shape.size() - (keep_dims ? 0 : axis.size()));
     for (size_t i = 0; i < in_shape.size(); i++) {
-        if (std::find(axis.begin(), axis.end(), i) == axis.end()) {
+        bool found = false;
+        for (size_t j = 0; j < axis.size(); j++) {
+            if (i == axis[j]) {
+                found = true;
+                break;
+            }
+        }
+        if (!found) {
             shape.push_back(in_shape[i]);
         } else {
             if (keep_dims)
diff --git a/modules/cpu/src/runtime/cmodel/include/softmax.h b/modules/cpu/src/runtime/cmodel/include/softmax.h
index 6967b30ac1..89ca7fc448 100644
--- a/modules/cpu/src/runtime/cmodel/include/softmax.h
+++ b/modules/cpu/src/runtime/cmodel/include/softmax.h
@@ -15,6 +15,8 @@
 #include
 #include
 
+using namespace nncase::runtime::cpu;
+
 namespace kernels {
 namespace {
@@ -30,7 +32,10 @@ void softmax_impl(const T *input, T *output, gsl::span<const size_t> in_shape,
     auto reduced_shape = get_reduced_shape(in_shape, axes, true);
     auto reduced_strides = get_default_strides(reduced_shape);
     auto reduced_size = compute_size(reduced_shape);
-    std::vector<T> tmp(reduced_size, std::numeric_limits<T>::lowest());
+    auto tmp = (T *)runtime_util.malloc(reduced_size * sizeof(T));
+    for (size_t i = 0; i < reduced_size; i++) {
+        tmp[i] = std::numeric_limits<T>::lowest();
+    }
 
     // reduce_max
     (apply(in_shape, [&](gsl::span<const size_t> index) -> void {
@@ -41,7 +46,7 @@ void softmax_impl(const T *input, T *output, gsl::span<const size_t> in_shape,
         auto out_idx = offset(reduced_strides, out_index);
         auto &out = tmp[out_idx];
 
-        out = std::max(in, out);
+        out = in > out ? in : out;
     }));
 
     // x - reduce_max
@@ -57,7 +62,9 @@ void softmax_impl(const T *input, T *output, gsl::span<const size_t> in_shape,
     }));
 
     // exp(x - reduce_max) and sum
-    tmp.assign(tmp.size(), static_cast<T>(0));
+    for (size_t i = 0; i < reduced_size; i++) {
+        tmp[i] = static_cast<T>(0);
+    }
     (apply(in_shape, [&](gsl::span<const size_t> index) -> void {
         auto in_idx = offset(out_strides, index);
         const auto in = output[in_idx];
@@ -78,7 +85,12 @@ void softmax_impl(const T *input, T *output, gsl::span<const size_t> in_shape,
         auto &out = output[out_idx];
         out /= in;
         if (needLog) {
-            out = std::log(out);
+            if (std::is_same_v<T, float>) {
+                out = nncase_mt.float_unary_log(out);
+            } else {
+                runtime_util.rt_assert(
+                    false, (char *)"Not supported Type in softmax!");
+            }
         }
     }));
 }
diff --git a/modules/cpu/src/runtime/cmodel/include/tdma.h b/modules/cpu/src/runtime/cmodel/include/tdma.h
index b91b194bc4..9faced3135 100644
--- a/modules/cpu/src/runtime/cmodel/include/tdma.h
+++ b/modules/cpu/src/runtime/cmodel/include/tdma.h
@@ -155,7 +155,7 @@ template <class T>
 void concat(std::initializer_list<tensor<T>> inits, tensor<T> &output,
             size_t axis) {
     itlib::small_vector<tensor<T>> inputs(inits.size());
-    std::vector<strides_t> in_strides(inits.size());
+    itlib::small_vector<strides_t> in_strides(inits.size());
     auto concat_dims = dims_t(inits.size(), 1);
     for (size_t i = 0; i < inits.size(); ++i) {
         if (inits[i].dimension().size() != 0) {
@@ -273,7 +273,8 @@ template <class T>
 void tdma_load_broadcast_async([[maybe_unused]] tensor<T> &dest,
                                [[maybe_unused]] tensor<T> &src,
                                [[maybe_unused]] thread_context &ctx) {
-    throw std::system_error(std::make_error_code(std::errc::not_supported));
+    // throw std::system_error(std::make_error_code(std::errc::not_supported));
+    runtime_util.rt_assert(false, (char *)"not_supported");
 }
 
 template <class T>
@@ -287,7 +288,8 @@ void tdma_reduce_async(tensor<T> &src,
     new_dims.insert(new_dims.begin(), BLOCKS * CORES);
     if (visited == 1) {
         if (global_hardware_ctx.global_var != nullptr) {
-            throw std::runtime_error(" the global var has been used!");
+            runtime_util.rt_assert(false,
+                                   (char *)"the global var has been used!");
         }
         gather_tensor = new tensor<T>(new_dims);
         global_hardware_ctx.global_var = (void *)gather_tensor;
@@ -359,7 +360,7 @@ void tdma_all_reduce_async(tensor<T> &src, tensor<T> &dest,
     new_dims.insert(new_dims.begin(), BLOCKS * CORES);
     if (visited == 1) {
         if (global_hardware_ctx.global_var != nullptr) {
-            throw std::runtime_error(" the global var has been used!");
+            runtime_util.rt_assert(false, (char *)"the global var has been used!");
         }
         gather_tensor = new tensor<T>(new_dims);
         global_hardware_ctx.global_var = (void *)gather_tensor;
diff --git a/modules/cpu/src/runtime/cmodel/include/tensor.h b/modules/cpu/src/runtime/cmodel/include/tensor.h
index 75e9eabc0e..ed064dd87a 100644
--- a/modules/cpu/src/runtime/cmodel/include/tensor.h
+++ b/modules/cpu/src/runtime/cmodel/include/tensor.h
@@ -34,7 +34,7 @@ template <class T> class tensor {
           strides_(get_default_strides(dims_)),
           size_(compute_size(dims_)) {
         if (size_ != data_.size()) {
-            throw std::errc::invalid_argument;
+            runtime_util.rt_assert(false, (char *)"Invalid tensor size");
        }
     }
 
@@ -45,7 +45,7 @@ template <class T> class tensor {
           strides_(strides),
           size_(compute_size(dims_, strides_)) {
         if (size_ != data_.size()) {
-            throw std::errc::invalid_argument;
+            runtime_util.rt_assert(false, (char *)"Invalid tensor size");
        }
    }
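
Note on the pattern: every hunk above replaces a hosted-libstdc++ facility (std::sqrt/std::log, std::max/std::min, std::find, std::vector, C++ exceptions) with either inline scalar code or a call through the runtime's injected service tables, nncase_mt for math and runtime_util for malloc/assert, so these cmodel headers can build without exception support or the standard library. Below is a minimal, self-contained sketch of that wiring. The two struct layouts are assumptions inferred from the calls this patch makes (float_unary_sqrt, float_unary_log, malloc, rt_assert); the real definitions live elsewhere in the nncase cpu runtime.

// sketch.cpp -- illustrative only; not part of the patch.
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <cstdlib>

// Hypothetical layouts for the injected tables; field names are inferred
// from the calls used in the hunks above, not copied from the runtime.
struct nncase_mt_t {
    float (*float_unary_sqrt)(float);
    float (*float_unary_log)(float);
};

struct runtime_util_t {
    void *(*malloc)(std::size_t);
    void (*free)(void *);
    void (*rt_assert)(bool, char *);
};

// In the cmodel headers these are globals the host populates before any
// kernel runs; kernels then never touch libm/libstdc++ directly.
nncase_mt_t nncase_mt;
runtime_util_t runtime_util;

int main() {
    // Host-side wiring with ordinary libc/libm implementations.
    nncase_mt.float_unary_sqrt = [](float x) { return std::sqrt(x); };
    nncase_mt.float_unary_log = [](float x) { return std::log(x); };
    runtime_util.malloc = std::malloc;
    runtime_util.free = std::free;
    runtime_util.rt_assert = [](bool ok, char *msg) {
        if (!ok) {
            std::fprintf(stderr, "rt_assert: %s\n", msg);
            std::abort();
        }
    };

    // The same call shapes the patch introduces in layernorm/softmax/tdma.
    float sigma = nncase_mt.float_unary_sqrt(4.0f);
    runtime_util.rt_assert(sigma == 2.0f, (char *)"sqrt sanity check");
    auto *tmp = (float *)runtime_util.malloc(8 * sizeof(float));
    runtime_util.rt_assert(tmp != nullptr, (char *)"allocation failed");
    runtime_util.free(tmp); // raw malloc must be paired with free by hand
    return 0;
}

Routing through function pointers instead of linking libm keeps the kernel objects freestanding; a device-side build can point the same fields at firmware routines without recompiling the kernels.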