diff --git a/modules/cpu/src/runtime/cmodel/include/layernorm.h b/modules/cpu/src/runtime/cmodel/include/layernorm.h
index b646a2b7b8..80ac9522fe 100644
--- a/modules/cpu/src/runtime/cmodel/include/layernorm.h
+++ b/modules/cpu/src/runtime/cmodel/include/layernorm.h
@@ -5,6 +5,7 @@
 #include
 #endif
 
+using namespace nncase::runtime::cpu;
 
 namespace kernels {
 namespace {
@@ -26,8 +27,8 @@ void layernorm_naive_impl(const T *input, const T *sum, T *sum_sqr, T *output,
         if (rms_norm) {
             mean = 0;
         }
-        auto sigma =
-            std::sqrt(sum_sqr[o_offset] / norm_size - mean * mean + eps);
+        auto sigma = nncase_mt.float_unary_sqrt(sum_sqr[o_offset] / norm_size -
+                                                mean * mean + eps);
 
         auto input_offset = offset(input_stride, input_index);
         auto in_offset = offset(gamma_strides, input_index.subspan(axis));
@@ -142,8 +143,8 @@ template <class T>
 void layernorm(const T *input, T *sum, T *sum_sqr, T *output, T *gamma,
                T *beta, dims_t input_dims, strides_t input_strides,
                strides_t output_strides, strides_t sum_strides,
-               strides_t gamma_strides, T eps, int32_t axis,
-               int32_t norm_size, bool rms_norm = false) {
+               strides_t gamma_strides, T eps, int32_t axis, int32_t norm_size,
+               bool rms_norm = false) {
 #ifdef __riscv_vector
     return layernorm_rvv_impl(
         input, sum, sum_sqr, gamma, beta,
diff --git a/modules/cpu/src/runtime/cmodel/include/matmul.h b/modules/cpu/src/runtime/cmodel/include/matmul.h
index 9aeb902448..5c8aa42cd4 100644
--- a/modules/cpu/src/runtime/cmodel/include/matmul.h
+++ b/modules/cpu/src/runtime/cmodel/include/matmul.h
@@ -35,9 +35,9 @@ void contiguous_matmul_impl(const T *input_a, const T *input_b, T *output,
     auto b_unit_size = new_b_shape[3] * new_b_shape[4];
     auto out_unit_size = new_a_shape[3] * new_b_shape[4];
 
-    auto dim0 = std::max(new_a_shape[0], new_b_shape[0]);
-    auto dim1 = std::max(new_a_shape[1], new_b_shape[1]);
-    auto dim2 = std::max(new_a_shape[2], new_b_shape[2]);
+    auto dim0 = new_a_shape[0] > new_b_shape[0] ? new_a_shape[0] : new_b_shape[0];
+    auto dim1 = new_a_shape[1] > new_b_shape[1] ? new_a_shape[1] : new_b_shape[1];
+    auto dim2 = new_a_shape[2] > new_b_shape[2] ? new_a_shape[2] : new_b_shape[2];
     auto ah_size = a_unit_size * new_a_shape[2];
     auto bh_size = b_unit_size * new_b_shape[2];
     auto oh_size = out_unit_size * dim2;
diff --git a/modules/cpu/src/runtime/cmodel/include/reduce.h b/modules/cpu/src/runtime/cmodel/include/reduce.h
index 1ca7592bbf..79ec08d853 100644
--- a/modules/cpu/src/runtime/cmodel/include/reduce.h
+++ b/modules/cpu/src/runtime/cmodel/include/reduce.h
@@ -37,8 +37,11 @@ template <class TShape>
 size_t get_reduce_block_size(const TShape &in_shape, const TShape &axis) {
     size_t size = 1;
     for (size_t i = 0; i < in_shape.size(); i++) {
-        if (std::find(axis.begin(), axis.end(), i) != axis.end()) {
-            size *= in_shape[i];
+        for (size_t j = 0; j < axis.size(); j++) {
+            if (i == axis[j]) {
+                size *= in_shape[i];
+                break;
+            }
         }
     }
 
@@ -143,14 +146,14 @@ void reduce(reduce_op_t op, const T *init_value, const T *input, T *output,
             gsl::span<const size_t> out_strides, bool keep_dims) noexcept {
     auto out_shape = get_reduced_shape(in_shape, axis, keep_dims);
     switch (op) {
-        REDUCE_IMPL(reduce_op_t::mean, std::plus<T>(),
+        REDUCE_IMPL(reduce_op_t::mean, [](T a, T b) { return a + b; },
                     [block_size = (T)get_reduce_block_size(in_shape, axis)](
                         T v) { return v / block_size; });
         REDUCE_IMPL_NO_POST(reduce_op_t::min,
-                            [](T a, T b) { return std::min(a, b); });
+                            [](T a, T b) { return a > b ? b : a; });
         REDUCE_IMPL_NO_POST(reduce_op_t::max,
-                            [](T a, T b) { return std::max(a, b); });
-        REDUCE_IMPL_NO_POST(reduce_op_t::sum, std::plus<T>());
+                            [](T a, T b) { return a > b ? a : b; });
+        REDUCE_IMPL_NO_POST(reduce_op_t::sum, [](T a, T b) { return a + b; });
         REDUCE_IMPL_NO_POST(reduce_op_t::sum_sqr,
                             [](T a, T b) { return a + (b * b); });
     case reduce_op_t::prod:
diff --git a/modules/cpu/src/runtime/cmodel/include/runtime_utils.h b/modules/cpu/src/runtime/cmodel/include/runtime_utils.h
index 3ca7b63f10..e6216187b4 100644
--- a/modules/cpu/src/runtime/cmodel/include/runtime_utils.h
+++ b/modules/cpu/src/runtime/cmodel/include/runtime_utils.h
@@ -169,7 +169,14 @@ inline dims_t get_reduced_offset(gsl::span<const size_t> in_offset,
     dims_t off;
     off.reserve(in_offset.size() - (keep_dims ? 0 : axis.size()));
     for (size_t i = 0; i < in_offset.size(); i++) {
-        if (std::find(axis.begin(), axis.end(), i) == axis.end()) {
+        bool found = false;
+        for (size_t j = 0; j < axis.size(); j++) {
+            if (i == axis[j]) {
+                found = true;
+                break;
+            }
+        }
+        if (!found) {
             off.push_back(in_offset[i]);
         } else {
             if (keep_dims)
@@ -199,7 +206,14 @@ inline dims_t get_reduced_shape(gsl::span<const size_t> in_shape,
     dims_t shape;
     shape.reserve(in_shape.size() - (keep_dims ? 0 : axis.size()));
     for (size_t i = 0; i < in_shape.size(); i++) {
-        if (std::find(axis.begin(), axis.end(), i) == axis.end()) {
+        bool found = false;
+        for (size_t j = 0; j < axis.size(); j++) {
+            if (i == axis[j]) {
+                found = true;
+                break;
+            }
+        }
+        if (!found) {
             shape.push_back(in_shape[i]);
         } else {
             if (keep_dims)
diff --git a/modules/cpu/src/runtime/cmodel/include/softmax.h b/modules/cpu/src/runtime/cmodel/include/softmax.h
index 6967b30ac1..89ca7fc448 100644
--- a/modules/cpu/src/runtime/cmodel/include/softmax.h
+++ b/modules/cpu/src/runtime/cmodel/include/softmax.h
@@ -15,6 +15,8 @@
 #include
 #include
 
+using namespace nncase::runtime::cpu;
+
 namespace kernels {
 namespace {
@@ -30,7 +32,10 @@ void softmax_impl(const T *input, T *output, gsl::span<const size_t> in_shape,
     auto reduced_shape = get_reduced_shape(in_shape, axes, true);
     auto reduced_strides = get_default_strides(reduced_shape);
     auto reduced_size = compute_size(reduced_shape);
-    std::vector<T> tmp(reduced_size, std::numeric_limits<T>::lowest());
+    auto tmp = (T *)runtime_util.malloc(reduced_size * sizeof(T));
+    for (size_t i = 0; i < reduced_size; i++) {
+        tmp[i] = std::numeric_limits<T>::lowest();
+    }
 
     // reduce_max
     (apply(in_shape, [&](gsl::span<const size_t> index) -> void {
@@ -41,7 +46,7 @@ void softmax_impl(const T *input, T *output, gsl::span<const size_t> in_shape,
         auto out_idx = offset(reduced_strides, out_index);
         auto &out = tmp[out_idx];
 
-        out = std::max(in, out);
+        out = in > out ? in : out;
     }));
 
     // x - reduce_max
@@ -57,7 +62,9 @@ void softmax_impl(const T *input, T *output, gsl::span<const size_t> in_shape,
     }));
 
     // exp(x - reduce_max) and sum
-    tmp.assign(tmp.size(), static_cast<T>(0));
+    for (size_t i = 0; i < reduced_size; i++) {
+        tmp[i] = static_cast<T>(0);
+    }
     (apply(in_shape, [&](gsl::span<const size_t> index) -> void {
         auto in_idx = offset(out_strides, index);
         const auto in = output[in_idx];
@@ -78,7 +85,12 @@ void softmax_impl(const T *input, T *output, gsl::span<const size_t> in_shape,
         auto &out = output[out_idx];
         out /= in;
         if (needLog) {
-            out = std::log(out);
+            if (std::is_same_v<T, float>) {
+                out = nncase_mt.float_unary_log(out);
+            } else {
+                runtime_util.rt_assert(
+                    false, (char *)"Not supported Type in softmax!");
+            }
         }
     }));
 }
diff --git a/modules/cpu/src/runtime/cmodel/include/tdma.h b/modules/cpu/src/runtime/cmodel/include/tdma.h
index b91b194bc4..9faced3135 100644
--- a/modules/cpu/src/runtime/cmodel/include/tdma.h
+++ b/modules/cpu/src/runtime/cmodel/include/tdma.h
@@ -155,7 +155,7 @@ template <class T>
 void concat(std::initializer_list<tensor<T>> inits, tensor<T> &output,
             size_t axis) {
     itlib::small_vector<tensor<T>> inputs(inits.size());
-    std::vector<strides_t> in_strides(inits.size());
+    itlib::small_vector<strides_t> in_strides(inits.size());
     auto concat_dims = dims_t(inits.size(), 1);
     for (size_t i = 0; i < inits.size(); ++i) {
         if (inits[i].dimension().size() != 0) {
@@ -273,7 +273,8 @@ template <class T>
 void tdma_load_broadcast_async([[maybe_unused]] tensor<T> &dest,
                                [[maybe_unused]] tensor<T> &src,
                                [[maybe_unused]] thread_context &ctx) {
-    throw std::system_error(std::make_error_code(std::errc::not_supported));
+    // throw std::system_error(std::make_error_code(std::errc::not_supported));
+    runtime_util.rt_assert(false, (char *)"not_supported");
 }
 
 template <class T>
@@ -287,7 +288,8 @@ void tdma_reduce_async(tensor<T> &src,
     new_dims.insert(new_dims.begin(), BLOCKS * CORES);
     if (visited == 1) {
         if (global_hardware_ctx.global_var != nullptr) {
-            throw std::runtime_error(" the global var has been used!");
+            runtime_util.rt_assert(false,
+                                   (char *)"the global var has been used!");
         }
         gather_tensor = new tensor<T>(new_dims);
         global_hardware_ctx.global_var = (void *)gather_tensor;
@@ -359,7 +360,7 @@ void tdma_all_reduce_async(tensor<T> &src, tensor<T> &dest,
     new_dims.insert(new_dims.begin(), BLOCKS * CORES);
     if (visited == 1) {
         if (global_hardware_ctx.global_var != nullptr) {
-            throw std::runtime_error(" the global var has been used!");
+            runtime_util.rt_assert(false, (char *)"the global var has been used!");
         }
         gather_tensor = new tensor<T>(new_dims);
         global_hardware_ctx.global_var = (void *)gather_tensor;
diff --git a/modules/cpu/src/runtime/cmodel/include/tensor.h b/modules/cpu/src/runtime/cmodel/include/tensor.h
index 75e9eabc0e..ed064dd87a 100644
--- a/modules/cpu/src/runtime/cmodel/include/tensor.h
+++ b/modules/cpu/src/runtime/cmodel/include/tensor.h
@@ -34,7 +34,7 @@ template <class T> class tensor {
           strides_(get_default_strides(dims_)),
           size_(compute_size(dims_)) {
         if (size_ != data_.size()) {
-            throw std::errc::invalid_argument;
+            runtime_util.rt_assert(false, (char *)"Invalid tensor size");
        }
     }
 
@@ -45,7 +45,7 @@ template <class T> class tensor {
           strides_(strides),
           size_(compute_size(dims_, strides_)) {
         if (size_ != data_.size()) {
-            throw std::errc::invalid_argument;
+            runtime_util.rt_assert(false, (char *)"Invalid tensor size");
        }
    }
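
Note on the pattern: every hunk above replaces a hosted-libstdc++ facility (std::sqrt/std::log, std::max/std::min, std::find, std::vector, C++ exceptions) with either inline scalar code or a call through the runtime's injected service tables, nncase_mt for math and runtime_util for malloc/assert, so these cmodel headers can build without exception support or the standard library. Below is a minimal, self-contained sketch of that wiring. The two struct layouts are assumptions inferred from the calls this patch makes (float_unary_sqrt, float_unary_log, malloc, rt_assert); the real definitions live elsewhere in the nncase cpu runtime.

// sketch.cpp -- illustrative only; not part of the patch.
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <cstdlib>

// Hypothetical layouts for the injected tables; field names are inferred
// from the calls used in the hunks above, not copied from the runtime.
struct nncase_mt_t {
    float (*float_unary_sqrt)(float);
    float (*float_unary_log)(float);
};

struct runtime_util_t {
    void *(*malloc)(std::size_t);
    void (*free)(void *);
    void (*rt_assert)(bool, char *);
};

// In the cmodel headers these are globals the host populates before any
// kernel runs; kernels then never touch libm/libstdc++ directly.
nncase_mt_t nncase_mt;
runtime_util_t runtime_util;

int main() {
    // Host-side wiring with ordinary libc/libm implementations.
    nncase_mt.float_unary_sqrt = [](float x) { return std::sqrt(x); };
    nncase_mt.float_unary_log = [](float x) { return std::log(x); };
    runtime_util.malloc = std::malloc;
    runtime_util.free = std::free;
    runtime_util.rt_assert = [](bool ok, char *msg) {
        if (!ok) {
            std::fprintf(stderr, "rt_assert: %s\n", msg);
            std::abort();
        }
    };

    // The same call shapes the patch introduces in layernorm/softmax/tdma.
    float sigma = nncase_mt.float_unary_sqrt(4.0f);
    runtime_util.rt_assert(sigma == 2.0f, (char *)"sqrt sanity check");
    auto *tmp = (float *)runtime_util.malloc(8 * sizeof(float));
    runtime_util.rt_assert(tmp != nullptr, (char *)"allocation failed");
    runtime_util.free(tmp); // raw malloc must be paired with free by hand
    return 0;
}

Routing through function pointers instead of linking libm keeps the kernel objects freestanding; a device-side build can point the same fields at firmware routines without recompiling the kernels.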