From 08342c6f9f09d06f284be7c219766e6657ff5b2b Mon Sep 17 00:00:00 2001
From: sunnycase <sunnycase@live.cn>
Date: Thu, 12 Sep 2024 01:39:32 +0000
Subject: [PATCH 01/10] [ntt.x86] Refactor reduce kernel

---
 .../nncase/ntt/arch/x86_64/primitive_ops.h    |  54 ++++
 .../nncase/ntt/kernels/packed_layer_norm.h    | 144 ---------
 .../nncase/ntt/kernels/packed_softmax.h       | 110 -------
 .../include/nncase/ntt/kernels/reduce.h       | 241 ++++++++------
 src/Native/include/nncase/ntt/ntt.h           |   2 -
 src/Native/include/nncase/ntt/primitive_ops.h |  37 ++-
 src/Native/include/nncase/ntt/tensor_ops.h    |  24 +-
 src/Native/src/test.cpp                       | 298 +-----------------
 .../benchmark_test/benchmark_ntt_reduce.cpp   | 224 +++++--------
 src/Native/test/ctest/test_ntt_reduce.cpp     | 120 +++----
 10 files changed, 363 insertions(+), 891 deletions(-)
 delete mode 100644 src/Native/include/nncase/ntt/kernels/packed_layer_norm.h
 delete mode 100644 src/Native/include/nncase/ntt/kernels/packed_softmax.h
diff --git a/src/Native/include/nncase/ntt/arch/x86_64/primitive_ops.h b/src/Native/include/nncase/ntt/arch/x86_64/primitive_ops.h
index caa7a621f2..26c553e989 100644
--- a/src/Native/include/nncase/ntt/arch/x86_64/primitive_ops.h
+++ b/src/Native/include/nncase/ntt/arch/x86_64/primitive_ops.h
@@ -651,6 +651,60 @@ template <> struct max<float, ntt::vector<float, 8>> {
     }
 };
 
+template <> struct reduce<add, float, ntt::vector<float, 8>> {
+    float operator()(const ntt::vector<float, 8> &v,
+                     float init_value) const noexcept {
+        return init_value + operator()(v); // seed + horizontal sum of v
+    }
+
+    float operator()(const ntt::vector<float, 8> &v) const noexcept {
+        // Horizontal add: fold the upper 128-bit half onto the lower half
+        __m128 sum =
+            _mm_add_ps(_mm256_castps256_ps128(v), _mm256_extractf128_ps(v, 1));
+        sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum)); // 4 partial sums -> 2
+        sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 1)); // 2 -> lane 0
+
+        // Lane 0 now holds the sum of all eight elements
+        return _mm_cvtss_f32(sum);
+    }
+};
+
+template <> struct reduce<max, float, ntt::vector<float, 8>> {
+    float operator()(const ntt::vector<float, 8> &v,
+                     float init_value) const noexcept {
+        return ntt::max(init_value, operator()(v)); // max(seed, horizontal max)
+    }
+
+    float operator()(const ntt::vector<float, 8> &v) const noexcept {
+        // Horizontal max (the local is named `sum` but holds partial maxima)
+        __m128 sum =
+            _mm_max_ps(_mm256_castps256_ps128(v), _mm256_extractf128_ps(v, 1));
+        sum = _mm_max_ps(sum, _mm_movehl_ps(sum, sum)); // 4 partial maxima -> 2
+        sum = _mm_max_ss(sum, _mm_shuffle_ps(sum, sum, 1)); // 2 -> lane 0
+
+        // Lane 0 now holds the maximum of all eight elements
+        return _mm_cvtss_f32(sum);
+    }
+};
+
+template <> struct reduce<min, float, ntt::vector<float, 8>> {
+    float operator()(const ntt::vector<float, 8> &v,
+                     float init_value) const noexcept {
+        return ntt::min(init_value, operator()(v)); // min(seed, horizontal min)
+    }
+
+    float operator()(const ntt::vector<float, 8> &v) const noexcept {
+        // Horizontal min (the local is named `sum` but holds partial minima)
+        __m128 sum =
+            _mm_min_ps(_mm256_castps256_ps128(v), _mm256_extractf128_ps(v, 1));
+        sum = _mm_min_ps(sum, _mm_movehl_ps(sum, sum)); // 4 partial minima -> 2
+        sum = _mm_min_ss(sum, _mm_shuffle_ps(sum, sum, 1)); // 2 -> lane 0
+
+        // Lane 0 now holds the minimum of all eight elements
+        return _mm_cvtss_f32(sum);
+    }
+};
+
 template <bool AccC>
 struct mma<AccC, ntt::vector<float, 8, 8>, ntt::vector<float, 8, 8>,
            ntt::vector<float, 8, 8>> {
diff --git a/src/Native/include/nncase/ntt/kernels/packed_layer_norm.h b/src/Native/include/nncase/ntt/kernels/packed_layer_norm.h
deleted file mode 100644
index 6432e7685c..0000000000
--- a/src/Native/include/nncase/ntt/kernels/packed_layer_norm.h
+++ /dev/null
@@ -1,144 +0,0 @@
-/* Copyright 2019-2021 Canaan Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-#include "../apply.h"
-#include "../tensor_ops.h"
-#include "../utility.h"
-#include "binary.h"
-#include "unary.h"
-
-namespace nncase::ntt {
-
-namespace packed_layer_norm_detail {
-
-template <size_t Axis, IsFixedTensor TIn, IsFixedTensor TScale,
-          IsFixedTensor TBias, IsFixedTensor TOut, typename TEp,
-          IsFixedDims PackedAxes, IsFixedDims PadedNums>
-void within_axis_pack_impl(const TIn &input, const TScale &scale,
-                           const TBias &bias, TOut &&output, const TEp &epsilon,
-                           const bool &use_mean, PackedAxes, PadedNums) {
-    using TElem = typename TIn::element_type;
-    constexpr auto input_shape = typename TIn::shape_type{};
-    constexpr auto input_strides = typename TIn::strides_type{};
-    constexpr auto scale_shape = typename TScale::shape_type{};
-    constexpr auto scale_strides = typename TScale::strides_type{};
-    constexpr auto bias_shape = typename TBias::shape_type{};
-    constexpr auto bias_strides = typename TBias::strides_type{};
-    constexpr auto output_shape = typename std::decay_t<TOut>::shape_type{};
-    constexpr auto output_strides = typename std::decay_t<TOut>::strides_type{};
-    constexpr size_t in_contigous_dim =
-        contiguous_dims(input_shape, input_strides);
-    constexpr size_t scale_contiguous_dims =
-        contiguous_dims(scale_shape, scale_strides);
-    constexpr size_t bias_contiguous_dims =
-        contiguous_dims(bias_shape, bias_strides);
-    constexpr size_t output_contiguous_dims =
-        contiguous_dims(output_shape, output_strides);
-    static_assert(in_contigous_dim != 0 || scale_contiguous_dims != 0 ||
-                      bias_contiguous_dims != 0 || output_contiguous_dims != 0,
-                  "currently not support no contiguous!");
-    static_assert(is_same_seq(input_shape, output_shape), "shape not match");
-    static_assert(is_same_seq(input_strides, output_strides),
-                  "strides not match");
-    constexpr auto domain = slice_fixed_dims<Axis>(input_shape);
-    constexpr auto strides = slice_fixed_dims<Axis>(input_strides);
-
-    constexpr size_t inner_size =
-        slice_fixed_dims<input_shape.rank() - Axis, Axis>(input_shape).length();
-    // constexpr size_t no_paded_rank =
-    //     PackedAxes::rank() == 0 ? 0
-    //                             : input_shape.rank() - PackedAxes::at(0) - 1;
-    // constexpr size_t paded_axis =
-    //     PackedAxes::rank() == 0 ? 0 : PackedAxes::at(0) + 1;
-    // // clang-format off
-    // constexpr size_t paded_inner_size = (PadedNums::rank() == 0 ||
-    // (PadedNums::rank() == 1 && PadedNums::at(0) == 0))
-    //   ? 0
-    //   : PadedNums::at(0) * slice_fixed_dims<no_paded_rank,
-    //   paded_axis>(input_shape).length();
-    // // clang-format on
-    constexpr bool UseVectorReduce =
-        PackedAxes::rank() == 1 && PackedAxes::at(0) >= Axis;
-
-    TElem finner_size = (TElem)inner_size;
-    if constexpr (UseVectorReduce) {
-        finner_size = finner_size * (TElem)TElem::shape_type::length();
-    }
-    // remove pad nums, NOTE after mul elem size
-    // finner_size = sub_op(finner_size, paded_inner_size);
-
-    apply(domain, [&](auto index) {
-        const auto input_p =
-            input.elements().data() + linear_offset(index, strides);
-        const auto scale_p = scale.elements().data();
-        const auto bias_p = bias.elements().data();
-        auto output_p =
-            output.elements().data() + linear_offset(index, strides);
-
-        // compute mean
-        TElem mean1 = (TElem)0;
-        if (use_mean) {
-            for (size_t i = 0; i < inner_size; i++)
-                mean1 = mean1 + (input_p[i] / finner_size);
-            if constexpr (UseVectorReduce) {
-                mean1 = (TElem)reduce_sum(mean1);
-            }
-        }
-
-        std::array<TElem, inner_size> sub;
-        for (auto i = 0; i < inner_size; i++)
-            sub[i] = input_p[i] - mean1;
-
-        std::array<TElem, inner_size> pow;
-        for (auto i = 0; i < inner_size; i++)
-            pow[i] = sub[i] * sub[i];
-
-        TElem mean2 = (TElem)0;
-        for (auto i = 0; i < inner_size; i++)
-            mean2 = mean2 + (pow[i] / finner_size);
-        if constexpr (UseVectorReduce) {
-            mean2 = (TElem)reduce_sum(mean2);
-        }
-
-        TElem add = mean2 + epsilon;
-        TElem sqrt = ntt::sqrt(add);
-
-        std::array<TElem, inner_size> norm;
-        for (auto i = 0; i < inner_size; i++)
-            norm[i] = sub[i] / sqrt;
-
-        for (auto i = 0; i < inner_size; i++)
-            output_p[i] = (norm[i] * scale_p[i]) + (TElem)bias_p[i];
-    });
-}
-} // namespace packed_layer_norm_detail
-
-template <size_t Axis, IsFixedTensor TIn, IsFixedTensor TScale,
-          IsFixedTensor TBias, IsFixedTensor TOut, typename TEp,
-          IsFixedDims PackedAxes, IsFixedDims PadedNums>
-void packed_layer_norm(const TIn &input, const TScale &scale, const TBias &bias,
-                       TOut &&output, const TEp &epsilon, const bool &use_mean,
-                       PackedAxes packedAxes, PadedNums padedNums) {
-    static_assert(PackedAxes::rank() < 2, "currently not support 2d packing.");
-    if constexpr (PackedAxes::rank() <= 1) {
-        static_assert(PadedNums::rank() == 0 ||
-                          (PadedNums::rank() == 1 && PadedNums::at(0) == 0),
-                      "not support padding");
-        packed_layer_norm_detail::within_axis_pack_impl<Axis>(
-            input, scale, bias, output, epsilon, use_mean, packedAxes,
-            padedNums);
-    }
-}
-} // namespace nncase::ntt
diff --git a/src/Native/include/nncase/ntt/kernels/packed_softmax.h b/src/Native/include/nncase/ntt/kernels/packed_softmax.h
deleted file mode 100644
index dbeebb40e6..0000000000
--- a/src/Native/include/nncase/ntt/kernels/packed_softmax.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/* Copyright 2019-2021 Canaan Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-#include "../apply.h"
-#include "../shape_infer/reduce_axis.h"
-#include "../tensor_ops.h"
-#include "../utility.h"
-#include "binary.h"
-#include "unary.h"
-#include <algorithm>
-
-namespace nncase::ntt {
-
-namespace softmax_detail {
-template <size_t Axis, IsFixedTensor TIn, IsFixedTensor TOut,
-          typename PackedAxes>
-void packed_on_axis_impl(const TIn &input, TOut &&output,
-                         [[maybe_unused]] PackedAxes packedAxes) {
-    using TElem = typename TIn::element_type;
-    constexpr auto input_shape = typename TIn::shape_type{};
-    constexpr auto output_shape = typename std::decay_t<TOut>::shape_type{};
-    static_assert(is_same_seq(input_shape, output_shape),
-                  "the input output shape not equal!");
-
-    constexpr auto need_reduce =
-        PackedAxes::rank() != 0 && Axis == PackedAxes::at(0);
-    constexpr auto domain =
-        shape_infer::reduced_shape_by_axis<Axis>(input_shape);
-    apply(domain, [&](auto index) {
-        // max
-        TElem max_value = input(index);
-        for (index[Axis] = 0; index[Axis] < input_shape.at(Axis);
-             index[Axis]++) {
-            max_value = max(max_value, input(index));
-        }
-
-        // reduce_max
-        if constexpr (need_reduce) {
-            max_value = (TElem)reduce_max(max_value);
-        }
-
-        // (x - reduce_max) * beta
-        for (index[Axis] = 0; index[Axis] < input_shape.at(Axis);
-             index[Axis]++) {
-            output(index) = input(index) - max_value;
-        }
-
-        // exp((x - reduce_max) * beta) and sum
-        TElem sum = (TElem)0;
-        for (index[Axis] = 0; index[Axis] < input_shape.at(Axis);
-             index[Axis]++) {
-            output(index) = exp(output(index));
-            sum += output(index);
-        }
-
-        // reduce sum
-        if constexpr (need_reduce) {
-            sum = (TElem)reduce_sum(sum);
-        }
-
-        // div
-        for (index[Axis] = 0; index[Axis] < input_shape.at(Axis);
-             index[Axis]++) {
-            output(index) = output(index) / sum;
-        }
-    });
-}
-
-template <size_t Axis, IsFixedTensor TIn, IsFixedTensor TOut,
-          typename PackedAxes>
-void packed_softmax_1d(const TIn &input, TOut &&output, PackedAxes packedAxes) {
-    packed_on_axis_impl<Axis>(input, output, packedAxes);
-}
-
-} // namespace softmax_detail
-
-/**
- * @brief packed softmax
- *  implement notice:
- *    1. need support 2d pack.
- *    2. need support paded nums.
- *    3. need different implementation when the packed axis is equal or not
- * equal axis.
- * @tparam Axis softmax reduced axis
- * @param input input tensor.
- * @param output output output.
- * @param packedAxes  packed axes
- */
-template <size_t Axis, IsFixedTensor TIn, IsFixedTensor TOut,
-          typename PackedAxes /* , typename PadedNums */>
-void packed_softmax(const TIn &input, TOut &&output,
-                    [[maybe_unused]] PackedAxes packedAxes
-                    /* , [[maybe_unused]] PadedNums padednums */) noexcept {
-    static_assert(PackedAxes::rank() < 2, "currently not support 2d pack");
-    // static_assert(PadedNums::at(0) == 0, "currently not support pad");
-    softmax_detail::packed_softmax_1d<Axis>(input, output, packedAxes);
-}
-} // namespace nncase::ntt
diff --git a/src/Native/include/nncase/ntt/kernels/reduce.h b/src/Native/include/nncase/ntt/kernels/reduce.h
index 70d8ab05f5..593d7055d4 100644
--- a/src/Native/include/nncase/ntt/kernels/reduce.h
+++ b/src/Native/include/nncase/ntt/kernels/reduce.h
@@ -14,132 +14,169 @@
  */
 #pragma once
 #include "../apply.h"
-#include "../loop.h"
 #include "../primitive_ops.h"
+#include "../profiler.h"
+#include "../shape_infer/reduce.h"
 #include "../tensor_ops.h"
-#include "../unrool.h"
+#include "../tensor_traits.h"
 #include "../utility.h"
+#include "nncase/ntt/shape.h"
+#include <limits>
+#include <type_traits>
+#include <utility>
 
 namespace nncase::ntt {
+enum class reduce_op {
+    mean, // combined with ops::add, then divided by the reduced element count
+    min,  // combined with ops::min
+    max,  // combined with ops::max
+    sum,  // combined with ops::add
+    prod, // combined with ops::mul
+};
 
-namespace reduce_detail {
-
-template <template <class, class> class Op, class TElem, IsFixedDims Axes,
-          IsFixedDims PackedAxes>
-constexpr size_t unroll_arch() {
-#if defined(__riscv)
-    return 1;
-#elif defined(__x86_64__)
-    constexpr bool is_pattern =
-        (Axes::rank() == 1) && (PackedAxes::rank() == 0);
-    constexpr bool is_op =
-        std::is_same_v<Op<TElem, TElem>, ntt::ops::mean<TElem, TElem>> ||
-        std::is_same_v<Op<TElem, TElem>, ntt::ops::add<TElem, TElem>>;
-    if (is_pattern && is_op) {
-        return 1;
-    }
-    return 1;
-#else
-    return 1;
-#endif
-}
+namespace detail {
+template <reduce_op Op> struct reduce_to_binary_type; // maps a reduce_op to its binary functor template
+
+template <> struct reduce_to_binary_type<reduce_op::mean> {
+    template <class T1, class T2> using type = ops::add<T1, T2>; // mean accumulates a sum; the division happens in reduce_impl
+};
+
+template <> struct reduce_to_binary_type<reduce_op::min> {
+    template <class T1, class T2> using type = ops::min<T1, T2>; // keep the smaller operand
+};
+
+template <> struct reduce_to_binary_type<reduce_op::max> {
+    template <class T1, class T2> using type = ops::max<T1, T2>; // keep the larger operand
+};
+
+template <> struct reduce_to_binary_type<reduce_op::sum> {
+    template <class T1, class T2> using type = ops::add<T1, T2>; // running sum
+};
+
+template <> struct reduce_to_binary_type<reduce_op::prod> {
+    template <class T1, class T2> using type = ops::mul<T1, T2>; // running product
+};
+
+template <reduce_op Op, bool Accumulate, IsTensor TIn, IsTensor TOut,
+          IsFixedDims Axes, IsFixedDims PackedAxes, class PadedNums>
+class reduce_impl {
+    using TInElem = typename TIn::element_type;
+    using TOutElem = typename TOut::element_type;
+    using TOutScalar = element_or_scalar_t<TOutElem>;
 
-template <template <class T1, class T2> class Op, IsFixedTensor TIn,
-          IsFixedTensor TOut, IsFixedDims Axes, IsFixedDims PackedAxes,
-          IsFixedDims PadedNums>
-void reduce_impl(const TIn &input, TOut &&output, Axes axes, PackedAxes,
-                 PadedNums) {
-    using TIElem = typename TIn::element_type;
-    using TOElem = typename std::decay_t<TOut>::element_type;
-    constexpr auto input_shape = typename TIn::shape_type{};
-    constexpr auto input_strides = typename TIn::strides_type{};
-    static_assert(is_same_seq(shift_fixed_dims<Axes::at(0)>(axes),
-                              make_index_sequence(axes)),
-                  "only support contiguous axis for now!");
-    constexpr auto output_shape = typename std::decay_t<TOut>::shape_type{};
-    constexpr auto output_strides = typename std::decay_t<TOut>::strides_type{};
-
-    constexpr size_t in_contigous_dim =
-        contiguous_dims(input_shape, input_strides);
-    constexpr size_t output_contiguous_dims =
-        contiguous_dims(output_shape, output_strides);
-    static_assert(in_contigous_dim == input_shape.rank() &&
-                      output_contiguous_dims == output_shape.rank(),
-                  "only support contiguous for now!");
-
-    constexpr auto domain = concat_fixed_dims(
-        slice_fixed_dims<Axes::at(0)>(input_shape),
-        slice_fixed_dims<input_shape.rank() - Axes::rank() - Axes::at(0),
-                         Axes::at(0) + Axes::rank()>(input_shape));
-    constexpr auto strides = concat_fixed_dims(
-        slice_fixed_dims<Axes::at(0)>(input_strides),
-        slice_fixed_dims<input_strides.rank() - Axes::rank() - Axes::at(0),
-                         Axes::at(0) + Axes::rank()>(input_strides));
-
-    [[maybe_unused]] constexpr auto ostrides = output_strides;
-    constexpr auto rank =
-        input_shape.rank() == output_shape.rank()
-            ? output_strides.rank() - Axes::rank() - Axes::at(0)
-            : 0;
-    constexpr auto ostrides_keep_dims = concat_fixed_dims(
-        slice_fixed_dims<Axes::at(0)>(output_strides),
-        slice_fixed_dims<rank, Axes::at(0) + Axes::rank()>(output_strides));
-
-    constexpr size_t inner_size =
-        slice_fixed_dims<Axes::rank(), axes.at(0)>(input_shape).length();
-    constexpr bool UseVectorReduce =
+    static constexpr bool use_vector_reduce =
         PackedAxes::rank() == 1 && PackedAxes::at(0) >= Axes::at(0);
 
-    constexpr size_t unroll_num = unroll_arch<Op, TIElem, Axes, PackedAxes>();
+    static constexpr TOutElem initial_value() noexcept { // identity element of Op
+        if constexpr (Op == reduce_op::mean || Op == reduce_op::sum) {
+            return (TOutElem)0;
+        } else if constexpr (Op == reduce_op::min) {
+            return (TOutElem)std::numeric_limits<TOutScalar>::max();
+        } else if constexpr (Op == reduce_op::max) {
+            return (TOutElem)std::numeric_limits<TOutScalar>::lowest();
+        } else if constexpr (Op == reduce_op::prod) {
+            return (TOutElem)1;
+        }
+    }
 
-    constexpr auto input_stride = input_strides[Axes::at(Axes::rank() - 1)];
-    apply(domain, [&](auto index) {
-        auto input_p = input.elements().data() + linear_offset(index, strides);
-        auto output_p = output.elements().data();
-        if constexpr (input_shape.rank() == output_shape.rank()) {
-            output_p += linear_offset(index, ostrides_keep_dims);
-        } else {
-            output_p += linear_offset(index, ostrides);
+  public:
+    constexpr void operator()(const TIn &input, TOut &output) {
+        auto in_p = input.elements().data();
+        auto out_p = output.elements().data();
+        // 1. Initialize every output element to Op's identity (skipped when Accumulate is true)
+        if constexpr (!Accumulate) {
+            ntt::apply(output.shape(),
+                  [&](auto index) { output(index) = initial_value(); });
         }
 
-        if constexpr (std::is_same_v<Op<TIElem, TIElem>,
-                                     ntt::ops::mean<TIElem, TIElem>>) {
-            TIElem sum;
-            sum = loop_unrool<ntt::ops::add, TIElem, unroll_num, inner_size,
-                              input_stride>(input_p);
+        // 2. Reduce: recurse over the input axes starting at axis 0, folding each element into the output
+        apply<0>(input, output, in_p, out_p);
 
-            if constexpr (UseVectorReduce) {
-                sum = sum / (inner_size * TIElem::shape_type::length());
-                output_p[0] = reduce_sum(sum);
-            } else {
-                output_p[0] = sum / inner_size;
+        // 3. Mean: divide the accumulated sums by the number of reduced elements
+        if constexpr (Op == reduce_op::mean) {
+            size_t inner_size =
+                slice_fixed_dims<Axes::rank(), Axes::at(0)>(input.shape())
+                    .length();
+            if constexpr (use_vector_reduce) {
+                inner_size *= TInElem::shape_type::length(); // also count the lanes of each packed element
             }
-        } else {
-            TIElem ret;
-            ret = loop_unrool<Op, TIElem, unroll_num, inner_size, input_stride>(
-                input_p);
 
-            if constexpr (UseVectorReduce) {
-                output_p[0] = ops::reduce<Op, TOElem, TIElem>()(ret);
+            auto denom = (TOutScalar)inner_size;
+            ntt::apply(output.shape(), [&](auto index) { output(index) /= denom; });
+        }
+    }
+
+  private:
+    template <size_t Axis, class TInP, class TOutP>
+    constexpr void apply(const TIn &input, TOut &output, TInP in_p,
+                         TOutP out_p) {
+        for (size_t i = 0; i < input.shape()[Axis]; i++) {
+            if constexpr (Axis == TIn::rank() - 1) {
+                reduce(*out_p, *in_p);
             } else {
-                output_p[0] = ret;
+                apply<Axis + 1>(input, output, in_p, out_p);
             }
+
+            in_p += input.strides()[Axis]; // step to the next input element along Axis
+            out_p +=
+                utility_detail::get_safe_stride(output, Axis, TOut::shape()); // NOTE(review): presumably 0 on reduced axes so the same output slot is revisited - confirm in get_safe_stride
         }
-    });
-}
-} // namespace reduce_detail
+    }
+
+    template <class TOutElem, class TInElem>
+    void reduce(TOutElem &output, const TInElem input) {
+        if constexpr (IsScalar<TOutElem>) { // scalar output: fold the input element into it via ntt::reduce
+            output = ntt::reduce<reduce_to_binary_type<Op>::template type>(
+                input, output);
+        } else { // otherwise combine output and input directly with the binary functor
+            output =
+                reduce_to_binary_type<Op>::template type<TOutElem, TInElem>()(
+                    output, input);
+        }
+    }
+};
+} // namespace detail
 
-template <template <class T1, class T2> class Op, IsFixedTensor TIn,
-          IsFixedTensor TOut, IsFixedDims Axes, IsFixedDims PackedAxes,
-          IsFixedDims PadedNums>
-void reduce(const TIn &input, TOut &&output, Axes axes, PackedAxes packedAxes,
-            PadedNums padedNums) noexcept {
+template <reduce_op Op, IsFixedDims Axes, IsFixedDims PackedAxes,
+          IsFixedDims PadedNums, class TIn, class TOut>
+void reduce(const TIn &input, TOut &&output) noexcept { // generic entry point: axes/packing are template arguments
     static_assert(PackedAxes::rank() < 2, "currently not support 2d packing.");
 
     static_assert(PadedNums::rank() == 0 ||
                       (PadedNums::rank() == 1 && PadedNums::at(0) == 0),
                   "not support padding");
     AUTO_NTT_PROFILER
-    reduce_detail::reduce_impl<Op>(input, output, axes, packedAxes, padedNums);
+    detail::reduce_impl<Op, false, std::decay_t<TIn>, std::decay_t<TOut>, Axes,
+                        PackedAxes, PadedNums>
+        impl; // Accumulate = false: the output is initialized before reducing
+    impl(input, output);
+}
+
+template <IsFixedDims Axes, IsFixedDims PackedAxes = fixed_shape<>,
+          IsFixedDims PadedNums = fixed_shape<>, class TIn, class TOut>
+void reduce_sum(const TIn &input, TOut &&output) noexcept { // convenience wrapper: reduce with Op = reduce_op::sum
+    return reduce<reduce_op::sum, Axes, PackedAxes, PadedNums>(
+        input, std::forward<TOut>(output));
+}
+
+template <IsFixedDims Axes, IsFixedDims PackedAxes = fixed_shape<>,
+          IsFixedDims PadedNums = fixed_shape<>, class TIn, class TOut>
+void reduce_min(const TIn &input, TOut &&output) noexcept { // convenience wrapper: reduce with Op = reduce_op::min
+    return reduce<reduce_op::min, Axes, PackedAxes, PadedNums>(
+        input, std::forward<TOut>(output));
+}
+
+template <IsFixedDims Axes, IsFixedDims PackedAxes = fixed_shape<>,
+          IsFixedDims PadedNums = fixed_shape<>, class TIn, class TOut>
+void reduce_max(const TIn &input, TOut &&output) noexcept { // convenience wrapper: reduce with Op = reduce_op::max
+    return reduce<reduce_op::max, Axes, PackedAxes, PadedNums>(
+        input, std::forward<TOut>(output));
+}
+
+template <IsFixedDims Axes, IsFixedDims PackedAxes = fixed_shape<>,
+          IsFixedDims PadedNums = fixed_shape<>, class TIn, class TOut>
+void reduce_mean(const TIn &input, TOut &&output) noexcept { // convenience wrapper: reduce with Op = reduce_op::mean
+    return reduce<reduce_op::mean, Axes, PackedAxes, PadedNums>(
+        input, std::forward<TOut>(output));
 }
 } // namespace nncase::ntt
diff --git a/src/Native/include/nncase/ntt/ntt.h b/src/Native/include/nncase/ntt/ntt.h
index ad68514c29..e1a6bcec9c 100644
--- a/src/Native/include/nncase/ntt/ntt.h
+++ b/src/Native/include/nncase/ntt/ntt.h
@@ -25,8 +25,6 @@
 #include "kernels/instance_norm.h"
 #include "kernels/matmul.h"
 #include "kernels/pack.h"
-#include "kernels/packed_layer_norm.h"
-#include "kernels/packed_softmax.h"
 #include "kernels/pad.h"
 #include "kernels/reduce.h"
 #include "kernels/reduce_arg.h"
diff --git a/src/Native/include/nncase/ntt/primitive_ops.h b/src/Native/include/nncase/ntt/primitive_ops.h
index 89f6cce0ed..199cf85f28 100644
--- a/src/Native/include/nncase/ntt/primitive_ops.h
+++ b/src/Native/include/nncase/ntt/primitive_ops.h
@@ -210,10 +210,6 @@ template <class T1, class T2> struct pow {
     }
 };
 
-template <class T1, class T2> struct mean {
-    constexpr auto operator()(const T1 &v1) const noexcept { return v1; }
-};
-
 template <class T, class B> struct swishb {
     constexpr T operator()(const T &v, const B &beta) const noexcept;
 };
@@ -222,8 +218,13 @@ template <class T, class B> struct swishb {
 
 template <template <class T1, class T2> class BinaryOp, class TResult, class T>
 struct reduce {
+    constexpr TResult operator()(const T &v,
+                                 TResult init_value) const noexcept {
+        return BinaryOp<TResult, T>()(init_value, v); // primary template: one BinaryOp application of init_value and v
+    }
+
     constexpr TResult operator()(const T &v) const noexcept {
-        return TResult(v);
+        return (TResult)v; // no init value: just convert v to the result type
     }
 };
 
@@ -261,7 +262,11 @@ template <class T1, class T2> struct clamp {
         using TResult =                                                        \
             std::conditional_t<std::is_same_v<TResultOrVoid, void>,            \
                                element_or_scalar_t<T>, TResultOrVoid>;         \
-        return ops::reduce<op, TResult, T>()(v);                               \
+        return ntt::reduce<op, TResult>(v);                                    \
+    }                                                                          \
+    template <IsScalar TResult, IsTensorOrScalar T>                            \
+    constexpr auto name(const T &v, TResult init_value) noexcept {             \
+        return ntt::reduce<op>(v, init_value);                                 \
     }
 
 NTT_DEFINE_UNARY_FUNC_IMPL(abs)
@@ -301,10 +306,6 @@ NTT_DEFINE_BINARY_FUNC_IMPL(max)
 NTT_DEFINE_BINARY_FUNC_IMPL(pow)
 NTT_DEFINE_BINARY_FUNC_IMPL(swishb)
 
-NTT_DEFINE_REDUCE_FUNC_IMPL(reduce_sum, ops::add)
-NTT_DEFINE_REDUCE_FUNC_IMPL(reduce_max, ops::max)
-NTT_DEFINE_REDUCE_FUNC_IMPL(reduce_min, ops::min)
-
 template <IsTensorOrScalar T1, IsTensorOrScalar T2, IsTensorOrScalar TResult>
 constexpr TResult mul_add(const T1 &v1, const T2 &v2,
                           const TResult &v3) noexcept {
@@ -316,6 +317,22 @@ constexpr TResult mma(const T1 &v1, const T2 &v2, const TResult &v3) noexcept {
     return ops::mma<AccC, T1, T2, TResult>()(v1, v2, v3);
 }
 
+template <template <class T1, class T2> class BinaryOp, IsScalar TResult,
+          IsTensorOrScalar T>
+constexpr TResult reduce(const T &v, TResult init_value) noexcept {
+    return ops::reduce<BinaryOp, TResult, T>()(v, init_value); // dispatch to the ops::reduce functor, seeded with init_value
+}
+
+template <template <class T1, class T2> class BinaryOp, IsScalar TResult,
+          IsTensorOrScalar T>
+constexpr TResult reduce(const T &v) noexcept {
+    return ops::reduce<BinaryOp, TResult, T>()(v); // unseeded overload; TResult must be given explicitly
+}
+
+NTT_DEFINE_REDUCE_FUNC_IMPL(reduce_sum, ops::add)
+NTT_DEFINE_REDUCE_FUNC_IMPL(reduce_max, ops::max)
+NTT_DEFINE_REDUCE_FUNC_IMPL(reduce_min, ops::min)
+
 /**
  * @defgroup Builtin operators
  * @{
diff --git a/src/Native/include/nncase/ntt/tensor_ops.h b/src/Native/include/nncase/ntt/tensor_ops.h
index a1024bf74d..400aef6d1f 100644
--- a/src/Native/include/nncase/ntt/tensor_ops.h
+++ b/src/Native/include/nncase/ntt/tensor_ops.h
@@ -49,7 +49,8 @@ struct tensor_unary_impl<Op, TTensor> {
 };
 
 template <template <class T> class Op, IsTensor TTensor>
-requires(TTensor::rank() == 2) struct tensor_unary_impl<Op, TTensor> {
+    requires(TTensor::rank() == 2)
+struct tensor_unary_impl<Op, TTensor> {
     using sub_vector_type =
         fixed_tensor_alike_t<TTensor, TTensor::shape().at(1)>;
 
@@ -93,8 +94,8 @@ struct tensor_binary_impl<Op, TTensor, T2> {
 };
 
 template <template <class T1, class T2> class Op, IsTensor T1, IsTensor T2>
-requires(T1::rank() == 2 &&
-         T2::rank() == 2) struct tensor_binary_impl<Op, T1, T2> {
+    requires(T1::rank() == 2 && T2::rank() == 2)
+struct tensor_binary_impl<Op, T1, T2> {
     using sub_vector_type = fixed_tensor_alike_t<T1, T1::shape().at(1)>;
 
     constexpr T1 operator()(const T1 &v1, const T2 &v2) const noexcept {
@@ -177,8 +178,8 @@ template <IsTensor TTensor> struct inner_product<TTensor, TTensor> {
 
     constexpr auto operator()(const TTensor &v1,
                               const TTensor &v2) const noexcept {
-        using result_type = decltype(
-            op_(std::declval<element_type>(), std::declval<element_type>()));
+        using result_type = decltype(op_(std::declval<element_type>(),
+                                         std::declval<element_type>()));
         result_type value{};
         apply(v1.shape(),
               [&](auto index) { value += op_(v1(index), v2(index)); });
@@ -251,11 +252,22 @@ struct mul_add<TScalar, TTensor, TTensor> {
     ops::mul_add<element_type, element_type, element_type> op_;
 };
 
-template <template <class T1, class T2> class Op, class TResult,
+template <template <class T1, class T2> class Op, IsScalar TResult,
           IsTensor TTensor>
 struct reduce<Op, TResult, TTensor> {
     using element_type = typename TTensor::element_type;
 
+    constexpr TResult operator()(const TTensor &v,
+                                 TResult init_value) const noexcept {
+        Op<TResult, element_type> op;
+        auto count = v.shape()[0]; // extent of dimension 0; elements are read with a single index
+        auto value = init_value; // the fold starts from the caller-supplied seed
+        for (size_t i = 0; i < count; i++) {
+            value = op(value, v(i)); // left fold: value = op(value, element)
+        }
+        return value;
+    }
+
     constexpr TResult operator()(const TTensor &v) const noexcept {
         Op<TResult, element_type> op;
         auto count = v.shape()[0];
diff --git a/src/Native/src/test.cpp b/src/Native/src/test.cpp
index 8545923eb4..8f84d85d8a 100644
--- a/src/Native/src/test.cpp
+++ b/src/Native/src/test.cpp
@@ -385,301 +385,6 @@ int main() {
         });
     }
 
-    // layer norm1 (packed axis >= layer norm axis)
-    {
-        ntt::tensor<float, ntt::fixed_shape<1, 16, 2>> buffer_1;
-        ntt::tensor<float, ntt::fixed_shape<16, 2>> buffer_4;
-        ntt::tensor<float, ntt::fixed_shape<16, 2>> buffer_7;
-        std::iota(buffer_1.elements().begin(), buffer_1.elements().end(), 0.f);
-        std::iota(buffer_4.elements().begin(), buffer_4.elements().end(), 0.f);
-        std::iota(buffer_7.elements().begin(), buffer_7.elements().end(), 0.f);
-
-        // no pack
-        ntt::tensor<float, ntt::fixed_shape<1, 16, 2>> buffer_11;
-        packed_layer_norm<1>(buffer_1, buffer_4, buffer_7, buffer_11, 1e-06,
-                             true, ntt::fixed_shape<>{}, ntt::fixed_shape<>{});
-        assert(buffer_11(0, 0, 0) == 0.0f);
-        assert(std::abs(buffer_11(0, 0, 1) - (-0.57043804f)) < 1e-4f);
-        assert(std::abs(buffer_11(0, 1, 0) - (-0.92426393f)) < 1e-4f);
-        assert(std::abs(buffer_11(0, 1, 1) - (-1.06147768f)) < 1e-4f);
-        assert(std::abs(buffer_11(0, 15, 0) - (77.11314114f)) < 1e-4f);
-        assert(std::abs(buffer_11(0, 15, 1) - (83.04106739f)) < 1e-4f);
-
-        // packed
-        ntt::tensor<ntt::vector<float, 8>, ntt::fixed_shape<1, 2, 2>> buffer_2;
-        ntt::tensor<ntt::vector<float, 8>, ntt::fixed_shape<2, 2>> buffer_5;
-        ntt::tensor<ntt::vector<float, 8>, ntt::fixed_shape<2, 2>> buffer_8;
-        ntt::tensor<ntt::vector<float, 8>, ntt::fixed_shape<1, 2, 2>> buffer_9;
-        pack<1>(buffer_1, buffer_2);
-        pack<0>(buffer_4, buffer_5);
-        pack<0>(buffer_7, buffer_8);
-        packed_layer_norm<1>(buffer_2, buffer_5, buffer_8, buffer_9,
-                             ntt::vector<float, 8>::from_scalar(1E-06), true,
-                             ntt::fixed_shape<1>{}, ntt::fixed_shape<0>{});
-
-        ntt::tensor<float, ntt::fixed_shape<1, 16, 2>> buffer_10;
-        unpack<1>(buffer_9, buffer_10);
-        assert(buffer_10(0, 0, 0) == 0.0f);
-        assert(std::abs(buffer_10(0, 0, 1) - (-0.57043804f)) < 1e-4f);
-        assert(std::abs(buffer_10(0, 1, 0) - (-0.92426393f)) < 1e-4f);
-        assert(std::abs(buffer_10(0, 1, 1) - (-1.06147768f)) < 1e-4f);
-        assert(std::abs(buffer_10(0, 15, 0) - (77.11314114f)) < 1e-4f);
-        assert(std::abs(buffer_10(0, 15, 1) - (83.04106739f)) < 1e-4f);
-    }
-
-    // layer norm2 (packed axis == layer norm axis)
-    {
-        ntt::tensor<float, ntt::fixed_shape<1, 2, 16>> input;
-        ntt::tensor<float, ntt::fixed_shape<16>> scale;
-        ntt::tensor<float, ntt::fixed_shape<16>> bias;
-        std::iota(input.elements().begin(), input.elements().end(), 0.f);
-        std::iota(scale.elements().begin(), scale.elements().end(), 0.f);
-        std::iota(bias.elements().rbegin(), bias.elements().rend(), 0.f);
-
-        ntt::tensor<ntt::vector<float, 4>, ntt::fixed_shape<1, 2, 4>>
-            input_packed;
-        ntt::tensor<ntt::vector<float, 4>, ntt::fixed_shape<4>> scale_packed;
-        ntt::tensor<ntt::vector<float, 4>, ntt::fixed_shape<4>> bias_packed;
-        ntt::pack<2>(input, input_packed);
-        ntt::pack<0>(scale, scale_packed);
-        ntt::pack<0>(bias, bias_packed);
-        ntt::tensor<ntt::vector<float, 4>, ntt::fixed_shape<1, 2, 4>>
-            output_packed;
-        packed_layer_norm<2>(input_packed, scale_packed, bias_packed,
-                             output_packed,
-                             ntt::vector<float, 4>::from_scalar(1E-06), true,
-                             ntt::fixed_shape<2>{}, ntt::fixed_shape<0>{});
-
-        ntt::tensor<float, ntt::fixed_shape<1, 2, 16>> output;
-        unpack<2>(output_packed, output);
-
-        assert(std::abs(output(0, 0, 0) - (15.f)) < 1e-6f);
-        assert(std::abs(output(0, 0, 1) - (12.58995206f)) < 1e-6f);
-        assert(std::abs(output(0, 0, 2) - (10.61376502f)) < 1e-6f);
-        assert(std::abs(output(0, 0, 3) - (9.07143889f)) < 1e-6f);
-        assert(std::abs(output(0, 0, 4) - (7.96297366f)) < 1e-6f);
-        assert(std::abs(output(0, 0, 5) - (7.28836934f)) < 1e-6f);
-        assert(std::abs(output(0, 0, 6) - (7.04762593f)) < 1e-6f);
-        assert(std::abs(output(0, 0, 7) - (7.24074342f)) < 1e-6f);
-        assert(std::abs(output(0, 0, 8) - (7.86772181f)) < 1e-6f);
-        assert(std::abs(output(0, 0, 9) - (8.92856111f)) < 1e-6f);
-        assert(std::abs(output(0, 0, 10) - (10.42326132f)) < 1e-6f);
-        assert(std::abs(output(0, 0, 11) - (12.35182243f)) < 1e-6f);
-        assert(std::abs(output(0, 0, 12) - (14.71424445f)) < 1e-4f);
-        assert(std::abs(output(0, 0, 13) - (17.51052737f)) < 1e-4f);
-        assert(std::abs(output(0, 0, 14) - (20.7406712f)) < 1e-4f);
-        assert(std::abs(output(0, 0, 15) - (24.40467593f)) < 1e-4f);
-    }
-
-    // layer_norm2 (packed axis >= layer norm axis, with padding)
-    {
-        ntt::tensor<float, ntt::fixed_shape<1, 13, 2>> buffer_1;
-        ntt::tensor<float, ntt::fixed_shape<13, 2>> buffer_4;
-        ntt::tensor<float, ntt::fixed_shape<13, 2>> buffer_7;
-        std::iota(buffer_1.elements().begin(), buffer_1.elements().end(), 0.f);
-        std::iota(buffer_4.elements().begin(), buffer_4.elements().end(), 0.f);
-        std::iota(buffer_7.elements().begin(), buffer_7.elements().end(), 0.f);
-
-        // no pack
-        ntt::tensor<float, ntt::fixed_shape<1, 13, 2>> buffer_11;
-        packed_layer_norm<1>(buffer_1, buffer_4, buffer_7, buffer_11, 1e-06,
-                             true, ntt::fixed_shape<>{}, ntt::fixed_shape<>{});
-        assert(std::abs(buffer_11(0, 1, 0) - (-7.99999975e-01)) < 1e-6f);
-        assert(std::abs(buffer_11(0, 1, 1) - (-7.99999966e-01)) < 1e-6f);
-        assert(std::abs(buffer_11(0, 2, 0) - (-5.33333293e-01)) < 1e-6f);
-        assert(std::abs(buffer_11(0, 2, 1) - (4.44444437e-08)) < 1e-6f);
-        assert(std::abs(buffer_11(0, 3, 0) - (8.00000046e-01)) < 1e-6f);
-        assert(std::abs(buffer_11(0, 3, 1) - (1.86666671e+00)) < 1e-6f);
-        assert(std::abs(buffer_11(0, 4, 0) - (3.20000004e+00)) < 1e-6f);
-        assert(std::abs(buffer_11(0, 4, 1) - (4.80000004e+00)) < 1e-6f);
-        assert(std::abs(buffer_11(0, 5, 0) - (6.66666670e+00)) < 1e-6f);
-        assert(std::abs(buffer_11(0, 5, 1) - (8.80000002e+00)) < 1e-6f);
-        assert(std::abs(buffer_11(0, 6, 0) - (1.12000000e+01)) < 1e-6f);
-        assert(std::abs(buffer_11(0, 6, 1) - (1.38666667e+01)) < 1e-4f);
-        assert(std::abs(buffer_11(0, 7, 0) - (1.68000000e+01)) < 1e-4f);
-        assert(std::abs(buffer_11(0, 7, 1) - (2.00000000e+01)) < 1e-4f);
-        assert(std::abs(buffer_11(0, 8, 0) - (2.34666666e+01)) < 1e-4f);
-        assert(std::abs(buffer_11(0, 8, 1) - (2.71999999e+01)) < 1e-4f);
-        assert(std::abs(buffer_11(0, 9, 0) - (3.11999999e+01)) < 1e-4f);
-        assert(std::abs(buffer_11(0, 9, 1) - (3.54666665e+01)) < 1e-4f);
-        assert(std::abs(buffer_11(0, 10, 0) - (3.99999998e+01)) < 1e-4f);
-        assert(std::abs(buffer_11(0, 10, 1) - (4.47999998e+01)) < 1e-4f);
-        assert(std::abs(buffer_11(0, 11, 0) - (4.98666664e+01)) < 1e-4f);
-        assert(std::abs(buffer_11(0, 11, 1) - (5.51999997e+01)) < 1e-4f);
-        assert(std::abs(buffer_11(0, 12, 0) - (6.07999997e+01)) < 1e-4f);
-        assert(std::abs(buffer_11(0, 12, 1) - (6.66666663e+01)) < 1e-4f);
-
-        // todo packed with pad
-        // ntt::tensor<float, ntt::fixed_shape<1, 16, 2>> buffer_1_pad;
-        // ntt::tensor<float, ntt::fixed_shape<16, 2>> buffer_4_pad;
-        // ntt::tensor<float, ntt::fixed_shape<16, 2>> buffer_7_pad;
-        // ntt::pad<0, 0, 0, 3, 0, 0>(buffer_1, buffer_1_pad, float{0});
-        // ntt::pad<0, 3, 0, 0>(buffer_4, buffer_4_pad, float{0});
-        // ntt::pad<0, 3, 0, 0>(buffer_7, buffer_7_pad, float{0});
-
-        // ntt::tensor<ntt::vector<float, 8>, ntt::fixed_shape<1, 2, 2>>
-        // buffer_2; ntt::tensor<ntt::vector<float, 8>, ntt::fixed_shape<2, 2>>
-        // buffer_5; ntt::tensor<ntt::vector<float, 8>, ntt::fixed_shape<2, 2>>
-        // buffer_8; ntt::tensor<ntt::vector<float, 8>, ntt::fixed_shape<1, 2,
-        // 2>> buffer_9; pack<1>(buffer_1_pad, buffer_2); pack<0>(buffer_4_pad,
-        // buffer_5); pack<0>(buffer_7_pad, buffer_8);
-        // packed_layer_norm<1>(buffer_2, buffer_5, buffer_8, buffer_9,
-        //                      ntt::vector<float, 8>{1E-06}, true,
-        //                      ntt::fixed_shape<1>{}, ntt::fixed_shape<3>{});
-        // ntt::tensor<float, ntt::fixed_shape<1, 16, 2>> buffer_10;
-        // unpack<1>(buffer_9, buffer_10);
-
-        // ntt::tensor<float, ntt::fixed_shape<1, 13, 2>> buffer_12;
-        // ntt::slice<ntt::fixed_shape<0, 0, 0>, ntt::fixed_shape<1, 13, 2>,
-        //            ntt::fixed_shape<0, 1, 2>, ntt::fixed_shape<1, 1, 1>>(
-        //     buffer_10, buffer_12);
-
-        // ntt::apply(buffer_11.shape(), [&](auto index) {
-        //     assert(buffer_11(index) == buffer_12(index));
-        // });
-    }
-
-    // layer_norm3 (packed axis < layer norm axis)
-    {
-        ntt::tensor<float, ntt::fixed_shape<1, 16, 8>> input;
-        ntt::tensor<float, ntt::fixed_shape<8>> scale;
-        ntt::tensor<float, ntt::fixed_shape<8>> bias;
-        std::iota(input.elements().begin(), input.elements().end(), 0.f);
-        std::iota(scale.elements().begin(), scale.elements().end(), 0.f);
-        std::iota(bias.elements().begin(), bias.elements().end(), 0.f);
-
-        // packed
-        ntt::tensor<ntt::vector<float, 8>, ntt::fixed_shape<1, 2, 8>>
-            packed_input;
-        ntt::tensor<ntt::vector<float, 8>, ntt::fixed_shape<1, 2, 8>>
-            packed_output;
-        pack<1>(input, packed_input);
-        packed_layer_norm<2>(packed_input, scale, bias, packed_output,
-                             ntt::vector<float, 8>::from_scalar(1E-06), true,
-                             ntt::fixed_shape<1>{}, ntt::fixed_shape<0>{});
-
-        ntt::tensor<float, ntt::fixed_shape<1, 16, 8>> unpacked_output;
-        unpack<1>(packed_output, unpacked_output);
-
-        assert(std::abs(unpacked_output(0, 0, 1) - (-0.09108935f)) < 1e-6f);
-        assert(std::abs(unpacked_output(0, 0, 2) - (0.69069278f)) < 1e-6f);
-        assert(std::abs(unpacked_output(0, 0, 3) - (2.34534639f)) < 1e-6f);
-        assert(std::abs(unpacked_output(0, 0, 4) - (4.87287148f)) < 1e-6f);
-        assert(std::abs(unpacked_output(0, 0, 5) - (8.27326804f)) < 1e-6f);
-        assert(std::abs(unpacked_output(0, 0, 6) - (12.54653608f)) < 1e-6f);
-        assert(std::abs(unpacked_output(0, 0, 7) - (17.6926756f)) < 1e-6f);
-        ntt::loop<15>([&]([[maybe_unused]] auto i) {
-            ntt::loop<7>([&]([[maybe_unused]] auto j) {
-                assert(unpacked_output(0, 0, j) ==
-                       unpacked_output(0, 1 + i, j));
-            });
-        });
-    }
-
-    // packed softmax(softmax axis == packed axis)
-    {
-        ntt::tensor<float, ntt::fixed_shape<1, 16, 2>> buffer_1;
-        ntt::tensor<float, ntt::fixed_shape<1, 16, 2>> buffer_3;
-        std::iota(buffer_1.elements().begin(), buffer_1.elements().end(), 0.f);
-        ntt::tensor<ntt::vector<float, 8>, ntt::fixed_shape<1, 2, 2>> buffer_2;
-
-        pack<1>(buffer_1, buffer_2);
-        ntt::tensor<ntt::vector<float, 8>, ntt::fixed_shape<1, 2, 2>> buffer_9;
-        packed_softmax<1>(buffer_2, buffer_9, ntt::fixed_shape<1>{});
-        ntt::tensor<float, ntt::fixed_shape<1, 16, 2>> buffer_10;
-        unpack<1>(buffer_9, buffer_10);
-
-        assert(std::abs(buffer_10(0, 13, 0) - (1.58368867e-02)) < 1e-6f);
-        assert(std::abs(buffer_10(0, 13, 1) - (1.58368867e-02)) < 1e-6f);
-        assert(std::abs(buffer_10(0, 14, 0) - (1.17019644e-01)) < 1e-6f);
-        assert(std::abs(buffer_10(0, 14, 1) - (1.17019644e-01)) < 1e-6f);
-        assert(std::abs(buffer_10(0, 15, 0) - (8.64664717e-01)) < 1e-6f);
-        assert(std::abs(buffer_10(0, 15, 1) - (8.64664717e-01)) < 1e-6f);
-
-        packed_softmax<1>(buffer_1, buffer_3, ntt::fixed_shape<>{});
-        assert(std::abs(buffer_3(0, 13, 0) - (1.58368867e-02)) < 1e-6f);
-        assert(std::abs(buffer_3(0, 13, 1) - (1.58368867e-02)) < 1e-6f);
-        assert(std::abs(buffer_3(0, 14, 0) - (1.17019644e-01)) < 1e-6f);
-        assert(std::abs(buffer_3(0, 14, 1) - (1.17019644e-01)) < 1e-6f);
-        assert(std::abs(buffer_3(0, 15, 0) - (8.64664717e-01)) < 1e-6f);
-        assert(std::abs(buffer_3(0, 15, 1) - (8.64664717e-01)) < 1e-6f);
-        ntt::apply(buffer_3.shape(), [&]([[maybe_unused]] auto index) {
-            assert(std::abs(buffer_3(index) - buffer_10(index)) < 1e-6f);
-        });
-    }
-
-    // packed softmax1(softmax axis != packed axis)
-    {
-        ntt::tensor<float, ntt::fixed_shape<1, 3, 16, 16>> buffer_1, buffer_2,
-            buffer_3;
-        std::iota(buffer_1.elements().begin(), buffer_1.elements().end(), 0.f);
-        ntt::tensor<ntt::vector<float, 8>, ntt::fixed_shape<1, 3, 2, 16>>
-            buffer_4, buffer_5;
-        pack<2>(buffer_1, buffer_4);
-        packed_softmax<1>(buffer_4, buffer_5, ntt::fixed_shape<2>{});
-        unpack<2>(buffer_5, buffer_3);
-
-        packed_softmax<1>(buffer_1, buffer_2, ntt::fixed_shape<>{});
-        ntt::apply(buffer_2.shape(), [&]([[maybe_unused]] auto index) {
-            if (std::abs(buffer_2(index) - buffer_3(index)) >= 1e-6f) {
-                std::cout << "index: ";
-                for (size_t i = 0; i < index.rank(); i++)
-                    std::cout << index[i] << " ";
-                std::cout << ": buffer_2(index)=" << buffer_2(index)
-                          << ", buffer_3(index)=" << buffer_3(index);
-                std::cout << std::endl;
-            }
-        });
-    }
-
-    // packed softmax2(softmax axis != packed axis)
-    {
-        ntt::tensor<float, ntt::fixed_shape<1, 3, 16, 16>> buffer_1, buffer_2,
-            buffer_3;
-        std::iota(buffer_1.elements().begin(), buffer_1.elements().end(), 0.f);
-        ntt::tensor<ntt::vector<float, 8>, ntt::fixed_shape<1, 3, 16, 2>>
-            buffer_4, buffer_5;
-        pack<3>(buffer_1, buffer_4);
-        packed_softmax<1>(buffer_4, buffer_5, ntt::fixed_shape<2>{});
-        unpack<3>(buffer_5, buffer_3);
-
-        packed_softmax<1>(buffer_1, buffer_2, ntt::fixed_shape<>{});
-        ntt::apply(buffer_2.shape(), [&]([[maybe_unused]] auto index) {
-            if (std::abs(buffer_2(index) - buffer_3(index)) >= 1e-6f) {
-                std::cout << "index: ";
-                for (size_t i = 0; i < index.rank(); i++)
-                    std::cout << index[i] << " ";
-                std::cout << ": buffer_2(index)=" << buffer_2(index)
-                          << ", buffer_3(index)=" << buffer_3(index);
-                std::cout << std::endl;
-            }
-        });
-    }
-
-    // packed softmax3(softmax axis != packed axis)
-    {
-        ntt::tensor<float, ntt::fixed_shape<1, 3, 16, 16>> buffer_1, buffer_2,
-            buffer_3;
-        std::iota(buffer_1.elements().begin(), buffer_1.elements().end(), 0.f);
-        ntt::tensor<ntt::vector<float, 8>, ntt::fixed_shape<1, 3, 2, 16>>
-            buffer_4, buffer_5;
-        pack<2>(buffer_1, buffer_4);
-        packed_softmax<3>(buffer_4, buffer_5, ntt::fixed_shape<2>{});
-        unpack<2>(buffer_5, buffer_3);
-
-        packed_softmax<3>(buffer_1, buffer_2, ntt::fixed_shape<>{});
-        ntt::apply(buffer_2.shape(), [&]([[maybe_unused]] auto index) {
-            if (std::abs(buffer_2(index) - buffer_3(index)) >= 1e-6f) {
-                std::cout << "index: ";
-                for (size_t i = 0; i < index.rank(); i++)
-                    std::cout << index[i] << " ";
-                std::cout << ": buffer_2(index)=" << buffer_2(index)
-                          << ", buffer_3(index)=" << buffer_3(index);
-                std::cout << std::endl;
-            }
-        });
-    }
-
     // packed matmul 1d on k
     {
         ntt::tensor<float, ntt::fixed_shape<3, 16>> ta;
@@ -1107,8 +812,7 @@ int main() {
         ntt::pack<1>(ta, tav.view());
 
         ntt::tensor<float, ntt::fixed_shape<2, 1>> tb;
-        ntt::reduce<ntt::ops::add>(tav, tb, ntt::fixed_shape<1>{},
-                                   ntt::fixed_shape<1>{}, ntt::fixed_shape<>{});
+        ntt::reduce_sum<ntt::fixed_shape<1>, ntt::fixed_shape<1>>(tav, tb);
         assert(are_floats_equal(tb(0, 0), 8.f));
         assert(are_floats_equal(tb(1, 0), 25.6f));
     }
diff --git a/src/Native/test/benchmark_test/benchmark_ntt_reduce.cpp b/src/Native/test/benchmark_test/benchmark_ntt_reduce.cpp
index d953088ec6..5a6d4b0617 100644
--- a/src/Native/test/benchmark_test/benchmark_ntt_reduce.cpp
+++ b/src/Native/test/benchmark_test/benchmark_ntt_reduce.cpp
@@ -35,15 +35,12 @@ std::string benchmark_ntt_reduce_add_reduceN_noPack() {
     ntt::tensor<float, ntt::fixed_shape<M, 1>> tb[warmup_num + run_num];
 
     for (size_t i = 0; i < warmup_num; i++) {
-        ntt::reduce<ntt::ops::add>(ta, tb[i], ntt::fixed_shape<1>{},
-                                   ntt::fixed_shape<>{}, ntt::fixed_shape<>{});
+        ntt::reduce_sum<ntt::fixed_shape<1>>(ta, tb[i]);
     }
 
     auto t1 = NttTest::get_cpu_cycle();
     for (size_t i = 0; i < run_num; i++) {
-        ntt::reduce<ntt::ops::add>(ta, tb[warmup_num + i],
-                                   ntt::fixed_shape<1>{}, ntt::fixed_shape<>{},
-                                   ntt::fixed_shape<>{});
+        ntt::reduce_sum<ntt::fixed_shape<1>>(ta, tb[warmup_num + i]);
     }
     auto t2 = NttTest::get_cpu_cycle();
 
@@ -69,15 +66,13 @@ std::string benchmark_ntt_reduce_add_reduceN_packN() {
     ntt::tensor<float, ntt::fixed_shape<M, 1>> tb[warmup_num + run_num];
 
     for (size_t i = 0; i < warmup_num; i++) {
-        ntt::reduce<ntt::ops::add>(taP, tb[i], ntt::fixed_shape<1>{},
-                                   ntt::fixed_shape<1>{}, ntt::fixed_shape<>{});
+        ntt::reduce_sum<ntt::fixed_shape<1>, ntt::fixed_shape<1>>(taP, tb[i]);
     }
 
     auto t1 = NttTest::get_cpu_cycle();
     for (size_t i = 0; i < run_num; i++) {
-        ntt::reduce<ntt::ops::add>(taP, tb[warmup_num + i],
-                                   ntt::fixed_shape<1>{}, ntt::fixed_shape<1>{},
-                                   ntt::fixed_shape<>{});
+        ntt::reduce_sum<ntt::fixed_shape<1>, ntt::fixed_shape<1>>(
+            taP, tb[warmup_num + i]);
     }
     auto t2 = NttTest::get_cpu_cycle();
     asm volatile("" ::"g"(tb));
@@ -102,15 +97,12 @@ std::string benchmark_ntt_reduce_add_reduceM_noPack() {
     ntt::tensor<float, ntt::fixed_shape<1, N>> tb[warmup_num + run_num];
 
     for (size_t i = 0; i < warmup_num; i++) {
-        ntt::reduce<ntt::ops::add>(ta, tb[i], ntt::fixed_shape<0>{},
-                                   ntt::fixed_shape<>{}, ntt::fixed_shape<>{});
+        ntt::reduce_sum<ntt::fixed_shape<0>>(ta, tb[i]);
     }
 
     auto t1 = NttTest::get_cpu_cycle();
     for (size_t i = 0; i < run_num; i++) {
-        ntt::reduce<ntt::ops::add>(ta, tb[warmup_num + i],
-                                   ntt::fixed_shape<0>{}, ntt::fixed_shape<>{},
-                                   ntt::fixed_shape<>{});
+        ntt::reduce_sum<ntt::fixed_shape<0>>(ta, tb[warmup_num + i]);
     }
     auto t2 = NttTest::get_cpu_cycle();
     asm volatile("" ::"g"(tb));
@@ -137,15 +129,13 @@ std::string benchmark_ntt_reduce_add_reduceM_packM() {
     ntt::tensor<float, ntt::fixed_shape<1, N>> tb[warmup_num + run_num];
 
     for (size_t i = 0; i < warmup_num; i++) {
-        ntt::reduce<ntt::ops::add>(taP, tb[i], ntt::fixed_shape<0>{},
-                                   ntt::fixed_shape<0>{}, ntt::fixed_shape<>{});
+        ntt::reduce_sum<ntt::fixed_shape<0>, ntt::fixed_shape<0>>(taP, tb[i]);
     }
 
     auto t1 = NttTest::get_cpu_cycle();
     for (size_t i = 0; i < run_num; i++) {
-        ntt::reduce<ntt::ops::add>(taP, tb[warmup_num + i],
-                                   ntt::fixed_shape<0>{}, ntt::fixed_shape<0>{},
-                                   ntt::fixed_shape<>{});
+        ntt::reduce_sum<ntt::fixed_shape<0>, ntt::fixed_shape<0>>(
+            taP, tb[warmup_num + i]);
     }
     auto t2 = NttTest::get_cpu_cycle();
     asm volatile("" ::"g"(tb));
@@ -170,15 +160,12 @@ std::string benchmark_ntt_reduce_add_reduceMN_noPack() {
     ntt::tensor<float, ntt::fixed_shape<1, 1>> tb[warmup_num + run_num];
 
     for (size_t i = 0; i < warmup_num; i++) {
-        ntt::reduce<ntt::ops::add>(ta, tb[i], ntt::fixed_shape<0, 1>{},
-                                   ntt::fixed_shape<>{}, ntt::fixed_shape<>{});
+        ntt::reduce_sum<ntt::fixed_shape<0, 1>>(ta, tb[i]);
     }
 
     auto t1 = NttTest::get_cpu_cycle();
     for (size_t i = 0; i < run_num; i++) {
-        ntt::reduce<ntt::ops::add>(ta, tb[warmup_num + i],
-                                   ntt::fixed_shape<0, 1>{},
-                                   ntt::fixed_shape<>{}, ntt::fixed_shape<>{});
+        ntt::reduce_sum<ntt::fixed_shape<0, 1>>(ta, tb[warmup_num + i]);
     }
     auto t2 = NttTest::get_cpu_cycle();
     asm volatile("" ::"g"(tb));
@@ -205,15 +192,14 @@ std::string benchmark_ntt_reduce_add_reduceMN_packN() {
     ntt::tensor<float, ntt::fixed_shape<1, 1>> tb[warmup_num + run_num];
 
     for (size_t i = 0; i < warmup_num; i++) {
-        ntt::reduce<ntt::ops::add>(taP, tb[i], ntt::fixed_shape<0, 1>{},
-                                   ntt::fixed_shape<1>{}, ntt::fixed_shape<>{});
+        ntt::reduce_sum<ntt::fixed_shape<0, 1>, ntt::fixed_shape<1>>(taP,
+                                                                     tb[i]);
     }
 
     auto t1 = NttTest::get_cpu_cycle();
     for (size_t i = 0; i < run_num; i++) {
-        ntt::reduce<ntt::ops::add>(taP, tb[warmup_num + i],
-                                   ntt::fixed_shape<0, 1>{},
-                                   ntt::fixed_shape<1>{}, ntt::fixed_shape<>{});
+        ntt::reduce_sum<ntt::fixed_shape<0, 1>, ntt::fixed_shape<1>>(
+            taP, tb[warmup_num + i]);
     }
     auto t2 = NttTest::get_cpu_cycle();
     asm volatile("" ::"g"(tb));
@@ -240,15 +226,14 @@ std::string benchmark_ntt_reduce_add_reduceMN_packM() {
     ntt::tensor<float, ntt::fixed_shape<1, 1>> tb[warmup_num + run_num];
 
     for (size_t i = 0; i < warmup_num; i++) {
-        ntt::reduce<ntt::ops::add>(taP, tb[i], ntt::fixed_shape<0, 1>{},
-                                   ntt::fixed_shape<0>{}, ntt::fixed_shape<>{});
+        ntt::reduce_sum<ntt::fixed_shape<0, 1>, ntt::fixed_shape<0>>(taP,
+                                                                     tb[i]);
     }
 
     auto t1 = NttTest::get_cpu_cycle();
     for (size_t i = 0; i < run_num; i++) {
-        ntt::reduce<ntt::ops::add>(taP, tb[warmup_num + i],
-                                   ntt::fixed_shape<0, 1>{},
-                                   ntt::fixed_shape<0>{}, ntt::fixed_shape<>{});
+        ntt::reduce_sum<ntt::fixed_shape<0, 1>, ntt::fixed_shape<0>>(
+            taP, tb[warmup_num + i]);
     }
     auto t2 = NttTest::get_cpu_cycle();
     asm volatile("" ::"g"(tb));
@@ -273,15 +258,12 @@ std::string benchmark_ntt_reduce_max_reduceN_noPack() {
     ntt::tensor<float, ntt::fixed_shape<M, 1>> tb[warmup_num + run_num];
 
     for (size_t i = 0; i < warmup_num; i++) {
-        ntt::reduce<ntt::ops::max>(ta, tb[i], ntt::fixed_shape<1>{},
-                                   ntt::fixed_shape<>{}, ntt::fixed_shape<>{});
+        ntt::reduce_max<ntt::fixed_shape<1>>(ta, tb[i]);
     }
 
     auto t1 = NttTest::get_cpu_cycle();
     for (size_t i = 0; i < run_num; i++) {
-        ntt::reduce<ntt::ops::max>(ta, tb[warmup_num + i],
-                                   ntt::fixed_shape<1>{}, ntt::fixed_shape<>{},
-                                   ntt::fixed_shape<>{});
+        ntt::reduce_max<ntt::fixed_shape<1>>(ta, tb[warmup_num + i]);
     }
     auto t2 = NttTest::get_cpu_cycle();
     asm volatile("" ::"g"(tb));
@@ -308,15 +290,13 @@ std::string benchmark_ntt_reduce_max_reduceN_packN() {
     ntt::tensor<float, ntt::fixed_shape<M, 1>> tb[warmup_num + run_num];
 
     for (size_t i = 0; i < warmup_num; i++) {
-        ntt::reduce<ntt::ops::max>(taP, tb[i], ntt::fixed_shape<1>{},
-                                   ntt::fixed_shape<1>{}, ntt::fixed_shape<>{});
+        ntt::reduce_max<ntt::fixed_shape<1>, ntt::fixed_shape<1>>(taP, tb[i]);
     }
 
     auto t1 = NttTest::get_cpu_cycle();
     for (size_t i = 0; i < run_num; i++) {
-        ntt::reduce<ntt::ops::max>(taP, tb[warmup_num + i],
-                                   ntt::fixed_shape<1>{}, ntt::fixed_shape<1>{},
-                                   ntt::fixed_shape<>{});
+        ntt::reduce_max<ntt::fixed_shape<1>, ntt::fixed_shape<1>>(
+            taP, tb[warmup_num + i]);
     }
     auto t2 = NttTest::get_cpu_cycle();
     asm volatile("" ::"g"(tb));
@@ -341,15 +321,12 @@ std::string benchmark_ntt_reduce_max_reduceM_noPack() {
     ntt::tensor<float, ntt::fixed_shape<1, N>> tb[warmup_num + run_num];
 
     for (size_t i = 0; i < warmup_num; i++) {
-        ntt::reduce<ntt::ops::max>(ta, tb[i], ntt::fixed_shape<0>{},
-                                   ntt::fixed_shape<>{}, ntt::fixed_shape<>{});
+        ntt::reduce_max<ntt::fixed_shape<0>>(ta, tb[i]);
     }
 
     auto t1 = NttTest::get_cpu_cycle();
     for (size_t i = 0; i < run_num; i++) {
-        ntt::reduce<ntt::ops::max>(ta, tb[warmup_num + i],
-                                   ntt::fixed_shape<0>{}, ntt::fixed_shape<>{},
-                                   ntt::fixed_shape<>{});
+        ntt::reduce_max<ntt::fixed_shape<0>>(ta, tb[warmup_num + i]);
     }
     auto t2 = NttTest::get_cpu_cycle();
     asm volatile("" ::"g"(tb));
@@ -376,15 +353,13 @@ std::string benchmark_ntt_reduce_max_reduceM_packM() {
     ntt::tensor<float, ntt::fixed_shape<1, N>> tb[warmup_num + run_num];
 
     for (size_t i = 0; i < warmup_num; i++) {
-        ntt::reduce<ntt::ops::max>(taP, tb[i], ntt::fixed_shape<0>{},
-                                   ntt::fixed_shape<0>{}, ntt::fixed_shape<>{});
+        ntt::reduce_max<ntt::fixed_shape<0>, ntt::fixed_shape<0>>(taP, tb[i]);
     }
 
     auto t1 = NttTest::get_cpu_cycle();
     for (size_t i = 0; i < run_num; i++) {
-        ntt::reduce<ntt::ops::max>(taP, tb[warmup_num + i],
-                                   ntt::fixed_shape<0>{}, ntt::fixed_shape<0>{},
-                                   ntt::fixed_shape<>{});
+        ntt::reduce_max<ntt::fixed_shape<0>, ntt::fixed_shape<0>>(
+            taP, tb[warmup_num + i]);
     }
     auto t2 = NttTest::get_cpu_cycle();
     asm volatile("" ::"g"(tb));
@@ -409,15 +384,12 @@ std::string benchmark_ntt_reduce_max_reduceMN_noPack() {
     ntt::tensor<float, ntt::fixed_shape<1, 1>> tb[warmup_num + run_num];
 
     for (size_t i = 0; i < warmup_num; i++) {
-        ntt::reduce<ntt::ops::max>(ta, tb[i], ntt::fixed_shape<0, 1>{},
-                                   ntt::fixed_shape<>{}, ntt::fixed_shape<>{});
+        ntt::reduce_max<ntt::fixed_shape<0, 1>>(ta, tb[i]);
     }
 
     auto t1 = NttTest::get_cpu_cycle();
     for (size_t i = 0; i < run_num; i++) {
-        ntt::reduce<ntt::ops::max>(ta, tb[warmup_num + i],
-                                   ntt::fixed_shape<0, 1>{},
-                                   ntt::fixed_shape<>{}, ntt::fixed_shape<>{});
+        ntt::reduce_max<ntt::fixed_shape<0, 1>>(ta, tb[warmup_num + i]);
     }
     auto t2 = NttTest::get_cpu_cycle();
     asm volatile("" ::"g"(tb));
@@ -444,15 +416,14 @@ std::string benchmark_ntt_reduce_max_reduceMN_packN() {
     ntt::tensor<float, ntt::fixed_shape<1, 1>> tb[warmup_num + run_num];
 
     for (size_t i = 0; i < warmup_num; i++) {
-        ntt::reduce<ntt::ops::max>(taP, tb[i], ntt::fixed_shape<0, 1>{},
-                                   ntt::fixed_shape<1>{}, ntt::fixed_shape<>{});
+        ntt::reduce_max<ntt::fixed_shape<0, 1>, ntt::fixed_shape<1>>(taP,
+                                                                     tb[i]);
     }
 
     auto t1 = NttTest::get_cpu_cycle();
     for (size_t i = 0; i < run_num; i++) {
-        ntt::reduce<ntt::ops::max>(taP, tb[warmup_num + i],
-                                   ntt::fixed_shape<0, 1>{},
-                                   ntt::fixed_shape<1>{}, ntt::fixed_shape<>{});
+        ntt::reduce_max<ntt::fixed_shape<0, 1>, ntt::fixed_shape<1>>(
+            taP, tb[warmup_num + i]);
     }
     auto t2 = NttTest::get_cpu_cycle();
     asm volatile("" ::"g"(tb));
@@ -479,15 +450,14 @@ std::string benchmark_ntt_reduce_max_reduceMN_packM() {
     ntt::tensor<float, ntt::fixed_shape<1, 1>> tb[warmup_num + run_num];
 
     for (size_t i = 0; i < warmup_num; i++) {
-        ntt::reduce<ntt::ops::max>(taP, tb[i], ntt::fixed_shape<0, 1>{},
-                                   ntt::fixed_shape<0>{}, ntt::fixed_shape<>{});
+        ntt::reduce_max<ntt::fixed_shape<0, 1>, ntt::fixed_shape<0>>(taP,
+                                                                     tb[i]);
     }
 
     auto t1 = NttTest::get_cpu_cycle();
     for (size_t i = 0; i < run_num; i++) {
-        ntt::reduce<ntt::ops::max>(taP, tb[warmup_num + i],
-                                   ntt::fixed_shape<0, 1>{},
-                                   ntt::fixed_shape<0>{}, ntt::fixed_shape<>{});
+        ntt::reduce_max<ntt::fixed_shape<0, 1>, ntt::fixed_shape<0>>(
+            taP, tb[warmup_num + i]);
     }
     auto t2 = NttTest::get_cpu_cycle();
     asm volatile("" ::"g"(tb));
@@ -512,15 +482,12 @@ std::string benchmark_ntt_reduce_min_reduceN_noPack() {
     ntt::tensor<float, ntt::fixed_shape<M, 1>> tb[warmup_num + run_num];
 
     for (size_t i = 0; i < warmup_num; i++) {
-        ntt::reduce<ntt::ops::min>(ta, tb[i], ntt::fixed_shape<1>{},
-                                   ntt::fixed_shape<>{}, ntt::fixed_shape<>{});
+        ntt::reduce_min<ntt::fixed_shape<1>>(ta, tb[i]);
     }
 
     auto t1 = NttTest::get_cpu_cycle();
     for (size_t i = 0; i < run_num; i++) {
-        ntt::reduce<ntt::ops::min>(ta, tb[warmup_num + i],
-                                   ntt::fixed_shape<1>{}, ntt::fixed_shape<>{},
-                                   ntt::fixed_shape<>{});
+        ntt::reduce_min<ntt::fixed_shape<1>>(ta, tb[warmup_num + i]);
     }
     auto t2 = NttTest::get_cpu_cycle();
     asm volatile("" ::"g"(tb));
@@ -547,15 +514,13 @@ std::string benchmark_ntt_reduce_min_reduceN_packN() {
     ntt::tensor<float, ntt::fixed_shape<M, 1>> tb[warmup_num + run_num];
 
     for (size_t i = 0; i < warmup_num; i++) {
-        ntt::reduce<ntt::ops::min>(taP, tb[i], ntt::fixed_shape<1>{},
-                                   ntt::fixed_shape<1>{}, ntt::fixed_shape<>{});
+        ntt::reduce_min<ntt::fixed_shape<1>, ntt::fixed_shape<1>>(taP, tb[i]);
     }
 
     auto t1 = NttTest::get_cpu_cycle();
     for (size_t i = 0; i < run_num; i++) {
-        ntt::reduce<ntt::ops::min>(taP, tb[warmup_num + i],
-                                   ntt::fixed_shape<1>{}, ntt::fixed_shape<1>{},
-                                   ntt::fixed_shape<>{});
+        ntt::reduce_min<ntt::fixed_shape<1>, ntt::fixed_shape<1>>(
+            taP, tb[warmup_num + i]);
     }
     auto t2 = NttTest::get_cpu_cycle();
     asm volatile("" ::"g"(tb));
@@ -580,15 +545,12 @@ std::string benchmark_ntt_reduce_min_reduceM_noPack() {
     ntt::tensor<float, ntt::fixed_shape<1, N>> tb[warmup_num + run_num];
 
     for (size_t i = 0; i < warmup_num; i++) {
-        ntt::reduce<ntt::ops::min>(ta, tb[i], ntt::fixed_shape<0>{},
-                                   ntt::fixed_shape<>{}, ntt::fixed_shape<>{});
+        ntt::reduce_min<ntt::fixed_shape<0>>(ta, tb[i]);
     }
 
     auto t1 = NttTest::get_cpu_cycle();
     for (size_t i = 0; i < run_num; i++) {
-        ntt::reduce<ntt::ops::min>(ta, tb[warmup_num + i],
-                                   ntt::fixed_shape<0>{}, ntt::fixed_shape<>{},
-                                   ntt::fixed_shape<>{});
+        ntt::reduce_min<ntt::fixed_shape<0>>(ta, tb[warmup_num + i]);
     }
     auto t2 = NttTest::get_cpu_cycle();
     asm volatile("" ::"g"(tb));
@@ -615,15 +577,13 @@ std::string benchmark_ntt_reduce_min_reduceM_packM() {
     ntt::tensor<float, ntt::fixed_shape<1, N>> tb[warmup_num + run_num];
 
     for (size_t i = 0; i < warmup_num; i++) {
-        ntt::reduce<ntt::ops::min>(taP, tb[i], ntt::fixed_shape<0>{},
-                                   ntt::fixed_shape<0>{}, ntt::fixed_shape<>{});
+        ntt::reduce_min<ntt::fixed_shape<0>, ntt::fixed_shape<0>>(taP, tb[i]);
     }
 
     auto t1 = NttTest::get_cpu_cycle();
     for (size_t i = 0; i < run_num; i++) {
-        ntt::reduce<ntt::ops::min>(taP, tb[warmup_num + i],
-                                   ntt::fixed_shape<0>{}, ntt::fixed_shape<0>{},
-                                   ntt::fixed_shape<>{});
+        ntt::reduce_min<ntt::fixed_shape<0>, ntt::fixed_shape<0>>(
+            taP, tb[warmup_num + i]);
     }
     auto t2 = NttTest::get_cpu_cycle();
     asm volatile("" ::"g"(tb));
@@ -649,15 +609,12 @@ std::string benchmark_ntt_reduce_min_reduceMN_noPack() {
     ntt::tensor<float, ntt::fixed_shape<1, 1>> tb[warmup_num + run_num];
 
     for (size_t i = 0; i < warmup_num; i++) {
-        ntt::reduce<ntt::ops::min>(ta, tb[i], ntt::fixed_shape<0, 1>{},
-                                   ntt::fixed_shape<>{}, ntt::fixed_shape<>{});
+        ntt::reduce_min<ntt::fixed_shape<0, 1>>(ta, tb[i]);
     }
 
     auto t1 = NttTest::get_cpu_cycle();
     for (size_t i = 0; i < run_num; i++) {
-        ntt::reduce<ntt::ops::min>(ta, tb[warmup_num + i],
-                                   ntt::fixed_shape<0, 1>{},
-                                   ntt::fixed_shape<>{}, ntt::fixed_shape<>{});
+        ntt::reduce_min<ntt::fixed_shape<0, 1>>(ta, tb[warmup_num + i]);
     }
     auto t2 = NttTest::get_cpu_cycle();
     asm volatile("" ::"g"(tb));
@@ -684,15 +641,14 @@ std::string benchmark_ntt_reduce_min_reduceMN_packN() {
     ntt::tensor<float, ntt::fixed_shape<1, 1>> tb[warmup_num + run_num];
 
     for (size_t i = 0; i < warmup_num; i++) {
-        ntt::reduce<ntt::ops::min>(taP, tb[i], ntt::fixed_shape<0, 1>{},
-                                   ntt::fixed_shape<1>{}, ntt::fixed_shape<>{});
+        ntt::reduce_min<ntt::fixed_shape<0, 1>, ntt::fixed_shape<1>>(taP,
+                                                                     tb[i]);
     }
 
     auto t1 = NttTest::get_cpu_cycle();
     for (size_t i = 0; i < run_num; i++) {
-        ntt::reduce<ntt::ops::min>(taP, tb[warmup_num + i],
-                                   ntt::fixed_shape<0, 1>{},
-                                   ntt::fixed_shape<1>{}, ntt::fixed_shape<>{});
+        ntt::reduce_min<ntt::fixed_shape<0, 1>, ntt::fixed_shape<1>>(
+            taP, tb[warmup_num + i]);
     }
     auto t2 = NttTest::get_cpu_cycle();
     asm volatile("" ::"g"(tb));
@@ -719,15 +675,14 @@ std::string benchmark_ntt_reduce_min_reduceMN_packM() {
     ntt::tensor<float, ntt::fixed_shape<1, 1>> tb[warmup_num + run_num];
 
     for (size_t i = 0; i < warmup_num; i++) {
-        ntt::reduce<ntt::ops::min>(taP, tb[i], ntt::fixed_shape<0, 1>{},
-                                   ntt::fixed_shape<0>{}, ntt::fixed_shape<>{});
+        ntt::reduce_min<ntt::fixed_shape<0, 1>, ntt::fixed_shape<0>>(taP,
+                                                                     tb[i]);
     }
 
     auto t1 = NttTest::get_cpu_cycle();
     for (size_t i = 0; i < run_num; i++) {
-        ntt::reduce<ntt::ops::min>(taP, tb[warmup_num + i],
-                                   ntt::fixed_shape<0, 1>{},
-                                   ntt::fixed_shape<0>{}, ntt::fixed_shape<>{});
+        ntt::reduce_min<ntt::fixed_shape<0, 1>, ntt::fixed_shape<0>>(
+            taP, tb[warmup_num + i]);
     }
     auto t2 = NttTest::get_cpu_cycle();
     asm volatile("" ::"g"(tb));
@@ -752,15 +707,12 @@ std::string benchmark_ntt_reduce_mean_reduceN_noPack() {
     ntt::tensor<float, ntt::fixed_shape<M, 1>> tb[warmup_num + run_num];
 
     for (size_t i = 0; i < warmup_num; i++) {
-        ntt::reduce<ntt::ops::mean>(ta, tb[i], ntt::fixed_shape<1>{},
-                                    ntt::fixed_shape<>{}, ntt::fixed_shape<>{});
+        ntt::reduce_mean<ntt::fixed_shape<1>>(ta, tb[i]);
     }
 
     auto t1 = NttTest::get_cpu_cycle();
     for (size_t i = 0; i < run_num; i++) {
-        ntt::reduce<ntt::ops::mean>(ta, tb[warmup_num + i],
-                                    ntt::fixed_shape<1>{}, ntt::fixed_shape<>{},
-                                    ntt::fixed_shape<>{});
+        ntt::reduce_mean<ntt::fixed_shape<1>>(ta, tb[warmup_num + i]);
     }
     auto t2 = NttTest::get_cpu_cycle();
     asm volatile("" ::"g"(tb));
@@ -787,16 +739,13 @@ std::string benchmark_ntt_reduce_mean_reduceN_packN() {
     ntt::tensor<float, ntt::fixed_shape<M, 1>> tb[warmup_num + run_num];
 
     for (size_t i = 0; i < warmup_num; i++) {
-        ntt::reduce<ntt::ops::mean>(taP, tb[i], ntt::fixed_shape<1>{},
-                                    ntt::fixed_shape<1>{},
-                                    ntt::fixed_shape<>{});
+        ntt::reduce_mean<ntt::fixed_shape<1>, ntt::fixed_shape<1>>(taP, tb[i]);
     }
 
     auto t1 = NttTest::get_cpu_cycle();
     for (size_t i = 0; i < run_num; i++) {
-        ntt::reduce<ntt::ops::mean>(
-            taP, tb[warmup_num + i], ntt::fixed_shape<1>{},
-            ntt::fixed_shape<1>{}, ntt::fixed_shape<>{});
+        ntt::reduce_mean<ntt::fixed_shape<1>, ntt::fixed_shape<1>>(
+            taP, tb[warmup_num + i]);
     }
     auto t2 = NttTest::get_cpu_cycle();
     asm volatile("" ::"g"(tb));
@@ -821,15 +770,12 @@ std::string benchmark_ntt_reduce_mean_reduceM_noPack() {
     ntt::tensor<float, ntt::fixed_shape<1, N>> tb[warmup_num + run_num];
 
     for (size_t i = 0; i < warmup_num; i++) {
-        ntt::reduce<ntt::ops::mean>(ta, tb[i], ntt::fixed_shape<0>{},
-                                    ntt::fixed_shape<>{}, ntt::fixed_shape<>{});
+        ntt::reduce_mean<ntt::fixed_shape<0>>(ta, tb[i]);
     }
 
     auto t1 = NttTest::get_cpu_cycle();
     for (size_t i = 0; i < run_num; i++) {
-        ntt::reduce<ntt::ops::mean>(ta, tb[warmup_num + i],
-                                    ntt::fixed_shape<0>{}, ntt::fixed_shape<>{},
-                                    ntt::fixed_shape<>{});
+        ntt::reduce_mean<ntt::fixed_shape<0>>(ta, tb[warmup_num + i]);
     }
     auto t2 = NttTest::get_cpu_cycle();
     asm volatile("" ::"g"(tb));
@@ -856,16 +802,13 @@ std::string benchmark_ntt_reduce_mean_reduceM_packM() {
     ntt::tensor<float, ntt::fixed_shape<1, N>> tb[warmup_num + run_num];
 
     for (size_t i = 0; i < warmup_num; i++) {
-        ntt::reduce<ntt::ops::mean>(taP, tb[i], ntt::fixed_shape<0>{},
-                                    ntt::fixed_shape<0>{},
-                                    ntt::fixed_shape<>{});
+        ntt::reduce_mean<ntt::fixed_shape<0>, ntt::fixed_shape<0>>(taP, tb[i]);
     }
 
     auto t1 = NttTest::get_cpu_cycle();
     for (size_t i = 0; i < run_num; i++) {
-        ntt::reduce<ntt::ops::mean>(
-            taP, tb[warmup_num + i], ntt::fixed_shape<0>{},
-            ntt::fixed_shape<0>{}, ntt::fixed_shape<>{});
+        ntt::reduce_mean<ntt::fixed_shape<0>, ntt::fixed_shape<0>>(
+            taP, tb[warmup_num + i]);
     }
     auto t2 = NttTest::get_cpu_cycle();
     asm volatile("" ::"g"(tb));
@@ -890,15 +833,12 @@ std::string benchmark_ntt_reduce_mean_reduceMN_noPack() {
     ntt::tensor<float, ntt::fixed_shape<1, 1>> tb[warmup_num + run_num];
 
     for (size_t i = 0; i < warmup_num; i++) {
-        ntt::reduce<ntt::ops::mean>(ta, tb[i], ntt::fixed_shape<0, 1>{},
-                                    ntt::fixed_shape<>{}, ntt::fixed_shape<>{});
+        ntt::reduce_mean<ntt::fixed_shape<0, 1>>(ta, tb[i]);
     }
 
     auto t1 = NttTest::get_cpu_cycle();
     for (size_t i = 0; i < run_num; i++) {
-        ntt::reduce<ntt::ops::mean>(ta, tb[warmup_num + i],
-                                    ntt::fixed_shape<0, 1>{},
-                                    ntt::fixed_shape<>{}, ntt::fixed_shape<>{});
+        ntt::reduce_mean<ntt::fixed_shape<0, 1>>(ta, tb[warmup_num + i]);
     }
     auto t2 = NttTest::get_cpu_cycle();
     asm volatile("" ::"g"(tb));
@@ -925,16 +865,14 @@ std::string benchmark_ntt_reduce_mean_reduceMN_packN() {
     ntt::tensor<float, ntt::fixed_shape<1, 1>> tb[warmup_num + run_num];
 
     for (size_t i = 0; i < warmup_num; i++) {
-        ntt::reduce<ntt::ops::mean>(taP, tb[i], ntt::fixed_shape<0, 1>{},
-                                    ntt::fixed_shape<1>{},
-                                    ntt::fixed_shape<>{});
+        ntt::reduce_mean<ntt::fixed_shape<0, 1>, ntt::fixed_shape<1>>(taP,
+                                                                      tb[i]);
     }
 
     auto t1 = NttTest::get_cpu_cycle();
     for (size_t i = 0; i < run_num; i++) {
-        ntt::reduce<ntt::ops::mean>(
-            taP, tb[warmup_num + i], ntt::fixed_shape<0, 1>{},
-            ntt::fixed_shape<1>{}, ntt::fixed_shape<>{});
+        ntt::reduce_mean<ntt::fixed_shape<0, 1>, ntt::fixed_shape<1>>(
+            taP, tb[warmup_num + i]);
     }
     auto t2 = NttTest::get_cpu_cycle();
     asm volatile("" ::"g"(tb));
@@ -961,16 +899,14 @@ std::string benchmark_ntt_reduce_mean_reduceMN_packM() {
     ntt::tensor<float, ntt::fixed_shape<1, 1>> tb[warmup_num + run_num];
 
     for (size_t i = 0; i < warmup_num; i++) {
-        ntt::reduce<ntt::ops::mean>(taP, tb[i], ntt::fixed_shape<0, 1>{},
-                                    ntt::fixed_shape<0>{},
-                                    ntt::fixed_shape<>{});
+        ntt::reduce_mean<ntt::fixed_shape<0, 1>, ntt::fixed_shape<0>>(taP,
+                                                                      tb[i]);
     }
 
     auto t1 = NttTest::get_cpu_cycle();
     for (size_t i = 0; i < run_num; i++) {
-        ntt::reduce<ntt::ops::mean>(
-            taP, tb[warmup_num + i], ntt::fixed_shape<0, 1>{},
-            ntt::fixed_shape<0>{}, ntt::fixed_shape<>{});
+        ntt::reduce_mean<ntt::fixed_shape<0, 1>, ntt::fixed_shape<0>>(
+            taP, tb[warmup_num + i]);
     }
     auto t2 = NttTest::get_cpu_cycle();
     asm volatile("" ::"g"(tb));
diff --git a/src/Native/test/ctest/test_ntt_reduce.cpp b/src/Native/test/ctest/test_ntt_reduce.cpp
index 10a4689bc5..5acf66e3a2 100644
--- a/src/Native/test/ctest/test_ntt_reduce.cpp
+++ b/src/Native/test/ctest/test_ntt_reduce.cpp
@@ -12,6 +12,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#include "nncase/ntt/kernels/reduce.h"
 #include "ntt_test.h"
 #include "ortki_helper.h"
 #include <gtest/gtest.h>
@@ -34,8 +35,7 @@ TEST(ReduceSumTestFloat, ReduceM_NoPack) {
     // ntt
     using tensor_type2 = ntt::tensor<float, ntt::fixed_shape<1, N>>;
     std::unique_ptr<tensor_type2> ntt_output1(new tensor_type2);
-    ntt::reduce<ntt::ops::add>(*ntt_input, *ntt_output1, ntt::fixed_shape<0>{},
-                               ntt::fixed_shape<>{}, ntt::fixed_shape<>{});
+    ntt::reduce_sum<ntt::fixed_shape<0>>(*ntt_input, *ntt_output1);
 
     // ort
     auto ort_input = NttTest::ntt2ort(*ntt_input);
@@ -63,8 +63,7 @@ TEST(ReduceMaxTestFloat, ReduceM_NoPack) {
     // ntt
     using tensor_type2 = ntt::tensor<float, ntt::fixed_shape<1, N>>;
     std::unique_ptr<tensor_type2> ntt_output1(new tensor_type2);
-    ntt::reduce<ntt::ops::max>(*ntt_input, *ntt_output1, ntt::fixed_shape<0>{},
-                               ntt::fixed_shape<>{}, ntt::fixed_shape<>{});
+    ntt::reduce_max<ntt::fixed_shape<0>>(*ntt_input, *ntt_output1);
 
     // ort
     auto ort_input = NttTest::ntt2ort(*ntt_input);
@@ -92,8 +91,7 @@ TEST(ReduceMinTestFloat, ReduceM_NoPack) {
     // ntt
     using tensor_type2 = ntt::tensor<float, ntt::fixed_shape<1, N>>;
     std::unique_ptr<tensor_type2> ntt_output1(new tensor_type2);
-    ntt::reduce<ntt::ops::min>(*ntt_input, *ntt_output1, ntt::fixed_shape<0>{},
-                               ntt::fixed_shape<>{}, ntt::fixed_shape<>{});
+    ntt::reduce_min<ntt::fixed_shape<0>>(*ntt_input, *ntt_output1);
 
     // ort
     auto ort_input = NttTest::ntt2ort(*ntt_input);
@@ -121,8 +119,7 @@ TEST(ReduceMeanTestFloat, ReduceM_NoPack) {
     // ntt
     using tensor_type2 = ntt::tensor<float, ntt::fixed_shape<1, N>>;
     std::unique_ptr<tensor_type2> ntt_output1(new tensor_type2);
-    ntt::reduce<ntt::ops::mean>(*ntt_input, *ntt_output1, ntt::fixed_shape<0>{},
-                                ntt::fixed_shape<>{}, ntt::fixed_shape<>{});
+    ntt::reduce_mean<ntt::fixed_shape<0>>(*ntt_input, *ntt_output1);
 
     // ort
     auto ort_input = NttTest::ntt2ort(*ntt_input);
@@ -155,9 +152,8 @@ TEST(ReduceSumTestFloat, ReduceM_PackM) {
     // ntt
     using tensor_type2 = ntt::tensor<float, ntt::fixed_shape<1, N>>;
     std::unique_ptr<tensor_type2> ntt_output1(new tensor_type2);
-    ntt::reduce<ntt::ops::add>(ntt_input_pack, *ntt_output1,
-                               ntt::fixed_shape<0>{}, ntt::fixed_shape<0>{},
-                               ntt::fixed_shape<>{});
+    ntt::reduce_sum<ntt::fixed_shape<0>, ntt::fixed_shape<0>>(ntt_input_pack,
+                                                              *ntt_output1);
 
     // ort
     auto ort_input = NttTest::ntt2ort(*ntt_input);
@@ -190,9 +186,8 @@ TEST(ReduceMaxTestFloat, ReduceM_PackM) {
     // ntt
     using tensor_type2 = ntt::tensor<float, ntt::fixed_shape<1, N>>;
     std::unique_ptr<tensor_type2> ntt_output1(new tensor_type2);
-    ntt::reduce<ntt::ops::max>(ntt_input_pack, *ntt_output1,
-                               ntt::fixed_shape<0>{}, ntt::fixed_shape<0>{},
-                               ntt::fixed_shape<>{});
+    ntt::reduce_max<ntt::fixed_shape<0>, ntt::fixed_shape<0>>(ntt_input_pack,
+                                                              *ntt_output1);
 
     // ort
     auto ort_input = NttTest::ntt2ort(*ntt_input);
@@ -225,9 +220,8 @@ TEST(ReduceMinTestFloat, ReduceM_PackM) {
     // ntt
     using tensor_type2 = ntt::tensor<float, ntt::fixed_shape<1, N>>;
     std::unique_ptr<tensor_type2> ntt_output1(new tensor_type2);
-    ntt::reduce<ntt::ops::min>(ntt_input_pack, *ntt_output1,
-                               ntt::fixed_shape<0>{}, ntt::fixed_shape<0>{},
-                               ntt::fixed_shape<>{});
+    ntt::reduce_min<ntt::fixed_shape<0>, ntt::fixed_shape<0>>(ntt_input_pack,
+                                                              *ntt_output1);
 
     // ort
     auto ort_input = NttTest::ntt2ort(*ntt_input);
@@ -260,9 +254,8 @@ TEST(ReduceMeanTestFloat, ReduceM_PackM) {
     // ntt
     using tensor_type2 = ntt::tensor<float, ntt::fixed_shape<1, N>>;
     std::unique_ptr<tensor_type2> ntt_output1(new tensor_type2);
-    ntt::reduce<ntt::ops::mean>(ntt_input_pack, *ntt_output1,
-                                ntt::fixed_shape<0>{}, ntt::fixed_shape<0>{},
-                                ntt::fixed_shape<>{});
+    ntt::reduce_mean<ntt::fixed_shape<0>, ntt::fixed_shape<0>>(ntt_input_pack,
+                                                               *ntt_output1);
 
     // ort
     auto ort_input = NttTest::ntt2ort(*ntt_input);
@@ -290,8 +283,7 @@ TEST(ReduceSumTestFloat, ReduceN_NoPack) {
     // ntt
     using tensor_type2 = ntt::tensor<float, ntt::fixed_shape<M, 1>>;
     std::unique_ptr<tensor_type2> ntt_output1(new tensor_type2);
-    ntt::reduce<ntt::ops::add>(*ntt_input, *ntt_output1, ntt::fixed_shape<1>{},
-                               ntt::fixed_shape<>{}, ntt::fixed_shape<>{});
+    ntt::reduce_sum<ntt::fixed_shape<1>>(*ntt_input, *ntt_output1);
 
     // ort
     auto ort_input = NttTest::ntt2ort(*ntt_input);
@@ -319,8 +311,7 @@ TEST(ReduceMaxTestFloat, ReduceN_NoPack) {
     // ntt
     using tensor_type2 = ntt::tensor<float, ntt::fixed_shape<M, 1>>;
     std::unique_ptr<tensor_type2> ntt_output1(new tensor_type2);
-    ntt::reduce<ntt::ops::max>(*ntt_input, *ntt_output1, ntt::fixed_shape<1>{},
-                               ntt::fixed_shape<>{}, ntt::fixed_shape<>{});
+    ntt::reduce_max<ntt::fixed_shape<1>>(*ntt_input, *ntt_output1);
 
     // ort
     auto ort_input = NttTest::ntt2ort(*ntt_input);
@@ -348,8 +339,7 @@ TEST(ReduceMinTestFloat, ReduceN_NoPack) {
     // ntt
     using tensor_type2 = ntt::tensor<float, ntt::fixed_shape<M, 1>>;
     std::unique_ptr<tensor_type2> ntt_output1(new tensor_type2);
-    ntt::reduce<ntt::ops::min>(*ntt_input, *ntt_output1, ntt::fixed_shape<1>{},
-                               ntt::fixed_shape<>{}, ntt::fixed_shape<>{});
+    ntt::reduce_min<ntt::fixed_shape<1>>(*ntt_input, *ntt_output1);
 
     // ort
     auto ort_input = NttTest::ntt2ort(*ntt_input);
@@ -377,8 +367,7 @@ TEST(ReduceMeanTestFloat, ReduceN_NoPack) {
     // ntt
     using tensor_type2 = ntt::tensor<float, ntt::fixed_shape<M, 1>>;
     std::unique_ptr<tensor_type2> ntt_output1(new tensor_type2);
-    ntt::reduce<ntt::ops::mean>(*ntt_input, *ntt_output1, ntt::fixed_shape<1>{},
-                                ntt::fixed_shape<>{}, ntt::fixed_shape<>{});
+    ntt::reduce_mean<ntt::fixed_shape<1>>(*ntt_input, *ntt_output1);
 
     // ort
     auto ort_input = NttTest::ntt2ort(*ntt_input);
@@ -411,9 +400,7 @@ TEST(ReduceSumTestFloat, ReduceN_PackN) {
     // ntt
     using tensor_type2 = ntt::tensor<float, ntt::fixed_shape<M, 1>>;
     std::unique_ptr<tensor_type2> ntt_output1(new tensor_type2);
-    ntt::reduce<ntt::ops::add>(ntt_input_pack, *ntt_output1,
-                               ntt::fixed_shape<1>{}, ntt::fixed_shape<1>{},
-                               ntt::fixed_shape<>{});
+    ntt::reduce_sum<ntt::fixed_shape<1>>(ntt_input_pack, *ntt_output1);
 
     // ort
     auto ort_input = NttTest::ntt2ort(*ntt_input);
@@ -446,9 +433,8 @@ TEST(ReduceMaxTestFloat, ReduceN_PackN) {
     // ntt
     using tensor_type2 = ntt::tensor<float, ntt::fixed_shape<M, 1>>;
     std::unique_ptr<tensor_type2> ntt_output1(new tensor_type2);
-    ntt::reduce<ntt::ops::max>(ntt_input_pack, *ntt_output1,
-                               ntt::fixed_shape<1>{}, ntt::fixed_shape<1>{},
-                               ntt::fixed_shape<>{});
+    ntt::reduce_max<ntt::fixed_shape<1>, ntt::fixed_shape<1>>(ntt_input_pack,
+                                                              *ntt_output1);
 
     // ort
     auto ort_input = NttTest::ntt2ort(*ntt_input);
@@ -481,9 +467,8 @@ TEST(ReduceMinTestFloat, ReduceN_PackN) {
     // ntt
     using tensor_type2 = ntt::tensor<float, ntt::fixed_shape<M, 1>>;
     std::unique_ptr<tensor_type2> ntt_output1(new tensor_type2);
-    ntt::reduce<ntt::ops::min>(ntt_input_pack, *ntt_output1,
-                               ntt::fixed_shape<1>{}, ntt::fixed_shape<1>{},
-                               ntt::fixed_shape<>{});
+    ntt::reduce_min<ntt::fixed_shape<1>, ntt::fixed_shape<1>>(ntt_input_pack,
+                                                              *ntt_output1);
 
     // ort
     auto ort_input = NttTest::ntt2ort(*ntt_input);
@@ -516,9 +501,8 @@ TEST(ReduceMeanTestFloat, ReduceN_PackN) {
     // ntt
     using tensor_type2 = ntt::tensor<float, ntt::fixed_shape<M, 1>>;
     std::unique_ptr<tensor_type2> ntt_output1(new tensor_type2);
-    ntt::reduce<ntt::ops::mean>(ntt_input_pack, *ntt_output1,
-                                ntt::fixed_shape<1>{}, ntt::fixed_shape<1>{},
-                                ntt::fixed_shape<>{});
+    ntt::reduce_mean<ntt::fixed_shape<1>, ntt::fixed_shape<1>>(ntt_input_pack,
+                                                               *ntt_output1);
 
     // ort
     auto ort_input = NttTest::ntt2ort(*ntt_input);
@@ -546,9 +530,7 @@ TEST(ReduceSumTestFloat, ReduceMN_NoPack) {
     // ntt
     using tensor_type2 = ntt::tensor<float, ntt::fixed_shape<1, 1>>;
     std::unique_ptr<tensor_type2> ntt_output1(new tensor_type2);
-    ntt::reduce<ntt::ops::add>(*ntt_input, *ntt_output1,
-                               ntt::fixed_shape<0, 1>{}, ntt::fixed_shape<>{},
-                               ntt::fixed_shape<>{});
+    ntt::reduce_sum<ntt::fixed_shape<0, 1>>(*ntt_input, *ntt_output1);
 
     // ort
     auto ort_input = NttTest::ntt2ort(*ntt_input);
@@ -576,9 +558,7 @@ TEST(ReduceMaxTestFloat, ReduceMN_NoPack) {
     // ntt
     using tensor_type2 = ntt::tensor<float, ntt::fixed_shape<1, 1>>;
     std::unique_ptr<tensor_type2> ntt_output1(new tensor_type2);
-    ntt::reduce<ntt::ops::max>(*ntt_input, *ntt_output1,
-                               ntt::fixed_shape<0, 1>{}, ntt::fixed_shape<>{},
-                               ntt::fixed_shape<>{});
+    ntt::reduce_max<ntt::fixed_shape<0, 1>>(*ntt_input, *ntt_output1);
 
     // ort
     auto ort_input = NttTest::ntt2ort(*ntt_input);
@@ -606,9 +586,7 @@ TEST(ReduceMinTestFloat, ReduceMN_NoPack) {
     // ntt
     using tensor_type2 = ntt::tensor<float, ntt::fixed_shape<1, 1>>;
     std::unique_ptr<tensor_type2> ntt_output1(new tensor_type2);
-    ntt::reduce<ntt::ops::min>(*ntt_input, *ntt_output1,
-                               ntt::fixed_shape<0, 1>{}, ntt::fixed_shape<>{},
-                               ntt::fixed_shape<>{});
+    ntt::reduce_min<ntt::fixed_shape<0, 1>>(*ntt_input, *ntt_output1);
 
     // ort
     auto ort_input = NttTest::ntt2ort(*ntt_input);
@@ -636,9 +614,7 @@ TEST(ReduceMeanTestFloat, ReduceMN_NoPack) {
     // ntt
     using tensor_type2 = ntt::tensor<float, ntt::fixed_shape<1, 1>>;
     std::unique_ptr<tensor_type2> ntt_output1(new tensor_type2);
-    ntt::reduce<ntt::ops::mean>(*ntt_input, *ntt_output1,
-                                ntt::fixed_shape<0, 1>{}, ntt::fixed_shape<>{},
-                                ntt::fixed_shape<>{});
+    ntt::reduce_mean<ntt::fixed_shape<0, 1>>(*ntt_input, *ntt_output1);
 
     // ort
     auto ort_input = NttTest::ntt2ort(*ntt_input);
@@ -671,9 +647,8 @@ TEST(ReduceSumTestFloat, ReduceMN_PackM) {
     // ntt
     using tensor_type2 = ntt::tensor<float, ntt::fixed_shape<1, 1>>;
     std::unique_ptr<tensor_type2> ntt_output1(new tensor_type2);
-    ntt::reduce<ntt::ops::add>(ntt_input_pack, *ntt_output1,
-                               ntt::fixed_shape<0, 1>{}, ntt::fixed_shape<0>{},
-                               ntt::fixed_shape<>{});
+    ntt::reduce_sum<ntt::fixed_shape<0, 1>, ntt::fixed_shape<0>>(ntt_input_pack,
+                                                                 *ntt_output1);
 
     // ort
     auto ort_input = NttTest::ntt2ort(*ntt_input);
@@ -706,9 +681,8 @@ TEST(ReduceMaxTestFloat, ReduceMN_PackM) {
     // ntt
     using tensor_type2 = ntt::tensor<float, ntt::fixed_shape<1, 1>>;
     std::unique_ptr<tensor_type2> ntt_output1(new tensor_type2);
-    ntt::reduce<ntt::ops::max>(ntt_input_pack, *ntt_output1,
-                               ntt::fixed_shape<0, 1>{}, ntt::fixed_shape<0>{},
-                               ntt::fixed_shape<>{});
+    ntt::reduce_max<ntt::fixed_shape<0, 1>, ntt::fixed_shape<0>>(ntt_input_pack,
+                                                                 *ntt_output1);
 
     // ort
     auto ort_input = NttTest::ntt2ort(*ntt_input);
@@ -741,9 +715,8 @@ TEST(ReduceMinTestFloat, ReduceMN_PackM) {
     // ntt
     using tensor_type2 = ntt::tensor<float, ntt::fixed_shape<1, 1>>;
     std::unique_ptr<tensor_type2> ntt_output1(new tensor_type2);
-    ntt::reduce<ntt::ops::min>(ntt_input_pack, *ntt_output1,
-                               ntt::fixed_shape<0, 1>{}, ntt::fixed_shape<0>{},
-                               ntt::fixed_shape<>{});
+    ntt::reduce_min<ntt::fixed_shape<0, 1>, ntt::fixed_shape<0>>(ntt_input_pack,
+                                                                 *ntt_output1);
 
     // ort
     auto ort_input = NttTest::ntt2ort(*ntt_input);
@@ -776,9 +749,8 @@ TEST(ReduceMeanTestFloat, ReduceMN_PackM) {
     // ntt
     using tensor_type2 = ntt::tensor<float, ntt::fixed_shape<1, 1>>;
     std::unique_ptr<tensor_type2> ntt_output1(new tensor_type2);
-    ntt::reduce<ntt::ops::mean>(ntt_input_pack, *ntt_output1,
-                                ntt::fixed_shape<0, 1>{}, ntt::fixed_shape<0>{},
-                                ntt::fixed_shape<>{});
+    ntt::reduce_mean<ntt::fixed_shape<0, 1>, ntt::fixed_shape<0>>(
+        ntt_input_pack, *ntt_output1);
 
     // ort
     auto ort_input = NttTest::ntt2ort(*ntt_input);
@@ -811,9 +783,8 @@ TEST(ReduceSumTestFloat, ReduceMN_PackN) {
     // ntt
     using tensor_type2 = ntt::tensor<float, ntt::fixed_shape<1, 1>>;
     std::unique_ptr<tensor_type2> ntt_output1(new tensor_type2);
-    ntt::reduce<ntt::ops::add>(ntt_input_pack, *ntt_output1,
-                               ntt::fixed_shape<0, 1>{}, ntt::fixed_shape<1>{},
-                               ntt::fixed_shape<>{});
+    ntt::reduce_sum<ntt::fixed_shape<0, 1>, ntt::fixed_shape<1>>(ntt_input_pack,
+                                                                 *ntt_output1);
 
     // ort
     auto ort_input = NttTest::ntt2ort(*ntt_input);
@@ -846,9 +817,8 @@ TEST(ReduceMaxTestFloat, ReduceMN_PackN) {
     // ntt
     using tensor_type2 = ntt::tensor<float, ntt::fixed_shape<1, 1>>;
     std::unique_ptr<tensor_type2> ntt_output1(new tensor_type2);
-    ntt::reduce<ntt::ops::max>(ntt_input_pack, *ntt_output1,
-                               ntt::fixed_shape<0, 1>{}, ntt::fixed_shape<1>{},
-                               ntt::fixed_shape<>{});
+    ntt::reduce_max<ntt::fixed_shape<0, 1>, ntt::fixed_shape<1>>(ntt_input_pack,
+                                                                 *ntt_output1);
 
     // ort
     auto ort_input = NttTest::ntt2ort(*ntt_input);
@@ -881,9 +851,8 @@ TEST(ReduceMinTestFloat, ReduceMN_PackN) {
     // ntt
     using tensor_type2 = ntt::tensor<float, ntt::fixed_shape<1, 1>>;
     std::unique_ptr<tensor_type2> ntt_output1(new tensor_type2);
-    ntt::reduce<ntt::ops::min>(ntt_input_pack, *ntt_output1,
-                               ntt::fixed_shape<0, 1>{}, ntt::fixed_shape<1>{},
-                               ntt::fixed_shape<>{});
+    ntt::reduce_min<ntt::fixed_shape<0, 1>, ntt::fixed_shape<1>>(ntt_input_pack,
+                                                                 *ntt_output1);
 
     // ort
     auto ort_input = NttTest::ntt2ort(*ntt_input);
@@ -916,9 +885,8 @@ TEST(ReduceMeanTestFloat, ReduceMN_PackN) {
     // ntt
     using tensor_type2 = ntt::tensor<float, ntt::fixed_shape<1, 1>>;
     std::unique_ptr<tensor_type2> ntt_output1(new tensor_type2);
-    ntt::reduce<ntt::ops::mean>(ntt_input_pack, *ntt_output1,
-                                ntt::fixed_shape<0, 1>{}, ntt::fixed_shape<0>{},
-                                ntt::fixed_shape<>{});
+    ntt::reduce_mean<ntt::fixed_shape<0, 1>, ntt::fixed_shape<1>>(
+        ntt_input_pack, *ntt_output1);
 
     // ort
     auto ort_input = NttTest::ntt2ort(*ntt_input);

From 6ab08a14f8bdc79b299995f556f593b028a8a171 Mon Sep 17 00:00:00 2001
From: sunnycase <sunnycase@live.cn>
Date: Thu, 12 Sep 2024 07:33:04 +0000
Subject: [PATCH 02/10] [ntt.x86] Add u_reduce

---
 .../include/nncase/ntt/arch/x86_64/ukernels.h |  38 +++++-
 src/Native/include/nncase/ntt/kernels/pack.h  |   2 +-
 .../include/nncase/ntt/kernels/reduce.h       | 108 ++++++------------
 src/Native/include/nncase/ntt/primitive_ops.h |   8 ++
 src/Native/include/nncase/ntt/shape.h         |  24 +++-
 src/Native/include/nncase/ntt/ukernels.h      |  51 ++++++++-
 src/Native/test/benchmark_test/CMakeLists.txt |   1 +
 7 files changed, 149 insertions(+), 83 deletions(-)

diff --git a/src/Native/include/nncase/ntt/arch/x86_64/ukernels.h b/src/Native/include/nncase/ntt/arch/x86_64/ukernels.h
index fe06e06fd7..a8a74e74df 100644
--- a/src/Native/include/nncase/ntt/arch/x86_64/ukernels.h
+++ b/src/Native/include/nncase/ntt/arch/x86_64/ukernels.h
@@ -15,10 +15,12 @@
 #pragma once
 #include "../../ukernels.h"
 #include "arch_types.h"
+#include "nncase/ntt/vector.h"
+#include <vector>
 
 namespace nncase::ntt::ukernels {
 template <size_t M, size_t N, size_t MStrides>
-class upack<M, N, MStrides, true, float, vector<float, 8>> {
+class u_pack<M, N, MStrides, true, float, vector<float, 8>> {
   public:
     constexpr void operator()(const float *input,
                               vector<float, 8> *output) noexcept {
@@ -37,4 +39,38 @@ class upack<M, N, MStrides, true, float, vector<float, 8>> {
         }
     }
 };
+
+template <reduce_op Op> struct u_reduce<Op, vector<float, 8>, true> {
+  public:
+    constexpr vector<float, 8>
+    operator()(const vector<float, 8> *input, size_t input_stride, size_t count,
+               vector<float, 8> init_value) noexcept {
+        using binary_op_t =
+            typename reduce_to_binary_type<Op>::template type<vector<float, 8>,
+                                                              vector<float, 8>>;
+        binary_op_t op;
+        if (count / 8) {
+            vector<float, 8> tmp[4];
+            while (count / 8) {
+                for (size_t j = 0; j < 4; j++) {
+                    tmp[j] = op(input[(j * 2) * input_stride],
+                                input[(j * 2 + 1) * input_stride]);
+                }
+                input += input_stride * 8;
+                count -= 8;
+
+                tmp[0] = op(tmp[0], tmp[1]);
+                tmp[2] = op(tmp[2], tmp[3]);
+                tmp[0] = op(tmp[0], tmp[2]);
+                init_value = op(init_value, tmp[0]);
+            }
+        }
+
+        for (size_t i = 0; i < count; i++) {
+            init_value = op(init_value, *input);
+            input += input_stride;
+        }
+        return init_value;
+    }
+};
 } // namespace nncase::ntt::ukernels
diff --git a/src/Native/include/nncase/ntt/kernels/pack.h b/src/Native/include/nncase/ntt/kernels/pack.h
index 982b25e845..3237f183e9 100644
--- a/src/Native/include/nncase/ntt/kernels/pack.h
+++ b/src/Native/include/nncase/ntt/kernels/pack.h
@@ -118,7 +118,7 @@ class pack_impl<TIn, TOut, PackAxis> {
                 slice_fixed_dims<rest_rank, TOut::rank() - rest_rank>(
                     TOut::shape());
             constexpr auto N = rest_dims.length();
-            ntt::upack<M, N, MStrides>(in_p, out_p);
+            ntt::u_pack<M, N, MStrides>(in_p, out_p);
         } else {
             for (size_t i = 0; i < TOut::shape()[Axis]; i++) {
                 apply_transpose<Axis + 1, ContiguousDims, M, MStrides>(
diff --git a/src/Native/include/nncase/ntt/kernels/reduce.h b/src/Native/include/nncase/ntt/kernels/reduce.h
index 593d7055d4..f41d476af4 100644
--- a/src/Native/include/nncase/ntt/kernels/reduce.h
+++ b/src/Native/include/nncase/ntt/kernels/reduce.h
@@ -19,6 +19,7 @@
 #include "../shape_infer/reduce.h"
 #include "../tensor_ops.h"
 #include "../tensor_traits.h"
+#include "../ukernels.h"
 #include "../utility.h"
 #include "nncase/ntt/shape.h"
 #include <limits>
@@ -26,36 +27,7 @@
 #include <utility>
 
 namespace nncase::ntt {
-enum class reduce_op {
-    mean,
-    min,
-    max,
-    sum,
-    prod,
-};
-
 namespace detail {
-template <reduce_op Op> struct reduce_to_binary_type;
-
-template <> struct reduce_to_binary_type<reduce_op::mean> {
-    template <class T1, class T2> using type = ops::add<T1, T2>;
-};
-
-template <> struct reduce_to_binary_type<reduce_op::min> {
-    template <class T1, class T2> using type = ops::min<T1, T2>;
-};
-
-template <> struct reduce_to_binary_type<reduce_op::max> {
-    template <class T1, class T2> using type = ops::max<T1, T2>;
-};
-
-template <> struct reduce_to_binary_type<reduce_op::sum> {
-    template <class T1, class T2> using type = ops::add<T1, T2>;
-};
-
-template <> struct reduce_to_binary_type<reduce_op::prod> {
-    template <class T1, class T2> using type = ops::mul<T1, T2>;
-};
 
 template <reduce_op Op, bool Accumulate, IsTensor TIn, IsTensor TOut,
           IsFixedDims Axes, IsFixedDims PackedAxes, class PadedNums>
@@ -81,57 +53,47 @@ class reduce_impl {
 
   public:
     constexpr void operator()(const TIn &input, TOut &output) {
-        auto in_p = input.elements().data();
-        auto out_p = output.elements().data();
-        // 1. Initialize
-        if constexpr (!Accumulate) {
-            ntt::apply(output.shape(),
-                  [&](auto index) { output(index) = initial_value(); });
-        }
-
-        // 2. Reduce
-        apply<0>(input, output, in_p, out_p);
-
-        // 3. Mean
-        if constexpr (Op == reduce_op::mean) {
-            size_t inner_size =
-                slice_fixed_dims<Axes::rank(), Axes::at(0)>(input.shape())
-                    .length();
-            if constexpr (use_vector_reduce) {
-                inner_size *= TInElem::shape_type::length();
+        ntt::apply(output.shape(), [&](auto index) {
+            auto reduced_in = (TInElem)initial_value();
+            apply_reduce<0>(input, index, reduced_in);
+            if constexpr (IsScalar<TOutElem>) {
+                output(index) = ntt::reduce<
+                    ukernels::reduce_to_binary_type<Op>::template type,
+                    TOutElem>(reduced_in);
+            } else {
+                output(index) = reduced_in;
             }
 
-            auto denom = (TOutScalar)inner_size;
-            ntt::apply(output.shape(), [&](auto index) { output(index) /= denom; });
-        }
+            // Mean
+            if constexpr (Op == reduce_op::mean) {
+                size_t inner_size =
+                    slice_fixed_dims<Axes::rank(), Axes::at(0)>(input.shape())
+                        .length();
+                if constexpr (use_vector_reduce) {
+                    inner_size *= TInElem::shape_type::length();
+                }
+
+                auto denom = (TOutScalar)inner_size;
+                output(index) /= denom;
+            }
+        });
     }
 
   private:
-    template <size_t Axis, class TInP, class TOutP>
-    constexpr void apply(const TIn &input, TOut &output, TInP in_p,
-                         TOutP out_p) {
-        for (size_t i = 0; i < input.shape()[Axis]; i++) {
-            if constexpr (Axis == TIn::rank() - 1) {
-                reduce(*out_p, *in_p);
-            } else {
-                apply<Axis + 1>(input, output, in_p, out_p);
+    template <size_t ReduceIndex>
+    constexpr void apply_reduce(const TIn &input,
+                                ranked_shape<TIn::rank()> index,
+                                TInElem &reduced_in) {
+        constexpr size_t Axis = Axes::at(ReduceIndex);
+        if constexpr (ReduceIndex < Axes::rank() - 1) {
+            for (size_t i = 0; i < input.shape()[Axis]; i++) {
+                index[Axis] = i;
+                apply_reduce<ReduceIndex + 1>(input, index, reduced_in);
             }
-
-            in_p += input.strides()[Axis];
-            out_p +=
-                utility_detail::get_safe_stride(output, Axis, TOut::shape());
-        }
-    }
-
-    template <class TOutElem, class TInElem>
-    void reduce(TOutElem &output, const TInElem input) {
-        if constexpr (IsScalar<TOutElem>) {
-            output = ntt::reduce<reduce_to_binary_type<Op>::template type>(
-                input, output);
         } else {
-            output =
-                reduce_to_binary_type<Op>::template type<TOutElem, TInElem>()(
-                    output, input);
+            const TInElem *in_p = &input(index);
+            reduced_in = ntt::u_reduce<Op>(in_p, input.strides()[Axis],
+                                           input.shape()[Axis], reduced_in);
         }
     }
 };
diff --git a/src/Native/include/nncase/ntt/primitive_ops.h b/src/Native/include/nncase/ntt/primitive_ops.h
index 199cf85f28..6d10e05b89 100644
--- a/src/Native/include/nncase/ntt/primitive_ops.h
+++ b/src/Native/include/nncase/ntt/primitive_ops.h
@@ -21,6 +21,14 @@
 #include <type_traits>
 
 namespace nncase::ntt {
+enum class reduce_op {
+    mean,
+    min,
+    max,
+    sum,
+    prod,
+};
+
 namespace ops {
 
 /**
diff --git a/src/Native/include/nncase/ntt/shape.h b/src/Native/include/nncase/ntt/shape.h
index 4e4bafb3d1..be4feab037 100644
--- a/src/Native/include/nncase/ntt/shape.h
+++ b/src/Native/include/nncase/ntt/shape.h
@@ -33,6 +33,12 @@ template <size_t... Dims> struct fixed_dims_base {
         return std::array<size_t, sizeof...(Dims)>{Dims...}[index];
     }
 
+    static constexpr size_t last() noexcept { return at(rank() - 1); }
+
+    static constexpr bool contains(size_t value) noexcept {
+        return (false || ... || (Dims == value));
+    }
+
     constexpr size_t operator[](size_t index) const noexcept {
         return at(index);
     }
@@ -52,6 +58,12 @@ template <size_t Rank> struct ranked_dims_base {
     constexpr auto begin() const noexcept { return dims_.begin(); }
     constexpr auto end() const noexcept { return dims_.end(); }
 
+    constexpr size_t last() const noexcept { return at(rank() - 1); }
+
+    constexpr bool contains(size_t value) const noexcept {
+        return std::find(begin(), end(), value) != end();
+    }
+
     std::array<size_t, Rank> dims_;
 };
 } // namespace detail
@@ -62,7 +74,9 @@ struct fixed_shape : detail::fixed_dims_base<Dims...> {
         using type = fixed_shape<I, Dims...>;
     };
 
-    template <size_t I> struct append { using type = fixed_shape<Dims..., I>; };
+    template <size_t I> struct append {
+        using type = fixed_shape<Dims..., I>;
+    };
 
     static constexpr size_t length() noexcept { return (Dims * ... * 1); }
 };
@@ -258,10 +272,10 @@ constexpr size_t contiguous_dims(const Shape &shape, const Strides &strides) {
 }
 
 template <class Shape, class Strides>
-inline constexpr size_t max_size_v = (is_fixed_dims_v<Shape> &&
-                                      is_fixed_dims_v<Strides>)
-                                         ? linear_size(Shape{}, Strides{})
-                                         : std::dynamic_extent;
+inline constexpr size_t max_size_v =
+    (is_fixed_dims_v<Shape> && is_fixed_dims_v<Strides>)
+        ? linear_size(Shape{}, Strides{})
+        : std::dynamic_extent;
 
 template <class Index, class Shape>
 constexpr bool in_bound(const Index &index, const Shape &shape) {
diff --git a/src/Native/include/nncase/ntt/ukernels.h b/src/Native/include/nncase/ntt/ukernels.h
index c6707916dd..d497576387 100644
--- a/src/Native/include/nncase/ntt/ukernels.h
+++ b/src/Native/include/nncase/ntt/ukernels.h
@@ -19,7 +19,7 @@
 
 namespace nncase::ntt::ukernels {
 template <size_t M, size_t N, size_t MStrides, bool Arch, class TIn, class TOut>
-class upack {
+class u_pack {
   public:
     constexpr void operator()(const TIn *input, TOut *output) noexcept {
         for (size_t j = 0; j < N; j++) {
@@ -37,13 +37,58 @@ class upack {
         }
     }
 };
+
+template <reduce_op Op> struct reduce_to_binary_type;
+
+template <> struct reduce_to_binary_type<reduce_op::mean> {
+    template <class T1, class T2> using type = ops::add<T1, T2>;
+};
+
+template <> struct reduce_to_binary_type<reduce_op::min> {
+    template <class T1, class T2> using type = ops::min<T1, T2>;
+};
+
+template <> struct reduce_to_binary_type<reduce_op::max> {
+    template <class T1, class T2> using type = ops::max<T1, T2>;
+};
+
+template <> struct reduce_to_binary_type<reduce_op::sum> {
+    template <class T1, class T2> using type = ops::add<T1, T2>;
+};
+
+template <> struct reduce_to_binary_type<reduce_op::prod> {
+    template <class T1, class T2> using type = ops::mul<T1, T2>;
+};
+
+template <reduce_op Op, class T, bool Arch> struct u_reduce {
+  public:
+    constexpr T operator()(const T *input, size_t input_stride, size_t count,
+                           T init_value) noexcept {
+        using binary_op_t =
+            typename reduce_to_binary_type<Op>::template type<T, T>;
+
+        for (size_t i = 0; i < count; i++) {
+            init_value = binary_op_t()(init_value, *input);
+            input += input_stride;
+        }
+        return init_value;
+    }
+};
 } // namespace nncase::ntt::ukernels
 
 namespace nncase::ntt {
 template <size_t M, size_t N, size_t MStrides, class TIn, class TOut>
-constexpr void upack(const TIn *input, TOut *output) noexcept {
-    ukernels::upack<M, N, MStrides, true, std::decay_t<TIn>, std::decay_t<TOut>>
+constexpr void u_pack(const TIn *input, TOut *output) noexcept {
+    ukernels::u_pack<M, N, MStrides, true, std::decay_t<TIn>,
+                     std::decay_t<TOut>>
         impl;
     impl(input, output);
 }
+
+template <reduce_op Op, class T>
+constexpr T u_reduce(const T *input, size_t input_stride, size_t count,
+                     T init_value) {
+    ukernels::u_reduce<Op, T, true> impl;
+    return impl(input, input_stride, count, init_value);
+}
 } // namespace nncase::ntt
diff --git a/src/Native/test/benchmark_test/CMakeLists.txt b/src/Native/test/benchmark_test/CMakeLists.txt
index df509c8410..1bbae71414 100644
--- a/src/Native/test/benchmark_test/CMakeLists.txt
+++ b/src/Native/test/benchmark_test/CMakeLists.txt
@@ -17,4 +17,5 @@ foreach(test_name ${TEST_NAMES})
     endif()
     add_executable(${tname} ${tname}.cpp)
     target_link_libraries(${tname} PRIVATE nncaseruntime)
+    target_compile_options(${tname} PRIVATE -ffast-math)
 endforeach()
\ No newline at end of file

From 90632e151090270ecb29591168dce0517fd17800 Mon Sep 17 00:00:00 2001
From: sunnycase <sunnycase@users.noreply.github.com>
Date: Thu, 12 Sep 2024 07:37:27 +0000
Subject: [PATCH 03/10] Apply code-format changes

---
 src/Native/include/nncase/ntt/shape.h      | 12 +++++-------
 src/Native/include/nncase/ntt/tensor_ops.h | 11 +++++------
 2 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/src/Native/include/nncase/ntt/shape.h b/src/Native/include/nncase/ntt/shape.h
index be4feab037..5c95256cc9 100644
--- a/src/Native/include/nncase/ntt/shape.h
+++ b/src/Native/include/nncase/ntt/shape.h
@@ -74,9 +74,7 @@ struct fixed_shape : detail::fixed_dims_base<Dims...> {
         using type = fixed_shape<I, Dims...>;
     };
 
-    template <size_t I> struct append {
-        using type = fixed_shape<Dims..., I>;
-    };
+    template <size_t I> struct append { using type = fixed_shape<Dims..., I>; };
 
     static constexpr size_t length() noexcept { return (Dims * ... * 1); }
 };
@@ -272,10 +270,10 @@ constexpr size_t contiguous_dims(const Shape &shape, const Strides &strides) {
 }
 
 template <class Shape, class Strides>
-inline constexpr size_t max_size_v =
-    (is_fixed_dims_v<Shape> && is_fixed_dims_v<Strides>)
-        ? linear_size(Shape{}, Strides{})
-        : std::dynamic_extent;
+inline constexpr size_t max_size_v = (is_fixed_dims_v<Shape> &&
+                                      is_fixed_dims_v<Strides>)
+                                         ? linear_size(Shape{}, Strides{})
+                                         : std::dynamic_extent;
 
 template <class Index, class Shape>
 constexpr bool in_bound(const Index &index, const Shape &shape) {
diff --git a/src/Native/include/nncase/ntt/tensor_ops.h b/src/Native/include/nncase/ntt/tensor_ops.h
index 400aef6d1f..9c713663c0 100644
--- a/src/Native/include/nncase/ntt/tensor_ops.h
+++ b/src/Native/include/nncase/ntt/tensor_ops.h
@@ -49,8 +49,7 @@ struct tensor_unary_impl<Op, TTensor> {
 };
 
 template <template <class T> class Op, IsTensor TTensor>
-    requires(TTensor::rank() == 2)
-struct tensor_unary_impl<Op, TTensor> {
+requires(TTensor::rank() == 2) struct tensor_unary_impl<Op, TTensor> {
     using sub_vector_type =
         fixed_tensor_alike_t<TTensor, TTensor::shape().at(1)>;
 
@@ -94,8 +93,8 @@ struct tensor_binary_impl<Op, TTensor, T2> {
 };
 
 template <template <class T1, class T2> class Op, IsTensor T1, IsTensor T2>
-    requires(T1::rank() == 2 && T2::rank() == 2)
-struct tensor_binary_impl<Op, T1, T2> {
+requires(T1::rank() == 2 &&
+         T2::rank() == 2) struct tensor_binary_impl<Op, T1, T2> {
     using sub_vector_type = fixed_tensor_alike_t<T1, T1::shape().at(1)>;
 
     constexpr T1 operator()(const T1 &v1, const T2 &v2) const noexcept {
@@ -178,8 +177,8 @@ template <IsTensor TTensor> struct inner_product<TTensor, TTensor> {
 
     constexpr auto operator()(const TTensor &v1,
                               const TTensor &v2) const noexcept {
-        using result_type = decltype(op_(std::declval<element_type>(),
-                                         std::declval<element_type>()));
+        using result_type = decltype(
+            op_(std::declval<element_type>(), std::declval<element_type>()));
         result_type value{};
         apply(v1.shape(),
               [&](auto index) { value += op_(v1(index), v2(index)); });

From dee30a4bd6c1b1db0d5d287cfbf321d0e3aeaa59 Mon Sep 17 00:00:00 2001
From: sunnycase <sunnycase@live.cn>
Date: Thu, 12 Sep 2024 08:49:24 +0000
Subject: [PATCH 04/10] [ntt.x86] Optimize u_reduce

---
 .../include/nncase/ntt/arch/x86_64/ukernels.h | 81 ++++++++++++++++---
 src/Native/test/benchmark_test/CMakeLists.txt |  1 -
 2 files changed, 70 insertions(+), 12 deletions(-)

diff --git a/src/Native/include/nncase/ntt/arch/x86_64/ukernels.h b/src/Native/include/nncase/ntt/arch/x86_64/ukernels.h
index a8a74e74df..0a2becffd5 100644
--- a/src/Native/include/nncase/ntt/arch/x86_64/ukernels.h
+++ b/src/Native/include/nncase/ntt/arch/x86_64/ukernels.h
@@ -49,21 +49,80 @@ template <reduce_op Op> struct u_reduce<Op, vector<float, 8>, true> {
             typename reduce_to_binary_type<Op>::template type<vector<float, 8>,
                                                               vector<float, 8>>;
         binary_op_t op;
-        if (count / 8) {
+        if (count / 4) {
             vector<float, 8> tmp[4];
-            while (count / 8) {
-                for (size_t j = 0; j < 4; j++) {
-                    tmp[j] = op(input[(j * 2) * input_stride],
-                                input[(j * 2 + 1) * input_stride]);
+            for (size_t i = 0; i < 4; i++) {
+                tmp[i] = input[i * input_stride];
+            }
+            input += input_stride * 4;
+            count -= 4;
+            while (count / 4) {
+                for (size_t i = 0; i < 4; i++) {
+                    tmp[i] = op(tmp[i], input[i * input_stride]);
                 }
-                input += input_stride * 8;
-                count -= 8;
+                input += input_stride * 4;
+                count -= 4;
+            }
 
-                tmp[0] = op(tmp[0], tmp[1]);
-                tmp[2] = op(tmp[2], tmp[3]);
-                tmp[0] = op(tmp[0], tmp[2]);
-                init_value = op(init_value, tmp[0]);
+            tmp[0] = op(tmp[0], tmp[1]);
+            tmp[2] = op(tmp[2], tmp[3]);
+            tmp[0] = op(tmp[0], tmp[2]);
+            init_value = op(init_value, tmp[0]);
+        }
+        
+        if (count / 2) {
+            vector<float, 8> tmp[2];
+            for (size_t i = 0; i < 2; i++) {
+                tmp[i] = input[i * input_stride];
             }
+            input += input_stride * 2;
+            count -= 2;
+            while (count / 2) {
+                for (size_t i = 0; i < 2; i++) {
+                    tmp[i] = op(tmp[i], input[i * input_stride]);
+                }
+                input += input_stride * 2;
+                count -= 2;
+            }
+
+            tmp[0] = op(tmp[0], tmp[1]);
+            init_value = op(init_value, tmp[0]);
+        }
+
+        for (size_t i = 0; i < count; i++) {
+            init_value = op(init_value, *input);
+            input += input_stride;
+        }
+        return init_value;
+    }
+};
+
+template <reduce_op Op> struct u_reduce<Op, float, true> {
+  public:
+    constexpr float operator()(const float *input, size_t input_stride,
+                               size_t count, float init_value) noexcept {
+        using binary_op_t =
+            typename reduce_to_binary_type<Op>::template type<float, float>;
+        binary_op_t op;
+        if (count / 4) {
+            float tmp[4];
+            for (size_t i = 0; i < 4; i++) {
+                tmp[i] = input[i * input_stride];
+            }
+            input += input_stride * 4;
+            count -= 4;
+            while (count / 4) {
+                for (size_t i = 0; i < 4; i++) {
+                    tmp[i] = op(tmp[i], input[i * input_stride]);
+                }
+                input += input_stride * 4;
+                count -= 4;
+            }
+
+            tmp[0] = op(tmp[0], tmp[1]);
+            tmp[2] = op(tmp[2], tmp[3]);
+            tmp[0] = op(tmp[0], tmp[2]);
+            init_value = op(init_value, tmp[0]);
         }
 
         for (size_t i = 0; i < count; i++) {
diff --git a/src/Native/test/benchmark_test/CMakeLists.txt b/src/Native/test/benchmark_test/CMakeLists.txt
index 1bbae71414..df509c8410 100644
--- a/src/Native/test/benchmark_test/CMakeLists.txt
+++ b/src/Native/test/benchmark_test/CMakeLists.txt
@@ -17,5 +17,4 @@ foreach(test_name ${TEST_NAMES})
     endif()
     add_executable(${tname} ${tname}.cpp)
     target_link_libraries(${tname} PRIVATE nncaseruntime)
-    target_compile_options(${tname} PRIVATE -ffast-math)
 endforeach()
\ No newline at end of file

From c0888b0727bbcf9cabaf319b58aaa9815e459b5c Mon Sep 17 00:00:00 2001
From: sunnycase <sunnycase@users.noreply.github.com>
Date: Thu, 12 Sep 2024 08:51:49 +0000
Subject: [PATCH 05/10] Apply code-format changes

---
 src/Native/include/nncase/ntt/arch/x86_64/ukernels.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Native/include/nncase/ntt/arch/x86_64/ukernels.h b/src/Native/include/nncase/ntt/arch/x86_64/ukernels.h
index 0a2becffd5..0db13ca123 100644
--- a/src/Native/include/nncase/ntt/arch/x86_64/ukernels.h
+++ b/src/Native/include/nncase/ntt/arch/x86_64/ukernels.h
@@ -69,7 +69,7 @@ template <reduce_op Op> struct u_reduce<Op, vector<float, 8>, true> {
             tmp[0] = op(tmp[0], tmp[2]);
             init_value = op(init_value, tmp[0]);
         }
-        
+
         if (count / 2) {
             vector<float, 8> tmp[2];
             for (size_t i = 0; i < 2; i++) {

From 8b83855ee2e6345178663fbb479290dd77b388e6 Mon Sep 17 00:00:00 2001
From: sunnycase <sunnycase@live.cn>
Date: Thu, 12 Sep 2024 09:06:00 +0000
Subject: [PATCH 06/10] Update actions

---
 .../compiler-python-build.yml                 |  6 +++---
 .github/disable-workflows/runtime-k210.yml    |  2 +-
 .../runtime-linux-x64-gcc.yml                 |  4 ++--
 .../runtime-macos-x64-appleclang.yml          |  4 ++--
 .../runtime-win-x64-msvc.yml                  |  4 ++--
 .github/workflows/compiler-build.yml          | 20 +++++++++----------
 .github/workflows/compiler-python-release.yml |  6 +++---
 .github/workflows/runtime-build.yml           |  6 +++---
 8 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/.github/disable-workflows/compiler-python-build.yml b/.github/disable-workflows/compiler-python-build.yml
index c70237c0bd..af22688caf 100644
--- a/.github/disable-workflows/compiler-python-build.yml
+++ b/.github/disable-workflows/compiler-python-build.yml
@@ -38,7 +38,7 @@ jobs:
       run: python -m cibuildwheel --output-dir wheelhouse
       
     - name: Upload a Build Artifact
-      uses: actions/upload-artifact@v2.2.2
+      uses: actions/upload-artifact@v4
       if: runner.os == 'Windows'
       with:
         name: nncase-python-windows
@@ -46,7 +46,7 @@ jobs:
         if-no-files-found: error
       
     - name: Upload a Build Artifact
-      uses: actions/upload-artifact@v2.2.2
+      uses: actions/upload-artifact@v4
       if: runner.os == 'Linux'
       with:
         name: nncase-python-linux
@@ -54,7 +54,7 @@ jobs:
         if-no-files-found: error
       
     - name: Upload a Build Artifact
-      uses: actions/upload-artifact@v2.2.2
+      uses: actions/upload-artifact@v4
       if: runner.os == 'Macos'
       with:
         name: nncase-python-macos
diff --git a/.github/disable-workflows/runtime-k210.yml b/.github/disable-workflows/runtime-k210.yml
index cc98ee8c3b..d3b2cdb551 100644
--- a/.github/disable-workflows/runtime-k210.yml
+++ b/.github/disable-workflows/runtime-k210.yml
@@ -73,7 +73,7 @@ jobs:
       run: cmake --install . --prefix ../install
       
     - name: Upload a Build Artifact
-      uses: actions/upload-artifact@v2.2.2
+      uses: actions/upload-artifact@v4
       with:
         name: nncaseruntime-k210
         path: ${{github.workspace}}/install
diff --git a/.github/disable-workflows/runtime-linux-x64-gcc.yml b/.github/disable-workflows/runtime-linux-x64-gcc.yml
index fe755cd81a..fc8ebf539e 100644
--- a/.github/disable-workflows/runtime-linux-x64-gcc.yml
+++ b/.github/disable-workflows/runtime-linux-x64-gcc.yml
@@ -58,14 +58,14 @@ jobs:
       run: ${{github.workspace}}/install/bin/benchnncase > benchnncase.log
       
     - name: Upload a Build Artifact
-      uses: actions/upload-artifact@v2.2.2
+      uses: actions/upload-artifact@v4
       with:
         name: nncaseruntime-linux-x64-gcc
         path: ${{github.workspace}}/install
         if-no-files-found: error
       
     - name: Upload Benchmark Result
-      uses: actions/upload-artifact@v2.2.2
+      uses: actions/upload-artifact@v4
       with:
         name: nncasebenchmark-linux-x64-gcc
         path: ${{github.workspace}}/benchnncase.log
diff --git a/.github/disable-workflows/runtime-macos-x64-appleclang.yml b/.github/disable-workflows/runtime-macos-x64-appleclang.yml
index 45b4a489dc..eabefadda9 100644
--- a/.github/disable-workflows/runtime-macos-x64-appleclang.yml
+++ b/.github/disable-workflows/runtime-macos-x64-appleclang.yml
@@ -55,14 +55,14 @@ jobs:
       run: ${{github.workspace}}/install/bin/benchnncase > benchnncase.log
       
     - name: Upload a Build Artifact
-      uses: actions/upload-artifact@v2.2.2
+      uses: actions/upload-artifact@v4
       with:
         name: nncaseruntime-macos-x64-appleclang
         path: ${{github.workspace}}/install
         if-no-files-found: error
       
     - name: Upload Benchmark Result
-      uses: actions/upload-artifact@v2.2.2
+      uses: actions/upload-artifact@v4
       with:
         name: nncasebenchmark-macos-x64-appleclang
         path: ${{github.workspace}}/benchnncase.log
diff --git a/.github/disable-workflows/runtime-win-x64-msvc.yml b/.github/disable-workflows/runtime-win-x64-msvc.yml
index a51d8be994..374cf15770 100644
--- a/.github/disable-workflows/runtime-win-x64-msvc.yml
+++ b/.github/disable-workflows/runtime-win-x64-msvc.yml
@@ -54,14 +54,14 @@ jobs:
       run: .\install\bin\benchnncase.exe > benchnncase.log
       
     - name: Upload a Build Artifact
-      uses: actions/upload-artifact@v2.2.2
+      uses: actions/upload-artifact@v4
       with:
         name: nncaseruntime-win-x64-msvc
         path: ${{github.workspace}}/install
         if-no-files-found: error
       
     - name: Upload Benchmark Result
-      uses: actions/upload-artifact@v2.2.2
+      uses: actions/upload-artifact@v4
       with:
         name: nncasebenchmark-win-x64-msvc
         path: ${{github.workspace}}/benchnncase.log
diff --git a/.github/workflows/compiler-build.yml b/.github/workflows/compiler-build.yml
index 2c7d703282..5348c36676 100644
--- a/.github/workflows/compiler-build.yml
+++ b/.github/workflows/compiler-build.yml
@@ -61,7 +61,7 @@ jobs:
           cmake --install build/${{matrix.config.buildType}} --prefix install
 
       - name: Upload nncase Native Build Artifact
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
           name: nncase-native-${{matrix.config.name}}
           path: ${{github.workspace}}/install
@@ -97,7 +97,7 @@ jobs:
             ${{ runner.os }}-nuget-
 
       - name: Install nncase native Artifact
-        uses: actions/download-artifact@v2.0.9
+        uses: actions/download-artifact@v4
         with:
           name: nncase-native-${{matrix.config.name}}
           path: ${{github.workspace}}/install
@@ -139,7 +139,7 @@ jobs:
           dotnet-coverage merge -o coverage.unit.xml -f cobertura -r coverage/*.xml
 
       - name: Upload Coverage
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         if: matrix.config.name == 'x86_64-linux'
         with:
           name: nncase-coverage-unit
@@ -147,7 +147,7 @@ jobs:
           if-no-files-found: error
 
       - name: Upload nncase Build Artifact
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
           name: nncase-${{matrix.config.name}}
           path: ${{github.workspace}}/src/Nncase.Compiler/bin/${{matrix.config.buildType}}/net${{matrix.dotnet-version}}/${{matrix.config.rid}}/publish
@@ -185,13 +185,13 @@ jobs:
           cache-dependency-path: '**/requirements.test.txt'
 
       - name: Install nncase native Artifact
-        uses: actions/download-artifact@v3
+        uses: actions/download-artifact@v4
         with:
           name: nncase-native-${{matrix.config.name}}
           path: ${{github.workspace}}/install
 
       - name: Install nncase
-        uses: actions/download-artifact@v3
+        uses: actions/download-artifact@v4
         with:
           name: nncase-${{matrix.config.name}}
           path: ${{github.workspace}}/install
@@ -259,7 +259,7 @@ jobs:
           dotnet-coverage merge -o coverage.integration.xml -f cobertura -r coverage/*.xml
 
       - name: Upload Coverage
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         if: matrix.config.name == 'x86_64-linux'
         with:
           name: nncase-coverage-integration
@@ -283,13 +283,13 @@ jobs:
           dotnet-version: "8.0"
 
       - name: Download Unit Test Coverage
-        uses: actions/download-artifact@v3
+        uses: actions/download-artifact@v4
         with:
           name: nncase-coverage-unit
           path: ${{github.workspace}}/coverage
 
       - name: Download Integration Test Coverage
-        uses: actions/download-artifact@v3
+        uses: actions/download-artifact@v4
         with:
           name: nncase-coverage-integration
           path: ${{github.workspace}}/coverage
@@ -314,7 +314,7 @@ jobs:
           reportgenerator -reports:coverage.xml -targetdir:"coveragereport" -reporttypes:Html
 
       - name: Upload Coverage Report
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
           name: nncase-coverage-report
           path: coveragereport
diff --git a/.github/workflows/compiler-python-release.yml b/.github/workflows/compiler-python-release.yml
index 7b36487bbc..dc4cff2939 100644
--- a/.github/workflows/compiler-python-release.yml
+++ b/.github/workflows/compiler-python-release.yml
@@ -39,7 +39,7 @@ jobs:
           dotnet publish src/Nncase.Compiler -c ${{matrix.config.buildType}} --no-restore --sc false -r ${{matrix.config.rid}}
 
       - name: Upload nncase Build Artifact
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
           name: nncase-${{matrix.config.name}}
           path: ${{github.workspace}}/src/Nncase.Compiler/bin/${{matrix.config.buildType}}/net${{matrix.dotnet-version}}/${{matrix.config.rid}}/publish
@@ -69,7 +69,7 @@ jobs:
         dotnet-version: ${{matrix.dotnet-version}}
 
     - name: Install nncase
-      uses: actions/download-artifact@v3
+      uses: actions/download-artifact@v4
       with:
         name: nncase-${{matrix.config.name}}
         path: ${{github.workspace}}/install
@@ -97,7 +97,7 @@ jobs:
       run: python -m cibuildwheel --output-dir wheelhouse
 
     - name: Upload nncase-python Build Artifact
-      uses: actions/upload-artifact@v3
+      uses: actions/upload-artifact@v4
       with:
         name: nncase-python-${{matrix.config.name}}
         path: ${{github.workspace}}/wheelhouse
diff --git a/.github/workflows/runtime-build.yml b/.github/workflows/runtime-build.yml
index 7a1e8746fa..ea95335d53 100644
--- a/.github/workflows/runtime-build.yml
+++ b/.github/workflows/runtime-build.yml
@@ -67,14 +67,14 @@ jobs:
       #    cat benchnncase.log
 
       - name: Upload nncaseruntime Build Artifact
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
           name: nncaseruntime-${{matrix.config.name}}
           path: ${{github.workspace}}/install
           if-no-files-found: error
 
     #- name: Upload nncaseruntime Benchmark
-    #  uses: actions/upload-artifact@v3
+    #  uses: actions/upload-artifact@v4
     #  with:
     #    name: nncaseruntime-benchmark-${{matrix.config.name}}
     #    path: ${{github.workspace}}/benchnncase.log
@@ -134,7 +134,7 @@ jobs:
           ctest -C ${{matrix.config.buildType}} --test-dir src/Native/test/ctest --output-on-failure -j4
 
       - name: Upload nncaseruntime Build Artifact
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
           name: nncaseruntime-${{matrix.config.name}}
           path: ${{github.workspace}}/install

From 6dc1c32a15f967dc77cdd7686a8ff91185279aaa Mon Sep 17 00:00:00 2001
From: sunnycase <sunnycase@live.cn>
Date: Thu, 12 Sep 2024 09:41:22 +0000
Subject: [PATCH 07/10] Remove packed softmax/layernorm packRule

---
 .../CodeGen/CPU/CSourceExtensions.cs          |   4 +-
 .../CPU/KernelCSourceConvertVisitor.cs        |   2 +-
 .../Passes/Rules/CPU/PackRule.cs              | 118 ------------------
 .../Nncase.Modules.CPU/Targets/CPUTarget.cs   |   2 -
 .../Targets/UnitTestCPUKernels.cs             |  23 ----
 5 files changed, 3 insertions(+), 146 deletions(-)

diff --git a/modules/Nncase.Modules.CPU/CodeGen/CPU/CSourceExtensions.cs b/modules/Nncase.Modules.CPU/CodeGen/CPU/CSourceExtensions.cs
index 3eb5c56457..f31e6216a7 100644
--- a/modules/Nncase.Modules.CPU/CodeGen/CPU/CSourceExtensions.cs
+++ b/modules/Nncase.Modules.CPU/CodeGen/CPU/CSourceExtensions.cs
@@ -41,9 +41,9 @@ public static string ToC(this PrimType primType) =>
     {
         ReduceOp.Min => "min",
         ReduceOp.Max => "max",
-        ReduceOp.Sum => "add",
+        ReduceOp.Sum => "sum",
         ReduceOp.Mean => "mean",
-        ReduceOp.Prod => "mul",
+        ReduceOp.Prod => "prod",
         _ => throw new NotImplementedException(),
     };
 
diff --git a/modules/Nncase.Modules.CPU/CodeGen/CPU/KernelCSourceConvertVisitor.cs b/modules/Nncase.Modules.CPU/CodeGen/CPU/KernelCSourceConvertVisitor.cs
index e67399d2a4..9546629d5f 100644
--- a/modules/Nncase.Modules.CPU/CodeGen/CPU/KernelCSourceConvertVisitor.cs
+++ b/modules/Nncase.Modules.CPU/CodeGen/CPU/KernelCSourceConvertVisitor.cs
@@ -452,7 +452,7 @@ protected override CSymbol VisitCall(Call expr)
                     IndentScope.Writer.Write($"pad<{string.Join(",", pad.Paddings)}>({Visit(args[0]).Name}, {Visit(args[1]).Name}, {args[0].CheckedDataType.ToC()} {{ {pad.PadValue} }} );\n");
                     break;
                 case TIR.CPU.Reduce reduce:
-                    IndentScope.Writer.Write($"reduce<ops::{reduce.ReduceOp.ToC()}>({Visit(args[0]).Name}, {Visit(args[1]).Name}, fixed_shape<{string.Join(",", reduce.Axis)}>{{}}, fixed_shape<{string.Join(",", reduce.PackedAxes)}>{{}}, fixed_shape<{string.Join(",", reduce.PadedNums)}>{{}});\n");
+                    IndentScope.Writer.Write($"reduce_{reduce.ReduceOp.ToC()}<fixed_shape<{string.Join(",", reduce.Axis)}>, fixed_shape<{string.Join(",", reduce.PackedAxes)}>, fixed_shape<{string.Join(",", reduce.PadedNums)}>>({Visit(args[0]).Name}, {Visit(args[1]).Name});\n");
                     break;
                 case TIR.CPU.ReduceArg reduceArg:
                     IndentScope.Writer.Write($"reduce_arg<ops::{reduceArg.ReduceArgOp.ToC()[4..]}, {reduceArg.Axis}, {reduceArg.SelectLastIndex.ToString().ToLower(System.Globalization.CultureInfo.CurrentCulture)}, {reduceArg.KeepDims.ToString().ToLower(System.Globalization.CultureInfo.CurrentCulture)}>({Visit(args[0]).Name}, {Visit(args[1]).Name}, fixed_shape<>{{}}, fixed_shape<>{{}});\n");
diff --git a/modules/Nncase.Modules.CPU/Passes/Rules/CPU/PackRule.cs b/modules/Nncase.Modules.CPU/Passes/Rules/CPU/PackRule.cs
index ec1342c708..711125efb5 100644
--- a/modules/Nncase.Modules.CPU/Passes/Rules/CPU/PackRule.cs
+++ b/modules/Nncase.Modules.CPU/Passes/Rules/CPU/PackRule.cs
@@ -35,52 +35,6 @@ public PackRule(int rank, int lane)
     public override Expr? GetReplace(IMatchResult result, RunPassContext options) => throw new NotImplementedException();
 }
 
-public class PackSoftmax : PackRule
-{
-    public PackSoftmax(int rank, int lane)
-        : base(rank, lane)
-    {
-    }
-
-    public override Pattern Pattern { get; } = IsSoftmax(
-      "target",
-      IsWildcard("input") with { TypePattern = IsFloat() },
-      IsWildcard("axis") with { TypePattern = IsIntegralScalar() });
-
-    public override List<Expr> GetReplaceCandidates(IMatchResult result, RunPassContext context)
-    {
-        var rets = new List<Expr>();
-        var input = (Expr)result["input"];
-        var axis = ((TensorConst)result["axis"]).Value.ToScalar<int>();
-        var inShape = input.CheckedShape.ToValueArray();
-
-        void AddCandidate(int[] packedAxes, int[] lanes)
-        {
-            var packed = IR.F.CPU.Pack(PackUtility.PadForPack(input, inShape, packedAxes, lanes, float.NegativeInfinity, out var pads), lanes, packedAxes);
-            var softmax = IR.F.CPU.PackedSoftmax(packed, axis, packedAxes);
-            if (softmax.CheckedType is not InvalidType)
-            {
-                var post = PackUtility.SliceForPack(IR.F.CPU.Unpack(softmax, lanes, packedAxes), inShape, pads);
-                rets.Add(post);
-            }
-        }
-
-        for (int i = 0; i < input.CheckedShape.Count; i++)
-        {
-            AddCandidate(new[] { i }, new[] { Lane });
-            for (int j = i + 1; j < input.CheckedShape.Count; j++)
-            {
-                if (Rank > 1)
-                {
-                    AddCandidate(new[] { i, j }, new[] { Lane, Lane });
-                }
-            }
-        }
-
-        return rets;
-    }
-}
-
 public sealed class PackResizeImage : PackRule
 {
     public PackResizeImage(int rank, int lane)
@@ -190,78 +144,6 @@ void AddCandidate(int[] packedAxes, int[] lanes)
     }
 }
 
-public sealed class PackLayerNorm : PackRule
-{
-    public PackLayerNorm(int rank, int lane)
-        : base(rank, lane)
-    {
-    }
-
-    public override Pattern Pattern { get; } = IsLayerNorm(
-      "target",
-      _ => true,
-      IsWildcard("input") with { TypePattern = IsFloat() },
-      IsWildcard("scale") with { TypePattern = IsFloat() },
-      IsWildcard("bias") with { TypePattern = IsFloat() });
-
-    public override List<Expr> GetReplaceCandidates(IMatchResult result, RunPassContext context)
-    {
-        var rets = new List<Expr>();
-        var op = (IR.NN.LayerNorm)result["target"];
-        var input = (Expr)result["input"];
-        var scale = (Expr)result["scale"];
-        var bias = (Expr)result["bias"];
-        var inShape = input.CheckedShape.ToValueArray();
-        var pshape = scale.CheckedShape.ToValueArray();
-
-        void AddCandidate(int[] packedAxes, int[] lanes)
-        {
-            var packedInput = IR.F.CPU.Pack(PackUtility.PadForPack(input, inShape, packedAxes, lanes, 0f, out var padsInput), lanes, packedAxes);
-
-            // todo support padings.
-            if (padsInput.Any(x => x > 0))
-            {
-                return;
-            }
-
-            var pAxes = packedAxes.Where(i => i >= op.Axis).Select(i => i - op.Axis).ToArray();
-            var packedScale = PackUtility.PadForPack(scale, pshape, pAxes, lanes, 0f, out var padsScale);
-            if (pAxes.Length > 0)
-            {
-                packedScale = IR.F.CPU.Pack(packedScale, Enumerable.Repeat(Lane, pAxes.Length).ToArray(), pAxes);
-            }
-
-            var packedBias = PackUtility.PadForPack(bias, pshape, pAxes, lanes, 0f, out var padsBias);
-            if (pAxes.Length > 0)
-            {
-                packedBias = IR.F.CPU.Pack(packedBias, Enumerable.Repeat(Lane, pAxes.Length).ToArray(), pAxes);
-            }
-
-            var layernorm = IR.F.CPU.PackedLayerNorm(packedInput, packedScale, packedBias, op.Axis, op.Epsilon, op.UseMean, packedAxes, padsInput);
-
-            if (layernorm.CheckedType is not InvalidType)
-            {
-                var post = PackUtility.SliceForPack(IR.F.CPU.Unpack(layernorm, lanes, packedAxes), inShape, padsInput);
-                rets.Add(post);
-            }
-        }
-
-        for (int i = 0; i < input.CheckedShape.Count; i++)
-        {
-            AddCandidate(new[] { i }, new[] { Lane });
-            for (int j = i + 1; j < input.CheckedShape.Count; j++)
-            {
-                if (Rank > 1)
-                {
-                    AddCandidate(new[] { i, j }, new[] { Lane, Lane });
-                }
-            }
-        }
-
-        return rets;
-    }
-}
-
 public sealed class PackMatMul : PackRule
 {
     public PackMatMul(int rank, int lane)
diff --git a/modules/Nncase.Modules.CPU/Targets/CPUTarget.cs b/modules/Nncase.Modules.CPU/Targets/CPUTarget.cs
index 99d894abfa..c391ff123c 100644
--- a/modules/Nncase.Modules.CPU/Targets/CPUTarget.cs
+++ b/modules/Nncase.Modules.CPU/Targets/CPUTarget.cs
@@ -94,9 +94,7 @@ public void RegisterTargetDependentAfterQuantPass(IPassManager passManager, Comp
                 // todo config it in the target options.
                 var rank = 1;
                 var lane = System.Runtime.Intrinsics.Vector256.IsHardwareAccelerated ? 8 : 4;
-                p.Add<Passes.Rules.CPU.PackSoftmax>(rank, lane);
                 p.Add<Passes.Rules.CPU.PackSwish>(rank, lane);
-                p.Add<Passes.Rules.CPU.PackLayerNorm>(rank, lane);
                 p.Add<Passes.Rules.CPU.PackResizeImage>(rank, lane);
                 p.Add<Passes.Rules.CPU.PackMatMul>(rank, lane);
                 p.Add<Passes.Rules.CPU.PackConv2D>(rank, lane);
diff --git a/src/Nncase.Tests/Targets/UnitTestCPUKernels.cs b/src/Nncase.Tests/Targets/UnitTestCPUKernels.cs
index 3720b2c449..348b8c8df3 100644
--- a/src/Nncase.Tests/Targets/UnitTestCPUKernels.cs
+++ b/src/Nncase.Tests/Targets/UnitTestCPUKernels.cs
@@ -164,29 +164,6 @@ public async Task TestPackBinary(BinaryOp op, int[] lhsShape, int[] rhsShape, in
         await RunCases(Path.Join(CompileOptions.DumpDir.ToString(), $"Theory{count}"), feedDict, posts);
     }
 
-    [Theory]
-    [InlineData(new object[] { new[] { 1, 2, 16 }, 2, 1e-6, true, 0 })]
-    [InlineData(new object[] { new[] { 1, 2, 16 }, 2, 1e-6, false, 1 })]
-    public async Task TestLayerNorm(int[] shape, int axis, float epsion, bool useMean, int count)
-    {
-        var input = new Var(new TensorType(DataTypes.Float32, shape));
-        var pshape = shape.Skip(axis).ToArray();
-        var scale = new Var(new TensorType(DataTypes.Float32, pshape));
-        var bias = new Var(new TensorType(DataTypes.Float32, pshape));
-        var pre = IR.F.NN.LayerNorm(axis, epsion, input, scale, bias, useMean);
-
-        var feedDict = new Dictionary<Var, IValue>() {
-            { input, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, shape).Evaluate() },
-            { scale, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, pshape).Evaluate() },
-            { bias, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, pshape).Evaluate() },
-        };
-
-        var rule = new Passes.Rules.CPU.PackLayerNorm(Rank, Lane);
-        CompilerServices.TryMatch(pre, rule.Pattern, out var result);
-        var posts = new[] { pre }.Concat(rule.GetReplaceCandidates(result!, new Passes.RunPassContext())).Where(e => e is not Call { Target: Slice });
-        await RunCases(Path.Join(CompileOptions.DumpDir.ToString(), $"Theory{count}"), feedDict, posts);
-    }
-
     [Theory]
     [InlineData(new object[] { new[] { 1, 2, 16, 32 }, 1e-5, 0 })]
     [InlineData(new object[] { new[] { 1, 32, 2048 }, 1e-6, 1 })]

From b9f7c84eb2756fd8dfc83f1861d42169c0c55eaa Mon Sep 17 00:00:00 2001
From: sunnycase <sunnycase@live.cn>
Date: Fri, 13 Sep 2024 10:07:17 +0000
Subject: [PATCH 08/10] Support reduce with keepdims=False

---
 .../include/nncase/ntt/arch/x86_64/ukernels.h | 93 +------------------
 .../include/nncase/ntt/kernels/reduce.h       | 45 ++++++++-
 src/Native/include/nncase/ntt/shape.h         | 19 +++-
 src/Native/include/nncase/ntt/tensor.h        | 68 +++++++++++++-
 src/Native/include/nncase/ntt/ukernels.h      | 49 ++++++++++
 src/Native/include/nncase/ntt/utility.h       | 35 +++++++
 6 files changed, 209 insertions(+), 100 deletions(-)

diff --git a/src/Native/include/nncase/ntt/arch/x86_64/ukernels.h b/src/Native/include/nncase/ntt/arch/x86_64/ukernels.h
index 0db13ca123..721f530dc9 100644
--- a/src/Native/include/nncase/ntt/arch/x86_64/ukernels.h
+++ b/src/Native/include/nncase/ntt/arch/x86_64/ukernels.h
@@ -40,96 +40,7 @@ class u_pack<M, N, MStrides, true, float, vector<float, 8>> {
     }
 };
 
-template <reduce_op Op> struct u_reduce<Op, vector<float, 8>, true> {
-  public:
-    constexpr vector<float, 8>
-    operator()(const vector<float, 8> *input, size_t input_stride, size_t count,
-               vector<float, 8> init_value) noexcept {
-        using binary_op_t =
-            typename reduce_to_binary_type<Op>::template type<vector<float, 8>,
-                                                              vector<float, 8>>;
-        binary_op_t op;
-        if (count / 4) {
-            vector<float, 8> tmp[4];
-            for (size_t i = 0; i < 4; i++) {
-                tmp[i] = input[i * input_stride];
-            }
-            input += input_stride * 4;
-            count -= 4;
-            while (count / 4) {
-                for (size_t i = 0; i < 4; i++) {
-                    tmp[i] = op(tmp[i], input[i * input_stride]);
-                }
-                input += input_stride * 4;
-                count -= 4;
-            }
-
-            tmp[0] = op(tmp[0], tmp[1]);
-            tmp[2] = op(tmp[2], tmp[3]);
-            tmp[0] = op(tmp[0], tmp[2]);
-            init_value = op(init_value, tmp[0]);
-        }
-
-        if (count / 2) {
-            vector<float, 8> tmp[2];
-            for (size_t i = 0; i < 2; i++) {
-                tmp[i] = input[i * input_stride];
-            }
-            input += input_stride * 2;
-            count -= 2;
-            while (count / 2) {
-                for (size_t i = 0; i < 2; i++) {
-                    tmp[i] = op(tmp[i], input[i * input_stride]);
-                }
-                input += input_stride * 2;
-                count -= 2;
-            }
-
-            tmp[0] = op(tmp[0], tmp[1]);
-            init_value = op(init_value, tmp[0]);
-        }
-
-        for (size_t i = 0; i < count; i++) {
-            init_value = op(init_value, *input);
-            input += input_stride;
-        }
-        return init_value;
-    }
-};
-
-template <reduce_op Op> struct u_reduce<Op, float, true> {
-  public:
-    constexpr float operator()(const float *input, size_t input_stride,
-                               size_t count, float init_value) noexcept {
-        using binary_op_t =
-            typename reduce_to_binary_type<Op>::template type<float, float>;
-        binary_op_t op;
-        if (count / 4) {
-            float tmp[4];
-            for (size_t i = 0; i < 4; i++) {
-                tmp[i] = input[i * input_stride];
-            }
-            input += input_stride * 4;
-            count -= 4;
-            while (count / 4) {
-                for (size_t i = 0; i < 4; i++) {
-                    tmp[i] = op(tmp[i], input[i * input_stride]);
-                }
-                input += input_stride * 4;
-                count -= 4;
-            }
-
-            tmp[0] = op(tmp[0], tmp[1]);
-            tmp[2] = op(tmp[2], tmp[3]);
-            tmp[0] = op(tmp[0], tmp[2]);
-            init_value = op(init_value, tmp[0]);
-        }
-
-        for (size_t i = 0; i < count; i++) {
-            init_value = op(init_value, *input);
-            input += input_stride;
-        }
-        return init_value;
-    }
+template <reduce_op Op, class T> struct u_reduce_policy<Op, T, true> {
+    static constexpr size_t unroll = 8;
 };
 } // namespace nncase::ntt::ukernels
diff --git a/src/Native/include/nncase/ntt/kernels/reduce.h b/src/Native/include/nncase/ntt/kernels/reduce.h
index f41d476af4..99d28a3d8b 100644
--- a/src/Native/include/nncase/ntt/kernels/reduce.h
+++ b/src/Native/include/nncase/ntt/kernels/reduce.h
@@ -55,7 +55,7 @@ class reduce_impl {
     constexpr void operator()(const TIn &input, TOut &output) {
         ntt::apply(output.shape(), [&](auto index) {
             auto reduced_in = (TInElem)initial_value();
-            apply_reduce<0>(input, index, reduced_in);
+            apply_reduce(input, reduce_source_offset<TIn::rank(), Axes>(index), reduced_in);
             if constexpr (IsScalar<TOutElem>) {
                 output(index) = ntt::reduce<
                     ukernels::reduce_to_binary_type<Op>::template type,
@@ -80,15 +80,54 @@ class reduce_impl {
     }
 
   private:
-    template <size_t ReduceIndex>
     constexpr void apply_reduce(const TIn &input,
                                 ranked_shape<TIn::rank()> index,
                                 TInElem &reduced_in) {
+        auto src_tensor =
+            input.view(index, fixed_reduce_source_shape_type<Axes, TIn>());
+        auto conti_dims =
+            contiguous_dims(src_tensor.shape(), src_tensor.strides());
+        if (conti_dims > 1) {
+            ranked_shape<TIn::rank()> src_index{};
+            apply_contiguous_reduce<0>(src_index, conti_dims, src_tensor,
+                                       reduced_in);
+        } else {
+            apply_non_contiguous_reduce<0>(input, index, reduced_in);
+        }
+    }
+
+    template <size_t Axis, class TSubIn>
+    constexpr void apply_contiguous_reduce(ranked_shape<TSubIn::rank()> &index,
+                                           size_t conti_dims,
+                                           const TSubIn &input,
+                                           TInElem &reduced_in) {
+        const auto outer_dims = TSubIn::rank() - conti_dims;
+        if (Axis >= outer_dims) {
+            size_t inner_size = 1;
+            for (size_t i = outer_dims; i < input.shape().rank(); i++)
+                inner_size *= input.shape()[i];
+            auto input_p =
+                input.buffer().data() + linear_offset(index, input.strides());
+            reduced_in = ntt::u_reduce<Op>(input_p, 1, inner_size, reduced_in);
+        } else if constexpr (Axis < TSubIn::rank() - 1) {
+            const auto dim = input.shape()[Axis];
+            for (index[Axis] = 0; index[Axis] < dim; index[Axis]++) {
+                apply_contiguous_reduce<Axis + 1>(index, conti_dims, input,
+                                                  reduced_in);
+            }
+        }
+    }
+
+    template <size_t ReduceIndex>
+    constexpr void apply_non_contiguous_reduce(const TIn &input,
+                                               ranked_shape<TIn::rank()> index,
+                                               TInElem &reduced_in) {
         constexpr size_t Axis = Axes::at(ReduceIndex);
         if constexpr (ReduceIndex < Axes::rank() - 1) {
             for (size_t i = 0; i < input.shape()[Axis]; i++) {
                 index[Axis] = i;
-                apply_reduce<ReduceIndex + 1>(input, index, reduced_in);
+                apply_non_contiguous_reduce<ReduceIndex + 1>(input, index,
+                                                             reduced_in);
             }
         } else {
             const TInElem *in_p = &input(index);
diff --git a/src/Native/include/nncase/ntt/shape.h b/src/Native/include/nncase/ntt/shape.h
index 5c95256cc9..31e3f06ae3 100644
--- a/src/Native/include/nncase/ntt/shape.h
+++ b/src/Native/include/nncase/ntt/shape.h
@@ -59,6 +59,7 @@ template <size_t Rank> struct ranked_dims_base {
     constexpr auto end() const noexcept { return dims_.end(); }
 
     constexpr size_t last() const noexcept { return at(rank() - 1); }
+    constexpr size_t &last() noexcept { return at(rank() - 1); }
 
     constexpr bool contains(size_t value) const noexcept {
         return std::find(begin(), end(), value) != end();
@@ -74,7 +75,9 @@ struct fixed_shape : detail::fixed_dims_base<Dims...> {
         using type = fixed_shape<I, Dims...>;
     };
 
-    template <size_t I> struct append { using type = fixed_shape<Dims..., I>; };
+    template <size_t I> struct append {
+        using type = fixed_shape<Dims..., I>;
+    };
 
     static constexpr size_t length() noexcept { return (Dims * ... * 1); }
 };
@@ -270,10 +273,10 @@ constexpr size_t contiguous_dims(const Shape &shape, const Strides &strides) {
 }
 
 template <class Shape, class Strides>
-inline constexpr size_t max_size_v = (is_fixed_dims_v<Shape> &&
-                                      is_fixed_dims_v<Strides>)
-                                         ? linear_size(Shape{}, Strides{})
-                                         : std::dynamic_extent;
+inline constexpr size_t max_size_v =
+    (is_fixed_dims_v<Shape> && is_fixed_dims_v<Strides>)
+        ? linear_size(Shape{}, Strides{})
+        : std::dynamic_extent;
 
 template <class Index, class Shape>
 constexpr bool in_bound(const Index &index, const Shape &shape) {
@@ -324,4 +327,10 @@ ranked_shape<Rank> get_reduced_offset(Index in_offset) {
 
     return off;
 }
+
+template <size_t RankA, size_t RankB>
+bool operator==(const ranked_shape<RankA> &lhs,
+                const ranked_shape<RankB> &rhs) noexcept {
+    return RankA == RankB && std::equal(lhs.begin(), lhs.end(), rhs.begin());
+}
 } // namespace nncase::ntt
diff --git a/src/Native/include/nncase/ntt/tensor.h b/src/Native/include/nncase/ntt/tensor.h
index 9a95445889..e3fcd4c125 100644
--- a/src/Native/include/nncase/ntt/tensor.h
+++ b/src/Native/include/nncase/ntt/tensor.h
@@ -15,6 +15,7 @@
 #pragma once
 #include "detail/shape_storage.h"
 #include "detail/tensor_storage.h"
+#include "nncase/ntt/shape.h"
 #include "tensor_traits.h"
 
 namespace nncase::ntt {
@@ -41,7 +42,8 @@ struct fixed_tensor_alike_type<basic_tensor<T, Shape, Strides, MaxSize, IsView>,
 
 namespace detail {
 template <class T, class Shape, class Strides, size_t MaxSize, bool IsView,
-          bool IsFixedShape = is_fixed_dims_v<Shape> &&is_fixed_dims_v<Strides>>
+          bool IsFixedShape =
+              is_fixed_dims_v<Shape> && is_fixed_dims_v<Strides>>
 class tensor_impl;
 
 // dynamic tensor
@@ -137,12 +139,55 @@ class basic_tensor
 
     using impl_type::impl_type;
 
+    class const_iterator {
+      public:
+        const_iterator(const basic_tensor &tensor,
+                       ranked_shape<shape_type::rank()> index) noexcept
+            : tensor_(tensor), index_(index) {}
+        // Pre-increment: bump the last axis, then carry overflow leftwards.
+        const_iterator &operator++() noexcept {
+            index_.last() += 1;
+            for (size_t i = index_.rank() - 1; i > 0; i--) {
+                if (index_[i] >= tensor_.shape()[i]) {
+                    index_[i - 1]++;
+                    index_[i] = 0;
+                }
+            }
+            return *this;
+        }
+        // Post-increment: advance, but return the position held before.
+        const_iterator operator++(int) noexcept {
+            auto old = *this;
+            operator++();
+            return old;
+        }
+        // Yields whatever the const tensor's operator() returns for index_.
+        decltype(auto) operator*() const noexcept { return tensor_(index_); }
+        // Equal when both refer to the same tensor object and index.
+        bool operator==(const const_iterator &other) const noexcept {
+            return &tensor_ == &other.tensor_ && index_ == other.index_;
+        }
+
+      private:
+        const basic_tensor &tensor_;
+        ranked_shape<shape_type::rank()> index_;
+    };
+
     static basic_tensor<T, Shape, Strides, MaxSize, IsView>
     from_scalar(T value) noexcept;
 
     operator const buffer_type &() const noexcept { return buffer(); }
     operator buffer_type &() noexcept { return buffer(); }
 
+    const_iterator begin() const noexcept {
+        return const_iterator(*this, ranked_shape<shape_type::rank()>{});
+    }
+
+    const_iterator end() const noexcept {
+        return const_iterator(*this,
+                              ranked_shape<shape_type::rank()>{shape()[0]});
+    }
+
     template <class Index, class UShape>
     constexpr tensor_view<T, UShape, Strides> view(Index index,
                                                    UShape shape) noexcept {
@@ -163,6 +208,27 @@ class basic_tensor
         }
     }
 
+    template <class Index, class UShape>
+    constexpr tensor_view<const T, UShape, Strides>
+    view(Index index, UShape shape) const noexcept {
+        if constexpr (is_fixed_dims_v<Strides>) {
+            auto offset = linear_offset(index, strides());
+            auto begin = elements().data() + offset;
+            if constexpr (is_fixed_dims_v<UShape>) {
+                constexpr size_t size = linear_size(shape, strides());
+                return {std::span<const T, size>(begin, size), shape,
+                        strides()};
+            } else {
+                size_t size = linear_size(shape, strides());
+                return {std::span(begin, size), shape, strides()};
+            }
+        } else {
+            return {elements().subspan(linear_offset(index, strides()),
+                                       linear_size(shape, strides())),
+                    shape, strides()};
+        }
+    }
+
     template <typename TNewShape>
     constexpr tensor_view<T, TNewShape, default_strides_t<TNewShape>>
     reshape(TNewShape shape) noexcept {
diff --git a/src/Native/include/nncase/ntt/ukernels.h b/src/Native/include/nncase/ntt/ukernels.h
index d497576387..6ff76198f9 100644
--- a/src/Native/include/nncase/ntt/ukernels.h
+++ b/src/Native/include/nncase/ntt/ukernels.h
@@ -13,6 +13,7 @@
  * limitations under the License.
  */
 #pragma once
+#include "apply.h"
 #include "primitive_ops.h"
 #include "tensor.h"
 #include "tensor_traits.h"
@@ -60,12 +61,49 @@ template <> struct reduce_to_binary_type<reduce_op::prod> {
     template <class T1, class T2> using type = ops::mul<T1, T2>;
 };
 
+template <reduce_op Op, class T, bool Arch> struct u_reduce_policy {
+    static constexpr size_t unroll = 1;
+};
+
 template <reduce_op Op, class T, bool Arch> struct u_reduce {
   public:
     constexpr T operator()(const T *input, size_t input_stride, size_t count,
                            T init_value) noexcept {
         using binary_op_t =
             typename reduce_to_binary_type<Op>::template type<T, T>;
+        using policy_t = u_reduce_policy<Op, T, Arch>;
+        constexpr auto unroll = policy_t::unroll;
+
+        if (count / unroll) {
+            T temp[unroll];
+#if 1
+            for (size_t i = 0; i < unroll; i++) {
+                temp[i] = *input;
+                input += input_stride;
+                count--;
+            }
+
+            while (count / unroll) {
+                for (size_t i = 0; i < unroll; i++) {
+                    temp[i] = binary_op_t()(temp[i], *input);
+                    input += input_stride;
+                    count--;
+                }
+            }
+
+            init_value = binary_op_t()(init_value, tree_reduce<unroll>(temp));
+#else
+            while (count / unroll) {
+                for (size_t i = 0; i < unroll; i++) {
+                    temp[i] = *input;
+                    input += input_stride;
+                    count--;
+                }
+                init_value =
+                    binary_op_t()(init_value, tree_reduce<unroll>(temp));
+            }
+#endif
+        }
 
         for (size_t i = 0; i < count; i++) {
             init_value = binary_op_t()(init_value, *input);
@@ -73,6 +111,17 @@ template <reduce_op Op, class T, bool Arch> struct u_reduce {
         }
         return init_value;
     }
+
+    template <size_t N> constexpr T tree_reduce(T *input) noexcept {
+        static_assert(N != 0 && (N & (N - 1)) == 0, "N must be 2^k");
+        if constexpr (N == 1) { // leaf: ends the halving recursion
+            return input[0];
+        } else { // reduce each half of input[0..N), then combine them
+            using op = typename reduce_to_binary_type<Op>::template type<T, T>;
+            return op()(tree_reduce<N / 2>(input),
+                        tree_reduce<N / 2>(input + N / 2));
+        }
+    }
 };
 } // namespace nncase::ntt::ukernels
 
diff --git a/src/Native/include/nncase/ntt/utility.h b/src/Native/include/nncase/ntt/utility.h
index d6532b1814..a72aa0bb16 100644
--- a/src/Native/include/nncase/ntt/utility.h
+++ b/src/Native/include/nncase/ntt/utility.h
@@ -14,6 +14,7 @@
  */
 #pragma once
 #include "shape.h"
+#include "tensor_traits.h"
 #include <cstddef>
 #include <cstring>
 #include <span>
@@ -64,6 +65,13 @@ template <int32_t Offset, template <size_t...> class A, size_t... Dims>
 inline constexpr auto shift_fixed_dims(A<Dims...>) {
     return A<(Dims - Offset)...>{};
 }
+
+template <IsFixedDims Axes, IsFixedTensor TTensor, size_t... Ints>
+constexpr auto
+fixed_reduce_source_shape_type(std::index_sequence<Ints...>) noexcept {
+    return fixed_shape<(Axes::contains(Ints) ? TTensor::shape_type::at(Ints)
+                                             : 1)...>{};
+}
 } // namespace utility_detail
 
 template <class U, class T, size_t Extent>
@@ -114,4 +122,31 @@ template <int32_t Offset, template <size_t...> class A, size_t... Dims>
 inline constexpr auto shift_fixed_dims(A<Dims...> a) {
     return utility_detail::shift_fixed_dims<Offset>(a);
 }
+
+template <IsFixedDims Axes, IsFixedTensor TTensor>
+constexpr auto fixed_reduce_source_shape_type() noexcept {
+    return utility_detail::fixed_reduce_source_shape_type<Axes, TTensor>(
+        std::make_index_sequence<TTensor::rank()>());
+}
+
+template <size_t InRank, IsFixedDims Axes, size_t OutRank>
+constexpr ranked_shape<InRank>
+reduce_source_offset(ranked_shape<OutRank> out_index) noexcept {
+    // Keep dims
+    if constexpr (InRank == OutRank) {
+        return out_index;
+    } else {
+        ranked_shape<InRank> in_index;
+        size_t shrinked_dims = 0;
+        for (size_t i = 0; i < InRank; i++) {
+            if (Axes::contains(i)) {
+                in_index[i] = 0;
+                shrinked_dims++;
+            } else {
+                in_index[i] = out_index[i - shrinked_dims];
+            }
+        }
+        return in_index;
+    }
+}
 } // namespace nncase::ntt

From 1a1594a7ad31af001485338f84613089bc90889a Mon Sep 17 00:00:00 2001
From: sunnycase <sunnycase@users.noreply.github.com>
Date: Fri, 13 Sep 2024 10:09:47 +0000
Subject: [PATCH 09/10] Apply code-format changes

---
 src/Native/include/nncase/ntt/kernels/reduce.h |  3 ++-
 src/Native/include/nncase/ntt/shape.h          | 12 +++++-------
 src/Native/include/nncase/ntt/tensor.h         |  3 +--
 3 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/src/Native/include/nncase/ntt/kernels/reduce.h b/src/Native/include/nncase/ntt/kernels/reduce.h
index 99d28a3d8b..2bcfcc81ca 100644
--- a/src/Native/include/nncase/ntt/kernels/reduce.h
+++ b/src/Native/include/nncase/ntt/kernels/reduce.h
@@ -55,7 +55,8 @@ class reduce_impl {
     constexpr void operator()(const TIn &input, TOut &output) {
         ntt::apply(output.shape(), [&](auto index) {
             auto reduced_in = (TInElem)initial_value();
-            apply_reduce(input, reduce_source_offset<TIn::rank(), Axes>(index), reduced_in);
+            apply_reduce(input, reduce_source_offset<TIn::rank(), Axes>(index),
+                         reduced_in);
             if constexpr (IsScalar<TOutElem>) {
                 output(index) = ntt::reduce<
                     ukernels::reduce_to_binary_type<Op>::template type,
diff --git a/src/Native/include/nncase/ntt/shape.h b/src/Native/include/nncase/ntt/shape.h
index 31e3f06ae3..7ab0a97eba 100644
--- a/src/Native/include/nncase/ntt/shape.h
+++ b/src/Native/include/nncase/ntt/shape.h
@@ -75,9 +75,7 @@ struct fixed_shape : detail::fixed_dims_base<Dims...> {
         using type = fixed_shape<I, Dims...>;
     };
 
-    template <size_t I> struct append {
-        using type = fixed_shape<Dims..., I>;
-    };
+    template <size_t I> struct append { using type = fixed_shape<Dims..., I>; };
 
     static constexpr size_t length() noexcept { return (Dims * ... * 1); }
 };
@@ -273,10 +271,10 @@ constexpr size_t contiguous_dims(const Shape &shape, const Strides &strides) {
 }
 
 template <class Shape, class Strides>
-inline constexpr size_t max_size_v =
-    (is_fixed_dims_v<Shape> && is_fixed_dims_v<Strides>)
-        ? linear_size(Shape{}, Strides{})
-        : std::dynamic_extent;
+inline constexpr size_t max_size_v = (is_fixed_dims_v<Shape> &&
+                                      is_fixed_dims_v<Strides>)
+                                         ? linear_size(Shape{}, Strides{})
+                                         : std::dynamic_extent;
 
 template <class Index, class Shape>
 constexpr bool in_bound(const Index &index, const Shape &shape) {
diff --git a/src/Native/include/nncase/ntt/tensor.h b/src/Native/include/nncase/ntt/tensor.h
index e3fcd4c125..82b15552c6 100644
--- a/src/Native/include/nncase/ntt/tensor.h
+++ b/src/Native/include/nncase/ntt/tensor.h
@@ -42,8 +42,7 @@ struct fixed_tensor_alike_type<basic_tensor<T, Shape, Strides, MaxSize, IsView>,
 
 namespace detail {
 template <class T, class Shape, class Strides, size_t MaxSize, bool IsView,
-          bool IsFixedShape =
-              is_fixed_dims_v<Shape> && is_fixed_dims_v<Strides>>
+          bool IsFixedShape = is_fixed_dims_v<Shape> &&is_fixed_dims_v<Strides>>
 class tensor_impl;
 
 // dynamic tensor

From 0277d26e171e1d55caf5de28ad86826fe73c6620 Mon Sep 17 00:00:00 2001
From: sunnycase <sunnycase@live.cn>
Date: Fri, 13 Sep 2024 10:13:44 +0000
Subject: [PATCH 10/10] Avoid infinite recursion for tree_reduce

---
 src/Native/include/nncase/ntt/ukernels.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Native/include/nncase/ntt/ukernels.h b/src/Native/include/nncase/ntt/ukernels.h
index 6ff76198f9..e4045b7216 100644
--- a/src/Native/include/nncase/ntt/ukernels.h
+++ b/src/Native/include/nncase/ntt/ukernels.h
@@ -62,7 +62,7 @@ template <> struct reduce_to_binary_type<reduce_op::prod> {
 };
 
 template <reduce_op Op, class T, bool Arch> struct u_reduce_policy {
-    static constexpr size_t unroll = 1;
+    static constexpr size_t unroll = 2;
 };
 
 template <reduce_op Op, class T, bool Arch> struct u_reduce {