From 08342c6f9f09d06f284be7c219766e6657ff5b2b Mon Sep 17 00:00:00 2001 From: sunnycase Date: Thu, 12 Sep 2024 01:39:32 +0000 Subject: [PATCH 01/10] [ntt.x86] Refactor reduce kernel --- .../nncase/ntt/arch/x86_64/primitive_ops.h | 54 ++++ .../nncase/ntt/kernels/packed_layer_norm.h | 144 --------- .../nncase/ntt/kernels/packed_softmax.h | 110 ------- .../include/nncase/ntt/kernels/reduce.h | 241 ++++++++------ src/Native/include/nncase/ntt/ntt.h | 2 - src/Native/include/nncase/ntt/primitive_ops.h | 37 ++- src/Native/include/nncase/ntt/tensor_ops.h | 24 +- src/Native/src/test.cpp | 298 +----------------- .../benchmark_test/benchmark_ntt_reduce.cpp | 224 +++++-------- src/Native/test/ctest/test_ntt_reduce.cpp | 120 +++---- 10 files changed, 363 insertions(+), 891 deletions(-) delete mode 100644 src/Native/include/nncase/ntt/kernels/packed_layer_norm.h delete mode 100644 src/Native/include/nncase/ntt/kernels/packed_softmax.h diff --git a/src/Native/include/nncase/ntt/arch/x86_64/primitive_ops.h b/src/Native/include/nncase/ntt/arch/x86_64/primitive_ops.h index caa7a621f2..26c553e989 100644 --- a/src/Native/include/nncase/ntt/arch/x86_64/primitive_ops.h +++ b/src/Native/include/nncase/ntt/arch/x86_64/primitive_ops.h @@ -651,6 +651,60 @@ template <> struct max> { } }; +template <> struct reduce> { + float operator()(const ntt::vector &v, + float init_value) const noexcept { + return init_value + operator()(v); + } + + float operator()(const ntt::vector &v) const noexcept { + // Sum the elements in the 256-bit vector directly + __m128 sum = + _mm_add_ps(_mm256_castps256_ps128(v), _mm256_extractf128_ps(v, 1)); + sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum)); + sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 1)); + + // Extract and return the final sum + return _mm_cvtss_f32(sum); + } +}; + +template <> struct reduce> { + float operator()(const ntt::vector &v, + float init_value) const noexcept { + return ntt::max(init_value, operator()(v)); + } + + float operator()(const ntt::vector &v) const noexcept { + // Sum the elements in the 256-bit vector directly + __m128 sum = + _mm_max_ps(_mm256_castps256_ps128(v), _mm256_extractf128_ps(v, 1)); + sum = _mm_max_ps(sum, _mm_movehl_ps(sum, sum)); + sum = _mm_max_ss(sum, _mm_shuffle_ps(sum, sum, 1)); + + // Extract and return the final sum + return _mm_cvtss_f32(sum); + } +}; + +template <> struct reduce> { + float operator()(const ntt::vector &v, + float init_value) const noexcept { + return ntt::min(init_value, operator()(v)); + } + + float operator()(const ntt::vector &v) const noexcept { + // Sum the elements in the 256-bit vector directly + __m128 sum = + _mm_min_ps(_mm256_castps256_ps128(v), _mm256_extractf128_ps(v, 1)); + sum = _mm_min_ps(sum, _mm_movehl_ps(sum, sum)); + sum = _mm_min_ss(sum, _mm_shuffle_ps(sum, sum, 1)); + + // Extract and return the final sum + return _mm_cvtss_f32(sum); + } +}; + template struct mma, ntt::vector, ntt::vector> { diff --git a/src/Native/include/nncase/ntt/kernels/packed_layer_norm.h b/src/Native/include/nncase/ntt/kernels/packed_layer_norm.h deleted file mode 100644 index 6432e7685c..0000000000 --- a/src/Native/include/nncase/ntt/kernels/packed_layer_norm.h +++ /dev/null @@ -1,144 +0,0 @@ -/* Copyright 2019-2021 Canaan Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once -#include "../apply.h" -#include "../tensor_ops.h" -#include "../utility.h" -#include "binary.h" -#include "unary.h" - -namespace nncase::ntt { - -namespace packed_layer_norm_detail { - -template -void within_axis_pack_impl(const TIn &input, const TScale &scale, - const TBias &bias, TOut &&output, const TEp &epsilon, - const bool &use_mean, PackedAxes, PadedNums) { - using TElem = typename TIn::element_type; - constexpr auto input_shape = typename TIn::shape_type{}; - constexpr auto input_strides = typename TIn::strides_type{}; - constexpr auto scale_shape = typename TScale::shape_type{}; - constexpr auto scale_strides = typename TScale::strides_type{}; - constexpr auto bias_shape = typename TBias::shape_type{}; - constexpr auto bias_strides = typename TBias::strides_type{}; - constexpr auto output_shape = typename std::decay_t::shape_type{}; - constexpr auto output_strides = typename std::decay_t::strides_type{}; - constexpr size_t in_contigous_dim = - contiguous_dims(input_shape, input_strides); - constexpr size_t scale_contiguous_dims = - contiguous_dims(scale_shape, scale_strides); - constexpr size_t bias_contiguous_dims = - contiguous_dims(bias_shape, bias_strides); - constexpr size_t output_contiguous_dims = - contiguous_dims(output_shape, output_strides); - static_assert(in_contigous_dim != 0 || scale_contiguous_dims != 0 || - bias_contiguous_dims != 0 || output_contiguous_dims != 0, - "currently not support no contiguous!"); - static_assert(is_same_seq(input_shape, output_shape), "shape not match"); - static_assert(is_same_seq(input_strides, output_strides), - "strides not match"); - constexpr auto domain = slice_fixed_dims(input_shape); - constexpr auto strides = slice_fixed_dims(input_strides); - - constexpr size_t inner_size = - slice_fixed_dims(input_shape).length(); - // constexpr size_t no_paded_rank = - // PackedAxes::rank() == 0 ? 0 - // : input_shape.rank() - PackedAxes::at(0) - 1; - // constexpr size_t paded_axis = - // PackedAxes::rank() == 0 ? 0 : PackedAxes::at(0) + 1; - // // clang-format off - // constexpr size_t paded_inner_size = (PadedNums::rank() == 0 || - // (PadedNums::rank() == 1 && PadedNums::at(0) == 0)) - // ? 0 - // : PadedNums::at(0) * slice_fixed_dims(input_shape).length(); - // // clang-format on - constexpr bool UseVectorReduce = - PackedAxes::rank() == 1 && PackedAxes::at(0) >= Axis; - - TElem finner_size = (TElem)inner_size; - if constexpr (UseVectorReduce) { - finner_size = finner_size * (TElem)TElem::shape_type::length(); - } - // remove pad nums, NOTE after mul elem size - // finner_size = sub_op(finner_size, paded_inner_size); - - apply(domain, [&](auto index) { - const auto input_p = - input.elements().data() + linear_offset(index, strides); - const auto scale_p = scale.elements().data(); - const auto bias_p = bias.elements().data(); - auto output_p = - output.elements().data() + linear_offset(index, strides); - - // compute mean - TElem mean1 = (TElem)0; - if (use_mean) { - for (size_t i = 0; i < inner_size; i++) - mean1 = mean1 + (input_p[i] / finner_size); - if constexpr (UseVectorReduce) { - mean1 = (TElem)reduce_sum(mean1); - } - } - - std::array sub; - for (auto i = 0; i < inner_size; i++) - sub[i] = input_p[i] - mean1; - - std::array pow; - for (auto i = 0; i < inner_size; i++) - pow[i] = sub[i] * sub[i]; - - TElem mean2 = (TElem)0; - for (auto i = 0; i < inner_size; i++) - mean2 = mean2 + (pow[i] / finner_size); - if constexpr (UseVectorReduce) { - mean2 = (TElem)reduce_sum(mean2); - } - - TElem add = mean2 + epsilon; - TElem sqrt = ntt::sqrt(add); - - std::array norm; - for (auto i = 0; i < inner_size; i++) - norm[i] = sub[i] / sqrt; - - for (auto i = 0; i < inner_size; i++) - output_p[i] = (norm[i] * scale_p[i]) + (TElem)bias_p[i]; - }); -} -} // namespace packed_layer_norm_detail - -template -void packed_layer_norm(const TIn &input, const TScale &scale, const TBias &bias, - TOut &&output, const TEp &epsilon, const bool &use_mean, - PackedAxes packedAxes, PadedNums padedNums) { - static_assert(PackedAxes::rank() < 2, "currently not support 2d packing."); - if constexpr (PackedAxes::rank() <= 1) { - static_assert(PadedNums::rank() == 0 || - (PadedNums::rank() == 1 && PadedNums::at(0) == 0), - "not support padding"); - packed_layer_norm_detail::within_axis_pack_impl( - input, scale, bias, output, epsilon, use_mean, packedAxes, - padedNums); - } -} -} // namespace nncase::ntt diff --git a/src/Native/include/nncase/ntt/kernels/packed_softmax.h b/src/Native/include/nncase/ntt/kernels/packed_softmax.h deleted file mode 100644 index dbeebb40e6..0000000000 --- a/src/Native/include/nncase/ntt/kernels/packed_softmax.h +++ /dev/null @@ -1,110 +0,0 @@ -/* Copyright 2019-2021 Canaan Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once -#include "../apply.h" -#include "../shape_infer/reduce_axis.h" -#include "../tensor_ops.h" -#include "../utility.h" -#include "binary.h" -#include "unary.h" -#include - -namespace nncase::ntt { - -namespace softmax_detail { -template -void packed_on_axis_impl(const TIn &input, TOut &&output, - [[maybe_unused]] PackedAxes packedAxes) { - using TElem = typename TIn::element_type; - constexpr auto input_shape = typename TIn::shape_type{}; - constexpr auto output_shape = typename std::decay_t::shape_type{}; - static_assert(is_same_seq(input_shape, output_shape), - "the input output shape not equal!"); - - constexpr auto need_reduce = - PackedAxes::rank() != 0 && Axis == PackedAxes::at(0); - constexpr auto domain = - shape_infer::reduced_shape_by_axis(input_shape); - apply(domain, [&](auto index) { - // max - TElem max_value = input(index); - for (index[Axis] = 0; index[Axis] < input_shape.at(Axis); - index[Axis]++) { - max_value = max(max_value, input(index)); - } - - // reduce_max - if constexpr (need_reduce) { - max_value = (TElem)reduce_max(max_value); - } - - // (x - reduce_max) * beta - for (index[Axis] = 0; index[Axis] < input_shape.at(Axis); - index[Axis]++) { - output(index) = input(index) - max_value; - } - - // exp((x - reduce_max) * beta) and sum - TElem sum = (TElem)0; - for (index[Axis] = 0; index[Axis] < input_shape.at(Axis); - index[Axis]++) { - output(index) = exp(output(index)); - sum += output(index); - } - - // reduce sum - if constexpr (need_reduce) { - sum = (TElem)reduce_sum(sum); - } - - // div - for (index[Axis] = 0; index[Axis] < input_shape.at(Axis); - index[Axis]++) { - output(index) = output(index) / sum; - } - }); -} - -template -void packed_softmax_1d(const TIn &input, TOut &&output, PackedAxes packedAxes) { - packed_on_axis_impl(input, output, packedAxes); -} - -} // namespace softmax_detail - -/** - * @brief packed softmax - * implement notice: - * 1. need support 2d pack. - * 2. need support paded nums. - * 3. need different implementation when the packed axis is equal or not - * equal axis. - * @tparam Axis softmax reduced axis - * @param input input tensor. - * @param output output output. - * @param packedAxes packed axes - */ -template -void packed_softmax(const TIn &input, TOut &&output, - [[maybe_unused]] PackedAxes packedAxes - /* , [[maybe_unused]] PadedNums padednums */) noexcept { - static_assert(PackedAxes::rank() < 2, "currently not support 2d pack"); - // static_assert(PadedNums::at(0) == 0, "currently not support pad"); - softmax_detail::packed_softmax_1d(input, output, packedAxes); -} -} // namespace nncase::ntt diff --git a/src/Native/include/nncase/ntt/kernels/reduce.h b/src/Native/include/nncase/ntt/kernels/reduce.h index 70d8ab05f5..593d7055d4 100644 --- a/src/Native/include/nncase/ntt/kernels/reduce.h +++ b/src/Native/include/nncase/ntt/kernels/reduce.h @@ -14,132 +14,169 @@ */ #pragma once #include "../apply.h" -#include "../loop.h" #include "../primitive_ops.h" +#include "../profiler.h" +#include "../shape_infer/reduce.h" #include "../tensor_ops.h" -#include "../unrool.h" +#include "../tensor_traits.h" #include "../utility.h" +#include "nncase/ntt/shape.h" +#include +#include +#include namespace nncase::ntt { +enum class reduce_op { + mean, + min, + max, + sum, + prod, +}; -namespace reduce_detail { - -template