diff --git a/extension/llm/custom_ops/op_sdpa.cpp b/extension/llm/custom_ops/op_sdpa.cpp
index 50fa933833..f9786c575e 100644
--- a/extension/llm/custom_ops/op_sdpa.cpp
+++ b/extension/llm/custom_ops/op_sdpa.cpp
@@ -8,9 +8,10 @@
 #include
+#include
+#include
 #include
 #include
-#include
 #include // @lint-ignore CLANGTIDY facebook-unused-include-check
 #include
@@ -34,18 +35,10 @@ namespace util {
 constexpr size_t kKVDim = 4;
 
 template <typename T>
-inline void _store(T* dst, ::executorch::vec::Vectorized<T> src) {
+inline void _store(T* dst, ::at::vec::Vectorized<T> src) {
   src.store(dst);
 }
 
-/*
-inline void _store(::Half* dst, at::vec::Vectorized src) {
-  //fp16_ieee_to_fp32_value
-  auto res = at::vec::convert_float_half(src, src);
-  res.store(dst, at::vec::Vectorized::size());
-}
-*/
-
 template <typename T>
 inline T data_index_init(T offset) {
   return offset;
@@ -78,7 +71,7 @@ inline double calculate_scale(const Tensor& query, optional<double> scale) {
   }
 } // namespace util
 
-namespace vec = ::executorch::vec;
+namespace vec = ::at::vec;
 using Tensor = exec_aten::Tensor;
 
 namespace {
diff --git a/kernels/optimized/cpu/moments_utils.h b/kernels/optimized/cpu/moments_utils.h
index 97ea25fa6d..1bba201f84 100644
--- a/kernels/optimized/cpu/moments_utils.h
+++ b/kernels/optimized/cpu/moments_utils.h
@@ -12,7 +12,7 @@
 // for use in optimized ExecuTorch ops. Template specializations of BFloat16
 // are excluded.
 
-#include
+#include
 
 #include
 #include
@@ -47,12 +47,12 @@ void AddMoments(
 
 template <typename T>
 ET_INLINE void AddMomentsVec(
     int64_t m0_add,
-    const executorch::vec::Vectorized<T>& m1_add,
-    const executorch::vec::Vectorized<T>& m2_add,
+    const at::vec::Vectorized<T>& m1_add,
+    const at::vec::Vectorized<T>& m2_add,
     int64_t& m0,
-    executorch::vec::Vectorized<T>& m1,
-    executorch::vec::Vectorized<T>& m2) {
-  using Vec = executorch::vec::Vectorized<T>;
+    at::vec::Vectorized<T>& m1,
+    at::vec::Vectorized<T>& m2) {
+  using Vec = at::vec::Vectorized<T>;
   const int64_t n = m0 + m0_add;
   const T c = n == 0 ?
static_cast(0) : static_cast(m0_add) / static_cast(n); @@ -67,11 +67,11 @@ template inline void UpdateMomentsVec( int64_t m0, const T* X_ptr, - const std::array>, kChunkSize>& c_vecs, + const std::array>, kChunkSize>& c_vecs, int64_t& m0_stk0, - executorch::vec::Vectorized>& m1_stk0, - executorch::vec::Vectorized>& m2_stk0) { - using Vec = executorch::vec::Vectorized>; + at::vec::Vectorized>& m1_stk0, + at::vec::Vectorized>& m2_stk0) { + using Vec = at::vec::Vectorized>; Vec m1_vec(0); Vec m2_vec(0); for (int64_t j = 0; j < m0; ++j) { @@ -92,13 +92,13 @@ std::pair, acc_t> RowwiseMomentsImpl(const T* X, int64_t N, int64_t ddof = 0) { using T_ACC = acc_t; - constexpr int64_t kVecSize = executorch::vec::Vectorized::size(); - constexpr int64_t kAccVecSize = executorch::vec::Vectorized::size(); + constexpr int64_t kVecSize = at::vec::Vectorized::size(); + constexpr int64_t kAccVecSize = at::vec::Vectorized::size(); const int64_t n = N / kVecSize; const int64_t m = executorch::utils::divup(n, kChunkSize); const int64_t depth = executorch::utils::CeilLog2(m); - using Vec = executorch::vec::Vectorized; + using Vec = at::vec::Vectorized; const Vec kZeroVec(T_ACC(0)); std::array m0_stk; std::array m1_stk; @@ -168,7 +168,7 @@ RowwiseMomentsImpl(const T* X, int64_t N, int64_t ddof = 0) { template std::pair, acc_t> RowwiseMoments(const T* X, int64_t N, int64_t ddof = 0) { - using Vec = executorch::vec::Vectorized; + using Vec = at::vec::Vectorized; constexpr int64_t kVecSize = Vec::size(); const int64_t n = N / kVecSize; const int64_t m = executorch::utils::divup(n, kChunkSize); diff --git a/kernels/optimized/cpu/op_add.cpp b/kernels/optimized/cpu/op_add.cpp index 2b31a8d5db..5c1f8f5a15 100644 --- a/kernels/optimized/cpu/op_add.cpp +++ b/kernels/optimized/cpu/op_add.cpp @@ -6,9 +6,10 @@ * LICENSE file in the root directory of this source tree. */ +#include +#include #include #include -#include #include #include #include @@ -99,8 +100,8 @@ Tensor& opt_add_out( CTYPE_B b_val = *b.const_data_ptr(); CTYPE b_casted = static_cast(b_val); - using Vec = executorch::vec::Vectorized; - executorch::vec::map( + using Vec = at::vec::Vectorized; + at::vec::map( [alpha_val, b_casted](Vec x) { return x + Vec(alpha_val * b_casted); }, @@ -131,8 +132,8 @@ Tensor& opt_add_out( ET_KERNEL_CHECK( ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, ); - using Vec = executorch::vec::Vectorized; - executorch::vec::map2( + using Vec = at::vec::Vectorized; + at::vec::map2( [alpha_val](Vec x, Vec y) { return x + Vec(alpha_val) * y; }, out.mutable_data_ptr(), a.const_data_ptr(), @@ -166,7 +167,7 @@ Tensor& opt_add_out( ET_KERNEL_CHECK( ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, ); - using Vec = executorch::vec::Vectorized; + using Vec = at::vec::Vectorized; executorch::vec::broadcasting_map_2d_by_1d( [alpha_val](Vec x, Vec y) { return x + Vec(alpha_val) * y; }, out.mutable_data_ptr(), @@ -244,8 +245,8 @@ Tensor& opt_add_scalar_out( CTYPE alpha_val; ET_EXTRACT_SCALAR(alpha, alpha_val); - using Vec = executorch::vec::Vectorized; - executorch::vec::map( + using Vec = at::vec::Vectorized; + at::vec::map( [alpha_val, b_casted](Vec x) { return x + Vec(alpha_val * b_casted); }, diff --git a/kernels/optimized/cpu/op_div.cpp b/kernels/optimized/cpu/op_div.cpp index 4d7b8efe9e..3d895cc388 100644 --- a/kernels/optimized/cpu/op_div.cpp +++ b/kernels/optimized/cpu/op_div.cpp @@ -6,9 +6,10 @@ * LICENSE file in the root directory of this source tree. 
*/ +#include +#include #include #include -#include #include #include #include @@ -76,16 +77,16 @@ Tensor& opt_div_out( CTYPE_SCALAR scalar_val = *scalar->const_data_ptr(); CTYPE scalar_casted = static_cast(scalar_val); - using Vec = executorch::vec::Vectorized; + using Vec = at::vec::Vectorized; if (a.numel() == 1) { - executorch::vec::map( + at::vec::map( [scalar_casted](Vec x) { return Vec(scalar_casted) / x; }, out.mutable_data_ptr(), tensor->const_data_ptr(), out.numel()); } else { Vec inv_scalar_casted_vec(CTYPE(1) / scalar_casted); - executorch::vec::map( + at::vec::map( [inv_scalar_casted_vec](Vec x) { return x * inv_scalar_casted_vec; }, @@ -111,8 +112,8 @@ Tensor& opt_div_out( "Failed to resize output tensor."); ET_SWITCH_REAL_TYPES_AND(Bool, out_type, ctx, "div.out", CTYPE, [&]() { - using Vec = executorch::vec::Vectorized; - executorch::vec::map2( + using Vec = at::vec::Vectorized; + at::vec::map2( [](Vec x, Vec y) { return x / y; }, out.mutable_data_ptr(), a.const_data_ptr(), @@ -142,7 +143,7 @@ Tensor& opt_div_out( out, "Failed to resize output tensor."); ET_SWITCH_REALB_TYPES(out_type, ctx, "sub.out", CTYPE, [&]() { - using Vec = executorch::vec::Vectorized; + using Vec = at::vec::Vectorized; if (selected_optimized_path == ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments) { executorch::vec::broadcasting_map_2d_by_1d( @@ -222,9 +223,9 @@ Tensor& opt_div_scalar_out( ET_EXTRACT_SCALAR(b, b_val); CTYPE b_casted = static_cast(b_val); - using Vec = executorch::vec::Vectorized; + using Vec = at::vec::Vectorized; Vec inv_b_casted_vec(CTYPE(1) / b_casted); - executorch::vec::map( + at::vec::map( [inv_b_casted_vec](Vec x) { return x * inv_b_casted_vec; }, out.mutable_data_ptr(), a.const_data_ptr(), diff --git a/kernels/optimized/cpu/op_exp.cpp b/kernels/optimized/cpu/op_exp.cpp index 8c234d3d1d..3692501678 100644 --- a/kernels/optimized/cpu/op_exp.cpp +++ b/kernels/optimized/cpu/op_exp.cpp @@ -8,8 +8,9 @@ #include +#include +#include #include -#include #include namespace torch { @@ -34,8 +35,8 @@ void exp_data( const CTYPE_IN* in_data, const size_t numel, CTYPE_OUT* out_data) { - using Vec = executorch::vec::Vectorized; - executorch::vec::map( + using Vec = at::vec::Vectorized; + at::vec::map( [](Vec x) { return x.exp(); }, out_data, in_data, numel); } diff --git a/kernels/optimized/cpu/op_le.cpp b/kernels/optimized/cpu/op_le.cpp index 3f4a1fec8b..5f929d9d54 100644 --- a/kernels/optimized/cpu/op_le.cpp +++ b/kernels/optimized/cpu/op_le.cpp @@ -6,8 +6,9 @@ * LICENSE file in the root directory of this source tree. 
*/ +#include +#include #include -#include #include #include #include @@ -44,8 +45,8 @@ Tensor& opt_le_tensor_out( if (a_type == b_type && a_type == out_type) { ET_SWITCH_REAL_TYPES_AND( Bool, out_type, ctx, "le.Tensor_out", CTYPE, [&]() { - using Vec = executorch::vec::Vectorized; - executorch::vec::map2( + using Vec = at::vec::Vectorized; + at::vec::map2( [](Vec x, Vec y) { return x.le(y); }, out.mutable_data_ptr(), a.const_data_ptr(), @@ -109,8 +110,8 @@ Tensor& opt_le_scalar_out( CTYPE_B b_val = 0; ET_EXTRACT_SCALAR(b, b_val); CTYPE b_casted = static_cast(b_val); - using Vec = executorch::vec::Vectorized; - executorch::vec::map( + using Vec = at::vec::Vectorized; + at::vec::map( [b_casted](Vec x) { return x.le(Vec(b_casted)); }, out.mutable_data_ptr(), a.const_data_ptr(), diff --git a/kernels/optimized/cpu/op_mul.cpp b/kernels/optimized/cpu/op_mul.cpp index 7feee4e156..e803fe0b24 100644 --- a/kernels/optimized/cpu/op_mul.cpp +++ b/kernels/optimized/cpu/op_mul.cpp @@ -6,9 +6,10 @@ * LICENSE file in the root directory of this source tree. */ +#include +#include #include #include -#include #include #include #include // IWYU pragma: export @@ -95,7 +96,7 @@ Tensor& handle_last_dim_broadcast( const size_t outer_size = getLeadingDims(out, out.dim() - 1); const auto broadcast_size = out.size(out.dim() - 1); ET_SWITCH_REALB_TYPES(out_type, ctx, "mul.out", CTYPE, [&]() { - using Vec = executorch::vec::Vectorized; + using Vec = at::vec::Vectorized; executorch::vec::broadcasting_map_broadcast_last_dim( [](Vec x, Vec y) { return x * y; }, out.mutable_data_ptr(), @@ -164,7 +165,7 @@ Tensor& handle_broadcast_mul( inner_size = lhs->sizes()[lhs->dim() - 1]; } ET_SWITCH_REALB_TYPES(out_type, ctx, "mul.out", CTYPE, [&]() { - using Vec = executorch::vec::Vectorized; + using Vec = at::vec::Vectorized; executorch::vec::broadcasting_map_3d_and_unsqueezed_3d( [](Vec x, Vec y) { return x * y; }, out.mutable_data_ptr(), @@ -203,8 +204,8 @@ Tensor& opt_mul_out( CTYPE_B b_val = *b.const_data_ptr(); CTYPE b_casted = static_cast(b_val); - using Vec = executorch::vec::Vectorized; - executorch::vec::map( + using Vec = at::vec::Vectorized; + at::vec::map( [b_casted](Vec x) { return x * Vec(b_casted); }, out.mutable_data_ptr(), a.const_data_ptr(), @@ -229,8 +230,8 @@ Tensor& opt_mul_out( "Failed to resize output tensor."); ET_SWITCH_REALB_TYPES(out_type, ctx, "mul.out", CTYPE, [&]() { - using Vec = executorch::vec::Vectorized; - executorch::vec::map2( + using Vec = at::vec::Vectorized; + at::vec::map2( [](Vec x, Vec y) { return x * y; }, out.mutable_data_ptr(), a.const_data_ptr(), @@ -306,8 +307,8 @@ Tensor& opt_mul_scalar_out( ET_EXTRACT_SCALAR(b, b_val); CTYPE b_casted = static_cast(b_val); - using Vec = executorch::vec::Vectorized; - executorch::vec::map( + using Vec = at::vec::Vectorized; + at::vec::map( [b_casted](Vec x) { return x * Vec(b_casted); }, out.mutable_data_ptr(), a.const_data_ptr(), diff --git a/kernels/optimized/cpu/op_native_layer_norm.cpp b/kernels/optimized/cpu/op_native_layer_norm.cpp index d04265f336..8b41625abe 100644 --- a/kernels/optimized/cpu/op_native_layer_norm.cpp +++ b/kernels/optimized/cpu/op_native_layer_norm.cpp @@ -10,9 +10,10 @@ #include #include +#include +#include #include #include -#include #include namespace torch { @@ -33,7 +34,7 @@ void layer_norm( Tensor& out, Tensor& mean, Tensor& rstd) { - using Vec = executorch::vec::Vectorized; + using Vec = at::vec::Vectorized; const size_t dim = input.dim() - normalized_shape.size(); const size_t dim_size = input.size(dim); @@ -93,7 
+94,7 @@ void layer_norm( dst_ptr[j] = (src_ptr[j] * scale + offset) * gamma_v + beta_v; } } else { - executorch::vec::map3( + at::vec::map3( [scale, offset](Vec x, Vec gamma, Vec beta) { return (x * Vec(scale) + Vec(offset)) * gamma + beta; }, diff --git a/kernels/optimized/cpu/op_neg.cpp b/kernels/optimized/cpu/op_neg.cpp index c46a004e0b..00bde2c136 100644 --- a/kernels/optimized/cpu/op_neg.cpp +++ b/kernels/optimized/cpu/op_neg.cpp @@ -6,8 +6,9 @@ * LICENSE file in the root directory of this source tree. */ +#include +#include #include -#include #include namespace torch { @@ -27,8 +28,8 @@ Tensor& opt_neg_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { "Failed to resize output tensor."); ET_SWITCH_REAL_TYPES(in.scalar_type(), ctx, "neg.out", CTYPE, [&] { - using Vec = executorch::vec::Vectorized; - executorch::vec::map( + using Vec = at::vec::Vectorized; + at::vec::map( [](Vec x) { return x.neg(); }, out.mutable_data_ptr(), in.const_data_ptr(), diff --git a/kernels/optimized/cpu/op_sigmoid.cpp b/kernels/optimized/cpu/op_sigmoid.cpp index 751038cc8b..840f38d400 100644 --- a/kernels/optimized/cpu/op_sigmoid.cpp +++ b/kernels/optimized/cpu/op_sigmoid.cpp @@ -8,8 +8,9 @@ #include +#include +#include #include -#include #include namespace torch { @@ -33,8 +34,8 @@ void sigmoid_data( const CTYPE_IN* in_data, const size_t numel, CTYPE_OUT* out_data) { - using Vec = executorch::vec::Vectorized; - executorch::vec::map( + using Vec = at::vec::Vectorized; + at::vec::map( [](Vec x) { auto one_plus_exp = x.neg().exp() + Vec(static_cast(1.0)); return one_plus_exp.reciprocal(); diff --git a/kernels/optimized/cpu/op_sub.cpp b/kernels/optimized/cpu/op_sub.cpp index 51ff4fbd57..622cefb4d1 100644 --- a/kernels/optimized/cpu/op_sub.cpp +++ b/kernels/optimized/cpu/op_sub.cpp @@ -6,9 +6,10 @@ * LICENSE file in the root directory of this source tree. 
*/ +#include +#include #include #include -#include #include #include #include @@ -114,9 +115,9 @@ Tensor& opt_sub_out( CTYPE_SCALAR scalar_val = *scalar->const_data_ptr(); CTYPE scalar_casted = static_cast(scalar_val); - using Vec = executorch::vec::Vectorized; + using Vec = at::vec::Vectorized; if (a.numel() == 1) { - executorch::vec::map( + at::vec::map( [alpha_val, scalar_casted](Vec x) { return Vec(scalar_casted) - Vec(alpha_val) * x; }, @@ -124,7 +125,7 @@ Tensor& opt_sub_out( tensor->const_data_ptr(), out.numel()); } else { - executorch::vec::map( + at::vec::map( [alpha_val, scalar_casted](Vec x) { return x - Vec(alpha_val * scalar_casted); }, @@ -154,8 +155,8 @@ Tensor& opt_sub_out( ET_KERNEL_CHECK( ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, ); - using Vec = executorch::vec::Vectorized; - executorch::vec::map2( + using Vec = at::vec::Vectorized; + at::vec::map2( [alpha_val](Vec x, Vec y) { return x - Vec(alpha_val) * y; }, out.mutable_data_ptr(), a.const_data_ptr(), @@ -189,7 +190,7 @@ Tensor& opt_sub_out( ET_KERNEL_CHECK( ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, ); - using Vec = executorch::vec::Vectorized; + using Vec = at::vec::Vectorized; if (selected_optimized_path == ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments) { executorch::vec::broadcasting_map_2d_by_1d( @@ -279,8 +280,8 @@ Tensor& opt_sub_scalar_out( CTYPE alpha_val; ET_EXTRACT_SCALAR(alpha, alpha_val); - using Vec = executorch::vec::Vectorized; - executorch::vec::map( + using Vec = at::vec::Vectorized; + at::vec::map( [alpha_val, b_casted](Vec x) { return x - Vec(alpha_val * b_casted); }, diff --git a/kernels/optimized/cpu/targets.bzl b/kernels/optimized/cpu/targets.bzl index 0a6208e2c1..b5646964e6 100644 --- a/kernels/optimized/cpu/targets.bzl +++ b/kernels/optimized/cpu/targets.bzl @@ -5,6 +5,7 @@ _OPTIMIZED_ATEN_OPS = ( op_target( name = "op_add", deps = [ + ":aten_headers_for_executorch", ":binary_ops", "//executorch/kernels/portable/cpu:scalar_utils", "//executorch/kernels/portable/cpu/util:broadcast_util", @@ -19,13 +20,24 @@ _OPTIMIZED_ATEN_OPS = ( op_target( name = "op_div", deps = [ + ":aten_headers_for_executorch", ":binary_ops", "//executorch/kernels/portable/cpu:scalar_utils", "//executorch/kernels/portable/cpu/util:broadcast_util", ], ), - op_target(name = "op_exp"), - op_target(name = "op_sigmoid"), + op_target( + name = "op_exp", + deps = [ + ":aten_headers_for_executorch", + ], + ), + op_target( + name = "op_sigmoid", + deps = [ + ":aten_headers_for_executorch", + ], + ), op_target( name = "op_gelu", deps = [ @@ -35,6 +47,7 @@ _OPTIMIZED_ATEN_OPS = ( op_target( name = "op_le", deps = [ + ":aten_headers_for_executorch", "//executorch/kernels/portable/cpu:scalar_utils", ], ), @@ -67,6 +80,7 @@ _OPTIMIZED_ATEN_OPS = ( op_target( name = "op_mul", deps = [ + ":aten_headers_for_executorch", ":binary_ops", "//executorch/kernels/portable/cpu:scalar_utils", "//executorch/kernels/portable/cpu/util:broadcast_util", @@ -76,14 +90,21 @@ _OPTIMIZED_ATEN_OPS = ( op_target( name = "op_native_layer_norm", deps = [ + ":aten_headers_for_executorch", ":moments_utils", "//executorch/kernels/portable/cpu/util:normalization_ops_util", ], ), - op_target(name = "op_neg"), + op_target( + name = "op_neg", + deps = [ + ":aten_headers_for_executorch", + ], + ), op_target( name = "op_sub", deps = [ + ":aten_headers_for_executorch", ":binary_ops", "//executorch/kernels/portable/cpu:scalar_utils", "//executorch/kernels/portable/cpu/util:broadcast_util", @@ -172,6 +193,7 @@ def 
define_common_targets(): exported_headers = ["moments_utils.h"], visibility = ["//executorch/kernels/optimized/..."], exported_deps = [ + ":aten_headers_for_executorch", "//executorch/kernels/optimized:libvec", "//executorch/kernels/optimized:libutils", ], diff --git a/kernels/optimized/lib_defs.bzl b/kernels/optimized/lib_defs.bzl index 659c7afe09..a270b7030e 100644 --- a/kernels/optimized/lib_defs.bzl +++ b/kernels/optimized/lib_defs.bzl @@ -132,6 +132,9 @@ def define_libs(is_fbcode=False): "//executorch/...", "@EXECUTORCH_CLIENTS", ], + exported_deps = [ + "//executorch/kernels/optimized/cpu:aten_headers_for_executorch", + ], cxx_platform_deps = select({ "DEFAULT": [ ( diff --git a/kernels/optimized/test/libvec_test.cpp b/kernels/optimized/test/libvec_test.cpp deleted file mode 100644 index a8504b6bce..0000000000 --- a/kernels/optimized/test/libvec_test.cpp +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include -#include - -#include - -#define TEST_FORALL_SUPPORTED_CTYPES(_) \ - _(); \ - _(); \ - _(); \ - _(); - -namespace { - -// Fill a vector with a monotonic sequence of integer values -template -void fill_monotonic( - std::vector& arr, - const int start = 0, - const int step = 1) { - int value = start; - for (size_t i = 0; i < arr.size(); ++i) { - arr[i] = static_cast(value); - value += step; - } -} - -template -bool check_all_equal_to(std::vector& arr, const float value) { - for (size_t i = 0; i < arr.size(); ++i) { - if (arr[i] != static_cast(value)) { - return false; - } - } - return true; -} - -} // namespace - -template -void test_load_and_add() { - using Vec = executorch::vec::Vectorized; - - constexpr size_t kVecSize = static_cast(Vec::size()); - - std::vector in_1(kVecSize); - fill_monotonic(in_1); - - std::vector in_2(kVecSize); - fill_monotonic(in_2, kVecSize, -1); - - const Vec in_1_vec = Vec::loadu(in_1.data()); - const Vec in_2_vec = Vec::loadu(in_2.data()); - - const Vec out_vec = in_1_vec + in_2_vec; - - std::vector out(kVecSize); - out_vec.store(out.data()); - - EXPECT_TRUE(check_all_equal_to(out, static_cast(kVecSize))); -} - -TEST(VecFloatTest, LoadAndAdd) { - TEST_FORALL_SUPPORTED_CTYPES(test_load_and_add); -} diff --git a/kernels/optimized/test/targets.bzl b/kernels/optimized/test/targets.bzl index e4740a9ad7..919620ffaf 100644 --- a/kernels/optimized/test/targets.bzl +++ b/kernels/optimized/test/targets.bzl @@ -38,6 +38,5 @@ def define_common_targets(): """ define_supported_features_lib() - _lib_test_bin("libvec_test_bin") _lib_test_bin("moments_utils_test_bin", in_cpu = True) _lib_test_bin("libblas_test_bin") diff --git a/kernels/optimized/vec/functional_base.h b/kernels/optimized/vec/functional_base.h index f4113aafa7..cb00d7e4f4 100644 --- a/kernels/optimized/vec/functional_base.h +++ b/kernels/optimized/vec/functional_base.h @@ -11,7 +11,7 @@ // DO NOT DEFINE STATIC DATA IN THIS HEADER! 
// See Note [Do not compile initializers with AVX] -#include +#include namespace executorch { namespace vec { @@ -20,9 +20,9 @@ namespace vec { template inline scalar_t vec_reduce_all( const Op& vec_fun, - vec::Vectorized acc_vec, + at::vec::Vectorized acc_vec, int64_t size) { - using Vec = vec::Vectorized; + using Vec = at::vec::Vectorized; scalar_t acc_arr[Vec::size()]; acc_vec.store(acc_arr); for (int64_t i = 1; i < size; ++i) { @@ -37,8 +37,8 @@ inline scalar_t vec_reduce_all( template struct VecReduceAllSIMD { - static inline scalar_t apply(const Op& vec_fun, const Vectorized& acc_vec) { - return vec_reduce_all(vec_fun, acc_vec, Vectorized::size()); + static inline scalar_t apply(const Op& vec_fun, const at::vec::Vectorized& acc_vec) { + return vec_reduce_all(vec_fun, acc_vec, at::vec::Vectorized::size()); } }; @@ -46,8 +46,8 @@ struct VecReduceAllSIMD { #if defined(CPU_CAPABILITY_AVX2) template struct VecReduceAllSIMD { - static inline float apply(const Op& vec_fun, const Vectorized& acc_vec) { - using Vec = Vectorized; + static inline float apply(const Op& vec_fun, const at::vec::Vectorized& acc_vec) { + using Vec = at::vec::Vectorized; Vec v = acc_vec; // 128-bit shuffle Vec v1 = _mm256_permute2f128_ps(v, v, 0x1); @@ -65,8 +65,8 @@ struct VecReduceAllSIMD { #if defined(CPU_CAPABILITY_AVX512) template struct VecReduceAllSIMD { - static inline float apply(const Op& vec_fun, const Vectorized& acc_vec) { - using Vec = Vectorized; + static inline float apply(const Op& vec_fun, const at::vec::Vectorized& acc_vec) { + using Vec = at::vec::Vectorized; Vec v = acc_vec; // 256-bit shuffle Vec v1 = _mm512_shuffle_f32x4(v, v, 0x4E); @@ -87,13 +87,13 @@ struct VecReduceAllSIMD { #endif // defined(__GNUC__) && (__GNUC__ > 5) && !defined(_MSC_VER) && !defined(C10_MOBILE) template -inline scalar_t vec_reduce_all(const Op& vec_fun, const Vectorized& acc_vec) { +inline scalar_t vec_reduce_all(const Op& vec_fun, const at::vec::Vectorized& acc_vec) { return VecReduceAllSIMD::apply(vec_fun, acc_vec); } template inline scalar_t reduce_all(const Op& vec_fun, const scalar_t* data, int64_t size) { - using Vec = vec::Vectorized; + using Vec = at::vec::Vectorized; if (size < Vec::size()) return vec_reduce_all(vec_fun, Vec::loadu(data, size), size); int64_t d = Vec::size(); @@ -113,7 +113,7 @@ inline scalar_t reduce_all(const Op& vec_fun, const scalar_t* data, int64_t size template inline std::pair reduce2_all(const Op1& vec_fun1, const Op2& vec_fun2, const scalar_t* data, int64_t size) { - using Vec = vec::Vectorized; + using Vec = at::vec::Vectorized; if (size < Vec::size()) { auto loaded_data = Vec::loadu(data, size); return std::pair( @@ -144,7 +144,7 @@ inline scalar_t map_reduce_all( const ReduceOp& red_fun, const scalar_t* data, int64_t size) { - using Vec = vec::Vectorized; + using Vec = at::vec::Vectorized; if (size < Vec::size()) return vec_reduce_all(red_fun, map_fun(Vec::loadu(data, size)), size); int64_t d = Vec::size(); @@ -169,7 +169,7 @@ inline scalar_t map2_reduce_all( const scalar_t* data, const scalar_t* data2, int64_t size) { - using Vec = vec::Vectorized; + using Vec = at::vec::Vectorized; if (size < Vec::size()) { Vec data_vec = Vec::loadu(data, size); Vec data2_vec = Vec::loadu(data2, size); @@ -201,7 +201,7 @@ inline scalar_t map3_reduce_all( const scalar_t* data2, const scalar_t* data3, int64_t size) { - using Vec = vec::Vectorized; + using Vec = at::vec::Vectorized; if (size < Vec::size()) { Vec data_vec = Vec::loadu(data, size); Vec data2_vec = Vec::loadu(data2, size); @@ -235,7 
+235,7 @@ inline void map( scalar_t* output_data, const scalar_t* input_data, int64_t size) { - using Vec = vec::Vectorized; + using Vec = at::vec::Vectorized; int64_t d = 0; for (; d < size - (size % Vec::size()); d += Vec::size()) { Vec output_vec = vec_fun(Vec::loadu(input_data + d)); @@ -254,7 +254,7 @@ inline void map2( const scalar_t* input_data, const scalar_t* input_data2, int64_t size) { - using Vec = vec::Vectorized; + using Vec = at::vec::Vectorized; int64_t d = 0; for (; d < size - (size % Vec::size()); d += Vec::size()) { Vec data_vec = Vec::loadu(input_data + d); @@ -278,7 +278,7 @@ inline void map3( const scalar_t* input_data2, const scalar_t* input_data3, int64_t size) { - using Vec = vec::Vectorized; + using Vec = at::vec::Vectorized; int64_t d = 0; for (; d < size - (size % Vec::size()); d += Vec::size()) { Vec data_vec1 = Vec::loadu(input_data1 + d); @@ -305,7 +305,7 @@ inline void map4( const scalar_t* input_data3, const scalar_t* input_data4, int64_t size) { - using Vec = vec::Vectorized; + using Vec = at::vec::Vectorized; int64_t d = 0; for (; d < size - (size % Vec::size()); d += Vec::size()) { Vec data_vec1 = Vec::loadu(input_data1 + d); @@ -341,7 +341,7 @@ inline void broadcasting_map_3d_and_unsqueezed_3d( int64_t outer_size, int64_t broadcast_size, int64_t inner_size) { - using Vec = vec::Vectorized; + using Vec = at::vec::Vectorized; int64_t outer_stride_lhs = inner_size * broadcast_size; int64_t outer_stride_rhs = inner_size; int64_t broadcast_stride_lhs = inner_size; @@ -398,7 +398,7 @@ inline void broadcasting_map_broadcast_last_dim( const scalar_t* rhs, int64_t outer_size, int64_t broadcast_size) { - using Vec = vec::Vectorized; + using Vec = at::vec::Vectorized; int64_t outer_stride_lhs = broadcast_size; for (int64_t outer_idx = 0; outer_idx < outer_size; ++outer_idx) { const scalar_t* lhs_outer = lhs + outer_idx * outer_stride_lhs; diff --git a/kernels/optimized/vec/intrinsics.h b/kernels/optimized/vec/intrinsics.h deleted file mode 100644 index e248b843a1..0000000000 --- a/kernels/optimized/vec/intrinsics.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#pragma once -#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) -/* GCC or clang-compatible compiler, targeting x86/x86-64 */ -#include -#elif defined(__clang__) && (defined(__ARM_NEON__) || defined(__aarch64__)) -/* Clang-compatible compiler, targeting arm neon */ -#include -#elif defined(_MSC_VER) -/* Microsoft C/C++-compatible compiler */ -#include -#if _MSC_VER <= 1900 -#define _mm256_extract_epi64(X, Y) (_mm_extract_epi64(_mm256_extractf128_si256(X, Y >> 1), Y % 2)) -#define _mm256_extract_epi32(X, Y) (_mm_extract_epi32(_mm256_extractf128_si256(X, Y >> 2), Y % 4)) -#define _mm256_extract_epi16(X, Y) (_mm_extract_epi16(_mm256_extractf128_si256(X, Y >> 3), Y % 8)) -#define _mm256_extract_epi8(X, Y) (_mm_extract_epi8(_mm256_extractf128_si256(X, Y >> 4), Y % 16)) -#endif -#elif defined(__GNUC__) && (defined(__ARM_NEON__) || defined(__aarch64__)) -/* GCC-compatible compiler, targeting ARM with NEON */ -#include -#if defined (MISSING_ARM_VLD1) -#include -#elif defined (MISSING_ARM_VST1) -#include -#endif -#elif defined(__GNUC__) && defined(__IWMMXT__) -/* GCC-compatible compiler, targeting ARM with WMMX */ -#include -#elif defined(__s390x__) -// targets Z/architecture -// we will include vecintrin later -#elif (defined(__GNUC__) || defined(__xlC__)) && \ - (defined(__VEC__) || defined(__ALTIVEC__)) -/* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */ -#include -/* We need to undef those tokens defined by to avoid conflicts - with the C++ types. => Can still use __bool/__vector */ -#undef bool -#undef vector -#undef pixel -#elif defined(__GNUC__) && defined(__SPE__) -/* GCC-compatible compiler, targeting PowerPC with SPE */ -#include -#endif diff --git a/kernels/optimized/vec/vec.h b/kernels/optimized/vec/vec.h deleted file mode 100644 index 0290ead1d8..0000000000 --- a/kernels/optimized/vec/vec.h +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -namespace executorch { -namespace vec { - -// See Note [CPU_CAPABILITY namespace] -inline namespace CPU_CAPABILITY { - -inline Vectorized convert_to_bool(Vectorized x) { - __at_align__ bool buffer[x.size()]; - x.ne(Vectorized(0)).store(buffer); - - Vectorized ret; - static_assert(x.size() == ret.size(), ""); - std::memcpy(ret, buffer, ret.size() * sizeof(bool)); - return ret; -} - -template <> -inline Vectorized Vectorized::loadu(const void* ptr) { - // See NOTE [Loading boolean values] - return convert_to_bool(Vectorized::loadu(ptr)); -} - -template <> -inline Vectorized Vectorized::loadu(const void* ptr, int64_t count) { - // See NOTE [Loading boolean values] - return convert_to_bool(Vectorized::loadu(ptr, count)); -} - -} // namespace CPU_CAPABILITY - -} // namespace vec -} // namespace executorch diff --git a/kernels/optimized/vec/vec256/missing_vld1_neon.h b/kernels/optimized/vec/vec256/missing_vld1_neon.h deleted file mode 100644 index 3d5acbf2f8..0000000000 --- a/kernels/optimized/vec/vec256/missing_vld1_neon.h +++ /dev/null @@ -1,460 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -/* Workaround for missing vld1_*_x2 and vst1_*_x2 intrinsics in gcc-7. 
*/ - -__extension__ extern __inline uint8x8x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_u8_x2 (const uint8_t *__a) -{ - uint8x8x2_t ret; - asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w" (ret) : "Q"(*__a)); - return ret; -} - -__extension__ extern __inline int8x8x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_s8_x2 (const int8_t *__a) -{ - int8x8x2_t ret; - asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w" (ret) : "Q"(*__a)); - return ret; -} - -__extension__ extern __inline uint16x4x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_u16_x2 (const uint16_t *__a) -{ - uint16x4x2_t ret; - asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w" (ret) : "Q"(*__a)); - return ret; -} - -__extension__ extern __inline int16x4x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_s16_x2 (const int16_t *__a) -{ - int16x4x2_t ret; - asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w" (ret) : "Q"(*__a)); - return ret; -} - -__extension__ extern __inline uint32x2x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_u32_x2 (const uint32_t *__a) -{ - uint32x2x2_t ret; - asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w" (ret) : "Q"(*__a)); - return ret; -} - -__extension__ extern __inline int32x2x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_s32_x2 (const int32_t *__a) -{ - int32x2x2_t ret; - asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w" (ret) : "Q"(*__a)); - return ret; -} - -__extension__ extern __inline uint64x1x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_u64_x2 (const uint64_t *__a) -{ - uint64x1x2_t ret; - asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w" (ret) : "Q"(*__a)); - return ret; -} - -__extension__ extern __inline int64x1x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_s64_x2 (const int64_t *__a) -{ - int64x1x2_t ret; - __builtin_aarch64_simd_oi __o; - asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w" (ret) : "Q"(*__a)); - return ret; -} - -__extension__ extern __inline float16x4x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_f16_x2 (const float16_t *__a) -{ - float16x4x2_t ret; - asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w" (ret) : "Q"(*__a)); - return ret; -} - -__extension__ extern __inline float32x2x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_f32_x2 (const float32_t *__a) -{ - float32x2x2_t ret; - asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w" (ret) : "Q"(*__a)); - return ret; -} - -__extension__ extern __inline float64x1x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_f64_x2 (const float64_t *__a) -{ - float64x1x2_t ret; - asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w" (ret) : "Q"(*__a)); - return ret; -} - -__extension__ extern __inline poly8x8x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_p8_x2 (const poly8_t *__a) -{ - poly8x8x2_t ret; - asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w" (ret) : "Q"(*__a)); - return ret; -} - -__extension__ extern __inline poly16x4x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_p16_x2 (const poly16_t *__a) -{ - poly16x4x2_t ret; - asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w" (ret) : "Q"(*__a)); - return ret; -} - -__extension__ extern __inline poly64x1x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_p64_x2 (const poly64_t *__a) -{ 
- poly64x1x2_t ret; - asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w" (ret) : "Q"(*__a)); - return ret; -} - -__extension__ extern __inline uint8x16x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_u8_x2 (const uint8_t *__a) -{ - uint8x16x2_t ret; - asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w" (ret) : "Q"(*__a)); - return ret; -} - -__extension__ extern __inline int8x16x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_s8_x2 (const int8_t *__a) -{ - int8x16x2_t ret; - asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w" (ret) : "Q"(*__a)); - return ret; -} - -__extension__ extern __inline uint16x8x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_u16_x2 (const uint16_t *__a) -{ - uint16x8x2_t ret; - asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w" (ret) : "Q"(*__a)); - return ret; -} - -__extension__ extern __inline int16x8x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_s16_x2 (const int16_t *__a) -{ - int16x8x2_t ret; - asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w" (ret) : "Q"(*__a)); - return ret; -} - -__extension__ extern __inline uint32x4x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_u32_x2 (const uint32_t *__a) -{ - uint32x4x2_t ret; - asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w" (ret) : "Q"(*__a)); - return ret; -} - -__extension__ extern __inline int32x4x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_s32_x2 (const int32_t *__a) -{ - int32x4x2_t ret; - asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w" (ret) : "Q"(*__a)); - return ret; -} - -__extension__ extern __inline uint64x2x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_u64_x2 (const uint64_t *__a) -{ - uint64x2x2_t ret; - asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w" (ret) : "Q"(*__a)); - return ret; -} - -__extension__ extern __inline int64x2x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_s64_x2 (const int64_t *__a) -{ - int64x2x2_t ret; - asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w" (ret) : "Q"(*__a)); - return ret; -} - -__extension__ extern __inline float16x8x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_f16_x2 (const float16_t *__a) -{ - float16x8x2_t ret; - asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w" (ret) : "Q"(*__a)); - return ret; -} - -__extension__ extern __inline float32x4x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_f32_x2 (const float32_t *__a) -{ - float32x4x2_t ret; - asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w" (ret) : "Q"(*__a)); - return ret; -} - -__extension__ extern __inline float64x2x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_f64_x2 (const float64_t *__a) -{ - float64x2x2_t ret; - asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w" (ret) : "Q"(*__a)); - return ret; -} - -__extension__ extern __inline poly8x16x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_p8_x2 (const poly8_t *__a) -{ - poly8x16x2_t ret; - asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w" (ret) : "Q"(*__a)); - return ret; -} - -__extension__ extern __inline poly16x8x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_p16_x2 (const poly16_t *__a) -{ - poly16x8x2_t ret; - asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w" (ret) : "Q"(*__a)); - return ret; -} - -__extension__ extern __inline poly64x2x2_t 
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_p64_x2 (const poly64_t *__a) -{ - poly64x2x2_t ret; - asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w" (ret) : "Q"(*__a)); - return ret; -} - -/* vst1x2 */ - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_s64_x2 (int64_t * __a, int64x1x2_t val) -{ - asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q" (*__a) : "w" (val)); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_u64_x2 (uint64_t * __a, uint64x1x2_t val) -{ - asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q" (*__a) : "w" (val)); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_f64_x2 (float64_t * __a, float64x1x2_t val) -{ - asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q" (*__a) : "w" (val)); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_s8_x2 (int8_t * __a, int8x8x2_t val) -{ - asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q" (*__a) : "w" (val)); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_p8_x2 (poly8_t * __a, poly8x8x2_t val) -{ - asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q" (*__a) : "w" (val)); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_s16_x2 (int16_t * __a, int16x4x2_t val) -{ - asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q" (*__a) : "w" (val)); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_p16_x2 (poly16_t * __a, poly16x4x2_t val) -{ - asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q" (*__a) : "w" (val)); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_s32_x2 (int32_t * __a, int32x2x2_t val) -{ - asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q" (*__a) : "w" (val)); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_u8_x2 (uint8_t * __a, uint8x8x2_t val) -{ - asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q" (*__a) : "w" (val)); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_u16_x2 (uint16_t * __a, uint16x4x2_t val) -{ - asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q" (*__a) : "w" (val)); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_u32_x2 (uint32_t * __a, uint32x2x2_t val) -{ - asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q" (*__a) : "w" (val)); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_f16_x2 (float16_t * __a, float16x4x2_t val) -{ - asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q" (*__a) : "w" (val)); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_f32_x2 (float32_t * __a, float32x2x2_t val) -{ - asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q" (*__a) : "w" (val)); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_p64_x2 (poly64_t * __a, poly64x1x2_t val) -{ - asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q" (*__a) : "w" (val)); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, 
__gnu_inline__, __artificial__)) -vst1q_s8_x2 (int8_t * __a, int8x16x2_t val) -{ - asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q" (*__a) : "w" (val)); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_p8_x2 (poly8_t * __a, poly8x16x2_t val) -{ - asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q" (*__a) : "w" (val)); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_s16_x2 (int16_t * __a, int16x8x2_t val) -{ - asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q" (*__a) : "w" (val)); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_p16_x2 (poly16_t * __a, poly16x8x2_t val) -{ - asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q" (*__a) : "w" (val)); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_s32_x2 (int32_t * __a, int32x4x2_t val) -{ - asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q" (*__a) : "w" (val)); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_s64_x2 (int64_t * __a, int64x2x2_t val) -{ - asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q" (*__a) : "w" (val)); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_u8_x2 (uint8_t * __a, uint8x16x2_t val) -{ - asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q" (*__a) : "w" (val)); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_u16_x2 (uint16_t * __a, uint16x8x2_t val) -{ - asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q" (*__a) : "w" (val)); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_u32_x2 (uint32_t * __a, uint32x4x2_t val) -{ - asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q" (*__a) : "w" (val)); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_u64_x2 (uint64_t * __a, uint64x2x2_t val) -{ - asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q" (*__a) : "w" (val)); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_f16_x2 (float16_t * __a, float16x8x2_t val) -{ - asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q" (*__a) : "w" (val)); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_f32_x2 (float32_t * __a, float32x4x2_t val) -{ - asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q" (*__a) : "w" (val)); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_f64_x2 (float64_t * __a, float64x2x2_t val) -{ - asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q" (*__a) : "w" (val)); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_p64_x2 (poly64_t * __a, poly64x2x2_t val) -{ - asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q" (*__a) : "w" (val)); -} diff --git a/kernels/optimized/vec/vec256/missing_vst1_neon.h b/kernels/optimized/vec/vec256/missing_vst1_neon.h deleted file mode 100644 index 53097d25ea..0000000000 --- a/kernels/optimized/vec/vec256/missing_vst1_neon.h +++ /dev/null @@ -1,16 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -/* Workaround for missing vst1q_f32_x2 in gcc-8. */ - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_f32_x2 (float32_t * __a, float32x4x2_t val) -{ - asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q" (*__a) : "w" (val)); -} diff --git a/kernels/optimized/vec/vec256/vec256.h b/kernels/optimized/vec/vec256/vec256.h deleted file mode 100644 index efb471e1df..0000000000 --- a/kernels/optimized/vec/vec256/vec256.h +++ /dev/null @@ -1,266 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -// DO NOT DEFINE STATIC DATA IN THIS HEADER! -// See Note [Do not compile initializers with AVX] - -#include - -#include -#if !(defined(__VSX__) || defined(CPU_CAPABILITY_VSX) || defined(CPU_CAPABILITY_ZVECTOR)) -#include -#include -#include -#include -#endif - -#include -#include -#include -#include -#include - -namespace executorch { -namespace vec { - -// Note [CPU_CAPABILITY namespace] -// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -// This header, and all of its subheaders, will be compiled with -// different architecture flags for each supported set of vector -// intrinsics. So we need to make sure they aren't inadvertently -// linked together. We do this by declaring objects in an `inline -// namespace` which changes the name mangling, but can still be -// accessed as `at::vec`. -inline namespace CPU_CAPABILITY { - -template -std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { - T buf[Vectorized::size()]; - vec.store(buf); - stream << "vec["; - for (size_t i = 0; i != Vectorized::size(); i++) { - if (i != 0) { - stream << ", "; - } - stream << buf[i]; - } - stream << "]"; - return stream; -} - - -#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) - -// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX2) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -template<> -inline Vectorized cast(const Vectorized& src) { - return _mm256_castpd_ps(src); -} - -template<> -inline Vectorized cast(const Vectorized& src) { - return _mm256_castps_pd(src); -} - -// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -template -std::enable_if_t> -inline gather(const double* base_addr, const Vectorized& vindex) { - return _mm256_i64gather_pd(base_addr, vindex, scale); -} - -template -std::enable_if_t> -inline gather(const float* base_addr, const Vectorized& vindex) { - return _mm256_i32gather_ps(base_addr, vindex, scale); -} - -// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -template -std::enable_if_t> -inline mask_gather(const Vectorized& src, const double* base_addr, - const Vectorized& vindex, const Vectorized& mask) { - return _mm256_mask_i64gather_pd(src, base_addr, vindex, mask, scale); -} - -template -std::enable_if_t> -inline mask_gather(const Vectorized& src, const float* base_addr, - const Vectorized& vindex, const Vectorized& mask) { - return _mm256_mask_i32gather_ps(src, base_addr, vindex, mask, scale); -} - -// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -// Only works for inputs in the range: [-2^51, 2^51] -// From: https://stackoverflow.com/a/41148578 -template<> -Vectorized -inline 
convert_to_int_of_same_size(const Vectorized &src) { - auto x = _mm256_add_pd(src, _mm256_set1_pd(0x0018000000000000)); - return _mm256_sub_epi64( - _mm256_castpd_si256(x), - _mm256_castpd_si256(_mm256_set1_pd(0x0018000000000000)) - ); -} - -template<> -Vectorized -inline convert_to_int_of_same_size(const Vectorized &src) { - return _mm256_cvttps_epi32(src); -} - -// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ INTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -template <> -std::pair, Vectorized> -inline interleave2(const Vectorized& a, const Vectorized& b) { - // inputs: - // a = {a0, a1, a3, a3} - // b = {b0, b1, b2, b3} - - // swap lanes: - // a_swapped = {a0, a1, b0, b1} - // b_swapped = {a2, a3, b2, b3} - auto a_swapped = _mm256_permute2f128_pd(a, b, 0b0100000); // 0, 2. 4 bits apart - auto b_swapped = _mm256_permute2f128_pd(a, b, 0b0110001); // 1, 3. 4 bits apart - - // group cols crossing lanes: - // return {a0, b0, a1, b1} - // {a2, b2, a3, b3} - return std::make_pair(_mm256_permute4x64_pd(a_swapped, 0b11011000), // 0, 2, 1, 3 - _mm256_permute4x64_pd(b_swapped, 0b11011000)); // 0, 2, 1, 3 -} - -template <> -std::pair, Vectorized> -inline interleave2(const Vectorized& a, const Vectorized& b) { - // inputs: - // a = {a0, a1, a2, a3, a4, a5, a6, a7} - // b = {b0, b1, b2, b3, b4, b5, b6, b7} - - // swap lanes: - // a_swapped = {a0, a1, a2, a3, b0, b1, b2, b3} - // b_swapped = {a4, a5, a6, a7, b4, b5, b6, b7} - // TODO: can we support caching this? - auto a_swapped = _mm256_permute2f128_ps(a, b, 0b0100000); // 0, 2. 4 bits apart - auto b_swapped = _mm256_permute2f128_ps(a, b, 0b0110001); // 1, 3. 4 bits apart - - // group cols crossing lanes: - // return {a0, b0, a1, b1, a2, b2, a3, b3} - // {a4, b4, a5, b5, a6, b6, a7, b7} - const __m256i group_ctrl = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7); - return std::make_pair(_mm256_permutevar8x32_ps(a_swapped, group_ctrl), - _mm256_permutevar8x32_ps(b_swapped, group_ctrl)); -} - -// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DEINTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -template <> -std::pair, Vectorized> -inline deinterleave2(const Vectorized& a, const Vectorized& b) { - // inputs: - // a = {a0, b0, a1, b1} - // b = {a2, b2, a3, b3} - - // group cols crossing lanes: - // a_grouped = {a0, a1, b0, b1} - // b_grouped = {a2, a3, b2, b3} - auto a_grouped = _mm256_permute4x64_pd(a, 0b11011000); // 0, 2, 1, 3 - auto b_grouped = _mm256_permute4x64_pd(b, 0b11011000); // 0, 2, 1, 3 - - // swap lanes: - // return {a0, a1, a2, a3} - // {b0, b1, b2, b3} - return std::make_pair(_mm256_permute2f128_pd(a_grouped, b_grouped, 0b0100000), // 0, 2. 4 bits apart - _mm256_permute2f128_pd(a_grouped, b_grouped, 0b0110001)); // 1, 3. 4 bits apart -} - -template <> -std::pair, Vectorized> -inline deinterleave2(const Vectorized& a, const Vectorized& b) { - // inputs: - // a = {a0, b0, a1, b1, a2, b2, a3, b3} - // b = {a4, b4, a5, b5, a6, b6, a7, b7} - - // group cols crossing lanes: - // a_grouped = {a0, a1, a2, a3, b0, b1, b2, b3} - // b_grouped = {a4, a5, a6, a7, b4, b5, b6, b7} - // TODO: can we support caching this? - const __m256i group_ctrl = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7); - auto a_grouped = _mm256_permutevar8x32_ps(a, group_ctrl); - auto b_grouped = _mm256_permutevar8x32_ps(b, group_ctrl); - - // swap lanes: - // return {a0, a1, a2, a3, a4, a5, a6, a7} - // {b0, b1, b2, b3, b4, b5, b6, b7} - return std::make_pair(_mm256_permute2f128_ps(a_grouped, b_grouped, 0b0100000), // 0, 2. 4 bits apart - _mm256_permute2f128_ps(a_grouped, b_grouped, 0b0110001)); // 1, 3. 
4 bits apart -} - -// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FLIP ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -template<> -inline Vectorized flip(const Vectorized & v) { - const __m256i mask_float = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - return _mm256_permutevar8x32_ps(v, mask_float); -} - -template<> -inline Vectorized flip(const Vectorized & v) { - return _mm256_permute4x64_pd(v, 27); // 27 == _MM_SHUFFLE(0, 1, 2, 3) -} - -template<> -inline Vectorized flip(const Vectorized & v) { - return _mm256_permute4x64_epi64(v, 27); // 27 == _MM_SHUFFLE(0, 1, 2, 3) -} - -template<> -inline Vectorized flip(const Vectorized & v) { - const __m256i mask_int32 = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); - return _mm256_permutevar8x32_epi32(v, mask_int32); -} - -template<> -inline Vectorized flip(const Vectorized & v) { - const __m256i mask = _mm256_set_epi8( - 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, - 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 - ); - auto reversed = _mm256_shuffle_epi8(v, mask); - return _mm256_permute2x128_si256(reversed, reversed, 1); -} - -inline __m256i flip8(const __m256i & v) { - const __m256i mask_int8 = _mm256_set_epi8( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 - ); - auto reversed = _mm256_shuffle_epi8(v, mask_int8); - return _mm256_permute2x128_si256(reversed, reversed, 1); -} - -template<> -inline Vectorized flip(const Vectorized & v) { - return flip8(v); -} - -template<> -inline Vectorized flip(const Vectorized & v) { - return flip8(v); -} - -#endif // (defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) - -}}} diff --git a/kernels/optimized/vec/vec256/vec256_double.h b/kernels/optimized/vec/vec256/vec256_double.h deleted file mode 100644 index 9ccde96cda..0000000000 --- a/kernels/optimized/vec/vec256/vec256_double.h +++ /dev/null @@ -1,385 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -// DO NOT DEFINE STATIC DATA IN THIS HEADER! 
-// See Note [Do not compile initializers with AVX] - -#include -#include - -#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -#include -#endif - -namespace executorch { -namespace vec { -// See Note [CPU_CAPABILITY namespace] -inline namespace CPU_CAPABILITY { - - -#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) - -template <> class Vectorized { -private: - __m256d values; -public: - using value_type = double; - using size_type = int; - static constexpr size_type size() { - return 4; - } - Vectorized() {} - Vectorized(__m256d v) : values(v) {} - Vectorized(double val) { - values = _mm256_set1_pd(val); - } - Vectorized(double val1, double val2, double val3, double val4) { - values = _mm256_setr_pd(val1, val2, val3, val4); - } - operator __m256d() const { - return values; - } - template - static Vectorized blend(const Vectorized& a, const Vectorized& b) { - return _mm256_blend_pd(a.values, b.values, mask); - } - static Vectorized blendv(const Vectorized& a, const Vectorized& b, - const Vectorized& mask) { - return _mm256_blendv_pd(a.values, b.values, mask.values); - } - template - static Vectorized arange(double base = 0., step_t step = static_cast(1)) { - return Vectorized(base, base + step, base + 2 * step, base + 3 * step); - } - static Vectorized set(const Vectorized& a, const Vectorized& b, - int64_t count = size()) { - switch (count) { - case 0: - return a; - case 1: - return blend<1>(a, b); - case 2: - return blend<3>(a, b); - case 3: - return blend<7>(a, b); - } - return b; - } - static Vectorized loadu(const void* ptr, int64_t count = size()) { - if (count == size()) - return _mm256_loadu_pd(reinterpret_cast(ptr)); - - - __at_align__ double tmp_values[size()]; - // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 - // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two - // instructions while a loop would be compiled to one instruction. 
- for (size_t i = 0; i < size(); ++i) { - tmp_values[i] = 0.0; - } - std::memcpy( - tmp_values, - reinterpret_cast(ptr), - count * sizeof(double)); - return _mm256_load_pd(tmp_values); - } - void store(void* ptr, int count = size()) const { - if (count == size()) { - _mm256_storeu_pd(reinterpret_cast(ptr), values); - } else if (count > 0) { - double tmp_values[size()]; - _mm256_storeu_pd(reinterpret_cast(tmp_values), values); - std::memcpy(ptr, tmp_values, count * sizeof(double)); - } - } - const double& operator[](int idx) const = delete; - double& operator[](int idx) = delete; - int zero_mask() const { - // returns an integer mask where all zero elements are translated to 1-bit and others are translated to 0-bit - __m256d cmp = _mm256_cmp_pd(values, _mm256_set1_pd(0.0), _CMP_EQ_OQ); - return _mm256_movemask_pd(cmp); - } - Vectorized isnan() const { - return _mm256_cmp_pd(values, _mm256_set1_pd(0.0), _CMP_UNORD_Q); - } - Vectorized map(double (*const f)(double)) const { - __at_align__ double tmp[size()]; - store(tmp); - for (size_t i = 0; i < size(); ++i) { - tmp[i] = f(tmp[i]); - } - return loadu(tmp); - } - Vectorized abs() const { - auto mask = _mm256_set1_pd(-0.f); - return _mm256_andnot_pd(mask, values); - } - Vectorized acos() const { - return Vectorized(Sleef_acosd4_u10(values)); - } - Vectorized asin() const { - return Vectorized(Sleef_asind4_u10(values)); - } - Vectorized atan() const { - return Vectorized(Sleef_atand4_u10(values)); - } - Vectorized atan2(const Vectorized &b) const { - return Vectorized(Sleef_atan2d4_u10(values, b)); - } - Vectorized copysign(const Vectorized &sign) const { - return Vectorized(Sleef_copysignd4(values, sign)); - } - Vectorized erf() const { - return Vectorized(Sleef_erfd4_u10(values)); - } - Vectorized erfc() const { - return Vectorized(Sleef_erfcd4_u15(values)); - } - Vectorized exp() const { - return Vectorized(Sleef_expd4_u10(values)); - } - Vectorized exp2() const { - return Vectorized(Sleef_exp2d4_u10(values)); - } - Vectorized expm1() const { - return Vectorized(Sleef_expm1d4_u10(values)); - } - Vectorized fmod(const Vectorized& q) const { - return Vectorized(Sleef_fmodd4(values, q)); - } - Vectorized hypot(const Vectorized &b) const { - return Vectorized(Sleef_hypotd4_u05(values, b)); - } - Vectorized log() const { - return Vectorized(Sleef_logd4_u10(values)); - } - Vectorized log2() const { - return Vectorized(Sleef_log2d4_u10(values)); - } - Vectorized log10() const { - return Vectorized(Sleef_log10d4_u10(values)); - } - Vectorized log1p() const { - return Vectorized(Sleef_log1pd4_u10(values)); - } - Vectorized sin() const { - return Vectorized(Sleef_sind4_u10(values)); - } - Vectorized sinh() const { - return Vectorized(Sleef_sinhd4_u10(values)); - } - Vectorized cos() const { - return Vectorized(Sleef_cosd4_u10(values)); - } - Vectorized cosh() const { - return Vectorized(Sleef_coshd4_u10(values)); - } - Vectorized ceil() const { - return _mm256_ceil_pd(values); - } - Vectorized floor() const { - return _mm256_floor_pd(values); - } - Vectorized frac() const; - Vectorized neg() const { - return _mm256_xor_pd(_mm256_set1_pd(-0.), values); - } - Vectorized nextafter(const Vectorized &b) const { - return Vectorized(Sleef_nextafterd4(values, b)); - } - Vectorized round() const { - return _mm256_round_pd(values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); - } - Vectorized tan() const { - return Vectorized(Sleef_tand4_u10(values)); - } - Vectorized tanh() const { - return Vectorized(Sleef_tanhd4_u10(values)); - } - Vectorized trunc() 
const { - return _mm256_round_pd(values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); - } - Vectorized lgamma() const { - return Vectorized(Sleef_lgammad4_u10(values)); - } - Vectorized sqrt() const { - return _mm256_sqrt_pd(values); - } - Vectorized reciprocal() const { - return _mm256_div_pd(_mm256_set1_pd(1), values); - } - Vectorized rsqrt() const { - return _mm256_div_pd(_mm256_set1_pd(1), _mm256_sqrt_pd(values)); - } - Vectorized pow(const Vectorized &b) const { - return Vectorized(Sleef_powd4_u10(values, b)); - } - // Comparison using the _CMP_**_OQ predicate. - // `O`: get false if an operand is NaN - // `Q`: do not raise if an operand is NaN - Vectorized operator==(const Vectorized& other) const { - return _mm256_cmp_pd(values, other.values, _CMP_EQ_OQ); - } - - Vectorized operator!=(const Vectorized& other) const { - return _mm256_cmp_pd(values, other.values, _CMP_NEQ_UQ); - } - - Vectorized operator<(const Vectorized& other) const { - return _mm256_cmp_pd(values, other.values, _CMP_LT_OQ); - } - - Vectorized operator<=(const Vectorized& other) const { - return _mm256_cmp_pd(values, other.values, _CMP_LE_OQ); - } - - Vectorized operator>(const Vectorized& other) const { - return _mm256_cmp_pd(values, other.values, _CMP_GT_OQ); - } - - Vectorized operator>=(const Vectorized& other) const { - return _mm256_cmp_pd(values, other.values, _CMP_GE_OQ); - } - - Vectorized eq(const Vectorized& other) const; - Vectorized ne(const Vectorized& other) const; - Vectorized lt(const Vectorized& other) const; - Vectorized le(const Vectorized& other) const; - Vectorized gt(const Vectorized& other) const; - Vectorized ge(const Vectorized& other) const; -}; - -template <> -Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { - return _mm256_add_pd(a, b); -} - -template <> -Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { - return _mm256_sub_pd(a, b); -} - -template <> -Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { - return _mm256_mul_pd(a, b); -} - -template <> -Vectorized inline operator/(const Vectorized& a, const Vectorized& b) { - return _mm256_div_pd(a, b); -} - -// frac. Implement this here so we can use subtraction. -inline Vectorized Vectorized::frac() const { - return *this - this->trunc(); -} - -// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if -// either input is a NaN. -template <> -Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { - Vectorized max = _mm256_max_pd(a, b); - Vectorized isnan = _mm256_cmp_pd(a, b, _CMP_UNORD_Q); - // Exploit the fact that all-ones is a NaN. - return _mm256_or_pd(max, isnan); -} - -// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if -// either input is a NaN. -template <> -Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { - Vectorized min = _mm256_min_pd(a, b); - Vectorized isnan = _mm256_cmp_pd(a, b, _CMP_UNORD_Q); - // Exploit the fact that all-ones is a NaN. 
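// ---------------------------------------------------------------------------
// [Editorial sketch, not part of the deleted header] Why OR-ing the all-ones
// lanes produced by _CMP_UNORD_Q into the min/max result yields NaN: an
// all-ones 64-bit pattern has an all-ones exponent and a non-zero mantissa,
// which IEEE 754 defines as NaN. Illustrative check, names are editorial:
// ---------------------------------------------------------------------------
#include <cstdint>
#include <cstring>
#include <cmath>
#include <cassert>

static inline void example_all_ones_bits_are_nan() {
  const std::uint64_t bits = ~std::uint64_t{0};  // one all-ones comparison lane
  double d;
  std::memcpy(&d, &bits, sizeof d);
  assert(std::isnan(d));
}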
- return _mm256_or_pd(min, isnan); -} - -template <> -Vectorized inline clamp(const Vectorized& a, const Vectorized& min, const Vectorized& max) { - return _mm256_min_pd(max, _mm256_max_pd(min, a)); -} - -template <> -Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min) { - return _mm256_max_pd(min, a); -} - -template <> -Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max) { - return _mm256_min_pd(max, a); -} - -template <> -Vectorized inline operator&(const Vectorized& a, const Vectorized& b) { - return _mm256_and_pd(a, b); -} - -template <> -Vectorized inline operator|(const Vectorized& a, const Vectorized& b) { - return _mm256_or_pd(a, b); -} - -template <> -Vectorized inline operator^(const Vectorized& a, const Vectorized& b) { - return _mm256_xor_pd(a, b); -} - -inline Vectorized Vectorized::eq(const Vectorized& other) const { - return (*this == other) & Vectorized(1.0); -} - -inline Vectorized Vectorized::ne(const Vectorized& other) const { - return (*this != other) & Vectorized(1.0); -} - -inline Vectorized Vectorized::gt(const Vectorized& other) const { - return (*this > other) & Vectorized(1.0); -} - -inline Vectorized Vectorized::ge(const Vectorized& other) const { - return (*this >= other) & Vectorized(1.0); -} - -inline Vectorized Vectorized::lt(const Vectorized& other) const { - return (*this < other) & Vectorized(1.0); -} - -inline Vectorized Vectorized::le(const Vectorized& other) const { - return (*this <= other) & Vectorized(1.0); -} - -template <> -inline void convert(const double* src, double* dst, int64_t n) { - int64_t i; -#pragma unroll - for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { - _mm256_storeu_pd(dst + i, _mm256_loadu_pd(src + i)); - } -#pragma unroll - for (; i < n; i++) { - dst[i] = src[i]; - } -} - -#ifdef CPU_CAPABILITY_AVX2 -template <> -Vectorized inline fmadd(const Vectorized& a, const Vectorized& b, const Vectorized& c) { - return _mm256_fmadd_pd(a, b, c); -} - -template <> -Vectorized inline fmsub(const Vectorized& a, const Vectorized& b, const Vectorized& c) { - return _mm256_fmsub_pd(a, b, c); -} -#endif - -#endif - -}}} diff --git a/kernels/optimized/vec/vec256/vec256_float.h b/kernels/optimized/vec/vec256/vec256_float.h deleted file mode 100644 index 8d8108a20e..0000000000 --- a/kernels/optimized/vec/vec256/vec256_float.h +++ /dev/null @@ -1,518 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -// DO NOT DEFINE STATIC DATA IN THIS HEADER! 
-// See Note [Do not compile initializers with AVX] - -#include -#include - -#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -#include -#endif - -namespace executorch { -namespace vec { -// See Note [CPU_CAPABILITY namespace] -inline namespace CPU_CAPABILITY { - -#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) - -template <> class Vectorized { -private: - __m256 values; -public: - using value_type = float; - using size_type = int; - static constexpr size_type size() { - return 8; - } - Vectorized() {} - Vectorized(__m256 v) : values(v) {} - Vectorized(float val) { - values = _mm256_set1_ps(val); - } - Vectorized(float val1, float val2, float val3, float val4, - float val5, float val6, float val7, float val8) { - values = _mm256_setr_ps(val1, val2, val3, val4, val5, val6, val7, val8); - } - operator __m256() const { - return values; - } - template - static Vectorized blend(const Vectorized& a, const Vectorized& b) { - return _mm256_blend_ps(a.values, b.values, mask); - } - static Vectorized blendv(const Vectorized& a, const Vectorized& b, - const Vectorized& mask) { - return _mm256_blendv_ps(a.values, b.values, mask.values); - } - template - static Vectorized arange(float base = 0.f, step_t step = static_cast(1)) { - return Vectorized( - base, base + step, base + 2 * step, base + 3 * step, - base + 4 * step, base + 5 * step, base + 6 * step, base + 7 * step); - } - static Vectorized set(const Vectorized& a, const Vectorized& b, - int64_t count = size()) { - switch (count) { - case 0: - return a; - case 1: - return blend<1>(a, b); - case 2: - return blend<3>(a, b); - case 3: - return blend<7>(a, b); - case 4: - return blend<15>(a, b); - case 5: - return blend<31>(a, b); - case 6: - return blend<63>(a, b); - case 7: - return blend<127>(a, b); - } - return b; - } - static Vectorized loadu(const void* ptr, int64_t count = size()) { - if (count == size()) - return _mm256_loadu_ps(reinterpret_cast(ptr)); - __at_align__ float tmp_values[size()]; - // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 - // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two - // instructions while a loop would be compiled to one instruction. 
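// ---------------------------------------------------------------------------
// [Editorial sketch, not part of the deleted header] The set(a, b, count)
// switch earlier in this class selects the low `count` lanes from b by calling
// blend<(1 << count) - 1>; e.g. count == 3 maps to blend<7> (binary 0111) and
// count == 7 maps to blend<127>. The helper name below is illustrative only.
// ---------------------------------------------------------------------------
#include <cstdint>

constexpr std::int64_t example_low_lane_mask(int count) {
  return (std::int64_t{1} << count) - 1;  // low `count` bits set
}
static_assert(example_low_lane_mask(3) == 7, "count == 3 -> blend<7>");
static_assert(example_low_lane_mask(7) == 127, "count == 7 -> blend<127>");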
- for (size_t i = 0; i < size(); ++i) { - tmp_values[i] = 0.0; - } - std::memcpy( - tmp_values, reinterpret_cast(ptr), count * sizeof(float)); - return _mm256_loadu_ps(tmp_values); - } - void store(void* ptr, int64_t count = size()) const { - if (count == size()) { - _mm256_storeu_ps(reinterpret_cast(ptr), values); - } else if (count > 0) { - float tmp_values[size()]; - _mm256_storeu_ps(reinterpret_cast(tmp_values), values); - std::memcpy(ptr, tmp_values, count * sizeof(float)); - } - } - const float& operator[](int idx) const = delete; - float& operator[](int idx) = delete; - int zero_mask() const { - // returns an integer mask where all zero elements are translated to 1-bit and others are translated to 0-bit - __m256 cmp = _mm256_cmp_ps(values, _mm256_set1_ps(0.0f), _CMP_EQ_OQ); - return _mm256_movemask_ps(cmp); - } - Vectorized isnan() const { - return _mm256_cmp_ps(values, _mm256_set1_ps(0.0f), _CMP_UNORD_Q); - } - Vectorized map(float (*const f)(float)) const { - __at_align__ float tmp[size()]; - store(tmp); - for (size_t i = 0; i < size(); ++i) { - tmp[i] = f(tmp[i]); - } - return loadu(tmp); - } - Vectorized abs() const { - auto mask = _mm256_set1_ps(-0.f); - return _mm256_andnot_ps(mask, values); - } - Vectorized acos() const { - return Vectorized(Sleef_acosf8_u10(values)); - } - Vectorized asin() const { - return Vectorized(Sleef_asinf8_u10(values)); - } - Vectorized atan() const { - return Vectorized(Sleef_atanf8_u10(values)); - } - Vectorized atan2(const Vectorized &b) const { - return Vectorized(Sleef_atan2f8_u10(values, b)); - } - Vectorized copysign(const Vectorized &sign) const { - return Vectorized(Sleef_copysignf8(values, sign)); - } - Vectorized erf() const { - // constants - const auto neg_zero_vec = _mm256_set1_ps(-0.f); - const auto one_vec = _mm256_set1_ps(1.0f); - const auto p = _mm256_set1_ps(0.3275911f); - const auto p1 = _mm256_set1_ps(0.254829592f); - const auto p2 = _mm256_set1_ps(-0.284496736f); - const auto p3 = _mm256_set1_ps(1.421413741f); - const auto p4 = _mm256_set1_ps(-1.453152027f); - const auto p5 = _mm256_set1_ps(1.061405429f); - // sign(x) - auto sign_mask = _mm256_and_ps(neg_zero_vec, values); - auto abs_vec = _mm256_xor_ps(sign_mask, values); - // t = 1 / (p * abs(x) + 1) - auto tmp0 = _mm256_fmadd_ps(p, abs_vec, one_vec); - auto t = _mm256_div_ps(one_vec, tmp0); - // r = p5 * t ^ 4 + p4 * t ^ 3 + p3 * t ^ 2 + p2 * t + p1 - auto tmp1 = _mm256_fmadd_ps(p5, t, p4); - auto tmp2 = _mm256_fmadd_ps(tmp1, t, p3); - auto tmp3 = _mm256_fmadd_ps(tmp2, t, p2); - auto r = _mm256_fmadd_ps(tmp3, t, p1); - // - exp(- x * x) - auto pow_2 = _mm256_mul_ps(values, values); - auto neg_pow_2 = _mm256_xor_ps(neg_zero_vec, pow_2); - // auto tmp4 = exp(neg_pow_2); - auto tmp4 = Vectorized(Sleef_expf8_u10(neg_pow_2)); - auto tmp5 = _mm256_xor_ps(neg_zero_vec, tmp4); - // erf(x) = sign(x) * (1 - r * t * exp(- x * x)) - auto tmp6 = _mm256_mul_ps(tmp5, t); - auto tmp7 = _mm256_fmadd_ps(tmp6, r, one_vec); - return _mm256_xor_ps(sign_mask, tmp7); - } - Vectorized erfc() const { - return Vectorized(Sleef_erfcf8_u15(values)); - } - Vectorized exp() const { - return Vectorized(Sleef_expf8_u10(values)); - } - Vectorized exp2() const { - return Vectorized(Sleef_exp2f8_u10(values)); - } - Vectorized expm1() const { - return Vectorized(Sleef_expm1f8_u10(values)); - } - Vectorized fmod(const Vectorized& q) const { - return Vectorized(Sleef_fmodf8(values, q)); - } - Vectorized log() const { - return Vectorized(Sleef_logf8_u10(values)); - } - Vectorized log2() const { - return 
Vectorized(Sleef_log2f8_u10(values)); - } - Vectorized log10() const { - return Vectorized(Sleef_log10f8_u10(values)); - } - Vectorized log1p() const { - return Vectorized(Sleef_log1pf8_u10(values)); - } - Vectorized frac() const; - Vectorized sin() const { - return Vectorized(Sleef_sinf8_u35(values)); - } - Vectorized sinh() const { - return Vectorized(Sleef_sinhf8_u10(values)); - } - Vectorized cos() const { - return Vectorized(Sleef_cosf8_u35(values)); - } - Vectorized cosh() const { - return Vectorized(Sleef_coshf8_u10(values)); - } - Vectorized ceil() const { - return _mm256_ceil_ps(values); - } - Vectorized floor() const { - return _mm256_floor_ps(values); - } - Vectorized hypot(const Vectorized &b) const { - return Vectorized(Sleef_hypotf8_u05(values, b)); - } - Vectorized neg() const { - return _mm256_xor_ps(_mm256_set1_ps(-0.f), values); - } - Vectorized nextafter(const Vectorized &b) const { - return Vectorized(Sleef_nextafterf8(values, b)); - } - Vectorized round() const { - return _mm256_round_ps(values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); - } - Vectorized tan() const { - return Vectorized(Sleef_tanf8_u10(values)); - } - Vectorized tanh() const { - return Vectorized(Sleef_tanhf8_u10(values)); - } - Vectorized trunc() const { - return _mm256_round_ps(values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); - } - Vectorized lgamma() const { - return Vectorized(Sleef_lgammaf8_u10(values)); - } - Vectorized sqrt() const { - return _mm256_sqrt_ps(values); - } - Vectorized reciprocal() const { - return _mm256_div_ps(_mm256_set1_ps(1), values); - } - Vectorized rsqrt() const { - return _mm256_div_ps(_mm256_set1_ps(1), _mm256_sqrt_ps(values)); - } - Vectorized pow(const Vectorized &b) const { - return Vectorized(Sleef_powf8_u10(values, b)); - } - // Comparison using the _CMP_**_OQ predicate. - // `O`: get false if an operand is NaN - // `Q`: do not raise if an operand is NaN - Vectorized operator==(const Vectorized& other) const { - return _mm256_cmp_ps(values, other.values, _CMP_EQ_OQ); - } - - Vectorized operator!=(const Vectorized& other) const { - return _mm256_cmp_ps(values, other.values, _CMP_NEQ_UQ); - } - - Vectorized operator<(const Vectorized& other) const { - return _mm256_cmp_ps(values, other.values, _CMP_LT_OQ); - } - - Vectorized operator<=(const Vectorized& other) const { - return _mm256_cmp_ps(values, other.values, _CMP_LE_OQ); - } - - Vectorized operator>(const Vectorized& other) const { - return _mm256_cmp_ps(values, other.values, _CMP_GT_OQ); - } - - Vectorized operator>=(const Vectorized& other) const { - return _mm256_cmp_ps(values, other.values, _CMP_GE_OQ); - } - - Vectorized eq(const Vectorized& other) const; - Vectorized ne(const Vectorized& other) const; - Vectorized gt(const Vectorized& other) const; - Vectorized ge(const Vectorized& other) const; - Vectorized lt(const Vectorized& other) const; - Vectorized le(const Vectorized& other) const; -}; - -template <> -Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { - return _mm256_add_ps(a, b); -} - -template <> -Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { - return _mm256_sub_ps(a, b); -} - -template <> -Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { - return _mm256_mul_ps(a, b); -} - -template <> -Vectorized inline operator/(const Vectorized& a, const Vectorized& b) { - return _mm256_div_ps(a, b); -} - -// frac. 
Implement this here so we can use subtraction -inline Vectorized Vectorized::frac() const { - return *this - this->trunc(); -} - -// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if -// either input is a NaN. -template <> -Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { - Vectorized max = _mm256_max_ps(a, b); - Vectorized isnan = _mm256_cmp_ps(a, b, _CMP_UNORD_Q); - // Exploit the fact that all-ones is a NaN. - return _mm256_or_ps(max, isnan); -} - -// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if -// either input is a NaN. -template <> -Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { - Vectorized min = _mm256_min_ps(a, b); - Vectorized isnan = _mm256_cmp_ps(a, b, _CMP_UNORD_Q); - // Exploit the fact that all-ones is a NaN. - return _mm256_or_ps(min, isnan); -} - -template <> -Vectorized inline clamp(const Vectorized& a, const Vectorized& min, const Vectorized& max) { - return _mm256_min_ps(max, _mm256_max_ps(min, a)); -} - -template <> -Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max) { - return _mm256_min_ps(max, a); -} - -template <> -Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min) { - return _mm256_max_ps(min, a); -} - -template <> -Vectorized inline operator&(const Vectorized& a, const Vectorized& b) { - return _mm256_and_ps(a, b); -} - -template <> -Vectorized inline operator|(const Vectorized& a, const Vectorized& b) { - return _mm256_or_ps(a, b); -} - -template <> -Vectorized inline operator^(const Vectorized& a, const Vectorized& b) { - return _mm256_xor_ps(a, b); -} - -inline Vectorized Vectorized::eq(const Vectorized& other) const { - return (*this == other) & Vectorized(1.0f); -} - -inline Vectorized Vectorized::ne(const Vectorized& other) const { - return (*this != other) & Vectorized(1.0f); -} - -inline Vectorized Vectorized::gt(const Vectorized& other) const { - return (*this > other) & Vectorized(1.0f); -} - -inline Vectorized Vectorized::ge(const Vectorized& other) const { - return (*this >= other) & Vectorized(1.0f); -} - -inline Vectorized Vectorized::lt(const Vectorized& other) const { - return (*this < other) & Vectorized(1.0f); -} - -inline Vectorized Vectorized::le(const Vectorized& other) const { - return (*this <= other) & Vectorized(1.0f); -} - -template <> -inline void convert(const float* src, float* dst, int64_t n) { - int64_t i; -#pragma unroll - for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { - _mm256_storeu_ps(dst + i, _mm256_loadu_ps(src + i)); - } -#pragma unroll - for (; i < n; i++) { - dst[i] = src[i]; - } -} - - -template <> -Vectorized inline fmadd(const Vectorized& a, const Vectorized& b, const Vectorized& c) { - return _mm256_fmadd_ps(a, b, c); -} - -template <> -Vectorized inline fmsub(const Vectorized& a, const Vectorized& b, const Vectorized& c) { - return _mm256_fmsub_ps(a, b, c); -} - -// Used by Inductor CPP codegen -template<> -inline void transpose_mxn( - const float* src, - int64_t ld_src, - float* dst, - int64_t ld_dst) { - // load from src to registers - // a: a0 a1 a2 a3 a4 a5 a6 a7 - // b: b0 b1 b2 b3 b4 b5 b6 b7 - // c: c0 c1 c2 c3 c4 c5 c6 c7 - // d: d0 d1 d2 d3 d4 d5 d6 d7 - // e: e0 e1 e2 e3 e4 e5 e6 e7 - // f: f0 f1 f2 f3 f4 f5 f6 f7 - // g: g0 g1 g2 g3 g4 g5 g6 g7 - // h: h0 h1 h2 h3 h4 h5 h6 h7 - __m256 a = _mm256_loadu_ps(&src[0 * ld_src]); - __m256 b = _mm256_loadu_ps(&src[1 * ld_src]); - __m256 c = _mm256_loadu_ps(&src[2 * ld_src]); - __m256 d = 
_mm256_loadu_ps(&src[3 * ld_src]); - __m256 e = _mm256_loadu_ps(&src[4 * ld_src]); - __m256 f = _mm256_loadu_ps(&src[5 * ld_src]); - __m256 g = _mm256_loadu_ps(&src[6 * ld_src]); - __m256 h = _mm256_loadu_ps(&src[7 * ld_src]); - - __m256 ta, tb, tc, td, te, tf, tg, th; - // unpacking and interleaving 32-bit elements - // a0 b0 a1 b1 a4 b4 a5 b5 - // a2 b2 a3 b3 a6 b6 a7 b7 - // c0 d0 c1 d1 ... - // c2 d2 c3 d3 ... - // e0 f0 e1 f1 ... - // e2 f2 e3 f3 ... - // g0 h0 g1 h1 ... - // g2 h2 g3 h3 ... - ta = _mm256_unpacklo_ps(a, b); - tb = _mm256_unpackhi_ps(a, b); - tc = _mm256_unpacklo_ps(c, d); - td = _mm256_unpackhi_ps(c, d); - te = _mm256_unpacklo_ps(e, f); - tf = _mm256_unpackhi_ps(e, f); - tg = _mm256_unpacklo_ps(g, h); - th = _mm256_unpackhi_ps(g, h); - - // unpacking and interleaving 64-bit elements - // a0 b0 c0 d0 a4 b4 c4 d4 - // a1 b1 c1 d1 ... - // a2 b2 c2 d2 ... - // a3 b3 c3 d3 ... - // e0 f0 g0 h0 e4 f4 g4 h4 - // e1 f1 g1 h1 ... - // e2 f2 g2 h2 ... - // e3 f3 g3 h3 ... - a = _mm256_castpd_ps( - _mm256_unpacklo_pd(_mm256_castps_pd(ta), _mm256_castps_pd(tc))); - b = _mm256_castpd_ps( - _mm256_unpackhi_pd(_mm256_castps_pd(ta), _mm256_castps_pd(tc))); - c = _mm256_castpd_ps( - _mm256_unpacklo_pd(_mm256_castps_pd(tb), _mm256_castps_pd(td))); - d = _mm256_castpd_ps( - _mm256_unpackhi_pd(_mm256_castps_pd(tb), _mm256_castps_pd(td))); - e = _mm256_castpd_ps( - _mm256_unpacklo_pd(_mm256_castps_pd(te), _mm256_castps_pd(tg))); - f = _mm256_castpd_ps( - _mm256_unpackhi_pd(_mm256_castps_pd(te), _mm256_castps_pd(tg))); - g = _mm256_castpd_ps( - _mm256_unpacklo_pd(_mm256_castps_pd(tf), _mm256_castps_pd(th))); - h = _mm256_castpd_ps( - _mm256_unpackhi_pd(_mm256_castps_pd(tf), _mm256_castps_pd(th))); - - // shuffle 128-bits (composed of 4 32-bit elements) - // a0 b0 c0 d0 e0 f0 g0 h0 - // a1 b1 c1 d1 ... - // a2 b2 c2 d2 ... - // a3 b3 c3 d3 ... - // a4 b4 c4 d4 ... - // a5 b5 c5 d5 ... - // a6 b6 c6 d6 ... - // a7 b7 c7 d7 ... - ta = _mm256_permute2f128_ps(a, e, 0x20); - tb = _mm256_permute2f128_ps(b, f, 0x20); - tc = _mm256_permute2f128_ps(c, g, 0x20); - td = _mm256_permute2f128_ps(d, h, 0x20); - te = _mm256_permute2f128_ps(a, e, 0x31); - tf = _mm256_permute2f128_ps(b, f, 0x31); - tg = _mm256_permute2f128_ps(c, g, 0x31); - th = _mm256_permute2f128_ps(d, h, 0x31); - - // store from registers to dst - _mm256_storeu_ps(&dst[0 * ld_dst], ta); - _mm256_storeu_ps(&dst[1 * ld_dst], tb); - _mm256_storeu_ps(&dst[2 * ld_dst], tc); - _mm256_storeu_ps(&dst[3 * ld_dst], td); - _mm256_storeu_ps(&dst[4 * ld_dst], te); - _mm256_storeu_ps(&dst[5 * ld_dst], tf); - _mm256_storeu_ps(&dst[6 * ld_dst], tg); - _mm256_storeu_ps(&dst[7 * ld_dst], th); -} - -#endif - -}}} diff --git a/kernels/optimized/vec/vec256/vec256_float_neon.h b/kernels/optimized/vec/vec256/vec256_float_neon.h deleted file mode 100644 index c13725f9ee..0000000000 --- a/kernels/optimized/vec/vec256/vec256_float_neon.h +++ /dev/null @@ -1,808 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -// DO NOT DEFINE STATIC DATA IN THIS HEADER! -// See Note [Do not compile initializers with AVX] - -#include -#include - - -#if defined(__aarch64__) && defined(ET_BUILD_ARM_VEC256_WITH_SLEEF) -#include -#endif - -// Sleef offers vectorized versions of some transcedentals -// such as sin, cos, tan etc.. 
-// However for now opting for STL, since we are not building
-// with Sleef for mobile yet.
-
-namespace executorch {
-namespace vec {
-// See Note [CPU_CAPABILITY namespace]
-inline namespace CPU_CAPABILITY {
-
-// Right now contains only aarch64 implementation.
-// Due to the following two reasons, aarch32 is not currently supported.
-// 1. Due to differences in ISA between aarch32 and aarch64, intrinsics
-// that work for aarch64 don't work for aarch32.
-// 2. Android NDK r21 has problems with compiling aarch32.
-// Clang seg faults.
-// https://github.com/android/ndk/issues/1248
-// https://bugs.llvm.org/show_bug.cgi?id=45824
-// Most likely we will do aarch32 support with inline asm.
-#if defined(__aarch64__)
-
-#ifdef __BIG_ENDIAN__
-#error "Big endian is not supported."
-#endif
-
-#if defined(ET_BUILD_ARM_VEC256_WITH_SLEEF)
-#define USE_SLEEF(sleef_code, non_sleef_code) sleef_code
-#else
-#define USE_SLEEF(sleef_code, non_sleef_code) non_sleef_code
-#endif
-
-template <int index, bool mask_val>
-struct BlendRegs {
-  static float32x4_t impl(
-      const float32x4_t& a, const float32x4_t& b, float32x4_t& res);
-};
-
-template <int index>
-struct BlendRegs<index, true> {
-  static float32x4_t impl(
-      const float32x4_t& a, const float32x4_t& b, float32x4_t& res) {
-    return vsetq_lane_f32(vgetq_lane_f32(b, index), res, index);
-  }
-};
-
-template <int index>
-struct BlendRegs<index, false> {
-  static float32x4_t impl(
-      const float32x4_t& a, const float32x4_t& b, float32x4_t& res) {
-    return vsetq_lane_f32(vgetq_lane_f32(a, index), res, index);
-  }
-};
-
-template <> class Vectorized<float> {
-private:
-  float32x4x2_t values;
-public:
-  using value_type = float;
-  using size_type = int;
-  static constexpr size_type size() {
-    return 8;
-  }
-  Vectorized() {}
-  Vectorized(float32x4x2_t v) : values(v) {}
-  Vectorized(float val) : values{vdupq_n_f32(val), vdupq_n_f32(val)} {}
-  Vectorized(float val0, float val1, float val2, float val3,
-             float val4, float val5, float val6, float val7) :
-             values{val0, val1, val2, val3, val4, val5, val6, val7} {}
-  Vectorized(float32x4_t val0, float32x4_t val1) : values{val0, val1} {}
-  operator float32x4x2_t() const {
-    return values;
-  }
-  template <int64_t mask>
-  static Vectorized<float> blend(const Vectorized<float>& a, const Vectorized<float>& b) {
-    Vectorized<float> vec;
-    // 0.
-    vec.values.val[0] =
-        BlendRegs<0, (mask & 0x01)!=0>::impl(
-            a.values.val[0], b.values.val[0], vec.values.val[0]);
-    vec.values.val[0] =
-        BlendRegs<1, (mask & 0x02)!=0>::impl(
-            a.values.val[0], b.values.val[0], vec.values.val[0]);
-    vec.values.val[0] =
-        BlendRegs<2, (mask & 0x04)!=0>::impl(
-            a.values.val[0], b.values.val[0], vec.values.val[0]);
-    vec.values.val[0] =
-        BlendRegs<3, (mask & 0x08)!=0>::impl(
-            a.values.val[0], b.values.val[0], vec.values.val[0]);
-    // 1.
-    vec.values.val[1] =
-        BlendRegs<0, (mask & 0x10)!=0>::impl(
-            a.values.val[1], b.values.val[1], vec.values.val[1]);
-    vec.values.val[1] =
-        BlendRegs<1, (mask & 0x20)!=0>::impl(
-            a.values.val[1], b.values.val[1], vec.values.val[1]);
-    vec.values.val[1] =
-        BlendRegs<2, (mask & 0x40)!=0>::impl(
-            a.values.val[1], b.values.val[1], vec.values.val[1]);
-    vec.values.val[1] =
-        BlendRegs<3, (mask & 0x80)!=0>::impl(
-            a.values.val[1], b.values.val[1], vec.values.val[1]);
-    return vec;
-  }
-  static Vectorized<float> blendv(const Vectorized<float>& a, const Vectorized<float>& b,
-                                  const Vectorized<float>& mask) {
-    // TODO
-    // NB: This requires that each value, i.e., each uint value,
-    // of the mask either all be zeros or all be 1s.
-    // We perhaps need some kind of an assert?
-    // But that will affect performance.
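// ---------------------------------------------------------------------------
// [Editorial sketch, not part of the deleted header] The all-zeros / all-ones
// per-lane requirement described in the comment above is naturally satisfied
// when the mask comes from a NEON comparison, which is the usual way a
// blendv-style select is driven. The helper name is illustrative only.
// ---------------------------------------------------------------------------
#include <arm_neon.h>

static inline float32x4_t example_select_ge(float32x4_t a, float32x4_t b) {
  uint32x4_t lane_mask = vcgeq_f32(a, b);  // each lane is 0x00000000 or 0xFFFFFFFF
  return vbslq_f32(lane_mask, a, b);       // bitwise select: a where mask bits set
}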
- Vectorized vec(mask.values); - vec.values.val[0] = vbslq_f32( - vreinterpretq_u32_f32(vec.values.val[0]), - b.values.val[0], - a.values.val[0]); - vec.values.val[1] = vbslq_f32( - vreinterpretq_u32_f32(vec.values.val[1]), - b.values.val[1], - a.values.val[1]); - return vec; - } - template - static Vectorized arange(float base = 0.f, step_t step = static_cast(1)) { - const Vectorized base_vec(base); - const Vectorized step_vec(step); - const Vectorized step_sizes(0, 1, 2, 3, 4, 5, 6, 7); - return fmadd(step_sizes, step_vec, base_vec); - } - static Vectorized set(const Vectorized& a, const Vectorized& b, - int64_t count = size()) { - switch (count) { - case 0: - return a; - case 1: - { - Vectorized vec; - static uint32x4_t mask_low = {0xFFFFFFFF, 0x0, 0x0, 0x0}; - vec.values.val[0] = vreinterpretq_f32_u32(mask_low); - vec.values.val[1] = a.values.val[1]; - vec.values.val[0] = vbslq_f32( - vreinterpretq_u32_f32(vec.values.val[0]), - b.values.val[0], - a.values.val[0]); - return vec; - } - case 2: - { - Vectorized vec; - static uint32x4_t mask_low = {0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0}; - vec.values.val[0] = vreinterpretq_f32_u32(mask_low); - vec.values.val[1] = a.values.val[1]; - vec.values.val[0] = vbslq_f32( - vreinterpretq_u32_f32(vec.values.val[0]), - b.values.val[0], - a.values.val[0]); - return vec; - } - case 3: - { - Vectorized vec; - static uint32x4_t mask_low = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0}; - vec.values.val[0] = vreinterpretq_f32_u32(mask_low); - vec.values.val[1] = a.values.val[1]; - vec.values.val[0] = vbslq_f32( - vreinterpretq_u32_f32(vec.values.val[0]), - b.values.val[0], - a.values.val[0]); - return vec; - } - case 4: - return Vectorized(b.values.val[0], a.values.val[1]); - case 5: - { - Vectorized vec; - static uint32x4_t mask_high = {0xFFFFFFFF, 0x0, 0x0, 0x0}; - vec.values.val[0] = b.values.val[0]; - vec.values.val[1] = vreinterpretq_f32_u32(mask_high); - vec.values.val[1] = vbslq_f32( - vreinterpretq_u32_f32(vec.values.val[1]), - b.values.val[1], - a.values.val[1]); - return vec; - } - case 6: - { - Vectorized vec; - static uint32x4_t mask_high = {0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0}; - vec.values.val[0] = b.values.val[0]; - vec.values.val[1] = vreinterpretq_f32_u32(mask_high); - vec.values.val[1] = vbslq_f32( - vreinterpretq_u32_f32(vec.values.val[1]), - b.values.val[1], - a.values.val[1]); - return vec; - } - case 7: - { - Vectorized vec; - static uint32x4_t mask_high = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0}; - vec.values.val[0] = b.values.val[0]; - vec.values.val[1] = vreinterpretq_f32_u32(mask_high); - vec.values.val[1] = vbslq_f32( - vreinterpretq_u32_f32(vec.values.val[1]), - b.values.val[1], - a.values.val[1]); - return vec; - } - } - return b; - } - static Vectorized loadu(const void* ptr, int64_t count = size()) { - if (count == size()) { - return vld1q_f32_x2(reinterpret_cast(ptr)); - } - else if (count == (size() >> 1)) { - Vectorized res; - res.values.val[0] = vld1q_f32(reinterpret_cast(ptr)); - res.values.val[1] = vdupq_n_f32(0.f); - return res; - } - else { - __at_align__ float tmp_values[size()]; - for (size_t i = 0; i < size(); ++i) { - tmp_values[i] = 0.0; - } - std::memcpy( - tmp_values, - reinterpret_cast(ptr), - count * sizeof(float)); - return vld1q_f32_x2(reinterpret_cast(tmp_values)); - } - } - void store(void* ptr, int64_t count = size()) const { - if (count == size()) { - vst1q_f32_x2(reinterpret_cast(ptr), values); - } - else if (count == (size() >> 1)) { - vst1q_f32(reinterpret_cast(ptr), values.val[0]); - } - else { - float 
tmp_values[size()]; - vst1q_f32_x2(reinterpret_cast(tmp_values), values); - std::memcpy(ptr, tmp_values, count * sizeof(float)); - } - } - inline const float32x4_t& get_low() const { - return values.val[0]; - } - inline float32x4_t& get_low() { - return values.val[0]; - } - inline const float32x4_t& get_high() const { - return values.val[1]; - } - inline float32x4_t& get_high() { - return values.val[1]; - } - // Very slow implementation of indexing. - // Only required because vec256_qint refers to this. - // Once we specialize that implementation for ARM - // this should be removed. TODO (kimishpatel) - float operator[](int idx) const { - __at_align__ float tmp[size()]; - store(tmp); - return tmp[idx]; - } - float operator[](int idx) { - __at_align__ float tmp[size()]; - store(tmp); - return tmp[idx]; - } - // For boolean version where we want to if any 1/all zero - // etc. can be done faster in a different way. - int zero_mask() const { - __at_align__ float tmp[size()]; - store(tmp); - int mask = 0; - for (size_t i = 0; i < size(); ++ i) { - if (tmp[i] == 0.f) { - mask |= (1 << i); - } - } - return mask; - } - Vectorized isnan() const { - __at_align__ float tmp[size()]; - __at_align__ float res[size()]; - store(tmp); - for (size_t i = 0; i < size(); ++i) { - if (std::isnan(tmp[i])) { - std::memset(static_cast(&res[i]), 0xFF, sizeof(float)); - } else { - std::memset(static_cast(&res[i]), 0, sizeof(float)); - } - } - return loadu(res); - }; - Vectorized map(float (*const f)(float)) const { - __at_align__ float tmp[size()]; - store(tmp); - for (size_t i = 0; i < size(); ++i) { - tmp[i] = f(tmp[i]); - } - return loadu(tmp); - } - Vectorized abs() const { - return Vectorized(vabsq_f32(values.val[0]), vabsq_f32(values.val[1])); - } - Vectorized acos() const { - return USE_SLEEF( - Vectorized(Sleef_acosf4_u10(values.val[0]), Sleef_acosf4_u10(values.val[1])), - map(std::acos) - ); - } - Vectorized asin() const { - return USE_SLEEF( - Vectorized(Sleef_asinf4_u10(values.val[0]), Sleef_asinf4_u10(values.val[1])), - map(std::asin) - ); - } - Vectorized atan() const { - return USE_SLEEF( - Vectorized(Sleef_atanf4_u10(values.val[0]), Sleef_atanf4_u10(values.val[1])), - map(std::atan) - ); - } - Vectorized atan2(const Vectorized &exp) const { - USE_SLEEF( - { - return Vectorized(Sleef_atan2f4_u10(values.val[0], exp.values.val[0]), - Sleef_atan2f4_u10(values.val[1], exp.values.val[1])); - }, - { - __at_align__ float tmp[size()]; - __at_align__ float tmp_exp[size()]; - store(tmp); - exp.store(tmp_exp); - for (size_t i = 0; i < size(); ++i) { - tmp[i] = std::atan2(tmp[i], tmp_exp[i]); - } - return loadu(tmp); - } - ) - } - Vectorized copysign(const Vectorized &sign) const { - USE_SLEEF( - { - return Vectorized(Sleef_copysignf4(values.val[0], sign.values.val[0]), - Sleef_copysignf4(values.val[1], sign.values.val[1])); - }, - { - __at_align__ float tmp[size()]; - __at_align__ float tmp_sign[size()]; - store(tmp); - sign.store(tmp_sign); - for (size_t i = 0; i < size(); i++) { - tmp[i] = std::copysign(tmp[i], tmp_sign[i]); - } - return loadu(tmp); - } - ) - } - Vectorized erf() const { - return USE_SLEEF( - Vectorized(Sleef_erff4_u10(values.val[0]), Sleef_erff4_u10(values.val[1])), - map(std::erf); - ); - } - Vectorized erfc() const { - return USE_SLEEF( - Vectorized(Sleef_erfcf4_u15(values.val[0]), Sleef_erfcf4_u15(values.val[1])), - map(std::erfc) - ); - } - Vectorized exp() const { - return USE_SLEEF( - Vectorized(Sleef_expf4_u10(values.val[0]), Sleef_expf4_u10(values.val[1])), - map(std::exp) - ); - } - 
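// ---------------------------------------------------------------------------
// [Editorial sketch, not part of the deleted header] The non-Sleef branch of
// USE_SLEEF above degrades to map(): spill the lanes to memory, apply the
// scalar libm routine per lane, and reload. Shown standalone for a single
// float32x4_t half; the helper name is illustrative only.
// ---------------------------------------------------------------------------
#include <arm_neon.h>
#include <cmath>

static inline float32x4_t example_map_expf(float32x4_t v) {
  float tmp[4];
  vst1q_f32(tmp, v);  // store the four lanes
  for (int i = 0; i < 4; ++i) {
    tmp[i] = std::exp(tmp[i]);  // scalar fallback, one lane at a time
  }
  return vld1q_f32(tmp);  // reload the results as a vector
}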
Vectorized exp2() const { - return USE_SLEEF( - Vectorized(Sleef_exp2f4_u10(values.val[0]), Sleef_exp2f4_u10(values.val[1])), - map(std::exp2) - ); - } - Vectorized expm1() const { - return USE_SLEEF( - Vectorized(Sleef_expm1f4_u10(values.val[0]), Sleef_expm1f4_u10(values.val[1])), - map(std::expm1) - ); - } - Vectorized fmod(const Vectorized& q) const { - USE_SLEEF( - { - return Vectorized(Sleef_fmodf4(values.val[0], q.values.val[0]), - Sleef_fmodf4(values.val[1], q.values.val[1])); - }, - { - __at_align__ float tmp[size()]; - __at_align__ float tmp_q[size()]; - store(tmp); - q.store(tmp_q); - for (size_t i = 0; i < size(); ++i) { - tmp[i] = std::fmod(tmp[i], tmp_q[i]); - } - return loadu(tmp); - } - ) - } - Vectorized hypot(const Vectorized &b) const { - USE_SLEEF( - { - return Vectorized(Sleef_hypotf4_u05(values.val[0], b.values.val[0]), - Sleef_hypotf4_u05(values.val[1], b.values.val[1])); - }, - { - __at_align__ float tmp[size()]; - __at_align__ float tmp_b[size()]; - store(tmp); - b.store(tmp_b); - for (size_t i = 0; i < size(); ++i) { - tmp[i] = std::hypot(tmp[i], tmp_b[i]); - } - return loadu(tmp); - } - ) - } - Vectorized log() const { - return USE_SLEEF( - Vectorized(Sleef_logf4_u10(values.val[0]), Sleef_logf4_u10(values.val[1])), - map(std::log) - ); - } - Vectorized log10() const { - return USE_SLEEF( - Vectorized(Sleef_log10f4_u10(values.val[0]), Sleef_log10f4_u10(values.val[1])), - map(std::log10) - ); - } - Vectorized log1p() const { - return USE_SLEEF( - Vectorized(Sleef_log1pf4_u10(values.val[0]), Sleef_log1pf4_u10(values.val[1])), - map(std::log1p) - ); - } - Vectorized log2() const { - return USE_SLEEF( - Vectorized(Sleef_log2f4_u10(values.val[0]), Sleef_log2f4_u10(values.val[1])), - map(std::log2) - ); - } - Vectorized nextafter(const Vectorized &b) const { - USE_SLEEF( - { - return Vectorized(Sleef_nextafterf4(values.val[0], b.values.val[0]), - Sleef_nextafterf4(values.val[1], b.values.val[1])); - }, - { - __at_align__ float tmp[size()]; - __at_align__ float tmp_b[size()]; - store(tmp); - b.store(tmp_b); - for (size_t i = 0; i < size(); ++i) { - tmp[i] = std::nextafter(tmp[i], tmp_b[i]); - } - return loadu(tmp); - } - ) - } - Vectorized frac() const; - Vectorized sin() const { - return USE_SLEEF( - Vectorized(Sleef_sinf4_u10(values.val[0]), Sleef_sinf4_u10(values.val[1])), - map(std::sin) - ); - } - Vectorized sinh() const { - return USE_SLEEF( - Vectorized(Sleef_sinhf4_u10(values.val[0]), Sleef_sinhf4_u10(values.val[1])), - map(std::sinh) - ); - } - Vectorized cos() const { - return USE_SLEEF( - Vectorized(Sleef_cosf4_u10(values.val[0]), Sleef_cosf4_u10(values.val[1])), - map(std::cos) - ); - } - Vectorized cosh() const { - return USE_SLEEF( - Vectorized(Sleef_coshf4_u10(values.val[0]), Sleef_coshf4_u10(values.val[1])), - map(std::cosh) - ); - } - Vectorized ceil() const { - return map(std::ceil); - } - Vectorized floor() const { - return map(std::floor); - } - Vectorized neg() const { - return Vectorized( - vnegq_f32(values.val[0]), - vnegq_f32(values.val[1])); - } - Vectorized round() const { - return map(std::round); - } - Vectorized tan() const { - return USE_SLEEF( - Vectorized(Sleef_tanf4_u10(values.val[0]), Sleef_tanf4_u10(values.val[1])), - map(std::tan) - ); - } - Vectorized tanh() const { - return USE_SLEEF( - Vectorized(Sleef_tanhf4_u10(values.val[0]), Sleef_tanhf4_u10(values.val[1])), - map(std::tanh) - ); - } - Vectorized trunc() const { - float32x4_t r0 = vrndq_f32(values.val[0]); - float32x4_t r1 = vrndq_f32(values.val[1]); - return Vectorized(r0, r1); 
- } - Vectorized lgamma() const { - return USE_SLEEF( - Vectorized(Sleef_lgammaf4_u10(values.val[0]), Sleef_lgammaf4_u10(values.val[1])), - map(std::lgamma) - ); - } - Vectorized sqrt() const { - return Vectorized( - vsqrtq_f32(values.val[0]), - vsqrtq_f32(values.val[1])); - } - Vectorized reciprocal() const { - auto r0 = vdivq_f32(vdupq_n_f32(1.0f), values.val[0]); - auto r1 = vdivq_f32(vdupq_n_f32(1.0f), values.val[1]); - return Vectorized(r0, r1); - } - Vectorized rsqrt() const { - return this->sqrt().reciprocal(); - } - Vectorized pow(const Vectorized &exp) const { - USE_SLEEF( - { - return Vectorized(Sleef_powf4_u10(values.val[0], exp.values.val[0]), - Sleef_powf4_u10(values.val[1], exp.values.val[1])); - }, - { - __at_align__ float tmp[size()]; - __at_align__ float tmp_exp[size()]; - store(tmp); - exp.store(tmp_exp); - for (size_t i = 0; i < size(); ++i) { - tmp[i] = std::pow(tmp[i], tmp_exp[i]); - } - return loadu(tmp); - } - ) - } - Vectorized operator==(const Vectorized& other) const { - float32x4_t r0 = - vreinterpretq_f32_u32(vceqq_f32(values.val[0], other.values.val[0])); - float32x4_t r1 = - vreinterpretq_f32_u32(vceqq_f32(values.val[1], other.values.val[1])); - return Vectorized(r0, r1); - } - - Vectorized operator!=(const Vectorized& other) const { - float32x4_t r0 = vreinterpretq_f32_u32( - vmvnq_u32(vceqq_f32(values.val[0], other.values.val[0]))); - float32x4_t r1 = vreinterpretq_f32_u32( - vmvnq_u32(vceqq_f32(values.val[1], other.values.val[1]))); - return Vectorized(r0, r1); - } - - Vectorized operator<(const Vectorized& other) const { - float32x4_t r0 = - vreinterpretq_f32_u32(vcltq_f32(values.val[0], other.values.val[0])); - float32x4_t r1 = - vreinterpretq_f32_u32(vcltq_f32(values.val[1], other.values.val[1])); - return Vectorized(r0, r1); - } - - Vectorized operator<=(const Vectorized& other) const { - float32x4_t r0 = - vreinterpretq_f32_u32(vcleq_f32(values.val[0], other.values.val[0])); - float32x4_t r1 = - vreinterpretq_f32_u32(vcleq_f32(values.val[1], other.values.val[1])); - return Vectorized(r0, r1); - } - - Vectorized operator>(const Vectorized& other) const { - float32x4_t r0 = - vreinterpretq_f32_u32(vcgtq_f32(values.val[0], other.values.val[0])); - float32x4_t r1 = - vreinterpretq_f32_u32(vcgtq_f32(values.val[1], other.values.val[1])); - return Vectorized(r0, r1); - } - - Vectorized operator>=(const Vectorized& other) const { - float32x4_t r0 = - vreinterpretq_f32_u32(vcgeq_f32(values.val[0], other.values.val[0])); - float32x4_t r1 = - vreinterpretq_f32_u32(vcgeq_f32(values.val[1], other.values.val[1])); - return Vectorized(r0, r1); - } - - Vectorized eq(const Vectorized& other) const; - Vectorized ne(const Vectorized& other) const; - Vectorized gt(const Vectorized& other) const; - Vectorized ge(const Vectorized& other) const; - Vectorized lt(const Vectorized& other) const; - Vectorized le(const Vectorized& other) const; -}; - -template <> -Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { - float32x4_t r0 = vaddq_f32(a.get_low(), b.get_low()); - float32x4_t r1 = vaddq_f32(a.get_high(), b.get_high()); - return Vectorized(r0, r1); -} - -template <> -Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { - float32x4_t r0 = vsubq_f32(a.get_low(), b.get_low()); - float32x4_t r1 = vsubq_f32(a.get_high(), b.get_high()); - return Vectorized(r0, r1); -} - -template <> -Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { - float32x4_t r0 = vmulq_f32(a.get_low(), b.get_low()); - float32x4_t r1 = 
vmulq_f32(a.get_high(), b.get_high()); - return Vectorized(r0, r1); -} - -template <> -Vectorized inline operator/(const Vectorized& a, const Vectorized& b) { - float32x4_t r0 = vdivq_f32(a.get_low(), b.get_low()); - float32x4_t r1 = vdivq_f32(a.get_high(), b.get_high()); - return Vectorized(r0, r1); -} - -// frac. Implement this here so we can use subtraction -inline Vectorized Vectorized::frac() const { - return *this - this->trunc(); -} - -// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if -// either input is a NaN. -template <> -Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { - float32x4_t r0 = vmaxq_f32(a.get_low(), b.get_low()); - float32x4_t r1 = vmaxq_f32(a.get_high(), b.get_high()); - return Vectorized(r0, r1); -} - -// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if -// either input is a NaN. -template <> -Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { - float32x4_t r0 = vminq_f32(a.get_low(), b.get_low()); - float32x4_t r1 = vminq_f32(a.get_high(), b.get_high()); - return Vectorized(r0, r1); -} - -template <> -Vectorized inline clamp(const Vectorized& a, const Vectorized& min, const Vectorized& max) { - return minimum(max, maximum(min, a)); -} - -template <> -Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max) { - return minimum(max, a); -} - -template <> -Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min) { - return maximum(min, a); -} - -template <> -Vectorized inline operator&(const Vectorized& a, const Vectorized& b) { - float32x4_t r0 = vreinterpretq_f32_u32(vandq_u32( - vreinterpretq_u32_f32(a.get_low()), - vreinterpretq_u32_f32(b.get_low()))); - float32x4_t r1 = vreinterpretq_f32_u32(vandq_u32( - vreinterpretq_u32_f32(a.get_high()), - vreinterpretq_u32_f32(b.get_high()))); - return Vectorized(r0, r1); -} - -template <> -Vectorized inline operator|(const Vectorized& a, const Vectorized& b) { - float32x4_t r0 = vreinterpretq_f32_u32(vorrq_u32( - vreinterpretq_u32_f32(a.get_low()), - vreinterpretq_u32_f32(b.get_low()))); - float32x4_t r1 = vreinterpretq_f32_u32(vorrq_u32( - vreinterpretq_u32_f32(a.get_high()), - vreinterpretq_u32_f32(b.get_high()))); - return Vectorized(r0, r1); -} - -template <> -Vectorized inline operator^(const Vectorized& a, const Vectorized& b) { - float32x4_t r0 = vreinterpretq_f32_u32(veorq_u32( - vreinterpretq_u32_f32(a.get_low()), - vreinterpretq_u32_f32(b.get_low()))); - float32x4_t r1 = vreinterpretq_f32_u32(veorq_u32( - vreinterpretq_u32_f32(a.get_high()), - vreinterpretq_u32_f32(b.get_high()))); - return Vectorized(r0, r1); -} - -inline Vectorized Vectorized::eq(const Vectorized& other) const { - return (*this == other) & Vectorized(1.0f); -} - -inline Vectorized Vectorized::ne(const Vectorized& other) const { - return (*this != other) & Vectorized(1.0f); -} - -inline Vectorized Vectorized::gt(const Vectorized& other) const { - return (*this > other) & Vectorized(1.0f); -} - -inline Vectorized Vectorized::ge(const Vectorized& other) const { - return (*this >= other) & Vectorized(1.0f); -} - -inline Vectorized Vectorized::lt(const Vectorized& other) const { - return (*this < other) & Vectorized(1.0f); -} - -inline Vectorized Vectorized::le(const Vectorized& other) const { - return (*this <= other) & Vectorized(1.0f); -} - -template <> -inline void convert(const float* src, int32_t* dst, int64_t n) { - int64_t i; -#pragma unroll - for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { - 
vst1q_s32(dst + i, vcvtq_s32_f32(vld1q_f32(src + i))); - vst1q_s32(dst + i + 4, vcvtq_s32_f32(vld1q_f32(src + i + 4))); - } -#pragma unroll - for (; i < n; i++) { - dst[i] = static_cast(src[i]); - } -} - -template <> -inline void convert(const int32_t* src, float* dst, int64_t n) { - int64_t i; -#pragma unroll - for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { - vst1q_f32(dst + i, vcvtq_f32_s32(vld1q_s32(src + i))); - vst1q_f32(dst + i + 4, vcvtq_f32_s32(vld1q_s32(src + i + 4))); - } -#pragma unroll - for (; i < n; i++) { - dst[i] = static_cast(src[i]); - } -} - -template <> -Vectorized inline fmadd(const Vectorized& a, const Vectorized& b, const Vectorized& c) { - float32x4_t r0 = vfmaq_f32(c.get_low(), a.get_low(), b.get_low()); - float32x4_t r1 = vfmaq_f32(c.get_high(), a.get_high(), b.get_high()); - return Vectorized(r0, r1); -} - -template <> -Vectorized inline fmsub(const Vectorized& a, const Vectorized& b, const Vectorized& c) { - float32x4_t r0 = vfmsq_f32(c.get_low(), a.get_low(), b.get_low()); - float32x4_t r1 = vfmsq_f32(c.get_high(), a.get_high(), b.get_high()); - return Vectorized(r0, r1); -} - -#endif /* defined(aarch64) */ - -}}} diff --git a/kernels/optimized/vec/vec256/vec256_int.h b/kernels/optimized/vec/vec256/vec256_int.h deleted file mode 100644 index c6982641fa..0000000000 --- a/kernels/optimized/vec/vec256/vec256_int.h +++ /dev/null @@ -1,1537 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -// DO NOT DEFINE STATIC DATA IN THIS HEADER! -// See Note [Do not compile initializers with AVX] - -#include -#include - -#include - -namespace executorch { -namespace vec { -inline namespace CPU_CAPABILITY { - -#ifdef CPU_CAPABILITY_AVX2 - -struct Vectorizedi { -protected: - __m256i values; - - static inline __m256i invert(const __m256i& v) { - const auto ones = _mm256_set1_epi64x(-1); - return _mm256_xor_si256(ones, v); - } -public: - Vectorizedi() {} - Vectorizedi(__m256i v) : values(v) {} - operator __m256i() const { - return values; - } -}; - -#else - -struct Vectorizedi {}; // dummy definition to make Vectorizedi always defined - -#endif // CPU_CAPABILITY_AVX2 - -#ifdef CPU_CAPABILITY_AVX2 - -template <> -class Vectorized : public Vectorizedi { -private: - static const Vectorized ones; -public: - using value_type = int64_t; - using size_type = int; - static constexpr size_type size() { - return 4; - } - using Vectorizedi::Vectorizedi; - Vectorized() {} - Vectorized(int64_t v) { values = _mm256_set1_epi64x(v); } - Vectorized(int64_t val1, int64_t val2, int64_t val3, int64_t val4) { - values = _mm256_setr_epi64x(val1, val2, val3, val4); - } - template - static Vectorized blend(Vectorized a, Vectorized b) { - __at_align__ int64_t tmp_values[size()]; - a.store(tmp_values); - if (mask & 0x01) - tmp_values[0] = _mm256_extract_epi64(b.values, 0); - if (mask & 0x02) - tmp_values[1] = _mm256_extract_epi64(b.values, 1); - if (mask & 0x04) - tmp_values[2] = _mm256_extract_epi64(b.values, 2); - if (mask & 0x08) - tmp_values[3] = _mm256_extract_epi64(b.values, 3); - return loadu(tmp_values); - } - static Vectorized blendv(const Vectorized& a, const Vectorized& b, - const Vectorized& mask) { - return _mm256_blendv_epi8(a.values, b.values, mask.values); - } - template - static Vectorized arange(int64_t base = 0, step_t step = static_cast(1)) { - return 
Vectorized(base, base + step, base + 2 * step, base + 3 * step); - } - static Vectorized - set(Vectorized a, Vectorized b, int64_t count = size()) { - switch (count) { - case 0: - return a; - case 1: - return blend<1>(a, b); - case 2: - return blend<3>(a, b); - case 3: - return blend<7>(a, b); - } - return b; - } - static Vectorized loadu(const void* ptr) { - return _mm256_loadu_si256(reinterpret_cast(ptr)); - } - static Vectorized loadu(const void* ptr, int64_t count) { - __at_align__ int64_t tmp_values[size()]; - // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 - // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two - // instructions while a loop would be compiled to one instruction. - for (size_t i = 0; i < size(); ++i) { - tmp_values[i] = 0; - } - std::memcpy(tmp_values, ptr, count * sizeof(int64_t)); - return loadu(tmp_values); - } - void store(void* ptr, int count = size()) const { - if (count == size()) { - // ptr need not to be aligned here. See - // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm256-storeu-si256.html - _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values); - } else if (count > 0) { - __at_align__ int64_t tmp_values[size()]; - _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values); - std::memcpy(ptr, tmp_values, count * sizeof(int64_t)); - } - } - const int64_t& operator[](int idx) const = delete; - int64_t& operator[](int idx) = delete; - Vectorized abs() const { - auto zero = _mm256_set1_epi64x(0); - auto is_larger = _mm256_cmpgt_epi64(zero, values); - auto inverse = _mm256_xor_si256(values, is_larger); - return _mm256_sub_epi64(inverse, is_larger); - } - Vectorized real() const { - return *this; - } - Vectorized imag() const { - return _mm256_set1_epi64x(0); - } - Vectorized conj() const { - return *this; - } - Vectorized neg() const; - Vectorized operator==(const Vectorized& other) const { - return _mm256_cmpeq_epi64(values, other.values); - } - Vectorized operator!=(const Vectorized& other) const { - return invert(_mm256_cmpeq_epi64(values, other.values)); - } - Vectorized operator<(const Vectorized& other) const { - return _mm256_cmpgt_epi64(other.values, values); - } - Vectorized operator<=(const Vectorized& other) const { - return invert(_mm256_cmpgt_epi64(values, other.values)); - } - Vectorized operator>(const Vectorized& other) const { - return _mm256_cmpgt_epi64(values, other.values); - } - Vectorized operator>=(const Vectorized& other) const { - return invert(_mm256_cmpgt_epi64(other.values, values)); - } - - Vectorized eq(const Vectorized& other) const; - Vectorized ne(const Vectorized& other) const; - Vectorized gt(const Vectorized& other) const; - Vectorized ge(const Vectorized& other) const; - Vectorized lt(const Vectorized& other) const; - Vectorized le(const Vectorized& other) const; -}; - -template <> -class Vectorized : public Vectorizedi { -private: - static const Vectorized ones; -public: - using value_type = int32_t; - using size_type = int; - static constexpr int size() { - return 8; - } - using Vectorizedi::Vectorizedi; - Vectorized() {} - Vectorized(int32_t v) { values = _mm256_set1_epi32(v); } - Vectorized(int32_t val1, int32_t val2, int32_t val3, int32_t val4, - int32_t val5, int32_t val6, int32_t val7, int32_t 
val8) { - values = _mm256_setr_epi32(val1, val2, val3, val4, val5, val6, val7, val8); - } - template - static Vectorized blend(Vectorized a, Vectorized b) { - return _mm256_blend_epi32(a, b, mask); - } - static Vectorized blendv(const Vectorized& a, const Vectorized& b, - const Vectorized& mask) { - return _mm256_blendv_epi8(a.values, b.values, mask.values); - } - template - static Vectorized arange(int32_t base = 0, step_t step = static_cast(1)) { - return Vectorized( - base, base + step, base + 2 * step, base + 3 * step, - base + 4 * step, base + 5 * step, base + 6 * step, base + 7 * step); - } - static Vectorized - set(Vectorized a, Vectorized b, int32_t count = size()) { - switch (count) { - case 0: - return a; - case 1: - return blend<1>(a, b); - case 2: - return blend<3>(a, b); - case 3: - return blend<7>(a, b); - case 4: - return blend<15>(a, b); - case 5: - return blend<31>(a, b); - case 6: - return blend<63>(a, b); - case 7: - return blend<127>(a, b); - } - return b; - } - static Vectorized loadu(const void* ptr) { - return _mm256_loadu_si256(reinterpret_cast(ptr)); - } - static Vectorized loadu(const void* ptr, int32_t count) { - __at_align__ int32_t tmp_values[size()]; - // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 - // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two - // instructions while a loop would be compiled to one instruction. - for (size_t i = 0; i < size(); ++i) { - tmp_values[i] = 0; - } - std::memcpy(tmp_values, ptr, count * sizeof(int32_t)); - return loadu(tmp_values); - } - void store(void* ptr, int count = size()) const { - if (count == size()) { - // ptr need not to be aligned here. See - // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm256-storeu-si256.html - _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values); - } else if (count > 0) { - __at_align__ int32_t tmp_values[size()]; - _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values); - std::memcpy(ptr, tmp_values, count * sizeof(int32_t)); - } - } - const int32_t& operator[](int idx) const = delete; - int32_t& operator[](int idx) = delete; - Vectorized abs() const { - return _mm256_abs_epi32(values); - } - Vectorized real() const { - return *this; - } - Vectorized imag() const { - return _mm256_set1_epi32(0); - } - Vectorized conj() const { - return *this; - } - Vectorized neg() const; - Vectorized operator==(const Vectorized& other) const { - return _mm256_cmpeq_epi32(values, other.values); - } - Vectorized operator!=(const Vectorized& other) const { - return invert(_mm256_cmpeq_epi32(values, other.values)); - } - Vectorized operator<(const Vectorized& other) const { - return _mm256_cmpgt_epi32(other.values, values); - } - Vectorized operator<=(const Vectorized& other) const { - return invert(_mm256_cmpgt_epi32(values, other.values)); - } - Vectorized operator>(const Vectorized& other) const { - return _mm256_cmpgt_epi32(values, other.values); - } - Vectorized operator>=(const Vectorized& other) const { - return invert(_mm256_cmpgt_epi32(other.values, values)); - } - Vectorized eq(const Vectorized& other) const; - Vectorized ne(const Vectorized& other) const; - Vectorized gt(const Vectorized& other) const; - Vectorized ge(const Vectorized& other) const; - Vectorized lt(const 
Vectorized& other) const; - Vectorized le(const Vectorized& other) const; -}; - -template <> -inline void convert(const int32_t *src, float *dst, int64_t n) { - int64_t i; - // int32_t and float have same size -#ifndef _MSC_VER -# pragma unroll -#endif - for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { - auto input_vec = _mm256_loadu_si256(reinterpret_cast(src + i)); - auto output_vec = _mm256_cvtepi32_ps(input_vec); - _mm256_storeu_ps(reinterpret_cast(dst + i), output_vec); - } -#ifndef _MSC_VER -# pragma unroll -#endif - for (; i < n; i++) { - dst[i] = static_cast(src[i]); - } -} - -template <> -inline void convert(const int32_t *src, double *dst, int64_t n) { - int64_t i; - // int32_t has half the size of double -#ifndef _MSC_VER -# pragma unroll -#endif - for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { - auto input_128_vec = _mm_loadu_si128(reinterpret_cast(src + i)); - auto output_vec = _mm256_cvtepi32_pd(input_128_vec); - _mm256_storeu_pd(reinterpret_cast(dst + i), output_vec); - } -#ifndef _MSC_VER -# pragma unroll -#endif - for (; i < n; i++) { - dst[i] = static_cast(src[i]); - } -} - -template <> -class Vectorized : public Vectorizedi { -private: - static const Vectorized ones; -public: - using value_type = int16_t; - using size_type = int; - static constexpr int size() { - return 16; - } - using Vectorizedi::Vectorizedi; - Vectorized() {} - Vectorized(int16_t v) { values = _mm256_set1_epi16(v); } - Vectorized(int16_t val1, int16_t val2, int16_t val3, int16_t val4, - int16_t val5, int16_t val6, int16_t val7, int16_t val8, - int16_t val9, int16_t val10, int16_t val11, int16_t val12, - int16_t val13, int16_t val14, int16_t val15, int16_t val16) { - values = _mm256_setr_epi16(val1, val2, val3, val4, val5, val6, val7, val8, - val9, val10, val11, val12, val13, val14, val15, val16); - } - template - static Vectorized blend(Vectorized a, Vectorized b) { - __at_align__ int16_t tmp_values[size()]; - a.store(tmp_values); - if (mask & 0x01) - tmp_values[0] = _mm256_extract_epi16(b.values, 0); - if (mask & 0x02) - tmp_values[1] = _mm256_extract_epi16(b.values, 1); - if (mask & 0x04) - tmp_values[2] = _mm256_extract_epi16(b.values, 2); - if (mask & 0x08) - tmp_values[3] = _mm256_extract_epi16(b.values, 3); - if (mask & 0x10) - tmp_values[4] = _mm256_extract_epi16(b.values, 4); - if (mask & 0x20) - tmp_values[5] = _mm256_extract_epi16(b.values, 5); - if (mask & 0x40) - tmp_values[6] = _mm256_extract_epi16(b.values, 6); - if (mask & 0x80) - tmp_values[7] = _mm256_extract_epi16(b.values, 7); - if (mask & 0x100) - tmp_values[8] = _mm256_extract_epi16(b.values, 8); - if (mask & 0x200) - tmp_values[9] = _mm256_extract_epi16(b.values, 9); - if (mask & 0x400) - tmp_values[10] = _mm256_extract_epi16(b.values, 10); - if (mask & 0x800) - tmp_values[11] = _mm256_extract_epi16(b.values, 11); - if (mask & 0x1000) - tmp_values[12] = _mm256_extract_epi16(b.values, 12); - if (mask & 0x2000) - tmp_values[13] = _mm256_extract_epi16(b.values, 13); - if (mask & 0x4000) - tmp_values[14] = _mm256_extract_epi16(b.values, 14); - if (mask & 0x8000) - tmp_values[15] = _mm256_extract_epi16(b.values, 15); - return loadu(tmp_values); - } - static Vectorized blendv(const Vectorized& a, const Vectorized& b, - const Vectorized& mask) { - return _mm256_blendv_epi8(a.values, b.values, mask.values); - } - template - static Vectorized arange(int16_t base = 0, step_t step = static_cast(1)) { - return Vectorized( - base, base + step, base + 2 * step, base + 3 * step, - base + 4 * step, 
base + 5 * step, base + 6 * step, base + 7 * step, - base + 8 * step, base + 9 * step, base + 10 * step, base + 11 * step, - base + 12 * step, base + 13 * step, base + 14 * step, base + 15 * step); - } - static Vectorized - set(Vectorized a, Vectorized b, int16_t count = size()) { - switch (count) { - case 0: - return a; - case 1: - return blend<1>(a, b); - case 2: - return blend<3>(a, b); - case 3: - return blend<7>(a, b); - case 4: - return blend<15>(a, b); - case 5: - return blend<31>(a, b); - case 6: - return blend<63>(a, b); - case 7: - return blend<127>(a, b); - case 8: - return blend<255>(a, b); - case 9: - return blend<511>(a, b); - case 10: - return blend<1023>(a, b); - case 11: - return blend<2047>(a, b); - case 12: - return blend<4095>(a, b); - case 13: - return blend<8191>(a, b); - case 14: - return blend<16383>(a, b); - case 15: - return blend<32767>(a, b); - } - return b; - } - static Vectorized loadu(const void* ptr) { - return _mm256_loadu_si256(reinterpret_cast(ptr)); - } - static Vectorized loadu(const void* ptr, int16_t count) { - __at_align__ int16_t tmp_values[size()]; - // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 - // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two - // instructions while a loop would be compiled to one instruction. - for (size_t i = 0; i < size(); ++i) { - tmp_values[i] = 0; - } - std::memcpy(tmp_values, ptr, count * sizeof(int16_t)); - return loadu(tmp_values); - } - void store(void* ptr, int count = size()) const { - if (count == size()) { - // ptr need not to be aligned here. See - // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm256-storeu-si256.html - _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values); - } else if (count > 0) { - __at_align__ int16_t tmp_values[size()]; - _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values); - std::memcpy(ptr, tmp_values, count * sizeof(int16_t)); - } - } - const int16_t& operator[](int idx) const = delete; - int16_t& operator[](int idx) = delete; - Vectorized abs() const { - return _mm256_abs_epi16(values); - } - Vectorized real() const { - return *this; - } - Vectorized imag() const { - return _mm256_set1_epi16(0); - } - Vectorized conj() const { - return *this; - } - Vectorized neg() const; - Vectorized operator==(const Vectorized& other) const { - return _mm256_cmpeq_epi16(values, other.values); - } - Vectorized operator!=(const Vectorized& other) const { - return invert(_mm256_cmpeq_epi16(values, other.values)); - } - Vectorized operator<(const Vectorized& other) const { - return _mm256_cmpgt_epi16(other.values, values); - } - Vectorized operator<=(const Vectorized& other) const { - return invert(_mm256_cmpgt_epi16(values, other.values)); - } - Vectorized operator>(const Vectorized& other) const { - return _mm256_cmpgt_epi16(values, other.values); - } - Vectorized operator>=(const Vectorized& other) const { - return invert(_mm256_cmpgt_epi16(other.values, values)); - } - - Vectorized eq(const Vectorized& other) const; - Vectorized ne(const Vectorized& other) const; - Vectorized gt(const Vectorized& other) const; - Vectorized ge(const Vectorized& other) const; - Vectorized lt(const Vectorized& other) const; - Vectorized le(const Vectorized& other) const; -}; - -template 
-class Vectorized8 : public Vectorizedi { - static_assert( - std::is_same::value || std::is_same::value, - "Only int8_t/uint8_t are supported"); -protected: - static const Vectorized ones; -public: - using value_type = T; - using size_type = int; - static constexpr int size() { - return 32; - } - using Vectorizedi::Vectorizedi; - Vectorized8() {} - Vectorized8(T v) { values = _mm256_set1_epi8(v); } - Vectorized8(T val1, T val2, T val3, T val4, - T val5, T val6, T val7, T val8, - T val9, T val10, T val11, T val12, - T val13, T val14, T val15, T val16, - T val17, T val18, T val19, T val20, - T val21, T val22, T val23, T val24, - T val25, T val26, T val27, T val28, - T val29, T val30, T val31, T val32) { - values = _mm256_setr_epi8(val1, val2, val3, val4, val5, val6, val7, val8, - val9, val10, val11, val12, val13, val14, val15, val16, - val17, val18, val19, val20, val21, val22, val23, val24, - val25, val26, val27, val28, val29, val30, val31, val32); - } - template - static Vectorized blend(Vectorized a, Vectorized b) { - __at_align__ T tmp_values[size()]; - a.store(tmp_values); - if (mask & 0x01) - tmp_values[0] = _mm256_extract_epi8(b.values, 0); - if (mask & 0x02) - tmp_values[1] = _mm256_extract_epi8(b.values, 1); - if (mask & 0x04) - tmp_values[2] = _mm256_extract_epi8(b.values, 2); - if (mask & 0x08) - tmp_values[3] = _mm256_extract_epi8(b.values, 3); - if (mask & 0x10) - tmp_values[4] = _mm256_extract_epi8(b.values, 4); - if (mask & 0x20) - tmp_values[5] = _mm256_extract_epi8(b.values, 5); - if (mask & 0x40) - tmp_values[6] = _mm256_extract_epi8(b.values, 6); - if (mask & 0x80) - tmp_values[7] = _mm256_extract_epi8(b.values, 7); - if (mask & 0x100) - tmp_values[8] = _mm256_extract_epi8(b.values, 8); - if (mask & 0x200) - tmp_values[9] = _mm256_extract_epi8(b.values, 9); - if (mask & 0x400) - tmp_values[10] = _mm256_extract_epi8(b.values, 10); - if (mask & 0x800) - tmp_values[11] = _mm256_extract_epi8(b.values, 11); - if (mask & 0x1000) - tmp_values[12] = _mm256_extract_epi8(b.values, 12); - if (mask & 0x2000) - tmp_values[13] = _mm256_extract_epi8(b.values, 13); - if (mask & 0x4000) - tmp_values[14] = _mm256_extract_epi8(b.values, 14); - if (mask & 0x8000) - tmp_values[15] = _mm256_extract_epi8(b.values, 15); - if (mask & 0x010000) - tmp_values[16] = _mm256_extract_epi8(b.values, 16); - if (mask & 0x020000) - tmp_values[17] = _mm256_extract_epi8(b.values, 17); - if (mask & 0x040000) - tmp_values[18] = _mm256_extract_epi8(b.values, 18); - if (mask & 0x080000) - tmp_values[19] = _mm256_extract_epi8(b.values, 19); - if (mask & 0x100000) - tmp_values[20] = _mm256_extract_epi8(b.values, 20); - if (mask & 0x200000) - tmp_values[21] = _mm256_extract_epi8(b.values, 21); - if (mask & 0x400000) - tmp_values[22] = _mm256_extract_epi8(b.values, 22); - if (mask & 0x800000) - tmp_values[23] = _mm256_extract_epi8(b.values, 23); - if (mask & 0x1000000) - tmp_values[24] = _mm256_extract_epi8(b.values, 24); - if (mask & 0x2000000) - tmp_values[25] = _mm256_extract_epi8(b.values, 25); - if (mask & 0x4000000) - tmp_values[26] = _mm256_extract_epi8(b.values, 26); - if (mask & 0x8000000) - tmp_values[27] = _mm256_extract_epi8(b.values, 27); - if (mask & 0x10000000) - tmp_values[28] = _mm256_extract_epi8(b.values, 28); - if (mask & 0x20000000) - tmp_values[29] = _mm256_extract_epi8(b.values, 29); - if (mask & 0x40000000) - tmp_values[30] = _mm256_extract_epi8(b.values, 30); - if (mask & 0x80000000) - tmp_values[31] = _mm256_extract_epi8(b.values, 31); - return loadu(tmp_values); - } - static Vectorized 
blendv(const Vectorized& a, const Vectorized& b, - const Vectorized& mask) { - return _mm256_blendv_epi8(a.values, b.values, mask.values); - } - template - static Vectorized arange(T base = 0, step_t step = static_cast(1)) { - return Vectorized( - base, base + step, base + 2 * step, base + 3 * step, - base + 4 * step, base + 5 * step, base + 6 * step, base + 7 * step, - base + 8 * step, base + 9 * step, base + 10 * step, base + 11 * step, - base + 12 * step, base + 13 * step, base + 14 * step, base + 15 * step, - base + 16 * step, base + 17 * step, base + 18 * step, base + 19 * step, - base + 20 * step, base + 21 * step, base + 22 * step, base + 23 * step, - base + 24 * step, base + 25 * step, base + 26 * step, base + 27 * step, - base + 28 * step, base + 29 * step, base + 30 * step, base + 31 * step); - } - static Vectorized - set(Vectorized a, Vectorized b, T count = size()) { - switch (count) { - case 0: - return a; - case 1: - return blend<0x1>(a, b); - case 2: - return blend<0x3>(a, b); - case 3: - return blend<0x7>(a, b); - case 4: - return blend<0xF>(a, b); - case 5: - return blend<0x1F>(a, b); - case 6: - return blend<0x3F>(a, b); - case 7: - return blend<0x7F>(a, b); - case 8: - return blend<0xFF>(a, b); - case 9: - return blend<0x1FF>(a, b); - case 10: - return blend<0x3FF>(a, b); - case 11: - return blend<0x7FF>(a, b); - case 12: - return blend<0xFFF>(a, b); - case 13: - return blend<0x1FFF>(a, b); - case 14: - return blend<0x3FFF>(a, b); - case 15: - return blend<0x7FFF>(a, b); - case 16: - return blend<0xFFFF>(a, b); - case 17: - return blend<0x1FFFF>(a, b); - case 18: - return blend<0x3FFFF>(a, b); - case 19: - return blend<0x7FFFF>(a, b); - case 20: - return blend<0xFFFFF>(a, b); - case 21: - return blend<0x1FFFFF>(a, b); - case 22: - return blend<0x3FFFFF>(a, b); - case 23: - return blend<0x7FFFFF>(a, b); - case 24: - return blend<0xFFFFFF>(a, b); - case 25: - return blend<0x1FFFFFF>(a, b); - case 26: - return blend<0x3FFFFFF>(a, b); - case 27: - return blend<0x7FFFFFF>(a, b); - case 28: - return blend<0xFFFFFFF>(a, b); - case 29: - return blend<0x1FFFFFFF>(a, b); - case 30: - return blend<0x3FFFFFFF>(a, b); - case 31: - return blend<0x7FFFFFFF>(a, b); - } - return b; - } - static Vectorized loadu(const void* ptr) { - return _mm256_loadu_si256(reinterpret_cast(ptr)); - } - static Vectorized loadu(const void* ptr, T count) { - __at_align__ T tmp_values[size()]; - // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 - // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two - // instructions while a loop would be compiled to one instruction. - for (size_t i = 0; i < size(); ++i) { - tmp_values[i] = 0; - } - std::memcpy(tmp_values, ptr, count * sizeof(T)); - return loadu(tmp_values); - } - void store(void* ptr, int count = size()) const { - if (count == size()) { - // ptr need not to be aligned here. 
See - // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm256-storeu-si256.html - _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values); - } else if (count > 0) { - __at_align__ T tmp_values[size()]; - _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values); - std::memcpy(ptr, tmp_values, count * sizeof(T)); - } - } - const T& operator[](int idx) const = delete; - T& operator[](int idx) = delete; - Vectorized real() const { - return *this; - } - Vectorized imag() const { - return _mm256_set1_epi8(0); - } - Vectorized conj() const { - return *this; - } -}; - -template<> -class Vectorized: public Vectorized8 { -public: - using Vectorized8::Vectorized8; - - Vectorized neg() const; - - Vectorized abs() const { - return _mm256_abs_epi8(values); - } - - Vectorized operator==(const Vectorized& other) const { - return _mm256_cmpeq_epi8(values, other.values); - } - Vectorized operator!=(const Vectorized& other) const { - return invert(_mm256_cmpeq_epi8(values, other.values)); - } - Vectorized operator<(const Vectorized& other) const { - return _mm256_cmpgt_epi8(other.values, values); - } - Vectorized operator<=(const Vectorized& other) const { - return invert(_mm256_cmpgt_epi8(values, other.values)); - } - Vectorized operator>(const Vectorized& other) const { - return other < *this; - } - Vectorized operator>=(const Vectorized& other) const { - return other <= *this; - } - - Vectorized eq(const Vectorized& other) const; - Vectorized ne(const Vectorized& other) const; - Vectorized gt(const Vectorized& other) const; - Vectorized ge(const Vectorized& other) const; - Vectorized lt(const Vectorized& other) const; - Vectorized le(const Vectorized& other) const; -}; - -template<> -class Vectorized: public Vectorized8 { -public: - using Vectorized8::Vectorized8; - - Vectorized neg() const; - - Vectorized abs() const { - return *this; - } - - Vectorized operator==(const Vectorized& other) const { - return _mm256_cmpeq_epi8(values, other.values); - } - Vectorized operator!=(const Vectorized& other) const { - return invert(_mm256_cmpeq_epi8(values, other.values)); - } - Vectorized operator<(const Vectorized& other) const { - __m256i max = _mm256_max_epu8(values, other.values); - return invert(_mm256_cmpeq_epi8(max, values)); - } - Vectorized operator<=(const Vectorized& other) const { - __m256i max = _mm256_max_epu8(values, other.values); - return _mm256_cmpeq_epi8(max, other.values); - } - Vectorized operator>(const Vectorized& other) const { - return other < *this; - } - Vectorized operator>=(const Vectorized& other) const { - return other <= *this; - } - - Vectorized eq(const Vectorized& other) const; - Vectorized ne(const Vectorized& other) const; - Vectorized gt(const Vectorized& other) const; - Vectorized ge(const Vectorized& other) const; - Vectorized lt(const Vectorized& other) const; - Vectorized le(const Vectorized& other) const; -}; - -template <> -Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { - return _mm256_add_epi64(a, b); -} - -template <> -Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { - return _mm256_add_epi32(a, b); -} - -template <> -Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { - return _mm256_add_epi16(a, b); -} - -template <> -Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { - return 
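A small sketch of the unsigned-byte comparison trick above: AVX2 has no epu8 greater-than compare, so a < b is derived from max(a, b) == a plus an inversion. The names and the main() harness are mine.

#include <immintrin.h>
#include <cstdio>

int main() {
  __m256i a = _mm256_set1_epi8(static_cast<char>(200));  // 200 as uint8_t
  __m256i b = _mm256_set1_epi8(static_cast<char>(10));
  __m256i max_ab = _mm256_max_epu8(a, b);
  // a < b  <=>  !(max(a, b) == a); invert by XOR with all-ones.
  __m256i a_lt_b = _mm256_xor_si256(_mm256_cmpeq_epi8(max_ab, a),
                                    _mm256_set1_epi32(-1));
  // Prints 00000000: 200 < 10 is false in every lane.
  std::printf("%08x\n", (unsigned)_mm256_movemask_epi8(a_lt_b));
}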
_mm256_add_epi8(a, b); -} - -template <> -Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { - return _mm256_add_epi8(a, b); -} - -template <> -Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { - return _mm256_sub_epi64(a, b); -} - -template <> -Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { - return _mm256_sub_epi32(a, b); -} - -template <> -Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { - return _mm256_sub_epi16(a, b); -} - -template <> -Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { - return _mm256_sub_epi8(a, b); -} - -template <> -Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { - return _mm256_sub_epi8(a, b); -} - -// Negation. Defined here so we can utilize operator- -inline Vectorized Vectorized::neg() const { - return Vectorized(0) - *this; -} - -inline Vectorized Vectorized::neg() const { - return Vectorized(0) - *this; -} - -inline Vectorized Vectorized::neg() const { - return Vectorized(0) - *this; -} - -inline Vectorized Vectorized::neg() const { - return Vectorized(0) - *this; -} - -inline Vectorized Vectorized::neg() const { - return Vectorized(0) - *this; -} - -// Emulate operations with no native 64-bit support in avx, -// by extracting each element, performing the operation pointwise, -// then combining the results into a vector. -template -Vectorized inline emulate(const Vectorized& a, const Vectorized& b, const op_t& op) { - int64_t a0 = _mm256_extract_epi64(a, 0); - int64_t a1 = _mm256_extract_epi64(a, 1); - int64_t a2 = _mm256_extract_epi64(a, 2); - int64_t a3 = _mm256_extract_epi64(a, 3); - - int64_t b0 = _mm256_extract_epi64(b, 0); - int64_t b1 = _mm256_extract_epi64(b, 1); - int64_t b2 = _mm256_extract_epi64(b, 2); - int64_t b3 = _mm256_extract_epi64(b, 3); - - int64_t c0 = op(a0, b0); - int64_t c1 = op(a1, b1); - int64_t c2 = op(a2, b2); - int64_t c3 = op(a3, b3); - - return _mm256_set_epi64x(c3, c2, c1, c0); -} - -template -Vectorized inline emulate(const Vectorized& a, const Vectorized& b, const Vectorized& c, const op_t& op) { - int64_t a0 = _mm256_extract_epi64(a, 0); - int64_t a1 = _mm256_extract_epi64(a, 1); - int64_t a2 = _mm256_extract_epi64(a, 2); - int64_t a3 = _mm256_extract_epi64(a, 3); - - int64_t b0 = _mm256_extract_epi64(b, 0); - int64_t b1 = _mm256_extract_epi64(b, 1); - int64_t b2 = _mm256_extract_epi64(b, 2); - int64_t b3 = _mm256_extract_epi64(b, 3); - - int64_t c0 = _mm256_extract_epi64(c, 0); - int64_t c1 = _mm256_extract_epi64(c, 1); - int64_t c2 = _mm256_extract_epi64(c, 2); - int64_t c3 = _mm256_extract_epi64(c, 3); - - int64_t d0 = op(a0, b0, c0); - int64_t d1 = op(a1, b1, c1); - int64_t d2 = op(a2, b2, c2); - int64_t d3 = op(a3, b3, c3); - - return _mm256_set_epi64x(d3, d2, d1, d0); -} - -// AVX2 has no intrinsic for int64_t multiply so it needs to be emulated -// This could be implemented more efficiently using epi32 instructions -// This is also technically avx compatible, but then we'll need AVX -// code for add as well. -// Note: intentionally ignores undefined behavior like (-lowest * -1). 
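A hedged sketch of the emulate() pattern above for operations with no native 64-bit AVX2 instruction: extract the four lanes, apply a scalar op, and repack with _mm256_set_epi64x (which takes its arguments in high-to-low order).

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

template <class Op>
__m256i emulate_i64(__m256i a, __m256i b, Op op) {
  int64_t a0 = _mm256_extract_epi64(a, 0), b0 = _mm256_extract_epi64(b, 0);
  int64_t a1 = _mm256_extract_epi64(a, 1), b1 = _mm256_extract_epi64(b, 1);
  int64_t a2 = _mm256_extract_epi64(a, 2), b2 = _mm256_extract_epi64(b, 2);
  int64_t a3 = _mm256_extract_epi64(a, 3), b3 = _mm256_extract_epi64(b, 3);
  return _mm256_set_epi64x(op(a3, b3), op(a2, b2), op(a1, b1), op(a0, b0));
}

int main() {
  __m256i a = _mm256_set1_epi64x(6);
  __m256i b = _mm256_set1_epi64x(7);
  __m256i c = emulate_i64(a, b, [](int64_t x, int64_t y) { return x * y; });
  std::printf("%lld\n", (long long)_mm256_extract_epi64(c, 0));  // 42
}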
-template <> -Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { - return emulate(a, b, [](int64_t a_point, int64_t b_point) {return a_point * b_point;}); -} - -template <> -Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { - return _mm256_mullo_epi32(a, b); -} - -template <> -Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { - return _mm256_mullo_epi16(a, b); -} - -template -Vectorized inline int_elementwise_binary_256(const Vectorized& a, const Vectorized& b, Op op) { - T values_a[Vectorized::size()]; - T values_b[Vectorized::size()]; - a.store(values_a); - b.store(values_b); - for (size_t i = 0; i != Vectorized::size(); i++) { - values_a[i] = op(values_a[i], values_b[i]); - } - return Vectorized::loadu(values_a); -} - -template <> -Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { - // We don't have an instruction for multiplying int8_t - return int_elementwise_binary_256(a, b, std::multiplies()); -} - -template <> -Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { - // We don't have an instruction for multiplying uint8_t - return int_elementwise_binary_256(a, b, std::multiplies()); -} - -template <> -Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { - return emulate(a, b, [](int64_t a_point, int64_t b_point) {return std::min(a_point, b_point);}); -} - -template <> -Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { - return _mm256_min_epi32(a, b); -} - -template <> -Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { - return _mm256_min_epi16(a, b); -} - -template <> -Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { - return _mm256_min_epi8(a, b); -} - -template <> -Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { - return _mm256_min_epu8(a, b); -} - -template <> -Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { - return emulate(a, b, [](int64_t a_point, int64_t b_point) {return std::max(a_point, b_point);}); -} - -template <> -Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { - return _mm256_max_epi32(a, b); -} - -template <> -Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { - return _mm256_max_epi16(a, b); -} - -template <> -Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { - return _mm256_max_epi8(a, b); -} - -template <> -Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { - return _mm256_max_epu8(a, b); -} - -template <> -Vectorized inline clamp(const Vectorized& a, const Vectorized& min_val, const Vectorized& max_val) { - return emulate(a, min_val, max_val, [](int64_t a_point, int64_t min_point, int64_t max_point) {return std::min(max_point, std::max(a_point, min_point));}); -} - -template <> -Vectorized inline clamp(const Vectorized& a, const Vectorized& min_val, const Vectorized& max_val) { - return _mm256_min_epi32(max_val, _mm256_max_epi32(a, min_val)); -} - -template <> -Vectorized inline clamp(const Vectorized& a, const Vectorized& min_val, const Vectorized& max_val) { - return _mm256_min_epi16(max_val, _mm256_max_epi16(a, min_val)); -} - -template <> -Vectorized inline clamp(const Vectorized& a, const Vectorized& min_val, const Vectorized& max_val) { - return _mm256_min_epi8(max_val, _mm256_max_epi8(a, min_val)); -} - -template <> -Vectorized inline clamp(const Vectorized& a, const Vectorized& min_val, const Vectorized& max_val) { - return _mm256_min_epu8(max_val, 
_mm256_max_epu8(a, min_val)); -} - -template <> -Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max_val) { - return emulate(a, max_val, [](int64_t a_point, int64_t max_point) {return std::min(max_point, a_point);}); -} - -template <> -Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max_val) { - return _mm256_min_epi32(max_val, a); -} - -template <> -Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max_val) { - return _mm256_min_epi16(max_val, a); -} - -template <> -Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max_val) { - return _mm256_min_epi8(max_val, a); -} - -template <> -Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max_val) { - return _mm256_min_epu8(max_val, a); -} - -template <> -Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min_val) { - return emulate(a, min_val, [](int64_t a_point, int64_t min_point) {return std::max(min_point, a_point);}); -} - -template <> -Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min_val) { - return _mm256_max_epi32(min_val, a); -} - -template <> -Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min_val) { - return _mm256_max_epi16(min_val, a); -} - -template <> -Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min_val) { - return _mm256_max_epi8(min_val, a); -} - -template <> -Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min_val) { - return _mm256_max_epu8(min_val, a); -} - -template -Vectorized inline convert_to_int32(const T* ptr) { - return Vectorized::loadu(ptr); -} - -template<> -Vectorized inline convert_to_int32(const int8_t* ptr) { - return _mm256_cvtepi8_epi32(_mm_loadl_epi64(reinterpret_cast(ptr))); -} - -template<> -Vectorized inline convert_to_int32(const uint8_t* ptr) { - return _mm256_cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast(ptr))); -} - -template <> -Vectorized inline operator/(const Vectorized& a, const Vectorized& b) { - return int_elementwise_binary_256(a, b, std::divides()); -} -template <> -Vectorized inline operator/(const Vectorized& a, const Vectorized& b) { - return int_elementwise_binary_256(a, b, std::divides()); -} -template <> -Vectorized inline operator/(const Vectorized& a, const Vectorized& b) { - return int_elementwise_binary_256(a, b, std::divides()); -} -template <> -Vectorized inline operator/(const Vectorized& a, const Vectorized& b) { - return int_elementwise_binary_256(a, b, std::divides()); -} -template <> -Vectorized inline operator/(const Vectorized& a, const Vectorized& b) { - return int_elementwise_binary_256(a, b, std::divides()); -} - -template>::value, int> = 0> -inline Vectorized operator&(const Vectorized& a, const Vectorized& b) { - return _mm256_and_si256(a, b); -} -template>::value, int> = 0> -inline Vectorized operator|(const Vectorized& a, const Vectorized& b) { - return _mm256_or_si256(a, b); -} -template>::value, int> = 0> -inline Vectorized operator^(const Vectorized& a, const Vectorized& b) { - return _mm256_xor_si256(a, b); -} -template>::value, int> = 0> -inline Vectorized operator~(const Vectorized& a) { - return _mm256_xor_si256(a, _mm256_set1_epi32(-1)); -} - -inline Vectorized Vectorized::eq(const Vectorized& other) const { - return (*this == other) & Vectorized(1); -} - -inline Vectorized Vectorized::ne(const Vectorized& other) const { - return (*this != other) & Vectorized(1); -} - -inline Vectorized Vectorized::gt(const Vectorized& other) const { - return (*this > other) & Vectorized(1); 
-} - -inline Vectorized Vectorized::ge(const Vectorized& other) const { - return (*this >= other) & Vectorized(1); -} - -inline Vectorized Vectorized::lt(const Vectorized& other) const { - return (*this < other) & Vectorized(1); -} - -inline Vectorized Vectorized::le(const Vectorized& other) const { - return (*this <= other) & Vectorized(1); -} - -inline Vectorized Vectorized::eq(const Vectorized& other) const { - return (*this == other) & Vectorized(1); -} - -inline Vectorized Vectorized::ne(const Vectorized& other) const { - return (*this != other) & Vectorized(1); -} - -inline Vectorized Vectorized::gt(const Vectorized& other) const { - return (*this > other) & Vectorized(1); -} - -inline Vectorized Vectorized::ge(const Vectorized& other) const { - return (*this >= other) & Vectorized(1); -} - -inline Vectorized Vectorized::lt(const Vectorized& other) const { - return (*this < other) & Vectorized(1); -} - -inline Vectorized Vectorized::le(const Vectorized& other) const { - return (*this <= other) & Vectorized(1); -} - -inline Vectorized Vectorized::eq(const Vectorized& other) const { - return (*this == other) & Vectorized(1); -} - -inline Vectorized Vectorized::ne(const Vectorized& other) const { - return (*this != other) & Vectorized(1); -} - -inline Vectorized Vectorized::gt(const Vectorized& other) const { - return (*this > other) & Vectorized(1); -} - -inline Vectorized Vectorized::ge(const Vectorized& other) const { - return (*this >= other) & Vectorized(1); -} - -inline Vectorized Vectorized::lt(const Vectorized& other) const { - return (*this < other) & Vectorized(1); -} - -inline Vectorized Vectorized::le(const Vectorized& other) const { - return (*this <= other) & Vectorized(1); -} - -inline Vectorized Vectorized::eq(const Vectorized& other) const { - return (*this == other) & Vectorized(1); -} - -inline Vectorized Vectorized::ne(const Vectorized& other) const { - return (*this != other) & Vectorized(1); -} - -inline Vectorized Vectorized::gt(const Vectorized& other) const { - return (*this > other) & Vectorized(1); -} - -inline Vectorized Vectorized::ge(const Vectorized& other) const { - return (*this >= other) & Vectorized(1); -} - -inline Vectorized Vectorized::lt(const Vectorized& other) const { - return (*this < other) & Vectorized(1); -} - -inline Vectorized Vectorized::le(const Vectorized& other) const { - return (*this <= other) & Vectorized(1); -} - -inline Vectorized Vectorized::eq(const Vectorized& other) const { - return (*this == other) & Vectorized(1); -} - -inline Vectorized Vectorized::ne(const Vectorized& other) const { - return (*this != other) & Vectorized(1); -} - -inline Vectorized Vectorized::gt(const Vectorized& other) const { - return (*this > other) & Vectorized(1); -} - -inline Vectorized Vectorized::ge(const Vectorized& other) const { - return (*this >= other) & Vectorized(1); -} - -inline Vectorized Vectorized::lt(const Vectorized& other) const { - return (*this < other) & Vectorized(1); -} - -inline Vectorized Vectorized::le(const Vectorized& other) const { - return (*this <= other) & Vectorized(1); -} - -template -Vectorized inline shift_256_16(const Vectorized& a, const Vectorized& b) { - // No vector instruction for shifting int16_t, so emulating it instead. - - // Control masks for shuffle operation, treating 256 bits as an - // array of 16-bit elements, and considering pairs of neighboring - // elements. 
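A sketch of the eq()/gt()/... pattern above: a compare intrinsic yields an all-ones/all-zeros mask per lane, and AND-ing that mask with a broadcast 1 turns it into a numeric 0/1 vector.

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
  __m256i a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
  __m256i b = _mm256_set1_epi32(4);
  __m256i mask = _mm256_cmpgt_epi32(a, b);                      // 0xFFFFFFFF where a > b
  __m256i gt01 = _mm256_and_si256(mask, _mm256_set1_epi32(1));  // 0 or 1 per lane
  alignas(32) int32_t out[8];
  _mm256_store_si256(reinterpret_cast<__m256i*>(out), gt01);
  for (int i = 0; i < 8; ++i) std::printf("%d ", out[i]);  // 0 0 0 0 1 1 1 1
}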
Specifially, a mask named "ctl_M_N" (M,N in [0,1], and - // M!=N) is set so that shuffle will move element with index M from - // input pair into element with index N in output pair, and element - // with index M in output pair will be set to all 0s. - __m256i ctl_0_1 = _mm256_set_epi8(29, 28, 0x80, 0x80, 25, 24, 0x80, 0x80, - 21, 20, 0x80, 0x80, 17, 16, 0x80, 0x80, - 13, 12, 0x80, 0x80, 9, 8, 0x80, 0x80, - 5, 4, 0x80, 0x80, 1, 0, 0x80, 0x80); - __m256i ctl_1_0 = _mm256_set_epi8(0x80, 0x80, 31, 30, 0x80, 0x80, 27, 26, - 0x80, 0x80, 23, 22, 0x80, 0x80, 19, 18, - 0x80, 0x80, 15, 14, 0x80, 0x80, 11, 10, - 0x80, 0x80, 7, 6, 0x80, 0x80, 3, 2); - - // Masks for bitwise and operation, treating 256 bits as an array of - // 16-bit elements, and considering them in pairs of neighboring - // elements. A mask named "keep_M" (M in [0,1]) is set so that - // bitwise and will copy element with index M from input pair into - // element with the same index in output pair, while the other - // element in output pair will be set to all 0s. - __m256i keep_0 = _mm256_set1_epi32(0xFFFF); - __m256i keep_1 = _mm256_set1_epi32(0xFFFF0000); - - // Take each 16-bit element with idx%2==0 from input array to be - // shifted and extend it to 32 bits so that 0s are added to the - // right. Then, perform shifting on this 32-bit number. Upper 16 - // bits will be proper result of shifting original 16-bit number, so - // write them to result array, into the same position from which - // corresponding input element is taken. Also, make sure that - // result array elements with idx%2!=0 are set to all 0s. - // - // Note that number of bits to shift for is extended to 32 bits by - // adding 0s to the left. That means this number is not properly - // sign-extended for negative values. However, number of bits to - // shift is treated as an unsigned integer by respective shift - // intrinsics anyway so if negative then either with or without - // proper sign extension, it will be interpreted as a number greater - // than 32, and the shifting result will be the same. - __m256i a0 = _mm256_shuffle_epi8(a, ctl_0_1); - __m256i b0 = _mm256_and_si256(b, keep_0); - __m256i c0; - if (left_shift) - c0 = _mm256_sllv_epi32(a0, b0); - else - c0 = _mm256_srav_epi32(a0, b0); - c0 = _mm256_shuffle_epi8(c0, ctl_1_0); - - // Peform shifting the same way for input array elements with - // idx%2==1. - __m256i a1 = _mm256_and_si256(a, keep_1); - __m256i b1 = _mm256_shuffle_epi8(b, ctl_1_0); - __m256i c1; - if (left_shift) - c1 = _mm256_sllv_epi32(a1, b1); - else - c1 = _mm256_srav_epi32(a1, b1); - c1 = _mm256_and_si256(c1, keep_1); - - // Merge partial results into the final result. - __m256i c = _mm256_or_si256(c0, c1); - - return c; -} - -template ::value || std::is_same::value, int> = 0> -Vectorized inline shift_256_8(const Vectorized& a, const Vectorized& b) { - // No vector instruction for shifting int8_t/uint8_t, so emulating - // it instead. - - // Control masks for shuffle operation, treating 256 bits as an - // array of 8-bit elements, and considering quadruples of - // neighboring elements. Specifially, a mask named "ctl_M_N" (M,N - // in [0,1,2,3], and M!=N) is set so that shuffle will move element - // with index M from input quadruple into element with index N in - // output quadruple, and other elements in output quadruple will be - // set to all 0s. 
- __m256i ctl_0_3 = _mm256_set_epi8(28, 0x80, 0x80, 0x80, 24, 0x80, 0x80, 0x80, - 20, 0x80, 0x80, 0x80, 16, 0x80, 0x80, 0x80, - 12, 0x80, 0x80, 0x80, 8, 0x80, 0x80, 0x80, - 4, 0x80, 0x80, 0x80, 0, 0x80, 0x80, 0x80); - __m256i ctl_1_0 = _mm256_set_epi8(0x80, 0x80, 0x80, 29, 0x80, 0x80, 0x80, 25, - 0x80, 0x80, 0x80, 21, 0x80, 0x80, 0x80, 17, - 0x80, 0x80, 0x80, 13, 0x80, 0x80, 0x80, 9, - 0x80, 0x80, 0x80, 5, 0x80, 0x80, 0x80, 1); - __m256i ctl_1_3 = _mm256_set_epi8(29, 0x80, 0x80, 0x80, 25, 0x80, 0x80, 0x80, - 21, 0x80, 0x80, 0x80, 17, 0x80, 0x80, 0x80, - 13, 0x80, 0x80, 0x80, 9, 0x80, 0x80, 0x80, - 5, 0x80, 0x80, 0x80, 1, 0x80, 0x80, 0x80); - __m256i ctl_2_0 = _mm256_set_epi8(0x80, 0x80, 0x80, 30, 0x80, 0x80, 0x80, 26, - 0x80, 0x80, 0x80, 22, 0x80, 0x80, 0x80, 18, - 0x80, 0x80, 0x80, 14, 0x80, 0x80, 0x80, 10, - 0x80, 0x80, 0x80, 6, 0x80, 0x80, 0x80, 2); - __m256i ctl_2_3 = _mm256_set_epi8(30, 0x80, 0x80, 0x80, 26, 0x80, 0x80, 0x80, - 22, 0x80, 0x80, 0x80, 18, 0x80, 0x80, 0x80, - 14, 0x80, 0x80, 0x80, 10, 0x80, 0x80, 0x80, - 6, 0x80, 0x80, 0x80, 2, 0x80, 0x80, 0x80); - __m256i ctl_3_0 = _mm256_set_epi8(0x80, 0x80, 0x80, 31, 0x80, 0x80, 0x80, 27, - 0x80, 0x80, 0x80, 23, 0x80, 0x80, 0x80, 19, - 0x80, 0x80, 0x80, 15, 0x80, 0x80, 0x80, 11, - 0x80, 0x80, 0x80, 7, 0x80, 0x80, 0x80, 3); - __m256i ctl_3_1 = _mm256_set_epi8(0x80, 0x80, 31, 0x80, 0x80, 0x80, 27, 0x80, - 0x80, 0x80, 23, 0x80, 0x80, 0x80, 19, 0x80, - 0x80, 0x80, 15, 0x80, 0x80, 0x80, 11, 0x80, - 0x80, 0x80, 7, 0x80, 0x80, 0x80, 3, 0x80); - __m256i ctl_3_2 = _mm256_set_epi8(0x80, 31, 0x80, 0x80, 0x80, 27, 0x80, 0x80, - 0x80, 23, 0x80, 0x80, 0x80, 19, 0x80, 0x80, - 0x80, 15, 0x80, 0x80, 0x80, 11, 0x80, 0x80, - 0x80, 7, 0x80, 0x80, 0x80, 3, 0x80, 0x80); - - // Masks for bitwise and operation, treating 256 bits as an array of - // 8-bit elements, and considering them in quadruples of neighboring - // elements. A mask named "keep_M" (M in [0,1,2,3]) is set so that - // bitwise and will copy element with index M from input quadruple - // into element with the same index in output quadruple, while the - // other elements in output quadruple will be set to all 0s. - __m256i keep_0 = _mm256_set1_epi32(0xFF); - __m256i keep_3 = _mm256_set1_epi32(0xFF000000); - - // Take each 8-bit element with idx%4==0 from input array to be - // shifted and extend it to 32 bits so that 0s are added to the - // right. Then, perform shifting on this 32-bit number. Upper 8 - // bits will be proper result of shifting original 8-bit number, so - // write them to result array, into the same position from which - // corresponding input element is taken. Also, make sure that - // result array elements with idx%4!=0 are set to all 0s. - // - // Note that number of bits to shift for is extended to 32 bits by - // adding 0s to the left. That means this number is not properly - // sign-extended for negative values. However, number of bits to - // shift is treated as an unsigned integer by respective shift - // intrinsics anyway so if negative then either with or without - // proper sign extension, it will be interpreted as a number greater - // than 32, and the shifting result will be the same. 
- __m256i a0 = _mm256_shuffle_epi8(a, ctl_0_3); - __m256i b0 = _mm256_and_si256(b, keep_0); - __m256i c0; - if (left_shift) - c0 = _mm256_sllv_epi32(a0, b0); - else - if (std::is_same::value) - c0 = _mm256_srav_epi32(a0, b0); - else - c0 = _mm256_srlv_epi32(a0, b0); - c0 = _mm256_shuffle_epi8(c0, ctl_3_0); - - // Peform shifting the same way for input array elements with - // idx%4==1. - __m256i a1 = _mm256_shuffle_epi8(a, ctl_1_3); - __m256i b1 = _mm256_shuffle_epi8(b, ctl_1_0); - __m256i c1; - if (left_shift) - c1 = _mm256_sllv_epi32(a1, b1); - else - if (std::is_same::value) - c1 = _mm256_srav_epi32(a1, b1); - else - c1 = _mm256_srlv_epi32(a1, b1); - c1 = _mm256_shuffle_epi8(c1, ctl_3_1); - - // Peform shifting the same way for input array elements with - // idx%4==2. - __m256i a2 = _mm256_shuffle_epi8(a, ctl_2_3); - __m256i b2 = _mm256_shuffle_epi8(b, ctl_2_0); - __m256i c2; - if (left_shift) - c2 = _mm256_sllv_epi32(a2, b2); - else - if (std::is_same::value) - c2 = _mm256_srav_epi32(a2, b2); - else - c2 = _mm256_srlv_epi32(a2, b2); - c2 = _mm256_shuffle_epi8(c2, ctl_3_2); - - // Peform shifting the same way for input array elements with - // idx%4==3. - __m256i a3 = _mm256_and_si256(a, keep_3); - __m256i b3 = _mm256_shuffle_epi8(b, ctl_3_0); - __m256i c3; - if (left_shift) - c3 = _mm256_sllv_epi32(a3, b3); - else - if (std::is_same::value) - c3 = _mm256_srav_epi32(a3, b3); - else - c3 = _mm256_srlv_epi32(a3, b3); - c3 = _mm256_and_si256(c3, keep_3); - - // Merge partial results into the final result. - __m256i c01 = _mm256_or_si256(c0, c1); - __m256i c23 = _mm256_or_si256(c2, c3); - __m256i c = _mm256_or_si256(c01, c23); - - return c; -} - -template <> -Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) { - return _mm256_sllv_epi64(a, b); -} - -template <> -Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) { - return _mm256_sllv_epi32(a, b); -} - -template <> -Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) { - return shift_256_16(a, b); -} - -template <> -Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) { - return shift_256_8(a, b); -} - -template <> -Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) { - return shift_256_8(a, b); -} - -template <> -Vectorized inline operator>>(const Vectorized& a, const Vectorized& b) { - // No vector instruction for right arithmetic shifting int64_t, so emulating it - // instead. - - // Clamp the shift values such that shift values < 0 and > 64 are changed to 64 - // which results in -1 for negative input and 0 for non-negative input. - __m256i zero = _mm256_set1_epi64x(0); - __m256i max_shift = _mm256_set1_epi64x(64); - __m256i mask = _mm256_or_si256(_mm256_cmpgt_epi64(zero, b), _mm256_cmpgt_epi64(b, max_shift)); - __m256i shift = _mm256_blendv_epi8(b, max_shift, mask); - // Shift the number logically to the right, thus filling the most - // significant bits with 0s. Then, replace these bits with the sign - // bit. 
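This is not the shuffle-based routine above, but a simpler mask-based variant of the same idea, shown only to illustrate why 16-bit variable shifts need emulation on AVX2: each 16-bit lane is shifted inside its 32-bit container with _mm256_sllv_epi32 and the two halves are recombined. Left shift only, helper name mine.

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

__m256i sllv_epi16_emulated(__m256i a, __m256i b) {
  const __m256i low_mask  = _mm256_set1_epi32(0x0000FFFF);
  const __m256i high_mask = _mm256_set1_epi32(static_cast<int>(0xFFFF0000));
  // Even (low) 16-bit lanes: shift inside the low half, then drop the overflow.
  __m256i c_even = _mm256_sllv_epi32(_mm256_and_si256(a, low_mask),
                                     _mm256_and_si256(b, low_mask));
  c_even = _mm256_and_si256(c_even, low_mask);
  // Odd (high) 16-bit lanes: bits shifted past bit 31 fall off, which matches
  // a 16-bit shift; zeros fill in from below.
  __m256i c_odd = _mm256_sllv_epi32(_mm256_and_si256(a, high_mask),
                                    _mm256_srli_epi32(b, 16));
  c_odd = _mm256_and_si256(c_odd, high_mask);
  return _mm256_or_si256(c_even, c_odd);
}

int main() {
  __m256i a = _mm256_set1_epi16(3);
  __m256i b = _mm256_set1_epi16(4);
  alignas(32) int16_t out[16];
  _mm256_store_si256(reinterpret_cast<__m256i*>(out), sllv_epi16_emulated(a, b));
  std::printf("%d\n", out[0]);  // 48
}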
- __m256i sign_bits = _mm256_cmpgt_epi64(zero, a); - __m256i sign_shift = _mm256_sub_epi64(max_shift, shift); - __m256i sign_ext = _mm256_sllv_epi64(sign_bits, sign_shift); - __m256i c = _mm256_srlv_epi64(a, shift); - c = _mm256_or_si256(c, sign_ext); - - return c; -} - -template <> -Vectorized inline operator>>(const Vectorized& a, const Vectorized& b) { - return _mm256_srav_epi32(a, b); -} - -template <> -Vectorized inline operator>>(const Vectorized& a, const Vectorized& b) { - return shift_256_16(a, b); -} - -template <> -Vectorized inline operator>>(const Vectorized& a, const Vectorized& b) { - return shift_256_8(a, b); -} - -template <> -Vectorized inline operator>>(const Vectorized& a, const Vectorized& b) { - return shift_256_8(a, b); -} - -#endif - -}}} diff --git a/kernels/optimized/vec/vec_base.h b/kernels/optimized/vec/vec_base.h deleted file mode 100644 index 5ff4327e6f..0000000000 --- a/kernels/optimized/vec/vec_base.h +++ /dev/null @@ -1,886 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -// These macros helped us unify vec_base.h -#ifdef CPU_CAPABILITY_AVX512 -#if defined(__GNUC__) -#define __at_align__ __attribute__((aligned(64))) -#elif defined(_WIN32) -#define __at_align__ __declspec(align(64)) -#else -#define __at_align__ -#endif -#define VECTOR_WIDTH 64 -#define int_vector __m512i -#else // CPU_CAPABILITY_AVX512 -#if defined(__GNUC__) -#define __at_align__ __attribute__((aligned(32))) -#elif defined(_WIN32) -#define __at_align__ __declspec(align(32)) -#else -#define __at_align__ -#endif -#define VECTOR_WIDTH 32 -#define int_vector __m256i -#endif // CPU_CAPABILITY_AVX512 - -namespace executorch { -namespace vec { - -// See Note [CPU_CAPABILITY namespace] -inline namespace CPU_CAPABILITY { - -template struct int_of_size; - -#define DEFINE_INT_OF_SIZE(int_t) \ -template<> struct int_of_size { using type = int_t; } - -DEFINE_INT_OF_SIZE(int64_t); -DEFINE_INT_OF_SIZE(int32_t); -DEFINE_INT_OF_SIZE(int16_t); -DEFINE_INT_OF_SIZE(int8_t); - -#undef DEFINE_INT_OF_SIZE - -template -using int_same_size_t = typename int_of_size::type; - -// NOTE: If you specialize on a type, you must define all operations! - -// emulates Vectorized types -#if defined(__s390x__) -template -#else -template -#endif -struct Vectorized { -private: - __at_align__ T values[VECTOR_WIDTH / sizeof(T)]; -public: - using value_type = T; - using size_type = int; - // Note [constexpr static function to avoid odr-usage compiler bug] - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // Why, you might ask, is size defined to be a static constexpr function, - // rather than a more ordinary 'static constexpr int size;' variable? - // The problem lies within ODR rules for static constexpr members versus - // static constexpr functions. First, recall that this class (along with all - // of its derivations) live in an anonymous namespace: they are intended to be - // *completely* inlined at their use-sites, because we need to compile it - // multiple times for different instruction sets. - // - // Because of this constraint, we CANNOT provide a single definition for - // any static members in this class; since we want to compile the class - // multiple times, there wouldn't actually be any good place to put the - // definition. Now here is the problem: if we ODR-use a static constexpr - // member, we are *obligated* to provide a definition. 
Without the - // definition, you get a compile error like: - // - // relocation R_X86_64_PC32 against undefined symbol - // `_ZN2at6vec25612_GLOBAL__N_16VectorizedIdE4sizeE' can not be used when making - // a shared object; recompile with -fPIC - // - // If this were C++17, we could replace a static constexpr variable with - // an inline variable which doesn't require one definition. But we are not - // C++17. So the next best thing is to replace the member with a static - // constexpr (and therefore inline) function, which does not require ODR - // either. - // - // Also, technically according to the C++ standard, we don't have to define - // a constexpr variable if we never odr-use it. But it seems that some - // versions GCC/Clang have buggy determinations on whether or not an - // identifier is odr-used or not, and in any case it's hard to tell if - // a variable is odr-used or not. So best to just cut the problem at the root. - static constexpr size_type size_T = sizeof(T); // Workaround to compile with VS2022. - static constexpr size_type size() { - return VECTOR_WIDTH / size_T; - } - Vectorized() : values{static_cast(0)} {} - Vectorized(T val) { - for (size_t i = 0; i != size(); i++) { - values[i] = val; - } - } - template> - Vectorized(Args... vals) : values{vals...}{ - } - // This also implies const T& operator[](int idx) const - inline operator const T*() const { - return values; - } - // This also implies T& operator[](int idx) - inline operator T*() { - return values; - } - // Return the values as char* for type punning - auto as_bytes() const -> const char* { - return reinterpret_cast(values); - } - template - static Vectorized blend(const Vectorized& a, const Vectorized& b) { - int64_t mask = mask_; - Vectorized vector; - for (size_t i = 0; i < size(); ++i) { - if (mask & 0x01) { - vector[i] = b[i]; - } else { - vector[i] = a[i]; - } - mask = mask >> 1; - } - return vector; - } - static Vectorized blendv(const Vectorized& a, const Vectorized& b, - const Vectorized& mask) { - Vectorized vector; - int_same_size_t buffer[size()]; - mask.store(buffer); - for (size_t i = 0; i < size(); ++i) { - if (buffer[i] & 0x01) - { - vector[i] = b[i]; - } else { - vector[i] = a[i]; - } - } - return vector; - } - template // step sometimes requires a higher precision type (e.g., T=int, step_t=double) - static Vectorized arange(T base = static_cast(0), step_t step = static_cast(1)) { - Vectorized vector; - for (size_t i = 0; i < size(); ++i) { - vector.values[i] = base + i * step; - } - return vector; - } - static Vectorized set(const Vectorized& a, const Vectorized& b, int64_t count = size()) { - Vectorized vector; - for (size_t i = 0; i < size(); ++i) { - if (i < count) { - vector[i] = b[i]; - } else { - vector[i] = a[i]; - } - } - return vector; - } - static Vectorized loadu(const void* ptr) { - Vectorized vector; - std::memcpy(vector.values, ptr, VECTOR_WIDTH); - return vector; - } - static Vectorized loadu(const void* ptr, int64_t count) { - Vectorized vector; - std::memcpy(vector.values, ptr, count * sizeof(T)); - return vector; - } - void store(void* ptr, int count = size()) const { - std::memcpy(ptr, values, count * sizeof(T)); - } - int zero_mask() const { - // returns an integer mask where all zero elements are translated to 1-bit and others are translated to 0-bit - int mask = 0; - for (size_t i = 0; i < size(); ++ i) { - if (values[i] == static_cast(0)) { - mask |= (1 << i); - } - } - return mask; - } - Vectorized isnan() const { - Vectorized vector; - for (size_t i = 0; i != 
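A scalar sketch of the generic blend<mask>() semantics defined above: bit i of the compile-time mask selects b[i] over a[i]. Plain arrays stand in for the Vectorized wrapper; the function name is mine.

#include <cstdint>
#include <cstdio>

template <int64_t mask, typename T, int N>
void blend(const T (&a)[N], const T (&b)[N], T (&out)[N]) {
  int64_t m = mask;
  for (int i = 0; i < N; ++i) {
    out[i] = (m & 0x1) ? b[i] : a[i];  // bit i set -> take b[i]
    m >>= 1;
  }
}

int main() {
  float a[4] = {0, 0, 0, 0}, b[4] = {1, 1, 1, 1}, out[4];
  blend<0b0101>(a, b, out);
  std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 1 0 1 0
}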
size(); i++) { - if (std::isnan(values[i])) { - std::memset(static_cast(vector.values + i), 0xFF, sizeof(T)); - } else { - std::memset(static_cast(vector.values + i), 0, sizeof(T)); - } - } - return vector; - } - Vectorized map(T (*const f)(T)) const { - Vectorized ret; - for (size_t i = 0; i != size(); i++) { - ret[i] = f(values[i]); - } - return ret; - } - Vectorized map(T (*const f)(const T &)) const { - Vectorized ret; - for (size_t i = 0; i != size(); i++) { - ret[i] = f(values[i]); - } - return ret; - } - template ::value, int>::type = 0> - Vectorized abs() const { - // other_t_abs is for SFINAE and clarity. Make sure it is not changed. - static_assert(std::is_same::value, "other_t_abs must be T"); - return map([](T x) -> T { return x < static_cast(0) ? -x : x; }); - } - template ::value, int>::type = 0> - Vectorized abs() const { - // float_t_abs is for SFINAE and clarity. Make sure it is not changed. - static_assert(std::is_same::value, "float_t_abs must be T"); - // Specifically deal with floating-point because the generic code above won't handle -0.0 (which should result in - // 0.0) properly. - return map([](T x) -> T { return std::abs(x); }); - } - - Vectorized acos() const { - return map(std::acos); - } - Vectorized asin() const { - return map(std::asin); - } - Vectorized atan() const { - return map(std::atan); - } - Vectorized atan2(const Vectorized &exp) const { - Vectorized ret; - for (size_t i = 0; i < size(); ++i) { - ret[i] = std::atan2(values[i], exp[i]); - } - return ret; - } - template < - typename U = T, - typename std::enable_if_t::value, int> = 0> - Vectorized copysign(const Vectorized &sign) const { - Vectorized ret; - for (size_t i = 0; i < size(); i++) { - ret[i] = std::copysign(values[i], sign[i]); - } - return ret; - } - Vectorized erf() const { - return map(std::erf); - } - Vectorized erfc() const { - return map(std::erfc); - } - Vectorized exp() const { - return map(std::exp); - } - Vectorized exp2() const { - return map(std::exp2); - } - Vectorized expm1() const { - return map(std::expm1); - } - Vectorized frac() const { - return *this - this->trunc(); - } - template < - typename U = T, - typename std::enable_if_t::value, int> = 0> - Vectorized fmod(const Vectorized& q) const { - // U is for SFINAE purposes only. Make sure it is not changed. 
- static_assert(std::is_same::value, "U must be T"); - Vectorized ret; - for (size_t i = 0; i < size(); ++i) { - ret[i] = std::fmod(values[i], q[i]); - } - return ret; - } - Vectorized log() const { - return map(std::log); - } - Vectorized log10() const { - return map(std::log10); - } - Vectorized log1p() const { - return map(std::log1p); - } - Vectorized log2() const { - return map(std::log2); - } - Vectorized ceil() const { - return map(std::ceil); - } - Vectorized cos() const { - return map(std::cos); - } - Vectorized cosh() const { - return map(std::cosh); - } - Vectorized floor() const { - return map(std::floor); - } - Vectorized hypot(const Vectorized &b) const { - Vectorized ret; - for (size_t i = 0; i < size(); ++i) { - ret[i] = std::hypot(values[i], b[i]); - } - return ret; - } - Vectorized neg() const { - // NB: the trailing return type is needed because we need to coerce the - // return value back to T in the case of unary operator- incuring a - // promotion - return map([](T x) -> T { return -x; }); - } - Vectorized nextafter(const Vectorized &b) const { - Vectorized ret; - for (size_t i = 0; i < size(); ++i) { - ret[i] = std::nextafter(values[i], b[i]); - } - return ret; - } - Vectorized round() const { - // TODO(T149257433): implement custom round that rounds midway numbers to - // the nearest even integer. - return map(std::round); - } - Vectorized sin() const { - return map(std::sin); - } - Vectorized sinh() const { - return map(std::sinh); - } - Vectorized tan() const { - return map(std::tan); - } - Vectorized tanh() const { - return map(std::tanh); - } - Vectorized trunc() const { - return map(std::trunc); - } - Vectorized lgamma() const { - return map(std::lgamma); - } - Vectorized sqrt() const { - return map(std::sqrt); - } - Vectorized reciprocal() const { - return map([](T x) { return (T)(1) / x; }); - } - Vectorized rsqrt() const { - return map([](T x) { return (T)1 / std::sqrt(x); }); - } - Vectorized pow(const Vectorized &exp) const { - Vectorized ret; - for (size_t i = 0; i < size(); ++i) { - ret[i] = std::pow(values[i], exp[i]); - } - return ret; - } -private: - template - inline Vectorized binary_pred(const Vectorized& other, Op op) const { - // All bits are set to 1 if the pred is true, otherwise 0. - Vectorized vector; - for (size_t i = 0; i != size(); i++) { - if (op(values[i], other.values[i])) { - std::memset(static_cast(vector.values + i), 0xFF, sizeof(T)); - } else { - std::memset(static_cast(vector.values + i), 0, sizeof(T)); - } - } - return vector; - } - -public: - Vectorized operator==(const Vectorized& other) const { return binary_pred(other, std::equal_to()); } - Vectorized operator!=(const Vectorized& other) const { return binary_pred(other, std::not_equal_to()); } - Vectorized operator>=(const Vectorized& other) const { return binary_pred(other, std::greater_equal()); } - Vectorized operator<=(const Vectorized& other) const { return binary_pred(other, std::less_equal()); } - Vectorized operator>(const Vectorized& other) const { return binary_pred(other, std::greater()); } - Vectorized operator<(const Vectorized& other) const { return binary_pred(other, std::less()); } - -private: - template - inline Vectorized binary_pred_bool(const Vectorized& other, Op op) const { - // 1 if the pred is true, otherwise 0. 
- Vectorized vector; - for (size_t i = 0; i != size(); ++ i) { - vector[i] = static_cast(op(values[i], other.values[i])); - } - return vector; - } - -public: - Vectorized eq(const Vectorized& other) const { return binary_pred_bool(other, std::equal_to()); } - Vectorized ne(const Vectorized& other) const { return binary_pred_bool(other, std::not_equal_to()); } - Vectorized gt(const Vectorized& other) const { return binary_pred_bool(other, std::greater()); } - Vectorized ge(const Vectorized& other) const { return binary_pred_bool(other, std::greater_equal()); } - Vectorized lt(const Vectorized& other) const { return binary_pred_bool(other, std::less()); } - Vectorized le(const Vectorized& other) const { return binary_pred_bool(other, std::less_equal()); } -}; - -template Vectorized inline operator+(const Vectorized &a, const Vectorized &b) { - Vectorized c; - for (size_t i = 0; i != Vectorized::size(); i++) { - c[i] = a[i] + b[i]; - } - return c; -} - -template Vectorized inline operator-(const Vectorized &a, const Vectorized &b) { - Vectorized c; - for (size_t i = 0; i != Vectorized::size(); i++) { - c[i] = a[i] - b[i]; - } - return c; -} - -template Vectorized inline operator*(const Vectorized &a, const Vectorized &b) { - Vectorized c; - for (size_t i = 0; i != Vectorized::size(); i++) { - c[i] = a[i] * b[i]; - } - return c; -} - -template Vectorized inline operator/(const Vectorized &a, const Vectorized &b) { - Vectorized c; - for (size_t i = 0; i != Vectorized::size(); i++) { - c[i] = a[i] / b[i]; - } - return c; -} - -template Vectorized inline operator||( - const Vectorized &a, const Vectorized &b) { - Vectorized c; - for (size_t i = 0; i != Vectorized::size(); i++) { - c[i] = a[i] || b[i]; - } - return c; -} - -// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if -// either input is a NaN. -template -Vectorized inline maximum(const Vectorized &a, const Vectorized &b) { - Vectorized c; - for (size_t i = 0; i != Vectorized::size(); i++) { - c[i] = (a[i] > b[i]) ? a[i] : b[i]; - if (std::isnan(a[i])) { - // If either input is NaN, propagate a NaN. - // NOTE: The case where b[i] was NaN is handled correctly by the naive - // ternary operator above. - c[i] = a[i]; - } - } - return c; -} - -// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if -// either input is a NaN. -template -Vectorized inline minimum(const Vectorized &a, const Vectorized &b) { - Vectorized c; - for (size_t i = 0; i != Vectorized::size(); i++) { - c[i] = (a[i] < b[i]) ? a[i] : b[i]; - if (std::isnan(a[i])) { - // If either input is NaN, propagate a NaN. - // NOTE: The case where b[i] was NaN is handled correctly by the naive - // ternary operator above. - c[i] = a[i]; - } - } - return c; -} - -template -Vectorized inline clamp(const Vectorized &a, const Vectorized &min_vec, const Vectorized &max_vec) { - Vectorized c; - for (size_t i = 0; i != Vectorized::size(); i++) { - c[i] = std::min(std::max(a[i], min_vec[i]), max_vec[i]); - } - return c; -} - -template -Vectorized inline clamp_max(const Vectorized &a, const Vectorized &max_vec) { - Vectorized c; - for (size_t i = 0; i != Vectorized::size(); i++) { - c[i] = a[i] > max_vec[i] ? max_vec[i] : a[i]; - } - return c; -} - -template -Vectorized inline clamp_min(const Vectorized &a, const Vectorized &min_vec) { - Vectorized c; - for (size_t i = 0; i != Vectorized::size(); i++) { - c[i] = a[i] < min_vec[i] ? 
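A scalar sketch of the NaN-propagating maximum above: if either operand is NaN the result is NaN, unlike std::max, which would silently pick one of the operands.

#include <cmath>
#include <cstdio>

float ieee_maximum(float a, float b) {
  float c = (a > b) ? a : b;  // already picks b when a is NaN (comparison is false)
  if (std::isnan(a)) {
    c = a;                    // force NaN through; b-is-NaN is handled above
  }
  return c;
}

int main() {
  std::printf("%f\n", ieee_maximum(1.0f, 2.0f));           // 2.0
  std::printf("%f\n", ieee_maximum(std::nanf(""), 2.0f));  // nan
}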
min_vec[i] : a[i]; - } - return c; -} - -struct Vectorizedi; - -#if defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512) -template -static inline Vectorized bitwise_binary_op(const Vectorized &a, const Vectorized &b, Op op) { - int_vector buffer; -#if defined(CPU_CAPABILITY_AVX2) - int_vector a_buffer = _mm256_load_si256(reinterpret_cast((const T*)a)); - int_vector b_buffer = _mm256_load_si256(reinterpret_cast((const T*)b)); -#elif defined(CPU_CAPABILITY_AVX512) - int_vector a_buffer = _mm512_load_si512(reinterpret_cast((const T*)a)); - int_vector b_buffer = _mm512_load_si512(reinterpret_cast((const T*)b)); -#endif - buffer = op(a_buffer, b_buffer); - __at_align__ T results[Vectorized::size()]; - -#if defined(CPU_CAPABILITY_AVX2) - _mm256_store_si256(reinterpret_cast(results), buffer); -#elif defined(CPU_CAPABILITY_AVX512) - _mm512_store_si512(reinterpret_cast(results), buffer); -#endif - return Vectorized::loadu(results); -} - -template>::value, int> = 0> -inline Vectorized operator&(const Vectorized& a, const Vectorized& b) { - // We enclose _mm512_and_si512 or _mm256_and_si256 with lambda because it is always_inline -#if defined(CPU_CAPABILITY_AVX2) - return bitwise_binary_op(a, b, [](int_vector a, int_vector b) { return _mm256_and_si256(a, b); }); -#elif defined(CPU_CAPABILITY_AVX512) - return bitwise_binary_op(a, b, [](int_vector a, int_vector b) { return _mm512_and_si512(a, b); }); -#endif -} -template>::value, int> = 0> -inline Vectorized operator|(const Vectorized& a, const Vectorized& b) { - // We enclose _mm512_or_si512 or _mm256_or_si256 with lambda because it is always_inline -#if defined(CPU_CAPABILITY_AVX2) - return bitwise_binary_op(a, b, [](int_vector a, int_vector b) { return _mm256_or_si256(a, b); }); -#elif defined(CPU_CAPABILITY_AVX512) - return bitwise_binary_op(a, b, [](int_vector a, int_vector b) { return _mm512_or_si512(a, b); }); -#endif -} -template>::value, int> = 0> -inline Vectorized operator^(const Vectorized& a, const Vectorized& b) { - // We enclose _mm512_xor_si512 or _mm256_xor_si256 with lambda because it is always_inline -#if defined(CPU_CAPABILITY_AVX2) - return bitwise_binary_op(a, b, [](int_vector a, int_vector b) { return _mm256_xor_si256(a, b); }); -#elif defined(CPU_CAPABILITY_AVX512) - return bitwise_binary_op(a, b, [](int_vector a, int_vector b) { return _mm512_xor_si512(a, b); }); -#endif -} - -#else - -template -auto load(char const* data) -> T { - T ret; - std::memcpy(&ret, data, sizeof(ret)); - return ret; -} - -template -static inline Vectorized bitwise_binary_op(const Vectorized &a, const Vectorized &b, Op op) { - static constexpr uint32_t element_no = VECTOR_WIDTH / sizeof(intmax_t); - __at_align__ intmax_t buffer[element_no]; - static_assert(VECTOR_WIDTH % sizeof(intmax_t) == 0, "VECTOR_WIDTH not a multiple of sizeof(intmax_t)"); - static_assert(sizeof(buffer) == sizeof(Vectorized), "sizeof(buffer) must match sizeof(Vectorized)"); - // We should be using memcpy in order to respect the strict aliasing rule - // see: https://github.com/pytorch/pytorch/issues/66119 - // Using char* is defined in the C11 standard 6.5 Expression paragraph 7 - // (http://www.open-std.org/jtc1/sc22/wg14/www/docs/n1570.pdf) - const auto* a_data = a.as_bytes(); - const auto* b_data = b.as_bytes(); - // load each intmax_t chunk and process; increase pointers by sizeof(intmax_t) - for (auto& out : buffer) { - out = op(load(a_data), load(b_data)); - a_data += sizeof(intmax_t); - b_data += sizeof(intmax_t); - } - assert(a_data == a.as_bytes() + sizeof(a)); - 
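A minimal sketch of the memcpy-based bitwise fallback above, reduced to a single float: copying the bytes into an integer, operating, and copying back respects the strict-aliasing rule while still doing a raw bitwise AND.

#include <cstdint>
#include <cstring>
#include <cstdio>

float bitwise_and(float a, float b) {
  uint32_t ia, ib;
  std::memcpy(&ia, &a, sizeof(ia));
  std::memcpy(&ib, &b, sizeof(ib));
  uint32_t ic = ia & ib;
  float c;
  std::memcpy(&c, &ic, sizeof(c));
  return c;
}

int main() {
  // ANDing a value with an all-ones mask returns the value unchanged.
  uint32_t ones = 0xFFFFFFFFu;
  float mask;
  std::memcpy(&mask, &ones, sizeof(mask));
  std::printf("%f\n", bitwise_and(3.5f, mask));  // 3.5
}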
assert(b_data == b.as_bytes() + sizeof(b)); - return Vectorized::loadu(buffer); -} - -template>::value, int> = 0> -inline Vectorized operator&(const Vectorized& a, const Vectorized& b) { - return bitwise_binary_op(a, b, std::bit_and()); -} -template>::value, int> = 0> -inline Vectorized operator|(const Vectorized& a, const Vectorized& b) { - return bitwise_binary_op(a, b, std::bit_or()); -} -template>::value, int> = 0> -inline Vectorized operator^(const Vectorized& a, const Vectorized& b) { - return bitwise_binary_op(a, b, std::bit_xor()); -} - -#endif // defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512) - -template>::value, int> = 0> -inline Vectorized operator~(const Vectorized& a) { - Vectorized ones; // All bits are 1 - memset((T*) ones, 0xFF, VECTOR_WIDTH); - return a ^ ones; -} - -template Vectorized inline operator<<(const Vectorized &a, const Vectorized &b) { - constexpr T max_shift = sizeof(T) * CHAR_BIT; - Vectorized c; - for (size_t i = 0; i != Vectorized::size(); i++) { - T shift = b[i]; - if ((static_cast>(shift) < 0) || (shift >= max_shift)) { - c[i] = 0; - } else { - c[i] = static_cast>(a[i]) << shift; - } - } - return c; -} - -template Vectorized inline operator>>(const Vectorized &a, const Vectorized &b) { - // right shift value to retain sign bit for signed and no bits for unsigned - constexpr T max_shift = sizeof(T) * CHAR_BIT - std::is_signed_v; - Vectorized c; - for (size_t i = 0; i != Vectorized::size(); i++) { - T shift = b[i]; - if ((static_cast>(shift) < 0) || (shift >= max_shift)) { - c[i] = a[i] >> max_shift; - } else { - c[i] = a[i] >> shift; - } - } - return c; -} - -template -inline Vectorized& operator += (Vectorized& a, const Vectorized& b) { - a = a + b; - return a; -} -template -inline Vectorized& operator -= (Vectorized& a, const Vectorized& b) { - a = a - b; - return a; -} -template -inline Vectorized& operator /= (Vectorized& a, const Vectorized& b) { - a = a / b; - return a; -} -template -inline Vectorized& operator %= (Vectorized& a, const Vectorized& b) { - a = a % b; - return a; -} -template -inline Vectorized& operator *= (Vectorized& a, const Vectorized& b) { - a = a * b; - return a; -} - -template -inline Vectorized& operator <<= (Vectorized& a, const Vectorized& b) { - a = a << b; - return a; -} - -template -inline Vectorized& operator >>= (Vectorized& a, const Vectorized& b) { - a = a >> b; - return a; -} - -template -inline Vectorized fmadd(const Vectorized& a, const Vectorized& b, const Vectorized& c) { - return a * b + c; -} - -template -inline Vectorized fmsub(const Vectorized& a, const Vectorized& b, const Vectorized& c) { - return a * b - c; -} - -template -std::enable_if_t> -inline gather(T const* base_addr, const Vectorized>& vindex) { - static constexpr int size = Vectorized::size(); - int_same_size_t index_arr[size]; - vindex.store(static_cast(index_arr)); - T buffer[size]; - for (size_t i = 0; i < size; ++i) { - buffer[i] = base_addr[index_arr[i] * scale / sizeof(T)]; - } - return Vectorized::loadu(static_cast(buffer)); -} - -template -std::enable_if_t> -inline mask_gather(const Vectorized& src, T const* base_addr, - const Vectorized>& vindex, Vectorized& mask) { - static constexpr int size = Vectorized::size(); - T src_arr[size]; - int_same_size_t mask_arr[size]; // use int type so we can logical and - int_same_size_t index_arr[size]; - src.store(static_cast(src_arr)); - mask.store(static_cast(mask_arr)); - vindex.store(static_cast(index_arr)); - T buffer[size]; - for (size_t i = 0; i < size; ++i) { - if (mask_arr[i] & 
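A scalar sketch of the gather<scale>() indexing convention above: each index is scaled by `scale` bytes and divided by sizeof(T), matching the hardware gather convention where scale is a byte multiplier. Helper name and the main() harness are mine.

#include <cstdint>
#include <cstdio>

template <int64_t scale, typename T, int N>
void gather(const T* base, const int64_t (&idx)[N], T (&out)[N]) {
  for (int i = 0; i < N; ++i) {
    out[i] = base[idx[i] * scale / static_cast<int64_t>(sizeof(T))];
  }
}

int main() {
  double data[8] = {0, 10, 20, 30, 40, 50, 60, 70};
  int64_t idx[4] = {3, 0, 2, 1};
  double out[4];
  gather<sizeof(double)>(data, idx, out);  // scale == sizeof(T): plain indexing
  std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 30 0 20 10
}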
0x01) { // check highest bit - buffer[i] = base_addr[index_arr[i] * scale / sizeof(T)]; - } else { - buffer[i] = src_arr[i]; - } - } - mask = Vectorized(); // "zero out" mask - return Vectorized::loadu(static_cast(buffer)); -} - -// Cast a given vector to another type without changing the bits representation. -// So a Vectorized of 512 bits containing all ones can be cast to a -// Vectorized of 512 bits containing all ones (i.e., eight negative 1s). -// A Vec of 256 bits containing all ones can be cast to a -// Vec of 256 bits containing all ones (i.e., four negative 1s). -// There is a struct here because we don't have static_if and I can't -// partially specialize a templated function. -template -struct CastImpl { - static inline Vectorized apply(const Vectorized& src) { - src_t src_arr[Vectorized::size()]; - src.store(static_cast(src_arr)); - return Vectorized::loadu(static_cast(src_arr)); - } -}; - -template -struct CastImpl { - static inline Vectorized apply(const Vectorized& src) { - return src; - } -}; - -template -inline Vectorized cast(const Vectorized& src) { - return CastImpl::apply(src); -} - -template -inline Vectorized> convert_to_int_of_same_size(const Vectorized& src) { - static constexpr int size = Vectorized::size(); - T src_arr[size]; - src.store(static_cast(src_arr)); - int_same_size_t buffer[size]; - for (size_t i = 0; i < size; ++i) { - buffer[i] = static_cast>(src_arr[i]); - } - return Vectorized>::loadu(static_cast(buffer)); -} - -// Example inputs for AVX512: -// a Vectorized = {a0, b0, a1, b1, a2, b2, a3, b3, a4, b4, a5, b5, a6, b6, a7, b7} -// b Vectorized = {a8, b8, a9, b9, a10, b10, a11, b11, a12, b12, a13, b13, a14, b14, a15, b15} -// returns: -// Vectorized = {a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15} -// Vectorized = {b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15} -// Example inputs for AVX2: a Vectorized = {a0, b0, a1, b1, a2, b2, a3, b3} -// b Vectorized = {a4, b4, a5, b5, a6, b6, a7, b7} -// returns: Vectorized = {a0, a1, a2, a3, a4, a5, a6, a7} -// Vectorized = {b0, b1, b2, b3, b4, b5, b6, b7} -template -inline std::enable_if_t::size() % 2 == 0, std::pair, Vectorized>> -deinterleave2(const Vectorized& a, const Vectorized& b) { - static constexpr int size = Vectorized::size(); - static constexpr int half_size = size / 2; - T a_arr[size]; - T b_arr[size]; - T buffer1[size]; - T buffer2[size]; - a.store(static_cast(a_arr)); - b.store(static_cast(b_arr)); - for (size_t i = 0; i < half_size; ++i) { - buffer1[i] = a_arr[i * 2]; - buffer1[half_size + i] = b_arr[i * 2]; - buffer2[i] = a_arr[i * 2 + 1]; - buffer2[half_size + i] = b_arr[i * 2 + 1]; - } - return std::make_pair(Vectorized::loadu(static_cast(buffer1)), - Vectorized::loadu(static_cast(buffer2))); -} - -// inverse operation of deinterleave2 -// Example inputs for AVX512: -// a Vectorized = {a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15} -// b Vectorized = {b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15} -// returns, for AVX512: -// Vectorized = {a0, b0, a1, b1, a2, b2, a3, b3, a4, b4, a5, b5, a6, b6, a7, b7} -// Vectorized = {a8, b8, a9, b9, a10, b10, a11, b11, a12, b12, a13, b13, a14, b14, a15, b15} -// Example inputs for AVX2 : a Vectorized = {a0, a1, a2, a3, a4, a5, a6, a7} -// b Vectorized = {b0, b1, b2, b3, b4, b5, b6, b7} -// returns: Vectorized = {a0, b0, a1, b1, a2, b2, a3, b3} -// Vectorized = {a4, b4, a5, b5, a6, b6, a7, b7} -template -inline std::enable_if_t::size() % 2 == 0, std::pair, 
Vectorized>> -interleave2(const Vectorized& a, const Vectorized& b) { - static constexpr int size = Vectorized::size(); - static constexpr int half_size = size / 2; - T a_arr[size]; - T b_arr[size]; - T buffer1[size]; - T buffer2[size]; - a.store(static_cast(a_arr)); - b.store(static_cast(b_arr)); - for (size_t i = 0; i < half_size; ++i) { - buffer1[i * 2] = a_arr[i]; - buffer1[i * 2 + 1] = b_arr[i]; - buffer2[i * 2] = a_arr[half_size + i]; - buffer2[i * 2 + 1] = b_arr[half_size + i]; - } - return std::make_pair(Vectorized::loadu(static_cast(buffer1)), - Vectorized::loadu(static_cast(buffer2))); -} - -template -inline void convert(const src_T *src, dst_T *dst, int64_t n) { -#ifndef _MSC_VER -# pragma unroll -#endif - for (int64_t i = 0; i < n; ++i) { - (void)i; //Suppress unused variable warning - *dst = static_cast(*src); - src++; - dst++; - } -} - -template -inline Vectorized flip(const Vectorized & data) { - static constexpr int size = Vectorized::size(); - T output[size]; - T buffer[size]; - data.store(static_cast(buffer)); - for (size_t i = 0; i < size; ++i) { - output[i] = buffer[size - i - 1]; - } - return Vectorized::loadu(static_cast(output)); -} - -// Transpose the `src` buffer of type `T` and size (M,N) into the `dst` buffer. `ld_src` is the leading -// dimension of `src` and `ld_dst` is the leading dimension of `dst`. -template -inline void transpose_mxn(const T* src, int64_t ld_src, T* dst, int64_t ld_dst) { - for (size_t i = 0; i < M; i++) { - for (int j = 0; j < N; j++) { - dst[j*ld_dst + i] = src[i*ld_src + j]; - } - } -} - -} // namespace CPU_CAPABILITY - -} // namespace vec -} // namespace executorch
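
The generic fallbacks deleted above (fmadd, interleave2, cast, gather, the shift and compound-assignment operators) have counterparts with the same signatures in ATen's vec_base.h under at::vec, so call sites keep the same shape after this removal. A minimal usage sketch, assuming an ATen build with ATen/cpu/vec/vec.h on the include path; the scalar type, values, and main() below are illustrative and not taken from this patch:

// Sketch only: exercises at::vec counterparts of the helpers removed above.
#include <ATen/cpu/vec/vec.h>
#include <iostream>

int main() {
  using Vec = at::vec::Vectorized<float>;
  Vec a(1.0f), b(2.0f), c(0.5f);           // broadcast constructors
  Vec d = at::vec::fmadd(a, b, c);         // elementwise a * b + c -> 2.5f in every lane
  auto ab = at::vec::interleave2(a, b);    // first half interleaved: {a0, b0, a1, b1, ...}
  float out[Vec::size()];
  float inter[Vec::size()];
  d.store(out);                            // out[0] == 2.5f
  ab.first.store(inter);                   // inter[0] == 1.0f, inter[1] == 2.0f
  std::cout << out[0] << " " << inter[1] << "\n";
  return 0;
}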