Skip to content

Commit

Permalink
Remove ExecuTorch copy of Vectorized
Browse files (browse the repository at this point in the history)
All uses are outside ExecuTorch core, so we can just use ATen Vectorized.

Differential Revision: [D66396016](https://our.internmc.facebook.com/intern/diff/D66396016/)

ghstack-source-id: 255095942
Pull Request resolved: #7042
Branch information:
swolchok committed Nov 23, 2024
1 parent 5036f3d commit 45e70e6
Show file tree
Hide file tree
Showing 26 changed files with 128 additions and 5,147 deletions.
15 changes: 4 additions & 11 deletions extension/llm/custom_ops/op_sdpa.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@

#include <executorch/extension/llm/custom_ops/op_sdpa.h>

#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>
#include <executorch/kernels/optimized/blas/CPUBlas.h>
#include <executorch/kernels/optimized/vec/functional.h>
#include <executorch/kernels/optimized/vec/vec.h>
#include <executorch/runtime/core/exec_aten/util/dim_order_util.h>
// @lint-ignore CLANGTIDY facebook-unused-include-check
#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
Expand All @@ -34,18 +35,10 @@ namespace util {
constexpr size_t kKVDim = 4;

template <typename T>
inline void _store(T* dst, ::executorch::vec::Vectorized<T> src) {
inline void _store(T* dst, ::at::vec::Vectorized<T> src) {
src.store(dst);
}

/*
inline void _store(::Half* dst, at::vec::Vectorized<float> src) {
//fp16_ieee_to_fp32_value
auto res = at::vec::convert_float_half(src, src);
res.store(dst, at::vec::Vectorized<float>::size());
}
*/

template <typename T>
inline T data_index_init(T offset) {
return offset;
Expand Down Expand Up @@ -78,7 +71,7 @@ inline double calculate_scale(const Tensor& query, optional<double> scale) {
}

} // namespace util
namespace vec = ::executorch::vec;
namespace vec = ::at::vec;
using Tensor = exec_aten::Tensor;

namespace {
Expand Down
28 changes: 14 additions & 14 deletions kernels/optimized/cpu/moments_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
// for use in optimized ExecuTorch ops. Template specializations of BFloat16
// are excluded.

#include <executorch/kernels/optimized/vec/vec.h>
#include <ATen/cpu/vec/vec.h>

#include <executorch/kernels/optimized/utils/math_utils.h>
#include <executorch/runtime/platform/compiler.h>
Expand Down Expand Up @@ -47,12 +47,12 @@ void AddMoments(
template <typename T>
ET_INLINE void AddMomentsVec(
int64_t m0_add,
const executorch::vec::Vectorized<T>& m1_add,
const executorch::vec::Vectorized<T>& m2_add,
const at::vec::Vectorized<T>& m1_add,
const at::vec::Vectorized<T>& m2_add,
int64_t& m0,
executorch::vec::Vectorized<T>& m1,
executorch::vec::Vectorized<T>& m2) {
using Vec = executorch::vec::Vectorized<T>;
at::vec::Vectorized<T>& m1,
at::vec::Vectorized<T>& m2) {
using Vec = at::vec::Vectorized<T>;
const int64_t n = m0 + m0_add;
const T c =
n == 0 ? static_cast<T>(0) : static_cast<T>(m0_add) / static_cast<T>(n);
Expand All @@ -67,11 +67,11 @@ template <typename T>
inline void UpdateMomentsVec(
int64_t m0,
const T* X_ptr,
const std::array<executorch::vec::Vectorized<acc_t<T>>, kChunkSize>& c_vecs,
const std::array<at::vec::Vectorized<acc_t<T>>, kChunkSize>& c_vecs,
int64_t& m0_stk0,
executorch::vec::Vectorized<acc_t<T>>& m1_stk0,
executorch::vec::Vectorized<acc_t<T>>& m2_stk0) {
using Vec = executorch::vec::Vectorized<acc_t<T>>;
at::vec::Vectorized<acc_t<T>>& m1_stk0,
at::vec::Vectorized<acc_t<T>>& m2_stk0) {
using Vec = at::vec::Vectorized<acc_t<T>>;
Vec m1_vec(0);
Vec m2_vec(0);
for (int64_t j = 0; j < m0; ++j) {
Expand All @@ -92,13 +92,13 @@ std::pair<acc_t<T>, acc_t<T>>
RowwiseMomentsImpl(const T* X, int64_t N, int64_t ddof = 0) {
using T_ACC = acc_t<T>;

constexpr int64_t kVecSize = executorch::vec::Vectorized<T>::size();
constexpr int64_t kAccVecSize = executorch::vec::Vectorized<T_ACC>::size();
constexpr int64_t kVecSize = at::vec::Vectorized<T>::size();
constexpr int64_t kAccVecSize = at::vec::Vectorized<T_ACC>::size();
const int64_t n = N / kVecSize;
const int64_t m = executorch::utils::divup(n, kChunkSize);
const int64_t depth = executorch::utils::CeilLog2(m);

using Vec = executorch::vec::Vectorized<T_ACC>;
using Vec = at::vec::Vectorized<T_ACC>;
const Vec kZeroVec(T_ACC(0));
std::array<int64_t, kMaxDepth> m0_stk;
std::array<Vec, kMaxDepth> m1_stk;
Expand Down Expand Up @@ -168,7 +168,7 @@ RowwiseMomentsImpl(const T* X, int64_t N, int64_t ddof = 0) {
template <typename T>
std::pair<acc_t<T>, acc_t<T>>
RowwiseMoments(const T* X, int64_t N, int64_t ddof = 0) {
using Vec = executorch::vec::Vectorized<T>;
using Vec = at::vec::Vectorized<T>;
constexpr int64_t kVecSize = Vec::size();
const int64_t n = N / kVecSize;
const int64_t m = executorch::utils::divup(n, kChunkSize);
Expand Down
17 changes: 9 additions & 8 deletions kernels/optimized/cpu/op_add.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@
* LICENSE file in the root directory of this source tree.
*/

#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>
#include <executorch/kernels/optimized/cpu/binary_ops.h>
#include <executorch/kernels/optimized/vec/functional.h>
#include <executorch/kernels/optimized/vec/vec.h>
#include <executorch/kernels/portable/cpu/scalar_utils.h>
#include <executorch/kernels/portable/cpu/util/broadcast_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>
Expand Down Expand Up @@ -99,8 +100,8 @@ Tensor& opt_add_out(
CTYPE_B b_val = *b.const_data_ptr<CTYPE_B>();
CTYPE b_casted = static_cast<CTYPE>(b_val);

using Vec = executorch::vec::Vectorized<CTYPE>;
executorch::vec::map<CTYPE>(
using Vec = at::vec::Vectorized<CTYPE>;
at::vec::map<CTYPE>(
[alpha_val, b_casted](Vec x) {
return x + Vec(alpha_val * b_casted);
},
Expand Down Expand Up @@ -131,8 +132,8 @@ Tensor& opt_add_out(
ET_KERNEL_CHECK(
ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, );

using Vec = executorch::vec::Vectorized<CTYPE>;
executorch::vec::map2<CTYPE>(
using Vec = at::vec::Vectorized<CTYPE>;
at::vec::map2<CTYPE>(
[alpha_val](Vec x, Vec y) { return x + Vec(alpha_val) * y; },
out.mutable_data_ptr<CTYPE>(),
a.const_data_ptr<CTYPE>(),
Expand Down Expand Up @@ -166,7 +167,7 @@ Tensor& opt_add_out(
ET_KERNEL_CHECK(
ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, );

using Vec = executorch::vec::Vectorized<CTYPE>;
using Vec = at::vec::Vectorized<CTYPE>;
executorch::vec::broadcasting_map_2d_by_1d<CTYPE>(
[alpha_val](Vec x, Vec y) { return x + Vec(alpha_val) * y; },
out.mutable_data_ptr<CTYPE>(),
Expand Down Expand Up @@ -244,8 +245,8 @@ Tensor& opt_add_scalar_out(
CTYPE alpha_val;
ET_EXTRACT_SCALAR(alpha, alpha_val);

using Vec = executorch::vec::Vectorized<CTYPE>;
executorch::vec::map<CTYPE>(
using Vec = at::vec::Vectorized<CTYPE>;
at::vec::map<CTYPE>(
[alpha_val, b_casted](Vec x) {
return x + Vec(alpha_val * b_casted);
},
Expand Down
19 changes: 10 additions & 9 deletions kernels/optimized/cpu/op_div.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@
* LICENSE file in the root directory of this source tree.
*/

#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>
#include <executorch/kernels/optimized/cpu/binary_ops.h>
#include <executorch/kernels/optimized/vec/functional.h>
#include <executorch/kernels/optimized/vec/vec.h>
#include <executorch/kernels/portable/cpu/scalar_utils.h>
#include <executorch/kernels/portable/cpu/util/broadcast_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>
Expand Down Expand Up @@ -76,16 +77,16 @@ Tensor& opt_div_out(
CTYPE_SCALAR scalar_val = *scalar->const_data_ptr<CTYPE_SCALAR>();
CTYPE scalar_casted = static_cast<CTYPE>(scalar_val);

using Vec = executorch::vec::Vectorized<CTYPE>;
using Vec = at::vec::Vectorized<CTYPE>;
if (a.numel() == 1) {
executorch::vec::map<CTYPE>(
at::vec::map<CTYPE>(
[scalar_casted](Vec x) { return Vec(scalar_casted) / x; },
out.mutable_data_ptr<CTYPE>(),
tensor->const_data_ptr<CTYPE>(),
out.numel());
} else {
Vec inv_scalar_casted_vec(CTYPE(1) / scalar_casted);
executorch::vec::map<CTYPE>(
at::vec::map<CTYPE>(
[inv_scalar_casted_vec](Vec x) {
return x * inv_scalar_casted_vec;
},
Expand All @@ -111,8 +112,8 @@ Tensor& opt_div_out(
"Failed to resize output tensor.");

ET_SWITCH_REAL_TYPES_AND(Bool, out_type, ctx, "div.out", CTYPE, [&]() {
using Vec = executorch::vec::Vectorized<CTYPE>;
executorch::vec::map2<CTYPE>(
using Vec = at::vec::Vectorized<CTYPE>;
at::vec::map2<CTYPE>(
[](Vec x, Vec y) { return x / y; },
out.mutable_data_ptr<CTYPE>(),
a.const_data_ptr<CTYPE>(),
Expand Down Expand Up @@ -142,7 +143,7 @@ Tensor& opt_div_out(
out,
"Failed to resize output tensor.");
ET_SWITCH_REALB_TYPES(out_type, ctx, "sub.out", CTYPE, [&]() {
using Vec = executorch::vec::Vectorized<CTYPE>;
using Vec = at::vec::Vectorized<CTYPE>;
if (selected_optimized_path ==
ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments) {
executorch::vec::broadcasting_map_2d_by_1d<CTYPE>(
Expand Down Expand Up @@ -222,9 +223,9 @@ Tensor& opt_div_scalar_out(
ET_EXTRACT_SCALAR(b, b_val);
CTYPE b_casted = static_cast<CTYPE>(b_val);

using Vec = executorch::vec::Vectorized<CTYPE>;
using Vec = at::vec::Vectorized<CTYPE>;
Vec inv_b_casted_vec(CTYPE(1) / b_casted);
executorch::vec::map<CTYPE>(
at::vec::map<CTYPE>(
[inv_b_casted_vec](Vec x) { return x * inv_b_casted_vec; },
out.mutable_data_ptr<CTYPE>(),
a.const_data_ptr<CTYPE>(),
Expand Down
7 changes: 4 additions & 3 deletions kernels/optimized/cpu/op_exp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,9 @@

#include <cmath>

#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>
#include <executorch/kernels/optimized/vec/functional.h>
#include <executorch/kernels/optimized/vec/vec.h>
#include <executorch/runtime/kernel/kernel_includes.h>

namespace torch {
Expand All @@ -34,8 +35,8 @@ void exp_data(
const CTYPE_IN* in_data,
const size_t numel,
CTYPE_OUT* out_data) {
using Vec = executorch::vec::Vectorized<CTYPE_IN>;
executorch::vec::map<CTYPE_IN>(
using Vec = at::vec::Vectorized<CTYPE_IN>;
at::vec::map<CTYPE_IN>(
[](Vec x) { return x.exp(); }, out_data, in_data, numel);
}

Expand Down
11 changes: 6 additions & 5 deletions kernels/optimized/cpu/op_le.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@
* LICENSE file in the root directory of this source tree.
*/

#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>
#include <executorch/kernels/optimized/vec/functional.h>
#include <executorch/kernels/optimized/vec/vec.h>
#include <executorch/kernels/portable/cpu/scalar_utils.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include <executorch/runtime/platform/assert.h>
Expand Down Expand Up @@ -44,8 +45,8 @@ Tensor& opt_le_tensor_out(
if (a_type == b_type && a_type == out_type) {
ET_SWITCH_REAL_TYPES_AND(
Bool, out_type, ctx, "le.Tensor_out", CTYPE, [&]() {
using Vec = executorch::vec::Vectorized<CTYPE>;
executorch::vec::map2<CTYPE>(
using Vec = at::vec::Vectorized<CTYPE>;
at::vec::map2<CTYPE>(
[](Vec x, Vec y) { return x.le(y); },
out.mutable_data_ptr<CTYPE>(),
a.const_data_ptr<CTYPE>(),
Expand Down Expand Up @@ -109,8 +110,8 @@ Tensor& opt_le_scalar_out(
CTYPE_B b_val = 0;
ET_EXTRACT_SCALAR(b, b_val);
CTYPE b_casted = static_cast<CTYPE>(b_val);
using Vec = executorch::vec::Vectorized<CTYPE>;
executorch::vec::map<CTYPE>(
using Vec = at::vec::Vectorized<CTYPE>;
at::vec::map<CTYPE>(
[b_casted](Vec x) { return x.le(Vec(b_casted)); },
out.mutable_data_ptr<CTYPE>(),
a.const_data_ptr<CTYPE>(),
Expand Down
19 changes: 10 additions & 9 deletions kernels/optimized/cpu/op_mul.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@
* LICENSE file in the root directory of this source tree.
*/

#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>
#include <executorch/kernels/optimized/cpu/binary_ops.h>
#include <executorch/kernels/optimized/vec/functional.h>
#include <executorch/kernels/optimized/vec/vec.h>
#include <executorch/kernels/portable/cpu/scalar_utils.h>
#include <executorch/kernels/portable/cpu/util/broadcast_util.h>
#include <executorch/runtime/core/exec_aten/util/tensor_util.h> // IWYU pragma: export
Expand Down Expand Up @@ -95,7 +96,7 @@ Tensor& handle_last_dim_broadcast(
const size_t outer_size = getLeadingDims(out, out.dim() - 1);
const auto broadcast_size = out.size(out.dim() - 1);
ET_SWITCH_REALB_TYPES(out_type, ctx, "mul.out", CTYPE, [&]() {
using Vec = executorch::vec::Vectorized<CTYPE>;
using Vec = at::vec::Vectorized<CTYPE>;
executorch::vec::broadcasting_map_broadcast_last_dim<CTYPE>(
[](Vec x, Vec y) { return x * y; },
out.mutable_data_ptr<CTYPE>(),
Expand Down Expand Up @@ -164,7 +165,7 @@ Tensor& handle_broadcast_mul(
inner_size = lhs->sizes()[lhs->dim() - 1];
}
ET_SWITCH_REALB_TYPES(out_type, ctx, "mul.out", CTYPE, [&]() {
using Vec = executorch::vec::Vectorized<CTYPE>;
using Vec = at::vec::Vectorized<CTYPE>;
executorch::vec::broadcasting_map_3d_and_unsqueezed_3d<CTYPE>(
[](Vec x, Vec y) { return x * y; },
out.mutable_data_ptr<CTYPE>(),
Expand Down Expand Up @@ -203,8 +204,8 @@ Tensor& opt_mul_out(
CTYPE_B b_val = *b.const_data_ptr<CTYPE_B>();
CTYPE b_casted = static_cast<CTYPE>(b_val);

using Vec = executorch::vec::Vectorized<CTYPE>;
executorch::vec::map<CTYPE>(
using Vec = at::vec::Vectorized<CTYPE>;
at::vec::map<CTYPE>(
[b_casted](Vec x) { return x * Vec(b_casted); },
out.mutable_data_ptr<CTYPE>(),
a.const_data_ptr<CTYPE>(),
Expand All @@ -229,8 +230,8 @@ Tensor& opt_mul_out(
"Failed to resize output tensor.");

ET_SWITCH_REALB_TYPES(out_type, ctx, "mul.out", CTYPE, [&]() {
using Vec = executorch::vec::Vectorized<CTYPE>;
executorch::vec::map2<CTYPE>(
using Vec = at::vec::Vectorized<CTYPE>;
at::vec::map2<CTYPE>(
[](Vec x, Vec y) { return x * y; },
out.mutable_data_ptr<CTYPE>(),
a.const_data_ptr<CTYPE>(),
Expand Down Expand Up @@ -306,8 +307,8 @@ Tensor& opt_mul_scalar_out(
ET_EXTRACT_SCALAR(b, b_val);
CTYPE b_casted = static_cast<CTYPE>(b_val);

using Vec = executorch::vec::Vectorized<CTYPE>;
executorch::vec::map<CTYPE>(
using Vec = at::vec::Vectorized<CTYPE>;
at::vec::map<CTYPE>(
[b_casted](Vec x) { return x * Vec(b_casted); },
out.mutable_data_ptr<CTYPE>(),
a.const_data_ptr<CTYPE>(),
Expand Down
7 changes: 4 additions & 3 deletions kernels/optimized/cpu/op_native_layer_norm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,10 @@
#include <cmath>
#include <tuple>

#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>
#include <executorch/kernels/optimized/cpu/moments_utils.h>
#include <executorch/kernels/optimized/vec/functional.h>
#include <executorch/kernels/optimized/vec/vec.h>
#include <executorch/kernels/portable/cpu/util/normalization_ops_util.h>

namespace torch {
Expand All @@ -33,7 +34,7 @@ void layer_norm(
Tensor& out,
Tensor& mean,
Tensor& rstd) {
using Vec = executorch::vec::Vectorized<CTYPE>;
using Vec = at::vec::Vectorized<CTYPE>;

const size_t dim = input.dim() - normalized_shape.size();
const size_t dim_size = input.size(dim);
Expand Down Expand Up @@ -93,7 +94,7 @@ void layer_norm(
dst_ptr[j] = (src_ptr[j] * scale + offset) * gamma_v + beta_v;
}
} else {
executorch::vec::map3<CTYPE>(
at::vec::map3<CTYPE>(
[scale, offset](Vec x, Vec gamma, Vec beta) {
return (x * Vec(scale) + Vec(offset)) * gamma + beta;
},
Expand Down
7 changes: 4 additions & 3 deletions kernels/optimized/cpu/op_neg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@
* LICENSE file in the root directory of this source tree.
*/

#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>
#include <executorch/kernels/optimized/vec/functional.h>
#include <executorch/kernels/optimized/vec/vec.h>
#include <executorch/runtime/kernel/kernel_includes.h>

namespace torch {
Expand All @@ -27,8 +28,8 @@ Tensor& opt_neg_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
"Failed to resize output tensor.");

ET_SWITCH_REAL_TYPES(in.scalar_type(), ctx, "neg.out", CTYPE, [&] {
using Vec = executorch::vec::Vectorized<CTYPE>;
executorch::vec::map<CTYPE>(
using Vec = at::vec::Vectorized<CTYPE>;
at::vec::map<CTYPE>(
[](Vec x) { return x.neg(); },
out.mutable_data_ptr<CTYPE>(),
in.const_data_ptr<CTYPE>(),
Expand Down
Loading

0 comments on commit 45e70e6

Please sign in to comment.