Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Remove ExecuTorch copy of Vectorized #7042

Open
wants to merge 6 commits into
base: gh/swolchok/121/base
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
15 changes: 4 additions & 11 deletions extension/llm/custom_ops/op_sdpa.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@

#include <executorch/extension/llm/custom_ops/op_sdpa.h>

#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>
#include <executorch/kernels/optimized/blas/CPUBlas.h>
#include <executorch/kernels/optimized/vec/functional.h>
#include <executorch/kernels/optimized/vec/vec.h>
#include <executorch/runtime/core/exec_aten/util/dim_order_util.h>
// @lint-ignore CLANGTIDY facebook-unused-include-check
#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
Expand All @@ -34,18 +35,10 @@ namespace util {
constexpr size_t kKVDim = 4;

template <typename T>
inline void _store(T* dst, ::executorch::vec::Vectorized<T> src) {
inline void _store(T* dst, ::at::vec::Vectorized<T> src) {
src.store(dst);
}

/*
inline void _store(::Half* dst, at::vec::Vectorized<float> src) {
//fp16_ieee_to_fp32_value
auto res = at::vec::convert_float_half(src, src);
res.store(dst, at::vec::Vectorized<float>::size());
}
*/

template <typename T>
inline T data_index_init(T offset) {
return offset;
Expand Down Expand Up @@ -78,7 +71,7 @@ inline double calculate_scale(const Tensor& query, optional<double> scale) {
}

} // namespace util
namespace vec = ::executorch::vec;
namespace vec = ::at::vec;
using Tensor = exec_aten::Tensor;

namespace {
Expand Down
28 changes: 14 additions & 14 deletions kernels/optimized/cpu/moments_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
// for use in optimized ExecuTorch ops. Template specializations of BFloat16
// are excluded.

#include <executorch/kernels/optimized/vec/vec.h>
#include <ATen/cpu/vec/vec.h>

#include <executorch/kernels/optimized/utils/math_utils.h>
#include <executorch/runtime/platform/compiler.h>
Expand Down Expand Up @@ -47,12 +47,12 @@ void AddMoments(
template <typename T>
ET_INLINE void AddMomentsVec(
int64_t m0_add,
const executorch::vec::Vectorized<T>& m1_add,
const executorch::vec::Vectorized<T>& m2_add,
const at::vec::Vectorized<T>& m1_add,
const at::vec::Vectorized<T>& m2_add,
int64_t& m0,
executorch::vec::Vectorized<T>& m1,
executorch::vec::Vectorized<T>& m2) {
using Vec = executorch::vec::Vectorized<T>;
at::vec::Vectorized<T>& m1,
at::vec::Vectorized<T>& m2) {
using Vec = at::vec::Vectorized<T>;
const int64_t n = m0 + m0_add;
const T c =
n == 0 ? static_cast<T>(0) : static_cast<T>(m0_add) / static_cast<T>(n);
Expand All @@ -67,11 +67,11 @@ template <typename T>
inline void UpdateMomentsVec(
int64_t m0,
const T* X_ptr,
const std::array<executorch::vec::Vectorized<acc_t<T>>, kChunkSize>& c_vecs,
const std::array<at::vec::Vectorized<acc_t<T>>, kChunkSize>& c_vecs,
int64_t& m0_stk0,
executorch::vec::Vectorized<acc_t<T>>& m1_stk0,
executorch::vec::Vectorized<acc_t<T>>& m2_stk0) {
using Vec = executorch::vec::Vectorized<acc_t<T>>;
at::vec::Vectorized<acc_t<T>>& m1_stk0,
at::vec::Vectorized<acc_t<T>>& m2_stk0) {
using Vec = at::vec::Vectorized<acc_t<T>>;
Vec m1_vec(0);
Vec m2_vec(0);
for (int64_t j = 0; j < m0; ++j) {
Expand All @@ -92,13 +92,13 @@ std::pair<acc_t<T>, acc_t<T>>
RowwiseMomentsImpl(const T* X, int64_t N, int64_t ddof = 0) {
using T_ACC = acc_t<T>;

constexpr int64_t kVecSize = executorch::vec::Vectorized<T>::size();
constexpr int64_t kAccVecSize = executorch::vec::Vectorized<T_ACC>::size();
constexpr int64_t kVecSize = at::vec::Vectorized<T>::size();
constexpr int64_t kAccVecSize = at::vec::Vectorized<T_ACC>::size();
const int64_t n = N / kVecSize;
const int64_t m = executorch::utils::divup(n, kChunkSize);
const int64_t depth = executorch::utils::CeilLog2(m);

using Vec = executorch::vec::Vectorized<T_ACC>;
using Vec = at::vec::Vectorized<T_ACC>;
const Vec kZeroVec(T_ACC(0));
std::array<int64_t, kMaxDepth> m0_stk;
std::array<Vec, kMaxDepth> m1_stk;
Expand Down Expand Up @@ -168,7 +168,7 @@ RowwiseMomentsImpl(const T* X, int64_t N, int64_t ddof = 0) {
template <typename T>
std::pair<acc_t<T>, acc_t<T>>
RowwiseMoments(const T* X, int64_t N, int64_t ddof = 0) {
using Vec = executorch::vec::Vectorized<T>;
using Vec = at::vec::Vectorized<T>;
constexpr int64_t kVecSize = Vec::size();
const int64_t n = N / kVecSize;
const int64_t m = executorch::utils::divup(n, kChunkSize);
Expand Down
17 changes: 9 additions & 8 deletions kernels/optimized/cpu/op_add.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@
* LICENSE file in the root directory of this source tree.
*/

#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>
#include <executorch/kernels/optimized/cpu/binary_ops.h>
#include <executorch/kernels/optimized/vec/functional.h>
#include <executorch/kernels/optimized/vec/vec.h>
#include <executorch/kernels/portable/cpu/scalar_utils.h>
#include <executorch/kernels/portable/cpu/util/broadcast_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>
Expand Down Expand Up @@ -99,8 +100,8 @@ Tensor& opt_add_out(
CTYPE_B b_val = *b.const_data_ptr<CTYPE_B>();
CTYPE b_casted = static_cast<CTYPE>(b_val);

using Vec = executorch::vec::Vectorized<CTYPE>;
executorch::vec::map<CTYPE>(
using Vec = at::vec::Vectorized<CTYPE>;
at::vec::map<CTYPE>(
[alpha_val, b_casted](Vec x) {
return x + Vec(alpha_val * b_casted);
},
Expand Down Expand Up @@ -131,8 +132,8 @@ Tensor& opt_add_out(
ET_KERNEL_CHECK(
ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, );

using Vec = executorch::vec::Vectorized<CTYPE>;
executorch::vec::map2<CTYPE>(
using Vec = at::vec::Vectorized<CTYPE>;
at::vec::map2<CTYPE>(
[alpha_val](Vec x, Vec y) { return x + Vec(alpha_val) * y; },
out.mutable_data_ptr<CTYPE>(),
a.const_data_ptr<CTYPE>(),
Expand Down Expand Up @@ -166,7 +167,7 @@ Tensor& opt_add_out(
ET_KERNEL_CHECK(
ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, );

using Vec = executorch::vec::Vectorized<CTYPE>;
using Vec = at::vec::Vectorized<CTYPE>;
executorch::vec::broadcasting_map_2d_by_1d<CTYPE>(
[alpha_val](Vec x, Vec y) { return x + Vec(alpha_val) * y; },
out.mutable_data_ptr<CTYPE>(),
Expand Down Expand Up @@ -244,8 +245,8 @@ Tensor& opt_add_scalar_out(
CTYPE alpha_val;
ET_EXTRACT_SCALAR(alpha, alpha_val);

using Vec = executorch::vec::Vectorized<CTYPE>;
executorch::vec::map<CTYPE>(
using Vec = at::vec::Vectorized<CTYPE>;
at::vec::map<CTYPE>(
[alpha_val, b_casted](Vec x) {
return x + Vec(alpha_val * b_casted);
},
Expand Down
19 changes: 10 additions & 9 deletions kernels/optimized/cpu/op_div.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@
* LICENSE file in the root directory of this source tree.
*/

#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>
#include <executorch/kernels/optimized/cpu/binary_ops.h>
#include <executorch/kernels/optimized/vec/functional.h>
#include <executorch/kernels/optimized/vec/vec.h>
#include <executorch/kernels/portable/cpu/scalar_utils.h>
#include <executorch/kernels/portable/cpu/util/broadcast_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>
Expand Down Expand Up @@ -76,16 +77,16 @@ Tensor& opt_div_out(
CTYPE_SCALAR scalar_val = *scalar->const_data_ptr<CTYPE_SCALAR>();
CTYPE scalar_casted = static_cast<CTYPE>(scalar_val);

using Vec = executorch::vec::Vectorized<CTYPE>;
using Vec = at::vec::Vectorized<CTYPE>;
if (a.numel() == 1) {
executorch::vec::map<CTYPE>(
at::vec::map<CTYPE>(
[scalar_casted](Vec x) { return Vec(scalar_casted) / x; },
out.mutable_data_ptr<CTYPE>(),
tensor->const_data_ptr<CTYPE>(),
out.numel());
} else {
Vec inv_scalar_casted_vec(CTYPE(1) / scalar_casted);
executorch::vec::map<CTYPE>(
at::vec::map<CTYPE>(
[inv_scalar_casted_vec](Vec x) {
return x * inv_scalar_casted_vec;
},
Expand All @@ -111,8 +112,8 @@ Tensor& opt_div_out(
"Failed to resize output tensor.");

ET_SWITCH_REAL_TYPES_AND(Bool, out_type, ctx, "div.out", CTYPE, [&]() {
using Vec = executorch::vec::Vectorized<CTYPE>;
executorch::vec::map2<CTYPE>(
using Vec = at::vec::Vectorized<CTYPE>;
at::vec::map2<CTYPE>(
[](Vec x, Vec y) { return x / y; },
out.mutable_data_ptr<CTYPE>(),
a.const_data_ptr<CTYPE>(),
Expand Down Expand Up @@ -142,7 +143,7 @@ Tensor& opt_div_out(
out,
"Failed to resize output tensor.");
ET_SWITCH_REALB_TYPES(out_type, ctx, "sub.out", CTYPE, [&]() {
using Vec = executorch::vec::Vectorized<CTYPE>;
using Vec = at::vec::Vectorized<CTYPE>;
if (selected_optimized_path ==
ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments) {
executorch::vec::broadcasting_map_2d_by_1d<CTYPE>(
Expand Down Expand Up @@ -222,9 +223,9 @@ Tensor& opt_div_scalar_out(
ET_EXTRACT_SCALAR(b, b_val);
CTYPE b_casted = static_cast<CTYPE>(b_val);

using Vec = executorch::vec::Vectorized<CTYPE>;
using Vec = at::vec::Vectorized<CTYPE>;
Vec inv_b_casted_vec(CTYPE(1) / b_casted);
executorch::vec::map<CTYPE>(
at::vec::map<CTYPE>(
[inv_b_casted_vec](Vec x) { return x * inv_b_casted_vec; },
out.mutable_data_ptr<CTYPE>(),
a.const_data_ptr<CTYPE>(),
Expand Down
7 changes: 4 additions & 3 deletions kernels/optimized/cpu/op_exp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,9 @@

#include <cmath>

#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>
#include <executorch/kernels/optimized/vec/functional.h>
#include <executorch/kernels/optimized/vec/vec.h>
#include <executorch/runtime/kernel/kernel_includes.h>

namespace torch {
Expand All @@ -34,8 +35,8 @@ void exp_data(
const CTYPE_IN* in_data,
const size_t numel,
CTYPE_OUT* out_data) {
using Vec = executorch::vec::Vectorized<CTYPE_IN>;
executorch::vec::map<CTYPE_IN>(
using Vec = at::vec::Vectorized<CTYPE_IN>;
at::vec::map<CTYPE_IN>(
[](Vec x) { return x.exp(); }, out_data, in_data, numel);
}

Expand Down
11 changes: 6 additions & 5 deletions kernels/optimized/cpu/op_le.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@
* LICENSE file in the root directory of this source tree.
*/

#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>
#include <executorch/kernels/optimized/vec/functional.h>
#include <executorch/kernels/optimized/vec/vec.h>
#include <executorch/kernels/portable/cpu/scalar_utils.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include <executorch/runtime/platform/assert.h>
Expand Down Expand Up @@ -44,8 +45,8 @@ Tensor& opt_le_tensor_out(
if (a_type == b_type && a_type == out_type) {
ET_SWITCH_REAL_TYPES_AND(
Bool, out_type, ctx, "le.Tensor_out", CTYPE, [&]() {
using Vec = executorch::vec::Vectorized<CTYPE>;
executorch::vec::map2<CTYPE>(
using Vec = at::vec::Vectorized<CTYPE>;
at::vec::map2<CTYPE>(
[](Vec x, Vec y) { return x.le(y); },
out.mutable_data_ptr<CTYPE>(),
a.const_data_ptr<CTYPE>(),
Expand Down Expand Up @@ -109,8 +110,8 @@ Tensor& opt_le_scalar_out(
CTYPE_B b_val = 0;
ET_EXTRACT_SCALAR(b, b_val);
CTYPE b_casted = static_cast<CTYPE>(b_val);
using Vec = executorch::vec::Vectorized<CTYPE>;
executorch::vec::map<CTYPE>(
using Vec = at::vec::Vectorized<CTYPE>;
at::vec::map<CTYPE>(
[b_casted](Vec x) { return x.le(Vec(b_casted)); },
out.mutable_data_ptr<CTYPE>(),
a.const_data_ptr<CTYPE>(),
Expand Down
19 changes: 10 additions & 9 deletions kernels/optimized/cpu/op_mul.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@
* LICENSE file in the root directory of this source tree.
*/

#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>
#include <executorch/kernels/optimized/cpu/binary_ops.h>
#include <executorch/kernels/optimized/vec/functional.h>
#include <executorch/kernels/optimized/vec/vec.h>
#include <executorch/kernels/portable/cpu/scalar_utils.h>
#include <executorch/kernels/portable/cpu/util/broadcast_util.h>
#include <executorch/runtime/core/exec_aten/util/tensor_util.h> // IWYU pragma: export
Expand Down Expand Up @@ -95,7 +96,7 @@ Tensor& handle_last_dim_broadcast(
const size_t outer_size = getLeadingDims(out, out.dim() - 1);
const auto broadcast_size = out.size(out.dim() - 1);
ET_SWITCH_REALB_TYPES(out_type, ctx, "mul.out", CTYPE, [&]() {
using Vec = executorch::vec::Vectorized<CTYPE>;
using Vec = at::vec::Vectorized<CTYPE>;
executorch::vec::broadcasting_map_broadcast_last_dim<CTYPE>(
[](Vec x, Vec y) { return x * y; },
out.mutable_data_ptr<CTYPE>(),
Expand Down Expand Up @@ -164,7 +165,7 @@ Tensor& handle_broadcast_mul(
inner_size = lhs->sizes()[lhs->dim() - 1];
}
ET_SWITCH_REALB_TYPES(out_type, ctx, "mul.out", CTYPE, [&]() {
using Vec = executorch::vec::Vectorized<CTYPE>;
using Vec = at::vec::Vectorized<CTYPE>;
executorch::vec::broadcasting_map_3d_and_unsqueezed_3d<CTYPE>(
[](Vec x, Vec y) { return x * y; },
out.mutable_data_ptr<CTYPE>(),
Expand Down Expand Up @@ -203,8 +204,8 @@ Tensor& opt_mul_out(
CTYPE_B b_val = *b.const_data_ptr<CTYPE_B>();
CTYPE b_casted = static_cast<CTYPE>(b_val);

using Vec = executorch::vec::Vectorized<CTYPE>;
executorch::vec::map<CTYPE>(
using Vec = at::vec::Vectorized<CTYPE>;
at::vec::map<CTYPE>(
[b_casted](Vec x) { return x * Vec(b_casted); },
out.mutable_data_ptr<CTYPE>(),
a.const_data_ptr<CTYPE>(),
Expand All @@ -229,8 +230,8 @@ Tensor& opt_mul_out(
"Failed to resize output tensor.");

ET_SWITCH_REALB_TYPES(out_type, ctx, "mul.out", CTYPE, [&]() {
using Vec = executorch::vec::Vectorized<CTYPE>;
executorch::vec::map2<CTYPE>(
using Vec = at::vec::Vectorized<CTYPE>;
at::vec::map2<CTYPE>(
[](Vec x, Vec y) { return x * y; },
out.mutable_data_ptr<CTYPE>(),
a.const_data_ptr<CTYPE>(),
Expand Down Expand Up @@ -306,8 +307,8 @@ Tensor& opt_mul_scalar_out(
ET_EXTRACT_SCALAR(b, b_val);
CTYPE b_casted = static_cast<CTYPE>(b_val);

using Vec = executorch::vec::Vectorized<CTYPE>;
executorch::vec::map<CTYPE>(
using Vec = at::vec::Vectorized<CTYPE>;
at::vec::map<CTYPE>(
[b_casted](Vec x) { return x * Vec(b_casted); },
out.mutable_data_ptr<CTYPE>(),
a.const_data_ptr<CTYPE>(),
Expand Down
7 changes: 4 additions & 3 deletions kernels/optimized/cpu/op_native_layer_norm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,10 @@
#include <cmath>
#include <tuple>

#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>
#include <executorch/kernels/optimized/cpu/moments_utils.h>
#include <executorch/kernels/optimized/vec/functional.h>
#include <executorch/kernels/optimized/vec/vec.h>
#include <executorch/kernels/portable/cpu/util/normalization_ops_util.h>

namespace torch {
Expand All @@ -33,7 +34,7 @@ void layer_norm(
Tensor& out,
Tensor& mean,
Tensor& rstd) {
using Vec = executorch::vec::Vectorized<CTYPE>;
using Vec = at::vec::Vectorized<CTYPE>;

const size_t dim = input.dim() - normalized_shape.size();
const size_t dim_size = input.size(dim);
Expand Down Expand Up @@ -93,7 +94,7 @@ void layer_norm(
dst_ptr[j] = (src_ptr[j] * scale + offset) * gamma_v + beta_v;
}
} else {
executorch::vec::map3<CTYPE>(
at::vec::map3<CTYPE>(
[scale, offset](Vec x, Vec gamma, Vec beta) {
return (x * Vec(scale) + Vec(offset)) * gamma + beta;
},
Expand Down
7 changes: 4 additions & 3 deletions kernels/optimized/cpu/op_neg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@
* LICENSE file in the root directory of this source tree.
*/

#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>
#include <executorch/kernels/optimized/vec/functional.h>
#include <executorch/kernels/optimized/vec/vec.h>
#include <executorch/runtime/kernel/kernel_includes.h>

namespace torch {
Expand All @@ -27,8 +28,8 @@ Tensor& opt_neg_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
"Failed to resize output tensor.");

ET_SWITCH_REAL_TYPES(in.scalar_type(), ctx, "neg.out", CTYPE, [&] {
using Vec = executorch::vec::Vectorized<CTYPE>;
executorch::vec::map<CTYPE>(
using Vec = at::vec::Vectorized<CTYPE>;
at::vec::map<CTYPE>(
[](Vec x) { return x.neg(); },
out.mutable_data_ptr<CTYPE>(),
in.const_data_ptr<CTYPE>(),
Expand Down
Loading
Loading