From 2b8cd1708cc3884a11615e618a14d9011d98f421 Mon Sep 17 00:00:00 2001
From: Alex Marin
Date: Wed, 25 Sep 2024 12:59:37 -0700
Subject: [PATCH 01/36] Add microbenchmark for layer normalization

---
 cmake/onnxruntime_unittests.cmake             |   3 +-
 .../core/providers/cpu/nn/layer_norm_impl.cc  | 113 +++++++++---------
 .../core/providers/cpu/nn/layer_norm_impl.h   |  39 ++++++
 .../microbenchmark/layer_normalization.cc     | 108 +++++++++++++++++
 4 files changed, 207 insertions(+), 56 deletions(-)
 create mode 100644 onnxruntime/test/onnx/microbenchmark/layer_normalization.cc

diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake
index a4ba85e868896..f6ace371531f9 100644
--- a/cmake/onnxruntime_unittests.cmake
+++ b/cmake/onnxruntime_unittests.cmake
@@ -1128,7 +1128,8 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
     ${BENCHMARK_DIR}/gelu.cc
     ${BENCHMARK_DIR}/activation.cc
     ${BENCHMARK_DIR}/quantize.cc
-    ${BENCHMARK_DIR}/reduceminmax.cc)
+    ${BENCHMARK_DIR}/reduceminmax.cc
+    ${BENCHMARK_DIR}/layer_normalization.cc)
   target_include_directories(onnxruntime_benchmark PRIVATE ${ONNXRUNTIME_ROOT} ${onnxruntime_graph_header} ${ONNXRUNTIME_ROOT}/core/mlas/inc)
   target_compile_definitions(onnxruntime_benchmark PRIVATE BENCHMARK_STATIC_DEFINE)
   if(WIN32)
diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc
index 23630dcb63efa..57eb8c69a3067 100644
--- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc
+++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc
@@ -12,6 +12,8 @@
 
 namespace onnxruntime {
 
+namespace {
+
 // Utility to convert from MLFloat16 to float only when the input type is MLFloat16.
 template <typename T, typename Ret>
 ORT_FORCEINLINE Ret ConvertMLFloat16ToDoubleOrFloatIfNeeded(T val);
@@ -63,15 +65,16 @@ ORT_FORCEINLINE constexpr double ConvertToMLFloat16IfNeeded(double val) {
   return val;
 }
 
+}  // namespace
+
 LayerNormImpl::LayerNormImpl(const OpKernelInfo& op_kernel_info, bool simplified, bool contrib_op)
     : OpKernel(op_kernel_info), simplified_{simplified}, contrib_op_{contrib_op} {
   ORT_ENFORCE(op_kernel_info.GetAttr<int64_t>("axis", &axis_).IsOK());
   ORT_ENFORCE(op_kernel_info.GetAttr<float>("epsilon", &epsilon_).IsOK());
 }
 
-namespace {
 template <typename T, typename U>
-Status ComputeImpl(OpKernelContext* p_ctx, int64_t orig_axis, float epsilon, bool simplified) {
+Status LayerNormImpl::ComputeImpl(OpKernelContext* p_ctx, int64_t orig_axis, float epsilon, bool simplified) const {
   // Inputs
   const Tensor* X = p_ctx->Input<Tensor>(0);
   const Tensor* scale = p_ctx->Input<Tensor>(1);
@@ -81,21 +84,12 @@ Status ComputeImpl(OpKernelContext* p_ctx, int64_t orig_axis, float epsilon, boo
   const T* bias_data = (simplified || nullptr == bias) ? nullptr : bias->Data<T>();
 
   const TensorShape& x_shape = X->Shape();
-  const int64_t axis = HandleNegativeAxis(orig_axis, x_shape.NumDimensions());
-  int64_t norm_count = x_shape.SizeToDimension(onnxruntime::narrow<size_t>(axis));
-  int64_t norm_size = x_shape.SizeFromDimension(onnxruntime::narrow<size_t>(axis));
-
-  const auto scale_size = scale->Shape().Size();
-  const auto bias_size = (bias_data) ? bias->Shape().Size() : 0;
-  if (scale_size != norm_size || (bias_data && bias_size != norm_size)) {
-    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
-                           "Size of X.shape()[axis:] == ", norm_size,
-                           ". Size of scale and bias (if provided) must match this. Got scale size of ",
-                           scale_size, " and bias size of ", bias_size);
-  }
-
+  const TensorShape& scale_shape = scale->Shape();
+  const TensorShape& bias_shape = bias->Shape();
   Tensor* Y = p_ctx->Output(0, x_shape);
-  auto Y_data = Y->MutableData<T>();
+  T* Y_data = Y->MutableData<T>();
+
+  const int64_t axis = HandleNegativeAxis(orig_axis, x_shape.NumDimensions());
 
   std::vector<int64_t> mean_inv_std_dev_dim;
   mean_inv_std_dev_dim.reserve(x_shape.NumDimensions());
@@ -107,17 +101,11 @@ Status ComputeImpl(OpKernelContext* p_ctx, int64_t orig_axis, float epsilon, boo
     }
   }
 
-  AllocatorPtr alloc;
-  ORT_RETURN_IF_ERROR(p_ctx->GetTempSpaceAllocator(&alloc));
-
   int output_index = 1;
-
+  Tensor* mean = p_ctx->Output(output_index++, TensorShape(mean_inv_std_dev_dim));
   U* mean_data = nullptr;
-  if (!simplified) {
-    Tensor* mean = p_ctx->Output(output_index++, TensorShape(mean_inv_std_dev_dim));
-    if (mean != nullptr) {
-      mean_data = mean->MutableData<U>();
-    }
+  if (mean != nullptr) {
+    mean_data = mean->MutableData<U>();
   }
 
   U* inv_std_dev_data = nullptr;
@@ -126,8 +114,51 @@ Status ComputeImpl(OpKernelContext* p_ctx, int64_t orig_axis, float epsilon, boo
     inv_std_dev_data = inv_std_dev->MutableData<U>();
   }
 
+  onnxruntime::concurrency::ThreadPool* thread_pool = p_ctx->GetOperatorThreadPool();
+
+  return ComputeWithoutContext<T, U>(X_data, x_shape, scale_data, scale_shape, bias_data, bias_shape,
+                                     Y_data, mean_data, inv_std_dev_data, thread_pool, axis, epsilon, simplified);
+}
+
+Status LayerNormImpl::Compute(OpKernelContext* p_ctx) const {
+  const auto elem_type = p_ctx->Input<Tensor>(0)->GetElementType();
+
+  using SupportedTypeList = boost::mp11::mp_list<float, double, MLFloat16>;
+
+  utils::MLTypeCallDispatcherFromTypeList<SupportedTypeList> t_disp(elem_type);
+  return t_disp.InvokeRet<Status, SrcDispatcher>(this, p_ctx, axis_, epsilon_, simplified_, contrib_op_);
+}
+
+template <typename T, typename U = float>
+Status LayerNormImpl::ComputeWithoutContext(
+    const T* X_data,
+    const TensorShape& x_shape,
+    const T* scale_data,
+    const TensorShape& scale_shape,
+    const T* bias_data,
+    const TensorShape& bias_shape,
+    T* Y_data,
+    U* mean_data,
+    U* inv_std_dev_data,
+    onnxruntime::concurrency::ThreadPool* thread_pool,
+    int64_t axis,
+    float epsilon,
+    bool simplified
+) const {
+  int64_t norm_count = x_shape.SizeToDimension(onnxruntime::narrow<size_t>(axis));
+  int64_t norm_size = x_shape.SizeFromDimension(onnxruntime::narrow<size_t>(axis));
+
+  const auto scale_size = scale_shape.Size();
+  const auto bias_size = (bias_data) ? bias_shape.Size() : 0;
+  if (scale_size != norm_size || (bias_data && bias_size != norm_size)) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                           "Size of X.shape()[axis:] == ", norm_size,
+                           ". Size of scale and bias (if provided) must match this. Got scale size of ",
+                           scale_size, " and bias size of ", bias_size);
+  }
+
   concurrency::ThreadPool::TryBatchParallelFor(
-      p_ctx->GetOperatorThreadPool(), static_cast<int32_t>(norm_count),
+      thread_pool, static_cast<int32_t>(norm_count),
       [&](ptrdiff_t task_idx) {
         const T* p_input = X_data + task_idx * norm_size;
         T* p_output = Y_data + task_idx * norm_size;
@@ -159,7 +190,7 @@ Status ComputeImpl(OpKernelContext* p_ctx, int64_t orig_axis, float epsilon, boo
           DoubleOrFloat scale_value = ConvertMLFloat16ToDoubleOrFloatIfNeeded<T, DoubleOrFloat>(scale_data[h]);
           if (simplified) {
             p_output[h] = ConvertToMLFloat16IfNeeded(input_value / mean_square * scale_value);
-          } else if (nullptr == bias) {
+          } else if (nullptr == bias_data) {
             p_output[h] = ConvertToMLFloat16IfNeeded((input_value - mean) / mean_square * scale_value);
           } else {
             DoubleOrFloat bias_value = ConvertMLFloat16ToDoubleOrFloatIfNeeded<T, DoubleOrFloat>(bias_data[h]);
@@ -181,32 +212,4 @@ Status ComputeImpl(OpKernelContext* p_ctx, int64_t orig_axis, float epsilon, boo
   return Status::OK();
 }
 
-template <typename T>
-struct SrcDispatcher {
-  Status operator()(OpKernelContext* p_ctx, int64_t orig_axis, float epsilon, bool simplified, bool contrib_op) const {
-    // the contrib op kernel was always registered with the same type for all constraints.
-    // our implementation of the onnx op only supports 'float' as the U constraint.
-#if !defined(DISABLE_CONTRIB_OPS)
-    if (contrib_op) {
-      return ComputeImpl<T, T>(p_ctx, orig_axis, epsilon, simplified);
-    } else
-#else
-    ORT_UNUSED_PARAMETER(contrib_op);
-#endif
-    {
-      return ComputeImpl<T, float>(p_ctx, orig_axis, epsilon, simplified);
-    }
-  }
-};
-}  // namespace
-
-Status LayerNormImpl::Compute(OpKernelContext* p_ctx) const {
-  const auto elem_type = p_ctx->Input<Tensor>(0)->GetElementType();
-
-  using SupportedTypeList = boost::mp11::mp_list<float, double, MLFloat16>;
-
-  utils::MLTypeCallDispatcherFromTypeList<SupportedTypeList> t_disp(elem_type);
-  return t_disp.InvokeRet<Status, SrcDispatcher>(p_ctx, axis_, epsilon_, simplified_, contrib_op_);
-}
-
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h
index 393c637dbda18..086adb8dfa94b 100644
--- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h
+++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h
@@ -14,7 +14,46 @@ class LayerNormImpl : public OpKernel {
   LayerNormImpl(const OpKernelInfo& op_kernel_info, bool simplified = false, bool contrib_op = false);
   Status Compute(OpKernelContext* p_op_kernel_context) const override;
 
+  // This method was created so that it can be called directly from `test/onnx/microbenchmark/layer_normalization.cc`.
+  template <typename T, typename U = float>
+  Status ComputeWithoutContext(
+      const T* X_data,
+      const TensorShape& x_shape,
+      const T* scale_data,
+      const TensorShape& scale_shape,
+      const T* bias_data,
+      const TensorShape& bias_shape,
+      T* Y_data,
+      U* mean_data,
+      U* inv_std_dev,
+      onnxruntime::concurrency::ThreadPool* thread_pool,
+      int64_t axis,
+      float epsilon = epsilon_,
+      bool simplified = simplified_
+  ) const;
+
 private:
+  template <typename T, typename U>
+  Status ComputeImpl(OpKernelContext* p_ctx, int64_t orig_axis, float epsilon, bool simplified) const;
+
+  template <typename T>
+  struct SrcDispatcher {
+    Status operator()(const LayerNormImpl* p_instance, OpKernelContext* p_ctx, int64_t orig_axis, float epsilon, bool simplified, bool contrib_op) const {
+      // the contrib op kernel was always registered with the same type for all constraints.
+      // our implementation of the onnx op only supports 'float' as the U constraint.
+  #if !defined(DISABLE_CONTRIB_OPS)
+      if (contrib_op) {
+        return p_instance->ComputeImpl<T, T>(p_ctx, orig_axis, epsilon, simplified);
+      } else
+  #else
+      ORT_UNUSED_PARAMETER(contrib_op);
+  #endif
+      {
+        return p_instance->ComputeImpl<T, float>(p_ctx, orig_axis, epsilon, simplified);
+      }
+    }
+  };
+
   int64_t axis_;
   float epsilon_;
   const bool simplified_;
diff --git a/onnxruntime/test/onnx/microbenchmark/layer_normalization.cc b/onnxruntime/test/onnx/microbenchmark/layer_normalization.cc
new file mode 100644
index 0000000000000..20089262e1d1c
--- /dev/null
+++ b/onnxruntime/test/onnx/microbenchmark/layer_normalization.cc
@@ -0,0 +1,108 @@
+#include "core/platform/threadpool.h"
+#include "core/util/thread_utils.h"
+#include <benchmark/benchmark.h>
+
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
+
+#include "core/framework/allocator.h"
+#include "core/framework/config_options.h"
+#include "core/framework/data_transfer_manager.h"
+#include "core/framework/op_kernel_info.h"
+#include "core/framework/ort_value_name_idx_map.h"
+#include "core/platform/windows/env.h"
+#include "core/providers/cpu/nn/layer_norm_impl.h"
+#include "core/providers/cpu/cpu_provider_factory.h"
+#include "core/providers/cpu/cpu_provider_factory_creator.h"
+#include "core/util/thread_utils.h"
+
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+
+using namespace onnxruntime;
+
+template <typename T, typename U>
+static void BM_LayerNormalization(benchmark::State& state) {
+  bool simplified = false;
+  const float epsilon = 1e-05f;
+  int64_t axis = 1;
+
+  onnxruntime::Node node;
+  // Required by LayerNormImpl constructor
+  node.AddAttribute("axis", axis);
+  node.AddAttribute("epsilon", epsilon);
+
+  KernelDef kernel_def;
+  std::unique_ptr<IExecutionProvider> execution_provider = CPUProviderFactoryCreator::Create(true)->CreateProvider();
+  std::unordered_map<int, OrtValue> constant_initialized_tensors;
+  OrtValueNameIdxMap mlvalue_name_idx_map;
+  DataTransferManager data_transfer_mgr;
+  AllocatorMap allocators;
+  ConfigOptions config_options;
+
+  OpKernelInfo op_kernel_info(node, kernel_def, *execution_provider, constant_initialized_tensors, mlvalue_name_idx_map,
+                              data_transfer_mgr, allocators, config_options);
+
+  LayerNormImpl layer_norm_impl(op_kernel_info);
+
+  std::vector<int64_t> x_dims{2, 2, 2};
+  TensorShape x_shape(x_dims);
+  std::vector<float> x{1, 1, 1, 1, 1, 1, 1, 1};
+
+  std::vector<int64_t> scale_bias_dims{1, 2, 2};
+  TensorShape scale_shape(scale_bias_dims);
+  TensorShape bias_shape(scale_bias_dims);
+  std::vector<float> scale{1, 1, 1, 1};
+  std::vector<float> bias{1, 1, 1, 1};
+
+  T* X_data = static_cast<T*>(malloc(x.size() * sizeof(T)));
+  T* scale_data = static_cast<T*>(malloc(scale.size() * sizeof(T)));
+  T* bias_data = static_cast<T*>(malloc(bias.size() * sizeof(T)));
+  for (size_t i = 0; i < x.size(); i++) {
+    X_data[i] = T(x[i]);
+  }
+  for (size_t i = 0; i < scale.size(); i++) {
+    scale_data[i] = T(scale[i]);
+  }
+  for (size_t i = 0; i < bias.size(); i++) {
+    bias_data[i] = T(bias[i]);
+  }
+
+  T* Y_data = static_cast<T*>(malloc(x.size() * sizeof(T)));
+  U* mean_data = static_cast<U*>(malloc(x.size() * sizeof(U)));
+  U* inv_std_dev_data = static_cast<U*>(malloc(x.size() * sizeof(U)));
+
+  OrtThreadPoolParams tp_params;
+  tp_params.name = ORT_TSTR("intra-op");
+  std::unique_ptr<concurrency::ThreadPool> thread_pool = concurrency::CreateThreadPool(
+      &Env::Default(), tp_params, concurrency::ThreadPoolType::INTRA_OP);
+
+  for (auto _ : state) {
+    auto status = layer_norm_impl.ComputeWithoutContext(X_data, x_shape, scale_data, scale_shape, bias_data, bias_shape,
+                                                        Y_data, mean_data, inv_std_dev_data, thread_pool.get(), axis, epsilon, simplified);
+
+    if (! status.IsOK())
+    {
+      std::cout << "ComputeWithoutContext status not OK: " << status.ErrorMessage() << std::endl;
+      break;
+    }
+  }
+}
+
+
+BENCHMARK(BM_LayerNormalization<float, float>)
+    ->Arg(1)
+    ->Arg(256)
+    ->Arg(1024)
+    ->UseRealTime()
+    ->Unit(benchmark::TimeUnit::kMicrosecond);
+
+BENCHMARK(BM_LayerNormalization<MLFloat16, float>)
+    ->Arg(1)
+    ->Arg(256)
+    ->Arg(1024)
+    ->UseRealTime()
+    ->Unit(benchmark::TimeUnit::kMicrosecond);
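[Annotation: the benchmark above uses Google Benchmark's support for registering individual instantiations of a templated benchmark function; each BENCHMARK(...) line registers one concrete (T, U) pair. A minimal self-contained illustration of the same registration pattern (generic example, not ORT code):

    #include <benchmark/benchmark.h>
    #include <algorithm>
    #include <vector>

    template <typename T>
    static void BM_Copy(benchmark::State& state) {
      std::vector<T> src(1024, T(1)), dst(1024);
      for (auto _ : state) {
        std::copy(src.begin(), src.end(), dst.begin());
        benchmark::DoNotOptimize(dst.data());  // keep the copy from being optimized away
      }
    }
    BENCHMARK(BM_Copy<float>);   // each instantiation is registered separately
    BENCHMARK(BM_Copy<double>);
    BENCHMARK_MAIN();

Once onnxruntime_benchmark is built, these cases can be run in isolation with Google Benchmark's stock filter flag, e.g. `./onnxruntime_benchmark --benchmark_filter=BM_LayerNormalization`; nothing in this patch adds that flag.]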
From 0c89631e7f05a819494372d6ee093786aa381a3a Mon Sep 17 00:00:00 2001
From: Alex Marin
Date: Wed, 25 Sep 2024 14:25:02 -0700
Subject: [PATCH 02/36] fix warnings

---
 onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc | 2 +-
 onnxruntime/core/providers/cpu/nn/layer_norm_impl.h  | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc
index 57eb8c69a3067..885e676998ed7 100644
--- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc
+++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc
@@ -129,7 +129,7 @@ Status LayerNormImpl::Compute(OpKernelContext* p_ctx) const {
   return t_disp.InvokeRet<Status, SrcDispatcher>(this, p_ctx, axis_, epsilon_, simplified_, contrib_op_);
 }
 
-template <typename T, typename U = float>
+template <typename T, typename U>
 Status LayerNormImpl::ComputeWithoutContext(
diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h
index 086adb8dfa94b..9c2ed303eef5e 100644
--- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h
+++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h
@@ -15,7 +15,7 @@ class LayerNormImpl : public OpKernel {
   Status Compute(OpKernelContext* p_op_kernel_context) const override;
 
   // This method was created so that it can be called directly from `test/onnx/microbenchmark/layer_normalization.cc`.
-  template <typename T, typename U = float>
+  template <typename T, typename U>
   Status ComputeWithoutContext(
       const T* X_data,
       const TensorShape& x_shape,
@@ -28,8 +28,8 @@ class LayerNormImpl : public OpKernel {
       U* inv_std_dev,
       onnxruntime::concurrency::ThreadPool* thread_pool,
       int64_t axis,
-      float epsilon = epsilon_,
-      bool simplified = simplified_
+      float epsilon,
+      bool simplified
   ) const;
 
 private:
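[Annotation: for context on why the defaults above had to go: C++ does not allow a non-static data member to be used as a default argument, so `float epsilon = epsilon_` in the class declaration cannot compile, and repeating a default template argument on both the declaration and the out-of-line definition draws a redefinition diagnostic. A minimal sketch of the member-default problem (hypothetical class, not ORT code):

    struct Kernel {
      float epsilon_ = 1e-5f;
      // ill-formed: a non-static data member cannot be a default argument
      // void Run(float epsilon = epsilon_);
      void Run(float /*epsilon*/) {}  // callers pass the member explicitly instead
    };

    void Use(Kernel& k) { k.Run(k.epsilon_); }

This is why the fixed signature takes epsilon and simplified as plain required parameters.]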
From bca13ca03647866f1d22769f080174f703727603 Mon Sep 17 00:00:00 2001
From: Alex Marin
Date: Thu, 26 Sep 2024 08:46:11 -0700
Subject: [PATCH 03/36] initialize test input data at compile time

---
 .../microbenchmark/layer_normalization.cc     | 64 ++++++++++---------
 1 file changed, 34 insertions(+), 30 deletions(-)

diff --git a/onnxruntime/test/onnx/microbenchmark/layer_normalization.cc b/onnxruntime/test/onnx/microbenchmark/layer_normalization.cc
index 20089262e1d1c..a2987b4d7c25a 100644
--- a/onnxruntime/test/onnx/microbenchmark/layer_normalization.cc
+++ b/onnxruntime/test/onnx/microbenchmark/layer_normalization.cc
@@ -24,6 +24,29 @@
 
 using namespace onnxruntime;
 
+namespace {
+
+static const std::vector<int64_t> dims{1, 256, 1024};
+static const size_t num_elems = dims[0] * dims[1] * dims[2];
+static const std::vector<float> float_vals(num_elems, 1.0f);
+static const std::vector<MLFloat16> MLFloat16_vals(num_elems, MLFloat16(1.0f));
+
+}  // namespace
+
+template <typename T>
+const T* getVector();
+
+template <>
+const float* getVector<float>() {
+  return float_vals.data();
+}
+
+template <>
+const MLFloat16* getVector<MLFloat16>() {
+  return MLFloat16_vals.data();
+}
+
+
 template <typename T, typename U>
 static void BM_LayerNormalization(benchmark::State& state) {
   bool simplified = false;
@@ -48,32 +71,17 @@ static void BM_LayerNormalization(benchmark::State& state) {
 
   LayerNormImpl layer_norm_impl(op_kernel_info);
 
-  std::vector<int64_t> x_dims{2, 2, 2};
-  TensorShape x_shape(x_dims);
-  std::vector<float> x{1, 1, 1, 1, 1, 1, 1, 1};
-
-  std::vector<int64_t> scale_bias_dims{1, 2, 2};
-  TensorShape scale_shape(scale_bias_dims);
-  TensorShape bias_shape(scale_bias_dims);
-  std::vector<float> scale{1, 1, 1, 1};
-  std::vector<float> bias{1, 1, 1, 1};
-
-  T* X_data = static_cast<T*>(malloc(x.size() * sizeof(T)));
-  T* scale_data = static_cast<T*>(malloc(scale.size() * sizeof(T)));
-  T* bias_data = static_cast<T*>(malloc(bias.size() * sizeof(T)));
-  for (size_t i = 0; i < x.size(); i++) {
-    X_data[i] = T(x[i]);
-  }
-  for (size_t i = 0; i < scale.size(); i++) {
-    scale_data[i] = T(scale[i]);
-  }
-  for (size_t i = 0; i < bias.size(); i++) {
-    bias_data[i] = T(bias[i]);
-  }
+  TensorShape x_shape(dims);
+  TensorShape scale_shape(dims);
+  TensorShape bias_shape(dims);
+
+  const T* x_data = getVector<T>();
+  const T* scale_data = getVector<T>();
+  const T* bias_data = getVector<T>();
 
-  T* Y_data = static_cast<T*>(malloc(x.size() * sizeof(T)));
-  U* mean_data = static_cast<U*>(malloc(x.size() * sizeof(U)));
-  U* inv_std_dev_data = static_cast<U*>(malloc(x.size() * sizeof(U)));
+  T* Y_data = static_cast<T*>(malloc(num_elems * sizeof(T)));
+  U* mean_data = static_cast<U*>(malloc(num_elems * sizeof(U)));
+  U* inv_std_dev_data = static_cast<U*>(malloc(num_elems * sizeof(U)));
 
   OrtThreadPoolParams tp_params;
   tp_params.name = ORT_TSTR("intra-op");
@@ -81,7 +89,7 @@ static void BM_LayerNormalization(benchmark::State& state) {
       &Env::Default(), tp_params, concurrency::ThreadPoolType::INTRA_OP);
 
   for (auto _ : state) {
-    auto status = layer_norm_impl.ComputeWithoutContext(X_data, x_shape, scale_data, scale_shape, bias_data, bias_shape,
+    auto status = layer_norm_impl.ComputeWithoutContext(x_data, x_shape, scale_data, scale_shape, bias_data, bias_shape,
                                                         Y_data, mean_data, inv_std_dev_data, thread_pool.get(), axis, epsilon, simplified);
 
     if (! status.IsOK())
    {
       std::cout << "ComputeWithoutContext status not OK: " << status.ErrorMessage() << std::endl;
       break;
     }
   }
 }
 
 
 BENCHMARK(BM_LayerNormalization<float, float>)
     ->Arg(1)
-    ->Arg(256)
-    ->Arg(1024)
     ->UseRealTime()
     ->Unit(benchmark::TimeUnit::kMicrosecond);
 
 BENCHMARK(BM_LayerNormalization<MLFloat16, float>)
     ->Arg(1)
-    ->Arg(256)
-    ->Arg(1024)
     ->UseRealTime()
     ->Unit(benchmark::TimeUnit::kMicrosecond);
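[Annotation: the pattern in this commit, building the input buffers once at namespace scope so that allocation and float-to-half conversion happen before the timed region rather than per iteration, is standard Google Benchmark hygiene (strictly the data is built at static-initialization time, before main()). A self-contained sketch of the same idea, illustrative only:

    #include <benchmark/benchmark.h>
    #include <numeric>
    #include <vector>

    // Constructed once at program start, not inside the measured loop.
    static const std::vector<float> kData(1 * 256 * 1024, 1.0f);

    static void BM_Accumulate(benchmark::State& state) {
      for (auto _ : state) {  // only this body is measured
        float sum = std::accumulate(kData.begin(), kData.end(), 0.0f);
        benchmark::DoNotOptimize(sum);
      }
    }
    BENCHMARK(BM_Accumulate);
]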
From 680cf4fcf2e88af9415e69786a408990ca063ffa Mon Sep 17 00:00:00 2001
From: Alex Marin
Date: Thu, 26 Sep 2024 09:12:38 -0700
Subject: [PATCH 04/36] remove unused specialization that fails on pipeline

---
 onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc | 5 -----
 onnxruntime/core/providers/cpu/nn/layer_norm_impl.h  | 6 +++---
 2 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc
index 885e676998ed7..546557f6f9015 100644
--- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc
+++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc
@@ -23,11 +23,6 @@ ORT_FORCEINLINE float ConvertMLFloat16ToDoubleOrFloatIfNeeded<MLFloat16, float>(MLFloat16 val) {
   return val.ToFloat();
 }
 
-template <>
-ORT_FORCEINLINE double ConvertMLFloat16ToDoubleOrFloatIfNeeded<MLFloat16, double>(MLFloat16 val) {
-  return double(ConvertMLFloat16ToDoubleOrFloatIfNeeded<MLFloat16, float>(val));
-}
-
 template <>
 ORT_FORCEINLINE constexpr float ConvertMLFloat16ToDoubleOrFloatIfNeeded<float, float>(float val) {
   return val;
diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h
index 9c2ed303eef5e..aa876357ed3c8 100644
--- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h
+++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h
@@ -29,8 +29,7 @@ class LayerNormImpl : public OpKernel {
       onnxruntime::concurrency::ThreadPool* thread_pool,
       int64_t axis,
       float epsilon,
-      bool simplified
-  ) const;
+      bool simplified) const;
 
 private:
   template <typename T, typename U>
@@ -38,7 +37,8 @@ class LayerNormImpl : public OpKernel {
 
   template <typename T>
   struct SrcDispatcher {
-    Status operator()(const LayerNormImpl* p_instance, OpKernelContext* p_ctx, int64_t orig_axis, float epsilon, bool simplified, bool contrib_op) const {
+    Status operator()(const LayerNormImpl* p_instance, OpKernelContext* p_ctx, int64_t orig_axis,
+                      float epsilon, bool simplified, bool contrib_op) const {
       // the contrib op kernel was always registered with the same type for all constraints.
       // our implementation of the onnx op only supports 'float' as the U constraint.
   #if !defined(DISABLE_CONTRIB_OPS)
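[Annotation: a plausible reading of "fails on pipeline": the deleted specialization sits in an unnamed namespace and, since U is always float when T is MLFloat16, it is never instantiated, so CI configurations that compile with warnings-as-errors can reject the translation unit. A reduced sketch of that failure mode, hypothetical and assuming -Wunused-function -Werror on GCC/Clang:

    namespace {
    // warning: function defined but not used [-Wunused-function]
    int Unused(int x) { return x + 1; }
    }  // namespace

    int main() { return 0; }

Deleting the dead function, as this commit does, is the usual remedy.]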
From f0df5263f9b2d9ebcf7cef8aa34c7118bd3746b3 Mon Sep 17 00:00:00 2001
From: Alex Marin
Date: Mon, 30 Sep 2024 01:15:13 -0700
Subject: [PATCH 05/36] fix build on linux

---
 onnxruntime/test/onnx/microbenchmark/layer_normalization.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/onnxruntime/test/onnx/microbenchmark/layer_normalization.cc b/onnxruntime/test/onnx/microbenchmark/layer_normalization.cc
index a2987b4d7c25a..5c7bd5716832a 100644
--- a/onnxruntime/test/onnx/microbenchmark/layer_normalization.cc
+++ b/onnxruntime/test/onnx/microbenchmark/layer_normalization.cc
@@ -1,3 +1,5 @@
+#ifdef _WIN32
+
 #include "core/platform/threadpool.h"
 #include "core/util/thread_utils.h"
 #include <benchmark/benchmark.h>
@@ -110,3 +112,5 @@ BENCHMARK(BM_LayerNormalization<MLFloat16, float>)
     ->Arg(1)
     ->UseRealTime()
     ->Unit(benchmark::TimeUnit::kMicrosecond);
+
+#endif
From 87725c37e4f65f4372ffcf05228b6e15ff081077 Mon Sep 17 00:00:00 2001
From: Alex Marin
Date: Mon, 30 Sep 2024 08:20:28 -0700
Subject: [PATCH 06/36] convert all inputs to float efficiently if needed

---
 .../contrib_ops/cpu/skip_layer_norm.cc        | 81 ++++++++++++------
 .../core/providers/cpu/nn/layer_norm_impl.cc  | 69 ++++++++++------
 2 files changed, 99 insertions(+), 51 deletions(-)

diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc
index faf78cae80ee1..50ce160f38153 100644
--- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc
+++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc
@@ -2,6 +2,7 @@
 // Licensed under the MIT License.
 
 #include "core/framework/tensor.h"
+#include "core/mlas/inc/mlas.h"
 #include "core/util/math_cpuonly.h"
 #include "core/providers/common.h"
 #include "core/platform/threadpool.h"
@@ -36,30 +37,32 @@ REGISTER_KERNEL_TYPED(float)
 REGISTER_KERNEL_TYPED(double)
 REGISTER_KERNEL_TYPED(MLFloat16)
 
-// Utility to convert from MLFloat16 to float only when the input type is MLFloat16.
-template <typename T, typename Ret>
-ORT_FORCEINLINE Ret ConvertMLFloat16ToDoubleOrFloatIfNeeded(T val);
 
-template <>
-ORT_FORCEINLINE float ConvertMLFloat16ToDoubleOrFloatIfNeeded<MLFloat16, float>(MLFloat16 val) {
-  return val.ToFloat();
-}
+template <typename T>
+std::shared_ptr<std::vector<float>> ConvertHalfToFloatIfNeeded(const T* p_input, int num_elems);
 
-template <>
-ORT_FORCEINLINE double ConvertMLFloat16ToDoubleOrFloatIfNeeded<MLFloat16, double>(MLFloat16 val) {
-  return static_cast<double>(ConvertMLFloat16ToDoubleOrFloatIfNeeded<MLFloat16, float>(val));
+template <typename T>
+std::shared_ptr<std::vector<float>> ConvertHalfToFloatIfNeeded(
+    const std::enable_if_t<std::is_same_v<T, float> || std::is_same_v<T, double>, T>* p_input, int num_elems)
+{
+  return nullptr;
 }
 
-template <>
-ORT_FORCEINLINE constexpr float ConvertMLFloat16ToDoubleOrFloatIfNeeded<float, float>(float val) {
-  return val;
-}
+template<>
+std::shared_ptr<std::vector<float>> ConvertHalfToFloatIfNeeded<MLFloat16>(const MLFloat16* p_input, int num_elems)
+{
+  if (!p_input) {
+    return nullptr;
+  }
 
-template <>
-ORT_FORCEINLINE constexpr double ConvertMLFloat16ToDoubleOrFloatIfNeeded<double, double>(double val) {
-  return val;
+  // Efficiently convert all the MLFloat16 values to floats.
+  std::shared_ptr<std::vector<float>> vec = std::make_shared<std::vector<float>>(num_elems);
+  MlasConvertHalfToFloatBuffer(p_input, &(*vec)[0], num_elems);
+
+  return vec;
 }
 
+
 // Function template that only converts the input value to MLFloat16 if T is MLFloat16.
 template <typename T>
 ORT_FORCEINLINE constexpr typename std::enable_if_t<std::is_same_v<T, float> || std::is_same_v<T, double>, T>
ConvertDoubleOrFloatToMLFloat16IfNeeded(T val) {
  return val;
}

template <typename T>
ORT_FORCEINLINE constexpr typename std::enable_if_t<std::is_same_v<T, MLFloat16>, T>
ConvertDoubleOrFloatToMLFloat16IfNeeded(float val) {
  return MLFloat16(val);
}

template <typename T>
ORT_FORCEINLINE constexpr typename std::enable_if_t<std::is_same_v<T, MLFloat16>, T>
ConvertDoubleOrFloatToMLFloat16IfNeeded(double val) {
  return MLFloat16(static_cast<float>(val));
}

template <typename T, bool simplified>
SkipLayerNorm<T, simplified>::SkipLayerNorm(const OpKernelInfo& op_kernel_info)
@@ -145,15 +148,28 @@ Status SkipLayerNorm<T, simplified>::Compute(OpKernelContext* p_ctx) const {
           DoubleOrFloat mean(0.0f);
           DoubleOrFloat mean_square(0.0f);
 
+          std::shared_ptr<std::vector<float>> float_input = ConvertHalfToFloatIfNeeded<T>(p_input, hidden_size);
+          const DoubleOrFloat* converted_input =
+              float_input == nullptr
+                  ? reinterpret_cast<const DoubleOrFloat*>(p_input)
+                  : reinterpret_cast<const DoubleOrFloat*>(&(*float_input)[0]);
+          std::shared_ptr<std::vector<float>> float_skip = ConvertHalfToFloatIfNeeded<T>(p_skip, hidden_size);
+          const DoubleOrFloat* converted_skip =
+              float_skip == nullptr
+                  ? reinterpret_cast<const DoubleOrFloat*>(p_skip)
+                  : reinterpret_cast<const DoubleOrFloat*>(&(*float_skip)[0]);
+          std::shared_ptr<std::vector<float>> float_bias = ConvertHalfToFloatIfNeeded<T>(bias_data, hidden_size);
+          const DoubleOrFloat* converted_bias =
+              float_bias == nullptr
+                  ? reinterpret_cast<const DoubleOrFloat*>(bias_data)
+                  : reinterpret_cast<const DoubleOrFloat*>(&(*float_bias)[0]);
+
           std::unique_ptr<DoubleOrFloat[]> output_buffer = std::make_unique<DoubleOrFloat[]>(hidden_size);
           for (size_t h = 0; h < static_cast<size_t>(hidden_size); h++) {
-            DoubleOrFloat input_value = ConvertMLFloat16ToDoubleOrFloatIfNeeded<T, DoubleOrFloat>(p_input[h]);
-            DoubleOrFloat skip_value = ConvertMLFloat16ToDoubleOrFloatIfNeeded<T, DoubleOrFloat>(p_skip[h]);
-
-            DoubleOrFloat value = input_value + skip_value;
+            DoubleOrFloat value = converted_input[h] + converted_skip[h];
 
             if (nullptr != bias_data) {
-              value += ConvertMLFloat16ToDoubleOrFloatIfNeeded<T, DoubleOrFloat>(bias_data[h]);
+              value += converted_bias[h];
             }
 
             output_buffer[h] = value;
@@ -173,15 +189,26 @@ Status SkipLayerNorm<T, simplified>::Compute(OpKernelContext* p_ctx) const {
             mean_square = sqrt(mean_square / hidden_size - mean * mean + epsilon_);
           }
 
+          std::shared_ptr<std::vector<float>> float_gamma = ConvertHalfToFloatIfNeeded<T>(gamma_data, hidden_size);
+          const DoubleOrFloat* converted_gamma =
+              float_gamma == nullptr
+                  ? reinterpret_cast<const DoubleOrFloat*>(gamma_data)
+                  : reinterpret_cast<const DoubleOrFloat*>(&(*float_gamma)[0]);
+          std::shared_ptr<std::vector<float>> float_beta = ConvertHalfToFloatIfNeeded<T>(beta_data, hidden_size);
+          const DoubleOrFloat* converted_beta =
+              float_beta == nullptr
+                  ? reinterpret_cast<const DoubleOrFloat*>(beta_data)
+                  : reinterpret_cast<const DoubleOrFloat*>(&(*float_beta)[0]);
           for (size_t h = 0; h < static_cast<size_t>(hidden_size); h++) {
-            DoubleOrFloat gamma_value = ConvertMLFloat16ToDoubleOrFloatIfNeeded<T, DoubleOrFloat>(gamma_data[h]);
             if (simplified) {
-              p_output[h] = ConvertDoubleOrFloatToMLFloat16IfNeeded<T>(output_buffer[h] / mean_square * gamma_value);
+              p_output[h] = ConvertDoubleOrFloatToMLFloat16IfNeeded<T>(
+                  output_buffer[h] / mean_square * converted_gamma[h]);
             } else if (nullptr == beta_data) {
-              p_output[h] = ConvertDoubleOrFloatToMLFloat16IfNeeded<T>((output_buffer[h] - mean) / mean_square * gamma_value);
+              p_output[h] = ConvertDoubleOrFloatToMLFloat16IfNeeded<T>(
+                  (output_buffer[h] - mean) / mean_square * converted_gamma[h]);
             } else {
-              DoubleOrFloat beta_value = ConvertMLFloat16ToDoubleOrFloatIfNeeded<T, DoubleOrFloat>(beta_data[h]);
-              p_output[h] = ConvertDoubleOrFloatToMLFloat16IfNeeded<T>((output_buffer[h] - mean) / mean_square * gamma_value + beta_value);
+              p_output[h] = ConvertDoubleOrFloatToMLFloat16IfNeeded<T>(
+                  (output_buffer[h] - mean) / mean_square * converted_gamma[h] + converted_beta[h]);
             }
           }
         },
diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc
index 546557f6f9015..9010dd7d6f1b4 100644
--- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc
+++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc
@@ -5,6 +5,7 @@
 
 #include "core/common/safeint.h"
 #include "core/framework/tensor.h"
+#include "core/mlas/inc/mlas.h"
 #include "core/platform/threadpool.h"
 #include "core/providers/common.h"
 #include "core/util/force_inline.h"
@@ -14,23 +15,28 @@ namespace onnxruntime {
 
 namespace {
 
-// Utility to convert from MLFloat16 to float only when the input type is MLFloat16.
-template <typename T, typename Ret>
-ORT_FORCEINLINE Ret ConvertMLFloat16ToDoubleOrFloatIfNeeded(T val);
+template <typename T>
+std::shared_ptr<std::vector<float>> ConvertMLFloat16ToFloatIfNeeded(const T* p_input, int num_elems);
 
-template <>
-ORT_FORCEINLINE float ConvertMLFloat16ToDoubleOrFloatIfNeeded<MLFloat16, float>(MLFloat16 val) {
-  return val.ToFloat();
+template <typename T>
+std::shared_ptr<std::vector<float>> ConvertMLFloat16ToFloatIfNeeded(
+    const std::enable_if_t<std::is_same_v<T, float> || std::is_same_v<T, double>, T>* p_input, int num_elems)
+{
+  return nullptr;
 }
 
-template <>
-ORT_FORCEINLINE constexpr float ConvertMLFloat16ToDoubleOrFloatIfNeeded<float, float>(float val) {
-  return val;
-}
+template<>
+std::shared_ptr<std::vector<float>> ConvertMLFloat16ToFloatIfNeeded<MLFloat16>(const MLFloat16* p_input, int num_elems)
+{
+  if (!p_input) {
+    return nullptr;
+  }
 
-template <>
-ORT_FORCEINLINE constexpr double ConvertMLFloat16ToDoubleOrFloatIfNeeded<double, double>(double val) {
-  return val;
+  // Efficiently convert all the MLFloat16 values to floats.
+  std::shared_ptr<std::vector<float>> vec = std::make_shared<std::vector<float>>(num_elems);
+  MlasConvertHalfToFloatBuffer(p_input, &(*vec)[0], num_elems);
+
+  return vec;
 }
 
 ORT_FORCEINLINE constexpr float ConvertToFloatIfNeeded(float val) {
@@ -138,8 +144,7 @@ Status LayerNormImpl::ComputeWithoutContext(
     onnxruntime::concurrency::ThreadPool* thread_pool,
     int64_t axis,
     float epsilon,
-    bool simplified
-) const {
+    bool simplified) const {
   int64_t norm_count = x_shape.SizeToDimension(onnxruntime::narrow<size_t>(axis));
   int64_t norm_size = x_shape.SizeFromDimension(onnxruntime::narrow<size_t>(axis));
 
@@ -167,10 +172,17 @@ Status LayerNormImpl::ComputeWithoutContext(
         DoubleOrFloat mean(0.0f);
         DoubleOrFloat mean_square(0.0f);
 
+        std::shared_ptr<std::vector<float>> float_input = ConvertMLFloat16ToFloatIfNeeded<T>(p_input, norm_size);
+        const DoubleOrFloat* converted_input =
+            float_input == nullptr
+                ? reinterpret_cast<const DoubleOrFloat*>(p_input)
+                : reinterpret_cast<const DoubleOrFloat*>(&(*float_input)[0]);
+
+        std::unique_ptr<DoubleOrFloat[]> output_buffer = std::make_unique<DoubleOrFloat[]>(norm_size);
         for (int64_t h = 0; h < norm_size; h++) {
-          DoubleOrFloat input_value = ConvertMLFloat16ToDoubleOrFloatIfNeeded<T, DoubleOrFloat>(p_input[h]);
-          mean += input_value;
-          mean_square += input_value * input_value;
+          output_buffer[h] = converted_input[h];
+          mean += converted_input[h];
+          mean_square += converted_input[h] * converted_input[h];
         }
 
         mean = mean / norm_size;
@@ -180,16 +192,25 @@ Status LayerNormImpl::ComputeWithoutContext(
           mean_square = sqrt(mean_square / norm_size - mean * mean + epsilon);
         }
 
+        std::shared_ptr<std::vector<float>> float_scale = ConvertMLFloat16ToFloatIfNeeded<T>(scale_data, norm_size);
+        const DoubleOrFloat* converted_scale =
+            float_scale == nullptr
+                ? reinterpret_cast<const DoubleOrFloat*>(scale_data)
+                : reinterpret_cast<const DoubleOrFloat*>(&(*float_scale)[0]);
+        std::shared_ptr<std::vector<float>> float_bias = ConvertMLFloat16ToFloatIfNeeded<T>(bias_data, norm_size);
+        const DoubleOrFloat* converted_bias =
+            float_bias == nullptr
+                ? reinterpret_cast<const DoubleOrFloat*>(bias_data)
+                : reinterpret_cast<const DoubleOrFloat*>(&(*float_bias)[0]);
+
         for (int64_t h = 0; h < norm_size; h++) {
-          DoubleOrFloat input_value = ConvertMLFloat16ToDoubleOrFloatIfNeeded<T, DoubleOrFloat>(p_input[h]);
-          DoubleOrFloat scale_value = ConvertMLFloat16ToDoubleOrFloatIfNeeded<T, DoubleOrFloat>(scale_data[h]);
           if (simplified) {
-            p_output[h] = ConvertToMLFloat16IfNeeded(input_value / mean_square * scale_value);
+            p_output[h] = ConvertToMLFloat16IfNeeded(output_buffer[h] / mean_square * converted_scale[h]);
           } else if (nullptr == bias_data) {
-            p_output[h] = ConvertToMLFloat16IfNeeded((input_value - mean) / mean_square * scale_value);
+            p_output[h] = ConvertToMLFloat16IfNeeded((output_buffer[h] - mean) / mean_square * converted_scale[h]);
           } else {
-            DoubleOrFloat bias_value = ConvertMLFloat16ToDoubleOrFloatIfNeeded<T, DoubleOrFloat>(bias_data[h]);
-            p_output[h] = ConvertToMLFloat16IfNeeded((input_value - mean) / mean_square * scale_value + bias_value);
+            p_output[h] = ConvertToMLFloat16IfNeeded(
+                (output_buffer[h] - mean) / mean_square * converted_scale[h] + converted_bias[h]);
           }
         }
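[Annotation: the heart of this commit is that instead of converting half-precision values element by element inside the math loops, each MLFloat16 buffer is converted to float once, up front, with MlasConvertHalfToFloatBuffer, which MLAS can vectorize; the arithmetic then runs over plain float arrays. A minimal sketch of the before/after shape, where ToFloat and BulkConvert are stand-ins rather than the real MLAS signatures:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    float ToFloat(uint16_t h);                                   // stand-in scalar half->float converter
    void BulkConvert(const uint16_t* in, float* out, size_t n);  // stand-in for MlasConvertHalfToFloatBuffer

    float MeanBefore(const uint16_t* half_input, size_t n) {
      float mean = 0.0f;
      for (size_t h = 0; h < n; ++h) mean += ToFloat(half_input[h]);  // conversion inside the hot loop
      return mean / n;
    }

    float MeanAfter(const uint16_t* half_input, size_t n) {
      std::vector<float> tmp(n);
      BulkConvert(half_input, tmp.data(), n);  // one SIMD-friendly conversion pass
      float mean = 0.0f;
      for (size_t h = 0; h < n; ++h) mean += tmp[h];  // plain float loop
      return mean / n;
    }

For float and double inputs the conversion helpers return nullptr and the code reads the original buffers through a reinterpret_cast, so those paths pay no copy.]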
From 245f298eba41e5d69f8e977865d5b96de16986e8 Mon Sep 17 00:00:00 2001
From: Alex Marin
Date: Mon, 30 Sep 2024 10:13:17 -0700
Subject: [PATCH 07/36] convert output buffer efficiently in layer_norm_impl

---
 .../contrib_ops/cpu/skip_layer_norm.cc        | 16 ++---
 .../core/providers/cpu/nn/layer_norm_impl.cc  | 63 +++++++++++++++----
 2 files changed, 59 insertions(+), 20 deletions(-)

diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc
index 50ce160f38153..9178a2f17015b 100644
--- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc
+++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc
@@ -39,17 +39,17 @@ REGISTER_KERNEL_TYPED(MLFloat16)
 
 template <typename T>
-std::shared_ptr<std::vector<float>> ConvertHalfToFloatIfNeeded(const T* p_input, int num_elems);
+std::shared_ptr<std::vector<float>> ConvertHalfToFloatBufferIfNeeded(const T* p_input, int num_elems);
 
 template <typename T>
-std::shared_ptr<std::vector<float>> ConvertHalfToFloatIfNeeded(
+std::shared_ptr<std::vector<float>> ConvertHalfToFloatBufferIfNeeded(
     const std::enable_if_t<std::is_same_v<T, float> || std::is_same_v<T, double>, T>* p_input, int num_elems)
 {
   return nullptr;
 }
 
 template<>
-std::shared_ptr<std::vector<float>> ConvertHalfToFloatIfNeeded<MLFloat16>(const MLFloat16* p_input, int num_elems)
+std::shared_ptr<std::vector<float>> ConvertHalfToFloatBufferIfNeeded<MLFloat16>(const MLFloat16* p_input, int num_elems)
 {
   if (!p_input) {
     return nullptr;
   }
@@ -148,17 +148,17 @@ Status SkipLayerNorm<T, simplified>::Compute(OpKernelContext* p_ctx) const {
           DoubleOrFloat mean(0.0f);
           DoubleOrFloat mean_square(0.0f);
 
-          std::shared_ptr<std::vector<float>> float_input = ConvertHalfToFloatIfNeeded<T>(p_input, hidden_size);
+          std::shared_ptr<std::vector<float>> float_input = ConvertHalfToFloatBufferIfNeeded<T>(p_input, hidden_size);
           const DoubleOrFloat* converted_input =
               float_input == nullptr
                   ? reinterpret_cast<const DoubleOrFloat*>(p_input)
                   : reinterpret_cast<const DoubleOrFloat*>(&(*float_input)[0]);
-          std::shared_ptr<std::vector<float>> float_skip = ConvertHalfToFloatIfNeeded<T>(p_skip, hidden_size);
+          std::shared_ptr<std::vector<float>> float_skip = ConvertHalfToFloatBufferIfNeeded<T>(p_skip, hidden_size);
           const DoubleOrFloat* converted_skip =
               float_skip == nullptr
                   ? reinterpret_cast<const DoubleOrFloat*>(p_skip)
                   : reinterpret_cast<const DoubleOrFloat*>(&(*float_skip)[0]);
-          std::shared_ptr<std::vector<float>> float_bias = ConvertHalfToFloatIfNeeded<T>(bias_data, hidden_size);
+          std::shared_ptr<std::vector<float>> float_bias = ConvertHalfToFloatBufferIfNeeded<T>(bias_data, hidden_size);
           const DoubleOrFloat* converted_bias =
               float_bias == nullptr
                   ? reinterpret_cast<const DoubleOrFloat*>(bias_data)
@@ -189,12 +189,12 @@ Status SkipLayerNorm<T, simplified>::Compute(OpKernelContext* p_ctx) const {
             mean_square = sqrt(mean_square / hidden_size - mean * mean + epsilon_);
           }
 
-          std::shared_ptr<std::vector<float>> float_gamma = ConvertHalfToFloatIfNeeded<T>(gamma_data, hidden_size);
+          std::shared_ptr<std::vector<float>> float_gamma = ConvertHalfToFloatBufferIfNeeded<T>(gamma_data, hidden_size);
           const DoubleOrFloat* converted_gamma =
               float_gamma == nullptr
                   ? reinterpret_cast<const DoubleOrFloat*>(gamma_data)
                   : reinterpret_cast<const DoubleOrFloat*>(&(*float_gamma)[0]);
-          std::shared_ptr<std::vector<float>> float_beta = ConvertHalfToFloatIfNeeded<T>(beta_data, hidden_size);
+          std::shared_ptr<std::vector<float>> float_beta = ConvertHalfToFloatBufferIfNeeded<T>(beta_data, hidden_size);
           const DoubleOrFloat* converted_beta =
               float_beta == nullptr
                   ? reinterpret_cast<const DoubleOrFloat*>(beta_data)
                   : reinterpret_cast<const DoubleOrFloat*>(&(*float_beta)[0]);
diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc
index 9010dd7d6f1b4..a7ab7c6b526d6 100644
--- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc
+++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc
@@ -15,18 +15,38 @@ namespace onnxruntime {
 
 namespace {
 
+double* OnlyCreateBufferIfMLFloat16(double* p_output, int num_elems)
+{
+  return p_output;
+}
+
+float* OnlyCreateBufferIfMLFloat16(float* p_output, int num_elems)
+{
+  return p_output;
+}
+
+float* OnlyCreateBufferIfMLFloat16(MLFloat16* p_output, int num_elems)
+{
+  if (!p_output) {
+    return nullptr;
+  }
+
+  return new float[num_elems];
+}
+
+
 template <typename T>
-std::shared_ptr<std::vector<float>> ConvertMLFloat16ToFloatIfNeeded(const T* p_input, int num_elems);
+std::shared_ptr<std::vector<float>> ConvertMLFloat16ToFloatBufferIfNeeded(const T* p_input, int num_elems);
 
 template <typename T>
-std::shared_ptr<std::vector<float>> ConvertMLFloat16ToFloatIfNeeded(
+std::shared_ptr<std::vector<float>> ConvertMLFloat16ToFloatBufferIfNeeded(
     const std::enable_if_t<std::is_same_v<T, float> || std::is_same_v<T, double>, T>* p_input, int num_elems)
 {
   return nullptr;
 }
 
 template<>
-std::shared_ptr<std::vector<float>> ConvertMLFloat16ToFloatIfNeeded<MLFloat16>(const MLFloat16* p_input, int num_elems)
+std::shared_ptr<std::vector<float>> ConvertMLFloat16ToFloatBufferIfNeeded<MLFloat16>(const MLFloat16* p_input, int num_elems)
 {
   if (!p_input) {
     return nullptr;
   }
@@ -39,6 +59,17 @@ std::shared_ptr<std::vector<float>> ConvertMLFloat16ToFloatBufferIfNeeded<MLFloat16>(c
   return vec;
 }
 
+
+void ConvertFloatBufferToMLFloat16(const float* output_buffer, MLFloat16* p_output, int num_elems)
+{
+  if (!output_buffer || !p_output) {
+    return;
+  }
+
+  MlasConvertFloatToHalfBuffer(output_buffer, p_output, num_elems);
+}
+
+
 ORT_FORCEINLINE constexpr float ConvertToFloatIfNeeded(float val) {
   return val;
 }
@@ -172,13 +203,16 @@ Status LayerNormImpl::ComputeWithoutContext(
         DoubleOrFloat mean(0.0f);
         DoubleOrFloat mean_square(0.0f);
 
-        std::shared_ptr<std::vector<float>> float_input = ConvertMLFloat16ToFloatIfNeeded<T>(p_input, norm_size);
+        std::shared_ptr<std::vector<float>> float_input = ConvertMLFloat16ToFloatBufferIfNeeded<T>(p_input, norm_size);
         const DoubleOrFloat* converted_input =
             float_input == nullptr
                 ? reinterpret_cast<const DoubleOrFloat*>(p_input)
                 : reinterpret_cast<const DoubleOrFloat*>(&(*float_input)[0]);
 
-        std::unique_ptr<DoubleOrFloat[]> output_buffer = std::make_unique<DoubleOrFloat[]>(norm_size);
+        // If T is float or double, then output_buffer will be the same as p_output, so we don't allocate new memory.
+        // If T is MLFloat16, then we allocate norm_size floats in output_buffer.
+        DoubleOrFloat* output_buffer = static_cast<DoubleOrFloat*>(OnlyCreateBufferIfMLFloat16(p_output, norm_size));
+
         for (int64_t h = 0; h < norm_size; h++) {
           output_buffer[h] = converted_input[h];
           mean += converted_input[h];
@@ -226,12 +226,12 @@ Status LayerNormImpl::ComputeWithoutContext(
-        std::shared_ptr<std::vector<float>> float_scale = ConvertMLFloat16ToFloatIfNeeded<T>(scale_data, norm_size);
+        std::shared_ptr<std::vector<float>> float_scale = ConvertMLFloat16ToFloatBufferIfNeeded<T>(scale_data, norm_size);
         const DoubleOrFloat* converted_scale =
             float_scale == nullptr
                 ? reinterpret_cast<const DoubleOrFloat*>(scale_data)
                 : reinterpret_cast<const DoubleOrFloat*>(&(*float_scale)[0]);
-        std::shared_ptr<std::vector<float>> float_bias = ConvertMLFloat16ToFloatIfNeeded<T>(bias_data, norm_size);
+        std::shared_ptr<std::vector<float>> float_bias = ConvertMLFloat16ToFloatBufferIfNeeded<T>(bias_data, norm_size);
         const DoubleOrFloat* converted_bias =
             float_bias == nullptr
                 ? reinterpret_cast<const DoubleOrFloat*>(bias_data)
                 : reinterpret_cast<const DoubleOrFloat*>(&(*float_bias)[0]);
 
         for (int64_t h = 0; h < norm_size; h++) {
           if (simplified) {
-            p_output[h] = ConvertToMLFloat16IfNeeded(output_buffer[h] / mean_square * converted_scale[h]);
+            output_buffer[h] = output_buffer[h] / mean_square * converted_scale[h];
           } else if (nullptr == bias_data) {
-            p_output[h] = ConvertToMLFloat16IfNeeded((output_buffer[h] - mean) / mean_square * converted_scale[h]);
+            output_buffer[h] = (output_buffer[h] - mean) / mean_square * converted_scale[h];
           } else {
-            p_output[h] = ConvertToMLFloat16IfNeeded(
-                (output_buffer[h] - mean) / mean_square * converted_scale[h] + converted_bias[h]);
+            output_buffer[h] = (output_buffer[h] - mean) / mean_square * converted_scale[h] + converted_bias[h];
           }
         }
 
+        if (std::is_same_v<DoubleOrFloat, MLFloat16>) {
+          ConvertFloatBufferToMLFloat16(
+              reinterpret_cast<float*>(output_buffer), reinterpret_cast<MLFloat16*>(p_output), norm_size);
+          delete[] output_buffer;
+        }
+
         if (mean_data != nullptr) {
           // ONNX spec doesn't support 'double' for 'U' so when 'T' == double, 'U' == float and we need to narrow
           mean_data[task_idx] = ConvertToMLFloat16IfNeeded(ConvertToFloatIfNeeded(mean));
From 295d6527228ca7dc4a264ace35a2ca5d172c7760 Mon Sep 17 00:00:00 2001
From: Alex Marin
Date: Mon, 30 Sep 2024 11:46:33 -0700
Subject: [PATCH 08/36] convert output buffer efficiently in skip_layer_norm

---
 .../contrib_ops/cpu/skip_layer_norm.cc        | 86 ++++++++++++-------
 .../core/providers/cpu/nn/layer_norm_impl.cc  |  1 +
 2 files changed, 57 insertions(+), 30 deletions(-)

diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc
index 9178a2f17015b..47174ec54fafd 100644
--- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc
+++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc
@@ -38,6 +38,28 @@ REGISTER_KERNEL_TYPED(MLFloat16)
 
+namespace {
+
+double* CreateBufferIfMLFloat16(double* p_output, int num_elems)
+{
+  return p_output;
+}
+
+float* CreateBufferIfMLFloat16(float* p_output, int num_elems)
+{
+  return p_output;
+}
+
+float* CreateBufferIfMLFloat16(MLFloat16* p_output, int num_elems)
+{
+  if (!p_output) {
+    return nullptr;
+  }
+
+  return new float[num_elems];
+}
+
+
 template <typename T>
 std::shared_ptr<std::vector<float>> ConvertHalfToFloatBufferIfNeeded(const T* p_input, int num_elems);
 
@@ -63,24 +85,17 @@ std::shared_ptr<std::vector<float>> ConvertHalfToFloatBufferIfNeeded<MLFloat16>(
 }
 
-// Function template that only converts the input value to MLFloat16 if T is MLFloat16.
-template <typename T>
-ORT_FORCEINLINE constexpr typename std::enable_if_t<std::is_same_v<T, float> || std::is_same_v<T, double>, T>
-ConvertDoubleOrFloatToMLFloat16IfNeeded(T val) {
-  return val;
-}
+void ConvertFloatBufferToMLFloat16(const float* output_buffer, MLFloat16* p_output, int num_elems)
+{
+  if (!output_buffer || !p_output) {
+    return;
+  }
 
-template <typename T>
-ORT_FORCEINLINE constexpr typename std::enable_if_t<std::is_same_v<T, MLFloat16>, T>
-ConvertDoubleOrFloatToMLFloat16IfNeeded(float val) {
-  return MLFloat16(val);
+  MlasConvertFloatToHalfBuffer(output_buffer, p_output, num_elems);
 }
 
-template <typename T>
-ORT_FORCEINLINE constexpr typename std::enable_if_t<std::is_same_v<T, MLFloat16>, T>
-ConvertDoubleOrFloatToMLFloat16IfNeeded(double val) {
-  return MLFloat16(static_cast<float>(val));
-}
+}  // namespace
+
 
 template <typename T, bool simplified>
 SkipLayerNorm<T, simplified>::SkipLayerNorm(const OpKernelInfo& op_kernel_info)
@@ -164,22 +179,30 @@ Status SkipLayerNorm<T, simplified>::Compute(OpKernelContext* p_ctx) const {
                   ? reinterpret_cast<const DoubleOrFloat*>(bias_data)
                   : reinterpret_cast<const DoubleOrFloat*>(&(*float_bias)[0]);
 
-          std::unique_ptr<DoubleOrFloat[]> output_buffer = std::make_unique<DoubleOrFloat[]>(hidden_size);
+          // If T is float or double, then output_buffer will be the same as p_output, so we don't allocate new memory.
+          // If T is MLFloat16, then we allocate hidden_size floats in output_buffer.
+          DoubleOrFloat* output_buffer = static_cast<DoubleOrFloat*>(CreateBufferIfMLFloat16(p_output, hidden_size));
+
           for (size_t h = 0; h < static_cast<size_t>(hidden_size); h++) {
-            DoubleOrFloat value = converted_input[h] + converted_skip[h];
+            DoubleOrFloat val = converted_input[h] + converted_skip[h];
 
             if (nullptr != bias_data) {
-              value += converted_bias[h];
+              val += converted_bias[h];
             }
 
-            output_buffer[h] = value;
-            T converted_value = ConvertDoubleOrFloatToMLFloat16IfNeeded<T>(value);
-            if (nullptr != p_skip_input_bias_add_output_data) {
-              p_skip_input_bias_add_output_data[h] = converted_value;
+            output_buffer[h] = val;
+            mean += val;
+            mean_square += val * val;
+
+            if (nullptr != p_skip_input_bias_add_output_data && (std::is_same_v<T, float> || std::is_same_v<T, double>)) {
+              p_skip_input_bias_add_output_data[h] = *(reinterpret_cast<T*>(&val));
             }
+          }
 
-            mean += value;
-            mean_square += value * value;
+          if (nullptr != p_skip_input_bias_add_output_data && std::is_same_v<T, MLFloat16>) {
+            ConvertFloatBufferToMLFloat16(reinterpret_cast<const float*>(output_buffer),
+                                          reinterpret_cast<MLFloat16*>(p_skip_input_bias_add_output_data),
+                                          hidden_size);
           }
 
           mean = mean / hidden_size;
@@ -201,16 +224,19 @@ Status SkipLayerNorm<T, simplified>::Compute(OpKernelContext* p_ctx) const {
                   : reinterpret_cast<const DoubleOrFloat*>(&(*float_beta)[0]);
           for (size_t h = 0; h < static_cast<size_t>(hidden_size); h++) {
             if (simplified) {
-              p_output[h] = ConvertDoubleOrFloatToMLFloat16IfNeeded<T>(
-                  output_buffer[h] / mean_square * converted_gamma[h]);
+              output_buffer[h] = output_buffer[h] / mean_square * converted_gamma[h];
             } else if (nullptr == beta_data) {
-              p_output[h] = ConvertDoubleOrFloatToMLFloat16IfNeeded<T>(
-                  (output_buffer[h] - mean) / mean_square * converted_gamma[h]);
+              output_buffer[h] = (output_buffer[h] - mean) / mean_square * converted_gamma[h];
             } else {
-              p_output[h] = ConvertDoubleOrFloatToMLFloat16IfNeeded<T>(
-                  (output_buffer[h] - mean) / mean_square * converted_gamma[h] + converted_beta[h]);
+              output_buffer[h] = (output_buffer[h] - mean) / mean_square * converted_gamma[h] + converted_beta[h];
             }
           }
+
+          if (std::is_same_v<DoubleOrFloat, MLFloat16>) {
+            ConvertFloatBufferToMLFloat16(
+                reinterpret_cast<float*>(output_buffer), reinterpret_cast<MLFloat16*>(p_output), hidden_size);
+            delete[] output_buffer;
+          }
         },
         0);
 
diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc
index a7ab7c6b526d6..cc7bfb039d112 100644
--- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc
+++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc
@@ -99,6 +99,7 @@ ORT_FORCEINLINE constexpr double ConvertToMLFloat16IfNeeded(double val) {
 
 }  // namespace
 
+
 LayerNormImpl::LayerNormImpl(const OpKernelInfo& op_kernel_info, bool simplified, bool contrib_op)
     : OpKernel(op_kernel_info), simplified_{simplified}, contrib_op_{contrib_op} {
   ORT_ENFORCE(op_kernel_info.GetAttr<int64_t>("axis", &axis_).IsOK());
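[Annotation: the output side now mirrors the input side: results are accumulated in a float scratch buffer and converted back to half in a single MlasConvertFloatToHalfBuffer pass. One subtlety worth flagging for readers: the `if (std::is_same_v<...>)` guards are ordinary run-time branches, so both arms must compile for every T, which is what the reinterpret_casts are for, and an always-false type condition here fails silently; a later "Fix bug" commit in this series touches exactly this check. With C++17 the branch can be made compile-time instead. A small illustrative sketch, not what the patch itself does:

    #include <cstddef>
    #include <type_traits>

    struct Half { unsigned short bits; };                             // stand-in for MLFloat16
    void FloatToHalfBulk(const float* in, Half* out, std::size_t n);  // stand-in converter

    template <typename T>
    void FinishOutput(float* scratch, T* out, std::size_t n) {
      if constexpr (std::is_same_v<T, Half>) {
        FloatToHalfBulk(scratch, out, n);  // only instantiated when T is the half type
        delete[] scratch;                  // scratch was separately allocated for half outputs
      }
      // for float outputs, scratch already aliases out and nothing remains to do
      (void)n;
    }
]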
nullptr : new float[num_elems]; } template -std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded(const T* p_input, int num_elems); +ORT_FORCEINLINE std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded(const T* p_input, int num_elems); template -std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded( +ORT_FORCEINLINE std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded( const std::enable_if_t || std::is_same_v, T>* p_input, int num_elems) { return nullptr; From 245f298eba41e5d69f8e977865d5b96de16986e8 Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Mon, 30 Sep 2024 12:32:11 -0700 Subject: [PATCH 10/36] fix some lint errors --- .../contrib_ops/cpu/skip_layer_norm.cc | 18 ++++++----------- .../core/providers/cpu/nn/layer_norm_impl.cc | 20 +++++++------------ 2 files changed, 13 insertions(+), 25 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index ff100e617d2f8..5d4ae6f67d972 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -40,18 +40,15 @@ REGISTER_KERNEL_TYPED(MLFloat16) namespace { -ORT_FORCEINLINE double* CreateBufferIfMLFloat16(double* p_output, int num_elems) -{ +ORT_FORCEINLINE double* CreateBufferIfMLFloat16(double* p_output, int num_elems) { return p_output; } -ORT_FORCEINLINE float* CreateBufferIfMLFloat16(float* p_output, int num_elems) -{ +ORT_FORCEINLINE float* CreateBufferIfMLFloat16(float* p_output, int num_elems) { return p_output; } -ORT_FORCEINLINE float* CreateBufferIfMLFloat16(MLFloat16* p_output, int num_elems) -{ +ORT_FORCEINLINE float* CreateBufferIfMLFloat16(MLFloat16* p_output, int num_elems) { return p_output == nullptr ? nullptr : new float[num_elems]; } @@ -61,14 +58,12 @@ ORT_FORCEINLINE std::shared_ptr> ConvertHalfToFloatBufferIfNe template ORT_FORCEINLINE std::shared_ptr> ConvertHalfToFloatBufferIfNeeded( - const std::enable_if_t || std::is_same_v, T>* p_input, int num_elems) -{ + const std::enable_if_t || std::is_same_v, T>* p_input, int num_elems) { return nullptr; } template<> -std::shared_ptr> ConvertHalfToFloatBufferIfNeeded(const MLFloat16* p_input, int num_elems) -{ +std::shared_ptr> ConvertHalfToFloatBufferIfNeeded(const MLFloat16* p_input, int num_elems) { if (!p_input) { return nullptr; } @@ -81,8 +76,7 @@ std::shared_ptr> ConvertHalfToFloatBufferIfNeeded( } -void ConvertFloatBufferToMLFloat16(const float* output_buffer, MLFloat16* p_output, int num_elems) -{ +void ConvertFloatBufferToMLFloat16(const float* output_buffer, MLFloat16* p_output, int num_elems) { if (!output_buffer || !p_output) { return; } diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc index 35bac4b94d2c1..33e631152ffcf 100644 --- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc +++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc @@ -15,35 +15,30 @@ namespace onnxruntime { namespace { -ORT_FORCEINLINE double* OnlyCreateBufferIfMLFloat16(double* p_output, int num_elems) -{ +ORT_FORCEINLINE double* OnlyCreateBufferIfMLFloat16(double* p_output, int num_elems) { return p_output; } -ORT_FORCEINLINE float* OnlyCreateBufferIfMLFloat16(float* p_output, int num_elems) -{ +ORT_FORCEINLINE float* OnlyCreateBufferIfMLFloat16(float* p_output, int num_elems) { return p_output; } -ORT_FORCEINLINE float* OnlyCreateBufferIfMLFloat16(MLFloat16* p_output, int num_elems) -{ +ORT_FORCEINLINE float* OnlyCreateBufferIfMLFloat16(MLFloat16* p_output, int num_elems) { return p_output 
== nullptr ? nullptr : new float[num_elems]; } template -ORT_FORCEINLINE std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded(const T* p_input, int num_elems); +ORT_FORCEINLINE std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded(const T* p_input, int64_t num_elems); template ORT_FORCEINLINE std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded( - const std::enable_if_t || std::is_same_v, T>* p_input, int num_elems) -{ + const std::enable_if_t || std::is_same_v, T>* p_input, int64_t num_elems) { return nullptr; } template<> -std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded(const MLFloat16* p_input, int num_elems) -{ +std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded(const MLFloat16* p_input, int64_t num_elems) { if (!p_input) { return nullptr; } @@ -56,8 +51,7 @@ std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded Date: Mon, 30 Sep 2024 12:44:07 -0700 Subject: [PATCH 11/36] fix warning --- onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc index 33e631152ffcf..00cc0900e9577 100644 --- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc +++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc @@ -15,15 +15,15 @@ namespace onnxruntime { namespace { -ORT_FORCEINLINE double* OnlyCreateBufferIfMLFloat16(double* p_output, int num_elems) { +ORT_FORCEINLINE double* OnlyCreateBufferIfMLFloat16(double* p_output, int64_t num_elems) { return p_output; } -ORT_FORCEINLINE float* OnlyCreateBufferIfMLFloat16(float* p_output, int num_elems) { +ORT_FORCEINLINE float* OnlyCreateBufferIfMLFloat16(float* p_output, int64_t num_elems) { return p_output; } -ORT_FORCEINLINE float* OnlyCreateBufferIfMLFloat16(MLFloat16* p_output, int num_elems) { +ORT_FORCEINLINE float* OnlyCreateBufferIfMLFloat16(MLFloat16* p_output, int64_t num_elems) { return p_output == nullptr ? 
nullptr : new float[num_elems]; } From a483ca480cb09275d46158061e8f6c0c57056ada Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Mon, 30 Sep 2024 22:50:00 -0700 Subject: [PATCH 12/36] maybe_unused --- onnxruntime/contrib_ops/cpu/skip_layer_norm.cc | 10 ++++++---- onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc | 10 ++++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index 5d4ae6f67d972..6505ee3fc6fd4 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -40,11 +40,11 @@ REGISTER_KERNEL_TYPED(MLFloat16) namespace { -ORT_FORCEINLINE double* CreateBufferIfMLFloat16(double* p_output, int num_elems) { +ORT_FORCEINLINE double* CreateBufferIfMLFloat16(double* p_output, [[maybe_unused]] int num_elems) { return p_output; } -ORT_FORCEINLINE float* CreateBufferIfMLFloat16(float* p_output, int num_elems) { +ORT_FORCEINLINE float* CreateBufferIfMLFloat16(float* p_output, [[maybe_unused]] int num_elems) { return p_output; } @@ -54,11 +54,13 @@ ORT_FORCEINLINE float* CreateBufferIfMLFloat16(MLFloat16* p_output, int num_elem template -ORT_FORCEINLINE std::shared_ptr> ConvertHalfToFloatBufferIfNeeded(const T* p_input, int num_elems); +ORT_FORCEINLINE std::shared_ptr> ConvertHalfToFloatBufferIfNeeded( + [[maybe_unused]] const T* p_input, [[maybe_unused]] int num_elems); template ORT_FORCEINLINE std::shared_ptr> ConvertHalfToFloatBufferIfNeeded( - const std::enable_if_t || std::is_same_v, T>* p_input, int num_elems) { + [[maybe_unused]] const std::enable_if_t || std::is_same_v, T>* p_input, + [[maybe_unused]] int num_elems) { return nullptr; } diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc index 00cc0900e9577..d3da791be81fe 100644 --- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc +++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc @@ -15,11 +15,11 @@ namespace onnxruntime { namespace { -ORT_FORCEINLINE double* OnlyCreateBufferIfMLFloat16(double* p_output, int64_t num_elems) { +ORT_FORCEINLINE double* OnlyCreateBufferIfMLFloat16(double* p_output, [[maybe_unused]] int64_t num_elems) { return p_output; } -ORT_FORCEINLINE float* OnlyCreateBufferIfMLFloat16(float* p_output, int64_t num_elems) { +ORT_FORCEINLINE float* OnlyCreateBufferIfMLFloat16(float* p_output, [[maybe_unused]] int64_t num_elems) { return p_output; } @@ -29,11 +29,13 @@ ORT_FORCEINLINE float* OnlyCreateBufferIfMLFloat16(MLFloat16* p_output, int64_t template -ORT_FORCEINLINE std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded(const T* p_input, int64_t num_elems); +ORT_FORCEINLINE std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded( + [[maybe_unused]] const T* p_input, [[maybe_unused]] int64_t num_elems); template ORT_FORCEINLINE std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded( - const std::enable_if_t || std::is_same_v, T>* p_input, int64_t num_elems) { + [[maybe_unused]] const std::enable_if_t || std::is_same_v, T>* p_input, + [[maybe_unused]] int64_t num_elems) { return nullptr; } From 19d225a017d7ed83587f94945ccde47f340bc1f1 Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Tue, 1 Oct 2024 07:12:57 -0700 Subject: [PATCH 13/36] Fix bug --- onnxruntime/contrib_ops/cpu/skip_layer_norm.cc | 2 +- onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc 
b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index 6505ee3fc6fd4..c963668d58e6e 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -224,7 +224,7 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { } } - if (std::is_same_v) { + if (std::is_same_v) { ConvertFloatBufferToMLFloat16( reinterpret_cast(output_buffer), reinterpret_cast(p_output), hidden_size); delete[] output_buffer; diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc index d3da791be81fe..50fa0d55af270 100644 --- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc +++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc @@ -240,7 +240,7 @@ Status LayerNormImpl::ComputeWithoutContext( } } - if (std::is_same_v) { + if (std::is_same_v) { ConvertFloatBufferToMLFloat16( reinterpret_cast(output_buffer), reinterpret_cast(p_output), norm_size); delete[] output_buffer; From 05b5037b410d800959d4daaa45be3d0bf21521fd Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Tue, 1 Oct 2024 12:14:33 -0700 Subject: [PATCH 14/36] separate MLFloat16 implementation in skip_layer_norm --- .../contrib_ops/cpu/skip_layer_norm.cc | 252 +++++++++--------- 1 file changed, 125 insertions(+), 127 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index c963668d58e6e..a4c7a19dfb5c9 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -40,50 +40,138 @@ REGISTER_KERNEL_TYPED(MLFloat16) namespace { -ORT_FORCEINLINE double* CreateBufferIfMLFloat16(double* p_output, [[maybe_unused]] int num_elems) { - return p_output; -} +template || std::is_same_v, void>> +void ComputeJob( + const T* input_data, + const T* skip_data, + const T* gamma_data, + const T* beta_data, + const T* bias_data, + ptrdiff_t task_idx, + int hidden_size, + int64_t skip_size, + float epsilon, + bool simplified, + T* output_data, + T* skip_input_bias_add_output_data +) { + auto offset = task_idx * hidden_size; + const T* p_input = input_data + offset; + const T* p_skip = skip_data + (offset % skip_size); + T* p_output = output_data + offset; + T* p_skip_input_bias_add_output = skip_input_bias_add_output_data == nullptr ? nullptr : skip_input_bias_add_output_data + offset; + + T mean(0.0f); + T mean_square(0.0f); + + for (decltype(hidden_size) h = 0; h < hidden_size; h++) { + T val = p_input[h] + p_skip[h]; + + if (nullptr != bias_data) { + val += bias_data[h]; + } + + if (nullptr != p_skip_input_bias_add_output) { + p_skip_input_bias_add_output[h] = val; + } + + p_output[h] = val; + mean += val; + mean_square += val * val; + } -ORT_FORCEINLINE float* CreateBufferIfMLFloat16(float* p_output, [[maybe_unused]] int num_elems) { - return p_output; -} + mean = mean / hidden_size; + if (simplified) { + mean_square = sqrt(mean_square / hidden_size + epsilon); + } else { + mean_square = sqrt(mean_square / hidden_size - mean * mean + epsilon); + } -ORT_FORCEINLINE float* CreateBufferIfMLFloat16(MLFloat16* p_output, int num_elems) { - return p_output == nullptr ? 
nullptr : new float[num_elems]; + for (decltype(hidden_size) h = 0; h < hidden_size; h++) { + if (simplified) { + p_output[h] = p_output[h] / mean_square * gamma_data[h]; + } else if (nullptr == beta_data) { + p_output[h] = (p_output[h] - mean) / mean_square * gamma_data[h]; + } else { + p_output[h] = (p_output[h] - mean) / mean_square * gamma_data[h] + beta_data[h]; + } + } } +void ComputeJob( + const MLFloat16* input_data, + const MLFloat16* skip_data, + const MLFloat16* gamma_data, + const MLFloat16* beta_data, + const MLFloat16* bias_data, + ptrdiff_t task_idx, + int hidden_size, + int64_t skip_size, + float epsilon, + bool simplified, + MLFloat16* output_data, + MLFloat16* skip_input_bias_add_output_data +) { + auto offset = task_idx * hidden_size; + const MLFloat16* p_input = input_data + offset; + const MLFloat16* p_skip = skip_data + (offset % skip_size); + MLFloat16* p_output = output_data + offset; + MLFloat16* p_skip_input_bias_add_output = skip_input_bias_add_output_data == nullptr ? nullptr : skip_input_bias_add_output_data + offset; + + float mean(0.0f); + float mean_square(0.0f); + + std::vector float_input(hidden_size); + MlasConvertHalfToFloatBuffer(p_input, &float_input[0], hidden_size); + std::vector float_skip(hidden_size); + MlasConvertHalfToFloatBuffer(p_skip, &float_skip[0], hidden_size); + std::vector float_bias; + if (bias_data != nullptr) { + float_bias.resize(hidden_size); + MlasConvertHalfToFloatBuffer(bias_data, &float_bias[0], hidden_size); + } + + std::vector float_output(hidden_size); -template -ORT_FORCEINLINE std::shared_ptr> ConvertHalfToFloatBufferIfNeeded( - [[maybe_unused]] const T* p_input, [[maybe_unused]] int num_elems); + for (decltype(hidden_size) h = 0; h < hidden_size; h++) { + float val = float_input[h] + float_skip[h]; -template -ORT_FORCEINLINE std::shared_ptr> ConvertHalfToFloatBufferIfNeeded( - [[maybe_unused]] const std::enable_if_t || std::is_same_v, T>* p_input, - [[maybe_unused]] int num_elems) { - return nullptr; -} + if (nullptr != bias_data) { + val += float_bias[h]; + } -template<> -std::shared_ptr> ConvertHalfToFloatBufferIfNeeded(const MLFloat16* p_input, int num_elems) { - if (!p_input) { - return nullptr; + float_output[h] = val; + mean += val; + mean_square += val * val; } - // Efficiently convert all the MLFloat16 values to floats. 
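// Sketch of the fp16 round-trip the new MLFloat16 ComputeJob overload is built
// on: widen the half-precision inputs to fp32 once, do all accumulation and
// normalization in fp32, and narrow back to fp16 once at the end. Conversion
// routine signatures assumed from onnxruntime/core/mlas/inc/mlas.h; the buffer
// names are illustrative only.
//
//   std::vector<float> tmp(n);
//   MlasConvertHalfToFloatBuffer(p_half_in, tmp.data(), n);   // fp16 -> fp32, one pass
//   for (size_t i = 0; i < n; ++i) {
//     tmp[i] = (tmp[i] - mean) / mean_square * gamma[i];      // math stays in fp32
//   }
//   MlasConvertFloatToHalfBuffer(tmp.data(), p_half_out, n);  // fp32 -> fp16, one pass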
- std::shared_ptr> vec = std::make_shared>(num_elems); - MlasConvertHalfToFloatBuffer(p_input, &(*vec)[0], num_elems); - - return vec; -} + if (nullptr != p_skip_input_bias_add_output) { + MlasConvertFloatToHalfBuffer(&float_output[0], p_skip_input_bias_add_output, hidden_size); + } + mean = mean / hidden_size; + if (simplified) { + mean_square = sqrt(mean_square / hidden_size + epsilon); + } else { + mean_square = sqrt(mean_square / hidden_size - mean * mean + epsilon); + } -void ConvertFloatBufferToMLFloat16(const float* output_buffer, MLFloat16* p_output, int num_elems) { - if (!output_buffer || !p_output) { - return; + std::vector float_gamma(hidden_size); + MlasConvertHalfToFloatBuffer(gamma_data, &float_gamma[0], hidden_size); + std::vector float_beta(hidden_size); + MlasConvertHalfToFloatBuffer(beta_data, &float_beta[0], hidden_size); + + for (decltype(hidden_size) h = 0; h < hidden_size; h++) { + if (simplified) { + float_output[h] = float_output[h] / mean_square * float_gamma[h]; + } else if (nullptr == beta_data) { + float_output[h] = (float_output[h] - mean) / mean_square * float_gamma[h]; + } else { + float_output[h] = (float_output[h] - mean) / mean_square * float_gamma[h] + float_beta[h]; + } } - MlasConvertFloatToHalfBuffer(output_buffer, p_output, num_elems); + MlasConvertFloatToHalfBuffer(&float_output[0], p_output, hidden_size); } } // namespace @@ -104,8 +192,7 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { const Tensor* beta = p_ctx->Input(3); const Tensor* bias = p_ctx->Input(4); Tensor* output = p_ctx->Output(0, input->Shape()); - // For inferencing, we support one more optional output which is the sum - // of the input and skip tensors + // For inferencing, we support one more optional output which is the sum of the input and skip tensors Tensor* skip_input_bias_add_output = p_ctx->Output(3, input->Shape()); const auto& input_dims = input->Shape().GetDims(); @@ -130,105 +217,16 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { T* output_data = output->MutableData(); - // For inferencing, we support one more optional output which is the sum - // of the input and skip tensors - T* skip_input_bias_add_output_data = skip_input_bias_add_output != nullptr ? skip_input_bias_add_output->MutableData() : nullptr; + // For inferencing, we support one more optional output which is the sum of the input and skip tensors + T* skip_input_bias_add_output_data = skip_input_bias_add_output == nullptr ? nullptr : skip_input_bias_add_output->MutableData(); - const auto& skip_size = skip->Shape().Size(); + const int64_t& skip_size = skip->Shape().Size(); concurrency::ThreadPool::TryBatchParallelFor( p_ctx->GetOperatorThreadPool(), static_cast(task_count), [&](ptrdiff_t task_idx) { - auto offset = task_idx * hidden_size; - - const T* p_input = input_data + offset; - const T* p_skip = skip_data + (offset % skip_size); - T* p_output = output_data + offset; - T* p_skip_input_bias_add_output_data = skip_input_bias_add_output_data != nullptr ? skip_input_bias_add_output_data + offset : nullptr; - - using DoubleOrFloat = typename std::conditional< - std::is_same::value, // If T is double - double, // Use double - float // Otherwise, use float (covers float and MLFloat16) - >::type; - - DoubleOrFloat mean(0.0f); - DoubleOrFloat mean_square(0.0f); - - std::shared_ptr> float_input = ConvertHalfToFloatBufferIfNeeded(p_input, hidden_size); - const DoubleOrFloat* converted_input = - float_input == nullptr - ? 
reinterpret_cast(p_input) - : reinterpret_cast(&(*float_input)[0]); - std::shared_ptr> float_skip = ConvertHalfToFloatBufferIfNeeded(p_skip, hidden_size); - const DoubleOrFloat* converted_skip = - float_skip == nullptr - ? reinterpret_cast(p_skip) - : reinterpret_cast(&(*float_skip)[0]); - std::shared_ptr> float_bias = ConvertHalfToFloatBufferIfNeeded(bias_data, hidden_size); - const DoubleOrFloat* converted_bias = - float_bias == nullptr - ? reinterpret_cast(bias_data) - : reinterpret_cast(&(*float_bias)[0]); - - // If T is float or double, then output_buffer will be the same as p_output, so we don't allocate new memory. - // If T is MLFloat16, then we allocate hidden_size floats in output_buffer. - DoubleOrFloat* output_buffer = static_cast(CreateBufferIfMLFloat16(p_output, hidden_size)); - - for (size_t h = 0; h < static_cast(hidden_size); h++) { - DoubleOrFloat val = converted_input[h] + converted_skip[h]; - - if (nullptr != bias_data) { - val += converted_bias[h]; - } - - output_buffer[h] = val; - mean += val; - mean_square += val * val; - - if (nullptr != p_skip_input_bias_add_output_data && (std::is_same_v || std::is_same_v)) { - p_skip_input_bias_add_output_data[h] = *(reinterpret_cast(&val)); - } - } - - if (nullptr != p_skip_input_bias_add_output_data && std::is_same_v) { - ConvertFloatBufferToMLFloat16(reinterpret_cast(output_buffer), - reinterpret_cast(p_skip_input_bias_add_output_data), - hidden_size); - } - - mean = mean / hidden_size; - if (simplified) { - mean_square = sqrt(mean_square / hidden_size + epsilon_); - } else { - mean_square = sqrt(mean_square / hidden_size - mean * mean + epsilon_); - } - - std::shared_ptr> float_gamma = ConvertHalfToFloatBufferIfNeeded(gamma_data, hidden_size); - const DoubleOrFloat* converted_gamma = - float_gamma == nullptr - ? reinterpret_cast(gamma_data) - : reinterpret_cast(&(*float_gamma)[0]); - std::shared_ptr> float_beta = ConvertHalfToFloatBufferIfNeeded(beta_data, hidden_size); - const DoubleOrFloat* converted_beta = - float_beta == nullptr - ? 
reinterpret_cast(beta_data) - : reinterpret_cast(&(*float_beta)[0]); - for (size_t h = 0; h < static_cast(hidden_size); h++) { - if (simplified) { - output_buffer[h] = output_buffer[h] / mean_square * converted_gamma[h]; - } else if (nullptr == beta_data) { - output_buffer[h] = (output_buffer[h] - mean) / mean_square * converted_gamma[h]; - } else { - output_buffer[h] = (output_buffer[h] - mean) / mean_square * converted_gamma[h] + converted_beta[h]; - } - } - - if (std::is_same_v) { - ConvertFloatBufferToMLFloat16( - reinterpret_cast(output_buffer), reinterpret_cast(p_output), hidden_size); - delete[] output_buffer; - } + ComputeJob(input_data, skip_data, gamma_data, beta_data, bias_data, task_idx, hidden_size, skip_size, epsilon_, + simplified, output_data, skip_input_bias_add_output_data); }, 0); From ab2e5f2e4b286f48de6278441649be408bd4bc95 Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Tue, 1 Oct 2024 12:39:18 -0700 Subject: [PATCH 15/36] fix linter issues --- .../contrib_ops/cpu/skip_layer_norm.cc | 56 ++++++++--------- .../core/providers/cpu/nn/layer_norm_impl.cc | 62 +++++++++---------- .../core/providers/cpu/nn/layer_norm_impl.h | 34 +++++----- .../microbenchmark/layer_normalization.cc | 23 +++---- 4 files changed, 82 insertions(+), 93 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index a4c7a19dfb5c9..66ca8c4dfd37f 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -37,24 +37,22 @@ REGISTER_KERNEL_TYPED(float) REGISTER_KERNEL_TYPED(double) REGISTER_KERNEL_TYPED(MLFloat16) - namespace { template || std::is_same_v, void>> void ComputeJob( - const T* input_data, - const T* skip_data, - const T* gamma_data, - const T* beta_data, - const T* bias_data, - ptrdiff_t task_idx, - int hidden_size, - int64_t skip_size, - float epsilon, - bool simplified, - T* output_data, - T* skip_input_bias_add_output_data -) { + const T* input_data, + const T* skip_data, + const T* gamma_data, + const T* beta_data, + const T* bias_data, + ptrdiff_t task_idx, + int hidden_size, + int64_t skip_size, + float epsilon, + bool simplified, + T* output_data, + T* skip_input_bias_add_output_data) { auto offset = task_idx * hidden_size; const T* p_input = input_data + offset; const T* p_skip = skip_data + (offset % skip_size); @@ -99,19 +97,18 @@ void ComputeJob( } void ComputeJob( - const MLFloat16* input_data, - const MLFloat16* skip_data, - const MLFloat16* gamma_data, - const MLFloat16* beta_data, - const MLFloat16* bias_data, - ptrdiff_t task_idx, - int hidden_size, - int64_t skip_size, - float epsilon, - bool simplified, - MLFloat16* output_data, - MLFloat16* skip_input_bias_add_output_data -) { + const MLFloat16* input_data, + const MLFloat16* skip_data, + const MLFloat16* gamma_data, + const MLFloat16* beta_data, + const MLFloat16* bias_data, + ptrdiff_t task_idx, + int hidden_size, + int64_t skip_size, + float epsilon, + bool simplified, + MLFloat16* output_data, + MLFloat16* skip_input_bias_add_output_data) { auto offset = task_idx * hidden_size; const MLFloat16* p_input = input_data + offset; const MLFloat16* p_skip = skip_data + (offset % skip_size); @@ -174,8 +171,7 @@ void ComputeJob( MlasConvertFloatToHalfBuffer(&float_output[0], p_output, hidden_size); } -} // namespace - +} // namespace template SkipLayerNorm::SkipLayerNorm(const OpKernelInfo& op_kernel_info) @@ -226,7 +222,7 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { 
p_ctx->GetOperatorThreadPool(), static_cast(task_count), [&](ptrdiff_t task_idx) { ComputeJob(input_data, skip_data, gamma_data, beta_data, bias_data, task_idx, hidden_size, skip_size, epsilon_, - simplified, output_data, skip_input_bias_add_output_data); + simplified, output_data, skip_input_bias_add_output_data); }, 0); diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc index 50fa0d55af270..28ff0420a7323 100644 --- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc +++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc @@ -27,19 +27,18 @@ ORT_FORCEINLINE float* OnlyCreateBufferIfMLFloat16(MLFloat16* p_output, int64_t return p_output == nullptr ? nullptr : new float[num_elems]; } - template ORT_FORCEINLINE std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded( - [[maybe_unused]] const T* p_input, [[maybe_unused]] int64_t num_elems); + [[maybe_unused]] const T* p_input, [[maybe_unused]] int64_t num_elems); template ORT_FORCEINLINE std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded( - [[maybe_unused]] const std::enable_if_t || std::is_same_v, T>* p_input, - [[maybe_unused]] int64_t num_elems) { + [[maybe_unused]] const std::enable_if_t || std::is_same_v, T>* p_input, + [[maybe_unused]] int64_t num_elems) { return nullptr; } -template<> +template <> std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded(const MLFloat16* p_input, int64_t num_elems) { if (!p_input) { return nullptr; @@ -52,7 +51,6 @@ std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeededGetOperatorThreadPool(); return ComputeWithoutContext(X_data, x_shape, scale_data, scale_shape, bias_data, bias_shape, - Y_data, mean_data, inv_std_dev_data, thread_pool, axis, epsilon, simplified); + Y_data, mean_data, inv_std_dev_data, thread_pool, axis, epsilon, simplified); } Status LayerNormImpl::Compute(OpKernelContext* p_ctx) const { @@ -156,19 +152,19 @@ Status LayerNormImpl::Compute(OpKernelContext* p_ctx) const { template Status LayerNormImpl::ComputeWithoutContext( - const T* X_data, - const TensorShape& x_shape, - const T* scale_data, - const TensorShape& scale_shape, - const T* bias_data, - const TensorShape& bias_shape, - T* Y_data, - U* mean_data, - U* inv_std_dev_data, - onnxruntime::concurrency::ThreadPool* thread_pool, - int64_t axis, - float epsilon, - bool simplified) const { + const T* X_data, + const TensorShape& x_shape, + const T* scale_data, + const TensorShape& scale_shape, + const T* bias_data, + const TensorShape& bias_shape, + T* Y_data, + U* mean_data, + U* inv_std_dev_data, + onnxruntime::concurrency::ThreadPool* thread_pool, + int64_t axis, + float epsilon, + bool simplified) const { int64_t norm_count = x_shape.SizeToDimension(onnxruntime::narrow(axis)); int64_t norm_size = x_shape.SizeFromDimension(onnxruntime::narrow(axis)); @@ -198,9 +194,9 @@ Status LayerNormImpl::ComputeWithoutContext( std::shared_ptr> float_input = ConvertMLFloat16ToFloatBufferIfNeeded(p_input, norm_size); const DoubleOrFloat* converted_input = - float_input == nullptr - ? reinterpret_cast(p_input) - : reinterpret_cast(&(*float_input)[0]); + float_input == nullptr + ? reinterpret_cast(p_input) + : reinterpret_cast(&(*float_input)[0]); // If T is float or double, then output_buffer will be the same as p_output, so we don't allocate new memory. // If T is MLFloat16, then we allocate norm_size floats in output_buffer. 
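// A condensed view of how the per-row jobs above are scheduled, assuming the
// onnxruntime::concurrency::ThreadPool API used in these kernels: the final
// argument 0 lets the pool pick its own batch size, and a null pool degrades
// to running the lambda serially.
//
//   concurrency::ThreadPool::TryBatchParallelFor(
//       p_ctx->GetOperatorThreadPool(),
//       static_cast<int32_t>(task_count),
//       [&](ptrdiff_t task_idx) { /* one row: hidden_size elements */ },
//       0);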
@@ -221,14 +217,14 @@ Status LayerNormImpl::ComputeWithoutContext( std::shared_ptr> float_scale = ConvertMLFloat16ToFloatBufferIfNeeded(scale_data, norm_size); const DoubleOrFloat* converted_scale = - float_scale == nullptr - ? reinterpret_cast(scale_data) - : reinterpret_cast(&(*float_scale)[0]); + float_scale == nullptr + ? reinterpret_cast(scale_data) + : reinterpret_cast(&(*float_scale)[0]); std::shared_ptr> float_bias = ConvertMLFloat16ToFloatBufferIfNeeded(bias_data, norm_size); const DoubleOrFloat* converted_bias = - float_bias == nullptr - ? reinterpret_cast(bias_data) - : reinterpret_cast(&(*float_bias)[0]); + float_bias == nullptr + ? reinterpret_cast(bias_data) + : reinterpret_cast(&(*float_bias)[0]); for (int64_t h = 0; h < norm_size; h++) { if (simplified) { @@ -242,7 +238,7 @@ Status LayerNormImpl::ComputeWithoutContext( if (std::is_same_v) { ConvertFloatBufferToMLFloat16( - reinterpret_cast(output_buffer), reinterpret_cast(p_output), norm_size); + reinterpret_cast(output_buffer), reinterpret_cast(p_output), norm_size); delete[] output_buffer; } diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h index aa876357ed3c8..64e1c2ba2f902 100644 --- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h +++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h @@ -17,19 +17,19 @@ class LayerNormImpl : public OpKernel { // This method was created so that it can be called directly from `test/onnx/microbenchmark/layer_normalization.cc`. template Status ComputeWithoutContext( - const T* X_data, - const TensorShape& x_shape, - const T* scale_data, - const TensorShape& scale_shape, - const T* bias_data, - const TensorShape& bias_shape, - T* Y_data, - U* mean_data, - U* inv_std_dev, - onnxruntime::concurrency::ThreadPool* thread_pool, - int64_t axis, - float epsilon, - bool simplified) const; + const T* X_data, + const TensorShape& x_shape, + const T* scale_data, + const TensorShape& scale_shape, + const T* bias_data, + const TensorShape& bias_shape, + T* Y_data, + U* mean_data, + U* inv_std_dev, + onnxruntime::concurrency::ThreadPool* thread_pool, + int64_t axis, + float epsilon, + bool simplified) const; private: template @@ -38,16 +38,16 @@ class LayerNormImpl : public OpKernel { template struct SrcDispatcher { Status operator()(const LayerNormImpl* p_instance, OpKernelContext* p_ctx, int64_t orig_axis, - float epsilon, bool simplified, bool contrib_op) const { + float epsilon, bool simplified, bool contrib_op) const { // the contrib op kernel was always registered with the same type for all constraints. // our implementation of the onnx op only supports 'float' as the U constraint. 
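// How a kernel instantiation is reached at runtime, in rough strokes: T comes
// from the input tensor's element type, and U follows the constraint noted
// above (the contrib op registers the same type for T and U; the ONNX op pins
// U to float). A hypothetical sketch of the dispatch chain this header plugs
// into -- dispatcher and functor names are assumptions, not verbatim API:
//
//   utils::MLTypeCallDispatcher<float, double, MLFloat16> t_disp(elem_type);
//   return t_disp.InvokeRet<Status, SrcDispatcher>(
//       this, p_ctx, axis_, epsilon_, simplified_, contrib_op_);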
- #if !defined(DISABLE_CONTRIB_OPS) +#if !defined(DISABLE_CONTRIB_OPS) if (contrib_op) { return p_instance->ComputeImpl(p_ctx, orig_axis, epsilon, simplified); } else - #else +#else ORT_UNUSED_PARAMETER(contrib_op); - #endif +#endif { return p_instance->ComputeImpl(p_ctx, orig_axis, epsilon, simplified); } diff --git a/onnxruntime/test/onnx/microbenchmark/layer_normalization.cc b/onnxruntime/test/onnx/microbenchmark/layer_normalization.cc index 5c7bd5716832a..4660cb85a43f1 100644 --- a/onnxruntime/test/onnx/microbenchmark/layer_normalization.cc +++ b/onnxruntime/test/onnx/microbenchmark/layer_normalization.cc @@ -33,23 +33,22 @@ static const size_t num_elems = dims[0] * dims[1] * dims[2]; static const std::vector float_vals(num_elems, 1.0f); static const std::vector MLFloat16_vals(num_elems, MLFloat16(1.0f)); -} // namespace +} // namespace template const T* getVector(); template <> const float* getVector() { - return float_vals.data(); + return float_vals.data(); } template <> const MLFloat16* getVector() { - return MLFloat16_vals.data(); + return MLFloat16_vals.data(); } - -template +template static void BM_LayerNormalization(benchmark::State& state) { bool simplified = false; const float epsilon = 1e-05f; @@ -69,7 +68,7 @@ static void BM_LayerNormalization(benchmark::State& state) { ConfigOptions config_options; OpKernelInfo op_kernel_info(node, kernel_def, *execution_provider, constant_initialized_tensors, mlvalue_name_idx_map, - data_transfer_mgr, allocators, config_options); + data_transfer_mgr, allocators, config_options); LayerNormImpl layer_norm_impl(op_kernel_info); @@ -88,21 +87,19 @@ static void BM_LayerNormalization(benchmark::State& state) { OrtThreadPoolParams tp_params; tp_params.name = ORT_TSTR("intra-op"); std::unique_ptr thread_pool = concurrency::CreateThreadPool( - &Env::Default(), tp_params, concurrency::ThreadPoolType::INTRA_OP); + &Env::Default(), tp_params, concurrency::ThreadPoolType::INTRA_OP); for (auto _ : state) { auto status = layer_norm_impl.ComputeWithoutContext(x_data, x_shape, scale_data, scale_shape, bias_data, bias_shape, - Y_data, mean_data, inv_std_dev_data, thread_pool.get(), axis, epsilon, simplified); + Y_data, mean_data, inv_std_dev_data, thread_pool.get(), axis, epsilon, simplified); - if (! 
status.IsOK()) - { - std::cout << "ComputeWithoutContext status not OK: " << status.ErrorMessage() << std::endl; - break; + if (!status.IsOK()) { + std::cout << "ComputeWithoutContext status not OK: " << status.ErrorMessage() << std::endl; + break; } } } - BENCHMARK(BM_LayerNormalization) ->Arg(1) ->UseRealTime() From 63e9644ce4016dbb77608dcc550397b157f55ac1 Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Tue, 1 Oct 2024 14:03:54 -0700 Subject: [PATCH 16/36] fix precision warning --- .../core/providers/cpu/nn/layer_norm_impl.cc | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc index 28ff0420a7323..3259d0b67ef92 100644 --- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc +++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc @@ -29,17 +29,17 @@ ORT_FORCEINLINE float* OnlyCreateBufferIfMLFloat16(MLFloat16* p_output, int64_t template ORT_FORCEINLINE std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded( - [[maybe_unused]] const T* p_input, [[maybe_unused]] int64_t num_elems); + [[maybe_unused]] const T* p_input, [[maybe_unused]] size_t num_elems); template ORT_FORCEINLINE std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded( [[maybe_unused]] const std::enable_if_t || std::is_same_v, T>* p_input, - [[maybe_unused]] int64_t num_elems) { + [[maybe_unused]] size_t num_elems) { return nullptr; } template <> -std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded(const MLFloat16* p_input, int64_t num_elems) { +std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded(const MLFloat16* p_input, size_t num_elems) { if (!p_input) { return nullptr; } @@ -51,7 +51,7 @@ std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded> float_input = ConvertMLFloat16ToFloatBufferIfNeeded(p_input, norm_size); + std::shared_ptr> float_input = ConvertMLFloat16ToFloatBufferIfNeeded( + p_input, static_cast(norm_size)); const DoubleOrFloat* converted_input = float_input == nullptr ? reinterpret_cast(p_input) @@ -215,12 +216,14 @@ Status LayerNormImpl::ComputeWithoutContext( mean_square = sqrt(mean_square / norm_size - mean * mean + epsilon); } - std::shared_ptr> float_scale = ConvertMLFloat16ToFloatBufferIfNeeded(scale_data, norm_size); + std::shared_ptr> float_scale = ConvertMLFloat16ToFloatBufferIfNeeded( + scale_data, static_cast(norm_size)); const DoubleOrFloat* converted_scale = float_scale == nullptr ? reinterpret_cast(scale_data) : reinterpret_cast(&(*float_scale)[0]); - std::shared_ptr> float_bias = ConvertMLFloat16ToFloatBufferIfNeeded(bias_data, norm_size); + std::shared_ptr> float_bias = ConvertMLFloat16ToFloatBufferIfNeeded( + bias_data, static_cast(norm_size)); const DoubleOrFloat* converted_bias = float_bias == nullptr ? 
reinterpret_cast(bias_data) @@ -238,7 +241,9 @@ Status LayerNormImpl::ComputeWithoutContext( if (std::is_same_v) { ConvertFloatBufferToMLFloat16( - reinterpret_cast(output_buffer), reinterpret_cast(p_output), norm_size); + reinterpret_cast(output_buffer), + reinterpret_cast(p_output), + static_cast(norm_size)); delete[] output_buffer; } From 11eb7fbccb77271a98157ed242235895ebcf6577 Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Tue, 1 Oct 2024 22:23:19 -0700 Subject: [PATCH 17/36] cast --- onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc index 3259d0b67ef92..4f986d2ab0066 100644 --- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc +++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc @@ -15,15 +15,15 @@ namespace onnxruntime { namespace { -ORT_FORCEINLINE double* OnlyCreateBufferIfMLFloat16(double* p_output, [[maybe_unused]] int64_t num_elems) { +ORT_FORCEINLINE double* OnlyCreateBufferIfMLFloat16(double* p_output, [[maybe_unused]] size_t num_elems) { return p_output; } -ORT_FORCEINLINE float* OnlyCreateBufferIfMLFloat16(float* p_output, [[maybe_unused]] int64_t num_elems) { +ORT_FORCEINLINE float* OnlyCreateBufferIfMLFloat16(float* p_output, [[maybe_unused]] size_t num_elems) { return p_output; } -ORT_FORCEINLINE float* OnlyCreateBufferIfMLFloat16(MLFloat16* p_output, int64_t num_elems) { +ORT_FORCEINLINE float* OnlyCreateBufferIfMLFloat16(MLFloat16* p_output, size_t num_elems) { return p_output == nullptr ? nullptr : new float[num_elems]; } @@ -201,7 +201,8 @@ Status LayerNormImpl::ComputeWithoutContext( // If T is float or double, then output_buffer will be the same as p_output, so we don't allocate new memory. // If T is MLFloat16, then we allocate norm_size floats in output_buffer. 
- DoubleOrFloat* output_buffer = static_cast(OnlyCreateBufferIfMLFloat16(p_output, norm_size)); + DoubleOrFloat* output_buffer = static_cast( + OnlyCreateBufferIfMLFloat16(p_output, static_cast(norm_size))); for (int64_t h = 0; h < norm_size; h++) { output_buffer[h] = converted_input[h]; From 46775a764e5b9da75e4af1a04943f4be66c3838f Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Wed, 2 Oct 2024 00:17:58 -0700 Subject: [PATCH 18/36] separate implementation for MLFloat16 inside layer_norm_impl --- .../core/providers/cpu/nn/layer_norm_impl.cc | 228 ++++++++---------- 1 file changed, 99 insertions(+), 129 deletions(-) diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc index 4f986d2ab0066..32427b2c39db1 100644 --- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc +++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc @@ -15,75 +15,119 @@ namespace onnxruntime { namespace { -ORT_FORCEINLINE double* OnlyCreateBufferIfMLFloat16(double* p_output, [[maybe_unused]] size_t num_elems) { - return p_output; -} +template || std::is_same_v, void>> +void ComputeJob( + const T* X_data, + const T* scale_data, + const T* bias_data, + const ptrdiff_t task_idx, + const int64_t norm_size, + float epsilon, + bool simplified, + T* Y_data, + U* mean_data, + U* inv_std_dev_data) { + const T* p_input = X_data + task_idx * norm_size; + T* p_output = Y_data + task_idx * norm_size; -ORT_FORCEINLINE float* OnlyCreateBufferIfMLFloat16(float* p_output, [[maybe_unused]] size_t num_elems) { - return p_output; -} + T mean(0.0f); + T mean_square(0.0f); -ORT_FORCEINLINE float* OnlyCreateBufferIfMLFloat16(MLFloat16* p_output, size_t num_elems) { - return p_output == nullptr ? nullptr : new float[num_elems]; -} - -template -ORT_FORCEINLINE std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded( - [[maybe_unused]] const T* p_input, [[maybe_unused]] size_t num_elems); + for (int64_t h = 0; h < norm_size; h++) { + p_output[h] = p_input[h]; + mean += p_input[h]; + mean_square += p_input[h] * p_input[h]; + } -template -ORT_FORCEINLINE std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded( - [[maybe_unused]] const std::enable_if_t || std::is_same_v, T>* p_input, - [[maybe_unused]] size_t num_elems) { - return nullptr; -} + mean = mean / norm_size; + if (simplified) { + mean_square = sqrt(mean_square / norm_size + epsilon); + } else { + mean_square = sqrt(mean_square / norm_size - mean * mean + epsilon); + } -template <> -std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded(const MLFloat16* p_input, size_t num_elems) { - if (!p_input) { - return nullptr; + for (int64_t h = 0; h < norm_size; h++) { + if (simplified) { + p_output[h] = p_output[h] / mean_square * scale_data[h]; + } else if (nullptr == bias_data) { + p_output[h] = (p_output[h] - mean) / mean_square * scale_data[h]; + } else { + p_output[h] = (p_output[h] - mean) / mean_square * scale_data[h] + bias_data[h]; + } } - // Efficiently convert all the MLFloat16 values to floats. 
- std::shared_ptr> vec = std::make_shared>(num_elems); - MlasConvertHalfToFloatBuffer(p_input, &(*vec)[0], num_elems); + if (mean_data != nullptr) { + // ONNX spec doesn't support 'double' for 'U' so when 'T' == double, 'U' == float and we need to narrow + mean_data[task_idx] = gsl::narrow_cast(mean); + } - return vec; + if (inv_std_dev_data != nullptr) { + inv_std_dev_data[task_idx] = gsl::narrow_cast(1 / mean_square); + } } -void ConvertFloatBufferToMLFloat16(const float* output_buffer, MLFloat16* p_output, size_t num_elems) { - if (!output_buffer || !p_output) { - return; +template +void ComputeJob( + const MLFloat16* X_data, + const MLFloat16* scale_data, + const MLFloat16* bias_data, + const ptrdiff_t task_idx, + const int64_t norm_size, + float epsilon, + bool simplified, + MLFloat16* Y_data, + U* mean_data, + U* inv_std_dev_data) { + const MLFloat16* p_input = X_data + task_idx * norm_size; + MLFloat16* p_output = Y_data + task_idx * norm_size; + + float mean(0.0f); + float mean_square(0.0f); + + std::vector float_input(norm_size); + MlasConvertHalfToFloatBuffer(p_input, &float_input[0], norm_size); + + std::vector float_output(norm_size); + for (int64_t h = 0; h < norm_size; h++) { + float_output[h] = float_input[h]; + mean += float_input[h]; + mean_square += float_input[h] * float_input[h]; } - MlasConvertFloatToHalfBuffer(output_buffer, p_output, num_elems); -} + mean = mean / norm_size; + if (simplified) { + mean_square = sqrt(mean_square / norm_size + epsilon); + } else { + mean_square = sqrt(mean_square / norm_size - mean * mean + epsilon); + } -ORT_FORCEINLINE constexpr float ConvertToFloatIfNeeded(float val) { - return val; -} + std::vector float_scale(norm_size); + MlasConvertHalfToFloatBuffer(scale_data, &float_scale[0], norm_size); + std::vector float_bias(norm_size); + MlasConvertHalfToFloatBuffer(bias_data, &float_bias[0], norm_size); -ORT_FORCEINLINE constexpr float ConvertToFloatIfNeeded(double val) { - // ONNX spec doesn't support 'double' for 'Ret' so when 'T' == double, 'Ret' == float and we need to narrow - return gsl::narrow_cast(val); -} + for (int64_t h = 0; h < norm_size; h++) { + if (simplified) { + float_output[h] = float_output[h] / mean_square * float_scale[h]; + } else if (nullptr == bias_data) { + float_output[h] = (float_output[h] - mean) / mean_square * float_scale[h]; + } else { + float_output[h] = (float_output[h] - mean) / mean_square * float_scale[h] + float_bias[h]; + } + } -// Function template that only converts the input value to MLFloat16 if T is MLFloat16. 
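// What replaces the enable_if conversion helpers being deleted here is plain
// overloading: a constrained primary template handles float/double in native
// precision, and a separate concrete overload handles MLFloat16 via fp32. A
// minimal sketch of that dispatch shape (function names are illustrative):
//
//   template <typename T, typename = std::enable_if_t<
//                 std::is_same_v<T, float> || std::is_same_v<T, double>, void>>
//   void Kernel(const T* x);          // arithmetic directly in T
//   void Kernel(const MLFloat16* x);  // widen to fp32, compute, narrow back
//
// For MLFloat16 arguments the non-template overload is an exact match and is
// chosen outright, while SFINAE removes the template from the overload set,
// so every call resolves unambiguously.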
-template -ORT_FORCEINLINE constexpr typename std::enable_if_t || std::is_same_v, float> -ConvertToMLFloat16IfNeeded(float val) { - return val; -} + MlasConvertFloatToHalfBuffer(&float_output[0], p_output, static_cast(norm_size)); -template -ORT_FORCEINLINE constexpr typename std::enable_if_t, MLFloat16> -ConvertToMLFloat16IfNeeded(float val) { - return MLFloat16(val); -} + if (mean_data != nullptr) { + // ONNX spec doesn't support 'double' for 'U' so when 'T' == double, 'U' == float and we need to narrow + mean_data[task_idx] = MLFloat16(mean); + } -template -ORT_FORCEINLINE constexpr double ConvertToMLFloat16IfNeeded(double val) { - return val; + if (inv_std_dev_data != nullptr) { + inv_std_dev_data[task_idx] = MLFloat16(1 / mean_square); + } } } // namespace @@ -180,82 +224,8 @@ Status LayerNormImpl::ComputeWithoutContext( concurrency::ThreadPool::TryBatchParallelFor( thread_pool, static_cast(norm_count), [&](ptrdiff_t task_idx) { - const T* p_input = X_data + task_idx * norm_size; - T* p_output = Y_data + task_idx * norm_size; - - using DoubleOrFloat = typename std::conditional< - std::is_same::value, // If T is double - double, // Use double - float // Otherwise, use float (covers float and MLFloat16) - >::type; - - DoubleOrFloat mean(0.0f); - DoubleOrFloat mean_square(0.0f); - - std::shared_ptr> float_input = ConvertMLFloat16ToFloatBufferIfNeeded( - p_input, static_cast(norm_size)); - const DoubleOrFloat* converted_input = - float_input == nullptr - ? reinterpret_cast(p_input) - : reinterpret_cast(&(*float_input)[0]); - - // If T is float or double, then output_buffer will be the same as p_output, so we don't allocate new memory. - // If T is MLFloat16, then we allocate norm_size floats in output_buffer. - DoubleOrFloat* output_buffer = static_cast( - OnlyCreateBufferIfMLFloat16(p_output, static_cast(norm_size))); - - for (int64_t h = 0; h < norm_size; h++) { - output_buffer[h] = converted_input[h]; - mean += converted_input[h]; - mean_square += converted_input[h] * converted_input[h]; - } - - mean = mean / norm_size; - if (simplified) { - mean_square = sqrt(mean_square / norm_size + epsilon); - } else { - mean_square = sqrt(mean_square / norm_size - mean * mean + epsilon); - } - - std::shared_ptr> float_scale = ConvertMLFloat16ToFloatBufferIfNeeded( - scale_data, static_cast(norm_size)); - const DoubleOrFloat* converted_scale = - float_scale == nullptr - ? reinterpret_cast(scale_data) - : reinterpret_cast(&(*float_scale)[0]); - std::shared_ptr> float_bias = ConvertMLFloat16ToFloatBufferIfNeeded( - bias_data, static_cast(norm_size)); - const DoubleOrFloat* converted_bias = - float_bias == nullptr - ? 
reinterpret_cast(bias_data) - : reinterpret_cast(&(*float_bias)[0]); - - for (int64_t h = 0; h < norm_size; h++) { - if (simplified) { - output_buffer[h] = output_buffer[h] / mean_square * converted_scale[h]; - } else if (nullptr == bias_data) { - output_buffer[h] = (output_buffer[h] - mean) / mean_square * converted_scale[h]; - } else { - output_buffer[h] = (output_buffer[h] - mean) / mean_square * converted_scale[h] + converted_bias[h]; - } - } - - if (std::is_same_v) { - ConvertFloatBufferToMLFloat16( - reinterpret_cast(output_buffer), - reinterpret_cast(p_output), - static_cast(norm_size)); - delete[] output_buffer; - } - - if (mean_data != nullptr) { - // ONNX spec doesn't support 'double' for 'U' so when 'T' == double, 'U' == float and we need to narrow - mean_data[task_idx] = ConvertToMLFloat16IfNeeded(ConvertToFloatIfNeeded(mean)); - } - - if (inv_std_dev_data != nullptr) { - inv_std_dev_data[task_idx] = ConvertToMLFloat16IfNeeded(ConvertToFloatIfNeeded(1 / mean_square)); - } + ComputeJob(X_data, scale_data, bias_data, task_idx, norm_size, epsilon, simplified, + Y_data, mean_data, inv_std_dev_data); }, 0); From fd904f6951dfe10c77ba209c9d3c8da193bbf1d9 Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Wed, 2 Oct 2024 01:24:43 -0700 Subject: [PATCH 19/36] don't use vectors --- .../contrib_ops/cpu/skip_layer_norm.cc | 44 +++++++++++-------- .../core/providers/cpu/nn/layer_norm_impl.cc | 30 +++++++------ 2 files changed, 43 insertions(+), 31 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index 66ca8c4dfd37f..3a1ff01d870b3 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -118,19 +118,24 @@ void ComputeJob( float mean(0.0f); float mean_square(0.0f); - std::vector float_input(hidden_size); - MlasConvertHalfToFloatBuffer(p_input, &float_input[0], hidden_size); - std::vector float_skip(hidden_size); - MlasConvertHalfToFloatBuffer(p_skip, &float_skip[0], hidden_size); - std::vector float_bias; + const size_t num_elems = static_cast(hidden_size); + float* float_output = new float[num_elems]; + float* float_input = new float[num_elems]; + float* float_skip = new float[num_elems]; + float* float_gamma = new float[num_elems]; + float* float_beta = new float[num_elems]; + float* float_bias = nullptr; if (bias_data != nullptr) { - float_bias.resize(hidden_size); - MlasConvertHalfToFloatBuffer(bias_data, &float_bias[0], hidden_size); + float_bias = new float[num_elems]; + MlasConvertHalfToFloatBuffer(bias_data, float_bias, num_elems); } + MlasConvertFloatToHalfBuffer(float_output, p_output, num_elems); + MlasConvertHalfToFloatBuffer(p_input, float_input, num_elems); + MlasConvertHalfToFloatBuffer(p_skip, float_skip, num_elems); + MlasConvertHalfToFloatBuffer(gamma_data, float_gamma, num_elems); + MlasConvertHalfToFloatBuffer(beta_data, float_beta, num_elems); - std::vector float_output(hidden_size); - - for (decltype(hidden_size) h = 0; h < hidden_size; h++) { + for (size_t h = 0; h < num_elems; h++) { float val = float_input[h] + float_skip[h]; if (nullptr != bias_data) { @@ -143,7 +148,7 @@ void ComputeJob( } if (nullptr != p_skip_input_bias_add_output) { - MlasConvertFloatToHalfBuffer(&float_output[0], p_skip_input_bias_add_output, hidden_size); + MlasConvertFloatToHalfBuffer(float_output, p_skip_input_bias_add_output, num_elems); } mean = mean / hidden_size; @@ -153,12 +158,7 @@ void ComputeJob( mean_square = sqrt(mean_square / hidden_size - mean * mean + 
epsilon); } - std::vector float_gamma(hidden_size); - MlasConvertHalfToFloatBuffer(gamma_data, &float_gamma[0], hidden_size); - std::vector float_beta(hidden_size); - MlasConvertHalfToFloatBuffer(beta_data, &float_beta[0], hidden_size); - - for (decltype(hidden_size) h = 0; h < hidden_size; h++) { + for (size_t h = 0; h < num_elems; h++) { if (simplified) { float_output[h] = float_output[h] / mean_square * float_gamma[h]; } else if (nullptr == beta_data) { @@ -168,7 +168,15 @@ void ComputeJob( } } - MlasConvertFloatToHalfBuffer(&float_output[0], p_output, hidden_size); + MlasConvertFloatToHalfBuffer(float_output, p_output, num_elems); + delete[] float_output; + delete[] float_input; + delete[] float_skip; + delete[] float_gamma; + delete[] float_beta; + if (float_bias != nullptr) { + delete[] float_bias; + } } } // namespace diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc index 32427b2c39db1..8787fc2cb8085 100644 --- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc +++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc @@ -86,11 +86,17 @@ void ComputeJob( float mean(0.0f); float mean_square(0.0f); - std::vector float_input(norm_size); - MlasConvertHalfToFloatBuffer(p_input, &float_input[0], norm_size); - - std::vector float_output(norm_size); - for (int64_t h = 0; h < norm_size; h++) { + const size_t num_elems = static_cast(norm_size); + float* float_input = new float[num_elems]; + float* float_scale = new float[num_elems]; + float* float_bias = new float[num_elems]; + float* float_output = new float[num_elems]; + MlasConvertHalfToFloatBuffer(p_input, float_input, num_elems); + MlasConvertHalfToFloatBuffer(scale_data, float_scale, num_elems); + MlasConvertHalfToFloatBuffer(bias_data, float_bias, num_elems); + MlasConvertFloatToHalfBuffer(float_output, p_output, num_elems); + + for (size_t h = 0; h < num_elems; h++) { float_output[h] = float_input[h]; mean += float_input[h]; mean_square += float_input[h] * float_input[h]; @@ -103,12 +109,7 @@ void ComputeJob( mean_square = sqrt(mean_square / norm_size - mean * mean + epsilon); } - std::vector float_scale(norm_size); - MlasConvertHalfToFloatBuffer(scale_data, &float_scale[0], norm_size); - std::vector float_bias(norm_size); - MlasConvertHalfToFloatBuffer(bias_data, &float_bias[0], norm_size); - - for (int64_t h = 0; h < norm_size; h++) { + for (size_t h = 0; h < num_elems; h++) { if (simplified) { float_output[h] = float_output[h] / mean_square * float_scale[h]; } else if (nullptr == bias_data) { @@ -118,8 +119,6 @@ void ComputeJob( } } - MlasConvertFloatToHalfBuffer(&float_output[0], p_output, static_cast(norm_size)); - if (mean_data != nullptr) { // ONNX spec doesn't support 'double' for 'U' so when 'T' == double, 'U' == float and we need to narrow mean_data[task_idx] = MLFloat16(mean); @@ -128,6 +127,11 @@ void ComputeJob( if (inv_std_dev_data != nullptr) { inv_std_dev_data[task_idx] = MLFloat16(1 / mean_square); } + + delete[] float_input; + delete[] float_output; + delete[] float_scale; + delete[] float_bias; } } // namespace From a41b802bb4ab10721590da7df88ecebb83ed776a Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Wed, 2 Oct 2024 03:44:55 -0700 Subject: [PATCH 20/36] reuse allocated arrays when possible --- .../contrib_ops/cpu/skip_layer_norm.cc | 27 +++++++++---------- .../core/providers/cpu/nn/layer_norm_impl.cc | 21 +++++++-------- 2 files changed, 22 insertions(+), 26 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc 
b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index 3a1ff01d870b3..4f3b49c0a7250 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -119,22 +119,17 @@ void ComputeJob( float mean_square(0.0f); const size_t num_elems = static_cast(hidden_size); - float* float_output = new float[num_elems]; float* float_input = new float[num_elems]; + MlasConvertHalfToFloatBuffer(p_input, float_input, num_elems); float* float_skip = new float[num_elems]; - float* float_gamma = new float[num_elems]; - float* float_beta = new float[num_elems]; + MlasConvertHalfToFloatBuffer(p_skip, float_skip, num_elems); float* float_bias = nullptr; if (bias_data != nullptr) { float_bias = new float[num_elems]; MlasConvertHalfToFloatBuffer(bias_data, float_bias, num_elems); } - MlasConvertFloatToHalfBuffer(float_output, p_output, num_elems); - MlasConvertHalfToFloatBuffer(p_input, float_input, num_elems); - MlasConvertHalfToFloatBuffer(p_skip, float_skip, num_elems); - MlasConvertHalfToFloatBuffer(gamma_data, float_gamma, num_elems); - MlasConvertHalfToFloatBuffer(beta_data, float_beta, num_elems); + float* float_output = new float[num_elems]; for (size_t h = 0; h < num_elems; h++) { float val = float_input[h] + float_skip[h]; @@ -146,6 +141,9 @@ void ComputeJob( mean += val; mean_square += val * val; } + if (float_bias != nullptr) { + delete[] float_bias; + } if (nullptr != p_skip_input_bias_add_output) { MlasConvertFloatToHalfBuffer(float_output, p_skip_input_bias_add_output, num_elems); @@ -158,6 +156,10 @@ void ComputeJob( mean_square = sqrt(mean_square / hidden_size - mean * mean + epsilon); } + float* float_gamma = float_input; // overwrite float_input with gamma values, since they have the same size + MlasConvertHalfToFloatBuffer(gamma_data, float_gamma, num_elems); + float* float_beta = float_skip; // overwrite float_input with beta values, since they have the same size + MlasConvertHalfToFloatBuffer(beta_data, float_beta, num_elems); for (size_t h = 0; h < num_elems; h++) { if (simplified) { float_output[h] = float_output[h] / mean_square * float_gamma[h]; @@ -167,16 +169,11 @@ void ComputeJob( float_output[h] = (float_output[h] - mean) / mean_square * float_gamma[h] + float_beta[h]; } } + delete[] float_gamma; // also deletes float_input + delete[] float_beta; // also deletes float_skip MlasConvertFloatToHalfBuffer(float_output, p_output, num_elems); delete[] float_output; - delete[] float_input; - delete[] float_skip; - delete[] float_gamma; - delete[] float_beta; - if (float_bias != nullptr) { - delete[] float_bias; - } } } // namespace diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc index 8787fc2cb8085..1c40071d60f7c 100644 --- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc +++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc @@ -88,14 +88,9 @@ void ComputeJob( const size_t num_elems = static_cast(norm_size); float* float_input = new float[num_elems]; - float* float_scale = new float[num_elems]; - float* float_bias = new float[num_elems]; - float* float_output = new float[num_elems]; MlasConvertHalfToFloatBuffer(p_input, float_input, num_elems); - MlasConvertHalfToFloatBuffer(scale_data, float_scale, num_elems); - MlasConvertHalfToFloatBuffer(bias_data, float_bias, num_elems); - MlasConvertFloatToHalfBuffer(float_output, p_output, num_elems); + float* float_output = new float[num_elems]; for (size_t h = 0; h < num_elems; h++) { float_output[h] = float_input[h]; 
mean += float_input[h]; @@ -109,6 +104,10 @@ void ComputeJob( mean_square = sqrt(mean_square / norm_size - mean * mean + epsilon); } + float* float_scale = float_input; // overwrite float_input with scale values, since they have the same size + MlasConvertHalfToFloatBuffer(scale_data, float_scale, num_elems); + float* float_bias = new float[num_elems]; + MlasConvertHalfToFloatBuffer(bias_data, float_bias, num_elems); for (size_t h = 0; h < num_elems; h++) { if (simplified) { float_output[h] = float_output[h] / mean_square * float_scale[h]; @@ -118,6 +117,11 @@ void ComputeJob( float_output[h] = (float_output[h] - mean) / mean_square * float_scale[h] + float_bias[h]; } } + delete[] float_scale; // also deletes float_input + delete[] float_bias; + + MlasConvertFloatToHalfBuffer(float_output, p_output, num_elems); + delete[] float_output; if (mean_data != nullptr) { // ONNX spec doesn't support 'double' for 'U' so when 'T' == double, 'U' == float and we need to narrow @@ -127,11 +131,6 @@ void ComputeJob( if (inv_std_dev_data != nullptr) { inv_std_dev_data[task_idx] = MLFloat16(1 / mean_square); } - - delete[] float_input; - delete[] float_output; - delete[] float_scale; - delete[] float_bias; } } // namespace From 6aece952b6919b8f07e05b60f8294fc546904ca9 Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Wed, 2 Oct 2024 04:37:06 -0700 Subject: [PATCH 21/36] make_unique instead of new --- .../contrib_ops/cpu/skip_layer_norm.cc | 32 ++++++++----------- .../core/providers/cpu/nn/layer_norm_impl.cc | 17 ++++------ 2 files changed, 21 insertions(+), 28 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index 4f3b49c0a7250..18e3d2ab29df3 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -119,17 +119,19 @@ void ComputeJob( float mean_square(0.0f); const size_t num_elems = static_cast(hidden_size); - float* float_input = new float[num_elems]; - MlasConvertHalfToFloatBuffer(p_input, float_input, num_elems); - float* float_skip = new float[num_elems]; - MlasConvertHalfToFloatBuffer(p_skip, float_skip, num_elems); - float* float_bias = nullptr; + + std::unique_ptr float_input = std::make_unique(num_elems); + MlasConvertHalfToFloatBuffer(p_input, float_input.get(), num_elems); + + std::unique_ptr float_skip = std::make_unique(num_elems); + MlasConvertHalfToFloatBuffer(p_skip, float_skip.get(), num_elems); + std::unique_ptr float_bias = nullptr; if (bias_data != nullptr) { - float_bias = new float[num_elems]; - MlasConvertHalfToFloatBuffer(bias_data, float_bias, num_elems); + float_bias = std::make_unique(num_elems); + MlasConvertHalfToFloatBuffer(bias_data, float_bias.get(), num_elems); } - float* float_output = new float[num_elems]; + std::unique_ptr float_output = std::make_unique(num_elems); for (size_t h = 0; h < num_elems; h++) { float val = float_input[h] + float_skip[h]; @@ -141,12 +143,9 @@ void ComputeJob( mean += val; mean_square += val * val; } - if (float_bias != nullptr) { - delete[] float_bias; - } if (nullptr != p_skip_input_bias_add_output) { - MlasConvertFloatToHalfBuffer(float_output, p_skip_input_bias_add_output, num_elems); + MlasConvertFloatToHalfBuffer(float_output.get(), p_skip_input_bias_add_output, num_elems); } mean = mean / hidden_size; @@ -156,9 +155,9 @@ void ComputeJob( mean_square = sqrt(mean_square / hidden_size - mean * mean + epsilon); } - float* float_gamma = float_input; // overwrite float_input with gamma values, since they have the 
same size + float* float_gamma = float_input.get(); // overwrite float_input with gamma values, since they have the same size MlasConvertHalfToFloatBuffer(gamma_data, float_gamma, num_elems); - float* float_beta = float_skip; // overwrite float_input with beta values, since they have the same size + float* float_beta = float_skip.get(); // overwrite float_skip with beta values, since they have the same size MlasConvertHalfToFloatBuffer(beta_data, float_beta, num_elems); for (size_t h = 0; h < num_elems; h++) { if (simplified) { @@ -169,11 +168,8 @@ void ComputeJob( float_output[h] = (float_output[h] - mean) / mean_square * float_gamma[h] + float_beta[h]; } } - delete[] float_gamma; // also deletes float_input - delete[] float_beta; // also deletes float_skip - MlasConvertFloatToHalfBuffer(float_output, p_output, num_elems); - delete[] float_output; + MlasConvertFloatToHalfBuffer(float_output.get(), p_output, num_elems); } } // namespace diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc index 1c40071d60f7c..3654059cbe21b 100644 --- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc +++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc @@ -87,10 +87,10 @@ void ComputeJob( float mean_square(0.0f); const size_t num_elems = static_cast(norm_size); - float* float_input = new float[num_elems]; - MlasConvertHalfToFloatBuffer(p_input, float_input, num_elems); + std::unique_ptr float_input = std::make_unique(num_elems); + MlasConvertHalfToFloatBuffer(p_input, float_input.get(), num_elems); - float* float_output = new float[num_elems]; + std::unique_ptr float_output = std::make_unique(num_elems); for (size_t h = 0; h < num_elems; h++) { float_output[h] = float_input[h]; mean += float_input[h]; @@ -104,10 +104,10 @@ void ComputeJob( mean_square = sqrt(mean_square / norm_size - mean * mean + epsilon); } - float* float_scale = float_input; // overwrite float_input with scale values, since they have the same size + float* float_scale = float_input.get(); // overwrite float_input with scale values, since they have the same size MlasConvertHalfToFloatBuffer(scale_data, float_scale, num_elems); - float* float_bias = new float[num_elems]; - MlasConvertHalfToFloatBuffer(bias_data, float_bias, num_elems); + std::unique_ptr float_bias = std::make_unique(num_elems); + MlasConvertHalfToFloatBuffer(bias_data, float_bias.get(), num_elems); for (size_t h = 0; h < num_elems; h++) { if (simplified) { float_output[h] = float_output[h] / mean_square * float_scale[h]; @@ -117,11 +117,8 @@ void ComputeJob( float_output[h] = (float_output[h] - mean) / mean_square * float_scale[h] + float_bias[h]; } } - delete[] float_scale; // also deletes float_input - delete[] float_bias; - MlasConvertFloatToHalfBuffer(float_output, p_output, num_elems); - delete[] float_output; + MlasConvertFloatToHalfBuffer(float_output.get(), p_output, num_elems); if (mean_data != nullptr) { // ONNX spec doesn't support 'double' for 'U' so when 'T' == double, 'U' == float and we need to narrow From 766c4b230f1425f3e1ab6506ba2248383b203377 Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Wed, 2 Oct 2024 04:37:17 -0700 Subject: [PATCH 22/36] Revert "make_unique instead of new" for latency This reverts commit 6aece952b6919b8f07e05b60f8294fc546904ca9. 
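Rationale (assuming standard C++ allocation semantics): std::make_unique<float[]>(n)
value-initializes the array, i.e. it behaves like new float[n]() and zero-fills
memory that the very next MlasConvertHalfToFloatBuffer call overwrites anyway,
whereas plain new float[n] default-initializes and skips the fill. For the large
per-row scratch buffers in these kernels that extra pass is measurable in the
microbenchmark. A sketch of the difference:

    auto zeroed = std::make_unique<float[]>(n);   // ~ new float[n]()  -> zero-fill pass
    std::unique_ptr<float[]> raw(new float[n]);   // default-init      -> no fill
    // C++20's std::make_unique_for_overwrite<float[]>(n) would also skip the
    // fill, if the toolchain allowed it.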
---
 .../contrib_ops/cpu/skip_layer_norm.cc        | 32 +++++++++++--------
 .../core/providers/cpu/nn/layer_norm_impl.cc  | 17 ++++++----
 2 files changed, 28 insertions(+), 21 deletions(-)

diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc
index 18e3d2ab29df3..4f3b49c0a7250 100644
--- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc
+++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc
@@ -119,19 +119,17 @@ void ComputeJob(
   float mean_square(0.0f);
 
   const size_t num_elems = static_cast<size_t>(hidden_size);
-
-  std::unique_ptr<float[]> float_input = std::make_unique<float[]>(num_elems);
-  MlasConvertHalfToFloatBuffer(p_input, float_input.get(), num_elems);
-
-  std::unique_ptr<float[]> float_skip = std::make_unique<float[]>(num_elems);
-  MlasConvertHalfToFloatBuffer(p_skip, float_skip.get(), num_elems);
-  std::unique_ptr<float[]> float_bias = nullptr;
+  float* float_input = new float[num_elems];
+  MlasConvertHalfToFloatBuffer(p_input, float_input, num_elems);
+  float* float_skip = new float[num_elems];
+  MlasConvertHalfToFloatBuffer(p_skip, float_skip, num_elems);
+  float* float_bias = nullptr;
   if (bias_data != nullptr) {
-    float_bias = std::make_unique<float[]>(num_elems);
-    MlasConvertHalfToFloatBuffer(bias_data, float_bias.get(), num_elems);
+    float_bias = new float[num_elems];
+    MlasConvertHalfToFloatBuffer(bias_data, float_bias, num_elems);
   }
 
-  std::unique_ptr<float[]> float_output = std::make_unique<float[]>(num_elems);
+  float* float_output = new float[num_elems];
 
   for (size_t h = 0; h < num_elems; h++) {
     float val = float_input[h] + float_skip[h];
@@ -143,9 +141,12 @@ void ComputeJob(
     mean += val;
     mean_square += val * val;
   }
+  if (float_bias != nullptr) {
+    delete[] float_bias;
+  }
 
   if (nullptr != p_skip_input_bias_add_output) {
-    MlasConvertFloatToHalfBuffer(float_output.get(), p_skip_input_bias_add_output, num_elems);
+    MlasConvertFloatToHalfBuffer(float_output, p_skip_input_bias_add_output, num_elems);
   }
 
   mean = mean / hidden_size;
@@ -155,9 +156,9 @@ void ComputeJob(
     mean_square = sqrt(mean_square / hidden_size - mean * mean + epsilon);
   }
 
-  float* float_gamma = float_input.get(); // overwrite float_input with gamma values, since they have the same size
+  float* float_gamma = float_input; // overwrite float_input with gamma values, since they have the same size
   MlasConvertHalfToFloatBuffer(gamma_data, float_gamma, num_elems);
-  float* float_beta = float_skip.get(); // overwrite float_skip with beta values, since they have the same size
+  float* float_beta = float_skip; // overwrite float_input with beta values, since they have the same size
   MlasConvertHalfToFloatBuffer(beta_data, float_beta, num_elems);
   for (size_t h = 0; h < num_elems; h++) {
     if (simplified) {
@@ -168,8 +169,11 @@ void ComputeJob(
       float_output[h] = (float_output[h] - mean) / mean_square * float_gamma[h] + float_beta[h];
     }
   }
+  delete[] float_gamma; // also deletes float_input
+  delete[] float_beta;  // also deletes float_skip
 
-  MlasConvertFloatToHalfBuffer(float_output.get(), p_output, num_elems);
+  MlasConvertFloatToHalfBuffer(float_output, p_output, num_elems);
+  delete[] float_output;
 }
 
 }  // namespace
diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc
index 3654059cbe21b..1c40071d60f7c 100644
--- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc
+++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc
@@ -87,10 +87,10 @@ void ComputeJob(
   float mean_square(0.0f);
 
   const size_t num_elems = static_cast<size_t>(norm_size);
-  std::unique_ptr<float[]> float_input = std::make_unique<float[]>(num_elems);
-  MlasConvertHalfToFloatBuffer(p_input, float_input.get(), num_elems);
+  float* float_input = new float[num_elems];
+  MlasConvertHalfToFloatBuffer(p_input, float_input, num_elems);
 
-  std::unique_ptr<float[]> float_output = std::make_unique<float[]>(num_elems);
+  float* float_output = new float[num_elems];
   for (size_t h = 0; h < num_elems; h++) {
     float_output[h] = float_input[h];
     mean += float_input[h];
@@ -104,10 +104,10 @@ void ComputeJob(
     mean_square = sqrt(mean_square / norm_size - mean * mean + epsilon);
   }
 
-  float* float_scale = float_input.get(); // overwrite float_input with scale values, since they have the same size
+  float* float_scale = float_input; // overwrite float_input with scale values, since they have the same size
   MlasConvertHalfToFloatBuffer(scale_data, float_scale, num_elems);
-  std::unique_ptr<float[]> float_bias = std::make_unique<float[]>(num_elems);
-  MlasConvertHalfToFloatBuffer(bias_data, float_bias.get(), num_elems);
+  float* float_bias = new float[num_elems];
+  MlasConvertHalfToFloatBuffer(bias_data, float_bias, num_elems);
   for (size_t h = 0; h < num_elems; h++) {
     if (simplified) {
       float_output[h] = float_output[h] / mean_square * float_scale[h];
@@ -117,8 +117,11 @@ void ComputeJob(
       float_output[h] = (float_output[h] - mean) / mean_square * float_scale[h] + float_bias[h];
     }
   }
+  delete[] float_scale; // also deletes float_input
+  delete[] float_bias;
 
-  MlasConvertFloatToHalfBuffer(float_output.get(), p_output, num_elems);
+  MlasConvertFloatToHalfBuffer(float_output, p_output, num_elems);
+  delete[] float_output;
 
   if (mean_data != nullptr) {
     // ONNX spec doesn't support 'double' for 'U' so when 'T' == double, 'U' == float and we need to narrow

From cb55d4bdfb3ea1038cfb6a2f1ee519abdb356071 Mon Sep 17 00:00:00 2001
From: Alex Marin
Date: Wed, 2 Oct 2024 04:41:58 -0700
Subject: [PATCH 23/36] lint

---
 onnxruntime/contrib_ops/cpu/skip_layer_norm.cc       | 8 ++++----
 onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc | 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc
index 4f3b49c0a7250..9dea120949bdf 100644
--- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc
+++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc
@@ -156,9 +156,9 @@ void ComputeJob(
     mean_square = sqrt(mean_square / hidden_size - mean * mean + epsilon);
   }
 
-  float* float_gamma = float_input; // overwrite float_input with gamma values, since they have the same size
+  float* float_gamma = float_input;  // overwrite float_input with gamma values, since they have the same size
   MlasConvertHalfToFloatBuffer(gamma_data, float_gamma, num_elems);
-  float* float_beta = float_skip; // overwrite float_input with beta values, since they have the same size
+  float* float_beta = float_skip;  // overwrite float_skip with beta values, since they have the same size
   MlasConvertHalfToFloatBuffer(beta_data, float_beta, num_elems);
   for (size_t h = 0; h < num_elems; h++) {
     if (simplified) {
@@ -169,8 +169,8 @@ void ComputeJob(
       float_output[h] = (float_output[h] - mean) / mean_square * float_gamma[h] + float_beta[h];
     }
   }
-  delete[] float_gamma; // also deletes float_input
-  delete[] float_beta;  // also deletes float_skip
+  delete[] float_gamma;  // also deletes float_input
+  delete[] float_beta;   // also deletes float_skip
 
   MlasConvertFloatToHalfBuffer(float_output, p_output, num_elems);
   delete[] float_output;
diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc
index 1c40071d60f7c..44e1ee9c078bf 100644
--- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc
+++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc
@@ -104,7 +104,7 @@ void ComputeJob(
     mean_square = sqrt(mean_square / norm_size - mean * mean + epsilon);
   }
 
-  float* float_scale = float_input; // overwrite float_input with scale values, since they have the same size
+  float* float_scale = float_input;  // overwrite float_input with scale values, since they have the same size
   MlasConvertHalfToFloatBuffer(scale_data, float_scale, num_elems);
   float* float_bias = new float[num_elems];
   MlasConvertHalfToFloatBuffer(bias_data, float_bias, num_elems);
@@ -117,7 +117,7 @@ void ComputeJob(
       float_output[h] = (float_output[h] - mean) / mean_square * float_scale[h] + float_bias[h];
     }
   }
-  delete[] float_scale; // also deletes float_input
+  delete[] float_scale;  // also deletes float_input
   delete[] float_bias;
 
   MlasConvertFloatToHalfBuffer(float_output, p_output, num_elems);
@@ -228,7 +228,7 @@ Status LayerNormImpl::ComputeWithoutContext(
       thread_pool, static_cast<int32_t>(norm_count),
       [&](ptrdiff_t task_idx) {
         ComputeJob(X_data, scale_data, bias_data, task_idx, norm_size, epsilon, simplified,
-            Y_data, mean_data, inv_std_dev_data);
+                   Y_data, mean_data, inv_std_dev_data);
       },
       0);

From 2895f37f2917282551bd1bf823fcdd794f2ec4fb Mon Sep 17 00:00:00 2001
From: Alex Marin
Date: Wed, 2 Oct 2024 08:31:42 -0700
Subject: [PATCH 24/36] fix bug

---
 onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc
index 44e1ee9c078bf..2fb53e04f23e4 100644
--- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc
+++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc
@@ -170,9 +170,9 @@ Status LayerNormImpl::ComputeImpl(OpKernelContext* p_ctx, int64_t orig_axis, flo
   }
 
   int output_index = 1;
-  Tensor* mean = p_ctx->Output(output_index++, TensorShape(mean_inv_std_dev_dim));
   U* mean_data = nullptr;
-  if (mean != nullptr) {
+  if (!simplified) {
+    Tensor* mean = p_ctx->Output(output_index++, TensorShape(mean_inv_std_dev_dim));
     mean_data = mean->MutableData<U>();
   }

From f93ccb7cb6bee2e268745d224b9f1ad1f4e9efe6 Mon Sep 17 00:00:00 2001
From: Alex Marin
Date: Wed, 2 Oct 2024 09:28:19 -0700
Subject: [PATCH 25/36] fix bug

---
 onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc
index 2fb53e04f23e4..71dd5ab803263 100644
--- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc
+++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc
@@ -173,7 +173,9 @@ Status LayerNormImpl::ComputeImpl(OpKernelContext* p_ctx, int64_t orig_axis, flo
   U* mean_data = nullptr;
   if (!simplified) {
     Tensor* mean = p_ctx->Output(output_index++, TensorShape(mean_inv_std_dev_dim));
-    mean_data = mean->MutableData<U>();
+    if (mean != nullptr) {
+      mean_data = mean->MutableData<U>();
+    }
   }

From 4be02551a5e708582ce0f562aeb39a10cd59776b Mon Sep 17 00:00:00 2001
From: Alex Marin
Date: Thu, 3 Oct 2024 10:36:43 -0700
Subject: [PATCH 26/36] handle errors

---
 .../contrib_ops/cpu/skip_layer_norm.cc        | 107 ++++++++++++++----
 .../core/providers/cpu/nn/layer_norm_impl.cc  |  72 ++++++++++--
 2 files changed, 146 insertions(+), 33 deletions(-)

diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc
b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index 9dea120949bdf..fa1315d90e1b6 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -40,7 +40,7 @@ REGISTER_KERNEL_TYPED(MLFloat16) namespace { template <typename T, typename = std::enable_if_t<std::is_same_v<T, float> || std::is_same_v<T, double>, void>> -void ComputeJob( +Status ComputeJob( const T* input_data, const T* skip_data, const T* gamma_data, @@ -94,9 +94,11 @@ void ComputeJob( p_output[h] = (p_output[h] - mean) / mean_square * gamma_data[h] + beta_data[h]; } } + + return Status::OK(); } -void ComputeJob( +Status ComputeJob( const MLFloat16* input_data, const MLFloat16* skip_data, const MLFloat16* gamma_data, @@ -117,23 +119,45 @@ void ComputeJob( float mean(0.0f); float mean_square(0.0f); - const size_t num_elems = static_cast<size_t>(hidden_size); - float* float_input = new float[num_elems]; - MlasConvertHalfToFloatBuffer(p_input, float_input, num_elems); - float* float_skip = new float[num_elems]; - MlasConvertHalfToFloatBuffer(p_skip, float_skip, num_elems); + + float* float_input = nullptr; + try { + float_input = new float[num_elems]; + MlasConvertHalfToFloatBuffer(p_input, float_input, num_elems); + } catch (const std::exception& e) { + return ORT_MAKE_STATUS(ONNXRUNTIME, RUNTIME_EXCEPTION, "Failed to convert input data to float: ", e.what()); + } + + float* float_skip = nullptr; + try { + float_skip = new float[num_elems]; + MlasConvertHalfToFloatBuffer(p_skip, float_skip, num_elems); + } catch (const std::exception& e) { + return ORT_MAKE_STATUS(ONNXRUNTIME, RUNTIME_EXCEPTION, "Failed to convert skip data to float: ", e.what()); + } + float* float_bias = nullptr; if (bias_data != nullptr) { - float_bias = new float[num_elems]; - MlasConvertHalfToFloatBuffer(bias_data, float_bias, num_elems); + try { + float_bias = new float[num_elems]; + MlasConvertHalfToFloatBuffer(bias_data, float_bias, num_elems); + } catch (const std::exception& e) { + return ORT_MAKE_STATUS(ONNXRUNTIME, RUNTIME_EXCEPTION, "Failed to convert bias data to float: ", e.what()); + } + } + + float* float_output = nullptr; + try { + float_output = new float[num_elems]; + } catch (const std::exception& e) { + return ORT_MAKE_STATUS(ONNXRUNTIME, RUNTIME_EXCEPTION, "Failed to allocate memory for float output.", e.what()); + } - float* float_output = new float[num_elems]; for (size_t h = 0; h < num_elems; h++) { float val = float_input[h] + float_skip[h]; - if (nullptr != bias_data) { + if (nullptr != float_bias) { val += float_bias[h]; } @@ -141,12 +165,17 @@ void ComputeJob( mean += val; mean_square += val * val; } + if (float_bias != nullptr) { delete[] float_bias; } if (nullptr != p_skip_input_bias_add_output) { - MlasConvertFloatToHalfBuffer(float_output, p_skip_input_bias_add_output, num_elems); + try { + MlasConvertFloatToHalfBuffer(float_output, p_skip_input_bias_add_output, num_elems); + } catch (const std::exception& e) { + return ORT_MAKE_STATUS(ONNXRUNTIME, RUNTIME_EXCEPTION, "Failed to convert skip_input_bias_add_output data to MLFloat16: ", e.what()); + } } mean = mean / hidden_size; @@ -157,23 +186,43 @@ void ComputeJob( mean_square = sqrt(mean_square / hidden_size - mean * mean + epsilon); } float* float_gamma = float_input; // overwrite float_input with gamma values, since they have the same size - MlasConvertHalfToFloatBuffer(gamma_data, float_gamma, num_elems); - float* float_beta = float_skip; // overwrite float_skip with beta values, since they have the same size - MlasConvertHalfToFloatBuffer(beta_data, float_beta, num_elems); + try { + MlasConvertHalfToFloatBuffer(gamma_data, float_gamma, num_elems); + } catch (const
std::exception& e) { + return ORT_MAKE_STATUS(ONNXRUNTIME, RUNTIME_EXCEPTION, "Failed to convert gamma data to float: ", e.what()); + } + + float* float_beta = nullptr; // overwrite float_skip with beta values, since they have the same size + if (beta_data) { + float_beta = float_skip; + try { + MlasConvertHalfToFloatBuffer(beta_data, float_beta, num_elems); + } catch (const std::exception& e) { + return ORT_MAKE_STATUS(ONNXRUNTIME, RUNTIME_EXCEPTION, "Failed to convert beta data to float: ", e.what()); + } + } + for (size_t h = 0; h < num_elems; h++) { if (simplified) { float_output[h] = float_output[h] / mean_square * float_gamma[h]; - } else if (nullptr == beta_data) { + } else if (nullptr == float_beta) { float_output[h] = (float_output[h] - mean) / mean_square * float_gamma[h]; } else { float_output[h] = (float_output[h] - mean) / mean_square * float_gamma[h] + float_beta[h]; } } delete[] float_gamma; // also deletes float_input - delete[] float_beta; // also deletes float_skip + delete[] float_skip; // also deletes float_beta if used + + try { + MlasConvertFloatToHalfBuffer(float_output, p_output, num_elems); + } catch (const std::exception& e) { + return ORT_MAKE_STATUS(ONNXRUNTIME, RUNTIME_EXCEPTION, "Failed to convert float output data to MLFloat16: ", e.what()); + } - MlasConvertFloatToHalfBuffer(float_output, p_output, num_elems); delete[] float_output; + + return Status::OK(); } } // namespace @@ -211,27 +260,43 @@ Status SkipLayerNorm<T, simplified>::Compute(OpKernelContext* p_ctx) const { int64_t task_count = input->Shape().SizeToDimension(input_dims_size - 1); const T* input_data = input->Data<T>(); + if (!input_data) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "The input data should not be null."); + } const T* skip_data = skip->Data<T>(); + if (!skip_data) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "The skip data should not be null."); + } const T* gamma_data = gamma->Data<T>(); + if (!gamma_data) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "The gamma data should not be null."); + } const T* beta_data = beta == nullptr ? nullptr : beta->Data<T>(); const T* bias_data = bias == nullptr ? nullptr : bias->Data<T>(); T* output_data = output->MutableData<T>(); + if (!output_data) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "The output data pointer should not be null."); + } // For inferencing, we support one more optional output which is the sum of the input and skip tensors T* skip_input_bias_add_output_data = skip_input_bias_add_output == nullptr ?
nullptr : skip_input_bias_add_output->MutableData(); const int64_t& skip_size = skip->Shape().Size(); + auto return_status = Status::OK(); concurrency::ThreadPool::TryBatchParallelFor( p_ctx->GetOperatorThreadPool(), static_cast(task_count), [&](ptrdiff_t task_idx) { - ComputeJob(input_data, skip_data, gamma_data, beta_data, bias_data, task_idx, hidden_size, skip_size, epsilon_, - simplified, output_data, skip_input_bias_add_output_data); + auto status = ComputeJob(input_data, skip_data, gamma_data, beta_data, bias_data, task_idx, hidden_size, + skip_size, epsilon_, simplified, output_data, skip_input_bias_add_output_data); + if (status != Status::OK()) { + return_status = status; + } }, 0); - return Status::OK(); + return return_status; } } // namespace contrib diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc index 71dd5ab803263..4bd042ac59de3 100644 --- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc +++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc @@ -18,7 +18,7 @@ namespace { template || std::is_same_v, void>> -void ComputeJob( +Status ComputeJob( const T* X_data, const T* scale_data, const T* bias_data, @@ -66,10 +66,12 @@ void ComputeJob( if (inv_std_dev_data != nullptr) { inv_std_dev_data[task_idx] = gsl::narrow_cast(1 / mean_square); } + + return Status::OK(); } template -void ComputeJob( +Status ComputeJob( const MLFloat16* X_data, const MLFloat16* scale_data, const MLFloat16* bias_data, @@ -87,10 +89,21 @@ void ComputeJob( float mean_square(0.0f); const size_t num_elems = static_cast(norm_size); - float* float_input = new float[num_elems]; - MlasConvertHalfToFloatBuffer(p_input, float_input, num_elems); + float* float_input = nullptr; + try { + float_input = new float[num_elems]; + MlasConvertHalfToFloatBuffer(p_input, float_input, num_elems); + } catch (const std::exception& e) { + return ORT_MAKE_STATUS(ONNXRUNTIME, RUNTIME_EXCEPTION, "Failed to convert input data to float: ", e.what()); + } + + float* float_output = nullptr; + try { + float_output = new float[num_elems]; + } catch (const std::exception& e) { + return ORT_MAKE_STATUS(ONNXRUNTIME, RUNTIME_EXCEPTION, "Failed to allocate memory for float output.", e.what()); + } - float* float_output = new float[num_elems]; for (size_t h = 0; h < num_elems; h++) { float_output[h] = float_input[h]; mean += float_input[h]; @@ -105,9 +118,22 @@ void ComputeJob( } float* float_scale = float_input; // overwrite float_input with scale values, since they have the same size - MlasConvertHalfToFloatBuffer(scale_data, float_scale, num_elems); - float* float_bias = new float[num_elems]; - MlasConvertHalfToFloatBuffer(bias_data, float_bias, num_elems); + try { + MlasConvertHalfToFloatBuffer(scale_data, float_scale, num_elems); + } catch (const std::exception& e) { + return ORT_MAKE_STATUS(ONNXRUNTIME, RUNTIME_EXCEPTION, "Failed to convert scale data to float: ", e.what()); + } + + float* float_bias = nullptr; + if (bias_data) { + try { + float_bias = new float[num_elems]; + MlasConvertHalfToFloatBuffer(bias_data, float_bias, num_elems); + } catch (const std::exception& e) { + return ORT_MAKE_STATUS(ONNXRUNTIME, RUNTIME_EXCEPTION, "Failed to convert bias data to float: ", e.what()); + } + } + for (size_t h = 0; h < num_elems; h++) { if (simplified) { float_output[h] = float_output[h] / mean_square * float_scale[h]; @@ -118,9 +144,16 @@ void ComputeJob( } } delete[] float_scale; // also deletes float_input - delete[] float_bias; + if (float_bias) { + 
delete[] float_bias; + } + + try { + MlasConvertFloatToHalfBuffer(float_output, p_output, num_elems); + } catch (const std::exception& e) { + return ORT_MAKE_STATUS(ONNXRUNTIME, RUNTIME_EXCEPTION, "Failed to convert float output data to MLFloat16: ", e.what()); + } - MlasConvertFloatToHalfBuffer(float_output, p_output, num_elems); delete[] float_output; if (mean_data != nullptr) { @@ -131,6 +164,8 @@ void ComputeJob( if (inv_std_dev_data != nullptr) { inv_std_dev_data[task_idx] = MLFloat16(1 / mean_square); } + + return Status::OK(); } } // namespace @@ -148,7 +183,13 @@ Status LayerNormImpl::ComputeImpl(OpKernelContext* p_ctx, int64_t orig_axis, flo const Tensor* scale = p_ctx->Input<Tensor>(1); const Tensor* bias = p_ctx->Input<Tensor>(2); const T* X_data = X->Data<T>(); + if (!X_data) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "The input data should not be null."); + } const T* scale_data = scale->Data<T>(); + if (!scale_data) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "The scale data should not be null."); + } const T* bias_data = (simplified || nullptr == bias) ? nullptr : bias->Data<T>(); const TensorShape& x_shape = X->Shape(); @@ -156,6 +197,9 @@ Status LayerNormImpl::ComputeImpl(OpKernelContext* p_ctx, int64_t orig_axis, flo const TensorShape& bias_shape = bias->Shape(); Tensor* Y = p_ctx->Output(0, x_shape); T* Y_data = Y->MutableData<T>(); + if (!Y_data) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "The output data pointer should not be null."); + } const int64_t axis = HandleNegativeAxis(orig_axis, x_shape.NumDimensions());
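A side effect of this patch worth noting: the lambda handed to TryBatchParallelFor writes the captured return_status from multiple worker threads with no synchronization, so concurrent failures race on the shared Status and the stored value is simply whichever write lands last. A minimal thread-safe sketch, assuming only the standard library (the std::mutex and the guard are illustrative, not part of the patch, and require <mutex>):

    // Sketch: serialize failure reporting from parallel workers.
    std::mutex status_mutex;
    Status return_status = Status::OK();
    concurrency::ThreadPool::TryBatchParallelFor(
        p_ctx->GetOperatorThreadPool(), static_cast<int32_t>(task_count),
        [&](ptrdiff_t task_idx) {
          Status status = ComputeJob(/* per-task arguments as above */);
          if (!status.IsOK()) {
            std::lock_guard<std::mutex> guard(status_mutex);  // one writer at a time
            return_status = status;
          }
        },
        0);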
From 48ce9790da1ddb0e39fc1c77f0ba05365f2d9118 Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Thu, 3 Oct 2024 11:13:15 -0700 Subject: [PATCH 27/36] remove checks on tensor data --- onnxruntime/contrib_ops/cpu/skip_layer_norm.cc | 12 ------------ onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc | 9 --------- 2 files changed, 21 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index fa1315d90e1b6..a36cbab755bee 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -260,24 +260,12 @@ Status SkipLayerNorm<T, simplified>::Compute(OpKernelContext* p_ctx) const { int64_t task_count = input->Shape().SizeToDimension(input_dims_size - 1); const T* input_data = input->Data<T>(); - if (!input_data) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "The input data should not be null."); - } const T* skip_data = skip->Data<T>(); - if (!skip_data) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "The skip data should not be null."); - } const T* gamma_data = gamma->Data<T>(); - if (!gamma_data) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "The gamma data should not be null."); - } const T* beta_data = beta == nullptr ? nullptr : beta->Data<T>(); const T* bias_data = bias == nullptr ? nullptr : bias->Data<T>(); T* output_data = output->MutableData<T>(); - if (!output_data) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "The output data pointer should not be null."); - } // For inferencing, we support one more optional output which is the sum of the input and skip tensors T* skip_input_bias_add_output_data = skip_input_bias_add_output == nullptr ? nullptr : skip_input_bias_add_output->MutableData<T>(); diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc index 4bd042ac59de3..f52c1ae4ff730 100644 --- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc +++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc @@ -183,13 +183,7 @@ Status LayerNormImpl::ComputeImpl(OpKernelContext* p_ctx, int64_t orig_axis, flo const Tensor* scale = p_ctx->Input<Tensor>(1); const Tensor* bias = p_ctx->Input<Tensor>(2); const T* X_data = X->Data<T>(); - if (!X_data) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "The input data should not be null."); - } const T* scale_data = scale->Data<T>(); - if (!scale_data) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "The scale data should not be null."); - } const T* bias_data = (simplified || nullptr == bias) ? nullptr : bias->Data<T>(); const TensorShape& x_shape = X->Shape(); @@ -197,9 +191,6 @@ Status LayerNormImpl::ComputeImpl(OpKernelContext* p_ctx, int64_t orig_axis, flo const TensorShape& bias_shape = bias->Shape(); Tensor* Y = p_ctx->Output(0, x_shape); T* Y_data = Y->MutableData<T>(); - if (!Y_data) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "The output data pointer should not be null."); - }
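The removal makes sense because Tensor::Data and Tensor::MutableData on a successfully resolved input/output do not return null in practice, so the checks were dead weight on the hot path. The next patch addresses a different build constraint: ONNX Runtime is routinely compiled with -fno-exceptions, under which a failing plain new[] cannot surface std::bad_alloc and the try/catch blocks never fire. A hedged sketch of how an allocation failure could still be reported as a Status without exceptions, using the nothrow overload (illustrative only; requires <new>, and is not the approach the series ultimately takes):

    // Sketch: exception-free allocation failure check.
    float* float_input = new (std::nothrow) float[num_elems];  // returns nullptr instead of throwing
    if (float_input == nullptr) {
      return ORT_MAKE_STATUS(ONNXRUNTIME, RUNTIME_EXCEPTION, "Failed to allocate ", num_elems, " floats for input conversion.");
    }
    MlasConvertHalfToFloatBuffer(p_input, float_input, num_elems);

The series instead converges on the kernel's temp-space allocator in patches 34-36, which sidesteps raw new[] entirely.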
From 3d6b990e963d2d8f06435664cb013c3c4745d960 Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Thu, 3 Oct 2024 14:44:43 -0700 Subject: [PATCH 28/36] remove try/catch due to -fno-exceptions --- .../contrib_ops/cpu/skip_layer_norm.cc | 58 ++++--------------- .../core/providers/cpu/nn/layer_norm_impl.cc | 36 +++--------- 2 files changed, 18 insertions(+), 76 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index a36cbab755bee..6eaf9781ce836 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -121,38 +121,19 @@ Status ComputeJob( float mean_square(0.0f); const size_t num_elems = static_cast<size_t>(hidden_size); - float* float_input = nullptr; - try { - float_input = new float[num_elems]; - MlasConvertHalfToFloatBuffer(p_input, float_input, num_elems); - } catch (const std::exception& e) { - return ORT_MAKE_STATUS(ONNXRUNTIME, RUNTIME_EXCEPTION, "Failed to convert input data to float: ", e.what()); - } + float* float_input = new float[num_elems]; + MlasConvertHalfToFloatBuffer(p_input, float_input, num_elems); - float* float_skip = nullptr; - try { - float_skip = new float[num_elems]; - MlasConvertHalfToFloatBuffer(p_skip, float_skip, num_elems); - } catch (const std::exception& e) { - return ORT_MAKE_STATUS(ONNXRUNTIME, RUNTIME_EXCEPTION, "Failed to convert skip data to float: ", e.what()); - } + float* float_skip = new float[num_elems]; + MlasConvertHalfToFloatBuffer(p_skip, float_skip, num_elems); float* float_bias = nullptr; if (bias_data != nullptr) { - try { - float_bias = new float[num_elems]; - MlasConvertHalfToFloatBuffer(bias_data, float_bias, num_elems); - } catch (const std::exception& e) { - return ORT_MAKE_STATUS(ONNXRUNTIME, RUNTIME_EXCEPTION, "Failed to convert bias data to float: ", e.what()); - } + float_bias = new float[num_elems]; + MlasConvertHalfToFloatBuffer(bias_data, float_bias, num_elems); } - float* float_output = nullptr; - try { - float_output = new float[num_elems]; - } catch (const std::exception& e) { - return ORT_MAKE_STATUS(ONNXRUNTIME, RUNTIME_EXCEPTION, "Failed to allocate memory for float output.", e.what()); - } + float* float_output = new float[num_elems]; for (size_t h = 0; h < num_elems; h++) { float val = float_input[h] + float_skip[h]; @@ -171,11 +152,7 @@ Status ComputeJob( } if (nullptr != p_skip_input_bias_add_output) { - try { - MlasConvertFloatToHalfBuffer(float_output, p_skip_input_bias_add_output, num_elems); - } catch (const std::exception& e) { - return ORT_MAKE_STATUS(ONNXRUNTIME, RUNTIME_EXCEPTION, "Failed to convert skip_input_bias_add_output data to MLFloat16: ", e.what()); - } + MlasConvertFloatToHalfBuffer(float_output, p_skip_input_bias_add_output, num_elems); } mean = mean / hidden_size; @@ -186,20 +163,12 @@ Status ComputeJob( } float* float_gamma = float_input; // overwrite float_input with gamma values, since they have the same size - try { - MlasConvertHalfToFloatBuffer(gamma_data, float_gamma, num_elems); - } catch (const std::exception& e) { - return ORT_MAKE_STATUS(ONNXRUNTIME, RUNTIME_EXCEPTION, "Failed to convert gamma data to float: ", e.what()); - } + MlasConvertHalfToFloatBuffer(gamma_data, float_gamma, num_elems); float* float_beta = nullptr; // overwrite float_skip with beta values, since they have the same size if (beta_data) { float_beta = float_skip; - try { - MlasConvertHalfToFloatBuffer(beta_data, float_beta, num_elems); - } catch (const std::exception& e) { - return ORT_MAKE_STATUS(ONNXRUNTIME, RUNTIME_EXCEPTION, "Failed to convert beta data to float: ", e.what()); - } + MlasConvertHalfToFloatBuffer(beta_data, float_beta, num_elems); } for (size_t h = 0; h < num_elems; h++) { @@ -214,12 +183,7 @@ Status ComputeJob( delete[] float_gamma; // also deletes float_input delete[] float_skip; // also deletes float_beta if used - try { - MlasConvertFloatToHalfBuffer(float_output, p_output, num_elems); - } catch (const std::exception& e) { - return ORT_MAKE_STATUS(ONNXRUNTIME, RUNTIME_EXCEPTION, "Failed to convert float output data to MLFloat16: ", e.what()); - } - + MlasConvertFloatToHalfBuffer(float_output, p_output, num_elems); delete[] float_output; return Status::OK();
diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc index f52c1ae4ff730..d828da4353a40 100644 --- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc +++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc @@ -89,20 +89,10 @@ Status ComputeJob( float mean_square(0.0f); const size_t num_elems = static_cast<size_t>(norm_size); - float* float_input = nullptr; - try { - float_input = new float[num_elems]; - MlasConvertHalfToFloatBuffer(p_input, float_input, num_elems); - } catch (const std::exception& e) { - return ORT_MAKE_STATUS(ONNXRUNTIME, RUNTIME_EXCEPTION, "Failed to convert input data to float: ", e.what()); - } + float* float_input = new float[num_elems]; + MlasConvertHalfToFloatBuffer(p_input, float_input, num_elems); - float* float_output = nullptr; - try { - float_output = new float[num_elems]; - } catch (const std::exception& e) { - return ORT_MAKE_STATUS(ONNXRUNTIME, RUNTIME_EXCEPTION, "Failed to allocate memory for float output.", e.what()); - } + float* float_output = new float[num_elems]; for (size_t h = 0; h < num_elems; h++) { float_output[h] = float_input[h]; @@ -118,20 +108,12 @@ Status ComputeJob( } float* float_scale = float_input; // overwrite float_input with scale values, since they have the same size - try { - MlasConvertHalfToFloatBuffer(scale_data, float_scale, num_elems); - } catch (const std::exception& e) { - return ORT_MAKE_STATUS(ONNXRUNTIME, RUNTIME_EXCEPTION, "Failed to convert scale data to float: ", e.what()); - } + MlasConvertHalfToFloatBuffer(scale_data, float_scale, num_elems); float* float_bias = nullptr; if (bias_data) { - try { - float_bias = new float[num_elems]; - MlasConvertHalfToFloatBuffer(bias_data, float_bias, num_elems); - } catch (const std::exception& e) { - return ORT_MAKE_STATUS(ONNXRUNTIME, RUNTIME_EXCEPTION, "Failed to convert bias data to float: ", e.what()); - } + float_bias = new float[num_elems]; + MlasConvertHalfToFloatBuffer(bias_data, float_bias, num_elems); } for (size_t h = 0; h < num_elems; h++) { @@ -148,11 +130,7 @@ Status ComputeJob( delete[] float_bias; } - try { - MlasConvertFloatToHalfBuffer(float_output, p_output, num_elems); - } catch (const std::exception& e) { - return ORT_MAKE_STATUS(ONNXRUNTIME, RUNTIME_EXCEPTION, "Failed to convert float output data to MLFloat16: ", e.what()); - } + MlasConvertFloatToHalfBuffer(float_output, p_output, num_elems); delete[] float_output; From f04aac04d81e32d3d13d2e814350f676998e5a39 Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Wed, 9 Oct 2024 06:35:24 -0700 Subject: [PATCH 29/36] Prepack scale and bias in layer_norm_impl --- .../core/providers/cpu/nn/layer_norm_impl.cc | 59 ++++++++++++++++--- .../core/providers/cpu/nn/layer_norm_impl.h | 6 ++ 2 files changed, 56 insertions(+), 9 deletions(-) diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc index d828da4353a40..48c24b75ff6dc 100644 --- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc +++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc @@ -24,11 +24,16 @@ Status ComputeJob( const T* bias_data, const ptrdiff_t task_idx, const int64_t norm_size, + const IAllocatorUniquePtr<float>& scale_fp32, + const IAllocatorUniquePtr<float>& bias_fp32, float epsilon, bool simplified, T* Y_data, U* mean_data, U* inv_std_dev_data) { + ORT_UNUSED_PARAMETER(scale_fp32); // only used in MLFloat16 overload + ORT_UNUSED_PARAMETER(bias_fp32); // only used in MLFloat16 overload + const T* p_input = X_data + task_idx * norm_size; T* p_output = Y_data + task_idx * norm_size; @@ -77,6 +82,8 @@ Status ComputeJob( const MLFloat16* bias_data, const ptrdiff_t task_idx, const int64_t norm_size, + const IAllocatorUniquePtr<float>& scale_fp32, + const IAllocatorUniquePtr<float>& bias_fp32, float epsilon, bool simplified, MLFloat16* Y_data, U* mean_data, U* inv_std_dev_data) { @@ -107,13 +114,20 @@ Status ComputeJob( mean_square = sqrt(mean_square / norm_size - mean * mean + epsilon); } - float* float_scale = float_input; // overwrite float_input with scale values, since they have the same size - MlasConvertHalfToFloatBuffer(scale_data, float_scale, num_elems); + float* float_scale = scale_fp32.get(); + if (float_scale == nullptr) { + float_scale = float_input; // overwrite float_input with scale values, since they have the same size + MlasConvertHalfToFloatBuffer(scale_data, float_scale, num_elems); + } float* float_bias = nullptr; if (bias_data) { - float_bias = new float[num_elems]; - MlasConvertHalfToFloatBuffer(bias_data, float_bias, num_elems); + if (bias_fp32 != nullptr) { + float_bias = bias_fp32.get(); + } else { + float_bias = new float[num_elems]; +
MlasConvertHalfToFloatBuffer(bias_data, float_bias, num_elems); + } } for (size_t h = 0; h < num_elems; h++) { @@ -125,8 +139,9 @@ Status ComputeJob( float_output[h] = (float_output[h] - mean) / mean_square * float_scale[h] + float_bias[h]; } } - delete[] float_scale; // also deletes float_input - if (float_bias) { + + delete[] float_input; // also takes care of float_scale if reused + if (float_bias && (bias_fp32 == nullptr)) { delete[] float_bias; } @@ -146,10 +161,22 @@ Status ComputeJob( return Status::OK(); } +void ConvertMLFloat16ToFloatIfNeeded(const Tensor& tensor, AllocatorPtr alloc, IAllocatorUniquePtr& dest, bool& is_packed) { + if (tensor.GetElementType() == utils::ToTensorProtoElementType()) { + auto tensor_data_ptr = tensor.Data(); + auto tensor_size = static_cast(tensor.Shape().Size()); + auto float_ptr = IAllocator::MakeUniquePtr(alloc, tensor_size, true); + + MlasConvertHalfToFloatBuffer(tensor_data_ptr, float_ptr.get(), tensor_size); + dest = std::move(float_ptr); + is_packed = true; + } +} + } // namespace LayerNormImpl::LayerNormImpl(const OpKernelInfo& op_kernel_info, bool simplified, bool contrib_op) - : OpKernel(op_kernel_info), simplified_{simplified}, contrib_op_{contrib_op} { + : OpKernel(op_kernel_info), simplified_{simplified}, contrib_op_{contrib_op}, scale_fp32_(nullptr), bias_fp32_(nullptr) { ORT_ENFORCE(op_kernel_info.GetAttr("axis", &axis_).IsOK()); ORT_ENFORCE(op_kernel_info.GetAttr("epsilon", &epsilon_).IsOK()); } @@ -212,6 +239,20 @@ Status LayerNormImpl::Compute(OpKernelContext* p_ctx) const { return t_disp.InvokeRet(this, p_ctx, axis_, epsilon_, simplified_, contrib_op_); } +Status LayerNormImpl::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, + bool& is_packed, PrePackedWeights* prepacked_weights) { + ORT_UNUSED_PARAMETER(prepacked_weights); + + is_packed = false; + if (input_idx == 1) { // scale + ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, scale_fp32_, is_packed); + } else if (input_idx == 2) { // bias + ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, bias_fp32_, is_packed); + } + + return Status::OK(); +} + template Status LayerNormImpl::ComputeWithoutContext( const T* X_data, @@ -243,8 +284,8 @@ Status LayerNormImpl::ComputeWithoutContext( concurrency::ThreadPool::TryBatchParallelFor( thread_pool, static_cast(norm_count), [&](ptrdiff_t task_idx) { - auto status = ComputeJob(X_data, scale_data, bias_data, task_idx, norm_size, epsilon, simplified, - Y_data, mean_data, inv_std_dev_data); + auto status = ComputeJob(X_data, scale_data, bias_data, task_idx, norm_size, scale_fp32_, bias_fp32_, + epsilon, simplified, Y_data, mean_data, inv_std_dev_data); if (status != Status::OK()) { return_status = status; } diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h index 64e1c2ba2f902..b4bf2cecf1198 100644 --- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h +++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h @@ -4,6 +4,7 @@ #pragma once #include "core/common/common.h" +#include "core/framework/allocator.h" #include "core/framework/op_kernel.h" #include "core/framework/tensor.h" @@ -14,6 +15,9 @@ class LayerNormImpl : public OpKernel { LayerNormImpl(const OpKernelInfo& op_kernel_info, bool simplified = false, bool contrib_op = false); Status Compute(OpKernelContext* p_op_kernel_context) const override; + Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, + bool& is_packed, PrePackedWeights* prepacked_weights) override; + // This method was 
created so that it can be called directly from `test/onnx/microbenchmark/layer_normalization.cc`. template <typename T, typename U> Status ComputeWithoutContext( @@ -58,6 +62,8 @@ class LayerNormImpl : public OpKernel { float epsilon_; const bool simplified_; const bool contrib_op_; + IAllocatorUniquePtr<float> scale_fp32_; + IAllocatorUniquePtr<float> bias_fp32_; }; } // namespace onnxruntime
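The payoff of prepacking is easy to estimate from the shapes involved, assuming scale and bias arrive as constant initializers so PrePack actually sees them. ComputeWithoutContext runs ComputeJob once per normalized row, i.e. norm_count times, and before this patch each MLFloat16 job converted scale (and bias, when present) from half to float on every call, norm_size elements apiece. For the benchmark's 1x256x1024 input normalized over the last axis (norm_count = 256, norm_size = 1024), that is roughly 256 rows x 2 tensors x 1024 elements = 524,288 redundant element conversions per Compute call; with prepacking, the same work collapses to a single 2 x 1024-element conversion performed once at session-initialization time, leaving only the per-row input and output conversions on the hot path.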
From 1eaa63ff45b1751210bd0867bb436d28e92c5a24 Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Wed, 9 Oct 2024 08:03:33 -0700 Subject: [PATCH 30/36] Prepack skip, gamma, beta, bias in skip_layer_norm --- .../contrib_ops/cpu/skip_layer_norm.cc | 95 +++++++++++++++---- onnxruntime/contrib_ops/cpu/skip_layer_norm.h | 7 ++ .../core/providers/cpu/nn/layer_norm_impl.cc | 6 +- 3 files changed, 89 insertions(+), 19 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index 6eaf9781ce836..e37595d27a0a1 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -46,6 +46,10 @@ Status ComputeJob( const T* gamma_data, const T* beta_data, const T* bias_data, + const IAllocatorUniquePtr<float>& skip_fp32, + const IAllocatorUniquePtr<float>& gamma_fp32, + const IAllocatorUniquePtr<float>& beta_fp32, + const IAllocatorUniquePtr<float>& bias_fp32, ptrdiff_t task_idx, int hidden_size, int64_t skip_size, @@ -53,6 +57,10 @@ Status ComputeJob( bool simplified, T* output_data, T* skip_input_bias_add_output_data) { + ORT_UNUSED_PARAMETER(skip_fp32); // only used in MLFloat16 overload + ORT_UNUSED_PARAMETER(gamma_fp32); // only used in MLFloat16 overload + ORT_UNUSED_PARAMETER(beta_fp32); // only used in MLFloat16 overload + ORT_UNUSED_PARAMETER(bias_fp32); // only used in MLFloat16 overload auto offset = task_idx * hidden_size; const T* p_input = input_data + offset; const T* p_skip = skip_data + (offset % skip_size); @@ -104,6 +112,10 @@ Status ComputeJob( const MLFloat16* gamma_data, const MLFloat16* beta_data, const MLFloat16* bias_data, + const IAllocatorUniquePtr<float>& skip_fp32, + const IAllocatorUniquePtr<float>& gamma_fp32, + const IAllocatorUniquePtr<float>& beta_fp32, + const IAllocatorUniquePtr<float>& bias_fp32, ptrdiff_t task_idx, int hidden_size, int64_t skip_size, @@ -124,13 +136,20 @@ Status ComputeJob( float* float_input = new float[num_elems]; MlasConvertHalfToFloatBuffer(p_input, float_input, num_elems); - float* float_skip = new float[num_elems]; - MlasConvertHalfToFloatBuffer(p_skip, float_skip, num_elems); + float* float_skip = skip_fp32.get(); + if (nullptr == float_skip) { + float_skip = new float[num_elems]; + MlasConvertHalfToFloatBuffer(p_skip, float_skip, num_elems); + } float* float_bias = nullptr; - if (bias_data != nullptr) { - float_bias = new float[num_elems]; - MlasConvertHalfToFloatBuffer(bias_data, float_bias, num_elems); + if (bias_data) { + if (nullptr != bias_fp32) { + float_bias = bias_fp32.get(); + } else { + float_bias = new float[num_elems]; + MlasConvertHalfToFloatBuffer(bias_data, float_bias, num_elems); + } } float* float_output = new float[num_elems]; @@ -147,7 +166,7 @@ Status ComputeJob( mean_square += val * val; } - if (float_bias != nullptr) { + if (float_bias && (nullptr == bias_fp32)) { delete[] float_bias; } @@ -162,13 +181,20 @@ Status ComputeJob( mean_square = sqrt(mean_square / hidden_size - mean * mean + epsilon); } - float* float_gamma = float_input; // overwrite float_input with gamma values, since they have the same size - MlasConvertHalfToFloatBuffer(gamma_data, float_gamma, num_elems); + float* float_gamma = gamma_fp32.get(); + if (nullptr == float_gamma) { + float_gamma = float_input; // overwrite float_input with gamma values, since they have the same size + MlasConvertHalfToFloatBuffer(gamma_data, float_gamma, num_elems); + } - float* float_beta = nullptr; // overwrite float_skip with beta values, since they have the same size + float* float_beta = nullptr; if (beta_data) { - float_beta = float_skip; - MlasConvertHalfToFloatBuffer(beta_data, float_beta, num_elems); + if (nullptr != beta_fp32) { + float_beta = beta_fp32.get(); + } else { + float_beta = new float[num_elems]; + MlasConvertHalfToFloatBuffer(beta_data, float_beta, num_elems); + } } for (size_t h = 0; h < num_elems; h++) { @@ -204,8 +206,13 @@ Status ComputeJob( float_output[h] = (float_output[h] - mean) / mean_square * float_gamma[h] + float_beta[h]; } } - delete[] float_gamma; // also deletes float_input - delete[] float_skip; // also deletes float_beta if used + delete[] float_input; // also takes care of float_gamma if reused + if (float_skip && (nullptr == skip_fp32)) { + delete[] float_skip; + } + if (beta_data && (nullptr == beta_fp32)) { + delete[] float_beta; + } MlasConvertFloatToHalfBuffer(float_output, p_output, num_elems); delete[] float_output; return Status::OK(); } void ConvertMLFloat16ToFloatIfNeeded(const Tensor& tensor, AllocatorPtr alloc, IAllocatorUniquePtr<float>& dest, bool& is_packed) { if (tensor.GetElementType() == utils::ToTensorProtoElementType<MLFloat16>()) { auto tensor_data_ptr = tensor.Data<MLFloat16>(); auto tensor_size = static_cast<size_t>(tensor.Shape().Size()); auto float_ptr = IAllocator::MakeUniquePtr<float>(alloc, tensor_size, true); MlasConvertHalfToFloatBuffer(tensor_data_ptr, float_ptr.get(), tensor_size); dest = std::move(float_ptr); is_packed = true; } } } // namespace template <typename T, bool simplified> SkipLayerNorm<T, simplified>::SkipLayerNorm(const OpKernelInfo& op_kernel_info) - : OpKernel(op_kernel_info) { + : OpKernel(op_kernel_info), skip_fp32_(nullptr), gamma_fp32_(nullptr), beta_fp32_(nullptr), bias_fp32_(nullptr) { ORT_ENFORCE(op_kernel_info.GetAttr<float>("epsilon", &epsilon_).IsOK()); ORT_ENFORCE(epsilon_ >= 0); } @@ -240,8 +283,9 @@ Status SkipLayerNorm<T, simplified>::Compute(OpKernelContext* p_ctx) const { concurrency::ThreadPool::TryBatchParallelFor( p_ctx->GetOperatorThreadPool(), static_cast<int32_t>(task_count), [&](ptrdiff_t task_idx) { - auto status = ComputeJob(input_data, skip_data, gamma_data, beta_data, bias_data, task_idx, hidden_size, - skip_size, epsilon_, simplified, output_data, skip_input_bias_add_output_data); + auto status = ComputeJob(input_data, skip_data, gamma_data, beta_data, bias_data, skip_fp32_, gamma_fp32_, + beta_fp32_, bias_fp32_, task_idx, hidden_size, skip_size, epsilon_, simplified, + output_data, skip_input_bias_add_output_data); if (status != Status::OK()) { return_status = status; } @@ -251,5 +295,24 @@ Status SkipLayerNorm<T, simplified>::Compute(OpKernelContext* p_ctx) const { return return_status; } +template <typename T, bool simplified> +Status SkipLayerNorm<T, simplified>::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, + bool& is_packed, PrePackedWeights* prepacked_weights) { + ORT_UNUSED_PARAMETER(prepacked_weights); + + is_packed = false; + if (input_idx == 1) { // skip + ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, skip_fp32_, is_packed); + } else if (input_idx == 2) { // gamma + ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, gamma_fp32_, is_packed); + } else if (input_idx == 3) { // beta + ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, beta_fp32_, is_packed); + } else if (input_idx == 4) { // bias +
ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, bias_fp32_, is_packed); + } + + return Status::OK(); +} + } // namespace contrib } // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.h b/onnxruntime/contrib_ops/cpu/skip_layer_norm.h index 69edf4609e340..5b1cbaf02364b 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.h +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.h @@ -16,8 +16,15 @@ class SkipLayerNorm final : public OpKernel { SkipLayerNorm(const OpKernelInfo& op_kernel_info); Status Compute(OpKernelContext* p_op_kernel_context) const override; + Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, + bool& is_packed, PrePackedWeights* prepacked_weights) override; + private: float epsilon_; + IAllocatorUniquePtr skip_fp32_; + IAllocatorUniquePtr gamma_fp32_; + IAllocatorUniquePtr beta_fp32_; + IAllocatorUniquePtr bias_fp32_; }; } // namespace contrib diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc index 48c24b75ff6dc..1a63b26b28161 100644 --- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc +++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc @@ -115,14 +115,14 @@ Status ComputeJob( } float* float_scale = scale_fp32.get(); - if (float_scale == nullptr) { + if (nullptr == float_scale) { float_scale = float_input; // overwrite float_input with scale values, since they have the same size MlasConvertHalfToFloatBuffer(scale_data, float_scale, num_elems); } float* float_bias = nullptr; if (bias_data) { - if (bias_fp32 != nullptr) { + if (nullptr != bias_fp32) { float_bias = bias_fp32.get(); } else { float_bias = new float[num_elems]; @@ -141,7 +141,7 @@ Status ComputeJob( } delete[] float_input; // also takes care of float_scale if reused - if (float_bias && (bias_fp32 == nullptr)) { + if (float_bias && (nullptr == bias_fp32)) { delete[] float_bias; } From 26ddc6c14166b230e1921c5813d06c2318402b76 Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Wed, 9 Oct 2024 08:10:16 -0700 Subject: [PATCH 31/36] return void from ComputeJob --- .../contrib_ops/cpu/skip_layer_norm.cc | 20 ++++++------------- .../core/providers/cpu/nn/layer_norm_impl.cc | 16 ++++----------- 2 files changed, 10 insertions(+), 26 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index e37595d27a0a1..3ce5db3d02bb0 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -40,7 +40,7 @@ REGISTER_KERNEL_TYPED(MLFloat16) namespace { template || std::is_same_v, void>> -Status ComputeJob( +void ComputeJob( const T* input_data, const T* skip_data, const T* gamma_data, @@ -102,11 +102,9 @@ Status ComputeJob( p_output[h] = (p_output[h] - mean) / mean_square * gamma_data[h] + beta_data[h]; } } - - return Status::OK(); } -Status ComputeJob( +void ComputeJob( const MLFloat16* input_data, const MLFloat16* skip_data, const MLFloat16* gamma_data, @@ -216,8 +214,6 @@ Status ComputeJob( MlasConvertFloatToHalfBuffer(float_output, p_output, num_elems); delete[] float_output; - - return Status::OK(); } void ConvertMLFloat16ToFloatIfNeeded(const Tensor& tensor, AllocatorPtr alloc, IAllocatorUniquePtr& dest, bool& is_packed) { @@ -279,20 +275,16 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { const int64_t& skip_size = skip->Shape().Size(); - auto return_status = Status::OK(); concurrency::ThreadPool::TryBatchParallelFor( p_ctx->GetOperatorThreadPool(), 
static_cast(task_count), [&](ptrdiff_t task_idx) { - auto status = ComputeJob(input_data, skip_data, gamma_data, beta_data, bias_data, skip_fp32_, gamma_fp32_, - beta_fp32_, bias_fp32_, task_idx, hidden_size, skip_size, epsilon_, simplified, - output_data, skip_input_bias_add_output_data); - if (status != Status::OK()) { - return_status = status; - } + ComputeJob(input_data, skip_data, gamma_data, beta_data, bias_data, skip_fp32_, gamma_fp32_, beta_fp32_, + bias_fp32_, task_idx, hidden_size, skip_size, epsilon_, simplified,output_data, + skip_input_bias_add_output_data); }, 0); - return return_status; + return Status::OK(); } template diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc index 1a63b26b28161..8ca4e1a2a22d7 100644 --- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc +++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc @@ -18,7 +18,7 @@ namespace { template || std::is_same_v, void>> -Status ComputeJob( +void ComputeJob( const T* X_data, const T* scale_data, const T* bias_data, @@ -71,12 +71,10 @@ Status ComputeJob( if (inv_std_dev_data != nullptr) { inv_std_dev_data[task_idx] = gsl::narrow_cast(1 / mean_square); } - - return Status::OK(); } template -Status ComputeJob( +void ComputeJob( const MLFloat16* X_data, const MLFloat16* scale_data, const MLFloat16* bias_data, @@ -157,8 +155,6 @@ Status ComputeJob( if (inv_std_dev_data != nullptr) { inv_std_dev_data[task_idx] = MLFloat16(1 / mean_square); } - - return Status::OK(); } void ConvertMLFloat16ToFloatIfNeeded(const Tensor& tensor, AllocatorPtr alloc, IAllocatorUniquePtr& dest, bool& is_packed) { @@ -280,15 +276,11 @@ Status LayerNormImpl::ComputeWithoutContext( scale_size, " and bias size of ", bias_size); } - auto return_status = Status::OK(); concurrency::ThreadPool::TryBatchParallelFor( thread_pool, static_cast(norm_count), [&](ptrdiff_t task_idx) { - auto status = ComputeJob(X_data, scale_data, bias_data, task_idx, norm_size, scale_fp32_, bias_fp32_, - epsilon, simplified, Y_data, mean_data, inv_std_dev_data); - if (status != Status::OK()) { - return_status = status; - } + ComputeJob(X_data, scale_data, bias_data, task_idx, norm_size, scale_fp32_, bias_fp32_, + epsilon, simplified, Y_data, mean_data, inv_std_dev_data); }, 0); From 3231cffef055043f6e95f036f7806ff5da63eefe Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Wed, 9 Oct 2024 08:57:23 -0700 Subject: [PATCH 32/36] lint --- onnxruntime/contrib_ops/cpu/skip_layer_norm.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index 3ce5db3d02bb0..cd028e2314f9d 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -279,7 +279,7 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { p_ctx->GetOperatorThreadPool(), static_cast(task_count), [&](ptrdiff_t task_idx) { ComputeJob(input_data, skip_data, gamma_data, beta_data, bias_data, skip_fp32_, gamma_fp32_, beta_fp32_, - bias_fp32_, task_idx, hidden_size, skip_size, epsilon_, simplified,output_data, + bias_fp32_, task_idx, hidden_size, skip_size, epsilon_, simplified, output_data, skip_input_bias_add_output_data); }, 0); From 2a37a92f24c6b8d041579abfaa0fbcfcdab4256c Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Mon, 14 Oct 2024 06:06:35 -0700 Subject: [PATCH 33/36] Use GenerateArrayWithRandomValue in microbenchmark --- .../microbenchmark/layer_normalization.cc | 73 
+++++++++++++------ 1 file changed, 50 insertions(+), 23 deletions(-) diff --git a/onnxruntime/test/onnx/microbenchmark/layer_normalization.cc b/onnxruntime/test/onnx/microbenchmark/layer_normalization.cc index 4660cb85a43f1..cc7366b2da0cd 100644 --- a/onnxruntime/test/onnx/microbenchmark/layer_normalization.cc +++ b/onnxruntime/test/onnx/microbenchmark/layer_normalization.cc @@ -20,6 +20,8 @@ #include "core/providers/cpu/cpu_provider_factory_creator.h" #include "core/util/thread_utils.h" +#include "test/onnx/microbenchmark/common.h" + #if defined(__GNUC__) && !defined(__clang__) #pragma GCC diagnostic pop #endif @@ -28,25 +30,18 @@ using namespace onnxruntime; namespace { -static const std::vector dims{1, 256, 1024}; -static const size_t num_elems = dims[0] * dims[1] * dims[2]; -static const std::vector float_vals(num_elems, 1.0f); -static const std::vector MLFloat16_vals(num_elems, MLFloat16(1.0f)); - -} // namespace +std::vector createMLFloat16Vector(float* vals, int64_t num_elems) { + std::vector fp16vec; + fp16vec.reserve(num_elems); -template -const T* getVector(); + for (int64_t i = 0; i < num_elems; i++) { + fp16vec.push_back(MLFloat16(vals[i])); + } -template <> -const float* getVector() { - return float_vals.data(); + return fp16vec; } -template <> -const MLFloat16* getVector() { - return MLFloat16_vals.data(); -} +} // namespace template static void BM_LayerNormalization(benchmark::State& state) { @@ -72,17 +67,41 @@ static void BM_LayerNormalization(benchmark::State& state) { LayerNormImpl layer_norm_impl(op_kernel_info); + const std::vector dims{1, 256, 1024}; + const size_t num_elems = dims[0] * dims[1] * dims[2]; + TensorShape x_shape(dims); TensorShape scale_shape(dims); TensorShape bias_shape(dims); - const T* x_data = getVector(); - const T* scale_data = getVector(); - const T* bias_data = getVector(); + const float low = -1.0f; + const float high = 1.0f; + + float* x_float = GenerateArrayWithRandomValue(num_elems, low, high); + float* scale_float = GenerateArrayWithRandomValue(num_elems, 0.1f, high); + float* bias_float = GenerateArrayWithRandomValue(num_elems, low, high); + + std::vector x_MLFloat16 = createMLFloat16Vector(x_float, num_elems); + std::vector scale_MLFloat16 = createMLFloat16Vector(scale_float, num_elems); + std::vector bias_MLFloat16 = createMLFloat16Vector(bias_float, num_elems); + + T* x_data = nullptr; + T* scale_data = nullptr; + T* bias_data = nullptr; + if (std::is_same_v) { + x_data = (T*)x_MLFloat16.data(); + scale_data = (T*)scale_MLFloat16.data(); + bias_data = (T*)bias_MLFloat16.data(); + } else if (std::is_same_v) { + x_data = (T*)x_float; + scale_data = (T*)scale_float; + bias_data = (T*)bias_float; + } + assert(x_data); - T* Y_data = static_cast(malloc(num_elems * sizeof(T))); - U* mean_data = static_cast(malloc(num_elems * sizeof(U))); - U* inv_std_dev_data = static_cast(malloc(num_elems * sizeof(U))); + T* Y_data = static_cast(aligned_alloc(num_elems * sizeof(T), 64)); + U* mean_data = static_cast(aligned_alloc(num_elems * sizeof(U), 64)); + U* inv_std_dev_data = static_cast(aligned_alloc(num_elems * sizeof(U), 64)); OrtThreadPoolParams tp_params; tp_params.name = ORT_TSTR("intra-op"); @@ -91,13 +110,21 @@ static void BM_LayerNormalization(benchmark::State& state) { for (auto _ : state) { auto status = layer_norm_impl.ComputeWithoutContext(x_data, x_shape, scale_data, scale_shape, bias_data, bias_shape, - Y_data, mean_data, inv_std_dev_data, thread_pool.get(), axis, epsilon, simplified); + Y_data, mean_data, inv_std_dev_data, 
thread_pool.get(), axis, + epsilon, simplified); if (!status.IsOK()) { std::cout << "ComputeWithoutContext status not OK: " << status.ErrorMessage() << std::endl; break; } } + + aligned_free(x_float); + aligned_free(scale_float); + aligned_free(bias_float); + aligned_free(Y_data); + aligned_free(mean_data); + aligned_free(inv_std_dev_data); } BENCHMARK(BM_LayerNormalization) ->Arg(1) ->UseRealTime() ->Unit(benchmark::TimeUnit::kMicrosecond); -BENCHMARK(BM_LayerNormalization) +BENCHMARK(BM_LayerNormalization) ->Arg(1) ->UseRealTime() ->Unit(benchmark::TimeUnit::kMicrosecond);
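The next patch routes the per-task scratch buffers through the kernel's temp-space allocator, so they come from ONNX Runtime's arena (typically a cached, reused block) instead of hitting the global heap with new[]/delete[] on every row. The remaining hazard is the manual Alloc/Free pairing, which would leak on any early return; a sketch of the RAII form the series lands on in patch 36, using the IAllocator::MakeUniquePtr helper that ties the allocation's lifetime to scope:

    // Sketch: allocator-backed scratch buffer, freed through alloc when it leaves scope.
    IAllocatorUniquePtr<float> float_input = IAllocator::MakeUniquePtr<float>(alloc, num_elems);
    MlasConvertHalfToFloatBuffer(p_input, float_input.get(), num_elems);
    // ... use float_input.get(); no explicit Free/delete needed on any return path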
From d8b11abb13a6fe8fb892a87e781d060c37250578 Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Mon, 14 Oct 2024 08:21:26 -0700 Subject: [PATCH 34/36] Use allocator instead of new --- .../contrib_ops/cpu/skip_layer_norm.cc | 32 ++++++++++++------- .../core/providers/cpu/nn/layer_norm_impl.cc | 31 ++++++++++-------- .../core/providers/cpu/nn/layer_norm_impl.h | 3 +- .../microbenchmark/layer_normalization.cc | 5 +-- 4 files changed, 43 insertions(+), 28 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index cd028e2314f9d..cee74b5b7bf8e 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -56,11 +56,14 @@ void ComputeJob( float epsilon, bool simplified, T* output_data, - T* skip_input_bias_add_output_data) { + T* skip_input_bias_add_output_data, + AllocatorPtr alloc) { ORT_UNUSED_PARAMETER(skip_fp32); // only used in MLFloat16 overload ORT_UNUSED_PARAMETER(gamma_fp32); // only used in MLFloat16 overload ORT_UNUSED_PARAMETER(beta_fp32); // only used in MLFloat16 overload ORT_UNUSED_PARAMETER(bias_fp32); // only used in MLFloat16 overload + ORT_UNUSED_PARAMETER(alloc); + auto offset = task_idx * hidden_size; const T* p_input = input_data + offset; const T* p_skip = skip_data + (offset % skip_size); @@ -120,7 +123,8 @@ void ComputeJob( float epsilon, bool simplified, MLFloat16* output_data, - MLFloat16* skip_input_bias_add_output_data) { + MLFloat16* skip_input_bias_add_output_data, + AllocatorPtr alloc) { auto offset = task_idx * hidden_size; const MLFloat16* p_input = input_data + offset; const MLFloat16* p_skip = skip_data + (offset % skip_size); @@ -131,12 +135,12 @@ void ComputeJob( float mean_square(0.0f); const size_t num_elems = static_cast<size_t>(hidden_size); - float* float_input = new float[num_elems]; + float* float_input = (float*)alloc->Alloc(num_elems * sizeof(float)); MlasConvertHalfToFloatBuffer(p_input, float_input, num_elems); float* float_skip = skip_fp32.get(); if (nullptr == float_skip) { - float_skip = new float[num_elems]; + float_skip = (float*)alloc->Alloc(num_elems * sizeof(float)); MlasConvertHalfToFloatBuffer(p_skip, float_skip, num_elems); } @@ -145,12 +149,12 @@ void ComputeJob( if (nullptr != bias_fp32) { float_bias = bias_fp32.get(); } else { - float_bias = new float[num_elems]; + float_bias = (float*)alloc->Alloc(num_elems * sizeof(float)); MlasConvertHalfToFloatBuffer(bias_data, float_bias, num_elems); } } - float* float_output = new float[num_elems]; + float* float_output = (float*)alloc->Alloc(num_elems * sizeof(float)); for (size_t h = 0; h < num_elems; h++) { float val = float_input[h] + float_skip[h]; @@ -190,7 +194,7 @@ void ComputeJob( if (nullptr != beta_fp32) { float_beta = beta_fp32.get(); } else { - float_beta = new float[num_elems]; + float_beta = (float*)alloc->Alloc(num_elems * sizeof(float)); MlasConvertHalfToFloatBuffer(beta_data, float_beta, num_elems); } } @@ -204,16 +208,17 @@ void ComputeJob( float_output[h] = (float_output[h] - mean) / mean_square * float_gamma[h] + float_beta[h]; } } - delete[] float_input; // also takes care of float_gamma if reused + + alloc->Free(float_input); // also takes care of float_gamma if reused if (float_skip && (nullptr == skip_fp32)) { - delete[] float_skip; + alloc->Free(float_skip); } if (beta_data && (nullptr == beta_fp32)) { - delete[] float_beta; + alloc->Free(float_beta); } MlasConvertFloatToHalfBuffer(float_output, p_output, num_elems); - delete[] float_output; + alloc->Free(float_output); } void ConvertMLFloat16ToFloatIfNeeded(const Tensor& tensor, AllocatorPtr alloc, IAllocatorUniquePtr<float>& dest, bool& is_packed) { @@ -275,12 +280,15 @@ Status SkipLayerNorm<T, simplified>::Compute(OpKernelContext* p_ctx) const { const int64_t& skip_size = skip->Shape().Size(); + AllocatorPtr alloc; + ORT_RETURN_IF_ERROR(p_ctx->GetTempSpaceAllocator(&alloc)); + concurrency::ThreadPool::TryBatchParallelFor( p_ctx->GetOperatorThreadPool(), static_cast<int32_t>(task_count), [&](ptrdiff_t task_idx) { ComputeJob(input_data, skip_data, gamma_data, beta_data, bias_data, skip_fp32_, gamma_fp32_, beta_fp32_, bias_fp32_, task_idx, hidden_size, skip_size, epsilon_, simplified, output_data, - skip_input_bias_add_output_data); + skip_input_bias_add_output_data, alloc); }, 0); diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc index 8ca4e1a2a22d7..7917ed4912cda 100644 --- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc +++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc @@ -30,9 +30,11 @@ void ComputeJob( bool simplified, T* Y_data, U* mean_data, - U* inv_std_dev_data) { + U* inv_std_dev_data, + AllocatorPtr alloc) { ORT_UNUSED_PARAMETER(scale_fp32); // only used in MLFloat16 overload ORT_UNUSED_PARAMETER(bias_fp32); // only used in MLFloat16 overload + ORT_UNUSED_PARAMETER(alloc); const T* p_input = X_data + task_idx * norm_size; T* p_output = Y_data + task_idx * norm_size; @@ -86,7 +88,8 @@ void ComputeJob( bool simplified, MLFloat16* Y_data, U* mean_data, - U* inv_std_dev_data) { + U* inv_std_dev_data, + AllocatorPtr alloc) { const MLFloat16* p_input = X_data + task_idx * norm_size; MLFloat16* p_output = Y_data + task_idx * norm_size; @@ -94,10 +97,10 @@ void ComputeJob( float mean_square(0.0f); const size_t num_elems = static_cast<size_t>(norm_size); - float* float_input = new float[num_elems]; + float* float_input = (float*)alloc->Alloc(num_elems * sizeof(float)); MlasConvertHalfToFloatBuffer(p_input, float_input, num_elems); - float* float_output = new float[num_elems]; + float* float_output = (float*)alloc->Alloc(num_elems * sizeof(float)); for (size_t h = 0; h < num_elems; h++) { float_output[h] = float_input[h]; @@ -123,7 +126,7 @@ void ComputeJob( if (nullptr != bias_fp32) { float_bias = bias_fp32.get(); } else { - float_bias = new float[num_elems]; + float_bias = (float*)alloc->Alloc(num_elems * sizeof(float)); MlasConvertHalfToFloatBuffer(bias_data, float_bias, num_elems); } } @@ -138,14 +141,13 @@ void ComputeJob( } } - delete[] float_input; // also takes care of float_scale if reused + alloc->Free(float_input); // also takes care of float_scale if reused if (float_bias && (nullptr == bias_fp32)) { - delete[] float_bias; + alloc->Free(float_bias); } MlasConvertFloatToHalfBuffer(float_output, p_output, num_elems); - - delete[] float_output; + alloc->Free(float_output); if (mean_data != nullptr) { //
ONNX spec doesn't support 'double' for 'U' so when 'T' == double, 'U' == float and we need to narrow @@ -222,8 +224,10 @@ Status LayerNormImpl::ComputeImpl(OpKernelContext* p_ctx, int64_t orig_axis, flo onnxruntime::concurrency::ThreadPool* thread_pool = p_ctx->GetOperatorThreadPool(); - return ComputeWithoutContext(X_data, x_shape, scale_data, scale_shape, bias_data, bias_shape, - Y_data, mean_data, inv_std_dev_data, thread_pool, axis, epsilon, simplified); + AllocatorPtr alloc; + ORT_RETURN_IF_ERROR(p_ctx->GetTempSpaceAllocator(&alloc)); + return ComputeWithoutContext(X_data, x_shape, scale_data, scale_shape, bias_data, bias_shape, Y_data, mean_data, + inv_std_dev_data, thread_pool, axis, epsilon, simplified, alloc); } Status LayerNormImpl::Compute(OpKernelContext* p_ctx) const { @@ -263,7 +267,8 @@ Status LayerNormImpl::ComputeWithoutContext( onnxruntime::concurrency::ThreadPool* thread_pool, int64_t axis, float epsilon, - bool simplified) const { + bool simplified, + AllocatorPtr alloc) const { int64_t norm_count = x_shape.SizeToDimension(onnxruntime::narrow(axis)); int64_t norm_size = x_shape.SizeFromDimension(onnxruntime::narrow(axis)); @@ -280,7 +285,7 @@ Status LayerNormImpl::ComputeWithoutContext( thread_pool, static_cast(norm_count), [&](ptrdiff_t task_idx) { ComputeJob(X_data, scale_data, bias_data, task_idx, norm_size, scale_fp32_, bias_fp32_, - epsilon, simplified, Y_data, mean_data, inv_std_dev_data); + epsilon, simplified, Y_data, mean_data, inv_std_dev_data, alloc); }, 0); diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h index b4bf2cecf1198..9242ea5e6df6e 100644 --- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h +++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h @@ -33,7 +33,8 @@ class LayerNormImpl : public OpKernel { onnxruntime::concurrency::ThreadPool* thread_pool, int64_t axis, float epsilon, - bool simplified) const; + bool simplified, + AllocatorPtr alloc) const; private: template diff --git a/onnxruntime/test/onnx/microbenchmark/layer_normalization.cc b/onnxruntime/test/onnx/microbenchmark/layer_normalization.cc index cc7366b2da0cd..75ce7b77acd4e 100644 --- a/onnxruntime/test/onnx/microbenchmark/layer_normalization.cc +++ b/onnxruntime/test/onnx/microbenchmark/layer_normalization.cc @@ -108,11 +108,12 @@ static void BM_LayerNormalization(benchmark::State& state) { std::unique_ptr thread_pool = concurrency::CreateThreadPool( &Env::Default(), tp_params, concurrency::ThreadPoolType::INTRA_OP); + OrtMemoryInfo memory_info(onnxruntime::CPU, OrtAllocatorType::OrtArenaAllocator); + AllocatorPtr alloc = std::make_shared(memory_info); for (auto _ : state) { auto status = layer_norm_impl.ComputeWithoutContext(x_data, x_shape, scale_data, scale_shape, bias_data, bias_shape, Y_data, mean_data, inv_std_dev_data, thread_pool.get(), axis, - epsilon, simplified); - + epsilon, simplified, alloc); if (!status.IsOK()) { std::cout << "ComputeWithoutContext status not OK: " << status.ErrorMessage() << std::endl; break; From 402b65d6b7ed53cff6338b2ef91663bb8fea01dd Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Mon, 14 Oct 2024 08:23:14 -0700 Subject: [PATCH 35/36] lint --- onnxruntime/contrib_ops/cpu/skip_layer_norm.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index cee74b5b7bf8e..bbf4a4eec2e46 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ 
From 402b65d6b7ed53cff6338b2ef91663bb8fea01dd Mon Sep 17 00:00:00 2001
From: Alex Marin
Date: Mon, 14 Oct 2024 08:23:14 -0700
Subject: [PATCH 35/36] lint

---
 onnxruntime/contrib_ops/cpu/skip_layer_norm.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc
index cee74b5b7bf8e..bbf4a4eec2e46 100644
--- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc
+++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc
@@ -209,7 +209,7 @@ void ComputeJob(
     }
   }
 
-  alloc->Free(float_input); // also takes care of float_gamma if reused
+  alloc->Free(float_input);  // also takes care of float_gamma if reused
   if (float_skip && (nullptr == skip_fp32)) {
     alloc->Free(float_skip);
   }
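Note: the final patch below replaces the manual Alloc/Free pairs with IAllocator::MakeUniquePtr, which returns an IAllocatorUniquePtr (a std::unique_ptr whose deleter returns the buffer to its allocator). That removes the leak risk on early returns and drops the bookkeeping around which buffer is freed by whom. Conceptually the helper looks like the following standard-C++ sketch; this is not the actual onnxruntime implementation, and it assumes a trivially constructible element type:

#include <cstddef>
#include <cstdlib>
#include <functional>
#include <memory>

// Stand-in for onnxruntime's IAllocator in this sketch.
struct Allocator {
  void* Alloc(std::size_t bytes) { return std::malloc(bytes); }
  void Free(void* p) { std::free(p); }
};
using AllocatorPtr = std::shared_ptr<Allocator>;

// Buffer whose lifetime is tied to scope; the deleter hands it back to the
// allocator that produced it, even on an early return or exception.
template <typename T>
using AllocatorUniquePtr = std::unique_ptr<T[], std::function<void(T*)>>;

template <typename T>
AllocatorUniquePtr<T> MakeUniquePtr(AllocatorPtr alloc, std::size_t count) {
  T* p = static_cast<T*>(alloc->Alloc(count * sizeof(T)));
  // The deleter captures the shared_ptr so the allocator outlives the buffer.
  return AllocatorUniquePtr<T>(p, [alloc](T* q) { alloc->Free(q); });
}

// Usage mirrors the diff below:
//   auto buf = MakeUniquePtr<float>(alloc, num_elems);
//   ... work with buf.get() ...
//   // no Free call: the memory is released when buf goes out of scope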
From 57c3e6374d085cf1da275c7560b11963166a20fa Mon Sep 17 00:00:00 2001
From: Alex Marin
Date: Mon, 14 Oct 2024 13:55:45 -0700
Subject: [PATCH 36/36] switch to IAllocator::MakeUniquePtr

---
 .../contrib_ops/cpu/skip_layer_norm.cc        | 105 +++++++-----------
 onnxruntime/contrib_ops/cpu/skip_layer_norm.h |   8 +-
 .../core/providers/cpu/nn/layer_norm_impl.cc  |  60 +++++-----
 .../core/providers/cpu/nn/layer_norm_impl.h   |   4 +-
 4 files changed, 75 insertions(+), 102 deletions(-)

diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc
index bbf4a4eec2e46..67b4950af73bf 100644
--- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc
+++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc
@@ -46,10 +46,10 @@ void ComputeJob(
     const T* gamma_data,
     const T* beta_data,
     const T* bias_data,
-    const IAllocatorUniquePtr<float>& skip_fp32,
-    const IAllocatorUniquePtr<float>& gamma_fp32,
-    const IAllocatorUniquePtr<float>& beta_fp32,
-    const IAllocatorUniquePtr<float>& bias_fp32,
+    IAllocatorUniquePtr<float>& skip_float_uptr,
+    IAllocatorUniquePtr<float>& gamma_float_uptr,
+    IAllocatorUniquePtr<float>& beta_float_uptr,
+    IAllocatorUniquePtr<float>& bias_float_uptr,
     ptrdiff_t task_idx,
     int hidden_size,
     int64_t skip_size,
@@ -58,10 +58,10 @@ void ComputeJob(
     T* output_data,
     T* skip_input_bias_add_output_data,
     AllocatorPtr alloc) {
-  ORT_UNUSED_PARAMETER(skip_fp32);   // only used in MLFloat16 overload
-  ORT_UNUSED_PARAMETER(gamma_fp32);  // only used in MLFloat16 overload
-  ORT_UNUSED_PARAMETER(beta_fp32);   // only used in MLFloat16 overload
-  ORT_UNUSED_PARAMETER(bias_fp32);   // only used in MLFloat16 overload
+  ORT_UNUSED_PARAMETER(skip_float_uptr);   // only used in MLFloat16 overload
+  ORT_UNUSED_PARAMETER(gamma_float_uptr);  // only used in MLFloat16 overload
+  ORT_UNUSED_PARAMETER(beta_float_uptr);   // only used in MLFloat16 overload
+  ORT_UNUSED_PARAMETER(bias_float_uptr);   // only used in MLFloat16 overload
   ORT_UNUSED_PARAMETER(alloc);
 
   auto offset = task_idx * hidden_size;
@@ -113,10 +113,10 @@ void ComputeJob(
     const MLFloat16* gamma_data,
     const MLFloat16* beta_data,
     const MLFloat16* bias_data,
-    const IAllocatorUniquePtr<float>& skip_fp32,
-    const IAllocatorUniquePtr<float>& gamma_fp32,
-    const IAllocatorUniquePtr<float>& beta_fp32,
-    const IAllocatorUniquePtr<float>& bias_fp32,
+    IAllocatorUniquePtr<float>& skip_float_uptr,
+    IAllocatorUniquePtr<float>& gamma_float_uptr,
+    IAllocatorUniquePtr<float>& beta_float_uptr,
+    IAllocatorUniquePtr<float>& bias_float_uptr,
     ptrdiff_t task_idx,
     int hidden_size,
     int64_t skip_size,
@@ -135,45 +135,39 @@ void ComputeJob(
   float mean_square(0.0f);
 
   const size_t num_elems = static_cast<size_t>(hidden_size);
-  float* float_input = (float*)alloc->Alloc(num_elems * sizeof(float));
-  MlasConvertHalfToFloatBuffer(p_input, float_input, num_elems);
+  IAllocatorUniquePtr<float> input_float_uptr = IAllocator::MakeUniquePtr<float>(alloc, num_elems);
+  MlasConvertHalfToFloatBuffer(p_input, input_float_uptr.get(), num_elems);
 
-  float* float_skip = skip_fp32.get();
-  if (nullptr == float_skip) {
-    float_skip = (float*)alloc->Alloc(num_elems * sizeof(float));
-    MlasConvertHalfToFloatBuffer(p_skip, float_skip, num_elems);
+  if (!skip_float_uptr) {
+    skip_float_uptr = IAllocator::MakeUniquePtr<float>(alloc, num_elems);
+    MlasConvertHalfToFloatBuffer(p_skip, skip_float_uptr.get(), num_elems);
   }
 
-  float* float_bias = nullptr;
-  if (bias_data) {
-    if (nullptr != bias_fp32) {
-      float_bias = bias_fp32.get();
-    } else {
-      float_bias = (float*)alloc->Alloc(num_elems * sizeof(float));
-      MlasConvertHalfToFloatBuffer(bias_data, float_bias, num_elems);
-    }
+  if (bias_data && !bias_float_uptr) {
+    bias_float_uptr = IAllocator::MakeUniquePtr<float>(alloc, num_elems);
+    MlasConvertHalfToFloatBuffer(bias_data, bias_float_uptr.get(), num_elems);
   }
 
-  float* float_output = (float*)alloc->Alloc(num_elems * sizeof(float));
+  IAllocatorUniquePtr<float> output_float_uptr = IAllocator::MakeUniquePtr<float>(alloc, num_elems);
+  float* output_float_ptr = output_float_uptr.get();
+  const float* input_float_ptr = input_float_uptr.get();
+  const float* skip_float_ptr = skip_float_uptr.get();
+  const float* bias_float_ptr = bias_float_uptr.get();
 
   for (size_t h = 0; h < num_elems; h++) {
-    float val = float_input[h] + float_skip[h];
+    float val = input_float_ptr[h] + skip_float_ptr[h];
 
-    if (nullptr != float_bias) {
-      val += float_bias[h];
+    if (bias_float_uptr) {
+      val += bias_float_ptr[h];
     }
 
-    float_output[h] = val;
+    output_float_ptr[h] = val;
    mean += val;
     mean_square += val * val;
   }
 
-  if (float_bias && (nullptr == bias_fp32)) {
-    delete[] float_bias;
-  }
-
   if (nullptr != p_skip_input_bias_add_output) {
-    MlasConvertFloatToHalfBuffer(float_output, p_skip_input_bias_add_output, num_elems);
+    MlasConvertFloatToHalfBuffer(output_float_ptr, p_skip_input_bias_add_output, num_elems);
   }
 
   mean = mean / hidden_size;
@@ -183,42 +177,29 @@ void ComputeJob(
     mean_square = sqrt(mean_square / hidden_size - mean * mean + epsilon);
   }
 
-  float* float_gamma = gamma_fp32.get();
-  if (nullptr == float_gamma) {
-    float_gamma = float_input;  // overwrite float_input with gamma values, since they have the same size
-    MlasConvertHalfToFloatBuffer(gamma_data, float_gamma, num_elems);
+  if (!gamma_float_uptr) {
+    gamma_float_uptr = std::move(input_float_uptr);  // overwrite input with gamma values, since they have the same size
+    MlasConvertHalfToFloatBuffer(gamma_data, gamma_float_uptr.get(), num_elems);
   }
 
-  float* float_beta = nullptr;
-  if (beta_data) {
-    if (nullptr != beta_fp32) {
-      float_beta = beta_fp32.get();
-    } else {
-      float_beta = (float*)alloc->Alloc(num_elems * sizeof(float));
-      MlasConvertHalfToFloatBuffer(beta_data, float_beta, num_elems);
-    }
+  if (beta_data && !beta_float_uptr) {
+    beta_float_uptr = IAllocator::MakeUniquePtr<float>(alloc, num_elems);
+    MlasConvertHalfToFloatBuffer(beta_data, beta_float_uptr.get(), num_elems);
   }
 
+  const float* gamma_float_ptr = gamma_float_uptr.get();
+  const float* beta_float_ptr = beta_float_uptr.get();
   for (size_t h = 0; h < num_elems; h++) {
     if (simplified) {
-      float_output[h] = float_output[h] / mean_square * float_gamma[h];
-    } else if (nullptr == float_beta) {
-      float_output[h] = (float_output[h] - mean) / mean_square * float_gamma[h];
+      output_float_ptr[h] = output_float_ptr[h] / mean_square * gamma_float_ptr[h];
+    } else if (nullptr == beta_float_uptr) {
+      output_float_ptr[h] = (output_float_ptr[h] - mean) / mean_square * gamma_float_ptr[h];
     } else {
-      float_output[h] = (float_output[h] - mean) / mean_square * float_gamma[h] + float_beta[h];
+      output_float_ptr[h] = (output_float_ptr[h] - mean) / mean_square * gamma_float_ptr[h] + beta_float_ptr[h];
     }
   }
 
-  alloc->Free(float_input);  // also takes care of float_gamma if reused
-  if (float_skip && (nullptr == skip_fp32)) {
-    alloc->Free(float_skip);
-  }
-  if (beta_data && (nullptr == beta_fp32)) {
-    alloc->Free(float_beta);
-  }
-
-  MlasConvertFloatToHalfBuffer(float_output, p_output, num_elems);
-  alloc->Free(float_output);
+  MlasConvertFloatToHalfBuffer(output_float_ptr, p_output, num_elems);
 }
 
 void ConvertMLFloat16ToFloatIfNeeded(const Tensor& tensor, AllocatorPtr alloc, IAllocatorUniquePtr<float>& dest, bool& is_packed) {
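Two details in the hunk above are worth calling out. First, once the mean and variance accumulation no longer needs the fp32 copy of the input, the gamma conversion takes over that buffer via std::move(input_float_uptr) instead of allocating a new one. Second, because gamma, beta, and bias are weights that do not change between calls, the converted fp32 copies are parked in the kernel's (now mutable) members, so conversion happens on the first call only. One point a reviewer may want to double-check: ComputeJob receives those shared members by non-const reference from concurrently running TryBatchParallelFor tasks, so the first-call initialization should be guarded or hoisted out of the parallel region. A small standard-C++ sketch of the lazy-cache idiom, with illustrative names (ToFp32 stands in for MlasConvertHalfToFloatBuffer):

#include <vector>

// Placeholder conversion; the real code converts IEEE fp16 bit patterns.
std::vector<float> ToFp32(const std::vector<unsigned short>& half_bits) {
  return std::vector<float>(half_bits.begin(), half_bits.end());
}

class Kernel {
 public:
  // Compute is const, so the cached buffer must be a mutable member.
  void Compute(const std::vector<unsigned short>& gamma_fp16) const {
    if (gamma_fp32_.empty()) {
      gamma_fp32_ = ToFp32(gamma_fp16);  // converted once, reused by every later call
    }
    // ... normalize using gamma_fp32_ ...
  }

 private:
  mutable std::vector<float> gamma_fp32_;
};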
diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.h b/onnxruntime/contrib_ops/cpu/skip_layer_norm.h
index 5b1cbaf02364b..08e2276c3d9d5 100644
--- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.h
+++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.h
@@ -21,10 +21,10 @@ class SkipLayerNorm final : public OpKernel {
 
  private:
   float epsilon_;
-  IAllocatorUniquePtr<float> skip_fp32_;
-  IAllocatorUniquePtr<float> gamma_fp32_;
-  IAllocatorUniquePtr<float> beta_fp32_;
-  IAllocatorUniquePtr<float> bias_fp32_;
+  mutable IAllocatorUniquePtr<float> skip_fp32_;
+  mutable IAllocatorUniquePtr<float> gamma_fp32_;
+  mutable IAllocatorUniquePtr<float> beta_fp32_;
+  mutable IAllocatorUniquePtr<float> bias_fp32_;
 };
 
 }  // namespace contrib
diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc
index 7917ed4912cda..f73efcddcedd4 100644
--- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc
+++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc
@@ -24,16 +24,16 @@ void ComputeJob(
     const T* bias_data,
     const ptrdiff_t task_idx,
     const int64_t norm_size,
-    const IAllocatorUniquePtr<float>& scale_fp32,
-    const IAllocatorUniquePtr<float>& bias_fp32,
+    IAllocatorUniquePtr<float>& scale_float_uptr,
+    IAllocatorUniquePtr<float>& bias_float_uptr,
     float epsilon,
     bool simplified,
     T* Y_data,
     U* mean_data,
     U* inv_std_dev_data,
     AllocatorPtr alloc) {
-  ORT_UNUSED_PARAMETER(scale_fp32);  // only used in MLFloat16 overload
-  ORT_UNUSED_PARAMETER(bias_fp32);   // only used in MLFloat16 overload
+  ORT_UNUSED_PARAMETER(scale_float_uptr);  // only used in MLFloat16 overload
+  ORT_UNUSED_PARAMETER(bias_float_uptr);   // only used in MLFloat16 overload
   ORT_UNUSED_PARAMETER(alloc);
 
   const T* p_input = X_data + task_idx * norm_size;
@@ -82,8 +82,8 @@ void ComputeJob(
     const MLFloat16* bias_data,
     const ptrdiff_t task_idx,
     const int64_t norm_size,
-    const IAllocatorUniquePtr<float>& scale_fp32,
-    const IAllocatorUniquePtr<float>& bias_fp32,
+    IAllocatorUniquePtr<float>& scale_float_uptr,
+    IAllocatorUniquePtr<float>& bias_float_uptr,
     float epsilon,
     bool simplified,
     MLFloat16* Y_data,
@@ -97,15 +97,17 @@ void ComputeJob(
   float mean_square(0.0f);
 
   const size_t num_elems = static_cast<size_t>(norm_size);
-  float* float_input = (float*)alloc->Alloc(num_elems * sizeof(float));
-  MlasConvertHalfToFloatBuffer(p_input, float_input, num_elems);
+  IAllocatorUniquePtr<float> input_float_uptr = IAllocator::MakeUniquePtr<float>(alloc, num_elems);
+  MlasConvertHalfToFloatBuffer(p_input, input_float_uptr.get(), num_elems);
 
-  float* float_output = (float*)alloc->Alloc(num_elems * sizeof(float));
+  IAllocatorUniquePtr<float> output_float_uptr = IAllocator::MakeUniquePtr<float>(alloc, num_elems);
+  float* output_float_ptr = output_float_uptr.get();
+  const float* input_float_ptr = input_float_uptr.get();
 
   for (size_t h = 0; h < num_elems; h++) {
-    float_output[h] = float_input[h];
-    mean += float_input[h];
-    mean_square += float_input[h] * float_input[h];
+    output_float_ptr[h] = input_float_ptr[h];
+    mean += input_float_ptr[h];
+    mean_square += input_float_ptr[h] * input_float_ptr[h];
   }
 
   mean = mean / norm_size;
@@ -115,39 +117,29 @@ void ComputeJob(
     mean_square = sqrt(mean_square / norm_size - mean * mean + epsilon);
   }
 
-  float* float_scale = scale_fp32.get();
-  if (nullptr == float_scale) {
-    float_scale = float_input;  // overwrite float_input with scale values, since they have the same size
-    MlasConvertHalfToFloatBuffer(scale_data, float_scale, num_elems);
+  if (!scale_float_uptr) {
+    scale_float_uptr = std::move(input_float_uptr);  // overwrite input with scale values, since they have the same size
+    MlasConvertHalfToFloatBuffer(scale_data, scale_float_uptr.get(), num_elems);
   }
 
-  float* float_bias = nullptr;
-  if (bias_data) {
-    if (nullptr != bias_fp32) {
-      float_bias = bias_fp32.get();
-    } else {
-      float_bias = (float*)alloc->Alloc(num_elems * sizeof(float));
-      MlasConvertHalfToFloatBuffer(bias_data, float_bias, num_elems);
-    }
+  if (bias_data && !bias_float_uptr) {
+    bias_float_uptr = IAllocator::MakeUniquePtr<float>(alloc, num_elems);
+    MlasConvertHalfToFloatBuffer(bias_data, bias_float_uptr.get(), num_elems);
   }
 
+  const float* scale_float_ptr = scale_float_uptr.get();
+  const float* bias_float_ptr = bias_float_uptr.get();
   for (size_t h = 0; h < num_elems; h++) {
     if (simplified) {
-      float_output[h] = float_output[h] / mean_square * float_scale[h];
+      output_float_ptr[h] = output_float_ptr[h] / mean_square * scale_float_ptr[h];
     } else if (nullptr == bias_data) {
-      float_output[h] = (float_output[h] - mean) / mean_square * float_scale[h];
+      output_float_ptr[h] = (output_float_ptr[h] - mean) / mean_square * scale_float_ptr[h];
     } else {
-      float_output[h] = (float_output[h] - mean) / mean_square * float_scale[h] + float_bias[h];
+      output_float_ptr[h] = (output_float_ptr[h] - mean) / mean_square * scale_float_ptr[h] + bias_float_ptr[h];
     }
   }
 
-  alloc->Free(float_input);  // also takes care of float_scale if reused
-  if (float_bias && (nullptr == bias_fp32)) {
-    alloc->Free(float_bias);
-  }
-
-  MlasConvertFloatToHalfBuffer(float_output, p_output, num_elems);
-  alloc->Free(float_output);
+  MlasConvertFloatToHalfBuffer(output_float_ptr, p_output, num_elems);
 
   if (mean_data != nullptr) {
     // ONNX spec doesn't support 'double' for 'U' so when 'T' == double, 'U' == float and we need to narrow
diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h
index 9242ea5e6df6e..f6325c31cc71a 100644
--- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h
+++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h
@@ -63,8 +63,8 @@ class LayerNormImpl : public OpKernel {
   float epsilon_;
   const bool simplified_;
   const bool contrib_op_;
-  IAllocatorUniquePtr<float> scale_fp32_;
-  IAllocatorUniquePtr<float> bias_fp32_;
+  mutable IAllocatorUniquePtr<float> scale_fp32_;
+  mutable IAllocatorUniquePtr<float> bias_fp32_;
 };
 
 }  // namespace onnxruntime
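Net effect of the series at a call site, shown as a hedged sketch that assumes onnxruntime's core/framework/allocator.h (the include path as used in-tree); Example, alloc, and n are placeholders, not code from the patch:

#include "core/framework/allocator.h"  // onnxruntime; provides IAllocator, AllocatorPtr, IAllocatorUniquePtr

namespace onnxruntime {
// Post-series idiom inside a kernel: the scratch buffer is scope-bound.
inline void Example(AllocatorPtr alloc, size_t n) {
  // Before patch 36 this was: alloc->Alloc(n * sizeof(float)) paired with alloc->Free(...).
  IAllocatorUniquePtr<float> buf = IAllocator::MakeUniquePtr<float>(alloc, n);
  float* p = buf.get();
  for (size_t i = 0; i < n; ++i) p[i] = 0.0f;  // use the scratch buffer
}  // buf returns its memory to alloc here; no explicit Free needed
}  // namespace onnxruntime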