Skip to content

Commit

Permalink
Add microbenchmark for layer normalization and improve latency (#22223)
Browse files Browse the repository at this point in the history
- Added a microbenchmark for the `LayerNormalization` MLFloat16 support
added in #22063.
- Updated the `LayerNormalization` MLFloat16 implementation to improve
the latency.

```
----------------------------------------------------------------------------------------------
Original MLFloat16 support                                   Time             CPU   Iterations
----------------------------------------------------------------------------------------------
BM_LayerNormalization<MLFloat16, float>/1/real_time      15599 us        15625 us           47
BM_LayerNormalization<MLFloat16, float>/1/real_time      14714 us        14824 us           39
BM_LayerNormalization<MLFloat16, float>/1/real_time      14634 us        14688 us           50


----------------------------------------------------------------------------------------------
Updated MLFloat16 support                                    Time             CPU   Iterations
----------------------------------------------------------------------------------------------
BM_LayerNormalization<MLFloat16, float>/1/real_time       7276 us         7254 us           84
BM_LayerNormalization<MLFloat16, float>/1/real_time       6820 us         6720 us           93
BM_LayerNormalization<MLFloat16, float>/1/real_time       6840 us         6882 us           84
```
  • Loading branch information
amarin16 authored Oct 15, 2024
1 parent 4af593a commit 7d17c46
Show file tree
Hide file tree
Showing 6 changed files with 604 additions and 229 deletions.
3 changes: 2 additions & 1 deletion cmake/onnxruntime_unittests.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -1164,7 +1164,8 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
${BENCHMARK_DIR}/gelu.cc
${BENCHMARK_DIR}/activation.cc
${BENCHMARK_DIR}/quantize.cc
${BENCHMARK_DIR}/reduceminmax.cc)
${BENCHMARK_DIR}/reduceminmax.cc
${BENCHMARK_DIR}/layer_normalization.cc)
target_include_directories(onnxruntime_benchmark PRIVATE ${ONNXRUNTIME_ROOT} ${onnxruntime_graph_header} ${ONNXRUNTIME_ROOT}/core/mlas/inc)
target_compile_definitions(onnxruntime_benchmark PRIVATE BENCHMARK_STATIC_DEFINE)
if(WIN32)
Expand Down
299 changes: 202 additions & 97 deletions onnxruntime/contrib_ops/cpu/skip_layer_norm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
// Licensed under the MIT License.

#include "core/framework/tensor.h"
#include "core/mlas/inc/mlas.h"
#include "core/util/math_cpuonly.h"
#include "core/providers/common.h"
#include "core/platform/threadpool.h"
Expand Down Expand Up @@ -36,52 +37,188 @@ REGISTER_KERNEL_TYPED(float)
REGISTER_KERNEL_TYPED(double)
REGISTER_KERNEL_TYPED(MLFloat16)

// Utility to convert from MLFloat16 to float only when the input type is MLFloat16.
template <typename T, typename Ret>
ORT_FORCEINLINE Ret ConvertMLFloat16ToDoubleOrFloatIfNeeded(T val);

template <>
ORT_FORCEINLINE float ConvertMLFloat16ToDoubleOrFloatIfNeeded<MLFloat16, float>(MLFloat16 val) {
return val.ToFloat();
}

template <>
ORT_FORCEINLINE double ConvertMLFloat16ToDoubleOrFloatIfNeeded<MLFloat16, double>(MLFloat16 val) {
return static_cast<double>(ConvertMLFloat16ToDoubleOrFloatIfNeeded<MLFloat16, float>(val));
namespace {

template <typename T, typename = std::enable_if_t<std::is_same_v<T, float> || std::is_same_v<T, double>, void>>
void ComputeJob(
const T* input_data,
const T* skip_data,
const T* gamma_data,
const T* beta_data,
const T* bias_data,
IAllocatorUniquePtr<float>& skip_float_uptr,
IAllocatorUniquePtr<float>& gamma_float_uptr,
IAllocatorUniquePtr<float>& beta_float_uptr,
IAllocatorUniquePtr<float>& bias_float_uptr,
ptrdiff_t task_idx,
int hidden_size,
int64_t skip_size,
float epsilon,
bool simplified,
T* output_data,
T* skip_input_bias_add_output_data,
AllocatorPtr alloc) {
ORT_UNUSED_PARAMETER(skip_float_uptr); // only used in MLFloat16 overload
ORT_UNUSED_PARAMETER(gamma_float_uptr); // only used in MLFloat16 overload
ORT_UNUSED_PARAMETER(beta_float_uptr); // only used in MLFloat16 overload
ORT_UNUSED_PARAMETER(bias_float_uptr); // only used in MLFloat16 overload
ORT_UNUSED_PARAMETER(alloc);

auto offset = task_idx * hidden_size;
const T* p_input = input_data + offset;
const T* p_skip = skip_data + (offset % skip_size);
T* p_output = output_data + offset;
T* p_skip_input_bias_add_output = skip_input_bias_add_output_data == nullptr ? nullptr : skip_input_bias_add_output_data + offset;

T mean(0.0f);
T mean_square(0.0f);

for (decltype(hidden_size) h = 0; h < hidden_size; h++) {
T val = p_input[h] + p_skip[h];

if (nullptr != bias_data) {
val += bias_data[h];
}

if (nullptr != p_skip_input_bias_add_output) {
p_skip_input_bias_add_output[h] = val;
}

p_output[h] = val;
mean += val;
mean_square += val * val;
}

mean = mean / hidden_size;
if (simplified) {
mean_square = sqrt(mean_square / hidden_size + epsilon);
} else {
mean_square = sqrt(mean_square / hidden_size - mean * mean + epsilon);
}

for (decltype(hidden_size) h = 0; h < hidden_size; h++) {
if (simplified) {
p_output[h] = p_output[h] / mean_square * gamma_data[h];
} else if (nullptr == beta_data) {
p_output[h] = (p_output[h] - mean) / mean_square * gamma_data[h];
} else {
p_output[h] = (p_output[h] - mean) / mean_square * gamma_data[h] + beta_data[h];
}
}
}

template <>
ORT_FORCEINLINE constexpr float ConvertMLFloat16ToDoubleOrFloatIfNeeded<float, float>(float val) {
return val;
void ComputeJob(
const MLFloat16* input_data,
const MLFloat16* skip_data,
const MLFloat16* gamma_data,
const MLFloat16* beta_data,
const MLFloat16* bias_data,
IAllocatorUniquePtr<float>& skip_float_uptr,
IAllocatorUniquePtr<float>& gamma_float_uptr,
IAllocatorUniquePtr<float>& beta_float_uptr,
IAllocatorUniquePtr<float>& bias_float_uptr,
ptrdiff_t task_idx,
int hidden_size,
int64_t skip_size,
float epsilon,
bool simplified,
MLFloat16* output_data,
MLFloat16* skip_input_bias_add_output_data,
AllocatorPtr alloc) {
auto offset = task_idx * hidden_size;
const MLFloat16* p_input = input_data + offset;
const MLFloat16* p_skip = skip_data + (offset % skip_size);
MLFloat16* p_output = output_data + offset;
MLFloat16* p_skip_input_bias_add_output = skip_input_bias_add_output_data == nullptr ? nullptr : skip_input_bias_add_output_data + offset;

float mean(0.0f);
float mean_square(0.0f);
const size_t num_elems = static_cast<size_t>(hidden_size);

IAllocatorUniquePtr<float> input_float_uptr = IAllocator::MakeUniquePtr<float>(alloc, num_elems);
MlasConvertHalfToFloatBuffer(p_input, input_float_uptr.get(), num_elems);

if (!skip_float_uptr) {
skip_float_uptr = IAllocator::MakeUniquePtr<float>(alloc, num_elems);
MlasConvertHalfToFloatBuffer(p_skip, skip_float_uptr.get(), num_elems);
}

if (bias_data && !bias_float_uptr) {
bias_float_uptr = IAllocator::MakeUniquePtr<float>(alloc, num_elems);
MlasConvertHalfToFloatBuffer(bias_data, bias_float_uptr.get(), num_elems);
}

IAllocatorUniquePtr<float> output_float_uptr = IAllocator::MakeUniquePtr<float>(alloc, num_elems);
float* output_float_ptr = output_float_uptr.get();

const float* input_float_ptr = input_float_uptr.get();
const float* skip_float_ptr = skip_float_uptr.get();
const float* bias_float_ptr = bias_float_uptr.get();
for (size_t h = 0; h < num_elems; h++) {
float val = input_float_ptr[h] + skip_float_ptr[h];

if (bias_float_uptr) {
val += bias_float_ptr[h];
}

output_float_ptr[h] = val;
mean += val;
mean_square += val * val;
}

if (nullptr != p_skip_input_bias_add_output) {
MlasConvertFloatToHalfBuffer(output_float_ptr, p_skip_input_bias_add_output, num_elems);
}

mean = mean / hidden_size;
if (simplified) {
mean_square = sqrt(mean_square / hidden_size + epsilon);
} else {
mean_square = sqrt(mean_square / hidden_size - mean * mean + epsilon);
}

if (!gamma_float_uptr) {
gamma_float_uptr = std::move(input_float_uptr); // overwrite input with gamma values, since they have the same size
MlasConvertHalfToFloatBuffer(gamma_data, gamma_float_uptr.get(), num_elems);
}

if (beta_data && !beta_float_uptr) {
beta_float_uptr = IAllocator::MakeUniquePtr<float>(alloc, num_elems);
MlasConvertHalfToFloatBuffer(beta_data, beta_float_uptr.get(), num_elems);
}

const float* gamma_float_ptr = gamma_float_uptr.get();
const float* beta_float_ptr = beta_float_uptr.get();
for (size_t h = 0; h < num_elems; h++) {
if (simplified) {
output_float_ptr[h] = output_float_ptr[h] / mean_square * gamma_float_ptr[h];
} else if (nullptr == beta_float_uptr) {
output_float_ptr[h] = (output_float_ptr[h] - mean) / mean_square * gamma_float_ptr[h];
} else {
output_float_ptr[h] = (output_float_ptr[h] - mean) / mean_square * gamma_float_ptr[h] + beta_float_ptr[h];
}
}

MlasConvertFloatToHalfBuffer(output_float_ptr, p_output, num_elems);
}

template <>
ORT_FORCEINLINE constexpr double ConvertMLFloat16ToDoubleOrFloatIfNeeded<double, double>(double val) {
return val;
}
void ConvertMLFloat16ToFloatIfNeeded(const Tensor& tensor, AllocatorPtr alloc, IAllocatorUniquePtr<float>& dest, bool& is_packed) {
if (tensor.GetElementType() == utils::ToTensorProtoElementType<MLFloat16>()) {
auto tensor_data_ptr = tensor.Data<MLFloat16>();
auto tensor_size = static_cast<size_t>(tensor.Shape().Size());
auto float_ptr = IAllocator::MakeUniquePtr<float>(alloc, tensor_size, true);

// Function template that only converts the input value to MLFloat16 if T is MLFloat16.
template <typename T>
ORT_FORCEINLINE constexpr typename std::enable_if_t<std::is_same_v<T, float> || std::is_same_v<T, double>, T>
ConvertDoubleOrFloatToMLFloat16IfNeeded(T val) {
return val;
MlasConvertHalfToFloatBuffer(tensor_data_ptr, float_ptr.get(), tensor_size);
dest = std::move(float_ptr);
is_packed = true;
}
}

template <typename T>
ORT_FORCEINLINE constexpr typename std::enable_if_t<std::is_same_v<T, MLFloat16>, T>
ConvertDoubleOrFloatToMLFloat16IfNeeded(float val) {
return MLFloat16(val);
}

template <typename T>
ORT_FORCEINLINE constexpr typename std::enable_if_t<std::is_same_v<T, MLFloat16>, T>
ConvertDoubleOrFloatToMLFloat16IfNeeded(double val) {
return MLFloat16(static_cast<float>(val));
}
} // namespace

template <typename T, bool simplified>
SkipLayerNorm<T, simplified>::SkipLayerNorm(const OpKernelInfo& op_kernel_info)
: OpKernel(op_kernel_info) {
: OpKernel(op_kernel_info), skip_fp32_(nullptr), gamma_fp32_(nullptr), beta_fp32_(nullptr), bias_fp32_(nullptr) {
ORT_ENFORCE(op_kernel_info.GetAttr<float>("epsilon", &epsilon_).IsOK());
ORT_ENFORCE(epsilon_ >= 0);
}
Expand All @@ -94,8 +231,7 @@ Status SkipLayerNorm<T, simplified>::Compute(OpKernelContext* p_ctx) const {
const Tensor* beta = p_ctx->Input<Tensor>(3);
const Tensor* bias = p_ctx->Input<Tensor>(4);
Tensor* output = p_ctx->Output(0, input->Shape());
// For inferencing, we support one more optional output which is the sum
// of the input and skip tensors
// For inferencing, we support one more optional output which is the sum of the input and skip tensors
Tensor* skip_input_bias_add_output = p_ctx->Output(3, input->Shape());

const auto& input_dims = input->Shape().GetDims();
Expand All @@ -120,75 +256,44 @@ Status SkipLayerNorm<T, simplified>::Compute(OpKernelContext* p_ctx) const {

T* output_data = output->MutableData<T>();

// For inferencing, we support one more optional output which is the sum
// of the input and skip tensors
T* skip_input_bias_add_output_data = skip_input_bias_add_output != nullptr ? skip_input_bias_add_output->MutableData<T>() : nullptr;
// For inferencing, we support one more optional output which is the sum of the input and skip tensors
T* skip_input_bias_add_output_data = skip_input_bias_add_output == nullptr ? nullptr : skip_input_bias_add_output->MutableData<T>();

const auto& skip_size = skip->Shape().Size();
const int64_t& skip_size = skip->Shape().Size();

AllocatorPtr alloc;
ORT_RETURN_IF_ERROR(p_ctx->GetTempSpaceAllocator(&alloc));

concurrency::ThreadPool::TryBatchParallelFor(
p_ctx->GetOperatorThreadPool(), static_cast<int32_t>(task_count),
[&](ptrdiff_t task_idx) {
auto offset = task_idx * hidden_size;

const T* p_input = input_data + offset;
const T* p_skip = skip_data + (offset % skip_size);
T* p_output = output_data + offset;
T* p_skip_input_bias_add_output_data = skip_input_bias_add_output_data != nullptr ? skip_input_bias_add_output_data + offset : nullptr;

using DoubleOrFloat = typename std::conditional<
std::is_same<T, double>::value, // If T is double
double, // Use double
float // Otherwise, use float (covers float and MLFloat16)
>::type;

DoubleOrFloat mean(0.0f);
DoubleOrFloat mean_square(0.0f);

std::unique_ptr<DoubleOrFloat[]> output_buffer = std::make_unique<DoubleOrFloat[]>(hidden_size);
for (size_t h = 0; h < static_cast<size_t>(hidden_size); h++) {
DoubleOrFloat input_value = ConvertMLFloat16ToDoubleOrFloatIfNeeded<T, DoubleOrFloat>(p_input[h]);
DoubleOrFloat skip_value = ConvertMLFloat16ToDoubleOrFloatIfNeeded<T, DoubleOrFloat>(p_skip[h]);

DoubleOrFloat value = input_value + skip_value;

if (nullptr != bias_data) {
value += ConvertMLFloat16ToDoubleOrFloatIfNeeded<T, DoubleOrFloat>(bias_data[h]);
}

output_buffer[h] = value;
T converted_value = ConvertDoubleOrFloatToMLFloat16IfNeeded<T>(value);
if (nullptr != p_skip_input_bias_add_output_data) {
p_skip_input_bias_add_output_data[h] = converted_value;
}

mean += value;
mean_square += value * value;
}

mean = mean / hidden_size;
if (simplified) {
mean_square = sqrt(mean_square / hidden_size + epsilon_);
} else {
mean_square = sqrt(mean_square / hidden_size - mean * mean + epsilon_);
}

for (size_t h = 0; h < static_cast<size_t>(hidden_size); h++) {
DoubleOrFloat gamma_value = ConvertMLFloat16ToDoubleOrFloatIfNeeded<T, DoubleOrFloat>(gamma_data[h]);
if (simplified) {
p_output[h] = ConvertDoubleOrFloatToMLFloat16IfNeeded<T>(output_buffer[h] / mean_square * gamma_value);
} else if (nullptr == beta_data) {
p_output[h] = ConvertDoubleOrFloatToMLFloat16IfNeeded<T>((output_buffer[h] - mean) / mean_square * gamma_value);
} else {
DoubleOrFloat beta_value = ConvertMLFloat16ToDoubleOrFloatIfNeeded<T, DoubleOrFloat>(beta_data[h]);
p_output[h] = ConvertDoubleOrFloatToMLFloat16IfNeeded<T>((output_buffer[h] - mean) / mean_square * gamma_value + beta_value);
}
}
ComputeJob(input_data, skip_data, gamma_data, beta_data, bias_data, skip_fp32_, gamma_fp32_, beta_fp32_,
bias_fp32_, task_idx, hidden_size, skip_size, epsilon_, simplified, output_data,
skip_input_bias_add_output_data, alloc);
},
0);

return Status::OK();
}

template <typename T, bool simplified>
Status SkipLayerNorm<T, simplified>::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
bool& is_packed, PrePackedWeights* prepacked_weights) {
ORT_UNUSED_PARAMETER(prepacked_weights);

is_packed = false;
if (input_idx == 1) { // skip
ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, skip_fp32_, is_packed);
} else if (input_idx == 2) { // gamma
ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, gamma_fp32_, is_packed);
} else if (input_idx == 3) { // beta
ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, beta_fp32_, is_packed);
} else if (input_idx == 4) { // bias
ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, bias_fp32_, is_packed);
}

return Status::OK();
}

} // namespace contrib
} // namespace onnxruntime
7 changes: 7 additions & 0 deletions onnxruntime/contrib_ops/cpu/skip_layer_norm.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,15 @@ class SkipLayerNorm final : public OpKernel {
SkipLayerNorm(const OpKernelInfo& op_kernel_info);
Status Compute(OpKernelContext* p_op_kernel_context) const override;

Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
bool& is_packed, PrePackedWeights* prepacked_weights) override;

private:
float epsilon_;
mutable IAllocatorUniquePtr<float> skip_fp32_;
mutable IAllocatorUniquePtr<float> gamma_fp32_;
mutable IAllocatorUniquePtr<float> beta_fp32_;
mutable IAllocatorUniquePtr<float> bias_fp32_;
};

} // namespace contrib
Expand Down
Loading

0 comments on commit 7d17c46

Please sign in to comment.