fix LayerNorm f16 CPU implementation #22479

Merged · 5 commits · Oct 18, 2024
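Summary (inferred from the diff below): the fp16 CPU LayerNorm path used to convert the fp16 scale/bias tensors to float lazily inside `ComputeJob` and cache the result in mutable member buffers (`scale_fp32_` / `bias_fp32_`) shared across threads and across Compute calls. This change removes that caching: `PrePack` now stores the converted data plus its element count in dedicated `prepacked_*_fp32_data_` / `_size_` members, non-prepacked fp16 scale/bias are converted once per call in `ComputeWithoutContext` before the parallel loop, and `ComputeJob` receives plain `const float*` pointers. `ComputeWithoutContext` now takes element counts (`scale_size` / `bias_size`) instead of `TensorShape`s so it also works when the tensors were consumed by `PrePack`, and new tests cover the fp16 initializer (prepacked) path.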
82 changes: 48 additions & 34 deletions onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc
@@ -24,16 +24,16 @@
const T* bias_data,
const ptrdiff_t task_idx,
const int64_t norm_size,
IAllocatorUniquePtr<float>& scale_float_uptr,
IAllocatorUniquePtr<float>& bias_float_uptr,
const float* scale_float_ptr,
const float* bias_float_ptr,
float epsilon,
bool simplified,
T* Y_data,
U* mean_data,
U* inv_std_dev_data,
AllocatorPtr alloc) {
ORT_UNUSED_PARAMETER(scale_float_uptr); // only used in MLFloat16 overload
ORT_UNUSED_PARAMETER(bias_float_uptr); // only used in MLFloat16 overload
ORT_UNUSED_PARAMETER(scale_float_ptr); // only used in MLFloat16 overload
ORT_UNUSED_PARAMETER(bias_float_ptr); // only used in MLFloat16 overload
ORT_UNUSED_PARAMETER(alloc);

const T* p_input = X_data + task_idx * norm_size;
@@ -82,14 +82,17 @@
const MLFloat16* bias_data,
const ptrdiff_t task_idx,
const int64_t norm_size,
IAllocatorUniquePtr<float>& scale_float_uptr,
IAllocatorUniquePtr<float>& bias_float_uptr,
const float* scale_float_ptr,
const float* bias_float_ptr,
float epsilon,
bool simplified,
MLFloat16* Y_data,
U* mean_data,
U* inv_std_dev_data,
AllocatorPtr alloc) {
ORT_UNUSED_PARAMETER(scale_data); // only used in float/double overload
ORT_UNUSED_PARAMETER(bias_data); // only used in float/double overload

const MLFloat16* p_input = X_data + task_idx * norm_size;
MLFloat16* p_output = Y_data + task_idx * norm_size;

@@ -117,22 +120,10 @@
mean_square = sqrt(mean_square / norm_size - mean * mean + epsilon);
}

if (!scale_float_uptr) {
scale_float_uptr = std::move(input_float_uptr); // overwrite input with scale values, since they have the same size
MlasConvertHalfToFloatBuffer(scale_data, scale_float_uptr.get(), num_elems);
}

if (bias_data && !bias_float_uptr) {
bias_float_uptr = IAllocator::MakeUniquePtr<float>(alloc, num_elems);
MlasConvertHalfToFloatBuffer(bias_data, bias_float_uptr.get(), num_elems);
}

const float* scale_float_ptr = scale_float_uptr.get();
const float* bias_float_ptr = bias_float_uptr.get();
for (size_t h = 0; h < num_elems; h++) {
if (simplified) {
output_float_ptr[h] = output_float_ptr[h] / mean_square * scale_float_ptr[h];
} else if (nullptr == bias_data) {
} else if (nullptr == bias_float_ptr) {
output_float_ptr[h] = (output_float_ptr[h] - mean) / mean_square * scale_float_ptr[h];
} else {
output_float_ptr[h] = (output_float_ptr[h] - mean) / mean_square * scale_float_ptr[h] + bias_float_ptr[h];
@@ -166,7 +157,13 @@
} // namespace

LayerNormImpl::LayerNormImpl(const OpKernelInfo& op_kernel_info, bool simplified, bool contrib_op)
: OpKernel(op_kernel_info), simplified_{simplified}, contrib_op_{contrib_op}, scale_fp32_(nullptr), bias_fp32_(nullptr) {
: OpKernel(op_kernel_info),
simplified_{simplified},
contrib_op_{contrib_op},
prepacked_scale_fp32_data_(nullptr),
prepacked_scale_fp32_size_(0),
prepacked_bias_fp32_data_(nullptr),
prepacked_bias_fp32_size_(0) {
ORT_ENFORCE(op_kernel_info.GetAttr("axis", &axis_).IsOK());
ORT_ENFORCE(op_kernel_info.GetAttr<float>("epsilon", &epsilon_).IsOK());
}
@@ -175,15 +172,15 @@
Status LayerNormImpl::ComputeImpl(OpKernelContext* p_ctx, int64_t orig_axis, float epsilon, bool simplified) const {
// Inputs
const Tensor* X = p_ctx->Input<Tensor>(0);
const Tensor* scale = p_ctx->Input<Tensor>(1);
const Tensor* bias = p_ctx->Input<Tensor>(2);
const Tensor* scale = prepacked_scale_fp32_data_ ? nullptr : p_ctx->Input<Tensor>(1);
const Tensor* bias = prepacked_bias_fp32_data_ ? nullptr : p_ctx->Input<Tensor>(2);
const T* X_data = X->Data<T>();
const T* scale_data = scale->Data<T>();
const T* scale_data = scale ? scale->Data<T>() : nullptr;
const T* bias_data = (simplified || nullptr == bias) ? nullptr : bias->Data<T>();

const TensorShape& x_shape = X->Shape();
const TensorShape& scale_shape = scale->Shape();
const TensorShape& bias_shape = bias->Shape();
size_t scale_size = scale ? static_cast<size_t>(scale->Shape().Size()) : prepacked_scale_fp32_size_;
size_t bias_size = bias ? static_cast<size_t>(bias->Shape().Size()) : prepacked_bias_fp32_size_;
Tensor* Y = p_ctx->Output(0, x_shape);
T* Y_data = Y->MutableData<T>();

@@ -218,7 +215,7 @@

AllocatorPtr alloc;
ORT_RETURN_IF_ERROR(p_ctx->GetTempSpaceAllocator(&alloc));
return ComputeWithoutContext<T, U>(X_data, x_shape, scale_data, scale_shape, bias_data, bias_shape, Y_data, mean_data,
return ComputeWithoutContext<T, U>(X_data, x_shape, scale_data, scale_size, bias_data, bias_size, Y_data, mean_data,
inv_std_dev_data, thread_pool, axis, epsilon, simplified, alloc);
}

@@ -237,9 +234,11 @@

is_packed = false;
if (input_idx == 1) { // scale
ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, scale_fp32_, is_packed);
prepacked_scale_fp32_size_ = static_cast<size_t>(tensor.Shape().Size());
ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, prepacked_scale_fp32_data_, is_packed);
} else if (input_idx == 2) { // bias
ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, bias_fp32_, is_packed);
prepacked_bias_fp32_size_ = static_cast<size_t>(tensor.Shape().Size());
ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, prepacked_bias_fp32_data_, is_packed);
}

return Status::OK();
@@ -250,9 +249,9 @@
const T* X_data,
const TensorShape& x_shape,
const T* scale_data,
const TensorShape& scale_shape,
size_t scale_size,
const T* bias_data,
const TensorShape& bias_shape,
size_t bias_size,
T* Y_data,
U* mean_data,
U* inv_std_dev_data,
@@ -264,19 +263,34 @@
int64_t norm_count = x_shape.SizeToDimension(onnxruntime::narrow<size_t>(axis));
int64_t norm_size = x_shape.SizeFromDimension(onnxruntime::narrow<size_t>(axis));

const auto scale_size = scale_shape.Size();
const auto bias_size = (bias_data) ? bias_shape.Size() : 0;
if (scale_size != norm_size || (bias_data && bias_size != norm_size)) {
if (static_cast<int64_t>(scale_size) != norm_size || (bias_data && static_cast<int64_t>(bias_size) != norm_size)) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
"Size of X.shape()[axis:] == ", norm_size,
". Size of scale and bias (if provided) must match this. Got scale size of ",
scale_size, " and bias size of ", bias_size);
}

IAllocatorUniquePtr<float> scale_fp32;
IAllocatorUniquePtr<float> bias_fp32;
if constexpr (std::is_same_v<T, MLFloat16>) {
if (prepacked_scale_fp32_data_ == nullptr) {
const size_t num_elems = static_cast<size_t>(norm_size);
scale_fp32 = IAllocator::MakeUniquePtr<float>(alloc, num_elems);
MlasConvertHalfToFloatBuffer(scale_data, scale_fp32.get(), num_elems);
}
if (prepacked_bias_fp32_data_ == nullptr && bias_data) {
const size_t num_elems = static_cast<size_t>(norm_size);
bias_fp32 = IAllocator::MakeUniquePtr<float>(alloc, num_elems);
MlasConvertHalfToFloatBuffer(bias_data, bias_fp32.get(), num_elems);
}
}

concurrency::ThreadPool::TryBatchParallelFor(
thread_pool, static_cast<int32_t>(norm_count),
[&](ptrdiff_t task_idx) {
ComputeJob(X_data, scale_data, bias_data, task_idx, norm_size, scale_fp32_, bias_fp32_,
ComputeJob(X_data, scale_data, bias_data, task_idx, norm_size,
prepacked_scale_fp32_data_ ? prepacked_scale_fp32_data_.get() : scale_fp32.get(),
prepacked_bias_fp32_data_ ? prepacked_bias_fp32_data_.get() : bias_fp32.get(),
epsilon, simplified, Y_data, mean_data, inv_std_dev_data, alloc);
},
0);
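For reference, here is a minimal standalone sketch of the per-row math `ComputeJob` performs once input, scale and bias are in float. The accumulation of `mean` and `mean_square` is not visible in the hunks above, so that part is reconstructed and should be treated as an assumption; the function name and signature are illustrative, not ONNX Runtime API.

```cpp
#include <cmath>
#include <cstddef>

// One row of LayerNorm over n elements, all in float. Mirrors the loop shown
// in the fp16 ComputeJob above: `simplified` selects the RMSNorm-style variant
// (no mean subtraction, no bias), and a null bias pointer means "no bias".
void LayerNormRow(const float* x, const float* scale, const float* bias,
                  std::size_t n, float epsilon, bool simplified, float* y) {
  float mean = 0.0f;
  float mean_square = 0.0f;
  for (std::size_t h = 0; h < n; ++h) {
    mean += x[h];
    mean_square += x[h] * x[h];
  }
  mean /= static_cast<float>(n);
  if (simplified) {
    mean_square = std::sqrt(mean_square / static_cast<float>(n) + epsilon);
  } else {
    mean_square = std::sqrt(mean_square / static_cast<float>(n) - mean * mean + epsilon);
  }
  for (std::size_t h = 0; h < n; ++h) {
    if (simplified) {
      y[h] = x[h] / mean_square * scale[h];
    } else if (bias == nullptr) {
      y[h] = (x[h] - mean) / mean_square * scale[h];
    } else {
      y[h] = (x[h] - mean) / mean_square * scale[h] + bias[h];
    }
  }
}
```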
10 changes: 6 additions & 4 deletions onnxruntime/core/providers/cpu/nn/layer_norm_impl.h
@@ -24,9 +24,9 @@ class LayerNormImpl : public OpKernel {
const T* X_data,
const TensorShape& x_shape,
const T* scale_data,
const TensorShape& scale_shape,
size_t scale_size,
const T* bias_data,
const TensorShape& bias_shape,
size_t bias_size,
T* Y_data,
U* mean_data,
U* inv_std_dev,
@@ -63,8 +63,10 @@ class LayerNormImpl : public OpKernel {
float epsilon_;
const bool simplified_;
const bool contrib_op_;
mutable IAllocatorUniquePtr<float> scale_fp32_;
mutable IAllocatorUniquePtr<float> bias_fp32_;
IAllocatorUniquePtr<float> prepacked_scale_fp32_data_;
size_t prepacked_scale_fp32_size_;
IAllocatorUniquePtr<float> prepacked_bias_fp32_data_;
size_t prepacked_bias_fp32_size_;
};

} // namespace onnxruntime
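The header now keeps both the converted data and its element count for each prepacked tensor. Below is a condensed sketch of how the fp16 path chooses between the prepacked buffer and a freshly converted one. It reuses only types and helpers that appear in the diff above, so it is illustrative rather than compilable on its own; `ResolveFp32` is a hypothetical helper, not an ONNX Runtime function.

```cpp
// Hypothetical helper mirroring the new logic in ComputeWithoutContext: prefer
// the float buffer produced at PrePack time; otherwise convert the fp16
// runtime tensor once, before the parallel loop, and hand ComputeJob a plain
// const float* either way.
static const float* ResolveFp32(const IAllocatorUniquePtr<float>& prepacked_fp32,  // filled by PrePack, may be empty
                                const MLFloat16* runtime_fp16,                     // graph input, may be null
                                size_t num_elems,
                                AllocatorPtr alloc,
                                IAllocatorUniquePtr<float>& scratch) {
  if (prepacked_fp32) {
    return prepacked_fp32.get();
  }
  if (runtime_fp16 == nullptr) {
    return nullptr;  // optional bias that was not provided
  }
  scratch = IAllocator::MakeUniquePtr<float>(alloc, num_elems);
  MlasConvertHalfToFloatBuffer(runtime_fp16, scratch.get(), num_elems);
  return scratch.get();
}
```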
29 changes: 29 additions & 0 deletions onnxruntime/test/contrib_ops/layer_norm_op_test.cc
@@ -151,6 +151,20 @@ TEST(LayerNormTest, LayerNorm_Scale_Float16InputScaleOutput) {
kNnapiExecutionProvider, kQnnExecutionProvider, kCoreMLExecutionProvider});
}

TEST(LayerNormTest, LayerNorm_Scale_Float16InputScaleOutput_Initializers) {
OpTester test("LayerNormalization");
test.AddAttribute<float>("epsilon", 1e-05f);

std::vector<int64_t> dims{2, 2, 2};
test.AddInput<MLFloat16>("x", dims, ToFloat16({-10.264f, 8.6453f, 43.1561f, -0.641239f, -8.2164f, 0.11412f, 41.3156f, 3.0458f}));
test.AddInput<MLFloat16>("gamma", {2}, ToFloat16({-0.6953f, 5.1824f}), true);
test.AddOutput<MLFloat16>("output", dims, ToFloat16({0.6953f, 5.1824f, -0.6953f, -5.1824f, 0.6953f, 5.1824f, -0.6953f, -5.1824f}));
// TRT, DNNL, OpenVINO and NNAPI, CoreML don't support this combination of datatypes
test.Run(OpTester::ExpectResult::kExpectSuccess, "",
{kTensorrtExecutionProvider, kDnnlExecutionProvider, kOpenVINOExecutionProvider,
kNnapiExecutionProvider, kQnnExecutionProvider, kCoreMLExecutionProvider});
}

TEST(LayerNormTest, LayerNorm_Scale_Bias) {
OpTester test("LayerNormalization");
test.AddAttribute<float>("epsilon", 1e-05f);
@@ -211,6 +225,21 @@ TEST(LayerNormTest, LayerNorm_Scale_Bias_Float16InputScaleBiasOutput) {
kNnapiExecutionProvider, kQnnExecutionProvider, kCoreMLExecutionProvider});
}

TEST(LayerNormTest, LayerNorm_Scale_Bias_Float16InputScaleBiasOutput_Initializers) {
OpTester test("LayerNormalization");
test.AddAttribute<float>("epsilon", 1e-05f);

std::vector<int64_t> dims{1, 3, 2};
test.AddInput<MLFloat16>("x", dims, ToFloat16({1.2416f, 0.946123f, 13.1685f, 0.36423f, 21.145f, 0.03941f}));
test.AddInput<MLFloat16>("gamma", {2}, ToFloat16({-0.6953f, 5.1824f}), true);
test.AddInput<MLFloat16>("bias", {2}, ToFloat16({0.6435f, -0.3964f}), true);
test.AddOutput<MLFloat16>("output", dims, ToFloat16({-0.0516f, -5.5776f, -0.0518f, -5.5788f, -0.0518f, -5.5788f}));
// TRT, DNNL, OpenVINO and NNAPI, CoreML don't support this combination of datatypes
test.Run(OpTester::ExpectResult::kExpectSuccess, "",
{kTensorrtExecutionProvider, kDnnlExecutionProvider, kOpenVINOExecutionProvider,
kNnapiExecutionProvider, kQnnExecutionProvider, kCoreMLExecutionProvider});
}

// LayerNormalization became an ONNX operator in opset 17. It uses the same implementation so this is a sanity check.
TEST(LayerNormTest, LayerNorm17_float) {
OpTester test("LayerNormalization", 17);
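Note on the two new `*_Initializers` tests: they differ from the existing fp16 tests mainly in the trailing `true` passed to `AddInput` for gamma/bias which, by the usual OpTester convention, marks those inputs as constant initializers; that routes them through the kernel's `PrePack` override and exercises the new `prepacked_*_fp32_*` members.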
17 changes: 14 additions & 3 deletions onnxruntime/test/onnx/microbenchmark/layer_normalization.cc
@@ -111,9 +111,20 @@ static void BM_LayerNormalization(benchmark::State& state) {
OrtMemoryInfo memory_info(onnxruntime::CPU, OrtAllocatorType::OrtArenaAllocator);
AllocatorPtr alloc = std::make_shared<CPUAllocator>(memory_info);
for (auto _ : state) {
auto status = layer_norm_impl.ComputeWithoutContext(x_data, x_shape, scale_data, scale_shape, bias_data, bias_shape,
Y_data, mean_data, inv_std_dev_data, thread_pool.get(), axis,
epsilon, simplified, alloc);
auto status = layer_norm_impl.ComputeWithoutContext(x_data,
x_shape,
scale_data,
static_cast<size_t>(scale_shape.Size()),
bias_data,
static_cast<size_t>(bias_shape.Size()),
Y_data,
mean_data,
inv_std_dev_data,
thread_pool.get(),
axis,
epsilon,
simplified,
alloc);
if (!status.IsOK()) {
std::cout << "ComputeWithoutContext status not OK: " << status.ErrorMessage() << std::endl;
break;