fix LayerNorm f16 CPU implementation #22479

Merged · 5 commits · Oct 18, 2024

What the diff below shows: the fp16 CPU LayerNorm path used to convert the scale and bias initializers to fp32 lazily inside the per-task ComputeJob, mutating shared IAllocatorUniquePtr members from thread-pool workers. The fix performs that conversion once in ComputeWithoutContext, guarded by a new mutex, and hands the workers plain const float* pointers. Independently, bias_shape becomes a nullable const TensorShape* so an absent bias is no longer dereferenced.
54 changes: 31 additions & 23 deletions onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc
@@ -24,16 +24,16 @@
     const T* bias_data,
     const ptrdiff_t task_idx,
     const int64_t norm_size,
-    IAllocatorUniquePtr<float>& scale_float_uptr,
-    IAllocatorUniquePtr<float>& bias_float_uptr,
+    const float* scale_float_ptr,
+    const float* bias_float_ptr,
     float epsilon,
     bool simplified,
     T* Y_data,
     U* mean_data,
     U* inv_std_dev_data,
     AllocatorPtr alloc) {
-  ORT_UNUSED_PARAMETER(scale_float_uptr);  // only used in MLFloat16 overload
-  ORT_UNUSED_PARAMETER(bias_float_uptr);   // only used in MLFloat16 overload
+  ORT_UNUSED_PARAMETER(scale_float_ptr);  // only used in MLFloat16 overload
+  ORT_UNUSED_PARAMETER(bias_float_ptr);   // only used in MLFloat16 overload
   ORT_UNUSED_PARAMETER(alloc);

   const T* p_input = X_data + task_idx * norm_size;
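
A note on the ORT_UNUSED_PARAMETER churn above: both ComputeJob overloads share one parameter list because a single templated call site passes every argument, and each overload ignores the arguments meant for the other element type. A minimal sketch of that dispatch pattern — compute_job, Half, and UNUSED are hypothetical stand-ins, not ONNX Runtime APIs:

```cpp
#include <iostream>

#define UNUSED(x) (void)(x)  // stand-in for ORT_UNUSED_PARAMETER

struct Half { unsigned short bits; };  // stand-in for MLFloat16

// float path: reads x natively and never needs the pre-converted buffer.
void compute_job(float x, const float* scale_f32) {
  UNUSED(scale_f32);
  std::cout << "float path: " << x * 2.0f << "\n";
}

// half path: works from the pre-converted fp32 buffer instead.
void compute_job(Half x, const float* scale_f32) {
  UNUSED(x);
  std::cout << "half path, scale[0] = " << scale_f32[0] << "\n";
}

// One templated caller passes the full argument list; overload resolution
// picks the right body per T, so every overload must accept (and explicitly
// ignore) the parameters that only the other overload uses.
template <typename T>
void run(T x, const float* scale_f32) {
  compute_job(x, scale_f32);
}

int main() {
  float scale[1] = {2.0f};
  run(1.5f, scale);          // selects the float overload
  run(Half{0x3C00}, scale);  // selects the half overload
}
```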
@@ -82,14 +82,17 @@
     const MLFloat16* bias_data,
     const ptrdiff_t task_idx,
     const int64_t norm_size,
-    IAllocatorUniquePtr<float>& scale_float_uptr,
-    IAllocatorUniquePtr<float>& bias_float_uptr,
+    const float* scale_float_ptr,
+    const float* bias_float_ptr,
     float epsilon,
     bool simplified,
     MLFloat16* Y_data,
     U* mean_data,
     U* inv_std_dev_data,
     AllocatorPtr alloc) {
+  ORT_UNUSED_PARAMETER(scale_data);  // only used in float/double overload
+  ORT_UNUSED_PARAMETER(bias_data);   // only used in float/double overload
+
   const MLFloat16* p_input = X_data + task_idx * norm_size;
   MLFloat16* p_output = Y_data + task_idx * norm_size;
@@ -117,22 +120,10 @@
     mean_square = sqrt(mean_square / norm_size - mean * mean + epsilon);
   }

-  if (!scale_float_uptr) {
-    scale_float_uptr = std::move(input_float_uptr);  // overwrite input with scale values, since they have the same size
-    MlasConvertHalfToFloatBuffer(scale_data, scale_float_uptr.get(), num_elems);
-  }
-
-  if (bias_data && !bias_float_uptr) {
-    bias_float_uptr = IAllocator::MakeUniquePtr<float>(alloc, num_elems);
-    MlasConvertHalfToFloatBuffer(bias_data, bias_float_uptr.get(), num_elems);
-  }
-
-  const float* scale_float_ptr = scale_float_uptr.get();
-  const float* bias_float_ptr = bias_float_uptr.get();
   for (size_t h = 0; h < num_elems; h++) {
     if (simplified) {
       output_float_ptr[h] = output_float_ptr[h] / mean_square * scale_float_ptr[h];
-    } else if (nullptr == bias_data) {
+    } else if (nullptr == bias_float_ptr) {
       output_float_ptr[h] = (output_float_ptr[h] - mean) / mean_square * scale_float_ptr[h];
     } else {
       output_float_ptr[h] = (output_float_ptr[h] - mean) / mean_square * scale_float_ptr[h] + bias_float_ptr[h];
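
The block removed in this hunk is the heart of the fix: ComputeJob runs once per task on a thread pool, so the lazy `!scale_float_uptr` conversion could be entered by several tasks at once, each assigning the shared smart pointer. A self-contained sketch of the hazard and the corrected shape, with a hypothetical to_float standing in for MlasConvertHalfToFloatBuffer:

```cpp
#include <cstddef>
#include <cstdint>
#include <memory>
#include <thread>
#include <vector>

// Stand-in for MlasConvertHalfToFloatBuffer (just widens u16 for the demo).
static void to_float(const uint16_t* src, float* dst, size_t n) {
  for (size_t i = 0; i < n; ++i) dst[i] = static_cast<float>(src[i]);
}

// Shape of the REMOVED code: every task may reset the shared unique_ptr.
// Two threads passing the `!shared` check together means concurrent writes
// to `shared` (a data race), and one thread can free a buffer another is
// still reading. Shown for illustration only; never called below.
void worker_lazy(std::unique_ptr<float[]>& shared,
                 const uint16_t* scale, size_t n) {
  if (!shared) {
    shared = std::make_unique<float[]>(n);
    to_float(scale, shared.get(), n);
  }
}

// Shape of the NEW code: workers receive a read-only pointer; nothing
// shared is mutated once the parallel region starts.
void worker_fixed(const float* scale_f32, size_t n, float* out) {
  float s = 0.0f;
  for (size_t i = 0; i < n; ++i) s += scale_f32[i];
  *out = s;
}

int main() {
  const size_t n = 8;
  std::vector<uint16_t> half_scale(n, 1);

  auto scale_f32 = std::make_unique<float[]>(n);  // convert ONCE, up front
  to_float(half_scale.data(), scale_f32.get(), n);

  std::vector<float> results(4);
  std::vector<std::thread> pool;
  for (int t = 0; t < 4; ++t)
    pool.emplace_back(worker_fixed, scale_f32.get(), n, &results[t]);
  for (auto& th : pool) th.join();
}
```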
@@ -183,7 +174,7 @@

   const TensorShape& x_shape = X->Shape();
   const TensorShape& scale_shape = scale->Shape();
-  const TensorShape& bias_shape = bias->Shape();
+  const TensorShape* bias_shape = bias ? &bias->Shape() : nullptr;
   Tensor* Y = p_ctx->Output(0, x_shape);
   T* Y_data = Y->MutableData<T>();
@@ -252,7 +243,7 @@
     const T* scale_data,
     const TensorShape& scale_shape,
     const T* bias_data,
-    const TensorShape& bias_shape,
+    const TensorShape* bias_shape,
     T* Y_data,
     U* mean_data,
     U* inv_std_dev_data,
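
The reference-to-pointer change in the two hunks above exists because a C++ reference must bind to a real object: with `const TensorShape&`, the caller had to evaluate `bias->Shape()` even when bias was absent. A pointer makes "not provided" representable as nullptr. A tiny sketch of the convention — Shape and bias_elems are hypothetical:

```cpp
#include <cstddef>
#include <iostream>

struct Shape { size_t size; };

// Pointer parameter: nullptr is a legal "not provided" value.
size_t bias_elems(const Shape* bias_shape) {
  return bias_shape ? bias_shape->size : 0;  // mirrors `bias_shape ? bias_shape->Size() : 0`
}

int main() {
  Shape s{128};
  std::cout << bias_elems(&s) << "\n";       // 128
  std::cout << bias_elems(nullptr) << "\n";  // 0 -- impossible with a reference
}
```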
@@ -265,18 +256,35 @@
   int64_t norm_size = x_shape.SizeFromDimension(onnxruntime::narrow<size_t>(axis));

   const auto scale_size = scale_shape.Size();
-  const auto bias_size = (bias_data) ? bias_shape.Size() : 0;
+  const auto bias_size = bias_shape ? bias_shape->Size() : 0;
   if (scale_size != norm_size || (bias_data && bias_size != norm_size)) {
     return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
                            "Size of X.shape()[axis:] == ", norm_size,
                            ". Size of scale and bias (if provided) must match this. Got scale size of ",
                            scale_size, " and bias size of ", bias_size);
   }

+  if constexpr (std::is_same_v<T, MLFloat16>) {
+    if (scale_fp32_ == nullptr || (bias_fp32_ == nullptr && bias_data)) {
+      std::lock_guard<OrtMutex> lock(mutex_);
+
+      const size_t num_elems = static_cast<size_t>(norm_size);
+      if (scale_fp32_ == nullptr) {
+        scale_fp32_ = IAllocator::MakeUniquePtr<float>(alloc, num_elems);
+        MlasConvertHalfToFloatBuffer(scale_data, scale_fp32_.get(), num_elems);
+      }
+
+      if (bias_fp32_ == nullptr && bias_data) {
+        bias_fp32_ = IAllocator::MakeUniquePtr<float>(alloc, num_elems);
+        MlasConvertHalfToFloatBuffer(bias_data, bias_fp32_.get(), num_elems);
+      }
+    }
+  }
+
   concurrency::ThreadPool::TryBatchParallelFor(
       thread_pool, static_cast<int32_t>(norm_count),
       [&](ptrdiff_t task_idx) {
-        ComputeJob(X_data, scale_data, bias_data, task_idx, norm_size, scale_fp32_, bias_fp32_,
+        ComputeJob(X_data, scale_data, bias_data, task_idx, norm_size, scale_fp32_.get(), bias_fp32_.get(),
                    epsilon, simplified, Y_data, mean_data, inv_std_dev_data, alloc);
       },
       0);
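
The new `if constexpr` block above is where the conversion now lives: it runs at most once per kernel instance, serialized by the new mutex_, before TryBatchParallelFor fans out. A generic sketch of this lock-guarded, convert-once caching idiom using standard types — Cache and half_to_float are hypothetical stand-ins:

```cpp
#include <cstddef>
#include <cstdint>
#include <memory>
#include <mutex>

// Hypothetical f16 -> f32 widening, standing in for MlasConvertHalfToFloatBuffer.
static void half_to_float(const uint16_t* src, float* dst, size_t n) {
  for (size_t i = 0; i < n; ++i) dst[i] = static_cast<float>(src[i]);
}

class Cache {
 public:
  // Returns the fp32 view of `scale`, converting on the first call only.
  const float* scale_f32(const uint16_t* scale, size_t n) const {
    std::lock_guard<std::mutex> lock(mutex_);  // serialize first-time init
    if (!scale_fp32_) {
      scale_fp32_ = std::make_unique<float[]>(n);
      half_to_float(scale, scale_fp32_.get(), n);
    }
    return scale_fp32_.get();
  }

 private:
  // `mutable` lets a const method (like OpKernel::Compute) fill the cache.
  mutable std::unique_ptr<float[]> scale_fp32_;
  mutable std::mutex mutex_;
};

int main() {
  Cache cache;
  uint16_t half_scale[4] = {1, 2, 3, 4};
  const float* first = cache.scale_f32(half_scale, 4);   // converts
  const float* again = cache.scale_f32(half_scale, 4);   // cache hit
  return first == again ? 0 : 1;
}
```

This sketch locks unconditionally, which is the simpler form; the PR's version also checks the pointers before taking the lock as a fast path, and uses OrtMutex and IAllocator in place of the standard types above.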
3 changes: 2 additions & 1 deletion onnxruntime/core/providers/cpu/nn/layer_norm_impl.h
@@ -26,7 +26,7 @@ class LayerNormImpl : public OpKernel {
     const T* scale_data,
     const TensorShape& scale_shape,
     const T* bias_data,
-    const TensorShape& bias_shape,
+    const TensorShape* bias_shape,
     T* Y_data,
     U* mean_data,
     U* inv_std_dev,
@@ -65,6 +65,7 @@
   const bool contrib_op_;
   mutable IAllocatorUniquePtr<float> scale_fp32_;
   mutable IAllocatorUniquePtr<float> bias_fp32_;
+  mutable OrtMutex mutex_;
 };

} // namespace onnxruntime
6 changes: 3 additions & 3 deletions onnxruntime/test/onnx/microbenchmark/layer_normalization.cc
@@ -111,9 +111,9 @@ static void BM_LayerNormalization(benchmark::State& state) {
   OrtMemoryInfo memory_info(onnxruntime::CPU, OrtAllocatorType::OrtArenaAllocator);
   AllocatorPtr alloc = std::make_shared<CPUAllocator>(memory_info);
   for (auto _ : state) {
-    auto status = layer_norm_impl.ComputeWithoutContext(x_data, x_shape, scale_data, scale_shape, bias_data, bias_shape,
-                                                        Y_data, mean_data, inv_std_dev_data, thread_pool.get(), axis,
-                                                        epsilon, simplified, alloc);
+    auto status = layer_norm_impl.ComputeWithoutContext(x_data, x_shape, scale_data, scale_shape, bias_data,
+                                                        &bias_shape, Y_data, mean_data, inv_std_dev_data,
+                                                        thread_pool.get(), axis, epsilon, simplified, alloc);
     if (!status.IsOK()) {
       std::cout << "ComputeWithoutContext status not OK: " << status.ErrorMessage() << std::endl;
       break;