Skip to content

Commit

Permalink
fix LayerNorm f16 CPU implementation (microsoft#22479)
Browse files Browse the repository at this point in the history
### Description

The recent PR microsoft#22223 introduced 2 bugs in implementation of CPU
LayerNorm f16:
- possible access to nullptr for bias
`const TensorShape& bias_shape = bias->Shape();` will crash when `bias`
does not exist. (amazingly seems this one is not coverred by any test
case)
   - fix: guard with pointer check
- a racing condition inside ComputeJob
`ComputeJob()` is dispatched to threadpool and it internally tries to
modify `LayerNormImpl::scale_fp32_` and `LayerNormImpl::bias_fp32_`,
which are `std::unique_ptr`s and are not thread-safe.
- fix: move the modification of `LayerNormImpl::scale_fp32_` and
`LayerNormImpl::bias_fp32_` out of `ComputeJob()` and put into
`LayerNormImpl::ComputeWithoutContext()`. It may still have racing
condition because `ConcurrentRunSupported` is set to `true` for CPU EP.
Added an OrtMutex.

This should fixes the recent flaky tests as well.
  • Loading branch information
fs-eire authored Oct 18, 2024
1 parent e5c2e50 commit b4cb937
Show file tree
Hide file tree
Showing 4 changed files with 97 additions and 41 deletions.
82 changes: 48 additions & 34 deletions onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,16 @@ void ComputeJob(
const T* bias_data,
const ptrdiff_t task_idx,
const int64_t norm_size,
IAllocatorUniquePtr<float>& scale_float_uptr,
IAllocatorUniquePtr<float>& bias_float_uptr,
const float* scale_float_ptr,
const float* bias_float_ptr,
float epsilon,
bool simplified,
T* Y_data,
U* mean_data,
U* inv_std_dev_data,
AllocatorPtr alloc) {
ORT_UNUSED_PARAMETER(scale_float_uptr); // only used in MLFloat16 overload
ORT_UNUSED_PARAMETER(bias_float_uptr); // only used in MLFloat16 overload
ORT_UNUSED_PARAMETER(scale_float_ptr); // only used in MLFloat16 overload
ORT_UNUSED_PARAMETER(bias_float_ptr); // only used in MLFloat16 overload
ORT_UNUSED_PARAMETER(alloc);

const T* p_input = X_data + task_idx * norm_size;
Expand Down Expand Up @@ -82,14 +82,17 @@ void ComputeJob(
const MLFloat16* bias_data,
const ptrdiff_t task_idx,
const int64_t norm_size,
IAllocatorUniquePtr<float>& scale_float_uptr,
IAllocatorUniquePtr<float>& bias_float_uptr,
const float* scale_float_ptr,
const float* bias_float_ptr,
float epsilon,
bool simplified,
MLFloat16* Y_data,
U* mean_data,
U* inv_std_dev_data,
AllocatorPtr alloc) {
ORT_UNUSED_PARAMETER(scale_data); // only used in float/double overload
ORT_UNUSED_PARAMETER(bias_data); // only used in float/double overload

const MLFloat16* p_input = X_data + task_idx * norm_size;
MLFloat16* p_output = Y_data + task_idx * norm_size;

Expand Down Expand Up @@ -117,22 +120,10 @@ void ComputeJob(
mean_square = sqrt(mean_square / norm_size - mean * mean + epsilon);
}

if (!scale_float_uptr) {
scale_float_uptr = std::move(input_float_uptr); // overwrite input with scale values, since they have the same size
MlasConvertHalfToFloatBuffer(scale_data, scale_float_uptr.get(), num_elems);
}

if (bias_data && !bias_float_uptr) {
bias_float_uptr = IAllocator::MakeUniquePtr<float>(alloc, num_elems);
MlasConvertHalfToFloatBuffer(bias_data, bias_float_uptr.get(), num_elems);
}

const float* scale_float_ptr = scale_float_uptr.get();
const float* bias_float_ptr = bias_float_uptr.get();
for (size_t h = 0; h < num_elems; h++) {
if (simplified) {
output_float_ptr[h] = output_float_ptr[h] / mean_square * scale_float_ptr[h];
} else if (nullptr == bias_data) {
} else if (nullptr == bias_float_ptr) {
output_float_ptr[h] = (output_float_ptr[h] - mean) / mean_square * scale_float_ptr[h];
} else {
output_float_ptr[h] = (output_float_ptr[h] - mean) / mean_square * scale_float_ptr[h] + bias_float_ptr[h];
Expand Down Expand Up @@ -166,7 +157,13 @@ void ConvertMLFloat16ToFloatIfNeeded(const Tensor& tensor, AllocatorPtr alloc, I
} // namespace

LayerNormImpl::LayerNormImpl(const OpKernelInfo& op_kernel_info, bool simplified, bool contrib_op)
: OpKernel(op_kernel_info), simplified_{simplified}, contrib_op_{contrib_op}, scale_fp32_(nullptr), bias_fp32_(nullptr) {
: OpKernel(op_kernel_info),
simplified_{simplified},
contrib_op_{contrib_op},
prepacked_scale_fp32_data_(nullptr),
prepacked_scale_fp32_size_(0),
prepacked_bias_fp32_data_(nullptr),
prepacked_bias_fp32_size_(0) {
ORT_ENFORCE(op_kernel_info.GetAttr("axis", &axis_).IsOK());
ORT_ENFORCE(op_kernel_info.GetAttr<float>("epsilon", &epsilon_).IsOK());
}
Expand All @@ -175,15 +172,15 @@ template <typename T, typename U>
Status LayerNormImpl::ComputeImpl(OpKernelContext* p_ctx, int64_t orig_axis, float epsilon, bool simplified) const {
// Inputs
const Tensor* X = p_ctx->Input<Tensor>(0);
const Tensor* scale = p_ctx->Input<Tensor>(1);
const Tensor* bias = p_ctx->Input<Tensor>(2);
const Tensor* scale = prepacked_scale_fp32_data_ ? nullptr : p_ctx->Input<Tensor>(1);
const Tensor* bias = prepacked_bias_fp32_data_ ? nullptr : p_ctx->Input<Tensor>(2);
const T* X_data = X->Data<T>();
const T* scale_data = scale->Data<T>();
const T* scale_data = scale ? scale->Data<T>() : nullptr;
const T* bias_data = (simplified || nullptr == bias) ? nullptr : bias->Data<T>();

const TensorShape& x_shape = X->Shape();
const TensorShape& scale_shape = scale->Shape();
const TensorShape& bias_shape = bias->Shape();
size_t scale_size = scale ? static_cast<size_t>(scale->Shape().Size()) : prepacked_scale_fp32_size_;
size_t bias_size = bias ? static_cast<size_t>(bias->Shape().Size()) : prepacked_bias_fp32_size_;
Tensor* Y = p_ctx->Output(0, x_shape);
T* Y_data = Y->MutableData<T>();

Expand Down Expand Up @@ -218,7 +215,7 @@ Status LayerNormImpl::ComputeImpl(OpKernelContext* p_ctx, int64_t orig_axis, flo

AllocatorPtr alloc;
ORT_RETURN_IF_ERROR(p_ctx->GetTempSpaceAllocator(&alloc));
return ComputeWithoutContext<T, U>(X_data, x_shape, scale_data, scale_shape, bias_data, bias_shape, Y_data, mean_data,
return ComputeWithoutContext<T, U>(X_data, x_shape, scale_data, scale_size, bias_data, bias_size, Y_data, mean_data,
inv_std_dev_data, thread_pool, axis, epsilon, simplified, alloc);
}

Expand All @@ -237,9 +234,11 @@ Status LayerNormImpl::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr

is_packed = false;
if (input_idx == 1) { // scale
ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, scale_fp32_, is_packed);
prepacked_scale_fp32_size_ = static_cast<size_t>(tensor.Shape().Size());
ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, prepacked_scale_fp32_data_, is_packed);
} else if (input_idx == 2) { // bias
ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, bias_fp32_, is_packed);
prepacked_bias_fp32_size_ = static_cast<size_t>(tensor.Shape().Size());
ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, prepacked_bias_fp32_data_, is_packed);
}

return Status::OK();
Expand All @@ -250,9 +249,9 @@ Status LayerNormImpl::ComputeWithoutContext(
const T* X_data,
const TensorShape& x_shape,
const T* scale_data,
const TensorShape& scale_shape,
size_t scale_size,
const T* bias_data,
const TensorShape& bias_shape,
size_t bias_size,
T* Y_data,
U* mean_data,
U* inv_std_dev_data,
Expand All @@ -264,19 +263,34 @@ Status LayerNormImpl::ComputeWithoutContext(
int64_t norm_count = x_shape.SizeToDimension(onnxruntime::narrow<size_t>(axis));
int64_t norm_size = x_shape.SizeFromDimension(onnxruntime::narrow<size_t>(axis));

const auto scale_size = scale_shape.Size();
const auto bias_size = (bias_data) ? bias_shape.Size() : 0;
if (scale_size != norm_size || (bias_data && bias_size != norm_size)) {
if (static_cast<int64_t>(scale_size) != norm_size || (bias_data && static_cast<int64_t>(bias_size) != norm_size)) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
"Size of X.shape()[axis:] == ", norm_size,
". Size of scale and bias (if provided) must match this. Got scale size of ",
scale_size, " and bias size of ", bias_size);
}

IAllocatorUniquePtr<float> scale_fp32;
IAllocatorUniquePtr<float> bias_fp32;
if constexpr (std::is_same_v<T, MLFloat16>) {
if (prepacked_scale_fp32_data_ == nullptr) {
const size_t num_elems = static_cast<size_t>(norm_size);
scale_fp32 = IAllocator::MakeUniquePtr<float>(alloc, num_elems);
MlasConvertHalfToFloatBuffer(scale_data, scale_fp32.get(), num_elems);
}
if (prepacked_bias_fp32_data_ == nullptr && bias_data) {
const size_t num_elems = static_cast<size_t>(norm_size);
bias_fp32 = IAllocator::MakeUniquePtr<float>(alloc, num_elems);
MlasConvertHalfToFloatBuffer(bias_data, bias_fp32.get(), num_elems);
}
}

concurrency::ThreadPool::TryBatchParallelFor(
thread_pool, static_cast<int32_t>(norm_count),
[&](ptrdiff_t task_idx) {
ComputeJob(X_data, scale_data, bias_data, task_idx, norm_size, scale_fp32_, bias_fp32_,
ComputeJob(X_data, scale_data, bias_data, task_idx, norm_size,
prepacked_scale_fp32_data_ ? prepacked_scale_fp32_data_.get() : scale_fp32.get(),
prepacked_bias_fp32_data_ ? prepacked_bias_fp32_data_.get() : bias_fp32.get(),
epsilon, simplified, Y_data, mean_data, inv_std_dev_data, alloc);
},
0);
Expand Down
10 changes: 6 additions & 4 deletions onnxruntime/core/providers/cpu/nn/layer_norm_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@ class LayerNormImpl : public OpKernel {
const T* X_data,
const TensorShape& x_shape,
const T* scale_data,
const TensorShape& scale_shape,
size_t scale_size,
const T* bias_data,
const TensorShape& bias_shape,
size_t bias_size,
T* Y_data,
U* mean_data,
U* inv_std_dev,
Expand Down Expand Up @@ -63,8 +63,10 @@ class LayerNormImpl : public OpKernel {
float epsilon_;
const bool simplified_;
const bool contrib_op_;
mutable IAllocatorUniquePtr<float> scale_fp32_;
mutable IAllocatorUniquePtr<float> bias_fp32_;
IAllocatorUniquePtr<float> prepacked_scale_fp32_data_;
size_t prepacked_scale_fp32_size_;
IAllocatorUniquePtr<float> prepacked_bias_fp32_data_;
size_t prepacked_bias_fp32_size_;
};

} // namespace onnxruntime
29 changes: 29 additions & 0 deletions onnxruntime/test/contrib_ops/layer_norm_op_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,20 @@ TEST(LayerNormTest, LayerNorm_Scale_Float16InputScaleOutput) {
kNnapiExecutionProvider, kQnnExecutionProvider, kCoreMLExecutionProvider});
}

TEST(LayerNormTest, LayerNorm_Scale_Float16InputScaleOutput_Initializers) {
OpTester test("LayerNormalization");
test.AddAttribute<float>("epsilon", 1e-05f);

std::vector<int64_t> dims{2, 2, 2};
test.AddInput<MLFloat16>("x", dims, ToFloat16({-10.264f, 8.6453f, 43.1561f, -0.641239f, -8.2164f, 0.11412f, 41.3156f, 3.0458f}));
test.AddInput<MLFloat16>("gamma", {2}, ToFloat16({-0.6953f, 5.1824f}), true);
test.AddOutput<MLFloat16>("output", dims, ToFloat16({0.6953f, 5.1824f, -0.6953f, -5.1824f, 0.6953f, 5.1824f, -0.6953f, -5.1824f}));
// TRT, DNNL, OpenVINO and NNAPI, CoreML don't support this combination of datatypes
test.Run(OpTester::ExpectResult::kExpectSuccess, "",
{kTensorrtExecutionProvider, kDnnlExecutionProvider, kOpenVINOExecutionProvider,
kNnapiExecutionProvider, kQnnExecutionProvider, kCoreMLExecutionProvider});
}

TEST(LayerNormTest, LayerNorm_Scale_Bias) {
OpTester test("LayerNormalization");
test.AddAttribute<float>("epsilon", 1e-05f);
Expand Down Expand Up @@ -211,6 +225,21 @@ TEST(LayerNormTest, LayerNorm_Scale_Bias_Float16InputScaleBiasOutput) {
kNnapiExecutionProvider, kQnnExecutionProvider, kCoreMLExecutionProvider});
}

TEST(LayerNormTest, LayerNorm_Scale_Bias_Float16InputScaleBiasOutput_Initializers) {
OpTester test("LayerNormalization");
test.AddAttribute<float>("epsilon", 1e-05f);

std::vector<int64_t> dims{1, 3, 2};
test.AddInput<MLFloat16>("x", dims, ToFloat16({1.2416f, 0.946123f, 13.1685f, 0.36423f, 21.145f, 0.03941f}));
test.AddInput<MLFloat16>("gamma", {2}, ToFloat16({-0.6953f, 5.1824f}), true);
test.AddInput<MLFloat16>("bias", {2}, ToFloat16({0.6435f, -0.3964f}), true);
test.AddOutput<MLFloat16>("output", dims, ToFloat16({-0.0516f, -5.5776f, -0.0518f, -5.5788f, -0.0518f, -5.5788f}));
// TRT, DNNL, OpenVINO and NNAPI, CoreML don't support this combination of datatypes
test.Run(OpTester::ExpectResult::kExpectSuccess, "",
{kTensorrtExecutionProvider, kDnnlExecutionProvider, kOpenVINOExecutionProvider,
kNnapiExecutionProvider, kQnnExecutionProvider, kCoreMLExecutionProvider});
}

// LayerNormalization became an ONNX operator in opset 17. It uses the same implementation so this is a sanity check.
TEST(LayerNormTest, LayerNorm17_float) {
OpTester test("LayerNormalization", 17);
Expand Down
17 changes: 14 additions & 3 deletions onnxruntime/test/onnx/microbenchmark/layer_normalization.cc
Original file line number Diff line number Diff line change
Expand Up @@ -111,9 +111,20 @@ static void BM_LayerNormalization(benchmark::State& state) {
OrtMemoryInfo memory_info(onnxruntime::CPU, OrtAllocatorType::OrtArenaAllocator);
AllocatorPtr alloc = std::make_shared<CPUAllocator>(memory_info);
for (auto _ : state) {
auto status = layer_norm_impl.ComputeWithoutContext(x_data, x_shape, scale_data, scale_shape, bias_data, bias_shape,
Y_data, mean_data, inv_std_dev_data, thread_pool.get(), axis,
epsilon, simplified, alloc);
auto status = layer_norm_impl.ComputeWithoutContext(x_data,
x_shape,
scale_data,
static_cast<size_t>(scale_shape.Size()),
bias_data,
static_cast<size_t>(bias_shape.Size()),
Y_data,
mean_data,
inv_std_dev_data,
thread_pool.get(),
axis,
epsilon,
simplified,
alloc);
if (!status.IsOK()) {
std::cout << "ComputeWithoutContext status not OK: " << status.ErrorMessage() << std::endl;
break;
Expand Down

0 comments on commit b4cb937

Please sign in to comment.