diff --git a/onnxruntime/core/providers/cpu/tensor/pad.cc b/onnxruntime/core/providers/cpu/tensor/pad.cc
index fe5267f20712b..76003d343403e 100644
--- a/onnxruntime/core/providers/cpu/tensor/pad.cc
+++ b/onnxruntime/core/providers/cpu/tensor/pad.cc
@@ -167,46 +167,6 @@ ONNX_CPU_OPERATOR_KERNEL(
 
 using PadsVector = PadBase::PadsVector;
 
-// This is the general padding method to n-dimensionally do edge or reflection padding (based on the inputDelta values)
-template <typename T>
-static void PadAxis(T* output, T* input, ptrdiff_t input_delta, ptrdiff_t input_pitch,
-                    size_t block_size, size_t block_count) {
-  for (size_t block_index = 0; block_index < block_count; block_index++) {
-    for (size_t i = 0; i < block_size; i++) {
-      *output++ = *input;
-      input += input_delta;
-    }
-    input += input_pitch;
-  }
-}
-
-// These are optimizations of PadAxis. The inner loop is removed since the innermost axis has a blockSize of 1,
-// and inputPitch and inputDelta are just a single value added each iteration.
-template <typename T>
-static void PadInnermostAxis(T* output, T* input, ptrdiff_t input_delta, size_t block_count) {
-  for (size_t block_index = 0; block_index < block_count; block_index++) {
-    *output++ = *input;
-    input += input_delta;
-  }
-}
-
-// For constant padding, there is no input, just a size to write the constant to
-template <typename T>
-static void PadAxisConstant(T* output, T constant, size_t size) {
-  if (size == 1) {
-    *output = constant;
-  } else if (size == 2) {
-    *output = constant;
-    *(output + 1) = constant;
-  } else {
-    // This would be faster with SSE instructions.
-    // That would mean to have an implementation for each type (uint8, uint32, uint64).
-    T* end = output + size;
-    for (; output != end;)
-      *output++ = constant;
-  }
-}
-
 Status PadBase::HandleDimValueZero(const Mode& mode, const TensorShape& input_shape, TensorShape& output_shape) {
   switch (mode) {
     case Mode::Constant: {
@@ -242,34 +202,62 @@ Status PadBase::HandleDimValueZero(const Mode& mode, const TensorShape& input_sh
   return Status::OK();
 }
 
-// special handling for edge case where the input has one or more dims with value of 0
-template <typename T>
-static Status PadInputWithDimValueOfZero(OpKernelContext* ctx,
-                                         const Mode& mode,
-                                         const TensorShape& input_shape,
-                                         TensorShapeVector& output_dims,
-                                         T value) {
-  TensorShape output_shape(output_dims);
-  ORT_RETURN_IF_ERROR(PadBase::HandleDimValueZero(mode, input_shape, output_shape));
-
-  auto& output_tensor = *ctx->Output(0, output_shape);
-
-  // we need to add pads if mode is constant, otherwise the output has one or more dim values of 0 so is empty
-  if (mode == Mode::Constant) {
-    // we add pads with the default value to all dims including those with a value of 0
-    auto* output = reinterpret_cast<T*>(output_tensor.MutableDataRaw());
-    std::fill_n(output, output_shape.Size(), value);
+void PadBase::ComputePadWithAxes(
+    gsl::span<const int64_t> pads_tensor_raw_data,
+    std::function<int64_t(size_t)> get_axis,
+    size_t axes_size,
+    size_t data_rank,
+    PadsVector& pads) {
+  for (size_t i = 0; i < axes_size; ++i) {
+    const size_t axis = onnxruntime::narrow<size_t>(HandleNegativeAxis(get_axis(i), data_rank));
+    pads[axis] = pads_tensor_raw_data[i];                          // xi_begin
+    pads[data_rank + axis] = pads_tensor_raw_data[axes_size + i];  // xi_end
   }
+}
 
-  return Status::OK();
+void PadBase::ComputePads(OpKernelContext* ctx, size_t data_rank, gsl::span<const int64_t> pads_data,
+                          PadsVector& pads) {
+  pads.reserve(2 * data_rank);
+  const Tensor* axes_tensor = ctx->Input<Tensor>(3);
+  if (axes_tensor) {
+    const size_t num_axes_dims = axes_tensor->Shape().NumDimensions();
+    ORT_ENFORCE(num_axes_dims == 1, "Axes tensor should be a 1D tensor ");
+
+    pads.resize(2 * data_rank, 0);
+    if (axes_tensor->IsDataType<int32_t>()) {
+      auto axes_data = axes_tensor->DataAsSpan<int32_t>();
+      ComputePadWithAxes(
+          pads_data,
+          [axes_data](size_t idx) -> int64_t {
+            return axes_data[idx];
+          },
+          axes_data.size(),
+          data_rank,
+          pads);
+    } else if (axes_tensor->IsDataType<int64_t>()) {
+      auto axes_data = axes_tensor->DataAsSpan<int64_t>();
+      ComputePadWithAxes(
+          pads_data,
+          [axes_data](size_t idx) {
+            return axes_data[idx];
+          },
+          axes_data.size(),
+          data_rank,
+          pads);
+    }
+  } else {
+    ORT_ENFORCE(pads_data.size() == 2 * data_rank,
+                "Pads tensor size should be equal to twice the input dimension count ");
+    pads.assign(pads_data.begin(), pads_data.end());
+  }
 }
 
 // Flatten no padding inner most Axis, so one memcpy cover multiple Axis.
 // For example, for a shape of [1,224,224,3] with padding [0,3,3,0,0,3,3,0], can be flatten as
 // [1,224,224*3] with padding [0,3,3*3,0,3,3*3].
-static void FlattenInnerShape(const TensorShapeVector& input_dims, const PadsVector& pads,
-                              const PadsVector& slices, TensorShapeVector& reshaped_dims) {
-  size_t dims_count = input_dims.size();
+void PadBase::FlattenInnerShape(gsl::span<const int64_t> input_dims, gsl::span<const int64_t> pads,
+                                gsl::span<const int64_t> slices, TensorShapeVector& reshaped_dims) {
+  const size_t dims_count = input_dims.size();
   size_t inner_axis = dims_count - 1;
   size_t inner_size = 1;
@@ -288,14 +276,14 @@ static void FlattenInnerShape(const TensorShapeVector& input_dims, const PadsVec
   } while (inner_axis-- > 0);
 
   reshaped_dims.reserve(inner_axis + 1);
-  std::copy(input_dims.cbegin(), input_dims.cbegin() + inner_axis + 1, std::back_inserter(reshaped_dims));
+  std::copy(input_dims.begin(), input_dims.begin() + inner_axis + 1, std::back_inserter(reshaped_dims));
 
   // Flatten inner axis.
   reshaped_dims[inner_axis] = inner_size;
 }
 
-static void ReshapePads(const PadsVector& src_pad, size_t src_dim_count, size_t new_dim_count,
-                        size_t inner_no_pad_size, PadsVector& reshaped_pad) {
+void PadBase::ReshapePads(gsl::span<const int64_t> src_pad, size_t src_dim_count, size_t new_dim_count,
+                          size_t inner_no_pad_size, PadsVector& reshaped_pad) {
   size_t inner_axis = new_dim_count - 1;
   std::copy(src_pad.begin(), src_pad.begin() + inner_axis, reshaped_pad.begin());
   std::copy(src_pad.begin() + src_dim_count, src_pad.begin() + src_dim_count + inner_axis,
@@ -306,6 +294,68 @@ static void ReshapePads(const PadsVector& src_pad, size_t src_dim_count, size_t
   reshaped_pad[inner_axis + new_dim_count] = src_pad[inner_axis + src_dim_count] * inner_no_pad_size;
 }
 
+// special handling for edge case where the input has one or more dims with value of 0
+template <typename T>
+static Status PadInputWithDimValueOfZero(OpKernelContext* ctx,
+                                         const Mode& mode,
+                                         const TensorShape& input_shape,
+                                         TensorShapeVector& output_dims,
+                                         T value) {
+  TensorShape output_shape(output_dims);
+  ORT_RETURN_IF_ERROR(PadBase::HandleDimValueZero(mode, input_shape, output_shape));
+
+  auto& output_tensor = *ctx->Output(0, output_shape);
+
+  // we need to add pads if mode is constant, otherwise the output has one or more dim values of 0 so is empty
+  if (mode == Mode::Constant) {
+    // we add pads with the default value to all dims including those with a value of 0
+    auto* output = reinterpret_cast<T*>(output_tensor.MutableDataRaw());
+    std::fill_n(output, output_shape.Size(), value);
+  }
+
+  return Status::OK();
+}
+
+// This is the general padding method to n-dimensionally do edge or reflection padding (based on the inputDelta values)
+template <typename T>
+static void PadAxis(T* output, T* input, ptrdiff_t input_delta, ptrdiff_t input_pitch,
+                    size_t block_size, size_t block_count) {
+  for (size_t block_index = 0; block_index < block_count; block_index++) {
+    for (size_t i = 0; i < block_size; i++) {
+      *output++ = *input;
+      input += input_delta;
+    }
+    input += input_pitch;
+  }
+}
+
+// These are optimizations of PadAxis. The inner loop is removed since the innermost axis has a blockSize of 1,
+// and inputPitch and inputDelta are just a single value added each iteration.
+template <typename T>
+static void PadInnermostAxis(T* output, T* input, ptrdiff_t input_delta, size_t block_count) {
+  for (size_t block_index = 0; block_index < block_count; block_index++) {
+    *output++ = *input;
+    input += input_delta;
+  }
+}
+
+// For constant padding, there is no input, just a size to write the constant to
+template <typename T>
+static void PadAxisConstant(T* output, T constant, size_t size) {
+  if (size == 1) {
+    *output = constant;
+  } else if (size == 2) {
+    *output = constant;
+    *(output + 1) = constant;
+  } else {
+    // This would be faster with SSE instructions.
+    // That would mean to have an implementation for each type (uint8, uint32, uint64).
+    T* end = output + size;
+    for (; output != end;)
+      *output++ = constant;
+  }
+}
+
 template <typename T>
 static Status PadImpl(OpKernelContext* ctx,
                       const PadsVector& pads,
@@ -327,7 +377,7 @@ static Status PadImpl(OpKernelContext* ctx,
 
   // Reshape input dims
   TensorShapeVector reshaped_input_dims;
-  FlattenInnerShape(output_dims, pads, slices, reshaped_input_dims);
+  PadBase::FlattenInnerShape(output_dims, pads, slices, reshaped_input_dims);
 
   // Reshape padding
   size_t new_dims_count = reshaped_input_dims.size();
@@ -336,8 +386,8 @@ static Status PadImpl(OpKernelContext* ctx,
                                 ? reshaped_input_dims[inner_axis] / output_dims[inner_axis]
                                 : 0);
   PadsVector reshaped_pad(2 * new_dims_count), reshaped_slice(2 * new_dims_count);
-  ReshapePads(pads, data_rank, new_dims_count, inner_no_pad_size, reshaped_pad);
-  ReshapePads(slices, data_rank, new_dims_count, inner_no_pad_size, reshaped_slice);
+  PadBase::ReshapePads(pads, data_rank, new_dims_count, inner_no_pad_size, reshaped_pad);
+  PadBase::ReshapePads(slices, data_rank, new_dims_count, inner_no_pad_size, reshaped_slice);
 
   TensorShapeVector reshaped_output_dims = reshaped_input_dims;
   TensorShapeVector input_starts;
@@ -575,20 +625,6 @@ static PadValue PadValueFromFloat(float value, MLDataType data_type) {
   return result;
 }
 
-template <typename Tind>
-void ComputePadWithAxes(
-    gsl::span<const int64_t> pads_tensor_raw_data,
-    gsl::span<const Tind> axes_tensor_raw_data,
-    size_t data_rank,
-    PadsVector& pads) {
-  size_t axes_size = axes_tensor_raw_data.size();
-  for (size_t i = 0; i < axes_size; ++i) {
-    int64_t axis = HandleNegativeAxis(onnxruntime::narrow<int64_t>(axes_tensor_raw_data[i]), data_rank);
-    pads[onnxruntime::narrow<size_t>(axis)] = pads_tensor_raw_data[i];                          // xi_begin
-    pads[data_rank + onnxruntime::narrow<size_t>(axis)] = pads_tensor_raw_data[axes_size + i];  // xi_end
-  }
-}
-
 Status Pad::Compute(OpKernelContext* ctx) const {
   const Tensor& input_tensor = *ctx->Input<Tensor>(0);
   MLDataType data_type = input_tensor.DataType();
@@ -608,48 +644,13 @@ Status Pad::Compute(OpKernelContext* ctx) const {
   ORT_ENFORCE(pads_tensor_dims.size() == 1 || (pads_tensor_dims.size() == 2 && pads_tensor_dims[0] == 1),
               "Pads tensor should be a 1D tensor of shape [2 * num_axes] "
               "or a 2D tensor of shape [1, 2 * num_axes]");
-  const int64_t* pads_tensor_raw_data = pads_tensor.Data<int64_t>();
-  size_t pads_size = static_cast<size_t>(pads_tensor.Shape().Size());
-  pads.reserve(2 * data_rank);
-
-  const Tensor* axes_tensor = ctx->Input<Tensor>(3);
-  if (axes_tensor) {
-    const auto& axes_tensor_dims = axes_tensor->Shape().GetDims();
-    ORT_ENFORCE(axes_tensor_dims.size() == 1, "Axes tensor should be a 1D tensor ");
-    int64_t axes_size = axes_tensor_dims[0];
-
-    pads.resize(2 * data_rank, 0);
-    if (axes_tensor->IsDataType<int32_t>()) {
-      const int32_t* axes_tensor_raw_data = axes_tensor->Data<int32_t>();
-      ComputePadWithAxes<int32_t>(
-          {pads_tensor_raw_data, onnxruntime::narrow<size_t>(2 * axes_size)},
-          {axes_tensor_raw_data, onnxruntime::narrow<size_t>(axes_size)},
-          data_rank,
-          pads);
-    } else if (axes_tensor->IsDataType<int64_t>()) {
-      const int64_t* axes_tensor_raw_data = axes_tensor->Data<int64_t>();
-      ComputePadWithAxes<int64_t>(
-          {pads_tensor_raw_data, onnxruntime::narrow<size_t>(2 * axes_size)},
-          {axes_tensor_raw_data, onnxruntime::narrow<size_t>(axes_size)},
-          data_rank,
-          pads);
-    }
-  } else {
-    ORT_ENFORCE(pads_size == 2 * data_rank,
-                "Pads tensor size should be equal to twice the input dimension count ");
-    for (size_t i = 0; i < pads_size; ++i) {
-      pads.push_back(pads_tensor_raw_data[i]);
-    }
-  }
+  const auto pads_data = pads_tensor.DataAsSpan<int64_t>();
+
+  // Compute Pads by applying axes if specified otherwise copy the supplied pads.
+  PadBase::ComputePads(ctx, data_rank, pads_data, pads);
 
   // Separate out any negative pads into the slices array
-  slices.assign(pads.size(), 0);
-  for (size_t index = 0; index < pads.size(); index++) {
-    if (pads[index] < 0) {
-      slices[index] = pads[index];
-      pads[index] = 0;
-    }
-  }
+  PadBase::SeparateNegativeToSlices(pads, slices);
 
   value.u64 = 0U;
   const Tensor* value_tensor = ctx->Input<Tensor>(2);
diff --git a/onnxruntime/core/providers/cpu/tensor/padbase.h b/onnxruntime/core/providers/cpu/tensor/padbase.h
index d869ed1a6dda2..4f64828153286 100644
--- a/onnxruntime/core/providers/cpu/tensor/padbase.h
+++ b/onnxruntime/core/providers/cpu/tensor/padbase.h
@@ -4,6 +4,7 @@
 #pragma once
 
 #include "core/common/inlined_containers.h"
+#include <functional>
 
 namespace onnxruntime {
 
@@ -23,6 +24,34 @@ class PadBase {
   // in the input_shape with a value of zero.
   static Status HandleDimValueZero(const Mode& mode, const TensorShape& input_shape, TensorShape& output_shape);
 
+  static void FlattenInnerShape(gsl::span<const int64_t> input_dims, gsl::span<const int64_t> pads,
+                                gsl::span<const int64_t> slices, TensorShapeVector& reshaped_dims);
+
+  static void ReshapePads(gsl::span<const int64_t> src_pad, size_t src_dim_count, size_t new_dim_count,
+                          size_t inner_no_pad_size, PadsVector& reshaped_pad);
+
+  // Compute Pads by applying axes if specified otherwise copy the supplied pads.
+  static void ComputePads(OpKernelContext* ctx, size_t data_rank, gsl::span<const int64_t> pads_data,
+                          PadsVector& pads);
+
+  static void ComputePadWithAxes(
+      gsl::span<const int64_t> pads_tensor_raw_data,
+      std::function<int64_t(size_t)> get_axis,
+      size_t axes_size,
+      size_t data_rank,
+      PadsVector& pads);
+
+  // Separate out any negative pads into the slices array
+  static void SeparateNegativeToSlices(gsl::span<int64_t> pads, PadsVector& slices) {
+    slices.assign(pads.size(), 0);
+    for (size_t index = 0, lim = pads.size(); index < lim; index++) {
+      if (pads[index] < 0) {
+        slices[index] = pads[index];
+        pads[index] = 0;
+      }
+    }
+  }
+
 protected:
  PadBase(const OpKernelInfo& info) : value_(info.GetAttrOrDefault("value", 0.f)) {
    std::string mode;
diff --git a/onnxruntime/core/providers/cuda/tensor/pad.cc b/onnxruntime/core/providers/cuda/tensor/pad.cc
index 4584e5fd8272c..a3705af18bc01 100644
--- a/onnxruntime/core/providers/cuda/tensor/pad.cc
+++ b/onnxruntime/core/providers/cuda/tensor/pad.cc
@@ -99,23 +99,16 @@ Status Pad<T>::ComputeInternal(OpKernelContext* ctx) const {
     ORT_ENFORCE(pads_tensor_dims.size() == 1 || (pads_tensor_dims.size() == 2 && pads_tensor_dims[0] == 1),
                 "Pads tensor should be a 1D tensor of shape [2 * input_rank] or a 2D tensor of shape [1, 2 * input_rank]");
 
-    const int64_t* pads_tensor_raw_data = pads_tensor.Data<int64_t>();
-    size_t pads_size = static_cast<size_t>(pads_tensor.Shape().Size());
+    const auto pads_data = pads_tensor.DataAsSpan<int64_t>();
+    const size_t pads_size = static_cast<size_t>(pads_tensor.Shape().Size());
     ORT_ENFORCE(pads_size == 2 * static_cast<size_t>(dimension_count),
                 "Pads tensor size should be equal to twice the input dimension count ");
 
-    pads.reserve(2LL * dimension_count);
-    for (size_t i = 0; i < pads_size; ++i) {
-      pads.push_back(pads_tensor_raw_data[i]);
-    }
+    pads.reserve(pads_size);
+    pads.assign(pads_data.begin(), pads_data.end());
+
     // Separate out any negative pads into the slices array
-    slices.resize(pads.size(), 0);
-    for (size_t index = 0; index < pads.size(); index++) {
-      if (pads[index] < 0) {
-        slices[index] = pads[index];
-        pads[index] = 0;
-      }
-    }
+    PadBase::SeparateNegativeToSlices(pads, slices);
 
     T raw_value{};
     const Tensor* value_tensor = ctx->Input<Tensor>(2);
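
Reviewer note: below is a minimal standalone sketch, not ORT code, of the two pad-handling steps this patch centralizes: per-axis pads expansion into the full [begins..., ends...] layout (as PadBase::ComputePads does when an axes input is present) followed by the negative-pad split (as PadBase::SeparateNegativeToSlices does). The main() driver, the plain std::vector types standing in for PadsVector/gsl::span, and the sample values are all invented for illustration.

// Standalone sketch of pads expansion + negative-pad split (illustrative only).
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const size_t data_rank = 4;                          // e.g. input shape [1, 3, 224, 224]
  const std::vector<int64_t> axes = {2, -1};           // negative axes wrap: -1 -> 3
  const std::vector<int64_t> pads_in = {1, 2, 3, -4};  // [x2_begin, x3_begin, x2_end, x3_end]

  // Step 1: expand per-axis pads into the full layout of length 2 * data_rank.
  std::vector<int64_t> pads(2 * data_rank, 0);
  for (size_t i = 0; i < axes.size(); ++i) {
    const int64_t wrapped = axes[i] < 0 ? axes[i] + static_cast<int64_t>(data_rank) : axes[i];
    const size_t axis = static_cast<size_t>(wrapped);
    pads[axis] = pads_in[i];                            // xi_begin
    pads[data_rank + axis] = pads_in[axes.size() + i];  // xi_end
  }

  // Step 2: negative pads mean "slice off", so move them into a parallel
  // slices array and zero them in pads.
  std::vector<int64_t> slices(pads.size(), 0);
  for (size_t i = 0; i < pads.size(); ++i) {
    if (pads[i] < 0) {
      slices[i] = pads[i];
      pads[i] = 0;
    }
  }

  for (int64_t v : pads) std::printf("%lld ", static_cast<long long>(v));    // 0 0 1 2 0 0 3 0
  std::printf("\n");
  for (int64_t v : slices) std::printf("%lld ", static_cast<long long>(v));  // 0 0 0 0 0 0 0 -4
  std::printf("\n");
  return 0;
}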
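Similarly, the worked example in the FlattenInnerShape comment can be traced in isolation. This sketch is simplified to fold on pads alone, whereas the real routine also consults the slices array; names and values are illustrative.

// Standalone trace of the inner-shape flattening example from pad.cc:
// shape [1,224,224,3] with pads [0,3,3,0,0,3,3,0] flattens to [1,224,672],
// so one memcpy can cover the folded trailing axes.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const std::vector<int64_t> dims = {1, 224, 224, 3};
  const std::vector<int64_t> pads = {0, 3, 3, 0, 0, 3, 3, 0};  // begins..., ends...
  const size_t rank = dims.size();

  // Walk inward from the last axis, folding unpadded axes into the first
  // axis that actually carries padding.
  size_t inner_axis = rank - 1;
  int64_t inner_size = 1;
  do {
    inner_size *= dims[inner_axis];
    if (inner_axis == 0) break;
    if (pads[inner_axis] != 0 || pads[rank + inner_axis] != 0) break;
  } while (inner_axis-- > 0);

  std::vector<int64_t> reshaped(dims.begin(), dims.begin() + inner_axis + 1);
  reshaped[inner_axis] = inner_size;

  for (int64_t v : reshaped) std::printf("%lld ", static_cast<long long>(v));  // 1 224 672
  std::printf("\n");
  return 0;
}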