Pad-18 Cuda implementation (#19211)
### Description
Implement the Pad-18 opset version for the CUDA execution provider.

### Motivation and Context
The latest models converted by Dynamo fall back to the CPU execution provider for Pad, which degrades performance.

This contributes to
microsoft/onnx-rewriter#126
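
For context on what opset 18 changes: Pad gains an optional `axes` input, so `pads` only has to cover the listed axes (negative axes allowed) instead of every dimension. A minimal standalone sketch of that expansion and the resulting output shape, using made-up values and plain STL types rather than ONNX Runtime code:

```cpp
// Standalone sketch (not ONNX Runtime code) of the Pad-18 `axes` semantics:
// pads are given only for the listed axes as [x1_begin, x2_begin, ..., x1_end, x2_end, ...],
// and every axis not listed gets zero padding. All values are illustrative.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  const std::vector<int64_t> data_shape = {1, 3, 224, 224};  // NCHW input
  const std::vector<int64_t> axes = {2, 3};                  // pad only H and W
  const std::vector<int64_t> pads = {1, 1, 1, 1};            // [h_begin, w_begin, h_end, w_end]

  const size_t rank = data_shape.size();
  // Expand to the full-rank layout Pad has always used:
  // [x0_begin, ..., x{r-1}_begin, x0_end, ..., x{r-1}_end]
  std::vector<int64_t> full_pads(2 * rank, 0);
  for (size_t i = 0; i < axes.size(); ++i) {
    const int64_t a = axes[i] < 0 ? axes[i] + static_cast<int64_t>(rank) : axes[i];
    full_pads[static_cast<size_t>(a)] = pads[i];                        // begin
    full_pads[rank + static_cast<size_t>(a)] = pads[axes.size() + i];   // end
  }

  // Output shape is the input shape plus begin+end padding per axis.
  std::vector<int64_t> out_shape(rank);
  for (size_t d = 0; d < rank; ++d) {
    out_shape[d] = data_shape[d] + full_pads[d] + full_pads[rank + d];
  }
  for (int64_t v : out_shape) std::cout << v << ' ';  // prints: 1 3 226 226
  std::cout << '\n';
  return 0;
}
```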
yuslepukhin authored Jan 25, 2024
1 parent 4477f57 commit 7dd1f4b
Showing 9 changed files with 287 additions and 171 deletions.
docs/OperatorKernels.md (3 changes: 2 additions & 1 deletion)
@@ -682,7 +682,8 @@ Do not modify directly.*
|PRelu|*in* X:**T**<br> *in* slope:**T**<br> *out* Y:**T**|16+|**T** = tensor(double), tensor(float), tensor(float16)|
|||[9, 15]|**T** = tensor(double), tensor(float), tensor(float16)|
|||[7, 8]|**T** = tensor(double), tensor(float), tensor(float16)|
|Pad|*in* data:**T**<br> *in* pads:**tensor(int64)**<br> *in* constant_value:**T**<br> *in* axes:**Tind**<br> *out* output:**T**<br><br>or<br><br>*in* data:**T**<br> *in* pads:**tensor(int64)**<br> *in* constant_value:**T**<br> *out* output:**T**<br><br>or<br><br>*in* data:**T**<br> *out* output:**T**|13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16)|
|Pad|*in* data:**T**<br> *in* pads:**tensor(int64)**<br> *in* constant_value:**T**<br> *in* axes:**Tind**<br> *out* output:**T**<br><br>or<br><br>*in* data:**T**<br> *in* pads:**tensor(int64)**<br> *in* constant_value:**T**<br> *out* output:**T**<br><br>or<br><br>*in* data:**T**<br> *out* output:**T**|18+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16)|
|||[13, 17]|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16)|
|||[11, 12]|**T** = tensor(double), tensor(float), tensor(float16)|
|||[2, 10]|**T** = tensor(double), tensor(float), tensor(float16)|
|ParametricSoftplus|*in* X:**T**<br> *out* Y:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)|
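The table rows above encode kernel lookup by opset: a Pad node resolves to the registration whose version range contains its operator version, so opset 13 to 17 nodes keep the existing CUDA kernel while opset 18 and later nodes pick up the new registration with the optional axes input. A trivial standalone sketch of that dispatch rule (the kernel labels below are illustrative, not real class names):

```cpp
// Illustrative dispatch by opset version, mirroring the version ranges in the
// table above. Labels are made up; real resolution happens inside ONNX Runtime.
#include <iostream>
#include <string>

static std::string PickPadKernel(int opset) {
  if (opset >= 18) return "Pad 18+ (new CUDA kernel, optional axes input)";
  if (opset >= 13) return "Pad [13, 17]";
  if (opset >= 11) return "Pad [11, 12]";
  if (opset >= 2) return "Pad [2, 10]";
  return "no CUDA kernel";
}

int main() {
  std::cout << PickPadKernel(13) << '\n';  // Pad [13, 17]
  std::cout << PickPadKernel(18) << '\n';  // Pad 18+ (new CUDA kernel, optional axes input)
  return 0;
}
```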
onnxruntime/core/providers/cpu/cpu_provider_shared.cc (8 changes: 7 additions & 1 deletion)
@@ -87,7 +87,13 @@ struct ProviderHostCPUImpl : ProviderHostCPU {
const TensorShape& indice_shape,
const TensorShape& update_shape) override { return ScatterND::ValidateShapes(input_shape, indice_shape, update_shape); }
// From cpu/tensor/padbase.h (direct)
Status PadBase__HandleDimValueZero(const Mode& mode, const TensorShape& input_shape, TensorShape& output_shape) override { return PadBase::HandleDimValueZero(mode, input_shape, output_shape); }
Status PadBase__HandleDimValueZero(const Mode& mode, const TensorShape& input_shape, const TensorShape& output_shape) override { return PadBase::HandleDimValueZero(mode, input_shape, output_shape); }

void PadBase__ComputePads(OpKernelContext& ctx, size_t data_rank, gsl::span<const int64_t> pads_data,
PadsVector& pads) override {
PadBase::ComputePads(ctx, data_rank, pads_data, pads);
}

// From cpu/tensor/split.h (direct)
Status SplitBase__PrepareForCompute(const SplitBase* p, const TensorShape& input_shape, int num_outputs, int64_t& axis, int& before_dims,
int& after_dims_including_split_axis, int& after_dims_excluding_split,
onnxruntime/core/providers/cpu/cpu_provider_shared.h (8 changes: 7 additions & 1 deletion)
@@ -25,6 +25,8 @@ class UnsqueezeBase__Prepare; // Directly maps to UnsqueezeBase::Pr
class contrib__AdamWOptimizerBase__Prepare;
class contrib__SGDOptimizerV2Base__Prepare;

using PadsVector = InlinedVector<int64_t, kTensorShapeSmallBufferElementsSize * 2>;

struct ProviderHostCPU {
// From cpu/tensor/gatherbase.h
virtual Status GatherBase__PrepareForCompute(const GatherBase* p, OpKernelContext* context, GatherBase__Prepare& prepare) = 0;
@@ -44,7 +46,11 @@ struct ProviderHostCPU {
const TensorShape& indice_shape,
const TensorShape& update_shape) = 0;
// From cpu/tensor/padbase.h
virtual Status PadBase__HandleDimValueZero(const Mode& mode, const TensorShape& input_shape, TensorShape& output_shape) = 0;
virtual Status PadBase__HandleDimValueZero(const Mode& mode, const TensorShape& input_shape, const TensorShape& output_shape) = 0;

virtual void PadBase__ComputePads(OpKernelContext& ctx, size_t data_rank, gsl::span<const int64_t> pads_data,
PadsVector& pads) = 0;

// From cpu/tensor/split.h
virtual Status SplitBase__PrepareForCompute(const SplitBase* p, const TensorShape& input_shape, int num_outputs, int64_t& axis, int& before_dims,
int& after_dims_including_split_axis, int& after_dims_excluding_split,
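These declarations extend the existing host-interface pattern: shared execution providers call PadBase helpers through a `ProviderHostCPU` virtual, and the CPU provider supplies the implementation (see the `ProviderHostCPUImpl` hunk above). A minimal standalone illustration of that indirection with stand-in types (none of the names below are the real ORT ones except the `PadBase__ComputePads` method name):

```cpp
// Miniature version of the host-interface pattern used by cpu_provider_shared:
// callers hold only an abstract interface; the CPU side provides the body.
// Types and the stand-in ComputePads body are illustrative only.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

using PadsVector = std::vector<int64_t>;

// Role of ProviderHostCPU: the pure-virtual surface other providers see.
struct HostCPU {
  virtual ~HostCPU() = default;
  virtual void PadBase__ComputePads(size_t data_rank, const PadsVector& pads_data, PadsVector& pads) = 0;
};

// Role of ProviderHostCPUImpl: forwards to the real CPU implementation.
struct HostCPUImpl : HostCPU {
  void PadBase__ComputePads(size_t data_rank, const PadsVector& pads_data, PadsVector& pads) override {
    pads.assign(pads_data.begin(), pads_data.end());  // stand-in for PadBase::ComputePads
    pads.resize(2 * data_rank, 0);
  }
};

int main() {
  HostCPUImpl host;
  HostCPU& h = host;  // a non-CPU provider would only hold this interface
  PadsVector pads;
  h.PadBase__ComputePads(3, {1, 2, 3, 4, 5, 6}, pads);
  for (int64_t p : pads) std::cout << p << ' ';  // 1 2 3 4 5 6
  std::cout << '\n';
  return 0;
}
```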
onnxruntime/core/providers/cpu/tensor/pad.cc (252 changes: 130 additions & 122 deletions)
@@ -9,6 +9,8 @@
#include "core/providers/op_kernel_type_control.h"
#include "core/util/math.h"

#include <functional>

// there's no way to use a raw pointer as the copy destination with std::copy_n
// (which gsl::copy uses with span::data() which returns a raw pointer) with the 14.11 toolset
// without generating a 4996 warning. going through an iterator is way too much overhead so turn off the warning.
@@ -167,47 +169,7 @@ ONNX_CPU_OPERATOR_KERNEL(

using PadsVector = PadBase::PadsVector;

// This is the general padding method to n-dimensionally do edge or reflection padding (based on the inputDelta values)
template <typename T>
static void PadAxis(T* output, T* input, ptrdiff_t input_delta, ptrdiff_t input_pitch,
size_t block_size, size_t block_count) {
for (size_t block_index = 0; block_index < block_count; block_index++) {
for (size_t i = 0; i < block_size; i++) {
*output++ = *input;
input += input_delta;
}
input += input_pitch;
}
}

// These are optimizations of PadAxis. The inner loop is removed since the innermost axis has a blockSize of 1,
// and inputPitch and inputDelta are just a single value added each iteration.
template <typename T>
static void PadInnermostAxis(T* output, T* input, ptrdiff_t input_delta, size_t block_count) {
for (size_t block_index = 0; block_index < block_count; block_index++) {
*output++ = *input;
input += input_delta;
}
}

// For constant padding, there is no input, just a size to write the constant to
template <typename T>
static void PadAxisConstant(T* output, T constant, size_t size) {
if (size == 1) {
*output = constant;
} else if (size == 2) {
*output = constant;
*(output + 1) = constant;
} else {
// This would be faster with SSE instructions.
// That would mean to have an implementation for each type (uint8, uint32, uint64).
T* end = output + size;
for (; output != end;)
*output++ = constant;
}
}

Status PadBase::HandleDimValueZero(const Mode& mode, const TensorShape& input_shape, TensorShape& output_shape) {
Status PadBase::HandleDimValueZero(const Mode& mode, const TensorShape& input_shape, const TensorShape& output_shape) {
switch (mode) {
case Mode::Constant: {
// default behavior is fine
@@ -242,34 +204,66 @@ Status PadBase::HandleDimValueZero(const Mode& mode, const TensorShape& input_sh
return Status::OK();
}

// special handling for edge case where the input has one or more dims with value of 0
template <typename T>
static Status PadInputWithDimValueOfZero(OpKernelContext* ctx,
const Mode& mode,
const TensorShape& input_shape,
TensorShapeVector& output_dims,
T value) {
TensorShape output_shape(output_dims);
ORT_RETURN_IF_ERROR(PadBase::HandleDimValueZero(mode, input_shape, output_shape));

auto& output_tensor = *ctx->Output(0, output_shape);

// we need to add pads if mode is constant, otherwise the output has one or more dim values of 0 so is empty
if (mode == Mode::Constant) {
// we add pads with the default value to all dims including those with a value of 0
auto* output = reinterpret_cast<T*>(output_tensor.MutableDataRaw());
std::fill_n(output, output_shape.Size(), value);
static void ComputePadWithAxes(
gsl::span<const int64_t> pads_tensor_raw_data,
std::function<int64_t(size_t)> get_axis,
size_t axes_size,
size_t data_rank,
PadsVector& pads) {
for (size_t i = 0; i < axes_size; ++i) {
const size_t axis = onnxruntime::narrow<size_t>(HandleNegativeAxis(get_axis(i), data_rank));
pads[axis] = pads_tensor_raw_data[i]; // xi_begin
pads[data_rank + axis] = pads_tensor_raw_data[axes_size + i]; // xi_end
}
}

return Status::OK();
void PadBase::ComputePads(OpKernelContext& ctx, size_t data_rank, gsl::span<const int64_t> pads_data,
PadsVector& pads) {
pads.reserve(2 * data_rank);
const Tensor* axes_tensor = ctx.Input<Tensor>(3);
if (axes_tensor) {
const size_t num_axes_dims = axes_tensor->Shape().NumDimensions();
ORT_ENFORCE(num_axes_dims == 1, "Axes tensor should be a 1D tensor ");

const int64_t num_axes = axes_tensor->Shape().Size();
ORT_ENFORCE(pads_data.size() == narrow<size_t>(2 * num_axes),
"Pads tensor size should be equal to twice the number of explicitly provided axes.");

pads.resize(2 * data_rank, 0);
if (axes_tensor->IsDataType<int32_t>()) {
auto axes_data = axes_tensor->DataAsSpan<int32_t>();
ComputePadWithAxes(
pads_data,
[axes_data](size_t idx) -> int64_t {
return axes_data[idx];
},
axes_data.size(),
data_rank,
pads);
} else if (axes_tensor->IsDataType<int64_t>()) {
auto axes_data = axes_tensor->DataAsSpan<int64_t>();
ComputePadWithAxes(
pads_data,
[axes_data](size_t idx) {
return axes_data[idx];
},
axes_data.size(),
data_rank,
pads);
}
} else {
ORT_ENFORCE(pads_data.size() == 2 * data_rank,
"Pads tensor size should be equal to twice the input dimension count ");
pads.assign(pads_data.begin(), pads_data.end());
}
}
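
For a concrete picture of what the new `PadBase::ComputePads` produces, here is a standalone replica of the `ComputePadWithAxes` indexing on plain vectors, using a made-up negative-axis example:

```cpp
// Standalone replica of the indexing done by ComputePadWithAxes above
// (plain vectors instead of ORT tensors), showing the negative-axis case.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

int main() {
  const size_t data_rank = 3;
  const std::vector<int64_t> axes = {-1};         // last axis, i.e. axis 2
  const std::vector<int64_t> pads_data = {2, 5};  // {begin, end} for that axis

  std::vector<int64_t> pads(2 * data_rank, 0);    // [begins..., ends...]
  for (size_t i = 0; i < axes.size(); ++i) {
    const int64_t a = axes[i] < 0 ? axes[i] + static_cast<int64_t>(data_rank) : axes[i];
    pads[static_cast<size_t>(a)] = pads_data[i];                             // xi_begin
    pads[data_rank + static_cast<size_t>(a)] = pads_data[axes.size() + i];   // xi_end
  }
  assert((pads == std::vector<int64_t>{0, 0, 2, 0, 0, 5}));
  return 0;
}
```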

// Flatten no padding inner most Axis, so one memcpy cover multiple Axis.
// For example, for a shape of [1,224,224,3] with padding [0,3,3,0,0,3,3,0], can be flatten as
// [1,224,224*3] with padding [0,3,3*3,0,3,3*3].
static void FlattenInnerShape(const TensorShapeVector& input_dims, const PadsVector& pads,
const PadsVector& slices, TensorShapeVector& reshaped_dims) {
size_t dims_count = input_dims.size();
void PadBase::FlattenInnerShape(gsl::span<const int64_t> input_dims, gsl::span<const int64_t> pads,
gsl::span<const int64_t> slices, TensorShapeVector& reshaped_dims) {
const size_t dims_count = input_dims.size();
size_t inner_axis = dims_count - 1;
size_t inner_size = 1;

@@ -288,14 +282,14 @@ static void FlattenInnerShape(const TensorShapeVector& input_dims, const PadsVec
} while (inner_axis-- > 0);

reshaped_dims.reserve(inner_axis + 1);
std::copy(input_dims.cbegin(), input_dims.cbegin() + inner_axis + 1, std::back_inserter(reshaped_dims));
std::copy(input_dims.begin(), input_dims.begin() + inner_axis + 1, std::back_inserter(reshaped_dims));

// Flatten inner axis.
reshaped_dims[inner_axis] = inner_size;
}

static void ReshapePads(const PadsVector& src_pad, size_t src_dim_count, size_t new_dim_count,
size_t inner_no_pad_size, PadsVector& reshaped_pad) {
void PadBase::ReshapePads(gsl::span<const int64_t> src_pad, size_t src_dim_count, size_t new_dim_count,
size_t inner_no_pad_size, PadsVector& reshaped_pad) {
size_t inner_axis = new_dim_count - 1;
std::copy(src_pad.begin(), src_pad.begin() + inner_axis, reshaped_pad.begin());
std::copy(src_pad.begin() + src_dim_count, src_pad.begin() + src_dim_count + inner_axis,
@@ -306,6 +300,68 @@ static void ReshapePads(const PadsVector& src_pad, size_t src_dim_count, size_t
reshaped_pad[inner_axis + new_dim_count] = src_pad[inner_axis + src_dim_count] * inner_no_pad_size;
}
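
The worked example in the comment above `FlattenInnerShape` ([1,224,224,3] flattening to [1,224,672]) can be checked with a short standalone mirror of the arithmetic (slices omitted, values illustrative):

```cpp
// Quick standalone check of the flattening arithmetic described above:
// trailing axes with no padding fold into the innermost padded axis, and that
// axis's pads scale by the folded element count. Simplified mirror only.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

int main() {
  const std::vector<int64_t> dims = {1, 224, 224, 3};
  // [x0_begin, x1_begin, x2_begin, x3_begin, x0_end, x1_end, x2_end, x3_end]
  const std::vector<int64_t> pads = {0, 3, 3, 0, 0, 3, 3, 0};
  const size_t rank = dims.size();

  // Fold trailing axes that have no padding at either end into one axis.
  size_t inner_axis = rank - 1;
  int64_t inner_size = 1;
  while (inner_axis > 0 && pads[inner_axis] == 0 && pads[rank + inner_axis] == 0) {
    inner_size *= dims[inner_axis];
    --inner_axis;
  }
  inner_size *= dims[inner_axis];  // include the first padded axis itself
  const int64_t inner_no_pad_size = inner_size / dims[inner_axis];

  std::vector<int64_t> reshaped_dims(dims.begin(), dims.begin() + inner_axis + 1);
  reshaped_dims[inner_axis] = inner_size;
  assert((reshaped_dims == std::vector<int64_t>{1, 224, 672}));

  // Pads on the flattened axis scale by the folded element count.
  assert(pads[inner_axis] * inner_no_pad_size == 9);         // begin: 3 * 3
  assert(pads[rank + inner_axis] * inner_no_pad_size == 9);  // end:   3 * 3
  return 0;
}
```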

// special handling for edge case where the input has one or more dims with value of 0
template <typename T>
static Status PadInputWithDimValueOfZero(OpKernelContext* ctx,
const Mode& mode,
const TensorShape& input_shape,
TensorShapeVector& output_dims,
T value) {
TensorShape output_shape(output_dims);
ORT_RETURN_IF_ERROR(PadBase::HandleDimValueZero(mode, input_shape, output_shape));

auto& output_tensor = *ctx->Output(0, output_shape);

// we need to add pads if mode is constant, otherwise the output has one or more dim values of 0 so is empty
if (mode == Mode::Constant) {
// we add pads with the default value to all dims including those with a value of 0
auto* output = reinterpret_cast<T*>(output_tensor.MutableDataRaw());
std::fill_n(output, output_shape.Size(), value);
}

return Status::OK();
}

// This is the general padding method to n-dimensionally do edge or reflection padding (based on the inputDelta values)
template <typename T>
static void PadAxis(T* output, T* input, ptrdiff_t input_delta, ptrdiff_t input_pitch,
size_t block_size, size_t block_count) {
for (size_t block_index = 0; block_index < block_count; block_index++) {
for (size_t i = 0; i < block_size; i++) {
*output++ = *input;
input += input_delta;
}
input += input_pitch;
}
}

// These are optimizations of PadAxis. The inner loop is removed since the innermost axis has a blockSize of 1,
// and inputPitch and inputDelta are just a single value added each iteration.
template <typename T>
static void PadInnermostAxis(T* output, T* input, ptrdiff_t input_delta, size_t block_count) {
for (size_t block_index = 0; block_index < block_count; block_index++) {
*output++ = *input;
input += input_delta;
}
}

// For constant padding, there is no input, just a size to write the constant to
template <typename T>
static void PadAxisConstant(T* output, T constant, size_t size) {
if (size == 1) {
*output = constant;
} else if (size == 2) {
*output = constant;
*(output + 1) = constant;
} else {
// This would be faster with SSE instructions.
// That would mean to have an implementation for each type (uint8, uint32, uint64).
T* end = output + size;
for (; output != end;)
*output++ = constant;
}
}
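
To see how `input_delta` makes the same copy loop produce either edge or reflect padding, here is a self-contained 1-D demo that copies the `PadInnermostAxis` helper above and runs it with illustrative values:

```cpp
// Standalone 1-D illustration of how input_delta drives the copy loops above:
// delta 0 replicates an edge value, delta -1 walks backwards to produce a
// reflection. The helper is copied verbatim so the demo compiles on its own.
#include <cstddef>
#include <iostream>
#include <vector>

template <typename T>
static void PadInnermostAxis(T* output, T* input, std::ptrdiff_t input_delta, std::size_t block_count) {
  for (std::size_t block_index = 0; block_index < block_count; block_index++) {
    *output++ = *input;
    input += input_delta;
  }
}

int main() {
  std::vector<int> data = {1, 2, 3, 4};
  std::vector<int> out(2);

  // Edge padding on the left: repeat data[0] twice.
  PadInnermostAxis(out.data(), data.data(), /*input_delta=*/0, out.size());
  std::cout << out[0] << ' ' << out[1] << '\n';  // 1 1

  // Reflect padding on the left: start at data[2] and walk backwards.
  PadInnermostAxis(out.data(), data.data() + 2, /*input_delta=*/-1, out.size());
  std::cout << out[0] << ' ' << out[1] << '\n';  // 3 2
  return 0;
}
```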

template <typename T>
static Status PadImpl(OpKernelContext* ctx,
const PadsVector& pads,
@@ -327,7 +383,7 @@ static Status PadImpl(OpKernelContext* ctx,

// Reshape input dims
TensorShapeVector reshaped_input_dims;
FlattenInnerShape(output_dims, pads, slices, reshaped_input_dims);
PadBase::FlattenInnerShape(output_dims, pads, slices, reshaped_input_dims);

// Reshape padding
size_t new_dims_count = reshaped_input_dims.size();
@@ -336,8 +392,8 @@
? reshaped_input_dims[inner_axis] / output_dims[inner_axis]
: 0);
PadsVector reshaped_pad(2 * new_dims_count), reshaped_slice(2 * new_dims_count);
ReshapePads(pads, data_rank, new_dims_count, inner_no_pad_size, reshaped_pad);
ReshapePads(slices, data_rank, new_dims_count, inner_no_pad_size, reshaped_slice);
PadBase::ReshapePads(pads, data_rank, new_dims_count, inner_no_pad_size, reshaped_pad);
PadBase::ReshapePads(slices, data_rank, new_dims_count, inner_no_pad_size, reshaped_slice);

TensorShapeVector reshaped_output_dims = reshaped_input_dims;
TensorShapeVector input_starts;
@@ -575,20 +631,6 @@ static PadValue PadValueFromFloat(float value, MLDataType data_type) {
return result;
}

template <class T>
void ComputePadWithAxes(
gsl::span<const int64_t> pads_tensor_raw_data,
gsl::span<const T> axes_tensor_raw_data,
size_t data_rank,
PadsVector& pads) {
size_t axes_size = axes_tensor_raw_data.size();
for (size_t i = 0; i < axes_size; ++i) {
int64_t axis = HandleNegativeAxis(onnxruntime::narrow<int64_t>(axes_tensor_raw_data[i]), data_rank);
pads[onnxruntime::narrow<size_t>(axis)] = pads_tensor_raw_data[i]; // xi_begin
pads[data_rank + onnxruntime::narrow<size_t>(axis)] = pads_tensor_raw_data[axes_size + i]; // xi_end
}
}

Status Pad::Compute(OpKernelContext* ctx) const {
const Tensor& input_tensor = *ctx->Input<Tensor>(0);
MLDataType data_type = input_tensor.DataType();
Expand All @@ -608,48 +650,14 @@ Status Pad::Compute(OpKernelContext* ctx) const {
ORT_ENFORCE(pads_tensor_dims.size() == 1 || (pads_tensor_dims.size() == 2 && pads_tensor_dims[0] == 1),
"Pads tensor should be a 1D tensor of shape [2 * num_axes] "
"or a 2D tensor of shape [1, 2 * num_axes]");
const int64_t* pads_tensor_raw_data = pads_tensor.Data<int64_t>();
size_t pads_size = static_cast<size_t>(pads_tensor.Shape().Size());
pads.reserve(2 * data_rank);

const Tensor* axes_tensor = ctx->Input<Tensor>(3);
if (axes_tensor) {
const auto& axes_tensor_dims = axes_tensor->Shape().GetDims();
ORT_ENFORCE(axes_tensor_dims.size() == 1, "Axes tensor should be a 1D tensor ");
int64_t axes_size = axes_tensor_dims[0];

pads.resize(2 * data_rank, 0);
if (axes_tensor->IsDataType<int32_t>()) {
const int32_t* axes_tensor_raw_data = axes_tensor->Data<int32_t>();
ComputePadWithAxes<int32_t>(
{pads_tensor_raw_data, onnxruntime::narrow<size_t>(2 * axes_size)},
{axes_tensor_raw_data, onnxruntime::narrow<size_t>(axes_size)},
data_rank,
pads);
} else if (axes_tensor->IsDataType<int64_t>()) {
const int64_t* axes_tensor_raw_data = axes_tensor->Data<int64_t>();
ComputePadWithAxes<int64_t>(
{pads_tensor_raw_data, onnxruntime::narrow<size_t>(2 * axes_size)},
{axes_tensor_raw_data, onnxruntime::narrow<size_t>(axes_size)},
data_rank,
pads);
}
} else {
ORT_ENFORCE(pads_size == 2 * data_rank,
"Pads tensor size should be equal to twice the input dimension count ");
for (size_t i = 0; i < pads_size; ++i) {
pads.push_back(pads_tensor_raw_data[i]);
}
}

const auto pads_data = pads_tensor.DataAsSpan<int64_t>();

// Compute Pads by applying axes if specified otherwise copy the supplied pads.
PadBase::ComputePads(*ctx, data_rank, pads_data, pads);

// Separate out any negative pads into the slices array
slices.assign(pads.size(), 0);
for (size_t index = 0; index < pads.size(); index++) {
if (pads[index] < 0) {
slices[index] = pads[index];
pads[index] = 0;
}
}
PadBase::SeparateNegativeToSlices(pads, slices);

value.u64 = 0U;
const Tensor* value_tensor = ctx->Input<Tensor>(2);
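In the refactored `Pad::Compute` above, negative pad values mean "trim" rather than "pad", and the old inline separation loop is now the shared `PadBase::SeparateNegativeToSlices` helper. A standalone illustration of that separation with made-up values (it mirrors the loop the old code used inline):

```cpp
// Standalone illustration of the negative-pads separation performed by the
// SeparateNegativeToSlices call in Compute above (plain vectors, values are
// illustrative).
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

int main() {
  // Begin/end pads for a rank-2 input: pad 2 before axis 0, and "pad" -1 after
  // axis 1, i.e. trim one trailing element from axis 1 instead of padding it.
  std::vector<int64_t> pads = {2, 0, 0, -1};
  std::vector<int64_t> slices(pads.size(), 0);

  for (size_t i = 0; i < pads.size(); ++i) {
    if (pads[i] < 0) {
      slices[i] = pads[i];  // negative amounts move to the slices array...
      pads[i] = 0;          // ...and the pad itself becomes zero
    }
  }
  assert((pads == std::vector<int64_t>{2, 0, 0, 0}));
  assert((slices == std::vector<int64_t>{0, 0, 0, -1}));
  return 0;
}
```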
(The remaining changed files in this commit, including the CUDA provider sources, are collapsed and not rendered here.)