Pad-18 Cuda implementation #19211

Merged: 5 commits, merged on Jan 25, 2024 (showing changes from 3 commits).
3 changes: 2 additions & 1 deletion docs/OperatorKernels.md
@@ -682,7 +682,8 @@ Do not modify directly.*
|PRelu|*in* X:**T**<br> *in* slope:**T**<br> *out* Y:**T**|16+|**T** = tensor(double), tensor(float), tensor(float16)|
|||[9, 15]|**T** = tensor(double), tensor(float), tensor(float16)|
|||[7, 8]|**T** = tensor(double), tensor(float), tensor(float16)|
|Pad|*in* data:**T**<br> *in* pads:**tensor(int64)**<br> *in* constant_value:**T**<br> *in* axes:**Tind**<br> *out* output:**T**<br><br>or<br><br>*in* data:**T**<br> *in* pads:**tensor(int64)**<br> *in* constant_value:**T**<br> *out* output:**T**<br><br>or<br><br>*in* data:**T**<br> *out* output:**T**|13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16)|
|Pad|*in* data:**T**<br> *in* pads:**tensor(int64)**<br> *in* constant_value:**T**<br> *in* axes:**Tind**<br> *out* output:**T**<br><br>or<br><br>*in* data:**T**<br> *in* pads:**tensor(int64)**<br> *in* constant_value:**T**<br> *out* output:**T**<br><br>or<br><br>*in* data:**T**<br> *out* output:**T**|18+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16)|
|||[13, 17]|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16)|
|||[11, 12]|**T** = tensor(double), tensor(float), tensor(float16)|
|||[2, 10]|**T** = tensor(double), tensor(float), tensor(float16)|
|ParametricSoftplus|*in* X:**T**<br> *out* Y:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)|
8 changes: 7 additions & 1 deletion onnxruntime/core/providers/cpu/cpu_provider_shared.cc
@@ -87,7 +87,13 @@
const TensorShape& indice_shape,
const TensorShape& update_shape) override { return ScatterND::ValidateShapes(input_shape, indice_shape, update_shape); }
// From cpu/tensor/padbase.h (direct)
Status PadBase__HandleDimValueZero(const Mode& mode, const TensorShape& input_shape, TensorShape& output_shape) override { return PadBase::HandleDimValueZero(mode, input_shape, output_shape); }
Status PadBase__HandleDimValueZero(const Mode& mode, const TensorShape& input_shape, const TensorShape& output_shape) override { return PadBase::HandleDimValueZero(mode, input_shape, output_shape); }

Check warning (GitHub Actions / cpplint): onnxruntime/core/providers/cpu/cpu_provider_shared.cc:90: Lines should be <= 120 characters long [whitespace/line_length] [2]

void PadBase__ComputePads(OpKernelContext* ctx, size_t data_rank, gsl::span<const int64_t> pads_data,
PadsVector& pads) override {
PadBase::ComputePads(ctx, data_rank, pads_data, pads);
}

// From cpu/tensor/split.h (direct)
Status SplitBase__PrepareForCompute(const SplitBase* p, const TensorShape& input_shape, int num_outputs, int64_t& axis, int& before_dims,
int& after_dims_including_split_axis, int& after_dims_excluding_split,
8 changes: 7 additions & 1 deletion onnxruntime/core/providers/cpu/cpu_provider_shared.h
@@ -25,6 +25,8 @@
class contrib__AdamWOptimizerBase__Prepare;
class contrib__SGDOptimizerV2Base__Prepare;

using PadsVector = InlinedVector<int64_t, kTensorShapeSmallBufferElementsSize * 2>;

struct ProviderHostCPU {
// From cpu/tensor/gatherbase.h
virtual Status GatherBase__PrepareForCompute(const GatherBase* p, OpKernelContext* context, GatherBase__Prepare& prepare) = 0;
@@ -44,7 +46,11 @@
const TensorShape& indice_shape,
const TensorShape& update_shape) = 0;
// From cpu/tensor/padbase.h
virtual Status PadBase__HandleDimValueZero(const Mode& mode, const TensorShape& input_shape, TensorShape& output_shape) = 0;
virtual Status PadBase__HandleDimValueZero(const Mode& mode, const TensorShape& input_shape, const TensorShape& output_shape) = 0;

Check warning (GitHub Actions / cpplint): onnxruntime/core/providers/cpu/cpu_provider_shared.h:49: Lines should be <= 120 characters long [whitespace/line_length] [2]

virtual void PadBase__ComputePads(OpKernelContext* ctx, size_t data_rank, gsl::span<const int64_t> pads_data,
PadsVector& pads) = 0;

// From cpu/tensor/split.h
virtual Status SplitBase__PrepareForCompute(const SplitBase* p, const TensorShape& input_shape, int num_outputs, int64_t& axis, int& before_dims,
int& after_dims_including_split_axis, int& after_dims_excluding_split,
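The two provider-shared changes above widen the CPU provider bridge: the PadsVector alias and PadBase__ComputePads are now exposed through ProviderHostCPU, so non-CPU execution providers (such as the CUDA Pad-18 kernel this PR adds) can reuse the CPU pad-preparation logic without linking the CPU provider directly. Below is a minimal sketch of that bridge pattern with simplified stand-in types; the real interfaces use OpKernelContext, gsl::span and InlinedVector, and the forwarding body here is only a placeholder.

#include <cstddef>
#include <cstdint>
#include <vector>

using PadsVector = std::vector<int64_t>;  // stand-in for the InlinedVector alias above

// Interface visible to all providers (analogous to ProviderHostCPU).
struct HostBridge {
  virtual ~HostBridge() = default;
  virtual void PadBase__ComputePads(size_t data_rank,
                                    const std::vector<int64_t>& pads_data,
                                    PadsVector& pads) = 0;
};

// Implementation living in the CPU provider, forwarding to the shared PadBase
// logic (analogous to the override in cpu_provider_shared.cc above).
struct HostBridgeImpl : HostBridge {
  void PadBase__ComputePads(size_t data_rank, const std::vector<int64_t>& pads_data,
                            PadsVector& pads) override {
    // Placeholder body: the real override simply calls PadBase::ComputePads.
    pads.assign(pads_data.begin(), pads_data.end());
    pads.resize(2 * data_rank, 0);
  }
};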
250 changes: 128 additions & 122 deletions onnxruntime/core/providers/cpu/tensor/pad.cc
@@ -167,47 +167,7 @@ ONNX_CPU_OPERATOR_KERNEL(

using PadsVector = PadBase::PadsVector;

// This is the general padding method to n-dimensionally do edge or reflection padding (based on the inputDelta values)
template <typename T>
static void PadAxis(T* output, T* input, ptrdiff_t input_delta, ptrdiff_t input_pitch,
size_t block_size, size_t block_count) {
for (size_t block_index = 0; block_index < block_count; block_index++) {
for (size_t i = 0; i < block_size; i++) {
*output++ = *input;
input += input_delta;
}
input += input_pitch;
}
}

// These are optimizations of PadAxis. The inner loop is removed since the innermost axis has a blockSize of 1,
// and inputPitch and inputDelta are just a single value added each iteration.
template <typename T>
static void PadInnermostAxis(T* output, T* input, ptrdiff_t input_delta, size_t block_count) {
for (size_t block_index = 0; block_index < block_count; block_index++) {
*output++ = *input;
input += input_delta;
}
}

// For constant padding, there is no input, just a size to write the constant to
template <typename T>
static void PadAxisConstant(T* output, T constant, size_t size) {
if (size == 1) {
*output = constant;
} else if (size == 2) {
*output = constant;
*(output + 1) = constant;
} else {
// This would be faster with SSE instructions.
// That would mean to have an implementation for each type (uint8, uint32, uint64).
T* end = output + size;
for (; output != end;)
*output++ = constant;
}
}

Status PadBase::HandleDimValueZero(const Mode& mode, const TensorShape& input_shape, TensorShape& output_shape) {
Status PadBase::HandleDimValueZero(const Mode& mode, const TensorShape& input_shape, const TensorShape& output_shape) {
switch (mode) {
case Mode::Constant: {
// default behavior is fine
@@ -242,34 +202,66 @@ Status PadBase::HandleDimValueZero(const Mode& mode, const TensorShape& input_sh
return Status::OK();
}

// special handling for edge case where the input has one or more dims with value of 0
template <typename T>
static Status PadInputWithDimValueOfZero(OpKernelContext* ctx,
const Mode& mode,
const TensorShape& input_shape,
TensorShapeVector& output_dims,
T value) {
TensorShape output_shape(output_dims);
ORT_RETURN_IF_ERROR(PadBase::HandleDimValueZero(mode, input_shape, output_shape));

auto& output_tensor = *ctx->Output(0, output_shape);

// we need to add pads if mode is constant, otherwise the output has one or more dim values of 0 so is empty
if (mode == Mode::Constant) {
// we add pads with the default value to all dims including those with a value of 0
auto* output = reinterpret_cast<T*>(output_tensor.MutableDataRaw());
std::fill_n(output, output_shape.Size(), value);
static void ComputePadWithAxes(
gsl::span<const int64_t> pads_tensor_raw_data,
std::function<int64_t(size_t)> get_axis,
size_t axes_size,
size_t data_rank,
PadsVector& pads) {
for (size_t i = 0; i < axes_size; ++i) {
const size_t axis = onnxruntime::narrow<size_t>(HandleNegativeAxis(get_axis(i), data_rank));
pads[axis] = pads_tensor_raw_data[i]; // xi_begin
pads[data_rank + axis] = pads_tensor_raw_data[axes_size + i]; // xi_end
}
}

return Status::OK();
void PadBase::ComputePads(OpKernelContext* ctx, size_t data_rank, gsl::span<const int64_t> pads_data,
PadsVector& pads) {
pads.reserve(2 * data_rank);
const Tensor* axes_tensor = ctx->Input<Tensor>(3);
if (axes_tensor) {
const size_t num_axes_dims = axes_tensor->Shape().NumDimensions();
ORT_ENFORCE(num_axes_dims == 1, "Axes tensor should be a 1D tensor ");

const int64_t num_axes = axes_tensor->Shape().Size();
ORT_ENFORCE(pads_data.size() == narrow<size_t>(2 * num_axes),
"Pads tensor size should be equal to twice the number of explicitly provided axes.");

pads.resize(2 * data_rank, 0);
if (axes_tensor->IsDataType<int32_t>()) {
auto axes_data = axes_tensor->DataAsSpan<int32_t>();
ComputePadWithAxes(
pads_data,
[axes_data](size_t idx) -> int64_t {
return axes_data[idx];
},
axes_data.size(),
data_rank,
pads);
} else if (axes_tensor->IsDataType<int64_t>()) {
auto axes_data = axes_tensor->DataAsSpan<int64_t>();
ComputePadWithAxes(
pads_data,
[axes_data](size_t idx) {
return axes_data[idx];
},
axes_data.size(),
data_rank,
pads);
}
} else {
ORT_ENFORCE(pads_data.size() == 2 * data_rank,
"Pads tensor size should be equal to twice the input dimension count ");
pads.assign(pads_data.begin(), pads_data.end());
}
}

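As a quick illustration of the axes handling in ComputePads above: with data_rank = 3, axes = [0, -1] and a pads input of [1, 2, 3, 4] (begin values for the listed axes followed by their end values), the expanded pads become [1, 0, 2, 3, 0, 4]. A standalone sketch of that expansion, using plain std types in place of ORT's span and InlinedVector:

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const size_t data_rank = 3;
  // Pads for the explicitly listed axes: [begin_0, begin_1, end_0, end_1].
  const std::vector<int64_t> pads_data = {1, 2, 3, 4};
  const std::vector<int64_t> axes = {0, -1};  // -1 resolves to axis 2

  // Full layout is [x0_begin, x1_begin, x2_begin, x0_end, x1_end, x2_end].
  std::vector<int64_t> pads(2 * data_rank, 0);
  for (size_t i = 0; i < axes.size(); ++i) {
    const int64_t resolved = axes[i] < 0 ? axes[i] + static_cast<int64_t>(data_rank) : axes[i];
    const size_t axis = static_cast<size_t>(resolved);
    pads[axis] = pads_data[i];                            // xi_begin
    pads[data_rank + axis] = pads_data[axes.size() + i];  // xi_end
  }

  for (int64_t p : pads) std::printf("%lld ", static_cast<long long>(p));
  std::printf("\n");  // prints: 1 0 2 3 0 4
}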
// Flatten no padding inner most Axis, so one memcpy cover multiple Axis.
// For example, for a shape of [1,224,224,3] with padding [0,3,3,0,0,3,3,0], can be flatten as
// [1,224,224*3] with padding [0,3,3*3,0,3,3*3].
static void FlattenInnerShape(const TensorShapeVector& input_dims, const PadsVector& pads,
const PadsVector& slices, TensorShapeVector& reshaped_dims) {
size_t dims_count = input_dims.size();
void PadBase::FlattenInnerShape(gsl::span<const int64_t> input_dims, gsl::span<const int64_t> pads,
gsl::span<const int64_t> slices, TensorShapeVector& reshaped_dims) {
const size_t dims_count = input_dims.size();
size_t inner_axis = dims_count - 1;
size_t inner_size = 1;

@@ -288,14 +280,14 @@ static void FlattenInnerShape(const TensorShapeVector& input_dims, const PadsVec
} while (inner_axis-- > 0);

reshaped_dims.reserve(inner_axis + 1);
std::copy(input_dims.cbegin(), input_dims.cbegin() + inner_axis + 1, std::back_inserter(reshaped_dims));
std::copy(input_dims.begin(), input_dims.begin() + inner_axis + 1, std::back_inserter(reshaped_dims));

// Flatten inner axis.
reshaped_dims[inner_axis] = inner_size;
}

static void ReshapePads(const PadsVector& src_pad, size_t src_dim_count, size_t new_dim_count,
size_t inner_no_pad_size, PadsVector& reshaped_pad) {
void PadBase::ReshapePads(gsl::span<const int64_t> src_pad, size_t src_dim_count, size_t new_dim_count,
size_t inner_no_pad_size, PadsVector& reshaped_pad) {
size_t inner_axis = new_dim_count - 1;
std::copy(src_pad.begin(), src_pad.begin() + inner_axis, reshaped_pad.begin());
std::copy(src_pad.begin() + src_dim_count, src_pad.begin() + src_dim_count + inner_axis,
@@ -306,6 +298,68 @@ static void ReshapePads(const PadsVector& src_pad, size_t src_dim_count, size_t
reshaped_pad[inner_axis + new_dim_count] = src_pad[inner_axis + src_dim_count] * inner_no_pad_size;
}

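To make the flattening above concrete: for a shape of [1, 224, 224, 3] with pads [0, 3, 3, 0, 0, 3, 3, 0], the innermost axis carries no padding, so it is folded into axis 2, giving a reshaped shape of [1, 224, 672] and reshaped pads [0, 3, 9, 0, 3, 9] (the pads on the new innermost axis are scaled by the folded size, which is what ReshapePads does). A simplified standalone re-implementation of that idea; negative pads/slices are ignored for brevity, and part of the real loop is collapsed in the diff above.

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// Fold trailing unpadded axes into the innermost padded axis and scale its pads.
static void FlattenSketch(const std::vector<int64_t>& dims, const std::vector<int64_t>& pads,
                          std::vector<int64_t>& new_dims, std::vector<int64_t>& new_pads) {
  const size_t rank = dims.size();
  size_t inner_axis = rank - 1;
  int64_t folded = 1;  // product of the trailing axes that carry no padding
  while (inner_axis > 0 && pads[inner_axis] == 0 && pads[rank + inner_axis] == 0) {
    folded *= dims[inner_axis];
    --inner_axis;
  }

  new_dims.assign(dims.begin(), dims.begin() + inner_axis + 1);
  new_dims[inner_axis] = dims[inner_axis] * folded;

  const size_t new_rank = new_dims.size();
  new_pads.assign(2 * new_rank, 0);
  for (size_t i = 0; i < inner_axis; ++i) {
    new_pads[i] = pads[i];
    new_pads[new_rank + i] = pads[rank + i];
  }
  new_pads[inner_axis] = pads[inner_axis] * folded;                    // begin pad, scaled
  new_pads[new_rank + inner_axis] = pads[rank + inner_axis] * folded;  // end pad, scaled
}

int main() {
  const std::vector<int64_t> dims = {1, 224, 224, 3};
  const std::vector<int64_t> pads = {0, 3, 3, 0, 0, 3, 3, 0};
  std::vector<int64_t> new_dims, new_pads;
  FlattenSketch(dims, pads, new_dims, new_pads);
  // new_dims: 1 224 672   new_pads: 0 3 9 0 3 9
  for (int64_t d : new_dims) std::printf("%lld ", static_cast<long long>(d));
  std::printf("| ");
  for (int64_t p : new_pads) std::printf("%lld ", static_cast<long long>(p));
  std::printf("\n");
}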
// special handling for edge case where the input has one or more dims with value of 0
template <typename T>
static Status PadInputWithDimValueOfZero(OpKernelContext* ctx,
const Mode& mode,
const TensorShape& input_shape,
TensorShapeVector& output_dims,
T value) {
TensorShape output_shape(output_dims);
ORT_RETURN_IF_ERROR(PadBase::HandleDimValueZero(mode, input_shape, output_shape));

auto& output_tensor = *ctx->Output(0, output_shape);

// we need to add pads if mode is constant, otherwise the output has one or more dim values of 0 so is empty
if (mode == Mode::Constant) {
// we add pads with the default value to all dims including those with a value of 0
auto* output = reinterpret_cast<T*>(output_tensor.MutableDataRaw());
std::fill_n(output, output_shape.Size(), value);
}

return Status::OK();
}

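For a concrete (hypothetical) case of the zero-sized-dimension path above: constant-padding an input of shape [0, 2] with pads [1, 0, 1, 0] yields an output of shape [2, 2] that is simply filled with the constant, because there is no input data to copy; the non-constant modes are validated by HandleDimValueZero (the remainder of that switch is collapsed in the diff above). A tiny sketch of the constant-fill branch:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Hypothetical shapes: input [0, 2], pads [1, 0, 1, 0] -> output [2, 2].
  const std::vector<int64_t> output_dims = {2, 2};
  const float pad_value = 5.0f;
  std::vector<float> output(static_cast<size_t>(output_dims[0] * output_dims[1]));
  std::fill_n(output.data(), output.size(), pad_value);  // mirrors the std::fill_n above
  for (float v : output) std::printf("%.1f ", v);        // prints: 5.0 5.0 5.0 5.0
  std::printf("\n");
}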
// This is the general padding method to n-dimensionally do edge or reflection padding (based on the inputDelta values)
template <typename T>
static void PadAxis(T* output, T* input, ptrdiff_t input_delta, ptrdiff_t input_pitch,
size_t block_size, size_t block_count) {
for (size_t block_index = 0; block_index < block_count; block_index++) {
for (size_t i = 0; i < block_size; i++) {
*output++ = *input;
input += input_delta;
}
input += input_pitch;
}
}

// These are optimizations of PadAxis. The inner loop is removed since the innermost axis has a blockSize of 1,
// and inputPitch and inputDelta are just a single value added each iteration.
template <typename T>
static void PadInnermostAxis(T* output, T* input, ptrdiff_t input_delta, size_t block_count) {
for (size_t block_index = 0; block_index < block_count; block_index++) {
*output++ = *input;
input += input_delta;
}
}

// For constant padding, there is no input, just a size to write the constant to
template <typename T>
static void PadAxisConstant(T* output, T constant, size_t size) {
if (size == 1) {
*output = constant;
} else if (size == 2) {
*output = constant;
*(output + 1) = constant;
} else {
// This would be faster with SSE instructions.
// That would mean to have an implementation for each type (uint8, uint32, uint64).
T* end = output + size;
for (; output != end;)
*output++ = constant;
}
}

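The inputDelta mechanics above can be seen on a single row: with input_delta = 0 the same border value is repeated (edge padding), and with input_delta = -1 the copy walks backwards through the input (reflection). The driver below is only a hypothetical usage of the helper as defined above; the real call sites are elsewhere in this file.

#include <cstddef>
#include <cstdio>

template <typename T>
static void PadInnermostAxis(T* output, T* input, ptrdiff_t input_delta, size_t block_count) {
  for (size_t block_index = 0; block_index < block_count; block_index++) {
    *output++ = *input;
    input += input_delta;
  }
}

int main() {
  int row[3] = {1, 2, 3};
  int out[7] = {};  // 2 left pads + the 3 values + 2 right pads

  // Edge padding on the left: repeat row[0] twice (input_delta = 0).
  PadInnermostAxis(out, &row[0], /*input_delta=*/0, /*block_count=*/2);
  // Copy the row itself.
  for (int i = 0; i < 3; ++i) out[2 + i] = row[i];
  // Reflection padding on the right: walk backwards from row[1] (input_delta = -1).
  PadInnermostAxis(out + 5, &row[1], /*input_delta=*/-1, /*block_count=*/2);

  for (int v : out) std::printf("%d ", v);  // prints: 1 1 1 2 3 2 1
  std::printf("\n");
}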
template <typename T>
static Status PadImpl(OpKernelContext* ctx,
const PadsVector& pads,
@@ -327,7 +381,7 @@

// Reshape input dims
TensorShapeVector reshaped_input_dims;
FlattenInnerShape(output_dims, pads, slices, reshaped_input_dims);
PadBase::FlattenInnerShape(output_dims, pads, slices, reshaped_input_dims);

// Reshape padding
size_t new_dims_count = reshaped_input_dims.size();
Expand All @@ -336,8 +390,8 @@ static Status PadImpl(OpKernelContext* ctx,
? reshaped_input_dims[inner_axis] / output_dims[inner_axis]
: 0);
PadsVector reshaped_pad(2 * new_dims_count), reshaped_slice(2 * new_dims_count);
ReshapePads(pads, data_rank, new_dims_count, inner_no_pad_size, reshaped_pad);
ReshapePads(slices, data_rank, new_dims_count, inner_no_pad_size, reshaped_slice);
PadBase::ReshapePads(pads, data_rank, new_dims_count, inner_no_pad_size, reshaped_pad);
PadBase::ReshapePads(slices, data_rank, new_dims_count, inner_no_pad_size, reshaped_slice);

TensorShapeVector reshaped_output_dims = reshaped_input_dims;
TensorShapeVector input_starts;
@@ -575,20 +629,6 @@ static PadValue PadValueFromFloat(float value, MLDataType data_type) {
return result;
}

template <class T>
void ComputePadWithAxes(
gsl::span<const int64_t> pads_tensor_raw_data,
gsl::span<const T> axes_tensor_raw_data,
size_t data_rank,
PadsVector& pads) {
size_t axes_size = axes_tensor_raw_data.size();
for (size_t i = 0; i < axes_size; ++i) {
int64_t axis = HandleNegativeAxis(onnxruntime::narrow<int64_t>(axes_tensor_raw_data[i]), data_rank);
pads[onnxruntime::narrow<size_t>(axis)] = pads_tensor_raw_data[i]; // xi_begin
pads[data_rank + onnxruntime::narrow<size_t>(axis)] = pads_tensor_raw_data[axes_size + i]; // xi_end
}
}

Status Pad::Compute(OpKernelContext* ctx) const {
const Tensor& input_tensor = *ctx->Input<Tensor>(0);
MLDataType data_type = input_tensor.DataType();
@@ -608,48 +648,14 @@ Status Pad::Compute(OpKernelContext* ctx) const {
ORT_ENFORCE(pads_tensor_dims.size() == 1 || (pads_tensor_dims.size() == 2 && pads_tensor_dims[0] == 1),
"Pads tensor should be a 1D tensor of shape [2 * num_axes] "
"or a 2D tensor of shape [1, 2 * num_axes]");
const int64_t* pads_tensor_raw_data = pads_tensor.Data<int64_t>();
size_t pads_size = static_cast<size_t>(pads_tensor.Shape().Size());
pads.reserve(2 * data_rank);

const Tensor* axes_tensor = ctx->Input<Tensor>(3);
if (axes_tensor) {
const auto& axes_tensor_dims = axes_tensor->Shape().GetDims();
ORT_ENFORCE(axes_tensor_dims.size() == 1, "Axes tensor should be a 1D tensor ");
int64_t axes_size = axes_tensor_dims[0];

pads.resize(2 * data_rank, 0);
if (axes_tensor->IsDataType<int32_t>()) {
const int32_t* axes_tensor_raw_data = axes_tensor->Data<int32_t>();
ComputePadWithAxes<int32_t>(
{pads_tensor_raw_data, onnxruntime::narrow<size_t>(2 * axes_size)},
{axes_tensor_raw_data, onnxruntime::narrow<size_t>(axes_size)},
data_rank,
pads);
} else if (axes_tensor->IsDataType<int64_t>()) {
const int64_t* axes_tensor_raw_data = axes_tensor->Data<int64_t>();
ComputePadWithAxes<int64_t>(
{pads_tensor_raw_data, onnxruntime::narrow<size_t>(2 * axes_size)},
{axes_tensor_raw_data, onnxruntime::narrow<size_t>(axes_size)},
data_rank,
pads);
}
} else {
ORT_ENFORCE(pads_size == 2 * data_rank,
"Pads tensor size should be equal to twice the input dimension count ");
for (size_t i = 0; i < pads_size; ++i) {
pads.push_back(pads_tensor_raw_data[i]);
}
}

const auto pads_data = pads_tensor.DataAsSpan<int64_t>();

// Compute Pads by applying axes if specified otherwise copy the supplied pads.
PadBase::ComputePads(ctx, data_rank, pads_data, pads);

// Separate out any negative pads into the slices array
slices.assign(pads.size(), 0);
for (size_t index = 0; index < pads.size(); index++) {
if (pads[index] < 0) {
slices[index] = pads[index];
pads[index] = 0;
}
}
PadBase::SeparateNegativeToSlices(pads, slices);

value.u64 = 0U;
const Tensor* value_tensor = ctx->Input<Tensor>(2);
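Taken together, Pad::Compute now delegates pad preparation to PadBase::ComputePads and PadBase::SeparateNegativeToSlices, which is what allows the CUDA Pad-18 kernel added by this PR to reuse the same preparation. SeparateNegativeToSlices is not defined in this hunk; judging from the loop it replaces above, it should behave roughly like the sketch below, moving negative pad amounts into the slices array and zeroing them in pads.

#include <cstddef>
#include <cstdint>
#include <vector>

using PadsVector = std::vector<int64_t>;  // stand-in for the ORT alias

// Sketch inferred from the removed loop above; not the actual ORT definition.
static void SeparateNegativeToSlicesSketch(PadsVector& pads, PadsVector& slices) {
  slices.assign(pads.size(), 0);
  for (size_t index = 0; index < pads.size(); index++) {
    if (pads[index] < 0) {
      slices[index] = pads[index];  // negative amount becomes a slice
      pads[index] = 0;              // and is no longer a pad
    }
  }
}

int main() {
  PadsVector pads = {2, -1, 0, 3};
  PadsVector slices;
  SeparateNegativeToSlicesSketch(pads, slices);
  // pads is now {2, 0, 0, 3}; slices is {0, -1, 0, 0}.
}

Afterwards pads holds only non-negative padding amounts, while slices carries the negative amounts that trim the input, exactly as the replaced loop did.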