Re-work CPU code for sharing
yuslepukhin committed Jan 19, 2024
1 parent a3ecb63 commit edb568a
Showing 3 changed files with 157 additions and 134 deletions.
243 changes: 122 additions & 121 deletions onnxruntime/core/providers/cpu/tensor/pad.cc
@@ -167,46 +167,6 @@ ONNX_CPU_OPERATOR_KERNEL(

using PadsVector = PadBase::PadsVector;

// This is the general padding method for doing edge or reflection padding n-dimensionally (based on the input_delta values)
template <typename T>
static void PadAxis(T* output, T* input, ptrdiff_t input_delta, ptrdiff_t input_pitch,
size_t block_size, size_t block_count) {
for (size_t block_index = 0; block_index < block_count; block_index++) {
for (size_t i = 0; i < block_size; i++) {
*output++ = *input;
input += input_delta;
}
input += input_pitch;
}
}

// This is an optimization of PadAxis. The inner loop is removed since the innermost axis has a block_size of 1,
// and input_pitch and input_delta collapse to a single value added each iteration.
template <typename T>
static void PadInnermostAxis(T* output, T* input, ptrdiff_t input_delta, size_t block_count) {
for (size_t block_index = 0; block_index < block_count; block_index++) {
*output++ = *input;
input += input_delta;
}
}

// For constant padding, there is no input, just a size to write the constant to
template <typename T>
static void PadAxisConstant(T* output, T constant, size_t size) {
if (size == 1) {
*output = constant;
} else if (size == 2) {
*output = constant;
*(output + 1) = constant;
} else {
// This would be faster with SSE instructions.
// That would require an implementation for each type (uint8, uint32, uint64).
T* end = output + size;
for (; output != end;)
*output++ = constant;
}
}

Status PadBase::HandleDimValueZero(const Mode& mode, const TensorShape& input_shape, TensorShape& output_shape) {
switch (mode) {
case Mode::Constant: {
@@ -242,34 +202,62 @@ Status PadBase::HandleDimValueZero(const Mode& mode, const TensorShape& input_sh
return Status::OK();
}

// special handling for edge case where the input has one or more dims with value of 0
template <typename T>
static Status PadInputWithDimValueOfZero(OpKernelContext* ctx,
const Mode& mode,
const TensorShape& input_shape,
TensorShapeVector& output_dims,
T value) {
TensorShape output_shape(output_dims);
ORT_RETURN_IF_ERROR(PadBase::HandleDimValueZero(mode, input_shape, output_shape));

auto& output_tensor = *ctx->Output(0, output_shape);

// we need to add pads if mode is constant, otherwise the output has one or more dim values of 0 so is empty
if (mode == Mode::Constant) {
// we add pads with the default value to all dims including those with a value of 0
auto* output = reinterpret_cast<T*>(output_tensor.MutableDataRaw());
std::fill_n(output, output_shape.Size(), value);
void PadBase::ComputePadWithAxes(
gsl::span<const int64_t> pads_tensor_raw_data,
std::function<int64_t(size_t)> get_axis,
size_t axes_size,
size_t data_rank,
PadsVector& pads) {
for (size_t i = 0; i < axes_size; ++i) {
const size_t axis = onnxruntime::narrow<size_t>(HandleNegativeAxis(get_axis(i), data_rank));
pads[axis] = pads_tensor_raw_data[i]; // xi_begin
pads[data_rank + axis] = pads_tensor_raw_data[axes_size + i]; // xi_end
}
}

return Status::OK();
void PadBase::ComputePads(OpKernelContext* ctx, size_t data_rank, gsl::span<const int64_t> pads_data,
PadsVector& pads) {
pads.reserve(2 * data_rank);
const Tensor* axes_tensor = ctx->Input<Tensor>(3);
if (axes_tensor) {
const size_t num_axes_dims = axes_tensor->Shape().NumDimensions();
ORT_ENFORCE(num_axes_dims == 1, "Axes tensor should be a 1D tensor ");

pads.resize(2 * data_rank, 0);
if (axes_tensor->IsDataType<int32_t>()) {
auto axes_data = axes_tensor->DataAsSpan<int32_t>();
ComputePadWithAxes(
pads_data,
[axes_data](size_t idx) -> int64_t {
return axes_data[idx];
},
axes_data.size(),
data_rank,
pads);
} else if (axes_tensor->IsDataType<int64_t>()) {
auto axes_data = axes_tensor->DataAsSpan<int64_t>();
ComputePadWithAxes(
pads_data,
[axes_data](size_t idx) {
return axes_data[idx];
},
axes_data.size(),
data_rank,
pads);
}
} else {
ORT_ENFORCE(pads_data.size() == 2 * data_rank,
"Pads tensor size should be equal to twice the input dimension count ");
pads.assign(pads_data.begin(), pads_data.end());
}
}
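// A hypothetical usage sketch of ComputePadWithAxes (illustrative values, not part of this change):
// with data_rank == 3, axes == {1, -1} and raw pads data {1, 2, 3, 4} (all begins first, then all
// ends, per selected axis), only the selected axes are filled in:
//
//   PadsVector pads(2 * 3, 0);
//   const int64_t axes[] = {1, -1};
//   const int64_t pads_raw[] = {1, 2, 3, 4};
//   PadBase::ComputePadWithAxes(
//       pads_raw, [&axes](size_t i) -> int64_t { return axes[i]; },
//       /*axes_size=*/2, /*data_rank=*/3, pads);
//   // pads -> {0, 1, 2, 0, 3, 4}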

// Flatten the innermost axes that have no padding, so one memcpy covers multiple axes.
// For example, a shape of [1,224,224,3] with padding [0,3,3,0,0,3,3,0] can be flattened to
// [1,224,224*3] with padding [0,3,3*3,0,3,3*3].
static void FlattenInnerShape(const TensorShapeVector& input_dims, const PadsVector& pads,
const PadsVector& slices, TensorShapeVector& reshaped_dims) {
size_t dims_count = input_dims.size();
void PadBase::FlattenInnerShape(gsl::span<const int64_t> input_dims, gsl::span<const int64_t> pads,
gsl::span<const int64_t> slices, TensorShapeVector& reshaped_dims) {
const size_t dims_count = input_dims.size();
size_t inner_axis = dims_count - 1;
size_t inner_size = 1;

@@ -288,14 +276,14 @@ static void FlattenInnerShape(const TensorShapeVector& input_dims, const PadsVec
} while (inner_axis-- > 0);

reshaped_dims.reserve(inner_axis + 1);
std::copy(input_dims.cbegin(), input_dims.cbegin() + inner_axis + 1, std::back_inserter(reshaped_dims));
std::copy(input_dims.begin(), input_dims.begin() + inner_axis + 1, std::back_inserter(reshaped_dims));

// Flatten inner axis.
reshaped_dims[inner_axis] = inner_size;
}

static void ReshapePads(const PadsVector& src_pad, size_t src_dim_count, size_t new_dim_count,
size_t inner_no_pad_size, PadsVector& reshaped_pad) {
void PadBase::ReshapePads(gsl::span<const int64_t> src_pad, size_t src_dim_count, size_t new_dim_count,
size_t inner_no_pad_size, PadsVector& reshaped_pad) {
size_t inner_axis = new_dim_count - 1;
std::copy(src_pad.begin(), src_pad.begin() + inner_axis, reshaped_pad.begin());
std::copy(src_pad.begin() + src_dim_count, src_pad.begin() + src_dim_count + inner_axis,
@@ -306,6 +294,68 @@ static void ReshapePads(const PadsVector& src_pad, size_t src_dim_count, size_t
reshaped_pad[inner_axis + new_dim_count] = src_pad[inner_axis + src_dim_count] * inner_no_pad_size;
}
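// A hypothetical usage sketch (illustrative values, not part of this change), following the
// [1,224,224,3] example above:
//
//   TensorShapeVector dims{1, 224, 224, 3};
//   PadsVector pads{0, 3, 3, 0, 0, 3, 3, 0};
//   PadsVector slices(8, 0);
//   TensorShapeVector reshaped_dims;
//   PadBase::FlattenInnerShape(dims, pads, slices, reshaped_dims);  // reshaped_dims -> {1, 224, 672}
//
//   PadsVector reshaped_pad(2 * reshaped_dims.size(), 0);
//   PadBase::ReshapePads(pads, dims.size(), reshaped_dims.size(),
//                        /*inner_no_pad_size=*/3, reshaped_pad);    // reshaped_pad -> {0, 3, 9, 0, 3, 9}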

// special handling for edge case where the input has one or more dims with value of 0
template <typename T>
static Status PadInputWithDimValueOfZero(OpKernelContext* ctx,
const Mode& mode,
const TensorShape& input_shape,
TensorShapeVector& output_dims,
T value) {
TensorShape output_shape(output_dims);
ORT_RETURN_IF_ERROR(PadBase::HandleDimValueZero(mode, input_shape, output_shape));

auto& output_tensor = *ctx->Output(0, output_shape);

// we need to add pads if mode is constant, otherwise the output has one or more dim values of 0 so is empty
if (mode == Mode::Constant) {
// we add pads with the default value to all dims including those with a value of 0
auto* output = reinterpret_cast<T*>(output_tensor.MutableDataRaw());
std::fill_n(output, output_shape.Size(), value);
}

return Status::OK();
}

// This is the general padding method for doing edge or reflection padding n-dimensionally (based on the input_delta values)
template <typename T>
static void PadAxis(T* output, T* input, ptrdiff_t input_delta, ptrdiff_t input_pitch,
size_t block_size, size_t block_count) {
for (size_t block_index = 0; block_index < block_count; block_index++) {
for (size_t i = 0; i < block_size; i++) {
*output++ = *input;
input += input_delta;
}
input += input_pitch;
}
}

// This is an optimization of PadAxis. The inner loop is removed since the innermost axis has a block_size of 1,
// and input_pitch and input_delta collapse to a single value added each iteration.
template <typename T>
static void PadInnermostAxis(T* output, T* input, ptrdiff_t input_delta, size_t block_count) {
for (size_t block_index = 0; block_index < block_count; block_index++) {
*output++ = *input;
input += input_delta;
}
}

// For constant padding, there is no input, just a size to write the constant to
template <typename T>
static void PadAxisConstant(T* output, T constant, size_t size) {
if (size == 1) {
*output = constant;
} else if (size == 2) {
*output = constant;
*(output + 1) = constant;
} else {
// This would be faster with SSE instructions.
// That would require an implementation for each type (uint8, uint32, uint64).
T* end = output + size;
for (; output != end;)
*output++ = constant;
}
}
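// A hypothetical illustration of the helpers above on a single row (illustrative values,
// not part of this change): two pad elements to the left of the data {1, 2, 3}.
//
//   float row[5] = {0.f, 0.f, 1.f, 2.f, 3.f};
//   PadAxisConstant(row, 9.f, 2);            // constant mode: row -> {9, 9, 1, 2, 3}
//   PadInnermostAxis(row, row + 2, 0, 2);    // edge mode:     row -> {1, 1, 1, 2, 3}
//   PadInnermostAxis(row, row + 4, -1, 2);   // reflect mode:  row -> {3, 2, 1, 2, 3}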

template <typename T>
static Status PadImpl(OpKernelContext* ctx,
const PadsVector& pads,
@@ -327,7 +377,7 @@ static Status PadImpl(OpKernelContext* ctx,

// Reshape input dims
TensorShapeVector reshaped_input_dims;
FlattenInnerShape(output_dims, pads, slices, reshaped_input_dims);
PadBase::FlattenInnerShape(output_dims, pads, slices, reshaped_input_dims);

// Reshape padding
size_t new_dims_count = reshaped_input_dims.size();
@@ -336,8 +386,8 @@ static Status PadImpl(OpKernelContext* ctx,
? reshaped_input_dims[inner_axis] / output_dims[inner_axis]
: 0);
PadsVector reshaped_pad(2 * new_dims_count), reshaped_slice(2 * new_dims_count);
ReshapePads(pads, data_rank, new_dims_count, inner_no_pad_size, reshaped_pad);
ReshapePads(slices, data_rank, new_dims_count, inner_no_pad_size, reshaped_slice);
PadBase::ReshapePads(pads, data_rank, new_dims_count, inner_no_pad_size, reshaped_pad);
PadBase::ReshapePads(slices, data_rank, new_dims_count, inner_no_pad_size, reshaped_slice);

TensorShapeVector reshaped_output_dims = reshaped_input_dims;
TensorShapeVector input_starts;
@@ -575,20 +625,6 @@ static PadValue PadValueFromFloat(float value, MLDataType data_type) {
return result;
}

template <class T>
void ComputePadWithAxes(
gsl::span<const int64_t> pads_tensor_raw_data,
gsl::span<const T> axes_tensor_raw_data,
size_t data_rank,
PadsVector& pads) {
size_t axes_size = axes_tensor_raw_data.size();
for (size_t i = 0; i < axes_size; ++i) {
int64_t axis = HandleNegativeAxis(onnxruntime::narrow<int64_t>(axes_tensor_raw_data[i]), data_rank);
pads[onnxruntime::narrow<size_t>(axis)] = pads_tensor_raw_data[i]; // xi_begin
pads[data_rank + onnxruntime::narrow<size_t>(axis)] = pads_tensor_raw_data[axes_size + i]; // xi_end
}
}

Status Pad::Compute(OpKernelContext* ctx) const {
const Tensor& input_tensor = *ctx->Input<Tensor>(0);
MLDataType data_type = input_tensor.DataType();
@@ -608,48 +644,13 @@ Status Pad::Compute(OpKernelContext* ctx) const {
ORT_ENFORCE(pads_tensor_dims.size() == 1 || (pads_tensor_dims.size() == 2 && pads_tensor_dims[0] == 1),
"Pads tensor should be a 1D tensor of shape [2 * num_axes] "
"or a 2D tensor of shape [1, 2 * num_axes]");
const int64_t* pads_tensor_raw_data = pads_tensor.Data<int64_t>();
size_t pads_size = static_cast<size_t>(pads_tensor.Shape().Size());
pads.reserve(2 * data_rank);

const Tensor* axes_tensor = ctx->Input<Tensor>(3);
if (axes_tensor) {
const auto& axes_tensor_dims = axes_tensor->Shape().GetDims();
ORT_ENFORCE(axes_tensor_dims.size() == 1, "Axes tensor should be a 1D tensor ");
int64_t axes_size = axes_tensor_dims[0];

pads.resize(2 * data_rank, 0);
if (axes_tensor->IsDataType<int32_t>()) {
const int32_t* axes_tensor_raw_data = axes_tensor->Data<int32_t>();
ComputePadWithAxes<int32_t>(
{pads_tensor_raw_data, onnxruntime::narrow<size_t>(2 * axes_size)},
{axes_tensor_raw_data, onnxruntime::narrow<size_t>(axes_size)},
data_rank,
pads);
} else if (axes_tensor->IsDataType<int64_t>()) {
const int64_t* axes_tensor_raw_data = axes_tensor->Data<int64_t>();
ComputePadWithAxes<int64_t>(
{pads_tensor_raw_data, onnxruntime::narrow<size_t>(2 * axes_size)},
{axes_tensor_raw_data, onnxruntime::narrow<size_t>(axes_size)},
data_rank,
pads);
}
} else {
ORT_ENFORCE(pads_size == 2 * data_rank,
"Pads tensor size should be equal to twice the input dimension count ");
for (size_t i = 0; i < pads_size; ++i) {
pads.push_back(pads_tensor_raw_data[i]);
}
}
const auto pads_data = pads_tensor.DataAsSpan<int64_t>();

// Compute the pads by applying axes if specified; otherwise copy the supplied pads.
PadBase::ComputePads(ctx, data_rank, pads_data, pads);

// Separate out any negative pads into the slices array
slices.assign(pads.size(), 0);
for (size_t index = 0; index < pads.size(); index++) {
if (pads[index] < 0) {
slices[index] = pads[index];
pads[index] = 0;
}
}
PadBase::SeparateNegativeToSlices(pads, slices);

value.u64 = 0U;
const Tensor* value_tensor = ctx->Input<Tensor>(2);
29 changes: 29 additions & 0 deletions onnxruntime/core/providers/cpu/tensor/padbase.h
@@ -4,6 +4,7 @@
#pragma once

#include "core/common/inlined_containers.h"
#include <functional>

namespace onnxruntime {

@@ -23,6 +24,34 @@ class PadBase {
// in the input_shape with a value of zero.
static Status HandleDimValueZero(const Mode& mode, const TensorShape& input_shape, TensorShape& output_shape);

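// Flattens the innermost axes that have no padding or slicing so one contiguous copy can cover
// multiple axes. The flattened dims are written to reshaped_dims.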
static void FlattenInnerShape(gsl::span<const int64_t> input_dims, gsl::span<const int64_t> pads,
gsl::span<const int64_t> slices, TensorShapeVector& reshaped_dims);

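// Re-expresses a pads (or slices) vector for the flattened shape produced by FlattenInnerShape;
// the entries for the flattened inner axis are scaled by inner_no_pad_size.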
static void ReshapePads(gsl::span<const int64_t> src_pad, size_t src_dim_count, size_t new_dim_count,
size_t inner_no_pad_size, PadsVector& reshaped_pad);

// Compute the pads by applying axes if specified; otherwise copy the supplied pads.
static void ComputePads(OpKernelContext* ctx, size_t data_rank, gsl::span<const int64_t> pads_data,
PadsVector& pads);

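// Fills pads for the axes returned by get_axis(i): pads[axis] gets pads_tensor_raw_data[i]
// (xi_begin) and pads[data_rank + axis] gets pads_tensor_raw_data[axes_size + i] (xi_end).
// Negative axis values are handled; pads must already be sized to 2 * data_rank.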
static void ComputePadWithAxes(
gsl::span<const int64_t> pads_tensor_raw_data,
std::function<int64_t(size_t)> get_axis,
size_t axes_size,
size_t data_rank,
PadsVector& pads);

// Separate out any negative pads into the slices array
static void SeparateNegativeToSlices(gsl::span<int64_t> pads, PadsVector& slices) {
slices.assign(pads.size(), 0);
for (size_t index = 0, lim = pads.size(); index < lim; index++) {
if (pads[index] < 0) {
slices[index] = pads[index];
pads[index] = 0;
}
}
}
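// A hypothetical usage sketch (illustrative values): negative entries move from pads into slices.
//   PadsVector pads{2, -1, 0, 3};  // rank-2 input
//   PadsVector slices;
//   SeparateNegativeToSlices(pads, slices);
//   // pads -> {2, 0, 0, 3}, slices -> {0, -1, 0, 0}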

protected:
PadBase(const OpKernelInfo& info) : value_(info.GetAttrOrDefault("value", 0.f)) {
std::string mode;
19 changes: 6 additions & 13 deletions onnxruntime/core/providers/cuda/tensor/pad.cc
@@ -99,23 +99,16 @@ Status Pad<T>::ComputeInternal(OpKernelContext* ctx) const {
ORT_ENFORCE(pads_tensor_dims.size() == 1 || (pads_tensor_dims.size() == 2 && pads_tensor_dims[0] == 1),
"Pads tensor should be a 1D tensor of shape [2 * input_rank] or a 2D tensor of shape [1, 2 * input_rank]");

const int64_t* pads_tensor_raw_data = pads_tensor.Data<int64_t>();
size_t pads_size = static_cast<size_t>(pads_tensor.Shape().Size());
const auto pads_data = pads_tensor.DataAsSpan<int64_t>();
const size_t pads_size = static_cast<size_t>(pads_tensor.Shape().Size());
ORT_ENFORCE(pads_size == 2 * static_cast<size_t>(dimension_count),
"Pads tensor size should be equal to twice the input dimension count ");

pads.reserve(2LL * dimension_count);
for (size_t i = 0; i < pads_size; ++i) {
pads.push_back(pads_tensor_raw_data[i]);
}
pads.reserve(pads_size);
pads.assign(pads_data.begin(), pads_data.end());

// Separate out any negative pads into the slices array
slices.resize(pads.size(), 0);
for (size_t index = 0; index < pads.size(); index++) {
if (pads[index] < 0) {
slices[index] = pads[index];
pads[index] = 0;
}
}
PadBase::SeparateNegativeToSlices(pads, slices);

T raw_value{};
const Tensor* value_tensor = ctx->Input<Tensor>(2);
