Add float16 type support to SplitToSequence and make code type independent #18594

Merged
10 commits merged on Nov 29, 2023
121 changes: 65 additions & 56 deletions onnxruntime/core/providers/cpu/sequence/sequence_ops.cc
@@ -334,27 +334,14 @@

 // SplitToSequence

-namespace op_kernel_type_control {
-ORT_SPECIFY_OP_KERNEL_ARG_DEFAULT_TYPES_ALL_OPSETS(
-    kCpuExecutionProvider, kOnnxDomain, SplitToSequence, Input, 0,
-    float, double, int32_t, int64_t, std::string);
-}  // namespace op_kernel_type_control
-
-namespace {
-using EnabledSplitToSequenceDataTypes = ORT_OP_KERNEL_ARG_ENABLED_TYPE_LIST_ALL_OPSETS(
-    kCpuExecutionProvider, kOnnxDomain, SplitToSequence, Input, 0);
-}  // namespace
-
 ONNX_CPU_OPERATOR_KERNEL(
     SplitToSequence,
     11,
     KernelDefBuilder()
         .TypeConstraint("T",
-                        BuildKernelDefConstraintsFromTypeList<EnabledSplitToSequenceDataTypes>())
+                        BuildKernelDefConstraints<float, MLFloat16, double, int32_t, int64_t, std::string>())
         .TypeConstraint("S", DataTypeImpl::AllSequenceTensorTypes())
-        .TypeConstraint("I", std::vector<MLDataType>{
-                                 DataTypeImpl::GetTensorType<int32_t>(),
-                                 DataTypeImpl::GetTensorType<int64_t>()}),
+        .TypeConstraint("I", BuildKernelDefConstraints<int32_t, int64_t>()),
     SplitToSequence);

 SplitToSequence::SplitToSequence(const OpKernelInfo& info) : OpKernel(info) {
@@ -366,29 +353,14 @@
   const Tensor& input = *context->Input<Tensor>(0);
   const Tensor* p_split_input = context->Input<Tensor>(1);

-  Status status;
-
-  if (input.IsDataType<float>())
-    status = ComputeImpl<float>(*context, input, p_split_input);
-  else if (input.IsDataType<double>())
-    status = ComputeImpl<double>(*context, input, p_split_input);
-  else if (input.IsDataType<int32_t>())
-    status = ComputeImpl<int32_t>(*context, input, p_split_input);
-  else if (input.IsDataType<int64_t>())
-    status = ComputeImpl<int64_t>(*context, input, p_split_input);
-  else if (input.IsDataTypeString())
-    status = ComputeImpl<std::string>(*context, input, p_split_input);
-  else
-    status = ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "SplitToSequence operator does not support ", input.DataType(), " yet");
-
-  return status;
+  return ComputeImpl(*context, input, p_split_input);
 }

 Status SplitToSequence::PrepareForCompute(const TensorShape& input_shape, int64_t split_scalar, bool is_split_input_scalar,
                                           int64_t& num_outputs, int64_t& axis, int& before_dims,
                                           int& after_dims_including_split_axis, int& after_dims_excluding_split,
                                           bool& is_uneven_split, int& num_remaining_splits,
-                                          std::vector<int64_t>& split_sizes) const {
+                                          InlinedVector<int64_t>& split_sizes) const {
   auto input_dims = input_shape.GetDims();
   const auto num_dimensions = gsl::narrow_cast<int64_t>(input_shape.NumDimensions());
   axis = HandleNegativeAxis(axis_, num_dimensions);  // handle negative and enforce axis is valid
@@ -416,7 +388,7 @@
     // populate split_sizes with the same size for each output
     num_outputs = split_dim_size;
     // https://github.com/onnx/onnx/issues/2396
-    split_sizes = std::vector<int64_t>(static_cast<size_t>(num_outputs), DEFAULT_LENGTH_EACH_OUTPUT_);
+    split_sizes = InlinedVector<int64_t>(static_cast<size_t>(num_outputs), DEFAULT_LENGTH_EACH_OUTPUT_);
   } else {
     auto split_size_sum = std::accumulate(split_sizes.cbegin(), split_sizes.cend(), 0LL);
     if (split_size_sum != split_dim_size) {
@@ -453,7 +425,7 @@
   return retval;
 }

-static void GetSplitSizesInput(const Tensor& tensor, std::vector<int64_t>& split_sizes) {
+static void GetSplitSizesInput(const Tensor& tensor, InlinedVector<int64_t>& split_sizes) {
   auto num_elems = tensor.Shape().Size();
   split_sizes.reserve(onnxruntime::narrow<size_t>(num_elems));
   if (tensor.IsDataType<int32_t>()) {
@@ -467,13 +439,8 @@
   }
 }

-template <typename T>
 Status SplitToSequence::ComputeImpl(OpKernelContext& context, const Tensor& input,
                                     const Tensor* p_split_input) const {
-  if (!utils::HasType<EnabledSplitToSequenceDataTypes, T>()) {
-    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Data type is not supported in this build.");
-  }
-
   auto& input_shape = input.Shape();
   int64_t num_outputs = 0;
   int64_t axis = axis_;
@@ -484,7 +451,9 @@
   bool is_split_input_scalar = false;
   bool is_uneven_split = false;
   int num_remaining_splits = 0;
-  std::vector<int64_t> split_sizes;
+  InlinedVector<int64_t> split_sizes;
+  const bool is_string_type = input.IsDataTypeString();
+  const size_t element_size = (is_string_type) ? 0U : input.DataType()->Size();

   // figure out split_scalar or split_sizes
   if (p_split_input) {
@@ -520,8 +489,8 @@

   // copy dimensions so we can update the selected axis in place
   auto output_dimensions = input_shape.AsShapeVector();
-  int64_t input_offset = 0;
-  const T* input_data = input.Data<T>();
+  SafeInt<size_t> input_offset = 0;
+  const void* input_data = input.DataRaw();
   for (int i = 0; i < num_outputs; ++i) {
     // update size of dimension for axis we're splitting on while considering uneven split
     int split_size;
@@ -535,20 +504,60 @@
     AllocatorPtr alloc;
     ORT_RETURN_IF_ERROR(context.GetTempSpaceAllocator(&alloc));
     Tensor output_tensor(input.DataType(), onnxruntime::TensorShape(output_dimensions), alloc);
-    T* output_data = output_tensor.MutableData<T>();
-
-    ::onnxruntime::math::CopyMatrix<T>(
-        before_dims,                                       // M
-        split_size * after_dims_excluding_split,           // N
-        static_cast<const T*>(input_data + input_offset),  // A
-        after_dims_including_split_axis,                   // lda
-        static_cast<T*>(output_data),                      // B
-        split_size * after_dims_excluding_split,           // ldb
-        [](const T* src, T* dst, size_t count) {
-          copy_data<T>(src, dst, count);
-        });
-
-    input_offset += static_cast<int64_t>(split_size) * after_dims_excluding_split;  // offset by the N data we used in this iteration
+    void* output_data = output_tensor.MutableDataRaw();
+
+    const auto M = before_dims;
+    const auto N = split_size * after_dims_excluding_split;
+    const auto* A = static_cast<const char*>(input_data) + static_cast<size_t>(input_offset * element_size);
+    const auto lda = after_dims_including_split_axis;
+    auto* B = output_data;
+    const auto ldb = split_size * after_dims_excluding_split;
+
+    //::onnxruntime::math::CopyMatrix<T>(
+    //    before_dims,                                       // M
+    //    split_size * after_dims_excluding_split,           // N
+    //    static_cast<const T*>(input_data + input_offset),  // A
+    //    after_dims_including_split_axis,                   // lda
+    //    static_cast<T*>(output_data),                      // B
+    //    split_size * after_dims_excluding_split,           // ldb
+    //    [](const T* src, T* dst, size_t count) {
+    //      copy_data<T>(src, dst, count);
+    //    });
+
+    if (is_string_type) {
+      const auto* src = reinterpret_cast<const std::string*>(A);
+      auto* dst = reinterpret_cast<std::string*>(B);
+      if (lda == N && ldb == N) {
+        copy_data<std::string>(src, dst, static_cast<size_t>(M * N));
+      } else {
+        size_t lda_offset = 0;
+        size_t ldb_offset = 0;
+        for (size_t idx = 0; idx < static_cast<size_t>(M); ++idx,
+             lda_offset += lda, ldb_offset += ldb) {
+          copy_data<std::string>(src + lda_offset, dst + ldb_offset, static_cast<size_t>(N));
+        }
+      }
+    } else {
+      if (lda == N && ldb == N) {
+        // if the data is contiguous, we can just copy the data
+        const size_t bytes_to_copy = static_cast<size_t>(N) * static_cast<size_t>(M) * element_size;
+        memcpy(B, A, bytes_to_copy);
+      } else {
+        // otherwise we need to copy each row
+        const size_t row_bytes = static_cast<size_t>(N) * element_size;
+        const auto lda_bytes_inc = narrow<size_t>(lda) * element_size;
+        const auto ldb_bytes_inc = narrow<size_t>(ldb) * element_size;
+        size_t lda_bytes_offset = 0;
+        size_t ldb_bytes_offset = 0;
+        for (size_t idx = 0; idx < static_cast<size_t>(M); ++idx,
+             lda_bytes_offset += lda_bytes_inc, ldb_bytes_offset += ldb_bytes_inc) {
+          memcpy(reinterpret_cast<char*>(B) + ldb_bytes_offset, reinterpret_cast<const char*>(A) + lda_bytes_offset,
+                 row_bytes);
+        }
+      }
+    }
+
+    input_offset += SafeInt<size_t>(split_size) * after_dims_excluding_split;  // offset by the N data we used in this iteration

     // if keep_dims = 0, reshape the tensor by dropping the dimension corresponding to 'axis'
     if (use_keep_dims && keepdims_ == 0) {
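The core of this change is easiest to see in isolation: once the element size is known at runtime, one untyped routine can split any trivially copyable tensor with memcpy, and only std::string elements need per-element copies so their copy constructors run. Below is a minimal, self-contained sketch of that idea; `CopyBlock` and its signature are illustrative stand-ins, not actual onnxruntime helpers.

```cpp
#include <cstddef>
#include <cstring>
#include <string>

// Copy an M x N block out of a row-major source with leading dimension lda
// (in elements) into a destination with leading dimension ldb (in elements).
// Trivially copyable elements move as raw bytes; std::string elements are
// assigned one by one so their copy constructors run.
void CopyBlock(const void* src, void* dst, size_t M, size_t N,
               size_t lda, size_t ldb, size_t element_size, bool is_string) {
  if (is_string) {
    const auto* s = static_cast<const std::string*>(src);
    auto* d = static_cast<std::string*>(dst);
    for (size_t row = 0; row < M; ++row)
      for (size_t col = 0; col < N; ++col)
        d[row * ldb + col] = s[row * lda + col];
  } else if (lda == N && ldb == N) {
    // Contiguous on both sides: one memcpy covers the whole block.
    std::memcpy(dst, src, M * N * element_size);
  } else {
    // Strided: copy one row of N elements at a time.
    const auto* s = static_cast<const char*>(src);
    auto* d = static_cast<char*>(dst);
    for (size_t row = 0; row < M; ++row)
      std::memcpy(d + row * ldb * element_size,
                  s + row * lda * element_size,
                  N * element_size);
  }
}
```

This is why ComputeImpl can stop being a template: the only type-dependent facts the kernel still needs are the element size and whether the element type is std::string, and both are available from the tensor at runtime.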
3 changes: 1 addition & 2 deletions onnxruntime/core/providers/cpu/sequence/sequence_ops.h
@@ -60,13 +60,12 @@ class SplitToSequence final : public OpKernel {
   Status Compute(OpKernelContext* context) const override;

  private:
-  template <typename T>
   Status ComputeImpl(OpKernelContext& context, const Tensor& input, const Tensor* p_split_input) const;
   Status PrepareForCompute(const TensorShape& input_shape, int64_t split_scalar, bool is_split_input_scalar,
                            int64_t& num_outputs, int64_t& axis, int& before_dims,
                            int& after_dims_including_split_axis, int& after_dims_excluding_split,
                            bool& is_uneven_split, int& num_remaining_splits,
-                           std::vector<int64_t>& split_sizes) const;
+                           InlinedVector<int64_t>& split_sizes) const;
   int64_t axis_{};
   int64_t keepdims_{1};
   const int64_t DEFAULT_LENGTH_EACH_OUTPUT_ = 1;
37 changes: 36 additions & 1 deletion onnxruntime/test/providers/cpu/sequence/sequence_ops_test.cc
@@ -330,12 +330,24 @@ TEST(SequenceOpsTest, SequenceConstructPositive) {

 // SplitToSequence
 template <typename T>
-static std::vector<T> GetConsequtiveVector(T start, int num) {
+static std::vector<T> GetConsequtiveVector(T start, size_t num) {
   std::vector<T> inputv(num);
   std::iota(inputv.begin(), inputv.end(), start);
   return inputv;
 }

+template<>
+std::vector<MLFloat16> GetConsequtiveVector<MLFloat16>(MLFloat16 start, size_t num) {
+  std::vector<MLFloat16> inputv;
+  inputv.reserve(num);
+  float start_f = start.ToFloat();
+  for (size_t i = 0; i < num; ++i) {
+    inputv.push_back(MLFloat16{start_f + static_cast<float>(i)});
+  }
+  return inputv;
+}
+
+
 TEST(SequenceOpsTest, SplitToSequence_DefaultAxis0EqualSplitFloat) {
   OpTester test("SplitToSequence", 11);
   test.AddInput<float>("input", {4, 2}, GetConsequtiveVector<float>(1.f, 8));
@@ -347,6 +359,29 @@ TEST(SequenceOpsTest, SplitToSequence_DefaultAxis0EqualSplitFloat) {
   test.Run();
 }

+TEST(SequenceOpsTest, SplitToSequence_DefaultAxis0EqualSplitMLFloat16) {
+  OpTester test("SplitToSequence", 11);
+  test.AddInput<MLFloat16>("input", {4, 2}, GetConsequtiveVector<MLFloat16>(MLFloat16::One, 8));
+  test.AddInput<int64_t>("split", {1, 2}, {2, 2});
+  SeqTensors<MLFloat16> output;
+
+  std::vector<MLFloat16> tensor_1;
+  const auto data_1 = {1.f, 2.f, 3.f, 4.f};
+  for (auto f : data_1)
+    tensor_1.push_back(MLFloat16{f});
+
+  std::vector<MLFloat16> tensor_2;
+  const auto data_2 = {5.f, 6.f, 7.f, 8.f};
+  for (auto f : data_2)
+    tensor_2.push_back(MLFloat16{f});
+
+  output.AddTensor({2, 2}, tensor_1);
+  output.AddTensor({2, 2}, tensor_2);
+  test.AddSeqOutput("S2", output);
+  test.Run();
+}
+
+
 TEST(SequenceOpsTest, SplitToSequence_DefaultAxis0EqualSplitLong) {
   OpTester test("SplitToSequence", 11);
   test.AddInput<int64_t>("input", {4, 2}, GetConsequtiveVector<int64_t>(1, 8));
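A note on the test helper above: the MLFloat16 specialization is presumably needed because std::iota requires operator++ on the element type, which the MLFloat16 storage type does not provide; accumulating in float and converting each value also keeps the sequence exact for small integers. A usage sketch mirroring the new test:

```cpp
// Produces 1.0, 2.0, ..., 8.0 as float16 values; exact because small
// integers are representable in half precision. Relies on the
// GetConsequtiveVector<MLFloat16> specialization from the test file above.
std::vector<MLFloat16> v = GetConsequtiveVector<MLFloat16>(MLFloat16::One, 8);
```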