microsoft · yuslepukhin · Jan 12, 2024 · Oct 18, 2023 · Oct 18, 2023 · Oct 18, 2023
diff --git a/onnxruntime/contrib_ops/cpu/quantization/qlinear_concat.cc b/onnxruntime/contrib_ops/cpu/quantization/qlinear_concat.cc
@@ -1,4 +1,4 @@
-// Copyright (c Microsoft Corporation. All rights reserved.
+// Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
 #include "qlinear_util.h"

diff --git a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc
@@ -989,6 +989,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain,
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, Float8E5M2FNUZ, IsNaN);
 #endif
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, IsInf);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, StringSplit);
 
 // !!PLEASE READ BELOW!! Following that, add new entries above this comment
 
@@ -2447,6 +2448,7 @@ Status RegisterOnnxOperatorKernels(KernelRegistry& kernel_registry) {
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, Float8E5M2FNUZ, IsNaN)>,
 #endif
     BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, IsInf)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, StringSplit)>,
   };
 
   for (auto& function_table_entry : function_table) {

diff --git a/onnxruntime/core/providers/cpu/nn/string_split.cc b/onnxruntime/core/providers/cpu/nn/string_split.cc
@@ -0,0 +1,102 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/providers/cpu/nn/string_split.h"
+#include <algorithm>
+#include <limits>
+#include <string>
+#include "core/common/common.h"
+namespace onnxruntime {
+
+// Registers the StringSplit class as the CPU kernel for version 20 of the StringSplit operator.
+// The type constraints restrict T1 and T2 to string tensors and T3 to int64 tensors.
+// NOTE(review): presumably T1 is the input, T2 the substrings output and T3 the per-element substring count, as in
+// the ONNX operator schema - confirm against the schema.
+ONNX_CPU_OPERATOR_KERNEL(StringSplit, 20,
+                         KernelDefBuilder()
+                             .TypeConstraint("T1", DataTypeImpl::GetTensorType<std::string>())
+                             .TypeConstraint("T2", DataTypeImpl::GetTensorType<std::string>())
+                             .TypeConstraint("T3", DataTypeImpl::GetTensorType<int64_t>()),
+                         StringSplit);
+
+/// Splits ``str`` into substrings separated by ``delimiter``, performing at most ``max_splits`` splits.
+///
+/// An empty ``delimiter`` selects whitespace mode: a run of space characters (' ') acts as a single separator and
+/// leading and trailing spaces are dropped. A non-empty ``delimiter`` is matched literally, so adjacent delimiters
+/// produce empty substrings. An empty ``str`` produces no substrings in either mode.
+///
+/// The returned views alias ``str``; the caller must keep the underlying characters alive while the views are used.
+InlinedVector<std::string_view> ComputeSubstrings(std::string_view str, std::string_view delimiter, int64_t max_splits) {
+  constexpr char kSpace = ' ';
+  InlinedVector<std::string_view> tokens;
+  if (str.empty()) {
+    return tokens;
+  }
+
+  int64_t splits_done = 0;
+
+  if (!delimiter.empty()) {
+    // Literal delimiter: cut at every occurrence until the split budget is spent or no occurrence is left.
+    size_t start = 0;
+    for (;;) {
+      const size_t hit = str.find(delimiter, start);
+      const bool budget_spent = (splits_done++ == max_splits);
+      if (budget_spent || hit == std::string_view::npos) {
+        // Whatever remains (possibly nothing) becomes the final substring.
+        tokens.push_back(str.substr(start));
+        return tokens;
+      }
+      tokens.push_back(str.substr(start, hit - start));
+      start = hit + delimiter.size();
+    }
+  }
+
+  // Whitespace mode: skip the leading spaces, then peel off one space-delimited token per iteration.
+  size_t start = str.find_first_not_of(kSpace);
+  while (start != std::string_view::npos) {
+    if (splits_done++ == max_splits) {
+      // Split budget spent: the remainder is a single substring with its trailing spaces removed.
+      // ``start`` addresses a non-space character, so a last non-space character exists at or after it.
+      const size_t last = str.find_last_not_of(kSpace);
+      tokens.push_back(str.substr(start, last - start + 1));
+      break;
+    }
+    const size_t token_end = str.find_first_of(kSpace, start);
+    // If no further space exists, token_end is npos and substr() clamps the length to the end of ``str``.
+    tokens.push_back(str.substr(start, token_end - start));
+    start = str.find_first_not_of(kSpace, token_end);
+  }
+  return tokens;
+}
+
+/// Reads the operator attributes once, at kernel construction.
+/// ``delimiter`` defaults to the empty string. ``maxsplit`` defaults to the largest int64_t value minus one, so
+/// that ``maxsplit_ + 1`` is still representable.
+StringSplit::StringSplit(const OpKernelInfo& info) : OpKernel(info) {
+  constexpr int64_t kDefaultMaxSplit = std::numeric_limits<int64_t>::max() - 1;
+  const std::string default_delimiter;
+
+  info.GetAttrOrDefault("delimiter", &delimiter_, default_delimiter);
+  info.GetAttrOrDefault("maxsplit", &maxsplit_, kDefaultMaxSplit);
+}
+
+/// Splits every string of input 0 using the kernel's delimiter and maxsplit attributes.
+///
+/// Output 1 has the input's shape and holds the number of substrings produced for each element. Output 0 has the
+/// input's shape plus one trailing dimension whose extent is the largest substring count (at least 1); each row
+/// receives its element's substrings in order.
+/// NOTE(review): rows with fewer substrings leave their remaining entries unwritten - this presumably relies on a
+/// freshly allocated string tensor holding empty strings; confirm.
+///
+/// Returns Status::OK() on completion.
+Status StringSplit::Compute(OpKernelContext* context) const {
+  const Tensor* input = context->Input<Tensor>(0);
+  auto input_data = input->template DataAsSpan<std::string>();
+
+  // Output 1: per-element substring counts, same shape as the input.
+  auto num_tokens_data = context->Output(1, input->Shape())->template MutableDataAsSpan<int64_t>();
+  auto num_tokens_iter = num_tokens_data.begin();
+
+  // The views alias the strings stored in the input tensor, which stays alive for the duration of this call.
+  InlinedVector<InlinedVector<std::string_view>> input_slices;
+  input_slices.reserve(input_data.size());
+
+  // Extent of the trailing output dimension: the largest substring count seen, never less than 1.
+  int64_t last_dim = 1;
+
+  for (const std::string& element : input_data) {
+    auto substrs = ComputeSubstrings(element, delimiter_, maxsplit_);
+    const auto substr_count = static_cast<int64_t>(substrs.size());
+    input_slices.push_back(std::move(substrs));
+    last_dim = std::max(last_dim, substr_count);
+    *num_tokens_iter = substr_count;
+    ++num_tokens_iter;
+  }
+
+  // last_dim is deliberately NOT clamped to maxsplit_ + 1. ComputeSubstrings never yields more than
+  // maxsplit_ + 1 substrings for a non-negative maxsplit_, so such a clamp changes nothing for valid attributes.
+  // For a negative maxsplit_ (which ComputeSubstrings treats as "no limit") it would force the trailing dimension
+  // to zero or below while the reported counts stay positive, and for maxsplit_ == INT64_MAX the addition would
+  // overflow.
+
+  // Output 0: input shape with the substring dimension appended.
+  auto splits_shape = input->Shape().AsShapeVector();
+  splits_shape.push_back(last_dim);
+
+  auto splits_data = context->Output(0, splits_shape)->template MutableDataAsSpan<std::string>();
+  auto output_row = splits_data.begin();
+  for (const auto& slices : input_slices) {
+    // Each row is last_dim entries wide and every slice list holds at most last_dim views.
+    std::copy(slices.begin(), slices.end(), output_row);
+    output_row += last_dim;
+  }
+
+  return Status::OK();
+}
+
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cpu/nn/string_split.h b/onnxruntime/core/providers/cpu/nn/string_split.h
@@ -0,0 +1,20 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "core/framework/op_kernel.h"
+
+namespace onnxruntime {
+
+/// Operator kernel for StringSplit. Holds the two operator attributes; the constructor and Compute are defined in
+/// the accompanying source file.
+class StringSplit final : public OpKernel {
+ public:
+  /// Builds the kernel from the node information in ``info``.
+  explicit StringSplit(const OpKernelInfo& info);
+  /// Executes the operator on the tensors provided through ``context``.
+  Status Compute(OpKernelContext* context) const override;
+
+ private:
+  // Separator text taken from the "delimiter" attribute.
+  std::string delimiter_;
+  // Upper bound on the number of splits per string, taken from the "maxsplit" attribute.
+  int64_t maxsplit_;
+};
+
+}  // namespace onnxruntime
diff --git a/onnxruntime/test/providers/cpu/nn/string_split_test.cc b/onnxruntime/test/providers/cpu/nn/string_split_test.cc
@@ -0,0 +1,108 @@
+#include "gtest/gtest.h"
+#include "test/providers/provider_test_utils.h"
+
+namespace onnxruntime {
+namespace test {
+
+// A single-space delimiter splits "hello world" in two; one-token rows are padded with "" up to the widest row.
+TEST(StringSplit, BasicSplitTest) {
+  OpTester tester("StringSplit", 20);
+  tester.AddAttribute<std::string>("delimiter", " ");
+  tester.AddInput<std::string>("X", {3}, {"hello world", "hello", "world"});
+  tester.AddOutput<std::string>("Y", {3, 2}, {"hello", "world", "hello", "", "world", ""});
+  tester.AddOutput<int64_t>("Z", {3}, {2, 1, 1});
+  tester.Run();
+}
+
+// With maxsplit = 1 only the first ";" splits; any later delimiter stays inside the second substring.
+TEST(StringSplit, MaxSplitTest) {
+  OpTester tester("StringSplit", 20);
+  tester.AddAttribute<std::string>("delimiter", ";");
+  tester.AddAttribute<int64_t>("maxsplit", 1);
+  tester.AddInput<std::string>("X", {2, 2}, {"eggs;milk;chesse", "pepper;salt", "chicken;fish;pork", "spinach"});
+  tester.AddOutput<std::string>("Y", {2, 2, 2},
+                                {"eggs", "milk;chesse", "pepper", "salt", "chicken", "fish;pork", "spinach", ""});
+  tester.AddOutput<int64_t>("Z", {2, 2}, {2, 2, 2, 1});
+  tester.Run();
+}
+
+// An explicitly empty delimiter splits on runs of spaces and ignores leading and trailing spaces.
+TEST(StringSplit, EmptyStringDelimiterTest) {
+  OpTester tester("StringSplit", 20);
+  tester.AddAttribute<std::string>("delimiter", "");
+  tester.AddInput<std::string>("X", {1, 4}, {"hello world", "hello  world", " hello world", "hello world  "});
+  tester.AddOutput<std::string>("Y", {1, 4, 2}, {"hello", "world", "hello", "world", "hello", "world", "hello", "world"});
+  tester.AddOutput<int64_t>("Z", {1, 4}, {2, 2, 2, 2});
+  tester.Run();
+}
+
+// Without a delimiter attribute, runs of spaces act as one separator and edge spaces are ignored.
+TEST(StringSplit, SubsequentWhitespaceDefaultTest) {
+  OpTester tester("StringSplit", 20);
+  tester.AddInput<std::string>("X", {1, 4}, {"hello world", "hello  world", "   hello world", "hello world  "});
+  tester.AddOutput<std::string>("Y", {1, 4, 2}, {"hello", "world", "hello", "world", "hello", "world", "hello", "world"});
+  tester.AddOutput<int64_t>("Z", {1, 4}, {2, 2, 2, 2});
+  tester.Run();
+}
+
+// Whitespace splitting with maxsplit = 1: the remainder keeps its inner spaces but loses leading/trailing ones.
+TEST(StringSplit, SubsequentWhitespaceWithLimitTest) {
+  OpTester tester("StringSplit", 20);
+  tester.AddAttribute<int64_t>("maxsplit", 1);
+  tester.AddInput<std::string>("X", {1, 4},
+                               {"lorem  ipsum doler", " Open Neural Network Exchange (ONNX)", "onnx", "ONNX runtime "});
+  tester.AddOutput<std::string>(
+      "Y", {1, 4, 2},
+      {"lorem", "ipsum doler", "Open", "Neural Network Exchange (ONNX)", "onnx", "", "ONNX", "runtime"});
+  tester.AddOutput<int64_t>("Z", {1, 4}, {2, 2, 1, 2});
+  tester.Run();
+}
+
+// A delimiter that never occurs in the input leaves the string as a single substring.
+TEST(StringSplit, SingleTokenTest) {
+  OpTester tester("StringSplit", 20);
+  tester.AddInput<std::string>("X", {1, 1, 1}, {"lorem"});
+  tester.AddAttribute<std::string>("delimiter", "*");
+  tester.AddOutput<std::string>("Y", {1, 1, 1, 1}, {"lorem"});
+  tester.AddOutput<int64_t>("Z", {1, 1, 1}, {1});
+  tester.Run();
+}
+
+// Whitespace splitting of a string that contains no spaces yields that string as the only substring.
+TEST(StringSplit, SingleTokenWhitespaceTest) {
+  OpTester tester("StringSplit", 20);
+  tester.AddInput<std::string>("X", {1, 1, 1}, {"lorem"});
+  tester.AddOutput<std::string>("Y", {1, 1, 1, 1}, {"lorem"});
+  tester.AddOutput<int64_t>("Z", {1, 1, 1}, {1});
+  tester.Run();
+}
+
+// Leading and trailing spaces around a single word are dropped in whitespace mode.
+TEST(StringSplit, EdgeWhitespaceTest) {
+  OpTester tester("StringSplit", 20);
+  tester.AddInput<std::string>("X", {1, 1, 1}, {"         lorem "});
+  tester.AddOutput<std::string>("Y", {1, 1, 1, 1}, {"lorem"});
+  tester.AddOutput<int64_t>("Z", {1, 1, 1}, {1});
+  tester.Run();
+}
+
+// An empty string yields zero substrings, "+" yields one, and a lone delimiter yields two empty substrings.
+TEST(StringSplit, EmptyInputTest) {
+  OpTester tester("StringSplit", 20);
+  tester.AddInput<std::string>("X", {1, 3, 1}, {"", "+", "*"});
+  tester.AddAttribute<std::string>("delimiter", "*");
+  tester.AddOutput<std::string>("Y", {1, 3, 1, 2}, {"", "", "+", "", "", ""});
+  tester.AddOutput<int64_t>("Z", {1, 3, 1}, {0, 1, 2});
+  tester.Run();
+}
+
+// When every input is empty the counts are all zero, yet the trailing output dimension is still 1.
+TEST(StringSplit, OnlyEmptyInputTest) {
+  OpTester tester("StringSplit", 20);
+  tester.AddInput<std::string>("X", {1, 2, 1}, {"", ""});
+  tester.AddAttribute<std::string>("delimiter", "*");
+  tester.AddOutput<std::string>("Y", {1, 2, 1, 1}, {"", ""});
+  tester.AddOutput<int64_t>("Z", {1, 2, 1}, {0, 0});
+  tester.Run();
+}
+
+// Same all-empty input, but in whitespace mode (no delimiter attribute): zero counts, trailing dimension of 1.
+TEST(StringSplit, OnlyEmptyNoDelimiterInputTest) {
+  OpTester tester("StringSplit", 20);
+  tester.AddInput<std::string>("X", {1, 2, 1}, {"", ""});
+  tester.AddOutput<std::string>("Y", {1, 2, 1, 1}, {"", ""});
+  tester.AddOutput<int64_t>("Z", {1, 2, 1}, {0, 0});
+  tester.Run();
+}
+
+}  // namespace test
+}  // namespace onnxruntime
diff --git a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc
@@ -256,12 +256,6 @@
         "^test_string_concat_empty_string",
         "^test_string_concat_utf8",
         "^test_string_concat_zero_dimensional",
-        "^test_string_split_basic",
-        "^test_string_split_consecutive_delimiters",
-        "^test_string_split_empty_string_delimiter",
-        "^test_string_split_empty_tensor",
-        "^test_string_split_maxsplit",
-        "^test_string_split_no_delimiter",
         "^test_reduce_l1_empty_set_cuda",
         "^test_reduce_l1_empty_set_expanded_cuda",
         "^test_reduce_l2_empty_set_cuda",