Fix UTF-8 splitting for English

k2-fsa · Oct 25, 2023 · 26890c6 · 26890c6
1 parent 6e5efa4
commit 26890c6
Showing 1 changed file with 53 additions and 1 deletion.
diff --git a/sherpa-onnx/csrc/text-utils.cc b/sherpa-onnx/csrc/text-utils.cc
@@ -162,10 +162,62 @@ template bool SplitStringToFloats(const std::string &full, const char *delim,
                                   bool omit_empty_strings,
                                   std::vector<double> *out);
 
+// Merges runs of consecutive single-byte tokens back into words.
+//
+// @param words  Tokens to process, in order. Each token is classified by its
+//               byte length and, where relevant, by its first byte.
+// @return A new list built as follows:
+//   - consecutive one-byte tokens that are neither punctuation nor
+//     whitespace are concatenated into a single string;
+//   - a token longer than one byte, or a one-byte punctuation token, ends
+//     the current run and is emitted on its own;
+//   - a token whose first byte is whitespace ends the current run and is
+//     dropped;
+//   - an empty token is logged and skipped without ending the current run.
+static std::vector<std::string> MergeCharactersIntoWords(
+    const std::vector<std::string> &words) {
+  std::vector<std::string> ans;
+
+  // The <cctype> classifiers require an argument representable as
+  // unsigned char (or EOF). A plain char may be negative, e.g. the lead byte
+  // of a multi-byte token, so convert before classifying.
+  auto is_punct = [](char c) {
+    return std::ispunct(static_cast<unsigned char>(c)) != 0;
+  };
+  auto is_space = [](char c) {
+    return std::isspace(static_cast<unsigned char>(c)) != 0;
+  };
+
+  int32_t n = static_cast<int32_t>(words.size());
+
+  // Index of the first token of the pending run; -1 when no run is open.
+  int32_t prev = -1;
+
+  // Emits words[prev, end) as one concatenated string and closes the run.
+  auto flush = [&ans, &words, &prev](int32_t end) {
+    if (prev == -1) {
+      return;
+    }
+
+    std::string t;
+    for (; prev < end; ++prev) {
+      t.append(words[prev]);
+    }
+    prev = -1;
+    ans.push_back(std::move(t));
+  };
+
+  for (int32_t i = 0; i < n; ++i) {
+    const auto &w = words[i];
+
+    if (w.empty()) {
+      // Nothing to classify. Move on to the next token; otherwise the loop
+      // would keep revisiting this index forever.
+      SHERPA_ONNX_LOGE("Ignore %s", w.c_str());
+      continue;
+    }
+
+    if (w.size() > 1 || is_punct(w[0]) || is_space(w[0])) {
+      // A delimiter or a multi-byte token terminates the pending run.
+      flush(i);
+
+      if (!is_space(w[0])) {
+        ans.push_back(w);
+      }
+      continue;
+    }
+
+    // One-byte token that is neither punctuation nor whitespace: open a run
+    // if none is pending, otherwise it simply extends the current one.
+    if (prev == -1) {
+      prev = i;
+    }
+  }
+
+  // Emit whatever run is still open at the end of the input.
+  flush(n);
+
+  return ans;
+}
+
 std::vector<std::string> SplitUtf8(const std::string &text) {
   const uint8_t *begin = reinterpret_cast<const uint8_t *>(text.c_str());
   const uint8_t *end = begin + text.size();
 
+  // Note that English words are split into single characters.
+  // We need to invoke MergeCharactersIntoWords() to merge them
   std::vector<std::string> ans;
 
   auto start = begin;
@@ -195,7 +247,7 @@ std::vector<std::string> SplitUtf8(const std::string &text) {
     }
   }
 
-  return ans;
+  return MergeCharactersIntoWords(ans);
 }
 
 }  // namespace sherpa_onnx