Skip to content

Commit

Permalink
Fix Byte BPE string results for Python. (#512)
Browse files Browse the repository at this point in the history
The Python bindings now ignore invalid UTF-8 byte sequences when decoding the result text.
  • Loading branch information
csukuangfj authored Jan 3, 2024
1 parent d011421 commit e215d0c
Show file tree
Hide file tree
Showing 6 changed files with 54 additions and 3 deletions.
11 changes: 11 additions & 0 deletions sherpa-onnx/csrc/offline-recognizer-ctc-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
#ifndef SHERPA_ONNX_CSRC_OFFLINE_RECOGNIZER_CTC_IMPL_H_
#define SHERPA_ONNX_CSRC_OFFLINE_RECOGNIZER_CTC_IMPL_H_

#include <ios>
#include <memory>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
Expand Down Expand Up @@ -42,6 +44,15 @@ static OfflineRecognitionResult Convert(const OfflineCtcDecoderResult &src,
}
auto sym = sym_table[src.tokens[i]];
text.append(sym);

if (sym.size() == 1 && sym[0] != ' ') {
// for byte bpe models
std::ostringstream os;
os << "<0x" << std::hex << std::uppercase
<< (static_cast<int32_t>(sym[0]) & 0xff) << ">";
sym = os.str();
}

r.tokens.push_back(std::move(sym));
}
r.text = std::move(text);
Expand Down
10 changes: 10 additions & 0 deletions sherpa-onnx/csrc/offline-recognizer-transducer-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,10 @@
#define SHERPA_ONNX_CSRC_OFFLINE_RECOGNIZER_TRANSDUCER_IMPL_H_

#include <fstream>
#include <ios>
#include <memory>
#include <regex> // NOLINT
#include <sstream>
#include <string>
#include <utility>
#include <vector>
Expand Down Expand Up @@ -44,6 +46,14 @@ static OfflineRecognitionResult Convert(
auto sym = sym_table[i];
text.append(sym);

if (sym.size() == 1 && sym[0] != ' ') {
// for byte bpe models
std::ostringstream os;
os << "<0x" << std::hex << std::uppercase
<< (static_cast<int32_t>(sym[0]) & 0xff) << ">";
sym = os.str();
}

r.tokens.push_back(std::move(sym));
}
r.text = std::move(text);
Expand Down
11 changes: 11 additions & 0 deletions sherpa-onnx/csrc/online-recognizer-ctc-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@
#define SHERPA_ONNX_CSRC_ONLINE_RECOGNIZER_CTC_IMPL_H_

#include <algorithm>
#include <ios>
#include <memory>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
Expand Down Expand Up @@ -35,6 +37,15 @@ static OnlineRecognizerResult Convert(const OnlineCtcDecoderResult &src,
auto sym = sym_table[i];

r.text.append(sym);

if (sym.size() == 1 && sym[0] != ' ') {
// for byte bpe models
std::ostringstream os;
os << "<0x" << std::hex << std::uppercase
<< (static_cast<int32_t>(sym[0]) & 0xff) << ">";
sym = os.str();
}

r.tokens.push_back(std::move(sym));
}

Expand Down
11 changes: 11 additions & 0 deletions sherpa-onnx/csrc/online-recognizer-transducer-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,10 @@
#define SHERPA_ONNX_CSRC_ONLINE_RECOGNIZER_TRANSDUCER_IMPL_H_

#include <algorithm>
#include <ios>
#include <memory>
#include <regex> // NOLINT
#include <sstream>
#include <string>
#include <utility>
#include <vector>
Expand Down Expand Up @@ -47,6 +49,15 @@ static OnlineRecognizerResult Convert(const OnlineTransducerDecoderResult &src,
auto sym = sym_table[i];

r.text.append(sym);

if (sym.size() == 1 && sym[0] != ' ') {
// for byte bpe models
std::ostringstream os;
os << "<0x" << std::hex << std::uppercase
<< (static_cast<int32_t>(sym[0]) & 0xff) << ">";
sym = os.str();
}

r.tokens.push_back(std::move(sym));
}

Expand Down
8 changes: 6 additions & 2 deletions sherpa-onnx/python/csrc/offline-stream.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,12 @@ Process audio samples.
static void PybindOfflineRecognitionResult(py::module *m) { // NOLINT
using PyClass = OfflineRecognitionResult;
py::class_<PyClass>(*m, "OfflineRecognitionResult")
.def_property_readonly("text",
[](const PyClass &self) { return self.text; })
.def_property_readonly(
"text",
[](const PyClass &self) -> py::str {
return py::str(PyUnicode_DecodeUTF8(self.text.c_str(),
self.text.size(), "ignore"));
})
.def_property_readonly("tokens",
[](const PyClass &self) { return self.tokens; })
.def_property_readonly(
Expand Down
6 changes: 5 additions & 1 deletion sherpa-onnx/python/csrc/online-recognizer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,11 @@ static void PybindOnlineRecognizerResult(py::module *m) {
using PyClass = OnlineRecognizerResult;
py::class_<PyClass>(*m, "OnlineRecognizerResult")
.def_property_readonly(
"text", [](PyClass &self) -> std::string { return self.text; })
"text",
[](PyClass &self) -> py::str {
return py::str(PyUnicode_DecodeUTF8(self.text.c_str(),
self.text.size(), "ignore"));
})
.def_property_readonly(
"tokens",
[](PyClass &self) -> std::vector<std::string> { return self.tokens; })
Expand Down

0 comments on commit e215d0c

Please sign in to comment.