From 3f2a17ef47c9ccba8491bc2abad3bf70acedfdc7 Mon Sep 17 00:00:00 2001 From: Karel Vesely Date: Fri, 26 Jan 2024 12:23:20 +0100 Subject: [PATCH] Fixes issue #535: fix hex-escaped 1-char tokens in ASR output. (#550) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Avoid output like: `[' K', '<0x64>', '<0x79>', 'ť', ' a', '<0x75>', 'to', 'bu', '<0x73>', '<0x75>', ... ]` when using regular BPE with 500 units. - Don't rewrite 1-char tokens in the printable ASCII range [ 0x20 (space) .. 0x7E (tilde) ]. --- sherpa-onnx/csrc/offline-recognizer-ctc-impl.h | 4 +++- sherpa-onnx/csrc/offline-recognizer-transducer-impl.h | 6 ++++-- sherpa-onnx/csrc/online-recognizer-ctc-impl.h | 4 +++- sherpa-onnx/csrc/online-recognizer-transducer-impl.h | 4 +++- 4 files changed, 13 insertions(+), 5 deletions(-) diff --git a/sherpa-onnx/csrc/offline-recognizer-ctc-impl.h b/sherpa-onnx/csrc/offline-recognizer-ctc-impl.h index 2f4961350..ca0b5522e 100644 --- a/sherpa-onnx/csrc/offline-recognizer-ctc-impl.h +++ b/sherpa-onnx/csrc/offline-recognizer-ctc-impl.h @@ -45,8 +45,10 @@ static OfflineRecognitionResult Convert(const OfflineCtcDecoderResult &src, auto sym = sym_table[src.tokens[i]]; text.append(sym); - if (sym.size() == 1 && sym[0] != ' ') { + if (sym.size() == 1 && (sym[0] < 0x20 || sym[0] > 0x7e)) { // for byte bpe models + // (but don't rewrite printable characters 0x20..0x7e, + // which collide with standard BPE units) std::ostringstream os; os << "<0x" << std::hex << std::uppercase << (static_cast<int32_t>(sym[0]) & 0xff) << ">"; diff --git a/sherpa-onnx/csrc/offline-recognizer-transducer-impl.h b/sherpa-onnx/csrc/offline-recognizer-transducer-impl.h index 084d39fe9..de9f6263b 100644 --- a/sherpa-onnx/csrc/offline-recognizer-transducer-impl.h +++ b/sherpa-onnx/csrc/offline-recognizer-transducer-impl.h @@ -46,8 +46,10 @@ static OfflineRecognitionResult Convert( auto sym = sym_table[i]; text.append(sym); - if (sym.size() == 1 && sym[0] != ' ') { - // for byte bpe models + if (sym.size() ==
1 && (sym[0] < 0x20 || sym[0] > 0x7e)) { + // for byte bpe models, + // (but don't rewrite printable characters 0x20..0x7e, + // which collide with standard BPE units) std::ostringstream os; os << "<0x" << std::hex << std::uppercase << (static_cast<int32_t>(sym[0]) & 0xff) << ">"; diff --git a/sherpa-onnx/csrc/online-recognizer-ctc-impl.h b/sherpa-onnx/csrc/online-recognizer-ctc-impl.h index fa81d6590..5697a77e8 100644 --- a/sherpa-onnx/csrc/online-recognizer-ctc-impl.h +++ b/sherpa-onnx/csrc/online-recognizer-ctc-impl.h @@ -38,8 +38,10 @@ static OnlineRecognizerResult Convert(const OnlineCtcDecoderResult &src, r.text.append(sym); - if (sym.size() == 1 && sym[0] != ' ') { + if (sym.size() == 1 && (sym[0] < 0x20 || sym[0] > 0x7e)) { // for byte bpe models + // (but don't rewrite printable characters 0x20..0x7e, + // which collide with standard BPE units) std::ostringstream os; os << "<0x" << std::hex << std::uppercase << (static_cast<int32_t>(sym[0]) & 0xff) << ">"; diff --git a/sherpa-onnx/csrc/online-recognizer-transducer-impl.h b/sherpa-onnx/csrc/online-recognizer-transducer-impl.h index 8b193a67e..b3f31cdd0 100644 --- a/sherpa-onnx/csrc/online-recognizer-transducer-impl.h +++ b/sherpa-onnx/csrc/online-recognizer-transducer-impl.h @@ -50,8 +50,10 @@ static OnlineRecognizerResult Convert(const OnlineTransducerDecoderResult &src, r.text.append(sym); - if (sym.size() == 1 && sym[0] != ' ') { + if (sym.size() == 1 && (sym[0] < 0x20 || sym[0] > 0x7e)) { // for byte bpe models + // (but don't rewrite printable characters 0x20..0x7e, + // which collide with standard BPE units) std::ostringstream os; os << "<0x" << std::hex << std::uppercase << (static_cast<int32_t>(sym[0]) & 0xff) << ">";