From 565f335ae1b7b2007839a264a51f7f6cdfeb0ec8 Mon Sep 17 00:00:00 2001 From: Wei Kang Date: Tue, 16 Jul 2024 19:41:31 +0800 Subject: [PATCH] Fix hotwords OOV log (#1139) --- sherpa-onnx/csrc/utils.cc | 6 +++--- sherpa-onnx/python/sherpa_onnx/utils.py | 20 ++++++++++++-------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/sherpa-onnx/csrc/utils.cc b/sherpa-onnx/csrc/utils.cc index b5df9682f..f40b67697 100644 --- a/sherpa-onnx/csrc/utils.cc +++ b/sherpa-onnx/csrc/utils.cc @@ -62,9 +62,9 @@ static bool EncodeBase(const std::vector &lines, break; default: SHERPA_ONNX_LOGE( - "Cannot find ID for token %s at line: %s. (Hint: words on " - "the same line are separated by spaces)", - word.c_str(), line.c_str()); + "Cannot find ID for token %s at line: %s. (Hint: Check the " + "tokens.txt see if %s in it)", + word.c_str(), line.c_str(), word.c_str()); has_oov = true; break; } diff --git a/sherpa-onnx/python/sherpa_onnx/utils.py b/sherpa-onnx/python/sherpa_onnx/utils.py index 7b152a95a..fd36f2c03 100644 --- a/sherpa-onnx/python/sherpa_onnx/utils.py +++ b/sherpa-onnx/python/sherpa_onnx/utils.py @@ -4,6 +4,7 @@ from pathlib import Path from typing import List, Optional, Union + def text2token( texts: List[str], tokens: str, @@ -33,20 +34,20 @@ def text2token( is True, or it is a list of list of tokens. """ try: - import sentencepiece as spm + import sentencepiece as spm except ImportError: - print('Please run') - print(' pip install sentencepiece') - print('before you continue') + print("Please run") + print(" pip install sentencepiece") + print("before you continue") raise try: from pypinyin import pinyin from pypinyin.contrib.tone_convert import to_initials, to_finals_tone except ImportError: - print('Please run') - print(' pip install pypinyin') - print('before you continue') + print("Please run") + print(" pip install pypinyin") + print("before you continue") raise assert Path(tokens).is_file(), f"File not exists, {tokens}" @@ -119,7 +120,10 @@ def text2token( if txt in tokens_table: text_list.append(tokens_table[txt] if output_ids else txt) else: - print(f"OOV token : {txt}, skipping text : {text}.") + print( + f"Can't find token {txt} in token table, check your " + f"tokens.txt see if {txt} in it. skipping text : {text}." + ) contain_oov = True break if contain_oov: