diff --git a/python-api-examples/generate-subtitles.py b/python-api-examples/generate-subtitles.py index e9edb03c9..b1b9eca58 100755 --- a/python-api-examples/generate-subtitles.py +++ b/python-api-examples/generate-subtitles.py @@ -180,6 +180,17 @@ def get_args(): """, ) + parser.add_argument( + "--whisper-tail-paddings", + default=-1, + type=int, + help="""Number of tail padding frames. + We have removed the 30-second constraint from whisper, so you need to + choose the amount of tail padding frames by yourself. + Use -1 to use a default value for tail padding. + """, + ) + parser.add_argument( "--decoding-method", type=str, @@ -294,6 +305,7 @@ def create_recognizer(args) -> sherpa_onnx.OfflineRecognizer: debug=args.debug, language=args.whisper_language, task=args.whisper_task, + tail_paddings=args.whisper_tail_paddings, ) else: raise ValueError("Please specify at least one model") diff --git a/python-api-examples/non_streaming_server.py b/python-api-examples/non_streaming_server.py index ac83d9a42..20e8b68a0 100755 --- a/python-api-examples/non_streaming_server.py +++ b/python-api-examples/non_streaming_server.py @@ -277,6 +277,17 @@ def add_whisper_model_args(parser: argparse.ArgumentParser): """, ) + parser.add_argument( + "--whisper-tail-paddings", + default=-1, + type=int, + help="""Number of tail padding frames. + We have removed the 30-second constraint from whisper, so you need to + choose the amount of tail padding frames by yourself. + Use -1 to use a default value for tail padding. 
+ """, + ) + def add_model_args(parser: argparse.ArgumentParser): add_transducer_model_args(parser) @@ -913,6 +924,7 @@ def create_recognizer(args) -> sherpa_onnx.OfflineRecognizer: decoding_method=args.decoding_method, language=args.whisper_language, task=args.whisper_task, + tail_paddings=args.whisper_tail_paddings, ) elif args.tdnn_model: assert_file_exists(args.tdnn_model) diff --git a/python-api-examples/offline-decode-files.py b/python-api-examples/offline-decode-files.py index 16b11360d..78a1af042 100755 --- a/python-api-examples/offline-decode-files.py +++ b/python-api-examples/offline-decode-files.py @@ -220,6 +220,17 @@ def get_args(): """, ) + parser.add_argument( + "--whisper-tail-paddings", + default=-1, + type=int, + help="""Number of tail padding frames. + We have removed the 30-second constraint from whisper, so you need to + choose the amount of tail padding frames by yourself. + Use -1 to use a default value for tail padding. + """, + ) + parser.add_argument( "--decoding-method", type=str, @@ -391,6 +402,7 @@ def main(): debug=args.debug, language=args.whisper_language, task=args.whisper_task, + tail_paddings=args.whisper_tail_paddings, ) elif args.tdnn_model: assert_file_exists(args.tdnn_model) diff --git a/python-api-examples/two-pass-speech-recognition-from-microphone.py b/python-api-examples/two-pass-speech-recognition-from-microphone.py index 12a57ffa8..697e94850 100755 --- a/python-api-examples/two-pass-speech-recognition-from-microphone.py +++ b/python-api-examples/two-pass-speech-recognition-from-microphone.py @@ -195,6 +195,17 @@ def add_second_pass_whisper_model_args(parser: argparse.ArgumentParser): """, ) + parser.add_argument( + "--second-whisper-tail-paddings", + default=-1, + type=int, + help="""Number of tail padding frames. + We have removed the 30-second constraint from whisper, so you need to + choose the amount of tail padding frames by yourself. + Use -1 to use a default value for tail padding. 
+ """, + ) + def add_second_pass_non_streaming_model_args(parser: argparse.ArgumentParser): add_second_pass_transducer_model_args(parser) @@ -314,6 +325,7 @@ def create_second_pass_recognizer(args) -> sherpa_onnx.OfflineRecognizer: decoding_method="greedy_search", language=args.second_whisper_language, task=args.second_whisper_task, + tail_paddings=args.second_whisper_tail_paddings, ) else: raise ValueError("Please specify at least one model for the second pass") diff --git a/python-api-examples/vad-with-non-streaming-asr.py b/python-api-examples/vad-with-non-streaming-asr.py index a73a1bab2..4f4974389 100755 --- a/python-api-examples/vad-with-non-streaming-asr.py +++ b/python-api-examples/vad-with-non-streaming-asr.py @@ -166,6 +166,17 @@ def get_args(): """, ) + parser.add_argument( + "--whisper-tail-paddings", + default=-1, + type=int, + help="""Number of tail padding frames. + We have removed the 30-second constraint from whisper, so you need to + choose the amount of tail padding frames by yourself. + Use -1 to use a default value for tail padding. + """, + ) + parser.add_argument( "--decoding-method", type=str, @@ -256,6 +267,7 @@ def create_recognizer(args) -> sherpa_onnx.OfflineRecognizer: debug=args.debug, language=args.whisper_language, task=args.whisper_task, + tail_paddings=args.whisper_tail_paddings, ) else: raise ValueError("Please specify at least one model") diff --git a/sherpa-onnx/csrc/offline-recognizer-whisper-impl.h b/sherpa-onnx/csrc/offline-recognizer-whisper-impl.h index c748b593e..4f5be4ca1 100644 --- a/sherpa-onnx/csrc/offline-recognizer-whisper-impl.h +++ b/sherpa-onnx/csrc/offline-recognizer-whisper-impl.h @@ -116,18 +116,12 @@ class OfflineRecognizerWhisperImpl : public OfflineRecognizerImpl { NormalizeFeatures(f.data(), num_frames, feat_dim); - // note that 50 is an experience value. - // see also ../../scripts/whisper/test.py - // - // You can replace 50 by other values, say, 100. + // note that 1000 is an experience-value. 
+ // You can replace 1000 by other values, say, 100. // // Since we have removed the 30 seconds constraint, we need // tail_padding_frames so that whisper is able to detect the eot token. - int32_t tail_padding_frames = 50; - if (model_->IsMultiLingual()) { - // 300 is an experience value. If it throws, please use a larger value. - tail_padding_frames = 300; - } + int32_t tail_padding_frames = 1000; if (config_.model_config.whisper.tail_paddings > 0) { tail_padding_frames = config_.model_config.whisper.tail_paddings; @@ -140,11 +134,13 @@ class OfflineRecognizerWhisperImpl : public OfflineRecognizerImpl { Ort::Value mel = Ort::Value::CreateTensor( model_->Allocator(), shape.data(), shape.size()); + float *p_mel = mel.GetTensorMutableData(); - std::copy(f.data(), f.data() + actual_frames * feat_dim, p_mel); + std::copy(f.data(), f.data() + num_frames * feat_dim, p_mel); + + std::fill_n(p_mel + num_frames * feat_dim, + (actual_frames - num_frames) * feat_dim, 0); - memset(p_mel + f.size(), 0, - (actual_frames - num_frames) * feat_dim * sizeof(float)); mel = Transpose12(model_->Allocator(), &mel); try { @@ -156,8 +152,12 @@ class OfflineRecognizerWhisperImpl : public OfflineRecognizerImpl { auto r = Convert(results[0], symbol_table_); s->SetResult(r); } catch (const Ort::Exception &ex) { - SHERPA_ONNX_LOGE("\n\nCaught exception:\n\n%s\n\nReturn an empty result", - ex.what()); + SHERPA_ONNX_LOGE( + "\n\nCaught exception:\n\n%s\n\nReturn an empty result. Number of " + "input frames: %d, Current tail " + "paddings: %d. If you see a lot of such exceptions, please consider " + "using a larger --whisper-tail-paddings", + ex.what(), num_frames, tail_padding_frames); return; } } diff --git a/sherpa-onnx/python/sherpa_onnx/offline_recognizer.py b/sherpa-onnx/python/sherpa_onnx/offline_recognizer.py index 57afb2dca..214f98a1d 100644 --- a/sherpa-onnx/python/sherpa_onnx/offline_recognizer.py +++ b/sherpa-onnx/python/sherpa_onnx/offline_recognizer.py @@ -261,6 +261,7 @@ def from_whisper( decoding_method: str = "greedy_search", debug: bool = False, provider: str = "cpu", + tail_paddings: int = -1, ): """ Please refer to @@ -305,6 +306,7 @@ decoder=decoder, language=language, task=task, + tail_paddings=tail_paddings, ), tokens=tokens, num_threads=num_threads,