Add a user-configurable `dither` option to FeatureExtractorConfig.

The option is registered as the `--dither` command-line flag, included in
ToString(), forwarded to opts_.frame_opts.dither (previously hard-coded to 0),
exposed through the pybind11 FeatureExtractorConfig binding (constructor
argument plus read/write attribute bound to PyClass::dither), and accepted by
the Python from_transducer() factory. The default of 0.0 keeps dithering off.

diff --git a/sherpa-onnx/csrc/features.cc b/sherpa-onnx/csrc/features.cc
index 90fc55772e..f465818a3d 100644
--- a/sherpa-onnx/csrc/features.cc
+++ b/sherpa-onnx/csrc/features.cc
@@ -30,7 +30,13 @@ void FeatureExtractorConfig::Register(ParseOptions *po) {
                "Low cutoff frequency for mel bins");
 
   po->Register("high-freq", &high_freq,
-               "High cutoff frequency for mel bins (if <= 0, offset from Nyquist)");
+               "High cutoff frequency for mel bins "
+               "(if <= 0, offset from Nyquist)");
+
+  po->Register("dither", &dither,
+               "Dithering constant (0.0 means no dither). "
+               "By default the audio samples are in range [-1,+1], "
+               "so dithering constant should be around 0.0001");
 }
 
 std::string FeatureExtractorConfig::ToString() const {
@@ -40,7 +46,8 @@ std::string FeatureExtractorConfig::ToString() const {
   os << "sampling_rate=" << sampling_rate << ", ";
   os << "feature_dim=" << feature_dim << ", ";
   os << "low_freq=" << low_freq << ", ";
-  os << "high_freq=" << high_freq << ")";
+  os << "high_freq=" << high_freq << ", ";
+  os << "dither=" << dither << ")";
 
   return os.str();
 }
@@ -48,7 +55,7 @@ std::string FeatureExtractorConfig::ToString() const {
 class FeatureExtractor::Impl {
  public:
   explicit Impl(const FeatureExtractorConfig &config) : config_(config) {
-    opts_.frame_opts.dither = 0;
+    opts_.frame_opts.dither = config.dither;
     opts_.frame_opts.snip_edges = config.snip_edges;
     opts_.frame_opts.samp_freq = config.sampling_rate;
     opts_.frame_opts.frame_shift_ms = config.frame_shift_ms;
diff --git a/sherpa-onnx/csrc/features.h b/sherpa-onnx/csrc/features.h
index d03fbaa0ca..1b68ce3911 100644
--- a/sherpa-onnx/csrc/features.h
+++ b/sherpa-onnx/csrc/features.h
@@ -34,6 +34,14 @@ struct FeatureExtractorConfig {
   // https://github.com/k2-fsa/sherpa-onnx/issues/514
   float high_freq = -400.0f;
 
+  // dithering constant, useful for signals with hard-zeroes in non-speech parts
+  // this prevents large negative values in log-mel filterbanks
+  //
+  // By default the audio samples are in range [-1,+1],
+  // so dithering constant should be around 0.0001
+  //
+  float dither = 0.0f;
+
   // Set internally by some models, e.g., paraformer sets it to false.
   // This parameter is not exposed to users from the commandline
   // If true, the feature extractor expects inputs to be normalized to
diff --git a/sherpa-onnx/python/csrc/features.cc b/sherpa-onnx/python/csrc/features.cc
index 106a8e0d37..333c6b6758 100644
--- a/sherpa-onnx/python/csrc/features.cc
+++ b/sherpa-onnx/python/csrc/features.cc
@@ -11,15 +11,17 @@ namespace sherpa_onnx {
 static void PybindFeatureExtractorConfig(py::module *m) {
   using PyClass = FeatureExtractorConfig;
   py::class_<PyClass>(*m, "FeatureExtractorConfig")
-      .def(py::init<int32_t, int32_t, float, float>(),
+      .def(py::init<int32_t, int32_t, float, float, float>(),
            py::arg("sampling_rate") = 16000,
            py::arg("feature_dim") = 80,
            py::arg("low_freq") = 20.0f,
-           py::arg("high_freq") = -400.0f)
+           py::arg("high_freq") = -400.0f,
+           py::arg("dither") = 0.0f)
       .def_readwrite("sampling_rate", &PyClass::sampling_rate)
       .def_readwrite("feature_dim", &PyClass::feature_dim)
       .def_readwrite("low_freq", &PyClass::low_freq)
       .def_readwrite("high_freq", &PyClass::high_freq)
+      .def_readwrite("dither", &PyClass::dither)
       .def("__str__", &PyClass::ToString);
 }
 
diff --git a/sherpa-onnx/python/sherpa_onnx/online_recognizer.py b/sherpa-onnx/python/sherpa_onnx/online_recognizer.py
index a5350d90b6..c343619609 100644
--- a/sherpa-onnx/python/sherpa_onnx/online_recognizer.py
+++ b/sherpa-onnx/python/sherpa_onnx/online_recognizer.py
@@ -43,6 +43,7 @@ def from_transducer(
         feature_dim: int = 80,
         low_freq: float = 20.0,
         high_freq: float = -400.0,
+        dither: float = 0.0,
         enable_endpoint_detection: bool = False,
         rule1_min_trailing_silence: float = 2.4,
         rule2_min_trailing_silence: float = 1.2,
@@ -87,6 +88,10 @@ def from_transducer(
           high_freq:
             High cutoff frequency for mel bins in feature extraction
             (if <= 0, offset from Nyquist)
+          dither:
+            Dithering constant (0.0 means no dither).
+            By default the audio samples are in range [-1,+1],
+            so dithering constant should be around 0.0001
           enable_endpoint_detection:
             True to enable endpoint detection. False to disable endpoint
             detection.
@@ -149,6 +154,7 @@ def from_transducer(
             feature_dim=feature_dim,
             low_freq=low_freq,
             high_freq=high_freq,
+            dither=dither,
         )
 
         endpoint_config = EndpointConfig(