Expose the dithering constant as a configuration option; 0.0 disables dithering

- Currently, dithering is not yet implemented in https://github.com/csukuangfj/kaldi-native-fbank; I can port it there from Kaldi.
k2-fsa · Mar 13, 2024 · 468fe1c · 468fe1c
1 parent 20f7aca
commit 468fe1c
Show file tree

Hide file tree

Showing 4 changed files with 28 additions and 5 deletions.
diff --git a/sherpa-onnx/csrc/features.cc b/sherpa-onnx/csrc/features.cc
@@ -30,7 +30,13 @@ void FeatureExtractorConfig::Register(ParseOptions *po) {
                "Low cutoff frequency for mel bins");
 
   po->Register("high-freq", &high_freq,
-               "High cutoff frequency for mel bins (if <= 0, offset from Nyquist)");
+               "High cutoff frequency for mel bins "
+               "(if <= 0, offset from Nyquist)");
+
+  po->Register("dither", &dither,
+               "Dithering constant (0.0 means no dither). "
+               "By default the audio samples are in range [-1,+1], "
+               "so dithering constant should be around 0.0001");
 }
 
 std::string FeatureExtractorConfig::ToString() const {
@@ -40,15 +46,16 @@ std::string FeatureExtractorConfig::ToString() const {
   os << "sampling_rate=" << sampling_rate << ", ";
   os << "feature_dim=" << feature_dim << ", ";
   os << "low_freq=" << low_freq << ", ";
-  os << "high_freq=" << high_freq << ")";
+  os << "high_freq=" << high_freq << ", ";
+  os << "dither=" << dither << ")";
 
   return os.str();
 }
 
 class FeatureExtractor::Impl {
  public:
   explicit Impl(const FeatureExtractorConfig &config) : config_(config) {
-    opts_.frame_opts.dither = 0;
+    opts_.frame_opts.dither = config.dither;
     opts_.frame_opts.snip_edges = config.snip_edges;
     opts_.frame_opts.samp_freq = config.sampling_rate;
     opts_.frame_opts.frame_shift_ms = config.frame_shift_ms;

diff --git a/sherpa-onnx/csrc/features.h b/sherpa-onnx/csrc/features.h
@@ -34,6 +34,14 @@ struct FeatureExtractorConfig {
   // https://github.com/k2-fsa/sherpa-onnx/issues/514
   float high_freq = -400.0f;
 
+  // dithering constant, useful for signals with hard-zeroes in non-speech parts
+  // this prevents large negative values in log-mel filterbanks
+  //
+  // By default the audio samples are in range [-1,+1],
+  // so dithering constant should be around 0.0001
+  //
+  float dither = 0.0f;
+
   // Set internally by some models, e.g., paraformer sets it to false.
   // This parameter is not exposed to users from the commandline
   // If true, the feature extractor expects inputs to be normalized to

diff --git a/sherpa-onnx/python/csrc/features.cc b/sherpa-onnx/python/csrc/features.cc
@@ -11,15 +11,17 @@ namespace sherpa_onnx {
 static void PybindFeatureExtractorConfig(py::module *m) {
   using PyClass = FeatureExtractorConfig;
   py::class_<PyClass>(*m, "FeatureExtractorConfig")
-      .def(py::init<int32_t, int32_t, float, float>(),
+      .def(py::init<int32_t, int32_t, float, float, float>(),
            py::arg("sampling_rate") = 16000,
            py::arg("feature_dim") = 80,
            py::arg("low_freq") = 20.0f,
-           py::arg("high_freq") = -400.0f)
+           py::arg("high_freq") = -400.0f,
+           py::arg("dither") = 0.0f)
       .def_readwrite("sampling_rate", &PyClass::sampling_rate)
       .def_readwrite("feature_dim", &PyClass::feature_dim)
       .def_readwrite("low_freq", &PyClass::low_freq)
       .def_readwrite("high_freq", &PyClass::high_freq)
+      .def_readwrite("dither", &PyClass::dither)
       .def("__str__", &PyClass::ToString);
 }
 

diff --git a/sherpa-onnx/python/sherpa_onnx/online_recognizer.py b/sherpa-onnx/python/sherpa_onnx/online_recognizer.py
@@ -43,6 +43,7 @@ def from_transducer(
         feature_dim: int = 80,
         low_freq: float = 20.0,
         high_freq: float = -400.0,
+        dither: float = 0.0,
         enable_endpoint_detection: bool = False,
         rule1_min_trailing_silence: float = 2.4,
         rule2_min_trailing_silence: float = 1.2,
@@ -87,6 +88,10 @@ def from_transducer(
           high_freq:
             High cutoff frequency for mel bins in feature extraction
             (if <= 0, offset from Nyquist)
+          dither:
+            Dithering constant (0.0 means no dither).
+            By default the audio samples are in range [-1,+1],
+            so dithering constant should be around 0.0001
           enable_endpoint_detection:
             True to enable endpoint detection. False to disable endpoint
             detection.
@@ -149,6 +154,7 @@ def from_transducer(
             feature_dim=feature_dim,
             low_freq=low_freq,
             high_freq=high_freq,
+            dither=dither,
         )
 
         endpoint_config = EndpointConfig(