From b421d9131a44493568e13391ea3a23ce146afced Mon Sep 17 00:00:00 2001
From: yqzhishen <yangqian_1015@icloud.com>
Date: Mon, 26 Feb 2024 23:22:22 +0800
Subject: [PATCH 1/2] Add more mel checks between vocoder and acoustic model

---
 OpenUtau.Core/DiffSinger/DiffSingerConfig.cs  |  9 ++-
 .../DiffSinger/DiffSingerRenderer.cs          | 70 +++++++++++++++++--
 OpenUtau.Core/DiffSinger/DiffSingerVocoder.cs | 12 +++-
 3 files changed, 83 insertions(+), 8 deletions(-)

diff --git a/OpenUtau.Core/DiffSinger/DiffSingerConfig.cs b/OpenUtau.Core/DiffSinger/DiffSingerConfig.cs
index 4843b1c5a..5f088f174 100644
--- a/OpenUtau.Core/DiffSinger/DiffSingerConfig.cs
+++ b/OpenUtau.Core/DiffSinger/DiffSingerConfig.cs
@@ -33,8 +33,6 @@ public class DsConfig {
         public string linguistic;
         public string pitch;
         public string variance;
-        public int hop_size = 512;
-        public int sample_rate = 44100;
         public bool predict_dur = true;
         public bool predict_energy = true;
         public bool predict_breathiness = true;
@@ -42,6 +40,13 @@ public class DsConfig {
         public bool predict_tension = false;
         public bool use_expr = false;
         public bool use_note_rest = false;
+        public int sample_rate = 44100;
+        public int hop_size = 512;
+        public int num_mel_bins = 128;
+        public double mel_fmin = 40;
+        public double mel_fmax = 16000;
+        public string mel_base = "10";  // or "e"
+        public string mel_scale = "slaney";  // or "htk"
         public float frameMs(){
             return 1000f * hop_size / sample_rate;
         }
diff --git a/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs b/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs
index dd71f66ee..1bd7cd859 100644
--- a/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs
+++ b/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs
@@ -134,12 +134,54 @@ float[] InvokeDiffsinger(RenderPhrase phrase, int depth, int speedup, Cancellati
             }
 
             var vocoder = singer.getVocoder();
-            //Vocoder and singer should have the same hop sizes and sample rates.
+            //mel specification validity checks
+            //mel base must be 10 or e
+            if (vocoder.mel_base != "10" && vocoder.mel_base != "e") {
+                throw new Exception(
+                    $"Mel base must be \"10\" or \"e\", but got \"{vocoder.mel_base}\" from vocoder");
+            }
+            if (singer.dsConfig.mel_base != "10" && singer.dsConfig.mel_base != "e") {
+                throw new Exception(
+                    $"Mel base must be \"10\" or \"e\", but got \"{singer.dsConfig.mel_base}\" from acoustic model");
+            }
+            //mel scale must be slaney or htk
+            if (vocoder.mel_scale != "slaney" && vocoder.mel_scale != "htk") {
+                throw new Exception(
+                    $"Mel scale must be \"slaney\" or \"htk\", but got \"{vocoder.mel_scale}\" from vocoder");
+            }
+            if (singer.dsConfig.mel_scale != "slaney" && singer.dsConfig.mel_scale != "htk") {
+                throw new Exception(
+                    $"Mel scale must be \"slaney\" or \"htk\", but got \"{singer.dsConfig.mel_scale}\" from acoustic model");
+            }
+            //mel specification matching checks
+            if(vocoder.sample_rate != singer.dsConfig.sample_rate) {
+                throw new Exception(
+                    $"Vocoder and acoustic model has mismatching sample rate ({vocoder.sample_rate} != {singer.dsConfig.sample_rate})");
+            }
             if(vocoder.hop_size != singer.dsConfig.hop_size){
-                throw new Exception($"Vocoder's hop size is {vocoder.hop_size}, but acoustic's hop size is {singer.dsConfig.hop_size}.");
+                throw new Exception(
+                    $"Vocoder and acoustic model has mismatching hop size ({vocoder.hop_size} != {singer.dsConfig.hop_size})");
+            }
+            if (vocoder.num_mel_bins != singer.dsConfig.num_mel_bins) {
+                throw new Exception(
+                    $"Vocoder and acoustic model has mismatching mel bins ({vocoder.num_mel_bins} != {singer.dsConfig.num_mel_bins})");
+            }
+            if (Math.Abs(vocoder.mel_fmin - singer.dsConfig.mel_fmin) > 1e-5) {
+                throw new Exception(
+                    $"Vocoder and acoustic model has mismatching fmin ({vocoder.mel_fmin} != {singer.dsConfig.mel_fmin})");
+            }
+            if (Math.Abs(vocoder.mel_fmax - singer.dsConfig.mel_fmax) > 1e-5) {
+                throw new Exception(
+                    $"Vocoder and acoustic model has mismatching fmax ({vocoder.mel_fmax} != {singer.dsConfig.mel_fmax})");
             }
-            if(vocoder.sample_rate != singer.dsConfig.sample_rate){
-                throw new Exception($"Vocoder's sample rate is {vocoder.sample_rate}, but acoustic's sample rate is {singer.dsConfig.sample_rate}.");
+            // mismatching mel base can be transformed
+            // if (vocoder.mel_base != singer.dsConfig.mel_base) {
+            //     throw new Exception(
+            //         $"Vocoder and acoustic model has mismatching mel base ({vocoder.mel_base} != {singer.dsConfig.mel_base})");
+            // }
+            if (vocoder.mel_scale != singer.dsConfig.mel_scale) {
+                throw new Exception(
+                    $"Vocoder and acoustic model has mismatching mel scale ({vocoder.mel_scale} != {singer.dsConfig.mel_scale})");
             }
 
             var acousticModel = singer.getAcousticSession();
@@ -301,6 +343,26 @@ float[] InvokeDiffsinger(RenderPhrase phrase, int depth, int speedup, Cancellati
                 var acousticOutputs = acousticModel.Run(acousticInputs);
                 mel = acousticOutputs.First().AsTensor<float>().Clone();
             }
+            //mel transforms for different mel base
+            if (vocoder.mel_base != singer.dsConfig.mel_base) {
+                float k;
+                if (vocoder.mel_base == "e" && singer.dsConfig.mel_base == "10") {
+                    k = 2.30259f;
+                }
+                else if (vocoder.mel_base == "10" && singer.dsConfig.mel_base == "e") {
+                    k = 0.434294f;
+                } else {
+                    // this should never happen
+                    throw new Exception("This should never happen");
+                }
+                for (int b = 0; b < mel.Dimensions[0]; ++b) {
+                    for (int t = 0; t < mel.Dimensions[1]; ++t) {
+                        for (int c = 0; c < mel.Dimensions[2]; ++c) {
+                            mel[b, t, c] *= k;
+                        }
+                    }
+                }
+            }
             //vocoder
             //waveform = session.run(['waveform'], {'mel': mel, 'f0': f0})[0]
             var vocoderInputs = new List<NamedOnnxValue>();
diff --git a/OpenUtau.Core/DiffSinger/DiffSingerVocoder.cs b/OpenUtau.Core/DiffSinger/DiffSingerVocoder.cs
index 1a5fc35e7..8fce85761 100644
--- a/OpenUtau.Core/DiffSinger/DiffSingerVocoder.cs
+++ b/OpenUtau.Core/DiffSinger/DiffSingerVocoder.cs
@@ -11,6 +11,10 @@ public class DsVocoder : IDisposable {
         public int num_mel_bins => config.num_mel_bins;
         public int hop_size => config.hop_size;
         public int sample_rate => config.sample_rate;
+        public double mel_fmin => config.mel_fmin;
+        public double mel_fmax => config.mel_fmax;
+        public string mel_base => config.mel_base;
+        public string mel_scale => config.mel_scale;
 
         //Get vocoder by package name
         public DsVocoder(string name) {
@@ -54,8 +58,12 @@ public void Dispose() {
     public class DsVocoderConfig {
         public string name = "vocoder";
         public string model = "model.onnx";
-        public int num_mel_bins = 128;
-        public int hop_size = 512;
         public int sample_rate = 44100;
+        public int hop_size = 512;
+        public int num_mel_bins = 128;
+        public double mel_fmin = 40;
+        public double mel_fmax = 16000;
+        public string mel_base = "10";  // or "e"
+        public string mel_scale = "slaney";  // or "htk"
     }
 }

From 93950e524e1b05c47935e339f54ecbeffed97d35 Mon Sep 17 00:00:00 2001
From: yqzhishen <yangqian_1015@icloud.com>
Date: Tue, 27 Feb 2024 00:35:21 +0800
Subject: [PATCH 2/2] Add checks for `win_size` and `fft_size`

---
 OpenUtau.Core/DiffSinger/DiffSingerConfig.cs   | 2 ++
 OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs | 8 ++++++++
 OpenUtau.Core/DiffSinger/DiffSingerVocoder.cs  | 4 ++++
 3 files changed, 14 insertions(+)

diff --git a/OpenUtau.Core/DiffSinger/DiffSingerConfig.cs b/OpenUtau.Core/DiffSinger/DiffSingerConfig.cs
index 5f088f174..aef933d8b 100644
--- a/OpenUtau.Core/DiffSinger/DiffSingerConfig.cs
+++ b/OpenUtau.Core/DiffSinger/DiffSingerConfig.cs
@@ -42,6 +42,8 @@ public class DsConfig {
         public bool use_note_rest = false;
         public int sample_rate = 44100;
         public int hop_size = 512;
+        public int win_size = 2048;
+        public int fft_size = 2048;
         public int num_mel_bins = 128;
         public double mel_fmin = 40;
         public double mel_fmax = 16000;
diff --git a/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs b/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs
index 1bd7cd859..168b5f844 100644
--- a/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs
+++ b/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs
@@ -162,6 +162,14 @@ float[] InvokeDiffsinger(RenderPhrase phrase, int depth, int speedup, Cancellati
                 throw new Exception(
                     $"Vocoder and acoustic model has mismatching hop size ({vocoder.hop_size} != {singer.dsConfig.hop_size})");
             }
+            if(vocoder.win_size != singer.dsConfig.win_size){
+                throw new Exception(
+                    $"Vocoder and acoustic model has mismatching win size ({vocoder.win_size} != {singer.dsConfig.win_size})");
+            }
+            if(vocoder.fft_size != singer.dsConfig.fft_size){
+                throw new Exception(
+                    $"Vocoder and acoustic model has mismatching FFT size ({vocoder.fft_size} != {singer.dsConfig.fft_size})");
+            }
             if (vocoder.num_mel_bins != singer.dsConfig.num_mel_bins) {
                 throw new Exception(
                     $"Vocoder and acoustic model has mismatching mel bins ({vocoder.num_mel_bins} != {singer.dsConfig.num_mel_bins})");
diff --git a/OpenUtau.Core/DiffSinger/DiffSingerVocoder.cs b/OpenUtau.Core/DiffSinger/DiffSingerVocoder.cs
index 8fce85761..9b8aa5ad6 100644
--- a/OpenUtau.Core/DiffSinger/DiffSingerVocoder.cs
+++ b/OpenUtau.Core/DiffSinger/DiffSingerVocoder.cs
@@ -10,6 +10,8 @@ public class DsVocoder : IDisposable {
 
         public int num_mel_bins => config.num_mel_bins;
         public int hop_size => config.hop_size;
+        public int win_size => config.win_size;
+        public int fft_size => config.fft_size;
         public int sample_rate => config.sample_rate;
         public double mel_fmin => config.mel_fmin;
         public double mel_fmax => config.mel_fmax;
@@ -60,6 +62,8 @@ public class DsVocoderConfig {
         public string model = "model.onnx";
         public int sample_rate = 44100;
         public int hop_size = 512;
+        public int win_size = 2048;
+        public int fft_size = 2048;
         public int num_mel_bins = 128;
         public double mel_fmin = 40;
         public double mel_fmax = 16000;