stakira · stakira · Mar 28, 2024 · Feb 26, 2024 · Feb 26, 2024
diff --git a/OpenUtau.Core/DiffSinger/DiffSingerConfig.cs b/OpenUtau.Core/DiffSinger/DiffSingerConfig.cs
@@ -33,15 +33,22 @@ public class DsConfig {
         public string linguistic;
         public string pitch;
         public string variance;
-        public int hop_size = 512;
-        public int sample_rate = 44100;
         public bool predict_dur = true;
         public bool predict_energy = true;
         public bool predict_breathiness = true;
         public bool predict_voicing = false;
         public bool predict_tension = false;
         public bool use_expr = false;
         public bool use_note_rest = false;
+        public int sample_rate = 44100;  // with hop_size, determines frameMs(); renderer requires it to equal the vocoder's
+        public int hop_size = 512;  // with sample_rate, determines frameMs(); renderer requires it to equal the vocoder's
+        public int win_size = 2048;  // renderer requires it to equal the vocoder's win_size
+        public int fft_size = 2048;  // renderer requires it to equal the vocoder's fft_size
+        public int num_mel_bins = 128;  // renderer requires it to equal the vocoder's num_mel_bins
+        public double mel_fmin = 40;  // renderer requires it to equal the vocoder's mel_fmin (tolerance 1e-5)
+        public double mel_fmax = 16000;  // renderer requires it to equal the vocoder's mel_fmax (tolerance 1e-5)
+        public string mel_base = "10";  // "10" or "e"; may differ from the vocoder's (renderer rescales the mel output)
+        public string mel_scale = "slaney";  // "slaney" or "htk"; renderer requires it to equal the vocoder's mel_scale
         public float frameMs(){
             return 1000f * hop_size / sample_rate;
         }

diff --git a/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs b/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs
@@ -134,12 +134,62 @@ float[] InvokeDiffsinger(RenderPhrase phrase, int depth, int speedup, Cancellati
             }
 
             var vocoder = singer.getVocoder();
-            //Vocoder and singer should have the same hop sizes and sample rates.
+            // Validate the mel settings of both models before comparing them.
+            // mel_base must be "10" or "e" for the vocoder and for the acoustic model.
+            if (vocoder.mel_base != "10" && vocoder.mel_base != "e") {
+                throw new Exception(
+                    $"Mel base must be \"10\" or \"e\", but got \"{vocoder.mel_base}\" from vocoder");
+            }
+            if (singer.dsConfig.mel_base != "10" && singer.dsConfig.mel_base != "e") {
+                throw new Exception(
+                    $"Mel base must be \"10\" or \"e\", but got \"{singer.dsConfig.mel_base}\" from acoustic model");
+            }
+            // mel_scale must be "slaney" or "htk" for the vocoder and for the acoustic model.
+            if (vocoder.mel_scale != "slaney" && vocoder.mel_scale != "htk") {
+                throw new Exception(
+                    $"Mel scale must be \"slaney\" or \"htk\", but got \"{vocoder.mel_scale}\" from vocoder");
+            }
+            if (singer.dsConfig.mel_scale != "slaney" && singer.dsConfig.mel_scale != "htk") {
+                throw new Exception(
+                    $"Mel scale must be \"slaney\" or \"htk\", but got \"{singer.dsConfig.mel_scale}\" from acoustic model");
+            }
+            // Mel parameters that must be identical between the vocoder and the acoustic model.
+            if(vocoder.sample_rate != singer.dsConfig.sample_rate) {
+                throw new Exception(
+                    $"Vocoder and acoustic model has mismatching sample rate ({vocoder.sample_rate} != {singer.dsConfig.sample_rate})");
+            }
             if(vocoder.hop_size != singer.dsConfig.hop_size){
-                throw new Exception($"Vocoder's hop size is {vocoder.hop_size}, but acoustic's hop size is {singer.dsConfig.hop_size}.");
+                throw new Exception(
+                    $"Vocoder and acoustic model has mismatching hop size ({vocoder.hop_size} != {singer.dsConfig.hop_size})");
+            }
+            if(vocoder.win_size != singer.dsConfig.win_size){
+                throw new Exception(
+                    $"Vocoder and acoustic model has mismatching win size ({vocoder.win_size} != {singer.dsConfig.win_size})");
+            }
+            if(vocoder.fft_size != singer.dsConfig.fft_size){
+                throw new Exception(
+                    $"Vocoder and acoustic model has mismatching FFT size ({vocoder.fft_size} != {singer.dsConfig.fft_size})");
+            }
+            if (vocoder.num_mel_bins != singer.dsConfig.num_mel_bins) {
+                throw new Exception(
+                    $"Vocoder and acoustic model has mismatching mel bins ({vocoder.num_mel_bins} != {singer.dsConfig.num_mel_bins})");
+            }
+            if (Math.Abs(vocoder.mel_fmin - singer.dsConfig.mel_fmin) > 1e-5) {
+                throw new Exception(
+                    $"Vocoder and acoustic model has mismatching fmin ({vocoder.mel_fmin} != {singer.dsConfig.mel_fmin})");
             }
-            if(vocoder.sample_rate != singer.dsConfig.sample_rate){
-                throw new Exception($"Vocoder's sample rate is {vocoder.sample_rate}, but acoustic's sample rate is {singer.dsConfig.sample_rate}.");
+            if (Math.Abs(vocoder.mel_fmax - singer.dsConfig.mel_fmax) > 1e-5) {
+                throw new Exception(
+                    $"Vocoder and acoustic model has mismatching fmax ({vocoder.mel_fmax} != {singer.dsConfig.mel_fmax})");
+            }
+            // A mel base mismatch is allowed: the mel output is rescaled before it reaches the vocoder.
+            // if (vocoder.mel_base != singer.dsConfig.mel_base) {
+            //     throw new Exception(
+            //         $"Vocoder and acoustic model has mismatching mel base ({vocoder.mel_base} != {singer.dsConfig.mel_base})");
+            // }
+            if (vocoder.mel_scale != singer.dsConfig.mel_scale) {
+                throw new Exception(
+                    $"Vocoder and acoustic model has mismatching mel scale ({vocoder.mel_scale} != {singer.dsConfig.mel_scale})");
             }
 
             var acousticModel = singer.getAcousticSession();
@@ -301,6 +351,26 @@ float[] InvokeDiffsinger(RenderPhrase phrase, int depth, int speedup, Cancellati
                 var acousticOutputs = acousticModel.Run(acousticInputs);
                 mel = acousticOutputs.First().AsTensor<float>().Clone();
             }
+            // Rescale the mel values when the acoustic model and the vocoder declare different mel bases.
+            if (vocoder.mel_base != singer.dsConfig.mel_base) {
+                float k;
+                if (vocoder.mel_base == "e" && singer.dsConfig.mel_base == "10") {
+                    k = (float)Math.Log(10);  // base 10 -> base e: multiply by ln(10)
+                }
+                else if (vocoder.mel_base == "10" && singer.dsConfig.mel_base == "e") {
+                    k = (float)Math.Log10(Math.E);  // base e -> base 10: multiply by log10(e)
+                } else {
+                    // Defensive: not expected once both bases have been validated as "10" or "e".
+                    throw new Exception("This should never happen");
+                }
+                for (int b = 0; b < mel.Dimensions[0]; ++b) {
+                    for (int t = 0; t < mel.Dimensions[1]; ++t) {
+                        for (int c = 0; c < mel.Dimensions[2]; ++c) {
+                            mel[b, t, c] *= k;
+                        }
+                    }
+                }
+            }
             //vocoder
             //waveform = session.run(['waveform'], {'mel': mel, 'f0': f0})[0]
             var vocoderInputs = new List<NamedOnnxValue>();

diff --git a/OpenUtau.Core/DiffSinger/DiffSingerVocoder.cs b/OpenUtau.Core/DiffSinger/DiffSingerVocoder.cs
@@ -10,7 +10,13 @@ public class DsVocoder : IDisposable {
 
         public int num_mel_bins => config.num_mel_bins;
         public int hop_size => config.hop_size;
+        public int win_size => config.win_size;  // forwarded from config; renderer compares it with the acoustic model's
+        public int fft_size => config.fft_size;  // forwarded from config; renderer compares it with the acoustic model's
         public int sample_rate => config.sample_rate;
+        public double mel_fmin => config.mel_fmin;  // forwarded from config; renderer compares it with tolerance 1e-5
+        public double mel_fmax => config.mel_fmax;  // forwarded from config; renderer compares it with tolerance 1e-5
+        public string mel_base => config.mel_base;  // "10" or "e"; renderer rescales the mel if the acoustic model's differs
+        public string mel_scale => config.mel_scale;  // "slaney" or "htk"; renderer requires it to equal the acoustic model's
 
         //Get vocoder by package name
         public DsVocoder(string name) {
@@ -54,8 +60,14 @@ public void Dispose() {
     public class DsVocoderConfig {
         public string name = "vocoder";
         public string model = "model.onnx";
-        public int num_mel_bins = 128;
-        public int hop_size = 512;
         public int sample_rate = 44100;
+        public int hop_size = 512;  // renderer requires it to equal the acoustic model's hop_size
+        public int win_size = 2048;  // renderer requires it to equal the acoustic model's win_size
+        public int fft_size = 2048;  // renderer requires it to equal the acoustic model's fft_size
+        public int num_mel_bins = 128;  // renderer requires it to equal the acoustic model's num_mel_bins
+        public double mel_fmin = 40;  // renderer requires it to equal the acoustic model's mel_fmin (tolerance 1e-5)
+        public double mel_fmax = 16000;  // renderer requires it to equal the acoustic model's mel_fmax (tolerance 1e-5)
+        public string mel_base = "10";  // "10" or "e"; may differ from the acoustic model's (renderer rescales the mel)
+        public string mel_scale = "slaney";  // "slaney" or "htk"; renderer requires it to equal the acoustic model's mel_scale
     }
 }