From b421d9131a44493568e13391ea3a23ce146afced Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Mon, 26 Feb 2024 23:22:22 +0800 Subject: [PATCH 1/2] Add more mel checks between vocoder and acoustic model --- OpenUtau.Core/DiffSinger/DiffSingerConfig.cs | 9 ++- .../DiffSinger/DiffSingerRenderer.cs | 70 +++++++++++++++++-- OpenUtau.Core/DiffSinger/DiffSingerVocoder.cs | 12 +++- 3 files changed, 83 insertions(+), 8 deletions(-) diff --git a/OpenUtau.Core/DiffSinger/DiffSingerConfig.cs b/OpenUtau.Core/DiffSinger/DiffSingerConfig.cs index 4843b1c5a..5f088f174 100644 --- a/OpenUtau.Core/DiffSinger/DiffSingerConfig.cs +++ b/OpenUtau.Core/DiffSinger/DiffSingerConfig.cs @@ -33,8 +33,6 @@ public class DsConfig { public string linguistic; public string pitch; public string variance; - public int hop_size = 512; - public int sample_rate = 44100; public bool predict_dur = true; public bool predict_energy = true; public bool predict_breathiness = true; @@ -42,6 +40,13 @@ public class DsConfig { public bool predict_tension = false; public bool use_expr = false; public bool use_note_rest = false; + public int sample_rate = 44100; + public int hop_size = 512; + public int num_mel_bins = 128; + public double mel_fmin = 40; + public double mel_fmax = 16000; + public string mel_base = "10"; // or "e" + public string mel_scale = "slaney"; // or "htk" public float frameMs(){ return 1000f * hop_size / sample_rate; } diff --git a/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs b/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs index dd71f66ee..1bd7cd859 100644 --- a/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs +++ b/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs @@ -134,12 +134,54 @@ float[] InvokeDiffsinger(RenderPhrase phrase, int depth, int speedup, Cancellati } var vocoder = singer.getVocoder(); - //Vocoder and singer should have the same hop sizes and sample rates. 
+ //mel specification validity checks + //mel base must be 10 or e + if (vocoder.mel_base != "10" && vocoder.mel_base != "e") { + throw new Exception( + $"Mel base must be \"10\" or \"e\", but got \"{vocoder.mel_base}\" from vocoder"); + } + if (singer.dsConfig.mel_base != "10" && singer.dsConfig.mel_base != "e") { + throw new Exception( + $"Mel base must be \"10\" or \"e\", but got \"{singer.dsConfig.mel_base}\" from acoustic model"); + } + //mel scale must be slaney or htk + if (vocoder.mel_scale != "slaney" && vocoder.mel_scale != "htk") { + throw new Exception( + $"Mel scale must be \"slaney\" or \"htk\", but got \"{vocoder.mel_scale}\" from vocoder"); + } + if (singer.dsConfig.mel_scale != "slaney" && singer.dsConfig.mel_scale != "htk") { + throw new Exception( + $"Mel scale must be \"slaney\" or \"htk\", but got \"{singer.dsConfig.mel_scale}\" from acoustic model"); + } + //mel specification matching checks + if(vocoder.sample_rate != singer.dsConfig.sample_rate) { + throw new Exception( + $"Vocoder and acoustic model has mismatching sample rate ({vocoder.sample_rate} != {singer.dsConfig.sample_rate})"); + } if(vocoder.hop_size != singer.dsConfig.hop_size){ - throw new Exception($"Vocoder's hop size is {vocoder.hop_size}, but acoustic's hop size is {singer.dsConfig.hop_size}."); + throw new Exception( + $"Vocoder and acoustic model has mismatching hop size ({vocoder.hop_size} != {singer.dsConfig.hop_size})"); + } + if (vocoder.num_mel_bins != singer.dsConfig.num_mel_bins) { + throw new Exception( + $"Vocoder and acoustic model has mismatching mel bins ({vocoder.num_mel_bins} != {singer.dsConfig.num_mel_bins})"); + } + if (Math.Abs(vocoder.mel_fmin - singer.dsConfig.mel_fmin) > 1e-5) { + throw new Exception( + $"Vocoder and acoustic model has mismatching fmin ({vocoder.mel_fmin} != {singer.dsConfig.mel_fmin})"); + } + if (Math.Abs(vocoder.mel_fmax - singer.dsConfig.mel_fmax) > 1e-5) { + throw new Exception( + $"Vocoder and acoustic model has mismatching fmax 
({vocoder.mel_fmax} != {singer.dsConfig.mel_fmax})"); } - if(vocoder.sample_rate != singer.dsConfig.sample_rate){ - throw new Exception($"Vocoder's sample rate is {vocoder.sample_rate}, but acoustic's sample rate is {singer.dsConfig.sample_rate}."); + // mismatching mel base can be transformed + // if (vocoder.mel_base != singer.dsConfig.mel_base) { + // throw new Exception( + // $"Vocoder and acoustic model has mismatching mel base ({vocoder.mel_base} != {singer.dsConfig.mel_base})"); + // } + if (vocoder.mel_scale != singer.dsConfig.mel_scale) { + throw new Exception( + $"Vocoder and acoustic model has mismatching mel scale ({vocoder.mel_scale} != {singer.dsConfig.mel_scale})"); } var acousticModel = singer.getAcousticSession(); @@ -301,6 +343,26 @@ float[] InvokeDiffsinger(RenderPhrase phrase, int depth, int speedup, Cancellati var acousticOutputs = acousticModel.Run(acousticInputs); mel = acousticOutputs.First().AsTensor().Clone(); } + //mel transforms for different mel base + if (vocoder.mel_base != singer.dsConfig.mel_base) { + float k; + if (vocoder.mel_base == "e" && singer.dsConfig.mel_base == "10") { + k = 2.30259f; + } + else if (vocoder.mel_base == "10" && singer.dsConfig.mel_base == "e") { + k = 0.434294f; + } else { + // this should never happen + throw new Exception("This should never happen"); + } + for (int b = 0; b < mel.Dimensions[0]; ++b) { + for (int t = 0; t < mel.Dimensions[1]; ++t) { + for (int c = 0; c < mel.Dimensions[2]; ++c) { + mel[b, t, c] *= k; + } + } + } + } //vocoder //waveform = session.run(['waveform'], {'mel': mel, 'f0': f0})[0] var vocoderInputs = new List(); diff --git a/OpenUtau.Core/DiffSinger/DiffSingerVocoder.cs b/OpenUtau.Core/DiffSinger/DiffSingerVocoder.cs index 1a5fc35e7..8fce85761 100644 --- a/OpenUtau.Core/DiffSinger/DiffSingerVocoder.cs +++ b/OpenUtau.Core/DiffSinger/DiffSingerVocoder.cs @@ -11,6 +11,10 @@ public class DsVocoder : IDisposable { public int num_mel_bins => config.num_mel_bins; public int hop_size => 
config.hop_size; public int sample_rate => config.sample_rate; + public double mel_fmin => config.mel_fmin; + public double mel_fmax => config.mel_fmax; + public string mel_base => config.mel_base; + public string mel_scale => config.mel_scale; //Get vocoder by package name public DsVocoder(string name) { @@ -54,8 +58,12 @@ public void Dispose() { public class DsVocoderConfig { public string name = "vocoder"; public string model = "model.onnx"; - public int num_mel_bins = 128; - public int hop_size = 512; public int sample_rate = 44100; + public int hop_size = 512; + public int num_mel_bins = 128; + public double mel_fmin = 40; + public double mel_fmax = 16000; + public string mel_base = "10"; // or "e" + public string mel_scale = "slaney"; // or "htk" } } From 93950e524e1b05c47935e339f54ecbeffed97d35 Mon Sep 17 00:00:00 2001 From: yqzhishen Date: Tue, 27 Feb 2024 00:35:21 +0800 Subject: [PATCH 2/2] Add checks for `win_size` and `fft_size` --- OpenUtau.Core/DiffSinger/DiffSingerConfig.cs | 2 ++ OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs | 8 ++++++++ OpenUtau.Core/DiffSinger/DiffSingerVocoder.cs | 4 ++++ 3 files changed, 14 insertions(+) diff --git a/OpenUtau.Core/DiffSinger/DiffSingerConfig.cs b/OpenUtau.Core/DiffSinger/DiffSingerConfig.cs index 5f088f174..aef933d8b 100644 --- a/OpenUtau.Core/DiffSinger/DiffSingerConfig.cs +++ b/OpenUtau.Core/DiffSinger/DiffSingerConfig.cs @@ -42,6 +42,8 @@ public class DsConfig { public bool use_note_rest = false; public int sample_rate = 44100; public int hop_size = 512; + public int win_size = 2048; + public int fft_size = 2048; public int num_mel_bins = 128; public double mel_fmin = 40; public double mel_fmax = 16000; diff --git a/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs b/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs index 1bd7cd859..168b5f844 100644 --- a/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs +++ b/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs @@ -162,6 +162,14 @@ float[] InvokeDiffsinger(RenderPhrase 
phrase, int depth, int speedup, Cancellati throw new Exception( $"Vocoder and acoustic model has mismatching hop size ({vocoder.hop_size} != {singer.dsConfig.hop_size})"); } + if(vocoder.win_size != singer.dsConfig.win_size){ + throw new Exception( + $"Vocoder and acoustic model has mismatching win size ({vocoder.win_size} != {singer.dsConfig.win_size})"); + } + if(vocoder.fft_size != singer.dsConfig.fft_size){ + throw new Exception( + $"Vocoder and acoustic model has mismatching FFT size ({vocoder.fft_size} != {singer.dsConfig.fft_size})"); + } if (vocoder.num_mel_bins != singer.dsConfig.num_mel_bins) { throw new Exception( $"Vocoder and acoustic model has mismatching mel bins ({vocoder.num_mel_bins} != {singer.dsConfig.num_mel_bins})"); diff --git a/OpenUtau.Core/DiffSinger/DiffSingerVocoder.cs b/OpenUtau.Core/DiffSinger/DiffSingerVocoder.cs index 8fce85761..9b8aa5ad6 100644 --- a/OpenUtau.Core/DiffSinger/DiffSingerVocoder.cs +++ b/OpenUtau.Core/DiffSinger/DiffSingerVocoder.cs @@ -10,6 +10,8 @@ public class DsVocoder : IDisposable { public int num_mel_bins => config.num_mel_bins; public int hop_size => config.hop_size; + public int win_size => config.win_size; + public int fft_size => config.fft_size; public int sample_rate => config.sample_rate; public double mel_fmin => config.mel_fmin; public double mel_fmax => config.mel_fmax; @@ -60,6 +62,8 @@ public class DsVocoderConfig { public string model = "model.onnx"; public int sample_rate = 44100; public int hop_size = 512; + public int win_size = 2048; + public int fft_size = 2048; public int num_mel_bins = 128; public double mel_fmin = 40; public double mel_fmax = 16000;