Skip to content

Commit

Permalink
Add more mel checks between vocoder and acoustic model
Browse files Browse the repository at this point in the history
  • Loading branch information
yqzhishen committed Feb 26, 2024
1 parent 8597a98 commit b421d91
Show file tree
Hide file tree
Showing 3 changed files with 83 additions and 8 deletions.
9 changes: 7 additions & 2 deletions OpenUtau.Core/DiffSinger/DiffSingerConfig.cs
Original file line number Diff line number Diff line change
Expand Up @@ -33,15 +33,20 @@ public class DsConfig {
public string linguistic;
public string pitch;
public string variance;
public int hop_size = 512;
public int sample_rate = 44100;
public bool predict_dur = true;
public bool predict_energy = true;
public bool predict_breathiness = true;
public bool predict_voicing = false;
public bool predict_tension = false;
public bool use_expr = false;
public bool use_note_rest = false;
public int sample_rate = 44100;
public int hop_size = 512;
public int num_mel_bins = 128;
public double mel_fmin = 40;
public double mel_fmax = 16000;
public string mel_base = "10"; // or "e"
public string mel_scale = "slaney"; // or "htk"
public float frameMs(){
return 1000f * hop_size / sample_rate;
}
Expand Down
70 changes: 66 additions & 4 deletions OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -134,12 +134,54 @@ float[] InvokeDiffsinger(RenderPhrase phrase, int depth, int speedup, Cancellati
}

var vocoder = singer.getVocoder();
//Vocoder and singer should have the same hop sizes and sample rates.
//mel specification validity checks
// Each side is validated on its own first, so the matching checks and the
// mel base rescaling further down only ever see supported values.
//mel base must be 10 or e
if (vocoder.mel_base != "10" && vocoder.mel_base != "e") {
    throw new Exception(
        $"Mel base must be \"10\" or \"e\", but got \"{vocoder.mel_base}\" from vocoder");
}
if (singer.dsConfig.mel_base != "10" && singer.dsConfig.mel_base != "e") {
    throw new Exception(
        $"Mel base must be \"10\" or \"e\", but got \"{singer.dsConfig.mel_base}\" from acoustic model");
}
//mel scale must be slaney or htk
if (vocoder.mel_scale != "slaney" && vocoder.mel_scale != "htk") {
    throw new Exception(
        $"Mel scale must be \"slaney\" or \"htk\", but got \"{vocoder.mel_scale}\" from vocoder");
}
if (singer.dsConfig.mel_scale != "slaney" && singer.dsConfig.mel_scale != "htk") {
    // Report the acoustic model's value: this branch fires because
    // singer.dsConfig.mel_scale is invalid, not the vocoder's.
    throw new Exception(
        $"Mel scale must be \"slaney\" or \"htk\", but got \"{singer.dsConfig.mel_scale}\" from acoustic model");
}
//mel specification matching checks
// The mel produced by the acoustic model is passed to the vocoder, so both
// configs must describe the same mel layout. Any mismatch aborts the render
// with a message naming both values.
if (vocoder.sample_rate != singer.dsConfig.sample_rate) {
    throw new Exception(
        $"Vocoder and acoustic model has mismatching sample rate ({vocoder.sample_rate} != {singer.dsConfig.sample_rate})");
}
if (vocoder.hop_size != singer.dsConfig.hop_size) {
    throw new Exception(
        $"Vocoder and acoustic model has mismatching hop size ({vocoder.hop_size} != {singer.dsConfig.hop_size})");
}
if (vocoder.num_mel_bins != singer.dsConfig.num_mel_bins) {
    throw new Exception(
        $"Vocoder and acoustic model has mismatching mel bins ({vocoder.num_mel_bins} != {singer.dsConfig.num_mel_bins})");
}
// fmin/fmax are doubles; compare with a small tolerance instead of ==.
if (Math.Abs(vocoder.mel_fmin - singer.dsConfig.mel_fmin) > 1e-5) {
    throw new Exception(
        $"Vocoder and acoustic model has mismatching fmin ({vocoder.mel_fmin} != {singer.dsConfig.mel_fmin})");
}
if (Math.Abs(vocoder.mel_fmax - singer.dsConfig.mel_fmax) > 1e-5) {
    throw new Exception(
        $"Vocoder and acoustic model has mismatching fmax ({vocoder.mel_fmax} != {singer.dsConfig.mel_fmax})");
}
// A mismatching mel base is deliberately NOT an error: the mel is rescaled
// to the vocoder's base after the acoustic model has run.
if (vocoder.mel_scale != singer.dsConfig.mel_scale) {
    throw new Exception(
        $"Vocoder and acoustic model has mismatching mel scale ({vocoder.mel_scale} != {singer.dsConfig.mel_scale})");
}

var acousticModel = singer.getAcousticSession();
Expand Down Expand Up @@ -301,6 +343,26 @@ float[] InvokeDiffsinger(RenderPhrase phrase, int depth, int speedup, Cancellati
var acousticOutputs = acousticModel.Run(acousticInputs);
mel = acousticOutputs.First().AsTensor<float>().Clone();
}
//mel transforms for different mel base
// When the two models use different logarithm bases, rescale the mel to the
// vocoder's base before it is handed over:
//   ln(x)    = log10(x) * ln(10)
//   log10(x) = ln(x)    / ln(10)
if (vocoder.mel_base != singer.dsConfig.mel_base) {
    // Derive the factor from ln(10) rather than hard-coding truncated literals.
    double ln10 = Math.Log(10.0);
    float k;
    if (vocoder.mel_base == "e" && singer.dsConfig.mel_base == "10") {
        k = (float)ln10;
    } else if (vocoder.mel_base == "10" && singer.dsConfig.mel_base == "e") {
        k = (float)(1.0 / ln10);
    } else {
        // Unreachable as long as both bases were validated to be "10" or "e".
        throw new Exception("This should never happen");
    }
    // NOTE(review): assumes mel is a rank-3 tensor (indexed [b, t, c]) — confirm.
    for (int b = 0; b < mel.Dimensions[0]; ++b) {
        for (int t = 0; t < mel.Dimensions[1]; ++t) {
            for (int c = 0; c < mel.Dimensions[2]; ++c) {
                mel[b, t, c] *= k;
            }
        }
    }
}
//vocoder
//waveform = session.run(['waveform'], {'mel': mel, 'f0': f0})[0]
var vocoderInputs = new List<NamedOnnxValue>();
Expand Down
12 changes: 10 additions & 2 deletions OpenUtau.Core/DiffSinger/DiffSingerVocoder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ public class DsVocoder : IDisposable {
public int num_mel_bins => config.num_mel_bins;
public int hop_size => config.hop_size;
public int sample_rate => config.sample_rate;
public double mel_fmin => config.mel_fmin;
public double mel_fmax => config.mel_fmax;
public string mel_base => config.mel_base;
public string mel_scale => config.mel_scale;

//Get vocoder by package name
public DsVocoder(string name) {
Expand Down Expand Up @@ -54,8 +58,12 @@ public void Dispose() {
// Configuration values describing a vocoder package and the mel spectrogram
// layout it expects. The initializers below are the defaults used when a
// value is not otherwise assigned.
public class DsVocoderConfig {
    public string name = "vocoder";
    public string model = "model.onnx";
    // Audio / framing parameters.
    public int sample_rate = 44100;
    public int hop_size = 512;
    // Mel spectrogram parameters.
    public int num_mel_bins = 128;
    public double mel_fmin = 40;
    public double mel_fmax = 16000;
    public string mel_base = "10"; // or "e"
    public string mel_scale = "slaney"; // or "htk"
}
}

0 comments on commit b421d91

Please sign in to comment.