Skip to content

Commit

Permalink
enhance error message for phoneme not in phoneme list or pitch model …
Browse files Browse the repository at this point in the history
…not found
  • Loading branch information
oxygen-dioxide committed May 9, 2024
1 parent bd73561 commit d5165f2
Show file tree
Hide file tree
Showing 5 changed files with 45 additions and 11 deletions.
10 changes: 9 additions & 1 deletion OpenUtau.Core/DiffSinger/DiffSingerBasePhonemizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,14 @@ public DiffSingerSpeakerEmbedManager getSpeakerEmbedManager(){
}
return speakerEmbedManager;
}

/// <summary>
/// Converts a phoneme symbol into its index within <c>phonemes</c>,
/// the token id fed to the timing model.
/// </summary>
/// <param name="phoneme">Phoneme symbol to look up.</param>
/// <returns>The index of <paramref name="phoneme"/> as reported by <c>phonemes.IndexOf</c>.</returns>
/// <exception cref="Exception">
/// Thrown when the lookup yields a negative index; the message names the
/// offending phoneme and the phoneme list path built from <c>rootPath</c>
/// and <c>dsConfig.phonemes</c>.
/// </exception>
int PhonemeTokenize(string phoneme){
    int tokenId = phonemes.IndexOf(phoneme);
    if(tokenId >= 0){
        return tokenId;
    }
    //Unknown symbol: fail with a message that tells the user which file to inspect
    throw new Exception($"Phoneme \"{phoneme}\" isn't supported by timing model. Please check {Path.Combine(rootPath, dsConfig.phonemes)}");
}

protected override void ProcessPart(Note[][] phrase) {
float padding = 500f;//Padding time for consonants at the beginning of a sentence, ms
Expand Down Expand Up @@ -247,7 +255,7 @@ protected override void ProcessPart(Note[][] phrase) {
//Linguistic Encoder
var tokens = phrasePhonemes
.SelectMany(n => n.Phonemes)
.Select(p => (Int64)phonemes.IndexOf(p.Symbol))
.Select(p => (Int64)PhonemeTokenize(p.Symbol))
.ToArray();
var word_div = phrasePhonemes.Take(phrasePhonemes.Count-1)
.Select(n => (Int64)n.Phonemes.Count)
Expand Down
18 changes: 15 additions & 3 deletions OpenUtau.Core/DiffSinger/DiffSingerPitch.cs
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ public DsPitch(string rootPath)
dsConfig = Core.Yaml.DefaultDeserializer.Deserialize<DsConfig>(
File.ReadAllText(Path.Combine(rootPath, "dsconfig.yaml"),
System.Text.Encoding.UTF8));
if(dsConfig.pitch == null){
throw new Exception("This voicebank doesn't contain a pitch model");
}
//Load phonemes list
string phonemesPath = Path.Combine(rootPath, dsConfig.phonemes);
phonemes = File.ReadLines(phonemesPath, Encoding.UTF8).ToList();
Expand Down Expand Up @@ -77,6 +80,14 @@ void SetRange<T>(T[] list, T value, int startIndex, int endIndex){
list[i] = value;
}
}

/// <summary>
/// Converts a phoneme symbol into its index within <c>phonemes</c>,
/// the token id fed to the pitch model.
/// </summary>
/// <param name="phoneme">Phoneme symbol to look up.</param>
/// <returns>The index of <paramref name="phoneme"/> as reported by <c>phonemes.IndexOf</c>.</returns>
/// <exception cref="Exception">
/// Thrown when the lookup yields a negative index; the message names the
/// offending phoneme and the phoneme list path built from <c>rootPath</c>
/// and <c>dsConfig.phonemes</c>.
/// </exception>
int PhonemeTokenize(string phoneme){
    int tokenId = phonemes.IndexOf(phoneme);
    bool isKnown = tokenId >= 0;
    if(!isKnown){
        //Unknown symbol: fail with a message that tells the user which file to inspect
        throw new Exception($"Phoneme \"{phoneme}\" isn't supported by pitch model. Please check {Path.Combine(rootPath, dsConfig.phonemes)}");
    }
    return tokenId;
}

public RenderPitchResult Process(RenderPhrase phrase){
var startMs = Math.Min(phrase.notes[0].positionMs, phrase.phones[0].positionMs) - headMs;
Expand All @@ -86,9 +97,10 @@ public RenderPitchResult Process(RenderPhrase phrase){
//Linguistic Encoder
var linguisticInputs = new List<NamedOnnxValue>();
var tokens = phrase.phones
.Select(p => (Int64)phonemes.IndexOf(p.phoneme))
.Prepend((Int64)phonemes.IndexOf("SP"))
.Append((Int64)phonemes.IndexOf("SP"))
.Select(p => p.phoneme)
.Prepend("SP")
.Append("SP")
.Select(x => (Int64)PhonemeTokenize(x))
.ToArray();
var ph_dur = phrase.phones
.Select(p=>(int)Math.Round(p.endMs/frameMs) - (int)Math.Round(p.positionMs/frameMs))
Expand Down
4 changes: 2 additions & 2 deletions OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -202,13 +202,13 @@ float[] InvokeDiffsinger(RenderPhrase phrase, double depth, int steps, Cancellat
//mel = session.run(['mel'], {'tokens': tokens, 'durations': durations, 'f0': f0, 'speedup': speedup})[0]
//tokens: phoneme index in the phoneme set
//durations: phoneme duration in frames
//f0: pitch curve in Hz by frame
//f0: pitch curve in Hz by frame
//speedup: Diffusion render speedup, int
var tokens = phrase.phones
.Select(p => p.phoneme)
.Prepend("SP")
.Append("SP")
.Select(x => (long)(singer.phonemes.IndexOf(x)))
.Select(phoneme => (Int64)singer.PhonemeTokenize(phoneme))
.ToList();
var durations = phrase.phones
.Select(p => (int)Math.Round(p.endMs / frameMs) - (int)Math.Round(p.positionMs / frameMs))//prevent cumulative error
Expand Down
9 changes: 8 additions & 1 deletion OpenUtau.Core/DiffSinger/DiffSingerSinger.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
using OpenUtau.Core.Ustx;
using Serilog;
using Microsoft.ML.OnnxRuntime;
using NumSharp;

namespace OpenUtau.Core.DiffSinger {
class DiffSingerSinger : USinger {
Expand Down Expand Up @@ -194,6 +193,14 @@ public DsVariance getVariancePredictor(){
return variancePredictor;
}

/// <summary>
/// Converts a phoneme symbol into its index within <c>phonemes</c>,
/// the token id fed to the acoustic model.
/// </summary>
/// <param name="phoneme">Phoneme symbol to look up.</param>
/// <returns>The index of <paramref name="phoneme"/> as reported by <c>phonemes.IndexOf</c>.</returns>
/// <exception cref="Exception">
/// Thrown when the lookup yields a negative index; the message names the
/// offending phoneme and the phoneme list path built from <c>Location</c>
/// and <c>dsConfig.phonemes</c>.
/// </exception>
public int PhonemeTokenize(string phoneme){
    int tokenId = phonemes.IndexOf(phoneme);
    if(tokenId >= 0){
        return tokenId;
    }
    //Unknown symbol: fail with a message that tells the user which file to inspect
    throw new Exception($"Phoneme \"{phoneme}\" isn't supported by acoustic model. Please check {Path.Combine(Location, dsConfig.phonemes)}");
}

public override void FreeMemory(){
Log.Information($"Freeing memory for singer {Id}");
if(acousticSession != null) {
Expand Down
15 changes: 11 additions & 4 deletions OpenUtau.Core/DiffSinger/DiffSingerVariance.cs
Original file line number Diff line number Diff line change
Expand Up @@ -78,15 +78,22 @@ public DiffSingerSpeakerEmbedManager getSpeakerEmbedManager(){
return speakerEmbedManager;
}

/// <summary>
/// Converts a phoneme symbol into its index within <c>phonemes</c>,
/// the token id fed to the variance model.
/// </summary>
/// <param name="phoneme">Phoneme symbol to look up.</param>
/// <returns>The index of <paramref name="phoneme"/> as reported by <c>phonemes.IndexOf</c>.</returns>
/// <exception cref="Exception">
/// Thrown when the lookup yields a negative index; the message names the
/// offending phoneme and the phoneme list path built from <c>rootPath</c>
/// and <c>dsConfig.phonemes</c>.
/// </exception>
int PhonemeTokenize(string phoneme){
    int tokenId = phonemes.IndexOf(phoneme);
    bool isKnown = tokenId >= 0;
    if(!isKnown){
        //Unknown symbol: fail with a message that tells the user which file to inspect
        throw new Exception($"Phoneme \"{phoneme}\" isn't supported by variance model. Please check {Path.Combine(rootPath, dsConfig.phonemes)}");
    }
    return tokenId;
}
public VarianceResult Process(RenderPhrase phrase){
int headFrames = (int)Math.Round(headMs / frameMs);
int tailFrames = (int)Math.Round(tailMs / frameMs);
//Linguistic Encoder
var linguisticInputs = new List<NamedOnnxValue>();
var tokens = phrase.phones
.Select(p => (Int64)phonemes.IndexOf(p.phoneme))
.Prepend((Int64)phonemes.IndexOf("SP"))
.Append((Int64)phonemes.IndexOf("SP"))
var tokens = phrase.phones.Select(p => p.phoneme)
.Prepend("SP")
.Append("SP")
.Select(x => (Int64)PhonemeTokenize(x))
.ToArray();
var ph_dur = phrase.phones
.Select(p => (int)Math.Round(p.endMs / frameMs) - (int)Math.Round(p.positionMs / frameMs))//prevent cumulative error
Expand Down

0 comments on commit d5165f2

Please sign in to comment.