Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhance diffsinger error messages; diffsinger phonemizer fix #1131

Merged
merged 6 commits into from
Jun 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions OpenUtau.Core/Api/G2pDictionary.cs
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ public Builder AddSymbol(string symbol, string type) {
phonemeSymbols[symbol] = type == "vowel";
if(type == "semivowel" || type == "liquid") {
glideSymbols.Add(symbol);
} else {
glideSymbols.Remove(symbol);
}
return this;
}
Expand All @@ -88,6 +90,8 @@ public Builder AddSymbol(string symbol, bool isVowel, bool isGlide) {
phonemeSymbols[symbol] = isVowel;
if (isGlide && !isVowel) {
glideSymbols.Add(symbol);
} else {
glideSymbols.Remove(symbol);
}
return this;
}
Expand Down
22 changes: 19 additions & 3 deletions OpenUtau.Core/DiffSinger/DiffSingerBasePhonemizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -74,20 +74,28 @@ public override void SetSinger(USinger singer) {
}

/// <summary>
/// Builds the G2P dictionary for this phonemizer from the singer folder.
/// </summary>
/// <param name="rootPath">Folder that is searched for the dictionary file.</param>
/// <returns>
/// A <c>G2pFallbacks</c> wrapping a single dictionary built from the first
/// dictionary file found, with "SP" and "AP" added as vowels.
/// </returns>
/// <remarks>
/// A failure while loading the dictionary file is logged and does not abort;
/// the returned dictionary then only contains whatever the builder accepted
/// before the failure plus the "SP"/"AP" symbols.
/// </remarks>
protected virtual IG2p LoadG2p(string rootPath) {
    //Each phonemizer has a dedicated dictionary name, such as dsdict-en.yaml, dsdict-ru.yaml.
    //If this dictionary exists, load it.
    //If not, load dsdict.yaml.
    var g2ps = new List<IG2p>();
    var dictionaryNames = new string[] {GetDictionaryName(), "dsdict.yaml"};
    // Load dictionary from singer folder.
    G2pDictionary.Builder g2pBuilder = new G2pDictionary.Builder();
    foreach(var dictionaryName in dictionaryNames){
        string dictionaryPath = Path.Combine(rootPath, dictionaryName);
        if (File.Exists(dictionaryPath)) {
            try {
                // Load into the shared builder only. A separately built copy of the
                // raw dictionary must not be put in front of it in the fallback list,
                // otherwise the SP/AP entries added below could be shadowed.
                g2pBuilder.Load(File.ReadAllText(dictionaryPath));
            } catch (Exception e) {
                Log.Error(e, $"Failed to load {dictionaryPath}");
            }
            // Only the first existing dictionary file is used.
            break;
        }
    }
    //SP and AP should always be vowel
    g2pBuilder.AddSymbol("SP", true);
    g2pBuilder.AddSymbol("AP", true);
    g2ps.Add(g2pBuilder.Build());
    return new G2pFallbacks(g2ps.ToArray());
}

Expand Down Expand Up @@ -148,7 +156,7 @@ List<phonemesPerNote> ProcessWord(Note[] notes, string[] symbols){
var isGlide = dsPhonemes.Select(s => g2p.IsGlide(s.Symbol)).ToArray();
var nonExtensionNotes = notes.Where(n=>!IsSyllableVowelExtensionNote(n)).ToArray();
var isStart = new bool[dsPhonemes.Length];
if(!isStart.Any()){
if(isVowel.All(b=>!b)){
isStart[0] = true;
}
for(int i=0; i<dsPhonemes.Length; i++){
Expand Down Expand Up @@ -211,6 +219,14 @@ public DiffSingerSpeakerEmbedManager getSpeakerEmbedManager(){
}
return speakerEmbedManager;
}

/// <summary>
/// Converts a phoneme symbol into its position in the <c>phonemes</c> list.
/// </summary>
/// <param name="phoneme">Phoneme symbol to look up.</param>
/// <returns>The index reported by <c>phonemes.IndexOf</c>.</returns>
/// <exception cref="Exception">
/// Thrown when the symbol is not in the list; the message names the phoneme
/// and the phonemes file path built from <c>rootPath</c> and <c>dsConfig.phonemes</c>.
/// </exception>
int PhonemeTokenize(string phoneme){
    int index = phonemes.IndexOf(phoneme);
    if(index >= 0){
        return index;
    }
    // Fail loudly instead of handing a negative token to the caller.
    throw new Exception($"Phoneme \"{phoneme}\" isn't supported by timing model. Please check {Path.Combine(rootPath, dsConfig.phonemes)}");
}

protected override void ProcessPart(Note[][] phrase) {
float padding = 500f;//Padding time for consonants at the beginning of a sentence, ms
Expand Down Expand Up @@ -247,7 +263,7 @@ protected override void ProcessPart(Note[][] phrase) {
//Linguistic Encoder
var tokens = phrasePhonemes
.SelectMany(n => n.Phonemes)
.Select(p => (Int64)phonemes.IndexOf(p.Symbol))
.Select(p => (Int64)PhonemeTokenize(p.Symbol))
.ToArray();
var word_div = phrasePhonemes.Take(phrasePhonemes.Count-1)
.Select(n => (Int64)n.Phonemes.Count)
Expand Down
36 changes: 25 additions & 11 deletions OpenUtau.Core/DiffSinger/DiffSingerPitch.cs
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ public DsPitch(string rootPath)
dsConfig = Core.Yaml.DefaultDeserializer.Deserialize<DsConfig>(
File.ReadAllText(Path.Combine(rootPath, "dsconfig.yaml"),
System.Text.Encoding.UTF8));
if(dsConfig.pitch == null){
throw new Exception("This voicebank doesn't contain a pitch model");
}
//Load phonemes list
string phonemesPath = Path.Combine(rootPath, dsConfig.phonemes);
phonemes = File.ReadLines(phonemesPath, Encoding.UTF8).ToList();
Expand All @@ -52,17 +55,16 @@ public DsPitch(string rootPath)
}

/// <summary>
/// Loads <c>dsdict.yaml</c> from the given folder and builds the G2P dictionary
/// used by the pitch predictor.
/// </summary>
/// <param name="rootPath">Folder expected to contain <c>dsdict.yaml</c>.</param>
/// <returns>The dictionary built from the file, with "SP" and "AP" added as vowels.</returns>
/// <exception cref="Exception">Thrown when <c>dsdict.yaml</c> does not exist in <paramref name="rootPath"/>.</exception>
protected IG2p LoadG2p(string rootPath) {
    // Load dictionary from singer folder.
    string file = Path.Combine(rootPath, "dsdict.yaml");
    if(!File.Exists(file)){
        // A missing dictionary is reported to the caller rather than silently
        // producing an empty G2P.
        throw new Exception($"File not found: {file}");
    }
    var g2pBuilder = G2pDictionary.NewBuilder().Load(File.ReadAllText(file));
    //SP and AP should always be vowel
    g2pBuilder.AddSymbol("SP", true);
    g2pBuilder.AddSymbol("AP", true);
    return g2pBuilder.Build();
}

public DiffSingerSpeakerEmbedManager getSpeakerEmbedManager(){
Expand All @@ -77,6 +79,14 @@ void SetRange<T>(T[] list, T value, int startIndex, int endIndex){
list[i] = value;
}
}

/// <summary>
/// Converts a phoneme symbol into its position in the <c>phonemes</c> list.
/// </summary>
/// <param name="phoneme">Phoneme symbol to look up.</param>
/// <returns>The index reported by <c>phonemes.IndexOf</c>.</returns>
/// <exception cref="Exception">
/// Thrown when the symbol is not in the list; the message names the phoneme
/// and the phonemes file path built from <c>rootPath</c> and <c>dsConfig.phonemes</c>.
/// </exception>
int PhonemeTokenize(string phoneme){
    int index = phonemes.IndexOf(phoneme);
    if(index >= 0){
        return index;
    }
    // Fail loudly instead of handing a negative token to the caller.
    throw new Exception($"Phoneme \"{phoneme}\" isn't supported by pitch model. Please check {Path.Combine(rootPath, dsConfig.phonemes)}");
}

public RenderPitchResult Process(RenderPhrase phrase){
var startMs = Math.Min(phrase.notes[0].positionMs, phrase.phones[0].positionMs) - headMs;
Expand All @@ -86,9 +96,10 @@ public RenderPitchResult Process(RenderPhrase phrase){
//Linguistic Encoder
var linguisticInputs = new List<NamedOnnxValue>();
var tokens = phrase.phones
.Select(p => (Int64)phonemes.IndexOf(p.phoneme))
.Prepend((Int64)phonemes.IndexOf("SP"))
.Append((Int64)phonemes.IndexOf("SP"))
.Select(p => p.phoneme)
.Prepend("SP")
.Append("SP")
.Select(x => (Int64)PhonemeTokenize(x))
.ToArray();
var ph_dur = phrase.phones
.Select(p=>(int)Math.Round(p.endMs/frameMs) - (int)Math.Round(p.positionMs/frameMs))
Expand All @@ -104,6 +115,9 @@ public RenderPitchResult Process(RenderPhrase phrase){
var vowelIds = Enumerable.Range(0,phrase.phones.Length)
.Where(i=>g2p.IsVowel(phrase.phones[i].phoneme))
.ToArray();
if(vowelIds.Length == 0){
vowelIds = new int[]{phrase.phones.Length-1};
}
var word_div = vowelIds.Zip(vowelIds.Skip(1),(a,b)=>(Int64)(b-a))
.Prepend(vowelIds[0] + 1)
.Append(phrase.phones.Length - vowelIds[^1] + 1)
Expand Down
2 changes: 1 addition & 1 deletion OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ float[] InvokeDiffsinger(RenderPhrase phrase, double depth, int steps, Cancellat
.Select(p => p.phoneme)
.Prepend("SP")
.Append("SP")
.Select(x => (long)(singer.phonemes.IndexOf(x)))
.Select(phoneme => (Int64)singer.PhonemeTokenize(phoneme))
.ToList();
var durations = phrase.phones
.Select(p => (int)Math.Round(p.endMs / frameMs) - (int)Math.Round(p.positionMs / frameMs))//prevent cumulative error
Expand Down
9 changes: 8 additions & 1 deletion OpenUtau.Core/DiffSinger/DiffSingerSinger.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
using OpenUtau.Core.Ustx;
using Serilog;
using Microsoft.ML.OnnxRuntime;
using NumSharp;

namespace OpenUtau.Core.DiffSinger {
class DiffSingerSinger : USinger {
Expand Down Expand Up @@ -194,6 +193,14 @@ public DsVariance getVariancePredictor(){
return variancePredictor;
}

/// <summary>
/// Converts a phoneme symbol into its position in this singer's <c>phonemes</c> list.
/// </summary>
/// <param name="phoneme">Phoneme symbol to look up.</param>
/// <returns>The index reported by <c>phonemes.IndexOf</c>.</returns>
/// <exception cref="Exception">
/// Thrown when the symbol is not in the list; the message names the phoneme
/// and the phonemes file path built from <c>Location</c> and <c>dsConfig.phonemes</c>.
/// </exception>
public int PhonemeTokenize(string phoneme){
    int index = phonemes.IndexOf(phoneme);
    if(index >= 0){
        return index;
    }
    // Fail loudly instead of handing a negative token to the caller.
    throw new Exception($"Phoneme \"{phoneme}\" isn't supported by acoustic model. Please check {Path.Combine(Location, dsConfig.phonemes)}");
}

public override void FreeMemory(){
Log.Information($"Freeing memory for singer {Id}");
if(acousticSession != null) {
Expand Down
33 changes: 21 additions & 12 deletions OpenUtau.Core/DiffSinger/DiffSingerVariance.cs
Original file line number Diff line number Diff line change
Expand Up @@ -58,17 +58,16 @@ public DsVariance(string rootPath)
}

/// <summary>
/// Loads <c>dsdict.yaml</c> from the given folder and builds the G2P dictionary
/// used by the variance predictor.
/// </summary>
/// <param name="rootPath">Folder expected to contain <c>dsdict.yaml</c>.</param>
/// <returns>The dictionary built from the file, with "SP" and "AP" added as vowels.</returns>
/// <exception cref="Exception">Thrown when <c>dsdict.yaml</c> does not exist in <paramref name="rootPath"/>.</exception>
protected IG2p LoadG2p(string rootPath) {
    // Load dictionary from singer folder.
    string file = Path.Combine(rootPath, "dsdict.yaml");
    if(!File.Exists(file)){
        // A missing dictionary is reported to the caller rather than silently
        // producing an empty G2P.
        throw new Exception($"File not found: {file}");
    }
    var g2pBuilder = G2pDictionary.NewBuilder().Load(File.ReadAllText(file));
    //SP and AP should always be vowel
    g2pBuilder.AddSymbol("SP", true);
    g2pBuilder.AddSymbol("AP", true);
    return g2pBuilder.Build();
}

public DiffSingerSpeakerEmbedManager getSpeakerEmbedManager(){
Expand All @@ -78,15 +77,22 @@ public DiffSingerSpeakerEmbedManager getSpeakerEmbedManager(){
return speakerEmbedManager;
}

/// <summary>
/// Converts a phoneme symbol into its position in the <c>phonemes</c> list.
/// </summary>
/// <param name="phoneme">Phoneme symbol to look up.</param>
/// <returns>The index reported by <c>phonemes.IndexOf</c>.</returns>
/// <exception cref="Exception">
/// Thrown when the symbol is not in the list; the message names the phoneme
/// and the phonemes file path built from <c>rootPath</c> and <c>dsConfig.phonemes</c>.
/// </exception>
int PhonemeTokenize(string phoneme){
    int index = phonemes.IndexOf(phoneme);
    if(index >= 0){
        return index;
    }
    // Fail loudly instead of handing a negative token to the caller.
    throw new Exception($"Phoneme \"{phoneme}\" isn't supported by variance model. Please check {Path.Combine(rootPath, dsConfig.phonemes)}");
}
public VarianceResult Process(RenderPhrase phrase){
int headFrames = (int)Math.Round(headMs / frameMs);
int tailFrames = (int)Math.Round(tailMs / frameMs);
//Linguistic Encoder
var linguisticInputs = new List<NamedOnnxValue>();
var tokens = phrase.phones
.Select(p => (Int64)phonemes.IndexOf(p.phoneme))
.Prepend((Int64)phonemes.IndexOf("SP"))
.Append((Int64)phonemes.IndexOf("SP"))
var tokens = phrase.phones.Select(p => p.phoneme)
.Prepend("SP")
.Append("SP")
.Select(x => (Int64)PhonemeTokenize(x))
.ToArray();
var ph_dur = phrase.phones
.Select(p => (int)Math.Round(p.endMs / frameMs) - (int)Math.Round(p.positionMs / frameMs))//prevent cumulative error
Expand All @@ -102,6 +108,9 @@ public VarianceResult Process(RenderPhrase phrase){
var vowelIds = Enumerable.Range(0,phrase.phones.Length)
.Where(i=>g2p.IsVowel(phrase.phones[i].phoneme))
.ToArray();
if(vowelIds.Length == 0){
vowelIds = new int[]{phrase.phones.Length-1};
}
var word_div = vowelIds.Zip(vowelIds.Skip(1),(a,b)=>(Int64)(b-a))
.Prepend(vowelIds[0] + 1)
.Append(phrase.phones.Length - vowelIds[^1] + 1)
Expand Down
12 changes: 9 additions & 3 deletions OpenUtau.Core/DiffSinger/Phonemizers/DiffSingerG2pPhonemizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -47,19 +47,25 @@ protected override IG2p LoadG2p(string rootPath) {
var g2ps = new List<IG2p>();

// Load dictionary from singer folder.
G2pDictionary.Builder g2pBuilder = new G2pDictionary.Builder();
var replacements = new Dictionary<string,string>();
foreach(var dictionaryName in dictionaryNames){
string dictionaryPath = Path.Combine(rootPath, dictionaryName);
if (File.Exists(dictionaryPath)) {
try {
g2ps.Add(G2pDictionary.NewBuilder().Load(File.ReadAllText(dictionaryPath)).Build());
replacements = G2pReplacementsData.Load(File.ReadAllText(dictionaryPath)).toDict();
string dictText = File.ReadAllText(dictionaryPath);
replacements = G2pReplacementsData.Load(dictText).toDict();
g2pBuilder.Load(dictText);
} catch (Exception e) {
Log.Error(e, $"Failed to load {dictionaryPath}");
}
break;
}
}
//SP and AP should always be vowel
g2pBuilder.AddSymbol("SP", true);
g2pBuilder.AddSymbol("AP", true);
g2ps.Add(g2pBuilder.Build());

// Load base g2p.
var baseG2p = LoadBaseG2p();
Expand All @@ -83,7 +89,7 @@ protected override IG2p LoadG2p(string rootPath) {
}
}
}
g2ps.Add(new G2pRemapper(baseG2p,phonemeSymbols, replacements));
g2ps.Add(new G2pRemapper(baseG2p, phonemeSymbols, replacements));
return new G2pFallbacks(g2ps.ToArray());
}
}
Expand Down
Loading