Skip to content

Commit

Permalink
Merge pull request #1131 from oxygen-dioxide/diffsinger
Browse files Browse the repository at this point in the history
Enhance diffsinger error messages; diffsinger phonemizer fix
  • Loading branch information
stakira authored Jun 9, 2024
2 parents b8e7dc3 + 83d635a commit 6d706b3
Show file tree
Hide file tree
Showing 7 changed files with 87 additions and 31 deletions.
4 changes: 4 additions & 0 deletions OpenUtau.Core/Api/G2pDictionary.cs
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ public Builder AddSymbol(string symbol, string type) {
phonemeSymbols[symbol] = type == "vowel";
if(type == "semivowel" || type == "liquid") {
glideSymbols.Add(symbol);
} else {
glideSymbols.Remove(symbol);
}
return this;
}
Expand All @@ -88,6 +90,8 @@ public Builder AddSymbol(string symbol, bool isVowel, bool isGlide) {
phonemeSymbols[symbol] = isVowel;
if (isGlide && !isVowel) {
glideSymbols.Add(symbol);
} else {
glideSymbols.Remove(symbol);
}
return this;
}
Expand Down
22 changes: 19 additions & 3 deletions OpenUtau.Core/DiffSinger/DiffSingerBasePhonemizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -74,20 +74,28 @@ public override void SetSinger(USinger singer) {
}

protected virtual IG2p LoadG2p(string rootPath) {
//Each phonemizer has a dedicated dictionary name, such as dsdict-en.yaml, dsdict-ru.yaml.
//If this dictionary exists, load it.
//If not, load dsdict.yaml.
var g2ps = new List<IG2p>();
var dictionaryNames = new string[] {GetDictionaryName(), "dsdict.yaml"};
// Load dictionary from singer folder.
G2pDictionary.Builder g2pBuilder = new G2pDictionary.Builder();
foreach(var dictionaryName in dictionaryNames){
string dictionaryPath = Path.Combine(rootPath, dictionaryName);
if (File.Exists(dictionaryPath)) {
try {
g2ps.Add(G2pDictionary.NewBuilder().Load(File.ReadAllText(dictionaryPath)).Build());
g2pBuilder.Load(File.ReadAllText(dictionaryPath)).Build();
} catch (Exception e) {
Log.Error(e, $"Failed to load {dictionaryPath}");
}
break;
}
}
//SP and AP should always be treated as vowels
g2pBuilder.AddSymbol("SP", true);
g2pBuilder.AddSymbol("AP", true);
g2ps.Add(g2pBuilder.Build());
return new G2pFallbacks(g2ps.ToArray());
}

Expand Down Expand Up @@ -148,7 +156,7 @@ List<phonemesPerNote> ProcessWord(Note[] notes, string[] symbols){
var isGlide = dsPhonemes.Select(s => g2p.IsGlide(s.Symbol)).ToArray();
var nonExtensionNotes = notes.Where(n=>!IsSyllableVowelExtensionNote(n)).ToArray();
var isStart = new bool[dsPhonemes.Length];
if(!isStart.Any()){
if(isVowel.All(b=>!b)){
isStart[0] = true;
}
for(int i=0; i<dsPhonemes.Length; i++){
Expand Down Expand Up @@ -211,6 +219,14 @@ public DiffSingerSpeakerEmbedManager getSpeakerEmbedManager(){
}
return speakerEmbedManager;
}

//Converts a phoneme symbol into its index within the phonemes collection.
//A symbol that is not found raises an exception naming the phoneme and the phonemes file to check.
int PhonemeTokenize(string phoneme){
    int tokenId = phonemes.IndexOf(phoneme);
    if(tokenId >= 0){
        return tokenId;
    }
    //IndexOf reported "not found": fail with a message instead of passing a negative token on.
    throw new Exception($"Phoneme \"{phoneme}\" isn't supported by timing model. Please check {Path.Combine(rootPath, dsConfig.phonemes)}");
}

protected override void ProcessPart(Note[][] phrase) {
float padding = 500f;//Padding time for consonants at the beginning of a sentence, ms
Expand Down Expand Up @@ -247,7 +263,7 @@ protected override void ProcessPart(Note[][] phrase) {
//Linguistic Encoder
var tokens = phrasePhonemes
.SelectMany(n => n.Phonemes)
.Select(p => (Int64)phonemes.IndexOf(p.Symbol))
.Select(p => (Int64)PhonemeTokenize(p.Symbol))
.ToArray();
var word_div = phrasePhonemes.Take(phrasePhonemes.Count-1)
.Select(n => (Int64)n.Phonemes.Count)
Expand Down
36 changes: 25 additions & 11 deletions OpenUtau.Core/DiffSinger/DiffSingerPitch.cs
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ public DsPitch(string rootPath)
dsConfig = Core.Yaml.DefaultDeserializer.Deserialize<DsConfig>(
File.ReadAllText(Path.Combine(rootPath, "dsconfig.yaml"),
System.Text.Encoding.UTF8));
if(dsConfig.pitch == null){
throw new Exception("This voicebank doesn't contain a pitch model");
}
//Load phonemes list
string phonemesPath = Path.Combine(rootPath, dsConfig.phonemes);
phonemes = File.ReadLines(phonemesPath, Encoding.UTF8).ToList();
Expand All @@ -52,17 +55,16 @@ public DsPitch(string rootPath)
}

protected IG2p LoadG2p(string rootPath) {
var g2ps = new List<IG2p>();
// Load dictionary from singer folder.
string file = Path.Combine(rootPath, "dsdict.yaml");
if (File.Exists(file)) {
try {
g2ps.Add(G2pDictionary.NewBuilder().Load(File.ReadAllText(file)).Build());
} catch (Exception e) {
Log.Error(e, $"Failed to load {file}");
}
if(!File.Exists(file)){
throw new Exception($"File not found: {file}");
}
return new G2pFallbacks(g2ps.ToArray());
var g2pBuilder = G2pDictionary.NewBuilder().Load(File.ReadAllText(file));
//SP and AP should always be treated as vowels
g2pBuilder.AddSymbol("SP", true);
g2pBuilder.AddSymbol("AP", true);
return g2pBuilder.Build();
}

public DiffSingerSpeakerEmbedManager getSpeakerEmbedManager(){
Expand All @@ -77,6 +79,14 @@ void SetRange<T>(T[] list, T value, int startIndex, int endIndex){
list[i] = value;
}
}

//Converts a phoneme symbol into its index within the phonemes list.
//A symbol that is not found raises an exception naming the phoneme and the phonemes file to check.
int PhonemeTokenize(string phoneme){
    int tokenId = phonemes.IndexOf(phoneme);
    if(tokenId >= 0){
        return tokenId;
    }
    //IndexOf reported "not found": fail with a message instead of passing a negative token on.
    throw new Exception($"Phoneme \"{phoneme}\" isn't supported by pitch model. Please check {Path.Combine(rootPath, dsConfig.phonemes)}");
}

public RenderPitchResult Process(RenderPhrase phrase){
var startMs = Math.Min(phrase.notes[0].positionMs, phrase.phones[0].positionMs) - headMs;
Expand All @@ -86,9 +96,10 @@ public RenderPitchResult Process(RenderPhrase phrase){
//Linguistic Encoder
var linguisticInputs = new List<NamedOnnxValue>();
var tokens = phrase.phones
.Select(p => (Int64)phonemes.IndexOf(p.phoneme))
.Prepend((Int64)phonemes.IndexOf("SP"))
.Append((Int64)phonemes.IndexOf("SP"))
.Select(p => p.phoneme)
.Prepend("SP")
.Append("SP")
.Select(x => (Int64)PhonemeTokenize(x))
.ToArray();
var ph_dur = phrase.phones
.Select(p=>(int)Math.Round(p.endMs/frameMs) - (int)Math.Round(p.positionMs/frameMs))
Expand All @@ -104,6 +115,9 @@ public RenderPitchResult Process(RenderPhrase phrase){
var vowelIds = Enumerable.Range(0,phrase.phones.Length)
.Where(i=>g2p.IsVowel(phrase.phones[i].phoneme))
.ToArray();
if(vowelIds.Length == 0){
vowelIds = new int[]{phrase.phones.Length-1};
}
var word_div = vowelIds.Zip(vowelIds.Skip(1),(a,b)=>(Int64)(b-a))
.Prepend(vowelIds[0] + 1)
.Append(phrase.phones.Length - vowelIds[^1] + 1)
Expand Down
2 changes: 1 addition & 1 deletion OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ float[] InvokeDiffsinger(RenderPhrase phrase, double depth, int steps, Cancellat
.Select(p => p.phoneme)
.Prepend("SP")
.Append("SP")
.Select(x => (long)(singer.phonemes.IndexOf(x)))
.Select(phoneme => (Int64)singer.PhonemeTokenize(phoneme))
.ToList();
var durations = phrase.phones
.Select(p => (int)Math.Round(p.endMs / frameMs) - (int)Math.Round(p.positionMs / frameMs))//prevent cumulative error
Expand Down
9 changes: 8 additions & 1 deletion OpenUtau.Core/DiffSinger/DiffSingerSinger.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
using OpenUtau.Core.Ustx;
using Serilog;
using Microsoft.ML.OnnxRuntime;
using NumSharp;

namespace OpenUtau.Core.DiffSinger {
class DiffSingerSinger : USinger {
Expand Down Expand Up @@ -194,6 +193,14 @@ public DsVariance getVariancePredictor(){
return variancePredictor;
}

//Converts a phoneme symbol into its index within the singer's phonemes collection.
//A symbol that is not found raises an exception naming the phoneme and the phonemes file to check.
public int PhonemeTokenize(string phoneme){
    int tokenId = phonemes.IndexOf(phoneme);
    if(tokenId >= 0){
        return tokenId;
    }
    //IndexOf reported "not found": fail with a message instead of passing a negative token on.
    throw new Exception($"Phoneme \"{phoneme}\" isn't supported by acoustic model. Please check {Path.Combine(Location, dsConfig.phonemes)}");
}

public override void FreeMemory(){
Log.Information($"Freeing memory for singer {Id}");
if(acousticSession != null) {
Expand Down
33 changes: 21 additions & 12 deletions OpenUtau.Core/DiffSinger/DiffSingerVariance.cs
Original file line number Diff line number Diff line change
Expand Up @@ -58,17 +58,16 @@ public DsVariance(string rootPath)
}

protected IG2p LoadG2p(string rootPath) {
var g2ps = new List<IG2p>();
// Load dictionary from singer folder.
string file = Path.Combine(rootPath, "dsdict.yaml");
if (File.Exists(file)) {
try {
g2ps.Add(G2pDictionary.NewBuilder().Load(File.ReadAllText(file)).Build());
} catch (Exception e) {
Log.Error(e, $"Failed to load {file}");
}
if(!File.Exists(file)){
throw new Exception($"File not found: {file}");
}
return new G2pFallbacks(g2ps.ToArray());
var g2pBuilder = G2pDictionary.NewBuilder().Load(File.ReadAllText(file));
//SP and AP should always be treated as vowels
g2pBuilder.AddSymbol("SP", true);
g2pBuilder.AddSymbol("AP", true);
return g2pBuilder.Build();
}

public DiffSingerSpeakerEmbedManager getSpeakerEmbedManager(){
Expand All @@ -78,15 +77,22 @@ public DiffSingerSpeakerEmbedManager getSpeakerEmbedManager(){
return speakerEmbedManager;
}

//Converts a phoneme symbol into its index within the phonemes collection.
//A symbol that is not found raises an exception naming the phoneme and the phonemes file to check.
int PhonemeTokenize(string phoneme){
    int tokenId = phonemes.IndexOf(phoneme);
    if(tokenId >= 0){
        return tokenId;
    }
    //IndexOf reported "not found": fail with a message instead of passing a negative token on.
    throw new Exception($"Phoneme \"{phoneme}\" isn't supported by variance model. Please check {Path.Combine(rootPath, dsConfig.phonemes)}");
}
public VarianceResult Process(RenderPhrase phrase){
int headFrames = (int)Math.Round(headMs / frameMs);
int tailFrames = (int)Math.Round(tailMs / frameMs);
//Linguistic Encoder
var linguisticInputs = new List<NamedOnnxValue>();
var tokens = phrase.phones
.Select(p => (Int64)phonemes.IndexOf(p.phoneme))
.Prepend((Int64)phonemes.IndexOf("SP"))
.Append((Int64)phonemes.IndexOf("SP"))
var tokens = phrase.phones.Select(p => p.phoneme)
.Prepend("SP")
.Append("SP")
.Select(x => (Int64)PhonemeTokenize(x))
.ToArray();
var ph_dur = phrase.phones
.Select(p => (int)Math.Round(p.endMs / frameMs) - (int)Math.Round(p.positionMs / frameMs))//prevent cumulative error
Expand All @@ -102,6 +108,9 @@ public VarianceResult Process(RenderPhrase phrase){
var vowelIds = Enumerable.Range(0,phrase.phones.Length)
.Where(i=>g2p.IsVowel(phrase.phones[i].phoneme))
.ToArray();
if(vowelIds.Length == 0){
vowelIds = new int[]{phrase.phones.Length-1};
}
var word_div = vowelIds.Zip(vowelIds.Skip(1),(a,b)=>(Int64)(b-a))
.Prepend(vowelIds[0] + 1)
.Append(phrase.phones.Length - vowelIds[^1] + 1)
Expand Down
12 changes: 9 additions & 3 deletions OpenUtau.Core/DiffSinger/Phonemizers/DiffSingerG2pPhonemizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -47,19 +47,25 @@ protected override IG2p LoadG2p(string rootPath) {
var g2ps = new List<IG2p>();

// Load dictionary from singer folder.
G2pDictionary.Builder g2pBuilder = new G2pDictionary.Builder();
var replacements = new Dictionary<string,string>();
foreach(var dictionaryName in dictionaryNames){
string dictionaryPath = Path.Combine(rootPath, dictionaryName);
if (File.Exists(dictionaryPath)) {
try {
g2ps.Add(G2pDictionary.NewBuilder().Load(File.ReadAllText(dictionaryPath)).Build());
replacements = G2pReplacementsData.Load(File.ReadAllText(dictionaryPath)).toDict();
string dictText = File.ReadAllText(dictionaryPath);
replacements = G2pReplacementsData.Load(dictText).toDict();
g2pBuilder.Load(dictText);
} catch (Exception e) {
Log.Error(e, $"Failed to load {dictionaryPath}");
}
break;
}
}
//SP and AP should always be treated as vowels
g2pBuilder.AddSymbol("SP", true);
g2pBuilder.AddSymbol("AP", true);
g2ps.Add(g2pBuilder.Build());

// Load base g2p.
var baseG2p = LoadBaseG2p();
Expand All @@ -83,7 +89,7 @@ protected override IG2p LoadG2p(string rootPath) {
}
}
}
g2ps.Add(new G2pRemapper(baseG2p,phonemeSymbols, replacements));
g2ps.Add(new G2pRemapper(baseG2p, phonemeSymbols, replacements));
return new G2pFallbacks(g2ps.ToArray());
}
}
Expand Down

0 comments on commit 6d706b3

Please sign in to comment.