Enhance error message for phoneme not in phoneme list or pitch model not found
stakira · May 9, 2024 · d5165f2 · d5165f2
1 parent bd73561
commit d5165f2
Show file tree

Hide file tree

Showing 5 changed files with 45 additions and 11 deletions.
diff --git a/OpenUtau.Core/DiffSinger/DiffSingerBasePhonemizer.cs b/OpenUtau.Core/DiffSinger/DiffSingerBasePhonemizer.cs
@@ -211,6 +211,14 @@ public DiffSingerSpeakerEmbedManager getSpeakerEmbedManager(){
             }
             return speakerEmbedManager;
         }
+
+        // Maps a phoneme symbol to its token id, i.e. its index in the `phonemes` list.
+        // Throws an Exception naming the symbol and the phoneme list file
+        // when the symbol is not present in the list.
+        int PhonemeTokenize(string phoneme){
+            int index = phonemes.IndexOf(phoneme);
+            if(index >= 0){
+                return index;
+            }
+            // IndexOf returned a negative value: the symbol is not in the list.
+            throw new Exception($"Phoneme \"{phoneme}\" isn't supported by timing model. Please check {Path.Combine(rootPath, dsConfig.phonemes)}");
+        }
 
         protected override void ProcessPart(Note[][] phrase) {
             float padding = 500f;//Padding time for consonants at the beginning of a sentence, ms
@@ -247,7 +255,7 @@ protected override void ProcessPart(Note[][] phrase) {
             //Linguistic Encoder
             var tokens = phrasePhonemes
                 .SelectMany(n => n.Phonemes)
-                .Select(p => (Int64)phonemes.IndexOf(p.Symbol))
+                .Select(p => (Int64)PhonemeTokenize(p.Symbol))
                 .ToArray();
             var word_div = phrasePhonemes.Take(phrasePhonemes.Count-1)
                 .Select(n => (Int64)n.Phonemes.Count)

diff --git a/OpenUtau.Core/DiffSinger/DiffSingerPitch.cs b/OpenUtau.Core/DiffSinger/DiffSingerPitch.cs
@@ -36,6 +36,9 @@ public DsPitch(string rootPath)
             dsConfig = Core.Yaml.DefaultDeserializer.Deserialize<DsConfig>(
                 File.ReadAllText(Path.Combine(rootPath, "dsconfig.yaml"),
                     System.Text.Encoding.UTF8));
+            if(dsConfig.pitch == null){
+                throw new Exception("This voicebank doesn't contain a pitch model");
+            }
             //Load phonemes list
             string phonemesPath = Path.Combine(rootPath, dsConfig.phonemes);
             phonemes = File.ReadLines(phonemesPath, Encoding.UTF8).ToList();
@@ -77,6 +80,14 @@ void SetRange<T>(T[] list, T value, int startIndex, int endIndex){
                 list[i] = value;
             }
         }
+
+        // Maps a phoneme symbol to its token id, i.e. its index in the `phonemes` list.
+        // Throws an Exception naming the symbol and the phoneme list file
+        // when the symbol is not present in the list.
+        int PhonemeTokenize(string phoneme){
+            int index = phonemes.IndexOf(phoneme);
+            if(index >= 0){
+                return index;
+            }
+            // IndexOf returned a negative value: the symbol is not in the list.
+            throw new Exception($"Phoneme \"{phoneme}\" isn't supported by pitch model. Please check {Path.Combine(rootPath, dsConfig.phonemes)}");
+        }
 
         public RenderPitchResult Process(RenderPhrase phrase){
             var startMs = Math.Min(phrase.notes[0].positionMs, phrase.phones[0].positionMs) - headMs;
@@ -86,9 +97,10 @@ public RenderPitchResult Process(RenderPhrase phrase){
             //Linguistic Encoder
             var linguisticInputs = new List<NamedOnnxValue>();
             var tokens = phrase.phones
-                .Select(p => (Int64)phonemes.IndexOf(p.phoneme))
-                .Prepend((Int64)phonemes.IndexOf("SP"))
-                .Append((Int64)phonemes.IndexOf("SP"))
+                .Select(p => p.phoneme)
+                .Prepend("SP")
+                .Append("SP")
+                .Select(x => (Int64)PhonemeTokenize(x))
                 .ToArray();
             var ph_dur = phrase.phones
                 .Select(p=>(int)Math.Round(p.endMs/frameMs) - (int)Math.Round(p.positionMs/frameMs))

diff --git a/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs b/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs
@@ -202,13 +202,13 @@ float[] InvokeDiffsinger(RenderPhrase phrase, double depth, int steps, Cancellat
             //mel = session.run(['mel'], {'tokens': tokens, 'durations': durations, 'f0': f0, 'speedup': speedup})[0]
             //tokens: phoneme index in the phoneme set
             //durations: phoneme duration in frames
-            //f0: pitch curve in Hz by frame
+            //f0: pitch curve in Hz by frame 
             //speedup: Diffusion render speedup, int
             var tokens = phrase.phones
                 .Select(p => p.phoneme)
                 .Prepend("SP")
                 .Append("SP")
-                .Select(x => (long)(singer.phonemes.IndexOf(x)))
+                .Select(phoneme => (Int64)singer.PhonemeTokenize(phoneme))
                 .ToList();
             var durations = phrase.phones
                 .Select(p => (int)Math.Round(p.endMs / frameMs) - (int)Math.Round(p.positionMs / frameMs))//prevent cumulative error

diff --git a/OpenUtau.Core/DiffSinger/DiffSingerSinger.cs b/OpenUtau.Core/DiffSinger/DiffSingerSinger.cs
@@ -8,7 +8,6 @@
 using OpenUtau.Core.Ustx;
 using Serilog;
 using Microsoft.ML.OnnxRuntime;
-using NumSharp;
 
 namespace OpenUtau.Core.DiffSinger {
     class DiffSingerSinger : USinger {
@@ -194,6 +193,14 @@ public DsVariance getVariancePredictor(){
             return variancePredictor;
         }
 
+        // Maps a phoneme symbol to its token id, i.e. its index in the `phonemes` list.
+        // Throws an Exception naming the symbol and the phoneme list file
+        // when the symbol is not present in the list.
+        public int PhonemeTokenize(string phoneme){
+            int index = phonemes.IndexOf(phoneme);
+            if(index >= 0){
+                return index;
+            }
+            // IndexOf returned a negative value: the symbol is not in the list.
+            throw new Exception($"Phoneme \"{phoneme}\" isn't supported by acoustic model. Please check {Path.Combine(Location, dsConfig.phonemes)}");
+        }
+
         public override void FreeMemory(){
             Log.Information($"Freeing memory for singer {Id}");
             if(acousticSession != null) {

diff --git a/OpenUtau.Core/DiffSinger/DiffSingerVariance.cs b/OpenUtau.Core/DiffSinger/DiffSingerVariance.cs
@@ -78,15 +78,22 @@ public DiffSingerSpeakerEmbedManager getSpeakerEmbedManager(){
             return speakerEmbedManager;
         }
 
+        // Maps a phoneme symbol to its token id, i.e. its index in the `phonemes` list.
+        // Throws an Exception naming the symbol and the phoneme list file
+        // when the symbol is not present in the list.
+        int PhonemeTokenize(string phoneme){
+            int index = phonemes.IndexOf(phoneme);
+            if(index >= 0){
+                return index;
+            }
+            // IndexOf returned a negative value: the symbol is not in the list.
+            throw new Exception($"Phoneme \"{phoneme}\" isn't supported by variance model. Please check {Path.Combine(rootPath, dsConfig.phonemes)}");
+        }
         public VarianceResult Process(RenderPhrase phrase){
             int headFrames = (int)Math.Round(headMs / frameMs);
             int tailFrames = (int)Math.Round(tailMs / frameMs);
             //Linguistic Encoder
             var linguisticInputs = new List<NamedOnnxValue>();
-            var tokens = phrase.phones
-                .Select(p => (Int64)phonemes.IndexOf(p.phoneme))
-                .Prepend((Int64)phonemes.IndexOf("SP"))
-                .Append((Int64)phonemes.IndexOf("SP"))
+            var tokens = phrase.phones.Select(p => p.phoneme)
+                .Prepend("SP")
+                .Append("SP")
+                .Select(x => (Int64)PhonemeTokenize(x))
                 .ToArray();
             var ph_dur = phrase.phones
                 .Select(p => (int)Math.Round(p.endMs / frameMs) - (int)Math.Round(p.positionMs / frameMs))//prevent cumulative error