DiffSinger: add voicing and tension variance parameters.

DsConfig gains useVoicingEmbed/useTensionEmbed and predict_energy/predict_breathiness/
predict_voicing/predict_tension. DiffSingerRenderer registers VOIC and TENC and feeds
"voicing"/"tension" tensors to the acoustic model. DsVariance.Process only sends and reads
the parameters whose predict_* flag is set and sizes the last "retake" dimension to the
number of enabled parameters.

Review changes relative to the submitted patch:
 * InvokeDiffsinger now throws InvalidOperationException when the acoustic model needs
   "voicing"/"tension" but the variance result carries null for it (the field is null
   whenever the matching predict_* flag is off).
 * Generic type arguments that were missing from the patch text (DenseTensor<T>,
   Tensor<float>, List<NamedOnnxValue>, AsTensor<float>) are written out; confirm them
   against the target files.
 * Line breaks, indentation and blank context lines were rebuilt from the hunk counts,
   so apply with whitespace tolerance and re-check the context. The last hunk drops one
   trailing context line. The index line for DiffSingerRenderer.cs no longer describes
   the modified post-image.

diff --git a/OpenUtau.Core/DiffSinger/DiffSingerConfig.cs b/OpenUtau.Core/DiffSinger/DiffSingerConfig.cs
index f75ce88dd..4843b1c5a 100644
--- a/OpenUtau.Core/DiffSinger/DiffSingerConfig.cs
+++ b/OpenUtau.Core/DiffSinger/DiffSingerConfig.cs
@@ -23,7 +23,9 @@ public class DsConfig {
         public bool useKeyShiftEmbed = false;
         public bool useSpeedEmbed = false;
         public bool useEnergyEmbed = false;
-        public bool useBreathinessEmbed= false;
+        public bool useBreathinessEmbed = false;
+        public bool useVoicingEmbed = false;
+        public bool useTensionEmbed = false;
         public AugmentationArgs augmentationArgs;
         public bool useShallowDiffusion = false;
         public int maxDepth = -1;
@@ -34,6 +36,10 @@ public class DsConfig {
         public int hop_size = 512;
         public int sample_rate = 44100;
         public bool predict_dur = true;
+        public bool predict_energy = true;
+        public bool predict_breathiness = true;
+        public bool predict_voicing = false;
+        public bool predict_tension = false;
         public bool use_expr = false;
         public bool use_note_rest = false;
         public float frameMs(){
diff --git a/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs b/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs
index fa8f60f01..dd71f66ee 100644
--- a/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs
+++ b/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs
@@ -29,6 +29,8 @@ public class DiffSingerRenderer : IRenderer {
             Format.Ustx.GENC,
             Format.Ustx.CLR,
             Format.Ustx.BREC,
+            Format.Ustx.VOIC,
+            Format.Ustx.TENC,
             VELC,
             ENE,
             PEXP,
@@ -228,9 +230,12 @@ float[] InvokeDiffsinger(RenderPhrase phrase, int depth, int speedup, Cancellati
                 acousticInputs.Add(NamedOnnxValue.CreateFromTensor("velocity", velocityTensor));
             }
 
-            //Variance: Energy and Breathiness
-
-            if(singer.dsConfig.useBreathinessEmbed || singer.dsConfig.useEnergyEmbed){
+            //Variance: Energy, Breathiness, Voicing and Tension
+            if(
+                singer.dsConfig.useBreathinessEmbed
+                || singer.dsConfig.useEnergyEmbed
+                || singer.dsConfig.useVoicingEmbed
+                || singer.dsConfig.useTensionEmbed) {
                 var variancePredictor = singer.getVariancePredictor();
                 VarianceResult varianceResult;
                 lock(variancePredictor){
@@ -266,6 +271,38 @@ float[] InvokeDiffsinger(RenderPhrase phrase, int depth, int speedup, Cancellati
                         new DenseTensor<float>(breathiness, new int[] { breathiness.Length })
                         .Reshape(new int[] { 1, breathiness.Length })));
                 }
+                if(singer.dsConfig.useVoicingEmbed){
+                    // "voicing" is only predicted when predict_voicing is set; fail with a
+                    // clear message instead of passing a null curve to ResampleCurve.
+                    if(varianceResult.voicing == null){
+                        throw new InvalidOperationException(
+                            "The parameter \"voicing\" required by the acoustic model is missing from the variance predictions.");
+                    }
+                    var userVoicing = DiffSingerUtils.SampleCurve(phrase, phrase.voicing,
+                        0, frameMs, totalFrames, headFrames, tailFrames,
+                        x => x);
+                    var predictedVoicing = DiffSingerUtils.ResampleCurve(varianceResult.voicing, totalFrames);
+                    var voicing = predictedVoicing.Zip(userVoicing, (x,y)=>(float)Math.Min(x + (y-100)*12/100, 0)).ToArray();
+                    acousticInputs.Add(NamedOnnxValue.CreateFromTensor("voicing",
+                        new DenseTensor<float>(voicing, new int[] { voicing.Length })
+                        .Reshape(new int[] { 1, voicing.Length })));
+                }
+                if(singer.dsConfig.useTensionEmbed){
+                    // "tension" is only predicted when predict_tension is set; fail with a
+                    // clear message instead of passing a null curve to ResampleCurve.
+                    if(varianceResult.tension == null){
+                        throw new InvalidOperationException(
+                            "The parameter \"tension\" required by the acoustic model is missing from the variance predictions.");
+                    }
+                    var userTension = DiffSingerUtils.SampleCurve(phrase, phrase.tension,
+                        0, frameMs, totalFrames, headFrames, tailFrames,
+                        x => x);
+                    var predictedTension = DiffSingerUtils.ResampleCurve(varianceResult.tension, totalFrames);
+                    var tension = predictedTension.Zip(userTension, (x,y)=>(float)(x + y * 5 / 100)).ToArray();
+                    acousticInputs.Add(NamedOnnxValue.CreateFromTensor("tension",
+                        new DenseTensor<float>(tension, new int[] { tension.Length })
+                        .Reshape(new int[] { 1, tension.Length })));
+                }
             }
             Tensor<float> mel;
             lock(acousticModel){
diff --git a/OpenUtau.Core/DiffSinger/DiffSingerVariance.cs b/OpenUtau.Core/DiffSinger/DiffSingerVariance.cs
index 95f32e619..017d04ea2 100644
--- a/OpenUtau.Core/DiffSinger/DiffSingerVariance.cs
+++ b/OpenUtau.Core/DiffSinger/DiffSingerVariance.cs
@@ -14,8 +14,10 @@ namespace OpenUtau.Core.DiffSinger{
 
 
     public struct VarianceResult{
-        public float[] energy;
-        public float[] breathiness;
+        public float[]? energy;
+        public float[]? breathiness;
+        public float[]? voicing;
+        public float[]? tension;
     }
     public class DsVariance : IDisposable{
         string rootPath;
@@ -127,9 +129,6 @@ public VarianceResult Process(RenderPhrase phrase){
             var pitch = DiffSingerUtils.SampleCurve(phrase, phrase.pitches, 0, frameMs, totalFrames, headFrames, tailFrames,
                 x => x * 0.01)
                 .Select(f => (float)f).ToArray();
-            var energy = Enumerable.Repeat(0f, totalFrames).ToArray();
-            var breathiness = Enumerable.Repeat(0f, totalFrames).ToArray();
-            var retake = Enumerable.Repeat(true, totalFrames*2).ToArray();
             var speedup = Preferences.Default.DiffsingerSpeedup;
 
             var varianceInputs = new List<NamedOnnxValue>();
@@ -140,15 +139,41 @@ public VarianceResult Process(RenderPhrase phrase){
             varianceInputs.Add(NamedOnnxValue.CreateFromTensor("pitch",
                 new DenseTensor<float>(pitch, new int[] { pitch.Length }, false)
                 .Reshape(new int[] { 1, totalFrames })));
-            varianceInputs.Add(NamedOnnxValue.CreateFromTensor("energy",
-                new DenseTensor<float>(energy, new int[] { energy.Length }, false)
-                .Reshape(new int[] { 1, totalFrames })));
-            varianceInputs.Add(NamedOnnxValue.CreateFromTensor("breathiness",
-                new DenseTensor<float>(breathiness, new int[] { breathiness.Length }, false)
-                .Reshape(new int[] { 1, totalFrames })));
+            if (dsConfig.predict_energy) {
+                var energy = Enumerable.Repeat(0f, totalFrames).ToArray();
+                varianceInputs.Add(NamedOnnxValue.CreateFromTensor("energy",
+                    new DenseTensor<float>(energy, new int[] { energy.Length }, false)
+                    .Reshape(new int[] { 1, totalFrames })));
+            }
+            if (dsConfig.predict_breathiness) {
+                var breathiness = Enumerable.Repeat(0f, totalFrames).ToArray();
+                varianceInputs.Add(NamedOnnxValue.CreateFromTensor("breathiness",
+                    new DenseTensor<float>(breathiness, new int[] { breathiness.Length }, false)
+                    .Reshape(new int[] { 1, totalFrames })));
+            }
+            if (dsConfig.predict_voicing) {
+                var voicing = Enumerable.Repeat(0f, totalFrames).ToArray();
+                varianceInputs.Add(NamedOnnxValue.CreateFromTensor("voicing",
+                    new DenseTensor<float>(voicing, new int[] { voicing.Length }, false)
+                    .Reshape(new int[] { 1, totalFrames })));
+            }
+            if (dsConfig.predict_tension) {
+                var tension = Enumerable.Repeat(0f, totalFrames).ToArray();
+                varianceInputs.Add(NamedOnnxValue.CreateFromTensor("tension",
+                    new DenseTensor<float>(tension, new int[] { tension.Length }, false)
+                    .Reshape(new int[] { 1, totalFrames })));
+            }
+
+            var numVariances = new[] {
+                dsConfig.predict_energy,
+                dsConfig.predict_breathiness,
+                dsConfig.predict_voicing,
+                dsConfig.predict_tension,
+            }.Sum(Convert.ToInt32);
+            var retake = Enumerable.Repeat(true, totalFrames * numVariances).ToArray();
             varianceInputs.Add(NamedOnnxValue.CreateFromTensor("retake",
                 new DenseTensor<bool>(retake, new int[] { retake.Length }, false)
-                .Reshape(new int[] { 1, totalFrames, 2 })));
+                .Reshape(new int[] { 1, totalFrames, numVariances })));
             varianceInputs.Add(NamedOnnxValue.CreateFromTensor("speedup",
                 new DenseTensor<long>(new long[] { speedup }, new int[] { 1 },false)));
             //Speaker
@@ -159,16 +184,34 @@ public VarianceResult Process(RenderPhrase phrase){
             }
             Onnx.VerifyInputNames(varianceModel, varianceInputs);
             var varianceOutputs = varianceModel.Run(varianceInputs);
-            Tensor<float> energy_pred = varianceOutputs
-                .Where(o => o.Name == "energy_pred")
-                .First()
-                .AsTensor<float>();
-            Tensor<float> breathiness_pred = varianceOutputs
-                .Where(o => o.Name == "breathiness_pred")
-                .First()
-                .AsTensor<float>();
+            Tensor<float>? energy_pred = dsConfig.predict_energy
+                ? varianceOutputs
+                    .Where(o => o.Name == "energy_pred")
+                    .First()
+                    .AsTensor<float>()
+                : null;
+            Tensor<float>? breathiness_pred = dsConfig.predict_breathiness
+                ? varianceOutputs
+                    .Where(o => o.Name == "breathiness_pred")
+                    .First()
+                    .AsTensor<float>()
+                : null;
+            Tensor<float>? voicing_pred = dsConfig.predict_voicing
+                ? varianceOutputs
+                    .Where(o => o.Name == "voicing_pred")
+                    .First()
+                    .AsTensor<float>()
+                : null;
+            Tensor<float>? tension_pred = dsConfig.predict_tension
+                ? varianceOutputs
+                    .Where(o => o.Name == "tension_pred")
+                    .First()
+                    .AsTensor<float>()
+                : null;
             return new VarianceResult{
-                energy = energy_pred.ToArray(),
-                breathiness = breathiness_pred.ToArray()
+                energy = energy_pred?.ToArray(),
+                breathiness = breathiness_pred?.ToArray(),
+                voicing = voicing_pred?.ToArray(),
+                tension = tension_pred?.ToArray(),
             };
         }