Skip to content

Commit

Permalink
Merge pull request #1045 from yqzhishen/tension
Browse files (browse the repository at this point in the history)
[DiffSinger] Add support for tension and voicing
  • Loading branch information
stakira authored Feb 25, 2024
2 parents 73dcd7f + ca68af3 commit 8ed13f8
Show file tree
Hide file tree
Showing 3 changed files with 100 additions and 26 deletions.
8 changes: 7 additions & 1 deletion OpenUtau.Core/DiffSinger/DiffSingerConfig.cs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@ public class DsConfig {
public bool useKeyShiftEmbed = false;
public bool useSpeedEmbed = false;
public bool useEnergyEmbed = false;
public bool useBreathinessEmbed= false;
public bool useBreathinessEmbed = false;
public bool useVoicingEmbed = false;
public bool useTensionEmbed = false;
public AugmentationArgs augmentationArgs;
public bool useShallowDiffusion = false;
public int maxDepth = -1;
Expand All @@ -34,6 +36,10 @@ public class DsConfig {
public int hop_size = 512;
public int sample_rate = 44100;
public bool predict_dur = true;
public bool predict_energy = true;
public bool predict_breathiness = true;
public bool predict_voicing = false;
public bool predict_tension = false;
public bool use_expr = false;
public bool use_note_rest = false;
public float frameMs(){
Expand Down
31 changes: 28 additions & 3 deletions OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ public class DiffSingerRenderer : IRenderer {
Format.Ustx.GENC,
Format.Ustx.CLR,
Format.Ustx.BREC,
Format.Ustx.VOIC,
Format.Ustx.TENC,
VELC,
ENE,
PEXP,
Expand Down Expand Up @@ -228,9 +230,12 @@ float[] InvokeDiffsinger(RenderPhrase phrase, int depth, int speedup, Cancellati
acousticInputs.Add(NamedOnnxValue.CreateFromTensor("velocity", velocityTensor));
}

//Variance: Energy and Breathiness

if(singer.dsConfig.useBreathinessEmbed || singer.dsConfig.useEnergyEmbed){
//Variance: Energy, Breathiness, Voicing and Tension
if(
singer.dsConfig.useBreathinessEmbed
|| singer.dsConfig.useEnergyEmbed
|| singer.dsConfig.useVoicingEmbed
|| singer.dsConfig.useTensionEmbed) {
var variancePredictor = singer.getVariancePredictor();
VarianceResult varianceResult;
lock(variancePredictor){
Expand Down Expand Up @@ -266,6 +271,26 @@ float[] InvokeDiffsinger(RenderPhrase phrase, int depth, int speedup, Cancellati
new DenseTensor<float>(breathiness, new int[] { breathiness.Length })
.Reshape(new int[] { 1, breathiness.Length })));
}
if(singer.dsConfig.useVoicingEmbed){
var userVoicing = DiffSingerUtils.SampleCurve(phrase, phrase.voicing,
0, frameMs, totalFrames, headFrames, tailFrames,
x => x);
var predictedVoicing = DiffSingerUtils.ResampleCurve(varianceResult.voicing, totalFrames);
var voicing = predictedVoicing.Zip(userVoicing, (x,y)=>(float)Math.Min(x + (y-100)*12/100, 0)).ToArray();
acousticInputs.Add(NamedOnnxValue.CreateFromTensor("voicing",
new DenseTensor<float>(voicing, new int[] { voicing.Length })
.Reshape(new int[] { 1, voicing.Length })));
}
if(singer.dsConfig.useTensionEmbed){
var userTension = DiffSingerUtils.SampleCurve(phrase, phrase.tension,
0, frameMs, totalFrames, headFrames, tailFrames,
x => x);
var predictedTension = DiffSingerUtils.ResampleCurve(varianceResult.tension, totalFrames);
var tension = predictedTension.Zip(userTension, (x,y)=>(float)(x + y * 5 / 100)).ToArray();
acousticInputs.Add(NamedOnnxValue.CreateFromTensor("tension",
new DenseTensor<float>(tension, new int[] { tension.Length })
.Reshape(new int[] { 1, tension.Length })));
}
}
Tensor<float> mel;
lock(acousticModel){
Expand Down
87 changes: 65 additions & 22 deletions OpenUtau.Core/DiffSinger/DiffSingerVariance.cs
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,10 @@

namespace OpenUtau.Core.DiffSinger{
public struct VarianceResult{
public float[] energy;
public float[] breathiness;
public float[]? energy;
public float[]? breathiness;
public float[]? voicing;
public float[]? tension;
}
public class DsVariance : IDisposable{
string rootPath;
Expand Down Expand Up @@ -127,9 +129,6 @@ public VarianceResult Process(RenderPhrase phrase){
var pitch = DiffSingerUtils.SampleCurve(phrase, phrase.pitches, 0, frameMs, totalFrames, headFrames, tailFrames,
x => x * 0.01)
.Select(f => (float)f).ToArray();
var energy = Enumerable.Repeat(0f, totalFrames).ToArray();
var breathiness = Enumerable.Repeat(0f, totalFrames).ToArray();
var retake = Enumerable.Repeat(true, totalFrames*2).ToArray();
var speedup = Preferences.Default.DiffsingerSpeedup;

var varianceInputs = new List<NamedOnnxValue>();
Expand All @@ -140,15 +139,41 @@ public VarianceResult Process(RenderPhrase phrase){
varianceInputs.Add(NamedOnnxValue.CreateFromTensor("pitch",
new DenseTensor<float>(pitch, new int[] { pitch.Length }, false)
.Reshape(new int[] { 1, totalFrames })));
varianceInputs.Add(NamedOnnxValue.CreateFromTensor("energy",
new DenseTensor<float>(energy, new int[] { energy.Length }, false)
.Reshape(new int[] { 1, totalFrames })));
varianceInputs.Add(NamedOnnxValue.CreateFromTensor("breathiness",
new DenseTensor<float>(breathiness, new int[] { breathiness.Length }, false)
.Reshape(new int[] { 1, totalFrames })));
if (dsConfig.predict_energy) {
var energy = Enumerable.Repeat(0f, totalFrames).ToArray();
varianceInputs.Add(NamedOnnxValue.CreateFromTensor("energy",
new DenseTensor<float>(energy, new int[] { energy.Length }, false)
.Reshape(new int[] { 1, totalFrames })));
}
if (dsConfig.predict_breathiness) {
var breathiness = Enumerable.Repeat(0f, totalFrames).ToArray();
varianceInputs.Add(NamedOnnxValue.CreateFromTensor("breathiness",
new DenseTensor<float>(breathiness, new int[] { breathiness.Length }, false)
.Reshape(new int[] { 1, totalFrames })));
}
if (dsConfig.predict_voicing) {
var voicing = Enumerable.Repeat(0f, totalFrames).ToArray();
varianceInputs.Add(NamedOnnxValue.CreateFromTensor("voicing",
new DenseTensor<float>(voicing, new int[] { voicing.Length }, false)
.Reshape(new int[] { 1, totalFrames })));
}
if (dsConfig.predict_tension) {
var tension = Enumerable.Repeat(0f, totalFrames).ToArray();
varianceInputs.Add(NamedOnnxValue.CreateFromTensor("tension",
new DenseTensor<float>(tension, new int[] { tension.Length }, false)
.Reshape(new int[] { 1, totalFrames })));
}

var numVariances = new[] {
dsConfig.predict_energy,
dsConfig.predict_breathiness,
dsConfig.predict_voicing,
dsConfig.predict_tension,
}.Sum(Convert.ToInt32);
var retake = Enumerable.Repeat(true, totalFrames * numVariances).ToArray();
varianceInputs.Add(NamedOnnxValue.CreateFromTensor("retake",
new DenseTensor<bool>(retake, new int[] { retake.Length }, false)
.Reshape(new int[] { 1, totalFrames, 2 })));
.Reshape(new int[] { 1, totalFrames, numVariances })));
varianceInputs.Add(NamedOnnxValue.CreateFromTensor("speedup",
new DenseTensor<long>(new long[] { speedup }, new int[] { 1 },false)));
//Speaker
Expand All @@ -159,17 +184,35 @@ public VarianceResult Process(RenderPhrase phrase){
}
Onnx.VerifyInputNames(varianceModel, varianceInputs);
var varianceOutputs = varianceModel.Run(varianceInputs);
Tensor<float> energy_pred = varianceOutputs
.Where(o => o.Name == "energy_pred")
.First()
.AsTensor<float>();
Tensor<float> breathiness_pred = varianceOutputs
.Where(o => o.Name == "breathiness_pred")
.First()
.AsTensor<float>();
Tensor<float>? energy_pred = dsConfig.predict_energy
? varianceOutputs
.Where(o => o.Name == "energy_pred")
.First()
.AsTensor<float>()
: null;
Tensor<float>? breathiness_pred = dsConfig.predict_breathiness
? varianceOutputs
.Where(o => o.Name == "breathiness_pred")
.First()
.AsTensor<float>()
: null;
Tensor<float>? voicing_pred = dsConfig.predict_voicing
? varianceOutputs
.Where(o => o.Name == "voicing_pred")
.First()
.AsTensor<float>()
: null;
Tensor<float>? tension_pred = dsConfig.predict_tension
? varianceOutputs
.Where(o => o.Name == "tension_pred")
.First()
.AsTensor<float>()
: null;
return new VarianceResult{
energy = energy_pred.ToArray(),
breathiness = breathiness_pred.ToArray()
energy = energy_pred?.ToArray(),
breathiness = breathiness_pred?.ToArray(),
voicing = voicing_pred?.ToArray(),
tension = tension_pred?.ToArray(),
};
}

Expand Down

0 comments on commit 8ed13f8

Please sign in to comment.