Skip to content

Commit

Permalink
Support voicing as VOIC
Browse files: browse the repository at this point in the history
  • Loading branch information
yqzhishen committed Feb 18, 2024
1 parent b093198 commit ca68af3
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 2 deletions.
2 changes: 2 additions & 0 deletions OpenUtau.Core/DiffSinger/DiffSingerConfig.cs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ public class DsConfig {
public bool useSpeedEmbed = false;
public bool useEnergyEmbed = false;
public bool useBreathinessEmbed = false;
public bool useVoicingEmbed = false;
public bool useTensionEmbed = false;
public AugmentationArgs augmentationArgs;
public bool useShallowDiffusion = false;
Expand All @@ -37,6 +38,7 @@ public class DsConfig {
public bool predict_dur = true;
public bool predict_energy = true;
public bool predict_breathiness = true;
public bool predict_voicing = false;
public bool predict_tension = false;
public bool use_expr = false;
public bool use_note_rest = false;
Expand Down
19 changes: 17 additions & 2 deletions OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ public class DiffSingerRenderer : IRenderer {
Format.Ustx.GENC,
Format.Ustx.CLR,
Format.Ustx.BREC,
Format.Ustx.VOIC,
Format.Ustx.TENC,
VELC,
ENE,
Expand Down Expand Up @@ -229,8 +230,12 @@ float[] InvokeDiffsinger(RenderPhrase phrase, int depth, int speedup, Cancellati
acousticInputs.Add(NamedOnnxValue.CreateFromTensor("velocity", velocityTensor));
}

//Variance: Energy, Breathiness and Tension
if(singer.dsConfig.useBreathinessEmbed || singer.dsConfig.useEnergyEmbed || singer.dsConfig.useTensionEmbed){
//Variance: Energy, Breathiness, Voicing and Tension
if(
singer.dsConfig.useBreathinessEmbed
|| singer.dsConfig.useEnergyEmbed
|| singer.dsConfig.useVoicingEmbed
|| singer.dsConfig.useTensionEmbed) {
var variancePredictor = singer.getVariancePredictor();
VarianceResult varianceResult;
lock(variancePredictor){
Expand Down Expand Up @@ -266,6 +271,16 @@ float[] InvokeDiffsinger(RenderPhrase phrase, int depth, int speedup, Cancellati
new DenseTensor<float>(breathiness, new int[] { breathiness.Length })
.Reshape(new int[] { 1, breathiness.Length })));
}
if(singer.dsConfig.useVoicingEmbed){
var userVoicing = DiffSingerUtils.SampleCurve(phrase, phrase.voicing,
0, frameMs, totalFrames, headFrames, tailFrames,
x => x);
var predictedVoicing = DiffSingerUtils.ResampleCurve(varianceResult.voicing, totalFrames);
var voicing = predictedVoicing.Zip(userVoicing, (x,y)=>(float)Math.Min(x + (y-100)*12/100, 0)).ToArray();
acousticInputs.Add(NamedOnnxValue.CreateFromTensor("voicing",
new DenseTensor<float>(voicing, new int[] { voicing.Length })
.Reshape(new int[] { 1, voicing.Length })));
}
if(singer.dsConfig.useTensionEmbed){
var userTension = DiffSingerUtils.SampleCurve(phrase, phrase.tension,
0, frameMs, totalFrames, headFrames, tailFrames,
Expand Down
15 changes: 15 additions & 0 deletions OpenUtau.Core/DiffSinger/DiffSingerVariance.cs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ namespace OpenUtau.Core.DiffSinger{
/// <summary>
/// Bundle of variance curves returned by <c>Process</c>.
/// Each field is filled from the matching <c>*_pred</c> tensor via <c>?.ToArray()</c>,
/// so a field is <c>null</c> whenever its tensor was not obtained.
/// Callers should null-check a field before using it.
/// </summary>
public struct VarianceResult{
/// <summary>Predicted energy values; may be null (presumably when <c>predict_energy</c> is off — confirm in <c>Process</c>).</summary>
public float[]? energy;
/// <summary>Predicted breathiness values; may be null (presumably when <c>predict_breathiness</c> is off — confirm in <c>Process</c>).</summary>
public float[]? breathiness;
/// <summary>Values of the <c>voicing_pred</c> output; null when <c>predict_voicing</c> is false.</summary>
public float[]? voicing;
/// <summary>Values of the <c>tension_pred</c> output; may be null (presumably when <c>predict_tension</c> is off — confirm in <c>Process</c>).</summary>
public float[]? tension;
}
public class DsVariance : IDisposable{
Expand Down Expand Up @@ -150,6 +151,12 @@ public VarianceResult Process(RenderPhrase phrase){
new DenseTensor<float>(breathiness, new int[] { breathiness.Length }, false)
.Reshape(new int[] { 1, totalFrames })));
}
if (dsConfig.predict_voicing) {
var voicing = Enumerable.Repeat(0f, totalFrames).ToArray();
varianceInputs.Add(NamedOnnxValue.CreateFromTensor("voicing",
new DenseTensor<float>(voicing, new int[] { voicing.Length }, false)
.Reshape(new int[] { 1, totalFrames })));
}
if (dsConfig.predict_tension) {
var tension = Enumerable.Repeat(0f, totalFrames).ToArray();
varianceInputs.Add(NamedOnnxValue.CreateFromTensor("tension",
Expand All @@ -160,6 +167,7 @@ public VarianceResult Process(RenderPhrase phrase){
var numVariances = new[] {
dsConfig.predict_energy,
dsConfig.predict_breathiness,
dsConfig.predict_voicing,
dsConfig.predict_tension,
}.Sum(Convert.ToInt32);
var retake = Enumerable.Repeat(true, totalFrames * numVariances).ToArray();
Expand Down Expand Up @@ -188,6 +196,12 @@ public VarianceResult Process(RenderPhrase phrase){
.First()
.AsTensor<float>()
: null;
Tensor<float>? voicing_pred = dsConfig.predict_voicing
? varianceOutputs
.Where(o => o.Name == "voicing_pred")
.First()
.AsTensor<float>()
: null;
Tensor<float>? tension_pred = dsConfig.predict_tension
? varianceOutputs
.Where(o => o.Name == "tension_pred")
Expand All @@ -197,6 +211,7 @@ public VarianceResult Process(RenderPhrase phrase){
return new VarianceResult{
energy = energy_pred?.ToArray(),
breathiness = breathiness_pred?.ToArray(),
voicing = voicing_pred?.ToArray(),
tension = tension_pred?.ToArray(),
};
}
Expand Down

0 comments on commit ca68af3

Please sign in to comment.