Skip to content

Commit

Permalink
Merge pull request #71 from oxygen-dioxide/diffsinger
Browse files Browse the repository at this point in the history
Diffsinger
  • Loading branch information
oxygen-dioxide authored Oct 20, 2023
2 parents 66bf6a1 + 4def4ca commit ca9a71b
Show file tree
Hide file tree
Showing 18 changed files with 264 additions and 91 deletions.
14 changes: 7 additions & 7 deletions OpenUtau.Core/Classic/ClassicRenderer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -47,15 +47,15 @@ public RenderResult Layout(RenderPhrase phrase) {
};
}

public Task<RenderResult> Render(RenderPhrase phrase, Progress progress, CancellationTokenSource cancellation, bool isPreRender) {
public Task<RenderResult> Render(RenderPhrase phrase, Progress progress, int trackNo, CancellationTokenSource cancellation, bool isPreRender) {
if (phrase.wavtool == SharpWavtool.nameConvergence || phrase.wavtool == SharpWavtool.nameSimple) {
return RenderInternal(phrase, progress, cancellation, isPreRender);
return RenderInternal(phrase, progress, trackNo, cancellation, isPreRender);
} else {
return RenderExternal(phrase, progress, cancellation, isPreRender);
return RenderExternal(phrase, progress, trackNo, cancellation, isPreRender);
}
}

public Task<RenderResult> RenderInternal(RenderPhrase phrase, Progress progress, CancellationTokenSource cancellation, bool isPreRender) {
public Task<RenderResult> RenderInternal(RenderPhrase phrase, Progress progress, int trackNo, CancellationTokenSource cancellation, bool isPreRender) {
var resamplerItems = new List<ResamplerItem>();
foreach (var phone in phrase.phones) {
resamplerItems.Add(new ResamplerItem(phrase, phone));
Expand All @@ -78,7 +78,7 @@ public Task<RenderResult> RenderInternal(RenderPhrase phrase, Progress progress,
VoicebankFiles.Inst.CopyBackMetaFiles(item.inputFile, item.inputTemp);
}
}
progress.Complete(1, $"{item.resampler} \"{item.phone.phoneme}\"");
progress.Complete(1, $"Track {trackNo}: {item.resampler} \"{item.phone.phoneme}\"");
});
var result = Layout(phrase);
var wavtool = new SharpWavtool(true);
Expand All @@ -91,13 +91,13 @@ public Task<RenderResult> RenderInternal(RenderPhrase phrase, Progress progress,
return task;
}

public Task<RenderResult> RenderExternal(RenderPhrase phrase, Progress progress, CancellationTokenSource cancellation, bool isPreRender) {
public Task<RenderResult> RenderExternal(RenderPhrase phrase, Progress progress, int trackNo, CancellationTokenSource cancellation, bool isPreRender) {
var resamplerItems = new List<ResamplerItem>();
foreach (var phone in phrase.phones) {
resamplerItems.Add(new ResamplerItem(phrase, phone));
}
var task = Task.Run(() => {
string progressInfo = $"{phrase.wavtool} \"{string.Join(" ", phrase.phones.Select(p => p.phoneme))}\"";
string progressInfo = $"Track {trackNo} : {phrase.wavtool} \"{string.Join(" ", phrase.phones.Select(p => p.phoneme))}\"";
progress.Complete(0, progressInfo);
var wavPath = Path.Join(PathManager.Inst.CachePath, $"cat-{phrase.hash:x16}.wav");
var result = Layout(phrase);
Expand Down
44 changes: 23 additions & 21 deletions OpenUtau.Core/Classic/ResamplerItem.cs
Original file line number Diff line number Diff line change
Expand Up @@ -74,27 +74,29 @@ public ResamplerItem(RenderPhrase phrase, RenderPhone phone) {
pitchCount = Math.Max(pitchCount, 0);
pitches = new int[pitchCount];

double phoneStartMs = phone.positionMs - pitchLeadingMs;
double phraseStartMs = phrase.positionMs - phrase.leadingMs;
for (int i = 0; i < phone.tempos.Length; i++) {
double startMs = Math.Max(phrase.timeAxis.TickPosToMsPos(phone.tempos[i].position), phoneStartMs);
double endMs = i + 1 < phone.tempos.Length ? phrase.timeAxis.TickPosToMsPos(phone.tempos[i + 1].position) : phone.positionMs + phone.envelope[4].X;
double durationMs = endMs - startMs;
int tempoPitchCount = (int)Math.Floor(MusicMath.TempoMsToTick(tempo, durationMs) / 5.0);
int tempoPitchSkip = (int)Math.Floor(MusicMath.TempoMsToTick(tempo, startMs - phoneStartMs) / 5.0);
tempoPitchCount = Math.Min(tempoPitchCount, pitches.Length - tempoPitchSkip);
int phrasePitchSkip = (int)Math.Floor(phrase.timeAxis.TicksBetweenMsPos(phraseStartMs, startMs) / 5.0);
double intervalPitchMs = 120 / tempo * 500 / 480 * 5;
double diffPitchMs = startMs - phraseStartMs - phrase.timeAxis.TickPosToMsPos(phrasePitchSkip * 5);
double tempoRatio = phone.tempos[i].bpm / tempo;
for (int j = 0; j < tempoPitchCount; j++) {
int index = tempoPitchSkip + j;
int scaled = phrasePitchSkip + (int)Math.Ceiling(j * tempoRatio);
scaled = Math.Clamp(scaled, 0, phrase.pitches.Length - 1);
int nextScaled = Math.Clamp(scaled + 1, 0, phrase.pitches.Length - 1);
index = Math.Clamp(index, 0, pitchCount - 1);
pitches[index] = (int)Math.Round((phrase.pitches[nextScaled]- phrase.pitches[scaled]) /intervalPitchMs * diffPitchMs + phrase.pitches[scaled] - phone.tone * 100);
}
var phrasePitchStartMs = phrase.positionMs - phrase.leadingMs;
var phrasePitchStartTick = (int)Math.Floor(phrase.timeAxis.MsPosToNonExactTickPos(phrasePitchStartMs));

var pitchIntervalMs = MusicMath.TempoTickToMs(tempo, 5);
var pitchSampleStartMs = phone.positionMs - pitchLeadingMs;

for (int i=0; i<pitches.Length; i++) {
var samplePosMs = pitchSampleStartMs + pitchIntervalMs * i;
var samplePosTick = (int)Math.Floor(phrase.timeAxis.MsPosToNonExactTickPos(samplePosMs));

var sampleInterval = phrase.timeAxis.TickPosToMsPos(samplePosTick + 5) - phrase.timeAxis.TickPosToMsPos(samplePosTick);
var sampleIndex = (samplePosTick - phrasePitchStartTick) / 5.0;
sampleIndex = Math.Clamp(sampleIndex, 0, phrase.pitches.Length - 1);

var sampleStart = (int)Math.Floor(sampleIndex);
var sampleEnd = (int)Math.Ceiling(sampleIndex);

var diffPitchMs = samplePosMs - phrase.timeAxis.TickPosToMsPos(phrasePitchStartTick + sampleStart * 5);
var sampleAlpha = diffPitchMs / sampleInterval;

var sampleLerped = phrase.pitches[sampleStart] + (phrase.pitches[sampleEnd] - phrase.pitches[sampleStart]) * sampleAlpha;

pitches[i] = (int)Math.Round(sampleLerped - phone.tone * 100);
}

hash = Hash();
Expand Down
3 changes: 2 additions & 1 deletion OpenUtau.Core/Classic/Ust.cs
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,8 @@ private static void ParseNote(UNote note, int lastNotePos, int lastNoteEnd, List
lyric = note.lyric,
position = note.position,
duration = note.duration,
noteNum = note.tone
noteNum = note.tone,
pitch = note.pitch
};
ustNote.Parse(lastNotePos, lastNoteEnd, iniLines, out noteTempo);
note.lyric = ustNote.lyric;
Expand Down
85 changes: 57 additions & 28 deletions OpenUtau.Core/Classic/UstNote.cs
Original file line number Diff line number Diff line change
Expand Up @@ -277,48 +277,77 @@ void ParseEnvelope(string ust, IniLine ustLine) {
}

void ParsePitchBend(string pbs, string pbw, string pby, string pbm) {
var pitch = this.pitch != null ? this.pitch.Clone() : new UPitch() ;
var points = pitch.data;

// PBS
if (!string.IsNullOrWhiteSpace(pbs)) {
var pitch = new UPitch();
var points = pitch.data;
points.Clear();
// PBS
var parts = pbs.Contains(';') ? pbs.Split(';') : pbs.Split(',');
float pbsX = parts.Length >= 1 && ParseFloat(parts[0], out pbsX) ? pbsX : 0;
float pbsY = parts.Length >= 2 && ParseFloat(parts[1], out pbsY) ? pbsY : 0;
points.Add(new PitchPoint(pbsX, pbsY));
// PBW, PBY
var x = points.First().X;
if (!string.IsNullOrWhiteSpace(pbw)) {
var w = pbw.Split(',').Select(s => ParseFloat(s, out var v) ? v : 0).ToList();
var y = (pby ?? "").Split(',').Select(s => ParseFloat(s, out var v) ? v : 0).ToList();
if(points.Count > 0) {
points[0] = new PitchPoint(pbsX, pbsY);
} else {
points.Add(new PitchPoint(pbsX, pbsY));
}
}
if (points.Count == 0) {
return;
}
// PBW, PBY
var x = points.First().X;
var w = new List<float>();
var y = new List<float>();
if (!string.IsNullOrWhiteSpace(pbw)) {
w = pbw.Split(',').Select(s => ParseFloat(s, out var v) ? v : 0).ToList();
}
if (!string.IsNullOrWhiteSpace(pby)) {
y = pby.Split(',').Select(s => ParseFloat(s, out var v) ? v : 0).ToList();
}
if (w.Count != 0 || y.Count != 0) {
if (points.Count > 1 && points.Count - 1 == w.Count && y.Count == 0) { // replace w only
for (var i = 0; i < w.Count(); i++) {
x += w[i];
points[i + 1].X = x;
}
} else if (points.Count > 1 && w.Count == 0 && points.Count - 1 == y.Count) { // replace y only
for (var i = 0; i < y.Count(); i++) {
points[i + 1].Y = y[i];
}
} else {
while (w.Count > y.Count) {
y.Add(0);
}
for (var i = points.Count - 1; i > 0; i--) {
points.Remove(points[i]);
}
for (var i = 0; i < w.Count(); i++) {
x += w[i];
points.Add(new PitchPoint(x, y[i]));
}
}
// PBM
if (!string.IsNullOrWhiteSpace(pbm)) {
var m = pbm.Split(new[] { ',' });
for (var i = 0; i < m.Count() && i < points.Count; i++) {
switch (m[i]) {
case "r":
points[i].shape = PitchPointShape.o;
break;
case "s":
points[i].shape = PitchPointShape.l;
break;
case "j":
points[i].shape = PitchPointShape.i;
break;
default:
points[i].shape = PitchPointShape.io;
break;
}
}
// PBM
if (!string.IsNullOrWhiteSpace(pbm)) {
var m = pbm.Split(new[] { ',' });
for (var i = 0; i < m.Count() && i < points.Count; i++) {
switch (m[i]) {
case "r":
points[i].shape = PitchPointShape.o;
break;
case "s":
points[i].shape = PitchPointShape.l;
break;
case "j":
points[i].shape = PitchPointShape.i;
break;
default:
points[i].shape = PitchPointShape.io;
break;
}
}
}
if (points.Count > 1) {
this.pitch = pitch;
}
}
Expand Down
4 changes: 2 additions & 2 deletions OpenUtau.Core/Classic/WorldlineRenderer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -47,15 +47,15 @@ public RenderResult Layout(RenderPhrase phrase) {
};
}

public Task<RenderResult> Render(RenderPhrase phrase, Progress progress, CancellationTokenSource cancellation, bool isPreRender) {
public Task<RenderResult> Render(RenderPhrase phrase, Progress progress, int trackNo, CancellationTokenSource cancellation, bool isPreRender) {
var resamplerItems = new List<ResamplerItem>();
foreach (var phone in phrase.phones) {
resamplerItems.Add(new ResamplerItem(phrase, phone));
}
var task = Task.Run(() => {
var result = Layout(phrase);
var wavPath = Path.Join(PathManager.Inst.CachePath, $"wdl-{phrase.hash:x16}.wav");
string progressInfo = $"{this} {string.Join(" ", phrase.phones.Select(p => p.phoneme))}";
string progressInfo = $"Track {trackNo}: {this} {string.Join(" ", phrase.phones.Select(p => p.phoneme))}";
progress.Complete(0, progressInfo);
if (File.Exists(wavPath)) {
try {
Expand Down
2 changes: 1 addition & 1 deletion OpenUtau.Core/DependencyInstaller.cs
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ public static void Install(string archivePath) {
entry.WriteToFile(Path.Combine(basePath, entry.Key));
}
}
DocManager.Inst.ExecuteCmd(new ProgressBarNotification(0, $"dependency \"{name}\" installaion finished"));
DocManager.Inst.ExecuteCmd(new ProgressBarNotification(0, $"Installed dependency \"{name}\""));
}
}
}
Expand Down
1 change: 1 addition & 0 deletions OpenUtau.Core/DiffSinger/DiffSingerConfig.cs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ public class DsConfig {
public int sample_rate = 44100;
public bool predict_dur = true;
public bool use_expr = false;
public bool use_note_rest = false;
public float frameMs(){
return 1000f * hop_size / sample_rate;
}
Expand Down
85 changes: 83 additions & 2 deletions OpenUtau.Core/DiffSinger/DiffSingerPitch.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Runtime.CompilerServices;
using System.Text;
using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;
Expand Down Expand Up @@ -67,6 +68,11 @@ public DiffSingerSpeakerEmbedManager getSpeakerEmbedManager(){
return speakerEmbedManager;
}

//Overwrite list[startIndex] .. list[endIndex-1] with value (endIndex is exclusive).
//Nothing is written when endIndex <= startIndex.
void SetRange<T>(T[] list, T value, int startIndex, int endIndex){
    int cursor = startIndex;
    while(cursor < endIndex){
        list[cursor] = value;
        cursor++;
    }
}
public RenderPitchResult Process(RenderPhrase phrase){
var startMs = Math.Min(phrase.notes[0].positionMs, phrase.phones[0].positionMs) - headMs;
var endMs = phrase.notes[^1].endMs + tailMs;
Expand Down Expand Up @@ -125,11 +131,80 @@ public RenderPitchResult Process(RenderPhrase phrase){
.First()
.AsTensor<bool>();

//Pitch Predictor
//Pitch Predictor
var note_rest = new List<bool>{true};
bool prevNoteRest = true;
int phIndex = 0;
foreach(var note in phrase.notes) {
//Slur notes follow the previous note's rest status
if(note.lyric.StartsWith("+")) {
note_rest.Add(prevNoteRest);
continue;
}
//find all the phonemes in the note's time range
while(phIndex<phrase.phones.Length && phrase.phones[phIndex].endMs<=note.endMs) {
phIndex++;
}
var phs = phrase.phones
.SkipWhile(ph => ph.end <= note.position + 1)
.TakeWhile(ph => ph.position < note.end - 1)
.ToArray();
//If all the phonemes in a note's time range are AP, SP or consonant,
//it is a rest note
bool isRest = phs.Length == 0
|| phs.All(ph => ph.phoneme == "AP" || ph.phoneme == "SP" || !g2p.IsVowel(ph.phoneme));
note_rest.Add(isRest);
prevNoteRest = isRest;
}

var note_midi = phrase.notes
.Select(n=>(float)n.tone)
.Prepend((float)phrase.notes[0].tone)
.ToArray();
//get the index of groups of consecutive rest notes
//Each group is a half-open index range [Item1, Item2) into note_rest.
//note_rest[0] is the leading padding entry and is always a rest,
//so a group that starts at index 0 is already open before the loop begins.
int restGroupStart = 0;
var restGroups = new List<Tuple<int,int>>{};
foreach(int noteIndex in Enumerable.Range(1,note_rest.Count - 1)) {
    if(!note_rest[noteIndex-1] && note_rest[noteIndex]) {
        //start a new rest group
        restGroupStart = noteIndex;
    }
    if(note_rest[noteIndex-1] && !note_rest[noteIndex]) {
        //end the current rest group
        restGroups.Add(new Tuple<int,int>(restGroupStart,noteIndex));
    }
}
if(note_rest[^1]) {
    //end the last rest group
    //A group is still open here only when the final entry is a rest.
    //Closing it when the final entry is NOT a rest would re-add the previous
    //(already closed) group stretched to the end of the phrase, and the tone
    //fill that follows would then overwrite the voiced notes after that rest.
    restGroups.Add(new Tuple<int,int>(restGroupStart,note_rest.Count));
}
//Set tone for each rest group
foreach(var restGroup in restGroups){
if(restGroup.Item1 == 0 && restGroup.Item2 == note_rest.Count){
//If All the notes are rest notes, don't set tone
break;
}
if(restGroup.Item1 == 0){
//If the first note is a rest note, set the tone to the tone of the first non-rest note
SetRange<float>(note_midi, note_midi[restGroup.Item2], 0, restGroup.Item2);
} else if(restGroup.Item2 == note_rest.Count){
//If the last note is a rest note, set the tone to the tone of the last non-rest note
SetRange<float>(note_midi, note_midi[restGroup.Item1-1], restGroup.Item1, note_rest.Count);
} else {
//If the first and last notes are non-rest notes, set the tone to the nearest non-rest note
SetRange<float>(note_midi,
note_midi[restGroup.Item1-1],
restGroup.Item1,
(restGroup.Item1 + restGroup.Item2 + 1)/2
);
SetRange<float>(note_midi,
note_midi[restGroup.Item2],
(restGroup.Item1 + restGroup.Item2 + 1)/2,
restGroup.Item2
);
}
}

//use the delta of the positions of the next note and the current note
//to prevent incorrect timing when there is a small space between two notes
var note_dur = phrase.notes.Zip(phrase.notes.Skip(1),
Expand All @@ -138,7 +213,6 @@ public RenderPitchResult Process(RenderPhrase phrase){
.Append(0)
.ToList();
note_dur[^1]=totalFrames-note_dur.Sum();

var pitch = Enumerable.Repeat(60f, totalFrames).ToArray();
var retake = Enumerable.Repeat(true, totalFrames).ToArray();
var speedup = Preferences.Default.DiffsingerSpeedup;
Expand Down Expand Up @@ -185,6 +259,13 @@ public RenderPitchResult Process(RenderPhrase phrase){
pitchInputs.Add(NamedOnnxValue.CreateFromTensor("spk_embed", spkEmbedTensor));
}

//Melody encoder
if(dsConfig.use_note_rest) {
pitchInputs.Add(NamedOnnxValue.CreateFromTensor("note_rest",
new DenseTensor<bool>(note_rest.ToArray(), new int[] { note_rest.Count }, false)
.Reshape(new int[] { 1, note_rest.Count })));
}

var pitchOutputs = pitchModel.Run(pitchInputs);
var pitch_out = pitchOutputs.First().AsTensor<float>().ToArray();
var pitchEnd = phrase.timeAxis.MsPosToTickPos(startMs + (totalFrames - 1) * frameMs) - phrase.position;
Expand Down
Loading

0 comments on commit ca9a71b

Please sign in to comment.