Skip to content

Commit

Permalink
Merge branch 'master' into ds-tensor-cache
Browse files Browse the repository at this point in the history
  • Loading branch information
yqzhishen authored Apr 1, 2024
2 parents 3f15241 + f810f08 commit 0e9958e
Show file tree
Hide file tree
Showing 162 changed files with 31,803 additions and 406 deletions.
2 changes: 0 additions & 2 deletions LICENSE.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,3 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

This license covers both OpenUtau source code and Worldline prebuilt binaries.
21 changes: 15 additions & 6 deletions OpenUtau.Core/Api/Phonemizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -97,10 +97,21 @@ public struct PhonemeAttributes {
public string voiceColor;
}

/// <summary>
/// A single expression entry that a phonemizer can attach to a <see cref="Phoneme"/>
/// through its expressions list: an abbreviation paired with a numeric value.
/// </summary>
public struct PhonemeExpression {
/// <summary>
/// Abbreviation identifying the expression.
/// NOTE(review): presumably matches an expression abbreviation defined by the project — confirm against callers.
/// </summary>
public string abbr;
/// <summary>
/// Value for the expression identified by <see cref="abbr"/>.
/// </summary>
public float value;
}

/// <summary>
/// The output struct that represents a phoneme.
/// </summary>
public struct Phoneme {
/// <summary>
/// Number to manage phonemes in note.
/// Optional. Whether to specify an index or not should be consistent within Phonemizer (All phonemes should be indexed, or all should be unindexed).
/// </summary>
public int? index;

/// <summary>
/// Phoneme name. Should match one of oto alias.
/// Note that you don't have to return tone-mapped phonemes. OpenUtau will do it afterwards.
Expand All @@ -111,16 +122,14 @@ public struct Phoneme {

/// <summary>
/// Position of phoneme in note. Measured in ticks.
/// Use TickToMs() and MsToTick() to convert between ticks and milliseconds .
/// Use TickToMs() and MsToTick() to convert between ticks and milliseconds.
/// </summary>
public int position;

/// <summary>
/// Suggested attributes. May or may not be used eventually.
/// Suggested attributes. It may later be overwritten with a user-specified value.
/// </summary>
public PhonemeAttributes attributes;

public int? index;
public List<PhonemeExpression> expressions;

public override string ToString() => $"\"{phoneme}\" pos:{position}";
}
Expand Down Expand Up @@ -161,7 +170,7 @@ public struct Result {
/// </summary>
public virtual bool LegacyMapping => false;

public virtual void SetUp(Note[][] notes) { }
public virtual void SetUp(Note[][] notes, UProject project, UTrack track) { }

/// <summary>
/// Phonemize a consecutive sequence of notes. This is the main logic of a phonemizer.
Expand Down
2 changes: 1 addition & 1 deletion OpenUtau.Core/Api/PhonemizerRunner.cs
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ static PhonemizerResponse Phonemize(PhonemizerRequest request) {
phonemizer.SetSinger(request.singer);
phonemizer.SetTiming(request.timeAxis);
try {
phonemizer.SetUp(notes);
phonemizer.SetUp(notes, DocManager.Inst.Project, DocManager.Inst.Project.tracks[request.part.trackNo]);
} catch (Exception e) {
Log.Error(e, $"phonemizer failed to setup.");
}
Expand Down
8 changes: 4 additions & 4 deletions OpenUtau.Core/BaseChinesePhonemizer.cs
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
using System;
using System.Collections.Generic;
using System.Collections.Generic;
using System.Linq;
using OpenUtau.Core.G2p;
using OpenUtau.Api;
using OpenUtau.Core.G2p;
using OpenUtau.Core.Ustx;

namespace OpenUtau.Core {
public abstract class BaseChinesePhonemizer : Phonemizer {
Expand Down Expand Up @@ -43,7 +43,7 @@ public static void RomanizeNotes(Note[][] groups) {
Enumerable.Zip(groups, ResultLyrics, ChangeLyric).Last();
}

public override void SetUp(Note[][] groups) {
public override void SetUp(Note[][] groups, UProject project, UTrack track) {
RomanizeNotes(groups);
}
}
Expand Down
1 change: 1 addition & 0 deletions OpenUtau.Core/Classic/ClassicRenderer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ public class ClassicRenderer : IRenderer {
Ustx.ATK,
Ustx.DEC,
Ustx.MOD,
Ustx.MODP,
Ustx.ALT,
};

Expand Down
2 changes: 2 additions & 0 deletions OpenUtau.Core/Classic/ClassicSingerLoader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ static USinger AdjustSingerType(Voicebank v) {
return new Core.Enunu.EnunuSinger(v) as USinger;
case USingerType.DiffSinger:
return new Core.DiffSinger.DiffSingerSinger(v) as USinger;
case USingerType.Voicevox:
return new Core.Voicevox.VoicevoxSinger(v) as USinger;
default:
return new ClassicSinger(v) as USinger;
}
Expand Down
49 changes: 34 additions & 15 deletions OpenUtau.Core/Classic/Frq.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
using System.IO;
using System.Linq;
using System.Text;
using OpenUtau.Core;

namespace OpenUtau.Classic {
public class Frq {
Expand All @@ -12,22 +13,40 @@ public class Frq {
public double[] f0 = new double[0];
public double[] amp = new double[0];

public void Load(Stream stream) {
using (var reader = new BinaryReader(stream)) {
string header = new string(reader.ReadChars(8));
if (header != "FREQ0003") {
throw new FormatException("FREQ0003 header not found.");
}
hopSize = reader.ReadInt32();
averageF0 = reader.ReadDouble();
_ = reader.ReadBytes(16); // blank
int length = reader.ReadInt32();
f0 = new double[length];
amp = new double[length];
for (int i = 0; i < length; i++) {
f0[i] = reader.ReadDouble();
amp[i] = reader.ReadDouble();
/// <summary>
/// Loads the frq file that belongs to a wav file into this instance.
/// </summary>
/// <param name="wavPath">
/// Path of the wav file. May be null or empty (machine learning voicebank),
/// in which case nothing is loaded.
/// </param>
/// <returns>
/// true when the frq file was read completely; false when the wav path is null or empty,
/// the frq file does not exist, or reading it failed. On failure the previously held
/// values of this instance are left untouched.
/// </returns>
public bool Load(string wavPath) {
    if (string.IsNullOrEmpty(wavPath)) {
        return false;
    }
    string frqFile = VoicebankFiles.GetFrqFile(wavPath);
    if (!File.Exists(frqFile)) {
        return false;
    }
    try {
        using (var fileStream = File.OpenRead(frqFile))
        using (var reader = new BinaryReader(fileStream)) {
            string header = new string(reader.ReadChars(8));
            if (header != "FREQ0003") {
                throw new FormatException("FREQ0003 header not found.");
            }
            // Parse into locals first so that a truncated or corrupt file
            // cannot leave this instance half-populated.
            int loadedHopSize = reader.ReadInt32();
            double loadedAverageF0 = reader.ReadDouble();
            _ = reader.ReadBytes(16); // blank
            int length = reader.ReadInt32();
            var loadedF0 = new double[length];
            var loadedAmp = new double[length];
            for (int i = 0; i < length; i++) {
                loadedF0[i] = reader.ReadDouble();
                loadedAmp[i] = reader.ReadDouble();
            }
            // Everything was read successfully: commit the result.
            hopSize = loadedHopSize;
            averageF0 = loadedAverageF0;
            f0 = loadedF0;
            amp = loadedAmp;
        }
        return true;
    } catch (Exception e) {
        // Loading is best-effort: report the problem to the user and signal failure to the caller.
        DocManager.Inst.ExecuteCmd(new ErrorMessageNotification("failed to load frq file", e));
        return false;
    }
}

Expand Down
2 changes: 1 addition & 1 deletion OpenUtau.Core/Classic/Plugin.cs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ public void Run(string tempFile) {
}
var startInfo = new ProcessStartInfo() {
FileName = Executable,
Arguments = tempFile,
Arguments = $"\"{tempFile}\"",
WorkingDirectory = Path.GetDirectoryName(Executable),
UseShellExecute = UseShell,
};
Expand Down
1 change: 1 addition & 0 deletions OpenUtau.Core/Classic/WorldlineRenderer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ public class WorldlineRenderer : IRenderer {
Ustx.VEL,
Ustx.VOL,
Ustx.MOD,
Ustx.MODP,
Ustx.ALT,
Ustx.GENC,
Ustx.BREC,
Expand Down
22 changes: 20 additions & 2 deletions OpenUtau.Core/Commands/NoteCommands.cs
Original file line number Diff line number Diff line change
Expand Up @@ -95,26 +95,44 @@ public override void Unexecute() {
}

/// <summary>
/// Undoable command that changes the duration of one or more notes by a fixed delta.
/// When the part is shorter than the minimum duration computed at construction time,
/// executing the command also lengthens the part; undoing restores the part duration
/// captured at construction time.
/// </summary>
public class ResizeNoteCommand : NoteCommand {
    // Stays 0 when the part does not need to grow.
    readonly int NewPartDuration;
    readonly int OldPartDuration;
    readonly int DeltaDur;

    public ResizeNoteCommand(UVoicePart part, UNote note, int deltaDur) : base(part, note) {
        DeltaDur = deltaDur;
        OldPartDuration = part.Duration;
        NewPartDuration = GrownPartDuration(part, note.End + deltaDur);
    }

    public ResizeNoteCommand(UVoicePart part, List<UNote> notes, int deltaDur) : base(part, notes) {
        DeltaDur = deltaDur;
        OldPartDuration = part.Duration;
        NewPartDuration = GrownPartDuration(part, (Notes.LastOrDefault()?.End ?? 1) + deltaDur);
    }

    /// <summary>
    /// Computes the minimum part duration as the tick position of the start of the bar
    /// two bars after the one containing <paramref name="resizedEnd"/>, minus the part position.
    /// Returns that minimum when the part is currently shorter than it, otherwise 0.
    /// </summary>
    static int GrownPartDuration(UVoicePart part, int resizedEnd) {
        DocManager.Inst.Project.timeAxis.TickPosToBarBeat(resizedEnd, out int bar, out int _, out int _);
        int required = DocManager.Inst.Project.timeAxis.BarBeatToTickPos(bar + 2, 0) - part.position;
        return part.Duration < required ? required : 0;
    }

    public override string ToString() => $"Change {Notes.Count()} notes duration";

    public override void Execute() {
        lock (Part) {
            foreach (var resized in Notes) {
                resized.duration += DeltaDur;
            }
            if (NewPartDuration > 0) {
                Part.Duration = NewPartDuration;
            }
        }
    }

    public override void Unexecute() {
        lock (Part) {
            foreach (var resized in Notes) {
                resized.duration -= DeltaDur;
            }
            Part.Duration = OldPartDuration;
        }
    }
}
Expand Down Expand Up @@ -502,11 +520,11 @@ public ChangePhonemeAliasCommand(UVoicePart part, UNote note, int index, string?

public override void Execute() {
var o = note.GetPhonemeOverride(index);
o.phoneme = string.IsNullOrWhiteSpace(newAlias) ? string.Empty : newAlias;
o.phoneme = string.IsNullOrWhiteSpace(newAlias) ? null : newAlias;
}
public override void Unexecute() {
var o = note.GetPhonemeOverride(index);
o.phoneme = string.IsNullOrWhiteSpace(oldAlias) ? string.Empty : oldAlias;
o.phoneme = string.IsNullOrWhiteSpace(oldAlias) ? null : oldAlias;
}
public override string ToString() => "Change phoneme alias";
}
Expand Down
11 changes: 9 additions & 2 deletions OpenUtau.Core/DiffSinger/DiffSingerConfig.cs
Original file line number Diff line number Diff line change
Expand Up @@ -33,15 +33,22 @@ public class DsConfig {
public string linguistic;
public string pitch;
public string variance;
public int hop_size = 512;
public int sample_rate = 44100;
public bool predict_dur = true;
public bool predict_energy = true;
public bool predict_breathiness = true;
public bool predict_voicing = false;
public bool predict_tension = false;
public bool use_expr = false;
public bool use_note_rest = false;
public int sample_rate = 44100;
public int hop_size = 512;
public int win_size = 2048;
public int fft_size = 2048;
public int num_mel_bins = 128;
public double mel_fmin = 40;
public double mel_fmax = 16000;
public string mel_base = "10"; // or "e"
public string mel_scale = "slaney"; // or "htk"
/// <summary>
/// Length of one frame in milliseconds, computed as 1000 * hop_size / sample_rate.
/// </summary>
public float frameMs() => 1000f * hop_size / sample_rate;
Expand Down
78 changes: 74 additions & 4 deletions OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -135,12 +135,62 @@ float[] InvokeDiffsinger(RenderPhrase phrase, int depth, int speedup, Cancellati
}

var vocoder = singer.getVocoder();
//Vocoder and singer should have the same hop sizes and sample rates.
//mel specification validity checks
//mel base must be 10 or e
if (vocoder.mel_base != "10" && vocoder.mel_base != "e") {
throw new Exception(
$"Mel base must be \"10\" or \"e\", but got \"{vocoder.mel_base}\" from vocoder");
}
if (singer.dsConfig.mel_base != "10" && singer.dsConfig.mel_base != "e") {
throw new Exception(
$"Mel base must be \"10\" or \"e\", but got \"{singer.dsConfig.mel_base}\" from acoustic model");
}
//mel scale must be slaney or htk
if (vocoder.mel_scale != "slaney" && vocoder.mel_scale != "htk") {
throw new Exception(
$"Mel scale must be \"slaney\" or \"htk\", but got \"{vocoder.mel_scale}\" from vocoder");
}
// Same validity check as for the vocoder, applied to the acoustic model's config.
// The message reports the acoustic model's own value so the offending setting can be identified.
if (singer.dsConfig.mel_scale != "slaney" && singer.dsConfig.mel_scale != "htk") {
    throw new Exception(
        $"Mel scale must be \"slaney\" or \"htk\", but got \"{singer.dsConfig.mel_scale}\" from acoustic model");
}
//mel specification matching checks
if(vocoder.sample_rate != singer.dsConfig.sample_rate) {
throw new Exception(
$"Vocoder and acoustic model has mismatching sample rate ({vocoder.sample_rate} != {singer.dsConfig.sample_rate})");
}
if(vocoder.hop_size != singer.dsConfig.hop_size){
throw new Exception($"Vocoder's hop size is {vocoder.hop_size}, but acoustic's hop size is {singer.dsConfig.hop_size}.");
throw new Exception(
$"Vocoder and acoustic model has mismatching hop size ({vocoder.hop_size} != {singer.dsConfig.hop_size})");
}
if(vocoder.win_size != singer.dsConfig.win_size){
throw new Exception(
$"Vocoder and acoustic model has mismatching win size ({vocoder.win_size} != {singer.dsConfig.win_size})");
}
if(vocoder.fft_size != singer.dsConfig.fft_size){
throw new Exception(
$"Vocoder and acoustic model has mismatching FFT size ({vocoder.fft_size} != {singer.dsConfig.fft_size})");
}
if (vocoder.num_mel_bins != singer.dsConfig.num_mel_bins) {
throw new Exception(
$"Vocoder and acoustic model has mismatching mel bins ({vocoder.num_mel_bins} != {singer.dsConfig.num_mel_bins})");
}
if (Math.Abs(vocoder.mel_fmin - singer.dsConfig.mel_fmin) > 1e-5) {
throw new Exception(
$"Vocoder and acoustic model has mismatching fmin ({vocoder.mel_fmin} != {singer.dsConfig.mel_fmin})");
}
if(vocoder.sample_rate != singer.dsConfig.sample_rate){
throw new Exception($"Vocoder's sample rate is {vocoder.sample_rate}, but acoustic's sample rate is {singer.dsConfig.sample_rate}.");
if (Math.Abs(vocoder.mel_fmax - singer.dsConfig.mel_fmax) > 1e-5) {
throw new Exception(
$"Vocoder and acoustic model has mismatching fmax ({vocoder.mel_fmax} != {singer.dsConfig.mel_fmax})");
}
// mismatching mel base can be transformed
// if (vocoder.mel_base != singer.dsConfig.mel_base) {
// throw new Exception(
// $"Vocoder and acoustic model has mismatching mel base ({vocoder.mel_base} != {singer.dsConfig.mel_base})");
// }
if (vocoder.mel_scale != singer.dsConfig.mel_scale) {
throw new Exception(
$"Vocoder and acoustic model has mismatching mel scale ({vocoder.mel_scale} != {singer.dsConfig.mel_scale})");
}

var acousticModel = singer.getAcousticSession();
Expand Down Expand Up @@ -308,6 +358,26 @@ float[] InvokeDiffsinger(RenderPhrase phrase, int depth, int speedup, Cancellati
acousticCache?.Save(acousticOutputs);
}
Tensor<float> mel = acousticOutputs.First().AsTensor<float>().Clone();
//mel transforms for different mel base
// When the acoustic model and the vocoder use different mel bases, rescale every mel value
// by a constant factor. Presumably the mel values are logarithms in the configured base,
// so the factors are ln(10) (base 10 -> base e) and log10(e) (base e -> base 10).
if (vocoder.mel_base != singer.dsConfig.mel_base) {
    float k;
    if (vocoder.mel_base == "e" && singer.dsConfig.mel_base == "10") {
        k = (float)Math.Log(10.0);
    } else if (vocoder.mel_base == "10" && singer.dsConfig.mel_base == "e") {
        k = (float)Math.Log10(Math.E);
    } else {
        // Both bases were validated earlier to be "10" or "e", so this branch is defensive only.
        throw new Exception("This should never happen");
    }
    // The tensor is indexed with three indices below; scale every element in place.
    for (int b = 0; b < mel.Dimensions[0]; ++b) {
        for (int t = 0; t < mel.Dimensions[1]; ++t) {
            for (int c = 0; c < mel.Dimensions[2]; ++c) {
                mel[b, t, c] *= k;
            }
        }
    }
}
//vocoder
//waveform = session.run(['waveform'], {'mel': mel, 'f0': f0})[0]
var vocoderInputs = new List<NamedOnnxValue>();
Expand Down
24 changes: 21 additions & 3 deletions OpenUtau.Core/DiffSinger/DiffSingerSinger.cs
Original file line number Diff line number Diff line change
Expand Up @@ -78,12 +78,30 @@ public DiffSingerSinger(Voicebank voicebank) {

//Load diffsinger config of a voicebank
string configPath = Path.Combine(Location, "dsconfig.yaml");
dsConfig = Core.Yaml.DefaultDeserializer.Deserialize<DsConfig>(
File.ReadAllText(configPath, TextFileEncoding));
if(configPath != null && File.Exists(configPath)){
try {
dsConfig = Core.Yaml.DefaultDeserializer.Deserialize<DsConfig>(
File.ReadAllText(configPath, Encoding.UTF8));
} catch (Exception e) {
Log.Error(e, $"Failed to load dsconfig.yaml for {Name} from {configPath}");
dsConfig = new DsConfig();
}
} else {
Log.Error($"dsconfig.yaml not found for {Name} at {configPath}");
dsConfig = new DsConfig();
}

//Load phoneme list
string phonemesPath = Path.Combine(Location, dsConfig.phonemes);
phonemes = File.ReadLines(phonemesPath,TextFileEncoding).ToList();
if(phonemesPath != null && File.Exists(phonemesPath)){
try {
phonemes = File.ReadLines(phonemesPath, TextFileEncoding).ToList();
} catch (Exception e){
Log.Error(e, $"Failed to load phoneme list for {Name} from {phonemesPath}");
}
} else {
Log.Error($"phonemes file not found for {Name} at {phonemesPath}");
}

var dummyOtoSet = new UOtoSet(new OtoSet(), Location);
foreach (var phone in phonemes) {
Expand Down
Loading

0 comments on commit 0e9958e

Please sign in to comment.