Skip to content

Commit

Permalink
Merge pull request #776 from oxygen-dioxide/monophone
Browse files Browse the repository at this point in the history
ZH CVV: custom dictionary support
  • Loading branch information
stakira authored Jul 28, 2023
2 parents b695ace + ff2198f commit 3fd2f8a
Show file tree
Hide file tree
Showing 4 changed files with 321 additions and 269 deletions.
140 changes: 72 additions & 68 deletions OpenUtau.Plugin.Builtin/ChineseCVVPhonemizer.cs
Original file line number Diff line number Diff line change
@@ -1,16 +1,66 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using OpenUtau.Api;
using OpenUtau.Core;
using OpenUtau.Core.Ustx;
using Serilog;

namespace OpenUtau.Plugin.Builtin {
namespace OpenUtau.Plugin.Builtin
{
/// <summary>
/// Chinese 十月式整音扩张 CVV Phonemizer.
/// <para>It works by splitting "duang" into "duang" + "_ang", to produce the proper tail sound.</para>
/// </summary>
[Phonemizer("Chinese CVV (十月式整音扩张) Phonemizer", "ZH CVV", language: "ZH")]
public class ChineseCVVPhonemizer : BaseChinesePhonemizer {
public class ChineseCVVMonophonePhonemizer : MonophonePhonemizer
{
static readonly string pinyins = "a,ai,an,ang,ao,ba,bai,ban,bang,bao,bei,ben,beng,bi,bian,biao,bie,bin,bing,bo,bu,ca,cai,can,cang,cao,ce,cei,cen,ceng,cha,chai,chan,chang,chao,che,chen,cheng,chi,chong,chou,chu,chua,chuai,chuan,chuang,chui,chun,chuo,ci,cong,cou,cu,cuan,cui,cun,cuo,da,dai,dan,dang,dao,de,dei,den,deng,di,dia,dian,diao,die,ding,diu,dong,dou,du,duan,dui,dun,duo,e,ei,en,eng,er,fa,fan,fang,fei,fen,feng,fo,fou,fu,ga,gai,gan,gang,gao,ge,gei,gen,geng,gong,gou,gu,gua,guai,guan,guang,gui,gun,guo,ha,hai,han,hang,hao,he,hei,hen,heng,hong,hou,hu,hua,huai,huan,huang,hui,hun,huo,ji,jia,jian,jiang,jiao,jie,jin,jing,jiong,jiu,ju,jv,juan,jvan,jue,jve,jun,jvn,ka,kai,kan,kang,kao,ke,kei,ken,keng,kong,kou,ku,kua,kuai,kuan,kuang,kui,kun,kuo,la,lai,lan,lang,lao,le,lei,leng,li,lia,lian,liang,liao,lie,lin,ling,liu,lo,long,lou,lu,luan,lun,luo,lv,lve,ma,mai,man,mang,mao,me,mei,men,meng,mi,mian,miao,mie,min,ming,miu,mo,mou,mu,na,nai,nan,nang,nao,ne,nei,nen,neng,ni,nian,niang,niao,nie,nin,ning,niu,nong,nou,nu,nuan,nun,nuo,nv,nve,o,ou,pa,pai,pan,pang,pao,pei,pen,peng,pi,pian,piao,pie,pin,ping,po,pou,pu,qi,qia,qian,qiang,qiao,qie,qin,qing,qiong,qiu,qu,qv,quan,qvan,que,qve,qun,qvn,ran,rang,rao,re,ren,reng,ri,rong,rou,ru,rua,ruan,rui,run,ruo,sa,sai,san,sang,sao,se,sen,seng,sha,shai,shan,shang,shao,she,shei,shen,sheng,shi,shou,shu,shua,shuai,shuan,shuang,shui,shun,shuo,si,song,sou,su,suan,sui,sun,suo,ta,tai,tan,tang,tao,te,tei,teng,ti,tian,tiao,tie,ting,tong,tou,tu,tuan,tui,tun,tuo,wa,wai,wan,wang,wei,wen,weng,wo,wu,xi,xia,xian,xiang,xiao,xie,xin,xing,xiong,xiu,xu,xv,xuan,xvan,xue,xve,xun,xvn,ya,yan,yang,yao,ye,yi,yin,ying,yo,yong,you,yu,yv,yuan,yvan,yue,yve,yun,yvn,za,zai,zan,zang,zao,ze,zei,zen,zeng,zha,zhai,zhan,zhang,zhao,zhe,zhei,zhen,zheng,zhi,zhong,zhou,zhu,zhua,zhuai,zhuan,zhuang,zhui,zhun,zhuo,zi,zong,zou,zu,zuan,zui,zun";
static readonly string tails = "_vn,_ing,_ong,_an,_ou,_er,_ao,_eng,_ang,_en,_en2,_ai,_iong,_in,_ei";

static readonly string[] pinyinList = pinyins.Split(',');
static readonly string[] tailList = tails.Split(',');

/// <summary>
/// Creates the phonemizer and sets <c>ConsonantLength</c> to 120.
/// </summary>
public ChineseCVVMonophonePhonemizer() {
    // NOTE(review): ConsonantLength is declared outside this block; presumably it is
    // the length in ticks reserved for the "_xx" tail phoneme — confirm against the base class.
    this.ConsonantLength = 120;
}

/// <summary>
/// Builds the grapheme-to-phoneme chain used by this phonemizer.
/// <para>Sources are added in this order: "zhcvv.yaml" from the plugin folder,
/// "zhcvv.yaml" from the singer folder, then the built-in <see cref="ChineseCVVG2p"/> rules.</para>
/// </summary>
/// <returns>
/// A <see cref="G2pFallbacks"/> over every source that loaded successfully; the built-in
/// rules are always the last entry.
/// </returns>
protected override IG2p LoadG2p() {
    var g2ps = new List<IG2p>();

    // Custom dictionaries are optional and user-editable, so each one is loaded on a
    // best-effort basis: a missing file is skipped silently and a malformed file is
    // logged, but neither prevents the built-in rules from being registered.
    void TryAddDictionary(string file) {
        if (!File.Exists(file)) {
            return;
        }
        try {
            g2ps.Add(G2pDictionary.NewBuilder().Load(File.ReadAllText(file)).Build());
        } catch (Exception e) {
            Log.Error(e, $"Failed to load {file}");
        }
    }

    // Load dictionary from plugin folder.
    TryAddDictionary(Path.Combine(PluginDir, "zhcvv.yaml"));

    // Load dictionary from singer folder.
    if (singer != null && singer.Found && singer.Loaded) {
        TryAddDictionary(Path.Combine(singer.Location, "zhcvv.yaml"));
    }

    g2ps.Add(new ChineseCVVG2p());
    return new G2pFallbacks(g2ps.ToArray());
}

/// <summary>
/// Builds the phoneme fallback table from a compact "key=alt1,alt2;key=..." string.
/// </summary>
/// <returns>A dictionary with the single entry "_un" -> { "_en" }.</returns>
protected override Dictionary<string, string[]> LoadVowelFallbacks() {
    const string table = "_un=_en";

    var fallbacks = new Dictionary<string, string[]>();
    foreach (var entry in table.Split(';')) {
        // Left of '=' is the phoneme, right of '=' is its comma-separated alternatives.
        var pair = entry.Split('=');
        fallbacks.Add(pair[0], pair[1].Split(','));
    }
    return fallbacks;
}

/// <summary>
/// Hands the note groups to <c>BaseChinesePhonemizer.RomanizeNotes</c> before phonemizing.
/// </summary>
/// <param name="groups">The note groups passed in by the caller.</param>
// NOTE(review): RomanizeNotes is defined elsewhere; presumably it rewrites Chinese-character
// lyrics to pinyin — confirm.
public override void SetUp(Note[][] groups) => BaseChinesePhonemizer.RomanizeNotes(groups);
}

class ChineseCVVG2p : IG2p{
/// <summary>
/// The consonant table.
/// </summary>
Expand All @@ -22,26 +72,22 @@ public class ChineseCVVPhonemizer : BaseChinesePhonemizer {

static HashSet<string> cSet;
static Dictionary<string, string> vDict;

static ChineseCVVPhonemizer() {
/// <summary>
/// Parses the comma-separated consonant and vowel tables into lookup structures.
/// </summary>
static ChineseCVVG2p() {
    cSet = new HashSet<string>(consonants.Split(','));

    // Every vowel entry has the form "key=value"; the key is the vowel part of a
    // syllable and the value is what Query returns as the trailing phoneme.
    var table = new Dictionary<string, string>();
    foreach (var entry in vowels.Split(',')) {
        var pair = entry.Split('=');
        table.Add(pair[0], pair[1]);
    }
    vDict = table;
}

private USinger singer;

// Simply stores the singer in a field.
public override void SetSinger(USinger singer) => this.singer = singer;
/// <summary>
/// Tells whether a phoneme is treated as a vowel.
/// </summary>
/// <param name="phoneme">The phoneme symbol to classify.</param>
/// <returns>
/// <c>false</c> for symbols that begin with an underscore (the "_ang"-style tails),
/// <c>true</c> for everything else.
/// </returns>
public bool IsVowel(string phoneme) {
    // The underscore is a literal marker, not linguistic text, so compare ordinally
    // instead of relying on the current culture's collation rules.
    return !phoneme.StartsWith("_", StringComparison.Ordinal);
}

public override Result Process(Note[] notes, Note? prev, Note? next, Note? prevNeighbour, Note? nextNeighbour, Note[] prevNeighbours) {
public string[] Query(string lyric){
// The overall logic is:
// 1. Remove consonant: "duang" -> "uang".
// 2. Lookup the trailing sound in vowel table: "uang" -> "_ang".
// 3. Split the total duration and returns "duang" and "_ang".
var lyric = notes[0].lyric;
var note = notes[0];
string consonant = string.Empty;
string vowel = string.Empty;
if (lyric.Length > 2 && cSet.Contains(lyric.Substring(0, 2))) {
Expand All @@ -63,62 +109,20 @@ public override Result Process(Note[] notes, Note? prev, Note? next, Note? prevN
if ((vowel == "an") && (consonant == "y")) {
vowel = "ian";
}
string phoneme0 = lyric;
// Get color
string color = string.Empty;
int toneShift = 0;
if (note.phonemeAttributes != null) {
var attr = note.phonemeAttributes.FirstOrDefault(attr => attr.index == 0);
color = attr.voiceColor;
toneShift = attr.toneShift;
if(vDict.TryGetValue(vowel, out var tail)){
return new string[] { lyric, tail };
}else{
return new string[] { lyric };
}
// We will need to split the total duration for phonemes, so we compute it here.
int totalDuration = notes.Sum(n => n.duration);
// Lookup the vowel split table. For example, "uang" will match "_ang".
if (vDict.TryGetValue(vowel, out var phoneme1)) {
// Now phoneme0="duang" and phoneme1="_ang",
// try to give "_ang" 120 ticks, but no more than half of the total duration.
int length1 = 120;
if (length1 > totalDuration / 2) {
length1 = totalDuration / 2;
}
if (singer.TryGetMappedOto(phoneme0, note.tone + toneShift, color, out var oto0)) {
phoneme0 = oto0.Alias;
}

if (singer.TryGetMappedOto(phoneme1, note.tone + toneShift, color, out var oto1)) {
phoneme1 = oto1.Alias;
}

if (phoneme1.Contains("_un") && !singer.TryGetMappedOto(phoneme1, note.tone + toneShift, color, out var oto2)) {
phoneme1 = "_en";
} else if (phoneme1.Contains("_un") && singer.TryGetMappedOto(phoneme1, note.tone + toneShift, color, out var oto3)) {
phoneme1 = oto3.Alias;
}

}
/// <summary>
/// Accepts every symbol: this implementation performs no validation.
/// </summary>
/// <param name="symbol">The symbol to check (ignored).</param>
/// <returns>Always <c>true</c>.</returns>
public bool IsValidSymbol(string symbol) => true;

return new Result {
phonemes = new Phoneme[] {
new Phoneme() {
phoneme = phoneme0,
},
new Phoneme() {
phoneme = phoneme1,
position = totalDuration - length1,
}
},
};
}
if (singer.TryGetMappedOto(phoneme0, note.tone + toneShift, color, out var oto)) {
phoneme0 = oto.Alias;
}
// No splitting is needed. Return as is.
return new Result {
phonemes = new Phoneme[] {
new Phoneme() {
phoneme = phoneme0,
}
},
};
/// <summary>
/// Splits a phonetic hint into its individual symbols.
/// </summary>
/// <param name="hint">The hint text to split.</param>
/// <param name="separator">The character that separates symbols; a space by default.</param>
/// <returns>The pieces of <paramref name="hint"/> between separators, empty pieces included.</returns>
public string[] UnpackHint(string hint, char separator = ' ') {
    // Split already returns a freshly allocated array, so no extra copy is needed.
    return hint.Split(separator);
}
}
}
}
203 changes: 2 additions & 201 deletions OpenUtau.Plugin.Builtin/LatinDiphonePhonemizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,172 +10,8 @@ namespace OpenUtau.Plugin.Builtin {
/// <summary>
/// A base diphone phonemizer for latin languages.
/// </summary>
public abstract class LatinDiphonePhonemizer : Phonemizer {
protected Dictionary<string, string[]> vowelFallback;
protected USinger singer;
protected IG2p g2p;
protected bool isDictionaryLoading;

//[(index of phoneme, tick position from the lyrical note in notes[], is manual)]
protected readonly List<Tuple<int, int, bool>> alignments = new List<Tuple<int, int, bool>>();

/// <summary>
/// This property will later be exposed in UI for user adjustment.
/// </summary>
public int ConsonantLength { get; set; } = 60;

/// <summary>
/// Creates the phonemizer and runs <see cref="Initialize"/>.
/// <para>Any exception thrown during initialization is logged and swallowed, so
/// construction itself does not throw because of it.</para>
/// </summary>
public LatinDiphonePhonemizer() {
    try {
        this.Initialize();
    } catch (Exception error) {
        Log.Error(error, "Failed to initialize.");
    }
}

protected abstract IG2p LoadG2p();

protected abstract Dictionary<string, string[]> LoadVowelFallbacks();

/// <summary>
/// Loads the G2P first and the vowel fallback table second, storing both in fields.
/// </summary>
protected void Initialize() {
    this.g2p = LoadG2p();
    this.vowelFallback = LoadVowelFallbacks();
}

/// <summary>
/// Stores the singer and rebuilds the G2P by calling <c>LoadG2p</c> again.
/// </summary>
/// <param name="singer">The singer to use from now on.</param>
public override void SetSinger(USinger singer) {
    this.singer = singer;
    // Reload after the singer changes; the vowel fallback table is not reloaded here.
    this.g2p = LoadG2p();
}

/// <summary>
/// Converts one lyrical note, together with its extension notes, into positioned phoneme aliases.
/// </summary>
/// <param name="notes">The lyrical note at index 0, followed by its extension notes.</param>
/// <param name="prev">Not used by this method.</param>
/// <param name="next">Not used by this method.</param>
/// <param name="prevNeighbour">The preceding neighbour note, if any; its last symbol is the left context of the first alias.</param>
/// <param name="nextNeighbour">The following neighbour note; when null a tail "-" symbol is appended.</param>
/// <param name="prevNeighbours">When non-empty, the tone of its last note is used to pick the first alias.</param>
/// <returns>
/// A result with one phoneme per symbol, or a single-alias result when the lyric is a
/// forced alias ("?..."), a tail "-" note, or has no symbols.
/// </returns>
public override Result Process(Note[] notes, Note? prev, Note? next, Note? prevNeighbour, Note? nextNeighbour, Note[] prevNeighbours) {
    if (isDictionaryLoading) {
        return MakeSimpleResult("");
    }
    var note = notes[0];

    // Force alias using ? prefix
    if (!string.IsNullOrEmpty(note.lyric) && note.lyric[0] == '?') {
        return MakeSimpleResult(note.lyric.Substring(1));
    }

    // Get the symbols of previous note.
    var prevSymbols = prevNeighbour == null ? null : GetSymbols(prevNeighbour.Value);
    if (prevSymbols != null && prevSymbols.Length == 0) {
        // An empty symbol list carries no left context; treat it like a missing
        // neighbour so the Last() calls below cannot throw.
        prevSymbols = null;
    }
    // The user is using a tail "-" note to produce a "<something> -" sound.
    if (note.lyric == "-" && prevSymbols != null) {
        var attr = note.phonemeAttributes?.FirstOrDefault() ?? default;
        string color = attr.voiceColor;
        string alias = $"{prevSymbols.Last()} -";
        if (singer.TryGetMappedOto(alias, note.tone, color, out var oto)) {
            return MakeSimpleResult(oto.Alias);
        }
        return MakeSimpleResult(alias);
    }
    // Get the symbols of current note.
    string[] symbols = GetSymbols(note);
    if (symbols == null || symbols.Length == 0) {
        // No symbol is found for current note; assume the user put in an alias.
        // This check must run before the tail "-" is appended: appending to a null
        // array throws, and appending to an empty one would hide the failed lookup.
        return MakeSimpleResult(note.lyric);
    }
    if (nextNeighbour == null) {
        // Auto add tail "-".
        symbols = symbols.Append("-").ToArray();
    }
    // Find phone types of symbols.
    var isVowel = symbols.Select(s => g2p.IsVowel(s)).ToArray();
    // Arpasing aligns the first vowel at 0 and shifts leading consonants to negative positions,
    // so we need to find the first vowel.
    var phonemes = new Phoneme[symbols.Length];

    // Alignments
    // - Tries to align every note to one syllable.
    // - "+n" manually aligns to n-th phoneme.
    alignments.Clear();
    // Notes except those whose lyrics start with "+*" or "+~".
    var nonExtensionNotes = notes.Where(n => !IsSyllableVowelExtensionNote(n)).ToArray();
    for (int i = 0; i < symbols.Length; i++) {
        if (isVowel[i] && alignments.Count < nonExtensionNotes.Length) {
            alignments.Add(Tuple.Create(i, nonExtensionNotes[alignments.Count].position - notes[0].position, false));
        }
    }
    int position = notes[0].duration;
    for (int i = 1; i < notes.Length; ++i) {
        // "+n" is 1-based. n <= 0 would yield a negative phoneme index, which later
        // breaks duration distribution, so such hints are ignored.
        if (int.TryParse(notes[i].lyric.Substring(1), out var idx) && idx > 0) {
            alignments.Add(Tuple.Create(idx - 1, position, true));
        }
        position += notes[i].duration;
    }
    // Closing alignment: one past the last phoneme, at the end of the last note.
    alignments.Add(Tuple.Create(phonemes.Length, position, true));
    alignments.Sort((a, b) => a.Item1.CompareTo(b.Item1));
    // Manual alignments win: drop neighbouring alignments that conflict with them,
    // either by tick order or by pointing at the same phoneme index.
    for (int i = 0; i < alignments.Count; ++i) {
        if (alignments[i].Item3) {
            while (i > 0 && (alignments[i - 1].Item2 >= alignments[i].Item2 ||
                alignments[i - 1].Item1 == alignments[i].Item1)) {
                alignments.RemoveAt(i - 1);
                i--;
            }
            while (i < alignments.Count - 1 && (alignments[i + 1].Item2 <= alignments[i].Item2 ||
                alignments[i + 1].Item1 == alignments[i].Item1)) {
                alignments.RemoveAt(i + 1);
            }
        }
    }

    int startIndex = 0;
    int firstVowel = Array.IndexOf(isVowel, true);
    int startTick = -ConsonantLength * firstVowel;
    foreach (var alignment in alignments) {
        // Distributes phonemes between two alignment points.
        DistributeDuration(isVowel, phonemes, startIndex, alignment.Item1, startTick, alignment.Item2);
        startIndex = alignment.Item1;
        startTick = alignment.Item2;
    }
    alignments.Clear();

    // Select aliases.
    int noteIndex = 0;
    string prevSymbol = prevSymbols == null ? "-" : prevSymbols.Last();
    for (int i = 0; i < symbols.Length; i++) {
        var attr = note.phonemeAttributes?.FirstOrDefault(a => a.index == i) ?? default;
        string alt = attr.alternate?.ToString() ?? string.Empty;
        string color = attr.voiceColor;
        int toneShift = attr.toneShift;
        var phoneme = phonemes[i];
        // Advance to the note this phoneme falls into, so its tone can be used.
        while (noteIndex < notes.Length - 1 && notes[noteIndex].position - note.position < phoneme.position) {
            noteIndex++;
        }
        // The first phoneme takes the tone of the last previous neighbour when one exists.
        int tone = (i == 0 && prevNeighbours != null && prevNeighbours.Length > 0)
            ? prevNeighbours.Last().tone : notes[noteIndex].tone;
        phoneme.phoneme = GetPhonemeOrFallback(prevSymbol, symbols[i], tone + toneShift, color, alt);
        phonemes[i] = phoneme;
        prevSymbol = symbols[i];
    }

    return new Result {
        phonemes = phonemes,
    };
}

/// <summary>
/// Does this note extend the previous syllable?
/// </summary>
/// <param name="note">The note whose lyric is inspected.</param>
/// <returns><c>true</c> when the lyric starts with "+~" or "+*".</returns>
protected bool IsSyllableVowelExtensionNote(Note note) {
    // The prefixes are literal markers, so compare ordinally rather than with
    // culture-sensitive rules.
    return note.lyric.StartsWith("+~", StringComparison.Ordinal)
        || note.lyric.StartsWith("+*", StringComparison.Ordinal);
}

/// <summary>
/// Returns the phoneme symbols for a note.
/// <para>A non-empty phonetic hint takes precedence: it is split on whitespace and
/// symbols rejected by the G2P are dropped. Otherwise the lower-cased lyric is
/// looked up in the G2P.</para>
/// </summary>
/// <param name="note">The note to get symbols for.</param>
/// <returns>The symbols from the hint, or whatever the G2P query returns for the lyric.</returns>
string[] GetSymbols(Note note) {
    if (!string.IsNullOrEmpty(note.phoneticHint)) {
        // Keep only the hint symbols the G2P recognises, in their original order.
        var accepted = new List<string>();
        foreach (var candidate in note.phoneticHint.Split()) {
            if (g2p.IsValidSymbol(candidate)) {
                accepted.Add(candidate);
            }
        }
        return accepted.ToArray();
    }

    // No hint from the user: fall back to the G2P lookup.
    var lookupKey = note.lyric.ToLowerInvariant();
    return g2p.Query(lookupKey);
}

string GetPhonemeOrFallback(string prevSymbol, string symbol, int tone, string color, string alt) {
public abstract class LatinDiphonePhonemizer : PhonemeBasedPhonemizer {
protected override string GetPhonemeOrFallback(string prevSymbol, string symbol, int tone, string color, string alt) {
if (!string.IsNullOrEmpty(alt) && singer.TryGetMappedOto($"{prevSymbol} {symbol}{alt}", tone, color, out var oto)) {
return oto.Alias;
}
Expand All @@ -194,40 +30,5 @@ string GetPhonemeOrFallback(string prevSymbol, string symbol, int tone, string c
}
return $"{prevSymbol} {symbol}{alt}";
}

/// <summary>
/// Assigns positions to <c>phonemes[startIndex..endIndex)</c> so that they fill the
/// span from <paramref name="startTick"/> to <paramref name="endTick"/>.
/// </summary>
/// <param name="isVowel">Per-phoneme vowel flags, indexed like <paramref name="phonemes"/>.</param>
/// <param name="phonemes">The phonemes whose <c>position</c> is written.</param>
/// <param name="startIndex">Index of the first phoneme to place (inclusive).</param>
/// <param name="endIndex">Index one past the last phoneme to place (exclusive).</param>
/// <param name="startTick">Position given to the first phoneme.</param>
/// <param name="endTick">Position at which the span ends.</param>
void DistributeDuration(bool[] isVowel, Phoneme[] phonemes, int startIndex, int endIndex, int startTick, int endTick) {
    // An empty or inverted range has nothing to place. Returning here also keeps the
    // all-consonant branch below from dividing by a consonant count of zero.
    if (startIndex >= endIndex) {
        return;
    }
    // First count number of vowels and consonants.
    int consonants = 0;
    int vowels = 0;
    int duration = endTick - startTick;
    for (int i = startIndex; i < endIndex; i++) {
        if (isVowel[i]) {
            vowels++;
        } else {
            consonants++;
        }
    }
    // If vowels exist, consonants are given fixed length, but combined no more than half duration.
    // However, if no vowel exists, consonants are evenly distributed within the total duration.
    int consonantDuration = vowels > 0
        ? (consonants > 0 ? Math.Min(ConsonantLength, duration / 2 / consonants) : 0)
        : duration / consonants;
    // Vowels are evenly distributed within (total duration - total consonant duration).
    int vowelDuration = vowels > 0 ? (duration - consonantDuration * consonants) / vowels : 0;
    int position = startTick;
    // Compute positions using previously computed durations.
    for (int i = startIndex; i < endIndex; i++) {
        phonemes[i].position = position;
        position += isVowel[i] ? vowelDuration : consonantDuration;
    }
}
}
}
Loading

0 comments on commit 3fd2f8a

Please sign in to comment.