From 2d377e73b8d2086f24de26d6b041872b9b678e64 Mon Sep 17 00:00:00 2001 From: Rin Iwai Date: Thu, 1 Feb 2024 15:56:13 +0900 Subject: [PATCH 1/8] =?UTF-8?q?TextAnalyzer=20trait=E3=81=ABstring->Accent?= =?UTF-8?q?PhraseModel[]=E3=82=92=E7=A7=BB=E5=8B=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/voicevox_core/src/lib.rs | 1 + crates/voicevox_core/src/synthesizer.rs | 82 ++------------ crates/voicevox_core/src/text_analyzer.rs | 124 ++++++++++++++++++++++ 3 files changed, 134 insertions(+), 73 deletions(-) create mode 100644 crates/voicevox_core/src/text_analyzer.rs diff --git a/crates/voicevox_core/src/lib.rs b/crates/voicevox_core/src/lib.rs index ea74c9f7c..b4bf19fda 100644 --- a/crates/voicevox_core/src/lib.rs +++ b/crates/voicevox_core/src/lib.rs @@ -12,6 +12,7 @@ mod numerics; mod result; mod synthesizer; mod task; +mod text_analyzer; mod user_dict; mod version; mod voice_model; diff --git a/crates/voicevox_core/src/synthesizer.rs b/crates/voicevox_core/src/synthesizer.rs index 202e917c7..59ac4db00 100644 --- a/crates/voicevox_core/src/synthesizer.rs +++ b/crates/voicevox_core/src/synthesizer.rs @@ -80,7 +80,7 @@ pub(crate) mod blocking { use enum_map::enum_map; use crate::{ - engine::{self, create_kana, parse_kana, MoraModel, OjtPhoneme, Utterance}, + engine::{self, create_kana, MoraModel, OjtPhoneme}, error::ErrorRepr, infer::{ domain::{ @@ -92,6 +92,7 @@ pub(crate) mod blocking { InferenceSessionOptions, }, numerics::F32Ext as _, + text_analyzer::{KanaParser, OpenJtalk, TextAnalyzer}, AccentPhraseModel, AudioQueryModel, FullcontextExtractor, Result, StyleId, SupportedDevices, SynthesisOptions, VoiceModelId, VoiceModelMeta, }; @@ -103,7 +104,8 @@ pub(crate) mod blocking { /// 音声シンセサイザ。 pub struct Synthesizer { pub(super) status: Status, - open_jtalk: O, + open_jtalk: OpenJtalk, + kana_parser: KanaParser, use_gpu: bool, } @@ -176,7 +178,8 @@ pub(crate) mod blocking { return Ok(Self { status, - open_jtalk, + open_jtalk: OpenJtalk::new(open_jtalk), + kana_parser: KanaParser::new(), use_gpu, }); @@ -457,7 +460,8 @@ pub(crate) mod blocking { kana: &str, style_id: StyleId, ) -> Result> { - self.replace_mora_data(&parse_kana(kana)?, style_id) + let accent_phrases = self.kana_parser.analyze(kana)?; + self.replace_mora_data(&accent_phrases, style_id) } /// AccentPhraseの配列の音高・音素長を、特定の声で生成しなおす。 @@ -743,75 +747,7 @@ pub(crate) mod blocking { text: &str, style_id: StyleId, ) -> Result> { - if text.is_empty() { - return Ok(Vec::new()); - } - - let utterance = Utterance::extract_full_context_label(&self.open_jtalk, text)?; - - let accent_phrases: Vec = utterance - .breath_groups() - .iter() - .enumerate() - .fold(Vec::new(), |mut accum_vec, (i, breath_group)| { - accum_vec.extend(breath_group.accent_phrases().iter().enumerate().map( - |(j, accent_phrase)| { - let moras = accent_phrase - .moras() - .iter() - .map(|mora| { - let mora_text = mora - .phonemes() - .iter() - .map(|phoneme| phoneme.phoneme().to_string()) - .collect::>() - .join(""); - - let (consonant, consonant_length) = - if let Some(consonant) = mora.consonant() { - (Some(consonant.phoneme().to_string()), Some(0.)) - } else { - (None, None) - }; - - MoraModel::new( - mora_to_text(mora_text), - consonant, - consonant_length, - mora.vowel().phoneme().into(), - 0., - 0., - ) - }) - .collect(); - - let pause_mora = if i != utterance.breath_groups().len() - 1 - && j == breath_group.accent_phrases().len() - 1 - { - Some(MoraModel::new( - "、".into(), - None, - None, - "pau".into(), - 0., - 0., - )) - } else { - None - }; - - AccentPhraseModel::new( - moras, - *accent_phrase.accent(), - pause_mora, - *accent_phrase.is_interrogative(), - ) - }, - )); - - accum_vec - }); - + let accent_phrases = self.open_jtalk.analyze(text)?; self.replace_mora_data(&accent_phrases, style_id) } diff --git a/crates/voicevox_core/src/text_analyzer.rs b/crates/voicevox_core/src/text_analyzer.rs new file mode 100644 index 000000000..5e4cf1560 --- /dev/null +++ b/crates/voicevox_core/src/text_analyzer.rs @@ -0,0 +1,124 @@ +use crate::{ + engine::{self, parse_kana, MoraModel, Utterance}, + AccentPhraseModel, FullcontextExtractor, Result, +}; + +pub trait TextAnalyzer { + fn analyze(&self, text: &str) -> Result>; +} + +/// AquesTalk風記法からAccentPhraseの配列を生成するTextAnalyzer +pub struct KanaParser {} + +impl KanaParser { + pub fn new() -> Self { + Self {} + } +} + +impl TextAnalyzer for KanaParser { + fn analyze(&self, text: &str) -> Result> { + Ok(parse_kana(text)?) + } +} + +/// OpenJtalkからAccentPhraseの配列を生成するTextAnalyzer +pub struct OpenJtalk { + open_jtalk: O, +} + +impl OpenJtalk { + pub fn new(open_jtalk: O) -> Self { + Self { open_jtalk } + } +} + +impl TextAnalyzer for OpenJtalk { + fn analyze(&self, text: &str) -> Result> { + if text.is_empty() { + return Ok(Vec::new()); + } + + let utterance = Utterance::extract_full_context_label(&self.open_jtalk, text)?; + + let accent_phrases: Vec = utterance + .breath_groups() + .iter() + .enumerate() + .fold(Vec::new(), |mut accum_vec, (i, breath_group)| { + accum_vec.extend(breath_group.accent_phrases().iter().enumerate().map( + |(j, accent_phrase)| { + let moras = accent_phrase + .moras() + .iter() + .map(|mora| { + let mora_text = mora + .phonemes() + .iter() + .map(|phoneme| phoneme.phoneme().to_string()) + .collect::>() + .join(""); + + let (consonant, consonant_length) = + if let Some(consonant) = mora.consonant() { + (Some(consonant.phoneme().to_string()), Some(0.)) + } else { + (None, None) + }; + + MoraModel::new( + mora_to_text(mora_text), + consonant, + consonant_length, + mora.vowel().phoneme().into(), + 0., + 0., + ) + }) + .collect(); + + let pause_mora = if i != utterance.breath_groups().len() - 1 + && j == breath_group.accent_phrases().len() - 1 + { + Some(MoraModel::new( + "、".into(), + None, + None, + "pau".into(), + 0., + 0., + )) + } else { + None + }; + + AccentPhraseModel::new( + moras, + *accent_phrase.accent(), + pause_mora, + *accent_phrase.is_interrogative(), + ) + }, + )); + + accum_vec + }); + + Ok(accent_phrases) + } +} + +fn mora_to_text(mora: impl AsRef) -> String { + let last_char = mora.as_ref().chars().last().unwrap(); + let mora = if ['A', 'I', 'U', 'E', 'O'].contains(&last_char) { + format!( + "{}{}", + &mora.as_ref()[0..mora.as_ref().len() - 1], + last_char.to_lowercase() + ) + } else { + mora.as_ref().to_string() + }; + // もしカタカナに変換できなければ、引数で与えた文字列がそのまま返ってくる + engine::mora2text(&mora).to_string() +} From dd73aca1141488a954d03c4169a4526700edbc1c Mon Sep 17 00:00:00 2001 From: Rin Iwai Date: Thu, 1 Feb 2024 16:28:38 +0900 Subject: [PATCH 2/8] refactor --- crates/voicevox_core/src/synthesizer.rs | 14 +++++++------- crates/voicevox_core/src/text_analyzer.rs | 18 ++++++++---------- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/crates/voicevox_core/src/synthesizer.rs b/crates/voicevox_core/src/synthesizer.rs index 59ac4db00..1a439e234 100644 --- a/crates/voicevox_core/src/synthesizer.rs +++ b/crates/voicevox_core/src/synthesizer.rs @@ -92,7 +92,7 @@ pub(crate) mod blocking { InferenceSessionOptions, }, numerics::F32Ext as _, - text_analyzer::{KanaParser, OpenJtalk, TextAnalyzer}, + text_analyzer::{KanaAnalyzer, OpenJTalkAnalyzer, TextAnalyzer}, AccentPhraseModel, AudioQueryModel, FullcontextExtractor, Result, StyleId, SupportedDevices, SynthesisOptions, VoiceModelId, VoiceModelMeta, }; @@ -104,8 +104,8 @@ pub(crate) mod blocking { /// 音声シンセサイザ。 pub struct Synthesizer { pub(super) status: Status, - open_jtalk: OpenJtalk, - kana_parser: KanaParser, + open_jtalk_analyzer: OpenJTalkAnalyzer, + kana_analyzer: KanaAnalyzer, use_gpu: bool, } @@ -178,8 +178,8 @@ pub(crate) mod blocking { return Ok(Self { status, - open_jtalk: OpenJtalk::new(open_jtalk), - kana_parser: KanaParser::new(), + open_jtalk_analyzer: OpenJTalkAnalyzer::new(open_jtalk), + kana_analyzer: KanaAnalyzer::new(), use_gpu, }); @@ -460,7 +460,7 @@ pub(crate) mod blocking { kana: &str, style_id: StyleId, ) -> Result> { - let accent_phrases = self.kana_parser.analyze(kana)?; + let accent_phrases = self.kana_analyzer.analyze(kana)?; self.replace_mora_data(&accent_phrases, style_id) } @@ -747,7 +747,7 @@ pub(crate) mod blocking { text: &str, style_id: StyleId, ) -> Result> { - let accent_phrases = self.open_jtalk.analyze(text)?; + let accent_phrases = self.open_jtalk_analyzer.analyze(text)?; self.replace_mora_data(&accent_phrases, style_id) } diff --git a/crates/voicevox_core/src/text_analyzer.rs b/crates/voicevox_core/src/text_analyzer.rs index 5e4cf1560..7f842c1f2 100644 --- a/crates/voicevox_core/src/text_analyzer.rs +++ b/crates/voicevox_core/src/text_analyzer.rs @@ -8,38 +8,36 @@ pub trait TextAnalyzer { } /// AquesTalk風記法からAccentPhraseの配列を生成するTextAnalyzer -pub struct KanaParser {} +pub struct KanaAnalyzer; -impl KanaParser { +impl KanaAnalyzer { pub fn new() -> Self { Self {} } } -impl TextAnalyzer for KanaParser { +impl TextAnalyzer for KanaAnalyzer { fn analyze(&self, text: &str) -> Result> { Ok(parse_kana(text)?) } } /// OpenJtalkからAccentPhraseの配列を生成するTextAnalyzer -pub struct OpenJtalk { - open_jtalk: O, -} +pub struct OpenJTalkAnalyzer(O); -impl OpenJtalk { +impl OpenJTalkAnalyzer { pub fn new(open_jtalk: O) -> Self { - Self { open_jtalk } + Self(open_jtalk) } } -impl TextAnalyzer for OpenJtalk { +impl TextAnalyzer for OpenJTalkAnalyzer { fn analyze(&self, text: &str) -> Result> { if text.is_empty() { return Ok(Vec::new()); } - let utterance = Utterance::extract_full_context_label(&self.open_jtalk, text)?; + let utterance = Utterance::extract_full_context_label(&self.0, text)?; let accent_phrases: Vec = utterance .breath_groups() From e9fea231eb2dceb74b64aaf25c7f090bd91bc338 Mon Sep 17 00:00:00 2001 From: Rin Iwai Date: Fri, 2 Feb 2024 02:45:48 +0900 Subject: [PATCH 3/8] refactor --- crates/voicevox_core/src/text_analyzer.rs | 132 +++++++++++----------- 1 file changed, 68 insertions(+), 64 deletions(-) diff --git a/crates/voicevox_core/src/text_analyzer.rs b/crates/voicevox_core/src/text_analyzer.rs index 7f842c1f2..33c886549 100644 --- a/crates/voicevox_core/src/text_analyzer.rs +++ b/crates/voicevox_core/src/text_analyzer.rs @@ -18,6 +18,9 @@ impl KanaAnalyzer { impl TextAnalyzer for KanaAnalyzer { fn analyze(&self, text: &str) -> Result> { + if text.is_empty() { + return Ok(Vec::new()); + } Ok(parse_kana(text)?) } } @@ -36,74 +39,75 @@ impl TextAnalyzer for OpenJTalkAnalyzer { if text.is_empty() { return Ok(Vec::new()); } - let utterance = Utterance::extract_full_context_label(&self.0, text)?; + Ok(utterance_to_accent_phrases(utterance)) + } +} - let accent_phrases: Vec = utterance - .breath_groups() - .iter() - .enumerate() - .fold(Vec::new(), |mut accum_vec, (i, breath_group)| { - accum_vec.extend(breath_group.accent_phrases().iter().enumerate().map( - |(j, accent_phrase)| { - let moras = accent_phrase - .moras() - .iter() - .map(|mora| { - let mora_text = mora - .phonemes() - .iter() - .map(|phoneme| phoneme.phoneme().to_string()) - .collect::>() - .join(""); - - let (consonant, consonant_length) = - if let Some(consonant) = mora.consonant() { - (Some(consonant.phoneme().to_string()), Some(0.)) - } else { - (None, None) - }; - - MoraModel::new( - mora_to_text(mora_text), - consonant, - consonant_length, - mora.vowel().phoneme().into(), - 0., - 0., - ) - }) - .collect(); - - let pause_mora = if i != utterance.breath_groups().len() - 1 - && j == breath_group.accent_phrases().len() - 1 - { - Some(MoraModel::new( - "、".into(), - None, - None, - "pau".into(), +fn utterance_to_accent_phrases(utterance: Utterance) -> Vec { + let accent_phrases: Vec = utterance.breath_groups().iter().enumerate().fold( + Vec::new(), + |mut accum_vec, (i, breath_group)| { + accum_vec.extend(breath_group.accent_phrases().iter().enumerate().map( + |(j, accent_phrase)| { + let moras = accent_phrase + .moras() + .iter() + .map(|mora| { + let mora_text = mora + .phonemes() + .iter() + .map(|phoneme| phoneme.phoneme().to_string()) + .collect::>() + .join(""); + + let (consonant, consonant_length) = + if let Some(consonant) = mora.consonant() { + (Some(consonant.phoneme().to_string()), Some(0.)) + } else { + (None, None) + }; + + MoraModel::new( + mora_to_text(mora_text), + consonant, + consonant_length, + mora.vowel().phoneme().into(), 0., 0., - )) - } else { - None - }; - - AccentPhraseModel::new( - moras, - *accent_phrase.accent(), - pause_mora, - *accent_phrase.is_interrogative(), - ) - }, - )); - - accum_vec - }); - - Ok(accent_phrases) - } + ) + }) + .collect(); + + let pause_mora = if i != utterance.breath_groups().len() - 1 + && j == breath_group.accent_phrases().len() - 1 + { + Some(MoraModel::new( + "、".into(), + None, + None, + "pau".into(), + 0., + 0., + )) + } else { + None + }; + + AccentPhraseModel::new( + moras, + *accent_phrase.accent(), + pause_mora, + *accent_phrase.is_interrogative(), + ) + }, + )); + + accum_vec + }, + ); + + accent_phrases } fn mora_to_text(mora: impl AsRef) -> String { From 43dff65b4753ca05e659773980a041fc634d1f60 Mon Sep 17 00:00:00 2001 From: Rin Iwai Date: Fri, 2 Feb 2024 03:05:26 +0900 Subject: [PATCH 4/8] =?UTF-8?q?Synthesizer=E3=81=AE=E5=BD=A2?= =?UTF-8?q?=E3=81=A7=E5=91=BC=E3=81=B3=E5=87=BA=E3=81=99=E6=99=82=E3=81=AE?= =?UTF-8?q?=E3=81=9F=E3=82=81=E3=81=AB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/voicevox_core/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/voicevox_core/src/lib.rs b/crates/voicevox_core/src/lib.rs index b4bf19fda..f0a948354 100644 --- a/crates/voicevox_core/src/lib.rs +++ b/crates/voicevox_core/src/lib.rs @@ -12,13 +12,13 @@ mod numerics; mod result; mod synthesizer; mod task; -mod text_analyzer; mod user_dict; mod version; mod voice_model; pub mod __internal; pub mod blocking; +pub mod text_analyzer; pub mod tokio; #[cfg(test)] From 38551f81c05e9013a3e43323fc7078afa5898e59 Mon Sep 17 00:00:00 2001 From: Rin Iwai Date: Fri, 2 Feb 2024 03:30:03 +0900 Subject: [PATCH 5/8] =?UTF-8?q?TextAnalyzer=E3=81=ABClone=E8=BF=BD?= =?UTF-8?q?=E5=8A=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/voicevox_core/src/text_analyzer.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crates/voicevox_core/src/text_analyzer.rs b/crates/voicevox_core/src/text_analyzer.rs index 33c886549..3a5e4a442 100644 --- a/crates/voicevox_core/src/text_analyzer.rs +++ b/crates/voicevox_core/src/text_analyzer.rs @@ -8,6 +8,7 @@ pub trait TextAnalyzer { } /// AquesTalk風記法からAccentPhraseの配列を生成するTextAnalyzer +#[derive(Clone)] pub struct KanaAnalyzer; impl KanaAnalyzer { @@ -26,6 +27,7 @@ impl TextAnalyzer for KanaAnalyzer { } /// OpenJtalkからAccentPhraseの配列を生成するTextAnalyzer +#[derive(Clone)] pub struct OpenJTalkAnalyzer(O); impl OpenJTalkAnalyzer { From a37b05f30669ad5b1e32bd50c1c6992682ee284c Mon Sep 17 00:00:00 2001 From: Rin Iwai <72665456+eyr1n@users.noreply.github.com> Date: Thu, 8 Feb 2024 05:04:20 +0900 Subject: [PATCH 6/8] Update crates/voicevox_core/src/text_analyzer.rs Co-authored-by: Ryo Yamashita --- crates/voicevox_core/src/text_analyzer.rs | 6 ------ 1 file changed, 6 deletions(-) diff --git a/crates/voicevox_core/src/text_analyzer.rs b/crates/voicevox_core/src/text_analyzer.rs index 3a5e4a442..e91d94b9c 100644 --- a/crates/voicevox_core/src/text_analyzer.rs +++ b/crates/voicevox_core/src/text_analyzer.rs @@ -11,12 +11,6 @@ pub trait TextAnalyzer { #[derive(Clone)] pub struct KanaAnalyzer; -impl KanaAnalyzer { - pub fn new() -> Self { - Self {} - } -} - impl TextAnalyzer for KanaAnalyzer { fn analyze(&self, text: &str) -> Result> { if text.is_empty() { From 60c7ca50f301bc4d7eed6a1d7b1faf0477955660 Mon Sep 17 00:00:00 2001 From: Rin Iwai Date: Thu, 8 Feb 2024 05:08:47 +0900 Subject: [PATCH 7/8] remove KanaAnalyzer::new() --- crates/voicevox_core/src/synthesizer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/voicevox_core/src/synthesizer.rs b/crates/voicevox_core/src/synthesizer.rs index 1a439e234..f369eee7e 100644 --- a/crates/voicevox_core/src/synthesizer.rs +++ b/crates/voicevox_core/src/synthesizer.rs @@ -179,7 +179,7 @@ pub(crate) mod blocking { return Ok(Self { status, open_jtalk_analyzer: OpenJTalkAnalyzer::new(open_jtalk), - kana_analyzer: KanaAnalyzer::new(), + kana_analyzer: KanaAnalyzer {}, use_gpu, }); From e5524d3e2a047f7f5492152dd05aa0f4d7b422c5 Mon Sep 17 00:00:00 2001 From: Rin Iwai Date: Thu, 15 Feb 2024 16:58:15 +0900 Subject: [PATCH 8/8] =?UTF-8?q?mora=5Fto=5Ftext=E3=82=92=E7=B5=B1=E5=90=88?= =?UTF-8?q?=EF=BC=8C=E7=B4=B0=E3=81=8B=E3=81=84fix?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/voicevox_core/src/synthesizer.rs | 21 +++------------------ crates/voicevox_core/src/text_analyzer.rs | 2 +- 2 files changed, 4 insertions(+), 19 deletions(-) diff --git a/crates/voicevox_core/src/synthesizer.rs b/crates/voicevox_core/src/synthesizer.rs index f369eee7e..1ee16ace0 100644 --- a/crates/voicevox_core/src/synthesizer.rs +++ b/crates/voicevox_core/src/synthesizer.rs @@ -80,7 +80,7 @@ pub(crate) mod blocking { use enum_map::enum_map; use crate::{ - engine::{self, create_kana, MoraModel, OjtPhoneme}, + engine::{create_kana, MoraModel, OjtPhoneme}, error::ErrorRepr, infer::{ domain::{ @@ -92,7 +92,7 @@ pub(crate) mod blocking { InferenceSessionOptions, }, numerics::F32Ext as _, - text_analyzer::{KanaAnalyzer, OpenJTalkAnalyzer, TextAnalyzer}, + text_analyzer::{mora_to_text, KanaAnalyzer, OpenJTalkAnalyzer, TextAnalyzer}, AccentPhraseModel, AudioQueryModel, FullcontextExtractor, Result, StyleId, SupportedDevices, SynthesisOptions, VoiceModelId, VoiceModelMeta, }; @@ -179,7 +179,7 @@ pub(crate) mod blocking { return Ok(Self { status, open_jtalk_analyzer: OpenJTalkAnalyzer::new(open_jtalk), - kana_analyzer: KanaAnalyzer {}, + kana_analyzer: KanaAnalyzer, use_gpu, }); @@ -1111,21 +1111,6 @@ pub(crate) mod blocking { (consonant_phoneme_list, vowel_phoneme_list, vowel_indexes) } - fn mora_to_text(mora: impl AsRef) -> String { - let last_char = mora.as_ref().chars().last().unwrap(); - let mora = if ['A', 'I', 'U', 'E', 'O'].contains(&last_char) { - format!( - "{}{}", - &mora.as_ref()[0..mora.as_ref().len() - 1], - last_char.to_lowercase() - ) - } else { - mora.as_ref().to_string() - }; - // もしカタカナに変換できなければ、引数で与えた文字列がそのまま返ってくる - engine::mora2text(&mora).to_string() - } - impl AudioQueryModel { fn from_accent_phrases(accent_phrases: Vec) -> Self { let kana = create_kana(&accent_phrases); diff --git a/crates/voicevox_core/src/text_analyzer.rs b/crates/voicevox_core/src/text_analyzer.rs index e91d94b9c..5ecb89d56 100644 --- a/crates/voicevox_core/src/text_analyzer.rs +++ b/crates/voicevox_core/src/text_analyzer.rs @@ -106,7 +106,7 @@ fn utterance_to_accent_phrases(utterance: Utterance) -> Vec { accent_phrases } -fn mora_to_text(mora: impl AsRef) -> String { +pub fn mora_to_text(mora: impl AsRef) -> String { let last_char = mora.as_ref().chars().last().unwrap(); let mora = if ['A', 'I', 'U', 'E', 'O'].contains(&last_char) { format!(