From 81181647b47cc645640f3346eb946da8a72c016d Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Thu, 31 Oct 2024 01:40:48 +0900 Subject: [PATCH 1/3] =?UTF-8?q?refactor:=20`Synthesizer`=E3=81=AE=E5=AE=9F?= =?UTF-8?q?=E8=A3=85=E3=82=92`Inner<=5F,=20impl=20Async>`=E3=81=AB?= =?UTF-8?q?=E9=9B=86=E7=B4=84=E3=81=99=E3=82=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/voicevox_core/src/status.rs | 8 +- crates/voicevox_core/src/synthesizer.rs | 1090 +++++++++++------ crates/voicevox_core/src/voice_model.rs | 48 +- .../tests/e2e/snapshots.toml | 20 +- crates/voicevox_core_python_api/src/lib.rs | 4 +- 5 files changed, 760 insertions(+), 410 deletions(-) diff --git a/crates/voicevox_core/src/status.rs b/crates/voicevox_core/src/status.rs index 3234c27c4..64986f627 100644 --- a/crates/voicevox_core/src/status.rs +++ b/crates/voicevox_core/src/status.rs @@ -415,8 +415,8 @@ mod tests { }, ); let model = &crate::nonblocking::VoiceModelFile::sample().await.unwrap(); - let model_contents = &model.read_inference_models().await.unwrap(); - let result = status.insert_model(model.header(), model_contents); + let model_contents = &model.inner().read_inference_models().await.unwrap(); + let result = status.insert_model(model.inner().header(), model_contents); assert_debug_fmt_eq!(Ok(()), result); assert_eq!(1, status.loaded_models.lock().unwrap().0.len()); } @@ -431,8 +431,8 @@ mod tests { }, ); let vvm = &crate::nonblocking::VoiceModelFile::sample().await.unwrap(); - let model_header = vvm.header(); - let model_contents = &vvm.read_inference_models().await.unwrap(); + let model_header = vvm.inner().header(); + let model_contents = &vvm.inner().read_inference_models().await.unwrap(); assert!( !status.is_loaded_model(model_header.manifest.id), "model should not be loaded" diff --git a/crates/voicevox_core/src/synthesizer.rs b/crates/voicevox_core/src/synthesizer.rs index adf73cb68..aa16759a2 100644 --- a/crates/voicevox_core/src/synthesizer.rs +++ b/crates/voicevox_core/src/synthesizer.rs @@ -1,15 +1,4 @@ -// TODO: `VoiceModelFile`のように、次のような設計にする。 -// -// ``` -// pub(crate) mod blocking { -// pub struct Synthesizer(Inner); -// // … -// } -// pub(crate) mod nonblocking { -// pub struct Synthesizer(Inner); -// // … -// } -// ``` +use crate::asyncs::{Async, BlockingThreadPool, SingleTasked}; /// [`blocking::Synthesizer::synthesis`]および[`nonblocking::Synthesizer::synthesis`]のオプション。 /// @@ -79,12 +68,44 @@ pub struct InitializeOptions { pub cpu_num_threads: u16, } -pub(crate) mod blocking { +trait AsyncForOnnxruntime: Async { + async fn unblock(f: F) -> T + where + F: FnOnce() -> T + Send + 'static, + T: Send + 'static; +} + +impl AsyncForOnnxruntime for SingleTasked { + async fn unblock(f: F) -> T + where + F: FnOnce() -> T + Send + 'static, + T: Send + 'static, + { + f() + } +} + +impl AsyncForOnnxruntime for BlockingThreadPool { + async fn unblock(f: F) -> T + where + F: FnOnce() -> T + Send + 'static, + T: Send + 'static, + { + ::blocking::unblock(f).await + } +} + +mod inner { use enum_map::enum_map; - use std::io::{Cursor, Write as _}; + use std::{ + io::{Cursor, Write as _}, + marker::PhantomData, + sync::Arc, + }; use tracing::info; use crate::{ + asyncs::{BlockingThreadPool, SingleTasked}, devices::{DeviceSpec, GpuSpec}, engine::{create_kana, mora_to_text, wav_from_s16le, Mora, OjtPhoneme}, error::ErrorRepr, @@ -95,15 +116,15 @@ pub(crate) mod blocking { PredictIntonationOutput, RenderAudioSegmentInput, RenderAudioSegmentOutput, TalkDomain, TalkOperation, }, - InferenceRuntime as _, InferenceSessionOptions, + InferenceRuntime, InferenceSessionOptions, }, status::Status, text_analyzer::{KanaAnalyzer, OpenJTalkAnalyzer, TextAnalyzer}, - AccentPhrase, AudioQuery, FullcontextExtractor, Result, StyleId, SynthesisOptions, - VoiceModelId, VoiceModelMeta, + voice_model, AccentPhrase, AudioQuery, FullcontextExtractor, Result, StyleId, + SynthesisOptions, VoiceModelId, VoiceModelMeta, }; - use super::{AccelerationMode, InitializeOptions, TtsOptions}; + use super::{AccelerationMode, AsyncForOnnxruntime, InitializeOptions, TtsOptions}; const DEFAULT_SAMPLING_RATE: u32 = 24000; @@ -123,52 +144,28 @@ pub(crate) mod blocking { audio_query: AudioQuery, } - /// 音声シンセサイザ。 - pub struct Synthesizer { - pub(super) status: Status, + pub struct Inner { + status: Arc>, open_jtalk_analyzer: OpenJTalkAnalyzer, kana_analyzer: KanaAnalyzer, use_gpu: bool, + _marker: PhantomData A>, } - impl self::Synthesizer { - /// `Synthesizer`をコンストラクトする。 - /// - /// # Example - /// - #[cfg_attr(feature = "load-onnxruntime", doc = "```")] - #[cfg_attr(not(feature = "load-onnxruntime"), doc = "```compile_fail")] - /// # fn main() -> anyhow::Result<()> { - /// # use test_util::{ONNXRUNTIME_DYLIB_PATH, OPEN_JTALK_DIC_DIR}; - /// # - /// # const ACCELERATION_MODE: AccelerationMode = AccelerationMode::Cpu; - /// # - /// use std::sync::Arc; - /// - /// use voicevox_core::{ - /// blocking::{Onnxruntime, OpenJtalk, Synthesizer}, - /// AccelerationMode, InitializeOptions, - /// }; - /// - /// # if cfg!(windows) { - /// # // Windows\System32\onnxruntime.dllを回避 - /// # voicevox_core::blocking::Onnxruntime::load_once() - /// # .filename(test_util::ONNXRUNTIME_DYLIB_PATH) - /// # .exec()?; - /// # } - /// let mut syntesizer = Synthesizer::new( - /// Onnxruntime::load_once().exec()?, - /// Arc::new(OpenJtalk::new(OPEN_JTALK_DIC_DIR).unwrap()), - /// &InitializeOptions { - /// acceleration_mode: ACCELERATION_MODE, - /// ..Default::default() - /// }, - /// )?; - /// # - /// # Ok(()) - /// # } - /// ``` - pub fn new( + impl From> for Inner { + fn from(from: Inner) -> Self { + Self { + status: from.status, + open_jtalk_analyzer: from.open_jtalk_analyzer, + kana_analyzer: KanaAnalyzer, + use_gpu: from.use_gpu, + _marker: PhantomData, + } + } + } + + impl Inner { + pub(super) fn new( onnxruntime: &'static crate::blocking::Onnxruntime, open_jtalk: O, options: &InitializeOptions, @@ -225,7 +222,8 @@ pub(crate) mod blocking { TalkOperation::RenderAudioSegment => heavy_session_options, }, }, - ); + ) + .into(); let use_gpu = matches!(device_for_heavy, DeviceSpec::Gpu(_)); @@ -234,46 +232,44 @@ pub(crate) mod blocking { open_jtalk_analyzer: OpenJTalkAnalyzer::new(open_jtalk), kana_analyzer: KanaAnalyzer, use_gpu, + _marker: PhantomData, }) } - pub fn onnxruntime(&self) -> &'static crate::blocking::Onnxruntime { + pub(super) fn onnxruntime(&self) -> &'static crate::blocking::Onnxruntime { self.status.rt } - /// ハードウェアアクセラレーションがGPUモードか判定する。 - pub fn is_gpu_mode(&self) -> bool { + pub(super) fn is_gpu_mode(&self) -> bool { self.use_gpu } - /// 音声モデルを読み込む。 - pub fn load_voice_model(&self, model: &crate::blocking::VoiceModelFile) -> Result<()> { - let model_bytes = &model.read_inference_models()?; + pub(super) async fn load_voice_model( + &self, + model: &voice_model::Inner, + ) -> crate::Result<()> { + let model_bytes = &model.read_inference_models().await?; + // TODO: 重い操作なので、asyncにする self.status.insert_model(model.header(), model_bytes) } - /// 音声モデルの読み込みを解除する。 - pub fn unload_voice_model(&self, voice_model_id: VoiceModelId) -> Result<()> { + pub(super) fn unload_voice_model(&self, voice_model_id: VoiceModelId) -> Result<()> { self.status.unload_model(voice_model_id) } - /// 指定したIDの音声モデルが読み込まれているか判定する。 - pub fn is_loaded_voice_model(&self, voice_model_id: VoiceModelId) -> bool { + pub(super) fn is_loaded_voice_model(&self, voice_model_id: VoiceModelId) -> bool { self.status.is_loaded_model(voice_model_id) } - #[doc(hidden)] - pub fn is_loaded_model_by_style_id(&self, style_id: StyleId) -> bool { + pub(super) fn is_loaded_model_by_style_id(&self, style_id: StyleId) -> bool { self.status.is_loaded_model_by_style_id(style_id) } - /// 今読み込んでいる音声モデルのメタ情報を返す。 - pub fn metas(&self) -> VoiceModelMeta { + pub(super) fn metas(&self) -> VoiceModelMeta { self.status.metas() } - /// AudioQueryから音声合成用の中間表現を生成する。 - pub fn precompute_render( + pub(super) async fn precompute_render( &self, audio_query: &AudioQuery, style_id: StyleId, @@ -393,13 +389,15 @@ pub(crate) mod blocking { padding_size, ); - let spec = self.generate_full_intermediate( - f0_with_padding.len(), - OjtPhoneme::num_phoneme(), - &f0_with_padding, - &phoneme_with_padding, - style_id, - )?; + let spec = self + .generate_full_intermediate( + f0_with_padding.len(), + OjtPhoneme::num_phoneme(), + &f0_with_padding, + &phoneme_with_padding, + style_id, + ) + .await?; return Ok(AudioFeature { internal_state: spec, style_id, @@ -499,8 +497,12 @@ pub(crate) mod blocking { } } - /// 中間表現から16bit PCMで音声波形を生成する。 - pub fn render(&self, audio: &AudioFeature, start: usize, end: usize) -> Result> { + pub(super) async fn render( + &self, + audio: &AudioFeature, + start: usize, + end: usize, + ) -> Result> { // TODO: 44.1kHzなどの対応 const MARGIN: usize = 14; // 使われているHifiGANのreceptive fieldから計算される安全マージン use std::cmp::min; @@ -526,8 +528,9 @@ pub(crate) mod blocking { let segment = audio .internal_state .slice(ndarray::s![slice_start..slice_end, ..]); - let wave_with_margin = - self.render_audio_segment(segment.into_owned(), audio.style_id)?; + let wave_with_margin = self + .render_audio_segment(segment.into_owned(), audio.style_id) + .await?; // 変換前に追加した安全マージンを生成音声から取り除く let wave = wave_with_margin .slice(ndarray::s![ @@ -565,15 +568,16 @@ pub(crate) mod blocking { } } - /// AudioQueryから直接WAVフォーマットで音声波形を生成する。 - pub fn synthesis( + pub(super) async fn synthesis( &self, audio_query: &AudioQuery, style_id: StyleId, options: &SynthesisOptions, ) -> Result> { - let audio = self.precompute_render(audio_query, style_id, options)?; - let pcm = self.render(&audio, 0, audio.frame_length)?; + let audio = self + .precompute_render(audio_query, style_id, options) + .await?; + let pcm = self.render(&audio, 0, audio.frame_length).await?; Ok(wav_from_s16le( &pcm, audio_query.output_sampling_rate, @@ -581,53 +585,27 @@ pub(crate) mod blocking { )) } - /// AquesTalk風記法からAccentPhrase (アクセント句)の配列を生成する。 - /// - /// # Example - /// - /// ``` - /// # fn main() -> anyhow::Result<()> { - /// # use pollster::FutureExt as _; - /// # use voicevox_core::__internal::doctest_fixtures::IntoBlocking as _; - /// # - /// # let synthesizer = - /// # voicevox_core::__internal::doctest_fixtures::synthesizer_with_sample_voice_model( - /// # test_util::SAMPLE_VOICE_MODEL_FILE_PATH, - /// # test_util::ONNXRUNTIME_DYLIB_PATH, - /// # test_util::OPEN_JTALK_DIC_DIR, - /// # ) - /// # .block_on()? - /// # .into_blocking(); - /// # - /// use voicevox_core::StyleId; - /// - /// let accent_phrases = synthesizer - /// .create_accent_phrases_from_kana("コンニチワ'", StyleId::new(302))?; - /// # - /// # Ok(()) - /// # } - /// ``` - pub fn create_accent_phrases_from_kana( + pub(super) async fn create_accent_phrases_from_kana( &self, kana: &str, style_id: StyleId, ) -> Result> { let accent_phrases = self.kana_analyzer.analyze(kana)?; - self.replace_mora_data(&accent_phrases, style_id) + self.replace_mora_data(&accent_phrases, style_id).await } - /// AccentPhraseの配列の音高・音素長を、特定の声で生成しなおす。 - pub fn replace_mora_data( + pub(super) async fn replace_mora_data( &self, accent_phrases: &[AccentPhrase], style_id: StyleId, ) -> Result> { - let accent_phrases = self.replace_phoneme_length(accent_phrases, style_id)?; - self.replace_mora_pitch(&accent_phrases, style_id) + let accent_phrases = self + .replace_phoneme_length(accent_phrases, style_id) + .await?; + self.replace_mora_pitch(&accent_phrases, style_id).await } - /// AccentPhraseの配列の音素長を、特定の声で生成しなおす。 - pub fn replace_phoneme_length( + pub(super) async fn replace_phoneme_length( &self, accent_phrases: &[AccentPhrase], style_id: StyleId, @@ -640,7 +618,7 @@ pub(crate) mod blocking { .iter() .map(|phoneme_data| phoneme_data.phoneme_id()) .collect(); - let phoneme_length = self.predict_duration(&phoneme_list_s, style_id)?; + let phoneme_length = self.predict_duration(&phoneme_list_s, style_id).await?; let mut index = 0; let new_accent_phrases = accent_phrases @@ -677,8 +655,7 @@ pub(crate) mod blocking { Ok(new_accent_phrases) } - /// AccentPhraseの配列の音高を、特定の声で生成しなおす。 - pub fn replace_mora_pitch( + pub(super) async fn replace_mora_pitch( &self, accent_phrases: &[AccentPhrase], style_id: StyleId, @@ -727,16 +704,18 @@ pub(crate) mod blocking { end_accent_phrase_list.push(base_end_accent_phrase_list[vowel_index as usize]); } - let mut f0_list = self.predict_intonation( - vowel_phoneme_list.len(), - &vowel_phoneme_list, - &consonant_phoneme_list, - &start_accent_list, - &end_accent_list, - &start_accent_phrase_list, - &end_accent_phrase_list, - style_id, - )?; + let mut f0_list = self + .predict_intonation( + vowel_phoneme_list.len(), + &vowel_phoneme_list, + &consonant_phoneme_list, + &start_accent_list, + &end_accent_list, + &start_accent_phrase_list, + &end_accent_phrase_list, + style_id, + ) + .await?; for i in 0..vowel_phoneme_data_list.len() { const UNVOICED_MORA_PHONEME_LIST: &[&str] = &["A", "I", "U", "E", "O", "cl", "pau"]; @@ -802,124 +781,59 @@ pub(crate) mod blocking { } } - /// AquesTalk風記法から[AudioQuery]を生成する。 - /// - /// # Example - /// - /// ``` - /// # fn main() -> anyhow::Result<()> { - /// # use pollster::FutureExt as _; - /// # use voicevox_core::__internal::doctest_fixtures::IntoBlocking as _; - /// # - /// # let synthesizer = - /// # voicevox_core::__internal::doctest_fixtures::synthesizer_with_sample_voice_model( - /// # test_util::SAMPLE_VOICE_MODEL_FILE_PATH, - /// # test_util::ONNXRUNTIME_DYLIB_PATH, - /// # test_util::OPEN_JTALK_DIC_DIR, - /// # ) - /// # .block_on()? - /// # .into_blocking(); - /// # - /// use voicevox_core::StyleId; - /// - /// let audio_query = synthesizer.audio_query_from_kana("コンニチワ'", StyleId::new(302))?; - /// # - /// # Ok(()) - /// # } - /// ``` - /// - /// [AudioQuery]: crate::AudioQuery - pub fn audio_query_from_kana(&self, kana: &str, style_id: StyleId) -> Result { - let accent_phrases = self.create_accent_phrases_from_kana(kana, style_id)?; + pub(super) async fn audio_query_from_kana( + &self, + kana: &str, + style_id: StyleId, + ) -> Result { + let accent_phrases = self.create_accent_phrases_from_kana(kana, style_id).await?; Ok(AudioQuery::from_accent_phrases(accent_phrases).with_kana(Some(kana.to_owned()))) } - /// AquesTalk風記法から音声合成を行う。 - pub fn tts_from_kana( + pub(super) async fn tts_from_kana( &self, kana: &str, style_id: StyleId, options: &TtsOptions, ) -> Result> { - let audio_query = &self.audio_query_from_kana(kana, style_id)?; + let audio_query = &self.audio_query_from_kana(kana, style_id).await?; self.synthesis(audio_query, style_id, &SynthesisOptions::from(options)) + .await } } - impl self::Synthesizer { - /// 日本語のテキストからAccentPhrase (アクセント句)の配列を生成する。 - /// - /// # Example - /// - /// ``` - /// # fn main() -> anyhow::Result<()> { - /// # use pollster::FutureExt as _; - /// # use voicevox_core::__internal::doctest_fixtures::IntoBlocking as _; - /// # - /// # let synthesizer = - /// # voicevox_core::__internal::doctest_fixtures::synthesizer_with_sample_voice_model( - /// # test_util::SAMPLE_VOICE_MODEL_FILE_PATH, - /// # test_util::ONNXRUNTIME_DYLIB_PATH, - /// # test_util::OPEN_JTALK_DIC_DIR, - /// # ) - /// # .block_on()? - /// # .into_blocking(); - /// # - /// use voicevox_core::StyleId; - /// - /// let accent_phrases = synthesizer.create_accent_phrases("こんにちは", StyleId::new(302))?; - /// # - /// # Ok(()) - /// # } - /// ``` - pub fn create_accent_phrases( + impl Inner { + pub(super) async fn create_accent_phrases( &self, text: &str, style_id: StyleId, ) -> Result> { let accent_phrases = self.open_jtalk_analyzer.analyze(text)?; - self.replace_mora_data(&accent_phrases, style_id) + self.replace_mora_data(&accent_phrases, style_id).await } - /// 日本語のテキストから[AudioQuery]を生成する。 - /// - /// # Examples - /// - /// ``` - /// # fn main() -> anyhow::Result<()> { - /// # use pollster::FutureExt as _; - /// # use voicevox_core::__internal::doctest_fixtures::IntoBlocking as _; - /// # - /// # let synthesizer = - /// # voicevox_core::__internal::doctest_fixtures::synthesizer_with_sample_voice_model( - /// # test_util::SAMPLE_VOICE_MODEL_FILE_PATH, - /// # test_util::ONNXRUNTIME_DYLIB_PATH, - /// # test_util::OPEN_JTALK_DIC_DIR, - /// # ) - /// # .block_on()? - /// # .into_blocking(); - /// # - /// use voicevox_core::StyleId; - /// - /// let audio_query = synthesizer.audio_query("こんにちは", StyleId::new(302))?; - /// # - /// # Ok(()) - /// # } - /// ``` - /// - /// [AudioQuery]: crate::AudioQuery - pub fn audio_query(&self, text: &str, style_id: StyleId) -> Result { - let accent_phrases = self.create_accent_phrases(text, style_id)?; + pub(super) async fn audio_query( + &self, + text: &str, + style_id: StyleId, + ) -> Result { + let accent_phrases = self.create_accent_phrases(text, style_id).await?; Ok(AudioQuery::from_accent_phrases(accent_phrases)) } - /// 日本語のテキストから音声合成を行う。 - pub fn tts(&self, text: &str, style_id: StyleId, options: &TtsOptions) -> Result> { - let audio_query = &self.audio_query(text, style_id)?; + pub(super) async fn tts( + &self, + text: &str, + style_id: StyleId, + options: &TtsOptions, + ) -> Result> { + let audio_query = &self.audio_query(text, style_id).await?; self.synthesis(audio_query, style_id, &SynthesisOptions::from(options)) + .await } } + // TODO: `mod blocking`に移動する pub trait PerformInference { /// `predict_duration`を実行する。 /// @@ -980,33 +894,23 @@ pub(crate) mod blocking { ) -> Result>; } - impl PerformInference for self::Synthesizer { - fn predict_duration(&self, phoneme_vector: &[i64], style_id: StyleId) -> Result> { - let (model_id, inner_voice_id) = self.status.ids_for::(style_id)?; - - let PredictDurationOutput { - phoneme_length: output, - } = self.status.run_session( - model_id, - PredictDurationInput { - phoneme_list: ndarray::arr1(phoneme_vector), - speaker_id: ndarray::arr1(&[inner_voice_id.raw_id().into()]), - }, - )?; - let mut output = output.into_raw_vec(); - - for output_item in output.iter_mut() { - if *output_item < PHONEME_LENGTH_MINIMAL { - *output_item = PHONEME_LENGTH_MINIMAL; - } - } - - return Ok(output); - - const PHONEME_LENGTH_MINIMAL: f32 = 0.01; + impl Inner { + pub(super) async fn predict_duration( + &self, + phoneme_vector: &[i64], + style_id: StyleId, + ) -> Result> { + let status = self.status.clone(); + let phoneme_vector = ndarray::arr1(phoneme_vector); + A::unblock(move || status.predict_duration(phoneme_vector, style_id)).await } - fn predict_intonation( + #[expect( + clippy::too_many_arguments, + reason = "compatible_engineでの`predict_intonation`の形を考えると、ここの引数を構造体に\ + まとめたりしても可読性に寄与しない" + )] + pub(super) async fn predict_intonation( &self, length: usize, vowel_phoneme_vector: &[i64], @@ -1017,26 +921,29 @@ pub(crate) mod blocking { end_accent_phrase_vector: &[i64], style_id: StyleId, ) -> Result> { - let (model_id, inner_voice_id) = self.status.ids_for::(style_id)?; - - let PredictIntonationOutput { f0_list: output } = self.status.run_session( - model_id, - PredictIntonationInput { - length: ndarray::arr0(length as i64), - vowel_phoneme_list: ndarray::arr1(vowel_phoneme_vector), - consonant_phoneme_list: ndarray::arr1(consonant_phoneme_vector), - start_accent_list: ndarray::arr1(start_accent_vector), - end_accent_list: ndarray::arr1(end_accent_vector), - start_accent_phrase_list: ndarray::arr1(start_accent_phrase_vector), - end_accent_phrase_list: ndarray::arr1(end_accent_phrase_vector), - speaker_id: ndarray::arr1(&[inner_voice_id.raw_id().into()]), - }, - )?; - - Ok(output.into_raw_vec()) + let status = self.status.clone(); + let vowel_phoneme_vector = ndarray::arr1(vowel_phoneme_vector); + let consonant_phoneme_vector = ndarray::arr1(consonant_phoneme_vector); + let start_accent_vector = ndarray::arr1(start_accent_vector); + let end_accent_vector = ndarray::arr1(end_accent_vector); + let start_accent_phrase_vector = ndarray::arr1(start_accent_phrase_vector); + let end_accent_phrase_vector = ndarray::arr1(end_accent_phrase_vector); + A::unblock(move || { + status.predict_intonation( + length, + vowel_phoneme_vector, + consonant_phoneme_vector, + start_accent_vector, + end_accent_vector, + start_accent_phrase_vector, + end_accent_phrase_vector, + style_id, + ) + }) + .await } - fn generate_full_intermediate( + pub(super) async fn generate_full_intermediate( &self, length: usize, phoneme_size: usize, @@ -1044,15 +951,127 @@ pub(crate) mod blocking { phoneme_vector: &[f32], style_id: StyleId, ) -> Result> { - let (model_id, inner_voice_id) = self.status.ids_for::(style_id)?; + let status = self.status.clone(); + let f0 = ndarray::arr1(f0); + let phoneme_vector = ndarray::arr1(phoneme_vector); + A::unblock(move || { + status.generate_full_intermediate( + length, + phoneme_size, + f0, + phoneme_vector, + style_id, + ) + }) + .await + } + + pub(super) async fn render_audio_segment( + &self, + spec: ndarray::Array2, + style_id: StyleId, + ) -> Result> { + let status = self.status.clone(); + A::unblock(move || status.render_audio_segment(spec, style_id)).await + } - let GenerateFullIntermediateOutput { spec } = self.status.run_session( + pub(super) async fn decode( + &self, + length: usize, + phoneme_size: usize, + f0: &[f32], + phoneme_vector: &[f32], + style_id: StyleId, + ) -> Result> { + let status = self.status.clone(); + let f0 = ndarray::arr1(f0); + let phoneme_vector = ndarray::arr1(phoneme_vector); + A::unblock(move || status.decode(length, phoneme_size, f0, phoneme_vector, style_id)) + .await + } + } + + // CPU/GPU-bound + impl Status { + fn predict_duration( + &self, + phoneme_vector: ndarray::Array1, + style_id: StyleId, + ) -> Result> { + let (model_id, inner_voice_id) = self.ids_for::(style_id)?; + + let PredictDurationOutput { + phoneme_length: output, + } = self.run_session( + model_id, + PredictDurationInput { + phoneme_list: phoneme_vector, + speaker_id: ndarray::arr1(&[inner_voice_id.raw_id().into()]), + }, + )?; + let mut output = output.into_raw_vec(); + + for output_item in output.iter_mut() { + if *output_item < PHONEME_LENGTH_MINIMAL { + *output_item = PHONEME_LENGTH_MINIMAL; + } + } + + return Ok(output); + + const PHONEME_LENGTH_MINIMAL: f32 = 0.01; + } + + #[expect( + clippy::too_many_arguments, + reason = "compatible_engineでの`predict_intonation`の形を考えると、ここの引数を構造体に\ + まとめたりしても可読性に寄与しない" + )] + fn predict_intonation( + &self, + length: usize, + vowel_phoneme_vector: ndarray::Array1, + consonant_phoneme_vector: ndarray::Array1, + start_accent_vector: ndarray::Array1, + end_accent_vector: ndarray::Array1, + start_accent_phrase_vector: ndarray::Array1, + end_accent_phrase_vector: ndarray::Array1, + style_id: StyleId, + ) -> Result> { + let (model_id, inner_voice_id) = self.ids_for::(style_id)?; + + let PredictIntonationOutput { f0_list: output } = self.run_session( + model_id, + PredictIntonationInput { + length: ndarray::arr0(length as i64), + vowel_phoneme_list: vowel_phoneme_vector, + consonant_phoneme_list: consonant_phoneme_vector, + start_accent_list: start_accent_vector, + end_accent_list: end_accent_vector, + start_accent_phrase_list: start_accent_phrase_vector, + end_accent_phrase_list: end_accent_phrase_vector, + speaker_id: ndarray::arr1(&[inner_voice_id.raw_id().into()]), + }, + )?; + + Ok(output.into_raw_vec()) + } + + fn generate_full_intermediate( + &self, + length: usize, + phoneme_size: usize, + f0: ndarray::Array1, + phoneme_vector: ndarray::Array1, + style_id: StyleId, + ) -> Result> { + let (model_id, inner_voice_id) = self.ids_for::(style_id)?; + + let GenerateFullIntermediateOutput { spec } = self.run_session( model_id, GenerateFullIntermediateInput { - f0: ndarray::arr1(f0).into_shape([length, 1]).unwrap(), - phoneme: ndarray::arr1(phoneme_vector) - .into_shape([length, phoneme_size]) - .unwrap(), + f0: f0.into_shape([length, 1]).unwrap(), + phoneme: phoneme_vector.into_shape([length, phoneme_size]).unwrap(), speaker_id: ndarray::arr1(&[inner_voice_id.raw_id().into()]), }, )?; @@ -1064,10 +1083,9 @@ pub(crate) mod blocking { spec: ndarray::Array2, style_id: StyleId, ) -> Result> { - let (model_id, _inner_voice_id) = self.status.ids_for::(style_id)?; - let RenderAudioSegmentOutput { wave } = self - .status - .run_session(model_id, RenderAudioSegmentInput { spec })?; + let (model_id, _inner_voice_id) = self.ids_for::(style_id)?; + let RenderAudioSegmentOutput { wave } = + self.run_session(model_id, RenderAudioSegmentInput { spec })?; Ok(wave) } @@ -1075,8 +1093,8 @@ pub(crate) mod blocking { &self, length: usize, phoneme_size: usize, - f0: &[f32], - phoneme_vector: &[f32], + f0: ndarray::Array1, + phoneme_vector: ndarray::Array1, style_id: StyleId, ) -> Result> { let intermediate = self.generate_full_intermediate( @@ -1228,17 +1246,403 @@ pub(crate) mod blocking { } } -pub(crate) mod nonblocking { - use std::sync::Arc; +pub(crate) mod blocking { + use crate::{ + asyncs::SingleTasked, future::FutureExt as _, AccentPhrase, AudioQuery, + FullcontextExtractor, StyleId, VoiceModelId, VoiceModelMeta, + }; + + use super::{inner::Inner, InitializeOptions, SynthesisOptions, TtsOptions}; + + pub use super::inner::{AudioFeature, PerformInference}; + + /// 音声シンセサイザ。 + pub struct Synthesizer(pub(super) Inner); + + impl self::Synthesizer { + /// `Synthesizer`をコンストラクトする。 + /// + /// # Example + /// + #[cfg_attr(feature = "load-onnxruntime", doc = "```")] + #[cfg_attr(not(feature = "load-onnxruntime"), doc = "```compile_fail")] + /// # fn main() -> anyhow::Result<()> { + /// # use test_util::{ONNXRUNTIME_DYLIB_PATH, OPEN_JTALK_DIC_DIR}; + /// # + /// # const ACCELERATION_MODE: AccelerationMode = AccelerationMode::Cpu; + /// # + /// use std::sync::Arc; + /// + /// use voicevox_core::{ + /// blocking::{Onnxruntime, OpenJtalk, Synthesizer}, + /// AccelerationMode, InitializeOptions, + /// }; + /// + /// # if cfg!(windows) { + /// # // Windows\System32\onnxruntime.dllを回避 + /// # voicevox_core::blocking::Onnxruntime::load_once() + /// # .filename(test_util::ONNXRUNTIME_DYLIB_PATH) + /// # .exec()?; + /// # } + /// let mut syntesizer = Synthesizer::new( + /// Onnxruntime::load_once().exec()?, + /// Arc::new(OpenJtalk::new(OPEN_JTALK_DIC_DIR).unwrap()), + /// &InitializeOptions { + /// acceleration_mode: ACCELERATION_MODE, + /// ..Default::default() + /// }, + /// )?; + /// # + /// # Ok(()) + /// # } + /// ``` + pub fn new( + onnxruntime: &'static crate::blocking::Onnxruntime, + open_jtalk: O, + options: &InitializeOptions, + ) -> crate::Result { + Inner::new(onnxruntime, open_jtalk, options).map(Self) + } + + pub fn onnxruntime(&self) -> &'static crate::blocking::Onnxruntime { + self.0.onnxruntime() + } + + /// ハードウェアアクセラレーションがGPUモードか判定する。 + pub fn is_gpu_mode(&self) -> bool { + self.0.is_gpu_mode() + } + + /// 音声モデルを読み込む。 + pub fn load_voice_model( + &self, + model: &crate::blocking::VoiceModelFile, + ) -> crate::Result<()> { + self.0.load_voice_model(model.inner()).block_on() + } + + /// 音声モデルの読み込みを解除する。 + pub fn unload_voice_model(&self, voice_model_id: VoiceModelId) -> crate::Result<()> { + self.0.unload_voice_model(voice_model_id) + } + + /// 指定したIDの音声モデルが読み込まれているか判定する。 + pub fn is_loaded_voice_model(&self, voice_model_id: VoiceModelId) -> bool { + self.0.is_loaded_voice_model(voice_model_id) + } + + #[doc(hidden)] + pub fn is_loaded_model_by_style_id(&self, style_id: StyleId) -> bool { + self.0.is_loaded_model_by_style_id(style_id) + } + + /// 今読み込んでいる音声モデルのメタ情報を返す。 + pub fn metas(&self) -> VoiceModelMeta { + self.0.metas() + } + + /// AudioQueryから音声合成用の中間表現を生成する。 + pub fn precompute_render( + &self, + audio_query: &AudioQuery, + style_id: StyleId, + options: &SynthesisOptions, + ) -> crate::Result { + self.0 + .precompute_render(audio_query, style_id, options) + .block_on() + } + + /// 中間表現から16bit PCMで音声波形を生成する。 + pub fn render( + &self, + audio: &AudioFeature, + start: usize, + end: usize, + ) -> crate::Result> { + self.0.render(audio, start, end).block_on() + } + + /// AudioQueryから直接WAVフォーマットで音声波形を生成する。 + pub fn synthesis( + &self, + audio_query: &AudioQuery, + style_id: StyleId, + options: &SynthesisOptions, + ) -> crate::Result> { + self.0.synthesis(audio_query, style_id, options).block_on() + } + + /// AquesTalk風記法からAccentPhrase (アクセント句)の配列を生成する。 + /// + /// # Example + /// + /// ``` + /// # fn main() -> anyhow::Result<()> { + /// # use pollster::FutureExt as _; + /// # use voicevox_core::__internal::doctest_fixtures::IntoBlocking as _; + /// # + /// # let synthesizer = + /// # voicevox_core::__internal::doctest_fixtures::synthesizer_with_sample_voice_model( + /// # test_util::SAMPLE_VOICE_MODEL_FILE_PATH, + /// # test_util::ONNXRUNTIME_DYLIB_PATH, + /// # test_util::OPEN_JTALK_DIC_DIR, + /// # ) + /// # .block_on()? + /// # .into_blocking(); + /// # + /// use voicevox_core::StyleId; + /// + /// let accent_phrases = synthesizer + /// .create_accent_phrases_from_kana("コンニチワ'", StyleId::new(302))?; + /// # + /// # Ok(()) + /// # } + /// ``` + pub fn create_accent_phrases_from_kana( + &self, + kana: &str, + style_id: StyleId, + ) -> crate::Result> { + self.0 + .create_accent_phrases_from_kana(kana, style_id) + .block_on() + } + /// AccentPhraseの配列の音高・音素長を、特定の声で生成しなおす。 + pub fn replace_mora_data( + &self, + accent_phrases: &[AccentPhrase], + style_id: StyleId, + ) -> crate::Result> { + self.0 + .replace_mora_data(accent_phrases, style_id) + .block_on() + } + + /// AccentPhraseの配列の音素長を、特定の声で生成しなおす。 + pub fn replace_phoneme_length( + &self, + accent_phrases: &[AccentPhrase], + style_id: StyleId, + ) -> crate::Result> { + self.0 + .replace_phoneme_length(accent_phrases, style_id) + .block_on() + } + + /// AccentPhraseの配列の音高を、特定の声で生成しなおす。 + pub fn replace_mora_pitch( + &self, + accent_phrases: &[AccentPhrase], + style_id: StyleId, + ) -> crate::Result> { + self.0 + .replace_mora_pitch(accent_phrases, style_id) + .block_on() + } + + /// AquesTalk風記法から[AudioQuery]を生成する。 + /// + /// # Example + /// + /// ``` + /// # fn main() -> anyhow::Result<()> { + /// # use pollster::FutureExt as _; + /// # use voicevox_core::__internal::doctest_fixtures::IntoBlocking as _; + /// # + /// # let synthesizer = + /// # voicevox_core::__internal::doctest_fixtures::synthesizer_with_sample_voice_model( + /// # test_util::SAMPLE_VOICE_MODEL_FILE_PATH, + /// # test_util::ONNXRUNTIME_DYLIB_PATH, + /// # test_util::OPEN_JTALK_DIC_DIR, + /// # ) + /// # .block_on()? + /// # .into_blocking(); + /// # + /// use voicevox_core::StyleId; + /// + /// let audio_query = synthesizer.audio_query_from_kana("コンニチワ'", StyleId::new(302))?; + /// # + /// # Ok(()) + /// # } + /// ``` + /// + /// [AudioQuery]: crate::AudioQuery + pub fn audio_query_from_kana( + &self, + kana: &str, + style_id: StyleId, + ) -> crate::Result { + self.0.audio_query_from_kana(kana, style_id).block_on() + } + + /// AquesTalk風記法から音声合成を行う。 + pub fn tts_from_kana( + &self, + kana: &str, + style_id: StyleId, + options: &TtsOptions, + ) -> crate::Result> { + self.0.tts_from_kana(kana, style_id, options).block_on() + } + } + + impl self::Synthesizer { + /// 日本語のテキストからAccentPhrase (アクセント句)の配列を生成する。 + /// + /// # Example + /// + /// ``` + /// # fn main() -> anyhow::Result<()> { + /// # use pollster::FutureExt as _; + /// # use voicevox_core::__internal::doctest_fixtures::IntoBlocking as _; + /// # + /// # let synthesizer = + /// # voicevox_core::__internal::doctest_fixtures::synthesizer_with_sample_voice_model( + /// # test_util::SAMPLE_VOICE_MODEL_FILE_PATH, + /// # test_util::ONNXRUNTIME_DYLIB_PATH, + /// # test_util::OPEN_JTALK_DIC_DIR, + /// # ) + /// # .block_on()? + /// # .into_blocking(); + /// # + /// use voicevox_core::StyleId; + /// + /// let accent_phrases = synthesizer.create_accent_phrases("こんにちは", StyleId::new(302))?; + /// # + /// # Ok(()) + /// # } + /// ``` + pub fn create_accent_phrases( + &self, + text: &str, + style_id: StyleId, + ) -> crate::Result> { + self.0.create_accent_phrases(text, style_id).block_on() + } + + /// 日本語のテキストから[AudioQuery]を生成する。 + /// + /// # Examples + /// + /// ``` + /// # fn main() -> anyhow::Result<()> { + /// # use pollster::FutureExt as _; + /// # use voicevox_core::__internal::doctest_fixtures::IntoBlocking as _; + /// # + /// # let synthesizer = + /// # voicevox_core::__internal::doctest_fixtures::synthesizer_with_sample_voice_model( + /// # test_util::SAMPLE_VOICE_MODEL_FILE_PATH, + /// # test_util::ONNXRUNTIME_DYLIB_PATH, + /// # test_util::OPEN_JTALK_DIC_DIR, + /// # ) + /// # .block_on()? + /// # .into_blocking(); + /// # + /// use voicevox_core::StyleId; + /// + /// let audio_query = synthesizer.audio_query("こんにちは", StyleId::new(302))?; + /// # + /// # Ok(()) + /// # } + /// ``` + /// + /// [AudioQuery]: crate::AudioQuery + pub fn audio_query(&self, text: &str, style_id: StyleId) -> crate::Result { + self.0.audio_query(text, style_id).block_on() + } + + /// 日本語のテキストから音声合成を行う。 + pub fn tts( + &self, + text: &str, + style_id: StyleId, + options: &TtsOptions, + ) -> crate::Result> { + self.0.tts(text, style_id, options).block_on() + } + } + + impl PerformInference for self::Synthesizer { + fn predict_duration( + &self, + phoneme_vector: &[i64], + style_id: StyleId, + ) -> crate::Result> { + self.0.predict_duration(phoneme_vector, style_id).block_on() + } + + fn predict_intonation( + &self, + length: usize, + vowel_phoneme_vector: &[i64], + consonant_phoneme_vector: &[i64], + start_accent_vector: &[i64], + end_accent_vector: &[i64], + start_accent_phrase_vector: &[i64], + end_accent_phrase_vector: &[i64], + style_id: StyleId, + ) -> crate::Result> { + self.0 + .predict_intonation( + length, + vowel_phoneme_vector, + consonant_phoneme_vector, + start_accent_vector, + end_accent_vector, + start_accent_phrase_vector, + end_accent_phrase_vector, + style_id, + ) + .block_on() + } + + fn generate_full_intermediate( + &self, + length: usize, + phoneme_size: usize, + f0: &[f32], + phoneme_vector: &[f32], + style_id: StyleId, + ) -> crate::Result> { + self.0 + .generate_full_intermediate(length, phoneme_size, f0, phoneme_vector, style_id) + .block_on() + } + + fn render_audio_segment( + &self, + spec: ndarray::Array2, + style_id: StyleId, + ) -> crate::Result> { + self.0.render_audio_segment(spec, style_id).block_on() + } + + fn decode( + &self, + length: usize, + phoneme_size: usize, + f0: &[f32], + phoneme_vector: &[f32], + style_id: StyleId, + ) -> crate::Result> { + self.0 + .decode(length, phoneme_size, f0, phoneme_vector, style_id) + .block_on() + } + } +} + +pub(crate) mod nonblocking { use easy_ext::ext; use crate::{ - AccentPhrase, AudioQuery, FullcontextExtractor, Result, StyleId, SynthesisOptions, - VoiceModelId, VoiceModelMeta, + asyncs::BlockingThreadPool, AccentPhrase, AudioQuery, FullcontextExtractor, Result, + StyleId, SynthesisOptions, VoiceModelId, VoiceModelMeta, }; - use super::{InitializeOptions, TtsOptions}; + use super::{inner::Inner, InitializeOptions, TtsOptions}; /// 音声シンセサイザ。 /// @@ -1248,8 +1652,7 @@ pub(crate) mod nonblocking { /// /// [blocking]: https://docs.rs/crate/blocking /// [`nonblocking`モジュールのドキュメント]: crate::nonblocking - #[derive(Clone)] - pub struct Synthesizer(pub(super) Arc>); + pub struct Synthesizer(pub(super) Inner); impl self::Synthesizer { /// `Synthesizer`をコンストラクトする。 @@ -1294,7 +1697,7 @@ pub(crate) mod nonblocking { open_jtalk: O, options: &InitializeOptions, ) -> Result { - super::blocking::Synthesizer::new(&onnxruntime.0, open_jtalk, options) + Inner::new(&onnxruntime.0, open_jtalk, options) .map(Into::into) .map(Self) } @@ -1313,8 +1716,7 @@ pub(crate) mod nonblocking { &self, model: &crate::nonblocking::VoiceModelFile, ) -> Result<()> { - let model_bytes = &model.read_inference_models().await?; - self.0.status.insert_model(model.header(), model_bytes) + self.0.load_voice_model(model.inner()).await } /// 音声モデルの読み込みを解除する。 @@ -1344,12 +1746,7 @@ pub(crate) mod nonblocking { style_id: StyleId, options: &SynthesisOptions, ) -> Result> { - let blocking = self.0.clone(); - let audio_query = audio_query.clone(); - let options = options.clone(); - - crate::task::asyncify(move || blocking.synthesis(&audio_query, style_id, &options)) - .await + self.0.synthesis(audio_query, style_id, options).await } /// AquesTalk風記法からAccentPhrase (アクセント句)の配列を生成する。 @@ -1381,11 +1778,7 @@ pub(crate) mod nonblocking { kana: &str, style_id: StyleId, ) -> Result> { - let blocking = self.0.clone(); - let kana = kana.to_owned(); - - crate::task::asyncify(move || blocking.create_accent_phrases_from_kana(&kana, style_id)) - .await + self.0.create_accent_phrases_from_kana(kana, style_id).await } /// AccentPhraseの配列の音高・音素長を、特定の声で生成しなおす。 @@ -1394,11 +1787,7 @@ pub(crate) mod nonblocking { accent_phrases: &[AccentPhrase], style_id: StyleId, ) -> Result> { - let blocking = self.0.clone(); - let accent_phrases = accent_phrases.to_owned(); - - crate::task::asyncify(move || blocking.replace_mora_data(&accent_phrases, style_id)) - .await + self.0.replace_mora_data(accent_phrases, style_id).await } /// AccentPhraseの配列の音素長を、特定の声で生成しなおす。 @@ -1407,13 +1796,9 @@ pub(crate) mod nonblocking { accent_phrases: &[AccentPhrase], style_id: StyleId, ) -> Result> { - let blocking = self.0.clone(); - let accent_phrases = accent_phrases.to_owned(); - - crate::task::asyncify(move || { - blocking.replace_phoneme_length(&accent_phrases, style_id) - }) - .await + self.0 + .replace_phoneme_length(accent_phrases, style_id) + .await } /// AccentPhraseの配列の音高を、特定の声で生成しなおす。 @@ -1422,11 +1807,7 @@ pub(crate) mod nonblocking { accent_phrases: &[AccentPhrase], style_id: StyleId, ) -> Result> { - let blocking = self.0.clone(); - let accent_phrases = accent_phrases.to_owned(); - - crate::task::asyncify(move || blocking.replace_mora_pitch(&accent_phrases, style_id)) - .await + self.0.replace_mora_pitch(accent_phrases, style_id).await } /// AquesTalk風記法から[AudioQuery]を生成する。 @@ -1460,10 +1841,7 @@ pub(crate) mod nonblocking { kana: &str, style_id: StyleId, ) -> Result { - let blocking = self.0.clone(); - let kana = kana.to_owned(); - - crate::task::asyncify(move || blocking.audio_query_from_kana(&kana, style_id)).await + self.0.audio_query_from_kana(kana, style_id).await } /// AquesTalk風記法から音声合成を行う。 @@ -1473,11 +1851,7 @@ pub(crate) mod nonblocking { style_id: StyleId, options: &TtsOptions, ) -> Result> { - let blocking = self.0.clone(); - let kana = kana.to_owned(); - let options = options.clone(); - - crate::task::asyncify(move || blocking.tts_from_kana(&kana, style_id, &options)).await + self.0.tts_from_kana(kana, style_id, options).await } } @@ -1511,10 +1885,7 @@ pub(crate) mod nonblocking { text: &str, style_id: StyleId, ) -> Result> { - let blocking = self.0.clone(); - let text = text.to_owned(); - - crate::task::asyncify(move || blocking.create_accent_phrases(&text, style_id)).await + self.0.create_accent_phrases(text, style_id).await } /// 日本語のテキストから[AudioQuery]を生成する。 @@ -1544,10 +1915,7 @@ pub(crate) mod nonblocking { /// /// [AudioQuery]: crate::AudioQuery pub async fn audio_query(&self, text: &str, style_id: StyleId) -> Result { - let blocking = self.0.clone(); - let text = text.to_owned(); - - crate::task::asyncify(move || blocking.audio_query(&text, style_id)).await + self.0.audio_query(text, style_id).await } /// 日本語のテキストから音声合成を行う。 @@ -1557,26 +1925,21 @@ pub(crate) mod nonblocking { style_id: StyleId, options: &TtsOptions, ) -> Result> { - let blocking = self.0.clone(); - let text = text.to_owned(); - let options = options.clone(); - - crate::task::asyncify(move || blocking.tts(&text, style_id, &options)).await + self.0.tts(text, style_id, options).await } } #[ext(IntoBlocking)] impl self::Synthesizer { - pub fn into_blocking(self) -> Arc> { - self.0 + pub fn into_blocking(self) -> super::blocking::Synthesizer { + super::blocking::Synthesizer(self.0.into()) } } } #[cfg(test)] mod tests { - - use super::{blocking::PerformInference as _, AccelerationMode, InitializeOptions}; + use super::{AccelerationMode, InitializeOptions}; use crate::{engine::Mora, macros::tests::assert_debug_fmt_eq, AccentPhrase, Result, StyleId}; use ::test_util::OPEN_JTALK_DIC_DIR; use rstest::rstest; @@ -1686,7 +2049,8 @@ mod tests { let result = syntesizer .0 - .predict_duration(&phoneme_vector, StyleId::new(1)); + .predict_duration(&phoneme_vector, StyleId::new(1)) + .await; assert!(result.is_ok(), "{result:?}"); assert_eq!(result.unwrap().len(), phoneme_vector.len()); @@ -1719,16 +2083,19 @@ mod tests { let start_accent_phrase_vector = [0, 1, 0, 0, 0]; let end_accent_phrase_vector = [0, 0, 0, 1, 0]; - let result = syntesizer.0.predict_intonation( - vowel_phoneme_vector.len(), - &vowel_phoneme_vector, - &consonant_phoneme_vector, - &start_accent_vector, - &end_accent_vector, - &start_accent_phrase_vector, - &end_accent_phrase_vector, - StyleId::new(1), - ); + let result = syntesizer + .0 + .predict_intonation( + vowel_phoneme_vector.len(), + &vowel_phoneme_vector, + &consonant_phoneme_vector, + &start_accent_vector, + &end_accent_vector, + &start_accent_phrase_vector, + &end_accent_phrase_vector, + StyleId::new(1), + ) + .await; assert!(result.is_ok(), "{result:?}"); assert_eq!(result.unwrap().len(), vowel_phoneme_vector.len()); @@ -1777,7 +2144,8 @@ mod tests { let result = syntesizer .0 - .decode(F0_LENGTH, PHONEME_SIZE, &f0, &phoneme, StyleId::new(1)); + .decode(F0_LENGTH, PHONEME_SIZE, &f0, &phoneme, StyleId::new(1)) + .await; assert!(result.is_ok(), "{result:?}"); assert_eq!(result.unwrap().len(), F0_LENGTH * 256); diff --git a/crates/voicevox_core/src/voice_model.rs b/crates/voicevox_core/src/voice_model.rs index f31b8ecb1..c2920398b 100644 --- a/crates/voicevox_core/src/voice_model.rs +++ b/crates/voicevox_core/src/voice_model.rs @@ -62,7 +62,7 @@ impl VoiceModelId { } #[self_referencing] -struct Inner { +pub(crate) struct Inner { header: VoiceModelHeader, #[borrows(header)] @@ -182,11 +182,11 @@ impl Inner { &self.borrow_header().metas } - fn header(&self) -> &VoiceModelHeader { + pub(crate) fn header(&self) -> &VoiceModelHeader { self.borrow_header() } - async fn read_inference_models( + pub(crate) async fn read_inference_models( &self, ) -> LoadModelResult> { let path = &self.borrow_header().path; @@ -412,12 +412,9 @@ impl InferenceDomainMap { pub(crate) mod blocking { use std::path::Path; - use crate::{ - asyncs::SingleTasked, error::LoadModelResult, future::FutureExt as _, - infer::domains::InferenceDomainMap, VoiceModelMeta, - }; + use crate::{asyncs::SingleTasked, future::FutureExt as _, VoiceModelMeta}; - use super::{Inner, ModelBytesWithInnerVoiceIdsByDomain, VoiceModelHeader, VoiceModelId}; + use super::{Inner, VoiceModelId}; /// 音声モデルファイル。 /// @@ -425,17 +422,15 @@ pub(crate) mod blocking { pub struct VoiceModelFile(Inner); impl self::VoiceModelFile { - pub(crate) fn read_inference_models( - &self, - ) -> LoadModelResult> { - self.0.read_inference_models().block_on() - } - /// VVMファイルを開く。 pub fn open(path: impl AsRef) -> crate::Result { Inner::open(path).block_on().map(Self) } + pub(crate) fn inner(&self) -> &Inner { + &self.0 + } + /// ID。 pub fn id(&self) -> VoiceModelId { self.0.id() @@ -445,22 +440,15 @@ pub(crate) mod blocking { pub fn metas(&self) -> &VoiceModelMeta { self.0.metas() } - - pub(crate) fn header(&self) -> &VoiceModelHeader { - self.0.header() - } } } pub(crate) mod nonblocking { use std::path::Path; - use crate::{ - asyncs::BlockingThreadPool, error::LoadModelResult, infer::domains::InferenceDomainMap, - Result, VoiceModelMeta, - }; + use crate::{asyncs::BlockingThreadPool, Result, VoiceModelMeta}; - use super::{Inner, ModelBytesWithInnerVoiceIdsByDomain, VoiceModelHeader, VoiceModelId}; + use super::{Inner, VoiceModelId}; /// 音声モデルファイル。 /// @@ -475,12 +463,6 @@ pub(crate) mod nonblocking { pub struct VoiceModelFile(Inner); impl self::VoiceModelFile { - pub(crate) async fn read_inference_models( - &self, - ) -> LoadModelResult> { - self.0.read_inference_models().await - } - /// VVMファイルを開く。 pub async fn open(path: impl AsRef) -> Result { Inner::open(path).await.map(Self) @@ -491,6 +473,10 @@ pub(crate) mod nonblocking { self.0.into_heads().zip.into_inner().close().await; } + pub(crate) fn inner(&self) -> &Inner { + &self.0 + } + /// ID。 pub fn id(&self) -> VoiceModelId { self.0.id() @@ -500,10 +486,6 @@ pub(crate) mod nonblocking { pub fn metas(&self) -> &VoiceModelMeta { self.0.metas() } - - pub(crate) fn header(&self) -> &VoiceModelHeader { - self.0.header() - } } } diff --git a/crates/voicevox_core_c_api/tests/e2e/snapshots.toml b/crates/voicevox_core_c_api/tests/e2e/snapshots.toml index 602824543..d5168b3c5 100644 --- a/crates/voicevox_core_c_api/tests/e2e/snapshots.toml +++ b/crates/voicevox_core_c_api/tests/e2e/snapshots.toml @@ -53,11 +53,11 @@ metas = ''' stderr.windows = ''' {timestamp} INFO ort: Loaded ONNX Runtime dylib with version '{onnxruntime_version}' {windows-video-cards} -{timestamp} INFO voicevox_core::synthesizer::blocking: CPUを利用します +{timestamp} INFO voicevox_core::synthesizer::inner: CPUを利用します ''' stderr.unix = ''' {timestamp} INFO ort: Loaded ONNX Runtime dylib with version '{onnxruntime_version}' -{timestamp} INFO voicevox_core::synthesizer::blocking: CPUを利用します +{timestamp} INFO voicevox_core::synthesizer::inner: CPUを利用します ''' [compatible_engine_load_model_before_initialize] @@ -123,11 +123,11 @@ output."こんにちは、音声合成の世界へようこそ".wav_length = 176 stderr.windows = ''' {timestamp} INFO ort: Loaded ONNX Runtime dylib with version '{onnxruntime_version}' {windows-video-cards} -{timestamp} INFO voicevox_core::synthesizer::blocking: CPUを利用します +{timestamp} INFO voicevox_core::synthesizer::inner: CPUを利用します ''' stderr.unix = ''' {timestamp} INFO ort: Loaded ONNX Runtime dylib with version '{onnxruntime_version}' -{timestamp} INFO voicevox_core::synthesizer::blocking: CPUを利用します +{timestamp} INFO voicevox_core::synthesizer::inner: CPUを利用します ''' [synthesizer_new_output_json] @@ -185,11 +185,11 @@ metas = ''' stderr.windows = ''' {timestamp} INFO ort: Loaded ONNX Runtime dylib with version '{onnxruntime_version}' {windows-video-cards} -{timestamp} INFO voicevox_core::synthesizer::blocking: CPUを利用します +{timestamp} INFO voicevox_core::synthesizer::inner: CPUを利用します ''' stderr.unix = ''' {timestamp} INFO ort: Loaded ONNX Runtime dylib with version '{onnxruntime_version}' -{timestamp} INFO voicevox_core::synthesizer::blocking: CPUを利用します +{timestamp} INFO voicevox_core::synthesizer::inner: CPUを利用します ''' [tts_via_audio_query] @@ -197,22 +197,22 @@ output."こんにちは、音声合成の世界へようこそ".wav_length = 176 stderr.windows = ''' {timestamp} INFO ort: Loaded ONNX Runtime dylib with version '{onnxruntime_version}' {windows-video-cards} -{timestamp} INFO voicevox_core::synthesizer::blocking: CPUを利用します +{timestamp} INFO voicevox_core::synthesizer::inner: CPUを利用します ''' stderr.unix = ''' {timestamp} INFO ort: Loaded ONNX Runtime dylib with version '{onnxruntime_version}' -{timestamp} INFO voicevox_core::synthesizer::blocking: CPUを利用します +{timestamp} INFO voicevox_core::synthesizer::inner: CPUを利用します ''' [user_dict_load] stderr.windows = ''' {timestamp} INFO ort: Loaded ONNX Runtime dylib with version '{onnxruntime_version}' {windows-video-cards} -{timestamp} INFO voicevox_core::synthesizer::blocking: CPUを利用します +{timestamp} INFO voicevox_core::synthesizer::inner: CPUを利用します ''' stderr.unix = ''' {timestamp} INFO ort: Loaded ONNX Runtime dylib with version '{onnxruntime_version}' -{timestamp} INFO voicevox_core::synthesizer::blocking: CPUを利用します +{timestamp} INFO voicevox_core::synthesizer::inner: CPUを利用します ''' [user_dict_manipulate] diff --git a/crates/voicevox_core_python_api/src/lib.rs b/crates/voicevox_core_python_api/src/lib.rs index 58791ead1..ccf1b2b7f 100644 --- a/crates/voicevox_core_python_api/src/lib.rs +++ b/crates/voicevox_core_python_api/src/lib.rs @@ -1057,7 +1057,7 @@ mod asyncio { pub(crate) struct Synthesizer { synthesizer: Arc< Closable< - voicevox_core::nonblocking::Synthesizer, + Arc>, Self, Tokio, >, @@ -1088,7 +1088,7 @@ mod asyncio { cpu_num_threads, }, ); - let synthesizer = Python::with_gil(|py| synthesizer.into_py_result(py))?; + let synthesizer = Python::with_gil(|py| synthesizer.into_py_result(py))?.into(); let synthesizer = Closable::new(synthesizer).into(); Ok(Self { synthesizer }) } From 44edb5975d046481403571378b3ee907c696a158 Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Thu, 31 Oct 2024 14:32:58 +0900 Subject: [PATCH 2/3] Add a fixme --- crates/voicevox_core_python_api/src/lib.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crates/voicevox_core_python_api/src/lib.rs b/crates/voicevox_core_python_api/src/lib.rs index ccf1b2b7f..a2d1c2475 100644 --- a/crates/voicevox_core_python_api/src/lib.rs +++ b/crates/voicevox_core_python_api/src/lib.rs @@ -1055,6 +1055,8 @@ mod asyncio { #[pyclass] pub(crate) struct Synthesizer { + // FIXME: `Arc`ではなく、`Arc>`を + // `clone`する synthesizer: Arc< Closable< Arc>, From c941504fad578600577e284327f9e6eced30b6f3 Mon Sep 17 00:00:00 2001 From: Ryo Yamashita Date: Sat, 2 Nov 2024 18:40:16 +0900 Subject: [PATCH 3/3] =?UTF-8?q?`PerformInference`=E3=82=92easy-ext?= =?UTF-8?q?=E3=81=A7=E3=81=AE=E5=AE=9F=E8=A3=85=E3=81=AB=E3=81=99=E3=82=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/voicevox_core/src/synthesizer.rs | 89 ++++++------------------- 1 file changed, 20 insertions(+), 69 deletions(-) diff --git a/crates/voicevox_core/src/synthesizer.rs b/crates/voicevox_core/src/synthesizer.rs index aa16759a2..b30d7c3ca 100644 --- a/crates/voicevox_core/src/synthesizer.rs +++ b/crates/voicevox_core/src/synthesizer.rs @@ -833,67 +833,6 @@ mod inner { } } - // TODO: `mod blocking`に移動する - pub trait PerformInference { - /// `predict_duration`を実行する。 - /// - /// # Performance - /// - /// CPU-boundな操作であるため、非同期ランタイム上では直接実行されるべきではない。 - fn predict_duration(&self, phoneme_vector: &[i64], style_id: StyleId) -> Result>; - - /// `predict_intonation`を実行する。 - /// - /// # Performance - /// - /// CPU-boundな操作であるため、非同期ランタイム上では直接実行されるべきではない。 - #[expect( - clippy::too_many_arguments, - reason = "compatible_engineでの`predict_intonation`の形を考えると、ここの引数を構造体に\ - まとめたりしても可読性に寄与しない" - )] - fn predict_intonation( - &self, - length: usize, - vowel_phoneme_vector: &[i64], - consonant_phoneme_vector: &[i64], - start_accent_vector: &[i64], - end_accent_vector: &[i64], - start_accent_phrase_vector: &[i64], - end_accent_phrase_vector: &[i64], - style_id: StyleId, - ) -> Result>; - - fn generate_full_intermediate( - &self, - length: usize, - phoneme_size: usize, - f0: &[f32], - phoneme_vector: &[f32], - style_id: StyleId, - ) -> Result>; - - fn render_audio_segment( - &self, - spec: ndarray::Array2, - style_id: StyleId, - ) -> Result>; - - /// `decode`を実行する。 - /// - /// # Performance - /// - /// CPU/GPU-boundな操作であるため、非同期ランタイム上では直接実行されるべきではない。 - fn decode( - &self, - length: usize, - phoneme_size: usize, - f0: &[f32], - phoneme_vector: &[f32], - style_id: StyleId, - ) -> Result>; - } - impl Inner { pub(super) async fn predict_duration( &self, @@ -991,8 +930,8 @@ mod inner { } } - // CPU/GPU-bound impl Status { + /// CPU-boundな操作なので、非同期ランタイム上では直接実行されるべきではない。 fn predict_duration( &self, phoneme_vector: ndarray::Array1, @@ -1022,6 +961,7 @@ mod inner { const PHONEME_LENGTH_MINIMAL: f32 = 0.01; } + /// CPU-boundな操作なので、非同期ランタイム上では直接実行されるべきではない。 #[expect( clippy::too_many_arguments, reason = "compatible_engineでの`predict_intonation`の形を考えると、ここの引数を構造体に\ @@ -1057,6 +997,7 @@ mod inner { Ok(output.into_raw_vec()) } + /// CPU-boundな操作なので、非同期ランタイム上では直接実行されるべきではない。 fn generate_full_intermediate( &self, length: usize, @@ -1078,6 +1019,7 @@ mod inner { Ok(spec) } + /// CPU/GPU-boundな操作なので、非同期ランタイム上では直接実行されるべきではない。 fn render_audio_segment( &self, spec: ndarray::Array2, @@ -1089,6 +1031,7 @@ mod inner { Ok(wave) } + /// CPU/GPU-boundな操作なので、非同期ランタイム上では直接実行されるべきではない。 fn decode( &self, length: usize, @@ -1246,7 +1189,14 @@ mod inner { } } +#[expect( + clippy::too_many_arguments, + reason = "`PerformInference::predict_intonation`用。compatible_engineでの`predict_intonation`の\ + 形を考えると、ここの引数を構造体にまとめたりしても可読性に寄与しない" +)] pub(crate) mod blocking { + use easy_ext::ext; + use crate::{ asyncs::SingleTasked, future::FutureExt as _, AccentPhrase, AudioQuery, FullcontextExtractor, StyleId, VoiceModelId, VoiceModelMeta, @@ -1254,7 +1204,7 @@ pub(crate) mod blocking { use super::{inner::Inner, InitializeOptions, SynthesisOptions, TtsOptions}; - pub use super::inner::{AudioFeature, PerformInference}; + pub use super::inner::AudioFeature; /// 音声シンセサイザ。 pub struct Synthesizer(pub(super) Inner); @@ -1564,8 +1514,9 @@ pub(crate) mod blocking { } } - impl PerformInference for self::Synthesizer { - fn predict_duration( + #[ext(PerformInference)] + impl self::Synthesizer<()> { + pub fn predict_duration( &self, phoneme_vector: &[i64], style_id: StyleId, @@ -1573,7 +1524,7 @@ pub(crate) mod blocking { self.0.predict_duration(phoneme_vector, style_id).block_on() } - fn predict_intonation( + pub fn predict_intonation( &self, length: usize, vowel_phoneme_vector: &[i64], @@ -1598,7 +1549,7 @@ pub(crate) mod blocking { .block_on() } - fn generate_full_intermediate( + pub fn generate_full_intermediate( &self, length: usize, phoneme_size: usize, @@ -1611,7 +1562,7 @@ pub(crate) mod blocking { .block_on() } - fn render_audio_segment( + pub fn render_audio_segment( &self, spec: ndarray::Array2, style_id: StyleId, @@ -1619,7 +1570,7 @@ pub(crate) mod blocking { self.0.render_audio_segment(spec, style_id).block_on() } - fn decode( + pub fn decode( &self, length: usize, phoneme_size: usize,