diff --git a/Cargo.lock b/Cargo.lock index 595e0e30f..a4460dae4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -730,6 +730,12 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc" +[[package]] +name = "convert_case" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e" + [[package]] name = "cookie" version = "0.14.4" @@ -956,6 +962,19 @@ dependencies = [ "syn 1.0.102", ] +[[package]] +name = "derive_more" +version = "0.99.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fb810d30a7c1953f91334de7244731fc3f3c10d7fe163338a35b9f640960321" +dependencies = [ + "convert_case", + "proc-macro2", + "quote", + "rustc_version 0.4.0", + "syn 1.0.102", +] + [[package]] name = "diff" version = "0.1.13" @@ -3942,6 +3961,7 @@ dependencies = [ "cfg-if", "derive-getters", "derive-new", + "derive_more", "easy-ext", "flate2", "fs-err", diff --git a/crates/voicevox_core/Cargo.toml b/crates/voicevox_core/Cargo.toml index 05949fa90..c542a9860 100644 --- a/crates/voicevox_core/Cargo.toml +++ b/crates/voicevox_core/Cargo.toml @@ -14,6 +14,7 @@ async_zip.workspace = true cfg-if = "1.0.0" derive-getters.workspace = true derive-new = "0.5.9" +derive_more = "0.99.17" easy-ext.workspace = true fs-err.workspace = true futures = "0.3.26" diff --git a/crates/voicevox_core/src/engine/synthesis_engine.rs b/crates/voicevox_core/src/engine/synthesis_engine.rs index 0adce4fe2..96d2353e4 100644 --- a/crates/voicevox_core/src/engine/synthesis_engine.rs +++ b/crates/voicevox_core/src/engine/synthesis_engine.rs @@ -30,10 +30,6 @@ impl SynthesisEngine { &self.inference_core } - pub fn inference_core_mut(&mut self) -> &mut InferenceCore { - &mut self.inference_core - } - pub async fn create_accent_phrases( &self, text: &str, diff --git a/crates/voicevox_core/src/error.rs b/crates/voicevox_core/src/error.rs index 7ad258a34..12883f2ab 100644 --- a/crates/voicevox_core/src/error.rs +++ b/crates/voicevox_core/src/error.rs @@ -21,14 +21,8 @@ pub enum Error { #[error("{}", base_error_message(VOICEVOX_RESULT_GPU_SUPPORT_ERROR))] GpuSupport, - #[error("{} ({}): {source}", base_error_message(VOICEVOX_RESULT_LOAD_MODEL_ERROR), path.display())] - LoadModel { - path: PathBuf, - #[source] - source: anyhow::Error, - }, - #[error("{} ({})", base_error_message(VOICEVOX_RESULT_ALREADY_LOADED_MODEL_ERROR), path.display())] - AlreadyLoadedModel { path: PathBuf }, + #[error(transparent)] + LoadModel(#[from] LoadModelError), #[error( "{} ({model_id:?})", @@ -36,29 +30,6 @@ pub enum Error { )] UnloadedModel { model_id: VoiceModelId }, - #[error( - "{}({path}):{source}", - base_error_message(VOICEVOX_RESULT_OPEN_FILE_ERROR) - )] - OpenFile { - path: PathBuf, - #[source] - source: anyhow::Error, - }, - - #[error( - "{}({path}):{source}", - base_error_message(VOICEVOX_RESULT_VVM_MODEL_READ_ERROR) - )] - VvmRead { - path: PathBuf, - #[source] - source: anyhow::Error, - }, - - #[error("{},{0}", base_error_message(VOICEVOX_RESULT_LOAD_METAS_ERROR))] - LoadMetas(#[source] anyhow::Error), - #[error( "{},{0}", base_error_message(VOICEVOX_RESULT_GET_SUPPORTED_DEVICES_ERROR) @@ -111,6 +82,48 @@ pub enum Error { InvalidWord(InvalidWordError), } +pub(crate) type LoadModelResult = std::result::Result; + +/// 音声モデル読み込みのエラー。 +#[derive(Error, Debug)] +#[error( + "`{path}`の読み込みに失敗しました: {context}{}", + source.as_ref().map(|e| format!(": {e}")).unwrap_or_default()) +] +pub struct LoadModelError { + pub(crate) path: PathBuf, + pub(crate) context: LoadModelErrorKind, + #[source] + pub(crate) source: Option, +} + +impl LoadModelError { + pub fn context(&self) -> &LoadModelErrorKind { + &self.context + } +} + +#[derive(derive_more::Display, Debug)] +pub enum LoadModelErrorKind { + //#[display(fmt = "{}", "base_error_message(VOICEVOX_RESULT_OPEN_ZIP_FILE_ERROR)")] + #[display(fmt = "ZIPファイルとして開くことができませんでした")] + OpenZipFile, + //#[display(fmt = "{}", "base_error_message(VOICEVOX_RESULT_READ_ZIP_ENTRY_ERROR)")] + #[display(fmt = "`{filename}`を読み取れませんでした")] + ReadZipEntry { filename: String }, + //#[display(fmt = "{}", "base_error_message(VOICEVOX_RESULT_MODEL_ALREADY_LOADED_ERROR)")] + #[display(fmt = "モデル`{id}`は既に読み込まれています")] + ModelAlreadyLoaded { id: VoiceModelId }, + //#[display(fmt = "{}", "base_error_message(VOICEVOX_RESULT_STYLE_ALREADY_LOADED_ERROR)")] + #[display(fmt = "スタイル`{id}`は既に読み込まれています")] + StyleAlreadyLoaded { id: StyleId }, + #[display( + fmt = "{}", + "base_error_message(VOICEVOX_RESULT_INVALID_MODEL_DATA_ERROR)" + )] + InvalidModelData, +} + fn base_error_message(result_code: VoicevoxResultCode) -> &'static str { let c_message: &'static str = crate::result_code::error_result_to_message(result_code); &c_message[..(c_message.len() - 1)] diff --git a/crates/voicevox_core/src/inference_core.rs b/crates/voicevox_core/src/inference_core.rs index 66ec3bb18..d88943fc8 100644 --- a/crates/voicevox_core/src/inference_core.rs +++ b/crates/voicevox_core/src/inference_core.rs @@ -1,9 +1,6 @@ use self::status::*; use super::*; -use onnxruntime::{ - ndarray, - session::{AnyArray, NdArray}, -}; +use onnxruntime::{ndarray, session::NdArray}; const PHONEME_LENGTH_MINIMAL: f32 = 0.01; @@ -18,7 +15,7 @@ impl InferenceCore { load_all_models: bool, ) -> Result { if !use_gpu || Self::can_support_gpu_feature()? { - let mut status = Status::new(use_gpu, cpu_num_threads); + let status = Status::new(use_gpu, cpu_num_threads); if load_all_models { for model in &VoiceModel::get_all_models().await? { @@ -43,14 +40,14 @@ impl InferenceCore { } } - pub async fn load_model(&mut self, model: &VoiceModel) -> Result<()> { + pub async fn load_model(&self, model: &VoiceModel) -> Result<()> { self.status.load_model(model).await } - pub fn unload_model(&mut self, voice_model_id: &VoiceModelId) -> Result<()> { + pub fn unload_model(&self, voice_model_id: &VoiceModelId) -> Result<()> { self.status.unload_model(voice_model_id) } - pub fn metas(&self) -> &VoiceModelMeta { + pub fn metas(&self) -> VoiceModelMeta { self.status.metas() } @@ -71,21 +68,15 @@ impl InferenceCore { return Err(Error::InvalidStyleId { style_id }); } - let (model_id, model_inner_id) = self - .status - .id_relations - .get(&style_id) - .ok_or(Error::InvalidStyleId { style_id })?; - - let mut phoneme_vector_array = NdArray::new(ndarray::arr1(phoneme_vector)); - let mut speaker_id_array = NdArray::new(ndarray::arr1(&[model_inner_id.raw_id() as i64])); + let (model_id, model_inner_id) = self.status.ids_for(style_id)?; - let input_tensors: Vec<&mut dyn AnyArray> = - vec![&mut phoneme_vector_array, &mut speaker_id_array]; + let phoneme_vector_array = NdArray::new(ndarray::arr1(phoneme_vector)); + let speaker_id_array = NdArray::new(ndarray::arr1(&[model_inner_id.raw_id().into()])); let mut output = self .status - .predict_duration_session_run(model_id, input_tensors)?; + .predict_duration_session_run(&model_id, phoneme_vector_array, speaker_id_array) + .await?; for output_item in output.iter_mut() { if *output_item < PHONEME_LENGTH_MINIMAL { @@ -112,37 +103,31 @@ impl InferenceCore { return Err(Error::InvalidStyleId { style_id }); } - let (model_id, model_inner_id) = self - .status - .id_relations - .get(&style_id) - .ok_or(Error::InvalidStyleId { style_id })?; - - let mut length_array = NdArray::new(ndarray::arr0(length as i64)); - let mut vowel_phoneme_vector_array = NdArray::new(ndarray::arr1(vowel_phoneme_vector)); - let mut consonant_phoneme_vector_array = - NdArray::new(ndarray::arr1(consonant_phoneme_vector)); - let mut start_accent_vector_array = NdArray::new(ndarray::arr1(start_accent_vector)); - let mut end_accent_vector_array = NdArray::new(ndarray::arr1(end_accent_vector)); - let mut start_accent_phrase_vector_array = + let (model_id, model_inner_id) = self.status.ids_for(style_id)?; + + let length_array = NdArray::new(ndarray::arr0(length as i64)); + let vowel_phoneme_vector_array = NdArray::new(ndarray::arr1(vowel_phoneme_vector)); + let consonant_phoneme_vector_array = NdArray::new(ndarray::arr1(consonant_phoneme_vector)); + let start_accent_vector_array = NdArray::new(ndarray::arr1(start_accent_vector)); + let end_accent_vector_array = NdArray::new(ndarray::arr1(end_accent_vector)); + let start_accent_phrase_vector_array = NdArray::new(ndarray::arr1(start_accent_phrase_vector)); - let mut end_accent_phrase_vector_array = - NdArray::new(ndarray::arr1(end_accent_phrase_vector)); - let mut speaker_id_array = NdArray::new(ndarray::arr1(&[model_inner_id.raw_id() as i64])); - - let input_tensors: Vec<&mut dyn AnyArray> = vec![ - &mut length_array, - &mut vowel_phoneme_vector_array, - &mut consonant_phoneme_vector_array, - &mut start_accent_vector_array, - &mut end_accent_vector_array, - &mut start_accent_phrase_vector_array, - &mut end_accent_phrase_vector_array, - &mut speaker_id_array, - ]; + let end_accent_phrase_vector_array = NdArray::new(ndarray::arr1(end_accent_phrase_vector)); + let speaker_id_array = NdArray::new(ndarray::arr1(&[model_inner_id.raw_id().into()])); self.status - .predict_intonation_session_run(model_id, input_tensors) + .predict_intonation_session_run( + &model_id, + length_array, + vowel_phoneme_vector_array, + consonant_phoneme_vector_array, + start_accent_vector_array, + end_accent_vector_array, + start_accent_phrase_vector_array, + end_accent_phrase_vector_array, + speaker_id_array, + ) + .await } pub async fn decode( @@ -157,11 +142,7 @@ impl InferenceCore { return Err(Error::InvalidStyleId { style_id }); } - let (model_id, model_inner_id) = self - .status - .id_relations - .get(&style_id) - .ok_or(Error::InvalidStyleId { style_id })?; + let (model_id, model_inner_id) = self.status.ids_for(style_id)?; // 音が途切れてしまうのを避けるworkaround処理が入っている // TODO: 改善したらここのpadding処理を取り除く @@ -179,23 +160,21 @@ impl InferenceCore { padding_size, ); - let mut f0_array = NdArray::new( + let f0_array = NdArray::new( ndarray::arr1(&f0_with_padding) .into_shape([length_with_padding, 1]) .unwrap(), ); - let mut phoneme_array = NdArray::new( + let phoneme_array = NdArray::new( ndarray::arr1(&phoneme_with_padding) .into_shape([length_with_padding, phoneme_size]) .unwrap(), ); - let mut speaker_id_array = NdArray::new(ndarray::arr1(&[model_inner_id.raw_id() as i64])); - - let input_tensors: Vec<&mut dyn AnyArray> = - vec![&mut f0_array, &mut phoneme_array, &mut speaker_id_array]; + let speaker_id_array = NdArray::new(ndarray::arr1(&[model_inner_id.raw_id().into()])); self.status - .decode_session_run(model_id, input_tensors) + .decode_session_run(&model_id, f0_array, phoneme_array, speaker_id_array) + .await .map(|output| Self::trim_padding_from_output(output, padding_size)) } diff --git a/crates/voicevox_core/src/result_code.rs b/crates/voicevox_core/src/result_code.rs index 07cf05e18..541c65a79 100644 --- a/crates/voicevox_core/src/result_code.rs +++ b/crates/voicevox_core/src/result_code.rs @@ -11,14 +11,10 @@ pub enum VoicevoxResultCode { VOICEVOX_RESULT_OK = 0, /// open_jtalk辞書ファイルが読み込まれていない VOICEVOX_RESULT_NOT_LOADED_OPENJTALK_DICT_ERROR = 1, - /// modelの読み込みに失敗した - VOICEVOX_RESULT_LOAD_MODEL_ERROR = 2, /// サポートされているデバイス情報取得に失敗した VOICEVOX_RESULT_GET_SUPPORTED_DEVICES_ERROR = 3, /// GPUモードがサポートされていない VOICEVOX_RESULT_GPU_SUPPORT_ERROR = 4, - /// メタ情報読み込みに失敗した - VOICEVOX_RESULT_LOAD_METAS_ERROR = 5, /// 無効なstyle_idが指定された VOICEVOX_RESULT_INVALID_STYLE_ID_ERROR = 6, /// 無効なmodel_idが指定された @@ -35,12 +31,16 @@ pub enum VoicevoxResultCode { VOICEVOX_RESULT_INVALID_AUDIO_QUERY_ERROR = 14, /// 無効なAccentPhrase VOICEVOX_RESULT_INVALID_ACCENT_PHRASE_ERROR = 15, - /// ファイルオープンエラー - VOICEVOX_RESULT_OPEN_FILE_ERROR = 16, - /// Modelを読み込めなかった - VOICEVOX_RESULT_VVM_MODEL_READ_ERROR = 17, - /// すでに読み込まれているModelを読み込もうとした - VOICEVOX_RESULT_ALREADY_LOADED_MODEL_ERROR = 18, + /// ZIPファイルを開くことに失敗した + VOICEVOX_RESULT_OPEN_ZIP_FILE_ERROR = 16, + /// ZIP内のファイルが読めなかった + VOICEVOX_RESULT_READ_ZIP_ENTRY_ERROR = 17, + /// すでに読み込まれている音声モデルを読み込もうとした + VOICEVOX_RESULT_MODEL_ALREADY_LOADED_ERROR = 18, + /// すでに読み込まれているスタイルを読み込もうとした + VOICEVOX_RESULT_STYLE_ALREADY_LOADED_ERROR = 26, + /// 無効なモデルデータ + VOICEVOX_RESULT_INVALID_MODEL_DATA_ERROR = 27, /// Modelが読み込まれていない VOICEVOX_RESULT_UNLOADED_MODEL_ERROR = 19, /// ユーザー辞書を読み込めなかった @@ -64,8 +64,6 @@ pub const fn error_result_to_message(result_code: VoicevoxResultCode) -> &'stati VOICEVOX_RESULT_NOT_LOADED_OPENJTALK_DICT_ERROR => { "OpenJTalkの辞書が読み込まれていません\0" } - VOICEVOX_RESULT_LOAD_MODEL_ERROR => "modelデータ読み込みに失敗しました\0", - VOICEVOX_RESULT_LOAD_METAS_ERROR => "メタデータ読み込みに失敗しました\0", VOICEVOX_RESULT_GPU_SUPPORT_ERROR => "GPU機能をサポートすることができません\0", VOICEVOX_RESULT_GET_SUPPORTED_DEVICES_ERROR => { @@ -85,11 +83,11 @@ pub const fn error_result_to_message(result_code: VoicevoxResultCode) -> &'stati } VOICEVOX_RESULT_INVALID_AUDIO_QUERY_ERROR => "無効なaudio_queryです\0", VOICEVOX_RESULT_INVALID_ACCENT_PHRASE_ERROR => "無効なaccent_phraseです\0", - VOICEVOX_RESULT_OPEN_FILE_ERROR => "ファイルオープンに失敗しました\0", - VOICEVOX_RESULT_VVM_MODEL_READ_ERROR => "Modelを読み込めませんでした\0", - VOICEVOX_RESULT_ALREADY_LOADED_MODEL_ERROR => { - "すでに読み込まれているModelを読み込もうとしました\0" - } + VOICEVOX_RESULT_OPEN_ZIP_FILE_ERROR => "ZIPファイルのオープンに失敗しました\0", + VOICEVOX_RESULT_READ_ZIP_ENTRY_ERROR => "ZIP内のファイルを読むことができませんでした\0", + VOICEVOX_RESULT_MODEL_ALREADY_LOADED_ERROR => "同じIDのモデルを読むことはできません\0", + VOICEVOX_RESULT_STYLE_ALREADY_LOADED_ERROR => "同じIDのスタイルを読むことはできません\0", + VOICEVOX_RESULT_INVALID_MODEL_DATA_ERROR => "モデルデータを読むことができませんでした\0", VOICEVOX_RESULT_UNLOADED_MODEL_ERROR => "Modelが読み込まれていません\0", VOICEVOX_RESULT_LOAD_USER_DICT_ERROR => "ユーザー辞書を読み込めませんでした\0", VOICEVOX_RESULT_SAVE_USER_DICT_ERROR => "ユーザー辞書を書き込めませんでした\0", diff --git a/crates/voicevox_core/src/status.rs b/crates/voicevox_core/src/status.rs index 12f7e1bd6..a92ffae8e 100644 --- a/crates/voicevox_core/src/status.rs +++ b/crates/voicevox_core/src/status.rs @@ -1,11 +1,13 @@ use super::*; +use itertools::iproduct; use once_cell::sync::Lazy; use onnxruntime::{ environment::Environment, - session::{AnyArray, Session}, + ndarray::{Ix0, Ix1, Ix2}, + session::{NdArray, Session}, GraphOptimizationLevel, LoggingLevel, }; -use std::sync::Mutex; +use std::sync::Arc; use std::{env, path::Path}; use tracing::error; @@ -19,18 +21,9 @@ cfg_if! { use std::collections::BTreeMap; pub struct Status { - models: StatusModels, - merged_metas: VoiceModelMeta, + loaded_models: std::sync::Mutex, light_session_options: SessionOptions, // 軽いモデルはこちらを使う heavy_session_options: SessionOptions, // 重いモデルはこちらを使う - pub id_relations: BTreeMap, // FIXME: pubはやめたい -} - -struct StatusModels { - metas: BTreeMap, - predict_duration: BTreeMap>>, - predict_intonation: BTreeMap>>, - decode: BTreeMap>>, } #[derive(new, Getters)] @@ -58,38 +51,21 @@ static ENVIRONMENT: Lazy = Lazy::new(|| { .unwrap() }); -#[allow(unsafe_code)] -unsafe impl Send for Status {} - -#[allow(unsafe_code)] -unsafe impl Sync for Status {} - impl Status { pub fn new(use_gpu: bool, cpu_num_threads: u16) -> Self { Self { - models: StatusModels { - metas: BTreeMap::new(), - predict_duration: BTreeMap::new(), - predict_intonation: BTreeMap::new(), - decode: BTreeMap::new(), - }, - merged_metas: VoiceModelMeta::default(), + loaded_models: Default::default(), light_session_options: SessionOptions::new(cpu_num_threads, false), heavy_session_options: SessionOptions::new(cpu_num_threads, use_gpu), - id_relations: BTreeMap::default(), } } - pub async fn load_model(&mut self, model: &VoiceModel) -> Result<()> { - for speaker in model.metas().iter() { - for style in speaker.styles().iter() { - if self.id_relations.contains_key(style.id()) { - Err(Error::AlreadyLoadedModel { - path: model.path().clone(), - })?; - } - } - } + pub async fn load_model(&self, model: &VoiceModel) -> Result<()> { + self.loaded_models + .lock() + .unwrap() + .ensure_acceptable(model)?; + let models = model.read_inference_models().await?; let predict_duration_session = self.new_session( @@ -107,79 +83,37 @@ impl Status { &self.heavy_session_options, model.path(), )?; - self.models - .metas - .insert(model.id().clone(), model.metas().clone()); - - for speaker in model.metas().iter() { - for style in speaker.styles().iter() { - self.id_relations.insert( - *style.id(), - (model.id().clone(), model.model_inner_id_for(*style.id())), - ); - } - } - self.set_metas(); - - self.models - .predict_duration - .insert(model.id().clone(), Mutex::new(predict_duration_session)); - self.models - .predict_intonation - .insert(model.id().clone(), Mutex::new(predict_intonation_session)); - - self.models - .decode - .insert(model.id().clone(), Mutex::new(decode_model)); + self.loaded_models.lock().unwrap().insert( + model, + predict_duration_session, + predict_intonation_session, + decode_model, + )?; Ok(()) } - pub fn unload_model(&mut self, voice_model_id: &VoiceModelId) -> Result<()> { - if self.is_loaded_model(voice_model_id) { - self.models.predict_intonation.remove(voice_model_id); - self.models.predict_duration.remove(voice_model_id); - self.models.decode.remove(voice_model_id); - - let remove_style_ids = self - .id_relations - .iter() - .filter(|&(_, (loaded_model_id, _))| loaded_model_id == voice_model_id) - .map(|(&style_id, _)| style_id) - .collect::>(); - - for style_id in remove_style_ids.iter() { - self.id_relations.remove(style_id); - } - self.set_metas(); - Ok(()) - } else { - Err(Error::UnloadedModel { - model_id: voice_model_id.clone(), - }) - } + pub fn unload_model(&self, voice_model_id: &VoiceModelId) -> Result<()> { + self.loaded_models.lock().unwrap().remove(voice_model_id) } - fn set_metas(&mut self) { - let mut meta = VoiceModelMeta::default(); - for m in self.models.metas.values() { - meta.extend_from_slice(m); - } - self.merged_metas = meta; + pub fn metas(&self) -> VoiceModelMeta { + self.loaded_models.lock().unwrap().metas() } - pub fn metas(&self) -> &VoiceModelMeta { - &self.merged_metas + pub(crate) fn ids_for(&self, style_id: StyleId) -> Result<(VoiceModelId, ModelInnerId)> { + self.loaded_models.lock().unwrap().ids_for(style_id) } pub fn is_loaded_model(&self, voice_model_id: &VoiceModelId) -> bool { - self.models.predict_duration.contains_key(voice_model_id) - && self.models.predict_intonation.contains_key(voice_model_id) - && self.models.decode.contains_key(voice_model_id) + self.loaded_models + .lock() + .unwrap() + .contains_voice_model(voice_model_id) } pub fn is_loaded_model_by_style_id(&self, style_id: StyleId) -> bool { - self.id_relations.contains_key(&style_id) + self.loaded_models.lock().unwrap().contains_style(style_id) } fn new_session( @@ -187,11 +121,12 @@ impl Status { model: &[u8], session_options: &SessionOptions, path: impl AsRef, - ) -> Result> { + ) -> LoadModelResult> { self.new_session_from_bytes(|| model_file::decrypt(model), session_options) - .map_err(|source| Error::LoadModel { - path: path.as_ref().into(), - source, + .map_err(|source| LoadModelError { + path: path.as_ref().to_owned(), + context: LoadModelErrorKind::InvalidModelData, + source: Some(source), }) } @@ -226,62 +161,291 @@ impl Status { } pub fn validate_speaker_id(&self, style_id: StyleId) -> bool { - self.id_relations.contains_key(&style_id) + self.is_loaded_model_by_style_id(style_id) } - pub fn predict_duration_session_run( + /// # Panics + /// + /// `self`が`model_id`を含んでいないとき、パニックする。 + pub async fn predict_duration_session_run( &self, model_id: &VoiceModelId, - inputs: Vec<&mut dyn AnyArray>, + mut phoneme_vector_array: NdArray, + mut speaker_id_array: NdArray, ) -> Result> { - if let Some(model) = self.models.predict_duration.get(model_id) { - if let Ok(output_tensors) = model.lock().unwrap().run(inputs) { - Ok(output_tensors[0].as_slice().unwrap().to_owned()) - } else { - Err(Error::InferenceFailed) - } - } else { - Err(Error::InvalidModelId { - model_id: model_id.clone(), - }) - } + let predict_duration = self.loaded_models.lock().unwrap().get( + model_id, + |SessionSet { + predict_duration, .. + }| predict_duration, + ); + + tokio::task::spawn_blocking(move || { + let mut predict_duration = predict_duration.lock().unwrap(); + + let output_tensors = predict_duration + .run(vec![&mut phoneme_vector_array, &mut speaker_id_array]) + .map_err(|_| Error::InferenceFailed)?; + Ok(output_tensors[0].as_slice().unwrap().to_owned()) + }) + .await + .unwrap() } - pub fn predict_intonation_session_run( + /// # Panics + /// + /// `self`が`model_id`を含んでいないとき、パニックする。 + #[allow(clippy::too_many_arguments)] + pub async fn predict_intonation_session_run( &self, model_id: &VoiceModelId, - inputs: Vec<&mut dyn AnyArray>, + mut length_array: NdArray, + mut vowel_phoneme_vector_array: NdArray, + mut consonant_phoneme_vector_array: NdArray, + mut start_accent_vector_array: NdArray, + mut end_accent_vector_array: NdArray, + mut start_accent_phrase_vector_array: NdArray, + mut end_accent_phrase_vector_array: NdArray, + mut speaker_id_array: NdArray, ) -> Result> { - if let Some(model) = self.models.predict_intonation.get(model_id) { - if let Ok(output_tensors) = model.lock().unwrap().run(inputs) { - Ok(output_tensors[0].as_slice().unwrap().to_owned()) - } else { - Err(Error::InferenceFailed) - } - } else { - Err(Error::InvalidModelId { - model_id: model_id.clone(), - }) - } + let predict_intonation = self.loaded_models.lock().unwrap().get( + model_id, + |SessionSet { + predict_intonation, .. + }| predict_intonation, + ); + + tokio::task::spawn_blocking(move || { + let mut predict_intonation = predict_intonation.lock().unwrap(); + + let output_tensors = predict_intonation + .run(vec![ + &mut length_array, + &mut vowel_phoneme_vector_array, + &mut consonant_phoneme_vector_array, + &mut start_accent_vector_array, + &mut end_accent_vector_array, + &mut start_accent_phrase_vector_array, + &mut end_accent_phrase_vector_array, + &mut speaker_id_array, + ]) + .map_err(|_| Error::InferenceFailed)?; + Ok(output_tensors[0].as_slice().unwrap().to_owned()) + }) + .await + .unwrap() } - pub fn decode_session_run( + /// # Panics + /// + /// `self`が`model_id`を含んでいないとき、パニックする。 + pub async fn decode_session_run( &self, model_id: &VoiceModelId, - inputs: Vec<&mut dyn AnyArray>, + mut f0_array: NdArray, + mut phoneme_array: NdArray, + mut speaker_id_array: NdArray, ) -> Result> { - if let Some(model) = self.models.decode.get(model_id) { - if let Ok(output_tensors) = model.lock().unwrap().run(inputs) { - Ok(output_tensors[0].as_slice().unwrap().to_owned()) - } else { - Err(Error::InferenceFailed) - } - } else { - Err(Error::InvalidModelId { - model_id: model_id.clone(), + let decode = self + .loaded_models + .lock() + .unwrap() + .get(model_id, |SessionSet { decode, .. }| decode); + + tokio::task::spawn_blocking(move || { + let mut decode = decode.lock().unwrap(); + + let output_tensors = decode + .run(vec![ + &mut f0_array, + &mut phoneme_array, + &mut speaker_id_array, + ]) + .map_err(|_| Error::InferenceFailed)?; + Ok(output_tensors[0].as_slice().unwrap().to_owned()) + }) + .await + .unwrap() + } +} + +/// 読み込んだモデルの`Session`とそのメタ情報を保有し、追加/削除/取得の操作を提供する。 +/// +/// この構造体のメソッドは、すべて一瞬で完了すべきである。 +#[derive(Default)] +struct LoadedModels(BTreeMap); + +struct LoadedModel { + model_inner_ids: BTreeMap, + metas: VoiceModelMeta, + session_set: SessionSet, +} + +impl LoadedModels { + fn metas(&self) -> VoiceModelMeta { + self.0 + .values() + .flat_map(|LoadedModel { metas, .. }| metas) + .cloned() + .collect() + } + + fn ids_for(&self, style_id: StyleId) -> Result<(VoiceModelId, ModelInnerId)> { + let ( + model_id, + LoadedModel { + model_inner_ids, .. + }, + ) = self + .0 + .iter() + .find(|(_, LoadedModel { metas, .. })| { + metas + .iter() + .flat_map(SpeakerMeta::styles) + .any(|style| *style.id() == style_id) }) + .ok_or(Error::InvalidStyleId { style_id })?; + + let model_inner_id = *model_inner_ids + .get(&style_id) + .expect("`model_inner_ids` should contains all of the style IDs in the model"); + + Ok((model_id.clone(), model_inner_id)) + } + + /// # Panics + /// + /// `self`が`model_id`を含んでいないとき、パニックする。 + fn get( + &self, + model_id: &VoiceModelId, + which: fn(&SessionSet) -> &Arc>>>, + ) -> Arc>>> { + which(&self.0[model_id].session_set).clone() + } + + fn contains_voice_model(&self, model_id: &VoiceModelId) -> bool { + self.0.contains_key(model_id) + } + + fn contains_style(&self, style_id: StyleId) -> bool { + self.styles().any(|style| *style.id() == style_id) + } + + /// 与えられた`VoiceModel`を受け入れ可能かをチェックする。 + /// + /// # Errors + /// + /// 音声モデルIDかスタイルIDが`model`と重複するとき、エラーを返す。 + fn ensure_acceptable(&self, model: &VoiceModel) -> LoadModelResult<()> { + let loaded = self.styles(); + let external = model.metas().iter().flat_map(|speaker| speaker.styles()); + + let error = |context| LoadModelError { + path: model.path().clone(), + context, + source: None, + }; + + if self.0.contains_key(model.id()) { + return Err(error(LoadModelErrorKind::ModelAlreadyLoaded { + id: model.id().clone(), + })); + } + if let Some((style, _)) = + iproduct!(loaded, external).find(|(loaded, external)| loaded.id() == external.id()) + { + return Err(error(LoadModelErrorKind::StyleAlreadyLoaded { + id: *style.id(), + })); + } + Ok(()) + } + + fn insert( + &mut self, + model: &VoiceModel, + predict_duration: Session<'static>, + predict_intonation: Session<'static>, + decode: Session<'static>, + ) -> Result<()> { + self.ensure_acceptable(model)?; + + let prev = self.0.insert( + model.id().clone(), + LoadedModel { + model_inner_ids: model.model_inner_ids(), + metas: model.metas().clone(), + session_set: SessionSet { + predict_duration: Arc::new(std::sync::Mutex::new(predict_duration.into())), + predict_intonation: Arc::new(std::sync::Mutex::new(predict_intonation.into())), + decode: Arc::new(std::sync::Mutex::new(decode.into())), + }, + }, + ); + assert!(prev.is_none()); + Ok(()) + } + + fn remove(&mut self, model_id: &VoiceModelId) -> Result<()> { + if self.0.remove(model_id).is_none() { + return Err(Error::UnloadedModel { + model_id: model_id.clone(), + }); + } + Ok(()) + } + + fn styles(&self) -> impl Iterator { + self.0 + .values() + .flat_map(|LoadedModel { metas, .. }| metas) + .flat_map(|speaker| speaker.styles()) + } +} + +struct SessionSet { + predict_duration: Arc>>>, + predict_intonation: Arc>>>, + decode: Arc>>>, +} + +// FIXME: 以下のことをちゃんと確認した後、onnxruntime-rs側で`Session`が`Send`であると宣言する。 +// https://github.com/VOICEVOX/voicevox_core/issues/307#issuecomment-1276184614 + +use self::assert_send::AssertSend; + +mod assert_send { + use std::ops::{Deref, DerefMut}; + + use onnxruntime::session::Session; + + pub(super) struct AssertSend(T); + + impl From> for AssertSend> { + fn from(session: Session<'static>) -> Self { + Self(session) + } + } + + impl Deref for AssertSend { + type Target = T; + + fn deref(&self) -> &Self::Target { + &self.0 + } + } + + impl DerefMut for AssertSend { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 } } + + // SAFETY: `Session` is probably "send"able. + #[allow(unsafe_code)] + unsafe impl Send for AssertSend {} } #[cfg(test)] @@ -311,27 +475,22 @@ mod tests { cpu_num_threads, status.heavy_session_options.cpu_num_threads ); - assert!(status.models.predict_duration.is_empty()); - assert!(status.models.predict_intonation.is_empty()); - assert!(status.models.decode.is_empty()); - assert!(status.id_relations.is_empty()); + assert!(status.loaded_models.lock().unwrap().0.is_empty()); } #[rstest] #[tokio::test] async fn status_load_model_works() { - let mut status = Status::new(false, 0); + let status = Status::new(false, 0); let result = status.load_model(&open_default_vvm_file().await).await; assert_debug_fmt_eq!(Ok(()), result); - assert_eq!(1, status.models.predict_duration.len()); - assert_eq!(1, status.models.predict_intonation.len()); - assert_eq!(1, status.models.decode.len()); + assert_eq!(1, status.loaded_models.lock().unwrap().0.len()); } #[rstest] #[tokio::test] async fn status_is_model_loaded_works() { - let mut status = Status::new(false, 0); + let status = Status::new(false, 0); let vvm = open_default_vvm_file().await; assert!( !status.is_loaded_model(vvm.id()), diff --git a/crates/voicevox_core/src/voice_model.rs b/crates/voicevox_core/src/voice_model.rs index dea396ffb..2eb114b6a 100644 --- a/crates/voicevox_core/src/voice_model.rs +++ b/crates/voicevox_core/src/voice_model.rs @@ -1,12 +1,11 @@ -use anyhow::anyhow; use async_zip::{read::fs::ZipFileReader, ZipEntry}; use futures::future::{join3, join_all}; use serde::{de::DeserializeOwned, Deserialize}; use super::*; use std::{ - collections::HashMap, - env, + collections::{BTreeMap, HashMap}, + env, io, path::{Path, PathBuf}, }; @@ -16,7 +15,9 @@ use std::{ pub type RawVoiceModelId = String; /// 音声モデルID。 -#[derive(PartialEq, Eq, Clone, Ord, PartialOrd, Deserialize, new, Getters, Debug)] +#[derive( + PartialEq, Eq, Clone, Ord, PartialOrd, Deserialize, new, Getters, derive_more::Display, Debug, +)] pub struct VoiceModelId { raw_voice_model_id: RawVoiceModelId, } @@ -42,7 +43,7 @@ pub(crate) struct InferenceModels { } impl VoiceModel { - pub(crate) async fn read_inference_models(&self) -> Result { + pub(crate) async fn read_inference_models(&self) -> LoadModelResult { let reader = VvmEntryReader::open(&self.path).await?; let (decode_model_result, predict_duration_model_result, predict_intonation_model_result) = join3( @@ -53,39 +54,18 @@ impl VoiceModel { .await; Ok(InferenceModels { - predict_duration_model: predict_duration_model_result.map_err(|e| Error::VvmRead { - path: self.path.clone(), - source: e, - })?, - predict_intonation_model: predict_intonation_model_result.map_err(|e| { - Error::VvmRead { - path: self.path.clone(), - source: e, - } - })?, - decode_model: decode_model_result.map_err(|e| Error::VvmRead { - path: self.path.clone(), - source: e, - })?, + predict_duration_model: predict_duration_model_result?, + predict_intonation_model: predict_intonation_model_result?, + decode_model: decode_model_result?, }) } /// VVMファイルから`VoiceModel`をコンストラクトする。 - pub async fn from_path(path: impl AsRef) -> Result { - let reader = VvmEntryReader::open(&path).await?; - let manifest = reader - .read_vvm_json::("manifest.json") - .await - .map_err(|e| Error::VvmRead { - path: path.as_ref().into(), - source: e, - })?; + pub async fn from_path(path: impl AsRef) -> LoadModelResult { + let reader = VvmEntryReader::open(path.as_ref()).await?; + let manifest = reader.read_vvm_json::("manifest.json").await?; let metas = reader .read_vvm_json::(manifest.metas_filename()) - .await - .map_err(|e| Error::VvmRead { - path: path.as_ref().into(), - source: e, - })?; + .await?; let id = VoiceModelId::new(nanoid!()); Ok(Self { @@ -96,6 +76,10 @@ impl VoiceModel { }) } + // FIXME: `load_all_models`自体を廃止し、これはENGINE専用とする + /// # Panics + /// + /// 目的のディレクトリが読めなかったらパニックする pub async fn get_all_models() -> Result> { let root_dir = if cfg!(test) { Path::new(env!("CARGO_WORKSPACE_DIR")).join("model") @@ -113,26 +97,37 @@ impl VoiceModel { let vvm_paths = root_dir .read_dir() .and_then(|entries| entries.collect::, _>>()) - .map_err(|e| Error::LoadModel { - path: root_dir.clone(), - source: e.into(), - })? + .unwrap_or_else(|e| panic!("{}が読めませんでした: {e}", root_dir.display())) .into_iter() .filter(|entry| entry.path().extension().map_or(false, |ext| ext == "vvm")) .map(|entry| Self::from_path(entry.path())); - join_all(vvm_paths).await.into_iter().collect() + join_all(vvm_paths) + .await + .into_iter() + .collect::>() + .map_err(Into::into) } const ROOT_DIR_ENV_NAME: &str = "VV_MODELS_ROOT_DIR"; - /// スタイルIDからモデル内IDを取得する。 + /// モデル内のすべてのスタイルに対するモデル内IDを取得する。 + /// /// モデル内IDのマッピングが存在しない場合はそのままスタイルIDを返す。 - pub(crate) fn model_inner_id_for(&self, style_id: StyleId) -> ModelInnerId { - self.manifest - .style_id_to_model_inner_id() - .get(&style_id) - .cloned() - .unwrap_or_else(|| ModelInnerId::new(style_id.raw_id())) + pub(crate) fn model_inner_ids(&self) -> BTreeMap { + self.metas + .iter() + .flat_map(SpeakerMeta::styles) + .map(StyleMeta::id) + .map(|&style_id| { + let model_inner_id = self + .manifest + .style_id_to_model_inner_id() + .get(&style_id) + .copied() + .unwrap_or_else(|| ModelInnerId::new(style_id.raw_id())); + (style_id, model_inner_id) + }) + .collect() } } @@ -148,12 +143,13 @@ struct VvmEntryReader { } impl VvmEntryReader { - async fn open(path: impl AsRef) -> Result { - let reader = ZipFileReader::new(path.as_ref()) + async fn open(path: &Path) -> LoadModelResult { + let reader = ZipFileReader::new(path) .await - .map_err(|e| Error::OpenFile { - path: path.as_ref().into(), - source: e.into(), + .map_err(|source| LoadModelError { + path: path.to_owned(), + context: LoadModelErrorKind::OpenZipFile, + source: Some(source.into()), })?; let entry_map: HashMap<_, _> = reader .file() @@ -173,22 +169,38 @@ impl VvmEntryReader { .collect(); Ok(VvmEntryReader::new(reader, entry_map)) } - async fn read_vvm_json(&self, filename: &str) -> anyhow::Result { + async fn read_vvm_json(&self, filename: &str) -> LoadModelResult { let bytes = self.read_vvm_entry(filename).await?; - serde_json::from_slice(&bytes).map_err(|e| e.into()) + serde_json::from_slice(&bytes).map_err(|source| LoadModelError { + path: self.reader.path().to_owned(), + context: LoadModelErrorKind::ReadZipEntry { + filename: filename.to_owned(), + }, + source: Some(source.into()), + }) } - async fn read_vvm_entry(&self, filename: &str) -> anyhow::Result> { - let me = self - .entry_map - .get(filename) - .ok_or_else(|| anyhow!("Not found in vvm entries: {}", filename))?; - let mut manifest_reader = self.reader.entry(me.index).await?; - let mut buf = Vec::with_capacity(me.entry.uncompressed_size() as usize); - manifest_reader - .read_to_end_checked(&mut buf, &me.entry) - .await?; - Ok(buf) + async fn read_vvm_entry(&self, filename: &str) -> LoadModelResult> { + (|| async { + let me = self + .entry_map + .get(filename) + .ok_or_else(|| io::Error::from(io::ErrorKind::NotFound))?; + let mut manifest_reader = self.reader.entry(me.index).await?; + let mut buf = Vec::with_capacity(me.entry.uncompressed_size() as usize); + manifest_reader + .read_to_end_checked(&mut buf, &me.entry) + .await?; + Ok::<_, anyhow::Error>(buf) + })() + .await + .map_err(|source| LoadModelError { + path: self.reader.path().to_owned(), + context: LoadModelErrorKind::ReadZipEntry { + filename: filename.to_owned(), + }, + source: Some(source), + }) } } diff --git a/crates/voicevox_core/src/voice_synthesizer.rs b/crates/voicevox_core/src/voice_synthesizer.rs index 41c0406da..a798d8e17 100644 --- a/crates/voicevox_core/src/voice_synthesizer.rs +++ b/crates/voicevox_core/src/voice_synthesizer.rs @@ -173,18 +173,18 @@ impl Synthesizer { } /// 音声モデルを読み込む。 - pub async fn load_voice_model(&mut self, model: &VoiceModel) -> Result<()> { + pub async fn load_voice_model(&self, model: &VoiceModel) -> Result<()> { self.synthesis_engine - .inference_core_mut() + .inference_core() .load_model(model) .await?; Ok(()) } /// 音声モデルの読み込みを解除する。 - pub fn unload_voice_model(&mut self, voice_model_id: &VoiceModelId) -> Result<()> { + pub fn unload_voice_model(&self, voice_model_id: &VoiceModelId) -> Result<()> { self.synthesis_engine - .inference_core_mut() + .inference_core() .unload_model(voice_model_id) } @@ -203,7 +203,7 @@ impl Synthesizer { } /// 今読み込んでいる音声モデルのメタ情報を返す。 - pub fn metas(&self) -> &VoiceModelMeta { + pub fn metas(&self) -> VoiceModelMeta { self.synthesis_engine.inference_core().metas() } @@ -616,7 +616,7 @@ mod tests { #[case(Ok(()))] #[tokio::test] async fn load_model_works(#[case] expected_result_at_initialized: Result<()>) { - let mut syntesizer = Synthesizer::new_with_initialize( + let syntesizer = Synthesizer::new_with_initialize( Arc::new(OpenJtalk::new_without_dic()), &InitializeOptions { acceleration_mode: AccelerationMode::Cpu, @@ -657,7 +657,7 @@ mod tests { #[tokio::test] async fn is_loaded_model_by_style_id_works(#[case] style_id: u32, #[case] expected: bool) { let style_id = StyleId::new(style_id); - let mut syntesizer = Synthesizer::new_with_initialize( + let syntesizer = Synthesizer::new_with_initialize( Arc::new(OpenJtalk::new_without_dic()), &InitializeOptions { acceleration_mode: AccelerationMode::Cpu, @@ -686,7 +686,7 @@ mod tests { #[rstest] #[tokio::test] async fn predict_duration_works() { - let mut syntesizer = Synthesizer::new_with_initialize( + let syntesizer = Synthesizer::new_with_initialize( Arc::new(OpenJtalk::new_without_dic()), &InitializeOptions { acceleration_mode: AccelerationMode::Cpu, @@ -718,7 +718,7 @@ mod tests { #[rstest] #[tokio::test] async fn predict_intonation_works() { - let mut syntesizer = Synthesizer::new_with_initialize( + let syntesizer = Synthesizer::new_with_initialize( Arc::new(OpenJtalk::new_without_dic()), &InitializeOptions { acceleration_mode: AccelerationMode::Cpu, @@ -760,7 +760,7 @@ mod tests { #[rstest] #[tokio::test] async fn decode_works() { - let mut syntesizer = Synthesizer::new_with_initialize( + let syntesizer = Synthesizer::new_with_initialize( Arc::new(OpenJtalk::new_without_dic()), &InitializeOptions { acceleration_mode: AccelerationMode::Cpu, diff --git a/crates/voicevox_core_c_api/include/voicevox_core.h b/crates/voicevox_core_c_api/include/voicevox_core.h index d5cf4f6b2..8b012e901 100644 --- a/crates/voicevox_core_c_api/include/voicevox_core.h +++ b/crates/voicevox_core_c_api/include/voicevox_core.h @@ -94,10 +94,6 @@ enum VoicevoxResultCode * open_jtalk辞書ファイルが読み込まれていない */ VOICEVOX_RESULT_NOT_LOADED_OPENJTALK_DICT_ERROR = 1, - /** - * modelの読み込みに失敗した - */ - VOICEVOX_RESULT_LOAD_MODEL_ERROR = 2, /** * サポートされているデバイス情報取得に失敗した */ @@ -106,10 +102,6 @@ enum VoicevoxResultCode * GPUモードがサポートされていない */ VOICEVOX_RESULT_GPU_SUPPORT_ERROR = 4, - /** - * メタ情報読み込みに失敗した - */ - VOICEVOX_RESULT_LOAD_METAS_ERROR = 5, /** * 無効なstyle_idが指定された */ @@ -143,17 +135,25 @@ enum VoicevoxResultCode */ VOICEVOX_RESULT_INVALID_ACCENT_PHRASE_ERROR = 15, /** - * ファイルオープンエラー + * ZIPファイルを開くことに失敗した + */ + VOICEVOX_RESULT_OPEN_ZIP_FILE_ERROR = 16, + /** + * ZIP内のファイルが読めなかった + */ + VOICEVOX_RESULT_READ_ZIP_ENTRY_ERROR = 17, + /** + * すでに読み込まれている音声モデルを読み込もうとした */ - VOICEVOX_RESULT_OPEN_FILE_ERROR = 16, + VOICEVOX_RESULT_MODEL_ALREADY_LOADED_ERROR = 18, /** - * Modelを読み込めなかった + * すでに読み込まれているスタイルを読み込もうとした */ - VOICEVOX_RESULT_VVM_MODEL_READ_ERROR = 17, + VOICEVOX_RESULT_STYLE_ALREADY_LOADED_ERROR = 26, /** - * すでに読み込まれているModelを読み込もうとした + * 無効なモデルデータ */ - VOICEVOX_RESULT_ALREADY_LOADED_MODEL_ERROR = 18, + VOICEVOX_RESULT_INVALID_MODEL_DATA_ERROR = 27, /** * Modelが読み込まれていない */ @@ -569,7 +569,7 @@ void voicevox_synthesizer_delete(struct VoicevoxSynthesizer *synthesizer); #ifdef _WIN32 __declspec(dllimport) #endif -VoicevoxResultCode voicevox_synthesizer_load_voice_model(struct VoicevoxSynthesizer *synthesizer, +VoicevoxResultCode voicevox_synthesizer_load_voice_model(const struct VoicevoxSynthesizer *synthesizer, const struct VoicevoxVoiceModel *model); /** @@ -588,7 +588,7 @@ VoicevoxResultCode voicevox_synthesizer_load_voice_model(struct VoicevoxSynthesi #ifdef _WIN32 __declspec(dllimport) #endif -VoicevoxResultCode voicevox_synthesizer_unload_voice_model(struct VoicevoxSynthesizer *synthesizer, +VoicevoxResultCode voicevox_synthesizer_unload_voice_model(const struct VoicevoxSynthesizer *synthesizer, VoicevoxVoiceModelId model_id); /** @@ -629,19 +629,20 @@ bool voicevox_synthesizer_is_loaded_voice_model(const struct VoicevoxSynthesizer /** * 今読み込んでいる音声モデルのメタ情報を、JSONで取得する。 * + * JSONの解放は ::voicevox_json_free で行う。 + * * @param [in] synthesizer 音声シンセサイザ * * @return メタ情報のJSON文字列 * * \safety{ * - `synthesizer`は ::voicevox_synthesizer_new_with_initialize で得たものでなければならず、また ::voicevox_synthesizer_delete で解放されていてはいけない。 - * - 戻り値の文字列の生存期間(_lifetime_)は次にこの関数が呼ばれるか、`synthesizer`が破棄されるまでである。この生存期間を越えて文字列にアクセスしてはならない。 * } */ #ifdef _WIN32 __declspec(dllimport) #endif -const char *voicevox_synthesizer_get_metas_json(const struct VoicevoxSynthesizer *synthesizer); +char *voicevox_synthesizer_create_metas_json(const struct VoicevoxSynthesizer *synthesizer); /** * このライブラリで利用可能なデバイスの情報を、JSONで取得する。 @@ -951,6 +952,7 @@ VoicevoxResultCode voicevox_synthesizer_tts(const struct VoicevoxSynthesizer *sy * \safety{ * - `json`は以下のAPIで得られたポインタでなくてはいけない。 * - ::voicevox_create_supported_devices_json + * - ::voicevox_synthesizer_create_metas_json * - ::voicevox_synthesizer_create_audio_query * - ::voicevox_synthesizer_create_accent_phrases * - ::voicevox_synthesizer_replace_mora_data diff --git a/crates/voicevox_core_c_api/src/c_impls.rs b/crates/voicevox_core_c_api/src/c_impls.rs index f90db2337..a891593a4 100644 --- a/crates/voicevox_core_c_api/src/c_impls.rs +++ b/crates/voicevox_core_c_api/src/c_impls.rs @@ -1,12 +1,8 @@ -use std::{ - ffi::{CStr, CString}, - path::Path, - sync::Arc, -}; +use std::{ffi::CString, path::Path, sync::Arc}; use voicevox_core::{InitializeOptions, OpenJtalk, Result, Synthesizer, VoiceModel, VoiceModelId}; -use crate::{OpenJtalkRc, VoicevoxSynthesizer, VoicevoxVoiceModel}; +use crate::{CApiResult, OpenJtalkRc, VoicevoxSynthesizer, VoicevoxVoiceModel}; impl OpenJtalkRc { pub(crate) fn new_with_initialize(open_jtalk_dic_dir: impl AsRef) -> Result { @@ -23,30 +19,22 @@ impl VoicevoxSynthesizer { ) -> Result { let synthesizer = Synthesizer::new_with_initialize(open_jtalk.open_jtalk.clone(), options).await?; - let metas = synthesizer.metas(); - let metas_cstring = CString::new(serde_json::to_string(&metas).unwrap()).unwrap(); - Ok(Self { - synthesizer, - metas_cstring, - }) + Ok(Self { synthesizer }) } - pub(crate) async fn load_voice_model(&mut self, model: &VoiceModel) -> Result<()> { + pub(crate) async fn load_voice_model(&self, model: &VoiceModel) -> CApiResult<()> { self.synthesizer.load_voice_model(model).await?; - let metas = self.synthesizer.metas(); - self.metas_cstring = CString::new(serde_json::to_string(metas).unwrap()).unwrap(); Ok(()) } - pub(crate) fn unload_voice_model(&mut self, model_id: &VoiceModelId) -> Result<()> { + pub(crate) fn unload_voice_model(&self, model_id: &VoiceModelId) -> Result<()> { self.synthesizer.unload_voice_model(model_id)?; - let metas = self.synthesizer.metas(); - self.metas_cstring = CString::new(serde_json::to_string(metas).unwrap()).unwrap(); Ok(()) } - pub(crate) fn metas(&self) -> &CStr { - &self.metas_cstring + pub(crate) fn metas(&self) -> CString { + let metas = &self.synthesizer.metas(); + CString::new(serde_json::to_string(metas).unwrap()).unwrap() } } diff --git a/crates/voicevox_core_c_api/src/helpers.rs b/crates/voicevox_core_c_api/src/helpers.rs index 72d43497e..0e9cfd279 100644 --- a/crates/voicevox_core_c_api/src/helpers.rs +++ b/crates/voicevox_core_c_api/src/helpers.rs @@ -18,15 +18,20 @@ pub(crate) fn into_result_code_with_error(result: CApiResult<()>) -> VoicevoxRes } fn into_result_code(result: CApiResult<()>) -> VoicevoxResultCode { - use voicevox_core::{result_code::VoicevoxResultCode::*, Error::*}; + use voicevox_core::{result_code::VoicevoxResultCode::*, Error::*, LoadModelErrorKind::*}; use CApiError::*; match result { Ok(()) => VOICEVOX_RESULT_OK, Err(RustApi(NotLoadedOpenjtalkDict)) => VOICEVOX_RESULT_NOT_LOADED_OPENJTALK_DICT_ERROR, Err(RustApi(GpuSupport)) => VOICEVOX_RESULT_GPU_SUPPORT_ERROR, - Err(RustApi(LoadModel { .. })) => VOICEVOX_RESULT_LOAD_MODEL_ERROR, - Err(RustApi(LoadMetas(_))) => VOICEVOX_RESULT_LOAD_METAS_ERROR, + Err(RustApi(LoadModel(err))) => match err.context() { + OpenZipFile => VOICEVOX_RESULT_OPEN_ZIP_FILE_ERROR, + ReadZipEntry { .. } => VOICEVOX_RESULT_READ_ZIP_ENTRY_ERROR, + ModelAlreadyLoaded { .. } => VOICEVOX_RESULT_MODEL_ALREADY_LOADED_ERROR, + StyleAlreadyLoaded { .. } => VOICEVOX_RESULT_STYLE_ALREADY_LOADED_ERROR, + InvalidModelData => VOICEVOX_RESULT_INVALID_MODEL_DATA_ERROR, + }, Err(RustApi(GetSupportedDevices(_))) => VOICEVOX_RESULT_GET_SUPPORTED_DEVICES_ERROR, Err(RustApi(InvalidStyleId { .. })) => VOICEVOX_RESULT_INVALID_STYLE_ID_ERROR, Err(RustApi(InvalidModelId { .. })) => VOICEVOX_RESULT_INVALID_MODEL_ID_ERROR, @@ -35,9 +40,6 @@ pub(crate) fn into_result_code_with_error(result: CApiResult<()>) -> VoicevoxRes VOICEVOX_RESULT_EXTRACT_FULL_CONTEXT_LABEL_ERROR } Err(RustApi(UnloadedModel { .. })) => VOICEVOX_RESULT_UNLOADED_MODEL_ERROR, - Err(RustApi(AlreadyLoadedModel { .. })) => VOICEVOX_RESULT_ALREADY_LOADED_MODEL_ERROR, - Err(RustApi(OpenFile { .. })) => VOICEVOX_RESULT_OPEN_FILE_ERROR, - Err(RustApi(VvmRead { .. })) => VOICEVOX_RESULT_VVM_MODEL_READ_ERROR, Err(RustApi(ParseKana(_))) => VOICEVOX_RESULT_PARSE_KANA_ERROR, Err(RustApi(LoadUserDict(_))) => VOICEVOX_RESULT_LOAD_USER_DICT_ERROR, Err(RustApi(SaveUserDict(_))) => VOICEVOX_RESULT_SAVE_USER_DICT_ERROR, @@ -52,10 +54,10 @@ pub(crate) fn into_result_code_with_error(result: CApiResult<()>) -> VoicevoxRes } } -type CApiResult = std::result::Result; +pub(crate) type CApiResult = std::result::Result; #[derive(Error, Debug)] -pub(crate) enum CApiError { +pub enum CApiError { #[error("{0}")] RustApi(#[from] voicevox_core::Error), #[error("UTF-8として不正な入力です")] diff --git a/crates/voicevox_core_c_api/src/lib.rs b/crates/voicevox_core_c_api/src/lib.rs index db06de484..e97b0e5bb 100644 --- a/crates/voicevox_core_c_api/src/lib.rs +++ b/crates/voicevox_core_c_api/src/lib.rs @@ -316,7 +316,6 @@ pub extern "C" fn voicevox_voice_model_delete(model: Box) { #[derive(Getters)] pub struct VoicevoxSynthesizer { synthesizer: Synthesizer, - metas_cstring: CString, } /// ::VoicevoxSynthesizer を構築(_construct_)する。 @@ -376,14 +375,10 @@ pub extern "C" fn voicevox_synthesizer_delete(synthesizer: Box VoicevoxResultCode { - into_result_code_with_error( - RUNTIME - .block_on(synthesizer.load_voice_model(model.model())) - .map_err(Into::into), - ) + into_result_code_with_error(RUNTIME.block_on(synthesizer.load_voice_model(model.model()))) } /// 音声モデルの読み込みを解除する。 @@ -399,7 +394,7 @@ pub extern "C" fn voicevox_synthesizer_load_voice_model( /// } #[no_mangle] pub unsafe extern "C" fn voicevox_synthesizer_unload_voice_model( - synthesizer: &mut VoicevoxSynthesizer, + synthesizer: &VoicevoxSynthesizer, model_id: VoicevoxVoiceModelId, ) -> VoicevoxResultCode { into_result_code_with_error((|| { @@ -448,19 +443,21 @@ pub unsafe extern "C" fn voicevox_synthesizer_is_loaded_voice_model( /// 今読み込んでいる音声モデルのメタ情報を、JSONで取得する。 /// +/// JSONの解放は ::voicevox_json_free で行う。 +/// /// @param [in] synthesizer 音声シンセサイザ /// /// @return メタ情報のJSON文字列 /// /// \safety{ /// - `synthesizer`は ::voicevox_synthesizer_new_with_initialize で得たものでなければならず、また ::voicevox_synthesizer_delete で解放されていてはいけない。 -/// - 戻り値の文字列の生存期間(_lifetime_)は次にこの関数が呼ばれるか、`synthesizer`が破棄されるまでである。この生存期間を越えて文字列にアクセスしてはならない。 /// } #[no_mangle] -pub extern "C" fn voicevox_synthesizer_get_metas_json( +pub extern "C" fn voicevox_synthesizer_create_metas_json( synthesizer: &VoicevoxSynthesizer, -) -> *const c_char { - synthesizer.metas().as_ptr() +) -> *mut c_char { + let metas = synthesizer.metas(); + C_STRING_DROP_CHECKER.whitelist(metas).into_raw() } /// このライブラリで利用可能なデバイスの情報を、JSONで取得する。 @@ -895,6 +892,7 @@ pub unsafe extern "C" fn voicevox_synthesizer_tts( /// \safety{ /// - `json`は以下のAPIで得られたポインタでなくてはいけない。 /// - ::voicevox_create_supported_devices_json +/// - ::voicevox_synthesizer_create_metas_json /// - ::voicevox_synthesizer_create_audio_query /// - ::voicevox_synthesizer_create_accent_phrases /// - ::voicevox_synthesizer_replace_mora_data @@ -1238,13 +1236,6 @@ mod tests { Err(Error::NotLoadedOpenjtalkDict), VoicevoxResultCode::VOICEVOX_RESULT_NOT_LOADED_OPENJTALK_DICT_ERROR )] - #[case( - Err(Error::LoadModel { - path: "path/to/model.onnx".into(), - source: anyhow!("some load model error"), - }), - VoicevoxResultCode::VOICEVOX_RESULT_LOAD_MODEL_ERROR - )] #[case( Err(Error::GetSupportedDevices(anyhow!("some get supported devices error"))), VoicevoxResultCode::VOICEVOX_RESULT_GET_SUPPORTED_DEVICES_ERROR diff --git a/crates/voicevox_core_c_api/tests/e2e/symbols.rs b/crates/voicevox_core_c_api/tests/e2e/symbols.rs index 47d15ec1d..32634bbbf 100644 --- a/crates/voicevox_core_c_api/tests/e2e/symbols.rs +++ b/crates/voicevox_core_c_api/tests/e2e/symbols.rs @@ -55,8 +55,8 @@ pub(crate) struct Symbols<'lib> { 'lib, unsafe extern "C" fn(*const VoicevoxSynthesizer, VoicevoxVoiceModelId) -> bool, >, - pub(crate) voicevox_synthesizer_get_metas_json: - Symbol<'lib, unsafe extern "C" fn(*const VoicevoxSynthesizer) -> *const c_char>, + pub(crate) voicevox_synthesizer_create_metas_json: + Symbol<'lib, unsafe extern "C" fn(*const VoicevoxSynthesizer) -> *mut c_char>, pub(crate) voicevox_create_supported_devices_json: Symbol<'lib, unsafe extern "C" fn(*mut *mut c_char) -> VoicevoxResultCode>, pub(crate) voicevox_make_default_audio_query_options: @@ -203,7 +203,7 @@ impl<'lib> Symbols<'lib> { voicevox_synthesizer_unload_voice_model, voicevox_synthesizer_is_gpu_mode, voicevox_synthesizer_is_loaded_voice_model, - voicevox_synthesizer_get_metas_json, + voicevox_synthesizer_create_metas_json, voicevox_create_supported_devices_json, voicevox_make_default_audio_query_options, voicevox_synthesizer_create_audio_query, diff --git a/crates/voicevox_core_c_api/tests/e2e/testcases/synthesizer_new_with_initialize_output_json.rs b/crates/voicevox_core_c_api/tests/e2e/testcases/synthesizer_new_with_initialize_output_json.rs index 502880a14..41a7d853c 100644 --- a/crates/voicevox_core_c_api/tests/e2e/testcases/synthesizer_new_with_initialize_output_json.rs +++ b/crates/voicevox_core_c_api/tests/e2e/testcases/synthesizer_new_with_initialize_output_json.rs @@ -31,7 +31,8 @@ impl assert_cdylib::TestCase for TestCase { voicevox_open_jtalk_rc_delete, voicevox_synthesizer_new_with_initialize, voicevox_synthesizer_delete, - voicevox_synthesizer_get_metas_json, + voicevox_synthesizer_create_metas_json, + voicevox_json_free, .. } = Symbols::new(lib)?; @@ -60,9 +61,11 @@ impl assert_cdylib::TestCase for TestCase { }; let metas_json = { - let metas_json = - CStr::from_ptr(voicevox_synthesizer_get_metas_json(synthesizer)).to_str()?; - serde_json::to_string_pretty(&metas_json.parse::()?).unwrap() + let raw = voicevox_synthesizer_create_metas_json(synthesizer); + let metas_json = &CStr::from_ptr(raw).to_str()?.parse::()?; + let metas_json = serde_json::to_string_pretty(metas_json).unwrap(); + voicevox_json_free(raw); + metas_json }; std::assert_eq!(SNAPSHOTS.metas, metas_json); diff --git a/crates/voicevox_core_python_api/src/lib.rs b/crates/voicevox_core_python_api/src/lib.rs index 40b9e31bd..3c76dd8a0 100644 --- a/crates/voicevox_core_python_api/src/lib.rs +++ b/crates/voicevox_core_python_api/src/lib.rs @@ -178,7 +178,7 @@ impl Synthesizer { #[getter] fn metas<'py>(&self, py: Python<'py>) -> PyResult> { let synthesizer = self.synthesizer.get()?; - to_pydantic_voice_model_meta(RUNTIME.block_on(synthesizer.lock()).metas(), py) + to_pydantic_voice_model_meta(&RUNTIME.block_on(synthesizer.lock()).metas(), py) } fn load_voice_model<'py>(