Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ストリーミングモードのdecodeを実装(precompute_renderとrender) #854

Merged
merged 23 commits into from
Oct 29, 2024
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions crates/voicevox_core/src/blocking.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

pub use crate::{
engine::open_jtalk::blocking::OpenJtalk, infer::runtimes::onnxruntime::blocking::Onnxruntime,
synthesizer::blocking::Synthesizer, user_dict::dict::blocking::UserDict,
voice_model::blocking::VoiceModelFile,
synthesizer::blocking::Audio, synthesizer::blocking::Synthesizer,
user_dict::dict::blocking::UserDict, voice_model::blocking::VoiceModelFile,
};

pub mod onnxruntime {
Expand Down
300 changes: 202 additions & 98 deletions crates/voicevox_core/src/synthesizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,22 @@ pub(crate) mod blocking {

const DEFAULT_SAMPLING_RATE: u32 = 24000;

/// Intermediate synthesis product handed to the user.
///
/// Produced by `seekable_synthesis`; arbitrary sub-ranges of it can later be
/// rendered to PCM with `render`.
pub struct Audio {
    /// Audio feature sequence with shape (frame count, feature count).
    pub internal_state: ndarray::Array2<f32>,
    /// Style ID specified at generation time.
    pub style_id: crate::StyleId,
    /// Number of feature frames excluding the workaround padding.
    pub length: usize,
    /// Sampling rate in frames per second; the total duration is
    /// `length / sampling_rate` seconds.
    pub sampling_rate: f32,
    /// Length (in frames) of the padding appended on each side as a
    /// workaround against audible truncation.
    pub padding_length: usize,
    /// The query used at generation time.
    pub audio_query: AudioQuery,
}

/// 音声シンセサイザ。
pub struct Synthesizer<O> {
pub(super) status: Status<crate::blocking::Onnxruntime>,
Expand Down Expand Up @@ -257,13 +273,13 @@ pub(crate) mod blocking {
self.status.metas()
}

/// AudioQueryから音声合成を行う
pub fn synthesis(
/// AudioQueryから音声合成用の中間表現を生成する
pub fn seekable_synthesis(
&self,
audio_query: &AudioQuery,
style_id: StyleId,
options: &SynthesisOptions,
) -> Result<Vec<u8>> {
) -> Result<Audio> {
let AudioQuery {
accent_phrases,
speed_scale,
Expand Down Expand Up @@ -362,14 +378,36 @@ pub(crate) mod blocking {
}
}

let wave = &self.decode(
f0.len(),
OjtPhoneme::num_phoneme(),
&f0,
// 音が途切れてしまうのを避けるworkaround処理が入っている
// TODO: 改善したらここのpadding処理を取り除く
const PADDING_SIZE: f64 = 0.4;
let padding_size =
((PADDING_SIZE * DEFAULT_SAMPLING_RATE as f64) / 256.0).round() as usize;
Yosshi999 marked this conversation as resolved.
Show resolved Hide resolved
let start_and_end_padding_size = 2 * padding_size;
let length_with_padding = f0.len() + start_and_end_padding_size;
let f0_with_padding = make_f0_with_padding(&f0, length_with_padding, padding_size);
let phoneme_with_padding = make_phoneme_with_padding(
phoneme.as_flattened(),
OjtPhoneme::num_phoneme(),
length_with_padding,
padding_size,
);

let spec = self.generate_full_intermediate(
f0_with_padding.len(),
OjtPhoneme::num_phoneme(),
&f0_with_padding,
&phoneme_with_padding,
style_id,
)?;
return Ok(to_wav(wave, audio_query));
return Ok(Audio {
internal_state: spec,
style_id,
length: f0.len(),
sampling_rate: (DEFAULT_SAMPLING_RATE as f32) / 256.0,
padding_length: padding_size,
audio_query: audio_query.clone(),
});

fn adjust_interrogative_accent_phrases(
accent_phrases: &[AccentPhrase],
Expand Down Expand Up @@ -420,24 +458,127 @@ pub(crate) mod blocking {
}
}

fn to_wav(
/// Returns `f0_slice` with `padding_size` zero frames prepended and appended.
///
/// Workaround against the audio getting cut off at the edges; delete this
/// function once the underlying issue is fixed.
fn make_f0_with_padding(
    f0_slice: &[f32],
    length_with_padding: usize,
    padding_size: usize,
) -> Vec<f32> {
    let mut padded = Vec::with_capacity(length_with_padding);
    padded.extend(std::iter::repeat(0.0).take(padding_size));
    padded.extend_from_slice(f0_slice);
    padded.extend(std::iter::repeat(0.0).take(padding_size));
    padded
}

/// Returns the flattened (frame-major) phoneme matrix with `padding_size`
/// padding frames prepended and appended.
///
/// Workaround against the audio getting cut off at the edges; delete this
/// function once the underlying issue is fixed.
fn make_phoneme_with_padding(
    phoneme_slice: &[f32],
    phoneme_size: usize,
    length_with_padding: usize,
    padding_size: usize,
) -> Vec<f32> {
    // Each padding frame is a one-hot vector with index 0 set.
    let mut one_hot_frame = vec![0.0; phoneme_size];
    one_hot_frame[0] = 1.0;

    let mut padded = Vec::with_capacity(phoneme_size * length_with_padding);
    for _ in 0..padding_size {
        padded.extend_from_slice(&one_hot_frame);
    }
    padded.extend_from_slice(phoneme_slice);
    for _ in 0..padding_size {
        padded.extend_from_slice(&one_hot_frame);
    }
    padded
}
}

/// Generates a 16-bit PCM waveform for frames `begin..end` of the
/// intermediate representation.
///
/// `begin` and `end` are clamped to `audio.length`. A safety margin around
/// the requested window is rendered together with it and then cut away, so
/// that rendering in segments does not introduce boundary artifacts.
pub fn render(&self, audio: &Audio, begin: usize, end: usize) -> Result<Vec<u8>> {
    // Safety margin computed from the receptive field of the HifiGAN in use.
    const MARGIN: usize = 14;
    use std::cmp::min;
    // Requested interval in actual frames (i.e. excluding the workaround padding).
    let clipped_begin = min(begin, audio.length);
    let clipped_end = min(end, audio.length);
    // Safety margins, clamped so they never reach outside the stored data.
    let left_margin = min(MARGIN, audio.padding_length + clipped_begin);
    let right_margin = min(MARGIN, audio.padding_length + (audio.length - clipped_end));
    // Interval on the padded data, widened by the safety margins.
    let slice_begin = audio.padding_length + clipped_begin - left_margin;
    let slice_end = audio.padding_length + clipped_end + right_margin;
    let window = audio
        .internal_state
        .slice(ndarray::s![slice_begin..slice_end, ..]);
    let wave_with_margin =
        self.render_audio_segment(window.into_owned(), audio.style_id)?;
    // One feature frame corresponds to 256 output samples; cut the margins off.
    let wave = wave_with_margin
        .slice(ndarray::s![
            left_margin * 256..wave_with_margin.len() - right_margin * 256
        ])
        .into_owned()
        .into_raw_vec();
    return Ok(to_s16le_pcm(&wave, &audio.audio_query));

    // Converts f32 samples to interleaved signed 16-bit little-endian PCM,
    // applying the query's volume scale.
    fn to_s16le_pcm(
        wave: &[f32],
        &AudioQuery {
            volume_scale,
            output_sampling_rate,
            output_stereo,
            ..
        }: &AudioQuery,
    ) -> Vec<u8> {
        let num_channels: u16 = if output_stereo { 2 } else { 1 };
        // NOTE(review): integer division — an `output_sampling_rate` below
        // DEFAULT_SAMPLING_RATE yields `repeat_count == 0` and thus empty
        // output; presumably such rates are rejected elsewhere — confirm.
        let repeat_count: u32 =
            (output_sampling_rate / DEFAULT_SAMPLING_RATE) * num_channels as u32;
        let bytes_size = wave.len() as u32 * repeat_count * 2;
        let buf: Vec<u8> = Vec::with_capacity(bytes_size as usize);
        let mut cur = Cursor::new(buf);

        for value in wave {
            let v = (value * volume_scale).clamp(-1., 1.);
            let data = (v * 0x7fff as f32) as i16;
            // Each sample is repeated to duplicate channels and to upsample
            // to integer multiples of the default rate.
            for _ in 0..repeat_count {
                cur.write_all(&data.to_le_bytes()).unwrap();
            }
        }

        cur.into_inner()
    }
}

/// AudioQueryから直接WAVフォーマットで音声波形を生成する。
pub fn synthesis(
&self,
audio_query: &AudioQuery,
style_id: StyleId,
options: &SynthesisOptions,
) -> Result<Vec<u8>> {
let audio = self.seekable_synthesis(audio_query, style_id, options)?;
let pcm = self.render(&audio, 0, audio.length)?;
return Ok(to_wav(&pcm, &audio_query));

fn to_wav(
pcm: &[u8],
&AudioQuery {
output_sampling_rate,
output_stereo,
..
}: &AudioQuery,
) -> Vec<u8> {
// TODO: 44.1kHzなどの対応

let num_channels: u16 = if output_stereo { 2 } else { 1 };
let bit_depth: u16 = 16;
let repeat_count: u32 =
(output_sampling_rate / DEFAULT_SAMPLING_RATE) * num_channels as u32;
let block_size: u16 = bit_depth * num_channels / 8;

let bytes_size = wave.len() as u32 * repeat_count * 2;
let bytes_size = pcm.len() as u32;
let wave_size = bytes_size + 44;

let buf: Vec<u8> = Vec::with_capacity(wave_size as usize);
Expand All @@ -458,15 +599,7 @@ pub(crate) mod blocking {
cur.write_all(&bit_depth.to_le_bytes()).unwrap();
cur.write_all("data".as_bytes()).unwrap();
cur.write_all(&bytes_size.to_le_bytes()).unwrap();

for value in wave {
let v = (value * volume_scale).clamp(-1., 1.);
let data = (v * 0x7fff as f32) as i16;
for _ in 0..repeat_count {
cur.write_all(&data.to_le_bytes()).unwrap();
}
}

cur.write_all(&pcm).unwrap();
cur.into_inner()
}
}
Expand Down Expand Up @@ -840,6 +973,21 @@ pub(crate) mod blocking {
style_id: StyleId,
) -> Result<Vec<f32>>;

/// Runs the model that produces the full intermediate representation
/// (`spec`, shape (frames, feature dim)) from f0 and flattened phoneme
/// sequences.
fn generate_full_intermediate(
    &self,
    length: usize,
    phoneme_size: usize,
    f0: &[f32],
    phoneme_vector: &[f32],
    style_id: StyleId,
) -> Result<ndarray::Array2<f32>>;

/// Renders a waveform segment from (a slice of) the intermediate
/// representation produced by `generate_full_intermediate`.
fn render_audio_segment(
    &self,
    spec: ndarray::Array2<f32>,
    style_id: StyleId,
) -> Result<ndarray::Array1<f32>>;

/// `decode`を実行する。
///
/// # Performance
Expand Down Expand Up @@ -911,102 +1059,58 @@ pub(crate) mod blocking {
Ok(output.into_raw_vec())
}

fn decode(
fn generate_full_intermediate(
&self,
length: usize,
phoneme_size: usize,
f0: &[f32],
phoneme_vector: &[f32],
style_id: StyleId,
) -> Result<Vec<f32>> {
) -> Result<ndarray::Array2<f32>> {
let (model_id, inner_voice_id) = self.status.ids_for::<TalkDomain>(style_id)?;

// 音が途切れてしまうのを避けるworkaround処理が入っている
// TODO: 改善したらここのpadding処理を取り除く
const PADDING_SIZE: f64 = 0.4;
let padding_size =
((PADDING_SIZE * DEFAULT_SAMPLING_RATE as f64) / 256.0).round() as usize;
let start_and_end_padding_size = 2 * padding_size;
let length_with_padding = length + start_and_end_padding_size;
let f0_with_padding = make_f0_with_padding(f0, length_with_padding, padding_size);

let phoneme_with_padding = make_phoneme_with_padding(
phoneme_vector,
phoneme_size,
length_with_padding,
padding_size,
);

let GenerateFullIntermediateOutput { spec } = self.status.run_session(
Yosshi999 marked this conversation as resolved.
Show resolved Hide resolved
model_id,
GenerateFullIntermediateInput {
f0: ndarray::arr1(&f0_with_padding)
.into_shape([length_with_padding, 1])
.unwrap(),
phoneme: ndarray::arr1(&phoneme_with_padding)
.into_shape([length_with_padding, phoneme_size])
f0: ndarray::arr1(&f0).into_shape([length, 1]).unwrap(),
phoneme: ndarray::arr1(&phoneme_vector)
.into_shape([length, phoneme_size])
.unwrap(),
speaker_id: ndarray::arr1(&[inner_voice_id.raw_id().into()]),
},
)?;
Ok(spec)
}

let RenderAudioSegmentOutput { wave: output } = self
fn render_audio_segment(
&self,
spec: ndarray::Array2<f32>,
style_id: StyleId,
) -> Result<ndarray::Array1<f32>> {
let (model_id, _inner_voice_id) = self.status.ids_for::<TalkDomain>(style_id)?;
let RenderAudioSegmentOutput { wave } = self
.status
.run_session(model_id, RenderAudioSegmentInput { spec })?;
Ok(wave)
}

return Ok(trim_padding_from_output(
output.into_raw_vec(),
padding_size,
));

// Returns `f0_slice` with `padding_size` zero frames on each side.
fn make_f0_with_padding(
    f0_slice: &[f32],
    length_with_padding: usize,
    padding_size: usize,
) -> Vec<f32> {
    // Workaround against the audio getting cut off at the edges.
    // Delete this function once the underlying issue is fixed.
    let mut f0_with_padding = Vec::with_capacity(length_with_padding);
    let padding = vec![0.0; padding_size];
    f0_with_padding.extend_from_slice(&padding);
    f0_with_padding.extend_from_slice(f0_slice);
    f0_with_padding.extend_from_slice(&padding);
    f0_with_padding
}

// Returns the flattened phoneme matrix with `padding_size` one-hot
// (index 0) padding frames on each side.
fn make_phoneme_with_padding(
    phoneme_slice: &[f32],
    phoneme_size: usize,
    length_with_padding: usize,
    padding_size: usize,
) -> Vec<f32> {
    // Workaround against the audio getting cut off at the edges.
    // Delete this function once the underlying issue is fixed.
    let mut padding_phoneme = vec![0.0; phoneme_size];
    padding_phoneme[0] = 1.0;
    let padding_phoneme_len = padding_phoneme.len();
    // Repeat the one-hot frame `padding_size` times.
    let padding_phonemes: Vec<f32> = padding_phoneme
        .into_iter()
        .cycle()
        .take(padding_phoneme_len * padding_size)
        .collect();
    let mut phoneme_with_padding =
        Vec::with_capacity(phoneme_size * length_with_padding);
    phoneme_with_padding.extend_from_slice(&padding_phonemes);
    phoneme_with_padding.extend_from_slice(phoneme_slice);
    phoneme_with_padding.extend_from_slice(&padding_phonemes);

    phoneme_with_padding
}

/// Removes `padding_f0_size` f0-frames' worth of samples (256 samples per
/// frame) from both ends of the rendered waveform.
///
/// Part of the anti-truncation workaround; delete this function once the
/// underlying issue is fixed. Panics if `output` is shorter than twice the
/// padding (same as the previous `drain`-based version).
fn trim_padding_from_output(output: Vec<f32>, padding_f0_size: usize) -> Vec<f32> {
    let padding_sampling_size = padding_f0_size * 256;
    // Copy just the un-padded middle. The previous `drain(..).collect()`
    // also shifted the two leftover padding halves together inside the
    // source vector before dropping it — pure wasted work.
    output[padding_sampling_size..output.len() - padding_sampling_size].to_vec()
}
fn decode(
Yosshi999 marked this conversation as resolved.
Show resolved Hide resolved
&self,
length: usize,
phoneme_size: usize,
f0: &[f32],
phoneme_vector: &[f32],
style_id: StyleId,
) -> Result<Vec<f32>> {
let intermediate = self.generate_full_intermediate(
length,
phoneme_size,
&f0,
&phoneme_vector,
style_id,
)?;
let output = self.render_audio_segment(intermediate, style_id)?;
return Ok(output.into_raw_vec());
}
}

Expand Down
Loading
Loading