Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ストリーミングモードのdecodeを実装(precompute_renderとrender) #854

Merged
merged 23 commits into from
Oct 29, 2024
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions crates/voicevox_core/src/blocking.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

pub use crate::{
engine::open_jtalk::blocking::OpenJtalk, infer::runtimes::onnxruntime::blocking::Onnxruntime,
synthesizer::blocking::Synthesizer, user_dict::dict::blocking::UserDict,
voice_model::blocking::VoiceModelFile,
synthesizer::blocking::Audio, synthesizer::blocking::Synthesizer,
user_dict::dict::blocking::UserDict, voice_model::blocking::VoiceModelFile,
};

pub mod onnxruntime {
Expand Down
300 changes: 202 additions & 98 deletions crates/voicevox_core/src/synthesizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,22 @@ pub(crate) mod blocking {

const DEFAULT_SAMPLING_RATE: u32 = 24000;

/// Intermediate synthesis product handed to the user.
///
/// Produced by `seekable_synthesis`; arbitrary sub-ranges of it can later be
/// rendered to PCM with `render`.
pub struct Audio {
    /// Audio feature sequence with shape (frame count, feature count).
    pub internal_state: ndarray::Array2<f32>,
    /// Style ID specified at generation time.
    pub style_id: crate::StyleId,
    /// Number of feature frames excluding the workaround padding.
    pub length: usize,
    /// Sampling rate in frames per second; the total duration is
    /// `length / sampling_rate` seconds.
    pub sampling_rate: f32,
    /// Length (in frames) of the padding appended on each side as a
    /// workaround against audible truncation.
    pub padding_length: usize,
    /// The query used at generation time.
    pub audio_query: AudioQuery,
}

/// 音声シンセサイザ。
pub struct Synthesizer<O> {
pub(super) status: Status<crate::blocking::Onnxruntime>,
Expand Down Expand Up @@ -257,13 +273,13 @@ pub(crate) mod blocking {
self.status.metas()
}

/// AudioQueryから音声合成を行う
pub fn synthesis(
/// AudioQueryから音声合成用の中間表現を生成する
pub fn seekable_synthesis(
&self,
audio_query: &AudioQuery,
style_id: StyleId,
options: &SynthesisOptions,
) -> Result<Vec<u8>> {
) -> Result<Audio> {
let AudioQuery {
accent_phrases,
speed_scale,
Expand Down Expand Up @@ -362,14 +378,36 @@ pub(crate) mod blocking {
}
}

let wave = &self.decode(
f0.len(),
OjtPhoneme::num_phoneme(),
&f0,
// 音が途切れてしまうのを避けるworkaround処理が入っている
// TODO: 改善したらここのpadding処理を取り除く
const PADDING_SIZE: f64 = 0.4;
let padding_size =
((PADDING_SIZE * DEFAULT_SAMPLING_RATE as f64) / 256.0).round() as usize;
Yosshi999 marked this conversation as resolved.
Show resolved Hide resolved
let start_and_end_padding_size = 2 * padding_size;
let length_with_padding = f0.len() + start_and_end_padding_size;
let f0_with_padding = make_f0_with_padding(&f0, length_with_padding, padding_size);
let phoneme_with_padding = make_phoneme_with_padding(
phoneme.as_flattened(),
OjtPhoneme::num_phoneme(),
length_with_padding,
padding_size,
);

let spec = self.generate_full_intermediate(
f0_with_padding.len(),
OjtPhoneme::num_phoneme(),
&f0_with_padding,
&phoneme_with_padding,
style_id,
)?;
return Ok(to_wav(wave, audio_query));
return Ok(Audio {
internal_state: spec,
style_id,
length: f0.len(),
sampling_rate: (DEFAULT_SAMPLING_RATE as f32) / 256.0,
padding_length: padding_size,
audio_query: audio_query.clone(),
});

fn adjust_interrogative_accent_phrases(
accent_phrases: &[AccentPhrase],
Expand Down Expand Up @@ -420,24 +458,127 @@ pub(crate) mod blocking {
}
}

fn to_wav(
/// Returns `f0_slice` with `padding_size` zero frames prepended and appended.
///
/// Workaround against the audio getting cut off at the edges; delete this
/// function once the underlying issue is fixed.
fn make_f0_with_padding(
    f0_slice: &[f32],
    length_with_padding: usize,
    padding_size: usize,
) -> Vec<f32> {
    let mut padded = Vec::with_capacity(length_with_padding);
    padded.extend(std::iter::repeat(0.0).take(padding_size));
    padded.extend_from_slice(f0_slice);
    padded.extend(std::iter::repeat(0.0).take(padding_size));
    padded
}

/// Returns the flattened (frame-major) phoneme matrix with `padding_size`
/// padding frames prepended and appended.
///
/// Workaround against the audio getting cut off at the edges; delete this
/// function once the underlying issue is fixed.
fn make_phoneme_with_padding(
    phoneme_slice: &[f32],
    phoneme_size: usize,
    length_with_padding: usize,
    padding_size: usize,
) -> Vec<f32> {
    // Each padding frame is a one-hot vector with index 0 set.
    let mut one_hot_frame = vec![0.0; phoneme_size];
    one_hot_frame[0] = 1.0;

    let mut padded = Vec::with_capacity(phoneme_size * length_with_padding);
    for _ in 0..padding_size {
        padded.extend_from_slice(&one_hot_frame);
    }
    padded.extend_from_slice(phoneme_slice);
    for _ in 0..padding_size {
        padded.extend_from_slice(&one_hot_frame);
    }
    padded
}
}

/// Generates a 16-bit PCM waveform for frames `begin..end` of the
/// intermediate representation.
///
/// `begin` and `end` are clamped to `audio.length`. A safety margin around
/// the requested window is rendered together with it and then cut away, so
/// that rendering in segments does not introduce boundary artifacts.
pub fn render(&self, audio: &Audio, begin: usize, end: usize) -> Result<Vec<u8>> {
    // Safety margin computed from the receptive field of the HifiGAN in use.
    const MARGIN: usize = 14;
    use std::cmp::min;
    // Requested interval in actual frames (i.e. excluding the workaround padding).
    let clipped_begin = min(begin, audio.length);
    let clipped_end = min(end, audio.length);
    // Safety margins, clamped so they never reach outside the stored data.
    let left_margin = min(MARGIN, audio.padding_length + clipped_begin);
    let right_margin = min(MARGIN, audio.padding_length + (audio.length - clipped_end));
    // Interval on the padded data, widened by the safety margins.
    let slice_begin = audio.padding_length + clipped_begin - left_margin;
    let slice_end = audio.padding_length + clipped_end + right_margin;
    let window = audio
        .internal_state
        .slice(ndarray::s![slice_begin..slice_end, ..]);
    let wave_with_margin =
        self.render_audio_segment(window.into_owned(), audio.style_id)?;
    // One feature frame corresponds to 256 output samples; cut the margins off.
    let wave = wave_with_margin
        .slice(ndarray::s![
            left_margin * 256..wave_with_margin.len() - right_margin * 256
        ])
        .into_owned()
        .into_raw_vec();
    return Ok(to_s16le_pcm(&wave, &audio.audio_query));

    // Converts f32 samples to interleaved signed 16-bit little-endian PCM,
    // applying the query's volume scale.
    fn to_s16le_pcm(
        wave: &[f32],
        &AudioQuery {
            volume_scale,
            output_sampling_rate,
            output_stereo,
            ..
        }: &AudioQuery,
    ) -> Vec<u8> {
        let num_channels: u16 = if output_stereo { 2 } else { 1 };
        // NOTE(review): integer division — an `output_sampling_rate` below
        // DEFAULT_SAMPLING_RATE yields `repeat_count == 0` and thus empty
        // output; presumably such rates are rejected elsewhere — confirm.
        let repeat_count: u32 =
            (output_sampling_rate / DEFAULT_SAMPLING_RATE) * num_channels as u32;
        let bytes_size = wave.len() as u32 * repeat_count * 2;
        let buf: Vec<u8> = Vec::with_capacity(bytes_size as usize);
        let mut cur = Cursor::new(buf);

        for value in wave {
            let v = (value * volume_scale).clamp(-1., 1.);
            let data = (v * 0x7fff as f32) as i16;
            // Each sample is repeated to duplicate channels and to upsample
            // to integer multiples of the default rate.
            for _ in 0..repeat_count {
                cur.write_all(&data.to_le_bytes()).unwrap();
            }
        }

        cur.into_inner()
    }
}

/// AudioQueryから直接WAVフォーマットで音声波形を生成する。
pub fn synthesis(
&self,
audio_query: &AudioQuery,
style_id: StyleId,
options: &SynthesisOptions,
) -> Result<Vec<u8>> {
let audio = self.seekable_synthesis(audio_query, style_id, options)?;
let pcm = self.render(&audio, 0, audio.length)?;
return Ok(to_wav(&pcm, &audio_query));

fn to_wav(
pcm: &[u8],
&AudioQuery {
output_sampling_rate,
output_stereo,
..
}: &AudioQuery,
) -> Vec<u8> {
// TODO: 44.1kHzなどの対応

let num_channels: u16 = if output_stereo { 2 } else { 1 };
let bit_depth: u16 = 16;
let repeat_count: u32 =
(output_sampling_rate / DEFAULT_SAMPLING_RATE) * num_channels as u32;
let block_size: u16 = bit_depth * num_channels / 8;

let bytes_size = wave.len() as u32 * repeat_count * 2;
let bytes_size = pcm.len() as u32;
let wave_size = bytes_size + 44;

let buf: Vec<u8> = Vec::with_capacity(wave_size as usize);
Expand All @@ -458,15 +599,7 @@ pub(crate) mod blocking {
cur.write_all(&bit_depth.to_le_bytes()).unwrap();
cur.write_all("data".as_bytes()).unwrap();
cur.write_all(&bytes_size.to_le_bytes()).unwrap();

for value in wave {
let v = (value * volume_scale).clamp(-1., 1.);
let data = (v * 0x7fff as f32) as i16;
for _ in 0..repeat_count {
cur.write_all(&data.to_le_bytes()).unwrap();
}
}

cur.write_all(&pcm).unwrap();
cur.into_inner()
}
}
Expand Down Expand Up @@ -840,6 +973,21 @@ pub(crate) mod blocking {
style_id: StyleId,
) -> Result<Vec<f32>>;

/// Runs the model that produces the full intermediate representation
/// (`spec`, shape (frames, feature dim)) from f0 and flattened phoneme
/// sequences.
fn generate_full_intermediate(
    &self,
    length: usize,
    phoneme_size: usize,
    f0: &[f32],
    phoneme_vector: &[f32],
    style_id: StyleId,
) -> Result<ndarray::Array2<f32>>;

/// Renders a waveform segment from (a slice of) the intermediate
/// representation produced by `generate_full_intermediate`.
fn render_audio_segment(
    &self,
    spec: ndarray::Array2<f32>,
    style_id: StyleId,
) -> Result<ndarray::Array1<f32>>;

/// `decode`を実行する。
///
/// # Performance
Expand Down Expand Up @@ -911,102 +1059,58 @@ pub(crate) mod blocking {
Ok(output.into_raw_vec())
}

fn decode(
fn generate_full_intermediate(
&self,
length: usize,
phoneme_size: usize,
f0: &[f32],
phoneme_vector: &[f32],
style_id: StyleId,
) -> Result<Vec<f32>> {
) -> Result<ndarray::Array2<f32>> {
let (model_id, inner_voice_id) = self.status.ids_for::<TalkDomain>(style_id)?;

// 音が途切れてしまうのを避けるworkaround処理が入っている
// TODO: 改善したらここのpadding処理を取り除く
const PADDING_SIZE: f64 = 0.4;
let padding_size =
((PADDING_SIZE * DEFAULT_SAMPLING_RATE as f64) / 256.0).round() as usize;
let start_and_end_padding_size = 2 * padding_size;
let length_with_padding = length + start_and_end_padding_size;
let f0_with_padding = make_f0_with_padding(f0, length_with_padding, padding_size);

let phoneme_with_padding = make_phoneme_with_padding(
phoneme_vector,
phoneme_size,
length_with_padding,
padding_size,
);

let GenerateFullIntermediateOutput { spec } = self.status.run_session(
Yosshi999 marked this conversation as resolved.
Show resolved Hide resolved
model_id,
GenerateFullIntermediateInput {
f0: ndarray::arr1(&f0_with_padding)
.into_shape([length_with_padding, 1])
.unwrap(),
phoneme: ndarray::arr1(&phoneme_with_padding)
.into_shape([length_with_padding, phoneme_size])
f0: ndarray::arr1(&f0).into_shape([length, 1]).unwrap(),
phoneme: ndarray::arr1(&phoneme_vector)
.into_shape([length, phoneme_size])
.unwrap(),
speaker_id: ndarray::arr1(&[inner_voice_id.raw_id().into()]),
},
)?;
Ok(spec)
}

let RenderAudioSegmentOutput { wave: output } = self
fn render_audio_segment(
&self,
spec: ndarray::Array2<f32>,
style_id: StyleId,
) -> Result<ndarray::Array1<f32>> {
let (model_id, _inner_voice_id) = self.status.ids_for::<TalkDomain>(style_id)?;
let RenderAudioSegmentOutput { wave } = self
.status
.run_session(model_id, RenderAudioSegmentInput { spec })?;
Ok(wave)
}

return Ok(trim_padding_from_output(
output.into_raw_vec(),
padding_size,
));

// Returns `f0_slice` with `padding_size` zero frames on each side.
fn make_f0_with_padding(
    f0_slice: &[f32],
    length_with_padding: usize,
    padding_size: usize,
) -> Vec<f32> {
    // Workaround against the audio getting cut off at the edges.
    // Delete this function once the underlying issue is fixed.
    let mut f0_with_padding = Vec::with_capacity(length_with_padding);
    let padding = vec![0.0; padding_size];
    f0_with_padding.extend_from_slice(&padding);
    f0_with_padding.extend_from_slice(f0_slice);
    f0_with_padding.extend_from_slice(&padding);
    f0_with_padding
}

// Returns the flattened phoneme matrix with `padding_size` one-hot
// (index 0) padding frames on each side.
fn make_phoneme_with_padding(
    phoneme_slice: &[f32],
    phoneme_size: usize,
    length_with_padding: usize,
    padding_size: usize,
) -> Vec<f32> {
    // Workaround against the audio getting cut off at the edges.
    // Delete this function once the underlying issue is fixed.
    let mut padding_phoneme = vec![0.0; phoneme_size];
    padding_phoneme[0] = 1.0;
    let padding_phoneme_len = padding_phoneme.len();
    // Repeat the one-hot frame `padding_size` times.
    let padding_phonemes: Vec<f32> = padding_phoneme
        .into_iter()
        .cycle()
        .take(padding_phoneme_len * padding_size)
        .collect();
    let mut phoneme_with_padding =
        Vec::with_capacity(phoneme_size * length_with_padding);
    phoneme_with_padding.extend_from_slice(&padding_phonemes);
    phoneme_with_padding.extend_from_slice(phoneme_slice);
    phoneme_with_padding.extend_from_slice(&padding_phonemes);

    phoneme_with_padding
}

/// Removes `padding_f0_size` f0-frames' worth of samples (256 samples per
/// frame) from both ends of the rendered waveform.
///
/// Part of the anti-truncation workaround; delete this function once the
/// underlying issue is fixed. Panics if `output` is shorter than twice the
/// padding (same as the previous `drain`-based version).
fn trim_padding_from_output(output: Vec<f32>, padding_f0_size: usize) -> Vec<f32> {
    let padding_sampling_size = padding_f0_size * 256;
    // Copy just the un-padded middle. The previous `drain(..).collect()`
    // also shifted the two leftover padding halves together inside the
    // source vector before dropping it — pure wasted work.
    output[padding_sampling_size..output.len() - padding_sampling_size].to_vec()
}
fn decode(
Yosshi999 marked this conversation as resolved.
Show resolved Hide resolved
&self,
length: usize,
phoneme_size: usize,
f0: &[f32],
phoneme_vector: &[f32],
style_id: StyleId,
) -> Result<Vec<f32>> {
let intermediate = self.generate_full_intermediate(
length,
phoneme_size,
&f0,
&phoneme_vector,
style_id,
)?;
let output = self.render_audio_segment(intermediate, style_id)?;
return Ok(output.into_raw_vec());
}
}

Expand Down
Loading
Loading