From 9af4963892f36a6b9022d33f204aec12f7a4ffd3 Mon Sep 17 00:00:00 2001
From: Patchethium <74458240+Patchethium@users.noreply.github.com>
Date: Mon, 11 Apr 2022 03:00:08 +0800
Subject: [PATCH] Guided synthesis - API Improvement (#376)

* forced alignment, f0 extraction and entry point
* kind of finished
* change julius4seg, doesn't seem to help
* run pysen format
* add speaker id to api
* run pysen format
* add accent_phrase api, finish
* add request parameter
* improve error handling
* run pysen format
* add parameters
* run pysen format
* a little boundary check
* add normalization for different WAV format
* run format
* run format
* move synthesis and accent phrase to synthesis engine
* add test for mock
* change url for apis
* simplify
* error type
* do something
* do something
* run format
* resolve conflict
* add usage to README
* add comments and experimental flag for guided api
* add guided info to AudioQuery model
* improve api definition
* run format, update README
* add error handling for wrong audio formats, edit README
* reserve unvoiced mora, add response type
* remove 422 error, move boundary check
* Update voicevox_engine/synthesis_engine/synthesis_engine.py

Co-authored-by: Hiroshiba

* move guided info to the outside of query
* run fmt
* update README
* fix README

Co-authored-by: Hiroshiba
---
 README.md                                     | 69 +++++++++-------
 run.py                                        | 78 ++++++-------
 voicevox_engine/dev/synthesis_engine/mock.py  | 18 ++---
 .../experimental/guided_extractor.py          | 13 +++-
 .../synthesis_engine/synthesis_engine.py      | 35 +++++----
 .../synthesis_engine/synthesis_engine_base.py | 12 +--
 6 files changed, 109 insertions(+), 116 deletions(-)

diff --git a/README.md b/README.md
index ebf8436a8..8ca584fe6 100644
--- a/README.md
+++ b/README.md
@@ -245,34 +245,49 @@ curl -s \
 ```
 
 ### Guided Synthesis
-Currently, we have two apis which accept an uploaded audio file and return corresponding synthesis information.
-Both of them recommend setting `is_kana` to be `true` and use `kana` section from `AudioQuery` for the best performance.
-You can also get the kana text in AquesTalk section.
+Currently, we have two APIs that reference an external audio source: `guided_synthesis`, which generates audio, and `guided_accent_phrases`, which generates a list of AccentPhrase.
+Note that unlike `guided_accent_phrases`, `guided_synthesis` works at frame resolution; as a result, the two are not compatible with each other.
+**The external audio should be in WAV format.**
 ```bash
-# Returns an audio file which is synthesised referencing uploaded audio
-# this example needs a recording whose content is
-# "また 東寺のように 五大明王と 呼ばれる 主要な 明王の 中央に 配されることも多い"
-
-curl -L -X POST 'localhost:50021/guided_synthesis' \
-    -F 'kana="マ'\''タ、ト'\''オジノヨオニ、ゴダイミョオオ'\''オト、ヨ'\''/バレ'\''ル、シュ'\''ヨオナ、ミョオ'\''オオ/ノ'\''、チュ'\''ウオオニ、ハイサレルコ'\''/トモ'\''オオイ"' \
-    -F 'speaker_id="5"' \
-    -F 'audio_file=@"/full_path_to_your_recording"' \
-    -F 'normalize="true"' \
-    -F 'stereo="true"' \
-    -F 'sample_rate="24000"' \
-    -F 'volume_scale="1"' \
-    -F 'pitch_scale="0"' \
-    -F 'speed_scale="1"'
-
-# Returns a list of AccentPhrases
-
-curl -L -X POST 'localhost:50021/guided_accent_phrase' \
-    -F 'text="マ'\''タ、ト'\''オジノヨオニ、ゴダイミョオオ'\''オト、ヨ'\''/バレ'\''ル、シュ'\''ヨオナ、ミョオ'\''オオ/ノ'\''、チュ'\''ウオオニ、ハイサレルコ'\''/トモ'\''オオイ"' \
-    -F 'speaker="5"' \
-    -F 'audio_file=@"/full_path_to_your_recording"' \
-    -F 'normalize="true"' \
-    -F 'is_kana="true"' \
-    -F 'enable_interrogative="false"'
+# guided_synthesis returns an audio file which is synthesised referencing the external audio source
+
+echo -n "また 東寺のように 五大明王と 呼ばれる 主要な 明王の 中央に 配されることも多い" > text.txt
+
+curl -s \
+    -X POST \
+    "localhost:50021/audio_query?speaker=1" \
+    --get --data-urlencode text@text.txt \
+    > query.json
+
+# if true, the average of f0 will be normalized to the predicted average
+normalize="true"
+# full path to your audio recording
+audio_path="/home/.../sample.wav"
+
+curl -s \
+    -H "Content-Type: application/json" \
+    -X POST \
+    -d @query.json \
+    "localhost:50021/guided_synthesis?speaker=1&normalize=$normalize&audio_path=$audio_path" \
+    > audio.wav
+
+# guided_accent_phrases returns a list of AccentPhrases
+curl -s \
+    -H "Content-Type: application/json" \
+    -X POST \
+    -d @query.json \
+    "localhost:50021/guided_accent_phrases?speaker=1&normalize=$normalize&audio_path=$audio_path" \
+    > newphrases.json
+
+# replace the accent_phrases section in query
+cat query.json | sed -e "s/\[{.*}\]/$(cat newphrases.json)/g" > newquery.json
+
+curl -s \
+    -H "Content-Type: application/json" \
+    -X POST \
+    -d @newquery.json \
+    "localhost:50021/synthesis?speaker=1" \
+    > audio.wav
 ```
 
 ### 話者の追加情報を取得するサンプルコード
diff --git a/run.py b/run.py
index a4eb691a4..ccdd4373a 100644
--- a/run.py
+++ b/run.py
@@ -17,7 +17,7 @@
 import soundfile
 import uvicorn
-from fastapi import FastAPI, File, Form, HTTPException, Request, Response, UploadFile
+from fastapi import FastAPI, HTTPException, Request, Response
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.params import Query
 from pydantic import ValidationError, conint
@@ -219,17 +219,16 @@ def accent_phrases(
         return engine.create_accent_phrases(text, speaker_id=speaker)
 
     @app.post(
-        "/guided_accent_phrase",
+        "/guided_accent_phrases",
         response_model=List[AccentPhrase],
         tags=["クエリ編集"],
         summary="Create Accent Phrase from External Audio",
     )
-    def guided_accent_phrase(
-        text: str = Form(...),  # noqa:B008
-        speaker: int = Form(...),  # noqa:B008
-        is_kana: bool = Form(...),  # noqa:B008
-        audio_file: UploadFile = File(...),  # noqa: B008
-        normalize: bool = Form(...),  # noqa:B008
+    def guided_accent_phrases(
+        query: AudioQuery,
+        speaker: int,
+        audio_path: str,
+        normalize: bool,
         core_version: Optional[str] = None,
     ):
         """
@@ -243,32 +242,13 @@ def guided_accent_phrase(
                 detail="実験的機能はデフォルトで無効になっています。使用するには引数を指定してください。",
             )
         engine = get_engine(core_version)
-        if is_kana:
-            try:
-                accent_phrases = parse_kana(text)
-            except ParseKanaError as err:
-                raise HTTPException(
status_code=400, - detail=ParseKanaBadRequest(err).dict(), - ) - else: - accent_phrases = engine.create_accent_phrases( - text, - speaker_id=speaker, - ) - try: return engine.guided_accent_phrases( - accent_phrases=accent_phrases, + query=query, speaker=speaker, - audio_file=audio_file.file, + audio_path=audio_path, normalize=normalize, ) - except ParseKanaError as err: - raise HTTPException( - status_code=422, - detail=ParseKanaBadRequest(err).dict(), - ) except StopIteration: print(traceback.format_exc()) raise HTTPException( @@ -505,6 +485,7 @@ def _synthesis_morphing( @app.post( "/guided_synthesis", + response_class=FileResponse, responses={ 200: { "content": { @@ -516,15 +497,10 @@ def _synthesis_morphing( summary="Audio synthesis guided by external audio and phonemes", ) def guided_synthesis( - kana: str = Form(...), # noqa: B008 - speaker_id: int = Form(...), # noqa: B008 - normalize: bool = Form(...), # noqa: B008 - audio_file: UploadFile = File(...), # noqa: B008 - stereo: bool = Form(...), # noqa: B008 - sample_rate: int = Form(...), # noqa: B008 - volume_scale: float = Form(...), # noqa: B008 - pitch_scale: float = Form(...), # noqa: B008 - speed_scale: float = Form(...), # noqa: B008 + query: AudioQuery, + speaker: int, + audio_path: str, + normalize: bool, core_version: Optional[str] = None, ): """ @@ -539,28 +515,17 @@ def guided_synthesis( ) engine = get_engine(core_version) try: - accent_phrases = parse_kana(kana) - query = AudioQuery( - accent_phrases=accent_phrases, - speedScale=speed_scale, - pitchScale=pitch_scale, - intonationScale=1, - volumeScale=volume_scale, - prePhonemeLength=0.1, - postPhonemeLength=0.1, - outputSamplingRate=sample_rate, - outputStereo=stereo, - kana=kana, - ) wave = engine.guided_synthesis( - audio_file=audio_file.file, query=query, - speaker=speaker_id, + speaker=speaker, + audio_path=audio_path, normalize=normalize, ) with NamedTemporaryFile(delete=False) as f: - soundfile.write(file=f, data=wave, samplerate=sample_rate, format="WAV") + soundfile.write( + file=f, data=wave, samplerate=query.outputSamplingRate, format="WAV" + ) return FileResponse(f.name, media_type="audio/wav") except ParseKanaError as err: @@ -581,6 +546,11 @@ def guided_synthesis( status_code=500, detail="Failed in Forced Alignment.", ) + elif str(e) == "Wrong Audio Encoding Format": + raise HTTPException( + status_code=500, + detail=str(e), + ) else: raise HTTPException( status_code=500, diff --git a/voicevox_engine/dev/synthesis_engine/mock.py b/voicevox_engine/dev/synthesis_engine/mock.py index 2c026f8d3..6c7637026 100644 --- a/voicevox_engine/dev/synthesis_engine/mock.py +++ b/voicevox_engine/dev/synthesis_engine/mock.py @@ -1,6 +1,5 @@ from logging import getLogger from typing import Any, Dict, List, Optional -from typing.io import IO import numpy as np from pyopenjtalk import tts @@ -140,8 +139,9 @@ def guided_synthesis( self, query: AudioQuery, speaker: int, - audio_file: IO, - normalize: int, + audio_path: str, + normalize: bool, + core_version: Optional[str] = None, ) -> np.ndarray: """ Open jtalk doesn't have a guided function [Mock] @@ -151,7 +151,7 @@ def guided_synthesis( ---------- query speaker - audio_file + audio_path normalize Returns @@ -162,10 +162,10 @@ def guided_synthesis( def guided_accent_phrases( self, - accent_phrases: List[AccentPhrase], + query: AudioQuery, speaker: int, - audio_file: IO, - normalize: int, + audio_path: str, + normalize: bool, ) -> List[AccentPhrase]: """ guided_accent_phrases 入力accent_phrasesを変更せずにそのまま返します [Mock] @@ -174,11 
+174,11 @@ def guided_accent_phrases(
         ----------
         query
         speaker
-        audio_file
+        audio_path
         normalize
 
         Returns
         -------
         """
-        return accent_phrases
+        return query.accent_phrases
diff --git a/voicevox_engine/experimental/guided_extractor.py b/voicevox_engine/experimental/guided_extractor.py
index 8e4c280d2..9ca7c2769 100644
--- a/voicevox_engine/experimental/guided_extractor.py
+++ b/voicevox_engine/experimental/guided_extractor.py
@@ -3,7 +3,6 @@
 import tarfile
 from os.path import exists
 from pathlib import PurePath
-from typing.io import IO
 from urllib.request import urlretrieve
 
 import numpy as np
@@ -93,9 +92,12 @@ def _no_nan(num):
     return 0.0 if np.isnan(num) else num
 
 
-def extract_guided_feature(audio_file: IO, kana: str):
+def extract_guided_feature(audio_file: str, kana: str):
     _lazy_init()
-    sr, wave = wavfile.read(audio_file)
+    try:
+        sr, wave = wavfile.read(audio_file)
+    except ValueError:
+        raise Exception("Wrong Audio Encoding Format")
     # stereo to mono
     if len(wave.shape) == 2:
         wave = wave.sum(axis=1) / 2
@@ -120,6 +122,11 @@ def extract_guided_feature(audio_file: IO, kana: str):
     )
 
     phones = forced_align(julius_wave, julius_kana)
+
+    # rough boundary check: clamp nonzero f0 into a plausible range
+    f0[f0 > 6.5] = 6.5
+    f0[(0 < f0) & (f0 < 3)] = 3.0
+
     return f0, phones
 
diff --git a/voicevox_engine/synthesis_engine/synthesis_engine.py b/voicevox_engine/synthesis_engine/synthesis_engine.py
index fa54e387a..895eae048 100644
--- a/voicevox_engine/synthesis_engine/synthesis_engine.py
+++ b/voicevox_engine/synthesis_engine/synthesis_engine.py
@@ -1,7 +1,6 @@
 from copy import deepcopy
 from itertools import chain
 from typing import List, Optional, Tuple
-from typing.io import IO
 
 import numpy
 from scipy.signal import resample
@@ -492,10 +491,12 @@ def guided_synthesis(
         self,
         query: AudioQuery,
         speaker: int,
-        audio_file: IO,
-        normalize: int,
+        audio_path: str,
+        normalize: bool,
+        core_version: Optional[str] = None,
     ):
-        f0, phonemes = extract_guided_feature(audio_file, query.kana)
+        kana = create_kana(query.accent_phrases)
+        f0, phonemes = extract_guided_feature(audio_path, kana)
 
         phone_list = numpy.zeros((len(f0), OjtPhoneme.num_phoneme), dtype=numpy.float32)
 
@@ -516,13 +517,9 @@
             phone_list[s - 1 : e] = OjtPhoneme(start=s, end=e, phoneme=p).onehot
 
         if normalize:
-            f0 += get_normalize_diff(
-                engine=self, kana=query.kana, f0=f0, speaker_id=speaker
-            )
+            f0 += get_normalize_diff(engine=self, kana=kana, f0=f0, speaker_id=speaker)
 
         f0 *= 2 ** query.pitchScale
-        f0[f0 > 6.5] = 6.5
-        f0[(0 < f0) & (f0 < 3)] = 3.0
 
         f0 = resample(f0, int(len(f0) / query.speedScale))
         phone_list = resample(phone_list, int(len(phone_list) / query.speedScale))
@@ -551,13 +548,13 @@ def guided_synthesis(
 
     def guided_accent_phrases(
         self,
-        accent_phrases: List[AccentPhrase],
+        query: AudioQuery,
         speaker: int,
-        audio_file: IO,
-        normalize: int,
+        audio_path: str,
+        normalize: bool,
     ) -> List[AccentPhrase]:
-        kana = create_kana(accent_phrases=accent_phrases)
-        f0, phonemes = extract_guided_feature(audio_file, kana)
+        kana = create_kana(query.accent_phrases)
+        f0, phonemes = extract_guided_feature(audio_path, kana)
         timed_phonemes = frame_to_second(deepcopy(phonemes))
 
         phrase_info = []
@@ -577,10 +574,12 @@
                 engine=self, kana=kana, f0=f0, speaker_id=speaker
             )
             for p in phrase_info:
-                p.pitch += normalize_diff
+                if p.pitch != 0:
+                    p.pitch += normalize_diff
 
         idx = 1
-        for phrase in accent_phrases:
+        for phrase in query.accent_phrases:
+            phrase.pause_mora = None
             for mora in phrase.moras:
                 if mora.consonant is not None:
                    mora.pitch = 
( @@ -593,6 +592,8 @@ def guided_accent_phrases( mora.pitch = phrase_info[idx].pitch mora.vowel_length = phrase_info[idx].length idx += 1 + if mora.vowel in unvoiced_mora_phoneme_list: + mora.pitch = 0 if phrase_info[idx].phoneme == "sp": phrase.pause_mora = Mora( text="、", @@ -604,4 +605,4 @@ def guided_accent_phrases( ) idx += 1 - return accent_phrases + return query.accent_phrases diff --git a/voicevox_engine/synthesis_engine/synthesis_engine_base.py b/voicevox_engine/synthesis_engine/synthesis_engine_base.py index c84a213fe..485b93e81 100644 --- a/voicevox_engine/synthesis_engine/synthesis_engine_base.py +++ b/voicevox_engine/synthesis_engine/synthesis_engine_base.py @@ -1,7 +1,6 @@ import copy from abc import ABCMeta, abstractmethod from typing import List, Optional -from typing.io import IO from .. import full_context_label from ..full_context_label import extract_full_context_label @@ -231,17 +230,18 @@ def guided_synthesis( self, query: AudioQuery, speaker: int, - audio_file: IO, - normalize: int, + audio_path: str, + normalize: bool, + core_version: Optional[str] = None, ): raise NotImplementedError() @abstractmethod def guided_accent_phrases( self, - accent_phrases: List[AccentPhrase], + query: AudioQuery, speaker: int, - audio_file: IO, - normalize: int, + audio_path: str, + normalize: bool, ) -> List[AccentPhrase]: raise NotImplementedError()
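For reference, the curl walkthrough in the README section above can also be driven from Python. The sketch below is a minimal illustration, not part of this patch: it assumes an engine running at `localhost:50021` with the experimental guided APIs enabled (they answer 404 by default, per the handler in `run.py`), and `/path/to/sample.wav` is a hypothetical placeholder for a WAV recording of the example sentence.

```python
# Minimal sketch of the two guided flows over the new JSON APIs.
# Assumptions (not from this patch): engine at localhost:50021 with the
# experimental guided APIs enabled; /path/to/sample.wav is a hypothetical
# WAV recording of TEXT.
import requests

BASE = "http://localhost:50021"
TEXT = "また 東寺のように 五大明王と 呼ばれる 主要な 明王の 中央に 配されることも多い"
SPEAKER = 1
GUIDED = {"speaker": SPEAKER, "normalize": "true", "audio_path": "/path/to/sample.wav"}

# Build an AudioQuery from plain text.
query = requests.post(
    f"{BASE}/audio_query", params={"text": TEXT, "speaker": SPEAKER}
).json()

# Flow 1, frame resolution: guided_synthesis renders audio directly,
# taking timing and f0 from the external recording.
wav = requests.post(f"{BASE}/guided_synthesis", params=GUIDED, json=query).content
with open("guided_frames.wav", "wb") as f:
    f.write(wav)

# Flow 2, phrase resolution: guided_accent_phrases re-estimates the accent
# phrases from the recording; splice them into the query, then synthesize
# with the ordinary /synthesis endpoint.
query["accent_phrases"] = requests.post(
    f"{BASE}/guided_accent_phrases", params=GUIDED, json=query
).json()
wav = requests.post(f"{BASE}/synthesis", params={"speaker": SPEAKER}, json=query).content
with open("guided_phrases.wav", "wb") as f:
    f.write(wav)
```

As the README notes, the two flows operate at different resolutions (frames vs. accent phrases), so their outputs are not interchangeable.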