Guided synthesis - API Improvement (#376)
* forced alignment, f0 extraction and entry point

* kind of finished

* change julius4seg, doesn't seem to help

* run pysen format

* add speaker id to api

* run pysen format

* add accent_phrase api, finish

* add request parameter

* improve error handling

* run pysen format

* add parameters

* run pysen format

* a little boundary check

* add normalization for different WAV format

* run format

* run format

* move synthesis and accent phrase to synthesis engine

* add test for mock

* change url for apis

* simplify

* error type

* do something

* do something

* run format

* resolve conflict

* add usage to README

* add comments and experimental flag for guided api

* add guided info to AudioQuery model

* improve api definition

* run format, update README

* add error handling for wrong audio formats, edit README

* reserve unvoiced mora, add response type

* remove 422 error, move boundary check

* Update voicevox_engine/synthesis_engine/synthesis_engine.py

Co-authored-by: Hiroshiba <[email protected]>

* move guided info to the outside of query

* run fmt

* update README

* fix README

Co-authored-by: Hiroshiba <[email protected]>
Patchethium and Hiroshiba authored Apr 10, 2022
1 parent 0751917 commit 9af4963
Showing 6 changed files with 109 additions and 116 deletions.
69 changes: 42 additions & 27 deletions README.md
@@ -245,34 +245,49 @@ curl -s \
```

### Guided Synthesis
Currently, we have two apis which accept an uploaded audio file and return corresponding synthesis information.
Both of them recommend setting `is_kana` to be `true` and use `kana` section from `AudioQuery` for the best performance.
You can also get the kana text in AquesTalk section.
Currently, we have two APIs that reference an external audio source: `guided_synthesis` generates audio, and `guided_accent_phrases` generates a list of AccentPhrases.
It's worth noting that, unlike `guided_accent_phrases`, `guided_synthesis` works at frame resolution; as a result, the two are not compatible with each other.
**The external audio must be in WAV format.**
```bash
# Returns an audio file which is synthesised referencing uploaded audio
# this example needs a recording whose content is
# "また 東寺のように 五大明王と 呼ばれる 主要な 明王の 中央に 配されることも多い"

curl -L -X POST 'localhost:50021/guided_synthesis' \
-F 'kana="マ'\''タ、ト'\''オジノヨオニ、ゴダイミョオオ'\''オト、ヨ'\''/バレ'\''ル、シュ'\''ヨオナ、ミョオ'\''オオ/ノ'\''、チュ'\''ウオオニ、ハイサレルコ'\''/トモ'\''オオイ"' \
-F 'speaker_id="5"' \
-F 'audio_file=@"/full_path_to_your_recording"' \
-F 'normalize="true"' \
-F 'stereo="true"' \
-F 'sample_rate="24000"' \
-F 'volume_scale="1"' \
-F 'pitch_scale="0"' \
-F 'speed_scale="1"'

# Returns a list of AccentPhrases

curl -L -X POST 'localhost:50021/guided_accent_phrase' \
-F 'text="マ'\''タ、ト'\''オジノヨオニ、ゴダイミョオオ'\''オト、ヨ'\''/バレ'\''ル、シュ'\''ヨオナ、ミョオ'\''オオ/ノ'\''、チュ'\''ウオオニ、ハイサレルコ'\''/トモ'\''オオイ"' \
-F 'speaker="5"' \
-F 'audio_file=@"/full_path_to_your_recording"' \
-F 'normalize="true"' \
-F 'is_kana="true"' \
-F 'enable_interrogative="false"'
# guided_synthesis returns an audio file which is synthesised referencing the external audio source

echo -n "また 東寺のように 五大明王と 呼ばれる 主要な 明王の 中央に 配されることも多い" > text.txt

curl -s \
-X POST \
"localhost:50021/audio_query?speaker=1" \
--get --data-urlencode text@text.txt \
> query.json

# if true, the average of f0 will be normalized to the predicted average
normalize="true"
# full path to your audio record
audio_path="/home/.../sample.wav"

curl -s \
-H "Content-Type: application/json" \
-X POST \
-d @query.json \
"localhost:50021/guided_synthesis?speaker=1&normalize=$normalize&audio_path=$audio_path" \
> audio.wav

# guided_accent_phrases returns a list of AccentPhrases
curl -s \
-H "Content-Type: application/json" \
-X POST \
-d @query.json \
"http://localhost:50021/guided_accent_phrases?speaker=0&normalize=$normalize&audio_path=$audio_path" \
> newphrases.json

# replace the accent_phrases section in query
cat query.json | sed -e "s/\[{.*}\]/$(cat newphrases.json)/g" > newquery.json
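# alternatively, a less fragile merge via python3 (a sketch, assuming python3 is on PATH):
# python3 -c 'import json; q = json.load(open("query.json")); q["accent_phrases"] = json.load(open("newphrases.json")); json.dump(q, open("newquery.json", "w"), ensure_ascii=False)'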

curl -s \
-H "Content-Type: application/json" \
-X POST \
-d @newquery.json \
"localhost:50021/synthesis?speaker=1" \
> audio.wav
```
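
For reference, here is the same flow from Python: a minimal sketch, assuming a local engine on `localhost:50021`, a server-readable recording at `/path/to/sample.wav` (a placeholder path), and the third-party `requests` package.

```python
# Minimal client sketch (assumptions: engine on localhost:50021, a recording at
# the placeholder path below, and `requests` installed; not an official client).
import requests

BASE = "http://localhost:50021"
SPEAKER = 1
PARAMS = {
    "speaker": SPEAKER,
    "normalize": "true",  # normalize the average f0 to the predicted average
    "audio_path": "/path/to/sample.wav",  # placeholder: full path to your recording
}
TEXT = "また 東寺のように 五大明王と 呼ばれる 主要な 明王の 中央に 配されることも多い"

# 1. Build an AudioQuery from plain text.
query = requests.post(
    f"{BASE}/audio_query", params={"speaker": SPEAKER, "text": TEXT}
).json()

# 2a. Frame-level guided synthesis directly from the query ...
wav = requests.post(f"{BASE}/guided_synthesis", params=PARAMS, json=query).content
with open("audio.wav", "wb") as f:
    f.write(wav)

# 2b. ... or re-estimate accent phrases from the recording, then synthesize
# with the ordinary /synthesis endpoint.
query["accent_phrases"] = requests.post(
    f"{BASE}/guided_accent_phrases", params=PARAMS, json=query
).json()
wav = requests.post(
    f"{BASE}/synthesis", params={"speaker": SPEAKER}, json=query
).content
with open("audio_from_phrases.wav", "wb") as f:
    f.write(wav)
```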

### 話者の追加情報を取得するサンプルコード
78 changes: 24 additions & 54 deletions run.py
Expand Up @@ -17,7 +17,7 @@

import soundfile
import uvicorn
from fastapi import FastAPI, File, Form, HTTPException, Request, Response, UploadFile
from fastapi import FastAPI, HTTPException, Request, Response
from fastapi.middleware.cors import CORSMiddleware
from fastapi.params import Query
from pydantic import ValidationError, conint
@@ -219,17 +219,16 @@ def accent_phrases(
return engine.create_accent_phrases(text, speaker_id=speaker)

@app.post(
"/guided_accent_phrase",
"/guided_accent_phrases",
response_model=List[AccentPhrase],
tags=["クエリ編集"],
summary="Create Accent Phrase from External Audio",
)
def guided_accent_phrase(
text: str = Form(...), # noqa:B008
speaker: int = Form(...), # noqa:B008
is_kana: bool = Form(...), # noqa:B008
audio_file: UploadFile = File(...), # noqa: B008
normalize: bool = Form(...), # noqa:B008
def guided_accent_phrases(
query: AudioQuery,
speaker: int,
audio_path: str,
normalize: bool,
core_version: Optional[str] = None,
):
"""
@@ -243,32 +242,13 @@ def guided_accent_phrase(
detail="実験的機能はデフォルトで無効になっています。使用するには引数を指定してください。",
)
engine = get_engine(core_version)
if is_kana:
try:
accent_phrases = parse_kana(text)
except ParseKanaError as err:
raise HTTPException(
status_code=400,
detail=ParseKanaBadRequest(err).dict(),
)
else:
accent_phrases = engine.create_accent_phrases(
text,
speaker_id=speaker,
)

try:
return engine.guided_accent_phrases(
accent_phrases=accent_phrases,
query=query,
speaker=speaker,
audio_file=audio_file.file,
audio_path=audio_path,
normalize=normalize,
)
except ParseKanaError as err:
raise HTTPException(
status_code=422,
detail=ParseKanaBadRequest(err).dict(),
)
except StopIteration:
print(traceback.format_exc())
raise HTTPException(
@@ -505,6 +485,7 @@ def _synthesis_morphing(

@app.post(
"/guided_synthesis",
response_class=FileResponse,
responses={
200: {
"content": {
@@ -516,15 +497,10 @@ def _synthesis_morphing(
summary="Audio synthesis guided by external audio and phonemes",
)
def guided_synthesis(
kana: str = Form(...), # noqa: B008
speaker_id: int = Form(...), # noqa: B008
normalize: bool = Form(...), # noqa: B008
audio_file: UploadFile = File(...), # noqa: B008
stereo: bool = Form(...), # noqa: B008
sample_rate: int = Form(...), # noqa: B008
volume_scale: float = Form(...), # noqa: B008
pitch_scale: float = Form(...), # noqa: B008
speed_scale: float = Form(...), # noqa: B008
query: AudioQuery,
speaker: int,
audio_path: str,
normalize: bool,
core_version: Optional[str] = None,
):
"""
@@ -539,28 +515,17 @@
)
engine = get_engine(core_version)
try:
accent_phrases = parse_kana(kana)
query = AudioQuery(
accent_phrases=accent_phrases,
speedScale=speed_scale,
pitchScale=pitch_scale,
intonationScale=1,
volumeScale=volume_scale,
prePhonemeLength=0.1,
postPhonemeLength=0.1,
outputSamplingRate=sample_rate,
outputStereo=stereo,
kana=kana,
)
wave = engine.guided_synthesis(
audio_file=audio_file.file,
query=query,
speaker=speaker_id,
speaker=speaker,
audio_path=audio_path,
normalize=normalize,
)

with NamedTemporaryFile(delete=False) as f:
soundfile.write(file=f, data=wave, samplerate=sample_rate, format="WAV")
soundfile.write(
file=f, data=wave, samplerate=query.outputSamplingRate, format="WAV"
)

return FileResponse(f.name, media_type="audio/wav")
except ParseKanaError as err:
@@ -581,6 +546,11 @@ def guided_synthesis(
status_code=500,
detail="Failed in Forced Alignment.",
)
elif str(e) == "Wrong Audio Encoding Format":
raise HTTPException(
status_code=500,
detail=str(e),
)
else:
raise HTTPException(
status_code=500,
18 changes: 9 additions & 9 deletions voicevox_engine/dev/synthesis_engine/mock.py
@@ -1,6 +1,5 @@
from logging import getLogger
from typing import Any, Dict, List, Optional
from typing.io import IO

import numpy as np
from pyopenjtalk import tts
@@ -140,8 +139,9 @@ def guided_synthesis(
self,
query: AudioQuery,
speaker: int,
audio_file: IO,
normalize: int,
audio_path: str,
normalize: bool,
core_version: Optional[str] = None,
) -> np.ndarray:
"""
Open jtalk doesn't have a guided function [Mock]
@@ -151,7 +151,7 @@
----------
query
speaker
audio_file
audio_path
normalize
Returns
@@ -162,10 +162,10 @@

def guided_accent_phrases(
self,
accent_phrases: List[AccentPhrase],
query: AudioQuery,
speaker: int,
audio_file: IO,
normalize: int,
audio_path: str,
normalize: bool,
) -> List[AccentPhrase]:
"""
guided_accent_phrases 入力accent_phrasesを変更せずにそのまま返します [Mock]
@@ -174,11 +174,11 @@
----------
query
speaker
audio_file
audio_path
normalize
Returns
-------
"""
return accent_phrases
return query.accent_phrases
13 changes: 10 additions & 3 deletions voicevox_engine/experimental/guided_extractor.py
@@ -3,7 +3,6 @@
import tarfile
from os.path import exists
from pathlib import PurePath
from typing.io import IO
from urllib.request import urlretrieve

import numpy as np
@@ -93,9 +92,12 @@ def _no_nan(num):
return 0.0 if np.isnan(num) else num


def extract_guided_feature(audio_file: IO, kana: str):
def extract_guided_feature(audio_file: str, kana: str):
_lazy_init()
sr, wave = wavfile.read(audio_file)
try:
sr, wave = wavfile.read(audio_file)
except ValueError:
raise Exception("Wrong Audio Encoding Format")
# stereo to mono
if len(wave.shape) == 2:
wave = wave.sum(axis=1) / 2
@@ -120,6 +122,11 @@ def extract_guided_feature(audio_file: IO, kana: str):
)

phones = forced_align(julius_wave, julius_kana)

# rough boundary check: clamp log-f0 into a plausible range
f0[f0 > 6.5] = 6.5
f0[(0 < f0) & (f0 < 3)] = 3.0

return f0, phones


35 changes: 18 additions & 17 deletions voicevox_engine/synthesis_engine/synthesis_engine.py
@@ -1,7 +1,6 @@
from copy import deepcopy
from itertools import chain
from typing import List, Optional, Tuple
from typing.io import IO

import numpy
from scipy.signal import resample
@@ -492,10 +491,12 @@ def guided_synthesis(
self,
query: AudioQuery,
speaker: int,
audio_file: IO,
normalize: int,
audio_path: str,
normalize: bool,
core_version: Optional[str] = None,
):
f0, phonemes = extract_guided_feature(audio_file, query.kana)
kana = create_kana(query.accent_phrases)
f0, phonemes = extract_guided_feature(audio_path, kana)

phone_list = numpy.zeros((len(f0), OjtPhoneme.num_phoneme), dtype=numpy.float32)

@@ -516,13 +517,9 @@
phone_list[s - 1 : e] = OjtPhoneme(start=s, end=e, phoneme=p).onehot

if normalize:
f0 += get_normalize_diff(
engine=self, kana=query.kana, f0=f0, speaker_id=speaker
)
f0 += get_normalize_diff(engine=self, kana=kana, f0=f0, speaker_id=speaker)

f0 *= 2 ** query.pitchScale
f0[f0 > 6.5] = 6.5
f0[(0 < f0) & (f0 < 3)] = 3.0

f0 = resample(f0, int(len(f0) / query.speedScale))
phone_list = resample(phone_list, int(len(phone_list) / query.speedScale))
@@ -551,13 +548,13 @@

def guided_accent_phrases(
self,
accent_phrases: List[AccentPhrase],
query: AudioQuery,
speaker: int,
audio_file: IO,
normalize: int,
audio_path: str,
normalize: bool,
) -> List[AccentPhrase]:
kana = create_kana(accent_phrases=accent_phrases)
f0, phonemes = extract_guided_feature(audio_file, kana)
kana = create_kana(query.accent_phrases)
f0, phonemes = extract_guided_feature(audio_path, kana)
timed_phonemes = frame_to_second(deepcopy(phonemes))

phrase_info = []
@@ -577,10 +574,12 @@
engine=self, kana=kana, f0=f0, speaker_id=speaker
)
for p in phrase_info:
p.pitch += normalize_diff
if p.pitch != 0:
p.pitch += normalize_diff

idx = 1
for phrase in accent_phrases:
for phrase in query.accent_phrases:
phrase.pause_mora = None
for mora in phrase.moras:
if mora.consonant is not None:
mora.pitch = (
@@ -593,6 +592,8 @@
mora.pitch = phrase_info[idx].pitch
mora.vowel_length = phrase_info[idx].length
idx += 1
if mora.vowel in unvoiced_mora_phoneme_list:
mora.pitch = 0
if phrase_info[idx].phoneme == "sp":
phrase.pause_mora = Mora(
text="、",
@@ -604,4 +605,4 @@
)
idx += 1

return accent_phrases
return query.accent_phrases
