Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Guided Synthesis #252

Merged
merged 32 commits into from
Mar 10, 2022
Merged
Show file tree
Hide file tree
Changes from 30 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
a73892b
forced alignment, f0 extraction and entry point
Patchethium Dec 28, 2021
28cf7c2
Merge branch 'master' into guided_synthesis
Patchethium Dec 28, 2021
a060398
kind of finished
Patchethium Dec 28, 2021
f7a3713
change julius4seg, doesn't seem to help
Patchethium Dec 29, 2021
6b0651f
run pysen format
Patchethium Dec 29, 2021
f1a663a
add speaker id to api
Patchethium Dec 29, 2021
668df80
run pysen format
Patchethium Dec 29, 2021
ad4bdbd
add accent_phrase api, finish
Patchethium Dec 30, 2021
ea95405
add request parameter
Patchethium Dec 30, 2021
6dff2ec
improve error handling
Patchethium Dec 30, 2021
34eec39
run pysen format
Patchethium Dec 30, 2021
a0cba4d
add parameters
Patchethium Dec 30, 2021
90e41e2
run pysen format
Patchethium Dec 30, 2021
e889207
a little boundary check
Patchethium Dec 30, 2021
c98c8be
add normalization for different WAV format
Patchethium Dec 31, 2021
1c6d96e
run format
Patchethium Dec 31, 2021
2d74993
run format
Patchethium Dec 31, 2021
ca356df
Merge branch 'master' into guided_synthesis
Patchethium Dec 31, 2021
f088176
move synthesis and accent phrase to synthesis engine
Patchethium Dec 31, 2021
cf18c3c
add test for mock
Patchethium Dec 31, 2021
98d387c
change url for apis
Patchethium Dec 31, 2021
48b629f
simplify
Patchethium Dec 31, 2021
061483c
error type
Patchethium Jan 11, 2022
fc45886
Merge branch 'master' into guided_synthesis
Patchethium Jan 24, 2022
0e26bbb
do something
Patchethium Feb 21, 2022
365ed92
do something
Patchethium Feb 21, 2022
29427d9
run format
Patchethium Feb 21, 2022
ddc6537
Merge branch 'master' into guided_synthesis
Patchethium Feb 21, 2022
ca6df3b
resolve conflict
Patchethium Feb 21, 2022
730917f
add usage to README
Patchethium Feb 22, 2022
3522370
Merge branch 'master' into guided_synthesis
Patchethium Feb 27, 2022
9b75c6c
add comments and experimental flag for guided api
Patchethium Mar 9, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -160,3 +160,9 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

# Guided Synthesis temp files
/voicevox_engine/experimental/dictation-kit*
first_pass*
second_pass*
tmp.wav
31 changes: 31 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,37 @@ curl -s \
> audio.wav
```

### Guided Synthesis
Currently, there are two APIs that accept an uploaded audio file and return the corresponding synthesis information.
For the best results with both of them, we recommend setting `is_kana` to `true` and using the `kana` field of an `AudioQuery` as the input text.
You can also find the kana text in the AquesTalk section.
```bash
# Returns an audio file which is synthesised referencing uploaded audio
# this example needs a recording whose content is
# "また 東寺のように 五大明王と 呼ばれる 主要な 明王の 中央に 配されることも多い"

curl -L -X POST 'localhost:50021/guided_synthesis' \
-F 'kana="マ'\''タ、ト'\''オジノヨオニ、ゴダイミョオオ'\''オト、ヨ'\''/バレ'\''ル、シュ'\''ヨオナ、ミョオ'\''オオ/ノ'\''、チュ'\''ウオオニ、ハイサレルコ'\''/トモ'\''オオイ"' \
-F 'speaker_id="5"' \
-F 'audio_file=@"/full_path_to_your_recording"' \
-F 'normalize="true"' \
-F 'stereo="true"' \
-F 'sample_rate="24000"' \
-F 'volume_scale="1"' \
-F 'pitch_scale="0"' \
-F 'speed_scale="1"'

# Returns a list of AccentPhrases

curl -L -X POST 'localhost:50021/guided_accent_phrase' \
-F 'text="マ'\''タ、ト'\''オジノヨオニ、ゴダイミョオオ'\''オト、ヨ'\''/バレ'\''ル、シュ'\''ヨオナ、ミョオ'\''オオ/ノ'\''、チュ'\''ウオオニ、ハイサレルコ'\''/トモ'\''オオイ"' \
-F 'speaker="5"' \
-F 'audio_file=@"/full_path_to_your_recording"' \
-F 'normalize="true"' \
-F 'is_kana="true"' \
-F 'enable_interrogative="false"'
```

### 話者の追加情報を取得するサンプルコード

追加情報の中の portrait.png を取得するコードです。
Expand Down
138 changes: 136 additions & 2 deletions run.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

import soundfile
import uvicorn
from fastapi import FastAPI, HTTPException, Request, Response
from fastapi import FastAPI, File, Form, HTTPException, Request, Response, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from fastapi.params import Query
from pydantic import ValidationError
Expand Down Expand Up @@ -213,6 +213,66 @@ def accent_phrases(
else:
return engine.create_accent_phrases(text, speaker_id=speaker)

# POST /guided_accent_phrase
#
# Builds accent phrases from the submitted text, then asks the engine to
# adjust them against the uploaded recording.
#
#   text         -- input text; parsed with parse_kana() when is_kana is true,
#                   otherwise handed to engine.create_accent_phrases()
#   speaker      -- speaker id forwarded to the engine
#   is_kana      -- selects how `text` is interpreted (see above)
#   audio_file   -- uploaded recording; its underlying file object is passed on
#   normalize    -- forwarded unchanged to engine.guided_accent_phrases()
#   core_version -- selects the engine through get_engine()
#
# Responses: 400 when the kana text cannot be parsed, 422 when the engine
# raises ParseKanaError, 500 for any other failure.
#
# NOTE: these are comments rather than a docstring on purpose, so the
# generated OpenAPI description of the endpoint is left untouched.
@app.post(
    "/guided_accent_phrase",
    response_model=List[AccentPhrase],
    tags=["クエリ編集"],
    summary="Create Accent Phrase from External Audio",
)
def guided_accent_phrase(
    text: str = Form(...),  # noqa:B008
    speaker: int = Form(...),  # noqa:B008
    is_kana: bool = Form(...),  # noqa:B008
    audio_file: UploadFile = File(...),  # noqa: B008
    normalize: bool = Form(...),  # noqa:B008
    core_version: Optional[str] = None,
):
    engine = get_engine(core_version)

    # Step 1: obtain the initial accent phrases from the text.
    if not is_kana:
        accent_phrases = engine.create_accent_phrases(text, speaker_id=speaker)
    else:
        try:
            accent_phrases = parse_kana(text)
        except ParseKanaError as err:
            raise HTTPException(
                status_code=400,
                detail=ParseKanaBadRequest(err).dict(),
            )

    # Step 2: let the engine refine them using the uploaded audio.
    try:
        return engine.guided_accent_phrases(
            accent_phrases=accent_phrases,
            speaker=speaker,
            audio_file=audio_file.file,
            normalize=normalize,
        )
    except ParseKanaError as err:
        raise HTTPException(
            status_code=422,
            detail=ParseKanaBadRequest(err).dict(),
        )
    except Exception as e:
        print(traceback.format_exc())
        # StopIteration and an exception whose message is "Decode Failed"
        # are both reported as a forced-alignment failure.
        alignment_failed = isinstance(e, StopIteration) or str(e) == "Decode Failed"
        detail = (
            "Failed in Forced Alignment"
            if alignment_failed
            else "Internal Server Error"
        )
        raise HTTPException(status_code=500, detail=detail)

@app.post(
"/mora_data",
response_model=List[AccentPhrase],
Expand Down Expand Up @@ -364,7 +424,7 @@ def multi_synthesis(
format="WAV",
)
wav_file.seek(0)
zip_file.writestr(f"{str(i+1).zfill(3)}.wav", wav_file.read())
zip_file.writestr(f"{str(i + 1).zfill(3)}.wav", wav_file.read())

return FileResponse(f.name, media_type="application/zip")

Expand Down Expand Up @@ -418,6 +478,80 @@ def _synthesis_morphing(

return FileResponse(f.name, media_type="audio/wav")

# POST /guided_synthesis
#
# Synthesises audio for the given kana text, guided by an uploaded recording,
# and returns the result as a WAV file.
#
#   kana         -- text parsed with parse_kana() into accent phrases
#   speaker_id   -- speaker id forwarded to the engine
#   normalize    -- boolean flag forwarded unchanged to engine.guided_synthesis()
#   audio_file   -- uploaded recording; its underlying file object is passed on
#   stereo       -- becomes AudioQuery.outputStereo
#   sample_rate  -- becomes AudioQuery.outputSamplingRate and the WAV sample rate
#   volume_scale, pitch_scale, speed_scale -- copied into the AudioQuery
#   core_version -- selects the engine through get_engine()
#
# Responses: 400 when the kana text cannot be parsed, 500 for any other
# failure.
#
# `normalize` and `stereo` are declared as bool (not int) so that the values
# "true"/"false" shown in the README example are accepted; "0"/"1" keep
# working because they are valid boolean form values as well. This also
# matches the `normalize: bool` field of /guided_accent_phrase.
@app.post(
    "/guided_synthesis",
    responses={
        200: {
            "content": {
                "audio/wav": {"schema": {"type": "string", "format": "binary"}}
            },
        }
    },
    tags=["音声合成"],
    summary="Audio synthesis guided by external audio and phonemes",
)
def guided_synthesis(
    kana: str = Form(...),  # noqa: B008
    speaker_id: int = Form(...),  # noqa: B008
    normalize: bool = Form(...),  # noqa: B008
    audio_file: UploadFile = File(...),  # noqa: B008
    stereo: bool = Form(...),  # noqa: B008
    sample_rate: int = Form(...),  # noqa: B008
    volume_scale: float = Form(...),  # noqa: B008
    pitch_scale: float = Form(...),  # noqa: B008
    speed_scale: float = Form(...),  # noqa: B008
    core_version: Optional[str] = None,
):
    engine = get_engine(core_version)
    try:
        accent_phrases = parse_kana(kana)
        query = AudioQuery(
            accent_phrases=accent_phrases,
            speedScale=speed_scale,
            pitchScale=pitch_scale,
            intonationScale=1,
            volumeScale=volume_scale,
            prePhonemeLength=0.1,
            postPhonemeLength=0.1,
            outputSamplingRate=sample_rate,
            outputStereo=stereo,
            kana=kana,
        )
        wave = engine.guided_synthesis(
            audio_file=audio_file.file,
            query=query,
            speaker=speaker_id,
            normalize=normalize,
        )

        # delete=False: the file must still exist when FileResponse reads it.
        with NamedTemporaryFile(delete=False) as f:
            soundfile.write(file=f, data=wave, samplerate=sample_rate, format="WAV")

        return FileResponse(f.name, media_type="audio/wav")
    except ParseKanaError as err:
        raise HTTPException(
            status_code=400,
            detail=ParseKanaBadRequest(err).dict(),
        ) from err
    except StopIteration as err:
        print(traceback.format_exc())
        raise HTTPException(
            status_code=500,
            detail="Failed in Forced Alignment.",
        ) from err
    except Exception as e:
        print(traceback.format_exc())
        # The message "Decode Failed" is reported as a forced-alignment
        # failure; everything else is a generic server error.
        if str(e) == "Decode Failed":
            raise HTTPException(
                status_code=500,
                detail="Failed in Forced Alignment.",
            ) from e
        raise HTTPException(
            status_code=500,
            detail="Internal Server Error.",
        ) from e

@app.post(
"/connect_waves",
response_class=FileResponse,
Expand Down
48 changes: 48 additions & 0 deletions voicevox_engine/dev/synthesis_engine/mock.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from logging import getLogger
from typing import Any, Dict, List, Optional
from typing.io import IO

import numpy as np
from pyopenjtalk import tts
Expand Down Expand Up @@ -130,3 +131,50 @@ def forward(self, text: str, **kwargs: Dict[str, Any]) -> np.ndarray:
wave, sr = tts(text)
wave = resample(wave, 24000 * len(wave) // 48000)
return wave

def guided_synthesis(
    self,
    query: AudioQuery,
    speaker: int,
    audio_file: IO,
    normalize: int,
) -> np.ndarray:
    """
    Mock implementation of guided synthesis.

    Open JTalk has no guided mode, so this simply delegates to the
    ordinary mock ``synthesis`` method.

    Parameters
    ----------
    query
        Passed through to ``self.synthesis`` as ``query``.
    speaker
        Passed through to ``self.synthesis`` as ``speaker_id``.
    audio_file
        Unused by the mock.
    normalize
        Unused by the mock.

    Returns
    -------
    Whatever ``self.synthesis`` returns for the given query and speaker.
    """
    wave = self.synthesis(query=query, speaker_id=speaker)
    return wave

def guided_accent_phrases(
    self,
    accent_phrases: List[AccentPhrase],
    speaker: int,
    audio_file: IO,
    normalize: int,
) -> List[AccentPhrase]:
    """
    Mock implementation of guided accent-phrase estimation.

    Returns the input ``accent_phrases`` as-is, without modifying them.

    Parameters
    ----------
    accent_phrases
        The accent phrases to hand back.
    speaker
        Unused by the mock.
    audio_file
        Unused by the mock.
    normalize
        Unused by the mock.

    Returns
    -------
    The very same ``accent_phrases`` object that was passed in.
    """
    # Nothing to align in the mock: hand the input straight back.
    return accent_phrases
Empty file.
Loading