Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Guided Synthesis #252

Merged
merged 32 commits into from
Mar 10, 2022
Merged
Show file tree
Hide file tree
Changes from 22 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
a73892b
forced alignment, f0 extraction and entry point
Patchethium Dec 28, 2021
28cf7c2
Merge branch 'master' into guided_synthesis
Patchethium Dec 28, 2021
a060398
kind of finished
Patchethium Dec 28, 2021
f7a3713
change julius4seg, doesn't seem to help
Patchethium Dec 29, 2021
6b0651f
run pysen format
Patchethium Dec 29, 2021
f1a663a
add speaker id to api
Patchethium Dec 29, 2021
668df80
run pysen format
Patchethium Dec 29, 2021
ad4bdbd
add accent_phrase api, finish
Patchethium Dec 30, 2021
ea95405
add request parameter
Patchethium Dec 30, 2021
6dff2ec
improve error handling
Patchethium Dec 30, 2021
34eec39
run pysen format
Patchethium Dec 30, 2021
a0cba4d
add parameters
Patchethium Dec 30, 2021
90e41e2
run pysen format
Patchethium Dec 30, 2021
e889207
a little boundary check
Patchethium Dec 30, 2021
c98c8be
add normalization for different WAV format
Patchethium Dec 31, 2021
1c6d96e
run format
Patchethium Dec 31, 2021
2d74993
run format
Patchethium Dec 31, 2021
ca356df
Merge branch 'master' into guided_synthesis
Patchethium Dec 31, 2021
f088176
move synthesis and accent phrase to synthesis engine
Patchethium Dec 31, 2021
cf18c3c
add test for mock
Patchethium Dec 31, 2021
98d387c
change url for apis
Patchethium Dec 31, 2021
48b629f
simplify
Patchethium Dec 31, 2021
061483c
error type
Patchethium Jan 11, 2022
fc45886
Merge branch 'master' into guided_synthesis
Patchethium Jan 24, 2022
0e26bbb
do something
Patchethium Feb 21, 2022
365ed92
do something
Patchethium Feb 21, 2022
29427d9
run format
Patchethium Feb 21, 2022
ddc6537
Merge branch 'master' into guided_synthesis
Patchethium Feb 21, 2022
ca6df3b
resolve conflict
Patchethium Feb 21, 2022
730917f
add usage to README
Patchethium Feb 22, 2022
3522370
Merge branch 'master' into guided_synthesis
Patchethium Feb 27, 2022
9b75c6c
add comments and experimental flag for guided api
Patchethium Mar 9, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,9 @@ venv/
/cache

/licenses.json

# Guided Synthesis temp files
/voicevox_engine/dictation-kit*
first_pass*
second_pass*
tmp.wav
135 changes: 133 additions & 2 deletions run.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import base64
import json
import multiprocessing
import traceback
import zipfile
from functools import lru_cache
from pathlib import Path
Expand All @@ -11,7 +12,7 @@

import soundfile
import uvicorn
from fastapi import FastAPI, HTTPException, Request, Response
from fastapi import FastAPI, File, Form, HTTPException, Request, Response, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from fastapi.params import Query
from starlette.responses import FileResponse
Expand Down Expand Up @@ -206,6 +207,63 @@ def accent_phrases(
enable_interrogative=enable_interrogative,
)

@app.post(
    "/guided_accent_phrase",
    # NOTE(review): a maintainer suggested `response_model=List[AccentPhrase]`
    # here. Kept as AudioQuery because changing it alters the response
    # contract; confirm against what engine.guided_accent_phrases returns.
    response_model=AudioQuery,
    tags=["クエリ作成"],
    summary="Create Audio Query Guided by External Audio",
)
def guided_accent_phrase(
    kana: str = Form(...),  # noqa: B008
    speaker_id: int = Form(...),  # noqa: B008
    normalize: int = Form(...),  # noqa: B008
    audio_file: UploadFile = File(...),  # noqa: B008
):
    # Documented with comments instead of a docstring on purpose: FastAPI
    # publishes an endpoint's docstring as its OpenAPI description.
    #
    # Purpose: parse `kana` into accent phrases, wrap them in an AudioQuery
    # with fixed default scales, and hand the query plus the uploaded audio to
    # engine.guided_accent_phrases.
    #
    # Form fields:
    #   kana       -- text handed to parse_kana. Per maintainer review, "kana"
    #                 means AquesTalk-notation text, not hiragana.
    #   speaker_id -- forwarded to the engine unchanged.
    #   normalize  -- forwarded to the engine unchanged.
    #   audio_file -- uploaded audio; its underlying file object is forwarded.
    #
    # Returns whatever engine.guided_accent_phrases returns.
    #
    # Raises HTTPException:
    #   422 -- `kana` could not be parsed (client input error).
    #   500 -- forced alignment failed (StopIteration or "Decode Failed").
    #   500 -- any other unexpected error.
    #
    # NOTE(review): maintainers asked to align this signature with the other
    # query endpoints (text / speaker / is_kana / enable_interrogative /
    # audio_file). Not done here because it would break the current form-field
    # interface.
    try:
        accent_phrases, _ = parse_kana(kana, False)
        query = AudioQuery(
            accent_phrases=accent_phrases,
            speedScale=1,
            pitchScale=0,
            intonationScale=1,
            volumeScale=1,
            prePhonemeLength=0.1,
            postPhonemeLength=0.1,
            outputSamplingRate=default_sampling_rate,
            outputStereo=False,
            kana=kana,
        )
        return engine.guided_accent_phrases(
            query=query,
            speaker_id=speaker_id,
            audio_file=audio_file.file,
            normalize=normalize,
        )
    except ParseKanaError as err:
        print(traceback.format_exc())
        # Unparsable input is the caller's fault, so report 422 rather than 500.
        raise HTTPException(
            status_code=422,
            detail="Failed to Parse Kana",
        ) from err
    except Exception as err:
        print(traceback.format_exc())
        # StopIteration and a "Decode Failed" message are both reported as a
        # forced-alignment failure; everything else is an internal error.
        alignment_failed = (
            isinstance(err, StopIteration) or str(err) == "Decode Failed"
        )
        raise HTTPException(
            status_code=500,
            detail=(
                "Failed in Forced Alignment"
                if alignment_failed
                else "Internal Server Error"
            ),
        ) from err

@app.post(
"/mora_data",
response_model=List[AccentPhrase],
Expand Down Expand Up @@ -324,7 +382,7 @@ def multi_synthesis(queries: List[AudioQuery], speaker: int):
format="WAV",
)
wav_file.seek(0)
zip_file.writestr(f"{str(i+1).zfill(3)}.wav", wav_file.read())
zip_file.writestr(f"{str(i + 1).zfill(3)}.wav", wav_file.read())

return FileResponse(f.name, media_type="application/zip")

Expand Down Expand Up @@ -376,6 +434,79 @@ def _synthesis_morphing(

return FileResponse(f.name, media_type="audio/wav")

@app.post(
    "/guided_synthesis",
    responses={
        200: {
            "content": {
                "audio/wav": {"schema": {"type": "string", "format": "binary"}}
            },
        }
    },
    tags=["音声合成"],
    summary="Audio synthesis guided by external audio and phonemes",
)
def guided_synthesis(
    kana: str = Form(...),  # noqa: B008
    speaker_id: int = Form(...),  # noqa: B008
    normalize: int = Form(...),  # noqa: B008
    audio_file: UploadFile = File(...),  # noqa: B008
    stereo: int = Form(...),  # noqa: B008
    sample_rate: int = Form(...),  # noqa: B008
    volumeScale: float = Form(...),  # noqa: B008
    pitchScale: float = Form(...),  # noqa: B008
    speedScale: float = Form(...),  # noqa: B008
):
    # Documented with comments instead of a docstring on purpose: FastAPI
    # publishes an endpoint's docstring as its OpenAPI description.
    #
    # Purpose: parse `kana` into accent phrases, build an AudioQuery from the
    # form fields, run engine.guided_synthesis against the uploaded audio, and
    # return the resulting waveform as a WAV file response.
    #
    # Form fields:
    #   kana        -- text handed to parse_kana.
    #   speaker_id  -- forwarded to the engine unchanged.
    #   normalize   -- forwarded to the engine unchanged.
    #   audio_file  -- uploaded audio; its underlying file object is forwarded.
    #   stereo      -- stored as the query's outputStereo.
    #   sample_rate -- stored as outputSamplingRate and used when writing WAV.
    #   volumeScale / pitchScale / speedScale -- copied into the query.
    #
    # Raises HTTPException:
    #   422 -- `kana` could not be parsed (client input error).
    #   500 -- forced alignment failed (StopIteration or "Decode Failed").
    #   500 -- any other unexpected error.
    try:
        accent_phrases, _ = parse_kana(kana, False)
        query = AudioQuery(
            accent_phrases=accent_phrases,
            speedScale=speedScale,
            pitchScale=pitchScale,
            intonationScale=1,
            volumeScale=volumeScale,
            prePhonemeLength=0.1,
            postPhonemeLength=0.1,
            outputSamplingRate=sample_rate,
            outputStereo=stereo,
            kana=kana,
        )
        wave = engine.guided_synthesis(
            audio_file=audio_file.file,
            query=query,
            speaker_id=speaker_id,
            normalize=normalize,
        )

        # delete=False: the file must outlive this handler so FileResponse can
        # stream it after the function has returned.
        with NamedTemporaryFile(delete=False) as f:
            soundfile.write(file=f, data=wave, samplerate=sample_rate, format="WAV")

        return FileResponse(f.name, media_type="audio/wav")
    except ParseKanaError as err:
        print(traceback.format_exc())
        # Unparsable input is the caller's fault, so report 422 rather than 500.
        raise HTTPException(
            status_code=422,
            detail="Failed to Parse Kana",
        ) from err
    except Exception as err:
        print(traceback.format_exc())
        # StopIteration and a "Decode Failed" message are both reported as a
        # forced-alignment failure; everything else is an internal error.
        alignment_failed = (
            isinstance(err, StopIteration) or str(err) == "Decode Failed"
        )
        raise HTTPException(
            status_code=500,
            detail=(
                "Failed in Forced Alignment."
                if alignment_failed
                else "Internal Server Error."
            ),
        ) from err

@app.post(
"/connect_waves",
response_class=FileResponse,
Expand Down
50 changes: 49 additions & 1 deletion voicevox_engine/dev/synthesis_engine/mock.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from logging import getLogger
from typing import Any, Dict, List
from typing import Any, Dict, List, Optional
from typing.io import IO

import numpy as np
from pyopenjtalk import tts
Expand Down Expand Up @@ -125,3 +126,50 @@ def forward(self, text: str, **kwargs: Dict[str, Any]) -> np.ndarray:
24000 * len(wave) // 48000,
)
return wave.astype("int16")

def guided_synthesis(
    self,
    query: AudioQuery,
    speaker_id: int,
    audio_file: Optional[IO],
    normalize: int,
) -> np.ndarray:
    """
    Mock guided synthesis.

    Open JTalk has no guided mode, so this mock ignores the guidance inputs
    and simply delegates to the regular mock ``synthesis``.

    Parameters
    ----------
    query : AudioQuery
        Query forwarded unchanged to ``self.synthesis``.
    speaker_id : int
        Speaker id forwarded unchanged to ``self.synthesis``.
    audio_file : Optional[IO]
        Accepted for interface compatibility; not used by the mock.
    normalize : int
        Accepted for interface compatibility; not used by the mock.

    Returns
    -------
    wave : np.ndarray
        The result of ``self.synthesis(query=query, speaker_id=speaker_id)``.
    """
    return self.synthesis(query=query, speaker_id=speaker_id)

def guided_accent_phrases(
    self,
    query: AudioQuery,
    speaker_id: int,
    audio_file: Optional[IO],
    normalize: int,
) -> AudioQuery:
    """
    Mock guided accent-phrase estimation: return the input query unchanged.

    No alignment against the audio is performed; the query that was passed
    in is handed straight back to the caller.

    Parameters
    ----------
    query : AudioQuery
        Query that is returned as-is.
    speaker_id : int
        Accepted for interface compatibility; not used by the mock.
    audio_file : Optional[IO]
        Accepted for interface compatibility; not used by the mock.
    normalize : int
        Accepted for interface compatibility; not used by the mock.

    Returns
    -------
    query : AudioQuery
        The same object that was passed in as ``query``.
    """
    return query
Loading