-
Notifications
You must be signed in to change notification settings - Fork 206
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Guided Synthesis #252
Guided Synthesis #252
Changes from 22 commits
a73892b
28cf7c2
a060398
f7a3713
6b0651f
f1a663a
668df80
ad4bdbd
ea95405
6dff2ec
34eec39
a0cba4d
90e41e2
e889207
c98c8be
1c6d96e
2d74993
ca356df
f088176
cf18c3c
98d387c
48b629f
061483c
fc45886
0e26bbb
365ed92
29427d9
ddc6537
ca6df3b
730917f
3522370
9b75c6c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,6 +3,7 @@ | |
import base64 | ||
import json | ||
import multiprocessing | ||
import traceback | ||
import zipfile | ||
from functools import lru_cache | ||
from pathlib import Path | ||
|
@@ -11,7 +12,7 @@ | |
|
||
import soundfile | ||
import uvicorn | ||
from fastapi import FastAPI, HTTPException, Request, Response | ||
from fastapi import FastAPI, File, Form, HTTPException, Request, Response, UploadFile | ||
from fastapi.middleware.cors import CORSMiddleware | ||
from fastapi.params import Query | ||
from starlette.responses import FileResponse | ||
|
@@ -206,6 +207,63 @@ def accent_phrases( | |
enable_interrogative=enable_interrogative, | ||
) | ||
|
||
@app.post( | ||
"/guided_accent_phrase", | ||
response_model=AudioQuery, | ||
tags=["クエリ作成"], | ||
summary="Create Audio Query Guided by External Audio", | ||
) | ||
def guided_accent_phrase( | ||
kana: str = Form(...), # noqa: B008 | ||
speaker_id: int = Form(...), # noqa: B008 | ||
normalize: int = Form(...), # noqa: B008 | ||
audio_file: UploadFile = File(...), # noqa: B008 | ||
): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
他のAPIと形式を合わせておくと、ユーザーにとって使い勝手が良さそうです。
def guided_accent_phrase(
text: str,
speaker: int,
is_kana: bool = False,
enable_interrogative: bool = enable_interrogative_query_param(), # noqa B008,
audio_file: UploadFile = File(...), # noqa: B008
): |
||
try: | ||
accent_phrases, _ = parse_kana(kana, False) | ||
query = AudioQuery( | ||
accent_phrases=accent_phrases, | ||
speedScale=1, | ||
pitchScale=0, | ||
intonationScale=1, | ||
volumeScale=1, | ||
prePhonemeLength=0.1, | ||
postPhonemeLength=0.1, | ||
outputSamplingRate=default_sampling_rate, | ||
outputStereo=False, | ||
kana=kana, | ||
) | ||
return engine.guided_accent_phrases( | ||
query=query, | ||
speaker_id=speaker_id, | ||
audio_file=audio_file.file, | ||
normalize=normalize, | ||
) | ||
except ParseKanaError: | ||
print(traceback.format_exc()) | ||
raise HTTPException( | ||
status_code=500, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think using 422 instead of 500 for the status code is better. |
||
detail="Failed to Parse Kana", | ||
) | ||
except StopIteration: | ||
print(traceback.format_exc()) | ||
raise HTTPException( | ||
status_code=500, | ||
detail="Failed in Forced Alignment", | ||
) | ||
except Exception as e: | ||
print(traceback.format_exc()) | ||
if str(e) == "Decode Failed": | ||
raise HTTPException( | ||
status_code=500, | ||
detail="Failed in Forced Alignment", | ||
) | ||
else: | ||
raise HTTPException( | ||
status_code=500, | ||
detail="Internal Server Error", | ||
) | ||
|
||
@app.post( | ||
"/mora_data", | ||
response_model=List[AccentPhrase], | ||
|
@@ -324,7 +382,7 @@ def multi_synthesis(queries: List[AudioQuery], speaker: int): | |
format="WAV", | ||
) | ||
wav_file.seek(0) | ||
zip_file.writestr(f"{str(i+1).zfill(3)}.wav", wav_file.read()) | ||
zip_file.writestr(f"{str(i + 1).zfill(3)}.wav", wav_file.read()) | ||
|
||
return FileResponse(f.name, media_type="application/zip") | ||
|
||
|
@@ -376,6 +434,79 @@ def _synthesis_morphing( | |
|
||
return FileResponse(f.name, media_type="audio/wav") | ||
|
||
@app.post( | ||
"/guided_synthesis", | ||
responses={ | ||
200: { | ||
"content": { | ||
"audio/wav": {"schema": {"type": "string", "format": "binary"}} | ||
}, | ||
} | ||
}, | ||
tags=["音声合成"], | ||
summary="Audio synthesis guided by external audio and phonemes", | ||
) | ||
def guided_synthesis( | ||
Hiroshiba marked this conversation as resolved.
Show resolved
Hide resolved
|
||
kana: str = Form(...), # noqa: B008 | ||
speaker_id: int = Form(...), # noqa: B008 | ||
normalize: int = Form(...), # noqa: B008 | ||
audio_file: UploadFile = File(...), # noqa: B008 | ||
stereo: int = Form(...), # noqa: B008 | ||
sample_rate: int = Form(...), # noqa: B008 | ||
volumeScale: float = Form(...), # noqa: B008 | ||
pitchScale: float = Form(...), # noqa: B008 | ||
speedScale: float = Form(...), # noqa: B008 | ||
): | ||
try: | ||
accent_phrases, _ = parse_kana(kana, False) | ||
query = AudioQuery( | ||
accent_phrases=accent_phrases, | ||
speedScale=speedScale, | ||
pitchScale=pitchScale, | ||
intonationScale=1, | ||
volumeScale=volumeScale, | ||
prePhonemeLength=0.1, | ||
postPhonemeLength=0.1, | ||
outputSamplingRate=sample_rate, | ||
outputStereo=stereo, | ||
kana=kana, | ||
) | ||
wave = engine.guided_synthesis( | ||
audio_file=audio_file.file, | ||
query=query, | ||
speaker_id=speaker_id, | ||
normalize=normalize, | ||
) | ||
|
||
with NamedTemporaryFile(delete=False) as f: | ||
soundfile.write(file=f, data=wave, samplerate=sample_rate, format="WAV") | ||
|
||
return FileResponse(f.name, media_type="audio/wav") | ||
except ParseKanaError: | ||
print(traceback.format_exc()) | ||
raise HTTPException( | ||
status_code=500, | ||
detail="Failed to Parse Kana", | ||
) | ||
except StopIteration: | ||
print(traceback.format_exc()) | ||
raise HTTPException( | ||
status_code=500, | ||
detail="Failed in Forced Alignment.", | ||
) | ||
except Exception as e: | ||
print(traceback.format_exc()) | ||
if str(e) == "Decode Failed": | ||
raise HTTPException( | ||
status_code=500, | ||
detail="Failed in Forced Alignment.", | ||
) | ||
else: | ||
raise HTTPException( | ||
status_code=500, | ||
detail="Internal Server Error.", | ||
) | ||
|
||
@app.post( | ||
"/connect_waves", | ||
response_class=FileResponse, | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ここは
List[AccentPhrase]
が正しそうです。
voicevox_engine/run.py
Line 161 in bdf712f