VOICEVOX · Hiroshiba · Mar 10, 2022 · Dec 28, 2021 · Dec 28, 2021 · Dec 28, 2021
@@ -23,3 +23,9 @@ venv/
 /cache
 
 /licenses.json
+
+# Guided Synthesis temp files
+/voicevox_engine/dictation-kit*
+first_pass*
+second_pass*
+tmp.wav
@@ -3,6 +3,7 @@
 import base64
 import json
 import multiprocessing
+import traceback
 import zipfile
 from functools import lru_cache
 from pathlib import Path
@@ -11,7 +12,7 @@
 
 import soundfile
 import uvicorn
-from fastapi import FastAPI, HTTPException, Request, Response
+from fastapi import FastAPI, File, Form, HTTPException, Request, Response, UploadFile
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.params import Query
 from starlette.responses import FileResponse
@@ -206,6 +207,63 @@ def accent_phrases(
                 enable_interrogative=enable_interrogative,
             )
 
+    @app.post(
+        "/guided_accent_phrase",
+        response_model=AudioQuery,
         response_model=List[AccentPhrase], 
         response_model=List[AccentPhrase], 
+        tags=["クエリ作成"],
+        summary="Create Audio Query Guided by External Audio",
+    )
+    def guided_accent_phrase(
+        kana: str = Form(...),  # noqa: B008
+        speaker_id: int = Form(...),  # noqa: B008
+        normalize: int = Form(...),  # noqa: B008
+        audio_file: UploadFile = File(...),  # noqa: B008
+    ):
+        try:
+            accent_phrases, _ = parse_kana(kana, False)
+            query = AudioQuery(
+                accent_phrases=accent_phrases,
+                speedScale=1,
+                pitchScale=0,
+                intonationScale=1,
+                volumeScale=1,
+                prePhonemeLength=0.1,
+                postPhonemeLength=0.1,
+                outputSamplingRate=default_sampling_rate,
+                outputStereo=False,
+                kana=kana,
+            )
+            return engine.guided_accent_phrases(
+                query=query,
+                speaker_id=speaker_id,
+                audio_file=audio_file.file,
+                normalize=normalize,
+            )
+        except ParseKanaError:
+            print(traceback.format_exc())
+            raise HTTPException(
+                status_code=500,
+                detail="Failed to Parse Kana",
+            )
+        except StopIteration:
+            print(traceback.format_exc())
+            raise HTTPException(
+                status_code=500,
+                detail="Failed in Forced Alignment",
+            )
+        except Exception as e:
+            print(traceback.format_exc())
+            if str(e) == "Decode Failed":
+                raise HTTPException(
+                    status_code=500,
+                    detail="Failed in Forced Alignment",
+                )
+            else:
+                raise HTTPException(
+                    status_code=500,
+                    detail="Internal Server Error",
+                )
+
     @app.post(
         "/mora_data",
         response_model=List[AccentPhrase],
@@ -324,7 +382,7 @@ def multi_synthesis(queries: List[AudioQuery], speaker: int):
                             format="WAV",
                         )
                         wav_file.seek(0)
-                        zip_file.writestr(f"{str(i+1).zfill(3)}.wav", wav_file.read())
+                        zip_file.writestr(f"{str(i + 1).zfill(3)}.wav", wav_file.read())
 
         return FileResponse(f.name, media_type="application/zip")
 
@@ -376,6 +434,79 @@ def _synthesis_morphing(
 
         return FileResponse(f.name, media_type="audio/wav")
 
+    @app.post(
+        "/guided_synthesis",
+        responses={
+            200: {
+                "content": {
+                    "audio/wav": {"schema": {"type": "string", "format": "binary"}}
+                },
+            }
+        },
+        tags=["音声合成"],
+        summary="Audio synthesis guided by external audio and phonemes",
+    )
+    def guided_synthesis(
+        kana: str = Form(...),  # noqa: B008
+        speaker_id: int = Form(...),  # noqa: B008
+        normalize: int = Form(...),  # noqa: B008
+        audio_file: UploadFile = File(...),  # noqa: B008
+        stereo: int = Form(...),  # noqa: B008
+        sample_rate: int = Form(...),  # noqa: B008
+        volumeScale: float = Form(...),  # noqa: B008
+        pitchScale: float = Form(...),  # noqa: B008
+        speedScale: float = Form(...),  # noqa: B008
+    ):
+        try:
+            accent_phrases, _ = parse_kana(kana, False)
+            query = AudioQuery(
+                accent_phrases=accent_phrases,
+                speedScale=speedScale,
+                pitchScale=pitchScale,
+                intonationScale=1,
+                volumeScale=volumeScale,
+                prePhonemeLength=0.1,
+                postPhonemeLength=0.1,
+                outputSamplingRate=sample_rate,
+                outputStereo=stereo,
+                kana=kana,
+            )
+            wave = engine.guided_synthesis(
+                audio_file=audio_file.file,
+                query=query,
+                speaker_id=speaker_id,
+                normalize=normalize,
+            )
+
+            with NamedTemporaryFile(delete=False) as f:
+                soundfile.write(file=f, data=wave, samplerate=sample_rate, format="WAV")
+
+            return FileResponse(f.name, media_type="audio/wav")
+        except ParseKanaError:
+            print(traceback.format_exc())
+            raise HTTPException(
+                status_code=500,
+                detail="Failed to Parse Kana",
+            )
+        except StopIteration:
+            print(traceback.format_exc())
+            raise HTTPException(
+                status_code=500,
+                detail="Failed in Forced Alignment.",
+            )
+        except Exception as e:
+            print(traceback.format_exc())
+            if str(e) == "Decode Failed":
+                raise HTTPException(
+                    status_code=500,
+                    detail="Failed in Forced Alignment.",
+                )
+            else:
+                raise HTTPException(
+                    status_code=500,
+                    detail="Internal Server Error.",
+                )
+
     @app.post(
         "/connect_waves",
         response_class=FileResponse,

@@ -1,5 +1,6 @@
 from logging import getLogger
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional
+from typing.io import IO
 
 import numpy as np
 from pyopenjtalk import tts
@@ -125,3 +126,50 @@ def forward(self, text: str, **kwargs: Dict[str, Any]) -> np.ndarray:
             24000 * len(wave) // 48000,
         )
         return wave.astype("int16")
+
+    def guided_synthesis(
+        self,
+        query: AudioQuery,
+        speaker_id: int,
+        audio_file: Optional[IO],
+        normalize: int,
+    ) -> np.ndarray:
+        """
+        Open jtalk doesn't have a guided function
+        simply calling mock synthesis
+
+        Parameters
+        ----------
+        query
+        speaker_id
+        audio_file
+        normalize
+
+        Returns
+        -------
+
+        """
+        return self.synthesis(query=query, speaker_id=speaker_id)
+
+    def guided_accent_phrases(
+        self,
+        query: AudioQuery,
+        speaker_id: int,
+        audio_file: Optional[IO],
+        normalize: int,
+    ) -> AudioQuery:
+        """
+        [Mock] Return the given query as-is, leaving its accent phrases
+        unchanged.
+
+        Parameters
+        ----------
+        query : AudioQuery
+            Returned unchanged.
+        speaker_id : int
+            Not used by this implementation.
+        audio_file : Optional[IO]
+            Not used by this implementation.
+        normalize : int
+            Not used by this implementation.
+
+        Returns
+        -------
+        AudioQuery
+            The same ``query`` object that was passed in.
+        """
+        return query