From 9af4963892f36a6b9022d33f204aec12f7a4ffd3 Mon Sep 17 00:00:00 2001
From: Patchethium <74458240+Patchethium@users.noreply.github.com>
Date: Mon, 11 Apr 2022 03:00:08 +0800
Subject: [PATCH] Guided synthesis - API Improvement (#376)

* forced alignment, f0 extraction and entry point
* kind of finished
* change julius4seg, doesn't seem to help
* run pysen format
* add speaker id to api
* run pysen format
* add accent_phrase api, finish
* add request parameter
* improve error handling
* run pysen format
* add parameters
* run pysen format
* a little boundary check
* add normalization for different WAV format
* run format
* run format
* move synthesis and accent phrase to synthesis engine
* add test for mock
* change url for apis
* simplify
* error type
* do something
* do something
* run format
* resolve conflict
* add usage to README
* add comments and experimental flag for guided api
* add guided info to AudioQuery model
* improve api definition
* run format, update README
* add error handling for wrong audio formats, edit README
* reserve unvoiced mora, add response type
* remove 422 error, move boundary check
* Update voicevox_engine/synthesis_engine/synthesis_engine.py

Co-authored-by: Hiroshiba

* move guided info to the outside of query
* run fmt
* update README
* fix README

Co-authored-by: Hiroshiba
---
 README.md                                     | 69 +++++++++-------
 run.py                                        | 78 ++++++-------
 voicevox_engine/dev/synthesis_engine/mock.py  | 18 ++---
 .../experimental/guided_extractor.py          | 13 +++-
 .../synthesis_engine/synthesis_engine.py      | 35 +++++----
 .../synthesis_engine/synthesis_engine_base.py | 12 +--
 6 files changed, 109 insertions(+), 116 deletions(-)

diff --git a/README.md b/README.md
index ebf8436a8..8ca584fe6 100644
--- a/README.md
+++ b/README.md
@@ -245,34 +245,49 @@ curl -s \
 ```
 
 ### Guided Synthesis
-Currently, we have two apis which accept an uploaded audio file and return corresponding synthesis information.
-Both of them recommend setting `is_kana` to be `true` and use `kana` section from `AudioQuery` for the best performance.
-You can also get the kana text in AquesTalk section.
+Currently, we have two APIs that reference an external audio source: `guided_synthesis`, which generates audio, and `guided_accent_phrases`, which generates a list of AccentPhrase.
+Note that unlike `guided_accent_phrases`, `guided_synthesis` works at frame resolution; as a result, the two are not compatible with each other.
+**The external audio should be in WAV format.**
 ```bash
-# Returns an audio file which is synthesised referencing uploaded audio
-# this example needs a recording whose content is
-# "また 東寺のように 五大明王と 呼ばれる 主要な 明王の 中央に 配されることも多い"
-
-curl -L -X POST 'localhost:50021/guided_synthesis' \
-    -F 'kana="マ'\''タ、ト'\''オジノヨオニ、ゴダイミョオオ'\''オト、ヨ'\''/バレ'\''ル、シュ'\''ヨオナ、ミョオ'\''オオ/ノ'\''、チュ'\''ウオオニ、ハイサレルコ'\''/トモ'\''オオイ"' \
-    -F 'speaker_id="5"' \
-    -F 'audio_file=@"/full_path_to_your_recording"' \
-    -F 'normalize="true"' \
-    -F 'stereo="true"' \
-    -F 'sample_rate="24000"' \
-    -F 'volume_scale="1"' \
-    -F 'pitch_scale="0"' \
-    -F 'speed_scale="1"'
-
-# Returns a list of AccentPhrases
-
-curl -L -X POST 'localhost:50021/guided_accent_phrase' \
-    -F 'text="マ'\''タ、ト'\''オジノヨオニ、ゴダイミョオオ'\''オト、ヨ'\''/バレ'\''ル、シュ'\''ヨオナ、ミョオ'\''オオ/ノ'\''、チュ'\''ウオオニ、ハイサレルコ'\''/トモ'\''オオイ"' \
-    -F 'speaker="5"' \
-    -F 'audio_file=@"/full_path_to_your_recording"' \
-    -F 'normalize="true"' \
-    -F 'is_kana="true"' \
-    -F 'enable_interrogative="false"'
+# guided_synthesis returns an audio file which is synthesised referencing the external audio source
+
+echo -n "また 東寺のように 五大明王と 呼ばれる 主要な 明王の 中央に 配されることも多い" > text.txt
+
+curl -s \
+    -X POST \
+    "localhost:50021/audio_query?speaker=1" \
+    --get --data-urlencode text@text.txt \
+    > query.json
+
+# if true, the average of f0 will be normalized to the predicted average
+normalize="true"
+# full path to your audio recording
+audio_path="/home/.../sample.wav"
+
+curl -s \
+    -H "Content-Type: application/json" \
+    -X POST \
+    -d @query.json \
+    "localhost:50021/guided_synthesis?speaker=1&normalize=$normalize&audio_path=$audio_path" \
+    > audio.wav
+
+# guided_accent_phrases returns a list of AccentPhrases
+curl -s \
+    -H "Content-Type: application/json" \
+    -X POST \
+    -d @query.json \
+    "localhost:50021/guided_accent_phrases?speaker=1&normalize=$normalize&audio_path=$audio_path" \
+    > newphrases.json
+
+# replace the accent_phrases section in query
+cat query.json | sed -e "s/\[{.*}\]/$(cat newphrases.json)/g" > newquery.json
+
+curl -s \
+    -H "Content-Type: application/json" \
+    -X POST \
+    -d @newquery.json \
+    "localhost:50021/synthesis?speaker=1" \
+    > audio.wav
 ```
 
 ### 話者の追加情報を取得するサンプルコード
diff --git a/run.py b/run.py
index a4eb691a4..ccdd4373a 100644
--- a/run.py
+++ b/run.py
@@ -17,7 +17,7 @@
 import soundfile
 import uvicorn
-from fastapi import FastAPI, File, Form, HTTPException, Request, Response, UploadFile
+from fastapi import FastAPI, HTTPException, Request, Response
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.params import Query
 from pydantic import ValidationError, conint
@@ -219,17 +219,16 @@ def accent_phrases(
         return engine.create_accent_phrases(text, speaker_id=speaker)
 
     @app.post(
-        "/guided_accent_phrase",
+        "/guided_accent_phrases",
         response_model=List[AccentPhrase],
         tags=["クエリ編集"],
         summary="Create Accent Phrase from External Audio",
     )
-    def guided_accent_phrase(
-        text: str = Form(...),  # noqa:B008
-        speaker: int = Form(...),  # noqa:B008
-        is_kana: bool = Form(...),  # noqa:B008
-        audio_file: UploadFile = File(...),  # noqa: B008
-        normalize: bool = Form(...),  # noqa:B008
+    def guided_accent_phrases(
+        query: AudioQuery,
+        speaker: int,
+        audio_path: str,
+        normalize: bool,
         core_version: Optional[str] = None,
     ):
         """
@@ -243,32 +242,13 @@ def guided_accent_phrase(
                 detail="実験的機能はデフォルトで無効になっています。使用するには引数を指定してください。",
             )
         engine = get_engine(core_version)
-        if is_kana:
-            try:
-                accent_phrases = parse_kana(text)
-            except ParseKanaError as err:
-                raise HTTPException(
status_code=400, - detail=ParseKanaBadRequest(err).dict(), - ) - else: - accent_phrases = engine.create_accent_phrases( - text, - speaker_id=speaker, - ) - try: return engine.guided_accent_phrases( - accent_phrases=accent_phrases, + query=query, speaker=speaker, - audio_file=audio_file.file, + audio_path=audio_path, normalize=normalize, ) - except ParseKanaError as err: - raise HTTPException( - status_code=422, - detail=ParseKanaBadRequest(err).dict(), - ) except StopIteration: print(traceback.format_exc()) raise HTTPException( @@ -505,6 +485,7 @@ def _synthesis_morphing( @app.post( "/guided_synthesis", + response_class=FileResponse, responses={ 200: { "content": { @@ -516,15 +497,10 @@ def _synthesis_morphing( summary="Audio synthesis guided by external audio and phonemes", ) def guided_synthesis( - kana: str = Form(...), # noqa: B008 - speaker_id: int = Form(...), # noqa: B008 - normalize: bool = Form(...), # noqa: B008 - audio_file: UploadFile = File(...), # noqa: B008 - stereo: bool = Form(...), # noqa: B008 - sample_rate: int = Form(...), # noqa: B008 - volume_scale: float = Form(...), # noqa: B008 - pitch_scale: float = Form(...), # noqa: B008 - speed_scale: float = Form(...), # noqa: B008 + query: AudioQuery, + speaker: int, + audio_path: str, + normalize: bool, core_version: Optional[str] = None, ): """ @@ -539,28 +515,17 @@ def guided_synthesis( ) engine = get_engine(core_version) try: - accent_phrases = parse_kana(kana) - query = AudioQuery( - accent_phrases=accent_phrases, - speedScale=speed_scale, - pitchScale=pitch_scale, - intonationScale=1, - volumeScale=volume_scale, - prePhonemeLength=0.1, - postPhonemeLength=0.1, - outputSamplingRate=sample_rate, - outputStereo=stereo, - kana=kana, - ) wave = engine.guided_synthesis( - audio_file=audio_file.file, query=query, - speaker=speaker_id, + speaker=speaker, + audio_path=audio_path, normalize=normalize, ) with NamedTemporaryFile(delete=False) as f: - soundfile.write(file=f, data=wave, samplerate=sample_rate, format="WAV") + soundfile.write( + file=f, data=wave, samplerate=query.outputSamplingRate, format="WAV" + ) return FileResponse(f.name, media_type="audio/wav") except ParseKanaError as err: @@ -581,6 +546,11 @@ def guided_synthesis( status_code=500, detail="Failed in Forced Alignment.", ) + elif str(e) == "Wrong Audio Encoding Format": + raise HTTPException( + status_code=500, + detail=str(e), + ) else: raise HTTPException( status_code=500, diff --git a/voicevox_engine/dev/synthesis_engine/mock.py b/voicevox_engine/dev/synthesis_engine/mock.py index 2c026f8d3..6c7637026 100644 --- a/voicevox_engine/dev/synthesis_engine/mock.py +++ b/voicevox_engine/dev/synthesis_engine/mock.py @@ -1,6 +1,5 @@ from logging import getLogger from typing import Any, Dict, List, Optional -from typing.io import IO import numpy as np from pyopenjtalk import tts @@ -140,8 +139,9 @@ def guided_synthesis( self, query: AudioQuery, speaker: int, - audio_file: IO, - normalize: int, + audio_path: str, + normalize: bool, + core_version: Optional[str] = None, ) -> np.ndarray: """ Open jtalk doesn't have a guided function [Mock] @@ -151,7 +151,7 @@ def guided_synthesis( ---------- query speaker - audio_file + audio_path normalize Returns @@ -162,10 +162,10 @@ def guided_synthesis( def guided_accent_phrases( self, - accent_phrases: List[AccentPhrase], + query: AudioQuery, speaker: int, - audio_file: IO, - normalize: int, + audio_path: str, + normalize: bool, ) -> List[AccentPhrase]: """ guided_accent_phrases 入力accent_phrasesを変更せずにそのまま返します [Mock] @@ -174,11 
+174,11 @@ def guided_accent_phrases(
         ----------
         query
         speaker
-        audio_file
+        audio_path
         normalize
 
         Returns
         -------
         """
-        return accent_phrases
+        return query.accent_phrases
diff --git a/voicevox_engine/experimental/guided_extractor.py b/voicevox_engine/experimental/guided_extractor.py
index 8e4c280d2..9ca7c2769 100644
--- a/voicevox_engine/experimental/guided_extractor.py
+++ b/voicevox_engine/experimental/guided_extractor.py
@@ -3,7 +3,6 @@
 import tarfile
 from os.path import exists
 from pathlib import PurePath
-from typing.io import IO
 from urllib.request import urlretrieve
 
 import numpy as np
@@ -93,9 +92,12 @@ def _no_nan(num):
     return 0.0 if np.isnan(num) else num
 
 
-def extract_guided_feature(audio_file: IO, kana: str):
+def extract_guided_feature(audio_file: str, kana: str):
     _lazy_init()
-    sr, wave = wavfile.read(audio_file)
+    try:
+        sr, wave = wavfile.read(audio_file)
+    except ValueError:
+        raise Exception("Wrong Audio Encoding Format")
     # stereo to mono
     if len(wave.shape) == 2:
         wave = wave.sum(axis=1) / 2
@@ -120,6 +122,11 @@ def extract_guided_feature(audio_file: IO, kana: str):
     )
 
     phones = forced_align(julius_wave, julius_kana)
+
+    # rough boundary check: clamp nonzero f0 into a plausible range
+    f0[f0 > 6.5] = 6.5
+    f0[(0 < f0) & (f0 < 3)] = 3.0
+
     return f0, phones
 
diff --git a/voicevox_engine/synthesis_engine/synthesis_engine.py b/voicevox_engine/synthesis_engine/synthesis_engine.py
index fa54e387a..895eae048 100644
--- a/voicevox_engine/synthesis_engine/synthesis_engine.py
+++ b/voicevox_engine/synthesis_engine/synthesis_engine.py
@@ -1,7 +1,6 @@
 from copy import deepcopy
 from itertools import chain
 from typing import List, Optional, Tuple
-from typing.io import IO
 
 import numpy
 from scipy.signal import resample
@@ -492,10 +491,12 @@ def guided_synthesis(
         self,
         query: AudioQuery,
         speaker: int,
-        audio_file: IO,
-        normalize: int,
+        audio_path: str,
+        normalize: bool,
+        core_version: Optional[str] = None,
     ):
-        f0, phonemes = extract_guided_feature(audio_file, query.kana)
+        kana = create_kana(query.accent_phrases)
+        f0, phonemes = extract_guided_feature(audio_path, kana)
 
         phone_list = numpy.zeros((len(f0), OjtPhoneme.num_phoneme), dtype=numpy.float32)
 
@@ -516,13 +517,9 @@
             phone_list[s - 1 : e] = OjtPhoneme(start=s, end=e, phoneme=p).onehot
 
         if normalize:
-            f0 += get_normalize_diff(
-                engine=self, kana=query.kana, f0=f0, speaker_id=speaker
-            )
+            f0 += get_normalize_diff(engine=self, kana=kana, f0=f0, speaker_id=speaker)
 
         f0 *= 2 ** query.pitchScale
-        f0[f0 > 6.5] = 6.5
-        f0[(0 < f0) & (f0 < 3)] = 3.0
 
         f0 = resample(f0, int(len(f0) / query.speedScale))
         phone_list = resample(phone_list, int(len(phone_list) / query.speedScale))
@@ -551,13 +548,13 @@ def guided_synthesis(
 
     def guided_accent_phrases(
         self,
-        accent_phrases: List[AccentPhrase],
+        query: AudioQuery,
         speaker: int,
-        audio_file: IO,
-        normalize: int,
+        audio_path: str,
+        normalize: bool,
     ) -> List[AccentPhrase]:
-        kana = create_kana(accent_phrases=accent_phrases)
-        f0, phonemes = extract_guided_feature(audio_file, kana)
+        kana = create_kana(query.accent_phrases)
+        f0, phonemes = extract_guided_feature(audio_path, kana)
         timed_phonemes = frame_to_second(deepcopy(phonemes))
 
         phrase_info = []
@@ -577,10 +574,12 @@
                 engine=self, kana=kana, f0=f0, speaker_id=speaker
             )
             for p in phrase_info:
-                p.pitch += normalize_diff
+                if p.pitch != 0:
+                    p.pitch += normalize_diff
 
         idx = 1
-        for phrase in accent_phrases:
+        for phrase in query.accent_phrases:
+            phrase.pause_mora = None
             for mora in phrase.moras:
                 if mora.consonant is not None:
                    mora.pitch = 
( @@ -593,6 +592,8 @@ def guided_accent_phrases( mora.pitch = phrase_info[idx].pitch mora.vowel_length = phrase_info[idx].length idx += 1 + if mora.vowel in unvoiced_mora_phoneme_list: + mora.pitch = 0 if phrase_info[idx].phoneme == "sp": phrase.pause_mora = Mora( text="、", @@ -604,4 +605,4 @@ def guided_accent_phrases( ) idx += 1 - return accent_phrases + return query.accent_phrases diff --git a/voicevox_engine/synthesis_engine/synthesis_engine_base.py b/voicevox_engine/synthesis_engine/synthesis_engine_base.py index c84a213fe..485b93e81 100644 --- a/voicevox_engine/synthesis_engine/synthesis_engine_base.py +++ b/voicevox_engine/synthesis_engine/synthesis_engine_base.py @@ -1,7 +1,6 @@ import copy from abc import ABCMeta, abstractmethod from typing import List, Optional -from typing.io import IO from .. import full_context_label from ..full_context_label import extract_full_context_label @@ -231,17 +230,18 @@ def guided_synthesis( self, query: AudioQuery, speaker: int, - audio_file: IO, - normalize: int, + audio_path: str, + normalize: bool, + core_version: Optional[str] = None, ): raise NotImplementedError() @abstractmethod def guided_accent_phrases( self, - accent_phrases: List[AccentPhrase], + query: AudioQuery, speaker: int, - audio_file: IO, - normalize: int, + audio_path: str, + normalize: bool, ) -> List[AccentPhrase]: raise NotImplementedError()
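For reference, the curl walkthrough in the README section above can also be driven from Python. The sketch below is a minimal illustration, not part of this patch: it assumes an engine running at `localhost:50021` with the experimental guided APIs enabled (they answer 404 by default, per the handler in `run.py`), and `/path/to/sample.wav` is a hypothetical placeholder for a WAV recording of the example sentence.

```python
# Minimal sketch of the two guided flows over the new JSON APIs.
# Assumptions (not from this patch): engine at localhost:50021 with the
# experimental guided APIs enabled; /path/to/sample.wav is a hypothetical
# WAV recording of TEXT.
import requests

BASE = "http://localhost:50021"
TEXT = "また 東寺のように 五大明王と 呼ばれる 主要な 明王の 中央に 配されることも多い"
SPEAKER = 1
GUIDED = {"speaker": SPEAKER, "normalize": "true", "audio_path": "/path/to/sample.wav"}

# Build an AudioQuery from plain text.
query = requests.post(
    f"{BASE}/audio_query", params={"text": TEXT, "speaker": SPEAKER}
).json()

# Flow 1, frame resolution: guided_synthesis renders audio directly,
# taking timing and f0 from the external recording.
wav = requests.post(f"{BASE}/guided_synthesis", params=GUIDED, json=query).content
with open("guided_frames.wav", "wb") as f:
    f.write(wav)

# Flow 2, phrase resolution: guided_accent_phrases re-estimates the accent
# phrases from the recording; splice them into the query, then synthesize
# with the ordinary /synthesis endpoint.
query["accent_phrases"] = requests.post(
    f"{BASE}/guided_accent_phrases", params=GUIDED, json=query
).json()
wav = requests.post(f"{BASE}/synthesis", params={"speaker": SPEAKER}, json=query).content
with open("guided_phrases.wav", "wb") as f:
    f.write(wav)
```

As the README notes, the two flows operate at different resolutions (frames vs. accent phrases), so their outputs are not interchangeable.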