Allow <voice>;<vocoder> format for MaryTTS voice

rhasspy · Apr 21, 2021 · 489c6c1 · 489c6c1
1 parent d099f70
commit 489c6c1
Show file tree

Hide file tree

Showing 2 changed files with 15 additions and 2 deletions.
diff --git a/README.md b/README.md
@@ -54,6 +54,16 @@ $ docker run -it -p 59125:5002 rhasspy/larynx:<LANG>
 
 The `/process` HTTP endpoint should now work for voices formatted as `<LANG>/<VOICE>` such as `en-us/harvard-glow_tts`.
 
+You can specify the vocoder by appending `;<VOCODER>` to the MaryTTS voice, i.e. `<LANG>/<VOICE>;<VOCODER>`.
+
+For example, `en-us/harvard-glow_tts;hifi_gan:vctk_small` uses the lowest-quality (but fastest) vocoder, which is usually necessary to get decent performance on a Raspberry Pi.
+
+Available vocoders are:
+
+* `hifi_gan:universal_large` (best quality, slowest, default)
+* `hifi_gan:vctk_medium` (medium quality)
+* `hifi_gan:vctk_small` (lowest quality, fastest)
+
 ## Python Installation
 
 ```sh

diff --git a/larynx/server.py b/larynx/server.py
@@ -15,7 +15,6 @@
 from urllib.parse import parse_qs
 from uuid import uuid4
 
-import gruut_ipa
 import hypercorn
 import numpy as np
 import quart_cors
@@ -411,7 +410,11 @@ async def api_process():
         text = request.args.get("INPUT_TEXT", "")
         voice = request.args.get("VOICE", "")
 
-    wav_bytes = await text_to_wav(text, voice, vocoder=_DEFAULT_VOCODER)
+    # <VOICE> or <VOICE>;<VOCODER> -- partition() never raises when ";" is absent
+    voice, _, vocoder = voice.partition(";")
+    vocoder = vocoder or _DEFAULT_VOCODER
+
+    wav_bytes = await text_to_wav(text, voice, vocoder=vocoder)
 
     return Response(wav_bytes, mimetype="audio/wav")