elevenlabs: error on non-PCM data

brittbinler · Aug 3, 2024 · 297db92
1 parent 88e75d7
commit 297db92
Show file tree

Hide file tree

Showing 2 changed files with 36 additions and 11 deletions.
diff --git a/.changeset/violet-students-shout.md b/.changeset/violet-students-shout.md
@@ -0,0 +1,5 @@
+---
+"livekit-plugins-elevenlabs": patch
+---
+
+fail gracefully (log an error and return) when ElevenLabs responds with non-audio data
diff --git a/livekit-plugins/livekit-plugins-elevenlabs/livekit/plugins/elevenlabs/tts.py b/livekit-plugins/livekit-plugins-elevenlabs/livekit/plugins/elevenlabs/tts.py
@@ -156,6 +156,8 @@ def __init__(
     ) -> None:
         super().__init__()
         self._text, self._opts, self._session = text, opts, session
+        if _encoding_from_format(self._opts.encoding) == "mp3":
+            self._mp3_decoder = utils.codecs.Mp3StreamDecoder()
 
     @utils.log_exceptions(logger=logger)
     async def _main_task(self) -> None:
@@ -181,21 +183,39 @@ async def _main_task(self) -> None:
             headers={AUTHORIZATION_HEADER: self._opts.api_key},
             json=data,
         ) as resp:
-            async for bytes_data, _ in resp.content.iter_chunks():
-                for frame in bstream.write(bytes_data):
+            if not resp.content_type.startswith("audio/"):
+                content = await resp.text()
+                logger.error("11labs returned non-audio data: %s", content)
+                return
+            encoding = _encoding_from_format(self._opts.encoding)
+            if encoding == "mp3":
+                async for bytes_data, _ in resp.content.iter_chunks():
+                    for frame in self._mp3_decoder.decode_chunk(bytes_data):
+                        self._event_ch.send_nowait(
+                            tts.SynthesizedAudio(
+                                request_id=request_id,
+                                segment_id=segment_id,
+                                frame=frame,
+                            )
+                        )
+            else:
+                async for bytes_data, _ in resp.content.iter_chunks():
+                    for frame in bstream.write(bytes_data):
+                        self._event_ch.send_nowait(
+                            tts.SynthesizedAudio(
+                                request_id=request_id,
+                                segment_id=segment_id,
+                                frame=frame,
+                            )
+                        )
+
+                for frame in bstream.flush():
                     self._event_ch.send_nowait(
                         tts.SynthesizedAudio(
                             request_id=request_id, segment_id=segment_id, frame=frame
                         )
                     )
 
-            for frame in bstream.flush():
-                self._event_ch.send_nowait(
-                    tts.SynthesizedAudio(
-                        request_id=request_id, segment_id=segment_id, frame=frame
-                    )
-                )
-
 
 class SynthesizeStream(tts.SynthesizeStream):
     """Streamed API using websockets"""
@@ -388,11 +408,11 @@ def _synthesize_url(opts: _TTSOptions) -> str:
     base_url = opts.base_url
     voice_id = opts.voice.id
     model_id = opts.model_id
-    sample_rate = _sample_rate_from_format(opts.encoding)
+    output_format = opts.encoding
     latency = opts.streaming_latency
     return (
         f"{base_url}/text-to-speech/{voice_id}/stream?"
-        f"model_id={model_id}&output_format=pcm_{sample_rate}&optimize_streaming_latency={latency}"
+        f"model_id={model_id}&output_format={output_format}&optimize_streaming_latency={latency}"
     )