diff --git a/.changeset/violet-students-shout.md b/.changeset/violet-students-shout.md
new file mode 100644
index 000000000..0ac116426
--- /dev/null
+++ b/.changeset/violet-students-shout.md
@@ -0,0 +1,5 @@
+---
+"livekit-plugins-elevenlabs": patch
+---
+
+gracefully error on non-PCM data
diff --git a/livekit-plugins/livekit-plugins-elevenlabs/livekit/plugins/elevenlabs/tts.py b/livekit-plugins/livekit-plugins-elevenlabs/livekit/plugins/elevenlabs/tts.py
index 701c5db7c..dd509c11a 100644
--- a/livekit-plugins/livekit-plugins-elevenlabs/livekit/plugins/elevenlabs/tts.py
+++ b/livekit-plugins/livekit-plugins-elevenlabs/livekit/plugins/elevenlabs/tts.py
@@ -156,6 +156,8 @@ def __init__(
     ) -> None:
         super().__init__()
         self._text, self._opts, self._session = text, opts, session
+        if _encoding_from_format(self._opts.encoding) == "mp3":
+            self._mp3_decoder = utils.codecs.Mp3StreamDecoder()
 
     @utils.log_exceptions(logger=logger)
     async def _main_task(self) -> None:
@@ -181,21 +183,39 @@ async def _main_task(self) -> None:
             headers={AUTHORIZATION_HEADER: self._opts.api_key},
             json=data,
         ) as resp:
-            async for bytes_data, _ in resp.content.iter_chunks():
-                for frame in bstream.write(bytes_data):
+            if not resp.content_type.startswith("audio/"):
+                content = await resp.text()
+                logger.error("11labs returned non-audio data: %s", content)
+                return
+            encoding = _encoding_from_format(self._opts.encoding)
+            if encoding == "mp3":
+                async for bytes_data, _ in resp.content.iter_chunks():
+                    for frame in self._mp3_decoder.decode_chunk(bytes_data):
+                        self._event_ch.send_nowait(
+                            tts.SynthesizedAudio(
+                                request_id=request_id,
+                                segment_id=segment_id,
+                                frame=frame,
+                            )
+                        )
+            else:
+                async for bytes_data, _ in resp.content.iter_chunks():
+                    for frame in bstream.write(bytes_data):
+                        self._event_ch.send_nowait(
+                            tts.SynthesizedAudio(
+                                request_id=request_id,
+                                segment_id=segment_id,
+                                frame=frame,
+                            )
+                        )
+
+                for frame in bstream.flush():
                     self._event_ch.send_nowait(
                         tts.SynthesizedAudio(
                             request_id=request_id, segment_id=segment_id, frame=frame
                         )
                     )
 
-            for frame in bstream.flush():
-                self._event_ch.send_nowait(
-                    tts.SynthesizedAudio(
-                        request_id=request_id, segment_id=segment_id, frame=frame
-                    )
-                )
-
 
 class SynthesizeStream(tts.SynthesizeStream):
     """Streamed API using websockets"""
@@ -388,11 +408,11 @@ def _synthesize_url(opts: _TTSOptions) -> str:
     base_url = opts.base_url
     voice_id = opts.voice.id
     model_id = opts.model_id
-    sample_rate = _sample_rate_from_format(opts.encoding)
+    output_format = opts.encoding
     latency = opts.streaming_latency
     return (
         f"{base_url}/text-to-speech/{voice_id}/stream?"
-        f"model_id={model_id}&output_format=pcm_{sample_rate}&optimize_streaming_latency={latency}"
+        f"model_id={model_id}&output_format={output_format}&optimize_streaming_latency={latency}"
     )
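
Reviewer note: the sketch below is illustrative only and is not part of the patch. It condenses the new response handling into a standalone async generator so the two branches (MP3 decode vs. raw PCM framing) are easier to compare side by side. The helper name `_frames_from_response` and its parameters are hypothetical; `Mp3StreamDecoder.decode_chunk()` and `AudioByteStream.write()`/`flush()` are assumed to behave exactly as they are used in the diff above.

```python
from livekit.agents import utils


async def _frames_from_response(resp, encoding: str, sample_rate: int, num_channels: int = 1):
    # Hypothetical helper, not part of the plugin. `resp` is an aiohttp response.
    # Non-audio responses (e.g. JSON error bodies) are rejected up front instead
    # of being fed into the PCM/MP3 pipeline.
    if not resp.content_type.startswith("audio/"):
        raise ValueError(await resp.text())

    if encoding == "mp3":
        # MP3 output: each HTTP chunk is run through the streaming decoder.
        decoder = utils.codecs.Mp3StreamDecoder()
        async for chunk, _ in resp.content.iter_chunks():
            for frame in decoder.decode_chunk(chunk):
                yield frame
    else:
        # PCM output: raw bytes are framed by AudioByteStream, then flushed at the end.
        bstream = utils.audio.AudioByteStream(sample_rate, num_channels)
        async for chunk, _ in resp.content.iter_chunks():
            for frame in bstream.write(chunk):
                yield frame
        for frame in bstream.flush():
            yield frame
```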