Skip to content

Commit

Permalink
Merge pull request #1024 from pipecat-ai/mb/elevenlabs-http
Browse files Browse the repository at this point in the history
Add ElevenLabsHttpTTSService
  • Loading branch information
markbackman authored Jan 18, 2025
2 parents e9162ae + b81323d commit e159f2d
Show file tree
Hide file tree
Showing 2 changed files with 166 additions and 2 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added

- Added `ElevenLabsHttpTTSService` and the
`07d-interruptible-elevenlabs-http.py` foundational example.

- Introduced pipeline frame observers. Observers can view all the frames that go
through the pipeline without the need to inject processors in the
pipeline. This can be useful, for example, to implement frame loggers or
Expand Down
165 changes: 163 additions & 2 deletions src/pipecat/services/elevenlabs.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,17 @@
import asyncio
import base64
import json
from typing import Any, AsyncGenerator, Dict, List, Literal, Mapping, Optional, Tuple
from typing import Any, AsyncGenerator, Dict, List, Literal, Mapping, Optional, Tuple, Union

import aiohttp
from loguru import logger
from pydantic import BaseModel, model_validator

from pipecat.frames.frames import (
BotStoppedSpeakingFrame,
CancelFrame,
EndFrame,
ErrorFrame,
Frame,
LLMFullResponseEndFrame,
StartFrame,
Expand All @@ -26,13 +28,15 @@
TTSStoppedFrame,
)
from pipecat.processors.frame_processor import FrameDirection
from pipecat.services.ai_services import WordTTSService
from pipecat.services.ai_services import TTSService, WordTTSService
from pipecat.services.websocket_service import WebsocketService
from pipecat.transcriptions.language import Language

# See .env.example for ElevenLabs configuration needed
try:
import websockets
from elevenlabs import VoiceSettings
from elevenlabs.client import ElevenLabs
except ModuleNotFoundError as e:
logger.error(f"Exception: {e}")
logger.error(
Expand Down Expand Up @@ -418,3 +422,160 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
yield None
except Exception as e:
logger.error(f"{self} exception: {e}")


class ElevenLabsHttpTTSService(TTSService):
    """ElevenLabs Text-to-Speech service using HTTP streaming.

    Streams audio from ElevenLabs over a chunked HTTP response instead of a
    websocket connection.

    Args:
        api_key: ElevenLabs API key
        voice_id: ID of the voice to use
        aiohttp_session: aiohttp ClientSession used for the HTTP requests
        model: Model ID (default: "eleven_flash_v2_5" for low latency)
        base_url: API base URL
        output_format: Audio output format (PCM)
        params: Additional parameters for voice configuration
    """

    class InputParams(BaseModel):
        # Optional generation tuning. Note: `style` and `use_speaker_boost`
        # only take effect when BOTH `stability` and `similarity_boost` are
        # set (see _set_voice_settings).
        language: Optional[Language] = Language.EN
        optimize_streaming_latency: Optional[int] = None
        stability: Optional[float] = None
        similarity_boost: Optional[float] = None
        style: Optional[float] = None
        use_speaker_boost: Optional[bool] = None

    def __init__(
        self,
        *,
        api_key: str,
        voice_id: str,
        aiohttp_session: aiohttp.ClientSession,
        model: str = "eleven_flash_v2_5",
        base_url: str = "https://api.elevenlabs.io",
        output_format: ElevenLabsOutputFormat = "pcm_24000",
        # NOTE(review): mutable default evaluated once at def time; kept for
        # signature consistency with the websocket service in this file.
        params: InputParams = InputParams(),
        **kwargs,
    ):
        super().__init__(sample_rate=sample_rate_from_output_format(output_format), **kwargs)

        self._api_key = api_key
        self._base_url = base_url
        self._output_format = output_format
        self._params = params
        self._session = aiohttp_session

        self._settings = {
            "sample_rate": sample_rate_from_output_format(output_format),
            "language": self.language_to_service_language(params.language)
            if params.language
            else "en",
            "output_format": output_format,
            "optimize_streaming_latency": params.optimize_streaming_latency,
            "stability": params.stability,
            "similarity_boost": params.similarity_boost,
            "style": params.style,
            "use_speaker_boost": params.use_speaker_boost,
        }
        self.set_model_name(model)
        self.set_voice(voice_id)
        self._voice_settings = self._set_voice_settings()

    def can_generate_metrics(self) -> bool:
        # TTFB and usage metrics are reported from run_tts().
        return True

    def _set_voice_settings(self) -> Optional[Dict[str, Union[float, bool]]]:
        """Build the voice settings mapping from the configured parameters.

        Returns:
            Dictionary of voice settings, or None when `stability` and
            `similarity_boost` are not both provided (the API requires both
            for any voice settings to apply).
        """
        voice_settings: Dict[str, Union[float, bool]] = {}
        if (
            self._settings["stability"] is not None
            and self._settings["similarity_boost"] is not None
        ):
            voice_settings["stability"] = float(self._settings["stability"])
            voice_settings["similarity_boost"] = float(self._settings["similarity_boost"])
            if self._settings["style"] is not None:
                voice_settings["style"] = float(self._settings["style"])
            if self._settings["use_speaker_boost"] is not None:
                voice_settings["use_speaker_boost"] = bool(self._settings["use_speaker_boost"])
        else:
            # Warn about silently-dropped options so misconfiguration is visible.
            if self._settings["style"] is not None:
                logger.warning(
                    "'style' is set but will not be applied because 'stability' and 'similarity_boost' are not both set."
                )
            if self._settings["use_speaker_boost"] is not None:
                logger.warning(
                    "'use_speaker_boost' is set but will not be applied because 'stability' and 'similarity_boost' are not both set."
                )

        return voice_settings or None

    async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
        """Generate speech from text using the ElevenLabs streaming HTTP API.

        Args:
            text: The text to convert to speech

        Yields:
            TTSStartedFrame, then TTSAudioRawFrame chunks as audio streams in,
            then exactly one TTSStoppedFrame; ErrorFrame on API or transport
            failure.
        """
        logger.debug(f"Generating TTS: [{text}]")

        url = f"{self._base_url}/v1/text-to-speech/{self._voice_id}/stream"

        payload = {
            "text": text,
            "model_id": self._model_name,
        }

        if self._voice_settings:
            # FIX: pass the settings dict directly. The request is sent with
            # json=payload, so json.dumps() here would double-serialize and
            # the API would receive a string instead of an object.
            payload["voice_settings"] = self._voice_settings

        if self._settings["language"]:
            # NOTE(review): ElevenLabs accepts language_code only with certain
            # models (e.g. Flash/Turbo v2.5) — confirm against the model used.
            payload["language_code"] = self._settings["language"]

        headers = {
            "xi-api-key": self._api_key,
            "Content-Type": "application/json",
        }

        # Query parameters (sent on the URL, not in the JSON body).
        params = {
            "output_format": self._output_format,
        }
        if self._settings["optimize_streaming_latency"] is not None:
            params["optimize_streaming_latency"] = self._settings["optimize_streaming_latency"]

        logger.debug(f"ElevenLabs request - payload: {payload}, params: {params}")

        started = False
        try:
            await self.start_ttfb_metrics()

            async with self._session.post(
                url, json=payload, headers=headers, params=params
            ) as response:
                if response.status != 200:
                    error_text = await response.text()
                    logger.error(f"{self} error: {error_text}")
                    yield ErrorFrame(error=f"ElevenLabs API error: {error_text}")
                    return

                await self.start_tts_usage_metrics(text)
                yield TTSStartedFrame()
                started = True

                async for chunk in response.content:
                    if chunk:
                        # First chunk marks time-to-first-byte.
                        await self.stop_ttfb_metrics()
                        yield TTSAudioRawFrame(chunk, self._settings["sample_rate"], 1)

        except Exception as e:
            logger.error(f"Error in run_tts: {e}")
            yield ErrorFrame(error=str(e))

        finally:
            # FIX: emit exactly one TTSStoppedFrame, and only when a matching
            # TTSStartedFrame was emitted. Previously the frame was yielded
            # twice on success (end of try + finally) and yielded unbalanced
            # on the early non-200 return path.
            if started:
                yield TTSStoppedFrame()

0 comments on commit e159f2d

Please sign in to comment.