From a4ae9c928c4f4ad80fdcf46668e952a4cf205bd6 Mon Sep 17 00:00:00 2001
From: Long Chen
Date: Wed, 13 Nov 2024 23:39:08 +0800
Subject: [PATCH 1/4] fix: expose multimodal agent metrics

---
 .../livekit/agents/metrics/__init__.py        |  4 ++
 livekit-agents/livekit/agents/metrics/base.py | 26 +++++++
 .../agents/multimodal/multimodal_agent.py     |  6 ++
 .../plugins/openai/realtime/realtime_model.py | 70 +++++++++++++++++--
 4 files changed, 100 insertions(+), 6 deletions(-)

diff --git a/livekit-agents/livekit/agents/metrics/__init__.py b/livekit-agents/livekit/agents/metrics/__init__.py
index fc5a00d65..668e4aaaa 100644
--- a/livekit-agents/livekit/agents/metrics/__init__.py
+++ b/livekit-agents/livekit/agents/metrics/__init__.py
@@ -1,6 +1,8 @@
 from .base import (
     AgentMetrics,
     LLMMetrics,
+    MultiModalLLMError,
+    MultiModalLLMMetrics,
     PipelineEOUMetrics,
     PipelineLLMMetrics,
     PipelineSTTMetrics,
@@ -15,6 +17,8 @@
 
 __all__ = [
     "LLMMetrics",
+    "MultiModalLLMError",
+    "MultiModalLLMMetrics",
     "AgentMetrics",
     "PipelineEOUMetrics",
     "PipelineSTTMetrics",

diff --git a/livekit-agents/livekit/agents/metrics/base.py b/livekit-agents/livekit/agents/metrics/base.py
index c1ba321f7..d71f95779 100644
--- a/livekit-agents/livekit/agents/metrics/base.py
+++ b/livekit-agents/livekit/agents/metrics/base.py
@@ -98,6 +98,31 @@ class PipelineVADMetrics(VADMetrics):
     pass
 
 
+@dataclass
+class MultiModalLLMError(Error):
+    type: str | None
+    reason: str | None = None
+    code: str | None = None
+    message: str | None = None
+
+
+@dataclass
+class MultiModalLLMMetrics(LLMMetrics):
+    @dataclass
+    class InputTokenDetails:
+        cached_tokens: int
+        text_tokens: int
+        audio_tokens: int
+
+    @dataclass
+    class OutputTokenDetails:
+        text_tokens: int
+        audio_tokens: int
+
+    input_token_details: InputTokenDetails
+    output_token_details: OutputTokenDetails
+
+
 AgentMetrics = Union[
     STTMetrics,
     LLMMetrics,
@@ -108,4 +133,5 @@ class PipelineVADMetrics(VADMetrics):
     PipelineLLMMetrics,
     PipelineTTSMetrics,
     PipelineVADMetrics,
+    MultiModalLLMMetrics,
 ]

diff --git a/livekit-agents/livekit/agents/multimodal/multimodal_agent.py b/livekit-agents/livekit/agents/multimodal/multimodal_agent.py
index 6de4d930e..9bfb09ad9 100644
--- a/livekit-agents/livekit/agents/multimodal/multimodal_agent.py
+++ b/livekit-agents/livekit/agents/multimodal/multimodal_agent.py
@@ -8,6 +8,7 @@
 from livekit import rtc
 from livekit.agents import llm, stt, tokenize, transcription, utils, vad
 from livekit.agents.llm import ChatMessage
+from livekit.agents.metrics import MultiModalLLMMetrics
 
 from .._constants import ATTRIBUTE_AGENT_STATE
 from .._types import AgentState
@@ -24,6 +25,7 @@
     "agent_speech_interrupted",
     "function_calls_collected",
     "function_calls_finished",
+    "metrics_collected",
 ]
@@ -240,6 +242,10 @@ def _function_calls_collected(fnc_call_infos: list[llm.FunctionCallInfo]):
         def _function_calls_finished(called_fncs: list[llm.CalledFunction]):
             self.emit("function_calls_finished", called_fncs)
 
+        @self._session.on("metrics_collected")
+        def _metrics_collected(metrics: MultiModalLLMMetrics):
+            self.emit("metrics_collected", metrics)
+
     def _update_state(self, state: AgentState, delay: float = 0.0):
         """Set the current state of the agent"""

diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py
index 9aaad86e0..e05f7d05c 100644
--- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py
+++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py
@@ -3,6 +3,7 @@
 import asyncio
 import base64
 import os
+import time
 from copy import deepcopy
 from dataclasses import dataclass
 from typing import AsyncIterable, Callable, Literal, Union, cast, overload
@@ -12,6 +13,7 @@
 from livekit import rtc
 from livekit.agents import llm, utils
 from livekit.agents.llm import _oai_api
+from livekit.agents.metrics import MultiModalLLMError, MultiModalLLMMetrics
 from typing_extensions import TypedDict
 
 from . import api_proto, remote_items
@@ -33,6 +35,7 @@
     "response_done",
     "function_calls_collected",
     "function_calls_finished",
+    "metrics_collected",
 ]
@@ -66,6 +69,10 @@ class RealtimeResponse:
     """usage of the response"""
     done_fut: asyncio.Future[None]
     """future that will be set when the response is completed"""
+    created_timestamp: float
+    """timestamp when the response was created"""
+    first_token_timestamp: float | None = None
+    """timestamp when the first token was received"""
 
 
 @dataclass
@@ -703,6 +710,7 @@ def __init__(
         loop: asyncio.AbstractEventLoop,
     ) -> None:
         super().__init__()
+        self._label = f"{type(self).__module__}.{type(self).__name__}"
         self._main_atask = asyncio.create_task(
             self._main_task(), name="openai-realtime-session"
         )
@@ -1210,6 +1218,7 @@ def _handle_response_created(
             output=[],
             usage=response.get("usage"),
             done_fut=done_fut,
+            created_timestamp=time.time(),
         )
         self._pending_responses[new_response.id] = new_response
         self.emit("response_created", new_response)
@@ -1264,6 +1273,7 @@ def _handle_response_content_part_added(
             content_type=content_type,
         )
         output.content.append(new_content)
+        response.first_token_timestamp = time.time()
         self.emit("response_content_added", new_content)
 
     def _handle_response_audio_delta(
@@ -1368,15 +1378,19 @@ def _handle_response_done(self, response_done: api_proto.ServerEvent.ResponseDon
         response.status_details = response_data.get("status_details")
         response.usage = response_data.get("usage")
 
+        metrics_error = None
+        cancelled = False
         if response.status == "failed":
             assert response.status_details is not None
-            error = response.status_details.get("error")
-            code: str | None = None
-            message: str | None = None
-            if error is not None:
-                code = error.get("code")  # type: ignore
-                message = error.get("message")  # type: ignore
+            error = response.status_details.get("error", {})
+            code: str | None = error.get("code")  # type: ignore
+            message: str | None = error.get("message")  # type: ignore
+            metrics_error = MultiModalLLMError(
+                type=response.status_details.get("type"),
+                code=code,
+                message=message,
+            )
 
             logger.error(
                 "response generation failed",
@@ -1386,13 +1400,57 @@ def _handle_response_done(self, response_done: api_proto.ServerEvent.ResponseDon
             assert response.status_details is not None
             reason = response.status_details.get("reason")
 
+            metrics_error = MultiModalLLMError(
+                type=response.status_details.get("type"),
+                reason=reason,  # type: ignore
+            )
+
             logger.warning(
                 "response generation incomplete",
                 extra={"reason": reason, **self.logging_extra()},
             )
+        elif response.status == "cancelled":
+            cancelled = True
 
         self.emit("response_done", response)
 
+        # calculate metrics
+        ttft = -1.0
+        if response.first_token_timestamp is not None:
+            ttft = response.first_token_timestamp - response.created_timestamp
+        duration = time.time() - response.created_timestamp
+
+        usage = response.usage or {}  # type: ignore
+        metrics = MultiModalLLMMetrics(
+            timestamp=response.created_timestamp,
+            request_id=response.id,
+            ttft=ttft,
+            duration=duration,
+            cancelled=cancelled,
+            label=self._label,
+            completion_tokens=usage.get("output_tokens", 0),
+            prompt_tokens=usage.get("input_tokens", 0),
+            total_tokens=usage.get("total_tokens", 0),
+            tokens_per_second=usage.get("output_tokens", 0) / duration,
+            error=metrics_error,
+            input_token_details=MultiModalLLMMetrics.InputTokenDetails(
+                cached_tokens=usage.get("input_token_details", {}).get(
+                    "cached_tokens", 0
+                ),
+                text_tokens=usage.get("input_token_details", {}).get("text_tokens", 0),
+                audio_tokens=usage.get("input_token_details", {}).get(
+                    "audio_tokens", 0
+                ),
+            ),
+            output_token_details=MultiModalLLMMetrics.OutputTokenDetails(
+                text_tokens=usage.get("output_token_details", {}).get("text_tokens", 0),
+                audio_tokens=usage.get("output_token_details", {}).get(
+                    "audio_tokens", 0
+                ),
+            ),
+        )
+        self.emit("metrics_collected", metrics)
+
     def _get_content(self, ptr: _ContentPtr) -> RealtimeContent:
         response = self._pending_responses[ptr["response_id"]]
         output = response.output[ptr["output_index"]]
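Note: with this patch applied, applications can observe these metrics by subscribing to the new "metrics_collected" event, mirroring the `@self._session.on(...)` pattern used above. A minimal consumer-side sketch (the agent construction is illustrative; `model` stands for an already-configured `openai.realtime.RealtimeModel`, and patch 4/4 below renames the metrics class to `MultimodalLLMMetrics`):

    from livekit.agents.metrics import MultiModalLLMMetrics
    from livekit.agents.multimodal import MultimodalAgent

    agent = MultimodalAgent(model=model)  # model: assumed RealtimeModel instance

    @agent.on("metrics_collected")
    def _on_metrics(metrics: MultiModalLLMMetrics) -> None:
        # ttft is -1.0 when no content part was received (e.g. a failed response)
        print(
            f"ttft={metrics.ttft:.3f}s duration={metrics.duration:.3f}s "
            f"tokens/s={metrics.tokens_per_second:.1f}"
        )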
From 42c450230050a79a800cb9658c4a8cf0ca343c81 Mon Sep 17 00:00:00 2001
From: Long Chen
Date: Thu, 14 Nov 2024 10:12:35 +0800
Subject: [PATCH 2/4] fix: mark timestamp states in RealtimeResponse private

---
 .../plugins/openai/realtime/realtime_model.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py
index e05f7d05c..c735b2c26 100644
--- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py
+++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py
@@ -69,9 +69,9 @@ class RealtimeResponse:
     """usage of the response"""
     done_fut: asyncio.Future[None]
     """future that will be set when the response is completed"""
-    created_timestamp: float
+    _created_timestamp: float
     """timestamp when the response was created"""
-    first_token_timestamp: float | None = None
+    _first_token_timestamp: float | None = None
     """timestamp when the first token was received"""
@@ -1218,7 +1218,7 @@ def _handle_response_created(
             output=[],
             usage=response.get("usage"),
             done_fut=done_fut,
-            created_timestamp=time.time(),
+            _created_timestamp=time.time(),
         )
         self._pending_responses[new_response.id] = new_response
         self.emit("response_created", new_response)
@@ -1273,7 +1273,7 @@ def _handle_response_content_part_added(
             content_type=content_type,
         )
         output.content.append(new_content)
-        response.first_token_timestamp = time.time()
+        response._first_token_timestamp = time.time()
         self.emit("response_content_added", new_content)
@@ -1416,13 +1416,13 @@ def _handle_response_done(self, response_done: api_proto.ServerEvent.ResponseDon
 
         # calculate metrics
         ttft = -1.0
-        if response.first_token_timestamp is not None:
-            ttft = response.first_token_timestamp - response.created_timestamp
-        duration = time.time() - response.created_timestamp
+        if response._first_token_timestamp is not None:
+            ttft = response._first_token_timestamp - response._created_timestamp
+        duration = time.time() - response._created_timestamp
 
         usage = response.usage or {}  # type: ignore
         metrics = MultiModalLLMMetrics(
-            timestamp=response.created_timestamp,
+            timestamp=response._created_timestamp,
             request_id=response.id,
             ttft=ttft,
             duration=duration,
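Note: the underscore prefix only marks these fields private by convention; a dataclass field named with a leading underscore still becomes an `__init__` keyword argument under the same name, which is why `_handle_response_created` above passes `_created_timestamp=time.time()` directly. A standalone illustration of that Python behavior (the class name is hypothetical):

    import time
    from dataclasses import dataclass

    @dataclass
    class _Example:
        _created_timestamp: float
        _first_token_timestamp: float | None = None

    # underscore-prefixed fields remain ordinary keyword arguments
    e = _Example(_created_timestamp=time.time())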
From a47d987bdeeee0c601a5cb164488d8a6e9cbe04c Mon Sep 17 00:00:00 2001
From: Long Chen
Date: Thu, 14 Nov 2024 10:16:18 +0800
Subject: [PATCH 3/4] chore: add changeset

---
 .changeset/sharp-islands-flow.md | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 .changeset/sharp-islands-flow.md

diff --git a/.changeset/sharp-islands-flow.md b/.changeset/sharp-islands-flow.md
new file mode 100644
index 000000000..44dfe6d2c
--- /dev/null
+++ b/.changeset/sharp-islands-flow.md
@@ -0,0 +1,6 @@
+---
+"livekit-plugins-openai": patch
+"livekit-agents": patch
+---
+
+expose multimodal agent metrics
From 601ed19fba6cf5bd7559bb80553c63b11c19b03c Mon Sep 17 00:00:00 2001
From: Long Chen
Date: Thu, 14 Nov 2024 22:26:17 +0800
Subject: [PATCH 4/4] fix: rename to MultimodalLLMMetrics

---
 .changeset/{sharp-islands-flow.md => tough-boats-appear.md} |  2 +-
 livekit-agents/livekit/agents/metrics/__init__.py           |  8 ++++----
 livekit-agents/livekit/agents/metrics/base.py               |  6 +++---
 .../livekit/agents/multimodal/multimodal_agent.py           |  4 ++--
 .../plugins/openai/realtime/realtime_model.py               | 12 ++++++------
 5 files changed, 16 insertions(+), 16 deletions(-)
 rename .changeset/{sharp-islands-flow.md => tough-boats-appear.md} (67%)

diff --git a/.changeset/sharp-islands-flow.md b/.changeset/tough-boats-appear.md
similarity index 67%
rename from .changeset/sharp-islands-flow.md
rename to .changeset/tough-boats-appear.md
index 44dfe6d2c..80b77cd33 100644
--- a/.changeset/sharp-islands-flow.md
+++ b/.changeset/tough-boats-appear.md
@@ -3,4 +3,4 @@
 "livekit-agents": patch
 ---
 
-expose multimodal agent metrics
+Expose multimodal agent metrics

diff --git a/livekit-agents/livekit/agents/metrics/__init__.py b/livekit-agents/livekit/agents/metrics/__init__.py
index 668e4aaaa..a61f430e8 100644
--- a/livekit-agents/livekit/agents/metrics/__init__.py
+++ b/livekit-agents/livekit/agents/metrics/__init__.py
@@ -1,8 +1,8 @@
 from .base import (
     AgentMetrics,
     LLMMetrics,
-    MultiModalLLMError,
-    MultiModalLLMMetrics,
+    MultimodalLLMError,
+    MultimodalLLMMetrics,
     PipelineEOUMetrics,
     PipelineLLMMetrics,
     PipelineSTTMetrics,
@@ -17,8 +17,8 @@
 
 __all__ = [
     "LLMMetrics",
-    "MultiModalLLMError",
-    "MultiModalLLMMetrics",
+    "MultimodalLLMError",
+    "MultimodalLLMMetrics",
     "AgentMetrics",
     "PipelineEOUMetrics",
     "PipelineSTTMetrics",

diff --git a/livekit-agents/livekit/agents/metrics/base.py b/livekit-agents/livekit/agents/metrics/base.py
index d71f95779..78d09e4f2 100644
--- a/livekit-agents/livekit/agents/metrics/base.py
+++ b/livekit-agents/livekit/agents/metrics/base.py
@@ -99,7 +99,7 @@ class PipelineVADMetrics(VADMetrics):
 
 @dataclass
-class MultiModalLLMError(Error):
+class MultimodalLLMError(Error):
     type: str | None
     reason: str | None = None
     code: str | None = None
     message: str | None = None
@@ -107,7 +107,7 @@ class MultiModalLLMError(Error):
 
 @dataclass
-class MultiModalLLMMetrics(LLMMetrics):
+class MultimodalLLMMetrics(LLMMetrics):
     @dataclass
     class InputTokenDetails:
         cached_tokens: int
@@ -133,5 +133,5 @@ class OutputTokenDetails:
     PipelineLLMMetrics,
     PipelineTTSMetrics,
     PipelineVADMetrics,
-    MultiModalLLMMetrics,
+    MultimodalLLMMetrics,
 ]

diff --git a/livekit-agents/livekit/agents/multimodal/multimodal_agent.py b/livekit-agents/livekit/agents/multimodal/multimodal_agent.py
index 9bfb09ad9..e7baa4cb9 100644
--- a/livekit-agents/livekit/agents/multimodal/multimodal_agent.py
+++ b/livekit-agents/livekit/agents/multimodal/multimodal_agent.py
@@ -8,7 +8,7 @@
 from livekit import rtc
 from livekit.agents import llm, stt, tokenize, transcription, utils, vad
 from livekit.agents.llm import ChatMessage
-from livekit.agents.metrics import MultiModalLLMMetrics
+from livekit.agents.metrics import MultimodalLLMMetrics
 
 from .._constants import ATTRIBUTE_AGENT_STATE
 from .._types import AgentState
@@ -243,7 +243,7 @@ def _function_calls_finished(called_fncs: list[llm.CalledFunction]):
             self.emit("function_calls_finished", called_fncs)
 
         @self._session.on("metrics_collected")
-        def _metrics_collected(metrics: MultiModalLLMMetrics):
+        def _metrics_collected(metrics: MultimodalLLMMetrics):
             self.emit("metrics_collected", metrics)
 
     def _update_state(self, state: AgentState, delay: float = 0.0):

diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py
index c735b2c26..e77f94e07 100644
--- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py
+++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py
@@ -13,7 +13,7 @@
 from livekit import rtc
 from livekit.agents import llm, utils
 from livekit.agents.llm import _oai_api
-from livekit.agents.metrics import MultiModalLLMError, MultiModalLLMMetrics
+from livekit.agents.metrics import MultimodalLLMError, MultimodalLLMMetrics
 from typing_extensions import TypedDict
 
 from . import api_proto, remote_items
@@ -1386,7 +1386,7 @@ def _handle_response_done(self, response_done: api_proto.ServerEvent.ResponseDon
             error = response.status_details.get("error", {})
             code: str | None = error.get("code")  # type: ignore
             message: str | None = error.get("message")  # type: ignore
-            metrics_error = MultiModalLLMError(
+            metrics_error = MultimodalLLMError(
                 type=response.status_details.get("type"),
                 code=code,
                 message=message,
@@ -1400,7 +1400,7 @@ def _handle_response_done(self, response_done: api_proto.ServerEvent.ResponseDon
             assert response.status_details is not None
             reason = response.status_details.get("reason")
 
-            metrics_error = MultiModalLLMError(
+            metrics_error = MultimodalLLMError(
                 type=response.status_details.get("type"),
                 reason=reason,  # type: ignore
             )
@@ -1421,7 +1421,7 @@ def _handle_response_done(self, response_done: api_proto.ServerEvent.ResponseDon
         duration = time.time() - response._created_timestamp
 
         usage = response.usage or {}  # type: ignore
-        metrics = MultiModalLLMMetrics(
+        metrics = MultimodalLLMMetrics(
             timestamp=response._created_timestamp,
             request_id=response.id,
             ttft=ttft,
@@ -1433,7 +1433,7 @@ def _handle_response_done(self, response_done: api_proto.ServerEvent.ResponseDon
             total_tokens=usage.get("total_tokens", 0),
             tokens_per_second=usage.get("output_tokens", 0) / duration,
             error=metrics_error,
-            input_token_details=MultiModalLLMMetrics.InputTokenDetails(
+            input_token_details=MultimodalLLMMetrics.InputTokenDetails(
                 cached_tokens=usage.get("input_token_details", {}).get(
                     "cached_tokens", 0
                 ),
                 text_tokens=usage.get("input_token_details", {}).get("text_tokens", 0),
                 audio_tokens=usage.get("input_token_details", {}).get(
                     "audio_tokens", 0
                 ),
             ),
-            output_token_details=MultiModalLLMMetrics.OutputTokenDetails(
+            output_token_details=MultimodalLLMMetrics.OutputTokenDetails(
                 text_tokens=usage.get("output_token_details", {}).get("text_tokens", 0),
                 audio_tokens=usage.get("output_token_details", {}).get(
                     "audio_tokens", 0
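Note: the `usage` object these metrics are read from is taken from the Realtime API's "response.done" server event. A rough sketch of its shape, for reference when interpreting the fields above (the keys are exactly the ones the handler reads; the numbers are made up):

    usage = {
        "total_tokens": 250,
        "input_tokens": 220,
        "output_tokens": 30,
        "input_token_details": {
            "cached_tokens": 200,
            "text_tokens": 120,
            "audio_tokens": 100,
        },
        "output_token_details": {"text_tokens": 10, "audio_tokens": 20},
    }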