From 4e6babac612c20b1a8d9121d39fe57902d22228f Mon Sep 17 00:00:00 2001
From: Ben Cherry
Date: Thu, 26 Sep 2024 13:52:40 -0700
Subject: [PATCH] Hotfix for new API format (#67)

Co-authored-by: aoife cassidy
---
 .changeset/dirty-cycles-suffer.md          |   5 +
 examples/src/minimal_assistant.ts          |   8 +-
 plugins/openai/src/omni_assistant/index.ts | 237 +++--
 plugins/openai/src/omni_assistant/proto.ts | 781 ++++++++++++++-------
 4 files changed, 667 insertions(+), 364 deletions(-)
 create mode 100644 .changeset/dirty-cycles-suffer.md

diff --git a/.changeset/dirty-cycles-suffer.md b/.changeset/dirty-cycles-suffer.md
new file mode 100644
index 00000000..5ae45df7
--- /dev/null
+++ b/.changeset/dirty-cycles-suffer.md
@@ -0,0 +1,5 @@
+---
+"@livekit/agents-plugin-openai": minor
+---
+
+Hotfix for new API format
diff --git a/examples/src/minimal_assistant.ts b/examples/src/minimal_assistant.ts
index 2efad8c4..bcb44f67 100644
--- a/examples/src/minimal_assistant.ts
+++ b/examples/src/minimal_assistant.ts
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 import { type JobContext, WorkerOptions, cli, defineAgent } from '@livekit/agents';
-import { OmniAssistant, defaultConversationConfig } from '@livekit/agents-plugin-openai';
+import { OmniAssistant, defaultSessionConfig } from '@livekit/agents-plugin-openai';
 import { fileURLToPath } from 'node:url';
 import { z } from 'zod';
@@ -13,9 +13,9 @@ export default defineAgent({
     console.log('starting assistant example agent');

     const assistant = new OmniAssistant({
-      conversationConfig: {
-        ...defaultConversationConfig,
-        system_message: 'You are a helpful assistant.',
+      sessionConfig: {
+        ...defaultSessionConfig,
+        instructions: 'You are a helpful assistant.',
       },
       functions: {
         weather: {
diff --git a/plugins/openai/src/omni_assistant/index.ts b/plugins/openai/src/omni_assistant/index.ts
index b10867cf..1ef33c33 100644
--- a/plugins/openai/src/omni_assistant/index.ts
+++ b/plugins/openai/src/omni_assistant/index.ts
@@ -26,35 +26,30 @@ import * as proto from './proto.js';
 import { BasicTranscriptionForwarder } from './transcription_forwarder.js';

 /** @hidden */
-export const defaultSessionConfig: proto.SessionConfig = {
-  turn_detection: 'server_vad',
-  input_audio_format: proto.AudioFormat.PCM16,
-  transcribe_input: true,
-  vad: {
+export const defaultSessionConfig: Partial<proto.SessionResource> = {
+  turn_detection: {
+    type: 'server_vad',
     threshold: 0.5,
     prefix_padding_ms: 300,
     silence_duration_ms: 200,
   },
-};
-
-/** @hidden */
-export const defaultConversationConfig: proto.ConversationConfig = {
-  system_message: 'You are a helpful assistant.',
+  input_audio_format: proto.AudioFormat.PCM16,
+  input_audio_transcription: {
+    model: 'whisper-1',
+  },
+  modalities: ['text', 'audio'],
+  instructions: 'You are a helpful assistant.',
   voice: proto.Voice.ALLOY,
-  subscribe_to_user_audio: true,
   output_audio_format: proto.AudioFormat.PCM16,
   tools: [],
   tool_choice: proto.ToolChoice.AUTO,
   temperature: 0.8,
-  max_tokens: 2048,
-  disable_audio: false,
-  transcribe_input: true,
+  // max_output_tokens: 2048,
 };

 type ImplOptions = {
   apiKey: string;
-  sessionConfig: proto.SessionConfig;
-  conversationConfig: proto.ConversationConfig;
+  sessionConfig: Partial<proto.SessionResource>;
   functions: llm.FunctionContext;
 };
@@ -68,12 +63,10 @@ export class OmniAssistant {

   constructor({
     sessionConfig = defaultSessionConfig,
-    conversationConfig = defaultConversationConfig,
     functions = {},
     apiKey = process.env.OPENAI_API_KEY || '',
   }: {
-    sessionConfig?: proto.SessionConfig;
-    conversationConfig?: proto.ConversationConfig;
+    sessionConfig?: Partial<proto.SessionResource>;
     functions?: llm.FunctionContext;
     apiKey?: string;
   }) {
@@ -81,11 +74,10 @@ export class OmniAssistant {
       throw new Error('OpenAI API key is required, whether as an argument or as $OPENAI_API_KEY');
     }

-    conversationConfig.tools = tools(functions);
+    sessionConfig.tools = tools(functions);

     this.options = {
       apiKey,
       sessionConfig,
-      conversationConfig,
       functions,
     };
   }
@@ -106,10 +98,10 @@ export class OmniAssistant {
   }
   set funcCtx(ctx: llm.FunctionContext) {
     this.options.functions = ctx;
-    this.options.conversationConfig.tools = tools(ctx);
+    this.options.sessionConfig.tools = tools(ctx);
     this.sendClientCommand({
-      event: proto.ClientEventType.UPDATE_CONVERSATION_CONFIG,
-      ...this.options.conversationConfig,
+      type: proto.ClientEventType.SessionUpdate,
+      session: this.options.sessionConfig,
     });
   }
@@ -163,20 +155,12 @@ export class OmniAssistant {
       this.ws = new WebSocket(proto.API_URL, {
         headers: {
           Authorization: `Bearer ${this.options.apiKey}`,
+          'OpenAI-Beta': 'realtime=v1',
         },
       });

       this.ws.onopen = () => {
         this.connected = true;
-        this.sendClientCommand({
-          event: proto.ClientEventType.UPDATE_SESSION_CONFIG,
-          ...this.options.sessionConfig,
-        });
-        this.sendClientCommand({
-          event: proto.ClientEventType.UPDATE_CONVERSATION_CONFIG,
-          ...this.options.conversationConfig,
-        });
-        resolve();
       };

       this.ws.onerror = (error) => {
@@ -189,12 +173,20 @@ export class OmniAssistant {
       };

       this.ws.onmessage = (message) => {
-        this.handleServerEvent(JSON.parse(message.data as string));
+        const event = JSON.parse(message.data as string);
+        this.handleServerEvent(event);
+
+        if (event.type === 'session.created') {
+          this.sendClientCommand({
+            type: proto.ClientEventType.SessionUpdate,
+            session: this.options.sessionConfig,
+          });
+          resolve();
+        }
       };
     });
   }

-  // user-initiated close
   close() {
     if (!this.connected || !this.ws) return;
     this.logger.debug('stopping assistant');
@@ -203,8 +195,9 @@ export class OmniAssistant {
   addUserMessage(text: string, generate: boolean = true): void {
     this.sendClientCommand({
-      event: proto.ClientEventType.ADD_MESSAGE,
-      message: {
+      type: proto.ClientEventType.ConversationItemCreate,
+      item: {
+        type: 'message',
         role: 'user',
         content: [
           {
@@ -216,7 +209,8 @@
     });
     if (generate) {
       this.sendClientCommand({
-        event: proto.ClientEventType.GENERATE,
+        type: proto.ClientEventType.ResponseCreate,
+        response: {},
       });
     }
   }
@@ -225,12 +219,12 @@
     // don't override thinking until done
     if (this.thinking) return;
     if (this.room?.isConnected && this.room.localParticipant) {
-      const currentState = this.room.localParticipant.attributes['voice_assistant.state'];
+      const currentState = this.room.localParticipant.attributes['lk.agent.state'];
       if (currentState !== state) {
         this.room.localParticipant!.setAttributes({
-          'voice_assistant.state': state,
+          'lk.agent.state': state,
         });
-        this.logger.debug(`voice_assistant.state updated from ${currentState} to ${state}`);
+        this.logger.debug(`lk.agent.state updated from ${currentState} to ${state}`);
       }
     }
   }
@@ -248,16 +242,25 @@
       }
     }
-    if (untypedEvent.data && typeof untypedEvent.data === 'string') {
+    if (untypedEvent.audio && typeof untypedEvent.audio === 'string') {
       const truncatedData =
-        untypedEvent.data.slice(0, maxLength) + (untypedEvent.data.length > maxLength ? '…' : '');
-      return { ...untypedEvent, data: truncatedData };
+        untypedEvent.audio.slice(0, maxLength) + (untypedEvent.audio.length > maxLength ? '…' : '');
+      return { ...untypedEvent, audio: truncatedData };
+    }
+    if (
+      untypedEvent.delta &&
+      typeof untypedEvent.delta === 'string' &&
+      event.type === proto.ServerEventType.ResponseAudioDelta
+    ) {
+      const truncatedDelta =
+        untypedEvent.delta.slice(0, maxLength) + (untypedEvent.delta.length > maxLength ? '…' : '');
+      return { ...untypedEvent, delta: truncatedDelta };
     }
     return untypedEvent;
   }

   private sendClientCommand(command: proto.ClientEvent): void {
-    const isAudio = command.event === proto.ClientEventType.ADD_USER_AUDIO;
+    const isAudio = command.type === proto.ClientEventType.InputAudioBufferAppend;

     if (!this.connected || !this.ws) {
       if (!isAudio) this.logger.error('WebSocket is not connected');
       return;
     }
@@ -273,30 +276,31 @@ export class OmniAssistant {
   private handleServerEvent(event: proto.ServerEvent): void {
     this.logger.debug(`<- ${JSON.stringify(this.loggableEvent(event))}`);

-    switch (event.event) {
-      case proto.ServerEventType.START_SESSION:
+    switch (event.type) {
+      case proto.ServerEventType.SessionCreated:
         this.setState(proto.State.LISTENING);
         break;
-      case proto.ServerEventType.ADD_MESSAGE:
-        break;
-      case proto.ServerEventType.ADD_CONTENT:
+      case proto.ServerEventType.ResponseAudioTranscriptDelta:
+      case proto.ServerEventType.ResponseAudioDelta:
         this.handleAddContent(event);
         break;
-      case proto.ServerEventType.MESSAGE_ADDED:
+      case proto.ServerEventType.ConversationItemCreated:
         this.handleMessageAdded(event);
         break;
-      case proto.ServerEventType.VAD_SPEECH_STARTED:
+      case proto.ServerEventType.InputAudioBufferSpeechStarted:
         this.handleVadSpeechStarted(event);
         break;
-      case proto.ServerEventType.VAD_SPEECH_STOPPED:
-        break;
-      case proto.ServerEventType.INPUT_TRANSCRIBED:
+      // case 'input_audio_transcription.stopped':
+      //   break;
+      case proto.ServerEventType.ConversationItemInputAudioTranscriptionCompleted:
         this.handleInputTranscribed(event);
         break;
-      case proto.ServerEventType.GENERATION_CANCELED:
-        this.handleGenerationCanceled();
-        break;
-      case proto.ServerEventType.GENERATION_FINISHED:
+      // case 'response.canceled':
+      //   this.handleGenerationCanceled();
+      //   break;
+      case proto.ServerEventType.ResponseDone:
         this.handleGenerationFinished(event);
         break;
       default:
     }
   }
@@ -304,9 +308,9 @@ export class OmniAssistant {
-  private handleAddContent(event: proto.ServerEvent): void {
-    if (event.event !== proto.ServerEventType.ADD_CONTENT) return;
-
+  private handleAddContent(
+    event: proto.ResponseAudioDeltaEvent | proto.ResponseAudioTranscriptDeltaEvent,
+  ): void {
     const trackSid = this.getLocalTrackSid();
     if (!this.room || !this.room.localParticipant || !trackSid || !this.agentPlayout) {
       log().error('Room or local participant not set');
       return;
     }
@@ -318,59 +322,47 @@ export class OmniAssistant {
       const trFwd = new BasicTranscriptionForwarder(
         this.room,
         this.room?.localParticipant?.identity,
         trackSid,
-        event.message_id,
+        event.response_id,
       );

       this.setState(proto.State.SPEAKING);
-      this.playingHandle = this.agentPlayout.play(event.message_id, trFwd);
+      this.playingHandle = this.agentPlayout.play(event.response_id, trFwd);
       this.playingHandle.on('complete', () => {
         this.setState(proto.State.LISTENING);
       });
     }

-    switch (event.type) {
-      case 'audio':
-        this.playingHandle?.pushAudio(Buffer.from(event.data, 'base64'));
-        break;
-      case 'text':
-        this.playingHandle?.pushText(event.data);
-        break;
-      case 'tool_call':
-        break;
-      default:
-        this.logger.warn(`Unknown content event type: ${event.type}`);
-        break;
+    if (event.type === proto.ServerEventType.ResponseAudioDelta) {
+      this.playingHandle?.pushAudio(Buffer.from(event.delta, 'base64'));
+    } else if (event.type === proto.ServerEventType.ResponseAudioTranscriptDelta) {
+      this.playingHandle?.pushText(event.delta);
     }
   }

-  private handleMessageAdded(event: proto.ServerEvent): void {
-    if (event.event !== proto.ServerEventType.MESSAGE_ADDED) return;
-    for (const toolCall of event.content || []) {
+  private handleMessageAdded(event: proto.ConversationItemCreatedEvent): void {
+    if (event.item.type === 'function_call') {
+      const toolCall = event.item;
       this.options.functions[toolCall.name].execute(toolCall.arguments).then((content) => {
         this.thinking = false;
         this.sendClientCommand({
-          event: proto.ClientEventType.ADD_MESSAGE,
-          message: {
-            role: 'tool',
-            tool_call_id: toolCall.tool_call_id,
-            content: [
-              {
-                type: 'text',
-                text: content,
-              },
-            ],
+          type: proto.ClientEventType.ConversationItemCreate,
+          item: {
+            type: 'function_call_output',
+            call_id: toolCall.call_id,
+            output: content,
           },
         });
         this.sendClientCommand({
-          event: proto.ClientEventType.GENERATE,
+          type: proto.ClientEventType.ResponseCreate,
+          response: {},
         });
       });
-      break;
     }
   }

-  private handleInputTranscribed(event: proto.ServerEvent): void {
-    if (event.event !== proto.ServerEventType.INPUT_TRANSCRIBED) return;
-    const messageId = event.message_id;
+  private handleInputTranscribed(
+    event: proto.ConversationItemInputAudioTranscriptionCompletedEvent,
+  ): void {
+    const messageId = event.item_id;
     const transcription = event.transcript;
     if (!messageId || transcription === undefined) {
       this.logger.error('Message ID or transcription not set');
       return;
     }
@@ -385,23 +377,23 @@ export class OmniAssistant {
     }
   }

-  private handleGenerationCanceled(): void {
-    if (this.playingHandle && !this.playingHandle.done) {
-      this.playingHandle.interrupt();
-      this.sendClientCommand({
-        event: proto.ClientEventType.TRUNCATE_CONTENT,
-        message_id: this.playingHandle.messageId,
-        index: 0, // ignored for now (see OAI docs)
-        text_chars: this.playingHandle.publishedTextChars(),
-        audio_samples: this.playingHandle.playedAudioSamples,
-      });
-    }
-  }
-
-  private handleGenerationFinished(event: proto.ServerEvent): void {
-    if (event.event !== proto.ServerEventType.GENERATION_FINISHED) return;
-    if (event.reason !== 'interrupt' && event.reason !== 'stop') {
-      log().warn(`assistant turn finished unexpectedly reason ${event.reason}`);
+  private handleGenerationFinished(event: proto.ResponseDoneEvent): void {
+    if (
+      event.response.status === proto.ResponseStatus.CANCELLED &&
+      event.response.status_details?.type === proto.ResponseStatus.CANCELLED &&
+      event.response.status_details?.reason === 'turn_detected'
+    ) {
+      if (this.playingHandle && !this.playingHandle.done) {
+        this.playingHandle.interrupt();
+        this.sendClientCommand({
+          type: proto.ClientEventType.ConversationItemTruncate,
+          item_id: this.playingHandle.messageId,
+          content_index: 0, // ignored for now (see OAI docs)
+          audio_end_ms: (this.playingHandle.playedAudioSamples * 1000) / proto.SAMPLE_RATE,
+        });
+      }
+    } else if (event.response.status !== proto.ResponseStatus.COMPLETED) {
+      log().warn(`assistant turn finished unexpectedly reason ${event.response.status}`);
     }

     if (this.playingHandle && !this.playingHandle.interrupted) {
@@ -409,9 +401,8 @@ export class OmniAssistant {
     }
   }

-  private handleVadSpeechStarted(event: proto.ServerEvent): void {
-    if (event.event !== proto.ServerEventType.VAD_SPEECH_STARTED) return;
-    const messageId = event.message_id;
+  private handleVadSpeechStarted(event: proto.InputAudioBufferSpeechStartedEvent): void {
+    const messageId = event.item_id;
     const participantIdentity = this.linkedParticipant?.identity;
     const trackSid = this.subscribedTrack?.sid;
     if (participantIdentity && trackSid && messageId) {
@@ -454,8 +445,8 @@ export class OmniAssistant {
       const audioData = ev.frame.data;
       for (const frame of bstream.write(audioData.buffer)) {
         this.sendClientCommand({
-          event: proto.ClientEventType.ADD_USER_AUDIO,
-          data: Buffer.from(frame.data.buffer).toString('base64'),
+          type: proto.ClientEventType.InputAudioBufferAppend,
+          audio: Buffer.from(frame.data.buffer).toString('base64'),
         });
       }
     });
@@ -514,6 +505,14 @@
     isFinal: boolean,
     id: string,
   ): void {
+    // Log all parameters
+    log().info('Publishing transcription', {
+      participantIdentity,
+      trackSid,
+      text,
+      isFinal,
+      id,
+    });
     if (!this.room?.localParticipant) {
       log().error('Room or local participant not set');
       return;
     }
@@ -538,10 +537,8 @@
 const tools = (ctx: llm.FunctionContext): proto.Tool[] =>
   Object.entries(ctx).map(([name, func]) => ({
+    name,
+    description: func.description,
+    parameters: llm.oaiParams(func.parameters),
     type: 'function',
-    function: {
-      name,
-      description: func.description,
-      parameters: llm.oaiParams(func.parameters),
-    },
   }));
diff --git a/plugins/openai/src/omni_assistant/proto.ts b/plugins/openai/src/omni_assistant/proto.ts
index a437349b..d81e171e 100644
--- a/plugins/openai/src/omni_assistant/proto.ts
+++ b/plugins/openai/src/omni_assistant/proto.ts
@@ -1,280 +1,581 @@
 // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
 //
 // SPDX-License-Identifier: Apache-2.0
+
+export const API_URL = 'wss://api.openai.com/v1/realtime';
+export const SAMPLE_RATE = 24000;
+export const NUM_CHANNELS = 1;
+export const INPUT_PCM_FRAME_SIZE = 2400; // 100ms
+export const OUTPUT_PCM_FRAME_SIZE = 1200; // 50ms
+
 export enum Voice {
   ALLOY = 'alloy',
   SHIMMER = 'shimmer',
   ECHO = 'echo',
 }

-export enum TurnEndType {
-  SERVER_DETECTION = 'server_detection',
-  CLIENT_DECISION = 'client_decision',
-}
-
 export enum AudioFormat {
   PCM16 = 'pcm16',
   // G711_ULAW = 'g711-ulaw',
   // G711_ALAW = 'g711-alaw',
 }

-export enum ServerEventType {
-  START_SESSION = 'start_session',
-  ERROR = 'error',
-  ADD_MESSAGE = 'add_message',
-  ADD_CONTENT = 'add_content',
-  MESSAGE_ADDED = 'message_added',
-  VAD_SPEECH_STARTED = 'vad_speech_started',
-  VAD_SPEECH_STOPPED = 'vad_speech_stopped',
-  INPUT_TRANSCRIBED = 'input_transcribed',
-  GENERATION_CANCELED = 'generation_canceled',
-  SEND_STATE = 'send_state',
-  GENERATION_FINISHED = 'generation_finished',
+export interface Tool {
+  type: 'function';
+  name: string;
+  description?: string;
+  parameters: {
+    type: 'object';
+    properties: {
+      [prop: string]: {
+        [prop: string]: any;
+      };
+    };
+    required_properties: string[];
+  };
 }

-export type ServerEvent =
-  | {
-      event: ServerEventType.START_SESSION;
-      session_id: string;
-      model: string;
-      system_fingerprint: string;
-    }
+export enum ToolChoice {
+  AUTO = 'auto',
+  NONE = 'none',
+  REQUIRED = 'required',
+}
+
+export enum State {
+  INITIALIZING = 'initializing',
+  LISTENING = 'listening',
+  THINKING = 'thinking',
+  SPEAKING = 'speaking',
+}
+
+export type AudioBase64Bytes = string;
+
+// Content Part Types
+export interface InputTextContent {
+  type: 'text';
+  text: string;
+}
+
+export interface InputAudioContent {
+  type: 'input_audio';
+  // 'audio' field is excluded when rendered
+  // audio: AudioBase64Bytes;
+  transcript?: string;
+}
+
+export interface TextContent {
+  type: 'text';
+  text: string;
+}
+
+export interface AudioContent {
+  type: 'audio';
+  // 'audio' field is excluded when rendered
+  // audio: AudioBase64Bytes;
+  transcript: string;
+}
+
+export type ContentPart = InputTextContent | InputAudioContent | TextContent | AudioContent;
+
+// Item Resource Types
+export interface BaseItem {
+  id: string;
+  object: 'realtime.item';
+  previous_item_id?: string;
+  type: string;
+}
+
+export interface SystemMessageItem extends BaseItem {
+  type: 'message';
+  role: 'system';
+  content: InputTextContent[];
+}
+
+export interface UserMessageItem extends BaseItem {
+  type: 'message';
+  role: 'user';
+  content: (InputTextContent | InputAudioContent)[];
+}
+
+export interface AssistantMessageItem extends BaseItem {
+  type: 'message';
+  role: 'assistant';
+  content: (TextContent | AudioContent)[];
+}
+
+export interface FunctionCallItem extends BaseItem {
+  type: 'function_call';
+  call_id: string;
+  name: string;
+  arguments: string;
+}
+
+export interface FunctionCallOutputItem extends BaseItem {
+  type: 'function_call_output';
+  call_id: string;
+  output: string;
+}
+
+export type ItemResource =
+  | SystemMessageItem
+  | UserMessageItem
+  | AssistantMessageItem
+  | FunctionCallItem
+  | FunctionCallOutputItem;
+
+// Session Resource
+export interface SessionResource {
+  id: string;
+  object: 'realtime.session';
+  model: string;
+  modalities: ['text', 'audio'] | ['text']; // default: ["text", "audio"]
+  instructions?: string; // default: null
+  voice: Voice; // default: "alloy"
+  input_audio_format: AudioFormat; // default: "pcm16"
+  output_audio_format: AudioFormat; // default: "pcm16"
+  input_audio_transcription?: {
+    model: 'whisper-1';
+  }; // default: null
+  turn_detection:
+    | {
+        type: 'server_vad';
+        threshold: number; // 0.0 to 1.0, default: 0.5
+        prefix_padding_ms: number; // default: 300
+        silence_duration_ms: number; // default: 200
+      }
+    | 'none';
+  tools: Tool[];
+  tool_choice: ToolChoice; // default: "auto"
+  temperature: number; // default: 0.8
+  // max_output_tokens: number | null; // FIXME: currently rejected by OpenAI and fails the whole update
+}
+
+// Conversation Resource
+export interface ConversationResource {
+  id: string;
+  object: 'realtime.conversation';
+}
+
+// Response Resource
+export enum ResponseStatus {
+  IN_PROGRESS = 'in_progress',
+  COMPLETED = 'completed',
+  INCOMPLETE = 'incomplete',
+  CANCELLED = 'cancelled',
+  FAILED = 'failed',
+}
+
+export type ResponseStatusDetails =
   | {
-      event: ServerEventType.ERROR;
-      error: string;
+      type: ResponseStatus.INCOMPLETE;
+      reason: 'max_output_tokens' | 'content_filter';
     }
   | {
-      event: ServerEventType.ADD_MESSAGE;
-      previous_id: string;
-      conversation_label: string;
-      message: {
-        role: 'assistant';
-        content: (
-          | {
-              type: 'text';
-              text: string;
-            }
-          | {
-              type: 'audio';
-              audio: string;
-            }
-          | {
-              type: 'tool_call';
-              name: string;
-              arguments: string;
-              tool_call_id: string;
-            }
-        )[];
+      type: ResponseStatus.FAILED;
+      error?: {
+        code: 'server_error' | 'rate_limit_exceeded' | string;
+        message: string;
       };
     }
   | {
-      event: ServerEventType.ADD_CONTENT;
-      message_id: string;
-      type: 'text' | 'audio' | 'tool_call';
-      data: string; // text or base64 audio or JSON stringified object
-    }
-  | {
-      event: ServerEventType.MESSAGE_ADDED;
-      id: string;
-      previous_id: string;
-      conversation_label: string;
-      content:
-        | {
-            type: 'tool_call';
-            name: string;
-            tool_call_id: string;
-            arguments: string; // JSON stringified object
-          }[]
-        | null;
-    }
-  | {
-      event: ServerEventType.GENERATION_FINISHED;
-      reason: 'stop' | 'max_tokens' | 'content_filter' | 'interrupt';
-      conversation_label: string;
-      message_ids: string[];
-    }
-  | {
-      event: ServerEventType.SEND_STATE;
-      session_id: string;
-      input_audio_format: AudioFormat;
-      vad_active: boolean;
-      audio_buffer: string;
-      conversations: any; // TODO(nbsp): get this
-      session_config: SessionConfig;
-    }
-  | {
-      event:
-        | ServerEventType.VAD_SPEECH_STARTED
-        | ServerEventType.VAD_SPEECH_STOPPED
-        | ServerEventType.GENERATION_CANCELED;
-      sample_index: number;
-      message_id: string;
-    }
-  | {
-      event: ServerEventType.INPUT_TRANSCRIBED;
-      message_id: string;
-      transcript: string;
+      type: ResponseStatus.CANCELLED;
+      reason: 'turn_detected' | 'client_cancelled';
     };

-export enum ClientEventType {
-  UPDATE_SESSION_CONFIG = 'update_session_config',
-  UPDATE_CONVERSATION_CONFIG = 'update_conversation_config',
-  ADD_MESSAGE = 'add_message',
-  DELETE_MESSAGE = 'delete_message',
-  ADD_USER_AUDIO = 'add_user_audio',
-  COMMIT_USER_AUDIO = 'commit_user_audio',
-  CANCEL_GENERATION = 'cancel_generation',
-  GENERATE = 'generate',
-  CREATE_CONVERSATION = 'create_conversation',
-  DELETE_CONVERSATION = 'delete_conversation',
-  TRUNCATE_CONTENT = 'truncate_content',
-  REQUEST_STATE = 'request_state',
+export interface ResponseResource {
+  id: string;
+  object: 'realtime.response';
+  status: ResponseStatus;
+  status_details: ResponseStatusDetails;
+  output: ItemResource[];
+  usage?: {
+    total_tokens: number;
+    input_tokens: number;
+    output_tokens: number;
+  };
 }

-export type ClientEvent =
-  | ({
-      event: ClientEventType.UPDATE_SESSION_CONFIG;
-    } & SessionConfig)
-  | ({
-      event: ClientEventType.UPDATE_CONVERSATION_CONFIG;
-    } & ConversationConfig)
-  | {
-      event: ClientEventType.ADD_MESSAGE;
-      // id, previous_id, conversation_label are unused by us
-      message: (
-        | {
-            role: 'tool';
-            tool_call_id: string;
-          }
-        | {
-            role: 'user' | 'assistant' | 'system';
-          }
-      ) &
-        (
-          | {
-              content: (
-                | {
-                    type: 'text';
-                    text: string;
-                  }
-                | {
-                    type: 'tool_call';
-                    name: string;
-                    arguments: string;
-                    tool_call_id: string;
-                  }
-              )[];
-            }
-          | {
-              role: 'user' | 'tool';
-              content: (
-                | {
-                    type: 'text';
-                    text: string;
-                  }
-                | {
-                    type: 'tool_call';
-                    name: string;
-                    arguments: string;
-                    tool_call_id: string;
-                  }
-                | {
-                    type: 'audio';
-                    audio: string; // base64 encoded buffer
-                  }
-              )[];
-            }
-        );
-    }
-  | {
-      event: ClientEventType.DELETE_MESSAGE;
-      id: string;
-      conversation_label?: string; // defaults to 'default'
-    }
-  | {
-      event: ClientEventType.ADD_USER_AUDIO;
-      data: string; // base64 encoded buffer
-    }
-  | {
-      event: ClientEventType.COMMIT_USER_AUDIO | ClientEventType.CANCEL_GENERATION;
-    }
-  | {
-      event: ClientEventType.GENERATE;
-      conversation_label?: string; // defaults to 'default'
-    }
-  | {
-      event:
-        | ClientEventType.CREATE_CONVERSATION
-        | ClientEventType.DELETE_CONVERSATION
-        | ClientEventType.REQUEST_STATE;
-      label: string;
-    }
-  | {
-      event: ClientEventType.TRUNCATE_CONTENT;
-      message_id: string;
-      index: number; // integer, ignored
-      text_chars?: number; // integer
-      audio_samples?: number; // integer
     };
+
+// Client Events
+interface BaseClientEvent {
+  event_id?: string;
+  type: ClientEventType;
+}
+
+export interface SessionUpdateEvent extends BaseClientEvent {
+  type: ClientEventType.SessionUpdate;
+  session: Partial<{
+    modalities: ['text', 'audio'] | ['text'];
+    instructions: string;
+    voice: Voice;
+    input_audio_format: AudioFormat;
+    output_audio_format: AudioFormat;
+    input_audio_transcription: {
+      model: 'whisper-1';
+    };
+    turn_detection:
+      | {
+          type: 'server_vad';
+          threshold?: number;
+          prefix_padding_ms?: number;
+          silence_duration_ms?: number;
+        }
+      | 'none';
+    tools: Tool[];
+    tool_choice: ToolChoice;
+    temperature: number;
+    max_output_tokens: number;
+  }>;
+}

-export enum ToolChoice {
-  AUTO = 'auto',
-  NONE = 'none',
-  REQUIRED = 'required',
+export interface InputAudioBufferAppendEvent extends BaseClientEvent {
+  type: ClientEventType.InputAudioBufferAppend;
+  audio: AudioBase64Bytes;
 }

-export interface Tool {
-  type: 'function';
-  function: {
-    name: string;
-    description: string;
-    parameters: {
-      type: 'object';
-      properties: {
-        [prop: string]: {
-          [prop: string]: any;
-        };
+export interface InputAudioBufferCommitEvent extends BaseClientEvent {
+  type: ClientEventType.InputAudioBufferCommit;
+}
+
+export interface InputAudioBufferClearEvent extends BaseClientEvent {
+  type: ClientEventType.InputAudioBufferClear;
+}
+
+export interface ConversationItemCreateEvent extends BaseClientEvent {
+  type: ClientEventType.ConversationItemCreate;
+  item:
+    | {
+        type: 'message';
+        role: 'user';
+        content: (InputTextContent | InputAudioContent)[];
+      }
+    | {
+        type: 'message';
+        role: 'assistant';
+        content: TextContent[];
+      }
+    | {
+        type: 'message';
+        role: 'system';
+        content: InputTextContent[];
+      }
+    | {
+        type: 'function_call_output';
+        call_id: string;
+        output: string;
       };
-      required_properties: string[];
-    };
-  };
 }

+export interface ConversationItemTruncateEvent extends BaseClientEvent {
+  type: ClientEventType.ConversationItemTruncate;
+  item_id: string;
+  content_index: number;
+  audio_end_ms: number;
+}
+
-export const API_URL = 'wss://api.openai.com/v1/realtime';
-export const SAMPLE_RATE = 24000;
-export const NUM_CHANNELS = 1;
+export interface ConversationItemDeleteEvent extends BaseClientEvent {
+  type: ClientEventType.ConversationItemDelete;
+  item_id: string;
+}

-export const INPUT_PCM_FRAME_SIZE = 2400; // 100ms
-export const OUTPUT_PCM_FRAME_SIZE = 1200; // 50ms
+export interface ResponseCreateEvent extends BaseClientEvent {
+  type: ClientEventType.ResponseCreate;
+  response: Partial<{
+    modalities: ['text', 'audio'] | ['text'];
+    instructions: string;
+    voice: Voice;
+    output_audio_format: AudioFormat;
+    tools?: Tool[];
+    tool_choice: ToolChoice;
+    temperature: number;
+    max_output_tokens: number;
+  }>;
+}

-export type SessionConfig = Partial<{
-  turn_detection: 'disabled' | 'server_vad';
-  input_audio_format: AudioFormat;
-  transcribe_input: boolean;
-  vad: Partial<{
-    threshold: number; // 0..1 inclusive, default 0.5
-    prefix_padding_ms: number; // default 300
-    silence_duration_ms: number; // default 200
+export interface ResponseCancelEvent extends BaseClientEvent {
+  type: ClientEventType.ResponseCancel;
+}
+
+export enum ClientEventType {
+  SessionUpdate = 'session.update',
+  InputAudioBufferAppend = 'input_audio_buffer.append',
+  InputAudioBufferCommit = 'input_audio_buffer.commit',
+  InputAudioBufferClear = 'input_audio_buffer.clear',
+  ConversationItemCreate = 'conversation.item.create',
+  ConversationItemTruncate = 'conversation.item.truncate',
+  ConversationItemDelete = 'conversation.item.delete',
+  ResponseCreate = 'response.create',
+  ResponseCancel = 'response.cancel',
+}
   }>;
-}>;

-export type ConversationConfig = Partial<{
-  system_message: string;
-  voice: Voice;
-  subscribe_to_user_audio: boolean;
-  output_audio_format: AudioFormat;
-  tools: Tool[];
-  tool_choice: ToolChoice;
-  temperature: number; // 0.6..1.2 inclusive, default 0.8
-  max_tokens: number; // 1..4096, default 2048;
-  disable_audio: boolean;
-  transcribe_input: boolean;
-  conversation_label: string; // default "default"
-}>;
+export type ClientEvent =
+  | SessionUpdateEvent
+  | InputAudioBufferAppendEvent
+  | InputAudioBufferCommitEvent
+  | InputAudioBufferClearEvent
+  | ConversationItemCreateEvent
+  | ConversationItemTruncateEvent
+  | ConversationItemDeleteEvent
+  | ResponseCreateEvent
+  | ResponseCancelEvent;
+
+// Server Events
+interface BaseServerEvent {
+  event_id: string;
+  type: ServerEventType;
+}
+
+export interface ErrorEvent extends BaseServerEvent {
+  type: ServerEventType.Error;
+  error: {
+    type: 'invalid_request_error' | 'server_error' | string;
+    code?: string;
+    message: string;
+    param: string;
+    event_id: string;
+  };
+}
+
+export interface SessionCreatedEvent extends BaseServerEvent {
+  type: ServerEventType.SessionCreated;
+  session: SessionResource;
+}
+
+export interface SessionUpdatedEvent extends BaseServerEvent {
+  type: ServerEventType.SessionUpdated;
+  session: SessionResource;
+}
+
+export interface ConversationCreatedEvent extends BaseServerEvent {
+  type: ServerEventType.ConversationCreated;
+  conversation: ConversationResource;
+}
+
+export interface InputAudioBufferCommittedEvent extends BaseServerEvent {
+  type: ServerEventType.InputAudioBufferCommitted;
+  item_id: string;
+}
+
+export interface InputAudioBufferClearedEvent extends BaseServerEvent {
+  type: ServerEventType.InputAudioBufferCleared;
+}
+
+export interface InputAudioBufferSpeechStartedEvent extends BaseServerEvent {
+  type: ServerEventType.InputAudioBufferSpeechStarted;
+  audio_start_ms: number;
+  item_id: string;
+}
+
+export interface InputAudioBufferSpeechStoppedEvent extends BaseServerEvent {
+  type: ServerEventType.InputAudioBufferSpeechStopped;
+  audio_end_ms: number;
+  item_id: string;
+}
+
+export interface ConversationItemCreatedEvent extends BaseServerEvent {
+  type: ServerEventType.ConversationItemCreated;
+  item: ItemResource;
+}
+
+export interface ConversationItemInputAudioTranscriptionCompletedEvent extends BaseServerEvent {
+  type: ServerEventType.ConversationItemInputAudioTranscriptionCompleted;
+  item_id: string;
+  content_index: number;
+  transcript: string;
+}
+
+export interface ConversationItemInputAudioTranscriptionFailedEvent extends BaseServerEvent {
+  type: ServerEventType.ConversationItemInputAudioTranscriptionFailed;
+  item_id: string;
+  content_index: number;
+  error: {
+    type: string;
+    code?: string;
+    message: string;
+    param: null;
+  };
+}
+
+export interface ConversationItemTruncatedEvent extends BaseServerEvent {
+  type: ServerEventType.ConversationItemTruncated;
+  item_id: string;
+  content_index: number;
+  audio_end_ms: number;
+}
+
+export interface ConversationItemDeletedEvent extends BaseServerEvent {
+  type: ServerEventType.ConversationItemDeleted;
+  item_id: string;
+}
+
+export interface ResponseCreatedEvent extends BaseServerEvent {
+  type: ServerEventType.ResponseCreated;
+  response: ResponseResource;
+}
+
+export interface ResponseDoneEvent extends BaseServerEvent {
+  type: ServerEventType.ResponseDone;
+  response: ResponseResource;
+}
+
+export interface ResponseOutputAddedEvent extends BaseServerEvent {
+  type: ServerEventType.ResponseOutputAdded;
+  response_id: string;
+  output_index: number;
+  item: ItemResource;
+}
+
+export interface ResponseOutputDoneEvent extends BaseServerEvent {
+  type: ServerEventType.ResponseOutputDone;
+  response_id: string;
+  output_index: number;
+  item: ItemResource;
+}
+
+export interface ResponseContentAddedEvent extends BaseServerEvent {
+  type: ServerEventType.ResponseContentAdded;
+  response_id: string;
+  output_index: number;
+  content_index: number;
+  part: ContentPart;
+}
+
+export interface ResponseContentDoneEvent extends BaseServerEvent {
+  type: ServerEventType.ResponseContentDone;
+  response_id: string;
+  output_index: number;
+  content_index: number;
+  part: ContentPart;
+}
+
+export interface ResponseTextDeltaEvent extends BaseServerEvent {
+  type: ServerEventType.ResponseTextDelta;
+  response_id: string;
+  output_index: number;
+  content_index: number;
+  delta: string;
+}
+
+export interface ResponseTextDoneEvent extends BaseServerEvent {
+  type: ServerEventType.ResponseTextDone;
+  response_id: string;
+  output_index: number;
+  content_index: number;
+  text: string;
+}
+
+export interface ResponseAudioTranscriptDeltaEvent extends BaseServerEvent {
+  type: ServerEventType.ResponseAudioTranscriptDelta;
+  response_id: string;
+  output_index: number;
+  content_index: number;
+  delta: string;
+}
+
+export interface ResponseAudioTranscriptDoneEvent extends BaseServerEvent {
+  type: ServerEventType.ResponseAudioTranscriptDone;
+  response_id: string;
+  output_index: number;
+  content_index: number;
+  transcript: string;
+}
+
+export interface ResponseAudioDeltaEvent extends BaseServerEvent {
+  type: ServerEventType.ResponseAudioDelta;
+  response_id: string;
+  output_index: number;
+  content_index: number;
+  delta: AudioBase64Bytes;
+}
+
+export interface ResponseAudioDoneEvent extends BaseServerEvent {
+  type: ServerEventType.ResponseAudioDone;
+  response_id: string;
+  output_index: number;
+  content_index: number;
+  // 'audio' field is excluded from rendering
+}
+
+export interface ResponseFunctionCallArgumentsDeltaEvent extends BaseServerEvent {
+  type: ServerEventType.ResponseFunctionCallArgumentsDelta;
+  response_id: string;
+  output_index: number;
+  delta: string;
+}
+
+export interface ResponseFunctionCallArgumentsDoneEvent extends BaseServerEvent {
+  type: ServerEventType.ResponseFunctionCallArgumentsDone;
+  response_id: string;
+  output_index: number;
+  arguments: string;
+}
+
+export interface RateLimitsUpdatedEvent extends BaseServerEvent {
+  type: ServerEventType.RateLimitsUpdated;
+  rate_limits: {
+    name: 'requests' | 'tokens' | 'input_tokens' | 'output_tokens';
+    limit: number;
+    remaining: number;
+    reset_seconds: number;
+  }[];
+}
+
+export enum ServerEventType {
+  Error = 'error',
+  SessionCreated = 'session.created',
+  SessionUpdated = 'session.updated',
+  ConversationCreated = 'conversation.created',
+  InputAudioBufferCommitted = 'input_audio_buffer.committed',
+  InputAudioBufferCleared = 'input_audio_buffer.cleared',
+  InputAudioBufferSpeechStarted = 'input_audio_buffer.speech_started',
+  InputAudioBufferSpeechStopped = 'input_audio_buffer.speech_stopped',
+  ConversationItemCreated = 'conversation.item.created',
+  ConversationItemInputAudioTranscriptionCompleted = 'conversation.item.input_audio_transcription.completed',
+  ConversationItemInputAudioTranscriptionFailed = 'conversation.item.input_audio_transcription.failed',
+  ConversationItemTruncated = 'conversation.item.truncated',
+  ConversationItemDeleted = 'conversation.item.deleted',
+  ResponseCreated = 'response.created',
+  ResponseDone = 'response.done',
+  ResponseOutputAdded = 'response.output.added',
+  ResponseOutputDone = 'response.output.done',
+  ResponseContentAdded = 'response.content.added',
+  ResponseContentDone = 'response.content.done',
+  ResponseTextDelta = 'response.text.delta',
+  ResponseTextDone = 'response.text.done',
+  ResponseAudioTranscriptDelta = 'response.audio_transcript.delta',
+  ResponseAudioTranscriptDone = 'response.audio_transcript.done',
+  ResponseAudioDelta = 'response.audio.delta',
+  ResponseAudioDone = 'response.audio.done',
+  ResponseFunctionCallArgumentsDelta = 'response.function_call_arguments.delta',
+  ResponseFunctionCallArgumentsDone = 'response.function_call_arguments.done',
+  RateLimitsUpdated = 'response.rate_limits.updated',
+}
+
+export type ServerEvent =
+  | ErrorEvent
+  | SessionCreatedEvent
+  | SessionUpdatedEvent
+  | ConversationCreatedEvent
+  | InputAudioBufferCommittedEvent
+  | InputAudioBufferClearedEvent
+  | InputAudioBufferSpeechStartedEvent
+  | InputAudioBufferSpeechStoppedEvent
+  | ConversationItemCreatedEvent
+  | ConversationItemInputAudioTranscriptionCompletedEvent
+  | ConversationItemInputAudioTranscriptionFailedEvent
+  | ConversationItemTruncatedEvent
+  | ConversationItemDeletedEvent
+  | ResponseCreatedEvent
+  | ResponseDoneEvent
+  | ResponseOutputAddedEvent
+  | ResponseOutputDoneEvent
+  | ResponseContentAddedEvent
+  | ResponseContentDoneEvent
+  | ResponseTextDeltaEvent
+  | ResponseTextDoneEvent
+  | ResponseAudioTranscriptDeltaEvent
+  | ResponseAudioTranscriptDoneEvent
+  | ResponseAudioDeltaEvent
+  | ResponseAudioDoneEvent
+  | ResponseFunctionCallArgumentsDeltaEvent
+  | ResponseFunctionCallArgumentsDoneEvent
+  | RateLimitsUpdatedEvent;
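
For reference, a minimal sketch of how a client speaks the new wire format this patch targets, written against the types added in proto.ts above. It is illustrative only, not part of the patch: the socket setup mirrors index.ts, and the literal configuration values are assumptions.

import { WebSocket } from 'ws';
import { API_URL, ClientEventType, Voice, type SessionUpdateEvent } from './proto.js';

// A Realtime socket, authenticated the same way index.ts does.
const ws = new WebSocket(API_URL, {
  headers: {
    Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
    'OpenAI-Beta': 'realtime=v1',
  },
});

// One session.update event now carries what the old update_session_config and
// update_conversation_config events carried separately: instructions (formerly
// system_message), voice, turn detection, and tools.
const update: SessionUpdateEvent = {
  type: ClientEventType.SessionUpdate,
  session: {
    instructions: 'You are a helpful assistant.',
    voice: Voice.ALLOY,
    turn_detection: { type: 'server_vad', threshold: 0.5 },
  },
};

// As in index.ts, the update is only sent once the server has announced the
// session via a session.created event.
ws.on('message', (data) => {
  if (JSON.parse(data.toString()).type === 'session.created') {
    ws.send(JSON.stringify(update));
  }
});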