From a18a45a8c6a93a83b77a699ececa5017f91e6508 Mon Sep 17 00:00:00 2001 From: aoife cassidy Date: Mon, 30 Sep 2024 08:28:25 +0300 Subject: [PATCH] fix(realtime): another round of API changes (#81) --- plugins/openai/src/realtime/api_proto.ts | 33 ++++++++-------- plugins/openai/src/realtime/realtime_model.ts | 38 +++++++++++++------ 2 files changed, 42 insertions(+), 29 deletions(-) diff --git a/plugins/openai/src/realtime/api_proto.ts b/plugins/openai/src/realtime/api_proto.ts index c355f694..382582f6 100644 --- a/plugins/openai/src/realtime/api_proto.ts +++ b/plugins/openai/src/realtime/api_proto.ts @@ -83,16 +83,12 @@ export interface Tool { }; } -export type TurnDetectionType = - | { - type: 'server_vad'; - threshold?: number; // 0.0 to 1.0, default: 0.5 - prefix_padding_ms?: number; // default: 300 - silence_duration_ms?: number; // default: 200 - } - | { - type: 'none'; - }; +export type TurnDetectionType = { + type: 'server_vad'; + threshold?: number; // 0.0 to 1.0, default: 0.5 + prefix_padding_ms?: number; // default: 300 + silence_duration_ms?: number; // default: 200 +}; export type InputAudioTranscription = { model: InputTranscriptionModel; @@ -176,16 +172,17 @@ export interface SessionResource { object: 'realtime.session'; model: string; modalities: ['text', 'audio'] | ['text']; // default: ["text", "audio"] - instructions?: string; // default: null + instructions: string; voice: Voice; // default: "alloy" input_audio_format: AudioFormat; // default: "pcm16" output_audio_format: AudioFormat; // default: "pcm16" - input_audio_transcription?: InputAudioTranscription; // default: null - turn_detection: TurnDetectionType; + input_audio_transcription: InputAudioTranscription | null; + turn_detection: TurnDetectionType | null; tools: Tool[]; tool_choice: ToolChoice; // default: "auto" temperature: number; // default: 0.8 - max_response_output_tokens: number | null; + max_response_output_tokens: number | 'inf'; + expires_at: number; } // Conversation Resource @@ -238,12 +235,12 @@ export interface SessionUpdateEvent extends BaseClientEvent { voice: Voice; input_audio_format: AudioFormat; output_audio_format: AudioFormat; - input_audio_transcription?: InputAudioTranscription; - turn_detection: TurnDetectionType; + input_audio_transcription: InputAudioTranscription | null; + turn_detection: TurnDetectionType | null; tools: Tool[]; tool_choice: ToolChoice; temperature: number; - max_response_output_tokens: number; + max_response_output_tokens: number | 'inf'; }>; } @@ -318,7 +315,7 @@ export interface ResponseCreateEvent extends BaseClientEvent { tools?: Tool[]; tool_choice: ToolChoice; temperature: number; - max_response_output_tokens: number; + max_output_tokens: number | 'inf'; }>; } diff --git a/plugins/openai/src/realtime/realtime_model.ts b/plugins/openai/src/realtime/realtime_model.ts index 1a898e47..cd13e5db 100644 --- a/plugins/openai/src/realtime/realtime_model.ts +++ b/plugins/openai/src/realtime/realtime_model.ts @@ -10,14 +10,14 @@ import * as api_proto from './api_proto.js'; interface ModelOptions { modalities: ['text', 'audio'] | ['text']; - instructions?: string; + instructions: string; voice: api_proto.Voice; inputAudioFormat: api_proto.AudioFormat; outputAudioFormat: api_proto.AudioFormat; - inputAudioTranscription?: api_proto.InputAudioTranscription; - turnDetection: api_proto.TurnDetectionType; + inputAudioTranscription: api_proto.InputAudioTranscription | null; + turnDetection: api_proto.TurnDetectionType | null; temperature: number; - maxResponseOutputTokens?: number; + maxResponseOutputTokens: number; model: api_proto.Model; apiKey: string; baseURL: string; @@ -184,14 +184,14 @@ export class RealtimeModel extends multimodal.RealtimeModel { constructor({ modalities = ['text', 'audio'], - instructions = undefined, + instructions = '', voice = 'alloy', inputAudioFormat = 'pcm16', outputAudioFormat = 'pcm16', inputAudioTranscription = { model: 'whisper-1' }, turnDetection = { type: 'server_vad' }, temperature = 0.8, - maxResponseOutputTokens = undefined, + maxResponseOutputTokens = Infinity, model = 'gpt-4o-realtime-preview-2024-10-01', apiKey = process.env.OPENAI_API_KEY || '', baseURL = api_proto.API_URL, @@ -255,8 +255,8 @@ export class RealtimeModel extends multimodal.RealtimeModel { voice?: api_proto.Voice; inputAudioFormat?: api_proto.AudioFormat; outputAudioFormat?: api_proto.AudioFormat; - inputAudioTranscription?: api_proto.InputAudioTranscription; - turnDetection?: api_proto.TurnDetectionType; + inputAudioTranscription?: api_proto.InputAudioTranscription | null; + turnDetection?: api_proto.TurnDetectionType | null; temperature?: number; maxResponseOutputTokens?: number; }): RealtimeSession { @@ -291,6 +291,7 @@ export class RealtimeSession extends multimodal.RealtimeSession { #pendingResponses: { [id: string]: RealtimeResponse } = {}; #sessionId = 'not-connected'; #ws: WebSocket | null = null; + #expiresAt: number | null = null; #logger = log(); #task: Promise; #closing = true; @@ -338,6 +339,13 @@ export class RealtimeSession extends multimodal.RealtimeSession { return new Response(this); } + get expiration(): number { + if (!this.#expiresAt) { + throw new Error('session not started'); + } + return this.#expiresAt * 1000; + } + queueMsg(command: api_proto.ClientEvent): void { this.#sendQueue.put(command); } @@ -389,8 +397,8 @@ export class RealtimeSession extends multimodal.RealtimeSession { voice?: api_proto.Voice; inputAudioFormat?: api_proto.AudioFormat; outputAudioFormat?: api_proto.AudioFormat; - inputAudioTranscription?: api_proto.InputAudioTranscription; - turnDetection?: api_proto.TurnDetectionType; + inputAudioTranscription?: api_proto.InputAudioTranscription | null; + turnDetection?: api_proto.TurnDetectionType | null; temperature?: number; maxResponseOutputTokens?: number; toolChoice?: api_proto.ToolChoice; @@ -430,7 +438,10 @@ export class RealtimeSession extends multimodal.RealtimeSession { input_audio_transcription: this.#opts.inputAudioTranscription, turn_detection: this.#opts.turnDetection, temperature: this.#opts.temperature, - max_response_output_tokens: this.#opts.maxResponseOutputTokens, + max_response_output_tokens: + this.#opts.maxResponseOutputTokens === Infinity + ? 'inf' + : this.#opts.maxResponseOutputTokens, tools, tool_choice: toolChoice, }, @@ -561,6 +572,9 @@ export class RealtimeSession extends multimodal.RealtimeSession { sendTask(); this.#ws.onclose = () => { + if (this.#expiresAt && Date.now() >= this.#expiresAt * 1000) { + this.#closing = true; + } if (!this.#closing) { reject('OpenAI Realtime connection closed unexpectedly'); } @@ -590,6 +604,8 @@ export class RealtimeSession extends multimodal.RealtimeSession { #handleSessionCreated(event: api_proto.SessionCreatedEvent): void { this.#sessionId = event.session.id; + this.#expiresAt = event.session.expires_at; + this.#logger = this.#logger.child({ sessionId: this.#sessionId }); } // eslint-disable-next-line @typescript-eslint/no-unused-vars