Skip to content

Commit

Permalink
fix(realtime): another round of API changes (#81)
Browse files Browse the repository at this point in the history
  • Loading branch information
nbsp authored Sep 30, 2024
1 parent 5ec3db8 commit a18a45a
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 29 deletions.
33 changes: 15 additions & 18 deletions plugins/openai/src/realtime/api_proto.ts
Original file line number Diff line number Diff line change
Expand Up @@ -83,16 +83,12 @@ export interface Tool {
};
}

/**
 * Turn detection settings, discriminated by `type`.
 *
 * - `'server_vad'` (presumably server-side voice activity detection — confirm
 *   against the Realtime API reference) accepts three optional tuning fields.
 * - `'none'` carries no further fields.
 */
export type TurnDetectionType =
| {
type: 'server_vad';
threshold?: number; // 0.0 to 1.0, default: 0.5
prefix_padding_ms?: number; // default: 300
silence_duration_ms?: number; // default: 200
}
| {
type: 'none';
};
/**
 * Turn detection settings. The only accepted `type` is `'server_vad'`
 * (presumably server-side voice activity detection — confirm against the
 * Realtime API reference); every tuning field is optional.
 */
export type TurnDetectionType = {
type: 'server_vad';
threshold?: number; // 0.0 to 1.0, default: 0.5
prefix_padding_ms?: number; // default: 300
silence_duration_ms?: number; // default: 200
};

export type InputAudioTranscription = {
model: InputTranscriptionModel;
Expand Down Expand Up @@ -176,16 +172,17 @@ export interface SessionResource {
object: 'realtime.session';
model: string;
modalities: ['text', 'audio'] | ['text']; // default: ["text", "audio"]
instructions?: string; // default: null
instructions: string;
voice: Voice; // default: "alloy"
input_audio_format: AudioFormat; // default: "pcm16"
output_audio_format: AudioFormat; // default: "pcm16"
input_audio_transcription?: InputAudioTranscription; // default: null
turn_detection: TurnDetectionType;
input_audio_transcription: InputAudioTranscription | null;
turn_detection: TurnDetectionType | null;
tools: Tool[];
tool_choice: ToolChoice; // default: "auto"
temperature: number; // default: 0.8
max_response_output_tokens: number | null;
max_response_output_tokens: number | 'inf';
expires_at: number;
}

// Conversation Resource
Expand Down Expand Up @@ -238,12 +235,12 @@ export interface SessionUpdateEvent extends BaseClientEvent {
voice: Voice;
input_audio_format: AudioFormat;
output_audio_format: AudioFormat;
input_audio_transcription?: InputAudioTranscription;
turn_detection: TurnDetectionType;
input_audio_transcription: InputAudioTranscription | null;
turn_detection: TurnDetectionType | null;
tools: Tool[];
tool_choice: ToolChoice;
temperature: number;
max_response_output_tokens: number;
max_response_output_tokens: number | 'inf';
}>;
}

Expand Down Expand Up @@ -318,7 +315,7 @@ export interface ResponseCreateEvent extends BaseClientEvent {
tools?: Tool[];
tool_choice: ToolChoice;
temperature: number;
max_response_output_tokens: number;
max_output_tokens: number | 'inf';
}>;
}

Expand Down
38 changes: 27 additions & 11 deletions plugins/openai/src/realtime/realtime_model.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,14 @@ import * as api_proto from './api_proto.js';

interface ModelOptions {
modalities: ['text', 'audio'] | ['text'];
instructions?: string;
instructions: string;
voice: api_proto.Voice;
inputAudioFormat: api_proto.AudioFormat;
outputAudioFormat: api_proto.AudioFormat;
inputAudioTranscription?: api_proto.InputAudioTranscription;
turnDetection: api_proto.TurnDetectionType;
inputAudioTranscription: api_proto.InputAudioTranscription | null;
turnDetection: api_proto.TurnDetectionType | null;
temperature: number;
maxResponseOutputTokens?: number;
maxResponseOutputTokens: number;
model: api_proto.Model;
apiKey: string;
baseURL: string;
Expand Down Expand Up @@ -184,14 +184,14 @@ export class RealtimeModel extends multimodal.RealtimeModel {

constructor({
modalities = ['text', 'audio'],
instructions = undefined,
instructions = '',
voice = 'alloy',
inputAudioFormat = 'pcm16',
outputAudioFormat = 'pcm16',
inputAudioTranscription = { model: 'whisper-1' },
turnDetection = { type: 'server_vad' },
temperature = 0.8,
maxResponseOutputTokens = undefined,
maxResponseOutputTokens = Infinity,
model = 'gpt-4o-realtime-preview-2024-10-01',
apiKey = process.env.OPENAI_API_KEY || '',
baseURL = api_proto.API_URL,
Expand Down Expand Up @@ -255,8 +255,8 @@ export class RealtimeModel extends multimodal.RealtimeModel {
voice?: api_proto.Voice;
inputAudioFormat?: api_proto.AudioFormat;
outputAudioFormat?: api_proto.AudioFormat;
inputAudioTranscription?: api_proto.InputAudioTranscription;
turnDetection?: api_proto.TurnDetectionType;
inputAudioTranscription?: api_proto.InputAudioTranscription | null;
turnDetection?: api_proto.TurnDetectionType | null;
temperature?: number;
maxResponseOutputTokens?: number;
}): RealtimeSession {
Expand Down Expand Up @@ -291,6 +291,7 @@ export class RealtimeSession extends multimodal.RealtimeSession {
#pendingResponses: { [id: string]: RealtimeResponse } = {};
#sessionId = 'not-connected';
#ws: WebSocket | null = null;
#expiresAt: number | null = null;
#logger = log();
#task: Promise<void>;
#closing = true;
Expand Down Expand Up @@ -338,6 +339,13 @@ export class RealtimeSession extends multimodal.RealtimeSession {
return new Response(this);
}

/**
 * Session expiry, obtained by scaling the stored `#expiresAt` value by 1000.
 *
 * @returns `#expiresAt * 1000`.
 * @throws Error with message `session not started` while `#expiresAt` is
 *   still falsy (its initial `null`, or `0`).
 */
get expiration(): number {
  const expiresAt = this.#expiresAt;
  if (expiresAt) {
    return expiresAt * 1000;
  }
  throw new Error('session not started');
}

/**
 * Hands a client event to the session's send queue.
 *
 * @param command - The client event passed to `#sendQueue.put`.
 */
queueMsg(command: api_proto.ClientEvent): void {
this.#sendQueue.put(command);
}
Expand Down Expand Up @@ -389,8 +397,8 @@ export class RealtimeSession extends multimodal.RealtimeSession {
voice?: api_proto.Voice;
inputAudioFormat?: api_proto.AudioFormat;
outputAudioFormat?: api_proto.AudioFormat;
inputAudioTranscription?: api_proto.InputAudioTranscription;
turnDetection?: api_proto.TurnDetectionType;
inputAudioTranscription?: api_proto.InputAudioTranscription | null;
turnDetection?: api_proto.TurnDetectionType | null;
temperature?: number;
maxResponseOutputTokens?: number;
toolChoice?: api_proto.ToolChoice;
Expand Down Expand Up @@ -430,7 +438,10 @@ export class RealtimeSession extends multimodal.RealtimeSession {
input_audio_transcription: this.#opts.inputAudioTranscription,
turn_detection: this.#opts.turnDetection,
temperature: this.#opts.temperature,
max_response_output_tokens: this.#opts.maxResponseOutputTokens,
max_response_output_tokens:
this.#opts.maxResponseOutputTokens === Infinity
? 'inf'
: this.#opts.maxResponseOutputTokens,
tools,
tool_choice: toolChoice,
},
Expand Down Expand Up @@ -561,6 +572,9 @@ export class RealtimeSession extends multimodal.RealtimeSession {
sendTask();

this.#ws.onclose = () => {
if (this.#expiresAt && Date.now() >= this.#expiresAt * 1000) {
this.#closing = true;
}
if (!this.#closing) {
reject('OpenAI Realtime connection closed unexpectedly');
}
Expand Down Expand Up @@ -590,6 +604,8 @@ export class RealtimeSession extends multimodal.RealtimeSession {

/**
 * Stores the details of a newly created session on this instance.
 *
 * Copies the session id and `expires_at` value from the event, then replaces
 * the logger with a child logger created with the session id.
 *
 * @param event - The session-created event whose `session` is read.
 */
#handleSessionCreated(event: api_proto.SessionCreatedEvent): void {
  const { id, expires_at: expiresAt } = event.session;
  this.#sessionId = id;
  this.#expiresAt = expiresAt;
  // Swap in a child logger created with `{ sessionId }`.
  this.#logger = this.#logger.child({ sessionId: id });
}

// eslint-disable-next-line @typescript-eslint/no-unused-vars
Expand Down

0 comments on commit a18a45a

Please sign in to comment.