OpenAI.Audio.pas

﻿unit OpenAI.Audio;

interface

uses
  System.Classes, System.SysUtils, System.Net.Mime, OpenAI.API.Params,
  OpenAI.API;

{$SCOPEDENUMS ON}

type
  /// <summary>
  /// Transcript output formats. Each value is converted to its request name
  /// (json, text, srt, verbose_json, vtt) by TAudioResponseFormatHelper.ToString.
  /// </summary>
  TAudioResponseFormat = (Json, Text, Srt, VerboseJson, Vtt);

  /// <summary>
  /// Helper that adds string conversion to TAudioResponseFormat.
  /// </summary>
  TAudioResponseFormatHelper = record helper for TAudioResponseFormat
    /// <summary>
    /// Returns the lowercase name of the format as it is sent in the "response_format" field
    /// (for example VerboseJson gives "verbose_json").
    /// </summary>
    function ToString: string;
  end;

  /// <summary>
  /// Multipart form parameters for the "audio/transcriptions" request.
  /// Every setter adds one form field and returns the same instance so calls can be chained.
  /// </summary>
  TAudioTranscription = class(TMultipartFormData)
    /// <summary>
    /// Required.
    /// The audio file object (not file name) to transcribe, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
    /// This overload adds the file located at FileName as the "file" part.
    /// </summary>
    function &File(const FileName: TFileName): TAudioTranscription; overload;
    /// <summary>
    /// Required.
    /// The audio file object (not file name) to transcribe, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
    /// This overload adds the content of Stream as the "file" part, labelled with FileName.
    /// </summary>
    function &File(const Stream: TStream; const FileName: TFileName): TAudioTranscription; overload;
    /// <summary>
    /// Required.
    /// ID of the model to use. Only whisper-1 is currently available.
    /// The constructor already adds "whisper-1" as the model.
    /// </summary>
    function Model(const Value: string): TAudioTranscription; overload;
    /// <summary>
    /// An optional text to guide the model's style or continue a previous audio segment.
    /// The prompt should match the audio language.
    /// </summary>
    /// <seealso>https://platform.openai.com/docs/guides/speech-to-text/prompting</seealso>
    function Prompt(const Value: string): TAudioTranscription; overload;
    /// <summary>
    /// The format of the transcript output, in one of these options: json, text, srt, verbose_json, or vtt.
    /// The string is sent as given.
    /// </summary>
    function ResponseFormat(const Value: string): TAudioTranscription; overload;
    /// <summary>
    /// The format of the transcript output, in one of these options: json, text, srt, verbose_json, or vtt.
    /// The enum value is converted with ToString and passed to the string overload.
    /// </summary>
    function ResponseFormat(const Value: TAudioResponseFormat = TAudioResponseFormat.Json): TAudioTranscription; overload;
    /// <summary>
    /// The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random,
    /// while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use
    /// log probability to automatically increase the temperature until certain thresholds are hit.
    /// </summary>
    function Temperature(const Value: Single = 0): TAudioTranscription;
    /// <summary>
    /// The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency (like en, ru, uk).
    /// </summary>
    function Language(const Value: string): TAudioTranscription; overload;
    /// <summary>
    /// Creates the form data and presets the "model" field to whisper-1.
    /// </summary>
    constructor Create; reintroduce;
  end;

  /// <summary>
  /// Multipart form parameters for the "audio/translations" request.
  /// Every setter adds one form field and returns the same instance so calls can be chained.
  /// </summary>
  TAudioTranslation = class(TMultipartFormData)
    /// <summary>
    /// Required.
    /// The audio file object (not file name) to translate, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
    /// This overload adds the file located at FileName as the "file" part.
    /// </summary>
    function &File(const FileName: TFileName): TAudioTranslation; overload;
    /// <summary>
    /// Required.
    /// The audio file object (not file name) to translate, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
    /// This overload adds the content of Stream as the "file" part, labelled with FileName.
    /// </summary>
    function &File(const Stream: TStream; const FileName: TFileName): TAudioTranslation; overload;
    /// <summary>
    /// Required.
    /// ID of the model to use. Only whisper-1 is currently available.
    /// The constructor of this class does not preset a model, so this must be called explicitly.
    /// </summary>
    function Model(const Value: string): TAudioTranslation; overload;
    /// <summary>
    /// An optional text to guide the model's style or continue a previous audio segment. The prompt should be in English.
    /// </summary>
    /// <seealso>https://platform.openai.com/docs/guides/speech-to-text/prompting</seealso>
    function Prompt(const Value: string): TAudioTranslation; overload;
    /// <summary>
    /// The format of the transcript output, in one of these options: json, text, srt, verbose_json, or vtt.
    /// </summary>
    function ResponseFormat(const Value: string = 'json'): TAudioTranslation;
    /// <summary>
    /// The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random,
    /// while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use
    /// log probability to automatically increase the temperature until certain thresholds are hit.
    /// </summary>
    function Temperature(const Value: Single = 0): TAudioTranslation;
    /// <summary>
    /// Creates the form data. No fields are added by default.
    /// </summary>
    constructor Create; reintroduce;
  end;

  /// <summary>
  /// JSON parameters for the "audio/speech" (text-to-speech) request.
  /// The constructor presets the model to tts-1 and the voice to alloy.
  /// </summary>
  TAudioSpeechParams = class(TJSONParam)
    /// <summary>
    /// One of the available TTS models: tts-1 or tts-1-hd
    /// </summary>
    function Model(const Value: string): TAudioSpeechParams;
    /// <summary>
    /// The text to generate audio for. The maximum length is 4096 characters.
    /// </summary>
    function Input(const Value: string): TAudioSpeechParams; overload;
    /// <summary>
    /// The voice to use when generating the audio.
    /// Supported voices are alloy, echo, fable, onyx, nova, and shimmer.
    /// </summary>
    function Voice(const Value: string): TAudioSpeechParams; overload;
    /// <summary>
    /// The format of the generated audio. Supported formats are mp3, opus, aac, and flac.
    /// </summary>
    function ResponseFormat(const Value: string): TAudioSpeechParams;
    /// <summary>
    /// The speed of the generated audio. Select a value from 0.25 to 4.0. 1.0 is the default.
    /// </summary>
    function Speed(const Value: Single = 1): TAudioSpeechParams;
    /// <summary>
    /// Creates the parameter object and adds the default "model" (tts-1) and "voice" (alloy) values.
    /// </summary>
    constructor Create; override;
  end;

  /// <summary>
  /// Result object returned by CreateTranscription and CreateTranslation.
  /// </summary>
  TAudioText = class
  private
    FText: string;
  public
    /// <summary>
    /// The text carried by the response.
    /// </summary>
    property Text: string read FText write FText;
  end;

  /// <summary>
  /// Audio API route: turn audio into text (transcription, translation) and text into audio (speech).
  /// </summary>
  TAudioRoute = class(TOpenAIAPIRoute)
  public
    /// <summary>
    /// Transcribes audio into the input language.
    /// Posts the form configured by ParamProc to "audio/transcriptions" and returns the resulting TAudioText.
    /// </summary>
    function CreateTranscription(ParamProc: TProc<TAudioTranscription>): TAudioText;
    /// <summary>
    /// Translates audio into English.
    /// Posts the form configured by ParamProc to "audio/translations" and returns the resulting TAudioText.
    /// </summary>
    function CreateTranslation(ParamProc: TProc<TAudioTranslation>): TAudioText;
    /// <summary>
    /// Generates audio from the input text.
    /// Posts the parameters configured by ParamProc to "audio/speech"; Stream is handed to the API call
    /// (presumably it receives the generated audio - confirm against TOpenAIAPI.Post).
    /// </summary>
    procedure CreateSpeech(ParamProc: TProc<TAudioSpeechParams>; Stream: TStream);
  end;

implementation

{ TAudioRoute }

procedure TAudioRoute.CreateSpeech(ParamProc: TProc<TAudioSpeechParams>; Stream: TStream);
const
  SpeechEndpoint = 'audio/speech';
begin
  // ParamProc fills the request parameters; Stream is forwarded to the API call unchanged.
  API.Post<TAudioSpeechParams>(SpeechEndpoint, ParamProc, Stream);
end;

function TAudioRoute.CreateTranscription(ParamProc: TProc<TAudioTranscription>): TAudioText;
const
  TranscriptionEndpoint = 'audio/transcriptions';
begin
  // Send the multipart form built by ParamProc and hand the resulting object back to the caller.
  Result := API.PostForm<TAudioText, TAudioTranscription>(TranscriptionEndpoint, ParamProc);
end;

function TAudioRoute.CreateTranslation(ParamProc: TProc<TAudioTranslation>): TAudioText;
const
  TranslationEndpoint = 'audio/translations';
begin
  // Send the multipart form built by ParamProc and hand the resulting object back to the caller.
  Result := API.PostForm<TAudioText, TAudioTranslation>(TranslationEndpoint, ParamProc);
end;

{ TAudioTranscription }

function TAudioTranscription.&File(const FileName: TFileName): TAudioTranscription;
const
  FilePart = 'file';
begin
  // Return the same instance so calls can be chained.
  Result := Self;
  // Attach the file on disk as the "file" part of the form.
  AddFile(FilePart, FileName);
end;

constructor TAudioTranscription.Create;
const
  DefaultModel = 'whisper-1';
begin
  // True is forwarded to the inherited TMultipartFormData constructor
  // (presumably the "owns output stream" flag - confirm against System.Net.Mime).
  inherited Create(True);
  // Preset the required model field.
  // NOTE(review): a later Model(...) call presumably appends a second "model" field - confirm.
  Model(DefaultModel);
end;

function TAudioTranscription.&File(const Stream: TStream; const FileName: TFileName): TAudioTranscription;
const
  FilePart = 'file';
begin
  // Return the same instance so calls can be chained.
  Result := Self;
  // Attach the stream as the "file" part of the form, labelled with FileName.
  AddStream(FilePart, Stream, FileName);
end;

function TAudioTranscription.Language(const Value: string): TAudioTranscription;
const
  FieldName = 'language';
begin
  // Return the same instance so calls can be chained.
  Result := Self;
  // Add the input-audio language as a form field.
  AddField(FieldName, Value);
end;

// Adds the sampling temperature as the "temperature" form field and returns Self for chaining.
function TAudioTranscription.Temperature(const Value: Single): TAudioTranscription;
begin
  // In a FormatFloat pattern '.' marks the decimal point while ',' only switches on
  // thousands separators, so '0,0' would drop the fraction entirely (0.8 -> "01").
  // Invariant settings guarantee a '.' decimal separator regardless of the user's locale.
  AddField('temperature', FormatFloat('0.0#####', Value, TFormatSettings.Invariant));
  Result := Self;
end;

function TAudioTranscription.Prompt(const Value: string): TAudioTranscription;
const
  FieldName = 'prompt';
begin
  // Return the same instance so calls can be chained.
  Result := Self;
  // Add the guiding prompt text as a form field.
  AddField(FieldName, Value);
end;

function TAudioTranscription.ResponseFormat(const Value: TAudioResponseFormat): TAudioTranscription;
var
  FormatName: string;
begin
  // Convert the enum to its request name, then defer to the string overload.
  FormatName := Value.ToString;
  Result := ResponseFormat(FormatName);
end;

function TAudioTranscription.ResponseFormat(const Value: string): TAudioTranscription;
const
  FieldName = 'response_format';
begin
  // Return the same instance so calls can be chained.
  Result := Self;
  // The format name is sent exactly as supplied.
  AddField(FieldName, Value);
end;

function TAudioTranscription.Model(const Value: string): TAudioTranscription;
const
  FieldName = 'model';
begin
  // Return the same instance so calls can be chained.
  Result := Self;
  // Add the model ID as a form field.
  AddField(FieldName, Value);
end;

{ TAudioTranslation }

function TAudioTranslation.&File(const FileName: TFileName): TAudioTranslation;
const
  FilePart = 'file';
begin
  // Return the same instance so calls can be chained.
  Result := Self;
  // Attach the file on disk as the "file" part of the form.
  AddFile(FilePart, FileName);
end;

// Creates the translation form data. Unlike TAudioTranscription.Create, no default
// "model" field is added here, so callers have to call Model themselves.
constructor TAudioTranslation.Create;
begin
  // True is forwarded to the inherited TMultipartFormData constructor
  // (presumably the "owns output stream" flag - confirm against System.Net.Mime).
  inherited Create(True);
end;

function TAudioTranslation.&File(const Stream: TStream; const FileName: TFileName): TAudioTranslation;
const
  FilePart = 'file';
begin
  // Return the same instance so calls can be chained.
  Result := Self;
  // Attach the stream as the "file" part of the form, labelled with FileName.
  AddStream(FilePart, Stream, FileName);
end;

// Adds the sampling temperature as the "temperature" form field and returns Self for chaining.
function TAudioTranslation.Temperature(const Value: Single): TAudioTranslation;
begin
  // In a FormatFloat pattern '.' marks the decimal point while ',' only switches on
  // thousands separators, so '0,0' would drop the fraction entirely (0.8 -> "01").
  // Invariant settings guarantee a '.' decimal separator regardless of the user's locale.
  AddField('temperature', FormatFloat('0.0#####', Value, TFormatSettings.Invariant));
  Result := Self;
end;

function TAudioTranslation.Prompt(const Value: string): TAudioTranslation;
const
  FieldName = 'prompt';
begin
  // Return the same instance so calls can be chained.
  Result := Self;
  // Add the guiding prompt text as a form field.
  AddField(FieldName, Value);
end;

function TAudioTranslation.ResponseFormat(const Value: string): TAudioTranslation;
const
  FieldName = 'response_format';
begin
  // Return the same instance so calls can be chained.
  Result := Self;
  // The format name is sent exactly as supplied.
  AddField(FieldName, Value);
end;

function TAudioTranslation.Model(const Value: string): TAudioTranslation;
const
  FieldName = 'model';
begin
  // Return the same instance so calls can be chained.
  Result := Self;
  // Add the model ID as a form field.
  AddField(FieldName, Value);
end;

{ TAudioResponseFormatHelper }

// Maps the enum value to the name sent in the "response_format" field.
// Values outside the declared range (e.g. from an unchecked cast) yield 'json',
// the same format the enum-typed ResponseFormat overload uses as its default.
function TAudioResponseFormatHelper.ToString: string;
begin
  case Self of
    TAudioResponseFormat.Json:
      Result := 'json';
    TAudioResponseFormat.Text:
      Result := 'text';
    TAudioResponseFormat.Srt:
      Result := 'srt';
    TAudioResponseFormat.VerboseJson:
      Result := 'verbose_json';
    TAudioResponseFormat.Vtt:
      Result := 'vtt';
  else
    // Without this branch Result would be left undefined for an unexpected value.
    Result := 'json';
  end;
end;

{ TAudioSpeechParams }

constructor TAudioSpeechParams.Create;
const
  DefaultModel = 'tts-1';
  DefaultVoice = 'alloy';
begin
  inherited;
  // Preset the model and voice so a request only has to supply the input text.
  Model(DefaultModel);
  Voice(DefaultVoice);
end;

function TAudioSpeechParams.Input(const Value: string): TAudioSpeechParams;
var
  Added: TJSONParam;
begin
  // Store the text to synthesize under "input".
  Added := Add('input', Value);
  // Add returns the base parameter type; cast it back so calls can be chained.
  Result := TAudioSpeechParams(Added);
end;

function TAudioSpeechParams.Model(const Value: string): TAudioSpeechParams;
var
  Added: TJSONParam;
begin
  // Store the TTS model ID under "model".
  Added := Add('model', Value);
  // Add returns the base parameter type; cast it back so calls can be chained.
  Result := TAudioSpeechParams(Added);
end;

function TAudioSpeechParams.ResponseFormat(const Value: string): TAudioSpeechParams;
var
  Added: TJSONParam;
begin
  // Store the requested audio format under "response_format".
  Added := Add('response_format', Value);
  // Add returns the base parameter type; cast it back so calls can be chained.
  Result := TAudioSpeechParams(Added);
end;

function TAudioSpeechParams.Speed(const Value: Single): TAudioSpeechParams;
var
  Added: TJSONParam;
begin
  // Store the playback speed under "speed" as a numeric value.
  Added := Add('speed', Value);
  // Add returns the base parameter type; cast it back so calls can be chained.
  Result := TAudioSpeechParams(Added);
end;

function TAudioSpeechParams.Voice(const Value: string): TAudioSpeechParams;
var
  Added: TJSONParam;
begin
  // Store the voice name under "voice".
  Added := Add('voice', Value);
  // Add returns the base parameter type; cast it back so calls can be chained.
  Result := TAudioSpeechParams(Added);
end;

end.