// proto/speech-recognition-open-api.proto

syntax = "proto3";

package ekstep.speech_recognition;

option java_multiple_files = true;
option java_outer_classname = "SpeechRecognitionProto";
option java_package = "com.ekstep.endpoints.speech_recognition";

import "google/api/annotations.proto";

// Speech recognition service. It exposes one bidirectional-streaming RPC and
// two unary RPCs; only the unary RPCs declare google.api.http bindings.
service SpeechRecognizer {
  // Bidirectional stream: the client sends Message items and the server
  // returns Response items. No HTTP binding is declared for this method.
  rpc recognize_audio(stream Message) returns (stream Response);

  // Unary call taking a PunctuateRequest and returning a PunctuateResponse.
  // HTTP binding: POST /v1/punctuate/{language}. The `language` request field
  // is taken from the URL path; body "*" maps all other fields to the body.
  rpc punctuate(PunctuateRequest) returns (PunctuateResponse) {
    option (google.api.http) = {
      post: "/v1/punctuate/{language}"
      body: "*"
    };
  }

  // Unary call taking a SpeechRecognitionRequest and returning a
  // SpeechRecognitionResult.
  // HTTP binding: POST /v1/recognize/{config.language.sourceLanguage}. The
  // nested `config.language.sourceLanguage` field is taken from the URL path;
  // body "*" maps all other fields to the body.
  rpc recognize(SpeechRecognitionRequest) returns (SpeechRecognitionResult) {
    option (google.api.http) = {
      post: "/v1/recognize/{config.language.sourceLanguage}"
      body: "*"
    };
  }
}

// One client-to-server item of the recognize_audio bidirectional stream.
message Message {
  // Audio payload for this item. The audio encoding is not specified in this
  // file.
  bytes audio = 1;
  // NOTE(review): presumably identifies the sender; the format is not defined
  // here.
  string user = 2;
  // Language as a free-form string (this field does not use the
  // Language.LanguageCode enum).
  string language = 3;
  // NOTE(review): presumably true while the user is still speaking — confirm
  // against the server implementation.
  bool speaking = 4;
  // NOTE(review): presumably marks the final item of the stream — confirm.
  bool isEnd = 5;
}

// One server-to-client item of the recognize_audio bidirectional stream.
message Response {
  // Transcribed text carried by this item.
  string transcription = 1;
  // NOTE(review): presumably echoes Message.user — confirm.
  string user = 2;
  // Language as a free-form string (this field does not use the
  // Language.LanguageCode enum).
  string language = 3;
  // NOTE(review): the set of allowed values is not defined in this file;
  // confirm against the server implementation.
  string action = 4;
}


// Request message of the unary `recognize` RPC.
message SpeechRecognitionRequest {
  // Recognition settings. `config.language.sourceLanguage` is the field bound
  // to the URL path in the RPC's HTTP annotation.
  RecognitionConfig config = 1;
  // Zero or more audio inputs, each given as a RecognitionAudio.
  repeated RecognitionAudio audio = 2;
}


// Settings that accompany a SpeechRecognitionRequest.
//
// Every enum below gives its zero value a concrete meaning. For the fields
// declared `optional`, explicit presence lets a reader distinguish "unset"
// from "set to the zero value".
message RecognitionConfig {
  // Container format of the supplied audio.
  enum AudioFormat {
    wav = 0;
    pcm = 1;
    mp3 = 2;
    flac = 3;
  }

  // Channel layout of the supplied audio.
  enum AudioChannel {
    mono = 0;
    stereo = 1;
  }

  // Sample width of the supplied audio. Note that the zero value is `sixteen`.
  enum AudioBitsPerSample {
    sixteen = 0;
    eight = 1;
  }

  // Shape of the transcription output requested by the caller.
  enum TranscriptionFormatEnum {
    transcript = 0;
    srt = 1;
    alternatives = 2;
  }

  // Message wrapper around a single TranscriptionFormatEnum value.
  message TranscriptionFormat {
    TranscriptionFormatEnum value = 1;
  }

  // Subject-matter domains; several may be supplied through the repeated
  // `domain` field.
  enum Domain {
    GENERAL = 0;
    NEWS = 1;
    EDUCATION = 2;
    LEGAL = 3;
    GOVERNMENT_PRESS_RELEASE = 4;
    HEALTHCARE = 5;
    MOVIES = 6;
    SUBTITLES = 7;
    SPORTS = 8;
  }

  // Recognition model selector. Note that the zero value is
  // `command_and_search`, not `default` (which is 3).
  enum Model {
    command_and_search = 0;
    phone_call = 1;
    video = 2;
    default = 3;
  }

  // Language of the audio. Its `sourceLanguage` is used as a URL path
  // parameter by the `recognize` RPC.
  Language language = 1;
  // Container format of the audio.
  optional AudioFormat audioFormat = 2;
  // Channel layout of the audio.
  optional AudioChannel channel = 3;
  // Sampling rate of the audio. NOTE(review): units are not stated here
  // (presumably Hz) — confirm.
  optional int64 samplingRate = 4;
  // Sample width of the audio.
  optional AudioBitsPerSample bitsPerSample = 5;
  // Requested output format.
  optional TranscriptionFormat transcriptionFormat = 6;
  // NOTE(review): presumably enables filtering of profanity in the output —
  // confirm against the server implementation.
  optional bool profanityFilter = 7;
  // Domains that apply to this request; may be empty.
  repeated Domain domain = 8;
  // NOTE(review): exact effect is not defined in this file — confirm.
  optional bool detailed = 9;
  // NOTE(review): presumably requests punctuated output — confirm.
  optional bool punctuation = 10;
  // Recognition model to use.
  optional Model model = 11;
  // Disabled declaration kept from the original file:
  //   optional bool enableAutomaticPunctuation = 12;
  // It names the same field number (12) as the field below, so it cannot be
  // re-enabled under that number.
  // NOTE(review): presumably requests inverse text normalization of the
  // output — confirm.
  optional bool enableInverseTextNormalization = 12;
}

// Language selection, shared by RecognitionConfig and RecognitionDetails.
message Language {
  // Supported language codes.
  //
  // NOTE(review): the values look like ISO 639 language codes plus a few
  // project-specific combined codes (en_bio, hi_en) — confirm.
  // NOTE(review): number 22 is not assigned. If a value was removed there,
  // it should be declared `reserved` so the number cannot be reused — confirm.
  // NOTE(review): `as` and `or` are reserved words in some target languages
  // (e.g. Python), which can make generated accessors awkward.
  // The zero value is `hi`.
  enum LanguageCode {
    hi = 0;
    en = 1;
    mr = 2;
    ta = 3;
    te = 4;
    kn = 5;
    gu = 6;
    pa = 7;
    bn = 8;
    ml = 9;
    as = 10;
    brx = 11;
    doi = 12;
    ks = 13;
    kok = 14;
    mai = 15;
    mni = 16;
    ne = 17;
    or = 18;
    sd = 19;
    si = 20;
    ur = 21;
    sat = 23;
    lus = 24;
    njz = 25;
    pnr = 26;
    kha = 27;
    grt = 28;
    sa = 29;
    raj = 30;
    bho = 31;
    en_bio = 32;
    hi_en = 33;
  }

  // Optional free-form name for the language.
  optional string name = 1;
  // Language code. This is the field the `recognize` RPC binds to its URL
  // path as `config.language.sourceLanguage`.
  LanguageCode sourceLanguage = 2;
}

// One audio input of a SpeechRecognitionRequest, given by URI and/or inline
// bytes.
// NOTE(review): the two fields are independent `optional` fields rather than
// a oneof, so both or neither may be set; precedence is not defined in this
// file.
message RecognitionAudio {
  // Location of the audio as a string URI.
  optional string audioUri = 1;
  // Inline audio bytes.
  optional bytes audioContent = 2;
}

// Response message of the unary `recognize` RPC.
message SpeechRecognitionResult {
  // One recognition output entry.
  message Output {
    // NOTE(review): presumably the recognized text in the source language —
    // confirm against the server implementation.
    string source = 1;
  }

  // Outcome of the recognition call. The zero value is SUCCESS, so a message
  // whose status was never set reads as SUCCESS.
  enum Status {
    SUCCESS = 0;
    NO_MATCH = 1;
    INITIAL_SILENCE_TIMEOUT = 2;
    BABBLE_TIMEOUT = 3;
    ERROR = 4;
  }

  // Outcome of the call.
  Status status = 1;
  // Recognition outputs; may be empty.
  repeated Output output = 2;
  // Optional details about the recognized audio.
  optional RecognitionDetails config = 3;
  // Optional free-form text accompanying `status`. This is the only
  // snake_case field name in the message.
  optional string status_text = 4;
}

// Details reported in SpeechRecognitionResult.config.
message RecognitionDetails {
  // NOTE(review): presumably identifies the audio channel the result refers
  // to — confirm.
  int32 channelTag = 1;
  // Language associated with the result.
  Language language = 2;
  // NOTE(review): presumably a signal-to-noise ratio; units are not stated
  // here — confirm.
  int32 snr = 3;
  // Sampling rate as an integer. NOTE(review): units are not stated here
  // (presumably Hz); note that RecognitionConfig.samplingRate is int64 while
  // this field is int32.
  int32 samplingRate = 4;
  // Bits per sample as a plain integer (RecognitionConfig uses the
  // AudioBitsPerSample enum for its field of the same name).
  int32 bitsPerSample = 5;
}

// A word together with start and end times.
// NOTE(review): no other message or RPC in the visible part of this file
// references this type — confirm where it is used.
message Alternative {
  // The word text.
  string word = 1;
  // Start time as a string. NOTE(review): the time format is not defined in
  // this file.
  string startTime = 2;
  // End time as a string, in the same (undefined here) format as startTime.
  string endTime = 3;
}

// Request message of the unary `punctuate` RPC.
message PunctuateRequest {
  // Input text.
  string text = 1;
  // Language as a free-form string. The RPC's HTTP annotation binds this
  // field to the `{language}` segment of the URL path.
  string language = 2;
  // NOTE(review): presumably toggles inverse text normalization (compare
  // RecognitionConfig.enableInverseTextNormalization) — confirm.
  bool enabledItn = 3;
}

// Response message of the unary `punctuate` RPC.
message PunctuateResponse {
  // Output text. NOTE(review): presumably the punctuated form of
  // PunctuateRequest.text — confirm.
  string text = 1;
  // Language as a free-form string.
  string language = 2;
}