// embedded_assistant.proto (from googlesamples/assistant-sdk-cpp)
syntax = "proto3";

package google.assistant.embedded.v1alpha1;

option java_multiple_files = true;
option java_outer_classname = "AssistantProto";
option java_package = "com.google.assistant.embedded.v1alpha1";

import "google/api/annotations.proto";
import "google/rpc/status.proto";

// Service that implements Google Assistant API.
service EmbeddedAssistant {
  // Initiates or continues a conversation with the embedded assistant service.
  // Each call performs one round-trip, sending an audio request to the service
  // and receiving the audio response. Uses bidirectional streaming to receive
  // results, such as the `END_OF_UTTERANCE` event, while sending audio.
  //
  // A conversation is one or more gRPC connections, each consisting of several
  // streamed requests and responses.
  // For example, the user says *Add to my shopping list* and the assistant
  // responds *What do you want to add?*. The sequence of streamed requests and
  // responses in the first gRPC call could be:
  //
  // * ConverseRequest.config
  // * ConverseRequest.audio_in
  // * ConverseResponse.interim_spoken_request_text "add"
  // * ConverseRequest.audio_in
  // * ConverseResponse.interim_spoken_request_text "add to my"
  // * ConverseRequest.audio_in
  // * ConverseResponse.interim_spoken_request_text "add to my shopping"
  // * ConverseRequest.audio_in
  // * ConverseResponse.event_type.END_OF_UTTERANCE
  // * ConverseResponse.result.microphone_mode.DIALOG_FOLLOW_ON
  // * ConverseResponse.audio_out
  // * ConverseResponse.audio_out
  // * ConverseResponse.audio_out
  //
  // Note that interim_spoken_request_text is returned asynchronously; it may
  // arrive interleaved in any order with other responses before the
  // END_OF_UTTERANCE event occurs.
  //
  // The user then says *bagels* and the assistant responds
  // *OK, I've added bagels to your shopping list*. This is sent as another gRPC
  // connection call to the `Converse` method, again with streamed requests and
  // responses, such as:
  //
  // * ConverseRequest.config
  // * ConverseRequest.audio_in
  // * ConverseRequest.audio_in
  // * ConverseRequest.audio_in
  // * ConverseResponse.event_type.END_OF_UTTERANCE
  // * ConverseResponse.result.microphone_mode.CLOSE_MICROPHONE
  // * ConverseResponse.screen_out
  // * ConverseResponse.audio_out
  // * ConverseResponse.audio_out
  // * ConverseResponse.audio_out
  // * ConverseResponse.audio_out
  //
  // Although the precise order of responses is not guaranteed, sequential
  // ConverseResponse.audio_out messages will always contain sequential portions
  // of audio.
  rpc Converse(stream ConverseRequest) returns (stream ConverseResponse);
}

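// A minimal, non-normative C++ sketch of one `Converse` round-trip, assuming
// the generated gRPC stub for this service and an already-authenticated
// `channel`; `MakeConverseConfig` and `ReadAudioChunk` are hypothetical
// helpers, not part of this API:
//
//   auto stub = EmbeddedAssistant::NewStub(channel);
//   grpc::ClientContext ctx;
//   auto stream = stub->Converse(&ctx);
//
//   ConverseRequest req;                           // first message: config only
//   *req.mutable_config() = MakeConverseConfig();  // hypothetical helper
//   stream->Write(req);
//
//   std::string chunk;
//   while (ReadAudioChunk(&chunk)) {               // hypothetical audio source
//     ConverseRequest audio;                       // later messages: audio only
//     audio.set_audio_in(chunk);
//     stream->Write(audio);
//   }
//   stream->WritesDone();                          // half-close the stream
//
//   ConverseResponse resp;
//   while (stream->Read(&resp)) {
//     // Dispatch on the response oneof; see ConverseResponse below.
//   }
//   grpc::Status status = stream->Finish();
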
// Specifies how to process the `ConverseRequest` messages.
message ConverseConfig {
  // *Required* Specifies how to process the subsequent incoming audio.
  AudioInConfig audio_in_config = 1;

  // *Required* Specifies how to format the audio that will be returned.
  AudioOutConfig audio_out_config = 2;

  // *Required* Represents the current dialog state.
  ConverseState converse_state = 3;

  // Device configuration that uniquely identifies a specific device.
  DeviceConfig device_config = 4;
}

// Specifies how to process the `audio_in` data that will be provided in
// subsequent requests. For recommended settings, see the Google Assistant SDK
// [best practices](https://developers.google.com/assistant/sdk/best-practices/audio).
message AudioInConfig {
  // Audio encoding of the data sent in the audio message.
  // Audio must be one-channel (mono). The only language supported is "en-US".
  enum Encoding {
    // Not specified. Will return result [google.rpc.Code.INVALID_ARGUMENT][].
    ENCODING_UNSPECIFIED = 0;

    // Uncompressed 16-bit signed little-endian samples (Linear PCM).
    // This encoding includes no header, only the raw audio bytes.
    LINEAR16 = 1;

    // [`FLAC`](https://xiph.org/flac/documentation.html) (Free Lossless Audio
    // Codec) is the recommended encoding because it is
    // lossless--therefore recognition is not compromised--and
    // requires only about half the bandwidth of `LINEAR16`. This encoding
    // includes the `FLAC` stream header followed by audio data. It supports
    // 16-bit and 24-bit samples; however, not all fields in `STREAMINFO` are
    // supported.
    FLAC = 2;
  }

  // *Required* Encoding of audio data sent in all `audio_in` messages.
  Encoding encoding = 1;

  // *Required* Sample rate (in Hertz) of the audio data sent in all `audio_in`
  // messages. Valid values are 16000-24000, but 16000 is optimal.
  // For best results, set the sampling rate of the audio source to 16000 Hz.
  // If that's not possible, use the native sample rate of the audio source
  // (instead of re-sampling).
  int32 sample_rate_hertz = 2;
}

// Specifies the desired format for the server to use when it returns
// `audio_out` messages.
message AudioOutConfig {
  // Audio encoding of the data returned in the audio message. All encodings are
  // raw audio bytes with no header, except as indicated below.
  enum Encoding {
    // Not specified. Will return result [google.rpc.Code.INVALID_ARGUMENT][].
    ENCODING_UNSPECIFIED = 0;

    // Uncompressed 16-bit signed little-endian samples (Linear PCM).
    LINEAR16 = 1;

    // MP3 audio encoding. The sample rate is encoded in the payload.
    MP3 = 2;

    // Opus-encoded audio wrapped in an ogg container. The result will be a
    // file which can be played natively on Android and in some browsers (such
    // as Chrome). The quality of the encoding is considerably higher than MP3
    // while using the same bitrate. The sample rate is encoded in the payload.
    OPUS_IN_OGG = 3;
  }

  // *Required* The encoding of audio data to be returned in all `audio_out`
  // messages.
  Encoding encoding = 1;

  // *Required* The sample rate in Hertz of the audio data returned in
  // `audio_out` messages. Valid values are: 16000-24000.
  int32 sample_rate_hertz = 2;

  // *Required* Current volume setting of the device's audio output.
  // Valid values are 1 to 100 (corresponding to 1% to 100%).
  int32 volume_percentage = 3;
}

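// A non-normative sketch of assembling the two configs above into a
// `ConverseConfig` in C++ (the `MakeConverseConfig` helper name is
// hypothetical; values follow the recommendations in the comments):
//
//   ConverseConfig MakeConverseConfig() {
//     ConverseConfig config;
//     auto* in = config.mutable_audio_in_config();
//     in->set_encoding(AudioInConfig::LINEAR16);
//     in->set_sample_rate_hertz(16000);    // 16000 Hz is optimal
//     auto* out = config.mutable_audio_out_config();
//     out->set_encoding(AudioOutConfig::MP3);
//     out->set_sample_rate_hertz(16000);
//     out->set_volume_percentage(50);      // current device volume
//     // converse_state and device_config are filled in elsewhere as needed.
//     return config;
//   }
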
// Provides information about the current dialog state.
message ConverseState {
  // *Required* The `conversation_state` value returned in the prior
  // `ConverseResponse`. Omit (do not set the field) if there was no prior
  // `ConverseResponse`. If there was a prior `ConverseResponse`, do not omit
  // this field; doing so will end that conversation (and this new request will
  // start a new conversation).
  bytes conversation_state = 1;

  // *Optional* A way to provide context to assist the Assistant.
  ConverseContext context = 3;

  // Specifies that the user is in SIGN_OUT mode, meaning that user credentials
  // are not available on the client. Note that the client still needs to
  // provide valid credentials for server-to-server authentication (for
  // example, a service account).
  bool is_signed_out_mode = 4;
}

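// Non-normative sketch of round-tripping `conversation_state` in C++:
//
//   // After a ConverseResponse carrying a result, save the state:
//   if (resp.has_result()) saved_state = resp.result().conversation_state();
//
//   // On the next request, echo it back in the config:
//   config.mutable_converse_state()->set_conversation_state(saved_state);
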
// The audio containing the Assistant's response to the query. Sequential chunks
// of audio data are received in sequential `ConverseResponse` messages.
message AudioOut {
  // *Output-only* The audio data containing the assistant's response to the
  // query. Sequential chunks of audio data are received in sequential
  // `ConverseResponse` messages.
  bytes audio_data = 1;
}

// The visual containing the Assistant's response to the query. Contains the
// whole visual output.
message ScreenOut {
  // Possible formats of the visual data.
  enum Format {
    // No format specified.
    FORMAT_UNSPECIFIED = 0;

    // Data will contain a fully-formed HTML5 layout encoded in UTF-8, e.g.
    // "<html><body><div>...</div></body></html>". It is intended to be rendered
    // along with the audio response. Note that the HTML5 doctype should be
    // included in the actual HTML data.
    HTML = 1;
  }

  // *Output-only* The format of the provided visual data.
  Format format = 1;

  // *Output-only* The raw visual data to be displayed as the result of the
  // Assistant query.
  bytes data = 2;
}

// The semantic result for the user's spoken query. Multiple of these messages
// may be received: for example, one containing the recognized transcript in
// spoken_request_text, followed by one containing the semantics of the
// response, i.e. the relevant data among conversation_state, microphone_mode,
// and volume_percentage.
message ConverseResult {
  // *Output-only* The recognized transcript of what the user said.
  string spoken_request_text = 1;

  // *Output-only* The text of the assistant's spoken response. This is only
  // returned for an IFTTT action.
  string spoken_response_text = 2;

  // *Output-only* State information for the subsequent `ConverseRequest`. This
  // value should be saved in the client and returned in the
  // `conversation_state` with the next `ConverseRequest`. (The client does not
  // need to interpret or otherwise use this value.) There is no need to save
  // this information across device restarts.
  bytes conversation_state = 3;

  // Possible states of the microphone after a `Converse` RPC completes.
  enum MicrophoneMode {
    // No mode specified.
    MICROPHONE_MODE_UNSPECIFIED = 0;

    // The service is not expecting a follow-on question from the user.
    // The microphone should remain off until the user re-activates it.
    CLOSE_MICROPHONE = 1;

    // The service is expecting a follow-on question from the user. The
    // microphone should be re-opened when the `AudioOut` playback completes
    // (by starting a new `Converse` RPC call to send the new audio).
    DIALOG_FOLLOW_ON = 2;
  }

  // *Output-only* Specifies the mode of the microphone after this `Converse`
  // RPC is processed.
  MicrophoneMode microphone_mode = 4;

  // *Output-only* Updated volume level. The value will be 0 or omitted
  // (indicating no change) unless a voice command such as "Increase the volume"
  // or "Set volume level 4" was recognized, in which case the value will be
  // between 1 and 100 (corresponding to the new volume level of 1% to 100%).
  // Typically, a client should use this volume level when playing the
  // `audio_out` data, and retain this value as the current volume level and
  // supply it in the `AudioOutConfig` of the next `ConverseRequest`. (Some
  // clients may also implement other ways to allow the current volume level to
  // be changed, for example, by providing a knob that the user can turn.)
  int32 volume_percentage = 5;
}

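// Non-normative sketch of consuming a `ConverseResult` in C++ (`player` and
// `OpenMicrophone` are hypothetical):
//
//   const ConverseResult& result = resp.result();
//   if (result.volume_percentage() > 0) {
//     player.set_volume(result.volume_percentage());  // retain for next config
//   }
//   if (result.microphone_mode() == ConverseResult::DIALOG_FOLLOW_ON) {
//     OpenMicrophone();  // after audio_out playback, start a new Converse RPC
//   }
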
// The top-level message sent by the client. Clients must send at least two, and
// typically numerous `ConverseRequest` messages. The first message must
// contain a `config` message and must not contain `audio_in` data. All
// subsequent messages must contain `audio_in` data and must not contain a
// `config` message.
message ConverseRequest {
  // Exactly one of these fields must be specified in each `ConverseRequest`.
  oneof converse_request {
    // The `config` message provides information to the recognizer that
    // specifies how to process the request.
    // The first `ConverseRequest` message must contain a `config` message.
    ConverseConfig config = 1;

    // The audio data to be recognized. Sequential chunks of audio data are sent
    // in sequential `ConverseRequest` messages. The first `ConverseRequest`
    // message must not contain `audio_in` data and all subsequent
    // `ConverseRequest` messages must contain `audio_in` data. The audio bytes
    // must be encoded as specified in `AudioInConfig`.
    // Audio must be sent at approximately real-time (16000 samples per second).
    // An error will be returned if audio is sent significantly faster or
    // slower.
    bytes audio_in = 2;
  }
}

// The top-level message received by the client. A series of one or more
// `ConverseResponse` messages are streamed back to the client.
message ConverseResponse {
  // Indicates the type of event.
  enum EventType {
    // No event specified.
    EVENT_TYPE_UNSPECIFIED = 0;

    // This event indicates that the server has detected the end of the user's
    // speech utterance and expects no additional speech. Therefore, the server
    // will not process additional audio (although it may subsequently return
    // additional results). The client should stop sending additional audio
    // data, half-close the gRPC connection, and wait for any additional results
    // until the server closes the gRPC connection.
    END_OF_UTTERANCE = 1;
  }

  // Exactly one of these fields will be populated in each `ConverseResponse`.
  oneof converse_response {
    // *Output-only* If set, returns a [google.rpc.Status][] message that
    // specifies the error for the operation.
    // If an error occurs during processing, this message will be set and there
    // will be no further messages sent.
    google.rpc.Status error = 1;

    // *Output-only* Indicates the type of event.
    EventType event_type = 2;

    // *Output-only* The audio containing the Assistant's response to the query.
    AudioOut audio_out = 3;

    // *Output-only* The visual containing the Assistant's response to the
    // query.
    ScreenOut screen_out = 8;

    // *Output-only* Contains the action triggered by the query with the
    // appropriate payloads and semantic parsing.
    DeviceAction device_action = 9;

    // *Output-only* The interim recognition result that corresponds to the
    // audio currently being processed. The text returned is purely
    // informational and will not be issued as commands to the Assistant. Once
    // the user's spoken query has been fully recognized, the final complete
    // utterance issued to the Assistant is returned in `spoken_request_text` in
    // [ConverseResult](google.assistant.embedded.v1alpha1.ConverseResult).
    InterimSpokenRequestText interim_spoken_request_text = 6;

    // *Output-only* The final semantic result for the user's spoken query.
    ConverseResult result = 5;
  }
}

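// Non-normative sketch of dispatching on the response oneof in C++
// (`PlayAudio` and `HandleResult` are hypothetical):
//
//   ConverseResponse resp;
//   while (stream->Read(&resp)) {
//     switch (resp.converse_response_case()) {
//       case ConverseResponse::kEventType:
//         if (resp.event_type() == ConverseResponse::END_OF_UTTERANCE)
//           stream->WritesDone();  // stop sending audio, keep reading
//         break;
//       case ConverseResponse::kAudioOut:
//         PlayAudio(resp.audio_out().audio_data());
//         break;
//       case ConverseResponse::kResult:
//         HandleResult(resp.result());
//         break;
//       default:
//         break;
//     }
//   }
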
// The interim speech recognition text as the user is speaking. Contains the
// estimated transcription of what the user has spoken thus far from the input
// audio. It can be used to display the current guess of the user's query.
message InterimSpokenRequestText {
  // *Output-only* Transcript text representing the words that the user spoke.
  string transcript = 1;

  // *Output-only* An estimate of the likelihood that the Assistant will not
  // change its guess about this result. Values range from 0.0 (completely
  // unstable) to 1.0 (completely stable). The default of 0.0 is a sentinel
  // value indicating `stability` was not set.
  double stability = 2;
}

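// For example, a client might refresh its on-screen transcript only for
// reasonably stable guesses; the 0.5 threshold and `ShowTranscript` helper
// below are illustrative assumptions, not part of the API contract:
//
//   if (resp.has_interim_spoken_request_text() &&
//       resp.interim_spoken_request_text().stability() > 0.5) {
//     ShowTranscript(resp.interim_spoken_request_text().transcript());
//   }
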
// The identification information for devices integrated with the Assistant.
// These fields should be populated for any queries sent from 3P devices.
message DeviceConfig {
  // *Required* Unique identifier for the device. Example: DBCDW098234. This
  // MUST match the device_id returned from device registration. This device_id
  // is matched against the user's registered devices to look up the supported
  // traits and capabilities of this device.
  string device_id = 1;

  // *Optional* The model of this device as registered in the Device Model
  // Registration API. This is only required for syndication partners.
  string device_model_id = 3;
}

// Context provided to the Assistant to enhance its capabilities for the
// Converse API.
message ConverseContext {
  // *Optional* A list of word and phrase "hints" so that speech recognition is
  // more likely to recognize them. This can be used to improve the accuracy
  // for specific words and phrases, such as words or commands that the user is
  // likely to say (for example, choices displayed on-screen).
  repeated string hinted_phrases = 1;

  // *Optional* Custom context from the device passed along to the 3P
  // fulfillment service. This can provide information about the device's
  // current state. This data is opaque to the Assistant and is defined by the
  // 3P provider, such as a JSON string or binary data. Binary data should be
  // base64 encoded.
  string third_party_context = 2;
}

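// Non-normative sketch of supplying hints in C++:
//
//   auto* context = config.mutable_converse_state()->mutable_context();
//   context->add_hinted_phrases("bagels");
//   context->add_hinted_phrases("whole wheat bread");
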
// The response returned to the device if any 3P Custom Device Grammar is
// triggered. The 3P Custom Device Grammar is enabled through the specific
// [DeviceConfig](google.assistant.embedded.v1alpha1.DeviceConfig) provided by
// this device, and should be handled appropriately. For example, a 3P device
// which supports the customized query "do a dance" would receive a DeviceAction
// with action_type: "device_control" and a JSON payload containing the
// semantics of the request.
message DeviceAction {
  // The type of action to be executed on the device.
  enum ActionType {
    // Unknown action type.
    ACTION_TYPE_UNSPECIFIED = 0;

    // 3P custom device action.
    DEVICE_CONTROL = 1;
  }

  // What type of action this DeviceAction contains.
  ActionType action_type = 1;

  // JSON containing the device control response generated from the triggered 3P
  // Custom Device Grammar. The format is given by the [action.devices.EXECUTE](
  // https://developers.google.com/actions/smarthome/create-app#actiondevicesexecute)
  // request type.
  string device_control_json = 2;
}
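
// Non-normative sketch of handling a DeviceAction in C++; the JSON library
// (nlohmann::json) is an assumption, not part of this API:
//
//   #include <nlohmann/json.hpp>
//
//   if (action.action_type() == DeviceAction::DEVICE_CONTROL) {
//     auto payload = nlohmann::json::parse(action.device_control_json());
//     // Inspect payload["inputs"] per the action.devices.EXECUTE format.
//   }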