// Copyright 2024 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.assistant.embedded.v1alpha2;

import "google/api/annotations.proto";
import "google/type/latlng.proto";

option go_package = "google.golang.org/genproto/googleapis/assistant/embedded/v1alpha2;embedded";
option java_multiple_files = true;
option java_outer_classname = "AssistantProto";
option java_package = "com.google.assistant.embedded.v1alpha2";
option objc_class_prefix = "ASTSDK";

// Service that implements the Google Assistant API.
service EmbeddedAssistant {
  // Initiates or continues a conversation with the embedded Assistant
  // Service. Each call performs one round-trip, sending an audio request to
  // the service and receiving the audio response. Uses bidirectional
  // streaming to receive results, such as the `END_OF_UTTERANCE` event,
  // while sending audio.
  //
  // A conversation is one or more gRPC connections, each consisting of
  // several streamed requests and responses.
  // For example, the user says *Add to my shopping list* and the Assistant
  // responds *What do you want to add?*. The sequence of streamed requests
  // and responses in the first gRPC connection could be:
  //
  // * AssistRequest.config
  // * AssistRequest.audio_in
  // * AssistRequest.audio_in
  // * AssistRequest.audio_in
  // * AssistRequest.audio_in
  // * AssistResponse.event_type.END_OF_UTTERANCE
  // * AssistResponse.speech_results.transcript "add to my shopping list"
  // * AssistResponse.dialog_state_out.microphone_mode.DIALOG_FOLLOW_ON
  // * AssistResponse.audio_out
  // * AssistResponse.audio_out
  // * AssistResponse.audio_out
  //
  // The user then says *bagels* and the Assistant responds
  // *OK, I've added bagels to your shopping list*. This is sent as another
  // gRPC connection, with another call to the `Assist` method, again with
  // streamed requests and responses, such as:
  //
  // * AssistRequest.config
  // * AssistRequest.audio_in
  // * AssistRequest.audio_in
  // * AssistRequest.audio_in
  // * AssistResponse.event_type.END_OF_UTTERANCE
  // * AssistResponse.dialog_state_out.microphone_mode.CLOSE_MICROPHONE
  // * AssistResponse.audio_out
  // * AssistResponse.audio_out
  // * AssistResponse.audio_out
  // * AssistResponse.audio_out
  //
  // Although the precise order of responses is not guaranteed, sequential
  // `AssistResponse.audio_out` messages will always contain sequential
  // portions of audio.
  rpc Assist(stream AssistRequest) returns (stream AssistResponse);
}
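
// Example (not part of the API definition): a minimal Python sketch of one
// `Assist` round-trip, assuming the generated bindings from the
// `google-assistant-grpc` package and an already-authorized gRPC `channel`.
// `assist_config` and `mic_chunks()` (an iterator of raw audio buffers) are
// hypothetical placeholders.
//
//     from google.assistant.embedded.v1alpha2 import (
//         embedded_assistant_pb2 as pb,
//         embedded_assistant_pb2_grpc as pb_grpc,
//     )
//
//     def requests(assist_config, chunks):
//         # First message: `config` only; all later messages: `audio_in` only.
//         yield pb.AssistRequest(config=assist_config)
//         for chunk in chunks:
//             yield pb.AssistRequest(audio_in=chunk)
//
//     stub = pb_grpc.EmbeddedAssistantStub(channel)
//     for resp in stub.Assist(requests(assist_config, mic_chunks()), 60):
//         if resp.event_type == pb.AssistResponse.END_OF_UTTERANCE:
//             pass  # a real client signals the microphone to stop here
//         if resp.audio_out.audio_data:
//             play(resp.audio_out.audio_data)  # hypothetical playback helper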

// The top-level message sent by the client. Clients must send at least two,
// and typically numerous `AssistRequest` messages. The first message must
// contain a `config` message and must not contain `audio_in` data. All
// subsequent messages must contain `audio_in` data and must not contain a
// `config` message.
message AssistRequest {
  // Exactly one of these fields must be specified in each `AssistRequest`.
  oneof type {
    // The `config` message provides information to the recognizer that
    // specifies how to process the request.
    // The first `AssistRequest` message must contain a `config` message.
    AssistConfig config = 1;

    // The audio data to be recognized. Sequential chunks of audio data are
    // sent in sequential `AssistRequest` messages. The first
    // `AssistRequest` message must not contain `audio_in` data and all
    // subsequent `AssistRequest` messages must contain `audio_in` data.
    // The audio bytes must be encoded as specified in `AudioInConfig`.
    // Audio must be sent at approximately real-time (16000 samples per
    // second). An error will be returned if audio is sent significantly
    // faster or slower.
    bytes audio_in = 2;
  }
}
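
// Example (not part of the API definition): because `audio_in` must arrive
// at approximately real-time, a pre-recorded query cannot simply be sent in
// one burst. A Python sketch of pacing headerless LINEAR16 audio, assuming
// 16000 Hz mono 16-bit samples:
//
//     import time
//
//     SAMPLE_RATE = 16000   # Hz; must match AudioInConfig.sample_rate_hertz
//     BYTES_PER_SAMPLE = 2  # 16-bit mono samples
//     CHUNK_BYTES = 3200    # 100 ms of audio per AssistRequest
//
//     def paced_chunks(pcm_bytes):
//         chunk_seconds = CHUNK_BYTES / (SAMPLE_RATE * BYTES_PER_SAMPLE)
//         for i in range(0, len(pcm_bytes), CHUNK_BYTES):
//             yield pcm_bytes[i:i + CHUNK_BYTES]
//             time.sleep(chunk_seconds)  # approximate real-time delivery
//
// Audio captured live from a microphone is naturally paced and needs no
// such throttling.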

// The top-level message received by the client. A series of one or more
// `AssistResponse` messages are streamed back to the client.
message AssistResponse {
  // Indicates the type of event.
  enum EventType {
    // No event specified.
    EVENT_TYPE_UNSPECIFIED = 0;

    // This event indicates that the server has detected the end of the
    // user's speech utterance and expects no additional speech. Therefore,
    // the server will not process additional audio (although it may
    // subsequently return additional results). The client should stop
    // sending additional audio data, half-close the gRPC connection, and
    // wait for any additional results until the server closes the gRPC
    // connection.
    END_OF_UTTERANCE = 1;
  }

  // *Output-only* Indicates the type of event.
  EventType event_type = 1;

  // *Output-only* The audio containing the Assistant's response to the
  // query.
  AudioOut audio_out = 3;

  // *Output-only* Contains the Assistant's visual response to the query.
  ScreenOut screen_out = 4;

  // *Output-only* Contains the action triggered by the query with the
  // appropriate payloads and semantic parsing.
  DeviceAction device_action = 6;

  // *Output-only* This repeated list contains zero or more speech
  // recognition results that correspond to consecutive portions of the
  // audio currently being processed, starting with the portion
  // corresponding to the earliest audio (and most stable portion) and
  // ending with the portion corresponding to the most recent audio. The
  // strings can be concatenated to view the full in-progress response.
  // When the speech recognition completes, this list will contain one item
  // with `stability` of `1.0`.
  repeated SpeechRecognitionResult speech_results = 2;

  // *Output-only* Contains output related to the user's query.
  DialogStateOut dialog_state_out = 5;

  // *Output-only* Debugging info for the developer. Only returned if the
  // request set `return_debug_info` to true.
  DebugInfo debug_info = 8;
}

// Debug info for the developer. Only returned if the request set
// `return_debug_info` to true.
message DebugInfo {
  // The original JSON response from an Actions on Google agent to the
  // Google server. See
  // https://developers.google.com/actions/reference/rest/Shared.Types/AppResponse.
  // It will only be populated if the request maker owns the AoG project and
  // the AoG project is in preview mode.
  string aog_agent_to_assistant_json = 1;
}

// Specifies how to process the `AssistRequest` messages.
message AssistConfig {
  oneof type {
    // Specifies how to process the subsequent incoming audio. Required if
    // [AssistRequest.audio_in][google.assistant.embedded.v1alpha2.AssistRequest.audio_in]
    // bytes will be provided in subsequent requests.
    AudioInConfig audio_in_config = 1;

    // The text input to be sent to the Assistant. This can be populated
    // from a text interface if audio input is not available.
    string text_query = 6;
  }

  // *Required* Specifies how to format the audio that will be returned.
  AudioOutConfig audio_out_config = 2;

  // *Optional* Specifies the desired format to use when the server returns
  // a visual screen response.
  ScreenOutConfig screen_out_config = 8;

  // *Required* Represents the current dialog state.
  DialogStateIn dialog_state_in = 3;

  // Device configuration that uniquely identifies a specific device.
  DeviceConfig device_config = 4;

  // *Optional* Debugging parameters for the whole `Assist` RPC.
  DebugConfig debug_config = 5;
}
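
// Example (not part of the API definition): a Python sketch of a complete
// `AssistConfig` for a voice query, with `pb` the `embedded_assistant_pb2`
// module as in the sketch above. The device identifiers and `saved_state`
// (the `conversation_state` returned by the prior turn, or empty) are
// hypothetical placeholders:
//
//     assist_config = pb.AssistConfig(
//         audio_in_config=pb.AudioInConfig(
//             encoding='LINEAR16', sample_rate_hertz=16000),
//         audio_out_config=pb.AudioOutConfig(
//             encoding='MP3', sample_rate_hertz=16000,
//             volume_percentage=50),
//         dialog_state_in=pb.DialogStateIn(
//             language_code='en-US', conversation_state=saved_state),
//         device_config=pb.DeviceConfig(
//             device_id='my-device-id',
//             device_model_id='my-device-model-id'))
//
// For a typed query, `text_query` would replace `audio_in_config`, since
// the two share a oneof.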

// Specifies how to process the `audio_in` data that will be provided in
// subsequent requests. For recommended settings, see the Google Assistant
// SDK [best
// practices](https://developers.google.com/assistant/sdk/guides/service/python/best-practices/audio).
message AudioInConfig {
  // Audio encoding of the data sent in the audio message.
  // Audio must be one-channel (mono).
  enum Encoding {
    // Not specified. Will return result
    // [google.rpc.Code.INVALID_ARGUMENT][].
    ENCODING_UNSPECIFIED = 0;

    // Uncompressed 16-bit signed little-endian samples (Linear PCM).
    // This encoding includes no header, only the raw audio bytes.
    LINEAR16 = 1;

    // [`FLAC`](https://xiph.org/flac/documentation.html) (Free Lossless
    // Audio Codec) is the recommended encoding because it is
    // lossless--therefore recognition is not compromised--and requires
    // only about half the bandwidth of `LINEAR16`. This encoding includes
    // the `FLAC` stream header followed by audio data. It supports 16-bit
    // and 24-bit samples; however, not all fields in `STREAMINFO` are
    // supported.
    FLAC = 2;
  }

  // *Required* Encoding of audio data sent in all `audio_in` messages.
  Encoding encoding = 1;

  // *Required* Sample rate (in Hertz) of the audio data sent in all
  // `audio_in` messages. Valid values are from 16000-24000, but 16000 is
  // optimal. For best results, set the sampling rate of the audio source
  // to 16000 Hz. If that's not possible, use the native sample rate of the
  // audio source (instead of re-sampling).
  int32 sample_rate_hertz = 2;
}

// Specifies the desired format for the server to use when it returns
// `audio_out` messages.
message AudioOutConfig {
  // Audio encoding of the data returned in the audio message. All
  // encodings are raw audio bytes with no header, except as indicated
  // below.
  enum Encoding {
    // Not specified. Will return result
    // [google.rpc.Code.INVALID_ARGUMENT][].
    ENCODING_UNSPECIFIED = 0;

    // Uncompressed 16-bit signed little-endian samples (Linear PCM).
    LINEAR16 = 1;

    // MP3 audio encoding. The sample rate is encoded in the payload.
    MP3 = 2;

    // Opus-encoded audio wrapped in an Ogg container. The result will be a
    // file which can be played natively on Android and in some browsers
    // (such as Chrome). The quality of the encoding is considerably higher
    // than MP3 while using the same bitrate. The sample rate is encoded in
    // the payload.
    OPUS_IN_OGG = 3;
  }

  // *Required* The encoding of audio data to be returned in all
  // `audio_out` messages.
  Encoding encoding = 1;

  // *Required* The sample rate in Hertz of the audio data returned in
  // `audio_out` messages. Valid values are: 16000-24000.
  int32 sample_rate_hertz = 2;

  // *Required* Current volume setting of the device's audio output.
  // Valid values are 1 to 100 (corresponding to 1% to 100%).
  int32 volume_percentage = 3;
}
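
// Example (not part of the API definition): `LINEAR16` audio is headerless,
// so a WAV file must have its header stripped before its samples are sent
// as `audio_in`. A Python sketch using the standard `wave` module; the
// file name is a hypothetical placeholder:
//
//     import wave
//
//     with wave.open('query.wav', 'rb') as w:
//         assert w.getnchannels() == 1      # audio must be mono
//         assert w.getsampwidth() == 2      # 16-bit signed samples
//         assert w.getframerate() == 16000  # match sample_rate_hertz
//         pcm_bytes = w.readframes(w.getnframes())  # headerless LINEAR16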

// Specifies the desired format for the server to use when it returns a
// `screen_out` response.
message ScreenOutConfig {
  // Possible modes for visual screen-output on the device.
  enum ScreenMode {
    // No video mode specified.
    // The Assistant may respond as if in `OFF` mode.
    SCREEN_MODE_UNSPECIFIED = 0;

    // Screen is off (or has brightness or other settings set so low it is
    // not visible). The Assistant will typically not return a screen
    // response in this mode.
    OFF = 1;

    // The Assistant will typically return a partial-screen response in
    // this mode.
    PLAYING = 3;
  }

  // Current visual screen-mode for the device while issuing the query.
  ScreenMode screen_mode = 1;
}

// Provides information about the current dialog state.
message DialogStateIn {
  // *Required* This field must always be set to the
  // [DialogStateOut.conversation_state][google.assistant.embedded.v1alpha2.DialogStateOut.conversation_state]
  // value that was returned in the prior `Assist` RPC. It should only be
  // omitted (field not set) if there was no prior `Assist` RPC because
  // this is the first `Assist` RPC made by this device after it was first
  // set up and/or after a factory-default reset.
  bytes conversation_state = 1;

  // *Required* Language of the request in
  // [IETF BCP 47 syntax](https://tools.ietf.org/html/bcp47) (for example,
  // "en-US"). See [Language
  // Support](https://developers.google.com/assistant/sdk/reference/rpc/languages)
  // for more information. If you have selected a language for this
  // `device_id` using the
  // [Settings](https://developers.google.com/assistant/sdk/reference/assistant-app/assistant-settings)
  // menu in your phone's Google Assistant app, that selection will
  // override this value.
  string language_code = 2;

  // *Optional* Location of the device where the query originated.
  DeviceLocation device_location = 5;

  // *Optional* If true, the server will treat the request as a new
  // conversation and not use state from the prior request. Set this field
  // to true when the conversation should be restarted, such as after a
  // device reboot, or after a significant lapse of time since the prior
  // query.
  bool is_new_conversation = 7;
}

// *Required* Fields that identify the device to the Assistant.
//
// See also:
//
// * [Register a Device - REST
//   API](https://developers.google.com/assistant/sdk/reference/device-registration/register-device-manual)
// * [Device Model and Instance
//   Schemas](https://developers.google.com/assistant/sdk/reference/device-registration/model-and-instance-schemas)
// * [Device
//   Proto](https://developers.google.com/assistant/sdk/reference/rpc/google.assistant.devices.v1alpha2#device)
message DeviceConfig {
  // *Required* Unique identifier for the device. The id length must be 128
  // characters or less. Example: DBCDW098234. This MUST match the
  // device_id returned from device registration. This device_id is used to
  // match against the user's registered devices to look up the supported
  // traits and capabilities of this device. This information should not
  // change across device reboots. However, it should not be saved across
  // factory-default resets.
  string device_id = 1;

  // *Required* Unique identifier for the device model. The combination of
  // device_model_id and device_id must have been previously associated
  // through device registration.
  string device_model_id = 3;
}

// The audio containing the Assistant's response to the query. Sequential
// chunks of audio data are received in sequential `AssistResponse`
// messages.
message AudioOut {
  // *Output-only* The audio data containing the Assistant's response to
  // the query. Sequential chunks of audio data are received in sequential
  // `AssistResponse` messages.
  bytes audio_data = 1;
}

// The Assistant's visual output response to the query. Enabled by
// `screen_out_config`.
message ScreenOut {
  // Possible formats of the screen data.
  enum Format {
    // No format specified.
    FORMAT_UNSPECIFIED = 0;

    // Data will contain a fully-formed HTML5 layout encoded in UTF-8, e.g.
    // `<html><body><div>...</div></body></html>`. It is intended to be
    // rendered along with the audio response. Note that HTML5 doctype
    // should be included in the actual HTML data.
    HTML = 1;
  }

  // *Output-only* The format of the provided screen data.
  Format format = 1;

  // *Output-only* The raw screen data to be displayed as the result of the
  // Assistant query.
  bytes data = 2;
}

// The response returned to the device if the user has triggered a Device
// Action. For example, a device which supports the query *Turn on the
// light* would receive a `DeviceAction` with a JSON payload containing the
// semantics of the request.
message DeviceAction {
  // JSON containing the device command response generated from the
  // triggered Device Action grammar. The format is given by the
  // `action.devices.EXECUTE` intent for a given
  // [trait](https://developers.google.com/assistant/sdk/reference/traits/).
  string device_request_json = 1;
}

// The estimated transcription of a phrase the user has spoken. This could
// be a single segment or the full guess of the user's spoken query.
message SpeechRecognitionResult {
  // *Output-only* Transcript text representing the words that the user
  // spoke.
  string transcript = 1;

  // *Output-only* An estimate of the likelihood that the Assistant will
  // not change its guess about this result. Values range from 0.0
  // (completely unstable) to 1.0 (completely stable and final). The
  // default of 0.0 is a sentinel value indicating `stability` was not set.
  float stability = 2;
}
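
// Example (not part of the API definition): a Python sketch of dispatching
// the `DeviceAction` payload defined above. The payload follows the
// `action.devices.EXECUTE` intent shape; the trait name and `set_light`
// handler below are illustrative only:
//
//     import json
//
//     def handle_device_action(device_request_json):
//         request = json.loads(device_request_json)
//         for input_ in request.get('inputs', []):
//             if input_.get('intent') != 'action.devices.EXECUTE':
//                 continue
//             for command in input_['payload']['commands']:
//                 for execution in command['execution']:
//                     if execution['command'] == 'action.devices.commands.OnOff':
//                         set_light(execution['params']['on'])  # hypothetical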

// The dialog state resulting from the user's query. Multiple of these
// messages may be received.
message DialogStateOut {
  // Possible states of the microphone after an `Assist` RPC completes.
  enum MicrophoneMode {
    // No mode specified.
    MICROPHONE_MODE_UNSPECIFIED = 0;

    // The service is not expecting a follow-on question from the user.
    // The microphone should remain off until the user re-activates it.
    CLOSE_MICROPHONE = 1;

    // The service is expecting a follow-on question from the user. The
    // microphone should be re-opened when the `AudioOut` playback
    // completes (by starting a new `Assist` RPC call to send the new
    // audio).
    DIALOG_FOLLOW_ON = 2;
  }

  // *Output-only* Supplemental display text from the Assistant. This could
  // be the same as the speech spoken in `AssistResponse.audio_out` or it
  // could be some additional information which aids the user's
  // understanding.
  string supplemental_display_text = 1;

  // *Output-only* State information for the subsequent `Assist` RPC. This
  // value should be saved in the client and returned in the
  // [`DialogStateIn.conversation_state`](#dialogstatein) field with the
  // next `Assist` RPC. (The client does not need to interpret or otherwise
  // use this value.) This information should be saved across device
  // reboots. However, this value should be cleared (not saved in the
  // client) during a factory-default reset.
  bytes conversation_state = 2;

  // *Output-only* Specifies the mode of the microphone after this `Assist`
  // RPC is processed.
  MicrophoneMode microphone_mode = 3;

  // *Output-only* Updated volume level. The value will be 0 or omitted
  // (indicating no change) unless a voice command such as *Increase the
  // volume* or *Set volume level 4* was recognized, in which case the
  // value will be between 1 and 100 (corresponding to the new volume level
  // of 1% to 100%). Typically, a client should use this volume level when
  // playing the `audio_out` data, and retain this value as the current
  // volume level, supplying it in the `AudioOutConfig` of the next
  // `AssistRequest`. (Some clients may also implement other ways to allow
  // the current volume level to be changed, for example, by providing a
  // knob that the user can turn.)
  int32 volume_percentage = 4;
}

// Debugging parameters for the current request.
message DebugConfig {
  // When this field is set to true, the `debug_info` field in
  // `AssistResponse` may be populated. However, it will significantly
  // increase the latency of responses. Do not set this field to true in
  // production code.
  bool return_debug_info = 6;
}

// There are three sources of locations. They are used with this
// precedence:
//
// 1. This `DeviceLocation`, which is primarily used for mobile devices
//    with GPS.
// 2. Location specified by the user during device setup; this is per-user,
//    per-device. This location is used if `DeviceLocation` is not
//    specified.
// 3. Inferred location based on IP address. This is used only if neither
//    of the above is specified.
message DeviceLocation {
  oneof type {
    // Latitude and longitude of device.
    google.type.LatLng coordinates = 1;
  }
}
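
// Example (not part of the API definition): a Python sketch of end-of-turn
// handling for `DialogStateOut`, with `pb` the `embedded_assistant_pb2`
// module as in the earlier sketches and `resp` the final `AssistResponse`
// of a turn. `save_state`, `set_volume`, and `start_new_turn` are
// hypothetical helpers:
//
//     if resp.dialog_state_out.conversation_state:
//         save_state(resp.dialog_state_out.conversation_state)  # persist
//     if resp.dialog_state_out.volume_percentage:
//         set_volume(resp.dialog_state_out.volume_percentage)  # 1-100
//     follow_on = (resp.dialog_state_out.microphone_mode ==
//                  pb.DialogStateOut.DIALOG_FOLLOW_ON)
//     if follow_on:
//         start_new_turn()  # re-open the microphone after playback ends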