// Copyright 2024 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.mediatranslation.v1alpha1;

import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/rpc/status.proto";

option cc_enable_arenas = true;
option go_package = "cloud.google.com/go/mediatranslation/apiv1alpha1/mediatranslationpb;mediatranslationpb";
option java_package = "com.google.cloud.mediatranslation.v1alpha1";

// Provides translation from/to media types.
service SpeechTranslationService {
  option (google.api.default_host) = "mediatranslation.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/cloud-platform";

  // Performs bidirectional streaming speech translation: receive results while
  // sending audio. This method is only available via the gRPC API (not REST).
  rpc StreamingTranslateSpeech(stream StreamingTranslateSpeechRequest)
      returns (stream StreamingTranslateSpeechResponse) {}
}

// Provides information to the speech translation that specifies how to process
// the request.
message TranslateSpeechConfig {
  // Required. Encoding of audio data.
  // Supported formats:
  //
  // - `linear16`
  //
  //   Uncompressed 16-bit signed little-endian samples (Linear PCM).
  //
  // - `flac`
  //
  //   `flac` (Free Lossless Audio Codec) is the recommended encoding
  //   because it is lossless--therefore recognition is not compromised--and
  //   requires only about half the bandwidth of `linear16`.
  //
  // - `mulaw`
  //
  //   8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
  //
  // - `amr`
  //
  //   Adaptive Multi-Rate Narrowband codec. `sample_rate_hertz` must be 8000.
  //
  // - `amr-wb`
  //
  //   Adaptive Multi-Rate Wideband codec. `sample_rate_hertz` must be 16000.
  //
  // - `ogg-opus`
  //
  //   Opus encoded audio frames in Ogg container
  //   ([OggOpus](https://wiki.xiph.org/OggOpus)).
  //   `sample_rate_hertz` must be one of 8000, 12000, 16000, 24000, or 48000.
  //
  // - `mp3`
  //
  //   MP3 audio. Supports all standard MP3 bitrates (which range from 32-320
  //   kbps). When using this encoding, `sample_rate_hertz` has to match the
  //   sample rate of the file being used.
  string audio_encoding = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Source language code (BCP-47) of the input audio.
  string source_language_code = 2 [(google.api.field_behavior) = REQUIRED];

  // Required. Target language code (BCP-47) of the output.
  string target_language_code = 3 [(google.api.field_behavior) = REQUIRED];

  // Optional. A list of up to 3 additional language codes (BCP-47), listing
  // possible alternative languages of the supplied audio. If alternative
  // source languages are listed, speech translation result will translate in
  // the most likely language detected including the main
  // source_language_code. The translated result will include the language
  // code of the language detected in the audio.
  // Note:
  // 1. If the provided alternative_source_language_code is not supported
  // by current API version, we will skip that language code.
  // 2. If user only provided one eligible alternative_source_language_codes,
  // the translation will happen between source_language_code and
  // alternative_source_language_codes. The target_language_code will be
  // ignored. It will be useful in conversation mode.
  repeated string alternative_source_language_codes = 6
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. Sample rate in Hertz of the audio data. Valid values are:
  // 8000-48000. 16000 is optimal. For best results, set the sampling rate of
  // the audio source to 16000 Hz. If that's not possible, use the native sample
  // rate of the audio source (instead of re-sampling).
  int32 sample_rate_hertz = 4 [(google.api.field_behavior) = OPTIONAL];

  // NOTE(review): this file does not state which model identifiers are
  // accepted or what an empty value selects; confirm against the service
  // documentation and extend the field comment below accordingly.

  // Optional. Identifier of the model to use for this request.
  string model = 5 [(google.api.field_behavior) = OPTIONAL];
}

// Config used for streaming translation.
message StreamingTranslateSpeechConfig {
  // Required. The common config for all the following audio contents.
  TranslateSpeechConfig audio_config = 1
      [(google.api.field_behavior) = REQUIRED];

  // Optional. If `false` or omitted, the system performs
  // continuous translation (continuing to wait for and process audio even if
  // the user pauses speaking) until the client closes the input stream (gRPC
  // API) or until the maximum time limit has been reached. May return multiple
  // `StreamingTranslateSpeechResult`s with the `is_final` flag set to `true`.
  //
  // If `true`, the speech translator will detect a single spoken utterance.
  // When it detects that the user has paused or stopped speaking, it will
  // return an `END_OF_SINGLE_UTTERANCE` event and cease translation.
  // When the client receives `END_OF_SINGLE_UTTERANCE` event, the client should
  // stop sending the requests. However, clients should keep receiving remaining
  // responses until the stream is terminated. To construct the complete
  // sentence in a streaming way, one should override (if `is_final` of previous
  // response is `false`), or append (if `is_final` of previous response is
  // `true`).
  bool single_utterance = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Stability control for the media translation text. The value
  // should be one of "LOW", "MEDIUM", "HIGH". It applies to
  // text/text_and_audio translation only.
  // For audio translation mode, we only support HIGH stability mode,
  // low/medium stability mode will throw argument error.
  // Default empty string will be treated as "HIGH" in audio translation mode;
  // will be treated as "LOW" in other translation mode.
  // Note that stability and speed are a trade-off.
  // 1. "LOW": In low mode, translation service will start to do translation
  // right after getting recognition response. The speed will be faster.
  // 2. "MEDIUM": In medium mode, translation service will
  // check if the recognition response is stable enough or not, and only
  // translate recognition response which is not likely to be changed later.
  // 3. "HIGH": In high mode, translation service will wait for more stable
  // recognition responses, and then start to do translation. Also, the
  // following recognition responses cannot modify previous recognition
  // responses. Thus it may impact quality in some situations. "HIGH" stability
  // will generate "final" responses more frequently.
  string stability = 3 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Translation mode, the value should be one of "text", "audio",
  // "text_and_audio".
  // Default empty string will be treated as "text".
  // 1. "text": The response will be text translation. Text translation has a
  // field "is_final". Detailed definition can be found in
  // `TextTranslationResult`.
  // 2. "audio": The response will be audio translation. Audio translation does
  // not have "is_final" field, which means each audio translation response is
  // stable and will not be changed by later response.
  // Translation mode "audio" can only be used with "high" stability mode.
  // 3. "text_and_audio": The response will have a text translation, when
  // "is_final" is true, we will also output its corresponding audio
  // translation. When "is_final" is false, audio_translation field will be
  // empty.
  string translation_mode = 4 [(google.api.field_behavior) = OPTIONAL];

  // Optional. If disable_interim_results is true, we will only return "final"
  // responses. Otherwise, we will return all the responses. Default value will
  // be false. User can only set disable_interim_results to be true with "high"
  // stability mode.
  bool disable_interim_results = 5 [(google.api.field_behavior) = OPTIONAL];
}

// The top-level message sent by the client for the `StreamingTranslateSpeech`
// method. Multiple `StreamingTranslateSpeechRequest` messages are sent. The
// first message must contain a `streaming_config` message and must not contain
// `audio_content` data. All subsequent messages must contain `audio_content`
// data and must not contain a `streaming_config` message.
message StreamingTranslateSpeechRequest {
  // The streaming request, which is either a streaming config or content.
  oneof streaming_request {
    // Provides information to the recognizer that specifies how to process the
    // request. The first `StreamingTranslateSpeechRequest` message must contain
    // a `streaming_config` message.
    StreamingTranslateSpeechConfig streaming_config = 1;

    // The audio data to be translated. Sequential chunks of audio data are sent
    // in sequential `StreamingTranslateSpeechRequest` messages. The first
    // `StreamingTranslateSpeechRequest` message must not contain
    // `audio_content` data and all subsequent `StreamingTranslateSpeechRequest`
    // messages must contain `audio_content` data. The audio bytes must be
    // encoded as specified in `StreamingTranslateSpeechConfig`. Note: as with
    // all bytes fields, protocol buffers use a pure binary representation (not
    // base64).
    bytes audio_content = 2;
  }
}

// A streaming speech translation result corresponding to a portion of the audio
// that is currently being processed.
message StreamingTranslateSpeechResult {
  // Text translation result.
  message TextTranslationResult {
    // Output only. The translated sentence.
    string translation = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

    // Output only. If `false`, this `StreamingTranslateSpeechResult` represents
    // an interim result that may change. If `true`, this is the final time the
    // translation service will return this particular
    // `StreamingTranslateSpeechResult`, the streaming translator will not
    // return any further hypotheses for this portion of the transcript and
    // corresponding audio.
    bool is_final = 2 [(google.api.field_behavior) = OUTPUT_ONLY];
  }

  // Audio translation result.
  message AudioTranslationResult {
    // Output only. The translated audio.
    bytes audio_translation = 1 [(google.api.field_behavior) = OUTPUT_ONLY];
  }

  // Text translation result.
  TextTranslationResult text_translation_result = 1;

  // Audio translation result.
  AudioTranslationResult audio_translation_result = 2;

  // Output only. The debug only recognition result in original language. This
  // field is debug only and will be set to empty string if not available.
  // This is implementation detail and will not be backward compatible.
  string recognition_result = 3 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The language code (BCP-47) of the language detected in the
  // audio. See `alternative_source_language_codes` in `TranslateSpeechConfig`
  // for how alternative source languages are supplied.
  string detected_source_language_code = 4
      [(google.api.field_behavior) = OUTPUT_ONLY];
}

// A streaming speech translation response corresponding to a portion of
// the audio currently processed.
message StreamingTranslateSpeechResponse {
  // Indicates the type of speech event.
  enum SpeechEventType {
    // No speech event specified.
    SPEECH_EVENT_TYPE_UNSPECIFIED = 0;

    // This event indicates that the server has detected the end of the user's
    // speech utterance and expects no additional speech. Therefore, the server
    // will not process additional audio (although it may subsequently return
    // additional results). When the client receives `END_OF_SINGLE_UTTERANCE`
    // event, the client should stop sending the requests. However, clients
    // should keep receiving remaining responses until the stream is terminated.
    // To construct the complete sentence in a streaming way, one should
    // override (if `is_final` of previous response is `false`), or append (if
    // `is_final` of previous response is `true`). This event is only sent if
    // `single_utterance` was set to `true`, and is not used otherwise.
    END_OF_SINGLE_UTTERANCE = 1;
  }

  // Output only. If set, returns a [google.rpc.Status][google.rpc.Status]
  // message that specifies the error for the operation.
  google.rpc.Status error = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The translation result that is currently being processed.
  // For text translation, `is_final` could be `true` or `false`.
  // For audio translation, we do not have is_final field, which means each
  // audio response is stable and will not get changed later. For
  // text_and_audio, we still have `is_final` field in text translation, but we
  // only output corresponding audio when `is_final` is true.
  StreamingTranslateSpeechResult result = 2
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Indicates the type of speech event.
  SpeechEventType speech_event_type = 3
      [(google.api.field_behavior) = OUTPUT_ONLY];
}