// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.mediatranslation.v1beta1;

import "google/api/field_behavior.proto";
import "google/rpc/status.proto";
import "google/api/client.proto";

option cc_enable_arenas = true;
option go_package = "cloud.google.com/go/mediatranslation/apiv1beta1/mediatranslationpb;mediatranslationpb";
option java_multiple_files = true;
option java_outer_classname = "MediaTranslationProto";
option java_package = "com.google.cloud.mediatranslation.v1beta1";
option csharp_namespace = "Google.Cloud.MediaTranslation.V1Beta1";
option ruby_package = "Google::Cloud::MediaTranslation::V1beta1";
option php_namespace = "Google\\Cloud\\MediaTranslation\\V1beta1";

// Provides translation from/to media types.
service SpeechTranslationService {
  option (google.api.default_host) = "mediatranslation.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/cloud-platform";

  // Performs bidirectional streaming speech translation: the client receives
  // results while it is still sending audio. This method is only available
  // via the gRPC API (not REST).
  rpc StreamingTranslateSpeech(stream StreamingTranslateSpeechRequest)
      returns (stream StreamingTranslateSpeechResponse) {}
}

// Provides information to the speech translator that specifies how to process
// the request.
message TranslateSpeechConfig {
  // Required. Encoding of audio data.
  // Supported formats:
  //
  // - `linear16`
  //
  //   Uncompressed 16-bit signed little-endian samples (Linear PCM).
  //
  // - `flac`
  //
  //   `flac` (Free Lossless Audio Codec) is the recommended encoding
  //   because it is lossless--therefore recognition is not compromised--and
  //   requires only about half the bandwidth of `linear16`.
  //
  // - `mulaw`
  //
  //   8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
  //
  // - `amr`
  //
  //   Adaptive Multi-Rate Narrowband codec. `sample_rate_hertz` must be 8000.
  //
  // - `amr-wb`
  //
  //   Adaptive Multi-Rate Wideband codec. `sample_rate_hertz` must be 16000.
  //
  // - `ogg-opus`
  //
  //   Opus encoded audio frames in [Ogg](https://wikipedia.org/wiki/Ogg)
  //   container. `sample_rate_hertz` must be one of 8000, 12000, 16000, 24000,
  //   or 48000.
  //
  // - `mp3`
  //
  //   MP3 audio. Supports all standard MP3 bitrates (which range from 32-320
  //   kbps). When using this encoding, `sample_rate_hertz` has to match the
  //   sample rate of the file being used.
  string audio_encoding = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Source language code (BCP-47) of the input audio.
  string source_language_code = 2 [(google.api.field_behavior) = REQUIRED];

  // Required. Target language code (BCP-47) of the output.
  string target_language_code = 3 [(google.api.field_behavior) = REQUIRED];

  // Optional. Sample rate in Hertz of the audio data. Valid values are:
  // 8000-48000. 16000 is optimal. For best results, set the sampling rate of
  // the audio source to 16000 Hz. If that's not possible, use the native sample
  // rate of the audio source (instead of re-sampling).
  int32 sample_rate_hertz = 4 [(google.api.field_behavior) = OPTIONAL];

  // Optional. `google-provided-model/video` and
  // `google-provided-model/enhanced-phone-call` are premium models.
  // `google-provided-model/phone-call` is not a premium model.
  string model = 5 [(google.api.field_behavior) = OPTIONAL];
}

// Config used for streaming translation.
message StreamingTranslateSpeechConfig {
  // Required. The common config for all the following audio contents.
  TranslateSpeechConfig audio_config = 1
      [(google.api.field_behavior) = REQUIRED];

  // Optional. If `false` or omitted, the system performs continuous
  // translation (continuing to wait for and process audio even if the user
  // pauses speaking) until the client closes the input stream (gRPC API) or
  // until the maximum time limit has been reached. May return multiple
  // `StreamingTranslateSpeechResult`s with the `is_final` flag set to `true`.
  //
  // If `true`, the speech translator will detect a single spoken utterance.
  // When it detects that the user has paused or stopped speaking, it will
  // return an `END_OF_SINGLE_UTTERANCE` event and cease translation.
  //
  // When the client receives the `END_OF_SINGLE_UTTERANCE` event, the client
  // should stop sending requests. However, clients should keep receiving the
  // remaining responses until the stream is terminated. To construct the
  // complete sentence in a streaming way, one should override (if `is_final`
  // of the previous response is false) or append (if `is_final` of the
  // previous response is true).
  bool single_utterance = 2 [(google.api.field_behavior) = OPTIONAL];
}

// The top-level message sent by the client for the `StreamingTranslateSpeech`
// method. Multiple `StreamingTranslateSpeechRequest` messages are sent. The
// first message must contain a `streaming_config` message and must not contain
// `audio_content` data. All subsequent messages must contain `audio_content`
// data and must not contain a `streaming_config` message.
message StreamingTranslateSpeechRequest {
  // The streaming request, which is either a streaming config or content.
  oneof streaming_request {
    // Provides information to the speech translator that specifies how to
    // process the request. The first `StreamingTranslateSpeechRequest` message
    // must contain a `streaming_config` message.
    StreamingTranslateSpeechConfig streaming_config = 1;

    // The audio data to be translated. Sequential chunks of audio data are sent
    // in sequential `StreamingTranslateSpeechRequest` messages. The first
    // `StreamingTranslateSpeechRequest` message must not contain
    // `audio_content` data and all subsequent `StreamingTranslateSpeechRequest`
    // messages must contain `audio_content` data. The audio bytes must be
    // encoded as specified in `StreamingTranslateSpeechConfig`. Note: as with
    // all bytes fields, protocol buffers use a pure binary representation (not
    // base64).
    bytes audio_content = 2;
  }
}

// A streaming speech translation result corresponding to a portion of the audio
// that is currently being processed.
message StreamingTranslateSpeechResult {
  // Text translation result.
  message TextTranslationResult {
    // Output only. The translated sentence.
    string translation = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

    // Output only. If `false`, this `StreamingTranslateSpeechResult` represents
    // an interim result that may change. If `true`, this is the final time the
    // translation service will return this particular
    // `StreamingTranslateSpeechResult`; the streaming translator will not
    // return any further hypotheses for this portion of the transcript and
    // corresponding audio.
    bool is_final = 2 [(google.api.field_behavior) = OUTPUT_ONLY];
  }

  // Translation result. Currently the only member of this oneof is
  // `text_translation_result`.
  oneof result {
    // Text translation result.
    TextTranslationResult text_translation_result = 1;
  }
}

// A streaming speech translation response corresponding to a portion of
// the audio currently processed.
message StreamingTranslateSpeechResponse {
  // Indicates the type of speech event.
  enum SpeechEventType {
    // No speech event specified.
    SPEECH_EVENT_TYPE_UNSPECIFIED = 0;

    // This event indicates that the server has detected the end of the user's
    // speech utterance and expects no additional speech. Therefore, the server
    // will not process additional audio (although it may subsequently return
    // additional results).
    //
    // When the client receives the `END_OF_SINGLE_UTTERANCE` event, the client
    // should stop sending requests. However, clients should keep receiving
    // the remaining responses until the stream is terminated. To construct
    // the complete sentence in a streaming way, one should override (if
    // `is_final` of the previous response is false) or append (if `is_final`
    // of the previous response is true).
    //
    // This event is only sent if `single_utterance` was set to `true`, and is
    // not used otherwise.
    END_OF_SINGLE_UTTERANCE = 1;
  }

  // Output only. If set, returns a [google.rpc.Status][google.rpc.Status]
  // message that specifies the error for the operation.
  google.rpc.Status error = 1
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The translation result that is currently being processed
  // (`is_final` could be true or false).
  StreamingTranslateSpeechResult result = 2
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Indicates the type of speech event.
  SpeechEventType speech_event_type = 3
      [(google.api.field_behavior) = OUTPUT_ONLY];
}