// DO NOT CHANGE THIS FILE!
// This proto is copied from
// http://google3/speech/soda/chrome/extended_soda_api.proto, which is the
// source of truth. Any changes should be submitted and approved there before
// being copied into here.

// proto2 syntax: the messages below rely on explicit `optional` labels and
// per-field `default` options.
syntax = "proto2";

package speech.soda.api;

// Optimize generated output for Lite, since it's going to be running on
// end-user devices.
option optimize_for = LITE_RUNTIME;
// Generate a separate Java file for each top-level message instead of nesting
// them all inside a single outer class.
option java_multiple_files = true;

// Configuration for SODA: input audio format, buffering, language pack
// location, recognition mode, and optional event/metric output.
//
// Field numbers do not follow declaration order (field 3 is declared after
// field 5), so rely on the note below rather than the last declared field
// when adding a new one.
// Next ID to use: 12
message SerializedSodaConfigMsg {
  // Number of channels in RAW audio that will be provided to SODA.
  optional int32 channel_count = 1;
  // Sample rate, in Hz.
  optional int32 sample_rate = 2;

  // Maximum size, in bytes, of the buffer to use in PipeStream. Defaults to 0,
  // which means unlimited.
  optional int32 max_buffer_bytes = 4 [default = 0];

  // If set to true, forces the audio provider to simulate realtime audio
  // provision. This only makes sense during testing, to simulate realtime audio
  // providing from a big chunk of audio.
  // This slows down audio provided to SODA to a maximum of real-time, which
  // means more accurate endpointer behavior, but is unsuitable for execution in
  // real production environments. Set with caution!
  // Defaults to false.
  optional bool simulate_realtime_testonly = 5 [default = false];

  // Config file location for the language pack. Deprecated.
  // NOTE(review): language_pack_directory (field 7) looks like the
  // replacement -- confirm against the upstream proto.
  optional string config_file_location = 3 [deprecated = true];

  // API key used for call verification.
  optional string api_key = 6;

  // Directory of the language pack to use.
  optional string language_pack_directory = 7;

  // The kind of recognition to run; selected through recognition_mode.
  enum RecognitionMode {
    // Unknown mode. This is the enum's zero value, but it is NOT the default
    // of the recognition_mode field (that default is IME).
    UNKNOWN = 0;

    // Intended for voice input for keyboard usage.
    IME = 1;

    // Intended to caption a stream of audio.
    CAPTION = 2;
  }
  // What kind of recognition to execute here. Impacts model usage.
  // Defaults to IME when unset, not to the zero value UNKNOWN.
  optional RecognitionMode recognition_mode = 8 [default = IME];

  // Whether terse_processor should force a new session after every final
  // recognition result.
  // This will cause the terse processor to stop processing new audio once an
  // endpoint event is detected and wait for it to generate a final event using
  // audio up to the endpoint. This will cause processing bursts when a new
  // session starts.
  // Defaults to true.
  optional bool reset_on_final_result = 9 [default = true];

  // Whether to populate the timing_metrics field on Recognition and Endpoint
  // events. Defaults to true.
  optional bool include_timing_metrics = 10 [default = true];

  // Whether or not to request lang id events. Defaults to false.
  optional bool enable_lang_id = 11 [default = false];
}

// Timing information attached to events; used as the timing_metrics field of
// SodaRecognitionResult and SodaEndpointEvent. Every field carries a _usec
// suffix (microseconds).
// Next id: 5
message TimingMetrics {
  // Epoch time of first audio buffer of main query that is fed into ASR.
  // This is the wall time read from the system clock when the first audio
  // buffer is received by the terse processor.
  optional int64 audio_start_epoch_usec = 1;

  // Start time in audio time from start of SODA session.
  // This time measures the amount of audio input into SODA.
  optional int64 audio_start_time_usec = 2;

  // Elapsed wall time usec since first frame.
  optional int64 elapsed_wall_time_usec = 3;

  // Elapsed processed audio usec from first frame after preamble.
  optional int64 event_end_time_usec = 4;
}

// A recognition result: the ordered hypotheses together with the result type,
// the reason a final result was produced, and optional timing information.
// Next id: 5
message SodaRecognitionResult {
  // Hypothesis from recognition, in order of probability. We don't get the
  // probability from SODA, so the only given is that the first is the "best".
  repeated string hypothesis = 1;
  // Kind of result carried in this message; see result_type.
  enum ResultType {
    // Unknown result type. Zero value, and therefore the value read when
    // result_type is unset (no explicit default is declared).
    UNKNOWN = 0;
    // Partial result of a speech segment so far.
    PARTIAL = 1;
    // Final result for this segment.
    FINAL = 2;
    // Prefetch is only sent for likely query strings. This won't happen for
    // non-query mode SODA, but we add here for completeness.
    PREFETCH = 3;
  }

  // What kind of result set this is.
  optional ResultType result_type = 2;

  // Why a final result was produced; see endpoint_reason.
  enum FinalResultEndpointReason {
    // Unknown reason. Zero value, and therefore the value read when
    // endpoint_reason is unset.
    ENDPOINT_UNKNOWN = 0;
    // End of speech from endpointer.
    ENDPOINT_END_OF_SPEECH = 1;
    // End of utterance from endpointer.
    ENDPOINT_END_OF_UTTERANCE = 2;
    // No more audio.
    ENDPOINT_END_OF_AUDIO = 3;
    // Final was generated because a hotword was detected.
    ENDPOINT_ASR_RESET_BY_HOTWORD = 4;
    // ASR was reset via the external API.
    ENDPOINT_ASR_RESET_EXTERNAL = 5;
    // Final recognition result was generated due to an error in ASR.
    ENDPOINT_ASR_ERROR = 6;
  }
  // If this is a final result, why was the recognition marked final.
  optional FinalResultEndpointReason endpoint_reason = 3;

  // Timing information for the event.
  optional TimingMetrics timing_metrics = 4;
}

// An endpointer event: which kind of endpoint was detected, plus optional
// timing information.
// Next id: 3
message SodaEndpointEvent {
  // What endpoint type we're referring to here.
  //
  // Caution: the zero value of this enum is START_OF_SPEECH, not UNKNOWN.
  // UNKNOWN is numbered 4 and is made the default of endpoint_type through an
  // explicit [default = UNKNOWN] on that field.
  enum EndpointType {
    // A start-of-speech moment has been detected at this time. Audio currently
    // contains speech.
    START_OF_SPEECH = 0;

    // End of speech has been detected by the endpointer, audio does not contain
    // speech right now.
    END_OF_SPEECH = 1;

    // End of Audio due to an end-of-mic data event.
    END_OF_AUDIO = 2;

    // End of Utterance detected from the endpointer. Not used in
    // Caption/Transcription.
    END_OF_UTTERANCE = 3;

    // Unknown endpoint type; the declared default of endpoint_type.
    UNKNOWN = 4;
  }

  // The endpoint type of this event. Defaults to UNKNOWN when unset.
  optional EndpointType endpoint_type = 1 [default = UNKNOWN];

  // Timing information for the event.
  optional TimingMetrics timing_metrics = 2;
}

// Audio level information; carried in SodaResponse.audio_level_info.
// Next id: 4
message SodaAudioLevelInfo {
  // Low-pass filtered RMS in range 0..1.
  optional float rms = 1;

  // Speech likelihood score in range 0..1.
  optional float audio_level = 2;

  // Amount of audio seen from start of SODA session until an audio level event.
  // This value is only set when audio_level is set.
  optional int64 audio_time_usec = 3;
}

// Language identification event; carried in SodaResponse.langid_event.
// Next id: 3
message SodaLangIdEvent {
  // Locale, e.g. "en-us" or "af-za"
  optional string language = 1;
  // Equal to the internal enum from langid confidence.
  // The enum is not defined in this file, so the value is an opaque integer
  // here.
  optional int32 confidence_level = 2;
}

// Top-level response envelope. soda_type identifies the kind of message, and
// for RECOGNITION, ENDPOINT, AUDIO_LEVEL and LANGID the matching payload field
// below is set. No payload field is documented for the other types.
// Next id: 6
message SodaResponse {
  // Discriminator for the kind of message this response carries.
  enum SodaMessageType {
    // Unknown message type; zero value and the declared default of soda_type.
    UNKNOWN = 0;
    // recognition_result is set.
    RECOGNITION = 1;
    STOP = 2;
    SHUTDOWN = 3;
    START = 4;
    // endpoint_event is set.
    ENDPOINT = 5;
    // audio_level_info is set.
    AUDIO_LEVEL = 6;
    // langid_event is set.
    LANGID = 7;
  }

  // The kind of message. Defaults to UNKNOWN when unset.
  optional SodaMessageType soda_type = 1 [default = UNKNOWN];

  // Set when type is RECOGNITION
  optional SodaRecognitionResult recognition_result = 2;

  // Set when type is ENDPOINT
  optional SodaEndpointEvent endpoint_event = 3;

  // Set when type is AUDIO_LEVEL
  optional SodaAudioLevelInfo audio_level_info = 4;

  // Set when type is LANGID
  optional SodaLangIdEvent langid_event = 5;
}