// DO NOT CHANGE THIS FILE!
// This proto is copied from
// http://google3/speech/soda/chrome/extended_soda_api.proto That is the source
// of truth, and any changes should be submitted and approved there before being
// copied into here.

syntax = "proto2";

package speech.soda.api;

// Optimize generated output for Lite, since it's going to be running on
// end-user devices.
option optimize_for = LITE_RUNTIME;
option java_multiple_files = true;

// Configuration message for SODA: audio format, language pack location,
// recognition mode and event options.
//
// Next ID to use: 12
message SerializedSodaConfigMsg {
  // Number of channels in RAW audio that will be provided to SODA.
  optional int32 channel_count = 1;

  // Sample rate, in Hz.
  optional int32 sample_rate = 2;

  // Maximum size of buffer to use in PipeStream. By default, is 0, which means
  // unlimited.
  optional int32 max_buffer_bytes = 4 [default = 0];

  // If set to true, forces the audio provider to simulate realtime audio
  // provision. This only makes sense during testing, to simulate realtime audio
  // providing from a big chunk of audio.
  // This slows down audio provided to SODA to a maximum of real-time, which
  // means more accurate endpointer behavior, but is unsuitable for execution in
  // real production environments. Set with caution!
  optional bool simulate_realtime_testonly = 5 [default = false];

  // Config file location for the language pack. Deprecated.
  optional string config_file_location = 3 [deprecated = true];

  // API key used for call verification.
  optional string api_key = 6;

  // Directory of the language pack to use.
  optional string language_pack_directory = 7;

  enum RecognitionMode {
    UNKNOWN = 0;
    // Intended for voice input for keyboard usage.
    IME = 1;
    // Intended to caption a stream of audio.
    CAPTION = 2;
  }

  // What kind of recognition to execute here. Impacts model usage.
  optional RecognitionMode recognition_mode = 8 [default = IME];

  // Whether terse_processor should force a new session after every final
  // recognition result.
  // This will cause the terse processor to stop processing new audio once an
  // endpoint event is detected and wait for it to generate a final event using
  // audio up to the endpoint. This will cause processing bursts when a new
  // session starts.
  optional bool reset_on_final_result = 9 [default = true];

  // Whether to populate the timing_metrics field on Recognition and Endpoint
  // events.
  optional bool include_timing_metrics = 10 [default = true];

  // Whether or not to request lang id events.
  optional bool enable_lang_id = 11 [default = false];
}

// Timing information attached to recognition and endpoint events. All values
// are in microseconds.
//
// Next id: 5
message TimingMetrics {
  // Epoch time of first audio buffer of main query that is fed into ASR.
  // This is the wall time read from the system clock when the first audio
  // buffer is received by the terse processor.
  optional int64 audio_start_epoch_usec = 1;

  // Start time in audio time from start of SODA session.
  // This time measures the amount of audio input into SODA.
  optional int64 audio_start_time_usec = 2;

  // Elapsed wall time usec since first frame.
  optional int64 elapsed_wall_time_usec = 3;

  // Elapsed processed audio usec from first frame after preamble.
  optional int64 event_end_time_usec = 4;
}

// A set of recognition hypotheses, with its result type and timing.
//
// Next id: 5
message SodaRecognitionResult {
  // Hypothesis from recognition, in order of probability. We don't get the
  // probability from SODA, so the only given is that the first is the "best".
  repeated string hypothesis = 1;

  enum ResultType {
    UNKNOWN = 0;
    // Partial result of a speech segment so far.
    PARTIAL = 1;
    // Final result for this segment.
    FINAL = 2;
    // Prefetch is only sent for likely query strings. This won't happen for
    // non-query mode SODA, but we add here for completeness.
    PREFETCH = 3;
  }

  // What kind of result set this is.
  optional ResultType result_type = 2;

  enum FinalResultEndpointReason {
    ENDPOINT_UNKNOWN = 0;
    // End of speech from endpointer.
    ENDPOINT_END_OF_SPEECH = 1;
    // End of utterance from endpointer.
    ENDPOINT_END_OF_UTTERANCE = 2;
    // No more audio.
    ENDPOINT_END_OF_AUDIO = 3;
    // Final was generated because a hotword was detected.
    ENDPOINT_ASR_RESET_BY_HOTWORD = 4;
    // ASR was reset via the external API.
    ENDPOINT_ASR_RESET_EXTERNAL = 5;
    // Final recognition result was generated due to an error in ASR.
    ENDPOINT_ASR_ERROR = 6;
  }

  // If this is a final result, why was the recognition marked final.
  optional FinalResultEndpointReason endpoint_reason = 3;

  // Timing information for the event.
  optional TimingMetrics timing_metrics = 4;
}

// An endpointer event (start/end of speech, end of audio, end of utterance).
//
// Next id: 3
message SodaEndpointEvent {
  // What endpoint type we're referring to here.
  // Note that UNKNOWN is value 4 rather than 0, so endpoint_type carries an
  // explicit default below.
  enum EndpointType {
    // A start-of-speech moment has been detected at this time. Audio currently
    // contains speech.
    START_OF_SPEECH = 0;
    // End of speech has been detected by the endpointer, audio does not contain
    // speech right now.
    END_OF_SPEECH = 1;
    // End of Audio due to an end-of-mic data event.
    END_OF_AUDIO = 2;
    // End of Utterance detected from the endpointer. Not used in
    // Caption/Transcription.
    END_OF_UTTERANCE = 3;
    UNKNOWN = 4;
  }

  // The endpoint type of this event. Defaults to UNKNOWN when unset.
  optional EndpointType endpoint_type = 1 [default = UNKNOWN];

  // Timing information for the event.
  optional TimingMetrics timing_metrics = 2;
}

// Audio level information reported by SODA.
message SodaAudioLevelInfo {
  // Low-pass filtered RMS in range 0..1.
  optional float rms = 1;

  // Speech likelihood score in range 0..1.
  optional float audio_level = 2;

  // Amount of audio seen from start of SODA session until an audio level event.
  // This value is only set when audio_level is set.
  optional int64 audio_time_usec = 3;
}

// A language identification (lang id) event.
message SodaLangIdEvent {
  // Locale, e.g. "en-us" or "af-za"
  optional string language = 1;

  // Equal to the internal enum from langid confidence.
  optional int32 confidence_level = 2;
}

// A response message from SODA. soda_type identifies the kind of message; for
// RECOGNITION, ENDPOINT, AUDIO_LEVEL and LANGID the matching payload field
// below is set.
message SodaResponse {
  enum SodaMessageType {
    UNKNOWN = 0;
    RECOGNITION = 1;
    STOP = 2;
    SHUTDOWN = 3;
    START = 4;
    ENDPOINT = 5;
    AUDIO_LEVEL = 6;
    LANGID = 7;
  }

  // The kind of message this response carries.
  optional SodaMessageType soda_type = 1 [default = UNKNOWN];

  // Set when type is RECOGNITION
  optional SodaRecognitionResult recognition_result = 2;

  // Set when type is ENDPOINT
  optional SodaEndpointEvent endpoint_event = 3;

  // Set when type is AUDIO_LEVEL
  optional SodaAudioLevelInfo audio_level_info = 4;

  // Set when type is LANGID
  optional SodaLangIdEvent langid_event = 5;
}