/// The top-level message sent by the client for the `Recognize` method. #[derive(Clone, PartialEq, ::prost::Message)] pub struct RecognizeRequest { /// Required. Provides information to the recognizer that specifies how to /// process the request. #[prost(message, optional, tag = "1")] pub config: ::core::option::Option, /// Required. The audio data to be recognized. #[prost(message, optional, tag = "2")] pub audio: ::core::option::Option, } /// The top-level message sent by the client for the `LongRunningRecognize` /// method. #[derive(Clone, PartialEq, ::prost::Message)] pub struct LongRunningRecognizeRequest { /// Required. Provides information to the recognizer that specifies how to /// process the request. #[prost(message, optional, tag = "1")] pub config: ::core::option::Option, /// Required. The audio data to be recognized. #[prost(message, optional, tag = "2")] pub audio: ::core::option::Option, /// Optional. Specifies an optional destination for the recognition results. #[prost(message, optional, tag = "4")] pub output_config: ::core::option::Option, } /// Specifies an optional destination for the recognition results. #[derive(Clone, PartialEq, ::prost::Message)] pub struct TranscriptOutputConfig { #[prost(oneof = "transcript_output_config::OutputType", tags = "1")] pub output_type: ::core::option::Option, } /// Nested message and enum types in `TranscriptOutputConfig`. pub mod transcript_output_config { #[derive(Clone, PartialEq, ::prost::Oneof)] pub enum OutputType { /// Specifies a Cloud Storage URI for the recognition results. Must be /// specified in the format: `gs://bucket_name/object_name`, and the bucket /// must already exist. #[prost(string, tag = "1")] GcsUri(::prost::alloc::string::String), } } /// The top-level message sent by the client for the `StreamingRecognize` method. /// Multiple `StreamingRecognizeRequest` messages are sent. The first message /// must contain a `streaming_config` message and must not contain /// `audio_content`. All subsequent messages must contain `audio_content` and /// must not contain a `streaming_config` message. #[derive(Clone, PartialEq, ::prost::Message)] pub struct StreamingRecognizeRequest { /// The streaming request, which is either a streaming config or audio content. #[prost(oneof = "streaming_recognize_request::StreamingRequest", tags = "1, 2")] pub streaming_request: ::core::option::Option, } /// Nested message and enum types in `StreamingRecognizeRequest`. pub mod streaming_recognize_request { /// The streaming request, which is either a streaming config or audio content. #[derive(Clone, PartialEq, ::prost::Oneof)] pub enum StreamingRequest { /// Provides information to the recognizer that specifies how to process the /// request. The first `StreamingRecognizeRequest` message must contain a /// `streaming_config` message. #[prost(message, tag = "1")] StreamingConfig(super::StreamingRecognitionConfig), /// The audio data to be recognized. Sequential chunks of audio data are sent /// in sequential `StreamingRecognizeRequest` messages. The first /// `StreamingRecognizeRequest` message must not contain `audio_content` data /// and all subsequent `StreamingRecognizeRequest` messages must contain /// `audio_content` data. The audio bytes must be encoded as specified in /// `RecognitionConfig`. Note: as with all bytes fields, proto buffers use a /// pure binary representation (not base64). See /// [content limits](). #[prost(bytes, tag = "2")] AudioContent(::prost::alloc::vec::Vec), } } /// Provides information to the recognizer that specifies how to process the /// request. #[derive(Clone, PartialEq, ::prost::Message)] pub struct StreamingRecognitionConfig { /// Required. Provides information to the recognizer that specifies how to /// process the request. #[prost(message, optional, tag = "1")] pub config: ::core::option::Option, /// If `false` or omitted, the recognizer will perform continuous /// recognition (continuing to wait for and process audio even if the user /// pauses speaking) until the client closes the input stream (gRPC API) or /// until the maximum time limit has been reached. May return multiple /// `StreamingRecognitionResult`s with the `is_final` flag set to `true`. /// /// If `true`, the recognizer will detect a single spoken utterance. When it /// detects that the user has paused or stopped speaking, it will return an /// `END_OF_SINGLE_UTTERANCE` event and cease recognition. It will return no /// more than one `StreamingRecognitionResult` with the `is_final` flag set to /// `true`. /// /// The `single_utterance` field can only be used with specified models, /// otherwise an error is thrown. The `model` field in \[`RecognitionConfig`][\] /// must be set to: /// /// * `command_and_search` /// * `phone_call` AND additional field `useEnhanced`=`true` /// * The `model` field is left undefined. In this case the API auto-selects /// a model based on any other parameters that you set in /// `RecognitionConfig`. #[prost(bool, tag = "2")] pub single_utterance: bool, /// If `true`, interim results (tentative hypotheses) may be /// returned as they become available (these interim results are indicated with /// the `is_final=false` flag). /// If `false` or omitted, only `is_final=true` result(s) are returned. #[prost(bool, tag = "3")] pub interim_results: bool, } /// Provides information to the recognizer that specifies how to process the /// request. #[derive(Clone, PartialEq, ::prost::Message)] pub struct RecognitionConfig { /// Encoding of audio data sent in all `RecognitionAudio` messages. /// This field is optional for `FLAC` and `WAV` audio files and required /// for all other audio formats. For details, see \[AudioEncoding][google.cloud.speech.v1.RecognitionConfig.AudioEncoding\]. #[prost(enumeration = "recognition_config::AudioEncoding", tag = "1")] pub encoding: i32, /// Sample rate in Hertz of the audio data sent in all /// `RecognitionAudio` messages. Valid values are: 8000-48000. /// 16000 is optimal. For best results, set the sampling rate of the audio /// source to 16000 Hz. If that's not possible, use the native sample rate of /// the audio source (instead of re-sampling). /// This field is optional for FLAC and WAV audio files, but is /// required for all other audio formats. For details, see \[AudioEncoding][google.cloud.speech.v1.RecognitionConfig.AudioEncoding\]. #[prost(int32, tag = "2")] pub sample_rate_hertz: i32, /// The number of channels in the input audio data. /// ONLY set this for MULTI-CHANNEL recognition. /// Valid values for LINEAR16 and FLAC are `1`-`8`. /// Valid values for OGG_OPUS are '1'-'254'. /// Valid value for MULAW, AMR, AMR_WB and SPEEX_WITH_HEADER_BYTE is only `1`. /// If `0` or omitted, defaults to one channel (mono). /// Note: We only recognize the first channel by default. /// To perform independent recognition on each channel set /// `enable_separate_recognition_per_channel` to 'true'. #[prost(int32, tag = "7")] pub audio_channel_count: i32, /// This needs to be set to `true` explicitly and `audio_channel_count` > 1 /// to get each channel recognized separately. The recognition result will /// contain a `channel_tag` field to state which channel that result belongs /// to. If this is not true, we will only recognize the first channel. The /// request is billed cumulatively for all channels recognized: /// `audio_channel_count` multiplied by the length of the audio. #[prost(bool, tag = "12")] pub enable_separate_recognition_per_channel: bool, /// Required. The language of the supplied audio as a /// \[BCP-47\]() language tag. /// Example: "en-US". /// See [Language /// Support]() for a list /// of the currently supported language codes. #[prost(string, tag = "3")] pub language_code: ::prost::alloc::string::String, /// Maximum number of recognition hypotheses to be returned. /// Specifically, the maximum number of `SpeechRecognitionAlternative` messages /// within each `SpeechRecognitionResult`. /// The server may return fewer than `max_alternatives`. /// Valid values are `0`-`30`. A value of `0` or `1` will return a maximum of /// one. If omitted, will return a maximum of one. #[prost(int32, tag = "4")] pub max_alternatives: i32, /// If set to `true`, the server will attempt to filter out /// profanities, replacing all but the initial character in each filtered word /// with asterisks, e.g. "f***". If set to `false` or omitted, profanities /// won't be filtered out. #[prost(bool, tag = "5")] pub profanity_filter: bool, /// Array of \[SpeechContext][google.cloud.speech.v1.SpeechContext\]. /// A means to provide context to assist the speech recognition. For more /// information, see /// [speech /// adaptation](). #[prost(message, repeated, tag = "6")] pub speech_contexts: ::prost::alloc::vec::Vec, /// If `true`, the top result includes a list of words and /// the start and end time offsets (timestamps) for those words. If /// `false`, no word-level time offset information is returned. The default is /// `false`. #[prost(bool, tag = "8")] pub enable_word_time_offsets: bool, /// If 'true', adds punctuation to recognition result hypotheses. /// This feature is only available in select languages. Setting this for /// requests in other languages has no effect at all. /// The default 'false' value does not add punctuation to result hypotheses. #[prost(bool, tag = "11")] pub enable_automatic_punctuation: bool, /// Config to enable speaker diarization and set additional /// parameters to make diarization better suited for your application. /// Note: When this is enabled, we send all the words from the beginning of the /// audio for the top alternative in every consecutive STREAMING responses. /// This is done in order to improve our speaker tags as our models learn to /// identify the speakers in the conversation over time. /// For non-streaming requests, the diarization results will be provided only /// in the top alternative of the FINAL SpeechRecognitionResult. #[prost(message, optional, tag = "19")] pub diarization_config: ::core::option::Option, /// Metadata regarding this request. #[prost(message, optional, tag = "9")] pub metadata: ::core::option::Option, /// Which model to select for the given request. Select the model /// best suited to your domain to get best results. If a model is not /// explicitly specified, then we auto-select a model based on the parameters /// in the RecognitionConfig. /// /// /// /// /// /// /// /// /// /// /// /// /// /// /// /// /// /// /// /// /// ///
ModelDescription
command_and_searchBest for short queries such as voice commands or voice search.
phone_callBest for audio that originated from a phone call (typically /// recorded at an 8khz sampling rate).
videoBest for audio that originated from video or includes multiple /// speakers. Ideally the audio is recorded at a 16khz or greater /// sampling rate. This is a premium model that costs more than the /// standard rate.
defaultBest for audio that is not one of the specific audio models. /// For example, long-form audio. Ideally the audio is high-fidelity, /// recorded at a 16khz or greater sampling rate.
#[prost(string, tag = "13")] pub model: ::prost::alloc::string::String, /// Set to true to use an enhanced model for speech recognition. /// If `use_enhanced` is set to true and the `model` field is not set, then /// an appropriate enhanced model is chosen if an enhanced model exists for /// the audio. /// /// If `use_enhanced` is true and an enhanced version of the specified model /// does not exist, then the speech is recognized using the standard version /// of the specified model. #[prost(bool, tag = "14")] pub use_enhanced: bool, } /// Nested message and enum types in `RecognitionConfig`. pub mod recognition_config { /// The encoding of the audio data sent in the request. /// /// All encodings support only 1 channel (mono) audio, unless the /// `audio_channel_count` and `enable_separate_recognition_per_channel` fields /// are set. /// /// For best results, the audio source should be captured and transmitted using /// a lossless encoding (`FLAC` or `LINEAR16`). The accuracy of the speech /// recognition can be reduced if lossy codecs are used to capture or transmit /// audio, particularly if background noise is present. Lossy codecs include /// `MULAW`, `AMR`, `AMR_WB`, `OGG_OPUS`, `SPEEX_WITH_HEADER_BYTE`, `MP3`. /// /// The `FLAC` and `WAV` audio file formats include a header that describes the /// included audio content. You can request recognition for `WAV` files that /// contain either `LINEAR16` or `MULAW` encoded audio. /// If you send `FLAC` or `WAV` audio file format in /// your request, you do not need to specify an `AudioEncoding`; the audio /// encoding format is determined from the file header. If you specify /// an `AudioEncoding` when you send send `FLAC` or `WAV` audio, the /// encoding configuration must match the encoding described in the audio /// header; otherwise the request returns an /// \[google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT\] error code. #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum AudioEncoding { /// Not specified. EncodingUnspecified = 0, /// Uncompressed 16-bit signed little-endian samples (Linear PCM). Linear16 = 1, /// `FLAC` (Free Lossless Audio /// Codec) is the recommended encoding because it is /// lossless--therefore recognition is not compromised--and /// requires only about half the bandwidth of `LINEAR16`. `FLAC` stream /// encoding supports 16-bit and 24-bit samples, however, not all fields in /// `STREAMINFO` are supported. Flac = 2, /// 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law. Mulaw = 3, /// Adaptive Multi-Rate Narrowband codec. `sample_rate_hertz` must be 8000. Amr = 4, /// Adaptive Multi-Rate Wideband codec. `sample_rate_hertz` must be 16000. AmrWb = 5, /// Opus encoded audio frames in Ogg container /// (\[OggOpus\]()). /// `sample_rate_hertz` must be one of 8000, 12000, 16000, 24000, or 48000. OggOpus = 6, /// Although the use of lossy encodings is not recommended, if a very low /// bitrate encoding is required, `OGG_OPUS` is highly preferred over /// Speex encoding. The \[Speex\]() encoding supported by /// Cloud Speech API has a header byte in each block, as in MIME type /// `audio/x-speex-with-header-byte`. /// It is a variant of the RTP Speex encoding defined in /// [RFC 5574](). /// The stream is a sequence of blocks, one block per RTP packet. Each block /// starts with a byte containing the length of the block, in bytes, followed /// by one or more frames of Speex data, padded to an integral number of /// bytes (octets) as specified in RFC 5574. In other words, each RTP header /// is replaced with a single byte containing the block length. Only Speex /// wideband is supported. `sample_rate_hertz` must be 16000. SpeexWithHeaderByte = 7, } } /// Config to enable speaker diarization. #[derive(Clone, PartialEq, ::prost::Message)] pub struct SpeakerDiarizationConfig { /// If 'true', enables speaker detection for each recognized word in /// the top alternative of the recognition result using a speaker_tag provided /// in the WordInfo. #[prost(bool, tag = "1")] pub enable_speaker_diarization: bool, /// Minimum number of speakers in the conversation. This range gives you more /// flexibility by allowing the system to automatically determine the correct /// number of speakers. If not set, the default value is 2. #[prost(int32, tag = "2")] pub min_speaker_count: i32, /// Maximum number of speakers in the conversation. This range gives you more /// flexibility by allowing the system to automatically determine the correct /// number of speakers. If not set, the default value is 6. #[prost(int32, tag = "3")] pub max_speaker_count: i32, /// Output only. Unused. #[deprecated] #[prost(int32, tag = "5")] pub speaker_tag: i32, } /// Description of audio data to be recognized. #[derive(Clone, PartialEq, ::prost::Message)] pub struct RecognitionMetadata { /// The use case most closely describing the audio content to be recognized. #[prost(enumeration = "recognition_metadata::InteractionType", tag = "1")] pub interaction_type: i32, /// The industry vertical to which this speech recognition request most /// closely applies. This is most indicative of the topics contained /// in the audio. Use the 6-digit NAICS code to identify the industry /// vertical - see #[prost(uint32, tag = "3")] pub industry_naics_code_of_audio: u32, /// The audio type that most closely describes the audio being recognized. #[prost(enumeration = "recognition_metadata::MicrophoneDistance", tag = "4")] pub microphone_distance: i32, /// The original media the speech was recorded on. #[prost(enumeration = "recognition_metadata::OriginalMediaType", tag = "5")] pub original_media_type: i32, /// The type of device the speech was recorded with. #[prost(enumeration = "recognition_metadata::RecordingDeviceType", tag = "6")] pub recording_device_type: i32, /// The device used to make the recording. Examples 'Nexus 5X' or /// 'Polycom SoundStation IP 6000' or 'POTS' or 'VoIP' or /// 'Cardioid Microphone'. #[prost(string, tag = "7")] pub recording_device_name: ::prost::alloc::string::String, /// Mime type of the original audio file. For example `audio/m4a`, /// `audio/x-alaw-basic`, `audio/mp3`, `audio/3gpp`. /// A list of possible audio mime types is maintained at /// #[prost(string, tag = "8")] pub original_mime_type: ::prost::alloc::string::String, /// Description of the content. Eg. "Recordings of federal supreme court /// hearings from 2012". #[prost(string, tag = "10")] pub audio_topic: ::prost::alloc::string::String, } /// Nested message and enum types in `RecognitionMetadata`. pub mod recognition_metadata { /// Use case categories that the audio recognition request can be described /// by. #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum InteractionType { /// Use case is either unknown or is something other than one of the other /// values below. Unspecified = 0, /// Multiple people in a conversation or discussion. For example in a /// meeting with two or more people actively participating. Typically /// all the primary people speaking would be in the same room (if not, /// see PHONE_CALL) Discussion = 1, /// One or more persons lecturing or presenting to others, mostly /// uninterrupted. Presentation = 2, /// A phone-call or video-conference in which two or more people, who are /// not in the same room, are actively participating. PhoneCall = 3, /// A recorded message intended for another person to listen to. Voicemail = 4, /// Professionally produced audio (eg. TV Show, Podcast). ProfessionallyProduced = 5, /// Transcribe spoken questions and queries into text. VoiceSearch = 6, /// Transcribe voice commands, such as for controlling a device. VoiceCommand = 7, /// Transcribe speech to text to create a written document, such as a /// text-message, email or report. Dictation = 8, } /// Enumerates the types of capture settings describing an audio file. #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum MicrophoneDistance { /// Audio type is not known. Unspecified = 0, /// The audio was captured from a closely placed microphone. Eg. phone, /// dictaphone, or handheld microphone. Generally if there speaker is within /// 1 meter of the microphone. Nearfield = 1, /// The speaker if within 3 meters of the microphone. Midfield = 2, /// The speaker is more than 3 meters away from the microphone. Farfield = 3, } /// The original media the speech was recorded on. #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum OriginalMediaType { /// Unknown original media type. Unspecified = 0, /// The speech data is an audio recording. Audio = 1, /// The speech data originally recorded on a video. Video = 2, } /// The type of device the speech was recorded with. #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum RecordingDeviceType { /// The recording device is unknown. Unspecified = 0, /// Speech was recorded on a smartphone. Smartphone = 1, /// Speech was recorded using a personal computer or tablet. Pc = 2, /// Speech was recorded over a phone line. PhoneLine = 3, /// Speech was recorded in a vehicle. Vehicle = 4, /// Speech was recorded outdoors. OtherOutdoorDevice = 5, /// Speech was recorded indoors. OtherIndoorDevice = 6, } } /// Provides "hints" to the speech recognizer to favor specific words and phrases /// in the results. #[derive(Clone, PartialEq, ::prost::Message)] pub struct SpeechContext { /// A list of strings containing words and phrases "hints" so that /// the speech recognition is more likely to recognize them. This can be used /// to improve the accuracy for specific words and phrases, for example, if /// specific commands are typically spoken by the user. This can also be used /// to add additional words to the vocabulary of the recognizer. See /// [usage limits](). /// /// List items can also be set to classes for groups of words that represent /// common concepts that occur in natural language. For example, rather than /// providing phrase hints for every month of the year, using the $MONTH class /// improves the likelihood of correctly transcribing audio that includes /// months. #[prost(string, repeated, tag = "1")] pub phrases: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, } /// Contains audio data in the encoding specified in the `RecognitionConfig`. /// Either `content` or `uri` must be supplied. Supplying both or neither /// returns \[google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT\]. See /// [content limits](). #[derive(Clone, PartialEq, ::prost::Message)] pub struct RecognitionAudio { /// The audio source, which is either inline content or a Google Cloud /// Storage uri. #[prost(oneof = "recognition_audio::AudioSource", tags = "1, 2")] pub audio_source: ::core::option::Option, } /// Nested message and enum types in `RecognitionAudio`. pub mod recognition_audio { /// The audio source, which is either inline content or a Google Cloud /// Storage uri. #[derive(Clone, PartialEq, ::prost::Oneof)] pub enum AudioSource { /// The audio data bytes encoded as specified in /// `RecognitionConfig`. Note: as with all bytes fields, proto buffers use a /// pure binary representation, whereas JSON representations use base64. #[prost(bytes, tag = "1")] Content(::prost::alloc::vec::Vec), /// URI that points to a file that contains audio data bytes as specified in /// `RecognitionConfig`. The file must not be compressed (for example, gzip). /// Currently, only Google Cloud Storage URIs are /// supported, which must be specified in the following format: /// `gs://bucket_name/object_name` (other URI formats return /// \[google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT\]). For more information, see /// [Request URIs](). #[prost(string, tag = "2")] Uri(::prost::alloc::string::String), } } /// The only message returned to the client by the `Recognize` method. It /// contains the result as zero or more sequential `SpeechRecognitionResult` /// messages. #[derive(Clone, PartialEq, ::prost::Message)] pub struct RecognizeResponse { /// Sequential list of transcription results corresponding to /// sequential portions of audio. #[prost(message, repeated, tag = "2")] pub results: ::prost::alloc::vec::Vec, /// When available, billed audio seconds for the corresponding request. #[prost(message, optional, tag = "3")] pub total_billed_time: ::core::option::Option<::prost_types::Duration>, } /// The only message returned to the client by the `LongRunningRecognize` method. /// It contains the result as zero or more sequential `SpeechRecognitionResult` /// messages. It is included in the `result.response` field of the `Operation` /// returned by the `GetOperation` call of the `google::longrunning::Operations` /// service. #[derive(Clone, PartialEq, ::prost::Message)] pub struct LongRunningRecognizeResponse { /// Sequential list of transcription results corresponding to /// sequential portions of audio. #[prost(message, repeated, tag = "2")] pub results: ::prost::alloc::vec::Vec, /// When available, billed audio seconds for the corresponding request. #[prost(message, optional, tag = "3")] pub total_billed_time: ::core::option::Option<::prost_types::Duration>, } /// Describes the progress of a long-running `LongRunningRecognize` call. It is /// included in the `metadata` field of the `Operation` returned by the /// `GetOperation` call of the `google::longrunning::Operations` service. #[derive(Clone, PartialEq, ::prost::Message)] pub struct LongRunningRecognizeMetadata { /// Approximate percentage of audio processed thus far. Guaranteed to be 100 /// when the audio is fully processed and the results are available. #[prost(int32, tag = "1")] pub progress_percent: i32, /// Time when the request was received. #[prost(message, optional, tag = "2")] pub start_time: ::core::option::Option<::prost_types::Timestamp>, /// Time of the most recent processing update. #[prost(message, optional, tag = "3")] pub last_update_time: ::core::option::Option<::prost_types::Timestamp>, /// Output only. The URI of the audio file being transcribed. Empty if the audio was sent /// as byte content. #[prost(string, tag = "4")] pub uri: ::prost::alloc::string::String, } /// `StreamingRecognizeResponse` is the only message returned to the client by /// `StreamingRecognize`. A series of zero or more `StreamingRecognizeResponse` /// messages are streamed back to the client. If there is no recognizable /// audio, and `single_utterance` is set to false, then no messages are streamed /// back to the client. /// /// Here's an example of a series of `StreamingRecognizeResponse`s that might be /// returned while processing audio: /// /// 1. results { alternatives { transcript: "tube" } stability: 0.01 } /// /// 2. results { alternatives { transcript: "to be a" } stability: 0.01 } /// /// 3. results { alternatives { transcript: "to be" } stability: 0.9 } /// results { alternatives { transcript: " or not to be" } stability: 0.01 } /// /// 4. results { alternatives { transcript: "to be or not to be" /// confidence: 0.92 } /// alternatives { transcript: "to bee or not to bee" } /// is_final: true } /// /// 5. results { alternatives { transcript: " that's" } stability: 0.01 } /// /// 6. results { alternatives { transcript: " that is" } stability: 0.9 } /// results { alternatives { transcript: " the question" } stability: 0.01 } /// /// 7. results { alternatives { transcript: " that is the question" /// confidence: 0.98 } /// alternatives { transcript: " that was the question" } /// is_final: true } /// /// Notes: /// /// - Only two of the above responses #4 and #7 contain final results; they are /// indicated by `is_final: true`. Concatenating these together generates the /// full transcript: "to be or not to be that is the question". /// /// - The others contain interim `results`. #3 and #6 contain two interim /// `results`: the first portion has a high stability and is less likely to /// change; the second portion has a low stability and is very likely to /// change. A UI designer might choose to show only high stability `results`. /// /// - The specific `stability` and `confidence` values shown above are only for /// illustrative purposes. Actual values may vary. /// /// - In each response, only one of these fields will be set: /// `error`, /// `speech_event_type`, or /// one or more (repeated) `results`. #[derive(Clone, PartialEq, ::prost::Message)] pub struct StreamingRecognizeResponse { /// If set, returns a \[google.rpc.Status][google.rpc.Status\] message that /// specifies the error for the operation. #[prost(message, optional, tag = "1")] pub error: ::core::option::Option, /// This repeated list contains zero or more results that /// correspond to consecutive portions of the audio currently being processed. /// It contains zero or one `is_final=true` result (the newly settled portion), /// followed by zero or more `is_final=false` results (the interim results). #[prost(message, repeated, tag = "2")] pub results: ::prost::alloc::vec::Vec, /// Indicates the type of speech event. #[prost(enumeration = "streaming_recognize_response::SpeechEventType", tag = "4")] pub speech_event_type: i32, /// When available, billed audio seconds for the stream. /// Set only if this is the last response in the stream. #[prost(message, optional, tag = "5")] pub total_billed_time: ::core::option::Option<::prost_types::Duration>, } /// Nested message and enum types in `StreamingRecognizeResponse`. pub mod streaming_recognize_response { /// Indicates the type of speech event. #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum SpeechEventType { /// No speech event specified. SpeechEventUnspecified = 0, /// This event indicates that the server has detected the end of the user's /// speech utterance and expects no additional speech. Therefore, the server /// will not process additional audio (although it may subsequently return /// additional results). The client should stop sending additional audio /// data, half-close the gRPC connection, and wait for any additional results /// until the server closes the gRPC connection. This event is only sent if /// `single_utterance` was set to `true`, and is not used otherwise. EndOfSingleUtterance = 1, } } /// A streaming speech recognition result corresponding to a portion of the audio /// that is currently being processed. #[derive(Clone, PartialEq, ::prost::Message)] pub struct StreamingRecognitionResult { /// May contain one or more recognition hypotheses (up to the /// maximum specified in `max_alternatives`). /// These alternatives are ordered in terms of accuracy, with the top (first) /// alternative being the most probable, as ranked by the recognizer. #[prost(message, repeated, tag = "1")] pub alternatives: ::prost::alloc::vec::Vec, /// If `false`, this `StreamingRecognitionResult` represents an /// interim result that may change. If `true`, this is the final time the /// speech service will return this particular `StreamingRecognitionResult`, /// the recognizer will not return any further hypotheses for this portion of /// the transcript and corresponding audio. #[prost(bool, tag = "2")] pub is_final: bool, /// An estimate of the likelihood that the recognizer will not /// change its guess about this interim result. Values range from 0.0 /// (completely unstable) to 1.0 (completely stable). /// This field is only provided for interim results (`is_final=false`). /// The default of 0.0 is a sentinel value indicating `stability` was not set. #[prost(float, tag = "3")] pub stability: f32, /// Time offset of the end of this result relative to the /// beginning of the audio. #[prost(message, optional, tag = "4")] pub result_end_time: ::core::option::Option<::prost_types::Duration>, /// For multi-channel audio, this is the channel number corresponding to the /// recognized result for the audio from that channel. /// For audio_channel_count = N, its output values can range from '1' to 'N'. #[prost(int32, tag = "5")] pub channel_tag: i32, /// The \[BCP-47\]() language tag of /// the language in this result. This language code was detected to have the /// most likelihood of being spoken in the audio. #[prost(string, tag = "6")] pub language_code: ::prost::alloc::string::String, } /// A speech recognition result corresponding to a portion of the audio. #[derive(Clone, PartialEq, ::prost::Message)] pub struct SpeechRecognitionResult { /// May contain one or more recognition hypotheses (up to the /// maximum specified in `max_alternatives`). /// These alternatives are ordered in terms of accuracy, with the top (first) /// alternative being the most probable, as ranked by the recognizer. #[prost(message, repeated, tag = "1")] pub alternatives: ::prost::alloc::vec::Vec, /// For multi-channel audio, this is the channel number corresponding to the /// recognized result for the audio from that channel. /// For audio_channel_count = N, its output values can range from '1' to 'N'. #[prost(int32, tag = "2")] pub channel_tag: i32, } /// Alternative hypotheses (a.k.a. n-best list). #[derive(Clone, PartialEq, ::prost::Message)] pub struct SpeechRecognitionAlternative { /// Transcript text representing the words that the user spoke. #[prost(string, tag = "1")] pub transcript: ::prost::alloc::string::String, /// The confidence estimate between 0.0 and 1.0. A higher number /// indicates an estimated greater likelihood that the recognized words are /// correct. This field is set only for the top alternative of a non-streaming /// result or, of a streaming result where `is_final=true`. /// This field is not guaranteed to be accurate and users should not rely on it /// to be always provided. /// The default of 0.0 is a sentinel value indicating `confidence` was not set. #[prost(float, tag = "2")] pub confidence: f32, /// A list of word-specific information for each recognized word. /// Note: When `enable_speaker_diarization` is true, you will see all the words /// from the beginning of the audio. #[prost(message, repeated, tag = "3")] pub words: ::prost::alloc::vec::Vec, } /// Word-specific information for recognized words. #[derive(Clone, PartialEq, ::prost::Message)] pub struct WordInfo { /// Time offset relative to the beginning of the audio, /// and corresponding to the start of the spoken word. /// This field is only set if `enable_word_time_offsets=true` and only /// in the top hypothesis. /// This is an experimental feature and the accuracy of the time offset can /// vary. #[prost(message, optional, tag = "1")] pub start_time: ::core::option::Option<::prost_types::Duration>, /// Time offset relative to the beginning of the audio, /// and corresponding to the end of the spoken word. /// This field is only set if `enable_word_time_offsets=true` and only /// in the top hypothesis. /// This is an experimental feature and the accuracy of the time offset can /// vary. #[prost(message, optional, tag = "2")] pub end_time: ::core::option::Option<::prost_types::Duration>, /// The word corresponding to this set of information. #[prost(string, tag = "3")] pub word: ::prost::alloc::string::String, /// Output only. A distinct integer value is assigned for every speaker within /// the audio. This field specifies which one of those speakers was detected to /// have spoken this word. Value ranges from '1' to diarization_speaker_count. /// speaker_tag is set if enable_speaker_diarization = 'true' and only in the /// top alternative. #[prost(int32, tag = "5")] pub speaker_tag: i32, } #[doc = r" Generated client implementations."] pub mod speech_client { #![allow(unused_variables, dead_code, missing_docs, clippy::let_unit_value)] use tonic::codegen::*; #[doc = " Service that implements Google Cloud Speech API."] #[derive(Debug, Clone)] pub struct SpeechClient { inner: tonic::client::Grpc, } impl SpeechClient where T: tonic::client::GrpcService, T::ResponseBody: Body + Send + 'static, T::Error: Into, ::Error: Into + Send, { pub fn new(inner: T) -> Self { let inner = tonic::client::Grpc::new(inner); Self { inner } } pub fn with_interceptor( inner: T, interceptor: F, ) -> SpeechClient> where F: tonic::service::Interceptor, T: tonic::codegen::Service< http::Request, Response = http::Response< >::ResponseBody, >, >, >>::Error: Into + Send + Sync, { SpeechClient::new(InterceptedService::new(inner, interceptor)) } #[doc = r" Compress requests with `gzip`."] #[doc = r""] #[doc = r" This requires the server to support it otherwise it might respond with an"] #[doc = r" error."] pub fn send_gzip(mut self) -> Self { self.inner = self.inner.send_gzip(); self } #[doc = r" Enable decompressing responses with `gzip`."] pub fn accept_gzip(mut self) -> Self { self.inner = self.inner.accept_gzip(); self } #[doc = " Performs synchronous speech recognition: receive results after all audio"] #[doc = " has been sent and processed."] pub async fn recognize( &mut self, request: impl tonic::IntoRequest, ) -> Result, tonic::Status> { self.inner.ready().await.map_err(|e| { tonic::Status::new( tonic::Code::Unknown, format!("Service was not ready: {}", e.into()), ) })?; let codec = tonic::codec::ProstCodec::default(); let path = http::uri::PathAndQuery::from_static("/google.cloud.speech.v1.Speech/Recognize"); self.inner.unary(request.into_request(), path, codec).await } #[doc = " Performs asynchronous speech recognition: receive results via the"] #[doc = " google.longrunning.Operations interface. Returns either an"] #[doc = " `Operation.error` or an `Operation.response` which contains"] #[doc = " a `LongRunningRecognizeResponse` message."] #[doc = " For more information on asynchronous speech recognition, see the"] #[doc = " [how-to](https://cloud.google.com/speech-to-text/docs/async-recognize)."] pub async fn long_running_recognize( &mut self, request: impl tonic::IntoRequest, ) -> Result< tonic::Response, tonic::Status, > { self.inner.ready().await.map_err(|e| { tonic::Status::new( tonic::Code::Unknown, format!("Service was not ready: {}", e.into()), ) })?; let codec = tonic::codec::ProstCodec::default(); let path = http::uri::PathAndQuery::from_static( "/google.cloud.speech.v1.Speech/LongRunningRecognize", ); self.inner.unary(request.into_request(), path, codec).await } #[doc = " Performs bidirectional streaming speech recognition: receive results while"] #[doc = " sending audio. This method is only available via the gRPC API (not REST)."] pub async fn streaming_recognize( &mut self, request: impl tonic::IntoStreamingRequest, ) -> Result< tonic::Response>, tonic::Status, > { self.inner.ready().await.map_err(|e| { tonic::Status::new( tonic::Code::Unknown, format!("Service was not ready: {}", e.into()), ) })?; let codec = tonic::codec::ProstCodec::default(); let path = http::uri::PathAndQuery::from_static( "/google.cloud.speech.v1.Speech/StreamingRecognize", ); self.inner.streaming(request.into_streaming_request(), path, codec).await } } }