// SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: MIT
// Copyright 2019 Google LLC.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

syntax = "proto3";

package nvidia.riva.asr;

option cc_enable_arenas = true;
option go_package = "nvidia.com/riva_speech";

import "riva/proto/riva_audio.proto";

/*
 * The RivaSpeechRecognition service provides two mechanisms for converting speech to text.
 */
service RivaSpeechRecognition {
  // Recognize expects a RecognizeRequest and returns a RecognizeResponse. This request blocks
  // until the audio is uploaded, processed, and a transcript is returned.
  rpc Recognize(RecognizeRequest) returns (RecognizeResponse) {}

  // StreamingRecognize is a non-blocking API call that allows audio data to be fed to the server
  // in chunks as it becomes available. Depending on the configuration in the
  // StreamingRecognizeRequest, intermediate results can be sent back to the client. Recognition
  // ends when the stream is closed by the client.
  rpc StreamingRecognize(stream StreamingRecognizeRequest)
      returns (stream StreamingRecognizeResponse) {}

  // Enables clients to request the configuration of the current ASR service, or of a specific
  // model within the service.
  rpc GetRivaSpeechRecognitionConfig(RivaSpeechRecognitionConfigRequest)
      returns (RivaSpeechRecognitionConfigResponse) {}
}

/*
 * RivaSpeechRecognitionConfigRequest
 */
message RivaSpeechRecognitionConfigRequest {
  // If model_name is specified, only the config for that model is returned; otherwise all
  // configs are returned.
  string model_name = 1;
}

message RivaSpeechRecognitionConfigResponse {
  message Config {
    string model_name = 1;
    map<string, string> parameters = 2;
  }

  repeated Config model_config = 1;
}

/*
 * RecognizeRequest is used for batch processing of a single audio recording.
 */
message RecognizeRequest {
  // Provides information to the recognizer that specifies how to process the request.
  RecognitionConfig config = 1;

  // The raw audio data to be processed. The audio bytes must be encoded as specified in
  // `RecognitionConfig`.
  bytes audio = 2;
}
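// Example (informative): a minimal sketch of a `RecognizeRequest` in textproto
// form. It assumes `LINEAR_PCM` is one of the `AudioEncoding` values imported
// from riva_audio.proto; all field values here are illustrative, not defaults:
//
//   config {
//     encoding: LINEAR_PCM
//     sample_rate_hertz: 16000
//     language_code: "en-US"
//     max_alternatives: 1
//     enable_automatic_punctuation: true
//   }
//   audio: "<raw audio bytes, encoded as specified in config>"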
/*
 * A StreamingRecognizeRequest is used to configure and stream audio content to the
 * Riva ASR Service. The first message sent must include only a StreamingRecognitionConfig.
 * Subsequent messages sent in the stream must contain only raw bytes of the audio
 * to be recognized.
 */
message StreamingRecognizeRequest {
  // The streaming request, which is either a streaming config or audio content.
  oneof streaming_request {
    // Provides information to the recognizer that specifies how to process the
    // request. The first `StreamingRecognizeRequest` message must contain a
    // `streaming_config` message.
    StreamingRecognitionConfig streaming_config = 1;

    // The audio data to be recognized. Sequential chunks of audio data are sent
    // in sequential `StreamingRecognizeRequest` messages. The first
    // `StreamingRecognizeRequest` message must not contain `audio_content` data,
    // and all subsequent `StreamingRecognizeRequest` messages must contain
    // `audio_content` data. The audio bytes must be encoded as specified in
    // `RecognitionConfig`.
    bytes audio_content = 2;
  }
}

// Provides information to the recognizer that specifies how to process the request.
message RecognitionConfig {
  // The encoding of the audio data sent in the request.
  //
  // All encodings support only 1 channel (mono) audio.
  AudioEncoding encoding = 1;

  // The sample rate in hertz (Hz) of the audio data sent in the
  // `RecognizeRequest` or `StreamingRecognizeRequest` messages.
  // The Riva server automatically down-samples/up-samples the audio to match the
  // ASR acoustic model's sample rate. Sample rates below 8 kHz will not produce
  // any meaningful output.
  int32 sample_rate_hertz = 2;

  // Required. The language of the supplied audio as a
  // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
  // Example: "en-US".
  string language_code = 3;

  // Maximum number of recognition hypotheses to be returned.
  // Specifically, the maximum number of `SpeechRecognitionAlternative` messages
  // within each `SpeechRecognitionResult`.
  // The server may return fewer than `max_alternatives`.
  // If omitted, at most one alternative is returned.
  int32 max_alternatives = 4;

  // A custom field that enables profanity filtering for the generated transcripts.
  // If set to `true`, the server filters out profanities, replacing all but the initial
  // character in each filtered word with asterisks. For example, "x**".
  // If set to `false` or omitted, profanities are not filtered out. The default is `false`.
  bool profanity_filter = 5;

  // Array of SpeechContext.
  // A means to provide context to assist the speech recognition. For more
  // information, see the SpeechContext section.
  repeated SpeechContext speech_contexts = 6;

  // The number of channels in the input audio data.
  // ONLY set this for MULTI-CHANNEL recognition.
  // Valid values for LINEAR16 and FLAC are `1`-`8`.
  // Valid values for OGG_OPUS are `1`-`254`.
  // The only valid value for MULAW, AMR, AMR_WB, and SPEEX_WITH_HEADER_BYTE is `1`.
  // If `0` or omitted, defaults to one channel (mono).
  // Note: only the first channel is recognized by default.
  // To perform independent recognition on each channel, set
  // `enable_separate_recognition_per_channel` to `true`.
  int32 audio_channel_count = 7;

  // If `true`, the top result includes a list of words with the start and end
  // time offsets (timestamps) and confidence scores for those words. If
  // `false`, no word-level time offset information is returned. The default
  // is `false`.
  bool enable_word_time_offsets = 8;

  // If `true`, adds punctuation to recognition result hypotheses. The
  // default `false` value does not add punctuation to result hypotheses.
  bool enable_automatic_punctuation = 11;

  // This needs to be set to `true` explicitly, with `audio_channel_count` > 1,
  // to get each channel recognized separately. The recognition result then
  // contains a `channel_tag` field stating which channel the result belongs
  // to. If this is not `true`, only the first channel is recognized. The
  // request is billed cumulatively for all channels recognized:
  // `audio_channel_count` multiplied by the length of the audio.
  bool enable_separate_recognition_per_channel = 12;

  // Which model to select for the given request.
  // If empty, Riva selects the right model based on the other RecognitionConfig parameters.
  // The model should correspond to the name passed to `riva-build` with the `--name` argument.
  string model = 13;

  // The verbatim_transcripts flag enables or disables inverse text normalization.
  // `true` returns exactly what was said, with no denormalization.
  // `false` applies inverse text normalization; this is the default.
  bool verbatim_transcripts = 14;

  // Config to enable speaker diarization and set additional
  // parameters. For non-streaming requests, the diarization results are provided only
  // in the top alternative of the FINAL SpeechRecognitionResult.
  SpeakerDiarizationConfig diarization_config = 19;

  // Custom fields for passing request-level
  // configuration options to plugins used in the
  // model pipeline.
  map<string, string> custom_configuration = 24;
}

// Provides information to the recognizer that specifies how to process the request.
message StreamingRecognitionConfig {
  // Provides information to the recognizer that specifies how to process the request.
  RecognitionConfig config = 1;

  // If `true`, interim results (tentative hypotheses) may be
  // returned as they become available (these interim results are indicated with
  // the `is_final=false` flag).
  // If `false` or omitted, only `is_final=true` result(s) are returned.
  bool interim_results = 2;
}
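// Example (informative): a sketch of the streaming message sequence in textproto
// form. The first request carries only `streaming_config`; every subsequent
// request carries only a chunk of `audio_content`. The encoding value assumes
// `LINEAR_PCM` exists in `AudioEncoding`; all values are illustrative:
//
//   // First StreamingRecognizeRequest:
//   streaming_config {
//     config {
//       encoding: LINEAR_PCM
//       sample_rate_hertz: 16000
//       language_code: "en-US"
//     }
//     interim_results: true
//   }
//
//   // Each subsequent StreamingRecognizeRequest:
//   audio_content: "<next chunk of raw audio bytes>"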
// Config to enable speaker diarization.
message SpeakerDiarizationConfig {
  // If `true`, enables speaker detection for each recognized word in
  // the top alternative of the recognition result, using a `speaker_tag` provided
  // in the WordInfo.
  bool enable_speaker_diarization = 1;

  // Maximum number of speakers in the conversation. This gives flexibility by
  // allowing the system to automatically determine the correct number of speakers.
  // If not set, the default value is 8.
  int32 max_speaker_count = 2;
}

// Provides "hints" to the speech recognizer to favor specific words and phrases
// in the results.
message SpeechContext {
  // A list of strings containing word and phrase "hints" so that
  // the speech recognition is more likely to recognize them. This can be used
  // to improve the accuracy for specific words and phrases, for example, if
  // specific commands are typically spoken by the user. This can also be used
  // to add additional words to the vocabulary of the recognizer.
  repeated string phrases = 1;

  // Hint Boost. A positive value increases the probability that a specific
  // phrase is recognized over other similar-sounding phrases. The higher
  // the boost, the higher the chance of false positive recognition as well.
  // Though `boost` can accept a wide range of positive values, most use cases are
  // best served with values between 0 and 20. We recommend using a binary search
  // approach to finding the optimal value for your use case.
  float boost = 4;
}
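// Example (informative): a sketch of a `SpeechContext` inside a
// `RecognitionConfig`, biasing recognition toward domain terms. The phrases and
// boost value are illustrative; per the guidance above, boosts between 0 and 20
// suit most use cases:
//
//   speech_contexts {
//     phrases: "Riva"
//     phrases: "TensorRT"
//     boost: 10.0
//   }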
// The only message returned to the client by the `Recognize` method. It
// contains the result as zero or more sequential `SpeechRecognitionResult`
// messages.
message RecognizeResponse {
  // Sequential list of transcription results corresponding to
  // sequential portions of audio. Currently only one transcript is returned.
  repeated SpeechRecognitionResult results = 1;
}

// A speech recognition result corresponding to the latest transcript.
message SpeechRecognitionResult {
  // May contain one or more recognition hypotheses (up to the
  // maximum specified in `max_alternatives`).
  // These alternatives are ordered in terms of accuracy, with the top (first)
  // alternative being the most probable, as ranked by the recognizer.
  repeated SpeechRecognitionAlternative alternatives = 1;

  // For multi-channel audio, this is the channel number corresponding to the
  // recognized result for the audio from that channel.
  // For audio_channel_count = N, its output values can range from `1` to `N`.
  int32 channel_tag = 2;

  // Length of audio processed so far, in seconds.
  float audio_processed = 3;
}

// Alternative hypotheses (a.k.a. n-best list).
message SpeechRecognitionAlternative {
  // Transcript text representing the words that the user spoke.
  string transcript = 1;

  // The confidence estimate. A higher number indicates an estimated greater
  // likelihood that the recognized word is correct. This field is set only for
  // a non-streaming result or for a streaming result where `is_final=true`.
  // This field is not guaranteed to be accurate, and users should not rely on
  // it to always be provided. Although confidence can currently be roughly
  // interpreted as a natural-log probability, the estimate computation varies
  // with different configurations and is subject to change. The default of
  // 0.0 is a sentinel value indicating that confidence was not set.
  float confidence = 2;

  // A list of word-specific information for each recognized word. Only populated
  // if `is_final=true`.
  repeated WordInfo words = 3;
}

// Word-specific information for recognized words.
message WordInfo {
  // Time offset, in ms, relative to the beginning of the audio
  // and corresponding to the start of the spoken word.
  // This field is only set if `enable_word_time_offsets=true` and only
  // in the top hypothesis.
  int32 start_time = 1;

  // Time offset, in ms, relative to the beginning of the audio
  // and corresponding to the end of the spoken word.
  // This field is only set if `enable_word_time_offsets=true` and only
  // in the top hypothesis.
  int32 end_time = 2;

  // The word corresponding to this set of information.
  string word = 3;

  // The confidence estimate. A higher number indicates an estimated greater
  // likelihood that the recognized word is correct. This field is set only for
  // a non-streaming result or for a streaming result where `is_final=true`.
  // This field is not guaranteed to be accurate, and users should not rely on
  // it to always be provided. Although confidence can currently be roughly
  // interpreted as a natural-log probability, the estimate computation varies
  // with different configurations and is subject to change. The default of
  // 0.0 is a sentinel value indicating that confidence was not set.
  float confidence = 4;

  // Output only. A distinct integer value is assigned for every speaker within
  // the audio. This field specifies which one of those speakers was detected to
  // have spoken this word. Values range from `1` to `max_speaker_count`.
  // speaker_tag is set only if `enable_speaker_diarization=true` and only in the
  // top alternative.
  int32 speaker_tag = 5;
}
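// Example (informative): with `enable_word_time_offsets: true` and speaker
// diarization enabled, the top alternative of a final result might carry
// entries such as the following (values illustrative; times are in ms):
//
//   words { word: "hello" start_time: 120 end_time: 480 speaker_tag: 1 }
//   words { word: "there" start_time: 520 end_time: 900 speaker_tag: 1 }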
// `StreamingRecognizeResponse` is the only message returned to the client by
// `StreamingRecognize`. A series of zero or more `StreamingRecognizeResponse`
// messages are streamed back to the client.
//
// Here are a few examples of `StreamingRecognizeResponse`s:
//
// 1. results { alternatives { transcript: "tube" } stability: 0.01 }
//
// 2. results { alternatives { transcript: "to be a" } stability: 0.01 }
//
// 3. results { alternatives { transcript: "to be or not to be"
//                             confidence: 0.92 }
//              alternatives { transcript: "to bee or not to bee" }
//              is_final: true }
//
message StreamingRecognizeResponse {
  // This repeated list contains the latest transcript(s) corresponding to
  // audio currently being processed.
  // Currently one result is returned, where each result can have multiple
  // alternatives.
  repeated StreamingRecognitionResult results = 1;
}

// A streaming speech recognition result corresponding to a portion of the audio
// that is currently being processed.
message StreamingRecognitionResult {
  // May contain one or more recognition hypotheses (up to the
  // maximum specified in `max_alternatives`).
  // These alternatives are ordered in terms of accuracy, with the top (first)
  // alternative being the most probable, as ranked by the recognizer.
  repeated SpeechRecognitionAlternative alternatives = 1;

  // If `false`, this `StreamingRecognitionResult` represents an
  // interim result that may change. If `true`, this is the final time the
  // speech service will return this particular `StreamingRecognitionResult`;
  // the recognizer will not return any further hypotheses for this portion of
  // the transcript and corresponding audio.
  bool is_final = 2;

  // An estimate of the likelihood that the recognizer will not
  // change its guess about this interim result. Values range from 0.0
  // (completely unstable) to 1.0 (completely stable).
  // This field is only provided for interim results (`is_final=false`).
  // The default of 0.0 is a sentinel value indicating that `stability` was not set.
  float stability = 3;

  // For multi-channel audio, this is the channel number corresponding to the
  // recognized result for the audio from that channel.
  // For audio_channel_count = N, its output values can range from `1` to `N`.
  int32 channel_tag = 5;

  // Length of audio processed so far, in seconds.
  float audio_processed = 6;
}
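// Example (informative): a sketch of a final streaming result carrying the
// channel and progress metadata defined above (values illustrative):
//
//   results {
//     alternatives { transcript: "to be or not to be" confidence: 0.92 }
//     is_final: true
//     channel_tag: 1
//     audio_processed: 4.8
//   }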