// Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.

syntax = "proto3";

package nvidia.riva.nmt;

option cc_enable_arenas = true;
option go_package = "nvidia.com/riva_speech";

import "riva/proto/riva_audio.proto";
import "riva/proto/riva_asr.proto";
import "riva/proto/riva_tts.proto";

/*
 * RivaTranslation service provides RPCs to translate between languages.
 */
service RivaTranslation {
  // Translate text to text, from a source to a target language. Currently the
  // source and target language fields are required, along with the model name.
  // Multiple texts may be passed per request, up to the batch size for the
  // model, which is set at translation pipeline creation time.
  rpc TranslateText(TranslateTextRequest) returns (TranslateTextResponse) {}

  // Lists the available language pairs and model names to be used for
  // TranslateText.
  rpc ListSupportedLanguagePairs(AvailableLanguageRequest)
      returns (AvailableLanguageResponse) {}

  // Streaming speech-to-text translation API.
  rpc StreamingTranslateSpeechToText(
      stream StreamingTranslateSpeechToTextRequest)
      returns (stream StreamingTranslateSpeechToTextResponse) {}

  // Streaming speech-to-speech translation API.
  rpc StreamingTranslateSpeechToSpeech(
      stream StreamingTranslateSpeechToSpeechRequest)
      returns (stream StreamingTranslateSpeechToSpeechResponse) {}
}

/*
 * Configuration for Translate S2S. Reuses existing protos from other services.
 */
message StreamingTranslateSpeechToSpeechConfig {
  // Speech recognition settings; the type comes from riva_asr.proto.
  nvidia.riva.asr.StreamingRecognitionConfig asr_config = 1;

  // Speech synthesis settings (see SynthesizeSpeechConfig).
  SynthesizeSpeechConfig tts_config = 2;

  // Source/target language and model selection for the translation step.
  TranslationConfig translation_config = 3;
}

/*
 * Streaming translate speech to speech request, used to configure the entire
 * pipeline for speech translation. This can be backed by a cascade of ASR,
 * NMT, TTS models or by an end-to-end model.
 */
message StreamingTranslateSpeechToSpeechRequest {
  // Exactly one of the following is carried per message.
  // NOTE(review): presumably the config is sent first and audio afterwards;
  // confirm against the server implementation.
  oneof streaming_request {
    // Pipeline configuration.
    StreamingTranslateSpeechToSpeechConfig config = 1;

    // Raw audio bytes to translate.
    bytes audio_content = 2;
  }
}

// Language and model selection shared by the streaming translation configs.
message TranslationConfig {
  // BCP-47 language code, e.g. "en-US".
  string source_language_code = 1;

  // Language code of the translation output.
  string target_language_code = 2;

  // Name of the translation model to use.
  string model_name = 3;
}

// Output audio settings for the speech synthesis stage of speech-to-speech
// translation.
message SynthesizeSpeechConfig {
  // Audio encoding; AudioEncoding is not declared in this file and is expected
  // to come from one of the imports.
  AudioEncoding encoding = 1;

  // Sample rate in hertz.
  int32 sample_rate_hz = 2;

  // Name of the voice to synthesize with.
  string voice_name = 3;

  // Language code for synthesis.
  string language_code = 4;
}

// Response message of the StreamingTranslateSpeechToSpeech RPC.
message StreamingTranslateSpeechToSpeechResponse {
  // Contains speech responses. The last response sends an empty buffer to mark
  // the end of stream. The type comes from riva_tts.proto.
  nvidia.riva.tts.SynthesizeSpeechResponse speech = 1;
}

// Request message of the StreamingTranslateSpeechToText RPC.
message StreamingTranslateSpeechToTextRequest {
  // Exactly one of the following is carried per message.
  oneof streaming_request {
    // Pipeline configuration.
    StreamingTranslateSpeechToTextConfig config = 1;

    // Raw audio bytes to translate.
    bytes audio_content = 2;
  }
}

// Response message of the StreamingTranslateSpeechToText RPC.
message StreamingTranslateSpeechToTextResponse {
  // Results reuse the ASR streaming result type from riva_asr.proto.
  repeated nvidia.riva.asr.StreamingRecognitionResult results = 1;
}

// Configuration for streaming speech-to-text translation.
message StreamingTranslateSpeechToTextConfig {
  // Existing ASR config, from riva_asr.proto.
  nvidia.riva.asr.StreamingRecognitionConfig asr_config = 1;

  // Source/target language and model selection for the translation step.
  TranslationConfig translation_config = 2;
}

// Request for synchronous translation of each text in texts.
// Available languages can be queried using the ListSupportedLanguagePairs RPC.
// Source and target languages must be specified; they are currently
// two-character ISO codes. This will likely change to BCP-47, in line with
// other Riva services, for GA.
message TranslateTextRequest {
  // Texts to translate.
  repeated string texts = 1;

  // Name of the translation model to use.
  string model = 2;

  // Language of the input texts.
  string source_language = 3;

  // Language to translate into.
  string target_language = 4;
}

// Contains a single translation, collected into the TranslateTextResponse.
// Includes the target language code, since with multilingual models there are
// multiple possibilities.
message Translation {
  // Translated text.
  string text = 1;

  // Target language code of this translation.
  string language = 2;
}

// Translations are returned as text:language pairs. These are 1:1 with the
// 'texts' passed in the request.
message TranslateTextResponse {
  repeated Translation translations = 1;
}

// Requests a map of model names to their source and target language pairs.
// A specific model name can be given to retrieve only its language pairs.
message AvailableLanguageRequest {
  // If empty, returns all available languages.
  string model = 1;
}

// Language pairs are the sets of source to target languages available per
// model.
message AvailableLanguageResponse {
  // Source and target languages offered by one model.
  message LanguagePair {
    repeated string src_lang = 1;
    repeated string tgt_lang = 2;
  }

  // Maps each model name to its language pair.
  map<string, LanguagePair> languages = 1;
}