// SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: MIT

syntax = "proto3";

package nvidia.riva.tts;

option cc_enable_arenas = true;
option go_package = "nvidia.com/riva_speech";

import "riva/proto/riva_audio.proto";

// Text-to-speech service: unary synthesis, server-streaming synthesis, and
// a configuration query.
service RivaSpeechSynthesis {
  // Used to request text-to-speech from the service. Submit a request
  // containing the desired text and configuration, and receive audio bytes in
  // the requested format.
  rpc Synthesize(SynthesizeSpeechRequest) returns (SynthesizeSpeechResponse) {}

  // Used to request text-to-speech returned via stream as it becomes
  // available. Submit a SynthesizeSpeechRequest with desired text and
  // configuration, and receive stream of bytes in the requested format.
  rpc SynthesizeOnline(SynthesizeSpeechRequest)
      returns (stream SynthesizeSpeechResponse) {}

  // Enables clients to request the configuration of the current Synthesize
  // service, or a specific model within the service.
  rpc GetRivaSynthesisConfig(RivaSynthesisConfigRequest)
      returns (RivaSynthesisConfigResponse) {}
}

// Request message for GetRivaSynthesisConfig.
message RivaSynthesisConfigRequest {
  // If model is specified only return config for model, otherwise return all
  // configs.
  string model_name = 1;
}

// Response message for GetRivaSynthesisConfig.
message RivaSynthesisConfigResponse {
  // Configuration entry for a single model.
  message Config {
    // Name of the model this entry belongs to.
    string model_name = 1;

    // Configuration parameters of the model as key/value pairs.
    // NOTE(review): the map type arguments were missing from this
    // declaration; string keys and string values are assumed here. Confirm
    // against the server implementation before publishing.
    map<string, string> parameters = 2;
  }

  // Model configurations returned by the service.
  repeated Config model_config = 1;
}

// Request message shared by Synthesize and SynthesizeOnline.
message SynthesizeSpeechRequest {
  // Text to be synthesized.
  string text = 1;

  // Language code for the request.
  // NOTE(review): the expected format (e.g. BCP-47) is not stated in this
  // file; confirm before documenting it for clients.
  string language_code = 2;

  // audio encoding params
  AudioEncoding encoding = 3;

  // The sample rate in hertz (Hz) of the audio output requested through
  // `SynthesizeSpeechRequest` messages. Models produce an output at a fixed
  // rate. The sample rate enables you to resample the generated audio output
  // if required. You use the sample rate to up-sample or down-sample the audio
  // for various scenarios. For example, the sample rate can be set to 8kHz
  // (kilohertz) if the output audio is desired for a low bandwidth
  // application.
  // The sample rate values below 8kHz will not produce any meaningful output.
  // Also, up-sampling too much will increase the size of the output without
  // improving the output audio quality.
  int32 sample_rate_hz = 4;

  // voice params
  string voice_name = 5;
}

// Currently experimental API addition that returns the input text
// after preprocessing has been completed as well as the predicted
// duration for each token.
// Note: this message is subject to future breaking changes, and potential
// removal.
message SynthesizeSpeechResponseMetadata {
  // NOTE(review): presumably the input text as submitted by the client;
  // confirm against the server implementation.
  string text = 1;

  // Input text after preprocessing has been completed.
  string processed_text = 2;

  // Predicted duration for each token. The unit is not stated in this file.
  // The field number is 8 and must not be renumbered: field numbers are part
  // of the wire contract.
  repeated float predicted_durations = 8;
}

// Response message for Synthesize; SynthesizeOnline returns a stream of
// these messages.
message SynthesizeSpeechResponse {
  // Synthesized audio bytes in the requested format.
  bytes audio = 1;

  // Experimental metadata about the synthesis; see
  // SynthesizeSpeechResponseMetadata.
  SynthesizeSpeechResponseMetadata meta = 2;
}

/*
 *
 */