// Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Copyright (c) 2018, TensorFlow Authors. All rights reserved. syntax = "proto3"; package inference; //@@.. cpp:namespace:: inference //@@ //@@.. cpp:enum:: DataType //@@ //@@ Data types supported for input and output tensors. //@@ enum DataType { //@@ .. cpp:enumerator:: DataType::INVALID = 0 TYPE_INVALID = 0; //@@ .. cpp:enumerator:: DataType::BOOL = 1 TYPE_BOOL = 1; //@@ .. cpp:enumerator:: DataType::UINT8 = 2 TYPE_UINT8 = 2; //@@ .. cpp:enumerator:: DataType::UINT16 = 3 TYPE_UINT16 = 3; //@@ .. cpp:enumerator:: DataType::UINT32 = 4 TYPE_UINT32 = 4; //@@ .. cpp:enumerator:: DataType::UINT64 = 5 TYPE_UINT64 = 5; //@@ .. cpp:enumerator:: DataType::INT8 = 6 TYPE_INT8 = 6; //@@ .. cpp:enumerator:: DataType::INT16 = 7 TYPE_INT16 = 7; //@@ .. cpp:enumerator:: DataType::INT32 = 8 TYPE_INT32 = 8; //@@ .. cpp:enumerator:: DataType::INT64 = 9 TYPE_INT64 = 9; //@@ .. cpp:enumerator:: DataType::FP16 = 10 TYPE_FP16 = 10; //@@ .. cpp:enumerator:: DataType::FP32 = 11 TYPE_FP32 = 11; //@@ .. cpp:enumerator:: DataType::FP64 = 12 TYPE_FP64 = 12; //@@ .. cpp:enumerator:: DataType::STRING = 13 TYPE_STRING = 13; //@@ .. cpp:enumerator:: DataType::BF16 = 14 TYPE_BF16 = 14; } //@@ //@@ .. cpp:var:: message ModelRateLimiter //@@ //@@ The specifications required by the rate limiter to properly //@@ schedule the inference requests across the different models //@@ and their instances. //@@ message ModelRateLimiter { //@@ .. cpp:var:: message Resource //@@ //@@ The resource property. //@@ message Resource { //@@ .. cpp:var:: string name //@@ //@@ The name associated with the resource. //@@ string name = 1; //@@ .. cpp:var:: bool global //@@ //@@ Whether or not the resource is global. If true then the resource //@@ is assumed to be shared among the devices otherwise specified //@@ count of the resource is assumed for each device associated //@@ with the instance. //@@ bool global = 2; //@@ .. 
cpp:var:: uint32 count //@@ //@@ The number of resources required for the execution of the model //@@ instance. //@@ uint32 count = 3; } //@@ .. cpp:var:: Resource resources (repeated) //@@ //@@ The resources required to execute the request on a model instance. //@@ Resources are just names with a corresponding count. The execution //@@ of the instance will be blocked until the specified resources are //@@ available. By default an instance uses no rate-limiter resources. //@@ repeated Resource resources = 1; //@@ .. cpp:var:: uint32 priority //@@ //@@ The optional weighting value to be used for prioritizing across //@@ instances. An instance with priority 2 will be given 1/2 the //@@ number of scheduling chances as an instance_group with priority //@@ 1. The default priority is 1. A priority value of 0 will be //@@ treated as priority 1. //@@ uint32 priority = 2; } //@@ //@@.. cpp:var:: message ModelInstanceGroup //@@ //@@ A group of one or more instances of a model and resources made //@@ available for those instances. //@@ message ModelInstanceGroup { //@@ //@@ .. cpp:enum:: Kind //@@ //@@ Kind of this instance group. //@@ enum Kind { //@@ .. cpp:enumerator:: Kind::KIND_AUTO = 0 //@@ //@@ This instance group represents instances that can run on either //@@ CPU or GPU. If all GPUs listed in 'gpus' are available then //@@ instances will be created on GPU(s), otherwise instances will //@@ be created on CPU. //@@ KIND_AUTO = 0; //@@ .. cpp:enumerator:: Kind::KIND_GPU = 1 //@@ //@@ This instance group represents instances that must run on the //@@ GPU. //@@ KIND_GPU = 1; //@@ .. cpp:enumerator:: Kind::KIND_CPU = 2 //@@ //@@ This instance group represents instances that must run on the //@@ CPU. //@@ KIND_CPU = 2; //@@ .. cpp:enumerator:: Kind::KIND_MODEL = 3 //@@ //@@ This instance group represents instances that should run on the //@@ CPU and/or GPU(s) as specified by the model or backend itself. //@@ The inference server will not override the model/backend //@@ settings. //@@ KIND_MODEL = 3; } //@@ //@@ .. cpp:var:: message SecondaryDevice //@@ //@@ A secondary device required for a model instance. //@@ message SecondaryDevice { //@@ //@@ .. cpp:enum:: SecondaryDeviceKind //@@ //@@ The kind of the secondary device. //@@ enum SecondaryDeviceKind { //@@ .. cpp:enumerator:: SecondaryDeviceKind::KIND_NVDLA = 0 //@@ //@@ An NVDLA core. http://nvdla.org //@@ Currently KIND_NVDLA is only supported by the TensorRT backend. //@@ KIND_NVDLA = 0; } //@@ .. cpp:var:: SecondaryDeviceKind kind //@@ //@@ The secondary device kind. //@@ SecondaryDeviceKind kind = 1; //@@ .. cpp:var:: int64 device_id //@@ //@@ Identifier for the secondary device. //@@ int64 device_id = 2; } //@@ .. cpp:var:: string name //@@ //@@ Optional name of this group of instances. If not specified the //@@ name will be formed from the model name and the group number. The name of //@@ individual instances will be further formed by a unique instance //@@ number and GPU index: //@@ string name = 1; //@@ .. cpp:var:: Kind kind //@@ //@@ The kind of this instance group. Default is KIND_AUTO. If //@@ KIND_AUTO or KIND_GPU then both 'count' and 'gpus' are valid and //@@ may be specified. If KIND_CPU or KIND_MODEL only 'count' is valid //@@ and 'gpus' cannot be specified. //@@ Kind kind = 4; //@@ .. cpp:var:: int32 count //@@ //@@ For a group assigned to GPU, the number of instances created for //@@ each GPU listed in 'gpus'. For a group assigned to CPU the number //@@ of instances created. Default is 1. //@@ int32 count = 2;
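//
// As an illustration only (not part of the original specification; the
// resource name "R1" and all counts are made-up placeholders), an
// 'instance_group' entry in a model's config.pbtxt that creates two
// instances on each of GPUs 0 and 1 and attaches rate-limiter resources
// might look like this:
//
//   instance_group [
//     {
//       count: 2
//       kind: KIND_GPU
//       gpus: [ 0, 1 ]
//       rate_limiter {
//         resources [ { name: "R1", count: 4 } ]
//         priority: 2
//       }
//     }
//   ]
//
// In this sketch an instance can execute only while it holds 4 units of
// "R1", and it receives half the scheduling chances of a priority-1 group.
//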
//@@ .. cpp:var:: ModelRateLimiter rate_limiter //@@ //@@ The rate limiter specific settings to be associated with this //@@ instance group. Optional, if not specified no rate limiting //@@ will be applied to this instance group. //@@ ModelRateLimiter rate_limiter = 6; //@@ .. cpp:var:: int32 gpus (repeated) //@@ //@@ GPU(s) where instances should be available. For each GPU listed, //@@ 'count' instances of the model will be available. Setting 'gpus' //@@ to empty (or not specifying at all) is equivalent to listing all //@@ available GPUs. //@@ repeated int32 gpus = 3; //@@ .. cpp:var:: SecondaryDevice secondary_devices (repeated) //@@ //@@ Secondary devices that are required by instances specified by this //@@ instance group. Optional. //@@ repeated SecondaryDevice secondary_devices = 8; //@@ .. cpp:var:: string profile (repeated) //@@ //@@ For TensorRT models containing multiple optimization profiles, this //@@ parameter specifies a set of optimization profiles available to this //@@ instance group. The inference server will choose the optimal profile //@@ based on the shapes of the input tensors. Each value in this field //@@ should lie between 0 and one less than the number of optimization //@@ profiles in the TensorRT engine, and the field should be specified //@@ only for the TensorRT backend, otherwise an error will //@@ be generated. If not specified, the server will select the first //@@ optimization profile by default. //@@ repeated string profile = 5; //@@ .. cpp:var:: bool passive //@@ //@@ Whether the instances within this instance group will be accepting //@@ inference requests from the scheduler. If true, the instances will //@@ not be added to the scheduler. Default value is false. //@@ bool passive = 7; //@@ .. cpp:var:: string host_policy //@@ //@@ The host policy name that the instance should be associated with. //@@ The default value is set to reflect the device kind of the instance, //@@ for instance, KIND_CPU is "cpu", KIND_MODEL is "model" and //@@ KIND_GPU is "gpu_" followed by the GPU ID. //@@ string host_policy = 9; } //@@ //@@.. cpp:var:: message ModelTensorReshape //@@ //@@ Reshape specification for input and output tensors. //@@ message ModelTensorReshape { //@@ .. cpp:var:: int64 shape (repeated) //@@ //@@ The shape to use for reshaping. //@@ repeated int64 shape = 1; } //@@ //@@.. cpp:var:: message ModelInput //@@ //@@ An input required by the model. //@@ message ModelInput { //@@ //@@ .. cpp:enum:: Format //@@ //@@ The format for the input. //@@ enum Format { //@@ .. cpp:enumerator:: Format::FORMAT_NONE = 0 //@@ //@@ The input has no specific format. This is the default. //@@ FORMAT_NONE = 0; //@@ .. cpp:enumerator:: Format::FORMAT_NHWC = 1 //@@ //@@ HWC image format. Tensors with this format require 3 dimensions //@@ if the model does not support batching (max_batch_size = 0) or 4 //@@ dimensions if the model does support batching (max_batch_size //@@ >= 1). In either case the 'dims' below should only specify the //@@ 3 non-batch dimensions (i.e. HWC or CHW). //@@ FORMAT_NHWC = 1; //@@ .. cpp:enumerator:: Format::FORMAT_NCHW = 2 //@@ //@@ CHW image format. Tensors with this format require 3 dimensions //@@ if the model does not support batching (max_batch_size = 0) or 4 //@@ dimensions if the model does support batching (max_batch_size //@@ >= 1). In either case the 'dims' below should only specify the //@@ 3 non-batch dimensions (i.e. HWC or CHW). //@@ FORMAT_NCHW = 2; } //@@ .. cpp:var:: string name //@@ //@@ The name of the input. //@@ string name = 1; //@@ .. cpp:var:: DataType data_type //@@ //@@ The data-type of the input. //@@ DataType data_type = 2;
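//
// A minimal sketch, for illustration only (the tensor name "INPUT0" and the
// shapes are made up): an input that the client supplies as a flat
// 16-element vector but that the backend expects as a 4x4 tensor, expressed
// with 'dims' plus 'reshape'. Note that the reshape has the same number of
// elements (16) as 'dims'.
//
//   input [
//     {
//       name: "INPUT0"
//       data_type: TYPE_FP32
//       dims: [ 16 ]
//       reshape { shape: [ 4, 4 ] }
//     }
//   ]
//
//@@ ..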
cpp:var:: Format format //@@ //@@ The format of the input. Optional. //@@ Format format = 3; //@@ .. cpp:var:: int64 dims (repeated) //@@ //@@ The dimensions/shape of the input tensor that must be provided //@@ when invoking the inference API for this model. //@@ repeated int64 dims = 4; //@@ .. cpp:var:: ModelTensorReshape reshape //@@ //@@ The shape expected for this input by the backend. The input will //@@ be reshaped to this before being presented to the backend. The //@@ reshape must have the same number of elements as the input shape //@@ specified by 'dims'. Optional. //@@ ModelTensorReshape reshape = 5; //@@ .. cpp:var:: bool is_shape_tensor //@@ //@@ Whether or not the input is a shape tensor to the model. This field //@@ is currently supported only for the TensorRT model. An error will be //@@ generated if this specification does not comply with underlying //@@ model. //@@ bool is_shape_tensor = 6; //@@ .. cpp:var:: bool allow_ragged_batch //@@ //@@ Whether or not the input is allowed to be "ragged" in a dynamically //@@ created batch. Default is false indicating that two requests will //@@ only be batched if this tensor has the same shape in both requests. //@@ True indicates that two requests can be batched even if this tensor //@@ has a different shape in each request. //@@ bool allow_ragged_batch = 7; //@@ .. cpp:var:: bool optional //@@ //@@ Whether or not the input is optional for the model execution. //@@ If true, the input is not required in the inference request. //@@ Default value is false. //@@ bool optional = 8; } //@@ //@@.. cpp:var:: message ModelOutput //@@ //@@ An output produced by the model. //@@ message ModelOutput { //@@ .. cpp:var:: string name //@@ //@@ The name of the output. //@@ string name = 1; //@@ .. cpp:var:: DataType data_type //@@ //@@ The data-type of the output. //@@ DataType data_type = 2; //@@ .. cpp:var:: int64 dims (repeated) //@@ //@@ The dimensions/shape of the output tensor. //@@ repeated int64 dims = 3; //@@ .. cpp:var:: ModelTensorReshape reshape //@@ //@@ The shape produced for this output by the backend. The output will //@@ be reshaped from this to the shape specifed in 'dims' before being //@@ returned in the inference response. The reshape must have the same //@@ number of elements as the output shape specified by 'dims'. Optional. //@@ ModelTensorReshape reshape = 5; //@@ .. cpp:var:: string label_filename //@@ //@@ The label file associated with this output. Should be specified only //@@ for outputs that represent classifications. Optional. //@@ string label_filename = 4; //@@ .. cpp:var:: bool is_shape_tensor //@@ //@@ Whether or not the output is a shape tensor to the model. This field //@@ is currently supported only for the TensorRT model. An error will be //@@ generated if this specification does not comply with underlying //@@ model. //@@ bool is_shape_tensor = 6; } //@@ .. cpp:var:: message BatchInput //@@ //@@ A batch input is an additional input that must be added by //@@ the backend based on all the requests in a batch. //@@ message BatchInput { //@@ //@@ .. cpp:enum:: Kind //@@ //@@ The kind of the batch input. //@@ enum Kind { //@@ .. cpp:enumerator:: Kind::BATCH_ELEMENT_COUNT = 0 //@@ //@@ The element count of the 'source_input' will be added as //@@ input with shape [1]. //@@ BATCH_ELEMENT_COUNT = 0; //@@ .. cpp:enumerator:: Kind::BATCH_ACCUMULATED_ELEMENT_COUNT = 1 //@@ //@@ The accumulated element count of the 'source_input' will be //@@ added as input with shape [1]. 
For example, if there is a //@@ batch of two request, each with 2 elements, an input of value //@@ 2 will be added to the first request, and an input of value //@@ 4 will be added to the second request. //@@ BATCH_ACCUMULATED_ELEMENT_COUNT = 1; //@@ .. cpp:enumerator:: //@@ Kind::BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO = 2 //@@ //@@ The accumulated element count of the 'source_input' will be //@@ added as input with shape [1], except for the first request //@@ in the batch. For the first request in the batch, the input //@@ will have shape [2] where the first element is value 0. //@@ BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO = 2; //@@ .. cpp:enumerator:: Kind::BATCH_MAX_ELEMENT_COUNT_AS_SHAPE = 3 //@@ //@@ Among the requests in the batch, the max element count of the //@@ 'source_input' will be added as input with shape //@@ [max_element_count] for the first request in the batch. //@@ For other requests, such input will be with shape [0]. //@@ The data of the tensor will be uninitialized. //@@ BATCH_MAX_ELEMENT_COUNT_AS_SHAPE = 3; //@@ .. cpp:enumerator:: Kind::BATCH_ITEM_SHAPE = 4 //@@ //@@ Among the requests in the batch, the shape of the //@@ 'source_input' will be added as input with shape //@@ [batch_size, len(input_dim)]. For example, if one //@@ batch-2 input with shape [3, 1] and batch-1 input //@@ with shape [2, 2] are batched, the batch input will //@@ have shape [3, 2] and value [ [3, 1], [3, 1], [2, 2]]. //@@ BATCH_ITEM_SHAPE = 4; //@@ .. cpp:enumerator:: Kind::BATCH_ITEM_SHAPE_FLATTEN = 5 //@@ //@@ Among the requests in the batch, the shape of the //@@ 'source_input' will be added as input with single dimensional //@@ shape [batch_size * len(input_dim)]. For example, if one //@@ batch-2 input with shape [3, 1] and batch-1 input //@@ with shape [2, 2] are batched, the batch input will //@@ have shape [6] and value [3, 1, 3, 1, 2, 2]. //@@ BATCH_ITEM_SHAPE_FLATTEN = 5; } //@@ .. cpp:var:: Kind kind //@@ //@@ The kind of this batch input. //@@ Kind kind = 1; //@@ .. cpp:var:: string target_name (repeated) //@@ //@@ The name of the model inputs that the backend will create //@@ for this batch input. //@@ repeated string target_name = 2; //@@ .. cpp:var:: DataType data_type //@@ //@@ The input's datatype. The data type can be TYPE_INT32 or //@@ TYPE_FP32. //@@ DataType data_type = 3; //@@ .. cpp:var:: string source_input (repeated) //@@ //@@ The backend derives the value for each batch input from one or //@@ more other inputs. 'source_input' gives the names of those //@@ inputs. //@@ repeated string source_input = 4; } //@@.. cpp:var:: message BatchOutput //@@ //@@ A batch output is an output produced by the model that must be handled //@@ differently by the backend based on all the requests in a batch. //@@ message BatchOutput { //@@ //@@ .. cpp:enum:: Kind //@@ //@@ The kind of the batch output. //@@ enum Kind { //@@ .. cpp:enumerator:: Kind::BATCH_SCATTER_WITH_INPUT_SHAPE = 0 //@@ //@@ The output should be scattered according to the shape of //@@ 'source_input'. The dynamic dimension of the output will //@@ be set to the value of the same dimension in the input. //@@ BATCH_SCATTER_WITH_INPUT_SHAPE = 0; } //@@ .. cpp:var:: string target_name (repeated) //@@ //@@ The name of the outputs to be produced by this batch output //@@ specification. //@@ repeated string target_name = 1; //@@ .. cpp:var:: Kind kind //@@ //@@ The kind of this batch output. //@@ Kind kind = 2; //@@ .. 
cpp:var:: string source_input (repeated) //@@ //@@ The backend derives each batch output from one or more inputs. //@@ 'source_input' gives the names of those inputs. //@@ repeated string source_input = 3; } //@@ //@@.. cpp:var:: message ModelVersionPolicy //@@ //@@ Policy indicating which versions of a model should be made //@@ available by the inference server. //@@ message ModelVersionPolicy { //@@ .. cpp:var:: message Latest //@@ //@@ Serve only the latest version(s) of a model. This is //@@ the default policy. //@@ message Latest { //@@ .. cpp:var:: uint32 num_versions //@@ //@@ Serve only the 'num_versions' highest-numbered versions. T //@@ The default value of 'num_versions' is 1, indicating that by //@@ default only the single highest-number version of a //@@ model will be served. //@@ uint32 num_versions = 1; } //@@ .. cpp:var:: message All //@@ //@@ Serve all versions of the model. //@@ message All {} //@@ .. cpp:var:: message Specific //@@ //@@ Serve only specific versions of the model. //@@ message Specific { //@@ .. cpp:var:: int64 versions (repeated) //@@ //@@ The specific versions of the model that will be served. //@@ repeated int64 versions = 1; } //@@ .. cpp:var:: oneof policy_choice //@@ //@@ Each model must implement only a single version policy. The //@@ default policy is 'Latest'. //@@ oneof policy_choice { //@@ .. cpp:var:: Latest latest //@@ //@@ Serve only latest version(s) of the model. //@@ Latest latest = 1; //@@ .. cpp:var:: All all //@@ //@@ Serve all versions of the model. //@@ All all = 2; //@@ .. cpp:var:: Specific specific //@@ //@@ Serve only specific version(s) of the model. //@@ Specific specific = 3; } } //@@ //@@.. cpp:var:: message ModelOptimizationPolicy //@@ //@@ Optimization settings for a model. These settings control if/how a //@@ model is optimized and prioritized by the backend framework when //@@ it is loaded. //@@ message ModelOptimizationPolicy { //@@ //@@ .. cpp:var:: message Graph //@@ //@@ Enable generic graph optimization of the model. If not specified //@@ the framework's default level of optimization is used. Supports //@@ TensorFlow graphdef and savedmodel and Onnx models. For TensorFlow //@@ causes XLA to be enabled/disabled for the model. For Onnx defaults //@@ to enabling all optimizations, -1 enables only basic optimizations, //@@ +1 enables only basic and extended optimizations. //@@ message Graph { //@@ .. cpp:var:: int32 level //@@ //@@ The optimization level. Defaults to 0 (zero) if not specified. //@@ //@@ - -1: Disabled //@@ - 0: Framework default //@@ - 1+: Enable optimization level (greater values indicate //@@ higher optimization levels) //@@ int32 level = 1; } //@@ //@@ .. cpp:enum:: ModelPriority //@@ //@@ Model priorities. A model will be given scheduling and execution //@@ preference over models at lower priorities. Current model //@@ priorities only work for TensorRT models. //@@ enum ModelPriority { //@@ .. cpp:enumerator:: ModelPriority::PRIORITY_DEFAULT = 0 //@@ //@@ The default model priority. //@@ PRIORITY_DEFAULT = 0; //@@ .. cpp:enumerator:: ModelPriority::PRIORITY_MAX = 1 //@@ //@@ The maximum model priority. //@@ PRIORITY_MAX = 1; //@@ .. cpp:enumerator:: ModelPriority::PRIORITY_MIN = 2 //@@ //@@ The minimum model priority. //@@ PRIORITY_MIN = 2; } //@@ //@@ .. cpp:var:: message Cuda //@@ //@@ CUDA-specific optimization settings. //@@ message Cuda { //@@ .. cpp:var:: message GraphSpec //@@ //@@ Specification of the CUDA graph to be captured. //@@ message GraphSpec { //@@ .. 
cpp:var:: message Shape //@@ //@@ Specification of tensor dimension. //@@ message Shape { //@@ .. cpp:var:: int64 dim (repeated) //@@ //@@ The dimension. //@@ repeated int64 dim = 1; } message LowerBound { //@@ .. cpp:var:: int32 batch_size //@@ //@@ The batch size of the CUDA graph. If 'max_batch_size' is 0, //@@ 'batch_size' must be set to 0. Otherwise, 'batch_size' must //@@ be set to a value between 1 and 'max_batch_size'. //@@ int32 batch_size = 1; //@@ .. cpp:var:: map<string, Shape> input //@@ //@@ The specification of the inputs. 'Shape' is the shape of //@@ the input without the batching dimension. //@@ map<string, Shape> input = 2; } //@@ .. cpp:var:: int32 batch_size //@@ //@@ The batch size of the CUDA graph. If 'max_batch_size' is 0, //@@ 'batch_size' must be set to 0. Otherwise, 'batch_size' must //@@ be set to a value between 1 and 'max_batch_size'. //@@ int32 batch_size = 1; //@@ .. cpp:var:: map<string, Shape> input //@@ //@@ The specification of the inputs. 'Shape' is the shape of the //@@ input without the batching dimension. //@@ map<string, Shape> input = 2; //@@ .. cpp:var:: LowerBound graph_lower_bound //@@ //@@ Specify the lower bound of the CUDA graph. Optional. //@@ If specified, the graph can be used for input shapes and //@@ batch sizes that lie in the closed interval between the lower //@@ bound specification and the graph specification. For dynamic //@@ shape models, this allows CUDA graphs to be launched //@@ frequently without capturing all possible shape combinations. //@@ However, using the graph for shape combinations different from //@@ the ones used for capturing introduces uninitialized data for //@@ execution and it may distort the inference result if //@@ the model is sensitive to uninitialized data. //@@ LowerBound graph_lower_bound = 3; } //@@ .. cpp:var:: bool graphs //@@ //@@ Use the CUDA graphs API to capture model operations and execute //@@ them more efficiently. Default value is false. //@@ Currently only recognized by TensorRT backend. //@@ bool graphs = 1; //@@ .. cpp:var:: bool busy_wait_events //@@ //@@ Use busy-waiting to synchronize CUDA events to achieve minimum //@@ latency from event completion to host-thread notification, with //@@ the cost of high CPU load. Default value is false. //@@ Currently only recognized by TensorRT backend. //@@ bool busy_wait_events = 2; //@@ .. cpp:var:: GraphSpec graph_spec (repeated) //@@ //@@ Specification of the CUDA graph to be captured. If not specified //@@ and 'graphs' is true, the default CUDA graphs will be captured //@@ based on model settings. //@@ Currently only recognized by TensorRT backend. //@@ repeated GraphSpec graph_spec = 3; //@@ .. cpp:var:: bool output_copy_stream //@@ //@@ Uses a CUDA stream separate from the inference stream to copy the //@@ output to host. However, be aware that setting this option to //@@ true will lead to an increase in the memory consumption of the //@@ model as Triton will allocate twice as much GPU memory for its //@@ I/O tensor buffers. Default value is false. //@@ Currently only recognized by TensorRT backend. //@@ bool output_copy_stream = 4; }
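//
// Illustrative sketch only (the tensor name "INPUT0", the shapes, and the
// batch sizes are made-up values): enabling CUDA graph capture for a
// TensorRT model, with an explicit graph specification plus a lower bound
// so the captured graph can serve a range of batch sizes:
//
//   optimization {
//     cuda {
//       graphs: true
//       graph_spec [
//         {
//           batch_size: 8
//           input { key: "INPUT0" value { dim: [ 16 ] } }
//           graph_lower_bound {
//             batch_size: 1
//             input { key: "INPUT0" value { dim: [ 16 ] } }
//           }
//         }
//       ]
//     }
//   }
//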
//@@ //@@ .. cpp:var:: message ExecutionAccelerators //@@ //@@ Specify the preferred execution accelerators to be used to execute //@@ the model. Currently only recognized by ONNX Runtime backend and //@@ TensorFlow backend. //@@ //@@ For ONNX Runtime backend, the model will be deployed with the execution //@@ accelerators in priority order; the priority is determined by the //@@ order in which they are set, i.e. the provider at the front has the //@@ highest priority. Overall, the priority will be in the following order: //@@ the 'gpu_execution_accelerator' entries (if the instance is on GPU), //@@ the CUDA Execution Provider (if the instance is on GPU), //@@ the 'cpu_execution_accelerator' entries, and finally the //@@ default CPU Execution Provider. //@@ message ExecutionAccelerators { //@@ //@@ .. cpp:var:: message Accelerator //@@ //@@ Specify the accelerator to be used to execute the model. //@@ An accelerator with the same name may accept different parameters //@@ depending on the backend. //@@ message Accelerator { //@@ .. cpp:var:: string name //@@ //@@ The name of the execution accelerator. //@@ string name = 1; //@@ .. cpp:var:: map<string, string> parameters //@@ //@@ Additional parameters used to configure the accelerator. //@@ map<string, string> parameters = 2; } //@@ .. cpp:var:: Accelerator gpu_execution_accelerator (repeated) //@@ //@@ The preferred execution provider to be used if the model instance //@@ is deployed on GPU. //@@ //@@ For ONNX Runtime backend, possible value is "tensorrt" as name, //@@ and no parameters are required. //@@ //@@ For TensorFlow backend, possible values are "tensorrt", //@@ "auto_mixed_precision", "gpu_io". //@@ //@@ For "tensorrt", the following parameters can be specified: //@@ "precision_mode": The precision used for optimization. //@@ Allowed values are "FP32" and "FP16". Default value is "FP32". //@@ //@@ "max_cached_engines": The maximum number of cached TensorRT //@@ engines in dynamic TensorRT ops. Default value is 100. //@@ //@@ "minimum_segment_size": The smallest model subgraph that will //@@ be considered for optimization by TensorRT. Default value is 3. //@@ //@@ "max_workspace_size_bytes": The maximum GPU memory the model //@@ can use temporarily during execution. Default value is 1GB. //@@ //@@ For "auto_mixed_precision", no parameters are required. If set, //@@ the model will try to use FP16 for better performance. //@@ This optimization cannot be set with "tensorrt". //@@ //@@ For "gpu_io", no parameters are required. If set, the model will //@@ be executed using the TensorFlow Callable API to set input and output //@@ tensors in GPU memory if possible, which can reduce data transfer //@@ overhead if the model is used in an ensemble. However, the Callable //@@ object will be created on model creation and it will request all //@@ outputs for every model execution, which may impact the //@@ performance if a request does not require all outputs. This //@@ optimization will only take effect if the model instance is //@@ created with KIND_GPU. //@@ repeated Accelerator gpu_execution_accelerator = 1; //@@ .. cpp:var:: Accelerator cpu_execution_accelerator (repeated) //@@ //@@ The preferred execution provider to be used if the model instance //@@ is deployed on CPU. //@@ //@@ For ONNX Runtime backend, possible value is "openvino" as name, //@@ and no parameters are required. //@@ repeated Accelerator cpu_execution_accelerator = 2; } //@@ //@@ .. cpp:var:: message PinnedMemoryBuffer //@@ //@@ Specify whether to use a pinned memory buffer when transferring data //@@ between non-pinned system memory and GPU memory. Using a pinned //@@ memory buffer for system from/to GPU transfers will typically provide //@@ increased performance. For example, in the common use case where the //@@ request provides inputs and delivers outputs via non-pinned system //@@ memory, if the model instance accepts GPU IOs, the inputs will be //@@ processed by two copies: from non-pinned system memory to pinned //@@ memory, and from pinned memory to GPU memory. Similarly, pinned //@@ memory will be used for delivering the outputs. //@@ message PinnedMemoryBuffer { //@@ ..
cpp:var:: bool enable //@@ //@@ Use pinned memory buffer. Default is true. //@@ bool enable = 1; } //@@ .. cpp:var:: Graph graph //@@ //@@ The graph optimization setting for the model. Optional. //@@ Graph graph = 1; //@@ .. cpp:var:: ModelPriority priority //@@ //@@ The priority setting for the model. Optional. //@@ ModelPriority priority = 2; //@@ .. cpp:var:: Cuda cuda //@@ //@@ CUDA-specific optimization settings. Optional. //@@ Cuda cuda = 3; //@@ .. cpp:var:: ExecutionAccelerators execution_accelerators //@@ //@@ The accelerators used for the model. Optional. //@@ ExecutionAccelerators execution_accelerators = 4; //@@ .. cpp:var:: PinnedMemoryBuffer input_pinned_memory //@@ //@@ Use pinned memory buffer when the data transfer for inputs //@@ is between GPU memory and non-pinned system memory. //@@ Default is true. //@@ PinnedMemoryBuffer input_pinned_memory = 5; //@@ .. cpp:var:: PinnedMemoryBuffer output_pinned_memory //@@ //@@ Use pinned memory buffer when the data transfer for outputs //@@ is between GPU memory and non-pinned system memory. //@@ Default is true. //@@ PinnedMemoryBuffer output_pinned_memory = 6; //@@ .. cpp:var:: uint32 gather_kernel_buffer_threshold //@@ //@@ The backend may use a gather kernel to gather input data if the //@@ device has direct access to the source buffer and the destination //@@ buffer. In such case, the gather kernel will be used only if the //@@ number of buffers to be gathered is greater or equal to //@@ the specifed value. If 0, the gather kernel will be disabled. //@@ Default value is 0. //@@ Currently only recognized by TensorRT backend. //@@ uint32 gather_kernel_buffer_threshold = 7; //@@ .. cpp:var:: bool eager_batching //@@ //@@ Start preparing the next batch before the model instance is ready //@@ for the next inference. This option can be used to overlap the //@@ batch preparation with model execution, with the trade-off that //@@ the next batch might be smaller than what it could have been. //@@ Default value is false. //@@ Currently only recognized by TensorRT backend. //@@ bool eager_batching = 8; } //@@ //@@.. cpp:var:: message ModelQueuePolicy //@@ //@@ Queue policy for inference requests. //@@ message ModelQueuePolicy { //@@ //@@ .. cpp:enum:: TimeoutAction //@@ //@@ The action applied to timed-out requests. //@@ enum TimeoutAction { //@@ .. cpp:enumerator:: Action::REJECT = 0 //@@ //@@ Reject the request and return error message accordingly. //@@ REJECT = 0; //@@ .. cpp:enumerator:: Action::DELAY = 1 //@@ //@@ Delay the request until all other requests at the same //@@ (or higher) priority levels that have not reached their timeouts //@@ are processed. A delayed request will eventually be processed, //@@ but may be delayed indefinitely due to newly arriving requests. //@@ DELAY = 1; } //@@ //@@ .. cpp:var:: TimeoutAction timeout_action //@@ //@@ The action applied to timed-out request. //@@ The default action is REJECT. //@@ TimeoutAction timeout_action = 1; //@@ //@@ .. cpp:var:: uint64 default_timeout_microseconds //@@ //@@ The default timeout for every request, in microseconds. //@@ The default value is 0 which indicates that no timeout is set. //@@ uint64 default_timeout_microseconds = 2; //@@ //@@ .. cpp:var:: bool allow_timeout_override //@@ //@@ Whether individual request can override the default timeout value. //@@ When true, individual requests can set a timeout that is less than //@@ the default timeout value but may not increase the timeout. //@@ The default value is false. 
//@@ bool allow_timeout_override = 3; //@@ //@@ .. cpp:var:: uint32 max_queue_size //@@ //@@ The maximum queue size for holding requests. A request will be //@@ rejected immediately if it can't be enqueued because the queue is //@@ full. The default value is 0 which indicates that no maximum //@@ queue size is enforced. //@@ uint32 max_queue_size = 4; } //@@ //@@.. cpp:var:: message ModelDynamicBatching //@@ //@@ Dynamic batching configuration. These settings control how dynamic //@@ batching operates for the model. //@@ message ModelDynamicBatching { //@@ .. cpp:var:: int32 preferred_batch_size (repeated) //@@ //@@ Preferred batch sizes for dynamic batching. If a batch of one of //@@ these sizes can be formed it will be executed immediately. If //@@ not specified a preferred batch size will be chosen automatically //@@ based on model and GPU characteristics. //@@ repeated int32 preferred_batch_size = 1; //@@ .. cpp:var:: uint64 max_queue_delay_microseconds //@@ //@@ The maximum time, in microseconds, a request will be delayed in //@@ the scheduling queue to wait for additional requests for //@@ batching. Default is 0. //@@ uint64 max_queue_delay_microseconds = 2; //@@ .. cpp:var:: bool preserve_ordering //@@ //@@ Should the dynamic batcher preserve the ordering of responses to //@@ match the order of requests received by the scheduler. Default is //@@ false. If true, the responses will be returned in the same order as //@@ the order of requests sent to the scheduler. If false, the responses //@@ may be returned in arbitrary order. This option is specifically //@@ needed when a sequence of related inference requests (i.e. inference //@@ requests with the same correlation ID) are sent to the dynamic //@@ batcher to ensure that the sequence responses are in the correct //@@ order. //@@ bool preserve_ordering = 3; //@@ .. cpp:var:: uint32 priority_levels //@@ //@@ The number of priority levels to be enabled for the model, //@@ the priority level starts from 1 and 1 is the highest priority. //@@ Requests are handled in priority order with all priority 1 requests //@@ processed before priority 2, all priority 2 requests processed before //@@ priority 3, etc. Requests with the same priority level will be //@@ handled in the order that they are received. //@@ uint32 priority_levels = 4; //@@ .. cpp:var:: uint32 default_priority_level //@@ //@@ The priority level used for requests that don't specify their //@@ priority. The value must be in the range [ 1, 'priority_levels' ]. //@@ uint32 default_priority_level = 5; //@@ .. cpp:var:: ModelQueuePolicy default_queue_policy //@@ //@@ The default queue policy used for requests that don't require //@@ priority handling and requests that specify priority levels where //@@ there is no specific policy given. If not specified, a policy with //@@ default field values will be used. //@@ ModelQueuePolicy default_queue_policy = 6; //@@ .. cpp:var:: map priority_queue_policy //@@ //@@ Specify the queue policy for the priority level. The default queue //@@ policy will be used if a priority level doesn't specify a queue //@@ policy. //@@ map priority_queue_policy = 7; } //@@ //@@.. cpp:var:: message ModelSequenceBatching //@@ //@@ Sequence batching configuration. These settings control how sequence //@@ batching operates for the model. //@@ message ModelSequenceBatching { //@@ .. cpp:var:: message Control //@@ //@@ A control is a signal that the sequence batcher uses to //@@ communicate with a backend. //@@ message Control { //@@ //@@ .. 
cpp:enum:: Kind //@@ //@@ The kind of the control. //@@ enum Kind { //@@ .. cpp:enumerator:: Kind::CONTROL_SEQUENCE_START = 0 //@@ //@@ A new sequence is/is-not starting. If true a sequence is //@@ starting, if false a sequence is continuing. Must //@@ specify either int32_false_true, fp32_false_true or //@@ bool_false_true for this control. This control is optional. //@@ CONTROL_SEQUENCE_START = 0; //@@ .. cpp:enumerator:: Kind::CONTROL_SEQUENCE_READY = 1 //@@ //@@ A sequence is/is-not ready for inference. If true the //@@ input tensor data is valid and should be used. If false //@@ the input tensor data is invalid and inferencing should //@@ be "skipped". Must specify either int32_false_true, //@@ fp32_false_true or bool_false_true for this control. This //@@ control is optional. //@@ CONTROL_SEQUENCE_READY = 1; //@@ .. cpp:enumerator:: Kind::CONTROL_SEQUENCE_END = 2 //@@ //@@ A sequence is/is-not ending. If true a sequence is //@@ ending, if false a sequence is continuing. Must specify //@@ either int32_false_true, fp32_false_true or bool_false_true //@@ for this control. This control is optional. //@@ CONTROL_SEQUENCE_END = 2; //@@ .. cpp:enumerator:: Kind::CONTROL_SEQUENCE_CORRID = 3 //@@ //@@ The correlation ID of the sequence. The correlation ID //@@ is an uint64_t value that is communicated in whole or //@@ in part by the tensor. The tensor's datatype must be //@@ specified by data_type and must be TYPE_UINT64, TYPE_INT64, //@@ TYPE_UINT32 or TYPE_INT32. If a 32-bit datatype is specified //@@ the correlation ID will be truncated to the low-order 32 //@@ bits. This control is optional. //@@ CONTROL_SEQUENCE_CORRID = 3; } //@@ .. cpp:var:: Kind kind //@@ //@@ The kind of this control. //@@ Kind kind = 1; //@@ .. cpp:var:: int32 int32_false_true (repeated) //@@ //@@ The control's true and false setting is indicated by setting //@@ a value in an int32 tensor. The tensor must be a //@@ 1-dimensional tensor with size equal to the batch size of //@@ the request. 'int32_false_true' must have two entries: the //@@ first the false value and the second the true value. //@@ repeated int32 int32_false_true = 2; //@@ .. cpp:var:: float fp32_false_true (repeated) //@@ //@@ The control's true and false setting is indicated by setting //@@ a value in a fp32 tensor. The tensor must be a //@@ 1-dimensional tensor with size equal to the batch size of //@@ the request. 'fp32_false_true' must have two entries: the //@@ first the false value and the second the true value. //@@ repeated float fp32_false_true = 3; //@@ .. cpp:var:: bool bool_false_true (repeated) //@@ //@@ The control's true and false setting is indicated by setting //@@ a value in a bool tensor. The tensor must be a //@@ 1-dimensional tensor with size equal to the batch size of //@@ the request. 'bool_false_true' must have two entries: the //@@ first the false value and the second the true value. //@@ repeated bool bool_false_true = 5; //@@ .. cpp:var:: DataType data_type //@@ //@@ The control's datatype. //@@ DataType data_type = 4; } //@@ .. cpp:var:: message ControlInput //@@ //@@ The sequence control values to communicate by a model input. //@@ message ControlInput { //@@ .. cpp:var:: string name //@@ //@@ The name of the model input. //@@ string name = 1; //@@ .. cpp:var:: Control control (repeated) //@@ //@@ The control value(s) that should be communicated to the //@@ model using this model input. //@@ repeated Control control = 2; } //@@ //@@ .. 
cpp:var:: message InitialState //@@ //@@ Settings used to initialize data for implicit state. //@@ message InitialState { //@@ .. cpp:var:: DataType data_type //@@ //@@ The data-type of the state. //@@ DataType data_type = 1; //@@ .. cpp:var:: int64 dims (repeated) //@@ //@@ The shape of the state tensor, not including the batch //@@ dimension. //@@ repeated int64 dims = 2; //@@ .. cpp:var:: oneof state_data //@@ //@@ Specify how the initial state data is generated. //@@ oneof state_data { //@@ //@@ .. cpp:var:: bool zero_data //@@ //@@ The identifier for using zeros as initial state data. //@@ Note that the value of 'zero_data' will not be checked, //@@ instead, zero data will be used as long as the field is set. //@@ bool zero_data = 3; //@@ .. cpp:var:: string data_file //@@ //@@ The file whose content will be used as the initial data for //@@ the state in row-major order. The file must be provided in //@@ sub-directory 'initial_state' under the model directory. //@@ string data_file = 4; } //@@ .. cpp:var:: string name //@@ //@@ The name of the state initialization. //@@ string name = 5; } //@@ .. cpp:var:: message State //@@ //@@ An input / output pair of tensors that carry state for the sequence. //@@ message State { //@@ .. cpp:var:: string input_name //@@ //@@ The name of the model state input. //@@ string input_name = 1; //@@ .. cpp:var:: string output_name //@@ //@@ The name of the model state output. //@@ string output_name = 2; //@@ .. cpp:var:: DataType data_type //@@ //@@ The data-type of the state. //@@ DataType data_type = 3; //@@ .. cpp:var:: int64 dim (repeated) //@@ //@@ The dimension. //@@ repeated int64 dims = 4; //@@ .. cpp:var:: InitialState initial_state (repeated) //@@ //@@ The optional field to specify the initial state for the model. //@@ repeated InitialState initial_state = 5; } //@@ .. cpp:var:: message StrategyDirect //@@ //@@ The sequence batcher uses a specific, unique batch //@@ slot for each sequence. All inference requests in a //@@ sequence are directed to the same batch slot in the same //@@ model instance over the lifetime of the sequence. This //@@ is the default strategy. //@@ message StrategyDirect { //@@ .. cpp:var:: uint64 max_queue_delay_microseconds //@@ //@@ The maximum time, in microseconds, a candidate request //@@ will be delayed in the sequence batch scheduling queue to //@@ wait for additional requests for batching. Default is 0. //@@ uint64 max_queue_delay_microseconds = 1; //@@ .. cpp:var:: float minimum_slot_utilization //@@ //@@ The minimum slot utilization that must be satisfied to //@@ execute the batch before 'max_queue_delay_microseconds' expires. //@@ For example, a value of 0.5 indicates that the batch should be //@@ executed as soon as 50% or more of the slots are ready even if //@@ the 'max_queue_delay_microseconds' timeout has not expired. //@@ The default is 0.0, indicating that a batch will be executed //@@ before 'max_queue_delay_microseconds' timeout expires if at least //@@ one batch slot is ready. 'max_queue_delay_microseconds' will be //@@ ignored unless minimum_slot_utilization is set to a non-zero //@@ value. //@@ float minimum_slot_utilization = 2; } //@@ .. cpp:var:: message StrategyOldest //@@ //@@ The sequence batcher maintains up to 'max_candidate_sequences' //@@ candidate sequences. 'max_candidate_sequences' can be greater //@@ than the model's 'max_batch_size'. For inferencing the batcher //@@ chooses from the candidate sequences up to 'max_batch_size' //@@ inference requests. 
Requests are chosen in an oldest-first //@@ manner across all candidate sequences. A given sequence is //@@ not guaranteed to be assigned to the same batch slot for //@@ all inference requests of that sequence. //@@ message StrategyOldest { //@@ .. cpp:var:: int32 max_candidate_sequences //@@ //@@ Maximum number of candidate sequences that the batcher //@@ maintains. Excess seqences are kept in an ordered backlog //@@ and become candidates when existing candidate sequences //@@ complete. //@@ int32 max_candidate_sequences = 1; //@@ .. cpp:var:: int32 preferred_batch_size (repeated) //@@ //@@ Preferred batch sizes for dynamic batching of candidate //@@ sequences. If a batch of one of these sizes can be formed //@@ it will be executed immediately. If not specified a //@@ preferred batch size will be chosen automatically //@@ based on model and GPU characteristics. //@@ repeated int32 preferred_batch_size = 2; //@@ .. cpp:var:: uint64 max_queue_delay_microseconds //@@ //@@ The maximum time, in microseconds, a candidate request //@@ will be delayed in the dynamic batch scheduling queue to //@@ wait for additional requests for batching. Default is 0. //@@ uint64 max_queue_delay_microseconds = 3; } //@@ .. cpp:var:: oneof strategy_choice //@@ //@@ The strategy used by the sequence batcher. Default strategy //@@ is 'direct'. //@@ oneof strategy_choice { //@@ .. cpp:var:: StrategyDirect direct //@@ //@@ StrategyDirect scheduling strategy. //@@ StrategyDirect direct = 3; //@@ .. cpp:var:: StrategyOldest oldest //@@ //@@ StrategyOldest scheduling strategy. //@@ StrategyOldest oldest = 4; } //@@ .. cpp:var:: uint64 max_sequence_idle_microseconds //@@ //@@ The maximum time, in microseconds, that a sequence is allowed to //@@ be idle before it is aborted. The inference server considers a //@@ sequence idle when it does not have any inference request queued //@@ for the sequence. If this limit is exceeded, the inference server //@@ will free the sequence slot allocated by the sequence and make it //@@ available for another sequence. If not specified (or specified as //@@ zero) a default value of 1000000 (1 second) is used. //@@ uint64 max_sequence_idle_microseconds = 1; //@@ .. cpp:var:: ControlInput control_input (repeated) //@@ //@@ The model input(s) that the server should use to communicate //@@ sequence start, stop, ready and similar control values to the //@@ model. //@@ repeated ControlInput control_input = 2; //@@ .. cpp:var:: State state (repeated) //@@ //@@ The optional state that can be stored in Triton for performing //@@ inference requests on a sequence. Each sequence holds an implicit //@@ state local to itself. The output state tensor provided by the //@@ model in 'output_name' field of the current inference request will //@@ be transferred as an input tensor named 'input_name' in the next //@@ request of the same sequence. The input state of the first request //@@ in the sequence contains garbage data. //@@ repeated State state = 5; } //@@ //@@.. cpp:var:: message ModelEnsembling //@@ //@@ Model ensembling configuration. These settings specify the models that //@@ compose the ensemble and how data flows between the models. //@@ message ModelEnsembling { //@@ .. cpp:var:: message Step //@@ //@@ Each step specifies a model included in the ensemble, //@@ maps ensemble tensor names to the model input tensors, //@@ and maps model output tensors to ensemble tensor names //@@ message Step { //@@ .. 
cpp:var:: string model_name //@@ //@@ The name of the model to execute for this step of the ensemble. //@@ string model_name = 1; //@@ .. cpp:var:: int64 model_version //@@ //@@ The version of the model to use for inference. If -1 //@@ the latest/most-recent version of the model is used. //@@ int64 model_version = 2; //@@ .. cpp:var:: map input_map //@@ //@@ Map from name of an input tensor on this step's model to ensemble //@@ tensor name. The ensemble tensor must have the same data type and //@@ shape as the model input. Each model input must be assigned to //@@ one ensemble tensor, but the same ensemble tensor can be assigned //@@ to multiple model inputs. //@@ map input_map = 3; //@@ .. cpp:var:: map output_map //@@ //@@ Map from name of an output tensor on this step's model to ensemble //@@ tensor name. The data type and shape of the ensemble tensor will //@@ be inferred from the model output. It is optional to assign all //@@ model outputs to ensemble tensors. One ensemble tensor name //@@ can appear in an output map only once. //@@ map output_map = 4; } //@@ .. cpp:var:: Step step (repeated) //@@ //@@ The models and the input / output mappings used within the ensemble. //@@ repeated Step step = 1; } //@@ //@@.. cpp:var:: message ModelParameter //@@ //@@ A model parameter. //@@ message ModelParameter { //@@ .. cpp:var:: string string_value //@@ //@@ The string value of the parameter. //@@ string string_value = 1; } //@@ //@@.. cpp:var:: message ModelWarmup //@@ //@@ Settings used to construct the request sample for model warmup. //@@ message ModelWarmup { //@@ //@@ .. cpp:var:: message Input //@@ //@@ Meta data associated with an input. //@@ message Input { //@@ .. cpp:var:: DataType data_type //@@ //@@ The data-type of the input. //@@ DataType data_type = 1; //@@ .. cpp:var:: int64 dims (repeated) //@@ //@@ The shape of the input tensor, not including the batch dimension. //@@ repeated int64 dims = 2; //@@ .. cpp:var:: oneof input_data_type //@@ //@@ Specify how the input data is generated. If the input has STRING //@@ data type and 'random_data' is set, the data generation will fall //@@ back to 'zero_data'. //@@ oneof input_data_type { //@@ //@@ .. cpp:var:: bool zero_data //@@ //@@ The identifier for using zeros as input data. Note that the //@@ value of 'zero_data' will not be checked, instead, zero data //@@ will be used as long as the field is set. //@@ bool zero_data = 3; //@@ //@@ .. cpp:var:: bool random_data //@@ //@@ The identifier for using random data as input data. Note that //@@ the value of 'random_data' will not be checked, instead, //@@ random data will be used as long as the field is set. //@@ bool random_data = 4; //@@ .. cpp:var:: string input_data_file //@@ //@@ The file whose content will be used as raw input data in //@@ row-major order. The file must be provided in a sub-directory //@@ 'warmup' under the model directory. The file contents should be //@@ in binary format. For TYPE_STRING data-type, an element is //@@ represented by a 4-byte unsigned integer giving the length //@@ followed by the actual bytes. //@@ string input_data_file = 5; } } //@@ .. cpp:var:: string name //@@ //@@ The name of the request sample. //@@ string name = 1; //@@ .. cpp:var:: uint32 batch_size //@@ //@@ The batch size of the inference request. This must be >= 1. For //@@ models that don't support batching, batch_size must be 1. If //@@ batch_size > 1, the 'inputs' specified below will be duplicated to //@@ match the batch size requested. //@@ uint32 batch_size = 2; //@@ .. 
cpp:var:: map inputs //@@ //@@ The warmup meta data associated with every model input, including //@@ control tensors. //@@ map inputs = 3; //@@ .. cpp:var:: uint32 count //@@ //@@ The number of iterations that this warmup sample will be executed. //@@ For example, if this field is set to 2, 2 model executions using this //@@ sample will be scheduled for warmup. Default value is 0 which //@@ indicates that this sample will be used only once. //@@ Note that for sequence model, 'count' may not work well //@@ because the model often expect a valid sequence of requests which //@@ should be represented by a series of warmup samples. 'count > 1' //@@ essentially "resends" one of the sample, which may invalidate the //@@ sequence and result in unexpected warmup failure. //@@ uint32 count = 4; } //@@ //@@ .. cpp:var:: message ModelOperations //@@ //@@ The metadata of libraries providing custom operations for this model. //@@ message ModelOperations { //@@ .. cpp:var:: string op_library_filename (repeated) //@@ //@@ Optional paths of the libraries providing custom operations for //@@ this model. Valid only for ONNX models. //@@ repeated string op_library_filename = 1; } //@@ //@@ .. cpp:var:: message ModelTransactionPolicy //@@ //@@ The specification that describes the nature of transactions //@@ to be expected from the model. //@@ message ModelTransactionPolicy { //@@ .. cpp:var:: bool decoupled //@@ //@@ Indicates whether responses generated by the model are decoupled with //@@ the requests issued to it, which means the number of responses //@@ generated by model may differ from number of requests issued, and //@@ that the responses may be out of order relative to the order of //@@ requests. The default is false, which means the model will generate //@@ exactly one response for each request. //@@ bool decoupled = 1; } //@@ //@@.. cpp:var:: message ModelRepositoryAgents //@@ //@@ The repository agents for the model. //@@ message ModelRepositoryAgents { //@@ //@@ .. cpp:var:: message Agent //@@ //@@ A repository agent that should be invoked for the specified //@@ repository actions for this model. //@@ message Agent { //@@ .. cpp:var:: string name //@@ //@@ The name of the agent. //@@ string name = 1; //@@ .. cpp:var:: map parameters //@@ //@@ The parameters for the agent. //@@ map parameters = 2; } //@@ //@@ .. cpp:var:: Agent agents (repeated) //@@ //@@ The ordered list of agents for the model. These agents will be //@@ invoked in order to respond to repository actions occuring for the //@@ model. //@@ repeated Agent agents = 1; } //@@ //@@.. cpp:var:: message ModelResponseCache //@@ //@@ The response cache setting for the model. //@@ message ModelResponseCache { //@@ //@@ .. cpp::var:: bool enable //@@ //@@ Whether or not to use response cache for the model. If True, the //@@ responses from the model are cached and when identical request //@@ is encountered, instead of going through the model execution, //@@ the response from the cache is utilized. By default, response //@@ cache is disabled for the models. //@@ bool enable = 1; } //@@ //@@.. cpp:var:: message ModelConfig //@@ //@@ A model configuration. //@@ message ModelConfig { //@@ .. cpp:var:: string name //@@ //@@ The name of the model. //@@ string name = 1; //@@ .. cpp:var:: string platform //@@ //@@ The framework for the model. Possible values are //@@ "tensorrt_plan", "tensorflow_graphdef", //@@ "tensorflow_savedmodel", "onnxruntime_onnx", //@@ "pytorch_libtorch". //@@ string platform = 2; //@@ .. 
cpp:var:: string backend //@@ //@@ The backend used by the model. //@@ string backend = 17; //@@ .. cpp:var:: ModelVersionPolicy version_policy //@@ //@@ Policy indicating which version(s) of the model will be served. //@@ ModelVersionPolicy version_policy = 3; //@@ .. cpp:var:: int32 max_batch_size //@@ //@@ Maximum batch size allowed for inference. This can only decrease //@@ what is allowed by the model itself. A max_batch_size value of 0 //@@ indicates that batching is not allowed for the model and the //@@ dimension/shape of the input and output tensors must exactly //@@ match what is specified in the input and output configuration. A //@@ max_batch_size value > 0 indicates that batching is allowed and //@@ so the model expects the input tensors to have an additional //@@ initial dimension for the batching that is not specified in the //@@ input (for example, if the model supports batched inputs of //@@ 2-dimensional tensors then the model configuration will specify //@@ the input shape as [ X, Y ] but the model will expect the actual //@@ input tensors to have shape [ N, X, Y ]). For max_batch_size > 0 //@@ returned outputs will also have an additional initial dimension //@@ for the batch. //@@ int32 max_batch_size = 4; //@@ .. cpp:var:: ModelInput input (repeated) //@@ //@@ The inputs required by the model. //@@ repeated ModelInput input = 5; //@@ .. cpp:var:: ModelOutput output (repeated) //@@ //@@ The outputs produced by the model. //@@ repeated ModelOutput output = 6; //@@ .. cpp:var:: BatchInput batch_input (repeated) //@@ //@@ The model input(s) that the server should use to communicate //@@ batch related values to the model. //@@ repeated BatchInput batch_input = 20; //@@ .. cpp:var:: BatchOutput batch_output (repeated) //@@ //@@ The outputs produced by the model that require special handling //@@ by the model backend. //@@ repeated BatchOutput batch_output = 21; //@@ .. cpp:var:: ModelOptimizationPolicy optimization //@@ //@@ Optimization configuration for the model. If not specified //@@ then the default optimization policy is used. //@@ ModelOptimizationPolicy optimization = 12; //@@ .. cpp:var:: oneof scheduling_choice //@@ //@@ The scheduling policy for the model. If not specified the //@@ default scheduling policy is used for the model. The default //@@ policy is to execute each inference request independently. //@@ oneof scheduling_choice { //@@ .. cpp:var:: ModelDynamicBatching dynamic_batching //@@ //@@ If specified, enables the dynamic-batching scheduling //@@ policy. With dynamic-batching the scheduler may group //@@ together independent requests into a single batch to //@@ improve inference throughput. //@@ ModelDynamicBatching dynamic_batching = 11; //@@ .. cpp:var:: ModelSequenceBatching sequence_batching //@@ //@@ If specified, enables the sequence-batching scheduling //@@ policy. With sequence-batching, inference requests //@@ with the same correlation ID are routed to the same //@@ model instance. Multiple sequences of inference requests //@@ may be batched together into a single batch to //@@ improve inference throughput. //@@ ModelSequenceBatching sequence_batching = 13;
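//
// Illustrative sketch only (the batch sizes and delay are made-up values):
// a config.pbtxt fragment that selects the dynamic-batching scheduling
// policy with preferred batch sizes and a short queueing delay:
//
//   dynamic_batching {
//     preferred_batch_size: [ 4, 8 ]
//     max_queue_delay_microseconds: 100
//   }
//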
//@@ .. cpp:var:: ModelEnsembling ensemble_scheduling //@@ //@@ If specified, enables the model-ensembling scheduling //@@ policy. With model-ensembling, inference requests //@@ will be processed according to the specification, such as an //@@ execution sequence of models. The input specified in this model //@@ config will be the input for the ensemble, and the output //@@ specified will be the output of the ensemble. //@@ ModelEnsembling ensemble_scheduling = 15; } //@@ .. cpp:var:: ModelInstanceGroup instance_group (repeated) //@@ //@@ Instances of this model. If not specified, one instance //@@ of the model will be instantiated on each available GPU. //@@ repeated ModelInstanceGroup instance_group = 7; //@@ .. cpp:var:: string default_model_filename //@@ //@@ Optional filename of the model file to use if a //@@ compute-capability specific model is not specified in //@@ :cpp:var:`cc_model_filenames`. If not specified the default name //@@ is 'model.graphdef', 'model.savedmodel', 'model.plan' or //@@ 'model.pt' depending on the model type. //@@ string default_model_filename = 8; //@@ .. cpp:var:: map<string, string> cc_model_filenames //@@ //@@ Optional map from CUDA compute capability to the filename of //@@ the model that supports that compute capability. The filename //@@ refers to a file within the model version directory. //@@ map<string, string> cc_model_filenames = 9; //@@ .. cpp:var:: map<string, string> metric_tags //@@ //@@ Optional metric tags. User-specific key-value pairs for metrics //@@ reported for this model. These tags are applied to the metrics //@@ reported on the HTTP metrics port. //@@ map<string, string> metric_tags = 10; //@@ .. cpp:var:: map<string, ModelParameter> parameters //@@ //@@ Optional model parameters. User-specified parameter values. //@@ map<string, ModelParameter> parameters = 14; //@@ .. cpp:var:: ModelWarmup model_warmup (repeated) //@@ //@@ Warmup setting of this model. If specified, all instances //@@ will be run with the request samples in sequence before //@@ serving the model. //@@ This field can only be specified if the model is not an ensemble //@@ model. //@@ repeated ModelWarmup model_warmup = 16; //@@ .. cpp:var:: ModelOperations model_operations //@@ //@@ Optional metadata of the libraries providing custom operations for //@@ this model. //@@ ModelOperations model_operations = 18; //@@ .. cpp:var:: ModelTransactionPolicy model_transaction_policy //@@ //@@ Optional specification that describes the nature of transactions //@@ to be expected from the model. //@@ ModelTransactionPolicy model_transaction_policy = 19; //@@ .. cpp:var:: ModelRepositoryAgents model_repository_agents //@@ //@@ Optional specification of the agent(s) that should be invoked //@@ when repository actions are performed for this model. //@@ ModelRepositoryAgents model_repository_agents = 23; //@@ .. cpp:var:: ModelResponseCache response_cache //@@ //@@ Optional setting for utilizing the response cache for this //@@ model. //@@ ModelResponseCache response_cache = 24; }
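//
// A minimal end-to-end sketch, for illustration only (the model name,
// backend, tensor names, shapes, and file names are made-up placeholders),
// showing how several of the messages above combine in a single
// config.pbtxt:
//
//   name: "example_model"
//   backend: "onnxruntime"
//   max_batch_size: 8
//   input [
//     { name: "INPUT0", data_type: TYPE_FP32, dims: [ 3, 224, 224 ] }
//   ]
//   output [
//     { name: "OUTPUT0", data_type: TYPE_FP32, dims: [ 1000 ], label_filename: "labels.txt" }
//   ]
//   version_policy { latest { num_versions: 1 } }
//   instance_group [
//     { count: 1, kind: KIND_GPU, gpus: [ 0 ] }
//   ]
//   dynamic_batching {
//     max_queue_delay_microseconds: 100
//   }
//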