// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//  * Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//  * Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//  * Neither the name of NVIDIA CORPORATION nor the names of its
//    contributors may be used to endorse or promote products derived
//    from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

syntax = "proto3";

package inference;

//@@.. cpp:namespace:: inference

import "model_config.proto";

//@@
//@@.. cpp:var:: service InferenceService
//@@
//@@   Inference Server GRPC endpoints.
//@@
service GRPCInferenceService
{
  //@@  .. cpp:var:: rpc ServerLive(ServerLiveRequest) returns
  //@@       (ServerLiveResponse)
  //@@
  //@@     Check liveness of the inference server.
  //@@
  rpc ServerLive(ServerLiveRequest) returns (ServerLiveResponse) {}

  //@@  .. cpp:var:: rpc ServerReady(ServerReadyRequest) returns
  //@@       (ServerReadyResponse)
  //@@
  //@@     Check readiness of the inference server.
  //@@
  rpc ServerReady(ServerReadyRequest) returns (ServerReadyResponse) {}

  //@@  .. cpp:var:: rpc ModelReady(ModelReadyRequest) returns
  //@@       (ModelReadyResponse)
  //@@
  //@@     Check readiness of a model in the inference server.
  //@@
  rpc ModelReady(ModelReadyRequest) returns (ModelReadyResponse) {}

  //@@  .. cpp:var:: rpc ServerMetadata(ServerMetadataRequest) returns
  //@@       (ServerMetadataResponse)
  //@@
  //@@     Get server metadata.
  //@@
  rpc ServerMetadata(ServerMetadataRequest) returns (ServerMetadataResponse) {}

  //@@  .. cpp:var:: rpc ModelMetadata(ModelMetadataRequest) returns
  //@@       (ModelMetadataResponse)
  //@@
  //@@     Get model metadata.
  //@@
  rpc ModelMetadata(ModelMetadataRequest) returns (ModelMetadataResponse) {}

  //@@  .. cpp:var:: rpc ModelInfer(ModelInferRequest) returns
  //@@       (ModelInferResponse)
  //@@
  //@@     Perform inference using a specific model.
  //@@
  rpc ModelInfer(ModelInferRequest) returns (ModelInferResponse) {}

  //@@  .. cpp:var:: rpc ModelStreamInfer(stream ModelInferRequest) returns
  //@@       (stream ModelStreamInferResponse)
  //@@
  //@@     Perform streaming inference.
  //@@
  rpc ModelStreamInfer(stream ModelInferRequest)
      returns (stream ModelStreamInferResponse)
  {
  }

  //@@  .. cpp:var:: rpc ModelConfig(ModelConfigRequest) returns
  //@@       (ModelConfigResponse)
  //@@
  //@@     Get model configuration.
  //@@
  rpc ModelConfig(ModelConfigRequest) returns (ModelConfigResponse) {}
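  // A minimal client sketch, illustrative only and not part of the protocol.
  // It assumes this file is saved as grpc_service.proto and compiled with
  // grpcio-tools, which would generate grpc_service_pb2 and
  // grpc_service_pb2_grpc; the endpoint localhost:8001 is an assumed address:
  //
  //   import grpc
  //   import grpc_service_pb2
  //   import grpc_service_pb2_grpc
  //
  //   channel = grpc.insecure_channel("localhost:8001")
  //   stub = grpc_service_pb2_grpc.GRPCInferenceServiceStub(channel)
  //   meta = stub.ServerMetadata(grpc_service_pb2.ServerMetadataRequest())
  //   print(meta.name, meta.version, list(meta.extensions))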
  //@@  .. cpp:var:: rpc ModelStatistics(
  //@@                     ModelStatisticsRequest)
  //@@       returns (ModelStatisticsResponse)
  //@@
  //@@     Get the cumulative inference statistics for a model.
  //@@
  rpc ModelStatistics(ModelStatisticsRequest) returns (ModelStatisticsResponse)
  {
  }

  //@@  .. cpp:var:: rpc RepositoryIndex(RepositoryIndexRequest) returns
  //@@       (RepositoryIndexResponse)
  //@@
  //@@     Get the index of model repository contents.
  //@@
  rpc RepositoryIndex(RepositoryIndexRequest) returns (RepositoryIndexResponse)
  {
  }

  //@@  .. cpp:var:: rpc RepositoryModelLoad(RepositoryModelLoadRequest) returns
  //@@       (RepositoryModelLoadResponse)
  //@@
  //@@     Load or reload a model from a repository.
  //@@
  rpc RepositoryModelLoad(RepositoryModelLoadRequest)
      returns (RepositoryModelLoadResponse)
  {
  }

  //@@  .. cpp:var:: rpc RepositoryModelUnload(RepositoryModelUnloadRequest)
  //@@       returns (RepositoryModelUnloadResponse)
  //@@
  //@@     Unload a model.
  //@@
  rpc RepositoryModelUnload(RepositoryModelUnloadRequest)
      returns (RepositoryModelUnloadResponse)
  {
  }

  //@@  .. cpp:var:: rpc SystemSharedMemoryStatus(
  //@@                     SystemSharedMemoryStatusRequest)
  //@@       returns (SystemSharedMemoryStatusResponse)
  //@@
  //@@     Get the status of all registered system-shared-memory regions.
  //@@
  rpc SystemSharedMemoryStatus(SystemSharedMemoryStatusRequest)
      returns (SystemSharedMemoryStatusResponse)
  {
  }

  //@@  .. cpp:var:: rpc SystemSharedMemoryRegister(
  //@@                     SystemSharedMemoryRegisterRequest)
  //@@       returns (SystemSharedMemoryRegisterResponse)
  //@@
  //@@     Register a system-shared-memory region.
  //@@
  rpc SystemSharedMemoryRegister(SystemSharedMemoryRegisterRequest)
      returns (SystemSharedMemoryRegisterResponse)
  {
  }

  //@@  .. cpp:var:: rpc SystemSharedMemoryUnregister(
  //@@                     SystemSharedMemoryUnregisterRequest)
  //@@       returns (SystemSharedMemoryUnregisterResponse)
  //@@
  //@@     Unregister a system-shared-memory region.
  //@@
  rpc SystemSharedMemoryUnregister(SystemSharedMemoryUnregisterRequest)
      returns (SystemSharedMemoryUnregisterResponse)
  {
  }

  //@@  .. cpp:var:: rpc CudaSharedMemoryStatus(
  //@@                     CudaSharedMemoryStatusRequest)
  //@@       returns (CudaSharedMemoryStatusResponse)
  //@@
  //@@     Get the status of all registered CUDA-shared-memory regions.
  //@@
  rpc CudaSharedMemoryStatus(CudaSharedMemoryStatusRequest)
      returns (CudaSharedMemoryStatusResponse)
  {
  }

  //@@  .. cpp:var:: rpc CudaSharedMemoryRegister(
  //@@                     CudaSharedMemoryRegisterRequest)
  //@@       returns (CudaSharedMemoryRegisterResponse)
  //@@
  //@@     Register a CUDA-shared-memory region.
  //@@
  rpc CudaSharedMemoryRegister(CudaSharedMemoryRegisterRequest)
      returns (CudaSharedMemoryRegisterResponse)
  {
  }

  //@@  .. cpp:var:: rpc CudaSharedMemoryUnregister(
  //@@                     CudaSharedMemoryUnregisterRequest)
  //@@       returns (CudaSharedMemoryUnregisterResponse)
  //@@
  //@@     Unregister a CUDA-shared-memory region.
  //@@
  rpc CudaSharedMemoryUnregister(CudaSharedMemoryUnregisterRequest)
      returns (CudaSharedMemoryUnregisterResponse)
  {
  }

  //@@  .. cpp:var:: rpc TraceSetting(TraceSettingRequest)
  //@@       returns (TraceSettingResponse)
  //@@
  //@@     Update and get the trace setting of the Triton server.
  //@@
  rpc TraceSetting(TraceSettingRequest) returns (TraceSettingResponse) {}
}

//@@
//@@.. cpp:var:: message ServerLiveRequest
//@@
//@@   Request message for ServerLive.
//@@
message ServerLiveRequest {}

//@@
//@@.. cpp:var:: message ServerLiveResponse
//@@
//@@   Response message for ServerLive.
//@@
message ServerLiveResponse
{
  //@@
  //@@  .. cpp:var:: bool live
  //@@
  //@@     True if the inference server is live, false if not live.
  //@@
  bool live = 1;
}
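// A hedged sketch of a liveness poll built on the message above. The stub
// and generated modules are the same assumptions as in the earlier sketch;
// the retry count and delay are illustrative choices, not protocol behavior:
//
//   import time
//
//   def wait_until_live(stub, attempts=10, delay_s=1.0):
//       for _ in range(attempts):
//           try:
//               if stub.ServerLive(grpc_service_pb2.ServerLiveRequest()).live:
//                   return True
//           except grpc.RpcError:
//               pass  # server not reachable yet; retry
//           time.sleep(delay_s)
//       return False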
//@@
//@@.. cpp:var:: message ServerReadyRequest
//@@
//@@   Request message for ServerReady.
//@@
message ServerReadyRequest {}

//@@
//@@.. cpp:var:: message ServerReadyResponse
//@@
//@@   Response message for ServerReady.
//@@
message ServerReadyResponse
{
  //@@
  //@@  .. cpp:var:: bool ready
  //@@
  //@@     True if the inference server is ready, false if not ready.
  //@@
  bool ready = 1;
}

//@@
//@@.. cpp:var:: message ModelReadyRequest
//@@
//@@   Request message for ModelReady.
//@@
message ModelReadyRequest
{
  //@@
  //@@  .. cpp:var:: string name
  //@@
  //@@     The name of the model to check for readiness.
  //@@
  string name = 1;

  //@@  .. cpp:var:: string version
  //@@
  //@@     The version of the model to check for readiness. If not given the
  //@@     server will choose a version based on the model and internal policy.
  //@@
  string version = 2;
}

//@@
//@@.. cpp:var:: message ModelReadyResponse
//@@
//@@   Response message for ModelReady.
//@@
message ModelReadyResponse
{
  //@@
  //@@  .. cpp:var:: bool ready
  //@@
  //@@     True if the model is ready, false if not ready.
  //@@
  bool ready = 1;
}

//@@
//@@.. cpp:var:: message ServerMetadataRequest
//@@
//@@   Request message for ServerMetadata.
//@@
message ServerMetadataRequest {}

//@@
//@@.. cpp:var:: message ServerMetadataResponse
//@@
//@@   Response message for ServerMetadata.
//@@
message ServerMetadataResponse
{
  //@@
  //@@  .. cpp:var:: string name
  //@@
  //@@     The server name.
  //@@
  string name = 1;

  //@@
  //@@  .. cpp:var:: string version
  //@@
  //@@     The server version.
  //@@
  string version = 2;

  //@@
  //@@  .. cpp:var:: string extensions (repeated)
  //@@
  //@@     The extensions supported by the server.
  //@@
  repeated string extensions = 3;
}

//@@
//@@.. cpp:var:: message ModelMetadataRequest
//@@
//@@   Request message for ModelMetadata.
//@@
message ModelMetadataRequest
{
  //@@
  //@@  .. cpp:var:: string name
  //@@
  //@@     The name of the model.
  //@@
  string name = 1;

  //@@  .. cpp:var:: string version
  //@@
  //@@     The version of the model. If not given the server will choose
  //@@     a version based on the model and internal policy.
  //@@
  string version = 2;
}

//@@
//@@.. cpp:var:: message ModelMetadataResponse
//@@
//@@   Response message for ModelMetadata.
//@@
message ModelMetadataResponse
{
  //@@
  //@@  .. cpp:var:: message TensorMetadata
  //@@
  //@@     Metadata for a tensor.
  //@@
  message TensorMetadata
  {
    //@@
    //@@    .. cpp:var:: string name
    //@@
    //@@       The tensor name.
    //@@
    string name = 1;

    //@@
    //@@    .. cpp:var:: string datatype
    //@@
    //@@       The tensor data type.
    //@@
    string datatype = 2;

    //@@
    //@@    .. cpp:var:: int64 shape (repeated)
    //@@
    //@@       The tensor shape. A variable-size dimension is represented
    //@@       by a -1 value.
    //@@
    repeated int64 shape = 3;
  }

  //@@
  //@@  .. cpp:var:: string name
  //@@
  //@@     The model name.
  //@@
  string name = 1;

  //@@
  //@@  .. cpp:var:: string versions (repeated)
  //@@
  //@@     The versions of the model.
  //@@
  repeated string versions = 2;

  //@@
  //@@  .. cpp:var:: string platform
  //@@
  //@@     The model's platform.
  //@@
  string platform = 3;

  //@@
  //@@  .. cpp:var:: TensorMetadata inputs (repeated)
  //@@
  //@@     The model's inputs.
  //@@
  repeated TensorMetadata inputs = 4;

  //@@
  //@@  .. cpp:var:: TensorMetadata outputs (repeated)
  //@@
  //@@     The model's outputs.
  //@@
  repeated TensorMetadata outputs = 5;
}
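// Illustrative only: fetching model metadata and printing each declared
// input. The model name "simple" and version "1" are placeholders; modules
// and stub are the assumptions from the first sketch:
//
//   req = grpc_service_pb2.ModelMetadataRequest(name="simple", version="1")
//   meta = stub.ModelMetadata(req)
//   for t in meta.inputs:
//       print(t.name, t.datatype, list(t.shape))  # a -1 dim means variable size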
//@@
//@@.. cpp:var:: message InferParameter
//@@
//@@   An inference parameter value.
//@@
message InferParameter
{
  //@@  .. cpp:var:: oneof parameter_choice
  //@@
  //@@     The parameter value can be a string, an int64 or
  //@@     a boolean.
  //@@
  oneof parameter_choice
  {
    //@@    .. cpp:var:: bool bool_param
    //@@
    //@@       A boolean parameter value.
    //@@
    bool bool_param = 1;

    //@@    .. cpp:var:: int64 int64_param
    //@@
    //@@       An int64 parameter value.
    //@@
    int64 int64_param = 2;

    //@@    .. cpp:var:: string string_param
    //@@
    //@@       A string parameter value.
    //@@
    string string_param = 3;
  }
}

//@@
//@@.. cpp:var:: message InferTensorContents
//@@
//@@   The data contained in a tensor represented by the repeated type
//@@   that matches the tensor's data type. Protobuf oneof is not used
//@@   because oneofs cannot contain repeated fields.
//@@
message InferTensorContents
{
  //@@
  //@@  .. cpp:var:: bool bool_contents (repeated)
  //@@
  //@@     Representation for BOOL data type. The size must match what is
  //@@     expected by the tensor's shape. The contents must be the flattened,
  //@@     one-dimensional, row-major order of the tensor elements.
  //@@
  repeated bool bool_contents = 1;

  //@@
  //@@  .. cpp:var:: int32 int_contents (repeated)
  //@@
  //@@     Representation for INT8, INT16, and INT32 data types. The size
  //@@     must match what is expected by the tensor's shape. The contents
  //@@     must be the flattened, one-dimensional, row-major order of the
  //@@     tensor elements.
  //@@
  repeated int32 int_contents = 2;

  //@@
  //@@  .. cpp:var:: int64 int64_contents (repeated)
  //@@
  //@@     Representation for INT64 data types. The size must match what
  //@@     is expected by the tensor's shape. The contents must be the
  //@@     flattened, one-dimensional, row-major order of the tensor elements.
  //@@
  repeated int64 int64_contents = 3;

  //@@
  //@@  .. cpp:var:: uint32 uint_contents (repeated)
  //@@
  //@@     Representation for UINT8, UINT16, and UINT32 data types. The size
  //@@     must match what is expected by the tensor's shape. The contents
  //@@     must be the flattened, one-dimensional, row-major order of the
  //@@     tensor elements.
  //@@
  repeated uint32 uint_contents = 4;

  //@@
  //@@  .. cpp:var:: uint64 uint64_contents (repeated)
  //@@
  //@@     Representation for UINT64 data types. The size must match what
  //@@     is expected by the tensor's shape. The contents must be the
  //@@     flattened, one-dimensional, row-major order of the tensor elements.
  //@@
  repeated uint64 uint64_contents = 5;

  //@@
  //@@  .. cpp:var:: float fp32_contents (repeated)
  //@@
  //@@     Representation for FP32 data type. The size must match what is
  //@@     expected by the tensor's shape. The contents must be the flattened,
  //@@     one-dimensional, row-major order of the tensor elements.
  //@@
  repeated float fp32_contents = 6;

  //@@
  //@@  .. cpp:var:: double fp64_contents (repeated)
  //@@
  //@@     Representation for FP64 data type. The size must match what is
  //@@     expected by the tensor's shape. The contents must be the flattened,
  //@@     one-dimensional, row-major order of the tensor elements.
  //@@
  repeated double fp64_contents = 7;

  //@@
  //@@  .. cpp:var:: bytes bytes_contents (repeated)
  //@@
  //@@     Representation for BYTES data type. The size must match what is
  //@@     expected by the tensor's shape. The contents must be the flattened,
  //@@     one-dimensional, row-major order of the tensor elements.
  //@@
  repeated bytes bytes_contents = 8;
}
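// A sketch of filling InferTensorContents for a 2x2 FP32 tensor; elements
// are appended in the flattened, row-major order the comments above require
// (modules as in the first sketch):
//
//   contents = grpc_service_pb2.InferTensorContents()
//   contents.fp32_contents.extend([1.0, 2.0,   # row 0
//                                  3.0, 4.0])  # row 1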
//@@
//@@.. cpp:var:: message ModelInferRequest
//@@
//@@   Request message for ModelInfer.
//@@
message ModelInferRequest
{
  //@@
  //@@  .. cpp:var:: message InferInputTensor
  //@@
  //@@     An input tensor for an inference request.
  //@@
  message InferInputTensor
  {
    //@@
    //@@    .. cpp:var:: string name
    //@@
    //@@       The tensor name.
    //@@
    string name = 1;

    //@@
    //@@    .. cpp:var:: string datatype
    //@@
    //@@       The tensor data type.
    //@@
    string datatype = 2;

    //@@
    //@@    .. cpp:var:: int64 shape (repeated)
    //@@
    //@@       The tensor shape.
    //@@
    repeated int64 shape = 3;

    //@@    .. cpp:var:: map<string,InferParameter> parameters
    //@@
    //@@       Optional inference input tensor parameters.
    //@@
    map<string, InferParameter> parameters = 4;

    //@@    .. cpp:var:: InferTensorContents contents
    //@@
    //@@       The tensor contents using a data-type format. This field
    //@@       must not be specified if tensor contents are being specified
    //@@       in ModelInferRequest.raw_input_contents.
    //@@
    InferTensorContents contents = 5;
  }

  //@@
  //@@  .. cpp:var:: message InferRequestedOutputTensor
  //@@
  //@@     An output tensor requested for an inference request.
  //@@
  message InferRequestedOutputTensor
  {
    //@@
    //@@    .. cpp:var:: string name
    //@@
    //@@       The tensor name.
    //@@
    string name = 1;

    //@@    .. cpp:var:: map<string,InferParameter> parameters
    //@@
    //@@       Optional requested output tensor parameters.
    //@@
    map<string, InferParameter> parameters = 2;
  }

  //@@  .. cpp:var:: string model_name
  //@@
  //@@     The name of the model to use for inferencing.
  //@@
  string model_name = 1;

  //@@  .. cpp:var:: string model_version
  //@@
  //@@     The version of the model to use for inference. If not
  //@@     given the latest/most-recent version of the model is used.
  //@@
  string model_version = 2;

  //@@  .. cpp:var:: string id
  //@@
  //@@     Optional identifier for the request. If specified will be
  //@@     returned in the response.
  //@@
  string id = 3;

  //@@  .. cpp:var:: map<string,InferParameter> parameters
  //@@
  //@@     Optional inference parameters.
  //@@
  map<string, InferParameter> parameters = 4;

  //@@
  //@@  .. cpp:var:: InferInputTensor inputs (repeated)
  //@@
  //@@     The input tensors for the inference.
  //@@
  repeated InferInputTensor inputs = 5;

  //@@
  //@@  .. cpp:var:: InferRequestedOutputTensor outputs (repeated)
  //@@
  //@@     The requested output tensors for the inference. Optional, if not
  //@@     specified all outputs specified in the model config will be
  //@@     returned.
  //@@
  repeated InferRequestedOutputTensor outputs = 6;

  //@@
  //@@  .. cpp:var:: bytes raw_input_contents (repeated)
  //@@
  //@@     The data contained in an input tensor can be represented in
  //@@     "raw" bytes form or in the repeated type that matches the
  //@@     tensor's data type. Using the "raw" bytes form will
  //@@     typically allow higher performance due to the way protobuf
  //@@     allocation and reuse interacts with GRPC. For example, see
  //@@     https://github.com/grpc/grpc/issues/23231.
  //@@
  //@@     To use the raw representation 'raw_input_contents' must be
  //@@     initialized with data for each tensor in the same order as
  //@@     'inputs'. For each tensor, the size of this content must
  //@@     match what is expected by the tensor's shape and data
  //@@     type. The raw data must be the flattened, one-dimensional,
  //@@     row-major order of the tensor elements without any stride
  //@@     or padding between the elements. Note that the FP16 and BF16 data
  //@@     types must be represented as raw content as there is no
  //@@     specific data type for a 16-bit float type.
  //@@
  //@@     If this field is specified then InferInputTensor::contents
  //@@     must not be specified for any input tensor.
  //@@
  repeated bytes raw_input_contents = 7;
}
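// A hedged end-to-end sketch of a unary ModelInfer call using the "raw"
// representation described above. numpy's tobytes() already yields the
// flattened, row-major layout the field requires. Model and tensor names
// and the shape are placeholders:
//
//   import numpy as np
//
//   data = np.arange(16, dtype=np.float32).reshape(1, 16)
//   req = grpc_service_pb2.ModelInferRequest(model_name="simple")
//   inp = req.inputs.add()
//   inp.name = "INPUT0"
//   inp.datatype = "FP32"
//   inp.shape.extend([1, 16])
//   # raw_input_contents[i] pairs with inputs[i]; contents must stay unset.
//   req.raw_input_contents.append(data.tobytes())
//   resp = stub.ModelInfer(req)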
//@@
//@@.. cpp:var:: message ModelInferResponse
//@@
//@@   Response message for ModelInfer.
//@@
message ModelInferResponse
{
  //@@
  //@@  .. cpp:var:: message InferOutputTensor
  //@@
  //@@     An output tensor returned for an inference request.
  //@@
  message InferOutputTensor
  {
    //@@
    //@@    .. cpp:var:: string name
    //@@
    //@@       The tensor name.
    //@@
    string name = 1;

    //@@
    //@@    .. cpp:var:: string datatype
    //@@
    //@@       The tensor data type.
    //@@
    string datatype = 2;

    //@@
    //@@    .. cpp:var:: int64 shape (repeated)
    //@@
    //@@       The tensor shape.
    //@@
    repeated int64 shape = 3;

    //@@    .. cpp:var:: map<string,InferParameter> parameters
    //@@
    //@@       Optional output tensor parameters.
    //@@
    map<string, InferParameter> parameters = 4;

    //@@    .. cpp:var:: InferTensorContents contents
    //@@
    //@@       The tensor contents using a data-type format. This field
    //@@       must not be specified if tensor contents are being specified
    //@@       in ModelInferResponse.raw_output_contents.
    //@@
    InferTensorContents contents = 5;
  }

  //@@  .. cpp:var:: string model_name
  //@@
  //@@     The name of the model used for inference.
  //@@
  string model_name = 1;

  //@@  .. cpp:var:: string model_version
  //@@
  //@@     The version of the model used for inference.
  //@@
  string model_version = 2;

  //@@  .. cpp:var:: string id
  //@@
  //@@     The id of the inference request if one was specified.
  //@@
  string id = 3;

  //@@  .. cpp:var:: map<string,InferParameter> parameters
  //@@
  //@@     Optional inference response parameters.
  //@@
  map<string, InferParameter> parameters = 4;

  //@@
  //@@  .. cpp:var:: InferOutputTensor outputs (repeated)
  //@@
  //@@     The output tensors holding inference results.
  //@@
  repeated InferOutputTensor outputs = 5;

  //@@
  //@@  .. cpp:var:: bytes raw_output_contents (repeated)
  //@@
  //@@     The data contained in an output tensor can be represented in
  //@@     "raw" bytes form or in the repeated type that matches the
  //@@     tensor's data type. Using the "raw" bytes form will
  //@@     typically allow higher performance due to the way protobuf
  //@@     allocation and reuse interacts with GRPC. For example, see
  //@@     https://github.com/grpc/grpc/issues/23231.
  //@@
  //@@     To use the raw representation 'raw_output_contents' must be
  //@@     initialized with data for each tensor in the same order as
  //@@     'outputs'. For each tensor, the size of this content must
  //@@     match what is expected by the tensor's shape and data
  //@@     type. The raw data must be the flattened, one-dimensional,
  //@@     row-major order of the tensor elements without any stride
  //@@     or padding between the elements. Note that the FP16 and BF16 data
  //@@     types must be represented as raw content as there is no
  //@@     specific data type for a 16-bit float type.
  //@@
  //@@     If this field is specified then InferOutputTensor::contents
  //@@     must not be specified for any output tensor.
  //@@
  repeated bytes raw_output_contents = 6;
}

//@@
//@@.. cpp:var:: message ModelStreamInferResponse
//@@
//@@   Response message for ModelStreamInfer.
//@@
message ModelStreamInferResponse
{
  //@@
  //@@  .. cpp:var:: string error_message
  //@@
  //@@     The message describing the error. An empty message
  //@@     indicates the inference was successful without errors.
  //@@
  string error_message = 1;

  //@@
  //@@  .. cpp:var:: ModelInferResponse infer_response
  //@@
  //@@     Holds the results of the request.
  //@@
  ModelInferResponse infer_response = 2;
}
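// An illustrative streaming sketch: ModelStreamInfer is bidirectional, so
// the generated Python stub takes an iterator of requests and yields
// responses. Unlike the unary call, per-request errors arrive in
// error_message rather than as RPC status. The request object and the FP32
// output dtype are assumptions carried from the previous sketch:
//
//   import numpy as np
//
//   def requests():
//       yield req  # one or more ModelInferRequest messages
//
//   for resp in stub.ModelStreamInfer(requests()):
//       if resp.error_message:
//           print("error:", resp.error_message)
//           continue
//       out = resp.infer_response
//       # Decode the first raw output back into numpy; the dtype and shape
//       # must match the output's declared metadata.
//       arr = np.frombuffer(out.raw_output_contents[0], dtype=np.float32)
//       print(out.model_name, out.outputs[0].name, arr[:4])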
//@@
//@@.. cpp:var:: message ModelConfigRequest
//@@
//@@   Request message for ModelConfig.
//@@
message ModelConfigRequest
{
  //@@
  //@@  .. cpp:var:: string name
  //@@
  //@@     The name of the model.
  //@@
  string name = 1;

  //@@  .. cpp:var:: string version
  //@@
  //@@     The version of the model. If not given the model version
  //@@     is selected automatically based on the version policy.
  //@@
  string version = 2;
}

//@@
//@@.. cpp:var:: message ModelConfigResponse
//@@
//@@   Response message for ModelConfig.
//@@
message ModelConfigResponse
{
  //@@
  //@@  .. cpp:var:: ModelConfig config
  //@@
  //@@     The model configuration.
  //@@
  ModelConfig config = 1;
}

//@@
//@@.. cpp:var:: message ModelStatisticsRequest
//@@
//@@   Request message for ModelStatistics.
//@@
message ModelStatisticsRequest
{
  //@@  .. cpp:var:: string name
  //@@
  //@@     The name of the model. If not given returns statistics for
  //@@     all models.
  //@@
  string name = 1;

  //@@  .. cpp:var:: string version
  //@@
  //@@     The version of the model. If not given returns statistics for
  //@@     all model versions.
  //@@
  string version = 2;
}

//@@
//@@.. cpp:var:: message StatisticDuration
//@@
//@@   Statistic recording a cumulative duration metric.
//@@
message StatisticDuration
{
  //@@  .. cpp:var:: uint64 count
  //@@
  //@@     Cumulative number of times this metric occurred.
  //@@
  uint64 count = 1;

  //@@  .. cpp:var:: uint64 ns
  //@@
  //@@     Total collected duration of this metric in nanoseconds.
  //@@
  uint64 ns = 2;
}
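// A worked note on this message: because both fields are cumulative, the
// average duration of one occurrence is the total divided by the count.
// The values below are made up for illustration:
//
//   count = 250
//   total_ns = 1_250_000_000
//   avg_ms = total_ns / count / 1e6   # 5.0 ms per occurrence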
//@@
//@@.. cpp:var:: message InferStatistics
//@@
//@@   Inference statistics.
//@@
message InferStatistics
{
  //@@  .. cpp:var:: StatisticDuration success
  //@@
  //@@     Cumulative count and duration for successful inference
  //@@     requests. The "success" count and cumulative duration include
  //@@     cache hits.
  //@@
  StatisticDuration success = 1;

  //@@  .. cpp:var:: StatisticDuration fail
  //@@
  //@@     Cumulative count and duration for failed inference
  //@@     requests.
  //@@
  StatisticDuration fail = 2;

  //@@  .. cpp:var:: StatisticDuration queue
  //@@
  //@@     The count and cumulative duration that inference requests wait in
  //@@     scheduling or other queues. The "queue" count and cumulative
  //@@     duration include cache hits.
  //@@
  StatisticDuration queue = 3;

  //@@  .. cpp:var:: StatisticDuration compute_input
  //@@
  //@@     The count and cumulative duration to prepare input tensor data as
  //@@     required by the model framework / backend. For example, this duration
  //@@     should include the time to copy input tensor data to the GPU.
  //@@     The "compute_input" count and cumulative duration do not account for
  //@@     requests that were a cache hit. See the "cache_hit" field for more
  //@@     info.
  //@@
  StatisticDuration compute_input = 4;

  //@@  .. cpp:var:: StatisticDuration compute_infer
  //@@
  //@@     The count and cumulative duration to execute the model.
  //@@     The "compute_infer" count and cumulative duration do not account for
  //@@     requests that were a cache hit. See the "cache_hit" field for more
  //@@     info.
  //@@
  StatisticDuration compute_infer = 5;

  //@@  .. cpp:var:: StatisticDuration compute_output
  //@@
  //@@     The count and cumulative duration to extract output tensor data
  //@@     produced by the model framework / backend. For example, this duration
  //@@     should include the time to copy output tensor data from the GPU.
  //@@     The "compute_output" count and cumulative duration do not account for
  //@@     requests that were a cache hit. See the "cache_hit" field for more
  //@@     info.
  //@@
  StatisticDuration compute_output = 6;

  //@@  .. cpp:var:: StatisticDuration cache_hit
  //@@
  //@@     The count of response cache hits and cumulative duration to lookup
  //@@     and extract output tensor data from the Response Cache on a cache
  //@@     hit. For example, this duration should include the time to copy
  //@@     output tensor data from the Response Cache to the response object.
  //@@     On cache hits, Triton does not need to go to the model/backend
  //@@     for the output tensor data, so the "compute_input", "compute_infer",
  //@@     and "compute_output" fields are not updated. Assuming the response
  //@@     cache is enabled for a given model, a cache hit occurs for a
  //@@     request to that model when the request metadata (model name,
  //@@     model version, model inputs) hashes to an existing entry in the
  //@@     cache. On a cache miss, the request hash and response output tensor
  //@@     data are added to the cache. See response cache docs for more info:
  //@@     https://github.com/triton-inference-server/server/blob/main/docs/response_cache.md
  //@@
  StatisticDuration cache_hit = 7;

  //@@  .. cpp:var:: StatisticDuration cache_miss
  //@@
  //@@     The count of response cache misses and cumulative duration to lookup
  //@@     and insert output tensor data from the computed response to the cache.
  //@@     For example, this duration should include the time to copy
  //@@     output tensor data from the response object to the Response Cache.
  //@@     Assuming the response cache is enabled for a given model, a cache
  //@@     miss occurs for a request to that model when the request metadata
  //@@     does NOT hash to an existing entry in the cache. See the response
  //@@     cache docs for more info:
  //@@     https://github.com/triton-inference-server/server/blob/main/docs/response_cache.md
  //@@
  StatisticDuration cache_miss = 8;
}

//@@
//@@.. cpp:var:: message InferBatchStatistics
//@@
//@@   Inference batch statistics.
//@@
message InferBatchStatistics
{
  //@@  .. cpp:var:: uint64 batch_size
  //@@
  //@@     The size of the batch.
  //@@
  uint64 batch_size = 1;

  //@@  .. cpp:var:: StatisticDuration compute_input
  //@@
  //@@     The count and cumulative duration to prepare input tensor data as
  //@@     required by the model framework / backend with the given batch size.
  //@@     For example, this duration should include the time to copy input
  //@@     tensor data to the GPU.
  //@@
  StatisticDuration compute_input = 2;

  //@@  .. cpp:var:: StatisticDuration compute_infer
  //@@
  //@@     The count and cumulative duration to execute the model with the given
  //@@     batch size.
  //@@
  StatisticDuration compute_infer = 3;

  //@@  .. cpp:var:: StatisticDuration compute_output
  //@@
  //@@     The count and cumulative duration to extract output tensor data
  //@@     produced by the model framework / backend with the given batch size.
  //@@     For example, this duration should include the time to copy output
  //@@     tensor data from the GPU.
  //@@
  StatisticDuration compute_output = 4;
}
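// Illustrative use of the ModelStatistics rpc with the ModelStatistics
// messages defined just below: fetch cumulative stats for one model and
// derive an average success latency. The model name and generated modules
// are the same assumptions as in the earlier sketches:
//
//   req = grpc_service_pb2.ModelStatisticsRequest(name="simple", version="1")
//   stats = stub.ModelStatistics(req).model_stats[0]
//   s = stats.inference_stats.success
//   if s.count:
//       print("avg success latency:", s.ns / s.count / 1e6, "ms")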
//@@
//@@.. cpp:var:: message ModelStatistics
//@@
//@@   Statistics for a specific model and version.
//@@
message ModelStatistics
{
  //@@  .. cpp:var:: string name
  //@@
  //@@     The name of the model.
  //@@
  string name = 1;

  //@@  .. cpp:var:: string version
  //@@
  //@@     The version of the model.
  //@@
  string version = 2;

  //@@  .. cpp:var:: uint64 last_inference
  //@@
  //@@     The timestamp of the last inference request made for this model,
  //@@     as milliseconds since the epoch.
  //@@
  uint64 last_inference = 3;

  //@@  .. cpp:var:: uint64 inference_count
  //@@
  //@@     The cumulative count of successful inference requests made for this
  //@@     model. Each inference in a batched request is counted as an
  //@@     individual inference. For example, if a client sends a single
  //@@     inference request with batch size 64, "inference_count" will be
  //@@     incremented by 64. Similarly, if a client sends 64 individual
  //@@     requests each with batch size 1, "inference_count" will be
  //@@     incremented by 64. The "inference_count" value DOES NOT include
  //@@     cache hits.
  //@@
  uint64 inference_count = 4;

  //@@  .. cpp:var:: uint64 execution_count
  //@@
  //@@     The cumulative count of the number of successful inference executions
  //@@     performed for the model. When dynamic batching is enabled, a single
  //@@     model execution can perform inferencing for more than one inference
  //@@     request. For example, if a client sends 64 individual requests each
  //@@     with batch size 1 and the dynamic batcher batches them into a single
  //@@     large batch for model execution then "execution_count" will be
  //@@     incremented by 1. If, on the other hand, the dynamic batcher is not
  //@@     enabled for the model and each of the 64 individual requests is
  //@@     executed independently, then "execution_count" will be incremented
  //@@     by 64. The "execution_count" value DOES NOT include cache hits.
  //@@
  uint64 execution_count = 5;

  //@@  .. cpp:var:: InferStatistics inference_stats
  //@@
  //@@     The aggregate statistics for the model/version.
  //@@
  InferStatistics inference_stats = 6;

  //@@  .. cpp:var:: InferBatchStatistics batch_stats (repeated)
  //@@
  //@@     The aggregate statistics for each different batch size that is
  //@@     executed in the model. The batch statistics indicate how many actual
  //@@     model executions were performed and show differences due to different
  //@@     batch size (for example, larger batches typically take longer to
  //@@     compute).
  //@@
  repeated InferBatchStatistics batch_stats = 7;
}

//@@
//@@.. cpp:var:: message ModelStatisticsResponse
//@@
//@@   Response message for ModelStatistics.
//@@
message ModelStatisticsResponse
{
  //@@  .. cpp:var:: ModelStatistics model_stats (repeated)
  //@@
  //@@     Statistics for each requested model.
  //@@
  repeated ModelStatistics model_stats = 1;
}

//@@
//@@.. cpp:var:: message ModelRepositoryParameter
//@@
//@@   A model repository parameter value.
//@@
message ModelRepositoryParameter
{
  //@@  .. cpp:var:: oneof parameter_choice
  //@@
  //@@     The parameter value can be a string, an int64,
  //@@     a boolean or bytes.
  //@@
  oneof parameter_choice
  {
    //@@    .. cpp:var:: bool bool_param
    //@@
    //@@       A boolean parameter value.
    //@@
    bool bool_param = 1;

    //@@    .. cpp:var:: int64 int64_param
    //@@
    //@@       An int64 parameter value.
    //@@
    int64 int64_param = 2;

    //@@    .. cpp:var:: string string_param
    //@@
    //@@       A string parameter value.
    //@@
    string string_param = 3;

    //@@    .. cpp:var:: bytes bytes_param
    //@@
    //@@       A bytes parameter value.
    //@@
    bytes bytes_param = 4;
  }
}

//@@
//@@.. cpp:var:: message RepositoryIndexRequest
//@@
//@@   Request message for RepositoryIndex.
//@@
message RepositoryIndexRequest
{
  //@@  .. cpp:var:: string repository_name
  //@@
  //@@     The name of the repository. If empty the index is returned
  //@@     for all repositories.
  //@@
  string repository_name = 1;

  //@@  .. cpp:var:: bool ready
  //@@
  //@@     If true return only models currently ready for inferencing.
  //@@
  bool ready = 2;
}

//@@
//@@.. cpp:var:: message RepositoryIndexResponse
//@@
//@@   Response message for RepositoryIndex.
//@@
message RepositoryIndexResponse
{
  //@@
  //@@  .. cpp:var:: message ModelIndex
  //@@
  //@@     Index entry for a model.
  //@@
  message ModelIndex
  {
    //@@
    //@@    .. cpp:var:: string name
    //@@
    //@@       The name of the model.
    //@@
    string name = 1;

    //@@    .. cpp:var:: string version
    //@@
    //@@       The version of the model.
    //@@
    string version = 2;

    //@@
    //@@    .. cpp:var:: string state
    //@@
    //@@       The state of the model.
    //@@
    string state = 3;

    //@@
    //@@    .. cpp:var:: string reason
    //@@
    //@@       The reason, if any, that the model is in the given state.
    //@@
    string reason = 4;
  }

  //@@
  //@@  .. cpp:var:: ModelIndex models (repeated)
  //@@
  //@@     An index entry for each model.
  //@@
  repeated ModelIndex models = 1;
}
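// A sketch of listing the repository index. Per the comments above, an
// empty repository_name means all repositories; modules and stub are the
// assumptions from the first sketch:
//
//   req = grpc_service_pb2.RepositoryIndexRequest(repository_name="",
//                                                 ready=False)
//   for m in stub.RepositoryIndex(req).models:
//       print(m.name, m.version, m.state, m.reason)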
//@@
//@@.. cpp:var:: message RepositoryModelLoadRequest
//@@
//@@   Request message for RepositoryModelLoad.
//@@
message RepositoryModelLoadRequest
{
  //@@  .. cpp:var:: string repository_name
  //@@
  //@@     The name of the repository to load from. If empty the model
  //@@     is loaded from any repository.
  //@@
  string repository_name = 1;

  //@@  .. cpp:var:: string model_name
  //@@
  //@@     The name of the model to load, or reload.
  //@@
  string model_name = 2;

  //@@  .. cpp:var:: map<string,ModelRepositoryParameter> parameters
  //@@
  //@@     Optional model repository request parameters.
  //@@
  map<string, ModelRepositoryParameter> parameters = 3;
}

//@@
//@@.. cpp:var:: message RepositoryModelLoadResponse
//@@
//@@   Response message for RepositoryModelLoad.
//@@
message RepositoryModelLoadResponse {}

//@@
//@@.. cpp:var:: message RepositoryModelUnloadRequest
//@@
//@@   Request message for RepositoryModelUnload.
//@@
message RepositoryModelUnloadRequest
{
  //@@  .. cpp:var:: string repository_name
  //@@
  //@@     The name of the repository from which the model was originally
  //@@     loaded. If empty the repository is not considered.
  //@@
  string repository_name = 1;

  //@@  .. cpp:var:: string model_name
  //@@
  //@@     The name of the model to unload.
  //@@
  string model_name = 2;

  //@@  .. cpp:var:: map<string,ModelRepositoryParameter> parameters
  //@@
  //@@     Optional model repository request parameters.
  //@@
  map<string, ModelRepositoryParameter> parameters = 3;
}

//@@
//@@.. cpp:var:: message RepositoryModelUnloadResponse
//@@
//@@   Response message for RepositoryModelUnload.
//@@
message RepositoryModelUnloadResponse {}

//@@
//@@.. cpp:var:: message SystemSharedMemoryStatusRequest
//@@
//@@   Request message for SystemSharedMemoryStatus.
//@@
message SystemSharedMemoryStatusRequest
{
  //@@
  //@@  .. cpp:var:: string name
  //@@
  //@@     The name of the region to get status for. If empty the
  //@@     status is returned for all registered regions.
  //@@
  string name = 1;
}

//@@
//@@.. cpp:var:: message SystemSharedMemoryStatusResponse
//@@
//@@   Response message for SystemSharedMemoryStatus.
//@@
message SystemSharedMemoryStatusResponse
{
  //@@
  //@@  .. cpp:var:: message RegionStatus
  //@@
  //@@     Status for a shared memory region.
  //@@
  message RegionStatus
  {
    //@@
    //@@    .. cpp:var:: string name
    //@@
    //@@       The name for the shared memory region.
    //@@
    string name = 1;

    //@@    .. cpp:var:: string key
    //@@
    //@@       The key of the underlying memory object that contains the
    //@@       shared memory region.
    //@@
    string key = 2;

    //@@    .. cpp:var:: uint64 offset
    //@@
    //@@       Offset, in bytes, within the underlying memory object to
    //@@       the start of the shared memory region.
    //@@
    uint64 offset = 3;

    //@@    .. cpp:var:: uint64 byte_size
    //@@
    //@@       Size of the shared memory region, in bytes.
    //@@
    uint64 byte_size = 4;
  }

  //@@
  //@@  .. cpp:var:: map<string,RegionStatus> regions
  //@@
  //@@     Status for each of the registered regions, indexed by
  //@@     region name.
  //@@
  map<string, RegionStatus> regions = 1;
}

//@@
//@@.. cpp:var:: message SystemSharedMemoryRegisterRequest
//@@
//@@   Request message for SystemSharedMemoryRegister.
//@@
message SystemSharedMemoryRegisterRequest
{
  //@@
  //@@  .. cpp:var:: string name
  //@@
  //@@     The name of the region to register.
  //@@
  string name = 1;

  //@@  .. cpp:var:: string key
  //@@
  //@@     The key of the underlying memory object that contains the
  //@@     shared memory region.
  //@@
  string key = 2;

  //@@  .. cpp:var:: uint64 offset
  //@@
  //@@     Offset, in bytes, within the underlying memory object to
  //@@     the start of the shared memory region.
  //@@
  uint64 offset = 3;

  //@@  .. cpp:var:: uint64 byte_size
  //@@
  //@@     Size of the shared memory region, in bytes.
  //@@
  uint64 byte_size = 4;
}

//@@
//@@.. cpp:var:: message SystemSharedMemoryRegisterResponse
//@@
//@@   Response message for SystemSharedMemoryRegister.
//@@
message SystemSharedMemoryRegisterResponse {}
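// A hedged sketch of registering a system shared-memory region. The region
// itself must already exist; "/triton_shm" is a placeholder POSIX
// shared-memory key created elsewhere (e.g. via shm_open), and the 64-byte
// size is arbitrary:
//
//   req = grpc_service_pb2.SystemSharedMemoryRegisterRequest(
//       name="input_region", key="/triton_shm", offset=0, byte_size=64)
//   stub.SystemSharedMemoryRegister(req)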
//@@
//@@.. cpp:var:: message SystemSharedMemoryUnregisterRequest
//@@
//@@   Request message for SystemSharedMemoryUnregister.
//@@
message SystemSharedMemoryUnregisterRequest
{
  //@@
  //@@  .. cpp:var:: string name
  //@@
  //@@     The name of the system region to unregister. If empty
  //@@     all system shared-memory regions are unregistered.
  //@@
  string name = 1;
}

//@@
//@@.. cpp:var:: message SystemSharedMemoryUnregisterResponse
//@@
//@@   Response message for SystemSharedMemoryUnregister.
//@@
message SystemSharedMemoryUnregisterResponse {}

//@@
//@@.. cpp:var:: message CudaSharedMemoryStatusRequest
//@@
//@@   Request message for CudaSharedMemoryStatus.
//@@
message CudaSharedMemoryStatusRequest
{
  //@@
  //@@  .. cpp:var:: string name
  //@@
  //@@     The name of the region to get status for. If empty the
  //@@     status is returned for all registered regions.
  //@@
  string name = 1;
}

//@@
//@@.. cpp:var:: message CudaSharedMemoryStatusResponse
//@@
//@@   Response message for CudaSharedMemoryStatus.
//@@
message CudaSharedMemoryStatusResponse
{
  //@@
  //@@  .. cpp:var:: message RegionStatus
  //@@
  //@@     Status for a shared memory region.
  //@@
  message RegionStatus
  {
    //@@
    //@@    .. cpp:var:: string name
    //@@
    //@@       The name for the shared memory region.
    //@@
    string name = 1;

    //@@    .. cpp:var:: uint64 device_id
    //@@
    //@@       The GPU device ID where the cudaIPC handle was created.
    //@@
    uint64 device_id = 2;

    //@@    .. cpp:var:: uint64 byte_size
    //@@
    //@@       Size of the shared memory region, in bytes.
    //@@
    uint64 byte_size = 3;
  }

  //@@
  //@@  .. cpp:var:: map<string,RegionStatus> regions
  //@@
  //@@     Status for each of the registered regions, indexed by
  //@@     region name.
  //@@
  map<string, RegionStatus> regions = 1;
}

//@@
//@@.. cpp:var:: message CudaSharedMemoryRegisterRequest
//@@
//@@   Request message for CudaSharedMemoryRegister.
//@@
message CudaSharedMemoryRegisterRequest
{
  //@@
  //@@  .. cpp:var:: string name
  //@@
  //@@     The name of the region to register.
  //@@
  string name = 1;

  //@@  .. cpp:var:: bytes raw_handle
  //@@
  //@@     The raw serialized cudaIPC handle.
  //@@
  bytes raw_handle = 2;

  //@@  .. cpp:var:: int64 device_id
  //@@
  //@@     The GPU device ID on which the cudaIPC handle was created.
  //@@
  int64 device_id = 3;

  //@@  .. cpp:var:: uint64 byte_size
  //@@
  //@@     Size of the shared memory block, in bytes.
  //@@
  uint64 byte_size = 4;
}

//@@
//@@.. cpp:var:: message CudaSharedMemoryRegisterResponse
//@@
//@@   Response message for CudaSharedMemoryRegister.
//@@
message CudaSharedMemoryRegisterResponse {}

//@@
//@@.. cpp:var:: message CudaSharedMemoryUnregisterRequest
//@@
//@@   Request message for CudaSharedMemoryUnregister.
//@@
message CudaSharedMemoryUnregisterRequest
{
  //@@
  //@@  .. cpp:var:: string name
  //@@
  //@@     The name of the cuda region to unregister. If empty
  //@@     all cuda shared-memory regions are unregistered.
  //@@
  string name = 1;
}

//@@
//@@.. cpp:var:: message CudaSharedMemoryUnregisterResponse
//@@
//@@   Response message for CudaSharedMemoryUnregister.
//@@
message CudaSharedMemoryUnregisterResponse {}
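// Illustrative status query for the CUDA region messages above. Per the
// request comments, an empty name returns every registered region; modules
// and stub as in the first sketch:
//
//   resp = stub.CudaSharedMemoryStatus(
//       grpc_service_pb2.CudaSharedMemoryStatusRequest(name=""))
//   for name, region in resp.regions.items():
//       print(name, "device", region.device_id, "bytes", region.byte_size)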
//@@
//@@.. cpp:var:: message TraceSettingRequest
//@@
//@@   Request message for TraceSetting.
//@@
message TraceSettingRequest
{
  //@@
  //@@  .. cpp:var:: message SettingValue
  //@@
  //@@     The values to be associated with a trace setting.
  //@@     If no value is provided, the setting will be cleared and
  //@@     the global setting value will be used.
  //@@
  message SettingValue
  {
    //@@
    //@@    .. cpp:var:: string value (repeated)
    //@@
    //@@       The value.
    //@@
    repeated string value = 1;
  }

  //@@  .. cpp:var:: map<string,SettingValue> settings
  //@@
  //@@     The new setting values to be updated; settings that are not
  //@@     specified will remain unchanged.
  //@@
  map<string, SettingValue> settings = 1;

  //@@
  //@@  .. cpp:var:: string model_name
  //@@
  //@@     The name of the model to apply the new trace settings.
  //@@     If not given, the new settings will be applied globally.
  //@@
  string model_name = 2;
}

//@@
//@@.. cpp:var:: message TraceSettingResponse
//@@
//@@   Response message for TraceSetting.
//@@
message TraceSettingResponse
{
  //@@
  //@@  .. cpp:var:: message SettingValue
  //@@
  //@@     The values to be associated with a trace setting.
  //@@
  message SettingValue
  {
    //@@
    //@@    .. cpp:var:: string value (repeated)
    //@@
    //@@       The value.
    //@@
    repeated string value = 1;
  }

  //@@  .. cpp:var:: map<string,SettingValue> settings
  //@@
  //@@     The current trace settings, including any changes specified
  //@@     by TraceSettingRequest.
  //@@
  map<string, SettingValue> settings = 1;
}
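// A final hedged sketch: updating one trace setting globally and reading
// back the effective values. The key "trace_rate" is an assumption about
// the server's supported settings, not something this file defines:
//
//   req = grpc_service_pb2.TraceSettingRequest()
//   req.settings["trace_rate"].value.append("1000")
//   resp = stub.TraceSetting(req)  # no model_name => applied globally
//   for key, val in resp.settings.items():
//       print(key, list(val.value))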