syntax = "proto3";

package xrt;

import "tensorflow/compiler/tf2xla/host_compute_metadata.proto";
import "tensorflow/compiler/xla/service/hlo.proto";
import "tensorflow/compiler/xla/xla.proto";
import "tensorflow/compiler/xla/xla_data.proto";

message DeviceAssignment {
  message ComputationDevice {
    message DeviceMeshCoordinates {
      // The mesh coordinates for the device. Usually (X, Y, Core), in the order
      // in which they are returned in the TopologyProto.
      //  X    = value(0)
      //  Y    = value(1)
      //  Core = value(2)
      repeated int32 value = 1;
    }
    // As many replicas as there are in the replicated computation.
    repeated DeviceMeshCoordinates replica_devices = 1;
  }
  // As many ComputationDevice as many there are computations (number
  // of cores per replica).
  repeated ComputationDevice computation_devices = 1;
}

// Options for an XLA compilation.
message XLAComputationConfig {
  // The number of replicas the computation will be run on. If this is
  // default (0) it is interpreted as 1.
  int32 num_replicas = 1;
  // The number of "model-parallel" cores per replica. If this is
  // default (0) it is interpreted as 1.
  int32 num_cores_per_replica = 2;
  // Optional metadata about host sends and recvs.
  tensorflow.tf2xla.HostComputeMetadata host_compute_metadata = 3;

  // The arg/result shapes for the whole computation.
  xla.ProgramShapeProto program_shape = 4;
  // The arg/result shapes for each core of a model-parallel
  // computation. per_core_args_and_result_shapes is optional for a
  // single-core computation.
  repeated xla.ProgramShapeProto per_core_program_shape = 5;
  // Describes how replicated computation instances should be assigned to
  // devices. There are num_cores_per_replica computations, and each one will be
  // sent and executed to the set of replica device numbers described in the
  // DeviceAssignment proto.
  DeviceAssignment device_assignment = 6;
  // The debugging options to be passed to the XLA compilation process.
  xla.DebugOptions debug_options = 7;
}

// Options and XLA computation for a compilation.
message XLAComputation {
  XLAComputationConfig config = 1;
  xla.HloSnapshot hlo_snapshot = 2;
}

// Literal to allocate space for, and transfer to, device memory.
message XLAAllocation {
  reserved 1;
  xla.LiteralProto value = 2;
}

// Node in a tree describing a tuple constructed from input handles. A
// node is an internal node if tuples is non-empty, in which case
// input_index and release_input_handle are ignored. Otherwise a node
// is a leaf node. Each leaf XLATupleNode is the index of an input
// which corresponds to a handle that will be grafted onto the output
// tuple at that location. If release_input_handle is true that input
// handle will be released and become invalid.  Inputs may be repeated
// in which case leaves of the output tuple will alias. If an input is
// repeated, release_input_handle must be false for every leaf where
// that input appears.
//
// For example, if input 0 has shape {} and input 1 has shape {2,3}
// then the XLATupleNode with structure {1,{0,1}} corresponds to a
// tuple with shape {{2,3},{{},{2,3}}}.
message XLATupleNode {
  int32 input_index = 1;
  bool release_input_handle = 2;
  repeated XLATupleNode tuples = 3;
}

// Options for an XLA execution.
message XRTExecutionConfig {
  // Local device to run on. This is present because the execute Op
  // may be placed on a device such as CPU or TPU_SYSTEM that
  // logically manages multiple cores.
  int32 device_ordinal = 1;
  // Which model-parallel computation to run from the compiled bundle.
  int32 core_index_in_replica = 2;
  // Optional key to disambiguate between executions. This is only
  // needed if multiple host send/recvs may be outstanding
  // concurrently with executions.
  string execution_instance_key = 3;
  // If non-zero, rng_seed to reset the core with.
  uint32 rng_seed = 4;
  // If true, release allocation handles on the inputs after running.
  bool release_input_handles = 5;
  // If true, release the handle to the computation after running.
  bool release_compilation_handle = 6;
  // If set to true, and the result shape is a tuple, then instead of returning
  // a single tuple allocation the execution will return a vector of
  // allocations, one for each of the first-level elements of the result tuple.
  bool return_exploded_tuple = 7;
}

message XRTChainedExecuteConfig {
  // If non-zero, rng_seed to reset the core with.
  uint32 rng_seed = 1;
  // Which model-parallel computation to run from the compiled bundle.
  int32 core_index_in_replica = 2;
  // Optional key to disambiguate between executions. This is only needed if
  // multiple host send/recvs may be outstanding concurrently with executions.
  string execution_instance_key = 3;
}

// A single chained execute operation. An operation can either be a device data
// load, or an existing (as in, previously compiled and accessible via its int64
// handle) XLA computation execution.
message XRTChainedExecuteOp {
  // Represents an input for this operation.
  message Input {
    // The index within the XRTChainedExecutePlan.ops post-order of the source
    // operation for this input.
    int64 op_index = 1;
    // The output index of the value generated by the operation at op_index.
    // Zero (default value) means no index ({}) while if an indexing is
    // required, output_index needs to be set to index+1.
    // Thanks proto3!
    int64 output_index = 2;
  }
  // Represents an output of the XRTChainedExecute operation, which should
  // originate by the output of this operation.
  message Output {
    // The index in the value generated by this operation, which should be
    // forwarded as XRTChainedExecute output. If output_index is zero (default
    // value) the whole output will be used as result. This means that if the
    // output shape is a tuple, the result will be the full tuple. Otherwise the
    // real sub-tuple index will be output_index - 1.
    int64 output_index = 1;
    // The index in the vector of the results returned by the XRTChainedExecute
    // operation, where this output should be forwarded.
    int64 result_index = 2;
  }

  oneof op_oneof {
    // The handle to an existing XRT device data.
    int64 data_handle = 1;
    // The handle to an existing XRT compiled computation.
    int64 computation_handle = 2;
  }
  // The outputs of this XRTChainedExecuteOp operation.
  repeated Output outputs = 3;
  // The inputs of this XRTChainedExecuteOp operation. If data_handle is set,
  // there are no inputs.
  repeated Input inputs = 4;
}

// Execution plan for the XRTChainedExecute operation.
message XRTChainedExecutePlan {
  // The post order with the XRT computations to be executed.
  repeated XRTChainedExecuteOp ops = 1;
}