syntax = "proto3"; package tensorflow.eager; import "tensorflow/core/framework/attr_value.proto"; import "tensorflow/core/framework/device_attributes.proto"; import "tensorflow/core/framework/function.proto"; import "tensorflow/core/framework/tensor.proto"; import "tensorflow/core/framework/tensor_shape.proto"; import "tensorflow/core/framework/versions.proto"; import "tensorflow/core/protobuf/tensorflow_server.proto"; message RemoteTensorHandle { // The ID of the operation that produced this tensor. int64 op_id = 1; // The index into the outputs of the operation that produced this tensor. int32 output_num = 2; } // A proto representation of an eager operation. message Operation { // A unique identifier for the operation. Set by the client so that the client // can uniquely identify the outputs of the scheduled operation. // // In the initial implementation, sending duplicate IDs has undefined // behaviour, but additional constraints may be placed upon this in the // future. int64 id = 1; string name = 2; repeated RemoteTensorHandle inputs = 3; // Control Operation IDs that will be respected when ops are re-ordered by // async execution. If async execution (+ op re-ordering) is not enabled, this // should have no effect. repeated int64 control_op_ids = 4; map attrs = 5; string device = 6; } message QueueItem { // The remote executor should be able to handle either executing ops directly, // or releasing any unused tensor handles, since the tensor lifetime is // maintained by the client. oneof item { RemoteTensorHandle handle_to_decref = 1; Operation operation = 2; } } message QueueResponse { repeated TensorShapeProto shape = 1; } message CreateContextRequest { // Identifies the full cluster, and this particular worker's position within. ServerDef server_def = 1; // Whether the ops on the worker should be executed synchronously or // asynchronously. By default, ops are executed synchronously. bool async = 2; // Number of seconds to keep the context alive. If more than keep_alive_secs // has passed since a particular context has been communicated with, it will // be garbage collected. int64 keep_alive_secs = 3; // This is the version for all the ops that will be enqueued by the client. VersionDef version_def = 4; // This ID will be used for all future communications. It is essential that // both ends use this ID for selecting a rendezvous to get everything to // match. int64 rendezvous_id = 5; // Device attributes in the cluster repeated DeviceAttributes cluster_device_attributes = 6; } message CreateContextResponse { // The ID of the created context. This is usually a randomly generated number, // that will be used to identify the context in future requests to the // service. Contexts are not persisted through server restarts. fixed64 context_id = 1; // List of devices that are locally accessible to the worker. repeated DeviceAttributes device_attributes = 2; } message EnqueueRequest { fixed64 context_id = 1; repeated QueueItem queue = 3; } message EnqueueResponse { // A single operation response for every item in the request. repeated QueueResponse queue_response = 1; } message WaitQueueDoneRequest { fixed64 context_id = 1; // Ids to wait on. If empty, wait on everything currently pending. repeated int64 op_id = 2; } message WaitQueueDoneResponse { // TODO(nareshmodi): Consider adding NodeExecStats here to be able to // propagate some stats. } message KeepAliveRequest { fixed64 context_id = 1; } message KeepAliveResponse {} message CloseContextRequest { fixed64 context_id = 1; } message CloseContextResponse {} message RegisterFunctionRequest { fixed64 context_id = 1; FunctionDef function_def = 2; } message RegisterFunctionResponse {} message SendTensorRequest { fixed64 context_id = 1; // All remote tensors are identified by . To mimic this // situation when directly sending tensors, we include an "artificial" op ID // (which would have corresponded to the _Recv op when not using SendTensor). int64 op_id = 2; // The index within the repeated field is the output number that will help // uniquely identify (along with the above op_id) the particular tensor. repeated TensorProto tensors = 3; // The device on which the tensors should be resident. string device_name = 4; } message SendTensorResponse {} //////////////////////////////////////////////////////////////////////////////// // // Eager Service defines a TensorFlow service that executes operations eagerly // on a set of local devices, on behalf of a remote Eager executor. // // The service impl will keep track of the various clients and devices it has // access to and allows the client to enqueue ops on any devices that it is able // to access and schedule data transfers from/to any of the peers. // // A client can generate multiple contexts to be able to independently execute // operations, but cannot share data between the two contexts. // // NOTE: Even though contexts generated by clients should be independent, the // lower level tensorflow execution engine is not, so they might share some data // (e.g. a Device's ResourceMgr). // //////////////////////////////////////////////////////////////////////////////// service EagerService { // This initializes the worker, informing it about the other workers in the // cluster and exchanging authentication tokens which will be used in all // other RPCs to detect whether the worker has restarted. rpc CreateContext(CreateContextRequest) returns (CreateContextResponse); // This takes a list of Execute and DeleteTensorHandle operations and enqueues // (in async mode) or executes (in sync mode) them on the remote server. // All outputs of ops which were not explicitly deleted with // DeleteTensorHandle entries will be assumed to be alive and are usable by // future calls to Enqueue. rpc Enqueue(EnqueueRequest) returns (EnqueueResponse); // Takes a set of op IDs and waits until those ops are done. Returns any error // in the stream so far. rpc WaitQueueDone(WaitQueueDoneRequest) returns (WaitQueueDoneResponse); // Contexts are always created with a deadline and no RPCs within a deadline // will trigger a context garbage collection. KeepAlive calls can be used to // delay this. rpc KeepAlive(KeepAliveRequest) returns (KeepAliveResponse); // Closes the context. No calls to other methods using the existing context ID // are valid after this. rpc CloseContext(CloseContextRequest) returns (CloseContextResponse); // Takes a FunctionDef and makes it enqueable on the remote worker. rpc RegisterFunction(RegisterFunctionRequest) returns (RegisterFunctionResponse); // An RPC to push tensors to the server. At times, certain environments don't // allow the server to connect back to the client. rpc SendTensor(SendTensorRequest) returns (SendTensorResponse); }