// Copyright 2024 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.tpu.v1;

import "google/api/annotations.proto";
import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/longrunning/operations.proto";
import "google/protobuf/timestamp.proto";

option go_package = "cloud.google.com/go/tpu/apiv1/tpupb;tpupb";
option java_multiple_files = true;
option java_outer_classname = "CloudTpuProto";
option java_package = "com.google.cloud.tpu.v1";

// Manages TPU nodes and other resources
//
// TPU API v1
service Tpu {
  option (google.api.default_host) = "tpu.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/cloud-platform";

  // Lists nodes.
  rpc ListNodes(ListNodesRequest) returns (ListNodesResponse) {
    option (google.api.http) = {
      get: "/v1/{parent=projects/*/locations/*}/nodes"
    };
    option (google.api.method_signature) = "parent";
  }

  // Gets the details of a node.
  rpc GetNode(GetNodeRequest) returns (Node) {
    option (google.api.http) = {
      get: "/v1/{name=projects/*/locations/*/nodes/*}"
    };
    option (google.api.method_signature) = "name";
  }

  // Creates a node.
  //
  // Returns a long-running operation whose response type is `Node` and whose
  // metadata type is `OperationMetadata`.
  rpc CreateNode(CreateNodeRequest) returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1/{parent=projects/*/locations/*}/nodes"
      body: "node"
    };
    option (google.api.method_signature) = "parent,node,node_id";
    option (google.longrunning.operation_info) = {
      response_type: "Node"
      metadata_type: "OperationMetadata"
    };
  }

  // Deletes a node.
  //
  // Returns a long-running operation whose response type is `Node` and whose
  // metadata type is `OperationMetadata`.
  rpc DeleteNode(DeleteNodeRequest) returns (google.longrunning.Operation) {
    option (google.api.http) = {
      delete: "/v1/{name=projects/*/locations/*/nodes/*}"
    };
    option (google.api.method_signature) = "name";
    option (google.longrunning.operation_info) = {
      response_type: "Node"
      metadata_type: "OperationMetadata"
    };
  }

  // Reimages a node's OS.
  //
  // Returns a long-running operation whose response type is `Node` and whose
  // metadata type is `OperationMetadata`.
  rpc ReimageNode(ReimageNodeRequest) returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1/{name=projects/*/locations/*/nodes/*}:reimage"
      body: "*"
    };
    option (google.longrunning.operation_info) = {
      response_type: "Node"
      metadata_type: "OperationMetadata"
    };
  }

  // Stops a node, this operation is only available with single TPU nodes.
  //
  // Returns a long-running operation whose response type is `Node` and whose
  // metadata type is `OperationMetadata`.
  rpc StopNode(StopNodeRequest) returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1/{name=projects/*/locations/*/nodes/*}:stop"
      body: "*"
    };
    option (google.longrunning.operation_info) = {
      response_type: "Node"
      metadata_type: "OperationMetadata"
    };
  }

  // Starts a node.
  //
  // Returns a long-running operation whose response type is `Node` and whose
  // metadata type is `OperationMetadata`.
  rpc StartNode(StartNodeRequest) returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1/{name=projects/*/locations/*/nodes/*}:start"
      body: "*"
    };
    option (google.longrunning.operation_info) = {
      response_type: "Node"
      metadata_type: "OperationMetadata"
    };
  }

  // List TensorFlow versions supported by this API.
  rpc ListTensorFlowVersions(ListTensorFlowVersionsRequest)
      returns (ListTensorFlowVersionsResponse) {
    option (google.api.http) = {
      get: "/v1/{parent=projects/*/locations/*}/tensorflowVersions"
    };
    option (google.api.method_signature) = "parent";
  }

  // Gets TensorFlow Version.
  rpc GetTensorFlowVersion(GetTensorFlowVersionRequest)
      returns (TensorFlowVersion) {
    option (google.api.http) = {
      get: "/v1/{name=projects/*/locations/*/tensorflowVersions/*}"
    };
    option (google.api.method_signature) = "name";
  }

  // Lists accelerator types supported by this API.
  rpc ListAcceleratorTypes(ListAcceleratorTypesRequest)
      returns (ListAcceleratorTypesResponse) {
    option (google.api.http) = {
      get: "/v1/{parent=projects/*/locations/*}/acceleratorTypes"
    };
    option (google.api.method_signature) = "parent";
  }

  // Gets AcceleratorType.
  rpc GetAcceleratorType(GetAcceleratorTypeRequest) returns (AcceleratorType) {
    option (google.api.http) = {
      get: "/v1/{name=projects/*/locations/*/acceleratorTypes/*}"
    };
    option (google.api.method_signature) = "name";
  }
}

// Sets the scheduling options for this node.
message SchedulingConfig {
  // Defines whether the node is preemptible.
  bool preemptible = 1;

  // Whether the node is created under a reservation.
  bool reserved = 2;
}

// A network endpoint over which a TPU worker can be reached.
message NetworkEndpoint {
  // The IP address of this network endpoint.
  string ip_address = 1;

  // The port of this network endpoint.
  int32 port = 2;
}

// A TPU instance.
message Node {
  option (google.api.resource) = {
    type: "tpu.googleapis.com/Node"
    pattern: "projects/{project}/locations/{location}/nodes/{node}"
  };

  // Represents the different states of a TPU node during its lifecycle.
  enum State {
    // TPU node state is not known/set.
    STATE_UNSPECIFIED = 0;

    // TPU node is being created.
    CREATING = 1;

    // TPU node has been created.
    READY = 2;

    // TPU node is restarting.
    RESTARTING = 3;

    // TPU node is undergoing reimaging.
    REIMAGING = 4;

    // TPU node is being deleted.
    DELETING = 5;

    // TPU node is being repaired and may be unusable. Details can be
    // found in the `health_description` field.
    REPAIRING = 6;

    // NOTE(review): number 7 is not assigned in this enum. If it belonged to
    // a removed value it should be declared `reserved` -- confirm.

    // TPU node is stopped.
    STOPPED = 8;

    // TPU node is currently stopping.
    STOPPING = 9;

    // TPU node is currently starting.
    STARTING = 10;

    // TPU node has been preempted. Only applies to Preemptible TPU Nodes.
    PREEMPTED = 11;

    // TPU node has been terminated due to maintenance or has reached the end of
    // its life cycle (for preemptible nodes).
    TERMINATED = 12;

    // TPU node is currently hiding.
    HIDING = 13;

    // TPU node has been hidden.
    HIDDEN = 14;

    // TPU node is currently unhiding.
    UNHIDING = 15;
  }

  // Health defines the status of a TPU node as reported by
  // Health Monitor.
  enum Health {
    // Health status is unknown: not initialized or failed to retrieve.
    HEALTH_UNSPECIFIED = 0;

    // The resource is healthy.
    HEALTHY = 1;

    // The resource is unhealthy.
    DEPRECATED_UNHEALTHY = 2;

    // The resource is unresponsive.
    TIMEOUT = 3;

    // The in-guest ML stack is unhealthy.
    UNHEALTHY_TENSORFLOW = 4;

    // The node is under maintenance/priority boost caused rescheduling and
    // will resume running once rescheduled.
    UNHEALTHY_MAINTENANCE = 5;
  }

  // TPU API Version.
  enum ApiVersion {
    // API version is unknown.
    API_VERSION_UNSPECIFIED = 0;

    // TPU API V1Alpha1 version.
    V1_ALPHA1 = 1;

    // TPU API V1 version.
    V1 = 2;

    // TPU API V2Alpha1 version.
    V2_ALPHA1 = 3;
  }

  // Output only. Immutable. The name of the TPU
  string name = 1 [
    (google.api.field_behavior) = IMMUTABLE,
    (google.api.field_behavior) = OUTPUT_ONLY
  ];

  // The user-supplied description of the TPU. Maximum of 512 characters.
  string description = 3;

  // Required. The type of hardware accelerators associated with this node.
  string accelerator_type = 5 [(google.api.field_behavior) = REQUIRED];

  // Output only. DEPRECATED! Use network_endpoints instead.
  // The network address for the TPU Node as visible to Compute Engine
  // instances.
  string ip_address = 8 [deprecated = true];

  // Output only. DEPRECATED! Use network_endpoints instead.
  // The network port for the TPU Node as visible to Compute Engine instances.
  string port = 14 [deprecated = true];

  // Output only. The current state for the TPU Node.
  State state = 9 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. If this field is populated, it contains a description of why
  // the TPU Node is unhealthy.
  string health_description = 10 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Required. The version of Tensorflow running in the Node.
  string tensorflow_version = 11 [(google.api.field_behavior) = REQUIRED];

  // The name of a network they wish to peer the TPU node to. It must be a
  // preexisting Compute Engine network inside of the project on which this API
  // has been activated. If none is provided, "default" will be used.
  string network = 12;

  // The CIDR block that the TPU node will use when selecting an IP address.
  // This CIDR block must be a /29 block; the Compute Engine networks API
  // forbids a smaller block, and using a larger block would be wasteful (a
  // node can only consume one IP address). Errors will occur if the CIDR block
  // has already been used for a currently existing TPU node, the CIDR block
  // conflicts with any subnetworks in the user's provided network, or the
  // provided network is peered with another network that is using that CIDR
  // block.
  string cidr_block = 13;

  // Output only. The service account used to run the tensor flow services
  // within the node. To share resources, including Google Cloud Storage data,
  // with the Tensorflow job running in the Node, this account must have
  // permissions to that data.
  string service_account = 15 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The time when the node was created.
  google.protobuf.Timestamp create_time = 16
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // The scheduling options for this node.
  SchedulingConfig scheduling_config = 17;

  // Output only. The network endpoints where TPU workers can be accessed and
  // sent work. It is recommended that Tensorflow clients of the node reach out
  // to the 0th entry in this list first.
  repeated NetworkEndpoint network_endpoints = 21
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // The health status of the TPU node.
  Health health = 22;

  // Resource labels to represent user-provided metadata.
  map<string, string> labels = 24;

  // Whether the VPC peering for the node is set up through Service Networking
  // API. The VPC Peering should be set up before provisioning the node.
  // If this field is set, cidr_block field should not be specified. If the
  // network, that you want to peer the TPU Node to, is Shared VPC networks,
  // the node must be created with this field enabled.
  bool use_service_networking = 27;

  // Output only. The API version that created this Node.
  ApiVersion api_version = 38 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The Symptoms that have occurred to the TPU Node.
  repeated Symptom symptoms = 39 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Request for [ListNodes][google.cloud.tpu.v1.Tpu.ListNodes].
message ListNodesRequest {
  // Required. The parent resource name.
  string parent = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      child_type: "tpu.googleapis.com/Node"
    }
  ];

  // The maximum number of items to return.
  int32 page_size = 2;

  // The next_page_token value returned from a previous List request, if any.
  string page_token = 3;
}

// Response for [ListNodes][google.cloud.tpu.v1.Tpu.ListNodes].
message ListNodesResponse {
  // The listed nodes.
  repeated Node nodes = 1;

  // The next page token or empty if none.
  string next_page_token = 2;

  // Locations that could not be reached.
  repeated string unreachable = 3;
}

// Request for [GetNode][google.cloud.tpu.v1.Tpu.GetNode].
message GetNodeRequest {
  // Required. The resource name.
  string name = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = { type: "tpu.googleapis.com/Node" }
  ];
}

// Request for [CreateNode][google.cloud.tpu.v1.Tpu.CreateNode].
message CreateNodeRequest {
  // Required. The parent resource name.
  string parent = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      child_type: "tpu.googleapis.com/Node"
    }
  ];

  // The unqualified resource name.
  string node_id = 2;

  // Required. The node.
  Node node = 3 [(google.api.field_behavior) = REQUIRED];
}

// Request for [DeleteNode][google.cloud.tpu.v1.Tpu.DeleteNode].
message DeleteNodeRequest {
  // Required. The resource name.
  string name = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = { type: "tpu.googleapis.com/Node" }
  ];
}

// Request for [ReimageNode][google.cloud.tpu.v1.Tpu.ReimageNode].
message ReimageNodeRequest {
  // The resource name.
  string name = 1;

  // The version for reimage to create.
  string tensorflow_version = 2;
}

// Request for [StopNode][google.cloud.tpu.v1.Tpu.StopNode].
message StopNodeRequest {
  // The resource name.
  string name = 1;
}

// Request for [StartNode][google.cloud.tpu.v1.Tpu.StartNode].
message StartNodeRequest {
  // The resource name.
  string name = 1;
}

// A TensorFlow version that a Node can be configured with.
message TensorFlowVersion {
  // NOTE(review): the pattern below spells the collection segment
  // `tensorFlowVersions`, while the HTTP bindings of the Tpu service use
  // `tensorflowVersions`. Left unchanged here because it affects generated
  // resource-name helpers -- confirm which spelling the service returns.
  option (google.api.resource) = {
    type: "tpu.googleapis.com/TensorFlowVersion"
    pattern: "projects/{project}/locations/{location}/tensorFlowVersions/{tensor_flow_version}"
  };

  // The resource name.
  string name = 1;

  // The TensorFlow version.
  string version = 2;
}

// Request for
// [GetTensorFlowVersion][google.cloud.tpu.v1.Tpu.GetTensorFlowVersion].
message GetTensorFlowVersionRequest {
  // Required. The resource name.
  string name = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "tpu.googleapis.com/TensorFlowVersion"
    }
  ];
}

// Request for
// [ListTensorFlowVersions][google.cloud.tpu.v1.Tpu.ListTensorFlowVersions].
message ListTensorFlowVersionsRequest {
  // Required. The parent resource name.
  //
  // Annotated with `child_type` (not `type`): this field holds the name of
  // the parent of the TensorFlowVersion resources being listed, not the name
  // of a TensorFlowVersion itself.
  string parent = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      child_type: "tpu.googleapis.com/TensorFlowVersion"
    }
  ];

  // The maximum number of items to return.
  int32 page_size = 2;

  // The next_page_token value returned from a previous List request, if any.
  string page_token = 3;

  // List filter.
  string filter = 5;

  // Sort results.
  string order_by = 6;
}

// Response for
// [ListTensorFlowVersions][google.cloud.tpu.v1.Tpu.ListTensorFlowVersions].
message ListTensorFlowVersionsResponse {
  // The listed TensorFlow versions.
  repeated TensorFlowVersion tensorflow_versions = 1;

  // The next page token or empty if none.
  string next_page_token = 2;

  // Locations that could not be reached.
  repeated string unreachable = 3;
}

// An accelerator type that a Node can be configured with.
message AcceleratorType {
  option (google.api.resource) = {
    type: "tpu.googleapis.com/AcceleratorType"
    pattern: "projects/{project}/locations/{location}/acceleratorTypes/{accelerator_type}"
  };

  // The resource name.
  string name = 1;

  // The accelerator type.
  string type = 2;
}

// Request for [GetAcceleratorType][google.cloud.tpu.v1.Tpu.GetAcceleratorType].
message GetAcceleratorTypeRequest {
  // Required. The resource name.
  string name = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "tpu.googleapis.com/AcceleratorType"
    }
  ];
}

// Request for
// [ListAcceleratorTypes][google.cloud.tpu.v1.Tpu.ListAcceleratorTypes].
message ListAcceleratorTypesRequest {
  // Required. The parent resource name.
  //
  // Annotated with `child_type` (not `type`): this field holds the name of
  // the parent of the AcceleratorType resources being listed, not the name
  // of an AcceleratorType itself.
  string parent = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      child_type: "tpu.googleapis.com/AcceleratorType"
    }
  ];

  // The maximum number of items to return.
  int32 page_size = 2;

  // The next_page_token value returned from a previous List request, if any.
  string page_token = 3;

  // List filter.
  string filter = 5;

  // Sort results.
  string order_by = 6;
}

// Response for
// [ListAcceleratorTypes][google.cloud.tpu.v1.Tpu.ListAcceleratorTypes].
message ListAcceleratorTypesResponse {
  // The accelerator types that were listed.
  repeated AcceleratorType accelerator_types = 1;

  // Token for the next page; empty when there is none.
  string next_page_token = 2;

  // Locations that could not be reached.
  repeated string unreachable = 3;
}

// Metadata describing an [Operation][google.longrunning.Operation]
message OperationMetadata {
  // When the operation was created.
  google.protobuf.Timestamp create_time = 1;

  // When the operation finished running.
  google.protobuf.Timestamp end_time = 2;

  // Target of the operation - for example
  // projects/project-1/connectivityTests/test-1
  string target = 3;

  // The verb that the operation executed.
  string verb = 4;

  // Human-readable status of the operation, if any.
  string status_detail = 5;

  // Whether cancellation of the operation was requested.
  bool cancel_requested = 6;

  // API version.
  string api_version = 7;
}

// A Symptom instance.
message Symptom {
  // The different types of Symptom that can be recorded for a TPU.
  enum SymptomType {
    // Unspecified symptom.
    SYMPTOM_TYPE_UNSPECIFIED = 0;

    // TPU VM memory is low.
    LOW_MEMORY = 1;

    // TPU runtime is out of memory.
    OUT_OF_MEMORY = 2;

    // TPU runtime execution has timed out.
    EXECUTE_TIMED_OUT = 3;

    // TPU runtime fails to construct a mesh that recognizes each TPU device's
    // neighbors.
    MESH_BUILD_FAIL = 4;

    // TPU HBM is out of memory.
    HBM_OUT_OF_MEMORY = 5;

    // Abusive behaviors have been identified on the current project.
    PROJECT_ABUSE = 6;
  }

  // Timestamp when the Symptom is created.
  google.protobuf.Timestamp create_time = 1;

  // Type of the Symptom.
  SymptomType symptom_type = 2;

  // Detailed information of the current Symptom.
  string details = 3;

  // A string used to uniquely distinguish a worker within a TPU node.
  string worker_id = 4;
}