// Copyright 2024 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.tpu.v2;

import "google/api/annotations.proto";
import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/longrunning/operations.proto";
import "google/protobuf/empty.proto";
import "google/protobuf/field_mask.proto";
import "google/protobuf/timestamp.proto";

option go_package = "cloud.google.com/go/tpu/apiv2/tpupb;tpupb";
option java_multiple_files = true;
option java_outer_classname = "CloudTpuProto";
option java_package = "com.google.cloud.tpu.v2";

// Manages TPU nodes and other resources
//
// TPU API v2
service Tpu {
  option (google.api.default_host) = "tpu.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/cloud-platform";

  // Lists nodes.
  rpc ListNodes(ListNodesRequest) returns (ListNodesResponse) {
    option (google.api.http) = {
      get: "/v2/{parent=projects/*/locations/*}/nodes"
    };
    option (google.api.method_signature) = "parent";
  }

  // Gets the details of a node.
  rpc GetNode(GetNodeRequest) returns (Node) {
    option (google.api.http) = {
      get: "/v2/{name=projects/*/locations/*/nodes/*}"
    };
    option (google.api.method_signature) = "name";
  }

  // Creates a node.
  //
  // Returns a long-running operation whose response is a
  // [Node][google.cloud.tpu.v2.Node].
  rpc CreateNode(CreateNodeRequest) returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v2/{parent=projects/*/locations/*}/nodes"
      body: "node"
    };
    option (google.api.method_signature) = "parent,node,node_id";
    option (google.longrunning.operation_info) = {
      response_type: "Node"
      metadata_type: "OperationMetadata"
    };
  }

  // Deletes a node.
  //
  // Returns a long-running operation whose response is
  // [Empty][google.protobuf.Empty].
  rpc DeleteNode(DeleteNodeRequest) returns (google.longrunning.Operation) {
    option (google.api.http) = {
      delete: "/v2/{name=projects/*/locations/*/nodes/*}"
    };
    option (google.api.method_signature) = "name";
    option (google.longrunning.operation_info) = {
      response_type: "google.protobuf.Empty"
      metadata_type: "OperationMetadata"
    };
  }

  // Stops a node. This operation is only available with single TPU nodes.
  //
  // Returns a long-running operation whose response is a
  // [Node][google.cloud.tpu.v2.Node].
  rpc StopNode(StopNodeRequest) returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v2/{name=projects/*/locations/*/nodes/*}:stop"
      body: "*"
    };
    option (google.longrunning.operation_info) = {
      response_type: "Node"
      metadata_type: "OperationMetadata"
    };
  }

  // Starts a node.
  //
  // Returns a long-running operation whose response is a
  // [Node][google.cloud.tpu.v2.Node].
  rpc StartNode(StartNodeRequest) returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v2/{name=projects/*/locations/*/nodes/*}:start"
      body: "*"
    };
    option (google.longrunning.operation_info) = {
      response_type: "Node"
      metadata_type: "OperationMetadata"
    };
  }

  // Updates the configurations of a node.
  //
  // Returns a long-running operation whose response is a
  // [Node][google.cloud.tpu.v2.Node].
  rpc UpdateNode(UpdateNodeRequest) returns (google.longrunning.Operation) {
    option (google.api.http) = {
      patch: "/v2/{node.name=projects/*/locations/*/nodes/*}"
      body: "node"
    };
    option (google.api.method_signature) = "node,update_mask";
    option (google.longrunning.operation_info) = {
      response_type: "Node"
      metadata_type: "OperationMetadata"
    };
  }

  // Generates the Cloud TPU service identity for the project.
  rpc GenerateServiceIdentity(GenerateServiceIdentityRequest)
      returns (GenerateServiceIdentityResponse) {
    option (google.api.http) = {
      post: "/v2/{parent=projects/*/locations/*}:generateServiceIdentity"
      body: "*"
    };
  }

  // Lists accelerator types supported by this API.
  rpc ListAcceleratorTypes(ListAcceleratorTypesRequest)
      returns (ListAcceleratorTypesResponse) {
    option (google.api.http) = {
      get: "/v2/{parent=projects/*/locations/*}/acceleratorTypes"
    };
    option (google.api.method_signature) = "parent";
  }

  // Gets AcceleratorType.
  rpc GetAcceleratorType(GetAcceleratorTypeRequest) returns (AcceleratorType) {
    option (google.api.http) = {
      get: "/v2/{name=projects/*/locations/*/acceleratorTypes/*}"
    };
    option (google.api.method_signature) = "name";
  }

  // Lists runtime versions supported by this API.
  rpc ListRuntimeVersions(ListRuntimeVersionsRequest)
      returns (ListRuntimeVersionsResponse) {
    option (google.api.http) = {
      get: "/v2/{parent=projects/*/locations/*}/runtimeVersions"
    };
    option (google.api.method_signature) = "parent";
  }

  // Gets a runtime version.
  rpc GetRuntimeVersion(GetRuntimeVersionRequest) returns (RuntimeVersion) {
    option (google.api.http) = {
      get: "/v2/{name=projects/*/locations/*/runtimeVersions/*}"
    };
    option (google.api.method_signature) = "name";
  }

  // Retrieves the guest attributes for the node.
  rpc GetGuestAttributes(GetGuestAttributesRequest)
      returns (GetGuestAttributesResponse) {
    option (google.api.http) = {
      post: "/v2/{name=projects/*/locations/*/nodes/*}:getGuestAttributes"
      body: "*"
    };
  }
}

// A set of guest attributes returned for a queried path.
message GuestAttributes {
  // The path to be queried. This can be the default namespace ('/') or a
  // nested namespace ('/<namespace>/') or a specified key
  // ('/<namespace>/<key>')
  string query_path = 1;

  // The value of the requested queried path.
  GuestAttributesValue query_value = 2;
}

// Array of guest attribute namespace/key/value tuples.
message GuestAttributesValue {
  // The list of guest attributes entries.
  repeated GuestAttributesEntry items = 1;
}

// A guest attributes namespace/key/value entry.
message GuestAttributesEntry {
  // Namespace for the guest attribute entry.
  string namespace = 1;

  // Key for the guest attribute entry.
  string key = 2;

  // Value for the guest attribute entry.
  string value = 3;
}

// A node-attached disk resource.
// Next ID: 8;
message AttachedDisk {
  // The different mode of the attached disk.
  enum DiskMode {
    // The disk mode is not known/set.
    DISK_MODE_UNSPECIFIED = 0;

    // Attaches the disk in read-write mode. Only one TPU node can attach a disk
    // in read-write mode at a time.
    READ_WRITE = 1;

    // Attaches the disk in read-only mode. Multiple TPU nodes can attach
    // a disk in read-only mode at a time.
    READ_ONLY = 2;
  }

  // Specifies the full path to an existing disk.
  // For example: "projects/my-project/zones/us-central1-c/disks/my-disk".
  string source_disk = 3;

  // The mode in which to attach this disk.
  // If not specified, the default is READ_WRITE mode.
  // Only applicable to data_disks.
  DiskMode mode = 4;
}

// Sets the scheduling options for this node.
message SchedulingConfig {
  // Defines whether the node is preemptible.
  bool preemptible = 1;

  // Whether the node is created under a reservation.
  bool reserved = 2;
}

// A network endpoint over which a TPU worker can be reached.
message NetworkEndpoint {
  // The internal IP address of this network endpoint.
  string ip_address = 1;

  // The port of this network endpoint.
  int32 port = 2;

  // The access config for the TPU worker.
  AccessConfig access_config = 5;
}

// An access config attached to the TPU worker.
message AccessConfig {
  // Output only. An external IP address associated with the TPU worker.
  string external_ip = 1 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Network related configurations.
message NetworkConfig {
  // The name of the network for the TPU node. It must be a preexisting Google
  // Compute Engine network. If none is provided, "default" will be used.
  string network = 1;

  // The name of the subnetwork for the TPU node. It must be a preexisting
  // Google Compute Engine subnetwork. If none is provided, "default" will be
  // used.
  string subnetwork = 2;

  // Indicates that external IP addresses would be associated with the TPU
  // workers. If set to false, the specified subnetwork or network should have
  // Private Google Access enabled.
  bool enable_external_ips = 3;

  // Allows the TPU node to send and receive packets with non-matching
  // destination or source IPs. This is required if you plan to use the TPU
  // workers to forward routes.
  bool can_ip_forward = 4;
}

// A service account.
message ServiceAccount {
  // Email address of the service account. If empty, default Compute service
  // account will be used.
  string email = 1;

  // The list of scopes to be made available for this service account. If empty,
  // access to all Cloud APIs will be allowed.
  repeated string scope = 2;
}

// A TPU instance.
message Node {
  option (google.api.resource) = {
    type: "tpu.googleapis.com/Node"
    pattern: "projects/{project}/locations/{location}/nodes/{node}"
  };

  // Represents the different states of a TPU node during its lifecycle.
  enum State {
    // TPU node state is not known/set.
    STATE_UNSPECIFIED = 0;

    // TPU node is being created.
    CREATING = 1;

    // TPU node has been created.
    READY = 2;

    // TPU node is restarting.
    RESTARTING = 3;

    // TPU node is undergoing reimaging.
    REIMAGING = 4;

    // TPU node is being deleted.
    DELETING = 5;

    // TPU node is being repaired and may be unusable. Details can be
    // found in the 'health_description' field.
    REPAIRING = 6;

    // TPU node is stopped.
    STOPPED = 8;

    // TPU node is currently stopping.
    STOPPING = 9;

    // TPU node is currently starting.
    STARTING = 10;

    // TPU node has been preempted. Only applies to Preemptible TPU Nodes.
    PREEMPTED = 11;

    // TPU node has been terminated due to maintenance or has reached the end of
    // its life cycle (for preemptible nodes).
    TERMINATED = 12;

    // TPU node is currently hiding.
    HIDING = 13;

    // TPU node has been hidden.
    HIDDEN = 14;

    // TPU node is currently unhiding.
    UNHIDING = 15;
  }

  // Health defines the status of a TPU node as reported by
  // Health Monitor.
  enum Health {
    // Health status is unknown: not initialized or failed to retrieve.
    HEALTH_UNSPECIFIED = 0;

    // The resource is healthy.
    HEALTHY = 1;

    // The resource is unresponsive.
    TIMEOUT = 3;

    // The in-guest ML stack is unhealthy.
    UNHEALTHY_TENSORFLOW = 4;

    // The node is under maintenance/priority boost caused rescheduling and
    // will resume running once rescheduled.
    UNHEALTHY_MAINTENANCE = 5;
  }

  // TPU API Version.
  enum ApiVersion {
    // API version is unknown.
    API_VERSION_UNSPECIFIED = 0;

    // TPU API V1Alpha1 version.
    V1_ALPHA1 = 1;

    // TPU API V1 version.
    V1 = 2;

    // TPU API V2Alpha1 version.
    V2_ALPHA1 = 3;

    // TPU API V2 version.
    V2 = 4;
  }

  // Output only. Immutable. The name of the TPU.
  string name = 1 [
    (google.api.field_behavior) = OUTPUT_ONLY,
    (google.api.field_behavior) = IMMUTABLE
  ];

  // The user-supplied description of the TPU. Maximum of 512 characters.
  string description = 3;

  // Optional. The type of hardware accelerators associated with this node.
  string accelerator_type = 5 [(google.api.field_behavior) = OPTIONAL];

  // Output only. The current state for the TPU Node.
  State state = 9 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. If this field is populated, it contains a description of why
  // the TPU Node is unhealthy.
  string health_description = 10 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Required. The runtime version running in the Node.
  string runtime_version = 11 [(google.api.field_behavior) = REQUIRED];

  // Network configurations for the TPU node.
  NetworkConfig network_config = 36;

  // The CIDR block that the TPU node will use when selecting an IP address.
  // This CIDR block must be a /29 block; the Compute Engine networks API
  // forbids a smaller block, and using a larger block would be wasteful (a
  // node can only consume one IP address). Errors will occur if the CIDR block
  // has already been used for a currently existing TPU node, the CIDR block
  // conflicts with any subnetworks in the user's provided network, or the
  // provided network is peered with another network that is using that CIDR
  // block.
  string cidr_block = 13;

  // The Google Cloud Platform Service Account to be used by the TPU node VMs.
  // If none is specified, the default compute service account will be used.
  ServiceAccount service_account = 37;

  // Output only. The time when the node was created.
  google.protobuf.Timestamp create_time = 16
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // The scheduling options for this node.
  SchedulingConfig scheduling_config = 17;

  // Output only. The network endpoints where TPU workers can be accessed and
  // sent work. It is recommended that runtime clients of the node reach out
  // to the 0th entry in this list first.
  repeated NetworkEndpoint network_endpoints = 21
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // The health status of the TPU node.
  Health health = 22;

  // Resource labels to represent user-provided metadata.
  map<string, string> labels = 24;

  // Custom metadata to apply to the TPU Node.
  // Can set startup-script and shutdown-script
  map<string, string> metadata = 34;

  // Tags to apply to the TPU Node. Tags are used to identify valid sources or
  // targets for network firewalls.
  repeated string tags = 40;

  // Output only. The unique identifier for the TPU Node.
  int64 id = 33 [(google.api.field_behavior) = OUTPUT_ONLY];

  // The additional data disks for the Node.
  repeated AttachedDisk data_disks = 41;

  // Output only. The API version that created this Node.
  ApiVersion api_version = 38 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The Symptoms that have occurred to the TPU Node.
  repeated Symptom symptoms = 39 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Shielded Instance options.
  ShieldedInstanceConfig shielded_instance_config = 45;

  // The AcceleratorConfig for the TPU Node.
  AcceleratorConfig accelerator_config = 46;

  // Output only. The qualified name of the QueuedResource that requested this
  // Node.
  string queued_resource = 47 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Whether the Node belongs to a Multislice group.
  bool multislice_node = 48 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Request for [ListNodes][google.cloud.tpu.v2.Tpu.ListNodes].
message ListNodesRequest {
  // Required. The parent resource name.
  string parent = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = { child_type: "tpu.googleapis.com/Node" }
  ];

  // The maximum number of items to return.
  int32 page_size = 2;

  // The next_page_token value returned from a previous List request, if any.
  string page_token = 3;
}

// Response for [ListNodes][google.cloud.tpu.v2.Tpu.ListNodes].
message ListNodesResponse {
  // The listed nodes.
  repeated Node nodes = 1;

  // The next page token or empty if none.
  string next_page_token = 2;

  // Locations that could not be reached.
  repeated string unreachable = 3;
}

// Request for [GetNode][google.cloud.tpu.v2.Tpu.GetNode].
message GetNodeRequest {
  // Required. The resource name.
  string name = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = { type: "tpu.googleapis.com/Node" }
  ];
}

// Request for [CreateNode][google.cloud.tpu.v2.Tpu.CreateNode].
message CreateNodeRequest {
  // Required. The parent resource name.
  string parent = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "locations.googleapis.com/Location"
    }
  ];

  // The unqualified resource name.
  string node_id = 2;

  // Required. The node.
  Node node = 3 [(google.api.field_behavior) = REQUIRED];
}

// Request for [DeleteNode][google.cloud.tpu.v2.Tpu.DeleteNode].
message DeleteNodeRequest {
  // Required. The resource name.
  string name = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = { type: "tpu.googleapis.com/Node" }
  ];
}

// Request for [StopNode][google.cloud.tpu.v2.Tpu.StopNode].
message StopNodeRequest {
  // Required. The resource name.
  string name = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = { type: "tpu.googleapis.com/Node" }
  ];
}

// Request for [StartNode][google.cloud.tpu.v2.Tpu.StartNode].
message StartNodeRequest {
  // Required. The resource name.
  string name = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = { type: "tpu.googleapis.com/Node" }
  ];
}

// Request for [UpdateNode][google.cloud.tpu.v2.Tpu.UpdateNode].
message UpdateNodeRequest {
  // Required. Mask of fields from [Node][google.cloud.tpu.v2.Node] to update.
  // Supported fields: [description, tags, labels, metadata,
  // network_config.enable_external_ips].
  google.protobuf.FieldMask update_mask = 1
      [(google.api.field_behavior) = REQUIRED];

  // Required. The node. Only fields specified in update_mask are updated.
  Node node = 2 [(google.api.field_behavior) = REQUIRED];
}

// The per-product per-project service identity for Cloud TPU service.
message ServiceIdentity {
  // The email address of the service identity.
  string email = 1;
}

// Request for
// [GenerateServiceIdentity][google.cloud.tpu.v2.Tpu.GenerateServiceIdentity].
message GenerateServiceIdentityRequest {
  // Required. The parent resource name.
  string parent = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "locations.googleapis.com/Location"
    }
  ];
}

// Response for
// [GenerateServiceIdentity][google.cloud.tpu.v2.Tpu.GenerateServiceIdentity].
message GenerateServiceIdentityResponse {
  // ServiceIdentity that was created or retrieved.
  ServiceIdentity identity = 1;
}

// An accelerator type that a Node can be configured with.
message AcceleratorType {
  option (google.api.resource) = {
    type: "tpu.googleapis.com/AcceleratorType"
    pattern: "projects/{project}/locations/{location}/acceleratorTypes/{accelerator_type}"
  };

  // The resource name.
  string name = 1;

  // The accelerator type.
  string type = 2;

  // The accelerator configs.
  repeated AcceleratorConfig accelerator_configs = 3;
}

// Request for [GetAcceleratorType][google.cloud.tpu.v2.Tpu.GetAcceleratorType].
message GetAcceleratorTypeRequest {
  // Required. The resource name.
  string name = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "tpu.googleapis.com/AcceleratorType"
    }
  ];
}

// Request for
// [ListAcceleratorTypes][google.cloud.tpu.v2.Tpu.ListAcceleratorTypes].
message ListAcceleratorTypesRequest {
  // Required. The parent resource name.
  string parent = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      child_type: "tpu.googleapis.com/AcceleratorType"
    }
  ];

  // The maximum number of items to return.
  int32 page_size = 2;

  // The next_page_token value returned from a previous List request, if any.
  string page_token = 3;

  // List filter.
  string filter = 5;

  // Sort results.
  string order_by = 6;
}

// Response for
// [ListAcceleratorTypes][google.cloud.tpu.v2.Tpu.ListAcceleratorTypes].
message ListAcceleratorTypesResponse {
  // The listed accelerator types.
  repeated AcceleratorType accelerator_types = 1;

  // The next page token or empty if none.
  string next_page_token = 2;

  // Locations that could not be reached.
  repeated string unreachable = 3;
}

// A runtime version that a Node can be configured with.
message RuntimeVersion {
  option (google.api.resource) = {
    type: "tpu.googleapis.com/RuntimeVersion"
    pattern: "projects/{project}/locations/{location}/runtimeVersions/{runtime_version}"
  };

  // The resource name.
  string name = 1;

  // The runtime version.
  string version = 2;
}

// Request for [GetRuntimeVersion][google.cloud.tpu.v2.Tpu.GetRuntimeVersion].
message GetRuntimeVersionRequest {
  // Required. The resource name.
  string name = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "tpu.googleapis.com/RuntimeVersion"
    }
  ];
}

// Request for
// [ListRuntimeVersions][google.cloud.tpu.v2.Tpu.ListRuntimeVersions].
message ListRuntimeVersionsRequest {
  // Required. The parent resource name.
  string parent = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      child_type: "tpu.googleapis.com/RuntimeVersion"
    }
  ];

  // The maximum number of items to return.
  int32 page_size = 2;

  // The next_page_token value returned from a previous List request, if any.
  string page_token = 3;

  // List filter.
  string filter = 5;

  // Sort results.
  string order_by = 6;
}

// Response for
// [ListRuntimeVersions][google.cloud.tpu.v2.Tpu.ListRuntimeVersions].
message ListRuntimeVersionsResponse {
  // The listed runtime versions.
  repeated RuntimeVersion runtime_versions = 1;

  // The next page token or empty if none.
  string next_page_token = 2;

  // Locations that could not be reached.
  repeated string unreachable = 3;
}

// Metadata describing an [Operation][google.longrunning.Operation]
message OperationMetadata {
  // The time the operation was created.
  google.protobuf.Timestamp create_time = 1;

  // The time the operation finished running.
  google.protobuf.Timestamp end_time = 2;

  // Target of the operation - for example
  // projects/project-1/connectivityTests/test-1
  string target = 3;

  // Name of the verb executed by the operation.
  string verb = 4;

  // Human-readable status of the operation, if any.
  string status_detail = 5;

  // Specifies if cancellation was requested for the operation.
  bool cancel_requested = 6;

  // API version.
  string api_version = 7;
}

// A Symptom instance.
message Symptom {
  // SymptomType represents the different types of Symptoms that a TPU can
  // exhibit.
  enum SymptomType {
    // Unspecified symptom.
    SYMPTOM_TYPE_UNSPECIFIED = 0;

    // TPU VM memory is low.
    LOW_MEMORY = 1;

    // TPU runtime is out of memory.
    OUT_OF_MEMORY = 2;

    // TPU runtime execution has timed out.
    EXECUTE_TIMED_OUT = 3;

    // TPU runtime fails to construct a mesh that recognizes each TPU device's
    // neighbors.
    MESH_BUILD_FAIL = 4;

    // TPU HBM is out of memory.
    HBM_OUT_OF_MEMORY = 5;

    // Abusive behaviors have been identified on the current project.
    PROJECT_ABUSE = 6;
  }

  // Timestamp when the Symptom is created.
  google.protobuf.Timestamp create_time = 1;

  // Type of the Symptom.
  SymptomType symptom_type = 2;

  // Detailed information of the current Symptom.
  string details = 3;

  // A string used to uniquely distinguish a worker within a TPU node.
  string worker_id = 4;
}

// Request for [GetGuestAttributes][google.cloud.tpu.v2.Tpu.GetGuestAttributes].
message GetGuestAttributesRequest {
  // Required. The resource name.
  string name = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = { type: "tpu.googleapis.com/Node" }
  ];

  // The guest attributes path to be queried.
  string query_path = 2;

  // The 0-based worker ID. If it is empty, all workers' GuestAttributes will be
  // returned.
  repeated string worker_ids = 3;
}

// Response for
// [GetGuestAttributes][google.cloud.tpu.v2.Tpu.GetGuestAttributes].
message GetGuestAttributesResponse {
  // The guest attributes for the TPU workers.
  repeated GuestAttributes guest_attributes = 1;
}

// A TPU accelerator configuration.
message AcceleratorConfig {
  // TPU type.
  enum Type {
    // Unspecified version.
    TYPE_UNSPECIFIED = 0;

    // TPU v2.
    V2 = 2;

    // TPU v3.
    V3 = 4;

    // TPU v4.
    V4 = 7;
  }

  // Required. Type of TPU.
  Type type = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Topology of TPU in chips.
  string topology = 2 [(google.api.field_behavior) = REQUIRED];
}

// A set of Shielded Instance options.
message ShieldedInstanceConfig {
  // Defines whether the instance has Secure Boot enabled.
  bool enable_secure_boot = 1;
}