// Copyright 2024 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.dataproc.v1;

import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/timestamp.proto";

option go_package = "cloud.google.com/go/dataproc/v2/apiv1/dataprocpb;dataprocpb";
option java_multiple_files = true;
option java_outer_classname = "SharedProto";
option java_package = "com.google.cloud.dataproc.v1";
option (google.api.resource_definition) = {
  type: "container.googleapis.com/Cluster"
  pattern: "projects/{project}/locations/{location}/clusters/{cluster}"
};
option (google.api.resource_definition) = {
  type: "metastore.googleapis.com/Service"
  pattern: "projects/{project}/locations/{location}/services/{service}"
};
option (google.api.resource_definition) = {
  type: "cloudkms.googleapis.com/CryptoKey"
  pattern: "projects/{project}/locations/{location}/keyRings/{key_ring}/cryptoKeys/{crypto_key}"
};

// Runtime configuration for a workload.
message RuntimeConfig {
  // Optional. Version of the batch runtime.
  string version = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Custom container image for the job runtime environment.
  // If not specified, a default container image will be used.
  string container_image = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. A mapping of property names to values, which are used to
  // configure workload execution.
  map<string, string> properties = 3 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Dependency repository configuration.
  RepositoryConfig repository_config = 5
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. Autotuning configuration of the workload.
  AutotuningConfig autotuning_config = 6
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. Cohort identifier. Identifies families of the workloads having
  // the same shape, e.g. daily ETL jobs.
  string cohort = 7 [(google.api.field_behavior) = OPTIONAL];
}

// Environment configuration for a workload.
message EnvironmentConfig {
  // Optional. Execution configuration for a workload.
  ExecutionConfig execution_config = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Peripherals configuration that workload has access to.
  PeripheralsConfig peripherals_config = 2
      [(google.api.field_behavior) = OPTIONAL];
}

// Execution configuration for a workload.
message ExecutionConfig {
  // Optional. Service account used to execute the workload.
  string service_account = 2 [(google.api.field_behavior) = OPTIONAL];

  // Network configuration for workload execution.
  oneof network {
    // Optional. Network URI to connect workload to.
    string network_uri = 4 [(google.api.field_behavior) = OPTIONAL];

    // Optional. Subnetwork URI to connect workload to.
    string subnetwork_uri = 5 [(google.api.field_behavior) = OPTIONAL];
  }

  // Optional. Tags used for network traffic control.
  repeated string network_tags = 6 [(google.api.field_behavior) = OPTIONAL];
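
  // Illustrative example (not part of the API surface): because `network_uri`
  // and `subnetwork_uri` belong to the same `oneof network`, a request sets at
  // most one of them. A hypothetical ExecutionConfig in JSON form might look
  // like:
  //
  //   {
  //     "serviceAccount": "workload-sa@example-project.iam.gserviceaccount.com",
  //     "subnetworkUri": "projects/example-project/regions/us-central1/subnetworks/example-subnet",
  //     "networkTags": ["dataproc-serverless"]
  //   }
  //
  // The service account, project, subnetwork, and tag names above are
  // placeholders, not values defined by this proto.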

  // Optional. The Cloud KMS key to use for encryption.
  string kms_key = 7 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Applies to sessions only. The duration to keep the session alive
  // while it's idling. Exceeding this threshold causes the session to
  // terminate. This field cannot be set on a batch workload. Minimum value is
  // 10 minutes; maximum value is 14 days (see JSON representation of
  // [Duration](https://developers.google.com/protocol-buffers/docs/proto3#json)).
  // Defaults to 1 hour if not set.
  // If both `ttl` and `idle_ttl` are specified for an interactive session,
  // the conditions are treated as `OR` conditions: the workload will be
  // terminated when it has been idle for `idle_ttl` or when `ttl` has been
  // exceeded, whichever occurs first.
  google.protobuf.Duration idle_ttl = 8
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. The duration after which the workload will be terminated,
  // specified as the JSON representation for
  // [Duration](https://protobuf.dev/programming-guides/proto3/#json).
  // When the workload exceeds this duration, it will be unconditionally
  // terminated without waiting for ongoing work to finish. If `ttl` is not
  // specified for a batch workload, the workload will be allowed to run until
  // it exits naturally (or run forever without exiting). If `ttl` is not
  // specified for an interactive session, it defaults to 24 hours. If `ttl` is
  // not specified for a batch that uses 2.1+ runtime version, it defaults to 4
  // hours. Minimum value is 10 minutes; maximum value is 14 days. If both
  // `ttl` and `idle_ttl` are specified (for an interactive session), the
  // conditions are treated as `OR` conditions: the workload will be terminated
  // when it has been idle for `idle_ttl` or when `ttl` has been exceeded,
  // whichever occurs first.
  google.protobuf.Duration ttl = 9 [(google.api.field_behavior) = OPTIONAL];

  // Optional. A Cloud Storage bucket used to stage workload dependencies,
  // config files, and store workload output and other ephemeral data, such as
  // Spark history files. If you do not specify a staging bucket, Cloud
  // Dataproc will determine a Cloud Storage location according to the region
  // where your workload is running, and then create and manage project-level,
  // per-location staging and temporary buckets.
  // **This field requires a Cloud Storage bucket name, not a `gs://...` URI
  // to a Cloud Storage bucket.**
  string staging_bucket = 10 [(google.api.field_behavior) = OPTIONAL];
}

// Spark History Server configuration for the workload.
message SparkHistoryServerConfig {
  // Optional. Resource name of an existing Dataproc Cluster to act as a Spark
  // History Server for the workload.
  //
  // Example:
  //
  // * `projects/[project_id]/regions/[region]/clusters/[cluster_name]`
  string dataproc_cluster = 1 [(google.api.field_behavior) = OPTIONAL];
}

// Auxiliary services configuration for a workload.
message PeripheralsConfig {
  // Optional. Resource name of an existing Dataproc Metastore service.
  //
  // Example:
  //
  // * `projects/[project_id]/locations/[region]/services/[service_id]`
  string metastore_service = 1 [
    (google.api.field_behavior) = OPTIONAL,
    (google.api.resource_reference) = {
      type: "metastore.googleapis.com/Service"
    }
  ];

  // Optional. The Spark History Server configuration for the workload.
  SparkHistoryServerConfig spark_history_server_config = 2
      [(google.api.field_behavior) = OPTIONAL];
}
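
// Illustrative example (not part of the API surface): in the proto3 JSON
// mapping, a Duration is encoded as a string of seconds, so an interactive
// session that should terminate after 8 hours or after 30 idle minutes,
// whichever comes first, would set:
//
//   {
//     "ttl": "28800s",
//     "idleTtl": "1800s"
//   }
//
// The specific values above are placeholders chosen within the documented
// 10-minute to 14-day range.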

// Runtime information about workload execution.
message RuntimeInfo {
  // Output only. Map of remote access endpoints (such as web interfaces and
  // APIs) to their URIs.
  map<string, string> endpoints = 1
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. A URI pointing to the location of the stdout and stderr of
  // the workload.
  string output_uri = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. A URI pointing to the location of the diagnostics tarball.
  string diagnostic_output_uri = 3 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Approximate workload resource usage, calculated when
  // the workload completes (see [Dataproc Serverless pricing]
  // (https://cloud.google.com/dataproc-serverless/pricing)).
  //
  // **Note:** This metric calculation may change in the future, for
  // example, to capture cumulative workload resource
  // consumption during workload execution (see the
  // [Dataproc Serverless release notes]
  // (https://cloud.google.com/dataproc-serverless/docs/release-notes)
  // for announcements, changes, fixes
  // and other Dataproc developments).
  UsageMetrics approximate_usage = 6
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Snapshot of current workload resource usage.
  UsageSnapshot current_usage = 7 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Usage metrics represent approximate total resources consumed by a workload.
message UsageMetrics {
  // Optional. DCU (Dataproc Compute Units) usage in (`milliDCU` x `seconds`)
  // (see [Dataproc Serverless pricing]
  // (https://cloud.google.com/dataproc-serverless/pricing)).
  int64 milli_dcu_seconds = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Shuffle storage usage in (`GB` x `seconds`) (see
  // [Dataproc Serverless pricing]
  // (https://cloud.google.com/dataproc-serverless/pricing)).
  int64 shuffle_storage_gb_seconds = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Accelerator usage in (`milliAccelerator` x `seconds`) (see
  // [Dataproc Serverless pricing]
  // (https://cloud.google.com/dataproc-serverless/pricing)).
  int64 milli_accelerator_seconds = 3 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Accelerator type being used, if any.
  string accelerator_type = 4 [(google.api.field_behavior) = OPTIONAL];
}

// The usage snapshot represents the resources consumed by a workload at a
// specified time.
message UsageSnapshot {
  // Optional. Milli (one-thousandth) Dataproc Compute Units (DCUs) (see
  // [Dataproc Serverless pricing]
  // (https://cloud.google.com/dataproc-serverless/pricing)).
  int64 milli_dcu = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Shuffle Storage in gigabytes (GB) (see [Dataproc Serverless
  // pricing](https://cloud.google.com/dataproc-serverless/pricing)).
  int64 shuffle_storage_gb = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Milli (one-thousandth) Dataproc Compute Units (DCUs) charged at
  // premium tier (see [Dataproc Serverless pricing]
  // (https://cloud.google.com/dataproc-serverless/pricing)).
  int64 milli_dcu_premium = 4 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Shuffle Storage in gigabytes (GB) charged at premium tier (see
  // [Dataproc Serverless pricing]
  // (https://cloud.google.com/dataproc-serverless/pricing)).
  int64 shuffle_storage_gb_premium = 5 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Milli (one-thousandth) accelerator (see [Dataproc
  // Serverless pricing](https://cloud.google.com/dataproc-serverless/pricing)).
  int64 milli_accelerator = 6 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Accelerator type being used, if any.
  string accelerator_type = 7 [(google.api.field_behavior) = OPTIONAL];
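
  // Illustrative example (not defined by this proto): the `milli*` fields are
  // thousandths of a unit, so a workload currently using 4 DCUs and 400 GB of
  // shuffle storage would be reported in a snapshot as `milli_dcu = 4000` and
  // `shuffle_storage_gb = 400`. In UsageMetrics above, running that 4-DCU
  // workload for one hour (3,600 seconds) would accumulate
  // `milli_dcu_seconds = 4000 * 3600 = 14,400,000`.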

  // Optional. The timestamp of the usage snapshot.
  google.protobuf.Timestamp snapshot_time = 3
      [(google.api.field_behavior) = OPTIONAL];
}

// The cluster's GKE config.
message GkeClusterConfig {
  // Optional. A target GKE cluster to deploy to. It must be in the same
  // project and region as the Dataproc cluster (the GKE cluster can be zonal
  // or regional). Format:
  // 'projects/{project}/locations/{location}/clusters/{cluster_id}'
  string gke_cluster_target = 2 [
    (google.api.field_behavior) = OPTIONAL,
    (google.api.resource_reference) = {
      type: "container.googleapis.com/Cluster"
    }
  ];

  // Optional. GKE node pools where workloads will be scheduled. At least one
  // node pool must be assigned the `DEFAULT`
  // [GkeNodePoolTarget.Role][google.cloud.dataproc.v1.GkeNodePoolTarget.Role].
  // If a `GkeNodePoolTarget` is not specified, Dataproc constructs a `DEFAULT`
  // `GkeNodePoolTarget`. Each role can be given to only one
  // `GkeNodePoolTarget`. All node pools must have the same location settings.
  repeated GkeNodePoolTarget node_pool_target = 3
      [(google.api.field_behavior) = OPTIONAL];
}

// The configuration for running the Dataproc cluster on Kubernetes.
message KubernetesClusterConfig {
  // Optional. A namespace within the Kubernetes cluster to deploy into. If
  // this namespace does not exist, it is created. If it exists, Dataproc
  // verifies that another Dataproc VirtualCluster is not installed into it.
  // If not specified, the name of the Dataproc Cluster is used.
  string kubernetes_namespace = 1 [(google.api.field_behavior) = OPTIONAL];

  oneof config {
    // Required. The configuration for running the Dataproc cluster on GKE.
    GkeClusterConfig gke_cluster_config = 2
        [(google.api.field_behavior) = REQUIRED];
  }

  // Optional. The software configuration for this Dataproc cluster running on
  // Kubernetes.
  KubernetesSoftwareConfig kubernetes_software_config = 3
      [(google.api.field_behavior) = OPTIONAL];
}

// The software configuration for this Dataproc cluster running on Kubernetes.
message KubernetesSoftwareConfig {
  // The components that should be installed in this Dataproc cluster. The key
  // must be a string from the KubernetesComponent enumeration. The value is
  // the version of the software to be installed.
  // At least one entry must be specified.
  map<string, string> component_version = 1;

  // The properties to set on daemon config files.
  //
  // Property keys are specified in `prefix:property` format, for example
  // `spark:spark.kubernetes.container.image`. The following are supported
  // prefixes and their mappings:
  //
  // * spark: `spark-defaults.conf`
  //
  // For more information, see [Cluster
  // properties](https://cloud.google.com/dataproc/docs/concepts/cluster-properties).
  map<string, string> properties = 2;
}
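
// Illustrative example (not part of the API surface): a hypothetical
// KubernetesSoftwareConfig in JSON form, pinning the SPARK component version
// and overriding a `spark-defaults.conf` property via the `spark:` prefix:
//
//   {
//     "componentVersion": {"SPARK": "3.1-dataproc-14"},
//     "properties": {
//       "spark:spark.kubernetes.container.image": "gcr.io/example-project/example-spark:latest"
//     }
//   }
//
// The component version string and container image above are placeholders,
// not values defined by this proto.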

// GKE node pools that Dataproc workloads run on.
message GkeNodePoolTarget {
  // `Role` specifies the tasks that will run on the node pool. Roles can be
  // specific to workloads. Exactly one
  // [GkeNodePoolTarget][google.cloud.dataproc.v1.GkeNodePoolTarget] within the
  // virtual cluster must have the `DEFAULT` role, which is used to run all
  // workloads that are not associated with a node pool.
  enum Role {
    // Role is unspecified.
    ROLE_UNSPECIFIED = 0;

    // At least one node pool must have the `DEFAULT` role.
    // Work assigned to a role that is not associated with a node pool
    // is assigned to the node pool with the `DEFAULT` role. For example,
    // work assigned to the `CONTROLLER` role will be assigned to the node pool
    // with the `DEFAULT` role if no node pool has the `CONTROLLER` role.
    DEFAULT = 1;

    // Run work associated with the Dataproc control plane (for example,
    // controllers and webhooks). Very low resource requirements.
    CONTROLLER = 2;

    // Run work associated with a Spark driver of a job.
    SPARK_DRIVER = 3;

    // Run work associated with a Spark executor of a job.
    SPARK_EXECUTOR = 4;
  }

  // Required. The target GKE node pool.
  // Format:
  // 'projects/{project}/locations/{location}/clusters/{cluster}/nodePools/{node_pool}'
  string node_pool = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. The roles associated with the GKE node pool.
  repeated Role roles = 2 [(google.api.field_behavior) = REQUIRED];

  // Input only. The configuration for the GKE node pool.
  //
  // If specified, Dataproc attempts to create a node pool with the
  // specified shape. If one with the same name already exists, it is
  // verified against all specified fields. If a field differs, the
  // virtual cluster creation will fail.
  //
  // If omitted, any node pool with the specified name is used. If a
  // node pool with the specified name does not exist, Dataproc creates a
  // node pool with default values.
  //
  // This is an input only field. It will not be returned by the API.
  GkeNodePoolConfig node_pool_config = 3
      [(google.api.field_behavior) = INPUT_ONLY];
}

// The configuration of a GKE node pool used by a [Dataproc-on-GKE
// cluster](https://cloud.google.com/dataproc/docs/concepts/jobs/dataproc-gke#create-a-dataproc-on-gke-cluster).
message GkeNodePoolConfig {
  // Parameters that describe cluster nodes.
  message GkeNodeConfig {
    // Optional. The name of a Compute Engine [machine
    // type](https://cloud.google.com/compute/docs/machine-types).
    string machine_type = 1 [(google.api.field_behavior) = OPTIONAL];

    // Optional. The number of local SSD disks to attach to the node, which is
    // limited by the maximum number of disks allowable per zone (see [Adding
    // Local SSDs](https://cloud.google.com/compute/docs/disks/local-ssd)).
    int32 local_ssd_count = 7 [(google.api.field_behavior) = OPTIONAL];

    // Optional. Whether the nodes are created as legacy [preemptible VM
    // instances] (https://cloud.google.com/compute/docs/instances/preemptible).
    // Also see
    // [Spot][google.cloud.dataproc.v1.GkeNodePoolConfig.GkeNodeConfig.spot]
    // VMs, preemptible VM instances without a maximum lifetime. Legacy and
    // Spot preemptible nodes cannot be used in a node pool with the
    // `CONTROLLER`
    // [role]
    // (/dataproc/docs/reference/rest/v1/projects.regions.clusters#role)
    // or in the DEFAULT node pool if the CONTROLLER role is not assigned (the
    // DEFAULT node pool will assume the CONTROLLER role).
    bool preemptible = 10 [(google.api.field_behavior) = OPTIONAL];

    // Optional. A list of [hardware
    // accelerators](https://cloud.google.com/compute/docs/gpus) to attach to
    // each node.
    repeated GkeNodePoolAcceleratorConfig accelerators = 11
        [(google.api.field_behavior) = OPTIONAL];

    // Optional. [Minimum CPU
    // platform](https://cloud.google.com/compute/docs/instances/specify-min-cpu-platform)
    // to be used by this instance. The instance may be scheduled on the
    // specified or a newer CPU platform. Specify the friendly names of CPU
    // platforms, such as "Intel Haswell" or "Intel Sandy Bridge".
    string min_cpu_platform = 13 [(google.api.field_behavior) = OPTIONAL];
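
    // Illustrative example (not part of the API surface): a hypothetical
    // GkeNodeConfig for a Spot-based executor node pool in JSON form:
    //
    //   {
    //     "machineType": "n1-standard-8",
    //     "localSsdCount": 2,
    //     "spot": true,
    //     "minCpuPlatform": "Intel Haswell"
    //   }
    //
    // Because Spot and preemptible nodes cannot serve the `CONTROLLER` role,
    // a configuration like this is only suitable for node pools with roles
    // such as `SPARK_EXECUTOR`. The machine type and counts above are
    // placeholders.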

    // Optional. The [Customer Managed Encryption Key (CMEK)]
    // (https://cloud.google.com/kubernetes-engine/docs/how-to/using-cmek)
    // used to encrypt the boot disk attached to each node in the node pool.
    // Specify the key using the following format:
    // projects/KEY_PROJECT_ID/locations/LOCATION/keyRings/RING_NAME/cryptoKeys/KEY_NAME.
    string boot_disk_kms_key = 23 [(google.api.field_behavior) = OPTIONAL];

    // Optional. Whether the nodes are created as [Spot VM instances]
    // (https://cloud.google.com/compute/docs/instances/spot).
    // Spot VMs are the latest update to legacy
    // [preemptible
    // VMs][google.cloud.dataproc.v1.GkeNodePoolConfig.GkeNodeConfig.preemptible].
    // Spot VMs do not have a maximum lifetime. Legacy and Spot preemptible
    // nodes cannot be used in a node pool with the `CONTROLLER`
    // [role](/dataproc/docs/reference/rest/v1/projects.regions.clusters#role)
    // or in the DEFAULT node pool if the CONTROLLER role is not assigned (the
    // DEFAULT node pool will assume the CONTROLLER role).
    bool spot = 32 [(google.api.field_behavior) = OPTIONAL];
  }

  // A GkeNodePoolAcceleratorConfig represents a Hardware Accelerator request
  // for a node pool.
  message GkeNodePoolAcceleratorConfig {
    // The number of accelerator cards exposed to an instance.
    int64 accelerator_count = 1;

    // The accelerator type resource name (see GPUs on Compute Engine).
    string accelerator_type = 2;

    // Size of partitions to create on the GPU. Valid values are described in
    // the NVIDIA [mig user
    // guide](https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#partitioning).
    string gpu_partition_size = 3;
  }

  // GkeNodePoolAutoscaling contains information the cluster autoscaler needs
  // to adjust the size of the node pool to the current cluster usage.
  message GkeNodePoolAutoscalingConfig {
    // The minimum number of nodes in the node pool. Must be >= 0 and <=
    // max_node_count.
    int32 min_node_count = 2;

    // The maximum number of nodes in the node pool. Must be >= min_node_count,
    // and must be > 0.
    // **Note:** Quota must be sufficient to scale up the cluster.
    int32 max_node_count = 3;
  }

  // Optional. The node pool configuration.
  GkeNodeConfig config = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. The list of Compute Engine
  // [zones](https://cloud.google.com/compute/docs/zones#available) where
  // node pool nodes associated with a Dataproc on GKE virtual cluster
  // will be located.
  //
  // **Note:** All node pools associated with a virtual cluster
  // must be located in the same region as the virtual cluster, and they must
  // be located in the same zone within that region.
  //
  // If a location is not specified during node pool creation, Dataproc on GKE
  // will choose the zone.
  repeated string locations = 13 [(google.api.field_behavior) = OPTIONAL];

  // Optional. The autoscaler configuration for this node pool. The autoscaler
  // is enabled only when a valid configuration is present.
  GkeNodePoolAutoscalingConfig autoscaling = 4
      [(google.api.field_behavior) = OPTIONAL];
}

// Autotuning configuration of the workload.
message AutotuningConfig {
  // Scenario represents a specific goal that autotuning will attempt to
  // achieve by modifying workloads.
  enum Scenario {
    // Default value.
    SCENARIO_UNSPECIFIED = 0;

    // Scaling recommendations such as initialExecutors.
    SCALING = 2;

    // Adding hints for potential relation broadcasts.
    BROADCAST_HASH_JOIN = 3;

    // Memory management for workloads.
    MEMORY = 4;
  }

  // Optional. Scenarios for which tunings are applied.
  repeated Scenario scenarios = 2 [(google.api.field_behavior) = OPTIONAL];
}
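
// Illustrative example (not part of the API surface): autotuning is driven by
// the `cohort` and `autotuning_config` fields of RuntimeConfig above. A
// hypothetical RuntimeConfig for a recurring nightly job might look like this
// in JSON form:
//
//   {
//     "cohort": "nightly-etl",
//     "autotuningConfig": {"scenarios": ["SCALING", "MEMORY"]}
//   }
//
// The cohort name is a placeholder; it only needs to be shared by workloads
// of the same shape.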

// Configuration for dependency repositories
message RepositoryConfig {
  // Optional. Configuration for PyPi repository.
  PyPiRepositoryConfig pypi_repository_config = 1
      [(google.api.field_behavior) = OPTIONAL];
}

// Configuration for PyPi repository
message PyPiRepositoryConfig {
  // Optional. PyPi repository address
  string pypi_repository = 1 [(google.api.field_behavior) = OPTIONAL];
}

// Cluster components that can be activated.
enum Component {
  // Unspecified component. Specifying this will cause Cluster creation to
  // fail.
  COMPONENT_UNSPECIFIED = 0;

  // The Anaconda component is no longer supported or applicable to
  // [supported Dataproc on Compute Engine image versions]
  // (https://cloud.google.com/dataproc/docs/concepts/versioning/dataproc-version-clusters#supported-dataproc-image-versions).
  // It cannot be activated on clusters created with supported Dataproc on
  // Compute Engine image versions.
  ANACONDA = 5;

  // Docker
  DOCKER = 13;

  // The Druid query engine. (alpha)
  DRUID = 9;

  // Flink
  FLINK = 14;

  // HBase. (beta)
  HBASE = 11;

  // The Hive Web HCatalog (the REST service for accessing HCatalog).
  HIVE_WEBHCAT = 3;

  // Hudi.
  HUDI = 18;

  // The Jupyter Notebook.
  JUPYTER = 1;

  // The Presto query engine.
  PRESTO = 6;

  // The Trino query engine.
  TRINO = 17;

  // The Ranger service.
  RANGER = 12;

  // The Solr service.
  SOLR = 10;

  // The Zeppelin notebook.
  ZEPPELIN = 4;

  // The Zookeeper service.
  ZOOKEEPER = 8;
}

// Actions in response to failure of a resource associated with a cluster.
enum FailureAction {
  // When FailureAction is unspecified, failure action defaults to NO_ACTION.
  FAILURE_ACTION_UNSPECIFIED = 0;

  // Take no action on failure to create a cluster resource. NO_ACTION is the
  // default.
  NO_ACTION = 1;

  // Delete the failed cluster resource.
  DELETE = 2;
}