// Copyright 2024 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.dataflow.v1beta3;

import "google/api/field_behavior.proto";
import "google/protobuf/any.proto";
import "google/protobuf/struct.proto";

option csharp_namespace = "Google.Cloud.Dataflow.V1Beta3";
option go_package = "cloud.google.com/go/dataflow/apiv1beta3/dataflowpb;dataflowpb";
option java_multiple_files = true;
option java_outer_classname = "EnvironmentProto";
option java_package = "com.google.dataflow.v1beta3";
option php_namespace = "Google\\Cloud\\Dataflow\\V1beta3";
option ruby_package = "Google::Cloud::Dataflow::V1beta3";

// Describes the environment in which a Dataflow Job runs.
message Environment {
  // The prefix of the resources the system should use for temporary
  // storage. The system will append the suffix "/temp-{JOBNAME}" to
  // this resource prefix, where {JOBNAME} is the value of the
  // job_name field. The resulting bucket and object prefix is used
  // as the prefix of the resources used to store temporary data
  // needed during the job execution. NOTE: This will override the
  // value in taskrunner_settings.
  // The supported resource type is:
  //
  // Google Cloud Storage:
  //
  //   storage.googleapis.com/{bucket}/{object}
  //   bucket.storage.googleapis.com/{object}
  string temp_storage_prefix = 1;

  // The type of cluster manager API to use. If unknown or
  // unspecified, the service will attempt to choose a reasonable
  // default. This should be in the form of the API service name,
  // e.g. "compute.googleapis.com".
  string cluster_manager_api_service = 2;

  // The list of experiments to enable. This field should be used for SDK
  // related experiments and not for service related experiments. The proper
  // field for service related experiments is service_options.
  repeated string experiments = 3;

  // The list of service options to enable. This field should be used for
  // service related experiments only. These experiments, when graduating to
  // GA, should be replaced by dedicated fields or become default (i.e. always
  // on).
  repeated string service_options = 16;

  // If set, contains the Cloud KMS key identifier used to encrypt data
  // at rest, AKA a Customer Managed Encryption Key (CMEK).
  //
  // Format:
  //   projects/PROJECT_ID/locations/LOCATION/keyRings/KEY_RING/cryptoKeys/KEY
  string service_kms_key_name = 12;

  // The worker pools. At least one "harness" worker pool must be
  // specified in order for the job to have workers.
  repeated WorkerPool worker_pools = 4;

  // A description of the process that generated the request.
  google.protobuf.Struct user_agent = 5;

  // A structure describing which components of the service, and their
  // versions, are required in order to run the job.
  google.protobuf.Struct version = 6;

  // The dataset for the current project where various workflow
  // related tables are stored.
  //
  // The supported resource type is:
  //
  // Google BigQuery:
  //   bigquery.googleapis.com/{dataset}
  string dataset = 7;
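
  // A worked sketch of how the temp_storage_prefix suffixing described above
  // composes, using hypothetical names; with job_name "wordcount" and
  //
  //   temp_storage_prefix: "storage.googleapis.com/my-bucket/dataflow"
  //
  // the service stores temporary data under the prefix
  //
  //   storage.googleapis.com/my-bucket/dataflow/temp-wordcount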

  // The Cloud Dataflow SDK pipeline options specified by the user. These
  // options are passed through the service and are used to recreate the
  // SDK pipeline options on the worker in a language agnostic and platform
  // independent way.
  google.protobuf.Struct sdk_pipeline_options = 8;

  // Experimental settings.
  google.protobuf.Any internal_experiments = 9;

  // Identity to run virtual machines as. Defaults to the default account.
  string service_account_email = 10;

  // Which Flexible Resource Scheduling mode to run in.
  FlexResourceSchedulingGoal flex_resource_scheduling_goal = 11;

  // The Compute Engine region
  // (https://cloud.google.com/compute/docs/regions-zones/regions-zones) in
  // which worker processing should occur, e.g. "us-west1". Mutually exclusive
  // with worker_zone. If neither worker_region nor worker_zone is specified,
  // defaults to the control plane's region.
  string worker_region = 13;

  // The Compute Engine zone
  // (https://cloud.google.com/compute/docs/regions-zones/regions-zones) in
  // which worker processing should occur, e.g. "us-west1-a". Mutually exclusive
  // with worker_region. If neither worker_region nor worker_zone is specified,
  // a zone in the control plane's region is chosen based on available capacity.
  string worker_zone = 14;

  // Output only. The shuffle mode used for the job.
  ShuffleMode shuffle_mode = 15 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Any debugging options to be supplied to the job.
  DebugOptions debug_options = 17;
}

// The packages that must be installed in order for a worker to run the
// steps of the Cloud Dataflow job that will be assigned to its worker
// pool.
//
// This is the mechanism by which the Cloud Dataflow SDK causes code to
// be loaded onto the workers. For example, the Cloud Dataflow Java SDK
// might use this to install jars containing the user's code and all of the
// various dependencies (libraries, data files, etc.) required in order
// for that code to run.
message Package {
  // The name of the package.
  string name = 1;

  // The resource to read the package from. The supported resource type is:
  //
  // Google Cloud Storage:
  //
  //   storage.googleapis.com/{bucket}
  //   bucket.storage.googleapis.com/
  string location = 2;
}

// Specifies the processing model used by a
// [google.dataflow.v1beta3.Job], which determines the way the Job is
// managed by the Cloud Dataflow service (how workers are scheduled, how
// inputs are sharded, etc.).
enum JobType {
  // The type of the job is unspecified, or unknown.
  JOB_TYPE_UNKNOWN = 0;

  // A batch job with a well-defined end point: data is read, data is
  // processed, data is written, and the job is done.
  JOB_TYPE_BATCH = 1;

  // A continuously streaming job with no end: data is read,
  // processed, and written continuously.
  JOB_TYPE_STREAMING = 2;
}

// Specifies the resource to optimize for in Flexible Resource Scheduling.
enum FlexResourceSchedulingGoal {
  // Run in the default mode.
  FLEXRS_UNSPECIFIED = 0;

  // Optimize for lower execution time.
  FLEXRS_SPEED_OPTIMIZED = 1;

  // Optimize for lower cost.
  FLEXRS_COST_OPTIMIZED = 2;
}
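
// A minimal sketch, in text format, of how an Environment might be populated;
// the bucket, project, and service account names are hypothetical, and the
// region reuses the "us-west1" example above:
//
//   temp_storage_prefix: "storage.googleapis.com/my-bucket/staging"
//   service_account_email: "dataflow-worker@my-project.iam.gserviceaccount.com"
//   worker_region: "us-west1"
//   flex_resource_scheduling_goal: FLEXRS_COST_OPTIMIZED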

// Describes the data disk used by a workflow job.
message Disk {
  // Size of disk in GB. If zero or unspecified, the service will
  // attempt to choose a reasonable default.
  int32 size_gb = 1;

  // Disk storage type, as defined by Google Compute Engine. This
  // must be a disk type appropriate to the project and zone in which
  // the workers will run. If unknown or unspecified, the service
  // will attempt to choose a reasonable default.
  //
  // For example, the standard persistent disk type is a resource name
  // typically ending in "pd-standard". If SSD persistent disks are
  // available, the resource name typically ends with "pd-ssd". The
  // actual valid values are defined by the Google Compute Engine API,
  // not by the Cloud Dataflow API; consult the Google Compute Engine
  // documentation for more information about determining the set of
  // available disk types for a particular project and zone.
  //
  // Google Compute Engine Disk types are local to a particular
  // project in a particular zone, and so the resource name will
  // typically look something like this:
  //
  //   compute.googleapis.com/projects/project-id/zones/zone/diskTypes/pd-standard
  string disk_type = 2;

  // Directory in a VM where disk is mounted.
  string mount_point = 3;
}

// Provides data to pass through to the worker harness.
message WorkerSettings {
  // The base URL for accessing Google Cloud APIs.
  //
  // When workers access Google Cloud APIs, they logically do so via
  // relative URLs. If this field is specified, it supplies the base
  // URL to use for resolving these relative URLs. The normative
  // algorithm used is defined by RFC 1808, "Relative Uniform Resource
  // Locators".
  //
  // If not specified, the default value is "http://www.googleapis.com/"
  string base_url = 1;

  // Whether to send work progress updates to the service.
  bool reporting_enabled = 2;

  // The Cloud Dataflow service path relative to the root URL, for example,
  // "dataflow/v1b3/projects".
  string service_path = 3;

  // The Shuffle service path relative to the root URL, for example,
  // "shuffle/v1beta1".
  string shuffle_service_path = 4;

  // The ID of the worker running this pipeline.
  string worker_id = 5;

  // The prefix of the resources the system should use for temporary
  // storage.
  //
  // The supported resource type is:
  //
  // Google Cloud Storage:
  //
  //   storage.googleapis.com/{bucket}/{object}
  //   bucket.storage.googleapis.com/{object}
  string temp_storage_prefix = 6;
}
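
// A minimal sketch, in text format, of WorkerSettings using the documented
// default base URL and example service paths; the worker ID and bucket are
// hypothetical:
//
//   base_url: "http://www.googleapis.com/"
//   reporting_enabled: true
//   service_path: "dataflow/v1b3/projects"
//   shuffle_service_path: "shuffle/v1beta1"
//   worker_id: "wordcount-harness-1"
//   temp_storage_prefix: "storage.googleapis.com/my-bucket/temp"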

// Taskrunner configuration settings.
message TaskRunnerSettings {
  // The UNIX user ID on the worker VM to use for tasks launched by
  // taskrunner; e.g. "root".
  string task_user = 1;

  // The UNIX group ID on the worker VM to use for tasks launched by
  // taskrunner; e.g. "wheel".
  string task_group = 2;

  // The OAuth2 scopes to be requested by the taskrunner in order to
  // access the Cloud Dataflow API.
  repeated string oauth_scopes = 3;

  // The base URL for the taskrunner to use when accessing Google Cloud APIs.
  //
  // When workers access Google Cloud APIs, they logically do so via
  // relative URLs. If this field is specified, it supplies the base
  // URL to use for resolving these relative URLs. The normative
  // algorithm used is defined by RFC 1808, "Relative Uniform Resource
  // Locators".
  //
  // If not specified, the default value is "http://www.googleapis.com/"
  string base_url = 4;

  // The API version of the endpoint, e.g. "v1b3".
  string dataflow_api_version = 5;

  // The settings to pass to the parallel worker harness.
  WorkerSettings parallel_worker_settings = 6;

  // The location on the worker for task-specific subdirectories.
  string base_task_dir = 7;

  // Whether to continue taskrunner if an exception is hit.
  bool continue_on_exception = 8;

  // Whether to send taskrunner log info to Google Compute Engine VM serial
  // console.
  bool log_to_serialconsole = 9;

  // Whether to also send taskrunner log info to stderr.
  bool alsologtostderr = 10;

  // Indicates where to put logs. If this is not specified, the logs
  // will not be uploaded.
  //
  // The supported resource type is:
  //
  // Google Cloud Storage:
  //   storage.googleapis.com/{bucket}/{object}
  //   bucket.storage.googleapis.com/{object}
  string log_upload_location = 11;

  // The directory on the VM to store logs.
  string log_dir = 12;

  // The prefix of the resources the taskrunner should use for
  // temporary storage.
  //
  // The supported resource type is:
  //
  // Google Cloud Storage:
  //   storage.googleapis.com/{bucket}/{object}
  //   bucket.storage.googleapis.com/{object}
  string temp_storage_prefix = 13;

  // The command to launch the worker harness.
  string harness_command = 14;

  // The file to store the workflow in.
  string workflow_file_name = 15;

  // The file to store preprocessing commands in.
  string commandlines_file_name = 16;

  // The ID string of the VM.
  string vm_id = 17;

  // The suggested backend language.
  string language_hint = 18;

  // The streaming worker main class name.
  string streaming_worker_main_class = 19;
}

// Specifies what happens to a resource when a Cloud Dataflow
// [google.dataflow.v1beta3.Job][google.dataflow.v1beta3.Job] has completed.
enum TeardownPolicy {
  // The teardown policy isn't specified, or is unknown.
  TEARDOWN_POLICY_UNKNOWN = 0;

  // Always tear down the resource.
  TEARDOWN_ALWAYS = 1;

  // Tear down the resource on success. This is useful for debugging
  // failures.
  TEARDOWN_ON_SUCCESS = 2;

  // Never tear down the resource. This is useful for debugging and
  // development.
  TEARDOWN_NEVER = 3;
}

// The default set of packages to be staged on a pool of workers.
enum DefaultPackageSet {
  // The default set of packages to stage is unknown, or unspecified.
  DEFAULT_PACKAGE_SET_UNKNOWN = 0;

  // Indicates that no packages should be staged at the worker unless
  // explicitly specified by the job.
  DEFAULT_PACKAGE_SET_NONE = 1;

  // Stage packages typically useful to workers written in Java.
  DEFAULT_PACKAGE_SET_JAVA = 2;

  // Stage packages typically useful to workers written in Python.
  DEFAULT_PACKAGE_SET_PYTHON = 3;
}

// Specifies the algorithm used to determine the number of worker
// processes to run at any given point in time, based on the amount of
// data left to process, the number of workers, and how quickly
// existing workers are processing data.
enum AutoscalingAlgorithm {
  // The algorithm is unknown, or unspecified.
  AUTOSCALING_ALGORITHM_UNKNOWN = 0;

  // Disable autoscaling.
  AUTOSCALING_ALGORITHM_NONE = 1;

  // Increase worker count over time to reduce job execution time.
  AUTOSCALING_ALGORITHM_BASIC = 2;
}

// Settings for WorkerPool autoscaling.
message AutoscalingSettings {
  // The algorithm to use for autoscaling.
  AutoscalingAlgorithm algorithm = 1;

  // The maximum number of workers to cap scaling at.
  int32 max_num_workers = 2;
}

// Specifies how IP addresses should be allocated to the worker machines.
enum WorkerIPAddressConfiguration {
  // The configuration is unknown, or unspecified.
  WORKER_IP_UNSPECIFIED = 0;

  // Workers should have public IP addresses.
  WORKER_IP_PUBLIC = 1;

  // Workers should have private IP addresses.
  WORKER_IP_PRIVATE = 2;
}
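
// A minimal sketch, in text format, of AutoscalingSettings that enables the
// basic algorithm and caps the pool at a hypothetical 50 workers:
//
//   autoscaling_settings {
//     algorithm: AUTOSCALING_ALGORITHM_BASIC
//     max_num_workers: 50
//   }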

// Defines an SDK harness container for executing Dataflow pipelines.
message SdkHarnessContainerImage {
  // A docker container image that resides in Google Container Registry.
  string container_image = 1;

  // If true, recommends that the Dataflow service use only one core per SDK
  // container instance with this image. If false (or unset), recommends using
  // more than one core per SDK container instance with this image for
  // efficiency. Note that the Dataflow service may choose to override this
  // property if needed.
  bool use_single_core_per_container = 2;

  // Environment ID for the Beam runner API proto Environment that corresponds
  // to the current SDK Harness.
  string environment_id = 3;

  // The set of capabilities enumerated in the above Environment proto. See also
  // https://github.com/apache/beam/blob/master/model/pipeline/src/main/proto/beam_runner_api.proto
  repeated string capabilities = 4;
}

// Describes one particular pool of Cloud Dataflow workers to be
// instantiated by the Cloud Dataflow service in order to perform the
// computations required by a job. Note that a workflow job may use
// multiple pools, in order to match the various computational
// requirements of the various stages of the job.
message WorkerPool {
  // The kind of the worker pool; currently only `harness` and `shuffle`
  // are supported.
  string kind = 1;

  // Number of Google Compute Engine workers in this pool needed to
  // execute the job. If zero or unspecified, the service will
  // attempt to choose a reasonable default.
  int32 num_workers = 2;

  // Packages to be installed on workers.
  repeated Package packages = 3;

  // The default package set to install. This allows the service to
  // select a default set of packages which are useful to worker
  // harnesses written in a particular language.
  DefaultPackageSet default_package_set = 4;

  // Machine type (e.g. "n1-standard-1"). If empty or unspecified, the
  // service will attempt to choose a reasonable default.
  string machine_type = 5;

  // Sets the policy for determining when to turn down the worker pool.
  // Allowed values are: `TEARDOWN_ALWAYS`, `TEARDOWN_ON_SUCCESS`, and
  // `TEARDOWN_NEVER`.
  // `TEARDOWN_ALWAYS` means workers are always torn down regardless of whether
  // the job succeeds. `TEARDOWN_ON_SUCCESS` means workers are torn down
  // if the job succeeds. `TEARDOWN_NEVER` means the workers are never torn
  // down.
  //
  // If the workers are not torn down by the service, they will
  // continue to run and use Google Compute Engine VM resources in the
  // user's project until they are explicitly terminated by the user.
  // Because of this, Google recommends using the `TEARDOWN_ALWAYS`
  // policy except for small, manually supervised test jobs.
  //
  // If unknown or unspecified, the service will attempt to choose a reasonable
  // default.
  TeardownPolicy teardown_policy = 6;

  // Size of root disk for VMs, in GB. If zero or unspecified, the service will
  // attempt to choose a reasonable default.
  int32 disk_size_gb = 7;

  // Type of root disk for VMs. If empty or unspecified, the service will
  // attempt to choose a reasonable default.
  string disk_type = 16;

  // Fully qualified source image for disks.
  string disk_source_image = 8;

  // Zone to run the worker pools in. If empty or unspecified, the service
  // will attempt to choose a reasonable default.
  string zone = 9;

  // Settings passed through to Google Compute Engine workers when
  // using the standard Dataflow task runner. Users should ignore
  // this field.
  TaskRunnerSettings taskrunner_settings = 10;

  // The action to take on host maintenance, as defined by the Google
  // Compute Engine API.
  string on_host_maintenance = 11;

  // Data disks that are used by a VM in this workflow.
  repeated Disk data_disks = 12;

  // Metadata to set on the Google Compute Engine VMs.
  map<string, string> metadata = 13;
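
  // A brief sketch, in text format, of the map-valued metadata field above;
  // the keys and values are hypothetical:
  //
  //   metadata { key: "team" value: "data-eng" }
  //   metadata { key: "cost-center" value: "cc-1234" }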

  // Settings for autoscaling of this WorkerPool.
  AutoscalingSettings autoscaling_settings = 14;

  // Extra arguments for this worker pool.
  google.protobuf.Any pool_args = 15;

  // Network to which VMs will be assigned. If empty or unspecified,
  // the service will use the network "default".
  string network = 17;

  // Subnetwork to which VMs will be assigned, if desired. Expected to be of
  // the form "regions/REGION/subnetworks/SUBNETWORK".
  string subnetwork = 19;

  // Required. Docker container image that executes the Cloud Dataflow worker
  // harness, residing in Google Container Registry.
  //
  // Deprecated for the Fn API path. Use sdk_harness_container_images instead.
  string worker_harness_container_image = 18;

  // The number of threads per worker harness. If empty or unspecified, the
  // service will choose a number of threads (according to the number of cores
  // on the selected machine type for batch, or 1 by convention for streaming).
  int32 num_threads_per_worker = 20;

  // Configuration for VM IPs.
  WorkerIPAddressConfiguration ip_configuration = 21;

  // Set of SDK harness containers needed to execute this pipeline. This will
  // only be set in the Fn API path. For non-cross-language pipelines this
  // should have only one entry. Cross-language pipelines will have two or more
  // entries.
  repeated SdkHarnessContainerImage sdk_harness_container_images = 22;
}

// Specifies the shuffle mode used by a
// [google.dataflow.v1beta3.Job], which determines how data is shuffled
// during processing. More details in:
// https://cloud.google.com/dataflow/docs/guides/deploying-a-pipeline#dataflow-shuffle
enum ShuffleMode {
  // Shuffle mode information is not available.
  SHUFFLE_MODE_UNSPECIFIED = 0;

  // Shuffle is done on the worker VMs.
  VM_BASED = 1;

  // Shuffle is done on the service side.
  SERVICE_BASED = 2;
}

// Describes any options that have an effect on the debugging of pipelines.
message DebugOptions {
  // When true, enables the logging of the literal hot key to the user's Cloud
  // Logging.
  bool enable_hot_key_logging = 1;
}
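
// A minimal sketch, in text format, of a single entry in Environment's
// worker_pools; the machine type and zone reuse examples documented above,
// while the worker count and disk size are hypothetical:
//
//   worker_pools {
//     kind: "harness"
//     num_workers: 3
//     machine_type: "n1-standard-1"
//     zone: "us-west1-a"
//     disk_size_gb: 50
//     teardown_policy: TEARDOWN_ALWAYS
//     ip_configuration: WORKER_IP_PRIVATE
//   }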