// Copyright 2024 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.aiplatform.v1beta1;

import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/cloud/aiplatform/v1beta1/encryption_spec.proto";
import "google/cloud/aiplatform/v1beta1/env_var.proto";
import "google/cloud/aiplatform/v1beta1/io.proto";
import "google/cloud/aiplatform/v1beta1/job_state.proto";
import "google/cloud/aiplatform/v1beta1/machine_resources.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/timestamp.proto";
import "google/rpc/status.proto";

option csharp_namespace = "Google.Cloud.AIPlatform.V1Beta1";
option go_package = "cloud.google.com/go/aiplatform/apiv1beta1/aiplatformpb;aiplatformpb";
option java_multiple_files = true;
option java_outer_classname = "CustomJobProto";
option java_package = "com.google.cloud.aiplatform.v1beta1";
option php_namespace = "Google\\Cloud\\AIPlatform\\V1beta1";
option ruby_package = "Google::Cloud::AIPlatform::V1beta1";

// Represents a job that runs custom workloads such as a Docker container or a
// Python package. A CustomJob can have multiple worker pools and each worker
// pool can have its own machine and input spec. A CustomJob will be cleaned up
// once the job enters terminal state (failed or succeeded).
message CustomJob {
  option (google.api.resource) = {
    type: "aiplatform.googleapis.com/CustomJob"
    pattern: "projects/{project}/locations/{location}/customJobs/{custom_job}"
  };

  // Output only. Resource name of a CustomJob.
  string name = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Required. The display name of the CustomJob.
  // The name can be up to 128 characters long and can consist of any UTF-8
  // characters.
  string display_name = 2 [(google.api.field_behavior) = REQUIRED];

  // Required. Job spec.
  CustomJobSpec job_spec = 4 [(google.api.field_behavior) = REQUIRED];

  // Output only. The detailed state of the job.
  JobState state = 5 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Time when the CustomJob was created.
  google.protobuf.Timestamp create_time = 6
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Time when the CustomJob for the first time entered the
  // `JOB_STATE_RUNNING` state.
  google.protobuf.Timestamp start_time = 7
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Time when the CustomJob entered any of the following states:
  // `JOB_STATE_SUCCEEDED`, `JOB_STATE_FAILED`, `JOB_STATE_CANCELLED`.
  google.protobuf.Timestamp end_time = 8
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Time when the CustomJob was most recently updated.
  google.protobuf.Timestamp update_time = 9
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Only populated when job's state is `JOB_STATE_FAILED` or
  // `JOB_STATE_CANCELLED`.
  google.rpc.Status error = 10 [(google.api.field_behavior) = OUTPUT_ONLY];

  // The labels with user-defined metadata to organize CustomJobs.
  //
  // Label keys and values can be no longer than 64 characters
  // (Unicode codepoints), can only contain lowercase letters, numeric
  // characters, underscores and dashes. International characters are allowed.
  //
  // See https://goo.gl/xmQnxf for more information and examples of labels.
  map<string, string> labels = 11;

  // Customer-managed encryption key options for a CustomJob. If this is set,
  // then all resources created by the CustomJob will be encrypted with the
  // provided encryption key.
  EncryptionSpec encryption_spec = 12;

  // Output only. URIs for accessing [interactive
  // shells](https://cloud.google.com/vertex-ai/docs/training/monitor-debug-interactive-shell)
  // (one URI for each training node). Only available if
  // [job_spec.enable_web_access][google.cloud.aiplatform.v1beta1.CustomJobSpec.enable_web_access]
  // is `true`.
  //
  // The keys are names of each node in the training job; for example,
  // `workerpool0-0` for the primary node, `workerpool1-0` for the first node in
  // the second worker pool, and `workerpool1-1` for the second node in the
  // second worker pool.
  //
  // The values are the URIs for each node's interactive shell.
  map<string, string> web_access_uris = 16
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Reserved for future use.
  bool satisfies_pzs = 18 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Reserved for future use.
  bool satisfies_pzi = 19 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Represents the spec of a CustomJob.
message CustomJobSpec {
  // Optional. The ID of the PersistentResource in the same Project and Location
  // on which to run the job.
  //
  // If this is specified, the job will be run on existing machines held by the
  // PersistentResource instead of on-demand short-lived machines.
  // The network and CMEK configs on the job should be consistent with those on
  // the PersistentResource, otherwise, the job will be rejected.
  string persistent_resource_id = 14 [
    (google.api.field_behavior) = OPTIONAL,
    (google.api.resource_reference) = {
      type: "aiplatform.googleapis.com/PersistentResource"
    }
  ];

  // Required. The spec of the worker pools including machine type and Docker
  // image. All worker pools except the first one are optional and can be
  // skipped by providing an empty value.
  repeated WorkerPoolSpec worker_pool_specs = 1
      [(google.api.field_behavior) = REQUIRED];

  // Scheduling options for a CustomJob.
  Scheduling scheduling = 3;

  // Specifies the service account for workload run-as account.
  // Users submitting jobs must have act-as permission on this run-as account.
  // If unspecified, the [Vertex AI Custom Code Service
  // Agent](https://cloud.google.com/vertex-ai/docs/general/access-control#service-agents)
  // for the CustomJob's project is used.
  string service_account = 4;

  // Optional. The full name of the Compute Engine
  // [network](/compute/docs/networks-and-firewalls#networks) to which the Job
  // should be peered. For example, `projects/12345/global/networks/myVPC`.
  // [Format](/compute/docs/reference/rest/v1/networks/insert)
  // is of the form `projects/{project}/global/networks/{network}`.
  // Where {project} is a project number, as in `12345`, and {network} is a
  // network name.
  //
  // To specify this field, you must have already [configured VPC Network
  // Peering for Vertex
  // AI](https://cloud.google.com/vertex-ai/docs/general/vpc-peering).
  //
  // If this field is left unspecified, the job is not peered with any network.
  string network = 5 [
    (google.api.field_behavior) = OPTIONAL,
    (google.api.resource_reference) = { type: "compute.googleapis.com/Network" }
  ];

  // Optional. A list of names for the reserved ip ranges under the VPC network
  // that can be used for this job.
  //
  // If set, we will deploy the job within the provided ip ranges. Otherwise,
  // the job will be deployed to any ip ranges under the provided VPC
  // network.
  //
  // Example: ['vertex-ai-ip-range'].
  repeated string reserved_ip_ranges = 13
      [(google.api.field_behavior) = OPTIONAL];

  // The Cloud Storage location to store the output of this CustomJob or
  // HyperparameterTuningJob. For HyperparameterTuningJob,
  // the baseOutputDirectory of
  // each child CustomJob backing a Trial is set to a subdirectory of name
  // [id][google.cloud.aiplatform.v1beta1.Trial.id] under its parent
  // HyperparameterTuningJob's baseOutputDirectory.
  //
  // The following Vertex AI environment variables will be passed to
  // containers or python modules when this field is set:
  //
  //   For CustomJob:
  //
  //   * AIP_MODEL_DIR = `<base_output_directory>/model/`
  //   * AIP_CHECKPOINT_DIR = `<base_output_directory>/checkpoints/`
  //   * AIP_TENSORBOARD_LOG_DIR = `<base_output_directory>/logs/`
  //
  //   For CustomJob backing a Trial of HyperparameterTuningJob:
  //
  //   * AIP_MODEL_DIR = `<base_output_directory>/<trial_id>/model/`
  //   * AIP_CHECKPOINT_DIR = `<base_output_directory>/<trial_id>/checkpoints/`
  //   * AIP_TENSORBOARD_LOG_DIR = `<base_output_directory>/<trial_id>/logs/`
  GcsDestination base_output_directory = 6;

  // The ID of the location to store protected artifacts. e.g. us-central1.
  // Populate only when the location is different than CustomJob location.
  // List of supported locations:
  // https://cloud.google.com/vertex-ai/docs/general/locations
  string protected_artifact_location_id = 19;

  // Optional. The name of a Vertex AI
  // [Tensorboard][google.cloud.aiplatform.v1beta1.Tensorboard] resource to
  // which this CustomJob will upload Tensorboard logs. Format:
  // `projects/{project}/locations/{location}/tensorboards/{tensorboard}`
  string tensorboard = 7 [
    (google.api.field_behavior) = OPTIONAL,
    (google.api.resource_reference) = {
      type: "aiplatform.googleapis.com/Tensorboard"
    }
  ];

  // Optional. Whether you want Vertex AI to enable [interactive shell
  // access](https://cloud.google.com/vertex-ai/docs/training/monitor-debug-interactive-shell)
  // to training containers.
  //
  // If set to `true`, you can access interactive shells at the URIs given
  // by
  // [CustomJob.web_access_uris][google.cloud.aiplatform.v1beta1.CustomJob.web_access_uris]
  // or
  // [Trial.web_access_uris][google.cloud.aiplatform.v1beta1.Trial.web_access_uris]
  // (within
  // [HyperparameterTuningJob.trials][google.cloud.aiplatform.v1beta1.HyperparameterTuningJob.trials]).
  bool enable_web_access = 10 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Whether you want Vertex AI to enable access to the customized
  // dashboard in training chief container.
  //
  // If set to `true`, you can access the dashboard at the URIs given
  // by
  // [CustomJob.web_access_uris][google.cloud.aiplatform.v1beta1.CustomJob.web_access_uris]
  // or
  // [Trial.web_access_uris][google.cloud.aiplatform.v1beta1.Trial.web_access_uris]
  // (within
  // [HyperparameterTuningJob.trials][google.cloud.aiplatform.v1beta1.HyperparameterTuningJob.trials]).
  bool enable_dashboard_access = 16 [(google.api.field_behavior) = OPTIONAL];

  // Optional. The Experiment associated with this job.
  // Format:
  // `projects/{project}/locations/{location}/metadataStores/{metadataStores}/contexts/{experiment-name}`
  string experiment = 17 [
    (google.api.field_behavior) = OPTIONAL,
    (google.api.resource_reference) = {
      type: "aiplatform.googleapis.com/Context"
    }
  ];

  // Optional. The Experiment Run associated with this job.
  // Format:
  // `projects/{project}/locations/{location}/metadataStores/{metadataStores}/contexts/{experiment-name}-{experiment-run-name}`
  string experiment_run = 18 [
    (google.api.field_behavior) = OPTIONAL,
    (google.api.resource_reference) = {
      type: "aiplatform.googleapis.com/Context"
    }
  ];

  // Optional. The name of the Model resources for which to generate a mapping
  // to artifact URIs. Applicable only to some of the Google-provided custom
  // jobs. Format: `projects/{project}/locations/{location}/models/{model}`
  //
  // In order to retrieve a specific version of the model, also provide
  // the version ID or version alias.
  //   Example: `projects/{project}/locations/{location}/models/{model}@2`
  //              or
  //            `projects/{project}/locations/{location}/models/{model}@golden`
  // If no version ID or alias is specified, the "default" version will be
  // returned. The "default" version alias is created for the first version of
  // the model, and can be moved to other versions later on. There will be
  // exactly one default version.
  repeated string models = 20 [
    (google.api.field_behavior) = OPTIONAL,
    (google.api.resource_reference) = {
      type: "aiplatform.googleapis.com/Model"
    }
  ];
}

// Represents the spec of a worker pool in a job.
message WorkerPoolSpec {
  // The custom task to be executed in this worker pool.
  oneof task {
    // The custom container task.
    ContainerSpec container_spec = 6;

    // The Python packaged task.
    PythonPackageSpec python_package_spec = 7;
  }

  // Optional. Immutable. The specification of a single machine.
  MachineSpec machine_spec = 1 [
    (google.api.field_behavior) = OPTIONAL,
    (google.api.field_behavior) = IMMUTABLE
  ];

  // Optional. The number of worker replicas to use for this worker pool.
  int64 replica_count = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. List of NFS mount spec.
  repeated NfsMount nfs_mounts = 4 [(google.api.field_behavior) = OPTIONAL];

  // Disk spec.
  DiskSpec disk_spec = 5;
}

// The spec of a Container.
message ContainerSpec {
  // Required. The URI of a container image in the Container Registry that is to
  // be run on each worker replica.
  string image_uri = 1 [(google.api.field_behavior) = REQUIRED];

  // The command to be invoked when the container is started.
  // It overrides the entrypoint instruction in Dockerfile when provided.
  repeated string command = 2;

  // The arguments to be passed when starting the container.
  repeated string args = 3;

  // Environment variables to be passed to the container.
  // Maximum limit is 100.
  repeated EnvVar env = 4;
}

// The spec of a Python packaged code.
message PythonPackageSpec {
  // Required. The URI of a container image in Artifact Registry that will run
  // the provided Python package. Vertex AI provides a wide range of executor
  // images with pre-installed packages to meet users' various use cases. See
  // the list of [pre-built containers for
  // training](https://cloud.google.com/vertex-ai/docs/training/pre-built-containers).
  // You must use an image from this list.
  string executor_image_uri = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. The Google Cloud Storage location of the Python package files
  // which are the training program and its dependent packages. The maximum
  // number of package URIs is 100.
  repeated string package_uris = 2 [(google.api.field_behavior) = REQUIRED];

  // Required. The Python module name to run after installing the packages.
  string python_module = 3 [(google.api.field_behavior) = REQUIRED];

  // Command line arguments to be passed to the Python task.
  repeated string args = 4;

  // Environment variables to be passed to the python module.
  // Maximum limit is 100.
  repeated EnvVar env = 5;
}

// All parameters related to queuing and scheduling of custom jobs.
message Scheduling {
  // Optional. This determines which type of scheduling strategy to use. Right
  // now users have two options such as STANDARD which will use regular on
  // demand resources to schedule the job, the other is SPOT which would
  // leverage spot resources along with regular resources to schedule
  // the job.
  enum Strategy {
    // Strategy will default to STANDARD.
    STRATEGY_UNSPECIFIED = 0;

    // Deprecated. Regular on-demand provisioning strategy.
    ON_DEMAND = 1 [deprecated = true];

    // Deprecated. Low cost by making potential use of spot resources.
    LOW_COST = 2 [deprecated = true];

    // Standard provisioning strategy uses regular on-demand resources.
    STANDARD = 3;

    // Spot provisioning strategy uses spot resources.
    SPOT = 4;

    // Flex Start strategy uses DWS to queue for resources.
    FLEX_START = 6;
  }

  // The maximum job running time. The default is 7 days.
  google.protobuf.Duration timeout = 1;

  // Restarts the entire CustomJob if a worker gets restarted.
  // This feature can be used by distributed training jobs that are not
  // resilient to workers leaving and joining a job.
  bool restart_job_on_worker_restart = 3;

  // Optional. This determines which type of scheduling strategy to use.
  Strategy strategy = 4 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Indicates if the job should retry for internal errors after the
  // job starts running. If true, overrides
  // `Scheduling.restart_job_on_worker_restart` to false.
  bool disable_retries = 5 [(google.api.field_behavior) = OPTIONAL];

  // Optional. This is the maximum duration that a job will wait for the
  // requested resources to be provisioned if the scheduling strategy is set to
  // [Strategy.FLEX_START][google.cloud.aiplatform.v1beta1.Scheduling.Strategy.FLEX_START].
  // If set to 0, the job will wait indefinitely. The default is 24 hours.
  google.protobuf.Duration max_wait_duration = 6
      [(google.api.field_behavior) = OPTIONAL];
}