// Copyright 2024 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.aiplatform.v1;

import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/cloud/aiplatform/v1/encryption_spec.proto";
import "google/cloud/aiplatform/v1/io.proto";
import "google/cloud/aiplatform/v1/saved_query.proto";
import "google/protobuf/struct.proto";
import "google/protobuf/timestamp.proto";

option csharp_namespace = "Google.Cloud.AIPlatform.V1";
option go_package = "cloud.google.com/go/aiplatform/apiv1/aiplatformpb;aiplatformpb";
option java_multiple_files = true;
option java_outer_classname = "DatasetProto";
option java_package = "com.google.cloud.aiplatform.v1";
option php_namespace = "Google\\Cloud\\AIPlatform\\V1";
option ruby_package = "Google::Cloud::AIPlatform::V1";

// A collection of DataItems and Annotations on them.
message Dataset {
  option (google.api.resource) = {
    type: "aiplatform.googleapis.com/Dataset"
    pattern: "projects/{project}/locations/{location}/datasets/{dataset}"
  };

  // Output only. The resource name of the Dataset.
  string name = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Required. The user-defined name of the Dataset.
  // The name can be up to 128 characters long and can consist of any UTF-8
  // characters.
  string display_name = 2 [(google.api.field_behavior) = REQUIRED];

  // The description of the Dataset.
  string description = 16;

  // Required. Points to a YAML file stored on Google Cloud Storage describing
  // additional information about the Dataset. The schema is defined as an
  // OpenAPI 3.0.2 Schema Object. The schema files that can be used here are
  // found in gs://google-cloud-aiplatform/schema/dataset/metadata/.
  string metadata_schema_uri = 3 [(google.api.field_behavior) = REQUIRED];

  // Required. Additional information about the Dataset.
  google.protobuf.Value metadata = 8 [(google.api.field_behavior) = REQUIRED];

  // Output only. The number of DataItems in this Dataset. Only applies to
  // unstructured Datasets.
  int64 data_item_count = 10 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Timestamp when this Dataset was created.
  google.protobuf.Timestamp create_time = 4
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Timestamp when this Dataset was last updated.
  google.protobuf.Timestamp update_time = 5
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Used to perform consistent read-modify-write updates. If not set, a blind
  // "overwrite" update happens.
  string etag = 6;
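  // For illustration only: a hypothetical UpdateDatasetRequest that carries an
  // etag obtained from a prior DatasetService.GetDataset call (all values are
  // made up), written in proto text format:
  //
  //   dataset {
  //     name: "projects/example-project/locations/us-central1/datasets/1234"
  //     display_name: "flowers-v2"
  //     etag: "AMEw9yNexample"
  //   }
  //   update_mask { paths: "display_name" }
  //
  // Re-reading and retrying if the server rejects a stale etag is what gives
  // the consistent read-modify-write behavior described above.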
  // The labels with user-defined metadata to organize your Datasets.
  //
  // Label keys and values can be no longer than 64 characters
  // (Unicode codepoints), can only contain lowercase letters, numeric
  // characters, underscores and dashes. International characters are allowed.
  // No more than 64 user labels can be associated with one Dataset (System
  // labels are excluded).
  //
  // See https://goo.gl/xmQnxf for more information and examples of labels.
  // System reserved label keys are prefixed with "aiplatform.googleapis.com/"
  // and are immutable. The following system labels exist for each Dataset:
  //
  // * "aiplatform.googleapis.com/dataset_metadata_schema": output only, its
  //   value is the
  //   [metadata_schema's][google.cloud.aiplatform.v1.Dataset.metadata_schema_uri]
  //   title.
  map<string, string> labels = 7;

  // All SavedQueries belonging to the Dataset will be returned in List/Get
  // Dataset responses. The annotation_specs field
  // will not be populated except for UI cases which will only use
  // [annotation_spec_count][google.cloud.aiplatform.v1.SavedQuery.annotation_spec_count].
  // In a CreateDataset request, a SavedQuery is created together with the
  // Dataset if this field is set; up to one SavedQuery can be set in a
  // CreateDatasetRequest. The SavedQuery should not contain any AnnotationSpec.
  repeated SavedQuery saved_queries = 9;

  // Customer-managed encryption key spec for a Dataset. If set, this Dataset
  // and all sub-resources of this Dataset will be secured by this key.
  EncryptionSpec encryption_spec = 11;

  // Output only. The resource name of the Artifact that was created in
  // MetadataStore when creating the Dataset. The Artifact resource name
  // pattern is
  // `projects/{project}/locations/{location}/metadataStores/{metadata_store}/artifacts/{artifact}`.
  string metadata_artifact = 17 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Describes the location from where we import data into a Dataset, together
// with the labels that will be applied to the DataItems and the Annotations.
message ImportDataConfig {
  // The source of the input.
  oneof source {
    // The Google Cloud Storage location for the input content.
    GcsSource gcs_source = 1;
  }

  // Labels that will be applied to newly imported DataItems. If an identical
  // DataItem as one being imported already exists in the Dataset, then these
  // labels will be appended to those of the already existing one, and if a
  // label with an identical key was imported before, the old label value will
  // be overwritten. If two DataItems are identical in the same import data
  // operation, the labels will be combined and if a key collision happens in
  // this case, one of the values will be picked randomly. Two DataItems are
  // considered identical if their content bytes are identical (e.g. image
  // bytes or pdf bytes).
  // These labels will be overridden by Annotation labels specified inside the
  // index file referenced by
  // [import_schema_uri][google.cloud.aiplatform.v1.ImportDataConfig.import_schema_uri],
  // e.g. a jsonl file.
  map<string, string> data_item_labels = 2;

  // Labels that will be applied to newly imported Annotations. If two
  // Annotations are identical, one of them will be deduped. Two Annotations
  // are considered identical if their
  // [payload][google.cloud.aiplatform.v1.Annotation.payload],
  // [payload_schema_uri][google.cloud.aiplatform.v1.Annotation.payload_schema_uri]
  // and all of their [labels][google.cloud.aiplatform.v1.Annotation.labels]
  // are the same. These labels will be overridden by Annotation labels
  // specified inside the index file referenced by
  // [import_schema_uri][google.cloud.aiplatform.v1.ImportDataConfig.import_schema_uri],
  // e.g. a jsonl file.
  map<string, string> annotation_labels = 3;
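  // For illustration only (the bucket, file names and label values below are
  // hypothetical): an ImportDataConfig in proto text format might look like
  //
  //   gcs_source { uris: "gs://example-bucket/import/images.jsonl" }
  //   data_item_labels {
  //     key: "data-source"
  //     value: "import-2024-06"
  //   }
  //   annotation_labels {
  //     key: "annotation-round"
  //     value: "round-1"
  //   }
  //   import_schema_uri: "gs://example-bucket/schemas/import_format.yaml"
  //
  // Per the rules above, if an imported DataItem is byte-identical to one that
  // already exists in the Dataset, the "data-source" label is appended to the
  // existing DataItem's labels, overwriting any previous value for that key.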
  // Required. Points to a YAML file stored on Google Cloud Storage describing
  // the import format. Validation will be done against the schema. The schema
  // is defined as an [OpenAPI 3.0.2 Schema
  // Object](https://github.com/OAI/OpenAPI-Specification/blob/main/versions/3.0.2.md#schemaObject).
  string import_schema_uri = 4 [(google.api.field_behavior) = REQUIRED];
}

// Describes what part of the Dataset is to be exported, the destination of
// the export and how to export.
message ExportDataConfig {
  // ExportUse indicates the usage of the exported files. It restricts the file
  // destination, format, annotations to be exported, whether to allow
  // unannotated data to be exported and whether to clone files to a temp Cloud
  // Storage bucket.
  enum ExportUse {
    // Regular user export.
    EXPORT_USE_UNSPECIFIED = 0;

    // Export for custom code training.
    CUSTOM_CODE_TRAINING = 6;
  }

  // The destination of the output.
  oneof destination {
    // The Google Cloud Storage location where the output is to be written to.
    // In the given directory a new directory will be created with name:
    // `export-data-<dataset-display-name>-<timestamp-of-export-call>` where
    // timestamp is in YYYY-MM-DDThh:mm:ss.sssZ ISO-8601 format. All export
    // output will be written into that directory. Inside that directory,
    // annotations with the same schema will be grouped into sub directories
    // which are named with the corresponding annotations' schema title. Inside
    // these sub directories, a schema.yaml will be created to describe the
    // output format.
    GcsDestination gcs_destination = 1;
  }

  // The instructions for how the export data should be split between the
  // training, validation and test sets.
  oneof split {
    // Split based on fractions defining the size of each set.
    ExportFractionSplit fraction_split = 5;

    // Split based on the provided filters for each set.
    ExportFilterSplit filter_split = 7;
  }

  // An expression for filtering what part of the Dataset is to be exported.
  // Only Annotations that match this filter will be exported. The filter
  // syntax is the same as in
  // [ListAnnotations][google.cloud.aiplatform.v1.DatasetService.ListAnnotations].
  string annotations_filter = 2;

  // The ID of a SavedQuery (annotation set) under the Dataset specified by
  // [dataset_id][] used for filtering Annotations for training.
  //
  // Only used for custom training data export use cases.
  // Only applicable to Datasets that have SavedQueries.
  //
  // Only Annotations that are associated with this SavedQuery are used for
  // training. When used in conjunction with
  // [annotations_filter][google.cloud.aiplatform.v1.ExportDataConfig.annotations_filter],
  // the Annotations used for training are filtered by both
  // [saved_query_id][google.cloud.aiplatform.v1.ExportDataConfig.saved_query_id]
  // and
  // [annotations_filter][google.cloud.aiplatform.v1.ExportDataConfig.annotations_filter].
  //
  // Only one of
  // [saved_query_id][google.cloud.aiplatform.v1.ExportDataConfig.saved_query_id]
  // and
  // [annotation_schema_uri][google.cloud.aiplatform.v1.ExportDataConfig.annotation_schema_uri]
  // should be specified as both of them represent the same thing: the problem
  // type.
  string saved_query_id = 11;
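  // For illustration only (the bucket and SavedQuery ID below are
  // hypothetical): an ExportDataConfig for a custom code training export,
  // written in proto text format, might look like
  //
  //   gcs_destination { output_uri_prefix: "gs://example-bucket/exports/" }
  //   fraction_split {
  //     training_fraction: 0.8
  //     validation_fraction: 0.1
  //     test_fraction: 0.1
  //   }
  //   saved_query_id: "1234567890"
  //   export_use: CUSTOM_CODE_TRAINING
  //
  // Note that only saved_query_id is set; annotation_schema_uri is left unset
  // because the two fields are alternative ways of selecting the same problem
  // type.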
  // The Cloud Storage URI that points to a YAML file describing the annotation
  // schema. The schema is defined as an OpenAPI 3.0.2 [Schema
  // Object](https://github.com/OAI/OpenAPI-Specification/blob/main/versions/3.0.2.md#schemaObject).
  // The schema files that can be used here are found in
  // gs://google-cloud-aiplatform/schema/dataset/annotation/, note that the
  // chosen schema must be consistent with
  // [metadata][google.cloud.aiplatform.v1.Dataset.metadata_schema_uri] of the
  // Dataset specified by [dataset_id][].
  //
  // Only used for custom training data export use cases.
  // Only applicable to Datasets that have DataItems and Annotations.
  //
  // Only Annotations that both match this schema and belong to DataItems not
  // ignored by the split method are used in the training, validation or test
  // role, respectively, depending on the role of the DataItem they are on.
  //
  // When used in conjunction with
  // [annotations_filter][google.cloud.aiplatform.v1.ExportDataConfig.annotations_filter],
  // the Annotations used for training are filtered by both
  // [annotations_filter][google.cloud.aiplatform.v1.ExportDataConfig.annotations_filter]
  // and
  // [annotation_schema_uri][google.cloud.aiplatform.v1.ExportDataConfig.annotation_schema_uri].
  string annotation_schema_uri = 12;

  // Indicates the usage of the exported files.
  ExportUse export_use = 4;
}

// Assigns the input data to training, validation, and test sets as per the
// given fractions. Any of `training_fraction`, `validation_fraction` and
// `test_fraction` may optionally be provided; their sum must be at most 1. If
// the provided fractions sum to less than 1, the remainder is assigned to sets
// as decided by Vertex AI. If none of the fractions are set, by default
// roughly 80% of data is used for training, 10% for validation, and 10% for
// test.
message ExportFractionSplit {
  // The fraction of the input data that is to be used to train the Model.
  double training_fraction = 1;

  // The fraction of the input data that is to be used to validate the Model.
  double validation_fraction = 2;

  // The fraction of the input data that is to be used to evaluate the Model.
  double test_fraction = 3;
}

// Assigns input data to training, validation, and test sets based on the given
// filters; data pieces not matched by any filter are ignored. Currently only
// supported for Datasets containing DataItems.
// If any of the filters in this message are to match nothing, then they can be
// set as '-' (the minus sign).
//
// Supported only for unstructured Datasets.
message ExportFilterSplit {
  // Required. A filter on DataItems of the Dataset. DataItems that match
  // this filter are used to train the Model. A filter with the same syntax
  // as the one used in
  // [DatasetService.ListDataItems][google.cloud.aiplatform.v1.DatasetService.ListDataItems]
  // may be used. If a single DataItem is matched by more than one of the
  // FilterSplit filters, then it is assigned to the first set that applies to
  // it in the training, validation, test order.
  string training_filter = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. A filter on DataItems of the Dataset. DataItems that match
  // this filter are used to validate the Model. A filter with the same syntax
  // as the one used in
  // [DatasetService.ListDataItems][google.cloud.aiplatform.v1.DatasetService.ListDataItems]
  // may be used. If a single DataItem is matched by more than one of the
  // FilterSplit filters, then it is assigned to the first set that applies to
  // it in the training, validation, test order.
  string validation_filter = 2 [(google.api.field_behavior) = REQUIRED];

  // Required. A filter on DataItems of the Dataset. DataItems that match
  // this filter are used to test the Model. A filter with the same syntax
  // as the one used in
  // [DatasetService.ListDataItems][google.cloud.aiplatform.v1.DatasetService.ListDataItems]
  // may be used. If a single DataItem is matched by more than one of the
  // FilterSplit filters, then it is assigned to the first set that applies to
  // it in the training, validation, test order.
  string test_filter = 3 [(google.api.field_behavior) = REQUIRED];
}
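// For illustration only (the filter expressions are hypothetical placeholders,
// not verified ListDataItems filter syntax): an ExportFilterSplit that routes
// DataItems matched by one filter to training, the remaining matches of a
// second filter to validation, and deliberately matches nothing for test could
// be written in proto text format as
//
//   training_filter: "example-training-filter"
//   validation_filter: "example-validation-filter"
//   test_filter: "-"
//
// Because a DataItem matched by several filters is assigned to the first
// matching set in training, validation, test order, the training filter takes
// precedence when the filters overlap.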