// Copyright 2024 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.documentai.v1beta3;

import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/cloud/documentai/v1beta3/document.proto";
import "google/cloud/documentai/v1beta3/document_io.proto";
import "google/cloud/documentai/v1beta3/document_schema.proto";

option csharp_namespace = "Google.Cloud.DocumentAI.V1Beta3";
option go_package = "cloud.google.com/go/documentai/apiv1beta3/documentaipb;documentaipb";
option java_multiple_files = true;
option java_outer_classname = "DatasetProto";
option java_package = "com.google.cloud.documentai.v1beta3";
option php_namespace = "Google\\Cloud\\DocumentAI\\V1beta3";
option ruby_package = "Google::Cloud::DocumentAI::V1beta3";

// Declares the Document AI Warehouse schema resource type so that
// `Dataset.DocumentWarehouseConfig.schema` can reference it.
option (google.api.resource_definition) = {
  type: "contentwarehouse.googleapis.com/Schema"
  pattern: "projects/{project}/locations/{location}/schemas/{schema}"
};

// A singleton resource under a
// [Processor][google.cloud.documentai.v1beta3.Processor] which configures a
// collection of documents.
message Dataset {
  option (google.api.resource) = {
    type: "documentai.googleapis.com/Dataset"
    pattern: "projects/{project}/locations/{location}/processors/{processor}/dataset"
  };

  // Configuration specific to the Cloud Storage-based implementation.
  message GCSManagedConfig {
    // Required. The Cloud Storage URI (a directory) where the documents
    // belonging to the dataset must be stored.
    GcsPrefix gcs_prefix = 1 [(google.api.field_behavior) = REQUIRED];
  }

  // Configuration specific to the Document AI Warehouse-based implementation.
  message DocumentWarehouseConfig {
    // Output only. The collection in Document AI Warehouse associated with the
    // dataset.
    string collection = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

    // Output only. The schema in Document AI Warehouse associated with the
    // dataset.
    string schema = 2 [
      (google.api.field_behavior) = OUTPUT_ONLY,
      (google.api.resource_reference) = {
        type: "contentwarehouse.googleapis.com/Schema"
      }
    ];
  }

  // Configuration specific to an unmanaged dataset.
  message UnmanagedDatasetConfig {}

  // Configuration specific to spanner-based indexing.
  message SpannerIndexingConfig {}

  // Different states of a dataset.
  enum State {
    // Default unspecified enum, should not be used.
    STATE_UNSPECIFIED = 0;

    // Dataset has not been initialized.
    UNINITIALIZED = 1;

    // Dataset is being initialized.
    INITIALIZING = 2;

    // Dataset has been initialized.
    INITIALIZED = 3;
  }

  // Where the dataset documents are stored. At most one member can be set.
  oneof storage_source {
    // Optional. User-managed Cloud Storage dataset configuration. Use this
    // configuration if the dataset documents are stored under a user-managed
    // Cloud Storage location.
    GCSManagedConfig gcs_managed_config = 3
        [(google.api.field_behavior) = OPTIONAL];

    // Optional. Deprecated. Warehouse-based dataset configuration is not
    // supported.
    DocumentWarehouseConfig document_warehouse_config = 5
        [deprecated = true, (google.api.field_behavior) = OPTIONAL];

    // Optional. Unmanaged dataset configuration. Use this configuration if the
    // dataset documents are managed by the document service internally (not
    // user-managed).
    UnmanagedDatasetConfig unmanaged_dataset_config = 6
        [(google.api.field_behavior) = OPTIONAL];
  }

  // How the dataset documents are indexed.
  oneof indexing_source {
    // Optional. A lightweight indexing source with low latency and high
    // reliability, but lacking advanced features like CMEK and content-based
    // search.
    SpannerIndexingConfig spanner_indexing_config = 4
        [(google.api.field_behavior) = OPTIONAL];
  }

  // Dataset resource name.
  // Format:
  // `projects/{project}/locations/{location}/processors/{processor}/dataset`
  string name = 1;

  // Required. State of the dataset. Ignored when updating dataset.
  State state = 2 [(google.api.field_behavior) = REQUIRED];

  // Output only. Reserved for future use.
  bool satisfies_pzs = 8 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Reserved for future use.
  bool satisfies_pzi = 9 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Document Identifier.
message DocumentId {
  // Identifies a document uniquely within the scope of a dataset in the
  // user-managed Cloud Storage option.
  message GCSManagedDocumentId {
    // Required. The Cloud Storage URI where the actual document is stored.
    string gcs_uri = 1 [(google.api.field_behavior) = REQUIRED];

    // Deprecated. Id of the document (indexed) managed by Content Warehouse.
    string cw_doc_id = 2 [deprecated = true];
  }

  // Identifies a document uniquely within the scope of a dataset in unmanaged
  // option.
  message UnmanagedDocumentId {
    // Required. The id of the document.
    string doc_id = 1 [(google.api.field_behavior) = REQUIRED];
  }

  // The kind of identifier, matching the storage option of the dataset the
  // document belongs to. At most one member can be set.
  oneof type {
    // A document id within user-managed Cloud Storage.
    GCSManagedDocumentId gcs_managed_doc_id = 1;

    // A document id within unmanaged dataset.
    UnmanagedDocumentId unmanaged_doc_id = 4;
  }

  // Points to a specific revision of the document if set.
  RevisionRef revision_ref = 3;
}

// Dataset Schema.
message DatasetSchema {
  option (google.api.resource) = {
    type: "documentai.googleapis.com/DatasetSchema"
    pattern: "projects/{project}/locations/{location}/processors/{processor}/dataset/datasetSchema"
  };

  // Dataset schema resource name.
  // Format:
  // `projects/{project}/locations/{location}/processors/{processor}/dataset/datasetSchema`
  string name = 1;

  // Optional. Schema of the dataset.
  DocumentSchema document_schema = 3 [(google.api.field_behavior) = OPTIONAL];
}

// Dataset documents that the batch operation will be applied to.
message BatchDatasetDocuments {
  // List of individual DocumentIds.
  message IndividualDocumentIds {
    // Required. List of Document IDs indicating where the actual documents are
    // stored.
    repeated DocumentId document_ids = 1
        [(google.api.field_behavior) = REQUIRED];
  }

  // How the target documents are selected: either an explicit list of ids or
  // a filter expression. At most one member can be set.
  oneof criteria {
    // Document identifiers.
    IndividualDocumentIds individual_document_ids = 1;

    // A filter matching the documents.
    // Follows the same format and restriction as
    // [ListDocumentsRequest.filter][google.cloud.documentai.v1beta3.ListDocumentsRequest.filter].
    string filter = 2;
  }
}