// Copyright 2024 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.documentai.v1beta3;

import "google/api/annotations.proto";
import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/cloud/documentai/v1beta3/dataset.proto";
import "google/cloud/documentai/v1beta3/document.proto";
import "google/cloud/documentai/v1beta3/document_io.proto";
import "google/cloud/documentai/v1beta3/operation_metadata.proto";
import "google/longrunning/operations.proto";
import "google/protobuf/field_mask.proto";
import "google/rpc/status.proto";

option csharp_namespace = "Google.Cloud.DocumentAI.V1Beta3";
option go_package = "cloud.google.com/go/documentai/apiv1beta3/documentaipb;documentaipb";
option java_multiple_files = true;
option java_outer_classname = "DocumentAiDocumentService";
option java_package = "com.google.cloud.documentai.v1beta3";
option php_namespace = "Google\\Cloud\\DocumentAI\\V1beta3";
option ruby_package = "Google::Cloud::DocumentAI::V1beta3";

// Service to call Cloud DocumentAI to manage document collection (dataset).
service DocumentService {
  option (google.api.default_host) = "documentai.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/cloud-platform";

  // Updates metadata associated with a dataset.
  // Note that this method requires the
  // `documentai.googleapis.com/datasets.update` permission on the project,
  // which is highly privileged. A user or service account with this permission
  // can create new processors that can interact with any Cloud Storage bucket
  // in your project.
  rpc UpdateDataset(UpdateDatasetRequest)
      returns (google.longrunning.Operation) {
    option (google.api.http) = {
      patch: "/v1beta3/{dataset.name=projects/*/locations/*/processors/*/dataset}"
      body: "dataset"
    };
    option (google.api.method_signature) = "dataset,update_mask";
    option (google.longrunning.operation_info) = {
      response_type: "Dataset"
      metadata_type: "UpdateDatasetOperationMetadata"
    };
  }

  // Import documents into a dataset.
  rpc ImportDocuments(ImportDocumentsRequest)
      returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1beta3/{dataset=projects/*/locations/*/processors/*/dataset}:importDocuments"
      body: "*"
    };
    option (google.api.method_signature) = "dataset";
    option (google.longrunning.operation_info) = {
      response_type: "ImportDocumentsResponse"
      metadata_type: "ImportDocumentsMetadata"
    };
  }

  // Returns relevant fields present in the requested document.
  rpc GetDocument(GetDocumentRequest) returns (GetDocumentResponse) {
    option (google.api.http) = {
      get: "/v1beta3/{dataset=projects/*/locations/*/processors/*/dataset}:getDocument"
    };
    option (google.api.method_signature) = "dataset";
  }

  // Returns a list of documents present in the dataset.
  rpc ListDocuments(ListDocumentsRequest) returns (ListDocumentsResponse) {
    option (google.api.http) = {
      post: "/v1beta3/{dataset=projects/*/locations/*/processors/*/dataset}:listDocuments"
      body: "*"
    };
    option (google.api.method_signature) = "dataset";
  }

  // Deletes a set of documents.
  rpc BatchDeleteDocuments(BatchDeleteDocumentsRequest)
      returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1beta3/{dataset=projects/*/locations/*/processors/*/dataset}:batchDeleteDocuments"
      body: "*"
    };
    option (google.api.method_signature) = "dataset";
    option (google.longrunning.operation_info) = {
      response_type: "BatchDeleteDocumentsResponse"
      metadata_type: "BatchDeleteDocumentsMetadata"
    };
  }

  // Gets the `DatasetSchema` of a `Dataset`.
  rpc GetDatasetSchema(GetDatasetSchemaRequest) returns (DatasetSchema) {
    option (google.api.http) = {
      get: "/v1beta3/{name=projects/*/locations/*/processors/*/dataset/datasetSchema}"
    };
    option (google.api.method_signature) = "name";
  }

  // Updates a `DatasetSchema`.
  rpc UpdateDatasetSchema(UpdateDatasetSchemaRequest) returns (DatasetSchema) {
    option (google.api.http) = {
      patch: "/v1beta3/{dataset_schema.name=projects/*/locations/*/processors/*/dataset/datasetSchema}"
      body: "dataset_schema"
    };
    option (google.api.method_signature) = "dataset_schema,update_mask";
  }
}

// Documents belonging to a dataset will be split into different groups
// referred to as splits: train, test.
enum DatasetSplitType {
  // Default value if the enum is not set.
  DATASET_SPLIT_TYPE_UNSPECIFIED = 0;

  // Identifies the train documents.
  DATASET_SPLIT_TRAIN = 1;

  // Identifies the test documents.
  DATASET_SPLIT_TEST = 2;

  // Identifies the unassigned documents.
  DATASET_SPLIT_UNASSIGNED = 3;
}

// Describes the labeling status of a document.
enum DocumentLabelingState {
  // Default value if the enum is not set.
  DOCUMENT_LABELING_STATE_UNSPECIFIED = 0;

  // Document has been labeled.
  DOCUMENT_LABELED = 1;

  // Document has not been labeled.
  DOCUMENT_UNLABELED = 2;

  // Document has been auto-labeled.
  DOCUMENT_AUTO_LABELED = 3;
}

// Request for `UpdateDataset`.
message UpdateDatasetRequest {
  // Required. The `name` field of the `Dataset` is used to identify the
  // resource to be updated.
  Dataset dataset = 1 [(google.api.field_behavior) = REQUIRED];

  // The update mask applies to the resource.
  google.protobuf.FieldMask update_mask = 2;
}

// Metadata of the long-running operation started by `UpdateDataset`.
message UpdateDatasetOperationMetadata {
  // The basic metadata of the long-running operation.
  CommonOperationMetadata common_metadata = 1;
}

// Request for `ImportDocuments`.
message ImportDocumentsRequest {
  // Config for importing documents.
  // Each batch can have its own dataset split type.
  message BatchDocumentsImportConfig {
    // The config for auto-split.
    message AutoSplitConfig {
      // Ratio of training dataset split.
      float training_split_ratio = 1;
    }

    oneof split_type_config {
      // Target dataset split where the documents must be stored.
      DatasetSplitType dataset_split = 2;

      // If set, documents will be automatically split into training and test
      // split category with the specified ratio.
      AutoSplitConfig auto_split_config = 3;
    }

    // The common config to specify a set of documents used as input.
    BatchDocumentsInputConfig batch_input_config = 1;
  }

  // Required. The dataset resource name.
  // Format:
  // projects/{project}/locations/{location}/processors/{processor}/dataset
  string dataset = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "documentai.googleapis.com/Dataset"
    }
  ];

  // Required. The Cloud Storage URI containing raw documents that must be
  // imported.
  repeated BatchDocumentsImportConfig batch_documents_import_configs = 4
      [(google.api.field_behavior) = REQUIRED];
}

// Response of the import document operation.
message ImportDocumentsResponse {}

// Metadata of the import document operation.
message ImportDocumentsMetadata {
  // The status of each individual document in the import process.
  message IndividualImportStatus {
    // The source Cloud Storage URI of the document.
    string input_gcs_source = 1;

    // The status of the importing of the document.
    google.rpc.Status status = 2;

    // The document id of imported document if it was successful, otherwise
    // empty.
    DocumentId output_document_id = 4;
  }

  // The validation status of each import config. Status is set to an error if
  // there are no documents to import in the `import_config`, or `OK` if the
  // operation will try to proceed with at least one document.
  message ImportConfigValidationResult {
    // The source Cloud Storage URI specified in the import config.
    string input_gcs_source = 1;

    // The validation status of import config.
    google.rpc.Status status = 2;
  }

  // The basic metadata of the long-running operation.
  CommonOperationMetadata common_metadata = 1;

  // The list of response details of each document.
  repeated IndividualImportStatus individual_import_statuses = 2;

  // Validation statuses of the batch documents import config.
  repeated ImportConfigValidationResult import_config_validation_results = 4;

  // Total number of the documents that are qualified for importing.
  int32 total_document_count = 3;
}

// Request for `GetDocument`.
message GetDocumentRequest {
  // Required. The resource name of the dataset that the document belongs to.
  // Format:
  // projects/{project}/locations/{location}/processors/{processor}/dataset
  string dataset = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "documentai.googleapis.com/Dataset"
    }
  ];

  // Required. Document identifier.
  DocumentId document_id = 2 [(google.api.field_behavior) = REQUIRED];

  // If set, only fields listed here will be returned. Otherwise, all fields
  // will be returned by default.
  google.protobuf.FieldMask read_mask = 3;

  // List of pages for which the fields specified in the `read_mask` must
  // be served.
  DocumentPageRange page_range = 4;
}

// Response for `GetDocument`.
message GetDocumentResponse {
  // The requested document.
  Document document = 1;
}

// Request for `ListDocuments`.
message ListDocumentsRequest {
  // Required. The resource name of the dataset to be listed.
  // Format:
  // projects/{project}/locations/{location}/processors/{processor}/dataset
  string dataset = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "documentai.googleapis.com/Dataset"
    }
  ];

  // The maximum number of documents to return. The service may return
  // fewer than this value.
  // If unspecified, at most 20 documents will be returned.
  // The maximum value is 100; values above 100 will be coerced to 100.
  int32 page_size = 2;

  // A page token, received from a previous `ListDocuments` call.
  // Provide this to retrieve the subsequent page.
  //
  // When paginating, all other parameters provided to `ListDocuments`
  // must match the call that provided the page token.
  string page_token = 3;

  // Optional. Query to filter the documents based on
  // https://google.aip.dev/160.
  // ## Currently supported query strings are:
  //
  // - `SplitType=DATASET_SPLIT_TEST|DATASET_SPLIT_TRAIN|DATASET_SPLIT_UNASSIGNED`
  // - `LabelingState=DOCUMENT_LABELED|DOCUMENT_UNLABELED|DOCUMENT_AUTO_LABELED`
  // - `DisplayName=\"file_name.pdf\"`
  // - `EntityType=abc/def`
  // - `TagName=\"auto-labeling-running\"|\"sampled\"`
  //
  // Note:
  // - Only `AND`, `=` and `!=` are supported.
  //     e.g. `DisplayName=file_name AND EntityType!=abc` IS supported.
  // - Wildcard `*` is supported only in `DisplayName` filter
  // - No duplicate filter keys are allowed,
  //     e.g. `EntityType=a AND EntityType=b` is NOT supported.
  // - String match is case sensitive (for filter `DisplayName` & `EntityType`).
  string filter = 4 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Controls if the request requires a total size of matched
  // documents. See
  // [ListDocumentsResponse.total_size][google.cloud.documentai.v1beta3.ListDocumentsResponse.total_size].
  //
  // Enabling this flag may adversely impact performance.
  //
  // Defaults to false.
  bool return_total_size = 6 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Number of results to skip beginning from the `page_token` if
  // provided. https://google.aip.dev/158#skipping-results. It must be a
  // non-negative integer. Negative values will be rejected. Note that this is
  // not the number of pages to skip. If this value causes the cursor to move
  // past the end of results,
  // [ListDocumentsResponse.document_metadata][google.cloud.documentai.v1beta3.ListDocumentsResponse.document_metadata]
  // and
  // [ListDocumentsResponse.next_page_token][google.cloud.documentai.v1beta3.ListDocumentsResponse.next_page_token]
  // will be empty.
  int32 skip = 8 [(google.api.field_behavior) = OPTIONAL];
}

// Response for `ListDocuments`.
message ListDocumentsResponse {
  // Document metadata corresponding to the listed documents.
  repeated DocumentMetadata document_metadata = 1;

  // A token, which can be sent as
  // [ListDocumentsRequest.page_token][google.cloud.documentai.v1beta3.ListDocumentsRequest.page_token]
  // to retrieve the next page. If this field is omitted, there are no
  // subsequent pages.
  string next_page_token = 2;

  // Total count of documents queried.
  int32 total_size = 3;
}

// Request for `BatchDeleteDocuments`.
message BatchDeleteDocumentsRequest {
  // Required. The dataset resource name.
  // Format:
  // projects/{project}/locations/{location}/processors/{processor}/dataset
  string dataset = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Dataset documents input. If given `filter`, all documents
  // satisfying the filter will be deleted. If given documentIds, a maximum of
  // 50 documents can be deleted in a batch. The request will be rejected if
  // more than 50 document_ids are provided.
  BatchDatasetDocuments dataset_documents = 3
      [(google.api.field_behavior) = REQUIRED];
}

// Response of the delete documents operation.
message BatchDeleteDocumentsResponse {}

// Metadata of the long-running operation started by `BatchDeleteDocuments`.
message BatchDeleteDocumentsMetadata {
  // The status of each individual document in the batch delete process.
  message IndividualBatchDeleteStatus {
    // The document id of the document.
    DocumentId document_id = 1;

    // The status of deleting the document in storage.
    google.rpc.Status status = 2;
  }

  // The basic metadata of the long-running operation.
  CommonOperationMetadata common_metadata = 1;

  // The list of response details of each document.
  repeated IndividualBatchDeleteStatus individual_batch_delete_statuses = 2;

  // Total number of documents being deleted from the dataset.
  int32 total_document_count = 3;

  // Total number of documents that failed to be deleted in storage.
  int32 error_document_count = 4;
}

// Request for `GetDatasetSchema`.
message GetDatasetSchemaRequest {
  // Required. The dataset schema resource name.
  // Format:
  // projects/{project}/locations/{location}/processors/{processor}/dataset/datasetSchema
  string name = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "documentai.googleapis.com/DatasetSchema"
    }
  ];

  // If set, only returns the visible fields of the schema.
  bool visible_fields_only = 2;
}

// Request for `UpdateDatasetSchema`.
message UpdateDatasetSchemaRequest {
  // Required. The name field of the `DatasetSchema` is used to identify the
  // resource to be updated.
  DatasetSchema dataset_schema = 1 [(google.api.field_behavior) = REQUIRED];

  // The update mask applies to the resource.
  google.protobuf.FieldMask update_mask = 2;
}

// Range of pages present in a document.
message DocumentPageRange {
  // First page number (one-based index) to be returned.
  int32 start = 1;

  // Last page number (one-based index) to be returned.
  int32 end = 2;
}

// Metadata about a document.
message DocumentMetadata {
  // Document identifier.
  DocumentId document_id = 1;

  // Number of pages in the document.
  int32 page_count = 2;

  // Type of the dataset split to which the document belongs.
  DatasetSplitType dataset_type = 3;

  // Labeling state of the document.
  DocumentLabelingState labeling_state = 5;

  // The display name of the document.
  string display_name = 6;
}