// Copyright 2024 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.contentwarehouse.v1;

import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/cloud/contentwarehouse/v1/common.proto";
import "google/iam/v1/policy.proto";
import "google/rpc/status.proto";

option csharp_namespace = "Google.Cloud.ContentWarehouse.V1";
option go_package = "cloud.google.com/go/contentwarehouse/apiv1/contentwarehousepb;contentwarehousepb";
option java_multiple_files = true;
option java_outer_classname = "PipelinesProto";
option java_package = "com.google.cloud.contentwarehouse.v1";
option php_namespace = "Google\\Cloud\\ContentWarehouse\\V1";
option ruby_package = "Google::Cloud::ContentWarehouse::V1";
option (google.api.resource_definition) = {
  type: "cloudfunctions.googleapis.com/CloudFunction"
  pattern: "projects/{project}/locations/{location}/functions/{function}"
};

// Response message of the RunPipeline method.
message RunPipelineResponse {}

// Metadata message of the RunPipeline method.
message RunPipelineMetadata {
  // The metadata message for the GcsIngest pipeline.
  message GcsIngestPipelineMetadata {
    // The input Cloud Storage folder in this pipeline.
    // Format: `gs://<bucket-name>/<folder-name>`.
    string input_path = 1;
  }

  // The metadata message for the Export-to-CDW pipeline.
  message ExportToCdwPipelineMetadata {
    // The input list of all the resource names of the documents to be
    // exported.
    repeated string documents = 1;

    // The output CDW dataset resource name.
    string doc_ai_dataset = 2;

    // The output Cloud Storage folder in this pipeline.
    string output_path = 3;
  }

  // The metadata message for the Process-with-DocAi pipeline.
  message ProcessWithDocAiPipelineMetadata {
    // The input list of all the resource names of the documents to be
    // processed.
    repeated string documents = 1;

    // The DocAI processor to process the documents with.
    ProcessorInfo processor_info = 2;
  }

  // The status of processing a single document.
  message IndividualDocumentStatus {
    // Document identifier of an existing document.
    string document_id = 1;

    // The status of processing the document.
    google.rpc.Status status = 2;
  }

  // Number of files that were processed by the pipeline.
  int32 total_file_count = 1;

  // Number of files that have failed at some point in the pipeline.
  int32 failed_file_count = 2;

  // User unique identification and groups information.
  UserInfo user_info = 3;

  // The pipeline metadata.
  oneof pipeline_metadata {
    // The pipeline metadata for the GcsIngest pipeline.
    GcsIngestPipelineMetadata gcs_ingest_pipeline_metadata = 4;

    // The pipeline metadata for the Export-to-CDW pipeline.
    ExportToCdwPipelineMetadata export_to_cdw_pipeline_metadata = 6;

    // The pipeline metadata for the Process-with-DocAi pipeline.
    ProcessWithDocAiPipelineMetadata process_with_doc_ai_pipeline_metadata = 7;
  }

  // The list of response details of each document.
  repeated IndividualDocumentStatus individual_document_statuses = 5;
}
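// A minimal sketch of what RunPipelineMetadata for a GcsIngest run could look
// like, written as textproto. The counts, bucket name, document id, and error
// message are illustrative placeholders, not output from any real run:
//
//   total_file_count: 10
//   failed_file_count: 1
//   gcs_ingest_pipeline_metadata {
//     input_path: "gs://example-bucket/ingest-folder/"
//   }
//   individual_document_statuses {
//     document_id: "example-document-id"
//     status { code: 13 message: "illustrative internal error" }
//   }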
// The DocAI processor information.
message ProcessorInfo {
  // The processor resource name.
  // Format is
  // `projects/{project}/locations/{location}/processors/{processor}`
  // or
  // `projects/{project}/locations/{location}/processors/{processor}/processorVersions/{processorVersion}`
  string processor_name = 1;

  // The processor will process the documents with this document type.
  string document_type = 2;

  // The Document schema resource name. All documents processed by this
  // processor will use this schema.
  // Format:
  // projects/{project_number}/locations/{location}/documentSchemas/{document_schema_id}.
  string schema_name = 3;
}

// The ingestion pipeline config.
message IngestPipelineConfig {
  // The document level acl policy config.
  // This refers to an Identity and Access Management (IAM) policy, which
  // specifies access controls for all documents ingested by the pipeline. The
  // [role][google.iam.v1.Binding.role] and
  // [members][google.iam.v1.Binding.members] under the policy need to be
  // specified.
  //
  // The following roles are supported for document level acl control:
  // * roles/contentwarehouse.documentAdmin
  // * roles/contentwarehouse.documentEditor
  // * roles/contentwarehouse.documentViewer
  //
  // The following members are supported for document level acl control:
  // * user:user-email@example.com
  // * group:group-email@example.com
  //
  // Note that for documents searched with LLM, only a single level user or
  // group acl check is supported.
  google.iam.v1.Policy document_acl_policy = 1;

  // The document text extraction enabled flag.
  // If the flag is set to true, DWH will perform text extraction on the raw
  // document.
  bool enable_document_text_extraction = 2;

  // Optional. The name of the folder to which all ingested documents will be
  // linked during the ingestion process. Format is
  // `projects/{project}/locations/{location}/documents/{folder_id}`.
  string folder = 3 [(google.api.field_behavior) = OPTIONAL];

  // The Cloud Function resource name. The Cloud Function needs to live inside
  // the consumer project and be accessible to the Document AI Warehouse P4SA.
  // Only Cloud Functions V2 is supported. Cloud Function execution should
  // complete within 5 minutes, or the file ingestion may fail due to timeout.
  // Format:
  // `https://{region}-{project_id}.cloudfunctions.net/{cloud_function}`
  //
  // The following keys are available in the request JSON payload:
  // * display_name
  // * properties
  // * plain_text
  // * reference_id
  // * document_schema_name
  // * raw_document_path
  // * raw_document_file_type
  //
  // The following keys from the Cloud Function JSON response payload will be
  // ingested into Document AI Warehouse as part of the Document proto content
  // and/or related information. The original values will be overridden if any
  // key is present in the response:
  // * display_name
  // * properties
  // * plain_text
  // * document_acl_policy
  // * folder
  string cloud_function = 4 [(google.api.resource_reference) = {
    type: "cloudfunctions.googleapis.com/CloudFunction"
  }];
}
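// A minimal sketch of an IngestPipelineConfig in textproto form. The member
// emails, project number, and folder id are illustrative placeholders; the
// role string is one of the supported roles listed above:
//
//   document_acl_policy {
//     bindings {
//       role: "roles/contentwarehouse.documentViewer"
//       members: "user:user-email@example.com"
//       members: "group:group-email@example.com"
//     }
//   }
//   enable_document_text_extraction: true
//   folder: "projects/123456/locations/us/documents/example-folder-id"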
// The configuration of the Cloud Storage Ingestion pipeline.
message GcsIngestPipeline {
  // The input Cloud Storage folder. All files under this folder will be
  // imported to Document Warehouse.
  // Format: `gs://<bucket-name>/<folder-name>`.
  string input_path = 1;

  // The Document Warehouse schema resource name. All documents processed by
  // this pipeline will use this schema.
  // Format:
  // projects/{project_number}/locations/{location}/documentSchemas/{document_schema_id}.
  string schema_name = 2;

  // The Doc AI processor type name. Only used when the format of ingested
  // files is Doc AI Document proto format.
  string processor_type = 3;

  // The flag whether to skip already-ingested documents.
  // If it is set to true, documents in Cloud Storage whose custom metadata
  // contains the key "status" with the value "status=ingested" will be
  // skipped during ingestion.
  bool skip_ingested_documents = 4;

  // Optional. The config for the Cloud Storage Ingestion pipeline.
  // It provides additional customization options for running the pipeline and
  // can be skipped if it is not applicable.
  IngestPipelineConfig pipeline_config = 5
      [(google.api.field_behavior) = OPTIONAL];
}

// The configuration of the Cloud Storage Ingestion with DocAI Processors
// pipeline.
message GcsIngestWithDocAiProcessorsPipeline {
  // The input Cloud Storage folder. All files under this folder will be
  // imported to Document Warehouse.
  // Format: `gs://<bucket-name>/<folder-name>`.
  string input_path = 1;

  // The split and classify processor information.
  // The split and classify result will be used to find a matched extract
  // processor.
  ProcessorInfo split_classify_processor_info = 2;

  // The extract processors information.
  // One matched extract processor will be used to process documents based on
  // the classify processor result. If no classify processor is specified, the
  // first extract processor will be used.
  repeated ProcessorInfo extract_processor_infos = 3;

  // The Cloud Storage folder path used to store the raw results from
  // processors.
  // Format: `gs://<bucket-name>/<folder-name>`.
  string processor_results_folder_path = 4;

  // The flag whether to skip already-ingested documents.
  // If it is set to true, documents in Cloud Storage whose custom metadata
  // contains the key "status" with the value "status=ingested" will be
  // skipped during ingestion.
  bool skip_ingested_documents = 5;

  // Optional. The config for the Cloud Storage Ingestion with DocAI
  // Processors pipeline. It provides additional customization options for
  // running the pipeline and can be skipped if it is not applicable.
  IngestPipelineConfig pipeline_config = 6
      [(google.api.field_behavior) = OPTIONAL];
}

// The configuration of the pipeline that exports documents from Document
// Warehouse to CDW.
message ExportToCdwPipeline {
  // The list of all the resource names of the documents to be processed.
  // Format:
  // projects/{project_number}/locations/{location}/documents/{document_id}.
  repeated string documents = 1;

  // The Cloud Storage folder path used to store the exported documents before
  // being sent to CDW.
  // Format: `gs://<bucket-name>/<folder-name>`.
  string export_folder_path = 2;

  // Optional. The CDW dataset resource name. This field is optional. If not
  // set, the documents will be exported to Cloud Storage only. Format:
  // projects/{project}/locations/{location}/processors/{processor}/dataset
  string doc_ai_dataset = 3 [(google.api.field_behavior) = OPTIONAL];

  // Ratio of the training dataset split. When importing into Document AI
  // Workbench, documents will be automatically split into training and test
  // sets with the specified ratio. This field is required if doc_ai_dataset
  // is set.
  float training_split_ratio = 4;
}
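// A minimal sketch of an ExportToCdwPipeline in textproto form. The project
// number, document ids, bucket, and processor name are illustrative
// placeholders; training_split_ratio is set because doc_ai_dataset is set:
//
//   documents: "projects/123456/locations/us/documents/doc-id-1"
//   documents: "projects/123456/locations/us/documents/doc-id-2"
//   export_folder_path: "gs://example-bucket/export-folder/"
//   doc_ai_dataset: "projects/123456/locations/us/processors/example-processor/dataset"
//   training_split_ratio: 0.8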
// The configuration of the pipeline that processes documents in Document
// Warehouse with DocAI processors.
message ProcessWithDocAiPipeline {
  // The list of all the resource names of the documents to be processed.
  // Format:
  // projects/{project_number}/locations/{location}/documents/{document_id}.
  repeated string documents = 1;

  // The Cloud Storage folder path used to store the exported documents before
  // being sent to CDW.
  // Format: `gs://<bucket-name>/<folder-name>`.
  string export_folder_path = 2;

  // The CDW processor information.
  ProcessorInfo processor_info = 3;

  // The Cloud Storage folder path used to store the raw results from
  // processors.
  // Format: `gs://<bucket-name>/<folder-name>`.
  string processor_results_folder_path = 4;
}
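// A minimal sketch of a GcsIngestPipeline in textproto form, as it might be
// set on the request that triggers the RunPipeline method (that request
// message is defined elsewhere in this API, not in this file). The bucket,
// schema id, and folder id are illustrative placeholders:
//
//   input_path: "gs://example-bucket/ingest-folder/"
//   schema_name: "projects/123456/locations/us/documentSchemas/example-schema-id"
//   skip_ingested_documents: true
//   pipeline_config {
//     enable_document_text_extraction: true
//     folder: "projects/123456/locations/us/documents/example-folder-id"
//   }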