// Copyright 2024 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.discoveryengine.v1alpha;

import "google/api/field_behavior.proto";
import "google/api/resource.proto";

option csharp_namespace = "Google.Cloud.DiscoveryEngine.V1Alpha";
option go_package = "cloud.google.com/go/discoveryengine/apiv1alpha/discoveryenginepb;discoveryenginepb";
option java_multiple_files = true;
option java_outer_classname = "DocumentProcessingConfigProto";
option java_package = "com.google.cloud.discoveryengine.v1alpha";
option objc_class_prefix = "DISCOVERYENGINE";
option php_namespace = "Google\\Cloud\\DiscoveryEngine\\V1alpha";
option ruby_package = "Google::Cloud::DiscoveryEngine::V1alpha";

// A singleton resource of
// [DataStore][google.cloud.discoveryengine.v1alpha.DataStore]. It's empty when
// [DataStore][google.cloud.discoveryengine.v1alpha.DataStore] is created, which
// defaults to digital parser. The first call to
// [DataStoreService.UpdateDocumentProcessingConfig][google.cloud.discoveryengine.v1alpha.DataStoreService.UpdateDocumentProcessingConfig]
// method will initialize the config.
message DocumentProcessingConfig {
  option (google.api.resource) = {
    type: "discoveryengine.googleapis.com/DocumentProcessingConfig"
    pattern: "projects/{project}/locations/{location}/dataStores/{data_store}/documentProcessingConfig"
    pattern: "projects/{project}/locations/{location}/collections/{collection}/dataStores/{data_store}/documentProcessingConfig"
  };

  // Configuration for chunking.
  message ChunkingConfig {
    // Configuration for the layout based chunking.
    message LayoutBasedChunkingConfig {
      // The token size limit for each chunk.
      //
      // Supported values: 100-500 (inclusive).
      // Default value: 500.
      int32 chunk_size = 1;

      // Whether to include appending different levels of headings to chunks
      // from the middle of the document to prevent context loss.
      //
      // Default value: False.
      bool include_ancestor_headings = 2;
    }

    // Additional configs that define the behavior of the chunking.
    oneof chunk_mode {
      // Configuration for the layout based chunking.
      LayoutBasedChunkingConfig layout_based_chunking_config = 1;
    }
  }

  // Related configurations applied to a specific type of document parser.
  message ParsingConfig {
    // The digital parsing configurations for documents.
    message DigitalParsingConfig {}

    // The OCR parsing configurations for documents.
    message OcrParsingConfig {
      // [DEPRECATED] This field is deprecated. To use the additional enhanced
      // document elements processing, please switch to `layout_parsing_config`.
      repeated string enhanced_document_elements = 1 [deprecated = true];

      // If true, will use native text instead of OCR text on pages containing
      // native text.
      bool use_native_text = 2;
    }

    // The layout parsing configurations for documents.
    message LayoutParsingConfig {}

    // Configs for document processing types. At most one parser-specific
    // configuration can be set, since the members share a oneof.
    oneof type_dedicated_config {
      // Configurations applied to digital parser.
      DigitalParsingConfig digital_parsing_config = 1;

      // Configurations applied to OCR parser. Currently it only applies to
      // PDFs.
      OcrParsingConfig ocr_parsing_config = 2;

      // Configurations applied to layout parser.
      LayoutParsingConfig layout_parsing_config = 3;
    }
  }

  // The full resource name of the Document Processing Config.
  // Format:
  // `projects/*/locations/*/collections/*/dataStores/*/documentProcessingConfig`.
  string name = 1;

  // Whether chunking mode is enabled.
  ChunkingConfig chunking_config = 3;

  // Configurations for default Document parser.
  // If not specified, we will configure it as default DigitalParsingConfig, and
  // the default parsing config will be applied to all file types for Document
  // parsing.
  ParsingConfig default_parsing_config = 4;

  // Map from file type to override the default parsing configuration based on
  // the file type. Supported keys:
  //
  // * `pdf`: Override parsing config for PDF files, either digital parsing, OCR
  // parsing or layout parsing is supported.
  // * `html`: Override parsing config for HTML files, only digital parsing and
  // layout parsing are supported.
  // * `docx`: Override parsing config for DOCX files, only digital parsing and
  // layout parsing are supported.
  // * `pptx`: Override parsing config for PPTX files, only digital parsing and
  // layout parsing are supported.
  // * `xlsx`: Override parsing config for XLSX files, only digital parsing and
  // layout parsing are supported.
  map<string, ParsingConfig> parsing_config_overrides = 5;
}