// Copyright 2024 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.documentai.v1;

import "google/protobuf/field_mask.proto";

option csharp_namespace = "Google.Cloud.DocumentAI.V1";
option go_package = "cloud.google.com/go/documentai/apiv1/documentaipb;documentaipb";
option java_multiple_files = true;
option java_outer_classname = "DocumentIoProto";
option java_package = "com.google.cloud.documentai.v1";
option php_namespace = "Google\\Cloud\\DocumentAI\\V1";
option ruby_package = "Google::Cloud::DocumentAI::V1";

// Payload message of raw document content (bytes).
message RawDocument {
  // Inline document content.
  bytes content = 1;

  // An IANA MIME type (RFC6838) indicating the nature and format of the
  // [content][google.cloud.documentai.v1.RawDocument.content].
  string mime_type = 2;

  // The display name of the document. It supports all Unicode characters except
  // the following:
  // `*`, `?`, `[`, `]`, `%`, `{`, `}`, `'`, `\"`, `,`
  // `~`, `=` and `:` are reserved.
  // If not specified, a default ID is generated.
  string display_name = 3;
}

// Specifies a document stored on Cloud Storage.
message GcsDocument {
  // The Cloud Storage object URI.
  string gcs_uri = 1;

  // An IANA MIME type (RFC6838) of the content.
  string mime_type = 2;
}

// Specifies a set of documents on Cloud Storage.
message GcsDocuments {
  // The list of documents.
  repeated GcsDocument documents = 1;
}

// Specifies all documents on Cloud Storage with a common prefix.
message GcsPrefix {
  // The URI prefix.
  string gcs_uri_prefix = 1;
}

// The common config to specify a set of documents used as input.
message BatchDocumentsInputConfig {
  // The source. At most one member of this oneof can be set.
  oneof source {
    // The set of documents that match the specified Cloud Storage `gcs_prefix`.
    GcsPrefix gcs_prefix = 1;

    // The set of documents individually specified on Cloud Storage.
    GcsDocuments gcs_documents = 2;
  }
}

// Config that controls the output of documents. All documents will be written
// as a JSON file.
message DocumentOutputConfig {
  // The configuration used when outputting documents.
  message GcsOutputConfig {
    // The sharding config for the output document.
    message ShardingConfig {
      // The number of pages per shard.
      int32 pages_per_shard = 1;

      // The number of overlapping pages between consecutive shards.
      int32 pages_overlap = 2;
    }

    // The Cloud Storage URI (a directory) of the output.
    string gcs_uri = 1;

    // Specifies which fields to include in the output documents.
    // Only supports top-level document and pages fields, so it must be in the
    // form of `{document_field_name}` or `pages.{page_field_name}`.
    google.protobuf.FieldMask field_mask = 2;

    // Specifies the sharding config for the output document.
    ShardingConfig sharding_config = 3;
  }

  // The destination of the results. At most one member of this oneof can be
  // set.
  oneof destination {
    // Output config to write the results to Cloud Storage.
    GcsOutputConfig gcs_output_config = 1;
  }
}

// Config for Document OCR.
message OcrConfig {
  // Hints for the OCR engine.
  message Hints {
    // List of BCP-47 language codes to use for OCR. In most cases, not
    // specifying it yields the best results since it enables automatic language
    // detection. For languages based on the Latin alphabet, setting hints is
    // not needed. In rare cases, when the language of the text in the
    // image is known, setting a hint will help get better results (although it
    // will be a significant hindrance if the hint is wrong).
    repeated string language_hints = 1;
  }

  // Configurations for premium OCR features.
  message PremiumFeatures {
    // Turn on selection mark detector in OCR engine. Only available in OCR 2.0
    // (and later) processors.
    bool enable_selection_mark_detection = 3;

    // Turn on font identification model and return font style information.
    bool compute_style_info = 4;

    // Turn on the model that can extract LaTeX math formulas.
    bool enable_math_ocr = 5;
  }

  // Hints for the OCR model.
  Hints hints = 2;

  // Enables special handling for PDFs with existing text information. Results
  // in better text extraction quality in such PDF inputs.
  bool enable_native_pdf_parsing = 3;

  // Enables intelligent document quality scores after OCR. Can help with
  // diagnosing why OCR responses are of poor quality for a given input.
  // Adds additional latency comparable to regular OCR to the process call.
  bool enable_image_quality_scores = 4;

  // A list of advanced OCR options to further fine-tune OCR behavior. Current
  // valid values are:
  //
  // - `legacy_layout`: a heuristics layout detection algorithm, which serves as
  // an alternative to the current ML-based layout detection algorithm.
  // Customers can choose the best suitable layout algorithm based on their
  // situation.
  repeated string advanced_ocr_options = 5;

  // Includes symbol level OCR information if set to true.
  bool enable_symbol = 6;

  // Turn on font identification model and return font style information.
  // Deprecated, use
  // [PremiumFeatures.compute_style_info][google.cloud.documentai.v1.OcrConfig.PremiumFeatures.compute_style_info]
  // instead.
  bool compute_style_info = 8 [deprecated = true];

  // Turn off character box detector in OCR engine. Character box detection is
  // enabled by default in OCR 2.0 (and later) processors.
  bool disable_character_boxes_detection = 10;

  // Configurations for premium OCR features.
  PremiumFeatures premium_features = 11;
}