// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.contentwarehouse.v1;

import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/cloud/documentai/v1/document.proto";
import "google/protobuf/timestamp.proto";
import "google/type/datetime.proto";

option csharp_namespace = "Google.Cloud.ContentWarehouse.V1";
option go_package = "cloud.google.com/go/contentwarehouse/apiv1/contentwarehousepb;contentwarehousepb";
option java_multiple_files = true;
option java_outer_classname = "DocumentProto";
option java_package = "com.google.cloud.contentwarehouse.v1";
option php_namespace = "Google\\Cloud\\ContentWarehouse\\V1";
option ruby_package = "Google::Cloud::ContentWarehouse::V1";

// Defines the structure for content warehouse document proto.
message Document {
  option (google.api.resource) = {
    type: "contentwarehouse.googleapis.com/Document"
    pattern: "projects/{project}/locations/{location}/documents/{document}"
    pattern: "projects/{project}/locations/{location}/documents/referenceId/{reference_id}"
  };

  // The resource name of the document.
  // Format:
  // projects/{project_number}/locations/{location}/documents/{document_id}.
  //
  // The name is ignored when creating a document.
  string name = 1;

  // The reference ID set by customers. Must be unique per project and location.
  string reference_id = 11;

  // Required. Display name of the document given by the user. This name will be
  // displayed in the UI. Customer can populate this field with the name of the
  // document. This differs from the 'title' field as 'title' is optional and
  // stores the top heading in the document.
  string display_name = 2 [(google.api.field_behavior) = REQUIRED];

  // Title that describes the document.
  // This can be the top heading or text that describes the document.
  string title = 18;

  // Uri to display the document, for example, in the UI.
  string display_uri = 17;

  // The Document schema name.
  // Format:
  // projects/{project_number}/locations/{location}/documentSchemas/{document_schema_id}.
  string document_schema_name = 3 [(google.api.resource_reference) = {
    type: "contentwarehouse.googleapis.com/DocumentSchema"
  }];

  // Structured content of the document. At most one of the members is set.
  oneof structured_content {
    // Other document format, such as PPTX, XLSX
    string plain_text = 15;

    // Document AI format to save the structured content, including OCR.
    google.cloud.documentai.v1.Document cloud_ai_document = 4;
  }

  // A path linked to structured content file.
  string structured_content_uri = 16 [deprecated = true];

  // Raw document file.
  oneof raw_document {
    // Raw document file in Cloud Storage path.
    string raw_document_path = 5;

    // Raw document content.
    bytes inline_raw_document = 6;
  }

  // List of values that are user supplied metadata.
  repeated Property properties = 7;

  // Output only. The time when the document is last updated.
  google.protobuf.Timestamp update_time = 8
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The time when the document is created.
  google.protobuf.Timestamp create_time = 9
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // This is used when DocAI was not used to load the document and parsing/
  // extracting is needed for the inline_raw_document.  For example, if
  // inline_raw_document is the byte representation of a PDF file, then
  // this should be set to: RAW_DOCUMENT_FILE_TYPE_PDF.
  RawDocumentFileType raw_document_file_type = 10;

  // If true, makes the document visible to asynchronous policies and rules.
  bool async_enabled = 12 [deprecated = true];

  // Indicates the category (image, audio, video etc.) of the original content.
  ContentCategory content_category = 20;

  // If true, text extraction will not be performed.
  bool text_extraction_disabled = 19 [deprecated = true];

  // If true, text extraction will be performed.
  bool text_extraction_enabled = 21;

  // The user who creates the document.
  string creator = 13;

  // The user who last updated the document.
  string updater = 14;
}

// References to the documents.
message DocumentReference {
  // Required. Name of the referenced document.
  string document_name = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "contentwarehouse.googleapis.com/Document"
    }
  ];

  // display_name of the referenced document; this name does not need to be
  // consistent to the display_name in the Document proto, depending on the ACL
  // constraint.
  string display_name = 2;

  // Stores the subset of the referenced document's content.
  // This is useful to allow user peek the information of the referenced
  // document.
  string snippet = 3;

  // The document type of the document being referenced.
  bool document_is_folder = 4;

  // Output only. The time when the document is last updated.
  google.protobuf.Timestamp update_time = 5
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The time when the document is created.
  google.protobuf.Timestamp create_time = 6
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The time when the document is deleted.
  google.protobuf.Timestamp delete_time = 7
      [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Property of a document.
message Property {
  // Required. Must match the name of a PropertyDefinition in the
  // DocumentSchema.
  string name = 1 [(google.api.field_behavior) = REQUIRED];

  // Type of the property.
  // Must match the property_options type of the matching PropertyDefinition.
  // Value of the Property parsed into a specific data type.
  // Specific type value(s) obtained from Document AI's Property.mention_text
  // field.
  oneof values {
    // Integer property values.
    IntegerArray integer_values = 2;

    // Float property values.
    FloatArray float_values = 3;

    // String/text property values.
    TextArray text_values = 4;

    // Enum property values.
    EnumArray enum_values = 5;

    // Nested structured data property values.
    PropertyArray property_values = 6;

    // Date time property values.
    // It is not supported by CMEK compliant deployment.
    DateTimeArray date_time_values = 7;

    // Map property values.
    MapProperty map_property = 8;

    // Timestamp property values.
    // It is not supported by CMEK compliant deployment.
    TimestampArray timestamp_values = 9;
  }
}

// Integer values.
message IntegerArray {
  // List of integer values.
  repeated int32 values = 1;
}

// Float values.
message FloatArray {
  // List of float values.
  repeated float values = 1;
}

// String/text values.
message TextArray {
  // List of text values.
  repeated string values = 1;
}

// Enum values.
message EnumArray {
  // List of enum values.
  repeated string values = 1;
}

// DateTime values.
message DateTimeArray {
  // List of datetime values.
  // Both OffsetDateTime and ZonedDateTime are supported.
  repeated google.type.DateTime values = 1;
}

// Timestamp values.
message TimestampArray {
  // List of timestamp values.
  repeated TimestampValue values = 1;
}

// Timestamp value type.
message TimestampValue {
  // The timestamp, given either as a Timestamp message or as text.
  oneof value {
    // Timestamp value
    google.protobuf.Timestamp timestamp_value = 1;

    // The string must represent a valid instant in UTC and is parsed using
    // java.time.format.DateTimeFormatter.ISO_INSTANT.
    // e.g. "2013-09-29T18:46:19Z"
    string text_value = 2;
  }
}

// Property values.
message PropertyArray {
  // List of property values.
  repeated Property properties = 1;
}

// Map property value.
// Represents structured entries of key-value pairs, consisting of field names
// which map to dynamically typed values.
message MapProperty {
  // Unordered map of dynamically typed values, keyed by field name.
  map<string, Value> fields = 1;
}

// `Value` represents a dynamically typed value which can be a float, an
// integer, a string, an enum, a datetime, a timestamp, or a boolean value. A
// producer of value is expected to set one of these variants. Absence of any
// variant indicates an error.
message Value {
  // The kind of value.
  oneof kind {
    // Represents a float value.
    float float_value = 1;

    // Represents an integer value.
    int32 int_value = 2;

    // Represents a string value.
    string string_value = 3;

    // Represents an enum value.
    EnumValue enum_value = 4;

    // Represents a datetime value.
    google.type.DateTime datetime_value = 5;

    // Represents a timestamp value.
    TimestampValue timestamp_value = 6;

    // Represents a boolean value.
    bool boolean_value = 7;
  }
}

// Represents the string value of the enum field.
message EnumValue {
  // String value of the enum field. This must match defined set of enums
  // in document schema using EnumTypeOptions.
  string value = 1;
}

// When a raw document is supplied, this indicates the file format
enum RawDocumentFileType {
  // No raw document specified or it is non-parsable
  RAW_DOCUMENT_FILE_TYPE_UNSPECIFIED = 0;

  // Adobe PDF format
  RAW_DOCUMENT_FILE_TYPE_PDF = 1;

  // Microsoft Word format
  RAW_DOCUMENT_FILE_TYPE_DOCX = 2;

  // Microsoft Excel format
  RAW_DOCUMENT_FILE_TYPE_XLSX = 3;

  // Microsoft Powerpoint format
  RAW_DOCUMENT_FILE_TYPE_PPTX = 4;

  // UTF-8 encoded text format
  RAW_DOCUMENT_FILE_TYPE_TEXT = 5;

  // TIFF or TIF image file format
  RAW_DOCUMENT_FILE_TYPE_TIFF = 6;
}

// When a raw document or structured content is supplied, this stores the
// content category.
enum ContentCategory {
  // No category is specified.
  CONTENT_CATEGORY_UNSPECIFIED = 0;

  // Content is of image type.
  CONTENT_CATEGORY_IMAGE = 1;

  // Content is of audio type.
  CONTENT_CATEGORY_AUDIO = 2;

  // Content is of video type.
  CONTENT_CATEGORY_VIDEO = 3;
}