/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

syntax = "proto2";

package orc.proto;

option java_package = "org.apache.orc";

// NOTE: field numbers and scalar types in this file identify data on the
// wire. Do not renumber fields or change their types; add new fields with
// fresh numbers instead.

// Minimum, maximum, and sum for integer values.
message IntegerStatistics {
  optional sint64 minimum = 1;
  optional sint64 maximum = 2;
  optional sint64 sum = 3;
}

// Minimum, maximum, and sum for double values.
message DoubleStatistics {
  optional double minimum = 1;
  optional double maximum = 2;
  optional double sum = 3;
}

// Minimum, maximum, and total length for string values.
message StringStatistics {
  optional string minimum = 1;
  optional string maximum = 2;
  // sum will store the total length of all strings in a stripe
  optional sint64 sum = 3;
  // If the minimum or maximum value was longer than 1024 bytes, store a
  // lower or upper bound instead of the minimum or maximum values above.
  optional string lowerBound = 4;
  optional string upperBound = 5;
}

// A packed list of counts.
// NOTE(review): presumably one count per bucket (e.g. per boolean value);
// confirm against the writer implementation.
message BucketStatistics {
  repeated uint64 count = 1 [packed=true];
}

// Minimum, maximum, and sum for decimal values, each carried as a string.
message DecimalStatistics {
  optional string minimum = 1;
  optional string maximum = 2;
  optional string sum = 3;
}

message DateStatistics {
  // min,max values saved as days since epoch
  optional sint32 minimum = 1;
  optional sint32 maximum = 2;
}

message TimestampStatistics {
  // min,max values saved as milliseconds since epoch
  optional sint64 minimum = 1;
  optional sint64 maximum = 2;
  optional sint64 minimumUtc = 3;
  optional sint64 maximumUtc = 4;
  // store the lower 6 TS digits for min/max to achieve nanosecond precision
  optional int32 minimumNanos = 5;
  optional int32 maximumNanos = 6;
}

message BinaryStatistics {
  // sum will store the total binary blob length in a stripe
  optional sint64 sum = 1;
}

// Statistics for list and map
message CollectionStatistics {
  optional uint64 minChildren = 1;
  optional uint64 maxChildren = 2;
  optional uint64 totalChildren = 3;
}

// Statistics for a single column: generic counters plus one optional
// sub-message per kind of type-specific statistics.
// NOTE(review): the schema does not use a oneof, so nothing here enforces
// that only one type-specific sub-message is set; confirm the writer's
// convention before relying on it.
message ColumnStatistics {
  optional uint64 numberOfValues = 1;
  optional IntegerStatistics intStatistics = 2;
  optional DoubleStatistics doubleStatistics = 3;
  optional StringStatistics stringStatistics = 4;
  optional BucketStatistics bucketStatistics = 5;
  optional DecimalStatistics decimalStatistics = 6;
  optional DateStatistics dateStatistics = 7;
  optional BinaryStatistics binaryStatistics = 8;
  optional TimestampStatistics timestampStatistics = 9;
  optional bool hasNull = 10;
  optional uint64 bytesOnDisk = 11;
  optional CollectionStatistics collectionStatistics = 12;
}

// One entry of a row index: a packed list of positions plus the column
// statistics for that entry.
message RowIndexEntry {
  repeated uint64 positions = 1 [packed=true];
  optional ColumnStatistics statistics = 2;
}

// A list of row index entries.
message RowIndex {
  repeated RowIndexEntry entry = 1;
}

// A bloom filter: the number of hash functions and the filter bits, which
// can be carried either as fixed64 words (bitset) or as bytes (utf8bitset).
message BloomFilter {
  optional uint32 numHashFunctions = 1;
  repeated fixed64 bitset = 2;
  optional bytes utf8bitset = 3;
}

// A list of bloom filters.
message BloomFilterIndex {
  repeated BloomFilter bloomFilter = 1;
}

// Describes one stream: its kind, the column it belongs to, and its length.
message Stream {
  // if you add new index stream kinds, you need to make sure to update
  // StreamName to ensure it is added to the stripe in the right area
  enum Kind {
    PRESENT = 0;
    DATA = 1;
    LENGTH = 2;
    DICTIONARY_DATA = 3;
    DICTIONARY_COUNT = 4;
    SECONDARY = 5;
    ROW_INDEX = 6;
    BLOOM_FILTER = 7;
    BLOOM_FILTER_UTF8 = 8;
    // Virtual stream kinds to allocate space for encrypted index and data.
    ENCRYPTED_INDEX = 9;
    ENCRYPTED_DATA = 10;

    // stripe statistics streams
    STRIPE_STATISTICS = 100;
    // A virtual stream kind that is used for setting the encryption IV.
    FILE_STATISTICS = 101;
  }
  optional Kind kind = 1;
  optional uint32 column = 2;
  optional uint64 length = 3;
}

// The encoding used for a column.
message ColumnEncoding {
  enum Kind {
    DIRECT = 0;
    DICTIONARY = 1;
    DIRECT_V2 = 2;
    DICTIONARY_V2 = 3;
  }
  optional Kind kind = 1;
  optional uint32 dictionarySize = 2;

  // The encoding of the bloom filters for this column:
  //   0 or missing = none or original
  //   1            = ORC-135 (utc for timestamps)
  optional uint32 bloomEncoding = 3;
}

// The streams and column encodings for one encryption variant of a stripe.
message StripeEncryptionVariant {
  repeated Stream streams = 1;
  repeated ColumnEncoding encoding = 2;
}

// each stripe looks like:
//   index streams
//     unencrypted
//     variant 1..N
//   data streams
//     unencrypted
//     variant 1..N
//   footer

message StripeFooter {
  repeated Stream streams = 1;
  repeated ColumnEncoding columns = 2;
  optional string writerTimezone = 3;
  // one for each column encryption variant
  repeated StripeEncryptionVariant encryption = 4;
}

// the file tail looks like:
//   encrypted stripe statistics: ColumnarStripeStatistics (order by variant)
//   stripe statistics: Metadata
//   footer: Footer
//   postscript: PostScript
//   psLen: byte

// A string key/value pair.
message StringPair {
  optional string key = 1;
  optional string value = 2;
}

// Describes one type: its kind, child type references, field names, and
// kind-specific parameters (maximumLength, precision, scale) plus
// free-form attributes.
message Type {
  enum Kind {
    BOOLEAN = 0;
    BYTE = 1;
    SHORT = 2;
    INT = 3;
    LONG = 4;
    FLOAT = 5;
    DOUBLE = 6;
    STRING = 7;
    BINARY = 8;
    TIMESTAMP = 9;
    LIST = 10;
    MAP = 11;
    STRUCT = 12;
    UNION = 13;
    DECIMAL = 14;
    DATE = 15;
    VARCHAR = 16;
    CHAR = 17;
    TIMESTAMP_INSTANT = 18;
  }
  optional Kind kind = 1;
  repeated uint32 subtypes = 2 [packed=true];
  repeated string fieldNames = 3;
  optional uint32 maximumLength = 4;
  optional uint32 precision = 5;
  optional uint32 scale = 6;
  repeated StringPair attributes = 7;
}

message StripeInformation {
  // the global file offset of the start of the stripe
  optional uint64 offset = 1;
  // the number of bytes of index
  optional uint64 indexLength = 2;
  // the number of bytes of data
  optional uint64 dataLength = 3;
  // the number of bytes in the stripe footer
  optional uint64 footerLength = 4;
  // the number of rows in this stripe
  optional uint64 numberOfRows = 5;
  // If this is present, the reader should use this value for the encryption
  // stripe id for setting the encryption IV. Otherwise, the reader should
  // use one larger than the previous stripe's encryptStripeId.
  // For unmerged ORC files, the first stripe will use 1 and the rest of the
  // stripes won't have it set. For merged files, the stripe information
  // will be copied from their original files and thus the first stripe of
  // each of the input files will reset it to 1.
  // Note that 1 was chosen, because protobuf v3 doesn't serialize
  // primitive types that are the default (eg. 0).
  optional uint64 encryptStripeId = 6;
  // For each encryption variant, the new encrypted local key to use
  // until we find a replacement.
  repeated bytes encryptedLocalKeys = 7;
}

// A named user metadata entry with an opaque bytes value.
message UserMetadataItem {
  optional string name = 1;
  optional bytes value = 2;
}

// StripeStatistics (one per stripe), each of which contains the
// ColumnStatistics for each column.
// This message type is only used in ORC v0 and v1.
message StripeStatistics {
  repeated ColumnStatistics colStats = 1;
}

// This message type is only used in ORC v0 and v1.
message Metadata {
  repeated StripeStatistics stripeStats = 1;
}

// In ORC v2 (and for encrypted columns in v1), each column has
// their column statistics written separately.
message ColumnarStripeStatistics {
  // one value for each stripe in the file
  repeated ColumnStatistics colStats = 1;
}

enum EncryptionAlgorithm {
  UNKNOWN_ENCRYPTION = 0;  // used for detecting future algorithms
  AES_CTR_128 = 1;
  AES_CTR_256 = 2;
}

// A list of column statistics.
message FileStatistics {
  repeated ColumnStatistics column = 1;
}

// How was the data masked? This isn't necessary for reading the file, but
// is documentation about how the file was written.
message DataMask {
  // the kind of masking, which may include third party masks
  optional string name = 1;
  // parameters for the mask
  repeated string maskParameters = 2;
  // the unencrypted column roots this mask was applied to
  repeated uint32 columns = 3 [packed = true];
}

// Information about the encryption keys.
message EncryptionKey {
  optional string keyName = 1;
  optional uint32 keyVersion = 2;
  optional EncryptionAlgorithm algorithm = 3;
}

// The description of an encryption variant.
// Each variant is a single subtype that is encrypted with a single key.
message EncryptionVariant {
  // the column id of the root
  optional uint32 root = 1;
  // The master key that was used to encrypt the local key, referenced as
  // an index into the Encryption.key list.
  optional uint32 key = 2;
  // the encrypted key for the file footer
  optional bytes encryptedKey = 3;
  // the stripe statistics for this variant
  repeated Stream stripeStatistics = 4;
  // encrypted file statistics as a FileStatistics
  optional bytes fileStatistics = 5;
}

// Which KeyProvider encrypted the local keys.
enum KeyProviderKind {
  UNKNOWN = 0;
  HADOOP = 1;
  AWS = 2;
  GCP = 3;
  AZURE = 4;
}

// File-level encryption description: masks, keys, variants, and the kind
// of key provider.
message Encryption {
  // all of the masks used in this file
  repeated DataMask mask = 1;
  // all of the keys used in this file
  repeated EncryptionKey key = 2;
  // The encrypted variants.
  // Readers should prefer the first variant that the user has access to
  // the corresponding key. If they don't have access to any of the keys,
  // they should get the unencrypted masked data.
  repeated EncryptionVariant variants = 3;
  // How are the local keys encrypted?
  optional KeyProviderKind keyProvider = 4;
}

enum CalendarKind {
  UNKNOWN_CALENDAR = 0;
  // A hybrid Julian/Gregorian calendar with a cutover point in October 1582.
  JULIAN_GREGORIAN = 1;
  // A calendar that extends the Gregorian calendar back forever.
  PROLEPTIC_GREGORIAN = 2;
}

// The file footer: stripe list, type list, user metadata, file-level
// statistics, and information about the writer and encryption.
message Footer {
  optional uint64 headerLength = 1;
  optional uint64 contentLength = 2;
  repeated StripeInformation stripes = 3;
  repeated Type types = 4;
  repeated UserMetadataItem metadata = 5;
  optional uint64 numberOfRows = 6;
  repeated ColumnStatistics statistics = 7;
  optional uint32 rowIndexStride = 8;

  // Each implementation that writes ORC files should register for a code
  //   0 = ORC Java
  //   1 = ORC C++
  //   2 = Presto
  //   3 = Scritchley Go from https://github.com/scritchley/orc
  //   4 = Trino
  optional uint32 writer = 9;

  // information about the encryption in this file
  optional Encryption encryption = 10;
  optional CalendarKind calendar = 11;

  // informative description about the version of the software that wrote
  // the file. It is assumed to be within a given writer, so for example
  // ORC 1.7.2 = "1.7.2". It may include suffixes, such as "-SNAPSHOT".
  optional string softwareVersion = 12;
}

enum CompressionKind {
  NONE = 0;
  ZLIB = 1;
  SNAPPY = 2;
  LZO = 3;
  LZ4 = 4;
  ZSTD = 5;
}

// Serialized length must be less than 255 bytes
message PostScript {
  optional uint64 footerLength = 1;
  optional CompressionKind compression = 2;
  optional uint64 compressionBlockSize = 3;
  // the version of the file format
  //   [0, 11] = Hive 0.11
  //   [0, 12] = Hive 0.12
  repeated uint32 version = 4 [packed = true];
  optional uint64 metadataLength = 5;

  // The version of the writer that wrote the file. This number is
  // updated when we make fixes or large changes to the writer so that
  // readers can detect whether a given bug is present in the data.
  //
  // Only the Java ORC writer may use values under 6 (or missing) so that
  // readers that predate ORC-202 treat the new writers correctly. Each
  // writer should assign their own sequence of versions starting from 6.
  //
  // Version of the ORC Java writer:
  //   0 = original
  //   1 = HIVE-8732 fixed (fixed stripe/file maximum statistics &
  //                        string statistics use utf8 for min/max)
  //   2 = HIVE-4243 fixed (use real column names from Hive tables)
  //   3 = HIVE-12055 added (vectorized writer implementation)
  //   4 = HIVE-13083 fixed (decimals write present stream correctly)
  //   5 = ORC-101 fixed (bloom filters use utf8 consistently)
  //   6 = ORC-135 fixed (timestamp statistics use utc)
  //   7 = ORC-517 fixed (decimal64 min/max incorrect)
  //   8 = ORC-203 added (trim very long string statistics)
  //   9 = ORC-14 added (column encryption)
  //
  // Version of the ORC C++ writer:
  //   6 = original
  //
  // Version of the Presto writer:
  //   6 = original
  //
  // Version of the Scritchley Go writer:
  //   6 = original
  //
  // Version of the Trino writer:
  //   6 = original
  //
  optional uint32 writerVersion = 6;

  // the number of bytes in the encrypted stripe statistics
  optional uint64 stripeStatisticsLength = 7;

  // Leave this last in the record
  optional string magic = 8000;
}

// The contents of the file tail that must be serialized.
// This gets serialized as part of OrcSplit, also used by footer cache.
message FileTail {
  optional PostScript postscript = 1;
  optional Footer footer = 2;
  optional uint64 fileLength = 3;
  optional uint64 postscriptLength = 4;
}