// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors

syntax = "proto3";

package lance.file;

// A file descriptor that describes the contents of a Lance file.
message FileDescriptor {
  // The schema of the file.
  Schema schema = 1;

  // The number of rows in the file.
  uint64 length = 2;
}

// A schema which describes the data type of each of the columns.
message Schema {
  // All fields in this file, including the nested fields.
  repeated lance.file.Field fields = 1;

  // Schema metadata, keyed by name. Iteration order of a protobuf map is
  // undefined, so readers must not rely on it.
  //
  // NOTE(review): the map type parameters were lost in the mangled source and
  // have been restored as <string, bytes>; confirm against existing writers.
  map<string, bytes> metadata = 5;
}

// Metadata of one Lance file.
message Metadata {
  // 4 was used for StatisticsMetadata in the past, but has been moved to
  // prevent a bug in older readers.
  reserved 4;

  // Position of the manifest in the file. If it is zero, the manifest is
  // stored externally.
  uint64 manifest_position = 1;

  // Logical offsets of each chunk group, i.e., number of the rows in each
  // chunk.
  repeated int32 batch_offsets = 2;

  // The file position at which the page table is stored.
  //
  // A page table is a matrix of N x M x 2, where N = num_fields, and M =
  // num_batches. Each cell in the table is a pair of
  // <position:int64, length:int64> of the page. Both position and length are
  // int64 values. The <position, length> pairs of all the pages in the same
  // column are then contiguously stored.
  //
  // Every field that is a part of the file will have a run in the page table.
  // This includes struct columns, which will have a run of length 0 since
  // they don't store any actual data.
  //
  // For example, for the column 5 and batch 4, we have:
  // ```text
  //   position = page_table[5][4][0];
  //   length = page_table[5][4][1];
  // ```
  uint64 page_table_position = 3;

  // Describes where the per-file statistics are stored and how they are laid
  // out.
  message StatisticsMetadata {
    // The schema of the statistics.
    //
    // This might be empty, meaning there are no statistics. It also might not
    // contain statistics for every field.
    repeated Field schema = 1;

    // The field ids of the statistics leaf fields.
    //
    // This plays a similar role to the `fields` field in the DataFile message.
    // Each of these field ids corresponds to a field in the stats_schema.
    // There is one per column in the stats page table.
    repeated int32 fields = 2;

    // The file position of the statistics page table.
    //
    // The page table is a matrix of N x 2, where N = length of stats_fields.
    // This is the same layout as the main page table, except there is always
    // only one batch.
    //
    // For example, to get the stats column 5, we have:
    // ```text
    //   position = stats_page_table[5][0];
    //   length = stats_page_table[5][1];
    // ```
    uint64 page_table_position = 3;
  }

  // Statistics metadata for this file. Stored under field number 5 because
  // field number 4 is reserved (see the note at the top of this message).
  StatisticsMetadata statistics = 5;
}  // Metadata

// Supported encodings.
enum Encoding {
  // Invalid encoding.
  NONE = 0;
  // Plain encoding.
  PLAIN = 1;
  // Var-length binary encoding.
  VAR_BINARY = 2;
  // Dictionary encoding.
  DICTIONARY = 3;
  // Run-length encoding.
  RLE = 4;
}

// Dictionary field metadata.
message Dictionary {
  // The file offset for storing the dictionary value.
  // It is only valid if encoding is DICTIONARY.
  //
  // The logical type represents the value type of the column, i.e., string
  // value.
  int64 offset = 1;

  // The length of dictionary values.
  int64 length = 2;
}

// Field metadata for a column.
message Field {
  // Structural role of a field. See the `logical_type` comment below for the
  // logical types each role may carry.
  enum Type {
    // A field with child fields; its logical type is always "struct".
    PARENT = 0;
    // A list-like field ("list", "large_list", and their ".struct" variants).
    REPEATED = 1;
    // A field that carries one of the LEAF logical types listed below.
    LEAF = 2;
  }

  // Structural role of this field.
  Type type = 1;

  // Fully qualified name.
  string name = 2;

  // Field Id.
  //
  // See the comment in `DataFile.fields` for how field ids are assigned.
  int32 id = 3;

  // Parent Field ID. If not set, this is a top-level column.
  //
  // NOTE(review): this is a proto3 scalar with implicit presence, so an unset
  // value is indistinguishable from 0 on the wire; confirm which sentinel
  // writers use to mark a top-level column.
  int32 parent_id = 4;

  // Logical types, support parameterized Arrow Type.
  //
  // PARENT types will always have logical type "struct".
  //
  // REPEATED types may have logical types:
  // * "list"
  // * "large_list"
  // * "list.struct"
  // * "large_list.struct"
  // The final two are used if the list values are structs, and therefore the
  // field is both implicitly REPEATED and PARENT.
  //
  // LEAF types may have logical types:
  // * "null"
  // * "bool"
  // * "int8" / "uint8"
  // * "int16" / "uint16"
  // * "int32" / "uint32"
  // * "int64" / "uint64"
  // * "halffloat" / "float" / "double"
  // * "string" / "large_string"
  // * "binary" / "large_binary"
  // * "date32:day"
  // * "date64:ms"
  // * "decimal:128:{precision}:{scale}" / "decimal:256:{precision}:{scale}"
  // * "time:{unit}" / "timestamp:{unit}" / "duration:{unit}", where unit is
  //   "s", "ms", "us", "ns"
  // * "dict:{value_type}:{index_type}:false"
  string logical_type = 5;

  // If this field is nullable.
  bool nullable = 6;

  // Encoding used for this field's data.
  Encoding encoding = 7;

  // The file offset for storing the dictionary value.
  // It is only valid if encoding is DICTIONARY.
  //
  // The logical type represents the value type of the column, i.e., string
  // value.
  Dictionary dictionary = 8;

  // Deprecated: optional extension type name, use metadata field
  // ARROW:extension:name
  string extension_name = 9;

  // Optional field metadata (e.g. extension type name/parameters).
  //
  // NOTE(review): the map type parameters were lost in the mangled source and
  // have been restored as <string, bytes>; confirm against existing writers.
  map<string, bytes> metadata = 10;
}