// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors

syntax = "proto3";

package lance.encodings;
 
import "google/protobuf/empty.proto";

// This file contains a specification for encodings that can be used
// to store and load Arrow data into a Lance file.
//
// # Types
//
// This file assumes the user wants to load data into Arrow arrays and
// explains how to map Arrow arrays into Lance files.  Encodings are divided
// into "array encoding" (which maps to an Arrow array and may contain multiple
// buffers) and "buffer encoding" (which encodes a single buffer of data).
//
// # Encoding Tree
//
// Most encodings are layered on top of each other.  These form a tree of
// encodings with a single root node.  To encode an array you will typically
// start with the root node and then take the output from that root encoding
// and feed it into child encodings.  The decoding process works in reverse.
//
// # Multi-column Encodings
//
// Some Arrow arrays will map to more than one column of Lance data.  For
// example, struct arrays and list arrays.  This file only contains encodings
// for a single column.  However, it does describe how multi-column arrays can
// be encoded.

// A pointer to a buffer in a Lance file
//
// A writer can place a buffer in three different locations.  The buffer
// can go in the data page, in the column metadata, or in the file metadata.
// The writer is free to choose whatever is most appropriate (for example, a dictionary
// that is shared across all pages in a column will probably go in the column
// metadata).  This specification does not dictate where the buffer should go.
message Buffer {
    // The index of the buffer in the collection of buffers
    uint32 buffer_index = 1;
    // The collection holding the buffer
    enum BufferType {
      // The buffer is stored in the data page itself
      page = 0;
      // The buffer is stored in the column metadata
      column = 1;
      // The buffer is stored in the file metadata
      file = 2;
    };
    BufferType buffer_type = 2;
}

// An encoding that adds nullability to another array encoding
//
// This can wrap any array encoding and add nullability information
message Nullable {
  message NoNull {
    ArrayEncoding values = 1;
  }
  message AllNull {}
  message SomeNull {
    ArrayEncoding validity = 1;
    ArrayEncoding values = 2;
  }
  oneof nullability {
    // The array has no nulls and there is a single buffer needed
    NoNull no_nulls = 1;
    // The array may have nulls and we need two buffers
    SomeNull some_nulls = 2;
    // All values are null (no buffers needed)
    AllNull all_nulls = 3;
  }
}

// An array encoding for variable-length list fields
message List {
    // An array containing the offsets into an items array.
    //
    // This array will have num_rows items and will never
    // have nulls.
    //
    // If the list at index i is not null then offsets[i] will
    // contain `base + len(list)` where `base` is defined as:
    //   i == 0: 0
    //   i >  0: (offsets[i-1] % null_offset_adjustment)
    //
    // To help understand we can consider the following example list:
    // [ [A, B], null, [], [C, D, E] ]
    //
    // The offsets will be [2, ?, 2, 5]
    //
    // If the incoming list at index i IS null then offsets[i] will
    // contain `base + len(list) + null_offset_adjustment` where `base`
    // is defined the same as above.
    //
    // To complete the above example let's assume that `null_offset_adjustment`
    // is 7.  Then the offsets will be [2, 9, 2, 5]
    //
    // If there are no nulls then the offsets we write here are exactly the
    // same as the offsets in an Arrow list array (except we omit the leading
    // 0 which is redundant)
    //
    // The reason we do this is so that reading a single list at index i only
    // requires us to load the indices at i and i-1.
    //
    // If the offset at index i is greater than `null_offset_adjustment``
    // then the list at index i is null.
    //
    // Otherwise the length of the list is `offsets[i] - base` where
    // base is defined the same as above.
    //
    // Let's consider our example offsets: [2, 9, 2, 5]
    //
    // We can take any range of lists and determine how many list items are
    // referenced by the sublist.
    //
    // 0..3: [_, 5] -> items 0..5 (base = 0* and end is 5)
    // 0..2: [_, 2] -> items 0..2 (base = 0* and end is 2)
    // 0..1: [_, 9] -> items 0..2 (base = 0* and end is 9 % 7)
    // 1..3: [2, 5] -> items 2..5 (base = 2 and end is 5)
    // 1..2: [2, 2] -> items 2..2 (base = 2 and end is 2)
    // 2..3: [9, 5] -> items 2..5 (base = 9 % 7 and end is 5)
    //
    // * When the start of our range is the 0th item the base is always 0 and we only
    //   need to load a single index from disk to determine the range.
    //
    // The data type of the offsets array is flexible and does not need
    // to match the data type of the destination array.  Please note that the offsets
    // array is very likely to be efficiently encoded by bit packing deltas.
    ArrayEncoding offsets = 1;
    // If a list is null then we add this value to the offset
    //
    // This value must be greater than the length of the items so that
    // (offset + null_offset_adjustment) is never used by a non-null list.
    //
    // Note that this value cannot be equal to the length of the items
    // because then a page with a single list would store [ X ] and we
    // couldn't know if that is a null list or a list with X items.
    //
    // Therefore, the best choice for this value is 1 + # of items.
    // Choosing this will maximize the bit packing that we can apply to the offsets.
    uint64 null_offset_adjustment = 2;
    // How many items are referenced by these offsets.  This is needed in
    // order to determine which items pages map to this offsets page.
    uint64 num_items = 3;
}

// An array encoding for fixed-size list fields
message FixedSizeList {
  /// The number of items in each list
  uint32 dimension = 1;
  /// The items in the list
  ArrayEncoding items = 2;
}

message Compression {
  string scheme = 1;
}

// Fixed width items placed contiguously in a buffer
message Flat {
  // the number of bits per value, must be greater than 0, does
  // not need to be a multiple of 8
  uint64 bits_per_value = 1;
  // the buffer of values
  Buffer buffer = 2;
  // The Compression message can specify the compression scheme (e.g. zstd) and any
  // other information that is needed for decompression.
  Compression compression = 3;
}

// An array encoding for shredded structs that will never be null
//
// There is no actual data in this column.
//
// TODO: Struct validity bitmaps will be placed here.
message SimpleStruct {}

// An array encoding for binary fields
message Binary {
  ArrayEncoding indices = 1;
  ArrayEncoding bytes = 2;
  uint64 null_adjustment = 3;
}

// An array encoding for dictionary-encoded fields
message Dictionary {
  ArrayEncoding indices = 1;
  ArrayEncoding items = 2;
  uint32 num_dictionary_items = 3;
}

// Encodings that decode into an Arrow array
message ArrayEncoding {
    oneof array_encoding {
        Flat flat = 1;
        Nullable nullable = 2;
        FixedSizeList fixed_size_list = 3;
        List list = 4;
        SimpleStruct struct = 5;
        Binary binary = 6;
        Dictionary dictionary = 7;
    }
}

// Wraps a column with a zone map index that can be used
// to apply pushdown filters
message ZoneIndex {
  uint32 rows_per_zone = 1;
  Buffer zone_map_buffer = 2;
  ColumnEncoding inner = 3;
}

// Encodings that describe a column of values
message ColumnEncoding {
  oneof column_encoding {
    // No special encoding, just column values
    google.protobuf.Empty values = 1;
    ZoneIndex zone_index = 2;
  }
}