// Copyright 2015 The Gemmlowp Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // kernel.h: general definitions for kernels. #ifndef GEMMLOWP_INTERNAL_KERNEL_H_ #define GEMMLOWP_INTERNAL_KERNEL_H_ #include "../public/bit_depth.h" #include "common.h" namespace gemmlowp { // Explanation of general gemmlowp terminology // =========================================== // // We use the following abbreviations: // LHS = "left-hand side" // RHS = "right-hand side" // Sometimes when referring to either LHS or RHS, we just say a "Side". // // In a matrix product of a MxK matrix times a KxN matrix, // we call K the 'depth'. Note that M is the number of rows // of the result (and of the LHS), and N is the number of columns // of the result (and of the RHS). // // In each of the LHS and RHS matrices, we call 'width' the // other dimension, besides the depth. So in the LHS, 'width' // is the number of rows, while in the RHS, 'width' is the number // of columns. // // So in the LHS MxK matrix, the depth is K and the width in M. // And in the RHS KxN matrix, the depth is K and the width in N. // // This is illustrated in this picture: // // RHS width // <-----------------> // +-----------------+ ^ // | RHS | | Depth // +-----------------+ v // ^ +--+ +-----------------+ // | |L | | | // LHS width | |H | | Result | // | |S | | | // v +--+ +-----------------+ // <--> // Depth // Explanation of gemmlowp kernel formats and "cells" // ================================================== // // Kernels operate on small LHS and RHS blocks that fit in registers. // These blocks are stored contiguously in memory, but not always // in a traditional column-major or row-major order; instead, // they consist of a number of sub-blocks, which we call "cells", // that are stored in column-major or row-major order. However, // what really matters to us is not so much rows vs columns, but // rather width vs depth. So we refer to "width-major" and "depth-major" // storage orders. In the LHS, width-major means row-major, // while in the RHS, width-major means column-major. // There is also a third possibility, "diagonal order", // which is unused at the moment. // // We aim to treat both sides, LHS and RHS, on an equal footing, // so we call them both 'sides'. A KernelFormat thus is just a pair // of KernelSideFormat's, one for LHS and one for RHS; each KernelSideFormat // contains a CellFormat and a number of cells; cells are only ever // stacked in the width dimension, which means stacked vertically in the // LHS and stacked horizondally in the RHS. // // Example // ======= // // Let's work out the data layout expected by a kernel having the // following format (the struct names here are defined below in this file): // // KernelFormat< // KernelSideFormat, 3>, // KernelSideFormat, 2> // > // // The LHS format, KernelSideFormat, 3>, means: // 3 cells, each cell having dimensions (width=3, depth=4), laid out in // DepthMajor order (the default value, see CellFormat). In the LHS, // DepthMajor means column-major, so the LHS cells are of size 3x4 in // column-major order, so the LHS layout is: // // 0 3 6 9 // 1 4 7 10 // 2 5 8 11 // 12 15 18 21 // 13 16 19 22 // 14 17 20 23 // 24 27 30 33 // 25 28 31 34 // 26 29 32 35 // // The RHS format, KernelSideFormat, 2>, means: // 2 cells each having dimensions (width=5, depth=4), laid out in // DepthMajor order (the default value, see CellFormat). In the RHS, // DepthMajor means row-major, so the RHS cells are of size 4x5 in // row-major order, so the RHS layout is: // // 0 1 2 3 4 20 21 22 23 24 // 5 6 7 8 9 25 26 27 28 29 // 10 11 12 13 14 30 31 32 33 34 // 15 16 17 18 19 35 36 37 38 39 // CellOrder enumerates the possible storage orders (=layouts) for // a cell (see explanation above). enum class CellOrder { DepthMajor, WidthMajor, Diagonal }; // CellFormat describes how data is laid // out in a cell. That is, a CellOrder together with actual dimensions. template struct CellFormat { static const int kWidth = tWidth; static const int kDepth = tDepth; static const CellOrder kOrder = tOrder; static const int kSize = kWidth * kDepth; }; // KernelSideFormat describes how data is laid out in a kernel side // (i.e. LHS or RHS). That is, a CellFormat together with a number of // cells. These cells are always stacked in the Width dimension. // For example, in the LHS case, the Width dimension is the rows dimension, // se we're saying that in the LHS, cells are stacked vertically. // We never stack cells in the Depth dimension. template struct KernelSideFormat { typedef tCellFormat Cell; static const int kCells = tCells; static const int kWidth = kCells * Cell::kWidth; static const int kDepth = Cell::kDepth; typedef std::uint8_t Scalar; // The scalar type of the Format. typedef std::uint8_t InputScalar; // The scalar type of the original input. }; // KernelSideFormat for int8 fast kernel trick. The original input is uint8, but // packs converts it to int8. template struct KernelSideFormatInt8 : KernelSideFormat { typedef std::int8_t Scalar; typedef std::uint8_t InputScalar; }; // KernelSideFormat for int8 inputs, enabling int8 fast kernel trick without // pack conversion. template struct KernelSideFormatInt8Inputs : KernelSideFormat { typedef std::int8_t Scalar; typedef std::int8_t InputScalar; }; // KernelFormat describes fully the input data layout that a kernel expects. // It consists of two KernelSideFormat's, one for LHS and one for RHS. template struct KernelFormat { typedef tLhs Lhs; typedef tRhs Rhs; static_assert(Lhs::Cell::kDepth == Rhs::Cell::kDepth, ""); static const int kDepth = Lhs::Cell::kDepth; static const int kRows = Lhs::Cell::kWidth * Lhs::kCells; static const int kCols = Rhs::Cell::kWidth * Rhs::kCells; }; inline const char* CellOrderName(CellOrder o) { switch (o) { case CellOrder::DepthMajor: return "DepthMajor"; case CellOrder::WidthMajor: return "WidthMajor"; case CellOrder::Diagonal: return "Diagonal"; default: assert(false); return nullptr; } } // Returns the offset into a cell, at which a given coefficient is stored. template inline int OffsetIntoCell(int w, int d) { const int size = CellFormat::kWidth; switch (CellFormat::kOrder) { case CellOrder::DepthMajor: return w + d * CellFormat::kWidth; case CellOrder::WidthMajor: return d + w * CellFormat::kDepth; case CellOrder::Diagonal: assert(CellFormat::kWidth == CellFormat::kDepth); return ((size + w - d) * size + d) % (size * size); default: assert(false); return 0; } } // KernelBase is the virtual base class below all kernels. // The idea is that we don't need to templatize all our code on the exact // kernel type; we only need to templatize on kernel format. Kernels // sharing the same format can thus share the same packing/unpacking code. struct KernelBase { virtual const char* Name() const = 0; // This is the kernel implementation. We use the word 'run' consistently // throughout gemmlowp to mean an inner loop, the implementation of which // is to be provided by a separate optimized function. virtual void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride, std::size_t dst_col_stride, const std::uint8_t* lhs_ptr, const std::uint8_t* rhs_ptr, std::size_t start_depth, std::size_t run_depth) const = 0; virtual ~KernelBase() {} }; template struct ZeroPointInputValue {}; template <> struct ZeroPointInputValue { static constexpr std::uint8_t kValue = 0; }; template <> struct ZeroPointInputValue { static constexpr std::uint8_t kValue = 128; }; template <> struct ZeroPointInputValue { static constexpr std::uint8_t kValue = 0; }; } // namespace gemmlowp #endif // GEMMLOWP_INTERNAL_KERNEL_H_