/*
 * deflate_decompress.c - a decompressor for DEFLATE
 *
 * Copyright 2016 Eric Biggers
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 *
 * SPDX-FileCopyrightText: Copyright (c) 2020, 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * ---------------------------------------------------------------------------
 *
 * This is a highly optimized DEFLATE decompressor. When compiled with gcc on
 * x86_64, it decompresses data in about 52% of the time of zlib (48% if BMI2
 * instructions are available). On other architectures it should still be
 * significantly faster than zlib, but the difference may be smaller.
 *
 * Why this is faster than zlib's implementation:
 *
 * - Word accesses rather than byte accesses when reading input
 * - Word accesses rather than byte accesses when copying matches
 * - Faster Huffman decoding combined with various DEFLATE-specific tricks
 * - Larger bitbuffer variable that doesn't need to be filled as often
 * - Other optimizations to remove unnecessary branches
 * - Only full-buffer decompression is supported, so the code doesn't need to
 *   support stopping and resuming decompression.
 * - On x86_64, compile a version of the decompression routine using BMI2
 *   instructions and use it automatically at runtime when supported.
 */

#include <string.h>

#include "deflate_constants.h"
#include "unaligned.h"

#include "libdeflate.h"

/*
 * If the expression passed to SAFETY_CHECK() evaluates to false, then the
 * decompression routine immediately returns LIBDEFLATE_BAD_DATA, indicating
 * the compressed data is invalid.
 *
 * Theoretically, these checks could be disabled for specialized applications
 * where all input to the decompressor will be trusted.
 */
#if 0
#  pragma message("UNSAFE DECOMPRESSION IS ENABLED. THIS MUST ONLY BE USED IF THE DECOMPRESSOR INPUT WILL ALWAYS BE TRUSTED!")
#  define SAFETY_CHECK(expr)	(void)(expr)
#else
#  define SAFETY_CHECK(expr)	if (unlikely(!(expr))) return LIBDEFLATE_BAD_DATA
#endif

/*
 * Each TABLEBITS number is the base-2 logarithm of the number of entries in
 * the main portion of the corresponding decode table. Each number should be
 * large enough to ensure that, for typical data, the vast majority of symbols
 * can be decoded by a direct lookup of the next TABLEBITS bits of compressed
 * data. However, this must be balanced against the fact that a larger table
 * requires more memory and takes more time to fill.
 *
 * Note: you cannot change a TABLEBITS number without also changing the
 * corresponding ENOUGH number!
 */
#define PRECODE_TABLEBITS	7
#define LITLEN_TABLEBITS	10
#define OFFSET_TABLEBITS	8

/*
 * Each ENOUGH number is the maximum number of decode table entries that may
 * be required for the corresponding Huffman code, including the main table
 * and all subtables. Each number depends on three parameters:
 *
 *	(1) the maximum number of symbols in the code (DEFLATE_NUM_*_SYMS)
 *	(2) the number of main table bits (the TABLEBITS numbers defined above)
 *	(3) the maximum allowed codeword length (DEFLATE_MAX_*_CODEWORD_LEN)
 *
 * The ENOUGH numbers were computed using the utility program 'enough' from
 * zlib. This program enumerates all possible relevant Huffman codes to find
 * the worst-case usage of decode table entries.
 */
#define PRECODE_ENOUGH		128	/* enough 19 7 7	*/
#define LITLEN_ENOUGH		1334	/* enough 288 10 15	*/
#define OFFSET_ENOUGH		402	/* enough 32 8 15	*/

/*
 * Type for codeword lengths.
 */
typedef u8 len_t;

/*
 * The main DEFLATE decompressor structure. Since this implementation only
 * supports full buffer decompression, this structure does not store the
 * entire decompression state, but rather only some arrays that are too large
 * to comfortably allocate on the stack.
 */
struct libdeflate_decompressor {

	/*
	 * The arrays aren't all needed at the same time. 'precode_lens' and
	 * 'precode_decode_table' are unneeded after 'lens' has been filled.
	 * Furthermore, 'lens' need not be retained after building the litlen
	 * and offset decode tables. In fact, 'lens' can be in union with
	 * 'litlen_decode_table' provided that 'offset_decode_table' is
	 * separate and is built first.
	 */
	union {
		len_t precode_lens[DEFLATE_NUM_PRECODE_SYMS];

		struct {
			len_t lens[DEFLATE_NUM_LITLEN_SYMS +
				   DEFLATE_NUM_OFFSET_SYMS +
				   DEFLATE_MAX_LENS_OVERRUN];

			u32 precode_decode_table[PRECODE_ENOUGH];
		} l;

		u32 litlen_decode_table[LITLEN_ENOUGH];
	} u;

	u32 offset_decode_table[OFFSET_ENOUGH];

	/* used only during build_decode_table() */
	u16 sorted_syms[DEFLATE_MAX_NUM_SYMS];

	bool static_codes_loaded;
};

/*****************************************************************************
 *				Input bitstream                              *
 *****************************************************************************/

/*
 * The state of the "input bitstream" consists of the following variables:
 *
 *	- in_next: pointer to the next unread byte in the input buffer
 *
 *	- in_end: pointer just past the end of the input buffer
 *
 *	- bitbuf: a word-sized variable containing bits that have been read
 *		  from the input buffer. The buffered bits are right-aligned
 *		  (they're the low-order bits).
 *
 *	- bitsleft: number of bits in 'bitbuf' that are valid.
 *
 * To make it easier for the compiler to optimize the code by keeping
 * variables in registers, these are declared as normal variables and
 * manipulated using macros.
 */

/*
 * The type for the bitbuffer variable ('bitbuf' described above). For best
 * performance, this should have size equal to a machine word.
 *
 * 64-bit platforms have a significant advantage: they get a bigger bitbuffer
 * which they have to fill less often.
 *
 * For GDEFLATE the bitbuffer has to be at least 64 bits long.
 */
#ifdef GDEFLATE
typedef u64 bitbuf_t;
#else
typedef machine_word_t bitbuf_t;
#endif

/*
 * Number of bits the bitbuffer variable can hold.
 *
 * This is one less than the obvious value because of the optimized arithmetic
 * in FILL_BITS_WORDWISE() that leaves 'bitsleft' in the range
 * [WORDBITS - 8, WORDBITS - 1] rather than [WORDBITS - 7, WORDBITS].
 */
#define BITBUF_NBITS	(8 * sizeof(bitbuf_t) - 1)

/*
 * The maximum number of bits that can be ensured in the bitbuffer variable,
 * i.e. the maximum value of 'n' that can be passed to ENSURE_BITS(n). The
 * decoder only reads whole bytes from memory, so this is the lowest value of
 * 'bitsleft' at which another byte cannot be read without first consuming
 * some bits.
 */
#define MAX_ENSURE	(BITBUF_NBITS - 7)

/*
 * Evaluates to true if 'n' is a valid argument to ENSURE_BITS(n), or false if
 * 'n' is too large to be passed to ENSURE_BITS(n). Note: if 'n' is a compile
 * time constant, then this expression will be a compile-time constant.
 * Therefore, CAN_ENSURE() can be used to choose between alternative
 * implementations at compile time.
 */
#define CAN_ENSURE(n)	((n) <= MAX_ENSURE)

/*
 * Fill the bitbuffer variable, reading one byte at a time.
 *
 * If we would overread the input buffer, we just don't read anything, leaving
 * the bits zeroed but marking them filled. This simplifies the decompressor
 * because it removes the need to distinguish between real overreads and
 * overreads that occur only because of the decompressor's own lookahead.
 *
 * The disadvantage is that real overreads are not detected immediately.
 * However, this is safe because the decompressor is still guaranteed to make
 * forward progress when presented with never-ending 0 bits. Within an
 * existing block, output keeps getting generated, whereas a new block can
 * only be an uncompressed block (since the type code for uncompressed blocks
 * is 0), for which we check for a previous overread. But even if we didn't
 * check, uncompressed blocks would fail to validate because LEN would not
 * equal ~NLEN. So the decompressor will eventually either detect that the
 * output buffer is full, or detect invalid input, or finish the final block.
 */
#define FILL_BITS_BYTEWISE()					\
do {								\
	if (likely(in_next != in_end))				\
		bitbuf |= (bitbuf_t)*in_next++ << bitsleft;	\
	else							\
		overrun_count++;				\
	bitsleft += 8;						\
} while (bitsleft <= BITBUF_NBITS - 8)

/*
 * Fill the bitbuffer variable by reading the next word from the input buffer
 * and branchlessly updating 'in_next' and 'bitsleft' based on how many bits
 * were filled. This can be significantly faster than FILL_BITS_BYTEWISE().
 * However, for this to work correctly, the word must be interpreted in
 * little-endian format. In addition, the memory access may be unaligned.
 * Therefore, this method is most efficient on little-endian architectures
 * that support fast unaligned access, such as x86 and x86_64.
 *
 * For faster updating of 'bitsleft', we consider the bitbuffer size in bits
 * to be 1 less than the word size, so that its binary representation is all
 * 1 bits. Then the number of bits filled is the value of the 0 bits of
 * 'bitsleft' in positions >= 3 when changed to 1. E.g. if words are 64 bits
 * and bitsleft = 16 = b010000, then we refill b101000 = 40 bits = 5 bytes.
 * This uses only 4 operations to update 'in_next' and 'bitsleft': one each of
 * +, ^, >>, and |. (Not counting operations the compiler optimizes out.) In
 * contrast, the alternative of:
 *
 *	in_next += (BITBUF_NBITS - bitsleft) >> 3;
 *	bitsleft += (BITBUF_NBITS - bitsleft) & ~7;
 *
 * (where BITBUF_NBITS would be WORDBITS rather than WORDBITS - 1) would on
 * average refill an extra bit, but uses 5 operations: two +, and one each of
 * -, >>, and &. Also the - and & must be completed before 'bitsleft' can be
 * updated, while the current solution updates 'bitsleft' with no
 * dependencies.
 */
#define FILL_BITS_WORDWISE()					\
do {								\
	/* BITBUF_NBITS must be all 1's in binary, see above */\
	STATIC_ASSERT((BITBUF_NBITS & (BITBUF_NBITS + 1)) == 0);\
								\
	bitbuf |= get_unaligned_leword(in_next) << bitsleft;	\
	in_next += (bitsleft ^ BITBUF_NBITS) >> 3;		\
	bitsleft |= BITBUF_NBITS & ~7;				\
} while (0)
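
/*
 * Illustrative sketch (not part of the library): the branchless arithmetic in
 * FILL_BITS_WORDWISE() above, traced with concrete values for a 64-bit
 * bitbuffer, where BITBUF_NBITS = 63 = 0b111111.
 */
#if 0
#include <assert.h>

static void
example_wordwise_refill_math(void)
{
	unsigned bitsleft = 16;		/* 0b010000: 16 valid bits buffered */
	unsigned bytes_filled;

	bytes_filled = (bitsleft ^ 63) >> 3;	/* 0b101111 >> 3 = 5 bytes */
	bitsleft |= 63 & ~7;			/* 16 | 56 = 56 bits valid */

	assert(bytes_filled == 5);	/* 5 bytes = 40 bits were filled */
	assert(bitsleft == 56);		/* within [WORDBITS-8, WORDBITS-1] */
}
#endif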
/*
 * Does the bitbuffer variable currently contain at least 'n' bits?
 */
#define HAVE_BITS(n) (bitsleft >= (n))

/*
 * Load more bits from the input buffer until the specified number of bits is
 * present in the bitbuffer variable. 'n' cannot be too large; see MAX_ENSURE
 * and CAN_ENSURE().
 */
#define ENSURE_BITS(n)						\
if (!HAVE_BITS(n)) {						\
	if (CPU_IS_LITTLE_ENDIAN() &&				\
	    UNALIGNED_ACCESS_IS_FAST &&				\
	    likely(in_end - in_next >= sizeof(bitbuf_t)))	\
		FILL_BITS_WORDWISE();				\
	else							\
		FILL_BITS_BYTEWISE();				\
}

/*
 * Return the next 'n' bits from the bitbuffer variable without removing them.
 */
#define BITS(n) ((u32)bitbuf & (((u32)1 << (n)) - 1))

/*
 * Remove the next 'n' bits from the bitbuffer variable.
 */
#define REMOVE_BITS(n) (bitbuf >>= (n), bitsleft -= (n))

/*
 * Remove and return the next 'n' bits from the bitbuffer variable.
 */
#define POP_BITS(n) (tmp32 = BITS(n), REMOVE_BITS(n), tmp32)

/*
 * Verify that the input buffer hasn't been overread, then align the input to
 * the next byte boundary, discarding any remaining bits in the current byte.
 *
 * Note that if the bitbuffer variable currently contains more than 7 bits,
 * then we must rewind 'in_next', effectively putting those bits back. Only
 * the bits in what would be the "current" byte if we were reading one byte at
 * a time can be actually discarded.
 */
#define ALIGN_INPUT()							\
do {									\
	SAFETY_CHECK(overrun_count <= (bitsleft >> 3));			\
	in_next -= (bitsleft >> 3) - overrun_count;			\
	overrun_count = 0;						\
	bitbuf = 0;							\
	bitsleft = 0;							\
} while (0)

/*
 * Read a 16-bit value from the input. This must have been preceded by a call
 * to ALIGN_INPUT(), and the caller must have already checked for overrun.
 */
#define READ_U16() (tmp16 = get_unaligned_le16(in_next), in_next += 2, tmp16)
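
/*
 * Illustrative sketch (not part of the library): how the bitstream macros
 * above compose. The locals (in_next, in_end, bitbuf, bitsleft,
 * overrun_count, tmp32) mirror the variables the real decompression loop
 * declares; reading a DEFLATE block header is shown as an example.
 */
#if 0
static void
example_read_block_header(const u8 *in, size_t in_nbytes)
{
	const u8 *in_next = in;
	const u8 * const in_end = in + in_nbytes;
	bitbuf_t bitbuf = 0;
	unsigned bitsleft = 0;
	unsigned overrun_count = 0;
	u32 tmp32;
	unsigned is_final_block, block_type;

	ENSURE_BITS(3);			/* refill so >= 3 bits are buffered */
	is_final_block = POP_BITS(1);	/* BFINAL field: 1 bit */
	block_type = POP_BITS(2);	/* BTYPE field: 2 bits */
	(void)is_final_block;
	(void)block_type;
}
#endif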
/*****************************************************************************
 *                              Huffman decoding                             *
 *****************************************************************************/

/*
 * A decode table for order TABLEBITS consists of a main table of (1 <<
 * TABLEBITS) entries followed by a variable number of subtables.
 *
 * The decoding algorithm takes the next TABLEBITS bits of compressed data and
 * uses them as an index into the decode table. The resulting entry is either
 * a "direct entry", meaning that it contains the value desired, or a
 * "subtable pointer", meaning that the entry references a subtable that must
 * be indexed using more bits of the compressed data to decode the symbol.
 *
 * Each decode table (a main table along with its subtables, if any) is
 * associated with a Huffman code. Logically, the result of a decode table
 * lookup is a symbol from the alphabet from which the corresponding Huffman
 * code was constructed. A symbol with codeword length n <= TABLEBITS is
 * associated with 2**(TABLEBITS - n) direct entries in the table, whereas a
 * symbol with codeword length n > TABLEBITS is associated with one or more
 * subtable entries.
 *
 * On top of this basic design, we implement several optimizations:
 *
 * - We store the length of each codeword directly in each of its decode table
 *   entries. This allows the codeword length to be produced without indexing
 *   an additional table.
 *
 * - When beneficial, we don't store the Huffman symbol itself, but instead
 *   data generated from it. For example, when decoding an offset symbol in
 *   DEFLATE, it's more efficient if we can decode the offset base and number
 *   of extra offset bits directly rather than decoding the offset symbol and
 *   then looking up both of those values in an additional table or tables.
 *
 * The size of each decode table entry is 32 bits, which provides slightly
 * better performance than 16-bit entries on 32 and 64 bit processors,
 * provided that the table doesn't get so large that it takes up too much
 * memory and starts generating cache misses. The bits of each decode table
 * entry are defined as follows:
 *
 * - Bits 30 -- 31: flags (see below)
 * - Bits 8 -- 29: decode result: a Huffman symbol or related data
 * - Bits 0 -- 7: codeword length
 */

/*
 * This flag is set in all main decode table entries that represent subtable
 * pointers.
 */
#define HUFFDEC_SUBTABLE_POINTER	0x80000000

/*
 * This flag is set in all entries in the litlen decode table that represent
 * literals.
 */
#define HUFFDEC_LITERAL			0x40000000

/* Mask for extracting the codeword length from a decode table entry. */
#define HUFFDEC_LENGTH_MASK		0xFF

/* Shift to extract the decode result from a decode table entry. */
#define HUFFDEC_RESULT_SHIFT		8

/* Shift a decode result into its position in the decode table entry. */
#define HUFFDEC_RESULT_ENTRY(result)	((u32)(result) << HUFFDEC_RESULT_SHIFT)

/* The decode result for each precode symbol. There is no special optimization
 * for the precode; the decode result is simply the symbol value. */
static const u32 precode_decode_results[DEFLATE_NUM_PRECODE_SYMS] = {
#define ENTRY(presym)	HUFFDEC_RESULT_ENTRY(presym)
	ENTRY(0)  , ENTRY(1)  , ENTRY(2)  , ENTRY(3)  ,
	ENTRY(4)  , ENTRY(5)  , ENTRY(6)  , ENTRY(7)  ,
	ENTRY(8)  , ENTRY(9)  , ENTRY(10) , ENTRY(11) ,
	ENTRY(12) , ENTRY(13) , ENTRY(14) , ENTRY(15) ,
	ENTRY(16) , ENTRY(17) , ENTRY(18) ,
#undef ENTRY
};
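
/*
 * Illustrative sketch (not part of the library): how a decode table entry in
 * the format described above would be taken apart after a table lookup.
 */
#if 0
static void
example_inspect_entry(u32 entry)
{
	if (entry & HUFFDEC_SUBTABLE_POINTER) {
		/* Bits 8--29: start index of the subtable.
		 * Bits 0--7: number of bits the subtable is indexed with. */
		u32 subtable_start = (entry & ~HUFFDEC_SUBTABLE_POINTER) >>
				     HUFFDEC_RESULT_SHIFT;
		u32 subtable_bits = entry & HUFFDEC_LENGTH_MASK;

		(void)subtable_start;
		(void)subtable_bits;
	} else if (entry & HUFFDEC_LITERAL) {
		/* Bits 8--15: the literal byte itself.
		 * Bits 0--7: the codeword length to consume. */
		u8 literal = (u8)(entry >> HUFFDEC_RESULT_SHIFT);
		u32 codeword_len = entry & HUFFDEC_LENGTH_MASK;

		(void)literal;
		(void)codeword_len;
	}
}
#endif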
/* The decode result for each litlen symbol. For literals, this is the
 * literal value itself and the HUFFDEC_LITERAL flag. For lengths, this is
 * the length base and the number of extra length bits. */
static const u32 litlen_decode_results[DEFLATE_NUM_LITLEN_SYMS] = {

	/* Literals */
#define ENTRY(literal)	(HUFFDEC_LITERAL | HUFFDEC_RESULT_ENTRY(literal))
	ENTRY(0)   , ENTRY(1)   , ENTRY(2)   , ENTRY(3)   ,
	ENTRY(4)   , ENTRY(5)   , ENTRY(6)   , ENTRY(7)   ,
	ENTRY(8)   , ENTRY(9)   , ENTRY(10)  , ENTRY(11)  ,
	ENTRY(12)  , ENTRY(13)  , ENTRY(14)  , ENTRY(15)  ,
	ENTRY(16)  , ENTRY(17)  , ENTRY(18)  , ENTRY(19)  ,
	ENTRY(20)  , ENTRY(21)  , ENTRY(22)  , ENTRY(23)  ,
	ENTRY(24)  , ENTRY(25)  , ENTRY(26)  , ENTRY(27)  ,
	ENTRY(28)  , ENTRY(29)  , ENTRY(30)  , ENTRY(31)  ,
	ENTRY(32)  , ENTRY(33)  , ENTRY(34)  , ENTRY(35)  ,
	ENTRY(36)  , ENTRY(37)  , ENTRY(38)  , ENTRY(39)  ,
	ENTRY(40)  , ENTRY(41)  , ENTRY(42)  , ENTRY(43)  ,
	ENTRY(44)  , ENTRY(45)  , ENTRY(46)  , ENTRY(47)  ,
	ENTRY(48)  , ENTRY(49)  , ENTRY(50)  , ENTRY(51)  ,
	ENTRY(52)  , ENTRY(53)  , ENTRY(54)  , ENTRY(55)  ,
	ENTRY(56)  , ENTRY(57)  , ENTRY(58)  , ENTRY(59)  ,
	ENTRY(60)  , ENTRY(61)  , ENTRY(62)  , ENTRY(63)  ,
	ENTRY(64)  , ENTRY(65)  , ENTRY(66)  , ENTRY(67)  ,
	ENTRY(68)  , ENTRY(69)  , ENTRY(70)  , ENTRY(71)  ,
	ENTRY(72)  , ENTRY(73)  , ENTRY(74)  , ENTRY(75)  ,
	ENTRY(76)  , ENTRY(77)  , ENTRY(78)  , ENTRY(79)  ,
	ENTRY(80)  , ENTRY(81)  , ENTRY(82)  , ENTRY(83)  ,
	ENTRY(84)  , ENTRY(85)  , ENTRY(86)  , ENTRY(87)  ,
	ENTRY(88)  , ENTRY(89)  , ENTRY(90)  , ENTRY(91)  ,
	ENTRY(92)  , ENTRY(93)  , ENTRY(94)  , ENTRY(95)  ,
	ENTRY(96)  , ENTRY(97)  , ENTRY(98)  , ENTRY(99)  ,
	ENTRY(100) , ENTRY(101) , ENTRY(102) , ENTRY(103) ,
	ENTRY(104) , ENTRY(105) , ENTRY(106) , ENTRY(107) ,
	ENTRY(108) , ENTRY(109) , ENTRY(110) , ENTRY(111) ,
	ENTRY(112) , ENTRY(113) , ENTRY(114) , ENTRY(115) ,
	ENTRY(116) , ENTRY(117) , ENTRY(118) , ENTRY(119) ,
	ENTRY(120) , ENTRY(121) , ENTRY(122) , ENTRY(123) ,
	ENTRY(124) , ENTRY(125) , ENTRY(126) , ENTRY(127) ,
	ENTRY(128) , ENTRY(129) , ENTRY(130) , ENTRY(131) ,
	ENTRY(132) , ENTRY(133) , ENTRY(134) , ENTRY(135) ,
	ENTRY(136) , ENTRY(137) , ENTRY(138) , ENTRY(139) ,
	ENTRY(140) , ENTRY(141) , ENTRY(142) , ENTRY(143) ,
	ENTRY(144) , ENTRY(145) , ENTRY(146) , ENTRY(147) ,
	ENTRY(148) , ENTRY(149) , ENTRY(150) , ENTRY(151) ,
	ENTRY(152) , ENTRY(153) , ENTRY(154) , ENTRY(155) ,
	ENTRY(156) , ENTRY(157) , ENTRY(158) , ENTRY(159) ,
	ENTRY(160) , ENTRY(161) , ENTRY(162) , ENTRY(163) ,
	ENTRY(164) , ENTRY(165) , ENTRY(166) , ENTRY(167) ,
	ENTRY(168) , ENTRY(169) , ENTRY(170) , ENTRY(171) ,
	ENTRY(172) , ENTRY(173) , ENTRY(174) , ENTRY(175) ,
	ENTRY(176) , ENTRY(177) , ENTRY(178) , ENTRY(179) ,
	ENTRY(180) , ENTRY(181) , ENTRY(182) , ENTRY(183) ,
	ENTRY(184) , ENTRY(185) , ENTRY(186) , ENTRY(187) ,
	ENTRY(188) , ENTRY(189) , ENTRY(190) , ENTRY(191) ,
	ENTRY(192) , ENTRY(193) , ENTRY(194) , ENTRY(195) ,
	ENTRY(196) , ENTRY(197) , ENTRY(198) , ENTRY(199) ,
	ENTRY(200) , ENTRY(201) , ENTRY(202) , ENTRY(203) ,
	ENTRY(204) , ENTRY(205) , ENTRY(206) , ENTRY(207) ,
	ENTRY(208) , ENTRY(209) , ENTRY(210) , ENTRY(211) ,
	ENTRY(212) , ENTRY(213) , ENTRY(214) , ENTRY(215) ,
	ENTRY(216) , ENTRY(217) , ENTRY(218) , ENTRY(219) ,
	ENTRY(220) , ENTRY(221) , ENTRY(222) , ENTRY(223) ,
	ENTRY(224) , ENTRY(225) , ENTRY(226) , ENTRY(227) ,
	ENTRY(228) , ENTRY(229) , ENTRY(230) , ENTRY(231) ,
	ENTRY(232) , ENTRY(233) , ENTRY(234) , ENTRY(235) ,
	ENTRY(236) , ENTRY(237) , ENTRY(238) , ENTRY(239) ,
	ENTRY(240) , ENTRY(241) , ENTRY(242) , ENTRY(243) ,
	ENTRY(244) , ENTRY(245) , ENTRY(246) , ENTRY(247) ,
	ENTRY(248) , ENTRY(249) , ENTRY(250) , ENTRY(251) ,
	ENTRY(252) , ENTRY(253) , ENTRY(254) , ENTRY(255) ,
#undef ENTRY

#define HUFFDEC_EXTRA_LENGTH_BITS_MASK	0xFF
#define HUFFDEC_LENGTH_BASE_SHIFT	8

#define HUFFDEC_END_OF_BLOCK_LENGTH	0

#define ENTRY(length_base, num_extra_bits)	\
	HUFFDEC_RESULT_ENTRY(((u32)(length_base) << HUFFDEC_LENGTH_BASE_SHIFT) | (num_extra_bits))
	/* End of block */
	ENTRY(HUFFDEC_END_OF_BLOCK_LENGTH, 0),

	/* Lengths */
	ENTRY(3  , 0) , ENTRY(4  , 0) , ENTRY(5  , 0) , ENTRY(6  , 0),
	ENTRY(7  , 0) , ENTRY(8  , 0) , ENTRY(9  , 0) , ENTRY(10 , 0),
	ENTRY(11 , 1) , ENTRY(13 , 1) , ENTRY(15 , 1) , ENTRY(17 , 1),
	ENTRY(19 , 2) , ENTRY(23 , 2) , ENTRY(27 , 2) , ENTRY(31 , 2),
	ENTRY(35 , 3) , ENTRY(43 , 3) , ENTRY(51 , 3) , ENTRY(59 , 3),
	ENTRY(67 , 4) , ENTRY(83 , 4) , ENTRY(99 , 4) , ENTRY(115, 4),
	ENTRY(131, 5) , ENTRY(163, 5) , ENTRY(195, 5) , ENTRY(227, 5),
#ifndef DEFLATE64
	ENTRY(258, 0) , ENTRY(258, 0) , ENTRY(258, 0) ,
#else
	ENTRY(3, 16)  , ENTRY(3, 16)  , ENTRY(3, 16)  ,
#endif
#undef ENTRY
};

/* The decode result for each offset symbol. This is the offset base and the
 * number of extra offset bits. */
static const u32 offset_decode_results[DEFLATE_NUM_OFFSET_SYMS] = {
#define HUFFDEC_EXTRA_OFFSET_BITS_SHIFT	16
#define HUFFDEC_OFFSET_BASE_MASK	(((u32)1 << HUFFDEC_EXTRA_OFFSET_BITS_SHIFT) - 1)

#define ENTRY(offset_base, num_extra_bits)	HUFFDEC_RESULT_ENTRY(	       \
		((u32)(num_extra_bits) << HUFFDEC_EXTRA_OFFSET_BITS_SHIFT) |   \
		(offset_base))
	ENTRY(1     , 0)  , ENTRY(2     , 0)  , ENTRY(3     , 0)  , ENTRY(4     , 0)  ,
	ENTRY(5     , 1)  , ENTRY(7     , 1)  , ENTRY(9     , 2)  , ENTRY(13    , 2)  ,
	ENTRY(17    , 3)  , ENTRY(25    , 3)  , ENTRY(33    , 4)  , ENTRY(49    , 4)  ,
	ENTRY(65    , 5)  , ENTRY(97    , 5)  , ENTRY(129   , 6)  , ENTRY(193   , 6)  ,
	ENTRY(257   , 7)  , ENTRY(385   , 7)  , ENTRY(513   , 8)  , ENTRY(769   , 8)  ,
	ENTRY(1025  , 9)  , ENTRY(1537  , 9)  , ENTRY(2049  , 10) , ENTRY(3073  , 10) ,
	ENTRY(4097  , 11) , ENTRY(6145  , 11) , ENTRY(8193  , 12) , ENTRY(12289 , 12) ,
	ENTRY(16385 , 13) , ENTRY(24577 , 13) , ENTRY(32769 , 14) , ENTRY(49153 , 14) ,
#undef ENTRY
};
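
/*
 * Illustrative sketch (not part of the library): unpacking the packed decode
 * results above. Litlen symbol 266 was defined as ENTRY(13, 1) (length base
 * 13, 1 extra bit) and offset symbol 4 as ENTRY(5, 1) (offset base 5, 1 extra
 * bit); 'extra' stands for the extra bits the caller reads from the
 * bitstream.
 */
#if 0
static u32
example_match_length(u32 extra)
{
	u32 result = litlen_decode_results[266] >> HUFFDEC_RESULT_SHIFT;
	u32 length_base = result >> HUFFDEC_LENGTH_BASE_SHIFT;		/* 13 */
	u32 num_extra = result & HUFFDEC_EXTRA_LENGTH_BITS_MASK;	/* 1 */

	(void)num_extra;		/* caller reads this many extra bits */
	return length_base + extra;	/* e.g. 13 + 1 = 14 */
}

static u32
example_match_offset(u32 extra)
{
	u32 result = offset_decode_results[4] >> HUFFDEC_RESULT_SHIFT;
	u32 offset_base = result & HUFFDEC_OFFSET_BASE_MASK;		/* 5 */
	u32 num_extra = result >> HUFFDEC_EXTRA_OFFSET_BITS_SHIFT;	/* 1 */

	(void)num_extra;
	return offset_base + extra;	/* e.g. 5 + 1 = 6 */
}
#endif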
/*
 * Build a table for fast decoding of symbols from a Huffman code. As input,
 * this function takes the codeword length of each symbol which may be used in
 * the code. As output, it produces a decode table for the canonical Huffman
 * code described by the codeword lengths. The decode table is built with the
 * assumption that it will be indexed with "bit-reversed" codewords, where the
 * low-order bit is the first bit of the codeword. This format is used for
 * all Huffman codes in DEFLATE.
 *
 * @decode_table
 *	The array in which the decode table will be generated. This array
 *	must have sufficient length; see the definition of the ENOUGH numbers.
 * @lens
 *	An array which provides, for each symbol, the length of the
 *	corresponding codeword in bits, or 0 if the symbol is unused. This
 *	may alias @decode_table, since nothing is written to @decode_table
 *	until all @lens have been consumed. All codeword lengths are assumed
 *	to be <= @max_codeword_len but are otherwise considered untrusted. If
 *	they do not form a valid Huffman code, then the decode table is not
 *	built and %false is returned.
 * @num_syms
 *	The number of symbols in the code, including all unused symbols.
 * @decode_results
 *	An array which provides, for each symbol, the actual value to store
 *	into the decode table. This value will be directly produced as the
 *	result of decoding that symbol, thereby moving the indirection out of
 *	the decode loop and into the table initialization.
 * @table_bits
 *	The log base-2 of the number of main table entries to use.
 * @max_codeword_len
 *	The maximum allowed codeword length for this Huffman code.
 *	Must be <= DEFLATE_MAX_CODEWORD_LEN.
 * @sorted_syms
 *	A temporary array of length @num_syms.
 *
 * Returns %true if successful; %false if the codeword lengths do not form a
 * valid Huffman code.
 */
static bool
build_decode_table(u32 decode_table[],
		   const len_t lens[],
		   const unsigned num_syms,
		   const u32 decode_results[],
		   const unsigned table_bits,
		   const unsigned max_codeword_len,
		   u16 *sorted_syms)
{
	unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1];
	unsigned offsets[DEFLATE_MAX_CODEWORD_LEN + 1];
	unsigned sym;		/* current symbol */
	unsigned codeword;	/* current codeword, bit-reversed */
	unsigned len;		/* current codeword length in bits */
	unsigned count;		/* num codewords remaining with this length */
	u32 codespace_used;	/* codespace used out of '2^max_codeword_len' */
	unsigned cur_table_end;	  /* end index of current table */
	unsigned subtable_prefix; /* codeword prefix of current subtable */
	unsigned subtable_start;  /* start index of current subtable */
	unsigned subtable_bits;   /* log2 of current subtable length */

	/* Count how many codewords have each length, including 0. */
	for (len = 0; len <= max_codeword_len; len++)
		len_counts[len] = 0;
	for (sym = 0; sym < num_syms; sym++)
		len_counts[lens[sym]]++;

	/*
	 * Sort the symbols primarily by increasing codeword length and
	 * secondarily by increasing symbol value; or equivalently by their
	 * codewords in lexicographic order, since a canonical code is
	 * assumed.
	 *
	 * For efficiency, also compute 'codespace_used' in the same pass over
	 * 'len_counts[]' used to build 'offsets[]' for sorting.
	 */

	/* Ensure that 'codespace_used' cannot overflow. */
	STATIC_ASSERT(sizeof(codespace_used) == 4);
	STATIC_ASSERT(UINT32_MAX / (1U << (DEFLATE_MAX_CODEWORD_LEN - 1)) >=
		      DEFLATE_MAX_NUM_SYMS);

	offsets[0] = 0;
	offsets[1] = len_counts[0];
	codespace_used = 0;
	for (len = 1; len < max_codeword_len; len++) {
		offsets[len + 1] = offsets[len] + len_counts[len];
		codespace_used = (codespace_used << 1) + len_counts[len];
	}
	codespace_used = (codespace_used << 1) + len_counts[len];

	for (sym = 0; sym < num_syms; sym++)
		sorted_syms[offsets[lens[sym]]++] = sym;

	sorted_syms += offsets[0]; /* Skip unused symbols */

	/* lens[] is done being used, so we can write to decode_table[] now. */

	/*
	 * Check whether the lengths form a complete code (exactly fills the
	 * codespace), an incomplete code (doesn't fill the codespace), or an
	 * overfull code (overflows the codespace). A codeword of length 'n'
	 * uses proportion '1/(2^n)' of the codespace. An overfull code is
	 * nonsensical, so is considered invalid. An incomplete code is
	 * considered valid only in two specific cases; see below.
	 */

	/* overfull code? */
	if (unlikely(codespace_used > (1U << max_codeword_len)))
		return false;

	/* incomplete code? */
	if (unlikely(codespace_used < (1U << max_codeword_len))) {
		u32 entry;
		unsigned i;

		if (codespace_used == 0) {
			/*
			 * An empty code is allowed. This can happen for the
			 * offset code in DEFLATE, since a dynamic Huffman
			 * block need not contain any matches.
			 */

			/* sym=0, len=1 (arbitrary) */
			entry = decode_results[0] | 1;
		} else {
			/*
			 * Allow codes with a single used symbol, with
			 * codeword length 1. The DEFLATE RFC is unclear
			 * regarding this case. What zlib's decompressor does
			 * is permit this for the litlen and offset codes and
			 * assume the codeword is '0' rather than '1'. We do
			 * the same except we allow this for precodes too,
			 * since there's no convincing reason to treat the
			 * codes differently. We also assign both codewords
			 * '0' and '1' to the symbol to avoid having to handle
			 * '1' specially.
			 */
			if (codespace_used != (1U << (max_codeword_len - 1)) ||
			    len_counts[1] != 1)
				return false;
			entry = decode_results[*sorted_syms] | 1;
		}
		/*
		 * Note: the decode table still must be fully initialized, in
		 * case the stream is malformed and contains bits from the
		 * part of the codespace the incomplete code doesn't use.
		 */
		for (i = 0; i < (1U << table_bits); i++)
			decode_table[i] = entry;
		return true;
	}

	/*
	 * The lengths form a complete code. Now, enumerate the codewords in
	 * lexicographic order and fill the decode table entries for each one.
	 *
	 * First, process all codewords with len <= table_bits. Each one gets
	 * '2^(table_bits-len)' direct entries in the table.
	 *
	 * Since DEFLATE uses bit-reversed codewords, these entries aren't
	 * consecutive but rather are spaced '2^len' entries apart. This makes
	 * filling them naively somewhat awkward and inefficient, since
	 * strided stores are less cache-friendly and preclude the use of word
	 * or vector-at-a-time stores to fill multiple entries per
	 * instruction.
	 *
	 * To optimize this, we incrementally double the table size. When
	 * processing codewords with length 'len', the table is treated as
	 * having only '2^len' entries, so each codeword uses just one entry.
	 * Then, each time 'len' is incremented, the table size is doubled and
	 * the first half is copied to the second half. This significantly
	 * improves performance over naively doing strided stores.
	 *
	 * Note that some entries copied for each table doubling may not have
	 * been initialized yet, but it doesn't matter since they're
	 * guaranteed to be initialized later (because the Huffman code is
	 * complete).
	 */
	codeword = 0;
	len = 1;
	while ((count = len_counts[len]) == 0)
		len++;
	cur_table_end = 1U << len;
	while (len <= table_bits) {
		/* Process all 'count' codewords with length 'len' bits. */
		do {
			unsigned bit;

			/* Fill the first entry for the current codeword. */
			decode_table[codeword] =
				decode_results[*sorted_syms++] | len;

			if (codeword == cur_table_end - 1) {
				/* Last codeword (all 1's) */
				for (; len < table_bits; len++) {
					memcpy(&decode_table[cur_table_end],
					       decode_table,
					       cur_table_end *
						sizeof(decode_table[0]));
					cur_table_end <<= 1;
				}
				return true;
			}
			/*
			 * To advance to the lexicographically next codeword
			 * in the canonical code, the codeword must be
			 * incremented, then 0's must be appended to the
			 * codeword as needed to match the next codeword's
			 * length.
			 *
			 * Since the codeword is bit-reversed, appending 0's
			 * is a no-op. However, incrementing it is nontrivial.
			 * To do so efficiently, use the 'bsr' instruction to
			 * find the last (highest order) 0 bit in the
			 * codeword, set it, and clear any later (higher
			 * order) 1 bits. But 'bsr' actually finds the
			 * highest order 1 bit, so to use it first flip all
			 * bits in the codeword by XOR'ing it with
			 * (1U << len) - 1 == cur_table_end - 1.
			 */
			bit = 1U << bsr32(codeword ^ (cur_table_end - 1));
			codeword &= bit - 1;
			codeword |= bit;
		} while (--count);

		/* Advance to the next codeword length. */
		do {
			if (++len <= table_bits) {
				memcpy(&decode_table[cur_table_end],
				       decode_table,
				       cur_table_end *
					sizeof(decode_table[0]));
				cur_table_end <<= 1;
			}
		} while ((count = len_counts[len]) == 0);
	}
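
	/*
	 * Worked example of the codeword-increment trick used above and again
	 * below: take the bit-reversed codeword 0b110 with len = 3 (i.e. the
	 * actual codeword 011, whose first bit is stored in bit 0). Its
	 * lexicographic successor is 100, which bit-reversed is 0b001. The
	 * computation: 0b110 ^ 0b111 = 0b001, so bsr32() returns 0 and
	 * bit = 0b001; then (0b110 & 0b000) | 0b001 = 0b001, as expected.
	 */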
	/* Process codewords with len > table_bits. These require subtables. */
	cur_table_end = 1U << table_bits;
	subtable_prefix = -1;
	subtable_start = 0;
	for (;;) {
		u32 entry;
		unsigned i;
		unsigned stride;
		unsigned bit;

		/*
		 * Start a new subtable if the first 'table_bits' bits of the
		 * codeword don't match the prefix of the current subtable.
		 */
		if ((codeword & ((1U << table_bits) - 1)) != subtable_prefix) {
			subtable_prefix = (codeword & ((1U << table_bits) - 1));
			subtable_start = cur_table_end;
			/*
			 * Calculate the subtable length. If the codeword has
			 * length 'table_bits + n', then the subtable needs
			 * '2^n' entries. But it may need more; if fewer than
			 * '2^n' codewords of length 'table_bits + n' remain,
			 * then the length will need to be incremented to
			 * bring in longer codewords until the subtable can be
			 * completely filled. Note that because the Huffman
			 * code is complete, it will always be possible to
			 * fill the subtable eventually.
			 */
			subtable_bits = len - table_bits;
			codespace_used = count;
			while (codespace_used < (1U << subtable_bits)) {
				subtable_bits++;
				codespace_used = (codespace_used << 1) +
					len_counts[table_bits + subtable_bits];
			}
			cur_table_end = subtable_start + (1U << subtable_bits);

			/*
			 * Create the entry that points from the main table to
			 * the subtable. This entry contains the index of the
			 * start of the subtable and the number of bits with
			 * which the subtable is indexed (the log base 2 of
			 * the number of entries it contains).
			 */
			decode_table[subtable_prefix] =
				HUFFDEC_SUBTABLE_POINTER |
				HUFFDEC_RESULT_ENTRY(subtable_start) |
				subtable_bits;
		}

		/* Fill the subtable entries for the current codeword. */
		entry = decode_results[*sorted_syms++] | (len - table_bits);
		i = subtable_start + (codeword >> table_bits);
		stride = 1U << (len - table_bits);
		do {
			decode_table[i] = entry;
			i += stride;
		} while (i < cur_table_end);

		/* Advance to the next codeword. */
		if (codeword == (1U << len) - 1) /* last codeword (all 1's)? */
			return true;
		bit = 1U << bsr32(codeword ^ ((1U << len) - 1));
		codeword &= bit - 1;
		codeword |= bit;
		count--;
		while (count == 0)
			count = len_counts[++len];
	}
}

/* Build the decode table for the precode. */
static bool
build_precode_decode_table(struct libdeflate_decompressor *d)
{
	/* When you change TABLEBITS, you must change ENOUGH, and vice versa! */
	STATIC_ASSERT(PRECODE_TABLEBITS == 7 && PRECODE_ENOUGH == 128);

	return build_decode_table(d->u.l.precode_decode_table,
				  d->u.precode_lens,
				  DEFLATE_NUM_PRECODE_SYMS,
				  precode_decode_results,
				  PRECODE_TABLEBITS,
				  DEFLATE_MAX_PRE_CODEWORD_LEN,
				  d->sorted_syms);
}

/* Build the decode table for the literal/length code. */
static bool
build_litlen_decode_table(struct libdeflate_decompressor *d,
			  unsigned num_litlen_syms, unsigned num_offset_syms)
{
	/* When you change TABLEBITS, you must change ENOUGH, and vice versa! */
	STATIC_ASSERT(LITLEN_TABLEBITS == 10 && LITLEN_ENOUGH == 1334);

	return build_decode_table(d->u.litlen_decode_table,
				  d->u.l.lens,
				  num_litlen_syms,
				  litlen_decode_results,
				  LITLEN_TABLEBITS,
				  DEFLATE_MAX_LITLEN_CODEWORD_LEN,
				  d->sorted_syms);
}

/* Build the decode table for the offset code. */
static bool
build_offset_decode_table(struct libdeflate_decompressor *d,
			  unsigned num_litlen_syms, unsigned num_offset_syms)
{
	/* When you change TABLEBITS, you must change ENOUGH, and vice versa! */
	STATIC_ASSERT(OFFSET_TABLEBITS == 8 && OFFSET_ENOUGH == 402);

	return build_decode_table(d->offset_decode_table,
				  d->u.l.lens + num_litlen_syms,
				  num_offset_syms,
				  offset_decode_results,
				  OFFSET_TABLEBITS,
				  DEFLATE_MAX_OFFSET_CODEWORD_LEN,
				  d->sorted_syms);
}
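
/*
 * Illustrative sketch (not part of the library): a naive reference for the
 * canonical codeword assignment that build_decode_table() assumes, without
 * the bit-reversal, table-doubling, and subtable machinery. Given the
 * codeword length of each symbol, it assigns each used symbol the next
 * codeword in lexicographic order, per RFC 1951.
 */
#if 0
static void
example_canonical_codewords(const len_t lens[], unsigned num_syms,
			    unsigned max_codeword_len, u32 codewords[])
{
	unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1] = {0};
	u32 next_codeword[DEFLATE_MAX_CODEWORD_LEN + 1];
	unsigned sym, len;
	u32 code = 0;

	for (sym = 0; sym < num_syms; sym++)
		len_counts[lens[sym]]++;
	len_counts[0] = 0;
	for (len = 1; len <= max_codeword_len; len++) {
		code = (code + len_counts[len - 1]) << 1;
		next_codeword[len] = code;
	}
	/*
	 * E.g. lens = {2, 1, 3, 3} yields the codewords {10, 0, 110, 111};
	 * build_decode_table() then indexes its tables with each codeword
	 * bit-reversed.
	 */
	for (sym = 0; sym < num_syms; sym++)
		if (lens[sym] != 0)
			codewords[sym] = next_codeword[lens[sym]]++;
}
#endif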
static forceinline machine_word_t
repeat_byte(u8 b)
{
	machine_word_t v;

	STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);

	v = b;
	v |= v << 8;
	v |= v << 16;
	v |= v << ((WORDBITS == 64) ? 32 : 0);
	return v;
}

static forceinline void
copy_word_unaligned(const void *src, void *dst)
{
	store_word_unaligned(load_word_unaligned(src), dst);
}

/*****************************************************************************
 *                         Main decompression routine                        *
 *****************************************************************************/

typedef enum libdeflate_result (*decompress_func_t)
	(struct libdeflate_decompressor * restrict d,
	 const void * restrict in, size_t in_nbytes,
	 void * restrict out, size_t out_nbytes_avail,
	 size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret);

#undef DEFAULT_IMPL
#undef DISPATCH
#if defined(__i386__) || defined(__x86_64__)
#  include "x86/decompress_impl.h"
#endif

#ifndef DEFAULT_IMPL
#  define FUNCNAME	deflate_decompress_default
#  define ATTRIBUTES
#  ifdef GDEFLATE
#    include "gdeflate_decompress_template.h"
#  else
#    include "decompress_template.h"
#  endif
#  define DEFAULT_IMPL	deflate_decompress_default
#endif

#ifdef DISPATCH
static enum libdeflate_result
dispatch(struct libdeflate_decompressor * restrict d,
	 const void * restrict in, size_t in_nbytes,
	 void * restrict out, size_t out_nbytes_avail,
	 size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret);

static volatile decompress_func_t decompress_impl = dispatch;

/* Choose the fastest implementation at runtime */
static enum libdeflate_result
dispatch(struct libdeflate_decompressor * restrict d,
	 const void * restrict in, size_t in_nbytes,
	 void * restrict out, size_t out_nbytes_avail,
	 size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret)
{
	decompress_func_t f = arch_select_decompress_func();

	if (f == NULL)
		f = DEFAULT_IMPL;

	decompress_impl = f;
	return (*f)(d, in, in_nbytes, out, out_nbytes_avail,
		    actual_in_nbytes_ret, actual_out_nbytes_ret);
}
#else
/* only one implementation, use it */
#  define decompress_impl DEFAULT_IMPL
#endif

#ifndef HIDE_INTERFACE
/*
 * This is the main DEFLATE decompression routine. See libdeflate.h for the
 * documentation.
 *
 * Note that the real code is in decompress_template.h. The part here just
 * handles calling the appropriate implementation depending on the CPU
 * features at runtime.
 */
LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI
libdeflate_deflate_decompress_ex(struct libdeflate_decompressor * restrict d,
				 const void * restrict in, size_t in_nbytes,
				 void * restrict out, size_t out_nbytes_avail,
				 size_t *actual_in_nbytes_ret,
				 size_t *actual_out_nbytes_ret)
{
	return decompress_impl(d, in, in_nbytes, out, out_nbytes_avail,
			       actual_in_nbytes_ret, actual_out_nbytes_ret);
}

LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI
libdeflate_deflate_decompress(struct libdeflate_decompressor * restrict d,
			      const void * restrict in, size_t in_nbytes,
			      void * restrict out, size_t out_nbytes_avail,
			      size_t *actual_out_nbytes_ret)
{
	return libdeflate_deflate_decompress_ex(d, in, in_nbytes,
						out, out_nbytes_avail,
						NULL, actual_out_nbytes_ret);
}
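
/*
 * Illustrative sketch (not part of the library): typical use of the API
 * defined above. Full-buffer decompression requires the caller to know, or
 * at least bound, the decompressed size up front.
 */
#if 0
static int
example_usage(const void *cdata, size_t csize, void *out, size_t out_size)
{
	struct libdeflate_decompressor *d = libdeflate_alloc_decompressor();
	size_t actual_out_nbytes;
	enum libdeflate_result res;

	if (d == NULL)
		return -1;
	res = libdeflate_deflate_decompress(d, cdata, csize, out, out_size,
					    &actual_out_nbytes);
	libdeflate_free_decompressor(d);
	return (res == LIBDEFLATE_SUCCESS) ? 0 : -1;
}
#endif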
LIBDEFLATEEXPORT struct libdeflate_decompressor * LIBDEFLATEAPI
libdeflate_alloc_decompressor(void)
{
	/*
	 * Note that only certain parts of the decompressor actually must be
	 * initialized here:
	 *
	 * - 'static_codes_loaded' must be initialized to false.
	 *
	 * - The first half of the main portion of each decode table must be
	 *   initialized to any value, to avoid reading from uninitialized
	 *   memory during table expansion in build_decode_table(). (Although,
	 *   this is really just to avoid warnings with dynamic tools like
	 *   valgrind, since build_decode_table() is guaranteed to initialize
	 *   all entries eventually anyway.)
	 *
	 * But for simplicity, we currently just zero the whole decompressor.
	 */
	struct libdeflate_decompressor *d = libdeflate_malloc(sizeof(*d));

	if (d == NULL)
		return NULL;
	memset(d, 0, sizeof(*d));
	return d;
}

LIBDEFLATEEXPORT void LIBDEFLATEAPI
libdeflate_free_decompressor(struct libdeflate_decompressor *d)
{
	libdeflate_free(d);
}
#endif