/* * gdeflate_decompress_template.h * * Copyright 2016 Eric Biggers * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, * copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following * conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. * * SPDX-FileCopyrightText: Copyright (c) 2020, 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * This is the actual GDEFLATE decompression routine, lifted out of * gdeflate_decompress.c so that it can be compiled multiple times with * different target instruction sets. */ /* * Does the bitbuffer variable currently contain at least 'n' bits? */ #undef HAVE_BITS #define HAVE_BITS(n) (s->bitsleft[s->idx] >= (n)) /* * Load more bits from the input buffer until the specified number of bits is * present in the bitbuffer variable. 'n' cannot be too large; see MAX_ENSURE * and CAN_ENSURE(). */ #undef ENSURE_BITS #define ENSURE_BITS(n) \ if (!HAVE_BITS(n)) { \ s->bitbuf[s->idx] |= (bitbuf_t)(get_unaligned_le32(in_next)) << \ s->bitsleft[s->idx]; \ in_next += BITS_PER_PACKET/8; \ s->bitsleft[s->idx] += BITS_PER_PACKET; \ } /* * Return the next 'n' bits from the bitbuffer variable without removing them. */ #undef BITS #define BITS(n) ((u32)s->bitbuf[s->idx] & (((u32)1 << (n)) - 1)) /* * Remove the next 'n' bits from the bitbuffer variable. */ #undef REMOVE_BITS #define REMOVE_BITS(n) (s->bitbuf[s->idx] >>= (n), s->bitsleft[s->idx] -= (n)) /* * Setup copy advance method depending on a number of streams used. */ #if (NUM_STREAMS == 32) #define ADVANCE_COPIES() is_copy = _rotr(is_copy, 1) #else # pragma message("Invalid number of GDeflate streams used!") #endif /* * Reset GDeflate stream index. */ #define RESET() s->idx = 0 /* * Advance GDeflate stream index. Refill bits if necessary. */ #define ADVANCE() do { \ ENSURE_BITS(LOW_WATERMARK_BITS); \ s->idx = (s->idx + 1)%NUM_STREAMS; \ ADVANCE_COPIES(); \ } while(0) /* * Tells if current GDeflate stream has a deferred copy. */ #define IS_COPY() (is_copy&1) /* * Stores a deferred copy in current GDeflate stream. */ #define STORE_COPY(len, out) do { \ s->copies[s->idx].length = len; \ s->copies[s->idx].out_next = out; \ is_copy |= 1; \ } while(0) /* * Marks a copy in current current GDeflate stream as complete. */ #define COPY_COMPLETE() (is_copy &= ~1) /* * Prevent multiple type declarations. */ #ifndef GDEFLATE_TYPES_DECLARED /* * Setup is_copy type depending on a number of streams used. */ #if (NUM_STREAMS == 32) typedef u32 is_copy_t; #else # pragma message("Invalid number of GDeflate streams used!") #endif /* * GDeflate deferred copy state structure. */ struct gdeflate_deferred_copy { unsigned length; u8 * out_next; }; /* * GDeflate state structure. */ struct gdeflate_state { bitbuf_t bitbuf[NUM_STREAMS]; unsigned bitsleft[NUM_STREAMS]; struct gdeflate_deferred_copy copies[NUM_STREAMS]; unsigned idx; }; /* * Prevent multiple type declarations. */ #define GDEFLATE_TYPES_DECLARED #endif /* GDEFLATE_TYPES_DECLARED */ /* * Specialize gdeflate_do_copy function name. */ #define DO_COPY CONCAT(FUNCNAME, _gdeflate_do_copy) /* * Perform a deferred GDeflate copy. */ static forceinline enum libdeflate_result ATTRIBUTES DO_COPY(struct libdeflate_decompressor * restrict d, struct gdeflate_state * s, u8 * out, u8 * const out_end) { u32 entry; u32 offset; const u8 *src; u8 *dst; u32 tmp32; /* Pop match params. */ u32 length = s->copies[s->idx].length; u8 * out_next = s->copies[s->idx].out_next; /* Decode the match offset. */ entry = d->offset_decode_table[BITS(OFFSET_TABLEBITS)]; if (entry & HUFFDEC_SUBTABLE_POINTER) { /* Offset subtable required (uncommon case) */ REMOVE_BITS(OFFSET_TABLEBITS); entry = d->offset_decode_table[ ((entry >> HUFFDEC_RESULT_SHIFT) & 0xFFFF) + BITS(entry & HUFFDEC_LENGTH_MASK)]; } REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK); entry >>= HUFFDEC_RESULT_SHIFT; /* Pop the extra offset bits and add them to the offset base to * produce the full offset. */ offset = (entry & HUFFDEC_OFFSET_BASE_MASK) + POP_BITS(entry >> HUFFDEC_EXTRA_OFFSET_BITS_SHIFT); /* The match source must not begin before the beginning of the * output buffer. */ SAFETY_CHECK(offset <= out_next - (const u8 *)out); /* * Copy the match: 'length' bytes at 'out_next - offset' to * 'out_next', possibly overlapping. If the match doesn't end * too close to the end of the buffer and offset >= WORDBYTES || * offset == 1, take a fast path which copies a word at a time * -- potentially more than the length of the match, but that's * fine as long as we check for enough extra space. * * The remaining cases are not performance-critical so are * handled by a simple byte-by-byte copy. */ src = out_next - offset; dst = out_next; out_next += length; if (UNALIGNED_ACCESS_IS_FAST && likely(out_end - out_next >= WORDBYTES && length >= WORDBYTES)) { if (offset >= WORDBYTES) { /* words don't overlap? */ while (dst < out_next - WORDBYTES) { copy_word_unaligned(src, dst); src += WORDBYTES; dst += WORDBYTES; } /* Tail. */ while (dst < out_next) *dst++ = *src++; } else if (offset == 1) { /* RLE encoding of previous byte, common if the * data contains many repeated bytes */ machine_word_t v = repeat_byte(*src); while (dst < out_next - WORDBYTES) { store_word_unaligned(v, dst); dst += WORDBYTES; } /* Tail. */ while (dst < out_next) *dst++ = (u8)v; } else { *dst++ = *src++; *dst++ = *src++; do { *dst++ = *src++; } while (dst < out_next); } } else { STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN == 3); *dst++ = *src++; *dst++ = *src++; do { *dst++ = *src++; } while (dst < out_next); } return LIBDEFLATE_SUCCESS; } static enum libdeflate_result ATTRIBUTES FUNCNAME(struct libdeflate_decompressor * restrict d, const void * restrict in, size_t in_nbytes, void * restrict out, size_t out_nbytes_avail, size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret) { u8 *out_next = out; u8 * const out_end = out_next + out_nbytes_avail; const u8 *in_next = in; const u8 * const in_end = in_next + in_nbytes; struct gdeflate_state state; struct gdeflate_state * s = &state; unsigned i; unsigned is_final_block; unsigned block_type; u16 len; unsigned num_litlen_syms; unsigned num_offset_syms; u32 tmp32; is_copy_t is_copy = 0; /* Starting to read GDeflate stream. */ RESET(); for (unsigned n = 0; n < NUM_STREAMS; n++) { s->bitbuf[n] = 0; s->bitsleft[n] = 0; s->copies[n].length = 0; ADVANCE(); } next_block: /* Starting to read the next block. */ RESET(); /* BFINAL: 1 bit */ is_final_block = POP_BITS(1); /* BTYPE: 2 bits */ block_type = POP_BITS(2); ENSURE_BITS(LOW_WATERMARK_BITS); if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) { /* Dynamic Huffman block. */ /* The order in which precode lengths are stored. */ static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = { 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 }; unsigned num_explicit_precode_lens; /* Read the codeword length counts. */ STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == ((1 << 5) - 1) + 257); num_litlen_syms = POP_BITS(5) + 257; STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == ((1 << 5) - 1) + 1); num_offset_syms = POP_BITS(5) + 1; STATIC_ASSERT(DEFLATE_NUM_PRECODE_SYMS == ((1 << 4) - 1) + 4); num_explicit_precode_lens = POP_BITS(4) + 4; d->static_codes_loaded = false; ENSURE_BITS(LOW_WATERMARK_BITS); /* Read the precode codeword lengths. */ STATIC_ASSERT(DEFLATE_MAX_PRE_CODEWORD_LEN == (1 << 3) - 1); for (i = 0; i < num_explicit_precode_lens; i++) { d->u.precode_lens[deflate_precode_lens_permutation[i]] = POP_BITS(3); ADVANCE(); } for (; i < DEFLATE_NUM_PRECODE_SYMS; i++) d->u.precode_lens[deflate_precode_lens_permutation[i]] = 0; /* Build the decode table for the precode. */ SAFETY_CHECK(build_precode_decode_table(d)); RESET(); /* Expand the literal/length and offset codeword lengths. */ for (i = 0; i < num_litlen_syms + num_offset_syms; ) { u32 entry; unsigned presym; u8 rep_val; unsigned rep_count; /* (The code below assumes that the precode decode table * does not have any subtables.) */ STATIC_ASSERT(PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN); /* Read the next precode symbol. */ entry = d->u.l.precode_decode_table[BITS(DEFLATE_MAX_PRE_CODEWORD_LEN)]; REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK); presym = entry >> HUFFDEC_RESULT_SHIFT; if (presym < 16) { /* Explicit codeword length */ d->u.l.lens[i++] = presym; ADVANCE(); continue; } /* Run-length encoded codeword lengths */ /* Note: we don't need verify that the repeat count * doesn't overflow the number of elements, since we * have enough extra spaces to allow for the worst-case * overflow (138 zeroes when only 1 length was * remaining). * * In the case of the small repeat counts (presyms 16 * and 17), it is fastest to always write the maximum * number of entries. That gets rid of branches that * would otherwise be required. * * It is not just because of the numerical order that * our checks go in the order 'presym < 16', 'presym == * 16', and 'presym == 17'. For typical data this is * ordered from most frequent to least frequent case. */ STATIC_ASSERT(DEFLATE_MAX_LENS_OVERRUN == 138 - 1); if (presym == 16) { /* Repeat the previous length 3 - 6 times */ SAFETY_CHECK(i != 0); rep_val = d->u.l.lens[i - 1]; STATIC_ASSERT(3 + ((1 << 2) - 1) == 6); rep_count = 3 + POP_BITS(2); d->u.l.lens[i + 0] = rep_val; d->u.l.lens[i + 1] = rep_val; d->u.l.lens[i + 2] = rep_val; d->u.l.lens[i + 3] = rep_val; d->u.l.lens[i + 4] = rep_val; d->u.l.lens[i + 5] = rep_val; i += rep_count; } else if (presym == 17) { /* Repeat zero 3 - 10 times */ STATIC_ASSERT(3 + ((1 << 3) - 1) == 10); rep_count = 3 + POP_BITS(3); d->u.l.lens[i + 0] = 0; d->u.l.lens[i + 1] = 0; d->u.l.lens[i + 2] = 0; d->u.l.lens[i + 3] = 0; d->u.l.lens[i + 4] = 0; d->u.l.lens[i + 5] = 0; d->u.l.lens[i + 6] = 0; d->u.l.lens[i + 7] = 0; d->u.l.lens[i + 8] = 0; d->u.l.lens[i + 9] = 0; i += rep_count; } else { /* Repeat zero 11 - 138 times */ STATIC_ASSERT(11 + ((1 << 7) - 1) == 138); rep_count = 11 + POP_BITS(7); memset(&d->u.l.lens[i], 0, rep_count * sizeof(d->u.l.lens[i])); i += rep_count; } ADVANCE(); } } else if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) { /* Uncompressed block: copy 'len' bytes literally from the input * buffer to the output buffer. */ /* Count bits in the bit buffers. */ u32 num_buffered_bits = 0; for (u32 n = 0; n < NUM_STREAMS; n++) num_buffered_bits += s->bitsleft[n]; SAFETY_CHECK(in_end - in_next + (num_buffered_bits + 7)/8 >= 2); len = POP_BITS(16); if (unlikely(len > out_end - out_next)) return LIBDEFLATE_INSUFFICIENT_SPACE; SAFETY_CHECK(len <= in_end - in_next + (num_buffered_bits + 7)/8); while (len) { *out_next++ = POP_BITS(8); len--; ADVANCE(); } goto block_done; } else { SAFETY_CHECK(block_type == DEFLATE_BLOCKTYPE_STATIC_HUFFMAN); /* * Static Huffman block: build the decode tables for the static * codes. Skip doing so if the tables are already set up from * an earlier static block; this speeds up decompression of * degenerate input of many empty or very short static blocks. * * Afterwards, the remainder is the same as decompressing a * dynamic Huffman block. */ if (d->static_codes_loaded) goto have_decode_tables; d->static_codes_loaded = true; STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 288); STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 32); for (i = 0; i < 144; i++) d->u.l.lens[i] = 8; for (; i < 256; i++) d->u.l.lens[i] = 9; for (; i < 280; i++) d->u.l.lens[i] = 7; for (; i < 288; i++) d->u.l.lens[i] = 8; for (; i < 288 + 32; i++) d->u.l.lens[i] = 5; num_litlen_syms = 288; num_offset_syms = 32; } /* Decompressing a Huffman block (either dynamic or static) */ SAFETY_CHECK(build_offset_decode_table(d, num_litlen_syms, num_offset_syms)); SAFETY_CHECK(build_litlen_decode_table(d, num_litlen_syms, num_offset_syms)); have_decode_tables: RESET(); /* The main GDEFLATE decode loop */ for (;;) { u32 entry; u32 length; if (likely(!IS_COPY())) { /* Decode a litlen symbol. */ entry = d->u.litlen_decode_table[BITS(LITLEN_TABLEBITS)]; if (entry & HUFFDEC_SUBTABLE_POINTER) { /* Litlen subtable required (uncommon case) */ REMOVE_BITS(LITLEN_TABLEBITS); entry = d->u.litlen_decode_table[ ((entry >> HUFFDEC_RESULT_SHIFT) & 0xFFFF) + BITS(entry & HUFFDEC_LENGTH_MASK)]; } REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK); if (entry & HUFFDEC_LITERAL) { /* Literal */ if (unlikely(out_next == out_end)) return LIBDEFLATE_INSUFFICIENT_SPACE; *out_next++ = (u8)(entry >> HUFFDEC_RESULT_SHIFT); ADVANCE(); continue; } /* Match or end-of-block */ entry >>= HUFFDEC_RESULT_SHIFT; /* Pop the extra length bits and add them to the length base to * produce the full length. */ length = (entry >> HUFFDEC_LENGTH_BASE_SHIFT) + POP_BITS(entry & HUFFDEC_EXTRA_LENGTH_BITS_MASK); /* The match destination must not end after the end of the * output buffer. For efficiency, combine this check with the * end-of-block check. We're using 0 for the special * end-of-block length, so subtract 1 and it turn it into * SIZE_MAX. */ STATIC_ASSERT(HUFFDEC_END_OF_BLOCK_LENGTH == 0); if (unlikely((size_t)length - 1 >= out_end - out_next)) { if (unlikely(length != HUFFDEC_END_OF_BLOCK_LENGTH)) return LIBDEFLATE_INSUFFICIENT_SPACE; goto block_done; } /* Store copy for use later. */ STORE_COPY(length, out_next); /* Advance output stream. */ out_next += length; } else { enum libdeflate_result res = DO_COPY(d, s, out, out_end); if (unlikely(res)) return res; COPY_COMPLETE(); } ADVANCE(); } block_done: /* Run the outstanding deferred copies. */ for (unsigned n = 0; n < NUM_STREAMS; n++) { if (IS_COPY()) { enum libdeflate_result res = DO_COPY(d, s, out, out_end); if (unlikely(res)) return res; COPY_COMPLETE(); } ADVANCE(); } /* Finished decoding a block. */ if (!is_final_block) goto next_block; /* That was the last block. */ /* Optionally return the actual number of bytes read */ if (actual_in_nbytes_ret) *actual_in_nbytes_ret = in_next - (u8 *)in; /* Optionally return the actual number of bytes written */ if (actual_out_nbytes_ret) { *actual_out_nbytes_ret = out_next - (u8 *)out; } else { if (out_next != out_end) return LIBDEFLATE_SHORT_OUTPUT; } return LIBDEFLATE_SUCCESS; } #undef FUNCNAME #undef ATTRIBUTES #undef IS_COPY #undef DO_COPY #undef ADVANCE_COPIES #undef STORE_COPY #undef COPY_COMPLETE #undef RESET #undef ADVANCE