/* rans_byte.h originally from https://github.com/rygorous/ryg_rans * * This is a public-domain implementation of several rANS variants. rANS is an * entropy coder from the ANS family, as described in Jarek Duda's paper * "Asymmetric numeral systems" (http://arxiv.org/abs/1311.2540). */ /*-------------------------------------------------------------------------- */ /* rans_byte.h from https://github.com/rygorous/ryg_rans */ // Simple byte-aligned rANS encoder/decoder - public domain - Fabian 'ryg' Giesen 2014 // // Not intended to be "industrial strength"; just meant to illustrate the general // idea. #ifndef RANS_WORD_HEADER #define RANS_WORD_HEADER #include #include #include #include "htscodecs_endian.h" #ifdef assert #define RansAssert assert #else #define RansAssert(x) #endif // READ ME FIRST: // // This is designed like a typical arithmetic coder API, but there's three // twists you absolutely should be aware of before you start hacking: // // 1. You need to encode data in *reverse* - last symbol first. rANS works // like a stack: last in, first out. // 2. Likewise, the encoder outputs bytes *in reverse* - that is, you give // it a pointer to the *end* of your buffer (exclusive), and it will // slowly move towards the beginning as more bytes are emitted. // 3. Unlike basically any other entropy coder implementation you might // have used, you can interleave data from multiple independent rANS // encoders into the same bytestream without any extra signaling; // you can also just write some bytes by yourself in the middle if // you want to. This is in addition to the usual arithmetic encoder // property of being able to switch models on the fly. Writing raw // bytes can be useful when you have some data that you know is // incompressible, and is cheaper than going through the rANS encode // function. Using multiple rANS coders on the same byte stream wastes // a few bytes compared to using just one, but execution of two // independent encoders can happen in parallel on superscalar and // Out-of-Order CPUs, so this can be *much* faster in tight decoding // loops. // // This is why all the rANS functions take the write pointer as an // argument instead of just storing it in some context struct. // -------------------------------------------------------------------------- // L ('l' in the paper) is the lower bound of our normalization interval. // Between this and our byte-aligned emission, we use 31 (not 32!) bits. // This is done intentionally because exact reciprocals for 31-bit uints // fit in 32-bit uints: this permits some optimizations during encoding. #define RANS_BYTE_L (1u << 15) // lower bound of our normalization interval // State for a rANS encoder. Yep, that's all there is to it. typedef uint32_t RansState; // Initialize a rANS encoder. static inline void RansEncInit(RansState* r) { *r = RANS_BYTE_L; } // Renormalize the encoder. Internal function. static inline RansState RansEncRenorm(RansState x, uint8_t** pptr, uint32_t freq, uint32_t scale_bits) { uint32_t x_max = ((RANS_BYTE_L >> scale_bits) << 16) * freq; // this turns into a shift. if (x >= x_max) { uint16_t* ptr = (uint16_t *)*pptr; *--ptr = (uint16_t) (x & 0xffff); x >>= 16; *pptr = (uint8_t *)ptr; } return x; } // Encodes a single symbol with range start "start" and frequency "freq". // All frequencies are assumed to sum to "1 << scale_bits", and the // resulting bytes get written to ptr (which is updated). // // NOTE: With rANS, you need to encode symbols in *reverse order*, i.e. from // beginning to end! Likewise, the output bytestream is written *backwards*: // ptr starts pointing at the end of the output buffer and keeps decrementing. static inline void RansEncPut(RansState* r, uint8_t** pptr, uint32_t start, uint32_t freq, uint32_t scale_bits) { // renormalize RansState x = RansEncRenorm(*r, pptr, freq, scale_bits); // x = C(s,x) *r = ((x / freq) << scale_bits) + (x % freq) + start; } // Flushes the rANS encoder. static inline void RansEncFlush(RansState* r, uint8_t** pptr) { uint32_t x = *r; uint8_t* ptr = *pptr; ptr -= 4; ptr[0] = (uint8_t) (x >> 0); ptr[1] = (uint8_t) (x >> 8); ptr[2] = (uint8_t) (x >> 16); ptr[3] = (uint8_t) (x >> 24); *pptr = ptr; } // Initializes a rANS decoder. // Unlike the encoder, the decoder works forwards as you'd expect. static inline void RansDecInit(RansState* r, uint8_t** pptr) { uint32_t x; uint8_t* ptr = *pptr; x = ptr[0] << 0; x |= ptr[1] << 8; x |= ptr[2] << 16; x |= ((uint32_t)ptr[3]) << 24; ptr += 4; *pptr = ptr; *r = x; } // Returns the current cumulative frequency (map it to a symbol yourself!) static inline uint32_t RansDecGet(RansState* r, uint32_t scale_bits) { return *r & ((1u << scale_bits) - 1); } // Advances in the bit stream by "popping" a single symbol with range start // "start" and frequency "freq". All frequencies are assumed to sum to "1 << scale_bits", // and the resulting bytes get written to ptr (which is updated). static inline void RansDecAdvance(RansState* r, uint8_t** pptr, uint32_t start, uint32_t freq, uint32_t scale_bits) { uint32_t mask = (1u << scale_bits) - 1; // s, x = D(x) uint32_t x = *r; x = freq * (x >> scale_bits) + (x & mask) - start; // renormalize if (x < RANS_BYTE_L) { uint8_t* ptr = *pptr; do x = (x << 8) | *ptr++; while (x < RANS_BYTE_L); *pptr = ptr; } *r = x; } // -------------------------------------------------------------------------- // That's all you need for a full encoder; below here are some utility // functions with extra convenience or optimizations. // Encoder symbol description // This (admittedly odd) selection of parameters was chosen to make // RansEncPutSymbol as cheap as possible. typedef struct { uint32_t x_max; // (Exclusive) upper bound of pre-normalization interval uint32_t rcp_freq; // Fixed-point reciprocal frequency uint32_t bias; // Bias uint16_t cmpl_freq; // Complement of frequency: (1 << scale_bits) - freq uint16_t rcp_shift; // Reciprocal shift // FIXME: temporary uint16_t scale_bits; uint16_t freq; uint16_t start; } RansEncSymbol; // Decoder symbols are straightforward. typedef struct { uint16_t start; // Start of range. uint16_t freq; // Symbol frequency. } RansDecSymbol; // Initializes an encoder symbol to start "start" and frequency "freq" static inline void RansEncSymbolInit(RansEncSymbol* s, uint32_t start, uint32_t freq, uint32_t scale_bits) { RansAssert(scale_bits <= 16); RansAssert(start <= (1u << scale_bits)); RansAssert(freq <= (1u << scale_bits) - start); // Say M := 1 << scale_bits. // // The original encoder does: // x_new = (x/freq)*M + start + (x%freq) // // The fast encoder does (schematically): // q = mul_hi(x, rcp_freq) >> rcp_shift (division) // r = x - q*freq (remainder) // x_new = q*M + bias + r (new x) // plugging in r into x_new yields: // x_new = bias + x + q*(M - freq) // =: bias + x + q*cmpl_freq (*) // // and we can just precompute cmpl_freq. Now we just need to // set up our parameters such that the original encoder and // the fast encoder agree. // FIXME: temporary s->scale_bits = scale_bits; s->freq = freq; s->start = start; s->x_max = ((RANS_BYTE_L >> scale_bits) << 16) * freq; s->cmpl_freq = (uint16_t) ((1 << scale_bits) - freq); if (freq < 2) { // freq=0 symbols are never valid to encode, so it doesn't matter what // we set our values to. // // freq=1 is tricky, since the reciprocal of 1 is 1; unfortunately, // our fixed-point reciprocal approximation can only multiply by values // smaller than 1. // // So we use the "next best thing": rcp_freq=0xffffffff, rcp_shift=0. // This gives: // q = mul_hi(x, rcp_freq) >> rcp_shift // = mul_hi(x, (1<<32) - 1)) >> 0 // = floor(x - x/(2^32)) // = x - 1 if 1 <= x < 2^32 // and we know that x>0 (x=0 is never in a valid normalization interval). // // So we now need to choose the other parameters such that // x_new = x*M + start // plug it in: // x*M + start (desired result) // = bias + x + q*cmpl_freq (*) // = bias + x + (x - 1)*(M - 1) (plug in q=x-1, cmpl_freq) // = bias + 1 + (x - 1)*M // = x*M + (bias + 1 - M) // // so we have start = bias + 1 - M, or equivalently // bias = start + M - 1. s->rcp_freq = ~0u; s->rcp_shift = 0; s->bias = start + (1 << scale_bits) - 1; } else { // Alverson, "Integer Division using reciprocals" // shift=ceil(log2(freq)) uint32_t shift = 0; while (freq > (1u << shift)) shift++; s->rcp_freq = (uint32_t) (((1ull << (shift + 31)) + freq-1) / freq); s->rcp_shift = shift - 1; // With these values, 'q' is the correct quotient, so we // have bias=start. s->bias = start; } s->rcp_shift += 32; // Avoid the extra >>32 in RansEncPutSymbol } // Initialize a decoder symbol to start "start" and frequency "freq" static inline void RansDecSymbolInit(RansDecSymbol* s, uint32_t start, uint32_t freq) { RansAssert(start <= (1 << 16)); RansAssert(freq <= (1 << 16) - start); s->start = (uint16_t) start; s->freq = (uint16_t) freq; } // Encodes a given symbol. This is faster than straight RansEnc since we can do // multiplications instead of a divide. // // See RansEncSymbolInit for a description of how this works. static inline void RansEncPutSymbol(RansState* r, uint8_t** pptr, RansEncSymbol const* sym) { //RansAssert(sym->x_max != 0); // can't encode symbol with freq=0 // renormalize uint32_t x = *r; uint32_t x_max = sym->x_max; #ifdef HTSCODECS_LITTLE_ENDIAN // Branchless renorm. // // This works best on high entropy data where branch prediction // is poor. // // Note the bit-packing and RLE modes are more likely to be used on // low entropy data, making this assertion generally true. // TODO: maybe have two different variants, and a detection mechanism // based on freq table to work out which method would be most efficient? int c = x >= x_max; uint16_t* ptr = *(uint16_t **)pptr; ptr[-1] = x & 0xffff; x >>= c*16; *pptr = (uint8_t *)(ptr-c); #else if (x >= x_max) { uint8_t* ptr = *pptr; ptr -= 2; ptr[0] = x & 0xff; ptr[1] = (x >> 8) & 0xff; x >>= 16; *pptr = ptr; } #endif // x = C(s,x) // NOTE: written this way so we get a 32-bit "multiply high" when // available. If you're on a 64-bit platform with cheap multiplies // (e.g. x64), just bake the +32 into rcp_shift. //uint32_t q = (uint32_t) (((uint64_t)x * sym->rcp_freq) >> 32) >> sym->rcp_shift; // Slow method, but robust // *r = ((x / sym->freq) << sym->scale_bits) + (x % sym->freq) + sym->start; // return; // The extra >>32 has already been added to RansEncSymbolInit uint32_t q = (uint32_t) (((uint64_t)x * sym->rcp_freq) >> sym->rcp_shift); *r = x + sym->bias + q * sym->cmpl_freq; // assert(((x / sym->freq) << sym->scale_bits) + (x % sym->freq) + sym->start == *r); } static inline void RansEncPutSymbol_branched(RansState* r, uint8_t** pptr, RansEncSymbol const* sym) { //RansAssert(sym->x_max != 0); // can't encode symbol with freq=0 // renormalize uint32_t x = *r; uint32_t x_max = sym->x_max; #ifdef HTSCODECS_LITTLE_ENDIAN if (x >= x_max) { (*pptr) -= 2; **(uint16_t **)pptr = x; x >>= 16; } #else if (x >= x_max) { uint8_t* ptr = *pptr; ptr -= 2; ptr[0] = x & 0xff; ptr[1] = (x >> 8) & 0xff; x >>= 16; *pptr = ptr; } #endif // x = C(s,x) // NOTE: written this way so we get a 32-bit "multiply high" when // available. If you're on a 64-bit platform with cheap multiplies // (e.g. x64), just bake the +32 into rcp_shift. //uint32_t q = (uint32_t) (((uint64_t)x * sym->rcp_freq) >> 32) >> sym->rcp_shift; // Slow method, but robust // *r = ((x / sym->freq) << sym->scale_bits) + (x % sym->freq) + sym->start; // return; // The extra >>32 has already been added to RansEncSymbolInit uint32_t q = (uint32_t) (((uint64_t)x * sym->rcp_freq) >> sym->rcp_shift); *r = x + sym->bias + q * sym->cmpl_freq; // assert(((x / sym->freq) << sym->scale_bits) + (x % sym->freq) + sym->start == *r); } // Equivalent to RansDecAdvance that takes a symbol. static inline void RansDecAdvanceSymbol(RansState* r, uint8_t** pptr, RansDecSymbol const* sym, uint32_t scale_bits) { RansDecAdvance(r, pptr, sym->start, sym->freq, scale_bits); } // Advances in the bit stream by "popping" a single symbol with range start // "start" and frequency "freq". All frequencies are assumed to sum to "1 << scale_bits". // No renormalization or output happens. static inline void RansDecAdvanceStep(RansState* r, uint32_t start, uint32_t freq, uint32_t scale_bits) { uint32_t mask = (1u << scale_bits) - 1; // s, x = D(x) uint32_t x = *r; *r = freq * (x >> scale_bits) + (x & mask) - start; } // Equivalent to RansDecAdvanceStep that takes a symbol. static inline void RansDecAdvanceSymbolStep(RansState* r, RansDecSymbol const* sym, uint32_t scale_bits) { RansDecAdvanceStep(r, sym->start, sym->freq, scale_bits); } // Renormalize. #if defined(__x86_64) && !defined(__ILP32__) /* * Assembly variants of the RansDecRenorm code. * These are based on joint ideas from Rob Davies and from looking at * the clang assembly output. */ static inline void RansDecRenorm(RansState* r, uint8_t** pptr) { // q4 q40 // clang 730/608 717/467 // gcc8 733/588 737/458 uint32_t x = *r; uint16_t *ptr = *(uint16_t **)pptr; __asm__ ("movzwl (%0), %%eax\n\t" "mov %1, %%edx\n\t" "shl $0x10, %%edx\n\t" "or %%eax, %%edx\n\t" "xor %%eax, %%eax\n\t" "cmp $0x8000,%1\n\t" "cmovb %%edx, %1\n\t" "lea 2(%0), %%rax\n\t" "cmovb %%rax, %0\n\t" : "=r" (ptr), "=r" (x) : "0" (ptr), "1" (x) : "eax", "edx" ); *pptr = (uint8_t *)ptr; *r = x; } #else /* __x86_64 */ static inline void RansDecRenorm(RansState* r, uint8_t** pptr) { // renormalize uint32_t x = *r; // Up to 6% quicker (rans4x16pr -t) if using unaligned access, // but normally closer. uint32_t y = (*pptr)[0] | ((*pptr)[1]<<8); if (x < RANS_BYTE_L) (*pptr)+=2; if (x < RANS_BYTE_L) x = (x << 16) | y; *r = x; } #endif /* __x86_64 */ // Note the data may not be word aligned here. // This function is only used sparingly, for the last few bytes in the buffer, // so speed isn't critical. static inline void RansDecRenormSafe(RansState* r, uint8_t** pptr, uint8_t *ptr_end) { uint32_t x = *r; if (x >= RANS_BYTE_L || *pptr+1 >= ptr_end) return; uint16_t y = (*pptr)[0] + ((*pptr)[1]<<8); x = (x << 16) | y; (*pptr) += 2; *r = x; } #endif // RANS_WORD_HEADER