/* * Copyright (c) 2019-2022 Genome Research Ltd. * Author(s): James Bonfield * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials provided * with the distribution. * * 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger * Institute nor the names of its contributors may be used to endorse * or promote products derived from this software without specific * prior written permission. * * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include // pthread_once is used to allocate large local memory blocks in hist8 // and hist1_4. This avoids issues with systems having small stacks, and // removes overheads of repeated malloc/free cycles. // // If NO_THREADS is defined it reverts back to the more traditional malloc // and free instead. #ifndef NO_THREADS #include static pthread_once_t hist_once = PTHREAD_ONCE_INIT; static pthread_key_t hist_key; static void hist_tb_free(void *vp) { free(vp); } static void hist_tls_init(void) { pthread_key_create(&hist_key, hist_tb_free); } #endif /* * Data transpose by N. Common to rANS4x16 and arith_dynamic decoders. * * Tuned for specific common cases of N. */ static inline void unstripe(unsigned char *out, unsigned char *outN, unsigned int ulen, unsigned int N, unsigned int idxN[256]) { int j = 0, k; if (ulen >= N) { switch (N) { case 4: #define LLN 16 if (ulen >= 4*LLN) { while (j < ulen-4*LLN) { int l; for (l = 0; l < LLN; l++) { for (k = 0; k < 4; k++) out[j+k+l*4] = outN[idxN[k]+l]; } for (k = 0; k < 4; k++) idxN[k] += LLN; j += 4*LLN; } } while (j < ulen-4) { for (k = 0; k < 4; k++) out[j++] = outN[idxN[k]++]; } #undef LLN break; case 2: #define LLN 4 if (ulen >= 2*LLN) { while (j < ulen-2*LLN) { int l; for (l = 0; l < LLN; l++) { for (k = 0; k < 2; k++) out[j++] = outN[idxN[k]+l]; } for (k = 0; k < 2; k++) idxN[k] += l; } } while (j < ulen-2) { for (k = 0; k < 2; k++) out[j++] = outN[idxN[k]++]; } #undef LLN break; default: // General case, around 25% slower overall decode while (j < ulen-N) { for (k = 0; k < N; k++) out[j++] = outN[idxN[k]++]; } break; } } for (k = 0; j < ulen; k++) out[j++] = outN[idxN[k]++]; } #define MAGIC 8 /* * Order 0 histogram construction. 8-way unrolled to avoid cache collisions. */ static inline void hist8(unsigned char *in, unsigned int in_size, uint32_t F0[256]) { if (in_size > 500000) { // Note this is a static inline so we allocate more blocks than // we really need, so we could consider moving this initialisation // code to a utils.c instead. // However it's relatively bounded and not a leak. #ifndef NO_THREADS pthread_once(&hist_once, hist_tls_init); uint32_t *f0 = pthread_getspecific(hist_key); if (!f0) { f0 = calloc((65536+37)*3, sizeof(uint32_t)); pthread_setspecific(hist_key, f0); } else { memset(f0, 0, (65536+37)*3*sizeof(*f0)); } #else uint32_t *f0 = calloc((65536+37)*3, sizeof(*f0)); #endif uint32_t *f1 = f0 + 65536+37; uint32_t *f2 = f1 + 65536+37; uint32_t i, i8 = in_size & ~15; for (i = 0; i < i8; i+=16) { uint16_t i16a[4], i16b[4]; memcpy(i16a, in+i, 8); f0[i16a[0]]++; f1[i16a[1]]++; f2[i16a[2]]++; f0[i16a[3]]++; memcpy(i16b, in+i+8, 8); f1[i16b[0]]++; f0[i16b[1]]++; f1[i16b[2]]++; f2[i16b[3]]++; } while (i < in_size) F0[in[i++]]++; for (i = 0; i < 65536; i++) { F0[i & 0xff] += f0[i] + f1[i] + f2[i]; F0[i >> 8 ] += f0[i] + f1[i] + f2[i]; } #ifdef NO_THREADS free(f0); #endif } else { uint32_t F1[256+MAGIC] = {0}, F2[256+MAGIC] = {0}, F3[256+MAGIC] = {0}; uint32_t i, i8 = in_size & ~7; for (i = 0; i < i8; i+=8) { F0[in[i+0]]++; F1[in[i+1]]++; F2[in[i+2]]++; F3[in[i+3]]++; F0[in[i+4]]++; F1[in[i+5]]++; F2[in[i+6]]++; F3[in[i+7]]++; } while (i < in_size) F0[in[i++]]++; for (i = 0; i < 256; i++) F0[i] += F1[i] + F2[i] + F3[i]; } } /* * A variant of hist8 that simply marks the presence of a symbol rather * than its frequency. */ static inline void present8(unsigned char *in, unsigned int in_size, uint32_t F0[256]) { uint32_t F1[256+MAGIC] = {0}, F2[256+MAGIC] = {0}, F3[256+MAGIC] = {0}; uint32_t F4[256+MAGIC] = {0}, F5[256+MAGIC] = {0}, F6[256+MAGIC] = {0}; uint32_t F7[256+MAGIC] = {0}; unsigned int i, i8 = in_size & ~7; for (i = 0; i < i8; i+=8) { F0[in[i+0]]=1; F1[in[i+1]]=1; F2[in[i+2]]=1; F3[in[i+3]]=1; F4[in[i+4]]=1; F5[in[i+5]]=1; F6[in[i+6]]=1; F7[in[i+7]]=1; } while (i < in_size) F0[in[i++]]=1; for (i = 0; i < 256; i++) F0[i] += F1[i] + F2[i] + F3[i] + F4[i] + F5[i] + F6[i] + F7[i]; } /* * Order 1 histogram construction. 4-way unrolled to avoid cache collisions. */ #if 1 static inline void hist1_4(unsigned char *in, unsigned int in_size, uint32_t F0[256][256], uint32_t *T0) { unsigned char l = 0, c; unsigned char *in_end = in + in_size; unsigned char cc[5] = {0}; if (in_size > 500000) { #ifndef NO_THREADS pthread_once(&hist_once, hist_tls_init); // NB (*F1)[259] works better on old Opterons or other systems // with low N-way associative caches. However it's slightly poorer // on more modern CPUs. Mileage may vary by CPU and compiler. // // Note our pthread_once key is shared by both this and hist8, so // the buffer allocated here is reused by both functions and has to // be allocated accordingly (hence to the larger hist8 size). uint32_t (*F1)[259] = pthread_getspecific(hist_key); if (!F1) { F1 = calloc((65536+37)*3, sizeof(uint32_t)); pthread_setspecific(hist_key, F1); } else { memset(F1, 0, 256*sizeof(*F1)); } #else uint32_t (*F1)[259] = calloc(256, sizeof(*F1)); #endif while (in < in_end-8) { memcpy(cc, in, 4); in += 4; F0[cc[4]][cc[0]]++; F1[cc[0]][cc[1]]++; F0[cc[1]][cc[2]]++; F1[cc[2]][cc[3]]++; cc[4] = cc[3]; memcpy(cc, in, 4); in += 4; F0[cc[4]][cc[0]]++; F1[cc[0]][cc[1]]++; F0[cc[1]][cc[2]]++; F1[cc[2]][cc[3]]++; cc[4] = cc[3]; } l = cc[3]; while (in < in_end) { F0[l][c = *in++]++; l = c; } T0[l]++; int i, j; for (i = 0; i < 256; i++) { int tt = 0; for (j = 0; j < 256; j++) { F0[i][j] += F1[i][j]; tt += F0[i][j]; } T0[i]+=tt; } #ifdef NO_THREADS free(F1); #endif } else { while (in < in_end-8) { memcpy(cc, in, 4); in += 4; F0[cc[4]][cc[0]]++; F0[cc[0]][cc[1]]++; F0[cc[1]][cc[2]]++; F0[cc[2]][cc[3]]++; cc[4] = cc[3]; memcpy(cc, in, 4); in += 4; F0[cc[4]][cc[0]]++; F0[cc[0]][cc[1]]++; F0[cc[1]][cc[2]]++; F0[cc[2]][cc[3]]++; cc[4] = cc[3]; } l = cc[3]; while (in < in_end) { F0[l][c = *in++]++; l = c; } T0[l]++; int i, j; for (i = 0; i < 256; i++) { int tt = 0; for (j = 0; j < 256; j++) tt += F0[i][j]; T0[i]+=tt; } } } #else // 16 bit mode, similar to O0 freq. // This is better on some low entropy data, but generally we prefer to do // bit-packing and/or RLE to turn it into higher-entropy data first. // // Kept here for posterity incase we need it again, as it's quick tricky. static inline void hist1_4(unsigned char *in, unsigned int in_size, uint32_t F0[256][256], uint32_t *T0) { uint32_t f0[65536+MAGIC] = {0}; uint32_t f1[65536+MAGIC] = {0}; uint32_t i, i8 = (in_size-1) & ~15; T0[0]++; f0[in[0]<<8]++; for (i = 0; i < i8; i+=16) { uint16_t i16a[16]; memcpy(i16a, in+i, 16); // faster in 2 as gcc recognises this memcpy(i16a+8, in+i+1, 16); // faster in 2 as gcc recognises this f0[i16a[0]]++; f1[i16a[1]]++; f0[i16a[2]]++; f1[i16a[3]]++; f0[i16a[4]]++; f1[i16a[5]]++; f0[i16a[6]]++; f1[i16a[7]]++; f0[i16a[8]]++; f1[i16a[9]]++; f0[i16a[10]]++; f1[i16a[11]]++; f0[i16a[12]]++; f1[i16a[13]]++; f0[i16a[14]]++; f1[i16a[15]]++; } while (i < in_size-1) { F0[in[i]][in[i+1]]++; T0[in[i+1]]++; i++; } for (i = 0; i < 65536; i++) { F0[i&0xff][i>>8] += f0[i] + f1[i]; T0[i>>8] += f0[i] + f1[i]; } } #endif /* RANS_UTILS_H */