/* * Copyright (c) 2016-2021 Genome Research Ltd. * Author(s): James Bonfield * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials provided * with the distribution. * * 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger * Institute nor the names of its contributors may be used to endorse * or promote products derived from this software without specific * prior written permission. * * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // cc -O3 -g -DTEST_TOKENISER tokenise_name3.c arith_dynamic.c rANS_static4x16pr.c pooled_alloc.c -I.. -I. -lbz2 -pthread // Name tokeniser. // It generates a series of byte streams (per token) and compresses these // either using static rANS or dynamic arithmetic coding. Arith coding is // typically 1-5% smaller, but around 50-100% slower. We only envisage it // being used at the higher compression levels. // TODO // // - Is it better when encoding 1, 2, 3, 3, 4, 5, 5, 6, 7, 9, 9, 10 to encode // this as a mixture of MATCH and DELTA ops, or as entirely as DELTA ops // with some delta values being zero? I suspect the latter, but it is // not implemented here. See "last_token_delta" comments in code. // // - Consider variable size string implementations. // Pascal style strings (length + str), // C style strings (nul terminated), // Or split blocks: length block and string contents block. // // - Is this one token-block or many serialised token-blocks? // A) Lots of different models but feeding one bit-buffer emitted to // by the entropy encoder => one block (fqzcomp). // B) Lots of different models each feeding their own bit-buffers // => many blocks. // // - multiple integer types depending on size; 1, 2, 4 byte long. // // - Consider token choice for isalnum instead of isalpha. Sometimes better. // // - Consider token synchronisation (eg on matching chr symbols?) incase of // variable number. Eg consider foo:0999, foo:1000, foo:1001 (the leading // zero adds an extra token). // // - Optimisation of tokens. Eg: // HS25_09827:2:2102:11274:80442#49 // HS25_09827:2:2109:12941:31311#49 // // We'll have tokens for HS 25 _ 09827 : 2 : that are entirely // after the initial token. These 7 tokens could be one ALPHA instead // of 7 distinct tokens, with 1 MATCH instead of 7. This is both a speed // improvement for decoding as well as a space saving (fewer token-blocks // and associated overhead). // // - XOR. Like ALPHA, but used when previous symbol is ALPHA or XOR // and string lengths match. Useful when names are similar, eg: // the sequence in 07.names: // // @VP2-06:112:H7LNDMCVY:1:1105:26919:1172 1:N:0:ATTCAGAA+AGGAGAAG // @VP2-06:112:H7LNDMCVY:1:1105:27100:1172 1:N:0:ATTCAGAA+AGGCGAAG // @VP2-06:112:H7LNDMCVY:1:1105:27172:1172 1:N:0:ATTCAGAA+AGGCTAAG #include "config.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include //#define NO_THREADS #ifndef NO_THREADS #include #endif #include "pooled_alloc.h" #include "arith_dynamic.h" #include "rANS_static4x16.h" #include "tokenise_name3.h" #include "varint.h" // 128 is insufficient for SAM names (max 256 bytes) as // we may alternate a0a0a0a0a0 etc. However if we fail, // we just give up and switch to another codec, so this // isn't a serious limit. Maybe up to 256 to permit all // SAM names? #define MAX_TOKENS 128 #define MAX_TBLOCKS (MAX_TOKENS<<4) // Number of names per block #define MAX_NAMES 1000000 enum name_type {N_ERR = -1, N_TYPE = 0, N_ALPHA, N_CHAR, N_DIGITS0, N_DZLEN, N_DUP, N_DIFF, N_DIGITS, N_DDELTA, N_DDELTA0, N_MATCH, N_NOP, N_END, N_ALL}; char *types[]={"TYPE", "ALPHA", "CHAR", "DIG0", "DZLEN", "DUP", "DIFF", "DIGITS", "DDELTA", "DDELTA0", "MATCH", "NOP", "END"}; typedef struct trie { char c; int count; struct trie *next, *sibling; int n; // Nth line } trie_t; typedef struct { char *last_name; int last_ntok; enum name_type last_token_type[MAX_TOKENS]; int last_token_int[MAX_TOKENS]; int last_token_str[MAX_TOKENS]; //int last_token_delta[MAX_TOKENS]; } last_context; typedef struct { uint8_t *buf; size_t buf_a, buf_l; // alloc and used length. int tnum, ttype; int dup_from; } descriptor; typedef struct { last_context *lc; // For finding entire line dups int counter; // Trie used in encoder only trie_t *t_head; pool_alloc_t *pool; // token blocks descriptor desc[MAX_TBLOCKS]; // summary stats per token int token_dcount[MAX_TOKENS]; int token_icount[MAX_TOKENS]; //int token_zcount[MAX_TOKENS]; int max_tok; // tracks which desc/[id]count elements have been initialised int max_names; } name_context; #ifndef NO_THREADS /* * Thread local storage, used to avoid repeated malloc/free calls. */ pthread_once_t tok_once = PTHREAD_ONCE_INIT; pthread_key_t tok_key; static void tok_tls_init(void) { pthread_key_create(&tok_key, free); } #endif static name_context *create_context(int max_names) { if (max_names <= 0) return NULL; #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION if (max_names > 100000) return NULL; #endif // An arbitrary limit to prevent malformed data from consuming excessive // amounts of memory. Consider upping this if we have genuine use cases // for larger blocks. if (max_names > 1e7) { fprintf(stderr, "Name codec currently has a max of 10 million rec.\n"); return NULL; } #ifndef NO_THREADS pthread_once(&tok_once, tok_tls_init); name_context *ctx = pthread_getspecific(tok_key); if (!ctx) { ctx = malloc(sizeof(*ctx) + ++max_names*sizeof(*ctx->lc)); if (!ctx) return NULL; ctx->max_names = max_names; pthread_setspecific(tok_key, ctx); } else if (ctx->max_names < max_names+1) { ctx = realloc(ctx, sizeof(*ctx) + ++max_names*sizeof(*ctx->lc)); if (!ctx) return NULL; ctx->max_names = max_names; pthread_setspecific(tok_key, ctx); } #else name_context *ctx = malloc(sizeof(*ctx) + ++max_names*sizeof(*ctx->lc)); if (!ctx) return NULL; ctx->max_names = max_names; #endif ctx->counter = 0; ctx->t_head = NULL; ctx->lc = (last_context *)(((char *)ctx) + sizeof(*ctx)); ctx->pool = NULL; memset(&ctx->desc[0], 0, 2*16 * sizeof(ctx->desc[0])); memset(&ctx->token_dcount[0], 0, sizeof(int)); memset(&ctx->token_icount[0], 0, sizeof(int)); ctx->max_tok = 1; ctx->lc[0].last_ntok = 0; return ctx; } static void free_context(name_context *ctx) { if (!ctx) return; if (ctx->t_head) free(ctx->t_head); if (ctx->pool) pool_destroy(ctx->pool); int i; for (i = 0; i < ctx->max_tok*16; i++) free(ctx->desc[i].buf); #ifdef NO_THREADS free(ctx); #endif } //----------------------------------------------------------------------------- // Fast unsigned integer printing code. // Returns number of bytes written. static int append_uint32_fixed(char *cp, uint32_t i, uint8_t l) { switch (l) { case 9:*cp++ = i / 100000000 + '0', i %= 100000000; case 8:*cp++ = i / 10000000 + '0', i %= 10000000; case 7:*cp++ = i / 1000000 + '0', i %= 1000000; case 6:*cp++ = i / 100000 + '0', i %= 100000; case 5:*cp++ = i / 10000 + '0', i %= 10000; case 4:*cp++ = i / 1000 + '0', i %= 1000; case 3:*cp++ = i / 100 + '0', i %= 100; case 2:*cp++ = i / 10 + '0', i %= 10; case 1:*cp++ = i + '0'; case 0:break; } return l; } static int append_uint32_var(char *cp, uint32_t i) { char *op = cp; uint32_t j; //if (i < 10) goto b0; if (i < 100) goto b1; //if (i < 1000) goto b2; if (i < 10000) goto b3; //if (i < 100000) goto b4; if (i < 1000000) goto b5; //if (i < 10000000) goto b6; if (i < 100000000) goto b7; if ((j = i / 1000000000)) {*cp++ = j + '0'; i -= j*1000000000; goto x8;} if ((j = i / 100000000)) {*cp++ = j + '0'; i -= j*100000000; goto x7;} b7:if ((j = i / 10000000)) {*cp++ = j + '0'; i -= j*10000000; goto x6;} if ((j = i / 1000000)) {*cp++ = j + '0', i -= j*1000000; goto x5;} b5:if ((j = i / 100000)) {*cp++ = j + '0', i -= j*100000; goto x4;} if ((j = i / 10000)) {*cp++ = j + '0', i -= j*10000; goto x3;} b3:if ((j = i / 1000)) {*cp++ = j + '0', i -= j*1000; goto x2;} if ((j = i / 100)) {*cp++ = j + '0', i -= j*100; goto x1;} b1:if ((j = i / 10)) {*cp++ = j + '0', i -= j*10; goto x0;} if (i) *cp++ = i + '0'; return cp-op; x8:*cp++ = i / 100000000 + '0', i %= 100000000; x7:*cp++ = i / 10000000 + '0', i %= 10000000; x6:*cp++ = i / 1000000 + '0', i %= 1000000; x5:*cp++ = i / 100000 + '0', i %= 100000; x4:*cp++ = i / 10000 + '0', i %= 10000; x3:*cp++ = i / 1000 + '0', i %= 1000; x2:*cp++ = i / 100 + '0', i %= 100; x1:*cp++ = i / 10 + '0', i %= 10; x0:*cp++ = i + '0'; return cp-op; } //----------------------------------------------------------------------------- // Example descriptor encoding and IO. // // Here we just append to a buffer so we can dump out the results. // These could then be passed through a static entropy encoder that // encodes the entire buffer. // // Alternatively an adaptive entropy encoder could be place inline // here to encode as it goes using additional knowledge from the // supplied context. // Ensure room for sz more bytes. static int descriptor_grow(descriptor *fd, uint32_t sz) { while (fd->buf_l + sz > fd->buf_a) { size_t buf_a = fd->buf_a ? fd->buf_a*2 : 65536; unsigned char *buf = realloc(fd->buf, buf_a); if (!buf) return -1; fd->buf = buf; fd->buf_a = buf_a; } return 0; } static int encode_token_type(name_context *ctx, int ntok, enum name_type type) { int id = ntok<<4; if (descriptor_grow(&ctx->desc[id], 1) < 0) return -1; ctx->desc[id].buf[ctx->desc[id].buf_l++] = type; return 0; } static int encode_token_match(name_context *ctx, int ntok) { return encode_token_type(ctx, ntok, N_MATCH); } static int encode_token_end(name_context *ctx, int ntok) { return encode_token_type(ctx, ntok, N_END); } static enum name_type decode_token_type(name_context *ctx, int ntok) { int id = ntok<<4; if (ctx->desc[id].buf_l >= ctx->desc[id].buf_a) return -1; return ctx->desc[id].buf[ctx->desc[id].buf_l++]; } // int stored as 32-bit quantities static int encode_token_int(name_context *ctx, int ntok, enum name_type type, uint32_t val) { int id = (ntok<<4) | type; if (encode_token_type(ctx, ntok, type) < 0) return -1; if (descriptor_grow(&ctx->desc[id], 4) < 0) return -1; uint8_t *cp = &ctx->desc[id].buf[ctx->desc[id].buf_l]; cp[0] = (val >> 0) & 0xff; cp[1] = (val >> 8) & 0xff; cp[2] = (val >> 16) & 0xff; cp[3] = (val >> 24) & 0xff; ctx->desc[id].buf_l += 4; return 0; } // Return 0 on success, -1 on failure; static int decode_token_int(name_context *ctx, int ntok, enum name_type type, uint32_t *val) { int id = (ntok<<4) | type; if (ctx->desc[id].buf_l + 4 > ctx->desc[id].buf_a) return -1; uint8_t *cp = ctx->desc[id].buf + ctx->desc[id].buf_l; *val = (cp[0]) + (cp[1]<<8) + (cp[2]<<16) + ((uint32_t)cp[3]<<24); ctx->desc[id].buf_l += 4; return 0; } // 8 bit integer quantity static int encode_token_int1(name_context *ctx, int ntok, enum name_type type, uint32_t val) { int id = (ntok<<4) | type; if (encode_token_type(ctx, ntok, type) < 0) return -1; if (descriptor_grow(&ctx->desc[id], 1) < 0) return -1; ctx->desc[id].buf[ctx->desc[id].buf_l++] = val; return 0; } static int encode_token_int1_(name_context *ctx, int ntok, enum name_type type, uint32_t val) { int id = (ntok<<4) | type; if (descriptor_grow(&ctx->desc[id], 1) < 0) return -1; ctx->desc[id].buf[ctx->desc[id].buf_l++] = val; return 0; } // Return 0 on success, -1 on failure; static int decode_token_int1(name_context *ctx, int ntok, enum name_type type, uint32_t *val) { int id = (ntok<<4) | type; if (ctx->desc[id].buf_l >= ctx->desc[id].buf_a) return -1; *val = ctx->desc[id].buf[ctx->desc[id].buf_l++]; return 0; } // Basic C-string style for now. // // Maybe XOR with previous string as context? // This permits partial match to be encoded efficiently. static int encode_token_alpha(name_context *ctx, int ntok, char *str, int len) { int id = (ntok<<4) | N_ALPHA; if (encode_token_type(ctx, ntok, N_ALPHA) < 0) return -1; if (descriptor_grow(&ctx->desc[id], len+1) < 0) return -1; memcpy(&ctx->desc[id].buf[ctx->desc[id].buf_l], str, len); ctx->desc[id].buf[ctx->desc[id].buf_l+len] = 0; ctx->desc[id].buf_l += len+1; return 0; } // FIXME: need limit on string length for security. // Return length on success, -1 on failure; static int decode_token_alpha(name_context *ctx, int ntok, char *str, int max_len) { int id = (ntok<<4) | N_ALPHA; char c; int len = 0; if (ctx->desc[id].buf_l >= ctx->desc[id].buf_a) return -1; do { c = ctx->desc[id].buf[ctx->desc[id].buf_l++]; str[len++] = c; } while(c && len < max_len && ctx->desc[id].buf_l < ctx->desc[id].buf_a); return len-1; } static int encode_token_char(name_context *ctx, int ntok, char c) { int id = (ntok<<4) | N_CHAR; if (encode_token_type(ctx, ntok, N_CHAR) < 0) return -1; if (descriptor_grow(&ctx->desc[id], 1) < 0) return -1; ctx->desc[id].buf[ctx->desc[id].buf_l++] = c; return 0; } // FIXME: need limit on string length for security // Return length on success, -1 on failure; static int decode_token_char(name_context *ctx, int ntok, char *str) { int id = (ntok<<4) | N_CHAR; if (ctx->desc[id].buf_l >= ctx->desc[id].buf_a) return -1; *str = ctx->desc[id].buf[ctx->desc[id].buf_l++]; return 1; } // A duplicated name static int encode_token_dup(name_context *ctx, uint32_t val) { return encode_token_int(ctx, 0, N_DUP, val); } // Which read to delta against static int encode_token_diff(name_context *ctx, uint32_t val) { return encode_token_int(ctx, 0, N_DIFF, val); } //----------------------------------------------------------------------------- // Trie implementation for tracking common name prefixes. static int build_trie(name_context *ctx, char *data, size_t len, int n) { int nlines = 0; size_t i; trie_t *t; if (!ctx->t_head) { ctx->t_head = calloc(1, sizeof(*ctx->t_head)); if (!ctx->t_head) return -1; } // Build our trie, also counting input lines for (nlines = i = 0; i < len; i++, nlines++) { t = ctx->t_head; t->count++; while (i < len && data[i] > '\n') { unsigned char c = data[i++]; if (c & 0x80) //fprintf(stderr, "8-bit ASCII is unsupported\n"); abort(); c &= 127; trie_t *x = t->next, *l = NULL; while (x && x->c != c) { l = x; x = x->sibling; } if (!x) { if (!ctx->pool) ctx->pool = pool_create(sizeof(trie_t)); if (!(x = (trie_t *)pool_alloc(ctx->pool))) return -1; memset(x, 0, sizeof(*x)); if (!l) x = t->next = x; else x = l->sibling = x; x->n = n; x->c = c; } t = x; t->c = c; t->count++; } } return 0; } #if 0 void dump_trie(trie_t *t, int depth) { if (depth == 0) { printf("graph x_%p {\n splines = ortho\n ranksep=2\n", t); printf(" p_%p [label=\"\"];\n", t); dump_trie(t, 1); printf("}\n"); } else { int j, k, count;//, cj; char label[100], *cp; trie_t *tp = t; // patricia: // for (count = j = 0; j < 128; j++) // if (t->next[j]) // count++, cj=j; // // if (count == 1) { // t = t->next[cj]; // *cp++ = cj; // goto patricia; // } trie_t *x; for (x = t->next; x; x = x->sibling) { printf(" p_%p [label=\"%c\"];\n", x, x->c); printf(" p_%p -- p_%p [label=\"%d\", penwidth=\"%f\"];\n", tp, x, x->count, MAX((log(x->count)-3)*2,1)); //if (depth <= 11) dump_trie(x, depth+1); } #if 0 for (j = 0; j < 128; j++) { trie_t *tn; if (!t->next[j]) continue; cp = label; tn = t->next[j]; *cp++ = j; // patricia: for (count = k = 0; k < 128; k++) if (tn->next[k]) count++;//, cj=k; // if (count == 1) { // tn = tn->next[cj]; // *cp++ = cj; // goto patricia; // } *cp++ = 0; printf(" p_%p [label=\"%s\"];\n", tn, label); printf(" p_%p -- p_%p [label=\"%d\", penwidth=\"%f\"];\n", tp, tn, tn->count, MAX((log(tn->count)-3)*2,1)); if (depth <= 11) dump_trie(tn, depth+1); } #endif } } #endif static int search_trie(name_context *ctx, char *data, size_t len, int n, int *exact, int *is_fixed, int *fixed_len) { int nlines = 0; size_t i; trie_t *t; int from = -1, p3 = -1; *exact = 0; *fixed_len = 0; *is_fixed = 0; // Horrid hack for the encoder only. // We optimise per known name format here. int prefix_len; char *d = *data == '@' ? data+1 : data; int l = *data == '@' ? len-1 : len; int f = (*data == '>') ? 1 : 0; if (l > 70 && d[f+0] == 'm' && d[7] == '_' && d[f+14] == '_' && d[f+61] == '/') { prefix_len = 60; // PacBio *is_fixed = 0; } else if (l == 17 && d[f+5] == ':' && d[f+11] == ':') { prefix_len = 6; // IonTorrent *fixed_len = 6; *is_fixed = 1; } else if (l > 37 && d[f+8] == '-' && d[f+13] == '-' && d[f+18] == '-' && d[f+23] == '-' && ((d[f+0] >= '0' && d[f+0] <='9') || (d[f+0] >= 'a' && d[f+0] <= 'f')) && ((d[f+35] >= '0' && d[f+35] <='9') || (d[f+35] >= 'a' && d[f+35] <= 'f'))) { // ONT: f33d30d5-6eb8-4115-8f46-154c2620a5da_Basecall_1D_template... prefix_len = 37; *fixed_len = 37; *is_fixed = 1; } else { // Check Illumina and trim back to lane:tile:x:y. int colons = 0; for (i = 0; i < len && data[i] > ' '; i++) ; while (i > 0 && colons < 4) if (data[--i] == ':') colons++; if (colons == 4) { // Constant illumina prefix *fixed_len = i+1; prefix_len = i+1; *is_fixed = 1; } else { // Unknown, don't use a fixed len, but still search // for any exact matches. prefix_len = INT_MAX; *is_fixed = 0; } } //prefix_len = INT_MAX; if (!ctx->t_head) { ctx->t_head = calloc(1, sizeof(*ctx->t_head)); if (!ctx->t_head) return -1; } // Find an item in the trie for (nlines = i = 0; i < len; i++, nlines++) { t = ctx->t_head; while (i < len && data[i] > '\n') { unsigned char c = data[i++]; if (c & 0x80) //fprintf(stderr, "8-bit ASCII is unsupported\n"); abort(); c &= 127; trie_t *x = t->next; while (x && x->c != c) x = x->sibling; t = x; // t = t->next[c]; // if (!t) // return -1; from = t->n; if (i == prefix_len) p3 = t->n; //if (t->count >= .0035*ctx->t_head->count && t->n != n) p3 = t->n; // pacbio //if (i == 60) p3 = t->n; // pacbio //if (i == 7) p3 = t->n; // iontorrent t->n = n; } } //printf("Looked for %d, found %d, prefix %d\n", n, from, p3); *exact = (n != from) && len; return *exact ? from : p3; } //----------------------------------------------------------------------------- // Name encoder /* * Tokenises a read name using ctx as context as the previous * tokenisation. * * Parsed elements are then emitted for encoding by calling the * encode_token() function with the context, token number (Nth token * in line), token type and token value. * * Returns 0 on success; * -1 on failure. */ static int encode_name(name_context *ctx, char *name, int len, int mode) { int i, is_fixed, fixed_len; int exact; int cnum = ctx->counter++; int pnum = search_trie(ctx, name, len, cnum, &exact, &is_fixed, &fixed_len); if (pnum < 0) pnum = cnum ? cnum-1 : 0; //pnum = pnum & (MAX_NAMES-1); //cnum = cnum & (MAX_NAMES-1); //if (pnum == cnum) {pnum = cnum ? cnum-1 : 0;} #ifdef ENC_DEBUG fprintf(stderr, "%d: pnum=%d (%d), exact=%d\n%s\n%s\n", ctx->counter, pnum, cnum-pnum, exact, ctx->lc[pnum].last_name, name); #endif // Return DUP or DIFF switch, plus the distance. if (exact && len == strlen(ctx->lc[pnum].last_name)) { encode_token_dup(ctx, cnum-pnum); ctx->lc[cnum].last_name = name; ctx->lc[cnum].last_ntok = ctx->lc[pnum].last_ntok; // FIXME: optimise this int nc = ctx->lc[cnum].last_ntok ? ctx->lc[cnum].last_ntok : MAX_TOKENS; memcpy(ctx->lc[cnum].last_token_type, ctx->lc[pnum].last_token_type, nc * sizeof(int)); memcpy(ctx->lc[cnum].last_token_int , ctx->lc[pnum].last_token_int , nc * sizeof(int)); memcpy(ctx->lc[cnum].last_token_str , ctx->lc[pnum].last_token_str , nc * sizeof(int)); return 0; } encode_token_diff(ctx, cnum-pnum); int ntok = 1; i = 0; if (is_fixed) { if (ntok >= ctx->max_tok) { memset(&ctx->desc[ctx->max_tok << 4], 0, 16*sizeof(ctx->desc[0])); memset(&ctx->token_dcount[ctx->max_tok], 0, sizeof(int)); memset(&ctx->token_icount[ctx->max_tok], 0, sizeof(int)); ctx->max_tok = ntok+1; } if (pnum < cnum && ntok < ctx->lc[pnum].last_ntok && ctx->lc[pnum].last_token_type[ntok] == N_ALPHA) { if (ctx->lc[pnum].last_token_int[ntok] == fixed_len && memcmp(name, ctx->lc[pnum].last_name, fixed_len) == 0) { encode_token_match(ctx, ntok); } else { encode_token_alpha(ctx, ntok, name, fixed_len); } } else { encode_token_alpha(ctx, ntok, name, fixed_len); } ctx->lc[cnum].last_token_int[ntok] = fixed_len; ctx->lc[cnum].last_token_str[ntok] = 0; ctx->lc[cnum].last_token_type[ntok++] = N_ALPHA; i = fixed_len; } for (; i < len; i++) { if (ntok >= ctx->max_tok) { memset(&ctx->desc[ctx->max_tok << 4], 0, 16*sizeof(ctx->desc[0])); memset(&ctx->token_dcount[ctx->max_tok], 0, sizeof(int)); memset(&ctx->token_icount[ctx->max_tok], 0, sizeof(int)); ctx->max_tok = ntok+1; } /* Determine data type of this segment */ if (isalpha(name[i])) { int s = i+1; // int S = i+1; // // FIXME: try which of these is best. alnum is good sometimes. // while (s < len && isalpha(name[s])) while (s < len && (isalpha(name[s]) || ispunct(name[s]))) // while (s < len && name[s] != ':') // while (s < len && !isdigit(name[s]) && name[s] != ':') s++; // if (!is_fixed) { // while (S < len && isalnum(name[S])) // S++; // if (s < S) // s = S; // } // Single byte strings are better encoded as chars. if (s-i == 1) goto n_char; if (pnum < cnum && ntok < ctx->lc[pnum].last_ntok && ctx->lc[pnum].last_token_type[ntok] == N_ALPHA) { if (s-i == ctx->lc[pnum].last_token_int[ntok] && memcmp(&name[i], &ctx->lc[pnum].last_name[ctx->lc[pnum].last_token_str[ntok]], s-i) == 0) { #ifdef ENC_DEBUG fprintf(stderr, "Tok %d (alpha-mat, %.*s)\n", N_MATCH, s-i, &name[i]); #endif if (encode_token_match(ctx, ntok) < 0) return -1; } else { #ifdef ENC_DEBUG fprintf(stderr, "Tok %d (alpha, %.*s / %.*s)\n", N_ALPHA, s-i, &ctx->lc[pnum].last_name[ctx->lc[pnum].last_token_str[ntok]], s-i, &name[i]); #endif // same token/length, but mismatches if (encode_token_alpha(ctx, ntok, &name[i], s-i) < 0) return -1; } } else { #ifdef ENC_DEBUG fprintf(stderr, "Tok %d (new alpha, %.*s)\n", N_ALPHA, s-i, &name[i]); #endif if (encode_token_alpha(ctx, ntok, &name[i], s-i) < 0) return -1; } ctx->lc[cnum].last_token_int[ntok] = s-i; ctx->lc[cnum].last_token_str[ntok] = i; ctx->lc[cnum].last_token_type[ntok] = N_ALPHA; i = s-1; } else if (name[i] == '0') digits0: { // Digits starting with zero; encode length + value uint32_t s = i; uint32_t v = 0; int d = 0; while (s < len && isdigit(name[s]) && s-i < 9) { v = v*10 + name[s] - '0'; //putchar(name[s]); s++; } // TODO: optimise choice over whether to switch from DIGITS to DELTA // regularly vs all DIGITS, also MATCH vs DELTA 0. if (pnum < cnum && ntok < ctx->lc[pnum].last_ntok && ctx->lc[pnum].last_token_type[ntok] == N_DIGITS0) { d = v - ctx->lc[pnum].last_token_int[ntok]; if (d == 0 && ctx->lc[pnum].last_token_str[ntok] == s-i) { #ifdef ENC_DEBUG fprintf(stderr, "Tok %d (dig-mat, %d)\n", N_MATCH, v); #endif if (encode_token_match(ctx, ntok) < 0) return -1; //ctx->lc[pnum].last_token_delta[ntok]=0; } else if (mode == 1 && d < 256 && d >= 0 && ctx->lc[pnum].last_token_str[ntok] == s-i) { #ifdef ENC_DEBUG fprintf(stderr, "Tok %d (dig-delta, %d / %d)\n", N_DDELTA, ctx->lc[pnum].last_token_int[ntok], v); #endif //if (encode_token_int1_(ctx, ntok, N_DZLEN, s-i) < 0) return -1; if (encode_token_int1(ctx, ntok, N_DDELTA0, d) < 0) return -1; //ctx->lc[pnum].last_token_delta[ntok]=1; } else { #ifdef ENC_DEBUG fprintf(stderr, "Tok %d (dig, %d / %d)\n", N_DIGITS, ctx->lc[pnum].last_token_int[ntok], v); #endif if (encode_token_int1_(ctx, ntok, N_DZLEN, s-i) < 0) return -1; if (encode_token_int(ctx, ntok, N_DIGITS0, v) < 0) return -1; //ctx->lc[pnum].last_token_delta[ntok]=0; } } else { #ifdef ENC_DEBUG fprintf(stderr, "Tok %d (new dig, %d)\n", N_DIGITS, v); #endif if (encode_token_int1_(ctx, ntok, N_DZLEN, s-i) < 0) return -1; if (encode_token_int(ctx, ntok, N_DIGITS0, v) < 0) return -1; //ctx->lc[pnum].last_token_delta[ntok]=0; } ctx->lc[cnum].last_token_str[ntok] = s-i; // length ctx->lc[cnum].last_token_int[ntok] = v; ctx->lc[cnum].last_token_type[ntok] = N_DIGITS0; i = s-1; } else if (isdigit(name[i])) { // digits starting 1-9; encode value uint32_t s = i; uint32_t v = 0; int d = 0; while (s < len && isdigit(name[s]) && s-i < 9) { v = v*10 + name[s] - '0'; //putchar(name[s]); s++; } // dataset/10/K562_cytosol_LID8465_TopHat_v2.names // col 4 is Illumina lane - we don't want match & delta in there // as it has multiple lanes (so not ALL match) and delta is just // random chance, increasing entropy instead. // if (ntok == 4 || ntok == 8 || ntok == 10) { // encode_token_int(ctx, ntok, N_DIGITS, v); // } else { // If the last token was DIGITS0 and we are the same length, then encode // using that method instead as it seems likely the entire column is fixed // width, sometimes with leading zeros. if (pnum < cnum && ntok < ctx->lc[pnum].last_ntok && ctx->lc[pnum].last_token_type[ntok] == N_DIGITS0 && ctx->lc[pnum].last_token_str[ntok] == s-i) goto digits0; // TODO: optimise choice over whether to switch from DIGITS to DELTA // regularly vs all DIGITS, also MATCH vs DELTA 0. if (pnum < cnum && ntok < ctx->lc[pnum].last_ntok && ctx->lc[pnum].last_token_type[ntok] == N_DIGITS) { d = v - ctx->lc[pnum].last_token_int[ntok]; if (d == 0) { #ifdef ENC_DEBUG fprintf(stderr, "Tok %d (dig-mat, %d)\n", N_MATCH, v); #endif if (encode_token_match(ctx, ntok) < 0) return -1; //ctx->lc[pnum].last_token_delta[ntok]=0; //ctx->token_zcount[ntok]++; } else if (mode == 1 && d < 256 && d >= 0 //&& (10+ctx->token_dcount[ntok]) > (ctx->token_icount[ntok]+ctx->token_zcount[ntok]) && (5+ctx->token_dcount[ntok]) > ctx->token_icount[ntok] ) { #ifdef ENC_DEBUG fprintf(stderr, "Tok %d (dig-delta, %d / %d)\n", N_DDELTA, ctx->lc[pnum].last_token_int[ntok], v); #endif if (encode_token_int1(ctx, ntok, N_DDELTA, d) < 0) return -1; //ctx->lc[pnum].last_token_delta[ntok]=1; ctx->token_dcount[ntok]++; } else { #ifdef ENC_DEBUG fprintf(stderr, "Tok %d (dig, %d / %d)\n", N_DIGITS, ctx->lc[pnum].last_token_int[ntok], v); #endif if (encode_token_int(ctx, ntok, N_DIGITS, v) < 0) return -1; //ctx->lc[pnum].last_token_delta[ntok]=0; ctx->token_icount[ntok]++; } } else { #ifdef ENC_DEBUG fprintf(stderr, "Tok %d (new dig, %d)\n", N_DIGITS, v); #endif if (encode_token_int(ctx, ntok, N_DIGITS, v) < 0) return -1; //ctx->lc[pnum].last_token_delta[ntok]=0; } // } ctx->lc[cnum].last_token_int[ntok] = v; ctx->lc[cnum].last_token_type[ntok] = N_DIGITS; i = s-1; } else { n_char: //if (!isalpha(name[i])) putchar(name[i]); if (pnum < cnum && ntok < ctx->lc[pnum].last_ntok && ctx->lc[pnum].last_token_type[ntok] == N_CHAR) { if (name[i] == ctx->lc[pnum].last_token_int[ntok]) { #ifdef ENC_DEBUG fprintf(stderr, "Tok %d (chr-mat, %c)\n", N_MATCH, name[i]); #endif if (encode_token_match(ctx, ntok) < 0) return -1; } else { #ifdef ENC_DEBUG fprintf(stderr, "Tok %d (chr, %c / %c)\n", N_CHAR, ctx->lc[pnum].last_token_int[ntok], name[i]); #endif if (encode_token_char(ctx, ntok, name[i]) < 0) return -1; } } else { #ifdef ENC_DEBUG fprintf(stderr, "Tok %d (new chr, %c)\n", N_CHAR, name[i]); #endif if (encode_token_char(ctx, ntok, name[i]) < 0) return -1; } ctx->lc[cnum].last_token_int[ntok] = name[i]; ctx->lc[cnum].last_token_type[ntok] = N_CHAR; } ntok++; //putchar(' '); } #ifdef ENC_DEBUG fprintf(stderr, "Tok %d (end)\n", N_END); #endif if (ntok >= ctx->max_tok) { memset(&ctx->desc[ctx->max_tok << 4], 0, 16*sizeof(ctx->desc[0])); memset(&ctx->token_dcount[ctx->max_tok], 0, sizeof(int)); memset(&ctx->token_icount[ctx->max_tok], 0, sizeof(int)); ctx->max_tok = ntok+1; } if (encode_token_end(ctx, ntok) < 0) return -1; #ifdef ENC_DEBUG fprintf(stderr, "ntok=%d max_tok=%d\n", ntok, ctx->max_tok); #endif //printf("Encoded %.*s with %d tokens\n", len, name, ntok); ctx->lc[cnum].last_name = name; ctx->lc[cnum].last_ntok = ntok; return 0; } //----------------------------------------------------------------------------- // Name decoder static int decode_name(name_context *ctx, char *name, int name_len) { int t0 = decode_token_type(ctx, 0); uint32_t dist; int pnum, cnum = ctx->counter++; if (cnum >= ctx->max_names) return -1; if (t0 < 0 || t0 >= ctx->max_tok*16) return 0; if (decode_token_int(ctx, 0, t0, &dist) < 0 || dist > cnum) return -1; if ((pnum = cnum - dist) < 0) pnum = 0; //fprintf(stderr, "t0=%d, dist=%d, pnum=%d, cnum=%d\n", t0, dist, pnum, cnum); if (t0 == N_DUP) { if (pnum == cnum) return -1; if (strlen(ctx->lc[pnum].last_name) +1 >= name_len) return -1; strcpy(name, ctx->lc[pnum].last_name); // FIXME: optimise this ctx->lc[cnum].last_name = name; ctx->lc[cnum].last_ntok = ctx->lc[pnum].last_ntok; int nc = ctx->lc[cnum].last_ntok ? ctx->lc[cnum].last_ntok : MAX_TOKENS; memcpy(ctx->lc[cnum].last_token_type, ctx->lc[pnum].last_token_type, nc * sizeof(int)); memcpy(ctx->lc[cnum].last_token_int , ctx->lc[pnum].last_token_int , nc * sizeof(int)); memcpy(ctx->lc[cnum].last_token_str , ctx->lc[pnum].last_token_str , nc * sizeof(int)); return strlen(name)+1; } *name = 0; int ntok, len = 0, len2; for (ntok = 1; ntok < MAX_TOKENS && ntok < ctx->max_tok; ntok++) { uint32_t v, vl; enum name_type tok; tok = decode_token_type(ctx, ntok); //fprintf(stderr, "Tok %d = %d\n", ntok, tok); ctx->lc[cnum].last_ntok = 0; switch (tok) { case N_CHAR: if (len+1 >= name_len) return -1; if (decode_token_char(ctx, ntok, &name[len]) < 0) return -1; //fprintf(stderr, "Tok %d CHAR %c\n", ntok, name[len]); ctx->lc[cnum].last_token_type[ntok] = N_CHAR; ctx->lc[cnum].last_token_int [ntok] = name[len++]; break; case N_ALPHA: if ((len2 = decode_token_alpha(ctx, ntok, &name[len], name_len - len)) < 0) return -1; //fprintf(stderr, "Tok %d ALPHA %.*s\n", ntok, len2, &name[len]); ctx->lc[cnum].last_token_type[ntok] = N_ALPHA; ctx->lc[cnum].last_token_str [ntok] = len; ctx->lc[cnum].last_token_int [ntok] = len2; len += len2; break; case N_DIGITS0: // [0-9]* if (decode_token_int1(ctx, ntok, N_DZLEN, &vl) < 0) return -1; if (decode_token_int(ctx, ntok, N_DIGITS0, &v) < 0) return -1; if (len+20+vl >= name_len) return -1; len += append_uint32_fixed(&name[len], v, vl); //fprintf(stderr, "Tok %d DIGITS0 %0*d\n", ntok, vl, v); ctx->lc[cnum].last_token_type[ntok] = N_DIGITS0; ctx->lc[cnum].last_token_int [ntok] = v; ctx->lc[cnum].last_token_str [ntok] = vl; break; case N_DDELTA0: if (ntok >= ctx->lc[pnum].last_ntok) return -1; if (decode_token_int1(ctx, ntok, N_DDELTA0, &v) < 0) return -1; v += ctx->lc[pnum].last_token_int[ntok]; if (len+ctx->lc[pnum].last_token_str[ntok]+1 >= name_len) return -1; len += append_uint32_fixed(&name[len], v, ctx->lc[pnum].last_token_str[ntok]); //fprintf(stderr, "Tok %d DELTA0 %0*d\n", ntok, ctx->lc[pnum].last_token_str[ntok], v); ctx->lc[cnum].last_token_type[ntok] = N_DIGITS0; ctx->lc[cnum].last_token_int [ntok] = v; ctx->lc[cnum].last_token_str [ntok] = ctx->lc[pnum].last_token_str[ntok]; break; case N_DIGITS: // [1-9][0-9]* if (decode_token_int(ctx, ntok, N_DIGITS, &v) < 0) return -1; if (len+20 >= name_len) return -1; len += append_uint32_var(&name[len], v); //fprintf(stderr, "Tok %d DIGITS %d\n", ntok, v); ctx->lc[cnum].last_token_type[ntok] = N_DIGITS; ctx->lc[cnum].last_token_int [ntok] = v; break; case N_DDELTA: if (ntok >= ctx->lc[pnum].last_ntok) return -1; if (decode_token_int1(ctx, ntok, N_DDELTA, &v) < 0) return -1; v += ctx->lc[pnum].last_token_int[ntok]; if (len+20 >= name_len) return -1; len += append_uint32_var(&name[len], v); //fprintf(stderr, "Tok %d DELTA %d\n", ntok, v); ctx->lc[cnum].last_token_type[ntok] = N_DIGITS; ctx->lc[cnum].last_token_int [ntok] = v; break; case N_NOP: ctx->lc[cnum].last_token_type[ntok] = N_NOP; break; case N_MATCH: if (ntok >= ctx->lc[pnum].last_ntok) return -1; switch (ctx->lc[pnum].last_token_type[ntok]) { case N_CHAR: if (len+1 >= name_len) return -1; name[len++] = ctx->lc[pnum].last_token_int[ntok]; //fprintf(stderr, "Tok %d MATCH CHAR %c\n", ntok, ctx->lc[pnum].last_token_int[ntok]); ctx->lc[cnum].last_token_type[ntok] = N_CHAR; ctx->lc[cnum].last_token_int [ntok] = ctx->lc[pnum].last_token_int[ntok]; break; case N_ALPHA: if (ctx->lc[pnum].last_token_int[ntok] < 0 || len+ctx->lc[pnum].last_token_int[ntok] >= name_len) return -1; memcpy(&name[len], &ctx->lc[pnum].last_name[ctx->lc[pnum].last_token_str[ntok]], ctx->lc[pnum].last_token_int[ntok]); //fprintf(stderr, "Tok %d MATCH ALPHA %.*s\n", ntok, ctx->lc[pnum].last_token_int[ntok], &name[len]); ctx->lc[cnum].last_token_type[ntok] = N_ALPHA; ctx->lc[cnum].last_token_str [ntok] = len; ctx->lc[cnum].last_token_int [ntok] = ctx->lc[pnum].last_token_int[ntok]; len += ctx->lc[pnum].last_token_int[ntok]; break; case N_DIGITS: if (len+20 >= name_len) return -1; len += append_uint32_var(&name[len], ctx->lc[pnum].last_token_int[ntok]); //fprintf(stderr, "Tok %d MATCH DIGITS %d\n", ntok, ctx->lc[pnum].last_token_int[ntok]); ctx->lc[cnum].last_token_type[ntok] = N_DIGITS; ctx->lc[cnum].last_token_int [ntok] = ctx->lc[pnum].last_token_int[ntok]; break; case N_DIGITS0: if (len+ctx->lc[pnum].last_token_str[ntok] >= name_len) return -1; len += append_uint32_fixed(&name[len], ctx->lc[pnum].last_token_int[ntok], ctx->lc[pnum].last_token_str[ntok]); //fprintf(stderr, "Tok %d MATCH DIGITS %0*d\n", ntok, ctx->lc[pnum].last_token_str[ntok], ctx->lc[pnum].last_token_int[ntok]); ctx->lc[cnum].last_token_type[ntok] = N_DIGITS0; ctx->lc[cnum].last_token_int [ntok] = ctx->lc[pnum].last_token_int[ntok]; ctx->lc[cnum].last_token_str [ntok] = ctx->lc[pnum].last_token_str[ntok]; break; default: return -1; } break; default: // an elided N_END case N_END: if (len+1 >= name_len) return -1; name[len++] = 0; ctx->lc[cnum].last_token_type[ntok] = N_END; ctx->lc[cnum].last_name = name; ctx->lc[cnum].last_ntok = ntok; return len; } } return -1; } //----------------------------------------------------------------------------- // arith adaptive codec or static rANS 4x16pr codec static int arith_encode(uint8_t *in, uint64_t in_len, uint8_t *out, uint64_t *out_len, int method) { unsigned int olen = *out_len-6, nb; if (arith_compress_to(in, in_len, out+6, &olen, method) == NULL) return -1; nb = var_put_u32(out, out + *out_len, olen); memmove(out+nb, out+6, olen); *out_len = olen+nb; return 0; } // Returns number of bytes read from 'in' on success, // -1 on failure. static int64_t arith_decode(uint8_t *in, uint64_t in_len, uint8_t *out, uint64_t *out_len) { unsigned int olen = *out_len; uint32_t clen; int nb = var_get_u32(in, in+in_len, &clen); //fprintf(stderr, "Arith decode %x\n", in[nb]); if (arith_uncompress_to(in+nb, in_len-nb, out, &olen) == NULL) return -1; //fprintf(stderr, " Stored clen=%d\n", (int)clen); return clen+nb; } static int rans_encode(uint8_t *in, uint64_t in_len, uint8_t *out, uint64_t *out_len, int method) { unsigned int olen = *out_len-6, nb; if (rans_compress_to_4x16(in, in_len, out+6, &olen, method) == NULL) return -1; nb = var_put_u32(out, out + *out_len, olen); memmove(out+nb, out+6, olen); *out_len = olen+nb; return 0; } // Returns number of bytes read from 'in' on success, // -1 on failure. static int64_t rans_decode(uint8_t *in, uint64_t in_len, uint8_t *out, uint64_t *out_len) { unsigned int olen = *out_len; uint32_t clen; int nb = var_get_u32(in, in+in_len, &clen); //fprintf(stderr, "Arith decode %x\n", in[nb]); if (rans_uncompress_to_4x16(in+nb, in_len-nb, out, &olen) == NULL) return -1; //fprintf(stderr, " Stored clen=%d\n", (int)clen); return clen+nb; } static int compress(uint8_t *in, uint64_t in_len, int level, int use_arith, uint8_t *out, uint64_t *out_len) { uint64_t best_sz = UINT64_MAX; int best = 0; uint64_t olen = *out_len; //fprintf(stderr, "=== try %d ===\n", (int)in_len); int m, rmethods[5][12] = { {2, 0, 128}, // 1 {2, 0, 192+8}, // 3 {3, 0, 128, 193+8}, // 5 {6, 0,1, 129, 65, 193, 193+8}, // 7 {9, 0,1,128,129,64,65,192,193, 193+8}, // 9 }; // 1-9 to 0-4 level = (level-1)/2; if (level<0) level=0; if (level>4) level=4; for (m = 1; m <= rmethods[level][0]; m++) { *out_len = olen; if (in_len % 4 != 0 && (rmethods[level][m] & 8)) continue; if (use_arith) { if (arith_encode(in, in_len, out, out_len, rmethods[level][m]) < 0) return -1; } else { if (rans_encode(in, in_len, out, out_len, rmethods[level][m]) < 0) return -1; } if (best_sz > *out_len) { best_sz = *out_len; best = rmethods[level][m]; } } *out_len = olen; if (use_arith) { if (arith_encode(in, in_len, out, out_len, best) < 0) return -1; } else { if (rans_encode(in, in_len, out, out_len, best) < 0) return -1; } // uint64_t tmp; // fprintf(stderr, "%d -> %d via method %x, %x\n", (int)in_len, (int)best_sz, best, out[i7get(out,&tmp)]); return 0; } static uint64_t uncompressed_size(uint8_t *in, uint64_t in_len) { uint32_t clen, ulen; // in[0] in part of buffer written by us int nb = var_get_u32(in, in+in_len, &clen); // in[nb] is part of buffer written to by arith_dynamic. var_get_u32(in+nb+1, in+in_len, &ulen); return ulen; } static int uncompress(int use_arith, uint8_t *in, uint64_t in_len, uint8_t *out, uint64_t *out_len) { uint32_t clen; var_get_u32(in, in+in_len, &clen); return use_arith ? arith_decode(in, in_len, out, out_len) : rans_decode(in, in_len, out, out_len); } //----------------------------------------------------------------------------- /* * Converts a line or \0 separated block of reading names to a compressed buffer. * The code can only encode whole lines and will not attempt a partial line. * Use the "last_start_p" return value to identify the partial line start * offset, for continuation purposes. * * Returns a malloced buffer holding compressed data of size *out_len, * or NULL on failure */ uint8_t *encode_names(char *blk, int len, int level, int use_arith, int *out_len, int *last_start_p) { int last_start = 0, i, j, nreads; // Count lines for (nreads = i = 0; i < len; i++) if (blk[i] <= '\n') // \n or \0 separated entries nreads++; name_context *ctx = create_context(nreads); if (!ctx) return NULL; // Construct trie int ctr = 0; for (i = j = 0; i < len; j=++i) { while (i < len && blk[i] > '\n') i++; if (i >= len) break; //blk[i] = '\0'; last_start = i+1; if (build_trie(ctx, &blk[j], i-j, ctr++) < 0) { free_context(ctx); return NULL; } } if (last_start_p) *last_start_p = last_start; //fprintf(stderr, "Processed %d of %d in block, line %d\n", last_start, len, ctr); // Encode name for (i = j = 0; i < len; j=++i) { while (i < len && blk[i] > '\n') i++; if (i >= len) break; blk[i] = '\0'; // try both 0 and 1 and pick best? if (encode_name(ctx, &blk[j], i-j, 1) < 0) { free_context(ctx); return NULL; } } #if 0 for (i = 0; i < ctx->max_tok*16; i++) { char fn[1024]; if (!ctx->desc[i].buf_l) continue; sprintf(fn, "_tok.%02d_%02d.%d", i>>4,i&15,i); FILE *fp = fopen(fn, "w"); fwrite(ctx->desc[i].buf, 1, ctx->desc[i].buf_l, fp); fclose(fp); } #endif //dump_trie(t_head, 0); // FIXME: merge descriptors // // If we see foo7:1 foo7:12 foo7:7 etc then foo: is constant, // but it's encoded as alpha+dig<7>+char<:> instead of alpha. // Any time token type 0 is all match beyond the first location we have // a candidate for merging in string form. // // This saves around .1 to 1.3 percent on varying data sets. // Cruder hack is dedicated prefix/suffix matching to short-cut this. // Drop N_TYPE blocks if they all contain matches bar the first item, // as we can regenerate these from the subsequent blocks types during // decode. for (i = 0; i < ctx->max_tok*16; i+=16) { if (!ctx->desc[i].buf_l) continue; int z; for (z=1; zdesc[i].buf_l; z++) { if (ctx->desc[i].buf[z] != N_MATCH) break; } if (z == ctx->desc[i].buf_l) { int k; for (k=1; k<16; k++) if (ctx->desc[i+k].buf_l) break; if (k < 16) { ctx->desc[i].buf_l = 0; free(ctx->desc[i].buf); ctx->desc[i].buf = NULL; } } } // Serialise descriptors uint32_t tot_size = 9; int ndesc = 0; for (i = 0; i < ctx->max_tok*16; i++) { if (!ctx->desc[i].buf_l) continue; ndesc++; int tnum = i>>4; int ttype = i&15; uint64_t out_len = 1.5 * arith_compress_bound(ctx->desc[i].buf_l, 1); // guesswork uint8_t *out = malloc(out_len); if (!out) { free_context(ctx); return NULL; } if (compress(ctx->desc[i].buf, ctx->desc[i].buf_l, level, use_arith, out, &out_len) < 0) { free_context(ctx); return NULL; } free(ctx->desc[i].buf); ctx->desc[i].buf = out; ctx->desc[i].buf_l = out_len; ctx->desc[i].tnum = tnum; ctx->desc[i].ttype = ttype; // Find dups int j; for (j = 0; j < i; j++) { if (!ctx->desc[j].buf) continue; if (ctx->desc[i].buf_l != ctx->desc[j].buf_l || ctx->desc[i].buf_l <= 4) continue; if (memcmp(ctx->desc[i].buf, ctx->desc[j].buf, ctx->desc[i].buf_l) == 0) break; } if (j < i) { ctx->desc[i].dup_from = j; tot_size += 3; // flag, dup_from, ttype } else { ctx->desc[i].dup_from = 0; tot_size += out_len + 1; // ttype } } #if 0 for (i = 0; i < ctx->max_tok*16; i++) { char fn[1024]; if (!ctx->desc[i].buf_l && !ctx->desc[i].dup_from) continue; sprintf(fn, "_tok.%02d_%02d.%d.comp", i>>4,i&15,i); FILE *fp = fopen(fn, "w"); fwrite(ctx->desc[i].buf, 1, ctx->desc[i].buf_l, fp); fclose(fp); } #endif // Write uint8_t *out = malloc(tot_size+13); if (!out) { free_context(ctx); return NULL; } uint8_t *cp = out; *out_len = tot_size; // *(uint32_t *)cp = last_start; cp += 4; // *(uint32_t *)cp = nreads; cp += 4; *cp++ = (last_start >> 0) & 0xff; *cp++ = (last_start >> 8) & 0xff; *cp++ = (last_start >> 16) & 0xff; *cp++ = (last_start >> 24) & 0xff; *cp++ = (nreads >> 0) & 0xff; *cp++ = (nreads >> 8) & 0xff; *cp++ = (nreads >> 16) & 0xff; *cp++ = (nreads >> 24) & 0xff; *cp++ = use_arith; //write(1, &nreads, 4); int last_tnum = -1; for (i = 0; i < ctx->max_tok*16; i++) { if (!ctx->desc[i].buf_l) continue; uint8_t ttype8 = ctx->desc[i].ttype; if (ctx->desc[i].tnum != last_tnum) { ttype8 |= 128; last_tnum = ctx->desc[i].tnum; } if (ctx->desc[i].dup_from) { //fprintf(stderr, "Dup %d from %d, sz %d\n", i, ctx->desc[i].dup_from, ctx->desc[i].buf_l); *cp++ = ttype8 | 64; *cp++ = ctx->desc[i].dup_from >> 4; *cp++ = ctx->desc[i].dup_from & 15; } else { *cp++ = ttype8; memcpy(cp, ctx->desc[i].buf, ctx->desc[i].buf_l); cp += ctx->desc[i].buf_l; } } //assert(cp-out == tot_size); free_context(ctx); return out; } /* * Decodes a compressed block of read names into \0 separated names. * The size of the data returned (malloced) is in *out_len. * * Returns NULL on failure. */ uint8_t *decode_names(uint8_t *in, uint32_t sz, uint32_t *out_len) { if (sz < 9) return NULL; int i, o = 9; //int ulen = *(uint32_t *)in; int ulen = (in[0]<<0) | (in[1]<<8) | (in[2]<<16) | (((uint32_t)in[3])<<24); if (ulen < 0 || ulen >= INT_MAX-1024) return NULL; //int nreads = *(uint32_t *)(in+4); int nreads = (in[4]<<0) | (in[5]<<8) | (in[6]<<16) | (((uint32_t)in[7])<<24); int use_arith = in[8]; name_context *ctx = create_context(nreads); if (!ctx) return NULL; // Unpack descriptors int tnum = -1; while (o < sz) { uint8_t ttype = in[o++]; if (ttype & 64) { if (o+2 >= sz) goto err; int j = in[o++]<<4; j += in[o++]; if (ttype & 128) { tnum++; if (tnum >= MAX_TOKENS) goto err; ctx->max_tok = tnum+1; memset(&ctx->desc[tnum<<4], 0, 16*sizeof(ctx->desc[tnum])); } if ((ttype & 15) != 0 && (ttype & 128)) { if (tnum < 0) goto err; ctx->desc[tnum<<4].buf = malloc(nreads); if (!ctx->desc[tnum<<4].buf) goto err; ctx->desc[tnum<<4].buf_l = 0; ctx->desc[tnum<<4].buf_a = nreads; ctx->desc[tnum<<4].buf[0] = ttype&15; memset(&ctx->desc[tnum<<4].buf[1], N_MATCH, nreads-1); } if (tnum < 0) goto err; i = (tnum<<4) | (ttype&15); if (j >= i) goto err; ctx->desc[i].buf_l = 0; ctx->desc[i].buf_a = ctx->desc[j].buf_a; if (ctx->desc[i].buf) free(ctx->desc[i].buf); ctx->desc[i].buf = malloc(ctx->desc[i].buf_a); if (!ctx->desc[i].buf) goto err; memcpy(ctx->desc[i].buf, ctx->desc[j].buf, ctx->desc[i].buf_a); //fprintf(stderr, "Copy ttype %d, i=%d,j=%d, size %d\n", ttype, i, j, (int)ctx->desc[i].buf_a); continue; } //if (ttype == 0) if (ttype & 128) { tnum++; if (tnum >= MAX_TOKENS) goto err; ctx->max_tok = tnum+1; memset(&ctx->desc[tnum<<4], 0, 16*sizeof(ctx->desc[tnum])); } if ((ttype & 15) != 0 && (ttype & 128)) { if (tnum < 0) goto err; if (ctx->desc[tnum<<4].buf) free(ctx->desc[tnum<<4].buf); ctx->desc[tnum<<4].buf = malloc(nreads); if (!ctx->desc[tnum<<4].buf) goto err; ctx->desc[tnum<<4].buf_l = 0; ctx->desc[tnum<<4].buf_a = nreads; ctx->desc[tnum<<4].buf[0] = ttype&15; memset(&ctx->desc[tnum<<4].buf[1], N_MATCH, nreads-1); } //fprintf(stderr, "Read %02x\n", c); // Load compressed block int64_t clen, ulen = uncompressed_size(&in[o], sz-o); if (ulen < 0 || ulen >= INT_MAX) goto err; if (tnum < 0) goto err; i = (tnum<<4) | (ttype&15); if (i >= MAX_TBLOCKS || i < 0) goto err; ctx->desc[i].buf_l = 0; if (ctx->desc[i].buf) free(ctx->desc[i].buf); ctx->desc[i].buf = malloc(ulen); if (!ctx->desc[i].buf) goto err; ctx->desc[i].buf_a = ulen; uint64_t usz = ctx->desc[i].buf_a; // convert from size_t for 32-bit sys clen = uncompress(use_arith, &in[o], sz-o, ctx->desc[i].buf, &usz); ctx->desc[i].buf_a = usz; if (clen < 0) goto err; assert(ctx->desc[i].buf_a == ulen); // fprintf(stderr, "%d: Decode tnum %d type %d clen %d ulen %d via %d\n", // o, tnum, ttype, (int)clen, (int)ctx->desc[i].buf_a, ctx->desc[i].buf[0]); o += clen; // Encode tnum 0 type 0 ulen 100000 clen 12530 via 2 // Encode tnum 0 type 6 ulen 196800 clen 43928 via 3 // Encode tnum 0 type 7 ulen 203200 clen 17531 via 3 // Encode tnum 1 type 0 ulen 50800 clen 10 via 1 // Encode tnum 1 type 1 ulen 3 clen 5 via 0 // Encode tnum 2 type 0 ulen 50800 clen 10 via 1 // } int ret; ulen += 1024; // for easy coding in decode_name. uint8_t *out = malloc(ulen); if (!out) goto err; size_t out_sz = 0; while ((ret = decode_name(ctx, (char *)out+out_sz, ulen)) > 0) { out_sz += ret; ulen -= ret; } if (ret < 0) free(out); free_context(ctx); *out_len = out_sz; return ret == 0 ? out : NULL; err: free_context(ctx); return NULL; }