/* Variable-length binary prefix codes for integers * * Contents: * 1. Exponential Golomb codes * 2. Golomb-Rice codes * 3. Elias delta code * 4. Google varint codes * 5. Unit tests * 6. Test driver * 7. Example * * See also: * esl_huffman : Huffman coding */ #include #include #include #include "easel.h" #include "esl_varint.h" /***************************************************************** * 1. Exponential Golomb codes *****************************************************************/ /* Function: esl_varint_expgol() * Synopsis: Generalized exponential Golomb coding * Incept: SRE, Wed 09 Jan 2019 * * Purpose: Encode nonnegative integer in exponential-Golomb- * code. $k=0$ is standard exponential Golomb code; $k>0$ * are the generalized Golomb codes. Optionally return codeword in * low-order bits of <*opt_code> (i.e. right flush), and * its length in <*opt_n>. * * Args: v : integer to encode; v >= 0 * k : parameter for which exp-Golomb code; k >= 0 * opt_code : optRETURN: encoding, right flushed (low order bits) * opt_n : optRETURN: length of encoding in bits * * Returns: on success. * * Throws: if caller requested <*opt_code> to be returned, * but encoding doesn't fit in a single . Now <*opt_code> * and <*opt_n> are set to 0. */ int esl_varint_expgol(int v, int k, uint64_t *opt_code, int *opt_n) { ESL_DASSERT1(( v >= 0 && k >= 0 )); uint64_t m = (1 << k); // 2^k. For k=0 standard expgol: m=1, uint64_t q = v / m; // \floor (v / 2^k) piece, to be encoded in exp-golomb-0 ... q=v, all bits here, uint64_t r = v % m; // will become rightmost bits ... r=0, no bits stored int n = 0; uint64_t code = q+1; int status; // q stored in exp-golomb-0. We already set low order bits to q+1; now we just need to determine width of q+1. q = q+1; while (q) { q >>= 1; n++; } // Let a = # of bits needed to represent q+1 n = 2 * n - 1; // now a-1 leading bits are 0, followed by a bits of q+1, for an exp-golomb-0 code. if (opt_code && n > 64) ESL_XEXCEPTION(eslERANGE, "exponential Golomb codeword length > 64"); // r stored in its binary representation, k bits. code = (code << k) | r; n += k; if (opt_code) *opt_code = code; if (opt_n) *opt_n = n; return eslOK; ERROR: if (opt_code) *opt_code = 0; if (opt_n) *opt_n = 0; return status; } /* Function: esl_varint_expgol_decode() * Synopsis: Decode an exponential Golomb codeword. * Incept: SRE, Thu 10 Jan 2019 [Slaid Cleaves, Still Fighting the War] * * Purpose: Decode the leading (high order) prefix bits of


 *            according to exponential-Golomb- code. Optionally
 *            return the decoded integer value in <*opt_v>, and the
 *            length of the decoded codeword in <*opt_n>.
 *            
 *            Note that  is _left flush_ for decoding: the
 *            prefix codeword is decoded from the leftmost, most
 *            significant bit. 
 *
 *            If  consists of only 0's, and therefore has no
 *            complete codeword, return . This can be used to
 *            detect normal end of a bit string.
 *
 *            If  does contain a 1 bit but does not contain a
 *            complete codeword, assume it is corrupted somehow,
 *            and return .
 *
 * Args:      code : bit string to decode, starting from MSB
 *            k    : parameter for which exp-Golomb code; k >= 0
 *            opt_v: optRETURN: integer value
 *            opt_n: optRETURN: length of decoded prefix codeword, in bits
 *
 * Returns:    on success. 
 *
 *             if  is all 0 bits.  
 *             if at least one 1 bit is set, yet no
 *            codeword is present. On errors, <*opt_v> and <*opt_n>
 *            are 0.
 */
int
esl_varint_expgol_decode(uint64_t code, int k, int *opt_v, int *opt_n)
{
  int b,n,r,v;
  int status;

  if (! code) { status = eslEOD; goto ERROR; }   // all exp-Golomb codewords contain at least one 1

  for (b = 63; b >= 0; b--)
    if (code & (1ull << b)) break;  // now b is position of leftmost 1 bit (i.e. MSB); 63..0.
  n = 2*(64-b)+k-1;                 // ...and from that, we know the size of the codeword.
  if (n > 64) { status = eslECORRUPT; goto ERROR; } // looks like a code but can't be valid

  // the n-bit codeword consists of the binary representation of q+1 (with leading zeros),
  // followed by a k-bit remainder. 
  code >>= 64-n;
  r    = code & ((1ull << k) - 1);
  code >>= k;
  v    = (code-1) * (1ull << k) + r;

  if (opt_v) *opt_v = v;
  if (opt_n) *opt_n = n;
  return eslOK;

 ERROR:
  if (opt_v) *opt_v = 0;
  if (opt_n) *opt_n = 0;
  return status;
}
    

/*****************************************************************
 * 2. Golomb-Rice codes
 *****************************************************************/

/* Function:  esl_varint_rice()
 * Synopsis:  Golomb-Rice coding
 * Incept:    SRE, Sat 12 Jan 2019 [Ramin Djawadi, Westworld, The Maze]
 *
 * Purpose:   Encode nonnegative integer  in Golomb-Rice- code,
 *            for $k \geq 0$. Optionally return codeword in low-order bits of 
 *            <*opt_code> (i.e. right flush) and its length in <*opt_n>.
 *
 * Args:      v        : integer to encode; v >= 0
 *            k        : parameter for which Golomb-Rice code; k >= 0
 *            opt_code : optRETURN: encoding, right flushed (low order bits)
 *            opt_n    : optRETURN: length of encoding in bits
 *
 * Returns:    on success.
 *
 * Throws:     if caller requested <*opt_code> to be returned,
 *            but encoding doesn't fit in a single . Now <*opt_code>
 *            and <*opt_n> are set to 0.
 */
int
esl_varint_rice(int v, int k, uint64_t *opt_code, int *opt_n)
{
  ESL_DASSERT1(( v >= 0 && k >= 0));
  int      m    = (1 << k); // 2^k
  int      q    = v / m;    // int(v / 2^k), quotient.  code leads with this many 1's; then a 0
  int      r    = v % m;    // remainder. encoded in k bits
  uint64_t code = 0;
  int      status;

  if (opt_code && q > 64-(k+1)) ESL_XEXCEPTION(eslERANGE, "Golomb-Rice codeword length > 64");

  code = (1ull << q) - 1;       // sets q low-order bits to 1; we know 2^{q+1} fits in uint64_t
  code  = code << (1 + k);      // bit k+1 is a 0; low-order k bits hold r
  code |= r;

  if (opt_code) *opt_code = code;
  if (opt_n)    *opt_n    = q + k + 1;
  return eslOK;

 ERROR:
  if (opt_code) *opt_code = 0;
  if (opt_n)    *opt_n    = 0;
  return status;
}


/* Function:  esl_varint_rice_decode()
 * Synopsis:  Decode a Golomb-Rice codeword
 * Incept:    SRE, Sat 12 Jan 2019 [Real Madrid v. Leganes]
 *
 * Purpose:   Decode the leading (high order) prefix bits of  
 *            according to Golomb-Rice- code. Optionally return the
 *            decoded integer value in <*opt_v>, and the length
 *            of the decoded codeword in <*opt_n>.
 *            
 *            Note that  is _left flush_ for decoding:
 *            the prefix codeword is decoded from the leftmost,
 *            most significant bit.
 *            
 *            If  does not contain a 0 bit, return ; it
 *            cannot contain a valid codeword.  A caller can shift on
 *            a pattern of all 1 bits to signal EOD.
 *            
 *            There is no check for overflowing . We assume that
 *             encodes a value that fits in an int.
 *
 * Args:      code : bit string to decode, starting from MSB
 *            k    : parameter for which Google varint code; k >= 2
 *            opt_v: optRETURN: integer value
 *            opt_n: optRETURN: length of decoded prefix codeword, in bits
 *
 * Returns:    on success. 
 *             if  has no completed codeword.
 *
 * Throws:    (no abnormal error conditions)
 */
int
esl_varint_rice_decode(uint64_t code, int k, int *opt_v, int *opt_n)
{
  int q;
  int status;

  if (~code == 0) { status = eslEOD; goto ERROR; } // valid rice code has at least one 0 bit

  for (q = 0; q <= 64-k; q++)
    if ( (~code) & (1ull << (63-q))) break;  // q = # of leading 1 bits. +1, +k: n = q + k + 1.

  code >>= 64-(q+k+1);  // shift 64 - (q+k+1) bits off to the right, right-flushing the codeword.
  /*                   q  *    2^k    +    r, obtained by masking low k bits */
  if (opt_v) *opt_v =  (int) (q * (1ull << k) + (code & ( (1ull << (k+1)) - 1))); 
  if (opt_n) *opt_n =  q + k + 1;
  return eslOK;

 ERROR:
  if (opt_v) *opt_v = 0;
  if (opt_n) *opt_n = 0;
  return status;
}



/*****************************************************************
 * 3. Elias delta code
 *****************************************************************/

/* Function:  esl_varint_delta
 * Synopsis:  Elias delta coding
 * Incept:    SRE, Sat 12 Jan 2019
 *
 * Purpose:   Encode positive integer  in Elias delta code.
 *            Optionally return codeword in low-order bits of
 *            <*opt_code> (i.e. right flush), and its length
 *            in <*opt_n>.
 *            
 * Args:      v        : integer to encode; v >= 1
 *            opt_code : optRETURN: encoding, right flushed (low order bits)
 *            opt_n    : optRETURN: length of encoding in bits
 *
 * Returns:    on success.
 *
 * Throws:     if caller requested <*opt_code> to be returned,
 *            but encoding doesn't fit in a single . Now <*opt_code>
 *            and <*opt_n> are set to 0.
 */
int
esl_varint_delta(int v, uint64_t *opt_code, int *opt_n)
{
  ESL_DASSERT1(( v >= 1 ));
  int  a = 0;  // \floor log_2 v      2^a is the highest power of 2 in v
  int  b = 0;  // \floor log_2 (a+1)
  int  n = 0;
  uint64_t code = 0;
  uint64_t tmp;
  int  status;

  tmp = ((uint64_t) v     >> 1); while (tmp) { tmp = tmp >> 1; a++; }  
  tmp = ((uint64_t) (a+1) >> 1); while (tmp) { tmp = tmp >> 1; b++; }

  n =  b;                                               // b leading zeros
  n += b+1; code = a+1;                                 // b+1 bits, representation of a+1
  n += a;   code = (code << a) | (v & ((1 << a) - 1));  // a bits, v % 2^a

  if (opt_code && n > 64) ESL_XEXCEPTION(eslERANGE, "Elias delta codeword length > 64");

  if (opt_code) *opt_code = code;
  if (opt_n)    *opt_n    = n;
  return eslOK;

 ERROR:
  if (opt_code) *opt_code = 0;
  if (opt_n)    *opt_n    = 0;
  return status;
}


/* Function:  esl_varint_delta_decode()
 * Synopsis:  Decode an Elias delta codeword.
 * Incept:    SRE, Sat 12 Jan 2019
 *
 * Purpose:   Decode the leading (high order) prefix bits of 
 *            using Elias delta code. Optionally return the decoded
 *            integer value in <*opt_v>, and/or the length of the
 *            decoded codeword in <*opt_n>.
 *            
 *            Note that  is _left flush_ for decoding: the 
 *            prefix codeword is decoded from the leftmost, most
 *            significant bit.
 *            
 *            If  consists of only 0's, and therefore has
 *            no complete codeword, return . This can be
 *            used to detect the normal end of a bit string.
 *            
 *            If  does contain a 1 bit but does not contain a
 *            complete codeword, assume it is corrupted somehow, and
 *            throw .
 *
 * Args:      code : bit string to decode, starting from MSB
 *            opt_v: optRETURN: integer value
 *            opt_n: optRETURN: length of decoded prefix codeword, in bits
 *
 * Returns:    on success. 
 *
 *             if  is all 0 bits.
 *             if at least one 1 bit is set, yet no codeword
 *            is present. On errors, <*opt_v> and <*opt_n> are 0.
 */
int
esl_varint_delta_decode(uint64_t code, int *opt_v, int *opt_n)
{
  int a,b,n,status;

  if (code == 0) { status = eslEOD; goto ERROR; }

  for (b = 0; b < 64; b++)
    if (code & (1ull << (63-b))) break;  // stop on first 1.  b = # of leading 0 bits
  a = (code >> (63 - 2*b)) - 1;          // After b leading 0's, b+1 bits encode a+1
  n = 2*b + a + 1;

  if (n > 64) { status = eslECORRUPT; goto ERROR; }

  // Last a bits encodes v mod 2^a, all but the highest order bit;
  // shift and mask to get those a bits, then add 2^a back.
  if (opt_v) *opt_v = (1<> (64 - n)) & ( (1< in Google varint- code;
 *            $k \geq 2$; $k=8$ gives you one-byte codewords, the code
 *            used by Google Protobuf. Optionally return code in
 *            low-order bits of <*opt_code> (i.e. right flushed), and
 *            length of that code in <*opt_n>.
 *          
 * Args:      v        : integer to encode; v >= 0
 *            k        : parameter for which Google varint code; k >= 2
 *            opt_code : optRETURN: encoding, right flushed (low order bits)
 *            opt_n    : optRETURN: length of encoding in bits
 *
 * Returns:    on success.
 *
 * Throws:     if caller requested <*opt_code> to be returned,
 *            but binary encoding won't fit in a single
 *            . <*opt_code> and <*opt_n> are set to 0.
 */
int
esl_varint_google(int v, int k, uint64_t *opt_code, int *opt_n)
{
  ESL_DASSERT1(( k >= 2 ));
  ESL_DASSERT1(( v >= 0 ));

  int      base = 1 << (k-1);  // 2^(k-1)
  uint64_t code = 0;
  int      n    = 0;
  int      status;
      
  code |= v % base;  
  v /= base;
  n += k;
  while (v > 0) 
    {
      code |= (1 << (k-1));
      code = code << k;
      
      code |= v % base;
      v /= base;
      n += k;
    }
  if (n > 64 && opt_code) ESL_XEXCEPTION(eslERANGE, "google varint code length > 64");
  
  if (opt_code) *opt_code = code;
  if (opt_n)    *opt_n    = n;
  return eslOK;

 ERROR:
  if (opt_code) *opt_code = 0;
  if (opt_n)    *opt_n    = 0;
  return status;
}


/* Function:  esl_varint_google_decode()
 * Synopsis:  Decode a Google varint codeword
 * Incept:    SRE, Fri 11 Jan 2019 
 *
 * Purpose:   Decode the leading (high order) prefix bits of 
 *            according to Google varint- code. Optionally
 *            return the decoded integer value in <*opt_v>, and the
 *            length of the decoded codeword in <*opt_n>.
 *            
 *            Note that  is _left flush_ for decoding: the
 *            prefix codeword is decoded from the leftmost, most
 *            significant bit. 
 *
 *            If  does not contain a termination codegroup (i.e.
 *            it consists only of continuation groups that start with
 *            1), return . A caller can shift on a pattern of
 *            all 1 bits to signal EOD.
 *
 *            There is no check for overflowing . We assume that
 *             encodes a value that fits in an int.
 *
 * Args:      code : bit string to decode, starting from MSB
 *            k    : parameter for which Google varint code; k >= 2
 *            opt_v: optRETURN: integer value
 *            opt_n: optRETURN: length of decoded prefix codeword, in bits
 *
 * Returns:    on success. 
 *             if  has no completed codeword.
 */
int
esl_varint_google_decode(uint64_t code, int k, int *opt_v, int *opt_n)
{
  uint64_t cmask = 1ull << (k-1);
  uint64_t vmask = cmask-1;
  int      v     = 0;
  uint64_t grp;                     // current k-bit code group 
  int      g;                       // which group (least significant group first)
  int      status;

  for (g = 0; g < 64/k; code <<= k, g++)
    {
      grp = code >> (64-k);
      v  += (grp & vmask) << (g*(k-1));
      if (!(grp & cmask)) break;
    }
  if (grp & cmask) { status = eslEOD; goto ERROR; }  

  if (opt_v) *opt_v = v;
  if (opt_n) *opt_n = (g+1)*k;
  return eslOK;
  
 ERROR:
  if (opt_v) *opt_v = 0;
  if (opt_n) *opt_n = 0;
  return status;
}




/*****************************************************************
 * 5. Unit tests
 *****************************************************************/
#ifdef eslVARINT_TESTDRIVE

#include "esl_random.h"

/* floor_log2()
 * Returns $w = \lfloor \log_2 v \rfloor$
 * w is the highest power of 2 in v;
 * equivalently, w is the MSB set in v
 * 
 * Example: v=53 ==> 00110101 ==> w=5  (2^5 = 32)
 */
static int
floor_log2(int v)
{
  return (v == 0 ? 0 : (int) floor(log2(v)));
}

/* floor_log2k()
 * Returns $\lfloor \log_{2^k} v \rfloor$
 */
static int
floor_log2k(int v, int k)
{
  return ( v == 0 ? 0 : (int) floor( log2(v)) / k);
}

int     len_expgol(int v, int k) { return (2 * floor_log2((v / (1<",                    0 },
  {  0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
};
static char usage[]  = "[-options]";
static char banner[] = "test driver for varint module";


int
main(int argc, char **argv)
{
  ESL_GETOPTS    *go   = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
  ESL_RANDOMNESS *rng  = esl_randomness_Create(esl_opt_GetInteger(go, "-s"));
 
  fprintf(stderr, "## %s\n", argv[0]);
  fprintf(stderr, "#  rng seed = %" PRIu32 "\n", esl_randomness_GetSeed(rng));

  utest_expgol(rng);
  utest_rice  (rng);
  utest_delta (rng);
  utest_google(rng);
 
  fprintf(stderr, "#  status = ok\n");
 
  esl_randomness_Destroy(rng);
  esl_getopts_Destroy(go);
  return eslOK;
}
#endif // eslVARINT_TESTDRIVE

/*****************************************************************
 * 7. Example
 *****************************************************************/
#ifdef eslVARINT_EXAMPLE

#include "esl_getopts.h"

/* `./esl_varint` or   `./esl_varint --tbl` (--tbl is default)
 *     makes tables of int v | codelen | codeword, as used in documentation.
 *     Markdown table format.
 * `./esl_varint -1 --fig1`
 *     makes xy data file of value vs. codelen, XMGRACE XY format.
 * `./esl_varint -1 --fig2`
 *     makes xy data file of value vs. implicit probability, XMGRACE XY format.
 *     
 * The -1 option is because when we encode runlengths, we never need
 * to encode 0. We get shorter codes if we offset our integer
 * runlength by -1 before encoding it.
 */

#define CODEOPTS "--expgol,--rice,--google,--delta"
#define OUTOPTS  "--tbl,--fig1,--fig2"

static ESL_OPTIONS options[] = {
  /* name          type          default  env range  toggles   reqs   incomp     help                   docgroup*/
  { "-h",          eslARG_NONE,   FALSE, NULL, NULL,     NULL, NULL,    NULL,  "show help and usage",                       0},
  { "-k",          eslARG_INT,      "2", NULL, NULL,     NULL, NULL,"--delta", "k parameter for an encoding",               0},
  { "-L",          eslARG_NONE,   FALSE, NULL, NULL,     NULL, NULL,    "-t",  "only show lengths, not code (for large -n)",0},
  { "-n",          eslARG_INT,    "200", NULL, NULL,     NULL, NULL,    NULL,  "show table for encoding of up to v=",    0},
  { "-t",          eslARG_NONE,   FALSE, NULL, NULL,     NULL, NULL,    NULL,  "format table for LaTeX tabular",            0},
  { "-1",          eslARG_NONE,   FALSE, NULL, NULL,     NULL, NULL,"--delta", "encode v-1 (i.e. 1..n not 0..n)",           0},
  { "--expgol",    eslARG_NONE,"default",NULL, NULL, CODEOPTS, NULL,    NULL,  "Use exp-Golomb-k encoding",                 0},
  { "--rice",      eslARG_NONE,   FALSE, NULL, NULL, CODEOPTS, NULL,    NULL,  "Use Golomb-Rice-k encoding",                0},
  { "--delta",     eslARG_NONE,   FALSE, NULL, NULL, CODEOPTS, NULL,    NULL,  "Use Elias delta encoding",                  0},
  { "--google",    eslARG_NONE,   FALSE, NULL, NULL, CODEOPTS, NULL,    NULL,  "Use Varint-k encoding",                     0},
  { "--tbl",       eslARG_NONE,"default",NULL, NULL,  OUTOPTS, NULL,    NULL,  "output table of ints and their codes",      0},
  { "--fig1",      eslARG_NONE,   FALSE, NULL, NULL,  OUTOPTS, NULL,    NULL,  "output XY fig data for v vs codelen",       0},
  { "--fig2",      eslARG_NONE,   FALSE, NULL, NULL,  OUTOPTS, NULL,    NULL,  "output XY fig data for v vs implicit prob", 0},
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 
};


static char banner[] = "table and figure data for varint prefix codes";
static char usage[]  = "[-options]";

static void
dump_uint64(FILE *fp, uint64_t code, int n, int fieldwidth)
{
  ESL_DASSERT1(( n > 0 ));
  ESL_DASSERT1(( fieldwidth > 0 ));
  int      i;

  fprintf(fp, "%*s", fieldwidth-n, "");  // i.e. right justification of the output
  for (i = n-1; i >= 0; i--)
    putc( (code & (1ull<