/* Bcj2Enc.c -- BCJ2 Encoder converter for x86 code (Branch CALL/JUMP variant2)
2023-04-02 : Igor Pavlov : Public domain */

#include "Precomp.h"

/* #define SHOW_STAT */
#ifdef SHOW_STAT
#include <stdio.h>
#define PRF2(s) printf("%s ip=%8x  tempPos=%d  src= %8x\n", s, (unsigned)p->ip64, p->tempPos, (unsigned)(p->srcLim - p->src));
#else
#define PRF2(s)
#endif

#include "Bcj2.h"
#include "CpuArch.h"

/* Range coder normalization threshold: while (range) is below this value,
   the encoder shifts one byte out of (low) and multiplies (range) by 256. */
#define kTopValue ((UInt32)1 << 24)
/* Number of precision bits in each adaptive bit probability. */
#define kNumBitModelTotalBits 11
/* Probability scale; Bcj2Enc_Init() starts every model at half of this value. */
#define kBitModelTotal (1 << kNumBitModelTotalBits)
/* Adaptation shift: after each coded bit the probability moves
   by 1/32 of its distance to the corresponding limit. */
#define kNumMoveBits 5

/*
  Bcj2Enc_Init() puts the encoder object (p) into its start state:
    - state machine : waiting for input (BCJ2_ENC_STATE_ORIG), CONTINUE finish mode,
                      not flushing, empty temp[] buffer, zero context;
    - positions     : ip64 and fileIp64 are 0, file size is unlimited,
                      default limit for relative offsets;
    - range coder   : full range, zero low, one pending cache byte,
                      5 flush steps remaining;
    - bit models    : every probability is set to one half (kBitModelTotal / 2).
  The stream pointers (bufs / lims / src / srcLim) are not touched here.
*/
void Bcj2Enc_Init(CBcj2Enc *p)
{
  CBcj2Prob *prob = p->probs;
  const CBcj2Prob * const probLim = p->probs + sizeof(p->probs) / sizeof(p->probs[0]);

  // state machine
  p->state = BCJ2_ENC_STATE_ORIG;
  p->finishMode = BCJ2_ENC_FINISH_MODE_CONTINUE;
  p->isFlushState = 0;
  p->flushRem = 5;
  p->tempPos = 0;
  p->context = 0;

  // position tracking and conversion limits
  p->ip64 = 0;
  p->fileIp64 = 0;
  p->fileSize64_minus1 = BCJ2_ENC_FileSizeField_UNLIMITED;
  p->relatLimit = BCJ2_ENC_RELAT_LIMIT_DEFAULT;
  // p->relatExcludeBits = 0;

  // range coder
  p->low = 0;
  p->range = 0xffffffff;
  p->cache = 0;
  p->cacheSize = 1;

  // adaptive bit models
  for (; prob != probLim; prob++)
    *prob = kBitModelTotal >> 1;
}

// Z7_NO_INLINE
Z7_FORCE_INLINE
/*
  Bcj2_RangeEnc_ShiftLow() shifts one byte out of the range coder (low) value.

  The top byte of the 32-bit part of (low) becomes the new cached byte.
  The cached byte and the run of pending 0xff bytes are written to the
  BCJ2_STREAM_RC buffer only when the carry is resolved:
  either the carry (bit 32 of low) is set, or the new top byte is not 0xff.

  returns:
    True  : BCJ2_STREAM_RC buffer is full. (p->state) is set to BCJ2_STREAM_RC,
            (low) is not shifted, and the call can be repeated later.
    False : the shift was done.
*/
static BoolInt Bcj2_RangeEnc_ShiftLow(CBcj2Enc *p)
{
  const UInt32 low32 = (UInt32)p->low;
  #if defined(Z7_MSC_VER_ORIGINAL) \
      && defined(MY_CPU_X86) \
      && defined(MY_CPU_LE) \
      && !defined(MY_CPU_64BIT)
    // we read upper 32 bits directly to get rid of __aullshr() call in MSVS-x86
    const unsigned carry = (unsigned)(((const UInt32 *)&p->low)[1]); // [1] : for little-endian only
  #else
    const unsigned carry = (unsigned)(p->low >> 32);
  #endif

  if (carry != 0 || low32 < (UInt32)0xff000000)
  {
    Byte *out = p->bufs[BCJ2_STREAM_RC];
    for (;;)
    {
      if (out == p->lims[BCJ2_STREAM_RC])
      {
        // no room : we save the write position and report it to the caller
        p->bufs[BCJ2_STREAM_RC] = out;
        p->state = BCJ2_STREAM_RC;
        return True;
      }
      *out++ = (Byte)(p->cache + carry);
      // all pending bytes after the first one are 0xff (before the carry is added)
      p->cache = 0xff;
      if (--p->cacheSize == 0)
        break;
    }
    p->bufs[BCJ2_STREAM_RC] = out;
    p->cache = (Byte)(low32 >> 24);
  }
  // 32-bit shift drops the byte that was moved to cache (or that is pending as 0xff)
  p->low = low32 << 8;
  p->cacheSize++;
  return False;
}


/*
We can use 2 alternative versions of code:
1) non-marker version:
  Byte CBcj2Enc::context
  Byte temp[8];
  Last byte of marker (e8/e9/[0f]8x) can be written to temp[] buffer.
  Encoder writes last byte of marker (e8/e9/[0f]8x) to dest, only in conjunction
  with writing branch symbol to range coder in same Bcj2Enc_Encode_2() call.

2) marker version:
  UInt32 CBcj2Enc::context
  Byte CBcj2Enc::temp[4];
  MARKER_FLAG in CBcj2Enc::context shows that CBcj2Enc::context contains the marker that was found.
  it's allowed that
    one call of Bcj2Enc_Encode_2() writes last byte of marker (e8/e9/[0f]8x) to dest,
    and another call of Bcj2Enc_Encode_2() does offset conversion.
    So different values of (fileIp) and (fileSize) are possible
    in these different Bcj2Enc_Encode_2() calls.

Also marker version requires additional if((v & MARKER_FLAG) == 0) check in main loop.
So we use non-marker version.
*/

/*
  Corner cases with overlap in multi-block.
  before v23: there was one corner case, where converted instruction
    could start in one sub-stream and finish in next sub-stream.
  If multi-block (solid) encoding is used,
    and BCJ2_ENC_FINISH_MODE_END_BLOCK is used for each sub-stream.
    and (0f) is last byte of previous sub-stream
    and (8x) is first byte of current sub-stream
  then (0f 8x) pair is treated as marker by BCJ2 encoder and decoder.
  BCJ2 encoder can convert the 32-bit offset for that (0f 8x) pair,
  if that offset meets limit requirements.
  If encoder allows 32-bit offset conversion for such overlap case,
  then the data in 3 uncompressed BCJ2 streams for some sub-stream
  can depend on data of previous sub-stream.
  That corner case is not big problem, and it's rare case.
  Since v23.00 we do additional check to prevent conversions in such overlap cases.
*/

/*
  Bcj2Enc_Encode_2() output variables at exit:
  {
    if (Bcj2Enc_Encode_2() exits with (p->state == BCJ2_ENC_STATE_ORIG))
    {
      it means that encoder needs more input data.
      if (p->srcLim == p->src) at exit, then
      {
        (p->finishMode != BCJ2_ENC_FINISH_MODE_END_STREAM)
        all input data were read and processed, and we are ready for
        new input data.
      }
      else
      {
        (p->srcLim != p->src)
        (p->finishMode == BCJ2_ENC_FINISH_MODE_CONTINUE)
          The encoder has found e8/e9/0f_8x marker,
          and p->src points to last byte of that marker,
          Bcj2Enc_Encode_2() needs more input data to get totally
          5 bytes (last byte of marker and 32-bit branch offset)
          as continuous array starting from p->src.
        (p->srcLim - p->src < 5) requirement is met after exit.
          So non-processed residue from p->src to p->srcLim is always less than 5 bytes.
      }
    }
  }
*/

/*
  Bcj2Enc_Encode_2() : base encoding step.
  Input  : contiguous bytes [p->src, p->srcLim).
  Output : [p->bufs[i], p->lims[i]) for each stream i.

  The function is resumable. It returns when:
    - some output stream has no room : (p->state) is the index of that stream
      (BCJ2_STREAM_MAIN, BCJ2_STREAM_RC, or the CALL/JUMP stream of current branch).
      The caller provides room in that stream and calls the function again.
    - more input is required : (p->state == BCJ2_ENC_STATE_ORIG).
    - (p->finishMode == BCJ2_ENC_FINISH_MODE_END_STREAM), all input was consumed and
      all flush steps of range coder were done : (p->state == BCJ2_ENC_STATE_FINISHED).

  Main loop:
    1) normalize the range coder, if (p->range < kTopValue).
    2) copy input bytes to MAIN stream until a marker (e8 / e9 / 0f 8x) is copied,
       or until the input / MAIN room is exhausted.
    3) for a marker: decide whether the following 32-bit relative offset is converted,
       code that decision as one bit with the range coder, and if it is converted,
       write the absolute target (current ip + relative offset) to CALL or JUMP
       stream instead of copying the 4 offset bytes to MAIN stream.
*/
Z7_NO_INLINE
static void Bcj2Enc_Encode_2(CBcj2Enc *p)
{
  if (!p->isFlushState)
  {
    const Byte *src;
    UInt32 v;
    {
      /* resume case: previous call returned because CALL/JUMP stream was full,
         and the computed target was saved in (p->tempTarget).
         We write it now, or return again if there is still no room. */
      const unsigned state = p->state;
      if (BCJ2_IS_32BIT_STREAM(state))
      {
        Byte *cur = p->bufs[state];
        if (cur == p->lims[state])
          return;
        SetBe32a(cur, p->tempTarget)
        p->bufs[state] = cur + 4;
      }
    }
    p->state = BCJ2_ENC_STATE_ORIG; // for main reason of exit
    src = p->src;
    v = p->context;
    
    // #define WRITE_CONTEXT  p->context = v; // for marker version
    // only low 8 bits of (v) are stored: the value of last processed byte
    #define WRITE_CONTEXT           p->context = (Byte)v;
    #define WRITE_CONTEXT_AND_SRC   p->src = src;  WRITE_CONTEXT

    for (;;)
    {
      // const Byte *src;
      // UInt32 v;
      CBcj2Enc_ip_unsigned ip;
      if (p->range < kTopValue)
      {
        // to reduce register pressure and code size: we save and restore local variables.
        WRITE_CONTEXT_AND_SRC
        // ShiftLow returns True, if RC stream is full. (p->state) is BCJ2_STREAM_RC in that case.
        if (Bcj2_RangeEnc_ShiftLow(p))
          return;
        p->range <<= 8;
        src = p->src;
        v = p->context;
      }
      // src = p->src;
      // #define MARKER_FLAG  ((UInt32)1 << 17)
      // if ((v & MARKER_FLAG) == 0) // for marker version
      {
        const Byte *srcLim;
        Byte *dest = p->bufs[BCJ2_STREAM_MAIN];
        {
          // the copy run is limited by min(remaining input, remaining room in MAIN stream)
          const SizeT remSrc = (SizeT)(p->srcLim - src);
          SizeT rem = (SizeT)(p->lims[BCJ2_STREAM_MAIN] - dest);
          if (rem >= remSrc)
            rem = remSrc;
          srcLim = src + rem;
        }
        /* p->context contains context of previous byte:
           bits [0 : 7]  : src[-1], if (src) was changed in this call
           bits [8 : 31] : are undefined for non-marker version
        */
        // v = p->context;
        #define NUM_SHIFT_BITS  24
        #define CONV_FLAG  ((UInt32)1 << 16)
        /* ONE_ITER copies one byte (b) to MAIN stream and sets
             v = (previous byte << NUM_SHIFT_BITS) | b.
           It breaks from the loop without increasing (src), if marker was found:
             first check  : (b == 0xe8 || b == 0xe9)
             second check : (previous byte == 0x0f) && (0x80 <= b <= 0x8f)
           Otherwise it increases (src) and breaks, if (src == srcLim).
           So (src != srcLim) after the loop means that marker was found,
           and (src) points to the last byte of that marker. */
        #define ONE_ITER { \
          b = src[0]; \
          *dest++ = (Byte)b; \
          v = (v << NUM_SHIFT_BITS) | b; \
          if (((b + (0x100 - 0xe8)) & 0xfe) == 0) break; \
          if (((v - (((UInt32)0x0f << (NUM_SHIFT_BITS)) + 0x80)) & \
              ((((UInt32)1 << (4 + NUM_SHIFT_BITS)) - 0x1) << 4)) == 0) break; \
          src++; if (src == srcLim) { break; } }

        if (src != srcLim)
        for (;;)
        {
          /* clang can generate ineffective code with setne instead of two jcc instructions.
             we can use 2 iterations and external (unsigned b) to avoid that ineffective code generation. */
          unsigned b;
          ONE_ITER
          ONE_ITER
        }
        
        // (ip) : position after the last byte that was written to MAIN stream
        ip = p->ip64 + (CBcj2Enc_ip_unsigned)(SizeT)(dest - p->bufs[BCJ2_STREAM_MAIN]);
        p->bufs[BCJ2_STREAM_MAIN] = dest;
        p->ip64 = ip;

        if (src == srcLim)
        {
          // no marker at the end of the copied run
          WRITE_CONTEXT_AND_SRC
          if (src != p->srcLim)
          {
            // there is more input, so the run was limited by room in MAIN stream
            p->state = BCJ2_STREAM_MAIN;
            return;
          }
          /* (p->src == p->srcLim)
          (p->state == BCJ2_ENC_STATE_ORIG) */
          if (p->finishMode != BCJ2_ENC_FINISH_MODE_END_STREAM)
            return;
          /* (p->finishMode == BCJ2_ENC_FINISH_MODE_END_STREAM */
          // (p->flushRem == 5);
          // we switch to flush state. The flush loop after the main loop will finish the stream.
          p->isFlushState = 1;
          break;
        }
        // we skip the last byte of marker. That byte was already copied to MAIN stream.
        src++;
        // p->src = src;
      }
      // ip = p->ip; // for marker version
      /* marker was found */
      /* (v) contains marker that was found:
           bits [NUM_SHIFT_BITS : NUM_SHIFT_BITS + 7]
                         : value of src[-2] : xx/xx/0f
           bits [0 : 7]  : value of src[-1] : e8/e9/8x
      */
      {
        {
        #if NUM_SHIFT_BITS != 24
          v &= ~(UInt32)CONV_FLAG;
        #endif
          // UInt32 relat = 0;
          if ((SizeT)(p->srcLim - src) >= 4)
          {
            /*
            if (relat != 0 || (Byte)v != 0xe8)
            BoolInt isBigOffset = True;
            */
            const UInt32 relat = GetUi32(src);
            /*
            #define EXCLUDE_FLAG  ((UInt32)1 << 4)
            #define NEED_CONVERT(rel) ((((rel) + EXCLUDE_FLAG) & (0 - EXCLUDE_FLAG * 2)) != 0)
            if (p->relatExcludeBits != 0)
            {
              const UInt32 flag = (UInt32)1 << (p->relatExcludeBits - 1);
              isBigOffset = (((relat + flag) & (0 - flag * 2)) != 0);
            }
            // isBigOffset = False; // for debug
            */
            /* now (ip) is the position after the marker relative to (p->fileIp64).
               The offset is converted only if all 3 checks are passed:
                 1) (ip > 0) for (e8/e9), (ip > 1) for (0f 8x);
                 2) (ip + 4 + relat), where (relat) is used as signed value,
                    is not larger than (p->fileSize64_minus1);
                 3) check of (relat) against (p->relatLimit). */
            ip -= p->fileIp64;
            // Use the following if check, if (ip) is 64-bit:
            if (ip > (((v + 0x20) >> 5) & 1))  // 23.00 : we eliminate multi-block overlap for (0f 80) and (e8/e9)
            if ((CBcj2Enc_ip_unsigned)((CBcj2Enc_ip_signed)ip + 4 + (Int32)relat) <= p->fileSize64_minus1)
            if (((UInt32)(relat + p->relatLimit) >> 1) < p->relatLimit)
              v |= CONV_FLAG;
          }
          else if (p->finishMode == BCJ2_ENC_FINISH_MODE_CONTINUE)
          {
            /* less than 4 bytes of input after the marker, and more input can follow:
               we cancel the copying of marker byte (ip64, MAIN pointer and src are moved back),
               we restore the previous byte as context, and we return to wait for more input. */
            // (p->srcLim - src < 4)
            // /*
            // for non-marker version
            p->ip64--; // p->ip = ip - 1;
            p->bufs[BCJ2_STREAM_MAIN]--;
            src--;
            v >>= NUM_SHIFT_BITS;
            // (0 < p->srcLim - p->src <= 4)
            // */
            // v |= MARKER_FLAG; // for marker version
            /* (p->state == BCJ2_ENC_STATE_ORIG) */
            WRITE_CONTEXT_AND_SRC
            return;
          }
          /* if there are less than 4 bytes after the marker, and finishMode is not CONTINUE,
             then CONV_FLAG is not set, and the marker is coded below as non-converted. */
          {
            /* probability index:
                 e8    : 2 + (byte before e8)
                 e9    : 1
                 0f 8x : 0
               (c == 1) only for e8 */
            const unsigned c = ((v + 0x17) >> 6) & 1;
            CBcj2Prob *prob = p->probs + (unsigned)
                (((0 - c) & (Byte)(v >> NUM_SHIFT_BITS)) + c + ((v >> 5) & 1));
            /*
                ((Byte)v == 0xe8 ? 2 + ((Byte)(v >> 8)) :
                ((Byte)v < 0xe8 ? 0 : 1));  // ((v >> 5) & 1));
            */
            const unsigned ttt = *prob;
            const UInt32 bound = (p->range >> kNumBitModelTotalBits) * ttt;
            if ((v & CONV_FLAG) == 0)
            {
              /* non-converted branch: we use the lower part of range, we increase
                 the probability, and we continue the copying of bytes to MAIN stream */
              // static int yyy = 0; yyy++; printf("\n!needConvert = %d\n", yyy);
              // v = (Byte)v; // for marker version
              p->range = bound;
              *prob = (CBcj2Prob)(ttt + ((kBitModelTotal - ttt) >> kNumMoveBits));
              // WRITE_CONTEXT_AND_SRC
              continue;
            }
            // converted branch: we use the upper part of range, and we decrease the probability
            p->low += bound;
            p->range -= bound;
            *prob = (CBcj2Prob)(ttt - (ttt >> kNumMoveBits));
          }
          // p->context = src[3];
          {
            // const unsigned cj = ((Byte)v == 0xe8 ? BCJ2_STREAM_CALL : BCJ2_STREAM_JUMP);
            const unsigned cj = (((v + 0x57) >> 6) & 1) + BCJ2_STREAM_CALL;
            // 4 bytes of offset are consumed from input, but they are not written to MAIN stream
            ip = p->ip64;
            v = GetUi32(src); // relat
            ip += 4;
            p->ip64 = ip;
            src += 4;
            // p->src = src;
            {
              // absolute target = (position after the offset field) + (relative offset)
              const UInt32 absol = (UInt32)ip + v;
              Byte *cur = p->bufs[cj];
              // new context: the highest byte of (relat), that is src[3] of the offset field
              v >>= 24;
              // WRITE_CONTEXT
              if (cur == p->lims[cj])
              {
                /* no room in CALL/JUMP stream: we save the target in (p->tempTarget).
                   It will be written at the start of next call. */
                p->state = cj;
                p->tempTarget = absol;
                WRITE_CONTEXT_AND_SRC
                return;
              }
              SetBe32a(cur, absol)
              p->bufs[cj] = cur + 4;
            }
          }
        }
      }
    } // end of loop
  }

  /* flush state: we call ShiftLow for each remaining flush step.
     (p->flushRem) is decreased only after successful step, so the flushing
     can be continued in next call, if RC stream was full. */
  for (; p->flushRem != 0; p->flushRem--)
    if (Bcj2_RangeEnc_ShiftLow(p))
      return;
  p->state = BCJ2_ENC_STATE_FINISHED;
}


/*
BCJ2 encoder needs look ahead for up to 4 bytes in (src) buffer.
So base function Bcj2Enc_Encode_2()
  in BCJ2_ENC_FINISH_MODE_CONTINUE mode can return with
  (p->state == BCJ2_ENC_STATE_ORIG && p->src < p->srcLim)
Bcj2Enc_Encode() solves that look ahead problem by using p->temp[] buffer.
  so if (p->state == BCJ2_ENC_STATE_ORIG) after Bcj2Enc_Encode(),
    then (p->src == p->srcLim).
  And the caller's code is simpler with Bcj2Enc_Encode().
*/

/*
  Bcj2Enc_Encode() : encoding step for the caller's code.
  It wraps Bcj2Enc_Encode_2() and hides the look-ahead requirement:
    - bytes that Bcj2Enc_Encode_2() could not process in previous call
      are kept in p->temp[] and they are encoded first. Bytes from the caller's
      (src) buffer are appended to temp[] one by one, only while it is required.
    - if this call finishes with (p->state == BCJ2_ENC_STATE_ORIG), then all
      non-processed input bytes are moved to temp[], and (p->src == p->srcLim).
*/
Z7_NO_INLINE
void Bcj2Enc_Encode(CBcj2Enc *p)
{
  PRF2("\n----")
  if (p->tempPos != 0)
  {
    /* borrowed : number of bytes of caller's (src) buffer
       that were appended to temp[] in this call */
    unsigned borrowed = 0;
    /* The first iteration encodes temp[] as is, without new bytes from (src),
       because previous call could use another (finishMode), or it could finish
       with (p->state != BCJ2_ENC_STATE_ORIG). */
    for (;;)
    {
      // caller's input window and finish mode. They are restored after the temp[] pass
      const Byte * const callerSrc = p->src;
      const Byte * const callerLim = p->srcLim;
      const EBcj2Enc_FinishMode callerMode = p->finishMode;
      unsigned used, left, k;

      // temp[] is not the end of data, if there are caller's bytes after it
      if (callerSrc != callerLim)
        p->finishMode = BCJ2_ENC_FINISH_MODE_CONTINUE;
      p->srcLim = p->temp + p->tempPos;
      p->src = p->temp;
      PRF2("    ")
      Bcj2Enc_Encode_2(p);

      // we remove processed bytes from the start of temp[]
      used = (unsigned)(p->src - p->temp);
      left = p->tempPos - used;
      p->tempPos = left;
      for (k = 0; k < left; k++)
        p->temp[k] = p->temp[(SizeT)k + used];

      p->finishMode = callerMode;
      p->srcLim = callerLim;
      p->src = callerSrc;

      if (p->state != BCJ2_ENC_STATE_ORIG)
      {
        /* Bcj2Enc_Encode_2() has stopped for another reason (output stream is full
           or stream is finished). We exit with same reason.
           optional rollback: borrowed bytes that are still at the end of temp[]
           are returned to caller's (src) buffer. */
        const unsigned back = (borrowed < left) ? borrowed : left;
        p->tempPos = left - back;
        p->src = callerSrc - back;
        return;
      }

      // (p->state == BCJ2_ENC_STATE_ORIG) : encoder needs more data than temp[] has
      if (callerSrc == callerLim)
        return; // there is no more input data in caller's buffer

      if (borrowed >= left)
      {
        /* all remaining bytes of temp[] are bytes of caller's buffer of this call.
           So we rollback (src), and we continue the encoding without temp[] */
        p->tempPos = 0;
        p->src = callerSrc - left;
        break;
      }

      // we append one more byte of caller's buffer to temp[], and we try again
      p->temp[left] = *callerSrc;
      p->src = callerSrc + 1;
      p->tempPos = left + 1;
      borrowed++;
    }
  }

  PRF2("++++")
  // (p->tempPos == 0)
  Bcj2Enc_Encode_2(p);
  PRF2("====")

  if (p->state != BCJ2_ENC_STATE_ORIG)
    return;
  {
    /* encoder needs more input. Non-processed tail (marker byte and
       incomplete offset) is moved to temp[], and (src) is set to (srcLim) */
    const Byte * const tail = p->src;
    const Byte * const lim = p->srcLim;
    const unsigned rem = (unsigned)(lim - tail);
    unsigned k;
    if (rem == 0)
      return;
    p->tempPos = rem;
    p->src = lim;
    for (k = 0; k < rem; k++)
      p->temp[k] = tail[k];
  }
}

/* We remove the helper macros that were defined in this file
   (presumably so they can not conflict with other code, if several
   source files are compiled as one unit - TODO confirm).
   MARKER_FLAG is defined only in commented-out "marker version" code,
   so #undef has no effect for it in current version. */
#undef PRF2
#undef CONV_FLAG
#undef MARKER_FLAG
#undef WRITE_CONTEXT
#undef WRITE_CONTEXT_AND_SRC
#undef ONE_ITER
#undef NUM_SHIFT_BITS
#undef kTopValue
#undef kNumBitModelTotalBits
#undef kBitModelTotal
#undef kNumMoveBits