/* Mednafen - Multi-system Emulator
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#include <boolean.h>
#include <retro_miscellaneous.h>
#include <compat/intrinsics.h>
#include <math/float_minmax.h>

#include "psx.h"
#include "gte.h"

#include "../math_ops.h"
#include "../state_helpers.h"
#include "../pgxp/pgxp_gte.h"
#include "../pgxp/pgxp_main.h"

extern bool psx_gte_overclock;

#include "../clamp.h"

/* Notes:

 AVSZ3/AVSZ4:
	OTZ is MAC0 >> 12
	OTZ overflow/underflow flag is set in an overflow condition even if MAC0 == 0.
	sf field bit has no effect?

 FLAG register:
	Bits present mask: 0xfffff000

	Checksum bit can't be directly set, it's apparently calculated like (bool)(FLAGS & 0x7f87e000)

	Instructions effectively clear it to 0 at start. (todo: test "invalid" instructions)

 X/Y FIFO [3] register write pushes a copy down to [2]

*/

/* One 3x3 matrix of signed 16-bit elements followed by one padding halfword,
 * i.e. 10 halfwords per matrix. This matches the [10]-halfword / [5]-word
 * row stride used by the Raw16 and Raw views of Matrices_t.
 * The struct is packed on non-MSVC compilers. */
typedef struct
{
   int16_t MX[3][3];   /* Matrix elements, indexed [row][column]. */
   int16_t dummy;      /* Padding halfword after the nine elements. */
}
#ifndef _MSC_VER
__attribute__((__packed__))
#endif
gtematrix;

/* A 4-byte color value, accessible either as named components or as a raw
 * byte array (Raw8 is the view used when serializing save states). */
typedef struct
{
   union
   {
      struct
      {
         uint8_t R;
         uint8_t G;
         uint8_t B;
         uint8_t CD;   /* Fourth byte: carried along with the color. */
      };
      uint8_t Raw8[4];
   };
} gtergb;

/* One XY FIFO entry: a pair of signed 16-bit coordinates. */
typedef struct
{
   int16_t X;
   int16_t Y;
} gtexy;

/* Raw (masked) control register values as stored by GTE_WriteCR().
 * GTE_ReadCR() serves registers 0-23 and 31 from this array. */
static uint32_t CR[32];

/* Four matrices viewed three ways: as an array of gtematrix (All), as raw
 * 32-bit words (Raw, used by GTE_WriteCR) and as raw halfwords (Raw16, used
 * for save states). The anonymous struct names the four array slots. */
typedef union
{
 gtematrix All[4];
 int32_t Raw[4][5];	// Don't read from this(Raw[][]), only write(and when writing, if running on a big-endian platform, swap the upper 16-bits with the lower 16-bits)
 int16_t Raw16[4][10];

 struct
 {
  gtematrix Rot;          /* All[0] */
  gtematrix Light;        /* All[1] */
  gtematrix Color;        /* All[2] */
  gtematrix AbbyNormal;   /* All[3]: special-cased in MultiplyMatrixByVector() */
 };
} Matrices_t;


/* Control-register vectors, addressable either by index (All) or by name.
 * GTE_WriteCR() stores control registers 5-7, 13-15 and 21-23 into
 * All[0..2][0..2]; the Null vector is never written by the visible code. */
static union
{
 int32_t All[4][4];	// Really only [4][3], but [4] to ease address calculation.
  
 struct
 {
  int32_t T[4];      /* All[0] */
  int32_t B[4];      /* All[1] */
  int32_t FC[4];     /* All[2] */
  int32_t Null[4];   /* All[3] */
 };
} CRVectors;

/* Control registers (cached copies of CR[24..30], kept in sync by GTE_WriteCR) */
static int32_t OFX;     /* Screen offset X: signed 16.16 */
static int32_t OFY;     /* Screen offset Y: signed 16.16 */
static uint16_t H;      /* Projection plane distance (read back sign-extended by GTE_ReadCR) */
static int16_t DQA;     /* Depth cueing coefficient: signed 8.8 */
static int32_t DQB;     /* Depth cueing offset: signed 8.24 */
 
static int16_t ZSF3;    /* Scale factor when computing the average of 3 Z values
                         * (triangle): signed 4.12 */
static int16_t ZSF4;    /* Scale factor when computing the average of 4 Z values
                         * (quad): signed 4.12 */

static Matrices_t Matrices;   /* Four 3x3 matrices: rotation, light, color and the
                                 special-cased fourth slot (AbbyNormal) */

// Begin DR (data registers, accessed through GTE_ReadDR/GTE_WriteDR)
static int16_t Vectors[3][4]; /* Three input vectors of signed halfwords (data registers 0-5):
                                 [n][0..2] hold X, Y, Z. The fourth element is not accessed
                                 by the visible code. */

static uint32_t FLAGS;	      /* Overflow flags generated by the GTE commands */

static int32_t MAC[4];        /* Accumulators for intermediate results, 4 x signed word */

static uint16_t OTZ;          /* Z average value */

static gtergb RGB;            /* RGB color. High byte is passed around but not used in
                                 computations, it often contains a GPU GP0 command byte. */

static int16_t IR[4];         /* Accumulators for intermediate results, 4 x signed halfwords */

static gtexy XY_FIFO[4];      /* XY FIFO : 4 x 2 x signed half words */

static uint16_t Z_FIFO[4];    /* Z FIFO : 4 x unsigned half words */

static gtergb RGB_FIFO[3];    /* RGB color FIFO */

static uint32_t LZCS;         /* Input value used to compute the 'LZCR' value below */

static uint32_t LZCR;         /* Leading-bit count of LZCS as computed in GTE_WriteDR (reg 30):
                                 leading zeros if LZCS[31] is 0, leading ones if LZCS[31] is 1 */

static uint32_t Reg23;        /* Register 23: 32bit read/write but not used for anything */

/* Shorthand names for the four IR accumulators. */
#define IR0 IR[0]
#define IR1 IR[1]
#define IR2 IR[2]
#define IR3 IR[3]


// end DR

/* Widescreen hack settings, defined elsewhere with C linkage.
 * widescreen_hack: non-zero enables horizontal scaling in TransformXY().
 * widescreen_hack_aspect_ratio_setting: index selecting the scale factor there. */
extern "C" unsigned char widescreen_hack;
extern "C" unsigned char widescreen_hack_aspect_ratio_setting;

/* Saturate a signed value to the unsigned 5-bit range [0, 0x1F]. */
static INLINE uint8_t Sat5(int16_t cc)
{
   if(cc > 0x1F)
      return 0x1F;

   return (cc < 0) ? 0 : cc;
}

/*
 * Newton-Raphson division table.  (Initialized at startup by GTE_Init();
 * do NOT save in save states!)
 */
static uint8_t DivTable[0x100 + 1];

/* Compute the reciprocal estimate used by Divide().
 *
 * The low 15 bits of `divisor` (rounded to the nearest multiple of 0x80)
 * select a seed from DivTable; the seed is then refined with two
 * multiply / round / shift steps.
 */
static INLINE int32_t CalcRecip(uint16 divisor)
{
   const int32_t seed       = 0x101 + DivTable[((divisor & 0x7FFF) + 0x40) >> 7];
   const int32_t correction = (((int32_t)divisor * -seed) + 0x80) >> 8;

   return ((seed * (131072 + correction)) + 0x80) >> 8;
}

/* Build the reciprocal seed table consumed by CalcRecip().
 *
 * One entry is generated for every divisor in 0x8000..0xFF80 (step 0x80),
 * i.e. for (divisor >> 7) in 0x100..0x1FF, using four refinement iterations
 * starting from 512. */
void GTE_Init(void)
{
   unsigned entry;

   for(entry = 0; entry < 0x100; entry++)
   {
      const uint32_t scaled_divisor = 0x100 + entry;   /* == divisor >> 7 */
      uint32_t       estimate       = 512;
      unsigned       pass;

      for(pass = 0; pass < 4; pass++)
         estimate = (estimate * (1024 * 512 - (scaled_divisor * estimate))) >> 18;

      DivTable[entry] = ((estimate + 1) >> 1) - 0x101;
   }

   /* Extra trailing entry so the lookup never needs a bounds check. */
   DivTable[0x100] = DivTable[0xFF];
}


void GTE_Power(void)
{
   memset(CR, 0, sizeof(CR));
   //memset(DR, 0, sizeof(DR));

   memset(Matrices.All, 0, sizeof(Matrices.All));
   memset(CRVectors.All, 0, sizeof(CRVectors.All));
   OFX = 0;
   OFY = 0;
   H = 0;
   DQA = 0;
   DQB = 0;
   ZSF3 = 0;
   ZSF4 = 0;


   memset(Vectors, 0, sizeof(Vectors));
   memset(&RGB, 0, sizeof(RGB));
   OTZ = 0;
   IR0 = 0;
   IR1 = 0;
   IR2 = 0;
   IR3 = 0;

   memset(XY_FIFO, 0, sizeof(XY_FIFO));
   memset(Z_FIFO, 0, sizeof(Z_FIFO));
   memset(RGB_FIFO, 0, sizeof(RGB_FIFO));
   memset(MAC, 0, sizeof(MAC));
   LZCS = 0;
   LZCR = 0;

   Reg23 = 0;
}

// TODO: Don't save redundant state, regarding CR cache variables
/* Save or load the GTE register state through MDFNSS_StateAction().
 *
 * sm        - save-state memory stream, forwarded unchanged.
 * load      - forwarded unchanged (non-zero presumably means "load" - confirm
 *             against MDFNSS_StateAction).
 * data_only - forwarded unchanged.
 *
 * Returns the result of MDFNSS_StateAction() for the "GTE" section.
 * DivTable is deliberately absent from the table: it is rebuilt by GTE_Init().
 */
int GTE_StateAction(StateMem *sm, int load, int data_only)
{
   SFORMAT StateRegs[] =
   {
      /* Raw control registers and the flag accumulator. */
      { CR, (uint32_t)(32 * sizeof(uint32_t)), MDFNSTATE_RLSB32 | 0, "CR" },
      { &FLAGS, sizeof(FLAGS), MDFNSTATE_RLSB | 0, "FLAGS" },
      SFARRAY16(&Matrices.Raw16[0][0], 4 * 10),

      SFARRAY32(&CRVectors.All[0][0], 4 * 4),

      /* Cached copies of control registers 24-30. */
      SFVARN(OFX, "OFX"),
      SFVARN(OFY, "OFY"),
      SFVARN(H, "H"),
      SFVARN(DQA, "DQA"),
      SFVARN(DQB, "DQB"),

      SFVARN(ZSF3, "ZSF3"),
      SFVARN(ZSF4, "ZSF4"),
      /* Data registers. */
      SFARRAY16(&Vectors[0][0], 3 * 4),

      SFARRAY(RGB.Raw8, 4),
      SFVARN(OTZ, "OTZ"),
      SFARRAY16(IR, 4),

      SFVAR(XY_FIFO[0].X),
      SFVAR(XY_FIFO[0].Y),
      SFVAR(XY_FIFO[1].X),
      SFVAR(XY_FIFO[1].Y),
      SFVAR(XY_FIFO[2].X),
      SFVAR(XY_FIFO[2].Y),
      SFVAR(XY_FIFO[3].X),
      SFVAR(XY_FIFO[3].Y),

      SFARRAY16(Z_FIFO, 4),

      SFARRAY(RGB_FIFO[0].Raw8, 4),
      SFARRAY(RGB_FIFO[1].Raw8, 4),
      SFARRAY(RGB_FIFO[2].Raw8, 4),

      SFARRAY32(MAC, 4),

      SFVARN(LZCS, "LZCS"),
      SFVARN(LZCR, "LZCR"),
      SFVARN(Reg23, "Reg23"),

      SFEND
   };
   int ret = MDFNSS_StateAction(sm, load, data_only, StateRegs, "GTE");

   /* Placeholder for post-load fix-ups; currently nothing is needed. */
#if 0
   if(load)
   {

   }
#endif

   return(ret);
}


/* Write a GTE control register.
 *
 * which - control register index; only the low 5 bits are used, so an
 *         out-of-range index can no longer read or write past CR[] and
 *         mask_table[] (the DR accessors mask the same way).
 * value - value to write; bits outside the register's writable mask are
 *         dropped and the previously stored bits are kept in their place.
 *
 * Besides updating CR[], the write is mirrored into the decoded copies:
 * registers 0-23 go to Matrices / CRVectors, registers 24-30 to the named
 * variables, and register 31 has its top bit recomputed from the error bits.
 */
void GTE_WriteCR(unsigned int which, uint32_t value)
{
   static const uint32_t mask_table[32] = {
      /* 0x00 */
      0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0000FFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,

      /* 0x08 */
      0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0000FFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,

      /* 0x10 */
      0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0000FFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,

      /* 0x18 */
      0xFFFFFFFF, 0xFFFFFFFF, 0x0000FFFF, 0x0000FFFF, 0xFFFFFFFF, 0x0000FFFF, 0x0000FFFF, 0xFFFFFFFF
   };

   //PSX_WARNING("[GTE] Write CR %d, 0x%08x", which, value);

   /* Keep the index inside the 32-entry tables. */
   which &= 0x1F;

   value &= mask_table[which];

   CR[which] = value | (CR[which] & ~mask_table[which]);

   if(which < 24)
   {
      /* Registers come in three groups of eight: slots 0-4 hold packed
       * matrix halfwords, slots 5-7 hold the group's 32-bit vector. */
      const unsigned group = which >> 3;
      const unsigned slot  = which & 0x7;

      if(slot >= 5)
         CRVectors.All[group][slot - 5] = value;
      else
      {
#ifdef MSB_FIRST
         /* Swap halfwords so the int16 view of the matrix stays correct. */
         Matrices.Raw[group][slot] = (value << 16) | (value >> 16);
#else
         Matrices.Raw[group][slot] = value;
#endif
      }
      return;
   }

   switch(which)
   {
      case 24:
         OFX = value;
         break;

      case 25:
         OFY = value;
         break;

      case 26:
         H = value;
         break;

      case 27:
         DQA = value;
         break;

      case 28:
         DQB = value;
         break;

      case 29:
         ZSF3 = value;
         break;

      case 30:
         ZSF4 = value;
         break;

      case 31:
         /* Bit 31 cannot be set directly: it is derived from the error bits.
          * An unsigned constant avoids shifting 1 into the sign bit of int. */
         CR[31] = (value & 0x7ffff000) | ((value & 0x7f87e000) ? 0x80000000U : 0);
         break;
   }
}

uint32_t GTE_ReadCR(unsigned int which)
{
   uint32_t ret = 0;

   switch(which)
   {
      default:
         ret = CR[which];
         if(which == 4 || which == 12 || which == 20)
            ret = (int16)ret;
         break;

      case 24:
         ret = OFX;
         break;

      case 25:
         ret = OFY;
         break;

      case 26:
         ret = (int16)H;
         break;

      case 27:
         ret = (int16)DQA;
         break;

      case 28:
         ret = DQB;
         break;

      case 29:
         ret = (int16)ZSF3;
         break;

      case 30:
         ret = (int16)ZSF4;
         break;

      case 31:
         ret = CR[31];
         break;
   }

   return(ret);
}

void GTE_WriteDR(unsigned int which, uint32_t value)
{
   switch(which & 0x1F)
   {
      case 0:
         Vectors[0][0] = value;
         Vectors[0][1] = value >> 16;
         break;

      case 1:
         Vectors[0][2] = value;
         break;

      case 2:
         Vectors[1][0] = value;
         Vectors[1][1] = value >> 16;
         break;

      case 3:
         Vectors[1][2] = value;
         break;

      case 4:
         Vectors[2][0] = value;
         Vectors[2][1] = value >> 16;
         break;

      case 5:
         Vectors[2][2] = value;
         break;

      case 6:
         RGB.R = value >> 0;
         RGB.G = value >> 8;
         RGB.B = value >> 16;
         RGB.CD = value >> 24;
         break;

      case 7:
         OTZ = value;
         break;

      case 8:
         IR0 = value;
         break;

      case 9:
         IR1 = value;
         break;

      case 10:
         IR2 = value;
         break;

      case 11:
         IR3 = value;
         break;

      case 12:
         XY_FIFO[0].X = value;
         XY_FIFO[0].Y = value >> 16;
         break;

      case 13:
         XY_FIFO[1].X = value;
         XY_FIFO[1].Y = value >> 16;
         break;

      case 14:
         XY_FIFO[2].X = value;
         XY_FIFO[2].Y = value >> 16;
         XY_FIFO[3].X = value;
         XY_FIFO[3].Y = value >> 16;
         break;

      case 15:
         XY_FIFO[3].X = value;
         XY_FIFO[3].Y = value >> 16;

         XY_FIFO[0] = XY_FIFO[1];
         XY_FIFO[1] = XY_FIFO[2];
         XY_FIFO[2] = XY_FIFO[3];
         break;

      case 16:
         Z_FIFO[0] = value;
         break;

      case 17:
         Z_FIFO[1] = value;
         break;

      case 18:
         Z_FIFO[2] = value;
         break;

      case 19:
         Z_FIFO[3] = value;
         break;

      case 20:
         RGB_FIFO[0].R = value;
         RGB_FIFO[0].G = value >> 8;
         RGB_FIFO[0].B = value >> 16;
         RGB_FIFO[0].CD = value >> 24;
         break;

      case 21:
         RGB_FIFO[1].R = value;
         RGB_FIFO[1].G = value >> 8;
         RGB_FIFO[1].B = value >> 16;
         RGB_FIFO[1].CD = value >> 24;
         break;

      case 22:
         RGB_FIFO[2].R = value;
         RGB_FIFO[2].G = value >> 8;
         RGB_FIFO[2].B = value >> 16;
         RGB_FIFO[2].CD = value >> 24;
         break;

      case 23:
         Reg23 = value;
         break;

      case 24:
         MAC[0] = value;
         break;

      case 25:
         MAC[1] = value;
         break;

      case 26:
         MAC[2] = value;
         break;

      case 27:
         MAC[3] = value;
         break;

      case 28:
         IR1 = ((value >> 0) & 0x1F) << 7;
         IR2 = ((value >> 5) & 0x1F) << 7;
         IR3 = ((value >> 10) & 0x1F) << 7;
         break;

      case 29:	// Read-only
         break;

      case 30:

         LZCS = value;
		 LZCR = MDFN_lzcount32(value ^ ((int32)value >> 31));
         break;

      case 31:	// Read-only
         break;
   }
}

uint32_t GTE_ReadDR(unsigned int which)
{
   uint32_t ret = 0;

   switch(which & 0x1F)
   {
      case 0:
         ret = (uint16_t)Vectors[0][0] | ((uint16_t)Vectors[0][1] << 16);
         break;

      case 1:
         ret = (int16_t)Vectors[0][2];
         break;

      case 2:
         ret = (uint16_t)Vectors[1][0] | ((uint16_t)Vectors[1][1] << 16);
         break;

      case 3:
         ret = (int16_t)Vectors[1][2];
         break;

      case 4:
         ret = (uint16_t)Vectors[2][0] | ((uint16_t)Vectors[2][1] << 16);
         break;

      case 5:
         ret = (int16_t)Vectors[2][2];
         break;

      case 6:
         ret = RGB.R | (RGB.G << 8) | (RGB.B << 16) | (RGB.CD << 24);
         break;

      case 7:
         ret = (uint16_t)OTZ;
         break;

      case 8:
         ret = (int16_t)IR0;
         break;

      case 9:
         ret = (int16_t)IR1;
         break;

      case 10:
         ret = (int16_t)IR2;
         break;

      case 11:
         ret = (int16_t)IR3;
         break;

      case 12:
         ret = (uint16_t)XY_FIFO[0].X | ((uint16_t)XY_FIFO[0].Y << 16);
         break;

      case 13:
         ret = (uint16_t)XY_FIFO[1].X | ((uint16_t)XY_FIFO[1].Y << 16);
         break;

      case 14:
         ret = (uint16_t)XY_FIFO[2].X | ((uint16_t)XY_FIFO[2].Y << 16);
         break;

      case 15:
         ret = (uint16_t)XY_FIFO[3].X | ((uint16_t)XY_FIFO[3].Y << 16);
         break;

      case 16:
         ret = (uint16_t)Z_FIFO[0];
         break;

      case 17:
         ret = (uint16_t)Z_FIFO[1];
         break;

      case 18:
         ret = (uint16_t)Z_FIFO[2];
         break;

      case 19:
         ret = (uint16_t)Z_FIFO[3];
         break;

      case 20:
         ret = RGB_FIFO[0].R | (RGB_FIFO[0].G << 8) | (RGB_FIFO[0].B << 16) | (RGB_FIFO[0].CD << 24);
         break;

      case 21:
         ret = RGB_FIFO[1].R | (RGB_FIFO[1].G << 8) | (RGB_FIFO[1].B << 16) | (RGB_FIFO[1].CD << 24);
         break;

      case 22:
         ret = RGB_FIFO[2].R | (RGB_FIFO[2].G << 8) | (RGB_FIFO[2].B << 16) | (RGB_FIFO[2].CD << 24);
         break;

      case 23:
         ret = Reg23;
         break;

      case 24:
         ret = MAC[0];
         break;

      case 25:
         ret = MAC[1];
         break;

      case 26:
         ret = MAC[2];
         break;

      case 27:
         ret = MAC[3];
         break;

      case 28:
      case 29:
         ret = Sat5(IR1 >> 7) | (Sat5(IR2 >> 7) << 5) | (Sat5(IR3 >> 7) << 10);
         break;

      case 30:
         ret = LZCS;
         break;

      case 31:
         ret = LZCR;
         break;
   }
   return(ret);
}

/* Sign-extend the low `_bits` bits of `_value` to a full int64_t.
 * Both arguments are parenthesized so that an expression such as `40 + 4`
 * can be passed as the bit count. */
#define sign_x_to_s64(_bits, _value) (((int64_t)((uint64_t)(_value) << (64 - (_bits)))) >> (64 - (_bits)))

/* Check a MAC1..MAC3 accumulator value against the signed 44-bit range and
 * wrap it to 44 bits.
 *
 * which - accumulator index 0..2; selects flag bit (30 - which) for positive
 *         overflow and (27 - which) for negative overflow.
 * value - accumulator value to check.
 *
 * Returns `value` sign-extended from its low 44 bits.
 */
static INLINE int64_t A_MV(unsigned which, int64_t value)
{
   const int64_t limit = INT64_C(1) << 43;

   if(value < -limit)
      FLAGS |= 1 << (27 - which);
   else if(value >= limit)
      FLAGS |= 1 << (30 - which);

   return sign_x_to_s64(44, value);
}

/* Record whether `value` fits in a signed 32-bit word: flag bit 16 is set
 * when it is too large, bit 15 when it is too small. The value itself is
 * returned unchanged. */
static INLINE int64_t F(int64_t value)
{
   if(value > 2147483647LL)
      FLAGS |= 1 << 16;
   else if(value < -2147483648LL)
      FLAGS |= 1 << 15;

   return value;
}


/* Truncate an int64 value to a signed 44-bit quantity (43 bits + sign) and
 * update the overflow flags.
 *
 * which - accumulator index 0..2; selects flag bit (30 - which) for positive
 *         overflow and (27 - which) for negative overflow.
 * value - value to truncate.
 *
 * The representable range is [-0x80000000000, 0x7ffffffffff]. The positive
 * check uses `>`: the previous `>=` flagged 0x7ffffffffff itself, which fits
 * in 44 bits and is not flagged by A_MV() either.
 *
 * Returns `value` sign-extended from its low 44 bits.
 */
static INLINE int64_t i64_to_i44(unsigned which, int64_t value)
{
   if(value > 0x7ffffffffffLL)
      FLAGS |= 1 << (30 - which);

   if(value < -0x80000000000LL)
      FLAGS |= 1 << (27 - which);

   return (((int64_t)((uint64_t)(value) << (64 - 44))) >> (64 - 44));
}


/* Saturate an int32 value to the IR range and flag any clamping.
 *
 * which - IR index 0..2; selects flag bit (24 - which).
 * value - value to saturate.
 * lm    - 0: lower bound is -32768; 1: lower bound is 0 (negative values
 *         clamp to zero). The upper bound is always 32767.
 */
static INLINE int16_t i32_to_i16_saturate(unsigned int which, int32_t value, int lm)
{
   const int32_t floor_value = -32768 + (lm << 15);
   const int32_t ceil_value  = 32767;

   if(value >= floor_value && value <= ceil_value)
      return value;

   FLAGS |= 1 << (24 - which);

   return (value < floor_value) ? floor_value : ceil_value;
}

/* IR saturation variant used by MultiplyMatrixByVector_PT() for IR3.
 *
 * The overflow flag (bit 24 - which) is decided from `ftv_value` against the
 * full signed 16-bit range, while the returned value is `value` clamped via
 * clamp() to [-32768 + (lm << 15), 32767].
 */
static INLINE int16_t Lm_B_PTZ(unsigned int which, int32_t value, int32_t ftv_value, int lm)
{
   const int32_t lower = -32768 + (lm << 15);

   if(ftv_value < -32768 || ftv_value > 32767)
      FLAGS |= 1 << (24 - which);

   clamp(&value, lower, 32767);

   return value;
}

/* Saturate a color component to [0, 255], setting flag bit (21 - which)
 * whenever the input was out of range. */
static INLINE uint8_t Lm_C(unsigned int which, int32_t value)
{
   if(value < 0)
   {
      FLAGS |= 1 << (21 - which);	// Tested with GPF
      return 0;
   }

   if(value > 255)
   {
      FLAGS |= 1 << (21 - which);	// Tested with GPF
      return 255;
   }

   return value;
}

/* Saturate a value to the unsigned 16-bit range [0, 65535], setting flag
 * bit 18 when clamping happens.
 *
 * unchained - when zero, a pending MAC0 overflow flag (bit 15 or 16) takes
 *             priority: the result is forced to 0 (bit 15) or 0xFFFF (bit 16)
 *             and bit 18 is set regardless of `value`.
 */
static INLINE int32_t Lm_D(int32_t value, int unchained)
{
   // Not sure if we should have it as int64, or just chain on to and special case when the F flags are set.
   if(!unchained && (FLAGS & ((1 << 15) | (1 << 16))))
   {
      const int32_t forced = (FLAGS & (1 << 15)) ? 0 : 0xFFFF;

      FLAGS |= 1 << 18;
      return forced;
   }

   if(value >= 0 && value <= 65535)
      return value;

   FLAGS |= 1 << 18;	// Tested with AVSZ3

   return (value < 0) ? 0 : 65535;
}

/* Saturate a screen coordinate to [-1024, 1023], setting flag bit
 * (14 - which) when clamping happens. */
static INLINE int32_t Lm_G(unsigned int which, int32_t value)
{
   if(value >= -1024 && value <= 1023)
      return value;

   FLAGS |= 1 << (14 - which);

   return (value < -1024) ? -1024 : 1023;
}

/* Saturate a value to [0, 4096] (the upper limit is 4096, not 4095),
 * setting flag bit 12 when clamping happens.
 *
 * Unlike Lm_D(), this does not consult the MAC0 overflow flags (bits 15/16);
 * a variant that did so was present here but compiled out. */
static INLINE int32_t Lm_H(int32_t value)
{
   if(value >= 0 && value <= 4096)
      return value;

   FLAGS |= 1 << 12;

   return (value < 0) ? 0 : 4096;
}

/* Convert a 64-bit signed average into an unsigned halfword (average >> 12,
 * saturated to [0, 65535]) while updating overflow flag bit 18.
 *
 * unchained - when zero, a pending MAC0 overflow flag (bit 15 or 16) takes
 *             priority: the result is forced to 0 (bit 15) or 0xFFFF (bit 16)
 *             and bit 18 is set regardless of the average.
 */
static INLINE uint16_t i64_to_otz(int64_t average, int unchained)
{
   const int32_t scaled = average >> 12;

   /* Not sure if we should have it as int64, or just chain
    * on to and special case when the F flags are set. */
   if(!unchained && (FLAGS & ((1 << 15) | (1 << 16))))
   {
      const uint16_t forced = (FLAGS & (1 << 15)) ? 0 : 0xFFFF;

      FLAGS |= 1 << 18;
      return forced;
   }

   if(scaled >= 0 && scaled <= 65535)
      return scaled;

   FLAGS |= 1 << 18;	// Tested with AVSZ3

   return (scaled < 0) ? 0 : 65535;
}

/* Saturate a value to the signed 11-bit range [-0x400, 0x3ff], setting flag
 * bit (14 - flag) when clamping happens. */
static INLINE int32_t i32_to_i11_saturate(uint8_t flag, int32_t value)
{
   if(value >= -0x400 && value <= 0x3ff)
      return value;

   FLAGS |= 1 << (14 - flag);

   return (value < -0x400) ? -0x400 : 0x3ff;
}

/* Convert a MAC accumulator into an 8-bit color component: the value is
 * shifted right by 4 and saturated to [0, 0xff]. Flag bit (21 - flag) is set
 * when saturation happens. */
static INLINE uint8_t MAC_to_COLOR(uint8_t flag, int32_t mac)
{
   const int32_t component = mac >> 4;

   if(component >= 0 && component <= 0xff)
      return component;

   FLAGS |= 1 << (21 - flag);	/* Tested with GPF */

   return (component < 0) ? 0 : 0xff;
}

/* Push a new color onto the RGB FIFO: MAC1..MAC3 are shifted right by 4 and
 * saturated through Lm_C(), and the CD byte is copied from RGB. The oldest
 * entry (RGB_FIFO[0]) is discarded. */
static INLINE void MAC_to_RGB_FIFO(void)
{
   gtergb pushed;

   pushed.R  = Lm_C(0, MAC[1] >> 4);
   pushed.G  = Lm_C(1, MAC[2] >> 4);
   pushed.B  = Lm_C(2, MAC[3] >> 4);
   pushed.CD = RGB.CD;

   RGB_FIFO[0] = RGB_FIFO[1];
   RGB_FIFO[1] = RGB_FIFO[2];
   RGB_FIFO[2] = pushed;
}

/* Saturate a value to the IR range and flag any clamping.
 *
 * which - IR index 0..2; selects flag bit (24 - which).
 * value - value to saturate.
 * lm    - 0: lower bound is -32768; 1: lower bound is 0. The upper bound is
 *         always 32767.
 */
static INLINE int16_t Lm_B(unsigned int which, int32_t value, int lm)
{
   const int32_t lower = -32768 + (lm << 15);

   if(value < lower)
   {
      FLAGS |= 1 << (24 - which);
      return lower;
   }

   if(value > 32767)
   {
      FLAGS |= 1 << (24 - which);
      return 32767;
   }

   return value;
}

/* Copy MAC1..MAC3 into IR1..IR3, saturating each through
 * i32_to_i16_saturate() with the given `lm` setting. */
static INLINE void MAC_to_IR(int lm)
{
   unsigned n;

   for(n = 0; n < 3; n++)
      IR[1 + n] = i32_to_i16_saturate(n, MAC[1 + n], lm);
}

/* Compute MAC[1..3] = (crv << 12 + matrix * v) >> sf, then saturate the
 * result into IR1..IR3 via MAC_to_IR(lm).
 *
 * matrix - matrix to multiply by; &Matrices.AbbyNormal is special-cased.
 * v      - three-element input vector.
 * crv    - three-element vector added (pre-shifted left by 12) before the
 *          products; CRVectors.FC is special-cased.
 * sf     - right shift applied to each accumulated sum (0 or 12 in the
 *          visible callers).
 * lm     - passed through to MAC_to_IR().
 *
 * Every partial sum is run through A_MV(), so the 44-bit overflow flags are
 * updated after each addition, in order.
 */
static INLINE void MultiplyMatrixByVector(const gtematrix *matrix, const int16_t *v, const int32_t *crv, uint32_t sf, int lm)
{
   unsigned i;

   for(i = 0; i < 3; i++)
   {
      int64_t tmp;
      int32_t mulr[3];

      /* Shift through uint64_t so a negative crv[i] is not left-shifted as
       * a signed value. */
      tmp = (uint64_t)(int64_t)crv[i] << 12;

      if(matrix == &Matrices.AbbyNormal)
      {
         /* The fourth matrix slot is not used as stored: row 0 is built
          * from RGB.R and IR0, rows 1 and 2 from the low halfword of
          * CR[1] / CR[2] repeated across all three columns. */
         if(i == 0)
         {
            mulr[0] = -(RGB.R << 4);
            mulr[1] = (RGB.R << 4);
            mulr[2] = IR0;
         }
         else
         {
            mulr[0] = (int16_t)CR[i];
            mulr[1] = (int16_t)CR[i];
            mulr[2] = (int16_t)CR[i];
         }
      }
      else
      {
         mulr[0] = matrix->MX[i][0];
         mulr[1] = matrix->MX[i][1];
         mulr[2] = matrix->MX[i][2];
      }
      /* Turn the row coefficients into the three products. */
      mulr[0] *= v[0];
      mulr[1] *= v[1];
      mulr[2] *= v[2];

      tmp = A_MV(i, tmp + mulr[0]);
      if(crv == CRVectors.FC)
      {
         /* With the FC vector, the first partial sum is only used to update
          * the IR saturation flag (the Lm_B result is discarded) and the
          * accumulator restarts from zero. */
         Lm_B(i, tmp >> sf, false);
         tmp = 0;
      }

      tmp = A_MV(i, tmp + mulr[1]);
      tmp = A_MV(i, tmp + mulr[2]);

      MAC[1 + i] = tmp >> sf;
   }

   MAC_to_IR(lm);
}

/* Perspective-transform variant of the matrix * vector multiply.
 *
 * Computes MAC[1..3] = (crv << 12 + matrix * v) >> sf with 44-bit overflow
 * checks after every addition, then:
 *   - IR1 / IR2 are saturated with Lm_B();
 *   - IR3 is saturated with Lm_B_PTZ(), whose flag is decided from the Z
 *     accumulator shifted right by 12 regardless of `sf`;
 *   - that same (accumulator >> 12) value, saturated by Lm_D(), is pushed
 *     onto the Z FIFO.
 */
static INLINE void MultiplyMatrixByVector_PT(const gtematrix *matrix, const int16_t *v, const int32_t *crv, uint32_t sf, int lm)
{
   int64_t  z_sum = 0;
   unsigned row;

   for(row = 0; row < 3; row++)
   {
      const int32_t prod0 = matrix->MX[row][0] * v[0];
      const int32_t prod1 = matrix->MX[row][1] * v[1];
      const int32_t prod2 = matrix->MX[row][2] * v[2];
      int64_t       sum   = (uint64_t)(int64_t)crv[row] << 12;

      sum = A_MV(row, sum + prod0);
      sum = A_MV(row, sum + prod1);
      sum = A_MV(row, sum + prod2);

      MAC[1 + row] = sum >> sf;

      if(row == 2)
         z_sum = sum;
   }

   IR1 = Lm_B(0, MAC[1], lm);
   IR2 = Lm_B(1, MAC[2], lm);
   //printf("FTV: %08x %08x\n", crv[2], (uint32)(z_sum >> 12));
   IR3 = Lm_B_PTZ(2, MAC[3], z_sum >> 12, lm);

   /* Push the new depth value onto the Z FIFO. */
   Z_FIFO[0] = Z_FIFO[1];
   Z_FIFO[1] = Z_FIFO[2];
   Z_FIFO[2] = Z_FIFO[3];
   Z_FIFO[3] = Lm_D(z_sum >> 12, true);
}

/* Decode the operand fields of a GTE instruction word.
 *
 * Expects a variable named `instr` in scope and declares these locals:
 *   sf  - 12 if instruction bit 19 is set, otherwise 0
 *   mx  - bits 17-18: matrix index
 *   v_i - bits 15-16: input vector selector
 *   cv  - pointer to CRVectors.All[bits 13-14]
 *   lm  - bit 10
 *   v   - the selected input vector: Vectors[v_i] for v_i 0..2, or
 *         (IR1, IR2, IR3) when v_i == 3
 */
#define DECODE_FIELDS							\
 const uint32 sf MDFN_NOWARN_UNUSED = (instr & (1 << 19)) ? 12 : 0;		\
 const uint32 mx MDFN_NOWARN_UNUSED = (instr >> 17) & 0x3;			\
 const uint32 v_i = (instr >> 15) & 0x3;				\
 const int32* cv MDFN_NOWARN_UNUSED = CRVectors.All[(instr >> 13) & 0x3];	\
 const int lm MDFN_NOWARN_UNUSED = (instr >> 10) & 1;			\
 int16 v[3] MDFN_NOWARN_UNUSED;					\
 if(v_i == 3)							\
 {								\
  v[0] = IR1;							\
  v[1] = IR2;							\
  v[2] = IR3;							\
 }								\
 else								\
 {								\
  v[0] = Vectors[v_i][0];					\
  v[1] = Vectors[v_i][1];					\
  v[2] = Vectors[v_i][2];					\
 }

/* SQR - Square Vector */
static int32_t SQR(uint32_t instr)
{
   DECODE_FIELDS;

   /* PSX GTE test fails with this code */
   unsigned i;

   for (i = 1; i < 4; i++)
   {
      int32_t ir = IR[i];
      MAC[i]     = (ir * ir) >> sf;
   }

   MAC_to_IR(lm);

   return(5);
}

/* MVMVA - Multiply Vector by Matrix And Vector Add */
static int32_t MVMVA(uint32_t instr)
{
   DECODE_FIELDS;

   MultiplyMatrixByVector(&Matrices.All[mx], v, cv, sf, lm);

   return(8);
}

/* GTE-specific division (used for dist / z).
 *
 * Returns a value saturated to 17 bits (0x1FFFF). When `divisor * 2` is not
 * greater than `dividend` the result is clipped to 0x1FFFF and flag bit 17
 * is set; this also covers a zero divisor.
 */
static INLINE uint32_t Divide(uint32_t dividend, uint32_t divisor)
{
   unsigned normalize;

   /* If the Z coordinate is smaller than or equal to half the
    * projection plane distance, we clip it */
   if((divisor * 2) <= dividend)
   {
      FLAGS |= 1 << 17;
      return 0x1FFFF;
   }

   /* Normalize so the divisor's top bit (of 16) is set, then multiply by
    * its table-based reciprocal. */
   normalize = compat_clz_u16(divisor);

   dividend <<= normalize;
   divisor  <<= normalize;

   return std::min<uint32>(0x1FFFF, ((uint64_t)dividend * CalcRecip(divisor | 0x8000) + 32768) >> 16);
}

/* Set flag bit 16 if `value` is above the signed 32-bit range, or bit 15 if
 * it is below it. */
static INLINE void check_mac_overflow(int64_t value)
{
   if(value > 2147483647LL)
      FLAGS |= 1 << 16;
   else if(value < -2147483648LL)
      FLAGS |= 1 << 15;
}

/* Project IR1/IR2 onto the screen plane and push the result onto the XY FIFO.
 *
 * h_div_sz         - fixed-point projection factor from Divide().
 * precise_h_div_sz - floating-point projection factor for PGXP.
 * precise_z        - floating-point depth for PGXP.
 *
 * X = (OFX + IR1 * h_div_sz [* widescreen scale]) >> 16 and
 * Y = (OFY + IR2 * h_div_sz) >> 16, each saturated with Lm_G(). MAC[0] is
 * left holding the unsaturated Y value. A sub-pixel copy of the coordinates,
 * clamped to [-0x400, 0x3ff], is handed to PGXP_pushSXYZ2f() together with
 * the raw 32-bit contents of the new FIFO entry.
 *
 * Changes from the previous version: an out-of-range
 * widescreen_hack_aspect_ratio_setting falls back to the 16:9 factor instead
 * of reading an uninitialized float, the FIFO entry is copied with memcpy()
 * rather than through a type-punned pointer, and two unused locals are gone.
 */
static INLINE void TransformXY(int64_t h_div_sz, float precise_h_div_sz, float precise_z)
{
   float widescreen_hack_aspect_ratio;

   switch(widescreen_hack_aspect_ratio_setting)
   {
      case 0: // 16:10
         widescreen_hack_aspect_ratio = 0.80f;
         break;
      case 2: // 18:9 (smartphone)
         widescreen_hack_aspect_ratio = 0.66f;
         break;
      case 3: // 19:9 (smartphone)
         widescreen_hack_aspect_ratio = 0.63f;
         break;
      case 4: // 20:9 (smartphone)
         widescreen_hack_aspect_ratio = 0.6f;
         break;
      case 5: // 21:9 (ultrawide, 64:27)
         widescreen_hack_aspect_ratio = 0.55f;
         break;
      case 6: // 32:9 (superwide)
         widescreen_hack_aspect_ratio = 0.37f;
         break;
      case 1: // 16:9 (default)
      default: // unknown setting: use the default ratio
         widescreen_hack_aspect_ratio = 0.75f;
         break;
   }

   /* Horizontal scale; kept as double to match the arithmetic of the
    * original ternary (float : 1.00 promotes to double). */
   const double x_scale = (widescreen_hack) ? widescreen_hack_aspect_ratio : 1.00;

   MAC[0] = F((int64_t)OFX + IR1 * h_div_sz * x_scale) >> 16;
   XY_FIFO[3].X = Lm_G(0, MAC[0]);

   MAC[0] = F((int64_t)OFY + IR2 * h_div_sz) >> 16;
   XY_FIFO[3].Y = Lm_G(1, MAC[0]);

   /* Shift the FIFO; entries 2 and 3 both hold the new point afterwards. */
   XY_FIFO[0] = XY_FIFO[1];
   XY_FIFO[1] = XY_FIFO[2];
   XY_FIFO[2] = XY_FIFO[3];

   /*
    * PGXP hack to add subpixel precision as well
    */
   float fofx       = ((float)OFX / (float)(1 << 16));
   float fofy       = ((float)OFY / (float)(1 << 16));

   /* Increased precision calculation (sub-pixel precision) */
   float precise_x = fofx + ((float)IR1 * precise_h_div_sz) * x_scale;
   float precise_y = fofy + ((float)IR2 * precise_h_div_sz);

   /* Raw in-memory image of the new FIFO entry, copied without aliasing. */
   uint32 value;
   memcpy(&value, &XY_FIFO[3], sizeof(value));

   /* Clamp precision values to valid range */
   precise_x = float_max(-0x400, float_min(precise_x, 0x3ff));
   precise_y = float_max(-0x400, float_min(precise_y, 0x3ff));

   PGXP_pushSXYZ2f(precise_x, precise_y, precise_z, value);
}


/* Depth cueing: MAC0 = DQB + DQA * h_div_sz (with the 32-bit overflow flags
 * updated by F()), and IR0 = that sum >> 12 saturated by Lm_H(). */
static INLINE void TransformDQ(int64_t h_div_sz)
{
   const int64_t depth_cue = (int64_t)DQB + DQA * h_div_sz;

   MAC[0] = F(depth_cue);
   IR0    = Lm_H(depth_cue >> 12);
}

/* RTPS: transform Vectors[0] by the rotation matrix plus translation,
 * project it to screen space and compute the depth-cue value.
 * Returns 15 (presumably the instruction's cycle cost - confirm with the caller). */
static INLINE int32 RTPS(uint32 instr)
{
 DECODE_FIELDS;

 MultiplyMatrixByVector_PT(&Matrices.Rot, Vectors[0], CRVectors.T, sf, lm);

 const int64 projection = Divide(H, Z_FIFO[3]);

 /* Floating-point counterparts for PGXP; depth is floored at H / 2. */
 const float depth_f      = float_max(H/2.f, (float)Z_FIFO[3]);
 const float projection_f = (float)H / depth_f;

 TransformXY(projection, projection_f, depth_f);
 TransformDQ(projection);

 return 15;
}

/* RTPT: run the RTPS transform + projection on Vectors[0..2] in order.
 * The depth-cue value is computed once, from the last vertex.
 * Returns 23 (presumably the instruction's cycle cost - confirm with the caller). */
static INLINE int32 RTPT(uint32 instr)
{
 DECODE_FIELDS;
 int64 projection = 0;
 int vtx;

 for(vtx = 0; vtx < 3; vtx++)
 {
  MultiplyMatrixByVector_PT(&Matrices.Rot, Vectors[vtx], CRVectors.T, sf, lm);
  projection = Divide(H, Z_FIFO[3]);

  /* Floating-point counterparts for PGXP; depth is floored at H / 2. */
  const float depth_f      = float_max(H/2.f, (float)Z_FIFO[3]);
  const float projection_f = (float)H / depth_f;

  TransformXY(projection, projection_f, depth_f);
 }

 /* `projection` still holds the factor of the third vertex. */
 TransformDQ(projection);

 return 23;
}

/* Light Vectors[v]: multiply it by the light matrix, feed the resulting
 * IR1..IR3 through the color matrix plus the B vector, and push the final
 * color onto the RGB FIFO. */
static INLINE void NormColor(uint32_t sf, int lm, uint32_t v)
{
   MultiplyMatrixByVector(&Matrices.Light, Vectors[v], CRVectors.Null, sf, lm);

   /* Snapshot the light result: the second multiply overwrites IR1..IR3. */
   const int16_t light[3] = { IR1, IR2, IR3 };

   MultiplyMatrixByVector(&Matrices.Color, light, CRVectors.B, sf, lm);

   MAC_to_RGB_FIFO();
}

/* NCS: NormColor() on Vectors[0].
 * Returns 14 (presumably the instruction's cycle cost - confirm with the caller). */
static int32_t NCS(uint32_t instr)
{
   const int      lm = (instr >> 10) & 1;
   const uint32_t sf = (instr & (1 << 19)) ? 12 : 0;

   NormColor(sf, lm, 0);

   return 14;
}

/* NCT: NormColor() on Vectors[0], [1] and [2], in that order.
 * Returns 30 (presumably the instruction's cycle cost - confirm with the caller). */
static int32_t NCT(uint32_t instr)
{
   unsigned i;
   const uint32_t sf = (instr & (1 << 19)) ? 12 : 0;
   const int      lm = (instr >> 10) & 1;

   for(i = 0; i < 3; i++)
      NormColor(sf, lm, i);

   return 30;
}

/* NCC - Normal Color Color
 *
 * Lights Vectors[vector_index] (light matrix, then color matrix + B vector),
 * multiplies the result component-wise by the RGB color (scaled << 4), and
 * pushes the final color onto the RGB FIFO.
 */
static INLINE void NCC(uint32_t vector_index, uint32_t sf, int lm)
{
   MultiplyMatrixByVector(&Matrices.Light, Vectors[vector_index], CRVectors.Null, sf, lm);

   /* Snapshot the light result: the second multiply overwrites IR1..IR3. */
   const int16_t light[3] = { IR1, IR2, IR3 };

   MultiplyMatrixByVector(&Matrices.Color, light, CRVectors.B, sf, lm);

   /* Modulate by the base color. */
   const int32_t red   = RGB.R << 4;
   const int32_t green = RGB.G << 4;
   const int32_t blue  = RGB.B << 4;

   MAC[1] = (red   * IR1) >> sf;
   MAC[2] = (green * IR2) >> sf;
   MAC[3] = (blue  * IR3) >> sf;

   MAC_to_IR(lm);
   MAC_to_RGB_FIFO();
}

/* NCCS: NCC() on Vectors[0].
 * Returns 17 (presumably the instruction's cycle cost - confirm with the caller). */
static int32_t NCCS(uint32_t instr)
{
   const int      lm = (instr >> 10) & 1;
   const uint32_t sf = (instr & (1 << 19)) ? 12 : 0;

   NCC(0, sf, lm);

   return 17;
}


/* NCCT: NCC() on Vectors[0], [1] and [2], in that order.
 * Returns 39 (presumably the instruction's cycle cost - confirm with the caller). */
static int32_t NCCT(uint32_t instr)
{
   const int      lm = (instr >> 10) & 1;
   const uint32_t sf = (instr & (1 << 19)) ? 12 : 0;
   uint32_t       vec;

   for(vec = 0; vec < 3; vec++)
      NCC(vec, sf, lm);

   return 39;
}

/* Depth-cue the oldest RGB FIFO entry toward the far color.
 *
 * Per channel: MAC = ((FC << 12) - (color << 12)) >> sf, then
 * MAC = ((color << 12) + IR0 * saturate(MAC)) >> sf, where color is the FIFO
 * component << 4. The result is saturated into IR1..IR3 and pushed onto the
 * RGB FIFO.
 */
static INLINE void DPC(uint32_t instr)
{
   const uint32_t sf       = (instr & (1 << 19)) ? 12 : 0;
   const int      lm       = (instr >> 10) & 1;
   const int32_t  color[3] = { RGB_FIFO[0].R << 4, RGB_FIFO[0].G << 4, RGB_FIFO[0].B << 4 };
   unsigned       ch;

   for(ch = 0; ch < 3; ch++)
   {
      const int64_t far_scaled   = (int64_t)((uint64_t)(int64_t)CRVectors.FC[ch] << 12);
      const int64_t color_scaled = (int64_t)((uint64_t)(int64_t)color[ch] << 12);

      /* Distance from the color to the far color... */
      MAC[1 + ch] = i64_to_i44(ch, (far_scaled - (int32)((uint32)color[ch] << 12))) >> sf;
      /* ...weighted by IR0 and added back onto the color. */
      MAC[1 + ch] = i64_to_i44(ch, (color_scaled + IR0 * i32_to_i16_saturate(ch, MAC[1 + ch], false))) >> sf;
   }

   MAC_to_IR(lm);

   MAC_to_RGB_FIFO();
}


/* DCPL - Depth Cue Color Light
 *
 * Per channel, with shade = (RGB component << 4) * IR (IR1..IR3 captured on
 * entry): MAC = ((FC << 12) - shade) >> sf, then
 * MAC = (shade + IR0 * saturate(MAC)) >> sf. The result is saturated into
 * IR1..IR3 and pushed onto the RGB FIFO.
 * Returns 8 (presumably the instruction's cycle cost - confirm with the caller).
 */
static int32_t DCPL(uint32_t instr)
{
   const uint32_t sf       = (instr & (1 << 19)) ? 12 : 0;
   const int      lm       = (instr >> 10) & 1;
   const int32_t  color[3] = { RGB.R << 4, RGB.G << 4, RGB.B << 4 };
   const int32_t  light[3] = { IR1, IR2, IR3 };
   unsigned       ch;

   for(ch = 0; ch < 3; ch++)
   {
      MAC[1 + ch] = i64_to_i44(ch, ((int64_t)((uint64_t)(int64_t)CRVectors.FC[ch] << 12) - color[ch] * light[ch])) >> sf;
      MAC[1 + ch] = i64_to_i44(ch, (color[ch] * light[ch] + IR0 * i32_to_i16_saturate(ch, MAC[1 + ch], false))) >> sf;
   }

   MAC_to_IR(lm);

   MAC_to_RGB_FIFO();

   return 8;
}


/* DPCS: depth-cue the RGB register toward the far color.
 *
 * Same arithmetic as DPC(), but the input color comes from RGB instead of
 * the oldest RGB FIFO entry.
 * Returns 8 (presumably the instruction's cycle cost - confirm with the caller).
 */
static int32_t DPCS(uint32_t instr)
{
   const uint32_t sf       = (instr & (1 << 19)) ? 12 : 0;
   const int      lm       = (instr >> 10) & 1;
   const int32_t  color[3] = { RGB.R << 4, RGB.G << 4, RGB.B << 4 };
   unsigned       ch;

   //assert(sf);
   for(ch = 0; ch < 3; ch++)
   {
      const int64_t far_scaled   = (int64_t)((uint64_t)(int64_t)CRVectors.FC[ch] << 12);
      const int64_t color_scaled = (int64_t)((uint64_t)(int64_t)color[ch] << 12);

      MAC[1 + ch] = i64_to_i44(ch, (far_scaled - (int32)((uint32)color[ch] << 12))) >> sf;
      MAC[1 + ch] = i64_to_i44(ch, (color_scaled + IR0 * i32_to_i16_saturate(ch, MAC[1 + ch], false))) >> sf;
   }

   MAC_to_IR(lm);
   MAC_to_RGB_FIFO();

   return 8;
}

/* DPCT - Depth Cue Triple
 *
 * Each DPC() call consumes the oldest RGB FIFO entry and pushes its result
 * on top, so three calls process and replace the whole FIFO.
 * Returns 17 (presumably the instruction's cycle cost - confirm with the caller).
 */
static int32_t DPCT(uint32_t instr)
{
   unsigned pass;

   for(pass = 0; pass < 3; pass++)
      DPC(instr);

   return 17;
}

/* INTPL - Interpolate between the IR vector and the far color
 *
 * Stage 1, for c = 0..2 (ir = IR1/IR2/IR3 as sampled on entry):
 *    MAC[1 + c] = i44((FC[c] << 12) - (ir << 12)) >> sf
 * Stage 2, for c = 0..2:
 *    MAC[1 + c] = i44(((ir << 12) + IR0 * sat16(MAC[1 + c])) >> sf)
 *
 * Note that in stage 2 the >> sf is applied to the argument of
 * i64_to_i44(), not to its result (this differs from DPCS/DCPL).
 * The result is then passed through MAC_to_IR(lm) and
 * MAC_to_RGB_FIFO().
 *
 * instr: raw GTE command word; bit 19 selects sf (shift by 12),
 *        bit 10 is lm.
 * Returns 8, which GTE_Instruction() uses as this command's cost.
 */
static int32_t INTPL(uint32_t instr)
{
   int            k;
   const int      lm       = (instr >> 10) & 1;
   const uint32_t sf       = (instr & (1 << 19)) ? 12 : 0;
   const int32_t  ir_in[3] = { IR1, IR2, IR3 };

   /* Stage 1: far color minus the vector, all three components
    * first so the helper calls happen in the original order. */
   for (k = 0; k < 3; k++)
   {
      const int64_t fc_fx = (int64_t)((uint64_t)(int64_t)CRVectors.FC[k] << 12);
      const int32_t ir_fx = (int32_t)((uint32_t)ir_in[k] << 12);

      MAC[1 + k] = i64_to_i44(k, fc_fx - ir_fx) >> sf;
   }

   /* Stage 2: vector plus IR0 times the saturated difference. */
   for (k = 0; k < 3; k++)
   {
      const int64_t ir_fx = (int64_t)((uint64_t)(int64_t)ir_in[k] << 12);

      MAC[1 + k] = i64_to_i44(k, (ir_fx + IR0 * i32_to_i16_saturate(k, MAC[1 + k], false)) >> sf);
   }

   MAC_to_IR(lm);
   MAC_to_RGB_FIFO();

   return 8;
}


/* Shared body of NCDS/NCDT for input vector index v:
 *   1. MultiplyMatrixByVector(Light matrix, Vectors[v], Null vector)
 *   2. MultiplyMatrixByVector(Color matrix, {IR1, IR2, IR3}, B vector)
 *   3. DCPL(instr)
 * sf (bit 19) and lm (bit 10) of instr are forwarded to both
 * matrix multiplications.
 */
static INLINE void NormColorDepthCue(uint32_t instr, uint32_t v)
{
   const int      lm = (instr >> 10) & 1;
   const uint32_t sf = (instr & (1 << 19)) ? 12 : 0;
   int16_t        stage1[3];

   MultiplyMatrixByVector(&Matrices.Light, Vectors[v], CRVectors.Null, sf, lm);

   /* Copy IR1..IR3 as they stand after the first multiplication
    * into a local scratch vector, which becomes the input of the
    * second multiplication. */
   stage1[0] = IR1;
   stage1[1] = IR2;
   stage1[2] = IR3;
   MultiplyMatrixByVector(&Matrices.Color, stage1, CRVectors.B, sf, lm);

   /* DCPL's return value (its own cost) is not used here. */
   (void)DCPL(instr);
}

/* NCDS - Normal Color Depth Cue Single vector
 *
 * Runs NormColorDepthCue() on vector 0.
 * Returns 19, which GTE_Instruction() uses as this command's cost.
 */
static int32_t NCDS(uint32_t instr)
{
   const uint32_t vector_index = 0;

   NormColorDepthCue(instr, vector_index);

   return 19;
}

/* NCDT - Normal Color Depth Cue Triple
 *
 * Runs NormColorDepthCue() on vectors 0, 1 and 2, in that order.
 * Returns 44, which GTE_Instruction() uses as this command's cost.
 */
static int32_t NCDT(uint32_t instr)
{
   uint32_t vector_index;

   for (vector_index = 0; vector_index < 3; vector_index++)
      NormColorDepthCue(instr, vector_index);

   return 44;
}

/* CC - Color Color
 *
 * 1. MultiplyMatrixByVector(Color matrix, {IR1, IR2, IR3}, B vector)
 * 2. MAC[1..3] = ((RGB.<c> << 4) * IR<c>) >> sf, using the IR values
 *    as they stand after step 1
 * 3. MAC_to_IR(lm), MAC_to_RGB_FIFO()
 *
 * instr: raw GTE command word; bit 19 selects sf (shift by 12),
 *        bit 10 is lm.
 * Returns 11, which GTE_Instruction() uses as this command's cost.
 */
static int32_t CC(uint32_t instr)
{
   const int      lm = (instr >> 10) & 1;
   const uint32_t sf = (instr & (1 << 19)) ? 12 : 0;
   int16_t        ir_vec[3];

   ir_vec[0] = IR1;
   ir_vec[1] = IR2;
   ir_vec[2] = IR3;
   MultiplyMatrixByVector(&Matrices.Color, ir_vec, CRVectors.B, sf, lm);

   /* IR1..IR3 are re-read here on purpose: they are read after the
    * matrix multiplication, not from the ir_vec snapshot. */
   MAC[1] = (IR1 * (RGB.R << 4)) >> sf;
   MAC[2] = (IR2 * (RGB.G << 4)) >> sf;
   MAC[3] = (IR3 * (RGB.B << 4)) >> sf;

   MAC_to_IR(lm);
   MAC_to_RGB_FIFO();

   return 11;
}

/* CDP
 *
 * 1. MultiplyMatrixByVector(Color matrix, {IR1, IR2, IR3}, B vector)
 * 2. DCPL(instr)
 *
 * instr: raw GTE command word; bit 19 selects sf (shift by 12),
 *        bit 10 is lm.
 * Returns 13 (DCPL's own return value is discarded), which
 * GTE_Instruction() uses as this command's cost.
 */
static int32_t CDP(uint32_t instr)
{
   const int      lm = (instr >> 10) & 1;
   const uint32_t sf = (instr & (1 << 19)) ? 12 : 0;
   int16_t        ir_vec[3];

   ir_vec[0] = IR1;
   ir_vec[1] = IR2;
   ir_vec[2] = IR3;
   MultiplyMatrixByVector(&Matrices.Color, ir_vec, CRVectors.B, sf, lm);

   (void)DCPL(instr);

   return 13;
}

/* NCLIP - Normal Clipping
 *
 * Writes to MAC[0] the value
 *
 *    x0 * (y1 - y2) + x1 * (y2 - y0) + x2 * (y0 - y1)
 *
 * for the three screen coordinates in XY_FIFO[0..2] (twice the
 * signed area of that triangle).
 *
 * When the PGXP NCLIP mode bit is enabled and PGXP_NCLIP_valid()
 * accepts the three packed XY words, the value comes from
 * PGXP_NCLIP() instead and no overflow check is run on it here.
 *
 * instr is unused.
 * Returns 8, which GTE_Instruction() uses as this command's cost.
 */
static int32_t NCLIP(uint32_t instr)
{
   int32_t sum;

   /* NOTE(review): the casts below read each XY_FIFO entry through
    * a uint32 pointer; this relies on the build tolerating that
    * type pun (e.g. -fno-strict-aliasing) -- confirm. */
   if ((PGXP_GetModes() & PGXP_NCLIP_IMPL) &&
       PGXP_NCLIP_valid(*((uint32*)&XY_FIFO[0]), *((uint32*)&XY_FIFO[1]), *((uint32*)&XY_FIFO[2])))
   {
      sum = PGXP_NCLIP();
   }
   else
   {
      /* Each product is formed in 32-bit int arithmetic; the first
       * is widened so the three-term sum is accumulated in 64 bits
       * before F() processes it. */
      sum = F( (int64_t)(XY_FIFO[0].X * (XY_FIFO[1].Y - XY_FIFO[2].Y))
             + (XY_FIFO[1].X * (XY_FIFO[2].Y - XY_FIFO[0].Y))
             + (XY_FIFO[2].X * (XY_FIFO[0].Y - XY_FIFO[1].Y)) );
      check_mac_overflow(sum);
   }

   MAC[0] = sum;

   return(8);
}

/* AVSZ3 - Average three Z values
 *
 *    MAC[0] = ZSF3 * (Z_FIFO[1] + Z_FIFO[2] + Z_FIFO[3])
 *    OTZ    = i64_to_otz(MAC[0], false)
 *
 * The 64-bit product is passed to check_mac_overflow() before it is
 * truncated to 32 bits for MAC[0].
 *
 * The average factor should generally be set to 1/3rd of the
 * ordering table size.  So for instance, for a table of 1024
 * entries, it should be set at 341 to use the full table
 * granularity.
 *
 * instr is unused.
 * Returns 5, which GTE_Instruction() uses as this command's cost.
 */
static int32_t AVSZ3(uint32_t instr)
{
   const uint32_t z1      = Z_FIFO[1];
   const uint32_t z2      = Z_FIFO[2];
   const uint32_t z3      = Z_FIFO[3];
   /* Keep the sum signed: multiplying a signed factor by an
    * unsigned 64-bit sum would be carried out as unsigned and then
    * converted back, which is implementation-defined for a negative
    * ZSF3. */
   const int64_t  sum     = (int64_t)(z1 + z2 + z3);
   const int64_t  zsf3    = ZSF3;
   const int64_t  average = zsf3 * sum;

   check_mac_overflow(average);

   MAC[0] = (int32_t)average;
   OTZ    = i64_to_otz(MAC[0], false);

   return(5);
}

/* AVSZ4 - Average four Z values
 *
 *    MAC[0] = ZSF4 * (Z_FIFO[0] + Z_FIFO[1] + Z_FIFO[2] + Z_FIFO[3])
 *    OTZ    = i64_to_otz(MAC[0], false)
 *
 * The 64-bit product is passed to check_mac_overflow() before it is
 * truncated to 32 bits for MAC[0].
 *
 * The average factor should generally be set to 1/4th of the
 * ordering table size.  So for instance, for a table of 1024
 * entries, it should be set at 256 to use the full table
 * granularity.
 *
 * instr is unused.
 * Returns 5, which GTE_Instruction() uses as this command's cost.
 */
static int32_t AVSZ4(uint32_t instr)
{
   const uint32_t z0      = Z_FIFO[0];
   const uint32_t z1      = Z_FIFO[1];
   const uint32_t z2      = Z_FIFO[2];
   const uint32_t z3      = Z_FIFO[3];
   /* Keep the sum signed: multiplying a signed factor by an
    * unsigned 64-bit sum would be carried out as unsigned and then
    * converted back, which is implementation-defined for a negative
    * ZSF4. */
   const int64_t  sum     = (int64_t)(z0 + z1 + z2 + z3);
   const int64_t  zsf4    = ZSF4;
   const int64_t  average = zsf4 * sum;

   check_mac_overflow(average);

   MAC[0] = (int32_t)average;
   OTZ    = i64_to_otz(MAC[0], false);

   return(5);
}


// -32768 * -32768 - 32767 * -32768 = 2147450880
// (2 ^ 31) - 1 =		      2147483647
static int32_t OP(uint32_t instr)
{
   const uint32_t sf = (instr & (1 << 19)) ? 12 : 0;
   const int      lm = (instr >> 10) & 1;
   int32_t       ir1 = IR1;
   int32_t       ir2 = IR2;
   int32_t       ir3 = IR3;
   int32_t        r0 = Matrices.Rot.MX[0][0];
   int32_t        r1 = Matrices.Rot.MX[1][1];
   int32_t        r2 = Matrices.Rot.MX[2][2];

   MAC[1] = (r1 * ir3 - r2 * ir2) >> sf;
   MAC[2] = (r2 * ir1 - r0 * ir3) >> sf;
   MAC[3] = (r0 * ir2 - r1 * ir1) >> sf;

   MAC_to_IR(lm);

   return(6);
}

/* GPF
 *
 *    MAC[1 + c] = (IR0 * IR<c + 1>) >> sf      for c = 0..2
 *
 * followed by MAC_to_IR(lm) and MAC_to_RGB_FIFO().
 *
 * instr: raw GTE command word; bit 19 selects sf (shift by 12),
 *        bit 10 is lm.
 * Returns 5, which GTE_Instruction() uses as this command's cost.
 */
static int32_t GPF(uint32_t instr)
{
   int            k;
   const int      lm       = (instr >> 10) & 1;
   const uint32_t sf       = (instr & (1 << 19)) ? 12 : 0;
   const int32_t  ir_in[3] = { IR1, IR2, IR3 };

   for (k = 0; k < 3; k++)
      MAC[1 + k] = (IR0 * ir_in[k]) >> sf;

   MAC_to_IR(lm);
   MAC_to_RGB_FIFO();

   return 5;
}

/* GPL
 *
 *    MAC[1 + c] = i44((MAC[1 + c] << sf) + IR0 * IR<c + 1>) >> sf
 *                                                   for c = 0..2
 *
 * i.e. like GPF, but the product is added to the existing MAC value
 * (scaled back up by sf first).  Followed by MAC_to_IR(lm) and
 * MAC_to_RGB_FIFO().
 *
 * instr: raw GTE command word; bit 19 selects sf (shift by 12),
 *        bit 10 is lm.
 * Returns 5, which GTE_Instruction() uses as this command's cost.
 */
static int32_t GPL(uint32_t instr)
{
   int            k;
   const int      lm       = (instr >> 10) & 1;
   const uint32_t sf       = (instr & (1 << 19)) ? 12 : 0;
   const int32_t  ir_in[3] = { IR1, IR2, IR3 };

   for (k = 0; k < 3; k++)
   {
      /* Shift the old MAC as unsigned 64-bit so a negative value is
       * not left-shifted as a signed quantity. */
      const int64_t acc = (int64_t)((uint64_t)(int64_t)MAC[1 + k] << sf);

      MAC[1 + k] = i64_to_i44(k, acc + (IR0 * ir_in[k])) >> sf;
   }

   MAC_to_IR(lm);
   MAC_to_RGB_FIFO();

   return 5;
}

/*

---------------------------------------------------------------------------------------------
| 24 23 22 21 20 | 19 | 18 17 | 16 15 | 14 13 | 12  11 | 10 | 9  8  7  6 | 5  4  3  2  1  0 |
|-------------------------------------------------------------------------------------------|
|    (unused)    | sf |  mx   |   v   |   cv  |(unused)| lm |  (unused)  |     opcode       |
---------------------------------------------------------------------------------------------
 (unused) = unused, ignored

 sf = shift 12

 mx = matrix selection

 v = source vector

 cv = add vector(translation/back/far color(bugged)/none)

 (unused) = unused, ignored

 lm = limit negative results to 0

 (unused) = unused, ignored

 opcode = operation code 
*/

/* Decode and execute one GTE command word.
 *
 * FLAGS is cleared, the handler selected by the low six bits of
 * instr is run, the summary bit (bit 31) of FLAGS is set if any bit
 * in mask 0x7f87e000 was raised, and FLAGS is stored in CR[31].
 * Opcodes without a handler do nothing except clear FLAGS.
 *
 * instr: raw command word (field layout is described in the comment
 *        above this function).
 * Returns the handler's cost minus one.  Opcodes without a handler,
 * and every opcode when psx_gte_overclock is set, have a cost of 1
 * and therefore return 0.
 */
int32_t GTE_Instruction(uint32_t instr)
{
   const unsigned code = instr & 0x3F;
   int32_t ret = 1;

   FLAGS = 0;

   /* Opcodes with no handler below (upstream notes, unverified):
    *   0x02 0x03 0x05          UNSTABLE?
    *   0x04                    Probably simple with v,cv,sf,mx,lm
    *                           ignored.  Same calculation as 0x3B?
    *   0x07..0x0B              UNSTABLE?
    *   0x0D..0x0F              UNSTABLE?
    *   0x15                    does one push on RGB FIFO, what else...
    *   0x17                    PARTIALLY UNSTABLE (depending on sf or
    *                           v or cv or mx or lm), similar behavior
    *                           under some conditions to 0x16?
    *   0x18 0x19 0x1D 0x1F     (no notes)
    *   0x21 0x23..0x27         (no notes)
    *   0x22                    UNSTABLE?
    *   0x2B 0x2C               (no notes)
    *   0x2F                    UNSTABLE?
    *   0x31..0x37              UNSTABLE?
    *   0x38                    (no notes)
    *   0x39 0x3A               Probably simple with v,cv,sf,mx,lm
    *                           ignored.
    *   0x3B                    Probably simple with v,cv,sf,mx,lm
    *                           ignored.  Same calculation as 0x04?
    *   0x3C                    UNSTABLE?
    */
   switch(code)
   {
      case 0x00: /* alternate? */
      case 0x01:
         ret = RTPS(instr);
         break;

      case 0x06:
         ret = NCLIP(instr);
         break;

      case 0x0C:
         ret = OP(instr);
         break;

      case 0x10:
         ret = DPCS(instr);
         break;

      case 0x11:
         ret = INTPL(instr);
         break;

      case 0x12:
         ret = MVMVA(instr);
         break;

      case 0x13:
         ret = NCDS(instr);
         break;

      case 0x14:
         ret = CDP(instr);
         break;

      case 0x16:
         ret = NCDT(instr);
         break;

      case 0x1B:
         ret = NCCS(instr);
         break;

      case 0x1C:
         ret = CC(instr);
         break;

      case 0x1E:
         ret = NCS(instr);
         break;

      case 0x20:
         ret = NCT(instr);
         break;

      case 0x28:
         ret = SQR(instr);
         break;

      case 0x1A: /* Alternate for 0x29? */
      case 0x29:
         ret = DCPL(instr);
         break;

      case 0x2A:
         ret = DPCT(instr);
         break;

      case 0x2D:
         ret = AVSZ3(instr);
         break;

      case 0x2E:
         ret = AVSZ4(instr);
         break;

      case 0x30:
         ret = RTPT(instr);
         break;

      case 0x3D:
         ret = GPF(instr);
         break;

      case 0x3E:
         ret = GPL(instr);
         break;

      case 0x3F:
         ret = NCCT(instr);
         break;

      default:
         break;
   }

   /* Overclock: force all GTE instruction to have 1 cycle latency */
   if (psx_gte_overclock)
      ret = 1;

   /* Summary bit.  The shift is done on an unsigned operand because
    * 1 << 31 on a signed int is undefined behaviour in C. */
   if (FLAGS & 0x7f87e000)
      FLAGS |= (uint32_t)1 << 31;

   CR[31] = FLAGS;

   return(ret - 1);
}