/* Mednafen - Multi-system Emulator * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* MDEC_READ_FIFO(tfr) vs InCounter vs MDEC_DMACanRead() is a bit fragile right now. Actually, the entire horrible state machine monstrosity is fragile. TODO: OutFIFOReady, so <16bpp works right. TODO CODE: bool InFIFOReady; if(InFIFO.CanWrite()) { InFIFO.Write(V); if(InCommand) { if(InCounter != 0xFFFF) { InCounter--; // This condition when InFIFO.CanWrite() != 0 is a bit buggy on real hardware, decoding loop *seems* to be reading too // much and pulling garbage from the FIFO. if(InCounter == 0xFFFF) InFIFOReady = true; } if(InFIFO.CanWrite() == 0) InFIFOReady = true; } } */ // Good test-case games: // Dragon Knight 4(bad disc?) // Final Fantasy 7 intro movie. // GameShark Version 4.0 intro movie; (clever) abuse of DMA channel 0. // SimCity 2000 startup. 
#include "psx.h"
#include "mdec.h"

#include "../masmem.h"
#include "../math_ops.h"
#include "../state_helpers.h"

#include "FastFIFO.h"

// NOTE(review): the names of the angle-bracket headers were lost in this copy
// of the file.  <string.h> (memset) and <algorithm> (std::min/std::max) are what
// the code below visibly needs; <math.h> is kept as the presumed original
// include -- confirm against upstream.
#include <math.h>
#include <string.h>
#include <algorithm>

#if defined(__SSE2__)
#include <xmmintrin.h>
#include <emmintrin.h>
#endif

#if defined(ARCH_POWERPC_ALTIVEC) && defined(HAVE_ALTIVEC_H)
#include <altivec.h>
#endif

// Cycle budget consumed by MDEC_Run(); capped at EventCycles.
static int32 ClockCounter;

// Resume point of the MDEC_Run() state machine (see MDEC_Run()).
static unsigned MDRPhase;

// NOTE(review): the template arguments were lost in this copy of the file.
// uint32 elements match the SFARRAY32 serialisation and the uint32 Read()/Write()
// usage below; a capacity of 0x20 matches the ">= 0x20" DMA thresholds in
// MDEC_DMACanWrite()/MDEC_DMACanRead() -- confirm against FastFIFO.h.
static FastFIFO<uint32, 0x20> InFIFO;
static FastFIFO<uint32, 0x20> OutFIFO;

static int8 block_y[8][8];
static int8 block_cb[8][8];	// [y >> 1][x >> 1]
static int8 block_cr[8][8];	// [y >> 1][x >> 1]

static uint32 Control;
static uint32 Command;
static bool InCommand;

static uint8 QMatrix[2][64];
static uint32 QMIndex;

MDFN_ALIGN(16) static int16 IDCTMatrix[64];
static uint32 IDCTMIndex;

static uint8 QScale;

MDFN_ALIGN(16) static int16 Coeff[64];
static uint32 CoeffIndex;
static uint32 DecodeWB;

static union
{
 uint32 pix32[48];
 uint16 pix16[96];
 uint8 pix8[192];
} PixelBuffer;
static uint32 PixelBufferReadOffset;
static uint32 PixelBufferCount32;

static uint16 InCounter;

static uint8 RAMOffsetY;
static uint8 RAMOffsetCounter;
static uint8 RAMOffsetWWS;

// Maps the sequential coefficient index of the input stream to its position
// inside the 8x8 Coeff[] block.
static const uint8 ZigZag[64] =
{
 0x00, 0x08, 0x01, 0x02, 0x09, 0x10, 0x18, 0x11,
 0x0a, 0x03, 0x04, 0x0b, 0x12, 0x19, 0x20, 0x28,
 0x21, 0x1a, 0x13, 0x0c, 0x05, 0x06, 0x0d, 0x14,
 0x1b, 0x22, 0x29, 0x30, 0x38, 0x31, 0x2a, 0x23,
 0x1c, 0x15, 0x0e, 0x07, 0x0f, 0x16, 0x1d, 0x24,
 0x2b, 0x32, 0x39, 0x3a, 0x33, 0x2c, 0x25, 0x1e,
 0x17, 0x1f, 0x26, 0x2d, 0x34, 0x3b, 0x3c, 0x35,
 0x2e, 0x27, 0x2f, 0x36, 0x3d, 0x3e, 0x37, 0x3f,
};

extern int32 EventCycles;

//
// Puts every piece of MDEC state (FIFOs, decoded blocks, matrices, registers,
// state-machine phase) back to zero / empty.
//
void MDEC_Power(void)
{
 ClockCounter = 0;
 MDRPhase = 0;

 InFIFO.Flush();
 OutFIFO.Flush();

 memset(block_y, 0, sizeof(block_y));
 memset(block_cb, 0, sizeof(block_cb));
 memset(block_cr, 0, sizeof(block_cr));

 Control = 0;
 Command = 0;
 InCommand = false;

 memset(QMatrix, 0, sizeof(QMatrix));
 QMIndex = 0;

 memset(IDCTMatrix, 0, sizeof(IDCTMatrix));
 IDCTMIndex = 0;

 QScale = 0;

 memset(Coeff, 0, sizeof(Coeff));
 CoeffIndex = 0;
 DecodeWB = 0;

 memset(PixelBuffer.pix32, 0, sizeof(PixelBuffer.pix32));
 PixelBufferReadOffset = 0;
 PixelBufferCount32 = 0;

 InCounter = 0;

 RAMOffsetY = 0;
 RAMOffsetCounter = 0;
 RAMOffsetWWS = 0;
}

//
// Saves or loads the MDEC state through MDFNSS_StateAction() under the
// section name "MDEC".
//
//  sm, load, data_only: forwarded unchanged to MDFNSS_StateAction().
//
// Returns the value returned by MDFNSS_StateAction().
//
// After a load, values that are later used as array indices without a
// preceding bounds check are forced back into range, so that a corrupt or
// hostile save state cannot cause out-of-bounds accesses.
//
int MDEC_StateAction(StateMem *sm, int load, int data_only)
{
 SFORMAT StateRegs[] =
 {
  SFVAR(ClockCounter),
  SFVAR(MDRPhase),

#define SFFIFO32(fifoobj) SFARRAY32(&fifoobj.data[0], sizeof(fifoobj.data) / sizeof(fifoobj.data[0])), \
                          SFVAR(fifoobj.read_pos), \
                          SFVAR(fifoobj.write_pos), \
                          SFVAR(fifoobj.in_count)

  SFFIFO32(InFIFO),
  SFFIFO32(OutFIFO),

// Was "#undef SFFIFO", which left SFFIFO32 defined for the rest of the file.
#undef SFFIFO32

  SFARRAY(&block_y[0][0], sizeof(block_y) / sizeof(block_y[0][0])),
  SFARRAY(&block_cb[0][0], sizeof(block_cb) / sizeof(block_cb[0][0])),
  SFARRAY(&block_cr[0][0], sizeof(block_cr) / sizeof(block_cr[0][0])),

  SFVAR(Control),
  SFVAR(Command),
  SFVAR(InCommand),

  SFARRAY(&QMatrix[0][0], sizeof(QMatrix) / sizeof(QMatrix[0][0])),
  SFVAR(QMIndex),

  SFARRAY16(&IDCTMatrix[0], sizeof(IDCTMatrix) / sizeof(IDCTMatrix[0])),
  SFVAR(IDCTMIndex),

  SFVAR(QScale),

  SFARRAY16(&Coeff[0], sizeof(Coeff) / sizeof(Coeff[0])),
  SFVAR(CoeffIndex),
  SFVAR(DecodeWB),

  SFARRAY32(&PixelBuffer.pix32[0], sizeof(PixelBuffer.pix32) / sizeof(PixelBuffer.pix32[0])),
  SFVAR(PixelBufferReadOffset),
  SFVAR(PixelBufferCount32),

  SFVAR(InCounter),

  SFVAR(RAMOffsetY),
  SFVAR(RAMOffsetCounter),
  SFVAR(RAMOffsetWWS),

  SFEND
 };

 int ret = MDFNSS_StateAction(sm, load, data_only, StateRegs, "MDEC");

 if(load)
 {
  const uint32 pb_words = sizeof(PixelBuffer.pix32) / sizeof(PixelBuffer.pix32[0]);

  InFIFO.SaveStatePostLoad();
  OutFIFO.SaveStatePostLoad();

  // Valid range is 0..pb_words inclusive.
  PixelBufferCount32 %= pb_words + 1;

  // MDEC_Run() may resume at "case 9" and read pix32[PixelBufferReadOffset]
  // before re-checking it against PixelBufferCount32.
  PixelBufferReadOffset %= pb_words;

  // MDEC_Run() may resume at "case 11" and write QMatrix[QMIndex >> 6][...]
  // before QMIndex is masked again.
  QMIndex &= 0x7F;
 }

 return(ret);
}

//
// Sign-extends v from 9 bits, then saturates the result to -128..127.
//
static INLINE int8 Mask9ClampS8(int32 v)
{
 v = sign_x_to_s32(9, v);

 if(v < -128)
  v = -128;

 if(v > 127)
  v = 127;

 return v;
}

//
// One 1-D pass of the IDCT over all 8 rows of in_coeff, using IDCTMatrix.
//
// Every output is the dot product of an 8-element input row with an
// 8-element IDCTMatrix row, rounded with +0x4000 and shifted right by 15.
//
//  - When T is one byte wide (final pass) the value goes through
//    Mask9ClampS8() and is stored at [col * 8 + x].
//  - Otherwise (intermediate pass) it is stored transposed, at [x * 8 + col].
//
// The SSE2 path uses aligned loads, so in_coeff and IDCTMatrix must be
// 16-byte aligned there.
//
template<typename T>
static void IDCT_1D_Multi(int16 *in_coeff, T *out_coeff)
{
 unsigned col, x;

 for(col = 0; col < 8; col++)
 {
#if defined(__SSE2__)
  __m128i c = _mm_load_si128((__m128i *)&in_coeff[(col * 8)]);
#endif

  for(x = 0; x < 8; x++)
  {
#ifdef __SSE2__
   MDFN_ALIGN(16) int32 tmp[4];
   __m128i m = _mm_load_si128((__m128i *)&IDCTMatrix[(x * 8)]);
   __m128i sum = _mm_madd_epi16(m, c);

   // Horizontal add of the four 32-bit partial sums; the total ends up in lane 0.
   sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, (3 << 0) | (2 << 2) | (1 << 4) | (0 << 6)));
   sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, (1 << 0) | (0 << 2)));

   _mm_store_si128((__m128i*)tmp, sum);

   if(sizeof(T) == 1)
    out_coeff[(col * 8) + x] = Mask9ClampS8((tmp[0] + 0x4000) >> 15);
   else
    out_coeff[(x * 8) + col] = (tmp[0] + 0x4000) >> 15;
#else
   int32 sum = 0;
   unsigned u;

   for(u = 0; u < 8; u++)
    sum += (in_coeff[(col * 8) + u] * IDCTMatrix[(x * 8) + u]);

   if(sizeof(T) == 1)
    out_coeff[(col * 8) + x] = Mask9ClampS8((sum + 0x4000) >> 15);
   else
    out_coeff[(x * 8) + col] = (sum + 0x4000) >> 15;
#endif
  }
 }
}

//
// Full 8x8 IDCT: two 1-D passes through a 16-bit intermediate buffer.
//
//  in_coeff:  64 input coefficients.
//  out_coeff: receives 64 clamped 8-bit samples.
//
static void IDCT(int16 *in_coeff, int8 *out_coeff)
{
 MDFN_ALIGN(16) int16 tmpbuf[64];

 IDCT_1D_Multi<int16>(in_coeff, tmpbuf);
 IDCT_1D_Multi<int8>(tmpbuf, out_coeff);
}

//
// Converts one signed Y/Cb/Cr sample triple to r, g, b.
//
// Each channel is computed through Mask9ClampS8() and then XORed with 0x80.
//
static INLINE void YCbCr_to_RGB(const int8 y, const int8 cb, const int8 cr, int &r, int &g, int &b)
{
 // The formula for green is still a bit off(precision/rounding issues when both cb and cr are non-zero).

 r = Mask9ClampS8(y + (((359 * cr) + 0x80) >> 8));
 //g = Mask9ClampS8(y + (((-88 * cb) + (-183 * cr) + 0x80) >> 8));
 g = Mask9ClampS8(y + ((((-88 * cb) &~ 0x1F) + ((-183 * cr) &~ 0x07) + 0x80) >> 8));
 b = Mask9ClampS8(y + (((454 * cb) + 0x80) >> 8));

 r ^= 0x80;
 g ^= 0x80;
 b ^= 0x80;
}

//
// Packs 8-bit r, g, b into a 15-bit value: r in bits 0-4, g in bits 5-9,
// b in bits 10-14.  Each channel is rounded (+4, >> 3) and saturated to 0x1F.
//
static INLINE uint16 RGB_to_RGB555(uint8 r, uint8 g, uint8 b)
{
 r = (r + 4) >> 3;
 g = (g + 4) >> 3;
 b = (b + 4) >> 3;

 if(r > 0x1F)
  r = 0x1F;

 if(g > 0x1F)
  g = 0x1F;

 if(b > 0x1F)
  b = 0x1F;

 return((r << 0) | (g << 5) | (b << 10));
}

//
// Converts the current block_y (plus block_cb/block_cr for the colour modes)
// into output pixels in PixelBuffer and sets PixelBufferCount32 to the number
// of 32-bit words produced.
//
//  ybn: index of the luma block; bit 0 and bit 1 select which 4x4 quadrant of
//       the chroma blocks is used in the colour modes.
//
// The output format is selected by bits 27-28 of Command; bit 26 and (for
// 16bpp) bit 25 of Command select the XOR applied to the output.
//
static void EncodeImage(const unsigned ybn)
{
 //printf("ENCODE, %d\n", (Command & 0x08000000) ? 256 : 384);

 PixelBufferCount32 = 0;

 switch((Command >> 27) & 0x3)
 {
  case 0:	// 4bpp
  {
   const uint8 us_xor = (Command & (1U << 26)) ? 0x00 : 0x88;
   uint8* pix_out = PixelBuffer.pix8;

   for(int y = 0; y < 8; y++)
   {
    for(int x = 0; x < 8; x += 2)
    {
     uint8 p0 = std::min<int>(127, block_y[y][x + 0] + 8);
     uint8 p1 = std::min<int>(127, block_y[y][x + 1] + 8);

     *pix_out = ((p0 >> 4) | (p1 & 0xF0)) ^ us_xor;
     pix_out++;
    }
   }
   PixelBufferCount32 = 8;
  }
  break;

  case 1:	// 8bpp
  {
   const uint8 us_xor = (Command & (1U << 26)) ? 0x00 : 0x80;
   uint8* pix_out = PixelBuffer.pix8;

   for(int y = 0; y < 8; y++)
   {
    for(int x = 0; x < 8; x++)
    {
     *pix_out = (uint8)block_y[y][x] ^ us_xor;
     pix_out++;
    }
   }
   PixelBufferCount32 = 16;
  }
  break;

  case 2:	// 24bpp
  {
   const uint8 rgb_xor = (Command & (1U << 26)) ? 0x80 : 0x00;
   uint8* pix_out = PixelBuffer.pix8;

   for(int y = 0; y < 8; y++)
   {
    const int8* by = &block_y[y][0];
    const int8* cb = &block_cb[(y >> 1) | ((ybn & 2) << 1)][(ybn & 1) << 2];
    const int8* cr = &block_cr[(y >> 1) | ((ybn & 2) << 1)][(ybn & 1) << 2];

    for(int x = 0; x < 8; x++)
    {
     int r, g, b;

     YCbCr_to_RGB(by[x], cb[x >> 1], cr[x >> 1], r, g, b);

     pix_out[0] = r ^ rgb_xor;
     pix_out[1] = g ^ rgb_xor;
     pix_out[2] = b ^ rgb_xor;
     pix_out += 3;
    }
   }
   PixelBufferCount32 = 48;
  }
  break;

  case 3:	// 16bpp
  {
   uint16 pixel_xor = ((Command & 0x02000000) ? 0x8000 : 0x0000) | ((Command & (1U << 26)) ? 0x4210 : 0x0000);
   uint16* pix_out = PixelBuffer.pix16;

   for(int y = 0; y < 8; y++)
   {
    const int8* by = &block_y[y][0];
    const int8* cb = &block_cb[(y >> 1) | ((ybn & 2) << 1)][(ybn & 1) << 2];
    const int8* cr = &block_cr[(y >> 1) | ((ybn & 2) << 1)][(ybn & 1) << 2];

    for(int x = 0; x < 8; x++)
    {
     int r, g, b;

     YCbCr_to_RGB(by[x], cb[x >> 1], cr[x >> 1], r, g, b);

     StoreU16_LE(pix_out, pixel_xor ^ RGB_to_RGB555(r, g, b));
     pix_out++;
    }
   }
   PixelBufferCount32 = 32;
  }
  break;
 }
}

//
// Consumes one 16-bit word of compressed block data.
//
//  V:          the data word.  0xFE00 is skipped when it arrives at the start
//              of a block, and zero-fills the remainder of a block otherwise.
//              For any other word the low 10 bits are a signed coefficient;
//              the upper 6 bits are QScale for the first word of a block and
//              a count of zero coefficients to insert first for later words.
//  eat_cycles: incremented by 512 each time a block is completed.
//
// When the 64th coefficient has been stored, the block is run through IDCT()
// into block_cr (DecodeWB 0), block_cb (DecodeWB 1) or block_y (DecodeWB 2-5);
// luma blocks are then passed to EncodeImage().  DecodeWB is advanced and
// wrapped according to bit 28 of Command.
//
static INLINE void WriteImageData(uint16 V, int32* eat_cycles)
{
 // Chroma blocks (DecodeWB 0 and 1) use QMatrix[1], luma blocks use QMatrix[0].
 const uint32 qmw = (bool)(DecodeWB < 2);

 //printf("MDEC DMA SubWrite: %04x, %d\n", V, CoeffIndex);

 if(!CoeffIndex)
 {
  if(V == 0xFE00)
  {
   //printf("FE00 @ %u\n", DecodeWB);
   return;
  }

  QScale = V >> 10;

  {
   int q = QMatrix[qmw][0];	// No QScale here!
   int ci = sign_10_to_s16(V & 0x3FF);
   int tmp;

   if(q != 0)
    tmp = (int32)((uint32)(ci * q) << 4) + (ci ? ((ci < 0) ? 8 : -8) : 0);
   else
    tmp = (uint32)(ci * 2) << 4;

   // Not sure if it should be 0x3FFF or 0x3FF0 or maybe 0x3FF8?
   Coeff[ZigZag[0]] = std::min<int>(0x3FFF, std::max<int>(-0x4000, tmp));
   CoeffIndex++;
  }
 }
 else
 {
  if(V == 0xFE00)
  {
   while(CoeffIndex < 64)
    Coeff[ZigZag[CoeffIndex++]] = 0;
  }
  else
  {
   uint32 rlcount = V >> 10;

   for(uint32 i = 0; i < rlcount && CoeffIndex < 64; i++)
   {
    Coeff[ZigZag[CoeffIndex]] = 0;
    CoeffIndex++;
   }

   if(CoeffIndex < 64)
   {
    int q = QScale * QMatrix[qmw][CoeffIndex];
    int ci = sign_10_to_s16(V & 0x3FF);
    int tmp;

    if(q != 0)
     tmp = (int32)((uint32)((ci * q) >> 3) << 4) + (ci ? ((ci < 0) ? 8 : -8) : 0);
    else
     tmp = (uint32)(ci * 2) << 4;

    // Not sure if it should be 0x3FFF or 0x3FF0 or maybe 0x3FF8?
    Coeff[ZigZag[CoeffIndex]] = std::min<int>(0x3FFF, std::max<int>(-0x4000, tmp));
    CoeffIndex++;
   }
  }
 }

 if(CoeffIndex == 64)
 {
  CoeffIndex = 0;

  //printf("Block %d finished\n", DecodeWB);

  switch(DecodeWB)
  {
   case 0: IDCT(Coeff, &block_cr[0][0]); break;
   case 1: IDCT(Coeff, &block_cb[0][0]); break;
   case 2:
   case 3:
   case 4:
   case 5: IDCT(Coeff, &block_y[0][0]); break;
  }

  // Timing in the actual PS1 MDEC is complex due to (apparent) pipelining, but the average when decoding a large number of blocks is
  // about 512.
  *eat_cycles += 512;

  if(DecodeWB >= 2)
   EncodeImage((DecodeWB + 4) % 6);

  DecodeWB++;
  if(DecodeWB == (((Command >> 27) & 2) ? 6 : 3))
   DecodeWB = ((Command >> 27) & 2) ? 0 : 2;
 }
}

//
// Advances the MDEC command processor.
//
//  clocks: cycles to add to ClockCounter (which is capped at EventCycles).
//
// The function is a resumable state machine.  Whenever it has to wait (input
// FIFO empty, output FIFO full, or no cycles left) it stores a resume point
// in MDRPhase and returns; the next call re-enters at the matching "case"
// label via the outer switch.  The case labels therefore sit in the middle of
// loops and if/else bodies on purpose, and the statement order must not be
// changed.
//
// Resume points: MDRPhase N resumes at "case N + MDRPhaseBias".
//
void MDEC_Run(int32 clocks)
{
 static const unsigned MDRPhaseBias = 0 + 1;

 ClockCounter += clocks;

 if(ClockCounter > EventCycles)
 {
  //if(MDRPhase != 0)
  // printf("SNORT: %d\n", ClockCounter);
  ClockCounter = EventCycles;
 }

 switch(MDRPhase + MDRPhaseBias)
 {
  for(;;)
  {
   InCommand = false;

   // Wait for, then fetch, the next command word.
   {
    {
     case 1:
      if(!(InFIFO.in_count))
      {
       MDRPhase = 2 - MDRPhaseBias - 1;
       return;
      }
    }
    Command = InFIFO.Read();
   }

   InCommand = true;

   // Charge one cycle, then wait until the cycle budget is positive.
   {
    ClockCounter -= (1);
    {
     case 3:
      if(!(ClockCounter > 0))
      {
       MDRPhase = 4 - MDRPhaseBias - 1;
       return;
      }
    }
   }

   //printf("****************** Command: %08x, %02x\n", Command, Command >> 29);

   //
   // Command 1: decode image data.
   //
   if(((Command >> 29) & 0x7) == 1)
   {
    InCounter = Command & 0xFFFF;
    OutFIFO.Flush();
    //OutBuffer.Flush();

    PixelBufferCount32 = 0;
    CoeffIndex = 0;

    if((Command >> 27) & 2)
     DecodeWB = 0;
    else
     DecodeWB = 2;

    switch((Command >> 27) & 0x3)
    {
     case 0:
     case 1: RAMOffsetWWS = 0; break;
     case 2: RAMOffsetWWS = 6; break;
     case 3: RAMOffsetWWS = 4; break;
    }
    RAMOffsetY = 0;
    RAMOffsetCounter = RAMOffsetWWS;

    // InCounter counts down and wraps to 0xFFFF after the last word.
    InCounter--;
    do
    {
     uint32 tfr;
     int32 need_eat;	// = 0;

     // Wait for, then fetch, one 32-bit word (two 16-bit data words).
     {
      {
       case 5:
        if(!(InFIFO.in_count))
        {
         MDRPhase = 6 - MDRPhaseBias - 1;
         return;
        }
      }
      tfr = InFIFO.Read();
     }
     InCounter--;

     // printf("KA: %04x %08x\n", InCounter, tfr);

     need_eat = 0;
     PixelBufferCount32 = 0;
     WriteImageData(tfr, &need_eat);
     WriteImageData(tfr >> 16, &need_eat);

     // Charge the decode cost, then wait until the cycle budget is positive.
     {
      ClockCounter -= (need_eat);
      {
       case 7:
        if(!(ClockCounter > 0))
        {
         MDRPhase = 8 - MDRPhaseBias - 1;
         return;
        }
      }
     }

     // Drain the encoded pixels into the output FIFO, waiting for room.
     PixelBufferReadOffset = 0;
     while(PixelBufferReadOffset < PixelBufferCount32)
     {
      {
       {
        case 9:
         if(!(OutFIFO.CanWrite()))
         {
          MDRPhase = 10 - MDRPhaseBias - 1;
          return;
         }
       }
       OutFIFO.Write(LoadU32_LE(&PixelBuffer.pix32[PixelBufferReadOffset++]));
      }
     }
    } while(InCounter != 0xFFFF);
   }
   //
   // Command 2: load quantisation matrix/matrices (0x10 words, or 0x20 when
   // bit 0 of Command is set), four bytes per word.
   //
   else if(((Command >> 29) & 0x7) == 2)
   {
    QMIndex = 0;
    InCounter = 0x10 + ((Command & 0x1) ? 0x10 : 0x00);

    InCounter--;
    do
    {
     uint32 tfr;

     {
      {
       case 11:
        if(!(InFIFO.in_count))
        {
         MDRPhase = 12 - MDRPhaseBias - 1;
         return;
        }
      }
      tfr = InFIFO.Read();
     }
     InCounter--;

     //printf("KA: %04x %08x\n", InCounter, tfr);

     for(int i = 0; i < 4; i++)
     {
      QMatrix[QMIndex >> 6][QMIndex & 0x3F] = (uint8)tfr;
      QMIndex = (QMIndex + 1) & 0x7F;
      tfr >>= 8;
     }
    } while(InCounter != 0xFFFF);
   }
   //
   // Command 3: load the IDCT matrix (0x20 words, two 16-bit entries per
   // word), stored transposed and arithmetically shifted right by 3.
   //
   else if(((Command >> 29) & 0x7) == 3)
   {
    IDCTMIndex = 0;
    InCounter = 0x20;

    InCounter--;
    do
    {
     uint32 tfr;

     {
      {
       case 13:
        if(!(InFIFO.in_count))
        {
         MDRPhase = 14 - MDRPhaseBias - 1;
         return;
        }
      }
      tfr = InFIFO.Read();
     }
     InCounter--;

     for(unsigned i = 0; i < 2; i++)
     {
      IDCTMatrix[((IDCTMIndex & 0x7) << 3) | ((IDCTMIndex >> 3) & 0x7)] = (int16)(tfr & 0xFFFF) >> 3;
      IDCTMIndex = (IDCTMIndex + 1) & 0x3F;

      tfr >>= 16;
     }
    } while(InCounter != 0xFFFF);
   }
   else
   {
    // Any other command: only latch the low 16 bits into InCounter.
    InCounter = Command & 0xFFFF;
   }
  } // end for(;;)
 }
}

//
// DMA write into the MDEC: queues V in the input FIFO (dropped silently when
// the FIFO is full) and runs the state machine.
//
void MDEC_DMAWrite(uint32 V)
{
 if(!InFIFO.CanWrite())
  return;

 InFIFO.Write(V);
 MDEC_Run(0);
}

//
// DMA read from the MDEC.
//
//  offs: receives a word offset derived from RAMOffsetY and RAMOffsetWWS
//        (0 when the output FIFO is empty).
//        NOTE(review): presumably applied by the DMA code to the destination
//        address -- confirm against the caller.
//
// Returns the next word of the output FIFO, or 0 when it is empty.
//
uint32 MDEC_DMARead(uint32* offs)
{
 uint32 V = 0;

 *offs = 0;

 if(MDFN_LIKELY(OutFIFO.in_count))
 {
  V = OutFIFO.Read();

  *offs = (RAMOffsetY & 0x7) * RAMOffsetWWS;

  if(RAMOffsetY & 0x08)
  {
   *offs = (*offs - RAMOffsetWWS*7);
  }

  RAMOffsetCounter--;
  if(!RAMOffsetCounter)
  {
   RAMOffsetCounter = RAMOffsetWWS;
   RAMOffsetY++;
  }

  MDEC_Run(0);
 }

 return(V);
}

//
// True when the input FIFO has room for at least 0x20 words, bit 30 of
// Control is set, a command is in progress and it still expects data
// (InCounter != 0xFFFF).
//
bool MDEC_DMACanWrite(void)
{
 return((InFIFO.CanWrite() >= 0x20) && (Control & (1U << 30)) && InCommand && InCounter != 0xFFFF);
}

//
// True when the output FIFO holds at least 0x20 words and bit 29 of Control
// is set.
//
bool MDEC_DMACanRead(void)
{
 return((OutFIFO.in_count >= 0x20) && (Control & (1U << 29)));
}

//
// CPU write to an MDEC register.
//
//  timestamp: unused here.
//  A:         register address; bit 2 selects the control register, otherwise
//             the word goes to the command/data input FIFO.
//  V:         value written.
//
// Control register: bit 31 resets the command processor and flushes both
// FIFOs; the low 31 bits are stored in Control.
//
void MDEC_Write(const int32_t timestamp, uint32 A, uint32 V)
{
 //PSX_WARNING("[MDEC] Write: 0x%08x 0x%08x, %d --- %u %u", A, V, timestamp, InFIFO.in_count, OutFIFO.in_count);
 if(A & 4)
 {
  if(V & 0x80000000)	// Reset?
  {
   MDRPhase = 0;
   InCounter = 0;
   Command = 0;
   InCommand = false;

   PixelBufferCount32 = 0;
   ClockCounter = 0;
   QMIndex = 0;
   IDCTMIndex = 0;

   QScale = 0;

   memset(Coeff, 0, sizeof(Coeff));
   CoeffIndex = 0;
   DecodeWB = 0;

   InFIFO.Flush();
   OutFIFO.Flush();
  }
  Control = V & 0x7FFFFFFF;
 }
 else
 {
  if(InFIFO.CanWrite())
  {
   InFIFO.Write(V);

   if(!InCommand)
   {
    // Make sure a newly written command has at least one cycle available.
    if(ClockCounter < 1)
     ClockCounter = 1;
   }
   MDEC_Run(0);
  }
 }
}

//
// CPU read from an MDEC register.
//
//  timestamp: unused here.
//  A:         register address; bit 2 selects the status register, otherwise
//             the next output FIFO word is returned (0 when empty).
//
// Status register layout as assembled below:
//  bit 31     output FIFO empty
//  bit 30     input FIFO full
//  bit 29     command in progress
//  bit 28     MDEC_DMACanWrite()
//  bit 27     MDEC_DMACanRead()
//  bits 23-26 bits 25-28 of Command
//  bits 0-15  InCounter
//
uint32 MDEC_Read(const int32_t timestamp, uint32 A)
{
 uint32 ret = 0;

 if(A & 4)
 {
  ret = 0;

  // Flags are widened to uint32 before shifting so that bit 31 is never
  // produced by shifting a signed int.
  ret |= (uint32)(OutFIFO.in_count == 0) << 31;
  ret |= (uint32)(InFIFO.CanWrite() == 0) << 30;
  ret |= (uint32)InCommand << 29;

  ret |= (uint32)MDEC_DMACanWrite() << 28;
  ret |= (uint32)MDEC_DMACanRead() << 27;

  ret |= ((Command >> 25) & 0xF) << 23;

  // Needs refactoring elsewhere to work right: ret |= ((DecodeWB + 4) % 6) << 16;

  ret |= InCounter & 0xFFFF;
 }
 else
 {
  if(OutFIFO.in_count)
   ret = OutFIFO.Read();
 }

 //PSX_WARNING("[MDEC] Read: 0x%08x 0x%08x -- %d %d", A, ret, InputBuffer.in_count, InCounter);

 return(ret);
}