; ---- PPU register state -------------------------------------------------
P_SYSCTRL	db 0
P_DISPCTRL	db 0
P_STATUS	db 0
PPU_READ_BUFFER	db 0
P_SCANLINE	dw 0
vseg		dw 0xA000	; VGA video segment
; vseg has to stay directly behind P_SCANLINE: ShowScanline fetches both
; words at once with "les di, [P_SCANLINE]".
P_SPRINPOS	dw 0
P_SPRRENPOS	dw 0

section .text

; PPU_tick -- advance the PPU by one pixel.
; A name declared as "EQU $-n" is a variable stored in the immediate operand
; of the instruction right before it, so that instruction must keep its form.
PPU_tick:
	mov	al, 0
	P_VBL_STATE EQU $-1
	test	al, al
	jz	.VBLstate0
	jl	.VBLminus
.VBLplus:				; positive counter: runs down to 0
	cmp	al, 2
	jne	.VBLcountDown
	or	byte [P_STATUS], 0x80	; set invblank flag
.VBLcountDown:
	dec	ax
	mov	[P_VBL_STATE], al
	jmp	.VBLhandled
.VBLminus:				; negative counter: runs up to 0
	cmp	al, -5
	jne	.VBLcountUp
	mov	byte [P_STATUS], 0x00	; at -5 the whole status byte is cleared
.VBLcountUp:
	inc	ax
	mov	[P_VBL_STATE], al
	jmp	.VBLhandled
.VBLstate0:
	; NMI = status & sysctrl & 80h
	mov	al, [P_STATUS]
	and	al, [P_SYSCTRL]
	and	al, 80h
	;shr al, 7
	mov	[C_NMI], al
.VBLhandled:
	cmp	word [P_SCANLINE], 240
	jge	.NoRendering
	; -- rendering --
	; if(reg.ShowBGSP) rendering_tick();
	mov	al, [P_DISPCTRL]
	test	al, 3 << 3
	jz	.SkipRenderingTick
	call	PPU_RenderingTick
.SkipRenderingTick:
	; if(scanline >= 0 && x < 256) render_pixel();
	cmp	word [P_SCANLINE], 0
	jl	.RenderingDone
	cmp	byte [P_X+1], 0		; high byte of x is zero <=> x < 256
	jnz	.RenderingDone
	call	PPU_RenderPixel
.RenderingDone:
	; -- end rendering --
.NoRendering:
	inc	word [P_X]
	; NTSC phase is incremented by 8 after every pixel,
	; rendered or not.
; PPU_tick, second half: step the NTSC phase, then deal with the end of a scanline.
	mov	bl, 0
	P_NTSC_PHASE EQU $-1
	call	NTSC_phase_inc_bl
	mov	[P_NTSC_PHASE], bl

	mov	ax, [P_X]
	cmp	ax, 341
	P_SCANLINE_END EQU $-2
	jb	.ScanlineUnfinished

	; The scanline is complete: restore the default length and rewind x.
	mov	word [P_SCANLINE_END], 341
	mov	[P_X], word 0
	mov	ax, [P_SCANLINE]
	cmp	ax, 240			; unsigned test: scanline -1 (0xFFFF) is skipped too
	jae	.SkipShow
	call	ShowScanline
	mov	ax, [P_SCANLINE]
.SkipShow:
	inc	ax
	mov	[P_SCANLINE], ax
	cmp	ax, 241
	je	.VBLbegin
	test	ax, ax
	je	.EndOfPreRenderLine
	cmp	ax, 261
	jl	.ScanlineUnfinished
.VBLend:mov	[P_VBL_STATE], byte -5
	mov	[P_SCANLINE], word -1
	xor	[P_EVENODD], byte 1
%if 0
	mov	ax, 0
	P_FRAMECOUNT EQU $-2
	dec	ax
	jns	.NotReset
	mov	ax, 0
	P_FRAME_SKIP EQU $-2
.NotReset:
	mov	[P_FRAMECOUNT], ax
%endif
	jmp	short .ScanlineUnfinished
.VBLbegin:
	mov	[P_VBL_STATE], byte 2
	jmp	short .ScanlineUnfinished
.EndOfPreRenderLine:
	; FIXME: This should happen at x=304 (small timing difference):
	test	byte [P_DISPCTRL], 15*2
	jz	.ScanlineUnfinished
	; Only do vaddr=taddr if _some_ part of rendering is enabled
	mov	ax, [TADDR_BUF]
	mov	[VADDR_BUF], ax
	; The 340-length scanline also happens only if rendering is enabled
	mov	ax, word 341
	sub	al, byte 0		; the operand is the even/odd toggle (0 or 1)
	P_EVENODD EQU $-1
	mov	[P_SCANLINE_END], ax
.ScanlineUnfinished:
	ret

; PPU_write -- store to a PPU register.
;   dl = byte to write
;   ax = register index (0..7), already clamped
PPU_write:
	mov	[PPU_OPEN_BUS], dl
	; Compare-based dispatch: each cmp serves the index below it and itself.
	cmp	ax, 1
	jb	.WriteSysCtrl		; 0
	je	.WriteDispCtrl		; 1
	cmp	ax, 3
	je	.WriteOAMaddr		; 3
	jb	.WriteNothing		; 2
	cmp	ax, 5
	jb	.WriteOAM		; 4
	je	.WriteScroll		; 5
	cmp	ax, 7
	je	.WriteMemory		; 7
	jb	.WriteMemoryPosition	; 6
.WriteNothing:
	ret

.WriteSysCtrl:
	mov	[P_SYSCTRL], dl
	; scroll.basenta = reg.BaseNTA
	mov	ax, [TADDR_BUF]
	and	ax, ~(3 << 10)
	and	dx, 3
	shl	dx, 10
	or	ax, dx
	mov	[TADDR_BUF], ax
	ret

.WriteDispCtrl:
	mov	[P_DISPCTRL], dl
	; Build attenuation mask for generating
	; the color de-emphasis bits in NTSC signal.
	; Bits 7, 6 and 5 of the written value are shifted out one at a time;
	; each set bit contributes its own mask.
	xor	eax, eax
	shl	dl, 1
	jnc	.EmphBit7Off
	or	eax, 00111111000000111111000000111111b
.EmphBit7Off:
	shl	dl, 1
	jnc	.EmphBit6Off
	or	eax, 11110000001111110000001111110000b
.EmphBit6Off:
	shl	dl, 1
	jnc	.EmphBit5Off
	or	eax, 00000011111100000011111100000011b
.EmphBit5Off:
	mov	[P_ATTENUATION_MASK], eax
	ret

.WriteOAMaddr:
	mov	[P_OAMADDR], dl
	ret

.WriteOAM:
; Body of PPU_write.WriteOAM: store dl at OAM[OAMaddr], then bump the address byte.
	push	bx
	mov	bx, word 0x0000
	P_OAMADDR EQU $-2
	mov	[OAM+bx], dl
	inc	byte [P_OAMADDR]
	pop	bx
	ret

.WriteScroll:
	mov	al, [P_OFFSETTOGGLE]
	test	al, al
	mov	ax, [TADDR_BUF]		; mov leaves the flags from the test intact
	jnz	.ScrollY
.ScrollX:
	; Set xfine and xcoarse
	and	al, ~31
	mov	dh, dl
	shr	dh, 3
	or	al, dh			; xcoarse = dl >> 3
	and	dx, 7
	mov	[XFINE], dx		; xfine = dl & 7
	jmp	short .ScrollStore
.ScrollY:
	; Set yfine and ycoarse
	and	ax, ~((7 << 12) | (31 << 5))
	mov	dh, dl
	and	dh, 7			; yfine = dl&7
	shl	dh, 4
	or	ah, dh
	shl	dx, 2			; ycoarse = dl>>3 (placed at bitpos 5)
	and	dx, 31<<5
	or	ax, dx			; y
	;jmp short .ScrollStore
.ScrollStore:
	mov	[TADDR_BUF], ax
	jmp	short .DidWriteToggle

.WriteMemoryPosition:
	;pusha
	; mov dx, .WMP
	; mov ah, 9
	; int 21h
	;popa
	;pusha
	; mov al, [PPU_OPEN_BUS]
	; call PrintHexByte
	; call PrintNewline
	; jmp .WMP2
	; .WMP: db 'Set mem pos $'
	; .WMP2:
	;popa
	mov	al, byte 00h
	P_OFFSETTOGGLE EQU $-1
	test	al, al
	jnz	.AddrLow
.AddrHigh:
	; First write: high byte of taddr (6 bits kept).
	and	dl, 0x3F
	mov	[TADDR_BUF+1], dl
	jmp	.DidWriteToggle
.AddrLow:
	; Second write: low byte of taddr, then the whole word goes to vaddr.
	mov	ah, [TADDR_BUF+1]
	mov	al, dl
	mov	[TADDR_BUF+0], al
	mov	[VADDR_BUF+0], ax
	;jmp .DidWriteToggle
.DidWriteToggle:
	not	byte [P_OFFSETTOGGLE]
	;pusha
	; mov dx, .WMP3
	; mov ah, 9
	; int 21h
	;popa
	;pusha
	; mov ax, [VADDR_BUF]
	; call PrintHexWord
	; call PrintNewline
	; jmp .WMP4
	; .WMP3: db 'Begets $'
	; .WMP4:
	;popa
	ret

.WriteMemory:
	push	bx
	push	dx
	mov	ax, [VADDR_BUF]
	call	PPU_mmap		; bx = t
	jc	.DoneWriteMemory
	; Don't write to read-only memory (0000-1FFF). 2000-3FFF is ok.
; PPU_write.WriteMemory, continued: ax = PPU address, bx = mapped pointer.
	cmp	ax, 0x3F00
	jae	.PaletteWrite
	mov	al, [PPU_OPEN_BUS]	; the byte being written was latched here on entry
	mov	[bx], al
	jmp	short .DoneWriteMemory
.PaletteWrite:
	mov	bx, ax
	test	al, 3
	jnz	.PalIndexOk
	and	bx, 0x0F		; x10, x14, x18 and x1C must be mirrors of x00, x04, x08 and x0C
.PalIndexOk:
	and	bx, 0x1F
	mov	al, [PPU_OPEN_BUS]
	and	al, 0x3F
	mov	[PALETTE+bx], al
	; pusha
	; mov dx, .PPPT
	; mov ah, 9
	; int 21h
	; mov al, bl
	; call PrintHexByte
	; mov dl, ' '
	; call ConsolePutc
	; mov al, [PPU_OPEN_BUS]
	; and al, 0x3F
	; call PrintHexByte
	; call PrintNewline
	; jmp .PPPT2
	; .PPPT: db 'Wrote palette $'
	; .PPPT2:
	; popa
.DoneWriteMemory:
	call	PPU_IncAddr
	pop	dx
	pop	bx
	ret

; PPU_read -- load from a PPU register.
;   ax = register index (0..7), already clamped
;   Returns the value in al.
PPU_read:
	cmp	ax, 2
	jb	.DoneRead		; 0, 1
	je	.ReadStatus		; 2
	cmp	ax, 4
	jb	.DoneRead		; 3
	je	.ReadOAM		; 4
	cmp	ax, 7
	je	.ReadMemory		; 7; indices 5 and 6 fall through
.DoneRead:
	mov	al, 0x00		; the immediate is the open-bus latch
	PPU_OPEN_BUS EQU $-1
	ret

.ReadOAM:
	push	bx
	mov	bx, [P_OAMADDR]
	mov	al, [OAM+bx]
	mov	[PPU_OPEN_BUS], al	; FIXME: For %4=2, update only &0xE3
	pop	bx
	ret

.ReadStatus:
	mov	al, [PPU_OPEN_BUS]
	and	al, 0x1F		; low five bits come from the open bus
	or	al, [P_STATUS]
	mov	[PPU_OPEN_BUS], al
	and	byte [P_STATUS], 0x7F	; clear invblank flag
	mov	byte [P_OFFSETTOGGLE], 0
	cmp	[P_VBL_STATE], byte -5
	je	.KeepVBLstate
	mov	[P_VBL_STATE], byte 0	; this may also cancel the setting of InVBlank
.KeepVBLstate:
	ret

.ReadMemory:
	push	bx
	push	dx
	mov	ax, [VADDR_BUF]
	call	PPU_mmap		; bx = t; ax = vaddr_raw
	mov	dl, [bx]
	xchg	dl, [PPU_READ_BUFFER]	; Put memory data in read-buffer; dl = old buffer contents
	; The read-buffer thing happens even if the address is of palette.
; PPU_read.ReadMemory, continued: ax = PPU address, dl = previous read-buffer byte.
	; pusha
	; pusha
	; mov dx, .BBB
	; mov ah, 9
	; int 21h
	; popa
	; call PrintHexWord
	; call PrintNewline
	; jmp .RBB2
	; .BBB: db 'Reads from $'
	; .RBB2:
	; popa
	cmp	ax, 0x3F00
	jae	short .PaletteRead
	mov	[PPU_OPEN_BUS], dl	; Old contents of read-buffer
	jmp	short .DoneReadMemory
.PaletteRead:
	mov	bx, PALETTE
	; mov [.PPPR3], al
	test	al, 3
	jnz	.PalIndexOk
	and	al, 0x0F		; x10, x14, x18 and x1C must be mirrors of x00, x04, x08 and x0C
.PalIndexOk:
	and	al, 0x1F
	xlatb				; mov al, [PALETTE+al]
	mov	ah, [PPU_OPEN_BUS]
	and	ax, 0xC03F		; update only &0x3F
	or	al, ah
	mov	[PPU_OPEN_BUS], al
	; pusha
	; mov dx, .PPPR
	; mov ah, 9
	; int 21h
	; mov al, byte 00h
	; .PPPR3 EQU $-1
	; call PrintHexByte
	; mov dl, ' '
	; call ConsolePutc
	; mov al, [PPU_OPEN_BUS]
	; call PrintHexByte
	; call PrintNewline
	; jmp .PPPR2
	; .PPPR: db 'Read palette $'
	; .PPPR2:
	; popa
.DoneReadMemory:
	call	PPU_IncAddr
	pop	dx
	pop	bx
	jmp	.DoneRead

; PPU_mmap -- translate a PPU address into a pointer.
;   In:  AX = address in PPU's memory
;   Out: BX = address of physical data in emulator's memory
;        CF = address is VROM (not writable)
;   Preserves AX LOW 14 BITS; PRESERVES CX-DI
PPU_mmap:
	and	ax, 0x3FFF
	test	ax, 0x2000
	jz	.VBankRead
	; Nametable read
	mov	bx, ax
	shr	bx, 10-1
	and	bx, 3*2			; bx = nametable number * 2 (word index into C_NTA)
	mov	bx, [C_NTA + bx]
	push	ax
	and	ax, 0x3FF
	add	bx, ax
	pop	ax
	clc				; clear carry flag (content is RAM)
	ret
.VBankRead:
	; FOR NOW, OUR GRANULARITY IS 8k (CNROM)
	mov	bx, ax
	;and bx, 0x1FFF
	add	bx, [C_VROMPAGE]
	stc				; set carry flag to indicate the content is ROM
	ret

; PPU_IncAddr -- advance vaddr by 1, or by 32 when bit 2 of P_SYSCTRL is set.
PPU_IncAddr:
	inc	word [VADDR_BUF]	; add 1
	test	byte [P_SYSCTRL], 4
	jz	.Done
	add	word [VADDR_BUF], 31	; 1 + 31 = add 32
.Done:
	ret

; PPU_RenderingTick -- one step of the fetch pipeline, selected by x % 8.
; EDX stays nonzero for the whole tick while in tile decode mode and is zero
; in sprite mode.
PPU_RenderingTick:
	mov	cx, 0x0000		; the immediate is the current x
	P_X EQU $-2
	mov	bx, cx
	; tile_decode_mode = x<256 || (x >= 320 && x < 336)
	shr	cx, 4			; cl = x/16
	mov	edx, 1
	shl	edx, cl			; edx = 1 << (x/16)
	and	edx, 0x10FFFF		;tile_decode_mode = edx<>0
	and	bx, 7			; x % 8
	shl	bx, 1
	jmp	[.Mod8Table + bx]
.Mod8Table:
	dw .Mod8_0, .Mod8_1, .Mod8_2, .Mod8_3, .Mod8_4, .Mod8_5, .Mod8_6, .Mod8_7

.Mod8_2:
	; Point to attribute table
	;ioaddr = 0x23C0 + 0x400*reg.vaddr_basenta + 8*(reg.vaddr_ycoarse/4) + (reg.vaddr_xcoarse/4);
	mov	ax, [VADDR_BUF]
	mov	cx, ax
	mov	bx, ax
	;shr bx, 7
	;and bx, 7
	;shl bx, 3 ; bx = (ycoarse/4)*8
	shr	bx, 4
	and	bx, (7<<3)		; bx = (ycoarse/4)*8
	shr	ax, 2
	and	ax, 7			; ax = xcoarse/4
	;add ax, bx
	and	cx, 0xC00		; cx = basenta*0x400
	lea	ax, [0x23C0 + ebx + eax]	; only the low 16 bits of the sum are kept
	add	ax, cx
	;add ax, 0x23C0
	mov	[P_IOADDR], ax
	test	edx, edx
	jnz	.Mod8_break
	;passthru if zero (sprite mode)
.Mod8_0:
	;Point to nametable
	mov	ax, word 0x0000		; the immediate is vaddr
	VADDR_BUF EQU $-2
	and	ax, 0xFFF
	or	ax, 0x2000
	mov	[P_IOADDR], ax
	; Reset sprite data
	xor	ax, ax
	mov	bx, [P_X]
	test	bx, bx
	jz	.Mod8_0_0
	cmp	bx, 256
	jnz	.Mod8_break
.Mod8_0_256:
	mov	[P_SPRRENPOS], al
	jmp	.Mod8_break
.Mod8_0_0:
	mov	[P_SPRINPOS], al
	mov	[P_SPROUTPOS], al
	mov	[P_OAMADDR], byte 0x00
	jmp	.Mod8_break

.Mod8_1:
	;Name table access
	; pat_addr = 0x1000*reg.BGaddr + 16*mmap(ioaddr) + reg.vaddr_yfine
	mov	ax, word 0x0000		; the immediate is ioaddr
	P_IOADDR EQU $-2
	call	PPU_mmap
	mov	bl, [bx]
	mov	bh, 0
	shl	bx, 4			; bx = 16*mmap(ioaddr)
	mov	al, [VADDR_BUF+1]
	shr	al, 4
	and	ax, 7			; ax = yfine
	add	bx, ax
	mov	al, 0
	mov	ah, [P_SYSCTRL]
	and	ah, 0x10		; ax = bgaddr*0x1000 (bgaddr happens to be sysctrl&0x10)
	add	ax, bx
	mov	[P_PATADDR], ax
	test	edx, edx
	jnz	.Mod8_1_continues
	; Not tile mode? Check special actions
	cmp	word [P_X], 257
	jne	.Mod8_break
	; copy xcoarse, basenta_h from scroll to vaddr
	mov	ax, word 0x0000		; the immediate is taddr
	TADDR_BUF EQU $-2
	and	ax, ((1 << 10) | (31 << 0))
	and	word [VADDR_BUF], ~((1 << 10) | (31 << 0))
	or	word [VADDR_BUF], ax
	jmp	.Mod8_break
.Mod8_1_continues:
	;// Push the current tile into shift registers.
	;// The bitmap pattern is 16 bits, while the attribute is 2 bits, repeated 8 times.
; PPU_RenderingTick.Mod8_1_continues, body: feed both background shift registers.
	;misc.bg_shift_pat  = (misc.bg_shift_pat  >> 16) + 0x00010000 * tilepat;
	;misc.bg_shift_attr = (misc.bg_shift_attr >> 16) + 0x55550000 * tileattr;
	mov	eax, [P_BG_SHIFT_PAT]
	mov	ax, word 0xAAAA		; the immediate is the latest tile pattern
	P_TILEPAT EQU $-2
	ror	eax, 16			; old high half moves down, new pattern moves up
	mov	[P_BG_SHIFT_PAT], eax

	mov	eax, [P_BG_SHIFT_ATTR]
	mov	bx, word 0xAAAA		; the immediate is the latest tile attribute (0..3)
	P_TILEATTR EQU $-2
	mov	si, bx			; bx+si = bx*2
	mov	ax, [.AttrTable + bx+si]
	ror	eax, 16
	mov	[P_BG_SHIFT_ATTR], eax
	jmp	.Mod8_break

section .const
.AttrTable:
	; This lookup table translates
	; a 2-bit value into 16-bit value
	; by duplicating it 8 times.
	dw 0000000000000000b
	dw 0101010101010101b
	dw 1010101010101010b
	dw 1111111111111111b
section .text

.Mod8_3:
	;attribute table access
	test	edx, edx
	jnz	.Mod8_3_tilemode
	cmp	word [P_X], 335
	jae	.Mod8_3_done
.Mod8_3_spritemode:
	; pat_addr = 0x1000 * reg.SPaddr
	mov	al, [P_SYSCTRL]
	and	ax, 8
	shl	ax, 9			; 8 << 9 = 0x1000
	mov	[P_PATADDR], ax
	mov	bx, [P_SPRRENPOS]	; sno
	cmp	bl, [P_SPROUTPOS]
	jae	.Mod8_3_done
	; Select sprite pattern instead of background pattern:
	; copy this sprite's fields from the OAM2 arrays to the OAM3 arrays.
	mov	al, [OAM2_sprindex + bx]
	mov	ah, [OAM2_attr + bx]
	mov	cl, [OAM2_x + bx]
	mov	[OAM3_sprindex + bx], al
	mov	[OAM3_attr + bx], ah
	mov	[OAM3_x + bx], cl
	; y = scanline - OAM2_y[sno]
	mov	al, [OAM2_y + bx]
	mov	ah, 0
	sub	ax, [P_SCANLINE]
	neg	ax			; ax = y
	mov	cx, [OAM2_index + bx]
	test	byte [P_SYSCTRL], 32	; 16-tall?
; PPU_RenderingTick.Mod8_3, sprite mode continued: ax = y, cx = tile index word,
; bx = sprite number; the flags still hold the "16-tall?" test.
	jz	.Mod8_3_sprite_8
.Mod8_3_sprite_16:
	; Deal with 16-tall sprites
	test	byte [OAM3_attr + bx], 0x80
	jz	.Upright16
	xor	al, 15			; vertical flip over 16 rows
.Upright16:
	shl	cx, 12
	and	cx, 0x1000		; bit 0 of the index selects the pattern table
	mov	[P_PATADDR], cx
	mov	cx, [OAM2_index + bx]
	and	cx, 0xFE
	jmp	short .AddTileOffset
.Mod8_3_sprite_8:
	; Deal with 8-tall sprites
	test	byte [OAM3_attr + bx], 0x80
	jz	.Upright8
	xor	al, 7			; vertical flip over 8 rows
.Upright8:
	mov	ch, 0			;and cx, 0xFF
.AddTileOffset:
	shl	cx, 4			; 16 bytes per tile
	add	[P_PATADDR], cx
.Mod8_3_sprite_done:
	mov	bx, ax			; bx, ax = y
	and	bx, 8
	shl	bx, 1			; bx = (y&8)*2
	and	ax, 7
	add	ax, bx			; ax = (y&7) + (y&8)*2
	add	[P_PATADDR], ax
	jmp	.Mod8_3_done

.Mod8_3_tilemode:
	;tileattr = (mmap(ioaddr) >> ((reg.vaddr_xcoarse&2) + 2*(reg.vaddr_ycoarse&2))) & 3
	mov	al, [VADDR_BUF]		; fedcba9876543210
	mov	cl, al			; .........4....2.
	and	cl, 2			; cl = xcoarse&2
	shr	al, 4
	and	al, 2<<1		; al = (ycoarse&2)*2
	add	cl, al			; cl = (xcoarse&2) + 2*(ycoarse&2)
	mov	ax, [P_IOADDR]
	call	PPU_mmap		; keeps cx
	mov	al, [bx]
	shr	al, cl
	;mov al, 2 ; TEST
	and	ax, 3
	mov	[P_TILEATTR], ax
	; Go to the next tile horizontally (and switch nametable if it wraps)
	;
	; Increment xcoarse (0..31 at bitpos 0).
	; If it wraps, toggle basenta_h (0..1 at bitpos 10).
	mov	ax, [VADDR_BUF]
	mov	cl, al
	inc	cx
	and	cl, 31
	jnz	.XcoarseDone
	xor	ah, 1 << (10-8)		; Toggle horizontal nametable index
.XcoarseDone:
	and	al, ~31
	or	al, cl
	cmp	word [P_X], 251
	jne	.Mod8_3_tilemode_done
	; At the edge of the screen, do the same but vertically
	;
	; Increment yfine (0..7 at bitpos 12).
	; If it wraps, increment ycoarse (0..31 at bitpos 5).
	; If ycoarse hits 30, set ycoarse=0
	;    and toggle basenta_v (0..1 at bitpos 11).
; PPU_RenderingTick.Mod8_3_tilemode, vertical step: ax = vaddr being updated.
	add	ah, 1 << (12-8)		; ++yfine
	and	ah, 0x7F		; drop the carry out of the 3-bit field
	test	ah, 7 << (12-8)
	jnz	.Mod8_3_tilemode_done	; yfine did not wrap
	; ++ycoarse
	mov	cx, ax
	add	cx, 1 << 5
	and	cx, 31 << 5
	cmp	cx, 30 << 5
	jne	.YcoarseDone
	xor	cx, cx			;ycoarse=0
	xor	ah, 1 << (11-8)		; Toggle vertical nametable index
.YcoarseDone:
	and	ax, ~(31 << 5)
	or	ax, cx
.Mod8_3_tilemode_done:
	mov	[VADDR_BUF], ax
	;jmp .Mod8_3_done
.Mod8_3_done:
	mov	[P_IOADDR], word 0xAAAA	; ioaddr = pat_addr (the immediate is pat_addr)
	P_PATADDR EQU $-2
	jmp	.Mod8_break

.Mod8_5:
	; Read first byte of tile pattern
	mov	ax, [P_IOADDR]
	call	PPU_mmap
	mov	al, [bx]
	mov	[P_TILEPAT], al
	jmp	.Mod8_break

.Mod8_7:
	; Read second byte of tile pattern
	mov	ax, [P_IOADDR]
	or	ax, 8
	call	PPU_mmap
	mov	ah, [bx]		; high byte (now read)
	mov	al, [P_TILEPAT]		; low byte (previously read)
	; interleave the bits of the two pattern bytes,
	; in three swap rounds over 4-bit, 2-bit and 1-bit groups
	mov	bx, ax
	mov	cx, ax
	and	ax, 0xF00F		; AAAAbbbbccccAAAA  FEDCBA9876543210
	and	bx, 0x0F00		;      becomes
	and	cx, 0x00F0		; AAAAccccbbbbAAAA  FEDC7654BA983210
	shr	bx, 4
	shl	cx, 4
	or	ax, bx
	or	ax, cx

	mov	bx, ax
	mov	cx, ax
	and	ax, 0xC3C3		; AAbbccAAAAbbccAA  FEDC7654BA983210
	and	bx, 0x3030		;      becomes
	and	cx, 0x0C0C		; AAccbbAAAAccbbAA  FE76DC54BA329810
	shr	bx, 2
	shl	cx, 2
	or	ax, bx
	or	ax, cx

	mov	bx, ax
	mov	cx, ax
	and	ax, 0x9999		; AbcAAbcAAbcAAbcA  FE76DC54BA329810
	and	bx, 0x4444		;      becomes
	and	cx, 0x2222		; AcbAAcbAAcbAAcbA  F.E.D.C.B.A.9.8.
	shr	bx, 1			;                    7 6 5 4 3 2 1 0
	shl	cx, 1
	or	ax, bx
	or	ax, cx
	mov	[P_TILEPAT], ax		; save 16-bit tile
	; When decoding sprites, save the sprite graphics and move to next sprite
	test	edx, edx
	jnz	.Mod8_break
	mov	bx, [P_SPRRENPOS]
	cmp	bl, [P_SPROUTPOS]
	jae	.Mod8_break
	inc	bx
	mov	[P_SPRRENPOS], bx
	shl	bx, 1
	mov	[OAM3_pattern + bx-2], ax	; -2 undoes the increment: slot of the sprite just decoded
.Mod8_6:
.Mod8_4:
.Mod8_break:
	; Sprite evaluation only runs for 64 <= x < 256.
	mov	ax, [P_X]
	cmp	ax, 64
	jb	.DoneRenderTick
	cmp	ax, 256
	jae	.DoneRenderTick
	; THIS PART USES SIMPLER CODE FROM YOUTUBE VIDEO
	; Rather than the complex one that supports
	; the crazy 9-sprite malfunction.
; PPU_RenderingTick, sprite evaluation: ax = x (64..255).
; Even x reads a byte from OAM; odd x processes the byte read before,
; choosing the action by the pre-increment OAM address & 3.
	mov	bx, [P_OAMADDR]
	test	ax, 1
	jz	.SpriteAccessOAM
	inc	byte [P_OAMADDR]
	and	ebx, 3
	mov	al, 0xAA		; the immediate is the byte latched on the previous tick
	P_SPR_DATA EQU $-1
	mov	si, word 0x0000		; the immediate is sproutpos
	P_SPROUTPOS EQU $-2
	jmp	[.SpriteCases + ebx*2]
.SpriteAccessOAM:
	mov	bh, 0
	mov	al, [OAM+bx]
	mov	[P_SPR_DATA], al
	jmp	.DoneRenderTick
.SpriteCases:
	dw .SpriteCase0, .SpriteCase1, .SpriteCase2, .SpriteCase3

.SpriteCase0:				; byte 0: Y coordinate
	cmp	byte [P_SPRINPOS], 64
	jae	.SpriteDone
	inc	byte [P_SPRINPOS]	; next sprite
	cmp	si, 8
	jae	.Already8
	mov	[OAM2_y+si], al
	mov	ah, [P_OAMADDR]
	mov	[OAM2_sprindex+si], ah
.Already8:
	; if(!(scanline >= y1 && scanline <  y2 ))
	; if(scanline < y1    || scanline >= y2 )
	mov	dx, [P_SCANLINE]
	; ax = y1
	mov	ah, 0
	cmp	dx, ax
	jl	.SpriteNotInRange
	; make y2
	mov	ah, [P_SYSCTRL]
	and	ah, 32			; 0 or 32
	shr	ah, 2			; 0 or 8
	add	ah, 8
	add	al, ah
	mov	ah, 0
	cmp	dx, ax
	jl	short .DoneRenderTick	; Sprite in range, will go to next case.
.SpriteNotInRange:
	add	byte [P_OAMADDR], 3	; skip the other three bytes of this sprite
	jmp	.SpriteCase3_cont
.SpriteCase1:				; byte 1: tile index
	cmp	si, 8
	jae	short .DoneRenderTick
	mov	[OAM2_index+si], al
	jmp	short .DoneRenderTick
.SpriteCase2:				; byte 2: attributes
	cmp	si, 8
	jae	short .DoneRenderTick
	mov	[OAM2_attr+ si], al
	jmp	short .DoneRenderTick
.SpriteCase3:				; byte 3: X coordinate
	cmp	si, 8
	jae	.SpriteOverflow
	mov	[OAM2_x+ si], al
	inc	word [P_SPROUTPOS]
	jmp	.SpriteCase3_cont
.SpriteOverflow:
	or	byte [P_STATUS], 0x20
.SpriteCase3_cont:
	cmp	byte [P_SPRINPOS], 2
	jne	short .DoneRenderTick
	mov	byte [P_OAMADDR], 8
.DoneRenderTick:
	ret
.SpriteDone:
	mov	byte [P_OAMADDR], 0
	ret

; PPU_RenderPixel -- produce the pixel at the current x of the current scanline.
; Combines the background shift registers with the sprites decoded for this
; line, maps the result through PALETTE and either stores the raw colour in
; NTSCline or hands it to the NTSC synthesizer.
PPU_RenderPixel:
	mov	cx, [P_X]
	mov	dh, cl
	add	dh, 8			; dh = u8(x+8); below 16 means x is in 0..7 or 248..255
	; xpos = ~((x&7) + (reg.taddr_xfine&7) + ((x&7) ? 8 : 0)) & 15
	and	cx, 7			; x&7
	jz	.zero
	or	cl, 8			; x&7 + ((x&7)?8:0)
.zero:
	add	cx, word 0xAAAA		;0-7 really (the immediate is xfine)
	XFINE EQU $-2
	not	cx
	and	cx, 15
	shl	cl, 1			; cl = xpos*2

	; showbg and showsp:
	;   showbg = ShowBG && (!edge || ShowBG8)
	;   showsp = ShowSP && (!edge || ShowSP8)
	; P_DISPCTRL bits: 2 = ShowBG8, 4 = ShowSP8, 8 = ShowBG, 16 = ShowSP.
	; The master enable (8 / 16) is tested first; the "8" bits only decide
	; whether the edge columns are drawn too.  (Previously the two roles were
	; swapped, so edge clipping never applied while BG/SP was enabled, and the
	; layer was drawn when only its "8" bit was set.)
	mov	al, [P_DISPCTRL]
	mov	dl, 0
	test	al, 8			; BG off = deny
	jz	.Showbg_false
	test	al, 2			; BG8 on = allow everywhere
	jnz	.Showbg_true
	cmp	dh, 16			; In edge without BG8 = deny
	jb	.Showbg_false
.Showbg_true:
	inc	dx			; dl&1 = showbg
.Showbg_false:
	test	al, 16			; SP off = deny
	jz	.Showsp_false
	test	al, 4			; SP8 on = allow everywhere
	jnz	.Showsp_true
	cmp	dh, 16			; In edge without SP8 = deny
	jb	.Showsp_false
.Showsp_true:
	inc	dx
	inc	dx			; dl&2 = showsp
.Showsp_false:

	; Pick a pixel from the shift registers, if BG is allowed
	xor	si, si			; si = pixel
	xor	di, di			; di = attr
	test	dl, 1
	jz	.BGdisabled
	mov	esi, dword 0xAAAAAAAA	; the immediate is the pattern shift register
	P_BG_SHIFT_PAT EQU $-4
	shr	esi, cl
	and	si, 3			; pixel
	jz	.BGchosen		; Keep zero attribute if pixel=0
	mov	edi, dword 0xAAAAAAAA	; the immediate is the attribute shift register
	P_BG_SHIFT_ATTR EQU $-4
	shr	edi, cl
	and	di, 3			; attr
	jmp	.BGchosen
.BGdisabled:
	mov	ax, [VADDR_BUF]
	push	ax
	and	ax, 0x3F00
	cmp	ax, 0x3F00
	pop	ax
	jne	.BGchosen
	test	byte [P_DISPCTRL], 2+4+8+16
	jnz	.BGchosen		; only set bg from palette if BG/BG8/SP/SP8 are all false
	mov	si, ax			; pixel
.BGchosen:
	test	dl, 2
	jz	.DoneRenderingSprites
	; Overlay the sprites
	xor	ebx, ebx
	not	bx			;mov bx, -1
.OverlaySpritesLoop:
	inc	bx
	cmp	bl, [P_SPRRENPOS]
	jae	.DoneRenderingSprites
	; Check if the sprite is horizontally in range
	mov	ax, [P_X]
	mov	ch, 0
	mov	cl, [OAM3_x + bx]
	sub	ax, cx			; xdiff = x - oam3_x[sno]
	cmp	ax, 8
	jae	.OverlaySpritesLoop	; ax = xdiff
	; Determine which pixel to display; skip transparent pixels
	test	[OAM3_attr+bx], byte 0x40
	jnz	.NoXflip
	xor	al, 7			; ax = 7-ax
.NoXflip:
	; spritepixel = (misc.OAM3_pattern[sno] >> (xdiff*2)) & 3
	mov	cl, al
	shl	cl, 1			; cl = xdiff*2
	mov	ax, [OAM3_pattern + ebx*2]
	shr	ax, cl
	and	ax, 3			; ax = spritepixel
	jz	.OverlaySpritesLoop	; spritepixel 0 is always transparent
	; Register sprite-0 hit if applicable
	cmp	word [P_X], 255		; x must be < 255
	jae	.NoSprite0hit
	test	si, si			; background pixel must be non-0
	jz	.NoSprite0hit
	cmp	byte [OAM3_sprindex + bx], 4	; sprite index must be 0
	jae	.NoSprite0hit
	or	byte [P_STATUS], 0x40	; set sp0hit flag
.NoSprite0hit:
	; Render the pixel unless behind-background placement wanted
	mov	cl, [OAM3_attr + bx]
	test	si, si
	jz	.DoRenderSpritePixel	; background=0? Render
	test	cl, 0x20
	jnz	.DoneRenderingSprites	; 0x20 not set? Render -- 0x20 set = don't render
.DoRenderSpritePixel:
	and	cx, 3			; attribute
	add	cx, 4
	mov	di, cx			; attr = (s.attr & 3) + 4
	mov	si, ax			; pixel = spritepixel
	; Only process the first non-transparent sprite pixel.
.DoneRenderingSprites:
	; map pixel through palette
	;mov di, 1*4+2
	lea	di, [esi + edi*4]	; pixel + attr*4 (only the low 16 bits of the sum are kept)
	and	di, 0x1F
	mov	al, [PALETTE+di]
	test	byte [P_DISPCTRL], 1
	jz	.Notgrayscale
	and	al, 0x30
.Notgrayscale:
	; Plot pixel (al=pixel, +use emphasis attributes)
	mov	di, [P_X]
NTSC_SYNTHESIS_DISABLE:
	jmp	short .DoNTSC		; REPLACED WITH 2*NOP if necessary
	; NO NTSC SIM: Just store the raw pixel.
	;mov ax, di
	add	al,16
	mov	[NTSCline + di], al
.DontGenerate:
	ret
.DoNTSC:
%if 0
	cmp	word [P_FRAMECOUNT], 0
	jnz	.DontGenerate
%endif
	; DO NTSC
	and	di, 0xFF		; Just to make sure we don't do buffer-overflow.
	jnz	.DontMakeBorders
	; Generate borders while at it.
	; Our NTSCline is 282*8 samples long. This means 26*8 is reserved for edges.
	; We are supposed to render 15 pixels of edge at left, 11 pixels of edge at right.
	push	di
	push	ax
	mov	al, [P_NTSC_PHASE]
	mov	bl, al
	cbw
	shl	ax, 2
	mov	[P_NTSC_PHASE_LINEBEGIN], ax	; phase at the start of the line, times 4
	;sub bl, 15*8 ; which is 10*12. No effect.
; PPU_RenderPixel, border generation continued: bl = phase at line start,
; ax and di are saved on the stack.
	mov	di, NTSCline
.LeftLoop:				; 15 border pixels in colour PALETTE[0]
	mov	al, [PALETTE+0]
	call	NTSC_synthesize		; advances di by 32 and bl by 8 (mod 12)
	cmp	di, NTSCline + 15*8*4
	jb	.LeftLoop
	mov	di, NTSCline + 15*8*4 + 256*8*4
	; Skip the phase over the 256 visible pixels: 256*8 mod 12 = 8.
	; This has to be reduced modulo 12 like every other phase step: with a
	; bare "add bl, 8" the phase could reach 19, and NTSC_synthesize would
	; then shift its 32-bit waveform by up to 31 bits, leaving fewer than
	; the 8 bits it consumes.  NTSC_phase_inc_bl preserves AX.
	call	NTSC_phase_inc_bl
.RightLoop:				; 11 border pixels
	mov	al, [PALETTE+0]
	call	NTSC_synthesize
	cmp	di, NTSCline + 282*8*4
	jb	.RightLoop
	pop	ax
	pop	di
.DontMakeBorders:
	shl	di, 3+2			; 8 floats per pixel, 4 bytes per float; 32 bytes per pixel
	add	di, NTSCline + 15*8*4
	mov	bl, [P_NTSC_PHASE]
	jmp	NTSC_synthesize		; tail-call

NTSC_synthesize_with_offset:
	lea	di, [NTSCline + ebx*4]
	;passthru
; NTSC_synthesize -- write the 8 signal samples of one pixel.
;   AL = palette colour: bits 0-3 colour, bits 4 and up level
;   DI = Pointer to NTSCline (Out: incremented by 8*4)
;   BL = NTSC phase          (Out: incremented by 8, modulo 12)
;   Uses AX, CX, EDX, BP, SI
NTSC_synthesize:
	movzx	dx, al			; level
	and	ax, 0x0F		; color
	shr	dl, 4
	cmp	al, 13
	jbe	.Not1415
	mov	dl, 1			; For colors 14..15, level 1 is forced.
.Not1415:
	;add di, 8*4 ; TEMPORARY
	;ret ; TEMPORARY
	; AX = color
	; DX = level
	lea	si, [NTSC_levels + edx*4]	; only the low 16 bits of the sum are kept
	; Level has been handled. What remains still is AX = color
	;  right: phase
	;  down:  color
	;   1 1 1 1 1 1 1 1 1 1 1 1
	;   1 1 1 1 1 0 0 0 0 0 0 1
	;   1 1 1 1 0 0 0 0 0 0 1 1
	;   1 1 1 0 0 0 0 0 0 1 1 1
	;   1 1 0 0 0 0 0 0 1 1 1 1
	;   1 0 0 0 0 0 0 1 1 1 1 1
	;   0 0 0 0 0 0 1 1 1 1 1 1
	;   0 0 0 0 0 1 1 1 1 1 1 0
	;   0 0 0 0 1 1 1 1 1 1 0 0
	;   0 0 0 1 1 1 1 1 1 0 0 0
	;   0 0 1 1 1 1 1 1 0 0 0 0
	;   0 1 1 1 1 1 1 0 0 0 0 0
	;   1 1 1 1 1 1 0 0 0 0 0 0
	;   0 0 0 0 0 0 0 0 0 0 0 0
	;   0 0 0 0 0 0 0 0 0 0 0 0
	;   0 0 0 0 0 0 0 0 0 0 0 0
	xor	dx, dx
	cmp	ax, 12
	ja	.BeginNTSCloop		; For colors 13..15, signal low is forced (000000000000)
	not	dx
	test	ax, ax
	jz	.BeginNTSCloop		; For color 0, signal high is forced (111111111111, from "not dx")
	add	al, bl			; color (1..12) + phase (0..11)
	;aam 12 ; modulo 12
	mov	cl, al
.mod12:
	; The constant repeats every 12 bits, so the shift acts as "modulo 12"
	; as long as at least 8 bits remain after it (shift count <= 24).
	mov	edx, 00111111000000111111000000111111b
	shr	edx, cl
.BeginNTSCloop:
	mov	ebp, dword 0		; the immediate is the attenuation mask
	P_ATTENUATION_MASK EQU $-4
	mov	cl, bl
	shr	ebp, cl
	mov	ecx, ebp
	cld
	; Using %rep and %endrep costs some ROM space, but it relieves CX as a register.
; NTSC_synthesize, sample loop: cx = attenuation bits, dx = signal high/low bits,
; si = NTSC_levels + 4*level.  Each round uses one bit of each and stores one float.
%rep 8
	xor	bp, bp			; also clears the carry flag
	; Determine whether to add 4*4 or not, by judging color & phase
	;      4 * (color <= 12 * ((color+phase)%12 < 6))
	; TODO: Determine whether to add 8*4 or not, by judging
	;       the color emphasis bits and the phase.
	rcr	cx, 1
	rcl	bp, 1			; Cf becomes 0x01
	rcr	dx, 1
	rcl	bp, 5			; previous Cf becomes 0x20, Cf becomes 0x10
	;      flag = (0451326 >> (phase/2*3)) & emphasisbits
	mov	eax, [bp+si]		; bp = 0x20*attenuated + 0x10*high
	stosd
%endrep
	;jmp NTSC_phase_inc_bl

; NTSC_phase_inc_bl -- bl = (bl + 8) mod 12.  AX is preserved.
NTSC_phase_inc_bl:
	add	bl, 8
	push	ax
	mov	ax, bx
	aam	12			; al = al mod 12
	mov	bl, al
	pop	ax
	ret

section .const
NTSC_levels:
	; Prenormalized values.
	; We don't support de-emphasis bits for now.
	; Calculated as:
	;   normalized_value = (%1 - 0.518) / (1.962 - 0.518)
	;   factored_value   = normalized_value * brightness / 12
	; with brightness = 1
	; Signal low, levels 0..3:
	dd -0.00969529	;0.350
	dd  0.00000000	;0.518
	dd  0.02562327	;0.962
	dd  0.05955679	;1.550
	; Signal high, levels 0..3:
	dd  0.03324100	;1.094
	dd  0.05701754	;1.506
	dd  0.08333333	;1.962
	dd  0.08333333	;1.962
	; The same, but attenuated by a factor of 0.746 before normalization
	dd -0.01482572
	dd -0.00759303
	dd  0.01152193
	dd  0.03683633
	dd  0.01720476
	dd  0.03494206
	dd  0.05457364
	dd  0.05457364
saturation:
	dd 1.7
bayer4x4:
	db  0, 12,  3, 15
	db  8,  4, 11,  7
	db  2, 14,  1, 13
	db 10,  6,  9,  5
; YIQ matrix multiplied by (16*5, 16*6 and 16*8)
;y_r dd 1.0
i_r	dd  0.946882
q_r	dd  0.623357
;y_g dd 1.0
i_g	dd -0.274788
q_g	dd -0.635691
;y_b dd 1.0
i_b	dd -1.108545
q_b	dd  1.709007

section .text
;VESA_Granularity_kB dw 0
VESA_Granularity_bytes dd 0

; ShowScanline -- send the finished scanline to the video memory at vseg.
ShowScanline:
%if 0
	cmp	word [P_FRAMECOUNT], 0
	jz	.DoRender
	ret
.DoRender:
%endif
	push	es
	; MODE-X:
	;   Memory position for (x,y) = (y*320+x)/4
	;   Into port 3C4h, put xx02h where xx = 1 << (x%4).
; ShowScanline, continued: ES has been pushed; choose the output path.
	xor	cx, cx			; Plane index
MODEX_RENDERING_ENABLE:
	jmp	short .ModeXrendering

.TrueColorRendering:
	xor	edi, edi
	xor	edx, edx
	les	di, [P_SCANLINE]	; di = scanline, es = vseg
	shl	edi, 8
	lea	edi, [edi + edi*4]	; y = 320*4*scanline = 1280*scanline = 256*scanline + 1024*scanline
	mov	eax, edi
	mov	ebp, edi
	div	dword [VESA_Granularity_bytes]
	; eax = bank number, edx = starting address in this bank
	; Figure out which granularity-page this scanline begins from
	mov	di, dx			; Modulo bytes = starting address
	call	.SetVESAbank
	; Figure out the beginning of the next bank
	inc	ax
	mul	dword [VESA_Granularity_bytes]	; eax = beginning of next bank
	xor	si, si
	cld
	; Figure out if a seam goes in the middle of this scanline
	sub	eax, ebp
	sar	eax, 2			; from dwords into pixels
	cmp	ax, 320
	jge	.AfterSeam
	mov	[.FirstLimit], ax	; patch the loop bound below
.BeforeSeam:
	call	DoOnePix
	stosd
	cmp	si, 320
	.FirstLimit EQU $-2
	jb	.BeforeSeam
	mov	ax, [.LastBank]
	inc	ax
	call	.SetVESAbank
	xor	di, di			; continue with new bank
	;jmp .skip
.AfterSeam:
	call	DoOnePix
	stosd
	cmp	si, 320
	jb	.AfterSeam
	pop	es
	ret

; Select VESA bank AX (int 10h, AX=4F05h, BX=0, DX=bank) unless it is
; the bank remembered from the previous call.  AX is preserved.
.SetVESAbank:
	cmp	ax, 0xAAAA		; the immediate is the last bank selected
	.LastBank EQU $-2
	je	.BankSet
	mov	[.LastBank], ax
	mov	dx, ax
	push	ax
	mov	ax, 0x4F05
	xor	bx,bx
	int	10h
	pop	ax
.BankSet:
	ret

.ModeXrendering:
AllPlanesLoop:
	les	di, [P_SCANLINE]	; di = scanline, es = vseg
	mov	ax, di
	and	ax, 3
	shl	ax, 2
	add	ax, bayer4x4
	mov	word [bayer_base], ax	; dither row = bayer4x4 + 4*(scanline & 3)
	shl	di, 4
	lea	di, [edi + edi*4]	; We'll begin at this address four times.
	add	di, byte 0		; When NTSC is not disabled, add 32-pix margin by offseting the coordinate.
	SCREEN_MARGIN EQU $-1
	mov	ax, 0x0102
	shl	ah, cl
	mov	dx, 0x3C4
	out	dx, ax			; Set plane index
	mov	si, cx			; Source pixel index (0..3), will cover to 320
	; TODO: Calculate 80 pixels (320/4), in groups of 4 (20 loops)
OnePlaneLoop:
	; Calculate four pixels at once before writing to VGA RAM
	call	DoOnePix
	call	DoOnePix
	call	DoOnePix
	call	DoOnePix
	; Send four pixels
	stosd
	cmp	si, 320
	RENDER_WIDTH EQU $-2
	jb	OnePlaneLoop
	inc	cx			; Go to next plane
	cmp	cl, 3
	jbe	AllPlanesLoop
	pop	es
	ret

DoOnePix:
	;db 0xBB ;mov bx,,..
; DoOnePix body: produce the pixel for source position si.
; The leading short jump is a patch point (NTSC_DECODE_DISABLE addresses it).
	jmp	short DecodeNTSCpixel
	NTSC_DECODE_DISABLE EQU $-2
	mov	al, [NTSCline + si]	; raw colour index straight from the line buffer
	jmp	DecodeNTSC_return

DecodeNTSCpixel:
	;mov dx, si
	;shr dx, 2
	;mov al, dl
	;add al, 16
	;;mov al, 4*(6*8) + 5*(8) + 0
	;jmp DecodeNTSC_return

	; Translate si (0..292) into begin,end (both 0..2047)
	;  Center = si*2048/292+4
	;  Begin  = Center-6
	;  End    = Center+6
	;  Begin  = si*2048/292-2
	;  End    = si*2048/292+10
	mov	bx, si			; bx+si = si*2
	mov	bx, word [xbegins +bx+si]	; Begin
	push	eax			; backup eax (the four pixels)
	; bp = sintable + 4*(bx % 12)
	; bx is already pre-multiplied by 4.
	lea	ax, [bx + 24*4 + 4*4]
	add	ax, word 0x1111		; the immediate is the line-start phase, times 4
	P_NTSC_PHASE_LINEBEGIN EQU $-2
	cwd
	mov	bp, 12*4
	div	bp			; modulo 12 (times 4)
	add	bx, NTSCline
	lea	bp, [sincos + edx]	; only the low 16 bits of the sum are kept
	lea	dx, [bx + 12*4]		; End
	cmp	bx, NTSCline
	jge	.BeginOk
	mov	bx, NTSCline		; do not start before the buffer
.BeginOk:
	; y=i=q=0
	; while(bx < dx)
	; {
	;    value = signal[bx] * factor
	;    y += value
	;    value *= saturation
	;    i += value * cos((pi/6) * (phase+bx))
	;    q += value * sin((pi/6) * (phase+bx))
	; }
	call	NTSCdecodeIntoYUV
MODEX_DECODE_ENABLE:
	jmp	short .MakePalettedVersion
	nop
	nop
.MakeTrueColorVersion:
	pop	ebx			; the eax saved above
	call	NTSCdecodeMakeR
	call	.TrueColorHelper
	call	NTSCdecodeMakeG
	call	.TrueColorHelper
	call	NTSCdecodeMakeB
	call	.TrueColorHelper
	xchg	ebx, eax
	inc	si			; Jump 1 pixel ahead
	ret
.TrueColorHelper:
	; Convert st0 to 0..255 and shift it into ebx as the next byte.
	; The dd/dw after the call are inline arguments that the callee skips.
	call	FloatToPositiveIntWithClamp
	dd 255.49
	dw 255
	shl	ebx, 8
	mov	bl, al
	ret

.MakePalettedVersion:
	; ax accumulates (R*7 + G)*9 + B, with R below 4, G below 7, B below 9.
	; Each dd after NTSCdecodeMakeLinear is an inline argument that the callee skips.
	xor	ax, ax
	call	NTSCdecodeMakeR
	call	NTSCdecodeMakeLinear
	dd 64.0				; 16*4
	mov	bx, 4
	call	YIQcalc
	call	NTSCdecodeMakeG
	call	NTSCdecodeMakeLinear
	dd 112.0			; 16*7
	mov	bx, 7
	mul	bx
	;mov bx, ax
	;sal ax, 3
	;sub ax, bx
	;mov bx, 7
	;shl ax, 8
	;aad 7
	call	YIQcalc
	call	NTSCdecodeMakeB
	call	NTSCdecodeMakeLinear
	dd 144.0			; 16*9
	mov	bx, 9
	mul	bx
	;mov bx, ax
	;sal ax, 3
	;add ax, bx
	;mov bx, 9
	;shl ax, 8
	;aad 9
	call	YIQcalc
.Bypass:
	;mov ax, si;4*(6*8) + 5*(8) + 0 ;test, should make a yellow pixel
	add	al, 4
	xchg	bx, ax			;lea bx, [eax+4]
	pop	eax			; the four pixels saved above
	mov	al, bl
DecodeNTSC_return:
	add	si, 4			; jump 4 pixels ahead
	ror	eax, 8			; make room for the next pixel in al
	ret
; NTSCdecodeIntoYUV -- accumulate luma and the two chroma sums over signal samples.
;   In:  bx = first sample (dword floats), dx = end of samples,
;        bp = current cell of the sincos table
;   Out: st0 = y, st1 = i (cosine sum), st2 = q (sine sum)
;        bx, bp advanced past the processed samples
NTSCdecodeIntoYUV:
	fld	dword [saturation]
	fldz				; q (sine sum), ends up deepest of the three
	fldz				; i (cosine sum)
	fldz				; y
	jmp	.Test
.Sample:
	; here: st0 = y, st1 = i, st2 = q, st3 = saturation
	fld	dword [bx]
	add	bx, 4			; next sample from scanline
	fadd	to st1			;y(st1) += value
	fmul	st4			;value(st0) *= saturation
	fld	dword [bp + 12]		; cos(x) = sin(x+3) when unit is pi/6
	fmul	st1			;st0 = cos()*value
	faddp	st3			;i += st0
	fmul	dword [bp]		;value*=sin
	add	bp, 4			; next cell in sincos table
	faddp	st3			;q += st0
.Test:
	cmp	bx, dx
	jb	.Sample
	fstp	st3			; forget the dummy saturation value (y takes its slot)
	fxch	st1
	fxch	st2			; now st0 = y, st1 = i, st2 = q
	ret

; NTSCdecodeMakeR / G / B -- push y + i*i_X + q*q_X computed from
; st0 = y, st1 = i, st2 = q.  R and G leave y, i, q below the result;
; B consumes them.
NTSCdecodeMakeR:
	fld	dword [i_r]
	fmul	st2
	fadd	st1
	fld	dword [q_r]
	fmul	st4
	faddp	st1
	ret

NTSCdecodeMakeG:
	fld	dword [i_g]
	fmul	st2
	fadd	st1
	fld	dword [q_g]
fmul_st4_faddp_st1_return:
	fmul	st4
	faddp	st1
	ret

NTSCdecodeMakeB:
	fxch	st1
	fmul	dword [i_b]
	faddp	st1
	fxch	st1
	fmul	dword [q_b]
faddp_st1_return:
	faddp	st1
	ret

; NTSCdecodeMakeLinear -- st0 = (st0 > 0) ? st0*st0 * scale : 0.
; The scale is a dword float placed right after the call instruction;
; execution resumes behind it.  AX is preserved; BX and BP are clobbered.
NTSCdecodeMakeLinear:
	pop	bp			; bp -> inline scale
	; Convert gamma-corrected RGB into linear RGB
	; For simplicity, we assume gamma of 2.0. It's close enough.
	ftst
	xchg	bx,ax			; keep ax out of the way of fnstsw
	fnstsw	ax
	test	ah, 0x45		; C3|C2|C0 all clear <=> st0 > 0
	xchg	bx,ax
	je	.Positive
	fstp	st0			; Replace value with zero
	fldz
	add	bp, 4
	jmp	bp
.Positive:
NTSC_DECODE_POWER2:
	fmul	st0			; x^2
	fmul	dword [bp]
	add	bp, 4
	jmp	bp

; FloatToPositiveIntWithClamp -- pop st0 and return it in AX as
; min(round(st0 * scale), limit), or 0 when st0 is not positive.
; Inline arguments after the call: dd scale, dw limit.  BP is clobbered.
FloatToPositiveIntWithClamp:
	pop	bp			; bp -> inline arguments
	ftst
	fnstsw	ax
	test	ah, 0x45		; any of C3|C2|C0 set <=> st0 is not > 0
	jne	.NonPositive
	fmul	dword [bp]
	fistp	word [.temp]		; written into the immediate of the mov below
	fwait
	mov	ax, 0
	.temp EQU $-2
	cmp	ax, [bp+4]
	jbe	short .Resume
	mov	ax, [bp+4]		; clamp (unsigned compare)
.Resume:
	add	bp,6
	jmp	bp
.NonPositive:
	fstp	st0
	xor	ax, ax
	jmp	short .Resume

; YIQcalc -- pop st0 as an integer, add the dither value for column si&3,
; divide by 16, limit the result to bx-1 and add it to ax.
;   In:  st0 = channel value, bx = number of levels, ax = accumulator, si = x
;   Out: ax += level; EDX and BP are clobbered
YIQcalc:
	fistp	dword [YIQ_temp]	; written into the immediate of the add below
	xor	edx, edx
	mov	bp, si
	and	bp, 3
	mov	dl, [bayer4x4 + bp]	; the displacement is patched per scanline
	bayer_base EQU $-2
	;and dl, 0x08
	; Our video is not RGB, and our palette is not
	; the NES palette. We use dithering to compensate.
	fwait
	add	edx, dword 0xAAAAAAAA
	YIQ_temp equ $-4
	sar	dx, 4			; Divide by 16
	cmp	dx, bx
	jb	.InRange
	lea	dx, [bx-1]
.InRange:
	add	ax, dx
	;.ZeroPix:
	ret