; new count bit routine
;       part of this code is origined from
;       new GOGO-no-coda (1999, 2000)
;       Copyright (C) 1999 shigeo
;       modified by Keiichi SAKAI
;
; Syntax : NASM (Intel operand order), 32-bit x86 with MMX.
; Macros such as globaldef/externdef/segment_data/segment_code,
; PIC_OFFSETTABLE, PIC_BASE(), PIC_EBP_REL(), get_pc.bp, pmov and end
; are not defined in this file; they are expected to come from nasm.h.

%include "nasm.h"

        globaldef       choose_table_MMX
        ; NOTE(review): MMX_masking is exported here but no label of that
        ; name is defined in this file - confirm it is still required.
        globaldef       MMX_masking

        externdef       largetbl
        externdef       t1l
        externdef       table23
        externdef       table56

        segment_data
        align   16

; Constants for the ESC (max > 15) path, one qword each.
D14_14_14_14    dd      0x000E000E, 0x000E000E  ; four words of 14 (pcmpgtw threshold)
D15_15_15_15    dd      0xfff0fff0, 0xfff0fff0  ; saturating-add bias: min(ix,15)+0xfff0
; pmaddwd multipliers: each dword result is word0*low16 + word1*high16.
mul_add         dd      0x00010010, 0x00010010  ; index = x*16 + y
mul_add23       dd      0x00010003, 0x00010003  ; index = x*3  + y
mul_add56       dd      0x00010004, 0x00010004  ; index = x*4  + y

; tableDEF / tableABC: 8 bytes per entry, indexed by x*16+y (see from3).
;   dword 0 : two 16-bit sums packed (high half = sum1, low half = sum2)
;   dword 1 : sum for the base table
tableDEF:
        dd      0x00010003,0x01,0x00050005,0x05,0x00070006,0x07,0x00090008,0x08,0x000a0008, 0x09
        dd      0x000a0009,0x0a,0x000b000a,0x0a,0x000b000a,0x0b,0x000c000a,0x0a,0x000c000b, 0x0b
        dd      0x000c000b,0x0c,0x000d000c,0x0c,0x000d000c,0x0d,0x000d000c,0x0d,0x000e000d, 0x0e
        dd      0x000b000e,0x0e,0x00040005,0x04,0x00060005,0x06,0x00080007,0x08,0x00090008, 0x09
        dd      0x000a0009,0x0a,0x000b0009,0x0a,0x000b000a,0x0b,0x000b000a,0x0b,0x000c000a, 0x0b
        dd      0x000c000b,0x0b,0x000c000b,0x0c,0x000d000c,0x0c,0x000e000c,0x0d,0x000d000c, 0x0e
        dd      0x000e000d,0x0e,0x000b000d,0x0e,0x00070006,0x07,0x00080007,0x08,0x00090007, 0x09
        dd      0x000a0008,0x0a,0x000b0009,0x0b,0x000b0009,0x0b,0x000c000a,0x0c,0x000c000a, 0x0c
        dd      0x000d000a,0x0b,0x000c000b,0x0c,0x000d000b,0x0c,0x000d000c,0x0d,0x000d000c, 0x0d
        dd      0x000e000d,0x0e,0x000e000d,0x0f,0x000c000d,0x0f,0x00090007,0x08,0x00090008, 0x09
        dd      0x000a0008,0x0a,0x000b0009,0x0b,0x000b0009,0x0b,0x000c000a,0x0c,0x000c000a, 0x0c
        dd      0x000c000b,0x0c,0x000d000b,0x0c,0x000d000b,0x0d,0x000e000c,0x0d,0x000e000c, 0x0d
        dd      0x000e000c,0x0d,0x000f000d,0x0e,0x000f000d,0x0f,0x000d000d,0x0f,0x000a0008, 0x09
        dd      0x000a0008,0x09,0x000b0009,0x0b,0x000b0009,0x0b,0x000c000a,0x0c,0x000c000a, 0x0c
        dd      0x000d000b,0x0d,0x000d000b,0x0d,0x000d000b,0x0c,0x000e000b,0x0d,0x000e000c, 0x0d
        dd      0x000e000c,0x0e,0x000f000c,0x0e,0x000f000d,0x0f,0x000f000d,0x0f,0x000c000d, 0x10
        dd      0x000a0009,0x0a,0x000a0009,0x0a,0x000b0009,0x0b,0x000b000a,0x0c,0x000c000a, 0x0c
        dd      0x000d000a,0x0c,0x000d000b,0x0d,0x000e000b,0x0d,0x000d000b,0x0d,0x000e000b, 0x0d
        dd      0x000e000c,0x0e,0x000f000c,0x0d,0x000f000d,0x0f,0x000f000d,0x0f,0x0010000d, 0x10
        dd      0x000d000e,0x10,0x000b000a,0x0a,0x000b0009,0x0b,0x000b000a,0x0c,0x000c000a, 0x0c
        dd      0x000d000a,0x0d,0x000d000b,0x0d,0x000d000b,0x0d,0x000d000b,0x0d,0x000e000b, 0x0d
        dd      0x000e000c,0x0e,0x000e000c,0x0e,0x000e000c,0x0e,0x000f000d,0x0f,0x000f000d, 0x0f
        dd      0x0010000e,0x10,0x000d000e,0x10,0x000b000a,0x0b,0x000b000a,0x0b,0x000c000a, 0x0c
        dd      0x000c000b,0x0d,0x000d000b,0x0d,0x000d000b,0x0d,0x000d000b,0x0e,0x000e000c, 0x0e
        dd      0x000e000c,0x0e,0x000f000c,0x0e,0x000f000c,0x0f,0x000f000c,0x0f,0x000f000d, 0x0f
        dd      0x0011000d,0x10,0x0011000d,0x12,0x000d000e,0x12,0x000b000a,0x0a,0x000c000a, 0x0a
        dd      0x000c000a,0x0b,0x000d000b,0x0c,0x000d000b,0x0c,0x000d000b,0x0d,0x000e000b, 0x0d
        dd      0x000e000c,0x0e,0x000f000c,0x0e,0x000f000c,0x0e,0x000f000c,0x0e,0x000f000d, 0x0f
        dd      0x0010000d,0x0f,0x0010000e,0x10,0x0010000e,0x11,0x000d000e,0x11,0x000c000a, 0x0b
        dd      0x000c000a,0x0b,0x000c000b,0x0c,0x000d000b,0x0c,0x000d000b,0x0d,0x000e000b, 0x0d
        dd      0x000e000c,0x0d,0x000f000c,0x0f,0x000f000c,0x0e,0x000f000d,0x0f,0x000f000d, 0x0f
        dd      0x0010000d,0x10,0x000f000d,0x10,0x0010000e,0x10,0x000f000e,0x12,0x000e000e, 0x11
        dd      0x000c000b,0x0b,0x000d000b,0x0c,0x000c000b,0x0c,0x000d000b,0x0d,0x000e000c, 0x0d
        dd      0x000e000c,0x0e,0x000e000c,0x0e,0x000e000c,0x0f,0x000f000c,0x0e,0x0010000d, 0x0f
        dd      0x0010000d,0x10,0x0010000d,0x0f,0x0011000d,0x10,0x0011000e,0x11,0x0010000f, 0x12
        dd      0x000d000e,0x13,0x000d000b,0x0c,0x000d000b,0x0c,0x000d000b,0x0c,0x000d000b, 0x0d
        dd      0x000e000c,0x0e,0x000e000c,0x0e,0x000f000c,0x0e,0x0010000c,0x0e,0x0010000d, 0x0f
        dd      0x0010000d,0x0f,0x0010000d,0x0f,0x0010000d,0x10,0x0010000e,0x11,0x000f000e, 0x11
        dd      0x0010000e,0x11,0x000e000f,0x12,0x000d000c,0x0c,0x000e000c,0x0d,0x000e000b, 0x0d
        dd      0x000e000c,0x0e,0x000e000c,0x0e,0x000f000c,0x0f,0x000f000d,0x0e,0x000f000d, 0x0f
        dd      0x000f000d,0x10,0x0011000d,0x10,0x0010000d,0x11,0x0010000d,0x11,0x0010000e, 0x11
        dd      0x0010000e,0x12,0x0012000f,0x12,0x000e000f,0x12,0x000f000c,0x0d,0x000e000c, 0x0d
        dd      0x000e000c,0x0e,0x000e000c,0x0f,0x000f000c,0x0f,0x000f000d,0x0f,0x0010000d, 0x10
        dd      0x0010000d,0x10,0x0010000d,0x10,0x0012000e,0x10,0x0011000e,0x10,0x0011000e, 0x11
        dd      0x0011000e,0x12,0x0013000e,0x11,0x0011000f,0x12,0x000e000f,0x12,0x000e000d, 0x0e
        dd      0x000f000d,0x0e,0x000d000d,0x0e,0x000e000d,0x0f,0x0010000d,0x0f,0x0010000d, 0x0f
        dd      0x000f000d,0x11,0x0010000d,0x10,0x0010000e,0x10,0x0011000e,0x13,0x0012000e, 0x11
        dd      0x0011000e,0x11,0x0013000f,0x11,0x0011000f,0x13,0x0010000e,0x12,0x000e000f, 0x12
        dd      0x000b000d,0x0d,0x000b000d,0x0e,0x000b000d,0x0f,0x000c000d,0x10,0x000c000d, 0x10
        dd      0x000d000d,0x10,0x000d000d,0x11,0x000d000e,0x10,0x000e000e,0x11,0x000e000e, 0x11
        dd      0x000e000e,0x12,0x000e000e,0x12,0x000e000f,0x15,0x000e000f,0x14,0x000e000f, 0x15
        dd      0x000c000f,0x12

tableABC:
        dd      0x00020004,0x1,0x00040004,0x4,0x00060006,0x7,0x00080008,0x9,0x00090009,0xa,0x000a000a,0xa
        dd      0x0009000a,0xa,0x000a000a,0xb,0x00000000,0x0,0x00020003,0x1,0x00040004,0x4,0x00070006,0x7
        dd      0x00090007,0x9,0x00090009,0x9,0x000a000a,0xa,0x00000000,0x0,0x00040004,0x4,0x00050005,0x6
        dd      0x00060006,0x8,0x00080007,0x9,0x000a0009,0xa,0x000a0009,0xb,0x0009000a,0xa,0x000a000a,0xa
        dd      0x00000000,0x0,0x00040004,0x4,0x00040005,0x6,0x00060006,0x8,0x000a0007,0x9,0x000a0008,0x9
        dd      0x000a000a,0xa,0x00000000,0x0,0x00060006,0x7,0x00070006,0x8,0x00080007,0x9,0x00090008,0xa
        dd      0x000a0009,0xb,0x000b000a,0xc,0x000a0009,0xb,0x000a000a,0xb,0x00000000,0x0,0x00070005,0x7
        dd      0x00060006,0x7,0x00080007,0x9,0x000a0008,0xa,0x000a0009,0xa,0x000b000a,0xb,0x00000000,0x0
        dd      0x00080007,0x8,0x00080007,0x9,0x00090008,0xa,0x000b0008,0xb,0x000a0009,0xc,0x000c000a,0xc
        dd      0x000a000a,0xb,0x000b000a,0xc,0x00000000,0x0,0x00090007,0x8,0x000a0007,0x9,0x000a0008,0xa
        dd      0x000b0009,0xb,0x000b0009,0xb,0x000c000a,0xb,0x00000000,0x0,0x00090008,0x9,0x000a0008,0xa
        dd      0x000a0009,0xb,0x000b0009,0xc,0x000b000a,0xc,0x000c000a,0xc,0x000b000a,0xc,0x000c000b,0xc
        dd      0x00000000,0x0,0x00090008,0x8,0x00090008,0x9,0x000a0009,0xa,0x000b0009,0xb,0x000c000a,0xb
        dd      0x000c000b,0xc,0x00000000,0x0,0x00090009,0xa,0x000a0009,0xb,0x000b000a,0xc,0x000c000a,0xc
        dd      0x000c000a,0xd,0x000d000b,0xd,0x000c000a,0xc,0x000d000b,0xd,0x00000000,0x0,0x000a0009,0x9
        dd      0x000a0009,0xa,0x000b000a,0xb,0x000b000a,0xc,0x000d000b,0xc,0x000d000b,0xc,0x00000000,0x0
        dd      0x00090009,0x9,0x00090009,0xa,0x00090009,0xb,0x000a000a,0xc,0x000b000a,0xc,0x000c000b,0xc
        dd      0x000c000b,0xd,0x000c000c,0xd,0x00000000,0x0,0x00000000,0x0,0x00000000,0x0,0x00000000,0x0
        dd      0x00000000,0x0,0x00000000,0x0,0x00000000,0x0,0x00000000,0x0,0x0009000a,0xa,0x0009000a,0xa
        dd      0x000a000a,0xb,0x000b000b,0xc,0x000c000b,0xc,0x000c000b,0xd,0x000c000b,0xd,0x000c000c,0xd
        dd      0x00000000,0x0,0x00000000,0x0,0x00000000,0x0,0x00000000,0x0,0x00000000,0x0,0x00000000,0x0
        dd      0x0,0x00000000, 0x0,0x00000000

; linbits32: 8 bytes per entry, indexed by bsr(max-15) on the ESC path.
; Each dword holds the same 16-bit value twice so that pmaddwd multiplies
; both per-lane counters by it. The first dword feeds the low 16 bits of
; the packed sum, the second dword the high 16 bits.
linbits32:
        dd      0x00040004,0x10001,0x00040004,0x20002,0x00040004,0x30003,0x00040004,0x40004
        dd      0x00050005,0x60006,0x00060006,0x60006,0x00070007,0x80008,0x00080008,0x80008
        dd      0x00090009,0xa000a,0x000b000b,0xa000a,0x000b000b,0xd000d,0x000d000d,0xd000d
        dd      0x000d000d,0xd000d

; choose_table_H: one word per bsr(max-15).
;   low byte  = table number returned when the high-16 sum is kept
;   high byte = table number returned when the low-16 sum is smaller
choose_table_H:
        dw      0x1810, 0x1811, 0x1812, 0x1813, 0x1914, 0x1a14, 0x1b15, 0x1c15
        dw      0x1d16, 0x1e16, 0x1e17, 0x1f17, 0x1f17

; Dispatch for max <= 15: offsets relative to choose_table_MMX, indexed by max.
choose_jump_table_L:
        dd      table_MMX.L_case_0      - choose_table_MMX
        dd      table_MMX.L_case_1      - choose_table_MMX
        dd      table_MMX.L_case_2      - choose_table_MMX
        dd      table_MMX.L_case_3      - choose_table_MMX
        dd      table_MMX.L_case_45     - choose_table_MMX
        dd      table_MMX.L_case_45     - choose_table_MMX
        dd      table_MMX.L_case_67     - choose_table_MMX
        dd      table_MMX.L_case_67     - choose_table_MMX
        dd      table_MMX.L_case_8_15   - choose_table_MMX
        dd      table_MMX.L_case_8_15   - choose_table_MMX
        dd      table_MMX.L_case_8_15   - choose_table_MMX
        dd      table_MMX.L_case_8_15   - choose_table_MMX
        dd      table_MMX.L_case_8_15   - choose_table_MMX
        dd      table_MMX.L_case_8_15   - choose_table_MMX
        dd      table_MMX.L_case_8_15   - choose_table_MMX
        dd      table_MMX.L_case_8_15   - choose_table_MMX

        segment_code
;
; use MMX
;

PIC_OFFSETTABLE

        align   16
;-----------------------------------------------------------------------
; int choose_table(int *ix, int *end, int *s)
;
; In:    arguments on the stack; after the initial "push ebp" they are at
;        [esp+8] = ix, [esp+12] = end, [esp+16] = s.
; Out:   eax = selected table number, or -1 when the maximum value in
;        [ix, end) exceeds 8191+15 (in that case *s is overwritten with
;        that maximum). On every other path except max == 0 the chosen
;        bit sum is ADDED to *s.
; Regs:  ebp, ebx, esi, edi are saved/restored where used; ecx, edx and
;        mm0-mm7 are clobbered. emms is executed before every return.
; Flow:  1. find max of the dwords in [ix, end)
;        2. max <= 15 : jump through choose_jump_table_L[max]
;           max  > 15 : ESC path using largetbl / linbits32 / choose_table_H
; NOTE(review): every loop terminates only when the negative byte offset
;        reaches exactly zero, so [ix, end) is assumed to be a non-empty
;        range whose size is a multiple of 8 bytes - confirm with callers.
;-----------------------------------------------------------------------
choose_table_MMX:
        push    ebp
        call    get_pc.bp
        add     ebp, PIC_BASE()

        mov     ecx,[esp+8]     ;ecx = begin
        mov     edx,[esp+12]    ;edx = end
        sub     ecx,edx         ;ecx = begin-end(should be minus)
        test    ecx,8           ; odd number of (x,y) pairs?
        pxor    mm0,mm0         ;mm0=[0:0]
        movq    mm1,[edx+ecx]   ; mm1 = first pair (MMX ops leave flags intact)
        jz      .lp

        add     ecx,8           ; first pair already consumed into mm1
        jz      .exit

        align   4
        ; Running maximum: max(a,b) = a + saturating_sub(b,a), done per word.
.lp:
        movq    mm4,[edx+ecx]
        movq    mm5,[edx+ecx+8]
        add     ecx,16
        psubusw mm4,mm0         ; strictly this should be a dword operation,
        psubusw mm5,mm1         ; but no such instruction exists :-p
        paddw   mm0,mm4         ; values handled here are at most 8191+15, so it is fine
        paddw   mm1,mm5
        jnz     .lp             ; flags still come from "add ecx,16"
.exit:
        psubusw mm1,mm0         ; this should really be dword as well
        paddw   mm0,mm1         ; mm0 = max(mm0, mm1)

        movq    mm4,mm0
        punpckhdq       mm4,mm4
        psubusw mm4,mm0         ; this should really be dword as well
        paddw   mm0,mm4         ; low dword = max(low dword, high dword)
        movd    eax,mm0         ; eax = max

        cmp     eax,15
        ja      .with_ESC
        lea     ecx,[PIC_EBP_REL(choose_table_MMX)]
        add     ecx,[PIC_EBP_REL(choose_jump_table_L+eax*4)]
        jmp     ecx             ; eax = max, edx = end are live in the targets

        ; max > 8191+15: store max into *s and return -1.
.with_ESC1:
        emms
        mov     ecx, [esp+16]   ; *s
        mov     [ecx], eax
        or      eax,-1
        pop     ebp
        ret

.with_ESC:
        cmp     eax, 8191+15
        ja      .with_ESC1

        sub     eax,15
        push    ebx
        push    esi
        bsr     eax, eax        ; eax = index into linbits32 / choose_table_H
%assign _P 4*2                  ; bytes pushed since the prologue (ebx, esi)
        movq    mm5, [PIC_EBP_REL(D15_15_15_15)]
        movq    mm6, [PIC_EBP_REL(D14_14_14_14)]
        movq    mm3, [PIC_EBP_REL(mul_add)]

        mov     ecx, [esp+_P+8]         ; = ix
;       mov     edx, [esp+_P+12]        ; = end  (edx still holds end from the prologue)
        sub     ecx, edx

        xor     esi, esi        ; sum = 0
        test    ecx, 8
        pxor    mm7, mm7        ; linbits_sum: count of values greater than 14
        jz      .H_dual_lp1

        ; odd number of pairs: handle the first pair on its own
        movq    mm0, [edx+ecx]
        add     ecx,8
        packssdw        mm0,mm7
        movq    mm2, mm0
        paddusw mm0, mm5        ; mm0 = min(ix, 15)+0xfff0
        pcmpgtw mm2, mm6        ; greater than 14?
        psubw   mm7, mm2        ; if greater than 14, linbits_sum++;
        pmaddwd mm0, mm3        ; {0, 0, y, x}*{1, 16, 1, 16}
        movd    ebx, mm0
        ; the +(16*16+16)*4 displacement cancels the 0xfff0 (-16) bias on x and y
        mov     esi, [PIC_EBP_REL(largetbl+ebx*4+(16*16+16)*4)]

        jz      .H_dual_exit    ; flags still come from "add ecx,8"

        align   4
.H_dual_lp1:
        movq    mm0, [edx+ecx]
        movq    mm1, [edx+ecx+8]
        packssdw        mm0,mm1
        movq    mm2, mm0
        paddusw mm0, mm5        ; mm0 = min(ix, 15)+0xfff0
        pcmpgtw mm2, mm6        ; greater than 14?
        pmaddwd mm0, mm3        ; {y, x, y, x}*{1, 16, 1, 16}
        movd    ebx, mm0
        punpckhdq       mm0,mm0
        add     esi, [PIC_EBP_REL(largetbl+ebx*4+(16*16+16)*4)]
        movd    ebx, mm0
        add     esi, [PIC_EBP_REL(largetbl+ebx*4+(16*16+16)*4)]
        add     ecx, 16
        psubw   mm7, mm2        ; if greater than 14, linbits_sum++;
        jnz     .H_dual_lp1     ; flags still come from "add ecx,16"

.H_dual_exit:
        ; fold the four per-lane counters and scale them by both linbits values
        pmov    mm1,mm7
        punpckhdq       mm7,mm7
        paddd   mm7,mm1
        punpckldq       mm7,mm7

        pmaddwd mm7, [PIC_EBP_REL(linbits32+eax*8)]     ; linbits
        mov     ax, [PIC_EBP_REL(choose_table_H+eax*2)] ; ah/al = the two candidate tables

        movd    ecx, mm7
        punpckhdq       mm7,mm7
        movd    edx,mm7
        emms
        shl     edx, 16
        add     ecx, edx        ; pack both linbits totals like the largetbl sums

        add     ecx, esi

        pop     esi
        pop     ebx

        mov     edx, ecx
        and     ecx, 0xffff     ; ecx = sum2
        shr     edx, 16         ; edx = sum

        cmp     edx, ecx
        jle     .chooseE_s1
        mov     edx, ecx        ; sum2 is smaller: take it and the table in the high byte
        shr     eax, 8
.chooseE_s1:
        mov     ecx, [esp+16]   ; *s
        and     eax, 0xff
        add     [ecx], edx
        pop     ebp
        ret

; max == 0: eax already holds 0 (the dispatch index); *s is left untouched.
table_MMX.L_case_0:
        emms
        pop     ebp
        ret

; max == 1: *s += t1l[x*2 + y] for every pair; returns table 1.
table_MMX.L_case_1:
        emms
        mov     eax, [esp+16]   ; *s
        mov     ecx, [esp+8]    ; *ix
        sub     ecx, edx        ; edx = end (still live from the prologue)
        push    ebx
.lp:
        mov     ebx, [edx+ecx]
        add     ebx, ebx
        add     ebx, [edx+ecx+4]
        movzx   ebx, byte [PIC_EBP_REL(ebx+t1l)]
        add     [eax], ebx
        add     ecx, 8
        jnz     .lp
        pop     ebx
        mov     eax, 1
        pop     ebp
        ret

; Three-candidate cases: push the base table number, point ecx at the
; 8-byte-per-entry table and share the loop at from3.
table_MMX.L_case_45:
        push    dword 7
        lea     ecx, [PIC_EBP_REL(tableABC+9*8)]
        jmp     from3

table_MMX.L_case_67:
        push    dword 10
        lea     ecx, [PIC_EBP_REL(tableABC)]
        jmp     from3

table_MMX.L_case_8_15:
        push    dword 13
        lea     ecx, [PIC_EBP_REL(tableDEF)]
        ; fall through
from3:
        mov     eax,[esp+12]    ;eax = *begin  (offsets shifted by the pushed table number)
;       mov     edx,[esp+16]    ;edx = *end    (edx still holds end from the prologue)

        push    ebx
        sub     eax, edx

        movq    mm5,[PIC_EBP_REL(mul_add)]
        pxor    mm2,mm2         ;mm2 = sum

        test    eax, 8
        jz      .choose3_lp1
; odd length
        movq    mm0,[edx+eax]   ;mm0 = ix[0] | ix[1]
        add     eax,8
        packssdw        mm0,mm2

        pmaddwd mm0,mm5
        movd    ebx,mm0

        movq    mm2, [ecx+ebx*8]

        jz      .choose3_exit   ; flags still come from "add eax,8"

        align   4
.choose3_lp1:
        movq    mm0,[edx+eax]
        movq    mm1,[edx+eax+8]
        add     eax,16
        packssdw        mm0,mm1 ;mm0 = ix[0]|ix[1]|ix[2]|ix[3]
        pmaddwd mm0,mm5
        movd    ebx,mm0
        punpckhdq       mm0,mm0
        paddd   mm2, [ecx+ebx*8]
        movd    ebx,mm0
        paddd   mm2, [ecx+ebx*8]
        jnz     .choose3_lp1    ; flags still come from "add eax,16"
.choose3_exit:
;       xor     eax,eax         ; not needed: eax is 0 on both ways into this label
        movd    ebx, mm2
        punpckhdq       mm2,mm2
        mov     ecx, ebx
        and     ecx, 0xffff     ; ecx = sum2
        shr     ebx, 16         ; ebx = sum1
        movd    edx, mm2        ; edx = sum

        cmp     edx, ebx
        jle     .choose3_s1
        mov     edx, ebx
        inc     eax             ; candidate 1
.choose3_s1:
        emms
        pop     ebx
        cmp     edx, ecx
        jle     .choose3_s2
        mov     edx, ecx
        mov     eax, 2          ; candidate 2
.choose3_s2:
        pop     ecx             ; base table number pushed by the case entry
        add     eax, ecx
        mov     ecx, [esp+16]   ; *s
        add     [ecx], edx
        pop     ebp
        ret

; Two-candidate cases: push the base table number, select the external
; 4-byte-per-entry table and its index multiplier, share the loop at from2.
table_MMX.L_case_2:
        push    dword 2
        lea     ecx,[PIC_EBP_REL(table23)]
        pmov    mm5,[PIC_EBP_REL(mul_add23)]
        jmp     from2

table_MMX.L_case_3:
        push    dword 5
        lea     ecx,[PIC_EBP_REL(table56)]
        pmov    mm5,[PIC_EBP_REL(mul_add56)]
        ; fall through
from2:
        mov     eax,[esp+12]    ;eax = *begin  (offsets shifted by the pushed table number)
;       mov     edx,[esp+16]    ;edx = *end    (edx still holds end from the prologue)
        push    ebx
        push    edi
        sub     eax, edx
        xor     edi, edi        ; edi = packed sum (sum1:sum2)
        test    eax, 8
        jz      .choose2_lp1
; odd length
        movq    mm0,[edx+eax]   ;mm0 = ix[0] | ix[1]
        pxor    mm2,mm2         ;mm2 = sum
        packssdw        mm0,mm2

        pmaddwd mm0,mm5
        movd    ebx,mm0

        mov     edi, [ecx+ebx*4]

        add     eax,8
        jz      .choose2_exit

        align   4
.choose2_lp1:
        movq    mm0,[edx+eax]
        movq    mm1,[edx+eax+8]
        packssdw        mm0,mm1 ;mm0 = ix[0]|ix[1]|ix[2]|ix[3]
        pmaddwd mm0,mm5
        movd    ebx,mm0
        punpckhdq       mm0,mm0
        add     edi, [ecx+ebx*4]
        movd    ebx, mm0
        add     edi, [ecx+ebx*4]
        add     eax,16
        jnc     .choose2_lp1    ; carry is set when the negative offset wraps to zero
.choose2_exit:
        mov     ecx, edi
        pop     edi
        pop     ebx
        pop     eax             ; table num.

        emms
        mov     edx, ecx
        and     ecx, 0xffff     ; ecx = sum2
        shr     edx, 16         ; edx = sum1
        cmp     edx, ecx
        jle     .choose2_s1
        mov     edx, ecx
        inc     eax
.choose2_s1:
        mov     ecx, [esp+16]   ; *s
        add     [ecx], edx
        pop     ebp
        ret

        end