; for new GOGO-no-coda (1999/09) ; Copyright (C) 1999 shigeo ; special thanks to Keiichi SAKAI, URURI %include "nasm.h" globaldef fht_3DN globaldef fht externdef costab_fft externdef sintab_fft externdef gray_index segment_data align 16 D_MSB1_0 dd 0 ,0x80000000 D_SQRT2 dd 1.414213562,1.414213562 t_s0 dd 0 ;[ t_c:t_s] t_c0 dd 0 t_c1 dd 0 ;[-t_s:t_c] t_s1 dd 0 D_s1c1 dd 0, 0 D_Mc1s1 dd 0, 0 D_s2c2 dd 0, 0 D_Mc2s2 dd 0, 0 D_0_1 dd 1.0, 0.0 S_05 DD 0.5 S_00005 DD 0.0005 fht dd 0 ;関数ポインタ segment_code ;************************************************************************ ; by shigeo ; 99/08/16 ; 23000clk 辛かった〜 ; 18500clk bit reversal from gogo1 by URURI ;void fht(float *fz, int n); align 16 fht_3DN: push ebx push esi push edi push ebp %assign _P 4*4 ;まず最初のループ... はfht()の外へ移動 mov esi,[esp+_P+4] ;esi=fz mov ecx,[esp+_P+8] ;ecx=n ;メインループ movq mm7,[D_MSB1_0] ;mm7=[1<<31:0] %assign LOCAL_STACK 16 sub esp,LOCAL_STACK %assign _P (_P+LOCAL_STACK) xor eax,eax mov [esp],eax ;k=0 %define k dword [esp] %define kx dword [esp+4] %define fn dword [esp+8] .lp30: ;k=0; do{ mov ecx,k add ecx,2 mov k,ecx mov eax,1 shl eax,cl ;eax=k1 = 1<>1 mov kx,esi ;保存(後で使う) mov edi,[esp+_P+4] ;edi=fi=fz lea ebp,[edi+esi*4] ;ebp=gi=fz+kx mov esi,[esp+_P+8] ;esi=n lea esi,[edi+esi*4] ;esi=fn=fz+n movq mm6,[D_SQRT2] ;mm6=[√2:√2] .lp31: ;fn=fz+n; do{ FLOAT g0,f0,f1,... movd mm0,[edi] ;mm0=[0:fi[ 0]] movd mm1,[edi+eax*4] ;mm1=[0:fi[k1]] punpckldq mm0,mm0 ;mm0=[fi_0 :fi_0 ] punpckldq mm1,mm1 ;mm1=[fi_k1:fi_k1] movd mm2,[edi+ebx*4] movd mm3,[edi+ecx*4] punpckldq mm2,mm2 ;mm2=[fi_k2:fi_k2] punpckldq mm3,mm3 ;mm3=[fi_k3:fi_k3] pxor mm1,mm7 ;mm1=[-fi_k1:fi_k1] pxor mm3,mm7 ;mm3=[-fi_k3:fi_k3] pfadd mm0,mm1 ;mm0=[f1:f0]=[fi_0 -fi_k1 : fi_0 +fi_k1] pfadd mm2,mm3 ;mm2=[f3:f2]=[fi_k2-fi_k3 : fi_k2+fi_k3] movq mm3,mm0 ;mm3=[f1:f0] pfadd mm0,mm2 ;mm0=[f1+f3:f0+f2] movd [edi],mm0 ;fi[0]=f0+f2 psrlq mm0,32 ;mm0=[0:f1+f3] pfsub mm3,mm2 ;mm3=[f1-f3:f0-f2] movd [edi+eax*4],mm0 ;fi[k1]=f1+f3 movd [edi+ebx*4],mm3 ;fi[k2]=f0-f2 psrlq mm3,32 ;mm3=[0:f1-f3] movd [edi+ecx*4],mm3 ;fi[k3]=f1-f3 movd mm0,[ebp] ;mm0=[0:gi_0] movd mm1,[ebp+eax*4] ;mm1=[0:gi_k1] punpckldq mm0,mm0 ;mm0=[gi_0 :gi_0 ] punpckldq mm1,mm1 ;mm1=[gi_k1:gi_k1] movd mm2,[ebp+ebx*4] ;mm2=[0:gi_k2] pxor mm1,mm7 ;mm1=[-gi_k1:gi_k1] punpckldq mm2,[ebp+ecx*4] ;mm2=[gi_k3:gi_k2] pfadd mm0,mm1 ;mm0=[g1:g0]=[gi_0 -gi_k1:gi_0 +gi_k1] pfmul mm2,mm6 ;mm2=[g3:g2]=sqrt2 * [gi_k3:gi_k2] movq mm1,mm0 ;mm1=[g1:g0] pfadd mm0,mm2 ;mm0=[g1+g3:g0+g2] movd [ebp],mm0 ;gi[0]=g0+g2 psrlq mm0,32 ;mm0=[0:g1+g3] pfsub mm1,mm2 ;mm1=[g1-g3:g0-g2] movd [ebp+eax*4],mm0 ;gi[k1]=g1+g3 movd [ebp+ebx*4],mm1 ;gi[k2]=g0-g2 psrlq mm1,32 ;mm1=[0:g1-g3] movd [ebp+ecx*4],mm1 ;gi[k3]=g1-g3 lea edi,[edi+edx*4] ;fi += k4 lea ebp,[ebp+edx*4] ;gi += k4 cmp edi,esi jc near .lp31 ;}while(fi