; for new GOGO-no-coda (1999/09)
; Copyright (C) 1999 shigeo
; special thanks to Keiichi SAKAI, URURI
;
; Syntax: NASM, 32-bit x86, MMX + AMD 3DNow! instructions.
; Register-content comments use [hi:lo] for the two packed
; single-precision floats held in one MMX register / qword.
%include "nasm.h"

        globaldef fht_3DN
        globaldef fht
        externdef costab_fft
        externdef sintab_fft
        externdef gray_index

        segment_data
        align   16
D_MSB1_0        dd      0, 0x80000000           ; [1<<31:0]  XOR mask: flips the sign of the high float only
D_SQRT2         dd      1.414213562, 1.414213562 ; [sqrt2:sqrt2]
; The next four dwords are read as two overlapping-purpose qwords:
;   qword at t_s0 = [ t_c : t_s]
;   qword at t_c1 = [-t_s : t_c]
t_s0            dd      0
t_c0            dd      0
t_c1            dd      0
t_s1            dd      0
D_s1c1          dd      0, 0                    ; [ s1:c1]  current rotation, updated once per i
D_Mc1s1         dd      0, 0                    ; [-c1:s1]
D_s2c2          dd      0, 0                    ; [ s2:c2]  doubled-angle rotation
D_Mc2s2         dd      0, 0                    ; [-c2:s2]
D_0_1           dd      1.0, 0.0                ; [0:1]     initial value for [s1:c1]
S_05            DD      0.5                     ; not referenced by fht_3DN
S_00005         DD      0.0005                  ; not referenced by fht_3DN
fht             dd      0                       ; function pointer (exported; not written in this file's visible code)

        segment_code

;************************************************************************
; by shigeo
; 99/08/16
; 23000clk  (original remark is mis-encoded in this copy)
; 18500clk  bit reversal from gogo1 by URURI
;
; void fht_3DN(float *fz, int n);
;
; In-place transform over the float array fz[0 .. n-1].
;
; Calling convention (as used by the code below):
;   [esp+4] = fz, [esp+8] = n on entry (stack arguments, caller cleans up).
;   Saved/restored: ebx, esi, edi, ebp.
;   Clobbered:      eax, ecx, edx, flags, mm0-mm7.
;   Executes femms before returning, so the x87/MMX state is left empty.
;   No return value is produced on purpose.
;
; Constraints:
;   * Each outer pass uses k1 = 4, 16, 64, ... and k4 = 4*k1. The passes are
;     do/while loops, so the first pass always runs and touches fz[0..15]
;     at least. n is expected to be 16, 64, 256, 1024, ... (a power of 4);
;     for other n the last pass indexes beyond fz + n.
;   * The routine keeps its twiddle scratch (t_*, D_s1c1 ...) in static
;     data, so it is not reentrant and must not run concurrently on two
;     threads.
;   * NOTE(review): a mis-encoded original comment at the top of the
;     function appears to say that the very first loop was moved out of
;     fht(); confirm against the caller what pre-processing is expected.
;************************************************************************
        align   16
fht_3DN:
        push    ebx
        push    esi
        push    edi
        push    ebp
%assign _P 4*4                                  ; bytes pushed so far; args live at [esp+_P+4], [esp+_P+8]

        ; The original loaded fz into esi and n into ecx here; both values
        ; were overwritten before being read, so the loads were dropped.

        ; main loop setup
        movq    mm7,[D_MSB1_0]                  ; mm7=[1<<31:0], kept for the whole function

%assign LOCAL_STACK 16
        sub     esp,LOCAL_STACK
%assign _P (_P+LOCAL_STACK)
        xor     eax,eax
        mov     [esp],eax                       ; k=0
%define k  dword [esp]
%define kx dword [esp+4]
%define fn dword [esp+8]

.lp30:                                          ; k=0; do{
        mov     ecx,k
        add     ecx,2
        mov     k,ecx                           ;   k += 2
        mov     eax,1
        shl     eax,cl                          ;   eax=k1 = 1<<k
        lea     ebx,[eax+eax]                   ;   ebx=k2 = k1*2
        lea     ecx,[eax+eax*2]                 ;   ecx=k3 = k2 + k1 = k1*3
        lea     edx,[ebx+ebx]                   ;   edx=k4 = k1*4
        mov     esi,eax
        shr     esi,1                           ;   esi=kx=k1>>1
        mov     kx,esi                          ;   saved, used as the bound of the i loop
        mov     edi,[esp+_P+4]                  ;   edi=fi=fz
        lea     ebp,[edi+esi*4]                 ;   ebp=gi=fz+kx
        mov     esi,[esp+_P+8]                  ;   esi=n
        lea     esi,[edi+esi*4]                 ;   esi=fn=fz+n
        movq    mm6,[D_SQRT2]                   ;   mm6=[sqrt2:sqrt2]

.lp31:                                          ;   do{ FLOAT g0,f0,f1,...
        movd    mm0,[edi]                       ;     mm0=[0:fi[ 0]]
        movd    mm1,[edi+eax*4]                 ;     mm1=[0:fi[k1]]
        punpckldq mm0,mm0                       ;     mm0=[fi_0 :fi_0 ]
        punpckldq mm1,mm1                       ;     mm1=[fi_k1:fi_k1]
        movd    mm2,[edi+ebx*4]
        movd    mm3,[edi+ecx*4]
        punpckldq mm2,mm2                       ;     mm2=[fi_k2:fi_k2]
        punpckldq mm3,mm3                       ;     mm3=[fi_k3:fi_k3]
        pxor    mm1,mm7                         ;     mm1=[-fi_k1:fi_k1]
        pxor    mm3,mm7                         ;     mm3=[-fi_k3:fi_k3]
        pfadd   mm0,mm1                         ;     mm0=[f1:f0]=[fi_0 -fi_k1 : fi_0 +fi_k1]
        pfadd   mm2,mm3                         ;     mm2=[f3:f2]=[fi_k2-fi_k3 : fi_k2+fi_k3]
        movq    mm3,mm0                         ;     mm3=[f1:f0]
        pfadd   mm0,mm2                         ;     mm0=[f1+f3:f0+f2]
        movd    [edi],mm0                       ;     fi[0]=f0+f2
        psrlq   mm0,32                          ;     mm0=[0:f1+f3]
        pfsub   mm3,mm2                         ;     mm3=[f1-f3:f0-f2]
        movd    [edi+eax*4],mm0                 ;     fi[k1]=f1+f3
        movd    [edi+ebx*4],mm3                 ;     fi[k2]=f0-f2
        psrlq   mm3,32                          ;     mm3=[0:f1-f3]
        movd    [edi+ecx*4],mm3                 ;     fi[k3]=f1-f3

        movd    mm0,[ebp]                       ;     mm0=[0:gi_0]
        movd    mm1,[ebp+eax*4]                 ;     mm1=[0:gi_k1]
        punpckldq mm0,mm0                       ;     mm0=[gi_0 :gi_0 ]
        punpckldq mm1,mm1                       ;     mm1=[gi_k1:gi_k1]
        movd    mm2,[ebp+ebx*4]                 ;     mm2=[0:gi_k2]
        pxor    mm1,mm7                         ;     mm1=[-gi_k1:gi_k1]
        punpckldq mm2,[ebp+ecx*4]               ;     mm2=[gi_k3:gi_k2]
        pfadd   mm0,mm1                         ;     mm0=[g1:g0]=[gi_0 -gi_k1:gi_0 +gi_k1]
        pfmul   mm2,mm6                         ;     mm2=[g3:g2]=sqrt2 * [gi_k3:gi_k2]
        movq    mm1,mm0                         ;     mm1=[g1:g0]
        pfadd   mm0,mm2                         ;     mm0=[g1+g3:g0+g2]
        movd    [ebp],mm0                       ;     gi[0]=g0+g2
        psrlq   mm0,32                          ;     mm0=[0:g1+g3]
        pfsub   mm1,mm2                         ;     mm1=[g1-g3:g0-g2]
        movd    [ebp+eax*4],mm0                 ;     gi[k1]=g1+g3
        movd    [ebp+ebx*4],mm1                 ;     gi[k2]=g0-g2
        psrlq   mm1,32                          ;     mm1=[0:g1-g3]
        movd    [ebp+ecx*4],mm1                 ;     gi[k3]=g1-g3
        lea     edi,[edi+edx*4]                 ;     fi += k4
        lea     ebp,[ebp+edx*4]                 ;     gi += k4
        cmp     edi,esi
        jc      near .lp31                      ;   }while(fi<fn);   (unsigned pointer compare)

        mov     fn,esi                          ;   fn=fz+n, spilled because esi becomes i below
        ; eax=k1, ebx=k2, ecx=k3, edx=k4 stay live through the loops below

        mov     edi,k
        mov     ebp,[costab_fft+edi*4]          ;   ebp=t_c = dword k of costab_fft
        mov     [t_c0],ebp
        mov     [t_c1],ebp                      ;   t_c
        mov     ebp,[sintab_fft+edi*4]          ;   ebp=t_s = dword k of sintab_fft
        mov     [t_s0],ebp
        xor     ebp,0x80000000                  ;   flip the IEEE sign bit
        mov     [t_s1],ebp                      ;   -t_s

        movq    mm1,[D_0_1]                     ;   mm1=[0:1]
        movq    [D_s1c1],mm1                    ;   [s1:c1]=[0:1]
        mov     esi,1                           ;   esi=i=1

.lp32:                                          ;   for(i=1;i<kx;i++){   (tested at the bottom; kx>=2 here)
        movq    mm0,[D_s1c1]                    ;     mm0=[s1:t]=[s1:c1]
        movq    mm2,mm0
        pfmul   mm0,[t_c1]                      ;     mm0=[-s1*t_s: t*t_c]
        pfmul   mm2,[t_s0]                      ;     mm2=[ s1*t_c: t*t_s]
        pfacc   mm0,mm2                         ;     mm0=[s1:c1]=[ s1*t_c+t*t_s:-s1*t_s+t*t_c]
        movq    mm2,mm0                         ;     mm2=[s1:c1]
        movq    [D_s1c1],mm0                    ;     save
        movq    mm6,mm2
        punpckldq mm5,mm6                       ;     mm5=[c1:*]  (old low half of mm5 is a don't-care)
        punpckhdq mm6,mm5                       ;     mm6=[ c1:s1]
        pxor    mm6,mm7                         ;     mm6=[-c1:s1]
        movq    [D_Mc1s1],mm6                   ;     save
        pfmul   mm2,mm2                         ;     mm2=[s1*s1:c1*c1]
        movq    mm3,mm0                         ;     mm3=[s1:c1]
        pxor    mm2,mm7                         ;     mm2=[-s1*s1:c1*c1]
        psrlq   mm3,32                          ;     mm3=[ 0:s1]
        pfacc   mm2,mm2                         ;     mm2=[c2:c2], c2=c1*c1-s1*s1
        pfmul   mm0,mm3                         ;     mm0=[ 0:c1*s1]
        pfadd   mm0,mm0                         ;     mm0=[0:s2]=[ 0:2*c1*s1]
        punpckldq mm2,mm0                       ;     mm2=[s2:c2]
        movq    [D_s2c2],mm2                    ;     save

        punpckldq mm0,mm2                       ;     mm0=[c2:s2]
        punpckhdq mm2,mm0                       ;     mm2=[c2:s2]
        pxor    mm2,mm7                         ;     mm2=[-c2:s2]
        movq    [D_Mc2s2],mm2                   ;     save

        mov     edi,[esp+_P+4]                  ;     edi=fz
        lea     edi,[edi+esi*4]                 ;     edi=fi=fz+i

        mov     ebp,[esp+_P+4]                  ;     ebp=fz
        neg     esi                             ;     esi=-i
        lea     ebp,[ebp+eax*4]                 ;     ebp=fz+k1
        lea     ebp,[ebp+esi*4]                 ;     ebp=gi=fz+k1-i
        neg     esi                             ;     esi=i

.lp33:                                          ;     do{ FLOAT a,b,g0,f0,f1,g1,f2,g2,f3,g3;

        movd    mm0,[edi+eax*4]                 ;       mm0=[0:fi_k1]
        punpckldq mm0,[ebp+eax*4]               ;       mm0=[gi_k1:fi_k1]
        movq    mm1,mm0
        pfmul   mm0,[D_s2c2]                    ;       mm0=[ s2*gi_k1:c2*fi_k1]
        pfmul   mm1,[D_Mc2s2]                   ;       mm1=[-c2*gi_k1:s2*fi_k1]
        pfacc   mm0,mm1                         ;       mm0=[b:a]
        movd    mm4,[edi]                       ;       mm4=[0:fi_0]
        movq    mm3,mm0                         ;       mm3=[b:a]
        punpckldq mm4,[ebp]                     ;       mm4=[gi_0:fi_0]
        pfadd   mm3,mm4                         ;       mm3=[g0:f0]=[gi_0+b:fi_0+a]
        pfsub   mm4,mm0                         ;       mm4=[g1:f1]=[gi_0-b:fi_0-a]

        movd    mm0,[edi+ecx*4]                 ;       mm0=[0:fi_k3]
        punpckldq mm0,[ebp+ecx*4]               ;       mm0=[gi_k3:fi_k3]
        movq    mm1,mm0
        pfmul   mm0,[D_s2c2]                    ;       mm0=[ s2*gi_k3:c2*fi_k3]
        pfmul   mm1,[D_Mc2s2]                   ;       mm1=[-c2*gi_k3:s2*fi_k3]
        pfacc   mm0,mm1                         ;       mm0=[b:a]
        movd    mm5,[edi+ebx*4]                 ;       mm5=[0:fi_k2]
        movq    mm6,mm0                         ;       mm6=[b:a]
        punpckldq mm5,[ebp+ebx*4]               ;       mm5=[gi_k2:fi_k2]
        pfadd   mm6,mm5                         ;       mm6=[g2:f2]=[gi_k2+b:fi_k2+a]
        pfsub   mm5,mm0                         ;       mm5=[g3:f3]=[gi_k2-b:fi_k2-a]

        punpckldq mm1,mm6                       ;       mm1=[f2:*]
        movq    mm0,[D_s1c1]                    ;       mm0=[s1:c1]
        punpckhdq mm1,mm5                       ;       mm1=[g3:f2]
        pfmul   mm0,mm1                         ;       mm0=[ s1*g3:c1*f2]
        movq    mm2,[D_Mc1s1]                   ;       mm2=[-c1:s1]
        pfmul   mm2,mm1                         ;       mm2=[-c1*g3:s1*f2]
        pfacc   mm0,mm2                         ;       mm0=[b:a]

        punpckldq mm1,mm3                       ;       mm1=[f0:*]
        punpckhdq mm1,mm4                       ;       mm1=[g1:f0]
        movq    mm2,mm0                         ;       mm2=[b:a]
        pfadd   mm0,mm1                         ;       mm0=[g1+b:f0+a]
        pfsubr  mm2,mm1                         ;       mm2=[g1-b:f0-a]  (reverse subtract: mm1-mm2)
        movd    [edi],mm0                       ;       fi[0]=f0+a
        psrlq   mm0,32                          ;       mm0=[0:g1+b]
        movd    [edi+ebx*4],mm2                 ;       fi[k2]=f0-a
        psrlq   mm2,32                          ;       mm2=[0:g1-b]
        movd    [ebp+eax*4],mm0                 ;       gi[k1]=g1+b
        movd    [ebp+ecx*4],mm2                 ;       gi[k3]=g1-b
        psrlq   mm6,32                          ;       mm6=[0:g2]
        movq    mm0,[D_s1c1]                    ;       mm0=[s1:c1]
        punpckldq mm5,mm6                       ;       mm5=[g2:f3]
        pfmul   mm0,mm5                         ;       mm0=[g2* s1:f3*c1]
        pfmul   mm5,[D_Mc1s1]                   ;       mm5=[g2*-c1:f3*s1]
        pfacc   mm0,mm5                         ;       mm0=[-b:a]
        psrlq   mm3,32                          ;       mm3=[0:g0]
        movq    mm1,mm0                         ;       mm1=[-b:a]
        punpckldq mm3,mm4                       ;       mm3=[f1:g0]
        pfadd   mm0,mm3                         ;       mm0=[f1-b:g0+a]
        pfsubr  mm1,mm3                         ;       mm1=[f1+b:g0-a]  (reverse subtract: mm3-mm1)
        movd    [ebp],mm0                       ;       gi[0]=g0+a
        psrlq   mm0,32                          ;       mm0=[0:f1-b]
        movd    [ebp+ebx*4],mm1                 ;       gi[k2]=g0-a
        psrlq   mm1,32                          ;       mm1=[0:f1+b]
        movd    [edi+ecx*4],mm0                 ;       fi[k3]=f1-b
        movd    [edi+eax*4],mm1                 ;       fi[k1]=f1+b

        lea     edi,[edi+edx*4]                 ;       fi += k4
        lea     ebp,[ebp+edx*4]                 ;       gi += k4
        cmp     edi,fn
        jc      near .lp33                      ;     }while(fi<fn)
        inc     esi
        cmp     esi,kx
        jnz     near .lp32                      ;   }
        cmp     edx,[esp+_P+8]
        ; Signed "less than", matching the documented C loop condition on an
        ; int n. The original used jnz, which only terminates when k4 lands
        ; exactly on n (n = 16, 64, 256, ...) and otherwise never exits.
        jl      near .lp30                      ; }while(k4<n)


.exit:
        add     esp,LOCAL_STACK
        femms                                   ; leave MMX/3DNow! state so x87 code can run afterwards
        pop     ebp
        pop     edi
        pop     esi
        pop     ebx
        ret