; back port from GOGO-no coda 2.24b by Takehiro TOMINAGA

; GOGO-no-coda
;       Copyright (C) 1999 shigeo
;       special thanks to Keiichi SAKAI

; Syntax: NASM (Intel operand order), 32-bit x86, x87 + SSE.
; All section/export/PIC helpers (globaldef, segment_data, segment_code,
; PIC_OFFSETTABLE, PIC_BASE, PIC_EBP_REL, get_pc.bp, R4, loopalign) come
; from nasm.h.

%include "nasm.h"

        globaldef fht_SSE

        segment_data
        align   16
; NOTE: the three 16-byte constants below are used as memory operands of
; xorps/subps, so they must stay 16-byte aligned (they directly follow the
; "align 16" above, 16 bytes each).
; Vector comments in this file list lanes from highest to lowest.
Q_MMPP  dd      0x0,0x0,0x80000000,0x80000000   ; sign-bit mask {-, -, +, +}
Q_MPMP  dd      0x0,0x80000000,0x0,0x80000000   ; sign-bit mask {-, +, -, +}
D_1100  dd      0.0, 0.0, 1.0, 1.0              ; {1, 1, 0, 0}
; One (cos, sin) pair per outer pass; the pass pointer (ecx) advances by
; 8 bytes after each pass.  Only four pairs are present.
costab_fft:
        dd      9.238795325112867e-01
        dd      3.826834323650898e-01
        dd      9.951847266721969e-01
        dd      9.801714032956060e-02
        dd      9.996988186962042e-01
        dd      2.454122852291229e-02
        dd      9.999811752836011e-01
        dd      6.135884649154475e-03
S_SQRT2 dd      1.414213562

        segment_code

PIC_OFFSETTABLE

;------------------------------------------------------------------------
;       by K. SAKAI
;       99/08/18        PIII 23k[clk]
;       99/08/19        instruction reordering          PIII 22k[clk]
;       99/08/20        bit reversal ported over        PIII 17k[clk]
;       99/08/23        partial unroll                  PIII 14k[clk]
;       99/11/12        clean up
;       (history notes above were mis-encoded in this copy; the English
;        wording is a best-effort reconstruction -- TODO confirm)
;
; void fht_SSE(float *fz, int n);
;
; Runs the butterfly passes described by the pseudo-C comments below,
; in place, on the float buffer starting at fz.
;
; ABI:   arguments are read from the stack; on entry [esp+4] = fz and
;        [esp+8] = n.
; In:    fz = buffer; the code treats fz + n*8 bytes (2*n floats) as the
;        end of the buffer (fn).
;        n  = loop bound for the outer pass loop.  NOTE(review): passes
;        continue while 32, 128, 512, 2048, ... <= n, and costab_fft only
;        has four entries, so n must be < 2048 or a fifth pass reads past
;        the table.
; Out:   none (buffer updated in place).
; Saved: ebx, esi, edi, ebp (pushed/popped here).
; Clobb: eax, ecx, edx, xmm0-xmm7, flags; the x87 stack is used but left
;        balanced.
;
; Register roles inside the pass loop:
;        eax   = kx * sizeof(float); therefore
;                  [p + eax*2] = p[k1], [p + eax*4] = p[k2],
;                  [p + edx*2] = p[k3]   (edx = 3*eax)
;        ecx   = pointer to this pass's (t_c, t_s) pair in costab_fft
;        ebp   = PIC base (used by PIC_EBP_REL)
;        [esp] = fn, end-of-buffer pointer, constant for the whole call
;
; Loop exits that compare the running pointer against fn use the unsigned
; condition (jb): these are addresses, and a signed compare would fail for
; a buffer that straddles 0x80000000.
;------------------------------------------------------------------------
        align   16
fht_SSE:
        push    ebx
        push    esi
        push    edi
        push    ebp

; Offset from esp to the first argument after the four pushes above
; (4 saved registers + return address).
%assign _P 4*5

        mov     eax,[esp+_P+0]          ; eax = fz
        mov     ebp,[esp+_P+4]          ; ebp = n
        shl     ebp,3
        add     ebp,eax                 ; fn = fz + n*8 bytes
        push    ebp                     ; [esp] = fn; args are now at _P+4 / _P+8

        call    get_pc.bp
        add     ebp, PIC_BASE()

        lea     ecx,[PIC_EBP_REL(costab_fft)]
        xor     eax,eax
        mov     al,8                    ; eax = 8 for the first pass; multiplied by 4 per pass
.lp2:                                   ; do{
        mov     esi,[esp+_P+4]          ; esi = fi = fz
        lea     edx,[eax+eax*2]         ; edx = 3*eax
        mov     ebx, esi

; Where at most 2-way parallelism is available the FPU is faster, so this
; first loop uses x87 (comment reconstructed from a mis-encoded original).
        loopalign 16
.lp20:                                  ; do{
;                       f0     = fi[0 ] + fi[k1];
;                       f2     = fi[k2] + fi[k3];
;                       f1     = fi[0 ] - fi[k1];
;                       f3     = fi[k2] - fi[k3];
;                       fi[0 ] = f0     + f2;
;                       fi[k1] = f1     + f3;
;                       fi[k2] = f0     - f2;
;                       fi[k3] = f1     - f3;
        lea     edi,[ebx+eax]           ; edi = gi = fi + eax bytes
        fld     dword [ebx]
        fadd    dword [ebx+eax*2]       ; f0
        fld     dword [ebx+eax*4]
        fadd    dword [ebx+edx*2]       ; f2

        fld     dword [ebx]
        fsub    dword [ebx+eax*2]       ; f1
        fld     dword [ebx+eax*4]
        fsub    dword [ebx+edx*2]       ; f3   (st0..st3 = f3, f1, f2, f0)

        fld     st1
        fadd    st0,st1
        fstp    dword [ebx+eax*2]       ; fi[k1] = f1 + f3
        fsubp   st1,st0
        fstp    dword [ebx+edx*2]       ; fi[k3] = f1 - f3

        fld     st1
        fadd    st0,st1
        fstp    dword [ebx]             ; fi[0 ] = f0 + f2
        fsubp   st1,st0
        fstp    dword [ebx+eax*4]       ; fi[k2] = f0 - f2

        lea     ebx,[ebx + eax*8]       ; fi += (k1 * 4);
;                       g0     = gi[0 ] + gi[k1];
;                       g2     = SQRT2  * gi[k2];
;                       g1     = gi[0 ] - gi[k1];
;                       g3     = SQRT2  * gi[k3];
;                       gi[0 ] = g0     + g2;
;                       gi[k2] = g0     - g2;
;                       gi[k1] = g1     + g3;
;                       gi[k3] = g1     - g3;
        fld     dword [edi]
        fadd    dword [edi+eax*2]       ; g0
        fld     dword [PIC_EBP_REL(S_SQRT2)]
        fmul    dword [edi+eax*4]       ; g2

        fld     dword [edi]
        fsub    dword [edi+eax*2]       ; g1
        fld     dword [PIC_EBP_REL(S_SQRT2)]
        fmul    dword [edi+edx*2]       ; g3   (st0..st3 = g3, g1, g2, g0)

        fld     st1
        fadd    st0,st1
        fstp    dword [edi+eax*2]       ; gi[k1] = g1 + g3
        fsubp   st1,st0
        fstp    dword [edi+edx*2]       ; gi[k3] = g1 - g3

        fld     st1
        fadd    st0,st1
        fstp    dword [edi]             ; gi[0 ] = g0 + g2
        fsubp   st1,st0
        fstp    dword [edi+eax*4]       ; gi[k2] = g0 - g2

        cmp     ebx,[esp]
        jb      near .lp20              ; while (fi<fn);  (unsigned: pointer compare)


;               i = 1; //for (i=1;i<kx;i++){
;                       c1 = 1.0*t_c - 0.0*t_s;
;                       s1 = 0.0*t_c + 1.0*t_s;
        movlps  xmm6,[ecx]              ; = { --,  --,  s1, c1}
        movaps  xmm7,xmm6

        shufps  xmm6,xmm6,R4(0,1,1,0)   ; = {+c1, +s1, +s1, +c1} -> kept for the loop
;                       c2 = c1*c1 - s1*s1 = 1 - (2*s1)*s1;
;                       s2 = c1*s1 + s1*c1 = 2*s1*c1;
        shufps  xmm7,xmm7,R4(1,0,0,1)
        movss   xmm5,xmm7               ; = { --,  --,  --, s1}
        xorps   xmm7,[PIC_EBP_REL(Q_MMPP)] ; = {-s1, -c1, +c1, +s1} -> kept for the loop

        addss   xmm5,xmm5               ; = (--, --,  --, 2*s1)
        add     esi,4                   ; esi = fi = fz + i   (i = 1 float)
        shufps  xmm5,xmm5,R4(0,0,0,0)   ; = (2*s1, 2*s1, 2*s1, 2*s1)
        mulps   xmm5,xmm6               ; = (2*s1*c1, 2*s1*s1, 2*s1*s1, 2*s1*c1)
        subps   xmm5,[PIC_EBP_REL(D_1100)] ; = (--, 2*s1*s1-1, --, 2*s1*c1) = {-- -c2 -- s2}
        movaps  xmm4,xmm5
        shufps  xmm5,xmm5,R4(2,0,2,0)   ; = {-c2, s2, -c2, s2} -> kept for the loop

        xorps   xmm4,[PIC_EBP_REL(Q_MMPP)] ; = {--, c2, --, s2}
        shufps  xmm4,xmm4,R4(0,2,0,2)   ; = {s2, c2, s2, c2} -> kept for the loop

        loopalign 16
.lp21:                                  ; do{
;                       a       = c2*fi[k1] + s2*gi[k1];
;                       b       = s2*fi[k1] - c2*gi[k1];
;                       c       = c2*fi[k3] + s2*gi[k3];
;                       d       = s2*fi[k3] - c2*gi[k3];
;                       f0      = fi[0 ]        + a;
;                       g0      = gi[0 ]        + b;
;                       f2      = fi[k1 * 2]    + c;
;                       g2      = gi[k1 * 2]    + d;
;                       f1      = fi[0 ]        - a;
;                       g1      = gi[0 ]        - b;
;                       f3      = fi[k1 * 2]    - c;
;                       g3      = gi[k1 * 2]    - d;
        lea     edi,[esi + eax*2 - 8]   ; edi = gi = fz + k1 - i

        movss   xmm0,[esi + eax*2]      ; = fi[k1]
        movss   xmm2,[esi + edx*2]      ; = fi[k3]
        shufps  xmm0,xmm2,0x00          ; = {fi[k3], fi[k3], fi[k1], fi[k1]}
        movss   xmm1,[edi + eax*2]      ; = gi[k1]
        movss   xmm3,[edi + edx*2]      ; = gi[k3]
        shufps  xmm1,xmm3,0x00          ; = {gi[k3], gi[k3], gi[k1], gi[k1]}
        movss   xmm2,[esi]              ; = fi[0]
        mulps   xmm0,xmm4               ; *= {+s2, +c2, +s2, +c2}
        movss   xmm3,[esi + eax*4]      ; = fi[k2]
        unpcklps xmm2,xmm3              ; = {--, --, fi[k2], fi[0]}
        mulps   xmm1,xmm5               ; *= {-c2, +s2, -c2, +s2}
        movss   xmm3,[edi + eax*4]      ; = gi[k2]
        addps   xmm0,xmm1               ; = {d, c, b, a}
        movss   xmm1,[edi]              ; = gi[0]
        unpcklps xmm1,xmm3              ; = {--,  --, gi[k2], gi[0]}
        unpcklps xmm2,xmm1              ; = {gi[k2], fi[k2], gi[0], fi[0]}
        movaps  xmm1,xmm2
        addps   xmm1,xmm0               ; = {g2, f2, g0, f0}
        subps   xmm2,xmm0               ; = {g3, f3, g1, f1}

;                       a       = c1*f2     + s1*g3;
;                       c       = s1*g2     + c1*f3;
;                       b       = s1*f2     - c1*g3;
;                       d       = c1*g2     - s1*f3;
;                       fi[0 ]  = f0        + a;
;                       gi[0 ]  = g0        + c;
;                       gi[k1]  = g1        + b;
;                       fi[k1]  = f1        + d;
;                       fi[k1 * 2]  = f0    - a;
;                       gi[k1 * 2]  = g0    - c;
;                       gi[k3]  = g1        - b;
;                       fi[k3]  = f1        - d;
        movaps  xmm3,xmm1
        movhlps xmm1,xmm1               ; = {g2, f2, g2, f2}
        shufps  xmm3,xmm2,0x14          ; = {f1, g1, g0, f0}
        mulps   xmm1,xmm6               ; *= {+c1, +s1, +s1, +c1}
        shufps  xmm2,xmm2,0xBB          ; = {f3, g3, f3, g3}
        mulps   xmm2,xmm7               ; *= {-s1, -c1, +c1, +s1}
        addps   xmm1,xmm2               ; = {d, b, c, a}
        movaps  xmm2,xmm3
        addps   xmm3,xmm1               ; = {fi[k1], gi[k1], gi[0], fi[0]}
        subps   xmm2,xmm1               ; = {fi[k3], gi[k3], gi[k1*2], fi[k1*2]}
        ; Scatter the eight results one scalar at a time.
        movhlps xmm0,xmm3
        movss   [esi],xmm3
        shufps  xmm3,xmm3,0x55
        movss   [edi+eax*2],xmm0
        shufps  xmm0,xmm0,0x55
        movss   [edi],xmm3
        movss   [esi+eax*2],xmm0
        movhlps xmm0,xmm2
        movss   [esi+eax*4],xmm2
        shufps  xmm2,xmm2,0x55
        movss   [edi+edx*2],xmm0
        shufps  xmm0,xmm0,0x55
        movss   [edi+eax*4],xmm2
        movss   [esi+edx*2],xmm0
        lea     esi,[esi + eax*8]       ; fi += (k1 * 4);
        cmp     esi,[esp]
        jb      near .lp21              ; while (fi<fn);  (unsigned: pointer compare)


; The do loop before unrolling is 43+4 instructions.

; The for loop (which is not the innermost loop) is unrolled from i=2 on.
; kx=   2,   8,  32,  128
; k4=  16,  64, 256, 1024
;       0, 6/2,30/2,126/2

        xor     ebx,ebx
        mov     bl, 4*2                 ; ebx = i*sizeof(float) with i = 2
        cmp     ebx,eax                 ; i < kx
        jnl     near .F22
;               for (i=2;i<kx;i+=2){
        loopalign 16
.lp22:
; at here, xmm6 is {c3, s3, s3, c3}
;                       c1 = c3*t_c - s3*t_s;
;                       s1 = c3*t_s + s3*t_c;
        movlps  xmm0,[ecx]
        shufps  xmm0,xmm0,R4(1,1,0,0)   ; = {t_s, t_s, t_c, t_c}
        mulps   xmm6,xmm0               ; = {c3*ts, s3*ts, s3*tc, c3*tc}
        movhlps xmm4,xmm6               ; = {--, --, c3*ts, s3*ts}
        xorps   xmm4,[PIC_EBP_REL(Q_MPMP)] ; = {--, --, -c3*ts, s3*ts}
        subps   xmm6,xmm4               ; = {-,-, c3*ts+s3*tc, c3*tc-s3*ts}={-,-,s1,c1}

;                       c3 = c1*t_c - s1*t_s;
;                       s3 = s1*t_c + c1*t_s;
        shufps  xmm6,xmm6,0x14          ; = {c1, s1, s1, c1}
        mulps   xmm0,xmm6               ; = {ts*c1 ts*s1 tc*s1 tc*c1}
        movhlps xmm3,xmm0
        xorps   xmm3,[PIC_EBP_REL(Q_MPMP)]
        subps   xmm0,xmm3               ; = {--, --, s3, c3}

; {s2 s4 c4 c2} = {2*s1*c1 2*s3*c3 1-2*s3*s3 1-2*s1*s1}
        unpcklps xmm6,xmm0              ; xmm6 = {s3, s1, c3, c1}
        movaps  xmm7, xmm6
        shufps  xmm6,xmm6,R4(2,3,1,0)   ; xmm6 = {s1, s3, c3, c1}
        addps   xmm7, xmm7              ; {s3*2, s1*2,   --,   --}
        mov     edi,[esp+_P+4]          ; = fz
        shufps  xmm7, xmm7, R4(2,3,3,2) ; {s1*2, s3*2, s3*2, s1*2}
        sub     edi,ebx                 ; edi = fz - i
        mulps   xmm7, xmm6              ; {s1*s1*2, s3*s3*2, s3*c3*2, s1*c1*2}
        lea     esi,[edi + ebx*2]       ; esi = fi = fz + i
        subps   xmm7, [PIC_EBP_REL(D_1100)] ; {-c2, -c4, s4, s2}
        lea     edi,[edi + eax*2-4]     ; edi = gi - 1 float = fz + k1 - i - 1
                                        ; (pairs are loaded as {gi[-1], gi[0]})

;                       fi = fz +i;
;                       gi = fz +k1-i;
;                       do{
.lp220:
; The do loop after unrolling is 51+4 instructions.
;                               a       = c2*fi[k1  ] + s2*gi[k1  ];
;                               e       = c4*fi[k1+1] + s4*gi[k1-1];
;                               f       = s4*fi[k1+1] - c4*gi[k1-1];
;                               b       = s2*fi[k1  ] - c2*gi[k1  ];
;                               c       = c2*fi[k3  ] + s2*gi[k3  ];
;                               g       = c4*fi[k3+1] + s4*gi[k3-1];
;                               h       = s4*fi[k3+1] - c4*gi[k3-1];
;                               d       = s2*fi[k3  ] - c2*gi[k3  ];

        movaps  xmm4,xmm7               ; = {-c2 -c4  s4  s2}
        xorps   xmm4,[PIC_EBP_REL(Q_MMPP)] ; = { c2  c4  s4  s2}
        shufps  xmm4,xmm4,0x1B          ; = { s2  s4  c4  c2}
        movlps  xmm0,[esi+eax*2]
        movlps  xmm1,[edi+eax*2]
        movlps  xmm2,[esi+edx*2]
        movlps  xmm3,[edi+edx*2]
        shufps  xmm0,xmm0,0x14
        shufps  xmm1,xmm1,0x41
        shufps  xmm2,xmm2,0x14
        shufps  xmm3,xmm3,0x41
        mulps   xmm0,xmm4
        mulps   xmm1,xmm7
        mulps   xmm2,xmm4
        mulps   xmm3,xmm7
        addps   xmm0,xmm1               ; xmm0 = {b, f, e, a}
        addps   xmm2,xmm3               ; xmm2 = {d, h, g, c}
;17

;                               f0      = fi[0   ]    + a;
;                               f4      = fi[0 +1]    + e;
;                               g4      = gi[0 -1]    + f;
;                               g0      = gi[0   ]    + b;
;                               f1      = fi[0   ]    - a;
;                               f5      = fi[0 +1]    - e;
;                               g5      = gi[0 -1]    - f;
;                               g1      = gi[0   ]    - b;
;                               f2      = fi[k2  ]    + c;
;                               f6      = fi[k2+1]    + g;
;                               g6      = gi[k2-1]    + h;
;                               g2      = gi[k2  ]    + d;
;                               f3      = fi[k2  ]    - c;
;                               f7      = fi[k2+1]    - g;
;                               g7      = gi[k2-1]    - h;
;                               g3      = gi[k2  ]    - d;
        movlps  xmm1,[esi      ]
        movhps  xmm1,[edi      ]
        movaps  xmm4,xmm1
        subps   xmm1,xmm0               ; xmm1 = {g1, g5, f5, f1}
        movlps  xmm3,[esi+eax*4]
        movhps  xmm3,[edi+eax*4]
        movaps  xmm5,xmm3
        subps   xmm3,xmm2               ; xmm3 = {g3, g7, f7, f3}
        addps   xmm0,xmm4               ; xmm0 = {g0, g4, f4, f0}
        addps   xmm2,xmm5               ; xmm2 = {g2, g6, f6, f2}
;10

;                               a       = c1*f2     + s1*g3;    (same-order * same-order) + (reversed * reversed)
;                               e       = c3*f6     + s3*g7;
;                               g       = s3*g6     + c3*f7;
;                               c       = s1*g2     + c1*f3;
;                               d       = c1*g2     - s1*f3;    (same-order * reversed) - (reversed * same-order)
;                               h       = c3*g6     - s3*f7;
;                               f       = s3*f6     - c3*g7;
;                               b       = s1*f2     - c1*g3;

        movaps  xmm5,xmm6               ; xmm6 = {s1, s3, c3, c1}
        shufps  xmm5,xmm5,0x1B          ; = {c1, c3, s3, s1}
        movaps  xmm4,xmm2
        mulps   xmm4,xmm6
        shufps  xmm2,xmm2,0x1B          ; xmm2 = {f2, f6, g6, g2}
        mulps   xmm2,xmm6
        mulps   xmm5,xmm3
        mulps   xmm3,xmm6
        shufps  xmm3,xmm3,0x1B
        addps   xmm4,xmm3               ; = {c, g, e, a}
        subps   xmm2,xmm5               ; = {b, f, h, d}
;10

;                               fi[0   ] = f0       + a;
;                               fi[0 +1] = f4       + e;
;                               gi[0 -1] = g4       + g;
;                               gi[0   ] = g0       + c;
;                               fi[k2  ] = f0       - a;
;                               fi[k2+1] = f4       - e;
;                               gi[k2-1] = g4       - g;
;                               gi[k2  ] = g0       - c;
;                               fi[k1  ] = f1       + d;
;                               fi[k1+1] = f5       + h;
;                               gi[k1-1] = g5       + f;
;                               gi[k1  ] = g1       + b;
;                               fi[k3  ] = f1       - d;
;                               fi[k3+1] = f5       - h;
;                               gi[k3-1] = g5       - f;
;                               gi[k3  ] = g1       - b;
        movaps  xmm3,xmm0
        subps   xmm0,xmm4
        movlps  [esi+eax*4],xmm0
        movhps  [edi+eax*4],xmm0
        addps   xmm4,xmm3
        movlps  [esi      ],xmm4
        movhps  [edi      ],xmm4

        movaps  xmm5,xmm1
        subps   xmm1,xmm2
        movlps  [esi+edx*2],xmm1
        movhps  [edi+edx*2],xmm1
        addps   xmm2,xmm5
        movlps  [esi+eax*2],xmm2
        movhps  [edi+eax*2],xmm2
; 14
;                               gi     += k4;
;                               fi     += k4;
        lea     edi,[edi + eax*8]       ; gi += (k1 * 4);
        lea     esi,[esi + eax*8]       ; fi += (k1 * 4);
        cmp     esi,[esp]
        jb      near .lp220             ; while (fi<fn);  (unsigned: pointer compare)
;                       } while (fi<fn);

        add     ebx,byte 2*4            ; i += 2   (ebx is in bytes)
        cmp     ebx,eax                 ; i < kx
        ; shufps does not modify flags, so the cmp result survives to the jl.
        shufps  xmm6,xmm6,R4(1,2,2,1)   ; (--,s3,c3,--) => {c3, s3, s3, c3}
        jl      near .lp22
;               }
.F22:
        shl     eax,2                   ; next pass: eax *= 4
        add     ecx, byte 8             ; next (t_c, t_s) pair
        cmp     eax,[esp+_P+8]          ; compare against n
        jle     near .lp2               ; } while (eax <= n);
        pop     ebp                     ; discard the saved fn
        pop     ebp                     ; restore caller's ebp
        pop     edi
        pop     esi
        pop     ebx
        ret

        end