; from a new GOGO-no-coda (1999/09)
; Copyright (C) 1999 shigeo
; special thanks to Keiichi SAKAI, URURI
; hacked and back-ported to LAME
;        by Takehiro TOMINAGA Nov 2000
;
; Two MMX-register floating-point routines with the same C prototype:
;       void fht_3DN (float *fz, int nn);   3DNow! (pfadd/pfsub/pfmul, femms)
;       void fht_E3DN(float *fz, int nn);   additionally uses pfpnacc/pswapd
; Both work in place on the buffer fz.
;
; Names not defined in this file (r0..r5, pmov, pmovd, pupldq, puphdq,
; pushd, popd, proc, endproc, globaldef, loopalign, segment_data,
; segment_code, get_pc.bp and the PIC_* helpers) presumably come from
; nasm.h -- confirm there.
;
; Stack frame used by both routines (after pushing four registers and
; `sub esp, 20`; offsets assume pushd pushes four dwords):
;       [esp+ 0]  8-byte spill slot for the first  twiddle pair of .do3
;       [esp+ 8]  8-byte spill slot for the second twiddle pair of .do3
;       [esp+16]  fn = fz + nn*8 bytes, the end pointer tested by .do2/.do3
;       [esp+40]  fz
;       [esp+44]  nn
;
; Loop structure (identical in both routines):
;       .do1    outer pass; r3 advances 16 bytes through costab, r4 (kx,
;               in bytes) starts at 8 and is multiplied by 4 per pass;
;               repeats while the previous kx is below nn (unsigned).
;       .do2    butterflies that need no twiddle rotation, stepping fi by
;               kx*8 bytes until fn.
;       .for    r5 runs 4, 8, ... while below kx; each step rotates the
;               twiddle pair in mm6/mm7 by the table row at [r3], [r3+8].
;       .do3    twiddled butterflies for the current r5, stepping fi/gi by
;               kx*8 bytes until fn.
;
; Lane comments are written "high dword | low dword"; this matches the
; code if pmov/pupldq/puphdq behave like movq/punpckldq/punpckhdq.
;
; NOTE(review): costab holds four twiddle rows after the SQRT2 row, and
; .do1 consumes one row per pass (kx = 8, 32, 128, 512).  A fifth pass
; (nn > 512) would read past the table -- confirm the callers' nn range.

%include "nasm.h"

        globaldef fht_3DN

        segment_data
        align 16
; Row 0: sign mask (low dword 0x80000000, high dword 0).
; Row 1: SQRT2 in both lanes.
; Rows 2..9: twiddle constants, each value duplicated in both lanes.
costab          dd      0x80000000, 0
                dd      1.414213562,1.414213562
                dd      9.238795283293805e-01, 9.238795283293805e-01
                dd      3.826834424611044e-01, 3.826834424611044e-01
                dd      9.951847264044178e-01, 9.951847264044178e-01
                dd      9.801714304836734e-02, 9.801714304836734e-02
                dd      9.996988186794428e-01, 9.996988186794428e-01
                dd      2.454122920569705e-02, 2.454122920569705e-02
                dd      9.999811752815535e-01, 9.999811752815535e-01
                dd      6.135884819898878e-03, 6.135884819898878e-03
D_1_0_0_0       dd      0.0       , 1.0

        segment_code

PIC_OFFSETTABLE


;void fht_3DN(float *fz, int nn);

proc    fht_3DN

        pushd   ebp, ebx, esi, edi

        sub     esp, 20

        call    get_pc.bp
        add     ebp, PIC_BASE()

        mov     r0, [esp+40]            ;fi
        mov     r1, [esp+44]            ;r1 = nn
        lea     r3, [PIC_EBP_REL(costab)]       ;tri = costab
        lea     r4, [r0+r1*8]           ;r4 = fn = &fz[n]
        mov     [esp+16], r4
        mov     r4, 8                   ;kx = k1/2

        pmov    mm7, [r3]               ;sign mask (costab row 0)

        loopalign 16
.do1:
        lea     r3, [r3+16]             ;tri += 2;
        pmov    mm6, [PIC_EBP_REL(costab+8)]    ;SQRT2 | SQRT2
        lea     r2, [r4+r4*2]           ;k3*fsize/2
        mov     r5, 4                   ;i = 1*fsize

        loopalign 16
.do2:
        lea     r1, [r0+r4]             ;gi = fi + kx
        ;f
        pmov    mm0, [r0]               ;fi0
        pmov    mm1, [r0+r4*2]          ;fi1
        pmov    mm2, [r0+r2*2]          ;fi3
        pmov    mm3, [r0+r4*4]          ;fi2

        pupldq  mm0, mm0                ;fi0 | fi0
        pupldq  mm1, mm1                ;fi1 | fi1
        pupldq  mm2, mm2                ;fi3 | fi3
        pupldq  mm3, mm3                ;fi2 | fi2

        pxor    mm1, mm7                ;fi1 | -fi1
        pxor    mm3, mm7                ;fi2 | -fi2

        pfsub   mm0, mm1                ;fi0-fi1 | fi0+fi1 =  f1 | f0
        pfsub   mm2, mm3                ;fi3-fi2 | fi3+fi2 = -f3 | f2  (f3 = fi2-fi3)

        pmov    mm4, mm0
        pfadd   mm0, mm2                ;f1-f3|f0+f2 = fi3 | fi0
        pfsub   mm4, mm2                ;f1+f3|f0-f2 = fi1 | fi2

        pmovd   [r0], mm0               ;fi[0]
        puphdq  mm0, mm0
        pmovd   [r0+r4*4], mm4          ;fi[k2]
        puphdq  mm4, mm4

        pmovd   [r0+r4*2], mm4          ;fi[k1]
        pmovd   [r0+r2*2], mm0          ;fi[k3]
        lea     r0, [r0+r4*8]

        ;g
        pmov    mm0, [r1]               ;gi0
        pmov    mm1, [r1+r4*2]          ;gi1
        pmov    mm2, [r1+r4*4]          ;gi2
        pmov    mm3, [r1+r2*2]          ;gi3

        pupldq  mm1, mm1
        pupldq  mm0, mm0                ;gi0 | gi0
        pupldq  mm2, mm3                ;gi3 | gi2

        pxor    mm1, mm7                ;gi1 | -gi1

        pfsub   mm0, mm1                ;gi0-gi1|gi0+gi1 = g1 | g0
        pfmul   mm2, mm6                ;gi3*SQRT2|gi2*SQRT2 = g3 | g2

        pmov    mm4, mm0
        pfadd   mm0, mm2                ;g1+g3|g0+g2 = gi1 | gi0
        pfsub   mm4, mm2                ;g1-g3|g0-g2 = gi3 | gi2

        pmovd   [r1], mm0               ;gi[0]
        puphdq  mm0, mm0
        pmovd   [r1+r4*4], mm4          ;gi[k2]
        puphdq  mm4, mm4

        cmp     r0, [esp + 16]          ;flags survive the two stores below
        pmovd   [r1+r4*2], mm0          ;gi[k1]
        pmovd   [r1+r2*2], mm4          ;gi[k3]

        jb      near .do2

        pmov    mm6, [r3+r5]            ; this is not aligned address!!

        loopalign 16
.for:
;
; mm6 = c1 | s1
; mm7 = sign mask loaded from costab row 0 (dd 0x80000000, 0)
;
        pmov    mm1, mm6
        mov     r0, [esp+40]            ; fz
        puphdq  mm1, mm1                ; c1 | c1
        lea     r1, [r0+r4*2]
        pfadd   mm1, mm1                ; c1+c1 | c1+c1
        pfmul   mm1, mm6                ; 2*c1*c1 | 2*c1*s1
        pfsub   mm1, [PIC_EBP_REL(D_1_0_0_0)]   ; 2*c1*c1-1.0 | 2*c1*s1 = -c2 | s2

        pmov    mm0, mm1
        pxor    mm7, mm6                ; c1 | -s1

        pupldq  mm2, mm0
        pupldq  mm3, mm6                ; ** | c1
        puphdq  mm0, mm2                ; s2 | c2
        puphdq  mm6, mm3                ;-s1 | c1

        pxor    mm0, [PIC_EBP_REL(costab)]      ; c2 | -s2

; mm0 =  s2| c2
; mm1 = -c2| s2
; mm6 =  c1| s1
; mm7 =  s1|-c1 (we use the opposite sign. from GOGO here)

        pmov    [esp], mm0              ; spilled: .do3 reuses mm0/mm1 as scratch
        pmov    [esp+8], mm1

        sub     r1, r5                  ;r1 = gi
        add     r0, r5                  ;r0 = fi

        loopalign 16
.do3:
        pmov    mm2, [r0+r4*2]          ; fi[k1]
        pmov    mm4, [r1+r4*2]          ; gi[k1]
        pmov    mm3, [r0+r2*2]          ; fi[k3]
        pmov    mm5, [r1+r2*2]          ; gi[k3]

        pupldq  mm2, mm2                ; fi1 | fi1
        pupldq  mm4, mm4                ; gi1 | gi1
        pupldq  mm3, mm3                ; fi3 | fi3
        pupldq  mm5, mm5                ; gi3 | gi3

        pfmul   mm2, mm0                ; s2 * fi1 | c2 * fi1
        pfmul   mm4, mm1                ;-c2 * gi1 | s2 * gi1
        pfmul   mm3, mm0                ; s2 * fi3 | c2 * fi3
        pfmul   mm5, mm1                ;-c2 * gi3 | s2 * gi3

        pfadd   mm2, mm4                ;b | a
        pfadd   mm3, mm5                ;d | c

        pmov    mm0, [r0]
        pmov    mm4, [r1]
        pmov    mm1, [r0+r4*4]
        pmov    mm5, [r1+r4*4]

        pupldq  mm0, mm4                ;gi0 | fi0
        pupldq  mm1, mm5                ;gi2 | fi2

        pmov    mm4, mm2
        pmov    mm5, mm3

        pfadd   mm2, mm0                ;g0 | f0
        pfadd   mm3, mm1                ;g2 | f2

        pfsub   mm0, mm4                ;g1 | f1
        pfsub   mm1, mm5                ;g3 | f3

        pmov    mm4, mm3
        pmov    mm5, mm1

        pupldq  mm4, mm4                ;f2 | f2
        puphdq  mm5, mm5                ;g3 | g3
        puphdq  mm3, mm3                ;g2 | g2
        pupldq  mm1, mm1                ;f3 | f3

        pfmul   mm4, mm6                ;f2 * c1 | f2 * s1
        pfmul   mm5, mm7                ;g3 * s1 | g3 *-c1
        pfmul   mm3, mm6                ;g2 * c1 | g2 * s1
        pfmul   mm1, mm7                ;f3 * s1 | f3 *-c1

        pfadd   mm4, mm5                ;a | b
        pfsub   mm3, mm1                ;d | c

        pmov    mm5, mm2
        pmov    mm1, mm0

        pupldq  mm2, mm2                ;f0 | f0
        pupldq  mm0, mm0                ;f1 | f1

        puphdq  mm1, mm2                ;f0 | g1
        puphdq  mm5, mm0                ;f1 | g0

        pmov    mm2, mm4
        pmov    mm0, mm3

        pfadd   mm4, mm1                ;fi0 | gi1
        pfadd   mm3, mm5                ;fi1 | gi0
        pfsub   mm1, mm2                ;fi2 | gi3
        pfsub   mm5, mm0                ;fi3 | gi2

        pmovd   [r1+r4*2], mm4          ;gi[k1]
        puphdq  mm4, mm4
        pmovd   [r1], mm3               ;gi[0]
        puphdq  mm3, mm3
        pmovd   [r1+r2*2], mm1          ;gi[k3]
        puphdq  mm1, mm1
        pmovd   [r1+r4*4], mm5          ;gi[k2]
        puphdq  mm5, mm5

        pmovd   [r0], mm4               ;fi[0]
        pmovd   [r0+r4*2], mm3          ;fi[k1]
        pmovd   [r0+r4*4], mm1          ;fi[k2]
        pmovd   [r0+r2*2], mm5          ;fi[k3]

        lea     r0, [r0+r4*8]
        lea     r1, [r1+r4*8]
        cmp     r0, [esp + 16]          ;flags survive the two reloads below
        pmov    mm0, [esp]              ; reload spilled twiddle pair
        pmov    mm1, [esp+8]

        jb      near .do3

        add     r5, 4
; mm6 =  c1| s1
; mm7 =  s1|-c1 (we use the opposite sign. from GOGO here)
        pfmul   mm6, [r3]               ; c1*a | s1*a
        pfmul   mm7, [r3+8]             ; s1*b |-c1*b
        cmp     r5, r4                  ;flags survive until `jb near .for`

        pfsub   mm6, mm7                ; c1*a-s1*b | s1*a+c1*b
        pupldq  mm7,mm6
        puphdq  mm6,mm7
        pmov    mm7, [PIC_EBP_REL(costab)]      ; restore sign mask
        jb      near .for

        mov     r0, [esp+40]            ;fi
        cmp     r4, [esp+40+4]          ;old kx vs nn; lea below keeps flags
        lea     r4, [r4*4]              ;kx *= 4

        jb      near .do1
.exitttt:
        femms
        add     esp,20
        popd    ebp, ebx, esi, edi
endproc


;void fht_E3DN(float *fz, int nn);
;
; Same frame layout and loop structure as fht_3DN above; the butterflies
; use pfpnacc and pswapd instead of the pxor sign-mask / unpack sequences.

proc    fht_E3DN

        pushd   ebp, ebx, esi, edi

        sub     esp, 20

        call    get_pc.bp
        add     ebp, PIC_BASE()

        mov     r0, [esp+40]            ;fi
        mov     r1, [esp+44]            ;r1 = nn
        lea     r3, [PIC_EBP_REL(costab)]       ;tri = costab
        lea     r4, [r0+r1*8]           ;r4 = fn = &fz[n]
        mov     [esp+16], r4
        mov     r4, 8                   ;kx = k1/2

        pmov    mm7, [r3]               ;sign mask (costab row 0)

        loopalign 16
.do1:
        lea     r3, [r3+16]             ;tri += 2;
        pmov    mm6, [PIC_EBP_REL(costab+8)]    ;SQRT2 | SQRT2
        lea     r2, [r4+r4*2]           ;k3*fsize/2
        mov     r5, 4                   ;i = 1*fsize

        loopalign 16
.do2:
        lea     r1, [r0+r4]             ;gi = fi + kx
;f
        pmov    mm0, [r0]               ; X | fi0
        pmov    mm1, [r0+r4*4]          ; X | fi2
        pupldq  mm0, [r0+r4*2]          ;fi1 | fi0
        pupldq  mm1, [r0+r2*2]          ;fi3 | fi2
        pfpnacc mm0, mm0                ;fi0+fi1 | fi0-fi1 = f0|f1
        pfpnacc mm1, mm1                ;fi2+fi3 | fi2-fi3 = f2|f3

        pmov    mm2, mm0
        pfadd   mm0, mm1                ;f0+f2|f1+f3 = fi0 | fi1
        pfsub   mm2, mm1                ;f0-f2|f1-f3 = fi2 | fi3

        pmovd   [r0+r4*2], mm0          ;fi[k1]
        pmovd   [r0+r2*2], mm2          ;fi[k3]

        puphdq  mm0, mm0
        puphdq  mm2, mm2
        pmovd   [r0], mm0               ;fi[0]
        pmovd   [r0+r4*4], mm2          ;fi[k2]

        lea     r0, [r0+r4*8]
;g
        pmov    mm3, [r1]               ; gi0
        pmov    mm4, [r1+r2*2]          ; gi3
        pupldq  mm3, [r1+r4*2]          ;gi1|gi0
        pupldq  mm4, [r1+r4*4]          ;gi2|gi3

        pfpnacc mm3, mm3                ;gi0+gi1 |gi0-gi1 = f0|f1
        pfmul   mm4, mm6                ;gi2*SQRT2|gi3*SQRT2 = f2|f3

        pmov    mm5, mm3
        pfadd   mm3, mm4                ;f0+f2|f1+f3
        pfsub   mm5, mm4                ;f0-f2|f1-f3

        cmp     r0, [esp + 16]          ;flags survive the stores/unpacks below
        pmovd   [r1+r4*2], mm3          ;gi[k1]
        pmovd   [r1+r2*2], mm5          ;gi[k3]
        puphdq  mm3, mm3
        puphdq  mm5, mm5
        pmovd   [r1], mm3               ;gi[0]
        pmovd   [r1+r4*4], mm5          ;gi[k2]

        jb      near .do2

        pmov    mm6, [r3+r5]            ; this is not aligned address!!

        loopalign 16
.for:
;
; mm6 = c1 | s1
; mm7 = sign mask loaded from costab row 0 (dd 0x80000000, 0)
;
        pmov    mm5, mm6
        mov     r0, [esp+40]            ; fz
        puphdq  mm5, mm5                ; c1 | c1
        lea     r1, [r0+r4*2]
        pfadd   mm5, mm5                ; c1+c1 | c1+c1
        pfmul   mm5, mm6                ; 2*c1*c1 | 2*c1*s1
        pfsub   mm5, [PIC_EBP_REL(D_1_0_0_0)]   ; 2*c1*c1-1.0 | 2*c1*s1 = -c2 | s2

        pswapd  mm4, mm5                ; s2 |-c2
        pxor    mm4, mm7                ; s2 | c2
        pxor    mm7, mm6                ; c1 |-s1
        pswapd  mm6, mm6                ; s1 | c1

; mm4 =  s2| c2
; mm5 = -c2| s2
; mm6 =  c1| s1
; mm7 =  s1|-c1 (we use the opposite sign. from GOGO here)

        pmov    [esp], mm4              ; spilled: .do3 reuses mm4/mm5 as scratch
        pmov    [esp+8], mm5

        sub     r1, r5                  ;r1 = gi
        add     r0, r5                  ;r0 = fi

        loopalign 16
.do3:
        pmov    mm0, [r0+r2*2]          ; fi[k3]
        pmov    mm2, [r1+r2*2]          ; gi[k3]
        pmov    mm1, [r0+r4*2]          ; fi[k1]
        pmov    mm3, [r1+r4*2]          ; gi[k1]

        pupldq  mm0, mm0
        pupldq  mm2, mm2
        pupldq  mm1, mm1
        pupldq  mm3, mm3

        pfmul   mm0, mm4
        pfmul   mm2, mm5
        pfmul   mm1, mm4
        pfmul   mm3, mm5

        pfadd   mm0, mm2                ;d | c
        pfadd   mm1, mm3                ;b | a

        pmov    mm2, [r0+r4*4]          ;fi2
        pupldq  mm3, [r1+r4*4]          ;gi2 | -
        pmov    mm4, [r0]               ;fi0
        pupldq  mm5, [r1]               ;gi0 | -

        pupldq  mm2, mm0                ;c | fi2
        puphdq  mm3, mm0                ;d | gi2
        pupldq  mm4, mm1                ;a | fi0
        puphdq  mm5, mm1                ;b | gi0

        pfpnacc mm2, mm2                ;f2 | f3
        pfpnacc mm3, mm3                ;g2 | g3
        pfpnacc mm4, mm4                ;f0 | f1
        pfpnacc mm5, mm5                ;g0 | g1

        pmov    mm0, mm2
        pmov    mm1, mm3
        pupldq  mm2, mm2                ;f3 | f3
        pupldq  mm3, mm3                ;g3 | g3
        puphdq  mm0, mm0                ;f2 | f2
        puphdq  mm1, mm1                ;g2 | g2

        pswapd  mm4, mm4                ;f1 | f0
        pswapd  mm5, mm5                ;g1 | g0

        pfmul   mm0, mm7                ;f2 * s1 | f2 *-c1
        pfmul   mm3, mm6                ;g3 * c1 | g3 * s1
        pfmul   mm1, mm6                ;g2 * c1 | g2 * s1
        pfmul   mm2, mm7                ;f3 * s1 | f3 *-c1

        pfsub   mm0, mm3                ; b |-a
        pfsub   mm1, mm2                ; d | c

        pmov    mm2, mm5
        pmov    mm3, mm4
        pupldq  mm4, mm0                ;-a | f0
        pupldq  mm5, mm1                ; c | g0
        puphdq  mm2, mm0                ; b | g1
        puphdq  mm3, mm1                ; d | f1

        pfpnacc mm4, mm4                ;fi2 | fi0
        pfpnacc mm5, mm5                ;gi0 | gi2
        pfpnacc mm2, mm2                ;gi1 | gi3
        pfpnacc mm3, mm3                ;fi1 | fi3

        pmovd   [r0], mm4               ;fi[0]
        pmovd   [r1+r4*4], mm5          ;gi[k2]
        pmovd   [r1+r2*2], mm2          ;gi[k3]
        pmovd   [r0+r2*2], mm3          ;fi[k3]

        puphdq  mm4, mm4
        puphdq  mm5, mm5
        puphdq  mm2, mm2
        puphdq  mm3, mm3
        pmovd   [r0+r4*4], mm4          ;fi[k2]
        pmovd   [r1], mm5               ;gi[0]
        pmovd   [r1+r4*2], mm2          ;gi[k1]
        pmovd   [r0+r4*2], mm3          ;fi[k1]

        lea     r0, [r0+r4*8]
        lea     r1, [r1+r4*8]
        cmp     r0, [esp + 16]          ;flags survive the two reloads below
        pmov    mm4, [esp]              ; reload spilled twiddle pair
        pmov    mm5, [esp+8]

        jb      near .do3

        add     r5, 4
; mm6 =  c1| s1
; mm7 =  s1|-c1 (we use the opposite sign. from GOGO here)
        pfmul   mm6, [r3]               ; c1*a | s1*a
        pfmul   mm7, [r3+8]             ; s1*b |-c1*b
        cmp     r5, r4                  ;flags survive until `jb near .for`

        pfsub   mm6, mm7                ; c1*a-s1*b | s1*a+c1*b
        pswapd  mm6, mm6 ; ???          ; s1*a+c1*b | c1*a-s1*b
        pmov    mm7, [PIC_EBP_REL(costab)]      ; restore sign mask
        jb      near .for

        mov     r0, [esp+40]            ;fi
        cmp     r4, [esp+40+4]          ;old kx vs nn; lea below keeps flags
        lea     r4, [r4*4]              ;kx *= 4

        jb      near .do1
.exitttt:
        femms
        add     esp,20
        popd    ebp, ebx, esi, edi
endproc