;******************************************************************************
;* FFT transform with SSE/AVX optimizations
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2011 Vitor Sessak
;*
;* This algorithm (though not any of the implementation details) is
;* based on libdjbfft by D. J. Bernstein.
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; These functions are not individually interchangeable with the C versions.
; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
; in blocks as convenient to the vector size.
; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)

%include "libavutil/x86/x86util.asm"

%if ARCH_X86_64
%define pointer resq
%else
%define pointer resd
%endif

struc FFTContext
    .nbits:    resd 1
    .reverse:  resd 1
    .revtab:   pointer 1
    .tmpbuf:   pointer 1
    .mdctsize: resd 1
    .mdctbits: resd 1
    .tcos:     pointer 1
    .tsin:     pointer 1
    .fftperm:  pointer 1
    .fftcalc:  pointer 1
    .imdctcalc:pointer 1
    .imdcthalf:pointer 1
endstruc

SECTION_RODATA 32

%define M_SQRT1_2 0.70710678118654752440
%define M_COS_PI_1_8 0.923879532511287
%define M_COS_PI_3_8 0.38268343236509

ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8
ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8

ps_root2: times 8 dd M_SQRT1_2
ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0

perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
ps_m1p1: dd 1<<31, 0

cextern ps_neg

%assign i 16
%rep 14
cextern cos_ %+ i
%assign i i<<1
%endrep

%if ARCH_X86_64
    %define pointer dq
%else
    %define pointer dd
%endif

%macro IF0 1+
%endmacro
%macro IF1 1+
    %1
%endmacro

SECTION .text

; in:  %1 = {r0,i0,r2,i2,r4,i4,r6,i6}
;      %2 = {r1,i1,r3,i3,r5,i5,r7,i7}
;      %3, %4, %5 tmp
; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3}
;      %2 = {r4,r5,r6,r7,i4,i5,i6,i7}
%macro T8_AVX 5
    vsubps     %5, %1, %2               ; v  = %1 - %2
    vaddps     %3, %1, %2               ; w  = %1 + %2
    vmulps     %2, %5, [ps_p1p1m1p1root2] ; v *= vals1
    vpermilps  %2, %2, [perm1]
    vblendps   %1, %2, %3, 0x33         ; q = {w1,w2,v4,v2,w5,w6,v7,v6}
    vshufps    %5, %3, %2, 0x4e         ; r = {w3,w4,v1,v3,w7,w8,v8,v5}
    vsubps     %4, %5, %1               ; s = r - q
    vaddps     %1, %5, %1               ; u = r + q
    vpermilps  %1, %1, [perm2]          ; k = {u1,u2,u3,u4,u6,u5,u7,u8}
    vshufps    %5, %4, %1, 0xbb
    vshufps    %3, %4, %1, 0xee
    vperm2f128 %3, %3, %5, 0x13
    vxorps     %4, %4, [ps_m1m1p1m1p1m1m1m1] ; s *= {1,1,-1,-1,1,-1,-1,-1}
    vshufps    %2, %1, %4, 0xdd
    vshufps    %1, %1, %4, 0x88
    vperm2f128 %4, %2, %1, 0x02         ; v = {k1,k3,s1,s3,k2,k4,s2,s4}
    vperm2f128 %1, %1, %2, 0x13         ; w = {k6,k8,s6,s8,k5,k7,s5,s7}
    vsubps     %5, %1, %3
    vblendps   %1, %5, %1, 0x55         ; w -= {0,s7,0,k7,0,s8,0,k8}
    vsubps     %2, %4, %1               ; %2 = v - w
    vaddps     %1, %4, %1               ; %1 = v + w
%endmacro
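; For orientation, the scalar radix-4 butterfly that T4_SSE below
; vectorizes (a sketch only; t1..t8 match the comments in the macro):
;
;   t1 = r0+r1   t2 = i0+i1   t3 = r0-r1   t4 = i0-i1
;   t6 = r2+r3   t5 = i2+i3   t7 = i2-i3   t8 = r3-r2
;   {r0,i0} = {t1+t6, t2+t5}   {r1,i1} = {t3+t7, t4+t8}
;   {r2,i2} = {t1-t6, t2-t5}   {r3,i3} = {t3-t7, t4-t8}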

; In SSE mode do one fft4 transform
; in:  %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
;
; In AVX mode do two fft4 transforms
; in:  %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7}
; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7}
%macro T4_SSE 3
    subps    %3, %1, %2       ; {t3,t4,-t8,t7}
    addps    %1, %1, %2       ; {t1,t2,t6,t5}
    xorps    %3, %3, [ps_p1p1m1p1]
    shufps   %2, %1, %3, 0xbe ; {t6,t5,t7,t8}
    shufps   %1, %1, %3, 0x44 ; {t1,t2,t3,t4}
    subps    %3, %1, %2       ; {r2,i2,r3,i3}
    addps    %1, %1, %2       ; {r0,i0,r1,i1}
    shufps   %2, %1, %3, 0xdd ; {i0,i1,i2,i3}
    shufps   %1, %1, %3, 0x88 ; {r0,r1,r2,r3}
%endmacro

; In SSE mode do one FFT8
; in:  %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7}
;
; In AVX mode do two FFT8
; in:  %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11}
;      %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15}
; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11}
;      %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15}
%macro T8_SSE 6
    addps    %6, %3, %4       ; {t1,t2,t3,t4}
    subps    %3, %3, %4       ; {r5,i5,r7,i7}
    shufps   %4, %3, %3, 0xb1 ; {i5,r5,i7,r7}
    mulps    %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
    mulps    %4, %4, [ps_root2]
    addps    %3, %3, %4       ; {t8,t7,ta,t9}
    shufps   %4, %6, %3, 0x9c ; {t1,t4,t7,ta}
    shufps   %6, %6, %3, 0x36 ; {t3,t2,t9,t8}
    subps    %3, %6, %4       ; {t6,t5,tc,tb}
    addps    %6, %6, %4       ; {t1,t2,t9,ta}
    shufps   %5, %6, %3, 0x8d ; {t2,ta,t6,tc}
    shufps   %6, %6, %3, 0xd8 ; {t1,t9,t5,tb}
    subps    %3, %1, %6       ; {r4,r5,r6,r7}
    addps    %1, %1, %6       ; {r0,r1,r2,r3}
    subps    %4, %2, %5       ; {i4,i5,i6,i7}
    addps    %2, %2, %5       ; {i0,i1,i2,i3}
%endmacro

%macro INTERL 5
%if cpuflag(avx)
    vunpckhps    %3, %2, %1
    vunpcklps    %2, %2, %1
    vextractf128 %4(%5), %2, 0
    vextractf128 %4 %+ H(%5), %3, 0
    vextractf128 %4(%5 + 1), %2, 1
    vextractf128 %4 %+ H(%5 + 1), %3, 1
%elif cpuflag(sse)
    mova     %3, %2
    unpcklps %2, %1
    unpckhps %3, %1
    mova     %4(%5),   %2
    mova     %4(%5+1), %3
%endif
%endmacro

; scheduled for cpu-bound sizes
%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
IF%1 mova    m4, Z(4)
IF%1 mova    m5, Z(5)
     mova    m0, %2 ; wre
     mova    m1, %3 ; wim
     mulps   m2, m4, m0 ; r2*wre
IF%1 mova    m6, Z2(6)
     mulps   m3, m5, m1 ; i2*wim
IF%1 mova    m7, Z2(7)
     mulps   m4, m4, m1 ; r2*wim
     mulps   m5, m5, m0 ; i2*wre
     addps   m2, m2, m3 ; r2*wre + i2*wim
     mulps   m3, m1, m7 ; i3*wim
     subps   m5, m5, m4 ; i2*wre - r2*wim
     mulps   m1, m1, m6 ; r3*wim
     mulps   m4, m0, m6 ; r3*wre
     mulps   m0, m0, m7 ; i3*wre
     subps   m4, m4, m3 ; r3*wre - i3*wim
     mova    m3, Z(0)
     addps   m0, m0, m1 ; i3*wre + r3*wim
     subps   m1, m4, m2 ; t3
     addps   m4, m4, m2 ; t5
     subps   m3, m3, m4 ; r2
     addps   m4, m4, Z(0) ; r0
     mova    m6, Z(2)
     mova    Z(4), m3
     mova    Z(0), m4
     subps   m3, m5, m0 ; t4
     subps   m4, m6, m3 ; r3
     addps   m3, m3, m6 ; r1
     mova    Z2(6), m4
     mova    Z(2), m3
     mova    m2, Z(3)
     addps   m3, m5, m0 ; t6
     subps   m2, m2, m1 ; i3
     mova    m7, Z(1)
     addps   m1, m1, Z(3) ; i1
     mova    Z2(7), m2
     mova    Z(3), m1
     subps   m4, m7, m3 ; i2
     addps   m3, m3, m7 ; i0
     mova    Z(5), m4
     mova    Z(1), m3
%endmacro
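; For reference, the scalar combine step shared by PASS_SMALL (above) and
; PASS_BIG (below), with w = wre + i*wim the twiddle factor; a..d are
; scratch names introduced here (a sketch following the inline comments):
;
;   a = r2*wre + i2*wim        b = i2*wre - r2*wim
;   c = r3*wre - i3*wim        d = i3*wre + r3*wim
;   t3 = c - a   t5 = c + a   t4 = b - d   t6 = b + d
;   Z(0),Z(4):  {r0,r2} = r0 +- t5    Z(2),Z2(6): {r1,r3} = r1 +- t4
;   Z(1),Z(5):  {i0,i2} = i0 +- t6    Z(3),Z2(7): {i1,i3} = i1 +- t3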

; scheduled to avoid store->load aliasing
%macro PASS_BIG 1 ; (!interleave)
    mova     m4, Z(4) ; r2
    mova     m5, Z(5) ; i2
    mova     m0, [wq] ; wre
    mova     m1, [wq+o1q] ; wim
    mulps    m2, m4, m0 ; r2*wre
    mova     m6, Z2(6) ; r3
    mulps    m3, m5, m1 ; i2*wim
    mova     m7, Z2(7) ; i3
    mulps    m4, m4, m1 ; r2*wim
    mulps    m5, m5, m0 ; i2*wre
    addps    m2, m2, m3 ; r2*wre + i2*wim
    mulps    m3, m1, m7 ; i3*wim
    mulps    m1, m1, m6 ; r3*wim
    subps    m5, m5, m4 ; i2*wre - r2*wim
    mulps    m4, m0, m6 ; r3*wre
    mulps    m0, m0, m7 ; i3*wre
    subps    m4, m4, m3 ; r3*wre - i3*wim
    mova     m3, Z(0)
    addps    m0, m0, m1 ; i3*wre + r3*wim
    subps    m1, m4, m2 ; t3
    addps    m4, m4, m2 ; t5
    subps    m3, m3, m4 ; r2
    addps    m4, m4, Z(0) ; r0
    mova     m6, Z(2)
    mova     Z(4), m3
    mova     Z(0), m4
    subps    m3, m5, m0 ; t4
    subps    m4, m6, m3 ; r3
    addps    m3, m3, m6 ; r1
IF%1 mova    Z2(6), m4
IF%1 mova    Z(2), m3
    mova     m2, Z(3)
    addps    m5, m5, m0 ; t6
    subps    m2, m2, m1 ; i3
    mova     m7, Z(1)
    addps    m1, m1, Z(3) ; i1
IF%1 mova    Z2(7), m2
IF%1 mova    Z(3), m1
    subps    m6, m7, m5 ; i2
    addps    m5, m5, m7 ; i0
IF%1 mova    Z(5), m6
IF%1 mova    Z(1), m5
%if %1==0
    INTERL m1, m3, m7, Z, 2
    INTERL m2, m4, m0, Z2, 6

    mova     m1, Z(0)
    mova     m2, Z(4)

    INTERL m5, m1, m3, Z, 0
    INTERL m6, m2, m7, Z, 4
%endif
%endmacro

%define Z(x) [r0+mmsize*x]
%define Z2(x) [r0+mmsize*x]
%define ZH(x) [r0+mmsize*x+mmsize/2]

INIT_YMM avx

%if HAVE_AVX_EXTERNAL
align 16
fft8_avx:
    mova      m0, Z(0)
    mova      m1, Z(1)
    T8_AVX    m0, m1, m2, m3, m4
    mova      Z(0), m0
    mova      Z(1), m1
    ret


align 16
fft16_avx:
    mova       m2, Z(2)
    mova       m3, Z(3)
    T4_SSE     m2, m3, m7

    mova       m0, Z(0)
    mova       m1, Z(1)
    T8_AVX     m0, m1, m4, m5, m7

    mova       m4, [ps_cos16_1]
    mova       m5, [ps_cos16_2]
    vmulps     m6, m2, m4
    vmulps     m7, m3, m5
    vaddps     m7, m7, m6
    vmulps     m2, m2, m5
    vmulps     m3, m3, m4
    vsubps     m3, m3, m2
    vblendps   m2, m7, m3, 0xf0
    vperm2f128 m3, m7, m3, 0x21
    vaddps     m4, m2, m3
    vsubps     m2, m3, m2
    vperm2f128 m2, m2, m2, 0x01
    vsubps     m3, m1, m2
    vaddps     m1, m1, m2
    vsubps     m5, m0, m4
    vaddps     m0, m0, m4
    vextractf128  Z(0), m0, 0
    vextractf128 ZH(0), m1, 0
    vextractf128  Z(1), m0, 1
    vextractf128 ZH(1), m1, 1
    vextractf128  Z(2), m5, 0
    vextractf128 ZH(2), m3, 0
    vextractf128  Z(3), m5, 1
    vextractf128 ZH(3), m3, 1
    ret

align 16
fft32_avx:
    call fft16_avx

    mova       m0, Z(4)
    mova       m1, Z(5)

    T4_SSE     m0, m1, m4

    mova       m2, Z(6)
    mova       m3, Z(7)

    T8_SSE     m0, m1, m2, m3, m4, m6
    ; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11}
    ; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15}

    vperm2f128 m4, m0, m2, 0x20
    vperm2f128 m5, m1, m3, 0x20
    vperm2f128 m6, m0, m2, 0x31
    vperm2f128 m7, m1, m3, 0x31

    PASS_SMALL 0, [cos_32], [cos_32+32]

    ret
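; fft32_interleave_avx below converts the transform's split block layout
; back to interleaved FFTComplex: after fft32_avx each Z(n) holds eight
; reals or eight imaginaries, so each .deint_loop iteration loads one
; vector of each, pairs them with vunpck{l,h}ps, and stores the lanes in
; {re,im} order, i.e. mmsize/4 = 8 complex values per iteration.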
fft32_interleave_avx:
    call fft32_avx
    mov  r2d, 32
.deint_loop:
    mova          m2, Z(0)
    mova          m3, Z(1)
    vunpcklps     m0, m2, m3
    vunpckhps     m1, m2, m3
    vextractf128  Z(0), m0, 0
    vextractf128 ZH(0), m1, 0
    vextractf128  Z(1), m0, 1
    vextractf128 ZH(1), m1, 1
    add  r0, mmsize*2
    sub  r2d, mmsize/4
    jg   .deint_loop
    ret

%endif

INIT_XMM sse

align 16
fft4_avx:
fft4_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova     Z(0), m0
    mova     Z(1), m1
    ret

align 16
fft8_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova     m2, Z(2)
    mova     m3, Z(3)
    T8_SSE   m0, m1, m2, m3, m4, m5
    mova     Z(0), m0
    mova     Z(1), m1
    mova     Z(2), m2
    mova     Z(3), m3
    ret

align 16
fft16_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova     m2, Z(2)
    mova     m3, Z(3)
    T8_SSE   m0, m1, m2, m3, m4, m5
    mova     m4, Z(4)
    mova     m5, Z(5)
    mova     Z(0), m0
    mova     Z(1), m1
    mova     Z(2), m2
    mova     Z(3), m3
    T4_SSE   m4, m5, m6
    mova     m6, Z2(6)
    mova     m7, Z2(7)
    T4_SSE   m6, m7, m0
    PASS_SMALL 0, [cos_16], [cos_16+16]
    ret


%define Z(x) [zcq + o1q*(x&6) + mmsize*(x&1)]
%define Z2(x) [zcq + o3q + mmsize*(x&1)]
%define ZH(x) [zcq + o1q*(x&6) + mmsize*(x&1) + mmsize/2]
%define Z2H(x) [zcq + o3q + mmsize*(x&1) + mmsize/2]

%macro DECL_PASS 2+ ; name, payload
align 16
%1:
DEFINE_ARGS zc, w, n, o1, o3
    lea o3q, [nq*3]
    lea o1q, [nq*8]
    shl o3q, 4
.loop:
    %2
    add zcq, mmsize*2
    add wq,  mmsize
    sub nd,  mmsize/8
    jg .loop
    rep ret
%endmacro

%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
    lea r2, [dispatch_tab%1]
    mov r2, [r2 + (%2q-2)*gprsize]
%ifdef PIC
    lea r3, [$$]
    add r2, r3
%endif
    call r2
%endmacro ; FFT_DISPATCH

INIT_YMM avx

%if HAVE_AVX_EXTERNAL
DECL_PASS pass_avx, PASS_BIG 1
DECL_PASS pass_interleave_avx, PASS_BIG 0

cglobal fft_calc, 2,5,8
    mov     r3d, [r0 + FFTContext.nbits]
    mov     r0, r1
    mov     r1, r3
    FFT_DISPATCH _interleave %+ SUFFIX, r1
    REP_RET

%endif

INIT_XMM sse

DECL_PASS pass_sse, PASS_BIG 1
DECL_PASS pass_interleave_sse, PASS_BIG 0

INIT_XMM sse
cglobal fft_calc, 2,5,8
    mov     r3d, [r0 + FFTContext.nbits]
    PUSH    r1
    PUSH    r3
    mov     r0, r1
    mov     r1, r3
    FFT_DISPATCH _interleave %+ SUFFIX, r1
    POP     rcx
    POP     r4
    cmp     rcx, 3+(mmsize/16)
    jg      .end
    mov     r2, -1
    add     rcx, 3
    shl     r2, cl
    sub     r4, r2
.loop:
    movaps   xmm0, [r4 + r2]
    movaps   xmm1, xmm0
    unpcklps xmm0, [r4 + r2 + 16]
    unpckhps xmm1, [r4 + r2 + 16]
    movaps   [r4 + r2],      xmm0
    movaps   [r4 + r2 + 16], xmm1
    add      r2, mmsize*2
    jl       .loop
.end:
    REP_RET

cglobal fft_permute, 2,7,1
    mov     r4,  [r0 + FFTContext.revtab]
    mov     r5,  [r0 + FFTContext.tmpbuf]
    mov     ecx, [r0 + FFTContext.nbits]
    mov     r2, 1
    shl     r2, cl
    xor     r0, r0
%if ARCH_X86_32
    mov     r1, r1m
%endif
.loop:
    movaps  xmm0, [r1 + 8*r0]
    movzx   r6, word [r4 + 2*r0]
    movzx   r3, word [r4 + 2*r0 + 2]
    movlps  [r5 + 8*r6], xmm0
    movhps  [r5 + 8*r3], xmm0
    add     r0, 2
    cmp     r0, r2
    jl      .loop
    shl     r2, 3
    add     r1, r2
    add     r5, r2
    neg     r2
; nbits >= 2 (FFT4) and sizeof(FFTComplex)=8 => at least 32B
.loopcopy:
    movaps  xmm0, [r5 + r2]
    movaps  xmm1, [r5 + r2 + 16]
    movaps  [r1 + r2],      xmm0
    movaps  [r1 + r2 + 16], xmm1
    add     r2, 32
    jl      .loopcopy
    REP_RET
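; imdct_calc below is the assembly counterpart of the C ff_imdct_calc:
; it runs imdct_half into the middle of the output buffer, then mirrors
; that half outwards with a sign flip. Roughly (a sketch; n = mdctsize,
; n2 = n/2, n4 = n/4):
;
;   imdct_half(s, output + n4, input);
;   for (k = 0; k < n4; k++) {
;       output[k]         = -output[n2 - k - 1];
;       output[n - k - 1] =  output[n2 + k];
;   }
;
; The shufps with 0x1b reverses each 4-float vector; the xorps against
; [ps_neg] supplies the negation.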
INIT_XMM sse
cglobal imdct_calc, 3,5,3
    mov     r3d, [r0 + FFTContext.mdctsize]
    mov     r4,  [r0 + FFTContext.imdcthalf]
    add     r1, r3
    PUSH    r3
    PUSH    r1
%if ARCH_X86_32
    push    r2
    push    r1
    push    r0
%else
    sub     rsp, 8+32*WIN64 ; allocate win64 shadow space
%endif
    call    r4
%if ARCH_X86_32
    add     esp, 12
%else
    add     rsp, 8+32*WIN64
%endif
    POP     r1
    POP     r3
    lea     r0, [r1 + 2*r3]
    mov     r2, r3
    sub     r3, mmsize
    neg     r2
    mova    m2, [ps_neg]
.loop:
    mova    m0, [r1 + r3]
    mova    m1, [r0 + r2]
    shufps  m0, m0, 0x1b
    shufps  m1, m1, 0x1b
    xorps   m0, m2
    mova    [r0 + r3], m1
    mova    [r1 + r2], m0
    sub     r3, mmsize
    add     r2, mmsize
    jl      .loop
    REP_RET

%ifdef PIC
%define SECTION_REL - $$
%else
%define SECTION_REL
%endif

%macro DECL_FFT 1-2 ; nbits, suffix
%ifidn %0, 1
%xdefine fullsuffix SUFFIX
%else
%xdefine fullsuffix %2 %+ SUFFIX
%endif
%xdefine list_of_fft fft4 %+ SUFFIX SECTION_REL, fft8 %+ SUFFIX SECTION_REL
%if %1>=5
%xdefine list_of_fft list_of_fft, fft16 %+ SUFFIX SECTION_REL
%endif
%if %1>=6
%xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL
%endif

%assign n 1<<%1
%rep 18-%1
%assign n2 n/2
%assign n4 n/4
%xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL

align 16
fft %+ n %+ fullsuffix:
    call fft %+ n2 %+ SUFFIX
    add r0, n*4 - (n&(-2<<%1))
    call fft %+ n4 %+ SUFFIX
    add r0, n*2 - (n2&(-2<<%1))
    call fft %+ n4 %+ SUFFIX
    sub r0, n*6 + (n2&(-2<<%1))
    lea r1, [cos_ %+ n]
    mov r2d, n4/2
    jmp pass %+ fullsuffix

%assign n n*2
%endrep
%undef n

align 8
dispatch_tab %+ fullsuffix: pointer list_of_fft
%endmacro ; DECL_FFT

%if HAVE_AVX_EXTERNAL
INIT_YMM avx
DECL_FFT 6
DECL_FFT 6, _interleave
%endif
INIT_XMM sse
DECL_FFT 5
DECL_FFT 5, _interleave

INIT_XMM sse
%undef mulps
%undef addps
%undef subps
%undef unpcklps
%undef unpckhps

%macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
    movaps   xmm0, [%3+%2*4]
    movaps   xmm1, [%3+%1*4-0x10]
    movaps   xmm2, xmm0
    shufps   xmm0, xmm1, 0x88
    shufps   xmm1, xmm2, 0x77
    movlps   xmm4, [%4+%2*2]
    movlps   xmm5, [%5+%2*2+0x0]
    movhps   xmm4, [%4+%1*2-0x8]
    movhps   xmm5, [%5+%1*2-0x8]
    movaps   xmm2, xmm0
    movaps   xmm3, xmm1
    mulps    xmm0, xmm5
    mulps    xmm1, xmm4
    mulps    xmm2, xmm4
    mulps    xmm3, xmm5
    subps    xmm1, xmm0
    addps    xmm2, xmm3
    movaps   xmm0, xmm1
    unpcklps xmm1, xmm2
    unpckhps xmm0, xmm2
%endmacro

%macro CMUL 6 ;j, xmm0, xmm1, z, tcos, tsin
    mulps   m6, %3, [%5+%1]
    mulps   m7, %2, [%5+%1]
    mulps   %2, %2, [%6+%1]
    mulps   %3, %3, [%6+%1]
    subps   %2, %2, m6
    addps   %3, %3, m7
%endmacro
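; CMUL above is a vectorized complex multiply by the rotation stored in
; the tcos/tsin tables: with a = %2, b = %3, c = [%5+%1] (tcos) and
; s = [%6+%1] (tsin), it leaves %2 = a*s - b*c and %3 = b*s + a*c, the
; real and imaginary parts of (a + i*b)*(s + i*c), using m6/m7 as
; scratch. POSROTATESHUF below applies it to both ends of the buffer at
; once, then reverses and reinterleaves the vectors while storing.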

%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
.post:
%if cpuflag(avx)
    vmovaps ymm1,   [%3+%1*2]
    vmovaps ymm0,   [%3+%1*2+0x20]
    vmovaps ymm3,   [%3+%2*2]
    vmovaps ymm2,   [%3+%2*2+0x20]

    CMUL %1, ymm0, ymm1, %3, %4, %5
    CMUL %2, ymm2, ymm3, %3, %4, %5
    vshufps    ymm1, ymm1, ymm1, 0x1b
    vshufps    ymm3, ymm3, ymm3, 0x1b
    vperm2f128 ymm1, ymm1, ymm1, 0x01
    vperm2f128 ymm3, ymm3, ymm3, 0x01
    vunpcklps  ymm6, ymm2, ymm1
    vunpckhps  ymm4, ymm2, ymm1
    vunpcklps  ymm7, ymm0, ymm3
    vunpckhps  ymm5, ymm0, ymm3

    vextractf128 [%3+%1*2],      ymm7, 0
    vextractf128 [%3+%1*2+0x10], ymm5, 0
    vextractf128 [%3+%1*2+0x20], ymm7, 1
    vextractf128 [%3+%1*2+0x30], ymm5, 1

    vextractf128 [%3+%2*2],      ymm6, 0
    vextractf128 [%3+%2*2+0x10], ymm4, 0
    vextractf128 [%3+%2*2+0x20], ymm6, 1
    vextractf128 [%3+%2*2+0x30], ymm4, 1
    sub %2, 0x20
    add %1, 0x20
    jl .post
%else
    movaps   xmm1, [%3+%1*2]
    movaps   xmm0, [%3+%1*2+0x10]
    CMUL     %1, xmm0, xmm1, %3, %4, %5
    movaps   xmm5, [%3+%2*2]
    movaps   xmm4, [%3+%2*2+0x10]
    CMUL     %2, xmm4, xmm5, %3, %4, %5
    shufps   xmm1, xmm1, 0x1b
    shufps   xmm5, xmm5, 0x1b
    movaps   xmm6, xmm4
    unpckhps xmm4, xmm1
    unpcklps xmm6, xmm1
    movaps   xmm2, xmm0
    unpcklps xmm0, xmm5
    unpckhps xmm2, xmm5
    movaps   [%3+%2*2],      xmm6
    movaps   [%3+%2*2+0x10], xmm4
    movaps   [%3+%1*2],      xmm0
    movaps   [%3+%1*2+0x10], xmm2
    sub %2, 0x10
    add %1, 0x10
    jl .post
%endif
%endmacro

%macro DECL_IMDCT 0
cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input
%if ARCH_X86_64
%define rrevtab r7
%define rtcos   r8
%define rtsin   r9
%else
%define rrevtab r6
%define rtsin   r6
%define rtcos   r5
%endif
    mov   r3d, [r0+FFTContext.mdctsize]
    add   r2, r3
    shr   r3, 1
    mov   rtcos, [r0+FFTContext.tcos]
    mov   rtsin, [r0+FFTContext.tsin]
    add   rtcos, r3
    add   rtsin, r3
%if ARCH_X86_64 == 0
    push  rtcos
    push  rtsin
%endif
    shr   r3, 1
    mov   rrevtab, [r0+FFTContext.revtab]
    add   rrevtab, r3
%if ARCH_X86_64 == 0
    push  rrevtab
%endif

    sub   r3, 4
%if ARCH_X86_64
    xor   r4, r4
    sub   r4, r3
%endif
.pre:
%if ARCH_X86_64 == 0
;unspill
    xor   r4, r4
    sub   r4, r3
    mov   rtcos, [esp+8]
    mov   rtsin, [esp+4]
%endif

    PREROTATER r4, r3, r2, rtcos, rtsin
%if ARCH_X86_64
    movzx  r5,  word [rrevtab+r4-4]
    movzx  r6,  word [rrevtab+r4-2]
    movzx  r10, word [rrevtab+r3]
    movzx  r11, word [rrevtab+r3+2]
    movlps [r1+r5 *8], xmm0
    movhps [r1+r6 *8], xmm0
    movlps [r1+r10*8], xmm1
    movhps [r1+r11*8], xmm1
    add    r4, 4
%else
    mov    r6, [esp]
    movzx  r5, word [r6+r4-4]
    movzx  r4, word [r6+r4-2]
    movlps [r1+r5*8], xmm0
    movhps [r1+r4*8], xmm0
    movzx  r5, word [r6+r3]
    movzx  r4, word [r6+r3+2]
    movlps [r1+r5*8], xmm1
    movhps [r1+r4*8], xmm1
%endif
    sub    r3, 4
    jns    .pre

    mov    r5, r0
    mov    r6, r1
    mov    r0, r1
    mov    r1d, [r5+FFTContext.nbits]

    FFT_DISPATCH SUFFIX, r1

    mov    r0d, [r5+FFTContext.mdctsize]
    add    r6, r0
    shr    r0, 1
%if ARCH_X86_64 == 0
%define rtcos r2
%define rtsin r3
    mov    rtcos, [esp+8]
    mov    rtsin, [esp+4]
%endif
    neg    r0
    mov    r1, -mmsize
    sub    r1, r0
    POSROTATESHUF r0, r1, r6, rtcos, rtsin
%if ARCH_X86_64 == 0
    add esp, 12
%endif
    RET
%endmacro

DECL_IMDCT

INIT_YMM avx

%if HAVE_AVX_EXTERNAL
DECL_IMDCT
%endif
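; imdct_half is instantiated twice from the same DECL_IMDCT body:
; ff_imdct_half_sse (INIT_XMM above) and ff_imdct_half_avx here. The
; pre-rotation (PREROTATER) is XMM-only in both builds, while
; FFT_DISPATCH and the cpuflag(avx) branch of POSROTATESHUF select the
; SUFFIX-specific FFT and the 256-bit post-rotation path.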