;******************************************************************************
;* Copyright (c) Lynne
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; Open `doc/transforms.md` to see the code on which the transforms here are
; based, and to compare.

; TODO:
;       carry over registers from smaller transforms to save on ~8 loads/stores
;       check if vinsertf128 could be faster than vperm2f128 for duplication
;       even faster FFT8 (the current one is already optimized for instruction count)
;       replace some xors with blends + addsubs?
;       replace some shuffles with vblends?
;       avx512 split-radix

%include "libavutil/x86/x86util.asm"

%define private_prefix ff_tx

%if ARCH_X86_64
%define ptr resq
%else
%define ptr resd
%endif

%assign i 16
%rep 14
cextern tab_ %+ i %+ _float ; ff_tab_i_float...
%assign i (i << 1)
%endrep

struc AVTXContext
    .len:    resd 1 ; Length
    .inv:    resd 1 ; Inverse flag
    .map:    ptr  1 ; Lookup table(s)
    .exp:    ptr  1 ; Exponentiation factors
    .tmp:    ptr  1 ; Temporary data

    .sub:    ptr  1 ; Subcontexts
    .fn:     ptr  4 ; Subcontext functions
    .nb_sub: resd 1 ; Subcontext count

    ; Everything else is inaccessible
endstruc

SECTION_RODATA 32

%define POS 0x00000000
%define NEG 0x80000000

%define M_SQRT1_2 0.707106781186547524401
%define COS16_1   0.92387950420379638671875
%define COS16_3   0.3826834261417388916015625

d8_mult_odd:   dd M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2, \
                  M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2

s8_mult_odd:   dd 1.0, 1.0, -1.0, 1.0, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
s8_perm_even:  dd 1, 3, 0, 2, 1, 3, 2, 0
s8_perm_odd1:  dd 3, 3, 1, 1, 1, 1, 3, 3
s8_perm_odd2:  dd 1, 2, 0, 3, 1, 0, 0, 1

s16_mult_even: dd 1.0, 1.0, M_SQRT1_2, M_SQRT1_2, 1.0, -1.0, M_SQRT1_2, -M_SQRT1_2
s16_mult_odd1: dd COS16_1,  COS16_1,  COS16_3,  COS16_3,  COS16_1, -COS16_1,  COS16_3, -COS16_3
s16_mult_odd2: dd COS16_3, -COS16_3,  COS16_1, -COS16_1, -COS16_3, -COS16_3, -COS16_1, -COS16_1
s16_perm:      dd 0, 1, 2, 3, 1, 0, 3, 2

mask_mmmmpppm: dd NEG, NEG, NEG, NEG, POS, POS, POS, NEG
mask_ppmpmmpm: dd POS, POS, NEG, POS, NEG, NEG, POS, NEG
mask_mppmmpmp: dd NEG, POS, POS, NEG, NEG, POS, NEG, POS
mask_mpmppmpm: dd NEG, POS, NEG, POS, POS, NEG, POS, NEG
mask_pmmppmmp: dd POS, NEG, NEG, POS, POS, NEG, NEG, POS
mask_pmpmpmpm: times 4 dd POS, NEG

SECTION .text

; Load complex values (64 bits) via a lookup table
; %1 - output register
; %2 - GPR of base input memory address
; %3 - GPR of LUT (int32_t indices) address
; %4 - LUT offset
; %5 - temporary GPR (only used if vgather is not used)
; %6 - temporary register (for AVX only)
; %7 - temporary register (for AVX only, enables vgatherdpd (AVX2) if FMA3 is set)
%macro LOAD64_LUT 5-7
%if %0 > 6 && cpuflag(avx2)
    pcmpeqd %6, %6                    ; pcmpeqq has a 0.5 throughput on Zen 3, this has 0.25
    movapd xmm%7, [%3 + %4]           ; float mov since vgatherdpd is a float instruction
    vgatherdpd %1, [%2 + xmm%7*8], %6 ; must use separate registers for args
%else
    mov %5d, [%3 + %4 + 0]
    movsd xmm%1, [%2 + %5q*8]
%if mmsize == 32
    mov %5d, [%3 + %4 + 8]
    movsd xmm%6, [%2 + %5q*8]
%endif
    mov %5d, [%3 + %4 + 4]
    movhps xmm%1, [%2 + %5q*8]
%if mmsize == 32
    mov %5d, [%3 + %4 + 12]
    movhps xmm%6, [%2 + %5q*8]
    vinsertf128 %1, %1, xmm%6, 1
%endif
%endif
%endmacro
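
; Rough scalar equivalent of LOAD64_LUT above (a sketch, not part of the
; build; `in` is the base pointer in %2, `lut` the int32_t index table at
; %3 + %4, and n = mmsize/8, the number of complex floats per register):
;
;     AVComplexFloat dst[n];
;     for (int i = 0; i < n; i++)
;         dst[i] = ((const AVComplexFloat *)in)[lut[i]];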

; Single 2-point in-place complex FFT (will do 2 transforms at once in AVX mode)
; %1 - coefficients (r0.reim, r1.reim)
; %2 - temporary
%macro FFT2 2
    shufps   %2, %1, %1, q3322
    shufps   %1, %1, %1, q1100

    addsubps %1, %1, %2

    shufps   %1, %1, %1, q2031
%endmacro

; Single 4-point in-place complex FFT (will do 2 transforms at once in [AVX] mode)
; %1 - even coefficients (r0.reim, r2.reim, r4.reim, r6.reim)
; %2 - odd coefficients (r1.reim, r3.reim, r5.reim, r7.reim)
; %3 - temporary
%macro FFT4 3
    subps  %3, %1, %2        ; r1234, [r5678]
    addps  %1, %1, %2        ; t1234, [t5678]

    shufps %2, %1, %3, q1010 ; t12, r12
    shufps %1, %1, %3, q2332 ; t34, r43

    subps  %3, %2, %1        ; a34, b32
    addps  %2, %2, %1        ; a12, b14

    shufps %1, %2, %3, q1010 ; a1234 even

    shufps %2, %2, %3, q2332 ; b1423
    shufps %2, %2, %2, q1320 ; b1234 odd
%endmacro
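
; For reference, the plain forward 4-point DFT that a single FFT4 lane
; computes (a sketch; x0..x3 are the complex inputs, j the imaginary unit):
;
;     X0 = (x0 + x2) + (x1 + x3)
;     X1 = (x0 - x2) - j*(x1 - x3)
;     X2 = (x0 + x2) - (x1 + x3)
;     X3 = (x0 - x2) + j*(x1 - x3)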

; Single/Dual 8-point in-place complex FFT (will do 2 transforms in [AVX] mode)
; %1 - even coefficients (a0.reim, a2.reim, [b0.reim, b2.reim])
; %2 - even coefficients (a4.reim, a6.reim, [b4.reim, b6.reim])
; %3 - odd coefficients (a1.reim, a3.reim, [b1.reim, b3.reim])
; %4 - odd coefficients (a5.reim, a7.reim, [b5.reim, b7.reim])
; %5 - temporary
; %6 - temporary
%macro FFT8 6
    addps    %5, %1, %3              ; q1-8
    addps    %6, %2, %4              ; k1-8

    subps    %1, %1, %3              ; r1-8
    subps    %2, %2, %4              ; j1-8

    shufps   %4, %1, %1, q2323       ; r4343
    shufps   %3, %5, %6, q3032       ; q34, k14

    shufps   %1, %1, %1, q1010       ; r1212
    shufps   %5, %5, %6, q1210       ; q12, k32

    xorps    %4, %4, [mask_pmmppmmp] ; r4343 * pmmp
    addps    %6, %5, %3              ; s12, g12

    mulps    %2, %2, [d8_mult_odd]   ; r8 * d8_mult_odd
    subps    %5, %5, %3              ; s34, g43

    addps    %3, %1, %4              ; z1234
    unpcklpd %1, %6, %5              ; s1234

    shufps   %4, %2, %2, q2301       ; j2143
    shufps   %6, %6, %5, q2332       ; g1234

    addsubps %2, %2, %4              ; l2143
    shufps   %5, %2, %2, q0123       ; l3412
    addsubps %5, %5, %2              ; t1234

    subps    %2, %1, %6              ; h1234 even
    subps    %4, %3, %5              ; u1234 odd

    addps    %1, %1, %6              ; w1234 even
    addps    %3, %3, %5              ; o1234 odd
%endmacro

; Single 8-point in-place complex FFT in 20 instructions
; %1 - even coefficients (r0.reim, r2.reim, r4.reim, r6.reim)
; %2 - odd coefficients (r1.reim, r3.reim, r5.reim, r7.reim)
; %3 - temporary
; %4 - temporary
%macro FFT8_AVX 4
    subps      %3, %1, %2              ; r1234, r5678
    addps      %1, %1, %2              ; q1234, q5678

    vpermilps  %2, %3, [s8_perm_odd1]  ; r4422, r6688
    shufps     %4, %1, %1, q3322       ; q1122, q5566

    movsldup   %3, %3                  ; r1133, r5577
    shufps     %1, %1, %1, q1100       ; q3344, q7788

    addsubps   %3, %3, %2              ; z1234, z5678
    addsubps   %1, %1, %4              ; s3142, s7586

    mulps      %3, %3, [s8_mult_odd]   ; z * s8_mult_odd
    vpermilps  %1, %1, [s8_perm_even]  ; s1234, s5687 !

    shufps     %2, %3, %3, q2332       ; junk, z7887
    xorps      %4, %1, [mask_mmmmpppm] ; e1234, e5687 !

    vpermilps  %3, %3, [s8_perm_odd2]  ; z2314, z6556
    vperm2f128 %1, %1, %4, 0x03        ; e5687, s1234

    addsubps   %2, %2, %3              ; junk, t5678
    subps      %1, %1, %4              ; w1234, w5678 even

    vperm2f128 %2, %2, %2, 0x11        ; t5678, t5678
    vperm2f128 %3, %3, %3, 0x00        ; z2314, z2314

    xorps      %2, %2, [mask_ppmpmmpm] ; t * ppmpmmpm
    addps      %2, %3, %2              ; u1234, u5678 odd
%endmacro

; Single 16-point in-place complex FFT
; %1 - even coefficients (r0.reim, r2.reim, r4.reim, r6.reim)
; %2 - even coefficients (r8.reim, r10.reim, r12.reim, r14.reim)
; %3 - odd coefficients (r1.reim, r3.reim, r5.reim, r7.reim)
; %4 - odd coefficients (r9.reim, r11.reim, r13.reim, r15.reim)
; %5, %6 - temporary
; %7, %8 - temporary (optional)
%macro FFT16 6-8
    FFT4 %3, %4, %5
%if %0 > 7
    FFT8_AVX %1, %2, %6, %7
    movaps %8, [mask_mpmppmpm]
    movaps %7, [s16_perm]
%define mask %8
%define perm %7
%elif %0 > 6
    FFT8_AVX %1, %2, %6, %7
    movaps %7, [s16_perm]
%define mask [mask_mpmppmpm]
%define perm %7
%else
    FFT8_AVX %1, %2, %6, %5
%define mask [mask_mpmppmpm]
%define perm [s16_perm]
%endif
    xorps %5, %5, %5                    ; 0

    shufps %6, %4, %4, q2301            ; z12.imre, z13.imre...
    shufps %5, %5, %3, q2301            ; 0, 0, z8.imre...

    mulps %4, %4, [s16_mult_odd1]       ; z.reim * costab
    xorps %5, %5, [mask_mppmmpmp]
%if cpuflag(fma3)
    fmaddps %6, %6, [s16_mult_odd2], %4 ; s[8..15]
    addps %5, %3, %5                    ; s[0...7]
%else
    mulps %6, %6, [s16_mult_odd2]       ; z.imre * costab

    addps %5, %3, %5                    ; s[0...7]
    addps %6, %4, %6                    ; s[8..15]
%endif
    mulps %5, %5, [s16_mult_even]       ; s[0...7]*costab

    xorps %4, %6, mask                  ; s[8..15]*mpmppmpm
    xorps %3, %5, mask                  ; s[0...7]*mpmppmpm

    vperm2f128 %4, %4, %4, 0x01         ; s[12..15, 8..11]
    vperm2f128 %3, %3, %3, 0x01         ; s[4..7, 0..3]

    addps %6, %6, %4                    ; y56, u56, y34, u34
    addps %5, %5, %3                    ; w56, x56, w34, x34

    vpermilps %6, %6, perm              ; y56, u56, y43, u43
    vpermilps %5, %5, perm              ; w56, x56, w43, x43

    subps %4, %2, %6                    ; odd  part 2
    addps %3, %2, %6                    ; odd  part 1

    subps %2, %1, %5                    ; even part 2
    addps %1, %1, %5                    ; even part 1
%undef mask
%undef perm
%endmacro
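
; Structure note on FFT16 above (a summary, compare doc/transforms.md): it is
; a single split-radix step, with FFT8_AVX run on the even-index half (%1/%2)
; and one dual-lane FFT4 covering the two odd-index quarters (%3/%4), followed
; by a combine stage driven by the s16_mult_* tables, where
; COS16_1 = cos(1*pi/8) and COS16_3 = cos(3*pi/8).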

; Combines m0...m8 (tx1[even, even, odd, odd], tx2,3[even], tx2,3[odd]) coeffs
; Uses all 16 registers.
; Output is slightly permuted such that tx2,3's coefficients are interleaved
; on a 2-point basis (look at `doc/transforms.md`)
%macro SPLIT_RADIX_COMBINE 17
%if %1 && mmsize == 32
    vperm2f128 %14, %6, %7, 0x20 ; m2[0], m2[1], m3[0], m3[1] even
    vperm2f128 %16, %9, %8, 0x20 ; m2[0], m2[1], m3[0], m3[1] odd
    vperm2f128 %15, %6, %7, 0x31 ; m2[2], m2[3], m3[2], m3[3] even
    vperm2f128 %17, %9, %8, 0x31 ; m2[2], m2[3], m3[2], m3[3] odd
%endif

    shufps %12, %10, %10, q2200  ; cos00224466
    shufps %13, %11, %11, q1133  ; wim77553311
    movshdup %10, %10            ; cos11335577
    shufps %11, %11, %11, q0022  ; wim66442200

%if %1 && mmsize == 32
    shufps %6, %14, %14, q2301   ; m2[0].imre, m2[1].imre, m2[2].imre, m2[3].imre even
    shufps %8, %16, %16, q2301   ; m2[0].imre, m2[1].imre, m2[2].imre, m2[3].imre odd
    shufps %7, %15, %15, q2301   ; m3[0].imre, m3[1].imre, m3[2].imre, m3[3].imre even
    shufps %9, %17, %17, q2301   ; m3[0].imre, m3[1].imre, m3[2].imre, m3[3].imre odd

    mulps %14, %14, %13          ; m2[0123]reim * wim7531 even
    mulps %16, %16, %11          ; m2[0123]reim * wim7531 odd
    mulps %15, %15, %13          ; m3[0123]reim * wim7531 even
    mulps %17, %17, %11          ; m3[0123]reim * wim7531 odd
%else
    mulps %14, %6, %13           ; m2,3[01]reim * wim7531 even
    mulps %16, %8, %11           ; m2,3[01]reim * wim7531 odd
    mulps %15, %7, %13           ; m2,3[23]reim * wim7531 even
    mulps %17, %9, %11           ; m2,3[23]reim * wim7531 odd
    ; reorder the multiplies to save movs reg, reg in the %if above
    shufps %6, %6, %6, q2301     ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre even
    shufps %8, %8, %8, q2301     ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre odd
    shufps %7, %7, %7, q2301     ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre even
    shufps %9, %9, %9, q2301     ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre odd
%endif

%if cpuflag(fma3) ; 11 - 5 = 6 instructions saved through FMA!
    fmaddsubps %6, %6, %12, %14  ; w[0..8] even
    fmaddsubps %8, %8, %10, %16  ; w[0..8] odd
    fmsubaddps %7, %7, %12, %15  ; j[0..8] even
    fmsubaddps %9, %9, %10, %17  ; j[0..8] odd
    movaps %13, [mask_pmpmpmpm]  ; "subaddps? pfft, who needs that!"
%else
    mulps %6, %6, %12            ; m2,3[01]imre * cos0246
    mulps %8, %8, %10            ; m2,3[01]imre * cos0246
    movaps %13, [mask_pmpmpmpm]  ; "subaddps? pfft, who needs that!"
    mulps %7, %7, %12            ; m2,3[23]reim * cos0246
    mulps %9, %9, %10            ; m2,3[23]reim * cos0246
    addsubps %6, %6, %14         ; w[0..8]
    addsubps %8, %8, %16         ; w[0..8]
    xorps %15, %15, %13          ; +-m2,3[23]imre * wim7531
    xorps %17, %17, %13          ; +-m2,3[23]imre * wim7531
    addps %7, %7, %15            ; j[0..8]
    addps %9, %9, %17            ; j[0..8]
%endif

    addps %14, %6, %7            ; t10235476 even
    addps %16, %8, %9            ; t10235476 odd
    subps %15, %6, %7            ; +-r[0..7] even
    subps %17, %8, %9            ; +-r[0..7] odd

    shufps %14, %14, %14, q2301  ; t[0..7] even
    shufps %16, %16, %16, q2301  ; t[0..7] odd
    xorps %15, %15, %13          ; r[0..7] even
    xorps %17, %17, %13          ; r[0..7] odd

    subps %6, %2, %14            ; m2,3[01] even
    subps %8, %4, %16            ; m2,3[01] odd
    subps %7, %3, %15            ; m2,3[23] even
    subps %9, %5, %17            ; m2,3[23] odd

    addps %2, %2, %14            ; m0 even
    addps %4, %4, %16            ; m0 odd
    addps %3, %3, %15            ; m1 even
    addps %5, %5, %17            ; m1 odd
%endmacro
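
; What one combine computes per output index k, in scalar form (a sketch of
; the conjugate-pair split-radix step described in doc/transforms.md; e = the
; len/2 even sub-transform, o1/o2 = the two len/4 odd sub-transforms, w = the
; twiddles from the tab_*_float tables):
;
;     t1 = w[k]*o1[k];  t2 = conj(w[k])*o2[k];
;     out[k]           = e[k]         + (t1 + t2);
;     out[k +   len/4] = e[k + len/4] - 1j*(t1 - t2);
;     out[k +   len/2] = e[k]         - (t1 + t2);
;     out[k + 3*len/4] = e[k + len/4] + 1j*(t1 - t2);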

; Same as above, only does one parity at a time, takes 3 temporary registers,
; however, if the twiddles aren't needed after this, the registers they use
; can be used as any of the temporary registers.
%macro SPLIT_RADIX_COMBINE_HALF 10
%if %1
    shufps %8, %6, %6, q2200    ; cos00224466
    shufps %9, %7, %7, q1133    ; wim77553311
%else
    shufps %8, %6, %6, q3311    ; cos11335577
    shufps %9, %7, %7, q0022    ; wim66442200
%endif

    mulps %10, %4, %9           ; m2,3[01]reim * wim7531 even
    mulps %9, %9, %5            ; m2,3[23]reim * wim7531 even

    shufps %4, %4, %4, q2301    ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre even
    shufps %5, %5, %5, q2301    ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre even

%if cpuflag(fma3)
    fmaddsubps %4, %4, %8, %10  ; w[0..8] even
    fmsubaddps %5, %5, %8, %9   ; j[0..8] even
    movaps %10, [mask_pmpmpmpm]
%else
    mulps %4, %4, %8            ; m2,3[01]imre * cos0246
    mulps %5, %5, %8            ; m2,3[23]reim * cos0246
    addsubps %4, %4, %10        ; w[0..8]
    movaps %10, [mask_pmpmpmpm]
    xorps %9, %9, %10           ; +-m2,3[23]imre * wim7531
    addps %5, %5, %9            ; j[0..8]
%endif

    addps %8, %4, %5            ; t10235476
    subps %9, %4, %5            ; +-r[0..7]

    shufps %8, %8, %8, q2301    ; t[0..7]
    xorps %9, %9, %10           ; r[0..7]

    subps %4, %2, %8            ; m2,3[01]
    subps %5, %3, %9            ; m2,3[23]

    addps %2, %2, %8            ; m0
    addps %3, %3, %9            ; m1
%endmacro

; Same as above, tries REALLY hard to use 2 temporary registers.
%macro SPLIT_RADIX_COMBINE_LITE 9
%if %1
    shufps %8, %6, %6, q2200          ; cos00224466
    shufps %9, %7, %7, q1133          ; wim77553311
%else
    shufps %8, %6, %6, q3311          ; cos11335577
    shufps %9, %7, %7, q0022          ; wim66442200
%endif

    mulps %9, %9, %4                  ; m2,3[01]reim * wim7531 even
    shufps %4, %4, %4, q2301          ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre even

%if cpuflag(fma3)
    fmaddsubps %4, %4, %8, %9         ; w[0..8] even
%else
    mulps %4, %4, %8                  ; m2,3[01]imre * cos0246
    addsubps %4, %4, %9               ; w[0..8]
%endif

%if %1
    shufps %9, %7, %7, q1133          ; wim77553311
%else
    shufps %9, %7, %7, q0022          ; wim66442200
%endif

    mulps %9, %9, %5                  ; m2,3[23]reim * wim7531 even
    shufps %5, %5, %5, q2301          ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre even
%if cpuflag(fma3)
    fmsubaddps %5, %5, %8, %9         ; j[0..8] even
%else
    mulps %5, %5, %8                  ; m2,3[23]reim * cos0246
    xorps %9, %9, [mask_pmpmpmpm]     ; +-m2,3[23]imre * wim7531
    addps %5, %5, %9                  ; j[0..8]
%endif

    addps %8, %4, %5                  ; t10235476
    subps %9, %4, %5                  ; +-r[0..7]

    shufps %8, %8, %8, q2301          ; t[0..7]
    xorps %9, %9, [mask_pmpmpmpm]     ; r[0..7]

    subps %4, %2, %8                  ; m2,3[01]
    subps %5, %3, %9                  ; m2,3[23]

    addps %2, %2, %8                  ; m0
    addps %3, %3, %9                  ; m1
%endmacro

%macro SPLIT_RADIX_COMBINE_64 0
    SPLIT_RADIX_COMBINE_LITE 1, m0, m1, tx1_e0, tx2_e0, tw_e, tw_o, tmp1, tmp2

    movaps [outq +  0*mmsize], m0
    movaps [outq +  4*mmsize], m1
    movaps [outq +  8*mmsize], tx1_e0
    movaps [outq + 12*mmsize], tx2_e0

    SPLIT_RADIX_COMBINE_HALF 0, m2, m3, tx1_o0, tx2_o0, tw_e, tw_o, tmp1, tmp2, m0

    movaps [outq +  2*mmsize], m2
    movaps [outq +  6*mmsize], m3
    movaps [outq + 10*mmsize], tx1_o0
    movaps [outq + 14*mmsize], tx2_o0

    movaps tw_e, [tab_64_float + mmsize]
    vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7 - mmsize], 0x23

    movaps m0, [outq + 1*mmsize]
    movaps m1, [outq + 3*mmsize]
    movaps m2, [outq + 5*mmsize]
    movaps m3, [outq + 7*mmsize]

    SPLIT_RADIX_COMBINE 0, m0, m2, m1, m3, tx1_e1, tx2_e1, tx1_o1, tx2_o1, tw_e, tw_o, \
                        tmp1, tmp2, tx2_o0, tx1_o0, tx2_e0, tx1_e0 ; temporary registers

    movaps [outq +  1*mmsize], m0
    movaps [outq +  3*mmsize], m1
    movaps [outq +  5*mmsize], m2
    movaps [outq +  7*mmsize], m3

    movaps [outq +  9*mmsize], tx1_e1
    movaps [outq + 11*mmsize], tx1_o1
    movaps [outq + 13*mmsize], tx2_e1
    movaps [outq + 15*mmsize], tx2_o1
%endmacro

; Perform a single even/odd split radix combination with loads and stores
; The _4 indicates this is a quarter of the iterations required to complete a
; full combine loop
; %1 must contain len*2, %2 must contain len*4, %3 must contain len*6
%macro SPLIT_RADIX_LOAD_COMBINE_4 8
    movaps m8, [rtabq + (%5)*mmsize + %7]
    vperm2f128 m9, m9, [itabq - (%5)*mmsize + %8], 0x23

    movaps m0, [outq +      (0 + %4)*mmsize + %6]
    movaps m2, [outq +      (2 + %4)*mmsize + %6]
    movaps m1, [outq + %1 + (0 + %4)*mmsize + %6]
    movaps m3, [outq + %1 + (2 + %4)*mmsize + %6]

    movaps m4, [outq + %2 + (0 + %4)*mmsize + %6]
    movaps m6, [outq + %2 + (2 + %4)*mmsize + %6]
    movaps m5, [outq + %3 + (0 + %4)*mmsize + %6]
    movaps m7, [outq + %3 + (2 + %4)*mmsize + %6]

    SPLIT_RADIX_COMBINE 0, m0, m1, m2, m3, \
                           m4, m5, m6, m7, \
                           m8, m9, \
                           m10, m11, m12, m13, m14, m15

    movaps [outq +      (0 + %4)*mmsize + %6], m0
    movaps [outq +      (2 + %4)*mmsize + %6], m2
    movaps [outq + %1 + (0 + %4)*mmsize + %6], m1
    movaps [outq + %1 + (2 + %4)*mmsize + %6], m3

    movaps [outq + %2 + (0 + %4)*mmsize + %6], m4
    movaps [outq + %2 + (2 + %4)*mmsize + %6], m6
    movaps [outq + %3 + (0 + %4)*mmsize + %6], m5
    movaps [outq + %3 + (2 + %4)*mmsize + %6], m7
%endmacro

%macro SPLIT_RADIX_LOAD_COMBINE_FULL 2-5
%if %0 > 2
%define offset_c %3
%else
%define offset_c 0
%endif
%if %0 > 3
%define offset_r %4
%else
%define offset_r 0
%endif
%if %0 > 4
%define offset_i %5
%else
%define offset_i 0
%endif

    SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 0, 0, offset_c, offset_r, offset_i
    SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 1, 1, offset_c, offset_r, offset_i
    SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 4, 2, offset_c, offset_r, offset_i
    SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 5, 3, offset_c, offset_r, offset_i
%endmacro
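
; Usage sketch (this matches the invocations in the size-specific blocks
; below): for a length-N synthesis pass, %1 = 2*N and %2 = 6*N, i.e. the byte
; offsets of the second and fourth output quarters (each complex float is
; 8 bytes, so the full output is N*8 bytes), e.g.:
;
;     SPLIT_RADIX_LOAD_COMBINE_FULL 2*128, 6*128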

; Perform a single even/odd split radix combination with loads, deinterleaves
; and stores. The _2 indicates this is half of the iterations required to
; complete a full combine+deinterleave loop
; %3 must contain len*2, %4 must contain len*4, %5 must contain len*6
%macro SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 6
    movaps m8, [rtabq + (0 + %2)*mmsize]
    vperm2f128 m9, m9, [itabq - (0 + %2)*mmsize], 0x23

    movaps m0, [outq +      (0 + 0 + %1)*mmsize + %6]
    movaps m2, [outq +      (2 + 0 + %1)*mmsize + %6]
    movaps m1, [outq + %3 + (0 + 0 + %1)*mmsize + %6]
    movaps m3, [outq + %3 + (2 + 0 + %1)*mmsize + %6]

    movaps m4, [outq + %4 + (0 + 0 + %1)*mmsize + %6]
    movaps m6, [outq + %4 + (2 + 0 + %1)*mmsize + %6]
    movaps m5, [outq + %5 + (0 + 0 + %1)*mmsize + %6]
    movaps m7, [outq + %5 + (2 + 0 + %1)*mmsize + %6]

    SPLIT_RADIX_COMBINE 0, m0, m1, m2, m3, \
                           m4, m5, m6, m7, \
                           m8, m9, \
                           m10, m11, m12, m13, m14, m15

    unpckhpd m10, m0, m2
    unpckhpd m11, m1, m3
    unpckhpd m12, m4, m6
    unpckhpd m13, m5, m7
    unpcklpd m0, m0, m2
    unpcklpd m1, m1, m3
    unpcklpd m4, m4, m6
    unpcklpd m5, m5, m7

    vextractf128 [outq +      (0 + 0 + %1)*mmsize + %6 +  0], m0, 0
    vextractf128 [outq +      (0 + 0 + %1)*mmsize + %6 + 16], m10, 0
    vextractf128 [outq + %3 + (0 + 0 + %1)*mmsize + %6 +  0], m1, 0
    vextractf128 [outq + %3 + (0 + 0 + %1)*mmsize + %6 + 16], m11, 0

    vextractf128 [outq + %4 + (0 + 0 + %1)*mmsize + %6 +  0], m4, 0
    vextractf128 [outq + %4 + (0 + 0 + %1)*mmsize + %6 + 16], m12, 0
    vextractf128 [outq + %5 + (0 + 0 + %1)*mmsize + %6 +  0], m5, 0
    vextractf128 [outq + %5 + (0 + 0 + %1)*mmsize + %6 + 16], m13, 0

    vperm2f128 m10, m10, m0, 0x13
    vperm2f128 m11, m11, m1, 0x13
    vperm2f128 m12, m12, m4, 0x13
    vperm2f128 m13, m13, m5, 0x13

    movaps m8, [rtabq + (1 + %2)*mmsize]
    vperm2f128 m9, m9, [itabq - (1 + %2)*mmsize], 0x23

    movaps m0, [outq +      (0 + 1 + %1)*mmsize + %6]
    movaps m2, [outq +      (2 + 1 + %1)*mmsize + %6]
    movaps m1, [outq + %3 + (0 + 1 + %1)*mmsize + %6]
    movaps m3, [outq + %3 + (2 + 1 + %1)*mmsize + %6]

    movaps [outq +      (0 + 1 + %1)*mmsize + %6], m10 ; m0 conflict
    movaps [outq + %3 + (0 + 1 + %1)*mmsize + %6], m11 ; m1 conflict

    movaps m4, [outq + %4 + (0 + 1 + %1)*mmsize + %6]
    movaps m6, [outq + %4 + (2 + 1 + %1)*mmsize + %6]
    movaps m5, [outq + %5 + (0 + 1 + %1)*mmsize + %6]
    movaps m7, [outq + %5 + (2 + 1 + %1)*mmsize + %6]

    movaps [outq + %4 + (0 + 1 + %1)*mmsize + %6], m12 ; m4 conflict
    movaps [outq + %5 + (0 + 1 + %1)*mmsize + %6], m13 ; m5 conflict

    SPLIT_RADIX_COMBINE 0, m0, m1, m2, m3, \
                           m4, m5, m6, m7, \
                           m8, m9, \
                           m10, m11, m12, m13, m14, m15 ; temporary registers

    unpcklpd m8, m0, m2
    unpcklpd m9, m1, m3
    unpcklpd m10, m4, m6
    unpcklpd m11, m5, m7
    unpckhpd m0, m0, m2
    unpckhpd m1, m1, m3
    unpckhpd m4, m4, m6
    unpckhpd m5, m5, m7

    vextractf128 [outq +      (2 + 0 + %1)*mmsize + %6 +  0], m8, 0
    vextractf128 [outq +      (2 + 0 + %1)*mmsize + %6 + 16], m0, 0
    vextractf128 [outq +      (2 + 1 + %1)*mmsize + %6 +  0], m8, 1
    vextractf128 [outq +      (2 + 1 + %1)*mmsize + %6 + 16], m0, 1

    vextractf128 [outq + %3 + (2 + 0 + %1)*mmsize + %6 +  0], m9, 0
    vextractf128 [outq + %3 + (2 + 0 + %1)*mmsize + %6 + 16], m1, 0
    vextractf128 [outq + %3 + (2 + 1 + %1)*mmsize + %6 +  0], m9, 1
    vextractf128 [outq + %3 + (2 + 1 + %1)*mmsize + %6 + 16], m1, 1

    vextractf128 [outq + %4 + (2 + 0 + %1)*mmsize + %6 +  0], m10, 0
    vextractf128 [outq + %4 + (2 + 0 + %1)*mmsize + %6 + 16], m4, 0
    vextractf128 [outq + %4 + (2 + 1 + %1)*mmsize + %6 +  0], m10, 1
    vextractf128 [outq + %4 + (2 + 1 + %1)*mmsize + %6 + 16], m4, 1

    vextractf128 [outq + %5 + (2 + 0 + %1)*mmsize + %6 +  0], m11, 0
    vextractf128 [outq + %5 + (2 + 0 + %1)*mmsize + %6 + 16], m5, 0
    vextractf128 [outq + %5 + (2 + 1 + %1)*mmsize + %6 +  0], m11, 1
    vextractf128 [outq + %5 + (2 + 1 + %1)*mmsize + %6 + 16], m5, 1
%endmacro

%macro SPLIT_RADIX_COMBINE_DEINTERLEAVE_FULL 2-3
%if %0 > 2
%define offset %3
%else
%define offset 0
%endif
    SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 0, 0, %1, %1*2, %2, offset
    SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 4, 2, %1, %1*2, %2, offset
%endmacro

INIT_XMM sse3
cglobal fft2_float, 4, 4, 2, ctx, out, in, stride
    movaps m0, [inq]
    FFT2 m0, m1
    movaps [outq], m0
    RET

; Renamed from the coefficient macro above to match the FFT8_SSE_FN/FFT16_FN
; naming and avoid overloading FFT4
%macro FFT4_FN 2
INIT_XMM sse2
cglobal fft4_ %+ %1 %+ _float, 4, 4, 3, ctx, out, in, stride
    movaps m0, [inq + 0*mmsize]
    movaps m1, [inq + 1*mmsize]

%if %2
    shufps m2, m1, m0, q3210
    shufps m0, m0, m1, q3210
    movaps m1, m2
%endif

    FFT4 m0, m1, m2

    unpcklpd m2, m0, m1
    unpckhpd m0, m0, m1

    movaps [outq + 0*mmsize], m2
    movaps [outq + 1*mmsize], m0

    RET
%endmacro

FFT4_FN fwd, 0
FFT4_FN inv, 1

%macro FFT8_SSE_FN 2
INIT_XMM sse3
cglobal fft8_ %+ %1, 4, 4, 6, ctx, out, in, tmp
%if %2
    mov ctxq, [ctxq + AVTXContext.map]
    LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq
    LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq
    LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq
    LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq
%else
    movaps m0, [inq + 0*mmsize]
    movaps m1, [inq + 1*mmsize]
    movaps m2, [inq + 2*mmsize]
    movaps m3, [inq + 3*mmsize]
%endif

    FFT8 m0, m1, m2, m3, m4, m5

    unpcklpd m4, m0, m3
    unpcklpd m5, m1, m2
    unpckhpd m0, m0, m3
    unpckhpd m1, m1, m2

    movups [outq + 0*mmsize], m4
    movups [outq + 1*mmsize], m0
    movups [outq + 2*mmsize], m5
    movups [outq + 3*mmsize], m1

    RET
%endmacro

FFT8_SSE_FN float, 1
FFT8_SSE_FN ns_float, 0

%macro FFT8_AVX_FN 2
INIT_YMM avx
cglobal fft8_ %+ %1, 4, 4, 4, ctx, out, in, tmp
%if %2
    mov ctxq, [ctxq + AVTXContext.map]
    LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m2
    LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m3
%else
    movaps m0, [inq + 0*mmsize]
    movaps m1, [inq + 1*mmsize]
%endif

    FFT8_AVX m0, m1, m2, m3

    unpcklpd m2, m0, m1
    unpckhpd m0, m0, m1

    ; Around 2% faster than 2x vperm2f128 + 2x movapd
    vextractf128 [outq + 16*0], m2, 0
    vextractf128 [outq + 16*1], m0, 0
    vextractf128 [outq + 16*2], m2, 1
    vextractf128 [outq + 16*3], m0, 1

    RET
%endmacro

FFT8_AVX_FN float, 1
FFT8_AVX_FN ns_float, 0
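
; None of these entry points is meant to be called directly; av_tx_init()
; picks one at runtime. A minimal C-side sketch of how such a transform gets
; used via the public API in libavutil/tx.h:
;
;     AVTXContext *tx = NULL;
;     av_tx_fn tx_fn;
;     float scale = 1.0f;
;     av_tx_init(&tx, &tx_fn, AV_TX_FLOAT_FFT, 0, 8, &scale, 0);
;     tx_fn(tx, out, in, sizeof(AVComplexFloat)); /* AVComplexFloat in[8], out[8] */
;     av_tx_uninit(&tx);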

%macro FFT16_FN 3
INIT_YMM %1
cglobal fft16_ %+ %2, 4, 4, 8, ctx, out, in, tmp
%if %3
    movaps m0, [inq + 0*mmsize]
    movaps m1, [inq + 1*mmsize]
    movaps m2, [inq + 2*mmsize]
    movaps m3, [inq + 3*mmsize]
%else
    mov ctxq, [ctxq + AVTXContext.map]
    LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m4
    LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m5
    LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq, m6
    LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq, m7
%endif

    FFT16 m0, m1, m2, m3, m4, m5, m6, m7

    unpcklpd m5, m1, m3
    unpcklpd m4, m0, m2
    unpckhpd m1, m1, m3
    unpckhpd m0, m0, m2

    vextractf128 [outq + 16*0], m4, 0
    vextractf128 [outq + 16*1], m0, 0
    vextractf128 [outq + 16*2], m4, 1
    vextractf128 [outq + 16*3], m0, 1
    vextractf128 [outq + 16*4], m5, 0
    vextractf128 [outq + 16*5], m1, 0
    vextractf128 [outq + 16*6], m5, 1
    vextractf128 [outq + 16*7], m1, 1

    RET
%endmacro

FFT16_FN avx,  float,    0
FFT16_FN avx,  ns_float, 1
FFT16_FN fma3, float,    0
FFT16_FN fma3, ns_float, 1

%macro FFT32_FN 3
INIT_YMM %1
cglobal fft32_ %+ %2, 4, 4, 16, ctx, out, in, tmp
%if %3
    movaps m4, [inq + 4*mmsize]
    movaps m5, [inq + 5*mmsize]
    movaps m6, [inq + 6*mmsize]
    movaps m7, [inq + 7*mmsize]
%else
    mov ctxq, [ctxq + AVTXContext.map]
    LOAD64_LUT m4, inq, ctxq, (mmsize/2)*4, tmpq, m8,  m12
    LOAD64_LUT m5, inq, ctxq, (mmsize/2)*5, tmpq, m9,  m13
    LOAD64_LUT m6, inq, ctxq, (mmsize/2)*6, tmpq, m10, m14
    LOAD64_LUT m7, inq, ctxq, (mmsize/2)*7, tmpq, m11, m15
%endif

    FFT8 m4, m5, m6, m7, m8, m9

%if %3
    movaps m0, [inq + 0*mmsize]
    movaps m1, [inq + 1*mmsize]
    movaps m2, [inq + 2*mmsize]
    movaps m3, [inq + 3*mmsize]
%else
    LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m8,  m12
    LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m9,  m13
    LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq, m10, m14
    LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq, m11, m15
%endif

    movaps m8, [tab_32_float]
    vperm2f128 m9, m9, [tab_32_float + 4*8 - 4*7], 0x23

    FFT16 m0, m1, m2, m3, m10, m11, m12, m13

    SPLIT_RADIX_COMBINE 1, m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, \
                        m10, m11, m12, m13, m14, m15 ; temporary registers

    unpcklpd m9,  m1, m3
    unpcklpd m10, m5, m7
    unpcklpd m8,  m0, m2
    unpcklpd m11, m4, m6
    unpckhpd m1,  m1, m3
    unpckhpd m5,  m5, m7
    unpckhpd m0,  m0, m2
    unpckhpd m4,  m4, m6

    vextractf128 [outq + 16* 0], m8,  0
    vextractf128 [outq + 16* 1], m0,  0
    vextractf128 [outq + 16* 2], m8,  1
    vextractf128 [outq + 16* 3], m0,  1
    vextractf128 [outq + 16* 4], m9,  0
    vextractf128 [outq + 16* 5], m1,  0
    vextractf128 [outq + 16* 6], m9,  1
    vextractf128 [outq + 16* 7], m1,  1

    vextractf128 [outq + 16* 8], m11, 0
    vextractf128 [outq + 16* 9], m4,  0
    vextractf128 [outq + 16*10], m11, 1
    vextractf128 [outq + 16*11], m4,  1
    vextractf128 [outq + 16*12], m10, 0
    vextractf128 [outq + 16*13], m5,  0
    vextractf128 [outq + 16*14], m10, 1
    vextractf128 [outq + 16*15], m5,  1

    RET
%endmacro

%if ARCH_X86_64
FFT32_FN avx,  float,    0
FFT32_FN avx,  ns_float, 1
FFT32_FN fma3, float,    0
FFT32_FN fma3, ns_float, 1
%endif

%macro FFT_SPLIT_RADIX_DEF 1-2
ALIGN 16
.%1 %+ pt:
    PUSH lenq
    mov lenq, (%1/4)

    add outq, (%1*4) - (%1/1)
    call .32pt

    add outq, (%1*2) - (%1/2) ; the synth loops also increment outq
    call .32pt

    POP lenq
    sub outq, (%1*4) + (%1*2) + (%1/2)

    lea rtabq, [tab_ %+ %1 %+ _float]
    lea itabq, [tab_ %+ %1 %+ _float + %1 - 4*7]

%if %0 > 1
    cmp tgtq, %1
    je .deinterleave

    mov tmpq, %1

.synth_ %+ %1:
    SPLIT_RADIX_LOAD_COMBINE_FULL 2*%1, 6*%1, 0, 0, 0
    add outq, 8*mmsize
    add rtabq, 4*mmsize
    sub itabq, 4*mmsize
    sub tmpq, 4*mmsize
    jg .synth_ %+ %1

    cmp lenq, %1
    jg %2 ; can't do math here, nasm doesn't get it
    ret
%endif
%endmacro

%macro FFT_SPLIT_RADIX_FN 3
INIT_YMM %1
cglobal fft_sr_ %+ %2, 4, 8, 16, 272, lut, out, in, len, tmp, itab, rtab, tgt
    movsxd lenq, dword [lutq + AVTXContext.len]
    mov lutq, [lutq + AVTXContext.map]
    mov tgtq, lenq

; Bottom-most/32-point transform ===============================================
ALIGN 16
.32pt:
%if %3
    movaps m4, [inq + 4*mmsize]
    movaps m5, [inq + 5*mmsize]
    movaps m6, [inq + 6*mmsize]
    movaps m7, [inq + 7*mmsize]
%else
    LOAD64_LUT m4, inq, lutq, (mmsize/2)*4, tmpq, m8,  m12
    LOAD64_LUT m5, inq, lutq, (mmsize/2)*5, tmpq, m9,  m13
    LOAD64_LUT m6, inq, lutq, (mmsize/2)*6, tmpq, m10, m14
    LOAD64_LUT m7, inq, lutq, (mmsize/2)*7, tmpq, m11, m15
%endif

    FFT8 m4, m5, m6, m7, m8, m9

%if %3
    movaps m0, [inq + 0*mmsize]
    movaps m1, [inq + 1*mmsize]
    movaps m2, [inq + 2*mmsize]
    movaps m3, [inq + 3*mmsize]
%else
    LOAD64_LUT m0, inq, lutq, (mmsize/2)*0, tmpq, m8,  m12
    LOAD64_LUT m1, inq, lutq, (mmsize/2)*1, tmpq, m9,  m13
    LOAD64_LUT m2, inq, lutq, (mmsize/2)*2, tmpq, m10, m14
    LOAD64_LUT m3, inq, lutq, (mmsize/2)*3, tmpq, m11, m15
%endif

    movaps m8, [tab_32_float]
    vperm2f128 m9, m9, [tab_32_float + 32 - 4*7], 0x23

    FFT16 m0, m1, m2, m3, m10, m11, m12, m13

    SPLIT_RADIX_COMBINE 1, m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, \
                        m10, m11, m12, m13, m14, m15 ; temporary registers

    movaps [outq + 1*mmsize], m1
    movaps [outq + 3*mmsize], m3
    movaps [outq + 5*mmsize], m5
    movaps [outq + 7*mmsize], m7

%if %3
    add inq, 8*mmsize
%else
    add lutq, (mmsize/2)*8
%endif
    cmp lenq, 32
    jg .64pt

    movaps [outq + 0*mmsize], m0
    movaps [outq + 2*mmsize], m2
    movaps [outq + 4*mmsize], m4
    movaps [outq + 6*mmsize], m6

    ret

; 64-point transform ===========================================================
ALIGN 16
.64pt:
; Helper defines, these make it easier to track what's happening
%define tx1_e0 m4
%define tx1_e1 m5
%define tx1_o0 m6
%define tx1_o1 m7
%define tx2_e0 m8
%define tx2_e1 m9
%define tx2_o0 m10
%define tx2_o1 m11
%define tw_e   m12
%define tw_o   m13
%define tmp1   m14
%define tmp2   m15

    SWAP m4, m1
    SWAP m6, m3

%if %3
    movaps tx1_e0, [inq + 0*mmsize]
    movaps tx1_e1, [inq + 1*mmsize]
    movaps tx1_o0, [inq + 2*mmsize]
    movaps tx1_o1, [inq + 3*mmsize]
%else
    LOAD64_LUT tx1_e0, inq, lutq, (mmsize/2)*0, tmpq, tw_e, tmp1
    LOAD64_LUT tx1_e1, inq, lutq, (mmsize/2)*1, tmpq, tw_o, tmp2
    LOAD64_LUT tx1_o0, inq, lutq, (mmsize/2)*2, tmpq, tw_e, tmp1
    LOAD64_LUT tx1_o1, inq, lutq, (mmsize/2)*3, tmpq, tw_o, tmp2
%endif

    FFT16 tx1_e0, tx1_e1, tx1_o0, tx1_o1, tw_e, tw_o, tx2_o0, tx2_o1

%if %3
    movaps tx2_e0, [inq + 4*mmsize]
    movaps tx2_e1, [inq + 5*mmsize]
    movaps tx2_o0, [inq + 6*mmsize]
    movaps tx2_o1, [inq + 7*mmsize]
%else
    LOAD64_LUT tx2_e0, inq, lutq, (mmsize/2)*4, tmpq, tw_e, tmp1
    LOAD64_LUT tx2_e1, inq, lutq, (mmsize/2)*5, tmpq, tw_o, tmp2
    LOAD64_LUT tx2_o0, inq, lutq, (mmsize/2)*6, tmpq, tw_e, tmp1
    LOAD64_LUT tx2_o1, inq, lutq, (mmsize/2)*7, tmpq, tw_o, tmp2
%endif

    FFT16 tx2_e0, tx2_e1, tx2_o0, tx2_o1, tmp1, tmp2, tw_e, tw_o

    movaps tw_e, [tab_64_float]
    vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7], 0x23

%if %3
    add inq, 8*mmsize
%else
    add lutq, (mmsize/2)*8
%endif
    cmp tgtq, 64
    je .deinterleave

    SPLIT_RADIX_COMBINE_64

    cmp lenq, 64
    jg .128pt
    ret
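
; Every .NNNpt block below follows the same pattern as the 64-point case
; above: save lenq, run two half-sized passes through .32pt, rewind outq,
; point rtabq/itabq at this size's twiddle tables, then either jump to the
; final deinterleave (if this is the target size) or run the synth loop and
; continue to the next size while lenq is still larger.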

; 128-point transform ==========================================================
ALIGN 16
.128pt:
    PUSH lenq
    mov lenq, 32

    add outq, 16*mmsize
    call .32pt

    add outq, 8*mmsize
    call .32pt

    POP lenq
    sub outq, 24*mmsize

    lea rtabq, [tab_128_float]
    lea itabq, [tab_128_float + 128 - 4*7]

    cmp tgtq, 128
    je .deinterleave

    SPLIT_RADIX_LOAD_COMBINE_FULL 2*128, 6*128

    cmp lenq, 128
    jg .256pt
    ret

; 256-point transform ==========================================================
ALIGN 16
.256pt:
    PUSH lenq
    mov lenq, 64

    add outq, 32*mmsize
    call .32pt

    add outq, 16*mmsize
    call .32pt

    POP lenq
    sub outq, 48*mmsize

    lea rtabq, [tab_256_float]
    lea itabq, [tab_256_float + 256 - 4*7]

    cmp tgtq, 256
    je .deinterleave

    SPLIT_RADIX_LOAD_COMBINE_FULL 2*256, 6*256
    SPLIT_RADIX_LOAD_COMBINE_FULL 2*256, 6*256, 8*mmsize, 4*mmsize, -4*mmsize

    cmp lenq, 256
    jg .512pt
    ret

; 512-point transform ==========================================================
ALIGN 16
.512pt:
    PUSH lenq
    mov lenq, 128

    add outq, 64*mmsize
    call .32pt

    add outq, 32*mmsize
    call .32pt

    POP lenq
    sub outq, 96*mmsize

    lea rtabq, [tab_512_float]
    lea itabq, [tab_512_float + 512 - 4*7]

    cmp tgtq, 512
    je .deinterleave

    mov tmpq, 4

.synth_512:
    SPLIT_RADIX_LOAD_COMBINE_FULL 2*512, 6*512
    add outq, 8*mmsize
    add rtabq, 4*mmsize
    sub itabq, 4*mmsize
    sub tmpq, 1
    jg .synth_512

    cmp lenq, 512
    jg .1024pt
    ret

; 1024-point transform =========================================================
ALIGN 16
.1024pt:
    PUSH lenq
    mov lenq, 256

    add outq, 96*mmsize
    call .32pt

    add outq, 64*mmsize
    call .32pt

    POP lenq
    sub outq, 192*mmsize

    lea rtabq, [tab_1024_float]
    lea itabq, [tab_1024_float + 1024 - 4*7]

    cmp tgtq, 1024
    je .deinterleave

    mov tmpq, 8

.synth_1024:
    SPLIT_RADIX_LOAD_COMBINE_FULL 2*1024, 6*1024
    add outq, 8*mmsize
    add rtabq, 4*mmsize
    sub itabq, 4*mmsize
    sub tmpq, 1
    jg .synth_1024

    cmp lenq, 1024
    jg .2048pt
    ret

; 2048 to 131072-point transforms ==============================================
FFT_SPLIT_RADIX_DEF 2048,   .4096pt
FFT_SPLIT_RADIX_DEF 4096,   .8192pt
FFT_SPLIT_RADIX_DEF 8192,   .16384pt
FFT_SPLIT_RADIX_DEF 16384,  .32768pt
FFT_SPLIT_RADIX_DEF 32768,  .65536pt
FFT_SPLIT_RADIX_DEF 65536,  .131072pt
FFT_SPLIT_RADIX_DEF 131072

;===============================================================================
; Final synthesis + deinterleaving code
;===============================================================================
.deinterleave:
    cmp lenq, 64
    je .64pt_deint

    imul tmpq, lenq, 2
    lea lutq, [4*lenq + tmpq]

.synth_deinterleave:
    SPLIT_RADIX_COMBINE_DEINTERLEAVE_FULL tmpq, lutq
    add outq, 8*mmsize
    add rtabq, 4*mmsize
    sub itabq, 4*mmsize
    sub lenq, 4*mmsize
    jg .synth_deinterleave

    RET

; 64-point deinterleave which only has to load 4 registers =====================
.64pt_deint:
    SPLIT_RADIX_COMBINE_LITE 1, m0, m1, tx1_e0, tx2_e0, tw_e, tw_o, tmp1, tmp2
    SPLIT_RADIX_COMBINE_HALF 0, m2, m3, tx1_o0, tx2_o0, tw_e, tw_o, tmp1, tmp2, tw_e

    unpcklpd tmp1, m0, m2
    unpcklpd tmp2, m1, m3
    unpcklpd tw_o, tx1_e0, tx1_o0
    unpcklpd tw_e, tx2_e0, tx2_o0
    unpckhpd m0, m0, m2
    unpckhpd m1, m1, m3
    unpckhpd tx1_e0, tx1_e0, tx1_o0
    unpckhpd tx2_e0, tx2_e0, tx2_o0

    vextractf128 [outq +  0*mmsize +  0], tmp1, 0
    vextractf128 [outq +  0*mmsize + 16], m0, 0
    vextractf128 [outq +  4*mmsize +  0], tmp2, 0
    vextractf128 [outq +  4*mmsize + 16], m1, 0

    vextractf128 [outq +  8*mmsize +  0], tw_o, 0
    vextractf128 [outq +  8*mmsize + 16], tx1_e0, 0
    vextractf128 [outq +  9*mmsize +  0], tw_o, 1
    vextractf128 [outq +  9*mmsize + 16], tx1_e0, 1

    vperm2f128 tmp1, tmp1, m0, 0x31
    vperm2f128 tmp2, tmp2, m1, 0x31

    vextractf128 [outq + 12*mmsize +  0], tw_e, 0
    vextractf128 [outq + 12*mmsize + 16], tx2_e0, 0
    vextractf128 [outq + 13*mmsize +  0], tw_e, 1
    vextractf128 [outq + 13*mmsize + 16], tx2_e0, 1

    movaps tw_e, [tab_64_float + mmsize]
    vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7 - mmsize], 0x23

    movaps m0, [outq + 1*mmsize]
    movaps m1, [outq + 3*mmsize]
    movaps m2, [outq + 5*mmsize]
    movaps m3, [outq + 7*mmsize]

    movaps [outq + 1*mmsize], tmp1
    movaps [outq + 5*mmsize], tmp2

    SPLIT_RADIX_COMBINE 0, m0, m2, m1, m3, tx1_e1, tx2_e1, tx1_o1, tx2_o1, tw_e, tw_o, \
                        tmp1, tmp2, tx2_o0, tx1_o0, tx2_e0, tx1_e0 ; temporary registers

    unpcklpd tmp1, m0, m1
    unpcklpd tmp2, m2, m3
    unpcklpd tw_e, tx1_e1, tx1_o1
    unpcklpd tw_o, tx2_e1, tx2_o1
    unpckhpd m0, m0, m1
    unpckhpd m2, m2, m3
    unpckhpd tx1_e1, tx1_e1, tx1_o1
    unpckhpd tx2_e1, tx2_e1, tx2_o1

    vextractf128 [outq +  2*mmsize +  0], tmp1, 0
    vextractf128 [outq +  2*mmsize + 16], m0, 0
    vextractf128 [outq +  3*mmsize +  0], tmp1, 1
    vextractf128 [outq +  3*mmsize + 16], m0, 1

    vextractf128 [outq +  6*mmsize +  0], tmp2, 0
    vextractf128 [outq +  6*mmsize + 16], m2, 0
    vextractf128 [outq +  7*mmsize +  0], tmp2, 1
    vextractf128 [outq +  7*mmsize + 16], m2, 1

    vextractf128 [outq + 10*mmsize +  0], tw_e, 0
    vextractf128 [outq + 10*mmsize + 16], tx1_e1, 0
    vextractf128 [outq + 11*mmsize +  0], tw_e, 1
    vextractf128 [outq + 11*mmsize + 16], tx1_e1, 1

    vextractf128 [outq + 14*mmsize +  0], tw_o, 0
    vextractf128 [outq + 14*mmsize + 16], tx2_e1, 0
    vextractf128 [outq + 15*mmsize +  0], tw_o, 1
    vextractf128 [outq + 15*mmsize + 16], tx2_e1, 1

    RET
%endmacro

%if ARCH_X86_64
FFT_SPLIT_RADIX_FN fma3, float,    0
FFT_SPLIT_RADIX_FN fma3, ns_float, 1
%if HAVE_AVX2_EXTERNAL
FFT_SPLIT_RADIX_FN avx2, float,    0
FFT_SPLIT_RADIX_FN avx2, ns_float, 1
%endif
%endif