1;******************************************************************************
2;* Copyright (c) Lynne
3;*
4;* This file is part of FFmpeg.
5;*
6;* FFmpeg is free software; you can redistribute it and/or
7;* modify it under the terms of the GNU Lesser General Public
8;* License as published by the Free Software Foundation; either
9;* version 2.1 of the License, or (at your option) any later version.
10;*
11;* FFmpeg is distributed in the hope that it will be useful,
12;* but WITHOUT ANY WARRANTY; without even the implied warranty of
13;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14;* Lesser General Public License for more details.
15;*
16;* You should have received a copy of the GNU Lesser General Public
17;* License along with FFmpeg; if not, write to the Free Software
18;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19;******************************************************************************
20
; Open `doc/transforms.md` to see the code on which the transforms here are
; based, and to compare.
23
24; TODO:
25;       carry over registers from smaller transforms to save on ~8 loads/stores
;       check if vinsertf could be faster than vperm2f128 for duplication
27;       even faster FFT8 (current one is very #instructions optimized)
28;       replace some xors with blends + addsubs?
29;       replace some shuffles with vblends?
30;       avx512 split-radix
31
%include "libavutil/x86/x86util.asm"

%define private_prefix ff_tx

; `ptr` reserves one pointer-sized slot inside a struc (8 bytes on x86-64,
; 4 bytes on x86-32) so the struct below matches the C pointer width.
%if ARCH_X86_64
%define ptr resq
%else
%define ptr resd
%endif

; Declare the external twiddle tables, doubling the size 14 times:
; ff_tx_tab_16_float, ff_tx_tab_32_float, ..., ff_tx_tab_131072_float
; (cextern applies private_prefix, so `tab_16_float` -> `ff_tx_tab_16_float`)
%assign i 16
%rep 14
cextern tab_ %+ i %+ _float ; ff_tx_tab_<i>_float
%assign i (i << 1)
%endrep
47
; Assembly-side view of the transform context.
; NOTE(review): assumed to mirror the leading fields of the C-side
; AVTXContext (tx_priv.h) — field order and sizes must stay in sync; verify
; when either side changes.
struc AVTXContext
    .len:          resd 1 ; Length
    .inv:          resd 1 ; Inverse flag
    .map:           ptr 1 ; Lookup table(s)
    .exp:           ptr 1 ; Exponentiation factors
    .tmp:           ptr 1 ; Temporary data

    .sub:           ptr 1 ; Subcontexts
    .fn:            ptr 4 ; Subcontext functions
    .nb_sub:       resd 1 ; Subcontext count

    ; Everything else is inaccessible
endstruc
61
SECTION_RODATA 32

; Sign-bit constants for building xorps masks: XORing a float's top bit with
; NEG negates it, XORing with POS leaves it unchanged.
%define POS 0x00000000
%define NEG 0x80000000

%define M_SQRT1_2 0.707106781186547524401
%define COS16_1   0.92387950420379638671875
%define COS16_3   0.3826834261417388916015625

; Multiplier for the odd half of the dual 8-point transform (FFT8)
d8_mult_odd:   dd M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, \
                  M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2

; Multiplier/permute tables for the single 8-point transform (FFT8_AVX)
s8_mult_odd:   dd 1.0, 1.0, -1.0, 1.0, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
s8_perm_even:  dd 1, 3, 0, 2, 1, 3, 2, 0
s8_perm_odd1:  dd 3, 3, 1, 1, 1, 1, 3, 3
s8_perm_odd2:  dd 1, 2, 0, 3, 1, 0, 0, 1

; Multiplier/permute tables for the 16-point transform (FFT16)
s16_mult_even: dd 1.0, 1.0, M_SQRT1_2, M_SQRT1_2, 1.0, -1.0, M_SQRT1_2, -M_SQRT1_2
s16_mult_odd1: dd COS16_1,  COS16_1,  COS16_3,  COS16_3,  COS16_1, -COS16_1,  COS16_3, -COS16_3
s16_mult_odd2: dd COS16_3, -COS16_3,  COS16_1, -COS16_1, -COS16_3, -COS16_3, -COS16_1, -COS16_1
s16_perm:      dd 0, 1, 2, 3, 1, 0, 3, 2

; Sign-flip masks for xorps, named after their per-lane pattern (p = +, m = -)
mask_mmmmpppm: dd NEG, NEG, NEG, NEG, POS, POS, POS, NEG
mask_ppmpmmpm: dd POS, POS, NEG, POS, NEG, NEG, POS, NEG
mask_mppmmpmp: dd NEG, POS, POS, NEG, NEG, POS, NEG, POS
mask_mpmppmpm: dd NEG, POS, NEG, POS, POS, NEG, POS, NEG
mask_pmmppmmp: dd POS, NEG, NEG, POS, POS, NEG, NEG, POS
mask_pmpmpmpm: times 4 dd POS, NEG
90
91SECTION .text
92
; Load complex values (64 bits each) via an int32 lookup table
; %1 - output register
; %2 - GPR of base input memory address
; %3 - GPR of LUT (int32_t indices) address
; %4 - LUT offset
; %5 - temporary GPR (only used if vgather is not used)
; %6 - temporary register (for avx only)
; %7 - temporary register (for avx only, enables vgatherdpd (AVX2) if FMA3 is set)
%macro LOAD64_LUT 5-7
%if %0 > 6 && cpuflag(avx2)
    ; all-ones mask = gather every element
    pcmpeqd %6, %6 ; pcmpeqq has a 0.5 throughput on Zen 3, this has 0.25
    movapd xmm%7, [%3 + %4] ; float mov since vgatherdpd is a float instruction
    vgatherdpd %1, [%2 + xmm%7*8], %6 ; must use separate registers for args
%else
    ; Scalar fallback: fetch each 32-bit index, then load the corresponding
    ; 64-bit (re,im) pair; low half via movsd, high half via movhps
    mov      %5d, [%3 + %4 + 0]
    movsd  xmm%1, [%2 + %5q*8]
%if mmsize == 32
    mov      %5d, [%3 + %4 + 8]
    movsd  xmm%6, [%2 + %5q*8]
%endif
    mov      %5d, [%3 + %4 + 4]
    movhps xmm%1, [%2 + %5q*8]
%if mmsize == 32
    mov      %5d, [%3 + %4 + 12]
    movhps xmm%6, [%2 + %5q*8]
    vinsertf128 %1, %1, xmm%6, 1 ; combine the two 128-bit halves
%endif
%endif
%endmacro
122
; Single 2-point in-place complex FFT (will do 2 transforms at once in AVX mode)
; %1 - coefficients (r0.reim, r1.reim)
; %2 - temporary
; Result: %1 = { (r0+r1).reim, (r0-r1).reim }
%macro FFT2 2
    shufps   %2, %1, %1, q3322  ; r1.re, r1.re, r1.im, r1.im

    shufps   %1, %1, %1, q1100  ; r0.re, r0.re, r0.im, r0.im

    addsubps %1, %1, %2         ; r0.re-r1.re, r0.re+r1.re, r0.im-r1.im, r0.im+r1.im

    shufps   %1, %1, %1, q2031  ; sums in the low half, differences in the high half
%endmacro
134
; Single 4-point in-place complex FFT (will do 2 transforms at once in [AVX] mode)
; %1 - even coefficients (r0.reim, r2.reim, r4.reim, r6.reim)
; %2 - odd coefficients  (r1.reim, r3.reim, r5.reim, r7.reim)
; %3 - temporary
; On return %1 holds the even-indexed and %2 the odd-indexed outputs;
; callers deinterleave them with unpcklpd/unpckhpd.
%macro FFT4 3
    subps  %3, %1, %2         ;  r1234, [r5678]
    addps  %1, %1, %2         ;  t1234, [t5678]

    shufps %2, %1, %3, q1010  ;  t12, r12
    shufps %1, %1, %3, q2332  ;  t34, r43

    subps  %3, %2, %1         ;  a34, b32
    addps  %2, %2, %1         ;  a12, b14

    shufps %1, %2, %3, q1010  ;  a1234     even

    shufps %2, %2, %3, q2332  ;  b1423
    shufps %2, %2, %2, q1320  ;  b1234     odd
%endmacro
154
; Single/Dual 8-point in-place complex FFT (will do 2 transforms in [AVX] mode)
; %1 - even coefficients (a0.reim, a2.reim, [b0.reim, b2.reim])
; %2 - even coefficients (a4.reim, a6.reim, [b4.reim, b6.reim])
; %3 - odd coefficients  (a1.reim, a3.reim, [b1.reim, b3.reim])
; %4 - odd coefficients  (a5.reim, a7.reim, [b5.reim, b7.reim])
; %5 - temporary
; %6 - temporary
; All four coefficient registers are rewritten with the results (see the
; even/odd markers below); intermediate value names (q, k, r, j, s, g, z,
; l, t, ...) follow the derivation in doc/transforms.md.
%macro FFT8 6
    addps    %5, %1, %3               ; q1-8
    addps    %6, %2, %4               ; k1-8

    subps    %1, %1, %3               ; r1-8
    subps    %2, %2, %4               ; j1-8

    shufps   %4, %1, %1, q2323        ; r4343
    shufps   %3, %5, %6, q3032        ; q34, k14

    shufps   %1, %1, %1, q1010        ; r1212
    shufps   %5, %5, %6, q1210        ; q12, k32

    xorps    %4, %4, [mask_pmmppmmp]  ; r4343 * pmmp
    addps    %6, %5, %3               ; s12, g12

    mulps    %2, %2, [d8_mult_odd]    ; r8 * d8_mult_odd
    subps    %5, %5, %3               ; s34, g43

    addps    %3, %1, %4               ; z1234
    unpcklpd %1, %6, %5               ; s1234

    shufps   %4, %2, %2, q2301        ; j2143
    shufps   %6, %6, %5, q2332        ; g1234

    addsubps %2, %2, %4               ; l2143
    shufps   %5, %2, %2, q0123        ; l3412
    addsubps %5, %5, %2               ; t1234

    subps    %2, %1, %6               ; h1234 even
    subps    %4, %3, %5               ; u1234 odd

    addps    %1, %1, %6               ; w1234 even
    addps    %3, %3, %5               ; o1234 odd
%endmacro
197
; Single 8-point in-place complex FFT in 20 instructions
; %1 - even coefficients (r0.reim, r2.reim, r4.reim, r6.reim)
; %2 - odd coefficients  (r1.reim, r3.reim, r5.reim, r7.reim)
; %3 - temporary
; %4 - temporary
; A trailing '!' in a comment marks a value whose element order is permuted
; relative to the natural order (compensated for by a later shuffle/permute).
; Outputs: %1 = even results (w), %2 = odd results (u).
%macro FFT8_AVX 4
    subps      %3, %1, %2               ;  r1234, r5678
    addps      %1, %1, %2               ;  q1234, q5678

    vpermilps  %2, %3, [s8_perm_odd1]   ;  r4422, r6688
    shufps     %4, %1, %1, q3322        ;  q1122, q5566

    movsldup   %3, %3                   ;  r1133, r5577
    shufps     %1, %1, %1, q1100        ;  q3344, q7788

    addsubps   %3, %3, %2               ;  z1234, z5678
    addsubps   %1, %1, %4               ;  s3142, s7586

    mulps      %3, %3, [s8_mult_odd]    ;  z * s8_mult_odd
    vpermilps  %1, %1, [s8_perm_even]   ;  s1234, s5687 !

    shufps     %2, %3, %3, q2332        ;   junk, z7887
    xorps      %4, %1, [mask_mmmmpppm]  ;  e1234, e5687 !

    vpermilps  %3, %3, [s8_perm_odd2]   ;  z2314, z6556
    vperm2f128 %1, %1, %4, 0x03         ;  e5687, s1234

    addsubps   %2, %2, %3               ;   junk, t5678
    subps      %1, %1, %4               ;  w1234, w5678 even

    vperm2f128 %2, %2, %2, 0x11         ;  t5678, t5678
    vperm2f128 %3, %3, %3, 0x00         ;  z2314, z2314

    xorps      %2, %2, [mask_ppmpmmpm]  ;  t * ppmpmmpm
    addps      %2, %3, %2               ;  u1234, u5678 odd
%endmacro
234
; Single 16-point in-place complex FFT
; %1 - even coefficients (r0.reim, r2.reim,  r4.reim,  r6.reim)
; %2 - even coefficients (r8.reim, r10.reim, r12.reim, r14.reim)
; %3 - odd coefficients  (r1.reim, r3.reim,  r5.reim,  r7.reim)
; %4 - odd coefficients  (r9.reim, r11.reim, r13.reim, r15.reim)
; %5, %6 - temporary
; %7, %8 - temporary (optional; when present they cache the sign mask and/or
;          the s16_perm constant in registers, saving reloads in hot loops)
%macro FFT16 6-8
    FFT4       %3, %4, %5
%if %0 > 7
    ; Both spare registers given: keep mask and perm in registers
    FFT8_AVX   %1, %2, %6, %7
    movaps     %8, [mask_mpmppmpm]
    movaps     %7, [s16_perm]
%define mask %8
%define perm %7
%elif %0 > 6
    ; One spare register: cache only the perm constant
    FFT8_AVX   %1, %2, %6, %7
    movaps     %7, [s16_perm]
%define mask [mask_mpmppmpm]
%define perm %7
%else
    ; No spare registers: use memory operands for both constants
    FFT8_AVX   %1, %2, %6, %5
%define mask [mask_mpmppmpm]
%define perm [s16_perm]
%endif
    xorps      %5, %5, %5                   ; 0

    shufps     %6, %4, %4, q2301            ; z12.imre, z13.imre...
    shufps     %5, %5, %3, q2301            ; 0, 0, z8.imre...

    mulps      %4, %4, [s16_mult_odd1]      ; z.reim * costab
    xorps      %5, %5, [mask_mppmmpmp]
%if cpuflag(fma3)
    fmaddps    %6, %6, [s16_mult_odd2], %4  ; s[8..15]
    addps      %5, %3, %5                   ; s[0...7]
%else
    mulps      %6, %6, [s16_mult_odd2]      ; z.imre * costab

    addps      %5, %3, %5                   ; s[0...7]
    addps      %6, %4, %6                   ; s[8..15]
%endif
    mulps      %5, %5, [s16_mult_even]      ; s[0...7]*costab

    xorps      %4, %6, mask                 ; s[8..15]*mpmppmpm
    xorps      %3, %5, mask                 ; s[0...7]*mpmppmpm

    vperm2f128 %4, %4, %4, 0x01             ; s[12..15, 8..11]
    vperm2f128 %3, %3, %3, 0x01             ; s[4..7, 0..3]

    addps      %6, %6, %4                   ; y56, u56, y34, u34
    addps      %5, %5, %3                   ; w56, x56, w34, x34

    vpermilps  %6, %6, perm                 ; y56, u56, y43, u43
    vpermilps  %5, %5, perm                 ; w56, x56, w43, x43

    subps      %4, %2, %6                   ; odd  part 2
    addps      %3, %2, %6                   ; odd  part 1

    subps      %2, %1, %5                   ; even part 2
    addps      %1, %1, %5                   ; even part 1
%undef mask
%undef perm
%endmacro
298
; Combines m0...m8 (tx1[even, even, odd, odd], tx2,3[even], tx2,3[odd]) coeffs
; Uses all 16 registers.
; Output is slightly permuted such that tx2,3's coefficients are interleaved
; on a 2-point basis (look at `doc/transforms.md`)
; %1      - 1 to first deinterleave the tx2/tx3 halves across the 256-bit
;           registers (only relevant when mmsize == 32), 0 otherwise
; %2-%5   - tx1 even/odd coefficients (in/out)
; %6-%9   - tx2,3 even/odd coefficients (in/out)
; %10     - cos twiddles (clobbered via movshdup)
; %11     - wim twiddles (clobbered via shufps)
; %12-%17 - temporary registers
%macro SPLIT_RADIX_COMBINE 17
%if %1 && mmsize == 32
    vperm2f128 %14, %6, %7, 0x20     ; m2[0], m2[1], m3[0], m3[1] even
    vperm2f128 %16, %9, %8, 0x20     ; m2[0], m2[1], m3[0], m3[1] odd
    vperm2f128 %15, %6, %7, 0x31     ; m2[2], m2[3], m3[2], m3[3] even
    vperm2f128 %17, %9, %8, 0x31     ; m2[2], m2[3], m3[2], m3[3] odd
%endif

    shufps     %12, %10, %10, q2200  ; cos00224466
    shufps     %13, %11, %11, q1133  ; wim77553311
    movshdup   %10, %10              ; cos11335577
    shufps     %11, %11, %11, q0022  ; wim66442200

%if %1 && mmsize == 32
    shufps     %6, %14, %14, q2301   ; m2[0].imre, m2[1].imre, m2[2].imre, m2[3].imre even
    shufps     %8, %16, %16, q2301   ; m2[0].imre, m2[1].imre, m2[2].imre, m2[3].imre odd
    shufps     %7, %15, %15, q2301   ; m3[0].imre, m3[1].imre, m3[2].imre, m3[3].imre even
    shufps     %9, %17, %17, q2301   ; m3[0].imre, m3[1].imre, m3[2].imre, m3[3].imre odd

    mulps      %14, %14, %13         ; m2[0123]reim * wim7531 even
    mulps      %16, %16, %11         ; m2[0123]reim * wim7531 odd
    mulps      %15, %15, %13         ; m3[0123]reim * wim7531 even
    mulps      %17, %17, %11         ; m3[0123]reim * wim7531 odd
%else
    mulps      %14, %6, %13          ; m2,3[01]reim * wim7531 even
    mulps      %16, %8, %11          ; m2,3[01]reim * wim7531 odd
    mulps      %15, %7, %13          ; m2,3[23]reim * wim7531 even
    mulps      %17, %9, %11          ; m2,3[23]reim * wim7531 odd
    ; reorder the multiplies to save movs reg, reg in the %if above
    shufps     %6, %6, %6, q2301     ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre even
    shufps     %8, %8, %8, q2301     ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre odd
    shufps     %7, %7, %7, q2301     ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre even
    shufps     %9, %9, %9, q2301     ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre odd
%endif

%if cpuflag(fma3) ; 11 - 5 = 6 instructions saved through FMA!
    fmaddsubps %6, %6, %12, %14      ; w[0..8] even
    fmaddsubps %8, %8, %10, %16      ; w[0..8] odd
    fmsubaddps %7, %7, %12, %15      ; j[0..8] even
    fmsubaddps %9, %9, %10, %17      ; j[0..8] odd
    movaps     %13, [mask_pmpmpmpm]  ; "subaddps? pfft, who needs that!"
%else
    mulps      %6, %6, %12           ; m2,3[01]imre * cos0246
    mulps      %8, %8, %10           ; m2,3[01]imre * cos0246
    movaps     %13, [mask_pmpmpmpm]  ; "subaddps? pfft, who needs that!"
    mulps      %7, %7, %12           ; m2,3[23]reim * cos0246
    mulps      %9, %9, %10           ; m2,3[23]reim * cos0246
    addsubps   %6, %6, %14           ; w[0..8]
    addsubps   %8, %8, %16           ; w[0..8]
    xorps      %15, %15, %13         ; +-m2,3[23]imre * wim7531
    xorps      %17, %17, %13         ; +-m2,3[23]imre * wim7531
    addps      %7, %7, %15           ; j[0..8]
    addps      %9, %9, %17           ; j[0..8]
%endif

    addps      %14, %6, %7           ; t10235476 even
    addps      %16, %8, %9           ; t10235476 odd
    subps      %15, %6, %7           ; +-r[0..7] even
    subps      %17, %8, %9           ; +-r[0..7] odd

    shufps     %14, %14, %14, q2301  ; t[0..7] even
    shufps     %16, %16, %16, q2301  ; t[0..7] odd
    xorps      %15, %15, %13         ; r[0..7] even
    xorps      %17, %17, %13         ; r[0..7] odd

    subps      %6, %2, %14           ; m2,3[01] even
    subps      %8, %4, %16           ; m2,3[01] odd
    subps      %7, %3, %15           ; m2,3[23] even
    subps      %9, %5, %17           ; m2,3[23] odd

    addps      %2, %2, %14           ; m0 even
    addps      %4, %4, %16           ; m0 odd
    addps      %3, %3, %15           ; m1 even
    addps      %5, %5, %17           ; m1 odd
%endmacro
378
; Same as above, only does one parity at a time, takes 3 temporary registers,
; however, if the twiddles aren't needed after this, the registers they use
; can be used as any of the temporary registers.
; %1     - parity: 1 = even half (cos0246/wim7531 lanes), 0 = odd half
; %2, %3 - m0/m1 accumulators (in/out)
; %4, %5 - coefficient registers (in/out; become the m2,3 outputs)
; %6     - cos twiddles, %7 - wim twiddles (both left untouched)
; %8-%10 - temporary registers
%macro SPLIT_RADIX_COMBINE_HALF 10
%if %1
    shufps     %8, %6, %6, q2200     ; cos00224466
    shufps     %9, %7, %7, q1133     ; wim77553311
%else
    shufps     %8, %6, %6, q3311     ; cos11335577
    shufps     %9, %7, %7, q0022     ; wim66442200
%endif

    mulps      %10, %4, %9           ; m2,3[01]reim * wim7531 even
    mulps      %9, %9, %5            ; m2,3[23]reim * wim7531 even

    shufps     %4, %4, %4, q2301     ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre even
    shufps     %5, %5, %5, q2301     ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre even

%if cpuflag(fma3)
    fmaddsubps %4, %4, %8, %10       ; w[0..8] even
    fmsubaddps %5, %5, %8, %9        ; j[0..8] even
    movaps     %10, [mask_pmpmpmpm]
%else
    mulps      %4, %4, %8            ; m2,3[01]imre * cos0246
    mulps      %5, %5, %8            ; m2,3[23]reim * cos0246
    addsubps   %4, %4, %10           ; w[0..8]
    movaps     %10, [mask_pmpmpmpm]
    xorps      %9, %9, %10           ; +-m2,3[23]imre * wim7531
    addps      %5, %5, %9            ; j[0..8]
%endif

    addps      %8, %4, %5            ; t10235476
    subps      %9, %4, %5            ; +-r[0..7]

    shufps     %8, %8, %8, q2301     ; t[0..7]
    xorps      %9, %9, %10           ; r[0..7]

    subps      %4, %2, %8            ; m2,3[01]
    subps      %5, %3, %9            ; m2,3[23]

    addps      %2, %2, %8            ; m0
    addps      %3, %3, %9            ; m1
%endmacro
422
; Same as above, tries REALLY hard to use 2 temporary registers.
; %1     - parity: 1 = even half (cos0246/wim7531 lanes), 0 = odd half
; %2, %3 - m0/m1 accumulators (in/out)
; %4, %5 - coefficient registers (in/out; become the m2,3 outputs)
; %6     - cos twiddles, %7 - wim twiddles (both left untouched)
; %8, %9 - temporary registers
%macro SPLIT_RADIX_COMBINE_LITE 9
%if %1
    shufps     %8, %6, %6, q2200        ; cos00224466
    shufps     %9, %7, %7, q1133        ; wim77553311
%else
    shufps     %8, %6, %6, q3311        ; cos11335577
    shufps     %9, %7, %7, q0022        ; wim66442200
%endif

    mulps      %9, %9, %4               ; m2,3[01]reim * wim7531 even
    shufps     %4, %4, %4, q2301        ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre even

%if cpuflag(fma3)
    fmaddsubps %4, %4, %8, %9           ; w[0..8] even
%else
    mulps      %4, %4, %8               ; m2,3[01]imre * cos0246
    addsubps   %4, %4, %9               ; w[0..8]
%endif

; %9 was consumed above, so the wim lanes must be re-shuffled for the second
; half — the price of using only 2 temporaries
%if %1
    shufps     %9, %7, %7, q1133        ; wim77553311
%else
    shufps     %9, %7, %7, q0022        ; wim66442200
%endif

    mulps      %9, %9, %5               ; m2,3[23]reim * wim7531 even
    shufps     %5, %5, %5, q2301        ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre even
%if cpuflag(fma3)
    fmsubaddps %5, %5, %8, %9           ; j[0..8] even
%else
    mulps      %5, %5, %8               ; m2,3[23]reim * cos0246
    xorps      %9, %9, [mask_pmpmpmpm]  ; +-m2,3[23]imre * wim7531
    addps      %5, %5, %9               ; j[0..8]
%endif

    addps      %8, %4, %5               ; t10235476
    subps      %9, %4, %5               ; +-r[0..7]

    shufps     %8, %8, %8, q2301        ; t[0..7]
    xorps      %9, %9, [mask_pmpmpmpm]  ; r[0..7]

    subps      %4, %2, %8               ; m2,3[01]
    subps      %5, %3, %9               ; m2,3[23]

    addps      %2, %2, %8               ; m0
    addps      %3, %3, %9               ; m1
%endmacro
471
; Combine step for the fixed 64-point transform: merges the sub-transform
; results already held in registers with the first/second quarter of the
; output, reloading the middle coefficients from memory.
; NOTE(review): assumes outq plus the tx1_*/tx2_*/tw_e/tw_o/tmp1/tmp2
; register names have been defined by the enclosing function
; (FFT_SPLIT_RADIX_FN) - verify against its named-register setup.
%macro SPLIT_RADIX_COMBINE_64 0
    SPLIT_RADIX_COMBINE_LITE 1, m0, m1, tx1_e0, tx2_e0, tw_e, tw_o, tmp1, tmp2

    movaps [outq +  0*mmsize], m0
    movaps [outq +  4*mmsize], m1
    movaps [outq +  8*mmsize], tx1_e0
    movaps [outq + 12*mmsize], tx2_e0

    SPLIT_RADIX_COMBINE_HALF 0, m2, m3, tx1_o0, tx2_o0, tw_e, tw_o, tmp1, tmp2, m0

    movaps [outq +  2*mmsize], m2
    movaps [outq +  6*mmsize], m3
    movaps [outq + 10*mmsize], tx1_o0
    movaps [outq + 14*mmsize], tx2_o0

    ; Advance to the next set of twiddles (itab half read backwards,
    ; 128-bit halves swapped by the 0x23 permute)
    movaps tw_e,           [tab_64_float + mmsize]
    vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7 - mmsize], 0x23

    ; Reload the odd-position coefficients and combine them as well
    movaps m0, [outq +  1*mmsize]
    movaps m1, [outq +  3*mmsize]
    movaps m2, [outq +  5*mmsize]
    movaps m3, [outq +  7*mmsize]

    SPLIT_RADIX_COMBINE 0, m0, m2, m1, m3, tx1_e1, tx2_e1, tx1_o1, tx2_o1, tw_e, tw_o, \
                           tmp1, tmp2, tx2_o0, tx1_o0, tx2_e0, tx1_e0 ; temporary registers

    movaps [outq +  1*mmsize], m0
    movaps [outq +  3*mmsize], m1
    movaps [outq +  5*mmsize], m2
    movaps [outq +  7*mmsize], m3

    movaps [outq +  9*mmsize], tx1_e1
    movaps [outq + 11*mmsize], tx1_o1
    movaps [outq + 13*mmsize], tx2_e1
    movaps [outq + 15*mmsize], tx2_o1
%endmacro
508
; Perform a single even/odd split radix combination with loads and stores
; The _4 indicates this is a quarter of the iterations required to complete a full
; combine loop
; %1 must contain len*2, %2 must contain len*4, %3 must contain len*6
; %4 - output block index, %5 - twiddle table index
; %6/%7/%8 - additional byte offsets for out/rtab/itab respectively
; Clobbers m0-m15.
%macro SPLIT_RADIX_LOAD_COMBINE_4 8
    movaps m8,         [rtabq + (%5)*mmsize + %7] ; cos twiddles
    vperm2f128 m9, m9, [itabq - (%5)*mmsize + %8], 0x23 ; wim twiddles, 128-bit halves swapped

    movaps m0, [outq +      (0 + %4)*mmsize + %6]
    movaps m2, [outq +      (2 + %4)*mmsize + %6]
    movaps m1, [outq + %1 + (0 + %4)*mmsize + %6]
    movaps m3, [outq + %1 + (2 + %4)*mmsize + %6]

    movaps m4, [outq + %2 + (0 + %4)*mmsize + %6]
    movaps m6, [outq + %2 + (2 + %4)*mmsize + %6]
    movaps m5, [outq + %3 + (0 + %4)*mmsize + %6]
    movaps m7, [outq + %3 + (2 + %4)*mmsize + %6]

    SPLIT_RADIX_COMBINE 0, m0, m1, m2, m3, \
                           m4, m5, m6, m7, \
                           m8, m9, \
                           m10, m11, m12, m13, m14, m15

    ; Store the combined results back to the same locations
    movaps [outq +      (0 + %4)*mmsize + %6], m0
    movaps [outq +      (2 + %4)*mmsize + %6], m2
    movaps [outq + %1 + (0 + %4)*mmsize + %6], m1
    movaps [outq + %1 + (2 + %4)*mmsize + %6], m3

    movaps [outq + %2 + (0 + %4)*mmsize + %6], m4
    movaps [outq + %2 + (2 + %4)*mmsize + %6], m6
    movaps [outq + %3 + (0 + %4)*mmsize + %6], m5
    movaps [outq + %3 + (2 + %4)*mmsize + %6], m7
%endmacro
542
; Runs all four quarter-iterations of the load+combine loop.
; %1 must contain len*2, %2 must contain len*6
; %3/%4/%5 - optional extra byte offsets for out/rtab/itab (default 0),
;            supplied via NASM default macro parameters
%macro SPLIT_RADIX_LOAD_COMBINE_FULL 2-5 0, 0, 0
    SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 0, 0, %3, %4, %5
    SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 1, 1, %3, %4, %5
    SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 4, 2, %3, %4, %5
    SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 5, 3, %3, %4, %5
%endmacro
565
; Perform a single even/odd split radix combination with loads, deinterleaves and
; stores. The _2 indicates this is a half of the iterations required to complete
; a full combine+deinterleave loop
; %1 - output block index, %2 - twiddle table index
; %3 must contain len*2, %4 must contain len*4, %5 must contain len*6
; %6 - additional byte offset
; Clobbers m0-m15.
%macro SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 6
    movaps m8,         [rtabq + (0 + %2)*mmsize] ; cos twiddles
    vperm2f128 m9, m9, [itabq - (0 + %2)*mmsize], 0x23 ; wim twiddles, halves swapped

    movaps m0, [outq +      (0 + 0 + %1)*mmsize + %6]
    movaps m2, [outq +      (2 + 0 + %1)*mmsize + %6]
    movaps m1, [outq + %3 + (0 + 0 + %1)*mmsize + %6]
    movaps m3, [outq + %3 + (2 + 0 + %1)*mmsize + %6]

    movaps m4, [outq + %4 + (0 + 0 + %1)*mmsize + %6]
    movaps m6, [outq + %4 + (2 + 0 + %1)*mmsize + %6]
    movaps m5, [outq + %5 + (0 + 0 + %1)*mmsize + %6]
    movaps m7, [outq + %5 + (2 + 0 + %1)*mmsize + %6]

    SPLIT_RADIX_COMBINE 0, m0, m1, m2, m3, \
       m4, m5, m6, m7, \
       m8, m9, \
       m10, m11, m12, m13, m14, m15

    ; Deinterleave: low 64-bit pairs stay, high pairs go to the next block
    unpckhpd m10, m0, m2
    unpckhpd m11, m1, m3
    unpckhpd m12, m4, m6
    unpckhpd m13, m5, m7
    unpcklpd m0, m0, m2
    unpcklpd m1, m1, m3
    unpcklpd m4, m4, m6
    unpcklpd m5, m5, m7

    vextractf128 [outq +      (0 + 0 + %1)*mmsize + %6 +  0], m0,  0
    vextractf128 [outq +      (0 + 0 + %1)*mmsize + %6 + 16], m10, 0
    vextractf128 [outq + %3 + (0 + 0 + %1)*mmsize + %6 +  0], m1,  0
    vextractf128 [outq + %3 + (0 + 0 + %1)*mmsize + %6 + 16], m11, 0

    vextractf128 [outq + %4 + (0 + 0 + %1)*mmsize + %6 +  0], m4,  0
    vextractf128 [outq + %4 + (0 + 0 + %1)*mmsize + %6 + 16], m12, 0
    vextractf128 [outq + %5 + (0 + 0 + %1)*mmsize + %6 +  0], m5,  0
    vextractf128 [outq + %5 + (0 + 0 + %1)*mmsize + %6 + 16], m13, 0

    ; Keep the upper halves around; they belong to the next block's slots,
    ; which still hold unprocessed input at this point
    vperm2f128 m10, m10, m0, 0x13
    vperm2f128 m11, m11, m1, 0x13
    vperm2f128 m12, m12, m4, 0x13
    vperm2f128 m13, m13, m5, 0x13

    movaps m8,         [rtabq + (1 + %2)*mmsize]
    vperm2f128 m9, m9, [itabq - (1 + %2)*mmsize], 0x23

    movaps m0, [outq +      (0 + 1 + %1)*mmsize + %6]
    movaps m2, [outq +      (2 + 1 + %1)*mmsize + %6]
    movaps m1, [outq + %3 + (0 + 1 + %1)*mmsize + %6]
    movaps m3, [outq + %3 + (2 + 1 + %1)*mmsize + %6]

    ; Now safe to flush the held-back halves (their slots have been read)
    movaps [outq +      (0 + 1 + %1)*mmsize + %6], m10 ; m0 conflict
    movaps [outq + %3 + (0 + 1 + %1)*mmsize + %6], m11 ; m1 conflict

    movaps m4, [outq + %4 + (0 + 1 + %1)*mmsize + %6]
    movaps m6, [outq + %4 + (2 + 1 + %1)*mmsize + %6]
    movaps m5, [outq + %5 + (0 + 1 + %1)*mmsize + %6]
    movaps m7, [outq + %5 + (2 + 1 + %1)*mmsize + %6]

    movaps [outq + %4 + (0 + 1 + %1)*mmsize + %6], m12 ; m4 conflict
    movaps [outq + %5 + (0 + 1 + %1)*mmsize + %6], m13 ; m5 conflict

    SPLIT_RADIX_COMBINE 0, m0, m1, m2, m3, \
                           m4, m5, m6, m7, \
                           m8, m9, \
                           m10, m11, m12, m13, m14, m15 ; temporary registers

    unpcklpd m8,  m0, m2
    unpcklpd m9,  m1, m3
    unpcklpd m10, m4, m6
    unpcklpd m11, m5, m7
    unpckhpd m0, m0, m2
    unpckhpd m1, m1, m3
    unpckhpd m4, m4, m6
    unpckhpd m5, m5, m7

    vextractf128 [outq +      (2 + 0 + %1)*mmsize + %6 +  0], m8,  0
    vextractf128 [outq +      (2 + 0 + %1)*mmsize + %6 + 16], m0,  0
    vextractf128 [outq +      (2 + 1 + %1)*mmsize + %6 +  0], m8,  1
    vextractf128 [outq +      (2 + 1 + %1)*mmsize + %6 + 16], m0,  1

    vextractf128 [outq + %3 + (2 + 0 + %1)*mmsize + %6 +  0], m9,  0
    vextractf128 [outq + %3 + (2 + 0 + %1)*mmsize + %6 + 16], m1,  0
    vextractf128 [outq + %3 + (2 + 1 + %1)*mmsize + %6 +  0], m9,  1
    vextractf128 [outq + %3 + (2 + 1 + %1)*mmsize + %6 + 16], m1,  1

    vextractf128 [outq + %4 + (2 + 0 + %1)*mmsize + %6 +  0], m10, 0
    vextractf128 [outq + %4 + (2 + 0 + %1)*mmsize + %6 + 16], m4,  0
    vextractf128 [outq + %4 + (2 + 1 + %1)*mmsize + %6 +  0], m10, 1
    vextractf128 [outq + %4 + (2 + 1 + %1)*mmsize + %6 + 16], m4,  1

    vextractf128 [outq + %5 + (2 + 0 + %1)*mmsize + %6 +  0], m11, 0
    vextractf128 [outq + %5 + (2 + 0 + %1)*mmsize + %6 + 16], m5,  0
    vextractf128 [outq + %5 + (2 + 1 + %1)*mmsize + %6 +  0], m11, 1
    vextractf128 [outq + %5 + (2 + 1 + %1)*mmsize + %6 + 16], m5,  1
%endmacro
666
; Runs both halves of the combine+deinterleave loop.
; %1 must contain len*2, %2 must contain len*6
; %3 - optional extra byte offset (default 0), supplied via a NASM default
;      macro parameter
%macro SPLIT_RADIX_COMBINE_DEINTERLEAVE_FULL 2-3 0
    SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 0, 0, %1, %1*2, %2, %3
    SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 4, 2, %1, %1*2, %2, %3
%endmacro
676
INIT_XMM sse3
; ff_tx_fft2_float(AVTXContext *ctx, void *out, void *in, ptrdiff_t stride)
; Single 2-point transform; ctx and stride are not read.
; NOTE(review): prototype assumed to match av_tx_fn - confirm in tx_priv.h
cglobal fft2_float, 4, 4, 2, ctx, out, in, stride
    movaps m0, [inq]   ; r0.reim, r1.reim
    FFT2 m0, m1
    movaps [outq], m0  ; (r0+r1).reim, (r0-r1).reim
    RET
683
; 4-point transform functions
; %1 - name suffix (fwd/inv)
; %2 - 1 to reorder the input on load (inverse transform)
; Note: NASM overloads macros by argument count, so this 2-argument FFT4
; (function generator) coexists with the 3-argument FFT4 transform macro.
%macro FFT4 2
INIT_XMM sse2
cglobal fft4_ %+ %1 %+ _float, 4, 4, 3, ctx, out, in, stride
    movaps m0, [inq + 0*mmsize]   ; r0, r1
    movaps m1, [inq + 1*mmsize]   ; r2, r3

%if %2
    ; Inverse: reorder the input to r0, r3, r2, r1 (index reversal)
    shufps m2, m1, m0, q3210      ; r2, r1
    shufps m0, m0, m1, q3210      ; r0, r3
    movaps m1, m2
%endif

    FFT4 m0, m1, m2               ; 3-argument form = the transform itself

    ; Interleave the even (m0) and odd (m1) outputs into natural order
    unpcklpd m2, m0, m1
    unpckhpd m0, m0, m1

    movaps [outq + 0*mmsize], m2
    movaps [outq + 1*mmsize], m0

    RET
%endmacro

FFT4 fwd, 0
FFT4 inv, 1
709
; 8-point transform functions (SSE3)
; %1 - name suffix
; %2 - 1 to gather the input through the LUT in ctx->map,
;      0 for direct loads (pre-shuffled "ns" variant)
%macro FFT8_SSE_FN 2
INIT_XMM sse3
cglobal fft8_ %+ %1, 4, 4, 6, ctx, out, in, tmp
%if %2
    mov ctxq, [ctxq + AVTXContext.map]
    LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq
    LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq
    LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq
    LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq
%else
    movaps m0, [inq + 0*mmsize]
    movaps m1, [inq + 1*mmsize]
    movaps m2, [inq + 2*mmsize]
    movaps m3, [inq + 3*mmsize]
%endif

    FFT8 m0, m1, m2, m3, m4, m5

    ; Interleave the even/odd results back into natural output order
    unpcklpd m4, m0, m3
    unpcklpd m5, m1, m2
    unpckhpd m0, m0, m3
    unpckhpd m1, m1, m2

    movups [outq + 0*mmsize], m4
    movups [outq + 1*mmsize], m0
    movups [outq + 2*mmsize], m5
    movups [outq + 3*mmsize], m1

    RET
%endmacro

FFT8_SSE_FN float,    1
FFT8_SSE_FN ns_float, 0
743
; 8-point transform functions (AVX)
; %1 - name suffix
; %2 - 1 to gather the input through the LUT in ctx->map,
;      0 for direct loads (pre-shuffled "ns" variant)
%macro FFT8_AVX_FN 2
INIT_YMM avx
cglobal fft8_ %+ %1, 4, 4, 4, ctx, out, in, tmp
%if %2
    mov ctxq, [ctxq + AVTXContext.map]
    LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m2
    LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m3
%else
    movaps m0, [inq + 0*mmsize]
    movaps m1, [inq + 1*mmsize]
%endif

    FFT8_AVX m0, m1, m2, m3

    ; Interleave the even/odd results back into natural output order
    unpcklpd m2, m0, m1
    unpckhpd m0, m0, m1

    ; Around 2% faster than 2x vperm2f128 + 2x movapd
    vextractf128 [outq + 16*0], m2, 0
    vextractf128 [outq + 16*1], m0, 0
    vextractf128 [outq + 16*2], m2, 1
    vextractf128 [outq + 16*3], m0, 1

    RET
%endmacro

FFT8_AVX_FN float,    1
FFT8_AVX_FN ns_float, 0
772
; 16-point transform functions
; %1 - instruction set (avx/fma3)
; %2 - name suffix
; %3 - 1 for direct loads (pre-shuffled "ns" variant),
;      0 to gather the input through the LUT in ctx->map
%macro FFT16_FN 3
INIT_YMM %1
cglobal fft16_ %+ %2, 4, 4, 8, ctx, out, in, tmp
%if %3
    movaps m0, [inq + 0*mmsize]
    movaps m1, [inq + 1*mmsize]
    movaps m2, [inq + 2*mmsize]
    movaps m3, [inq + 3*mmsize]
%else
    mov ctxq, [ctxq + AVTXContext.map]
    LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m4
    LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m5
    LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq, m6
    LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq, m7
%endif

    FFT16 m0, m1, m2, m3, m4, m5, m6, m7

    ; Interleave the even/odd results back into natural output order
    unpcklpd m5, m1, m3
    unpcklpd m4, m0, m2
    unpckhpd m1, m1, m3
    unpckhpd m0, m0, m2

    vextractf128 [outq + 16*0], m4, 0
    vextractf128 [outq + 16*1], m0, 0
    vextractf128 [outq + 16*2], m4, 1
    vextractf128 [outq + 16*3], m0, 1
    vextractf128 [outq + 16*4], m5, 0
    vextractf128 [outq + 16*5], m1, 0
    vextractf128 [outq + 16*6], m5, 1
    vextractf128 [outq + 16*7], m1, 1

    RET
%endmacro

FFT16_FN avx,  float,    0
FFT16_FN avx,  ns_float, 1
FFT16_FN fma3, float,    0
FFT16_FN fma3, ns_float, 1
812
;------------------------------------------------------------------------------
; 32-point transform:
;   %1 = instruction set (avx/fma3)
;   %2 = function name suffix (float/ns_float)
;   %3 = 1: input is contiguous, load it directly
;        0: gather the input through the ctx->map lookup table
; Decomposes into an 8-point transform on the odd half and a 16-point
; transform on the even half, combined with the 32-point twiddles.
;------------------------------------------------------------------------------
%macro FFT32_FN 3
INIT_YMM %1
cglobal fft32_ %+ %2, 4, 4, 16, ctx, out, in, tmp
%if %3
    movaps m4, [inq + 4*mmsize]
    movaps m5, [inq + 5*mmsize]
    movaps m6, [inq + 6*mmsize]
    movaps m7, [inq + 7*mmsize]
%else
    ; ctxq is not needed afterwards, reuse it as the LUT pointer
    mov ctxq, [ctxq + AVTXContext.map]
    LOAD64_LUT m4, inq, ctxq, (mmsize/2)*4, tmpq,  m8, m12
    LOAD64_LUT m5, inq, ctxq, (mmsize/2)*5, tmpq,  m9, m13
    LOAD64_LUT m6, inq, ctxq, (mmsize/2)*6, tmpq, m10, m14
    LOAD64_LUT m7, inq, ctxq, (mmsize/2)*7, tmpq, m11, m15
%endif

    ; 8-point transform on the odd half
    FFT8 m4, m5, m6, m7, m8, m9

%if %3
    movaps m0, [inq + 0*mmsize]
    movaps m1, [inq + 1*mmsize]
    movaps m2, [inq + 2*mmsize]
    movaps m3, [inq + 3*mmsize]
%else
    LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq,  m8, m12
    LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq,  m9, m13
    LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq, m10, m14
    LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq, m11, m15
%endif

    ; m8 = twiddles from the start of the table; m9 = twiddles from near its
    ; end, with the two 128-bit lanes swapped (vperm2f128 imm 0x23)
    movaps m8,         [tab_32_float]
    vperm2f128 m9, m9, [tab_32_float + 4*8 - 4*7], 0x23

    ; 16-point transform on the even half
    FFT16 m0, m1, m2, m3, m10, m11, m12, m13

    SPLIT_RADIX_COMBINE 1, m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, \
                           m10, m11, m12, m13, m14, m15 ; temporary registers

    ; Deinterleave low/high doubles of each register pair so the output
    ; ends up in natural order
    unpcklpd  m9, m1, m3
    unpcklpd m10, m5, m7
    unpcklpd  m8, m0, m2
    unpcklpd m11, m4, m6
    unpckhpd  m1, m1, m3
    unpckhpd  m5, m5, m7
    unpckhpd  m0, m0, m2
    unpckhpd  m4, m4, m6

    ; Store each 128-bit lane at its final location
    vextractf128 [outq + 16* 0],  m8, 0
    vextractf128 [outq + 16* 1],  m0, 0
    vextractf128 [outq + 16* 2],  m8, 1
    vextractf128 [outq + 16* 3],  m0, 1
    vextractf128 [outq + 16* 4],  m9, 0
    vextractf128 [outq + 16* 5],  m1, 0
    vextractf128 [outq + 16* 6],  m9, 1
    vextractf128 [outq + 16* 7],  m1, 1

    vextractf128 [outq + 16* 8], m11, 0
    vextractf128 [outq + 16* 9],  m4, 0
    vextractf128 [outq + 16*10], m11, 1
    vextractf128 [outq + 16*11],  m4, 1
    vextractf128 [outq + 16*12], m10, 0
    vextractf128 [outq + 16*13],  m5, 0
    vextractf128 [outq + 16*14], m10, 1
    vextractf128 [outq + 16*15],  m5, 1

    RET
%endmacro
880
; The 32-point transforms use all 16 vector registers (see the "16" in the
; cglobal line above), so they are only available on x86-64
%if ARCH_X86_64
FFT32_FN avx,  float,    0
FFT32_FN avx,  ns_float, 1
FFT32_FN fma3, float,    0
FFT32_FN fma3, ns_float, 1
%endif
887
;------------------------------------------------------------------------------
; Emits one recursion step of the split-radix FFT, expanded inside the
; FFT_SPLIT_RADIX_FN function body (used for the 2048..131072-point steps):
;   %1 = transform length handled by this step
;   %2 = (optional) local label of the next larger step to continue with;
;        omitted for the largest supported length, in which case no
;        synthesis loop is emitted
;------------------------------------------------------------------------------
%macro FFT_SPLIT_RADIX_DEF 1-2
ALIGN 16
.%1 %+ pt:
    PUSH lenq
    mov lenq, (%1/4)    ; compute the two %1/4-point odd sub-transforms

    add outq, (%1*4) - (%1/1)
    call .32pt

    add outq, (%1*2) - (%1/2) ; the synth loops also increment outq
    call .32pt

    POP lenq
    sub outq, (%1*4) + (%1*2) + (%1/2) ; undo all of the offsets added above

    lea rtabq, [tab_ %+ %1 %+ _float]            ; forward twiddles
    lea itabq, [tab_ %+ %1 %+ _float + %1 - 4*7] ; reversed twiddles

%if %0 > 1
    cmp tgtq, %1
    je .deinterleave    ; final pass for a %1-point FFT, go deinterleave

    mov tmpq, %1        ; loop counter, decremented by 4*mmsize per iteration

.synth_ %+ %1:
    SPLIT_RADIX_LOAD_COMBINE_FULL 2*%1, 6*%1, 0, 0, 0
    add outq, 8*mmsize
    add rtabq, 4*mmsize
    sub itabq, 4*mmsize
    sub tmpq, 4*mmsize
    jg .synth_ %+ %1

    cmp lenq, %1
    jg %2 ; can't do math here, nasm doesn't get it
    ret
%endif
%endmacro
925
;------------------------------------------------------------------------------
; Full split-radix FFT for lengths of 64 and up:
;   %1 = instruction set (fma3/avx2)
;   %2 = function name suffix (float/ns_float)
;   %3 = 1: input is contiguous, load it directly
;        0: gather the input through the ctx->map lookup table
; Built recursively: a base 32-point block (.32pt) is called back into by the
; larger steps, lengths 64..1024 are hardcoded below, and 2048..131072 are
; generated with FFT_SPLIT_RADIX_DEF. The target length's step jumps to
; .deinterleave for the final combine + reordering pass.
;------------------------------------------------------------------------------
%macro FFT_SPLIT_RADIX_FN 3
INIT_YMM %1
cglobal fft_sr_ %+ %2, 4, 8, 16, 272, lut, out, in, len, tmp, itab, rtab, tgt
    movsxd lenq, dword [lutq + AVTXContext.len] ; lenq = transform length
    mov lutq, [lutq + AVTXContext.map]          ; reuse the ctx reg as LUT ptr
    mov tgtq, lenq                              ; tgtq = full target length

; Bottom-most/32-point transform ===============================================
ALIGN 16
.32pt:
%if %3
    movaps m4, [inq + 4*mmsize]
    movaps m5, [inq + 5*mmsize]
    movaps m6, [inq + 6*mmsize]
    movaps m7, [inq + 7*mmsize]
%else
    LOAD64_LUT m4, inq, lutq, (mmsize/2)*4, tmpq,  m8, m12
    LOAD64_LUT m5, inq, lutq, (mmsize/2)*5, tmpq,  m9, m13
    LOAD64_LUT m6, inq, lutq, (mmsize/2)*6, tmpq, m10, m14
    LOAD64_LUT m7, inq, lutq, (mmsize/2)*7, tmpq, m11, m15
%endif

    ; 8-point transform on the odd half
    FFT8 m4, m5, m6, m7, m8, m9

%if %3
    movaps m0, [inq + 0*mmsize]
    movaps m1, [inq + 1*mmsize]
    movaps m2, [inq + 2*mmsize]
    movaps m3, [inq + 3*mmsize]
%else
    LOAD64_LUT m0, inq, lutq, (mmsize/2)*0, tmpq,  m8, m12
    LOAD64_LUT m1, inq, lutq, (mmsize/2)*1, tmpq,  m9, m13
    LOAD64_LUT m2, inq, lutq, (mmsize/2)*2, tmpq, m10, m14
    LOAD64_LUT m3, inq, lutq, (mmsize/2)*3, tmpq, m11, m15
%endif

    ; m8 = twiddles from the start of the table; m9 = twiddles from near its
    ; end, with the two 128-bit lanes swapped (vperm2f128 imm 0x23)
    movaps m8,         [tab_32_float]
    vperm2f128 m9, m9, [tab_32_float + 32 - 4*7], 0x23

    ; 16-point transform on the even half
    FFT16 m0, m1, m2, m3, m10, m11, m12, m13

    SPLIT_RADIX_COMBINE 1, m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, \
                           m10, m11, m12, m13, m14, m15 ; temporary registers

    ; Store only the odd registers; the even results stay live in registers
    ; for the 64-point step to pick up without reloading
    movaps [outq + 1*mmsize], m1
    movaps [outq + 3*mmsize], m3
    movaps [outq + 5*mmsize], m5
    movaps [outq + 7*mmsize], m7

%if %3
    add inq, 8*mmsize           ; advance input for the next sub-transform
%else
    add lutq, (mmsize/2)*8      ; advance LUT for the next sub-transform
%endif
    cmp lenq, 32
    jg .64pt                    ; part of a larger transform, keep going

    ; This was the whole (sub)transform: flush the even registers as well
    movaps [outq + 0*mmsize], m0
    movaps [outq + 2*mmsize], m2
    movaps [outq + 4*mmsize], m4
    movaps [outq + 6*mmsize], m6

    ret

; 64-point transform ===========================================================
ALIGN 16
.64pt:
; Helper defines, these make it easier to track what's happening
%define tx1_e0 m4
%define tx1_e1 m5
%define tx1_o0 m6
%define tx1_o1 m7
%define tx2_e0 m8
%define tx2_e1 m9
%define tx2_o0 m10
%define tx2_o1 m11
%define tw_e m12
%define tw_o m13
%define tmp1 m14
%define tmp2 m15

    ; Move the still-live 32-point results out of the registers reloaded below
    SWAP m4, m1
    SWAP m6, m3

%if %3
    movaps tx1_e0, [inq + 0*mmsize]
    movaps tx1_e1, [inq + 1*mmsize]
    movaps tx1_o0, [inq + 2*mmsize]
    movaps tx1_o1, [inq + 3*mmsize]
%else
    LOAD64_LUT tx1_e0, inq, lutq, (mmsize/2)*0, tmpq, tw_e, tmp1
    LOAD64_LUT tx1_e1, inq, lutq, (mmsize/2)*1, tmpq, tw_o, tmp2
    LOAD64_LUT tx1_o0, inq, lutq, (mmsize/2)*2, tmpq, tw_e, tmp1
    LOAD64_LUT tx1_o1, inq, lutq, (mmsize/2)*3, tmpq, tw_o, tmp2
%endif

    FFT16 tx1_e0, tx1_e1, tx1_o0, tx1_o1, tw_e, tw_o, tx2_o0, tx2_o1

%if %3
    movaps tx2_e0, [inq + 4*mmsize]
    movaps tx2_e1, [inq + 5*mmsize]
    movaps tx2_o0, [inq + 6*mmsize]
    movaps tx2_o1, [inq + 7*mmsize]
%else
    LOAD64_LUT tx2_e0, inq, lutq, (mmsize/2)*4, tmpq, tw_e, tmp1
    LOAD64_LUT tx2_e1, inq, lutq, (mmsize/2)*5, tmpq, tw_o, tmp2
    LOAD64_LUT tx2_o0, inq, lutq, (mmsize/2)*6, tmpq, tw_e, tmp1
    LOAD64_LUT tx2_o1, inq, lutq, (mmsize/2)*7, tmpq, tw_o, tmp2
%endif

    FFT16 tx2_e0, tx2_e1, tx2_o0, tx2_o1, tmp1, tmp2, tw_e, tw_o

    ; Forward twiddles and lane-swapped reversed twiddles, as in .32pt
    movaps tw_e,           [tab_64_float]
    vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7], 0x23

%if %3
    add inq, 8*mmsize
%else
    add lutq, (mmsize/2)*8
%endif
    cmp tgtq, 64
    je .deinterleave            ; final pass of a 64-point FFT

    SPLIT_RADIX_COMBINE_64

    cmp lenq, 64
    jg .128pt                   ; part of a larger transform, keep going
    ret

; 128-point transform ==========================================================
ALIGN 16
.128pt:
    PUSH lenq
    mov lenq, 32                ; compute the two 32-point odd sub-transforms

    add outq, 16*mmsize
    call .32pt

    add outq, 8*mmsize
    call .32pt

    POP lenq
    sub outq, 24*mmsize         ; undo the offsets added above

    lea rtabq, [tab_128_float]              ; forward twiddles
    lea itabq, [tab_128_float + 128 - 4*7]  ; reversed twiddles

    cmp tgtq, 128
    je .deinterleave

    SPLIT_RADIX_LOAD_COMBINE_FULL 2*128, 6*128

    cmp lenq, 128
    jg .256pt
    ret

; 256-point transform ==========================================================
ALIGN 16
.256pt:
    PUSH lenq
    mov lenq, 64                ; compute the two 64-point odd sub-transforms

    add outq, 32*mmsize
    call .32pt

    add outq, 16*mmsize
    call .32pt

    POP lenq
    sub outq, 48*mmsize

    lea rtabq, [tab_256_float]
    lea itabq, [tab_256_float + 256 - 4*7]

    cmp tgtq, 256
    je .deinterleave

    ; Two synthesis passes, the second one offset into the tables/output
    SPLIT_RADIX_LOAD_COMBINE_FULL 2*256, 6*256
    SPLIT_RADIX_LOAD_COMBINE_FULL 2*256, 6*256, 8*mmsize, 4*mmsize, -4*mmsize

    cmp lenq, 256
    jg .512pt
    ret

; 512-point transform ==========================================================
ALIGN 16
.512pt:
    PUSH lenq
    mov lenq, 128               ; compute the two 128-point odd sub-transforms

    add outq, 64*mmsize
    call .32pt

    add outq, 32*mmsize
    call .32pt

    POP lenq
    sub outq, 96*mmsize

    lea rtabq, [tab_512_float]
    lea itabq, [tab_512_float + 512 - 4*7]

    cmp tgtq, 512
    je .deinterleave

    mov tmpq, 4                 ; 512/(4*mmsize) synthesis iterations

.synth_512:
    SPLIT_RADIX_LOAD_COMBINE_FULL 2*512, 6*512
    add outq, 8*mmsize
    add rtabq, 4*mmsize
    sub itabq, 4*mmsize
    sub tmpq, 1
    jg .synth_512

    cmp lenq, 512
    jg .1024pt
    ret

; 1024-point transform =========================================================
ALIGN 16
.1024pt:
    PUSH lenq
    mov lenq, 256               ; compute the two 256-point odd sub-transforms

    add outq, 96*mmsize
    call .32pt

    add outq, 64*mmsize
    call .32pt

    POP lenq
    sub outq, 192*mmsize

    lea rtabq, [tab_1024_float]
    lea itabq, [tab_1024_float + 1024 - 4*7]

    cmp tgtq, 1024
    je .deinterleave

    mov tmpq, 8                 ; 1024/(4*mmsize) synthesis iterations

.synth_1024:
    SPLIT_RADIX_LOAD_COMBINE_FULL 2*1024, 6*1024
    add outq, 8*mmsize
    add rtabq, 4*mmsize
    sub itabq, 4*mmsize
    sub tmpq, 1
    jg .synth_1024

    cmp lenq, 1024
    jg .2048pt
    ret

; 2048 to 131072-point transforms ==============================================
FFT_SPLIT_RADIX_DEF 2048,  .4096pt
FFT_SPLIT_RADIX_DEF 4096,  .8192pt
FFT_SPLIT_RADIX_DEF 8192,  .16384pt
FFT_SPLIT_RADIX_DEF 16384, .32768pt
FFT_SPLIT_RADIX_DEF 32768, .65536pt
FFT_SPLIT_RADIX_DEF 65536, .131072pt
FFT_SPLIT_RADIX_DEF 131072

;===============================================================================
; Final synthesis + deinterleaving code
;===============================================================================
.deinterleave:
    cmp lenq, 64
    je .64pt_deint

    ; tmpq = 2*len and lutq = 6*len, passed as offsets to the combine macro
    ; (the LUT pointer is dead by now, reuse its register as scratch)
    imul tmpq, lenq, 2
    lea lutq, [4*lenq + tmpq]

.synth_deinterleave:
    SPLIT_RADIX_COMBINE_DEINTERLEAVE_FULL tmpq, lutq
    add outq, 8*mmsize
    add rtabq, 4*mmsize
    sub itabq, 4*mmsize
    sub lenq, 4*mmsize
    jg .synth_deinterleave

    RET

; 64-point deinterleave which only has to load 4 registers =====================
.64pt_deint:
    SPLIT_RADIX_COMBINE_LITE 1, m0, m1, tx1_e0, tx2_e0, tw_e, tw_o, tmp1, tmp2
    SPLIT_RADIX_COMBINE_HALF 0, m2, m3, tx1_o0, tx2_o0, tw_e, tw_o, tmp1, tmp2, tw_e

    ; Interleave low/high doubles of each pair for in-order output
    unpcklpd tmp1, m0, m2
    unpcklpd tmp2, m1, m3
    unpcklpd tw_o, tx1_e0, tx1_o0
    unpcklpd tw_e, tx2_e0, tx2_o0
    unpckhpd m0, m0, m2
    unpckhpd m1, m1, m3
    unpckhpd tx1_e0, tx1_e0, tx1_o0
    unpckhpd tx2_e0, tx2_e0, tx2_o0

    vextractf128 [outq +  0*mmsize +  0], tmp1,   0
    vextractf128 [outq +  0*mmsize + 16], m0,     0
    vextractf128 [outq +  4*mmsize +  0], tmp2,   0
    vextractf128 [outq +  4*mmsize + 16], m1,     0

    vextractf128 [outq +  8*mmsize +  0], tw_o,   0
    vextractf128 [outq +  8*mmsize + 16], tx1_e0, 0
    vextractf128 [outq +  9*mmsize +  0], tw_o,   1
    vextractf128 [outq +  9*mmsize + 16], tx1_e0, 1

    ; Gather the remaining high lanes; stored at +1/+5*mmsize further down
    vperm2f128 tmp1, tmp1, m0, 0x31
    vperm2f128 tmp2, tmp2, m1, 0x31

    vextractf128 [outq + 12*mmsize +  0], tw_e,   0
    vextractf128 [outq + 12*mmsize + 16], tx2_e0, 0
    vextractf128 [outq + 13*mmsize +  0], tw_e,   1
    vextractf128 [outq + 13*mmsize + 16], tx2_e0, 1

    ; Second half of the 64-point twiddles, reversed half lane-swapped again
    movaps tw_e,           [tab_64_float + mmsize]
    vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7 - mmsize], 0x23

    ; Reload the odd values .32pt stored earlier before overwriting them
    movaps m0, [outq +  1*mmsize]
    movaps m1, [outq +  3*mmsize]
    movaps m2, [outq +  5*mmsize]
    movaps m3, [outq +  7*mmsize]

    movaps [outq +  1*mmsize], tmp1
    movaps [outq +  5*mmsize], tmp2

    SPLIT_RADIX_COMBINE 0, m0, m2, m1, m3, tx1_e1, tx2_e1, tx1_o1, tx2_o1, tw_e, tw_o, \
                           tmp1, tmp2, tx2_o0, tx1_o0, tx2_e0, tx1_e0 ; temporary registers

    unpcklpd tmp1, m0, m1
    unpcklpd tmp2, m2, m3
    unpcklpd tw_e, tx1_e1, tx1_o1
    unpcklpd tw_o, tx2_e1, tx2_o1
    unpckhpd m0, m0, m1
    unpckhpd m2, m2, m3
    unpckhpd tx1_e1, tx1_e1, tx1_o1
    unpckhpd tx2_e1, tx2_e1, tx2_o1

    vextractf128 [outq +  2*mmsize +  0], tmp1,   0
    vextractf128 [outq +  2*mmsize + 16], m0,     0
    vextractf128 [outq +  3*mmsize +  0], tmp1,   1
    vextractf128 [outq +  3*mmsize + 16], m0,     1

    vextractf128 [outq +  6*mmsize +  0], tmp2,   0
    vextractf128 [outq +  6*mmsize + 16], m2,     0
    vextractf128 [outq +  7*mmsize +  0], tmp2,   1
    vextractf128 [outq +  7*mmsize + 16], m2,     1

    vextractf128 [outq + 10*mmsize +  0], tw_e,   0
    vextractf128 [outq + 10*mmsize + 16], tx1_e1, 0
    vextractf128 [outq + 11*mmsize +  0], tw_e,   1
    vextractf128 [outq + 11*mmsize + 16], tx1_e1, 1

    vextractf128 [outq + 14*mmsize +  0], tw_o,   0
    vextractf128 [outq + 14*mmsize + 16], tx2_e1, 0
    vextractf128 [outq + 15*mmsize +  0], tw_o,   1
    vextractf128 [outq + 15*mmsize + 16], tx2_e1, 1

    RET
%endmacro
1286
; The split-radix functions use all 16 vector registers, so x86-64 only;
; the avx2 variants additionally need external AVX2 assembler support
%if ARCH_X86_64
FFT_SPLIT_RADIX_FN fma3, float,    0
FFT_SPLIT_RADIX_FN fma3, ns_float, 1
%if HAVE_AVX2_EXTERNAL
FFT_SPLIT_RADIX_FN avx2, float,    0
FFT_SPLIT_RADIX_FN avx2, ns_float, 1
%endif
%endif
1295