;******************************************************************************
;* FFT transform with SSE/AVX optimizations
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2011 Vitor Sessak
;*
;* This algorithm (though not any of the implementation details) is
;* based on libdjbfft by D. J. Bernstein.
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; These functions are not individually interchangeable with the C versions.
; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
; in blocks as convenient to the vector size.
; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
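
; For illustration, a hedged C sketch of producing the 4-wide blocked
; layout from interleaved FFTComplex data (helper name and signature are
; illustrative, not from the C sources):
;
;   // {re,im} pairs -> {4x re, 4x im} blocks; n is a multiple of 4
;   static void to_blocked4(float *dst, const float *src, int n)
;   {
;       for (int g = 0; g < n; g += 4)
;           for (int k = 0; k < 4; k++) {
;               dst[2*g + k]     = src[2*(g + k)];      // reals
;               dst[2*g + 4 + k] = src[2*(g + k) + 1];  // imaginaries
;           }
;   }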

%include "libavutil/x86/x86util.asm"

%if ARCH_X86_64
%define pointer resq
%else
%define pointer resd
%endif

struc FFTContext
    .nbits:    resd 1
    .reverse:  resd 1
    .revtab:   pointer 1
    .tmpbuf:   pointer 1
    .mdctsize: resd 1
    .mdctbits: resd 1
    .tcos:     pointer 1
    .tsin:     pointer 1
    .fftperm:  pointer 1
    .fftcalc:  pointer 1
    .imdctcalc:pointer 1
    .imdcthalf:pointer 1
endstruc
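
; The struc above must stay byte-compatible with the leading members of the
; C-side FFTContext. A sketch of the assumed C layout (trimmed; see
; libavcodec/fft.h for the authoritative definition):
;
;   typedef struct FFTContext {
;       int nbits;
;       int inverse;                   // ".reverse" above
;       uint16_t *revtab;
;       FFTComplex *tmp_buf;
;       int mdct_size;                 // size of MDCT (number of input data * 2)
;       int mdct_bits;                 // n = 2^nbits
;       FFTSample *tcos;               // pre/post rotation tables
;       FFTSample *tsin;
;       void (*fft_permute)(struct FFTContext *s, FFTComplex *z);
;       void (*fft_calc)(struct FFTContext *s, FFTComplex *z);
;       void (*imdct_calc)(struct FFTContext *s, FFTSample *out, const FFTSample *in);
;       void (*imdct_half)(struct FFTContext *s, FFTSample *out, const FFTSample *in);
;       /* further members exist in C but are not accessed from here */
;   } FFTContext;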

SECTION_RODATA 32

%define M_SQRT1_2 0.70710678118654752440
%define M_COS_PI_1_8 0.923879532511287
%define M_COS_PI_3_8 0.38268343236509

ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8
ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8

ps_root2: times 8 dd M_SQRT1_2
ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0

perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
ps_m1p1: dd 1<<31, 0

cextern ps_neg

%assign i 16
%rep 14
cextern cos_ %+ i
%assign i i<<1
%endrep

%if ARCH_X86_64
    %define pointer dq
%else
    %define pointer dd
%endif

%macro IF0 1+
%endmacro
%macro IF1 1+
    %1
%endmacro

SECTION .text

;  in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6}
;      %2 = {r1,i1,r3,i3,r5,i5,r7,i7}
;      %3, %4, %5 tmp
; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3}
;      %2 = {r4,r5,r6,r7,i4,i5,i6,i7}
%macro T8_AVX 5
    vsubps     %5, %1, %2       ; v  = %1 - %2
    vaddps     %3, %1, %2       ; w  = %1 + %2
    vmulps     %2, %5, [ps_p1p1m1p1root2]  ; v *= vals1
    vpermilps  %2, %2, [perm1]
    vblendps   %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6}
    vshufps    %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5}
    vsubps     %4, %5, %1       ; s = r - q
    vaddps     %1, %5, %1       ; u = r + q
    vpermilps  %1, %1, [perm2]  ; k  = {u1,u2,u3,u4,u6,u5,u7,u8}
    vshufps    %5, %4, %1, 0xbb
    vshufps    %3, %4, %1, 0xee
    vperm2f128 %3, %3, %5, 0x13
    vxorps     %4, %4, [ps_m1m1p1m1p1m1m1m1]  ; s *= {1,1,-1,-1,1,-1,-1,-1}
    vshufps    %2, %1, %4, 0xdd
    vshufps    %1, %1, %4, 0x88
    vperm2f128 %4, %2, %1, 0x02 ; v  = {k1,k3,s1,s3,k2,k4,s2,s4}
    vperm2f128 %1, %1, %2, 0x13 ; w  = {k6,k8,s6,s8,k5,k7,s5,s7}
    vsubps     %5, %1, %3
    vblendps   %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8}
    vsubps     %2, %4, %1       ; %2 = v - w
    vaddps     %1, %4, %1       ; %1 = v + w
%endmacro
; In SSE mode do one fft4 transform
; in:  %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
;
; In AVX mode do two fft4 transforms
; in:  %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7}
; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7}
%macro T4_SSE 3
    subps    %3, %1, %2       ; {t3,t4,-t8,t7}
    addps    %1, %1, %2       ; {t1,t2,t6,t5}
    xorps    %3, %3, [ps_p1p1m1p1]
    shufps   %2, %1, %3, 0xbe ; {t6,t5,t7,t8}
    shufps   %1, %1, %3, 0x44 ; {t1,t2,t3,t4}
    subps    %3, %1, %2       ; {r2,i2,r3,i3}
    addps    %1, %1, %2       ; {r0,i0,r1,i1}
    shufps   %2, %1, %3, 0xdd ; {i0,i1,i2,i3}
    shufps   %1, %1, %3, 0x88 ; {r0,r1,r2,r3}
%endmacro
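
; For reference, the scalar math of one fft4, matching the C template's
; fft4() (a sketch; FFTSample stands in for the template's wider
; accumulator type, and the t-names match the comments above):
;
;   #define BF(x, y, a, b) do { x = (a) - (b); y = (a) + (b); } while (0)
;
;   static void fft4(FFTComplex *z)
;   {
;       FFTSample t1, t2, t3, t4, t5, t6, t7, t8;
;       BF(t3, t1, z[0].re, z[1].re);
;       BF(t8, t6, z[3].re, z[2].re);
;       BF(z[2].re, z[0].re, t1, t6);
;       BF(t4, t2, z[0].im, z[1].im);
;       BF(t7, t5, z[2].im, z[3].im);
;       BF(z[3].im, z[1].im, t4, t8);
;       BF(z[3].re, z[1].re, t3, t7);
;       BF(z[2].im, z[0].im, t2, t5);
;   }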

; In SSE mode do one FFT8 transform
; in:  %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7}
;
; In AVX mode do two FFT8 transforms
; in:  %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11}
;      %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15}
; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11}
;      %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15}
%macro T8_SSE 6
    addps    %6, %3, %4       ; {t1,t2,t3,t4}
    subps    %3, %3, %4       ; {r5,i5,r7,i7}
    shufps   %4, %3, %3, 0xb1 ; {i5,r5,i7,r7}
    mulps    %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
    mulps    %4, %4, [ps_root2]
    addps    %3, %3, %4       ; {t8,t7,ta,t9}
    shufps   %4, %6, %3, 0x9c ; {t1,t4,t7,ta}
    shufps   %6, %6, %3, 0x36 ; {t3,t2,t9,t8}
    subps    %3, %6, %4       ; {t6,t5,tc,tb}
    addps    %6, %6, %4       ; {t1,t2,t9,ta}
    shufps   %5, %6, %3, 0x8d ; {t2,ta,t6,tc}
    shufps   %6, %6, %3, 0xd8 ; {t1,t9,t5,tb}
    subps    %3, %1, %6       ; {r4,r5,r6,r7}
    addps    %1, %1, %6       ; {r0,r1,r2,r3}
    subps    %4, %2, %5       ; {i4,i5,i6,i7}
    addps    %2, %2, %5       ; {i0,i1,i2,i3}
%endmacro

%macro INTERL 5
%if cpuflag(avx)
    vunpckhps      %3, %2, %1
    vunpcklps      %2, %2, %1
    vextractf128   %4(%5), %2, 0
    vextractf128  %4 %+ H(%5), %3, 0
    vextractf128   %4(%5 + 1), %2, 1
    vextractf128  %4 %+ H(%5 + 1), %3, 1
%elif cpuflag(sse)
    mova     %3, %2
    unpcklps %2, %1
    unpckhps %3, %1
    mova  %4(%5), %2
    mova  %4(%5+1), %3
%endif
%endmacro
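
; INTERL stores one blocked group back in interleaved {re,im} order,
; roughly (a scalar sketch; 4 lanes per xmm, 2x4 in AVX mode):
;
;   for (int k = 0; k < 4; k++) {
;       out[2*k]     = re[k];
;       out[2*k + 1] = im[k];
;   }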

; scheduled for cpu-bound sizes
%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
IF%1 mova    m4, Z(4)
IF%1 mova    m5, Z(5)
    mova     m0, %2 ; wre
    mova     m1, %3 ; wim
    mulps    m2, m4, m0 ; r2*wre
IF%1 mova    m6, Z2(6)
    mulps    m3, m5, m1 ; i2*wim
IF%1 mova    m7, Z2(7)
    mulps    m4, m4, m1 ; r2*wim
    mulps    m5, m5, m0 ; i2*wre
    addps    m2, m2, m3 ; r2*wre + i2*wim
    mulps    m3, m1, m7 ; i3*wim
    subps    m5, m5, m4 ; i2*wre - r2*wim
    mulps    m1, m1, m6 ; r3*wim
    mulps    m4, m0, m6 ; r3*wre
    mulps    m0, m0, m7 ; i3*wre
    subps    m4, m4, m3 ; r3*wre - i3*wim
    mova     m3, Z(0)
    addps    m0, m0, m1 ; i3*wre + r3*wim
    subps    m1, m4, m2 ; t3
    addps    m4, m4, m2 ; t5
    subps    m3, m3, m4 ; r2
    addps    m4, m4, Z(0) ; r0
    mova     m6, Z(2)
    mova   Z(4), m3
    mova   Z(0), m4
    subps    m3, m5, m0 ; t4
    subps    m4, m6, m3 ; r3
    addps    m3, m3, m6 ; r1
    mova  Z2(6), m4
    mova   Z(2), m3
    mova     m2, Z(3)
    addps    m3, m5, m0 ; t6
    subps    m2, m2, m1 ; i3
    mova     m7, Z(1)
    addps    m1, m1, Z(3) ; i1
    mova  Z2(7), m2
    mova   Z(3), m1
    subps    m4, m7, m3 ; i2
    addps    m3, m3, m7 ; i0
    mova   Z(5), m4
    mova   Z(1), m3
%endmacro

; scheduled to avoid store->load aliasing
%macro PASS_BIG 1 ; (!interleave)
    mova     m4, Z(4) ; r2
    mova     m5, Z(5) ; i2
    mova     m0, [wq] ; wre
    mova     m1, [wq+o1q] ; wim
    mulps    m2, m4, m0 ; r2*wre
    mova     m6, Z2(6) ; r3
    mulps    m3, m5, m1 ; i2*wim
    mova     m7, Z2(7) ; i3
    mulps    m4, m4, m1 ; r2*wim
    mulps    m5, m5, m0 ; i2*wre
    addps    m2, m2, m3 ; r2*wre + i2*wim
    mulps    m3, m1, m7 ; i3*wim
    mulps    m1, m1, m6 ; r3*wim
    subps    m5, m5, m4 ; i2*wre - r2*wim
    mulps    m4, m0, m6 ; r3*wre
    mulps    m0, m0, m7 ; i3*wre
    subps    m4, m4, m3 ; r3*wre - i3*wim
    mova     m3, Z(0)
    addps    m0, m0, m1 ; i3*wre + r3*wim
    subps    m1, m4, m2 ; t3
    addps    m4, m4, m2 ; t5
    subps    m3, m3, m4 ; r2
    addps    m4, m4, Z(0) ; r0
    mova     m6, Z(2)
    mova   Z(4), m3
    mova   Z(0), m4
    subps    m3, m5, m0 ; t4
    subps    m4, m6, m3 ; r3
    addps    m3, m3, m6 ; r1
IF%1 mova Z2(6), m4
IF%1 mova  Z(2), m3
    mova     m2, Z(3)
    addps    m5, m5, m0 ; t6
    subps    m2, m2, m1 ; i3
    mova     m7, Z(1)
    addps    m1, m1, Z(3) ; i1
IF%1 mova Z2(7), m2
IF%1 mova  Z(3), m1
    subps    m6, m7, m5 ; i2
    addps    m5, m5, m7 ; i0
IF%1 mova  Z(5), m6
IF%1 mova  Z(1), m5
%if %1==0
    INTERL m1, m3, m7, Z, 2
    INTERL m2, m4, m0, Z2, 6

    mova     m1, Z(0)
    mova     m2, Z(4)

    INTERL m5, m1, m3, Z, 0
    INTERL m6, m2, m7, Z, 4
%endif
%endmacro
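
; Per lane, both PASS macros above compute the following split-radix
; combine (a scalar sketch; t-names match the inline comments, primes
; denote the values written back):
;
;   t5 = (r2*wre + i2*wim) + (r3*wre - i3*wim);
;   t3 = (r3*wre - i3*wim) - (r2*wre + i2*wim);
;   t6 = (i2*wre - r2*wim) + (i3*wre + r3*wim);
;   t4 = (i2*wre - r2*wim) - (i3*wre + r3*wim);
;   r0' = r0 + t5;   r2' = r0 - t5;
;   r1' = r1 + t4;   r3' = r1 - t4;
;   i0' = i0 + t6;   i2' = i0 - t6;
;   i1' = i1 + t3;   i3' = i1 - t3;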

%define Z(x) [r0+mmsize*x]
%define Z2(x) [r0+mmsize*x]
%define ZH(x) [r0+mmsize*x+mmsize/2]

INIT_YMM avx

%if HAVE_AVX_EXTERNAL
align 16
fft8_avx:
    mova      m0, Z(0)
    mova      m1, Z(1)
    T8_AVX    m0, m1, m2, m3, m4
    mova      Z(0), m0
    mova      Z(1), m1
    ret


align 16
fft16_avx:
    mova       m2, Z(2)
    mova       m3, Z(3)
    T4_SSE     m2, m3, m7

    mova       m0, Z(0)
    mova       m1, Z(1)
    T8_AVX     m0, m1, m4, m5, m7

    mova       m4, [ps_cos16_1]
    mova       m5, [ps_cos16_2]
    vmulps     m6, m2, m4
    vmulps     m7, m3, m5
    vaddps     m7, m7, m6
    vmulps     m2, m2, m5
    vmulps     m3, m3, m4
    vsubps     m3, m3, m2
    vblendps   m2, m7, m3, 0xf0
    vperm2f128 m3, m7, m3, 0x21
    vaddps     m4, m2, m3
    vsubps     m2, m3, m2
    vperm2f128 m2, m2, m2, 0x01
    vsubps     m3, m1, m2
    vaddps     m1, m1, m2
    vsubps     m5, m0, m4
    vaddps     m0, m0, m4
    vextractf128   Z(0), m0, 0
    vextractf128  ZH(0), m1, 0
    vextractf128   Z(1), m0, 1
    vextractf128  ZH(1), m1, 1
    vextractf128   Z(2), m5, 0
    vextractf128  ZH(2), m3, 0
    vextractf128   Z(3), m5, 1
    vextractf128  ZH(3), m3, 1
    ret

align 16
fft32_avx:
    call fft16_avx

    mova m0, Z(4)
    mova m1, Z(5)

    T4_SSE      m0, m1, m4

    mova m2, Z(6)
    mova m3, Z(7)

    T8_SSE      m0, m1, m2, m3, m4, m6
    ; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11}
    ; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15}

    vperm2f128  m4, m0, m2, 0x20
    vperm2f128  m5, m1, m3, 0x20
    vperm2f128  m6, m0, m2, 0x31
    vperm2f128  m7, m1, m3, 0x31

    PASS_SMALL 0, [cos_32], [cos_32+32]

    ret

fft32_interleave_avx:
    call fft32_avx
    mov r2d, 32
.deint_loop:
    mova     m2, Z(0)
    mova     m3, Z(1)
    vunpcklps      m0, m2, m3
    vunpckhps      m1, m2, m3
    vextractf128   Z(0), m0, 0
    vextractf128  ZH(0), m1, 0
    vextractf128   Z(1), m0, 1
    vextractf128  ZH(1), m1, 1
    add r0, mmsize*2
    sub r2d, mmsize/4
    jg .deint_loop
    ret

%endif

INIT_XMM sse

align 16
fft4_avx:
fft4_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova   Z(0), m0
    mova   Z(1), m1
    ret

align 16
fft8_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova     m2, Z(2)
    mova     m3, Z(3)
    T8_SSE   m0, m1, m2, m3, m4, m5
    mova   Z(0), m0
    mova   Z(1), m1
    mova   Z(2), m2
    mova   Z(3), m3
    ret

align 16
fft16_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova     m2, Z(2)
    mova     m3, Z(3)
    T8_SSE   m0, m1, m2, m3, m4, m5
    mova     m4, Z(4)
    mova     m5, Z(5)
    mova   Z(0), m0
    mova   Z(1), m1
    mova   Z(2), m2
    mova   Z(3), m3
    T4_SSE   m4, m5, m6
    mova     m6, Z2(6)
    mova     m7, Z2(7)
    T4_SSE   m6, m7, m0
    PASS_SMALL 0, [cos_16], [cos_16+16]
    ret


%define Z(x) [zcq + o1q*(x&6) + mmsize*(x&1)]
%define Z2(x) [zcq + o3q + mmsize*(x&1)]
%define ZH(x) [zcq + o1q*(x&6) + mmsize*(x&1) + mmsize/2]
%define Z2H(x) [zcq + o3q + mmsize*(x&1) + mmsize/2]

%macro DECL_PASS 2+ ; name, payload
align 16
%1:
DEFINE_ARGS zc, w, n, o1, o3
    lea o3q, [nq*3]
    lea o1q, [nq*8]
    shl o3q, 4
.loop:
    %2
    add zcq, mmsize*2
    add  wq, mmsize
    sub  nd, mmsize/8
    jg .loop
    rep ret
%endmacro

%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
    lea r2, [dispatch_tab%1]
    mov r2, [r2 + (%2q-2)*gprsize]
%ifdef PIC
    lea r3, [$$]
    add r2, r3
%endif
    call r2
%endmacro ; FFT_DISPATCH
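
; In C terms the dispatch amounts to the following (a sketch; the tables
; declared by DECL_FFT below hold one entry per transform size, starting
; at nbits == 2, i.e. FFT4; in PIC builds entries are stored $$-relative
; and rebased before the call):
;
;   typedef void (*fft_fn)(void);
;   fft_fn fft = dispatch_tab[nbits - 2];
;   fft();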

INIT_YMM avx

%if HAVE_AVX_EXTERNAL
DECL_PASS pass_avx, PASS_BIG 1
DECL_PASS pass_interleave_avx, PASS_BIG 0

cglobal fft_calc, 2,5,8
    mov     r3d, [r0 + FFTContext.nbits]
    mov     r0, r1
    mov     r1, r3
    FFT_DISPATCH _interleave %+ SUFFIX, r1
    REP_RET

%endif

INIT_XMM sse

DECL_PASS pass_sse, PASS_BIG 1
DECL_PASS pass_interleave_sse, PASS_BIG 0

INIT_XMM sse
cglobal fft_calc, 2,5,8
    mov     r3d, [r0 + FFTContext.nbits]
    PUSH    r1
    PUSH    r3
    mov     r0, r1
    mov     r1, r3
    FFT_DISPATCH _interleave %+ SUFFIX, r1
    POP     rcx
    POP     r4
    cmp     rcx, 3+(mmsize/16)
    jg      .end
    mov     r2, -1
    add     rcx, 3
    shl     r2, cl
    sub     r4, r2
.loop:
    movaps   xmm0, [r4 + r2]
    movaps   xmm1, xmm0
    unpcklps xmm0, [r4 + r2 + 16]
    unpckhps xmm1, [r4 + r2 + 16]
    movaps   [r4 + r2],      xmm0
    movaps   [r4 + r2 + 16], xmm1
    add      r2, mmsize*2
    jl       .loop
.end:
    REP_RET

cglobal fft_permute, 2,7,1
    mov     r4,  [r0 + FFTContext.revtab]
    mov     r5,  [r0 + FFTContext.tmpbuf]
    mov     ecx, [r0 + FFTContext.nbits]
    mov     r2, 1
    shl     r2, cl
    xor     r0, r0
%if ARCH_X86_32
    mov     r1, r1m
%endif
.loop:
    movaps  xmm0, [r1 + 8*r0]
    movzx   r6, word [r4 + 2*r0]
    movzx   r3, word [r4 + 2*r0 + 2]
    movlps  [r5 + 8*r6], xmm0
    movhps  [r5 + 8*r3], xmm0
    add     r0, 2
    cmp     r0, r2
    jl      .loop
    shl     r2, 3
    add     r1, r2
    add     r5, r2
    neg     r2
; nbits >= 2 (FFT4) and sizeof(FFTComplex)=8 => at least 32B
.loopcopy:
    movaps  xmm0, [r5 + r2]
    movaps  xmm1, [r5 + r2 + 16]
    movaps  [r1 + r2], xmm0
    movaps  [r1 + r2 + 16], xmm1
    add     r2, 32
    jl      .loopcopy
    REP_RET
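
; The permutation above is the C-side algorithm done two complexes at a
; time; a sketch following the C version (cf. ff_fft_permute_c; memcpy
; from <string.h>):
;
;   static void fft_permute(FFTContext *s, FFTComplex *z)
;   {
;       int np = 1 << s->nbits;
;       for (int i = 0; i < np; i++)
;           s->tmp_buf[s->revtab[i]] = z[i];
;       memcpy(z, s->tmp_buf, np * sizeof(FFTComplex));
;   }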

INIT_XMM sse
cglobal imdct_calc, 3,5,3
    mov     r3d, [r0 + FFTContext.mdctsize]
    mov     r4,  [r0 + FFTContext.imdcthalf]
    add     r1,  r3
    PUSH    r3
    PUSH    r1
%if ARCH_X86_32
    push    r2
    push    r1
    push    r0
%else
    sub     rsp, 8+32*WIN64 ; allocate win64 shadow space
%endif
    call    r4
%if ARCH_X86_32
    add     esp, 12
%else
    add     rsp, 8+32*WIN64
%endif
    POP     r1
    POP     r3
    lea     r0, [r1 + 2*r3]
    mov     r2, r3
    sub     r3, mmsize
    neg     r2
    mova    m2, [ps_neg]
.loop:
    mova    m0, [r1 + r3]
    mova    m1, [r0 + r2]
    shufps  m0, m0, 0x1b
    shufps  m1, m1, 0x1b
    xorps   m0, m2
    mova [r0 + r3], m1
    mova [r1 + r2], m0
    sub     r3, mmsize
    add     r2, mmsize
    jl      .loop
    REP_RET
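
; A sketch of the C counterpart (cf. ff_imdct_calc_c): run imdct_half into
; the second quarter of the buffer, then mirror it with sign flips, which
; is what the loop above vectorizes with shufps 0x1b (reverse) + xorps:
;
;   static void imdct_calc(FFTContext *s, FFTSample *out, const FFTSample *in)
;   {
;       int n = 1 << s->mdct_bits, n2 = n >> 1, n4 = n >> 2;
;       s->imdct_half(s, out + n4, in);
;       for (int k = 0; k < n4; k++) {
;           out[k]         = -out[n2 - k - 1];
;           out[n - k - 1] =  out[n2 + k];
;       }
;   }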

%ifdef PIC
%define SECTION_REL - $$
%else
%define SECTION_REL
%endif

%macro DECL_FFT 1-2 ; nbits, suffix
%ifidn %0, 1
%xdefine fullsuffix SUFFIX
%else
%xdefine fullsuffix %2 %+ SUFFIX
%endif
%xdefine list_of_fft fft4 %+ SUFFIX SECTION_REL, fft8 %+ SUFFIX SECTION_REL
%if %1>=5
%xdefine list_of_fft list_of_fft, fft16 %+ SUFFIX SECTION_REL
%endif
%if %1>=6
%xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL
%endif

%assign n 1<<%1
%rep 18-%1
%assign n2 n/2
%assign n4 n/4
%xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL

align 16
fft %+ n %+ fullsuffix:
    call fft %+ n2 %+ SUFFIX
    add r0, n*4 - (n&(-2<<%1))
    call fft %+ n4 %+ SUFFIX
    add r0, n*2 - (n2&(-2<<%1))
    call fft %+ n4 %+ SUFFIX
    sub r0, n*6 + (n2&(-2<<%1))
    lea r1, [cos_ %+ n]
    mov r2d, n4/2
    jmp pass %+ fullsuffix

%assign n n*2
%endrep
%undef n

align 8
dispatch_tab %+ fullsuffix: pointer list_of_fft
%endmacro ; DECL_FFT
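
; Each generated fftN body mirrors the C template's recursion (a sketch;
; the add/sub on r0 above apply the same offsets in bytes, adjusted for
; the blocked layout):
;
;   static void fftN(FFTComplex *z)    // N = 1 << nbits
;   {
;       fft_N2(z);                     // N/2-point FFT on z[0..N/2)
;       fft_N4(z + N/2);               // N/4-point FFT on z[N/2..3N/4)
;       fft_N4(z + 3*N/4);             // N/4-point FFT on z[3N/4..N)
;       pass(z, cos_N, N/8);           // combine, with the cos_N twiddles
;   }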

%if HAVE_AVX_EXTERNAL
INIT_YMM avx
DECL_FFT 6
DECL_FFT 6, _interleave
%endif
INIT_XMM sse
DECL_FFT 5
DECL_FFT 5, _interleave

INIT_XMM sse
%undef mulps
%undef addps
%undef subps
%undef unpcklps
%undef unpckhps

%macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
    movaps   xmm0, [%3+%2*4]
    movaps   xmm1, [%3+%1*4-0x10]
    movaps   xmm2, xmm0
    shufps   xmm0, xmm1, 0x88
    shufps   xmm1, xmm2, 0x77
    movlps   xmm4, [%4+%2*2]
    movlps   xmm5, [%5+%2*2+0x0]
    movhps   xmm4, [%4+%1*2-0x8]
    movhps   xmm5, [%5+%1*2-0x8]
    movaps   xmm2, xmm0
    movaps   xmm3, xmm1
    mulps    xmm0, xmm5
    mulps    xmm1, xmm4
    mulps    xmm2, xmm4
    mulps    xmm3, xmm5
    subps    xmm1, xmm0
    addps    xmm2, xmm3
    movaps   xmm0, xmm1
    unpcklps xmm1, xmm2
    unpckhps xmm0, xmm2
%endmacro
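
; PREROTATER computes the rotated values for four output pairs; the .pre
; loop in imdct_half below scatters them through revtab. Together this
; matches the pre-rotation loop of the C imdct_half (a sketch, with CMUL
; as defined in the C mdct code; n2 = n/2, n4 = n/4):
;
;   #define CMUL(dre, dim, are, aim, bre, bim) do { \
;       (dre) = (are) * (bre) - (aim) * (bim);      \
;       (dim) = (are) * (bim) + (aim) * (bre);      \
;   } while (0)
;
;   const FFTSample *in1 = input;          // walks forward
;   const FFTSample *in2 = input + n2 - 1; // walks backward
;   for (int k = 0; k < n4; k++) {
;       int j = revtab[k];
;       CMUL(z[j].re, z[j].im, *in2, *in1, tcos[k], tsin[k]);
;       in1 += 2;
;       in2 -= 2;
;   }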

%macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
    mulps      m6, %3, [%5+%1]
    mulps      m7, %2, [%5+%1]
    mulps      %2, %2, [%6+%1]
    mulps      %3, %3, [%6+%1]
    subps      %2, %2, m6
    addps      %3, %3, m7
%endmacro
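
; With a = %2 and b = %3, the macro above leaves a' = a*tsin[j] - b*tcos[j]
; and b' = b*tsin[j] + a*tcos[j], i.e. a complex multiply by the
; (tcos, tsin) twiddle; a scalar sketch:
;
;   void cmul_lane(float *a, float *b, float c, float s)
;   {
;       float t = *a;
;       *a = t * s - *b * c;
;       *b = *b * s + t * c;
;   }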

%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
.post:
%if cpuflag(avx)
    vmovaps      ymm1,   [%3+%1*2]
    vmovaps      ymm0,   [%3+%1*2+0x20]
    vmovaps      ymm3,   [%3+%2*2]
    vmovaps      ymm2,   [%3+%2*2+0x20]

    CMUL         %1, ymm0, ymm1, %3, %4, %5
    CMUL         %2, ymm2, ymm3, %3, %4, %5
    vshufps      ymm1, ymm1, ymm1, 0x1b
    vshufps      ymm3, ymm3, ymm3, 0x1b
    vperm2f128   ymm1, ymm1, ymm1, 0x01
    vperm2f128   ymm3, ymm3, ymm3, 0x01
    vunpcklps    ymm6, ymm2, ymm1
    vunpckhps    ymm4, ymm2, ymm1
    vunpcklps    ymm7, ymm0, ymm3
    vunpckhps    ymm5, ymm0, ymm3

    vextractf128 [%3+%1*2],      ymm7, 0
    vextractf128 [%3+%1*2+0x10], ymm5, 0
    vextractf128 [%3+%1*2+0x20], ymm7, 1
    vextractf128 [%3+%1*2+0x30], ymm5, 1

    vextractf128 [%3+%2*2],      ymm6, 0
    vextractf128 [%3+%2*2+0x10], ymm4, 0
    vextractf128 [%3+%2*2+0x20], ymm6, 1
    vextractf128 [%3+%2*2+0x30], ymm4, 1
    sub      %2,   0x20
    add      %1,   0x20
    jl       .post
%else
    movaps   xmm1, [%3+%1*2]
    movaps   xmm0, [%3+%1*2+0x10]
    CMUL     %1,   xmm0, xmm1, %3, %4, %5
    movaps   xmm5, [%3+%2*2]
    movaps   xmm4, [%3+%2*2+0x10]
    CMUL     %2,   xmm4, xmm5, %3, %4, %5
    shufps   xmm1, xmm1, 0x1b
    shufps   xmm5, xmm5, 0x1b
    movaps   xmm6, xmm4
    unpckhps xmm4, xmm1
    unpcklps xmm6, xmm1
    movaps   xmm2, xmm0
    unpcklps xmm0, xmm5
    unpckhps xmm2, xmm5
    movaps   [%3+%2*2],      xmm6
    movaps   [%3+%2*2+0x10], xmm4
    movaps   [%3+%1*2],      xmm0
    movaps   [%3+%1*2+0x10], xmm2
    sub      %2,   0x10
    add      %1,   0x10
    jl       .post
%endif
%endmacro

%macro DECL_IMDCT 0
cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input
%if ARCH_X86_64
%define rrevtab r7
%define rtcos   r8
%define rtsin   r9
%else
%define rrevtab r6
%define rtsin   r6
%define rtcos   r5
%endif
    mov   r3d, [r0+FFTContext.mdctsize]
    add   r2, r3
    shr   r3, 1
    mov   rtcos, [r0+FFTContext.tcos]
    mov   rtsin, [r0+FFTContext.tsin]
    add   rtcos, r3
    add   rtsin, r3
%if ARCH_X86_64 == 0
    push  rtcos
    push  rtsin
%endif
    shr   r3, 1
    mov   rrevtab, [r0+FFTContext.revtab]
    add   rrevtab, r3
%if ARCH_X86_64 == 0
    push  rrevtab
%endif

    sub   r3, 4
%if ARCH_X86_64
    xor   r4, r4
    sub   r4, r3
%endif
.pre:
%if ARCH_X86_64 == 0
; unspill
    xor   r4, r4
    sub   r4, r3
    mov   rtcos, [esp+8]
    mov   rtsin, [esp+4]
%endif

    PREROTATER r4, r3, r2, rtcos, rtsin
%if ARCH_X86_64
    movzx  r5,  word [rrevtab+r4-4]
    movzx  r6,  word [rrevtab+r4-2]
    movzx  r10, word [rrevtab+r3]
    movzx  r11, word [rrevtab+r3+2]
    movlps [r1+r5 *8], xmm0
    movhps [r1+r6 *8], xmm0
    movlps [r1+r10*8], xmm1
    movhps [r1+r11*8], xmm1
    add    r4, 4
%else
    mov    r6, [esp]
    movzx  r5, word [r6+r4-4]
    movzx  r4, word [r6+r4-2]
    movlps [r1+r5*8], xmm0
    movhps [r1+r4*8], xmm0
    movzx  r5, word [r6+r3]
    movzx  r4, word [r6+r3+2]
    movlps [r1+r5*8], xmm1
    movhps [r1+r4*8], xmm1
%endif
    sub    r3, 4
    jns    .pre

    mov  r5, r0
    mov  r6, r1
    mov  r0, r1
    mov  r1d, [r5+FFTContext.nbits]

    FFT_DISPATCH SUFFIX, r1

    mov  r0d, [r5+FFTContext.mdctsize]
    add  r6, r0
    shr  r0, 1
%if ARCH_X86_64 == 0
%define rtcos r2
%define rtsin r3
    mov  rtcos, [esp+8]
    mov  rtsin, [esp+4]
%endif
    neg  r0
    mov  r1, -mmsize
    sub  r1, r0
    POSROTATESHUF r0, r1, r6, rtcos, rtsin
%if ARCH_X86_64 == 0
    add esp, 12
%endif
    RET
%endmacro

DECL_IMDCT

INIT_YMM avx

%if HAVE_AVX_EXTERNAL
DECL_IMDCT
%endif