1cabdff1aSopenharmony_ci;******************************************************************************
2cabdff1aSopenharmony_ci;* FFT transform with SSE/AVX optimizations
3cabdff1aSopenharmony_ci;* Copyright (c) 2008 Loren Merritt
4cabdff1aSopenharmony_ci;* Copyright (c) 2011 Vitor Sessak
5cabdff1aSopenharmony_ci;*
6cabdff1aSopenharmony_ci;* This algorithm (though not any of the implementation details) is
7cabdff1aSopenharmony_ci;* based on libdjbfft by D. J. Bernstein.
8cabdff1aSopenharmony_ci;*
9cabdff1aSopenharmony_ci;* This file is part of FFmpeg.
10cabdff1aSopenharmony_ci;*
11cabdff1aSopenharmony_ci;* FFmpeg is free software; you can redistribute it and/or
12cabdff1aSopenharmony_ci;* modify it under the terms of the GNU Lesser General Public
13cabdff1aSopenharmony_ci;* License as published by the Free Software Foundation; either
14cabdff1aSopenharmony_ci;* version 2.1 of the License, or (at your option) any later version.
15cabdff1aSopenharmony_ci;*
16cabdff1aSopenharmony_ci;* FFmpeg is distributed in the hope that it will be useful,
17cabdff1aSopenharmony_ci;* but WITHOUT ANY WARRANTY; without even the implied warranty of
18cabdff1aSopenharmony_ci;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19cabdff1aSopenharmony_ci;* Lesser General Public License for more details.
20cabdff1aSopenharmony_ci;*
21cabdff1aSopenharmony_ci;* You should have received a copy of the GNU Lesser General Public
22cabdff1aSopenharmony_ci;* License along with FFmpeg; if not, write to the Free Software
23cabdff1aSopenharmony_ci;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24cabdff1aSopenharmony_ci;******************************************************************************
25cabdff1aSopenharmony_ci
26cabdff1aSopenharmony_ci; These functions are not individually interchangeable with the C versions.
27cabdff1aSopenharmony_ci; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
28cabdff1aSopenharmony_ci; in blocks as convenient to the vector size.
29cabdff1aSopenharmony_ci; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
30cabdff1aSopenharmony_ci
31cabdff1aSopenharmony_ci%include "libavutil/x86/x86util.asm"
32cabdff1aSopenharmony_ci
; `pointer` reserves one pointer-sized struc field: resq on x86-64, resd
; otherwise. It is redefined later in the file to emit data instead.
33cabdff1aSopenharmony_ci%if ARCH_X86_64
34cabdff1aSopenharmony_ci%define pointer resq
35cabdff1aSopenharmony_ci%else
36cabdff1aSopenharmony_ci%define pointer resd
37cabdff1aSopenharmony_ci%endif
38cabdff1aSopenharmony_ci
; Field offsets of the FFT context as seen from assembly.
; NOTE(review): presumably this has to mirror the layout of the C-side
; context structure field for field - confirm against the C definition.
39cabdff1aSopenharmony_cistruc FFTContext
40cabdff1aSopenharmony_ci    .nbits:    resd 1 ; read by fft_calc and fft_permute
41cabdff1aSopenharmony_ci    .reverse:  resd 1
42cabdff1aSopenharmony_ci    .revtab:   pointer 1 ; read by fft_permute (16-bit entries)
43cabdff1aSopenharmony_ci    .tmpbuf:   pointer 1 ; read by fft_permute (scatter target)
44cabdff1aSopenharmony_ci    .mdctsize: resd 1 ; read by imdct_calc
45cabdff1aSopenharmony_ci    .mdctbits: resd 1
46cabdff1aSopenharmony_ci    .tcos:     pointer 1
47cabdff1aSopenharmony_ci    .tsin:     pointer 1
48cabdff1aSopenharmony_ci    .fftperm:  pointer 1
49cabdff1aSopenharmony_ci    .fftcalc:  pointer 1
50cabdff1aSopenharmony_ci    .imdctcalc:pointer 1
51cabdff1aSopenharmony_ci    .imdcthalf:pointer 1 ; function pointer called by imdct_calc
52cabdff1aSopenharmony_ciendstruc
53cabdff1aSopenharmony_ci
; Read-only constant tables.
; NOTE(review): the "32" is presumably the section alignment, needed because
; these tables are used as aligned ymm memory operands - confirm in x86inc.
54cabdff1aSopenharmony_ciSECTION_RODATA 32
55cabdff1aSopenharmony_ci
; Scalar constants: 1/sqrt(2), cos(pi/8) and cos(3*pi/8).
56cabdff1aSopenharmony_ci%define M_SQRT1_2 0.70710678118654752440
57cabdff1aSopenharmony_ci%define M_COS_PI_1_8 0.923879532511287
58cabdff1aSopenharmony_ci%define M_COS_PI_3_8 0.38268343236509
59cabdff1aSopenharmony_ci
; Multiplier vectors applied by fft16_avx to the outputs of its T4_SSE step.
60cabdff1aSopenharmony_cips_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8
61cabdff1aSopenharmony_cips_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8
62cabdff1aSopenharmony_ci
; 1/sqrt(2) broadcast, and the same magnitude with a -,+,+,- sign pattern;
; both are consumed by T8_SSE.
63cabdff1aSopenharmony_cips_root2: times 8 dd M_SQRT1_2
64cabdff1aSopenharmony_cips_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
; Sign-bit mask (1<<31 in elements 2 and 6); T4_SSE XORs it in to negate
; those elements.
65cabdff1aSopenharmony_cips_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0
66cabdff1aSopenharmony_ci
; Tables referenced by T8_AVX: perm1/perm2 are vpermilps index vectors,
; ps_p1p1m1p1root2 is a vmulps multiplier and ps_m1m1p1m1p1m1m1m1 is a
; sign-bit mask applied with vxorps.
67cabdff1aSopenharmony_ciperm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
68cabdff1aSopenharmony_ciperm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
69cabdff1aSopenharmony_cips_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
70cabdff1aSopenharmony_cips_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
; 8-byte sign-bit mask {1<<31, 0}. No user appears before the PREROTATER
; macro; presumably it is consumed further down the file.
71cabdff1aSopenharmony_cips_m1p1: dd 1<<31, 0
72cabdff1aSopenharmony_ci
; External mask that imdct_calc XORs into its data.
73cabdff1aSopenharmony_cicextern ps_neg
74cabdff1aSopenharmony_ci
; Import the externally defined tables cos_16, cos_32, ... cos_131072:
; 14 symbols, the numeric suffix doubling on every iteration.
75cabdff1aSopenharmony_ci%assign i 16
76cabdff1aSopenharmony_ci%rep 14
77cabdff1aSopenharmony_cicextern cos_ %+ i
78cabdff1aSopenharmony_ci%assign i i<<1
79cabdff1aSopenharmony_ci%endrep
80cabdff1aSopenharmony_ci
; From here on `pointer` emits a pointer-sized data item (dq/dd) instead of
; reserving struc space; DECL_FFT uses it to build the dispatch tables.
81cabdff1aSopenharmony_ci%if ARCH_X86_64
82cabdff1aSopenharmony_ci    %define pointer dq
83cabdff1aSopenharmony_ci%else
84cabdff1aSopenharmony_ci    %define pointer dd
85cabdff1aSopenharmony_ci%endif
86cabdff1aSopenharmony_ci
; Conditional-emit helpers: IF0 swallows its argument, IF1 emits it verbatim.
; PASS_SMALL and PASS_BIG select between them by pasting a 0/1 macro
; parameter onto "IF" (IF%1 <instruction>).
87cabdff1aSopenharmony_ci%macro IF0 1+
88cabdff1aSopenharmony_ci%endmacro
89cabdff1aSopenharmony_ci%macro IF1 1+
90cabdff1aSopenharmony_ci    %1
91cabdff1aSopenharmony_ci%endmacro
92cabdff1aSopenharmony_ci
93cabdff1aSopenharmony_ciSECTION .text
94cabdff1aSopenharmony_ci
95cabdff1aSopenharmony_ci;  in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6}
96cabdff1aSopenharmony_ci;      %2 = {r1,i1,r3,i3,r5,i5,r7,i7}
97cabdff1aSopenharmony_ci;      %3, %4, %5 tmp
98cabdff1aSopenharmony_ci; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3}
99cabdff1aSopenharmony_ci;      %2 = {r4,r5,r6,r7,i4,i5,i6,i7}
; All five parameters are vector registers. %1/%2 are both input and output,
; %3-%5 are overwritten. Memory constants used: ps_p1p1m1p1root2, perm1,
; perm2 and ps_m1m1p1m1p1m1m1m1. Invoked by fft8_avx and fft16_avx.
100cabdff1aSopenharmony_ci%macro T8_AVX 5
101cabdff1aSopenharmony_ci    vsubps     %5, %1, %2       ; v  = %1 - %2
102cabdff1aSopenharmony_ci    vaddps     %3, %1, %2       ; w  = %1 + %2
103cabdff1aSopenharmony_ci    vmulps     %2, %5, [ps_p1p1m1p1root2]  ; v *= vals1
104cabdff1aSopenharmony_ci    vpermilps  %2, %2, [perm1]  ; reorder v within each 128-bit lane
105cabdff1aSopenharmony_ci    vblendps   %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6}
106cabdff1aSopenharmony_ci    vshufps    %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5}
107cabdff1aSopenharmony_ci    vsubps     %4, %5, %1       ; s = r - q
108cabdff1aSopenharmony_ci    vaddps     %1, %5, %1       ; u = r + q
109cabdff1aSopenharmony_ci    vpermilps  %1, %1, [perm2]  ; k  = {u1,u2,u3,u4,u6,u5,u7,u8}
110cabdff1aSopenharmony_ci    vshufps    %5, %4, %1, 0xbb
111cabdff1aSopenharmony_ci    vshufps    %3, %4, %1, 0xee
112cabdff1aSopenharmony_ci    vperm2f128 %3, %3, %5, 0x13 ; low lane = high(%5), high lane = high(%3)
113cabdff1aSopenharmony_ci    vxorps     %4, %4, [ps_m1m1p1m1p1m1m1m1]  ; s *= {1,1,-1,-1,1,-1,-1,-1}
114cabdff1aSopenharmony_ci    vshufps    %2, %1, %4, 0xdd
115cabdff1aSopenharmony_ci    vshufps    %1, %1, %4, 0x88
116cabdff1aSopenharmony_ci    vperm2f128 %4, %2, %1, 0x02 ; v  = {k1,k3,s1,s3,k2,k4,s2,s4}
117cabdff1aSopenharmony_ci    vperm2f128 %1, %1, %2, 0x13 ; w  = {k6,k8,s6,s8,k5,k7,s5,s7}
118cabdff1aSopenharmony_ci    vsubps     %5, %1, %3
119cabdff1aSopenharmony_ci    vblendps   %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8}
120cabdff1aSopenharmony_ci    vsubps     %2, %4, %1       ; %2 = v - w
121cabdff1aSopenharmony_ci    vaddps     %1, %4, %1       ; %1 = v + w
122cabdff1aSopenharmony_ci%endmacro
123cabdff1aSopenharmony_ci
124cabdff1aSopenharmony_ci; In SSE mode do one fft4 transform
125cabdff1aSopenharmony_ci; in:  %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
126cabdff1aSopenharmony_ci; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
127cabdff1aSopenharmony_ci;
128cabdff1aSopenharmony_ci; In AVX mode do two fft4 transforms
129cabdff1aSopenharmony_ci; in:  %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7}
130cabdff1aSopenharmony_ci; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7}
; %1/%2 are input and output registers, %3 is a scratch register.
; The body is written in 3-operand form and is expanded both under
; INIT_XMM sse (fft4_sse, fft8_sse, fft16_sse) and under INIT_YMM avx
; (fft16_avx, fft32_avx).
; NOTE(review): this presumably relies on the instruction wrappers from the
; included x86util.asm to lower 3-operand forms for plain SSE - confirm.
131cabdff1aSopenharmony_ci%macro T4_SSE 3
132cabdff1aSopenharmony_ci    subps    %3, %1, %2       ; {t3,t4,-t8,t7}
133cabdff1aSopenharmony_ci    addps    %1, %1, %2       ; {t1,t2,t6,t5}
134cabdff1aSopenharmony_ci    xorps    %3, %3, [ps_p1p1m1p1] ; flip the sign of element 2: -t8 -> t8
135cabdff1aSopenharmony_ci    shufps   %2, %1, %3, 0xbe ; {t6,t5,t7,t8}
136cabdff1aSopenharmony_ci    shufps   %1, %1, %3, 0x44 ; {t1,t2,t3,t4}
137cabdff1aSopenharmony_ci    subps    %3, %1, %2       ; {r2,i2,r3,i3}
138cabdff1aSopenharmony_ci    addps    %1, %1, %2       ; {r0,i0,r1,i1}
139cabdff1aSopenharmony_ci    shufps   %2, %1, %3, 0xdd ; {i0,i1,i2,i3}
140cabdff1aSopenharmony_ci    shufps   %1, %1, %3, 0x88 ; {r0,r1,r2,r3}
141cabdff1aSopenharmony_ci%endmacro
142cabdff1aSopenharmony_ci
143cabdff1aSopenharmony_ci; In SSE mode do one FFT8
144cabdff1aSopenharmony_ci; in:  %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
145cabdff1aSopenharmony_ci; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7}
146cabdff1aSopenharmony_ci;
147cabdff1aSopenharmony_ci; In AVX mode do two FFT8
148cabdff1aSopenharmony_ci; in:  %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11}
149cabdff1aSopenharmony_ci;      %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15}
150cabdff1aSopenharmony_ci; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11}
151cabdff1aSopenharmony_ci;      %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15}
; %1-%4 are input and output registers; %5 and %6 are scratch registers.
; %3 and %4 receive the second half of the result (the two subps at the end).
; Memory constants used: ps_root2mppm and ps_root2.
152cabdff1aSopenharmony_ci%macro T8_SSE 6
153cabdff1aSopenharmony_ci    addps    %6, %3, %4       ; {t1,t2,t3,t4}
154cabdff1aSopenharmony_ci    subps    %3, %3, %4       ; {r5,i5,r7,i7}
155cabdff1aSopenharmony_ci    shufps   %4, %3, %3, 0xb1 ; {i5,r5,i7,r7}
156cabdff1aSopenharmony_ci    mulps    %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
157cabdff1aSopenharmony_ci    mulps    %4, %4, [ps_root2]
158cabdff1aSopenharmony_ci    addps    %3, %3, %4       ; {t8,t7,ta,t9}
159cabdff1aSopenharmony_ci    shufps   %4, %6, %3, 0x9c ; {t1,t4,t7,ta}
160cabdff1aSopenharmony_ci    shufps   %6, %6, %3, 0x36 ; {t3,t2,t9,t8}
161cabdff1aSopenharmony_ci    subps    %3, %6, %4       ; {t6,t5,tc,tb}
162cabdff1aSopenharmony_ci    addps    %6, %6, %4       ; {t1,t2,t9,ta}
163cabdff1aSopenharmony_ci    shufps   %5, %6, %3, 0x8d ; {t2,ta,t6,tc}
164cabdff1aSopenharmony_ci    shufps   %6, %6, %3, 0xd8 ; {t1,t9,t5,tb}
165cabdff1aSopenharmony_ci    subps    %3, %1, %6       ; {r4,r5,r6,r7}
166cabdff1aSopenharmony_ci    addps    %1, %1, %6       ; {r0,r1,r2,r3}
167cabdff1aSopenharmony_ci    subps    %4, %2, %5       ; {i4,i5,i6,i7}
168cabdff1aSopenharmony_ci    addps    %2, %2, %5       ; {i0,i1,i2,i3}
169cabdff1aSopenharmony_ci%endmacro
170cabdff1aSopenharmony_ci
; Interleave the elements of %2 with those of %1 (%2 supplies the even output
; elements, %1 the odd ones) and store the result to memory.
;   %1, %2 = source registers (%2 is overwritten)
;   %3     = scratch register
;   %4     = name of the addressing macro to store through (Z or Z2)
;   %5     = slot index passed to that macro; slots %5 and %5+1 are written
; PASS_BIG calls this with %2 = real parts and %1 = imaginary parts.
; The AVX branch stores 128-bit halves through %4 and the matching "H"
; variant of the macro (ZH / Z2H); the SSE branch stores two full vectors.
171cabdff1aSopenharmony_ci%macro INTERL 5
172cabdff1aSopenharmony_ci%if cpuflag(avx)
173cabdff1aSopenharmony_ci    vunpckhps      %3, %2, %1
174cabdff1aSopenharmony_ci    vunpcklps      %2, %2, %1
175cabdff1aSopenharmony_ci    vextractf128   %4(%5), %2, 0
176cabdff1aSopenharmony_ci    vextractf128  %4 %+ H(%5), %3, 0
177cabdff1aSopenharmony_ci    vextractf128   %4(%5 + 1), %2, 1
178cabdff1aSopenharmony_ci    vextractf128  %4 %+ H(%5 + 1), %3, 1
179cabdff1aSopenharmony_ci%elif cpuflag(sse)
180cabdff1aSopenharmony_ci    mova     %3, %2
181cabdff1aSopenharmony_ci    unpcklps %2, %1
182cabdff1aSopenharmony_ci    unpckhps %3, %1
183cabdff1aSopenharmony_ci    mova  %4(%5), %2
184cabdff1aSopenharmony_ci    mova  %4(%5+1), %3
185cabdff1aSopenharmony_ci%endif
186cabdff1aSopenharmony_ci%endmacro
187cabdff1aSopenharmony_ci
188cabdff1aSopenharmony_ci; scheduled for cpu-bound sizes
; One combining pass over eight vector slots addressed through Z()/Z2().
;   %1 = 1: load m4-m7 from Z(4), Z(5), Z2(6), Z2(7) first
;        0: the caller has already placed those four vectors in m4-m7
;   %2 = memory operand holding wre, %3 = memory operand holding wim
; Reads Z(0)-Z(3); writes Z(0)-Z(5), Z2(6) and Z2(7). Overwrites m0-m7.
; The instruction order is deliberate (see the scheduling note above the
; macro); do not reorder.
189cabdff1aSopenharmony_ci%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
190cabdff1aSopenharmony_ciIF%1 mova    m4, Z(4)
191cabdff1aSopenharmony_ciIF%1 mova    m5, Z(5)
192cabdff1aSopenharmony_ci    mova     m0, %2 ; wre
193cabdff1aSopenharmony_ci    mova     m1, %3 ; wim
194cabdff1aSopenharmony_ci    mulps    m2, m4, m0 ; r2*wre
195cabdff1aSopenharmony_ciIF%1 mova    m6, Z2(6)
196cabdff1aSopenharmony_ci    mulps    m3, m5, m1 ; i2*wim
197cabdff1aSopenharmony_ciIF%1 mova    m7, Z2(7)
198cabdff1aSopenharmony_ci    mulps    m4, m4, m1 ; r2*wim
199cabdff1aSopenharmony_ci    mulps    m5, m5, m0 ; i2*wre
200cabdff1aSopenharmony_ci    addps    m2, m2, m3 ; r2*wre + i2*wim
201cabdff1aSopenharmony_ci    mulps    m3, m1, m7 ; i3*wim
202cabdff1aSopenharmony_ci    subps    m5, m5, m4 ; i2*wre - r2*wim
203cabdff1aSopenharmony_ci    mulps    m1, m1, m6 ; r3*wim
204cabdff1aSopenharmony_ci    mulps    m4, m0, m6 ; r3*wre
205cabdff1aSopenharmony_ci    mulps    m0, m0, m7 ; i3*wre
206cabdff1aSopenharmony_ci    subps    m4, m4, m3 ; r3*wre - i3*wim
207cabdff1aSopenharmony_ci    mova     m3, Z(0)
208cabdff1aSopenharmony_ci    addps    m0, m0, m1 ; i3*wre + r3*wim
209cabdff1aSopenharmony_ci    subps    m1, m4, m2 ; t3
210cabdff1aSopenharmony_ci    addps    m4, m4, m2 ; t5
211cabdff1aSopenharmony_ci    subps    m3, m3, m4 ; r2
212cabdff1aSopenharmony_ci    addps    m4, m4, Z(0) ; r0
213cabdff1aSopenharmony_ci    mova     m6, Z(2)
214cabdff1aSopenharmony_ci    mova   Z(4), m3
215cabdff1aSopenharmony_ci    mova   Z(0), m4
216cabdff1aSopenharmony_ci    subps    m3, m5, m0 ; t4
217cabdff1aSopenharmony_ci    subps    m4, m6, m3 ; r3
218cabdff1aSopenharmony_ci    addps    m3, m3, m6 ; r1
219cabdff1aSopenharmony_ci    mova  Z2(6), m4
220cabdff1aSopenharmony_ci    mova   Z(2), m3
221cabdff1aSopenharmony_ci    mova     m2, Z(3)
222cabdff1aSopenharmony_ci    addps    m3, m5, m0 ; t6
223cabdff1aSopenharmony_ci    subps    m2, m2, m1 ; i3
224cabdff1aSopenharmony_ci    mova     m7, Z(1)
225cabdff1aSopenharmony_ci    addps    m1, m1, Z(3) ; i1
226cabdff1aSopenharmony_ci    mova  Z2(7), m2
227cabdff1aSopenharmony_ci    mova   Z(3), m1
228cabdff1aSopenharmony_ci    subps    m4, m7, m3 ; i2
229cabdff1aSopenharmony_ci    addps    m3, m3, m7 ; i0
230cabdff1aSopenharmony_ci    mova   Z(5), m4
231cabdff1aSopenharmony_ci    mova   Z(1), m3
232cabdff1aSopenharmony_ci%endmacro
233cabdff1aSopenharmony_ci
234cabdff1aSopenharmony_ci; scheduled to avoid store->load aliasing
; Loop payload for DECL_PASS: the same arithmetic as PASS_SMALL, but all
; operands are loaded from memory and the twiddles come from [wq] (wre) and
; [wq+o1q] (wim).
;   %1 = 1: store every result vector in split real/imaginary form
;   %1 = 0: only Z(0) and Z(4) are stored in split form; the remaining results
;           stay in registers and are written out interleaved through INTERL
;           (Z(0) and Z(4) are reloaded for that purpose)
; Overwrites m0-m7. The instruction order is deliberate (see the scheduling
; note above the macro); do not reorder.
235cabdff1aSopenharmony_ci%macro PASS_BIG 1 ; (!interleave)
236cabdff1aSopenharmony_ci    mova     m4, Z(4) ; r2
237cabdff1aSopenharmony_ci    mova     m5, Z(5) ; i2
238cabdff1aSopenharmony_ci    mova     m0, [wq] ; wre
239cabdff1aSopenharmony_ci    mova     m1, [wq+o1q] ; wim
240cabdff1aSopenharmony_ci    mulps    m2, m4, m0 ; r2*wre
241cabdff1aSopenharmony_ci    mova     m6, Z2(6) ; r3
242cabdff1aSopenharmony_ci    mulps    m3, m5, m1 ; i2*wim
243cabdff1aSopenharmony_ci    mova     m7, Z2(7) ; i3
244cabdff1aSopenharmony_ci    mulps    m4, m4, m1 ; r2*wim
245cabdff1aSopenharmony_ci    mulps    m5, m5, m0 ; i2*wre
246cabdff1aSopenharmony_ci    addps    m2, m2, m3 ; r2*wre + i2*wim
247cabdff1aSopenharmony_ci    mulps    m3, m1, m7 ; i3*wim
248cabdff1aSopenharmony_ci    mulps    m1, m1, m6 ; r3*wim
249cabdff1aSopenharmony_ci    subps    m5, m5, m4 ; i2*wre - r2*wim
250cabdff1aSopenharmony_ci    mulps    m4, m0, m6 ; r3*wre
251cabdff1aSopenharmony_ci    mulps    m0, m0, m7 ; i3*wre
252cabdff1aSopenharmony_ci    subps    m4, m4, m3 ; r3*wre - i3*wim
253cabdff1aSopenharmony_ci    mova     m3, Z(0)
254cabdff1aSopenharmony_ci    addps    m0, m0, m1 ; i3*wre + r3*wim
255cabdff1aSopenharmony_ci    subps    m1, m4, m2 ; t3
256cabdff1aSopenharmony_ci    addps    m4, m4, m2 ; t5
257cabdff1aSopenharmony_ci    subps    m3, m3, m4 ; r2
258cabdff1aSopenharmony_ci    addps    m4, m4, Z(0) ; r0
259cabdff1aSopenharmony_ci    mova     m6, Z(2)
260cabdff1aSopenharmony_ci    mova   Z(4), m3
261cabdff1aSopenharmony_ci    mova   Z(0), m4
262cabdff1aSopenharmony_ci    subps    m3, m5, m0 ; t4
263cabdff1aSopenharmony_ci    subps    m4, m6, m3 ; r3
264cabdff1aSopenharmony_ci    addps    m3, m3, m6 ; r1
265cabdff1aSopenharmony_ciIF%1 mova Z2(6), m4
266cabdff1aSopenharmony_ciIF%1 mova  Z(2), m3
267cabdff1aSopenharmony_ci    mova     m2, Z(3)
268cabdff1aSopenharmony_ci    addps    m5, m5, m0 ; t6
269cabdff1aSopenharmony_ci    subps    m2, m2, m1 ; i3
270cabdff1aSopenharmony_ci    mova     m7, Z(1)
271cabdff1aSopenharmony_ci    addps    m1, m1, Z(3) ; i1
272cabdff1aSopenharmony_ciIF%1 mova Z2(7), m2
273cabdff1aSopenharmony_ciIF%1 mova  Z(3), m1
274cabdff1aSopenharmony_ci    subps    m6, m7, m5 ; i2
275cabdff1aSopenharmony_ci    addps    m5, m5, m7 ; i0
276cabdff1aSopenharmony_ciIF%1 mova  Z(5), m6
277cabdff1aSopenharmony_ciIF%1 mova  Z(1), m5
278cabdff1aSopenharmony_ci%if %1==0
    ; interleaved output: (r1,i1) -> slots 2/3, (r3,i3) -> slots 6/7
279cabdff1aSopenharmony_ci    INTERL m1, m3, m7, Z, 2
280cabdff1aSopenharmony_ci    INTERL m2, m4, m0, Z2, 6
281cabdff1aSopenharmony_ci
    ; r0 and r2 were stored to memory above; fetch them back
282cabdff1aSopenharmony_ci    mova     m1, Z(0)
283cabdff1aSopenharmony_ci    mova     m2, Z(4)
284cabdff1aSopenharmony_ci
    ; (r0,i0) -> slots 0/1, (r2,i2) -> slots 4/5
285cabdff1aSopenharmony_ci    INTERL m5, m1, m3, Z, 0
286cabdff1aSopenharmony_ci    INTERL m6, m2, m7, Z, 4
287cabdff1aSopenharmony_ci%endif
288cabdff1aSopenharmony_ci%endmacro
289cabdff1aSopenharmony_ci
; Flat addressing used by the fixed-size transforms below: slot x is the
; x-th mmsize-wide vector of the buffer at r0. Z2 is identical to Z in this
; mode, and ZH addresses the upper half of slot x.
; The parameter is parenthesised so that an expression argument (e.g. Z(i+1))
; scales as a whole, matching the strided definitions later in the file.
290cabdff1aSopenharmony_ci%define Z(x) [r0+mmsize*(x)]
291cabdff1aSopenharmony_ci%define Z2(x) [r0+mmsize*(x)]
292cabdff1aSopenharmony_ci%define ZH(x) [r0+mmsize*(x)+mmsize/2]
293cabdff1aSopenharmony_ci
294cabdff1aSopenharmony_ciINIT_YMM avx
295cabdff1aSopenharmony_ci
296cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL
297cabdff1aSopenharmony_cialign 16
; Internal helper (plain label, reached through call/dispatch table).
; In:  r0 = data buffer; slots Z(0) and Z(1) are transformed in place.
; Overwrites m0-m4. Input/output element layout: see T8_AVX.
298cabdff1aSopenharmony_cifft8_avx:
299cabdff1aSopenharmony_ci    mova      m0, Z(0)
300cabdff1aSopenharmony_ci    mova      m1, Z(1)
301cabdff1aSopenharmony_ci    T8_AVX    m0, m1, m2, m3, m4
302cabdff1aSopenharmony_ci    mova      Z(0), m0
303cabdff1aSopenharmony_ci    mova      Z(1), m1
304cabdff1aSopenharmony_ci    ret
305cabdff1aSopenharmony_ci
306cabdff1aSopenharmony_ci
307cabdff1aSopenharmony_cialign 16
; Internal helper (plain label, reached through call/dispatch table).
; In:  r0 = data buffer; slots Z(0)-Z(3) are transformed in place.
; Overwrites m0-m7.
; Steps: T4_SSE on slots 2/3, T8_AVX on slots 0/1, multiply the T4 results by
; the ps_cos16_1/ps_cos16_2 tables, combine with the T8 results and write the
; outcome back as 128-bit halves.
308cabdff1aSopenharmony_cifft16_avx:
309cabdff1aSopenharmony_ci    mova       m2, Z(2)
310cabdff1aSopenharmony_ci    mova       m3, Z(3)
311cabdff1aSopenharmony_ci    T4_SSE     m2, m3, m7
312cabdff1aSopenharmony_ci
313cabdff1aSopenharmony_ci    mova       m0, Z(0)
314cabdff1aSopenharmony_ci    mova       m1, Z(1)
315cabdff1aSopenharmony_ci    T8_AVX     m0, m1, m4, m5, m7
316cabdff1aSopenharmony_ci
317cabdff1aSopenharmony_ci    mova       m4, [ps_cos16_1]
318cabdff1aSopenharmony_ci    mova       m5, [ps_cos16_2]
319cabdff1aSopenharmony_ci    vmulps     m6, m2, m4
320cabdff1aSopenharmony_ci    vmulps     m7, m3, m5
321cabdff1aSopenharmony_ci    vaddps     m7, m7, m6       ; m7 = m2*cos16_1 + m3*cos16_2
322cabdff1aSopenharmony_ci    vmulps     m2, m2, m5
323cabdff1aSopenharmony_ci    vmulps     m3, m3, m4
324cabdff1aSopenharmony_ci    vsubps     m3, m3, m2       ; m3 = m3*cos16_1 - m2*cos16_2
325cabdff1aSopenharmony_ci    vblendps   m2, m7, m3, 0xf0 ; low lane from m7, high lane from m3
326cabdff1aSopenharmony_ci    vperm2f128 m3, m7, m3, 0x21 ; low lane = high(m7), high lane = low(m3)
327cabdff1aSopenharmony_ci    vaddps     m4, m2, m3
328cabdff1aSopenharmony_ci    vsubps     m2, m3, m2
329cabdff1aSopenharmony_ci    vperm2f128 m2, m2, m2, 0x01 ; swap the two 128-bit lanes
330cabdff1aSopenharmony_ci    vsubps     m3, m1, m2
331cabdff1aSopenharmony_ci    vaddps     m1, m1, m2
332cabdff1aSopenharmony_ci    vsubps     m5, m0, m4
333cabdff1aSopenharmony_ci    vaddps     m0, m0, m4
334cabdff1aSopenharmony_ci    vextractf128   Z(0), m0, 0
335cabdff1aSopenharmony_ci    vextractf128  ZH(0), m1, 0
336cabdff1aSopenharmony_ci    vextractf128   Z(1), m0, 1
337cabdff1aSopenharmony_ci    vextractf128  ZH(1), m1, 1
338cabdff1aSopenharmony_ci    vextractf128   Z(2), m5, 0
339cabdff1aSopenharmony_ci    vextractf128  ZH(2), m3, 0
340cabdff1aSopenharmony_ci    vextractf128   Z(3), m5, 1
341cabdff1aSopenharmony_ci    vextractf128  ZH(3), m3, 1
342cabdff1aSopenharmony_ci    ret
343cabdff1aSopenharmony_ci
344cabdff1aSopenharmony_cialign 16
; Internal helper (plain label, reached through call/dispatch table).
; In:  r0 = data buffer; slots Z(0)-Z(7) are transformed in place.
; Overwrites m0-m7.
; Steps: fft16_avx on slots 0-3, T4_SSE + T8_SSE on slots 4-7 (kept in
; registers), regroup the 128-bit lanes into m4-m7, then a PASS_SMALL that
; takes m4-m7 from registers (first argument 0) with twiddles from cos_32.
345cabdff1aSopenharmony_cifft32_avx:
346cabdff1aSopenharmony_ci    call fft16_avx
347cabdff1aSopenharmony_ci
348cabdff1aSopenharmony_ci    mova m0, Z(4)
349cabdff1aSopenharmony_ci    mova m1, Z(5)
350cabdff1aSopenharmony_ci
351cabdff1aSopenharmony_ci    T4_SSE      m0, m1, m4
352cabdff1aSopenharmony_ci
353cabdff1aSopenharmony_ci    mova m2, Z(6)
354cabdff1aSopenharmony_ci    mova m3, Z(7)
355cabdff1aSopenharmony_ci
356cabdff1aSopenharmony_ci    T8_SSE      m0, m1, m2, m3, m4, m6
357cabdff1aSopenharmony_ci    ; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11}
358cabdff1aSopenharmony_ci    ; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15}
359cabdff1aSopenharmony_ci
    ; 0x20 gathers the two low lanes, 0x31 the two high lanes
360cabdff1aSopenharmony_ci    vperm2f128  m4, m0, m2, 0x20
361cabdff1aSopenharmony_ci    vperm2f128  m5, m1, m3, 0x20
362cabdff1aSopenharmony_ci    vperm2f128  m6, m0, m2, 0x31
363cabdff1aSopenharmony_ci    vperm2f128  m7, m1, m3, 0x31
364cabdff1aSopenharmony_ci
    ; wre = [cos_32], wim = the vector 32 bytes after it
365cabdff1aSopenharmony_ci    PASS_SMALL 0, [cos_32], [cos_32+32]
366cabdff1aSopenharmony_ci
367cabdff1aSopenharmony_ci    ret
368cabdff1aSopenharmony_ci
; Runs fft32_avx, then rewrites the buffer in interleaved form: each pair of
; slots Z(0)/Z(1) is merged element by element (unpcklps/unpckhps) and
; stored back as four 128-bit halves.
; In:  r0 = data buffer. Overwrites m0-m7 and r2.
; Note: r0 is advanced by mmsize*2 per iteration and is not restored before
; returning. Assuming mmsize is 32 under INIT_YMM, the counter goes
; 32 -> 0 in steps of 8, i.e. four iterations covering eight slots.
369cabdff1aSopenharmony_cifft32_interleave_avx:
370cabdff1aSopenharmony_ci    call fft32_avx
371cabdff1aSopenharmony_ci    mov r2d, 32
372cabdff1aSopenharmony_ci.deint_loop:
373cabdff1aSopenharmony_ci    mova     m2, Z(0)
374cabdff1aSopenharmony_ci    mova     m3, Z(1)
375cabdff1aSopenharmony_ci    vunpcklps      m0, m2, m3
376cabdff1aSopenharmony_ci    vunpckhps      m1, m2, m3
377cabdff1aSopenharmony_ci    vextractf128   Z(0), m0, 0
378cabdff1aSopenharmony_ci    vextractf128  ZH(0), m1, 0
379cabdff1aSopenharmony_ci    vextractf128   Z(1), m0, 1
380cabdff1aSopenharmony_ci    vextractf128  ZH(1), m1, 1
381cabdff1aSopenharmony_ci    add r0, mmsize*2
382cabdff1aSopenharmony_ci    sub r2d, mmsize/4
383cabdff1aSopenharmony_ci    jg .deint_loop
384cabdff1aSopenharmony_ci    ret
385cabdff1aSopenharmony_ci
386cabdff1aSopenharmony_ci%endif
387cabdff1aSopenharmony_ci
388cabdff1aSopenharmony_ciINIT_XMM sse
389cabdff1aSopenharmony_ci
390cabdff1aSopenharmony_cialign 16
; Internal helper (plain labels, reached through call/dispatch table).
; In:  r0 = data buffer; slots Z(0) and Z(1) are transformed in place with
;      T4_SSE. Overwrites m0-m2.
; fft4_avx is a second label on the same SSE code: DECL_FFT builds every
; dispatch table starting with "fft4 %+ SUFFIX", so the name must exist for
; the AVX tables as well.
391cabdff1aSopenharmony_cifft4_avx:
392cabdff1aSopenharmony_cifft4_sse:
393cabdff1aSopenharmony_ci    mova     m0, Z(0)
394cabdff1aSopenharmony_ci    mova     m1, Z(1)
395cabdff1aSopenharmony_ci    T4_SSE   m0, m1, m2
396cabdff1aSopenharmony_ci    mova   Z(0), m0
397cabdff1aSopenharmony_ci    mova   Z(1), m1
398cabdff1aSopenharmony_ci    ret
399cabdff1aSopenharmony_ci
400cabdff1aSopenharmony_cialign 16
; Internal helper (plain label, reached through call/dispatch table).
; In:  r0 = data buffer; slots Z(0)-Z(3) are transformed in place:
;      T4_SSE on slots 0/1, then T8_SSE combining them with slots 2/3.
; Overwrites m0-m5.
401cabdff1aSopenharmony_cifft8_sse:
402cabdff1aSopenharmony_ci    mova     m0, Z(0)
403cabdff1aSopenharmony_ci    mova     m1, Z(1)
404cabdff1aSopenharmony_ci    T4_SSE   m0, m1, m2
405cabdff1aSopenharmony_ci    mova     m2, Z(2)
406cabdff1aSopenharmony_ci    mova     m3, Z(3)
407cabdff1aSopenharmony_ci    T8_SSE   m0, m1, m2, m3, m4, m5
408cabdff1aSopenharmony_ci    mova   Z(0), m0
409cabdff1aSopenharmony_ci    mova   Z(1), m1
410cabdff1aSopenharmony_ci    mova   Z(2), m2
411cabdff1aSopenharmony_ci    mova   Z(3), m3
412cabdff1aSopenharmony_ci    ret
413cabdff1aSopenharmony_ci
414cabdff1aSopenharmony_cialign 16
; Internal helper (plain label, reached through call/dispatch table).
; In:  r0 = data buffer; slots Z(0)-Z(7) are transformed in place.
; Overwrites m0-m7.
; Steps: the fft8 sequence on slots 0-3 (stored back), a T4_SSE on slots 4/5
; and another on slots 6/7 (both left in m4-m7), then a PASS_SMALL that takes
; m4-m7 from registers (first argument 0) with twiddles from cos_16.
415cabdff1aSopenharmony_cifft16_sse:
416cabdff1aSopenharmony_ci    mova     m0, Z(0)
417cabdff1aSopenharmony_ci    mova     m1, Z(1)
418cabdff1aSopenharmony_ci    T4_SSE   m0, m1, m2
419cabdff1aSopenharmony_ci    mova     m2, Z(2)
420cabdff1aSopenharmony_ci    mova     m3, Z(3)
421cabdff1aSopenharmony_ci    T8_SSE   m0, m1, m2, m3, m4, m5
    ; the loads for the next T4 are issued before the stores of this result
422cabdff1aSopenharmony_ci    mova     m4, Z(4)
423cabdff1aSopenharmony_ci    mova     m5, Z(5)
424cabdff1aSopenharmony_ci    mova   Z(0), m0
425cabdff1aSopenharmony_ci    mova   Z(1), m1
426cabdff1aSopenharmony_ci    mova   Z(2), m2
427cabdff1aSopenharmony_ci    mova   Z(3), m3
428cabdff1aSopenharmony_ci    T4_SSE   m4, m5, m6
429cabdff1aSopenharmony_ci    mova     m6, Z2(6)
430cabdff1aSopenharmony_ci    mova     m7, Z2(7)
    ; m0 is free again (already stored), so it serves as the scratch register
431cabdff1aSopenharmony_ci    T4_SSE   m6, m7, m0
    ; wre = [cos_16], wim = the vector 16 bytes after it
432cabdff1aSopenharmony_ci    PASS_SMALL 0, [cos_16], [cos_16+16]
433cabdff1aSopenharmony_ci    ret
434cabdff1aSopenharmony_ci
435cabdff1aSopenharmony_ci
; Strided addressing used from here on (by the DECL_PASS bodies):
;   Z(x)  = zcq + o1q*(x&6) + mmsize*(x&1)   for x = 0..5
;   Z2(x) = zcq + o3q       + mmsize*(x&1)   for x = 6, 7
; DECL_PASS sets o1q = n*8 and o3q = n*48 = 6*o1q, so Z2 continues the same
; progression; a separate register is needed because 6 is not a valid x86
; index scale factor. The "H" variants address the upper half of the slot.
436cabdff1aSopenharmony_ci%define Z(x) [zcq + o1q*(x&6) + mmsize*(x&1)]
437cabdff1aSopenharmony_ci%define Z2(x) [zcq + o3q + mmsize*(x&1)]
438cabdff1aSopenharmony_ci%define ZH(x) [zcq + o1q*(x&6) + mmsize*(x&1) + mmsize/2]
439cabdff1aSopenharmony_ci%define Z2H(x) [zcq + o3q + mmsize*(x&1) + mmsize/2]
440cabdff1aSopenharmony_ci
; Emit a pass routine.
;   %1 = label of the routine, %2 = loop payload (a PASS_BIG invocation)
; Register interface of the generated routine (named via DEFINE_ARGS):
;   zc = data pointer, w = twiddle pointer, n = loop counter;
;   o1 and o3 are scratch, set to n*8 and n*48 for the Z()/Z2() macros.
; Each iteration advances zc by two vectors and w by one vector, and
; decrements n by mmsize/8; the loop ends when n is no longer positive.
; zc, w and n are left modified on return.
441cabdff1aSopenharmony_ci%macro DECL_PASS 2+ ; name, payload
442cabdff1aSopenharmony_cialign 16
443cabdff1aSopenharmony_ci%1:
444cabdff1aSopenharmony_ciDEFINE_ARGS zc, w, n, o1, o3
445cabdff1aSopenharmony_ci    lea o3q, [nq*3]
446cabdff1aSopenharmony_ci    lea o1q, [nq*8]
447cabdff1aSopenharmony_ci    shl o3q, 4          ; o3 = n*3*16 = n*48
448cabdff1aSopenharmony_ci.loop:
449cabdff1aSopenharmony_ci    %2
450cabdff1aSopenharmony_ci    add zcq, mmsize*2
451cabdff1aSopenharmony_ci    add  wq, mmsize
452cabdff1aSopenharmony_ci    sub  nd, mmsize/8
453cabdff1aSopenharmony_ci    jg .loop
454cabdff1aSopenharmony_ci    rep ret
455cabdff1aSopenharmony_ci%endmacro
456cabdff1aSopenharmony_ci
; Call the transform for a given size through a dispatch table.
;   %1 = suffix selecting the table (dispatch_tab%1)
;   %2 = register name (without size suffix) holding nbits
; Table entry 0 corresponds to nbits == 2, hence the "-2".
; Under PIC the table holds section-relative values (see SECTION_REL in
; DECL_FFT's neighbourhood), so the section base $$ is added back first.
; Overwrites r2 (and r3 under PIC) in addition to whatever the callee uses.
457cabdff1aSopenharmony_ci%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
458cabdff1aSopenharmony_ci    lea r2, [dispatch_tab%1]
459cabdff1aSopenharmony_ci    mov r2, [r2 + (%2q-2)*gprsize]
460cabdff1aSopenharmony_ci%ifdef PIC
461cabdff1aSopenharmony_ci    lea r3, [$$]
462cabdff1aSopenharmony_ci    add r2, r3
463cabdff1aSopenharmony_ci%endif
464cabdff1aSopenharmony_ci    call r2
465cabdff1aSopenharmony_ci%endmacro ; FFT_DISPATCH
466cabdff1aSopenharmony_ci
467cabdff1aSopenharmony_ciINIT_YMM avx
468cabdff1aSopenharmony_ci
469cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL
; AVX pass routines: pass_avx stores split real/imaginary results
; (PASS_BIG 1), pass_interleave_avx stores interleaved results (PASS_BIG 0).
470cabdff1aSopenharmony_ciDECL_PASS pass_avx, PASS_BIG 1
471cabdff1aSopenharmony_ciDECL_PASS pass_interleave_avx, PASS_BIG 0
472cabdff1aSopenharmony_ci
; AVX entry point for the in-place FFT.
; In:  r0 = FFTContext pointer, r1 = data buffer.
; Reads nbits from the context, moves the buffer pointer into r0 and nbits
; into r1 (the register layout the internal routines use), then calls the
; size-specific routine from the interleaving dispatch table.
; NOTE(review): "2,5,8" is presumably the x86inc cglobal convention for
; argument / GPR / vector-register counts - confirm in x86inc.asm.
473cabdff1aSopenharmony_cicglobal fft_calc, 2,5,8
474cabdff1aSopenharmony_ci    mov     r3d, [r0 + FFTContext.nbits]
475cabdff1aSopenharmony_ci    mov     r0, r1
476cabdff1aSopenharmony_ci    mov     r1, r3
477cabdff1aSopenharmony_ci    FFT_DISPATCH _interleave %+ SUFFIX, r1
478cabdff1aSopenharmony_ci    REP_RET
479cabdff1aSopenharmony_ci
480cabdff1aSopenharmony_ci%endif
481cabdff1aSopenharmony_ci
482cabdff1aSopenharmony_ciINIT_XMM sse
483cabdff1aSopenharmony_ci
; SSE pass routines: pass_sse stores split real/imaginary results
; (PASS_BIG 1), pass_interleave_sse stores interleaved results (PASS_BIG 0).
484cabdff1aSopenharmony_ciDECL_PASS pass_sse, PASS_BIG 1
485cabdff1aSopenharmony_ciDECL_PASS pass_interleave_sse, PASS_BIG 0
486cabdff1aSopenharmony_ci
487cabdff1aSopenharmony_ciINIT_XMM sse
; SSE entry point for the in-place FFT.
; In:  r0 = FFTContext pointer, r1 = data buffer.
; Dispatches like the AVX version, but keeps the buffer pointer and nbits on
; the stack across the call. The smallest table entries (fft4/fft8/fft16) are
; the plain, non-interleaving routines even in the "_interleave" table, so
; for nbits <= 3+(mmsize/16) the result is interleaved here afterwards.
488cabdff1aSopenharmony_cicglobal fft_calc, 2,5,8
489cabdff1aSopenharmony_ci    mov     r3d, [r0 + FFTContext.nbits]
490cabdff1aSopenharmony_ci    PUSH    r1
491cabdff1aSopenharmony_ci    PUSH    r3
492cabdff1aSopenharmony_ci    mov     r0, r1
493cabdff1aSopenharmony_ci    mov     r1, r3
494cabdff1aSopenharmony_ci    FFT_DISPATCH _interleave %+ SUFFIX, r1
495cabdff1aSopenharmony_ci    POP     rcx             ; rcx = nbits (pushed last)
496cabdff1aSopenharmony_ci    POP     r4              ; r4  = data buffer
497cabdff1aSopenharmony_ci    cmp     rcx, 3+(mmsize/16)
498cabdff1aSopenharmony_ci    jg      .end            ; larger sizes were already interleaved by the pass
499cabdff1aSopenharmony_ci    mov     r2, -1
500cabdff1aSopenharmony_ci    add     rcx, 3
501cabdff1aSopenharmony_ci    shl     r2, cl          ; r2 = -(8 << nbits): negative buffer size in bytes
502cabdff1aSopenharmony_ci    sub     r4, r2          ; r4 = end of buffer; r2 counts up towards 0
503cabdff1aSopenharmony_ci.loop:
    ; merge two consecutive 16-byte vectors element by element
504cabdff1aSopenharmony_ci    movaps   xmm0, [r4 + r2]
505cabdff1aSopenharmony_ci    movaps   xmm1, xmm0
506cabdff1aSopenharmony_ci    unpcklps xmm0, [r4 + r2 + 16]
507cabdff1aSopenharmony_ci    unpckhps xmm1, [r4 + r2 + 16]
508cabdff1aSopenharmony_ci    movaps   [r4 + r2],      xmm0
509cabdff1aSopenharmony_ci    movaps   [r4 + r2 + 16], xmm1
510cabdff1aSopenharmony_ci    add      r2, mmsize*2
511cabdff1aSopenharmony_ci    jl       .loop
512cabdff1aSopenharmony_ci.end:
513cabdff1aSopenharmony_ci    REP_RET
514cabdff1aSopenharmony_ci
; Reorder the data buffer according to the context's revtab.
; In:  r0 = FFTContext pointer, r1 = data buffer.
; Phase 1 scatters every 8-byte element i of the buffer to
; tmpbuf[revtab[i]], two elements per iteration (revtab entries are read as
; 16-bit values). Phase 2 copies tmpbuf back over the buffer, 32 bytes per
; iteration. Uses xmm0/xmm1 and r0-r6.
515cabdff1aSopenharmony_cicglobal fft_permute, 2,7,1
516cabdff1aSopenharmony_ci    mov     r4,  [r0 + FFTContext.revtab]
517cabdff1aSopenharmony_ci    mov     r5,  [r0 + FFTContext.tmpbuf]
518cabdff1aSopenharmony_ci    mov     ecx, [r0 + FFTContext.nbits]
519cabdff1aSopenharmony_ci    mov     r2, 1
520cabdff1aSopenharmony_ci    shl     r2, cl          ; r2 = 1 << nbits = number of elements
521cabdff1aSopenharmony_ci    xor     r0, r0          ; r0 = element index
522cabdff1aSopenharmony_ci%if ARCH_X86_32
523cabdff1aSopenharmony_ci    mov     r1, r1m
524cabdff1aSopenharmony_ci%endif
525cabdff1aSopenharmony_ci.loop:
526cabdff1aSopenharmony_ci    movaps  xmm0, [r1 + 8*r0]       ; elements i and i+1
527cabdff1aSopenharmony_ci    movzx   r6, word [r4 + 2*r0]    ; revtab[i]
528cabdff1aSopenharmony_ci    movzx   r3, word [r4 + 2*r0 + 2] ; revtab[i+1]
529cabdff1aSopenharmony_ci    movlps  [r5 + 8*r6], xmm0
530cabdff1aSopenharmony_ci    movhps  [r5 + 8*r3], xmm0
531cabdff1aSopenharmony_ci    add     r0, 2
532cabdff1aSopenharmony_ci    cmp     r0, r2
533cabdff1aSopenharmony_ci    jl      .loop
534cabdff1aSopenharmony_ci    shl     r2, 3           ; element count -> byte count
535cabdff1aSopenharmony_ci    add     r1, r2          ; both pointers now address the end of their
536cabdff1aSopenharmony_ci    add     r5, r2          ; buffer; r2 becomes a negative offset
537cabdff1aSopenharmony_ci    neg     r2
538cabdff1aSopenharmony_ci; nbits >= 2 (FFT4) and sizeof(FFTComplex)=8 => at least 32B
539cabdff1aSopenharmony_ci.loopcopy:
540cabdff1aSopenharmony_ci    movaps  xmm0, [r5 + r2]
541cabdff1aSopenharmony_ci    movaps  xmm1, [r5 + r2 + 16]
542cabdff1aSopenharmony_ci    movaps  [r1 + r2], xmm0
543cabdff1aSopenharmony_ci    movaps  [r1 + r2 + 16], xmm1
544cabdff1aSopenharmony_ci    add     r2, 32
545cabdff1aSopenharmony_ci    jl      .loopcopy
546cabdff1aSopenharmony_ci    REP_RET
547cabdff1aSopenharmony_ci
548cabdff1aSopenharmony_ciINIT_XMM sse
; In:  r0 = FFTContext pointer, r1 = output buffer, r2 = input buffer.
; Calls the context's imdcthalf function pointer with the same context and
; input but with the output pointer advanced by mdctsize bytes, then fills
; the rest of the output by mirroring that result: each vector is reversed
; element-wise (shufps 0x1b) and copied to the symmetric position; the copy
; stored below the half-transform output is additionally XORed with ps_neg.
; NOTE(review): ps_neg is external; presumably an all-lanes sign-bit mask,
; which would make the XOR a negation - confirm at its definition.
549cabdff1aSopenharmony_cicglobal imdct_calc, 3,5,3
550cabdff1aSopenharmony_ci    mov     r3d, [r0 + FFTContext.mdctsize]
551cabdff1aSopenharmony_ci    mov     r4,  [r0 + FFTContext.imdcthalf]
552cabdff1aSopenharmony_ci    add     r1,  r3
553cabdff1aSopenharmony_ci    PUSH    r3              ; keep mdctsize and the advanced output pointer
554cabdff1aSopenharmony_ci    PUSH    r1              ; across the call
555cabdff1aSopenharmony_ci%if ARCH_X86_32
    ; stack-passed arguments: (context, output, input)
556cabdff1aSopenharmony_ci    push    r2
557cabdff1aSopenharmony_ci    push    r1
558cabdff1aSopenharmony_ci    push    r0
559cabdff1aSopenharmony_ci%else
    ; arguments are already in r0-r2; the extra 8 bytes follow the two pushes
560cabdff1aSopenharmony_ci    sub     rsp, 8+32*WIN64 ; allocate win64 shadow space
561cabdff1aSopenharmony_ci%endif
562cabdff1aSopenharmony_ci    call    r4
563cabdff1aSopenharmony_ci%if ARCH_X86_32
564cabdff1aSopenharmony_ci    add     esp, 12
565cabdff1aSopenharmony_ci%else
566cabdff1aSopenharmony_ci    add     rsp, 8+32*WIN64
567cabdff1aSopenharmony_ci%endif
568cabdff1aSopenharmony_ci    POP     r1
569cabdff1aSopenharmony_ci    POP     r3
570cabdff1aSopenharmony_ci    lea     r0, [r1 + 2*r3]
571cabdff1aSopenharmony_ci    mov     r2, r3
572cabdff1aSopenharmony_ci    sub     r3, mmsize      ; r3 walks downwards from mdctsize - mmsize
573cabdff1aSopenharmony_ci    neg     r2              ; r2 walks upwards from -mdctsize to 0
574cabdff1aSopenharmony_ci    mova    m2, [ps_neg]
575cabdff1aSopenharmony_ci.loop:
576cabdff1aSopenharmony_ci    mova    m0, [r1 + r3]
577cabdff1aSopenharmony_ci    mova    m1, [r0 + r2]
578cabdff1aSopenharmony_ci    shufps  m0, m0, 0x1b    ; reverse the four elements
579cabdff1aSopenharmony_ci    shufps  m1, m1, 0x1b
580cabdff1aSopenharmony_ci    xorps   m0, m2
581cabdff1aSopenharmony_ci    mova [r0 + r3], m1
582cabdff1aSopenharmony_ci    mova [r1 + r2], m0
583cabdff1aSopenharmony_ci    sub     r3, mmsize
584cabdff1aSopenharmony_ci    add     r2, mmsize
585cabdff1aSopenharmony_ci    jl      .loop
586cabdff1aSopenharmony_ci    REP_RET
587cabdff1aSopenharmony_ci
; Suffix appended to every dispatch-table entry in DECL_FFT. Under PIC the
; entries become "label - $$" (offset from the section start); FFT_DISPATCH
; adds the section base back at run time. Otherwise the suffix is empty.
588cabdff1aSopenharmony_ci%ifdef PIC
589cabdff1aSopenharmony_ci%define SECTION_REL - $$
590cabdff1aSopenharmony_ci%else
591cabdff1aSopenharmony_ci%define SECTION_REL
592cabdff1aSopenharmony_ci%endif
593cabdff1aSopenharmony_ci
; Generate the size-recursive transform routines and their dispatch table.
;   %1 = log2 of the first size to generate (smaller sizes must already exist
;        as hand-written fftN routines)
;   %2 = optional extra suffix (e.g. _interleave) selecting the pass variant
;        and naming the generated routines / table
; Each generated fftN calls fft(N/2) on the start of the buffer and fft(N/4)
; twice further on, rewinds r0, and tail-jumps to the pass routine with
; r1 = cos_N and r2d = N/8.
; The "& (-2<<%1)" terms in the r0 adjustments are zero when the sub-transform
; is one of the hand-written routines and non-zero when it is a generated
; one. NOTE(review): the latter appear to compensate for generated routines
; returning with r0 already advanced by their pass loop - confirm.
; The table starts at fft4, matching the "nbits-2" index in FFT_DISPATCH.
594cabdff1aSopenharmony_ci%macro DECL_FFT 1-2 ; nbits, suffix
595cabdff1aSopenharmony_ci%ifidn %0, 1
596cabdff1aSopenharmony_ci%xdefine fullsuffix SUFFIX
597cabdff1aSopenharmony_ci%else
598cabdff1aSopenharmony_ci%xdefine fullsuffix %2 %+ SUFFIX
599cabdff1aSopenharmony_ci%endif
600cabdff1aSopenharmony_ci%xdefine list_of_fft fft4 %+ SUFFIX SECTION_REL, fft8 %+ SUFFIX SECTION_REL
601cabdff1aSopenharmony_ci%if %1>=5
602cabdff1aSopenharmony_ci%xdefine list_of_fft list_of_fft, fft16 %+ SUFFIX SECTION_REL
603cabdff1aSopenharmony_ci%endif
604cabdff1aSopenharmony_ci%if %1>=6
605cabdff1aSopenharmony_ci%xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL
606cabdff1aSopenharmony_ci%endif
607cabdff1aSopenharmony_ci
608cabdff1aSopenharmony_ci%assign n 1<<%1
609cabdff1aSopenharmony_ci%rep 18-%1
610cabdff1aSopenharmony_ci%assign n2 n/2
611cabdff1aSopenharmony_ci%assign n4 n/4
612cabdff1aSopenharmony_ci%xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL
613cabdff1aSopenharmony_ci
614cabdff1aSopenharmony_cialign 16
615cabdff1aSopenharmony_cifft %+ n %+ fullsuffix:
616cabdff1aSopenharmony_ci    call fft %+ n2 %+ SUFFIX
617cabdff1aSopenharmony_ci    add r0, n*4 - (n&(-2<<%1))
618cabdff1aSopenharmony_ci    call fft %+ n4 %+ SUFFIX
619cabdff1aSopenharmony_ci    add r0, n*2 - (n2&(-2<<%1))
620cabdff1aSopenharmony_ci    call fft %+ n4 %+ SUFFIX
621cabdff1aSopenharmony_ci    sub r0, n*6 + (n2&(-2<<%1))
622cabdff1aSopenharmony_ci    lea r1, [cos_ %+ n]
623cabdff1aSopenharmony_ci    mov r2d, n4/2
624cabdff1aSopenharmony_ci    jmp pass %+ fullsuffix
625cabdff1aSopenharmony_ci
626cabdff1aSopenharmony_ci%assign n n*2
627cabdff1aSopenharmony_ci%endrep
628cabdff1aSopenharmony_ci%undef n
629cabdff1aSopenharmony_ci
630cabdff1aSopenharmony_cialign 8
631cabdff1aSopenharmony_cidispatch_tab %+ fullsuffix: pointer list_of_fft
632cabdff1aSopenharmony_ci%endmacro ; DECL_FFT
633cabdff1aSopenharmony_ci
; Instantiate the FFT entry points and dispatch tables, each in a plain and
; an "_interleave" flavour: an AVX set starting at size 2^6 (only when
; HAVE_AVX_EXTERNAL is set) and an SSE set starting at size 2^5.
; NOTE(review): INIT_YMM / INIT_XMM are defined outside this file section;
; presumably they select the vector width and the SUFFIX used by DECL_FFT.
634cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL
635cabdff1aSopenharmony_ciINIT_YMM avx
636cabdff1aSopenharmony_ciDECL_FFT 6
637cabdff1aSopenharmony_ciDECL_FFT 6, _interleave
638cabdff1aSopenharmony_ci%endif
639cabdff1aSopenharmony_ciINIT_XMM sse
640cabdff1aSopenharmony_ciDECL_FFT 5
641cabdff1aSopenharmony_ciDECL_FFT 5, _interleave
642cabdff1aSopenharmony_ci
; Settings for the IMDCT helper macros that follow. The %undef lines remove
; any single-line (%define-style) macros with these mnemonic names.
; NOTE(review): whether such definitions exist at this point cannot be told
; from this part of the file.
643cabdff1aSopenharmony_ciINIT_XMM sse
644cabdff1aSopenharmony_ci%undef mulps
645cabdff1aSopenharmony_ci%undef addps
646cabdff1aSopenharmony_ci%undef subps
647cabdff1aSopenharmony_ci%undef unpcklps
648cabdff1aSopenharmony_ci%undef unpckhps
; PREROTATER nk, k, in, tcos, tsin
; (the comment on the macro line gives what the caller passes for each)
; Loads A = 4 floats at [in+k*4] and B = 4 floats at [in+nk*4-0x10] and
; regroups them as
;     E = {A0, A2, B0, B2}        O = {B3, B1, A3, A1}
; Loads C = {2 floats at [tcos+k*2], 2 floats at [tcos+nk*2-8]} and
;       S = the same two loads taken from tsin, then computes per element
;     P = O*C - E*S               Q = E*C + O*S
; Results: xmm1 = {P0, Q0, P1, Q1}, xmm0 = {P2, Q2, P3, Q3}.
; Always uses xmm registers; xmm2-xmm5 are clobbered.
650cabdff1aSopenharmony_ci%macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
651cabdff1aSopenharmony_ci    movaps   xmm0, [%3+%2*4]
652cabdff1aSopenharmony_ci    movaps   xmm1, [%3+%1*4-0x10]
653cabdff1aSopenharmony_ci    movaps   xmm2, xmm0
654cabdff1aSopenharmony_ci    shufps   xmm0, xmm1, 0x88               ; E: elements 0 and 2 of A, then of B
655cabdff1aSopenharmony_ci    shufps   xmm1, xmm2, 0x77               ; O: elements 3 and 1 of B, then of A
656cabdff1aSopenharmony_ci    movlps   xmm4, [%4+%2*2]                ; xmm4 = C (low half, then high half)
657cabdff1aSopenharmony_ci    movlps   xmm5, [%5+%2*2+0x0]            ; xmm5 = S (low half, then high half)
658cabdff1aSopenharmony_ci    movhps   xmm4, [%4+%1*2-0x8]
659cabdff1aSopenharmony_ci    movhps   xmm5, [%5+%1*2-0x8]
660cabdff1aSopenharmony_ci    movaps   xmm2, xmm0
661cabdff1aSopenharmony_ci    movaps   xmm3, xmm1
662cabdff1aSopenharmony_ci    mulps    xmm0, xmm5                     ; E*S
663cabdff1aSopenharmony_ci    mulps    xmm1, xmm4                     ; O*C
664cabdff1aSopenharmony_ci    mulps    xmm2, xmm4                     ; E*C
665cabdff1aSopenharmony_ci    mulps    xmm3, xmm5                     ; O*S
666cabdff1aSopenharmony_ci    subps    xmm1, xmm0                     ; P = O*C - E*S
667cabdff1aSopenharmony_ci    addps    xmm2, xmm3                     ; Q = E*C + O*S
668cabdff1aSopenharmony_ci    movaps   xmm0, xmm1
669cabdff1aSopenharmony_ci    unpcklps xmm1, xmm2                     ; {P0, Q0, P1, Q1}
670cabdff1aSopenharmony_ci    unpckhps xmm0, xmm2                     ; {P2, Q2, P3, Q3}
671cabdff1aSopenharmony_ci%endmacro
672cabdff1aSopenharmony_ci
; CMUL j, a, b, z, tcos, tsin
; Updates the register pair (a, b) in place using the table values at
; [tcos+j] and [tsin+j]:
;     a' = a*[tsin+j] - b*[tcos+j]
;     b' = b*[tsin+j] + a*[tcos+j]
; The 4th argument (z) is accepted but never referenced in the body.
; Clobbers m6 and m7.
; NOTE(review): the three-operand mulps/subps/addps forms are used on the SSE
; path as well; presumably the shared x86 macro layer emulates them there --
; confirm.
673cabdff1aSopenharmony_ci%macro CMUL 6 ; j, a, b, z (unused), tcos, tsin
674cabdff1aSopenharmony_ci    mulps      m6, %3, [%5+%1]              ; m6 = b * cos
675cabdff1aSopenharmony_ci    mulps      m7, %2, [%5+%1]              ; m7 = a * cos
676cabdff1aSopenharmony_ci    mulps      %2, %2, [%6+%1]              ; a  = a * sin
677cabdff1aSopenharmony_ci    mulps      %3, %3, [%6+%1]              ; b  = b * sin
678cabdff1aSopenharmony_ci    subps      %2, %2, m6                   ; a' = a*sin - b*cos
679cabdff1aSopenharmony_ci    addps      %3, %3, m7                   ; b' = b*sin + a*cos
680cabdff1aSopenharmony_ci%endmacro
681cabdff1aSopenharmony_ci
; POSROTATESHUF j, k, z, tcos, tsin
; (the comment on the macro line gives what the caller passes for each)
; A complete loop, including its local label .post and the loop branch.
; j and k are offsets: every iteration adds the step to j and subtracts it
; from k, and the loop repeats while j is still negative after the add.
; Per iteration the two blocks at [z+j*2] and [z+k*2] are loaded, each is run
; through CMUL with the table entries at offset j resp. k, and both blocks
; are written back: each block receives its own second-loaded half ("a"
; result of CMUL) interleaved element by element with the element-reversed
; "b" result of the opposite block.
; Step and block size: 0x20 / 0x40 bytes on the AVX path, 0x10 / 0x20 bytes
; otherwise.
; Modifies j and k; uses vector registers 0-7 (the non-AVX path leaves xmm3
; untouched). Because it defines .post, it can be expanded only once per
; enclosing non-local label.
682cabdff1aSopenharmony_ci%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
683cabdff1aSopenharmony_ci.post:
684cabdff1aSopenharmony_ci%if cpuflag(avx)
685cabdff1aSopenharmony_ci    vmovaps      ymm1,   [%3+%1*2]
686cabdff1aSopenharmony_ci    vmovaps      ymm0,   [%3+%1*2+0x20]
687cabdff1aSopenharmony_ci    vmovaps      ymm3,   [%3+%2*2]
688cabdff1aSopenharmony_ci    vmovaps      ymm2,   [%3+%2*2+0x20]
689cabdff1aSopenharmony_ci
690cabdff1aSopenharmony_ci    CMUL         %1, ymm0, ymm1, %3, %4, %5
691cabdff1aSopenharmony_ci    CMUL         %2, ymm2, ymm3, %3, %4, %5
; Reverse all 8 floats of ymm1 and ymm3: vshufps 0x1b reverses the 4 floats
; inside each 128-bit lane, vperm2f128 0x01 then swaps the two lanes.
692cabdff1aSopenharmony_ci    vshufps      ymm1, ymm1, ymm1, 0x1b
693cabdff1aSopenharmony_ci    vshufps      ymm3, ymm3, ymm3, 0x1b
694cabdff1aSopenharmony_ci    vperm2f128   ymm1, ymm1, ymm1, 0x01
695cabdff1aSopenharmony_ci    vperm2f128   ymm3, ymm3, ymm3, 0x01
; Interleave ymm2 with reversed ymm1 (-> block k) and ymm0 with reversed ymm3
; (-> block j). The unpack instructions work per 128-bit lane.
696cabdff1aSopenharmony_ci    vunpcklps    ymm6, ymm2, ymm1
697cabdff1aSopenharmony_ci    vunpckhps    ymm4, ymm2, ymm1
698cabdff1aSopenharmony_ci    vunpcklps    ymm7, ymm0, ymm3
699cabdff1aSopenharmony_ci    vunpckhps    ymm5, ymm0, ymm3
700cabdff1aSopenharmony_ci
; Store in 16-byte pieces, alternating unpack-low / unpack-high results, so
; that memory receives the interleaved elements in sequential order.
701cabdff1aSopenharmony_ci    vextractf128 [%3+%1*2],      ymm7, 0
702cabdff1aSopenharmony_ci    vextractf128 [%3+%1*2+0x10], ymm5, 0
703cabdff1aSopenharmony_ci    vextractf128 [%3+%1*2+0x20], ymm7, 1
704cabdff1aSopenharmony_ci    vextractf128 [%3+%1*2+0x30], ymm5, 1
705cabdff1aSopenharmony_ci
706cabdff1aSopenharmony_ci    vextractf128 [%3+%2*2],      ymm6, 0
707cabdff1aSopenharmony_ci    vextractf128 [%3+%2*2+0x10], ymm4, 0
708cabdff1aSopenharmony_ci    vextractf128 [%3+%2*2+0x20], ymm6, 1
709cabdff1aSopenharmony_ci    vextractf128 [%3+%2*2+0x30], ymm4, 1
710cabdff1aSopenharmony_ci    sub      %2,   0x20
711cabdff1aSopenharmony_ci    add      %1,   0x20
; The branch tests the flags of the add above: loop while j is negative.
712cabdff1aSopenharmony_ci    jl       .post
713cabdff1aSopenharmony_ci%else
714cabdff1aSopenharmony_ci    movaps   xmm1, [%3+%1*2]
715cabdff1aSopenharmony_ci    movaps   xmm0, [%3+%1*2+0x10]
716cabdff1aSopenharmony_ci    CMUL     %1,   xmm0, xmm1, %3, %4, %5
717cabdff1aSopenharmony_ci    movaps   xmm5, [%3+%2*2]
718cabdff1aSopenharmony_ci    movaps   xmm4, [%3+%2*2+0x10]
719cabdff1aSopenharmony_ci    CMUL     %2,   xmm4, xmm5, %3, %4, %5
720cabdff1aSopenharmony_ci    shufps   xmm1, xmm1, 0x1b               ; reverse the 4 floats
721cabdff1aSopenharmony_ci    shufps   xmm5, xmm5, 0x1b               ; reverse the 4 floats
722cabdff1aSopenharmony_ci    movaps   xmm6, xmm4
723cabdff1aSopenharmony_ci    unpckhps xmm4, xmm1                     ; block k, upper 16 bytes
724cabdff1aSopenharmony_ci    unpcklps xmm6, xmm1                     ; block k, lower 16 bytes
725cabdff1aSopenharmony_ci    movaps   xmm2, xmm0
726cabdff1aSopenharmony_ci    unpcklps xmm0, xmm5                     ; block j, lower 16 bytes
727cabdff1aSopenharmony_ci    unpckhps xmm2, xmm5                     ; block j, upper 16 bytes
728cabdff1aSopenharmony_ci    movaps   [%3+%2*2],      xmm6
729cabdff1aSopenharmony_ci    movaps   [%3+%2*2+0x10], xmm4
730cabdff1aSopenharmony_ci    movaps   [%3+%1*2],      xmm0
731cabdff1aSopenharmony_ci    movaps   [%3+%1*2+0x10], xmm2
732cabdff1aSopenharmony_ci    sub      %2,   0x10
733cabdff1aSopenharmony_ci    add      %1,   0x10
; The branch tests the flags of the add above: loop while j is negative.
734cabdff1aSopenharmony_ci    jl       .post
735cabdff1aSopenharmony_ci%endif
736cabdff1aSopenharmony_ci%endmacro
737cabdff1aSopenharmony_ci
; DECL_IMDCT
; Emits imdct_half(FFTContext *s, FFTSample *output, const FFTSample *input)
; for the instruction set that is selected where the macro is expanded.
; The function runs three phases:
;   1. .pre loop: PREROTATER on the input; the four 8-byte results of each
;      iteration are scattered into output at the element indices read as
;      16-bit words from s->revtab.
;   2. FFT_DISPATCH on output, with s->nbits in r1.
;   3. POSROTATESHUF over output, in place.
; On x86-32 rrevtab and rtsin share r6 and r5/r4 double as scratch, so the
; three table pointers are kept on the stack for the whole function:
;   [esp] = revtab pointer, [esp+4] = tsin pointer, [esp+8] = tcos pointer.
738cabdff1aSopenharmony_ci%macro DECL_IMDCT 0
739cabdff1aSopenharmony_cicglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input
740cabdff1aSopenharmony_ci%if ARCH_X86_64
741cabdff1aSopenharmony_ci%define rrevtab r7
742cabdff1aSopenharmony_ci%define rtcos   r8
743cabdff1aSopenharmony_ci%define rtsin   r9
744cabdff1aSopenharmony_ci%else
745cabdff1aSopenharmony_ci%define rrevtab r6
746cabdff1aSopenharmony_ci%define rtsin   r6
747cabdff1aSopenharmony_ci%define rtcos   r5
748cabdff1aSopenharmony_ci%endif
; r2 = input + mdctsize, rtcos/rtsin = table + mdctsize/2,
; rrevtab = revtab + mdctsize/4 (all added as byte offsets).
749cabdff1aSopenharmony_ci    mov   r3d, [r0+FFTContext.mdctsize]
750cabdff1aSopenharmony_ci    add   r2, r3
751cabdff1aSopenharmony_ci    shr   r3, 1
752cabdff1aSopenharmony_ci    mov   rtcos, [r0+FFTContext.tcos]
753cabdff1aSopenharmony_ci    mov   rtsin, [r0+FFTContext.tsin]
754cabdff1aSopenharmony_ci    add   rtcos, r3
755cabdff1aSopenharmony_ci    add   rtsin, r3
756cabdff1aSopenharmony_ci%if ARCH_X86_64 == 0
757cabdff1aSopenharmony_ci    push  rtcos
758cabdff1aSopenharmony_ci    push  rtsin
759cabdff1aSopenharmony_ci%endif
760cabdff1aSopenharmony_ci    shr   r3, 1
761cabdff1aSopenharmony_ci    mov   rrevtab, [r0+FFTContext.revtab]
762cabdff1aSopenharmony_ci    add   rrevtab, r3
763cabdff1aSopenharmony_ci%if ARCH_X86_64 == 0
764cabdff1aSopenharmony_ci    push  rrevtab
765cabdff1aSopenharmony_ci%endif
766cabdff1aSopenharmony_ci
; Loop counters: r3 counts down in steps of 4 until it goes negative, and
; r4 = -r3 throughout (kept in step on x86-64, recomputed on x86-32).
767cabdff1aSopenharmony_ci    sub   r3, 4
768cabdff1aSopenharmony_ci%if ARCH_X86_64
769cabdff1aSopenharmony_ci    xor   r4, r4
770cabdff1aSopenharmony_ci    sub   r4, r3
771cabdff1aSopenharmony_ci%endif
772cabdff1aSopenharmony_ci.pre:
773cabdff1aSopenharmony_ci%if ARCH_X86_64 == 0
774cabdff1aSopenharmony_ci;unspill: r4, r5 (rtcos) and r6 (rtsin) are overwritten later in the loop body
775cabdff1aSopenharmony_ci    xor   r4, r4
776cabdff1aSopenharmony_ci    sub   r4, r3
777cabdff1aSopenharmony_ci    mov   rtcos, [esp+8]
778cabdff1aSopenharmony_ci    mov   rtsin, [esp+4]
779cabdff1aSopenharmony_ci%endif
780cabdff1aSopenharmony_ci
781cabdff1aSopenharmony_ci    PREROTATER r4, r3, r2, rtcos, rtsin
; Scatter the two halves of xmm0 and xmm1 to output[index], 8 bytes per
; element, with the four indices taken from revtab at r4-4, r4-2, r3, r3+2.
782cabdff1aSopenharmony_ci%if ARCH_X86_64
783cabdff1aSopenharmony_ci    movzx  r5,  word [rrevtab+r4-4]
784cabdff1aSopenharmony_ci    movzx  r6,  word [rrevtab+r4-2]
785cabdff1aSopenharmony_ci    movzx  r10, word [rrevtab+r3]
786cabdff1aSopenharmony_ci    movzx  r11, word [rrevtab+r3+2]
787cabdff1aSopenharmony_ci    movlps [r1+r5 *8], xmm0
788cabdff1aSopenharmony_ci    movhps [r1+r6 *8], xmm0
789cabdff1aSopenharmony_ci    movlps [r1+r10*8], xmm1
790cabdff1aSopenharmony_ci    movhps [r1+r11*8], xmm1
791cabdff1aSopenharmony_ci    add    r4, 4
792cabdff1aSopenharmony_ci%else
; x86-32: the revtab pointer is reloaded into r6, and r5/r4 serve as the
; index scratch registers (which is why they are unspilled at .pre).
793cabdff1aSopenharmony_ci    mov    r6, [esp]
794cabdff1aSopenharmony_ci    movzx  r5, word [r6+r4-4]
795cabdff1aSopenharmony_ci    movzx  r4, word [r6+r4-2]
796cabdff1aSopenharmony_ci    movlps [r1+r5*8], xmm0
797cabdff1aSopenharmony_ci    movhps [r1+r4*8], xmm0
798cabdff1aSopenharmony_ci    movzx  r5, word [r6+r3]
799cabdff1aSopenharmony_ci    movzx  r4, word [r6+r3+2]
800cabdff1aSopenharmony_ci    movlps [r1+r5*8], xmm1
801cabdff1aSopenharmony_ci    movhps [r1+r4*8], xmm1
802cabdff1aSopenharmony_ci%endif
803cabdff1aSopenharmony_ci    sub    r3, 4
804cabdff1aSopenharmony_ci    jns    .pre
805cabdff1aSopenharmony_ci
; Phase 2: keep s in r5 and output in r6, then run the FFT with r0 = output
; and r1 = s->nbits.
; NOTE(review): FFT_DISPATCH is defined outside this part of the file. The
; code below relies on r5 and r6 (and on x86-64 also r8/r9, i.e. rtcos/rtsin)
; still holding their values afterwards -- confirm.
806cabdff1aSopenharmony_ci    mov  r5, r0
807cabdff1aSopenharmony_ci    mov  r6, r1
808cabdff1aSopenharmony_ci    mov  r0, r1
809cabdff1aSopenharmony_ci    mov  r1d, [r5+FFTContext.nbits]
810cabdff1aSopenharmony_ci
811cabdff1aSopenharmony_ci    FFT_DISPATCH SUFFIX, r1
812cabdff1aSopenharmony_ci
; Phase 3 setup: r6 = output + mdctsize, r0 = -(mdctsize/2),
; r1 = mdctsize/2 - mmsize.
813cabdff1aSopenharmony_ci    mov  r0d, [r5+FFTContext.mdctsize]
814cabdff1aSopenharmony_ci    add  r6, r0
815cabdff1aSopenharmony_ci    shr  r0, 1
816cabdff1aSopenharmony_ci%if ARCH_X86_64 == 0
; x86-32: r5/r6 are occupied now, so the table pointers are re-bound to
; r2/r3 and reloaded from their stack slots.
817cabdff1aSopenharmony_ci%define rtcos r2
818cabdff1aSopenharmony_ci%define rtsin r3
819cabdff1aSopenharmony_ci    mov  rtcos, [esp+8]
820cabdff1aSopenharmony_ci    mov  rtsin, [esp+4]
821cabdff1aSopenharmony_ci%endif
822cabdff1aSopenharmony_ci    neg  r0
823cabdff1aSopenharmony_ci    mov  r1, -mmsize
824cabdff1aSopenharmony_ci    sub  r1, r0
825cabdff1aSopenharmony_ci    POSROTATESHUF r0, r1, r6, rtcos, rtsin
826cabdff1aSopenharmony_ci%if ARCH_X86_64 == 0
; Drop the three pointers pushed at the top of the function.
827cabdff1aSopenharmony_ci    add esp, 12
828cabdff1aSopenharmony_ci%endif
829cabdff1aSopenharmony_ci    RET
830cabdff1aSopenharmony_ci%endmacro
831cabdff1aSopenharmony_ci
; Instantiate imdct_half twice: first under the settings already in effect
; at this point, then -- only when HAVE_AVX_EXTERNAL is set -- under
; INIT_YMM avx. Note that INIT_YMM avx itself sits outside the %if and is
; therefore executed regardless of HAVE_AVX_EXTERNAL.
832cabdff1aSopenharmony_ciDECL_IMDCT
833cabdff1aSopenharmony_ci
834cabdff1aSopenharmony_ciINIT_YMM avx
835cabdff1aSopenharmony_ci
836cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL
837cabdff1aSopenharmony_ciDECL_IMDCT
838cabdff1aSopenharmony_ci%endif
839