;******************************************************************************
;* SIMD-optimized functions for the DCA decoder
;* Copyright (C) 2016 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

; Size in bytes of one float (output sample / filter coefficient); also used
; as the byte step for one 32-bit LFE input sample.
%define sizeof_float 4

; Extra byte step applied to the inner-loop counters of lfe_fir0_float.
; %define is expanded at each point of use, so this evaluates to 8 inside an
; fma3 instantiation and to 0 inside any other one.
%define FMA3_OFFSET (8 * cpuflag(fma3))

;-----------------------------------------------------------------------------
; lfe_fir0_float(samples, lfe, coeff, nblocks)
;
; LFE interpolation FIR, 8 taps per output, 64 outputs per input sample.
;
; In (x86inc named arguments):
;   samples - output buffer; float results are stored here
;   lfe     - input; packed signed 32-bit integers, read at lfe[-7..0]
;   coeff   - 256 float coefficients (the end pointer is coeff + 1024 bytes)
;   nblocks - block count; the outer loop runs nblocks >> 1 times
;   (presumably float *, int32_t *, const float *, ptrdiff_t - confirm
;    against the C prototype)
;
; Per outer iteration:
;   - the 8 inputs lfe[-7..0] are converted to float once,
;   - outputs  0..31 are written in ascending order; output j is the dot
;     product of coefficients [8*j .. 8*j+7] with the inputs in reversed
;     order (lfe[0], lfe[-1], ... lfe[-7]),
;   - outputs 63..32 are written in descending order; output 63-j is the dot
;     product of the same 8 coefficients with the inputs in natural order,
;   - lfe advances by one sample, samples by 64 floats.
;
; The inner loop handles 2 coefficient groups per pass (4 in the FMA3 build,
; see FMA3_OFFSET), i.e. 16 (resp. 8) passes per outer iteration.
;
; Alignment: coeff is accessed with movaps / memory operands of packed
; instructions and so must be 16-byte aligned; the FMA3 build also stores to
; samples with movaps.  The SSE2/AVX builds store 8 bytes with movlps.
;
; NOTE: the outer loop is bottom-tested, so the body executes once even when
; nblocks >> 1 is 0; callers are expected to pass nblocks >= 2.
;-----------------------------------------------------------------------------
%macro LFE_FIR0_FLOAT 0
cglobal lfe_fir0_float, 4, 6, 12 + cpuflag(fma3)*4, samples, lfe, coeff, nblocks, cnt1, cnt2
    shr nblocksd, 1                          ; number of outer iterations
    sub     lfeq, 7*sizeof_float             ; lfeq -> lfe[-7], oldest of the 8 taps
    mov    cnt1d, 32*sizeof_float            ; byte size of half an output block
    mov    cnt2d, 32*sizeof_float-8-FMA3_OFFSET ; offset of the last store slot of the 2nd half
    lea   coeffq, [coeffq+cnt1q*8]           ; coeffq -> end of the 256-float table
    add samplesq, cnt1q                      ; samplesq -> middle of the output block
    neg    cnt1q                             ; cnt1 counts up from -128 to 0

.loop:
    ; m5 = lfe[-7..-4], m4 = lfe[-3..0] as floats (natural order)
    ; m6 / m7 = m5 / m4 with their four lanes reversed
%if cpuflag(avx)
    ; VEX-encoded memory operands have no alignment requirement
    cvtdq2ps  m4, [lfeq+16]
    cvtdq2ps  m5, [lfeq   ]
    shufps    m7, m4, m4, q0123
    shufps    m6, m5, m5, q0123
%else
    ; lfeq is not guaranteed to be aligned, so load with movu first
    movu      m4, [lfeq+16]
    movu      m5, [lfeq   ]
    cvtdq2ps  m4, m4
    cvtdq2ps  m5, m5
    pshufd    m7, m4, q0123
    pshufd    m6, m5, q0123
%endif

.inner_loop:
    ; ---- first half: ascending outputs, reversed inputs (m7, m6) ----
%if ARCH_X86_64
    ; x86-64 has enough registers to keep the coefficients for the 2nd half
    movaps    m8, [coeffq+cnt1q*8   ]
    movaps    m9, [coeffq+cnt1q*8+16]
    movaps   m10, [coeffq+cnt1q*8+32]
    movaps   m11, [coeffq+cnt1q*8+48]
%if cpuflag(fma3)
    movaps   m12, [coeffq+cnt1q*8+64]
    movaps   m13, [coeffq+cnt1q*8+80]
    movaps   m14, [coeffq+cnt1q*8+96]
    movaps   m15, [coeffq+cnt1q*8+112]
    mulps     m0, m7, m8
    mulps     m1, m7, m10
    mulps     m2, m7, m12
    mulps     m3, m7, m14
    fmaddps   m0, m6, m9, m0                 ; m0 = m6 * m9  + m0
    fmaddps   m1, m6, m11, m1
    fmaddps   m2, m6, m13, m2
    fmaddps   m3, m6, m15, m3

    ; horizontal sums of m0..m3 -> lanes 0..3 of m0
    haddps    m0, m1
    haddps    m2, m3
    haddps    m0, m2
    movaps [samplesq+cnt1q], m0
%else
    mulps     m0, m7, m8
    mulps     m1, m6, m9
    mulps     m2, m7, m10
    mulps     m3, m6, m11
    addps     m0, m1                         ; partial products of output j
    addps     m2, m3                         ; partial products of output j+1

    ; interleave + add: low two lanes of m2 = { hsum(m0), hsum(m2) }
    unpckhps  m3, m0, m2
    unpcklps  m0, m2
    addps     m3, m0
    movhlps   m2, m3
    addps     m2, m3
    movlps [samplesq+cnt1q], m2
%endif
%else ; ARCH_X86_32
%if cpuflag(fma3)
    mulps     m0, m7, [coeffq+cnt1q*8    ]
    mulps     m1, m7, [coeffq+cnt1q*8+32 ]
    mulps     m2, m7, [coeffq+cnt1q*8+64 ]
    mulps     m3, m7, [coeffq+cnt1q*8+96 ]
    fmaddps   m0, m6, [coeffq+cnt1q*8+16 ], m0
    fmaddps   m1, m6, [coeffq+cnt1q*8+48 ], m1
    fmaddps   m2, m6, [coeffq+cnt1q*8+80 ], m2
    fmaddps   m3, m6, [coeffq+cnt1q*8+112], m3

    haddps    m0, m1
    haddps    m2, m3
    haddps    m0, m2
    movaps [samplesq+cnt1q], m0
%else
    mulps     m0, m7, [coeffq+cnt1q*8   ]
    mulps     m1, m6, [coeffq+cnt1q*8+16]
    mulps     m2, m7, [coeffq+cnt1q*8+32]
    mulps     m3, m6, [coeffq+cnt1q*8+48]
    addps     m0, m1
    addps     m2, m3

    unpckhps  m3, m0, m2
    unpcklps  m0, m2
    addps     m3, m0
    movhlps   m2, m3
    addps     m2, m3
    movlps [samplesq+cnt1q], m2
%endif
%endif; ARCH

    ; ---- second half: descending outputs, natural-order inputs (m5, m4) ----
    ; The store address moves downwards, so the sums are reduced with swapped
    ; operands to place them in reversed lane order.
%if ARCH_X86_64
%if cpuflag(fma3)
    mulps     m8, m5
    mulps    m10, m5
    mulps    m12, m5
    mulps    m14, m5
    fmaddps   m8, m4, m9, m8
    fmaddps  m10, m4, m11, m10
    fmaddps  m12, m4, m13, m12
    fmaddps  m14, m4, m15, m14

    ; m14 = { hsum(m14), hsum(m12), hsum(m10), hsum(m8) }
    haddps   m10, m8
    haddps   m14, m12
    haddps   m14, m10
    movaps [samplesq+cnt2q], m14
%else
    mulps     m8, m5
    mulps     m9, m4
    mulps    m10, m5
    mulps    m11, m4
    addps     m8, m9
    addps    m10, m11

    ; low two lanes of m8 = { hsum(m10), hsum(m8) }
    unpckhps m11, m10, m8
    unpcklps m10, m8
    addps    m11, m10
    movhlps   m8, m11
    addps     m8, m11
    movlps [samplesq+cnt2q], m8
%endif
%else ; ARCH_X86_32
    ; not enough registers on x86-32: re-read the coefficients from memory
%if cpuflag(fma3)
    mulps     m0, m5, [coeffq+cnt1q*8    ]
    mulps     m1, m5, [coeffq+cnt1q*8+32 ]
    mulps     m2, m5, [coeffq+cnt1q*8+64 ]
    mulps     m3, m5, [coeffq+cnt1q*8+96 ]
    fmaddps   m0, m4, [coeffq+cnt1q*8+16 ], m0
    fmaddps   m1, m4, [coeffq+cnt1q*8+48 ], m1
    fmaddps   m2, m4, [coeffq+cnt1q*8+80 ], m2
    fmaddps   m3, m4, [coeffq+cnt1q*8+112], m3

    haddps    m1, m0
    haddps    m3, m2
    haddps    m3, m1
    movaps [samplesq+cnt2q], m3
%else
    mulps     m0, m5, [coeffq+cnt1q*8   ]
    mulps     m1, m4, [coeffq+cnt1q*8+16]
    mulps     m2, m5, [coeffq+cnt1q*8+32]
    mulps     m3, m4, [coeffq+cnt1q*8+48]
    addps     m0, m1
    addps     m2, m3

    unpckhps  m3, m2, m0
    unpcklps  m2, m0
    addps     m3, m2
    movhlps   m0, m3
    addps     m0, m3
    movlps [samplesq+cnt2q], m0
%endif
%endif; ARCH

    sub    cnt2d, 8 + FMA3_OFFSET            ; next (lower) slot of the 2nd half
    add    cnt1q, 8 + FMA3_OFFSET            ; next (higher) slot of the 1st half
    jl .inner_loop                           ; flags come from the add: loop while cnt1 < 0

    add     lfeq, sizeof_float               ; slide the input window by one sample
    add samplesq,  64*sizeof_float           ; next 64-float output block
    mov    cnt1q, -32*sizeof_float
    mov    cnt2d,  32*sizeof_float-8-FMA3_OFFSET
    sub nblocksd, 1
    jg .loop
    RET
%endmacro

; Instantiate lfe_fir0_float once per instruction set.  The AVX and FMA3
; versions are only assembled when the corresponding HAVE_*_EXTERNAL build
; flag is set.
INIT_XMM sse2
LFE_FIR0_FLOAT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
LFE_FIR0_FLOAT
%endif
%if HAVE_FMA3_EXTERNAL
INIT_XMM fma3
LFE_FIR0_FLOAT
%endif

;-----------------------------------------------------------------------------
; lfe_fir1_float(samples, lfe, coeff, nblocks)
;
; LFE interpolation FIR, 4 taps per output, 128 outputs per input sample.
;
; In (x86inc named arguments):
;   samples - output buffer; stored with movaps, must be 16-byte aligned
;   lfe     - input; packed signed 32-bit integers, read at lfe[-3..0]
;   coeff   - 256 float coefficients, 16-byte aligned
;   nblocks - block count; the outer loop runs nblocks >> 2 times
;   (presumably float *, int32_t *, const float *, ptrdiff_t - confirm
;    against the C prototype)
;
; Per outer iteration:
;   - the 4 inputs lfe[-3..0] are converted to float once,
;   - outputs   0..63 are written in ascending order; output j is the dot
;     product of coefficients [4*j .. 4*j+3] with the inputs in reversed
;     order (lfe[0], lfe[-1], lfe[-2], lfe[-3]),
;   - outputs 127..64 are written in descending order; output 127-j is the
;     dot product of the same 4 coefficients with the inputs in natural order,
;   - lfe advances by one sample, samples by 128 floats.
;
; Each inner-loop pass consumes 4 coefficient groups and emits 4 + 4 outputs.
;
; NOTE: the outer loop is bottom-tested, so the body executes once even when
; nblocks >> 2 is 0; callers are expected to pass nblocks >= 4.
;-----------------------------------------------------------------------------
%macro LFE_FIR1_FLOAT 0
cglobal lfe_fir1_float, 4, 6, 10, samples, lfe, coeff, nblocks, cnt1, cnt2
    shr nblocksd, 2                          ; number of outer iterations
    sub     lfeq, 3*sizeof_float             ; lfeq -> lfe[-3], oldest of the 4 taps
    mov    cnt1d, 64*sizeof_float            ; byte size of half an output block
    mov    cnt2d, 64*sizeof_float-16         ; offset of the last vector of the 2nd half
    lea   coeffq, [coeffq+cnt1q*4]           ; coeffq -> end of the 256-float table
    add samplesq, cnt1q                      ; samplesq -> middle of the output block
    neg    cnt1q                             ; cnt1 counts up from -256 to 0

.loop:
    ; m4 = lfe[-3..0] as floats (natural order), m5 = m4 with lanes reversed
%if cpuflag(avx)
    ; VEX-encoded memory operands have no alignment requirement
    cvtdq2ps  m4, [lfeq]
    shufps    m5, m4, m4, q0123
%else
    ; lfeq is not guaranteed to be aligned, so load with movu first
    movu      m4, [lfeq]
    cvtdq2ps  m4, m4
    pshufd    m5, m4, q0123
%endif

.inner_loop:
    ; ---- first half: ascending outputs, reversed inputs (m5) ----
    movaps    m6, [coeffq+cnt1q*4   ]
    movaps    m7, [coeffq+cnt1q*4+16]
    mulps     m0, m5, m6
    mulps     m1, m5, m7
%if ARCH_X86_64
    ; keep groups 2 and 3 in registers for reuse in the second half
    movaps    m8, [coeffq+cnt1q*4+32]
    movaps    m9, [coeffq+cnt1q*4+48]
    mulps     m2, m5, m8
    mulps     m3, m5, m9
%else
    mulps     m2, m5, [coeffq+cnt1q*4+32]
    mulps     m3, m5, [coeffq+cnt1q*4+48]
%endif

    ; m0 = { hsum(m0), hsum(m1), hsum(m2), hsum(m3) }
    haddps    m0, m1
    haddps    m2, m3
    haddps    m0, m2
    movaps [samplesq+cnt1q], m0

    ; ---- second half: descending outputs, natural-order inputs (m4) ----
    ; The store address moves downwards while the coefficient groups move
    ; upwards, so the four sums must land in reversed lane order
    ; (group 3 in lane 0 ... group 0 in lane 3).
    mulps     m6, m4
    mulps     m7, m4
%if ARCH_X86_64
    mulps     m8, m4
    mulps     m9, m4

    ; m9 = { hsum(m9), hsum(m8), hsum(m7), hsum(m6) }
    haddps    m9, m8
    haddps    m7, m6
    haddps    m9, m7
    movaps [samplesq+cnt2q], m9
%else
    ; not enough registers on x86-32: re-read groups 2 and 3 from memory
    mulps     m2, m4, [coeffq+cnt1q*4+32]
    mulps     m3, m4, [coeffq+cnt1q*4+48]

    ; m3 = { hsum(m3), hsum(m2), hsum(m7), hsum(m6) }
    haddps    m3, m2
    haddps    m7, m6
    haddps    m3, m7
    movaps [samplesq+cnt2q], m3
%endif

    sub    cnt2d, 16                         ; next (lower) vector of the 2nd half
    add    cnt1q, 16                         ; next (higher) vector of the 1st half
    jl .inner_loop                           ; flags come from the add: loop while cnt1 < 0

    add     lfeq, sizeof_float               ; slide the input window by one sample
    add samplesq, 128*sizeof_float           ; next 128-float output block
    mov    cnt1q, -64*sizeof_float
    mov    cnt2d,  64*sizeof_float-16
    sub nblocksd, 1
    jg .loop
    RET
%endmacro

; Instantiate lfe_fir1_float.  The baseline is SSE3 because the macro uses
; haddps; the AVX version is only assembled when HAVE_AVX_EXTERNAL is set.
INIT_XMM sse3
LFE_FIR1_FLOAT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
LFE_FIR1_FLOAT
%endif