;******************************************************************************
;* SIMD-optimized functions for the DCA decoder
;* Copyright (C) 2016 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

%define sizeof_float 4
%define FMA3_OFFSET (8 * cpuflag(fma3))

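; On FMA3 builds the inner loops below produce twice as many output samples
; per iteration, so the cnt1/cnt2 loop counters step by 16 bytes instead of 8.

; As a rough scalar model of what both routines compute (a sketch patterned
; after the C reference in libavcodec/dcadsp.c; the name lfe_fir_ref and its
; exact prototype here are illustrative only), with dec_select = 0 for
; lfe_fir0_float and dec_select = 1 for lfe_fir1_float:
;
;   static void lfe_fir_ref(float *samples, const int32_t *lfe,
;                           const float *coeff, ptrdiff_t nblocks,
;                           int dec_select)
;   {
;       int factor  = 64 << dec_select;  /* PCM samples per LFE sample */
;       int ncoeffs =  8 >> dec_select;  /* FIR taps per output sample */
;       int i, j, k;
;
;       for (i = 0; i < nblocks >> (dec_select + 1); i++) {
;           /* one LFE sample yields 64 (or 128) interpolated PCM samples */
;           for (j = 0; j < factor / 2; j++) {
;               float a = 0.0f, b = 0.0f;
;               for (k = 0; k < ncoeffs; k++) {
;                   a += coeff[      j * ncoeffs + k] * lfe[-k];
;                   b += coeff[255 - j * ncoeffs - k] * lfe[-k];
;               }
;               samples[             j] = a;  /* first half, forward   */
;               samples[factor / 2 + j] = b;  /* second half, mirrored */
;           }
;           lfe++;
;           samples += factor;
;       }
;   }
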
%macro LFE_FIR0_FLOAT 0
cglobal lfe_fir0_float, 4, 6, 12 + cpuflag(fma3)*4, samples, lfe, coeff, nblocks, cnt1, cnt2
    shr nblocksd, 1
    sub     lfeq, 7*sizeof_float
    mov    cnt1d, 32*sizeof_float
    mov    cnt2d, 32*sizeof_float-8-FMA3_OFFSET
    lea   coeffq, [coeffq+cnt1q*8]
    add samplesq, cnt1q
    neg    cnt1q

.loop:
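    ; Convert eight int32 LFE samples to float: m4 = lfe[-3..0] and
    ; m5 = lfe[-7..-4]; m7 and m6 hold the same values with lanes reversed.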
%if cpuflag(avx)
    cvtdq2ps  m4, [lfeq+16]
    cvtdq2ps  m5, [lfeq   ]
    shufps    m7, m4, m4, q0123
    shufps    m6, m5, m5, q0123
%else
    movu      m4, [lfeq+16]
    movu      m5, [lfeq   ]
    cvtdq2ps  m4, m4
    cvtdq2ps  m5, m5
    pshufd    m7, m4, q0123
    pshufd    m6, m5, q0123
%endif

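    ; Each iteration computes 2 output samples (4 with FMA3) of the first
    ; half-block at samples+cnt1, counting up, and as many samples of the
    ; mirrored second half-block at samples+cnt2, counting down.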
.inner_loop:
%if ARCH_X86_64
    movaps    m8, [coeffq+cnt1q*8   ]
    movaps    m9, [coeffq+cnt1q*8+16]
    movaps   m10, [coeffq+cnt1q*8+32]
    movaps   m11, [coeffq+cnt1q*8+48]
%if cpuflag(fma3)
    movaps   m12, [coeffq+cnt1q*8+64]
    movaps   m13, [coeffq+cnt1q*8+80]
    movaps   m14, [coeffq+cnt1q*8+96]
    movaps   m15, [coeffq+cnt1q*8+112]
    mulps     m0, m7, m8
    mulps     m1, m7, m10
    mulps     m2, m7, m12
    mulps     m3, m7, m14
    fmaddps   m0, m6, m9, m0
    fmaddps   m1, m6, m11, m1
    fmaddps   m2, m6, m13, m2
    fmaddps   m3, m6, m15, m3

    haddps    m0, m1
    haddps    m2, m3
    haddps    m0, m2
    movaps [samplesq+cnt1q], m0
%else
    mulps     m0, m7, m8
    mulps     m1, m6, m9
    mulps     m2, m7, m10
    mulps     m3, m6, m11
    addps     m0, m1
    addps     m2, m3

    unpckhps  m3, m0, m2
    unpcklps  m0, m2
    addps     m3, m0
    movhlps   m2, m3
    addps     m2, m3
    movlps [samplesq+cnt1q], m2
%endif
%else ; ARCH_X86_32
%if cpuflag(fma3)
    mulps     m0, m7, [coeffq+cnt1q*8    ]
    mulps     m1, m7, [coeffq+cnt1q*8+32 ]
    mulps     m2, m7, [coeffq+cnt1q*8+64 ]
    mulps     m3, m7, [coeffq+cnt1q*8+96 ]
    fmaddps   m0, m6, [coeffq+cnt1q*8+16 ], m0
    fmaddps   m1, m6, [coeffq+cnt1q*8+48 ], m1
    fmaddps   m2, m6, [coeffq+cnt1q*8+80 ], m2
    fmaddps   m3, m6, [coeffq+cnt1q*8+112], m3

    haddps    m0, m1
    haddps    m2, m3
    haddps    m0, m2
    movaps [samplesq+cnt1q], m0
%else
    mulps     m0, m7, [coeffq+cnt1q*8   ]
    mulps     m1, m6, [coeffq+cnt1q*8+16]
    mulps     m2, m7, [coeffq+cnt1q*8+32]
    mulps     m3, m6, [coeffq+cnt1q*8+48]
    addps     m0, m1
    addps     m2, m3

    unpckhps  m3, m0, m2
    unpcklps  m0, m2
    addps     m3, m0
    movhlps   m2, m3
    addps     m2, m3
    movlps [samplesq+cnt1q], m2
%endif
%endif ; ARCH

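    ; Second half-block: apply the same coefficients to the LFE samples in
    ; the opposite tap order and store the results back to front at cnt2.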
%if ARCH_X86_64
%if cpuflag(fma3)
    mulps     m8, m5
    mulps    m10, m5
    mulps    m12, m5
    mulps    m14, m5
    fmaddps   m8, m4, m9, m8
    fmaddps  m10, m4, m11, m10
    fmaddps  m12, m4, m13, m12
    fmaddps  m14, m4, m15, m14

    haddps   m10, m8
    haddps   m14, m12
    haddps   m14, m10
    movaps [samplesq+cnt2q], m14
%else
    mulps     m8, m5
    mulps     m9, m4
    mulps    m10, m5
    mulps    m11, m4
    addps     m8, m9
    addps    m10, m11

    unpckhps m11, m10, m8
    unpcklps m10, m8
    addps    m11, m10
    movhlps   m8, m11
    addps     m8, m11
    movlps [samplesq+cnt2q], m8
%endif
%else ; ARCH_X86_32
%if cpuflag(fma3)
    mulps     m0, m5, [coeffq+cnt1q*8    ]
    mulps     m1, m5, [coeffq+cnt1q*8+32 ]
    mulps     m2, m5, [coeffq+cnt1q*8+64 ]
    mulps     m3, m5, [coeffq+cnt1q*8+96 ]
    fmaddps   m0, m4, [coeffq+cnt1q*8+16 ], m0
    fmaddps   m1, m4, [coeffq+cnt1q*8+48 ], m1
    fmaddps   m2, m4, [coeffq+cnt1q*8+80 ], m2
    fmaddps   m3, m4, [coeffq+cnt1q*8+112], m3

    haddps    m1, m0
    haddps    m3, m2
    haddps    m3, m1
    movaps [samplesq+cnt2q], m3
%else
    mulps     m0, m5, [coeffq+cnt1q*8   ]
    mulps     m1, m4, [coeffq+cnt1q*8+16]
    mulps     m2, m5, [coeffq+cnt1q*8+32]
    mulps     m3, m4, [coeffq+cnt1q*8+48]
    addps     m0, m1
    addps     m2, m3

    unpckhps  m3, m2, m0
    unpcklps  m2, m0
    addps     m3, m2
    movhlps   m0, m3
    addps     m0, m3
    movlps [samplesq+cnt2q], m0
%endif
%endif ; ARCH

    sub    cnt2d, 8 + FMA3_OFFSET
    add    cnt1q, 8 + FMA3_OFFSET
    jl .inner_loop

    add     lfeq, sizeof_float
    add samplesq,  64*sizeof_float
    mov    cnt1q, -32*sizeof_float
    mov    cnt2d,  32*sizeof_float-8-FMA3_OFFSET
    sub nblocksd, 1
    jg .loop
    RET
%endmacro

INIT_XMM sse2
LFE_FIR0_FLOAT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
LFE_FIR0_FLOAT
%endif
%if HAVE_FMA3_EXTERNAL
INIT_XMM fma3
LFE_FIR0_FLOAT
%endif

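; lfe_fir1_float is the dec_select = 1 case of the scalar sketch above:
; 4 filter taps and 128 interpolated samples per LFE sample, so nblocks is
; divided by 4 and only four LFE samples are live at a time.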
%macro LFE_FIR1_FLOAT 0
cglobal lfe_fir1_float, 4, 6, 10, samples, lfe, coeff, nblocks, cnt1, cnt2
    shr nblocksd, 2
    sub     lfeq, 3*sizeof_float
    mov    cnt1d, 64*sizeof_float
    mov    cnt2d, 64*sizeof_float-16
    lea   coeffq, [coeffq+cnt1q*4]
    add samplesq, cnt1q
    neg    cnt1q

.loop:
%if cpuflag(avx)
    cvtdq2ps  m4, [lfeq]
    shufps    m5, m4, m4, q0123
%else
    movu      m4, [lfeq]
    cvtdq2ps  m4, m4
    pshufd    m5, m4, q0123
%endif

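    ; Each iteration computes 4 forward samples at samples+cnt1 and 4
    ; mirrored samples at samples+cnt2; the reversed haddps order in the
    ; second half emits the mirrored samples in descending coefficient order.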
.inner_loop:
    movaps    m6, [coeffq+cnt1q*4   ]
    movaps    m7, [coeffq+cnt1q*4+16]
    mulps     m0, m5, m6
    mulps     m1, m5, m7
%if ARCH_X86_64
    movaps    m8, [coeffq+cnt1q*4+32]
    movaps    m9, [coeffq+cnt1q*4+48]
    mulps     m2, m5, m8
    mulps     m3, m5, m9
%else
    mulps     m2, m5, [coeffq+cnt1q*4+32]
    mulps     m3, m5, [coeffq+cnt1q*4+48]
%endif

    haddps    m0, m1
    haddps    m2, m3
    haddps    m0, m2
    movaps [samplesq+cnt1q], m0

    mulps     m6, m4
    mulps     m7, m4
%if ARCH_X86_64
    mulps     m8, m4
    mulps     m9, m4

    haddps    m7, m6
    haddps    m9, m8
    haddps    m9, m7
    movaps [samplesq+cnt2q], m9
%else
    mulps     m2, m4, [coeffq+cnt1q*4+32]
    mulps     m3, m4, [coeffq+cnt1q*4+48]

    haddps    m7, m6
    haddps    m3, m2
    haddps    m3, m7
    movaps [samplesq+cnt2q], m3
%endif

    sub    cnt2d, 16
    add    cnt1q, 16
    jl .inner_loop

    add     lfeq, sizeof_float
    add samplesq, 128*sizeof_float
    mov    cnt1q, -64*sizeof_float
    mov    cnt2d,  64*sizeof_float-16
    sub nblocksd, 1
    jg .loop
    RET
%endmacro

INIT_XMM sse3
LFE_FIR1_FLOAT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
LFE_FIR1_FLOAT
%endif