1cabdff1aSopenharmony_ci;*****************************************************************************
2cabdff1aSopenharmony_ci;* x86-optimized Float DSP functions
3cabdff1aSopenharmony_ci;*
4cabdff1aSopenharmony_ci;* Copyright 2006 Loren Merritt
5cabdff1aSopenharmony_ci;*
6cabdff1aSopenharmony_ci;* This file is part of FFmpeg.
7cabdff1aSopenharmony_ci;*
8cabdff1aSopenharmony_ci;* FFmpeg is free software; you can redistribute it and/or
9cabdff1aSopenharmony_ci;* modify it under the terms of the GNU Lesser General Public
10cabdff1aSopenharmony_ci;* License as published by the Free Software Foundation; either
11cabdff1aSopenharmony_ci;* version 2.1 of the License, or (at your option) any later version.
12cabdff1aSopenharmony_ci;*
13cabdff1aSopenharmony_ci;* FFmpeg is distributed in the hope that it will be useful,
14cabdff1aSopenharmony_ci;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15cabdff1aSopenharmony_ci;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16cabdff1aSopenharmony_ci;* Lesser General Public License for more details.
17cabdff1aSopenharmony_ci;*
18cabdff1aSopenharmony_ci;* You should have received a copy of the GNU Lesser General Public
19cabdff1aSopenharmony_ci;* License along with FFmpeg; if not, write to the Free Software
20cabdff1aSopenharmony_ci;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21cabdff1aSopenharmony_ci;******************************************************************************
22cabdff1aSopenharmony_ci
23cabdff1aSopenharmony_ci%include "libavutil/x86/x86util.asm"
24cabdff1aSopenharmony_ci
SECTION_RODATA 32
; Dword lane indices 7..0. Loaded by the AVX2 build of vector_fmul_reverse and
; used as the index operand of vpermps to reverse the eight floats of a YMM
; vector.
pd_reverse:
    dd 7, 6, 5, 4
    dd 3, 2, 1, 0
27cabdff1aSopenharmony_ci
28cabdff1aSopenharmony_ciSECTION .text
29cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; void vector_fmul(float *dst, const float *src0, const float *src1, int len)
;
; Element-wise product of packed single-precision floats:
;     dst[i] = src0[i] * src1[i]
;
; The arrays are processed from the end towards the start, 64 bytes
; (16 floats) per loop iteration, regardless of the vector width.
; The loop body runs once before the counter is tested and only uses
; aligned moves, so len is presumably expected to be a positive multiple
; of 16 and the buffers suitably aligned -- confirm against the callers.
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL 0
cglobal vector_fmul, 4,4,2, dst, src0, src1, len
    lea       lenq, [lend*4 - 64]       ; lenq = byte offset of the last 64-byte chunk
ALIGN 16
.chunk:
    ; Two vectors per repetition; enough repetitions to cover 64 bytes.
%assign chunk_ofs 0
%rep 64/(2*mmsize)
    mova      m0,   [src0q + lenq + chunk_ofs]
    mova      m1,   [src0q + lenq + chunk_ofs + mmsize]
    mulps     m0, m0, [src1q + lenq + chunk_ofs]
    mulps     m1, m1, [src1q + lenq + chunk_ofs + mmsize]
    mova      [dstq + lenq + chunk_ofs], m0
    mova      [dstq + lenq + chunk_ofs + mmsize], m1
%assign chunk_ofs chunk_ofs + 2*mmsize
%endrep

    sub       lenq, 64                  ; step back one chunk
    jge       .chunk                    ; offset 0 is still a valid chunk
    REP_RET
%endmacro

; Instantiate the SSE (XMM) build, and the AVX (YMM) build when available.
INIT_XMM sse
VECTOR_FMUL
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL
%endif
60cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; void vector_dmul(double *dst, const double *src0, const double *src1, int len)
;
; Element-wise product of packed doubles:
;     dst[i] = src0[i] * src1[i]
;
; Four vectors (4*mmsize bytes) are handled per iteration, walking from the
; end of the arrays towards the start. All accesses use movaps, so the
; buffers must be aligned to the vector size; len is presumably a positive
; multiple of the per-iteration element count -- confirm against callers.
;-----------------------------------------------------------------------------
%macro VECTOR_DMUL 0
cglobal vector_dmul, 4,4,4, dst, src0, src1, len
    ; Writing the 32-bit register discards whatever was in the upper half
    ; of lenq; the result is the byte offset of the last 4-vector group.
    lea       lend, [lenq*8 - mmsize*4]
ALIGN 16
.group:
    ; load four vectors of src0
    movaps    m0,     [src0q + lenq]
    movaps    m1,     [src0q + lenq + mmsize]
    movaps    m2,     [src0q + lenq + mmsize*2]
    movaps    m3,     [src0q + lenq + mmsize*3]
    ; multiply by the matching vectors of src1
    mulpd     m0, m0, [src1q + lenq]
    mulpd     m1, m1, [src1q + lenq + mmsize]
    mulpd     m2, m2, [src1q + lenq + mmsize*2]
    mulpd     m3, m3, [src1q + lenq + mmsize*3]
    ; write the products
    movaps    [dstq + lenq], m0
    movaps    [dstq + lenq + mmsize], m1
    movaps    [dstq + lenq + mmsize*2], m2
    movaps    [dstq + lenq + mmsize*3], m3

    sub       lenq, mmsize*4            ; previous group
    jge       .group
    RET
%endmacro

; SSE2 (XMM) build, plus an AVX (YMM) build when available.
INIT_XMM sse2
VECTOR_DMUL
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMUL
%endif
93cabdff1aSopenharmony_ci
;------------------------------------------------------------------------------
; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len)
;
; Multiply-accumulate with a scalar:
;     dst[i] += src[i] * mul
;
; 64 bytes (16 floats) are processed per iteration, from the end of the
; arrays towards the start. The body runs before the counter is tested and
; uses aligned moves, so len is presumably a positive multiple of 16 and the
; buffers aligned -- confirm against the callers.
;------------------------------------------------------------------------------

%macro VECTOR_FMAC_SCALAR 0
    ; On UNIX64 the float argument travels in xmm0 and is not a GPR argument,
    ; hence the shorter argument list there.
%if UNIX64
cglobal vector_fmac_scalar, 3,3,5, dst, src, len
%else
cglobal vector_fmac_scalar, 4,4,5, dst, src, mul, len
%endif

    ; ---- broadcast mul into every lane of m0 ----
%if ARCH_X86_32
    VBROADCASTSS m0, mulm               ; mul lives on the stack
%else
  %if WIN64
    SWAP 0, 2                           ; third argument arrives in xmm2
  %endif
    shufps      xm0, xm0, 0             ; splat lane 0 over the low 128 bits
  %if cpuflag(avx)
    vinsertf128  m0, m0, xm0, 1         ; mirror into the high 128 bits
  %endif
%endif

    lea    lenq, [lend*4-64]            ; byte offset of the last 64-byte chunk
.chunk:
%if cpuflag(fma3)
    ; fused path: m = mul * src + dst
    mova     m1,     [dstq+lenq]
    mova     m2,     [dstq+lenq+mmsize]
    fmaddps  m1, m0, [srcq+lenq], m1
    fmaddps  m2, m0, [srcq+lenq+mmsize], m2
%else ; no fma3
    ; separate multiply and add; XMM builds need four vectors for 64 bytes
    mulps    m1, m0, [srcq+lenq]
    mulps    m2, m0, [srcq+lenq+mmsize]
  %if mmsize < 32
    mulps    m3, m0, [srcq+lenq+mmsize*2]
    mulps    m4, m0, [srcq+lenq+mmsize*3]
  %endif
    addps    m1, m1, [dstq+lenq]
    addps    m2, m2, [dstq+lenq+mmsize]
  %if mmsize < 32
    addps    m3, m3, [dstq+lenq+mmsize*2]
    addps    m4, m4, [dstq+lenq+mmsize*3]
  %endif
%endif ; fma3
    mova  [dstq+lenq], m1
    mova  [dstq+lenq+mmsize], m2
%if mmsize < 32
    mova  [dstq+lenq+mmsize*2], m3
    mova  [dstq+lenq+mmsize*3], m4
%endif
    sub    lenq, 64
    jge .chunk
    REP_RET
%endmacro

; SSE build, then AVX and FMA3 (both YMM) builds when available.
INIT_XMM sse
VECTOR_FMAC_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMAC_SCALAR
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_FMAC_SCALAR
%endif
157cabdff1aSopenharmony_ci
;------------------------------------------------------------------------------
; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len)
;
; Scale a float array by a constant:
;     dst[i] = src[i] * mul
;
; One vector (mmsize bytes) per iteration, from the end of the arrays towards
; the start. The body runs before the counter is tested, so len is
; presumably a positive multiple of the vector element count -- confirm
; against the callers.
;------------------------------------------------------------------------------

%macro VECTOR_FMUL_SCALAR 0
    ; On UNIX64 mul is already in xmm0 and is not a GPR argument.
%if UNIX64
cglobal vector_fmul_scalar, 3,3,2, dst, src, len
%else
cglobal vector_fmul_scalar, 4,4,3, dst, src, mul, len
%endif

    ; Get mul into lane 0 of m0 ...
%if ARCH_X86_32
    movss    m0, mulm                   ; from the stack
%elif WIN64
    SWAP 0, 2                           ; third argument arrives in xmm2
%endif
    ; ... and replicate it to all four lanes.
    shufps   m0, m0, 0

    lea    lenq, [lend*4-mmsize]        ; byte offset of the last vector
.vec:
    mova     m1, [srcq+lenq]
    mulps    m1, m0
    mova  [dstq+lenq], m1
    sub    lenq, mmsize
    jge .vec
    REP_RET
%endmacro

; Only an SSE build of this routine is generated.
INIT_XMM sse
VECTOR_FMUL_SCALAR
186cabdff1aSopenharmony_ci
;------------------------------------------------------------------------------
; void ff_vector_dmac_scalar(double *dst, const double *src, double mul,
;                            int len)
;
; Multiply-accumulate with a scalar, double precision:
;     dst[i] += src[i] * mul
;
; Four vectors (4*mmsize bytes) per iteration, from the end of the arrays
; towards the start. All memory accesses are aligned (movaps); len is
; presumably a positive multiple of the per-iteration element count --
; confirm against the callers.
;------------------------------------------------------------------------------

%macro VECTOR_DMAC_SCALAR 0
%if ARCH_X86_32
    ; The double occupies two argument slots (mul, len), so the real length
    ; is read from the fifth slot, named lenaddr here.
cglobal vector_dmac_scalar, 2,4,5, dst, src, mul, len, lenaddr
    mov          lenq, lenaddrm
    VBROADCASTSD m0, mulm
%else
  %if UNIX64
    ; mul arrives in xmm0 and is not a GPR argument
cglobal vector_dmac_scalar, 3,3,5, dst, src, len
  %else
cglobal vector_dmac_scalar, 4,4,5, dst, src, mul, len
    SWAP 0, 2                           ; third argument arrives in xmm2
  %endif
    movlhps     xm0, xm0                ; duplicate the low double into the high one
  %if cpuflag(avx)
    vinsertf128  m0, m0, xm0, 1         ; mirror into the high 128 bits
  %endif
%endif

    lea    lenq, [lend*8-mmsize*4]      ; byte offset of the last 4-vector group
.group:
%if cpuflag(fma3)
    ; fused path: m = mul * src + dst
    movaps   m1,     [dstq+lenq]
    movaps   m2,     [dstq+lenq+mmsize]
    movaps   m3,     [dstq+lenq+mmsize*2]
    movaps   m4,     [dstq+lenq+mmsize*3]
    fmaddpd  m1, m0, [srcq+lenq], m1
    fmaddpd  m2, m0, [srcq+lenq+mmsize], m2
    fmaddpd  m3, m0, [srcq+lenq+mmsize*2], m3
    fmaddpd  m4, m0, [srcq+lenq+mmsize*3], m4
%else ; no fma3
    ; separate multiply, then add the current dst contents
    mulpd    m1, m0, [srcq+lenq]
    mulpd    m2, m0, [srcq+lenq+mmsize]
    mulpd    m3, m0, [srcq+lenq+mmsize*2]
    mulpd    m4, m0, [srcq+lenq+mmsize*3]
    addpd    m1, m1, [dstq+lenq]
    addpd    m2, m2, [dstq+lenq+mmsize]
    addpd    m3, m3, [dstq+lenq+mmsize*2]
    addpd    m4, m4, [dstq+lenq+mmsize*3]
%endif ; fma3
    movaps [dstq+lenq], m1
    movaps [dstq+lenq+mmsize], m2
    movaps [dstq+lenq+mmsize*2], m3
    movaps [dstq+lenq+mmsize*3], m4
    sub    lenq, mmsize*4
    jge .group
    REP_RET
%endmacro

; SSE2 build, then AVX and FMA3 (both YMM) builds when available.
INIT_XMM sse2
VECTOR_DMAC_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMAC_SCALAR
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_DMAC_SCALAR
%endif
249cabdff1aSopenharmony_ci
;------------------------------------------------------------------------------
; void ff_vector_dmul_scalar(double *dst, const double *src, double mul,
;                            int len)
;
; Scale a double array by a constant:
;     dst[i] = src[i] * mul
;
; Two vectors (2*mmsize bytes) per iteration, from the end of the arrays
; towards the start. Stores use movaps, so dst must be aligned to the vector
; size; len is presumably a positive multiple of the per-iteration element
; count -- confirm against the callers.
;------------------------------------------------------------------------------

%macro VECTOR_DMUL_SCALAR 0
%if ARCH_X86_32
    ; The double occupies two argument slots (mul, len), so the real length
    ; is read from the fifth slot, named lenaddr here.
cglobal vector_dmul_scalar, 3,4,3, dst, src, mul, len, lenaddr
    mov          lenq, lenaddrm
%elif UNIX64
    ; mul arrives in xmm0 and is not a GPR argument
cglobal vector_dmul_scalar, 3,3,3, dst, src, len
%else
cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len
%endif

    ; ---- broadcast mul into every lane of m0 ----
%if ARCH_X86_32
    VBROADCASTSD   m0, mulm
%else
  %if WIN64
    SWAP 0, 2                           ; third argument arrives in xmm2
  %endif
    movlhps       xm0, xm0              ; duplicate the low double into the high one
  %if cpuflag(avx)
    vinsertf128   ym0, ym0, xm0, 1      ; mirror into the high 128 bits
  %endif
%endif

    lea          lenq, [lend*8-2*mmsize] ; byte offset of the last vector pair
.pair:
    mulpd          m1, m0, [srcq+lenq]
    mulpd          m2, m0, [srcq+lenq+mmsize]
    movaps [dstq+lenq], m1
    movaps [dstq+lenq+mmsize], m2
    sub          lenq, 2*mmsize
    jge .pair
    REP_RET
%endmacro

; SSE2 (XMM) build, plus an AVX (YMM) build when available.
INIT_XMM sse2
VECTOR_DMUL_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMUL_SCALAR
%endif
292cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; vector_fmul_window(float *dst, const float *src0,
;                    const float *src1, const float *win, int len);
;
; Windowed overlap of two len-float inputs into a 2*len-float output.
; With i running over the first half and j = 2*len-1-i over the mirrored
; second half (indices relative to the start of dst/win):
;     dst[i] = src0[i] * win[j] - src1[j-len] * win[i]
;     dst[j] = src0[i] * win[i] + src1[j-len] * win[j]
;
; Two cursors are used: lenq climbs from -len*4 to 0 (relative to the
; advanced dst/src0/win pointers) while len1q descends from len*4-mmsize.
; Four floats are handled at each cursor per iteration; all accesses are
; aligned. len is presumably a positive multiple of 4 -- confirm against
; the callers.
;-----------------------------------------------------------------------------
INIT_XMM sse
cglobal vector_fmul_window, 5, 6, 6, dst, src0, src1, win, len, len1
    shl     lend, 2                     ; len in bytes (also clears the upper half of lenq)
    lea    len1q, [lenq - mmsize]       ; descending cursor: last vector of the half
    add     winq, lenq                  ; point dst/src0/win at their midpoints
    add     dstq, lenq
    add    src0q, lenq
    neg     lenq                        ; ascending cursor: starts at -len bytes
.step:
    ; gather the four operands
    mova      m4, [src0q + lenq]        ; s0
    mova      m5, [src1q + len1q]       ; s1 (still in memory order)
    mova      m0, [winq  + lenq]        ; wi
    mova      m1, [winq  + len1q]       ; wj (still in memory order)
    shufps    m5, m5, 0x1b              ; reverse lanes so s1 lines up with s0
    shufps    m1, m1, 0x1b              ; reverse lanes so wj lines up with wi
    mova      m3, m1                    ; keep copies for the second output
    mova      m2, m0
    ; first-half output: s0*wj - s1*wi
    mulps     m1, m4
    mulps     m0, m5
    ; second-half output: s0*wi + s1*wj
    mulps     m2, m4
    mulps     m3, m5
    subps     m1, m0
    addps     m2, m3
    shufps    m2, m2, 0x1b              ; back to memory order for the mirrored store
    mova      [dstq + lenq], m1
    mova      [dstq + len1q], m2
    sub       len1q, mmsize
    add       lenq,  mmsize             ; flags from this add drive the branch
    jl .step
    REP_RET
327cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; vector_fmul_add(float *dst, const float *src0, const float *src1,
;                 const float *src2, int len)
;
; Element-wise multiply-add:
;     dst[i] = src0[i] * src1[i] + src2[i]
;
; Two vectors (2*mmsize bytes) per iteration, from the end of the arrays
; towards the start. The body runs before the counter is tested and uses
; aligned moves, so len is presumably a positive multiple of the
; per-iteration element count and the buffers aligned -- confirm against
; the callers.
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL_ADD 0
cglobal vector_fmul_add, 5,5,4, dst, src0, src1, src2, len
    lea       lenq, [lend*4 - 2*mmsize] ; byte offset of the last vector pair
ALIGN 16
.pair:
    mova    m0,   [src0q + lenq]
    mova    m1,   [src0q + lenq + mmsize]
%if cpuflag(fma3)
    ; fused path: m = src0 * src1 + src2
    mova    m2,     [src2q + lenq]
    mova    m3,     [src2q + lenq + mmsize]
    fmaddps m0, m0, [src1q + lenq], m2
    fmaddps m1, m1, [src1q + lenq + mmsize], m3
%else
    ; separate multiply and add
    mulps   m0, m0, [src1q + lenq]
    mulps   m1, m1, [src1q + lenq + mmsize]
    addps   m0, m0, [src2q + lenq]
    addps   m1, m1, [src2q + lenq + mmsize]
%endif
    mova    [dstq + lenq], m0
    mova    [dstq + lenq + mmsize], m1

    sub     lenq,   2*mmsize
    jge     .pair
    REP_RET
%endmacro

; SSE build, then AVX and FMA3 (both YMM) builds when available.
INIT_XMM sse
VECTOR_FMUL_ADD
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_ADD
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_FMUL_ADD
%endif
368cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
;                          int len)
;
; Multiply src0 by src1 read back-to-front:
;     dst[i] = src0[i] * src1[len - 1 - i]
;
; src1 is consumed forwards from its start while src0/dst are walked from
; the end towards the start, two vectors (2*mmsize bytes) per iteration.
; Each src1 vector is lane-reversed before the multiply:
;   - AVX2: one vpermps with the pd_reverse index table
;   - AVX : swap the two 128-bit halves while loading, then reverse the
;           floats inside each half with vshufps
;   - SSE : shufps on each 128-bit vector
; Accesses are aligned; len is presumably a positive multiple of the
; per-iteration element count -- confirm against the callers.
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL_REVERSE 0
%if cpuflag(avx2)
    ; m2 holds the permutation indices, so three vector registers are in use.
cglobal vector_fmul_reverse, 4,4,3, dst, src0, src1, len
    movaps  m2, [pd_reverse]
%else
cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
%endif
    lea       lenq, [lend*4 - 2*mmsize] ; byte offset of the last vector pair in src0/dst
ALIGN 16
.loop:
%if cpuflag(avx2)
    vpermps m0, m2, [src1q]
    vpermps m1, m2, [src1q+mmsize]
%elif cpuflag(avx)
    ; low half <- upper 16 bytes, high half <- lower 16 bytes, then reverse
    ; the four floats inside each half
    vmovaps     xm0, [src1q + 16]
    vinsertf128 m0, m0, [src1q], 1
    vshufps     m0, m0, m0, q0123
    vmovaps     xm1, [src1q + mmsize + 16]
    vinsertf128 m1, m1, [src1q + mmsize], 1
    vshufps     m1, m1, m1, q0123
%else
    mova    m0, [src1q]
    mova    m1, [src1q + mmsize]
    shufps  m0, m0, q0123
    shufps  m1, m1, q0123
%endif
    ; The first src1 vector pairs with the LAST src0 vector of this step.
    mulps   m0, m0, [src0q + lenq + mmsize]
    mulps   m1, m1, [src0q + lenq]
    movaps  [dstq + lenq + mmsize], m0
    movaps  [dstq + lenq], m1
    add     src1q, 2*mmsize             ; src1 moves forwards ...
    sub     lenq,  2*mmsize             ; ... while src0/dst move backwards
    jge     .loop
    REP_RET
%endmacro

; SSE build, then AVX and AVX2 (both YMM) builds when available.
INIT_XMM sse
VECTOR_FMUL_REVERSE
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_REVERSE
%endif
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
VECTOR_FMUL_REVERSE
%endif
417cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
;
; Dot product of two float arrays: returns the sum of v1[i] * v2[i].
;
; Four partial sums are accumulated in xmm0, four floats per iteration,
; then added horizontally. The third argument is reused as a negative byte
; offset that counts up to zero. The body runs before the offset is tested
; and uses aligned loads, so len is presumably a positive multiple of 4 and
; the inputs 16-byte aligned -- confirm against the callers.
;
; Result: xmm0 on x86-64; on 32-bit builds it is also pushed onto the x87
; stack via a store/reload through the first argument's stack slot.
;-----------------------------------------------------------------------------
INIT_XMM sse
cglobal scalarproduct_float, 3,3,2, v1, v2, offset
    xorps    xmm0, xmm0                 ; four running sums = 0
    shl   offsetd, 2                    ; len -> bytes
    add       v1q, offsetq              ; point both inputs at their ends
    add       v2q, offsetq
    neg   offsetq                       ; offset = -bytes, counts up to 0
.accumulate:
    movaps   xmm1, [v1q+offsetq]
    mulps    xmm1, [v2q+offsetq]
    addps    xmm0, xmm1
    add   offsetq, 16
    js .accumulate                      ; keep going while the offset is negative

    ; horizontal add of the four partial sums into lane 0
    movhlps  xmm1, xmm0                 ; xmm1[0..1] = xmm0[2..3]
    addps    xmm0, xmm1                 ; lanes 0,1 = s0+s2, s1+s3
    movss    xmm1, xmm0                 ; xmm1[0] = s0+s2
    shufps   xmm0, xmm0, 1              ; xmm0[0] = s1+s3
    addss    xmm0, xmm1
%if ARCH_X86_64 == 0
    movss     r0m,  xmm0                ; bounce through memory to load x87 st0
    fld dword r0m
%endif
    RET
442cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; void ff_butterflies_float(float *src0, float *src1, int len);
;
; In-place sum/difference of two float arrays:
;     t       = src0[i] - src1[i]
;     src0[i] = src0[i] + src1[i]
;     src1[i] = t
;
; Both pointers are advanced to the end of their arrays and indexed with a
; negative byte offset that counts up to zero, one vector per iteration.
; Accesses are aligned and the body runs before the offset is tested, so
; len is presumably a positive multiple of 4 -- confirm against the callers.
;-----------------------------------------------------------------------------
INIT_XMM sse
cglobal butterflies_float, 3,3,3, src0, src1, len
    shl       lend, 2                   ; len -> bytes
    add      src1q, lenq                ; point both arrays at their ends
    add      src0q, lenq
    neg       lenq                      ; offset = -bytes, counts up to 0
.vec:
    mova        m1, [src1q + lenq]
    mova        m0, [src0q + lenq]
    subps       m2, m0, m1              ; difference first: it needs the original m0
    addps       m0, m0, m1              ; sum
    mova        [src1q + lenq], m2
    mova        [src0q + lenq], m0
    add       lenq, mmsize
    jl .vec
    REP_RET
462