;*****************************************************************************
;* x86-optimized Float DSP functions
;*
;* Copyright 2006 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32
pd_reverse: dd 7, 6, 5, 4, 3, 2, 1, 0

SECTION .text

;-----------------------------------------------------------------------------
; void vector_fmul(float *dst, const float *src0, const float *src1, int len)
;-----------------------------------------------------------------------------
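; Scalar C sketch of what the loop below computes (illustration only; the
; vector code uses aligned loads/stores and processes 16 floats, i.e. 64
; bytes, per iteration regardless of register width):
;
;     for (int i = 0; i < len; i++)
;         dst[i] = src0[i] * src1[i];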
%macro VECTOR_FMUL 0
cglobal vector_fmul, 4,4,2, dst, src0, src1, len
    lea       lenq, [lend*4 - 64]
ALIGN 16
.loop:
%assign a 0
%rep 32/mmsize
    mova      m0,   [src0q + lenq + (a+0)*mmsize]
    mova      m1,   [src0q + lenq + (a+1)*mmsize]
    mulps     m0, m0, [src1q + lenq + (a+0)*mmsize]
    mulps     m1, m1, [src1q + lenq + (a+1)*mmsize]
    mova      [dstq + lenq + (a+0)*mmsize], m0
    mova      [dstq + lenq + (a+1)*mmsize], m1
%assign a a+2
%endrep

    sub       lenq, 64
    jge       .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL
%endif

;-----------------------------------------------------------------------------
; void vector_dmul(double *dst, const double *src0, const double *src1, int len)
;-----------------------------------------------------------------------------
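; Scalar C sketch of the operation (illustration only; the loop consumes
; mmsize*4 bytes per iteration, i.e. 8 doubles with SSE2 or 16 with AVX,
; with aligned loads and stores):
;
;     for (int i = 0; i < len; i++)
;         dst[i] = src0[i] * src1[i];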
%macro VECTOR_DMUL 0
cglobal vector_dmul, 4,4,4, dst, src0, src1, len
    lea       lend, [lenq*8 - mmsize*4]
ALIGN 16
.loop:
    movaps    m0,     [src0q + lenq + 0*mmsize]
    movaps    m1,     [src0q + lenq + 1*mmsize]
    movaps    m2,     [src0q + lenq + 2*mmsize]
    movaps    m3,     [src0q + lenq + 3*mmsize]
    mulpd     m0, m0, [src1q + lenq + 0*mmsize]
    mulpd     m1, m1, [src1q + lenq + 1*mmsize]
    mulpd     m2, m2, [src1q + lenq + 2*mmsize]
    mulpd     m3, m3, [src1q + lenq + 3*mmsize]
    movaps    [dstq + lenq + 0*mmsize], m0
    movaps    [dstq + lenq + 1*mmsize], m1
    movaps    [dstq + lenq + 2*mmsize], m2
    movaps    [dstq + lenq + 3*mmsize], m3

    sub       lenq, mmsize*4
    jge       .loop
    RET
%endmacro

INIT_XMM sse2
VECTOR_DMUL
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMUL
%endif

;------------------------------------------------------------------------------
; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------
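; Scalar C sketch of the operation (illustration only; the loop processes
; 64 bytes, i.e. 16 floats, per iteration with aligned accesses):
;
;     for (int i = 0; i < len; i++)
;         dst[i] += src[i] * mul;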

%macro VECTOR_FMAC_SCALAR 0
%if UNIX64
cglobal vector_fmac_scalar, 3,3,5, dst, src, len
%else
cglobal vector_fmac_scalar, 4,4,5, dst, src, mul, len
%endif
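; Broadcast the scalar multiplier to every lane of m0.  With the UNIX64 ABI
; the lone float argument already arrives in xmm0; on WIN64 it is the third
; argument and therefore sits in xmm2 (hence the SWAP); on x86-32 it is
; loaded from the stack via mulm.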
%if ARCH_X86_32
    VBROADCASTSS m0, mulm
%else
%if WIN64
    SWAP 0, 2
%endif
    shufps      xm0, xm0, 0
%if cpuflag(avx)
    vinsertf128  m0, m0, xm0, 1
%endif
%endif
    lea    lenq, [lend*4-64]
.loop:
%if cpuflag(fma3)
    mova     m1,     [dstq+lenq]
    mova     m2,     [dstq+lenq+1*mmsize]
    fmaddps  m1, m0, [srcq+lenq], m1
    fmaddps  m2, m0, [srcq+lenq+1*mmsize], m2
%else ; cpuflag
    mulps    m1, m0, [srcq+lenq]
    mulps    m2, m0, [srcq+lenq+1*mmsize]
%if mmsize < 32
    mulps    m3, m0, [srcq+lenq+2*mmsize]
    mulps    m4, m0, [srcq+lenq+3*mmsize]
%endif ; mmsize
    addps    m1, m1, [dstq+lenq]
    addps    m2, m2, [dstq+lenq+1*mmsize]
%if mmsize < 32
    addps    m3, m3, [dstq+lenq+2*mmsize]
    addps    m4, m4, [dstq+lenq+3*mmsize]
%endif ; mmsize
%endif ; cpuflag
    mova  [dstq+lenq], m1
    mova  [dstq+lenq+1*mmsize], m2
%if mmsize < 32
    mova  [dstq+lenq+2*mmsize], m3
    mova  [dstq+lenq+3*mmsize], m4
%endif ; mmsize
    sub    lenq, 64
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMAC_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMAC_SCALAR
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_FMAC_SCALAR
%endif

;------------------------------------------------------------------------------
; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------
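; Scalar C sketch (illustration only; SSE-only version, 4 floats per
; iteration, aligned accesses):
;
;     for (int i = 0; i < len; i++)
;         dst[i] = src[i] * mul;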

%macro VECTOR_FMUL_SCALAR 0
%if UNIX64
cglobal vector_fmul_scalar, 3,3,2, dst, src, len
%else
cglobal vector_fmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    movss    m0, mulm
%elif WIN64
    SWAP 0, 2
%endif
    shufps   m0, m0, 0
    lea    lenq, [lend*4-mmsize]
.loop:
    mova     m1, [srcq+lenq]
    mulps    m1, m0
    mova  [dstq+lenq], m1
    sub    lenq, mmsize
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_SCALAR

;------------------------------------------------------------------------------
; void ff_vector_dmac_scalar(double *dst, const double *src, double mul,
;                            int len)
;------------------------------------------------------------------------------
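; Scalar C sketch (illustration only; mmsize*4 bytes per iteration, i.e.
; 8 doubles with SSE2 or 16 with AVX/FMA3):
;
;     for (int i = 0; i < len; i++)
;         dst[i] += src[i] * mul;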

%macro VECTOR_DMAC_SCALAR 0
%if ARCH_X86_32
cglobal vector_dmac_scalar, 2,4,5, dst, src, mul, len, lenaddr
    mov          lenq, lenaddrm
    VBROADCASTSD m0, mulm
%else
%if UNIX64
cglobal vector_dmac_scalar, 3,3,5, dst, src, len
%else
cglobal vector_dmac_scalar, 4,4,5, dst, src, mul, len
    SWAP 0, 2
%endif
    movlhps     xm0, xm0
%if cpuflag(avx)
    vinsertf128  m0, m0, xm0, 1
%endif
%endif
    lea    lenq, [lend*8-mmsize*4]
.loop:
%if cpuflag(fma3)
    movaps   m1,     [dstq+lenq]
    movaps   m2,     [dstq+lenq+1*mmsize]
    movaps   m3,     [dstq+lenq+2*mmsize]
    movaps   m4,     [dstq+lenq+3*mmsize]
    fmaddpd  m1, m0, [srcq+lenq], m1
    fmaddpd  m2, m0, [srcq+lenq+1*mmsize], m2
    fmaddpd  m3, m0, [srcq+lenq+2*mmsize], m3
    fmaddpd  m4, m0, [srcq+lenq+3*mmsize], m4
%else ; cpuflag
    mulpd    m1, m0, [srcq+lenq]
    mulpd    m2, m0, [srcq+lenq+1*mmsize]
    mulpd    m3, m0, [srcq+lenq+2*mmsize]
    mulpd    m4, m0, [srcq+lenq+3*mmsize]
    addpd    m1, m1, [dstq+lenq]
    addpd    m2, m2, [dstq+lenq+1*mmsize]
    addpd    m3, m3, [dstq+lenq+2*mmsize]
    addpd    m4, m4, [dstq+lenq+3*mmsize]
%endif ; cpuflag
    movaps [dstq+lenq], m1
    movaps [dstq+lenq+1*mmsize], m2
    movaps [dstq+lenq+2*mmsize], m3
    movaps [dstq+lenq+3*mmsize], m4
    sub    lenq, mmsize*4
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse2
VECTOR_DMAC_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMAC_SCALAR
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_DMAC_SCALAR
%endif

;------------------------------------------------------------------------------
; void ff_vector_dmul_scalar(double *dst, const double *src, double mul,
;                            int len)
;------------------------------------------------------------------------------
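; Scalar C sketch (illustration only; 2*mmsize bytes per iteration, i.e.
; 4 doubles with SSE2 or 8 with AVX):
;
;     for (int i = 0; i < len; i++)
;         dst[i] = src[i] * mul;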

%macro VECTOR_DMUL_SCALAR 0
%if ARCH_X86_32
cglobal vector_dmul_scalar, 3,4,3, dst, src, mul, len, lenaddr
    mov          lenq, lenaddrm
%elif UNIX64
cglobal vector_dmul_scalar, 3,3,3, dst, src, len
%else
cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    VBROADCASTSD   m0, mulm
%else
%if WIN64
    SWAP 0, 2
%endif
    movlhps       xm0, xm0
%if cpuflag(avx)
    vinsertf128   ym0, ym0, xm0, 1
%endif
%endif
    lea          lenq, [lend*8-2*mmsize]
.loop:
    mulpd          m1, m0, [srcq+lenq       ]
    mulpd          m2, m0, [srcq+lenq+mmsize]
    movaps [dstq+lenq       ], m1
    movaps [dstq+lenq+mmsize], m2
    sub          lenq, 2*mmsize
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse2
VECTOR_DMUL_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMUL_SCALAR
%endif

;-----------------------------------------------------------------------------
; vector_fmul_window(float *dst, const float *src0,
;                    const float *src1, const float *win, int len);
;-----------------------------------------------------------------------------
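; Scalar C sketch of the operation performed below (illustration only; the
; loop walks inward from both ends, 4 floats per side and iteration):
;
;     dst += len; win += len; src0 += len;
;     for (int i = -len, j = len - 1; i < 0; i++, j--) {
;         float s0 = src0[i], s1 = src1[j], wi = win[i], wj = win[j];
;         dst[i] = s0 * wj - s1 * wi;
;         dst[j] = s0 * wi + s1 * wj;
;     }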
INIT_XMM sse
cglobal vector_fmul_window, 5, 6, 6, dst, src0, src1, win, len, len1
    shl     lend, 2
    lea    len1q, [lenq - mmsize]
    add    src0q, lenq
    add     dstq, lenq
    add     winq, lenq
    neg     lenq
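    ; lenq counts up from -len*4 towards 0 and addresses the first half of
    ; win/dst and all of src0; len1q counts down from (len-4)*4 and addresses
    ; the second half of win/dst and src1 from its top end.  shufps 0x1b
    ; reverses the four floats of the descending-index loads.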
.loop:
    mova      m0, [winq  + lenq]
    mova      m4, [src0q + lenq]
    mova      m1, [winq  + len1q]
    mova      m5, [src1q + len1q]
    shufps    m1, m1, 0x1b
    shufps    m5, m5, 0x1b
    mova      m2, m0
    mova      m3, m1
    mulps     m2, m4
    mulps     m3, m5
    mulps     m1, m4
    mulps     m0, m5
    addps     m2, m3
    subps     m1, m0
    shufps    m2, m2, 0x1b
    mova      [dstq + lenq], m1
    mova      [dstq + len1q], m2
    sub       len1q, mmsize
    add       lenq,  mmsize
    jl .loop
    REP_RET

;-----------------------------------------------------------------------------
; vector_fmul_add(float *dst, const float *src0, const float *src1,
;                 const float *src2, int len)
;-----------------------------------------------------------------------------
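; Scalar C sketch (illustration only; 2*mmsize bytes per iteration, i.e.
; 8 floats with SSE or 16 with AVX/FMA3):
;
;     for (int i = 0; i < len; i++)
;         dst[i] = src0[i] * src1[i] + src2[i];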
%macro VECTOR_FMUL_ADD 0
cglobal vector_fmul_add, 5,5,4, dst, src0, src1, src2, len
    lea       lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
    mova    m0,   [src0q + lenq]
    mova    m1,   [src0q + lenq + mmsize]
%if cpuflag(fma3)
    mova    m2,     [src2q + lenq]
    mova    m3,     [src2q + lenq + mmsize]
    fmaddps m0, m0, [src1q + lenq], m2
    fmaddps m1, m1, [src1q + lenq + mmsize], m3
%else
    mulps   m0, m0, [src1q + lenq]
    mulps   m1, m1, [src1q + lenq + mmsize]
    addps   m0, m0, [src2q + lenq]
    addps   m1, m1, [src2q + lenq + mmsize]
%endif
    mova    [dstq + lenq], m0
    mova    [dstq + lenq + mmsize], m1

    sub     lenq,   2*mmsize
    jge     .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_ADD
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_ADD
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_FMUL_ADD
%endif

;-----------------------------------------------------------------------------
; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
;                          int len)
;-----------------------------------------------------------------------------
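; Scalar C sketch (illustration only; src1 is read forwards while src0/dst
; are walked backwards, 2*mmsize bytes per iteration):
;
;     for (int i = 0; i < len; i++)
;         dst[i] = src0[i] * src1[len - 1 - i];
;
; Element reversal: the SSE path reverses 4 floats with shufps q0123; a ymm
; shufps cannot cross the 128-bit lane boundary, so the AVX path loads the
; two halves of each 32-byte block swapped (vinsertf128) and then reverses
; within each lane; AVX2 reverses all 8 floats at once with vpermps and the
; pd_reverse constant.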
%macro VECTOR_FMUL_REVERSE 0
cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
%if cpuflag(avx2)
    movaps  m2, [pd_reverse]
%endif
    lea       lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
%if cpuflag(avx2)
    vpermps m0, m2, [src1q]
    vpermps m1, m2, [src1q+mmsize]
%elif cpuflag(avx)
    vmovaps     xmm0, [src1q + 16]
    vinsertf128 m0, m0, [src1q], 1
    vshufps     m0, m0, m0, q0123
    vmovaps     xmm1, [src1q + mmsize + 16]
    vinsertf128 m1, m1, [src1q + mmsize], 1
    vshufps     m1, m1, m1, q0123
%else
    mova    m0, [src1q]
    mova    m1, [src1q + mmsize]
    shufps  m0, m0, q0123
    shufps  m1, m1, q0123
%endif
    mulps   m0, m0, [src0q + lenq + mmsize]
    mulps   m1, m1, [src0q + lenq]
    movaps  [dstq + lenq + mmsize], m0
    movaps  [dstq + lenq], m1
    add     src1q, 2*mmsize
    sub     lenq,  2*mmsize
    jge     .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_REVERSE
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_REVERSE
%endif
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
VECTOR_FMUL_REVERSE
%endif

;-----------------------------------------------------------------------------
; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
;-----------------------------------------------------------------------------
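; Scalar C sketch (illustration only; 4 floats per iteration; the third
; argument, named offset here, is the element count, scaled to a byte count
; and negated so the loop can count up towards zero; on x86-32 the result is
; bounced through memory into st0, where floats are returned):
;
;     float p = 0.0f;
;     for (int i = 0; i < len; i++)
;         p += v1[i] * v2[i];
;     return p;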
INIT_XMM sse
cglobal scalarproduct_float, 3,3,2, v1, v2, offset
    shl   offsetd, 2
    add       v1q, offsetq
    add       v2q, offsetq
    neg   offsetq
    xorps    xmm0, xmm0
.loop:
    movaps   xmm1, [v1q+offsetq]
    mulps    xmm1, [v2q+offsetq]
    addps    xmm0, xmm1
    add   offsetq, 16
    js .loop
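    ; horizontal sum of the four partial products in xmm0: fold the high
    ; 64 bits onto the low 64 bits, then add the two remaining lanes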
    movhlps  xmm1, xmm0
    addps    xmm0, xmm1
    movss    xmm1, xmm0
    shufps   xmm0, xmm0, 1
    addss    xmm0, xmm1
%if ARCH_X86_64 == 0
    movss     r0m,  xmm0
    fld dword r0m
%endif
    RET

;-----------------------------------------------------------------------------
; void ff_butterflies_float(float *src0, float *src1, int len);
;-----------------------------------------------------------------------------
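; Scalar C sketch of the in-place butterfly (illustration only; 4 floats per
; iteration, both buffers updated in place):
;
;     for (int i = 0; i < len; i++) {
;         float t  = src0[i] - src1[i];
;         src0[i] += src1[i];
;         src1[i]  = t;
;     }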
INIT_XMM sse
cglobal butterflies_float, 3,3,3, src0, src1, len
    shl       lend, 2
    add      src0q, lenq
    add      src1q, lenq
    neg       lenq
.loop:
    mova        m0, [src0q + lenq]
    mova        m1, [src1q + lenq]
    subps       m2, m0, m1
    addps       m0, m0, m1
    mova        [src1q + lenq], m2
    mova        [src0q + lenq], m0
    add       lenq, mmsize
    jl .loop
    REP_RET