;******************************************************************************
;* optimized audio functions
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
21cabdff1aSopenharmony_ci
22cabdff1aSopenharmony_ci%include "libavutil/x86/x86util.asm"
23cabdff1aSopenharmony_ci
24cabdff1aSopenharmony_ciSECTION .text
25cabdff1aSopenharmony_ci
26cabdff1aSopenharmony_ci; int ff_scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
27cabdff1aSopenharmony_ciINIT_XMM sse2
28cabdff1aSopenharmony_cicglobal scalarproduct_int16, 3,3,3, v1, v2, order
29cabdff1aSopenharmony_ci    add orderd, orderd
30cabdff1aSopenharmony_ci    add v1q, orderq
31cabdff1aSopenharmony_ci    add v2q, orderq
32cabdff1aSopenharmony_ci    neg orderq
33cabdff1aSopenharmony_ci    pxor    m2, m2
34cabdff1aSopenharmony_ci.loop:
35cabdff1aSopenharmony_ci    movu    m0, [v1q + orderq]
36cabdff1aSopenharmony_ci    movu    m1, [v1q + orderq + mmsize]
37cabdff1aSopenharmony_ci    pmaddwd m0, [v2q + orderq]
38cabdff1aSopenharmony_ci    pmaddwd m1, [v2q + orderq + mmsize]
39cabdff1aSopenharmony_ci    paddd   m2, m0
40cabdff1aSopenharmony_ci    paddd   m2, m1
41cabdff1aSopenharmony_ci    add     orderq, mmsize*2
42cabdff1aSopenharmony_ci    jl .loop
43cabdff1aSopenharmony_ci    HADDD   m2, m0
44cabdff1aSopenharmony_ci    movd   eax, m2
45cabdff1aSopenharmony_ci    RET
46cabdff1aSopenharmony_ci
47cabdff1aSopenharmony_ci
48cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
49cabdff1aSopenharmony_ci; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
50cabdff1aSopenharmony_ci;                           int32_t max, unsigned int len)
51cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
52cabdff1aSopenharmony_ci
53cabdff1aSopenharmony_ci; %1 = number of xmm registers used
54cabdff1aSopenharmony_ci; %2 = number of inline load/process/store loops per asm loop
55cabdff1aSopenharmony_ci; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
56cabdff1aSopenharmony_ci; %4 = CLIPD function takes min/max as float instead of int (SSE2 version)
57cabdff1aSopenharmony_ci; %5 = suffix
58cabdff1aSopenharmony_ci%macro VECTOR_CLIP_INT32 4-5
59cabdff1aSopenharmony_cicglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
60cabdff1aSopenharmony_ci%if %4
61cabdff1aSopenharmony_ci    cvtsi2ss  m4, minm
62cabdff1aSopenharmony_ci    cvtsi2ss  m5, maxm
63cabdff1aSopenharmony_ci%else
64cabdff1aSopenharmony_ci    movd      m4, minm
65cabdff1aSopenharmony_ci    movd      m5, maxm
66cabdff1aSopenharmony_ci%endif
67cabdff1aSopenharmony_ci    SPLATD    m4
68cabdff1aSopenharmony_ci    SPLATD    m5
69cabdff1aSopenharmony_ci.loop:
70cabdff1aSopenharmony_ci%assign %%i 0
71cabdff1aSopenharmony_ci%rep %2
72cabdff1aSopenharmony_ci    mova      m0,  [srcq + mmsize * (0 + %%i)]
73cabdff1aSopenharmony_ci    mova      m1,  [srcq + mmsize * (1 + %%i)]
74cabdff1aSopenharmony_ci    mova      m2,  [srcq + mmsize * (2 + %%i)]
75cabdff1aSopenharmony_ci    mova      m3,  [srcq + mmsize * (3 + %%i)]
76cabdff1aSopenharmony_ci%if %3
77cabdff1aSopenharmony_ci    mova      m7,  [srcq + mmsize * (4 + %%i)]
78cabdff1aSopenharmony_ci    mova      m8,  [srcq + mmsize * (5 + %%i)]
79cabdff1aSopenharmony_ci    mova      m9,  [srcq + mmsize * (6 + %%i)]
80cabdff1aSopenharmony_ci    mova      m10, [srcq + mmsize * (7 + %%i)]
81cabdff1aSopenharmony_ci%endif
82cabdff1aSopenharmony_ci    CLIPD  m0,  m4, m5, m6
83cabdff1aSopenharmony_ci    CLIPD  m1,  m4, m5, m6
84cabdff1aSopenharmony_ci    CLIPD  m2,  m4, m5, m6
85cabdff1aSopenharmony_ci    CLIPD  m3,  m4, m5, m6
86cabdff1aSopenharmony_ci%if %3
87cabdff1aSopenharmony_ci    CLIPD  m7,  m4, m5, m6
88cabdff1aSopenharmony_ci    CLIPD  m8,  m4, m5, m6
89cabdff1aSopenharmony_ci    CLIPD  m9,  m4, m5, m6
90cabdff1aSopenharmony_ci    CLIPD  m10, m4, m5, m6
91cabdff1aSopenharmony_ci%endif
92cabdff1aSopenharmony_ci    mova  [dstq + mmsize * (0 + %%i)], m0
93cabdff1aSopenharmony_ci    mova  [dstq + mmsize * (1 + %%i)], m1
94cabdff1aSopenharmony_ci    mova  [dstq + mmsize * (2 + %%i)], m2
95cabdff1aSopenharmony_ci    mova  [dstq + mmsize * (3 + %%i)], m3
96cabdff1aSopenharmony_ci%if %3
97cabdff1aSopenharmony_ci    mova  [dstq + mmsize * (4 + %%i)], m7
98cabdff1aSopenharmony_ci    mova  [dstq + mmsize * (5 + %%i)], m8
99cabdff1aSopenharmony_ci    mova  [dstq + mmsize * (6 + %%i)], m9
100cabdff1aSopenharmony_ci    mova  [dstq + mmsize * (7 + %%i)], m10
101cabdff1aSopenharmony_ci%endif
102cabdff1aSopenharmony_ci%assign %%i (%%i + 4 * (1 + %3))
103cabdff1aSopenharmony_ci%endrep
104cabdff1aSopenharmony_ci    add     srcq, mmsize*4*(%2+%3)
105cabdff1aSopenharmony_ci    add     dstq, mmsize*4*(%2+%3)
106cabdff1aSopenharmony_ci    sub     lend, mmsize*(%2+%3)
107cabdff1aSopenharmony_ci    jg .loop
108cabdff1aSopenharmony_ci    REP_RET
109cabdff1aSopenharmony_ci%endmacro
110cabdff1aSopenharmony_ci
111cabdff1aSopenharmony_ciINIT_XMM sse2
112cabdff1aSopenharmony_ciVECTOR_CLIP_INT32 6, 1, 0, 0, _int
113cabdff1aSopenharmony_ciVECTOR_CLIP_INT32 6, 2, 0, 1
114cabdff1aSopenharmony_ciINIT_XMM sse4
115cabdff1aSopenharmony_ci%ifdef m8
116cabdff1aSopenharmony_ciVECTOR_CLIP_INT32 11, 1, 1, 0
117cabdff1aSopenharmony_ci%else
118cabdff1aSopenharmony_ciVECTOR_CLIP_INT32 6, 1, 0, 0
119cabdff1aSopenharmony_ci%endif
120cabdff1aSopenharmony_ci
121cabdff1aSopenharmony_ci; void ff_vector_clipf_sse(float *dst, const float *src,
122cabdff1aSopenharmony_ci;                          int len, float min, float max)
123cabdff1aSopenharmony_ciINIT_XMM sse
124cabdff1aSopenharmony_cicglobal vector_clipf, 3, 3, 6, dst, src, len, min, max
125cabdff1aSopenharmony_ci%if ARCH_X86_32
126cabdff1aSopenharmony_ci    VBROADCASTSS m0, minm
127cabdff1aSopenharmony_ci    VBROADCASTSS m1, maxm
128cabdff1aSopenharmony_ci%elif WIN64
129cabdff1aSopenharmony_ci    SWAP 0, 3
130cabdff1aSopenharmony_ci    VBROADCASTSS m0, m0
131cabdff1aSopenharmony_ci    VBROADCASTSS m1, maxm
132cabdff1aSopenharmony_ci%else ; 64bit sysv
133cabdff1aSopenharmony_ci    VBROADCASTSS m0, m0
134cabdff1aSopenharmony_ci    VBROADCASTSS m1, m1
135cabdff1aSopenharmony_ci%endif
136cabdff1aSopenharmony_ci
137cabdff1aSopenharmony_ci    movsxdifnidn lenq, lend
138cabdff1aSopenharmony_ci
139cabdff1aSopenharmony_ci.loop:
140cabdff1aSopenharmony_ci    mova m2, [srcq + 4 * lenq - 4 * mmsize]
141cabdff1aSopenharmony_ci    mova m3, [srcq + 4 * lenq - 3 * mmsize]
142cabdff1aSopenharmony_ci    mova m4, [srcq + 4 * lenq - 2 * mmsize]
143cabdff1aSopenharmony_ci    mova m5, [srcq + 4 * lenq - 1 * mmsize]
144cabdff1aSopenharmony_ci
145cabdff1aSopenharmony_ci    maxps m2, m0
146cabdff1aSopenharmony_ci    maxps m3, m0
147cabdff1aSopenharmony_ci    maxps m4, m0
148cabdff1aSopenharmony_ci    maxps m5, m0
149cabdff1aSopenharmony_ci
150cabdff1aSopenharmony_ci    minps m2, m1
151cabdff1aSopenharmony_ci    minps m3, m1
152cabdff1aSopenharmony_ci    minps m4, m1
153cabdff1aSopenharmony_ci    minps m5, m1
154cabdff1aSopenharmony_ci
155cabdff1aSopenharmony_ci    mova [dstq + 4 * lenq - 4 * mmsize], m2
156cabdff1aSopenharmony_ci    mova [dstq + 4 * lenq - 3 * mmsize], m3
157cabdff1aSopenharmony_ci    mova [dstq + 4 * lenq - 2 * mmsize], m4
158cabdff1aSopenharmony_ci    mova [dstq + 4 * lenq - 1 * mmsize], m5
159cabdff1aSopenharmony_ci
160cabdff1aSopenharmony_ci    sub lenq, mmsize
161cabdff1aSopenharmony_ci    jg .loop
162cabdff1aSopenharmony_ci
163cabdff1aSopenharmony_ci    RET
164