;******************************************************************************
;* x86 optimized channel mixing
;* Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; Macro layer used by every function below (cglobal, INIT_XMM/INIT_YMM, mova,
; VBROADCASTSS, SPLATW, FMULADD_PS, S16_TO_S32_SX, REP_RET/RET are not defined
; in this file, so they presumably come from these two includes -- TODO confirm
; which include provides which macro).
%include "libavutil/x86/x86util.asm"
%include "util.asm"

SECTION .text

;-----------------------------------------------------------------------------
; void ff_mix_2_to_1_fltp_flt(float **src, float **matrix, int len,
;                             int out_ch, int in_ch);
;
; Mixes two planar float channels down to one, in place:
;     src[0][i] = src[0][i] * matrix[0][0] + src[1][i] * matrix[0][1]
;
; Only the first three arguments are loaded; out_ch and in_ch are not read.
; Each loop iteration handles 2*mmsize bytes (mmsize/2 floats) of each channel
; and the loop body always runs at least once, so len is effectively rounded
; up to that granularity.
; NOTE(review): mova is presumably the aligned-move alias of the macro layer,
; which would require mmsize-aligned, suitably padded buffers -- confirm
; against the callers.
;-----------------------------------------------------------------------------

%macro MIX_2_TO_1_FLTP_FLT 0
cglobal mix_2_to_1_fltp_flt, 3,4,6, src, matrix, len, src1
    ; src1q becomes the byte distance from channel 0 to channel 1, so that
    ; [srcq+src1q] follows channel 1 while only srcq is advanced in the loop
    mov       src1q, [srcq+gprsize]
    mov        srcq, [srcq        ]
    sub       src1q, srcq
    ; m4 = matrix[0][0] and m5 = matrix[0][1], splatted across all lanes
    mov     matrixq, [matrixq  ]
    VBROADCASTSS m4, [matrixq  ]
    VBROADCASTSS m5, [matrixq+4]
    ALIGN 16
.loop:
    ; two vectors per channel are processed per iteration
    mulps        m0, m4, [srcq             ]
    mulps        m1, m5, [srcq+src1q       ]
    mulps        m2, m4, [srcq+      mmsize]
    mulps        m3, m5, [srcq+src1q+mmsize]
    addps        m0, m0, m1
    addps        m2, m2, m3
    ; the mixed result overwrites channel 0
    mova  [srcq       ], m0
    mova  [srcq+mmsize], m2
    add        srcq, mmsize*2
    ; len counts samples: 2*mmsize bytes / 4 bytes per float
    sub        lend, mmsize*2/4
    jg .loop
    REP_RET
%endmacro

; SSE build uses 128-bit registers; the AVX build (when enabled) uses 256-bit
INIT_XMM sse
MIX_2_TO_1_FLTP_FLT
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
MIX_2_TO_1_FLTP_FLT
%endif

;-----------------------------------------------------------------------------
; void ff_mix_2_to_1_s16p_flt(int16_t **src, float **matrix, int len,
;                             int out_ch, int in_ch);
;
; Mixes two planar int16 channels down to one, in place, with float
; coefficients:
;     src[0][i] = sat16(src[0][i] * matrix[0][0] + src[1][i] * matrix[0][1])
; The samples are widened to int32, converted to float, mixed, converted back
; with cvtps2dq (rounding follows the current MXCSR mode) and packed with
; signed saturation.
;
; Only the first three arguments are loaded; out_ch and in_ch are not read.
; Each iteration handles mmsize bytes (mmsize/2 samples) of each channel and
; the loop body always runs at least once.
; NOTE(review): S16_TO_S32_SX a, b is defined outside this file; it is assumed
; to sign-extend the int16 lanes of m<a> into int32 lanes with the low half in
; m<a> and the high half in m<b> (the packssdw operand order relies on that)
; -- confirm.
;-----------------------------------------------------------------------------

%macro MIX_2_TO_1_S16P_FLT 0
cglobal mix_2_to_1_s16p_flt, 3,4,6, src, matrix, len, src1
    ; src1q becomes the byte distance from channel 0 to channel 1
    mov       src1q, [srcq+gprsize]
    mov        srcq, [srcq]
    sub       src1q, srcq
    ; m4 = matrix[0][0] and m5 = matrix[0][1], splatted across all lanes
    mov     matrixq, [matrixq  ]
    VBROADCASTSS m4, [matrixq  ]
    VBROADCASTSS m5, [matrixq+4]
    ALIGN 16
.loop:
    mova         m0, [srcq      ]
    mova         m2, [srcq+src1q]
    ; m0/m1 = channel 0 as int32, m2/m3 = channel 1 as int32
    S16_TO_S32_SX 0, 1
    S16_TO_S32_SX 2, 3
    cvtdq2ps     m0, m0
    cvtdq2ps     m1, m1
    cvtdq2ps     m2, m2
    cvtdq2ps     m3, m3
    mulps        m0, m4
    mulps        m1, m4
    mulps        m2, m5
    mulps        m3, m5
    addps        m0, m2
    addps        m1, m3
    ; back to int32, then saturate to int16 and overwrite channel 0
    cvtps2dq     m0, m0
    cvtps2dq     m1, m1
    packssdw     m0, m1
    mova     [srcq], m0
    add        srcq, mmsize
    ; len counts samples: mmsize bytes / 2 bytes per int16
    sub        lend, mmsize/2
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
MIX_2_TO_1_S16P_FLT
INIT_XMM sse4
MIX_2_TO_1_S16P_FLT

;-----------------------------------------------------------------------------
; void ff_mix_2_to_1_s16p_q8(int16_t **src, int16_t **matrix, int len,
;                            int out_ch, int in_ch);
;
; Mixes two planar int16 channels down to one, in place, with int16
; coefficients that carry 8 fractional bits:
;     src[0][i] = sat16((src[0][i] * matrix[0][0] +
;                        src[1][i] * matrix[0][1]) >> 8)
; The shift is arithmetic (psrad) and the sum is a wrapping 32-bit add.
;
; Only the first three arguments are loaded; out_ch and in_ch are not read.
; Each iteration handles mmsize bytes (mmsize/2 samples) of each channel and
; the loop body always runs at least once.
; NOTE(review): SPLATW dst, src, n is defined outside this file; it is assumed
; to replicate 16-bit word n of src into every word of dst -- confirm.
;-----------------------------------------------------------------------------

INIT_XMM sse2
cglobal mix_2_to_1_s16p_q8, 3,4,6, src, matrix, len, src1
    ; src1q becomes the byte distance from channel 0 to channel 1
    mov       src1q, [srcq+gprsize]
    mov        srcq, [srcq]
    sub       src1q, srcq
    mov     matrixq, [matrixq]
    ; one 32-bit load fetches both int16 coefficients of matrix[0]
    movd         m4, [matrixq]
    movd         m5, [matrixq]
    SPLATW       m4, m4, 0
    SPLATW       m5, m5, 1
    ; interleave each coefficient with zero: every dword becomes (coeff, 0)
    pxor         m0, m0
    punpcklwd    m4, m0
    punpcklwd    m5, m0
    ALIGN 16
.loop:
    mova         m0, [srcq      ]
    mova         m2, [srcq+src1q]
    ; duplicate each sample into a word pair (s, s); pmaddwd against
    ; (coeff, 0) then yields the 32-bit product s*coeff per dword
    punpckhwd    m1, m0, m0
    punpcklwd    m0, m0
    punpckhwd    m3, m2, m2
    punpcklwd    m2, m2
    pmaddwd      m0, m4
    pmaddwd      m1, m4
    pmaddwd      m2, m5
    pmaddwd      m3, m5
    paddd        m0, m2
    paddd        m1, m3
    ; drop the 8 fractional bits of the coefficients
    psrad        m0, 8
    psrad        m1, 8
    ; saturate to int16 and overwrite channel 0
    packssdw     m0, m1
    mova     [srcq], m0
    add        srcq, mmsize
    ; len counts samples: mmsize bytes / 2 bytes per int16
    sub        lend, mmsize/2
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void ff_mix_1_to_2_fltp_flt(float **src, float **matrix, int len,
;                             int out_ch, int in_ch);
;
; Expands one planar float channel to two, in place:
;     src[1][i] = src[0][i] * matrix[1][0]
;     src[0][i] = src[0][i] * matrix[0][0]
; Both products are computed from the original src[0][i] before it is
; overwritten.
;
; Only the first three arguments are loaded; out_ch and in_ch are not read.
; Each iteration handles mmsize bytes (mmsize/4 floats) and the loop body
; always runs at least once.
;-----------------------------------------------------------------------------

%macro MIX_1_TO_2_FLTP_FLT 0
cglobal mix_1_to_2_fltp_flt, 3,5,4, src0, matrix0, len, src1, matrix1
    ; src1q becomes the byte distance from channel 0 to channel 1
    mov       src1q, [src0q+gprsize]
    mov       src0q, [src0q]
    sub       src1q, src0q
    ; m2 = matrix[0][0] and m3 = matrix[1][0], splatted across all lanes
    mov    matrix1q, [matrix0q+gprsize]
    mov    matrix0q, [matrix0q]
    VBROADCASTSS m2, [matrix0q]
    VBROADCASTSS m3, [matrix1q]
    ALIGN 16
.loop:
    mova         m0, [src0q]
    mulps        m1, m0, m3
    mulps        m0, m0, m2
    mova  [src0q      ], m0
    mova  [src0q+src1q], m1
    add       src0q, mmsize
    ; len counts samples: mmsize bytes / 4 bytes per float
    sub        lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

; SSE build uses 128-bit registers; the AVX build (when enabled) uses 256-bit
INIT_XMM sse
MIX_1_TO_2_FLTP_FLT
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
MIX_1_TO_2_FLTP_FLT
%endif

;-----------------------------------------------------------------------------
; void ff_mix_1_to_2_s16p_flt(int16_t **src, float **matrix, int len,
;                             int out_ch, int in_ch);
;
; Expands one planar int16 channel to two, in place, with float coefficients:
;     src[1][i] = sat16(src[0][i] * matrix[1][0])
;     src[0][i] = sat16(src[0][i] * matrix[0][0])
; The samples are widened to int32, converted to float, scaled, converted
; back with cvtps2dq (rounding follows the current MXCSR mode) and packed with
; signed saturation.
;
; Only the first three arguments are loaded; out_ch and in_ch are not read.
; Each iteration handles mmsize bytes (mmsize/2 samples) and the loop body
; always runs at least once.
; NOTE(review): S16_TO_S32_SX a, b is defined outside this file; it is assumed
; to sign-extend the int16 lanes of m<a> into int32 lanes with the low half in
; m<a> and the high half in m<b> (the packssdw operand order relies on that)
; -- confirm.
;-----------------------------------------------------------------------------

%macro MIX_1_TO_2_S16P_FLT 0
cglobal mix_1_to_2_s16p_flt, 3,5,6, src0, matrix0, len, src1, matrix1
    ; src1q becomes the byte distance from channel 0 to channel 1
    mov       src1q, [src0q+gprsize]
    mov       src0q, [src0q]
    sub       src1q, src0q
    ; m4 = matrix[0][0] and m5 = matrix[1][0], splatted across all lanes
    mov    matrix1q, [matrix0q+gprsize]
    mov    matrix0q, [matrix0q]
    VBROADCASTSS m4, [matrix0q]
    VBROADCASTSS m5, [matrix1q]
    ALIGN 16
.loop:
    mova         m0, [src0q]
    ; m0/m2 = input samples as int32, then as float
    S16_TO_S32_SX 0, 2
    cvtdq2ps     m0, m0
    cvtdq2ps     m2, m2
    ; m1/m3 = output channel 1, m0/m2 = output channel 0
    mulps        m1, m0, m5
    mulps        m0, m0, m4
    mulps        m3, m2, m5
    mulps        m2, m2, m4
    cvtps2dq     m0, m0
    cvtps2dq     m1, m1
    cvtps2dq     m2, m2
    cvtps2dq     m3, m3
    packssdw     m0, m2
    packssdw     m1, m3
    mova  [src0q      ], m0
    mova  [src0q+src1q], m1
    add       src0q, mmsize
    ; len counts samples: mmsize bytes / 2 bytes per int16
    sub        lend, mmsize/2
    jg .loop
    REP_RET
%endmacro

; all variants use 128-bit registers
INIT_XMM sse2
MIX_1_TO_2_S16P_FLT
INIT_XMM sse4
MIX_1_TO_2_S16P_FLT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
MIX_1_TO_2_S16P_FLT
%endif

;-----------------------------------------------------------------------------
; void ff_mix_3_8_to_1_2_fltp/s16p_flt(float/int16_t **src, float **matrix,
;                                      int len, int out_ch, int in_ch);
;
; Generates one function that mixes %1 planar input channels down to %2
; output channels (1 or 2) with float coefficients, in place:
;     src[0][i] = sum over k of src[k][i] * matrix[0][k]
;     src[1][i] = sum over k of src[k][i] * matrix[1][k]     (2 outputs only)
; For s16p the samples are widened to int32, converted to float, mixed,
; converted back with cvtps2dq and packed with signed saturation.
;
; Only the first three arguments are loaded; out_ch and in_ch are not read.
; Each iteration handles mmsize bytes of every channel and the loop body
; always runs at least once.
;
; Coefficients are splatted once before the loop. As many as fit are kept in
; vector registers; the rest are spilled to the stack.
; NOTE(review): FMULADD_PS dst, a, b, c, tmp and S16_TO_S32_SX a, b are defined
; outside this file. They are read here as "dst = a*b + c, tmp is scratch" and
; "sign-extend int16 lanes of m<a> to int32, low half in m<a>, high half in
; m<b>" -- confirm.
;-----------------------------------------------------------------------------

%macro MIX_3_8_TO_1_2_FLT 3 ; %1 = in channels, %2 = out channels, %3 = s16p or fltp
; define some names to make the code clearer
%assign  in_channels %1
%assign out_channels %2
; stereo is 0 for one output channel and 1 for two
%assign stereo out_channels - 1
%ifidn %3, s16p
    %assign is_s16 1
%else
    %assign is_s16 0
%endif

; determine how many matrix elements must go on the stack vs. mmregs
; needed_mmregs is the number of working registers the loop body uses
; (m0..m6 for stereo s16p, m0..m4 for mono s16p, m0..m3 for stereo fltp,
; m0..m2 for mono fltp); the remaining registers hold coefficients
%assign matrix_elements in_channels * out_channels
%if is_s16
    %if stereo
        %assign needed_mmregs 7
    %else
        %assign needed_mmregs 5
    %endif
%else
    %if stereo
        %assign needed_mmregs 4
    %else
        %assign needed_mmregs 3
    %endif
%endif
%assign matrix_elements_mm num_mmregs - needed_mmregs
%if matrix_elements < matrix_elements_mm
    %assign matrix_elements_mm matrix_elements
%endif
%if matrix_elements_mm < matrix_elements
    %assign matrix_elements_stack matrix_elements - matrix_elements_mm
%else
    %assign matrix_elements_stack 0
%endif
; each spilled coefficient occupies one full vector slot
%assign matrix_stack_size matrix_elements_stack * mmsize

; the stack size is handed to cglobal negated
; NOTE(review): per the x86inc cglobal documentation a negative size asks it
; not to dedicate an extra register to the original stack pointer -- confirm
; against the x86inc version in use
%assign needed_stack_size -1 * matrix_stack_size
; x86-32 with 7-8 channels: 16 more bytes for the src5m/src6m/src7m slots
%if ARCH_X86_32 && in_channels >= 7
%assign needed_stack_size needed_stack_size - 16
%endif

cglobal mix_%1_to_%2_%3_flt, 3,in_channels+2,needed_mmregs+matrix_elements_mm, needed_stack_size, src0, src1, len, src2, src3, src4, src5, src6, src7

; define src pointers on stack if needed
; the three pointer slots sit directly above the spilled coefficients
%if matrix_elements_stack > 0 && ARCH_X86_32 && in_channels >= 7
    %define src5m [rsp+matrix_stack_size+0]
    %define src6m [rsp+matrix_stack_size+4]
    %define src7m [rsp+matrix_stack_size+8]
%endif

; load matrix pointers
; r1 (the matrix argument) and r3 are the registers later named src1 and src2;
; they serve as matrix row pointers only until the coefficients are loaded
%define matrix0q r1q
%define matrix1q r3q
%if stereo
    mov      matrix1q, [matrix0q+gprsize]
%endif
    mov      matrix0q, [matrix0q]

; define matrix coeff names
; mx_<row>_<k> expands to either a vector register or a stack slot, and
; mx_stack_<row>_<k> records which (1 = stack); row 0 is assigned first
%assign %%i 0
%assign %%j needed_mmregs
%rep in_channels
    %if %%i >= matrix_elements_mm
        CAT_XDEFINE mx_stack_0_, %%i, 1
        CAT_XDEFINE mx_0_, %%i, [rsp+(%%i-matrix_elements_mm)*mmsize]
    %else
        CAT_XDEFINE mx_stack_0_, %%i, 0
        CAT_XDEFINE mx_0_, %%i, m %+ %%j
        %assign %%j %%j+1
    %endif
    %assign %%i %%i+1
%endrep
%if stereo
%assign %%i 0
%rep in_channels
    %if in_channels + %%i >= matrix_elements_mm
        CAT_XDEFINE mx_stack_1_, %%i, 1
        CAT_XDEFINE mx_1_, %%i, [rsp+(in_channels+%%i-matrix_elements_mm)*mmsize]
    %else
        CAT_XDEFINE mx_stack_1_, %%i, 0
        CAT_XDEFINE mx_1_, %%i, m %+ %%j
        %assign %%j %%j+1
    %endif
    %assign %%i %%i+1
%endrep
%endif

; load/splat matrix coeffs
; stack-resident coefficients are splatted through m0 and then stored
%assign %%i 0
%rep in_channels
    %if mx_stack_0_ %+ %%i
        VBROADCASTSS m0, [matrix0q+4*%%i]
        mova  mx_0_ %+ %%i, m0
    %else
        VBROADCASTSS mx_0_ %+ %%i, [matrix0q+4*%%i]
    %endif
    %if stereo
    %if mx_stack_1_ %+ %%i
        VBROADCASTSS m0, [matrix1q+4*%%i]
        mova  mx_1_ %+ %%i, m0
    %else
        VBROADCASTSS mx_1_ %+ %%i, [matrix1q+4*%%i]
    %endif
    %endif
    %assign %%i %%i+1
%endrep

; load the channel pointers; each one is advanced to the end of its buffer
; (pointer + len in bytes) so that the loop can index every channel with a
; negative lenq that counts up towards zero
%if ARCH_X86_64
    movsxd       lenq, r2d
%endif
    ; samples -> bytes: *4 for float, *2 for int16
    shl          lenq, 2-is_s16
%assign %%i 1
%rep (in_channels - 1)
    %if ARCH_X86_32 && in_channels >= 7 && %%i >= 5
    ; no register left: go through src5q and park the pointer on the stack
    mov         src5q, [src0q+%%i*gprsize]
    add         src5q, lenq
    mov         src %+ %%i %+ m, src5q
    %else
    mov         src %+ %%i %+ q, [src0q+%%i*gprsize]
    add         src %+ %%i %+ q, lenq
    %endif
    %assign %%i %%i+1
%endrep
    ; src0q held the pointer array until now, so it is replaced last
    mov         src0q, [src0q]
    add         src0q, lenq
    neg          lenq
.loop:
; for x86-32 with 7-8 channels we do not have enough gp registers for all src
; pointers, so we have to load some of them from the stack each time
%define copy_src_from_stack ARCH_X86_32 && in_channels >= 7 && %%i >= 5
%if is_s16
    ; mix with s16p input
    ; m0/m1 accumulate output 0, m2/m3 accumulate output 1 (stereo only)
    mova           m0, [src0q+lenq]
    S16_TO_S32_SX   0, 1
    cvtdq2ps       m0, m0
    cvtdq2ps       m1, m1
    %if stereo
    mulps          m2, m0, mx_1_0
    mulps          m3, m1, mx_1_0
    %endif
    mulps          m0, m0, mx_0_0
    mulps          m1, m1, mx_0_0
%assign %%i 1
%rep (in_channels - 1)
    %if copy_src_from_stack
        %define src_ptr src5q
    %else
        %define src_ptr src %+ %%i %+ q
    %endif
    %if stereo
    %if copy_src_from_stack
    mov       src_ptr, src %+ %%i %+ m
    %endif
    mova           m4, [src_ptr+lenq]
    S16_TO_S32_SX   4, 5
    cvtdq2ps       m4, m4
    cvtdq2ps       m5, m5
    ; m6 is scratch for the first two; the last two may reuse m4/m5 as
    ; scratch because that is their final use in this step
    FMULADD_PS     m2, m4, mx_1_ %+ %%i, m2, m6
    FMULADD_PS     m3, m5, mx_1_ %+ %%i, m3, m6
    FMULADD_PS     m0, m4, mx_0_ %+ %%i, m0, m4
    FMULADD_PS     m1, m5, mx_0_ %+ %%i, m1, m5
    %else
    %if copy_src_from_stack
    mov       src_ptr, src %+ %%i %+ m
    %endif
    mova           m2, [src_ptr+lenq]
    S16_TO_S32_SX   2, 3
    cvtdq2ps       m2, m2
    cvtdq2ps       m3, m3
    FMULADD_PS     m0, m2, mx_0_ %+ %%i, m0, m4
    FMULADD_PS     m1, m3, mx_0_ %+ %%i, m1, m4
    %endif
    %assign %%i %%i+1
%endrep
    ; convert back to int16 with saturation and store in place
    %if stereo
    cvtps2dq       m2, m2
    cvtps2dq       m3, m3
    packssdw       m2, m3
    mova [src1q+lenq], m2
    %endif
    cvtps2dq       m0, m0
    cvtps2dq       m1, m1
    packssdw       m0, m1
    mova [src0q+lenq], m0
%else
    ; mix with fltp input
    ; m0 accumulates output 0, m1 accumulates output 1 (stereo only)
    %if stereo || mx_stack_0_0
    mova           m0, [src0q+lenq]
    %endif
    %if stereo
    mulps          m1, m0, mx_1_0
    %endif
    %if stereo || mx_stack_0_0
    mulps          m0, m0, mx_0_0
    %else
    mulps          m0, mx_0_0, [src0q+lenq]
    %endif
%assign %%i 1
%rep (in_channels - 1)
    %if copy_src_from_stack
        %define src_ptr src5q
        mov   src_ptr, src %+ %%i %+ m
    %else
        %define src_ptr src %+ %%i %+ q
    %endif
    ; avoid extra load for mono if matrix is in a mm register
    %if stereo || mx_stack_0_ %+ %%i
    mova           m2, [src_ptr+lenq]
    %endif
    %if stereo
    FMULADD_PS     m1, m2, mx_1_ %+ %%i, m1, m3
    %endif
    %if stereo || mx_stack_0_ %+ %%i
    FMULADD_PS     m0, m2, mx_0_ %+ %%i, m0, m2
    %else
    FMULADD_PS     m0, mx_0_ %+ %%i, [src_ptr+lenq], m0, m1
    %endif
    %assign %%i %%i+1
%endrep
    mova [src0q+lenq], m0
    %if stereo
    mova [src1q+lenq], m1
    %endif
%endif

    ; lenq is negative and counts up; the loop ends once it reaches zero
    add          lenq, mmsize
    jl .loop
; zero ymm high halves
%if mmsize == 32
    vzeroupper
%endif
    RET
%endmacro

; Instantiates MIX_3_8_TO_1_2_FLT for every input channel count from 3 to 8,
; for 1 and 2 output channels, for both sample formats and for each
; instruction set:
;   fltp: sse, plus avx and fma4 when enabled
;   s16p: sse2 and sse4, plus avx and fma4 when enabled
; The s16p variants always use 128-bit registers. The avx/fma4 fltp variants
; use 256-bit registers except on x86-32 with 6 or more channels.
%macro MIX_3_8_TO_1_2_FLT_FUNCS 0
%assign %%i 3
%rep 6
    INIT_XMM sse
    MIX_3_8_TO_1_2_FLT %%i, 1, fltp
    MIX_3_8_TO_1_2_FLT %%i, 2, fltp
    INIT_XMM sse2
    MIX_3_8_TO_1_2_FLT %%i, 1, s16p
    MIX_3_8_TO_1_2_FLT %%i, 2, s16p
    INIT_XMM sse4
    MIX_3_8_TO_1_2_FLT %%i, 1, s16p
    MIX_3_8_TO_1_2_FLT %%i, 2, s16p
    ; do not use ymm AVX or FMA4 in x86-32 for 6 or more channels due to stack alignment issues
    %if HAVE_AVX_EXTERNAL
    %if ARCH_X86_64 || %%i < 6
    INIT_YMM avx
    %else
    INIT_XMM avx
    %endif
    MIX_3_8_TO_1_2_FLT %%i, 1, fltp
    MIX_3_8_TO_1_2_FLT %%i, 2, fltp
    INIT_XMM avx
    MIX_3_8_TO_1_2_FLT %%i, 1, s16p
    MIX_3_8_TO_1_2_FLT %%i, 2, s16p
    %endif
    %if HAVE_FMA4_EXTERNAL
    %if ARCH_X86_64 || %%i < 6
    INIT_YMM fma4
    %else
    INIT_XMM fma4
    %endif
    MIX_3_8_TO_1_2_FLT %%i, 1, fltp
    MIX_3_8_TO_1_2_FLT %%i, 2, fltp
    INIT_XMM fma4
    MIX_3_8_TO_1_2_FLT %%i, 1, s16p
    MIX_3_8_TO_1_2_FLT %%i, 2, s16p
    %endif
    %assign %%i %%i+1
%endrep
%endmacro

MIX_3_8_TO_1_2_FLT_FUNCS