1cabdff1aSopenharmony_ci;******************************************************************************
2cabdff1aSopenharmony_ci;* x86 optimized Format Conversion Utils
3cabdff1aSopenharmony_ci;* Copyright (c) 2008 Loren Merritt
4cabdff1aSopenharmony_ci;* Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
5cabdff1aSopenharmony_ci;*
6cabdff1aSopenharmony_ci;* This file is part of FFmpeg.
7cabdff1aSopenharmony_ci;*
8cabdff1aSopenharmony_ci;* FFmpeg is free software; you can redistribute it and/or
9cabdff1aSopenharmony_ci;* modify it under the terms of the GNU Lesser General Public
10cabdff1aSopenharmony_ci;* License as published by the Free Software Foundation; either
11cabdff1aSopenharmony_ci;* version 2.1 of the License, or (at your option) any later version.
12cabdff1aSopenharmony_ci;*
13cabdff1aSopenharmony_ci;* FFmpeg is distributed in the hope that it will be useful,
14cabdff1aSopenharmony_ci;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15cabdff1aSopenharmony_ci;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16cabdff1aSopenharmony_ci;* Lesser General Public License for more details.
17cabdff1aSopenharmony_ci;*
18cabdff1aSopenharmony_ci;* You should have received a copy of the GNU Lesser General Public
19cabdff1aSopenharmony_ci;* License along with FFmpeg; if not, write to the Free Software
20cabdff1aSopenharmony_ci;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21cabdff1aSopenharmony_ci;******************************************************************************
22cabdff1aSopenharmony_ci
23cabdff1aSopenharmony_ci%include "libavutil/x86/x86util.asm"
24cabdff1aSopenharmony_ci%include "util.asm"
25cabdff1aSopenharmony_ci
SECTION_RODATA 32

; Single-precision scale factors, written as raw IEEE-754 bit patterns.
; The s32 tables hold 8 dwords (32 bytes) so they can be loaded into a full
; YMM register; the s16 tables hold 4 dwords (16 bytes, one XMM register).
pf_s32_inv_scale: times 8 dd 0x30000000   ; 2^-31
pf_s32_scale:     times 8 dd 0x4f000000   ; 2^31
pf_s32_clip:      times 8 dd 0x4effffff   ; 2147483520.0, largest float below 2^31
pf_s16_inv_scale: times 4 dd 0x38000000   ; 2^-15
pf_s16_scale:     times 4 dd 0x47000000   ; 2^15
; pshufb masks: each selected source word is placed in the upper half of a
; dword and the lower half is zeroed (index -1 clears the byte).
; "even" picks words 0, 1, 4, 5 of the source; "odd" picks words 2, 3, 6, 7.
pb_shuf_unpack_even:      db -1, -1,  0,  1, -1, -1,  2,  3, -1, -1,  8,  9, -1, -1, 10, 11
pb_shuf_unpack_odd:       db -1, -1,  4,  5, -1, -1,  6,  7, -1, -1, 12, 13, -1, -1, 14, 15
; Word-granularity shuffle masks built by the SHUFFLE_MASK_W helper macro
; (defined outside this file section; presumably expands word indices into
; byte indices for pshufb -- confirm against x86util.asm).
pb_interleave_words: SHUFFLE_MASK_W  0,  4,  1,  5,  2,  6,  3,  7
pb_deinterleave_words: SHUFFLE_MASK_W  0,  2,  4,  6,  1,  3,  5,  7
; Word mask with even-indexed words 0x0000 and odd-indexed words 0xffff.
pw_zero_even:     times 4 dw 0x0000, 0xffff
38cabdff1aSopenharmony_ci
39cabdff1aSopenharmony_ciSECTION .text
40cabdff1aSopenharmony_ci
41cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
42cabdff1aSopenharmony_ci; void ff_conv_s16_to_s32(int32_t *dst, const int16_t *src, int len);
43cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
44cabdff1aSopenharmony_ci
INIT_XMM sse2
cglobal conv_s16_to_s32, 3,3,3, dst, src, len
    ; len becomes the input size in bytes; both pointers are moved to the end
    ; of their buffers and a negative byte offset is walked up towards zero.
    ; The output is twice as wide, hence the 2*lenq scaling on dst.
    lea       lenq, [lend*2]
    add       srcq, lenq
    lea       dstq, [dstq+2*lenq]
    neg       lenq
.next:
    pxor        m0, m0
    pxor        m1, m1
    mova        m2, [srcq+lenq]         ; one register of s16 samples
    punpcklwd   m0, m2                  ; zero low word, sample in the high word
    punpckhwd   m1, m2                  ;   i.e. s32 = s16 << 16
    mova  [dstq+2*lenq+0*mmsize], m0
    mova  [dstq+2*lenq+1*mmsize], m1
    add       lenq, mmsize
    jl .next
    REP_RET
62cabdff1aSopenharmony_ci
63cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
64cabdff1aSopenharmony_ci; void ff_conv_s16_to_flt(float *dst, const int16_t *src, int len);
65cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
66cabdff1aSopenharmony_ci
%macro CONV_S16_TO_FLT 0
cglobal conv_s16_to_flt, 3,3,3, dst, src, len
    mova        m2, [pf_s16_inv_scale]  ; 2^-15 in every lane
    ; len -> input byte count; point src/dst at their ends and count a
    ; negative offset up to zero (dst advances twice as fast as src).
    lea       lenq, [lend*2]
    lea       dstq, [dstq+2*lenq]
    add       srcq, lenq
    neg       lenq
    ALIGN 16
.convert:
    mova        m0, [srcq+lenq]
    S16_TO_S32_SX 0, 1                  ; helper macro from the included utils;
                                        ; presumably sign-extends the words of m0
                                        ; into dwords in m0/m1 -- confirm there
    cvtdq2ps    m0, m0
    mulps       m0, m2
    cvtdq2ps    m1, m1
    mulps       m1, m2
    mova  [dstq+2*lenq+0*mmsize], m0
    mova  [dstq+2*lenq+1*mmsize], m1
    add       lenq, mmsize
    jl .convert
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16_TO_FLT
INIT_XMM sse4
CONV_S16_TO_FLT
93cabdff1aSopenharmony_ci
94cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
95cabdff1aSopenharmony_ci; void ff_conv_s32_to_s16(int16_t *dst, const int32_t *src, int len);
96cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
97cabdff1aSopenharmony_ci
%macro CONV_S32_TO_S16 0
cglobal conv_s32_to_s16, 3,3,4, dst, src, len
    ; len -> output size in bytes; src is twice as wide, so it is indexed
    ; with 2*lenq. Both pointers sit at the buffer ends, offset runs up to 0.
    lea       lenq, [lend*2]
    add       dstq, lenq
    lea       srcq, [srcq+2*lenq]
    neg       lenq
.pack:
    ; keep the top 16 bits of each s32 (arithmetic shift), then narrow
    mova        m0, [srcq+2*lenq+0*mmsize]
    psrad       m0, 16
    mova        m1, [srcq+2*lenq+1*mmsize]
    psrad       m1, 16
    mova        m2, [srcq+2*lenq+2*mmsize]
    psrad       m2, 16
    mova        m3, [srcq+2*lenq+3*mmsize]
    psrad       m3, 16
    packssdw    m0, m1
    packssdw    m2, m3
    mova  [dstq+lenq+0*mmsize], m0
    mova  [dstq+lenq+1*mmsize], m2
    add       lenq, 2*mmsize
    jl .pack
%if mmsize != 8
    REP_RET
%else
    emms                                ; MMX build: leave the FPU state clean
    RET
%endif
%endmacro

INIT_MMX mmx
CONV_S32_TO_S16
INIT_XMM sse2
CONV_S32_TO_S16
131cabdff1aSopenharmony_ci
132cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
133cabdff1aSopenharmony_ci; void ff_conv_s32_to_flt(float *dst, const int32_t *src, int len);
134cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
135cabdff1aSopenharmony_ci
%macro CONV_S32_TO_FLT 0
cglobal conv_s32_to_flt, 3,3,3, dst, src, len
    mova        m0, [pf_s32_inv_scale]  ; 2^-31 in every lane
    ; input and output samples are both 4 bytes wide, so a single negative
    ; byte offset indexes src and dst alike
    lea       lenq, [lend*4]
    add       dstq, lenq
    add       srcq, lenq
    neg       lenq
    ALIGN 16
.convert:
    cvtdq2ps    m1, [srcq+lenq+0*mmsize]
    cvtdq2ps    m2, [srcq+lenq+1*mmsize]
    mulps       m1, m0
    mulps       m2, m0
    mova  [dstq+lenq+0*mmsize], m1
    mova  [dstq+lenq+1*mmsize], m2
    add       lenq, 2*mmsize
    jl .convert
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S32_TO_FLT
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
CONV_S32_TO_FLT
%endif
162cabdff1aSopenharmony_ci
163cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
164cabdff1aSopenharmony_ci; void ff_conv_flt_to_s16(int16_t *dst, const float *src, int len);
165cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
166cabdff1aSopenharmony_ci
INIT_XMM sse2
cglobal conv_flt_to_s16, 3,3,5, dst, src, len
    mova        m4, [pf_s16_scale]      ; 2^15 in every lane
    ; len -> output size in bytes; the float input is twice as wide and is
    ; therefore addressed with 2*lenq
    lea       lenq, [lend*2]
    add       dstq, lenq
    lea       srcq, [srcq+2*lenq]
    neg       lenq
.convert:
    ; first pair of input registers -> m0
    mova        m0, [srcq+2*lenq+0*mmsize]
    mulps       m0, m4
    cvtps2dq    m0, m0
    mova        m1, [srcq+2*lenq+1*mmsize]
    mulps       m1, m4
    cvtps2dq    m1, m1
    packssdw    m0, m1                  ; saturating narrow to s16
    ; second pair of input registers -> m2
    mova        m2, [srcq+2*lenq+2*mmsize]
    mulps       m2, m4
    cvtps2dq    m2, m2
    mova        m3, [srcq+2*lenq+3*mmsize]
    mulps       m3, m4
    cvtps2dq    m3, m3
    packssdw    m2, m3
    mova  [dstq+lenq+0*mmsize], m0
    mova  [dstq+lenq+1*mmsize], m2
    add       lenq, 2*mmsize
    jl .convert
    REP_RET
194cabdff1aSopenharmony_ci
195cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
196cabdff1aSopenharmony_ci; void ff_conv_flt_to_s32(int32_t *dst, const float *src, int len);
197cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
198cabdff1aSopenharmony_ci
%macro CONV_FLT_TO_S32 0
cglobal conv_flt_to_s32, 3,3,6, dst, src, len
    mova        m4, [pf_s32_scale]      ; 2^31 in every lane
    mova        m5, [pf_s32_clip]       ; upper clamp, largest float below 2^31
    ; 4-byte samples on both sides: one negative byte offset serves src and dst
    lea       lenq, [4*lend]
    add       dstq, lenq
    add       srcq, lenq
    neg       lenq
.convert:
    ; scale, then clamp the positive side before the integer conversion
    mulps       m0, m4, [srcq+lenq+0*mmsize]
    minps       m0, m5
    mulps       m1, m4, [srcq+lenq+1*mmsize]
    minps       m1, m5
    mulps       m2, m4, [srcq+lenq+2*mmsize]
    minps       m2, m5
    mulps       m3, m4, [srcq+lenq+3*mmsize]
    minps       m3, m5
    cvtps2dq    m0, m0
    cvtps2dq    m1, m1
    cvtps2dq    m2, m2
    cvtps2dq    m3, m3
    mova  [dstq+lenq+0*mmsize], m0
    mova  [dstq+lenq+1*mmsize], m1
    mova  [dstq+lenq+2*mmsize], m2
    mova  [dstq+lenq+3*mmsize], m3
    add       lenq, 4*mmsize
    jl .convert
    REP_RET
%endmacro

INIT_XMM sse2
CONV_FLT_TO_S32
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
CONV_FLT_TO_S32
%endif
235cabdff1aSopenharmony_ci
236cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
237cabdff1aSopenharmony_ci; void ff_conv_s16p_to_s16_2ch(int16_t *dst, int16_t *const *src, int len,
238cabdff1aSopenharmony_ci;                              int channels);
239cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
240cabdff1aSopenharmony_ci
%macro CONV_S16P_TO_S16_2CH 0
cglobal conv_s16p_to_s16_2ch, 3,4,5, dst, src0, len, src1
    ; src0 arrives as the array of channel pointers; fetch channel 1 first
    ; because the second load overwrites the array pointer with channel 0.
    ; Only three arguments are loaded, so 'channels' is not read.
    mov      src1q, [src0q+gprsize]
    mov      src0q, [src0q]
    ; len -> bytes per channel; the interleaved output is twice that size
    lea       lenq, [lend*2]
    lea       dstq, [dstq+2*lenq]
    add      src0q, lenq
    add      src1q, lenq
    neg       lenq
.interleave:
    mova        m0, [src0q+lenq+0*mmsize]
    mova        m1, [src1q+lenq+0*mmsize]
    mova        m2, [src0q+lenq+1*mmsize]
    mova        m3, [src1q+lenq+1*mmsize]
    ; word-interleave each channel pair; m4 is the scratch register handed
    ; to the SBUTTERFLY2 helper (defined in the included utility file)
    SBUTTERFLY2  wd, 0, 1, 4
    SBUTTERFLY2  wd, 2, 3, 4
    mova  [dstq+2*lenq+0*mmsize], m0
    mova  [dstq+2*lenq+1*mmsize], m1
    mova  [dstq+2*lenq+2*mmsize], m2
    mova  [dstq+2*lenq+3*mmsize], m3
    add       lenq, mmsize*2
    jl .interleave
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16P_TO_S16_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16P_TO_S16_2CH
%endif
272cabdff1aSopenharmony_ci
273cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
274cabdff1aSopenharmony_ci; void ff_conv_s16p_to_s16_6ch(int16_t *dst, int16_t *const *src, int len,
275cabdff1aSopenharmony_ci;                              int channels);
276cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
277cabdff1aSopenharmony_ci
278cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
279cabdff1aSopenharmony_ci; NOTE: In the 6-channel functions, len could be used as an index on x86-64
280cabdff1aSopenharmony_ci;       instead of just a counter, which would avoid incrementing the
281cabdff1aSopenharmony_ci;       pointers, but the extra complexity and amount of code is not worth
282cabdff1aSopenharmony_ci;       the small gain. On x86-32 there are not enough registers to use len
283cabdff1aSopenharmony_ci;       as an index without keeping two of the pointers on the stack and
284cabdff1aSopenharmony_ci;       loading them in each iteration.
285cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
286cabdff1aSopenharmony_ci
; Interleaves six planar s16 channels into one packed s16 stream.
; src0 initially holds the array of six channel pointers. After setup, src0q
; points at channel 0 and src1q..src5q hold the byte distance of channels
; 1..5 from channel 0, so a single pointer increment advances all channels.
; Register numbers in the lane comments below are output sample indices
; (sample i of channel c is index 6*i + c).
%macro CONV_S16P_TO_S16_6CH 0
%if ARCH_X86_64
cglobal conv_s16p_to_s16_6ch, 3,8,7, dst, src0, len, src1, src2, src3, src4, src5
%else
; x86-32: only two arguments are loaded into registers; len is used through
; lend, which is redefined to the r2m memory operand and decremented there.
cglobal conv_s16p_to_s16_6ch, 2,7,7, dst, src0, src1, src2, src3, src4, src5
%define lend dword r2m
%endif
    mov      src1q, [src0q+1*gprsize]
    mov      src2q, [src0q+2*gprsize]
    mov      src3q, [src0q+3*gprsize]
    mov      src4q, [src0q+4*gprsize]
    mov      src5q, [src0q+5*gprsize]
    mov      src0q, [src0q]             ; last: overwrites the pointer array address
    sub      src1q, src0q               ; turn channel pointers into offsets from ch0
    sub      src2q, src0q
    sub      src3q, src0q
    sub      src4q, src0q
    sub      src5q, src0q
.loop:
%if cpuflag(sse2slow)
    ; sse2slow variant: 4 samples per channel per iteration using 64-bit
    ; loads and stores
    movq        m0, [src0q      ]   ; m0 =  0,  6, 12, 18,  x,  x,  x,  x
    movq        m1, [src0q+src1q]   ; m1 =  1,  7, 13, 19,  x,  x,  x,  x
    movq        m2, [src0q+src2q]   ; m2 =  2,  8, 14, 20,  x,  x,  x,  x
    movq        m3, [src0q+src3q]   ; m3 =  3,  9, 15, 21,  x,  x,  x,  x
    movq        m4, [src0q+src4q]   ; m4 =  4, 10, 16, 22,  x,  x,  x,  x
    movq        m5, [src0q+src5q]   ; m5 =  5, 11, 17, 23,  x,  x,  x,  x
                                    ; unpack words:
    punpcklwd   m0, m1              ; m0 =  0,  1,  6,  7, 12, 13, 18, 19
    punpcklwd   m2, m3              ; m2 =  2,  3,  8,  9, 14, 15, 20, 21
    punpcklwd   m4, m5              ; m4 =  4,  5, 10, 11, 16, 17, 22, 23
                                    ; blend dwords
    shufps      m1, m0, m2, q2020   ; m1 =  0,  1, 12, 13,  2,  3, 14, 15
    shufps      m0, m4, q2031       ; m0 =  6,  7, 18, 19,  4,  5, 16, 17
    shufps      m2, m4, q3131       ; m2 =  8,  9, 20, 21, 10, 11, 22, 23
                                    ; shuffle dwords
    pshufd      m0, m0, q1302       ; m0 =  4,  5,  6,  7, 16, 17, 18, 19
    pshufd      m1, m1, q3120       ; m1 =  0,  1,  2,  3, 12, 13, 14, 15
    pshufd      m2, m2, q3120       ; m2 =  8,  9, 10, 11, 20, 21, 22, 23
    ; low halves hold samples 0-11, high halves samples 12-23
    movq   [dstq+0*mmsize/2], m1
    movq   [dstq+1*mmsize/2], m0
    movq   [dstq+2*mmsize/2], m2
    movhps [dstq+3*mmsize/2], m1
    movhps [dstq+4*mmsize/2], m0
    movhps [dstq+5*mmsize/2], m2
    add      src0q, mmsize/2
    add       dstq, mmsize*3
    sub       lend, mmsize/4
%else
    ; full-width variant: 8 samples per channel per iteration
    mova        m0, [src0q      ]   ; m0 =  0,  6, 12, 18, 24, 30, 36, 42
    mova        m1, [src0q+src1q]   ; m1 =  1,  7, 13, 19, 25, 31, 37, 43
    mova        m2, [src0q+src2q]   ; m2 =  2,  8, 14, 20, 26, 32, 38, 44
    mova        m3, [src0q+src3q]   ; m3 =  3,  9, 15, 21, 27, 33, 39, 45
    mova        m4, [src0q+src4q]   ; m4 =  4, 10, 16, 22, 28, 34, 40, 46
    mova        m5, [src0q+src5q]   ; m5 =  5, 11, 17, 23, 29, 35, 41, 47
                                    ; unpack words:
    SBUTTERFLY2 wd, 0, 1, 6         ; m0 =  0,  1,  6,  7, 12, 13, 18, 19
                                    ; m1 = 24, 25, 30, 31, 36, 37, 42, 43
    SBUTTERFLY2 wd, 2, 3, 6         ; m2 =  2,  3,  8,  9, 14, 15, 20, 21
                                    ; m3 = 26, 27, 32, 33, 38, 39, 44, 45
    SBUTTERFLY2 wd, 4, 5, 6         ; m4 =  4,  5, 10, 11, 16, 17, 22, 23
                                    ; m5 = 28, 29, 34, 35, 40, 41, 46, 47
                                    ; blend dwords
    shufps      m6, m0, m2, q2020   ; m6 =  0,  1, 12, 13,  2,  3, 14, 15
    shufps      m0, m4, q2031       ; m0 =  6,  7, 18, 19,  4,  5, 16, 17
    shufps      m2, m4, q3131       ; m2 =  8,  9, 20, 21, 10, 11, 22, 23
    SWAP 4,6                        ; m4 =  0,  1, 12, 13,  2,  3, 14, 15
    shufps      m6, m1, m3, q2020   ; m6 = 24, 25, 36, 37, 26, 27, 38, 39
    shufps      m1, m5, q2031       ; m1 = 30, 31, 42, 43, 28, 29, 40, 41
    shufps      m3, m5, q3131       ; m3 = 32, 33, 44, 45, 34, 35, 46, 47
    SWAP 5,6                        ; m5 = 24, 25, 36, 37, 26, 27, 38, 39
                                    ; shuffle dwords
    pshufd      m0, m0, q1302       ; m0 =  4,  5,  6,  7, 16, 17, 18, 19
    pshufd      m2, m2, q3120       ; m2 =  8,  9, 10, 11, 20, 21, 22, 23
    pshufd      m4, m4, q3120       ; m4 =  0,  1,  2,  3, 12, 13, 14, 15
    pshufd      m1, m1, q1302       ; m1 = 28, 29, 30, 31, 40, 41, 42, 43
    pshufd      m3, m3, q3120       ; m3 = 32, 33, 34, 35, 44, 45, 46, 47
    pshufd      m5, m5, q3120       ; m5 = 24, 25, 26, 27, 36, 37, 38, 39
                                    ; shuffle qwords
    punpcklqdq  m6, m4, m0          ; m6 =  0,  1,  2,  3,  4,  5,  6,  7
    punpckhqdq  m0, m2              ; m0 = 16, 17, 18, 19, 20, 21, 22, 23
    shufps      m2, m4, q3210       ; m2 =  8,  9, 10, 11, 12, 13, 14, 15
    SWAP 4,6                        ; m4 =  0,  1,  2,  3,  4,  5,  6,  7
    punpcklqdq  m6, m5, m1          ; m6 = 24, 25, 26, 27, 28, 29, 30, 31
    punpckhqdq  m1, m3              ; m1 = 40, 41, 42, 43, 44, 45, 46, 47
    shufps      m3, m5, q3210       ; m3 = 32, 33, 34, 35, 36, 37, 38, 39
    SWAP 5,6                        ; m5 = 24, 25, 26, 27, 28, 29, 30, 31
    mova   [dstq+0*mmsize], m4
    mova   [dstq+1*mmsize], m2
    mova   [dstq+2*mmsize], m0
    mova   [dstq+3*mmsize], m5
    mova   [dstq+4*mmsize], m3
    mova   [dstq+5*mmsize], m1
    add      src0q, mmsize
    add       dstq, mmsize*6
    sub       lend, mmsize/2
%endif
    jg .loop                        ; flags come from the 'sub lend' above
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16P_TO_S16_6CH
INIT_XMM sse2slow
CONV_S16P_TO_S16_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16P_TO_S16_6CH
%endif
395cabdff1aSopenharmony_ci
396cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
397cabdff1aSopenharmony_ci; void ff_conv_s16p_to_flt_2ch(float *dst, int16_t *const *src, int len,
398cabdff1aSopenharmony_ci;                              int channels);
399cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
400cabdff1aSopenharmony_ci
%macro CONV_S16P_TO_FLT_2CH 0
cglobal conv_s16p_to_flt_2ch, 3,4,6, dst, src0, len, src1
    mova        m5, [pf_s32_inv_scale]  ; 2^-31: samples are widened to s32 << 16
    ; src0 arrives as the channel pointer array; read channel 1 before the
    ; array pointer is replaced by channel 0
    mov      src1q, [src0q+gprsize]
    mov      src0q, [src0q]
    ; len -> bytes per input channel; the output (2 channels of 4-byte
    ; floats) is four times that size
    lea       lenq, [lend*2]
    add      src0q, lenq
    add      src1q, lenq
    lea       dstq, [dstq+4*lenq]
    neg       lenq
.convert:
    mova        m2, [src0q+lenq]    ; m2 =  0,  2,  4,  6,  8, 10, 12, 14
    mova        m4, [src1q+lenq]    ; m4 =  1,  3,  5,  7,  9, 11, 13, 15
    SBUTTERFLY2 wd, 2, 4, 3         ; m2 =  0 ..  7 (interleaved)
                                    ; m4 =  8 .. 15 (interleaved)
    pxor        m3, m3
    ; widen against zero so each sample sits in the high word of its dword;
    ; m3 (the zero) is consumed by the final unpack
    punpcklwd   m0, m3, m2          ; m0 =  0 ..  3
    punpckhwd   m1, m3, m2          ; m1 =  4 ..  7
    punpcklwd   m2, m3, m4          ; m2 =  8 .. 11
    punpckhwd   m3, m4              ; m3 = 12 .. 15
    cvtdq2ps    m0, m0
    mulps       m0, m5
    cvtdq2ps    m1, m1
    mulps       m1, m5
    cvtdq2ps    m2, m2
    mulps       m2, m5
    cvtdq2ps    m3, m3
    mulps       m3, m5
    mova  [dstq+4*lenq+0*mmsize], m0
    mova  [dstq+4*lenq+1*mmsize], m1
    mova  [dstq+4*lenq+2*mmsize], m2
    mova  [dstq+4*lenq+3*mmsize], m3
    add       lenq, mmsize
    jl .convert
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16P_TO_FLT_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16P_TO_FLT_2CH
%endif
444cabdff1aSopenharmony_ci
445cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
446cabdff1aSopenharmony_ci; void ff_conv_s16p_to_flt_6ch(float *dst, int16_t *const *src, int len,
447cabdff1aSopenharmony_ci;                              int channels);
448cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
449cabdff1aSopenharmony_ci
450cabdff1aSopenharmony_ci%macro CONV_S16P_TO_FLT_6CH 0
451cabdff1aSopenharmony_ci%if ARCH_X86_64
452cabdff1aSopenharmony_cicglobal conv_s16p_to_flt_6ch, 3,8,8, dst, src, len, src1, src2, src3, src4, src5
453cabdff1aSopenharmony_ci%else
454cabdff1aSopenharmony_cicglobal conv_s16p_to_flt_6ch, 2,7,8, dst, src, src1, src2, src3, src4, src5
455cabdff1aSopenharmony_ci%define lend dword r2m
456cabdff1aSopenharmony_ci%endif
457cabdff1aSopenharmony_ci    mov     src1q, [srcq+1*gprsize]
458cabdff1aSopenharmony_ci    mov     src2q, [srcq+2*gprsize]
459cabdff1aSopenharmony_ci    mov     src3q, [srcq+3*gprsize]
460cabdff1aSopenharmony_ci    mov     src4q, [srcq+4*gprsize]
461cabdff1aSopenharmony_ci    mov     src5q, [srcq+5*gprsize]
462cabdff1aSopenharmony_ci    mov      srcq, [srcq]
463cabdff1aSopenharmony_ci    sub     src1q, srcq
464cabdff1aSopenharmony_ci    sub     src2q, srcq
465cabdff1aSopenharmony_ci    sub     src3q, srcq
466cabdff1aSopenharmony_ci    sub     src4q, srcq
467cabdff1aSopenharmony_ci    sub     src5q, srcq
468cabdff1aSopenharmony_ci    mova       m7, [pf_s32_inv_scale]
469cabdff1aSopenharmony_ci%if cpuflag(ssse3)
470cabdff1aSopenharmony_ci    %define unpack_even m6
471cabdff1aSopenharmony_ci    mova       m6, [pb_shuf_unpack_even]
472cabdff1aSopenharmony_ci%if ARCH_X86_64
473cabdff1aSopenharmony_ci    %define unpack_odd m8
474cabdff1aSopenharmony_ci    mova       m8, [pb_shuf_unpack_odd]
475cabdff1aSopenharmony_ci%else
476cabdff1aSopenharmony_ci    %define unpack_odd [pb_shuf_unpack_odd]
477cabdff1aSopenharmony_ci%endif
478cabdff1aSopenharmony_ci%endif
479cabdff1aSopenharmony_ci.loop:
480cabdff1aSopenharmony_ci    movq       m0, [srcq      ]  ; m0 =  0,  6, 12, 18,  x,  x,  x,  x
481cabdff1aSopenharmony_ci    movq       m1, [srcq+src1q]  ; m1 =  1,  7, 13, 19,  x,  x,  x,  x
482cabdff1aSopenharmony_ci    movq       m2, [srcq+src2q]  ; m2 =  2,  8, 14, 20,  x,  x,  x,  x
483cabdff1aSopenharmony_ci    movq       m3, [srcq+src3q]  ; m3 =  3,  9, 15, 21,  x,  x,  x,  x
484cabdff1aSopenharmony_ci    movq       m4, [srcq+src4q]  ; m4 =  4, 10, 16, 22,  x,  x,  x,  x
485cabdff1aSopenharmony_ci    movq       m5, [srcq+src5q]  ; m5 =  5, 11, 17, 23,  x,  x,  x,  x
486cabdff1aSopenharmony_ci                                 ; unpack words:
487cabdff1aSopenharmony_ci    punpcklwd  m0, m1            ; m0 =  0,  1,  6,  7, 12, 13, 18, 19
488cabdff1aSopenharmony_ci    punpcklwd  m2, m3            ; m2 =  2,  3,  8,  9, 14, 15, 20, 21
489cabdff1aSopenharmony_ci    punpcklwd  m4, m5            ; m4 =  4,  5, 10, 11, 16, 17, 22, 23
490cabdff1aSopenharmony_ci                                 ; blend dwords
491cabdff1aSopenharmony_ci    shufps     m1, m4, m0, q3120 ; m1 =  4,  5, 16, 17,  6,  7, 18, 19
492cabdff1aSopenharmony_ci    shufps         m0, m2, q2020 ; m0 =  0,  1, 12, 13,  2,  3, 14, 15
493cabdff1aSopenharmony_ci    shufps         m2, m4, q3131 ; m2 =  8,  9, 20, 21, 10, 11, 22, 23
494cabdff1aSopenharmony_ci%if cpuflag(ssse3)
495cabdff1aSopenharmony_ci    pshufb     m3, m0, unpack_odd   ; m3 =  12,     13,     14,     15
496cabdff1aSopenharmony_ci    pshufb         m0, unpack_even  ; m0 =   0,      1,      2,      3
497cabdff1aSopenharmony_ci    pshufb     m4, m1, unpack_odd   ; m4 =  16,     17,     18,     19
498cabdff1aSopenharmony_ci    pshufb         m1, unpack_even  ; m1 =   4,      5,      6,      7
499cabdff1aSopenharmony_ci    pshufb     m5, m2, unpack_odd   ; m5 =  20,     21,     22,     23
500cabdff1aSopenharmony_ci    pshufb         m2, unpack_even  ; m2 =   8,      9,     10,     11
501cabdff1aSopenharmony_ci%else
502cabdff1aSopenharmony_ci                                 ; shuffle dwords
503cabdff1aSopenharmony_ci    pshufd     m0, m0, q3120     ; m0 =  0,  1,  2,  3, 12, 13, 14, 15
504cabdff1aSopenharmony_ci    pshufd     m1, m1, q3120     ; m1 =  4,  5,  6,  7, 16, 17, 18, 19
505cabdff1aSopenharmony_ci    pshufd     m2, m2, q3120     ; m2 =  8,  9, 10, 11, 20, 21, 22, 23
506cabdff1aSopenharmony_ci    pxor       m6, m6            ; convert s16 in m0-m2 to s32 in m0-m5
507cabdff1aSopenharmony_ci    punpcklwd  m3, m6, m0        ; m3 =      0,      1,      2,      3
508cabdff1aSopenharmony_ci    punpckhwd  m4, m6, m0        ; m4 =     12,     13,     14,     15
509cabdff1aSopenharmony_ci    punpcklwd  m0, m6, m1        ; m0 =      4,      5,      6,      7
510cabdff1aSopenharmony_ci    punpckhwd  m5, m6, m1        ; m5 =     16,     17,     18,     19
511cabdff1aSopenharmony_ci    punpcklwd  m1, m6, m2        ; m1 =      8,      9,     10,     11
512cabdff1aSopenharmony_ci    punpckhwd      m6, m2        ; m6 =     20,     21,     22,     23
513cabdff1aSopenharmony_ci    SWAP 6,2,1,0,3,4,5           ; swap registers 3,0,1,4,5,6 to 0,1,2,3,4,5
514cabdff1aSopenharmony_ci%endif
515cabdff1aSopenharmony_ci    cvtdq2ps   m0, m0            ; convert s32 to float
516cabdff1aSopenharmony_ci    cvtdq2ps   m1, m1
517cabdff1aSopenharmony_ci    cvtdq2ps   m2, m2
518cabdff1aSopenharmony_ci    cvtdq2ps   m3, m3
519cabdff1aSopenharmony_ci    cvtdq2ps   m4, m4
520cabdff1aSopenharmony_ci    cvtdq2ps   m5, m5
521cabdff1aSopenharmony_ci    mulps      m0, m7            ; scale float from s32 range to [-1.0,1.0]
522cabdff1aSopenharmony_ci    mulps      m1, m7
523cabdff1aSopenharmony_ci    mulps      m2, m7
524cabdff1aSopenharmony_ci    mulps      m3, m7
525cabdff1aSopenharmony_ci    mulps      m4, m7
526cabdff1aSopenharmony_ci    mulps      m5, m7
527cabdff1aSopenharmony_ci    mova  [dstq         ], m0
528cabdff1aSopenharmony_ci    mova  [dstq+  mmsize], m1
529cabdff1aSopenharmony_ci    mova  [dstq+2*mmsize], m2
530cabdff1aSopenharmony_ci    mova  [dstq+3*mmsize], m3
531cabdff1aSopenharmony_ci    mova  [dstq+4*mmsize], m4
532cabdff1aSopenharmony_ci    mova  [dstq+5*mmsize], m5
533cabdff1aSopenharmony_ci    add      srcq, mmsize/2
534cabdff1aSopenharmony_ci    add      dstq, mmsize*6
535cabdff1aSopenharmony_ci    sub      lend, mmsize/4
536cabdff1aSopenharmony_ci    jg .loop
537cabdff1aSopenharmony_ci    REP_RET
538cabdff1aSopenharmony_ci%endmacro
539cabdff1aSopenharmony_ci
; Expand CONV_S16P_TO_FLT_6CH once per instruction-set flavour. The macro body
; tests cpuflag(ssse3) to choose between its pshufb and punpck*wd paths.
540cabdff1aSopenharmony_ciINIT_XMM sse2
541cabdff1aSopenharmony_ciCONV_S16P_TO_FLT_6CH
542cabdff1aSopenharmony_ciINIT_XMM ssse3
543cabdff1aSopenharmony_ciCONV_S16P_TO_FLT_6CH
; The AVX expansion is emitted only when HAVE_AVX_EXTERNAL is non-zero
; (presumably a build-configuration flag -- confirm where it is defined).
544cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL
545cabdff1aSopenharmony_ciINIT_XMM avx
546cabdff1aSopenharmony_ciCONV_S16P_TO_FLT_6CH
547cabdff1aSopenharmony_ci%endif
548cabdff1aSopenharmony_ci
549cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
550cabdff1aSopenharmony_ci; void ff_conv_fltp_to_s16_2ch(int16_t *dst, float *const *src, int len,
551cabdff1aSopenharmony_ci;                              int channels);
;
; Multiplies two planar float channels by pf_s16_scale, converts them to
; int16 with signed saturation (packssdw) and stores them interleaved in dst.
;   dst      - interleaved output, written with mova
;   src      - array of plane pointers; entries [0] and [1] are read
;   len      - samples per channel
;   channels - not read by this routine
; The loop is bottom-tested and handles mmsize/4 samples per channel per
; iteration, so len is presumably a positive multiple of that -- confirm
; against the caller.
552cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
553cabdff1aSopenharmony_ci
554cabdff1aSopenharmony_ci%macro CONV_FLTP_TO_S16_2CH 0
    ; NOTE(review): 3 XMM registers are declared here, yet the ssse3 build
    ; also uses m3 -- confirm the declared count is intentional.
555cabdff1aSopenharmony_cicglobal conv_fltp_to_s16_2ch, 3,4,3, dst, src0, len, src1
556cabdff1aSopenharmony_ci    lea      lenq, [4*lend]         ; 4*len = bytes in one float plane = bytes of 2ch s16 output
557cabdff1aSopenharmony_ci    mov     src1q, [src0q+gprsize]  ; src1 = src[1]
558cabdff1aSopenharmony_ci    mov     src0q, [src0q        ]  ; src0 = src[0] (loaded last: src0q held the array pointer)
559cabdff1aSopenharmony_ci    add      dstq, lenq             ; point all three buffers at their ends ...
560cabdff1aSopenharmony_ci    add     src0q, lenq
561cabdff1aSopenharmony_ci    add     src1q, lenq
562cabdff1aSopenharmony_ci    neg      lenq                   ; ... and index them with a negative offset counting up to 0
563cabdff1aSopenharmony_ci    mova       m2, [pf_s16_scale]   ; loop-invariant scale factor
564cabdff1aSopenharmony_ci%if cpuflag(ssse3)
565cabdff1aSopenharmony_ci    mova       m3, [pb_interleave_words] ; pshufb control mask used in the loop
566cabdff1aSopenharmony_ci%endif
567cabdff1aSopenharmony_ci.loop:
568cabdff1aSopenharmony_ci    mulps      m0, m2, [src0q+lenq] ; m0 =    0,    2,    4,    6
569cabdff1aSopenharmony_ci    mulps      m1, m2, [src1q+lenq] ; m1 =    1,    3,    5,    7
570cabdff1aSopenharmony_ci    cvtps2dq   m0, m0               ; float -> s32
571cabdff1aSopenharmony_ci    cvtps2dq   m1, m1
572cabdff1aSopenharmony_ci%if cpuflag(ssse3)
573cabdff1aSopenharmony_ci    packssdw   m0, m1               ; m0 = 0, 2, 4, 6, 1, 3, 5, 7
574cabdff1aSopenharmony_ci    pshufb     m0, m3               ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
575cabdff1aSopenharmony_ci%else
576cabdff1aSopenharmony_ci    packssdw   m0, m0               ; m0 = 0, 2, 4, 6, x, x, x, x
577cabdff1aSopenharmony_ci    packssdw   m1, m1               ; m1 = 1, 3, 5, 7, x, x, x, x
578cabdff1aSopenharmony_ci    punpcklwd  m0, m1               ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
579cabdff1aSopenharmony_ci%endif
580cabdff1aSopenharmony_ci    mova  [dstq+lenq], m0           ; store mmsize bytes of interleaved s16
581cabdff1aSopenharmony_ci    add      lenq, mmsize           ; sets flags for jl: loop while offset < 0
582cabdff1aSopenharmony_ci    jl .loop
583cabdff1aSopenharmony_ci    REP_RET
584cabdff1aSopenharmony_ci%endmacro
585cabdff1aSopenharmony_ci
    ; Expand the routine for the SSE2 and SSSE3 code paths.
586cabdff1aSopenharmony_ciINIT_XMM sse2
587cabdff1aSopenharmony_ciCONV_FLTP_TO_S16_2CH
588cabdff1aSopenharmony_ciINIT_XMM ssse3
589cabdff1aSopenharmony_ciCONV_FLTP_TO_S16_2CH
590cabdff1aSopenharmony_ci
591cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
592cabdff1aSopenharmony_ci; void ff_conv_fltp_to_s16_6ch(int16_t *dst, float *const *src, int len,
593cabdff1aSopenharmony_ci;                              int channels);
;
; Multiplies six planar float channels by pf_s16_scale, converts them to
; int16 with signed saturation and stores them interleaved in dst.
;   dst      - interleaved output, written with mova
;   src      - array of plane pointers; entries [0]..[5] are read
;   len      - samples per channel
;   channels - not read by this routine
; Each iteration handles mmsize/4 samples per channel and writes 3*mmsize
; bytes. The loop is bottom-tested, so len is presumably a positive multiple
; of mmsize/4 -- confirm against the caller.
594cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
595cabdff1aSopenharmony_ci
596cabdff1aSopenharmony_ci%macro CONV_FLTP_TO_S16_6CH 0
597cabdff1aSopenharmony_ci%if ARCH_X86_64
598cabdff1aSopenharmony_cicglobal conv_fltp_to_s16_6ch, 3,8,7, dst, src, len, src1, src2, src3, src4, src5
599cabdff1aSopenharmony_ci%else
    ; 32-bit build: all seven named registers hold pointers, so len is not
    ; given a register; lend is redefined to the third argument's memory
    ; operand (r2m) and is decremented in place by the loop.
600cabdff1aSopenharmony_cicglobal conv_fltp_to_s16_6ch, 2,7,7, dst, src, src1, src2, src3, src4, src5
601cabdff1aSopenharmony_ci%define lend dword r2m
602cabdff1aSopenharmony_ci%endif
603cabdff1aSopenharmony_ci    mov        src1q, [srcq+1*gprsize]  ; fetch plane pointers src[1]..src[5]
604cabdff1aSopenharmony_ci    mov        src2q, [srcq+2*gprsize]
605cabdff1aSopenharmony_ci    mov        src3q, [srcq+3*gprsize]
606cabdff1aSopenharmony_ci    mov        src4q, [srcq+4*gprsize]
607cabdff1aSopenharmony_ci    mov        src5q, [srcq+5*gprsize]
608cabdff1aSopenharmony_ci    mov         srcq, [srcq]            ; src[0] last: srcq held the array pointer
609cabdff1aSopenharmony_ci    sub        src1q, srcq              ; turn src1..src5 into byte offsets from
610cabdff1aSopenharmony_ci    sub        src2q, srcq              ; plane 0, so advancing srcq alone moves
611cabdff1aSopenharmony_ci    sub        src3q, srcq              ; every [srcq+srcNq] address in the loop
612cabdff1aSopenharmony_ci    sub        src4q, srcq
613cabdff1aSopenharmony_ci    sub        src5q, srcq
    ; Explicit xmm6 rather than m6: the INIT_MMX sse expansion below multiplies
    ; by xmm6, where m6 would presumably name an MMX register -- confirm.
614cabdff1aSopenharmony_ci    movaps      xmm6, [pf_s16_scale]
615cabdff1aSopenharmony_ci.loop:
616cabdff1aSopenharmony_ci%if cpuflag(sse2)
    ; XMM path: one full vector of floats from each plane per iteration.
617cabdff1aSopenharmony_ci    mulps         m0, m6, [srcq      ]
618cabdff1aSopenharmony_ci    mulps         m1, m6, [srcq+src1q]
619cabdff1aSopenharmony_ci    mulps         m2, m6, [srcq+src2q]
620cabdff1aSopenharmony_ci    mulps         m3, m6, [srcq+src3q]
621cabdff1aSopenharmony_ci    mulps         m4, m6, [srcq+src4q]
622cabdff1aSopenharmony_ci    mulps         m5, m6, [srcq+src5q]
623cabdff1aSopenharmony_ci    cvtps2dq      m0, m0            ; float -> s32
624cabdff1aSopenharmony_ci    cvtps2dq      m1, m1
625cabdff1aSopenharmony_ci    cvtps2dq      m2, m2
626cabdff1aSopenharmony_ci    cvtps2dq      m3, m3
627cabdff1aSopenharmony_ci    cvtps2dq      m4, m4
628cabdff1aSopenharmony_ci    cvtps2dq      m5, m5
    ; The numbers in the comments below are positions in the interleaved
    ; output (position = 6*sample + channel).
629cabdff1aSopenharmony_ci    packssdw      m0, m3            ; m0 =  0,  6, 12, 18,  3,  9, 15, 21
630cabdff1aSopenharmony_ci    packssdw      m1, m4            ; m1 =  1,  7, 13, 19,  4, 10, 16, 22
631cabdff1aSopenharmony_ci    packssdw      m2, m5            ; m2 =  2,  8, 14, 20,  5, 11, 17, 23
632cabdff1aSopenharmony_ci                                    ; unpack words:
633cabdff1aSopenharmony_ci    movhlps       m3, m0            ; m3 =  3,  9, 15, 21,  x,  x,  x,  x
634cabdff1aSopenharmony_ci    punpcklwd     m0, m1            ; m0 =  0,  1,  6,  7, 12, 13, 18, 19
635cabdff1aSopenharmony_ci    punpckhwd     m1, m2            ; m1 =  4,  5, 10, 11, 16, 17, 22, 23
636cabdff1aSopenharmony_ci    punpcklwd     m2, m3            ; m2 =  2,  3,  8,  9, 14, 15, 20, 21
637cabdff1aSopenharmony_ci                                    ; blend dwords:
638cabdff1aSopenharmony_ci    shufps        m3, m0, m2, q2020 ; m3 =  0,  1, 12, 13,  2,  3, 14, 15
639cabdff1aSopenharmony_ci    shufps        m0, m1, q2031     ; m0 =  6,  7, 18, 19,  4,  5, 16, 17
640cabdff1aSopenharmony_ci    shufps        m2, m1, q3131     ; m2 =  8,  9, 20, 21, 10, 11, 22, 23
641cabdff1aSopenharmony_ci                                    ; shuffle dwords:
642cabdff1aSopenharmony_ci    shufps        m1, m2, m3, q3120 ; m1 =  8,  9, 10, 11, 12, 13, 14, 15
643cabdff1aSopenharmony_ci    shufps        m3, m0,     q0220 ; m3 =  0,  1,  2,  3,  4,  5,  6,  7
644cabdff1aSopenharmony_ci    shufps        m0, m2,     q3113 ; m0 = 16, 17, 18, 19, 20, 21, 22, 23
    ; Registers are stored in output order: m3, m1, m0.
645cabdff1aSopenharmony_ci    mova  [dstq+0*mmsize], m3
646cabdff1aSopenharmony_ci    mova  [dstq+1*mmsize], m1
647cabdff1aSopenharmony_ci    mova  [dstq+2*mmsize], m0
648cabdff1aSopenharmony_ci%else ; sse
    ; SSE + MMX path: movlps fills only the low two floats of each xmm
    ; register, and cvtps2pi converts just those two into an MMX register.
649cabdff1aSopenharmony_ci    movlps      xmm0, [srcq      ]
650cabdff1aSopenharmony_ci    movlps      xmm1, [srcq+src1q]
651cabdff1aSopenharmony_ci    movlps      xmm2, [srcq+src2q]
652cabdff1aSopenharmony_ci    movlps      xmm3, [srcq+src3q]
653cabdff1aSopenharmony_ci    movlps      xmm4, [srcq+src4q]
654cabdff1aSopenharmony_ci    movlps      xmm5, [srcq+src5q]
655cabdff1aSopenharmony_ci    mulps       xmm0, xmm6
656cabdff1aSopenharmony_ci    mulps       xmm1, xmm6
657cabdff1aSopenharmony_ci    mulps       xmm2, xmm6
658cabdff1aSopenharmony_ci    mulps       xmm3, xmm6
659cabdff1aSopenharmony_ci    mulps       xmm4, xmm6
660cabdff1aSopenharmony_ci    mulps       xmm5, xmm6
661cabdff1aSopenharmony_ci    cvtps2pi     mm0, xmm0          ; low 2 floats -> 2 x s32
662cabdff1aSopenharmony_ci    cvtps2pi     mm1, xmm1
663cabdff1aSopenharmony_ci    cvtps2pi     mm2, xmm2
664cabdff1aSopenharmony_ci    cvtps2pi     mm3, xmm3
665cabdff1aSopenharmony_ci    cvtps2pi     mm4, xmm4
666cabdff1aSopenharmony_ci    cvtps2pi     mm5, xmm5
667cabdff1aSopenharmony_ci    packssdw     mm0, mm3           ; m0 =  0,  6,  3,  9
668cabdff1aSopenharmony_ci    packssdw     mm1, mm4           ; m1 =  1,  7,  4, 10
669cabdff1aSopenharmony_ci    packssdw     mm2, mm5           ; m2 =  2,  8,  5, 11
670cabdff1aSopenharmony_ci                                    ; unpack words
671cabdff1aSopenharmony_ci    pshufw       mm3, mm0, q1032    ; m3 =  3,  9,  0,  6
672cabdff1aSopenharmony_ci    punpcklwd    mm0, mm1           ; m0 =  0,  1,  6,  7
673cabdff1aSopenharmony_ci    punpckhwd    mm1, mm2           ; m1 =  4,  5, 10, 11
674cabdff1aSopenharmony_ci    punpcklwd    mm2, mm3           ; m2 =  2,  3,  8,  9
675cabdff1aSopenharmony_ci                                    ; unpack dwords
676cabdff1aSopenharmony_ci    pshufw       mm3, mm0, q1032    ; m3 =  6,  7,  0,  1
677cabdff1aSopenharmony_ci    punpckldq    mm0, mm2           ; m0 =  0,  1,  2,  3 (final)
678cabdff1aSopenharmony_ci    punpckhdq    mm2, mm1           ; m2 =  8,  9, 10, 11 (final)
679cabdff1aSopenharmony_ci    punpckldq    mm1, mm3           ; m1 =  4,  5,  6,  7 (final)
680cabdff1aSopenharmony_ci    mova  [dstq+0*mmsize], mm0
681cabdff1aSopenharmony_ci    mova  [dstq+1*mmsize], mm1
682cabdff1aSopenharmony_ci    mova  [dstq+2*mmsize], mm2
683cabdff1aSopenharmony_ci%endif
684cabdff1aSopenharmony_ci    add       srcq, mmsize          ; next mmsize/4 floats in every plane
685cabdff1aSopenharmony_ci    add       dstq, mmsize*3        ; 6 channels * mmsize/4 samples * 2 bytes
686cabdff1aSopenharmony_ci    sub       lend, mmsize/4        ; samples per channel still to convert
687cabdff1aSopenharmony_ci    jg .loop
688cabdff1aSopenharmony_ci%if mmsize == 8
    ; MMX registers were used: reset the MMX/x87 state before returning.
689cabdff1aSopenharmony_ci    emms
690cabdff1aSopenharmony_ci    RET
691cabdff1aSopenharmony_ci%else
692cabdff1aSopenharmony_ci    REP_RET
693cabdff1aSopenharmony_ci%endif
694cabdff1aSopenharmony_ci%endmacro
695cabdff1aSopenharmony_ci
    ; INIT_MMX sse takes the %else (movlps/cvtps2pi) path above; the XMM
    ; expansions take the cpuflag(sse2) path.
696cabdff1aSopenharmony_ciINIT_MMX sse
697cabdff1aSopenharmony_ciCONV_FLTP_TO_S16_6CH
698cabdff1aSopenharmony_ciINIT_XMM sse2
699cabdff1aSopenharmony_ciCONV_FLTP_TO_S16_6CH
700cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL
701cabdff1aSopenharmony_ciINIT_XMM avx
702cabdff1aSopenharmony_ciCONV_FLTP_TO_S16_6CH
703cabdff1aSopenharmony_ci%endif
704cabdff1aSopenharmony_ci
705cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
706cabdff1aSopenharmony_ci; void ff_conv_fltp_to_flt_2ch(float *dst, float *const *src, int len,
707cabdff1aSopenharmony_ci;                              int channels);
;
; Interleaves two planar float channels into dst without changing sample
; values (no arithmetic is performed, only loads, SBUTTERFLYPS and stores).
;   dst      - interleaved output, written with mova
;   src      - array of plane pointers; entries [0] and [1] are read
;   len      - samples per channel
;   channels - not read by this routine
; Each iteration reads 2*mmsize bytes from each plane and writes 4*mmsize
; bytes. The loop is bottom-tested, so len is presumably a positive multiple
; of mmsize/2 samples -- confirm against the caller.
708cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
709cabdff1aSopenharmony_ci
710cabdff1aSopenharmony_ci%macro CONV_FLTP_TO_FLT_2CH 0
711cabdff1aSopenharmony_cicglobal conv_fltp_to_flt_2ch, 3,4,5, dst, src0, len, src1
712cabdff1aSopenharmony_ci    mov  src1q, [src0q+gprsize]     ; src1 = src[1]
713cabdff1aSopenharmony_ci    mov  src0q, [src0q]             ; src0 = src[0] (array pointer no longer needed)
714cabdff1aSopenharmony_ci    lea   lenq, [4*lend]            ; bytes in one source plane
715cabdff1aSopenharmony_ci    add  src0q, lenq                ; point both planes at their ends
716cabdff1aSopenharmony_ci    add  src1q, lenq
717cabdff1aSopenharmony_ci    lea   dstq, [dstq+2*lenq]       ; output is twice as large as one plane
718cabdff1aSopenharmony_ci    neg   lenq                      ; negative byte offset counting up to 0
719cabdff1aSopenharmony_ci.loop:
720cabdff1aSopenharmony_ci    mova    m0, [src0q+lenq       ]
721cabdff1aSopenharmony_ci    mova    m1, [src1q+lenq       ]
722cabdff1aSopenharmony_ci    mova    m2, [src0q+lenq+mmsize]
723cabdff1aSopenharmony_ci    mova    m3, [src1q+lenq+mmsize]
    ; SBUTTERFLYPS is defined outside this section; presumably it interleaves
    ; the two named registers (low half / high half) with m4 as scratch --
    ; confirm in the include that defines it.
724cabdff1aSopenharmony_ci    SBUTTERFLYPS 0, 1, 4
725cabdff1aSopenharmony_ci    SBUTTERFLYPS 2, 3, 4
726cabdff1aSopenharmony_ci    mova  [dstq+2*lenq+0*mmsize], m0 ; dst offset is 2x the plane offset
727cabdff1aSopenharmony_ci    mova  [dstq+2*lenq+1*mmsize], m1
728cabdff1aSopenharmony_ci    mova  [dstq+2*lenq+2*mmsize], m2
729cabdff1aSopenharmony_ci    mova  [dstq+2*lenq+3*mmsize], m3
730cabdff1aSopenharmony_ci    add   lenq, 2*mmsize            ; two vectors per plane consumed; flags feed jl
731cabdff1aSopenharmony_ci    jl .loop
732cabdff1aSopenharmony_ci    REP_RET
733cabdff1aSopenharmony_ci%endmacro
734cabdff1aSopenharmony_ci
    ; Expand for SSE and, when HAVE_AVX_EXTERNAL is set, AVX.
735cabdff1aSopenharmony_ciINIT_XMM sse
736cabdff1aSopenharmony_ciCONV_FLTP_TO_FLT_2CH
737cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL
738cabdff1aSopenharmony_ciINIT_XMM avx
739cabdff1aSopenharmony_ciCONV_FLTP_TO_FLT_2CH
740cabdff1aSopenharmony_ci%endif
741cabdff1aSopenharmony_ci
742cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
743cabdff1aSopenharmony_ci; void ff_conv_fltp_to_flt_6ch(float *dst, float *const *src, int len,
744cabdff1aSopenharmony_ci;                              int channels);
;
; Interleaves six planar float channels into dst without changing sample
; values.
;   dst      - interleaved output
;   src      - array of plane pointers; entries [0]..[5] are read
;   len      - samples per channel
;   channels - not read by this routine
; Each iteration handles mmsize/4 samples per channel and writes 6*mmsize
; bytes. The loop is bottom-tested, so len is presumably a positive multiple
; of mmsize/4 -- confirm against the caller.
745cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
746cabdff1aSopenharmony_ci
747cabdff1aSopenharmony_ci%macro CONV_FLTP_TO_FLT_6CH 0
    ; Only dst and src are loaded as arguments; len is named last in the
    ; register list and is set up explicitly below.
748cabdff1aSopenharmony_cicglobal conv_fltp_to_flt_6ch, 2,8,7, dst, src, src1, src2, src3, src4, src5, len
749cabdff1aSopenharmony_ci%if ARCH_X86_64
750cabdff1aSopenharmony_ci    mov     lend, r2d               ; copy the third argument into the len register
751cabdff1aSopenharmony_ci%else
    ; 32-bit build: use the third argument's memory operand (r2m) as the
    ; counter instead of a register.
752cabdff1aSopenharmony_ci    %define lend dword r2m
753cabdff1aSopenharmony_ci%endif
754cabdff1aSopenharmony_ci    mov    src1q, [srcq+1*gprsize]  ; fetch plane pointers src[1]..src[5]
755cabdff1aSopenharmony_ci    mov    src2q, [srcq+2*gprsize]
756cabdff1aSopenharmony_ci    mov    src3q, [srcq+3*gprsize]
757cabdff1aSopenharmony_ci    mov    src4q, [srcq+4*gprsize]
758cabdff1aSopenharmony_ci    mov    src5q, [srcq+5*gprsize]
759cabdff1aSopenharmony_ci    mov     srcq, [srcq]            ; src[0] last: srcq held the array pointer
760cabdff1aSopenharmony_ci    sub    src1q, srcq              ; keep src1..src5 as byte offsets from plane 0
761cabdff1aSopenharmony_ci    sub    src2q, srcq              ; so only srcq is advanced in the loop
762cabdff1aSopenharmony_ci    sub    src3q, srcq
763cabdff1aSopenharmony_ci    sub    src4q, srcq
764cabdff1aSopenharmony_ci    sub    src5q, srcq
765cabdff1aSopenharmony_ci.loop:
766cabdff1aSopenharmony_ci    mova      m0, [srcq      ]      ; one vector from each of the six planes
767cabdff1aSopenharmony_ci    mova      m1, [srcq+src1q]
768cabdff1aSopenharmony_ci    mova      m2, [srcq+src2q]
769cabdff1aSopenharmony_ci    mova      m3, [srcq+src3q]
770cabdff1aSopenharmony_ci    mova      m4, [srcq+src4q]
771cabdff1aSopenharmony_ci    mova      m5, [srcq+src5q]
772cabdff1aSopenharmony_ci%if cpuflag(sse4)
    ; XMM path. SBUTTERFLYPS is defined outside this section; presumably it
    ; pairs channels (0,1), (2,3), (4,5) sample by sample with m6 as scratch --
    ; confirm in the include that defines it.
773cabdff1aSopenharmony_ci    SBUTTERFLYPS 0, 1, 6
774cabdff1aSopenharmony_ci    SBUTTERFLYPS 2, 3, 6
775cabdff1aSopenharmony_ci    SBUTTERFLYPS 4, 5, 6
776cabdff1aSopenharmony_ci
    ; Regroup the channel pairs into output order. Each blendps must read
    ; m0/m1 before the following movlhps overwrites their upper halves.
777cabdff1aSopenharmony_ci    blendps   m6, m4, m0, 1100b     ; m6 = low half of m4, high half of m0
778cabdff1aSopenharmony_ci    movlhps   m0, m2                ; m0 = low half of m0, low half of m2
779cabdff1aSopenharmony_ci    movhlps   m4, m2                ; m4 = high half of m2, high half of m4
780cabdff1aSopenharmony_ci    blendps   m2, m5, m1, 1100b     ; m2 = low half of m5, high half of m1
781cabdff1aSopenharmony_ci    movlhps   m1, m3                ; m1 = low half of m1, low half of m3
782cabdff1aSopenharmony_ci    movhlps   m5, m3                ; m5 = high half of m3, high half of m5
783cabdff1aSopenharmony_ci
    ; Offsets are literal byte counts: six consecutive 16-byte vectors.
784cabdff1aSopenharmony_ci    movaps [dstq   ], m0
785cabdff1aSopenharmony_ci    movaps [dstq+16], m6
786cabdff1aSopenharmony_ci    movaps [dstq+32], m4
787cabdff1aSopenharmony_ci    movaps [dstq+48], m1
788cabdff1aSopenharmony_ci    movaps [dstq+64], m2
789cabdff1aSopenharmony_ci    movaps [dstq+80], m5
790cabdff1aSopenharmony_ci%else ; mmx
    ; MMX path: floats are moved as raw 32-bit values, two per register.
    ; SBUTTERFLY dq presumably leaves sample 0 pairs in m0/m2/m4 and sample 1
    ; pairs in m1/m3/m5, matching the store order below -- confirm.
791cabdff1aSopenharmony_ci    SBUTTERFLY dq, 0, 1, 6
792cabdff1aSopenharmony_ci    SBUTTERFLY dq, 2, 3, 6
793cabdff1aSopenharmony_ci    SBUTTERFLY dq, 4, 5, 6
794cabdff1aSopenharmony_ci
795cabdff1aSopenharmony_ci    movq   [dstq   ], m0
796cabdff1aSopenharmony_ci    movq   [dstq+ 8], m2
797cabdff1aSopenharmony_ci    movq   [dstq+16], m4
798cabdff1aSopenharmony_ci    movq   [dstq+24], m1
799cabdff1aSopenharmony_ci    movq   [dstq+32], m3
800cabdff1aSopenharmony_ci    movq   [dstq+40], m5
801cabdff1aSopenharmony_ci%endif
802cabdff1aSopenharmony_ci    add      srcq, mmsize           ; next mmsize/4 floats in every plane
803cabdff1aSopenharmony_ci    add      dstq, mmsize*6         ; six vectors were written
804cabdff1aSopenharmony_ci    sub      lend, mmsize/4         ; samples per channel still to copy
805cabdff1aSopenharmony_ci    jg .loop
806cabdff1aSopenharmony_ci%if mmsize == 8
    ; MMX registers were used: reset the MMX/x87 state before returning.
807cabdff1aSopenharmony_ci    emms
808cabdff1aSopenharmony_ci    RET
809cabdff1aSopenharmony_ci%else
810cabdff1aSopenharmony_ci    REP_RET
811cabdff1aSopenharmony_ci%endif
812cabdff1aSopenharmony_ci%endmacro
813cabdff1aSopenharmony_ci
    ; INIT_MMX mmx takes the %else path above; the XMM expansions take the
    ; cpuflag(sse4) path.
814cabdff1aSopenharmony_ciINIT_MMX mmx
815cabdff1aSopenharmony_ciCONV_FLTP_TO_FLT_6CH
816cabdff1aSopenharmony_ciINIT_XMM sse4
817cabdff1aSopenharmony_ciCONV_FLTP_TO_FLT_6CH
818cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL
819cabdff1aSopenharmony_ciINIT_XMM avx
820cabdff1aSopenharmony_ciCONV_FLTP_TO_FLT_6CH
821cabdff1aSopenharmony_ci%endif
822cabdff1aSopenharmony_ci
823cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
824cabdff1aSopenharmony_ci; void ff_conv_s16_to_s16p_2ch(int16_t *const *dst, int16_t *src, int len,
825cabdff1aSopenharmony_ci;                              int channels);
;
; De-interleaves a 2-channel int16 stream into two planes; sample values
; are not modified.
;   dst      - array of plane pointers; entries [0] and [1] are read
;   src      - interleaved input, read with mova
;   len      - samples per channel
;   channels - not read by this routine
; Each iteration reads 2*mmsize bytes of input and writes mmsize bytes to
; each plane. The loop is bottom-tested, so len is presumably a positive
; multiple of mmsize/2 samples -- confirm against the caller.
826cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
827cabdff1aSopenharmony_ci
828cabdff1aSopenharmony_ci%macro CONV_S16_TO_S16P_2CH 0
829cabdff1aSopenharmony_cicglobal conv_s16_to_s16p_2ch, 3,4,4, dst0, src, len, dst1
830cabdff1aSopenharmony_ci    lea       lenq, [2*lend]        ; bytes in one output plane
831cabdff1aSopenharmony_ci    mov      dst1q, [dst0q+gprsize] ; dst1 = dst[1]
832cabdff1aSopenharmony_ci    mov      dst0q, [dst0q        ] ; dst0 = dst[0] (array pointer no longer needed)
833cabdff1aSopenharmony_ci    lea       srcq, [srcq+2*lenq]   ; input is twice as large as one plane
834cabdff1aSopenharmony_ci    add      dst0q, lenq            ; point both planes at their ends
835cabdff1aSopenharmony_ci    add      dst1q, lenq
836cabdff1aSopenharmony_ci    neg       lenq                  ; negative byte offset counting up to 0
837cabdff1aSopenharmony_ci%if cpuflag(ssse3)
838cabdff1aSopenharmony_ci    mova        m3, [pb_deinterleave_words] ; pshufb control mask used in the loop
839cabdff1aSopenharmony_ci%endif
840cabdff1aSopenharmony_ci.loop:
841cabdff1aSopenharmony_ci    mova        m0, [srcq+2*lenq       ]  ; m0 =  0,  1,  2,  3,  4,  5,  6,  7
842cabdff1aSopenharmony_ci    mova        m1, [srcq+2*lenq+mmsize]  ; m1 =  8,  9, 10, 11, 12, 13, 14, 15
843cabdff1aSopenharmony_ci%if cpuflag(ssse3)
844cabdff1aSopenharmony_ci    pshufb      m0, m3                    ; m0 =  0,  2,  4,  6,  1,  3,  5,  7
845cabdff1aSopenharmony_ci    pshufb      m1, m3                    ; m1 =  8, 10, 12, 14,  9, 11, 13, 15
846cabdff1aSopenharmony_ci    SBUTTERFLY2 qdq, 0, 1, 2              ; m0 =  0,  2,  4,  6,  8, 10, 12, 14
847cabdff1aSopenharmony_ci                                          ; m1 =  1,  3,  5,  7,  9, 11, 13, 15
848cabdff1aSopenharmony_ci%else ; sse2
    ; Without pshufb: pair up same-channel words inside each dword, then
    ; split even/odd dwords across the two registers.
849cabdff1aSopenharmony_ci    pshuflw     m0, m0, q3120             ; m0 =  0,  2,  1,  3,  4,  5,  6,  7
850cabdff1aSopenharmony_ci    pshufhw     m0, m0, q3120             ; m0 =  0,  2,  1,  3,  4,  6,  5,  7
851cabdff1aSopenharmony_ci    pshuflw     m1, m1, q3120             ; m1 =  8, 10,  9, 11, 12, 13, 14, 15
852cabdff1aSopenharmony_ci    pshufhw     m1, m1, q3120             ; m1 =  8, 10,  9, 11, 12, 14, 13, 15
853cabdff1aSopenharmony_ci    DEINT2_PS    0, 1, 2                  ; m0 =  0,  2,  4,  6,  8, 10, 12, 14
854cabdff1aSopenharmony_ci                                          ; m1 =  1,  3,  5,  7,  9, 11, 13, 15
855cabdff1aSopenharmony_ci%endif
856cabdff1aSopenharmony_ci    mova  [dst0q+lenq], m0                ; even positions -> plane 0
857cabdff1aSopenharmony_ci    mova  [dst1q+lenq], m1                ; odd positions  -> plane 1
858cabdff1aSopenharmony_ci    add       lenq, mmsize                ; sets flags for jl: loop while offset < 0
859cabdff1aSopenharmony_ci    jl .loop
860cabdff1aSopenharmony_ci    REP_RET
861cabdff1aSopenharmony_ci%endmacro
862cabdff1aSopenharmony_ci
    ; Expand for SSE2, SSSE3 and, when HAVE_AVX_EXTERNAL is set, AVX.
863cabdff1aSopenharmony_ciINIT_XMM sse2
864cabdff1aSopenharmony_ciCONV_S16_TO_S16P_2CH
865cabdff1aSopenharmony_ciINIT_XMM ssse3
866cabdff1aSopenharmony_ciCONV_S16_TO_S16P_2CH
867cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL
868cabdff1aSopenharmony_ciINIT_XMM avx
869cabdff1aSopenharmony_ciCONV_S16_TO_S16P_2CH
870cabdff1aSopenharmony_ci%endif
871cabdff1aSopenharmony_ci
872cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
873cabdff1aSopenharmony_ci; void ff_conv_s16_to_s16p_6ch(int16_t *const *dst, int16_t *src, int len,
874cabdff1aSopenharmony_ci;                              int channels);
;
; De-interleaves a 6-channel int16 stream into six planes; sample values
; are not modified.
;   dst      - array of plane pointers; entries [0]..[5] are read
;   src      - interleaved input, read with mova
;   len      - samples per channel
;   channels - not read by this routine
; Each iteration reads 3*mmsize bytes of input and writes 8 bytes (movq /
; movhps) to each plane. The loop is bottom-tested, so len is presumably a
; positive multiple of mmsize/4 -- confirm against the caller.
875cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
876cabdff1aSopenharmony_ci
877cabdff1aSopenharmony_ci%macro CONV_S16_TO_S16P_6CH 0
878cabdff1aSopenharmony_ci%if ARCH_X86_64
879cabdff1aSopenharmony_cicglobal conv_s16_to_s16p_6ch, 3,8,5, dst, src, len, dst1, dst2, dst3, dst4, dst5
880cabdff1aSopenharmony_ci%else
    ; 32-bit build: all seven named registers hold pointers, so lend is
    ; redefined to the third argument's memory operand (r2m).
881cabdff1aSopenharmony_cicglobal conv_s16_to_s16p_6ch, 2,7,5, dst, src, dst1, dst2, dst3, dst4, dst5
882cabdff1aSopenharmony_ci%define lend dword r2m
883cabdff1aSopenharmony_ci%endif
884cabdff1aSopenharmony_ci    mov     dst1q, [dstq+  gprsize]     ; fetch plane pointers dst[1]..dst[5]
885cabdff1aSopenharmony_ci    mov     dst2q, [dstq+2*gprsize]
886cabdff1aSopenharmony_ci    mov     dst3q, [dstq+3*gprsize]
887cabdff1aSopenharmony_ci    mov     dst4q, [dstq+4*gprsize]
888cabdff1aSopenharmony_ci    mov     dst5q, [dstq+5*gprsize]
889cabdff1aSopenharmony_ci    mov      dstq, [dstq          ]     ; dst[0] last: dstq held the array pointer
890cabdff1aSopenharmony_ci    sub     dst1q, dstq                 ; keep dst1..dst5 as byte offsets from plane 0
891cabdff1aSopenharmony_ci    sub     dst2q, dstq                 ; so only dstq is advanced in the loop
892cabdff1aSopenharmony_ci    sub     dst3q, dstq
893cabdff1aSopenharmony_ci    sub     dst4q, dstq
894cabdff1aSopenharmony_ci    sub     dst5q, dstq
895cabdff1aSopenharmony_ci.loop:
    ; The numbers in the comments below are positions in the interleaved
    ; input (position = 6*sample + channel).
896cabdff1aSopenharmony_ci    mova       m0, [srcq+0*mmsize]      ; m0 =  0,  1,  2,  3,  4,  5,  6,  7
897cabdff1aSopenharmony_ci    mova       m3, [srcq+1*mmsize]      ; m3 =  8,  9, 10, 11, 12, 13, 14, 15
898cabdff1aSopenharmony_ci    mova       m2, [srcq+2*mmsize]      ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
899cabdff1aSopenharmony_ci    PALIGNR    m1, m3, m0, 12, m4       ; m1 =  6,  7,  8,  9, 10, 11,  x,  x
900cabdff1aSopenharmony_ci    shufps     m3, m2, q1032            ; m3 = 12, 13, 14, 15, 16, 17, 18, 19
901cabdff1aSopenharmony_ci    psrldq     m2, 4                    ; m2 = 18, 19, 20, 21, 22, 23,  x,  x
902cabdff1aSopenharmony_ci    SBUTTERFLY2 wd, 0, 1, 4             ; m0 =  0,  6,  1,  7,  2,  8,  3,  9
903cabdff1aSopenharmony_ci                                        ; m1 =  4, 10,  5, 11,  x,  x,  x,  x
904cabdff1aSopenharmony_ci    SBUTTERFLY2 wd, 3, 2, 4             ; m3 = 12, 18, 13, 19, 14, 20, 15, 21
905cabdff1aSopenharmony_ci                                        ; m2 = 16, 22, 17, 23,  x,  x,  x,  x
906cabdff1aSopenharmony_ci    SBUTTERFLY2 dq, 0, 3, 4             ; m0 =  0,  6, 12, 18,  1,  7, 13, 19
907cabdff1aSopenharmony_ci                                        ; m3 =  2,  8, 14, 20,  3,  9, 15, 21
908cabdff1aSopenharmony_ci    punpckldq  m1, m2                   ; m1 =  4, 10, 16, 22,  5, 11, 17, 23
    ; Each register now holds two channels: low 8 bytes, then high 8 bytes.
909cabdff1aSopenharmony_ci    movq    [dstq      ], m0            ; channel 0
910cabdff1aSopenharmony_ci    movhps  [dstq+dst1q], m0            ; channel 1
911cabdff1aSopenharmony_ci    movq    [dstq+dst2q], m3            ; channel 2
912cabdff1aSopenharmony_ci    movhps  [dstq+dst3q], m3            ; channel 3
913cabdff1aSopenharmony_ci    movq    [dstq+dst4q], m1            ; channel 4
914cabdff1aSopenharmony_ci    movhps  [dstq+dst5q], m1            ; channel 5
915cabdff1aSopenharmony_ci    add      srcq, mmsize*3             ; three input vectors consumed
916cabdff1aSopenharmony_ci    add      dstq, mmsize/2             ; 8 bytes written to every plane
917cabdff1aSopenharmony_ci    sub      lend, mmsize/4             ; samples per channel still to copy
918cabdff1aSopenharmony_ci    jg .loop
919cabdff1aSopenharmony_ci    REP_RET
920cabdff1aSopenharmony_ci%endmacro
921cabdff1aSopenharmony_ci
    ; Expand for SSE2, SSSE3 and, when HAVE_AVX_EXTERNAL is set, AVX.
922cabdff1aSopenharmony_ciINIT_XMM sse2
923cabdff1aSopenharmony_ciCONV_S16_TO_S16P_6CH
924cabdff1aSopenharmony_ciINIT_XMM ssse3
925cabdff1aSopenharmony_ciCONV_S16_TO_S16P_6CH
926cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL
927cabdff1aSopenharmony_ciINIT_XMM avx
928cabdff1aSopenharmony_ciCONV_S16_TO_S16P_6CH
929cabdff1aSopenharmony_ci%endif
930cabdff1aSopenharmony_ci
931cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
932cabdff1aSopenharmony_ci; void ff_conv_s16_to_fltp_2ch(float *const *dst, int16_t *src, int len,
933cabdff1aSopenharmony_ci;                              int channels);
;
; De-interleaves a 2-channel int16 stream and converts it to two float planes.
;   dst      - array of plane pointers; entries [0] and [1] are read
;   src      - interleaved input, read with mova
;   len      - samples per channel
;   channels - not read by this routine
; Each sample is placed in the upper 16 bits of a 32-bit lane (i.e. scaled by
; 2^16) before conversion, which is why the multiplier is pf_s32_inv_scale
; (0x30000000 = 2^-31) rather than the s16 constant.
; Each iteration handles mmsize/4 samples per channel. The loop is
; bottom-tested, so len is presumably a positive multiple of that -- confirm
; against the caller.
934cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
935cabdff1aSopenharmony_ci
936cabdff1aSopenharmony_ci%macro CONV_S16_TO_FLTP_2CH 0
937cabdff1aSopenharmony_cicglobal conv_s16_to_fltp_2ch, 3,4,5, dst0, src, len, dst1
938cabdff1aSopenharmony_ci    lea       lenq, [4*lend]        ; 4*len = bytes in one float plane = bytes of 2ch s16 input
939cabdff1aSopenharmony_ci    mov      dst1q, [dst0q+gprsize] ; dst1 = dst[1]
940cabdff1aSopenharmony_ci    mov      dst0q, [dst0q        ] ; dst0 = dst[0] (array pointer no longer needed)
941cabdff1aSopenharmony_ci    add       srcq, lenq            ; point all three buffers at their ends
942cabdff1aSopenharmony_ci    add      dst0q, lenq
943cabdff1aSopenharmony_ci    add      dst1q, lenq
944cabdff1aSopenharmony_ci    neg       lenq                  ; negative byte offset counting up to 0
945cabdff1aSopenharmony_ci    mova        m3, [pf_s32_inv_scale] ; loop-invariant multiplier
    ; pw_zero_even is defined outside this section; presumably a mask that
    ; clears the low word of every dword -- confirm at its definition.
946cabdff1aSopenharmony_ci    mova        m4, [pw_zero_even]
947cabdff1aSopenharmony_ci.loop:
948cabdff1aSopenharmony_ci    mova        m1, [srcq+lenq]     ; each dword = one stereo frame (two s16)
949cabdff1aSopenharmony_ci    pslld       m0, m1, 16          ; m0: low word moved to the top half, low half zero
950cabdff1aSopenharmony_ci    pand        m1, m4              ; m1: masked copy of the same frames
951cabdff1aSopenharmony_ci    cvtdq2ps    m0, m0              ; s32 -> float
952cabdff1aSopenharmony_ci    cvtdq2ps    m1, m1
953cabdff1aSopenharmony_ci    mulps       m0, m0, m3          ; apply pf_s32_inv_scale
954cabdff1aSopenharmony_ci    mulps       m1, m1, m3
955cabdff1aSopenharmony_ci    mova  [dst0q+lenq], m0          ; first sample of each frame  -> plane 0
956cabdff1aSopenharmony_ci    mova  [dst1q+lenq], m1          ; second sample of each frame -> plane 1
957cabdff1aSopenharmony_ci    add       lenq, mmsize          ; sets flags for jl: loop while offset < 0
958cabdff1aSopenharmony_ci    jl .loop
959cabdff1aSopenharmony_ci    REP_RET
960cabdff1aSopenharmony_ci%endmacro
961cabdff1aSopenharmony_ci
    ; Expand for SSE2 and, when HAVE_AVX_EXTERNAL is set, AVX.
962cabdff1aSopenharmony_ciINIT_XMM sse2
963cabdff1aSopenharmony_ciCONV_S16_TO_FLTP_2CH
964cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL
965cabdff1aSopenharmony_ciINIT_XMM avx
966cabdff1aSopenharmony_ciCONV_S16_TO_FLTP_2CH
967cabdff1aSopenharmony_ci%endif
968cabdff1aSopenharmony_ci
969cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
970cabdff1aSopenharmony_ci; void ff_conv_s16_to_fltp_6ch(float *const *dst, int16_t *src, int len,
971cabdff1aSopenharmony_ci;                              int channels);
;
; De-interleaves a 6-channel int16 stream and converts it to six float
; planes, multiplying by pf_s16_inv_scale (0x38000000 = 2^-15).
;   dst      - array of plane pointers; entries [0]..[5] are read
;   src      - interleaved input, read with mova
;   len      - samples per channel
;   channels - not read by this routine
; Each iteration reads 3*mmsize bytes of input and writes mmsize bytes to
; each plane. The loop is bottom-tested, so len is presumably a positive
; multiple of mmsize/4 -- confirm against the caller.
972cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
973cabdff1aSopenharmony_ci
974cabdff1aSopenharmony_ci%macro CONV_S16_TO_FLTP_6CH 0
975cabdff1aSopenharmony_ci%if ARCH_X86_64
976cabdff1aSopenharmony_cicglobal conv_s16_to_fltp_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5
977cabdff1aSopenharmony_ci%else
    ; 32-bit build: all seven named registers hold pointers, so lend is
    ; redefined to the third argument's memory operand (r2m).
978cabdff1aSopenharmony_cicglobal conv_s16_to_fltp_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
979cabdff1aSopenharmony_ci%define lend dword r2m
980cabdff1aSopenharmony_ci%endif
981cabdff1aSopenharmony_ci    mov     dst1q, [dstq+  gprsize] ; fetch plane pointers dst[1]..dst[5]
982cabdff1aSopenharmony_ci    mov     dst2q, [dstq+2*gprsize]
983cabdff1aSopenharmony_ci    mov     dst3q, [dstq+3*gprsize]
984cabdff1aSopenharmony_ci    mov     dst4q, [dstq+4*gprsize]
985cabdff1aSopenharmony_ci    mov     dst5q, [dstq+5*gprsize]
986cabdff1aSopenharmony_ci    mov      dstq, [dstq          ] ; dst[0] last: dstq held the array pointer
987cabdff1aSopenharmony_ci    sub     dst1q, dstq             ; keep dst1..dst5 as byte offsets from plane 0
988cabdff1aSopenharmony_ci    sub     dst2q, dstq             ; so only dstq is advanced in the loop
989cabdff1aSopenharmony_ci    sub     dst3q, dstq
990cabdff1aSopenharmony_ci    sub     dst4q, dstq
991cabdff1aSopenharmony_ci    sub     dst5q, dstq
992cabdff1aSopenharmony_ci    mova       m6, [pf_s16_inv_scale] ; loop-invariant multiplier
993cabdff1aSopenharmony_ci.loop:
    ; The numbers in the comments below are positions in the interleaved
    ; input (position = 6*sample + channel).
994cabdff1aSopenharmony_ci    mova       m0, [srcq+0*mmsize]  ; m0 =  0,  1,  2,  3,  4,  5,  6,  7
995cabdff1aSopenharmony_ci    mova       m3, [srcq+1*mmsize]  ; m3 =  8,  9, 10, 11, 12, 13, 14, 15
996cabdff1aSopenharmony_ci    mova       m2, [srcq+2*mmsize]  ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
997cabdff1aSopenharmony_ci    PALIGNR    m1, m3, m0, 12, m4   ; m1 =  6,  7,  8,  9, 10, 11,  x,  x
998cabdff1aSopenharmony_ci    shufps     m3, m2, q1032        ; m3 = 12, 13, 14, 15, 16, 17, 18, 19
999cabdff1aSopenharmony_ci    psrldq     m2, 4                ; m2 = 18, 19, 20, 21, 22, 23,  x,  x
1000cabdff1aSopenharmony_ci    SBUTTERFLY2 wd, 0, 1, 4         ; m0 =  0,  6,  1,  7,  2,  8,  3,  9
1001cabdff1aSopenharmony_ci                                    ; m1 =  4, 10,  5, 11,  x,  x,  x,  x
1002cabdff1aSopenharmony_ci    SBUTTERFLY2 wd, 3, 2, 4         ; m3 = 12, 18, 13, 19, 14, 20, 15, 21
1003cabdff1aSopenharmony_ci                                    ; m2 = 16, 22, 17, 23,  x,  x,  x,  x
1004cabdff1aSopenharmony_ci    SBUTTERFLY2 dq, 0, 3, 4         ; m0 =  0,  6, 12, 18,  1,  7, 13, 19
1005cabdff1aSopenharmony_ci                                    ; m3 =  2,  8, 14, 20,  3,  9, 15, 21
1006cabdff1aSopenharmony_ci    punpckldq  m1, m2               ; m1 =  4, 10, 16, 22,  5, 11, 17, 23
    ; S16_TO_S32_SX is defined outside this section; presumably it widens the
    ; eight s16 of the first register into two registers of four s32 -- confirm.
1007cabdff1aSopenharmony_ci    S16_TO_S32_SX 0, 2              ; m0 =      0,      6,     12,     18
1008cabdff1aSopenharmony_ci                                    ; m2 =      1,      7,     13,     19
1009cabdff1aSopenharmony_ci    S16_TO_S32_SX 3, 4              ; m3 =      2,      8,     14,     20
1010cabdff1aSopenharmony_ci                                    ; m4 =      3,      9,     15,     21
1011cabdff1aSopenharmony_ci    S16_TO_S32_SX 1, 5              ; m1 =      4,     10,     16,     22
1012cabdff1aSopenharmony_ci                                    ; m5 =      5,     11,     17,     23
    ; Rename registers; the intent appears to be that m0..m5 hold channels
    ; 0..5 in order, matching the stores below -- confirm SWAP semantics.
1013cabdff1aSopenharmony_ci    SWAP 1,2,3,4
1014cabdff1aSopenharmony_ci    cvtdq2ps   m0, m0               ; s32 -> float
1015cabdff1aSopenharmony_ci    cvtdq2ps   m1, m1
1016cabdff1aSopenharmony_ci    cvtdq2ps   m2, m2
1017cabdff1aSopenharmony_ci    cvtdq2ps   m3, m3
1018cabdff1aSopenharmony_ci    cvtdq2ps   m4, m4
1019cabdff1aSopenharmony_ci    cvtdq2ps   m5, m5
1020cabdff1aSopenharmony_ci    mulps      m0, m6               ; apply pf_s16_inv_scale
1021cabdff1aSopenharmony_ci    mulps      m1, m6
1022cabdff1aSopenharmony_ci    mulps      m2, m6
1023cabdff1aSopenharmony_ci    mulps      m3, m6
1024cabdff1aSopenharmony_ci    mulps      m4, m6
1025cabdff1aSopenharmony_ci    mulps      m5, m6
1026cabdff1aSopenharmony_ci    mova  [dstq      ], m0          ; one vector of floats to each plane
1027cabdff1aSopenharmony_ci    mova  [dstq+dst1q], m1
1028cabdff1aSopenharmony_ci    mova  [dstq+dst2q], m2
1029cabdff1aSopenharmony_ci    mova  [dstq+dst3q], m3
1030cabdff1aSopenharmony_ci    mova  [dstq+dst4q], m4
1031cabdff1aSopenharmony_ci    mova  [dstq+dst5q], m5
1032cabdff1aSopenharmony_ci    add      srcq, mmsize*3         ; three input vectors consumed
1033cabdff1aSopenharmony_ci    add      dstq, mmsize           ; one vector written to every plane
1034cabdff1aSopenharmony_ci    sub      lend, mmsize/4         ; samples per channel still to convert
1035cabdff1aSopenharmony_ci    jg .loop
1036cabdff1aSopenharmony_ci    REP_RET
1037cabdff1aSopenharmony_ci%endmacro
1038cabdff1aSopenharmony_ci
    ; Expand for SSE2, SSSE3, SSE4 and, when HAVE_AVX_EXTERNAL is set, AVX.
    ; The macro body contains no cpuflag() test of its own; any per-flavour
    ; difference comes from the helper macros it invokes.
1039cabdff1aSopenharmony_ciINIT_XMM sse2
1040cabdff1aSopenharmony_ciCONV_S16_TO_FLTP_6CH
1041cabdff1aSopenharmony_ciINIT_XMM ssse3
1042cabdff1aSopenharmony_ciCONV_S16_TO_FLTP_6CH
1043cabdff1aSopenharmony_ciINIT_XMM sse4
1044cabdff1aSopenharmony_ciCONV_S16_TO_FLTP_6CH
1045cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL
1046cabdff1aSopenharmony_ciINIT_XMM avx
1047cabdff1aSopenharmony_ciCONV_S16_TO_FLTP_6CH
1048cabdff1aSopenharmony_ci%endif
1049cabdff1aSopenharmony_ci
1050cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
1051cabdff1aSopenharmony_ci; void ff_conv_flt_to_s16p_2ch(int16_t *const *dst, float *src, int len,
1052cabdff1aSopenharmony_ci;                              int channels);
;
; Splits interleaved 2-channel float input into two planar int16 outputs.
; Each sample is multiplied by the pf_s16_scale constant, converted to int32
; and then packed to int16 with signed saturation (packssdw).
;
;   dst      - array of two plane pointers; dst[0] and dst[1] are loaded below
;   src      - interleaved float samples (ch0, ch1, ch0, ch1, ...)
;   len      - samples per channel. One iteration produces mmsize/2 int16
;              samples per plane (8 with INIT_XMM) and the body runs before
;              the counter is tested, so len is presumably a positive
;              multiple of 8 - TODO confirm against the C-side caller.
;   channels - not read by this code
;
; NOTE(review): every load/store uses mova (x86inc's aligned move), so src and
; both planes are presumably required to be mmsize-aligned - confirm.
1053cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
1054cabdff1aSopenharmony_ci
1055cabdff1aSopenharmony_ci%macro CONV_FLT_TO_S16P_2CH 0
1056cabdff1aSopenharmony_cicglobal conv_flt_to_s16p_2ch, 3,4,6, dst0, src, len, dst1
1057cabdff1aSopenharmony_ci    lea       lenq, [2*lend]            ; lenq = 2*len = bytes per int16 plane
1058cabdff1aSopenharmony_ci    mov      dst1q, [dst0q+gprsize]     ; dst1q = dst[1]
1059cabdff1aSopenharmony_ci    mov      dst0q, [dst0q        ]     ; dst0q = dst[0] (overwrites the array pointer)
1060cabdff1aSopenharmony_ci    lea       srcq, [srcq+4*lenq]       ; srcq = end of input: 8*len bytes past src
1061cabdff1aSopenharmony_ci    add      dst0q, lenq                ; dst0q = end of plane 0
1062cabdff1aSopenharmony_ci    add      dst1q, lenq                ; dst1q = end of plane 1
1063cabdff1aSopenharmony_ci    neg       lenq                      ; lenq = negative byte offset, counts up to 0
1064cabdff1aSopenharmony_ci    mova        m5, [pf_s16_scale]      ; scale vector (constant defined outside this chunk)
1065cabdff1aSopenharmony_ci.loop:
    ; srcq+4*lenq walks the input at four times the rate of the output offset:
    ; four vectors of floats in, one vector of int16 out per plane.
1066cabdff1aSopenharmony_ci    mova       m0, [srcq+4*lenq         ]
1067cabdff1aSopenharmony_ci    mova       m1, [srcq+4*lenq+  mmsize]
1068cabdff1aSopenharmony_ci    mova       m2, [srcq+4*lenq+2*mmsize]
1069cabdff1aSopenharmony_ci    mova       m3, [srcq+4*lenq+3*mmsize]
    ; DEINT2_PS is defined outside this chunk. Judging by the stores below it
    ; is expected to leave channel-0 samples in its first register and
    ; channel-1 samples in its second, using m4 as scratch - TODO confirm.
1070cabdff1aSopenharmony_ci    DEINT2_PS   0, 1, 4
1071cabdff1aSopenharmony_ci    DEINT2_PS   2, 3, 4
1072cabdff1aSopenharmony_ci    mulps      m0, m0, m5               ; apply scale to all four vectors
1073cabdff1aSopenharmony_ci    mulps      m1, m1, m5
1074cabdff1aSopenharmony_ci    mulps      m2, m2, m5
1075cabdff1aSopenharmony_ci    mulps      m3, m3, m5
1076cabdff1aSopenharmony_ci    cvtps2dq   m0, m0                   ; float -> int32
1077cabdff1aSopenharmony_ci    cvtps2dq   m1, m1
1078cabdff1aSopenharmony_ci    cvtps2dq   m2, m2
1079cabdff1aSopenharmony_ci    cvtps2dq   m3, m3
1080cabdff1aSopenharmony_ci    packssdw   m0, m2                   ; m0 = int16 samples from m0 then m2 (stored to plane 0)
1081cabdff1aSopenharmony_ci    packssdw   m1, m3                   ; m1 = int16 samples from m1 then m3 (stored to plane 1)
1082cabdff1aSopenharmony_ci    mova  [dst0q+lenq], m0
1083cabdff1aSopenharmony_ci    mova  [dst1q+lenq], m1
1084cabdff1aSopenharmony_ci    add      lenq, mmsize               ; advance one output vector; flags drive the jl
1085cabdff1aSopenharmony_ci    jl .loop                            ; continue while the offset is still negative
1086cabdff1aSopenharmony_ci    REP_RET
1087cabdff1aSopenharmony_ci%endmacro
1088cabdff1aSopenharmony_ci
; One expansion per instruction set; the AVX one only if HAVE_AVX_EXTERNAL.
1089cabdff1aSopenharmony_ciINIT_XMM sse2
1090cabdff1aSopenharmony_ciCONV_FLT_TO_S16P_2CH
1091cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL
1092cabdff1aSopenharmony_ciINIT_XMM avx
1093cabdff1aSopenharmony_ciCONV_FLT_TO_S16P_2CH
1094cabdff1aSopenharmony_ci%endif
1095cabdff1aSopenharmony_ci
1096cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
1097cabdff1aSopenharmony_ci; void ff_conv_flt_to_s16p_6ch(int16_t *const *dst, float *src, int len,
1098cabdff1aSopenharmony_ci;                              int channels);
;
; Splits interleaved 6-channel float input into six planar int16 outputs.
; Each sample is multiplied by the pf_s16_scale constant, converted to int32
; and packed to int16 with signed saturation before being de-interleaved.
;
;   dst      - array of six plane pointers, dst[0] .. dst[5]
;   src      - interleaved float samples, 6 per frame
;   len      - frames (samples per channel). One iteration handles mmsize/4
;              frames (4 with INIT_XMM) and the body runs before the counter
;              is tested, so len is presumably a positive multiple of 4 -
;              TODO confirm against the C-side caller.
;   channels - not read by this code
;
; The numbers in the lane comments below are sample indices within the
; 24 interleaved input values of one iteration (index = frame*6 + channel);
; 'x' marks lanes whose contents are not used.
1099cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
1100cabdff1aSopenharmony_ci
1101cabdff1aSopenharmony_ci%macro CONV_FLT_TO_S16P_6CH 0
; The 64-bit variant requests 8 GPRs and keeps len in a register. The 32-bit
; variant requests 7 GPRs, loads only dst and src, and makes lend refer to
; r2m, i.e. len is read and updated in place in its argument slot.
1102cabdff1aSopenharmony_ci%if ARCH_X86_64
1103cabdff1aSopenharmony_cicglobal conv_flt_to_s16p_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5
1104cabdff1aSopenharmony_ci%else
1105cabdff1aSopenharmony_cicglobal conv_flt_to_s16p_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
1106cabdff1aSopenharmony_ci%define lend dword r2m
1107cabdff1aSopenharmony_ci%endif
    ; Fetch the six plane pointers; dst[0] is loaded last because doing so
    ; overwrites the pointer to the array itself.
1108cabdff1aSopenharmony_ci    mov     dst1q, [dstq+  gprsize]
1109cabdff1aSopenharmony_ci    mov     dst2q, [dstq+2*gprsize]
1110cabdff1aSopenharmony_ci    mov     dst3q, [dstq+3*gprsize]
1111cabdff1aSopenharmony_ci    mov     dst4q, [dstq+4*gprsize]
1112cabdff1aSopenharmony_ci    mov     dst5q, [dstq+5*gprsize]
1113cabdff1aSopenharmony_ci    mov      dstq, [dstq          ]
    ; Turn dst1..dst5 into byte offsets relative to plane 0, so that bumping
    ; dstq alone advances the write position in all six planes.
1114cabdff1aSopenharmony_ci    sub     dst1q, dstq
1115cabdff1aSopenharmony_ci    sub     dst2q, dstq
1116cabdff1aSopenharmony_ci    sub     dst3q, dstq
1117cabdff1aSopenharmony_ci    sub     dst4q, dstq
1118cabdff1aSopenharmony_ci    sub     dst5q, dstq
1119cabdff1aSopenharmony_ci    mova       m6, [pf_s16_scale]   ; scale vector (constant defined outside this chunk)
1120cabdff1aSopenharmony_ci.loop:
    ; Scale six consecutive input vectors. Destinations are assigned as
    ; m0,m3 / m1,m4 / m2,m5 so that each packssdw below joins two adjacent
    ; vectors into one register of eight consecutive samples.
1121cabdff1aSopenharmony_ci    mulps      m0, m6, [srcq+0*mmsize]
1122cabdff1aSopenharmony_ci    mulps      m3, m6, [srcq+1*mmsize]
1123cabdff1aSopenharmony_ci    mulps      m1, m6, [srcq+2*mmsize]
1124cabdff1aSopenharmony_ci    mulps      m4, m6, [srcq+3*mmsize]
1125cabdff1aSopenharmony_ci    mulps      m2, m6, [srcq+4*mmsize]
1126cabdff1aSopenharmony_ci    mulps      m5, m6, [srcq+5*mmsize]
1127cabdff1aSopenharmony_ci    cvtps2dq   m0, m0               ; float -> int32
1128cabdff1aSopenharmony_ci    cvtps2dq   m1, m1
1129cabdff1aSopenharmony_ci    cvtps2dq   m2, m2
1130cabdff1aSopenharmony_ci    cvtps2dq   m3, m3
1131cabdff1aSopenharmony_ci    cvtps2dq   m4, m4
1132cabdff1aSopenharmony_ci    cvtps2dq   m5, m5
1133cabdff1aSopenharmony_ci    packssdw   m0, m3               ; m0 =  0,  1,  2,  3,  4,  5,  6,  7
1134cabdff1aSopenharmony_ci    packssdw   m1, m4               ; m1 =  8,  9, 10, 11, 12, 13, 14, 15
1135cabdff1aSopenharmony_ci    packssdw   m2, m5               ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
    ; m3, m4 and m5 are free again after the packs; m4 serves as the scratch
    ; operand of the PALIGNR and SBUTTERFLY2 macros from here on.
1136cabdff1aSopenharmony_ci    PALIGNR    m3, m1, m0, 12, m4   ; m3 =  6,  7,  8,  9, 10, 11,  x,  x
1137cabdff1aSopenharmony_ci    shufps     m1, m2, q1032        ; m1 = 12, 13, 14, 15, 16, 17, 18, 19
1138cabdff1aSopenharmony_ci    psrldq     m2, 4                ; m2 = 18, 19, 20, 21, 22, 23,  x,  x
1139cabdff1aSopenharmony_ci    SBUTTERFLY2 wd, 0, 3, 4         ; m0 =  0,  6,  1,  7,  2,  8,  3,  9
1140cabdff1aSopenharmony_ci                                    ; m3 =  4, 10,  5, 11,  x,  x,  x,  x
1141cabdff1aSopenharmony_ci    SBUTTERFLY2 wd, 1, 2, 4         ; m1 = 12, 18, 13, 19, 14, 20, 15, 21
1142cabdff1aSopenharmony_ci                                    ; m2 = 16, 22, 17, 23,  x,  x,  x,  x
1143cabdff1aSopenharmony_ci    SBUTTERFLY2 dq, 0, 1, 4         ; m0 =  0,  6, 12, 18,  1,  7, 13, 19
1144cabdff1aSopenharmony_ci                                    ; m1 =  2,  8, 14, 20,  3,  9, 15, 21
1145cabdff1aSopenharmony_ci    punpckldq  m3, m2               ; m3 =  4, 10, 16, 22,  5, 11, 17, 23
    ; Each register now holds two channels: the low 8 bytes (movq) and the
    ; high 8 bytes (movhps) are four int16 samples of one channel each.
1146cabdff1aSopenharmony_ci    movq    [dstq      ], m0
1147cabdff1aSopenharmony_ci    movhps  [dstq+dst1q], m0
1148cabdff1aSopenharmony_ci    movq    [dstq+dst2q], m1
1149cabdff1aSopenharmony_ci    movhps  [dstq+dst3q], m1
1150cabdff1aSopenharmony_ci    movq    [dstq+dst4q], m3
1151cabdff1aSopenharmony_ci    movhps  [dstq+dst5q], m3
1152cabdff1aSopenharmony_ci    add      srcq, mmsize*6         ; consumed six input vectors
1153cabdff1aSopenharmony_ci    add      dstq, mmsize/2         ; wrote mmsize/2 bytes to every plane
1154cabdff1aSopenharmony_ci    sub      lend, mmsize/4         ; mmsize/4 frames done; flags drive the jg
1155cabdff1aSopenharmony_ci    jg .loop
1156cabdff1aSopenharmony_ci    REP_RET
1157cabdff1aSopenharmony_ci%endmacro
1158cabdff1aSopenharmony_ci
; One expansion per instruction set; the AVX one only if HAVE_AVX_EXTERNAL.
1159cabdff1aSopenharmony_ciINIT_XMM sse2
1160cabdff1aSopenharmony_ciCONV_FLT_TO_S16P_6CH
1161cabdff1aSopenharmony_ciINIT_XMM ssse3
1162cabdff1aSopenharmony_ciCONV_FLT_TO_S16P_6CH
1163cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL
1164cabdff1aSopenharmony_ciINIT_XMM avx
1165cabdff1aSopenharmony_ciCONV_FLT_TO_S16P_6CH
1166cabdff1aSopenharmony_ci%endif
1167cabdff1aSopenharmony_ci
1168cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
1169cabdff1aSopenharmony_ci; void ff_conv_flt_to_fltp_2ch(float *const *dst, float *src, int len,
1170cabdff1aSopenharmony_ci;                              int channels);
;
; Splits interleaved 2-channel float input into two planar float outputs.
; Sample values are copied unchanged; only their layout changes.
;
;   dst      - array of two plane pointers; dst[0] and dst[1] are loaded below
;   src      - interleaved float samples (ch0, ch1, ch0, ch1, ...)
;   len      - samples per channel. One iteration writes one vector per plane
;              (mmsize/4 floats, 4 with INIT_XMM) and the body runs before the
;              counter is tested, so len is presumably a positive multiple of
;              4 - TODO confirm against the C-side caller.
;   channels - not read by this code
;
; NOTE(review): every load/store uses mova (x86inc's aligned move), so src and
; both planes are presumably required to be mmsize-aligned - confirm.
1171cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
1172cabdff1aSopenharmony_ci
1173cabdff1aSopenharmony_ci%macro CONV_FLT_TO_FLTP_2CH 0
1174cabdff1aSopenharmony_cicglobal conv_flt_to_fltp_2ch, 3,4,3, dst0, src, len, dst1
1175cabdff1aSopenharmony_ci    lea    lenq, [4*lend]               ; lenq = 4*len = bytes per float plane
1176cabdff1aSopenharmony_ci    mov   dst1q, [dst0q+gprsize]        ; dst1q = dst[1]
1177cabdff1aSopenharmony_ci    mov   dst0q, [dst0q        ]        ; dst0q = dst[0] (overwrites the array pointer)
1178cabdff1aSopenharmony_ci    lea    srcq, [srcq+2*lenq]          ; srcq = end of input: 8*len bytes past src
1179cabdff1aSopenharmony_ci    add   dst0q, lenq                   ; dst0q = end of plane 0
1180cabdff1aSopenharmony_ci    add   dst1q, lenq                   ; dst1q = end of plane 1
1181cabdff1aSopenharmony_ci    neg    lenq                         ; lenq = negative byte offset, counts up to 0
1182cabdff1aSopenharmony_ci.loop:
    ; The input is read at twice the output offset: two vectors in, one
    ; vector out per plane.
1183cabdff1aSopenharmony_ci    mova     m0, [srcq+2*lenq       ]
1184cabdff1aSopenharmony_ci    mova     m1, [srcq+2*lenq+mmsize]
    ; DEINT2_PS is defined outside this chunk. Judging by the stores below it
    ; is expected to leave channel-0 samples in m0 and channel-1 samples in
    ; m1, using m2 as scratch - TODO confirm.
1185cabdff1aSopenharmony_ci    DEINT2_PS 0, 1, 2
1186cabdff1aSopenharmony_ci    mova  [dst0q+lenq], m0
1187cabdff1aSopenharmony_ci    mova  [dst1q+lenq], m1
1188cabdff1aSopenharmony_ci    add    lenq, mmsize                 ; advance one output vector; flags drive the jl
1189cabdff1aSopenharmony_ci    jl .loop                            ; continue while the offset is still negative
1190cabdff1aSopenharmony_ci    REP_RET
1191cabdff1aSopenharmony_ci%endmacro
1192cabdff1aSopenharmony_ci
; One expansion per instruction set; the AVX one only if HAVE_AVX_EXTERNAL.
1193cabdff1aSopenharmony_ciINIT_XMM sse
1194cabdff1aSopenharmony_ciCONV_FLT_TO_FLTP_2CH
1195cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL
1196cabdff1aSopenharmony_ciINIT_XMM avx
1197cabdff1aSopenharmony_ciCONV_FLT_TO_FLTP_2CH
1198cabdff1aSopenharmony_ci%endif
1199cabdff1aSopenharmony_ci
1200cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
1201cabdff1aSopenharmony_ci; void ff_conv_flt_to_fltp_6ch(float *const *dst, float *src, int len,
1202cabdff1aSopenharmony_ci;                              int channels);
;
; Splits interleaved 6-channel float input into six planar float outputs.
; Sample values are copied unchanged; only their layout changes.
;
;   dst      - array of six plane pointers, dst[0] .. dst[5]
;   src      - interleaved float samples, 6 per frame
;   len      - frames (samples per channel). One iteration handles mmsize/4
;              frames (4 with INIT_XMM) and the body runs before the counter
;              is tested, so len is presumably a positive multiple of 4 -
;              TODO confirm against the C-side caller.
;   channels - not read by this code
;
; The numbers in the lane comments below are sample indices within the
; 24 interleaved input values of one iteration (index = frame*6 + channel).
;
; NOTE(review): every load/store uses mova (x86inc's aligned move), so src and
; all planes are presumably required to be mmsize-aligned - confirm.
1203cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
1204cabdff1aSopenharmony_ci
1205cabdff1aSopenharmony_ci%macro CONV_FLT_TO_FLTP_6CH 0
; The 64-bit variant requests 8 GPRs and keeps len in a register. The 32-bit
; variant requests 7 GPRs, loads only dst and src, and makes lend refer to
; r2m, i.e. len is read and updated in place in its argument slot.
1206cabdff1aSopenharmony_ci%if ARCH_X86_64
1207cabdff1aSopenharmony_cicglobal conv_flt_to_fltp_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5
1208cabdff1aSopenharmony_ci%else
1209cabdff1aSopenharmony_cicglobal conv_flt_to_fltp_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
1210cabdff1aSopenharmony_ci%define lend dword r2m
1211cabdff1aSopenharmony_ci%endif
    ; Fetch the six plane pointers; dst[0] is loaded last because doing so
    ; overwrites the pointer to the array itself.
1212cabdff1aSopenharmony_ci    mov     dst1q, [dstq+  gprsize]
1213cabdff1aSopenharmony_ci    mov     dst2q, [dstq+2*gprsize]
1214cabdff1aSopenharmony_ci    mov     dst3q, [dstq+3*gprsize]
1215cabdff1aSopenharmony_ci    mov     dst4q, [dstq+4*gprsize]
1216cabdff1aSopenharmony_ci    mov     dst5q, [dstq+5*gprsize]
1217cabdff1aSopenharmony_ci    mov      dstq, [dstq          ]
    ; Turn dst1..dst5 into byte offsets relative to plane 0, so that bumping
    ; dstq alone advances the write position in all six planes.
1218cabdff1aSopenharmony_ci    sub     dst1q, dstq
1219cabdff1aSopenharmony_ci    sub     dst2q, dstq
1220cabdff1aSopenharmony_ci    sub     dst3q, dstq
1221cabdff1aSopenharmony_ci    sub     dst4q, dstq
1222cabdff1aSopenharmony_ci    sub     dst5q, dstq
1223cabdff1aSopenharmony_ci.loop:
1224cabdff1aSopenharmony_ci    mova       m0, [srcq+0*mmsize]  ; m0 =  0,  1,  2,  3
1225cabdff1aSopenharmony_ci    mova       m1, [srcq+1*mmsize]  ; m1 =  4,  5,  6,  7
1226cabdff1aSopenharmony_ci    mova       m2, [srcq+2*mmsize]  ; m2 =  8,  9, 10, 11
1227cabdff1aSopenharmony_ci    mova       m3, [srcq+3*mmsize]  ; m3 = 12, 13, 14, 15
1228cabdff1aSopenharmony_ci    mova       m4, [srcq+4*mmsize]  ; m4 = 16, 17, 18, 19
1229cabdff1aSopenharmony_ci    mova       m5, [srcq+5*mmsize]  ; m5 = 20, 21, 22, 23
1230cabdff1aSopenharmony_ci
    ; Two rounds of dword interleaves regroup the 24 values by channel.
    ; m6 is passed as the scratch operand of every SBUTTERFLY2 call.
1231cabdff1aSopenharmony_ci    SBUTTERFLY2 dq, 0, 3, 6         ; m0 =  0, 12,  1, 13
1232cabdff1aSopenharmony_ci                                    ; m3 =  2, 14,  3, 15
1233cabdff1aSopenharmony_ci    SBUTTERFLY2 dq, 1, 4, 6         ; m1 =  4, 16,  5, 17
1234cabdff1aSopenharmony_ci                                    ; m4 =  6, 18,  7, 19
1235cabdff1aSopenharmony_ci    SBUTTERFLY2 dq, 2, 5, 6         ; m2 =  8, 20,  9, 21
1236cabdff1aSopenharmony_ci                                    ; m5 = 10, 22, 11, 23
1237cabdff1aSopenharmony_ci    SBUTTERFLY2 dq, 0, 4, 6         ; m0 =  0,  6, 12, 18
1238cabdff1aSopenharmony_ci                                    ; m4 =  1,  7, 13, 19
1239cabdff1aSopenharmony_ci    SBUTTERFLY2 dq, 3, 2, 6         ; m3 =  2,  8, 14, 20
1240cabdff1aSopenharmony_ci                                    ; m2 =  3,  9, 15, 21
1241cabdff1aSopenharmony_ci    SBUTTERFLY2 dq, 1, 5, 6         ; m1 =  4, 10, 16, 22
1242cabdff1aSopenharmony_ci                                    ; m5 =  5, 11, 17, 23
    ; Channels 0..5 ended up in m0, m4, m3, m2, m1, m5 respectively.
1243cabdff1aSopenharmony_ci    mova [dstq      ], m0
1244cabdff1aSopenharmony_ci    mova [dstq+dst1q], m4
1245cabdff1aSopenharmony_ci    mova [dstq+dst2q], m3
1246cabdff1aSopenharmony_ci    mova [dstq+dst3q], m2
1247cabdff1aSopenharmony_ci    mova [dstq+dst4q], m1
1248cabdff1aSopenharmony_ci    mova [dstq+dst5q], m5
1249cabdff1aSopenharmony_ci    add      srcq, mmsize*6         ; consumed six input vectors
1250cabdff1aSopenharmony_ci    add      dstq, mmsize           ; wrote one full vector to every plane
1251cabdff1aSopenharmony_ci    sub      lend, mmsize/4         ; mmsize/4 frames done; flags drive the jg
1252cabdff1aSopenharmony_ci    jg .loop
1253cabdff1aSopenharmony_ci    REP_RET
1254cabdff1aSopenharmony_ci%endmacro
1255cabdff1aSopenharmony_ci
; One expansion per instruction set; the AVX one only if HAVE_AVX_EXTERNAL.
1256cabdff1aSopenharmony_ciINIT_XMM sse2
1257cabdff1aSopenharmony_ciCONV_FLT_TO_FLTP_6CH
1258cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL
1259cabdff1aSopenharmony_ciINIT_XMM avx
1260cabdff1aSopenharmony_ciCONV_FLT_TO_FLTP_6CH
1261cabdff1aSopenharmony_ci%endif
1262