1cabdff1aSopenharmony_ci;****************************************************************************** 2cabdff1aSopenharmony_ci;* x86 optimized Format Conversion Utils 3cabdff1aSopenharmony_ci;* Copyright (c) 2008 Loren Merritt 4cabdff1aSopenharmony_ci;* Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com> 5cabdff1aSopenharmony_ci;* 6cabdff1aSopenharmony_ci;* This file is part of FFmpeg. 7cabdff1aSopenharmony_ci;* 8cabdff1aSopenharmony_ci;* FFmpeg is free software; you can redistribute it and/or 9cabdff1aSopenharmony_ci;* modify it under the terms of the GNU Lesser General Public 10cabdff1aSopenharmony_ci;* License as published by the Free Software Foundation; either 11cabdff1aSopenharmony_ci;* version 2.1 of the License, or (at your option) any later version. 12cabdff1aSopenharmony_ci;* 13cabdff1aSopenharmony_ci;* FFmpeg is distributed in the hope that it will be useful, 14cabdff1aSopenharmony_ci;* but WITHOUT ANY WARRANTY; without even the implied warranty of 15cabdff1aSopenharmony_ci;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16cabdff1aSopenharmony_ci;* Lesser General Public License for more details. 17cabdff1aSopenharmony_ci;* 18cabdff1aSopenharmony_ci;* You should have received a copy of the GNU Lesser General Public 19cabdff1aSopenharmony_ci;* License along with FFmpeg; if not, write to the Free Software 20cabdff1aSopenharmony_ci;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 21cabdff1aSopenharmony_ci;****************************************************************************** 22cabdff1aSopenharmony_ci 23cabdff1aSopenharmony_ci%include "libavutil/x86/x86util.asm" 24cabdff1aSopenharmony_ci%include "util.asm" 25cabdff1aSopenharmony_ci 26cabdff1aSopenharmony_ciSECTION_RODATA 32 27cabdff1aSopenharmony_ci 28cabdff1aSopenharmony_cipf_s32_inv_scale: times 8 dd 0x30000000 29cabdff1aSopenharmony_cipf_s32_scale: times 8 dd 0x4f000000 30cabdff1aSopenharmony_cipf_s32_clip: times 8 dd 0x4effffff 31cabdff1aSopenharmony_cipf_s16_inv_scale: times 4 dd 0x38000000 32cabdff1aSopenharmony_cipf_s16_scale: times 4 dd 0x47000000 33cabdff1aSopenharmony_cipb_shuf_unpack_even: db -1, -1, 0, 1, -1, -1, 2, 3, -1, -1, 8, 9, -1, -1, 10, 11 34cabdff1aSopenharmony_cipb_shuf_unpack_odd: db -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 12, 13, -1, -1, 14, 15 35cabdff1aSopenharmony_cipb_interleave_words: SHUFFLE_MASK_W 0, 4, 1, 5, 2, 6, 3, 7 36cabdff1aSopenharmony_cipb_deinterleave_words: SHUFFLE_MASK_W 0, 2, 4, 6, 1, 3, 5, 7 37cabdff1aSopenharmony_cipw_zero_even: times 4 dw 0x0000, 0xffff 38cabdff1aSopenharmony_ci 39cabdff1aSopenharmony_ciSECTION .text 40cabdff1aSopenharmony_ci 41cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 42cabdff1aSopenharmony_ci; void ff_conv_s16_to_s32(int32_t *dst, const int16_t *src, int len); 43cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 44cabdff1aSopenharmony_ci 45cabdff1aSopenharmony_ciINIT_XMM sse2 46cabdff1aSopenharmony_cicglobal conv_s16_to_s32, 3,3,3, dst, src, len 47cabdff1aSopenharmony_ci lea lenq, [2*lend] 48cabdff1aSopenharmony_ci lea dstq, [dstq+2*lenq] 49cabdff1aSopenharmony_ci add srcq, lenq 50cabdff1aSopenharmony_ci neg lenq 51cabdff1aSopenharmony_ci.loop: 52cabdff1aSopenharmony_ci mova m2, [srcq+lenq] 53cabdff1aSopenharmony_ci pxor m0, m0 54cabdff1aSopenharmony_ci pxor m1, m1 55cabdff1aSopenharmony_ci punpcklwd m0, m2 56cabdff1aSopenharmony_ci punpckhwd m1, m2 57cabdff1aSopenharmony_ci mova [dstq+2*lenq ], m0 58cabdff1aSopenharmony_ci mova [dstq+2*lenq+mmsize], m1 59cabdff1aSopenharmony_ci add lenq, mmsize 60cabdff1aSopenharmony_ci jl .loop 61cabdff1aSopenharmony_ci REP_RET 62cabdff1aSopenharmony_ci 63cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 64cabdff1aSopenharmony_ci; void ff_conv_s16_to_flt(float *dst, const int16_t *src, int len); 65cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 66cabdff1aSopenharmony_ci 67cabdff1aSopenharmony_ci%macro CONV_S16_TO_FLT 0 68cabdff1aSopenharmony_cicglobal conv_s16_to_flt, 3,3,3, dst, src, len 69cabdff1aSopenharmony_ci lea lenq, [2*lend] 70cabdff1aSopenharmony_ci add srcq, lenq 71cabdff1aSopenharmony_ci lea dstq, [dstq + 2*lenq] 72cabdff1aSopenharmony_ci neg lenq 73cabdff1aSopenharmony_ci mova m2, [pf_s16_inv_scale] 74cabdff1aSopenharmony_ci ALIGN 16 75cabdff1aSopenharmony_ci.loop: 76cabdff1aSopenharmony_ci mova m0, [srcq+lenq] 77cabdff1aSopenharmony_ci S16_TO_S32_SX 0, 1 78cabdff1aSopenharmony_ci cvtdq2ps m0, m0 79cabdff1aSopenharmony_ci cvtdq2ps m1, m1 80cabdff1aSopenharmony_ci mulps m0, m2 81cabdff1aSopenharmony_ci mulps m1, m2 82cabdff1aSopenharmony_ci mova [dstq+2*lenq ], m0 83cabdff1aSopenharmony_ci mova [dstq+2*lenq+mmsize], m1 84cabdff1aSopenharmony_ci add lenq, mmsize 85cabdff1aSopenharmony_ci jl .loop 86cabdff1aSopenharmony_ci REP_RET 87cabdff1aSopenharmony_ci%endmacro 88cabdff1aSopenharmony_ci 89cabdff1aSopenharmony_ciINIT_XMM sse2 90cabdff1aSopenharmony_ciCONV_S16_TO_FLT 91cabdff1aSopenharmony_ciINIT_XMM sse4 92cabdff1aSopenharmony_ciCONV_S16_TO_FLT 93cabdff1aSopenharmony_ci 94cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 95cabdff1aSopenharmony_ci; void ff_conv_s32_to_s16(int16_t *dst, const int32_t *src, int len); 96cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 97cabdff1aSopenharmony_ci 98cabdff1aSopenharmony_ci%macro CONV_S32_TO_S16 0 99cabdff1aSopenharmony_cicglobal conv_s32_to_s16, 3,3,4, dst, src, len 100cabdff1aSopenharmony_ci lea lenq, [2*lend] 101cabdff1aSopenharmony_ci lea srcq, [srcq+2*lenq] 102cabdff1aSopenharmony_ci add dstq, lenq 103cabdff1aSopenharmony_ci neg lenq 104cabdff1aSopenharmony_ci.loop: 105cabdff1aSopenharmony_ci mova m0, [srcq+2*lenq ] 106cabdff1aSopenharmony_ci mova m1, [srcq+2*lenq+ mmsize] 107cabdff1aSopenharmony_ci mova m2, [srcq+2*lenq+2*mmsize] 108cabdff1aSopenharmony_ci mova m3, [srcq+2*lenq+3*mmsize] 109cabdff1aSopenharmony_ci psrad m0, 16 110cabdff1aSopenharmony_ci psrad m1, 16 111cabdff1aSopenharmony_ci psrad m2, 16 112cabdff1aSopenharmony_ci psrad m3, 16 113cabdff1aSopenharmony_ci packssdw m0, m1 114cabdff1aSopenharmony_ci packssdw m2, m3 115cabdff1aSopenharmony_ci mova [dstq+lenq ], m0 116cabdff1aSopenharmony_ci mova [dstq+lenq+mmsize], m2 117cabdff1aSopenharmony_ci add lenq, mmsize*2 118cabdff1aSopenharmony_ci jl .loop 119cabdff1aSopenharmony_ci%if mmsize == 8 120cabdff1aSopenharmony_ci emms 121cabdff1aSopenharmony_ci RET 122cabdff1aSopenharmony_ci%else 123cabdff1aSopenharmony_ci REP_RET 124cabdff1aSopenharmony_ci%endif 125cabdff1aSopenharmony_ci%endmacro 126cabdff1aSopenharmony_ci 127cabdff1aSopenharmony_ciINIT_MMX mmx 128cabdff1aSopenharmony_ciCONV_S32_TO_S16 129cabdff1aSopenharmony_ciINIT_XMM sse2 130cabdff1aSopenharmony_ciCONV_S32_TO_S16 131cabdff1aSopenharmony_ci 132cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 133cabdff1aSopenharmony_ci; void ff_conv_s32_to_flt(float *dst, const int32_t *src, int len); 134cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 135cabdff1aSopenharmony_ci 136cabdff1aSopenharmony_ci%macro CONV_S32_TO_FLT 0 137cabdff1aSopenharmony_cicglobal conv_s32_to_flt, 3,3,3, dst, src, len 138cabdff1aSopenharmony_ci lea lenq, [4*lend] 139cabdff1aSopenharmony_ci add srcq, lenq 140cabdff1aSopenharmony_ci add dstq, lenq 141cabdff1aSopenharmony_ci neg lenq 142cabdff1aSopenharmony_ci mova m0, [pf_s32_inv_scale] 143cabdff1aSopenharmony_ci ALIGN 16 144cabdff1aSopenharmony_ci.loop: 145cabdff1aSopenharmony_ci cvtdq2ps m1, [srcq+lenq ] 146cabdff1aSopenharmony_ci cvtdq2ps m2, [srcq+lenq+mmsize] 147cabdff1aSopenharmony_ci mulps m1, m1, m0 148cabdff1aSopenharmony_ci mulps m2, m2, m0 149cabdff1aSopenharmony_ci mova [dstq+lenq ], m1 150cabdff1aSopenharmony_ci mova [dstq+lenq+mmsize], m2 151cabdff1aSopenharmony_ci add lenq, mmsize*2 152cabdff1aSopenharmony_ci jl .loop 153cabdff1aSopenharmony_ci REP_RET 154cabdff1aSopenharmony_ci%endmacro 155cabdff1aSopenharmony_ci 156cabdff1aSopenharmony_ciINIT_XMM sse2 157cabdff1aSopenharmony_ciCONV_S32_TO_FLT 158cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL 159cabdff1aSopenharmony_ciINIT_YMM avx 160cabdff1aSopenharmony_ciCONV_S32_TO_FLT 161cabdff1aSopenharmony_ci%endif 162cabdff1aSopenharmony_ci 163cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 164cabdff1aSopenharmony_ci; void ff_conv_flt_to_s16(int16_t *dst, const float *src, int len); 165cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 166cabdff1aSopenharmony_ci 167cabdff1aSopenharmony_ciINIT_XMM sse2 168cabdff1aSopenharmony_cicglobal conv_flt_to_s16, 3,3,5, dst, src, len 169cabdff1aSopenharmony_ci lea lenq, [2*lend] 170cabdff1aSopenharmony_ci lea srcq, [srcq+2*lenq] 171cabdff1aSopenharmony_ci add dstq, lenq 172cabdff1aSopenharmony_ci neg lenq 173cabdff1aSopenharmony_ci mova m4, [pf_s16_scale] 174cabdff1aSopenharmony_ci.loop: 175cabdff1aSopenharmony_ci mova m0, [srcq+2*lenq ] 176cabdff1aSopenharmony_ci mova m1, [srcq+2*lenq+1*mmsize] 177cabdff1aSopenharmony_ci mova m2, [srcq+2*lenq+2*mmsize] 178cabdff1aSopenharmony_ci mova m3, [srcq+2*lenq+3*mmsize] 179cabdff1aSopenharmony_ci mulps m0, m4 180cabdff1aSopenharmony_ci mulps m1, m4 181cabdff1aSopenharmony_ci mulps m2, m4 182cabdff1aSopenharmony_ci mulps m3, m4 183cabdff1aSopenharmony_ci cvtps2dq m0, m0 184cabdff1aSopenharmony_ci cvtps2dq m1, m1 185cabdff1aSopenharmony_ci cvtps2dq m2, m2 186cabdff1aSopenharmony_ci cvtps2dq m3, m3 187cabdff1aSopenharmony_ci packssdw m0, m1 188cabdff1aSopenharmony_ci packssdw m2, m3 189cabdff1aSopenharmony_ci mova [dstq+lenq ], m0 190cabdff1aSopenharmony_ci mova [dstq+lenq+mmsize], m2 191cabdff1aSopenharmony_ci add lenq, mmsize*2 192cabdff1aSopenharmony_ci jl .loop 193cabdff1aSopenharmony_ci REP_RET 194cabdff1aSopenharmony_ci 195cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 196cabdff1aSopenharmony_ci; void ff_conv_flt_to_s32(int32_t *dst, const float *src, int len); 197cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 198cabdff1aSopenharmony_ci 199cabdff1aSopenharmony_ci%macro CONV_FLT_TO_S32 0 200cabdff1aSopenharmony_cicglobal conv_flt_to_s32, 3,3,6, dst, src, len 201cabdff1aSopenharmony_ci lea lenq, [lend*4] 202cabdff1aSopenharmony_ci add srcq, lenq 203cabdff1aSopenharmony_ci add dstq, lenq 204cabdff1aSopenharmony_ci neg lenq 205cabdff1aSopenharmony_ci mova m4, [pf_s32_scale] 206cabdff1aSopenharmony_ci mova m5, [pf_s32_clip] 207cabdff1aSopenharmony_ci.loop: 208cabdff1aSopenharmony_ci mulps m0, m4, [srcq+lenq ] 209cabdff1aSopenharmony_ci mulps m1, m4, [srcq+lenq+1*mmsize] 210cabdff1aSopenharmony_ci mulps m2, m4, [srcq+lenq+2*mmsize] 211cabdff1aSopenharmony_ci mulps m3, m4, [srcq+lenq+3*mmsize] 212cabdff1aSopenharmony_ci minps m0, m0, m5 213cabdff1aSopenharmony_ci minps m1, m1, m5 214cabdff1aSopenharmony_ci minps m2, m2, m5 215cabdff1aSopenharmony_ci minps m3, m3, m5 216cabdff1aSopenharmony_ci cvtps2dq m0, m0 217cabdff1aSopenharmony_ci cvtps2dq m1, m1 218cabdff1aSopenharmony_ci cvtps2dq m2, m2 219cabdff1aSopenharmony_ci cvtps2dq m3, m3 220cabdff1aSopenharmony_ci mova [dstq+lenq ], m0 221cabdff1aSopenharmony_ci mova [dstq+lenq+1*mmsize], m1 222cabdff1aSopenharmony_ci mova [dstq+lenq+2*mmsize], m2 223cabdff1aSopenharmony_ci mova [dstq+lenq+3*mmsize], m3 224cabdff1aSopenharmony_ci add lenq, mmsize*4 225cabdff1aSopenharmony_ci jl .loop 226cabdff1aSopenharmony_ci REP_RET 227cabdff1aSopenharmony_ci%endmacro 228cabdff1aSopenharmony_ci 229cabdff1aSopenharmony_ciINIT_XMM sse2 230cabdff1aSopenharmony_ciCONV_FLT_TO_S32 231cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL 232cabdff1aSopenharmony_ciINIT_YMM avx 233cabdff1aSopenharmony_ciCONV_FLT_TO_S32 234cabdff1aSopenharmony_ci%endif 235cabdff1aSopenharmony_ci 236cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 237cabdff1aSopenharmony_ci; void ff_conv_s16p_to_s16_2ch(int16_t *dst, int16_t *const *src, int len, 238cabdff1aSopenharmony_ci; int channels); 239cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 240cabdff1aSopenharmony_ci 241cabdff1aSopenharmony_ci%macro CONV_S16P_TO_S16_2CH 0 242cabdff1aSopenharmony_cicglobal conv_s16p_to_s16_2ch, 3,4,5, dst, src0, len, src1 243cabdff1aSopenharmony_ci mov src1q, [src0q+gprsize] 244cabdff1aSopenharmony_ci mov src0q, [src0q ] 245cabdff1aSopenharmony_ci lea lenq, [2*lend] 246cabdff1aSopenharmony_ci add src0q, lenq 247cabdff1aSopenharmony_ci add src1q, lenq 248cabdff1aSopenharmony_ci lea dstq, [dstq+2*lenq] 249cabdff1aSopenharmony_ci neg lenq 250cabdff1aSopenharmony_ci.loop: 251cabdff1aSopenharmony_ci mova m0, [src0q+lenq ] 252cabdff1aSopenharmony_ci mova m1, [src1q+lenq ] 253cabdff1aSopenharmony_ci mova m2, [src0q+lenq+mmsize] 254cabdff1aSopenharmony_ci mova m3, [src1q+lenq+mmsize] 255cabdff1aSopenharmony_ci SBUTTERFLY2 wd, 0, 1, 4 256cabdff1aSopenharmony_ci SBUTTERFLY2 wd, 2, 3, 4 257cabdff1aSopenharmony_ci mova [dstq+2*lenq+0*mmsize], m0 258cabdff1aSopenharmony_ci mova [dstq+2*lenq+1*mmsize], m1 259cabdff1aSopenharmony_ci mova [dstq+2*lenq+2*mmsize], m2 260cabdff1aSopenharmony_ci mova [dstq+2*lenq+3*mmsize], m3 261cabdff1aSopenharmony_ci add lenq, 2*mmsize 262cabdff1aSopenharmony_ci jl .loop 263cabdff1aSopenharmony_ci REP_RET 264cabdff1aSopenharmony_ci%endmacro 265cabdff1aSopenharmony_ci 266cabdff1aSopenharmony_ciINIT_XMM sse2 267cabdff1aSopenharmony_ciCONV_S16P_TO_S16_2CH 268cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL 269cabdff1aSopenharmony_ciINIT_XMM avx 270cabdff1aSopenharmony_ciCONV_S16P_TO_S16_2CH 271cabdff1aSopenharmony_ci%endif 272cabdff1aSopenharmony_ci 273cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 274cabdff1aSopenharmony_ci; void ff_conv_s16p_to_s16_6ch(int16_t *dst, int16_t *const *src, int len, 275cabdff1aSopenharmony_ci; int channels); 276cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 277cabdff1aSopenharmony_ci 278cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 279cabdff1aSopenharmony_ci; NOTE: In the 6-channel functions, len could be used as an index on x86-64 280cabdff1aSopenharmony_ci; instead of just a counter, which would avoid incrementing the 281cabdff1aSopenharmony_ci; pointers, but the extra complexity and amount of code is not worth 282cabdff1aSopenharmony_ci; the small gain. On x86-32 there are not enough registers to use len 283cabdff1aSopenharmony_ci; as an index without keeping two of the pointers on the stack and 284cabdff1aSopenharmony_ci; loading them in each iteration. 285cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 286cabdff1aSopenharmony_ci 287cabdff1aSopenharmony_ci%macro CONV_S16P_TO_S16_6CH 0 288cabdff1aSopenharmony_ci%if ARCH_X86_64 289cabdff1aSopenharmony_cicglobal conv_s16p_to_s16_6ch, 3,8,7, dst, src0, len, src1, src2, src3, src4, src5 290cabdff1aSopenharmony_ci%else 291cabdff1aSopenharmony_cicglobal conv_s16p_to_s16_6ch, 2,7,7, dst, src0, src1, src2, src3, src4, src5 292cabdff1aSopenharmony_ci%define lend dword r2m 293cabdff1aSopenharmony_ci%endif 294cabdff1aSopenharmony_ci mov src1q, [src0q+1*gprsize] 295cabdff1aSopenharmony_ci mov src2q, [src0q+2*gprsize] 296cabdff1aSopenharmony_ci mov src3q, [src0q+3*gprsize] 297cabdff1aSopenharmony_ci mov src4q, [src0q+4*gprsize] 298cabdff1aSopenharmony_ci mov src5q, [src0q+5*gprsize] 299cabdff1aSopenharmony_ci mov src0q, [src0q] 300cabdff1aSopenharmony_ci sub src1q, src0q 301cabdff1aSopenharmony_ci sub src2q, src0q 302cabdff1aSopenharmony_ci sub src3q, src0q 303cabdff1aSopenharmony_ci sub src4q, src0q 304cabdff1aSopenharmony_ci sub src5q, src0q 305cabdff1aSopenharmony_ci.loop: 306cabdff1aSopenharmony_ci%if cpuflag(sse2slow) 307cabdff1aSopenharmony_ci movq m0, [src0q ] ; m0 = 0, 6, 12, 18, x, x, x, x 308cabdff1aSopenharmony_ci movq m1, [src0q+src1q] ; m1 = 1, 7, 13, 19, x, x, x, x 309cabdff1aSopenharmony_ci movq m2, [src0q+src2q] ; m2 = 2, 8, 14, 20, x, x, x, x 310cabdff1aSopenharmony_ci movq m3, [src0q+src3q] ; m3 = 3, 9, 15, 21, x, x, x, x 311cabdff1aSopenharmony_ci movq m4, [src0q+src4q] ; m4 = 4, 10, 16, 22, x, x, x, x 312cabdff1aSopenharmony_ci movq m5, [src0q+src5q] ; m5 = 5, 11, 17, 23, x, x, x, x 313cabdff1aSopenharmony_ci ; unpack words: 314cabdff1aSopenharmony_ci punpcklwd m0, m1 ; m0 = 0, 1, 6, 7, 12, 13, 18, 19 315cabdff1aSopenharmony_ci punpcklwd m2, m3 ; m2 = 4, 5, 10, 11, 16, 17, 22, 23 316cabdff1aSopenharmony_ci punpcklwd m4, m5 ; m4 = 2, 3, 8, 9, 14, 15, 20, 21 317cabdff1aSopenharmony_ci ; blend dwords 318cabdff1aSopenharmony_ci shufps m1, m0, m2, q2020 ; m1 = 0, 1, 12, 13, 2, 3, 14, 15 319cabdff1aSopenharmony_ci shufps m0, m4, q2031 ; m0 = 6, 7, 18, 19, 4, 5, 16, 17 320cabdff1aSopenharmony_ci shufps m2, m4, q3131 ; m2 = 8, 9, 20, 21, 10, 11, 22, 23 321cabdff1aSopenharmony_ci ; shuffle dwords 322cabdff1aSopenharmony_ci pshufd m0, m0, q1302 ; m0 = 4, 5, 6, 7, 16, 17, 18, 19 323cabdff1aSopenharmony_ci pshufd m1, m1, q3120 ; m1 = 0, 1, 2, 3, 12, 13, 14, 15 324cabdff1aSopenharmony_ci pshufd m2, m2, q3120 ; m2 = 8, 9, 10, 11, 20, 21, 22, 23 325cabdff1aSopenharmony_ci movq [dstq+0*mmsize/2], m1 326cabdff1aSopenharmony_ci movq [dstq+1*mmsize/2], m0 327cabdff1aSopenharmony_ci movq [dstq+2*mmsize/2], m2 328cabdff1aSopenharmony_ci movhps [dstq+3*mmsize/2], m1 329cabdff1aSopenharmony_ci movhps [dstq+4*mmsize/2], m0 330cabdff1aSopenharmony_ci movhps [dstq+5*mmsize/2], m2 331cabdff1aSopenharmony_ci add src0q, mmsize/2 332cabdff1aSopenharmony_ci add dstq, mmsize*3 333cabdff1aSopenharmony_ci sub lend, mmsize/4 334cabdff1aSopenharmony_ci%else 335cabdff1aSopenharmony_ci mova m0, [src0q ] ; m0 = 0, 6, 12, 18, 24, 30, 36, 42 336cabdff1aSopenharmony_ci mova m1, [src0q+src1q] ; m1 = 1, 7, 13, 19, 25, 31, 37, 43 337cabdff1aSopenharmony_ci mova m2, [src0q+src2q] ; m2 = 2, 8, 14, 20, 26, 32, 38, 44 338cabdff1aSopenharmony_ci mova m3, [src0q+src3q] ; m3 = 3, 9, 15, 21, 27, 33, 39, 45 339cabdff1aSopenharmony_ci mova m4, [src0q+src4q] ; m4 = 4, 10, 16, 22, 28, 34, 40, 46 340cabdff1aSopenharmony_ci mova m5, [src0q+src5q] ; m5 = 5, 11, 17, 23, 29, 35, 41, 47 341cabdff1aSopenharmony_ci ; unpack words: 342cabdff1aSopenharmony_ci SBUTTERFLY2 wd, 0, 1, 6 ; m0 = 0, 1, 6, 7, 12, 13, 18, 19 343cabdff1aSopenharmony_ci ; m1 = 24, 25, 30, 31, 36, 37, 42, 43 344cabdff1aSopenharmony_ci SBUTTERFLY2 wd, 2, 3, 6 ; m2 = 2, 3, 8, 9, 14, 15, 20, 21 345cabdff1aSopenharmony_ci ; m3 = 26, 27, 32, 33, 38, 39, 44, 45 346cabdff1aSopenharmony_ci SBUTTERFLY2 wd, 4, 5, 6 ; m4 = 4, 5, 10, 11, 16, 17, 22, 23 347cabdff1aSopenharmony_ci ; m5 = 28, 29, 34, 35, 40, 41, 46, 47 348cabdff1aSopenharmony_ci ; blend dwords 349cabdff1aSopenharmony_ci shufps m6, m0, m2, q2020 ; m6 = 0, 1, 12, 13, 2, 3, 14, 15 350cabdff1aSopenharmony_ci shufps m0, m4, q2031 ; m0 = 6, 7, 18, 19, 4, 5, 16, 17 351cabdff1aSopenharmony_ci shufps m2, m4, q3131 ; m2 = 8, 9, 20, 21, 10, 11, 22, 23 352cabdff1aSopenharmony_ci SWAP 4,6 ; m4 = 0, 1, 12, 13, 2, 3, 14, 15 353cabdff1aSopenharmony_ci shufps m6, m1, m3, q2020 ; m6 = 24, 25, 36, 37, 26, 27, 38, 39 354cabdff1aSopenharmony_ci shufps m1, m5, q2031 ; m1 = 30, 31, 42, 43, 28, 29, 40, 41 355cabdff1aSopenharmony_ci shufps m3, m5, q3131 ; m3 = 32, 33, 44, 45, 34, 35, 46, 47 356cabdff1aSopenharmony_ci SWAP 5,6 ; m5 = 24, 25, 36, 37, 26, 27, 38, 39 357cabdff1aSopenharmony_ci ; shuffle dwords 358cabdff1aSopenharmony_ci pshufd m0, m0, q1302 ; m0 = 4, 5, 6, 7, 16, 17, 18, 19 359cabdff1aSopenharmony_ci pshufd m2, m2, q3120 ; m2 = 8, 9, 10, 11, 20, 21, 22, 23 360cabdff1aSopenharmony_ci pshufd m4, m4, q3120 ; m4 = 0, 1, 2, 3, 12, 13, 14, 15 361cabdff1aSopenharmony_ci pshufd m1, m1, q1302 ; m1 = 28, 29, 30, 31, 40, 41, 42, 43 362cabdff1aSopenharmony_ci pshufd m3, m3, q3120 ; m3 = 32, 33, 34, 35, 44, 45, 46, 47 363cabdff1aSopenharmony_ci pshufd m5, m5, q3120 ; m5 = 24, 25, 26, 27, 36, 37, 38, 39 364cabdff1aSopenharmony_ci ; shuffle qwords 365cabdff1aSopenharmony_ci punpcklqdq m6, m4, m0 ; m6 = 0, 1, 2, 3, 4, 5, 6, 7 366cabdff1aSopenharmony_ci punpckhqdq m0, m2 ; m0 = 16, 17, 18, 19, 20, 21, 22, 23 367cabdff1aSopenharmony_ci shufps m2, m4, q3210 ; m2 = 8, 9, 10, 11, 12, 13, 14, 15 368cabdff1aSopenharmony_ci SWAP 4,6 ; m4 = 0, 1, 2, 3, 4, 5, 6, 7 369cabdff1aSopenharmony_ci punpcklqdq m6, m5, m1 ; m6 = 24, 25, 26, 27, 28, 29, 30, 31 370cabdff1aSopenharmony_ci punpckhqdq m1, m3 ; m1 = 40, 41, 42, 43, 44, 45, 46, 47 371cabdff1aSopenharmony_ci shufps m3, m5, q3210 ; m3 = 32, 33, 34, 35, 36, 37, 38, 39 372cabdff1aSopenharmony_ci SWAP 5,6 ; m5 = 24, 25, 26, 27, 28, 29, 30, 31 373cabdff1aSopenharmony_ci mova [dstq+0*mmsize], m4 374cabdff1aSopenharmony_ci mova [dstq+1*mmsize], m2 375cabdff1aSopenharmony_ci mova [dstq+2*mmsize], m0 376cabdff1aSopenharmony_ci mova [dstq+3*mmsize], m5 377cabdff1aSopenharmony_ci mova [dstq+4*mmsize], m3 378cabdff1aSopenharmony_ci mova [dstq+5*mmsize], m1 379cabdff1aSopenharmony_ci add src0q, mmsize 380cabdff1aSopenharmony_ci add dstq, mmsize*6 381cabdff1aSopenharmony_ci sub lend, mmsize/2 382cabdff1aSopenharmony_ci%endif 383cabdff1aSopenharmony_ci jg .loop 384cabdff1aSopenharmony_ci REP_RET 385cabdff1aSopenharmony_ci%endmacro 386cabdff1aSopenharmony_ci 387cabdff1aSopenharmony_ciINIT_XMM sse2 388cabdff1aSopenharmony_ciCONV_S16P_TO_S16_6CH 389cabdff1aSopenharmony_ciINIT_XMM sse2slow 390cabdff1aSopenharmony_ciCONV_S16P_TO_S16_6CH 391cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL 392cabdff1aSopenharmony_ciINIT_XMM avx 393cabdff1aSopenharmony_ciCONV_S16P_TO_S16_6CH 394cabdff1aSopenharmony_ci%endif 395cabdff1aSopenharmony_ci 396cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 397cabdff1aSopenharmony_ci; void ff_conv_s16p_to_flt_2ch(float *dst, int16_t *const *src, int len, 398cabdff1aSopenharmony_ci; int channels); 399cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 400cabdff1aSopenharmony_ci 401cabdff1aSopenharmony_ci%macro CONV_S16P_TO_FLT_2CH 0 402cabdff1aSopenharmony_cicglobal conv_s16p_to_flt_2ch, 3,4,6, dst, src0, len, src1 403cabdff1aSopenharmony_ci lea lenq, [2*lend] 404cabdff1aSopenharmony_ci mov src1q, [src0q+gprsize] 405cabdff1aSopenharmony_ci mov src0q, [src0q ] 406cabdff1aSopenharmony_ci lea dstq, [dstq+4*lenq] 407cabdff1aSopenharmony_ci add src0q, lenq 408cabdff1aSopenharmony_ci add src1q, lenq 409cabdff1aSopenharmony_ci neg lenq 410cabdff1aSopenharmony_ci mova m5, [pf_s32_inv_scale] 411cabdff1aSopenharmony_ci.loop: 412cabdff1aSopenharmony_ci mova m2, [src0q+lenq] ; m2 = 0, 2, 4, 6, 8, 10, 12, 14 413cabdff1aSopenharmony_ci mova m4, [src1q+lenq] ; m4 = 1, 3, 5, 7, 9, 11, 13, 15 414cabdff1aSopenharmony_ci SBUTTERFLY2 wd, 2, 4, 3 ; m2 = 0, 1, 2, 3, 4, 5, 6, 7 415cabdff1aSopenharmony_ci ; m4 = 8, 9, 10, 11, 12, 13, 14, 15 416cabdff1aSopenharmony_ci pxor m3, m3 417cabdff1aSopenharmony_ci punpcklwd m0, m3, m2 ; m0 = 0, 1, 2, 3 418cabdff1aSopenharmony_ci punpckhwd m1, m3, m2 ; m1 = 4, 5, 6, 7 419cabdff1aSopenharmony_ci punpcklwd m2, m3, m4 ; m2 = 8, 9, 10, 11 420cabdff1aSopenharmony_ci punpckhwd m3, m4 ; m3 = 12, 13, 14, 15 421cabdff1aSopenharmony_ci cvtdq2ps m0, m0 422cabdff1aSopenharmony_ci cvtdq2ps m1, m1 423cabdff1aSopenharmony_ci cvtdq2ps m2, m2 424cabdff1aSopenharmony_ci cvtdq2ps m3, m3 425cabdff1aSopenharmony_ci mulps m0, m5 426cabdff1aSopenharmony_ci mulps m1, m5 427cabdff1aSopenharmony_ci mulps m2, m5 428cabdff1aSopenharmony_ci mulps m3, m5 429cabdff1aSopenharmony_ci mova [dstq+4*lenq ], m0 430cabdff1aSopenharmony_ci mova [dstq+4*lenq+ mmsize], m1 431cabdff1aSopenharmony_ci mova [dstq+4*lenq+2*mmsize], m2 432cabdff1aSopenharmony_ci mova [dstq+4*lenq+3*mmsize], m3 433cabdff1aSopenharmony_ci add lenq, mmsize 434cabdff1aSopenharmony_ci jl .loop 435cabdff1aSopenharmony_ci REP_RET 436cabdff1aSopenharmony_ci%endmacro 437cabdff1aSopenharmony_ci 438cabdff1aSopenharmony_ciINIT_XMM sse2 439cabdff1aSopenharmony_ciCONV_S16P_TO_FLT_2CH 440cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL 441cabdff1aSopenharmony_ciINIT_XMM avx 442cabdff1aSopenharmony_ciCONV_S16P_TO_FLT_2CH 443cabdff1aSopenharmony_ci%endif 444cabdff1aSopenharmony_ci 445cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 446cabdff1aSopenharmony_ci; void ff_conv_s16p_to_flt_6ch(float *dst, int16_t *const *src, int len, 447cabdff1aSopenharmony_ci; int channels); 448cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 449cabdff1aSopenharmony_ci 450cabdff1aSopenharmony_ci%macro CONV_S16P_TO_FLT_6CH 0 451cabdff1aSopenharmony_ci%if ARCH_X86_64 452cabdff1aSopenharmony_cicglobal conv_s16p_to_flt_6ch, 3,8,8, dst, src, len, src1, src2, src3, src4, src5 453cabdff1aSopenharmony_ci%else 454cabdff1aSopenharmony_cicglobal conv_s16p_to_flt_6ch, 2,7,8, dst, src, src1, src2, src3, src4, src5 455cabdff1aSopenharmony_ci%define lend dword r2m 456cabdff1aSopenharmony_ci%endif 457cabdff1aSopenharmony_ci mov src1q, [srcq+1*gprsize] 458cabdff1aSopenharmony_ci mov src2q, [srcq+2*gprsize] 459cabdff1aSopenharmony_ci mov src3q, [srcq+3*gprsize] 460cabdff1aSopenharmony_ci mov src4q, [srcq+4*gprsize] 461cabdff1aSopenharmony_ci mov src5q, [srcq+5*gprsize] 462cabdff1aSopenharmony_ci mov srcq, [srcq] 463cabdff1aSopenharmony_ci sub src1q, srcq 464cabdff1aSopenharmony_ci sub src2q, srcq 465cabdff1aSopenharmony_ci sub src3q, srcq 466cabdff1aSopenharmony_ci sub src4q, srcq 467cabdff1aSopenharmony_ci sub src5q, srcq 468cabdff1aSopenharmony_ci mova m7, [pf_s32_inv_scale] 469cabdff1aSopenharmony_ci%if cpuflag(ssse3) 470cabdff1aSopenharmony_ci %define unpack_even m6 471cabdff1aSopenharmony_ci mova m6, [pb_shuf_unpack_even] 472cabdff1aSopenharmony_ci%if ARCH_X86_64 473cabdff1aSopenharmony_ci %define unpack_odd m8 474cabdff1aSopenharmony_ci mova m8, [pb_shuf_unpack_odd] 475cabdff1aSopenharmony_ci%else 476cabdff1aSopenharmony_ci %define unpack_odd [pb_shuf_unpack_odd] 477cabdff1aSopenharmony_ci%endif 478cabdff1aSopenharmony_ci%endif 479cabdff1aSopenharmony_ci.loop: 480cabdff1aSopenharmony_ci movq m0, [srcq ] ; m0 = 0, 6, 12, 18, x, x, x, x 481cabdff1aSopenharmony_ci movq m1, [srcq+src1q] ; m1 = 1, 7, 13, 19, x, x, x, x 482cabdff1aSopenharmony_ci movq m2, [srcq+src2q] ; m2 = 2, 8, 14, 20, x, x, x, x 483cabdff1aSopenharmony_ci movq m3, [srcq+src3q] ; m3 = 3, 9, 15, 21, x, x, x, x 484cabdff1aSopenharmony_ci movq m4, [srcq+src4q] ; m4 = 4, 10, 16, 22, x, x, x, x 485cabdff1aSopenharmony_ci movq m5, [srcq+src5q] ; m5 = 5, 11, 17, 23, x, x, x, x 486cabdff1aSopenharmony_ci ; unpack words: 487cabdff1aSopenharmony_ci punpcklwd m0, m1 ; m0 = 0, 1, 6, 7, 12, 13, 18, 19 488cabdff1aSopenharmony_ci punpcklwd m2, m3 ; m2 = 2, 3, 8, 9, 14, 15, 20, 21 489cabdff1aSopenharmony_ci punpcklwd m4, m5 ; m4 = 4, 5, 10, 11, 16, 17, 22, 23 490cabdff1aSopenharmony_ci ; blend dwords 491cabdff1aSopenharmony_ci shufps m1, m4, m0, q3120 ; m1 = 4, 5, 16, 17, 6, 7, 18, 19 492cabdff1aSopenharmony_ci shufps m0, m2, q2020 ; m0 = 0, 1, 12, 13, 2, 3, 14, 15 493cabdff1aSopenharmony_ci shufps m2, m4, q3131 ; m2 = 8, 9, 20, 21, 10, 11, 22, 23 494cabdff1aSopenharmony_ci%if cpuflag(ssse3) 495cabdff1aSopenharmony_ci pshufb m3, m0, unpack_odd ; m3 = 12, 13, 14, 15 496cabdff1aSopenharmony_ci pshufb m0, unpack_even ; m0 = 0, 1, 2, 3 497cabdff1aSopenharmony_ci pshufb m4, m1, unpack_odd ; m4 = 16, 17, 18, 19 498cabdff1aSopenharmony_ci pshufb m1, unpack_even ; m1 = 4, 5, 6, 7 499cabdff1aSopenharmony_ci pshufb m5, m2, unpack_odd ; m5 = 20, 21, 22, 23 500cabdff1aSopenharmony_ci pshufb m2, unpack_even ; m2 = 8, 9, 10, 11 501cabdff1aSopenharmony_ci%else 502cabdff1aSopenharmony_ci ; shuffle dwords 503cabdff1aSopenharmony_ci pshufd m0, m0, q3120 ; m0 = 0, 1, 2, 3, 12, 13, 14, 15 504cabdff1aSopenharmony_ci pshufd m1, m1, q3120 ; m1 = 4, 5, 6, 7, 16, 17, 18, 19 505cabdff1aSopenharmony_ci pshufd m2, m2, q3120 ; m2 = 8, 9, 10, 11, 20, 21, 22, 23 506cabdff1aSopenharmony_ci pxor m6, m6 ; convert s16 in m0-m2 to s32 in m0-m5 507cabdff1aSopenharmony_ci punpcklwd m3, m6, m0 ; m3 = 0, 1, 2, 3 508cabdff1aSopenharmony_ci punpckhwd m4, m6, m0 ; m4 = 12, 13, 14, 15 509cabdff1aSopenharmony_ci punpcklwd m0, m6, m1 ; m0 = 4, 5, 6, 7 510cabdff1aSopenharmony_ci punpckhwd m5, m6, m1 ; m5 = 16, 17, 18, 19 511cabdff1aSopenharmony_ci punpcklwd m1, m6, m2 ; m1 = 8, 9, 10, 11 512cabdff1aSopenharmony_ci punpckhwd m6, m2 ; m6 = 20, 21, 22, 23 513cabdff1aSopenharmony_ci SWAP 6,2,1,0,3,4,5 ; swap registers 3,0,1,4,5,6 to 0,1,2,3,4,5 514cabdff1aSopenharmony_ci%endif 515cabdff1aSopenharmony_ci cvtdq2ps m0, m0 ; convert s32 to float 516cabdff1aSopenharmony_ci cvtdq2ps m1, m1 517cabdff1aSopenharmony_ci cvtdq2ps m2, m2 518cabdff1aSopenharmony_ci cvtdq2ps m3, m3 519cabdff1aSopenharmony_ci cvtdq2ps m4, m4 520cabdff1aSopenharmony_ci cvtdq2ps m5, m5 521cabdff1aSopenharmony_ci mulps m0, m7 ; scale float from s32 range to [-1.0,1.0] 522cabdff1aSopenharmony_ci mulps m1, m7 523cabdff1aSopenharmony_ci mulps m2, m7 524cabdff1aSopenharmony_ci mulps m3, m7 525cabdff1aSopenharmony_ci mulps m4, m7 526cabdff1aSopenharmony_ci mulps m5, m7 527cabdff1aSopenharmony_ci mova [dstq ], m0 528cabdff1aSopenharmony_ci mova [dstq+ mmsize], m1 529cabdff1aSopenharmony_ci mova [dstq+2*mmsize], m2 530cabdff1aSopenharmony_ci mova [dstq+3*mmsize], m3 531cabdff1aSopenharmony_ci mova [dstq+4*mmsize], m4 532cabdff1aSopenharmony_ci mova [dstq+5*mmsize], m5 533cabdff1aSopenharmony_ci add srcq, mmsize/2 534cabdff1aSopenharmony_ci add dstq, mmsize*6 535cabdff1aSopenharmony_ci sub lend, mmsize/4 536cabdff1aSopenharmony_ci jg .loop 537cabdff1aSopenharmony_ci REP_RET 538cabdff1aSopenharmony_ci%endmacro 539cabdff1aSopenharmony_ci 540cabdff1aSopenharmony_ciINIT_XMM sse2 541cabdff1aSopenharmony_ciCONV_S16P_TO_FLT_6CH 542cabdff1aSopenharmony_ciINIT_XMM ssse3 543cabdff1aSopenharmony_ciCONV_S16P_TO_FLT_6CH 544cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL 545cabdff1aSopenharmony_ciINIT_XMM avx 546cabdff1aSopenharmony_ciCONV_S16P_TO_FLT_6CH 547cabdff1aSopenharmony_ci%endif 548cabdff1aSopenharmony_ci 549cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 550cabdff1aSopenharmony_ci; void ff_conv_fltp_to_s16_2ch(int16_t *dst, float *const *src, int len, 551cabdff1aSopenharmony_ci; int channels); 552cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 553cabdff1aSopenharmony_ci 554cabdff1aSopenharmony_ci%macro CONV_FLTP_TO_S16_2CH 0 555cabdff1aSopenharmony_cicglobal conv_fltp_to_s16_2ch, 3,4,3, dst, src0, len, src1 556cabdff1aSopenharmony_ci lea lenq, [4*lend] 557cabdff1aSopenharmony_ci mov src1q, [src0q+gprsize] 558cabdff1aSopenharmony_ci mov src0q, [src0q ] 559cabdff1aSopenharmony_ci add dstq, lenq 560cabdff1aSopenharmony_ci add src0q, lenq 561cabdff1aSopenharmony_ci add src1q, lenq 562cabdff1aSopenharmony_ci neg lenq 563cabdff1aSopenharmony_ci mova m2, [pf_s16_scale] 564cabdff1aSopenharmony_ci%if cpuflag(ssse3) 565cabdff1aSopenharmony_ci mova m3, [pb_interleave_words] 566cabdff1aSopenharmony_ci%endif 567cabdff1aSopenharmony_ci.loop: 568cabdff1aSopenharmony_ci mulps m0, m2, [src0q+lenq] ; m0 = 0, 2, 4, 6 569cabdff1aSopenharmony_ci mulps m1, m2, [src1q+lenq] ; m1 = 1, 3, 5, 7 570cabdff1aSopenharmony_ci cvtps2dq m0, m0 571cabdff1aSopenharmony_ci cvtps2dq m1, m1 572cabdff1aSopenharmony_ci%if cpuflag(ssse3) 573cabdff1aSopenharmony_ci packssdw m0, m1 ; m0 = 0, 2, 4, 6, 1, 3, 5, 7 574cabdff1aSopenharmony_ci pshufb m0, m3 ; m0 = 0, 1, 2, 3, 4, 5, 6, 7 575cabdff1aSopenharmony_ci%else 576cabdff1aSopenharmony_ci packssdw m0, m0 ; m0 = 0, 2, 4, 6, x, x, x, x 577cabdff1aSopenharmony_ci packssdw m1, m1 ; m1 = 1, 3, 5, 7, x, x, x, x 578cabdff1aSopenharmony_ci punpcklwd m0, m1 ; m0 = 0, 1, 2, 3, 4, 5, 6, 7 579cabdff1aSopenharmony_ci%endif 580cabdff1aSopenharmony_ci mova [dstq+lenq], m0 581cabdff1aSopenharmony_ci add lenq, mmsize 582cabdff1aSopenharmony_ci jl .loop 583cabdff1aSopenharmony_ci REP_RET 584cabdff1aSopenharmony_ci%endmacro 585cabdff1aSopenharmony_ci 586cabdff1aSopenharmony_ciINIT_XMM sse2 587cabdff1aSopenharmony_ciCONV_FLTP_TO_S16_2CH 588cabdff1aSopenharmony_ciINIT_XMM ssse3 589cabdff1aSopenharmony_ciCONV_FLTP_TO_S16_2CH 590cabdff1aSopenharmony_ci 591cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 592cabdff1aSopenharmony_ci; void ff_conv_fltp_to_s16_6ch(int16_t *dst, float *const *src, int len, 593cabdff1aSopenharmony_ci; int channels); 594cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 595cabdff1aSopenharmony_ci 596cabdff1aSopenharmony_ci%macro CONV_FLTP_TO_S16_6CH 0 597cabdff1aSopenharmony_ci%if ARCH_X86_64 598cabdff1aSopenharmony_cicglobal conv_fltp_to_s16_6ch, 3,8,7, dst, src, len, src1, src2, src3, src4, src5 599cabdff1aSopenharmony_ci%else 600cabdff1aSopenharmony_cicglobal conv_fltp_to_s16_6ch, 2,7,7, dst, src, src1, src2, src3, src4, src5 601cabdff1aSopenharmony_ci%define lend dword r2m 602cabdff1aSopenharmony_ci%endif 603cabdff1aSopenharmony_ci mov src1q, [srcq+1*gprsize] 604cabdff1aSopenharmony_ci mov src2q, [srcq+2*gprsize] 605cabdff1aSopenharmony_ci mov src3q, [srcq+3*gprsize] 606cabdff1aSopenharmony_ci mov src4q, [srcq+4*gprsize] 607cabdff1aSopenharmony_ci mov src5q, [srcq+5*gprsize] 608cabdff1aSopenharmony_ci mov srcq, [srcq] 609cabdff1aSopenharmony_ci sub src1q, srcq 610cabdff1aSopenharmony_ci sub src2q, srcq 611cabdff1aSopenharmony_ci sub src3q, srcq 612cabdff1aSopenharmony_ci sub src4q, srcq 613cabdff1aSopenharmony_ci sub src5q, srcq 614cabdff1aSopenharmony_ci movaps xmm6, [pf_s16_scale] 615cabdff1aSopenharmony_ci.loop: 616cabdff1aSopenharmony_ci%if cpuflag(sse2) 617cabdff1aSopenharmony_ci mulps m0, m6, [srcq ] 618cabdff1aSopenharmony_ci mulps m1, m6, [srcq+src1q] 619cabdff1aSopenharmony_ci mulps m2, m6, [srcq+src2q] 620cabdff1aSopenharmony_ci mulps m3, m6, [srcq+src3q] 621cabdff1aSopenharmony_ci mulps m4, m6, [srcq+src4q] 622cabdff1aSopenharmony_ci mulps m5, m6, [srcq+src5q] 623cabdff1aSopenharmony_ci cvtps2dq m0, m0 624cabdff1aSopenharmony_ci cvtps2dq m1, m1 625cabdff1aSopenharmony_ci cvtps2dq m2, m2 626cabdff1aSopenharmony_ci cvtps2dq m3, m3 627cabdff1aSopenharmony_ci cvtps2dq m4, m4 628cabdff1aSopenharmony_ci cvtps2dq m5, m5 629cabdff1aSopenharmony_ci packssdw m0, m3 ; m0 = 0, 6, 12, 18, 3, 9, 15, 21 630cabdff1aSopenharmony_ci packssdw m1, m4 ; m1 = 1, 7, 13, 19, 4, 10, 16, 22 631cabdff1aSopenharmony_ci packssdw m2, m5 ; m2 = 2, 8, 14, 20, 5, 11, 17, 23 632cabdff1aSopenharmony_ci ; unpack words: 633cabdff1aSopenharmony_ci movhlps m3, m0 ; m3 = 3, 9, 15, 21, x, x, x, x 634cabdff1aSopenharmony_ci punpcklwd m0, m1 ; m0 = 0, 1, 6, 7, 12, 13, 18, 19 635cabdff1aSopenharmony_ci punpckhwd m1, m2 ; m1 = 4, 5, 10, 11, 16, 17, 22, 23 636cabdff1aSopenharmony_ci punpcklwd m2, m3 ; m2 = 2, 3, 8, 9, 14, 15, 20, 21 637cabdff1aSopenharmony_ci ; blend dwords: 638cabdff1aSopenharmony_ci shufps m3, m0, m2, q2020 ; m3 = 0, 1, 12, 13, 2, 3, 14, 15 639cabdff1aSopenharmony_ci shufps m0, m1, q2031 ; m0 = 6, 7, 18, 19, 4, 5, 16, 17 640cabdff1aSopenharmony_ci shufps m2, m1, q3131 ; m2 = 8, 9, 20, 21, 10, 11, 22, 23 641cabdff1aSopenharmony_ci ; shuffle dwords: 642cabdff1aSopenharmony_ci shufps m1, m2, m3, q3120 ; m1 = 8, 9, 10, 11, 12, 13, 14, 15 643cabdff1aSopenharmony_ci shufps m3, m0, q0220 ; m3 = 0, 1, 2, 3, 4, 5, 6, 7 644cabdff1aSopenharmony_ci shufps m0, m2, q3113 ; m0 = 16, 17, 18, 19, 20, 21, 22, 23 645cabdff1aSopenharmony_ci mova [dstq+0*mmsize], m3 646cabdff1aSopenharmony_ci mova [dstq+1*mmsize], m1 647cabdff1aSopenharmony_ci mova [dstq+2*mmsize], m0 648cabdff1aSopenharmony_ci%else ; sse 649cabdff1aSopenharmony_ci movlps xmm0, [srcq ] 650cabdff1aSopenharmony_ci movlps xmm1, [srcq+src1q] 651cabdff1aSopenharmony_ci movlps xmm2, [srcq+src2q] 652cabdff1aSopenharmony_ci movlps xmm3, [srcq+src3q] 653cabdff1aSopenharmony_ci movlps xmm4, [srcq+src4q] 654cabdff1aSopenharmony_ci movlps xmm5, [srcq+src5q] 655cabdff1aSopenharmony_ci mulps xmm0, xmm6 656cabdff1aSopenharmony_ci mulps xmm1, xmm6 657cabdff1aSopenharmony_ci mulps xmm2, xmm6 658cabdff1aSopenharmony_ci mulps xmm3, xmm6 659cabdff1aSopenharmony_ci mulps xmm4, xmm6 660cabdff1aSopenharmony_ci mulps xmm5, xmm6 661cabdff1aSopenharmony_ci cvtps2pi mm0, xmm0 662cabdff1aSopenharmony_ci cvtps2pi mm1, xmm1 663cabdff1aSopenharmony_ci cvtps2pi mm2, xmm2 664cabdff1aSopenharmony_ci cvtps2pi mm3, xmm3 665cabdff1aSopenharmony_ci cvtps2pi mm4, xmm4 666cabdff1aSopenharmony_ci cvtps2pi mm5, xmm5 667cabdff1aSopenharmony_ci packssdw mm0, mm3 ; m0 = 0, 6, 3, 9 668cabdff1aSopenharmony_ci packssdw mm1, mm4 ; m1 = 1, 7, 4, 10 669cabdff1aSopenharmony_ci packssdw mm2, mm5 ; m2 = 2, 8, 5, 11 670cabdff1aSopenharmony_ci ; unpack words 671cabdff1aSopenharmony_ci pshufw mm3, mm0, q1032 ; m3 = 3, 9, 0, 6 672cabdff1aSopenharmony_ci punpcklwd mm0, mm1 ; m0 = 0, 1, 6, 7 673cabdff1aSopenharmony_ci punpckhwd mm1, mm2 ; m1 = 4, 5, 10, 11 674cabdff1aSopenharmony_ci punpcklwd mm2, mm3 ; m2 = 2, 3, 8, 9 675cabdff1aSopenharmony_ci ; unpack dwords 676cabdff1aSopenharmony_ci pshufw mm3, mm0, q1032 ; m3 = 6, 7, 0, 1 677cabdff1aSopenharmony_ci punpckldq mm0, mm2 ; m0 = 0, 1, 2, 3 (final) 678cabdff1aSopenharmony_ci punpckhdq mm2, mm1 ; m2 = 8, 9, 10, 11 (final) 679cabdff1aSopenharmony_ci punpckldq mm1, mm3 ; m1 = 4, 5, 6, 7 (final) 680cabdff1aSopenharmony_ci mova [dstq+0*mmsize], mm0 681cabdff1aSopenharmony_ci mova [dstq+1*mmsize], mm1 682cabdff1aSopenharmony_ci mova [dstq+2*mmsize], mm2 683cabdff1aSopenharmony_ci%endif 684cabdff1aSopenharmony_ci add srcq, mmsize 685cabdff1aSopenharmony_ci add dstq, mmsize*3 686cabdff1aSopenharmony_ci sub lend, mmsize/4 687cabdff1aSopenharmony_ci jg .loop 688cabdff1aSopenharmony_ci%if mmsize == 8 689cabdff1aSopenharmony_ci emms 690cabdff1aSopenharmony_ci RET 691cabdff1aSopenharmony_ci%else 692cabdff1aSopenharmony_ci REP_RET 693cabdff1aSopenharmony_ci%endif 694cabdff1aSopenharmony_ci%endmacro 695cabdff1aSopenharmony_ci 696cabdff1aSopenharmony_ciINIT_MMX sse 697cabdff1aSopenharmony_ciCONV_FLTP_TO_S16_6CH 698cabdff1aSopenharmony_ciINIT_XMM sse2 699cabdff1aSopenharmony_ciCONV_FLTP_TO_S16_6CH 700cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL 701cabdff1aSopenharmony_ciINIT_XMM avx 702cabdff1aSopenharmony_ciCONV_FLTP_TO_S16_6CH 703cabdff1aSopenharmony_ci%endif 704cabdff1aSopenharmony_ci 705cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 706cabdff1aSopenharmony_ci; void ff_conv_fltp_to_flt_2ch(float *dst, float *const *src, int len, 707cabdff1aSopenharmony_ci; int channels); 708cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 709cabdff1aSopenharmony_ci 710cabdff1aSopenharmony_ci%macro CONV_FLTP_TO_FLT_2CH 0 711cabdff1aSopenharmony_cicglobal conv_fltp_to_flt_2ch, 3,4,5, dst, src0, len, src1 712cabdff1aSopenharmony_ci mov src1q, [src0q+gprsize] 713cabdff1aSopenharmony_ci mov src0q, [src0q] 714cabdff1aSopenharmony_ci lea lenq, [4*lend] 715cabdff1aSopenharmony_ci add src0q, lenq 716cabdff1aSopenharmony_ci add src1q, lenq 717cabdff1aSopenharmony_ci lea dstq, [dstq+2*lenq] 718cabdff1aSopenharmony_ci neg lenq 719cabdff1aSopenharmony_ci.loop: 720cabdff1aSopenharmony_ci mova m0, [src0q+lenq ] 721cabdff1aSopenharmony_ci mova m1, [src1q+lenq ] 722cabdff1aSopenharmony_ci mova m2, [src0q+lenq+mmsize] 723cabdff1aSopenharmony_ci mova m3, [src1q+lenq+mmsize] 724cabdff1aSopenharmony_ci SBUTTERFLYPS 0, 1, 4 725cabdff1aSopenharmony_ci SBUTTERFLYPS 2, 3, 4 726cabdff1aSopenharmony_ci mova [dstq+2*lenq+0*mmsize], m0 727cabdff1aSopenharmony_ci mova [dstq+2*lenq+1*mmsize], m1 728cabdff1aSopenharmony_ci mova [dstq+2*lenq+2*mmsize], m2 729cabdff1aSopenharmony_ci mova [dstq+2*lenq+3*mmsize], m3 730cabdff1aSopenharmony_ci add lenq, 2*mmsize 731cabdff1aSopenharmony_ci jl .loop 732cabdff1aSopenharmony_ci REP_RET 733cabdff1aSopenharmony_ci%endmacro 734cabdff1aSopenharmony_ci 735cabdff1aSopenharmony_ciINIT_XMM sse 736cabdff1aSopenharmony_ciCONV_FLTP_TO_FLT_2CH 737cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL 738cabdff1aSopenharmony_ciINIT_XMM avx 739cabdff1aSopenharmony_ciCONV_FLTP_TO_FLT_2CH 740cabdff1aSopenharmony_ci%endif 741cabdff1aSopenharmony_ci 742cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 743cabdff1aSopenharmony_ci; void ff_conv_fltp_to_flt_6ch(float *dst, float *const *src, int len, 744cabdff1aSopenharmony_ci; int channels); 745cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 746cabdff1aSopenharmony_ci 747cabdff1aSopenharmony_ci%macro CONV_FLTP_TO_FLT_6CH 0 748cabdff1aSopenharmony_cicglobal conv_fltp_to_flt_6ch, 2,8,7, dst, src, src1, src2, src3, src4, src5, len 749cabdff1aSopenharmony_ci%if ARCH_X86_64 750cabdff1aSopenharmony_ci mov lend, r2d 751cabdff1aSopenharmony_ci%else 752cabdff1aSopenharmony_ci %define lend dword r2m 753cabdff1aSopenharmony_ci%endif 754cabdff1aSopenharmony_ci mov src1q, [srcq+1*gprsize] 755cabdff1aSopenharmony_ci mov src2q, [srcq+2*gprsize] 756cabdff1aSopenharmony_ci mov src3q, [srcq+3*gprsize] 757cabdff1aSopenharmony_ci mov src4q, [srcq+4*gprsize] 758cabdff1aSopenharmony_ci mov src5q, [srcq+5*gprsize] 759cabdff1aSopenharmony_ci mov srcq, [srcq] 760cabdff1aSopenharmony_ci sub src1q, srcq 761cabdff1aSopenharmony_ci sub src2q, srcq 762cabdff1aSopenharmony_ci sub src3q, srcq 763cabdff1aSopenharmony_ci sub src4q, srcq 764cabdff1aSopenharmony_ci sub src5q, srcq 765cabdff1aSopenharmony_ci.loop: 766cabdff1aSopenharmony_ci mova m0, [srcq ] 767cabdff1aSopenharmony_ci mova m1, [srcq+src1q] 768cabdff1aSopenharmony_ci mova m2, [srcq+src2q] 769cabdff1aSopenharmony_ci mova m3, [srcq+src3q] 770cabdff1aSopenharmony_ci mova m4, [srcq+src4q] 771cabdff1aSopenharmony_ci mova m5, [srcq+src5q] 772cabdff1aSopenharmony_ci%if cpuflag(sse4) 773cabdff1aSopenharmony_ci SBUTTERFLYPS 0, 1, 6 774cabdff1aSopenharmony_ci SBUTTERFLYPS 2, 3, 6 775cabdff1aSopenharmony_ci SBUTTERFLYPS 4, 5, 6 776cabdff1aSopenharmony_ci 777cabdff1aSopenharmony_ci blendps m6, m4, m0, 1100b 778cabdff1aSopenharmony_ci movlhps m0, m2 779cabdff1aSopenharmony_ci movhlps m4, m2 780cabdff1aSopenharmony_ci blendps m2, m5, m1, 1100b 781cabdff1aSopenharmony_ci movlhps m1, m3 782cabdff1aSopenharmony_ci movhlps m5, m3 783cabdff1aSopenharmony_ci 784cabdff1aSopenharmony_ci movaps [dstq ], m0 785cabdff1aSopenharmony_ci movaps [dstq+16], m6 786cabdff1aSopenharmony_ci movaps [dstq+32], m4 787cabdff1aSopenharmony_ci movaps [dstq+48], m1 788cabdff1aSopenharmony_ci movaps [dstq+64], m2 789cabdff1aSopenharmony_ci movaps [dstq+80], m5 790cabdff1aSopenharmony_ci%else ; mmx 791cabdff1aSopenharmony_ci SBUTTERFLY dq, 0, 1, 6 792cabdff1aSopenharmony_ci SBUTTERFLY dq, 2, 3, 6 793cabdff1aSopenharmony_ci SBUTTERFLY dq, 4, 5, 6 794cabdff1aSopenharmony_ci 795cabdff1aSopenharmony_ci movq [dstq ], m0 796cabdff1aSopenharmony_ci movq [dstq+ 8], m2 797cabdff1aSopenharmony_ci movq [dstq+16], m4 798cabdff1aSopenharmony_ci movq [dstq+24], m1 799cabdff1aSopenharmony_ci movq [dstq+32], m3 800cabdff1aSopenharmony_ci movq [dstq+40], m5 801cabdff1aSopenharmony_ci%endif 802cabdff1aSopenharmony_ci add srcq, mmsize 803cabdff1aSopenharmony_ci add dstq, mmsize*6 804cabdff1aSopenharmony_ci sub lend, mmsize/4 805cabdff1aSopenharmony_ci jg .loop 806cabdff1aSopenharmony_ci%if mmsize == 8 807cabdff1aSopenharmony_ci emms 808cabdff1aSopenharmony_ci RET 809cabdff1aSopenharmony_ci%else 810cabdff1aSopenharmony_ci REP_RET 811cabdff1aSopenharmony_ci%endif 812cabdff1aSopenharmony_ci%endmacro 813cabdff1aSopenharmony_ci 814cabdff1aSopenharmony_ciINIT_MMX mmx 815cabdff1aSopenharmony_ciCONV_FLTP_TO_FLT_6CH 816cabdff1aSopenharmony_ciINIT_XMM sse4 817cabdff1aSopenharmony_ciCONV_FLTP_TO_FLT_6CH 818cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL 819cabdff1aSopenharmony_ciINIT_XMM avx 820cabdff1aSopenharmony_ciCONV_FLTP_TO_FLT_6CH 821cabdff1aSopenharmony_ci%endif 822cabdff1aSopenharmony_ci 823cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 824cabdff1aSopenharmony_ci; void ff_conv_s16_to_s16p_2ch(int16_t *const *dst, int16_t *src, int len, 825cabdff1aSopenharmony_ci; int channels); 826cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 827cabdff1aSopenharmony_ci 828cabdff1aSopenharmony_ci%macro CONV_S16_TO_S16P_2CH 0 829cabdff1aSopenharmony_cicglobal conv_s16_to_s16p_2ch, 3,4,4, dst0, src, len, dst1 830cabdff1aSopenharmony_ci lea lenq, [2*lend] 831cabdff1aSopenharmony_ci mov dst1q, [dst0q+gprsize] 832cabdff1aSopenharmony_ci mov dst0q, [dst0q ] 833cabdff1aSopenharmony_ci lea srcq, [srcq+2*lenq] 834cabdff1aSopenharmony_ci add dst0q, lenq 835cabdff1aSopenharmony_ci add dst1q, lenq 836cabdff1aSopenharmony_ci neg lenq 837cabdff1aSopenharmony_ci%if cpuflag(ssse3) 838cabdff1aSopenharmony_ci mova m3, [pb_deinterleave_words] 839cabdff1aSopenharmony_ci%endif 840cabdff1aSopenharmony_ci.loop: 841cabdff1aSopenharmony_ci mova m0, [srcq+2*lenq ] ; m0 = 0, 1, 2, 3, 4, 5, 6, 7 842cabdff1aSopenharmony_ci mova m1, [srcq+2*lenq+mmsize] ; m1 = 8, 9, 10, 11, 12, 13, 14, 15 843cabdff1aSopenharmony_ci%if cpuflag(ssse3) 844cabdff1aSopenharmony_ci pshufb m0, m3 ; m0 = 0, 2, 4, 6, 1, 3, 5, 7 845cabdff1aSopenharmony_ci pshufb m1, m3 ; m1 = 8, 10, 12, 14, 9, 11, 13, 15 846cabdff1aSopenharmony_ci SBUTTERFLY2 qdq, 0, 1, 2 ; m0 = 0, 2, 4, 6, 8, 10, 12, 14 847cabdff1aSopenharmony_ci ; m1 = 1, 3, 5, 7, 9, 11, 13, 15 848cabdff1aSopenharmony_ci%else ; sse2 849cabdff1aSopenharmony_ci pshuflw m0, m0, q3120 ; m0 = 0, 2, 1, 3, 4, 5, 6, 7 850cabdff1aSopenharmony_ci pshufhw m0, m0, q3120 ; m0 = 0, 2, 1, 3, 4, 6, 5, 7 851cabdff1aSopenharmony_ci pshuflw m1, m1, q3120 ; m1 = 8, 10, 9, 11, 12, 13, 14, 15 852cabdff1aSopenharmony_ci pshufhw m1, m1, q3120 ; m1 = 8, 10, 9, 11, 12, 14, 13, 15 853cabdff1aSopenharmony_ci DEINT2_PS 0, 1, 2 ; m0 = 0, 2, 4, 6, 8, 10, 12, 14 854cabdff1aSopenharmony_ci ; m1 = 1, 3, 5, 7, 9, 11, 13, 15 855cabdff1aSopenharmony_ci%endif 856cabdff1aSopenharmony_ci mova [dst0q+lenq], m0 857cabdff1aSopenharmony_ci mova [dst1q+lenq], m1 858cabdff1aSopenharmony_ci add lenq, mmsize 859cabdff1aSopenharmony_ci jl .loop 860cabdff1aSopenharmony_ci REP_RET 861cabdff1aSopenharmony_ci%endmacro 862cabdff1aSopenharmony_ci 863cabdff1aSopenharmony_ciINIT_XMM sse2 864cabdff1aSopenharmony_ciCONV_S16_TO_S16P_2CH 865cabdff1aSopenharmony_ciINIT_XMM ssse3 866cabdff1aSopenharmony_ciCONV_S16_TO_S16P_2CH 867cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL 868cabdff1aSopenharmony_ciINIT_XMM avx 869cabdff1aSopenharmony_ciCONV_S16_TO_S16P_2CH 870cabdff1aSopenharmony_ci%endif 871cabdff1aSopenharmony_ci 872cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 873cabdff1aSopenharmony_ci; void ff_conv_s16_to_s16p_6ch(int16_t *const *dst, int16_t *src, int len, 874cabdff1aSopenharmony_ci; int channels); 875cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 876cabdff1aSopenharmony_ci 877cabdff1aSopenharmony_ci%macro CONV_S16_TO_S16P_6CH 0 878cabdff1aSopenharmony_ci%if ARCH_X86_64 879cabdff1aSopenharmony_cicglobal conv_s16_to_s16p_6ch, 3,8,5, dst, src, len, dst1, dst2, dst3, dst4, dst5 880cabdff1aSopenharmony_ci%else 881cabdff1aSopenharmony_cicglobal conv_s16_to_s16p_6ch, 2,7,5, dst, src, dst1, dst2, dst3, dst4, dst5 882cabdff1aSopenharmony_ci%define lend dword r2m 883cabdff1aSopenharmony_ci%endif 884cabdff1aSopenharmony_ci mov dst1q, [dstq+ gprsize] 885cabdff1aSopenharmony_ci mov dst2q, [dstq+2*gprsize] 886cabdff1aSopenharmony_ci mov dst3q, [dstq+3*gprsize] 887cabdff1aSopenharmony_ci mov dst4q, [dstq+4*gprsize] 888cabdff1aSopenharmony_ci mov dst5q, [dstq+5*gprsize] 889cabdff1aSopenharmony_ci mov dstq, [dstq ] 890cabdff1aSopenharmony_ci sub dst1q, dstq 891cabdff1aSopenharmony_ci sub dst2q, dstq 892cabdff1aSopenharmony_ci sub dst3q, dstq 893cabdff1aSopenharmony_ci sub dst4q, dstq 894cabdff1aSopenharmony_ci sub dst5q, dstq 895cabdff1aSopenharmony_ci.loop: 896cabdff1aSopenharmony_ci mova m0, [srcq+0*mmsize] ; m0 = 0, 1, 2, 3, 4, 5, 6, 7 897cabdff1aSopenharmony_ci mova m3, [srcq+1*mmsize] ; m3 = 8, 9, 10, 11, 12, 13, 14, 15 898cabdff1aSopenharmony_ci mova m2, [srcq+2*mmsize] ; m2 = 16, 17, 18, 19, 20, 21, 22, 23 899cabdff1aSopenharmony_ci PALIGNR m1, m3, m0, 12, m4 ; m1 = 6, 7, 8, 9, 10, 11, x, x 900cabdff1aSopenharmony_ci shufps m3, m2, q1032 ; m3 = 12, 13, 14, 15, 16, 17, 18, 19 901cabdff1aSopenharmony_ci psrldq m2, 4 ; m2 = 18, 19, 20, 21, 22, 23, x, x 902cabdff1aSopenharmony_ci SBUTTERFLY2 wd, 0, 1, 4 ; m0 = 0, 6, 1, 7, 2, 8, 3, 9 903cabdff1aSopenharmony_ci ; m1 = 4, 10, 5, 11, x, x, x, x 904cabdff1aSopenharmony_ci SBUTTERFLY2 wd, 3, 2, 4 ; m3 = 12, 18, 13, 19, 14, 20, 15, 21 905cabdff1aSopenharmony_ci ; m2 = 16, 22, 17, 23, x, x, x, x 906cabdff1aSopenharmony_ci SBUTTERFLY2 dq, 0, 3, 4 ; m0 = 0, 6, 12, 18, 1, 7, 13, 19 907cabdff1aSopenharmony_ci ; m3 = 2, 8, 14, 20, 3, 9, 15, 21 908cabdff1aSopenharmony_ci punpckldq m1, m2 ; m1 = 4, 10, 16, 22, 5, 11, 17, 23 909cabdff1aSopenharmony_ci movq [dstq ], m0 910cabdff1aSopenharmony_ci movhps [dstq+dst1q], m0 911cabdff1aSopenharmony_ci movq [dstq+dst2q], m3 912cabdff1aSopenharmony_ci movhps [dstq+dst3q], m3 913cabdff1aSopenharmony_ci movq [dstq+dst4q], m1 914cabdff1aSopenharmony_ci movhps [dstq+dst5q], m1 915cabdff1aSopenharmony_ci add srcq, mmsize*3 916cabdff1aSopenharmony_ci add dstq, mmsize/2 917cabdff1aSopenharmony_ci sub lend, mmsize/4 918cabdff1aSopenharmony_ci jg .loop 919cabdff1aSopenharmony_ci REP_RET 920cabdff1aSopenharmony_ci%endmacro 921cabdff1aSopenharmony_ci 922cabdff1aSopenharmony_ciINIT_XMM sse2 923cabdff1aSopenharmony_ciCONV_S16_TO_S16P_6CH 924cabdff1aSopenharmony_ciINIT_XMM ssse3 925cabdff1aSopenharmony_ciCONV_S16_TO_S16P_6CH 926cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL 927cabdff1aSopenharmony_ciINIT_XMM avx 928cabdff1aSopenharmony_ciCONV_S16_TO_S16P_6CH 929cabdff1aSopenharmony_ci%endif 930cabdff1aSopenharmony_ci 931cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 932cabdff1aSopenharmony_ci; void ff_conv_s16_to_fltp_2ch(float *const *dst, int16_t *src, int len, 933cabdff1aSopenharmony_ci; int channels); 934cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 935cabdff1aSopenharmony_ci 936cabdff1aSopenharmony_ci%macro CONV_S16_TO_FLTP_2CH 0 937cabdff1aSopenharmony_cicglobal conv_s16_to_fltp_2ch, 3,4,5, dst0, src, len, dst1 938cabdff1aSopenharmony_ci lea lenq, [4*lend] 939cabdff1aSopenharmony_ci mov dst1q, [dst0q+gprsize] 940cabdff1aSopenharmony_ci mov dst0q, [dst0q ] 941cabdff1aSopenharmony_ci add srcq, lenq 942cabdff1aSopenharmony_ci add dst0q, lenq 943cabdff1aSopenharmony_ci add dst1q, lenq 944cabdff1aSopenharmony_ci neg lenq 945cabdff1aSopenharmony_ci mova m3, [pf_s32_inv_scale] 946cabdff1aSopenharmony_ci mova m4, [pw_zero_even] 947cabdff1aSopenharmony_ci.loop: 948cabdff1aSopenharmony_ci mova m1, [srcq+lenq] 949cabdff1aSopenharmony_ci pslld m0, m1, 16 950cabdff1aSopenharmony_ci pand m1, m4 951cabdff1aSopenharmony_ci cvtdq2ps m0, m0 952cabdff1aSopenharmony_ci cvtdq2ps m1, m1 953cabdff1aSopenharmony_ci mulps m0, m0, m3 954cabdff1aSopenharmony_ci mulps m1, m1, m3 955cabdff1aSopenharmony_ci mova [dst0q+lenq], m0 956cabdff1aSopenharmony_ci mova [dst1q+lenq], m1 957cabdff1aSopenharmony_ci add lenq, mmsize 958cabdff1aSopenharmony_ci jl .loop 959cabdff1aSopenharmony_ci REP_RET 960cabdff1aSopenharmony_ci%endmacro 961cabdff1aSopenharmony_ci 962cabdff1aSopenharmony_ciINIT_XMM sse2 963cabdff1aSopenharmony_ciCONV_S16_TO_FLTP_2CH 964cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL 965cabdff1aSopenharmony_ciINIT_XMM avx 966cabdff1aSopenharmony_ciCONV_S16_TO_FLTP_2CH 967cabdff1aSopenharmony_ci%endif 968cabdff1aSopenharmony_ci 969cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 970cabdff1aSopenharmony_ci; void ff_conv_s16_to_fltp_6ch(float *const *dst, int16_t *src, int len, 971cabdff1aSopenharmony_ci; int channels); 972cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 973cabdff1aSopenharmony_ci 974cabdff1aSopenharmony_ci%macro CONV_S16_TO_FLTP_6CH 0 975cabdff1aSopenharmony_ci%if ARCH_X86_64 976cabdff1aSopenharmony_cicglobal conv_s16_to_fltp_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5 977cabdff1aSopenharmony_ci%else 978cabdff1aSopenharmony_cicglobal conv_s16_to_fltp_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5 979cabdff1aSopenharmony_ci%define lend dword r2m 980cabdff1aSopenharmony_ci%endif 981cabdff1aSopenharmony_ci mov dst1q, [dstq+ gprsize] 982cabdff1aSopenharmony_ci mov dst2q, [dstq+2*gprsize] 983cabdff1aSopenharmony_ci mov dst3q, [dstq+3*gprsize] 984cabdff1aSopenharmony_ci mov dst4q, [dstq+4*gprsize] 985cabdff1aSopenharmony_ci mov dst5q, [dstq+5*gprsize] 986cabdff1aSopenharmony_ci mov dstq, [dstq ] 987cabdff1aSopenharmony_ci sub dst1q, dstq 988cabdff1aSopenharmony_ci sub dst2q, dstq 989cabdff1aSopenharmony_ci sub dst3q, dstq 990cabdff1aSopenharmony_ci sub dst4q, dstq 991cabdff1aSopenharmony_ci sub dst5q, dstq 992cabdff1aSopenharmony_ci mova m6, [pf_s16_inv_scale] 993cabdff1aSopenharmony_ci.loop: 994cabdff1aSopenharmony_ci mova m0, [srcq+0*mmsize] ; m0 = 0, 1, 2, 3, 4, 5, 6, 7 995cabdff1aSopenharmony_ci mova m3, [srcq+1*mmsize] ; m3 = 8, 9, 10, 11, 12, 13, 14, 15 996cabdff1aSopenharmony_ci mova m2, [srcq+2*mmsize] ; m2 = 16, 17, 18, 19, 20, 21, 22, 23 997cabdff1aSopenharmony_ci PALIGNR m1, m3, m0, 12, m4 ; m1 = 6, 7, 8, 9, 10, 11, x, x 998cabdff1aSopenharmony_ci shufps m3, m2, q1032 ; m3 = 12, 13, 14, 15, 16, 17, 18, 19 999cabdff1aSopenharmony_ci psrldq m2, 4 ; m2 = 18, 19, 20, 21, 22, 23, x, x 1000cabdff1aSopenharmony_ci SBUTTERFLY2 wd, 0, 1, 4 ; m0 = 0, 6, 1, 7, 2, 8, 3, 9 1001cabdff1aSopenharmony_ci ; m1 = 4, 10, 5, 11, x, x, x, x 1002cabdff1aSopenharmony_ci SBUTTERFLY2 wd, 3, 2, 4 ; m3 = 12, 18, 13, 19, 14, 20, 15, 21 1003cabdff1aSopenharmony_ci ; m2 = 16, 22, 17, 23, x, x, x, x 1004cabdff1aSopenharmony_ci SBUTTERFLY2 dq, 0, 3, 4 ; m0 = 0, 6, 12, 18, 1, 7, 13, 19 1005cabdff1aSopenharmony_ci ; m3 = 2, 8, 14, 20, 3, 9, 15, 21 1006cabdff1aSopenharmony_ci punpckldq m1, m2 ; m1 = 4, 10, 16, 22, 5, 11, 17, 23 1007cabdff1aSopenharmony_ci S16_TO_S32_SX 0, 2 ; m0 = 0, 6, 12, 18 1008cabdff1aSopenharmony_ci ; m2 = 1, 7, 13, 19 1009cabdff1aSopenharmony_ci S16_TO_S32_SX 3, 4 ; m3 = 2, 8, 14, 20 1010cabdff1aSopenharmony_ci ; m4 = 3, 9, 15, 21 1011cabdff1aSopenharmony_ci S16_TO_S32_SX 1, 5 ; m1 = 4, 10, 16, 22 1012cabdff1aSopenharmony_ci ; m5 = 5, 11, 17, 23 1013cabdff1aSopenharmony_ci SWAP 1,2,3,4 1014cabdff1aSopenharmony_ci cvtdq2ps m0, m0 1015cabdff1aSopenharmony_ci cvtdq2ps m1, m1 1016cabdff1aSopenharmony_ci cvtdq2ps m2, m2 1017cabdff1aSopenharmony_ci cvtdq2ps m3, m3 1018cabdff1aSopenharmony_ci cvtdq2ps m4, m4 1019cabdff1aSopenharmony_ci cvtdq2ps m5, m5 1020cabdff1aSopenharmony_ci mulps m0, m6 1021cabdff1aSopenharmony_ci mulps m1, m6 1022cabdff1aSopenharmony_ci mulps m2, m6 1023cabdff1aSopenharmony_ci mulps m3, m6 1024cabdff1aSopenharmony_ci mulps m4, m6 1025cabdff1aSopenharmony_ci mulps m5, m6 1026cabdff1aSopenharmony_ci mova [dstq ], m0 1027cabdff1aSopenharmony_ci mova [dstq+dst1q], m1 1028cabdff1aSopenharmony_ci mova [dstq+dst2q], m2 1029cabdff1aSopenharmony_ci mova [dstq+dst3q], m3 1030cabdff1aSopenharmony_ci mova [dstq+dst4q], m4 1031cabdff1aSopenharmony_ci mova [dstq+dst5q], m5 1032cabdff1aSopenharmony_ci add srcq, mmsize*3 1033cabdff1aSopenharmony_ci add dstq, mmsize 1034cabdff1aSopenharmony_ci sub lend, mmsize/4 1035cabdff1aSopenharmony_ci jg .loop 1036cabdff1aSopenharmony_ci REP_RET 1037cabdff1aSopenharmony_ci%endmacro 1038cabdff1aSopenharmony_ci 1039cabdff1aSopenharmony_ciINIT_XMM sse2 1040cabdff1aSopenharmony_ciCONV_S16_TO_FLTP_6CH 1041cabdff1aSopenharmony_ciINIT_XMM ssse3 1042cabdff1aSopenharmony_ciCONV_S16_TO_FLTP_6CH 1043cabdff1aSopenharmony_ciINIT_XMM sse4 1044cabdff1aSopenharmony_ciCONV_S16_TO_FLTP_6CH 1045cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL 1046cabdff1aSopenharmony_ciINIT_XMM avx 1047cabdff1aSopenharmony_ciCONV_S16_TO_FLTP_6CH 1048cabdff1aSopenharmony_ci%endif 1049cabdff1aSopenharmony_ci 1050cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 1051cabdff1aSopenharmony_ci; void ff_conv_flt_to_s16p_2ch(int16_t *const *dst, float *src, int len, 1052cabdff1aSopenharmony_ci; int channels); 1053cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 1054cabdff1aSopenharmony_ci 1055cabdff1aSopenharmony_ci%macro CONV_FLT_TO_S16P_2CH 0 1056cabdff1aSopenharmony_cicglobal conv_flt_to_s16p_2ch, 3,4,6, dst0, src, len, dst1 1057cabdff1aSopenharmony_ci lea lenq, [2*lend] 1058cabdff1aSopenharmony_ci mov dst1q, [dst0q+gprsize] 1059cabdff1aSopenharmony_ci mov dst0q, [dst0q ] 1060cabdff1aSopenharmony_ci lea srcq, [srcq+4*lenq] 1061cabdff1aSopenharmony_ci add dst0q, lenq 1062cabdff1aSopenharmony_ci add dst1q, lenq 1063cabdff1aSopenharmony_ci neg lenq 1064cabdff1aSopenharmony_ci mova m5, [pf_s16_scale] 1065cabdff1aSopenharmony_ci.loop: 1066cabdff1aSopenharmony_ci mova m0, [srcq+4*lenq ] 1067cabdff1aSopenharmony_ci mova m1, [srcq+4*lenq+ mmsize] 1068cabdff1aSopenharmony_ci mova m2, [srcq+4*lenq+2*mmsize] 1069cabdff1aSopenharmony_ci mova m3, [srcq+4*lenq+3*mmsize] 1070cabdff1aSopenharmony_ci DEINT2_PS 0, 1, 4 1071cabdff1aSopenharmony_ci DEINT2_PS 2, 3, 4 1072cabdff1aSopenharmony_ci mulps m0, m0, m5 1073cabdff1aSopenharmony_ci mulps m1, m1, m5 1074cabdff1aSopenharmony_ci mulps m2, m2, m5 1075cabdff1aSopenharmony_ci mulps m3, m3, m5 1076cabdff1aSopenharmony_ci cvtps2dq m0, m0 1077cabdff1aSopenharmony_ci cvtps2dq m1, m1 1078cabdff1aSopenharmony_ci cvtps2dq m2, m2 1079cabdff1aSopenharmony_ci cvtps2dq m3, m3 1080cabdff1aSopenharmony_ci packssdw m0, m2 1081cabdff1aSopenharmony_ci packssdw m1, m3 1082cabdff1aSopenharmony_ci mova [dst0q+lenq], m0 1083cabdff1aSopenharmony_ci mova [dst1q+lenq], m1 1084cabdff1aSopenharmony_ci add lenq, mmsize 1085cabdff1aSopenharmony_ci jl .loop 1086cabdff1aSopenharmony_ci REP_RET 1087cabdff1aSopenharmony_ci%endmacro 1088cabdff1aSopenharmony_ci 1089cabdff1aSopenharmony_ciINIT_XMM sse2 1090cabdff1aSopenharmony_ciCONV_FLT_TO_S16P_2CH 1091cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL 1092cabdff1aSopenharmony_ciINIT_XMM avx 1093cabdff1aSopenharmony_ciCONV_FLT_TO_S16P_2CH 1094cabdff1aSopenharmony_ci%endif 1095cabdff1aSopenharmony_ci 1096cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 1097cabdff1aSopenharmony_ci; void ff_conv_flt_to_s16p_6ch(int16_t *const *dst, float *src, int len, 1098cabdff1aSopenharmony_ci; int channels); 1099cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 1100cabdff1aSopenharmony_ci 1101cabdff1aSopenharmony_ci%macro CONV_FLT_TO_S16P_6CH 0 1102cabdff1aSopenharmony_ci%if ARCH_X86_64 1103cabdff1aSopenharmony_cicglobal conv_flt_to_s16p_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5 1104cabdff1aSopenharmony_ci%else 1105cabdff1aSopenharmony_cicglobal conv_flt_to_s16p_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5 1106cabdff1aSopenharmony_ci%define lend dword r2m 1107cabdff1aSopenharmony_ci%endif 1108cabdff1aSopenharmony_ci mov dst1q, [dstq+ gprsize] 1109cabdff1aSopenharmony_ci mov dst2q, [dstq+2*gprsize] 1110cabdff1aSopenharmony_ci mov dst3q, [dstq+3*gprsize] 1111cabdff1aSopenharmony_ci mov dst4q, [dstq+4*gprsize] 1112cabdff1aSopenharmony_ci mov dst5q, [dstq+5*gprsize] 1113cabdff1aSopenharmony_ci mov dstq, [dstq ] 1114cabdff1aSopenharmony_ci sub dst1q, dstq 1115cabdff1aSopenharmony_ci sub dst2q, dstq 1116cabdff1aSopenharmony_ci sub dst3q, dstq 1117cabdff1aSopenharmony_ci sub dst4q, dstq 1118cabdff1aSopenharmony_ci sub dst5q, dstq 1119cabdff1aSopenharmony_ci mova m6, [pf_s16_scale] 1120cabdff1aSopenharmony_ci.loop: 1121cabdff1aSopenharmony_ci mulps m0, m6, [srcq+0*mmsize] 1122cabdff1aSopenharmony_ci mulps m3, m6, [srcq+1*mmsize] 1123cabdff1aSopenharmony_ci mulps m1, m6, [srcq+2*mmsize] 1124cabdff1aSopenharmony_ci mulps m4, m6, [srcq+3*mmsize] 1125cabdff1aSopenharmony_ci mulps m2, m6, [srcq+4*mmsize] 1126cabdff1aSopenharmony_ci mulps m5, m6, [srcq+5*mmsize] 1127cabdff1aSopenharmony_ci cvtps2dq m0, m0 1128cabdff1aSopenharmony_ci cvtps2dq m1, m1 1129cabdff1aSopenharmony_ci cvtps2dq m2, m2 1130cabdff1aSopenharmony_ci cvtps2dq m3, m3 1131cabdff1aSopenharmony_ci cvtps2dq m4, m4 1132cabdff1aSopenharmony_ci cvtps2dq m5, m5 1133cabdff1aSopenharmony_ci packssdw m0, m3 ; m0 = 0, 1, 2, 3, 4, 5, 6, 7 1134cabdff1aSopenharmony_ci packssdw m1, m4 ; m1 = 8, 9, 10, 11, 12, 13, 14, 15 1135cabdff1aSopenharmony_ci packssdw m2, m5 ; m2 = 16, 17, 18, 19, 20, 21, 22, 23 1136cabdff1aSopenharmony_ci PALIGNR m3, m1, m0, 12, m4 ; m3 = 6, 7, 8, 9, 10, 11, x, x 1137cabdff1aSopenharmony_ci shufps m1, m2, q1032 ; m1 = 12, 13, 14, 15, 16, 17, 18, 19 1138cabdff1aSopenharmony_ci psrldq m2, 4 ; m2 = 18, 19, 20, 21, 22, 23, x, x 1139cabdff1aSopenharmony_ci SBUTTERFLY2 wd, 0, 3, 4 ; m0 = 0, 6, 1, 7, 2, 8, 3, 9 1140cabdff1aSopenharmony_ci ; m3 = 4, 10, 5, 11, x, x, x, x 1141cabdff1aSopenharmony_ci SBUTTERFLY2 wd, 1, 2, 4 ; m1 = 12, 18, 13, 19, 14, 20, 15, 21 1142cabdff1aSopenharmony_ci ; m2 = 16, 22, 17, 23, x, x, x, x 1143cabdff1aSopenharmony_ci SBUTTERFLY2 dq, 0, 1, 4 ; m0 = 0, 6, 12, 18, 1, 7, 13, 19 1144cabdff1aSopenharmony_ci ; m1 = 2, 8, 14, 20, 3, 9, 15, 21 1145cabdff1aSopenharmony_ci punpckldq m3, m2 ; m3 = 4, 10, 16, 22, 5, 11, 17, 23 1146cabdff1aSopenharmony_ci movq [dstq ], m0 1147cabdff1aSopenharmony_ci movhps [dstq+dst1q], m0 1148cabdff1aSopenharmony_ci movq [dstq+dst2q], m1 1149cabdff1aSopenharmony_ci movhps [dstq+dst3q], m1 1150cabdff1aSopenharmony_ci movq [dstq+dst4q], m3 1151cabdff1aSopenharmony_ci movhps [dstq+dst5q], m3 1152cabdff1aSopenharmony_ci add srcq, mmsize*6 1153cabdff1aSopenharmony_ci add dstq, mmsize/2 1154cabdff1aSopenharmony_ci sub lend, mmsize/4 1155cabdff1aSopenharmony_ci jg .loop 1156cabdff1aSopenharmony_ci REP_RET 1157cabdff1aSopenharmony_ci%endmacro 1158cabdff1aSopenharmony_ci 1159cabdff1aSopenharmony_ciINIT_XMM sse2 1160cabdff1aSopenharmony_ciCONV_FLT_TO_S16P_6CH 1161cabdff1aSopenharmony_ciINIT_XMM ssse3 1162cabdff1aSopenharmony_ciCONV_FLT_TO_S16P_6CH 1163cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL 1164cabdff1aSopenharmony_ciINIT_XMM avx 1165cabdff1aSopenharmony_ciCONV_FLT_TO_S16P_6CH 1166cabdff1aSopenharmony_ci%endif 1167cabdff1aSopenharmony_ci 1168cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 1169cabdff1aSopenharmony_ci; void ff_conv_flt_to_fltp_2ch(float *const *dst, float *src, int len, 1170cabdff1aSopenharmony_ci; int channels); 1171cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 1172cabdff1aSopenharmony_ci 1173cabdff1aSopenharmony_ci%macro CONV_FLT_TO_FLTP_2CH 0 1174cabdff1aSopenharmony_cicglobal conv_flt_to_fltp_2ch, 3,4,3, dst0, src, len, dst1 1175cabdff1aSopenharmony_ci lea lenq, [4*lend] 1176cabdff1aSopenharmony_ci mov dst1q, [dst0q+gprsize] 1177cabdff1aSopenharmony_ci mov dst0q, [dst0q ] 1178cabdff1aSopenharmony_ci lea srcq, [srcq+2*lenq] 1179cabdff1aSopenharmony_ci add dst0q, lenq 1180cabdff1aSopenharmony_ci add dst1q, lenq 1181cabdff1aSopenharmony_ci neg lenq 1182cabdff1aSopenharmony_ci.loop: 1183cabdff1aSopenharmony_ci mova m0, [srcq+2*lenq ] 1184cabdff1aSopenharmony_ci mova m1, [srcq+2*lenq+mmsize] 1185cabdff1aSopenharmony_ci DEINT2_PS 0, 1, 2 1186cabdff1aSopenharmony_ci mova [dst0q+lenq], m0 1187cabdff1aSopenharmony_ci mova [dst1q+lenq], m1 1188cabdff1aSopenharmony_ci add lenq, mmsize 1189cabdff1aSopenharmony_ci jl .loop 1190cabdff1aSopenharmony_ci REP_RET 1191cabdff1aSopenharmony_ci%endmacro 1192cabdff1aSopenharmony_ci 1193cabdff1aSopenharmony_ciINIT_XMM sse 1194cabdff1aSopenharmony_ciCONV_FLT_TO_FLTP_2CH 1195cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL 1196cabdff1aSopenharmony_ciINIT_XMM avx 1197cabdff1aSopenharmony_ciCONV_FLT_TO_FLTP_2CH 1198cabdff1aSopenharmony_ci%endif 1199cabdff1aSopenharmony_ci 1200cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 1201cabdff1aSopenharmony_ci; void ff_conv_flt_to_fltp_6ch(float *const *dst, float *src, int len, 1202cabdff1aSopenharmony_ci; int channels); 1203cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 1204cabdff1aSopenharmony_ci 1205cabdff1aSopenharmony_ci%macro CONV_FLT_TO_FLTP_6CH 0 1206cabdff1aSopenharmony_ci%if ARCH_X86_64 1207cabdff1aSopenharmony_cicglobal conv_flt_to_fltp_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5 1208cabdff1aSopenharmony_ci%else 1209cabdff1aSopenharmony_cicglobal conv_flt_to_fltp_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5 1210cabdff1aSopenharmony_ci%define lend dword r2m 1211cabdff1aSopenharmony_ci%endif 1212cabdff1aSopenharmony_ci mov dst1q, [dstq+ gprsize] 1213cabdff1aSopenharmony_ci mov dst2q, [dstq+2*gprsize] 1214cabdff1aSopenharmony_ci mov dst3q, [dstq+3*gprsize] 1215cabdff1aSopenharmony_ci mov dst4q, [dstq+4*gprsize] 1216cabdff1aSopenharmony_ci mov dst5q, [dstq+5*gprsize] 1217cabdff1aSopenharmony_ci mov dstq, [dstq ] 1218cabdff1aSopenharmony_ci sub dst1q, dstq 1219cabdff1aSopenharmony_ci sub dst2q, dstq 1220cabdff1aSopenharmony_ci sub dst3q, dstq 1221cabdff1aSopenharmony_ci sub dst4q, dstq 1222cabdff1aSopenharmony_ci sub dst5q, dstq 1223cabdff1aSopenharmony_ci.loop: 1224cabdff1aSopenharmony_ci mova m0, [srcq+0*mmsize] ; m0 = 0, 1, 2, 3 1225cabdff1aSopenharmony_ci mova m1, [srcq+1*mmsize] ; m1 = 4, 5, 6, 7 1226cabdff1aSopenharmony_ci mova m2, [srcq+2*mmsize] ; m2 = 8, 9, 10, 11 1227cabdff1aSopenharmony_ci mova m3, [srcq+3*mmsize] ; m3 = 12, 13, 14, 15 1228cabdff1aSopenharmony_ci mova m4, [srcq+4*mmsize] ; m4 = 16, 17, 18, 19 1229cabdff1aSopenharmony_ci mova m5, [srcq+5*mmsize] ; m5 = 20, 21, 22, 23 1230cabdff1aSopenharmony_ci 1231cabdff1aSopenharmony_ci SBUTTERFLY2 dq, 0, 3, 6 ; m0 = 0, 12, 1, 13 1232cabdff1aSopenharmony_ci ; m3 = 2, 14, 3, 15 1233cabdff1aSopenharmony_ci SBUTTERFLY2 dq, 1, 4, 6 ; m1 = 4, 16, 5, 17 1234cabdff1aSopenharmony_ci ; m4 = 6, 18, 7, 19 1235cabdff1aSopenharmony_ci SBUTTERFLY2 dq, 2, 5, 6 ; m2 = 8, 20, 9, 21 1236cabdff1aSopenharmony_ci ; m5 = 10, 22, 11, 23 1237cabdff1aSopenharmony_ci SBUTTERFLY2 dq, 0, 4, 6 ; m0 = 0, 6, 12, 18 1238cabdff1aSopenharmony_ci ; m4 = 1, 7, 13, 19 1239cabdff1aSopenharmony_ci SBUTTERFLY2 dq, 3, 2, 6 ; m3 = 2, 8, 14, 20 1240cabdff1aSopenharmony_ci ; m2 = 3, 9, 15, 21 1241cabdff1aSopenharmony_ci SBUTTERFLY2 dq, 1, 5, 6 ; m1 = 4, 10, 16, 22 1242cabdff1aSopenharmony_ci ; m5 = 5, 11, 17, 23 1243cabdff1aSopenharmony_ci mova [dstq ], m0 1244cabdff1aSopenharmony_ci mova [dstq+dst1q], m4 1245cabdff1aSopenharmony_ci mova [dstq+dst2q], m3 1246cabdff1aSopenharmony_ci mova [dstq+dst3q], m2 1247cabdff1aSopenharmony_ci mova [dstq+dst4q], m1 1248cabdff1aSopenharmony_ci mova [dstq+dst5q], m5 1249cabdff1aSopenharmony_ci add srcq, mmsize*6 1250cabdff1aSopenharmony_ci add dstq, mmsize 1251cabdff1aSopenharmony_ci sub lend, mmsize/4 1252cabdff1aSopenharmony_ci jg .loop 1253cabdff1aSopenharmony_ci REP_RET 1254cabdff1aSopenharmony_ci%endmacro 1255cabdff1aSopenharmony_ci 1256cabdff1aSopenharmony_ciINIT_XMM sse2 1257cabdff1aSopenharmony_ciCONV_FLT_TO_FLTP_6CH 1258cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL 1259cabdff1aSopenharmony_ciINIT_XMM avx 1260cabdff1aSopenharmony_ciCONV_FLT_TO_FLTP_6CH 1261cabdff1aSopenharmony_ci%endif 1262