;*****************************************************************************
;* x86-optimized Float DSP functions
;*
;* Copyright 2006 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32
; Dword indices 7..0; loaded by the AVX2 variant of vector_fmul_reverse as the
; vpermps control vector to reverse the eight floats of a YMM register.
pd_reverse: dd 7, 6, 5, 4, 3, 2, 1, 0

SECTION .text

;-----------------------------------------------------------------------------
; void vector_fmul(float *dst, const float *src0, const float *src1, int len)
;
; dst[i] = src0[i] * src1[i]
;
; Every loop iteration handles 64 bytes (16 floats): the %rep body is emitted
; 32/mmsize times and touches two vectors each time. Processing starts at the
; last 64-byte chunk and walks towards index 0.
; NOTE(review): there is no remainder handling, so len presumably has to be a
; positive multiple of 16 - confirm against the callers.
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL 0
cglobal vector_fmul, 4,4,2, dst, src0, src1, len
    ; lenq = byte offset of the last 64-byte chunk
    lea       lenq, [lend*4 - 64]
ALIGN 16
.loop:
%assign a 0
%rep 32/mmsize
    mova      m0,   [src0q + lenq + (a+0)*mmsize]
    mova      m1,   [src0q + lenq + (a+1)*mmsize]
    mulps     m0, m0, [src1q + lenq + (a+0)*mmsize]
    mulps     m1, m1, [src1q + lenq + (a+1)*mmsize]
    mova      [dstq + lenq + (a+0)*mmsize], m0
    mova      [dstq + lenq + (a+1)*mmsize], m1
%assign a a+2
%endrep

    ; step back one chunk; keep looping while the offset is still >= 0
    sub       lenq, 64
    jge       .loop
    REP_RET
%endmacro
; Emit vector_fmul for SSE (XMM) and, when the build enables it, AVX (YMM).
INIT_XMM sse
VECTOR_FMUL
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL
%endif

;-----------------------------------------------------------------------------
; void vector_dmul(double *dst, const double *src0, const double *src1, int len)
;
; dst[i] = src0[i] * src1[i]
;
; Four vectors (4*mmsize bytes) are processed per iteration, starting with the
; last chunk and walking towards index 0.
; NOTE(review): there is no remainder handling, so len presumably has to be a
; positive multiple of 4*mmsize/8 doubles - confirm against the callers.
;-----------------------------------------------------------------------------
%macro VECTOR_DMUL 0
cglobal vector_dmul, 4,4,4, dst, src0, src1, len
    ; byte offset of the last 4*mmsize chunk
    lea       lend, [lenq*8 - mmsize*4]
ALIGN 16
.loop:
    movaps    m0,     [src0q + lenq + 0*mmsize]
    movaps    m1,     [src0q + lenq + 1*mmsize]
    movaps    m2,     [src0q + lenq + 2*mmsize]
    movaps    m3,     [src0q + lenq + 3*mmsize]
    mulpd     m0, m0, [src1q + lenq + 0*mmsize]
    mulpd     m1, m1, [src1q + lenq + 1*mmsize]
    mulpd     m2, m2, [src1q + lenq + 2*mmsize]
    mulpd     m3, m3, [src1q + lenq + 3*mmsize]
    movaps    [dstq + lenq + 0*mmsize], m0
    movaps    [dstq + lenq + 1*mmsize], m1
    movaps    [dstq + lenq + 2*mmsize], m2
    movaps    [dstq + lenq + 3*mmsize], m3

    sub       lenq, mmsize*4
    jge       .loop
    RET
%endmacro

INIT_XMM sse2
VECTOR_DMUL
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMUL
%endif

;------------------------------------------------------------------------------
; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len)
;
; dst[i] += src[i] * mul
;
; 64 bytes (16 floats) are processed per iteration, from the end of the
; buffers towards index 0.
; NOTE(review): no remainder handling - len presumably has to be a positive
; multiple of 16; confirm against the callers.
;------------------------------------------------------------------------------

%macro VECTOR_FMAC_SCALAR 0
; On UNIX64 the scalar is not among the declared (general-purpose) arguments;
; the code below takes it straight from xm0.
%if UNIX64
cglobal vector_fmac_scalar, 3,3,5, dst, src, len
%else
cglobal vector_fmac_scalar, 4,4,5, dst, src, mul, len
%endif
%if ARCH_X86_32
    ; scalar is read from its argument slot and broadcast to all lanes
    VBROADCASTSS m0, mulm
%else
%if WIN64
    ; make the register numbered 2 answer to the name m0
    SWAP 0, 2
%endif
    ; replicate lane 0 across the low 128 bits ...
    shufps      xm0, xm0, 0
%if cpuflag(avx)
    ; ... and copy the low 128 bits into the high half of the YMM register
    vinsertf128  m0, m0, xm0, 1
%endif
%endif
    lea    lenq, [lend*4-64]
.loop:
%if cpuflag(fma3)
    ; Only m1/m2 are produced here, which covers 64 bytes only when
    ; mmsize == 32 (the macro is instantiated with INIT_YMM fma3 below).
    mova     m1,     [dstq+lenq]
    mova     m2,     [dstq+lenq+1*mmsize]
    fmaddps  m1, m0, [srcq+lenq], m1
    fmaddps  m2, m0, [srcq+lenq+1*mmsize], m2
%else ; cpuflag
    mulps    m1, m0, [srcq+lenq]
    mulps    m2, m0, [srcq+lenq+1*mmsize]
%if mmsize < 32
    mulps    m3, m0, [srcq+lenq+2*mmsize]
    mulps    m4, m0, [srcq+lenq+3*mmsize]
%endif ; mmsize
    addps    m1, m1, [dstq+lenq]
    addps    m2, m2, [dstq+lenq+1*mmsize]
%if mmsize < 32
    addps    m3, m3, [dstq+lenq+2*mmsize]
    addps    m4, m4, [dstq+lenq+3*mmsize]
%endif ; mmsize
%endif ; cpuflag
    mova  [dstq+lenq], m1
    mova  [dstq+lenq+1*mmsize], m2
%if mmsize < 32
    mova  [dstq+lenq+2*mmsize], m3
    mova  [dstq+lenq+3*mmsize], m4
%endif ; mmsize
    sub    lenq, 64
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMAC_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMAC_SCALAR
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_FMAC_SCALAR
%endif

;------------------------------------------------------------------------------
; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len)
;
; dst[i] = src[i] * mul
;
; One vector (mmsize bytes) per iteration, from the end towards index 0.
; NOTE(review): no remainder handling - len presumably has to be a positive
; multiple of mmsize/4 floats; confirm against the callers.
;------------------------------------------------------------------------------

%macro VECTOR_FMUL_SCALAR 0
%if UNIX64
cglobal vector_fmul_scalar, 3,3,2, dst, src, len
%else
cglobal vector_fmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    movss    m0, mulm
%elif WIN64
    SWAP 0, 2
%endif
    ; broadcast the scalar from lane 0 to all four lanes
    shufps   m0, m0, 0
    lea    lenq, [lend*4-mmsize]
.loop:
    mova     m1, [srcq+lenq]
    mulps    m1, m0
    mova  [dstq+lenq], m1
    sub    lenq, mmsize
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_SCALAR

;------------------------------------------------------------------------------
; void ff_vector_dmac_scalar(double *dst, const double *src, double mul,
;                            int len)
;
; dst[i] += src[i] * mul
;
; Four vectors (4*mmsize bytes) per iteration, from the end towards index 0.
; NOTE(review): no remainder handling - confirm the len granularity the
; callers guarantee.
;------------------------------------------------------------------------------

%macro VECTOR_DMAC_SCALAR 0
%if ARCH_X86_32
; Five argument names are declared here and len is fetched from the fifth
; slot (lenaddrm) - presumably because the 8-byte double occupies two 4-byte
; stack slots; verify against the x86-32 calling convention.
cglobal vector_dmac_scalar, 2,4,5, dst, src, mul, len, lenaddr
    mov          lenq, lenaddrm
    VBROADCASTSD m0, mulm
%else
%if UNIX64
cglobal vector_dmac_scalar, 3,3,5, dst, src, len
%else
cglobal vector_dmac_scalar, 4,4,5, dst, src, mul, len
    SWAP 0, 2
%endif
    ; duplicate the low double into the high half of the XMM register
    movlhps     xm0, xm0
%if cpuflag(avx)
    ; and mirror the low 128 bits into the upper half of the YMM register
    vinsertf128  m0, m0, xm0, 1
%endif
%endif
    lea    lenq, [lend*8-mmsize*4]
.loop:
%if cpuflag(fma3)
    movaps   m1,     [dstq+lenq]
    movaps   m2,     [dstq+lenq+1*mmsize]
    movaps   m3,     [dstq+lenq+2*mmsize]
    movaps   m4,     [dstq+lenq+3*mmsize]
    fmaddpd  m1, m0, [srcq+lenq], m1
    fmaddpd  m2, m0, [srcq+lenq+1*mmsize], m2
    fmaddpd  m3, m0, [srcq+lenq+2*mmsize], m3
    fmaddpd  m4, m0, [srcq+lenq+3*mmsize], m4
%else ; cpuflag
    mulpd    m1, m0, [srcq+lenq]
    mulpd    m2, m0, [srcq+lenq+1*mmsize]
    mulpd    m3, m0, [srcq+lenq+2*mmsize]
    mulpd    m4, m0, [srcq+lenq+3*mmsize]
    addpd    m1, m1, [dstq+lenq]
    addpd    m2, m2, [dstq+lenq+1*mmsize]
    addpd    m3, m3, [dstq+lenq+2*mmsize]
    addpd    m4, m4, [dstq+lenq+3*mmsize]
%endif ; cpuflag
    movaps [dstq+lenq], m1
    movaps [dstq+lenq+1*mmsize], m2
    movaps [dstq+lenq+2*mmsize], m3
    movaps [dstq+lenq+3*mmsize], m4
    sub    lenq, mmsize*4
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse2
VECTOR_DMAC_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMAC_SCALAR
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_DMAC_SCALAR
%endif

;------------------------------------------------------------------------------
; void ff_vector_dmul_scalar(double *dst, const double *src, double mul,
;                            int len)
;
; dst[i] = src[i] * mul
;
; Two vectors (2*mmsize bytes) per iteration, from the end towards index 0.
; NOTE(review): no remainder handling - confirm the len granularity the
; callers guarantee.
;------------------------------------------------------------------------------

%macro VECTOR_DMUL_SCALAR 0
%if ARCH_X86_32
; len is fetched from the fifth argument slot (lenaddrm); see the matching
; note on the 8-byte double in vector_dmac_scalar - verify against the ABI.
cglobal vector_dmul_scalar, 3,4,3, dst, src, mul, len, lenaddr
    mov          lenq, lenaddrm
%elif UNIX64
cglobal vector_dmul_scalar, 3,3,3, dst, src, len
%else
cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    VBROADCASTSD   m0, mulm
%else
%if WIN64
    SWAP 0, 2
%endif
    ; duplicate the low double into the high half of the XMM register
    movlhps       xm0, xm0
%if cpuflag(avx)
    vinsertf128   ym0, ym0, xm0, 1
%endif
%endif
    lea          lenq, [lend*8-2*mmsize]
.loop:
    mulpd          m1, m0, [srcq+lenq       ]
    mulpd          m2, m0, [srcq+lenq+mmsize]
    movaps [dstq+lenq       ], m1
    movaps [dstq+lenq+mmsize], m2
    sub          lenq, 2*mmsize
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse2
VECTOR_DMUL_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMUL_SCALAR
%endif

;-----------------------------------------------------------------------------
; vector_fmul_window(float *dst, const float *src0,
;                    const float *src1, const float *win, int len);
;-----------------------------------------------------------------------------
; With i running over 0..len-1 and j = 2*len-1-i, the loop below computes
;     dst[i] = src0[i] * win[j] - src1[len-1-i] * win[i]
;     dst[j] = src0[i] * win[i] + src1[len-1-i] * win[j]
; so dst and win are accessed over 2*len floats, src0 and src1 over len floats.
; lenq walks the lower half upwards as a negative byte offset from the middle,
; len1q walks the upper half (and src1) downwards.
; NOTE(review): one XMM vector per step and no remainder handling, so len
; presumably has to be a positive multiple of 4 - confirm against the callers.
INIT_XMM sse
cglobal vector_fmul_window, 5, 6, 6, dst, src0, src1, win, len, len1
    shl     lend, 2                 ; len in bytes
    lea    len1q, [lenq - mmsize]   ; offset of the last vector of a half
    add    src0q, lenq              ; point src0/dst/win at their midpoints ...
    add     dstq, lenq
    add     winq, lenq
    neg     lenq                    ; ... and index the lower half with -len..0
.loop:
    mova      m0, [winq  + lenq]
    mova      m4, [src0q + lenq]
    mova      m1, [winq  + len1q]
    mova      m5, [src1q + len1q]
    ; 0x1b reverses the four floats so the mirrored operands line up with m0/m4
    shufps    m1, m1, 0x1b
    shufps    m5, m5, 0x1b
    mova      m2, m0
    mova      m3, m1
    mulps     m2, m4                ; win[i] * src0
    mulps     m3, m5                ; win[j] * src1
    mulps     m1, m4                ; win[j] * src0
    mulps     m0, m5                ; win[i] * src1
    addps     m2, m3                ; upper-half result (still in i order)
    subps     m1, m0                ; lower-half result
    shufps    m2, m2, 0x1b          ; back to memory order for the upper half
    mova      [dstq + lenq], m1
    mova      [dstq + len1q], m2
    sub       len1q, mmsize
    add       lenq,  mmsize
    jl .loop
    REP_RET

;-----------------------------------------------------------------------------
; vector_fmul_add(float *dst, const float *src0, const float *src1,
;                 const float *src2, int len)
;-----------------------------------------------------------------------------
; dst[i] = src0[i] * src1[i] + src2[i]
; Two vectors (2*mmsize bytes) are processed per iteration, starting with the
; last chunk and walking towards index 0.
; NOTE(review): no remainder handling - len presumably has to be a positive
; multiple of 2*mmsize/4 floats; confirm against the callers.
%macro VECTOR_FMUL_ADD 0
cglobal vector_fmul_add, 5,5,4, dst, src0, src1, src2, len
    lea       lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
    mova    m0,   [src0q + lenq]
    mova    m1,   [src0q + lenq + mmsize]
%if cpuflag(fma3)
    ; fused path: the addend is loaded first, then m0 = m0 * src1 + m2
    mova    m2,     [src2q + lenq]
    mova    m3,     [src2q + lenq + mmsize]
    fmaddps m0, m0, [src1q + lenq], m2
    fmaddps m1, m1, [src1q + lenq + mmsize], m3
%else
    mulps   m0, m0, [src1q + lenq]
    mulps   m1, m1, [src1q + lenq + mmsize]
    addps   m0, m0, [src2q + lenq]
    addps   m1, m1, [src2q + lenq + mmsize]
%endif
    mova    [dstq + lenq], m0
    mova    [dstq + lenq + mmsize], m1

    sub     lenq,   2*mmsize
    jge     .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_ADD
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_ADD
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_FMUL_ADD
%endif

;-----------------------------------------------------------------------------
; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
;                          int len)
;-----------------------------------------------------------------------------
; dst[i] = src0[i] * src1[len-1-i]
; src1 is read forwards (the pointer advances by 2*mmsize per iteration) and
; each loaded vector is reversed; src0 and dst are walked from the end
; towards index 0 via lenq.
; NOTE(review): two vectors per step and no remainder handling - len
; presumably has to be a positive multiple of 2*mmsize/4 floats; confirm
; against the callers.
%macro VECTOR_FMUL_REVERSE 0
; Three vector registers are declared because the AVX2 path keeps the
; permutation indices in m2 in addition to m0/m1.
cglobal vector_fmul_reverse, 4,4,3, dst, src0, src1, len
%if cpuflag(avx2)
    movaps  m2, [pd_reverse]
%endif
    lea       lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
%if cpuflag(avx2)
    ; full 8-float reversal with a single cross-lane permute
    vpermps m0, m2, [src1q]
    vpermps m1, m2, [src1q+mmsize]
%elif cpuflag(avx)
    ; swap the two 128-bit halves while loading, then reverse inside each lane
    ; (q0123 is a shuffle constant supplied by the included macro files)
    vmovaps     xmm0, [src1q + 16]
    vinsertf128 m0, m0, [src1q], 1
    vshufps     m0, m0, m0, q0123
    vmovaps     xmm1, [src1q + mmsize + 16]
    vinsertf128 m1, m1, [src1q + mmsize], 1
    vshufps     m1, m1, m1, q0123
%else
    mova    m0, [src1q]
    mova    m1, [src1q + mmsize]
    shufps  m0, m0, q0123
    shufps  m1, m1, q0123
%endif
    ; the first (lowest) src1 vector pairs with the highest src0/dst vector
    mulps   m0, m0, [src0q + lenq + mmsize]
    mulps   m1, m1, [src0q + lenq]
    movaps  [dstq + lenq + mmsize], m0
    movaps  [dstq + lenq], m1
    add     src1q, 2*mmsize
    sub     lenq,  2*mmsize
    jge     .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_REVERSE
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_REVERSE
%endif
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
VECTOR_FMUL_REVERSE
%endif

; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
;
; Returns the sum over i of v1[i] * v2[i]. Four partial sums are accumulated
; in xmm0 and reduced to a single float after the loop. On x86-32 the result
; is additionally pushed onto the x87 stack via a store/fld round trip.
; NOTE(review): 16 bytes per step and no remainder handling - len presumably
; has to be a positive multiple of 4; confirm against the callers.
INIT_XMM sse
cglobal scalarproduct_float, 3,3,2, v1, v2, offset
    shl   offsetd, 2                ; length in bytes
    add       v1q, offsetq          ; point both inputs at their ends ...
    add       v2q, offsetq
    neg   offsetq                   ; ... and count a negative offset up to 0
    xorps    xmm0, xmm0             ; accumulator = 0
.loop:
    movaps   xmm1, [v1q+offsetq]
    mulps    xmm1, [v2q+offsetq]
    addps    xmm0, xmm1
    add   offsetq, 16
    js .loop                        ; continue while the offset is negative
    ; horizontal reduction of the four partial sums
    movhlps  xmm1, xmm0             ; upper pair -> lower pair of xmm1
    addps    xmm0, xmm1             ; lanes 0/1 = (0+2), (1+3)
    movss    xmm1, xmm0             ; keep lane 0
    shufps   xmm0, xmm0, 1          ; bring lane 1 down to lane 0
    addss    xmm0, xmm1             ; total in lane 0 of xmm0
%if ARCH_X86_64 == 0
    movss     r0m,  xmm0
    fld dword r0m
%endif
    RET

;-----------------------------------------------------------------------------
; void ff_butterflies_float(float *src0, float *src1, int len);
;
; In place, for every i:
;     src0[i] = src0[i] + src1[i]
;     src1[i] = src0[i] - src1[i]      (using the values read before the update)
; NOTE(review): one vector per step and no remainder handling - len
; presumably has to be a positive multiple of 4; confirm against the callers.
;-----------------------------------------------------------------------------
INIT_XMM sse
cglobal butterflies_float, 3,3,3, src0, src1, len
    shl       lend, 2               ; length in bytes
    add      src0q, lenq            ; point both buffers at their ends ...
    add      src1q, lenq
    neg       lenq                  ; ... and count a negative offset up to 0
.loop:
    mova        m0, [src0q + lenq]
    mova        m1, [src1q + lenq]
    subps       m2, m0, m1          ; difference goes to src1
    addps       m0, m0, m1          ; sum goes to src0
    mova        [src1q + lenq], m2
    mova        [src0q + lenq], m0
    add       lenq, mmsize
    jl .loop
    REP_RET