;*****************************************************************************
;* x86-optimized Float DSP functions
;*
;* Copyright 2006 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32
pd_reverse: dd 7, 6, 5, 4, 3, 2, 1, 0

SECTION .text

;-----------------------------------------------------------------------------
; void vector_fmul(float *dst, const float *src0, const float *src1, int len)
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL 0
cglobal vector_fmul, 4,4,2, dst, src0, src1, len
    lea       lenq, [lend*4 - 64]
ALIGN 16
.loop:
%assign a 0
%rep 32/mmsize
    mova      m0,     [src0q + lenq + (a+0)*mmsize]
    mova      m1,     [src0q + lenq + (a+1)*mmsize]
    mulps     m0, m0, [src1q + lenq + (a+0)*mmsize]
    mulps     m1, m1, [src1q + lenq + (a+1)*mmsize]
    mova      [dstq + lenq + (a+0)*mmsize], m0
    mova      [dstq + lenq + (a+1)*mmsize], m1
%assign a a+2
%endrep

    sub       lenq, 64
    jge       .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL
%endif

;-----------------------------------------------------------------------------
; void vector_dmul(double *dst, const double *src0, const double *src1, int len)
;-----------------------------------------------------------------------------
%macro VECTOR_DMUL 0
cglobal vector_dmul, 4,4,4, dst, src0, src1, len
    lea       lend, [lenq*8 - mmsize*4]
ALIGN 16
.loop:
    movaps    m0,     [src0q + lenq + 0*mmsize]
    movaps    m1,     [src0q + lenq + 1*mmsize]
    movaps    m2,     [src0q + lenq + 2*mmsize]
    movaps    m3,     [src0q + lenq + 3*mmsize]
    mulpd     m0, m0, [src1q + lenq + 0*mmsize]
    mulpd     m1, m1, [src1q + lenq + 1*mmsize]
    mulpd     m2, m2, [src1q + lenq + 2*mmsize]
    mulpd     m3, m3, [src1q + lenq + 3*mmsize]
    movaps    [dstq + lenq + 0*mmsize], m0
    movaps    [dstq + lenq + 1*mmsize], m1
    movaps    [dstq + lenq + 2*mmsize], m2
    movaps    [dstq + lenq + 3*mmsize], m3

    sub       lenq, mmsize*4
    jge       .loop
    RET
%endmacro

INIT_XMM sse2
VECTOR_DMUL
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMUL
%endif

;------------------------------------------------------------------------------
; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------

%macro VECTOR_FMAC_SCALAR 0
%if UNIX64
cglobal vector_fmac_scalar, 3,3,5, dst, src, len
%else
cglobal vector_fmac_scalar, 4,4,5, dst, src, mul, len
%endif
%if ARCH_X86_32
    VBROADCASTSS m0, mulm
%else
%if WIN64
    SWAP 0, 2
%endif
    shufps      xm0, xm0, 0
%if cpuflag(avx)
    vinsertf128  m0, m0, xm0, 1
%endif
%endif
    lea    lenq, [lend*4-64]
.loop:
%if cpuflag(fma3)
    mova     m1,     [dstq+lenq]
    mova     m2,     [dstq+lenq+1*mmsize]
    fmaddps  m1, m0, [srcq+lenq], m1
    fmaddps  m2, m0, [srcq+lenq+1*mmsize], m2
%else ; cpuflag
    mulps    m1, m0, [srcq+lenq]
    mulps    m2, m0, [srcq+lenq+1*mmsize]
%if mmsize < 32
    mulps    m3, m0, [srcq+lenq+2*mmsize]
    mulps    m4, m0, [srcq+lenq+3*mmsize]
%endif ; mmsize
    addps    m1, m1, [dstq+lenq]
    addps    m2, m2, [dstq+lenq+1*mmsize]
%if mmsize < 32
    addps    m3, m3, [dstq+lenq+2*mmsize]
    addps    m4, m4, [dstq+lenq+3*mmsize]
%endif ; mmsize
%endif ; cpuflag
    mova  [dstq+lenq],          m1
    mova  [dstq+lenq+1*mmsize], m2
%if mmsize < 32
    mova  [dstq+lenq+2*mmsize], m3
    mova  [dstq+lenq+3*mmsize], m4
%endif ; mmsize
    sub      lenq, 64
    jge      .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMAC_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMAC_SCALAR
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_FMAC_SCALAR
%endif

;------------------------------------------------------------------------------
; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------

%macro VECTOR_FMUL_SCALAR 0
%if UNIX64
cglobal vector_fmul_scalar, 3,3,2, dst, src, len
%else
cglobal vector_fmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    movss    m0, mulm
%elif WIN64
    SWAP 0, 2
%endif
    shufps   m0, m0, 0
    lea    lenq, [lend*4-mmsize]
.loop:
    mova     m1, [srcq+lenq]
    mulps    m1, m0
    mova  [dstq+lenq], m1
    sub    lenq, mmsize
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_SCALAR

;------------------------------------------------------------------------------
; void ff_vector_dmac_scalar(double *dst, const double *src, double mul,
;                            int len)
;------------------------------------------------------------------------------

%macro VECTOR_DMAC_SCALAR 0
%if ARCH_X86_32
cglobal vector_dmac_scalar, 2,4,5, dst, src, mul, len, lenaddr
    mov          lenq, lenaddrm
    VBROADCASTSD m0, mulm
%else
%if UNIX64
cglobal vector_dmac_scalar, 3,3,5, dst, src, len
%else
cglobal vector_dmac_scalar, 4,4,5, dst, src, mul, len
    SWAP 0, 2
%endif
    movlhps     xm0, xm0
%if cpuflag(avx)
    vinsertf128  m0, m0, xm0, 1
%endif
%endif
    lea    lenq, [lend*8-mmsize*4]
.loop:
%if cpuflag(fma3)
    movaps   m1,     [dstq+lenq]
    movaps   m2,     [dstq+lenq+1*mmsize]
    movaps   m3,     [dstq+lenq+2*mmsize]
    movaps   m4,     [dstq+lenq+3*mmsize]
    fmaddpd  m1, m0, [srcq+lenq], m1
    fmaddpd  m2, m0, [srcq+lenq+1*mmsize], m2
    fmaddpd  m3, m0, [srcq+lenq+2*mmsize], m3
    fmaddpd  m4, m0, [srcq+lenq+3*mmsize], m4
%else ; cpuflag
    mulpd    m1, m0, [srcq+lenq]
    mulpd    m2, m0, [srcq+lenq+1*mmsize]
    mulpd    m3, m0, [srcq+lenq+2*mmsize]
    mulpd    m4, m0, [srcq+lenq+3*mmsize]
    addpd    m1, m1, [dstq+lenq]
    addpd    m2, m2, [dstq+lenq+1*mmsize]
    addpd    m3, m3, [dstq+lenq+2*mmsize]
    addpd    m4, m4, [dstq+lenq+3*mmsize]
%endif ; cpuflag
    movaps  [dstq+lenq],          m1
    movaps  [dstq+lenq+1*mmsize], m2
    movaps  [dstq+lenq+2*mmsize], m3
    movaps  [dstq+lenq+3*mmsize], m4
    sub      lenq, mmsize*4
    jge      .loop
    REP_RET
%endmacro

INIT_XMM sse2
VECTOR_DMAC_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMAC_SCALAR
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_DMAC_SCALAR
%endif

;------------------------------------------------------------------------------
; void ff_vector_dmul_scalar(double *dst, const double *src, double mul,
;                            int len)
;------------------------------------------------------------------------------

%macro VECTOR_DMUL_SCALAR 0
%if ARCH_X86_32
cglobal vector_dmul_scalar, 3,4,3, dst, src, mul, len, lenaddr
    mov          lenq, lenaddrm
%elif UNIX64
cglobal vector_dmul_scalar, 3,3,3, dst, src, len
%else
cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    VBROADCASTSD   m0, mulm
%else
%if WIN64
    SWAP 0, 2
%endif
    movlhps       xm0, xm0
%if cpuflag(avx)
    vinsertf128   ym0, ym0, xm0, 1
%endif
%endif
    lea          lenq, [lend*8-2*mmsize]
.loop:
    mulpd          m1, m0, [srcq+lenq       ]
    mulpd          m2, m0, [srcq+lenq+mmsize]
    movaps [dstq+lenq       ], m1
    movaps [dstq+lenq+mmsize], m2
    sub          lenq, 2*mmsize
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse2
VECTOR_DMUL_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMUL_SCALAR
%endif

;-----------------------------------------------------------------------------
; vector_fmul_window(float *dst, const float *src0,
;                    const float *src1, const float *win, int len);
;-----------------------------------------------------------------------------
INIT_XMM sse
cglobal vector_fmul_window, 5, 6, 6, dst, src0, src1, win, len, len1
    shl     lend, 2
    lea    len1q, [lenq - mmsize]
    add    src0q, lenq
    add     dstq, lenq
    add     winq, lenq
    neg     lenq
.loop:
    mova      m0, [winq  + lenq]
    mova      m4, [src0q + lenq]
    mova      m1, [winq  + len1q]
    mova      m5, [src1q + len1q]
    shufps    m1, m1, 0x1b
    shufps    m5, m5, 0x1b
    mova      m2, m0
    mova      m3, m1
    mulps     m2, m4
    mulps     m3, m5
    mulps     m1, m4
    mulps     m0, m5
    addps     m2, m3
    subps     m1, m0
    shufps    m2, m2, 0x1b
    mova      [dstq + lenq],  m1
    mova      [dstq + len1q], m2
    sub    len1q, mmsize
    add     lenq, mmsize
    jl .loop
    REP_RET

;-----------------------------------------------------------------------------
; vector_fmul_add(float *dst, const float *src0, const float *src1,
;                 const float *src2, int len)
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL_ADD 0
cglobal vector_fmul_add, 5,5,4, dst, src0, src1, src2, len
    lea       lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
    mova    m0,     [src0q + lenq]
    mova    m1,     [src0q + lenq + mmsize]
%if cpuflag(fma3)
    mova    m2,     [src2q + lenq]
    mova    m3,     [src2q + lenq + mmsize]
    fmaddps m0, m0, [src1q + lenq], m2
    fmaddps m1, m1, [src1q + lenq + mmsize], m3
%else
    mulps   m0, m0, [src1q + lenq]
    mulps   m1, m1, [src1q + lenq + mmsize]
    addps   m0, m0, [src2q + lenq]
    addps   m1, m1, [src2q + lenq + mmsize]
%endif
    mova    [dstq + lenq], m0
    mova    [dstq + lenq + mmsize], m1

    sub     lenq, 2*mmsize
    jge     .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_ADD
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_ADD
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_FMUL_ADD
%endif

;-----------------------------------------------------------------------------
; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
;                          int len)
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL_REVERSE 0
cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
%if cpuflag(avx2)
    movaps  m2, [pd_reverse]
%endif
    lea       lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
%if cpuflag(avx2)
    vpermps m0, m2, [src1q]
    vpermps m1, m2, [src1q+mmsize]
%elif cpuflag(avx)
    vmovaps     xmm0, [src1q + 16]
    vinsertf128 m0, m0, [src1q], 1
    vshufps     m0, m0, m0, q0123
    vmovaps     xmm1, [src1q + mmsize + 16]
    vinsertf128 m1, m1, [src1q + mmsize], 1
    vshufps     m1, m1, m1, q0123
%else
    mova    m0, [src1q]
    mova    m1, [src1q + mmsize]
    shufps  m0, m0, q0123
    shufps  m1, m1, q0123
%endif
    mulps   m0, m0, [src0q + lenq + mmsize]
    mulps   m1, m1, [src0q + lenq]
    movaps  [dstq + lenq + mmsize], m0
    movaps  [dstq + lenq], m1
    add     src1q, 2*mmsize
    sub     lenq,  2*mmsize
    jge     .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_REVERSE
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_REVERSE
%endif
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
VECTOR_FMUL_REVERSE
%endif

; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
INIT_XMM sse
cglobal scalarproduct_float, 3,3,2, v1, v2, offset
    shl   offsetd, 2
    add       v1q, offsetq
    add       v2q, offsetq
    neg   offsetq
    xorps    xmm0, xmm0
.loop:
    movaps   xmm1, [v1q+offsetq]
    mulps    xmm1, [v2q+offsetq]
    addps    xmm0, xmm1
    add   offsetq, 16
    js .loop
    movhlps  xmm1, xmm0
    addps    xmm0, xmm1
    movss    xmm1, xmm0
    shufps   xmm0, xmm0, 1
    addss    xmm0, xmm1
%if ARCH_X86_64 == 0
    movss     r0m,  xmm0
    fld dword r0m
%endif
    RET

;-----------------------------------------------------------------------------
; void ff_butterflies_float(float *src0, float *src1, int len);
;-----------------------------------------------------------------------------
INIT_XMM sse
cglobal butterflies_float, 3,3,3, src0, src1, len
    shl       lend, 2
    add      src0q, lenq
    add      src1q, lenq
    neg       lenq
.loop:
    mova        m0, [src0q + lenq]
    mova        m1, [src1q + lenq]
    subps       m2, m0, m1
    addps       m0, m0, m1
    mova [src1q + lenq], m2
    mova [src0q + lenq], m0
    add       lenq, mmsize
    jl .loop
    REP_RET