;******************************************************************************
;* optimized audio functions
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

;-----------------------------------------------------------------------------
; int ff_scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
;
; Multiplies the int16 vectors v1 and v2 element-wise (pmaddwd), accumulates
; the products in 32-bit lanes of m2 (paddd, wrapping) and returns the sum of
; those lanes in eax.
;
; v1 is read with unaligned loads (movu); v2 is consumed directly as the
; memory operand of pmaddwd.
; NOTE(review): with the non-VEX SSE2 encoding that memory operand must be
; 16-byte aligned -- confirm callers guarantee this for v2.
;
; The loop body executes before the counter is tested and consumes 2*mmsize
; bytes of each vector per iteration, so order is assumed to be a positive
; multiple of mmsize elements; nothing here checks that.
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal scalarproduct_int16, 3,3,3, v1, v2, order
    add     orderd, orderd                  ; element count -> byte count (2 bytes per int16)
    add     v1q, orderq                     ; move both pointers to the end of their data ...
    add     v2q, orderq
    neg     orderq                          ; ... and index with a negative offset rising to 0
    pxor    m2, m2                          ; m2 = dword accumulators, start at zero
.loop:
    movu    m0, [v1q + orderq]
    movu    m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]              ; pairwise products summed into dwords
    pmaddwd m1, [v2q + orderq + mmsize]
    paddd   m2, m0
    paddd   m2, m1
    add     orderq, mmsize*2
    jl .loop                                ; continue while the offset is still negative
    HADDD   m2, m0                          ; presumably folds the dword lanes of m2 into its
                                            ; low lane, m0 as scratch (macro from x86util.asm)
    movd    eax, m2                         ; return value
    RET


;-----------------------------------------------------------------------------
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
;                           int32_t max, unsigned int len)
;-----------------------------------------------------------------------------

; %1 = number of xmm registers used
; %2 = number of inline load/process/store loops per asm loop
; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
; %4 = CLIPD function takes min/max as float instead of int (SSE2 version)
; %5 = suffix
;
; Emits one vector_clip_int32 variant. min and max are loaded into m4/m5
; (converted to float with cvtsi2ss when %4 is set, copied as raw dwords with
; movd otherwise) and passed through SPLATD; each source register is then run
; through CLIPD with m4/m5 as bounds and m6 as the fourth argument, and stored
; to dst. SPLATD and CLIPD are not defined in this file (presumably they come
; from the included x86util.asm).
;
; Each %rep pass handles 4 registers, or 8 when %3 is set (the extra four are
; m7-m10, hence the %ifdef m8 guard on the only instantiation that sets %3).
; One trip through .loop therefore covers %2*(1+%3) groups of 4 registers,
; i.e. mmsize*4*%2*(1+%3) bytes = mmsize*%2*(1+%3) int32 elements, and the
; pointer/length updates below use exactly that amount.
;
; All loads and stores use mova and the loop body runs before len is tested.
; NOTE(review): this assumes src/dst are mmsize-aligned and len is a positive
; multiple of the per-iteration element count -- confirm against callers.
%macro VECTOR_CLIP_INT32 4-5
cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
%if %4
    cvtsi2ss  m4, minm
    cvtsi2ss  m5, maxm
%else
    movd      m4, minm
    movd      m5, maxm
%endif
    SPLATD    m4
    SPLATD    m5
.loop:
%assign %%i 0
%rep %2
    mova      m0,  [srcq + mmsize * (0 + %%i)]
    mova      m1,  [srcq + mmsize * (1 + %%i)]
    mova      m2,  [srcq + mmsize * (2 + %%i)]
    mova      m3,  [srcq + mmsize * (3 + %%i)]
%if %3
    mova      m7,  [srcq + mmsize * (4 + %%i)]
    mova      m8,  [srcq + mmsize * (5 + %%i)]
    mova      m9,  [srcq + mmsize * (6 + %%i)]
    mova      m10, [srcq + mmsize * (7 + %%i)]
%endif
    CLIPD     m0,  m4, m5, m6
    CLIPD     m1,  m4, m5, m6
    CLIPD     m2,  m4, m5, m6
    CLIPD     m3,  m4, m5, m6
%if %3
    CLIPD     m7,  m4, m5, m6
    CLIPD     m8,  m4, m5, m6
    CLIPD     m9,  m4, m5, m6
    CLIPD     m10, m4, m5, m6
%endif
    mova      [dstq + mmsize * (0 + %%i)], m0
    mova      [dstq + mmsize * (1 + %%i)], m1
    mova      [dstq + mmsize * (2 + %%i)], m2
    mova      [dstq + mmsize * (3 + %%i)], m3
%if %3
    mova      [dstq + mmsize * (4 + %%i)], m7
    mova      [dstq + mmsize * (5 + %%i)], m8
    mova      [dstq + mmsize * (6 + %%i)], m9
    mova      [dstq + mmsize * (7 + %%i)], m10
%endif
%assign %%i (%%i + 4 * (1 + %3))
%endrep
    ; Advance by what the unrolled body consumed. %2*(1+%3) equals the former
    ; (%2+%3) for every instantiation below, and stays correct for other
    ; parameter combinations as well.
    add       srcq, mmsize*4*(%2*(1+%3))
    add       dstq, mmsize*4*(%2*(1+%3))
    sub       lend, mmsize*(%2*(1+%3))
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
VECTOR_CLIP_INT32 6, 1, 0, 0, _int
VECTOR_CLIP_INT32 6, 2, 0, 1
INIT_XMM sse4
%ifdef m8
VECTOR_CLIP_INT32 11, 1, 1, 0
%else
VECTOR_CLIP_INT32 6, 1, 0, 0
%endif

;-----------------------------------------------------------------------------
; void ff_vector_clipf_sse(float *dst, const float *src,
;                          int len, float min, float max)
;
; Clamps each float of src to [min, max] (maxps against min, then minps
; against max) and stores the result to dst.
;
; Where min/max are read from depends on the build target, as selected below:
;   x86-32      both from their memory operands (minm/maxm)
;   WIN64       min from m3 (swapped into m0), max from its memory operand
;   64-bit sysv min already in m0, max already in m1
;
; The buffers are walked from the end towards the start: every iteration
; handles the 4*mmsize bytes (mmsize floats) that end at element index len,
; then len drops by mmsize. Loads/stores use mova and the body runs before
; len is tested.
; NOTE(review): this assumes src/dst are mmsize-aligned and len is a positive
; multiple of mmsize floats -- confirm against callers.
;-----------------------------------------------------------------------------
INIT_XMM sse
cglobal vector_clipf, 3, 3, 6, dst, src, len, min, max
%if ARCH_X86_32
    VBROADCASTSS m0, minm
    VBROADCASTSS m1, maxm
%elif WIN64
    SWAP 0, 3
    VBROADCASTSS m0, m0
    VBROADCASTSS m1, maxm
%else ; 64bit sysv
    VBROADCASTSS m0, m0
    VBROADCASTSS m1, m1
%endif

    movsxdifnidn lenq, lend

.loop:
    mova    m2, [srcq + 4 * lenq - 4 * mmsize]
    mova    m3, [srcq + 4 * lenq - 3 * mmsize]
    mova    m4, [srcq + 4 * lenq - 2 * mmsize]
    mova    m5, [srcq + 4 * lenq - 1 * mmsize]

    maxps   m2, m0                          ; raise to at least min
    maxps   m3, m0
    maxps   m4, m0
    maxps   m5, m0

    minps   m2, m1                          ; lower to at most max
    minps   m3, m1
    minps   m4, m1
    minps   m5, m1

    mova    [dstq + 4 * lenq - 4 * mmsize], m2
    mova    [dstq + 4 * lenq - 3 * mmsize], m3
    mova    [dstq + 4 * lenq - 2 * mmsize], m4
    mova    [dstq + 4 * lenq - 1 * mmsize], m5

    sub     lenq, mmsize
    jg .loop

    RET