;*****************************************************************************
;* x86-optimized AC-3 DSP functions
;* Copyright (c) 2011 Justin Ruggles
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; Syntax: NASM with the x86inc/x86util macro layer (cglobal, INIT_XMM, mN
; register aliases, argument names with q/d size suffixes).

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; 16777216.0f - used in ff_float_to_fixed24()
pf_1_24: times 4 dd 0x4B800000

; used in ff_ac3_compute_mantissa_size()
cextern ac3_bap_bits
; pmulhuw keeps the high 16 bits of the unsigned product, so these factors
; scale each word by roughly 1/3, 1/3, 0 and 1/2 respectively.
pw_bap_mul1: dw 21846, 21846, 0, 32768, 21846, 21846, 0, 32768
; per-word weights applied (via pmaddwd) to the scaled counts above
pw_bap_mul2: dw     5,     7, 0,     7,     5,     7, 0,     7

; used in ff_ac3_extract_exponents()
cextern pd_1
pd_151: times 4 dd 151

SECTION .text

;-----------------------------------------------------------------------------
; void ff_ac3_exponent_min(uint8_t *exp, int num_reuse_blocks, int nb_coefs)
;
; For every group of mmsize bytes in exp[], replaces the bytes with the
; minimum taken over the rows at byte offsets 0, 256, ..., 256*num_reuse_blocks
; from that position (rows are 256 bytes apart). The result is written back
; to the row at offset 0.
;
; Returns immediately when (num_reuse_blocks << 8) is zero.
; Loads and stores use mova, so exp is accessed with aligned moves, and
; nb_coefs is consumed in steps of mmsize (the last step may cover more than
; nb_coefs bytes when nb_coefs is not a multiple of mmsize).
;
; NOTE(review): reuse_blks and expn are C ints but are used as full-width
; registers (reuse_blksq / expnq) without explicit extension - confirm the
; callers guarantee clean upper bits on 64-bit targets.
;-----------------------------------------------------------------------------

%macro AC3_EXPONENT_MIN 0
cglobal ac3_exponent_min, 3, 4, 2, exp, reuse_blks, expn, offset
    shl  reuse_blksq, 8                 ; reuse_blks *= 256 (byte offset of last row)
    jz .end                             ; nothing to merge
    LOOP_ALIGN
.nextexp:
    mov      offsetq, reuse_blksq
    mova          m0, [expq+offsetq]    ; start from the last row
    sub      offsetq, 256
    LOOP_ALIGN
.nextblk:
    ; m0 = min(m0, row at current offset); m1 is the scratch operand handed
    ; to the PMINUB helper from x86util.asm
    PMINUB        m0, [expq+offsetq], m1
    sub      offsetq, 256
    jae .nextblk                        ; loop until the subtraction borrows (offset < 0)
    mova      [expq], m0                ; store the minimum into row 0
    add         expq, mmsize
    sub        expnq, mmsize
    jg .nextexp                         ; signed: more coefficients remaining
.end:
    REP_RET
%endmacro

%define LOOP_ALIGN ALIGN 16
%if HAVE_SSE2_EXTERNAL
INIT_XMM sse2
AC3_EXPONENT_MIN
%endif
%undef LOOP_ALIGN

;-----------------------------------------------------------------------------
; void ff_float_to_fixed24(int32_t *dst, const float *src, unsigned int len)
;
; Multiplies each float in src by 16777216.0f (pf_1_24) and converts it to a
; 32-bit integer with cvtps2dq, storing the result in dst.
;
; All loads/stores are aligned (movaps/movdqa), so src and dst must be
; 16-byte aligned. Each iteration handles 32 floats when m8 exists and 16
; otherwise; there is no tail handling and the body always runs at least
; once, so len is expected to be a non-zero multiple of that step.
; NOTE(review): m8 is presumably only defined when more than 8 vector
; registers are available (x86-64) - confirm against x86inc.asm.
;-----------------------------------------------------------------------------

INIT_XMM sse2
cglobal float_to_fixed24, 3, 3, 9, dst, src, len
    movaps     m0, [pf_1_24]            ; m0 = 4 x 2^24, loop-invariant scale
.loop:
    movaps     m1, [srcq    ]
    movaps     m2, [srcq+16 ]
    movaps     m3, [srcq+32 ]
    movaps     m4, [srcq+48 ]
%ifdef m8
    movaps     m5, [srcq+64 ]
    movaps     m6, [srcq+80 ]
    movaps     m7, [srcq+96 ]
    movaps     m8, [srcq+112]
%endif
    mulps      m1, m0
    mulps      m2, m0
    mulps      m3, m0
    mulps      m4, m0
%ifdef m8
    mulps      m5, m0
    mulps      m6, m0
    mulps      m7, m0
    mulps      m8, m0
%endif
    cvtps2dq   m1, m1
    cvtps2dq   m2, m2
    cvtps2dq   m3, m3
    cvtps2dq   m4, m4
%ifdef m8
    cvtps2dq   m5, m5
    cvtps2dq   m6, m6
    cvtps2dq   m7, m7
    cvtps2dq   m8, m8
%endif
    movdqa  [dstq    ], m1
    movdqa  [dstq+16 ], m2
    movdqa  [dstq+32 ], m3
    movdqa  [dstq+48 ], m4
%ifdef m8
    movdqa  [dstq+64 ], m5
    movdqa  [dstq+80 ], m6
    movdqa  [dstq+96 ], m7
    movdqa  [dstq+112], m8
    add      srcq, 128
    add      dstq, 128
    sub      lenq, 32                   ; 8 vectors x 4 floats
%else
    add      srcq, 64
    add      dstq, 64
    sub      lenq, 16                   ; 4 vectors x 4 floats
%endif
    ja .loop                            ; unsigned: continue while len > 0 and no borrow
    REP_RET

;------------------------------------------------------------------------------
; int ff_ac3_compute_mantissa_size(uint16_t mant_cnt[6][16])
;
; Returns the sum of two partial results:
;   1. the 16 column totals over all 6 rows, each multiplied by the matching
;      entry of ac3_bap_bits[] (pmaddwd) and added together;
;   2. for columns 1..4 of every row, the count scaled by pw_bap_mul1
;      (high half of the unsigned product) and weighted by pw_bap_mul2.
; mant_cnt is read with movdqa/paddw memory operands, so it must be 16-byte
; aligned.
;------------------------------------------------------------------------------

; Horizontal add of the four dwords in %1; the total ends up in the low dword
; of %1. %2 is clobbered.
%macro PHADDD4 2 ; xmm src, xmm tmp
    movhlps  %2, %1                     ; tmp.lo64 = src.hi64
    paddd    %1, %2                     ; dwords 0,1 now hold d0+d2, d1+d3
    pshufd   %2, %1, 0x1                ; tmp.d0 = src.d1
    paddd    %1, %2                     ; src.d0 = d0+d1+d2+d3
%endmacro

INIT_XMM sse2
cglobal ac3_compute_mantissa_size, 1, 2, 4, mant_cnt, sum
    ; Each row is 32 bytes = two vectors. m0 accumulates columns 0-7 and
    ; m1 accumulates columns 8-15 across the six rows.
    movdqa      m0, [mant_cntq      ]
    movdqa      m1, [mant_cntq+ 1*16]
    paddw       m0, [mant_cntq+ 2*16]
    paddw       m1, [mant_cntq+ 3*16]
    paddw       m0, [mant_cntq+ 4*16]
    paddw       m1, [mant_cntq+ 5*16]
    paddw       m0, [mant_cntq+ 6*16]
    paddw       m1, [mant_cntq+ 7*16]
    paddw       m0, [mant_cntq+ 8*16]
    paddw       m1, [mant_cntq+ 9*16]
    paddw       m0, [mant_cntq+10*16]
    paddw       m1, [mant_cntq+11*16]
    pmaddwd     m0, [ac3_bap_bits   ]
    pmaddwd     m1, [ac3_bap_bits+16]
    paddd       m0, m1
    PHADDD4     m0, m1
    movd      sumd, m0                  ; first partial result
    movdqa      m3, [pw_bap_mul1]
    ; Gather columns 1..4 (8 bytes starting at byte offset 2) of each row,
    ; two rows per register.
    movhpd      m0, [mant_cntq     +2]
    movlpd      m0, [mant_cntq+1*32+2]
    movhpd      m1, [mant_cntq+2*32+2]
    movlpd      m1, [mant_cntq+3*32+2]
    movhpd      m2, [mant_cntq+4*32+2]
    movlpd      m2, [mant_cntq+5*32+2]
    pmulhuw     m0, m3                  ; scale each row's counts before summing
    pmulhuw     m1, m3
    pmulhuw     m2, m3
    paddusw     m0, m1
    paddusw     m0, m2
    pmaddwd     m0, [pw_bap_mul2]
    PHADDD4     m0, m1
    movd       eax, m0
    add        eax, sumd                ; return value = part 2 + part 1
    RET

;------------------------------------------------------------------------------
; void ff_ac3_extract_exponents(uint8_t *exp, int32_t *coef, int nb_coefs)
;
; For each 32-bit coefficient, writes one byte to exp[] computed as
; 151 - (float(2*|coef| + 1) >> 23), clipped to the unsigned byte range.
; Four coefficients are handled per iteration; coef is loaded with mova
; (aligned) and 4 bytes are stored per iteration with movd.
;
; NOTE(review): len is a C int but is used as a full-width register (lenq)
; without explicit extension - confirm the callers guarantee clean upper bits
; on 64-bit targets.
;------------------------------------------------------------------------------

; Packed 32-bit absolute value of %1, in place. %2 is a scratch register that
; is only used (and clobbered) when SSSE3 pabsd is not available.
%macro PABSD 1-2 ; src/dst, unused
%if cpuflag(ssse3)
    pabsd    %1, %1
%else ; src/dst, tmp
    pxor     %2, %2
    pcmpgtd  %2, %1                     ; tmp = all-ones where src < 0
    pxor     %1, %2                     ; conditional bitwise NOT ...
    psubd    %1, %2                     ; ... plus 1 => two's-complement negate
%endif
%endmacro

%macro AC3_EXTRACT_EXPONENTS 0
cglobal ac3_extract_exponents, 3, 3, 4, exp, coef, len
    ; Point both arrays at their ends and count len up from -len to 0 so a
    ; single register serves as index and loop counter.
    add     expq, lenq
    lea     coefq, [coefq+4*lenq]
    neg     lenq
    mova    m2, [pd_1]
    mova    m3, [pd_151]
.loop:
    ; move 4 32-bit coefs to xmm0
    mova      m0, [coefq+4*lenq]
    ; absolute value
    PABSD     m0, m1
    ; convert to float and extract exponents
    pslld     m0, 1
    por       m0, m2                    ; (|coef| << 1) | 1 is never zero
    cvtdq2ps  m1, m0
    psrld     m1, 23                    ; drop the 23 mantissa bits
    mova      m0, m3
    psubd     m0, m1                    ; 151 - exponent field
    ; move the lowest byte in each of 4 dwords to the low dword
    ; NOTE: We cannot just extract the low bytes with pshufb because the dword
    ;       result for 16777215 is -1 due to float inaccuracy. Using packuswb
    ;       clips this to 0, which is the correct exponent.
    packssdw  m0, m0
    packuswb  m0, m0
    movd  [expq+lenq], m0

    add     lenq, 4
    jl .loop
    REP_RET
%endmacro

%if HAVE_SSE2_EXTERNAL
INIT_XMM sse2
AC3_EXTRACT_EXPONENTS
%endif
%if HAVE_SSSE3_EXTERNAL
INIT_XMM ssse3
AC3_EXTRACT_EXPONENTS
%endif