;*****************************************************************************
;* x86-optimized AC-3 DSP functions
;* Copyright (c) 2011 Justin Ruggles
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; 16777216.0f - used in ff_float_to_fixed24()
pf_1_24: times 4 dd 0x4B800000

; used in ff_ac3_compute_mantissa_size()
cextern ac3_bap_bits
; high-word multipliers (result = (x * mul) >> 16), repeated for both 64-bit
; halves: 21846 ~ 1/3, 0 discards the word, 32768 = 1/2
pw_bap_mul1: dw 21846, 21846, 0, 32768, 21846, 21846, 0, 32768
; per-word weights applied after pw_bap_mul1, repeated for both 64-bit halves
pw_bap_mul2: dw 5, 7, 0, 7, 5, 7, 0, 7

; used in ff_ac3_extract_exponents()
cextern pd_1
pd_151: times 4 dd 151

SECTION .text

;-----------------------------------------------------------------------------
; void ff_ac3_exponent_min(uint8_t *exp, int num_reuse_blocks, int nb_coefs)
;
; For every byte position i this stores
;     exp[i] = min(exp[i], exp[i + 256], ..., exp[i + 256*num_reuse_blocks])
; using unsigned byte minimum, mmsize bytes at a time.
;
; - Returns without touching memory when num_reuse_blocks is 0.
; - exp is accessed with mova (the aligned move), so it has to be
;   mmsize-aligned.
; - Whole vectors are processed: nb_coefs is effectively rounded up to a
;   multiple of mmsize and at least one vector is always handled.
;
; The int arguments are only ever touched through their 32-bit register views.
; The x86-64 calling conventions leave the upper half of a register holding a
; 32-bit argument unspecified, and a 32-bit write zero-extends, so
; reuse_blksq is safe to use for addressing after the shl below.
;-----------------------------------------------------------------------------

%macro AC3_EXPONENT_MIN 0
cglobal ac3_exponent_min, 3, 4, 2, exp, reuse_blks, expn, offset
    ; reuse_blks = byte offset of the last block to compare (256 bytes/block)
    shl  reuse_blksd, 8
    jz .end
    LOOP_ALIGN
.nextexp:
    ; start at the farthest block and walk back towards offset 0
    mov      offsetq, reuse_blksq
    mova          m0, [expq+offsetq]
    sub      offsetq, 256
    LOOP_ALIGN
.nextblk:
    ; m1 is the extra operand PMINUB's signature asks for
    PMINUB        m0, [expq+offsetq], m1
    sub      offsetq, 256
    ; no borrow means the offset just used was > 0, so offset 0 is still due
    jae .nextblk
    mova      [expq], m0
    add         expq, mmsize
    sub        expnd, mmsize
    jg .nextexp
.end:
    REP_RET
%endmacro

%define LOOP_ALIGN ALIGN 16
%if HAVE_SSE2_EXTERNAL
INIT_XMM sse2
AC3_EXPONENT_MIN
%endif
%undef LOOP_ALIGN

;-----------------------------------------------------------------------------
; void ff_float_to_fixed24(int32_t *dst, const float *src, unsigned int len)
;
; dst[i] = (int32_t)(src[i] * 16777216.0f), converted with cvtps2dq.
;
; - src and dst are accessed with movaps/movdqa and must be 16-byte aligned.
; - Each iteration handles 32 floats when m8 is defined (9 or more vector
;   registers available) and 16 floats otherwise; whole iterations are always
;   executed, so len is expected to be a non-zero multiple of that step.
;
; len is an unsigned int, so the counter is updated through lend: the upper
; half of the 64-bit register is not guaranteed to be zero on entry.
;-----------------------------------------------------------------------------

INIT_XMM sse2
cglobal float_to_fixed24, 3, 3, 9, dst, src, len
    movaps     m0, [pf_1_24]
.loop:
    movaps     m1, [srcq    ]
    movaps     m2, [srcq+16 ]
    movaps     m3, [srcq+32 ]
    movaps     m4, [srcq+48 ]
%ifdef m8
    movaps     m5, [srcq+64 ]
    movaps     m6, [srcq+80 ]
    movaps     m7, [srcq+96 ]
    movaps     m8, [srcq+112]
%endif
    ; scale by 2^24
    mulps      m1, m0
    mulps      m2, m0
    mulps      m3, m0
    mulps      m4, m0
%ifdef m8
    mulps      m5, m0
    mulps      m6, m0
    mulps      m7, m0
    mulps      m8, m0
%endif
    ; float -> int32
    cvtps2dq   m1, m1
    cvtps2dq   m2, m2
    cvtps2dq   m3, m3
    cvtps2dq   m4, m4
%ifdef m8
    cvtps2dq   m5, m5
    cvtps2dq   m6, m6
    cvtps2dq   m7, m7
    cvtps2dq   m8, m8
%endif
    movdqa  [dstq    ], m1
    movdqa  [dstq+16 ], m2
    movdqa  [dstq+32 ], m3
    movdqa  [dstq+48 ], m4
%ifdef m8
    movdqa  [dstq+64 ], m5
    movdqa  [dstq+80 ], m6
    movdqa  [dstq+96 ], m7
    movdqa  [dstq+112], m8
    add      srcq, 128
    add      dstq, 128
    sub      lend, 32
%else
    add      srcq, 64
    add      dstq, 64
    sub      lend, 16
%endif
    ; unsigned: continue only while the remaining count is > 0 with no borrow
    ja .loop
    REP_RET

;------------------------------------------------------------------------------
; int ff_ac3_compute_mantissa_size(uint16_t mant_cnt[6][16])
;
; Returns the sum of two partial results computed from the 6x16 table of
; 16-bit counts:
;   1. the column sums over all 6 rows, multiply-accumulated against the 16
;      words read from ac3_bap_bits;
;   2. for columns 1..4 of every row: (cnt * pw_bap_mul1) >> 16, summed over
;      the rows with unsigned saturation, then multiply-accumulated against
;      pw_bap_mul2. Per row this is ~cnt[1]/3 * 5 + ~cnt[2]/3 * 7 +
;      cnt[4]/2 * 7; column 3 is multiplied by 0 in this pass.
;
; mant_cnt is read with movdqa and must be 16-byte aligned.
;------------------------------------------------------------------------------

; Horizontal add of the four dwords of %1; the total ends up in the low dword.
%macro PHADDD4 2 ; xmm src, xmm tmp
    movhlps  %2, %1
    paddd    %1, %2
    pshufd   %2, %1, 0x1
    paddd    %1, %2
%endmacro

INIT_XMM sse2
cglobal ac3_compute_mantissa_size, 1, 2, 4, mant_cnt, sum
    ; each row is 32 bytes: m0 accumulates columns 0-7, m1 columns 8-15
    movdqa      m0, [mant_cntq      ]
    movdqa      m1, [mant_cntq+ 1*16]
    paddw       m0, [mant_cntq+ 2*16]
    paddw       m1, [mant_cntq+ 3*16]
    paddw       m0, [mant_cntq+ 4*16]
    paddw       m1, [mant_cntq+ 5*16]
    paddw       m0, [mant_cntq+ 6*16]
    paddw       m1, [mant_cntq+ 7*16]
    paddw       m0, [mant_cntq+ 8*16]
    paddw       m1, [mant_cntq+ 9*16]
    paddw       m0, [mant_cntq+10*16]
    paddw       m1, [mant_cntq+11*16]
    ; weight the column sums and reduce to a single dword
    pmaddwd     m0, [ac3_bap_bits   ]
    pmaddwd     m1, [ac3_bap_bits+16]
    paddd       m0, m1
    PHADDD4     m0, m1
    movd      sumd, m0
    ; second pass: columns 1-4 (byte offset +2) of two rows per register
    movdqa      m3, [pw_bap_mul1]
    movhpd      m0, [mant_cntq     +2]
    movlpd      m0, [mant_cntq+1*32+2]
    movhpd      m1, [mant_cntq+2*32+2]
    movlpd      m1, [mant_cntq+3*32+2]
    movhpd      m2, [mant_cntq+4*32+2]
    movlpd      m2, [mant_cntq+5*32+2]
    ; scale each row separately before summing, so every row is truncated
    ; on its own
    pmulhuw     m0, m3
    pmulhuw     m1, m3
    pmulhuw     m2, m3
    paddusw     m0, m1
    paddusw     m0, m2
    pmaddwd     m0, [pw_bap_mul2]
    PHADDD4     m0, m1
    movd       eax, m0
    add        eax, sumd
    RET

;------------------------------------------------------------------------------
; void ff_ac3_extract_exponents(uint8_t *exp, int32_t *coef, int nb_coefs)
;
; For each coefficient:
;     v      = (abs(coef[i]) << 1) | 1
;     exp[i] = 151 - (biased float exponent of (float)v), clipped to 0..255
; With a float exponent bias of 127 this is 24 - floor(log2(v)), e.g. 24 for
; coef[i] == 0.
;
; - Four coefficients are handled per iteration; whole iterations are always
;   executed, so nb_coefs is expected to be a positive multiple of 4.
; - coef is read with mova (the aligned move) and must be 16-byte aligned;
;   exp is written 4 bytes at a time with movd.
;------------------------------------------------------------------------------

; Packed 32-bit absolute value of %1. The SSSE3 version ignores %2; the
; fallback uses %2 as a scratch register.
%macro PABSD 1-2 ; src/dst, unused
%if cpuflag(ssse3)
    pabsd    %1, %1
%else ; src/dst, tmp
    pxor     %2, %2
    pcmpgtd  %2, %1             ; %2 = all ones where %1 is negative
    pxor     %1, %2             ; conditional bitwise NOT ...
    psubd    %1, %2             ; ... plus one = conditional negate
%endif
%endmacro

%macro AC3_EXTRACT_EXPONENTS 0
cglobal ac3_extract_exponents, 3, 3, 4, exp, coef, len
    ; nb_coefs is a 32-bit int but is used for 64-bit addressing below; the
    ; upper half of its register is unspecified on x86-64, so widen it first
    movsxdifnidn lenq, lend
    ; point both arrays at their end and count a negative index up to zero
    add     expq, lenq
    lea    coefq, [coefq+4*lenq]
    neg     lenq
    mova      m2, [pd_1]
    mova      m3, [pd_151]
.loop:
    ; move 4 32-bit coefs to xmm0
    mova      m0, [coefq+4*lenq]
    ; absolute value
    PABSD     m0, m1
    ; convert to float and extract exponents
    pslld     m0, 1
    por       m0, m2
    cvtdq2ps  m1, m0
    psrld     m1, 23
    mova      m0, m3
    psubd     m0, m1
    ; move the lowest byte in each of 4 dwords to the low dword
    ; NOTE: We cannot just extract the low bytes with pshufb because the dword
    ;       result for 16777215 is -1 due to float inaccuracy. Using packuswb
    ;       clips this to 0, which is the correct exponent.
    packssdw  m0, m0
    packuswb  m0, m0
    movd  [expq+lenq], m0

    add     lenq, 4
    jl .loop
    REP_RET
%endmacro

%if HAVE_SSE2_EXTERNAL
INIT_XMM sse2
AC3_EXTRACT_EXPONENTS
%endif
%if HAVE_SSSE3_EXTERNAL
INIT_XMM ssse3
AC3_EXTRACT_EXPONENTS
%endif