1cabdff1aSopenharmony_ci/* 2cabdff1aSopenharmony_ci * SIMD-optimized MP3 decoding functions 3cabdff1aSopenharmony_ci * Copyright (c) 2010 Vitor Sessak 4cabdff1aSopenharmony_ci * 5cabdff1aSopenharmony_ci * This file is part of FFmpeg. 6cabdff1aSopenharmony_ci * 7cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 8cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 9cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 10cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 11cabdff1aSopenharmony_ci * 12cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 13cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 14cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15cabdff1aSopenharmony_ci * Lesser General Public License for more details. 16cabdff1aSopenharmony_ci * 17cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 18cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 19cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20cabdff1aSopenharmony_ci */ 21cabdff1aSopenharmony_ci 22cabdff1aSopenharmony_ci#include <stddef.h> 23cabdff1aSopenharmony_ci 24cabdff1aSopenharmony_ci#include "config.h" 25cabdff1aSopenharmony_ci#include "libavutil/attributes.h" 26cabdff1aSopenharmony_ci#include "libavutil/cpu.h" 27cabdff1aSopenharmony_ci#include "libavutil/mem_internal.h" 28cabdff1aSopenharmony_ci#include "libavutil/x86/asm.h" 29cabdff1aSopenharmony_ci#include "libavutil/x86/cpu.h" 30cabdff1aSopenharmony_ci#include "libavcodec/mpegaudiodsp.h" 31cabdff1aSopenharmony_ci 32cabdff1aSopenharmony_ci#define DECL(CPU)\ 33cabdff1aSopenharmony_cistatic void imdct36_blocks_ ## CPU(float *out, float *buf, float *in, int count, int switch_point, int block_type);\ 34cabdff1aSopenharmony_civoid ff_imdct36_float_ ## CPU(float *out, float *buf, float *in, float *win); 35cabdff1aSopenharmony_ci 36cabdff1aSopenharmony_ci#if HAVE_X86ASM 37cabdff1aSopenharmony_ciDECL(sse2) 38cabdff1aSopenharmony_ciDECL(sse3) 39cabdff1aSopenharmony_ciDECL(ssse3) 40cabdff1aSopenharmony_ciDECL(avx) 41cabdff1aSopenharmony_ci#endif /* HAVE_X86ASM */ 42cabdff1aSopenharmony_ci 43cabdff1aSopenharmony_civoid ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win, 44cabdff1aSopenharmony_ci float *tmpbuf); 45cabdff1aSopenharmony_civoid ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win, 46cabdff1aSopenharmony_ci float *tmpbuf); 47cabdff1aSopenharmony_ci 48cabdff1aSopenharmony_ciDECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40]; 49cabdff1aSopenharmony_ci 50cabdff1aSopenharmony_ci#if HAVE_6REGS && HAVE_SSE_INLINE 51cabdff1aSopenharmony_ci 52cabdff1aSopenharmony_ci#define MACS(rt, ra, rb) rt+=(ra)*(rb) 53cabdff1aSopenharmony_ci#define MLSS(rt, ra, rb) rt-=(ra)*(rb) 54cabdff1aSopenharmony_ci 55cabdff1aSopenharmony_ci#define SUM8(op, sum, w, p) \ 56cabdff1aSopenharmony_ci{ \ 57cabdff1aSopenharmony_ci op(sum, (w)[0 * 64], (p)[0 * 64]); \ 58cabdff1aSopenharmony_ci op(sum, (w)[1 * 64], (p)[1 * 64]); \ 59cabdff1aSopenharmony_ci op(sum, (w)[2 * 64], (p)[2 * 64]); \ 60cabdff1aSopenharmony_ci op(sum, (w)[3 * 64], (p)[3 * 64]); \ 61cabdff1aSopenharmony_ci op(sum, (w)[4 * 64], (p)[4 * 64]); \ 62cabdff1aSopenharmony_ci op(sum, (w)[5 * 64], (p)[5 * 64]); \ 63cabdff1aSopenharmony_ci op(sum, (w)[6 * 64], (p)[6 * 64]); \ 64cabdff1aSopenharmony_ci op(sum, (w)[7 * 64], (p)[7 * 64]); \ 65cabdff1aSopenharmony_ci} 66cabdff1aSopenharmony_ci 67cabdff1aSopenharmony_cistatic void apply_window(const float *buf, const float *win1, 68cabdff1aSopenharmony_ci const float *win2, float *sum1, float *sum2, int len) 69cabdff1aSopenharmony_ci{ 70cabdff1aSopenharmony_ci x86_reg count = - 4*len; 71cabdff1aSopenharmony_ci const float *win1a = win1+len; 72cabdff1aSopenharmony_ci const float *win2a = win2+len; 73cabdff1aSopenharmony_ci const float *bufa = buf+len; 74cabdff1aSopenharmony_ci float *sum1a = sum1+len; 75cabdff1aSopenharmony_ci float *sum2a = sum2+len; 76cabdff1aSopenharmony_ci 77cabdff1aSopenharmony_ci 78cabdff1aSopenharmony_ci#define MULT(a, b) \ 79cabdff1aSopenharmony_ci "movaps " #a "(%1,%0), %%xmm1 \n\t" \ 80cabdff1aSopenharmony_ci "movaps " #a "(%3,%0), %%xmm2 \n\t" \ 81cabdff1aSopenharmony_ci "mulps %%xmm2, %%xmm1 \n\t" \ 82cabdff1aSopenharmony_ci "subps %%xmm1, %%xmm0 \n\t" \ 83cabdff1aSopenharmony_ci "mulps " #b "(%2,%0), %%xmm2 \n\t" \ 84cabdff1aSopenharmony_ci "subps %%xmm2, %%xmm4 \n\t" \ 85cabdff1aSopenharmony_ci 86cabdff1aSopenharmony_ci __asm__ volatile( 87cabdff1aSopenharmony_ci "1: \n\t" 88cabdff1aSopenharmony_ci "xorps %%xmm0, %%xmm0 \n\t" 89cabdff1aSopenharmony_ci "xorps %%xmm4, %%xmm4 \n\t" 90cabdff1aSopenharmony_ci 91cabdff1aSopenharmony_ci MULT( 0, 0) 92cabdff1aSopenharmony_ci MULT( 256, 64) 93cabdff1aSopenharmony_ci MULT( 512, 128) 94cabdff1aSopenharmony_ci MULT( 768, 192) 95cabdff1aSopenharmony_ci MULT(1024, 256) 96cabdff1aSopenharmony_ci MULT(1280, 320) 97cabdff1aSopenharmony_ci MULT(1536, 384) 98cabdff1aSopenharmony_ci MULT(1792, 448) 99cabdff1aSopenharmony_ci 100cabdff1aSopenharmony_ci "movaps %%xmm0, (%4,%0) \n\t" 101cabdff1aSopenharmony_ci "movaps %%xmm4, (%5,%0) \n\t" 102cabdff1aSopenharmony_ci "add $16, %0 \n\t" 103cabdff1aSopenharmony_ci "jl 1b \n\t" 104cabdff1aSopenharmony_ci :"+&r"(count) 105cabdff1aSopenharmony_ci :"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a) 106cabdff1aSopenharmony_ci ); 107cabdff1aSopenharmony_ci 108cabdff1aSopenharmony_ci#undef MULT 109cabdff1aSopenharmony_ci} 110cabdff1aSopenharmony_ci 111cabdff1aSopenharmony_cistatic void apply_window_mp3(float *in, float *win, int *unused, float *out, 112cabdff1aSopenharmony_ci ptrdiff_t incr) 113cabdff1aSopenharmony_ci{ 114cabdff1aSopenharmony_ci LOCAL_ALIGNED_16(float, suma, [17]); 115cabdff1aSopenharmony_ci LOCAL_ALIGNED_16(float, sumb, [17]); 116cabdff1aSopenharmony_ci LOCAL_ALIGNED_16(float, sumc, [17]); 117cabdff1aSopenharmony_ci LOCAL_ALIGNED_16(float, sumd, [17]); 118cabdff1aSopenharmony_ci 119cabdff1aSopenharmony_ci float sum; 120cabdff1aSopenharmony_ci 121cabdff1aSopenharmony_ci /* copy to avoid wrap */ 122cabdff1aSopenharmony_ci __asm__ volatile( 123cabdff1aSopenharmony_ci "movaps 0(%0), %%xmm0 \n\t" \ 124cabdff1aSopenharmony_ci "movaps 16(%0), %%xmm1 \n\t" \ 125cabdff1aSopenharmony_ci "movaps 32(%0), %%xmm2 \n\t" \ 126cabdff1aSopenharmony_ci "movaps 48(%0), %%xmm3 \n\t" \ 127cabdff1aSopenharmony_ci "movaps %%xmm0, 0(%1) \n\t" \ 128cabdff1aSopenharmony_ci "movaps %%xmm1, 16(%1) \n\t" \ 129cabdff1aSopenharmony_ci "movaps %%xmm2, 32(%1) \n\t" \ 130cabdff1aSopenharmony_ci "movaps %%xmm3, 48(%1) \n\t" \ 131cabdff1aSopenharmony_ci "movaps 64(%0), %%xmm0 \n\t" \ 132cabdff1aSopenharmony_ci "movaps 80(%0), %%xmm1 \n\t" \ 133cabdff1aSopenharmony_ci "movaps 96(%0), %%xmm2 \n\t" \ 134cabdff1aSopenharmony_ci "movaps 112(%0), %%xmm3 \n\t" \ 135cabdff1aSopenharmony_ci "movaps %%xmm0, 64(%1) \n\t" \ 136cabdff1aSopenharmony_ci "movaps %%xmm1, 80(%1) \n\t" \ 137cabdff1aSopenharmony_ci "movaps %%xmm2, 96(%1) \n\t" \ 138cabdff1aSopenharmony_ci "movaps %%xmm3, 112(%1) \n\t" 139cabdff1aSopenharmony_ci ::"r"(in), "r"(in+512) 140cabdff1aSopenharmony_ci :"memory" 141cabdff1aSopenharmony_ci ); 142cabdff1aSopenharmony_ci 143cabdff1aSopenharmony_ci apply_window(in + 16, win , win + 512, suma, sumc, 16); 144cabdff1aSopenharmony_ci apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16); 145cabdff1aSopenharmony_ci 146cabdff1aSopenharmony_ci SUM8(MACS, suma[0], win + 32, in + 48); 147cabdff1aSopenharmony_ci 148cabdff1aSopenharmony_ci sumc[ 0] = 0; 149cabdff1aSopenharmony_ci sumb[16] = 0; 150cabdff1aSopenharmony_ci sumd[16] = 0; 151cabdff1aSopenharmony_ci 152cabdff1aSopenharmony_ci#define SUMS(suma, sumb, sumc, sumd, out1, out2) \ 153cabdff1aSopenharmony_ci "movups " #sumd "(%4), %%xmm0 \n\t" \ 154cabdff1aSopenharmony_ci "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \ 155cabdff1aSopenharmony_ci "subps " #suma "(%1), %%xmm0 \n\t" \ 156cabdff1aSopenharmony_ci "movaps %%xmm0," #out1 "(%0) \n\t" \ 157cabdff1aSopenharmony_ci\ 158cabdff1aSopenharmony_ci "movups " #sumc "(%3), %%xmm0 \n\t" \ 159cabdff1aSopenharmony_ci "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \ 160cabdff1aSopenharmony_ci "addps " #sumb "(%2), %%xmm0 \n\t" \ 161cabdff1aSopenharmony_ci "movaps %%xmm0," #out2 "(%0) \n\t" 162cabdff1aSopenharmony_ci 163cabdff1aSopenharmony_ci if (incr == 1) { 164cabdff1aSopenharmony_ci __asm__ volatile( 165cabdff1aSopenharmony_ci SUMS( 0, 48, 4, 52, 0, 112) 166cabdff1aSopenharmony_ci SUMS(16, 32, 20, 36, 16, 96) 167cabdff1aSopenharmony_ci SUMS(32, 16, 36, 20, 32, 80) 168cabdff1aSopenharmony_ci SUMS(48, 0, 52, 4, 48, 64) 169cabdff1aSopenharmony_ci 170cabdff1aSopenharmony_ci :"+&r"(out) 171cabdff1aSopenharmony_ci :"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0]) 172cabdff1aSopenharmony_ci :"memory" 173cabdff1aSopenharmony_ci ); 174cabdff1aSopenharmony_ci out += 16*incr; 175cabdff1aSopenharmony_ci } else { 176cabdff1aSopenharmony_ci int j; 177cabdff1aSopenharmony_ci float *out2 = out + 32 * incr; 178cabdff1aSopenharmony_ci out[0 ] = -suma[ 0]; 179cabdff1aSopenharmony_ci out += incr; 180cabdff1aSopenharmony_ci out2 -= incr; 181cabdff1aSopenharmony_ci for(j=1;j<16;j++) { 182cabdff1aSopenharmony_ci *out = -suma[ j] + sumd[16-j]; 183cabdff1aSopenharmony_ci *out2 = sumb[16-j] + sumc[ j]; 184cabdff1aSopenharmony_ci out += incr; 185cabdff1aSopenharmony_ci out2 -= incr; 186cabdff1aSopenharmony_ci } 187cabdff1aSopenharmony_ci } 188cabdff1aSopenharmony_ci 189cabdff1aSopenharmony_ci sum = 0; 190cabdff1aSopenharmony_ci SUM8(MLSS, sum, win + 16 + 32, in + 32); 191cabdff1aSopenharmony_ci *out = sum; 192cabdff1aSopenharmony_ci} 193cabdff1aSopenharmony_ci 194cabdff1aSopenharmony_ci#endif /* HAVE_6REGS && HAVE_SSE_INLINE */ 195cabdff1aSopenharmony_ci 196cabdff1aSopenharmony_ci#if HAVE_X86ASM 197cabdff1aSopenharmony_ci#define DECL_IMDCT_BLOCKS(CPU1, CPU2) \ 198cabdff1aSopenharmony_cistatic void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in, \ 199cabdff1aSopenharmony_ci int count, int switch_point, int block_type) \ 200cabdff1aSopenharmony_ci{ \ 201cabdff1aSopenharmony_ci int align_end = count - (count & 3); \ 202cabdff1aSopenharmony_ci int j; \ 203cabdff1aSopenharmony_ci for (j = 0; j < align_end; j+= 4) { \ 204cabdff1aSopenharmony_ci LOCAL_ALIGNED_16(float, tmpbuf, [1024]); \ 205cabdff1aSopenharmony_ci float *win = mdct_win_sse[switch_point && j < 4][block_type]; \ 206cabdff1aSopenharmony_ci /* apply window & overlap with previous buffer */ \ 207cabdff1aSopenharmony_ci \ 208cabdff1aSopenharmony_ci /* select window */ \ 209cabdff1aSopenharmony_ci ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf); \ 210cabdff1aSopenharmony_ci in += 4*18; \ 211cabdff1aSopenharmony_ci buf += 4*18; \ 212cabdff1aSopenharmony_ci out += 4; \ 213cabdff1aSopenharmony_ci } \ 214cabdff1aSopenharmony_ci for (; j < count; j++) { \ 215cabdff1aSopenharmony_ci /* apply window & overlap with previous buffer */ \ 216cabdff1aSopenharmony_ci \ 217cabdff1aSopenharmony_ci /* select window */ \ 218cabdff1aSopenharmony_ci int win_idx = (switch_point && j < 2) ? 0 : block_type; \ 219cabdff1aSopenharmony_ci float *win = ff_mdct_win_float[win_idx + (4 & -(j & 1))]; \ 220cabdff1aSopenharmony_ci \ 221cabdff1aSopenharmony_ci ff_imdct36_float_ ## CPU1(out, buf, in, win); \ 222cabdff1aSopenharmony_ci \ 223cabdff1aSopenharmony_ci in += 18; \ 224cabdff1aSopenharmony_ci buf++; \ 225cabdff1aSopenharmony_ci out++; \ 226cabdff1aSopenharmony_ci } \ 227cabdff1aSopenharmony_ci} 228cabdff1aSopenharmony_ci 229cabdff1aSopenharmony_ci#if HAVE_SSE 230cabdff1aSopenharmony_ciDECL_IMDCT_BLOCKS(sse2,sse) 231cabdff1aSopenharmony_ciDECL_IMDCT_BLOCKS(sse3,sse) 232cabdff1aSopenharmony_ciDECL_IMDCT_BLOCKS(ssse3,sse) 233cabdff1aSopenharmony_ci#endif 234cabdff1aSopenharmony_ci#if HAVE_AVX_EXTERNAL 235cabdff1aSopenharmony_ciDECL_IMDCT_BLOCKS(avx,avx) 236cabdff1aSopenharmony_ci#endif 237cabdff1aSopenharmony_ci#endif /* HAVE_X86ASM */ 238cabdff1aSopenharmony_ci 239cabdff1aSopenharmony_ciav_cold void ff_mpadsp_init_x86_tabs(void) 240cabdff1aSopenharmony_ci{ 241cabdff1aSopenharmony_ci int i, j; 242cabdff1aSopenharmony_ci for (j = 0; j < 4; j++) { 243cabdff1aSopenharmony_ci for (i = 0; i < 40; i ++) { 244cabdff1aSopenharmony_ci mdct_win_sse[0][j][4*i ] = ff_mdct_win_float[j ][i]; 245cabdff1aSopenharmony_ci mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i]; 246cabdff1aSopenharmony_ci mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j ][i]; 247cabdff1aSopenharmony_ci mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i]; 248cabdff1aSopenharmony_ci mdct_win_sse[1][j][4*i ] = ff_mdct_win_float[0 ][i]; 249cabdff1aSopenharmony_ci mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4 ][i]; 250cabdff1aSopenharmony_ci mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j ][i]; 251cabdff1aSopenharmony_ci mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i]; 252cabdff1aSopenharmony_ci } 253cabdff1aSopenharmony_ci } 254cabdff1aSopenharmony_ci} 255cabdff1aSopenharmony_ci 256cabdff1aSopenharmony_ciav_cold void ff_mpadsp_init_x86(MPADSPContext *s) 257cabdff1aSopenharmony_ci{ 258cabdff1aSopenharmony_ci av_unused int cpu_flags = av_get_cpu_flags(); 259cabdff1aSopenharmony_ci 260cabdff1aSopenharmony_ci#if HAVE_6REGS && HAVE_SSE_INLINE 261cabdff1aSopenharmony_ci if (INLINE_SSE(cpu_flags)) { 262cabdff1aSopenharmony_ci s->apply_window_float = apply_window_mp3; 263cabdff1aSopenharmony_ci } 264cabdff1aSopenharmony_ci#endif /* HAVE_SSE_INLINE */ 265cabdff1aSopenharmony_ci 266cabdff1aSopenharmony_ci#if HAVE_X86ASM 267cabdff1aSopenharmony_ci#if HAVE_SSE 268cabdff1aSopenharmony_ci if (EXTERNAL_SSE2(cpu_flags)) { 269cabdff1aSopenharmony_ci s->imdct36_blocks_float = imdct36_blocks_sse2; 270cabdff1aSopenharmony_ci } 271cabdff1aSopenharmony_ci if (EXTERNAL_SSE3(cpu_flags)) { 272cabdff1aSopenharmony_ci s->imdct36_blocks_float = imdct36_blocks_sse3; 273cabdff1aSopenharmony_ci } 274cabdff1aSopenharmony_ci if (EXTERNAL_SSSE3(cpu_flags)) { 275cabdff1aSopenharmony_ci s->imdct36_blocks_float = imdct36_blocks_ssse3; 276cabdff1aSopenharmony_ci } 277cabdff1aSopenharmony_ci#endif 278cabdff1aSopenharmony_ci#if HAVE_AVX_EXTERNAL 279cabdff1aSopenharmony_ci if (EXTERNAL_AVX(cpu_flags)) { 280cabdff1aSopenharmony_ci s->imdct36_blocks_float = imdct36_blocks_avx; 281cabdff1aSopenharmony_ci } 282cabdff1aSopenharmony_ci#endif 283cabdff1aSopenharmony_ci#endif /* HAVE_X86ASM */ 284cabdff1aSopenharmony_ci} 285