;******************************************************************************
;* AAC Spectral Band Replication decoding functions
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
; sign-bit masks, equivalent to multiplying selected lanes by -1.0/1.0
ps_mask         times 2 dd 1<<31, 0
ps_mask2        times 2 dd 0, 1<<31
ps_mask3        dd  0, 0, 0, 1<<31
ps_noise0       times 2 dd  1.0,  0.0
ps_noise2       times 2 dd -1.0,  0.0
ps_noise13      dd  0.0,  1.0, 0.0, -1.0
                dd  0.0, -1.0, 0.0,  1.0
                dd  0.0,  1.0, 0.0, -1.0
cextern sbr_noise_table
cextern ps_neg

SECTION .text

; float ff_sbr_sum_square_sse(float (*x)[2], int n)
; returns the sum of x[i][0]^2 + x[i][1]^2 over the n complex values
INIT_XMM sse
cglobal sbr_sum_square, 2, 3, 6
    mov         r2d, r1d
    xorps       m0, m0
    xorps       m1, m1
    sar         r2, 3
    jz          .prepare
.loop:
    movu        m2, [r0 +  0]
    movu        m3, [r0 + 16]
    movu        m4, [r0 + 32]
    movu        m5, [r0 + 48]
    mulps       m2, m2
    mulps       m3, m3
    mulps       m4, m4
    mulps       m5, m5
    addps       m0, m2
    addps       m1, m3
    addps       m0, m4
    addps       m1, m5
    add         r0, 64
    dec         r2
    jnz         .loop
.prepare:
    and         r1, 7
    sar         r1, 1
    jz          .end
; len is a multiple of 2, thus there are at least 4 elements to process
.endloop:
    movu        m2, [r0]
    add         r0, 16
    mulps       m2, m2
    dec         r1
    addps       m0, m2
    jnz         .endloop
.end:
    addps       m0, m1
    movhlps     m2, m0
    addps       m0, m2
    movss       m1, m0
    shufps      m0, m0, 1
    addss       m0, m1
%if ARCH_X86_64 == 0
    movss       r0m, m0
    fld         dword r0m
%endif
    RET

; void ff_sbr_hf_g_filt_sse(float (*Y)[2], const float (*X_high)[40][2],
;                           const float *g_filt, int m_max, intptr_t ixh)
%define STEP 40*4*2 ; byte stride of one X_high[m] row (40 complex floats)
cglobal sbr_hf_g_filt, 5, 6, 5
    lea         r1, [r1 + 8*r4] ; offset by ixh elements into X_high
    mov         r5, r3
    and         r3, 0xFC
    lea         r2, [r2 + r3*4]
    lea         r0, [r0 + r3*8]
    neg         r3
    jz          .loop1
.loop4:
    movlps      m0, [r2 + 4*r3 + 0]
    movlps      m1, [r2 + 4*r3 + 8]
    movlps      m2, [r1 + 0*STEP]
    movlps      m3, [r1 + 2*STEP]
    movhps      m2, [r1 + 1*STEP]
    movhps      m3, [r1 + 3*STEP]
    unpcklps    m0, m0
    unpcklps    m1, m1
    mulps       m0, m2
    mulps       m1, m3
    movu        [r0 + 8*r3 +  0], m0
    movu        [r0 + 8*r3 + 16], m1
    add         r1, 4*STEP
    add         r3, 4
    jnz         .loop4
    and         r5, 3 ; number of single element loops
    jz          .end
.loop1: ; element 0 and 1 can be computed at the same time
    movss       m0, [r2]
    movlps      m2, [r1]
    unpcklps    m0, m0
    mulps       m2, m0
    movlps      [r0], m2
    add         r0, 8
    add         r2, 4
    add         r1, STEP
    dec         r5
    jnz         .loop1
.end:
    RET
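
; For orientation, a scalar C sketch of the high-frequency generator that the
; SSE code below implements (patterned on the float path of sbr_hf_gen in
; libavcodec/sbrdsp.c; variable names are illustrative, not verbatim):
;
;     a0 = alpha1[0] * bw * bw;  a1 = alpha1[1] * bw * bw;
;     a2 = alpha0[0] * bw;       a3 = alpha0[1] * bw;
;     for (i = start; i < end; i++) {
;         X_high[i][0] = X_low[i-2][0]*a0 - X_low[i-2][1]*a1
;                      + X_low[i-1][0]*a2 - X_low[i-1][1]*a3 + X_low[i][0];
;         X_high[i][1] = X_low[i-2][1]*a0 + X_low[i-2][0]*a1
;                      + X_low[i-1][1]*a2 + X_low[i-1][0]*a3 + X_low[i][1];
;     }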

; void ff_sbr_hf_gen_sse(float (*X_high)[2], const float (*X_low)[2],
;                        const float alpha0[2], const float alpha1[2],
;                        float bw, int start, int end)
;
cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
    ; load alpha factors
%define bw m0
%if ARCH_X86_64 == 0 || WIN64
    movss       bw, BWm
%endif
    movlps      m2, [alpha1q]
    movlps      m1, [alpha0q]
    shufps      bw, bw, 0
    mulps       m2, bw ; (a1[0] a1[1])*bw
    mulps       m1, bw ; (a0[0] a0[1])*bw    = (a2 a3)
    mulps       m2, bw ; (a1[0] a1[1])*bw*bw = (a0 a1)
    mova        m3, m1
    mova        m4, m2

    ; Set pointers
%if ARCH_X86_64 == 0 || WIN64
    ; start and end: 6th and 7th args, passed on the stack
    mov         r2d, Sm
    mov         r3d, Em
    DEFINE_ARGS X_high, X_low, start, end
%else
; BW does not actually occupy a register, so shift by 1
    DEFINE_ARGS X_high, X_low, alpha0, alpha1, start, end
    movsxd      startq, startd
    movsxd      endq,   endd
%endif
    sub         startq, endq ; neg num of loops
    lea         X_highq, [X_highq + endq*2*4]
    lea         X_lowq,  [X_lowq  + endq*2*4 - 2*2*4]
    shl         startq, 3 ; offset from num loops

    mova        m0, [X_lowq + startq]
    shufps      m3, m3, q1111
    shufps      m4, m4, q1111
    xorps       m3, [ps_mask]
    shufps      m1, m1, q0000
    shufps      m2, m2, q0000
    xorps       m4, [ps_mask]
.loop2:
    movu        m7, [X_lowq + startq + 8]  ; BbCc
    mova        m6, m0
    mova        m5, m7
    shufps      m0, m0, q2301              ; aAbB
    shufps      m7, m7, q2301              ; bBcC
    mulps       m0, m4
    mulps       m7, m3
    mulps       m6, m2
    mulps       m5, m1
    addps       m7, m0
    mova        m0, [X_lowq + startq + 16] ; CcDd
    addps       m7, m0
    addps       m6, m5
    addps       m7, m6
    mova        [X_highq + startq], m7
    add         startq, 16
    jnz         .loop2
    RET

; void ff_sbr_sum64x5_sse(float *z)
; z[k] += z[k+64] + z[k+128] + z[k+192] + z[k+256] for 0 <= k < 64
cglobal sbr_sum64x5, 1,2,4,z
    lea         r1q, [zq+ 256]
.loop:
    mova        m0, [zq+   0]
    mova        m2, [zq+  16]
    mova        m1, [zq+ 256]
    mova        m3, [zq+ 272]
    addps       m0, [zq+ 512]
    addps       m2, [zq+ 528]
    addps       m1, [zq+ 768]
    addps       m3, [zq+ 784]
    addps       m0, [zq+1024]
    addps       m2, [zq+1040]
    addps       m0, m1
    addps       m2, m3
    mova        [zq],    m0
    mova        [zq+16], m2
    add         zq, 32
    cmp         zq, r1q
    jne         .loop
    REP_RET

; void ff_sbr_qmf_post_shuffle_sse(float W[32][2], const float *z)
; W[k][0] = -z[63-k], W[k][1] = z[k] for 0 <= k < 32
INIT_XMM sse
cglobal sbr_qmf_post_shuffle, 2,3,4,W,z
    lea         r2q, [zq + (64-4)*4]
    mova        m3, [ps_neg]
.loop:
    mova        m1, [zq]
    xorps       m0, m3, [r2q]
    shufps      m0, m0, m0, q0123
    unpcklps    m2, m0, m1
    unpckhps    m0, m0, m1
    mova        [Wq +  0], m2
    mova        [Wq + 16], m0
    add         Wq, 32
    sub         r2q, 16
    add         zq, 16
    cmp         zq, r2q
    jl          .loop
    REP_RET

; void ff_sbr_neg_odd_64_sse(float *z)
; negate every odd-indexed float: z[i] = -z[i] for odd i < 64
INIT_XMM sse
cglobal sbr_neg_odd_64, 1,2,4,z
    lea         r1q, [zq+256]
.loop:
    mova        m0, [zq+ 0]
    mova        m1, [zq+16]
    mova        m2, [zq+32]
    mova        m3, [zq+48]
    xorps       m0, [ps_mask2]
    xorps       m1, [ps_mask2]
    xorps       m2, [ps_mask2]
    xorps       m3, [ps_mask2]
    mova        [zq+ 0], m0
    mova        [zq+16], m1
    mova        [zq+32], m2
    mova        [zq+48], m3
    add         zq, 64
    cmp         zq, r1q
    jne         .loop
    REP_RET

; void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1)
INIT_XMM sse2
cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c
    mov         cq, 64*4-2*mmsize
    lea         vrevq, [vq + 64*4]
.loop:
    mova        m0, [src0q+cq]
    mova        m1, [src1q]
    mova        m4, [src0q+cq+mmsize]
    mova        m5, [src1q+mmsize]
    pshufd      m2, m0, q0123
    pshufd      m3, m1, q0123
    pshufd      m6, m4, q0123
    pshufd      m7, m5, q0123
    addps       m5, m2
    subps       m0, m7
    addps       m1, m6
    subps       m4, m3
    mova        [vrevq], m1
    mova        [vrevq+mmsize], m5
    mova        [vq+cq], m0
    mova        [vq+cq+mmsize], m4
    add         src1q, 2*mmsize
    add         vrevq, 2*mmsize
    sub         cq, 2*mmsize
    jge         .loop
    REP_RET
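
; For orientation, a scalar C sketch of the QMF pre-shuffle performed below
; (patterned on sbr_qmf_pre_shuffle_c in libavcodec/sbrdsp.c; a sketch of the
; intended data movement, not the verbatim source):
;
;     z[64] = z[0];
;     z[65] = z[1];
;     for (k = 1; k < 32; k++) {
;         z[64 + 2*k    ] = -z[64 - k];
;         z[64 + 2*k + 1] =  z[ k + 1];
;     }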

INIT_XMM sse2
cglobal sbr_qmf_pre_shuffle, 1,4,6,z
%define OFFSET (32*4-2*mmsize)
    mov         r3q, OFFSET
    lea         r1q, [zq + (32+1)*4]
    lea         r2q, [zq + 64*4]
    mova        m5, [ps_neg]
.loop:
    movu        m0, [r1q]
    movu        m2, [r1q + mmsize]
    movu        m1, [zq + r3q + 4 + mmsize]
    movu        m3, [zq + r3q + 4]

    pxor        m2, m5
    pxor        m0, m5
    pshufd      m2, m2, q0123
    pshufd      m0, m0, q0123
    SBUTTERFLY  dq, 2, 3, 4
    SBUTTERFLY  dq, 0, 1, 4
    mova        [r2q + 2*r3q + 0*mmsize], m2
    mova        [r2q + 2*r3q + 1*mmsize], m3
    mova        [r2q + 2*r3q + 2*mmsize], m0
    mova        [r2q + 2*r3q + 3*mmsize], m1
    add         r1q, 2*mmsize
    sub         r3q, 2*mmsize
    jge         .loop
    movq        m2, [zq]
    movq        [r2q], m2
    REP_RET

%ifdef PIC
%define NREGS 1
%if UNIX64
%define NOISE_TABLE r6q ; r5q is m_max
%else
%define NOISE_TABLE r5q
%endif
%else
%define NREGS 0
%define NOISE_TABLE sbr_noise_table
%endif

%macro LOAD_NST 1
%ifdef PIC
    lea         NOISE_TABLE, [%1]
    mova        m0, [kxq + NOISE_TABLE]
%else
    mova        m0, [kxq + %1]
%endif
%endmacro

INIT_XMM sse2
; sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_0, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    mova        m0, [ps_noise0]
    jmp         apply_noise_main

; sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_1, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    and         kxq, 1
    shl         kxq, 4
    LOAD_NST    ps_noise13
    jmp         apply_noise_main

; sbr_hf_apply_noise_2(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_2, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    mova        m0, [ps_noise2]
    jmp         apply_noise_main

; sbr_hf_apply_noise_3(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_3, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    and         kxq, 1
    shl         kxq, 4
    LOAD_NST    ps_noise13+16

apply_noise_main:
%if ARCH_X86_64 == 0 || WIN64
    mov         kxd, m_maxm
    DEFINE_ARGS Y, s_m, q_filt, noise, count
%else
    DEFINE_ARGS Y, s_m, q_filt, noise, kx, count
%endif
    movsxdifnidn noiseq, noised
    dec         noiseq
    shl         countd, 2
%ifdef PIC
    lea         NOISE_TABLE, [sbr_noise_table]
%endif
    lea         Yq, [Yq + 2*countq]
    add         s_mq, countq
    add         q_filtq, countq
    shl         noiseq, 3
    pxor        m5, m5
    neg         countq
.loop:
    mova        m1, [q_filtq + countq]
    movu        m3, [noiseq + NOISE_TABLE + 1*mmsize]
    movu        m4, [noiseq + NOISE_TABLE + 2*mmsize]
    add         noiseq, 2*mmsize
    and         noiseq, 0x1ff<<3
    punpckhdq   m2, m1, m1
    punpckldq   m1, m1
    mulps       m1, m3 ; m1 = q_filt[m] * ff_sbr_noise_table[noise]
    mulps       m2, m4 ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
    mova        m3, [s_mq + countq]
    ; TODO: replace by a vpermd in AVX2
    punpckhdq   m4, m3, m3
    punpckldq   m3, m3
    pcmpeqd     m6, m3, m5 ; m6 == 0
    pcmpeqd     m7, m4, m5 ; m7 == 0
    mulps       m3, m0 ; s_m[m] * phi_sign
    mulps       m4, m0 ; s_m[m] * phi_sign
    pand        m1, m6
    pand        m2, m7
    movu        m6, [Yq + 2*countq]
    movu        m7, [Yq + 2*countq + mmsize]
    addps       m3, m1
    addps       m4, m2
    addps       m6, m3
    addps       m7, m4
    movu        [Yq + 2*countq], m6
    movu        [Yq + 2*countq + mmsize], m7
    add         countq, mmsize
    jl          .loop
    RET
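
; For orientation, a scalar C sketch of the negating deinterleave performed
; below (patterned on sbr_qmf_deint_neg_c in libavcodec/sbrdsp.c; a sketch,
; not the verbatim source):
;
;     for (i = 0; i < 32; i++) {
;         v[     i] =  src[63 - 2*i];
;         v[63 - i] = -src[62 - 2*i];
;     }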

; void ff_sbr_qmf_deint_neg_sse(float *v, const float *src)
INIT_XMM sse
cglobal sbr_qmf_deint_neg, 2,4,4,v,src,vrev,c
%define COUNT  32*4
%define OFFSET 32*4
    mov         cq, -COUNT
    lea         vrevq, [vq + OFFSET + COUNT]
    add         vq, OFFSET-mmsize
    add         srcq, 2*COUNT
    mova        m3, [ps_neg]
.loop:
    mova        m0, [srcq + 2*cq + 0*mmsize]
    mova        m1, [srcq + 2*cq + 1*mmsize]
    shufps      m2, m0, m1, q2020
    shufps      m1, m0, q1313
    xorps       m2, m3
    mova        [vq], m1
    mova        [vrevq + cq], m2
    sub         vq, mmsize
    add         cq, mmsize
    jl          .loop
    REP_RET
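
; For orientation, a scalar C sketch of the autocorrelation computed by the
; macro below (patterned on sbr_autocorrelate_c in libavcodec/sbrdsp.c; the
; phi[] output layout is summarized from that reference and should be checked
; against it):
;
;     real_sum2 = x[0][0]*x[2][0] + x[0][1]*x[2][1];
;     imag_sum2 = x[0][0]*x[2][1] - x[0][1]*x[2][0];
;     real_sum1 = imag_sum1 = real_sum0 = 0;
;     for (i = 1; i < 38; i++) {
;         real_sum0 += x[i][0]*x[i][0]   + x[i][1]*x[i][1];
;         real_sum1 += x[i][0]*x[i+1][0] + x[i][1]*x[i+1][1];
;         imag_sum1 += x[i][0]*x[i+1][1] - x[i][1]*x[i+1][0];
;         real_sum2 += x[i][0]*x[i+2][0] + x[i][1]*x[i+2][1];
;         imag_sum2 += x[i][0]*x[i+2][1] - x[i][1]*x[i+2][0];
;     }
;     phi[0][1][0] = real_sum2;
;     phi[0][1][1] = imag_sum2;
;     phi[1][0][0] = real_sum0 + x[ 0][0]*x[ 0][0] + x[ 0][1]*x[ 0][1];
;     phi[1][1][0] = real_sum1 + x[ 0][0]*x[ 1][0] + x[ 0][1]*x[ 1][1];
;     phi[1][1][1] = imag_sum1 + x[ 0][0]*x[ 1][1] - x[ 0][1]*x[ 1][0];
;     phi[2][1][0] = real_sum1 + x[38][0]*x[39][0] + x[38][1]*x[39][1];
;     phi[2][1][1] = imag_sum1 + x[38][0]*x[39][1] - x[38][1]*x[39][0];
;     phi[2][0][0] = real_sum0 + x[38][0]*x[38][0] + x[38][1]*x[38][1];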

%macro SBR_AUTOCORRELATE 0
cglobal sbr_autocorrelate, 2,3,8,32, x, phi, cnt
    mov         cntq, 37*8
    add         xq, cntq
    neg         cntq

%if cpuflag(sse3)
%define MOVH movsd
    movddup     m5, [xq+cntq]
%else
%define MOVH movlps
    movlps      m5, [xq+cntq]
    movlhps     m5, m5
%endif
    MOVH        m7, [xq+cntq+8 ]
    MOVH        m1, [xq+cntq+16]
    shufps      m7, m7, q0110
    shufps      m1, m1, q0110
    mulps       m3, m5, m7 ; x[0][0] * x[1][0], x[0][1] * x[1][1], x[0][0] * x[1][1], x[0][1] * x[1][0]
    mulps       m4, m5, m5 ; x[0][0] * x[0][0], x[0][1] * x[0][1]
    mulps       m5, m1     ; real_sum2 = x[0][0] * x[2][0], x[0][1] * x[2][1]; imag_sum2 = x[0][0] * x[2][1], x[0][1] * x[2][0]
    movaps      [rsp   ], m3
    movaps      [rsp+16], m4
    add         cntq, 8

    MOVH        m2, [xq+cntq+16]
    movlhps     m7, m7
    shufps      m2, m2, q0110
    mulps       m6, m7, m1 ; real_sum1 = x[1][0] * x[2][0], x[1][1] * x[2][1]; imag_sum1 += x[1][0] * x[2][1], x[1][1] * x[2][0]
    mulps       m4, m7, m2
    mulps       m7, m7     ; real_sum0 = x[1][0] * x[1][0], x[1][1] * x[1][1]
    addps       m5, m4     ; real_sum2 += x[1][0] * x[3][0], x[1][1] * x[3][1]; imag_sum2 += x[1][0] * x[3][1], x[1][1] * x[3][0]

align 16
.loop:
    add         cntq, 8
    MOVH        m0, [xq+cntq+16]
    movlhps     m1, m1
    shufps      m0, m0, q0110
    mulps       m3, m1, m2
    mulps       m4, m1, m0
    mulps       m1, m1
    addps       m6, m3 ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0]
    addps       m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0]
    addps       m7, m1 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1]
    add         cntq, 8
    MOVH        m1, [xq+cntq+16]
    movlhps     m2, m2
    shufps      m1, m1, q0110
    mulps       m3, m2, m0
    mulps       m4, m2, m1
    mulps       m2, m2
    addps       m6, m3 ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0]
    addps       m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0]
    addps       m7, m2 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1]
    add         cntq, 8
    MOVH        m2, [xq+cntq+16]
    movlhps     m0, m0
    shufps      m2, m2, q0110
    mulps       m3, m0, m1
    mulps       m4, m0, m2
    mulps       m0, m0
    addps       m6, m3 ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0]
    addps       m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0]
    addps       m7, m0 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1]
    jl          .loop

    movlhps     m1, m1
    mulps       m2, m1
    mulps       m1, m1
    addps       m2, m6 ; real_sum1 + x[38][0] * x[39][0], x[38][1] * x[39][1]; imag_sum1 + x[38][0] * x[39][1], x[38][1] * x[39][0]
    addps       m1, m7 ; real_sum0 + x[38][0] * x[38][0], x[38][1] * x[38][1]
    addps       m6, [rsp   ] ; real_sum1 + x[ 0][0] * x[ 1][0], x[ 0][1] * x[ 1][1]; imag_sum1 + x[ 0][0] * x[ 1][1], x[ 0][1] * x[ 1][0]
    addps       m7, [rsp+16] ; real_sum0 + x[ 0][0] * x[ 0][0], x[ 0][1] * x[ 0][1]

    xorps       m2, [ps_mask3]
    xorps       m5, [ps_mask3]
    xorps       m6, [ps_mask3]
    HADDPS      m2, m5, m3
    HADDPS      m7, m6, m4
%if cpuflag(sse3)
    movshdup    m0, m1
%else
    movss       m0, m1
    shufps      m1, m1, q0001
%endif
    addss       m1, m0
    movaps      [phiq     ], m2
    movhps      [phiq+0x18], m7
    movss       [phiq+0x28], m7
    movss       [phiq+0x10], m1
    RET
%endmacro

INIT_XMM sse
SBR_AUTOCORRELATE
INIT_XMM sse3
SBR_AUTOCORRELATE