;******************************************************************************
;* FLAC DSP SIMD optimizations
;*
;* Copyright (C) 2014 Loren Merritt
;* Copyright (C) 2014 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

;------------------------------------------------------------------------------
; PMACSDQL acc, a, b, (unused), (unused)
;
; acc += a * b, multiplying the low signed dword of each qword lane and
; accumulating as qwords.
;   %1 = accumulator (read and written)
;   %2 = first multiplicand; overwritten with the product on the non-XOP path
;   %3 = second multiplicand
;   %4, %5 = accepted for the callers' benefit but not referenced in the body
;------------------------------------------------------------------------------
%macro PMACSDQL 5
%if cpuflag(xop)
    pmacsdql %1, %2, %3, %1
%else
    pmuldq   %2, %3
    paddq    %1, %2
%endif
%endmacro

;------------------------------------------------------------------------------
; LPC_32 <cpu suffix>
;
; Emits flac_lpc_32 for the given instruction set.  Register arguments, in
; order: decoded, coeffs, pred_order, qlevel, len; j is a scratch counter.
;
; The first pred_order entries of decoded[] are used as history.  Every
; following entry is replaced in place by
;     decoded[i] += (sum_k coeffs[k] * decoded[i - pred_order + k]) >> qlevel
; Two consecutive outputs are produced per .loop_sample iteration: m2
; accumulates the sum for the first, m3 the sum for the second.  m0 (sample)
; and m1 (coefficient) are reloaded alternately so that each loaded value
; feeds both accumulators.
;------------------------------------------------------------------------------
%macro LPC_32 1
INIT_XMM %1
cglobal flac_lpc_32, 5,6,5, decoded, coeffs, pred_order, qlevel, len, j
    sub    lend, pred_orderd
    jle .ret                            ; nothing to predict
    ; pred_order is a 32-bit int but is used as a 64-bit index below; make the
    ; full register well defined (expands to nothing when both names alias).
    movsxdifnidn pred_orderq, pred_orderd
    lea    decodedq, [decodedq+pred_orderq*4-8] ; two entries before the first output
    lea    coeffsq, [coeffsq+pred_orderq*4]     ; one past the last coefficient
    neg    pred_orderq                  ; negative indices count up towards zero
    movd   m4, qlevelm
ALIGN 16
.loop_sample:
    movd   m0, [decodedq+pred_orderq*4+8] ; oldest history sample for this pair
    add    decodedq, 8                  ; decodedq -> first output of this pair
    movd   m1, [coeffsq+pred_orderq*4]  ; coeffs[0]
    pxor   m2, m2
    pxor   m3, m3
    lea    jq, [pred_orderq+1]
    test   jq, jq
    jz .end_order                       ; pred_order == 1: a single tap
.loop_order:
    PMACSDQL m2, m0, m1, m2, m0
    movd   m0, [decodedq+jq*4]
    PMACSDQL m3, m1, m0, m3, m1
    movd   m1, [coeffsq+jq*4]
    inc    jq
    jl .loop_order
.end_order:
    PMACSDQL m2, m0, m1, m2, m0         ; last tap of the first output
    psrlq  m2, m4
    movd   m0, [decodedq]
    paddd  m0, m2
    movd   [decodedq], m0
    sub    lend, 2
    jl .ret                             ; only one sample was left
    ; The flags set by "sub lend, 2" must survive until the jg below; none of
    ; the SIMD instructions or movd stores in between modify them.
    PMACSDQL m3, m1, m0, m3, m1         ; last tap uses the sample just written
    psrlq  m3, m4
    movd   m1, [decodedq+4]
    paddd  m1, m3
    movd   [decodedq+4], m1
    jg .loop_sample
.ret:
    REP_RET
%endmacro

%if HAVE_XOP_EXTERNAL
LPC_32 xop
%endif
LPC_32 sse4

;----------------------------------------------------------------------------------
;void ff_flac_decorrelate_[lrm]s_16_sse2(uint8_t **out, int32_t **in, int channels,
;                                        int len, int shift);
;----------------------------------------------------------------------------------
; FLAC_DECORRELATE_16 name, first, second [, op]
;   %1 = function name infix (ls, rs, ms or indep2)
;   %2 = index of the xmm register written as the first word of each pair
;   %3 = index of the xmm register written as the second word of each pair
;   %4 = "add" or "sub": m2 = m0 <op> m1 (not used for indep2)
; Each iteration reads 4 dwords from in[0] and in[1], packs them to words
; with signed saturation, interleaves the two channels, shifts left by
; "shift" and stores 16 bytes to out[0].  The count is tested after the
; body, so the loop always runs at least once.
%macro FLAC_DECORRELATE_16 3-4
cglobal flac_decorrelate_%1_16, 2, 4, 4, out, in0, in1, len
%if ARCH_X86_32
    mov      lend, lenm
%endif
    movd       m3, r4m                  ; shift
    shl      lend, 2                    ; samples -> bytes per input channel
    mov      in1q, [in0q + gprsize]     ; in[1]
    mov      in0q, [in0q]               ; in[0]
    mov      outq, [outq]               ; out[0]
    ; Point all three at their ends and walk a negative offset up to zero.
    add      in1q, lenq
    add      in0q, lenq
    add      outq, lenq
    neg      lenq

align 16
.loop:
    mova       m0, [in0q + lenq]
    mova       m1, [in1q + lenq]
%ifidn %1, ms
    psrad      m2, m1, 1
    psubd      m0, m2                   ; m0 = in0 - (in1 >> 1)
%endif
%ifnidn %1, indep2
    p%4d       m2, m0, m1
%endif
    packssdw  m%2, m%2
    packssdw  m%3, m%3
    punpcklwd m%2, m%3
    psllw     m%2, m3
    mova [outq + lenq], m%2
    add      lenq, 16
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
FLAC_DECORRELATE_16 ls, 0, 2, sub
FLAC_DECORRELATE_16 rs, 2, 1, add
FLAC_DECORRELATE_16 ms, 2, 0, add

;----------------------------------------------------------------------------------
;void ff_flac_decorrelate_[lrm]s_32_sse2(uint8_t **out, int32_t **in, int channels,
;                                        int len, int shift);
;----------------------------------------------------------------------------------
; FLAC_DECORRELATE_32 name, first, second, tmp, op
;   %1 = function name infix (ls, rs or ms)
;   %2 = index of the xmm register written as the first dword of each pair
;   %3 = index of the xmm register written as the second dword of each pair
;   %4 = index of the scratch register handed to SBUTTERFLY
;   %5 = "add" or "sub": m2 = m0 <op> m1
; in1q is turned into an offset relative to in0q so that only in0q has to be
; advanced.  The count is tested after the body, so the loop always runs at
; least once.
%macro FLAC_DECORRELATE_32 5
cglobal flac_decorrelate_%1_32, 2, 4, 4, out, in0, in1, len
%if ARCH_X86_32
    mov      lend, lenm
%endif
    movd       m3, r4m                  ; shift
    mov      in1q, [in0q + gprsize]     ; in[1]
    mov      in0q, [in0q]               ; in[0]
    mov      outq, [outq]               ; out[0]
    sub      in1q, in0q                 ; in1q = in[1] - in[0]

align 16
.loop:
    mova       m0, [in0q]
    mova       m1, [in0q + in1q]
%ifidn %1, ms
    psrad      m2, m1, 1
    psubd      m0, m2                   ; m0 = in0 - (in1 >> 1)
%endif
    p%5d       m2, m0, m1
    pslld     m%2, m3
    pslld     m%3, m3

    SBUTTERFLY dq, %2, %3, %4           ; interleave the two channels

    mova [outq         ], m%2
    mova [outq + mmsize], m%3

    add      in0q, mmsize
    add      outq, mmsize*2
    sub      lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
FLAC_DECORRELATE_32 ls, 0, 2, 1, sub
FLAC_DECORRELATE_32 rs, 2, 1, 0, add
FLAC_DECORRELATE_32 ms, 2, 0, 1, add

;-----------------------------------------------------------------------------------------
;void ff_flac_decorrelate_indep<ch>_<bps>_<opt>(uint8_t **out, int32_t **in, int channels,
;                                            int len, int shift);
;-----------------------------------------------------------------------------------------
; Interleaves <ch> independent channels into out[0], mmsize/4 samples per
; channel and iteration, shifting every output element left by "shift".
; For 16 bits the second half of the channels is packed (signed saturation)
; into the registers holding the first half, which is why only REPCOUNT
; registers are loaded, shifted and stored.
;%1 = bps
;%2 = channels
;%3 = last xmm reg used
;%4 = word/dword (shift instruction)
%macro FLAC_DECORRELATE_INDEP 4
%define REPCOUNT %2/(32/%1) ; 16bits = channels / 2; 32bits = channels
cglobal flac_decorrelate_indep%2_%1, 2, %2+2, %3+1, out, in0, in1, len, in2, in3, in4, in5, in6, in7
%if ARCH_X86_32
%if %2 == 6
    ; Rename the registers so that all six channel pointers get one, and
    ; count len down in its stack slot instead.
    DEFINE_ARGS out, in0, in1, in2, in3, in4, in5
    %define  lend  dword r3m
%else
    mov      lend, lenm
%endif
%endif
    movd      m%3, r4m                  ; shift

    ; Fetch in[1] .. in[channels-1] while in0q still points at the in[] array.
%assign %%i 1
%rep %2-1
    mov      in %+ %%i %+ q, [in0q+%%i*gprsize]
%assign %%i %%i+1
%endrep

    mov      in0q, [in0q]
    mov      outq, [outq]

    ; Turn the other channel pointers into offsets relative to in0q.
%assign %%i 1
%rep %2-1
    sub      in %+ %%i %+ q, in0q
%assign %%i %%i+1
%endrep

align 16
.loop:
    mova       m0, [in0q]

%assign %%i 1
%rep REPCOUNT-1
    mova       m %+ %%i, [in0q + in %+ %%i %+ q]
%assign %%i %%i+1
%endrep

%if %1 == 32

%if %2 == 8
    TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, 8
%elif %2 == 6
    SBUTTERFLY dq, 0, 1, 6
    SBUTTERFLY dq, 2, 3, 6
    SBUTTERFLY dq, 4, 5, 6

    punpcklqdq m6, m0, m2
    punpckhqdq m2, m4
    shufps     m4, m0, 0xe4
    punpcklqdq m0, m1, m3
    punpckhqdq m3, m5
    shufps     m5, m1, 0xe4
    SWAP 0,6,1,4,5,3
%elif %2 == 4
    TRANSPOSE4x4D 0, 1, 2, 3, 4
%else ; %2 == 2
    SBUTTERFLY dq, 0, 1, 2
%endif

%else ; %1 == 16

%if %2 == 8
    packssdw   m0, [in0q + in4q]
    packssdw   m1, [in0q + in5q]
    packssdw   m2, [in0q + in6q]
    packssdw   m3, [in0q + in7q]
    TRANSPOSE2x4x4W 0, 1, 2, 3, 4
%elif %2 == 6
    packssdw   m0, [in0q + in3q]
    packssdw   m1, [in0q + in4q]
    packssdw   m2, [in0q + in5q]
    pshufd     m3, m0, q1032
    punpcklwd  m0, m1
    punpckhwd  m1, m2
    punpcklwd  m2, m3

    shufps     m3, m0, m2, q2020
    shufps     m0, m1, q2031
    shufps     m2, m1, q3131
    shufps     m1, m2, m3, q3120
    shufps     m3, m0, q0220
    shufps     m0, m2, q3113
    SWAP 2, 0, 3
%else ; %2 == 4
    packssdw   m0, [in0q + in2q]
    packssdw   m1, [in0q + in3q]
    SBUTTERFLY wd, 0, 1, 2
    SBUTTERFLY dq, 0, 1, 2
%endif

%endif

%assign %%i 0
%rep REPCOUNT
    psll%4     m %+ %%i, m%3
%assign %%i %%i+1
%endrep

%assign %%i 0
%rep REPCOUNT
    mova [outq + %%i*mmsize], m %+ %%i
%assign %%i %%i+1
%endrep

    add      in0q, mmsize
    add      outq, mmsize*REPCOUNT
    sub      lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
FLAC_DECORRELATE_16 indep2, 0, 1 ; Reuse stereo 16bits macro
FLAC_DECORRELATE_INDEP 32, 2, 3, d
FLAC_DECORRELATE_INDEP 16, 4, 3, w
FLAC_DECORRELATE_INDEP 32, 4, 5, d
FLAC_DECORRELATE_INDEP 16, 6, 4, w
FLAC_DECORRELATE_INDEP 32, 6, 7, d
%if ARCH_X86_64
FLAC_DECORRELATE_INDEP 16, 8, 5, w
FLAC_DECORRELATE_INDEP 32, 8, 9, d
%endif

INIT_XMM avx
FLAC_DECORRELATE_INDEP 32, 4, 5, d
FLAC_DECORRELATE_INDEP 32, 6, 7, d
%if ARCH_X86_64
FLAC_DECORRELATE_INDEP 16, 8, 5, w
FLAC_DECORRELATE_INDEP 32, 8, 9, d
%endif