;******************************************************************************
;* ALAC DSP SIMD optimizations
;*
;* Copyright (C) 2015 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

;------------------------------------------------------------------------------
; alac_decorrelate_stereo(buf0, len, shift, weight)
;
; In:  arg0 (buf0)    pointer to a table of two buffer pointers; entry 0 is
;                     loaded into buf0q, entry 1 (at +gprsize) into buf1q
;      arg1 (len)     element count; scaled by 4 below, i.e. dword elements
;      arg2 (shift)   arithmetic right-shift count applied to the product
;      arg3 (weight)  dword multiplier applied to every buf1 element
;
; Per 32-bit lane:
;      t       = buf0[i] - ((buf1[i] * weight) >> shift)
;      buf0[i] = buf1[i] + t
;      buf1[i] = t
;
; Each pass handles 2*mmsize bytes (8 dwords) and the loop is bottom-tested,
; so at least one pass always runs and the processed length is effectively
; len rounded up to a multiple of 8 elements.
; NOTE(review): mova is the aligned move, so both buffers are presumably
; 16-byte aligned and padded by the caller -- confirm against the allocation.
;------------------------------------------------------------------------------
INIT_XMM sse4
%if ARCH_X86_64
cglobal alac_decorrelate_stereo, 2, 5, 8, buf0, len, shift, weight, buf1
%else
; Only three GPRs are requested on x86_32, so buf1 reuses r2.
cglobal alac_decorrelate_stereo, 2, 3, 8, buf0, len, shift, weight
%define  buf1q  r2q
%endif
    movd    m6, shiftm                  ; m6 = shift count for psrad
    movd    m7, weightm
    SPLATD  m7                          ; weight in every dword lane
    shl     lend, 2                     ; element count -> byte count
    mov     buf1q, [buf0q + gprsize]    ; second table entry; must be read
    mov     buf0q, [buf0q]              ; before buf0q itself is overwritten
    add     buf1q, lenq                 ; point both buffers at their end and
    add     buf0q, lenq                 ; walk a negative offset up to zero
    neg     lenq

align 16
.loop:
    mova    m0, [buf0q + lenq]
    mova    m1, [buf0q + lenq + mmsize]
    mova    m2, [buf1q + lenq]
    mova    m3, [buf1q + lenq + mmsize]
    pmulld  m4, m2, m7                  ; buf1 * weight (low 32 bits)
    pmulld  m5, m3, m7
    psrad   m4, m6                      ; >> shift, sign-preserving
    psrad   m5, m6
    psubd   m0, m4                      ; t = buf0 - scaled buf1
    psubd   m1, m5
    paddd   m2, m0                      ; buf1 + t
    paddd   m3, m1
    mova [buf1q + lenq], m0             ; buf1 <- t
    mova [buf1q + lenq + mmsize], m1
    mova [buf0q + lenq], m2             ; buf0 <- buf1 + t
    mova [buf0q + lenq + mmsize], m3

    add     lenq, mmsize*2
    jl .loop                            ; continue while the offset is negative
    RET

;------------------------------------------------------------------------------
; alac_append_extra_bits_stereo(arg0, arg1, arg2, arg3, arg4)
;
; In:  arg0 (buf0)    pointer to a table of two sample-buffer pointers
;      arg1 (exbuf0)  pointer to a table of two extra-bits-buffer pointers
;      arg2           shift count ("exbits"), read through r2m
;      arg3           not read by this function
;      arg4 (len)     element count; scaled by 4 below, i.e. dword elements
;
; The cglobal names buf1/exbuf1 label the r2/r3 registers, which are reused
; as scratch for the second entry of each pointer table; they do not name
; the incoming arguments 2 and 3.
;
; Per 32-bit lane, for both buffers k = 0, 1:
;      buf[k][i] = (buf[k][i] << exbits) | exbuf[k][i]
;
; Each pass handles 2*mmsize bytes (8 dwords) per buffer and the loop is
; bottom-tested, so at least one pass always runs.
; NOTE(review): mova and the memory operands of por are aligned accesses, so
; all four buffers are presumably 16-byte aligned and padded -- confirm.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal alac_append_extra_bits_stereo, 2, 5, 5, buf0, exbuf0, buf1, exbuf1, len
    movifnidn lend, lenm
    movd      m4, r2m ; exbits          ; read arg2 before r2 is reused as buf1
    shl     lend, 2                     ; element count -> byte count
    mov     buf1q, [buf0q + gprsize]
    mov     buf0q, [buf0q]
    mov   exbuf1q, [exbuf0q + gprsize]
    mov   exbuf0q, [exbuf0q]
    add     buf1q, lenq                 ; point every buffer at its end and
    add     buf0q, lenq                 ; walk a negative offset up to zero
    add   exbuf1q, lenq
    add   exbuf0q, lenq
    neg lenq

align 16
.loop:
    mova      m0, [buf0q + lenq]
    mova      m1, [buf0q + lenq + mmsize]
    pslld     m0, m4                    ; make room for the extra bits
    pslld     m1, m4
    mova      m2, [buf1q + lenq]
    mova      m3, [buf1q + lenq + mmsize]
    pslld     m2, m4
    pslld     m3, m4
    por       m0, [exbuf0q + lenq]      ; merge the extra bits
    por       m1, [exbuf0q + lenq + mmsize]
    por       m2, [exbuf1q + lenq]
    por       m3, [exbuf1q + lenq + mmsize]
    mova [buf0q + lenq         ], m0
    mova [buf0q + lenq + mmsize], m1
    mova [buf1q + lenq         ], m2
    mova [buf1q + lenq + mmsize], m3

    add     lenq, mmsize*2
    jl .loop                            ; continue while the offset is negative
    REP_RET

;------------------------------------------------------------------------------
; alac_append_extra_bits_mono(arg0, arg1, arg2, arg3, arg4)
;
; In:  arg0 (buf)     pointer to a pointer table; only entry 0 is used
;      arg1 (exbuf)   pointer to a pointer table; only entry 0 is used
;      arg2 (exbits)  shift count
;      arg3 (ch)      not read by this function
;      arg4 (len)     element count, read through r4m; scaled by 4 below
;
; Per 32-bit lane:
;      buf[i] = (buf[i] << exbits) | exbuf[i]
;
; Still assembled under the INIT_XMM sse2 setting above.  Each pass handles
; 2*mmsize bytes (8 dwords) and the loop is bottom-tested, so at least one
; pass always runs.
; NOTE(review): aligned accesses again -- presumably 16-byte aligned, padded
; buffers; confirm against the caller.
;------------------------------------------------------------------------------
%if ARCH_X86_64
cglobal alac_append_extra_bits_mono, 2, 5, 3, buf, exbuf, exbits, ch, len
%else
; Only three GPRs on x86_32: len lives in r2, exbits is read from its
; argument slot r2m.
cglobal alac_append_extra_bits_mono, 2, 3, 3, buf, exbuf, len
%define exbitsm r2m
%endif
    movifnidn lend, r4m                 ; arg4, whatever register len maps to
    movd     m2, exbitsm
    shl     lend, 2                     ; element count -> byte count
    mov      bufq, [bufq]               ; dereference table entry 0
    mov    exbufq, [exbufq]
    add      bufq, lenq                 ; point both buffers at their end and
    add    exbufq, lenq                 ; walk a negative offset up to zero
    neg lenq

align 16
.loop:
    mova      m0, [bufq + lenq]
    mova      m1, [bufq + lenq + mmsize]
    pslld     m0, m2                    ; make room for the extra bits
    pslld     m1, m2
    por       m0, [exbufq + lenq]       ; merge the extra bits
    por       m1, [exbufq + lenq + mmsize]
    mova [bufq + lenq], m0
    mova [bufq + lenq + mmsize], m1

    add     lenq, mmsize*2
    jl .loop                            ; continue while the offset is negative
    REP_RET