;******************************************************************************
;* TAK DSP SIMD optimizations
;*
;* Copyright (C) 2015 Paul B Mahol
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Four dwords of 128: the bias added just before the ">> 8" step in
; tak_decorrelate_sf (128 is half of 1 << 8).
pd_128: times 4 dd 128

SECTION .text

; Conventions shared by every routine in this file
; ------------------------------------------------
; * p1 and p2 are accessed as arrays of 32-bit elements; length is the
;   element count.
; * Index setup: length is turned into a byte count (shl by 2), both pointers
;   are advanced to the end of their arrays and the byte count is negated, so
;   [pNq+lengthq] walks the arrays front to back while lengthq counts up
;   towards zero. The loop exits as soon as lengthq is no longer negative.
; * The bounds test sits at the bottom of each loop, so one full iteration is
;   always executed, and every iteration handles a whole number of vector
;   registers. Memory is therefore touched in whole-iteration chunks.
;   NOTE(review): callers presumably guarantee length > 0 and enough padding
;   for the last chunk - confirm against the C call sites.
; * NOTE(review): mova is x86inc's aligned move, so p1/p2 are presumably
;   required to be mmsize-aligned - confirm against the C call sites.

INIT_XMM sse2

;------------------------------------------------------------------------------
; tak_decorrelate_ls(p1, p2, length)
;
;   p2[i] = p1[i] + p2[i]
;
; p1 is only read; p2 is updated in place.
; Two vector registers (2*mmsize bytes) are processed per iteration.
;------------------------------------------------------------------------------
cglobal tak_decorrelate_ls, 3, 3, 2, p1, p2, length
    shl                     lengthd, 2          ; elements -> bytes
    add                         p1q, lengthq    ; p1 = end of array 1
    add                         p2q, lengthq    ; p2 = end of array 2
    neg                     lengthq             ; index runs from -bytes up to 0
.loop:
    mova                         m0, [p1q+lengthq+mmsize*0]
    mova                         m1, [p1q+lengthq+mmsize*1]
    paddd                        m0, [p2q+lengthq+mmsize*0]
    paddd                        m1, [p2q+lengthq+mmsize*1]
    mova     [p2q+lengthq+mmsize*0], m0
    mova     [p2q+lengthq+mmsize*1], m1
    add                     lengthq, mmsize*2
    jl .loop
    REP_RET

;------------------------------------------------------------------------------
; tak_decorrelate_sr(p1, p2, length)
;
;   p1[i] = p2[i] - p1[i]
;
; p2 is only read; p1 is updated in place.
; Two vector registers (2*mmsize bytes) are processed per iteration.
;------------------------------------------------------------------------------
cglobal tak_decorrelate_sr, 3, 3, 2, p1, p2, length
    shl                     lengthd, 2          ; elements -> bytes
    add                         p1q, lengthq    ; p1 = end of array 1
    add                         p2q, lengthq    ; p2 = end of array 2
    neg                     lengthq             ; index runs from -bytes up to 0

.loop:
    mova                         m0, [p2q+lengthq+mmsize*0]
    mova                         m1, [p2q+lengthq+mmsize*1]
    psubd                        m0, [p1q+lengthq+mmsize*0]  ; p2 - p1
    psubd                        m1, [p1q+lengthq+mmsize*1]
    mova     [p1q+lengthq+mmsize*0], m0
    mova     [p1q+lengthq+mmsize*1], m1
    add                     lengthq, mmsize*2
    jl .loop
    REP_RET

;------------------------------------------------------------------------------
; tak_decorrelate_sm(p1, p2, length)
;
;   a     = p1[i] - (p2[i] >> 1)        (arithmetic shift)
;   p1[i] = a
;   p2[i] = a + p2[i]
;
; Both arrays are updated in place.
; Two vector registers (2*mmsize bytes) of each array are processed per
; iteration: m0/m1/m2 handle the first, m3/m4/m5 the second.
;------------------------------------------------------------------------------
cglobal tak_decorrelate_sm, 3, 3, 6, p1, p2, length
    shl                     lengthd, 2          ; elements -> bytes
    add                         p1q, lengthq    ; p1 = end of array 1
    add                         p2q, lengthq    ; p2 = end of array 2
    neg                     lengthq             ; index runs from -bytes up to 0

.loop:
    mova                         m0, [p1q+lengthq]
    mova                         m1, [p2q+lengthq]
    mova                         m3, [p1q+lengthq+mmsize]
    mova                         m4, [p2q+lengthq+mmsize]
    mova                         m2, m1         ; keep p2 intact, shift a copy
    mova                         m5, m4
    psrad                        m2, 1          ; p2 >> 1
    psrad                        m5, 1
    psubd                        m0, m2         ; a = p1 - (p2 >> 1)
    psubd                        m3, m5
    paddd                        m1, m0         ; p2 + a
    paddd                        m4, m3
    mova              [p1q+lengthq], m0
    mova              [p2q+lengthq], m1
    mova       [p1q+lengthq+mmsize], m3
    mova       [p2q+lengthq+mmsize], m4
    add                     lengthq, mmsize*2
    jl .loop
    REP_RET

INIT_XMM sse4

;------------------------------------------------------------------------------
; tak_decorrelate_sf(p1, p2, length, dshift, dfactor)
;
;   b     = (((p2[i] >> dshift) * dfactor + 128) >> 8) << dshift
;   p1[i] = b - p1[i]
;
; The right shifts are arithmetic; the multiply keeps the low 32 bits of each
; product (pmulld). p2 is only read; p1 is updated in place.
; One vector register (mmsize bytes) is processed per iteration.
;
; Only the first three arguments are loaded into registers by cglobal;
; dshift and dfactor are fetched through their dshiftm / dfactorm operands.
;------------------------------------------------------------------------------
cglobal tak_decorrelate_sf, 3, 3, 5, p1, p2, length, dshift, dfactor
    shl             lengthd, 2                  ; elements -> bytes
    add                 p1q, lengthq            ; p1 = end of array 1
    add                 p2q, lengthq            ; p2 = end of array 2
    neg             lengthq                     ; index runs from -bytes up to 0

    movd                 m2, dshiftm            ; shift count in the low dword, rest zeroed
    movd                 m3, dfactorm
    pshufd               m3, m3, 0              ; broadcast dfactor to all four lanes
    mova                 m4, [pd_128]           ; bias for the >> 8 below

.loop:
    mova                 m0, [p1q+lengthq]
    mova                 m1, [p2q+lengthq]
    psrad                m1, m2                 ; p2 >> dshift
    pmulld               m1, m3                 ; * dfactor
    paddd                m1, m4                 ; + 128
    psrad                m1, 8                  ; >> 8
    pslld                m1, m2                 ; << dshift
    psubd                m1, m0                 ; b - p1
    mova      [p1q+lengthq], m1
    add             lengthq, mmsize
    jl .loop
    REP_RET