;******************************************************************************
;* TTA Encoder DSP SIMD optimizations
;*
;* Copyright (C) 2014-2016 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Per-lane constants used to build the new dx[4..7]:
; the shifted dl lanes are OR-ed with pd_1224 and then AND-ed with pd_n0113.
pd_n0113: dd ~0, ~1, ~1, ~3
pd_1224:  dd  1,  2,  2,  4

SECTION .text

;------------------------------------------------------------------------------
; TTAENC_MULLD dst, src
;
; dst[i] = low 32 bits of dst[i] * src[i] for each of the four dword lanes.
; src is left unmodified. With SSE4 this is a single pmulld; otherwise the
; result is assembled from two pmuludq passes (even lanes, then odd lanes),
; which clobbers m6 and m7.
;------------------------------------------------------------------------------
%macro TTAENC_MULLD 2
%if cpuflag(sse4)
    pmulld     %1, %2
%else
    pshufd     m6, %2, q2301            ; swap neighbours: odd lanes of src -> even slots
    pshufd     m7, %1, q2301            ; same for dst
    pmuludq    m6, m7                   ; 64-bit products of lanes 1 and 3
    pshufd     m6, m6, q3120            ; gather their low dwords into lanes 0-1
    pmuludq    %1, %2                   ; 64-bit products of lanes 0 and 2
    pshufd     %1, %1, q3120            ; gather their low dwords into lanes 0-1
    punpckldq  %1, m6                   ; interleave -> products 0, 1, 2, 3
%endif
%endmacro

;------------------------------------------------------------------------------
; TTAENC_FILTER cpuname, xmm_register_count
;
; Emits ttaenc_filter_process for the requested instruction set.
; Named arguments, in order: qm, dx, dl, error, in, shift, round.
; The first five are loaded into registers and dereferenced; shift and round
; are read through their shiftm / roundm operands. qm, dx and dl are each
; accessed as two 16-byte vectors (eight dwords) using mova.
;------------------------------------------------------------------------------
%macro TTAENC_FILTER 2
INIT_XMM %1
cglobal ttaenc_filter_process, 5,5,%2, qm, dx, dl, error, in, shift, round
    mova       m2, [qmq]                ; qm[0..3]
    mova       m3, [qmq + mmsize]       ; qm[4..7]
    mova       m4, [dxq]                ; dx[0..3]
    mova       m5, [dxq + mmsize]       ; dx[4..7]

    ; Adapt the weights by the sign of the previous error:
    ;   *error < 0  ->  qm[i] -= dx[i]
    ;   *error > 0  ->  qm[i] += dx[i]
    ;   *error == 0 ->  qm[i] unchanged (psignd yields zero)
    movd       m6, [errorq]
    SPLATD     m6                       ; broadcast *error to every lane
    psignd     m0, m4, m6
    psignd     m1, m5, m6
    paddd      m2, m0
    paddd      m3, m1
    mova       [qmq], m2
    mova       [qmq + mmsize], m3

    mova       m0, [dlq]                ; dl[0..3]
    mova       m1, [dlq + mmsize]       ; dl[4..7]

    ; m2 = dl[0..3] * qm[0..3], m3 = dl[4..7] * qm[4..7] (updated qm)
    TTAENC_MULLD m2, m0
    TTAENC_MULLD m3, m1

    ; sum = round + dl[0]*qm[0] + ... + dl[7]*qm[7], built up in lane 0 of m6.
    ; The reduction is done with shuffles; a horizontal add (phaddd) seemed
    ; to be slower.
    paddd      m2, m3                   ; p0+p4, p1+p5, p2+p6, p3+p7
    pshufd     m3, m2, q0032            ; bring lanes 2 and 3 down to lanes 0 and 1
    paddd      m2, m3                   ; lane 0: even products, lane 1: odd products
    movd       m6, roundm
    paddd      m6, m2
    pshufd     m2, m2, q0001            ; lane 1 -> lane 0
    paddd      m6, m2                   ; lane 0 now holds the complete sum

    ; Slide both histories down by one element.
    palignr    m5, m4, 4                ; dx[0..3] = old dx[1..4]
    palignr    m2, m1, m0, 4            ; dl[0..3] = old dl[1..4]

    ; New dx[4..7], derived from dl[4..7] before those are rewritten below:
    ;   dx[4] =  (dl[4] >> 30) | 1
    ;   dx[5] = ((dl[5] >> 30) | 2) & ~1
    ;   dx[6] = ((dl[6] >> 30) | 2) & ~1
    ;   dx[7] = ((dl[7] >> 30) | 4) & ~3
    psrad      m4, m1, 30
    por        m4, [pd_1224]
    pand       m4, [pd_n0113]

    mova       [dlq], m2
    mova       [dxq], m5
    mova       [dxq + mmsize], m4

    movd       m2, shiftm
    movd       m0, [inq]                ; m0 keeps the incoming *in for the dl update
    psrad      m6, m2                   ; sum >> shift
    psubd      m3, m0, m6
    movd       [inq], m3                ; *in -= sum >> shift
    movd       [errorq], m3             ; *error = new *in

    ; Rebuild dl[4..7] from the incoming *in (m0) and the old dl[5..7] (m1):
    ;   dl[4] = -dl[5];  dl[5] = -dl[6];  dl[6] = in - dl[7];  dl[7] = in;
    ;   dl[5] += dl[6];  dl[4] += dl[5];
    psrldq     m1, 4                    ; dl5, dl6, dl7, 0
    pslldq     m0, 12                   ; 0, 0, 0, in
    pshufd     m0, m0, q3300            ; 0, 0, in, in
    psubd      m0, m1                   ; -dl5, -dl6, in-dl7, in
    psrldq     m1, m0, 4                ; -dl6, in-dl7, in, 0
    pshufd     m1, m1, q3310            ; -dl6, in-dl7, 0, 0
    paddd      m0, m1                   ; lanes 0 and 1 each gain their upper neighbour
    psrldq     m1, 4                    ; in-dl7, 0, 0, 0
    paddd      m0, m1                   ; lane 0 gains the remaining in-dl7 term
    mova       [dlq + mmsize], m0
    RET
%endmacro

TTAENC_FILTER ssse3, 8
TTAENC_FILTER sse4,  7