;*****************************************************************************
;* SSE2-optimized weighted prediction code
;*****************************************************************************
;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

;-----------------------------------------------------------------------------
; biweight pred:
;
; void ff_h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride,
;                               int height, int log2_denom, int weightd,
;                               int weights, int offset);
; and
; void ff_h264_weight_16_sse2(uint8_t *dst, int stride, int height,
;                             int log2_denom, int weight, int offset);
;
; NOTE(review): every function below adds the stride register at full width
; (add r0, r1 / add r0, r2 / lea rN, [stride*2]), so the stride has to be
; valid as a pointer-sized value; the "int stride" in the prototypes above
; looks stale -- confirm against the C declarations.
;-----------------------------------------------------------------------------

; Load the constants for unidirectional weighting.
; In:  r3d = log2_denom, r4d = weight, r5 = offset
; Out: m3  = weight, broadcast to every 16-bit lane
;      m5  = low 16 bits of ((2*offset + 1) << log2_denom) >> 1, broadcast
;      m6  = log2_denom (used as a shift count)
;      m7  = 0 (used to zero-extend bytes to words)
; Clobbers r5.
%macro WEIGHT_SETUP 0
    add        r5, r5
    inc        r5                   ; r5 = 2*offset + 1
    movd       m3, r4d
    movd       m5, r5d
    movd       m6, r3d
    pslld      m5, m6
    psrld      m5, 1
%if mmsize == 16
    pshuflw    m3, m3, 0
    pshuflw    m5, m5, 0
    punpcklqdq m3, m3
    punpcklqdq m5, m5
%else
    pshufw     m3, m3, 0
    pshufw     m5, m5, 0
%endif
    pxor       m7, m7
%endmacro

; Weight two half-registers of pixels.
; %1, %2 = offsets (immediate or register) added to r0 for the two loads.
; Each half is zero-extended to words, multiplied by m3, biased by m5 with
; signed saturation, shifted right arithmetically by m6, and both halves are
; packed back to bytes with unsigned saturation into m0 (%1 half in the low
; part, %2 half in the high part). Clobbers m1.
%macro WEIGHT_OP 2
    movh       m0, [r0+%1]
    movh       m1, [r0+%2]
    punpcklbw  m0, m7
    punpcklbw  m1, m7
    pmullw     m0, m3
    pmullw     m1, m3
    paddsw     m0, m5
    paddsw     m1, m5
    psraw      m0, m6
    psraw      m1, m6
    packuswb   m0, m1
%endmacro

; Emit h264_weight_%1: one full register (mmsize pixels) per row.
; %1 = block width used in the function name, %2 = number of XMM registers
; passed to cglobal.
; r0 = dst, r1 = stride, r2d = height (row counter).
%macro WEIGHT_FUNC_MM 2
cglobal h264_weight_%1, 6, 6, %2
    WEIGHT_SETUP
.nextrow:
    WEIGHT_OP 0, mmsize/2
    mova     [r0], m0
    add        r0, r1
    dec        r2d
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM sse2
WEIGHT_FUNC_MM 16, 8

; Emit h264_weight_%1: half a register per row, two rows per iteration.
; %1 = block width used in the function name, %2 = number of XMM registers
; passed to cglobal.
; r0 = dst, r1 = stride, r2d = height (halved: two rows per pass).
%macro WEIGHT_FUNC_HALF_MM 2
cglobal h264_weight_%1, 6, 6, %2
    WEIGHT_SETUP
    sar       r2d, 1
    lea        r3, [r1*2]           ; r3 is free: log2_denom already lives in m6
.nextrow:
    WEIGHT_OP 0, r1                 ; row 0 -> low half, row 1 -> high half
    movh     [r0], m0
%if mmsize == 16
    movhps [r0+r1], m0
%else
    psrlq      m0, 32
    movh   [r0+r1], m0
%endif
    add        r0, r3
    dec        r2d
    jnz .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
WEIGHT_FUNC_HALF_MM 4, 0
INIT_XMM sse2
WEIGHT_FUNC_HALF_MM 8, 8

; Load the constants for bidirectional weighting.
; In:  r4d = log2_denom, r5d = weightd, r6d = weights, r7m = offset
; Out: m5 = low 16 bits of (off << shift) >> 1, broadcast to every word,
;           where off = (offset + 1) | 1 and shift = log2_denom + 1
;      m6 = shift (used as a shift count)
;      ssse3:     m4 = byte pair (weightd, weights) repeated, for pmaddubsw
;      otherwise: m3 = weightd words, m4 = weights words, m7 = 0
; When either weight equals 128, both weights and off are halved and shift
; drops back to log2_denom (presumably because 128 does not fit the signed
; byte weights that pmaddubsw takes -- TODO confirm).
; Clobbers r4d, r5d, r6d and off_regd. On x86-32 off_regd aliases r3d, so
; callers reload height (r3m) after this macro.
%macro BIWEIGHT_SETUP 0
%if ARCH_X86_64
%define off_regd r7d
%else
%define off_regd r3d
%endif
    mov  off_regd, r7m
    add  off_regd, 1
    or   off_regd, 1
    add       r4d, 1
    cmp       r6d, 128
    je .nonnormal
    cmp       r5d, 128
    jne .normal
.nonnormal:
    sar       r5d, 1
    sar       r6d, 1
    sar  off_regd, 1
    sub       r4d, 1
.normal:
%if cpuflag(ssse3)
    movd       m4, r5d
    movd       m0, r6d
%else
    movd       m3, r5d
    movd       m4, r6d
%endif
    movd       m5, off_regd
    movd       m6, r4d
    pslld      m5, m6
    psrld      m5, 1
%if cpuflag(ssse3)
    punpcklbw  m4, m0               ; interleave: weightd byte, weights byte
    pshuflw    m4, m4, 0
    pshuflw    m5, m5, 0
    punpcklqdq m4, m4
    punpcklqdq m5, m5

%else
%if mmsize == 16
    pshuflw    m3, m3, 0
    pshuflw    m4, m4, 0
    pshuflw    m5, m5, 0
    punpcklqdq m3, m3
    punpcklqdq m4, m4
    punpcklqdq m5, m5
%else
    pshufw     m3, m3, 0
    pshufw     m4, m4, 0
    pshufw     m5, m5, 0
%endif
    pxor       m7, m7
%endif
%endmacro

; Weighted sum of half a register of dst and src pixels.
; %1 = result register number, %2 = scratch register number,
; %3 = offset (immediate or register) added to both r0 and r1.
; m%1 = [r0+%3] * m3 + [r1+%3] * m4 as words, added with signed saturation.
%macro BIWEIGHT_STEPA 3
    movh       m%1, [r0+%3]
    movh       m%2, [r1+%3]
    punpcklbw  m%1, m7
    punpcklbw  m%2, m7
    pmullw     m%1, m3
    pmullw     m%2, m4
    paddsw     m%1, m%2
%endmacro

; Finish the two sums left in m0 and m1 by BIWEIGHT_STEPA: add the bias m5,
; shift right arithmetically by m6 and pack to bytes with unsigned saturation
; into m0.
%macro BIWEIGHT_STEPB 0
    paddsw     m0, m5
    paddsw     m1, m5
    psraw      m0, m6
    psraw      m1, m6
    packuswb   m0, m1
%endmacro

; Emit h264_biweight_%1: one full register (mmsize pixels) per row.
; %1 = block width used in the function name, %2 = number of XMM registers
; passed to cglobal.
; r0 = dst, r1 = src, r2 = stride, r3d = height (row counter).
%macro BIWEIGHT_FUNC_MM 2
cglobal h264_biweight_%1, 7, 8, %2
    BIWEIGHT_SETUP
    movifnidn r3d, r3m              ; reload height (r3d holds off on x86-32)
.nextrow:
    BIWEIGHT_STEPA 0, 1, 0
    BIWEIGHT_STEPA 1, 2, mmsize/2
    BIWEIGHT_STEPB
    mova       [r0], m0
    add        r0, r2
    add        r1, r2
    dec        r3d
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM sse2
BIWEIGHT_FUNC_MM 16, 8

; Emit h264_biweight_%1: half a register per row, two rows per iteration.
; %1 = block width used in the function name, %2 = number of XMM registers
; passed to cglobal.
; r0 = dst, r1 = src, r2 = stride, r3d = height (halved: two rows per pass).
%macro BIWEIGHT_FUNC_HALF_MM 2
cglobal h264_biweight_%1, 7, 8, %2
    BIWEIGHT_SETUP
    movifnidn r3d, r3m              ; reload height (r3d holds off on x86-32)
    ; height is a 32-bit int: shift r3d, not r3, so that whatever sits in the
    ; upper half of the register on x86-64 cannot leak into the row counter.
    sar        r3d, 1
    lea        r4, [r2*2]           ; r4 is free: the shift count lives in m6
.nextrow:
    BIWEIGHT_STEPA 0, 1, 0
    BIWEIGHT_STEPA 1, 2, r2
    BIWEIGHT_STEPB
    movh       [r0], m0
%if mmsize == 16
    movhps     [r0+r2], m0
%else
    psrlq      m0, 32
    movh       [r0+r2], m0
%endif
    add        r0, r4
    add        r1, r4
    dec        r3d
    jnz .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
BIWEIGHT_FUNC_HALF_MM 4, 0
INIT_XMM sse2
BIWEIGHT_FUNC_HALF_MM 8, 8

; SSSE3 core: m0 and m2 hold dst/src bytes interleaved pairwise; pmaddubsw
; multiplies each pair by the (weightd, weights) bytes in m4 and adds the two
; products. The result is biased by m5, shifted right arithmetically by m6
; and packed to bytes with unsigned saturation into m0.
%macro BIWEIGHT_SSSE3_OP 0
    pmaddubsw  m0, m4
    pmaddubsw  m2, m4
    paddsw     m0, m5
    paddsw     m2, m5
    psraw      m0, m6
    psraw      m2, m6
    packuswb   m0, m2
%endmacro

; h264_biweight_16 (ssse3): 16 pixels per row, one row per iteration.
; r0 = dst, r1 = src, r2 = stride, r3d = height (row counter).
INIT_XMM ssse3
cglobal h264_biweight_16, 7, 8, 8
    BIWEIGHT_SETUP
    movifnidn r3d, r3m              ; reload height (r3d holds off on x86-32)

.nextrow:
    movh       m0, [r0]
    movh       m2, [r0+8]
    movh       m3, [r1+8]
    punpcklbw  m0, [r1]             ; dst[0..7] interleaved with src[0..7]
    punpcklbw  m2, m3               ; dst[8..15] interleaved with src[8..15]
    BIWEIGHT_SSSE3_OP
    mova       [r0], m0
    add        r0, r2
    add        r1, r2
    dec        r3d
    jnz .nextrow
    REP_RET

; h264_biweight_8 (ssse3): 8 pixels per row, two rows per iteration.
; r0 = dst, r1 = src, r2 = stride, r3d = height (halved: two rows per pass).
INIT_XMM ssse3
cglobal h264_biweight_8, 7, 8, 8
    BIWEIGHT_SETUP
    movifnidn r3d, r3m              ; reload height (r3d holds off on x86-32)
    ; height is a 32-bit int: shift r3d, not r3, so that whatever sits in the
    ; upper half of the register on x86-64 cannot leak into the row counter.
    sar        r3d, 1
    lea        r4, [r2*2]           ; r4 is free: the shift count lives in m6

.nextrow:
    movh       m0, [r0]
    movh       m1, [r1]
    movh       m2, [r0+r2]
    movh       m3, [r1+r2]
    punpcklbw  m0, m1               ; row 0: dst/src interleaved
    punpcklbw  m2, m3               ; row 1: dst/src interleaved
    BIWEIGHT_SSSE3_OP
    movh       [r0], m0
    movhps     [r0+r2], m0
    add        r0, r4
    add        r1, r4
    dec        r3d
    jnz .nextrow
    REP_RET