;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 weighted prediction code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

; 128-bit constant with the value 1 in its low qword; added to the shift-count
; register to turn log2_denom into log2_denom+1.
sq_1: dq 1
      dq 0

cextern pw_1
cextern pw_1023
%define pw_pixel_max pw_1023

SECTION .text

;-----------------------------------------------------------------------------
; void ff_h264_weight_16_10(uint8_t *dst, int stride, int height,
;                           int log2_denom, int weight, int offset);
;-----------------------------------------------------------------------------
; Register use shared by all weight functions:
;   r0 = dst, r1 = stride, r2 = height (row counter), r4 = weight, r5 = offset
;   r3 (log2_denom) is never loaded into a GPR here; WEIGHT_SETUP reads it
;   through r3m.

; Declares the registers and fetches the arguments that are not already in
; registers (movifnidn emits nothing when source and destination coincide).
%macro WEIGHT_PROLOGUE 0
.prologue:
    PROLOGUE 0,6,8
    movifnidn  r0, r0mp
    movifnidn r1d, r1m
    movifnidn r2d, r2m
    movifnidn r4d, r4m
    movifnidn r5d, r5m
%endmacro

; Builds the loop-invariant constants consumed by WEIGHT_OP:
;   m0 = every word (1 << log2_denom)
;   m2 = log2_denom + 1, the final right-shift count
;   m3 = every dword { low word: weight*2, high word: (offset<<3) + 1 }
;   m4 = pw_pixel_max, the upper clip bound
;   m7 = 0, the lower clip bound (only needed without SSE4)
; Clobbers r4 and r5.
%macro WEIGHT_SETUP 0
    mova       m0, [pw_1]
    movd       m2, r3m
    pslld      m0, m2       ; 1<<log2_denom
    SPLATW     m0, m0
    add       r4d, r4d      ; weight*2
    ; Keep weight*2 inside the low word. Without this a negative weight is
    ; sign-extended and the addition below borrows from the upper word,
    ; silently removing the rounding "+1" from (offset<<3)+1.
    movzx     r4d, r4w
    shl        r5, 19       ; *8, move to upper half of dword
    lea        r5, [r5+r4+0x10000]
    movd       m3, r5d      ; low word: weight<<1, high word: 1+(offset<<3)
    pshufd     m3, m3, 0
    mova       m4, [pw_pixel_max]
    paddw      m2, [sq_1]   ; log2_denom+1
%if notcpuflag(sse4)
    pxor       m7, m7
%endif
%endmacro

; Weights eight pixels and leaves the clipped result in m5.
;   WEIGHT_OP off        : the 8 words at [r0+off] (aligned load)
;   WEIGHT_OP off1, off2 : 4 words at [r0+off1] (low half of the result) and
;                          4 words at [r0+off2] (high half of the result)
; Each pixel word is interleaved with (1<<log2_denom) so that a single pmaddwd
; against m3 yields  pixel*(weight*2) + (1<<log2_denom)*((offset<<3)+1);
; the sum is then shifted right by log2_denom+1.
; Clobbers m5 and m6.
%macro WEIGHT_OP 1-2
%if %0==1
    mova        m5, [r0+%1]
    punpckhwd   m6, m5, m0
    punpcklwd   m5, m0
%else
    movq        m5, [r0+%1]
    movq        m6, [r0+%2]
    punpcklwd   m5, m0
    punpcklwd   m6, m0
%endif
    pmaddwd     m5, m3
    pmaddwd     m6, m3
    psrad       m5, m2
    psrad       m6, m2
%if cpuflag(sse4)
    packusdw    m5, m6      ; unsigned saturation clamps negatives to 0
    pminsw      m5, m4
%else
    packssdw    m5, m6
    CLIPW       m5, m7, m4
%endif
%endmacro

; 16-pixel-wide variant: two 8-pixel vectors (32 bytes) per row, one row per
; iteration. The loop body runs before height is tested.
%macro WEIGHT_FUNC_DBL 0
cglobal h264_weight_16_10
    WEIGHT_PROLOGUE
    WEIGHT_SETUP
.nextrow:
    WEIGHT_OP  0
    mova [r0   ], m5
    WEIGHT_OP 16
    mova [r0+16], m5
    add       r0, r1
    dec       r2d
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM sse2
WEIGHT_FUNC_DBL
INIT_XMM sse4
WEIGHT_FUNC_DBL


; 8-pixel-wide variant: one 8-pixel vector (16 bytes) per row, one row per
; iteration.
%macro WEIGHT_FUNC_MM 0
cglobal h264_weight_8_10
    WEIGHT_PROLOGUE
    WEIGHT_SETUP
.nextrow:
    WEIGHT_OP  0
    mova     [r0], m5
    add       r0, r1
    dec       r2d
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM sse2
WEIGHT_FUNC_MM
INIT_XMM sse4
WEIGHT_FUNC_MM


; 4-pixel-wide variant: two rows are packed into one vector, so the row
; counter is halved up front and dst advances by 2*stride per iteration.
%macro WEIGHT_FUNC_HALF_MM 0
cglobal h264_weight_4_10
    WEIGHT_PROLOGUE
    sar         r2d, 1      ; two rows per iteration
    WEIGHT_SETUP
    ; r3 is reused for 2*stride only after WEIGHT_SETUP has read log2_denom
    ; through r3m.
    lea         r3, [r1*2]
.nextrow:
    WEIGHT_OP    0, r1
    movh      [r0], m5      ; low half  -> first row
    movhps [r0+r1], m5      ; high half -> second row
    add         r0, r3
    dec         r2d
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM sse2
WEIGHT_FUNC_HALF_MM
INIT_XMM sse4
WEIGHT_FUNC_HALF_MM


;-----------------------------------------------------------------------------
; void ff_h264_biweight_16_10(uint8_t *dst, uint8_t
;                             *src, int stride,
;                             int height, int log2_denom, int weightd,
;                             int weights, int offset);
;-----------------------------------------------------------------------------
; Register use shared by all biweight functions:
;   r0 = dst, r1 = src, r2 = stride, r3 = height (row counter),
;   r5 = weightd, r6 = weights, t0 = offset
;   r4 (log2_denom) is never loaded into a GPR here; BIWEIGHT_SETUP reads it
;   through r4m.
; t0 is r3 on x86-32 and r7 on x86-64.
%if ARCH_X86_32
DECLARE_REG_TMP 3
%else
DECLARE_REG_TMP 7
%endif

; Declares the registers and fetches the arguments that are not already in
; registers. Height (r3) is deliberately not loaded here: on x86-32 t0 is r3
; and holds offset until BIWEIGHT_SETUP has consumed it.
%macro BIWEIGHT_PROLOGUE 0
.prologue:
    PROLOGUE 0,8,8
    movifnidn  r0, r0mp
    movifnidn  r1, r1mp
    movifnidn r2d, r2m
    movifnidn r5d, r5m
    movifnidn r6d, r6m
    movifnidn t0d, r7m
%endmacro

; Builds the loop-invariant constants consumed by BIWEIGHT:
;   m3 = pw_pixel_max, the upper clip bound
;   m4 = every dword { low word: weightd, high word: weights }
;   m5 = every dword (((offset<<2)+1)|1) << log2_denom
;   m6 = log2_denom + 1, the final right-shift count
;   m7 = 0, the lower clip bound (only needed without SSE4)
; Also loads height into r3. Clobbers r5, r6 and t0.
%macro BIWEIGHT_SETUP 0
    lea        t0, [t0*4+1] ; (offset<<2)+1
    or         t0, 1        ; redundant: (offset<<2)+1 is already odd
    ; Keep weightd inside the low word. Without this a negative weightd is
    ; sign-extended and its upper bits overwrite weights in the OR below
    ; (the high word would always become 0xffff, i.e. weights = -1).
    movzx     r5d, r5w
    shl        r6, 16
    or         r5, r6
    movd       m4, r5d      ; weights<<16 | (weightd & 0xffff)
    movd       m5, t0d      ; ((offset<<2)+1)|1
    movd       m6, r4m      ; log2_denom
    pslld      m5, m6       ; (((offset<<2)+1)|1)<<log2_denom
    paddd      m6, [sq_1]
    pshufd     m4, m4, 0
    pshufd     m5, m5, 0
    mova       m3, [pw_pixel_max]
    movifnidn r3d, r3m      ; height; safe now that t0 has been copied to m5
%if notcpuflag(sse4)
    pxor       m7, m7
%endif
%endmacro

; Blends eight dst/src pixel pairs and leaves the clipped result in m0.
;   BIWEIGHT off        : the 8 words at [r0+off] and [r1+off] (aligned loads)
;   BIWEIGHT off1, off2 : 4 words at offset off1 (low half of the result) and
;                         4 words at offset off2 (high half of the result)
; dst and src words are interleaved so that a single pmaddwd against m4 yields
; dst*weightd + src*weights; m5 is added and the sum is shifted right by
; log2_denom+1.
; Clobbers m0, m1 and m2.
%macro BIWEIGHT 1-2
%if %0==1
    mova        m0, [r0+%1]
    mova        m1, [r1+%1]
    punpckhwd   m2, m0, m1
    punpcklwd   m0, m1
%else
    movq        m0, [r0+%1]
    movq        m1, [r1+%1]
    punpcklwd   m0, m1
    movq        m2, [r0+%2]
    movq        m1, [r1+%2]
    punpcklwd   m2, m1
%endif
    pmaddwd     m0, m4
    pmaddwd     m2, m4
    paddd       m0, m5
    paddd       m2, m5
    psrad       m0, m6
    psrad       m2, m6
%if cpuflag(sse4)
    packusdw    m0, m2      ; unsigned saturation clamps negatives to 0
    pminsw      m0, m3
%else
    packssdw    m0, m2
    CLIPW       m0, m7, m3
%endif
%endmacro

; 16-pixel-wide variant: two 8-pixel vectors (32 bytes) per row, one row per
; iteration. The loop body runs before height is tested.
%macro BIWEIGHT_FUNC_DBL 0
cglobal h264_biweight_16_10
    BIWEIGHT_PROLOGUE
    BIWEIGHT_SETUP
.nextrow:
    BIWEIGHT   0
    mova [r0   ], m0
    BIWEIGHT  16
    mova [r0+16], m0
    add       r0, r2
    add       r1, r2
    dec       r3d
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM sse2
BIWEIGHT_FUNC_DBL
INIT_XMM sse4
BIWEIGHT_FUNC_DBL

; 8-pixel-wide variant: one 8-pixel vector (16 bytes) per row, one row per
; iteration.
%macro BIWEIGHT_FUNC 0
cglobal h264_biweight_8_10
    BIWEIGHT_PROLOGUE
    BIWEIGHT_SETUP
.nextrow:
    BIWEIGHT  0
    mova   [r0], m0
    add      r0, r2
    add      r1, r2
    dec      r3d
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM sse2
BIWEIGHT_FUNC
INIT_XMM sse4
BIWEIGHT_FUNC

; 4-pixel-wide variant: two rows are packed into one vector, so the row
; counter is halved and both pointers advance by 2*stride per iteration.
%macro BIWEIGHT_FUNC_HALF 0
cglobal h264_biweight_4_10
    BIWEIGHT_PROLOGUE
    BIWEIGHT_SETUP
    sar        r3d, 1       ; two rows per iteration
    ; r4 is reused for 2*stride only after BIWEIGHT_SETUP has read log2_denom
    ; through r4m.
    lea         r4, [r2*2]
.nextrow:
    BIWEIGHT     0, r2
    movh   [r0   ], m0      ; low half  -> first row
    movhps [r0+r2], m0      ; high half -> second row
    add         r0, r4
    add         r1, r4
    dec        r3d
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM sse2
BIWEIGHT_FUNC_HALF
INIT_XMM sse4
BIWEIGHT_FUNC_HALF