;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 intra prediction code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pw_1023
%define pw_pixel_max pw_1023
cextern pw_512
cextern pw_16
cextern pw_8
cextern pw_4
cextern pw_2
cextern pw_1
cextern pd_16

pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4
pw_m3:        times 8 dw -3
pd_17:        times 4 dd 17

SECTION .text

; dest, left, right, src
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
; Implemented as pavgw(src, (left + right) >> 1).
; Clobbers %2 (it holds the (left + right) >> 1 intermediate); %3 and %4 are
; only read, unless the caller passes the same register as %1.
%macro PRED4x4_LOWPASS 4
    paddw       %2, %3
    psrlw       %2, 1
    pavgw       %1, %4, %2
%endmacro

;-----------------------------------------------------------------------------
; void ff_pred4x4_down_right_10(pixel *src, const pixel *topright,
;                               ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Pixels are 16-bit words.  r0 is moved up one row so that [r0] is the row
; above the block and [r0+r2*k-8] ends at the pixel left of block row k-1.
; The topright argument (r1) is not read; r1 is reused as r0 + 2*stride.
; The four output rows are successive 2-byte shifts of one filtered vector,
; written from the bottom row upwards.
%macro PRED4x4_DR 0
cglobal pred4x4_down_right_10, 3, 3
    sub       r0, r2                    ; r0 = src - stride (row above)
    lea       r1, [r0+r2*2]             ; r1 = src + stride (block row 1)
    movhps    m1, [r1-8]                ; high word = pixel left of row 1
    movhps    m2, [r0+r2*1-8]           ; high word = pixel left of row 0
    movhps    m4, [r0-8]                ; high word = top-left pixel
    punpckhwd m2, m4
    movq      m3, [r0]                  ; 4 top pixels
    punpckhdq m1, m2
    PALIGNR   m3, m1, 10, m1
    movhps    m4, [r1+r2*1-8]           ; high word = pixel left of row 2
    PALIGNR   m0, m3, m4, 14, m4
    movhps    m4, [r1+r2*2-8]           ; high word = pixel left of row 3
    PALIGNR   m2, m0, m4, 14, m4
    PRED4x4_LOWPASS m0, m2, m3, m0
    movq      [r1+r2*2], m0             ; row 3
    psrldq    m0, 2
    movq      [r1+r2*1], m0             ; row 2
    psrldq    m0, 2
    movq      [r0+r2*2], m0             ; row 1
    psrldq    m0, 2
    movq      [r0+r2*1], m0             ; row 0
    RET
%endmacro

INIT_XMM sse2
PRED4x4_DR
INIT_XMM ssse3
PRED4x4_DR
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_DR
%endif

;------------------------------------------------------------------------------
; void ff_pred4x4_vertical_right_10(pixel *src, const pixel *topright,
;                                   ptrdiff_t stride)
;------------------------------------------------------------------------------
; The topright argument (r1) is not read; r1 is reused as r0 + 2*stride.
; The trailing comments show register contents, highest word first
; (t = top row, lt = top-left, l = left column).
%macro PRED4x4_VR 0
cglobal pred4x4_vertical_right_10, 3, 3, 6
    sub     r0, r2                      ; r0 = src - stride (row above)
    lea     r1, [r0+r2*2]               ; r1 = src + stride (block row 1)
    movq    m5, [r0]            ; ........t3t2t1t0
    movhps  m1, [r0-8]
    PALIGNR m0, m5, m1, 14, m1  ; ......t3t2t1t0lt
    pavgw   m5, m0                      ; 2-tap averages, stored as row 0
    movhps  m1, [r0+r2*1-8]
    PALIGNR m0, m1, 14, m1      ; ....t3t2t1t0ltl0
    movhps  m2, [r0+r2*2-8]
    PALIGNR m1, m0, m2, 14, m2  ; ..t3t2t1t0ltl0l1
    movhps  m3, [r1+r2*1-8]
    PALIGNR m2, m1, m3, 14, m3  ; t3t2t1t0ltl0l1l2
    PRED4x4_LOWPASS m1, m0, m2, m1
    pslldq  m0, m1, 12
    psrldq  m1, 4
    movq    [r0+r2*1], m5               ; row 0
    movq    [r0+r2*2], m1               ; row 1
    PALIGNR m5, m0, 14, m2
    pslldq  m0, 2
    movq    [r1+r2*1], m5               ; row 2
    PALIGNR m1, m0, 14, m0
    movq    [r1+r2*2], m1               ; row 3
    RET
%endmacro

INIT_XMM sse2
PRED4x4_VR
INIT_XMM ssse3
PRED4x4_VR
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_VR
%endif

;-------------------------------------------------------------------------------
; void ff_pred4x4_horizontal_down_10(pixel *src, const pixel *topright,
;                                    ptrdiff_t stride)
;-------------------------------------------------------------------------------
; The topright argument (r1) is not read; r1 is reused as r0 + 2*stride.
; The trailing comments show register contents, highest word first.
%macro PRED4x4_HD 0
cglobal pred4x4_horizontal_down_10, 3, 3
    sub        r0, r2                   ; r0 = src - stride (row above)
    lea        r1, [r0+r2*2]            ; r1 = src + stride (block row 1)
    movq       m0, [r0-8]      ; lt ..
    movhps     m0, [r0]
    pslldq     m0, 2           ; t2 t1 t0 lt .. .. .. ..
    movq       m1, [r1+r2*2-8] ; l3
    movq       m3, [r1+r2*1-8]
    punpcklwd  m1, m3          ; l2 l3
    movq       m2, [r0+r2*2-8] ; l1
    movq       m3, [r0+r2*1-8]
    punpcklwd  m2, m3          ; l0 l1
    punpckhdq  m1, m2          ; l0 l1 l2 l3
    punpckhqdq m1, m0          ; t2 t1 t0 lt l0 l1 l2 l3
    psrldq     m0, m1, 4       ; .. .. t2 t1 t0 lt l0 l1
    psrldq     m3, m1, 2       ; .. t2 t1 t0 lt l0 l1 l2
    pavgw      m5, m1, m3               ; 2-tap averages
    PRED4x4_LOWPASS m3, m1, m0, m3      ; 3-tap filtered values
    punpcklwd  m5, m3                   ; interleave averages with filtered values
    psrldq     m3, 8
    PALIGNR    m3, m5, 12, m4
    movq       [r1+r2*2], m5            ; row 3
    movhps     [r0+r2*2], m5            ; row 1
    psrldq     m5, 4
    movq       [r1+r2*1], m5            ; row 2
    movq       [r0+r2*1], m3            ; row 0
    RET
%endmacro

INIT_XMM sse2
PRED4x4_HD
INIT_XMM ssse3
PRED4x4_HD
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_HD
%endif

;-----------------------------------------------------------------------------
; void ff_pred4x4_dc_10(pixel *src, const pixel *topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------

INIT_MMX mmxext
; DC prediction: every output pixel is
; (sum of the 4 pixels above + sum of the 4 pixels to the left + 4) >> 3.
; The topright argument (r1) is not read; r1 is reused as r0 + 2*stride.
cglobal pred4x4_dc_10, 3, 3
    sub    r0, r2                       ; r0 = src - stride (row above)
    lea    r1, [r0+r2*2]                ; r1 = src + stride (block row 1)
    movq   m2, [r0+r2*1-8]              ; 4 words ending at the pixel left of row 0
    paddw  m2, [r0+r2*2-8]              ; + row 1
    paddw  m2, [r1+r2*1-8]              ; + row 2
    paddw  m2, [r1+r2*2-8]              ; + row 3
    psrlq  m2, 48                       ; keep only the top word: l0+l1+l2+l3
    movq   m0, [r0]                     ; 4 top pixels
    HADDW  m0, m1                       ; horizontal word sum (m1 is scratch)
    paddw  m0, [pw_4]                   ; rounding term
    paddw  m0, m2
    psrlw  m0, 3
    SPLATW m0, m0, 0                    ; broadcast word 0 (the DC value)
    movq   [r0+r2*1], m0
    movq   [r0+r2*2], m0
    movq   [r1+r2*1], m0
    movq   [r1+r2*2], m0
    RET

;-----------------------------------------------------------------------------
; void ff_pred4x4_down_left_10(pixel *src, const pixel *topright,
;                              ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Reads 4 pixels above the block and 4 pixels from topright (r1), low-pass
; filters the 8-pixel vector and writes successive 2-byte shifts of it.
; r1 is reused as r0 + 2*stride once topright has been loaded.
%macro PRED4x4_DL 0
cglobal pred4x4_down_left_10, 3, 3
    sub        r0, r2                   ; r0 = src - stride (row above)
    movq       m0, [r0]                 ; low half  = 4 top pixels
    movhps     m0, [r1]                 ; high half = 4 topright pixels
    psrldq     m2, m0, 2                ; next-pixel neighbours
    pslldq     m3, m0, 2                ; previous-pixel neighbours
    pshufhw    m2, m2, 10100100b        ; copy word 6 into the vacated word 7
    PRED4x4_LOWPASS m0, m3, m2, m0
    lea        r1, [r0+r2*2]            ; r1 = src + stride (block row 1)
    movhps     [r1+r2*2], m0            ; row 3 = high half
    psrldq     m0, 2
    movq       [r0+r2*1], m0            ; row 0
    psrldq     m0, 2
    movq       [r0+r2*2], m0            ; row 1
    psrldq     m0, 2
    movq       [r1+r2*1], m0            ; row 2
    RET
%endmacro

INIT_XMM sse2
PRED4x4_DL
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_DL
%endif

;-----------------------------------------------------------------------------
; void ff_pred4x4_vertical_left_10(pixel *src, const pixel *topright,
;                                  ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Rows 0 and 2 come from the 2-tap average (m4), rows 1 and 3 from the 3-tap
; low-pass (m0); rows 2 and 3 are the same vectors shifted by one pixel.
; r1 is reused as r0 + 2*stride once topright has been loaded.
%macro PRED4x4_VL 0
cglobal pred4x4_vertical_left_10, 3, 3
    sub        r0, r2                   ; r0 = src - stride (row above)
    movu       m1, [r0]                 ; unaligned load of the row above
    movhps     m1, [r1]                 ; high half replaced by 4 topright pixels
    psrldq     m0, m1, 2
    psrldq     m2, m1, 4
    pavgw      m4, m0, m1               ; 2-tap averages
    PRED4x4_LOWPASS m0, m1, m2, m0      ; 3-tap filtered values
    lea        r1, [r0+r2*2]            ; r1 = src + stride (block row 1)
    movq       [r0+r2*1], m4            ; row 0
    movq       [r0+r2*2], m0            ; row 1
    psrldq     m4, 2
    psrldq     m0, 2
    movq       [r1+r2*1], m4            ; row 2
    movq       [r1+r2*2], m0            ; row 3
    RET
%endmacro

INIT_XMM sse2
PRED4x4_VL
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_VL
%endif

;-----------------------------------------------------------------------------
; void
;      ff_pred4x4_horizontal_up_10(pixel *src, const pixel *topright,
;                                  ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Uses only the four pixels to the left of the block.  The topright argument
; (r1) is not read; r1 is reused as r0 + 2*stride.
INIT_MMX mmxext
cglobal pred4x4_horizontal_up_10, 3, 3
    sub       r0, r2                    ; r0 = src - stride (row above)
    lea       r1, [r0+r2*2]             ; r1 = src + stride (block row 1)
    movq      m0, [r0+r2*1-8]
    punpckhwd m0, [r0+r2*2-8]
    movq      m1, [r1+r2*1-8]
    punpckhwd m1, [r1+r2*2-8]
    punpckhdq m0, m1                    ; m0 = l0 l1 l2 l3 (lowest word first)
    pshufw    m1, m1, 0xFF              ; broadcast l3
    movq      [r1+r2*2], m1             ; row 3 = l3 in every pixel
    movd      [r1+r2*1+4], m1           ; right half of row 2 = l3
    pshufw    m2, m0, 11111001b         ; l1 l2 l3 l3
    movq      m1, m2
    pavgw     m2, m0                    ; 2-tap averages

    pshufw    m5, m0, 11111110b         ; l2 l3 l3 l3
    PRED4x4_LOWPASS m1, m0, m5, m1      ; 3-tap filtered values
    movq      m6, m2
    punpcklwd m6, m1                    ; interleave averages with filtered values
    movq      [r0+r2*1], m6             ; row 0
    psrlq     m2, 16
    psrlq     m1, 16
    punpcklwd m2, m1
    movq      [r0+r2*2], m2             ; row 1
    psrlq     m2, 32
    movd      [r1+r2*1], m2             ; left half of row 2
    RET



;-----------------------------------------------------------------------------
; void ff_pred8x8_vertical_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Copies the 8-pixel row above the block into all 8 rows of the block.
; Uses aligned (mova) loads and stores.
INIT_XMM sse2
cglobal pred8x8_vertical_10, 2, 2
    sub  r0, r1                         ; r0 = src - stride (row above)
    mova m0, [r0]
%rep 3
    mova [r0+r1*1], m0
    mova [r0+r1*2], m0
    lea  r0, [r0+r1*2]                  ; advance two rows
%endrep
    mova [r0+r1*1], m0
    mova [r0+r1*2], m0
    RET

;-----------------------------------------------------------------------------
; void ff_pred8x8_horizontal_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Fills each of the 8 rows with the pixel immediately to its left.
; Each loop iteration handles two rows; r2d counts the 4 iterations.
INIT_XMM sse2
cglobal pred8x8_horizontal_10, 2, 3
    mov         r2d, 4
.loop:
    movq         m0, [r0+r1*0-8]        ; 4 words ending at the left neighbour
    movq         m1, [r0+r1*1-8]
    pshuflw      m0, m0, 0xff           ; broadcast word 3 over the low half
    pshuflw      m1, m1, 0xff
    punpcklqdq   m0, m0                 ; duplicate the low half into the high half
    punpcklqdq   m1, m1
    mova  [r0+r1*0], m0
    mova  [r0+r1*1], m1
    lea          r0, [r0+r1*2]
    dec          r2d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void ff_pred8x8_dc_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; MOV8 dst_address, reg [, unused]: aligned 16-byte store of %2 to [%1].
; The optional third argument is accepted but ignored.
%macro MOV8 2-3
; sort of a hack, but it works
    movdqa    [%1], %2
%endmacro

; %1 = word-shuffle instruction to use (instantiated below with pshuflw).
; s0/s1 are the sums of the left/right 4 pixels of the row above,
; s2/s3 the sums of the upper/lower 4 pixels of the left column.
%macro PRED8x8_DC 1
cglobal pred8x8_dc_10, 2, 6
    sub         r0, r1                  ; r0 = src - stride (row above)
    pxor        m4, m4                  ; zero, used by pavgw for rounding
    movq        m0, [r0+0]
    movq        m1, [r0+8]
    punpcklwd   m0, m1
    movhlps     m1, m0
    paddw       m0, m1
    %1          m2, m0, 00001110b
    paddw       m0, m2                  ; words 0,1 = s0, s1

    lea         r5, [r1*3]
    lea         r4, [r0+r1*4]
    movzx      r2d, word [r0+r1*1-2]    ; left neighbours of rows 0..3
    movzx      r3d, word [r0+r1*2-2]
    add        r2d, r3d
    movzx      r3d, word [r0+r5*1-2]
    add        r2d, r3d
    movzx      r3d, word [r4-2]
    add        r2d, r3d
    movd        m2, r2d            ; s2

    movzx      r2d, word [r4+r1*1-2]    ; left neighbours of rows 4..7
    movzx      r3d, word [r4+r1*2-2]
    add        r2d, r3d
    movzx      r3d, word [r4+r5*1-2]
    add        r2d, r3d
    movzx      r3d, word [r4+r1*4-2]
    add        r2d, r3d
    movd        m3, r2d            ; s3

    punpcklwd   m2, m3
    punpckldq   m0, m2            ; s0, s1, s2, s3
    %1          m3, m0, 11110110b ; s2, s1, s3, s3
    %1          m0, m0, 01110100b ; s0, s1, s3, s1
    paddw       m0, m3
    psrlw       m0, 2
    pavgw       m0, m4            ; s0+s2, s1, s3, s1+s3
    punpcklwd   m0, m0
    pshufd      m3, m0, 11111010b       ; DC values for rows 4..7
    punpckldq   m0, m0                  ; DC values for rows 0..3
    SWAP         0,1
    MOV8   r0+r1*1, m1, m2
    MOV8   r0+r1*2, m1, m2
    MOV8   r0+r5*1, m1, m2
    MOV8   r0+r1*4, m1, m2
    MOV8   r4+r1*1, m3, m4
    MOV8   r4+r1*2, m3, m4
    MOV8   r4+r5*1, m3, m4
    MOV8   r4+r1*4, m3, m4
    RET
%endmacro

INIT_XMM sse2
PRED8x8_DC pshuflw

;-----------------------------------------------------------------------------
; void ff_pred8x8_top_dc_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Each 4-pixel half of every output row is (sum of the 4 pixels above that
; half + 2) >> 2.  The two halves are computed independently.
INIT_XMM sse2
cglobal pred8x8_top_dc_10, 2, 4
    sub         r0, r1                  ; r0 = src - stride (row above)
    mova        m0, [r0]
    pshuflw     m1, m0, 0x4e            ; swap word pairs within each half
    pshufhw     m1, m1, 0x4e
    paddw       m0, m1
    pshuflw     m1, m0, 0xb1            ; swap adjacent words within each half
    pshufhw     m1, m1, 0xb1
    paddw       m0, m1                  ; every word = sum of its 4-word half
    lea         r2, [r1*3]
    lea         r3, [r0+r1*4]
    paddw       m0, [pw_2]              ; rounding term
    psrlw       m0, 2
    mova [r0+r1*1], m0
    mova [r0+r1*2], m0
    mova [r0+r2*1], m0
    mova [r0+r1*4], m0
    mova [r3+r1*1], m0
    mova [r3+r1*2], m0
    mova [r3+r2*1], m0
    mova [r3+r1*4], m0
    RET

;-----------------------------------------------------------------------------
; void ff_pred8x8_plane_10(pixel
;                          *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Plane prediction.  H and V are weighted differences along the row above and
; the left column; b and c are (17*H + 16) >> 5 and (17*V + 16) >> 5.
; Each output row is clipped to [0, pw_pixel_max].
; r2 is first 3*stride, then reused as the row counter of the store loop.
INIT_XMM sse2
cglobal pred8x8_plane_10, 2, 7, 7
    sub       r0, r1                    ; r0 = src - stride (row above)
    lea       r2, [r1*3]
    lea       r3, [r0+r1*4]
    mova      m2, [r0]
    pmaddwd   m2, [pw_m32101234]
    HADDD     m2, m1
    movd      m0, [r0-4]
    psrld     m0, 14
    psubw     m2, m0               ; H
    movd      m0, [r3+r1*4-4]
    movd      m1, [r0+12]
    paddw     m0, m1
    psllw     m0, 4                ; 16*(src[7*stride-1] + src[-stride+7])
    movzx    r4d, word [r3+r1*1-2] ; src[4*stride-1]
    movzx    r5d, word [r0+r2*1-2] ; src[2*stride-1]
    sub      r4d, r5d
    movzx    r6d, word [r3+r1*2-2] ; src[5*stride-1]
    movzx    r5d, word [r0+r1*2-2] ; src[1*stride-1]
    sub      r6d, r5d
    lea      r4d, [r4+r6*2]
    movzx    r5d, word [r3+r2*1-2] ; src[6*stride-1]
    movzx    r6d, word [r0+r1*1-2] ; src[0*stride-1]
    sub      r5d, r6d
    lea      r5d, [r5*3]
    add      r4d, r5d
    movzx    r6d, word [r3+r1*4-2] ; src[7*stride-1]
    movzx    r5d, word [r0+r1*0-2] ; src[ -stride-1]
    sub      r6d, r5d
    lea      r4d, [r4+r6*4]
    movd      m3, r4d              ; V
    punpckldq m2, m3
    pmaddwd   m2, [pd_17]
    paddd     m2, [pd_16]
    psrad     m2, 5                ; b, c

    mova      m3, [pw_pixel_max]        ; upper clip bound
    pxor      m1, m1                    ; lower clip bound (0)
    SPLATW    m0, m0, 1
    SPLATW    m4, m2, 2
    SPLATW    m2, m2, 0
    pmullw    m2, [pw_m32101234]   ; b
    pmullw    m5, m4, [pw_m3]      ; c
    paddw     m5, [pw_16]               ; rounding term for the final >> 5
    mov      r2d, 8                     ; 8 rows to write
    add       r0, r1                    ; r0 = first row of the block
.loop:
    paddsw    m6, m2, m5
    paddsw    m6, m0
    psraw     m6, 5
    CLIPW     m6, m1, m3
    mova    [r0], m6
    paddw     m5, m4                    ; advance the vertical term by c
    add       r0, r1
    dec r2d
    jg .loop
    REP_RET


;-----------------------------------------------------------------------------
; void ff_pred8x8l_128_dc_10(pixel *src, int has_topleft, int has_topright,
;                            ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Fills all 8 rows with the constant pw_512.  has_topleft and has_topright
; are not read; their registers (r1, r2) are reused as 3*stride and
; src + 4*stride.
INIT_XMM sse2
cglobal pred8x8l_128_dc_10, 4, 4
    mova      m0, [pw_512] ; (1<<(BIT_DEPTH-1))
    lea       r1, [r3*3]
    lea       r2, [r0+r3*4]
    MOV8 r0+r3*0, m0, m0
    MOV8 r0+r3*1, m0, m0
    MOV8 r0+r3*2, m0, m0
    MOV8 r0+r1*1, m0, m0
    MOV8 r2+r3*0, m0, m0
    MOV8 r2+r3*1, m0, m0
    MOV8 r2+r3*2, m0, m0
    MOV8 r2+r1*1, m0, m0
    RET

;-----------------------------------------------------------------------------
; void ff_pred8x8l_top_dc_10(pixel *src, int has_topleft, int has_topright,
;                            ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Low-pass filters the row above the block, then fills the block with
; (sum of the 8 filtered pixels + 4) >> 3.
; r1 and r2 first hold byte offsets derived from the has_* flags and are then
; reused as 3*stride and r0 + 4*stride.
; NOTE(review): the shifts by 14 and 13 presumably turn single-bit flags
; (0x8000 / 0x4000) into 0 or 2 bytes, i.e. zero or one pixel -- confirm
; against the caller.
%macro PRED8x8L_TOP_DC 0
cglobal pred8x8l_top_dc_10, 4, 4, 6
    sub         r0, r3                  ; r0 = src - stride (row above)
    mova        m0, [r0]
    shr        r1d, 14
    shr        r2d, 13
    neg         r1                      ; offset towards the top-left pixel
    pslldq      m1, m0, 2               ; previous-pixel neighbours
    psrldq      m2, m0, 2               ; next-pixel neighbours
    pinsrw      m1, [r0+r1], 0          ; neighbour of the first pixel
    pinsrw      m2, [r0+r2+14], 7       ; neighbour of the last pixel
    lea         r1, [r3*3]
    lea         r2, [r0+r3*4]
    PRED4x4_LOWPASS m0, m2, m1, m0
    HADDW       m0, m1                  ; horizontal word sum (m1 is scratch)
    paddw       m0, [pw_4]              ; rounding term
    psrlw       m0, 3
    SPLATW      m0, m0, 0               ; broadcast word 0 (the DC value)
    mova [r0+r3*1], m0
    mova [r0+r3*2], m0
    mova [r0+r1*1], m0
    mova [r0+r3*4], m0
    mova [r2+r3*1], m0
    mova [r2+r3*2], m0
    mova [r2+r1*1], m0
    mova [r2+r3*4], m0
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_TOP_DC
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_TOP_DC
%endif

;-------------------------------------------------------------------------------
; void ff_pred8x8l_dc_10(pixel *src, int has_topleft, int has_topright,
;                        ptrdiff_t stride)
;-------------------------------------------------------------------------------
; Low-pass filters both the left column and the row above, then fills the
; block with (sum of the 16 filtered pixels + 8) >> 4.
; NOTE(review): the shifts by 14 and 13 presumably turn single-bit flags
; (0x8000 / 0x4000) into 0 or 2 bytes, i.e. zero or one pixel -- confirm
; against the caller.
;TODO: see if scalar is faster
%macro PRED8x8L_DC 0
cglobal pred8x8l_dc_10, 4, 6, 6
    sub         r0, r3                  ; r0 = src - stride (row above)
    lea         r4, [r0+r3*4]
    lea         r5, [r3*3]
    ; gather the 8 left neighbours (the high word of each load) into m3
    mova        m0, [r0+r3*2-16]
    punpckhwd   m0, [r0+r3*1-16]
    mova        m1, [r4+r3*0-16]
    punpckhwd   m1, [r0+r5*1-16]
    punpckhdq   m1, m0
    mova        m2, [r4+r3*2-16]
    punpckhwd   m2, [r4+r3*1-16]
    mova        m3, [r4+r3*4-16]
    punpckhwd   m3, [r4+r5*1-16]
    punpckhdq   m3, m2
    punpckhqdq  m3, m1
    mova        m0, [r0]                ; row above
    shr        r1d, 14
    shr        r2d, 13
    neg         r1                      ; offset towards the top-left pixel
    pslldq      m1, m0, 2
    psrldq      m2, m0, 2
    pinsrw      m1, [r0+r1], 0          ; neighbour of the first top pixel
    pinsrw      m2, [r0+r2+14], 7       ; neighbour of the last top pixel
    not         r1
    and         r1, r3                  ; NOTE(review): presumably 0 with a top-left
                                        ; pixel and stride without -- confirm
    pslldq      m4, m3, 2
    psrldq      m5, m3, 2
    pshuflw     m4, m4, 11100101b       ; copy word 1 into the vacated word 0
    pinsrw      m5, [r0+r1-2], 7
    PRED4x4_LOWPASS m3, m4, m5, m3      ; filtered left column
    PRED4x4_LOWPASS m0, m2, m1, m0      ; filtered top row
    paddw       m0, m3
    HADDW       m0, m1                  ; horizontal word sum (m1 is scratch)
    paddw       m0, [pw_8]              ; rounding term
    psrlw       m0, 4
    SPLATW      m0, m0
    mova [r0+r3*1], m0
    mova [r0+r3*2], m0
    mova [r0+r5*1], m0
    mova [r0+r3*4], m0
    mova [r4+r3*1], m0
    mova [r4+r3*2], m0
    mova [r4+r5*1], m0
    mova [r4+r3*4], m0
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_DC
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_DC
%endif

;-----------------------------------------------------------------------------
; void ff_pred8x8l_vertical_10(pixel *src, int has_topleft, int has_topright,
;                              ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Low-pass filters the row above the block and copies the filtered row into
; all 8 rows.  r1 and r2 first hold byte offsets derived from the has_* flags
; and are then reused as 3*stride and r0 + 4*stride.
; NOTE(review): flag encoding assumed as in the note on pred8x8l_dc_10's
; shifts (single bits giving 0 or one pixel) -- confirm against the caller.
%macro PRED8x8L_VERTICAL 0
cglobal pred8x8l_vertical_10, 4, 4, 6
    sub         r0, r3                  ; r0 = src - stride (row above)
    mova        m0, [r0]
    shr        r1d, 14
    shr        r2d, 13
    neg         r1                      ; offset towards the top-left pixel
    pslldq      m1, m0, 2
    psrldq      m2, m0, 2
    pinsrw      m1, [r0+r1], 0          ; neighbour of the first pixel
    pinsrw      m2, [r0+r2+14], 7       ; neighbour of the last pixel
    lea         r1, [r3*3]
    lea         r2, [r0+r3*4]
    PRED4x4_LOWPASS m0, m2, m1, m0
    mova [r0+r3*1], m0
    mova [r0+r3*2], m0
    mova [r0+r1*1], m0
    mova [r0+r3*4], m0
    mova [r2+r3*1], m0
    mova [r2+r3*2], m0
    mova [r2+r1*1], m0
    mova [r2+r3*4], m0
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_VERTICAL
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_VERTICAL
%endif

;-----------------------------------------------------------------------------
; void ff_pred8x8l_horizontal_10(pixel *src, int has_topleft,
;                                int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Low-pass filters the left column, then fills each row with its filtered
; left neighbour.  Unlike the other 8x8l functions, r0 is not moved up one
; row here.  has_topright (r2) is not read; r2 is reused as src + 4*stride.
%macro PRED8x8L_HORIZONTAL 0
cglobal pred8x8l_horizontal_10, 4, 4, 5
    mova        m0, [r0-16]
    shr        r1d, 14
    dec         r1
    and         r1, r3
    sub         r1, r3                  ; NOTE(review): presumably -stride with a
                                        ; top-left pixel and 0 without -- confirm
    punpckhwd   m0, [r0+r1-16]
    mova        m1, [r0+r3*2-16]
    punpckhwd   m1, [r0+r3*1-16]
    lea         r2, [r0+r3*4]
    lea         r1, [r3*3]
    punpckhdq   m1, m0
    mova        m2, [r2+r3*0-16]
    punpckhwd   m2, [r0+r1-16]
    mova        m3, [r2+r3*2-16]
    punpckhwd   m3, [r2+r3*1-16]
    punpckhdq   m3, m2
    punpckhqdq  m3, m1
    PALIGNR     m4, m3, [r2+r1-16], 14, m0
    pslldq      m0, m4, 2
    pshuflw     m0, m0, 11100101b       ; copy word 1 into the vacated word 0
    PRED4x4_LOWPASS m4, m3, m0, m4
    punpckhwd   m3, m4, m4              ; each filtered pixel duplicated
    punpcklwd   m4, m4
    pshufd      m0, m3, 0xff            ; broadcast one dword per row
    pshufd      m1, m3, 0xaa
    pshufd      m2, m3, 0x55
    pshufd      m3, m3, 0x00
    mova [r0+r3*0], m0
    mova [r0+r3*1], m1
    mova [r0+r3*2], m2
    mova [r0+r1*1], m3
    pshufd      m0, m4, 0xff
    pshufd      m1, m4, 0xaa
    pshufd      m2, m4, 0x55
    pshufd      m3, m4, 0x00
    mova [r2+r3*0], m0
    mova [r2+r3*1], m1
    mova [r2+r3*2], m2
    mova [r2+r1*1], m3
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_HORIZONTAL
INIT_XMM ssse3
PRED8x8L_HORIZONTAL
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_HORIZONTAL
%endif

;-----------------------------------------------------------------------------
; void ff_pred8x8l_down_left_10(pixel *src, int has_topleft, int has_topright,
;                               ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_DOWN_LEFT 0
cglobal pred8x8l_down_left_10, 4, 4, 7
    sub         r0, r3
    mova        m3, [r0]
    shr        r1d, 14
neg r1 709cabdff1aSopenharmony_ci shr r2d, 13 710cabdff1aSopenharmony_ci pslldq m1, m3, 2 711cabdff1aSopenharmony_ci psrldq m2, m3, 2 712cabdff1aSopenharmony_ci pinsrw m1, [r0+r1], 0 713cabdff1aSopenharmony_ci pinsrw m2, [r0+r2+14], 7 714cabdff1aSopenharmony_ci PRED4x4_LOWPASS m6, m2, m1, m3 715cabdff1aSopenharmony_ci jz .fix_tr ; flags from shr r2d 716cabdff1aSopenharmony_ci mova m1, [r0+16] 717cabdff1aSopenharmony_ci psrldq m5, m1, 2 718cabdff1aSopenharmony_ci PALIGNR m2, m1, m3, 14, m3 719cabdff1aSopenharmony_ci pshufhw m5, m5, 10100100b 720cabdff1aSopenharmony_ci PRED4x4_LOWPASS m1, m2, m5, m1 721cabdff1aSopenharmony_ci.do_topright: 722cabdff1aSopenharmony_ci lea r1, [r3*3] 723cabdff1aSopenharmony_ci psrldq m5, m1, 14 724cabdff1aSopenharmony_ci lea r2, [r0+r3*4] 725cabdff1aSopenharmony_ci PALIGNR m2, m1, m6, 2, m0 726cabdff1aSopenharmony_ci PALIGNR m3, m1, m6, 14, m0 727cabdff1aSopenharmony_ci PALIGNR m5, m1, 2, m0 728cabdff1aSopenharmony_ci pslldq m4, m6, 2 729cabdff1aSopenharmony_ci PRED4x4_LOWPASS m6, m4, m2, m6 730cabdff1aSopenharmony_ci PRED4x4_LOWPASS m1, m3, m5, m1 731cabdff1aSopenharmony_ci mova [r2+r3*4], m1 732cabdff1aSopenharmony_ci PALIGNR m1, m6, 14, m2 733cabdff1aSopenharmony_ci pslldq m6, 2 734cabdff1aSopenharmony_ci mova [r2+r1*1], m1 735cabdff1aSopenharmony_ci PALIGNR m1, m6, 14, m2 736cabdff1aSopenharmony_ci pslldq m6, 2 737cabdff1aSopenharmony_ci mova [r2+r3*2], m1 738cabdff1aSopenharmony_ci PALIGNR m1, m6, 14, m2 739cabdff1aSopenharmony_ci pslldq m6, 2 740cabdff1aSopenharmony_ci mova [r2+r3*1], m1 741cabdff1aSopenharmony_ci PALIGNR m1, m6, 14, m2 742cabdff1aSopenharmony_ci pslldq m6, 2 743cabdff1aSopenharmony_ci mova [r0+r3*4], m1 744cabdff1aSopenharmony_ci PALIGNR m1, m6, 14, m2 745cabdff1aSopenharmony_ci pslldq m6, 2 746cabdff1aSopenharmony_ci mova [r0+r1*1], m1 747cabdff1aSopenharmony_ci PALIGNR m1, m6, 14, m2 748cabdff1aSopenharmony_ci pslldq m6, 2 749cabdff1aSopenharmony_ci mova [r0+r3*2], m1 750cabdff1aSopenharmony_ci PALIGNR m1, m6, 
14, m6 751cabdff1aSopenharmony_ci mova [r0+r3*1], m1 752cabdff1aSopenharmony_ci RET 753cabdff1aSopenharmony_ci.fix_tr: 754cabdff1aSopenharmony_ci punpckhwd m3, m3 755cabdff1aSopenharmony_ci pshufd m1, m3, 0xFF 756cabdff1aSopenharmony_ci jmp .do_topright 757cabdff1aSopenharmony_ci%endmacro 758cabdff1aSopenharmony_ci 759cabdff1aSopenharmony_ciINIT_XMM sse2 760cabdff1aSopenharmony_ciPRED8x8L_DOWN_LEFT 761cabdff1aSopenharmony_ciINIT_XMM ssse3 762cabdff1aSopenharmony_ciPRED8x8L_DOWN_LEFT 763cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL 764cabdff1aSopenharmony_ciINIT_XMM avx 765cabdff1aSopenharmony_ciPRED8x8L_DOWN_LEFT 766cabdff1aSopenharmony_ci%endif 767cabdff1aSopenharmony_ci 768cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 769cabdff1aSopenharmony_ci; void ff_pred8x8l_down_right_10(pixel *src, int has_topleft, 770cabdff1aSopenharmony_ci; int has_topright, ptrdiff_t stride) 771cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 772cabdff1aSopenharmony_ci%macro PRED8x8L_DOWN_RIGHT 0 773cabdff1aSopenharmony_ci; standard forbids this when has_topleft is false 774cabdff1aSopenharmony_ci; no need to check 775cabdff1aSopenharmony_cicglobal pred8x8l_down_right_10, 4, 5, 8 776cabdff1aSopenharmony_ci sub r0, r3 777cabdff1aSopenharmony_ci lea r4, [r0+r3*4] 778cabdff1aSopenharmony_ci lea r1, [r3*3] 779cabdff1aSopenharmony_ci mova m0, [r0+r3*1-16] 780cabdff1aSopenharmony_ci punpckhwd m0, [r0+r3*0-16] 781cabdff1aSopenharmony_ci mova m1, [r0+r1*1-16] 782cabdff1aSopenharmony_ci punpckhwd m1, [r0+r3*2-16] 783cabdff1aSopenharmony_ci punpckhdq m1, m0 784cabdff1aSopenharmony_ci mova m2, [r4+r3*1-16] 785cabdff1aSopenharmony_ci punpckhwd m2, [r4+r3*0-16] 786cabdff1aSopenharmony_ci mova m3, [r4+r1*1-16] 787cabdff1aSopenharmony_ci punpckhwd m3, [r4+r3*2-16] 788cabdff1aSopenharmony_ci punpckhdq m3, m2 789cabdff1aSopenharmony_ci punpckhqdq m3, m1 790cabdff1aSopenharmony_ci mova m0, 
[r4+r3*4-16] 791cabdff1aSopenharmony_ci mova m1, [r0] 792cabdff1aSopenharmony_ci PALIGNR m4, m3, m0, 14, m0 793cabdff1aSopenharmony_ci PALIGNR m1, m3, 2, m2 794cabdff1aSopenharmony_ci pslldq m0, m4, 2 795cabdff1aSopenharmony_ci pshuflw m0, m0, 11100101b 796cabdff1aSopenharmony_ci PRED4x4_LOWPASS m6, m1, m4, m3 797cabdff1aSopenharmony_ci PRED4x4_LOWPASS m4, m3, m0, m4 798cabdff1aSopenharmony_ci mova m3, [r0] 799cabdff1aSopenharmony_ci shr r2d, 13 800cabdff1aSopenharmony_ci pslldq m1, m3, 2 801cabdff1aSopenharmony_ci psrldq m2, m3, 2 802cabdff1aSopenharmony_ci pinsrw m1, [r0-2], 0 803cabdff1aSopenharmony_ci pinsrw m2, [r0+r2+14], 7 804cabdff1aSopenharmony_ci PRED4x4_LOWPASS m3, m2, m1, m3 805cabdff1aSopenharmony_ci PALIGNR m2, m3, m6, 2, m0 806cabdff1aSopenharmony_ci PALIGNR m5, m3, m6, 14, m0 807cabdff1aSopenharmony_ci psrldq m7, m3, 2 808cabdff1aSopenharmony_ci PRED4x4_LOWPASS m6, m4, m2, m6 809cabdff1aSopenharmony_ci PRED4x4_LOWPASS m3, m5, m7, m3 810cabdff1aSopenharmony_ci mova [r4+r3*4], m6 811cabdff1aSopenharmony_ci PALIGNR m3, m6, 14, m2 812cabdff1aSopenharmony_ci pslldq m6, 2 813cabdff1aSopenharmony_ci mova [r0+r3*1], m3 814cabdff1aSopenharmony_ci PALIGNR m3, m6, 14, m2 815cabdff1aSopenharmony_ci pslldq m6, 2 816cabdff1aSopenharmony_ci mova [r0+r3*2], m3 817cabdff1aSopenharmony_ci PALIGNR m3, m6, 14, m2 818cabdff1aSopenharmony_ci pslldq m6, 2 819cabdff1aSopenharmony_ci mova [r0+r1*1], m3 820cabdff1aSopenharmony_ci PALIGNR m3, m6, 14, m2 821cabdff1aSopenharmony_ci pslldq m6, 2 822cabdff1aSopenharmony_ci mova [r0+r3*4], m3 823cabdff1aSopenharmony_ci PALIGNR m3, m6, 14, m2 824cabdff1aSopenharmony_ci pslldq m6, 2 825cabdff1aSopenharmony_ci mova [r4+r3*1], m3 826cabdff1aSopenharmony_ci PALIGNR m3, m6, 14, m2 827cabdff1aSopenharmony_ci pslldq m6, 2 828cabdff1aSopenharmony_ci mova [r4+r3*2], m3 829cabdff1aSopenharmony_ci PALIGNR m3, m6, 14, m6 830cabdff1aSopenharmony_ci mova [r4+r1*1], m3 831cabdff1aSopenharmony_ci RET 832cabdff1aSopenharmony_ci%endmacro 
INIT_XMM sse2
PRED8x8L_DOWN_RIGHT
INIT_XMM ssse3
PRED8x8L_DOWN_RIGHT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_DOWN_RIGHT
%endif

;-----------------------------------------------------------------------------
; void ff_pred8x8l_vertical_right_10(pixel *src, int has_topleft,
;                                    int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Gathers and filters the left column and the top row. Block row 0 is the
; pavgw of two neighbouring edge vectors and block row 1 is their low-pass;
; each later row is the row two above it shifted by one pixel, with a
; filtered left-column sample shifted in (m2 and m0 are updated alternately).
; r1 (has_topleft) is overwritten without being read and the top-left pixel
; at [r0-2] is loaded unconditionally.
%macro PRED8x8L_VERTICAL_RIGHT 0
; likewise with 8x8l_down_right
cglobal pred8x8l_vertical_right_10, 4, 5, 7
    sub         r0, r3                  ; r0 -> row above the block
    lea         r4, [r0+r3*4]           ; r4 = r0 + 4*stride
    lea         r1, [r3*3]              ; r1 = 3*stride
    mova        m0, [r0+r3*1-16]
    punpckhwd   m0, [r0+r3*0-16]
    mova        m1, [r0+r1*1-16]
    punpckhwd   m1, [r0+r3*2-16]
    punpckhdq   m1, m0
    mova        m2, [r4+r3*1-16]
    punpckhwd   m2, [r4+r3*0-16]
    mova        m3, [r4+r1*1-16]
    punpckhwd   m3, [r4+r3*2-16]
    punpckhdq   m3, m2
    punpckhqdq  m3, m1                  ; m3 = left pixels of rows 6..0, then top-left
    mova        m0, [r4+r3*4-16]        ; 16 bytes ending at the start of row 7
    mova        m1, [r0]
    PALIGNR     m4, m3, m0, 14, m0
    PALIGNR     m1, m3, 2, m2
    PRED4x4_LOWPASS m3, m1, m4, m3      ; m3 = filtered left column / top-left
    mova        m2, [r0]                ; m2 = t[0..7]
    shr        r2d, 13
    pslldq      m1, m2, 2
    psrldq      m5, m2, 2
    pinsrw      m1, [r0-2], 0           ; top-left pixel
    pinsrw      m5, [r0+r2+14], 7
    PRED4x4_LOWPASS m2, m5, m1, m2      ; m2 = filtered top row
    PALIGNR     m6, m2, m3, 12, m1
    PALIGNR     m5, m2, m3, 14, m0
    PRED4x4_LOWPASS m0, m6, m2, m5
    pavgw       m2, m5
    mova [r0+r3*2], m0                  ; block row 1
    mova [r0+r3*1], m2                  ; block row 0
    pslldq      m6, m3, 4
    pslldq      m1, m3, 2
    PRED4x4_LOWPASS m1, m3, m6, m1      ; m1 = samples shifted into the later rows
    PALIGNR     m2, m1, 14, m4
    mova [r0+r1*1], m2
    pslldq      m1, 2
    PALIGNR     m0, m1, 14, m3
    mova [r0+r3*4], m0
    pslldq      m1, 2
    PALIGNR     m2, m1, 14, m4
    mova [r4+r3*1], m2
    pslldq      m1, 2
    PALIGNR     m0, m1, 14, m3
    mova [r4+r3*2], m0
    pslldq      m1, 2
    PALIGNR     m2, m1, 14, m4
    mova [r4+r1*1], m2
    pslldq      m1, 2
    PALIGNR     m0, m1, 14, m1
    mova [r4+r3*4], m0
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_VERTICAL_RIGHT
INIT_XMM ssse3
PRED8x8L_VERTICAL_RIGHT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_VERTICAL_RIGHT
%endif

;-----------------------------------------------------------------------------
; void ff_pred8x8l_horizontal_up_10(pixel *src, int has_topleft,
;                                   int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Gathers the left column in ascending row order, low-pass filters it, then
; interleaves the pairwise average (pavgw) with a second low-pass of the
; filtered column. Row 0 and row 4 are the two interleaved halves; rows 1-3
; are byte-shifted views of the m5:m4 pair and rows 5-7 are dword shuffles of
; m5 that replicate its last dword.
; r2 (has_topright) is not read; it is reused as a row pointer.
; NOTE(review): r1 becomes -stride (row above) or 0 (row 0) only if
; has_topleft is 0/0x8000 and stride is even -- confirm against the C caller.
%macro PRED8x8L_HORIZONTAL_UP 0
cglobal pred8x8l_horizontal_up_10, 4, 4, 6
    mova        m0, [r0+r3*0-16]
    punpckhwd   m0, [r0+r3*1-16]
    shr        r1d, 14
    dec         r1
    and         r1, r3
    sub         r1, r3                  ; r1 = byte offset of the row used above row 0
    mova        m4, [r0+r1*1-16]
    lea         r1, [r3*3]              ; r1 = 3*stride
    lea         r2, [r0+r3*4]           ; r2 -> row 4
    mova        m1, [r0+r3*2-16]
    punpckhwd   m1, [r0+r1*1-16]
    punpckhdq   m0, m1
    mova        m2, [r2+r3*0-16]
    punpckhwd   m2, [r2+r3*1-16]
    mova        m3, [r2+r3*2-16]
    punpckhwd   m3, [r2+r1*1-16]
    punpckhdq   m2, m3
    punpckhqdq  m0, m2                  ; m0 = left pixels of rows 0..7
    PALIGNR     m1, m0, m4, 14, m4
    psrldq      m2, m0, 2
    pshufhw     m2, m2, 10100100b       ; replicate row 7's pixel into the vacated word
    PRED4x4_LOWPASS m0, m1, m2, m0      ; m0 = filtered left column
    psrldq      m1, m0, 2
    psrldq      m2, m0, 4
    pshufhw     m1, m1, 10100100b
    pshufhw     m2, m2, 01010100b
    pavgw       m4, m0, m1
    PRED4x4_LOWPASS m1, m2, m0, m1
    punpckhwd   m5, m4, m1
    punpcklwd   m4, m1
    mova [r2+r3*0], m5                  ; row 4
    mova [r0+r3*0], m4                  ; row 0
    pshufd      m0, m5, 11111001b
    pshufd      m1, m5, 11111110b
    pshufd      m2, m5, 11111111b
    mova [r2+r3*1], m0
    mova [r2+r3*2], m1
    mova [r2+r1*1], m2
    PALIGNR     m2, m5, m4, 4, m0
    PALIGNR     m3, m5, m4, 8, m1
    PALIGNR     m5, m5, m4, 12, m4
    mova [r0+r3*1], m2
    mova [r0+r3*2], m3
    mova [r0+r1*1], m5
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_HORIZONTAL_UP
INIT_XMM ssse3
PRED8x8L_HORIZONTAL_UP
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_HORIZONTAL_UP
%endif


;-----------------------------------------------------------------------------
; void ff_pred16x16_vertical_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; MOV16 base, regA, regB[, unused, unused]
; Stores regA at [base] and regB at [base+mmsize]. The macro accepts up to
; five arguments but only reads the first three.
%macro MOV16 3-5
    mova [%1+     0], %2
    mova [%1+mmsize], %3
%endmacro

; Copies the row above the block (two registers wide) into 16 rows, two rows
; per loop iteration.
INIT_XMM sse2
cglobal pred16x16_vertical_10, 2, 3
    sub         r0, r1                  ; r0 -> row above the block
    mov        r2d, 8                   ; 8 iterations x 2 rows
    mova        m0, [r0+     0]
    mova        m1, [r0+mmsize]
.loop:
    MOV16 r0+r1*1, m0, m1, m2, m3
    MOV16 r0+r1*2, m0, m1, m2, m3
    lea         r0, [r0+r1*2]
    dec        r2d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void ff_pred16x16_horizontal_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Fills each of the 16 rows with the pixel immediately to its left. movd
; loads the two 16-bit pixels left of the row and SPLATW ..., 1 broadcasts
; the second of them, i.e. the one adjacent to the block.
INIT_XMM sse2
cglobal pred16x16_horizontal_10, 2, 3
    mov        r2d, 8                   ; 8 iterations x 2 rows
.vloop:
    movd        m0, [r0+r1*0-4]
    movd        m1, [r0+r1*1-4]
    SPLATW      m0, m0, 1
    SPLATW      m1, m1, 1
    MOV16 r0+r1*0, m0, m0, m0, m0
    MOV16 r0+r1*1, m1, m1, m1, m1
    lea         r0, [r0+r1*2]
    dec        r2d
    jg .vloop
    REP_RET

;-----------------------------------------------------------------------------
; void ff_pred16x16_dc_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Fills the block with (sum of the 16 top pixels + sum of the 16 left pixels
; + 16) >> 5. The top sum is formed in m0 (HADDW is the x86util horizontal
; add; m2 is presumably its scratch register), the left sum in r3d/r4d.
; Only word 0 of m0 is used for the final broadcast.
INIT_XMM sse2
cglobal pred16x16_dc_10, 2, 6
    mov         r5, r0                  ; keep src for the store loop
    sub         r0, r1                  ; r0 -> row above the block
    mova        m0, [r0+0]
    paddw       m0, [r0+mmsize]
    HADDW       m0, m2

    lea         r0, [r0+r1-2]           ; r0 -> pixel left of row 0
    movzx      r3d, word [r0]
    movzx      r4d, word [r0+r1]
%rep 7
    lea         r0, [r0+r1*2]
    movzx      r2d, word [r0]
    add        r3d, r2d
    movzx      r2d, word [r0+r1]
    add        r4d, r2d
%endrep
    lea        r3d, [r3+r4+16]          ; left sum + rounding term

    movd        m1, r3d
    paddw       m0, m1
    psrlw       m0, 5
    SPLATW      m0, m0
    mov        r3d, 8                   ; 8 iterations x 2 rows
.loop:
    MOV16 r5+r1*0, m0, m0, m0, m0
    MOV16 r5+r1*1, m0, m0, m0, m0
    lea         r5, [r5+r1*2]
    dec        r3d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void ff_pred16x16_top_dc_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Fills the block with (sum of the 16 top pixels + 8) >> 4. The sum is
; broadcast first, then rounded and shifted in every lane.
INIT_XMM sse2
cglobal pred16x16_top_dc_10, 2, 3
    sub         r0, r1                  ; r0 -> row above the block
    mova        m0, [r0+0]
    paddw       m0, [r0+mmsize]
    HADDW       m0, m2

    SPLATW      m0, m0
    paddw       m0, [pw_8]
    psrlw       m0, 4
    mov        r2d, 8                   ; 8 iterations x 2 rows
.loop:
    MOV16 r0+r1*1, m0, m0, m0, m0
    MOV16 r0+r1*2, m0, m0, m0, m0
    lea         r0, [r0+r1*2]
    dec        r2d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void ff_pred16x16_left_dc_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Fills the block with (sum of the 16 left pixels + 8) >> 4. The sum is kept
; in two general-purpose accumulators (even rows in r3d, odd rows in r4d).
INIT_XMM sse2
cglobal pred16x16_left_dc_10, 2, 6
    mov         r5, r0                  ; keep src for the store loop

    sub         r0, 2                   ; r0 -> pixel left of row 0
    movzx      r3d, word [r0]
    movzx      r4d, word [r0+r1]
%rep 7
    lea         r0, [r0+r1*2]
    movzx      r2d, word [r0]
    add        r3d, r2d
    movzx      r2d, word [r0+r1]
    add        r4d, r2d
%endrep
    lea        r3d, [r3+r4+8]           ; total + rounding term
    shr        r3d, 4

    movd        m0, r3d
    SPLATW      m0, m0
    mov        r3d, 8                   ; 8 iterations x 2 rows
.loop:
    MOV16 r5+r1*0, m0, m0, m0, m0
    MOV16 r5+r1*1, m0, m0, m0, m0
    lea         r5, [r5+r1*2]
    dec        r3d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void ff_pred16x16_128_dc_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Fills the block with the constant vector pw_512 (declared cextern at the
; top of the file; by its name, words of 512) without reading any
; neighbouring pixels.
INIT_XMM sse2
cglobal pred16x16_128_dc_10, 2,3
    mova        m0, [pw_512]
    mov        r2d, 8                   ; 8 iterations x 2 rows
.loop:
    MOV16 r0+r1*0, m0, m0, m0, m0
    MOV16 r0+r1*1, m0, m0, m0, m0
    lea         r0, [r0+r1*2]
    dec        r2d
    jg .loop
    REP_RET