1cabdff1aSopenharmony_ci;****************************************************************************** 2cabdff1aSopenharmony_ci;* H.264 intra prediction asm optimizations 3cabdff1aSopenharmony_ci;* Copyright (c) 2010 Fiona Glaser 4cabdff1aSopenharmony_ci;* Copyright (c) 2010 Holger Lubitz 5cabdff1aSopenharmony_ci;* Copyright (c) 2010 Loren Merritt 6cabdff1aSopenharmony_ci;* Copyright (c) 2010 Ronald S. Bultje 7cabdff1aSopenharmony_ci;* 8cabdff1aSopenharmony_ci;* This file is part of FFmpeg. 9cabdff1aSopenharmony_ci;* 10cabdff1aSopenharmony_ci;* FFmpeg is free software; you can redistribute it and/or 11cabdff1aSopenharmony_ci;* modify it under the terms of the GNU Lesser General Public 12cabdff1aSopenharmony_ci;* License as published by the Free Software Foundation; either 13cabdff1aSopenharmony_ci;* version 2.1 of the License, or (at your option) any later version. 14cabdff1aSopenharmony_ci;* 15cabdff1aSopenharmony_ci;* FFmpeg is distributed in the hope that it will be useful, 16cabdff1aSopenharmony_ci;* but WITHOUT ANY WARRANTY; without even the implied warranty of 17cabdff1aSopenharmony_ci;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 18cabdff1aSopenharmony_ci;* Lesser General Public License for more details. 19cabdff1aSopenharmony_ci;* 20cabdff1aSopenharmony_ci;* You should have received a copy of the GNU Lesser General Public 21cabdff1aSopenharmony_ci;* License along with FFmpeg; if not, write to the Free Software 22cabdff1aSopenharmony_ci;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 23cabdff1aSopenharmony_ci;****************************************************************************** 24cabdff1aSopenharmony_ci 25cabdff1aSopenharmony_ci%include "libavutil/x86/x86util.asm" 26cabdff1aSopenharmony_ci 27cabdff1aSopenharmony_ciSECTION_RODATA 28cabdff1aSopenharmony_ci 29cabdff1aSopenharmony_citm_shuf: times 8 db 0x03, 0x80 30cabdff1aSopenharmony_cipw_ff00: times 8 dw 0xff00 31cabdff1aSopenharmony_ciplane_shuf: db -8, -7, -6, -5, -4, -3, -2, -1 32cabdff1aSopenharmony_ci db 1, 2, 3, 4, 5, 6, 7, 8 33cabdff1aSopenharmony_ciplane8_shuf: db -4, -3, -2, -1, 0, 0, 0, 0 34cabdff1aSopenharmony_ci db 1, 2, 3, 4, 0, 0, 0, 0 35cabdff1aSopenharmony_cipw_0to7: dw 0, 1, 2, 3, 4, 5, 6, 7 36cabdff1aSopenharmony_cipw_1to8: dw 1, 2, 3, 4, 5, 6, 7, 8 37cabdff1aSopenharmony_cipw_m8tom1: dw -8, -7, -6, -5, -4, -3, -2, -1 38cabdff1aSopenharmony_cipw_m4to4: dw -4, -3, -2, -1, 1, 2, 3, 4 39cabdff1aSopenharmony_ci 40cabdff1aSopenharmony_ciSECTION .text 41cabdff1aSopenharmony_ci 42cabdff1aSopenharmony_cicextern pb_1 43cabdff1aSopenharmony_cicextern pb_3 44cabdff1aSopenharmony_cicextern pw_4 45cabdff1aSopenharmony_cicextern pw_8 46cabdff1aSopenharmony_ci 47cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 48cabdff1aSopenharmony_ci; void ff_pred16x16_vertical_8(uint8_t *src, ptrdiff_t stride) 49cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 50cabdff1aSopenharmony_ci 51cabdff1aSopenharmony_ciINIT_XMM sse 52cabdff1aSopenharmony_cicglobal pred16x16_vertical_8, 2,3 53cabdff1aSopenharmony_ci sub r0, r1 54cabdff1aSopenharmony_ci mov r2, 4 55cabdff1aSopenharmony_ci movaps xmm0, [r0] 56cabdff1aSopenharmony_ci.loop: 57cabdff1aSopenharmony_ci movaps [r0+r1*1], xmm0 58cabdff1aSopenharmony_ci movaps [r0+r1*2], xmm0 59cabdff1aSopenharmony_ci lea r0, [r0+r1*2] 60cabdff1aSopenharmony_ci movaps [r0+r1*1], xmm0 61cabdff1aSopenharmony_ci movaps [r0+r1*2], xmm0 62cabdff1aSopenharmony_ci lea r0, [r0+r1*2] 63cabdff1aSopenharmony_ci dec r2 64cabdff1aSopenharmony_ci jg .loop 65cabdff1aSopenharmony_ci REP_RET 66cabdff1aSopenharmony_ci 67cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 68cabdff1aSopenharmony_ci; void ff_pred16x16_horizontal_8(uint8_t *src, ptrdiff_t stride) 69cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 70cabdff1aSopenharmony_ci 71cabdff1aSopenharmony_ci%macro PRED16x16_H 0 72cabdff1aSopenharmony_cicglobal pred16x16_horizontal_8, 2,3 73cabdff1aSopenharmony_ci mov r2, 8 74cabdff1aSopenharmony_ci%if cpuflag(ssse3) 75cabdff1aSopenharmony_ci mova m2, [pb_3] 76cabdff1aSopenharmony_ci%endif 77cabdff1aSopenharmony_ci.loop: 78cabdff1aSopenharmony_ci movd m0, [r0+r1*0-4] 79cabdff1aSopenharmony_ci movd m1, [r0+r1*1-4] 80cabdff1aSopenharmony_ci 81cabdff1aSopenharmony_ci%if cpuflag(ssse3) 82cabdff1aSopenharmony_ci pshufb m0, m2 83cabdff1aSopenharmony_ci pshufb m1, m2 84cabdff1aSopenharmony_ci%else 85cabdff1aSopenharmony_ci punpcklbw m0, m0 86cabdff1aSopenharmony_ci punpcklbw m1, m1 87cabdff1aSopenharmony_ci SPLATW m0, m0, 3 88cabdff1aSopenharmony_ci SPLATW m1, m1, 3 89cabdff1aSopenharmony_ci mova [r0+r1*0+8], m0 90cabdff1aSopenharmony_ci mova [r0+r1*1+8], m1 91cabdff1aSopenharmony_ci%endif 92cabdff1aSopenharmony_ci 93cabdff1aSopenharmony_ci mova [r0+r1*0], m0 94cabdff1aSopenharmony_ci mova [r0+r1*1], m1 95cabdff1aSopenharmony_ci lea r0, [r0+r1*2] 96cabdff1aSopenharmony_ci dec r2 97cabdff1aSopenharmony_ci jg .loop 98cabdff1aSopenharmony_ci REP_RET 99cabdff1aSopenharmony_ci%endmacro 100cabdff1aSopenharmony_ci 101cabdff1aSopenharmony_ciINIT_MMX mmxext 102cabdff1aSopenharmony_ciPRED16x16_H 103cabdff1aSopenharmony_ciINIT_XMM ssse3 104cabdff1aSopenharmony_ciPRED16x16_H 105cabdff1aSopenharmony_ci 106cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 107cabdff1aSopenharmony_ci; void ff_pred16x16_dc_8(uint8_t *src, ptrdiff_t stride) 108cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 109cabdff1aSopenharmony_ci 110cabdff1aSopenharmony_ci%macro PRED16x16_DC 0 111cabdff1aSopenharmony_cicglobal pred16x16_dc_8, 2,7 112cabdff1aSopenharmony_ci mov r4, r0 113cabdff1aSopenharmony_ci sub r0, r1 114cabdff1aSopenharmony_ci pxor mm0, mm0 115cabdff1aSopenharmony_ci pxor mm1, mm1 116cabdff1aSopenharmony_ci psadbw mm0, [r0+0] 117cabdff1aSopenharmony_ci psadbw mm1, [r0+8] 118cabdff1aSopenharmony_ci dec r0 119cabdff1aSopenharmony_ci movzx r5d, byte [r0+r1*1] 120cabdff1aSopenharmony_ci paddw mm0, mm1 121cabdff1aSopenharmony_ci movd r6d, mm0 122cabdff1aSopenharmony_ci lea r0, [r0+r1*2] 123cabdff1aSopenharmony_ci%rep 7 124cabdff1aSopenharmony_ci movzx r2d, byte [r0+r1*0] 125cabdff1aSopenharmony_ci movzx r3d, byte [r0+r1*1] 126cabdff1aSopenharmony_ci add r5d, r2d 127cabdff1aSopenharmony_ci add r6d, r3d 128cabdff1aSopenharmony_ci lea r0, [r0+r1*2] 129cabdff1aSopenharmony_ci%endrep 130cabdff1aSopenharmony_ci movzx r2d, byte [r0+r1*0] 131cabdff1aSopenharmony_ci add r5d, r6d 132cabdff1aSopenharmony_ci lea r2d, [r2+r5+16] 133cabdff1aSopenharmony_ci shr r2d, 5 134cabdff1aSopenharmony_ci%if cpuflag(ssse3) 135cabdff1aSopenharmony_ci pxor m1, m1 136cabdff1aSopenharmony_ci%endif 137cabdff1aSopenharmony_ci SPLATB_REG m0, r2, m1 138cabdff1aSopenharmony_ci 139cabdff1aSopenharmony_ci mov r3d, 4 140cabdff1aSopenharmony_ci.loop: 141cabdff1aSopenharmony_ci mova [r4+r1*0], m0 142cabdff1aSopenharmony_ci mova [r4+r1*1], m0 143cabdff1aSopenharmony_ci lea r4, [r4+r1*2] 144cabdff1aSopenharmony_ci mova [r4+r1*0], m0 145cabdff1aSopenharmony_ci mova [r4+r1*1], m0 146cabdff1aSopenharmony_ci lea r4, [r4+r1*2] 147cabdff1aSopenharmony_ci dec r3d 148cabdff1aSopenharmony_ci jg .loop 149cabdff1aSopenharmony_ci REP_RET 150cabdff1aSopenharmony_ci%endmacro 151cabdff1aSopenharmony_ci 152cabdff1aSopenharmony_ciINIT_XMM sse2 153cabdff1aSopenharmony_ciPRED16x16_DC 154cabdff1aSopenharmony_ciINIT_XMM ssse3 155cabdff1aSopenharmony_ciPRED16x16_DC 156cabdff1aSopenharmony_ci 157cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 158cabdff1aSopenharmony_ci; void ff_pred16x16_tm_vp8_8(uint8_t *src, ptrdiff_t stride) 159cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 160cabdff1aSopenharmony_ci 161cabdff1aSopenharmony_ciINIT_XMM sse2 162cabdff1aSopenharmony_cicglobal pred16x16_tm_vp8_8, 2,6,6 163cabdff1aSopenharmony_ci sub r0, r1 164cabdff1aSopenharmony_ci pxor xmm2, xmm2 165cabdff1aSopenharmony_ci movdqa xmm0, [r0] 166cabdff1aSopenharmony_ci movdqa xmm1, xmm0 167cabdff1aSopenharmony_ci punpcklbw xmm0, xmm2 168cabdff1aSopenharmony_ci punpckhbw xmm1, xmm2 169cabdff1aSopenharmony_ci movzx r4d, byte [r0-1] 170cabdff1aSopenharmony_ci mov r5d, 8 171cabdff1aSopenharmony_ci.loop: 172cabdff1aSopenharmony_ci movzx r2d, byte [r0+r1*1-1] 173cabdff1aSopenharmony_ci movzx r3d, byte [r0+r1*2-1] 174cabdff1aSopenharmony_ci sub r2d, r4d 175cabdff1aSopenharmony_ci sub r3d, r4d 176cabdff1aSopenharmony_ci movd xmm2, r2d 177cabdff1aSopenharmony_ci movd xmm4, r3d 178cabdff1aSopenharmony_ci pshuflw xmm2, xmm2, 0 179cabdff1aSopenharmony_ci pshuflw xmm4, xmm4, 0 180cabdff1aSopenharmony_ci punpcklqdq xmm2, xmm2 181cabdff1aSopenharmony_ci punpcklqdq xmm4, xmm4 182cabdff1aSopenharmony_ci movdqa xmm3, xmm2 183cabdff1aSopenharmony_ci movdqa xmm5, xmm4 184cabdff1aSopenharmony_ci paddw xmm2, xmm0 185cabdff1aSopenharmony_ci paddw xmm3, xmm1 186cabdff1aSopenharmony_ci paddw xmm4, xmm0 187cabdff1aSopenharmony_ci paddw xmm5, xmm1 188cabdff1aSopenharmony_ci packuswb xmm2, xmm3 189cabdff1aSopenharmony_ci packuswb xmm4, xmm5 190cabdff1aSopenharmony_ci movdqa [r0+r1*1], xmm2 191cabdff1aSopenharmony_ci movdqa [r0+r1*2], xmm4 192cabdff1aSopenharmony_ci lea r0, [r0+r1*2] 193cabdff1aSopenharmony_ci dec r5d 194cabdff1aSopenharmony_ci jg .loop 195cabdff1aSopenharmony_ci REP_RET 196cabdff1aSopenharmony_ci 197cabdff1aSopenharmony_ci%if HAVE_AVX2_EXTERNAL 198cabdff1aSopenharmony_ciINIT_YMM avx2 199cabdff1aSopenharmony_cicglobal pred16x16_tm_vp8_8, 2, 4, 5, dst, stride, stride3, iteration 200cabdff1aSopenharmony_ci sub dstq, strideq 201cabdff1aSopenharmony_ci pmovzxbw m0, [dstq] 202cabdff1aSopenharmony_ci vpbroadcastb xm1, [r0-1] 203cabdff1aSopenharmony_ci pmovzxbw m1, xm1 204cabdff1aSopenharmony_ci psubw m0, m1 205cabdff1aSopenharmony_ci mov iterationd, 4 206cabdff1aSopenharmony_ci lea stride3q, [strideq*3] 207cabdff1aSopenharmony_ci.loop: 208cabdff1aSopenharmony_ci vpbroadcastb xm1, [dstq+strideq*1-1] 209cabdff1aSopenharmony_ci vpbroadcastb xm2, [dstq+strideq*2-1] 210cabdff1aSopenharmony_ci vpbroadcastb xm3, [dstq+stride3q-1] 211cabdff1aSopenharmony_ci vpbroadcastb xm4, [dstq+strideq*4-1] 212cabdff1aSopenharmony_ci pmovzxbw m1, xm1 213cabdff1aSopenharmony_ci pmovzxbw m2, xm2 214cabdff1aSopenharmony_ci pmovzxbw m3, xm3 215cabdff1aSopenharmony_ci pmovzxbw m4, xm4 216cabdff1aSopenharmony_ci paddw m1, m0 217cabdff1aSopenharmony_ci paddw m2, m0 218cabdff1aSopenharmony_ci paddw m3, m0 219cabdff1aSopenharmony_ci paddw m4, m0 220cabdff1aSopenharmony_ci vpackuswb m1, m1, m2 221cabdff1aSopenharmony_ci vpackuswb m3, m3, m4 222cabdff1aSopenharmony_ci vpermq m1, m1, q3120 223cabdff1aSopenharmony_ci vpermq m3, m3, q3120 224cabdff1aSopenharmony_ci movdqa [dstq+strideq*1], xm1 225cabdff1aSopenharmony_ci vextracti128 [dstq+strideq*2], m1, 1 226cabdff1aSopenharmony_ci movdqa [dstq+stride3q*1], xm3 227cabdff1aSopenharmony_ci vextracti128 [dstq+strideq*4], m3, 1 228cabdff1aSopenharmony_ci lea dstq, [dstq+strideq*4] 229cabdff1aSopenharmony_ci dec iterationd 230cabdff1aSopenharmony_ci jg .loop 231cabdff1aSopenharmony_ci REP_RET 232cabdff1aSopenharmony_ci%endif 233cabdff1aSopenharmony_ci 234cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 235cabdff1aSopenharmony_ci; void ff_pred16x16_plane_*_8(uint8_t *src, ptrdiff_t stride) 236cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 237cabdff1aSopenharmony_ci 238cabdff1aSopenharmony_ci%macro H264_PRED16x16_PLANE 1 239cabdff1aSopenharmony_cicglobal pred16x16_plane_%1_8, 2,9,7 240cabdff1aSopenharmony_ci mov r2, r1 ; +stride 241cabdff1aSopenharmony_ci neg r1 ; -stride 242cabdff1aSopenharmony_ci 243cabdff1aSopenharmony_ci movh m0, [r0+r1 -1] 244cabdff1aSopenharmony_ci%if cpuflag(ssse3) 245cabdff1aSopenharmony_ci movhps m0, [r0+r1 +8] 246cabdff1aSopenharmony_ci pmaddubsw m0, [plane_shuf] ; H coefficients 247cabdff1aSopenharmony_ci%else ; sse2 248cabdff1aSopenharmony_ci pxor m2, m2 249cabdff1aSopenharmony_ci movh m1, [r0+r1 +8] 250cabdff1aSopenharmony_ci punpcklbw m0, m2 251cabdff1aSopenharmony_ci punpcklbw m1, m2 252cabdff1aSopenharmony_ci pmullw m0, [pw_m8tom1] 253cabdff1aSopenharmony_ci pmullw m1, [pw_1to8] 254cabdff1aSopenharmony_ci paddw m0, m1 255cabdff1aSopenharmony_ci%endif 256cabdff1aSopenharmony_ci movhlps m1, m0 257cabdff1aSopenharmony_ci paddw m0, m1 258cabdff1aSopenharmony_ci PSHUFLW m1, m0, 0xE 259cabdff1aSopenharmony_ci paddw m0, m1 260cabdff1aSopenharmony_ci PSHUFLW m1, m0, 0x1 261cabdff1aSopenharmony_ci paddw m0, m1 ; sum of H coefficients 262cabdff1aSopenharmony_ci 263cabdff1aSopenharmony_ci lea r4, [r0+r2*8-1] 264cabdff1aSopenharmony_ci lea r3, [r0+r2*4-1] 265cabdff1aSopenharmony_ci add r4, r2 266cabdff1aSopenharmony_ci 267cabdff1aSopenharmony_ci%if ARCH_X86_64 268cabdff1aSopenharmony_ci%define e_reg r8 269cabdff1aSopenharmony_ci%else 270cabdff1aSopenharmony_ci%define e_reg r0 271cabdff1aSopenharmony_ci%endif 272cabdff1aSopenharmony_ci 273cabdff1aSopenharmony_ci movzx e_reg, byte [r3+r2*2 ] 274cabdff1aSopenharmony_ci movzx r5, byte [r4+r1 ] 275cabdff1aSopenharmony_ci sub r5, e_reg 276cabdff1aSopenharmony_ci 277cabdff1aSopenharmony_ci movzx e_reg, byte [r3+r2 ] 278cabdff1aSopenharmony_ci movzx r6, byte [r4 ] 279cabdff1aSopenharmony_ci sub r6, e_reg 280cabdff1aSopenharmony_ci lea r5, [r5+r6*2] 281cabdff1aSopenharmony_ci 282cabdff1aSopenharmony_ci movzx e_reg, byte [r3+r1 ] 283cabdff1aSopenharmony_ci movzx r6, byte [r4+r2*2 ] 284cabdff1aSopenharmony_ci sub r6, e_reg 285cabdff1aSopenharmony_ci lea r5, [r5+r6*4] 286cabdff1aSopenharmony_ci 287cabdff1aSopenharmony_ci movzx e_reg, byte [r3 ] 288cabdff1aSopenharmony_ci%if ARCH_X86_64 289cabdff1aSopenharmony_ci movzx r7, byte [r4+r2 ] 290cabdff1aSopenharmony_ci sub r7, e_reg 291cabdff1aSopenharmony_ci%else 292cabdff1aSopenharmony_ci movzx r6, byte [r4+r2 ] 293cabdff1aSopenharmony_ci sub r6, e_reg 294cabdff1aSopenharmony_ci lea r5, [r5+r6*4] 295cabdff1aSopenharmony_ci sub r5, r6 296cabdff1aSopenharmony_ci%endif 297cabdff1aSopenharmony_ci 298cabdff1aSopenharmony_ci lea e_reg, [r3+r1*4] 299cabdff1aSopenharmony_ci lea r3, [r4+r2*4] 300cabdff1aSopenharmony_ci 301cabdff1aSopenharmony_ci movzx r4, byte [e_reg+r2 ] 302cabdff1aSopenharmony_ci movzx r6, byte [r3 ] 303cabdff1aSopenharmony_ci sub r6, r4 304cabdff1aSopenharmony_ci%if ARCH_X86_64 305cabdff1aSopenharmony_ci lea r6, [r7+r6*2] 306cabdff1aSopenharmony_ci lea r5, [r5+r6*2] 307cabdff1aSopenharmony_ci add r5, r6 308cabdff1aSopenharmony_ci%else 309cabdff1aSopenharmony_ci lea r5, [r5+r6*4] 310cabdff1aSopenharmony_ci lea r5, [r5+r6*2] 311cabdff1aSopenharmony_ci%endif 312cabdff1aSopenharmony_ci 313cabdff1aSopenharmony_ci movzx r4, byte [e_reg ] 314cabdff1aSopenharmony_ci%if ARCH_X86_64 315cabdff1aSopenharmony_ci movzx r7, byte [r3 +r2 ] 316cabdff1aSopenharmony_ci sub r7, r4 317cabdff1aSopenharmony_ci sub r5, r7 318cabdff1aSopenharmony_ci%else 319cabdff1aSopenharmony_ci movzx r6, byte [r3 +r2 ] 320cabdff1aSopenharmony_ci sub r6, r4 321cabdff1aSopenharmony_ci lea r5, [r5+r6*8] 322cabdff1aSopenharmony_ci sub r5, r6 323cabdff1aSopenharmony_ci%endif 324cabdff1aSopenharmony_ci 325cabdff1aSopenharmony_ci movzx r4, byte [e_reg+r1 ] 326cabdff1aSopenharmony_ci movzx r6, byte [r3 +r2*2] 327cabdff1aSopenharmony_ci sub r6, r4 328cabdff1aSopenharmony_ci%if ARCH_X86_64 329cabdff1aSopenharmony_ci add r6, r7 330cabdff1aSopenharmony_ci%endif 331cabdff1aSopenharmony_ci lea r5, [r5+r6*8] 332cabdff1aSopenharmony_ci 333cabdff1aSopenharmony_ci movzx r4, byte [e_reg+r2*2] 334cabdff1aSopenharmony_ci movzx r6, byte [r3 +r1 ] 335cabdff1aSopenharmony_ci sub r6, r4 336cabdff1aSopenharmony_ci lea r5, [r5+r6*4] 337cabdff1aSopenharmony_ci add r5, r6 ; sum of V coefficients 338cabdff1aSopenharmony_ci 339cabdff1aSopenharmony_ci%if ARCH_X86_64 == 0 340cabdff1aSopenharmony_ci mov r0, r0m 341cabdff1aSopenharmony_ci%endif 342cabdff1aSopenharmony_ci 343cabdff1aSopenharmony_ci%ifidn %1, h264 344cabdff1aSopenharmony_ci lea r5, [r5*5+32] 345cabdff1aSopenharmony_ci sar r5, 6 346cabdff1aSopenharmony_ci%elifidn %1, rv40 347cabdff1aSopenharmony_ci lea r5, [r5*5] 348cabdff1aSopenharmony_ci sar r5, 6 349cabdff1aSopenharmony_ci%elifidn %1, svq3 350cabdff1aSopenharmony_ci test r5, r5 351cabdff1aSopenharmony_ci lea r6, [r5+3] 352cabdff1aSopenharmony_ci cmovs r5, r6 353cabdff1aSopenharmony_ci sar r5, 2 ; V/4 354cabdff1aSopenharmony_ci lea r5, [r5*5] ; 5*(V/4) 355cabdff1aSopenharmony_ci test r5, r5 356cabdff1aSopenharmony_ci lea r6, [r5+15] 357cabdff1aSopenharmony_ci cmovs r5, r6 358cabdff1aSopenharmony_ci sar r5, 4 ; (5*(V/4))/16 359cabdff1aSopenharmony_ci%endif 360cabdff1aSopenharmony_ci 361cabdff1aSopenharmony_ci movzx r4, byte [r0+r1 +15] 362cabdff1aSopenharmony_ci movzx r3, byte [r3+r2*2 ] 363cabdff1aSopenharmony_ci lea r3, [r3+r4+1] 364cabdff1aSopenharmony_ci shl r3, 4 365cabdff1aSopenharmony_ci 366cabdff1aSopenharmony_ci movd r1d, m0 367cabdff1aSopenharmony_ci movsx r1d, r1w 368cabdff1aSopenharmony_ci%ifnidn %1, svq3 369cabdff1aSopenharmony_ci%ifidn %1, h264 370cabdff1aSopenharmony_ci lea r1d, [r1d*5+32] 371cabdff1aSopenharmony_ci%else ; rv40 372cabdff1aSopenharmony_ci lea r1d, [r1d*5] 373cabdff1aSopenharmony_ci%endif 374cabdff1aSopenharmony_ci sar r1d, 6 375cabdff1aSopenharmony_ci%else ; svq3 376cabdff1aSopenharmony_ci test r1d, r1d 377cabdff1aSopenharmony_ci lea r4d, [r1d+3] 378cabdff1aSopenharmony_ci cmovs r1d, r4d 379cabdff1aSopenharmony_ci sar r1d, 2 ; H/4 380cabdff1aSopenharmony_ci lea r1d, [r1d*5] ; 5*(H/4) 381cabdff1aSopenharmony_ci test r1d, r1d 382cabdff1aSopenharmony_ci lea r4d, [r1d+15] 383cabdff1aSopenharmony_ci cmovs r1d, r4d 384cabdff1aSopenharmony_ci sar r1d, 4 ; (5*(H/4))/16 385cabdff1aSopenharmony_ci%endif 386cabdff1aSopenharmony_ci movd m0, r1d 387cabdff1aSopenharmony_ci 388cabdff1aSopenharmony_ci add r1d, r5d 389cabdff1aSopenharmony_ci add r3d, r1d 390cabdff1aSopenharmony_ci shl r1d, 3 391cabdff1aSopenharmony_ci sub r3d, r1d ; a 392cabdff1aSopenharmony_ci 393cabdff1aSopenharmony_ci movd m1, r5d 394cabdff1aSopenharmony_ci movd m3, r3d 395cabdff1aSopenharmony_ci SPLATW m0, m0, 0 ; H 396cabdff1aSopenharmony_ci SPLATW m1, m1, 0 ; V 397cabdff1aSopenharmony_ci SPLATW m3, m3, 0 ; a 398cabdff1aSopenharmony_ci%ifidn %1, svq3 399cabdff1aSopenharmony_ci SWAP 0, 1 400cabdff1aSopenharmony_ci%endif 401cabdff1aSopenharmony_ci mova m2, m0 402cabdff1aSopenharmony_ci pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words) 403cabdff1aSopenharmony_ci psllw m2, 3 404cabdff1aSopenharmony_ci paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H 405cabdff1aSopenharmony_ci paddw m2, m0 ; a + {8,9,10,11,12,13,14,15}*H 406cabdff1aSopenharmony_ci 407cabdff1aSopenharmony_ci mov r4, 8 408cabdff1aSopenharmony_ci.loop: 409cabdff1aSopenharmony_ci mova m3, m0 ; b[0..7] 410cabdff1aSopenharmony_ci mova m4, m2 ; b[8..15] 411cabdff1aSopenharmony_ci psraw m3, 5 412cabdff1aSopenharmony_ci psraw m4, 5 413cabdff1aSopenharmony_ci packuswb m3, m4 414cabdff1aSopenharmony_ci mova [r0], m3 415cabdff1aSopenharmony_ci paddw m0, m1 416cabdff1aSopenharmony_ci paddw m2, m1 417cabdff1aSopenharmony_ci 418cabdff1aSopenharmony_ci mova m3, m0 ; b[0..7] 419cabdff1aSopenharmony_ci mova m4, m2 ; b[8..15] 420cabdff1aSopenharmony_ci psraw m3, 5 421cabdff1aSopenharmony_ci psraw m4, 5 422cabdff1aSopenharmony_ci packuswb m3, m4 423cabdff1aSopenharmony_ci mova [r0+r2], m3 424cabdff1aSopenharmony_ci paddw m0, m1 425cabdff1aSopenharmony_ci paddw m2, m1 426cabdff1aSopenharmony_ci 427cabdff1aSopenharmony_ci lea r0, [r0+r2*2] 428cabdff1aSopenharmony_ci dec r4 429cabdff1aSopenharmony_ci jg .loop 430cabdff1aSopenharmony_ci REP_RET 431cabdff1aSopenharmony_ci%endmacro 432cabdff1aSopenharmony_ci 433cabdff1aSopenharmony_ciINIT_XMM sse2 434cabdff1aSopenharmony_ciH264_PRED16x16_PLANE h264 435cabdff1aSopenharmony_ciH264_PRED16x16_PLANE rv40 436cabdff1aSopenharmony_ciH264_PRED16x16_PLANE svq3 437cabdff1aSopenharmony_ciINIT_XMM ssse3 438cabdff1aSopenharmony_ciH264_PRED16x16_PLANE h264 439cabdff1aSopenharmony_ciH264_PRED16x16_PLANE rv40 440cabdff1aSopenharmony_ciH264_PRED16x16_PLANE svq3 441cabdff1aSopenharmony_ci 442cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 443cabdff1aSopenharmony_ci; void ff_pred8x8_plane_8(uint8_t *src, ptrdiff_t stride) 444cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 445cabdff1aSopenharmony_ci 446cabdff1aSopenharmony_ci%macro H264_PRED8x8_PLANE 0 447cabdff1aSopenharmony_cicglobal pred8x8_plane_8, 2,9,7 448cabdff1aSopenharmony_ci mov r2, r1 ; +stride 449cabdff1aSopenharmony_ci neg r1 ; -stride 450cabdff1aSopenharmony_ci 451cabdff1aSopenharmony_ci movd m0, [r0+r1 -1] 452cabdff1aSopenharmony_ci%if cpuflag(ssse3) 453cabdff1aSopenharmony_ci movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary 454cabdff1aSopenharmony_ci pmaddubsw m0, [plane8_shuf] ; H coefficients 455cabdff1aSopenharmony_ci%else ; sse2 456cabdff1aSopenharmony_ci pxor m2, m2 457cabdff1aSopenharmony_ci movd m1, [r0+r1 +4] 458cabdff1aSopenharmony_ci punpckldq m0, m1 459cabdff1aSopenharmony_ci punpcklbw m0, m2 460cabdff1aSopenharmony_ci pmullw m0, [pw_m4to4] 461cabdff1aSopenharmony_ci%endif 462cabdff1aSopenharmony_ci movhlps m1, m0 463cabdff1aSopenharmony_ci paddw m0, m1 464cabdff1aSopenharmony_ci 465cabdff1aSopenharmony_ci%if notcpuflag(ssse3) 466cabdff1aSopenharmony_ci PSHUFLW m1, m0, 0xE 467cabdff1aSopenharmony_ci paddw m0, m1 468cabdff1aSopenharmony_ci%endif ; !ssse3 469cabdff1aSopenharmony_ci 470cabdff1aSopenharmony_ci PSHUFLW m1, m0, 0x1 471cabdff1aSopenharmony_ci paddw m0, m1 ; sum of H coefficients 472cabdff1aSopenharmony_ci 473cabdff1aSopenharmony_ci lea r4, [r0+r2*4-1] 474cabdff1aSopenharmony_ci lea r3, [r0 -1] 475cabdff1aSopenharmony_ci add r4, r2 476cabdff1aSopenharmony_ci 477cabdff1aSopenharmony_ci%if ARCH_X86_64 478cabdff1aSopenharmony_ci%define e_reg r8 479cabdff1aSopenharmony_ci%else 480cabdff1aSopenharmony_ci%define e_reg r0 481cabdff1aSopenharmony_ci%endif 482cabdff1aSopenharmony_ci 483cabdff1aSopenharmony_ci movzx e_reg, byte [r3+r2*2 ] 484cabdff1aSopenharmony_ci movzx r5, byte [r4+r1 ] 485cabdff1aSopenharmony_ci sub r5, e_reg 486cabdff1aSopenharmony_ci 487cabdff1aSopenharmony_ci movzx e_reg, byte [r3 ] 488cabdff1aSopenharmony_ci%if ARCH_X86_64 489cabdff1aSopenharmony_ci movzx r7, byte [r4+r2 ] 490cabdff1aSopenharmony_ci sub r7, e_reg 491cabdff1aSopenharmony_ci sub r5, r7 492cabdff1aSopenharmony_ci%else 493cabdff1aSopenharmony_ci movzx r6, byte [r4+r2 ] 494cabdff1aSopenharmony_ci sub r6, e_reg 495cabdff1aSopenharmony_ci lea r5, [r5+r6*4] 496cabdff1aSopenharmony_ci sub r5, r6 497cabdff1aSopenharmony_ci%endif 498cabdff1aSopenharmony_ci 499cabdff1aSopenharmony_ci movzx e_reg, byte [r3+r1 ] 500cabdff1aSopenharmony_ci movzx r6, byte [r4+r2*2 ] 501cabdff1aSopenharmony_ci sub r6, e_reg 502cabdff1aSopenharmony_ci%if ARCH_X86_64 503cabdff1aSopenharmony_ci add r6, r7 504cabdff1aSopenharmony_ci%endif 505cabdff1aSopenharmony_ci lea r5, [r5+r6*4] 506cabdff1aSopenharmony_ci 507cabdff1aSopenharmony_ci movzx e_reg, byte [r3+r2 ] 508cabdff1aSopenharmony_ci movzx r6, byte [r4 ] 509cabdff1aSopenharmony_ci sub r6, e_reg 510cabdff1aSopenharmony_ci lea r6, [r5+r6*2] 511cabdff1aSopenharmony_ci 512cabdff1aSopenharmony_ci lea r5, [r6*9+16] 513cabdff1aSopenharmony_ci lea r5, [r5+r6*8] 514cabdff1aSopenharmony_ci sar r5, 5 515cabdff1aSopenharmony_ci 516cabdff1aSopenharmony_ci%if ARCH_X86_64 == 0 517cabdff1aSopenharmony_ci mov r0, r0m 518cabdff1aSopenharmony_ci%endif 519cabdff1aSopenharmony_ci 520cabdff1aSopenharmony_ci movzx r3, byte [r4+r2*2 ] 521cabdff1aSopenharmony_ci movzx r4, byte [r0+r1 +7] 522cabdff1aSopenharmony_ci lea r3, [r3+r4+1] 523cabdff1aSopenharmony_ci shl r3, 4 524cabdff1aSopenharmony_ci movd r1d, m0 525cabdff1aSopenharmony_ci movsx r1d, r1w 526cabdff1aSopenharmony_ci imul r1d, 17 527cabdff1aSopenharmony_ci add r1d, 16 528cabdff1aSopenharmony_ci sar r1d, 5 529cabdff1aSopenharmony_ci movd m0, r1d 530cabdff1aSopenharmony_ci add r1d, r5d 531cabdff1aSopenharmony_ci sub r3d, r1d 532cabdff1aSopenharmony_ci add r1d, r1d 533cabdff1aSopenharmony_ci sub r3d, r1d ; a 534cabdff1aSopenharmony_ci 535cabdff1aSopenharmony_ci movd m1, r5d 536cabdff1aSopenharmony_ci movd m3, r3d 537cabdff1aSopenharmony_ci SPLATW m0, m0, 0 ; H 538cabdff1aSopenharmony_ci SPLATW m1, m1, 0 ; V 539cabdff1aSopenharmony_ci SPLATW m3, m3, 0 ; a 540cabdff1aSopenharmony_ci pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words) 541cabdff1aSopenharmony_ci paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H 542cabdff1aSopenharmony_ci 543cabdff1aSopenharmony_ci mov r4, 4 544cabdff1aSopenharmony_ciALIGN 16 545cabdff1aSopenharmony_ci.loop: 546cabdff1aSopenharmony_ci mova m3, m0 ; b[0..7] 547cabdff1aSopenharmony_ci paddw m0, m1 548cabdff1aSopenharmony_ci psraw m3, 5 549cabdff1aSopenharmony_ci mova m4, m0 ; V+b[0..7] 550cabdff1aSopenharmony_ci paddw m0, m1 551cabdff1aSopenharmony_ci psraw m4, 5 552cabdff1aSopenharmony_ci packuswb m3, m4 553cabdff1aSopenharmony_ci movh [r0], m3 554cabdff1aSopenharmony_ci movhps [r0+r2], m3 555cabdff1aSopenharmony_ci 556cabdff1aSopenharmony_ci lea r0, [r0+r2*2] 557cabdff1aSopenharmony_ci dec r4 558cabdff1aSopenharmony_ci jg .loop 559cabdff1aSopenharmony_ci REP_RET 560cabdff1aSopenharmony_ci%endmacro 561cabdff1aSopenharmony_ci 562cabdff1aSopenharmony_ciINIT_XMM sse2 563cabdff1aSopenharmony_ciH264_PRED8x8_PLANE 564cabdff1aSopenharmony_ciINIT_XMM ssse3 565cabdff1aSopenharmony_ciH264_PRED8x8_PLANE 566cabdff1aSopenharmony_ci 567cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 568cabdff1aSopenharmony_ci; void ff_pred8x8_vertical_8(uint8_t *src, ptrdiff_t stride) 569cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 570cabdff1aSopenharmony_ci 571cabdff1aSopenharmony_ciINIT_MMX mmx 572cabdff1aSopenharmony_cicglobal pred8x8_vertical_8, 2,2 573cabdff1aSopenharmony_ci sub r0, r1 574cabdff1aSopenharmony_ci movq mm0, [r0] 575cabdff1aSopenharmony_ci%rep 3 576cabdff1aSopenharmony_ci movq [r0+r1*1], mm0 577cabdff1aSopenharmony_ci movq [r0+r1*2], mm0 578cabdff1aSopenharmony_ci lea r0, [r0+r1*2] 579cabdff1aSopenharmony_ci%endrep 580cabdff1aSopenharmony_ci movq [r0+r1*1], mm0 581cabdff1aSopenharmony_ci movq [r0+r1*2], mm0 582cabdff1aSopenharmony_ci RET 583cabdff1aSopenharmony_ci 584cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 585cabdff1aSopenharmony_ci; void ff_pred8x8_horizontal_8(uint8_t *src, ptrdiff_t stride) 586cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 587cabdff1aSopenharmony_ci 588cabdff1aSopenharmony_ci%macro PRED8x8_H 0 589cabdff1aSopenharmony_cicglobal pred8x8_horizontal_8, 2,3 590cabdff1aSopenharmony_ci mov r2, 4 591cabdff1aSopenharmony_ci%if cpuflag(ssse3) 592cabdff1aSopenharmony_ci mova m2, [pb_3] 593cabdff1aSopenharmony_ci%endif 594cabdff1aSopenharmony_ci.loop: 595cabdff1aSopenharmony_ci SPLATB_LOAD m0, r0+r1*0-1, m2 596cabdff1aSopenharmony_ci SPLATB_LOAD m1, r0+r1*1-1, m2 597cabdff1aSopenharmony_ci mova [r0+r1*0], m0 598cabdff1aSopenharmony_ci mova [r0+r1*1], m1 599cabdff1aSopenharmony_ci lea r0, [r0+r1*2] 600cabdff1aSopenharmony_ci dec r2 601cabdff1aSopenharmony_ci jg .loop 602cabdff1aSopenharmony_ci REP_RET 603cabdff1aSopenharmony_ci%endmacro 604cabdff1aSopenharmony_ci 605cabdff1aSopenharmony_ciINIT_MMX mmxext 606cabdff1aSopenharmony_ciPRED8x8_H 607cabdff1aSopenharmony_ciINIT_MMX ssse3 608cabdff1aSopenharmony_ciPRED8x8_H 609cabdff1aSopenharmony_ci 610cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 611cabdff1aSopenharmony_ci; void ff_pred8x8_top_dc_8_mmxext(uint8_t *src, ptrdiff_t stride) 612cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 613cabdff1aSopenharmony_ciINIT_MMX mmxext 614cabdff1aSopenharmony_cicglobal pred8x8_top_dc_8, 2,5 615cabdff1aSopenharmony_ci sub r0, r1 616cabdff1aSopenharmony_ci movq mm0, [r0] 617cabdff1aSopenharmony_ci pxor mm1, mm1 618cabdff1aSopenharmony_ci pxor mm2, mm2 619cabdff1aSopenharmony_ci lea r2, [r0+r1*2] 620cabdff1aSopenharmony_ci punpckhbw mm1, mm0 621cabdff1aSopenharmony_ci punpcklbw mm0, mm2 622cabdff1aSopenharmony_ci psadbw mm1, mm2 ; s1 623cabdff1aSopenharmony_ci lea r3, [r2+r1*2] 624cabdff1aSopenharmony_ci psadbw mm0, mm2 ; s0 625cabdff1aSopenharmony_ci psrlw mm1, 1 626cabdff1aSopenharmony_ci psrlw mm0, 1 627cabdff1aSopenharmony_ci pavgw mm1, mm2 628cabdff1aSopenharmony_ci lea r4, [r3+r1*2] 629cabdff1aSopenharmony_ci pavgw mm0, mm2 630cabdff1aSopenharmony_ci pshufw mm1, mm1, 0 631cabdff1aSopenharmony_ci pshufw mm0, mm0, 0 ; dc0 (w) 632cabdff1aSopenharmony_ci packuswb mm0, mm1 ; dc0,dc1 (b) 633cabdff1aSopenharmony_ci movq [r0+r1*1], mm0 634cabdff1aSopenharmony_ci movq [r0+r1*2], mm0 635cabdff1aSopenharmony_ci lea r0, [r3+r1*2] 636cabdff1aSopenharmony_ci movq [r2+r1*1], mm0 637cabdff1aSopenharmony_ci movq [r2+r1*2], mm0 638cabdff1aSopenharmony_ci movq [r3+r1*1], mm0 639cabdff1aSopenharmony_ci movq [r3+r1*2], mm0 640cabdff1aSopenharmony_ci movq [r0+r1*1], mm0 641cabdff1aSopenharmony_ci movq [r0+r1*2], mm0 642cabdff1aSopenharmony_ci RET 643cabdff1aSopenharmony_ci 644cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 645cabdff1aSopenharmony_ci; void ff_pred8x8_dc_8_mmxext(uint8_t *src, ptrdiff_t stride) 646cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 647cabdff1aSopenharmony_ci 648cabdff1aSopenharmony_ciINIT_MMX mmxext 649cabdff1aSopenharmony_cicglobal pred8x8_dc_8, 2,5 650cabdff1aSopenharmony_ci sub r0, r1 651cabdff1aSopenharmony_ci pxor m7, m7 652cabdff1aSopenharmony_ci movd m0, [r0+0] 653cabdff1aSopenharmony_ci movd m1, [r0+4] 654cabdff1aSopenharmony_ci psadbw m0, m7 ; s0 655cabdff1aSopenharmony_ci mov r4, r0 656cabdff1aSopenharmony_ci psadbw m1, m7 ; s1 657cabdff1aSopenharmony_ci 658cabdff1aSopenharmony_ci movzx r2d, byte [r0+r1*1-1] 659cabdff1aSopenharmony_ci movzx r3d, byte [r0+r1*2-1] 660cabdff1aSopenharmony_ci lea r0, [r0+r1*2] 661cabdff1aSopenharmony_ci add r2d, r3d 662cabdff1aSopenharmony_ci movzx r3d, byte [r0+r1*1-1] 663cabdff1aSopenharmony_ci add r2d, r3d 664cabdff1aSopenharmony_ci movzx r3d, byte [r0+r1*2-1] 665cabdff1aSopenharmony_ci add r2d, r3d 666cabdff1aSopenharmony_ci lea r0, [r0+r1*2] 667cabdff1aSopenharmony_ci movd m2, r2d ; s2 668cabdff1aSopenharmony_ci movzx r2d, byte [r0+r1*1-1] 669cabdff1aSopenharmony_ci movzx r3d, byte [r0+r1*2-1] 670cabdff1aSopenharmony_ci lea r0, [r0+r1*2] 671cabdff1aSopenharmony_ci add r2d, r3d 672cabdff1aSopenharmony_ci movzx r3d, byte [r0+r1*1-1] 673cabdff1aSopenharmony_ci add r2d, r3d 674cabdff1aSopenharmony_ci movzx r3d, byte [r0+r1*2-1] 675cabdff1aSopenharmony_ci add r2d, r3d 676cabdff1aSopenharmony_ci movd m3, r2d ; s3 677cabdff1aSopenharmony_ci 678cabdff1aSopenharmony_ci punpcklwd m0, m1 679cabdff1aSopenharmony_ci mov r0, r4 680cabdff1aSopenharmony_ci punpcklwd m2, m3 681cabdff1aSopenharmony_ci punpckldq m0, m2 ; s0, s1, s2, s3 682cabdff1aSopenharmony_ci pshufw m3, m0, 11110110b ; s2, s1, s3, s3 683cabdff1aSopenharmony_ci lea r2, [r0+r1*2] 684cabdff1aSopenharmony_ci pshufw m0, m0, 01110100b ; s0, s1, s3, s1 685cabdff1aSopenharmony_ci paddw m0, m3 686cabdff1aSopenharmony_ci lea r3, [r2+r1*2] 687cabdff1aSopenharmony_ci psrlw m0, 2 688cabdff1aSopenharmony_ci pavgw m0, m7 ; s0+s2, s1, s3, s1+s3 689cabdff1aSopenharmony_ci lea r4, [r3+r1*2] 690cabdff1aSopenharmony_ci packuswb m0, m0 691cabdff1aSopenharmony_ci punpcklbw m0, m0 692cabdff1aSopenharmony_ci movq m1, m0 693cabdff1aSopenharmony_ci punpcklbw m0, m0 694cabdff1aSopenharmony_ci punpckhbw m1, m1 695cabdff1aSopenharmony_ci movq [r0+r1*1], m0 696cabdff1aSopenharmony_ci movq [r0+r1*2], m0 697cabdff1aSopenharmony_ci movq [r2+r1*1], m0 698cabdff1aSopenharmony_ci movq [r2+r1*2], m0 699cabdff1aSopenharmony_ci movq [r3+r1*1], m1 700cabdff1aSopenharmony_ci movq [r3+r1*2], m1 701cabdff1aSopenharmony_ci movq [r4+r1*1], m1 702cabdff1aSopenharmony_ci movq [r4+r1*2], m1 703cabdff1aSopenharmony_ci RET 704cabdff1aSopenharmony_ci 705cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 706cabdff1aSopenharmony_ci; void ff_pred8x8_dc_rv40_8(uint8_t *src, ptrdiff_t stride) 707cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 708cabdff1aSopenharmony_ci 709cabdff1aSopenharmony_ciINIT_MMX mmxext 710cabdff1aSopenharmony_cicglobal pred8x8_dc_rv40_8, 2,7 711cabdff1aSopenharmony_ci mov r4, r0 712cabdff1aSopenharmony_ci sub r0, r1 713cabdff1aSopenharmony_ci pxor mm0, mm0 714cabdff1aSopenharmony_ci psadbw mm0, [r0] 715cabdff1aSopenharmony_ci dec r0 716cabdff1aSopenharmony_ci movzx r5d, byte [r0+r1*1] 717cabdff1aSopenharmony_ci movd r6d, mm0 718cabdff1aSopenharmony_ci lea r0, [r0+r1*2] 719cabdff1aSopenharmony_ci%rep 3 720cabdff1aSopenharmony_ci movzx r2d, byte [r0+r1*0] 721cabdff1aSopenharmony_ci movzx r3d, byte [r0+r1*1] 722cabdff1aSopenharmony_ci add r5d, r2d 723cabdff1aSopenharmony_ci add r6d, r3d 724cabdff1aSopenharmony_ci lea r0, [r0+r1*2] 725cabdff1aSopenharmony_ci%endrep 726cabdff1aSopenharmony_ci movzx r2d, byte [r0+r1*0] 727cabdff1aSopenharmony_ci add r5d, r6d 728cabdff1aSopenharmony_ci lea r2d, [r2+r5+8] 729cabdff1aSopenharmony_ci shr r2d, 4 730cabdff1aSopenharmony_ci movd mm0, r2d 731cabdff1aSopenharmony_ci punpcklbw mm0, mm0 732cabdff1aSopenharmony_ci pshufw mm0, mm0, 0 733cabdff1aSopenharmony_ci mov r3d, 4 734cabdff1aSopenharmony_ci.loop: 735cabdff1aSopenharmony_ci movq [r4+r1*0], mm0 736cabdff1aSopenharmony_ci movq [r4+r1*1], mm0 737cabdff1aSopenharmony_ci lea r4, [r4+r1*2] 738cabdff1aSopenharmony_ci dec r3d 739cabdff1aSopenharmony_ci jg .loop 740cabdff1aSopenharmony_ci REP_RET 741cabdff1aSopenharmony_ci 742cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 743cabdff1aSopenharmony_ci; void ff_pred8x8_tm_vp8_8(uint8_t *src, ptrdiff_t stride) 744cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 745cabdff1aSopenharmony_ci 746cabdff1aSopenharmony_ciINIT_XMM sse2 747cabdff1aSopenharmony_cicglobal pred8x8_tm_vp8_8, 2,6,4 748cabdff1aSopenharmony_ci sub r0, r1 749cabdff1aSopenharmony_ci pxor xmm1, xmm1 750cabdff1aSopenharmony_ci movq xmm0, [r0] 751cabdff1aSopenharmony_ci punpcklbw xmm0, xmm1 752cabdff1aSopenharmony_ci movzx r4d, byte [r0-1] 753cabdff1aSopenharmony_ci mov r5d, 4 754cabdff1aSopenharmony_ci.loop: 755cabdff1aSopenharmony_ci movzx r2d, byte [r0+r1*1-1] 756cabdff1aSopenharmony_ci movzx r3d, byte [r0+r1*2-1] 757cabdff1aSopenharmony_ci sub r2d, r4d 758cabdff1aSopenharmony_ci sub r3d, r4d 759cabdff1aSopenharmony_ci movd xmm2, r2d 760cabdff1aSopenharmony_ci movd xmm3, r3d 761cabdff1aSopenharmony_ci pshuflw xmm2, xmm2, 0 762cabdff1aSopenharmony_ci pshuflw xmm3, xmm3, 0 763cabdff1aSopenharmony_ci punpcklqdq xmm2, xmm2 764cabdff1aSopenharmony_ci punpcklqdq xmm3, xmm3 765cabdff1aSopenharmony_ci paddw xmm2, xmm0 766cabdff1aSopenharmony_ci paddw xmm3, xmm0 767cabdff1aSopenharmony_ci packuswb xmm2, xmm3 768cabdff1aSopenharmony_ci movq [r0+r1*1], xmm2 769cabdff1aSopenharmony_ci movhps [r0+r1*2], xmm2 770cabdff1aSopenharmony_ci lea r0, [r0+r1*2] 771cabdff1aSopenharmony_ci dec r5d 772cabdff1aSopenharmony_ci jg .loop 773cabdff1aSopenharmony_ci REP_RET 774cabdff1aSopenharmony_ci 775cabdff1aSopenharmony_ciINIT_XMM ssse3 776cabdff1aSopenharmony_cicglobal pred8x8_tm_vp8_8, 2,3,6 777cabdff1aSopenharmony_ci sub r0, r1 778cabdff1aSopenharmony_ci movdqa xmm4, [tm_shuf] 779cabdff1aSopenharmony_ci pxor xmm1, xmm1 780cabdff1aSopenharmony_ci movq xmm0, [r0] 781cabdff1aSopenharmony_ci punpcklbw xmm0, xmm1 782cabdff1aSopenharmony_ci movd xmm5, [r0-4] 783cabdff1aSopenharmony_ci pshufb xmm5, xmm4 784cabdff1aSopenharmony_ci mov r2d, 4 785cabdff1aSopenharmony_ci.loop: 786cabdff1aSopenharmony_ci movd xmm2, [r0+r1*1-4] 787cabdff1aSopenharmony_ci movd xmm3, [r0+r1*2-4] 788cabdff1aSopenharmony_ci pshufb xmm2, xmm4 789cabdff1aSopenharmony_ci pshufb xmm3, xmm4 790cabdff1aSopenharmony_ci psubw xmm2, xmm5 791cabdff1aSopenharmony_ci psubw xmm3, xmm5 792cabdff1aSopenharmony_ci paddw xmm2, xmm0 793cabdff1aSopenharmony_ci paddw xmm3, xmm0 794cabdff1aSopenharmony_ci packuswb xmm2, xmm3 795cabdff1aSopenharmony_ci movq [r0+r1*1], xmm2 796cabdff1aSopenharmony_ci movhps [r0+r1*2], xmm2 797cabdff1aSopenharmony_ci lea r0, [r0+r1*2] 798cabdff1aSopenharmony_ci dec r2d 799cabdff1aSopenharmony_ci jg .loop 800cabdff1aSopenharmony_ci REP_RET 801cabdff1aSopenharmony_ci 802cabdff1aSopenharmony_ci; dest, left, right, src, tmp 803cabdff1aSopenharmony_ci; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2 804cabdff1aSopenharmony_ci%macro PRED4x4_LOWPASS 5 805cabdff1aSopenharmony_ci mova %5, %2 806cabdff1aSopenharmony_ci pavgb %2, %3 807cabdff1aSopenharmony_ci pxor %3, %5 808cabdff1aSopenharmony_ci mova %1, %4 809cabdff1aSopenharmony_ci pand %3, [pb_1] 810cabdff1aSopenharmony_ci psubusb %2, %3 811cabdff1aSopenharmony_ci pavgb %1, %2 812cabdff1aSopenharmony_ci%endmacro 813cabdff1aSopenharmony_ci 814cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 815cabdff1aSopenharmony_ci; void ff_pred8x8l_top_dc_8(uint8_t *src, int has_topleft, int has_topright, 816cabdff1aSopenharmony_ci; ptrdiff_t stride) 817cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 818cabdff1aSopenharmony_ci%macro PRED8x8L_TOP_DC 0 819cabdff1aSopenharmony_cicglobal pred8x8l_top_dc_8, 4,4 820cabdff1aSopenharmony_ci sub r0, r3 821cabdff1aSopenharmony_ci pxor mm7, mm7 822cabdff1aSopenharmony_ci movq mm0, [r0-8] 823cabdff1aSopenharmony_ci movq mm3, [r0] 824cabdff1aSopenharmony_ci movq mm1, [r0+8] 825cabdff1aSopenharmony_ci movq mm2, mm3 826cabdff1aSopenharmony_ci movq mm4, mm3 827cabdff1aSopenharmony_ci PALIGNR mm2, mm0, 7, mm0 828cabdff1aSopenharmony_ci PALIGNR mm1, mm4, 1, mm4 829cabdff1aSopenharmony_ci test r1d, r1d ; top_left 830cabdff1aSopenharmony_ci jz .fix_lt_2 831cabdff1aSopenharmony_ci test r2d, r2d ; top_right 832cabdff1aSopenharmony_ci jz .fix_tr_1 833cabdff1aSopenharmony_ci jmp .body 834cabdff1aSopenharmony_ci.fix_lt_2: 835cabdff1aSopenharmony_ci movq mm5, mm3 836cabdff1aSopenharmony_ci pxor mm5, mm2 837cabdff1aSopenharmony_ci psllq mm5, 56 838cabdff1aSopenharmony_ci psrlq mm5, 56 839cabdff1aSopenharmony_ci pxor mm2, mm5 840cabdff1aSopenharmony_ci test r2d, r2d ; top_right 841cabdff1aSopenharmony_ci jnz .body 842cabdff1aSopenharmony_ci.fix_tr_1: 843cabdff1aSopenharmony_ci movq mm5, mm3 844cabdff1aSopenharmony_ci pxor mm5, mm1 845cabdff1aSopenharmony_ci psrlq mm5, 56 846cabdff1aSopenharmony_ci psllq mm5, 56 847cabdff1aSopenharmony_ci pxor mm1, mm5 848cabdff1aSopenharmony_ci.body: 849cabdff1aSopenharmony_ci PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5 850cabdff1aSopenharmony_ci psadbw mm7, mm0 851cabdff1aSopenharmony_ci paddw mm7, [pw_4] 852cabdff1aSopenharmony_ci psrlw mm7, 3 853cabdff1aSopenharmony_ci pshufw mm7, mm7, 0 854cabdff1aSopenharmony_ci packuswb mm7, mm7 855cabdff1aSopenharmony_ci%rep 3 856cabdff1aSopenharmony_ci movq [r0+r3*1], mm7 857cabdff1aSopenharmony_ci movq [r0+r3*2], mm7 858cabdff1aSopenharmony_ci lea r0, [r0+r3*2] 859cabdff1aSopenharmony_ci%endrep 860cabdff1aSopenharmony_ci movq [r0+r3*1], mm7 861cabdff1aSopenharmony_ci movq [r0+r3*2], mm7 862cabdff1aSopenharmony_ci RET 863cabdff1aSopenharmony_ci%endmacro 864cabdff1aSopenharmony_ci 865cabdff1aSopenharmony_ciINIT_MMX mmxext 866cabdff1aSopenharmony_ciPRED8x8L_TOP_DC 867cabdff1aSopenharmony_ciINIT_MMX ssse3 868cabdff1aSopenharmony_ciPRED8x8L_TOP_DC 869cabdff1aSopenharmony_ci 870cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 871cabdff1aSopenharmony_ci; void ff_pred8x8l_dc_8(uint8_t *src, int has_topleft, int has_topright, 872cabdff1aSopenharmony_ci; ptrdiff_t stride) 873cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 874cabdff1aSopenharmony_ci 875cabdff1aSopenharmony_ci%macro PRED8x8L_DC 0 876cabdff1aSopenharmony_cicglobal pred8x8l_dc_8, 4,5 877cabdff1aSopenharmony_ci sub r0, r3 878cabdff1aSopenharmony_ci lea r4, [r0+r3*2] 879cabdff1aSopenharmony_ci movq mm0, [r0+r3*1-8] 880cabdff1aSopenharmony_ci punpckhbw mm0, [r0+r3*0-8] 881cabdff1aSopenharmony_ci movq mm1, [r4+r3*1-8] 882cabdff1aSopenharmony_ci punpckhbw mm1, [r0+r3*2-8] 883cabdff1aSopenharmony_ci mov r4, r0 884cabdff1aSopenharmony_ci punpckhwd mm1, mm0 885cabdff1aSopenharmony_ci lea r0, [r0+r3*4] 886cabdff1aSopenharmony_ci movq mm2, [r0+r3*1-8] 887cabdff1aSopenharmony_ci punpckhbw mm2, [r0+r3*0-8] 888cabdff1aSopenharmony_ci lea r0, [r0+r3*2] 889cabdff1aSopenharmony_ci movq mm3, [r0+r3*1-8] 890cabdff1aSopenharmony_ci punpckhbw mm3, [r0+r3*0-8] 891cabdff1aSopenharmony_ci punpckhwd mm3, mm2 892cabdff1aSopenharmony_ci punpckhdq mm3, mm1 893cabdff1aSopenharmony_ci lea r0, [r0+r3*2] 894cabdff1aSopenharmony_ci movq mm0, [r0+r3*0-8] 895cabdff1aSopenharmony_ci movq mm1, [r4] 896cabdff1aSopenharmony_ci mov r0, r4 897cabdff1aSopenharmony_ci movq mm4, mm3 898cabdff1aSopenharmony_ci movq mm2, mm3 899cabdff1aSopenharmony_ci PALIGNR mm4, mm0, 7, mm0 900cabdff1aSopenharmony_ci PALIGNR mm1, mm2, 1, mm2 901cabdff1aSopenharmony_ci test r1d, r1d 902cabdff1aSopenharmony_ci jnz .do_left 903cabdff1aSopenharmony_ci.fix_lt_1: 904cabdff1aSopenharmony_ci movq mm5, mm3 905cabdff1aSopenharmony_ci pxor mm5, mm4 906cabdff1aSopenharmony_ci psrlq mm5, 56 907cabdff1aSopenharmony_ci psllq mm5, 48 908cabdff1aSopenharmony_ci pxor mm1, mm5 909cabdff1aSopenharmony_ci jmp .do_left 910cabdff1aSopenharmony_ci.fix_lt_2: 911cabdff1aSopenharmony_ci movq mm5, mm3 912cabdff1aSopenharmony_ci pxor mm5, mm2 913cabdff1aSopenharmony_ci psllq mm5, 56 914cabdff1aSopenharmony_ci psrlq mm5, 56 915cabdff1aSopenharmony_ci pxor mm2, mm5 916cabdff1aSopenharmony_ci test r2d, r2d 917cabdff1aSopenharmony_ci jnz .body 918cabdff1aSopenharmony_ci.fix_tr_1: 919cabdff1aSopenharmony_ci movq mm5, mm3 920cabdff1aSopenharmony_ci pxor mm5, mm1 921cabdff1aSopenharmony_ci psrlq mm5, 56 922cabdff1aSopenharmony_ci psllq mm5, 56 923cabdff1aSopenharmony_ci pxor mm1, mm5 924cabdff1aSopenharmony_ci jmp .body 925cabdff1aSopenharmony_ci.do_left: 926cabdff1aSopenharmony_ci movq mm0, mm4 927cabdff1aSopenharmony_ci PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 928cabdff1aSopenharmony_ci movq mm4, mm0 929cabdff1aSopenharmony_ci movq mm7, mm2 930cabdff1aSopenharmony_ci PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 931cabdff1aSopenharmony_ci psllq mm1, 56 932cabdff1aSopenharmony_ci PALIGNR mm7, mm1, 7, mm3 933cabdff1aSopenharmony_ci movq mm0, [r0-8] 934cabdff1aSopenharmony_ci movq mm3, [r0] 935cabdff1aSopenharmony_ci movq mm1, [r0+8] 936cabdff1aSopenharmony_ci movq mm2, mm3 937cabdff1aSopenharmony_ci movq mm4, mm3 938cabdff1aSopenharmony_ci PALIGNR mm2, mm0, 7, mm0 939cabdff1aSopenharmony_ci PALIGNR mm1, mm4, 1, mm4 940cabdff1aSopenharmony_ci test r1d, r1d 941cabdff1aSopenharmony_ci jz .fix_lt_2 942cabdff1aSopenharmony_ci test r2d, r2d 943cabdff1aSopenharmony_ci jz .fix_tr_1 944cabdff1aSopenharmony_ci.body: 945cabdff1aSopenharmony_ci lea r1, [r0+r3*2] 946cabdff1aSopenharmony_ci PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5 947cabdff1aSopenharmony_ci pxor mm0, mm0 948cabdff1aSopenharmony_ci pxor mm1, mm1 949cabdff1aSopenharmony_ci lea r2, [r1+r3*2] 950cabdff1aSopenharmony_ci psadbw mm0, mm7 951cabdff1aSopenharmony_ci psadbw mm1, mm6 952cabdff1aSopenharmony_ci paddw mm0, [pw_8] 953cabdff1aSopenharmony_ci paddw mm0, mm1 954cabdff1aSopenharmony_ci lea r4, [r2+r3*2] 955cabdff1aSopenharmony_ci psrlw mm0, 4 956cabdff1aSopenharmony_ci pshufw mm0, mm0, 0 957cabdff1aSopenharmony_ci packuswb mm0, mm0 958cabdff1aSopenharmony_ci movq [r0+r3*1], mm0 959cabdff1aSopenharmony_ci movq [r0+r3*2], mm0 960cabdff1aSopenharmony_ci movq [r1+r3*1], mm0 961cabdff1aSopenharmony_ci movq [r1+r3*2], mm0 962cabdff1aSopenharmony_ci movq [r2+r3*1], mm0 963cabdff1aSopenharmony_ci movq [r2+r3*2], mm0 964cabdff1aSopenharmony_ci movq [r4+r3*1], mm0 965cabdff1aSopenharmony_ci movq [r4+r3*2], mm0 966cabdff1aSopenharmony_ci RET 967cabdff1aSopenharmony_ci%endmacro 968cabdff1aSopenharmony_ci 969cabdff1aSopenharmony_ciINIT_MMX mmxext 970cabdff1aSopenharmony_ciPRED8x8L_DC 971cabdff1aSopenharmony_ciINIT_MMX ssse3 972cabdff1aSopenharmony_ciPRED8x8L_DC 973cabdff1aSopenharmony_ci 974cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 975cabdff1aSopenharmony_ci; void ff_pred8x8l_horizontal_8(uint8_t *src, int has_topleft, 976cabdff1aSopenharmony_ci; int has_topright, ptrdiff_t stride) 977cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 978cabdff1aSopenharmony_ci 979cabdff1aSopenharmony_ci%macro PRED8x8L_HORIZONTAL 0 980cabdff1aSopenharmony_cicglobal pred8x8l_horizontal_8, 4,4 981cabdff1aSopenharmony_ci sub r0, r3 982cabdff1aSopenharmony_ci lea r2, [r0+r3*2] 983cabdff1aSopenharmony_ci movq mm0, [r0+r3*1-8] 984cabdff1aSopenharmony_ci test r1d, r1d 985cabdff1aSopenharmony_ci lea r1, [r0+r3] 986cabdff1aSopenharmony_ci cmovnz r1, r0 987cabdff1aSopenharmony_ci punpckhbw mm0, [r1+r3*0-8] 988cabdff1aSopenharmony_ci movq mm1, [r2+r3*1-8] 989cabdff1aSopenharmony_ci punpckhbw mm1, [r0+r3*2-8] 990cabdff1aSopenharmony_ci mov r2, r0 991cabdff1aSopenharmony_ci punpckhwd mm1, mm0 992cabdff1aSopenharmony_ci lea r0, [r0+r3*4] 993cabdff1aSopenharmony_ci movq mm2, [r0+r3*1-8] 994cabdff1aSopenharmony_ci punpckhbw mm2, [r0+r3*0-8] 995cabdff1aSopenharmony_ci lea r0, [r0+r3*2] 996cabdff1aSopenharmony_ci movq mm3, [r0+r3*1-8] 997cabdff1aSopenharmony_ci punpckhbw mm3, [r0+r3*0-8] 998cabdff1aSopenharmony_ci punpckhwd mm3, mm2 999cabdff1aSopenharmony_ci punpckhdq mm3, mm1 1000cabdff1aSopenharmony_ci lea r0, [r0+r3*2] 1001cabdff1aSopenharmony_ci movq mm0, [r0+r3*0-8] 1002cabdff1aSopenharmony_ci movq mm1, [r1+r3*0-8] 1003cabdff1aSopenharmony_ci mov r0, r2 1004cabdff1aSopenharmony_ci movq mm4, mm3 1005cabdff1aSopenharmony_ci movq mm2, mm3 1006cabdff1aSopenharmony_ci PALIGNR mm4, mm0, 7, mm0 1007cabdff1aSopenharmony_ci PALIGNR mm1, mm2, 1, mm2 1008cabdff1aSopenharmony_ci movq mm0, mm4 1009cabdff1aSopenharmony_ci PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 1010cabdff1aSopenharmony_ci movq mm4, mm0 1011cabdff1aSopenharmony_ci movq mm7, mm2 1012cabdff1aSopenharmony_ci PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 1013cabdff1aSopenharmony_ci psllq mm1, 56 1014cabdff1aSopenharmony_ci PALIGNR mm7, mm1, 7, mm3 1015cabdff1aSopenharmony_ci movq mm3, mm7 1016cabdff1aSopenharmony_ci lea r1, [r0+r3*2] 1017cabdff1aSopenharmony_ci movq mm7, mm3 1018cabdff1aSopenharmony_ci punpckhbw mm3, mm3 1019cabdff1aSopenharmony_ci punpcklbw mm7, mm7 1020cabdff1aSopenharmony_ci pshufw mm0, mm3, 0xff 1021cabdff1aSopenharmony_ci pshufw mm1, mm3, 0xaa 1022cabdff1aSopenharmony_ci lea r2, [r1+r3*2] 1023cabdff1aSopenharmony_ci pshufw mm2, mm3, 0x55 1024cabdff1aSopenharmony_ci pshufw mm3, mm3, 0x00 1025cabdff1aSopenharmony_ci pshufw mm4, mm7, 0xff 1026cabdff1aSopenharmony_ci pshufw mm5, mm7, 0xaa 1027cabdff1aSopenharmony_ci pshufw mm6, mm7, 0x55 1028cabdff1aSopenharmony_ci pshufw mm7, mm7, 0x00 1029cabdff1aSopenharmony_ci movq [r0+r3*1], mm0 1030cabdff1aSopenharmony_ci movq [r0+r3*2], mm1 1031cabdff1aSopenharmony_ci movq [r1+r3*1], mm2 1032cabdff1aSopenharmony_ci movq [r1+r3*2], mm3 1033cabdff1aSopenharmony_ci movq [r2+r3*1], mm4 1034cabdff1aSopenharmony_ci movq [r2+r3*2], mm5 1035cabdff1aSopenharmony_ci lea r0, [r2+r3*2] 1036cabdff1aSopenharmony_ci movq [r0+r3*1], mm6 1037cabdff1aSopenharmony_ci movq [r0+r3*2], mm7 1038cabdff1aSopenharmony_ci RET 1039cabdff1aSopenharmony_ci%endmacro 1040cabdff1aSopenharmony_ci 1041cabdff1aSopenharmony_ciINIT_MMX mmxext 1042cabdff1aSopenharmony_ciPRED8x8L_HORIZONTAL 1043cabdff1aSopenharmony_ciINIT_MMX ssse3 1044cabdff1aSopenharmony_ciPRED8x8L_HORIZONTAL 1045cabdff1aSopenharmony_ci 1046cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 1047cabdff1aSopenharmony_ci; void ff_pred8x8l_vertical_8(uint8_t *src, int has_topleft, int has_topright, 1048cabdff1aSopenharmony_ci; ptrdiff_t stride) 1049cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 1050cabdff1aSopenharmony_ci 1051cabdff1aSopenharmony_ci%macro PRED8x8L_VERTICAL 0 1052cabdff1aSopenharmony_cicglobal pred8x8l_vertical_8, 4,4 1053cabdff1aSopenharmony_ci sub r0, r3 1054cabdff1aSopenharmony_ci movq mm0, [r0-8] 1055cabdff1aSopenharmony_ci movq mm3, [r0] 1056cabdff1aSopenharmony_ci movq mm1, [r0+8] 1057cabdff1aSopenharmony_ci movq mm2, mm3 1058cabdff1aSopenharmony_ci movq mm4, mm3 1059cabdff1aSopenharmony_ci PALIGNR mm2, mm0, 7, mm0 1060cabdff1aSopenharmony_ci PALIGNR mm1, mm4, 1, mm4 1061cabdff1aSopenharmony_ci test r1d, r1d ; top_left 1062cabdff1aSopenharmony_ci jz .fix_lt_2 1063cabdff1aSopenharmony_ci test r2d, r2d ; top_right 1064cabdff1aSopenharmony_ci jz .fix_tr_1 1065cabdff1aSopenharmony_ci jmp .body 1066cabdff1aSopenharmony_ci.fix_lt_2: 1067cabdff1aSopenharmony_ci movq mm5, mm3 1068cabdff1aSopenharmony_ci pxor mm5, mm2 1069cabdff1aSopenharmony_ci psllq mm5, 56 1070cabdff1aSopenharmony_ci psrlq mm5, 56 1071cabdff1aSopenharmony_ci pxor mm2, mm5 1072cabdff1aSopenharmony_ci test r2d, r2d ; top_right 1073cabdff1aSopenharmony_ci jnz .body 1074cabdff1aSopenharmony_ci.fix_tr_1: 1075cabdff1aSopenharmony_ci movq mm5, mm3 1076cabdff1aSopenharmony_ci pxor mm5, mm1 1077cabdff1aSopenharmony_ci psrlq mm5, 56 1078cabdff1aSopenharmony_ci psllq mm5, 56 1079cabdff1aSopenharmony_ci pxor mm1, mm5 1080cabdff1aSopenharmony_ci.body: 1081cabdff1aSopenharmony_ci PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5 1082cabdff1aSopenharmony_ci%rep 3 1083cabdff1aSopenharmony_ci movq [r0+r3*1], mm0 1084cabdff1aSopenharmony_ci movq [r0+r3*2], mm0 1085cabdff1aSopenharmony_ci lea r0, [r0+r3*2] 1086cabdff1aSopenharmony_ci%endrep 1087cabdff1aSopenharmony_ci movq [r0+r3*1], mm0 1088cabdff1aSopenharmony_ci movq [r0+r3*2], mm0 1089cabdff1aSopenharmony_ci RET 1090cabdff1aSopenharmony_ci%endmacro 1091cabdff1aSopenharmony_ci 1092cabdff1aSopenharmony_ciINIT_MMX mmxext 1093cabdff1aSopenharmony_ciPRED8x8L_VERTICAL 1094cabdff1aSopenharmony_ciINIT_MMX ssse3 1095cabdff1aSopenharmony_ciPRED8x8L_VERTICAL 1096cabdff1aSopenharmony_ci 1097cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 1098cabdff1aSopenharmony_ci; void ff_pred8x8l_down_left_8(uint8_t *src, int has_topleft, 1099cabdff1aSopenharmony_ci; int has_topright, ptrdiff_t stride) 1100cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 1101cabdff1aSopenharmony_ci 1102cabdff1aSopenharmony_ci%macro PRED8x8L_DOWN_LEFT 0 1103cabdff1aSopenharmony_cicglobal pred8x8l_down_left_8, 4,4 1104cabdff1aSopenharmony_ci sub r0, r3 1105cabdff1aSopenharmony_ci movq mm0, [r0-8] 1106cabdff1aSopenharmony_ci movq mm3, [r0] 1107cabdff1aSopenharmony_ci movq mm1, [r0+8] 1108cabdff1aSopenharmony_ci movq mm2, mm3 1109cabdff1aSopenharmony_ci movq mm4, mm3 1110cabdff1aSopenharmony_ci PALIGNR mm2, mm0, 7, mm0 1111cabdff1aSopenharmony_ci PALIGNR mm1, mm4, 1, mm4 1112cabdff1aSopenharmony_ci test r1d, r1d ; top_left 1113cabdff1aSopenharmony_ci jz .fix_lt_2 1114cabdff1aSopenharmony_ci test r2d, r2d ; top_right 1115cabdff1aSopenharmony_ci jz .fix_tr_1 1116cabdff1aSopenharmony_ci jmp .do_top 1117cabdff1aSopenharmony_ci.fix_lt_2: 1118cabdff1aSopenharmony_ci movq mm5, mm3 1119cabdff1aSopenharmony_ci pxor mm5, mm2 1120cabdff1aSopenharmony_ci psllq mm5, 56 1121cabdff1aSopenharmony_ci psrlq mm5, 56 1122cabdff1aSopenharmony_ci pxor mm2, mm5 1123cabdff1aSopenharmony_ci test r2d, r2d ; top_right 1124cabdff1aSopenharmony_ci jnz .do_top 1125cabdff1aSopenharmony_ci.fix_tr_1: 1126cabdff1aSopenharmony_ci movq mm5, mm3 1127cabdff1aSopenharmony_ci pxor mm5, mm1 1128cabdff1aSopenharmony_ci psrlq mm5, 56 1129cabdff1aSopenharmony_ci psllq mm5, 56 1130cabdff1aSopenharmony_ci pxor mm1, mm5 1131cabdff1aSopenharmony_ci jmp .do_top 1132cabdff1aSopenharmony_ci.fix_tr_2: 1133cabdff1aSopenharmony_ci punpckhbw mm3, mm3 1134cabdff1aSopenharmony_ci pshufw mm1, mm3, 0xFF 1135cabdff1aSopenharmony_ci jmp .do_topright 1136cabdff1aSopenharmony_ci.do_top: 1137cabdff1aSopenharmony_ci PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 1138cabdff1aSopenharmony_ci movq2dq xmm3, mm4 1139cabdff1aSopenharmony_ci test r2d, r2d ; top_right 1140cabdff1aSopenharmony_ci jz .fix_tr_2 1141cabdff1aSopenharmony_ci movq mm0, [r0+8] 1142cabdff1aSopenharmony_ci movq mm5, mm0 1143cabdff1aSopenharmony_ci movq mm2, mm0 1144cabdff1aSopenharmony_ci movq mm4, mm0 1145cabdff1aSopenharmony_ci psrlq mm5, 56 1146cabdff1aSopenharmony_ci PALIGNR mm2, mm3, 7, mm3 1147cabdff1aSopenharmony_ci PALIGNR mm5, mm4, 1, mm4 1148cabdff1aSopenharmony_ci PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4 1149cabdff1aSopenharmony_ci.do_topright: 1150cabdff1aSopenharmony_ci movq2dq xmm4, mm1 1151cabdff1aSopenharmony_ci psrlq mm1, 56 1152cabdff1aSopenharmony_ci movq2dq xmm5, mm1 1153cabdff1aSopenharmony_ci lea r1, [r0+r3*2] 1154cabdff1aSopenharmony_ci pslldq xmm4, 8 1155cabdff1aSopenharmony_ci por xmm3, xmm4 1156cabdff1aSopenharmony_ci movdqa xmm2, xmm3 1157cabdff1aSopenharmony_ci psrldq xmm2, 1 1158cabdff1aSopenharmony_ci pslldq xmm5, 15 1159cabdff1aSopenharmony_ci por xmm2, xmm5 1160cabdff1aSopenharmony_ci lea r2, [r1+r3*2] 1161cabdff1aSopenharmony_ci movdqa xmm1, xmm3 1162cabdff1aSopenharmony_ci pslldq xmm1, 1 1163cabdff1aSopenharmony_ciINIT_XMM cpuname 1164cabdff1aSopenharmony_ci PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4 1165cabdff1aSopenharmony_ci psrldq xmm0, 1 1166cabdff1aSopenharmony_ci movq [r0+r3*1], xmm0 1167cabdff1aSopenharmony_ci psrldq xmm0, 1 1168cabdff1aSopenharmony_ci movq [r0+r3*2], xmm0 1169cabdff1aSopenharmony_ci psrldq xmm0, 1 1170cabdff1aSopenharmony_ci lea r0, [r2+r3*2] 1171cabdff1aSopenharmony_ci movq [r1+r3*1], xmm0 1172cabdff1aSopenharmony_ci psrldq xmm0, 1 1173cabdff1aSopenharmony_ci movq [r1+r3*2], xmm0 1174cabdff1aSopenharmony_ci psrldq xmm0, 1 1175cabdff1aSopenharmony_ci movq [r2+r3*1], xmm0 1176cabdff1aSopenharmony_ci psrldq xmm0, 1 1177cabdff1aSopenharmony_ci movq [r2+r3*2], xmm0 1178cabdff1aSopenharmony_ci psrldq xmm0, 1 1179cabdff1aSopenharmony_ci movq [r0+r3*1], xmm0 1180cabdff1aSopenharmony_ci psrldq xmm0, 1 1181cabdff1aSopenharmony_ci movq [r0+r3*2], xmm0 1182cabdff1aSopenharmony_ci RET 1183cabdff1aSopenharmony_ci%endmacro 1184cabdff1aSopenharmony_ci 1185cabdff1aSopenharmony_ciINIT_MMX sse2 1186cabdff1aSopenharmony_ciPRED8x8L_DOWN_LEFT 1187cabdff1aSopenharmony_ciINIT_MMX ssse3 1188cabdff1aSopenharmony_ciPRED8x8L_DOWN_LEFT 1189cabdff1aSopenharmony_ci 1190cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 1191cabdff1aSopenharmony_ci; void ff_pred8x8l_down_right_8(uint8_t *src, int has_topleft, 1192cabdff1aSopenharmony_ci; int has_topright, ptrdiff_t stride) 1193cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 1194cabdff1aSopenharmony_ci 1195cabdff1aSopenharmony_ci%macro PRED8x8L_DOWN_RIGHT 0 1196cabdff1aSopenharmony_cicglobal pred8x8l_down_right_8, 4,5 1197cabdff1aSopenharmony_ci sub r0, r3 1198cabdff1aSopenharmony_ci lea r4, [r0+r3*2] 1199cabdff1aSopenharmony_ci movq mm0, [r0+r3*1-8] 1200cabdff1aSopenharmony_ci punpckhbw mm0, [r0+r3*0-8] 1201cabdff1aSopenharmony_ci movq mm1, [r4+r3*1-8] 1202cabdff1aSopenharmony_ci punpckhbw mm1, [r0+r3*2-8] 1203cabdff1aSopenharmony_ci mov r4, r0 1204cabdff1aSopenharmony_ci punpckhwd mm1, mm0 1205cabdff1aSopenharmony_ci lea r0, [r0+r3*4] 1206cabdff1aSopenharmony_ci movq mm2, [r0+r3*1-8] 1207cabdff1aSopenharmony_ci punpckhbw mm2, [r0+r3*0-8] 1208cabdff1aSopenharmony_ci lea r0, [r0+r3*2] 1209cabdff1aSopenharmony_ci movq mm3, [r0+r3*1-8] 1210cabdff1aSopenharmony_ci punpckhbw mm3, [r0+r3*0-8] 1211cabdff1aSopenharmony_ci punpckhwd mm3, mm2 1212cabdff1aSopenharmony_ci punpckhdq mm3, mm1 1213cabdff1aSopenharmony_ci lea r0, [r0+r3*2] 1214cabdff1aSopenharmony_ci movq mm0, [r0+r3*0-8] 1215cabdff1aSopenharmony_ci movq mm1, [r4] 1216cabdff1aSopenharmony_ci mov r0, r4 1217cabdff1aSopenharmony_ci movq mm4, mm3 1218cabdff1aSopenharmony_ci movq mm2, mm3 1219cabdff1aSopenharmony_ci PALIGNR mm4, mm0, 7, mm0 1220cabdff1aSopenharmony_ci PALIGNR mm1, mm2, 1, mm2 1221cabdff1aSopenharmony_ci test r1d, r1d 1222cabdff1aSopenharmony_ci jz .fix_lt_1 1223cabdff1aSopenharmony_ci jmp .do_left 1224cabdff1aSopenharmony_ci.fix_lt_1: 1225cabdff1aSopenharmony_ci movq mm5, mm3 1226cabdff1aSopenharmony_ci pxor mm5, mm4 1227cabdff1aSopenharmony_ci psrlq mm5, 56 1228cabdff1aSopenharmony_ci psllq mm5, 48 1229cabdff1aSopenharmony_ci pxor mm1, mm5 1230cabdff1aSopenharmony_ci jmp .do_left 1231cabdff1aSopenharmony_ci.fix_lt_2: 1232cabdff1aSopenharmony_ci movq mm5, mm3 1233cabdff1aSopenharmony_ci pxor mm5, mm2 1234cabdff1aSopenharmony_ci psllq mm5, 56 1235cabdff1aSopenharmony_ci psrlq mm5, 56 1236cabdff1aSopenharmony_ci pxor mm2, mm5 1237cabdff1aSopenharmony_ci test r2d, r2d 1238cabdff1aSopenharmony_ci jnz .do_top 1239cabdff1aSopenharmony_ci.fix_tr_1: 1240cabdff1aSopenharmony_ci movq mm5, mm3 1241cabdff1aSopenharmony_ci pxor mm5, mm1 1242cabdff1aSopenharmony_ci psrlq mm5, 56 1243cabdff1aSopenharmony_ci psllq mm5, 56 1244cabdff1aSopenharmony_ci pxor mm1, mm5 1245cabdff1aSopenharmony_ci jmp .do_top 1246cabdff1aSopenharmony_ci.do_left: 1247cabdff1aSopenharmony_ci movq mm0, mm4 1248cabdff1aSopenharmony_ci PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 1249cabdff1aSopenharmony_ci movq mm4, mm0 1250cabdff1aSopenharmony_ci movq mm7, mm2 1251cabdff1aSopenharmony_ci movq2dq xmm3, mm2 1252cabdff1aSopenharmony_ci PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 1253cabdff1aSopenharmony_ci psllq mm1, 56 1254cabdff1aSopenharmony_ci PALIGNR mm7, mm1, 7, mm3 1255cabdff1aSopenharmony_ci movq2dq xmm1, mm7 1256cabdff1aSopenharmony_ci movq mm0, [r0-8] 1257cabdff1aSopenharmony_ci movq mm3, [r0] 1258cabdff1aSopenharmony_ci movq mm1, [r0+8] 1259cabdff1aSopenharmony_ci movq mm2, mm3 1260cabdff1aSopenharmony_ci movq mm4, mm3 1261cabdff1aSopenharmony_ci PALIGNR mm2, mm0, 7, mm0 1262cabdff1aSopenharmony_ci PALIGNR mm1, mm4, 1, mm4 1263cabdff1aSopenharmony_ci test r1d, r1d 1264cabdff1aSopenharmony_ci jz .fix_lt_2 1265cabdff1aSopenharmony_ci test r2d, r2d 1266cabdff1aSopenharmony_ci jz .fix_tr_1 1267cabdff1aSopenharmony_ci.do_top: 1268cabdff1aSopenharmony_ci PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 1269cabdff1aSopenharmony_ci movq2dq xmm4, mm4 1270cabdff1aSopenharmony_ci lea r1, [r0+r3*2] 1271cabdff1aSopenharmony_ci movdqa xmm0, xmm3 1272cabdff1aSopenharmony_ci pslldq xmm4, 8 1273cabdff1aSopenharmony_ci por xmm3, xmm4 1274cabdff1aSopenharmony_ci lea r2, [r1+r3*2] 1275cabdff1aSopenharmony_ci pslldq xmm4, 1 1276cabdff1aSopenharmony_ci por xmm1, xmm4 1277cabdff1aSopenharmony_ci psrldq xmm0, 7 1278cabdff1aSopenharmony_ci pslldq xmm0, 15 1279cabdff1aSopenharmony_ci psrldq xmm0, 7 1280cabdff1aSopenharmony_ci por xmm1, xmm0 1281cabdff1aSopenharmony_ci lea r0, [r2+r3*2] 1282cabdff1aSopenharmony_ci movdqa xmm2, xmm3 1283cabdff1aSopenharmony_ci psrldq xmm2, 1 1284cabdff1aSopenharmony_ciINIT_XMM cpuname 1285cabdff1aSopenharmony_ci PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4 1286cabdff1aSopenharmony_ci movdqa xmm1, xmm0 1287cabdff1aSopenharmony_ci psrldq xmm1, 1 1288cabdff1aSopenharmony_ci movq [r0+r3*2], xmm0 1289cabdff1aSopenharmony_ci movq [r0+r3*1], xmm1 1290cabdff1aSopenharmony_ci psrldq xmm0, 2 1291cabdff1aSopenharmony_ci psrldq xmm1, 2 1292cabdff1aSopenharmony_ci movq [r2+r3*2], xmm0 1293cabdff1aSopenharmony_ci movq [r2+r3*1], xmm1 1294cabdff1aSopenharmony_ci psrldq xmm0, 2 1295cabdff1aSopenharmony_ci psrldq xmm1, 2 1296cabdff1aSopenharmony_ci movq [r1+r3*2], xmm0 1297cabdff1aSopenharmony_ci movq [r1+r3*1], xmm1 1298cabdff1aSopenharmony_ci psrldq xmm0, 2 1299cabdff1aSopenharmony_ci psrldq xmm1, 2 1300cabdff1aSopenharmony_ci movq [r4+r3*2], xmm0 1301cabdff1aSopenharmony_ci movq [r4+r3*1], xmm1 1302cabdff1aSopenharmony_ci RET 1303cabdff1aSopenharmony_ci%endmacro 1304cabdff1aSopenharmony_ci 1305cabdff1aSopenharmony_ciINIT_MMX sse2 1306cabdff1aSopenharmony_ciPRED8x8L_DOWN_RIGHT 1307cabdff1aSopenharmony_ciINIT_MMX ssse3 1308cabdff1aSopenharmony_ciPRED8x8L_DOWN_RIGHT 1309cabdff1aSopenharmony_ci 1310cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 1311cabdff1aSopenharmony_ci; void ff_pred8x8l_vertical_right_8(uint8_t *src, int has_topleft, 1312cabdff1aSopenharmony_ci; int has_topright, ptrdiff_t stride) 1313cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 1314cabdff1aSopenharmony_ci 1315cabdff1aSopenharmony_ci%macro PRED8x8L_VERTICAL_RIGHT 0 1316cabdff1aSopenharmony_cicglobal pred8x8l_vertical_right_8, 4,5,7 1317cabdff1aSopenharmony_ci ; manually spill XMM registers for Win64 because 1318cabdff1aSopenharmony_ci ; the code here is initialized with INIT_MMX 1319cabdff1aSopenharmony_ci WIN64_SPILL_XMM 7 1320cabdff1aSopenharmony_ci sub r0, r3 1321cabdff1aSopenharmony_ci lea r4, [r0+r3*2] 1322cabdff1aSopenharmony_ci movq mm0, [r0+r3*1-8] 1323cabdff1aSopenharmony_ci punpckhbw mm0, [r0+r3*0-8] 1324cabdff1aSopenharmony_ci movq mm1, [r4+r3*1-8] 1325cabdff1aSopenharmony_ci punpckhbw mm1, [r0+r3*2-8] 1326cabdff1aSopenharmony_ci mov r4, r0 1327cabdff1aSopenharmony_ci punpckhwd mm1, mm0 1328cabdff1aSopenharmony_ci lea r0, [r0+r3*4] 1329cabdff1aSopenharmony_ci movq mm2, [r0+r3*1-8] 1330cabdff1aSopenharmony_ci punpckhbw mm2, [r0+r3*0-8] 1331cabdff1aSopenharmony_ci lea r0, [r0+r3*2] 1332cabdff1aSopenharmony_ci movq mm3, [r0+r3*1-8] 1333cabdff1aSopenharmony_ci punpckhbw mm3, [r0+r3*0-8] 1334cabdff1aSopenharmony_ci punpckhwd mm3, mm2 1335cabdff1aSopenharmony_ci punpckhdq mm3, mm1 1336cabdff1aSopenharmony_ci lea r0, [r0+r3*2] 1337cabdff1aSopenharmony_ci movq mm0, [r0+r3*0-8] 1338cabdff1aSopenharmony_ci movq mm1, [r4] 1339cabdff1aSopenharmony_ci mov r0, r4 1340cabdff1aSopenharmony_ci movq mm4, mm3 1341cabdff1aSopenharmony_ci movq mm2, mm3 1342cabdff1aSopenharmony_ci PALIGNR mm4, mm0, 7, mm0 1343cabdff1aSopenharmony_ci PALIGNR mm1, mm2, 1, mm2 1344cabdff1aSopenharmony_ci test r1d, r1d 1345cabdff1aSopenharmony_ci jnz .do_left 1346cabdff1aSopenharmony_ci.fix_lt_1: 1347cabdff1aSopenharmony_ci movq mm5, mm3 1348cabdff1aSopenharmony_ci pxor mm5, mm4 1349cabdff1aSopenharmony_ci psrlq mm5, 56 1350cabdff1aSopenharmony_ci psllq mm5, 48 1351cabdff1aSopenharmony_ci pxor mm1, mm5 1352cabdff1aSopenharmony_ci jmp .do_left 1353cabdff1aSopenharmony_ci.fix_lt_2: 1354cabdff1aSopenharmony_ci movq mm5, mm3 1355cabdff1aSopenharmony_ci pxor mm5, mm2 1356cabdff1aSopenharmony_ci psllq mm5, 56 1357cabdff1aSopenharmony_ci psrlq mm5, 56 1358cabdff1aSopenharmony_ci pxor mm2, mm5 1359cabdff1aSopenharmony_ci test r2d, r2d 1360cabdff1aSopenharmony_ci jnz .do_top 1361cabdff1aSopenharmony_ci.fix_tr_1: 1362cabdff1aSopenharmony_ci movq mm5, mm3 1363cabdff1aSopenharmony_ci pxor mm5, mm1 1364cabdff1aSopenharmony_ci psrlq mm5, 56 1365cabdff1aSopenharmony_ci psllq mm5, 56 1366cabdff1aSopenharmony_ci pxor mm1, mm5 1367cabdff1aSopenharmony_ci jmp .do_top 1368cabdff1aSopenharmony_ci.do_left: 1369cabdff1aSopenharmony_ci movq mm0, mm4 1370cabdff1aSopenharmony_ci PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 1371cabdff1aSopenharmony_ci movq2dq xmm0, mm2 1372cabdff1aSopenharmony_ci movq mm0, [r0-8] 1373cabdff1aSopenharmony_ci movq mm3, [r0] 1374cabdff1aSopenharmony_ci movq mm1, [r0+8] 1375cabdff1aSopenharmony_ci movq mm2, mm3 1376cabdff1aSopenharmony_ci movq mm4, mm3 1377cabdff1aSopenharmony_ci PALIGNR mm2, mm0, 7, mm0 1378cabdff1aSopenharmony_ci PALIGNR mm1, mm4, 1, mm4 1379cabdff1aSopenharmony_ci test r1d, r1d 1380cabdff1aSopenharmony_ci jz .fix_lt_2 1381cabdff1aSopenharmony_ci test r2d, r2d 1382cabdff1aSopenharmony_ci jz .fix_tr_1 1383cabdff1aSopenharmony_ci.do_top: 1384cabdff1aSopenharmony_ci PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5 1385cabdff1aSopenharmony_ci lea r1, [r0+r3*2] 1386cabdff1aSopenharmony_ci movq2dq xmm4, mm6 1387cabdff1aSopenharmony_ci pslldq xmm4, 8 1388cabdff1aSopenharmony_ci por xmm0, xmm4 1389cabdff1aSopenharmony_ci movdqa xmm6, [pw_ff00] 1390cabdff1aSopenharmony_ci movdqa xmm1, xmm0 1391cabdff1aSopenharmony_ci lea r2, [r1+r3*2] 1392cabdff1aSopenharmony_ci movdqa xmm2, xmm0 1393cabdff1aSopenharmony_ci movdqa xmm3, xmm0 1394cabdff1aSopenharmony_ci pslldq xmm0, 1 1395cabdff1aSopenharmony_ci pslldq xmm1, 2 1396cabdff1aSopenharmony_ci pavgb xmm2, xmm0 1397cabdff1aSopenharmony_ciINIT_XMM cpuname 1398cabdff1aSopenharmony_ci PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5 1399cabdff1aSopenharmony_ci pandn xmm6, xmm4 1400cabdff1aSopenharmony_ci movdqa xmm5, xmm4 1401cabdff1aSopenharmony_ci psrlw xmm4, 8 1402cabdff1aSopenharmony_ci packuswb xmm6, xmm4 1403cabdff1aSopenharmony_ci movhlps xmm4, xmm6 1404cabdff1aSopenharmony_ci movhps [r0+r3*2], xmm5 1405cabdff1aSopenharmony_ci movhps [r0+r3*1], xmm2 1406cabdff1aSopenharmony_ci psrldq xmm5, 4 1407cabdff1aSopenharmony_ci movss xmm5, xmm6 1408cabdff1aSopenharmony_ci psrldq xmm2, 4 1409cabdff1aSopenharmony_ci movss xmm2, xmm4 1410cabdff1aSopenharmony_ci lea r0, [r2+r3*2] 1411cabdff1aSopenharmony_ci psrldq xmm5, 1 1412cabdff1aSopenharmony_ci psrldq xmm2, 1 1413cabdff1aSopenharmony_ci movq [r0+r3*2], xmm5 1414cabdff1aSopenharmony_ci movq [r0+r3*1], xmm2 1415cabdff1aSopenharmony_ci psrldq xmm5, 1 1416cabdff1aSopenharmony_ci psrldq xmm2, 1 1417cabdff1aSopenharmony_ci movq [r2+r3*2], xmm5 1418cabdff1aSopenharmony_ci movq [r2+r3*1], xmm2 1419cabdff1aSopenharmony_ci psrldq xmm5, 1 1420cabdff1aSopenharmony_ci psrldq xmm2, 1 1421cabdff1aSopenharmony_ci movq [r1+r3*2], xmm5 1422cabdff1aSopenharmony_ci movq [r1+r3*1], xmm2 1423cabdff1aSopenharmony_ci RET 1424cabdff1aSopenharmony_ci%endmacro 1425cabdff1aSopenharmony_ci 1426cabdff1aSopenharmony_ciINIT_MMX sse2 1427cabdff1aSopenharmony_ciPRED8x8L_VERTICAL_RIGHT 1428cabdff1aSopenharmony_ciINIT_MMX ssse3 1429cabdff1aSopenharmony_ciPRED8x8L_VERTICAL_RIGHT 1430cabdff1aSopenharmony_ci 1431cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 1432cabdff1aSopenharmony_ci; void ff_pred8x8l_vertical_left_8(uint8_t *src, int has_topleft, 1433cabdff1aSopenharmony_ci; int has_topright, ptrdiff_t stride) 1434cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 1435cabdff1aSopenharmony_ci 1436cabdff1aSopenharmony_ci%macro PRED8x8L_VERTICAL_LEFT 0 1437cabdff1aSopenharmony_cicglobal pred8x8l_vertical_left_8, 4,4 1438cabdff1aSopenharmony_ci sub r0, r3 1439cabdff1aSopenharmony_ci movq mm0, [r0-8] 1440cabdff1aSopenharmony_ci movq mm3, [r0] 1441cabdff1aSopenharmony_ci movq mm1, [r0+8] 1442cabdff1aSopenharmony_ci movq mm2, mm3 1443cabdff1aSopenharmony_ci movq mm4, mm3 1444cabdff1aSopenharmony_ci PALIGNR mm2, mm0, 7, mm0 1445cabdff1aSopenharmony_ci PALIGNR mm1, mm4, 1, mm4 1446cabdff1aSopenharmony_ci test r1d, r1d 1447cabdff1aSopenharmony_ci jz .fix_lt_2 1448cabdff1aSopenharmony_ci test r2d, r2d 1449cabdff1aSopenharmony_ci jz .fix_tr_1 1450cabdff1aSopenharmony_ci jmp .do_top 1451cabdff1aSopenharmony_ci.fix_lt_2: 1452cabdff1aSopenharmony_ci movq mm5, mm3 1453cabdff1aSopenharmony_ci pxor mm5, mm2 1454cabdff1aSopenharmony_ci psllq mm5, 56 1455cabdff1aSopenharmony_ci psrlq mm5, 56 1456cabdff1aSopenharmony_ci pxor mm2, mm5 1457cabdff1aSopenharmony_ci test r2d, r2d 1458cabdff1aSopenharmony_ci jnz .do_top 1459cabdff1aSopenharmony_ci.fix_tr_1: 1460cabdff1aSopenharmony_ci movq mm5, mm3 1461cabdff1aSopenharmony_ci pxor mm5, mm1 1462cabdff1aSopenharmony_ci psrlq mm5, 56 1463cabdff1aSopenharmony_ci psllq mm5, 56 1464cabdff1aSopenharmony_ci pxor mm1, mm5 1465cabdff1aSopenharmony_ci jmp .do_top 1466cabdff1aSopenharmony_ci.fix_tr_2: 1467cabdff1aSopenharmony_ci punpckhbw mm3, mm3 1468cabdff1aSopenharmony_ci pshufw mm1, mm3, 0xFF 1469cabdff1aSopenharmony_ci jmp .do_topright 1470cabdff1aSopenharmony_ci.do_top: 1471cabdff1aSopenharmony_ci PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 1472cabdff1aSopenharmony_ci movq2dq xmm4, mm4 1473cabdff1aSopenharmony_ci test r2d, r2d 1474cabdff1aSopenharmony_ci jz .fix_tr_2 1475cabdff1aSopenharmony_ci movq mm0, [r0+8] 1476cabdff1aSopenharmony_ci movq mm5, mm0 1477cabdff1aSopenharmony_ci movq mm2, mm0 1478cabdff1aSopenharmony_ci movq mm4, mm0 1479cabdff1aSopenharmony_ci psrlq mm5, 56 1480cabdff1aSopenharmony_ci PALIGNR mm2, mm3, 7, mm3 1481cabdff1aSopenharmony_ci PALIGNR mm5, mm4, 1, mm4 1482cabdff1aSopenharmony_ci PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4 1483cabdff1aSopenharmony_ci.do_topright: 1484cabdff1aSopenharmony_ci movq2dq xmm3, mm1 1485cabdff1aSopenharmony_ci lea r1, [r0+r3*2] 1486cabdff1aSopenharmony_ci pslldq xmm3, 8 1487cabdff1aSopenharmony_ci por xmm4, xmm3 1488cabdff1aSopenharmony_ci movdqa xmm2, xmm4 1489cabdff1aSopenharmony_ci movdqa xmm1, xmm4 1490cabdff1aSopenharmony_ci movdqa xmm3, xmm4 1491cabdff1aSopenharmony_ci psrldq xmm2, 1 1492cabdff1aSopenharmony_ci pslldq xmm1, 1 1493cabdff1aSopenharmony_ci pavgb xmm3, xmm2 1494cabdff1aSopenharmony_ci lea r2, [r1+r3*2] 1495cabdff1aSopenharmony_ciINIT_XMM cpuname 1496cabdff1aSopenharmony_ci PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5 1497cabdff1aSopenharmony_ci psrldq xmm0, 1 1498cabdff1aSopenharmony_ci movq [r0+r3*1], xmm3 1499cabdff1aSopenharmony_ci movq [r0+r3*2], xmm0 1500cabdff1aSopenharmony_ci lea r0, [r2+r3*2] 1501cabdff1aSopenharmony_ci psrldq xmm3, 1 1502cabdff1aSopenharmony_ci psrldq xmm0, 1 1503cabdff1aSopenharmony_ci movq [r1+r3*1], xmm3 1504cabdff1aSopenharmony_ci movq [r1+r3*2], xmm0 1505cabdff1aSopenharmony_ci psrldq xmm3, 1 1506cabdff1aSopenharmony_ci psrldq xmm0, 1 1507cabdff1aSopenharmony_ci movq [r2+r3*1], xmm3 1508cabdff1aSopenharmony_ci movq [r2+r3*2], xmm0 1509cabdff1aSopenharmony_ci psrldq xmm3, 1 1510cabdff1aSopenharmony_ci psrldq xmm0, 1 1511cabdff1aSopenharmony_ci movq [r0+r3*1], xmm3 1512cabdff1aSopenharmony_ci movq [r0+r3*2], xmm0 1513cabdff1aSopenharmony_ci RET 1514cabdff1aSopenharmony_ci%endmacro 1515cabdff1aSopenharmony_ci 1516cabdff1aSopenharmony_ciINIT_MMX sse2 1517cabdff1aSopenharmony_ciPRED8x8L_VERTICAL_LEFT 1518cabdff1aSopenharmony_ciINIT_MMX ssse3 1519cabdff1aSopenharmony_ciPRED8x8L_VERTICAL_LEFT 1520cabdff1aSopenharmony_ci 1521cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 1522cabdff1aSopenharmony_ci; void ff_pred8x8l_horizontal_up_8(uint8_t *src, int has_topleft, 1523cabdff1aSopenharmony_ci; int has_topright, ptrdiff_t stride) 1524cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 1525cabdff1aSopenharmony_ci 1526cabdff1aSopenharmony_ci%macro PRED8x8L_HORIZONTAL_UP 0 1527cabdff1aSopenharmony_cicglobal pred8x8l_horizontal_up_8, 4,4 1528cabdff1aSopenharmony_ci sub r0, r3 1529cabdff1aSopenharmony_ci lea r2, [r0+r3*2] 1530cabdff1aSopenharmony_ci movq mm0, [r0+r3*1-8] 1531cabdff1aSopenharmony_ci test r1d, r1d 1532cabdff1aSopenharmony_ci lea r1, [r0+r3] 1533cabdff1aSopenharmony_ci cmovnz r1, r0 1534cabdff1aSopenharmony_ci punpckhbw mm0, [r1+r3*0-8] 1535cabdff1aSopenharmony_ci movq mm1, [r2+r3*1-8] 1536cabdff1aSopenharmony_ci punpckhbw mm1, [r0+r3*2-8] 1537cabdff1aSopenharmony_ci mov r2, r0 1538cabdff1aSopenharmony_ci punpckhwd mm1, mm0 1539cabdff1aSopenharmony_ci lea r0, [r0+r3*4] 1540cabdff1aSopenharmony_ci movq mm2, [r0+r3*1-8] 1541cabdff1aSopenharmony_ci punpckhbw mm2, [r0+r3*0-8] 1542cabdff1aSopenharmony_ci lea r0, [r0+r3*2] 1543cabdff1aSopenharmony_ci movq mm3, [r0+r3*1-8] 1544cabdff1aSopenharmony_ci punpckhbw mm3, [r0+r3*0-8] 1545cabdff1aSopenharmony_ci punpckhwd mm3, mm2 1546cabdff1aSopenharmony_ci punpckhdq mm3, mm1 1547cabdff1aSopenharmony_ci lea r0, [r0+r3*2] 1548cabdff1aSopenharmony_ci movq mm0, [r0+r3*0-8] 1549cabdff1aSopenharmony_ci movq mm1, [r1+r3*0-8] 1550cabdff1aSopenharmony_ci mov r0, r2 1551cabdff1aSopenharmony_ci movq mm4, mm3 1552cabdff1aSopenharmony_ci movq mm2, mm3 1553cabdff1aSopenharmony_ci PALIGNR mm4, mm0, 7, mm0 1554cabdff1aSopenharmony_ci PALIGNR mm1, mm2, 1, mm2 1555cabdff1aSopenharmony_ci movq mm0, mm4 1556cabdff1aSopenharmony_ci PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 1557cabdff1aSopenharmony_ci movq mm4, mm0 1558cabdff1aSopenharmony_ci movq mm7, mm2 1559cabdff1aSopenharmony_ci PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 1560cabdff1aSopenharmony_ci psllq mm1, 56 1561cabdff1aSopenharmony_ci PALIGNR mm7, mm1, 7, mm3 1562cabdff1aSopenharmony_ci lea r1, [r0+r3*2] 1563cabdff1aSopenharmony_ci pshufw mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1 1564cabdff1aSopenharmony_ci psllq mm7, 56 ; l7 .. .. .. .. .. .. .. 1565cabdff1aSopenharmony_ci movq mm2, mm0 1566cabdff1aSopenharmony_ci psllw mm0, 8 1567cabdff1aSopenharmony_ci psrlw mm2, 8 1568cabdff1aSopenharmony_ci por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0 1569cabdff1aSopenharmony_ci movq mm3, mm2 1570cabdff1aSopenharmony_ci movq mm4, mm2 1571cabdff1aSopenharmony_ci movq mm5, mm2 1572cabdff1aSopenharmony_ci psrlq mm2, 8 1573cabdff1aSopenharmony_ci psrlq mm3, 16 1574cabdff1aSopenharmony_ci lea r2, [r1+r3*2] 1575cabdff1aSopenharmony_ci por mm2, mm7 ; l7 l7 l6 l5 l4 l3 l2 l1 1576cabdff1aSopenharmony_ci punpckhbw mm7, mm7 1577cabdff1aSopenharmony_ci por mm3, mm7 ; l7 l7 l7 l6 l5 l4 l3 l2 1578cabdff1aSopenharmony_ci pavgb mm4, mm2 1579cabdff1aSopenharmony_ci PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6 1580cabdff1aSopenharmony_ci movq mm5, mm4 1581cabdff1aSopenharmony_ci punpcklbw mm4, mm1 ; p4 p3 p2 p1 1582cabdff1aSopenharmony_ci punpckhbw mm5, mm1 ; p8 p7 p6 p5 1583cabdff1aSopenharmony_ci movq mm6, mm5 1584cabdff1aSopenharmony_ci movq mm7, mm5 1585cabdff1aSopenharmony_ci movq mm0, mm5 1586cabdff1aSopenharmony_ci PALIGNR mm5, mm4, 2, mm1 1587cabdff1aSopenharmony_ci pshufw mm1, mm6, 11111001b 1588cabdff1aSopenharmony_ci PALIGNR mm6, mm4, 4, mm2 1589cabdff1aSopenharmony_ci pshufw mm2, mm7, 11111110b 1590cabdff1aSopenharmony_ci PALIGNR mm7, mm4, 6, mm3 1591cabdff1aSopenharmony_ci pshufw mm3, mm0, 11111111b 1592cabdff1aSopenharmony_ci movq [r0+r3*1], mm4 1593cabdff1aSopenharmony_ci movq [r0+r3*2], mm5 1594cabdff1aSopenharmony_ci lea r0, [r2+r3*2] 1595cabdff1aSopenharmony_ci movq [r1+r3*1], mm6 1596cabdff1aSopenharmony_ci movq [r1+r3*2], mm7 1597cabdff1aSopenharmony_ci movq [r2+r3*1], mm0 1598cabdff1aSopenharmony_ci movq [r2+r3*2], mm1 1599cabdff1aSopenharmony_ci movq [r0+r3*1], mm2 1600cabdff1aSopenharmony_ci movq [r0+r3*2], mm3 1601cabdff1aSopenharmony_ci RET 1602cabdff1aSopenharmony_ci%endmacro 1603cabdff1aSopenharmony_ci 1604cabdff1aSopenharmony_ciINIT_MMX mmxext 1605cabdff1aSopenharmony_ciPRED8x8L_HORIZONTAL_UP 1606cabdff1aSopenharmony_ciINIT_MMX ssse3 1607cabdff1aSopenharmony_ciPRED8x8L_HORIZONTAL_UP 1608cabdff1aSopenharmony_ci 1609cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 1610cabdff1aSopenharmony_ci; void ff_pred8x8l_horizontal_down_8(uint8_t *src, int has_topleft, 1611cabdff1aSopenharmony_ci; int has_topright, ptrdiff_t stride) 1612cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 1613cabdff1aSopenharmony_ci 1614cabdff1aSopenharmony_ci%macro PRED8x8L_HORIZONTAL_DOWN 0 1615cabdff1aSopenharmony_cicglobal pred8x8l_horizontal_down_8, 4,5 1616cabdff1aSopenharmony_ci sub r0, r3 1617cabdff1aSopenharmony_ci lea r4, [r0+r3*2] 1618cabdff1aSopenharmony_ci movq mm0, [r0+r3*1-8] 1619cabdff1aSopenharmony_ci punpckhbw mm0, [r0+r3*0-8] 1620cabdff1aSopenharmony_ci movq mm1, [r4+r3*1-8] 1621cabdff1aSopenharmony_ci punpckhbw mm1, [r0+r3*2-8] 1622cabdff1aSopenharmony_ci mov r4, r0 1623cabdff1aSopenharmony_ci punpckhwd mm1, mm0 1624cabdff1aSopenharmony_ci lea r0, [r0+r3*4] 1625cabdff1aSopenharmony_ci movq mm2, [r0+r3*1-8] 1626cabdff1aSopenharmony_ci punpckhbw mm2, [r0+r3*0-8] 1627cabdff1aSopenharmony_ci lea r0, [r0+r3*2] 1628cabdff1aSopenharmony_ci movq mm3, [r0+r3*1-8] 1629cabdff1aSopenharmony_ci punpckhbw mm3, [r0+r3*0-8] 1630cabdff1aSopenharmony_ci punpckhwd mm3, mm2 1631cabdff1aSopenharmony_ci punpckhdq mm3, mm1 1632cabdff1aSopenharmony_ci lea r0, [r0+r3*2] 1633cabdff1aSopenharmony_ci movq mm0, [r0+r3*0-8] 1634cabdff1aSopenharmony_ci movq mm1, [r4] 1635cabdff1aSopenharmony_ci mov r0, r4 1636cabdff1aSopenharmony_ci movq mm4, mm3 1637cabdff1aSopenharmony_ci movq mm2, mm3 1638cabdff1aSopenharmony_ci PALIGNR mm4, mm0, 7, mm0 1639cabdff1aSopenharmony_ci PALIGNR mm1, mm2, 1, mm2 1640cabdff1aSopenharmony_ci test r1d, r1d 1641cabdff1aSopenharmony_ci jnz .do_left 1642cabdff1aSopenharmony_ci.fix_lt_1: 1643cabdff1aSopenharmony_ci movq mm5, mm3 1644cabdff1aSopenharmony_ci pxor mm5, mm4 1645cabdff1aSopenharmony_ci psrlq mm5, 56 1646cabdff1aSopenharmony_ci psllq mm5, 48 1647cabdff1aSopenharmony_ci pxor mm1, mm5 1648cabdff1aSopenharmony_ci jmp .do_left 1649cabdff1aSopenharmony_ci.fix_lt_2: 1650cabdff1aSopenharmony_ci movq mm5, mm3 1651cabdff1aSopenharmony_ci pxor mm5, mm2 1652cabdff1aSopenharmony_ci psllq mm5, 56 1653cabdff1aSopenharmony_ci psrlq mm5, 56 1654cabdff1aSopenharmony_ci pxor mm2, mm5 1655cabdff1aSopenharmony_ci test r2d, r2d 1656cabdff1aSopenharmony_ci jnz .do_top 1657cabdff1aSopenharmony_ci.fix_tr_1: 1658cabdff1aSopenharmony_ci movq mm5, mm3 1659cabdff1aSopenharmony_ci pxor mm5, mm1 1660cabdff1aSopenharmony_ci psrlq mm5, 56 1661cabdff1aSopenharmony_ci psllq mm5, 56 1662cabdff1aSopenharmony_ci pxor mm1, mm5 1663cabdff1aSopenharmony_ci jmp .do_top 1664cabdff1aSopenharmony_ci.fix_tr_2: 1665cabdff1aSopenharmony_ci punpckhbw mm3, mm3 1666cabdff1aSopenharmony_ci pshufw mm1, mm3, 0xFF 1667cabdff1aSopenharmony_ci jmp .do_topright 1668cabdff1aSopenharmony_ci.do_left: 1669cabdff1aSopenharmony_ci movq mm0, mm4 1670cabdff1aSopenharmony_ci PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 1671cabdff1aSopenharmony_ci movq2dq xmm0, mm2 1672cabdff1aSopenharmony_ci pslldq xmm0, 8 1673cabdff1aSopenharmony_ci movq mm4, mm0 1674cabdff1aSopenharmony_ci PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 1675cabdff1aSopenharmony_ci movq2dq xmm2, mm1 1676cabdff1aSopenharmony_ci pslldq xmm2, 15 1677cabdff1aSopenharmony_ci psrldq xmm2, 8 1678cabdff1aSopenharmony_ci por xmm0, xmm2 1679cabdff1aSopenharmony_ci movq mm0, [r0-8] 1680cabdff1aSopenharmony_ci movq mm3, [r0] 1681cabdff1aSopenharmony_ci movq mm1, [r0+8] 1682cabdff1aSopenharmony_ci movq mm2, mm3 1683cabdff1aSopenharmony_ci movq mm4, mm3 1684cabdff1aSopenharmony_ci PALIGNR mm2, mm0, 7, mm0 1685cabdff1aSopenharmony_ci PALIGNR mm1, mm4, 1, mm4 1686cabdff1aSopenharmony_ci test r1d, r1d 1687cabdff1aSopenharmony_ci jz .fix_lt_2 1688cabdff1aSopenharmony_ci test r2d, r2d 1689cabdff1aSopenharmony_ci jz .fix_tr_1 1690cabdff1aSopenharmony_ci.do_top: 1691cabdff1aSopenharmony_ci PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 1692cabdff1aSopenharmony_ci movq2dq xmm1, mm4 1693cabdff1aSopenharmony_ci test r2d, r2d 1694cabdff1aSopenharmony_ci jz .fix_tr_2 1695cabdff1aSopenharmony_ci movq mm0, [r0+8] 1696cabdff1aSopenharmony_ci movq mm5, mm0 1697cabdff1aSopenharmony_ci movq mm2, mm0 1698cabdff1aSopenharmony_ci movq mm4, mm0 1699cabdff1aSopenharmony_ci psrlq mm5, 56 1700cabdff1aSopenharmony_ci PALIGNR mm2, mm3, 7, mm3 1701cabdff1aSopenharmony_ci PALIGNR mm5, mm4, 1, mm4 1702cabdff1aSopenharmony_ci PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4 1703cabdff1aSopenharmony_ci.do_topright: 1704cabdff1aSopenharmony_ci movq2dq xmm5, mm1 1705cabdff1aSopenharmony_ci pslldq xmm5, 8 1706cabdff1aSopenharmony_ci por xmm1, xmm5 1707cabdff1aSopenharmony_ciINIT_XMM cpuname 1708cabdff1aSopenharmony_ci lea r2, [r4+r3*2] 1709cabdff1aSopenharmony_ci movdqa xmm2, xmm1 1710cabdff1aSopenharmony_ci movdqa xmm3, xmm1 1711cabdff1aSopenharmony_ci PALIGNR xmm1, xmm0, 7, xmm4 1712cabdff1aSopenharmony_ci PALIGNR xmm2, xmm0, 9, xmm5 1713cabdff1aSopenharmony_ci lea r1, [r2+r3*2] 1714cabdff1aSopenharmony_ci PALIGNR xmm3, xmm0, 8, xmm0 1715cabdff1aSopenharmony_ci movdqa xmm4, xmm1 1716cabdff1aSopenharmony_ci pavgb xmm4, xmm3 1717cabdff1aSopenharmony_ci lea r0, [r1+r3*2] 1718cabdff1aSopenharmony_ci PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5 1719cabdff1aSopenharmony_ci punpcklbw xmm4, xmm0 1720cabdff1aSopenharmony_ci movhlps xmm0, xmm4 1721cabdff1aSopenharmony_ci movq [r0+r3*2], xmm4 1722cabdff1aSopenharmony_ci movq [r2+r3*2], xmm0 1723cabdff1aSopenharmony_ci psrldq xmm4, 2 1724cabdff1aSopenharmony_ci psrldq xmm0, 2 1725cabdff1aSopenharmony_ci movq [r0+r3*1], xmm4 1726cabdff1aSopenharmony_ci movq [r2+r3*1], xmm0 1727cabdff1aSopenharmony_ci psrldq xmm4, 2 1728cabdff1aSopenharmony_ci psrldq xmm0, 2 1729cabdff1aSopenharmony_ci movq [r1+r3*2], xmm4 1730cabdff1aSopenharmony_ci movq [r4+r3*2], xmm0 1731cabdff1aSopenharmony_ci psrldq xmm4, 2 1732cabdff1aSopenharmony_ci psrldq xmm0, 2 1733cabdff1aSopenharmony_ci movq [r1+r3*1], xmm4 1734cabdff1aSopenharmony_ci movq [r4+r3*1], xmm0 1735cabdff1aSopenharmony_ci RET 1736cabdff1aSopenharmony_ci%endmacro 1737cabdff1aSopenharmony_ci 1738cabdff1aSopenharmony_ciINIT_MMX sse2 1739cabdff1aSopenharmony_ciPRED8x8L_HORIZONTAL_DOWN 1740cabdff1aSopenharmony_ciINIT_MMX ssse3 1741cabdff1aSopenharmony_ciPRED8x8L_HORIZONTAL_DOWN 1742cabdff1aSopenharmony_ci 1743cabdff1aSopenharmony_ci;------------------------------------------------------------------------------- 1744cabdff1aSopenharmony_ci; void ff_pred4x4_dc_8_mmxext(uint8_t *src, const uint8_t *topright, 1745cabdff1aSopenharmony_ci; ptrdiff_t stride) 1746cabdff1aSopenharmony_ci;------------------------------------------------------------------------------- 1747cabdff1aSopenharmony_ci 1748cabdff1aSopenharmony_ciINIT_MMX mmxext 1749cabdff1aSopenharmony_cicglobal pred4x4_dc_8, 3,5 1750cabdff1aSopenharmony_ci pxor mm7, mm7 1751cabdff1aSopenharmony_ci mov r4, r0 1752cabdff1aSopenharmony_ci sub r0, r2 1753cabdff1aSopenharmony_ci movd mm0, [r0] 1754cabdff1aSopenharmony_ci psadbw mm0, mm7 1755cabdff1aSopenharmony_ci movzx r1d, byte [r0+r2*1-1] 1756cabdff1aSopenharmony_ci movd r3d, mm0 1757cabdff1aSopenharmony_ci add r3d, r1d 1758cabdff1aSopenharmony_ci movzx r1d, byte [r0+r2*2-1] 1759cabdff1aSopenharmony_ci lea r0, [r0+r2*2] 1760cabdff1aSopenharmony_ci add r3d, r1d 1761cabdff1aSopenharmony_ci movzx r1d, byte [r0+r2*1-1] 1762cabdff1aSopenharmony_ci add r3d, r1d 1763cabdff1aSopenharmony_ci movzx r1d, byte [r0+r2*2-1] 1764cabdff1aSopenharmony_ci add r3d, r1d 1765cabdff1aSopenharmony_ci add r3d, 4 1766cabdff1aSopenharmony_ci shr r3d, 3 1767cabdff1aSopenharmony_ci imul r3d, 0x01010101 1768cabdff1aSopenharmony_ci mov [r4+r2*0], r3d 1769cabdff1aSopenharmony_ci mov [r0+r2*0], r3d 1770cabdff1aSopenharmony_ci mov [r0+r2*1], r3d 1771cabdff1aSopenharmony_ci mov [r0+r2*2], r3d 1772cabdff1aSopenharmony_ci RET 1773cabdff1aSopenharmony_ci 1774cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 1775cabdff1aSopenharmony_ci; void ff_pred4x4_tm_vp8_8_mmxext(uint8_t *src, const uint8_t *topright, 1776cabdff1aSopenharmony_ci; ptrdiff_t stride) 1777cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 1778cabdff1aSopenharmony_ci 1779cabdff1aSopenharmony_ciINIT_MMX mmxext 1780cabdff1aSopenharmony_cicglobal pred4x4_tm_vp8_8, 3,6 1781cabdff1aSopenharmony_ci sub r0, r2 1782cabdff1aSopenharmony_ci pxor mm7, mm7 1783cabdff1aSopenharmony_ci movd mm0, [r0] 1784cabdff1aSopenharmony_ci punpcklbw mm0, mm7 1785cabdff1aSopenharmony_ci movzx r4d, byte [r0-1] 1786cabdff1aSopenharmony_ci mov r5d, 2 1787cabdff1aSopenharmony_ci.loop: 1788cabdff1aSopenharmony_ci movzx r1d, byte [r0+r2*1-1] 1789cabdff1aSopenharmony_ci movzx r3d, byte [r0+r2*2-1] 1790cabdff1aSopenharmony_ci sub r1d, r4d 1791cabdff1aSopenharmony_ci sub r3d, r4d 1792cabdff1aSopenharmony_ci movd mm2, r1d 1793cabdff1aSopenharmony_ci movd mm4, r3d 1794cabdff1aSopenharmony_ci pshufw mm2, mm2, 0 1795cabdff1aSopenharmony_ci pshufw mm4, mm4, 0 1796cabdff1aSopenharmony_ci paddw mm2, mm0 1797cabdff1aSopenharmony_ci paddw mm4, mm0 1798cabdff1aSopenharmony_ci packuswb mm2, mm2 1799cabdff1aSopenharmony_ci packuswb mm4, mm4 1800cabdff1aSopenharmony_ci movd [r0+r2*1], mm2 1801cabdff1aSopenharmony_ci movd [r0+r2*2], mm4 1802cabdff1aSopenharmony_ci lea r0, [r0+r2*2] 1803cabdff1aSopenharmony_ci dec r5d 1804cabdff1aSopenharmony_ci jg .loop 1805cabdff1aSopenharmony_ci REP_RET 1806cabdff1aSopenharmony_ci 1807cabdff1aSopenharmony_ciINIT_XMM ssse3 1808cabdff1aSopenharmony_cicglobal pred4x4_tm_vp8_8, 3,3 1809cabdff1aSopenharmony_ci sub r0, r2 1810cabdff1aSopenharmony_ci movq mm6, [tm_shuf] 1811cabdff1aSopenharmony_ci pxor mm1, mm1 1812cabdff1aSopenharmony_ci movd mm0, [r0] 1813cabdff1aSopenharmony_ci punpcklbw mm0, mm1 1814cabdff1aSopenharmony_ci movd mm7, [r0-4] 1815cabdff1aSopenharmony_ci pshufb mm7, mm6 1816cabdff1aSopenharmony_ci lea r1, [r0+r2*2] 1817cabdff1aSopenharmony_ci movd mm2, [r0+r2*1-4] 1818cabdff1aSopenharmony_ci movd mm3, [r0+r2*2-4] 1819cabdff1aSopenharmony_ci movd mm4, [r1+r2*1-4] 1820cabdff1aSopenharmony_ci movd mm5, [r1+r2*2-4] 1821cabdff1aSopenharmony_ci pshufb mm2, mm6 1822cabdff1aSopenharmony_ci pshufb mm3, mm6 1823cabdff1aSopenharmony_ci pshufb mm4, mm6 1824cabdff1aSopenharmony_ci pshufb mm5, mm6 1825cabdff1aSopenharmony_ci psubw mm0, mm7 1826cabdff1aSopenharmony_ci paddw mm2, mm0 1827cabdff1aSopenharmony_ci paddw mm3, mm0 1828cabdff1aSopenharmony_ci paddw mm4, mm0 1829cabdff1aSopenharmony_ci paddw mm5, mm0 1830cabdff1aSopenharmony_ci packuswb mm2, mm2 1831cabdff1aSopenharmony_ci packuswb mm3, mm3 1832cabdff1aSopenharmony_ci packuswb mm4, mm4 1833cabdff1aSopenharmony_ci packuswb mm5, mm5 1834cabdff1aSopenharmony_ci movd [r0+r2*1], mm2 1835cabdff1aSopenharmony_ci movd [r0+r2*2], mm3 1836cabdff1aSopenharmony_ci movd [r1+r2*1], mm4 1837cabdff1aSopenharmony_ci movd [r1+r2*2], mm5 1838cabdff1aSopenharmony_ci RET 1839cabdff1aSopenharmony_ci 1840cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 1841cabdff1aSopenharmony_ci; void ff_pred4x4_vertical_vp8_8_mmxext(uint8_t *src, const uint8_t *topright, 1842cabdff1aSopenharmony_ci; ptrdiff_t stride) 1843cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 1844cabdff1aSopenharmony_ci 1845cabdff1aSopenharmony_ciINIT_MMX mmxext 1846cabdff1aSopenharmony_cicglobal pred4x4_vertical_vp8_8, 3,3 1847cabdff1aSopenharmony_ci sub r0, r2 1848cabdff1aSopenharmony_ci movd m1, [r0-1] 1849cabdff1aSopenharmony_ci movd m0, [r0] 1850cabdff1aSopenharmony_ci mova m2, m0 ;t0 t1 t2 t3 1851cabdff1aSopenharmony_ci punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7 1852cabdff1aSopenharmony_ci lea r1, [r0+r2*2] 1853cabdff1aSopenharmony_ci psrlq m0, 8 ;t1 t2 t3 t4 1854cabdff1aSopenharmony_ci PRED4x4_LOWPASS m3, m1, m0, m2, m4 1855cabdff1aSopenharmony_ci movd [r0+r2*1], m3 1856cabdff1aSopenharmony_ci movd [r0+r2*2], m3 1857cabdff1aSopenharmony_ci movd [r1+r2*1], m3 1858cabdff1aSopenharmony_ci movd [r1+r2*2], m3 1859cabdff1aSopenharmony_ci RET 1860cabdff1aSopenharmony_ci 1861cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 1862cabdff1aSopenharmony_ci; void ff_pred4x4_down_left_8_mmxext(uint8_t *src, const uint8_t *topright, 1863cabdff1aSopenharmony_ci; ptrdiff_t stride) 1864cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 1865cabdff1aSopenharmony_ciINIT_MMX mmxext 1866cabdff1aSopenharmony_cicglobal pred4x4_down_left_8, 3,3 1867cabdff1aSopenharmony_ci sub r0, r2 1868cabdff1aSopenharmony_ci movq m1, [r0] 1869cabdff1aSopenharmony_ci punpckldq m1, [r1] 1870cabdff1aSopenharmony_ci movq m2, m1 1871cabdff1aSopenharmony_ci movq m3, m1 1872cabdff1aSopenharmony_ci psllq m1, 8 1873cabdff1aSopenharmony_ci pxor m2, m1 1874cabdff1aSopenharmony_ci psrlq m2, 8 1875cabdff1aSopenharmony_ci pxor m2, m3 1876cabdff1aSopenharmony_ci PRED4x4_LOWPASS m0, m1, m2, m3, m4 1877cabdff1aSopenharmony_ci lea r1, [r0+r2*2] 1878cabdff1aSopenharmony_ci psrlq m0, 8 1879cabdff1aSopenharmony_ci movd [r0+r2*1], m0 1880cabdff1aSopenharmony_ci psrlq m0, 8 1881cabdff1aSopenharmony_ci movd [r0+r2*2], m0 1882cabdff1aSopenharmony_ci psrlq m0, 8 1883cabdff1aSopenharmony_ci movd [r1+r2*1], m0 1884cabdff1aSopenharmony_ci psrlq m0, 8 1885cabdff1aSopenharmony_ci movd [r1+r2*2], m0 1886cabdff1aSopenharmony_ci RET 1887cabdff1aSopenharmony_ci 1888cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 1889cabdff1aSopenharmony_ci; void ff_pred4x4_vertical_left_8_mmxext(uint8_t *src, const uint8_t *topright, 1890cabdff1aSopenharmony_ci; ptrdiff_t stride) 1891cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 1892cabdff1aSopenharmony_ci 1893cabdff1aSopenharmony_ciINIT_MMX mmxext 1894cabdff1aSopenharmony_cicglobal pred4x4_vertical_left_8, 3,3 1895cabdff1aSopenharmony_ci sub r0, r2 1896cabdff1aSopenharmony_ci movq m1, [r0] 1897cabdff1aSopenharmony_ci punpckldq m1, [r1] 1898cabdff1aSopenharmony_ci movq m3, m1 1899cabdff1aSopenharmony_ci movq m2, m1 1900cabdff1aSopenharmony_ci psrlq m3, 8 1901cabdff1aSopenharmony_ci psrlq m2, 16 1902cabdff1aSopenharmony_ci movq m4, m3 1903cabdff1aSopenharmony_ci pavgb m4, m1 1904cabdff1aSopenharmony_ci PRED4x4_LOWPASS m0, m1, m2, m3, m5 1905cabdff1aSopenharmony_ci lea r1, [r0+r2*2] 1906cabdff1aSopenharmony_ci movh [r0+r2*1], m4 1907cabdff1aSopenharmony_ci movh [r0+r2*2], m0 1908cabdff1aSopenharmony_ci psrlq m4, 8 1909cabdff1aSopenharmony_ci psrlq m0, 8 1910cabdff1aSopenharmony_ci movh [r1+r2*1], m4 1911cabdff1aSopenharmony_ci movh [r1+r2*2], m0 1912cabdff1aSopenharmony_ci RET 1913cabdff1aSopenharmony_ci 1914cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 1915cabdff1aSopenharmony_ci; void ff_pred4x4_horizontal_up_8_mmxext(uint8_t *src, const uint8_t *topright, 1916cabdff1aSopenharmony_ci; ptrdiff_t stride) 1917cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 1918cabdff1aSopenharmony_ci 1919cabdff1aSopenharmony_ciINIT_MMX mmxext 1920cabdff1aSopenharmony_cicglobal pred4x4_horizontal_up_8, 3,3 1921cabdff1aSopenharmony_ci sub r0, r2 1922cabdff1aSopenharmony_ci lea r1, [r0+r2*2] 1923cabdff1aSopenharmony_ci movd m0, [r0+r2*1-4] 1924cabdff1aSopenharmony_ci punpcklbw m0, [r0+r2*2-4] 1925cabdff1aSopenharmony_ci movd m1, [r1+r2*1-4] 1926cabdff1aSopenharmony_ci punpcklbw m1, [r1+r2*2-4] 1927cabdff1aSopenharmony_ci punpckhwd m0, m1 1928cabdff1aSopenharmony_ci movq m1, m0 1929cabdff1aSopenharmony_ci punpckhbw m1, m1 1930cabdff1aSopenharmony_ci pshufw m1, m1, 0xFF 1931cabdff1aSopenharmony_ci punpckhdq m0, m1 1932cabdff1aSopenharmony_ci movq m2, m0 1933cabdff1aSopenharmony_ci movq m3, m0 1934cabdff1aSopenharmony_ci movq m7, m0 1935cabdff1aSopenharmony_ci psrlq m2, 16 1936cabdff1aSopenharmony_ci psrlq m3, 8 1937cabdff1aSopenharmony_ci pavgb m7, m3 1938cabdff1aSopenharmony_ci PRED4x4_LOWPASS m4, m0, m2, m3, m5 1939cabdff1aSopenharmony_ci punpcklbw m7, m4 1940cabdff1aSopenharmony_ci movd [r0+r2*1], m7 1941cabdff1aSopenharmony_ci psrlq m7, 16 1942cabdff1aSopenharmony_ci movd [r0+r2*2], m7 1943cabdff1aSopenharmony_ci psrlq m7, 16 1944cabdff1aSopenharmony_ci movd [r1+r2*1], m7 1945cabdff1aSopenharmony_ci movd [r1+r2*2], m1 1946cabdff1aSopenharmony_ci RET 1947cabdff1aSopenharmony_ci 1948cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 1949cabdff1aSopenharmony_ci; void ff_pred4x4_horizontal_down_8_mmxext(uint8_t *src, 1950cabdff1aSopenharmony_ci; const uint8_t *topright, 1951cabdff1aSopenharmony_ci; ptrdiff_t stride) 1952cabdff1aSopenharmony_ci;------------------------------------------------------------------------------ 1953cabdff1aSopenharmony_ci 1954cabdff1aSopenharmony_ciINIT_MMX mmxext 1955cabdff1aSopenharmony_cicglobal pred4x4_horizontal_down_8, 3,3 1956cabdff1aSopenharmony_ci sub r0, r2 1957cabdff1aSopenharmony_ci lea r1, [r0+r2*2] 1958cabdff1aSopenharmony_ci movh m0, [r0-4] ; lt .. 1959cabdff1aSopenharmony_ci punpckldq m0, [r0] ; t3 t2 t1 t0 lt .. .. .. 1960cabdff1aSopenharmony_ci psllq m0, 8 ; t2 t1 t0 lt .. .. .. .. 1961cabdff1aSopenharmony_ci movd m1, [r1+r2*2-4] ; l3 1962cabdff1aSopenharmony_ci punpcklbw m1, [r1+r2*1-4] ; l2 l3 1963cabdff1aSopenharmony_ci movd m2, [r0+r2*2-4] ; l1 1964cabdff1aSopenharmony_ci punpcklbw m2, [r0+r2*1-4] ; l0 l1 1965cabdff1aSopenharmony_ci punpckhwd m1, m2 ; l0 l1 l2 l3 1966cabdff1aSopenharmony_ci punpckhdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3 1967cabdff1aSopenharmony_ci movq m0, m1 1968cabdff1aSopenharmony_ci movq m2, m1 1969cabdff1aSopenharmony_ci movq m5, m1 1970cabdff1aSopenharmony_ci psrlq m0, 16 ; .. .. t2 t1 t0 lt l0 l1 1971cabdff1aSopenharmony_ci psrlq m2, 8 ; .. t2 t1 t0 lt l0 l1 l2 1972cabdff1aSopenharmony_ci pavgb m5, m2 1973cabdff1aSopenharmony_ci PRED4x4_LOWPASS m3, m1, m0, m2, m4 1974cabdff1aSopenharmony_ci punpcklbw m5, m3 1975cabdff1aSopenharmony_ci psrlq m3, 32 1976cabdff1aSopenharmony_ci PALIGNR m3, m5, 6, m4 1977cabdff1aSopenharmony_ci movh [r1+r2*2], m5 1978cabdff1aSopenharmony_ci psrlq m5, 16 1979cabdff1aSopenharmony_ci movh [r1+r2*1], m5 1980cabdff1aSopenharmony_ci psrlq m5, 16 1981cabdff1aSopenharmony_ci movh [r0+r2*2], m5 1982cabdff1aSopenharmony_ci movh [r0+r2*1], m3 1983cabdff1aSopenharmony_ci RET 1984cabdff1aSopenharmony_ci 1985cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 1986cabdff1aSopenharmony_ci; void ff_pred4x4_vertical_right_8_mmxext(uint8_t *src, 1987cabdff1aSopenharmony_ci; const uint8_t *topright, 1988cabdff1aSopenharmony_ci; ptrdiff_t stride) 1989cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 1990cabdff1aSopenharmony_ci 1991cabdff1aSopenharmony_ciINIT_MMX mmxext 1992cabdff1aSopenharmony_cicglobal pred4x4_vertical_right_8, 3,3 1993cabdff1aSopenharmony_ci sub r0, r2 1994cabdff1aSopenharmony_ci lea r1, [r0+r2*2] 1995cabdff1aSopenharmony_ci movh m0, [r0] ; ........t3t2t1t0 1996cabdff1aSopenharmony_ci movq m5, m0 1997cabdff1aSopenharmony_ci PALIGNR m0, [r0-8], 7, m1 ; ......t3t2t1t0lt 1998cabdff1aSopenharmony_ci pavgb m5, m0 1999cabdff1aSopenharmony_ci PALIGNR m0, [r0+r2*1-8], 7, m1 ; ....t3t2t1t0ltl0 2000cabdff1aSopenharmony_ci movq m1, m0 2001cabdff1aSopenharmony_ci PALIGNR m0, [r0+r2*2-8], 7, m2 ; ..t3t2t1t0ltl0l1 2002cabdff1aSopenharmony_ci movq m2, m0 2003cabdff1aSopenharmony_ci PALIGNR m0, [r1+r2*1-8], 7, m3 ; t3t2t1t0ltl0l1l2 2004cabdff1aSopenharmony_ci PRED4x4_LOWPASS m3, m1, m0, m2, m4 2005cabdff1aSopenharmony_ci movq m1, m3 2006cabdff1aSopenharmony_ci psrlq m3, 16 2007cabdff1aSopenharmony_ci psllq m1, 48 2008cabdff1aSopenharmony_ci movh [r0+r2*1], m5 2009cabdff1aSopenharmony_ci movh [r0+r2*2], m3 2010cabdff1aSopenharmony_ci PALIGNR m5, m1, 7, m2 2011cabdff1aSopenharmony_ci psllq m1, 8 2012cabdff1aSopenharmony_ci movh [r1+r2*1], m5 2013cabdff1aSopenharmony_ci PALIGNR m3, m1, 7, m1 2014cabdff1aSopenharmony_ci movh [r1+r2*2], m3 2015cabdff1aSopenharmony_ci RET 2016cabdff1aSopenharmony_ci 2017cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 2018cabdff1aSopenharmony_ci; void ff_pred4x4_down_right_8_mmxext(uint8_t *src, const uint8_t *topright, 2019cabdff1aSopenharmony_ci; ptrdiff_t stride) 2020cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 2021cabdff1aSopenharmony_ci 2022cabdff1aSopenharmony_ciINIT_MMX mmxext 2023cabdff1aSopenharmony_cicglobal pred4x4_down_right_8, 3,3 2024cabdff1aSopenharmony_ci sub r0, r2 2025cabdff1aSopenharmony_ci lea r1, [r0+r2*2] 2026cabdff1aSopenharmony_ci movq m1, [r1-8] 2027cabdff1aSopenharmony_ci movq m2, [r0+r2*1-8] 2028cabdff1aSopenharmony_ci punpckhbw m2, [r0-8] 2029cabdff1aSopenharmony_ci movh m3, [r0] 2030cabdff1aSopenharmony_ci punpckhwd m1, m2 2031cabdff1aSopenharmony_ci PALIGNR m3, m1, 5, m1 2032cabdff1aSopenharmony_ci movq m1, m3 2033cabdff1aSopenharmony_ci PALIGNR m3, [r1+r2*1-8], 7, m4 2034cabdff1aSopenharmony_ci movq m2, m3 2035cabdff1aSopenharmony_ci PALIGNR m3, [r1+r2*2-8], 7, m4 2036cabdff1aSopenharmony_ci PRED4x4_LOWPASS m0, m3, m1, m2, m4 2037cabdff1aSopenharmony_ci movh [r1+r2*2], m0 2038cabdff1aSopenharmony_ci psrlq m0, 8 2039cabdff1aSopenharmony_ci movh [r1+r2*1], m0 2040cabdff1aSopenharmony_ci psrlq m0, 8 2041cabdff1aSopenharmony_ci movh [r0+r2*2], m0 2042cabdff1aSopenharmony_ci psrlq m0, 8 2043cabdff1aSopenharmony_ci movh [r0+r2*1], m0 2044cabdff1aSopenharmony_ci RET 2045