;******************************************************************************
;* MMX/SSSE3-optimized functions for H.264 chroma MC
;* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
;*               2005-2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; RV40 rounding constants.
;
; Each table holds 16 entries; one entry is 4 identical words (8 bytes), i.e.
; exactly one MMX register worth of data.  The rv40 code paths in this file
; select an entry with
;     rnd_bias = ((my & 6) * 4 + mx) >> 1
; and address it as [table + rnd_bias*8].
;
; rnd_rv40_2d_tbl is added before the ">> 6" of the bilinear (2-D) filter,
; rnd_rv40_1d_tbl before the ">> 3" of the 1-D filter.
rnd_rv40_2d_tbl: times 4 dw  0      ; entry  0
                 times 4 dw 16      ; entry  1
                 times 4 dw 32      ; entry  2
                 times 4 dw 16      ; entry  3
                 times 4 dw 32      ; entry  4
                 times 4 dw 28      ; entry  5
                 times 4 dw 32      ; entry  6
                 times 4 dw 28      ; entry  7
                 times 4 dw  0      ; entry  8
                 times 4 dw 32      ; entry  9
                 times 4 dw 16      ; entry 10
                 times 4 dw 32      ; entry 11
                 times 4 dw 32      ; entry 12
                 times 4 dw 28      ; entry 13
                 times 4 dw 32      ; entry 14
                 times 4 dw 28      ; entry 15
rnd_rv40_1d_tbl: times 4 dw  0      ; entry  0
                 times 4 dw  2      ; entry  1
                 times 4 dw  4      ; entry  2
                 times 4 dw  2      ; entry  3
                 times 4 dw  4      ; entry  4
                 times 4 dw  3      ; entry  5
                 times 4 dw  4      ; entry  6
                 times 4 dw  3      ; entry  7
                 times 4 dw  0      ; entry  8
                 times 4 dw  4      ; entry  9
                 times 4 dw  2      ; entry 10
                 times 4 dw  4      ; entry 11
                 times 4 dw  4      ; entry 12
                 times 4 dw  3      ; entry 13
                 times 4 dw  4      ; entry 14
                 times 4 dw  3      ; entry 15

; Word constants defined in another object file.
cextern pw_3
cextern pw_4
cextern pw_8
pw_28: times 8 dw 28
cextern pw_32
cextern pw_64

SECTION .text

; mv0_pixels_mc8
;
; Unfiltered path, used by the mc8 functions when mx == 0 and my == 0.
; Moves 8 bytes per row from [r1] to [r0] through mm0/mm1, handing every row
; to CHROMAMC_AVG (whatever the instantiation site %defined it to) before the
; store.  r2 is the row step, r3d the row counter.
;
; Four rows are processed per iteration and the loop only terminates when the
; counter hits exactly zero (sub 4 / jne), so the row count must be a positive
; multiple of 4.  Clobbers r4 (2*stride), mm0 and mm1.
%macro mv0_pixels_mc8 0
    lea             r4, [r2*2]
.next4rows:
    ; rows 0 and 1
    movq            mm0, [r1]
    movq            mm1, [r1+r2]
    add             r1, r4
    CHROMAMC_AVG    mm0, [r0]
    CHROMAMC_AVG    mm1, [r0+r2]
    movq            [r0], mm0
    movq            [r0+r2], mm1
    add             r0, r4
    ; rows 2 and 3
    movq            mm0, [r1]
    movq            mm1, [r1+r2]
    add             r1, r4
    CHROMAMC_AVG    mm0, [r0]
    CHROMAMC_AVG    mm1, [r0+r2]
    movq            [r0], mm0
    movq            [r0+r2], mm1
    add             r0, r4
    sub             r3d, 4
    jne             .next4rows
%endmacro

; chroma_mc8_mmx_func %1, %2 [, %3]
;
; Emits the function %1_%2_chroma_mc8%3 (8-pixel-wide chroma MC, MMX).
;   %1  name prefix (the file instantiates it with put and avg)
;   %2  codec: h264, vc1 or rv40; selects the rounding constants through
;       rnd_1d_%2 / rnd_2d_%2
;   %3  optional suffix appended to the symbol name
;
; Three code paths:
;   * mx == 0 and my == 0 : plain copy (mv0_pixels_mc8)
;   * exactly one is zero : 2-tap filter, (A*p0 + B*p1 + rnd) >> 3 with
;                           B = x|y and A = 8 - B
;   * both non-zero       : bilinear, (A*p00 + B*p01 + C*p10 + D*p11 + rnd) >> 6
;
; For rv40 the rounding constant is looked up in a table; on x86-32 the index
; lives in r0, so the destination pointer is reloaded from its stack slot
; (r0m) into r5 and used from there (dest_reg).
%macro chroma_mc8_mmx_func 2-3
%ifidn %2, rv40
%ifdef PIC
%define rnd_1d_rv40 r8
%define rnd_2d_rv40 r8
%define extra_regs 2
%else ; no-PIC
%define rnd_1d_rv40 rnd_rv40_1d_tbl
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%define extra_regs 1
%endif ; PIC
%else
%define extra_regs 0
%endif ; rv40
; void ff_put/avg_h264_chroma_mc8_*(uint8_t *dst /* align 8 */,
;                                   uint8_t *src /* align 1 */,
;                                   ptrdiff_t stride, int h, int mx, int my)
cglobal %1_%2_chroma_mc8%3, 6, 7 + extra_regs, 0
    mov             r6d, r5d
    or              r6d, r4d
    jne             .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
    REP_RET

.at_least_one_non_zero:
%ifidn %2, rv40
    ; rnd_bias = ((my & 6) * 4 + mx) >> 1 : index into the rv40 rounding tables
%if ARCH_X86_64
    mov             r7, r5
    and             r7, 6               ; &~1 for mx/my=[0,7]
    lea             r7, [r7*4+r4]
    sar             r7d, 1
%define rnd_bias r7
%define dest_reg r0
%else ; x86-32
    mov             r0, r5
    and             r0, 6               ; &~1 for mx/my=[0,7]
    lea             r0, [r0*4+r4]
    sar             r0d, 1
%define rnd_bias r0
%define dest_reg r5
%endif
%else ; vc1, h264
%define rnd_bias 0
%define dest_reg r0
%endif

    ; mov does not touch the flags, so each je/jne still sees the preceding test
    test            r5d, r5d
    mov             r6, 1
    je              .my_is_zero
    test            r4d, r4d
    mov             r6, r2              ; dxy = x ? 1 : stride
    jne             .both_non_zero
.my_is_zero:
    ; mx == 0 XOR my == 0 - 1 dimensional filter only
    or              r4d, r5d            ; x + y (one of them is zero)

%ifidn %2, rv40
%ifdef PIC
    lea             r8, [rnd_rv40_1d_tbl]
%endif
%if ARCH_X86_64 == 0
    mov             r5, r0m             ; r0 holds rnd_bias, reload dst into r5
%endif
%endif

    movd            m5, r4d
    movq            m4, [pw_8]
    movq            m6, [rnd_1d_%2+rnd_bias*8] ; mm6 = rnd >> 3
    punpcklwd       m5, m5
    punpckldq       m5, m5              ; mm5 = B = x
    pxor            m7, m7
    psubw           m4, m5              ; mm4 = A = 8-x

.next1drow:
    movq            m0, [r1]            ; mm0 = src[0..7]
    movq            m2, [r1+r6]         ; mm2 = src[dxy..dxy+7], dxy = 1 or stride

    movq            m1, m0
    movq            m3, m2
    punpcklbw       m0, m7
    punpckhbw       m1, m7
    punpcklbw       m2, m7
    punpckhbw       m3, m7
    pmullw          m0, m4              ; [mm0,mm1] = A * src[0..7]
    pmullw          m1, m4
    pmullw          m2, m5              ; [mm2,mm3] = B * src[dxy..dxy+7]
    pmullw          m3, m5

    paddw           m0, m6
    paddw           m1, m6
    paddw           m0, m2
    paddw           m1, m3
    psrlw           m0, 3
    psrlw           m1, 3
    packuswb        m0, m1
    CHROMAMC_AVG    m0, [dest_reg]
    movq            [dest_reg], m0      ; dst[0..7] = (A * src[0..7] + B * src[dxy..dxy+7] + (rnd >> 3)) >> 3

    add             dest_reg, r2
    add             r1, r2
    dec             r3d
    jne             .next1drow
    REP_RET

.both_non_zero: ; general case, bilinear
    movd            m4, r4d             ; x
    movd            m6, r5d             ; y
%ifidn %2, rv40
%ifdef PIC
    lea             r8, [rnd_rv40_2d_tbl]
%endif
%if ARCH_X86_64 == 0
    mov             r5, r0m             ; r0 holds rnd_bias, reload dst into r5
%endif
%endif
    mov             r6, rsp             ; backup stack pointer
    and             rsp, ~(mmsize-1)    ; align stack
    sub             rsp, 16             ; two 8-byte slots: A at [rsp], D at [rsp+8]

    punpcklwd       m4, m4
    punpcklwd       m6, m6
    punpckldq       m4, m4              ; mm4 = x words
    punpckldq       m6, m6              ; mm6 = y words
    movq            m5, m4
    pmullw          m4, m6              ; mm4 = x * y
    psllw           m5, 3
    psllw           m6, 3
    movq            m7, m5
    paddw           m7, m6
    movq            [rsp+8], m4         ; D = x * y
    psubw           m5, m4              ; mm5 = B = 8x - xy
    psubw           m6, m4              ; mm6 = C = 8y - xy
    paddw           m4, [pw_64]
    psubw           m4, m7              ; mm4 = A = xy - (8x+8y) + 64
    pxor            m7, m7
    movq            [rsp], m4

    movq            m0, [r1]            ; mm0 = src[0..7]
    movq            m1, [r1+1]          ; mm1 = src[1..8]
.next2drow:
    add             r1, r2

    ; current row: m0/m1 were loaded before the loop or at the end of the
    ; previous iteration
    movq            m2, m0
    movq            m3, m1
    punpckhbw       m0, m7              ; src[4..7]
    punpcklbw       m1, m7              ; src[1..4]
    punpcklbw       m2, m7              ; src[0..3]
    punpckhbw       m3, m7              ; src[5..8]
    pmullw          m0, [rsp]
    pmullw          m2, [rsp]
    pmullw          m1, m5
    pmullw          m3, m5
    paddw           m2, m1              ; mm2 = A * src[0..3] + B * src[1..4]
    paddw           m3, m0              ; mm3 = A * src[4..7] + B * src[5..8]

    ; next row, same columns
    movq            m0, [r1]
    movq            m1, m0
    punpcklbw       m0, m7
    punpckhbw       m1, m7
    pmullw          m0, m6
    pmullw          m1, m6
    paddw           m2, m0
    paddw           m3, m1              ; [mm2,mm3] += C * src[0..7]

    ; next row, shifted by one pixel
    movq            m1, [r1+1]
    movq            m0, m1
    movq            m4, m1
    punpcklbw       m0, m7
    punpckhbw       m4, m7
    pmullw          m0, [rsp+8]
    pmullw          m4, [rsp+8]
    paddw           m2, m0
    paddw           m3, m4              ; [mm2,mm3] += D * src[1..8]
    movq            m0, [r1]            ; reload for the next iteration (m1 still holds [r1+1])

    paddw           m2, [rnd_2d_%2+rnd_bias*8]
    paddw           m3, [rnd_2d_%2+rnd_bias*8]
    psrlw           m2, 6
    psrlw           m3, 6
    packuswb        m2, m3
    CHROMAMC_AVG    m2, [dest_reg]
    movq            [dest_reg], m2      ; dst[0..7] = ([mm2,mm3] + rnd) >> 6

    add             dest_reg, r2
    dec             r3d
    jne             .next2drow
    mov             rsp, r6             ; restore stack pointer
    RET
%endmacro

; chroma_mc4_mmx_func %1, %2
;
; Emits the function %1_%2_chroma_mc4 (4-pixel-wide chroma MC, MMX).
;   %1  name prefix (put / avg)
;   %2  codec; selects rnd_2d_%2, with a table lookup for rv40
;
; The filter is applied separably: every source row is first filtered
; horizontally, h = (8-x)*p[i] + x*p[i+1], and two consecutive horizontal
; results are then combined as ((8-y)*h_prev + y*h_cur + rnd) >> 6.
; m6 and m0 alternate as the holder of the previous row's horizontal result,
; so the loop body produces two output rows and the counter is decremented by
; 2 (jnz: the loop ends only on an exact zero).
%macro chroma_mc4_mmx_func 2
%define extra_regs 0
%ifidn %2, rv40
%ifdef PIC
%define extra_regs 1
%endif ; PIC
%endif ; rv40
cglobal %1_%2_chroma_mc4, 6, 6 + extra_regs, 0
    pxor            m7, m7
    movd            m2, r4d             ; x
    movd            m3, r5d             ; y
    movq            m4, [pw_8]
    movq            m5, [pw_8]
    punpcklwd       m2, m2
    punpcklwd       m3, m3
    punpcklwd       m2, m2              ; mm2 = x words
    punpcklwd       m3, m3              ; mm3 = y words
    psubw           m4, m2              ; mm4 = 8-x
    psubw           m5, m3              ; mm5 = 8-y

%ifidn %2, rv40
%ifdef PIC
    lea             r6, [rnd_rv40_2d_tbl]
%define rnd_2d_rv40 r6
%else
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%endif
    ; rnd_bias = ((my & 6) * 4 + mx) >> 1 ; x and y are already in mm2/mm3
    and             r5, 6               ; &~1 for mx/my=[0,7]
    lea             r5, [r5*4+r4]
    sar             r5d, 1
%define rnd_bias r5
%else ; vc1, h264
%define rnd_bias 0
%endif

    ; horizontal pass of the first source row -> m6
    movd            m0, [r1]
    movd            m6, [r1+1]
    add             r1, r2
    punpcklbw       m0, m7
    punpcklbw       m6, m7
    pmullw          m0, m4
    pmullw          m6, m2
    paddw           m6, m0

.next2rows:
    ; horizontal pass of the next row -> m1, copy kept in m0
    movd            m0, [r1]
    movd            m1, [r1+1]
    add             r1, r2
    punpcklbw       m0, m7
    punpcklbw       m1, m7
    pmullw          m0, m4
    pmullw          m1, m2
    paddw           m1, m0
    movq            m0, m1

    ; vertical blend of m6 (previous row) and m1 (current row)
    pmullw          m6, m5
    pmullw          m1, m3
    paddw           m6, [rnd_2d_%2+rnd_bias*8]
    paddw           m1, m6
    psrlw           m1, 6
    packuswb        m1, m1
    CHROMAMC_AVG4   m1, m6, [r0]        ; m6 is free here and serves as scratch
    movd            [r0], m1
    add             r0, r2

    ; horizontal pass of the next row -> m1, copy kept in m6
    movd            m6, [r1]
    movd            m1, [r1+1]
    add             r1, r2
    punpcklbw       m6, m7
    punpcklbw       m1, m7
    pmullw          m6, m4
    pmullw          m1, m2
    paddw           m1, m6
    movq            m6, m1
    ; vertical blend of m0 (previous row) and m1 (current row)
    pmullw          m0, m5
    pmullw          m1, m3
    paddw           m0, [rnd_2d_%2+rnd_bias*8]
    paddw           m1, m0
    psrlw           m1, 6
    packuswb        m1, m1
    CHROMAMC_AVG4   m1, m0, [r0]        ; m0 is free here and serves as scratch
    movd            [r0], m1
    add             r0, r2
    sub             r3d, 2
    jnz             .next2rows
    REP_RET
%endmacro

; chroma_mc2_mmx_func %1, %2
;
; Emits the function %1_%2_chroma_mc2 (2-pixel-wide chroma MC; uses pshufw).
; The four bilinear weights are built with integer arithmetic as two packed
; word pairs and applied with pmaddwd:
;   r4d = x*(8-y) << 16 | (8-x)*(8-y)   -> {A,B}
;   r5d = x*y     << 16 | y*(8-x)       -> {C,D}
; One output row (2 bytes) is written per iteration.
%macro chroma_mc2_mmx_func 2
cglobal %1_%2_chroma_mc2, 6, 7, 0
    mov             r6d, r4d
    shl             r4d, 16
    sub             r4d, r6d
    add             r4d, 8              ; x<<16 | (8-x)
    imul            r5d, r4d            ; x*y<<16 | y*(8-x)
    shl             r4d, 3
    sub             r4d, r5d            ; x*(8-y)<<16 | (8-x)*(8-y)

    movd            m5, r4d
    movd            m6, r5d
    punpckldq       m5, m5              ; mm5 = {A,B,A,B}
    punpckldq       m6, m6              ; mm6 = {C,D,C,D}
    pxor            m7, m7
    movd            m2, [r1]
    punpcklbw       m2, m7
    pshufw          m2, m2, 0x94        ; mm2 = src[0,1,1,2]

.nextrow:
    add             r1, r2
    movq            m1, m2
    pmaddwd         m1, m5              ; mm1 = A * src[0,1] + B * src[1,2]
    movd            m0, [r1]
    punpcklbw       m0, m7
    pshufw          m0, m0, 0x94        ; mm0 = src[0,1,1,2]
    movq            m2, m0              ; keep the row for the next iteration
    pmaddwd         m0, m6
    paddw           m1, [rnd_2d_%2]
    paddw           m1, m0              ; mm1 += C * src[0,1] + D * src[1,2]
    psrlw           m1, 6
    packssdw        m1, m7
    packuswb        m1, m7
    CHROMAMC_AVG4   m1, m3, [r0]
    movd            r5d, m1
    mov             [r0], r5w           ; only the two result bytes are stored
    add             r0, r2
    sub             r3d, 1
    jnz             .nextrow
    REP_RET
%endmacro

; Rounding constants for the non-table codecs, selected via rnd_1d_%2/rnd_2d_%2.
%define rnd_1d_h264 pw_4
%define rnd_2d_h264 pw_32
%define rnd_1d_vc1  pw_3
%define rnd_2d_vc1  pw_28

; Candidates for CHROMAMC_AVG / CHROMAMC_AVG4.
; NOTHING    : "put" variants - expands to nothing.
; DIRECT_AVG : PAVGB of %1 with operand %2.
; COPY_AVG   : loads %3 into scratch register %2 with movd, then PAVGB.
%macro NOTHING 2-3
%endmacro
%macro DIRECT_AVG 2
    PAVGB           %1, %2
%endmacro
%macro COPY_AVG 3
    movd            %2, %3
    PAVGB           %1, %2
%endmacro

INIT_MMX mmx
%define CHROMAMC_AVG  NOTHING
%define CHROMAMC_AVG4 NOTHING
chroma_mc8_mmx_func put, h264, _rnd
chroma_mc8_mmx_func put, vc1, _nornd
chroma_mc8_mmx_func put, rv40
chroma_mc4_mmx_func put, h264
chroma_mc4_mmx_func put, rv40

INIT_MMX mmxext
chroma_mc2_mmx_func put, h264

%define CHROMAMC_AVG  DIRECT_AVG
%define CHROMAMC_AVG4 COPY_AVG
chroma_mc8_mmx_func avg, h264, _rnd
chroma_mc8_mmx_func avg, vc1, _nornd
chroma_mc8_mmx_func avg, rv40
chroma_mc4_mmx_func avg, h264
chroma_mc4_mmx_func avg, rv40
chroma_mc2_mmx_func avg, h264

; chroma_mc8_ssse3_func %1, %2 [, %3]
;
; Emits the function %1_%2_chroma_mc8%3 (8-pixel-wide chroma MC, SSSE3).
;   %1  put or avg; for avg the two destination rows are loaded into one
;       register before CHROMAMC_AVG is applied
;   %2  codec; selects rnd_1d_%2 / rnd_2d_%2
;   %3  optional suffix appended to the symbol name
;
; Neighbouring pixels are interleaved byte-wise (punpcklbw) and multiplied by
; a byte pair of weights with pmaddubsw, which yields w0*p[i] + w1*p[i+1] per
; word.  Every path produces two rows per iteration and loops while the
; remaining row count is > 0 (jg; the lea between sub and jg leaves the flags
; untouched).
%macro chroma_mc8_ssse3_func 2-3
cglobal %1_%2_chroma_mc8%3, 6, 7, 8
    mov             r6d, r5d
    or              r6d, r4d
    jne             .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
    REP_RET

.at_least_one_non_zero:
    test            r5d, r5d
    je              .my_is_zero
    test            r4d, r4d
    je              .mx_is_zero

    ; general case, bilinear
    mov             r6d, r4d
    shl             r4d, 8
    sub             r4, r6
    mov             r6, 8
    add             r4, 8               ; x*255+8 = x<<8 | (8-x)
    sub             r6d, r5d
    imul            r6, r4              ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul            r4d, r5d            ;    y *(x*255+8) =    y *x<<8 |    y *(8-x)

    movd            m7, r6d
    movd            m6, r4d
    movdqa          m5, [rnd_2d_%2]
    movq            m0, [r1]
    movq            m1, [r1+1]
    pshuflw         m7, m7, 0
    pshuflw         m6, m6, 0
    punpcklbw       m0, m1              ; m0 = first row, interleaved with its right neighbours
    movlhps         m7, m7              ; m7 = weight pairs for the upper row
    movlhps         m6, m6              ; m6 = weight pairs for the lower row

.next2rows:
    movq            m1, [r1+r2*1]
    movq            m2, [r1+r2*1+1]
    movq            m3, [r1+r2*2]
    movq            m4, [r1+r2*2+1]
    lea             r1, [r1+r2*2]
    punpcklbw       m1, m2
    movdqa          m2, m1
    punpcklbw       m3, m4
    movdqa          m4, m3
    pmaddubsw       m0, m7              ; row n   * upper weights
    pmaddubsw       m1, m6              ; row n+1 * lower weights
    pmaddubsw       m2, m7              ; row n+1 * upper weights
    pmaddubsw       m3, m6              ; row n+2 * lower weights
    paddw           m0, m5
    paddw           m2, m5
    paddw           m1, m0
    paddw           m3, m2
    psrlw           m1, 6
    movdqa          m0, m4              ; row n+2 becomes row n of the next iteration
    psrlw           m3, 6
%ifidn %1, avg
    movq            m2, [r0]
    movhps          m2, [r0+r2]
%endif
    packuswb        m1, m3
    CHROMAMC_AVG    m1, m2
    movq            [r0], m1
    movhps          [r0+r2], m1
    sub             r3d, 2
    lea             r0, [r0+r2*2]
    jg              .next2rows
    REP_RET

.my_is_zero:
    ; horizontal-only filter
    mov             r5d, r4d
    shl             r4d, 8
    add             r4, 8
    sub             r4, r5              ; 255*x+8 = x<<8 | (8-x)
    movd            m7, r4d
    movdqa          m6, [rnd_1d_%2]
    pshuflw         m7, m7, 0
    movlhps         m7, m7

.next2xrows:
    movq            m0, [r1]
    movq            m1, [r1+1]
    movq            m2, [r1+r2]
    movq            m3, [r1+r2+1]
    punpcklbw       m0, m1
    punpcklbw       m2, m3
    pmaddubsw       m0, m7
    pmaddubsw       m2, m7
%ifidn %1, avg
    movq            m4, [r0]
    movhps          m4, [r0+r2]
%endif
    paddw           m0, m6
    paddw           m2, m6
    psrlw           m0, 3
    psrlw           m2, 3
    packuswb        m0, m2
    CHROMAMC_AVG    m0, m4
    movq            [r0], m0
    movhps          [r0+r2], m0
    sub             r3d, 2
    lea             r0, [r0+r2*2]
    lea             r1, [r1+r2*2]
    jg              .next2xrows
    REP_RET

.mx_is_zero:
    ; vertical-only filter
    mov             r4d, r5d
    shl             r5d, 8
    add             r5, 8
    sub             r5, r4              ; 255*y+8 = y<<8 | (8-y)
    movd            m7, r5d
    movdqa          m6, [rnd_1d_%2]
    pshuflw         m7, m7, 0
    movlhps         m7, m7

.next2yrows:
    movq            m0, [r1]
    movq            m1, [r1+r2]
    movdqa          m2, m1
    movq            m3, [r1+r2*2]
    lea             r1, [r1+r2*2]
    punpcklbw       m0, m1              ; rows n   / n+1 interleaved
    punpcklbw       m2, m3              ; rows n+1 / n+2 interleaved
    pmaddubsw       m0, m7
    pmaddubsw       m2, m7
%ifidn %1, avg
    movq            m4, [r0]
    movhps          m4, [r0+r2]
%endif
    paddw           m0, m6
    paddw           m2, m6
    psrlw           m0, 3
    psrlw           m2, 3
    packuswb        m0, m2
    CHROMAMC_AVG    m0, m4
    movq            [r0], m0
    movhps          [r0+r2], m0
    sub             r3d, 2
    lea             r0, [r0+r2*2]
    jg              .next2yrows
    REP_RET
%endmacro

; chroma_mc4_ssse3_func %1, %2
;
; Emits the function %1_%2_chroma_mc4 (4-pixel-wide chroma MC, pmaddubsw on
; MMX registers).  Same weight packing and row pipelining as the bilinear
; path of chroma_mc8_ssse3_func; there is no special case for mx == 0 or
; my == 0.  Two rows per iteration, loops while the remaining count is > 0.
;
; The rounding constant is taken from rnd_2d_%2 like in the other functions
; of this file; for h264 that is pw_32.
%macro chroma_mc4_ssse3_func 2
cglobal %1_%2_chroma_mc4, 6, 7, 0
    mov             r6, r4
    shl             r4d, 8
    sub             r4d, r6d
    mov             r6, 8
    add             r4d, 8              ; x*255+8 = x<<8 | (8-x)
    sub             r6d, r5d
    imul            r6d, r4d            ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul            r4d, r5d            ;    y *(x*255+8) =    y *x<<8 |    y *(8-x)

    movd            m7, r6d
    movd            m6, r4d
    movq            m5, [rnd_2d_%2]
    movd            m0, [r1]
    pshufw          m7, m7, 0
    punpcklbw       m0, [r1+1]
    pshufw          m6, m6, 0

.next2rows:
    movd            m1, [r1+r2*1]
    movd            m3, [r1+r2*2]
    punpcklbw       m1, [r1+r2*1+1]
    punpcklbw       m3, [r1+r2*2+1]
    lea             r1, [r1+r2*2]
    movq            m2, m1
    movq            m4, m3
    pmaddubsw       m0, m7
    pmaddubsw       m1, m6
    pmaddubsw       m2, m7
    pmaddubsw       m3, m6
    paddw           m0, m5
    paddw           m2, m5
    paddw           m1, m0
    paddw           m3, m2
    psrlw           m1, 6
    movq            m0, m4              ; last loaded row feeds the next iteration
    psrlw           m3, 6
    packuswb        m1, m1
    packuswb        m3, m3
    CHROMAMC_AVG    m1, [r0]
    CHROMAMC_AVG    m3, [r0+r2]
    movd            [r0], m1
    movd            [r0+r2], m3
    sub             r3d, 2
    lea             r0, [r0+r2*2]
    jg              .next2rows
    REP_RET
%endmacro

; SSSE3 instantiations.  The mc8 functions are built for XMM registers, the
; mc4 function for MMX registers.  "put" variants first (no averaging), then
; the "avg" variants with PAVGB-based averaging.
%define CHROMAMC_AVG NOTHING
INIT_XMM ssse3
chroma_mc8_ssse3_func put, h264, _rnd
chroma_mc8_ssse3_func put, vc1, _nornd
INIT_MMX ssse3
chroma_mc4_ssse3_func put, h264

%define CHROMAMC_AVG DIRECT_AVG
INIT_XMM ssse3
chroma_mc8_ssse3_func avg, h264, _rnd
chroma_mc8_ssse3_func avg, vc1, _nornd
INIT_MMX ssse3
chroma_mc4_ssse3_func avg, h264