;*****************************************************************************
;* MMX/SSE2/SSSE3-optimized H.264 QPEL code
;*****************************************************************************
;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2012 Daniel Kang
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; Syntax: NASM with the x86inc/x86util abstraction layer (cglobal, mN/rN
; register aliases, mova/movh/movu, INIT_MMX/INIT_XMM, SWAP, REP_RET).
; All kernels below implement pieces of the 6-tap (1,-5,20,20,-5,1) lowpass
; filter.  "put" variants store the result, "avg" variants average it with
; the bytes already present at the destination (see the op_* macros).

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

; Constants defined elsewhere.  pw_5 is used below as the x5 multiplier and
; pw_16 as the +16 rounding term (names suggest packed words of 5 / 16).
; pb_0 is declared but not referenced in this file.
cextern pw_16
cextern pw_5
cextern pb_0

SECTION .text


;-----------------------------------------------------------------------------
; Store helpers, selected by name through "op_" + put/avg [+ "h"].
;   arg 1 = register holding the packed result bytes
;   arg 2 = destination memory operand
;   arg 3 = scratch register (only op_avgh actually uses it)
; The "h" forms move half a register (movh); the others a full one (mova).
;-----------------------------------------------------------------------------

; Average half a register of result bytes with the destination, store back.
; Clobbers the scratch register and the result register.
%macro op_avgh 3
    movh   %3, %2
    pavgb  %1, %3
    movh   %2, %1
%endmacro

; Average a full register with the destination (memory operand), store back.
%macro op_avg 2-3
    pavgb  %1, %2
    mova   %2, %1
%endmacro

; Plain half-register store.
%macro op_puth 2-3
    movh   %2, %1
%endmacro

; Plain full-register store.
%macro op_put 2-3
    mova   %2, %1
%endmacro

;-----------------------------------------------------------------------------
; OP_h264_qpel4_h_lowpass(dst, src, dstStride, srcStride)      OP = put | avg
; Horizontal 6-tap lowpass, 4 rows, one half-register of pixels per row:
;   out[x] = clip_u8((s[x-2] - 5*s[x-1] + 20*s[x] + 20*s[x+1]
;                     - 5*s[x+2] + s[x+3] + 16) >> 5)
; r0 = dst, r1 = src, r2 = dstStride, r3 = srcStride, r4d = row counter
; m7 = 0, m4 = pw_5, m5 = pw_16, m6 = scratch for the avg store
;-----------------------------------------------------------------------------
%macro QPEL4_H_LOWPASS_OP 1
cglobal %1_h264_qpel4_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
    movsxdifnidn  r2, r2d
    movsxdifnidn  r3, r3d
    pxor          m7, m7
    mova          m4, [pw_5]
    mova          m5, [pw_16]
    mov          r4d, 4
.loop:
    movh          m1, [r1-1]
    movh          m2, [r1+0]
    movh          m3, [r1+1]
    movh          m0, [r1+2]
    punpcklbw     m1, m7              ; widen bytes to words
    punpcklbw     m2, m7
    punpcklbw     m3, m7
    punpcklbw     m0, m7
    paddw         m1, m0              ; m1 = s[-1] + s[2]
    paddw         m2, m3              ; m2 = s[0]  + s[1]
    movh          m0, [r1-2]
    movh          m3, [r1+3]
    punpcklbw     m0, m7
    punpcklbw     m3, m7
    paddw         m0, m3              ; m0 = s[-2] + s[3]
    psllw         m2, 2
    psubw         m2, m1
    pmullw        m2, m4              ; m2 = 20*(s0+s1) - 5*(s-1+s2)
    paddw         m0, m5              ; + rounding
    paddw         m0, m2
    psraw         m0, 5
    packuswb      m0, m0              ; saturate to unsigned bytes
    op_%1h        m0, [r0], m6
    add           r0, r2
    add           r1, r3
    dec          r4d
    jg         .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL4_H_LOWPASS_OP put
QPEL4_H_LOWPASS_OP avg

;-----------------------------------------------------------------------------
; OP_h264_qpel8_h_lowpass(dst, src, dstStride, srcStride)      OP = put | avg
; Same filter as the qpel4 version, 8 rows, one full 64-bit register of
; pixels per row, computed as a low half (m0) and a high half (m1).
; r0 = dst, r1 = src, r2 = dstStride, r3 = srcStride, r4d = row counter
; m7 = 0, m6 = pw_5
;-----------------------------------------------------------------------------
%macro QPEL8_H_LOWPASS_OP 1
cglobal %1_h264_qpel8_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
    movsxdifnidn  r2, r2d
    movsxdifnidn  r3, r3d
    mov          r4d, 8
    pxor          m7, m7
    mova          m6, [pw_5]
.loop:
    mova          m0, [r1]
    mova          m2, [r1+1]
    mova          m1, m0
    mova          m3, m2
    punpcklbw     m0, m7
    punpckhbw     m1, m7
    punpcklbw     m2, m7
    punpckhbw     m3, m7
    paddw         m0, m2              ; s[x] + s[x+1], low half
    paddw         m1, m3              ; s[x] + s[x+1], high half
    psllw         m0, 2
    psllw         m1, 2
    mova          m2, [r1-1]
    mova          m4, [r1+2]
    mova          m3, m2
    mova          m5, m4
    punpcklbw     m2, m7
    punpckhbw     m3, m7              ; m3 = words s[3..6], reused below
    punpcklbw     m4, m7              ; m4 = words s[2..5], reused below
    punpckhbw     m5, m7
    paddw         m2, m4              ; s[x-1] + s[x+2], low half
    paddw         m5, m3              ; s[x-1] + s[x+2], high half
    psubw         m0, m2
    psubw         m1, m5
    pmullw        m0, m6
    pmullw        m1, m6
    movd          m2, [r1-2]          ; s[-2..1]
    movd          m5, [r1+7]          ; s[7..10]
    punpcklbw     m2, m7
    punpcklbw     m5, m7
    paddw         m2, m3              ; s[x-2] + s[x+3] for x = 0..3
    paddw         m4, m5              ; s[x-2] + s[x+3] for x = 4..7
    mova          m5, [pw_16]
    paddw         m2, m5
    paddw         m4, m5
    paddw         m0, m2
    paddw         m1, m4
    psraw         m0, 5
    psraw         m1, 5
    packuswb      m0, m1
    op_%1         m0, [r0], m4
    add           r0, r2
    add           r1, r3
    dec          r4d
    jg         .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL8_H_LOWPASS_OP put
QPEL8_H_LOWPASS_OP avg

;-----------------------------------------------------------------------------
; OP_h264_qpel8_h_lowpass(dst, src, dstStride, srcStride), XMM version.
; One unaligned 16-byte load at src-2 is widened into m0 (words s[-2..5]) and
; m1 (words s[6..13]); palignr by 2*k bytes then yields the window shifted by
; k pixels, so all six taps come from that single load.
; r0 = dst, r1 = src, r2 = dstStride, r3 = srcStride, r4d = row counter
; m7 = 0, m6 = pw_5
;-----------------------------------------------------------------------------
%macro QPEL8_H_LOWPASS_OP_XMM 1
cglobal %1_h264_qpel8_h_lowpass, 4,5,8 ; dst, src, dstStride, srcStride
    movsxdifnidn  r2, r2d
    movsxdifnidn  r3, r3d
    mov          r4d, 8
    pxor          m7, m7
    mova          m6, [pw_5]
.loop:
    movu          m1, [r1-2]
    mova          m0, m1
    punpckhbw     m1, m7
    punpcklbw     m0, m7
    mova          m2, m1
    mova          m3, m1
    mova          m4, m1
    mova          m5, m1
    palignr       m4, m0, 2           ; s[x-1]
    palignr       m3, m0, 4           ; s[x]
    palignr       m2, m0, 6           ; s[x+1]
    palignr       m1, m0, 8           ; s[x+2]
    palignr       m5, m0, 10          ; s[x+3]
    paddw         m0, m5              ; s[x-2] + s[x+3]
    paddw         m2, m3              ; s[x]   + s[x+1]
    paddw         m1, m4              ; s[x-1] + s[x+2]
    psllw         m2, 2
    psubw         m2, m1
    paddw         m0, [pw_16]
    pmullw        m2, m6
    paddw         m2, m0
    psraw         m2, 5
    packuswb      m2, m2
    op_%1h        m2, [r0], m4
    add           r1, r3
    add           r0, r2
    dec          r4d
    jne        .loop
    REP_RET
%endmacro

INIT_XMM ssse3
QPEL8_H_LOWPASS_OP_XMM put
QPEL8_H_LOWPASS_OP_XMM avg


;-----------------------------------------------------------------------------
; OP_h264_qpel4_h_lowpass_l2(dst, src, src2, dstStride, src2Stride)
; Horizontal lowpass as in qpel4_h_lowpass, then pavgb with the matching row
; of src2 before the put/avg store.
; NOTE: r3 advances BOTH dst and src each row; r4 advances only src2.
; r0 = dst, r1 = src, r2 = src2, r5d = row counter
; m7 = 0, m4 = pw_5, m5 = pw_16, m6 = scratch for the avg store
;-----------------------------------------------------------------------------
%macro QPEL4_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel4_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, src2Stride
    movsxdifnidn  r3, r3d
    movsxdifnidn  r4, r4d
    pxor          m7, m7
    mova          m4, [pw_5]
    mova          m5, [pw_16]
    mov          r5d, 4
.loop:
    movh          m1, [r1-1]
    movh          m2, [r1+0]
    movh          m3, [r1+1]
    movh          m0, [r1+2]
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpcklbw     m3, m7
    punpcklbw     m0, m7
    paddw         m1, m0              ; s[-1] + s[2]
    paddw         m2, m3              ; s[0]  + s[1]
    movh          m0, [r1-2]
    movh          m3, [r1+3]
    punpcklbw     m0, m7
    punpcklbw     m3, m7
    paddw         m0, m3              ; s[-2] + s[3]
    psllw         m2, 2
    psubw         m2, m1
    pmullw        m2, m4
    paddw         m0, m5
    paddw         m0, m2
    movh          m3, [r2]            ; second source row
    psraw         m0, 5
    packuswb      m0, m0
    pavgb         m0, m3
    op_%1h        m0, [r0], m6
    add           r0, r3
    add           r1, r3
    add           r2, r4
    dec          r5d
    jg         .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL4_H_LOWPASS_L2_OP put
QPEL4_H_LOWPASS_L2_OP avg


;-----------------------------------------------------------------------------
; OP_h264_qpel8_h_lowpass_l2(dst, src, src2, dstStride, src2Stride)
; 8-row, full-64-bit-register variant of the l2 kernel above (filter as in
; qpel8_h_lowpass, then pavgb with src2).
; NOTE: r3 advances BOTH dst and src each row; r4 advances only src2.
; r0 = dst, r1 = src, r2 = src2, r5d = row counter, m7 = 0, m6 = pw_5
;-----------------------------------------------------------------------------
%macro QPEL8_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel8_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, src2Stride
    movsxdifnidn  r3, r3d
    movsxdifnidn  r4, r4d
    mov          r5d, 8
    pxor          m7, m7
    mova          m6, [pw_5]
.loop:
    mova          m0, [r1]
    mova          m2, [r1+1]
    mova          m1, m0
    mova          m3, m2
    punpcklbw     m0, m7
    punpckhbw     m1, m7
    punpcklbw     m2, m7
    punpckhbw     m3, m7
    paddw         m0, m2              ; s[x] + s[x+1], low half
    paddw         m1, m3              ; s[x] + s[x+1], high half
    psllw         m0, 2
    psllw         m1, 2
    mova          m2, [r1-1]
    mova          m4, [r1+2]
    mova          m3, m2
    mova          m5, m4
    punpcklbw     m2, m7
    punpckhbw     m3, m7              ; m3 = words s[3..6], reused below
    punpcklbw     m4, m7              ; m4 = words s[2..5], reused below
    punpckhbw     m5, m7
    paddw         m2, m4              ; s[x-1] + s[x+2], low half
    paddw         m5, m3              ; s[x-1] + s[x+2], high half
    psubw         m0, m2
    psubw         m1, m5
    pmullw        m0, m6
    pmullw        m1, m6
    movd          m2, [r1-2]
    movd          m5, [r1+7]
    punpcklbw     m2, m7
    punpcklbw     m5, m7
    paddw         m2, m3              ; s[x-2] + s[x+3] for x = 0..3
    paddw         m4, m5              ; s[x-2] + s[x+3] for x = 4..7
    mova          m5, [pw_16]
    paddw         m2, m5
    paddw         m4, m5
    paddw         m0, m2
    paddw         m1, m4
    psraw         m0, 5
    psraw         m1, 5
    mova          m4, [r2]            ; second source row
    packuswb      m0, m1
    pavgb         m0, m4
    op_%1         m0, [r0], m4
    add           r0, r3
    add           r1, r3
    add           r2, r4
    dec          r5d
    jg         .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL8_H_LOWPASS_L2_OP put
QPEL8_H_LOWPASS_L2_OP avg


;-----------------------------------------------------------------------------
; OP_h264_qpel8_h_lowpass_l2(dst, src, src2, dstStride, src2Stride), XMM.
; palignr-based filter as in QPEL8_H_LOWPASS_OP_XMM, then pavgb with 8 bytes
; of src2.
; NOTE: r3 advances BOTH dst and src each row; r4 advances only src2.
; r0 = dst, r1 = src, r2 = src2, r5d = row counter, m7 = 0, m6 = pw_5
;-----------------------------------------------------------------------------
%macro QPEL8_H_LOWPASS_L2_OP_XMM 1
cglobal %1_h264_qpel8_h_lowpass_l2, 5,6,8 ; dst, src, src2, dstStride, src2Stride
    movsxdifnidn  r3, r3d
    movsxdifnidn  r4, r4d
    mov          r5d, 8
    pxor          m7, m7
    mova          m6, [pw_5]
.loop:
    lddqu         m1, [r1-2]
    mova          m0, m1
    punpckhbw     m1, m7
    punpcklbw     m0, m7
    mova          m2, m1
    mova          m3, m1
    mova          m4, m1
    mova          m5, m1
    palignr       m4, m0, 2           ; s[x-1]
    palignr       m3, m0, 4           ; s[x]
    palignr       m2, m0, 6           ; s[x+1]
    palignr       m1, m0, 8           ; s[x+2]
    palignr       m5, m0, 10          ; s[x+3]
    paddw         m0, m5
    paddw         m2, m3
    paddw         m1, m4
    psllw         m2, 2
    movh          m3, [r2]            ; second source row
    psubw         m2, m1
    paddw         m0, [pw_16]
    pmullw        m2, m6
    paddw         m2, m0
    psraw         m2, 5
    packuswb      m2, m2
    pavgb         m2, m3
    op_%1h        m2, [r0], m4
    add           r1, r3
    add           r0, r3
    add           r2, r4
    dec          r5d
    jg         .loop
    REP_RET
%endmacro

INIT_XMM ssse3
QPEL8_H_LOWPASS_L2_OP_XMM put
QPEL8_H_LOWPASS_L2_OP_XMM avg


;-----------------------------------------------------------------------------
; FILT_V OP -- emit one output row of the vertical 6-tap filter.
; All functions that call this are required to have function arguments of
; dst, src, dstStride, srcStride
; On entry m0..m4 hold five consecutive source rows as words, m7 = 0 and r1
; points at the sixth row.  The macro loads that row into m5, writes
;   clip_u8((m0 - 5*m1 + 20*m2 + 20*m3 - 5*m4 + m5 + 16) >> 5)
; through op_OPh, advances r1/r0 by one row and rotates the register names
; with SWAP so that m0..m4 name the next five-row window.
; Clobbers m6; m0 doubles as the scratch register of the avg store.
;-----------------------------------------------------------------------------
%macro FILT_V 1
    mova      m6, m2
    movh      m5, [r1]
    paddw     m6, m3
    psllw     m6, 2
    psubw     m6, m1
    psubw     m6, m4
    punpcklbw m5, m7
    pmullw    m6, [pw_5]
    paddw     m0, [pw_16]
    add       r1, r3
    paddw     m0, m5
    paddw     m6, m0
    psraw     m6, 5
    packuswb  m6, m6
    op_%1h    m6, [r0], m0 ; 1
    add       r0, r2
    SWAP       0, 1, 2, 3, 4, 5
%endmacro

;-----------------------------------------------------------------------------
; OP_h264_qpel4_v_lowpass(dst, src, dstStride, srcStride)      OP = put | avg
; Vertical 6-tap lowpass producing 4 rows.  src is moved up by two rows
; inside the function, so the caller passes the pointer to the block itself.
;-----------------------------------------------------------------------------
%macro QPEL4_V_LOWPASS_OP 1
cglobal %1_h264_qpel4_v_lowpass, 4,4 ; dst, src, dstStride, srcStride
    movsxdifnidn  r2, r2d
    movsxdifnidn  r3, r3d
    sub           r1, r3
    sub           r1, r3              ; src -= 2*srcStride
    pxor          m7, m7
    movh          m0, [r1]
    movh          m1, [r1+r3]
    lea           r1, [r1+2*r3]
    movh          m2, [r1]
    movh          m3, [r1+r3]
    lea           r1, [r1+2*r3]
    movh          m4, [r1]
    add           r1, r3              ; r1 -> sixth row, as FILT_V expects
    punpcklbw     m0, m7
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpcklbw     m3, m7
    punpcklbw     m4, m7
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    RET
%endmacro

INIT_MMX mmxext
QPEL4_V_LOWPASS_OP put
QPEL4_V_LOWPASS_OP avg



;-----------------------------------------------------------------------------
; OP_h264_qpel8or16_v_lowpass(dst, src, dstStride, srcStride, h)
; Vertical 6-tap lowpass, one half-register-wide column strip; emits 8 rows,
; plus 8 more only when h == 16 (any other h yields 8 rows).
; With SSE2 the function rewinds src by two rows itself.  The non-SSE2 branch
; (suffix _op) does not -- presumably its caller does; verify against caller.
;-----------------------------------------------------------------------------
%macro QPEL8OR16_V_LOWPASS_OP 1
%if cpuflag(sse2)
cglobal %1_h264_qpel8or16_v_lowpass, 5,5,8 ; dst, src, dstStride, srcStride, h
    movsxdifnidn  r2, r2d
    movsxdifnidn  r3, r3d
    sub           r1, r3
    sub           r1, r3
%else
cglobal %1_h264_qpel8or16_v_lowpass_op, 5,5,8 ; dst, src, dstStride, srcStride, h
    movsxdifnidn  r2, r2d
    movsxdifnidn  r3, r3d
%endif
    pxor          m7, m7
    movh          m0, [r1]
    movh          m1, [r1+r3]
    lea           r1, [r1+2*r3]
    movh          m2, [r1]
    movh          m3, [r1+r3]
    lea           r1, [r1+2*r3]
    movh          m4, [r1]
    add           r1, r3
    punpcklbw     m0, m7
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpcklbw     m3, m7
    punpcklbw     m4, m7
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    cmp          r4d, 16
    jne         .end
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
.end:
    REP_RET
%endmacro

INIT_XMM sse2
QPEL8OR16_V_LOWPASS_OP put
QPEL8OR16_V_LOWPASS_OP avg


;-----------------------------------------------------------------------------
; FILT_HV offset -- one row of the vertical first pass of the 2-D filter.
; All functions that use this are required to have args:
; src, tmp, srcStride
; Same tap arithmetic as FILT_V, but the word result
;   m0 - 5*m1 + 20*m2 + 20*m3 - 5*m4 + m5 + 16
; is stored unshifted and unclipped at [tmp + offset] for the horizontal
; second pass.  Advances r0 by r2 and rotates m0..m5 with SWAP.  Clobbers m6.
;-----------------------------------------------------------------------------
%macro FILT_HV 1 ; offset
    mova      m6, m2
    movh      m5, [r0]
    paddw     m6, m3
    psllw     m6, 2
    paddw     m0, [pw_16]
    psubw     m6, m1
    psubw     m6, m4
    punpcklbw m5, m7
    pmullw    m6, [pw_5]
    paddw     m0, m5
    add       r0, r2
    paddw     m6, m0
    mova      [r1+%1], m6
    SWAP       0, 1, 2, 3, 4, 5
%endmacro

;-----------------------------------------------------------------------------
; 4x4 two-pass (vertical then horizontal) lowpass.
;
; OP_h264_qpel4_hv_lowpass_v(src, tmp, srcStride)
;   Vertical pass: 4 rows of word intermediates, tmp row pitch 24 bytes.
;   src is used as given (no rewind by two rows here).
;
; OP_h264_qpel4_hv_lowpass_h(tmp, dst, dstStride)
;   Horizontal pass over tmp.  With a = t[x]+t[x+5], b = t[x+1]+t[x+4],
;   c = t[x+2]+t[x+3] it evaluates
;       ((((a - b) >> 2) - b + c) >> 2) + c) >> 6
;   which equals (a - 5*b + 20*c) >> 10 up to truncation in the intermediate
;   shifts, keeping every step within 16 bits.
;-----------------------------------------------------------------------------
%macro QPEL4_HV1_LOWPASS_OP 1
cglobal %1_h264_qpel4_hv_lowpass_v, 3,3 ; src, tmp, srcStride
    movsxdifnidn  r2, r2d
    pxor          m7, m7
    movh          m0, [r0]
    movh          m1, [r0+r2]
    lea           r0, [r0+2*r2]
    movh          m2, [r0]
    movh          m3, [r0+r2]
    lea           r0, [r0+2*r2]
    movh          m4, [r0]
    add           r0, r2
    punpcklbw     m0, m7
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpcklbw     m3, m7
    punpcklbw     m4, m7
    FILT_HV       0*24
    FILT_HV       1*24
    FILT_HV       2*24
    FILT_HV       3*24
    RET

cglobal %1_h264_qpel4_hv_lowpass_h, 3,4 ; tmp, dst, dstStride
    movsxdifnidn  r2, r2d
    mov          r3d, 4
.loop:
    mova          m0, [r0]
    paddw         m0, [r0+10]         ; a = t[x]   + t[x+5]
    mova          m1, [r0+2]
    paddw         m1, [r0+8]          ; b = t[x+1] + t[x+4]
    mova          m2, [r0+4]
    paddw         m2, [r0+6]          ; c = t[x+2] + t[x+3]
    psubw         m0, m1
    psraw         m0, 2
    psubw         m0, m1
    paddsw        m0, m2
    psraw         m0, 2
    paddw         m0, m2
    psraw         m0, 6
    packuswb      m0, m0
    op_%1h        m0, [r1], m7
    add           r0, 24              ; next tmp row
    add           r1, r2
    dec          r3d
    jnz        .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL4_HV1_LOWPASS_OP put
QPEL4_HV1_LOWPASS_OP avg

;-----------------------------------------------------------------------------
; OP_h264_qpel8or16_hv1_lowpass_op(src, tmp, srcStride, size)
; Vertical first pass for the 8/16 2-D filter: one half-register-wide column
; strip, 8 rows, plus 8 more only when size == 16.  tmp row pitch is 48
; bytes.  src is used as given (no rewind by two rows here).
; Only the "put" flavour is instantiated; the pass writes tmp, never dst.
;-----------------------------------------------------------------------------
%macro QPEL8OR16_HV1_LOWPASS_OP 1
cglobal %1_h264_qpel8or16_hv1_lowpass_op, 4,4,8 ; src, tmp, srcStride, size
    movsxdifnidn  r2, r2d
    pxor          m7, m7
    movh          m0, [r0]
    movh          m1, [r0+r2]
    lea           r0, [r0+2*r2]
    movh          m2, [r0]
    movh          m3, [r0+r2]
    lea           r0, [r0+2*r2]
    movh          m4, [r0]
    add           r0, r2
    punpcklbw     m0, m7
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpcklbw     m3, m7
    punpcklbw     m4, m7
    FILT_HV     0*48
    FILT_HV     1*48
    FILT_HV     2*48
    FILT_HV     3*48
    FILT_HV     4*48
    FILT_HV     5*48
    FILT_HV     6*48
    FILT_HV     7*48
    cmp          r3d, 16
    jne         .end
    FILT_HV     8*48
    FILT_HV     9*48
    FILT_HV    10*48
    FILT_HV    11*48
    FILT_HV    12*48
    FILT_HV    13*48
    FILT_HV    14*48
    FILT_HV    15*48
.end:
    REP_RET
%endmacro

INIT_XMM sse2
QPEL8OR16_HV1_LOWPASS_OP put



;-----------------------------------------------------------------------------
; OP_h264_qpel8or16_hv2_lowpass_op(dst, tmp, dstStride, unused, h)
; Horizontal second pass, 8 output pixels per row, h rows (h must be > 0:
; the loop exits only when the counter reaches zero).  Reads word
; intermediates from tmp (row pitch 48 bytes) and applies the same shifted
; evaluation as qpel4_hv_lowpass_h; m0 carries outputs 0..3, m3 outputs 4..7.
;-----------------------------------------------------------------------------
%macro QPEL8OR16_HV2_LOWPASS_OP 1
; unused is to match ssse3 and mmxext args
cglobal %1_h264_qpel8or16_hv2_lowpass_op, 5,5 ; dst, tmp, dstStride, unused, h
    movsxdifnidn  r2, r2d
.loop:
    mova          m0, [r1]
    mova          m3, [r1+8]
    mova          m1, [r1+2]
    mova          m4, [r1+10]
    paddw         m0, m4              ; a (low):  t[x]   + t[x+5]
    paddw         m1, m3              ; b (low):  t[x+1] + t[x+4]
    paddw         m3, [r1+18]         ; a (high)
    paddw         m4, [r1+16]         ; b (high)
    mova          m2, [r1+4]
    mova          m5, [r1+12]
    paddw         m2, [r1+6]          ; c (low):  t[x+2] + t[x+3]
    paddw         m5, [r1+14]         ; c (high)
    psubw         m0, m1
    psubw         m3, m4
    psraw         m0, 2
    psraw         m3, 2
    psubw         m0, m1
    psubw         m3, m4
    paddsw        m0, m2
    paddsw        m3, m5
    psraw         m0, 2
    psraw         m3, 2
    paddw         m0, m2
    paddw         m3, m5
    psraw         m0, 6
    psraw         m3, 6
    packuswb      m0, m3
    op_%1         m0, [r0], m7
    add           r1, 48
    add           r0, r2
    dec          r4d
    jne        .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL8OR16_HV2_LOWPASS_OP put
QPEL8OR16_HV2_LOWPASS_OP avg

;-----------------------------------------------------------------------------
; OP_h264_qpel8or16_hv2_lowpass(dst, tmp, dstStride, tmpStride, size), XMM.
; Horizontal second pass over the word intermediates in tmp (row pitch is
; the constant 48; the tmpStride argument is sign-extended but not otherwise
; used).  size == 16 takes the .op16 path (16 outputs per row), anything
; else the .loop8 path (8 outputs per row).  size also serves as the row
; counter and must be > 0.  Shifted windows are built with palignr from
; aligned loads instead of re-reading tmp at odd offsets.
;-----------------------------------------------------------------------------
%macro QPEL8OR16_HV2_LOWPASS_OP_XMM 1
cglobal %1_h264_qpel8or16_hv2_lowpass, 5,5,8 ; dst, tmp, dstStride, tmpStride, size
    movsxdifnidn  r2, r2d
    movsxdifnidn  r3, r3d
    cmp          r4d, 16
    je         .op16
.loop8:
    mova          m1, [r1+16]
    mova          m0, [r1]
    mova          m2, m1
    mova          m3, m1
    mova          m4, m1
    mova          m5, m1
    palignr       m5, m0, 10          ; t[x+5]
    palignr       m4, m0, 8           ; t[x+4]
    palignr       m3, m0, 6           ; t[x+3]
    palignr       m2, m0, 4           ; t[x+2]
    palignr       m1, m0, 2           ; t[x+1]
    paddw         m0, m5              ; a
    paddw         m1, m4              ; b
    paddw         m2, m3              ; c
    psubw         m0, m1
    psraw         m0, 2
    psubw         m0, m1
    paddw         m0, m2
    psraw         m0, 2
    paddw         m0, m2
    psraw         m0, 6
    packuswb      m0, m0
    op_%1h        m0, [r0], m7
    add           r1, 48
    add           r0, r2
    dec          r4d
    jne       .loop8
    jmp        .done
.op16:
    mova          m4, [r1+32]         ; t[16..23]
    mova          m5, [r1+16]         ; t[8..15]
    mova          m7, [r1]            ; t[0..7]
    mova          m3, m4
    mova          m2, m4
    mova          m1, m4
    mova          m0, m4
    ; outputs 8..15 from the (m4:m5) pair
    palignr       m0, m5, 10
    palignr       m1, m5, 8
    palignr       m2, m5, 6
    palignr       m3, m5, 4
    palignr       m4, m5, 2
    paddw         m0, m5              ; a (high)
    paddw         m1, m4              ; b (high)
    paddw         m2, m3              ; c (high)
    ; outputs 0..7 from the (m5:m7) pair
    mova          m6, m5
    mova          m4, m5
    mova          m3, m5
    palignr       m4, m7, 8
    palignr       m6, m7, 2
    palignr       m3, m7, 10
    paddw         m4, m6              ; b (low)
    mova          m6, m5
    palignr       m5, m7, 6
    palignr       m6, m7, 4
    paddw         m3, m7              ; a (low)
    paddw         m5, m6              ; c (low)
    psubw         m0, m1
    psubw         m3, m4
    psraw         m0, 2
    psraw         m3, 2
    psubw         m0, m1
    psubw         m3, m4
    paddw         m0, m2
    paddw         m3, m5
    psraw         m0, 2
    psraw         m3, 2
    paddw         m0, m2
    paddw         m3, m5
    psraw         m0, 6
    psraw         m3, 6
    packuswb      m3, m0              ; low half = outputs 0..7
    op_%1         m3, [r0], m7
    add           r1, 48
    add           r0, r2
    dec          r4d
    jne        .op16
.done:
    REP_RET
%endmacro

INIT_XMM ssse3
QPEL8OR16_HV2_LOWPASS_OP_XMM put
QPEL8OR16_HV2_LOWPASS_OP_XMM avg


;-----------------------------------------------------------------------------
; OP_pixels4_l2_shift5(dst, src16, src8, dstStride, src8Stride, h)
; For each of 4 rows: take words from src16 (row pitch 24 bytes), shift right
; arithmetically by 5, pack to unsigned bytes, pavgb with src8, then put/avg
; to dst.  Fully unrolled for exactly 4 rows; h is accepted but not read.
;-----------------------------------------------------------------------------
%macro PIXELS4_L2_SHIFT5 1
cglobal %1_pixels4_l2_shift5,6,6 ; dst, src16, src8, dstStride, src8Stride, h
    movsxdifnidn  r3, r3d
    movsxdifnidn  r4, r4d
    mova          m0, [r1]
    mova          m1, [r1+24]
    psraw         m0, 5
    psraw         m1, 5
    packuswb      m0, m0
    packuswb      m1, m1
    pavgb         m0, [r2]
    pavgb         m1, [r2+r4]
    op_%1h        m0, [r0], m4
    op_%1h        m1, [r0+r3], m5
    lea           r2, [r2+r4*2]
    lea           r0, [r0+r3*2]
    mova          m0, [r1+48]
    mova          m1, [r1+72]
    psraw         m0, 5
    psraw         m1, 5
    packuswb      m0, m0
    packuswb      m1, m1
    pavgb         m0, [r2]
    pavgb         m1, [r2+r4]
    op_%1h        m0, [r0], m4
    op_%1h        m1, [r0+r3], m5
    RET
%endmacro

INIT_MMX mmxext
PIXELS4_L2_SHIFT5 put
PIXELS4_L2_SHIFT5 avg


;-----------------------------------------------------------------------------
; OP_pixels8_l2_shift5(dst, src16, src8, dstStride, src8Stride, h)
; 8-pixel-wide version of the kernel above; src16 row pitch is 48 bytes.
; Two rows are processed per iteration and the counter drops by 2, so h must
; be even and non-zero for the loop to terminate.
;-----------------------------------------------------------------------------
%macro PIXELS8_L2_SHIFT5 1
cglobal %1_pixels8_l2_shift5, 6, 6 ; dst, src16, src8, dstStride, src8Stride, h
    movsxdifnidn  r3, r3d
    movsxdifnidn  r4, r4d
.loop:
    mova          m0, [r1]
    mova          m1, [r1+8]
    mova          m2, [r1+48]
    mova          m3, [r1+48+8]
    psraw         m0, 5
    psraw         m1, 5
    psraw         m2, 5
    psraw         m3, 5
    packuswb      m0, m1
    packuswb      m2, m3
    pavgb         m0, [r2]
    pavgb         m2, [r2+r4]
    op_%1         m0, [r0], m4
    op_%1         m2, [r0+r3], m5
    lea           r2, [r2+2*r4]
    add           r1, 48*2
    lea           r0, [r0+2*r3]
    sub          r5d, 2
    jne        .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PIXELS8_L2_SHIFT5 put
PIXELS8_L2_SHIFT5 avg


;-----------------------------------------------------------------------------
; OP_h264_qpel16_h_lowpass_l2(dst, src, src2, dstStride, src2Stride)
; x86-64 only: the kernel uses registers m8..m15.
; 16 rows of 16 pixels.  Two unaligned loads (src+6 and src-2) are widened to
; words: m7 = s[-2..5], m0 = s[6..13], m1 = s[14..21].  Outputs 0..7 are
; filtered from the (m0:m7) pair and outputs 8..15 from the (m1:m0) pair via
; palignr, then packed together, averaged with 16 bytes of src2 and stored.
; The store goes through mova, so dst has to satisfy its alignment rule.
; NOTE: r3 advances BOTH dst and src each row; r4 advances only src2.
; m15 = 0, m14 = pw_5, m13 = pw_16, r5d = row counter
;-----------------------------------------------------------------------------
%if ARCH_X86_64
%macro QPEL16_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel16_h_lowpass_l2, 5, 6, 16 ; dst, src, src2, dstStride, src2Stride
    movsxdifnidn  r3, r3d
    movsxdifnidn  r4, r4d
    mov          r5d, 16
    pxor         m15, m15
    mova         m14, [pw_5]
    mova         m13, [pw_16]
.loop:
    lddqu         m1, [r1+6]
    lddqu         m7, [r1-2]
    mova          m0, m1
    punpckhbw     m1, m15
    punpcklbw     m0, m15
    punpcklbw     m7, m15
    mova          m2, m1
    mova          m6, m0
    mova          m3, m1
    mova          m8, m0
    mova          m4, m1
    mova          m9, m0
    mova         m12, m0
    mova         m11, m1
    palignr      m11, m0, 10          ; s[x+3], right half
    palignr      m12, m7, 10          ; s[x+3], left half
    palignr       m4, m0, 2           ; s[x-1], right
    palignr       m9, m7, 2           ; s[x-1], left
    palignr       m3, m0, 4           ; s[x],   right
    palignr       m8, m7, 4           ; s[x],   left
    palignr       m2, m0, 6           ; s[x+1], right
    palignr       m6, m7, 6           ; s[x+1], left
    paddw        m11, m0              ; s[x-2] + s[x+3], right
    palignr       m1, m0, 8           ; s[x+2], right
    palignr       m0, m7, 8           ; s[x+2], left
    paddw         m7, m12             ; s[x-2] + s[x+3], left
    paddw         m2, m3
    paddw         m6, m8
    paddw         m1, m4
    paddw         m0, m9
    psllw         m2, 2
    psllw         m6, 2
    psubw         m2, m1
    psubw         m6, m0
    paddw        m11, m13
    paddw         m7, m13
    pmullw        m2, m14
    pmullw        m6, m14
    lddqu         m3, [r2]            ; second source row
    paddw         m2, m11
    paddw         m6, m7
    psraw         m2, 5
    psraw         m6, 5
    packuswb      m6, m2              ; low half = outputs 0..7
    pavgb         m6, m3
    op_%1         m6, [r0], m11
    add           r1, r3
    add           r0, r3
    add           r2, r4
    dec          r5d
    jg         .loop
    REP_RET
%endmacro

INIT_XMM ssse3
QPEL16_H_LOWPASS_L2_OP put
QPEL16_H_LOWPASS_L2_OP avg
%endif