;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 qpel code
;*****************************************************************************
;* Copyright (C) 2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

; Constants defined in another object file.
cextern pd_65535
cextern pw_1023
%define pw_pixel_max pw_1023            ; clip ceiling used by every CLIPW below
cextern pw_16
cextern pw_1
cextern pb_0

; Bias/rounding constants. In the part of the file reviewed here only pad20
; (subtracted from the unscaled vertical sums in put_hv*) and depad (added
; back before the final >>10 in h*_loop_op) are referenced.
pad10:  times 8 dw 10*1023
pad20:  times 8 dw 20*1023
pad30:  times 8 dw 30*1023
depad:  times 4 dd 32*20*1023 + 512
depad2: times 8 dw 20*1023 + 16*1022 + 16
unpad:  times 8 dw 16*1022/32 ; needs to be mod 16

; pmaddwd coefficient pairs for the 6-tap filter (1, -5, 20, 20, -5, 1).
tap1:   times 4 dw  1, -5
tap2:   times 4 dw 20, 20
tap3:   times 4 dw -5,  1

SECTION .text


;-----------------------------------------------------------------------------
; AVG_MOV dst, src
;   "avg" flavour of OP_MOV: src = pavgw(src, dst), then dst = src.
;   %1 = destination (memory), %2 = register holding the new value.
;   Clobbers %2.
;-----------------------------------------------------------------------------
%macro AVG_MOV 2
    pavgw     %2, %1
    mova      %1, %2
%endmacro

;-----------------------------------------------------------------------------
; ADDW acc, mem, tmp
;   acc += words at mem.
;   With 8-byte registers the memory operand is used directly; otherwise the
;   data is first fetched with an unaligned load into tmp, because mem is not
;   guaranteed to be aligned to the register size.
;   Clobbers %3 when mmsize != 8.
;-----------------------------------------------------------------------------
%macro ADDW 3
%if mmsize == 8
    paddw     %1, %2
%else
    movu      %3, %2
    paddw     %1, %3
%endif
%endmacro

;-----------------------------------------------------------------------------
; FILT_H a, b, c, round
;   In-place 6-tap combine using only adds and arithmetic shifts.
;   %1 = a (outer tap pair sum, overwritten with the result)
;   %2 = b (middle tap pair sum)
;   %3 = c (centre tap pair sum)
;   %4 = rounding term added to a before filtering
;   Result in %1 is (a - 5*b + 20*c)/16 as traced in the comments below;
;   callers apply the remaining >>1 and the clip themselves.
;-----------------------------------------------------------------------------
%macro FILT_H 4
    paddw     %1, %4
    psubw     %1, %2  ; a-b
    psraw     %1, 2   ; (a-b)/4
    psubw     %1, %2  ; (a-b)/4-b
    paddw     %1, %3  ; (a-b)/4-b+c
    psraw     %1, 2   ; ((a-b)/4-b+c)/4
    paddw     %1, %3  ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
%endmacro

;-----------------------------------------------------------------------------
; PRELOAD_V
;   Loads the five rows at src-2*stride .. src+2*stride into m0..m4 and
;   leaves r1 pointing at src+3*stride, the next row FILT_V will fetch.
;   In:    r1 = src, r2 = stride
;   Clobb: r3 (= 3*stride)
;-----------------------------------------------------------------------------
%macro PRELOAD_V 0
    lea       r3, [r2*3]
    sub       r1, r3
    movu      m0, [r1+r2]
    movu      m1, [r1+r2*2]
    add       r1, r3
    movu      m2, [r1]
    movu      m3, [r1+r2]
    movu      m4, [r1+r2*2]
    add       r1, r3
%endmacro

;-----------------------------------------------------------------------------
; FILT_V r0, r1, r2, r3, r4, new, tmp1, tmp2
;   One output row of the vertical 6-tap filter.
;   %1..%5 = the five rows already in registers (oldest first)
;   %6     = receives the sixth row, loaded from [r1]
;   %7, %8 = scratch
;   Result: %1 = clip((filtered + 16) >> 5, 0, pw_pixel_max)
;   r1 is NOT advanced here; the caller does that.
;-----------------------------------------------------------------------------
%macro FILT_V 8
    movu      %6, [r1]
    paddw     %1, %6            ; a = row[-2] + row[3]
    mova      %7, %2
    paddw     %7, %5            ; b = row[-1] + row[2]
    mova      %8, %3
    paddw     %8, %4            ; c = row[0]  + row[1]
    FILT_H    %1, %7, %8, [pw_16]
    psraw     %1, 1
    CLIPW     %1, [pb_0], [pw_pixel_max]
%endmacro

;-----------------------------------------------------------------------------
; MC macro_name
;   Instantiates macro_name four times: {put, avg} x {mmxext width 4,
;   sse2 width 8}. OP_MOV is set to a plain store for "put" and to AVG_MOV
;   for "avg" before each pair of expansions.
;-----------------------------------------------------------------------------
%macro MC 1
%define OP_MOV mova
INIT_MMX mmxext
%1 put, 4
INIT_XMM sse2
%1 put, 8

%define OP_MOV AVG_MOV
INIT_MMX mmxext
%1 avg, 4
INIT_XMM sse2
%1 avg, 8
%endmacro
;-----------------------------------------------------------------------------
; MCAxA_OP put/avg, mcXY, N, 2N, nargs, ngprs, nxmm
;   Emits the 2Nx2N entry point by running the NxN stub on the four
;   quadrants: (0,0), (N,0), (0,N), (N,N).
;   %1 = put/avg, %2 = mcXY name, %3 = N, %4 = 2N, %5..%7 = cglobal arguments.
;   Horizontal step is %3*2 bytes (N samples of one word each); vertical step
;   is r2*%3.
;
;   x86-32: dst/src are re-read from their stack slots (r0m/r1m) before each
;           quadrant.
;   x86-64: dst/src are parked in r<%6> and r<%6+1>, hence the "+ 2" on the
;           GPR count.
;   UNIX64: the last quadrant is not called. Execution falls through into the
;           code emitted right after this macro, so this macro must stay
;           immediately in front of the matching NxN function/stub
;           (see cglobal_mc).
;-----------------------------------------------------------------------------
%macro MCAxA_OP 7
%if ARCH_X86_32
cglobal %1_h264_qpel%4_%2_10, %5,%6,%7
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    mov       r0, r0m
    mov       r1, r1m
    add       r0, %3*2
    add       r1, %3*2
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    mov       r0, r0m
    mov       r1, r1m
    lea       r0, [r0+r2*%3]
    lea       r1, [r1+r2*%3]
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    mov       r0, r0m
    mov       r1, r1m
    lea       r0, [r0+r2*%3+%3*2]
    lea       r1, [r1+r2*%3+%3*2]
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    RET
%else ; ARCH_X86_64
cglobal %1_h264_qpel%4_%2_10, %5,%6 + 2,%7
    mov       r%6, r0                   ; keep the original dst
%assign p1 %6+1
    mov       r %+ p1, r1               ; keep the original src
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    lea       r0, [r%6+%3*2]
    lea       r1, [r %+ p1+%3*2]
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    lea       r0, [r%6+r2*%3]
    lea       r1, [r %+ p1+r2*%3]
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    lea       r0, [r%6+r2*%3+%3*2]
    lea       r1, [r %+ p1+r2*%3+%3*2]
%if UNIX64 == 0 ; fall through to function
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    RET
%endif
%endif
%endmacro

;-----------------------------------------------------------------------------
; cglobal_mc put/avg, mcXY, size, nargs, ngprs, nxmm
;   Declares <put/avg>_h264_qpel<size>_<mcXY>_10 and opens its worker stub
;   (stub_..._10 + SUFFIX); the code following the macro invocation is the
;   stub body.
;   * For SSE2 builds the 2*size x 2*size wrapper is emitted first through
;     MCAxA_OP.
;   * Except on UNIX64 the exported function is "call stub / RET". On UNIX64
;     nothing is emitted between the function label and the stub label, so
;     the function simply runs into the stub. Stubs therefore return with a
;     bare ret, never with RET.
;   Side effect: reassigns the assembly-time variable i.
;-----------------------------------------------------------------------------
%macro cglobal_mc 6
%assign i %3*2
%if cpuflag(sse2)
MCAxA_OP %1, %2, %3, i, %4,%5,%6
%endif

cglobal %1_h264_qpel%3_%2_10, %4,%5,%6
%if UNIX64 == 0 ; no prologue or epilogue for UNIX64
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    RET
%endif

stub_%1_h264_qpel%3_%2_10 %+ SUFFIX:
%endmacro

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc00(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; COPY4: moves four rows of one register width from r1 to r0 through OP_MOV.
; In: r0 = dst, r1 = src, r2 = stride, r3 = 3*stride. Clobbers m0.
%macro COPY4 0
    movu          m0, [r1     ]
    OP_MOV [r0     ], m0
    movu          m0, [r1+r2  ]
    OP_MOV [r0+r2  ], m0
    movu          m0, [r1+r2*2]
    OP_MOV [r0+r2*2], m0
    movu          m0, [r1+r3  ]
    OP_MOV [r0+r3  ], m0
%endmacro

; MC00 put/avg: full-pel copy (or average) for widths 4 (mmxext), 8 and 16
; (sse2). OP_MOV must be defined before the macro is expanded.
%macro MC00 1
INIT_MMX mmxext
cglobal_mc %1, mc00, 4, 3,4,0
    lea           r3, [r2*3]
    COPY4
    ret

INIT_XMM sse2
cglobal %1_h264_qpel8_mc00_10, 3,4
    lea           r3, [r2*3]
    COPY4
    lea           r0, [r0+r2*4]
    lea           r1, [r1+r2*4]
    COPY4
    RET

cglobal %1_h264_qpel16_mc00_10, 3,4
    mov          r3d, 8                 ; 8 iterations x 2 rows each
.loop:
    movu           m0, [r1      ]
    movu           m1, [r1   +16]
    OP_MOV [r0      ], m0
    OP_MOV [r0   +16], m1
    movu           m0, [r1+r2   ]
    movu           m1, [r1+r2+16]
    OP_MOV [r0+r2   ], m0
    OP_MOV [r0+r2+16], m1
    lea            r0, [r0+r2*2]
    lea            r1, [r1+r2*2]
    dec           r3d
    jg .loop
    REP_RET
%endmacro

%define OP_MOV mova
MC00 put

%define OP_MOV AVG_MOV
MC00 avg

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc20(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; MC_CACHE macro_name
;   Like MC, but the width-8 version is built three times: sse2+cache64,
;   ssse3+cache64 and plain sse2.
%macro MC_CACHE 1
%define OP_MOV mova
INIT_MMX mmxext
%1 put, 4
INIT_XMM sse2, cache64
%1 put, 8
INIT_XMM ssse3, cache64
%1 put, 8
INIT_XMM sse2
%1 put, 8

%define OP_MOV AVG_MOV
INIT_MMX mmxext
%1 avg, 4
INIT_XMM sse2, cache64
%1 avg, 8
INIT_XMM ssse3, cache64
%1 avg, 8
INIT_XMM sse2
%1 avg, 8
%endmacro

; MC20 put/avg, width: horizontal half-pel.
;   Stub registers: r0 = dst, r1 = src, r2 = stride, r3d = rows left,
;   m1 = pixel max, m0 = zero (set per row), p16 = rounding constant.
;   Per row: a = p[-2]+p[3], b = p[-1]+p[2], c = p[0]+p[1] (byte offsets
;   -4/+6, -2/+4, 0/+2), then FILT_H, >>1, clip, OP_MOV.
%macro MC20 2
cglobal_mc %1, mc20, %2, 3,4,9
    mov      r3d, %2
    mova      m1, [pw_pixel_max]
%if num_mmregs > 8
    mova      m8, [pw_16]
    %define p16 m8
%else
    %define p16 [pw_16]
%endif
.nextrow:
; Plain SSE2 build (no cache64 flag): fetch every tap with its own unaligned
; load. This branch used to be guarded by "%0 == 4", which can never hold for
; a two-parameter macro, so it was dead and the plain build was identical to
; the cache64 one. Both branches compute the same a/b/c sums.
%if mmsize == 16 && notcpuflag(cache64)
    movu      m2, [r1-4]
    movu      m3, [r1-2]
    movu      m4, [r1+0]
    ADDW      m2, [r1+6], m5
    ADDW      m3, [r1+4], m5
    ADDW      m4, [r1+2], m5
%else ; movu is slow on these processors
%if mmsize==16
    movu      m2, [r1-4]
    movu      m0, [r1+6]
    mova      m6, m0
    psrldq    m0, 6                     ; m0 = samples following m2's window

    paddw     m6, m2                    ; a
    PALIGNR   m3, m0, m2, 2, m5
    PALIGNR   m7, m0, m2, 8, m5
    paddw     m3, m7                    ; b
    PALIGNR   m4, m0, m2, 4, m5
    PALIGNR   m7, m0, m2, 6, m5
    paddw     m4, m7                    ; c
    SWAP      2, 6
%else
    movu      m2, [r1-4]
    movu      m6, [r1+4]
    PALIGNR   m3, m6, m2, 2, m5
    paddw     m3, m6                    ; b
    PALIGNR   m4, m6, m2, 4, m5
    PALIGNR   m7, m6, m2, 6, m5
    paddw     m4, m7                    ; c
    paddw     m2, [r1+6]                ; a
%endif
%endif

    FILT_H    m2, m3, m4, p16
    psraw     m2, 1
    pxor      m0, m0
    CLIPW     m2, m0, m1
    OP_MOV  [r0], m2
    add       r0, r2
    add       r1, r2
    dec      r3d
    jg .nextrow
    rep ret
%endmacro

MC_CACHE MC20

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc30(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; MC30: same as MC10 but averaged with the sample one position to the right
; (r4 = src + 2 bytes); tail-jumps into the mc10 stub body.
%macro MC30 2
cglobal_mc %1, mc30, %2, 3,5,9
    lea       r4, [r1+2]
    jmp stub_%1_h264_qpel%2_mc10_10 %+ SUFFIX %+ .body
%endmacro

MC_CACHE MC30

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc10(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; MC10 put/avg, width: horizontal quarter-pel = pavgw(horizontal half-pel,
; full-pel row at r4). r4 = src here; MC30 enters at .body with its own r4.
; Register use otherwise as in MC20.
%macro MC10 2
cglobal_mc %1, mc10, %2, 3,5,9
    mov       r4, r1
.body:
    mov      r3d, %2
    mova      m1, [pw_pixel_max]
%if num_mmregs > 8
    mova      m8, [pw_16]
    %define p16 m8
%else
    %define p16 [pw_16]
%endif
.nextrow:
; See MC20: the direct-load path was unreachable behind "%0 == 4"; it is now
; selected for the plain SSE2 build only.
%if mmsize == 16 && notcpuflag(cache64)
    movu      m2, [r1-4]
    movu      m3, [r1-2]
    movu      m4, [r1+0]
    ADDW      m2, [r1+6], m5
    ADDW      m3, [r1+4], m5
    ADDW      m4, [r1+2], m5
%else ; movu is slow on these processors
%if mmsize==16
    movu      m2, [r1-4]
    movu      m0, [r1+6]
    mova      m6, m0
    psrldq    m0, 6

    paddw     m6, m2                    ; a
    PALIGNR   m3, m0, m2, 2, m5
    PALIGNR   m7, m0, m2, 8, m5
    paddw     m3, m7                    ; b
    PALIGNR   m4, m0, m2, 4, m5
    PALIGNR   m7, m0, m2, 6, m5
    paddw     m4, m7                    ; c
    SWAP      2, 6
%else
    movu      m2, [r1-4]
    movu      m6, [r1+4]
    PALIGNR   m3, m6, m2, 2, m5
    paddw     m3, m6                    ; b
    PALIGNR   m4, m6, m2, 4, m5
    PALIGNR   m7, m6, m2, 6, m5
    paddw     m4, m7                    ; c
    paddw     m2, [r1+6]                ; a
%endif
%endif

    FILT_H    m2, m3, m4, p16
    psraw     m2, 1
    pxor      m0, m0
    CLIPW     m2, m0, m1
    movu      m3, [r4]
    pavgw     m2, m3                    ; average with the full-pel sample
    OP_MOV  [r0], m2
    add       r0, r2
    add       r1, r2
    add       r4, r2
    dec      r3d
    jg .nextrow
    rep ret
%endmacro

MC_CACHE MC10

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; V_FILT ..., width, index
;   Emits v_filt<width>_<index>_10: one FILT_V step followed by r1 += stride
;   and r0 += stride. Entering at the main label also advances r4; the
;   .no_addr4 entry skips that. Only %9 and %10 are used (for the label name);
;   the register roles come from the mm permutation that is active where the
;   macro is expanded.
%macro V_FILT 10
v_filt%9_%10_10:
    add       r4, r2
.no_addr4:
    FILT_V    m0, m1, m2, m3, m4, m5, m6, m7
    add       r1, r2
    add       r0, r2
    ret
%endmacro

; One helper per rotation state of m0..m5 that the callers can be in.
INIT_MMX mmxext
RESET_MM_PERMUTATION
%assign i 0
%rep 4
V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 4, i
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep

INIT_XMM sse2
RESET_MM_PERMUTATION
%assign i 0
%rep 6
V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 8, i
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep

; MC02 put/avg, width: vertical half-pel. r0 starts one row early because
; the helper advances it before the store.
%macro MC02 2
cglobal_mc %1, mc02, %2, 3,4,8
    PRELOAD_V

    sub       r0, r2
%assign j 0
%rep %2
    %assign i (j % 6)
    call v_filt%2_ %+ i %+ _10.no_addr4
    OP_MOV  [r0], m0
    SWAP 0,1,2,3,4,5
    %assign j j+1
%endrep
    ret
%endmacro

MC MC02

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc01(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; MC01 put/avg, width: vertical quarter-pel = pavgw(vertical half-pel,
; full-pel row at r4). r4 and r0 start one row early because the helper
; advances both before they are used. MC03 enters at .body with its own r4.
%macro MC01 2
cglobal_mc %1, mc01, %2, 3,5,8
    mov       r4, r1
.body:
    PRELOAD_V

    sub       r4, r2
    sub       r0, r2
%assign j 0
%rep %2
    %assign i (j % 6)
    call v_filt%2_ %+ i %+ _10
    movu      m7, [r4]
    pavgw     m0, m7
    OP_MOV  [r0], m0
    SWAP 0,1,2,3,4,5
    %assign j j+1
%endrep
    ret
%endmacro

MC MC01

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc03(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; MC03: same as MC01 but averaged with the row below (r4 = src + stride);
; tail-jumps into the mc01 stub body.
%macro MC03 2
cglobal_mc %1, mc03, %2, 3,5,8
    lea       r4, [r1+r2]
    jmp stub_%1_h264_qpel%2_mc01_10 %+ SUFFIX %+ .body
%endmacro

MC MC03

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc11(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; H_FILT_AVG width, index [, noreload]
;   Emits h_filt<width>_<index>_10.
;   In:  r4 = row for the horizontal filter, m0 = FILT_V result,
;        r1/r5 such that [r1+r5] is the row FILT_V loaded last.
;   Out: m0 = pavgw(m0, horizontal half-pel of the row at r4).
;   m5 is used as scratch and re-read from [r1+r5] before returning, unless a
;   third argument is given; in that case the caller must reload m5 itself if
;   it still needs it.
%macro H_FILT_AVG 2-3
h_filt%1_%2_10:
;FILT_H with fewer registers and averaged with the FILT_V result
;m6,m7 are tmp registers, m0 is the FILT_V result, the rest are needed for the next iteration
;unfortunately I need three registers, so m5 will have to be re-read from memory
    movu      m5, [r4-4]
    ADDW      m5, [r4+6], m7
    movu      m6, [r4-2]
    ADDW      m6, [r4+4], m7
    paddw     m5, [pw_16]
    psubw     m5, m6  ; a-b
    psraw     m5, 2   ; (a-b)/4
    psubw     m5, m6  ; (a-b)/4-b
    movu      m6, [r4+0]
    ADDW      m6, [r4+2], m7
    paddw     m5, m6  ; (a-b)/4-b+c
    psraw     m5, 2   ; ((a-b)/4-b+c)/4
    paddw     m5, m6  ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
    psraw     m5, 1
    CLIPW     m5, [pb_0], [pw_pixel_max]
;avg FILT_V, FILT_H
    pavgw     m0, m5
; The macro takes 2-3 parameters, so the previous test "%0!=4" was always
; true and the reload could never be skipped. Output is unchanged: MC11
; reloads m5 itself for width 8 / index 1, and the width-4 variant without
; reload is only used for the last row.
%if %0 != 3
    movu      m5, [r1+r5]
%endif
    ret
%endmacro

INIT_MMX mmxext
RESET_MM_PERMUTATION
%assign i 0
%rep 3
H_FILT_AVG 4, i
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep
H_FILT_AVG 4, i, 0

INIT_XMM sse2
RESET_MM_PERMUTATION
%assign i 0
%rep 6
%if i==1
H_FILT_AVG 8, i, 0
%else
H_FILT_AVG 8, i
%endif
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep

; MC11 put/avg, width: diagonal quarter-pel = pavgw(vertical half-pel,
; horizontal half-pel). r1 walks the rows for the vertical filter, r4 the rows
; for the horizontal one, r5 = -stride. MC31/MC13/MC33 enter at .body with
; different r1/r4.
%macro MC11 2
; this REALLY needs x86_64
cglobal_mc %1, mc11, %2, 3,6,8
    mov       r4, r1
.body:
    PRELOAD_V

    sub       r0, r2
    sub       r4, r2
    mov       r5, r2
    neg       r5
%assign j 0
%rep %2
    %assign i (j % 6)
    call v_filt%2_ %+ i %+ _10
    call h_filt%2_ %+ i %+ _10
%if %2==8 && i==1
    movu      m5, [r1+r5]               ; h_filt8_1 does not reload m5
%endif
    OP_MOV  [r0], m0
    SWAP 0,1,2,3,4,5
    %assign j j+1
%endrep
    ret
%endmacro

MC MC11

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc31(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; MC31: horizontal filter on the original rows (r4 = src), vertical filter on
; the column one sample to the right (r1 = src + 2 bytes).
%macro MC31 2
cglobal_mc %1, mc31, %2, 3,6,8
    mov       r4, r1
    add       r1, 2
    jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
%endmacro

MC MC31

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc13(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; MC13: horizontal filter on the row below (r4 = src + stride).
; The body it jumps into only uses r0-r5 and m0-m7, so the register counts
; match MC11/MC31/MC33 (previously 3,7,12, which only made the entry point
; reserve registers that are never touched).
%macro MC13 2
cglobal_mc %1, mc13, %2, 3,6,8
    lea       r4, [r1+r2]
    jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
%endmacro

MC MC13

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc33(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; MC33: horizontal filter on the row below, vertical filter one sample right.
%macro MC33 2
cglobal_mc %1, mc33, %2, 3,6,8
    lea       r4, [r1+r2]
    add       r1, 2
    jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
%endmacro

MC MC33

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc22(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; FILT_H2 a, b, c: %1 = a - 5*b + 20*c with no rounding or scaling.
; Clobbers %2 and %3.
%macro FILT_H2 3
    psubw     %1, %2  ; a-b
    psubw     %2, %3  ; b-c
    psllw     %2, 2
    psubw     %1, %2  ; a-5*b+4*c
    psllw     %3, 4
    paddw     %1, %3  ; a-5*b+20*c
%endmacro

; FILT_VNRD: same operand roles as FILT_V, but the result in %1 is the raw
; 6-tap sum (FILT_H2) without rounding, shift or clip.
%macro FILT_VNRD 8
    movu      %6, [r1]
    paddw     %1, %6
    mova      %7, %2
    paddw     %7, %5
    mova      %8, %3
    paddw     %8, %4
    FILT_H2   %1, %7, %8
%endmacro

; HV height
;   Emits put_hv<height>_10: first (vertical) pass of the 2-D filter.
;   In:    r1 = src, r2 = stride, rsp = caller's aligned scratch buffer
;          (the return address sits on top of it, hence "+gprsize").
;   Out:   the buffer holds <height> rows of raw vertical sums minus pad20,
;          written as COUNT column strips of mmsize bytes; rows are
;          mmsize*3 bytes apart.
;   Clobb: r1, r3, r4, m0-m7. r2 is negated while running and restored
;          before returning.
;   Note:  PAD and COUNT are (re)defined globally by this macro.
%macro HV 1
%if mmsize==16
%define PAD 12
%define COUNT 2
%else
%define PAD 4
%define COUNT 3
%endif
put_hv%1_10:
    neg       r2           ; This actually saves instructions
    lea       r1, [r1+r2*2-mmsize+PAD]
    lea       r4, [rsp+PAD+gprsize]
    mov      r3d, COUNT
.v_loop:
    movu      m0, [r1]
    sub       r1, r2
    movu      m1, [r1]
    sub       r1, r2
    movu      m2, [r1]
    sub       r1, r2
    movu      m3, [r1]
    sub       r1, r2
    movu      m4, [r1]
    sub       r1, r2
%assign i 0
%rep %1-1
    FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
    psubw     m0, [pad20]
    movu      [r4+i*mmsize*3], m0
    sub       r1, r2
    SWAP 0,1,2,3,4,5
%assign i i+1
%endrep
    FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
    psubw     m0, [pad20]
    movu      [r4+i*mmsize*3], m0
    add       r4, mmsize
    ; Rewind r1 by the 4+height rows stepped above and move to the next strip.
    lea       r1, [r1+r2*8+mmsize]
%if %1==8
    lea       r1, [r1+r2*4]
%endif
    dec      r3d
    jg .v_loop
    neg       r2
    ret
%endmacro

INIT_MMX mmxext
HV 4
INIT_XMM sse2
HV 8

; H_LOOP width
;   Emits h<width>_loop_op: second (horizontal) pass for one buffer row.
;   In:    r1 = current row of the put_hv buffer, m7 = pixel max.
;          With more than 8 vector registers the caller also provides
;          m0 = 0, m8..m10 = tap1..tap3 and m11 = depad.
;   Out:   m1 = clipped output row; r1 advanced by mmsize*3.
;   Clobb: m0-m6.
;   Even and odd output samples are computed separately as dwords
;   (pmaddwd with the tap pairs), biased by depad, shifted right by 10 and
;   interleaved back into words.
%macro H_LOOP 1
%if num_mmregs > 8
    %define s1 m8
    %define s2 m9
    %define s3 m10
    %define d1 m11
%else
    %define s1 [tap1]
    %define s2 [tap2]
    %define s3 [tap3]
    %define d1 [depad]
%endif
h%1_loop_op:
    movu      m1, [r1+mmsize-4]
    movu      m2, [r1+mmsize-2]
    mova      m3, [r1+mmsize+0]
    movu      m4, [r1+mmsize+2]
    movu      m5, [r1+mmsize+4]
    movu      m6, [r1+mmsize+6]
%if num_mmregs > 8
    pmaddwd   m1, s1
    pmaddwd   m2, s1
    pmaddwd   m3, s2
    pmaddwd   m4, s2
    pmaddwd   m5, s3
    pmaddwd   m6, s3
    paddd     m1, d1
    paddd     m2, d1
%else
    mova      m0, s1
    pmaddwd   m1, m0
    pmaddwd   m2, m0
    mova      m0, s2
    pmaddwd   m3, m0
    pmaddwd   m4, m0
    mova      m0, s3
    pmaddwd   m5, m0
    pmaddwd   m6, m0
    mova      m0, d1
    paddd     m1, m0
    paddd     m2, m0
%endif
    paddd     m3, m5
    paddd     m4, m6
    paddd     m1, m3                    ; even samples
    paddd     m2, m4                    ; odd samples
    psrad     m1, 10
    psrad     m2, 10
    pslld     m2, 16
    pand      m1, [pd_65535]
    por       m1, m2                    ; interleave even/odd words
%if num_mmregs <= 8
    pxor      m0, m0                    ; m0 was used as scratch above
%endif
    CLIPW     m1, m0, m7
    add       r1, mmsize*3
    ret
%endmacro

INIT_MMX mmxext
H_LOOP 4
INIT_XMM sse2
724cabdff1aSopenharmony_ciH_LOOP 8 725cabdff1aSopenharmony_ci 726cabdff1aSopenharmony_ci%macro MC22 2 727cabdff1aSopenharmony_cicglobal_mc %1, mc22, %2, 3,7,12 728cabdff1aSopenharmony_ci%define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel) 729cabdff1aSopenharmony_ci mov r6, rsp ; backup stack pointer 730cabdff1aSopenharmony_ci and rsp, ~(mmsize-1) ; align stack 731cabdff1aSopenharmony_ci sub rsp, PAD 732cabdff1aSopenharmony_ci 733cabdff1aSopenharmony_ci call put_hv%2_10 734cabdff1aSopenharmony_ci 735cabdff1aSopenharmony_ci mov r3d, %2 736cabdff1aSopenharmony_ci mova m7, [pw_pixel_max] 737cabdff1aSopenharmony_ci%if num_mmregs > 8 738cabdff1aSopenharmony_ci pxor m0, m0 739cabdff1aSopenharmony_ci mova m8, [tap1] 740cabdff1aSopenharmony_ci mova m9, [tap2] 741cabdff1aSopenharmony_ci mova m10, [tap3] 742cabdff1aSopenharmony_ci mova m11, [depad] 743cabdff1aSopenharmony_ci%endif 744cabdff1aSopenharmony_ci mov r1, rsp 745cabdff1aSopenharmony_ci.h_loop: 746cabdff1aSopenharmony_ci call h%2_loop_op 747cabdff1aSopenharmony_ci 748cabdff1aSopenharmony_ci OP_MOV [r0], m1 749cabdff1aSopenharmony_ci add r0, r2 750cabdff1aSopenharmony_ci dec r3d 751cabdff1aSopenharmony_ci jg .h_loop 752cabdff1aSopenharmony_ci 753cabdff1aSopenharmony_ci mov rsp, r6 ; restore stack pointer 754cabdff1aSopenharmony_ci ret 755cabdff1aSopenharmony_ci%endmacro 756cabdff1aSopenharmony_ci 757cabdff1aSopenharmony_ciMC MC22 758cabdff1aSopenharmony_ci 759cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 760cabdff1aSopenharmony_ci; void ff_h264_qpel_mc12(uint8_t *dst, uint8_t *src, int stride) 761cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 762cabdff1aSopenharmony_ci%macro MC12 2 763cabdff1aSopenharmony_cicglobal_mc %1, mc12, %2, 3,7,12 764cabdff1aSopenharmony_ci%define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel) 765cabdff1aSopenharmony_ci mov r6, rsp ; backup stack pointer 766cabdff1aSopenharmony_ci and rsp, 
~(mmsize-1) ; align stack 767cabdff1aSopenharmony_ci sub rsp, PAD 768cabdff1aSopenharmony_ci 769cabdff1aSopenharmony_ci call put_hv%2_10 770cabdff1aSopenharmony_ci 771cabdff1aSopenharmony_ci xor r4d, r4d 772cabdff1aSopenharmony_ci.body: 773cabdff1aSopenharmony_ci mov r3d, %2 774cabdff1aSopenharmony_ci pxor m0, m0 775cabdff1aSopenharmony_ci mova m7, [pw_pixel_max] 776cabdff1aSopenharmony_ci%if num_mmregs > 8 777cabdff1aSopenharmony_ci mova m8, [tap1] 778cabdff1aSopenharmony_ci mova m9, [tap2] 779cabdff1aSopenharmony_ci mova m10, [tap3] 780cabdff1aSopenharmony_ci mova m11, [depad] 781cabdff1aSopenharmony_ci%endif 782cabdff1aSopenharmony_ci mov r1, rsp 783cabdff1aSopenharmony_ci.h_loop: 784cabdff1aSopenharmony_ci call h%2_loop_op 785cabdff1aSopenharmony_ci 786cabdff1aSopenharmony_ci movu m3, [r1+r4-2*mmsize] ; movu needed for mc32, etc 787cabdff1aSopenharmony_ci paddw m3, [depad2] 788cabdff1aSopenharmony_ci psrlw m3, 5 789cabdff1aSopenharmony_ci psubw m3, [unpad] 790cabdff1aSopenharmony_ci CLIPW m3, m0, m7 791cabdff1aSopenharmony_ci pavgw m1, m3 792cabdff1aSopenharmony_ci 793cabdff1aSopenharmony_ci OP_MOV [r0], m1 794cabdff1aSopenharmony_ci add r0, r2 795cabdff1aSopenharmony_ci dec r3d 796cabdff1aSopenharmony_ci jg .h_loop 797cabdff1aSopenharmony_ci 798cabdff1aSopenharmony_ci mov rsp, r6 ; restore stack pointer 799cabdff1aSopenharmony_ci ret 800cabdff1aSopenharmony_ci%endmacro 801cabdff1aSopenharmony_ci 802cabdff1aSopenharmony_ciMC MC12 803cabdff1aSopenharmony_ci 804cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 805cabdff1aSopenharmony_ci; void ff_h264_qpel_mc32(uint8_t *dst, uint8_t *src, int stride) 806cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 807cabdff1aSopenharmony_ci%macro MC32 2 808cabdff1aSopenharmony_cicglobal_mc %1, mc32, %2, 3,7,12 809cabdff1aSopenharmony_ci%define PAD mmsize*8*3*2 ; SIZE*16*4*sizeof(pixel) 810cabdff1aSopenharmony_ci mov r6, rsp 
; backup stack pointer 811cabdff1aSopenharmony_ci and rsp, ~(mmsize-1) ; align stack 812cabdff1aSopenharmony_ci sub rsp, PAD 813cabdff1aSopenharmony_ci 814cabdff1aSopenharmony_ci call put_hv%2_10 815cabdff1aSopenharmony_ci 816cabdff1aSopenharmony_ci mov r4d, 2 ; sizeof(pixel) 817cabdff1aSopenharmony_ci jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body 818cabdff1aSopenharmony_ci%endmacro 819cabdff1aSopenharmony_ci 820cabdff1aSopenharmony_ciMC MC32 821cabdff1aSopenharmony_ci 822cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 823cabdff1aSopenharmony_ci; void ff_h264_qpel_mc21(uint8_t *dst, uint8_t *src, int stride) 824cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 825cabdff1aSopenharmony_ci%macro H_NRD 1 826cabdff1aSopenharmony_ciput_h%1_10: 827cabdff1aSopenharmony_ci add rsp, gprsize 828cabdff1aSopenharmony_ci mov r3d, %1 829cabdff1aSopenharmony_ci xor r4d, r4d 830cabdff1aSopenharmony_ci mova m6, [pad20] 831cabdff1aSopenharmony_ci.nextrow: 832cabdff1aSopenharmony_ci movu m2, [r5-4] 833cabdff1aSopenharmony_ci movu m3, [r5-2] 834cabdff1aSopenharmony_ci movu m4, [r5+0] 835cabdff1aSopenharmony_ci ADDW m2, [r5+6], m5 836cabdff1aSopenharmony_ci ADDW m3, [r5+4], m5 837cabdff1aSopenharmony_ci ADDW m4, [r5+2], m5 838cabdff1aSopenharmony_ci 839cabdff1aSopenharmony_ci FILT_H2 m2, m3, m4 840cabdff1aSopenharmony_ci psubw m2, m6 841cabdff1aSopenharmony_ci mova [rsp+r4], m2 842cabdff1aSopenharmony_ci add r4d, mmsize*3 843cabdff1aSopenharmony_ci add r5, r2 844cabdff1aSopenharmony_ci dec r3d 845cabdff1aSopenharmony_ci jg .nextrow 846cabdff1aSopenharmony_ci sub rsp, gprsize 847cabdff1aSopenharmony_ci ret 848cabdff1aSopenharmony_ci%endmacro 849cabdff1aSopenharmony_ci 850cabdff1aSopenharmony_ciINIT_MMX mmxext 851cabdff1aSopenharmony_ciH_NRD 4 852cabdff1aSopenharmony_ciINIT_XMM sse2 853cabdff1aSopenharmony_ciH_NRD 8 854cabdff1aSopenharmony_ci 855cabdff1aSopenharmony_ci%macro MC21 
2 856cabdff1aSopenharmony_cicglobal_mc %1, mc21, %2, 3,7,12 857cabdff1aSopenharmony_ci mov r5, r1 858cabdff1aSopenharmony_ci.body: 859cabdff1aSopenharmony_ci%define PAD mmsize*8*3*2 ; SIZE*16*4*sizeof(pixel) 860cabdff1aSopenharmony_ci mov r6, rsp ; backup stack pointer 861cabdff1aSopenharmony_ci and rsp, ~(mmsize-1) ; align stack 862cabdff1aSopenharmony_ci 863cabdff1aSopenharmony_ci sub rsp, PAD 864cabdff1aSopenharmony_ci call put_h%2_10 865cabdff1aSopenharmony_ci 866cabdff1aSopenharmony_ci sub rsp, PAD 867cabdff1aSopenharmony_ci call put_hv%2_10 868cabdff1aSopenharmony_ci 869cabdff1aSopenharmony_ci mov r4d, PAD-mmsize ; H buffer 870cabdff1aSopenharmony_ci jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body 871cabdff1aSopenharmony_ci%endmacro 872cabdff1aSopenharmony_ci 873cabdff1aSopenharmony_ciMC MC21 874cabdff1aSopenharmony_ci 875cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 876cabdff1aSopenharmony_ci; void ff_h264_qpel_mc23(uint8_t *dst, uint8_t *src, int stride) 877cabdff1aSopenharmony_ci;----------------------------------------------------------------------------- 878cabdff1aSopenharmony_ci%macro MC23 2 879cabdff1aSopenharmony_cicglobal_mc %1, mc23, %2, 3,7,12 880cabdff1aSopenharmony_ci lea r5, [r1+r2] 881cabdff1aSopenharmony_ci jmp stub_%1_h264_qpel%2_mc21_10 %+ SUFFIX %+ .body 882cabdff1aSopenharmony_ci%endmacro 883cabdff1aSopenharmony_ci 884cabdff1aSopenharmony_ciMC MC23 885