;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 iDCT code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

cextern pw_1023
%define pw_pixel_max pw_1023
cextern pd_32

;-----------------------------------------------------------------------------
; void ff_h264_idct_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
; STORE_DIFFx2: add two rows of residual to two rows of destination pixels.
;   %1, %2 = registers of dword residuals; both are shifted right by 6 and
;            packed to words (%1 -> low half, %2 -> high half of %1)
;   %3     = scratch register that receives the destination pixels
;   %4     = lower clip bound handed to CLIPW (callers pass a zeroed register)
;   %5     = address of the first row, %6 = offset of the second row
; The saturated sum is clipped against %4 and pw_pixel_max and stored in place.
%macro STORE_DIFFx2 6
    psrad       %1, 6
    psrad       %2, 6
    packssdw    %1, %2
    movq        %3, [%5]            ; 8 bytes of row 0 -> low half
    movhps      %3, [%5+%6]         ; 8 bytes of row 1 -> high half
    paddsw      %1, %3
    CLIPW       %1, %4, [pw_pixel_max]
    movq      [%5], %1
    movhps [%5+%6], %1
%endmacro

; STORE_DIFF16: same operation as STORE_DIFFx2 for one full-register row.
;   %1, %2 = dword residual registers (shifted right by 6, packed into %1)
;   %3, %4 = lower / upper clip bounds handed to CLIPW
;   %5     = address of the row (read by paddsw, written with mova)
%macro STORE_DIFF16 5
    psrad       %1, 6
    psrad       %2, 6
    packssdw    %1, %2
    paddsw      %1, [%5]
    CLIPW       %1, %3, %4
    mova      [%5], %1
%endmacro

;dst, in, stride
; Two-pass 4x4 transform of the four 16-byte coefficient rows at %2, followed
; by an add-and-clip into the four destination rows starting at %1.
; The coefficient rows are overwritten with zeros and %1 is advanced by
; 2*%3.  Uses m0-m5.
%macro IDCT4_ADD_10 3
    mova  m0, [%2+ 0]
    mova  m1, [%2+16]
    mova  m2, [%2+32]
    mova  m3, [%2+48]
    IDCT4_1D d,0,1,2,3,4,5
    TRANSPOSE4x4D 0,1,2,3,4
    paddd m0, [pd_32]               ; rounding term for the final >>6
    IDCT4_1D d,0,1,2,3,4,5
    pxor  m5, m5                    ; zero: clears the block, lower clip bound
    mova  [%2+ 0], m5
    mova  [%2+16], m5
    mova  [%2+32], m5
    mova  [%2+48], m5
    STORE_DIFFx2 m0, m1, m4, m5, %1, %3
    lea   %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m4, m5, %1, %3
%endmacro

; Emits h264_idct_add_10 for the currently selected instruction set:
; r0 = dst, r1 = block, r2 = stride (sign-extended on x86-64).
%macro IDCT_ADD_10 0
cglobal h264_idct_add_10, 3,3
    movsxdifnidn r2, r2d
    IDCT4_ADD_10 r0, r1, r2
    RET
%endmacro

INIT_XMM sse2
IDCT_ADD_10
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD_10
%endif

;-----------------------------------------------------------------------------
; void ff_h264_idct_add16_10(pixel *dst, const int *block_offset,
;                            int16_t *block, int stride,
;                            const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
;;;;;;; NO FATE SAMPLES TRIGGER THIS
; Emits the file-local routine add4x4_idct<SUFFIX>, reached with `call`:
;   r0 = dst base, r5 = offset added to r0, r2 = coefficients, r3 = stride.
; r5 is clobbered.  The body is the same instruction sequence as
; IDCT4_ADD_10, so that macro is expanded here instead of being repeated.
%macro ADD4x4IDCT 0
add4x4_idct %+ SUFFIX:
    add   r5, r0
    IDCT4_ADD_10 r5, r2, r3
    ret
%endmacro

INIT_XMM sse2
ALIGN 16
ADD4x4IDCT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
ALIGN 16
ADD4x4IDCT
%endif

; One step of h264_idct_add16_10.
;   %1 = block index (selects block_offset[%1] and names the skip label)
;   %2 = index of the block's byte in nnzc (r4)
; The block is transformed only when its nnzc byte is non-zero.  r2 then moves
; on by 64 bytes (the four 16-byte rows of one block), except after block 15.
%macro ADD16_OP 2
    cmp          byte [r4+%2], 0
    jz .skipblock%1
    mov         r5d, [r1+%1*4]
    call add4x4_idct %+ SUFFIX
.skipblock%1:
%if %1<15
    add          r2, 64
%endif
%endmacro

; r0 = dst, r1 = block_offset, r2 = block, r3 = stride, r4 = nnzc; r5 is the
; scratch register consumed by add4x4_idct.
%macro IDCT_ADD16_10 0
cglobal h264_idct_add16_10, 5,6
    movsxdifnidn r3, r3d
    ADD16_OP  0, 4+1*8
    ADD16_OP  1, 5+1*8
    ADD16_OP  2, 4+2*8
    ADD16_OP  3, 5+2*8
    ADD16_OP  4, 6+1*8
    ADD16_OP  5, 7+1*8
    ADD16_OP  6, 6+2*8
    ADD16_OP  7, 7+2*8
    ADD16_OP  8, 4+3*8
    ADD16_OP  9, 5+3*8
    ADD16_OP 10, 4+4*8
    ADD16_OP 11, 5+4*8
    ADD16_OP 12, 6+3*8
    ADD16_OP 13, 7+3*8
    ADD16_OP 14, 6+4*8
    ADD16_OP 15, 7+4*8
    REP_RET
%endmacro

INIT_XMM sse2
IDCT_ADD16_10
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD16_10
%endif

;-----------------------------------------------------------------------------
; void ff_h264_idct_dc_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
; Adds the word vector in m0 to four rows of pixels and clips the result.
;   %1 = dst, %2 = stride, %3 = 3*stride
; Expects m0 = value to add and m6 = upper clip bound; zeroes m5 for use as
; the lower bound and clobbers m1-m4.
%macro IDCT_DC_ADD_OP_10 3
    pxor      m5, m5
%if avx_enabled
    paddw     m1, m0, [%1+0   ]
    paddw     m2, m0, [%1+%2  ]
    paddw     m3, m0, [%1+%2*2]
    paddw     m4, m0, [%1+%3  ]
%else
    mova      m1, [%1+0   ]
    mova      m2, [%1+%2  ]
    mova      m3, [%1+%2*2]
    mova      m4, [%1+%3  ]
    paddw     m1, m0
    paddw     m2, m0
    paddw     m3, m0
    paddw     m4, m0
%endif
    CLIPW     m1, m5, m6
    CLIPW     m2, m5, m6
    CLIPW     m3, m5, m6
    CLIPW     m4, m5, m6
    mova [%1+0   ], m1
    mova [%1+%2  ], m2
    mova [%1+%2*2], m3
    mova [%1+%3  ], m4
%endmacro

; MMX-register version: r0 = dst, r1 = block, r2 = stride.
INIT_MMX mmxext
cglobal h264_idct_dc_add_10,3,3
    movsxdifnidn r2, r2d
    movd      m0, [r1]              ; 32-bit DC coefficient
    mov dword [r1], 0               ; ...which is cleared once read
    paddd     m0, [pd_32]
    psrad     m0, 6                 ; (dc + 32) >> 6
    lea       r1, [r2*3]            ; r1 is free again: reuse it as 3*stride
    pshufw    m0, m0, 0             ; broadcast the low word
    mova      m6, [pw_pixel_max]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    RET

;-----------------------------------------------------------------------------
; void ff_h264_idct8_dc_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
; r0 = dst, r1 = block, r2 = stride.  Eight rows are handled as two groups of
; four, with r0 advanced by 4*stride in between.
%macro IDCT8_DC_ADD 0
cglobal h264_idct8_dc_add_10,3,4,7
    movsxdifnidn r2, r2d
    movd      m0, [r1]
    mov dword[r1], 0
    paddd     m0, [pd_32]
    psrad     m0, 6                 ; (dc + 32) >> 6
    lea       r1, [r2*3]            ; 3*stride
    SPLATW    m0, m0, 0
    mova      m6, [pw_pixel_max]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    lea       r0, [r0+r2*4]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    RET
%endmacro

INIT_XMM sse2
IDCT8_DC_ADD
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_DC_ADD
%endif

;-----------------------------------------------------------------------------
; void ff_h264_idct_add16intra_10(pixel *dst, const int *block_offset,
;                                 int16_t *block, int stride,
;                                 const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
; Out-of-line path for block pair %1 / %1+1, entered from ADD16_OP_INTRA via
; .ac%1: runs add4x4_idct on both blocks, leaves r2 advanced by 128 in total
; and resumes at .skipadd%1.
%macro AC 1
.ac%1:
    mov  r5d, [r1+(%1+0)*4]
    call add4x4_idct %+ SUFFIX
    mov  r5d, [r1+(%1+1)*4]
    add  r2, 64
    call add4x4_idct %+ SUFFIX
    add  r2, 64
    jmp .skipadd%1
%endmacro

%assign last_block 16
; One step over the block pair %1 / %1+1.
;   %1 = index of the first block of the pair, %2 = its index in nnzc (r4)
; The word compare reads two adjacent nnzc bytes at once; if either is
; non-zero the pair goes to .ac%1.  Otherwise, if either block's first dword
; ([r2], [r2+64]) is non-zero, idct_dc_add handles the pair.  r2 then advances
; by 128 unless this is the final pair, judged against last_block at the time
; the macro is expanded.
%macro ADD16_OP_INTRA 2
    cmp      word [r4+%2], 0
    jnz .ac%1
    mov      r5d, [r2+ 0]
    or       r5d, [r2+64]
    jz .skipblock%1
    mov      r5d, [r1+(%1+0)*4]
    call idct_dc_add %+ SUFFIX
.skipblock%1:
%if %1<last_block-2
    add       r2, 128
%endif
.skipadd%1:
%endmacro

; Emits the file-local routine idct_dc_add<SUFFIX> followed by
; h264_idct_add16intra_10.
;
; idct_dc_add: r0 = dst base, r5 = offset added to r0, r2 = coefficients,
; r3 = stride.  The first dword of the block at r2 lands in the low half of
; m0 and that of the block at r2+64 in the high half; both are cleared in
; memory.  Clobbers r5, r6 and m0-m6.
%macro IDCT_ADD16INTRA_10 0
idct_dc_add %+ SUFFIX:
    add       r5, r0
    movq      m0, [r2+ 0]
    movhps    m0, [r2+64]
    mov dword [r2+ 0], 0
    mov dword [r2+64], 0
    paddd     m0, [pd_32]
    psrad     m0, 6                 ; (dc + 32) >> 6 in each dword
    pshufhw   m0, m0, 0             ; broadcast within the high half
    pshuflw   m0, m0, 0             ; broadcast within the low half
    lea       r6, [r3*3]
    mova      m6, [pw_pixel_max]
    IDCT_DC_ADD_OP_10 r5, r3, r6
    ret

; r0 = dst, r1 = block_offset, r2 = block, r3 = stride, r4 = nnzc.
; The AC bodies sit after REP_RET so the straight-line path falls through.
cglobal h264_idct_add16intra_10,5,7,8
    movsxdifnidn r3, r3d
    ADD16_OP_INTRA  0, 4+1*8
    ADD16_OP_INTRA  2, 4+2*8
    ADD16_OP_INTRA  4, 6+1*8
    ADD16_OP_INTRA  6, 6+2*8
    ADD16_OP_INTRA  8, 4+3*8
    ADD16_OP_INTRA 10, 4+4*8
    ADD16_OP_INTRA 12, 6+3*8
    ADD16_OP_INTRA 14, 6+4*8
    REP_RET
    AC  8
    AC 10
    AC 12
    AC 14
    AC  0
    AC  2
    AC  4
    AC  6
%endmacro

INIT_XMM sse2
IDCT_ADD16INTRA_10
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD16INTRA_10
%endif

%assign last_block 36
;-----------------------------------------------------------------------------
; void ff_h264_idct_add8_10(pixel **dst, const int *block_offset,
;                           int16_t *block, int stride,
;                           const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
; r0 = dst (array of plane pointers), r1 = block_offset, r2 = block,
; r3 = stride, r4 = nnzc.  dst[0] is processed with block pairs 16 and 18,
; then dst[1] with pairs 32 and 34.  ADD16_OP_INTRA and AC do the per-pair
; work; last_block must be 36 when this macro is expanded.
%macro IDCT_ADD8 0
cglobal h264_idct_add8_10,5,8,7
    movsxdifnidn r3, r3d
%if ARCH_X86_64
    mov      r7, r0                 ; keep the pointer array: r0 is reused below
%endif
    add      r2, 1024               ; start at byte offset 16*64 of the coefficients
    mov      r0, [r0]               ; r0 = dst[0]
    ADD16_OP_INTRA 16, 4+ 6*8
    ADD16_OP_INTRA 18, 4+ 7*8
    add      r2, 1024-128*2         ; the two pairs above advanced r2 by 128 each
%if ARCH_X86_64
    mov      r0, [r7+gprsize]       ; r0 = dst[1]
%else
    mov      r0, r0m                ; reload the array pointer from the argument
    mov      r0, [r0+gprsize]       ; r0 = dst[1]
%endif
    ADD16_OP_INTRA 32, 4+11*8
    ADD16_OP_INTRA 34, 4+12*8
    REP_RET
    AC 16
    AC 18
    AC 32
    AC 34

%endmacro ; IDCT_ADD8

INIT_XMM sse2
IDCT_ADD8
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD8
%endif

;-----------------------------------------------------------------------------
; void ff_h264_idct_add8_422_10(pixel **dst, const int *block_offset,
;                               int16_t *block, int stride,
;                               const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
%assign last_block 44

; Same layout as h264_idct_add8_10 with four block pairs per destination
; pointer: 16, 18, 24, 26 for dst[0] and 32, 34, 40, 42 for dst[1].
; last_block must be 44 when this macro is expanded.
%macro IDCT_ADD8_422 0

cglobal h264_idct_add8_422_10, 5, 8, 7
    movsxdifnidn r3, r3d
%if ARCH_X86_64
    mov      r7, r0                 ; keep the pointer array: r0 is reused below
%endif

    add      r2, 1024               ; start at byte offset 16*64 of the coefficients
    mov      r0, [r0]               ; r0 = dst[0]
    ADD16_OP_INTRA 16, 4+ 6*8
    ADD16_OP_INTRA 18, 4+ 7*8
    ADD16_OP_INTRA 24, 4+ 8*8 ; i+4
    ADD16_OP_INTRA 26, 4+ 9*8 ; i+4
    add      r2, 1024-128*4         ; the four pairs above advanced r2 by 128 each

%if ARCH_X86_64
    mov      r0, [r7+gprsize]       ; r0 = dst[1]
%else
    mov      r0, r0m                ; reload the array pointer from the argument
    mov      r0, [r0+gprsize]       ; r0 = dst[1]
%endif

    ADD16_OP_INTRA 32, 4+11*8
    ADD16_OP_INTRA 34, 4+12*8
    ADD16_OP_INTRA 40, 4+13*8 ; i+4
    ADD16_OP_INTRA 42, 4+14*8 ; i+4
    REP_RET
    AC 16
    AC 18
    AC 24 ; i+4
    AC 26 ; i+4
    AC 32
    AC 34
    AC 40 ; i+4
    AC 42 ; i+4

%endmacro

INIT_XMM sse2
IDCT_ADD8_422
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD8_422
%endif

;-----------------------------------------------------------------------------
; void ff_h264_idct8_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
; One 8-point 1-D transform pass over dword lanes.
;   %1, %2 = operands supplying the two inputs that are not kept in registers;
;            IDCT8_1D_FULL passes rows 0 and 4.  They are read only after
;            m2 and m5 have been freed.
; On entry m1, m2, m3, m5, m6, m7 hold the other six inputs (see
; IDCT8_1D_FULL).  Statement order and the SWAP renames depend on each other;
; do not reorder.
%macro IDCT8_1D 2
    SWAP      0, 1
    psrad     m4, m5, 1
    psrad     m1, m0, 1
    paddd     m4, m5
    paddd     m1, m0
    paddd     m4, m7
    paddd     m1, m5
    psubd     m4, m0
    paddd     m1, m3

    psubd     m0, m3
    psubd     m5, m3
    paddd     m0, m7
    psubd     m5, m7
    psrad     m3, 1
    psrad     m7, 1
    psubd     m0, m3
    psubd     m5, m7

    SWAP      1, 7
    psrad     m1, m7, 2
    psrad     m3, m4, 2
    paddd     m3, m0
    psrad     m0, 2
    paddd     m1, m5
    psrad     m5, 2
    psubd     m0, m4
    psubd     m7, m5

    SWAP      5, 6
    psrad     m4, m2, 1
    psrad     m6, m5, 1
    psubd     m4, m5
    paddd     m6, m2

    mova      m2, %1
    mova      m5, %2
    SUMSUB_BA d, 5, 2
    SUMSUB_BA d, 6, 5
    SUMSUB_BA d, 4, 2
    SUMSUB_BA d, 7, 6
    SUMSUB_BA d, 0, 4
    SUMSUB_BA d, 3, 2
    SUMSUB_BA d, 1, 5
    SWAP      7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro

; Loads rows 1, 2, 3, 5, 6, 7 of the block at %1 (rows are 16*2 bytes apart)
; into the matching registers and runs IDCT8_1D with rows 0 and 4 taken from
; memory.
%macro IDCT8_1D_FULL 1
    mova         m7, [%1+112*2]
    mova         m6, [%1+ 96*2]
    mova         m5, [%1+ 80*2]
    mova         m3, [%1+ 48*2]
    mova         m2, [%1+ 32*2]
    mova         m1, [%1+ 16*2]
    IDCT8_1D   [%1], [%1+ 64*2]
%endmacro

; %1=int16_t *block, %2=int16_t *dstblock
; First pass over one register-wide column group of the block, followed by
; two 4x4 dword transposes.  On x86-64 only m0 and m4 are written to %2 and
; the remaining rows stay in registers for the caller; on x86-32 all eight
; results are written to %2, using [%1] as a temporary spill slot for m7.
%macro IDCT8_ADD_SSE_START 2
    IDCT8_1D_FULL %1
%if ARCH_X86_64
    TRANSPOSE4x4D 0,1,2,3,8
    mova [%2    ], m0
    TRANSPOSE4x4D 4,5,6,7,8
    mova [%2+8*2], m4
%else
    mova [%1], m7                   ; free m7 as the transpose scratch register
    TRANSPOSE4x4D 0,1,2,3,7
    mova m7, [%1]
    mova [%2     ], m0
    mova [%2+16*2], m1
    mova [%2+32*2], m2
    mova [%2+48*2], m3
    TRANSPOSE4x4D 4,5,6,7,3
    mova [%2+ 8*2], m4
    mova [%2+24*2], m5
    mova [%2+40*2], m6
    mova [%2+56*2], m7
%endif
%endmacro

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
; Second pass plus add/clip/store of eight rows.  m6 and m7 are parked in the
; block buffer because STORE_DIFFx2 needs a scratch and a zero register; they
; are reloaded into m0/m1 for the last two rows.  Leaves m7 zeroed and %1
; advanced by 6*%3.
%macro IDCT8_ADD_SSE_END 3
    IDCT8_1D_FULL %2
    mova  [%2     ], m6
    mova  [%2+16*2], m7

    pxor         m7, m7
    STORE_DIFFx2 m0, m1, m6, m7, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m6, m7, %1, %3
    mova         m0, [%2     ]
    mova         m1, [%2+16*2]
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m4, m5, m6, m7, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m0, m1, m6, m7, %1, %3
%endmacro

; Emits the public h264_idct8_add_10 and the file-local
; h264_idct8_add1_10<SUFFIX> (r0 = dst, r1 = block, r2 = stride), which
; h264_idct8_add4_10 also calls.  On UNIX64 the public entry falls through
; into the local routine; elsewhere it reaches it with a call and returns
; through RET.
; NOTE(review): presumably the call exists so that RET can run the x86inc
; epilogue on those targets - confirm against x86inc.asm.
%macro IDCT8_ADD 0
cglobal h264_idct8_add_10, 3,4,16
    movsxdifnidn r2, r2d
%if UNIX64 == 0
    %assign pad 16-gprsize-(stack_offset&15)
    sub  rsp, pad
    call h264_idct8_add1_10 %+ SUFFIX
    add  rsp, pad
    RET
%endif

ALIGN 16
; TODO: does not need to use stack
h264_idct8_add1_10 %+ SUFFIX:
%assign pad 256+16-gprsize
    sub          rsp, pad           ; scratch area addressed as [rsp+0..255] below
    add   dword [r1], 32            ; rounding term for the final >>6, added to the first coefficient

%if ARCH_X86_64
    IDCT8_ADD_SSE_START r1, rsp
    SWAP 1,  9
    SWAP 2, 10
    SWAP 3, 11
    SWAP 5, 13
    SWAP 6, 14
    SWAP 7, 15
    IDCT8_ADD_SSE_START r1+16, rsp+128
    PERMUTE 1,9, 2,10, 3,11, 5,1, 6,2, 7,3, 9,13, 10,14, 11,15, 13,5, 14,6, 15,7
    IDCT8_1D [rsp], [rsp+128]
    SWAP 0,  8
    SWAP 1,  9
    SWAP 2, 10
    SWAP 3, 11
    SWAP 4, 12
    SWAP 5, 13
    SWAP 6, 14
    SWAP 7, 15
    IDCT8_1D [rsp+16], [rsp+144]
    ; First output row: done by hand because m8 only becomes free for the
    ; pixel maximum once this row has been stored.
    psrad         m8, 6
    psrad         m0, 6
    packssdw      m8, m0
    paddsw        m8, [r0]
    pxor          m0, m0            ; zero: clears the block, lower clip bound
    mova    [r1+  0], m0
    mova    [r1+ 16], m0
    mova    [r1+ 32], m0
    mova    [r1+ 48], m0
    mova    [r1+ 64], m0
    mova    [r1+ 80], m0
    mova    [r1+ 96], m0
    mova    [r1+112], m0
    mova    [r1+128], m0
    mova    [r1+144], m0
    mova    [r1+160], m0
    mova    [r1+176], m0
    mova    [r1+192], m0
    mova    [r1+208], m0
    mova    [r1+224], m0
    mova    [r1+240], m0
    CLIPW         m8, m0, [pw_pixel_max]
    mova        [r0], m8
    mova          m8, [pw_pixel_max]
    STORE_DIFF16  m9, m1, m0, m8, r0+r2
    lea           r0, [r0+r2*2]
    STORE_DIFF16 m10, m2, m0, m8, r0
    STORE_DIFF16 m11, m3, m0, m8, r0+r2
    lea           r0, [r0+r2*2]
    STORE_DIFF16 m12, m4, m0, m8, r0
    STORE_DIFF16 m13, m5, m0, m8, r0+r2
    lea           r0, [r0+r2*2]
    STORE_DIFF16 m14, m6, m0, m8, r0
    STORE_DIFF16 m15, m7, m0, m8, r0+r2
%else
    IDCT8_ADD_SSE_START r1,    rsp
    IDCT8_ADD_SSE_START r1+16, rsp+128
    lea           r3, [r0+8]        ; second destination pointer, 8 bytes to the right
    IDCT8_ADD_SSE_END r0, rsp,    r2
    IDCT8_ADD_SSE_END r3, rsp+16, r2
    ; m7 is still zero from IDCT8_ADD_SSE_END: clear the coefficient block.
    mova    [r1+  0], m7
    mova    [r1+ 16], m7
    mova    [r1+ 32], m7
    mova    [r1+ 48], m7
    mova    [r1+ 64], m7
    mova    [r1+ 80], m7
    mova    [r1+ 96], m7
    mova    [r1+112], m7
    mova    [r1+128], m7
    mova    [r1+144], m7
    mova    [r1+160], m7
    mova    [r1+176], m7
    mova    [r1+192], m7
    mova    [r1+208], m7
    mova    [r1+224], m7
    mova    [r1+240], m7
%endif ; ARCH_X86_64

    add          rsp, pad
    ret
%endmacro

INIT_XMM sse2
IDCT8_ADD
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_ADD
%endif

;-----------------------------------------------------------------------------
; void ff_h264_idct8_add4_10(pixel **dst, const int *block_offset,
;                            int16_t *block, int stride,
;                            const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
;;;;;;; NO FATE SAMPLES TRIGGER THIS
; One step of h264_idct8_add4_10.
;   %1 = index into block_offset (r6), %2 = index of the block's byte in
;        nnzc (r4)
; When the nnzc byte is non-zero, r0 = r5 + block_offset[%1] and
; h264_idct8_add1_10 is called.  r1 then advances by 256 bytes, the amount
; that routine clears, except after the last block (%1 == 12).
%macro IDCT8_ADD4_OP 2
    cmp       byte [r4+%2], 0
    jz .skipblock%1
    mov      r0d, [r6+%1*4]
    add       r0, r5
    call h264_idct8_add1_10 %+ SUFFIX
.skipblock%1:
%if %1<12
    add       r1, 256
%endif
%endmacro

; cglobal loads no arguments here; they are moved by hand into the registers
; h264_idct8_add1_10 expects:
;   r5 = dst, r6 = block_offset, r1 = block, r2 = stride, r4 = nnzc
%macro IDCT8_ADD4 0
cglobal h264_idct8_add4_10, 0,7,16
    movsxdifnidn r3, r3d
    %assign pad 16-gprsize-(stack_offset&15)
    SUB      rsp, pad
    mov       r5, r0mp
    mov       r6, r1mp
    mov       r1, r2mp
    ; NOTE(review): this is a 32-bit move, so the sign extension applied to r3
    ; above does not carry over to r2 - confirm stride is never negative here.
    mov      r2d, r3m
    movifnidn r4, r4mp
    IDCT8_ADD4_OP  0, 4+1*8
    IDCT8_ADD4_OP  4, 6+1*8
    IDCT8_ADD4_OP  8, 4+3*8
    IDCT8_ADD4_OP 12, 6+3*8
    ADD       rsp, pad
    RET
%endmacro ; IDCT8_ADD4

INIT_XMM sse2
IDCT8_ADD4
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_ADD4
%endif