;*****************************************************************************
;* MMX/SSE2-optimized H.264 iDCT
;*****************************************************************************
;* Copyright (C) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <hal@duncan.ol.sub.de>
;*          Min Chen <chenm001.163.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Byte table indexed by block number. Each entry (written as column + row*8)
; is used by the functions below as an offset into the nnzc[] argument.
scan8_mem: db  4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
           db  6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
           db  4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
           db  6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
           db  4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
           db  6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
           db  4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
           db  6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
           db  4+11*8, 5+11*8, 4+12*8, 5+12*8
           db  6+11*8, 7+11*8, 6+12*8, 7+12*8
           db  4+13*8, 5+13*8, 4+14*8, 5+14*8
           db  6+13*8, 7+13*8, 6+14*8, 7+14*8

; With PIC the table is addressed through one extra GPR (picregq), which each
; user loads with "lea picregq, [scan8_mem]"; otherwise scan8 is the symbol
; itself. npicregs is added to the GPR count in the affected cglobal lines.
%ifdef PIC
%define npicregs 1
%define scan8 picregq
%else
%define npicregs 0
%define scan8 scan8_mem
%endif

cextern pw_32
cextern pw_1

SECTION .text

; 4x4 inverse transform of one coefficient block, added to the destination.
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
; The coefficient block (32 bytes at %2) is zeroed afterwards.
; %1 is advanced by 2*stride; m0-m7 are used.
%macro IDCT4_ADD 3
    ; Load dct coeffs
    movq         m0, [%2]
    movq         m1, [%2+8]
    movq         m2, [%2+16]
    movq         m3, [%2+24]

    IDCT4_1D      w, 0, 1, 2, 3, 4, 5       ; first 1-D pass
    mova         m6, [pw_32]
    ; transpose between the two passes (MMX and XMM register layouts differ)
    %if mmsize == 8
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    %else
    punpcklwd    m0, m1
    punpcklwd    m2, m3
    SBUTTERFLY   dq, 0, 2, 4
    MOVHL        m1, m0
    MOVHL        m3, m2
    %endif
    paddw        m0, m6                     ; add pw_32 before the final >>6 in STORE_DIFFx2
    IDCT4_1D      w, 0, 1, 2, 3, 4, 5       ; second 1-D pass
    pxor         m7, m7
    ; clear the coefficient block
    movq    [%2+ 0], m7
    movq    [%2+ 8], m7
    movq    [%2+16], m7
    movq    [%2+24], m7

    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3
%endmacro

; One 1-D pass of the 8-point transform.
; Inputs 1,2,3,5,6,7 must already be in m1,m2,m3,m5,m6,m7; the remaining two
; inputs are read from operands %1 and %2 (IDCT8_1D_FULL passes rows 0 and 4).
; All of m0-m7 are overwritten; the final SWAP renames the registers so the
; results end up in m0..m7 in natural order.
%macro IDCT8_1D 2
    psraw        m0, m1, 1
    SWAP          0, 1
    psraw        m4, m5, 1
    paddw        m4, m5
    paddw        m1, m0
    paddw        m4, m7
    paddw        m1, m5
    psubw        m4, m0
    paddw        m1, m3

    psubw        m0, m3
    psubw        m5, m3
    psraw        m3, 1
    paddw        m0, m7
    psubw        m5, m7
    psraw        m7, 1
    psubw        m0, m3
    psubw        m5, m7

    psraw        m7, m1, 2
    SWAP          7, 1
    psraw        m3, m4, 2
    paddw        m3, m0
    psraw        m0, 2
    paddw        m1, m5
    psraw        m5, 2
    psubw        m0, m4
    psubw        m7, m5

    psraw        m5, m6, 1
    SWAP          5, 6
    psraw        m4, m2, 1
    paddw        m6, m2
    psubw        m4, m5

    mova         m2, %1
    mova         m5, %2
    SUMSUB_BA     w, 5, 2
    SUMSUB_BA     w, 6, 5
    SUMSUB_BA     w, 4, 2
    SUMSUB_BA     w, 7, 6
    SUMSUB_BA     w, 0, 4
    SUMSUB_BA     w, 3, 2
    SUMSUB_BA     w, 1, 5
    SWAP          7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro

; Load rows 1,2,3,5,6,7 (16-byte row pitch) from %1 and run IDCT8_1D with
; rows 0 and 4 supplied as memory operands.
; %1=int16_t *block
%macro IDCT8_1D_FULL 1
    mova         m7, [%1+112]
    mova         m6, [%1+ 96]
    mova         m5, [%1+ 80]
    mova         m3, [%1+ 48]
    mova         m2, [%1+ 32]
    mova         m1, [%1+ 16]
    IDCT8_1D   [%1], [%1+ 64]
%endmacro

; First pass over %1 followed by two 4x4 transposes written to %2.
; [%1] is used as a temporary spill slot for m7 during the first transpose.
; %1=int16_t *block, %2=int16_t *dstblock
%macro IDCT8_ADD_MMX_START 2
    IDCT8_1D_FULL %1
    mova       [%1], m7
    TRANSPOSE4x4W 0, 1, 2, 3, 7
    mova         m7, [%1]
    mova    [%2   ], m0
    mova    [%2+16], m1
    mova    [%2+32], m2
    mova    [%2+48], m3
    TRANSPOSE4x4W 4, 5, 6, 7, 3
    mova    [%2+ 8], m4
    mova    [%2+24], m5
    mova    [%2+40], m6
    mova    [%2+56], m7
%endmacro

; Second pass over %2 and add of the result (>>6) to eight rows of %1.
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
; optional %4=pointer to a 128-byte area that is zeroed
; Rows 5-7 are parked in [%2], [%2+16], [%2+32] while m5-m7 serve as
; temporaries for STORE_DIFFx2. %1 is advanced by 6*stride.
%macro IDCT8_ADD_MMX_END 3-4
    IDCT8_1D_FULL %2
    mova    [%2   ], m5
    mova    [%2+16], m6
    mova    [%2+32], m7

    pxor         m7, m7
%if %0 == 4
    movq   [%4+  0], m7
    movq   [%4+  8], m7
    movq   [%4+ 16], m7
    movq   [%4+ 24], m7
    movq   [%4+ 32], m7
    movq   [%4+ 40], m7
    movq   [%4+ 48], m7
    movq   [%4+ 56], m7
    movq   [%4+ 64], m7
    movq   [%4+ 72], m7
    movq   [%4+ 80], m7
    movq   [%4+ 88], m7
    movq   [%4+ 96], m7
    movq   [%4+104], m7
    movq   [%4+112], m7
    movq   [%4+120], m7
%endif
    STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3
    mova         m0, [%2   ]
    mova         m1, [%2+16]
    mova         m2, [%2+32]
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
%endmacro

; Full 8x8 inverse transform of one block, added to the destination.
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
; %4=scratch GPR (overwritten with stride*3)
; The 128-byte coefficient block at %2 is zeroed; %1 is advanced by 4*stride.
; On x86-64 m8/m9 hold the two extra rows; on x86-32 they are spilled to
; [%2] and [%2+16] instead.
%macro IDCT8_ADD_SSE 4
    IDCT8_1D_FULL %2
%if ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16]
%endif
    paddw        m0, [pw_32]                ; added before the final >>6 in STORE_DIFF

%if ARCH_X86_64 == 0
    mova    [%2   ], m0
    mova    [%2+16], m4
    IDCT8_1D   [%2], [%2+ 16]
    mova    [%2   ], m6
    mova    [%2+16], m7
%else
    SWAP          0, 8
    SWAP          4, 9
    IDCT8_1D     m8, m9
    SWAP          6, 8
    SWAP          7, 9
%endif

    pxor         m7, m7
    lea          %4, [%3*3]
    STORE_DIFF   m0, m6, m7, [%1     ]
    STORE_DIFF   m1, m6, m7, [%1+%3  ]
    STORE_DIFF   m2, m6, m7, [%1+%3*2]
    STORE_DIFF   m3, m6, m7, [%1+%4  ]
%if ARCH_X86_64 == 0
    mova         m0, [%2   ]
    mova         m1, [%2+16]
%else
    SWAP          0, 8
    SWAP          1, 9
%endif
    ; clear the coefficient block
    mova   [%2+  0], m7
    mova   [%2+ 16], m7
    mova   [%2+ 32], m7
    mova   [%2+ 48], m7
    mova   [%2+ 64], m7
    mova   [%2+ 80], m7
    mova   [%2+ 96], m7
    mova   [%2+112], m7
    lea          %1, [%1+%3*4]
    STORE_DIFF   m4, m6, m7, [%1     ]
    STORE_DIFF   m5, m6, m7, [%1+%3  ]
    STORE_DIFF   m0, m6, m7, [%1+%3*2]
    STORE_DIFF   m1, m6, m7, [%1+%4  ]
%endmacro

INIT_XMM sse2
; void ff_h264_idct8_add_8_sse2(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_8, 3, 4, 10
    movsxdifnidn  r2, r2d
    IDCT8_ADD_SSE r0, r1, r2, r3
    RET

; Prepare the DC-only add.
; %1=GPR holding the DC coefficient, %2=stride
; Out: m0 = (dc+32)>>6 broadcast and packed with unsigned saturation,
;      m1 = the negated value packed the same way,
;      %1 = stride*3
%macro DC_ADD_MMXEXT_INIT 2
    add          %1, 32
    sar          %1, 6
    movd         m0, %1d
    lea          %1, [%2*3]
    pshufw       m0, m0, 0
    pxor         m1, m1
    psubw        m1, m0
    packuswb     m0, m0
    packuswb     m1, m1
%endmacro

; Apply the DC value prepared in m0/m1 to four rows.
; %1=load/store instruction, %2=dst, %3=stride, %4=stride*3
; Each row gets a saturating add of m0 followed by a saturating subtract of
; m1. Uses m2-m5.
%macro DC_ADD_MMXEXT_OP 4
    %1           m2, [%2     ]
    %1           m3, [%2+%3  ]
    %1           m4, [%2+%3*2]
    %1           m5, [%2+%4  ]
    paddusb      m2, m0
    paddusb      m3, m0
    paddusb      m4, m0
    paddusb      m5, m0
    psubusb      m2, m1
    psubusb      m3, m1
    psubusb      m4, m1
    psubusb      m5, m1
    %1    [%2     ], m2
    %1    [%2+%3  ], m3
    %1    [%2+%3*2], m4
    %1    [%2+%4  ], m5
%endmacro

INIT_MMX mmxext
%if ARCH_X86_64
; void ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 3, 4, 0
    movsxd       r2, r2d
    movsx        r3, word [r1]              ; r3 = DC coefficient
    mov  dword [r1], 0                      ; clears the first two coefficients
    DC_ADD_MMXEXT_INIT r3, r2
    DC_ADD_MMXEXT_OP mova, r0, r2, r3       ; rows 0-3
    lea          r0, [r0+r2*4]
    DC_ADD_MMXEXT_OP mova, r0, r2, r3       ; rows 4-7
    RET
%else
; void ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 2, 3, 0
    movsx        r2, word [r1]              ; r2 = DC coefficient
    mov  dword [r1], 0                      ; clears the first two coefficients
    mov          r1, r2m                    ; r1 = stride (block pointer no longer needed)
    DC_ADD_MMXEXT_INIT r2, r1
    DC_ADD_MMXEXT_OP mova, r0, r1, r2       ; rows 0-3
    lea          r0, [r0+r1*4]
    DC_ADD_MMXEXT_OP mova, r0, r1, r2       ; rows 4-7
    RET
%endif

INIT_XMM sse2
; void ff_h264_idct8_add4_8_sse2(uint8_t *dst, const int *block_offset,
;                                int16_t *block, int stride,
;                                const uint8_t nnzc[6 * 8])
;
; r5 is the block index, stepping 0,4,8,12; r2 advances by 128 bytes per
; step. For each step nnzc[scan8[r5]] selects the path:
;   0                      -> skip
;   1 with a nonzero DC    -> DC-only add (MMX registers)
;   anything else          -> full 8x8 transform (XMM registers)
; r6 is scratch. On x86-32 dst2 aliases r1, which is reloaded from its stack
; argument after each block.
cglobal h264_idct8_add4_8, 5, 8 + npicregs, 10, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    movsxdifnidn r3, r3d
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]           ; r6 = nnzc[scan8[r5]]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]              ; r6 = DC coefficient
    test         r6, r6
    jz .no_dc
INIT_MMX cpuname
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0                     ; dst2 = dst + block_offset[r5]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
    lea       dst2q, [dst2q+r3*4]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m                    ; restore block_offset
%endif
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    REP_RET
.no_dc:
INIT_XMM cpuname
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0                     ; dst2 = dst + block_offset[r5]
    IDCT8_ADD_SSE dst2q, r2, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m                    ; restore block_offset
%endif
.skipblock:
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    REP_RET

INIT_MMX mmx
; Internal helper, reached with "call" from h264_idct_add8_422_8.
; In:  r1 = block_offset, r2 = coefficient pointer, r3 = stride, r4 = nnzc,
;      r5 = block index; scan8 must be addressable (picreg loaded when PIC).
;      x86-64: dst2q points at the destination pointer slot.
;      x86-32: the destination pointer slot is fetched through the caller's
;              first stack argument.
; Processes blocks until r5 reaches a multiple of 4, advancing r2 by 32
; bytes per block. A block is skipped when both its nnzc entry and its DC
; coefficient are zero.
; Clobbers r0, r6 and the MMX registers used by IDCT4_ADD.
h264_idct_add8_mmx_plane:
    movsxdifnidn r3, r3d
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]           ; r6 = nnzc[scan8[r5]]
    or          r6w, word [r2]              ; ... | DC coefficient
    test         r6, r6
    jz .skipblock
%if ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [dst2q]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    IDCT4_ADD    r0, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret

; Runs the plane helper four times: twice for the first destination pointer
; and, after stepping to the next pointer slot, twice for the second.
cglobal h264_idct_add8_422_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
; dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    movsxdifnidn r3, r3d
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
%if ARCH_X86_64
    mov       dst2q, r0
%endif

    mov          r5, 16  ; i
    add          r2, 512 ; i * 16 * sizeof(dctcoef) ; #define dctcoef int16_t

    call h264_idct_add8_mmx_plane
    add          r5, 4
    call h264_idct_add8_mmx_plane

%if ARCH_X86_64
    add       dst2q, gprsize ; dest[1]
%else
    add        r0mp, gprsize
%endif

    add          r5, 4   ; set to 32
    add          r2, 256 ; set to i * 16 * sizeof(dctcoef)

    call h264_idct_add8_mmx_plane
    add          r5, 4
    call h264_idct_add8_mmx_plane

    RET ; TODO: check rep ret after a function call

; DC-only add for two horizontally adjacent 4x4 blocks (coefficients at r2
; and r2+32); both DC coefficients are cleared.
; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
h264_idct_dc_add8_mmxext:
    movsxdifnidn r3, r3d
    movd         m0, [r2   ]          ;  0 0 X D
    mov word [r2+ 0], 0
    punpcklwd    m0, [r2+32]          ;  x X d D
    mov word [r2+32], 0
    paddsw       m0, [pw_32]
    psraw        m0, 6
    punpcklwd    m0, m0               ;  d d D D
    pxor         m1, m1               ;  0 0 0 0
    psubw        m1, m0               ; -d-d-D-D
    packuswb     m0, m1               ; -d-d-D-D d d D D
    pshufw       m1, m0, 0xFA         ; -d-d-d-d-D-D-D-D
    punpcklwd    m0, m0               ;  d d d d D D D D
    lea          r6, [r3*3]
    DC_ADD_MMXEXT_OP movq, r0, r3, r6
    ret

ALIGN 16
INIT_XMM sse2
; Two horizontally adjacent 4x4 inverse transforms done at once: the low
; halves of m0-m3 come from the block at r2, the high halves from r2+32.
; The 64 bytes of coefficients at r2 are zeroed afterwards.
; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
h264_add8x4_idct_sse2:
    movsxdifnidn r3, r3d
    movq         m0, [r2+ 0]
    movq         m1, [r2+ 8]
    movq         m2, [r2+16]
    movq         m3, [r2+24]
    movhps       m0, [r2+32]
    movhps       m1, [r2+40]
    movhps       m2, [r2+48]
    movhps       m3, [r2+56]
    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    TRANSPOSE2x4x4W 0, 1, 2, 3, 4
    paddw        m0, [pw_32]
    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    pxor         m7, m7
    mova    [r2+ 0], m7
    mova    [r2+16], m7
    mova    [r2+32], m7
    mova    [r2+48], m7
    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3
    lea          r0, [r0+r3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3
    ret

; One step of the unrolled add16 loop, handling a pair of 4x4 blocks.
; %1=step number (0-7), %2=byte offset into nnzc of the pair's two entries
; The pair is skipped when the 16-bit word at nnzc+%2 is zero. r2 advances
; by 64 bytes after every step except the last. Clobbers r0.
%macro add16_sse2_cycle 2
    movzx        r0, word [r4+%2]
    test         r0, r0
    jz .cycle%1end
    mov         r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add          r0, r5
%else
    add          r0, r0m
%endif
    call h264_add8x4_idct_sse2
.cycle%1end:
%if %1 < 7
    add          r2, 64
%endif
%endmacro

; void ff_h264_idct_add16_8_sse2(uint8_t *dst, const int *block_offset,
;                                int16_t *block, int stride,
;                                const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8
    movsxdifnidn r3, r3d
%if ARCH_X86_64
    mov          r5, r0                     ; keep dst; r0 is reused per block pair
%endif
    ; unrolling of the loop leads to an average performance gain of
    ; 20-25%
    add16_sse2_cycle 0, 0xc
    add16_sse2_cycle 1, 0x14
    add16_sse2_cycle 2, 0xe
    add16_sse2_cycle 3, 0x16
    add16_sse2_cycle 4, 0x1c
    add16_sse2_cycle 5, 0x24
    add16_sse2_cycle 6, 0x1e
    add16_sse2_cycle 7, 0x26
REP_RET

; One step of the unrolled add16intra loop, handling a pair of 4x4 blocks.
; %1=step number (0-7), %2=byte offset into nnzc of the pair's two entries
; Nonzero nnzc word -> full transform of the pair; otherwise, if either DC
; coefficient (r2, r2+32) is nonzero -> DC-only add; otherwise skip.
; Expects dst in r7 on x86-64 (first stack argument on x86-32).
; r2 advances by 64 bytes after every step except the last. Clobbers r0.
%macro add16intra_sse2_cycle 2
    movzx        r0, word [r4+%2]
    test         r0, r0
    jz .try%1dc
    mov         r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add          r0, r7
%else
    add          r0, r0m
%endif
    call h264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc:
    movsx        r0, word [r2   ]
    or          r0w, word [r2+32]
    jz .cycle%1end
    mov         r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add          r0, r7
%else
    add          r0, r0m
%endif
    call h264_idct_dc_add8_mmxext
.cycle%1end:
%if %1 < 7
    add          r2, 64
%endif
%endmacro

; void ff_h264_idct_add16intra_8_sse2(uint8_t *dst, const int *block_offset,
;                                     int16_t *block, int stride,
544cabdff1aSopenharmony_ci; const uint8_t nnzc[6 * 8]) 545cabdff1aSopenharmony_cicglobal h264_idct_add16intra_8, 5, 7 + ARCH_X86_64, 8 546cabdff1aSopenharmony_ci movsxdifnidn r3, r3d 547cabdff1aSopenharmony_ci%if ARCH_X86_64 548cabdff1aSopenharmony_ci mov r7, r0 549cabdff1aSopenharmony_ci%endif 550cabdff1aSopenharmony_ci add16intra_sse2_cycle 0, 0xc 551cabdff1aSopenharmony_ci add16intra_sse2_cycle 1, 0x14 552cabdff1aSopenharmony_ci add16intra_sse2_cycle 2, 0xe 553cabdff1aSopenharmony_ci add16intra_sse2_cycle 3, 0x16 554cabdff1aSopenharmony_ci add16intra_sse2_cycle 4, 0x1c 555cabdff1aSopenharmony_ci add16intra_sse2_cycle 5, 0x24 556cabdff1aSopenharmony_ci add16intra_sse2_cycle 6, 0x1e 557cabdff1aSopenharmony_ci add16intra_sse2_cycle 7, 0x26 558cabdff1aSopenharmony_ciREP_RET 559cabdff1aSopenharmony_ci 560cabdff1aSopenharmony_ci%macro add8_sse2_cycle 2 561cabdff1aSopenharmony_ci movzx r0, word [r4+%2] 562cabdff1aSopenharmony_ci test r0, r0 563cabdff1aSopenharmony_ci jz .try%1dc 564cabdff1aSopenharmony_ci%if ARCH_X86_64 565cabdff1aSopenharmony_ci mov r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))] 566cabdff1aSopenharmony_ci add r0, [r7] 567cabdff1aSopenharmony_ci%else 568cabdff1aSopenharmony_ci mov r0, r0m 569cabdff1aSopenharmony_ci mov r0, [r0] 570cabdff1aSopenharmony_ci add r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))] 571cabdff1aSopenharmony_ci%endif 572cabdff1aSopenharmony_ci call h264_add8x4_idct_sse2 573cabdff1aSopenharmony_ci jmp .cycle%1end 574cabdff1aSopenharmony_ci.try%1dc: 575cabdff1aSopenharmony_ci movsx r0, word [r2 ] 576cabdff1aSopenharmony_ci or r0w, word [r2+32] 577cabdff1aSopenharmony_ci jz .cycle%1end 578cabdff1aSopenharmony_ci%if ARCH_X86_64 579cabdff1aSopenharmony_ci mov r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))] 580cabdff1aSopenharmony_ci add r0, [r7] 581cabdff1aSopenharmony_ci%else 582cabdff1aSopenharmony_ci mov r0, r0m 583cabdff1aSopenharmony_ci mov r0, [r0] 584cabdff1aSopenharmony_ci add r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))] 585cabdff1aSopenharmony_ci%endif 
586cabdff1aSopenharmony_ci call h264_idct_dc_add8_mmxext 587cabdff1aSopenharmony_ci.cycle%1end: 588cabdff1aSopenharmony_ci%if %1 == 1 589cabdff1aSopenharmony_ci add r2, 384+64 590cabdff1aSopenharmony_ci%elif %1 < 3 591cabdff1aSopenharmony_ci add r2, 64 592cabdff1aSopenharmony_ci%endif 593cabdff1aSopenharmony_ci%endmacro 594cabdff1aSopenharmony_ci 595cabdff1aSopenharmony_ci; void ff_h264_idct_add8_8_sse2(uint8_t **dest, const int *block_offset, 596cabdff1aSopenharmony_ci; int16_t *block, int stride, 597cabdff1aSopenharmony_ci; const uint8_t nnzc[6 * 8]) 598cabdff1aSopenharmony_cicglobal h264_idct_add8_8, 5, 7 + ARCH_X86_64, 8 599cabdff1aSopenharmony_ci movsxdifnidn r3, r3d 600cabdff1aSopenharmony_ci add r2, 512 601cabdff1aSopenharmony_ci%if ARCH_X86_64 602cabdff1aSopenharmony_ci mov r7, r0 603cabdff1aSopenharmony_ci%endif 604cabdff1aSopenharmony_ci add8_sse2_cycle 0, 0x34 605cabdff1aSopenharmony_ci add8_sse2_cycle 1, 0x3c 606cabdff1aSopenharmony_ci%if ARCH_X86_64 607cabdff1aSopenharmony_ci add r7, gprsize 608cabdff1aSopenharmony_ci%else 609cabdff1aSopenharmony_ci add r0mp, gprsize 610cabdff1aSopenharmony_ci%endif 611cabdff1aSopenharmony_ci add8_sse2_cycle 2, 0x5c 612cabdff1aSopenharmony_ci add8_sse2_cycle 3, 0x64 613cabdff1aSopenharmony_ciREP_RET 614cabdff1aSopenharmony_ci 615cabdff1aSopenharmony_ci;void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul) 616cabdff1aSopenharmony_ci 617cabdff1aSopenharmony_ci%macro WALSH4_1D 5 618cabdff1aSopenharmony_ci SUMSUB_BADC w, %4, %3, %2, %1, %5 619cabdff1aSopenharmony_ci SUMSUB_BADC w, %4, %2, %3, %1, %5 620cabdff1aSopenharmony_ci SWAP %1, %4, %3 621cabdff1aSopenharmony_ci%endmacro 622cabdff1aSopenharmony_ci 623cabdff1aSopenharmony_ci%macro DEQUANT 1-3 624cabdff1aSopenharmony_ci%if cpuflag(sse2) 625cabdff1aSopenharmony_ci movd xmm4, t3d 626cabdff1aSopenharmony_ci movq xmm5, [pw_1] 627cabdff1aSopenharmony_ci pshufd xmm4, xmm4, 0 628cabdff1aSopenharmony_ci movq2dq xmm0, m0 629cabdff1aSopenharmony_ci 
movq2dq xmm1, m1 630cabdff1aSopenharmony_ci movq2dq xmm2, m2 631cabdff1aSopenharmony_ci movq2dq xmm3, m3 632cabdff1aSopenharmony_ci punpcklwd xmm0, xmm5 633cabdff1aSopenharmony_ci punpcklwd xmm1, xmm5 634cabdff1aSopenharmony_ci punpcklwd xmm2, xmm5 635cabdff1aSopenharmony_ci punpcklwd xmm3, xmm5 636cabdff1aSopenharmony_ci pmaddwd xmm0, xmm4 637cabdff1aSopenharmony_ci pmaddwd xmm1, xmm4 638cabdff1aSopenharmony_ci pmaddwd xmm2, xmm4 639cabdff1aSopenharmony_ci pmaddwd xmm3, xmm4 640cabdff1aSopenharmony_ci psrad xmm0, %1 641cabdff1aSopenharmony_ci psrad xmm1, %1 642cabdff1aSopenharmony_ci psrad xmm2, %1 643cabdff1aSopenharmony_ci psrad xmm3, %1 644cabdff1aSopenharmony_ci packssdw xmm0, xmm1 645cabdff1aSopenharmony_ci packssdw xmm2, xmm3 646cabdff1aSopenharmony_ci%else 647cabdff1aSopenharmony_ci mova m7, [pw_1] 648cabdff1aSopenharmony_ci mova m4, %1 649cabdff1aSopenharmony_ci punpcklwd %1, m7 650cabdff1aSopenharmony_ci punpckhwd m4, m7 651cabdff1aSopenharmony_ci mova m5, %2 652cabdff1aSopenharmony_ci punpcklwd %2, m7 653cabdff1aSopenharmony_ci punpckhwd m5, m7 654cabdff1aSopenharmony_ci movd m7, t3d 655cabdff1aSopenharmony_ci punpckldq m7, m7 656cabdff1aSopenharmony_ci pmaddwd %1, m7 657cabdff1aSopenharmony_ci pmaddwd %2, m7 658cabdff1aSopenharmony_ci pmaddwd m4, m7 659cabdff1aSopenharmony_ci pmaddwd m5, m7 660cabdff1aSopenharmony_ci psrad %1, %3 661cabdff1aSopenharmony_ci psrad %2, %3 662cabdff1aSopenharmony_ci psrad m4, %3 663cabdff1aSopenharmony_ci psrad m5, %3 664cabdff1aSopenharmony_ci packssdw %1, m4 665cabdff1aSopenharmony_ci packssdw %2, m5 666cabdff1aSopenharmony_ci%endif 667cabdff1aSopenharmony_ci%endmacro 668cabdff1aSopenharmony_ci 669cabdff1aSopenharmony_ci%macro STORE_WORDS 5-9 670cabdff1aSopenharmony_ci%if cpuflag(sse) 671cabdff1aSopenharmony_ci movd t0d, %1 672cabdff1aSopenharmony_ci psrldq %1, 4 673cabdff1aSopenharmony_ci movd t1d, %1 674cabdff1aSopenharmony_ci psrldq %1, 4 675cabdff1aSopenharmony_ci mov [t2+%2*32], t0w 676cabdff1aSopenharmony_ci mov 
[t2+%4*32], t1w
    ; (tail of STORE_WORDS, SSE2 branch; the head of the macro precedes this span)
    shr         t0d, 16
    shr         t1d, 16
    mov  [t2+%3*32], t0w
    mov  [t2+%5*32], t1w
    movd        t0d, %1
    psrldq       %1, 4
    movd        t1d, %1
    mov  [t2+%6*32], t0w
    mov  [t2+%8*32], t1w
    shr         t0d, 16
    shr         t1d, 16
    mov  [t2+%7*32], t0w
    mov  [t2+%9*32], t1w
%else
    ; Non-SSE2 branch: %1 holds four words; word k goes to [t2 + idx_k*32].
    movd        t0d, %1
    psrlq        %1, 32
    movd        t1d, %1
    mov  [t2+%2*32], t0w
    mov  [t2+%4*32], t1w
    shr         t0d, 16
    shr         t1d, 16
    mov  [t2+%3*32], t0w
    mov  [t2+%5*32], t1w
%endif
%endmacro

;-----------------------------------------------------------------------------
; DEQUANT_STORE shift
;   %1 = shift operand forwarded to DEQUANT; the callers in this file pass
;        either the immediate 8 or xmm6.
; Runs DEQUANT and scatters the resulting words through STORE_WORDS to
; [t2 + n*32] for n = 0..15.  The index lists give the destination slot of
; each successive word of the source register.
; NOTE(review): DEQUANT is defined outside this span; presumably it leaves its
; results in xmm0/xmm2 (SSE2) or in the named mm registers -- confirm there.
;-----------------------------------------------------------------------------
%macro DEQUANT_STORE 1
%if cpuflag(sse2)
    DEQUANT     %1
    STORE_WORDS xmm0,  0,  1,  4,  5,  2,  3,  6,  7
    STORE_WORDS xmm2,  8,  9, 12, 13, 10, 11, 14, 15
%else
    DEQUANT     m0, m1, %1
    STORE_WORDS m0,  0,  1,  4,  5
    STORE_WORDS m1,  2,  3,  6,  7

    DEQUANT     m2, m3, %1
    STORE_WORDS m2,  8,  9, 12, 13
    STORE_WORDS m3, 10, 11, 14, 15
%endif
%endmacro

;-----------------------------------------------------------------------------
; IDCT_DC_DEQUANT num_xmm
;   %1 = number of XMM registers declared to cglobal / WIN64_SPILL_XMM.
; Emits h264_luma_dc_dequant_idct (3 arguments, 4 GPRs):
;   r0 = output base (words are stored at [output + n*32])
;   r1 = input, 16 words read as four qwords at offsets 0, 8, 16, 24
;   r2 = qmul
; The input goes through WALSH4_1D, TRANSPOSE4x4W, WALSH4_1D and is then
; dequantised and scattered by DEQUANT_STORE.
;-----------------------------------------------------------------------------
%macro IDCT_DC_DEQUANT 1
cglobal h264_luma_dc_dequant_idct, 3, 4, %1
    ; manually spill XMM registers for Win64 because
    ; the code here is initialized with INIT_MMX
    WIN64_SPILL_XMM %1
    movq        m3, [r1+24]
    movq        m2, [r1+16]
    movq        m1, [r1+ 8]
    movq        m0, [r1+ 0]
    WALSH4_1D    0,1,2,3,4
    TRANSPOSE4x4W 0,1,2,3,4
    WALSH4_1D    0,1,2,3,4

; Temporary register roles: t0 = shift, t1 = tmp, t2 = output, t3 = qmul
%if WIN64
    DECLARE_REG_TMP 0,3,1,2
    ; we can't avoid this, because r0 is the shift register (ecx) on win64
    xchg        r0, t2                  ; output moves to t2 (r1), freeing r0 for the shift count
%elif ARCH_X86_64
    DECLARE_REG_TMP 3,1,0,2
%else
    DECLARE_REG_TMP 1,3,0,2
%endif

    cmp        t3d, 32767
    jg .big_qmul                        ; signed compare: qmul does not fit in a positive int16
    add        t3d, 128 << 16           ; put the constant 128 into the upper word next to qmul
    DEQUANT_STORE 8
    RET
.big_qmul:
    ; Pre-shift qmul right by s = min(bsr(qmul), 7) and hand the remaining
    ; 8 - s to DEQUANT as the shift count.
    ; NOTE(review): this path is only reached with qmul > 32767, so bsr is
    ; always >= 15 and s is always 7 here -- confirm whether that is intended.
    bsr        t0d, t3d                 ; t0d = index of the highest set bit of qmul
    add        t3d, 128 << 16
    mov        t1d, 7
    cmp        t0d, t1d
    cmovg      t0d, t1d                 ; t0d = min(t0d, 7)
    inc        t1d                      ; t1d = 8
    shr        t3d, t0b
    sub        t1d, t0d                 ; t1d = 8 - s
    movd      xmm6, t1d
    DEQUANT_STORE xmm6
    RET
%endmacro

INIT_MMX sse2
IDCT_DC_DEQUANT 7

; The guarded directive needs NASM 2.04 or newer.  Compare major/minor as a
; proper version: testing "major >= 2 && minor >= 4" would wrongly reject
; releases such as 3.0, whose minor number is below 4.
%ifdef __NASM_VER__
%if __NASM_MAJOR__ > 2 || (__NASM_MAJOR__ == 2 && __NASM_MINOR__ >= 4)
%unmacro STORE_DIFFx2 8 ; remove macro from x86util.asm but yasm doesn't have this yet
%endif
%endif

;-----------------------------------------------------------------------------
; STORE_DIFFx2 add1, add2, reg1, reg2, zero, shift, source, stride
;   %1, %2 = word vectors holding the residual for row 0 / row 1 (shifted in place)
;   %3, %4 = scratch vector registers (clobbered)
;   %5     = vector register used as the zero operand for unpack/pack
;   %6     = arithmetic right-shift count applied to %1 and %2
;   %7     = address of row 0;  %8 = offset from row 0 to row 1
; For each of the two rows: loads 4 bytes, widens them to words, adds the
; shifted residual, packs back to bytes with unsigned saturation and stores
; the 4 bytes to the same address.
;-----------------------------------------------------------------------------
%macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride
    movd        %3, [%7]                ; row 0 pixels
    movd        %4, [%7+%8]             ; row 1 pixels
    psraw       %1, %6
    psraw       %2, %6
    punpcklbw   %3, %5                  ; bytes -> words
    punpcklbw   %4, %5
    paddw       %3, %1
    paddw       %4, %2
    packuswb    %3, %5                  ; words -> bytes, saturated to 0..255
    packuswb    %4, %5
    movd      [%7], %3
    movd   [%7+%8], %4
%endmacro

;-----------------------------------------------------------------------------
; DC_ADD_INIT gpr
;   %1 = general-purpose register holding the DC coefficient (named without
;        size suffix; the macro uses both %1d and %1).
; On exit:
;   m0 = bytes of max( dc', 0) saturated to 255, where dc' = (dc + 32) >> 6
;   m1 = bytes of max(-dc', 0) saturated to 255
;   %1 = 3 * stride_q
; Requires stride_q to be defined by the enclosing cglobal.
;-----------------------------------------------------------------------------
%macro DC_ADD_INIT 1
    add        %1d, 32                  ; rounding term
    sar        %1d, 6
    movd        m0, %1d
    lea         %1, [3*stride_q]        ; DC value now lives in m0; reuse the GPR
    pshuflw     m0, m0, 0               ; replicate word 0 across the low four words
    pxor        m1, m1
    psubw       m1, m0                  ; m1 = -m0
    packuswb    m0, m0
    packuswb    m1, m1
%endmacro

;-----------------------------------------------------------------------------
; IDCT_XMM cpuname
;   %1 = instruction-set name handed to INIT_XMM.
; Emits, for that instruction set:
;   h264_idct_add_8(dst, block, stride)    -- IDCT4_ADD on one block
;   h264_idct_dc_add_8(dst, block, stride) -- DC-only variant
;-----------------------------------------------------------------------------
%macro IDCT_XMM 1

INIT_XMM %1

cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_
    movsxdifnidn stride_q, stride_d
    IDCT4_ADD    dst_q, block_q, stride_q
RET

cglobal h264_idct_dc_add_8, 3, 4, 6, dst_, block_, stride_
    movsxdifnidn stride_q, stride_d
    movsx        r3d, word [block_q]    ; signed 16-bit DC coefficient
    mov          dword [block_q], 0     ; zero the first 32 bits of the block
    DC_ADD_INIT  r3
    DC_ADD_MMXEXT_OP movd, dst_q, stride_q, r3
RET

%endmacro

IDCT_XMM sse2
IDCT_XMM avx