1cabdff1aSopenharmony_ci;****************************************************************************** 2cabdff1aSopenharmony_ci;* VP9 inverse transform x86 SIMD optimizations 3cabdff1aSopenharmony_ci;* 4cabdff1aSopenharmony_ci;* Copyright (C) 2015 Ronald S. Bultje <rsbultje gmail com> 5cabdff1aSopenharmony_ci;* 6cabdff1aSopenharmony_ci;* This file is part of FFmpeg. 7cabdff1aSopenharmony_ci;* 8cabdff1aSopenharmony_ci;* FFmpeg is free software; you can redistribute it and/or 9cabdff1aSopenharmony_ci;* modify it under the terms of the GNU Lesser General Public 10cabdff1aSopenharmony_ci;* License as published by the Free Software Foundation; either 11cabdff1aSopenharmony_ci;* version 2.1 of the License, or (at your option) any later version. 12cabdff1aSopenharmony_ci;* 13cabdff1aSopenharmony_ci;* FFmpeg is distributed in the hope that it will be useful, 14cabdff1aSopenharmony_ci;* but WITHOUT ANY WARRANTY; without even the implied warranty of 15cabdff1aSopenharmony_ci;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16cabdff1aSopenharmony_ci;* Lesser General Public License for more details. 
17cabdff1aSopenharmony_ci;* 18cabdff1aSopenharmony_ci;* You should have received a copy of the GNU Lesser General Public 19cabdff1aSopenharmony_ci;* License along with FFmpeg; if not, write to the Free Software 20cabdff1aSopenharmony_ci;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 21cabdff1aSopenharmony_ci;****************************************************************************** 22cabdff1aSopenharmony_ci 23cabdff1aSopenharmony_ci%include "libavutil/x86/x86util.asm" 24cabdff1aSopenharmony_ci%include "vp9itxfm_template.asm" 25cabdff1aSopenharmony_ci 26cabdff1aSopenharmony_ciSECTION_RODATA 27cabdff1aSopenharmony_ci 28cabdff1aSopenharmony_cicextern pw_8 29cabdff1aSopenharmony_cicextern pw_1023 30cabdff1aSopenharmony_cicextern pw_2048 31cabdff1aSopenharmony_cicextern pw_4095 32cabdff1aSopenharmony_cicextern pw_m1 33cabdff1aSopenharmony_cicextern pd_1 34cabdff1aSopenharmony_cicextern pd_16 35cabdff1aSopenharmony_cicextern pd_32 36cabdff1aSopenharmony_cicextern pd_8192 37cabdff1aSopenharmony_ci 38cabdff1aSopenharmony_cipd_8: times 4 dd 8 39cabdff1aSopenharmony_cipd_3fff: times 4 dd 0x3fff 40cabdff1aSopenharmony_ci 41cabdff1aSopenharmony_cicextern pw_11585x2 42cabdff1aSopenharmony_ci 43cabdff1aSopenharmony_cicextern pw_5283_13377 44cabdff1aSopenharmony_cicextern pw_9929_13377 45cabdff1aSopenharmony_cicextern pw_15212_m13377 46cabdff1aSopenharmony_cicextern pw_15212_9929 47cabdff1aSopenharmony_cicextern pw_m5283_m15212 48cabdff1aSopenharmony_cicextern pw_13377x2 49cabdff1aSopenharmony_cicextern pw_m13377_13377 50cabdff1aSopenharmony_cicextern pw_13377_0 51cabdff1aSopenharmony_ci 52cabdff1aSopenharmony_cipw_9929_m5283: times 4 dw 9929, -5283 53cabdff1aSopenharmony_ci 54cabdff1aSopenharmony_ci%macro COEF_PAIR 2-3 55cabdff1aSopenharmony_cicextern pw_m%1_%2 56cabdff1aSopenharmony_cicextern pw_%2_%1 57cabdff1aSopenharmony_ci%if %0 == 3 58cabdff1aSopenharmony_cicextern pw_m%1_m%2 59cabdff1aSopenharmony_ci%if %1 != %2 
60cabdff1aSopenharmony_cicextern pw_m%2_%1 61cabdff1aSopenharmony_cicextern pw_%1_%2 62cabdff1aSopenharmony_ci%endif 63cabdff1aSopenharmony_ci%endif 64cabdff1aSopenharmony_ci%endmacro 65cabdff1aSopenharmony_ci 66cabdff1aSopenharmony_ciCOEF_PAIR 2404, 16207 67cabdff1aSopenharmony_ciCOEF_PAIR 3196, 16069, 1 68cabdff1aSopenharmony_ciCOEF_PAIR 4756, 15679 69cabdff1aSopenharmony_ciCOEF_PAIR 5520, 15426 70cabdff1aSopenharmony_ciCOEF_PAIR 6270, 15137, 1 71cabdff1aSopenharmony_ciCOEF_PAIR 8423, 14053 72cabdff1aSopenharmony_ciCOEF_PAIR 10394, 12665 73cabdff1aSopenharmony_ciCOEF_PAIR 11003, 12140 74cabdff1aSopenharmony_ciCOEF_PAIR 11585, 11585, 1 75cabdff1aSopenharmony_ciCOEF_PAIR 13160, 9760 76cabdff1aSopenharmony_ciCOEF_PAIR 13623, 9102, 1 77cabdff1aSopenharmony_ciCOEF_PAIR 14449, 7723 78cabdff1aSopenharmony_ciCOEF_PAIR 14811, 7005 79cabdff1aSopenharmony_ciCOEF_PAIR 15893, 3981 80cabdff1aSopenharmony_ciCOEF_PAIR 16305, 1606 81cabdff1aSopenharmony_ciCOEF_PAIR 16364, 804 82cabdff1aSopenharmony_ci 83cabdff1aSopenharmony_cidefault_8x8: 84cabdff1aSopenharmony_citimes 12 db 1 85cabdff1aSopenharmony_citimes 52 db 2 86cabdff1aSopenharmony_cirow_8x8: 87cabdff1aSopenharmony_citimes 18 db 1 88cabdff1aSopenharmony_citimes 46 db 2 89cabdff1aSopenharmony_cicol_8x8: 90cabdff1aSopenharmony_citimes 6 db 1 91cabdff1aSopenharmony_citimes 58 db 2 92cabdff1aSopenharmony_cidefault_16x16: 93cabdff1aSopenharmony_citimes 10 db 1 94cabdff1aSopenharmony_citimes 28 db 2 95cabdff1aSopenharmony_citimes 51 db 3 96cabdff1aSopenharmony_citimes 167 db 4 97cabdff1aSopenharmony_cirow_16x16: 98cabdff1aSopenharmony_citimes 21 db 1 99cabdff1aSopenharmony_citimes 45 db 2 100cabdff1aSopenharmony_citimes 60 db 3 101cabdff1aSopenharmony_citimes 130 db 4 102cabdff1aSopenharmony_cicol_16x16: 103cabdff1aSopenharmony_citimes 5 db 1 104cabdff1aSopenharmony_citimes 12 db 2 105cabdff1aSopenharmony_citimes 25 db 3 106cabdff1aSopenharmony_citimes 214 db 4 107cabdff1aSopenharmony_cidefault_32x32: 
108cabdff1aSopenharmony_citimes 9 db 1 109cabdff1aSopenharmony_citimes 25 db 2 110cabdff1aSopenharmony_citimes 36 db 3 111cabdff1aSopenharmony_citimes 65 db 4 112cabdff1aSopenharmony_citimes 105 db 5 113cabdff1aSopenharmony_citimes 96 db 6 114cabdff1aSopenharmony_citimes 112 db 7 115cabdff1aSopenharmony_citimes 576 db 8 116cabdff1aSopenharmony_ci 117cabdff1aSopenharmony_ciSECTION .text 118cabdff1aSopenharmony_ci 119cabdff1aSopenharmony_ci%macro VP9_STORE_2X 6-7 dstq ; reg1, reg2, tmp1, tmp2, min, max, dst 120cabdff1aSopenharmony_ci mova m%3, [%7] 121cabdff1aSopenharmony_ci mova m%4, [%7+strideq] 122cabdff1aSopenharmony_ci paddw m%3, m%1 123cabdff1aSopenharmony_ci paddw m%4, m%2 124cabdff1aSopenharmony_ci pmaxsw m%3, m%5 125cabdff1aSopenharmony_ci pmaxsw m%4, m%5 126cabdff1aSopenharmony_ci pminsw m%3, m%6 127cabdff1aSopenharmony_ci pminsw m%4, m%6 128cabdff1aSopenharmony_ci mova [%7], m%3 129cabdff1aSopenharmony_ci mova [%7+strideq], m%4 130cabdff1aSopenharmony_ci%endmacro 131cabdff1aSopenharmony_ci 132cabdff1aSopenharmony_ci%macro ZERO_BLOCK 4 ; mem, stride, nnzcpl, zero_reg 133cabdff1aSopenharmony_ci%assign %%y 0 134cabdff1aSopenharmony_ci%rep %3 135cabdff1aSopenharmony_ci%assign %%x 0 136cabdff1aSopenharmony_ci%rep %3*4/mmsize 137cabdff1aSopenharmony_ci mova [%1+%%y+%%x], %4 138cabdff1aSopenharmony_ci%assign %%x (%%x+mmsize) 139cabdff1aSopenharmony_ci%endrep 140cabdff1aSopenharmony_ci%assign %%y (%%y+%2) 141cabdff1aSopenharmony_ci%endrep 142cabdff1aSopenharmony_ci%endmacro 143cabdff1aSopenharmony_ci 144cabdff1aSopenharmony_ci; the input coefficients are scaled up by 2 bit (which we downscale immediately 145cabdff1aSopenharmony_ci; in the iwht), and is otherwise orthonormally increased by 1 bit per iwht_1d. 146cabdff1aSopenharmony_ci; therefore, a diff of 10-12+sign bit will fit in 12-14+sign bit after scaling, 147cabdff1aSopenharmony_ci; i.e. everything can be done in 15+1bpp words. 
Since the quant fractional bits 148cabdff1aSopenharmony_ci; add 2 bits, we need to scale before converting to word in 12bpp, since the 149cabdff1aSopenharmony_ci; input will be 16+sign bit which doesn't fit in 15+sign words, but in 10bpp 150cabdff1aSopenharmony_ci; we can scale after converting to words (which is half the instructions), 151cabdff1aSopenharmony_ci; since the input is only 14+sign bit, which fits in 15+sign words directly. 152cabdff1aSopenharmony_ci 153cabdff1aSopenharmony_ci%macro IWHT4_FN 2 ; bpp, max 154cabdff1aSopenharmony_cicglobal vp9_iwht_iwht_4x4_add_%1, 3, 3, 8, dst, stride, block, eob 155cabdff1aSopenharmony_ci mova m7, [pw_%2] 156cabdff1aSopenharmony_ci mova m0, [blockq+0*16+0] 157cabdff1aSopenharmony_ci mova m1, [blockq+1*16+0] 158cabdff1aSopenharmony_ci%if %1 >= 12 159cabdff1aSopenharmony_ci mova m4, [blockq+0*16+8] 160cabdff1aSopenharmony_ci mova m5, [blockq+1*16+8] 161cabdff1aSopenharmony_ci psrad m0, 2 162cabdff1aSopenharmony_ci psrad m1, 2 163cabdff1aSopenharmony_ci psrad m4, 2 164cabdff1aSopenharmony_ci psrad m5, 2 165cabdff1aSopenharmony_ci packssdw m0, m4 166cabdff1aSopenharmony_ci packssdw m1, m5 167cabdff1aSopenharmony_ci%else 168cabdff1aSopenharmony_ci packssdw m0, [blockq+0*16+8] 169cabdff1aSopenharmony_ci packssdw m1, [blockq+1*16+8] 170cabdff1aSopenharmony_ci psraw m0, 2 171cabdff1aSopenharmony_ci psraw m1, 2 172cabdff1aSopenharmony_ci%endif 173cabdff1aSopenharmony_ci mova m2, [blockq+2*16+0] 174cabdff1aSopenharmony_ci mova m3, [blockq+3*16+0] 175cabdff1aSopenharmony_ci%if %1 >= 12 176cabdff1aSopenharmony_ci mova m4, [blockq+2*16+8] 177cabdff1aSopenharmony_ci mova m5, [blockq+3*16+8] 178cabdff1aSopenharmony_ci psrad m2, 2 179cabdff1aSopenharmony_ci psrad m3, 2 180cabdff1aSopenharmony_ci psrad m4, 2 181cabdff1aSopenharmony_ci psrad m5, 2 182cabdff1aSopenharmony_ci packssdw m2, m4 183cabdff1aSopenharmony_ci packssdw m3, m5 184cabdff1aSopenharmony_ci%else 185cabdff1aSopenharmony_ci packssdw m2, [blockq+2*16+8] 
186cabdff1aSopenharmony_ci packssdw m3, [blockq+3*16+8] 187cabdff1aSopenharmony_ci psraw m2, 2 188cabdff1aSopenharmony_ci psraw m3, 2 189cabdff1aSopenharmony_ci%endif 190cabdff1aSopenharmony_ci 191cabdff1aSopenharmony_ci VP9_IWHT4_1D 192cabdff1aSopenharmony_ci TRANSPOSE4x4W 0, 1, 2, 3, 4 193cabdff1aSopenharmony_ci VP9_IWHT4_1D 194cabdff1aSopenharmony_ci 195cabdff1aSopenharmony_ci pxor m6, m6 196cabdff1aSopenharmony_ci VP9_STORE_2X 0, 1, 4, 5, 6, 7 197cabdff1aSopenharmony_ci lea dstq, [dstq+strideq*2] 198cabdff1aSopenharmony_ci VP9_STORE_2X 2, 3, 4, 5, 6, 7 199cabdff1aSopenharmony_ci ZERO_BLOCK blockq, 16, 4, m6 200cabdff1aSopenharmony_ci RET 201cabdff1aSopenharmony_ci%endmacro 202cabdff1aSopenharmony_ci 203cabdff1aSopenharmony_ciINIT_MMX mmxext 204cabdff1aSopenharmony_ciIWHT4_FN 10, 1023 205cabdff1aSopenharmony_ciINIT_MMX mmxext 206cabdff1aSopenharmony_ciIWHT4_FN 12, 4095 207cabdff1aSopenharmony_ci 208cabdff1aSopenharmony_ci%macro VP9_IDCT4_WRITEOUT 0 209cabdff1aSopenharmony_ci%if cpuflag(ssse3) 210cabdff1aSopenharmony_ci mova m5, [pw_2048] 211cabdff1aSopenharmony_ci pmulhrsw m0, m5 212cabdff1aSopenharmony_ci pmulhrsw m1, m5 213cabdff1aSopenharmony_ci pmulhrsw m2, m5 214cabdff1aSopenharmony_ci pmulhrsw m3, m5 215cabdff1aSopenharmony_ci%else 216cabdff1aSopenharmony_ci mova m5, [pw_8] 217cabdff1aSopenharmony_ci paddw m0, m5 218cabdff1aSopenharmony_ci paddw m1, m5 219cabdff1aSopenharmony_ci paddw m2, m5 220cabdff1aSopenharmony_ci paddw m3, m5 221cabdff1aSopenharmony_ci psraw m0, 4 222cabdff1aSopenharmony_ci psraw m1, 4 223cabdff1aSopenharmony_ci psraw m2, 4 224cabdff1aSopenharmony_ci psraw m3, 4 225cabdff1aSopenharmony_ci%endif 226cabdff1aSopenharmony_ci mova m5, [pw_1023] 227cabdff1aSopenharmony_ci VP9_STORE_2X 0, 1, 6, 7, 4, 5 228cabdff1aSopenharmony_ci lea dstq, [dstq+2*strideq] 229cabdff1aSopenharmony_ci VP9_STORE_2X 2, 3, 6, 7, 4, 5 230cabdff1aSopenharmony_ci%endmacro 231cabdff1aSopenharmony_ci 232cabdff1aSopenharmony_ci%macro DC_ONLY 2 ; shift, zero 
233cabdff1aSopenharmony_ci mov coefd, dword [blockq] 234cabdff1aSopenharmony_ci movd [blockq], %2 235cabdff1aSopenharmony_ci imul coefd, 11585 236cabdff1aSopenharmony_ci add coefd, 8192 237cabdff1aSopenharmony_ci sar coefd, 14 238cabdff1aSopenharmony_ci imul coefd, 11585 239cabdff1aSopenharmony_ci add coefd, ((1 << (%1 - 1)) << 14) + 8192 240cabdff1aSopenharmony_ci sar coefd, 14 + %1 241cabdff1aSopenharmony_ci%endmacro 242cabdff1aSopenharmony_ci 243cabdff1aSopenharmony_ci; 4x4 coefficients are 5+depth+sign bits, so for 10bpp, everything still fits 244cabdff1aSopenharmony_ci; in 15+1 words without additional effort, since the coefficients are 15bpp. 245cabdff1aSopenharmony_ci 246cabdff1aSopenharmony_ci%macro IDCT4_10_FN 0 247cabdff1aSopenharmony_cicglobal vp9_idct_idct_4x4_add_10, 4, 4, 8, dst, stride, block, eob 248cabdff1aSopenharmony_ci cmp eobd, 1 249cabdff1aSopenharmony_ci jg .idctfull 250cabdff1aSopenharmony_ci 251cabdff1aSopenharmony_ci ; dc-only 252cabdff1aSopenharmony_ci pxor m4, m4 253cabdff1aSopenharmony_ci%if cpuflag(ssse3) 254cabdff1aSopenharmony_ci movd m0, [blockq] 255cabdff1aSopenharmony_ci movd [blockq], m4 256cabdff1aSopenharmony_ci mova m5, [pw_11585x2] 257cabdff1aSopenharmony_ci pmulhrsw m0, m5 258cabdff1aSopenharmony_ci pmulhrsw m0, m5 259cabdff1aSopenharmony_ci%else 260cabdff1aSopenharmony_ci DEFINE_ARGS dst, stride, block, coef 261cabdff1aSopenharmony_ci DC_ONLY 4, m4 262cabdff1aSopenharmony_ci movd m0, coefd 263cabdff1aSopenharmony_ci%endif 264cabdff1aSopenharmony_ci pshufw m0, m0, 0 265cabdff1aSopenharmony_ci mova m5, [pw_1023] 266cabdff1aSopenharmony_ci%if cpuflag(ssse3) 267cabdff1aSopenharmony_ci pmulhrsw m0, [pw_2048] ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4 268cabdff1aSopenharmony_ci%endif 269cabdff1aSopenharmony_ci VP9_STORE_2X 0, 0, 6, 7, 4, 5 270cabdff1aSopenharmony_ci lea dstq, [dstq+2*strideq] 271cabdff1aSopenharmony_ci VP9_STORE_2X 0, 0, 6, 7, 4, 5 272cabdff1aSopenharmony_ci RET 273cabdff1aSopenharmony_ci 
274cabdff1aSopenharmony_ci.idctfull: 275cabdff1aSopenharmony_ci mova m0, [blockq+0*16+0] 276cabdff1aSopenharmony_ci mova m1, [blockq+1*16+0] 277cabdff1aSopenharmony_ci packssdw m0, [blockq+0*16+8] 278cabdff1aSopenharmony_ci packssdw m1, [blockq+1*16+8] 279cabdff1aSopenharmony_ci mova m2, [blockq+2*16+0] 280cabdff1aSopenharmony_ci mova m3, [blockq+3*16+0] 281cabdff1aSopenharmony_ci packssdw m2, [blockq+2*16+8] 282cabdff1aSopenharmony_ci packssdw m3, [blockq+3*16+8] 283cabdff1aSopenharmony_ci 284cabdff1aSopenharmony_ci%if cpuflag(ssse3) 285cabdff1aSopenharmony_ci mova m6, [pw_11585x2] 286cabdff1aSopenharmony_ci%endif 287cabdff1aSopenharmony_ci mova m7, [pd_8192] ; rounding 288cabdff1aSopenharmony_ci VP9_IDCT4_1D 289cabdff1aSopenharmony_ci TRANSPOSE4x4W 0, 1, 2, 3, 4 290cabdff1aSopenharmony_ci VP9_IDCT4_1D 291cabdff1aSopenharmony_ci 292cabdff1aSopenharmony_ci pxor m4, m4 293cabdff1aSopenharmony_ci ZERO_BLOCK blockq, 16, 4, m4 294cabdff1aSopenharmony_ci VP9_IDCT4_WRITEOUT 295cabdff1aSopenharmony_ci RET 296cabdff1aSopenharmony_ci%endmacro 297cabdff1aSopenharmony_ci 298cabdff1aSopenharmony_ciINIT_MMX mmxext 299cabdff1aSopenharmony_ciIDCT4_10_FN 300cabdff1aSopenharmony_ciINIT_MMX ssse3 301cabdff1aSopenharmony_ciIDCT4_10_FN 302cabdff1aSopenharmony_ci 303cabdff1aSopenharmony_ci%macro IADST4_FN 4 304cabdff1aSopenharmony_cicglobal vp9_%1_%3_4x4_add_10, 3, 3, 0, dst, stride, block, eob 305cabdff1aSopenharmony_ci%if WIN64 && notcpuflag(ssse3) 306cabdff1aSopenharmony_ci WIN64_SPILL_XMM 8 307cabdff1aSopenharmony_ci%endif 308cabdff1aSopenharmony_ci movdqa xmm5, [pd_8192] 309cabdff1aSopenharmony_ci mova m0, [blockq+0*16+0] 310cabdff1aSopenharmony_ci mova m1, [blockq+1*16+0] 311cabdff1aSopenharmony_ci packssdw m0, [blockq+0*16+8] 312cabdff1aSopenharmony_ci packssdw m1, [blockq+1*16+8] 313cabdff1aSopenharmony_ci mova m2, [blockq+2*16+0] 314cabdff1aSopenharmony_ci mova m3, [blockq+3*16+0] 315cabdff1aSopenharmony_ci packssdw m2, [blockq+2*16+8] 316cabdff1aSopenharmony_ci packssdw m3, 
[blockq+3*16+8] 317cabdff1aSopenharmony_ci 318cabdff1aSopenharmony_ci%if cpuflag(ssse3) 319cabdff1aSopenharmony_ci mova m6, [pw_11585x2] 320cabdff1aSopenharmony_ci%endif 321cabdff1aSopenharmony_ci%ifnidn %1%3, iadstiadst 322cabdff1aSopenharmony_ci movdq2q m7, xmm5 323cabdff1aSopenharmony_ci%endif 324cabdff1aSopenharmony_ci VP9_%2_1D 325cabdff1aSopenharmony_ci TRANSPOSE4x4W 0, 1, 2, 3, 4 326cabdff1aSopenharmony_ci VP9_%4_1D 327cabdff1aSopenharmony_ci 328cabdff1aSopenharmony_ci pxor m4, m4 329cabdff1aSopenharmony_ci ZERO_BLOCK blockq, 16, 4, m4 330cabdff1aSopenharmony_ci VP9_IDCT4_WRITEOUT 331cabdff1aSopenharmony_ci RET 332cabdff1aSopenharmony_ci%endmacro 333cabdff1aSopenharmony_ci 334cabdff1aSopenharmony_ciINIT_MMX sse2 335cabdff1aSopenharmony_ciIADST4_FN idct, IDCT4, iadst, IADST4 336cabdff1aSopenharmony_ciIADST4_FN iadst, IADST4, idct, IDCT4 337cabdff1aSopenharmony_ciIADST4_FN iadst, IADST4, iadst, IADST4 338cabdff1aSopenharmony_ci 339cabdff1aSopenharmony_ciINIT_MMX ssse3 340cabdff1aSopenharmony_ciIADST4_FN idct, IDCT4, iadst, IADST4 341cabdff1aSopenharmony_ciIADST4_FN iadst, IADST4, idct, IDCT4 342cabdff1aSopenharmony_ciIADST4_FN iadst, IADST4, iadst, IADST4 343cabdff1aSopenharmony_ci 344cabdff1aSopenharmony_ci; inputs and outputs are dwords, coefficients are words 345cabdff1aSopenharmony_ci; 346cabdff1aSopenharmony_ci; dst1 = src1 * coef1 + src2 * coef2 + rnd >> 14 347cabdff1aSopenharmony_ci; dst2 = src1 * coef2 - src2 * coef1 + rnd >> 14 348cabdff1aSopenharmony_ci%macro SUMSUB_MUL 6-8 [pd_8192], [pd_3fff] ; src/dst 1-2, tmp1-2, coef1-2, rnd, mask 349cabdff1aSopenharmony_ci pand m%3, m%1, %8 350cabdff1aSopenharmony_ci pand m%4, m%2, %8 351cabdff1aSopenharmony_ci psrad m%1, 14 352cabdff1aSopenharmony_ci psrad m%2, 14 353cabdff1aSopenharmony_ci packssdw m%4, m%2 354cabdff1aSopenharmony_ci packssdw m%3, m%1 355cabdff1aSopenharmony_ci punpckhwd m%2, m%4, m%3 356cabdff1aSopenharmony_ci punpcklwd m%4, m%3 357cabdff1aSopenharmony_ci pmaddwd m%3, m%4, [pw_%6_%5] 
358cabdff1aSopenharmony_ci pmaddwd m%1, m%2, [pw_%6_%5] 359cabdff1aSopenharmony_ci pmaddwd m%4, [pw_m%5_%6] 360cabdff1aSopenharmony_ci pmaddwd m%2, [pw_m%5_%6] 361cabdff1aSopenharmony_ci paddd m%3, %7 362cabdff1aSopenharmony_ci paddd m%4, %7 363cabdff1aSopenharmony_ci psrad m%3, 14 364cabdff1aSopenharmony_ci psrad m%4, 14 365cabdff1aSopenharmony_ci paddd m%1, m%3 366cabdff1aSopenharmony_ci paddd m%2, m%4 367cabdff1aSopenharmony_ci%endmacro 368cabdff1aSopenharmony_ci 369cabdff1aSopenharmony_ci%macro IDCT4_12BPP_1D 0-8 [pd_8192], [pd_3fff], 0, 1, 2, 3, 4, 5 ; rnd, mask, in/out0-3, tmp0-1 370cabdff1aSopenharmony_ci SUMSUB_MUL %3, %5, %7, %8, 11585, 11585, %1, %2 371cabdff1aSopenharmony_ci SUMSUB_MUL %4, %6, %7, %8, 15137, 6270, %1, %2 372cabdff1aSopenharmony_ci SUMSUB_BA d, %4, %3, %7 373cabdff1aSopenharmony_ci SUMSUB_BA d, %6, %5, %7 374cabdff1aSopenharmony_ci SWAP %4, %6, %3 375cabdff1aSopenharmony_ci%endmacro 376cabdff1aSopenharmony_ci 377cabdff1aSopenharmony_ci%macro STORE_4x4 6 ; tmp1-2, reg1-2, min, max 378cabdff1aSopenharmony_ci movh m%1, [dstq+strideq*0] 379cabdff1aSopenharmony_ci movh m%2, [dstq+strideq*2] 380cabdff1aSopenharmony_ci movhps m%1, [dstq+strideq*1] 381cabdff1aSopenharmony_ci movhps m%2, [dstq+stride3q ] 382cabdff1aSopenharmony_ci paddw m%1, m%3 383cabdff1aSopenharmony_ci paddw m%2, m%4 384cabdff1aSopenharmony_ci pmaxsw m%1, %5 385cabdff1aSopenharmony_ci pmaxsw m%2, %5 386cabdff1aSopenharmony_ci pminsw m%1, %6 387cabdff1aSopenharmony_ci pminsw m%2, %6 388cabdff1aSopenharmony_ci movh [dstq+strideq*0], m%1 389cabdff1aSopenharmony_ci movhps [dstq+strideq*1], m%1 390cabdff1aSopenharmony_ci movh [dstq+strideq*2], m%2 391cabdff1aSopenharmony_ci movhps [dstq+stride3q ], m%2 392cabdff1aSopenharmony_ci%endmacro 393cabdff1aSopenharmony_ci 394cabdff1aSopenharmony_ci%macro ROUND_AND_STORE_4x4 8 ; reg1-4, min, max, rnd, shift 395cabdff1aSopenharmony_ci paddd m%1, %7 396cabdff1aSopenharmony_ci paddd m%2, %7 397cabdff1aSopenharmony_ci paddd m%3, %7 
398cabdff1aSopenharmony_ci paddd m%4, %7 399cabdff1aSopenharmony_ci psrad m%1, %8 400cabdff1aSopenharmony_ci psrad m%2, %8 401cabdff1aSopenharmony_ci psrad m%3, %8 402cabdff1aSopenharmony_ci psrad m%4, %8 403cabdff1aSopenharmony_ci packssdw m%1, m%2 404cabdff1aSopenharmony_ci packssdw m%3, m%4 405cabdff1aSopenharmony_ci STORE_4x4 %2, %4, %1, %3, %5, %6 406cabdff1aSopenharmony_ci%endmacro 407cabdff1aSopenharmony_ci 408cabdff1aSopenharmony_ciINIT_XMM sse2 409cabdff1aSopenharmony_cicglobal vp9_idct_idct_4x4_add_12, 4, 4, 8, dst, stride, block, eob 410cabdff1aSopenharmony_ci cmp eobd, 1 411cabdff1aSopenharmony_ci jg .idctfull 412cabdff1aSopenharmony_ci 413cabdff1aSopenharmony_ci ; dc-only - this is special, since for 4x4 12bpp, the max coef size is 414cabdff1aSopenharmony_ci ; 17+sign bpp. Since the multiply is with 11585, which is 14bpp, the 415cabdff1aSopenharmony_ci ; result of each multiply is 31+sign bit, i.e. it _exactly_ fits in a 416cabdff1aSopenharmony_ci ; dword. After the final shift (4), the result is 13+sign bits, so we 417cabdff1aSopenharmony_ci ; don't need any additional processing to fit it in a word 418cabdff1aSopenharmony_ci DEFINE_ARGS dst, stride, block, coef 419cabdff1aSopenharmony_ci pxor m4, m4 420cabdff1aSopenharmony_ci DC_ONLY 4, m4 421cabdff1aSopenharmony_ci movd m0, coefd 422cabdff1aSopenharmony_ci pshuflw m0, m0, q0000 423cabdff1aSopenharmony_ci punpcklqdq m0, m0 424cabdff1aSopenharmony_ci mova m5, [pw_4095] 425cabdff1aSopenharmony_ci DEFINE_ARGS dst, stride, stride3 426cabdff1aSopenharmony_ci lea stride3q, [strideq*3] 427cabdff1aSopenharmony_ci STORE_4x4 1, 3, 0, 0, m4, m5 428cabdff1aSopenharmony_ci RET 429cabdff1aSopenharmony_ci 430cabdff1aSopenharmony_ci.idctfull: 431cabdff1aSopenharmony_ci DEFINE_ARGS dst, stride, block, eob 432cabdff1aSopenharmony_ci mova m0, [blockq+0*16] 433cabdff1aSopenharmony_ci mova m1, [blockq+1*16] 434cabdff1aSopenharmony_ci mova m2, [blockq+2*16] 435cabdff1aSopenharmony_ci mova m3, [blockq+3*16] 
436cabdff1aSopenharmony_ci mova m6, [pd_8192] 437cabdff1aSopenharmony_ci mova m7, [pd_3fff] 438cabdff1aSopenharmony_ci 439cabdff1aSopenharmony_ci IDCT4_12BPP_1D m6, m7 440cabdff1aSopenharmony_ci TRANSPOSE4x4D 0, 1, 2, 3, 4 441cabdff1aSopenharmony_ci IDCT4_12BPP_1D m6, m7 442cabdff1aSopenharmony_ci 443cabdff1aSopenharmony_ci pxor m4, m4 444cabdff1aSopenharmony_ci ZERO_BLOCK blockq, 16, 4, m4 445cabdff1aSopenharmony_ci 446cabdff1aSopenharmony_ci ; writeout 447cabdff1aSopenharmony_ci DEFINE_ARGS dst, stride, stride3 448cabdff1aSopenharmony_ci lea stride3q, [strideq*3] 449cabdff1aSopenharmony_ci mova m5, [pw_4095] 450cabdff1aSopenharmony_ci mova m6, [pd_8] 451cabdff1aSopenharmony_ci ROUND_AND_STORE_4x4 0, 1, 2, 3, m4, m5, m6, 4 452cabdff1aSopenharmony_ci RET 453cabdff1aSopenharmony_ci 454cabdff1aSopenharmony_ci%macro SCRATCH 3-4 455cabdff1aSopenharmony_ci%if ARCH_X86_64 456cabdff1aSopenharmony_ci SWAP %1, %2 457cabdff1aSopenharmony_ci%if %0 == 4 458cabdff1aSopenharmony_ci%define reg_%4 m%2 459cabdff1aSopenharmony_ci%endif 460cabdff1aSopenharmony_ci%else 461cabdff1aSopenharmony_ci mova [%3], m%1 462cabdff1aSopenharmony_ci%if %0 == 4 463cabdff1aSopenharmony_ci%define reg_%4 [%3] 464cabdff1aSopenharmony_ci%endif 465cabdff1aSopenharmony_ci%endif 466cabdff1aSopenharmony_ci%endmacro 467cabdff1aSopenharmony_ci 468cabdff1aSopenharmony_ci%macro UNSCRATCH 3-4 469cabdff1aSopenharmony_ci%if ARCH_X86_64 470cabdff1aSopenharmony_ci SWAP %1, %2 471cabdff1aSopenharmony_ci%else 472cabdff1aSopenharmony_ci mova m%1, [%3] 473cabdff1aSopenharmony_ci%endif 474cabdff1aSopenharmony_ci%if %0 == 4 475cabdff1aSopenharmony_ci%undef reg_%4 476cabdff1aSopenharmony_ci%endif 477cabdff1aSopenharmony_ci%endmacro 478cabdff1aSopenharmony_ci 479cabdff1aSopenharmony_ci%macro PRELOAD 2-3 480cabdff1aSopenharmony_ci%if ARCH_X86_64 481cabdff1aSopenharmony_ci mova m%1, [%2] 482cabdff1aSopenharmony_ci%if %0 == 3 483cabdff1aSopenharmony_ci%define reg_%3 m%1 484cabdff1aSopenharmony_ci%endif 
485cabdff1aSopenharmony_ci%elif %0 == 3 486cabdff1aSopenharmony_ci%define reg_%3 [%2] 487cabdff1aSopenharmony_ci%endif 488cabdff1aSopenharmony_ci%endmacro 489cabdff1aSopenharmony_ci 490cabdff1aSopenharmony_ci; out0 = 5283 * in0 + 13377 + in1 + 15212 * in2 + 9929 * in3 + rnd >> 14 491cabdff1aSopenharmony_ci; out1 = 9929 * in0 + 13377 * in1 - 5283 * in2 - 15282 * in3 + rnd >> 14 492cabdff1aSopenharmony_ci; out2 = 13377 * in0 - 13377 * in2 + 13377 * in3 + rnd >> 14 493cabdff1aSopenharmony_ci; out3 = 15212 * in0 - 13377 * in1 + 9929 * in2 - 5283 * in3 + rnd >> 14 494cabdff1aSopenharmony_ci%macro IADST4_12BPP_1D 0-2 [pd_8192], [pd_3fff] ; rnd, mask 495cabdff1aSopenharmony_ci pand m4, m0, %2 496cabdff1aSopenharmony_ci pand m5, m1, %2 497cabdff1aSopenharmony_ci psrad m0, 14 498cabdff1aSopenharmony_ci psrad m1, 14 499cabdff1aSopenharmony_ci packssdw m5, m1 500cabdff1aSopenharmony_ci packssdw m4, m0 501cabdff1aSopenharmony_ci punpckhwd m1, m4, m5 502cabdff1aSopenharmony_ci punpcklwd m4, m5 503cabdff1aSopenharmony_ci pand m5, m2, %2 504cabdff1aSopenharmony_ci pand m6, m3, %2 505cabdff1aSopenharmony_ci psrad m2, 14 506cabdff1aSopenharmony_ci psrad m3, 14 507cabdff1aSopenharmony_ci packssdw m6, m3 508cabdff1aSopenharmony_ci packssdw m5, m2 509cabdff1aSopenharmony_ci punpckhwd m3, m5, m6 510cabdff1aSopenharmony_ci punpcklwd m5, m6 511cabdff1aSopenharmony_ci SCRATCH 1, 8, rsp+0*mmsize, a 512cabdff1aSopenharmony_ci SCRATCH 5, 9, rsp+1*mmsize, b 513cabdff1aSopenharmony_ci 514cabdff1aSopenharmony_ci ; m1/3 have the high bits of 0,1,2,3 515cabdff1aSopenharmony_ci ; m4/5 have the low bits of 0,1,2,3 516cabdff1aSopenharmony_ci ; m0/2/6/7 are free 517cabdff1aSopenharmony_ci 518cabdff1aSopenharmony_ci mova m2, [pw_15212_9929] 519cabdff1aSopenharmony_ci mova m0, [pw_5283_13377] 520cabdff1aSopenharmony_ci pmaddwd m7, m2, reg_b 521cabdff1aSopenharmony_ci pmaddwd m6, m4, m0 522cabdff1aSopenharmony_ci pmaddwd m2, m3 523cabdff1aSopenharmony_ci pmaddwd m0, reg_a 524cabdff1aSopenharmony_ci 
paddd m6, m7 525cabdff1aSopenharmony_ci paddd m0, m2 526cabdff1aSopenharmony_ci mova m1, [pw_m13377_13377] 527cabdff1aSopenharmony_ci mova m5, [pw_13377_0] 528cabdff1aSopenharmony_ci pmaddwd m7, m1, reg_b 529cabdff1aSopenharmony_ci pmaddwd m2, m4, m5 530cabdff1aSopenharmony_ci pmaddwd m1, m3 531cabdff1aSopenharmony_ci pmaddwd m5, reg_a 532cabdff1aSopenharmony_ci paddd m2, m7 533cabdff1aSopenharmony_ci paddd m1, m5 534cabdff1aSopenharmony_ci paddd m6, %1 535cabdff1aSopenharmony_ci paddd m2, %1 536cabdff1aSopenharmony_ci psrad m6, 14 537cabdff1aSopenharmony_ci psrad m2, 14 538cabdff1aSopenharmony_ci paddd m0, m6 ; t0 539cabdff1aSopenharmony_ci paddd m2, m1 ; t2 540cabdff1aSopenharmony_ci 541cabdff1aSopenharmony_ci mova m7, [pw_m5283_m15212] 542cabdff1aSopenharmony_ci mova m5, [pw_9929_13377] 543cabdff1aSopenharmony_ci pmaddwd m1, m7, reg_b 544cabdff1aSopenharmony_ci pmaddwd m6, m4, m5 545cabdff1aSopenharmony_ci pmaddwd m7, m3 546cabdff1aSopenharmony_ci pmaddwd m5, reg_a 547cabdff1aSopenharmony_ci paddd m6, m1 548cabdff1aSopenharmony_ci paddd m7, m5 549cabdff1aSopenharmony_ci UNSCRATCH 5, 9, rsp+1*mmsize, b 550cabdff1aSopenharmony_ci pmaddwd m5, [pw_9929_m5283] 551cabdff1aSopenharmony_ci pmaddwd m4, [pw_15212_m13377] 552cabdff1aSopenharmony_ci pmaddwd m3, [pw_9929_m5283] 553cabdff1aSopenharmony_ci UNSCRATCH 1, 8, rsp+0*mmsize, a 554cabdff1aSopenharmony_ci pmaddwd m1, [pw_15212_m13377] 555cabdff1aSopenharmony_ci paddd m4, m5 556cabdff1aSopenharmony_ci paddd m3, m1 557cabdff1aSopenharmony_ci paddd m6, %1 558cabdff1aSopenharmony_ci paddd m4, %1 559cabdff1aSopenharmony_ci psrad m6, 14 560cabdff1aSopenharmony_ci psrad m4, 14 561cabdff1aSopenharmony_ci paddd m7, m6 ; t1 562cabdff1aSopenharmony_ci paddd m3, m4 ; t3 563cabdff1aSopenharmony_ci 564cabdff1aSopenharmony_ci SWAP 1, 7 565cabdff1aSopenharmony_ci%endmacro 566cabdff1aSopenharmony_ci 567cabdff1aSopenharmony_ci%macro IADST4_12BPP_FN 4 568cabdff1aSopenharmony_cicglobal vp9_%1_%3_4x4_add_12, 3, 3, 12, 2 * ARCH_X86_32 * 
mmsize, dst, stride, block, eob 569cabdff1aSopenharmony_ci mova m0, [blockq+0*16] 570cabdff1aSopenharmony_ci mova m1, [blockq+1*16] 571cabdff1aSopenharmony_ci mova m2, [blockq+2*16] 572cabdff1aSopenharmony_ci mova m3, [blockq+3*16] 573cabdff1aSopenharmony_ci 574cabdff1aSopenharmony_ci PRELOAD 10, pd_8192, rnd 575cabdff1aSopenharmony_ci PRELOAD 11, pd_3fff, mask 576cabdff1aSopenharmony_ci %2_12BPP_1D reg_rnd, reg_mask 577cabdff1aSopenharmony_ci TRANSPOSE4x4D 0, 1, 2, 3, 4 578cabdff1aSopenharmony_ci %4_12BPP_1D reg_rnd, reg_mask 579cabdff1aSopenharmony_ci 580cabdff1aSopenharmony_ci pxor m4, m4 581cabdff1aSopenharmony_ci ZERO_BLOCK blockq, 16, 4, m4 582cabdff1aSopenharmony_ci 583cabdff1aSopenharmony_ci ; writeout 584cabdff1aSopenharmony_ci DEFINE_ARGS dst, stride, stride3 585cabdff1aSopenharmony_ci lea stride3q, [strideq*3] 586cabdff1aSopenharmony_ci mova m5, [pw_4095] 587cabdff1aSopenharmony_ci mova m6, [pd_8] 588cabdff1aSopenharmony_ci ROUND_AND_STORE_4x4 0, 1, 2, 3, m4, m5, m6, 4 589cabdff1aSopenharmony_ci RET 590cabdff1aSopenharmony_ci%endmacro 591cabdff1aSopenharmony_ci 592cabdff1aSopenharmony_ciINIT_XMM sse2 593cabdff1aSopenharmony_ciIADST4_12BPP_FN idct, IDCT4, iadst, IADST4 594cabdff1aSopenharmony_ciIADST4_12BPP_FN iadst, IADST4, idct, IDCT4 595cabdff1aSopenharmony_ciIADST4_12BPP_FN iadst, IADST4, iadst, IADST4 596cabdff1aSopenharmony_ci 597cabdff1aSopenharmony_ci; the following line has not been executed at the end of this macro: 598cabdff1aSopenharmony_ci; UNSCRATCH 6, 8, rsp+%3*mmsize 599cabdff1aSopenharmony_ci%macro IDCT8_1D 1-5 [pd_8192], [pd_3fff], 2 * mmsize, 17 ; src, rnd, mask, src_stride, stack_offset 600cabdff1aSopenharmony_ci mova m0, [%1+0*%4] 601cabdff1aSopenharmony_ci mova m2, [%1+2*%4] 602cabdff1aSopenharmony_ci mova m4, [%1+4*%4] 603cabdff1aSopenharmony_ci mova m6, [%1+6*%4] 604cabdff1aSopenharmony_ci IDCT4_12BPP_1D %2, %3, 0, 2, 4, 6, 1, 3 ; m0/2/4/6 have t0/1/2/3 605cabdff1aSopenharmony_ci SCRATCH 4, 8, rsp+(%5+0)*mmsize 
606cabdff1aSopenharmony_ci SCRATCH 6, 9, rsp+(%5+1)*mmsize 607cabdff1aSopenharmony_ci mova m1, [%1+1*%4] 608cabdff1aSopenharmony_ci mova m3, [%1+3*%4] 609cabdff1aSopenharmony_ci mova m5, [%1+5*%4] 610cabdff1aSopenharmony_ci mova m7, [%1+7*%4] 611cabdff1aSopenharmony_ci SUMSUB_MUL 1, 7, 4, 6, 16069, 3196, %2, %3 ; m1=t7a, m7=t4a 612cabdff1aSopenharmony_ci SUMSUB_MUL 5, 3, 4, 6, 9102, 13623, %2, %3 ; m5=t6a, m3=t5a 613cabdff1aSopenharmony_ci SUMSUB_BA d, 3, 7, 4 ; m3=t4, m7=t5a 614cabdff1aSopenharmony_ci SUMSUB_BA d, 5, 1, 4 ; m5=t7, m1=t6a 615cabdff1aSopenharmony_ci SUMSUB_MUL 1, 7, 4, 6, 11585, 11585, %2, %3 ; m1=t6, m7=t5 616cabdff1aSopenharmony_ci SUMSUB_BA d, 5, 0, 4 ; m5=out0, m0=out7 617cabdff1aSopenharmony_ci SUMSUB_BA d, 1, 2, 4 ; m1=out1, m2=out6 618cabdff1aSopenharmony_ci UNSCRATCH 4, 8, rsp+(%5+0)*mmsize 619cabdff1aSopenharmony_ci UNSCRATCH 6, 9, rsp+(%5+1)*mmsize 620cabdff1aSopenharmony_ci SCRATCH 2, 8, rsp+(%5+0)*mmsize 621cabdff1aSopenharmony_ci SUMSUB_BA d, 7, 4, 2 ; m7=out2, m4=out5 622cabdff1aSopenharmony_ci SUMSUB_BA d, 3, 6, 2 ; m3=out3, m6=out4 623cabdff1aSopenharmony_ci SWAP 0, 5, 4, 6, 2, 7 624cabdff1aSopenharmony_ci%endmacro 625cabdff1aSopenharmony_ci 626cabdff1aSopenharmony_ci%macro STORE_2x8 5-7 dstq, strideq ; tmp1-2, reg, min, max 627cabdff1aSopenharmony_ci mova m%1, [%6+%7*0] 628cabdff1aSopenharmony_ci mova m%2, [%6+%7*1] 629cabdff1aSopenharmony_ci paddw m%1, m%3 630cabdff1aSopenharmony_ci paddw m%2, m%3 631cabdff1aSopenharmony_ci pmaxsw m%1, %4 632cabdff1aSopenharmony_ci pmaxsw m%2, %4 633cabdff1aSopenharmony_ci pminsw m%1, %5 634cabdff1aSopenharmony_ci pminsw m%2, %5 635cabdff1aSopenharmony_ci mova [%6+%7*0], m%1 636cabdff1aSopenharmony_ci mova [%6+%7*1], m%2 637cabdff1aSopenharmony_ci%endmacro 638cabdff1aSopenharmony_ci 639cabdff1aSopenharmony_ci; FIXME we can use the intermediate storage (rsp[0-15]) on x86-32 for temp 640cabdff1aSopenharmony_ci; storage also instead of allocating two more stack spaces. 
; This doesn't matter much but it's something...

;------------------------------------------------------------------------------
; vp9_idct_idct_8x8_add_10(dst, stride, block, eob)
;
; 8x8 inverse DCT in both directions; the result is added to the word-sized
; pixels at dst and clamped to [0, pw_1023].
;
; eob <= 1: DC-only shortcut, a single value is added to all 8x8 pixels.
; eob >  1: (.idctfull) two-pass transform through an intermediate buffer at
;           the bottom of the stack (rsp + 0..15*mmsize).  default_8x8[eob-1]
;           gives the number of first-pass iterations that actually have to be
;           computed; the remaining part of the buffer is zero-filled.
;
; .idctfull is also entered from vp9_idct_idct_8x8_add_12 with m0 holding that
; function's clamp maximum, so everything after the label must only depend on
; m0 and the dst/stride/block/eob arguments.
;
; NOTE(review): SCRATCH, UNSCRATCH, PRELOAD, IDCT8_1D, DC_ONLY,
; ROUND_AND_STORE_4x4 and ZERO_BLOCK are defined outside this block; the
; comments below describe how they are used here, not their implementation.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal vp9_idct_idct_8x8_add_10, 4, 6 + ARCH_X86_64, 14, \
                                  16 * mmsize + 3 * ARCH_X86_32 * mmsize, \
                                  dst, stride, block, eob
    mova                m0, [pw_1023]           ; m0 = clamp maximum
    cmp               eobd, 1
    jg .idctfull

    ; dc-only - the 10bit version can be done entirely in 32bit, since the max
    ; coef values are 16+sign bit, and the coef is 14bit, so 30+sign easily
    ; fits in 32bit
    DEFINE_ARGS dst, stride, block, coef        ; coef reuses the eob register
    pxor                m2, m2                  ; m2 = 0 (clamp minimum)
    DC_ONLY              5, m2
    movd                m1, coefd
    pshuflw             m1, m1, q0000           ; low word -> words 0-3
    punpcklqdq          m1, m1                  ; ... -> all 8 words
    DEFINE_ARGS dst, stride, cnt
    mov               cntd, 4                   ; 4 iterations x 2 rows = 8 rows
.loop_dc:
    STORE_2x8            3, 4, 1, m2, m0        ; rows dst and dst+stride
    lea               dstq, [dstq+strideq*2]
    dec               cntd
    jg .loop_dc
    RET

.idctfull:
    SCRATCH              0, 12, rsp+16*mmsize, max ; park clamp max for pass 2
    DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
%if ARCH_X86_64
    mov            dstbakq, dstq                ; keep original dst for pass 2
    movsxd            cntq, cntd                ; cnt is used as a table index
%endif
    ; cnt = default_8x8[eob-1] = number of first-pass iterations to compute
%ifdef PIC
    lea               ptrq, [default_8x8]
    movzx             cntd, byte [ptrq+cntq-1]
%else
    movzx             cntd, byte [default_8x8+cntq-1]
%endif
    mov              skipd, 2
    sub              skipd, cntd                ; skip = iterations not computed
    mov               ptrq, rsp                 ; ptr = intermediate buffer
    PRELOAD             10, pd_8192, rnd
    PRELOAD             11, pd_3fff, mask
    PRELOAD             13, pd_16, srnd

    ; first pass: 1-D transform, transpose 4x4 dword tiles, store to buffer
.loop_1:
    IDCT8_1D        blockq, reg_rnd, reg_mask

    TRANSPOSE4x4D        0, 1, 2, 3, 6
    mova  [ptrq+ 0*mmsize], m0
    mova  [ptrq+ 2*mmsize], m1
    mova  [ptrq+ 4*mmsize], m2
    mova  [ptrq+ 6*mmsize], m3
    UNSCRATCH            6, 8, rsp+17*mmsize    ; output 6 was left in scratch
    TRANSPOSE4x4D        4, 5, 6, 7, 0
    mova  [ptrq+ 1*mmsize], m4
    mova  [ptrq+ 3*mmsize], m5
    mova  [ptrq+ 5*mmsize], m6
    mova  [ptrq+ 7*mmsize], m7
    add               ptrq, 8 * mmsize
    add             blockq, mmsize
    dec               cntd
    jg .loop_1

    ; zero-pad the remainder (skipped cols)
    test             skipd, skipd
    jz .end
    add              skipd, skipd               ; 2 * skip loops of 4 registers
    ; advance block as if the skipped iterations had run (skip * mmsize), so
    ; that blockq-2*mmsize below is the start of the block on every path
    lea             blockq, [blockq+skipq*(mmsize/2)]
    pxor                m0, m0
.loop_z:
    mova   [ptrq+mmsize*0], m0
    mova   [ptrq+mmsize*1], m0
    mova   [ptrq+mmsize*2], m0
    mova   [ptrq+mmsize*3], m0
    add               ptrq, 4 * mmsize
    dec              skipd
    jg .loop_z
.end:

    ; second pass: 1-D transform of the buffer, round, add to dst and clamp
    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
    lea           stride3q, [strideq*3]
    mov               cntd, 2
    mov               ptrq, rsp
.loop_2:
    IDCT8_1D          ptrq, reg_rnd, reg_mask

    pxor                m6, m6                  ; m6 = 0 (clamp minimum)
    ROUND_AND_STORE_4x4  0, 1, 2, 3, m6, reg_max, reg_srnd, 5
    lea               dstq, [dstq+strideq*4]    ; next 4 rows
    UNSCRATCH            0, 8, rsp+17*mmsize    ; output 6 (see first pass)
    UNSCRATCH            1, 12, rsp+16*mmsize, max
    UNSCRATCH            2, 13, pd_16, srnd
    ROUND_AND_STORE_4x4  4, 5, 0, 7, m6, m1, m2, 5
    add               ptrq, 16                  ; next register of the buffer
    ; second iteration writes 8 bytes to the right of the original dst
%if ARCH_X86_64
    lea               dstq, [dstbakq+8]
%else
    mov               dstq, dstm                ; no spare GPR: reload dst arg
    add               dstq, 8
%endif
    dec               cntd
    jg .loop_2

    ; m6 is still zero
    ZERO_BLOCK blockq-2*mmsize, 32, 8, m6
    RET

;------------------------------------------------------------------------------
; DC_ONLY_64BIT shift, zero
;
; DC-only coefficient for bit depths where coef * 11585 does not fit in 32
; bits.  Reads the dword at [blockq], overwrites it with %2 (the caller passes
; a zeroed register) and leaves the scaled value in coef.
;
; x86-64: plain 64-bit arithmetic:
;     coef = (coef * 11585 + 8192) >> 14
;     coef = (coef * 11585 + 8192 + ((1 << (shift - 1)) << 14)) >> (14 + shift)
; x86-32: the multiply is done twice, each time split into the high part and
;     the low 14 bits so that every intermediate fits in 32 bits:
;     coef = (coef >> 14) * 11585 + (((coef & 0x3fff) * 11585 + 8192) >> 14)
;     followed by coef = (coef + (1 << (shift - 1))) >> shift
;
; The caller must have "DEFINE_ARGS dst, stride, block, coef, coefl" active.
; On x86-32 the macro re-runs DEFINE_ARGS so that the block register is reused
; as loop counter (cnt); block is therefore not usable after the macro.
; Defines the local label .loop_dc_calc on x86-32.
;------------------------------------------------------------------------------
%macro DC_ONLY_64BIT 2 ; shift, zero
%if ARCH_X86_64
    movsxd           coefq, dword [blockq]
    movd          [blockq], %2
    imul             coefq, 11585
    add              coefq, 8192
    sar              coefq, 14
    imul             coefq, 11585
    add              coefq, ((1 << (%1 - 1)) << 14) + 8192
    sar              coefq, 14 + %1
%else
    mov              coefd, dword [blockq]
    movd          [blockq], %2
    DEFINE_ARGS dst, stride, cnt, coef, coefl
    mov               cntd, 2                   ; two multiplies by 11585
.loop_dc_calc:
    mov             coefld, coefd
    sar              coefd, 14                  ; high part
    and             coefld, 0x3fff              ; low 14 bits
    imul             coefd, 11585
    imul            coefld, 11585
    add             coefld, 8192
    sar             coefld, 14
    add              coefd, coefld
    dec               cntd
    jg .loop_dc_calc
    add              coefd, 1 << (%1 - 1)
    sar              coefd, %1
%endif
%endmacro

;------------------------------------------------------------------------------
; vp9_idct_idct_8x8_add_12(dst, stride, block, eob)
;
; Same as vp9_idct_idct_8x8_add_10 but clamping to [0, pw_4095].  For eob > 1
; it jumps into the _10 function's .idctfull with the clamp maximum in m0 (the
; cglobal register/stack arguments are identical for that reason).  Only the
; DC-only path is separate, because it needs DC_ONLY_64BIT.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal vp9_idct_idct_8x8_add_12, 4, 6 + ARCH_X86_64, 14, \
                                  16 * mmsize + 3 * ARCH_X86_32 * mmsize, \
                                  dst, stride, block, eob
    mova                m0, [pw_4095]           ; m0 = clamp maximum
    cmp               eobd, 1
    jg mangle(private_prefix %+ _ %+ vp9_idct_idct_8x8_add_10 %+ SUFFIX).idctfull

    ; dc-only - unfortunately, this one can overflow, since coefs are 18+sign
    ; bpp, and 18+14+sign does not fit in 32bit, so we do 2-stage multiplies
    DEFINE_ARGS dst, stride, block, coef, coefl
    pxor                m2, m2                  ; m2 = 0 (clamp minimum)
    DC_ONLY_64BIT        5, m2
    movd                m1, coefd
    pshuflw             m1, m1, q0000           ; low word -> words 0-3
    punpcklqdq          m1, m1                  ; ... -> all 8 words
    DEFINE_ARGS dst, stride, cnt
    mov               cntd, 4                   ; 4 iterations x 2 rows = 8 rows
.loop_dc:
    STORE_2x8            3, 4, 1, m2, m0        ; rows dst and dst+stride
    lea               dstq, [dstq+strideq*2]
    dec               cntd
    jg .loop_dc
    RET
; inputs and outputs are dwords, coefficients are words
;
; dst1[hi]:dst3[lo] = src1 * coef1 + src2 * coef2
; dst2[hi]:dst4[lo] = src1 * coef2 - src2 * coef1
;
; Each dword input is split into its low 14 bits (input & mask) and the
; remaining high part (input >> 14, arithmetic).  Both parts are packed to
; words and multiplied separately with pmaddwd, so the macro yields a "hi" and
; a "lo" partial result per output; SUMSUB_PACK_D recombines them.
; %1/%2 are overwritten with the hi results, %3/%4 receive the lo results.
; Requires the constants pw_<coef2>_<coef1> and pw_m<coef1>_<coef2>.
%macro SUMSUB_MUL_D 6-7 [pd_3fff] ; src/dst 1-2, dst3-4, coef1-2, mask
    pand               m%3, m%1, %7             ; lo14(src1)
    pand               m%4, m%2, %7             ; lo14(src2)
    psrad              m%1, 14                  ; hi(src1)
    psrad              m%2, 14                  ; hi(src2)
    packssdw           m%4, m%2                 ; words: lo(src2) | hi(src2)
    packssdw           m%3, m%1                 ; words: lo(src1) | hi(src1)
    punpckhwd          m%2, m%4, m%3            ; (hi(src2), hi(src1)) pairs
    punpcklwd          m%4, m%3                 ; (lo(src2), lo(src1)) pairs
    pmaddwd            m%3, m%4, [pw_%6_%5]     ; lo: src2*coef2 + src1*coef1
    pmaddwd            m%1, m%2, [pw_%6_%5]     ; hi: src2*coef2 + src1*coef1
    pmaddwd            m%4, [pw_m%5_%6]         ; lo: src1*coef2 - src2*coef1
    pmaddwd            m%2, [pw_m%5_%6]         ; hi: src1*coef2 - src2*coef1
%endmacro

; dst1 = src2[hi]:src4[lo] + src1[hi]:src3[lo] + rnd >> 14
; dst2 = src2[hi]:src4[lo] - src1[hi]:src3[lo] + rnd >> 14
;
; Takes two hi/lo pairs as produced by SUMSUB_MUL_D.  The hi and lo parts are
; summed/differenced independently; only the lo results are rounded and
; shifted down by 14 before being added onto the hi results, which already
; carry that scale.  %3/%4 are clobbered, %5 is a scratch register.
%macro SUMSUB_PACK_D 5-6 [pd_8192] ; src/dst 1-2, src3-4, tmp, rnd
    SUMSUB_BA        d, %1, %2, %5              ; hi parts: sum / difference
    SUMSUB_BA        d, %3, %4, %5              ; lo parts: sum / difference
    paddd              m%3, %6
    paddd              m%4, %6
    psrad              m%3, 14
    psrad              m%4, 14
    paddd              m%1, m%3
    paddd              m%2, m%4
%endmacro

; Negate every dword of %1 (a register name including the "m" prefix).
; pw_m1 is used as an all-ones operand: with ssse3 psignd negates directly,
; otherwise the negation is done as two's complement (invert, then add 1).
%macro NEGD 1
%if cpuflag(ssse3)
    psignd              %1, [pw_m1]
%else
    pxor                %1, [pw_m1]
    paddd               %1, [pd_1]
%endif
%endmacro
846cabdff1aSopenharmony_ci 847cabdff1aSopenharmony_ci; the following line has not been executed at the end of this macro: 848cabdff1aSopenharmony_ci; UNSCRATCH 6, 8, rsp+17*mmsize 849cabdff1aSopenharmony_ci%macro IADST8_1D 1-3 [pd_8192], [pd_3fff] ; src, rnd, mask 850cabdff1aSopenharmony_ci mova m0, [%1+ 0*mmsize] 851cabdff1aSopenharmony_ci mova m3, [%1+ 6*mmsize] 852cabdff1aSopenharmony_ci mova m4, [%1+ 8*mmsize] 853cabdff1aSopenharmony_ci mova m7, [%1+14*mmsize] 854cabdff1aSopenharmony_ci SUMSUB_MUL_D 7, 0, 1, 2, 16305, 1606, %3 ; m7/1=t0a, m0/2=t1a 855cabdff1aSopenharmony_ci SUMSUB_MUL_D 3, 4, 5, 6, 10394, 12665, %3 ; m3/5=t4a, m4/6=t5a 856cabdff1aSopenharmony_ci SCRATCH 0, 8, rsp+17*mmsize 857cabdff1aSopenharmony_ci SUMSUB_PACK_D 3, 7, 5, 1, 0, %2 ; m3=t0, m7=t4 858cabdff1aSopenharmony_ci UNSCRATCH 0, 8, rsp+17*mmsize 859cabdff1aSopenharmony_ci SUMSUB_PACK_D 4, 0, 6, 2, 1, %2 ; m4=t1, m0=t5 860cabdff1aSopenharmony_ci 861cabdff1aSopenharmony_ci SCRATCH 3, 8, rsp+17*mmsize 862cabdff1aSopenharmony_ci SCRATCH 4, 9, rsp+18*mmsize 863cabdff1aSopenharmony_ci SCRATCH 7, 10, rsp+19*mmsize 864cabdff1aSopenharmony_ci SCRATCH 0, 11, rsp+20*mmsize 865cabdff1aSopenharmony_ci 866cabdff1aSopenharmony_ci mova m1, [%1+ 2*mmsize] 867cabdff1aSopenharmony_ci mova m2, [%1+ 4*mmsize] 868cabdff1aSopenharmony_ci mova m5, [%1+10*mmsize] 869cabdff1aSopenharmony_ci mova m6, [%1+12*mmsize] 870cabdff1aSopenharmony_ci SUMSUB_MUL_D 5, 2, 3, 4, 14449, 7723, %3 ; m5/8=t2a, m2/9=t3a 871cabdff1aSopenharmony_ci SUMSUB_MUL_D 1, 6, 7, 0, 4756, 15679, %3 ; m1/10=t6a, m6/11=t7a 872cabdff1aSopenharmony_ci SCRATCH 2, 12, rsp+21*mmsize 873cabdff1aSopenharmony_ci SUMSUB_PACK_D 1, 5, 7, 3, 2, %2 ; m1=t2, m5=t6 874cabdff1aSopenharmony_ci UNSCRATCH 2, 12, rsp+21*mmsize 875cabdff1aSopenharmony_ci SUMSUB_PACK_D 6, 2, 0, 4, 3, %2 ; m6=t3, m2=t7 876cabdff1aSopenharmony_ci 877cabdff1aSopenharmony_ci UNSCRATCH 7, 10, rsp+19*mmsize 878cabdff1aSopenharmony_ci UNSCRATCH 0, 11, rsp+20*mmsize 879cabdff1aSopenharmony_ci 
SCRATCH 1, 10, rsp+19*mmsize 880cabdff1aSopenharmony_ci SCRATCH 6, 11, rsp+20*mmsize 881cabdff1aSopenharmony_ci 882cabdff1aSopenharmony_ci SUMSUB_MUL_D 7, 0, 3, 4, 15137, 6270, %3 ; m7/8=t4a, m0/9=t5a 883cabdff1aSopenharmony_ci SUMSUB_MUL_D 2, 5, 1, 6, 6270, 15137, %3 ; m2/10=t7a, m5/11=t6a 884cabdff1aSopenharmony_ci SCRATCH 2, 12, rsp+21*mmsize 885cabdff1aSopenharmony_ci SUMSUB_PACK_D 5, 7, 6, 3, 2, %2 ; m5=-out1, m7=t6 886cabdff1aSopenharmony_ci UNSCRATCH 2, 12, rsp+21*mmsize 887cabdff1aSopenharmony_ci NEGD m5 ; m5=out1 888cabdff1aSopenharmony_ci SUMSUB_PACK_D 2, 0, 1, 4, 3, %2 ; m2=out6, m0=t7 889cabdff1aSopenharmony_ci SUMSUB_MUL 7, 0, 3, 4, 11585, 11585, %2, %3 ; m7=out2, m0=-out5 890cabdff1aSopenharmony_ci NEGD m0 ; m0=out5 891cabdff1aSopenharmony_ci 892cabdff1aSopenharmony_ci UNSCRATCH 3, 8, rsp+17*mmsize 893cabdff1aSopenharmony_ci UNSCRATCH 4, 9, rsp+18*mmsize 894cabdff1aSopenharmony_ci UNSCRATCH 1, 10, rsp+19*mmsize 895cabdff1aSopenharmony_ci UNSCRATCH 6, 11, rsp+20*mmsize 896cabdff1aSopenharmony_ci SCRATCH 2, 8, rsp+17*mmsize 897cabdff1aSopenharmony_ci SCRATCH 0, 9, rsp+18*mmsize 898cabdff1aSopenharmony_ci 899cabdff1aSopenharmony_ci SUMSUB_BA d, 1, 3, 2 ; m1=out0, m3=t2 900cabdff1aSopenharmony_ci SUMSUB_BA d, 6, 4, 2 ; m6=-out7, m4=t3 901cabdff1aSopenharmony_ci NEGD m6 ; m6=out7 902cabdff1aSopenharmony_ci SUMSUB_MUL 3, 4, 2, 0, 11585, 11585, %2, %3 ; m3=-out3, m4=out4 903cabdff1aSopenharmony_ci NEGD m3 ; m3=out3 904cabdff1aSopenharmony_ci 905cabdff1aSopenharmony_ci UNSCRATCH 0, 9, rsp+18*mmsize 906cabdff1aSopenharmony_ci 907cabdff1aSopenharmony_ci SWAP 0, 1, 5 908cabdff1aSopenharmony_ci SWAP 2, 7, 6 909cabdff1aSopenharmony_ci%endmacro 910cabdff1aSopenharmony_ci 911cabdff1aSopenharmony_ci%macro IADST8_FN 5 912cabdff1aSopenharmony_cicglobal vp9_%1_%3_8x8_add_10, 4, 6 + ARCH_X86_64, 16, \ 913cabdff1aSopenharmony_ci 16 * mmsize + ARCH_X86_32 * 6 * mmsize, \ 914cabdff1aSopenharmony_ci dst, stride, block, eob 915cabdff1aSopenharmony_ci mova m0, [pw_1023] 
916cabdff1aSopenharmony_ci 917cabdff1aSopenharmony_ci.body: 918cabdff1aSopenharmony_ci SCRATCH 0, 13, rsp+16*mmsize, max 919cabdff1aSopenharmony_ci DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak 920cabdff1aSopenharmony_ci%if ARCH_X86_64 921cabdff1aSopenharmony_ci mov dstbakq, dstq 922cabdff1aSopenharmony_ci movsxd cntq, cntd 923cabdff1aSopenharmony_ci%endif 924cabdff1aSopenharmony_ci%ifdef PIC 925cabdff1aSopenharmony_ci lea ptrq, [%5_8x8] 926cabdff1aSopenharmony_ci movzx cntd, byte [ptrq+cntq-1] 927cabdff1aSopenharmony_ci%else 928cabdff1aSopenharmony_ci movzx cntd, byte [%5_8x8+cntq-1] 929cabdff1aSopenharmony_ci%endif 930cabdff1aSopenharmony_ci mov skipd, 2 931cabdff1aSopenharmony_ci sub skipd, cntd 932cabdff1aSopenharmony_ci mov ptrq, rsp 933cabdff1aSopenharmony_ci PRELOAD 14, pd_8192, rnd 934cabdff1aSopenharmony_ci PRELOAD 15, pd_3fff, mask 935cabdff1aSopenharmony_ci.loop_1: 936cabdff1aSopenharmony_ci %2_1D blockq, reg_rnd, reg_mask 937cabdff1aSopenharmony_ci 938cabdff1aSopenharmony_ci TRANSPOSE4x4D 0, 1, 2, 3, 6 939cabdff1aSopenharmony_ci mova [ptrq+ 0*mmsize], m0 940cabdff1aSopenharmony_ci mova [ptrq+ 2*mmsize], m1 941cabdff1aSopenharmony_ci mova [ptrq+ 4*mmsize], m2 942cabdff1aSopenharmony_ci mova [ptrq+ 6*mmsize], m3 943cabdff1aSopenharmony_ci UNSCRATCH 6, 8, rsp+17*mmsize 944cabdff1aSopenharmony_ci TRANSPOSE4x4D 4, 5, 6, 7, 0 945cabdff1aSopenharmony_ci mova [ptrq+ 1*mmsize], m4 946cabdff1aSopenharmony_ci mova [ptrq+ 3*mmsize], m5 947cabdff1aSopenharmony_ci mova [ptrq+ 5*mmsize], m6 948cabdff1aSopenharmony_ci mova [ptrq+ 7*mmsize], m7 949cabdff1aSopenharmony_ci add ptrq, 8 * mmsize 950cabdff1aSopenharmony_ci add blockq, mmsize 951cabdff1aSopenharmony_ci dec cntd 952cabdff1aSopenharmony_ci jg .loop_1 953cabdff1aSopenharmony_ci 954cabdff1aSopenharmony_ci ; zero-pad the remainder (skipped cols) 955cabdff1aSopenharmony_ci test skipd, skipd 956cabdff1aSopenharmony_ci jz .end 957cabdff1aSopenharmony_ci add skipd, skipd 958cabdff1aSopenharmony_ci lea blockq, 
[blockq+skipq*(mmsize/2)] 959cabdff1aSopenharmony_ci pxor m0, m0 960cabdff1aSopenharmony_ci.loop_z: 961cabdff1aSopenharmony_ci mova [ptrq+mmsize*0], m0 962cabdff1aSopenharmony_ci mova [ptrq+mmsize*1], m0 963cabdff1aSopenharmony_ci mova [ptrq+mmsize*2], m0 964cabdff1aSopenharmony_ci mova [ptrq+mmsize*3], m0 965cabdff1aSopenharmony_ci add ptrq, 4 * mmsize 966cabdff1aSopenharmony_ci dec skipd 967cabdff1aSopenharmony_ci jg .loop_z 968cabdff1aSopenharmony_ci.end: 969cabdff1aSopenharmony_ci 970cabdff1aSopenharmony_ci DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak 971cabdff1aSopenharmony_ci lea stride3q, [strideq*3] 972cabdff1aSopenharmony_ci mov cntd, 2 973cabdff1aSopenharmony_ci mov ptrq, rsp 974cabdff1aSopenharmony_ci.loop_2: 975cabdff1aSopenharmony_ci %4_1D ptrq, reg_rnd, reg_mask 976cabdff1aSopenharmony_ci 977cabdff1aSopenharmony_ci pxor m6, m6 978cabdff1aSopenharmony_ci PRELOAD 9, pd_16, srnd 979cabdff1aSopenharmony_ci ROUND_AND_STORE_4x4 0, 1, 2, 3, m6, reg_max, reg_srnd, 5 980cabdff1aSopenharmony_ci lea dstq, [dstq+strideq*4] 981cabdff1aSopenharmony_ci UNSCRATCH 0, 8, rsp+17*mmsize 982cabdff1aSopenharmony_ci UNSCRATCH 1, 13, rsp+16*mmsize, max 983cabdff1aSopenharmony_ci UNSCRATCH 2, 9, pd_16, srnd 984cabdff1aSopenharmony_ci ROUND_AND_STORE_4x4 4, 5, 0, 7, m6, m1, m2, 5 985cabdff1aSopenharmony_ci add ptrq, 16 986cabdff1aSopenharmony_ci%if ARCH_X86_64 987cabdff1aSopenharmony_ci lea dstq, [dstbakq+8] 988cabdff1aSopenharmony_ci%else 989cabdff1aSopenharmony_ci mov dstq, dstm 990cabdff1aSopenharmony_ci add dstq, 8 991cabdff1aSopenharmony_ci%endif 992cabdff1aSopenharmony_ci dec cntd 993cabdff1aSopenharmony_ci jg .loop_2 994cabdff1aSopenharmony_ci 995cabdff1aSopenharmony_ci ; m6 is still zero 996cabdff1aSopenharmony_ci ZERO_BLOCK blockq-2*mmsize, 32, 8, m6 997cabdff1aSopenharmony_ci RET 998cabdff1aSopenharmony_ci 999cabdff1aSopenharmony_cicglobal vp9_%1_%3_8x8_add_12, 4, 6 + ARCH_X86_64, 16, \ 1000cabdff1aSopenharmony_ci 16 * mmsize + ARCH_X86_32 * 6 * mmsize, 
\ 1001cabdff1aSopenharmony_ci dst, stride, block, eob 1002cabdff1aSopenharmony_ci mova m0, [pw_4095] 1003cabdff1aSopenharmony_ci jmp mangle(private_prefix %+ _ %+ vp9_%1_%3_8x8_add_10 %+ SUFFIX).body 1004cabdff1aSopenharmony_ci%endmacro 1005cabdff1aSopenharmony_ci 1006cabdff1aSopenharmony_ciINIT_XMM sse2 1007cabdff1aSopenharmony_ciIADST8_FN idct, IDCT8, iadst, IADST8, row 1008cabdff1aSopenharmony_ciIADST8_FN iadst, IADST8, idct, IDCT8, col 1009cabdff1aSopenharmony_ciIADST8_FN iadst, IADST8, iadst, IADST8, default 1010cabdff1aSopenharmony_ci 1011cabdff1aSopenharmony_ci%macro IDCT16_1D 1-4 4 * mmsize, 65, 67 ; src, src_stride, stack_offset, mm32bit_stack_offset 1012cabdff1aSopenharmony_ci IDCT8_1D %1, [pd_8192], [pd_3fff], %2 * 2, %4 ; m0-3=t0-3a, m4-5/m8|r67/m7=t4-7 1013cabdff1aSopenharmony_ci ; SCRATCH 6, 8, rsp+(%4+0)*mmsize ; t6 1014cabdff1aSopenharmony_ci SCRATCH 0, 15, rsp+(%4+7)*mmsize ; t0a 1015cabdff1aSopenharmony_ci SCRATCH 1, 14, rsp+(%4+6)*mmsize ; t1a 1016cabdff1aSopenharmony_ci SCRATCH 2, 13, rsp+(%4+5)*mmsize ; t2a 1017cabdff1aSopenharmony_ci SCRATCH 3, 12, rsp+(%4+4)*mmsize ; t3a 1018cabdff1aSopenharmony_ci SCRATCH 4, 11, rsp+(%4+3)*mmsize ; t4 1019cabdff1aSopenharmony_ci mova [rsp+(%3+0)*mmsize], m5 ; t5 1020cabdff1aSopenharmony_ci mova [rsp+(%3+1)*mmsize], m7 ; t7 1021cabdff1aSopenharmony_ci 1022cabdff1aSopenharmony_ci mova m0, [%1+ 1*%2] ; in1 1023cabdff1aSopenharmony_ci mova m3, [%1+ 7*%2] ; in7 1024cabdff1aSopenharmony_ci mova m4, [%1+ 9*%2] ; in9 1025cabdff1aSopenharmony_ci mova m7, [%1+15*%2] ; in15 1026cabdff1aSopenharmony_ci 1027cabdff1aSopenharmony_ci SUMSUB_MUL 0, 7, 1, 2, 16305, 1606 ; m0=t15a, m7=t8a 1028cabdff1aSopenharmony_ci SUMSUB_MUL 4, 3, 1, 2, 10394, 12665 ; m4=t14a, m3=t9a 1029cabdff1aSopenharmony_ci SUMSUB_BA d, 3, 7, 1 ; m3=t8, m7=t9 1030cabdff1aSopenharmony_ci SUMSUB_BA d, 4, 0, 1 ; m4=t15,m0=t14 1031cabdff1aSopenharmony_ci SUMSUB_MUL 0, 7, 1, 2, 15137, 6270 ; m0=t14a, m7=t9a 1032cabdff1aSopenharmony_ci 
1033cabdff1aSopenharmony_ci mova m1, [%1+ 3*%2] ; in3 1034cabdff1aSopenharmony_ci mova m2, [%1+ 5*%2] ; in5 1035cabdff1aSopenharmony_ci mova m5, [%1+11*%2] ; in11 1036cabdff1aSopenharmony_ci mova m6, [%1+13*%2] ; in13 1037cabdff1aSopenharmony_ci 1038cabdff1aSopenharmony_ci SCRATCH 0, 9, rsp+(%4+1)*mmsize 1039cabdff1aSopenharmony_ci SCRATCH 7, 10, rsp+(%4+2)*mmsize 1040cabdff1aSopenharmony_ci 1041cabdff1aSopenharmony_ci SUMSUB_MUL 2, 5, 0, 7, 14449, 7723 ; m2=t13a, m5=t10a 1042cabdff1aSopenharmony_ci SUMSUB_MUL 6, 1, 0, 7, 4756, 15679 ; m6=t12a, m1=t11a 1043cabdff1aSopenharmony_ci SUMSUB_BA d, 5, 1, 0 ; m5=t11,m1=t10 1044cabdff1aSopenharmony_ci SUMSUB_BA d, 2, 6, 0 ; m2=t12,m6=t13 1045cabdff1aSopenharmony_ci NEGD m1 ; m1=-t10 1046cabdff1aSopenharmony_ci SUMSUB_MUL 1, 6, 0, 7, 15137, 6270 ; m1=t13a, m6=t10a 1047cabdff1aSopenharmony_ci 1048cabdff1aSopenharmony_ci UNSCRATCH 7, 10, rsp+(%4+2)*mmsize 1049cabdff1aSopenharmony_ci SUMSUB_BA d, 5, 3, 0 ; m5=t8a, m3=t11a 1050cabdff1aSopenharmony_ci SUMSUB_BA d, 6, 7, 0 ; m6=t9, m7=t10 1051cabdff1aSopenharmony_ci SUMSUB_BA d, 2, 4, 0 ; m2=t15a,m4=t12a 1052cabdff1aSopenharmony_ci SCRATCH 5, 10, rsp+(%4+2)*mmsize 1053cabdff1aSopenharmony_ci SUMSUB_MUL 4, 3, 0, 5, 11585, 11585 ; m4=t12, m3=t11 1054cabdff1aSopenharmony_ci UNSCRATCH 0, 9, rsp+(%4+1)*mmsize 1055cabdff1aSopenharmony_ci SUMSUB_BA d, 1, 0, 5 ; m1=t14, m0=t13 1056cabdff1aSopenharmony_ci SCRATCH 6, 9, rsp+(%4+1)*mmsize 1057cabdff1aSopenharmony_ci SUMSUB_MUL 0, 7, 6, 5, 11585, 11585 ; m0=t13a,m7=t10a 1058cabdff1aSopenharmony_ci 1059cabdff1aSopenharmony_ci ; order: 15|r74,14|r73,13|r72,12|r71,11|r70,r65,8|r67,r66,10|r69,9|r68,7,3,4,0,1,2 1060cabdff1aSopenharmony_ci ; free: 6,5 1061cabdff1aSopenharmony_ci 1062cabdff1aSopenharmony_ci UNSCRATCH 5, 15, rsp+(%4+7)*mmsize 1063cabdff1aSopenharmony_ci SUMSUB_BA d, 2, 5, 6 ; m2=out0, m5=out15 1064cabdff1aSopenharmony_ci SCRATCH 5, 15, rsp+(%4+7)*mmsize 1065cabdff1aSopenharmony_ci UNSCRATCH 5, 14, rsp+(%4+6)*mmsize 
1066cabdff1aSopenharmony_ci SUMSUB_BA d, 1, 5, 6 ; m1=out1, m5=out14 1067cabdff1aSopenharmony_ci SCRATCH 5, 14, rsp+(%4+6)*mmsize 1068cabdff1aSopenharmony_ci UNSCRATCH 5, 13, rsp+(%4+5)*mmsize 1069cabdff1aSopenharmony_ci SUMSUB_BA d, 0, 5, 6 ; m0=out2, m5=out13 1070cabdff1aSopenharmony_ci SCRATCH 5, 13, rsp+(%4+5)*mmsize 1071cabdff1aSopenharmony_ci UNSCRATCH 5, 12, rsp+(%4+4)*mmsize 1072cabdff1aSopenharmony_ci SUMSUB_BA d, 4, 5, 6 ; m4=out3, m5=out12 1073cabdff1aSopenharmony_ci SCRATCH 5, 12, rsp+(%4+4)*mmsize 1074cabdff1aSopenharmony_ci UNSCRATCH 5, 11, rsp+(%4+3)*mmsize 1075cabdff1aSopenharmony_ci SUMSUB_BA d, 3, 5, 6 ; m3=out4, m5=out11 1076cabdff1aSopenharmony_ci SCRATCH 4, 11, rsp+(%4+3)*mmsize 1077cabdff1aSopenharmony_ci mova m4, [rsp+(%3+0)*mmsize] 1078cabdff1aSopenharmony_ci SUMSUB_BA d, 7, 4, 6 ; m7=out5, m4=out10 1079cabdff1aSopenharmony_ci mova [rsp+(%3+0)*mmsize], m5 1080cabdff1aSopenharmony_ci UNSCRATCH 5, 8, rsp+(%4+0)*mmsize 1081cabdff1aSopenharmony_ci UNSCRATCH 6, 9, rsp+(%4+1)*mmsize 1082cabdff1aSopenharmony_ci SCRATCH 2, 8, rsp+(%4+0)*mmsize 1083cabdff1aSopenharmony_ci SCRATCH 1, 9, rsp+(%4+1)*mmsize 1084cabdff1aSopenharmony_ci UNSCRATCH 1, 10, rsp+(%4+2)*mmsize 1085cabdff1aSopenharmony_ci SCRATCH 0, 10, rsp+(%4+2)*mmsize 1086cabdff1aSopenharmony_ci mova m0, [rsp+(%3+1)*mmsize] 1087cabdff1aSopenharmony_ci SUMSUB_BA d, 6, 5, 2 ; m6=out6, m5=out9 1088cabdff1aSopenharmony_ci SUMSUB_BA d, 1, 0, 2 ; m1=out7, m0=out8 1089cabdff1aSopenharmony_ci 1090cabdff1aSopenharmony_ci SWAP 0, 3, 1, 7, 2, 6, 4 1091cabdff1aSopenharmony_ci 1092cabdff1aSopenharmony_ci ; output order: 8-11|r67-70=out0-3 1093cabdff1aSopenharmony_ci ; 0-6,r65=out4-11 1094cabdff1aSopenharmony_ci ; 12-15|r71-74=out12-15 1095cabdff1aSopenharmony_ci%endmacro 1096cabdff1aSopenharmony_ci 1097cabdff1aSopenharmony_ciINIT_XMM sse2 1098cabdff1aSopenharmony_cicglobal vp9_idct_idct_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \ 1099cabdff1aSopenharmony_ci 67 * mmsize + ARCH_X86_32 * 8 * mmsize, \ 
1100cabdff1aSopenharmony_ci dst, stride, block, eob 1101cabdff1aSopenharmony_ci mova m0, [pw_1023] 1102cabdff1aSopenharmony_ci cmp eobd, 1 1103cabdff1aSopenharmony_ci jg .idctfull 1104cabdff1aSopenharmony_ci 1105cabdff1aSopenharmony_ci ; dc-only - the 10bit version can be done entirely in 32bit, since the max 1106cabdff1aSopenharmony_ci ; coef values are 17+sign bit, and the coef is 14bit, so 31+sign easily 1107cabdff1aSopenharmony_ci ; fits in 32bit 1108cabdff1aSopenharmony_ci DEFINE_ARGS dst, stride, block, coef 1109cabdff1aSopenharmony_ci pxor m2, m2 1110cabdff1aSopenharmony_ci DC_ONLY 6, m2 1111cabdff1aSopenharmony_ci movd m1, coefd 1112cabdff1aSopenharmony_ci pshuflw m1, m1, q0000 1113cabdff1aSopenharmony_ci punpcklqdq m1, m1 1114cabdff1aSopenharmony_ci DEFINE_ARGS dst, stride, cnt 1115cabdff1aSopenharmony_ci mov cntd, 8 1116cabdff1aSopenharmony_ci.loop_dc: 1117cabdff1aSopenharmony_ci STORE_2x8 3, 4, 1, m2, m0, dstq, mmsize 1118cabdff1aSopenharmony_ci STORE_2x8 3, 4, 1, m2, m0, dstq+strideq, mmsize 1119cabdff1aSopenharmony_ci lea dstq, [dstq+strideq*2] 1120cabdff1aSopenharmony_ci dec cntd 1121cabdff1aSopenharmony_ci jg .loop_dc 1122cabdff1aSopenharmony_ci RET 1123cabdff1aSopenharmony_ci 1124cabdff1aSopenharmony_ci.idctfull: 1125cabdff1aSopenharmony_ci mova [rsp+64*mmsize], m0 1126cabdff1aSopenharmony_ci DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak 1127cabdff1aSopenharmony_ci%if ARCH_X86_64 1128cabdff1aSopenharmony_ci mov dstbakq, dstq 1129cabdff1aSopenharmony_ci movsxd cntq, cntd 1130cabdff1aSopenharmony_ci%endif 1131cabdff1aSopenharmony_ci%ifdef PIC 1132cabdff1aSopenharmony_ci lea ptrq, [default_16x16] 1133cabdff1aSopenharmony_ci movzx cntd, byte [ptrq+cntq-1] 1134cabdff1aSopenharmony_ci%else 1135cabdff1aSopenharmony_ci movzx cntd, byte [default_16x16+cntq-1] 1136cabdff1aSopenharmony_ci%endif 1137cabdff1aSopenharmony_ci mov skipd, 4 1138cabdff1aSopenharmony_ci sub skipd, cntd 1139cabdff1aSopenharmony_ci mov ptrq, rsp 
1140cabdff1aSopenharmony_ci.loop_1: 1141cabdff1aSopenharmony_ci IDCT16_1D blockq 1142cabdff1aSopenharmony_ci 1143cabdff1aSopenharmony_ci TRANSPOSE4x4D 0, 1, 2, 3, 7 1144cabdff1aSopenharmony_ci mova [ptrq+ 1*mmsize], m0 1145cabdff1aSopenharmony_ci mova [ptrq+ 5*mmsize], m1 1146cabdff1aSopenharmony_ci mova [ptrq+ 9*mmsize], m2 1147cabdff1aSopenharmony_ci mova [ptrq+13*mmsize], m3 1148cabdff1aSopenharmony_ci mova m7, [rsp+65*mmsize] 1149cabdff1aSopenharmony_ci TRANSPOSE4x4D 4, 5, 6, 7, 0 1150cabdff1aSopenharmony_ci mova [ptrq+ 2*mmsize], m4 1151cabdff1aSopenharmony_ci mova [ptrq+ 6*mmsize], m5 1152cabdff1aSopenharmony_ci mova [ptrq+10*mmsize], m6 1153cabdff1aSopenharmony_ci mova [ptrq+14*mmsize], m7 1154cabdff1aSopenharmony_ci UNSCRATCH 0, 8, rsp+67*mmsize 1155cabdff1aSopenharmony_ci UNSCRATCH 1, 9, rsp+68*mmsize 1156cabdff1aSopenharmony_ci UNSCRATCH 2, 10, rsp+69*mmsize 1157cabdff1aSopenharmony_ci UNSCRATCH 3, 11, rsp+70*mmsize 1158cabdff1aSopenharmony_ci TRANSPOSE4x4D 0, 1, 2, 3, 7 1159cabdff1aSopenharmony_ci mova [ptrq+ 0*mmsize], m0 1160cabdff1aSopenharmony_ci mova [ptrq+ 4*mmsize], m1 1161cabdff1aSopenharmony_ci mova [ptrq+ 8*mmsize], m2 1162cabdff1aSopenharmony_ci mova [ptrq+12*mmsize], m3 1163cabdff1aSopenharmony_ci UNSCRATCH 4, 12, rsp+71*mmsize 1164cabdff1aSopenharmony_ci UNSCRATCH 5, 13, rsp+72*mmsize 1165cabdff1aSopenharmony_ci UNSCRATCH 6, 14, rsp+73*mmsize 1166cabdff1aSopenharmony_ci UNSCRATCH 7, 15, rsp+74*mmsize 1167cabdff1aSopenharmony_ci TRANSPOSE4x4D 4, 5, 6, 7, 0 1168cabdff1aSopenharmony_ci mova [ptrq+ 3*mmsize], m4 1169cabdff1aSopenharmony_ci mova [ptrq+ 7*mmsize], m5 1170cabdff1aSopenharmony_ci mova [ptrq+11*mmsize], m6 1171cabdff1aSopenharmony_ci mova [ptrq+15*mmsize], m7 1172cabdff1aSopenharmony_ci add ptrq, 16 * mmsize 1173cabdff1aSopenharmony_ci add blockq, mmsize 1174cabdff1aSopenharmony_ci dec cntd 1175cabdff1aSopenharmony_ci jg .loop_1 1176cabdff1aSopenharmony_ci 1177cabdff1aSopenharmony_ci ; zero-pad the remainder (skipped cols) 
1178cabdff1aSopenharmony_ci test skipd, skipd 1179cabdff1aSopenharmony_ci jz .end 1180cabdff1aSopenharmony_ci add skipd, skipd 1181cabdff1aSopenharmony_ci lea blockq, [blockq+skipq*(mmsize/2)] 1182cabdff1aSopenharmony_ci pxor m0, m0 1183cabdff1aSopenharmony_ci.loop_z: 1184cabdff1aSopenharmony_ci mova [ptrq+mmsize*0], m0 1185cabdff1aSopenharmony_ci mova [ptrq+mmsize*1], m0 1186cabdff1aSopenharmony_ci mova [ptrq+mmsize*2], m0 1187cabdff1aSopenharmony_ci mova [ptrq+mmsize*3], m0 1188cabdff1aSopenharmony_ci mova [ptrq+mmsize*4], m0 1189cabdff1aSopenharmony_ci mova [ptrq+mmsize*5], m0 1190cabdff1aSopenharmony_ci mova [ptrq+mmsize*6], m0 1191cabdff1aSopenharmony_ci mova [ptrq+mmsize*7], m0 1192cabdff1aSopenharmony_ci add ptrq, 8 * mmsize 1193cabdff1aSopenharmony_ci dec skipd 1194cabdff1aSopenharmony_ci jg .loop_z 1195cabdff1aSopenharmony_ci.end: 1196cabdff1aSopenharmony_ci 1197cabdff1aSopenharmony_ci DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak 1198cabdff1aSopenharmony_ci lea stride3q, [strideq*3] 1199cabdff1aSopenharmony_ci mov cntd, 4 1200cabdff1aSopenharmony_ci mov ptrq, rsp 1201cabdff1aSopenharmony_ci.loop_2: 1202cabdff1aSopenharmony_ci IDCT16_1D ptrq 1203cabdff1aSopenharmony_ci 1204cabdff1aSopenharmony_ci pxor m7, m7 1205cabdff1aSopenharmony_ci lea dstq, [dstq+strideq*4] 1206cabdff1aSopenharmony_ci ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6 1207cabdff1aSopenharmony_ci lea dstq, [dstq+strideq*4] 1208cabdff1aSopenharmony_ci mova m0, [rsp+65*mmsize] 1209cabdff1aSopenharmony_ci mova m1, [rsp+64*mmsize] 1210cabdff1aSopenharmony_ci mova m2, [pd_32] 1211cabdff1aSopenharmony_ci ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6 1212cabdff1aSopenharmony_ci 1213cabdff1aSopenharmony_ci%if ARCH_X86_64 1214cabdff1aSopenharmony_ci DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst 1215cabdff1aSopenharmony_ci%else 1216cabdff1aSopenharmony_ci mov dstq, dstm 1217cabdff1aSopenharmony_ci%endif 1218cabdff1aSopenharmony_ci UNSCRATCH 0, 8, 
rsp+67*mmsize 1219cabdff1aSopenharmony_ci UNSCRATCH 4, 9, rsp+68*mmsize 1220cabdff1aSopenharmony_ci UNSCRATCH 5, 10, rsp+69*mmsize 1221cabdff1aSopenharmony_ci UNSCRATCH 3, 11, rsp+70*mmsize 1222cabdff1aSopenharmony_ci ROUND_AND_STORE_4x4 0, 4, 5, 3, m7, m1, m2, 6 1223cabdff1aSopenharmony_ci%if ARCH_X86_64 1224cabdff1aSopenharmony_ci DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak 1225cabdff1aSopenharmony_ci lea dstq, [dstbakq+stride3q*4] 1226cabdff1aSopenharmony_ci%else 1227cabdff1aSopenharmony_ci lea dstq, [dstq+stride3q*4] 1228cabdff1aSopenharmony_ci%endif 1229cabdff1aSopenharmony_ci UNSCRATCH 4, 12, rsp+71*mmsize 1230cabdff1aSopenharmony_ci UNSCRATCH 5, 13, rsp+72*mmsize 1231cabdff1aSopenharmony_ci UNSCRATCH 6, 14, rsp+73*mmsize 1232cabdff1aSopenharmony_ci UNSCRATCH 0, 15, rsp+74*mmsize 1233cabdff1aSopenharmony_ci ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6 1234cabdff1aSopenharmony_ci 1235cabdff1aSopenharmony_ci add ptrq, mmsize 1236cabdff1aSopenharmony_ci%if ARCH_X86_64 1237cabdff1aSopenharmony_ci add dstbakq, 8 1238cabdff1aSopenharmony_ci mov dstq, dstbakq 1239cabdff1aSopenharmony_ci%else 1240cabdff1aSopenharmony_ci add dword dstm, 8 1241cabdff1aSopenharmony_ci mov dstq, dstm 1242cabdff1aSopenharmony_ci%endif 1243cabdff1aSopenharmony_ci dec cntd 1244cabdff1aSopenharmony_ci jg .loop_2 1245cabdff1aSopenharmony_ci 1246cabdff1aSopenharmony_ci ; m7 is still zero 1247cabdff1aSopenharmony_ci ZERO_BLOCK blockq-4*mmsize, 64, 16, m7 1248cabdff1aSopenharmony_ci RET 1249cabdff1aSopenharmony_ci 1250cabdff1aSopenharmony_ciINIT_XMM sse2 1251cabdff1aSopenharmony_cicglobal vp9_idct_idct_16x16_add_12, 4, 6 + ARCH_X86_64, 16, \ 1252cabdff1aSopenharmony_ci 67 * mmsize + ARCH_X86_32 * 8 * mmsize, \ 1253cabdff1aSopenharmony_ci dst, stride, block, eob 1254cabdff1aSopenharmony_ci mova m0, [pw_4095] 1255cabdff1aSopenharmony_ci cmp eobd, 1 1256cabdff1aSopenharmony_ci jg mangle(private_prefix %+ _ %+ vp9_idct_idct_16x16_add_10 %+ SUFFIX).idctfull 1257cabdff1aSopenharmony_ci 
; ---- dc-only path ----------------------------------------------------------
    ; dc-only - unfortunately, this one can overflow, since coefs are 19+sign
    ; bpp, and 19+14+sign does not fit in 32bit, so we do 2-stage multiplies
    DEFINE_ARGS dst, stride, block, coef, coefl
    pxor                m2, m2
    DC_ONLY_64BIT        6, m2
    ; replicate the low word of coef into every word lane of m1
    movd                m1, coefd
    pshuflw             m1, m1, q0000
    punpcklqdq          m1, m1
    DEFINE_ARGS dst, stride, cnt
    mov               cntd, 8                       ; 8 iterations, 2 rows each
.loop_dc:
    STORE_2x8            3, 4, 1, m2, m0, dstq,         mmsize
    STORE_2x8            3, 4, 1, m2, m0, dstq+strideq, mmsize
    lea               dstq, [dstq+strideq*2]
    dec               cntd
    jg .loop_dc
    RET

;------------------------------------------------------------------------------
; IADST16_1D src
;
; One 1-D pass of the 16-point inverse ADST, working on dword lanes.
;   %1 = base address of the input; input row k is loaded from
;        [%1 + k*4*mmsize]
;
; Stack slots rsp+65*mmsize .. rsp+77*mmsize are used as temporaries; slots
; handled through SCRATCH/UNSCRATCH are paired with m8-15 (see the notes
; below). The two SWAPs at the end rename m0-7 so that the results sit in the
; register order the caller stores them in.
;
; r65-69 are available for spills
; r70-77 are available on x86-32 only (x86-64 should use m8-15)
; output should be in m8-11|r70-73, m0-6,r65 and m12-15|r74-77
;------------------------------------------------------------------------------
%macro IADST16_1D 1 ; src
    mova                m0, [%1+ 0*4*mmsize]        ; in0
    mova                m1, [%1+ 7*4*mmsize]        ; in7
    mova                m2, [%1+ 8*4*mmsize]        ; in8
    mova                m3, [%1+15*4*mmsize]        ; in15
    SUMSUB_MUL_D         3, 0, 4, 5, 16364,  804    ; m3/4=t0, m0/5=t1
    SUMSUB_MUL_D         1, 2, 6, 7, 11003, 12140   ; m1/6=t8, m2/7=t9
    SCRATCH              0, 8, rsp+70*mmsize
    SUMSUB_PACK_D        1, 3, 6, 4, 0              ; m1=t0a, m3=t8a
    UNSCRATCH            0, 8, rsp+70*mmsize
    SUMSUB_PACK_D        2, 0, 7, 5, 4              ; m2=t1a, m0=t9a
    mova   [rsp+67*mmsize], m1
    SCRATCH              2, 9, rsp+71*mmsize
    SCRATCH              3, 12, rsp+74*mmsize
    SCRATCH              0, 13, rsp+75*mmsize

    mova                m0, [%1+ 3*4*mmsize]        ; in3
    mova                m1, [%1+ 4*4*mmsize]        ; in4
    mova                m2, [%1+11*4*mmsize]        ; in11
    mova                m3, [%1+12*4*mmsize]        ; in12
    SUMSUB_MUL_D         2, 1, 4, 5, 14811,  7005   ; m2/4=t4, m1/5=t5
    SUMSUB_MUL_D         0, 3, 6, 7,  5520, 15426   ; m0/6=t12, m3/7=t13
    SCRATCH              1, 10, rsp+72*mmsize
    SUMSUB_PACK_D        0, 2, 6, 4, 1              ; m0=t4a, m2=t12a
    UNSCRATCH            1, 10, rsp+72*mmsize
    SUMSUB_PACK_D        3, 1, 7, 5, 4              ; m3=t5a, m1=t13a
    SCRATCH              0, 15, rsp+77*mmsize
    SCRATCH              3, 11, rsp+73*mmsize

    UNSCRATCH            0, 12, rsp+74*mmsize       ; t8a
    UNSCRATCH            3, 13, rsp+75*mmsize       ; t9a
    SUMSUB_MUL_D         0, 3, 4, 5, 16069,  3196   ; m0/4=t8, m3/5=t9
    SUMSUB_MUL_D         1, 2, 6, 7,  3196, 16069   ; m1/6=t13, m2/7=t12
    SCRATCH              1, 12, rsp+74*mmsize
    SUMSUB_PACK_D        2, 0, 7, 4, 1              ; m2=t8a, m0=t12a
    UNSCRATCH            1, 12, rsp+74*mmsize
    SUMSUB_PACK_D        1, 3, 6, 5, 4              ; m1=t9a, m3=t13a
    mova   [rsp+65*mmsize], m2
    mova   [rsp+66*mmsize], m1
    SCRATCH              0, 8, rsp+70*mmsize
    SCRATCH              3, 12, rsp+74*mmsize

    mova                m0, [%1+ 2*4*mmsize]        ; in2
    mova                m1, [%1+ 5*4*mmsize]        ; in5
    mova                m2, [%1+10*4*mmsize]        ; in10
    mova                m3, [%1+13*4*mmsize]        ; in13
    SUMSUB_MUL_D         3, 0, 4, 5, 15893,  3981   ; m3/4=t2, m0/5=t3
    SUMSUB_MUL_D         1, 2, 6, 7,  8423, 14053   ; m1/6=t10, m2/7=t11
    SCRATCH              0, 10, rsp+72*mmsize
    SUMSUB_PACK_D        1, 3, 6, 4, 0              ; m1=t2a, m3=t10a
    UNSCRATCH            0, 10, rsp+72*mmsize
    SUMSUB_PACK_D        2, 0, 7, 5, 4              ; m2=t3a, m0=t11a
    mova   [rsp+68*mmsize], m1
    mova   [rsp+69*mmsize], m2
    SCRATCH              3, 13, rsp+75*mmsize
    SCRATCH              0, 14, rsp+76*mmsize

    mova                m0, [%1+ 1*4*mmsize]        ; in1
    mova                m1, [%1+ 6*4*mmsize]        ; in6
    mova                m2, [%1+ 9*4*mmsize]        ; in9
    mova                m3, [%1+14*4*mmsize]        ; in14
    SUMSUB_MUL_D         2, 1, 4, 5, 13160,  9760   ; m2/4=t6, m1/5=t7
    SUMSUB_MUL_D         0, 3, 6, 7,  2404, 16207   ; m0/6=t14, m3/7=t15
    SCRATCH              1, 10, rsp+72*mmsize
    SUMSUB_PACK_D        0, 2, 6, 4, 1              ; m0=t6a, m2=t14a
    UNSCRATCH            1, 10, rsp+72*mmsize
    SUMSUB_PACK_D        3, 1, 7, 5, 4              ; m3=t7a, m1=t15a

    UNSCRATCH            4, 13, rsp+75*mmsize       ; t10a
    UNSCRATCH            5, 14, rsp+76*mmsize       ; t11a
    SCRATCH              0, 13, rsp+75*mmsize
    SCRATCH              3, 14, rsp+76*mmsize
    SUMSUB_MUL_D         4, 5, 6, 7,  9102, 13623   ; m4/6=t10, m5/7=t11
    SUMSUB_MUL_D         1, 2, 0, 3, 13623,  9102   ; m1/0=t15, m2/3=t14
    SCRATCH              0, 10, rsp+72*mmsize
    SUMSUB_PACK_D        2, 4, 3, 6, 0              ; m2=t10a, m4=t14a
    UNSCRATCH            0, 10, rsp+72*mmsize
    SUMSUB_PACK_D        1, 5, 0, 7, 6              ; m1=t11a, m5=t15a

    UNSCRATCH            0, 8, rsp+70*mmsize        ; t12a
    UNSCRATCH            3, 12, rsp+74*mmsize       ; t13a
    SCRATCH              2, 8, rsp+70*mmsize
    SCRATCH              1, 12, rsp+74*mmsize
    SUMSUB_MUL_D         0, 3, 1, 2, 15137,  6270   ; m0/1=t12, m3/2=t13
    SUMSUB_MUL_D         5, 4, 7, 6,  6270, 15137   ; m5/7=t15, m4/6=t14
    SCRATCH              2, 10, rsp+72*mmsize
    SUMSUB_PACK_D        4, 0, 6, 1, 2              ; m4=out2, m0=t14a
    UNSCRATCH            2, 10, rsp+72*mmsize
    SUMSUB_PACK_D        5, 3, 7, 2, 1              ; m5=-out13, m3=t15a
    NEGD                m5                          ; m5=out13

    UNSCRATCH            1, 9, rsp+71*mmsize        ; t1a
    mova                m2, [rsp+68*mmsize]         ; t2a
    UNSCRATCH            6, 13, rsp+75*mmsize       ; t6a
    UNSCRATCH            7, 14, rsp+76*mmsize       ; t7a
    SCRATCH              4, 10, rsp+72*mmsize
    SCRATCH              5, 13, rsp+75*mmsize
    UNSCRATCH            4, 15, rsp+77*mmsize       ; t4a
    UNSCRATCH            5, 11, rsp+73*mmsize       ; t5a
    SCRATCH              0, 14, rsp+76*mmsize
    SCRATCH              3, 15, rsp+77*mmsize
    mova                m0, [rsp+67*mmsize]         ; t0a
    SUMSUB_BA         d, 4, 0, 3                    ; m4=t0, m0=t4
    SUMSUB_BA         d, 5, 1, 3                    ; m5=t1, m1=t5
    SUMSUB_BA         d, 6, 2, 3                    ; m6=t2, m2=t6
    SCRATCH              4, 9, rsp+71*mmsize
    mova                m3, [rsp+69*mmsize]         ; t3a
    SUMSUB_BA         d, 7, 3, 4                    ; m7=t3, m3=t7

    mova   [rsp+67*mmsize], m5
    mova   [rsp+68*mmsize], m6
    mova   [rsp+69*mmsize], m7
    SUMSUB_MUL_D         0, 1, 4, 5, 15137,  6270   ; m0/4=t4a, m1/5=t5a
    SUMSUB_MUL_D         3, 2, 7, 6,  6270, 15137   ; m3/7=t7a, m2/6=t6a
    SCRATCH              1, 11, rsp+73*mmsize
    SUMSUB_PACK_D        2, 0, 6, 4, 1              ; m2=-out3, m0=t6
    NEGD                m2                          ; m2=out3
    UNSCRATCH            1, 11, rsp+73*mmsize
    SUMSUB_PACK_D        3, 1, 7, 5, 4              ; m3=out12, m1=t7
    SCRATCH              2, 11, rsp+73*mmsize
    UNSCRATCH            2, 12, rsp+74*mmsize       ; t11a
    SCRATCH              3, 12, rsp+74*mmsize

    UNSCRATCH            3, 8, rsp+70*mmsize        ; t10a
    mova                m4, [rsp+65*mmsize]         ; t8a
    mova                m5, [rsp+66*mmsize]         ; t9a
    SUMSUB_BA         d, 3, 4, 6                    ; m3=-out1, m4=t10
    NEGD                m3                          ; m3=out1
    SUMSUB_BA         d, 2, 5, 6                    ; m2=out14, m5=t11
    UNSCRATCH            6, 9, rsp+71*mmsize        ; t0
    UNSCRATCH            7, 14, rsp+76*mmsize       ; t14a
    SCRATCH              3, 9, rsp+71*mmsize
    SCRATCH              2, 14, rsp+76*mmsize

    SUMSUB_MUL           1, 0, 2, 3, 11585, 11585   ; m1=out4, m0=out11
    mova   [rsp+65*mmsize], m0
    SUMSUB_MUL           5, 4, 2, 3, 11585, 11585   ; m5=out6, m4=out9
    UNSCRATCH            0, 15, rsp+77*mmsize       ; t15a
    SUMSUB_MUL           7, 0, 2, 3, 11585, m11585  ; m7=out10, m0=out5

    mova                m2, [rsp+68*mmsize]         ; t2
    SUMSUB_BA         d, 2, 6, 3                    ; m2=out0, m6=t2a
    SCRATCH              2, 8, rsp+70*mmsize
    mova                m2, [rsp+67*mmsize]         ; t1
    mova                m3, [rsp+69*mmsize]         ; t3
    mova   [rsp+67*mmsize], m7                      ; park out10 while m7 is a temp
    SUMSUB_BA         d, 3, 2, 7                    ; m3=-out15, m2=t3a
    NEGD                m3                          ; m3=out15
    SCRATCH              3, 15, rsp+77*mmsize
    SUMSUB_MUL           6, 2, 7, 3, 11585, m11585  ; m6=out8, m2=out7
    mova                m7, [rsp+67*mmsize]         ; reload out10

    SWAP                 0, 1
    SWAP                 2, 5, 4, 6, 7, 3
%endmacro

;------------------------------------------------------------------------------
; IADST16_FN name1, MACRO1, slot1, name2, MACRO2, slot2, table
;
; Emits vp9_<name1>_<name2>_16x16_add_10 and ..._add_12.
;   %1 = first transform name, used only to build the symbol name
;   %2 = prefix of the first-pass 1-D macro, invoked as %2_1D on blockq
;   %3 = first stack slot (in mmsize units) that the first pass's m8-15
;        results are UNSCRATCHed from
;   %4 = second transform name, used only to build the symbol name
;   %5 = prefix of the second-pass 1-D macro, invoked as %5_1D on ptrq
;   %6 = same as %3, for the second pass
;   %7 = prefix of the byte table %7_16x16, indexed with eob-1
;
; Stack use: the first 64*mmsize bytes hold the transposed first-pass output
; (4 iterations x 16*mmsize), slot 64 holds the constant loaded into m0 at
; entry (pw_1023 or pw_4095 - presumably the pixel clip maximum, it is handed
; to ROUND_AND_STORE_4x4), and slots 65 and up belong to the 1-D macros.
; The _12 entry point only loads a different constant and jumps to .body.
;------------------------------------------------------------------------------
%macro IADST16_FN 7
cglobal vp9_%1_%4_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
                                70 * mmsize + ARCH_X86_32 * 8 * mmsize, \
                                dst, stride, block, eob
    mova                m0, [pw_1023]

.body:
    mova   [rsp+64*mmsize], m0
    ; the 4th argument (eob) is renamed to cnt from here on
    DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
%if ARCH_X86_64
    mov            dstbakq, dstq
    movsxd            cntq, cntd
%endif
    ; cnt = table[eob-1] = number of first-pass iterations to run
%ifdef PIC
    lea               ptrq, [%7_16x16]
    movzx             cntd, byte [ptrq+cntq-1]
%else
    movzx             cntd, byte [%7_16x16+cntq-1]
%endif
    mov              skipd, 4
    sub              skipd, cntd                    ; skip = 4 - cnt
    mov               ptrq, rsp
.loop_1:
    %2_1D           blockq

    TRANSPOSE4x4D        0, 1, 2, 3, 7
    mova  [ptrq+ 1*mmsize], m0
    mova  [ptrq+ 5*mmsize], m1
    mova  [ptrq+ 9*mmsize], m2
    mova  [ptrq+13*mmsize], m3
    mova                m7, [rsp+65*mmsize]
    TRANSPOSE4x4D        4, 5, 6, 7, 0
    mova  [ptrq+ 2*mmsize], m4
    mova  [ptrq+ 6*mmsize], m5
    mova  [ptrq+10*mmsize], m6
    mova  [ptrq+14*mmsize], m7
    UNSCRATCH            0, 8, rsp+(%3+0)*mmsize
    UNSCRATCH            1, 9, rsp+(%3+1)*mmsize
    UNSCRATCH            2, 10, rsp+(%3+2)*mmsize
    UNSCRATCH            3, 11, rsp+(%3+3)*mmsize
    TRANSPOSE4x4D        0, 1, 2, 3, 7
    mova  [ptrq+ 0*mmsize], m0
    mova  [ptrq+ 4*mmsize], m1
    mova  [ptrq+ 8*mmsize], m2
    mova  [ptrq+12*mmsize], m3
    UNSCRATCH            4, 12, rsp+(%3+4)*mmsize
    UNSCRATCH            5, 13, rsp+(%3+5)*mmsize
    UNSCRATCH            6, 14, rsp+(%3+6)*mmsize
    UNSCRATCH            7, 15, rsp+(%3+7)*mmsize
    TRANSPOSE4x4D        4, 5, 6, 7, 0
    mova  [ptrq+ 3*mmsize], m4
    mova  [ptrq+ 7*mmsize], m5
    mova  [ptrq+11*mmsize], m6
    mova  [ptrq+15*mmsize], m7
    add               ptrq, 16 * mmsize
    add             blockq, mmsize
    dec               cntd
    jg .loop_1

    ; zero-pad the remainder (skipped cols)
    test             skipd, skipd
    jz .end
    ; skip is doubled because .loop_z clears 8*mmsize per iteration, half of
    ; what one .loop_1 iteration writes; blockq moves on by (old skip)*mmsize
    ; so that it ends up at block+4*mmsize whatever cnt was
    add              skipd, skipd
    lea             blockq, [blockq+skipq*(mmsize/2)]
    pxor                m0, m0
.loop_z:
    mova   [ptrq+mmsize*0], m0
    mova   [ptrq+mmsize*1], m0
    mova   [ptrq+mmsize*2], m0
    mova   [ptrq+mmsize*3], m0
    mova   [ptrq+mmsize*4], m0
    mova   [ptrq+mmsize*5], m0
    mova   [ptrq+mmsize*6], m0
    mova   [ptrq+mmsize*7], m0
    add               ptrq, 8 * mmsize
    dec              skipd
    jg .loop_z
.end:

    ; second pass: 4 iterations over the intermediate buffer on the stack
    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
    lea           stride3q, [strideq*3]
    mov               cntd, 4
    mov               ptrq, rsp
.loop_2:
    %5_1D             ptrq

    pxor                m7, m7
    lea               dstq, [dstq+strideq*4]
    ROUND_AND_STORE_4x4  0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6
    lea               dstq, [dstq+strideq*4]
    mova                m0, [rsp+65*mmsize]
    mova                m1, [rsp+64*mmsize]
    mova                m2, [pd_32]
    ROUND_AND_STORE_4x4  4, 5, 6, 0, m7, m1, m2, 6

    ; back to the first row of this iteration: x86-64 swaps the names dst and
    ; dstbak for one store, x86-32 reloads dst from its stack argument
%if ARCH_X86_64
    DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst
%else
    mov               dstq, dstm
%endif
    UNSCRATCH            0, 8, rsp+(%6+0)*mmsize
    UNSCRATCH            4, 9, rsp+(%6+1)*mmsize
    UNSCRATCH            5, 10, rsp+(%6+2)*mmsize
    UNSCRATCH            3, 11, rsp+(%6+3)*mmsize
    ROUND_AND_STORE_4x4  0, 4, 5, 3, m7, m1, m2, 6
%if ARCH_X86_64
    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
    lea               dstq, [dstbakq+stride3q*4]
%else
    lea               dstq, [dstq+stride3q*4]
%endif
    UNSCRATCH            4, 12, rsp+(%6+4)*mmsize
    UNSCRATCH            5, 13, rsp+(%6+5)*mmsize
    UNSCRATCH            6, 14, rsp+(%6+6)*mmsize
    UNSCRATCH            0, 15, rsp+(%6+7)*mmsize
    ROUND_AND_STORE_4x4  4, 5, 6, 0, m7, m1, m2, 6

    ; next iteration: ptr advances by mmsize, dst by 8 bytes
    add               ptrq, mmsize
%if ARCH_X86_64
    add            dstbakq, 8
    mov               dstq, dstbakq
%else
    add         dword dstm, 8
    mov               dstq, dstm
%endif
    dec               cntd
    jg .loop_2

    ; m7 is still zero
    ; blockq was advanced by 4*mmsize during the first pass, hence the offset
    ZERO_BLOCK blockq-4*mmsize, 64, 16, m7
    RET

cglobal vp9_%1_%4_16x16_add_12, 4, 6 + ARCH_X86_64, 16, \
                                70 * mmsize + ARCH_X86_32 * 8 * mmsize, \
                                dst, stride, block, eob
    mova                m0, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_%1_%4_16x16_add_10 %+ SUFFIX).body
%endmacro

INIT_XMM sse2
IADST16_FN idct,  IDCT16,  67, iadst, IADST16, 70, row
IADST16_FN iadst, IADST16, 70, idct,  IDCT16,  67, col
IADST16_FN iadst, IADST16, 70, iadst, IADST16, 70, default

;------------------------------------------------------------------------------
; IDCT32_1D pass, src [, src_stride]
;
; One 1-D pass of the 32-point inverse DCT.
;   %1 = pass number. Pass 1 transposes each group of four results and stores
;        it to ptrq; pass 2 hands each group to ROUND_AND_STORE_4x4 together
;        with [pd_32], 6 and the constant kept at rsp+256*mmsize, moving dstq
;        between the row groups.
;   %2 = base address of the input
;   %3 = byte distance between consecutive input rows (default 8 * mmsize)
;
; IDCT16_1D is run first with twice that stride, i.e. on every other input
; row; the remaining (odd-numbered) rows are loaded explicitly below.
; Pass 2 expects the caller's stride3 / dstbak (x86-64) or dstm (x86-32).
;------------------------------------------------------------------------------
%macro IDCT32_1D 2-3 8 * mmsize; pass[1/2], src, src_stride
    IDCT16_1D %2, 2 * %3, 272, 257
%if ARCH_X86_64
    mova  [rsp+257*mmsize], m8
    mova  [rsp+258*mmsize], m9
    mova  [rsp+259*mmsize], m10
    mova  [rsp+260*mmsize], m11
    mova  [rsp+261*mmsize], m12
    mova  [rsp+262*mmsize], m13
    mova  [rsp+263*mmsize], m14
    mova  [rsp+264*mmsize], m15
%endif
    mova  [rsp+265*mmsize], m0
    mova  [rsp+266*mmsize], m1
    mova  [rsp+267*mmsize], m2
    mova  [rsp+268*mmsize], m3
    mova  [rsp+269*mmsize], m4
    mova  [rsp+270*mmsize], m5
    mova  [rsp+271*mmsize], m6

    ; r257-260: t0-3
    ; r265-272: t4/5a/6a/7/8/9a/10/11a
    ; r261-264: t12a/13/14a/15
    ; r273-274 is free as scratch space, and 275-282 mirrors m8-15 on 32bit

    mova                m0, [%2+ 1*%3]              ; in1
    mova                m1, [%2+15*%3]              ; in15
    mova                m2, [%2+17*%3]              ; in17
    mova                m3, [%2+31*%3]              ; in31
    SUMSUB_MUL           0, 3, 4, 5, 16364,  804    ; m0=t31a, m3=t16a
    SUMSUB_MUL           2, 1, 4, 5, 11003, 12140   ; m2=t30a, m1=t17a
    SUMSUB_BA         d, 1, 3, 4                    ; m1=t16, m3=t17
    SUMSUB_BA         d, 2, 0, 4                    ; m2=t31, m0=t30
    SUMSUB_MUL           0, 3, 4, 5, 16069,  3196   ; m0=t30a, m3=t17a
    SCRATCH              0, 8, rsp+275*mmsize
    SCRATCH              2, 9, rsp+276*mmsize

    ; end of stage 1-3 first quart

    mova                m0, [%2+ 7*%3]              ; in7
    mova                m2, [%2+ 9*%3]              ; in9
    mova                m4, [%2+23*%3]              ; in23
    mova                m5, [%2+25*%3]              ; in25
    SUMSUB_MUL           2, 4, 6, 7, 14811,  7005   ; m2=t29a, m4=t18a
    SUMSUB_MUL           5, 0, 6, 7,  5520, 15426   ; m5=t28a, m0=t19a
    SUMSUB_BA         d, 4, 0, 6                    ; m4=t19, m0=t18
    SUMSUB_BA         d, 2, 5, 6                    ; m2=t28, m5=t29
    SUMSUB_MUL           5, 0, 6, 7,  3196, m16069  ; m5=t29a, m0=t18a

    ; end of stage 1-3 second quart

    SUMSUB_BA         d, 4, 1, 6                    ; m4=t16a, m1=t19a
    SUMSUB_BA         d, 0, 3, 6                    ; m0=t17, m3=t18
    UNSCRATCH            6, 8, rsp+275*mmsize       ; t30a
    UNSCRATCH            7, 9, rsp+276*mmsize       ; t31
    mova  [rsp+273*mmsize], m4
    mova  [rsp+274*mmsize], m0
    SUMSUB_BA         d, 2, 7, 0                    ; m2=t31a, m7=t28a
    SUMSUB_BA         d, 5, 6, 0                    ; m5=t30, m6=t29
    SUMSUB_MUL           6, 3, 0, 4, 15137,  6270   ; m6=t29a, m3=t18a
    SUMSUB_MUL           7, 1, 0, 4, 15137,  6270   ; m7=t28, m1=t19
    SCRATCH              3, 10, rsp+277*mmsize
    SCRATCH              1, 11, rsp+278*mmsize
    SCRATCH              7, 12, rsp+279*mmsize
    SCRATCH              6, 13, rsp+280*mmsize
    SCRATCH              5, 14, rsp+281*mmsize
    SCRATCH              2, 15, rsp+282*mmsize

    ; end of stage 4-5 first half

    mova                m0, [%2+ 5*%3]              ; in5
    mova                m1, [%2+11*%3]              ; in11
    mova                m2, [%2+21*%3]              ; in21
    mova                m3, [%2+27*%3]              ; in27
    SUMSUB_MUL           0, 3, 4, 5, 15893,  3981   ; m0=t27a, m3=t20a
    SUMSUB_MUL           2, 1, 4, 5,  8423, 14053   ; m2=t26a, m1=t21a
    SUMSUB_BA         d, 1, 3, 4                    ; m1=t20, m3=t21
    SUMSUB_BA         d, 2, 0, 4                    ; m2=t27, m0=t26
    SUMSUB_MUL           0, 3, 4, 5,  9102, 13623   ; m0=t26a, m3=t21a
    SCRATCH              0, 8, rsp+275*mmsize
    SCRATCH              2, 9, rsp+276*mmsize

    ; end of stage 1-3 third quart

    mova                m0, [%2+ 3*%3]              ; in3
    mova                m2, [%2+13*%3]              ; in13
    mova                m4, [%2+19*%3]              ; in19
    mova                m5, [%2+29*%3]              ; in29
    SUMSUB_MUL           2, 4, 6, 7, 13160,  9760   ; m2=t25a, m4=t22a
    SUMSUB_MUL           5, 0, 6, 7,  2404, 16207   ; m5=t24a, m0=t23a
    SUMSUB_BA         d, 4, 0, 6                    ; m4=t23, m0=t22
    SUMSUB_BA         d, 2, 5, 6                    ; m2=t24, m5=t25
    SUMSUB_MUL           5, 0, 6, 7, 13623, m9102   ; m5=t25a, m0=t22a

    ; end of stage 1-3 fourth quart

    SUMSUB_BA         d, 1, 4, 6                    ; m1=t23a, m4=t20a
    SUMSUB_BA         d, 3, 0, 6                    ; m3=t22, m0=t21
    UNSCRATCH            6, 8, rsp+275*mmsize       ; t26a
    UNSCRATCH            7, 9, rsp+276*mmsize       ; t27
    SCRATCH              3, 8, rsp+275*mmsize
    SCRATCH              1, 9, rsp+276*mmsize
    SUMSUB_BA         d, 7, 2, 1                    ; m7=t24a, m2=t27a
    SUMSUB_BA         d, 6, 5, 1                    ; m6=t25, m5=t26
    SUMSUB_MUL           2, 4, 1, 3,  6270, m15137  ; m2=t27, m4=t20
    SUMSUB_MUL           5, 0, 1, 3,  6270, m15137  ; m5=t26a, m0=t21a

    ; end of stage 4-5 second half

    UNSCRATCH            1, 12, rsp+279*mmsize      ; t28
    UNSCRATCH            3, 13, rsp+280*mmsize      ; t29a
    SCRATCH              4, 12, rsp+279*mmsize
    SCRATCH              0, 13, rsp+280*mmsize
    SUMSUB_BA         d, 5, 3, 0                    ; m5=t29, m3=t26
    SUMSUB_BA         d, 2, 1, 0                    ; m2=t28a, m1=t27a
    UNSCRATCH            0, 14, rsp+281*mmsize      ; t30
    UNSCRATCH            4, 15, rsp+282*mmsize      ; t31a
    SCRATCH              2, 14, rsp+281*mmsize
    SCRATCH              5, 15, rsp+282*mmsize
    SUMSUB_BA         d, 6, 0, 2                    ; m6=t30a, m0=t25a
    SUMSUB_BA         d, 7, 4, 2                    ; m7=t31, m4=t24

    mova                m2, [rsp+273*mmsize]        ; t16a
    mova                m5, [rsp+274*mmsize]        ; t17
    mova  [rsp+273*mmsize], m6
    mova  [rsp+274*mmsize], m7
    UNSCRATCH            6, 10, rsp+277*mmsize      ; t18a
    UNSCRATCH            7, 11, rsp+278*mmsize      ; t19
    SCRATCH              4, 10, rsp+277*mmsize
    SCRATCH              0, 11, rsp+278*mmsize
    UNSCRATCH            4, 12, rsp+279*mmsize      ; t20
    UNSCRATCH            0, 13, rsp+280*mmsize      ; t21a
    SCRATCH              3, 12, rsp+279*mmsize
    SCRATCH              1, 13, rsp+280*mmsize
    SUMSUB_BA         d, 0, 6, 1                    ; m0=t18, m6=t21
    SUMSUB_BA         d, 4, 7, 1                    ; m4=t19a, m7=t20a
    UNSCRATCH            3, 8, rsp+275*mmsize       ; t22
    UNSCRATCH            1, 9, rsp+276*mmsize       ; t23a
    SCRATCH              0, 8, rsp+275*mmsize
    SCRATCH              4, 9, rsp+276*mmsize
    SUMSUB_BA         d, 3, 5, 0                    ; m3=t17a, m5=t22a
    SUMSUB_BA         d, 1, 2, 0                    ; m1=t16, m2=t23

    ; end of stage 6

    UNSCRATCH            0, 10, rsp+277*mmsize      ; t24
    UNSCRATCH            4, 11, rsp+278*mmsize      ; t25a
    SCRATCH              1, 10, rsp+277*mmsize
    SCRATCH              3, 11, rsp+278*mmsize
    SUMSUB_MUL           0, 2, 1, 3, 11585, 11585   ; m0=t24a, m2=t23a
    SUMSUB_MUL           4, 5, 1, 3, 11585, 11585   ; m4=t25, m5=t22
    UNSCRATCH            1, 12, rsp+279*mmsize      ; t26
    UNSCRATCH            3, 13, rsp+280*mmsize      ; t27a
    SCRATCH              0, 12, rsp+279*mmsize
    SCRATCH              4, 13, rsp+280*mmsize
    SUMSUB_MUL           3, 7, 0, 4, 11585, 11585   ; m3=t27, m7=t20
    SUMSUB_MUL           1, 6, 0, 4, 11585, 11585   ; m1=t26a, m6=t21a

    ; end of stage 7

    mova                m0, [rsp+269*mmsize]        ; t8
    mova                m4, [rsp+270*mmsize]        ; t9a
    mova  [rsp+269*mmsize], m1                      ; t26a
    mova  [rsp+270*mmsize], m3                      ; t27
    mova                m3, [rsp+271*mmsize]        ; t10
    SUMSUB_BA         d, 2, 0, 1                    ; m2=out8, m0=out23
    SUMSUB_BA         d, 5, 4, 1                    ; m5=out9, m4=out22
    SUMSUB_BA         d, 6, 3, 1                    ; m6=out10, m3=out21
    mova                m1, [rsp+272*mmsize]        ; t11a
    mova  [rsp+271*mmsize], m0
    SUMSUB_BA         d, 7, 1, 0                    ; m7=out11, m1=out20

%if %1 == 1
    TRANSPOSE4x4D        2, 5, 6, 7, 0
    mova  [ptrq+ 2*mmsize], m2
    mova  [ptrq+10*mmsize], m5
    mova  [ptrq+18*mmsize], m6
    mova  [ptrq+26*mmsize], m7
%else ; %1 == 2
    pxor                m0, m0
    lea               dstq, [dstq+strideq*8]
    ROUND_AND_STORE_4x4  2, 5, 6, 7, m0, [rsp+256*mmsize], [pd_32], 6
%endif
    mova                m2, [rsp+271*mmsize]
%if %1 == 1
    TRANSPOSE4x4D        1, 3, 4, 2, 0
    mova  [ptrq+ 5*mmsize], m1
    mova  [ptrq+13*mmsize], m3
    mova  [ptrq+21*mmsize], m4
    mova  [ptrq+29*mmsize], m2
%else ; %1 == 2
    lea               dstq, [dstq+stride3q*4]
    ROUND_AND_STORE_4x4  1, 3, 4, 2, m0, [rsp+256*mmsize], [pd_32], 6
%endif

    ; end of last stage + store for out8-11 and out20-23

    UNSCRATCH            0, 9, rsp+276*mmsize       ; t19a
    UNSCRATCH            1, 8, rsp+275*mmsize       ; t18
    UNSCRATCH            2, 11, rsp+278*mmsize      ; t17a
    UNSCRATCH            3, 10, rsp+277*mmsize      ; t16
    mova                m7, [rsp+261*mmsize]        ; t12a
    mova                m6, [rsp+262*mmsize]        ; t13
    mova                m5, [rsp+263*mmsize]        ; t14a
    SUMSUB_BA         d, 0, 7, 4                    ; m0=out12, m7=out19
    SUMSUB_BA         d, 1, 6, 4                    ; m1=out13, m6=out18
    SUMSUB_BA         d, 2, 5, 4                    ; m2=out14, m5=out17
    mova                m4, [rsp+264*mmsize]        ; t15
    SCRATCH              7, 8, rsp+275*mmsize
    SUMSUB_BA         d, 3, 4, 7                    ; m3=out15, m4=out16

%if %1 == 1
    TRANSPOSE4x4D        0, 1, 2, 3, 7
    mova  [ptrq+ 3*mmsize], m0
    mova  [ptrq+11*mmsize], m1
    mova  [ptrq+19*mmsize], m2
    mova  [ptrq+27*mmsize], m3
%else ; %1 == 2
%if ARCH_X86_64
    SWAP                 7, 9
    lea               dstq, [dstbakq+stride3q*4]
%else ; x86-32
    pxor                m7, m7
    mov               dstq, dstm
    lea               dstq, [dstq+stride3q*4]
%endif
    ROUND_AND_STORE_4x4  0, 1, 2, 3, m7, [rsp+256*mmsize], [pd_32], 6
%endif
    UNSCRATCH            0, 8, rsp+275*mmsize       ; out19
%if %1 == 1
    TRANSPOSE4x4D        4, 5, 6, 0, 7
    mova  [ptrq+ 4*mmsize], m4
    mova  [ptrq+12*mmsize], m5
    mova  [ptrq+20*mmsize], m6
    mova  [ptrq+28*mmsize], m0
%else ; %1 == 2
    lea               dstq, [dstq+strideq*4]
    ROUND_AND_STORE_4x4  4, 5, 6, 0, m7, [rsp+256*mmsize], [pd_32], 6
%endif

    ; end of last stage + store for out12-19

%if ARCH_X86_64
    SWAP                 7, 8
%endif
    mova                m7, [rsp+257*mmsize]        ; t0
    mova                m6, [rsp+258*mmsize]        ; t1
    mova                m5, [rsp+259*mmsize]        ; t2
    mova                m4, [rsp+260*mmsize]        ; t3
    mova                m0, [rsp+274*mmsize]        ; t31
    mova                m1, [rsp+273*mmsize]        ; t30a
    UNSCRATCH            2, 15, rsp+282*mmsize      ; t29
    SUMSUB_BA         d, 0, 7, 3                    ; m0=out0, m7=out31
    SUMSUB_BA         d, 1, 6, 3                    ; m1=out1, m6=out30
    SUMSUB_BA         d, 2, 5, 3                    ; m2=out2, m5=out29
    SCRATCH              0, 9, rsp+276*mmsize
    UNSCRATCH            3, 14, rsp+281*mmsize      ; t28a
    SUMSUB_BA         d, 3, 4, 0                    ; m3=out3, m4=out28

%if %1 == 1
    TRANSPOSE4x4D        4, 5, 6, 7, 0
    mova  [ptrq+ 7*mmsize], m4
    mova  [ptrq+15*mmsize], m5
    mova  [ptrq+23*mmsize], m6
    mova  [ptrq+31*mmsize], m7
%else ; %1 == 2
%if ARCH_X86_64
    SWAP                 0, 8
%else ; x86-32
    pxor                m0, m0
%endif
    lea               dstq, [dstq+stride3q*4]
    ROUND_AND_STORE_4x4  4, 5, 6, 7, m0, [rsp+256*mmsize], [pd_32], 6
%endif
    UNSCRATCH            7, 9, rsp+276*mmsize       ; out0
%if %1 == 1
    TRANSPOSE4x4D        7, 1, 2, 3, 0
    mova  [ptrq+ 0*mmsize], m7
    mova  [ptrq+ 8*mmsize], m1
    mova  [ptrq+16*mmsize], m2
    mova  [ptrq+24*mmsize], m3
%else ; %1 == 2
%if ARCH_X86_64
    DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst
%else ; x86-32
    mov               dstq, dstm
%endif
    ROUND_AND_STORE_4x4  7, 1, 2, 3, m0, [rsp+256*mmsize], [pd_32], 6
%if ARCH_X86_64
    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
%endif
%endif

    ; end of last stage + store for out0-3 and out28-31

%if ARCH_X86_64
    SWAP                 0, 8
%endif
    mova                m7, [rsp+265*mmsize]        ; t4
    mova                m6, [rsp+266*mmsize]        ; t5a
    mova                m5, [rsp+267*mmsize]        ; t6a
    mova                m4, [rsp+268*mmsize]        ; t7
    mova                m0, [rsp+270*mmsize]        ; t27
    mova                m1, [rsp+269*mmsize]        ; t26a
    UNSCRATCH            2, 13, rsp+280*mmsize      ; t25
    SUMSUB_BA         d, 0, 7, 3                    ; m0=out4, m7=out27
    SUMSUB_BA         d, 1, 6, 3                    ; m1=out5, m6=out26
    SUMSUB_BA         d, 2, 5, 3                    ; m2=out6, m5=out25
    UNSCRATCH            3, 12, rsp+279*mmsize      ; t24a
    SCRATCH              7, 9, rsp+276*mmsize
    SUMSUB_BA         d, 3, 4, 7                    ; m3=out7, m4=out24

%if %1 == 1
    TRANSPOSE4x4D        0, 1, 2, 3, 7
    mova  [ptrq+ 1*mmsize], m0
    mova  [ptrq+ 9*mmsize], m1
    mova  [ptrq+17*mmsize], m2
    mova  [ptrq+25*mmsize], m3
%else ; %1 == 2
%if ARCH_X86_64
    SWAP                 7, 8
    lea               dstq, [dstbakq+strideq*4]
%else ; x86-32
    pxor                m7, m7
    lea               dstq, [dstq+strideq*4]
%endif
    ROUND_AND_STORE_4x4  0, 1, 2, 3, m7, [rsp+256*mmsize], [pd_32], 6
%endif
    UNSCRATCH            0, 9, rsp+276*mmsize       ; out27
%if %1 == 1
    TRANSPOSE4x4D        4, 5, 6, 0, 7
    mova  [ptrq+ 6*mmsize], m4
    mova  [ptrq+14*mmsize], m5
    mova  [ptrq+22*mmsize], m6
    mova  [ptrq+30*mmsize], m0
%else ; %1 == 2
%if ARCH_X86_64
    lea               dstq, [dstbakq+stride3q*8]
%else
    mov               dstq, dstm
    lea               dstq, [dstq+stride3q*8]
%endif
    ROUND_AND_STORE_4x4  4, 5, 6, 0, m7, [rsp+256*mmsize], [pd_32], 6
%endif

    ; end of last stage + store for out4-7 and out24-27
%endmacro

;------------------------------------------------------------------------------
; vp9_idct_idct_32x32_add_10(dst, stride, block, eob)
;
; eob <= 1 takes the dc-only path; anything else goes to .idctfull, which runs
; IDCT32_1D pass 1 from block into a buffer at the bottom of the stack (zero
; padded for the iterations that were skipped) and then pass 2 from that
; buffer into dst. rsp+256*mmsize keeps the constant loaded into m0 at entry.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal vp9_idct_idct_32x32_add_10, 4, 6 + ARCH_X86_64, 16, \
                                    275 * mmsize + ARCH_X86_32 * 8 * mmsize, \
                                    dst, stride, block, eob
    mova                m0, [pw_1023]
    cmp               eobd, 1
    jg .idctfull

    ; dc-only - the 10bit version can be done entirely in 32bit, since the max
    ; coef values are 17+sign bit, and the coef is 14bit, so 31+sign easily
    ; fits in 32bit
    DEFINE_ARGS dst, stride, block, coef
    pxor                m2, m2
    DC_ONLY              6, m2
    ; replicate the low word of coef into every word lane of m1
    movd                m1, coefd
    pshuflw             m1, m1, q0000
    punpcklqdq          m1, m1
    DEFINE_ARGS dst, stride, cnt
    mov               cntd, 32                      ; one row per iteration
.loop_dc:
    STORE_2x8            3, 4, 1, m2, m0, dstq,          mmsize
    STORE_2x8            3, 4, 1, m2, m0, dstq+mmsize*2, mmsize
    add               dstq, strideq
    dec               cntd
    jg .loop_dc
    RET

.idctfull:
    mova  [rsp+256*mmsize], m0
    ; the 4th argument (eob) is renamed to cnt from here on
    DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
%if ARCH_X86_64
    mov            dstbakq, dstq
    movsxd            cntq, cntd
%endif
    ; cnt = default_32x32[eob-1] = number of first-pass iterations to run
%ifdef PIC
    lea               ptrq, [default_32x32]
    movzx             cntd, byte [ptrq+cntq-1]
%else
    movzx             cntd, byte [default_32x32+cntq-1]
%endif
    mov              skipd, 8
    sub              skipd, cntd                    ; skip = 8 - cnt
    mov               ptrq, rsp
.loop_1:
    IDCT32_1D            1, blockq

    add               ptrq, 32 * mmsize
    add             blockq, mmsize
    dec               cntd
    jg .loop_1

    ; zero-pad the remainder (skipped cols)
    test             skipd, skipd
    jz .end
    ; skip is multiplied by 4 because .loop_z clears 8*mmsize per iteration,
    ; a quarter of what one .loop_1 iteration covers; blockq moves on by
    ; (old skip)*mmsize so that it ends up at block+8*mmsize
    shl              skipd, 2
    lea             blockq, [blockq+skipq*(mmsize/4)]
    pxor                m0, m0
.loop_z:
    mova   [ptrq+mmsize*0], m0
    mova   [ptrq+mmsize*1], m0
    mova   [ptrq+mmsize*2], m0
    mova   [ptrq+mmsize*3], m0
    mova   [ptrq+mmsize*4], m0
    mova   [ptrq+mmsize*5], m0
    mova   [ptrq+mmsize*6], m0
    mova   [ptrq+mmsize*7], m0
    add               ptrq, 8 * mmsize
    dec              skipd
    jg .loop_z
.end:

    ; second pass: 8 iterations over the intermediate buffer on the stack
    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
    lea           stride3q, [strideq*3]
    mov               cntd, 8
    mov               ptrq, rsp
.loop_2:
    IDCT32_1D            2, ptrq

    ; next iteration: ptr advances by mmsize, dst by 8 bytes
    add               ptrq, mmsize
%if ARCH_X86_64
    add            dstbakq, 8
    mov               dstq, dstbakq
%else
    add         dword dstm, 8
    mov               dstq, dstm
%endif
    dec               cntd
    jg .loop_2

    ; m7 is still zero
    ; blockq was advanced by 8*mmsize during the first pass, hence the offset
    ZERO_BLOCK blockq-8*mmsize, 128, 32, m7
2018cabdff1aSopenharmony_ci RET 2019cabdff1aSopenharmony_ci 2020cabdff1aSopenharmony_ciINIT_XMM sse2 2021cabdff1aSopenharmony_cicglobal vp9_idct_idct_32x32_add_12, 4, 6 + ARCH_X86_64, 16, \ 2022cabdff1aSopenharmony_ci 275 * mmsize + ARCH_X86_32 * 8 * mmsize, \ 2023cabdff1aSopenharmony_ci dst, stride, block, eob 2024cabdff1aSopenharmony_ci mova m0, [pw_4095] 2025cabdff1aSopenharmony_ci cmp eobd, 1 2026cabdff1aSopenharmony_ci jg mangle(private_prefix %+ _ %+ vp9_idct_idct_32x32_add_10 %+ SUFFIX).idctfull 2027cabdff1aSopenharmony_ci 2028cabdff1aSopenharmony_ci ; dc-only - unfortunately, this one can overflow, since coefs are 19+sign 2029cabdff1aSopenharmony_ci ; bpp, and 19+14+sign does not fit in 32bit, so we do 2-stage multiplies 2030cabdff1aSopenharmony_ci DEFINE_ARGS dst, stride, block, coef, coefl 2031cabdff1aSopenharmony_ci pxor m2, m2 2032cabdff1aSopenharmony_ci DC_ONLY_64BIT 6, m2 2033cabdff1aSopenharmony_ci movd m1, coefd 2034cabdff1aSopenharmony_ci pshuflw m1, m1, q0000 2035cabdff1aSopenharmony_ci punpcklqdq m1, m1 2036cabdff1aSopenharmony_ci DEFINE_ARGS dst, stride, cnt 2037cabdff1aSopenharmony_ci mov cntd, 32 2038cabdff1aSopenharmony_ci.loop_dc: 2039cabdff1aSopenharmony_ci STORE_2x8 3, 4, 1, m2, m0, dstq, mmsize 2040cabdff1aSopenharmony_ci STORE_2x8 3, 4, 1, m2, m0, dstq+mmsize*2, mmsize 2041cabdff1aSopenharmony_ci add dstq, strideq 2042cabdff1aSopenharmony_ci dec cntd 2043cabdff1aSopenharmony_ci jg .loop_dc 2044cabdff1aSopenharmony_ci RET 2045