;******************************************************************************
;* VP9 IDCT SIMD optimizations
;*
;* Copyright (C) 2013 Clément Bœsch <u pkh me>
;* Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"
%include "vp9itxfm_template.asm"

SECTION_RODATA 32

; VP9_IDCT_COEFFS a, b[, extra=0]
;
; Emits the word constant tables for one coefficient pair (a, b):
;   pw_m<a>_<b>  : 8 x { -a,  b }
;   pw_<b>_<a>   : 8 x {  b,  a }
; When extra == 1, additionally:
;   pw_m<b>_m<a> : 8 x { -b, -a }
;   and, only if a != b (the labels would otherwise collide with the ones
;   above):
;   pw_m<b>_<a>  : 8 x { -b,  a }
;   pw_<a>_<b>   : 8 x {  a,  b }
; Finally the doubled single-value tables (16 words each):
;   a < 11585  -> pw_m<a>x2 holding -a*2 (the negated form, e.g. pw_m9102x2
;                 as referenced by VP9_IDCT8_4x4_1D)
;   a > 11585  -> pw_<a>x2  holding  a*2
;   a == 11585 -> pw_<a>x2  holding  a*2, declared through `const` instead of
;                 as a plain label
;   b != a     -> pw_<b>x2  holding  b*2
; `const` comes from the included x86 utility macros.
%macro VP9_IDCT_COEFFS 2-3 0
const pw_m%1_%2
times 8 dw -%1,  %2
const pw_%2_%1
times 8 dw  %2,  %1

%if %3 == 1
const pw_m%2_m%1
times 8 dw -%2, -%1
%if %1 != %2
const pw_m%2_%1
times 8 dw -%2,  %1
const pw_%1_%2
times 8 dw  %1,  %2
%endif
%endif

%if %1 < 11585
pw_m%1x2:   times 16 dw -%1*2
%elif %1 > 11585
pw_%1x2:    times 16 dw  %1*2
%else
const pw_%1x2
times 16 dw %1*2
%endif

%if %2 != %1
pw_%2x2:    times 16 dw  %2*2
%endif
%endmacro

VP9_IDCT_COEFFS 16364,   804
VP9_IDCT_COEFFS 16305,  1606
62cabdff1aSopenharmony_ciVP9_IDCT_COEFFS 16069, 3196, 1 63cabdff1aSopenharmony_ciVP9_IDCT_COEFFS 15893, 3981 64cabdff1aSopenharmony_ciVP9_IDCT_COEFFS 15137, 6270, 1 65cabdff1aSopenharmony_ciVP9_IDCT_COEFFS 14811, 7005 66cabdff1aSopenharmony_ciVP9_IDCT_COEFFS 14449, 7723 67cabdff1aSopenharmony_ciVP9_IDCT_COEFFS 13160, 9760 68cabdff1aSopenharmony_ciVP9_IDCT_COEFFS 11585, 11585, 1 69cabdff1aSopenharmony_ciVP9_IDCT_COEFFS 11003, 12140 70cabdff1aSopenharmony_ciVP9_IDCT_COEFFS 10394, 12665 71cabdff1aSopenharmony_ciVP9_IDCT_COEFFS 9102, 13623, 1 72cabdff1aSopenharmony_ciVP9_IDCT_COEFFS 8423, 14053 73cabdff1aSopenharmony_ciVP9_IDCT_COEFFS 5520, 15426 74cabdff1aSopenharmony_ciVP9_IDCT_COEFFS 4756, 15679 75cabdff1aSopenharmony_ciVP9_IDCT_COEFFS 2404, 16207 76cabdff1aSopenharmony_ci 77cabdff1aSopenharmony_ciconst pw_5283_13377 78cabdff1aSopenharmony_citimes 4 dw 5283, 13377 79cabdff1aSopenharmony_ciconst pw_9929_13377 80cabdff1aSopenharmony_citimes 4 dw 9929, 13377 81cabdff1aSopenharmony_ciconst pw_15212_m13377 82cabdff1aSopenharmony_citimes 4 dw 15212, -13377 83cabdff1aSopenharmony_ciconst pw_15212_9929 84cabdff1aSopenharmony_citimes 4 dw 15212, 9929 85cabdff1aSopenharmony_ciconst pw_m5283_m15212 86cabdff1aSopenharmony_citimes 4 dw -5283, -15212 87cabdff1aSopenharmony_ciconst pw_13377x2 88cabdff1aSopenharmony_citimes 8 dw 13377*2 89cabdff1aSopenharmony_ciconst pw_m13377_13377 90cabdff1aSopenharmony_citimes 4 dw -13377, 13377 91cabdff1aSopenharmony_ciconst pw_13377_0 92cabdff1aSopenharmony_citimes 4 dw 13377, 0 93cabdff1aSopenharmony_ci 94cabdff1aSopenharmony_cicextern pw_8 95cabdff1aSopenharmony_cicextern pw_16 96cabdff1aSopenharmony_cicextern pw_32 97cabdff1aSopenharmony_cicextern pw_512 98cabdff1aSopenharmony_cicextern pw_1024 99cabdff1aSopenharmony_cicextern pw_2048 100cabdff1aSopenharmony_cicextern pw_m1 101cabdff1aSopenharmony_cicextern pd_8192 102cabdff1aSopenharmony_ci 103cabdff1aSopenharmony_ciSECTION .text 104cabdff1aSopenharmony_ci 105cabdff1aSopenharmony_ci%macro 
VP9_UNPACK_MULSUB_2D_4X 6 ; dst1 [src1], dst2 [src2], dst3, dst4, mul1, mul2 106cabdff1aSopenharmony_ci punpckhwd m%4, m%2, m%1 107cabdff1aSopenharmony_ci punpcklwd m%2, m%1 108cabdff1aSopenharmony_ci pmaddwd m%3, m%4, [pw_m%5_%6] 109cabdff1aSopenharmony_ci pmaddwd m%4, [pw_%6_%5] 110cabdff1aSopenharmony_ci pmaddwd m%1, m%2, [pw_m%5_%6] 111cabdff1aSopenharmony_ci pmaddwd m%2, [pw_%6_%5] 112cabdff1aSopenharmony_ci%endmacro 113cabdff1aSopenharmony_ci 114cabdff1aSopenharmony_ci%macro VP9_RND_SH_SUMSUB_BA 6 ; dst1 [src1], dst2 [src2], src3, src4, tmp, round 115cabdff1aSopenharmony_ci SUMSUB_BA d, %1, %2, %5 116cabdff1aSopenharmony_ci SUMSUB_BA d, %3, %4, %5 117cabdff1aSopenharmony_ci paddd m%1, %6 118cabdff1aSopenharmony_ci paddd m%2, %6 119cabdff1aSopenharmony_ci paddd m%3, %6 120cabdff1aSopenharmony_ci paddd m%4, %6 121cabdff1aSopenharmony_ci psrad m%1, 14 122cabdff1aSopenharmony_ci psrad m%2, 14 123cabdff1aSopenharmony_ci psrad m%3, 14 124cabdff1aSopenharmony_ci psrad m%4, 14 125cabdff1aSopenharmony_ci packssdw m%1, m%3 126cabdff1aSopenharmony_ci packssdw m%2, m%4 127cabdff1aSopenharmony_ci%endmacro 128cabdff1aSopenharmony_ci 129cabdff1aSopenharmony_ci%macro VP9_STORE_2X 5-6 dstq ; reg1, reg2, tmp1, tmp2, zero, dst 130cabdff1aSopenharmony_ci%if mmsize == 32 131cabdff1aSopenharmony_ci pmovzxbw m%3, [%6] 132cabdff1aSopenharmony_ci pmovzxbw m%4, [%6+strideq] 133cabdff1aSopenharmony_ci%else 134cabdff1aSopenharmony_ci movh m%3, [%6] 135cabdff1aSopenharmony_ci movh m%4, [%6+strideq] 136cabdff1aSopenharmony_ci punpcklbw m%3, m%5 137cabdff1aSopenharmony_ci punpcklbw m%4, m%5 138cabdff1aSopenharmony_ci%endif 139cabdff1aSopenharmony_ci paddw m%3, m%1 140cabdff1aSopenharmony_ci paddw m%4, m%2 141cabdff1aSopenharmony_ci%if mmsize == 32 142cabdff1aSopenharmony_ci packuswb m%3, m%4 143cabdff1aSopenharmony_ci ; Intel... 
144cabdff1aSopenharmony_ci vpermq m%3, m%3, q3120 145cabdff1aSopenharmony_ci mova [%6], xm%3 146cabdff1aSopenharmony_ci vextracti128 [%6+strideq], m%3, 1 147cabdff1aSopenharmony_ci%elif mmsize == 16 148cabdff1aSopenharmony_ci packuswb m%3, m%4 149cabdff1aSopenharmony_ci movh [%6], m%3 150cabdff1aSopenharmony_ci movhps [%6+strideq], m%3 151cabdff1aSopenharmony_ci%else 152cabdff1aSopenharmony_ci packuswb m%3, m%5 153cabdff1aSopenharmony_ci packuswb m%4, m%5 154cabdff1aSopenharmony_ci movh [%6], m%3 155cabdff1aSopenharmony_ci movh [%6+strideq], m%4 156cabdff1aSopenharmony_ci%endif 157cabdff1aSopenharmony_ci%endmacro 158cabdff1aSopenharmony_ci 159cabdff1aSopenharmony_ci%macro ZERO_BLOCK 4 ; mem, stride, nnzcpl, zero_reg 160cabdff1aSopenharmony_ci%assign %%y 0 161cabdff1aSopenharmony_ci%rep %3 162cabdff1aSopenharmony_ci%assign %%x 0 163cabdff1aSopenharmony_ci%rep %3*2/mmsize 164cabdff1aSopenharmony_ci mova [%1+%%y+%%x], %4 165cabdff1aSopenharmony_ci%assign %%x (%%x+mmsize) 166cabdff1aSopenharmony_ci%endrep 167cabdff1aSopenharmony_ci%assign %%y (%%y+%2) 168cabdff1aSopenharmony_ci%endrep 169cabdff1aSopenharmony_ci%endmacro 170cabdff1aSopenharmony_ci 171cabdff1aSopenharmony_ci;------------------------------------------------------------------------------------------- 172cabdff1aSopenharmony_ci; void vp9_iwht_iwht_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); 173cabdff1aSopenharmony_ci;------------------------------------------------------------------------------------------- 174cabdff1aSopenharmony_ci 175cabdff1aSopenharmony_ciINIT_MMX mmx 176cabdff1aSopenharmony_cicglobal vp9_iwht_iwht_4x4_add, 3, 3, 0, dst, stride, block, eob 177cabdff1aSopenharmony_ci mova m0, [blockq+0*8] 178cabdff1aSopenharmony_ci mova m1, [blockq+1*8] 179cabdff1aSopenharmony_ci mova m2, [blockq+2*8] 180cabdff1aSopenharmony_ci mova m3, [blockq+3*8] 181cabdff1aSopenharmony_ci psraw m0, 2 182cabdff1aSopenharmony_ci psraw m1, 2 183cabdff1aSopenharmony_ci psraw m2, 2 
184cabdff1aSopenharmony_ci psraw m3, 2 185cabdff1aSopenharmony_ci 186cabdff1aSopenharmony_ci VP9_IWHT4_1D 187cabdff1aSopenharmony_ci TRANSPOSE4x4W 0, 1, 2, 3, 4 188cabdff1aSopenharmony_ci VP9_IWHT4_1D 189cabdff1aSopenharmony_ci 190cabdff1aSopenharmony_ci pxor m4, m4 191cabdff1aSopenharmony_ci VP9_STORE_2X 0, 1, 5, 6, 4 192cabdff1aSopenharmony_ci lea dstq, [dstq+strideq*2] 193cabdff1aSopenharmony_ci VP9_STORE_2X 2, 3, 5, 6, 4 194cabdff1aSopenharmony_ci ZERO_BLOCK blockq, 8, 4, m4 195cabdff1aSopenharmony_ci RET 196cabdff1aSopenharmony_ci 197cabdff1aSopenharmony_ci;------------------------------------------------------------------------------------------- 198cabdff1aSopenharmony_ci; void vp9_idct_idct_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); 199cabdff1aSopenharmony_ci;------------------------------------------------------------------------------------------- 200cabdff1aSopenharmony_ci 201cabdff1aSopenharmony_ci; 2x2 top left corner 202cabdff1aSopenharmony_ci%macro VP9_IDCT4_2x2_1D 0 203cabdff1aSopenharmony_ci pmulhrsw m0, m5 ; m0=t1 204cabdff1aSopenharmony_ci mova m2, m0 ; m2=t0 205cabdff1aSopenharmony_ci mova m3, m1 206cabdff1aSopenharmony_ci pmulhrsw m1, m6 ; m1=t2 207cabdff1aSopenharmony_ci pmulhrsw m3, m7 ; m3=t3 208cabdff1aSopenharmony_ci VP9_IDCT4_1D_FINALIZE 209cabdff1aSopenharmony_ci%endmacro 210cabdff1aSopenharmony_ci 211cabdff1aSopenharmony_ci%macro VP9_IDCT4_WRITEOUT 0 212cabdff1aSopenharmony_ci%if cpuflag(ssse3) 213cabdff1aSopenharmony_ci mova m5, [pw_2048] 214cabdff1aSopenharmony_ci pmulhrsw m0, m5 ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4 215cabdff1aSopenharmony_ci pmulhrsw m1, m5 216cabdff1aSopenharmony_ci%else 217cabdff1aSopenharmony_ci mova m5, [pw_8] 218cabdff1aSopenharmony_ci paddw m0, m5 219cabdff1aSopenharmony_ci paddw m1, m5 220cabdff1aSopenharmony_ci psraw m0, 4 221cabdff1aSopenharmony_ci psraw m1, 4 222cabdff1aSopenharmony_ci%endif 223cabdff1aSopenharmony_ci VP9_STORE_2X 0, 1, 6, 7, 4 224cabdff1aSopenharmony_ci lea 
dstq, [dstq+2*strideq] 225cabdff1aSopenharmony_ci%if cpuflag(ssse3) 226cabdff1aSopenharmony_ci pmulhrsw m2, m5 227cabdff1aSopenharmony_ci pmulhrsw m3, m5 228cabdff1aSopenharmony_ci%else 229cabdff1aSopenharmony_ci paddw m2, m5 230cabdff1aSopenharmony_ci paddw m3, m5 231cabdff1aSopenharmony_ci psraw m2, 4 232cabdff1aSopenharmony_ci psraw m3, 4 233cabdff1aSopenharmony_ci%endif 234cabdff1aSopenharmony_ci VP9_STORE_2X 2, 3, 6, 7, 4 235cabdff1aSopenharmony_ci%endmacro 236cabdff1aSopenharmony_ci 237cabdff1aSopenharmony_ci%macro IDCT_4x4_FN 1 238cabdff1aSopenharmony_ciINIT_MMX %1 239cabdff1aSopenharmony_cicglobal vp9_idct_idct_4x4_add, 4, 4, 0, dst, stride, block, eob 240cabdff1aSopenharmony_ci 241cabdff1aSopenharmony_ci%if cpuflag(ssse3) 242cabdff1aSopenharmony_ci cmp eobd, 4 ; 2x2 or smaller 243cabdff1aSopenharmony_ci jg .idctfull 244cabdff1aSopenharmony_ci 245cabdff1aSopenharmony_ci cmp eobd, 1 ; faster path for when only DC is set 246cabdff1aSopenharmony_ci jne .idct2x2 247cabdff1aSopenharmony_ci%else 248cabdff1aSopenharmony_ci cmp eobd, 1 249cabdff1aSopenharmony_ci jg .idctfull 250cabdff1aSopenharmony_ci%endif 251cabdff1aSopenharmony_ci 252cabdff1aSopenharmony_ci%if cpuflag(ssse3) 253cabdff1aSopenharmony_ci movd m0, [blockq] 254cabdff1aSopenharmony_ci mova m5, [pw_11585x2] 255cabdff1aSopenharmony_ci pmulhrsw m0, m5 256cabdff1aSopenharmony_ci pmulhrsw m0, m5 257cabdff1aSopenharmony_ci%else 258cabdff1aSopenharmony_ci DEFINE_ARGS dst, stride, block, coef 259cabdff1aSopenharmony_ci movsx coefd, word [blockq] 260cabdff1aSopenharmony_ci imul coefd, 11585 261cabdff1aSopenharmony_ci add coefd, 8192 262cabdff1aSopenharmony_ci sar coefd, 14 263cabdff1aSopenharmony_ci imul coefd, 11585 264cabdff1aSopenharmony_ci add coefd, (8 << 14) + 8192 265cabdff1aSopenharmony_ci sar coefd, 14 + 4 266cabdff1aSopenharmony_ci movd m0, coefd 267cabdff1aSopenharmony_ci%endif 268cabdff1aSopenharmony_ci pshufw m0, m0, 0 269cabdff1aSopenharmony_ci pxor m4, m4 270cabdff1aSopenharmony_ci movh 
[blockq], m4 271cabdff1aSopenharmony_ci%if cpuflag(ssse3) 272cabdff1aSopenharmony_ci pmulhrsw m0, [pw_2048] ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4 273cabdff1aSopenharmony_ci%endif 274cabdff1aSopenharmony_ci VP9_STORE_2X 0, 0, 6, 7, 4 275cabdff1aSopenharmony_ci lea dstq, [dstq+2*strideq] 276cabdff1aSopenharmony_ci VP9_STORE_2X 0, 0, 6, 7, 4 277cabdff1aSopenharmony_ci RET 278cabdff1aSopenharmony_ci 279cabdff1aSopenharmony_ci%if cpuflag(ssse3) 280cabdff1aSopenharmony_ci; faster path for when only top left 2x2 block is set 281cabdff1aSopenharmony_ci.idct2x2: 282cabdff1aSopenharmony_ci movd m0, [blockq+0] 283cabdff1aSopenharmony_ci movd m1, [blockq+8] 284cabdff1aSopenharmony_ci mova m5, [pw_11585x2] 285cabdff1aSopenharmony_ci mova m6, [pw_6270x2] 286cabdff1aSopenharmony_ci mova m7, [pw_15137x2] 287cabdff1aSopenharmony_ci VP9_IDCT4_2x2_1D 288cabdff1aSopenharmony_ci ; partial 2x4 transpose 289cabdff1aSopenharmony_ci punpcklwd m0, m1 290cabdff1aSopenharmony_ci punpcklwd m2, m3 291cabdff1aSopenharmony_ci SBUTTERFLY dq, 0, 2, 1 292cabdff1aSopenharmony_ci SWAP 1, 2 293cabdff1aSopenharmony_ci VP9_IDCT4_2x2_1D 294cabdff1aSopenharmony_ci pxor m4, m4 ; used for the block reset, and VP9_STORE_2X 295cabdff1aSopenharmony_ci movh [blockq+ 0], m4 296cabdff1aSopenharmony_ci movh [blockq+ 8], m4 297cabdff1aSopenharmony_ci VP9_IDCT4_WRITEOUT 298cabdff1aSopenharmony_ci RET 299cabdff1aSopenharmony_ci%endif 300cabdff1aSopenharmony_ci 301cabdff1aSopenharmony_ci.idctfull: ; generic full 4x4 idct/idct 302cabdff1aSopenharmony_ci mova m0, [blockq+ 0] 303cabdff1aSopenharmony_ci mova m1, [blockq+ 8] 304cabdff1aSopenharmony_ci mova m2, [blockq+16] 305cabdff1aSopenharmony_ci mova m3, [blockq+24] 306cabdff1aSopenharmony_ci%if cpuflag(ssse3) 307cabdff1aSopenharmony_ci mova m6, [pw_11585x2] 308cabdff1aSopenharmony_ci%endif 309cabdff1aSopenharmony_ci mova m7, [pd_8192] ; rounding 310cabdff1aSopenharmony_ci VP9_IDCT4_1D 311cabdff1aSopenharmony_ci TRANSPOSE4x4W 0, 1, 2, 3, 4 312cabdff1aSopenharmony_ci 
VP9_IDCT4_1D 313cabdff1aSopenharmony_ci pxor m4, m4 ; used for the block reset, and VP9_STORE_2X 314cabdff1aSopenharmony_ci mova [blockq+ 0], m4 315cabdff1aSopenharmony_ci mova [blockq+ 8], m4 316cabdff1aSopenharmony_ci mova [blockq+16], m4 317cabdff1aSopenharmony_ci mova [blockq+24], m4 318cabdff1aSopenharmony_ci VP9_IDCT4_WRITEOUT 319cabdff1aSopenharmony_ci RET 320cabdff1aSopenharmony_ci%endmacro 321cabdff1aSopenharmony_ci 322cabdff1aSopenharmony_ciIDCT_4x4_FN mmxext 323cabdff1aSopenharmony_ciIDCT_4x4_FN ssse3 324cabdff1aSopenharmony_ci 325cabdff1aSopenharmony_ci;------------------------------------------------------------------------------------------- 326cabdff1aSopenharmony_ci; void vp9_iadst_iadst_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); 327cabdff1aSopenharmony_ci;------------------------------------------------------------------------------------------- 328cabdff1aSopenharmony_ci 329cabdff1aSopenharmony_ci%macro IADST4_FN 5 330cabdff1aSopenharmony_ciINIT_MMX %5 331cabdff1aSopenharmony_cicglobal vp9_%1_%3_4x4_add, 3, 3, 0, dst, stride, block, eob 332cabdff1aSopenharmony_ci%if WIN64 && notcpuflag(ssse3) 333cabdff1aSopenharmony_ci WIN64_SPILL_XMM 8 334cabdff1aSopenharmony_ci%endif 335cabdff1aSopenharmony_ci movdqa xmm5, [pd_8192] 336cabdff1aSopenharmony_ci mova m0, [blockq+ 0] 337cabdff1aSopenharmony_ci mova m1, [blockq+ 8] 338cabdff1aSopenharmony_ci mova m2, [blockq+16] 339cabdff1aSopenharmony_ci mova m3, [blockq+24] 340cabdff1aSopenharmony_ci%if cpuflag(ssse3) 341cabdff1aSopenharmony_ci mova m6, [pw_11585x2] 342cabdff1aSopenharmony_ci%endif 343cabdff1aSopenharmony_ci%ifnidn %1%3, iadstiadst 344cabdff1aSopenharmony_ci movdq2q m7, xmm5 345cabdff1aSopenharmony_ci%endif 346cabdff1aSopenharmony_ci VP9_%2_1D 347cabdff1aSopenharmony_ci TRANSPOSE4x4W 0, 1, 2, 3, 4 348cabdff1aSopenharmony_ci VP9_%4_1D 349cabdff1aSopenharmony_ci pxor m4, m4 ; used for the block reset, and VP9_STORE_2X 350cabdff1aSopenharmony_ci mova [blockq+ 0], m4 
351cabdff1aSopenharmony_ci mova [blockq+ 8], m4 352cabdff1aSopenharmony_ci mova [blockq+16], m4 353cabdff1aSopenharmony_ci mova [blockq+24], m4 354cabdff1aSopenharmony_ci VP9_IDCT4_WRITEOUT 355cabdff1aSopenharmony_ci RET 356cabdff1aSopenharmony_ci%endmacro 357cabdff1aSopenharmony_ci 358cabdff1aSopenharmony_ciIADST4_FN idct, IDCT4, iadst, IADST4, sse2 359cabdff1aSopenharmony_ciIADST4_FN iadst, IADST4, idct, IDCT4, sse2 360cabdff1aSopenharmony_ciIADST4_FN iadst, IADST4, iadst, IADST4, sse2 361cabdff1aSopenharmony_ci 362cabdff1aSopenharmony_ciIADST4_FN idct, IDCT4, iadst, IADST4, ssse3 363cabdff1aSopenharmony_ciIADST4_FN iadst, IADST4, idct, IDCT4, ssse3 364cabdff1aSopenharmony_ciIADST4_FN iadst, IADST4, iadst, IADST4, ssse3 365cabdff1aSopenharmony_ci 366cabdff1aSopenharmony_ci%macro SCRATCH 3 367cabdff1aSopenharmony_ci%if ARCH_X86_64 368cabdff1aSopenharmony_ci SWAP %1, %2 369cabdff1aSopenharmony_ci%else 370cabdff1aSopenharmony_ci mova [%3], m%1 371cabdff1aSopenharmony_ci%endif 372cabdff1aSopenharmony_ci%endmacro 373cabdff1aSopenharmony_ci 374cabdff1aSopenharmony_ci%macro UNSCRATCH 3 375cabdff1aSopenharmony_ci%if ARCH_X86_64 376cabdff1aSopenharmony_ci SWAP %1, %2 377cabdff1aSopenharmony_ci%else 378cabdff1aSopenharmony_ci mova m%1, [%3] 379cabdff1aSopenharmony_ci%endif 380cabdff1aSopenharmony_ci%endmacro 381cabdff1aSopenharmony_ci 382cabdff1aSopenharmony_ci;------------------------------------------------------------------------------------------- 383cabdff1aSopenharmony_ci; void vp9_idct_idct_8x8_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); 384cabdff1aSopenharmony_ci;------------------------------------------------------------------------------------------- 385cabdff1aSopenharmony_ci 386cabdff1aSopenharmony_ci%macro VP9_IDCT8_1D_FINALIZE 0 387cabdff1aSopenharmony_ci SUMSUB_BA w, 3, 6, 5 ; m3=t0+t7, m6=t0-t7 388cabdff1aSopenharmony_ci SUMSUB_BA w, 1, 2, 5 ; m1=t1+t6, m2=t1-t6 389cabdff1aSopenharmony_ci SUMSUB_BA w, 7, 0, 5 ; m7=t2+t5, m0=t2-t5 
390cabdff1aSopenharmony_ci 391cabdff1aSopenharmony_ci UNSCRATCH 5, 8, blockq+ 0 392cabdff1aSopenharmony_ci SCRATCH 2, 8, blockq+ 0 393cabdff1aSopenharmony_ci 394cabdff1aSopenharmony_ci SUMSUB_BA w, 5, 4, 2 ; m5=t3+t4, m4=t3-t4 395cabdff1aSopenharmony_ci SWAP 7, 6, 2 396cabdff1aSopenharmony_ci SWAP 3, 5, 0 397cabdff1aSopenharmony_ci 398cabdff1aSopenharmony_ci%if ARCH_X86_64 399cabdff1aSopenharmony_ci SWAP 6, 8 400cabdff1aSopenharmony_ci%endif 401cabdff1aSopenharmony_ci%endmacro 402cabdff1aSopenharmony_ci 403cabdff1aSopenharmony_ci; x86-32 404cabdff1aSopenharmony_ci; - in: m0/m4 is in mem 405cabdff1aSopenharmony_ci; - out: m6 is in mem 406cabdff1aSopenharmony_ci; x86-64: 407cabdff1aSopenharmony_ci; - everything is in registers (m0-7) 408cabdff1aSopenharmony_ci%macro VP9_IDCT8_1D 0 409cabdff1aSopenharmony_ci%if ARCH_X86_64 410cabdff1aSopenharmony_ci SWAP 0, 8 411cabdff1aSopenharmony_ci SWAP 4, 9 412cabdff1aSopenharmony_ci%endif 413cabdff1aSopenharmony_ci 414cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 5, 3, 9102, 13623, D_8192_REG, 0, 4 ; m5=t5a, m3=t6a 415cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 1, 7, 16069, 3196, D_8192_REG, 0, 4 ; m1=t4a, m7=t7a 416cabdff1aSopenharmony_ci SUMSUB_BA w, 5, 1, 0 ; m5=t4a+t5a (t4), m1=t4a-t5a (t5a) 417cabdff1aSopenharmony_ci SUMSUB_BA w, 3, 7, 0 ; m3=t7a+t6a (t7), m7=t7a-t6a (t6a) 418cabdff1aSopenharmony_ci%if cpuflag(ssse3) 419cabdff1aSopenharmony_ci SUMSUB_BA w, 1, 7, 0 ; m1=t6a+t5a (t6), m7=t6a-t5a (t5) 420cabdff1aSopenharmony_ci pmulhrsw m1, W_11585x2_REG ; m1=t6 421cabdff1aSopenharmony_ci pmulhrsw m7, W_11585x2_REG ; m7=t5 422cabdff1aSopenharmony_ci%else 423cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 7, 1, 11585, 11585, D_8192_REG, 0, 4 424cabdff1aSopenharmony_ci%endif 425cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 2, 6, 15137, 6270, D_8192_REG, 0, 4 ; m2=t2a, m6=t3a 426cabdff1aSopenharmony_ci 427cabdff1aSopenharmony_ci UNSCRATCH 0, 8, blockq+ 0 ; IN(0) 428cabdff1aSopenharmony_ci UNSCRATCH 4, 9, blockq+64 ; IN(4) 
429cabdff1aSopenharmony_ci SCRATCH 5, 8, blockq+ 0 430cabdff1aSopenharmony_ci 431cabdff1aSopenharmony_ci%if cpuflag(ssse3) 432cabdff1aSopenharmony_ci SUMSUB_BA w, 4, 0, 5 ; m4=IN(0)+IN(4) m0=IN(0)-IN(4) 433cabdff1aSopenharmony_ci pmulhrsw m4, W_11585x2_REG ; m4=t0a 434cabdff1aSopenharmony_ci pmulhrsw m0, W_11585x2_REG ; m0=t1a 435cabdff1aSopenharmony_ci%else 436cabdff1aSopenharmony_ci SCRATCH 7, 9, blockq+64 437cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 0, 4, 11585, 11585, D_8192_REG, 5, 7 438cabdff1aSopenharmony_ci UNSCRATCH 7, 9, blockq+64 439cabdff1aSopenharmony_ci%endif 440cabdff1aSopenharmony_ci SUMSUB_BA w, 6, 4, 5 ; m6=t0a+t3a (t0), m4=t0a-t3a (t3) 441cabdff1aSopenharmony_ci SUMSUB_BA w, 2, 0, 5 ; m2=t1a+t2a (t1), m0=t1a-t2a (t2) 442cabdff1aSopenharmony_ci 443cabdff1aSopenharmony_ci VP9_IDCT8_1D_FINALIZE 444cabdff1aSopenharmony_ci%endmacro 445cabdff1aSopenharmony_ci 446cabdff1aSopenharmony_ci%macro VP9_IDCT8_4x4_1D 0 447cabdff1aSopenharmony_ci pmulhrsw m0, W_11585x2_REG ; m0=t1a/t0a 448cabdff1aSopenharmony_ci pmulhrsw m6, m2, [pw_15137x2] ; m6=t3a 449cabdff1aSopenharmony_ci pmulhrsw m2, [pw_6270x2] ; m2=t2a 450cabdff1aSopenharmony_ci pmulhrsw m7, m1, [pw_16069x2] ; m7=t7a 451cabdff1aSopenharmony_ci pmulhrsw m1, [pw_3196x2] ; m1=t4a 452cabdff1aSopenharmony_ci pmulhrsw m5, m3, [pw_m9102x2] ; m5=t5a 453cabdff1aSopenharmony_ci pmulhrsw m3, [pw_13623x2] ; m3=t6a 454cabdff1aSopenharmony_ci SUMSUB_BA w, 5, 1, 4 ; m1=t4a+t5a (t4), m5=t4a-t5a (t5a) 455cabdff1aSopenharmony_ci SUMSUB_BA w, 3, 7, 4 ; m3=t7a+t6a (t7), m7=t7a-t6a (t6a) 456cabdff1aSopenharmony_ci SUMSUB_BA w, 1, 7, 4 ; m1=t6a+t5a (t6), m7=t6a-t5a (t5) 457cabdff1aSopenharmony_ci pmulhrsw m1, W_11585x2_REG ; m1=t6 458cabdff1aSopenharmony_ci pmulhrsw m7, W_11585x2_REG ; m7=t5 459cabdff1aSopenharmony_ci psubw m4, m0, m6 ; m4=t0a-t3a (t3) 460cabdff1aSopenharmony_ci paddw m6, m0 ; m6=t0a+t3a (t0) 461cabdff1aSopenharmony_ci SCRATCH 5, 8, blockq+ 0 462cabdff1aSopenharmony_ci SUMSUB_BA w, 2, 0, 5 ; m2=t1a+t2a 
(t1), m0=t1a-t2a (t2) 463cabdff1aSopenharmony_ci VP9_IDCT8_1D_FINALIZE 464cabdff1aSopenharmony_ci%endmacro 465cabdff1aSopenharmony_ci 466cabdff1aSopenharmony_ci%macro VP9_IDCT8_2x2_1D 1 467cabdff1aSopenharmony_ci pmulhrsw m0, W_11585x2_REG ; m0=t0 468cabdff1aSopenharmony_ci pmulhrsw m3, m1, W_16069x2_REG ; m3=t7 469cabdff1aSopenharmony_ci pmulhrsw m1, W_3196x2_REG ; m1=t4 470cabdff1aSopenharmony_ci psubw m7, m3, m1 ; t5 = t7a - t4a 471cabdff1aSopenharmony_ci paddw m5, m3, m1 ; t6 = t7a + t4a 472cabdff1aSopenharmony_ci pmulhrsw m7, W_11585x2_REG ; m7=t5 473cabdff1aSopenharmony_ci pmulhrsw m5, W_11585x2_REG ; m5=t6 474cabdff1aSopenharmony_ci SWAP 5, 1 475cabdff1aSopenharmony_ci ; merged VP9_IDCT8_1D_FINALIZE to make register-sharing w/ avx easier 476cabdff1aSopenharmony_ci psubw m6, m0, m3 ; m6=t0-t7 477cabdff1aSopenharmony_ci paddw m3, m0 ; m3=t0+t7 478cabdff1aSopenharmony_ci psubw m2, m0, m1 ; m2=t1-t6 479cabdff1aSopenharmony_ci paddw m1, m0 ; m1=t1+t6 480cabdff1aSopenharmony_ci%if %1 == 1 481cabdff1aSopenharmony_ci punpcklwd m3, m1 482cabdff1aSopenharmony_ci%define SCRATCH_REG 1 483cabdff1aSopenharmony_ci%elif ARCH_X86_32 484cabdff1aSopenharmony_ci mova [blockq+ 0], m2 485cabdff1aSopenharmony_ci%define SCRATCH_REG 2 486cabdff1aSopenharmony_ci%else 487cabdff1aSopenharmony_ci%define SCRATCH_REG 8 488cabdff1aSopenharmony_ci%endif 489cabdff1aSopenharmony_ci psubw m4, m0, m5 ; m4=t3-t4 490cabdff1aSopenharmony_ci paddw m5, m0 ; m5=t3+t4 491cabdff1aSopenharmony_ci SUMSUB_BA w, 7, 0, SCRATCH_REG ; m7=t2+t5, m0=t2-t5 492cabdff1aSopenharmony_ci SWAP 7, 6, 2 493cabdff1aSopenharmony_ci SWAP 3, 5, 0 494cabdff1aSopenharmony_ci%undef SCRATCH_REG 495cabdff1aSopenharmony_ci%endmacro 496cabdff1aSopenharmony_ci 497cabdff1aSopenharmony_ci%macro VP9_IDCT8_WRITEx2 6-8 5 ; line1, line2, tmp1, tmp2, zero, pw_1024/pw_16, shift 498cabdff1aSopenharmony_ci%if cpuflag(ssse3) 499cabdff1aSopenharmony_ci pmulhrsw m%1, %6 ; (x*1024 + (1<<14))>>15 <=> (x+16)>>5 500cabdff1aSopenharmony_ci pmulhrsw 
m%2, %6 501cabdff1aSopenharmony_ci%else 502cabdff1aSopenharmony_ci paddw m%1, %6 503cabdff1aSopenharmony_ci paddw m%2, %6 504cabdff1aSopenharmony_ci psraw m%1, %7 505cabdff1aSopenharmony_ci psraw m%2, %7 506cabdff1aSopenharmony_ci%endif 507cabdff1aSopenharmony_ci%if %0 <= 7 508cabdff1aSopenharmony_ci VP9_STORE_2X %1, %2, %3, %4, %5 509cabdff1aSopenharmony_ci%else 510cabdff1aSopenharmony_ci VP9_STORE_2X %1, %2, %3, %4, %5, %8 511cabdff1aSopenharmony_ci%endif 512cabdff1aSopenharmony_ci%endmacro 513cabdff1aSopenharmony_ci 514cabdff1aSopenharmony_ci; x86-32: 515cabdff1aSopenharmony_ci; - m6 is in mem 516cabdff1aSopenharmony_ci; x86-64: 517cabdff1aSopenharmony_ci; - m8 holds m6 (SWAP) 518cabdff1aSopenharmony_ci; m6 holds zero 519cabdff1aSopenharmony_ci%macro VP9_IDCT8_WRITEOUT 0 520cabdff1aSopenharmony_ci%if ARCH_X86_64 521cabdff1aSopenharmony_ci%if cpuflag(ssse3) 522cabdff1aSopenharmony_ci mova m9, [pw_1024] 523cabdff1aSopenharmony_ci%else 524cabdff1aSopenharmony_ci mova m9, [pw_16] 525cabdff1aSopenharmony_ci%endif 526cabdff1aSopenharmony_ci%define ROUND_REG m9 527cabdff1aSopenharmony_ci%else 528cabdff1aSopenharmony_ci%if cpuflag(ssse3) 529cabdff1aSopenharmony_ci%define ROUND_REG [pw_1024] 530cabdff1aSopenharmony_ci%else 531cabdff1aSopenharmony_ci%define ROUND_REG [pw_16] 532cabdff1aSopenharmony_ci%endif 533cabdff1aSopenharmony_ci%endif 534cabdff1aSopenharmony_ci SCRATCH 5, 10, blockq+16 535cabdff1aSopenharmony_ci SCRATCH 7, 11, blockq+32 536cabdff1aSopenharmony_ci VP9_IDCT8_WRITEx2 0, 1, 5, 7, 6, ROUND_REG 537cabdff1aSopenharmony_ci lea dstq, [dstq+2*strideq] 538cabdff1aSopenharmony_ci VP9_IDCT8_WRITEx2 2, 3, 5, 7, 6, ROUND_REG 539cabdff1aSopenharmony_ci lea dstq, [dstq+2*strideq] 540cabdff1aSopenharmony_ci UNSCRATCH 5, 10, blockq+16 541cabdff1aSopenharmony_ci UNSCRATCH 7, 11, blockq+32 542cabdff1aSopenharmony_ci VP9_IDCT8_WRITEx2 4, 5, 0, 1, 6, ROUND_REG 543cabdff1aSopenharmony_ci lea dstq, [dstq+2*strideq] 544cabdff1aSopenharmony_ci UNSCRATCH 5, 8, blockq+ 0 
545cabdff1aSopenharmony_ci VP9_IDCT8_WRITEx2 5, 7, 0, 1, 6, ROUND_REG 546cabdff1aSopenharmony_ci 547cabdff1aSopenharmony_ci%undef ROUND_REG 548cabdff1aSopenharmony_ci%endmacro 549cabdff1aSopenharmony_ci 550cabdff1aSopenharmony_ci%macro VP9_IDCT_IDCT_8x8_ADD_XMM 2 551cabdff1aSopenharmony_ciINIT_XMM %1 552cabdff1aSopenharmony_cicglobal vp9_idct_idct_8x8_add, 4, 4, %2, dst, stride, block, eob 553cabdff1aSopenharmony_ci 554cabdff1aSopenharmony_ci%if cpuflag(ssse3) 555cabdff1aSopenharmony_ci%if ARCH_X86_64 556cabdff1aSopenharmony_ci mova m12, [pw_11585x2] ; often used 557cabdff1aSopenharmony_ci%define W_11585x2_REG m12 558cabdff1aSopenharmony_ci%else 559cabdff1aSopenharmony_ci%define W_11585x2_REG [pw_11585x2] 560cabdff1aSopenharmony_ci%endif 561cabdff1aSopenharmony_ci 562cabdff1aSopenharmony_ci cmp eobd, 12 ; top left half or less 563cabdff1aSopenharmony_ci jg .idctfull 564cabdff1aSopenharmony_ci 565cabdff1aSopenharmony_ci cmp eobd, 3 ; top left corner or less 566cabdff1aSopenharmony_ci jg .idcthalf 567cabdff1aSopenharmony_ci 568cabdff1aSopenharmony_ci cmp eobd, 1 ; faster path for when only DC is set 569cabdff1aSopenharmony_ci jne .idcttopleftcorner 570cabdff1aSopenharmony_ci%else 571cabdff1aSopenharmony_ci cmp eobd, 1 572cabdff1aSopenharmony_ci jg .idctfull 573cabdff1aSopenharmony_ci%endif 574cabdff1aSopenharmony_ci 575cabdff1aSopenharmony_ci%if cpuflag(ssse3) 576cabdff1aSopenharmony_ci movd m0, [blockq] 577cabdff1aSopenharmony_ci pmulhrsw m0, W_11585x2_REG 578cabdff1aSopenharmony_ci pmulhrsw m0, W_11585x2_REG 579cabdff1aSopenharmony_ci%else 580cabdff1aSopenharmony_ci DEFINE_ARGS dst, stride, block, coef 581cabdff1aSopenharmony_ci movsx coefd, word [blockq] 582cabdff1aSopenharmony_ci imul coefd, 11585 583cabdff1aSopenharmony_ci add coefd, 8192 584cabdff1aSopenharmony_ci sar coefd, 14 585cabdff1aSopenharmony_ci imul coefd, 11585 586cabdff1aSopenharmony_ci add coefd, (16 << 14) + 8192 587cabdff1aSopenharmony_ci sar coefd, 14 + 5 588cabdff1aSopenharmony_ci movd m0, 
coefd 589cabdff1aSopenharmony_ci%endif 590cabdff1aSopenharmony_ci SPLATW m0, m0, 0 591cabdff1aSopenharmony_ci pxor m4, m4 592cabdff1aSopenharmony_ci movd [blockq], m4 593cabdff1aSopenharmony_ci%if cpuflag(ssse3) 594cabdff1aSopenharmony_ci pmulhrsw m0, [pw_1024] ; (x*1024 + (1<<14))>>15 <=> (x+16)>>5 595cabdff1aSopenharmony_ci%endif 596cabdff1aSopenharmony_ci%rep 3 597cabdff1aSopenharmony_ci VP9_STORE_2X 0, 0, 6, 7, 4 598cabdff1aSopenharmony_ci lea dstq, [dstq+2*strideq] 599cabdff1aSopenharmony_ci%endrep 600cabdff1aSopenharmony_ci VP9_STORE_2X 0, 0, 6, 7, 4 601cabdff1aSopenharmony_ci RET 602cabdff1aSopenharmony_ci 603cabdff1aSopenharmony_ci%if cpuflag(ssse3) 604cabdff1aSopenharmony_ci; faster path for when only left corner is set (3 input: DC, right to DC, below 605cabdff1aSopenharmony_ci; to DC). Note: also working with a 2x2 block 606cabdff1aSopenharmony_ci.idcttopleftcorner: 607cabdff1aSopenharmony_ci movd m0, [blockq+0] 608cabdff1aSopenharmony_ci movd m1, [blockq+16] 609cabdff1aSopenharmony_ci%if ARCH_X86_64 610cabdff1aSopenharmony_ci mova m10, [pw_3196x2] 611cabdff1aSopenharmony_ci mova m11, [pw_16069x2] 612cabdff1aSopenharmony_ci%define W_3196x2_REG m10 613cabdff1aSopenharmony_ci%define W_16069x2_REG m11 614cabdff1aSopenharmony_ci%else 615cabdff1aSopenharmony_ci%define W_3196x2_REG [pw_3196x2] 616cabdff1aSopenharmony_ci%define W_16069x2_REG [pw_16069x2] 617cabdff1aSopenharmony_ci%endif 618cabdff1aSopenharmony_ci VP9_IDCT8_2x2_1D 1 619cabdff1aSopenharmony_ci ; partial 2x8 transpose 620cabdff1aSopenharmony_ci ; punpcklwd m0, m1 already done inside idct 621cabdff1aSopenharmony_ci punpcklwd m2, m3 622cabdff1aSopenharmony_ci punpcklwd m4, m5 623cabdff1aSopenharmony_ci punpcklwd m6, m7 624cabdff1aSopenharmony_ci punpckldq m0, m2 625cabdff1aSopenharmony_ci punpckldq m4, m6 626cabdff1aSopenharmony_ci SBUTTERFLY qdq, 0, 4, 1 627cabdff1aSopenharmony_ci SWAP 1, 4 628cabdff1aSopenharmony_ci VP9_IDCT8_2x2_1D 2 629cabdff1aSopenharmony_ci%if ARCH_X86_64 
630cabdff1aSopenharmony_ci SWAP 6, 8 631cabdff1aSopenharmony_ci%endif 632cabdff1aSopenharmony_ci pxor m6, m6 ; used for the block reset, and VP9_STORE_2X 633cabdff1aSopenharmony_ci VP9_IDCT8_WRITEOUT 634cabdff1aSopenharmony_ci%if ARCH_X86_64 635cabdff1aSopenharmony_ci movd [blockq+ 0], m6 636cabdff1aSopenharmony_ci movd [blockq+16], m6 637cabdff1aSopenharmony_ci%else 638cabdff1aSopenharmony_ci mova [blockq+ 0], m6 639cabdff1aSopenharmony_ci mova [blockq+16], m6 640cabdff1aSopenharmony_ci mova [blockq+32], m6 641cabdff1aSopenharmony_ci%endif 642cabdff1aSopenharmony_ci RET 643cabdff1aSopenharmony_ci 644cabdff1aSopenharmony_ci.idcthalf: 645cabdff1aSopenharmony_ci movh m0, [blockq + 0] 646cabdff1aSopenharmony_ci movh m1, [blockq +16] 647cabdff1aSopenharmony_ci movh m2, [blockq +32] 648cabdff1aSopenharmony_ci movh m3, [blockq +48] 649cabdff1aSopenharmony_ci VP9_IDCT8_4x4_1D 650cabdff1aSopenharmony_ci ; partial 4x8 transpose 651cabdff1aSopenharmony_ci%if ARCH_X86_32 652cabdff1aSopenharmony_ci mova m6, [blockq+ 0] 653cabdff1aSopenharmony_ci%endif 654cabdff1aSopenharmony_ci punpcklwd m0, m1 655cabdff1aSopenharmony_ci punpcklwd m2, m3 656cabdff1aSopenharmony_ci punpcklwd m4, m5 657cabdff1aSopenharmony_ci punpcklwd m6, m7 658cabdff1aSopenharmony_ci SBUTTERFLY dq, 0, 2, 1 659cabdff1aSopenharmony_ci SBUTTERFLY dq, 4, 6, 5 660cabdff1aSopenharmony_ci SBUTTERFLY qdq, 0, 4, 1 661cabdff1aSopenharmony_ci SBUTTERFLY qdq, 2, 6, 5 662cabdff1aSopenharmony_ci SWAP 1, 4 663cabdff1aSopenharmony_ci SWAP 3, 6 664cabdff1aSopenharmony_ci VP9_IDCT8_4x4_1D 665cabdff1aSopenharmony_ci%if ARCH_X86_64 666cabdff1aSopenharmony_ci SWAP 6, 8 667cabdff1aSopenharmony_ci%endif 668cabdff1aSopenharmony_ci pxor m6, m6 669cabdff1aSopenharmony_ci VP9_IDCT8_WRITEOUT 670cabdff1aSopenharmony_ci%if ARCH_X86_64 671cabdff1aSopenharmony_ci movh [blockq+ 0], m6 672cabdff1aSopenharmony_ci movh [blockq+16], m6 673cabdff1aSopenharmony_ci movh [blockq+32], m6 674cabdff1aSopenharmony_ci%else 675cabdff1aSopenharmony_ci mova 
[blockq+ 0], m6 676cabdff1aSopenharmony_ci mova [blockq+16], m6 677cabdff1aSopenharmony_ci mova [blockq+32], m6 678cabdff1aSopenharmony_ci%endif 679cabdff1aSopenharmony_ci movh [blockq+48], m6 680cabdff1aSopenharmony_ci RET 681cabdff1aSopenharmony_ci%endif 682cabdff1aSopenharmony_ci 683cabdff1aSopenharmony_ci.idctfull: ; generic full 8x8 idct/idct 684cabdff1aSopenharmony_ci%if ARCH_X86_64 685cabdff1aSopenharmony_ci mova m0, [blockq+ 0] ; IN(0) 686cabdff1aSopenharmony_ci%endif 687cabdff1aSopenharmony_ci mova m1, [blockq+ 16] ; IN(1) 688cabdff1aSopenharmony_ci mova m2, [blockq+ 32] ; IN(2) 689cabdff1aSopenharmony_ci mova m3, [blockq+ 48] ; IN(3) 690cabdff1aSopenharmony_ci%if ARCH_X86_64 691cabdff1aSopenharmony_ci mova m4, [blockq+ 64] ; IN(4) 692cabdff1aSopenharmony_ci%endif 693cabdff1aSopenharmony_ci mova m5, [blockq+ 80] ; IN(5) 694cabdff1aSopenharmony_ci mova m6, [blockq+ 96] ; IN(6) 695cabdff1aSopenharmony_ci mova m7, [blockq+112] ; IN(7) 696cabdff1aSopenharmony_ci%if ARCH_X86_64 697cabdff1aSopenharmony_ci mova m11, [pd_8192] ; rounding 698cabdff1aSopenharmony_ci%define D_8192_REG m11 699cabdff1aSopenharmony_ci%else 700cabdff1aSopenharmony_ci%define D_8192_REG [pd_8192] 701cabdff1aSopenharmony_ci%endif 702cabdff1aSopenharmony_ci VP9_IDCT8_1D 703cabdff1aSopenharmony_ci%if ARCH_X86_64 704cabdff1aSopenharmony_ci TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 705cabdff1aSopenharmony_ci%else 706cabdff1aSopenharmony_ci TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [blockq+0], [blockq+64], 1 707cabdff1aSopenharmony_ci mova [blockq+0], m0 708cabdff1aSopenharmony_ci%endif 709cabdff1aSopenharmony_ci VP9_IDCT8_1D 710cabdff1aSopenharmony_ci 711cabdff1aSopenharmony_ci%if ARCH_X86_64 712cabdff1aSopenharmony_ci SWAP 6, 8 713cabdff1aSopenharmony_ci%endif 714cabdff1aSopenharmony_ci pxor m6, m6 ; used for the block reset, and VP9_STORE_2X 715cabdff1aSopenharmony_ci VP9_IDCT8_WRITEOUT 716cabdff1aSopenharmony_ci ZERO_BLOCK blockq, 16, 8, m6 717cabdff1aSopenharmony_ci RET 
%undef W_11585x2_REG
%endmacro

VP9_IDCT_IDCT_8x8_ADD_XMM sse2, 12
VP9_IDCT_IDCT_8x8_ADD_XMM ssse3, 13
VP9_IDCT_IDCT_8x8_ADD_XMM avx, 13

;---------------------------------------------------------------------------------------------
; void vp9_iadst_iadst_8x8_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
;---------------------------------------------------------------------------------------------

; VP9_IADST8_1D: one 1-D pass of the 8-point inverse ADST on m0-m7.
;
; Requires the caller to have defined D_8192_REG (rounding constant operand)
; and W_M1_REG (operand handed to PSIGNW) before invoking the macro;
; W_11585x2_REG is only referenced by the disabled "%if 0" branch.
;
; x86-32:
; - in: m0/3/4/7 are in mem [blockq+N*16]
; - out: m6 is in mem [blockq+0]
; x86-64:
; - everything is in registers
;
; Every SCRATCH/UNSCRATCH names both a register number and a blockq slot, so
; the same statement sequence serves both layouts described above.
%macro VP9_IADST8_1D 0 ; input/output=m0/1/2/3/4/5/6/7
%if ARCH_X86_64
    ; park the four inputs that are not needed by the first rotations
    SWAP                     0,  8
    SWAP                     3,  9
    SWAP                     4, 10
    SWAP                     7, 11
%endif

    VP9_UNPACK_MULSUB_2D_4X  5,  2,  0,  3, 14449,  7723    ; m5/2=t3[d], m2/4=t2[d]
    VP9_UNPACK_MULSUB_2D_4X  1,  6,  4,  7,  4756, 15679    ; m1/4=t7[d], m6/7=t6[d]
    SCRATCH                  4, 12, blockq+1*16
    VP9_RND_SH_SUMSUB_BA     6,  2,  7,  3,  4, D_8192_REG  ; m6=t2[w], m2=t6[w]
    UNSCRATCH                4, 12, blockq+1*16
    VP9_RND_SH_SUMSUB_BA     1,  5,  4,  0,  3, D_8192_REG  ; m1=t3[w], m5=t7[w]

    ; bring in the parked inputs, park the four results just produced
    UNSCRATCH                0,  8, blockq+16*0
    UNSCRATCH                3,  9, blockq+16*3
    UNSCRATCH                4, 10, blockq+16*4
    UNSCRATCH                7, 11, blockq+16*7
    SCRATCH                  1,  8, blockq+16*1
    SCRATCH                  2,  9, blockq+16*2
    SCRATCH                  5, 10, blockq+16*5
    SCRATCH                  6, 11, blockq+16*6

    VP9_UNPACK_MULSUB_2D_4X  7,  0,  1,  2, 16305,  1606    ; m7/1=t1[d], m0/2=t0[d]
    VP9_UNPACK_MULSUB_2D_4X  3,  4,  5,  6, 10394, 12665    ; m3/5=t5[d], m4/6=t4[d]
    SCRATCH                  1, 12, blockq+ 0*16
    VP9_RND_SH_SUMSUB_BA     4,  0,  6,  2,  1, D_8192_REG  ; m4=t0[w], m0=t4[w]
    UNSCRATCH                1, 12, blockq+ 0*16
    VP9_RND_SH_SUMSUB_BA     3,  7,  5,  1,  2, D_8192_REG  ; m3=t1[w], m7=t5[w]

    UNSCRATCH                2,  9, blockq+16*2
    UNSCRATCH                5, 10, blockq+16*5
    SCRATCH                  3,  9, blockq+16*3
    SCRATCH                  4, 10, blockq+16*4

    ; m4=t0, m3=t1, m6=t2, m1=t3, m0=t4, m7=t5, m2=t6, m5=t7

    VP9_UNPACK_MULSUB_2D_4X  0,  7,  1,  3, 15137,  6270    ; m0/1=t5[d], m7/3=t4[d]
    VP9_UNPACK_MULSUB_2D_4X  5,  2,  4,  6,  6270, 15137    ; m5/4=t6[d], m2/6=t7[d]
    SCRATCH                  1, 12, blockq+ 0*16
    VP9_RND_SH_SUMSUB_BA     5,  7,  4,  3,  1, D_8192_REG
    UNSCRATCH                1, 12, blockq+ 0*16
    PSIGNW                  m5, W_M1_REG                    ; m5=out1[w], m7=t6[w]
    VP9_RND_SH_SUMSUB_BA     2,  0,  6,  1,  3, D_8192_REG  ; m2=out6[w], m0=t7[w]

    UNSCRATCH                1,  8, blockq+16*1
    UNSCRATCH                3,  9, blockq+16*3
    UNSCRATCH                4, 10, blockq+16*4
    UNSCRATCH                6, 11, blockq+16*6
    SCRATCH                  2,  8, blockq+16*0

    SUMSUB_BA                w,  6,  4,  2                  ; m6=out0[w], m4=t2[w]
    SUMSUB_BA                w,  1,  3,  2
    PSIGNW                  m1, W_M1_REG                    ; m1=out7[w], m3=t3[w]

    ; m6=out0, m5=out1, m4=t2, m3=t3, m7=t6, m0=t7, m2=out6, m1=out7

    ; unfortunately, the code below overflows in some cases
%if 0; cpuflag(ssse3)
    SUMSUB_BA                w,  3,  4,  2
    SUMSUB_BA                w,  0,  7,  2
    pmulhrsw                m3, W_11585x2_REG
    pmulhrsw                m7, W_11585x2_REG
    pmulhrsw                m4, W_11585x2_REG               ; out4
    pmulhrsw                m0, W_11585x2_REG               ; out2
%else
    ; m5 is handed to the rotations as a temporary, so spill it around them
    SCRATCH                  5,  9, blockq+16*1
    VP9_UNPACK_MULSUB_2W_4X  4,  3, 11585, 11585, D_8192_REG, 2, 5
    VP9_UNPACK_MULSUB_2W_4X  7,  0, 11585, 11585, D_8192_REG, 2, 5
    UNSCRATCH                5,  9, blockq+16*1
%endif
    PSIGNW                  m3, W_M1_REG                    ; out3
    PSIGNW                  m7, W_M1_REG                    ; out5

    ; m6=out0, m5=out1, m0=out2, m3=out3, m4=out4, m7=out5, m2=out6, m1=out7

    ; rename registers so the outputs end up in m0..m7 order
%if ARCH_X86_64
    SWAP                     2,  8
%endif
    SWAP                     0,  6,  2
    SWAP                     7,  1,  5
%endmacro

; IADST8_FN: emit one vp9_<%1>_<%3>_8x8_add function.
;   %1  name of the first transform (idct/iadst), used in the symbol name and
;       to decide which inputs are preloaded on x86-32
;   %2  suffix of the macro run for the first pass  (VP9_%2_1D)
;   %3  name of the second transform, used in the symbol name and to decide
;       which rows are written back to the block after the transpose on x86-32
;   %4  suffix of the macro run for the second pass (VP9_%4_1D)
;   %5  instruction set handed to INIT_XMM
;   %6  forwarded as the fourth cglobal argument
%macro IADST8_FN 6
INIT_XMM %5
cglobal vp9_%1_%3_8x8_add, 3, 3, %6, dst, stride, block, eob

%ifidn %1, idct
%define first_is_idct 1
%else
%define first_is_idct 0
%endif

%ifidn %3, idct
%define second_is_idct 1
%else
%define second_is_idct 0
%endif

    ; on x86-32 an iadst first pass reads IN(0)/IN(3)/IN(4)/IN(7) from the
    ; block itself, so those loads are skipped there
%if ARCH_X86_64
    mova                m0, [blockq+  0]    ; IN(0)
%endif
    mova                m1, [blockq+ 16]    ; IN(1)
    mova                m2, [blockq+ 32]    ; IN(2)
%if ARCH_X86_64 || first_is_idct
    mova                m3, [blockq+ 48]    ; IN(3)
%endif
%if ARCH_X86_64
    mova                m4, [blockq+ 64]    ; IN(4)
%endif
    mova                m5, [blockq+ 80]    ; IN(5)
    mova                m6, [blockq+ 96]    ; IN(6)
%if ARCH_X86_64 || first_is_idct
    mova                m7, [blockq+112]    ; IN(7)
%endif
%if ARCH_X86_64
%if cpuflag(ssse3)
    mova               m15, [pw_11585x2]    ; often used
%endif
    mova               m13, [pd_8192]       ; rounding
    mova               m14, [pw_m1]
%define W_11585x2_REG m15
%define D_8192_REG m13
%define W_M1_REG m14
%else
%define W_11585x2_REG [pw_11585x2]
%define D_8192_REG [pd_8192]
%define W_M1_REG [pw_m1]
%endif

    ; note different calling conventions for idct8 vs. iadst8 on x86-32
    VP9_%2_1D
%if ARCH_X86_64
    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, [blockq+0], [blockq+64], 1
    mova      [blockq+  0], m0
%if second_is_idct == 0
    ; an iadst second pass expects rows 3 and 7 in memory as well
    mova      [blockq+ 48], m3
    mova      [blockq+112], m7
%endif
%endif
    VP9_%4_1D

%if ARCH_X86_64
    SWAP                 6, 8
%endif
    pxor                m6, m6              ; used for the block reset, and VP9_STORE_2X
    VP9_IDCT8_WRITEOUT
    ZERO_BLOCK      blockq, 16, 8, m6
    RET

%undef W_11585x2_REG
%undef first_is_idct
%undef second_is_idct

%endmacro

IADST8_FN idct,  IDCT8,  iadst, IADST8, sse2, 15
IADST8_FN iadst, IADST8, idct,  IDCT8,  sse2, 15
IADST8_FN iadst, IADST8, iadst, IADST8, sse2, 15
IADST8_FN idct,  IDCT8,  iadst, IADST8, ssse3, 16
IADST8_FN idct,  IDCT8,  iadst, IADST8, avx, 16
IADST8_FN iadst, IADST8, idct,  IDCT8,  ssse3, 16
IADST8_FN iadst, IADST8, idct,  IDCT8,  avx, 16
IADST8_FN iadst, IADST8, iadst, IADST8, ssse3, 16
IADST8_FN iadst, IADST8, iadst, IADST8, avx, 16

;---------------------------------------------------------------------------------------------
; void vp9_idct_idct_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
;---------------------------------------------------------------------------------------------

; VP9_IDCT16_1D_START: shared first part of a 16-point 1-D IDCT.
;   %1  src             base address of the input rows, read as [%1+n*%3]
;   %2  nnzc            picks the specialisation: <= 4 reads only IN(0..3),
;                       <= 8 reads only IN(0..7), anything else reads all 16
;   %3  stride          byte distance between consecutive IN(n)
;   %4  scratch         base address used for spills
;   %5  scratch_stride  byte distance between spill slots
;   %6  is_iadst        when non-zero, the pmulhrsw shortcut for the 11585
;                       rotations is not taken even if ssse3 is available
; The nnzc <= 4 and nnzc <= 8 paths use pmulhrsw unconditionally.
;
; x86-64:
; at the end of this macro, m7 is stored in [%4+15*%5]
; everything else (t0-6 and t8-15) is stored in m0-6 and m8-15
; the following sumsubs have not been done yet:
;    SUMSUB_BA            w,  6,  9, 15      ; t6, t9
;    SUMSUB_BA            w,  7,  8, 15      ; t7, t8
; or (x86-32) t0-t5 are in m0-m5, t10-t15 are in x11/9/7/5/3/1,
; and the following sumsubs have not been done yet:
;    SUMSUB_BA            w, x13, x14, 7       ; t6, t9
;    SUMSUB_BA            w, x15, x12, 7       ; t7, t8

%macro VP9_IDCT16_1D_START 6 ; src, nnzc, stride, scratch, scratch_stride, is_iadst
%if %2 <= 4
    mova                m3, [%1+ 1*%3]      ; IN(1)
    mova                m0, [%1+ 3*%3]      ; IN(3)

    pmulhrsw            m4, m3, [pw_16305x2]        ; t14-15
    pmulhrsw            m3, [pw_1606x2]             ; t8-9
    pmulhrsw            m7, m0, [pw_m4756x2]        ; t10-11
    pmulhrsw            m0, [pw_15679x2]            ; t12-13

    ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m14=t5, m13=t6, m15=t7
    ; m3=t8, m5=t9, m1=t10, m7=t11, m0=t12, m6=t13, m2=t14, m4=t15

    VP9_UNPACK_MULSUB_2W_4X 2, 5, 4, 3, 15137,  6270, [pd_8192], 1, 6 ; t9,  t14
    SCRATCH              4, 10, %4+ 1*%5
    SCRATCH              5, 11, %4+ 7*%5
    VP9_UNPACK_MULSUB_2W_4X 6, 1, 0, 7, 6270, m15137, [pd_8192], 4, 5 ; t10, t13
    UNSCRATCH            5, 11, %4+ 7*%5

    ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
    ; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15
%else
    mova                m5, [%1+ 1*%3]      ; IN(1)
    mova                m4, [%1+ 7*%3]      ; IN(7)
%if %2 <= 8
    pmulhrsw            m2, m5, [pw_16305x2]        ; t15
    pmulhrsw            m5, [pw_1606x2]             ; t8
    pmulhrsw            m3, m4, [pw_m10394x2]       ; t9
    pmulhrsw            m4, [pw_12665x2]            ; t14
%else
    mova                m3, [%1+ 9*%3]      ; IN(9)
    mova                m2, [%1+15*%3]      ; IN(15)

    ; m10=in0, m5=in1, m14=in2, m6=in3, m9=in4, m7=in5, m15=in6, m4=in7
    ; m11=in8, m3=in9, m12=in10 m0=in11, m8=in12, m1=in13, m13=in14, m2=in15

    VP9_UNPACK_MULSUB_2W_4X   5,   2, 16305,  1606, [pd_8192], 0, 1 ; t8,  t15
    VP9_UNPACK_MULSUB_2W_4X   3,   4, 10394, 12665, [pd_8192], 0, 1 ; t9,  t14
%endif

    SUMSUB_BA            w,  3,  5, 0       ; t8,  t9
    SUMSUB_BA            w,  4,  2, 0       ; t15, t14

    VP9_UNPACK_MULSUB_2W_4X   2,   5, 15137,  6270, [pd_8192], 0, 1 ; t9,  t14

    SCRATCH              4, 10, %4+ 1*%5
    SCRATCH              5, 11, %4+ 7*%5

    mova                m6, [%1+ 3*%3]      ; IN(3)
    mova                m7, [%1+ 5*%3]      ; IN(5)
%if %2 <= 8
    pmulhrsw            m0, m7, [pw_14449x2]        ; t13
    pmulhrsw            m7, [pw_7723x2]             ; t10
    pmulhrsw            m1, m6, [pw_m4756x2]        ; t11
    pmulhrsw            m6, [pw_15679x2]            ; t12
%else
    mova                m0, [%1+11*%3]      ; IN(11)
    mova                m1, [%1+13*%3]      ; IN(13)

    VP9_UNPACK_MULSUB_2W_4X   7,   0, 14449,  7723, [pd_8192], 4, 5 ; t10, t13
    VP9_UNPACK_MULSUB_2W_4X   1,   6,  4756, 15679, [pd_8192], 4, 5 ; t11, t12
%endif

    ; m11=t0, m10=t1, m9=t2, m8=t3, m14=t4, m12=t5, m15=t6, m13=t7
    ; m5=t8, m3=t9, m7=t10, m1=t11, m6=t12, m0=t13, m4=t14, m2=t15

    SUMSUB_BA            w,  7,  1, 4       ; t11, t10
    SUMSUB_BA            w,  0,  6, 4       ; t12, t13

    ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m14=t5, m13=t6, m15=t7
    ; m3=t8, m5=t9, m1=t10, m7=t11, m0=t12, m6=t13, m2=t14, m4=t15

    VP9_UNPACK_MULSUB_2W_4X   6,   1, 6270, m15137, [pd_8192], 4, 5 ; t10, t13

    UNSCRATCH            5, 11, %4+ 7*%5
%endif

    ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m13=t5, m14=t6, m15=t7
    ; m3=t8, m2=t9, m6=t10, m7=t11, m0=t12, m1=t13, m5=t14, m4=t15

    SUMSUB_BA            w,  7,  3, 4       ; t8,  t11

    ; backup first register
    mova        [%4+15*%5], m7

    SUMSUB_BA            w,  6,  2, 7       ; t9,  t10
    UNSCRATCH            4, 10, %4+ 1*%5
    SUMSUB_BA            w,  0,  4, 7       ; t15, t12
    SUMSUB_BA            w,  1,  5, 7       ; t14, t13

    ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
    ; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15

%if cpuflag(ssse3) && %6 == 0
    SUMSUB_BA            w,  2,  5, 7
    SUMSUB_BA            w,  3,  4, 7
    pmulhrsw            m5, [pw_11585x2]    ; t10
    pmulhrsw            m4, [pw_11585x2]    ; t11
    pmulhrsw            m3, [pw_11585x2]    ; t12
    pmulhrsw            m2, [pw_11585x2]    ; t13
%else
    ; m6 is handed to the rotations as a temporary, so spill it around them
    SCRATCH              6, 10, %4+ 1*%5
    VP9_UNPACK_MULSUB_2W_4X   5,   2, 11585, 11585, [pd_8192], 6, 7 ; t10, t13
    VP9_UNPACK_MULSUB_2W_4X   4,   3, 11585, 11585, [pd_8192], 6, 7 ; t11, t12
    UNSCRATCH            6, 10, %4+ 1*%5
%endif

    ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
    ; m7=t8, m6=t9, m5=t10, m4=t11, m3=t12, m2=t13, m1=t14, m0=t15

    ; park the odd half (t9..t15) so m0-m7 are free for the even half
    SCRATCH              0,  8, %4+ 1*%5
    SCRATCH              1,  9, %4+ 3*%5
    SCRATCH              2, 10, %4+ 5*%5
    SCRATCH              3, 11, %4+ 7*%5
    SCRATCH              4, 12, %4+ 9*%5
    SCRATCH              5, 13, %4+11*%5
    SCRATCH              6, 14, %4+13*%5

    ; even (tx8x8)
%if %2 <= 4
    mova                m3, [%1+ 0*%3]      ; IN(0)
    mova                m4, [%1+ 2*%3]      ; IN(2)

    pmulhrsw            m3, [pw_11585x2]            ; t0-t3
    pmulhrsw            m7, m4, [pw_16069x2]        ; t6-7
    pmulhrsw            m4, [pw_3196x2]             ; t4-5

%if 0 ; overflows :(
    paddw               m6, m7, m4
    psubw               m5, m7, m4
    pmulhrsw            m5, [pw_11585x2]            ; t5
    pmulhrsw            m6, [pw_11585x2]            ; t6
%else
    VP9_UNPACK_MULSUB_2W_4X  5, 6, 7, 4, 11585, 11585, [pd_8192], 0, 1 ; t5,  t6
%endif

    psubw               m0, m3, m7
    paddw               m7, m3
    psubw               m1, m3, m6
    paddw               m6, m3
    psubw               m2, m3, m5
    paddw               m5, m3

%if ARCH_X86_32
    SWAP                 0, 7
%endif
    SCRATCH              7, 15, %4+12*%5
%else
    mova                m6, [%1+ 2*%3]      ; IN(2)
    mova                m1, [%1+ 4*%3]      ; IN(4)
    mova                m7, [%1+ 6*%3]      ; IN(6)
%if %2 <= 8
    pmulhrsw            m0, m1, [pw_15137x2]        ; t3
    pmulhrsw            m1, [pw_6270x2]             ; t2
    pmulhrsw            m5, m6, [pw_16069x2]        ; t7
    pmulhrsw            m6, [pw_3196x2]             ; t4
    pmulhrsw            m4, m7, [pw_m9102x2]        ; t5
    pmulhrsw            m7, [pw_13623x2]            ; t6
%else
    mova                m4, [%1+10*%3]      ; IN(10)
    mova                m0, [%1+12*%3]      ; IN(12)
    mova                m5, [%1+14*%3]      ; IN(14)

    VP9_UNPACK_MULSUB_2W_4X   1,   0, 15137,  6270, [pd_8192], 2, 3 ; t2,  t3
    VP9_UNPACK_MULSUB_2W_4X   6,   5, 16069,  3196, [pd_8192], 2, 3 ; t4,  t7
    VP9_UNPACK_MULSUB_2W_4X   4,   7,  9102, 13623, [pd_8192], 2, 3 ; t5,  t6
%endif

    SUMSUB_BA            w,  4,  6, 2       ; t4,  t5
    SUMSUB_BA            w,  7,  5, 2       ; t7,  t6

%if cpuflag(ssse3) && %6 == 0
    SUMSUB_BA            w,  6,  5, 2
    pmulhrsw            m5, [pw_11585x2]            ; t5
    pmulhrsw            m6, [pw_11585x2]            ; t6
%else
    VP9_UNPACK_MULSUB_2W_4X  5,  6, 11585, 11585, [pd_8192], 2, 3 ; t5,  t6
%endif

    SCRATCH              5, 15, %4+10*%5
    mova                m2, [%1+ 0*%3]      ; IN(0)
%if %2 <= 8
    pmulhrsw            m2, [pw_11585x2]    ; t0 and t1
    psubw               m3, m2, m0
    paddw               m0, m2

    SUMSUB_BA            w,  7,  0, 5       ; t0,  t7
%else
    mova                m3, [%1+ 8*%3]      ; IN(8)

    ; from 3 stages back
%if cpuflag(ssse3) && %6 == 0
    SUMSUB_BA            w,  3,  2, 5
    pmulhrsw            m3, [pw_11585x2]    ; t0
    pmulhrsw            m2, [pw_11585x2]    ; t1
%else
    ; m0 is handed to the rotation as a temporary; IN(0) has already been
    ; loaded into m2, so its slot in the source is reused to hold m0
    mova        [%1+ 0*%3], m0
    VP9_UNPACK_MULSUB_2W_4X  2,  3, 11585, 11585, [pd_8192], 5, 0 ; t0, t1
    mova                m0, [%1+ 0*%3]
%endif

    ; from 2 stages back
    SUMSUB_BA            w,  0,  3, 5       ; t0,  t3

    SUMSUB_BA            w,  7,  0, 5       ; t0,  t7
%endif
    UNSCRATCH            5, 15, %4+10*%5
%if ARCH_X86_32
    SWAP                 0, 7
%endif
    SCRATCH              7, 15, %4+12*%5
    SUMSUB_BA            w,  1,  2, 7       ; t1,  t2

    ; from 1 stage back
    SUMSUB_BA            w,  6,  1, 7       ; t1,  t6
    SUMSUB_BA            w,  5,  2, 7       ; t2,  t5
%endif
    SUMSUB_BA            w,  4,  3, 7       ; t3,  t4

%if ARCH_X86_64
    SWAP                 0, 8
    SWAP                 1, 9
    SWAP                 2, 10
    SWAP                 3, 11
    SWAP                 4, 12
    SWAP                 5, 13
    SWAP                 6, 14

    SUMSUB_BA            w,  0, 15, 7       ; t0, t15
    SUMSUB_BA            w,  1, 14, 7       ; t1, t14
    SUMSUB_BA            w,  2, 13, 7       ; t2, t13
    SUMSUB_BA            w,  3, 12, 7       ; t3, t12
    SUMSUB_BA            w,  4, 11, 7       ; t4, t11
    SUMSUB_BA            w,  5, 10, 7       ; t5, t10
%else
    SWAP                 1, 6
    SWAP                 2, 5
    SWAP                 3, 4
    mova        [%4+14*%5], m6

    ; butterfly register %1 against the value held in scratch slot %2 and
    ; write one of the two results to scratch slot %3 (m6/m7 are temporaries)
%macro %%SUMSUB_BA_STORE 5 ; reg, from_mem, to_mem, scratch, scratch_stride
    mova                m6, [%4+%2*%5]
    SUMSUB_BA            w,  6, %1, 7
    SWAP                %1, 6
    mova        [%4+%3*%5], m6
%endmacro

    %%SUMSUB_BA_STORE    0,  1,  1, %4, %5  ; t0, t15
    %%SUMSUB_BA_STORE    1,  3,  3, %4, %5  ; t1, t14
    %%SUMSUB_BA_STORE    2,  5,  5, %4, %5  ; t2, t13
    %%SUMSUB_BA_STORE    3,  7,  7, %4, %5  ; t3, t12
    %%SUMSUB_BA_STORE    4,  9,  9, %4, %5  ; t4, t11
    %%SUMSUB_BA_STORE    5, 11, 11, %4, %5  ; t5, t10
%endif
%endmacro

; VP9_IDCT16_1D: one full 16-point 1-D IDCT pass over eight columns.
;   %1  src       base address of the input rows
;   %2  pass      1 = transform, transpose and store to tmpq;
;                 2 = transform, round and add the result to dstq
;   %3  nnzc      forwarded to VP9_IDCT16_1D_START (default 16)
;   %4  is_iadst  forwarded to VP9_IDCT16_1D_START (default 1)
; Pass 1 reads %1 with a 32-byte row pitch and uses tmpq (16-byte slots) as
; scratch. Pass 2 uses %1 itself as scratch with a 32-byte pitch and advances
; dstq by two rows after each VP9_IDCT8_WRITEx2 except the last one on x86-64.
%macro VP9_IDCT16_1D 2-4 16, 1 ; src, pass, nnzc, is_iadst
%if %2 == 1
    VP9_IDCT16_1D_START %1, %3, 32, tmpq, 16, %4

%if ARCH_X86_64
    ; backup a different register
    mova                m7, [tmpq+15*16]
    mova      [tmpq+ 1*16], m15

    SUMSUB_BA            w,  6,  9, 15      ; t6, t9
    SUMSUB_BA            w,  7,  8, 15      ; t7, t8

    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, 15
    mova        [tmpq+  0], m0
    mova        [tmpq+ 32], m1
    mova        [tmpq+ 64], m2
    mova        [tmpq+ 96], m3
    mova        [tmpq+128], m4
    mova        [tmpq+160], m5
    mova        [tmpq+192], m6
    mova        [tmpq+224], m7

    mova               m15, [tmpq+ 1*16]
    TRANSPOSE8x8W        8, 9, 10, 11, 12, 13, 14, 15, 0
    mova        [tmpq+ 16], m8
    mova        [tmpq+ 48], m9
    mova        [tmpq+ 80], m10
    mova        [tmpq+112], m11
    mova        [tmpq+144], m12
    mova        [tmpq+176], m13
    mova        [tmpq+208], m14
    mova        [tmpq+240], m15
%else
    mova                m6, [tmpq+13*16]
    mova                m7, [tmpq+14*16]
    SUMSUB_BA            w, 6, 7                ; t6, t9
    mova      [tmpq+14*16], m6
    mova      [tmpq+13*16], m7
    mova                m7, [tmpq+15*16]
    mova                m6, [tmpq+12*16]
    SUMSUB_BA            w, 7, 6                ; t7, t8
    mova      [tmpq+15*16], m6

    ; NOTE(review): m4 is not stored after either transpose below; presumably
    ; TRANSPOSE8x8W writes that row through its second memory operand - confirm
    ; against x86util.asm
    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, [tmpq+14*16], [tmpq+ 8*16], 1
    mova      [tmpq+ 0*16], m0
    mova      [tmpq+ 2*16], m1
    mova      [tmpq+ 4*16], m2
    mova      [tmpq+ 6*16], m3
    mova      [tmpq+10*16], m5
    mova      [tmpq+12*16], m6
    mova      [tmpq+14*16], m7

    mova                m0, [tmpq+15*16]
    mova                m1, [tmpq+13*16]
    mova                m2, [tmpq+11*16]
    mova                m3, [tmpq+ 9*16]
    mova                m4, [tmpq+ 7*16]
    mova                m5, [tmpq+ 5*16]
    mova                m7, [tmpq+ 1*16]
    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, [tmpq+ 3*16], [tmpq+ 9*16], 1
    mova      [tmpq+ 1*16], m0
    mova      [tmpq+ 3*16], m1
    mova      [tmpq+ 5*16], m2
    mova      [tmpq+ 7*16], m3
    mova      [tmpq+11*16], m5
    mova      [tmpq+13*16], m6
    mova      [tmpq+15*16], m7
%endif
%else ; %2 == 2
    VP9_IDCT16_1D_START %1, %3, 32, %1, 32, %4

%if cpuflag(ssse3)
%define ROUND_REG [pw_512]
%else
%define ROUND_REG [pw_32]
%endif

    pxor                m7, m7
%if ARCH_X86_64
    ; backup more registers
    mova        [%1+ 2*32], m8
    mova        [%1+ 3*32], m9

    VP9_IDCT8_WRITEx2    0,  1, 8, 9, 7, ROUND_REG, 6
    lea               dstq, [dstq+strideq*2]
    VP9_IDCT8_WRITEx2    2,  3, 8, 9, 7, ROUND_REG, 6
    lea               dstq, [dstq+strideq*2]
    VP9_IDCT8_WRITEx2    4,  5, 8, 9, 7, ROUND_REG, 6
    lea               dstq, [dstq+strideq*2]

    ; restore from cache
    SWAP                 0, 7               ; move zero from m7 to m0
    mova                m7, [%1+15*32]
    mova                m8, [%1+ 2*32]
    mova                m9, [%1+ 3*32]

    SUMSUB_BA            w,  6,  9, 3       ; t6, t9
    SUMSUB_BA            w,  7,  8, 3       ; t7, t8

    VP9_IDCT8_WRITEx2    6,  7, 3, 4, 0, ROUND_REG, 6
    lea               dstq, [dstq+strideq*2]
    VP9_IDCT8_WRITEx2    8,  9, 3, 4, 0, ROUND_REG, 6
    lea               dstq, [dstq+strideq*2]
    VP9_IDCT8_WRITEx2   10, 11, 1, 2, 0, ROUND_REG, 6
    lea               dstq, [dstq+strideq*2]
    VP9_IDCT8_WRITEx2   12, 13, 1, 2, 0, ROUND_REG, 6
    lea               dstq, [dstq+strideq*2]
    VP9_IDCT8_WRITEx2   14, 15, 1, 2, 0, ROUND_REG, 6
%else
    ; The scratch area for this pass is the one handed to
    ; VP9_IDCT16_1D_START above, i.e. %1 with a 32-byte pitch, so every spill
    ; and reload below is addressed through %1 (identical expansion when the
    ; caller passes tmpq as src).
    mova        [%1+ 0*32], m5

    VP9_IDCT8_WRITEx2    0,  1, 5, 6, 7, ROUND_REG, 6
    lea               dstq, [dstq+strideq*2]
    VP9_IDCT8_WRITEx2    2,  3, 5, 6, 7, ROUND_REG, 6
    lea               dstq, [dstq+strideq*2]

    SWAP                 0, 7               ; move zero from m7 to m0
    mova                m5, [%1+ 0*32]

    VP9_IDCT8_WRITEx2    4,  5, 1, 2, 0, ROUND_REG, 6
    lea               dstq, [dstq+strideq*2]

    mova                m4, [%1+13*32]
    mova                m7, [%1+14*32]
    mova                m5, [%1+15*32]
    mova                m6, [%1+12*32]
    SUMSUB_BADC w, 4, 7, 5, 6, 1

    VP9_IDCT8_WRITEx2    4,  5, 1, 2, 0, ROUND_REG, 6
    lea               dstq, [dstq+strideq*2]
    VP9_IDCT8_WRITEx2    6,  7, 1, 2, 0, ROUND_REG, 6
    lea               dstq, [dstq+strideq*2]

    mova                m4, [%1+11*32]
    mova                m5, [%1+ 9*32]
    mova                m6, [%1+ 7*32]
    mova                m7, [%1+ 5*32]

    VP9_IDCT8_WRITEx2    4,  5, 1, 2, 0, ROUND_REG, 6
    lea               dstq, [dstq+strideq*2]
    VP9_IDCT8_WRITEx2    6,  7, 1, 2, 0, ROUND_REG, 6
    lea               dstq, [dstq+strideq*2]

    mova                m4, [%1+ 3*32]
    mova                m5, [%1+ 1*32]

    VP9_IDCT8_WRITEx2    4,  5, 1, 2, 0, ROUND_REG, 6
    lea               dstq, [dstq+strideq*2]
%endif

%undef ROUND_REG
%endif ; %2 == 1/2
%endmacro

; VP9_STORE_2XFULL: add the word vector m%1 to two full rows of pixels.
; Loads the rows at [dstq] and [dstq+%7], widens their bytes to words by
; interleaving with m%6 (expected to be zero), adds m%1 to all four halves,
; packs back to bytes with unsigned saturation and stores both rows.
;   %1      register holding the value to add
;   %2-%5   temporaries, clobbered
;   %6      zero register
;   %7      offset of the second row (default strideq)
%macro VP9_STORE_2XFULL 6-7 strideq; dc, tmp1, tmp2, tmp3, tmp4, zero, stride
    mova               m%3, [dstq]
    mova               m%5, [dstq+%7]
    punpcklbw          m%2, m%3, m%6
    punpckhbw          m%3, m%6
    punpcklbw          m%4, m%5, m%6
    punpckhbw          m%5, m%6
    paddw              m%2, m%1
    paddw              m%3, m%1
    paddw              m%4, m%1
    paddw              m%5, m%1
    packuswb           m%2, m%3
    packuswb           m%4, m%5
    mova            [dstq], m%2
    mova         [dstq+%7], m%4
%endmacro

%macro VP9_IDCT_IDCT_16x16_ADD_XMM 1
INIT_XMM %1
cglobal vp9_idct_idct_16x16_add, 4, 6, 16, 512, dst, stride, block, eob
%if cpuflag(ssse3)
    ; 2x2=eob=3, 4x4=eob=10
    cmp eobd, 38
    jg .idctfull
    cmp eobd, 1 ; faster path for when only DC is set
    jne .idct8x8
%else
    cmp eobd, 1 ; faster path for when only DC is set
    jg .idctfull
%endif

    ;
dc-only 1360cabdff1aSopenharmony_ci%if cpuflag(ssse3) 1361cabdff1aSopenharmony_ci movd m0, [blockq] 1362cabdff1aSopenharmony_ci mova m1, [pw_11585x2] 1363cabdff1aSopenharmony_ci pmulhrsw m0, m1 1364cabdff1aSopenharmony_ci pmulhrsw m0, m1 1365cabdff1aSopenharmony_ci%else 1366cabdff1aSopenharmony_ci DEFINE_ARGS dst, stride, block, coef 1367cabdff1aSopenharmony_ci movsx coefd, word [blockq] 1368cabdff1aSopenharmony_ci imul coefd, 11585 1369cabdff1aSopenharmony_ci add coefd, 8192 1370cabdff1aSopenharmony_ci sar coefd, 14 1371cabdff1aSopenharmony_ci imul coefd, 11585 1372cabdff1aSopenharmony_ci add coefd, (32 << 14) + 8192 1373cabdff1aSopenharmony_ci sar coefd, 14 + 6 1374cabdff1aSopenharmony_ci movd m0, coefd 1375cabdff1aSopenharmony_ci%endif 1376cabdff1aSopenharmony_ci SPLATW m0, m0, q0000 1377cabdff1aSopenharmony_ci%if cpuflag(ssse3) 1378cabdff1aSopenharmony_ci pmulhrsw m0, [pw_512] 1379cabdff1aSopenharmony_ci%endif 1380cabdff1aSopenharmony_ci pxor m5, m5 1381cabdff1aSopenharmony_ci movd [blockq], m5 1382cabdff1aSopenharmony_ci%rep 7 1383cabdff1aSopenharmony_ci VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5 1384cabdff1aSopenharmony_ci lea dstq, [dstq+2*strideq] 1385cabdff1aSopenharmony_ci%endrep 1386cabdff1aSopenharmony_ci VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5 1387cabdff1aSopenharmony_ci RET 1388cabdff1aSopenharmony_ci 1389cabdff1aSopenharmony_ci DEFINE_ARGS dst, stride, block, cnt, dst_bak, tmp 1390cabdff1aSopenharmony_ci%if cpuflag(ssse3) 1391cabdff1aSopenharmony_ci.idct8x8: 1392cabdff1aSopenharmony_ci mov tmpq, rsp 1393cabdff1aSopenharmony_ci VP9_IDCT16_1D blockq, 1, 8, 0 1394cabdff1aSopenharmony_ci 1395cabdff1aSopenharmony_ci mov cntd, 2 1396cabdff1aSopenharmony_ci mov dst_bakq, dstq 1397cabdff1aSopenharmony_ci.loop2_8x8: 1398cabdff1aSopenharmony_ci VP9_IDCT16_1D tmpq, 2, 8, 0 1399cabdff1aSopenharmony_ci lea dstq, [dst_bakq+8] 1400cabdff1aSopenharmony_ci add tmpq, 16 1401cabdff1aSopenharmony_ci dec cntd 1402cabdff1aSopenharmony_ci jg .loop2_8x8 1403cabdff1aSopenharmony_ci 
1404cabdff1aSopenharmony_ci ; at the end of the loop, m0 should still be zero 1405cabdff1aSopenharmony_ci ; use that to zero out block coefficients 1406cabdff1aSopenharmony_ci ZERO_BLOCK blockq, 32, 8, m0 1407cabdff1aSopenharmony_ci RET 1408cabdff1aSopenharmony_ci%endif 1409cabdff1aSopenharmony_ci 1410cabdff1aSopenharmony_ci.idctfull: 1411cabdff1aSopenharmony_ci mov cntd, 2 1412cabdff1aSopenharmony_ci mov tmpq, rsp 1413cabdff1aSopenharmony_ci.loop1_full: 1414cabdff1aSopenharmony_ci VP9_IDCT16_1D blockq, 1, 16, 0 1415cabdff1aSopenharmony_ci add blockq, 16 1416cabdff1aSopenharmony_ci add tmpq, 256 1417cabdff1aSopenharmony_ci dec cntd 1418cabdff1aSopenharmony_ci jg .loop1_full 1419cabdff1aSopenharmony_ci sub blockq, 32 1420cabdff1aSopenharmony_ci 1421cabdff1aSopenharmony_ci mov cntd, 2 1422cabdff1aSopenharmony_ci mov tmpq, rsp 1423cabdff1aSopenharmony_ci mov dst_bakq, dstq 1424cabdff1aSopenharmony_ci.loop2_full: 1425cabdff1aSopenharmony_ci VP9_IDCT16_1D tmpq, 2, 16, 0 1426cabdff1aSopenharmony_ci lea dstq, [dst_bakq+8] 1427cabdff1aSopenharmony_ci add tmpq, 16 1428cabdff1aSopenharmony_ci dec cntd 1429cabdff1aSopenharmony_ci jg .loop2_full 1430cabdff1aSopenharmony_ci 1431cabdff1aSopenharmony_ci ; at the end of the loop, m0 should still be zero 1432cabdff1aSopenharmony_ci ; use that to zero out block coefficients 1433cabdff1aSopenharmony_ci ZERO_BLOCK blockq, 32, 16, m0 1434cabdff1aSopenharmony_ci RET 1435cabdff1aSopenharmony_ci%endmacro 1436cabdff1aSopenharmony_ci 1437cabdff1aSopenharmony_ciVP9_IDCT_IDCT_16x16_ADD_XMM sse2 1438cabdff1aSopenharmony_ciVP9_IDCT_IDCT_16x16_ADD_XMM ssse3 1439cabdff1aSopenharmony_ciVP9_IDCT_IDCT_16x16_ADD_XMM avx 1440cabdff1aSopenharmony_ci 1441cabdff1aSopenharmony_ci%macro VP9_IDCT16_YMM_1D 0 1442cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 1, 15, 16305, 1606, [pd_8192], 0, 4 ; t8, t15 1443cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 9, 7, 10394, 12665, [pd_8192], 0, 4 ; t9, t14 1444cabdff1aSopenharmony_ci 1445cabdff1aSopenharmony_ci 
SUMSUB_BA w, 9, 1, 0 ; t8, t9 1446cabdff1aSopenharmony_ci SUMSUB_BA w, 7, 15, 0 ; t15, t14 1447cabdff1aSopenharmony_ci 1448cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 15, 1, 15137, 6270, [pd_8192], 0, 4 ; t9, t14 1449cabdff1aSopenharmony_ci 1450cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 5, 11, 14449, 7723, [pd_8192], 0, 4 ; t10, t13 1451cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 13, 3, 4756, 15679, [pd_8192], 0, 4 ; t11, t12 1452cabdff1aSopenharmony_ci 1453cabdff1aSopenharmony_ci SUMSUB_BA w, 5, 13, 0 ; t11, t10 1454cabdff1aSopenharmony_ci SUMSUB_BA w, 11, 3, 0 ; t12, t13 1455cabdff1aSopenharmony_ci 1456cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 3, 13, 6270, m15137, [pd_8192], 0, 4 ; t10, t13 1457cabdff1aSopenharmony_ci 1458cabdff1aSopenharmony_ci SUMSUB_BA w, 5, 9, 0 ; t8, t11 1459cabdff1aSopenharmony_ci SUMSUB_BA w, 3, 15, 0 ; t9, t10 1460cabdff1aSopenharmony_ci SUMSUB_BA w, 11, 7, 0 ; t15, t12 1461cabdff1aSopenharmony_ci SUMSUB_BA w, 13, 1, 0 ; t14, t13 1462cabdff1aSopenharmony_ci 1463cabdff1aSopenharmony_ci SUMSUB_BA w, 15, 1, 0 1464cabdff1aSopenharmony_ci SUMSUB_BA w, 9, 7, 0 1465cabdff1aSopenharmony_ci pmulhrsw m1, [pw_11585x2] ; t10 1466cabdff1aSopenharmony_ci pmulhrsw m7, [pw_11585x2] ; t11 1467cabdff1aSopenharmony_ci pmulhrsw m9, [pw_11585x2] ; t12 1468cabdff1aSopenharmony_ci pmulhrsw m15, [pw_11585x2] ; t13 1469cabdff1aSopenharmony_ci 1470cabdff1aSopenharmony_ci ; even (tx8x8) 1471cabdff1aSopenharmony_ci mova m4, [blockq+128] 1472cabdff1aSopenharmony_ci mova [blockq+128], m5 1473cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 4, 12, 15137, 6270, [pd_8192], 0, 5 ; t2, t3 1474cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 2, 14, 16069, 3196, [pd_8192], 0, 5 ; t4, t7 1475cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 10, 6, 9102, 13623, [pd_8192], 0, 5 ; t5, t6 1476cabdff1aSopenharmony_ci mova m0, [blockq+ 0] 1477cabdff1aSopenharmony_ci SUMSUB_BA w, 8, 0, 5 1478cabdff1aSopenharmony_ci pmulhrsw m8, [pw_11585x2] ; t0 
1479cabdff1aSopenharmony_ci pmulhrsw m0, [pw_11585x2] ; t1 1480cabdff1aSopenharmony_ci 1481cabdff1aSopenharmony_ci SUMSUB_BA w, 10, 2, 5 ; t4, t5 1482cabdff1aSopenharmony_ci SUMSUB_BA w, 6, 14, 5 ; t7, t6 1483cabdff1aSopenharmony_ci SUMSUB_BA w, 12, 8, 5 ; t0, t3 1484cabdff1aSopenharmony_ci SUMSUB_BA w, 4, 0, 5 ; t1, t2 1485cabdff1aSopenharmony_ci 1486cabdff1aSopenharmony_ci SUMSUB_BA w, 2, 14, 5 1487cabdff1aSopenharmony_ci pmulhrsw m14, [pw_11585x2] ; t5 1488cabdff1aSopenharmony_ci pmulhrsw m2, [pw_11585x2] ; t6 1489cabdff1aSopenharmony_ci 1490cabdff1aSopenharmony_ci SUMSUB_BA w, 6, 12, 5 ; t0, t7 1491cabdff1aSopenharmony_ci SUMSUB_BA w, 2, 4, 5 ; t1, t6 1492cabdff1aSopenharmony_ci SUMSUB_BA w, 14, 0, 5 ; t2, t5 1493cabdff1aSopenharmony_ci SUMSUB_BA w, 10, 8, 5 ; t3, t4 1494cabdff1aSopenharmony_ci 1495cabdff1aSopenharmony_ci ; final stage 1496cabdff1aSopenharmony_ci SUMSUB_BA w, 11, 6, 5 ; out0, out15 1497cabdff1aSopenharmony_ci SUMSUB_BA w, 13, 2, 5 ; out1, out14 1498cabdff1aSopenharmony_ci SUMSUB_BA w, 15, 14, 5 ; out2, out13 1499cabdff1aSopenharmony_ci SUMSUB_BA w, 9, 10, 5 ; out3, out12 1500cabdff1aSopenharmony_ci SUMSUB_BA w, 7, 8, 5 ; out4, out11 1501cabdff1aSopenharmony_ci SUMSUB_BA w, 1, 0, 5 ; out5, out10 1502cabdff1aSopenharmony_ci SUMSUB_BA w, 3, 4, 5 ; out6, out9 1503cabdff1aSopenharmony_ci mova m5, [blockq+128] 1504cabdff1aSopenharmony_ci mova [blockq+192], m3 1505cabdff1aSopenharmony_ci SUMSUB_BA w, 5, 12, 3 ; out7, out8 1506cabdff1aSopenharmony_ci 1507cabdff1aSopenharmony_ci SWAP 0, 11, 8, 12, 10 1508cabdff1aSopenharmony_ci SWAP 1, 13, 14, 2, 15, 6, 3, 9, 4, 7, 5 1509cabdff1aSopenharmony_ci%endmacro 1510cabdff1aSopenharmony_ci 1511cabdff1aSopenharmony_ci; this is almost identical to VP9_STORE_2X, but it does two rows 1512cabdff1aSopenharmony_ci; for slightly improved interleaving, and it omits vpermq since the 1513cabdff1aSopenharmony_ci; input is DC so all values are identical 1514cabdff1aSopenharmony_ci%macro VP9_STORE_YMM_DC_4X 6 ; reg, tmp1, 
tmp2, tmp3, tmp4, zero 1515cabdff1aSopenharmony_ci mova xm%2, [dstq] 1516cabdff1aSopenharmony_ci mova xm%4, [dstq+strideq*2] 1517cabdff1aSopenharmony_ci vinserti128 m%2, m%2, [dstq+strideq], 1 1518cabdff1aSopenharmony_ci vinserti128 m%4, m%4, [dstq+stride3q], 1 1519cabdff1aSopenharmony_ci punpckhbw m%3, m%2, m%6 1520cabdff1aSopenharmony_ci punpcklbw m%2, m%6 1521cabdff1aSopenharmony_ci punpckhbw m%5, m%4, m%6 1522cabdff1aSopenharmony_ci punpcklbw m%4, m%6 1523cabdff1aSopenharmony_ci paddw m%3, m%1 1524cabdff1aSopenharmony_ci paddw m%2, m%1 1525cabdff1aSopenharmony_ci paddw m%5, m%1 1526cabdff1aSopenharmony_ci paddw m%4, m%1 1527cabdff1aSopenharmony_ci packuswb m%2, m%3 1528cabdff1aSopenharmony_ci packuswb m%4, m%5 1529cabdff1aSopenharmony_ci mova [dstq], xm%2 1530cabdff1aSopenharmony_ci mova [dstq+strideq*2], xm%4 1531cabdff1aSopenharmony_ci vextracti128 [dstq+strideq], m%2, 1 1532cabdff1aSopenharmony_ci vextracti128 [dstq+stride3q], m%4, 1 1533cabdff1aSopenharmony_ci%endmacro 1534cabdff1aSopenharmony_ci 1535cabdff1aSopenharmony_ci%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL 1536cabdff1aSopenharmony_ciINIT_YMM avx2 1537cabdff1aSopenharmony_cicglobal vp9_idct_idct_16x16_add, 4, 4, 16, dst, stride, block, eob 1538cabdff1aSopenharmony_ci cmp eobd, 1 ; faster path for when only DC is set 1539cabdff1aSopenharmony_ci jg .idctfull 1540cabdff1aSopenharmony_ci 1541cabdff1aSopenharmony_ci ; dc-only 1542cabdff1aSopenharmony_ci mova m1, [pw_11585x2] 1543cabdff1aSopenharmony_ci vpbroadcastw m0, [blockq] 1544cabdff1aSopenharmony_ci pmulhrsw m0, m1 1545cabdff1aSopenharmony_ci pmulhrsw m0, m1 1546cabdff1aSopenharmony_ci pxor m5, m5 1547cabdff1aSopenharmony_ci pmulhrsw m0, [pw_512] 1548cabdff1aSopenharmony_ci movd [blockq], xm5 1549cabdff1aSopenharmony_ci 1550cabdff1aSopenharmony_ci DEFINE_ARGS dst, stride, stride3, cnt 1551cabdff1aSopenharmony_ci mov cntd, 4 1552cabdff1aSopenharmony_ci lea stride3q, [strideq*3] 1553cabdff1aSopenharmony_ci.loop_dc: 1554cabdff1aSopenharmony_ci 
VP9_STORE_YMM_DC_4X 0, 1, 2, 3, 4, 5 1555cabdff1aSopenharmony_ci lea dstq, [dstq+4*strideq] 1556cabdff1aSopenharmony_ci dec cntd 1557cabdff1aSopenharmony_ci jg .loop_dc 1558cabdff1aSopenharmony_ci RET 1559cabdff1aSopenharmony_ci 1560cabdff1aSopenharmony_ci DEFINE_ARGS dst, stride, block, eob 1561cabdff1aSopenharmony_ci.idctfull: 1562cabdff1aSopenharmony_ci mova m1, [blockq+ 32] 1563cabdff1aSopenharmony_ci mova m2, [blockq+ 64] 1564cabdff1aSopenharmony_ci mova m3, [blockq+ 96] 1565cabdff1aSopenharmony_ci mova m5, [blockq+160] 1566cabdff1aSopenharmony_ci mova m6, [blockq+192] 1567cabdff1aSopenharmony_ci mova m7, [blockq+224] 1568cabdff1aSopenharmony_ci mova m8, [blockq+256] 1569cabdff1aSopenharmony_ci mova m9, [blockq+288] 1570cabdff1aSopenharmony_ci mova m10, [blockq+320] 1571cabdff1aSopenharmony_ci mova m11, [blockq+352] 1572cabdff1aSopenharmony_ci mova m12, [blockq+384] 1573cabdff1aSopenharmony_ci mova m13, [blockq+416] 1574cabdff1aSopenharmony_ci mova m14, [blockq+448] 1575cabdff1aSopenharmony_ci mova m15, [blockq+480] 1576cabdff1aSopenharmony_ci 1577cabdff1aSopenharmony_ci VP9_IDCT16_YMM_1D 1578cabdff1aSopenharmony_ci TRANSPOSE16x16W 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \ 1579cabdff1aSopenharmony_ci [blockq+192], [blockq+128], 1 1580cabdff1aSopenharmony_ci mova [blockq+ 0], m0 1581cabdff1aSopenharmony_ci VP9_IDCT16_YMM_1D 1582cabdff1aSopenharmony_ci 1583cabdff1aSopenharmony_ci mova [blockq+224], m7 1584cabdff1aSopenharmony_ci 1585cabdff1aSopenharmony_ci ; store 1586cabdff1aSopenharmony_ci VP9_IDCT8_WRITEx2 0, 1, 6, 7, unused, [pw_512], 6 1587cabdff1aSopenharmony_ci lea dstq, [dstq+2*strideq] 1588cabdff1aSopenharmony_ci VP9_IDCT8_WRITEx2 2, 3, 6, 7, unused, [pw_512], 6 1589cabdff1aSopenharmony_ci lea dstq, [dstq+2*strideq] 1590cabdff1aSopenharmony_ci VP9_IDCT8_WRITEx2 4, 5, 6, 7, unused, [pw_512], 6 1591cabdff1aSopenharmony_ci lea dstq, [dstq+2*strideq] 1592cabdff1aSopenharmony_ci mova m6, [blockq+192] 1593cabdff1aSopenharmony_ci mova m7, 
[blockq+224] 1594cabdff1aSopenharmony_ci VP9_IDCT8_WRITEx2 6, 7, 1, 2, unused, [pw_512], 6 1595cabdff1aSopenharmony_ci lea dstq, [dstq+2*strideq] 1596cabdff1aSopenharmony_ci VP9_IDCT8_WRITEx2 8, 9, 1, 2, unused, [pw_512], 6 1597cabdff1aSopenharmony_ci lea dstq, [dstq+2*strideq] 1598cabdff1aSopenharmony_ci VP9_IDCT8_WRITEx2 10, 11, 1, 2, unused, [pw_512], 6 1599cabdff1aSopenharmony_ci lea dstq, [dstq+2*strideq] 1600cabdff1aSopenharmony_ci VP9_IDCT8_WRITEx2 12, 13, 1, 2, unused, [pw_512], 6 1601cabdff1aSopenharmony_ci lea dstq, [dstq+2*strideq] 1602cabdff1aSopenharmony_ci VP9_IDCT8_WRITEx2 14, 15, 1, 2, unused, [pw_512], 6 1603cabdff1aSopenharmony_ci lea dstq, [dstq+2*strideq] 1604cabdff1aSopenharmony_ci 1605cabdff1aSopenharmony_ci ; at the end of the loop, m0 should still be zero 1606cabdff1aSopenharmony_ci ; use that to zero out block coefficients 1607cabdff1aSopenharmony_ci pxor m0, m0 1608cabdff1aSopenharmony_ci ZERO_BLOCK blockq, 32, 16, m0 1609cabdff1aSopenharmony_ci RET 1610cabdff1aSopenharmony_ci%endif 1611cabdff1aSopenharmony_ci 1612cabdff1aSopenharmony_ci;--------------------------------------------------------------------------------------------- 1613cabdff1aSopenharmony_ci; void vp9_iadst_iadst_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); 1614cabdff1aSopenharmony_ci;--------------------------------------------------------------------------------------------- 1615cabdff1aSopenharmony_ci 1616cabdff1aSopenharmony_ci%macro VP9_IADST16_1D 2 ; src, pass 1617cabdff1aSopenharmony_ci%assign %%str 16*%2 1618cabdff1aSopenharmony_ci mova m0, [%1+ 0*32] ; in0 1619cabdff1aSopenharmony_ci mova m1, [%1+15*32] ; in15 1620cabdff1aSopenharmony_ci mova m2, [%1+ 7*32] ; in7 1621cabdff1aSopenharmony_ci mova m3, [%1+ 8*32] ; in8 1622cabdff1aSopenharmony_ci 1623cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2D_4X 1, 0, 4, 5, 16364, 804 ; m1/4=t1[d], m0/5=t0[d] 1624cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2D_4X 2, 3, 7, 6, 11003, 12140 ; m2/7=t9[d], 
m3/6=t8[d] 1625cabdff1aSopenharmony_ci SCRATCH 4, 8, tmpq+ 0*%%str 1626cabdff1aSopenharmony_ci VP9_RND_SH_SUMSUB_BA 3, 0, 6, 5, 4, [pd_8192] ; m3=t0[w], m0=t8[w] 1627cabdff1aSopenharmony_ci UNSCRATCH 4, 8, tmpq+ 0*%%str 1628cabdff1aSopenharmony_ci VP9_RND_SH_SUMSUB_BA 2, 1, 7, 4, 5, [pd_8192] ; m2=t1[w], m1=t9[w] 1629cabdff1aSopenharmony_ci 1630cabdff1aSopenharmony_ci SCRATCH 0, 10, tmpq+ 0*%%str 1631cabdff1aSopenharmony_ci SCRATCH 1, 11, tmpq+15*%%str 1632cabdff1aSopenharmony_ci mova [tmpq+ 7*%%str], m2 1633cabdff1aSopenharmony_ci mova [tmpq+ 8*%%str], m3 1634cabdff1aSopenharmony_ci 1635cabdff1aSopenharmony_ci mova m1, [%1+ 2*32] ; in2 1636cabdff1aSopenharmony_ci mova m0, [%1+13*32] ; in13 1637cabdff1aSopenharmony_ci mova m3, [%1+ 5*32] ; in5 1638cabdff1aSopenharmony_ci mova m2, [%1+10*32] ; in10 1639cabdff1aSopenharmony_ci 1640cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2D_4X 0, 1, 6, 7, 15893, 3981 ; m0/6=t3[d], m1/7=t2[d] 1641cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2D_4X 3, 2, 4, 5, 8423, 14053 ; m3/4=t11[d], m2/5=t10[d] 1642cabdff1aSopenharmony_ci SCRATCH 4, 12, tmpq+ 2*%%str 1643cabdff1aSopenharmony_ci VP9_RND_SH_SUMSUB_BA 2, 1, 5, 7, 4, [pd_8192] ; m2=t2[w], m1=t10[w] 1644cabdff1aSopenharmony_ci UNSCRATCH 4, 12, tmpq+ 2*%%str 1645cabdff1aSopenharmony_ci VP9_RND_SH_SUMSUB_BA 3, 0, 4, 6, 5, [pd_8192] ; m3=t3[w], m0=t11[w] 1646cabdff1aSopenharmony_ci 1647cabdff1aSopenharmony_ci SCRATCH 0, 12, tmpq+ 2*%%str 1648cabdff1aSopenharmony_ci SCRATCH 1, 13, tmpq+13*%%str 1649cabdff1aSopenharmony_ci mova [tmpq+ 5*%%str], m2 1650cabdff1aSopenharmony_ci mova [tmpq+10*%%str], m3 1651cabdff1aSopenharmony_ci 1652cabdff1aSopenharmony_ci mova m2, [%1+ 4*32] ; in4 1653cabdff1aSopenharmony_ci mova m3, [%1+11*32] ; in11 1654cabdff1aSopenharmony_ci mova m0, [%1+ 3*32] ; in3 1655cabdff1aSopenharmony_ci mova m1, [%1+12*32] ; in12 1656cabdff1aSopenharmony_ci 1657cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2D_4X 3, 2, 7, 6, 14811, 7005 ; m3/7=t5[d], m2/6=t4[d] 1658cabdff1aSopenharmony_ci 
VP9_UNPACK_MULSUB_2D_4X 0, 1, 4, 5, 5520, 15426 ; m0/4=t13[d], m1/5=t12[d] 1659cabdff1aSopenharmony_ci SCRATCH 4, 9, tmpq+ 4*%%str 1660cabdff1aSopenharmony_ci VP9_RND_SH_SUMSUB_BA 1, 2, 5, 6, 4, [pd_8192] ; m1=t4[w], m2=t12[w] 1661cabdff1aSopenharmony_ci UNSCRATCH 4, 9, tmpq+ 4*%%str 1662cabdff1aSopenharmony_ci VP9_RND_SH_SUMSUB_BA 0, 3, 4, 7, 6, [pd_8192] ; m0=t5[w], m3=t13[w] 1663cabdff1aSopenharmony_ci 1664cabdff1aSopenharmony_ci SCRATCH 0, 8, tmpq+ 4*%%str 1665cabdff1aSopenharmony_ci mova [tmpq+11*%%str], m1 ; t4:m1->r11 1666cabdff1aSopenharmony_ci UNSCRATCH 0, 10, tmpq+ 0*%%str 1667cabdff1aSopenharmony_ci UNSCRATCH 1, 11, tmpq+15*%%str 1668cabdff1aSopenharmony_ci 1669cabdff1aSopenharmony_ci ; round 2 interleaved part 1 1670cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2D_4X 0, 1, 6, 7, 16069, 3196 ; m1/7=t8[d], m0/6=t9[d] 1671cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2D_4X 3, 2, 5, 4, 3196, 16069 ; m3/5=t12[d], m2/4=t13[d] 1672cabdff1aSopenharmony_ci SCRATCH 4, 9, tmpq+ 3*%%str 1673cabdff1aSopenharmony_ci VP9_RND_SH_SUMSUB_BA 3, 1, 5, 7, 4, [pd_8192] ; m3=t8[w], m1=t12[w] 1674cabdff1aSopenharmony_ci UNSCRATCH 4, 9, tmpq+ 3*%%str 1675cabdff1aSopenharmony_ci VP9_RND_SH_SUMSUB_BA 2, 0, 4, 6, 5, [pd_8192] ; m2=t9[w], m0=t13[w] 1676cabdff1aSopenharmony_ci 1677cabdff1aSopenharmony_ci SCRATCH 0, 10, tmpq+ 0*%%str 1678cabdff1aSopenharmony_ci SCRATCH 1, 11, tmpq+15*%%str 1679cabdff1aSopenharmony_ci SCRATCH 2, 14, tmpq+ 3*%%str 1680cabdff1aSopenharmony_ci SCRATCH 3, 15, tmpq+12*%%str 1681cabdff1aSopenharmony_ci 1682cabdff1aSopenharmony_ci mova m2, [%1+ 6*32] ; in6 1683cabdff1aSopenharmony_ci mova m3, [%1+ 9*32] ; in9 1684cabdff1aSopenharmony_ci mova m0, [%1+ 1*32] ; in1 1685cabdff1aSopenharmony_ci mova m1, [%1+14*32] ; in14 1686cabdff1aSopenharmony_ci 1687cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2D_4X 3, 2, 7, 6, 13160, 9760 ; m3/7=t7[d], m2/6=t6[d] 1688cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2D_4X 0, 1, 4, 5, 2404, 16207 ; m0/4=t15[d], m1/5=t14[d] 
1689cabdff1aSopenharmony_ci SCRATCH 4, 9, tmpq+ 6*%%str 1690cabdff1aSopenharmony_ci VP9_RND_SH_SUMSUB_BA 1, 2, 5, 6, 4, [pd_8192] ; m1=t6[w], m2=t14[w] 1691cabdff1aSopenharmony_ci UNSCRATCH 4, 9, tmpq+ 6*%%str 1692cabdff1aSopenharmony_ci VP9_RND_SH_SUMSUB_BA 0, 3, 4, 7, 6, [pd_8192] ; m0=t7[w], m3=t15[w] 1693cabdff1aSopenharmony_ci 1694cabdff1aSopenharmony_ci ; r8=t0, r7=t1, r5=t2, r10=t3, r11=t4, m8|r4=t5, m1=t6, m0=t7 1695cabdff1aSopenharmony_ci ; m10|r0=t8, m11|r15=t9, m13|r13=t10, m12|r2=t11, m14|r3=t12, m15|r12=t13, m2=t14, m3=t15 1696cabdff1aSopenharmony_ci 1697cabdff1aSopenharmony_ci UNSCRATCH 4, 12, tmpq+ 2*%%str 1698cabdff1aSopenharmony_ci UNSCRATCH 5, 13, tmpq+13*%%str 1699cabdff1aSopenharmony_ci SCRATCH 0, 12, tmpq+ 1*%%str 1700cabdff1aSopenharmony_ci SCRATCH 1, 13, tmpq+14*%%str 1701cabdff1aSopenharmony_ci 1702cabdff1aSopenharmony_ci ; remainder of round 2 (rest of t8-15) 1703cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2D_4X 5, 4, 6, 7, 9102, 13623 ; m5/6=t11[d], m4/7=t10[d] 1704cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2D_4X 3, 2, 1, 0, 13623, 9102 ; m3/1=t14[d], m2/0=t15[d] 1705cabdff1aSopenharmony_ci SCRATCH 0, 9, tmpq+ 6*%%str 1706cabdff1aSopenharmony_ci VP9_RND_SH_SUMSUB_BA 3, 4, 1, 7, 0, [pd_8192] ; m3=t10[w], m4=t14[w] 1707cabdff1aSopenharmony_ci UNSCRATCH 0, 9, tmpq+ 6*%%str 1708cabdff1aSopenharmony_ci VP9_RND_SH_SUMSUB_BA 2, 5, 0, 6, 1, [pd_8192] ; m2=t11[w], m5=t15[w] 1709cabdff1aSopenharmony_ci 1710cabdff1aSopenharmony_ci ; m15|r12=t8, m14|r3=t9, m3=t10, m2=t11, m11|r15=t12, m10|r0=t13, m4=t14, m5=t15 1711cabdff1aSopenharmony_ci 1712cabdff1aSopenharmony_ci UNSCRATCH 6, 14, tmpq+ 3*%%str 1713cabdff1aSopenharmony_ci UNSCRATCH 7, 15, tmpq+12*%%str 1714cabdff1aSopenharmony_ci 1715cabdff1aSopenharmony_ci SUMSUB_BA w, 3, 7, 1 1716cabdff1aSopenharmony_ci PSIGNW m3, [pw_m1] ; m3=out1[w], m7=t10[w] 1717cabdff1aSopenharmony_ci SUMSUB_BA w, 2, 6, 1 ; m2=out14[w], m6=t11[w] 1718cabdff1aSopenharmony_ci 1719cabdff1aSopenharmony_ci ; unfortunately, the code 
below overflows in some cases, e.g. 1720cabdff1aSopenharmony_ci ; http://downloads.webmproject.org/test_data/libvpx/vp90-2-14-resize-fp-tiles-16-8.webm 1721cabdff1aSopenharmony_ci%if 0; cpuflag(ssse3) 1722cabdff1aSopenharmony_ci SUMSUB_BA w, 7, 6, 1 1723cabdff1aSopenharmony_ci pmulhrsw m7, [pw_11585x2] ; m7=out6[w] 1724cabdff1aSopenharmony_ci pmulhrsw m6, [pw_11585x2] ; m6=out9[w] 1725cabdff1aSopenharmony_ci%else 1726cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 6, 7, 11585, 11585, [pd_8192], 1, 0 1727cabdff1aSopenharmony_ci%endif 1728cabdff1aSopenharmony_ci 1729cabdff1aSopenharmony_ci mova [tmpq+ 3*%%str], m6 1730cabdff1aSopenharmony_ci mova [tmpq+ 6*%%str], m7 1731cabdff1aSopenharmony_ci UNSCRATCH 6, 10, tmpq+ 0*%%str 1732cabdff1aSopenharmony_ci UNSCRATCH 7, 11, tmpq+15*%%str 1733cabdff1aSopenharmony_ci mova [tmpq+13*%%str], m2 1734cabdff1aSopenharmony_ci SCRATCH 3, 11, tmpq+ 9*%%str 1735cabdff1aSopenharmony_ci 1736cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2D_4X 7, 6, 2, 3, 15137, 6270 ; m6/3=t13[d], m7/2=t12[d] 1737cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2D_4X 5, 4, 1, 0, 6270, 15137 ; m5/1=t14[d], m4/0=t15[d] 1738cabdff1aSopenharmony_ci SCRATCH 0, 9, tmpq+ 2*%%str 1739cabdff1aSopenharmony_ci VP9_RND_SH_SUMSUB_BA 5, 6, 1, 3, 0, [pd_8192] ; m5=out2[w], m6=t14[w] 1740cabdff1aSopenharmony_ci UNSCRATCH 0, 9, tmpq+ 2*%%str 1741cabdff1aSopenharmony_ci VP9_RND_SH_SUMSUB_BA 4, 7, 0, 2, 1, [pd_8192] 1742cabdff1aSopenharmony_ci PSIGNW m4, [pw_m1] ; m4=out13[w], m7=t15[w] 1743cabdff1aSopenharmony_ci 1744cabdff1aSopenharmony_ci ; unfortunately, the code below overflows in some cases 1745cabdff1aSopenharmony_ci%if 0; cpuflag(ssse3) 1746cabdff1aSopenharmony_ci SUMSUB_BA w, 7, 6, 1 1747cabdff1aSopenharmony_ci pmulhrsw m7, [pw_m11585x2] ; m7=out5[w] 1748cabdff1aSopenharmony_ci pmulhrsw m6, [pw_11585x2] ; m6=out10[w] 1749cabdff1aSopenharmony_ci%else 1750cabdff1aSopenharmony_ci PSIGNW m7, [pw_m1] 1751cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 7, 6, 11585, 11585, 
[pd_8192], 1, 0 1752cabdff1aSopenharmony_ci%endif 1753cabdff1aSopenharmony_ci 1754cabdff1aSopenharmony_ci ; m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, m6=out10, m4=out13, r2=out14 1755cabdff1aSopenharmony_ci 1756cabdff1aSopenharmony_ci mova m2, [tmpq+ 8*%%str] 1757cabdff1aSopenharmony_ci mova m3, [tmpq+ 7*%%str] 1758cabdff1aSopenharmony_ci mova m1, [tmpq+11*%%str] 1759cabdff1aSopenharmony_ci mova [tmpq+ 7*%%str], m6 1760cabdff1aSopenharmony_ci mova [tmpq+11*%%str], m4 1761cabdff1aSopenharmony_ci mova m4, [tmpq+ 5*%%str] 1762cabdff1aSopenharmony_ci SCRATCH 5, 14, tmpq+ 5*%%str 1763cabdff1aSopenharmony_ci SCRATCH 7, 15, tmpq+ 8*%%str 1764cabdff1aSopenharmony_ci UNSCRATCH 6, 8, tmpq+ 4*%%str 1765cabdff1aSopenharmony_ci UNSCRATCH 5, 12, tmpq+ 1*%%str 1766cabdff1aSopenharmony_ci UNSCRATCH 7, 13, tmpq+14*%%str 1767cabdff1aSopenharmony_ci 1768cabdff1aSopenharmony_ci ; m2=t0, m3=t1, m9=t2, m0=t3, m1=t4, m8=t5, m13=t6, m12=t7 1769cabdff1aSopenharmony_ci ; m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, r10=out10, r11=out13, r2=out14 1770cabdff1aSopenharmony_ci 1771cabdff1aSopenharmony_ci SUMSUB_BA w, 1, 2, 0 ; m1=t0[w], m2=t4[w] 1772cabdff1aSopenharmony_ci mova m0, [tmpq+10*%%str] 1773cabdff1aSopenharmony_ci SCRATCH 1, 12, tmpq+ 1*%%str 1774cabdff1aSopenharmony_ci SUMSUB_BA w, 6, 3, 1 ; m8=t1[w], m3=t5[w] 1775cabdff1aSopenharmony_ci SCRATCH 6, 13, tmpq+ 4*%%str 1776cabdff1aSopenharmony_ci SUMSUB_BA w, 7, 4, 1 ; m13=t2[w], m9=t6[w] 1777cabdff1aSopenharmony_ci SCRATCH 7, 8, tmpq+10*%%str 1778cabdff1aSopenharmony_ci SUMSUB_BA w, 5, 0, 1 ; m12=t3[w], m0=t7[w] 1779cabdff1aSopenharmony_ci SCRATCH 5, 9, tmpq+14*%%str 1780cabdff1aSopenharmony_ci 1781cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2D_4X 2, 3, 7, 5, 15137, 6270 ; m2/6=t5[d], m3/10=t4[d] 1782cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2D_4X 0, 4, 1, 6, 6270, 15137 ; m0/14=t6[d], m9/15=t7[d] 1783cabdff1aSopenharmony_ci SCRATCH 6, 10, tmpq+ 0*%%str 1784cabdff1aSopenharmony_ci VP9_RND_SH_SUMSUB_BA 0, 3, 1, 5, 6, 
[pd_8192] 1785cabdff1aSopenharmony_ci UNSCRATCH 6, 10, tmpq+ 0*%%str 1786cabdff1aSopenharmony_ci PSIGNW m0, [pw_m1] ; m0=out3[w], m3=t6[w] 1787cabdff1aSopenharmony_ci VP9_RND_SH_SUMSUB_BA 4, 2, 6, 7, 5, [pd_8192] ; m9=out12[w], m2=t7[w] 1788cabdff1aSopenharmony_ci 1789cabdff1aSopenharmony_ci UNSCRATCH 1, 8, tmpq+10*%%str 1790cabdff1aSopenharmony_ci UNSCRATCH 5, 9, tmpq+14*%%str 1791cabdff1aSopenharmony_ci UNSCRATCH 6, 12, tmpq+ 1*%%str 1792cabdff1aSopenharmony_ci UNSCRATCH 7, 13, tmpq+ 4*%%str 1793cabdff1aSopenharmony_ci SCRATCH 4, 9, tmpq+14*%%str 1794cabdff1aSopenharmony_ci 1795cabdff1aSopenharmony_ci SUMSUB_BA w, 1, 6, 4 ; m13=out0[w], m1=t2[w] 1796cabdff1aSopenharmony_ci SUMSUB_BA w, 5, 7, 4 1797cabdff1aSopenharmony_ci PSIGNW m5, [pw_m1] ; m12=out15[w], m8=t3[w] 1798cabdff1aSopenharmony_ci 1799cabdff1aSopenharmony_ci ; unfortunately, the code below overflows in some cases, e.g. 1800cabdff1aSopenharmony_ci ; http://downloads.webmproject.org/test_data/libvpx/vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm 1801cabdff1aSopenharmony_ci%if 0 ; cpuflag(ssse3) 1802cabdff1aSopenharmony_ci SUMSUB_BA w, 7, 6, 4 1803cabdff1aSopenharmony_ci pmulhrsw m7, [pw_m11585x2] ; m8=out7[w] 1804cabdff1aSopenharmony_ci pmulhrsw m6, [pw_11585x2] ; m1=out8[w] 1805cabdff1aSopenharmony_ci SWAP 6, 7 1806cabdff1aSopenharmony_ci SUMSUB_BA w, 3, 2, 4 1807cabdff1aSopenharmony_ci pmulhrsw m3, [pw_11585x2] ; m3=out4[w] 1808cabdff1aSopenharmony_ci pmulhrsw m2, [pw_11585x2] ; m2=out11[w] 1809cabdff1aSopenharmony_ci%else 1810cabdff1aSopenharmony_ci SCRATCH 5, 8, tmpq+10*%%str 1811cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 6, 7, 11585, m11585, [pd_8192], 5, 4 1812cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 2, 3, 11585, 11585, [pd_8192], 5, 4 1813cabdff1aSopenharmony_ci UNSCRATCH 5, 8, tmpq+10*%%str 1814cabdff1aSopenharmony_ci%endif 1815cabdff1aSopenharmony_ci 1816cabdff1aSopenharmony_ci ; m13=out0, m0=out3, m3=out4, m8=out7, m1=out8, m2=out11, m9=out12, m12=out15 1817cabdff1aSopenharmony_ci ; 
m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, r10=out10, r11=out13, r2=out14 1818cabdff1aSopenharmony_ci 1819cabdff1aSopenharmony_ci%if %2 == 1 1820cabdff1aSopenharmony_ci%if ARCH_X86_64 1821cabdff1aSopenharmony_ci mova m13, [tmpq+ 6*%%str] 1822cabdff1aSopenharmony_ci TRANSPOSE8x8W 1, 11, 14, 0, 3, 15, 13, 6, 10 1823cabdff1aSopenharmony_ci mova [tmpq+ 0*16], m1 1824cabdff1aSopenharmony_ci mova [tmpq+ 2*16], m11 1825cabdff1aSopenharmony_ci mova [tmpq+ 4*16], m14 1826cabdff1aSopenharmony_ci mova [tmpq+ 6*16], m0 1827cabdff1aSopenharmony_ci mova m1, [tmpq+ 3*%%str] 1828cabdff1aSopenharmony_ci mova m11, [tmpq+ 7*%%str] 1829cabdff1aSopenharmony_ci mova m14, [tmpq+11*%%str] 1830cabdff1aSopenharmony_ci mova m0, [tmpq+13*%%str] 1831cabdff1aSopenharmony_ci mova [tmpq+ 8*16], m3 1832cabdff1aSopenharmony_ci mova [tmpq+10*16], m15 1833cabdff1aSopenharmony_ci mova [tmpq+12*16], m13 1834cabdff1aSopenharmony_ci mova [tmpq+14*16], m6 1835cabdff1aSopenharmony_ci 1836cabdff1aSopenharmony_ci TRANSPOSE8x8W 7, 1, 11, 2, 9, 14, 0, 5, 10 1837cabdff1aSopenharmony_ci mova [tmpq+ 1*16], m7 1838cabdff1aSopenharmony_ci mova [tmpq+ 3*16], m1 1839cabdff1aSopenharmony_ci mova [tmpq+ 5*16], m11 1840cabdff1aSopenharmony_ci mova [tmpq+ 7*16], m2 1841cabdff1aSopenharmony_ci mova [tmpq+ 9*16], m9 1842cabdff1aSopenharmony_ci mova [tmpq+11*16], m14 1843cabdff1aSopenharmony_ci mova [tmpq+13*16], m0 1844cabdff1aSopenharmony_ci mova [tmpq+15*16], m5 1845cabdff1aSopenharmony_ci%else 1846cabdff1aSopenharmony_ci mova [tmpq+12*%%str], m2 1847cabdff1aSopenharmony_ci mova [tmpq+ 1*%%str], m5 1848cabdff1aSopenharmony_ci mova [tmpq+15*%%str], m7 1849cabdff1aSopenharmony_ci mova m2, [tmpq+ 9*%%str] 1850cabdff1aSopenharmony_ci mova m5, [tmpq+ 5*%%str] 1851cabdff1aSopenharmony_ci mova m7, [tmpq+ 8*%%str] 1852cabdff1aSopenharmony_ci TRANSPOSE8x8W 1, 2, 5, 0, 3, 7, 4, 6, [tmpq+ 6*%%str], [tmpq+ 8*%%str], 1 1853cabdff1aSopenharmony_ci mova [tmpq+ 0*16], m1 1854cabdff1aSopenharmony_ci mova [tmpq+ 2*16], m2 
1855cabdff1aSopenharmony_ci mova [tmpq+ 4*16], m5 1856cabdff1aSopenharmony_ci mova [tmpq+ 6*16], m0 1857cabdff1aSopenharmony_ci mova [tmpq+10*16], m7 1858cabdff1aSopenharmony_ci mova m3, [tmpq+12*%%str] 1859cabdff1aSopenharmony_ci mova [tmpq+12*16], m4 1860cabdff1aSopenharmony_ci mova m4, [tmpq+14*%%str] 1861cabdff1aSopenharmony_ci mova [tmpq+14*16], m6 1862cabdff1aSopenharmony_ci 1863cabdff1aSopenharmony_ci mova m0, [tmpq+15*%%str] 1864cabdff1aSopenharmony_ci mova m1, [tmpq+ 3*%%str] 1865cabdff1aSopenharmony_ci mova m2, [tmpq+ 7*%%str] 1866cabdff1aSopenharmony_ci mova m5, [tmpq+11*%%str] 1867cabdff1aSopenharmony_ci mova m7, [tmpq+ 1*%%str] 1868cabdff1aSopenharmony_ci TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [tmpq+13*%%str], [tmpq+ 9*%%str], 1 1869cabdff1aSopenharmony_ci mova [tmpq+ 1*16], m0 1870cabdff1aSopenharmony_ci mova [tmpq+ 3*16], m1 1871cabdff1aSopenharmony_ci mova [tmpq+ 5*16], m2 1872cabdff1aSopenharmony_ci mova [tmpq+ 7*16], m3 1873cabdff1aSopenharmony_ci mova [tmpq+11*16], m5 1874cabdff1aSopenharmony_ci mova [tmpq+13*16], m6 1875cabdff1aSopenharmony_ci mova [tmpq+15*16], m7 1876cabdff1aSopenharmony_ci%endif 1877cabdff1aSopenharmony_ci%else 1878cabdff1aSopenharmony_ci pxor m4, m4 1879cabdff1aSopenharmony_ci 1880cabdff1aSopenharmony_ci%if cpuflag(ssse3) 1881cabdff1aSopenharmony_ci%define ROUND_REG [pw_512] 1882cabdff1aSopenharmony_ci%else 1883cabdff1aSopenharmony_ci%define ROUND_REG [pw_32] 1884cabdff1aSopenharmony_ci%endif 1885cabdff1aSopenharmony_ci 1886cabdff1aSopenharmony_ci%if ARCH_X86_64 1887cabdff1aSopenharmony_ci mova m12, [tmpq+ 6*%%str] 1888cabdff1aSopenharmony_ci VP9_IDCT8_WRITEx2 1, 11, 10, 8, 4, ROUND_REG, 6 1889cabdff1aSopenharmony_ci lea dstq, [dstq+strideq*2] 1890cabdff1aSopenharmony_ci VP9_IDCT8_WRITEx2 14, 0, 10, 8, 4, ROUND_REG, 6 1891cabdff1aSopenharmony_ci lea dstq, [dstq+strideq*2] 1892cabdff1aSopenharmony_ci VP9_IDCT8_WRITEx2 3, 15, 10, 8, 4, ROUND_REG, 6 1893cabdff1aSopenharmony_ci lea dstq, [dstq+strideq*2] 
1894cabdff1aSopenharmony_ci VP9_IDCT8_WRITEx2 12, 6, 10, 8, 4, ROUND_REG, 6 1895cabdff1aSopenharmony_ci lea dstq, [dstq+strideq*2] 1896cabdff1aSopenharmony_ci 1897cabdff1aSopenharmony_ci mova m1, [tmpq+ 3*%%str] 1898cabdff1aSopenharmony_ci mova m11, [tmpq+ 7*%%str] 1899cabdff1aSopenharmony_ci mova m14, [tmpq+11*%%str] 1900cabdff1aSopenharmony_ci mova m0, [tmpq+13*%%str] 1901cabdff1aSopenharmony_ci 1902cabdff1aSopenharmony_ci VP9_IDCT8_WRITEx2 7, 1, 10, 8, 4, ROUND_REG, 6 1903cabdff1aSopenharmony_ci lea dstq, [dstq+strideq*2] 1904cabdff1aSopenharmony_ci VP9_IDCT8_WRITEx2 11, 2, 10, 8, 4, ROUND_REG, 6 1905cabdff1aSopenharmony_ci lea dstq, [dstq+strideq*2] 1906cabdff1aSopenharmony_ci VP9_IDCT8_WRITEx2 9, 14, 10, 8, 4, ROUND_REG, 6 1907cabdff1aSopenharmony_ci lea dstq, [dstq+strideq*2] 1908cabdff1aSopenharmony_ci VP9_IDCT8_WRITEx2 0, 5, 10, 8, 4, ROUND_REG, 6 1909cabdff1aSopenharmony_ci%else 1910cabdff1aSopenharmony_ci mova [tmpq+ 0*%%str], m2 1911cabdff1aSopenharmony_ci mova [tmpq+ 1*%%str], m5 1912cabdff1aSopenharmony_ci mova [tmpq+ 2*%%str], m7 1913cabdff1aSopenharmony_ci mova m2, [tmpq+ 9*%%str] 1914cabdff1aSopenharmony_ci VP9_IDCT8_WRITEx2 1, 2, 5, 7, 4, ROUND_REG, 6 1915cabdff1aSopenharmony_ci lea dstq, [dstq+strideq*2] 1916cabdff1aSopenharmony_ci mova m5, [tmpq+ 5*%%str] 1917cabdff1aSopenharmony_ci VP9_IDCT8_WRITEx2 5, 0, 1, 2, 4, ROUND_REG, 6 1918cabdff1aSopenharmony_ci lea dstq, [dstq+strideq*2] 1919cabdff1aSopenharmony_ci mova m5, [tmpq+ 8*%%str] 1920cabdff1aSopenharmony_ci VP9_IDCT8_WRITEx2 3, 5, 1, 2, 4, ROUND_REG, 6 1921cabdff1aSopenharmony_ci lea dstq, [dstq+strideq*2] 1922cabdff1aSopenharmony_ci mova m5, [tmpq+ 6*%%str] 1923cabdff1aSopenharmony_ci VP9_IDCT8_WRITEx2 5, 6, 1, 2, 4, ROUND_REG, 6 1924cabdff1aSopenharmony_ci lea dstq, [dstq+strideq*2] 1925cabdff1aSopenharmony_ci 1926cabdff1aSopenharmony_ci mova m0, [tmpq+ 2*%%str] 1927cabdff1aSopenharmony_ci mova m3, [tmpq+ 3*%%str] 1928cabdff1aSopenharmony_ci VP9_IDCT8_WRITEx2 0, 3, 1, 2, 4, ROUND_REG, 6 
1929cabdff1aSopenharmony_ci lea dstq, [dstq+strideq*2] 1930cabdff1aSopenharmony_ci mova m0, [tmpq+ 7*%%str] 1931cabdff1aSopenharmony_ci mova m3, [tmpq+ 0*%%str] 1932cabdff1aSopenharmony_ci VP9_IDCT8_WRITEx2 0, 3, 1, 2, 4, ROUND_REG, 6 1933cabdff1aSopenharmony_ci lea dstq, [dstq+strideq*2] 1934cabdff1aSopenharmony_ci mova m0, [tmpq+14*%%str] 1935cabdff1aSopenharmony_ci mova m3, [tmpq+11*%%str] 1936cabdff1aSopenharmony_ci VP9_IDCT8_WRITEx2 0, 3, 1, 2, 4, ROUND_REG, 6 1937cabdff1aSopenharmony_ci lea dstq, [dstq+strideq*2] 1938cabdff1aSopenharmony_ci mova m0, [tmpq+13*%%str] 1939cabdff1aSopenharmony_ci mova m3, [tmpq+ 1*%%str] 1940cabdff1aSopenharmony_ci VP9_IDCT8_WRITEx2 0, 3, 1, 2, 4, ROUND_REG, 6 1941cabdff1aSopenharmony_ci%endif 1942cabdff1aSopenharmony_ci 1943cabdff1aSopenharmony_ci SWAP 0, 4 ; zero 1944cabdff1aSopenharmony_ci%undef ROUND_REG 1945cabdff1aSopenharmony_ci%endif 1946cabdff1aSopenharmony_ci%endmacro 1947cabdff1aSopenharmony_ci 1948cabdff1aSopenharmony_ci%macro IADST16_FN 5 1949cabdff1aSopenharmony_ciINIT_XMM %5 1950cabdff1aSopenharmony_cicglobal vp9_%1_%3_16x16_add, 3, 6, 16, 512, dst, stride, block, cnt, dst_bak, tmp 1951cabdff1aSopenharmony_ci mov cntd, 2 1952cabdff1aSopenharmony_ci mov tmpq, rsp 1953cabdff1aSopenharmony_ci.loop1_full: 1954cabdff1aSopenharmony_ci VP9_%2_1D blockq, 1 1955cabdff1aSopenharmony_ci add blockq, 16 1956cabdff1aSopenharmony_ci add tmpq, 256 1957cabdff1aSopenharmony_ci dec cntd 1958cabdff1aSopenharmony_ci jg .loop1_full 1959cabdff1aSopenharmony_ci sub blockq, 32 1960cabdff1aSopenharmony_ci 1961cabdff1aSopenharmony_ci mov cntd, 2 1962cabdff1aSopenharmony_ci mov tmpq, rsp 1963cabdff1aSopenharmony_ci mov dst_bakq, dstq 1964cabdff1aSopenharmony_ci.loop2_full: 1965cabdff1aSopenharmony_ci VP9_%4_1D tmpq, 2 1966cabdff1aSopenharmony_ci lea dstq, [dst_bakq+8] 1967cabdff1aSopenharmony_ci add tmpq, 16 1968cabdff1aSopenharmony_ci dec cntd 1969cabdff1aSopenharmony_ci jg .loop2_full 1970cabdff1aSopenharmony_ci 1971cabdff1aSopenharmony_ci 
; at the end of the loop, m0 should still be zero 1972cabdff1aSopenharmony_ci ; use that to zero out block coefficients 1973cabdff1aSopenharmony_ci ZERO_BLOCK blockq, 32, 16, m0 1974cabdff1aSopenharmony_ci RET 1975cabdff1aSopenharmony_ci%endmacro 1976cabdff1aSopenharmony_ci 1977cabdff1aSopenharmony_ciIADST16_FN idct, IDCT16, iadst, IADST16, sse2 1978cabdff1aSopenharmony_ciIADST16_FN iadst, IADST16, idct, IDCT16, sse2 1979cabdff1aSopenharmony_ciIADST16_FN iadst, IADST16, iadst, IADST16, sse2 1980cabdff1aSopenharmony_ciIADST16_FN idct, IDCT16, iadst, IADST16, ssse3 1981cabdff1aSopenharmony_ciIADST16_FN iadst, IADST16, idct, IDCT16, ssse3 1982cabdff1aSopenharmony_ciIADST16_FN iadst, IADST16, iadst, IADST16, ssse3 1983cabdff1aSopenharmony_ciIADST16_FN idct, IDCT16, iadst, IADST16, avx 1984cabdff1aSopenharmony_ciIADST16_FN iadst, IADST16, idct, IDCT16, avx 1985cabdff1aSopenharmony_ciIADST16_FN iadst, IADST16, iadst, IADST16, avx 1986cabdff1aSopenharmony_ci 1987cabdff1aSopenharmony_ci; in: data in m[0-15] except m0/m4, which are in [blockq+0] and [blockq+128] 1988cabdff1aSopenharmony_ci; out: m[0-15] except m6, which is in [blockq+192] 1989cabdff1aSopenharmony_ci; uses blockq as scratch space 1990cabdff1aSopenharmony_ci%macro VP9_IADST16_YMM_1D 0 1991cabdff1aSopenharmony_ci mova [blockq+ 32], m3 1992cabdff1aSopenharmony_ci mova [blockq+ 64], m7 1993cabdff1aSopenharmony_ci mova [blockq+ 96], m8 1994cabdff1aSopenharmony_ci 1995cabdff1aSopenharmony_ci ; first half of round 1 1996cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2D_4X 9, 6, 0, 3, 13160, 9760 ; m9/x=t7[d], m6/x=t6[d] 1997cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2D_4X 1, 14, 4, 7, 2404, 16207 ; m1/x=t15[d], m14/x=t14[d] 1998cabdff1aSopenharmony_ci VP9_RND_SH_SUMSUB_BA 14, 6, 7, 3, 8, [pd_8192] ; m14=t6[w], m6=t14[w] 1999cabdff1aSopenharmony_ci VP9_RND_SH_SUMSUB_BA 1, 9, 4, 0, 8, [pd_8192] ; m1=t7[w], m9=t15[w] 2000cabdff1aSopenharmony_ci 2001cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2D_4X 13, 2, 4, 7, 15893, 3981 ; 
m13/x=t3[d], m2/x=t2[d] 2002cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2D_4X 5, 10, 0, 3, 8423, 14053 ; m5/x=t11[d], m10/x=t10[d] 2003cabdff1aSopenharmony_ci VP9_RND_SH_SUMSUB_BA 10, 2, 3, 7, 8, [pd_8192] ; m10=t2[w], m2=t10[w] 2004cabdff1aSopenharmony_ci VP9_RND_SH_SUMSUB_BA 5, 13, 0, 4, 8, [pd_8192] ; m5=t3[w], m13=t11[w] 2005cabdff1aSopenharmony_ci 2006cabdff1aSopenharmony_ci ; half of round 2 t8-15 2007cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2D_4X 2, 13, 4, 7, 9102, 13623 ; m2/x=t11[d], m13/x=t10[d] 2008cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2D_4X 9, 6, 3, 0, 13623, 9102 ; m9/x=t14[d], m6/x=t15[d] 2009cabdff1aSopenharmony_ci VP9_RND_SH_SUMSUB_BA 9, 13, 3, 7, 8, [pd_8192] ; m9=t10[w], m13=t14[w] 2010cabdff1aSopenharmony_ci VP9_RND_SH_SUMSUB_BA 6, 2, 0, 4, 8, [pd_8192] ; m6=t11[w], m2=t15[w] 2011cabdff1aSopenharmony_ci 2012cabdff1aSopenharmony_ci SUMSUB_BA w, 14, 10, 8 ; m14=t2, m10=t6 2013cabdff1aSopenharmony_ci SUMSUB_BA w, 1, 5, 8 ; m1=t3, m5=t7 2014cabdff1aSopenharmony_ci 2015cabdff1aSopenharmony_ci mova m0, [blockq+ 0] 2016cabdff1aSopenharmony_ci mova m4, [blockq+128] 2017cabdff1aSopenharmony_ci mova m3, [blockq+ 32] 2018cabdff1aSopenharmony_ci mova m7, [blockq+ 64] 2019cabdff1aSopenharmony_ci mova m8, [blockq+ 96] 2020cabdff1aSopenharmony_ci mova [blockq+ 0], m1 2021cabdff1aSopenharmony_ci mova [blockq+128], m14 2022cabdff1aSopenharmony_ci mova [blockq+ 32], m6 2023cabdff1aSopenharmony_ci mova [blockq+ 64], m9 2024cabdff1aSopenharmony_ci mova [blockq+ 96], m10 2025cabdff1aSopenharmony_ci 2026cabdff1aSopenharmony_ci ; second half of round 1 2027cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2D_4X 15, 0, 1, 9, 16364, 804 ; m15/x=t1[d], m0/x=t0[d] 2028cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2D_4X 7, 8, 10, 6, 11003, 12140 ; m7/x=t9[d], m8/x=t8[d] 2029cabdff1aSopenharmony_ci VP9_RND_SH_SUMSUB_BA 8, 0, 6, 9, 14, [pd_8192] ; m8=t0[w], m0=t8[w] 2030cabdff1aSopenharmony_ci VP9_RND_SH_SUMSUB_BA 7, 15, 10, 1, 14, [pd_8192] ; m7=t1[w], m15=t9[w] 
2031cabdff1aSopenharmony_ci 2032cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2D_4X 11, 4, 10, 6, 14811, 7005 ; m11/x=t5[d], m4/x=t4[d] 2033cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2D_4X 3, 12, 1, 9, 5520, 15426 ; m3/x=t13[d], m12/x=t12[d] 2034cabdff1aSopenharmony_ci VP9_RND_SH_SUMSUB_BA 12, 4, 9, 6, 14, [pd_8192] ; m12=t4[w], m4=t12[w] 2035cabdff1aSopenharmony_ci VP9_RND_SH_SUMSUB_BA 3, 11, 1, 10, 14, [pd_8192] ; m3=t5[w], m11=t13[w] 2036cabdff1aSopenharmony_ci 2037cabdff1aSopenharmony_ci ; second half of round 2 t8-15 2038cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2D_4X 0, 15, 6, 10, 16069, 3196 ; m15/x=t8[d], m0/x=t9[d] 2039cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2D_4X 11, 4, 9, 1, 3196, 16069 ; m11/x=t12[d], m4/x=t13[d] 2040cabdff1aSopenharmony_ci VP9_RND_SH_SUMSUB_BA 11, 15, 9, 10, 14, [pd_8192] ; m11=t8[w], m15=t12[w] 2041cabdff1aSopenharmony_ci VP9_RND_SH_SUMSUB_BA 4, 0, 1, 6, 14, [pd_8192] ; m4=t9[w], m0=t13[w] 2042cabdff1aSopenharmony_ci 2043cabdff1aSopenharmony_ci SUMSUB_BA w, 12, 8, 14 ; m12=t0, m8=t4 2044cabdff1aSopenharmony_ci SUMSUB_BA w, 3, 7, 14 ; m3=t1, m7=t5 2045cabdff1aSopenharmony_ci 2046cabdff1aSopenharmony_ci mova m10, [blockq+ 96] 2047cabdff1aSopenharmony_ci mova [blockq+ 96], m12 2048cabdff1aSopenharmony_ci 2049cabdff1aSopenharmony_ci ; round 3 2050cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2D_4X 15, 0, 9, 12, 15137, 6270 ; m15/x=t13[d], m0/x=t12[d] 2051cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2D_4X 2, 13, 1, 6, 6270, 15137 ; m2/x=t14[d], m13/x=t15[d] 2052cabdff1aSopenharmony_ci VP9_RND_SH_SUMSUB_BA 2, 0, 1, 12, 14, [pd_8192] ; m2=out2[w], m0=t14a[w] 2053cabdff1aSopenharmony_ci VP9_RND_SH_SUMSUB_BA 13, 15, 6, 9, 14, [pd_8192] 2054cabdff1aSopenharmony_ci PSIGNW m13, [pw_m1] ; m13=out13[w], m15=t15a[w] 2055cabdff1aSopenharmony_ci 2056cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2D_4X 8, 7, 12, 9, 15137, 6270 ; m8/x=t5[d], m7/x=t4[d] 2057cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2D_4X 5, 10, 1, 6, 6270, 15137 ; m5/x=t6[d], m10/x=t7[d] 
2058cabdff1aSopenharmony_ci VP9_RND_SH_SUMSUB_BA 5, 7, 1, 9, 14, [pd_8192] 2059cabdff1aSopenharmony_ci PSIGNW m5, [pw_m1] ; m5=out3[w], m7=t6[w] 2060cabdff1aSopenharmony_ci VP9_RND_SH_SUMSUB_BA 10, 8, 6, 12, 14, [pd_8192] ; m10=out12[w], m8=t7[w] 2061cabdff1aSopenharmony_ci 2062cabdff1aSopenharmony_ci mova m1, [blockq+ 0] 2063cabdff1aSopenharmony_ci mova m14, [blockq+128] 2064cabdff1aSopenharmony_ci mova m6, [blockq+ 32] 2065cabdff1aSopenharmony_ci mova m9, [blockq+ 64] 2066cabdff1aSopenharmony_ci mova m12, [blockq+ 96] 2067cabdff1aSopenharmony_ci mova [blockq+ 0], m10 2068cabdff1aSopenharmony_ci mova [blockq+128], m5 2069cabdff1aSopenharmony_ci 2070cabdff1aSopenharmony_ci SUMSUB_BA w, 14, 12, 5 ; m14=out0, m12=t2a 2071cabdff1aSopenharmony_ci SUMSUB_BA w, 1, 3, 5 2072cabdff1aSopenharmony_ci PSIGNW m1, [pw_m1] ; m1=out15, m3=t3a 2073cabdff1aSopenharmony_ci 2074cabdff1aSopenharmony_ci SUMSUB_BA w, 9, 11, 5 2075cabdff1aSopenharmony_ci PSIGNW m9, [pw_m1] ; m9=out1, m11=t10 2076cabdff1aSopenharmony_ci SUMSUB_BA w, 6, 4, 5 ; m6=out14, m4=t11 2077cabdff1aSopenharmony_ci 2078cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 4, 11, 11585, 11585, [pd_8192], 5, 10 ; m4=out9, m11=out6 2079cabdff1aSopenharmony_ci mova m5, [blockq+128] 2080cabdff1aSopenharmony_ci mova [blockq+192], m11 2081cabdff1aSopenharmony_ci PSIGNW m15, [pw_m1] 2082cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 15, 0, 11585, 11585, [pd_8192], 10, 11 ; m15=out5, m0=out10 2083cabdff1aSopenharmony_ci 2084cabdff1aSopenharmony_ci PSIGNW m3, [pw_m1] 2085cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 3, 12, 11585, 11585, [pd_8192], 10, 11 ; m3=out7,m12=out8 2086cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 8, 7, 11585, 11585, [pd_8192], 10, 11 ; m8=out11,m7=out4 2087cabdff1aSopenharmony_ci 2088cabdff1aSopenharmony_ci mova m10, [blockq+ 0] 2089cabdff1aSopenharmony_ci 2090cabdff1aSopenharmony_ci SWAP 0, 14, 6, 11, 8, 12, 10 2091cabdff1aSopenharmony_ci SWAP 1, 9, 15, 4, 7, 3, 5 2092cabdff1aSopenharmony_ci SWAP 5, 
9, 15 2093cabdff1aSopenharmony_ci%endmacro 2094cabdff1aSopenharmony_ci 2095cabdff1aSopenharmony_ci%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL 2096cabdff1aSopenharmony_ci%macro IADST16_YMM_FN 4 2097cabdff1aSopenharmony_ciINIT_YMM avx2 2098cabdff1aSopenharmony_cicglobal vp9_%1_%3_16x16_add, 4, 4, 16, dst, stride, block, eob 2099cabdff1aSopenharmony_ci mova m1, [blockq+ 32] 2100cabdff1aSopenharmony_ci mova m2, [blockq+ 64] 2101cabdff1aSopenharmony_ci mova m3, [blockq+ 96] 2102cabdff1aSopenharmony_ci mova m5, [blockq+160] 2103cabdff1aSopenharmony_ci mova m6, [blockq+192] 2104cabdff1aSopenharmony_ci mova m7, [blockq+224] 2105cabdff1aSopenharmony_ci mova m8, [blockq+256] 2106cabdff1aSopenharmony_ci mova m9, [blockq+288] 2107cabdff1aSopenharmony_ci mova m10, [blockq+320] 2108cabdff1aSopenharmony_ci mova m11, [blockq+352] 2109cabdff1aSopenharmony_ci mova m12, [blockq+384] 2110cabdff1aSopenharmony_ci mova m13, [blockq+416] 2111cabdff1aSopenharmony_ci mova m14, [blockq+448] 2112cabdff1aSopenharmony_ci mova m15, [blockq+480] 2113cabdff1aSopenharmony_ci 2114cabdff1aSopenharmony_ci VP9_%2_YMM_1D 2115cabdff1aSopenharmony_ci TRANSPOSE16x16W 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \ 2116cabdff1aSopenharmony_ci [blockq+192], [blockq+128], 1 2117cabdff1aSopenharmony_ci mova [blockq+ 0], m0 2118cabdff1aSopenharmony_ci VP9_%4_YMM_1D 2119cabdff1aSopenharmony_ci 2120cabdff1aSopenharmony_ci mova [blockq+224], m7 2121cabdff1aSopenharmony_ci 2122cabdff1aSopenharmony_ci ; store 2123cabdff1aSopenharmony_ci VP9_IDCT8_WRITEx2 0, 1, 6, 7, unused, [pw_512], 6 2124cabdff1aSopenharmony_ci lea dstq, [dstq+2*strideq] 2125cabdff1aSopenharmony_ci VP9_IDCT8_WRITEx2 2, 3, 6, 7, unused, [pw_512], 6 2126cabdff1aSopenharmony_ci lea dstq, [dstq+2*strideq] 2127cabdff1aSopenharmony_ci VP9_IDCT8_WRITEx2 4, 5, 6, 7, unused, [pw_512], 6 2128cabdff1aSopenharmony_ci lea dstq, [dstq+2*strideq] 2129cabdff1aSopenharmony_ci mova m6, [blockq+192] 2130cabdff1aSopenharmony_ci mova m7, [blockq+224] 
2131cabdff1aSopenharmony_ci VP9_IDCT8_WRITEx2 6, 7, 1, 2, unused, [pw_512], 6 2132cabdff1aSopenharmony_ci lea dstq, [dstq+2*strideq] 2133cabdff1aSopenharmony_ci VP9_IDCT8_WRITEx2 8, 9, 1, 2, unused, [pw_512], 6 2134cabdff1aSopenharmony_ci lea dstq, [dstq+2*strideq] 2135cabdff1aSopenharmony_ci VP9_IDCT8_WRITEx2 10, 11, 1, 2, unused, [pw_512], 6 2136cabdff1aSopenharmony_ci lea dstq, [dstq+2*strideq] 2137cabdff1aSopenharmony_ci VP9_IDCT8_WRITEx2 12, 13, 1, 2, unused, [pw_512], 6 2138cabdff1aSopenharmony_ci lea dstq, [dstq+2*strideq] 2139cabdff1aSopenharmony_ci VP9_IDCT8_WRITEx2 14, 15, 1, 2, unused, [pw_512], 6 2140cabdff1aSopenharmony_ci lea dstq, [dstq+2*strideq] 2141cabdff1aSopenharmony_ci 2142cabdff1aSopenharmony_ci ; at the end of the loop, m0 should still be zero 2143cabdff1aSopenharmony_ci ; use that to zero out block coefficients 2144cabdff1aSopenharmony_ci pxor m0, m0 2145cabdff1aSopenharmony_ci ZERO_BLOCK blockq, 32, 16, m0 2146cabdff1aSopenharmony_ci RET 2147cabdff1aSopenharmony_ci%endmacro 2148cabdff1aSopenharmony_ci 2149cabdff1aSopenharmony_ciIADST16_YMM_FN idct, IDCT16, iadst, IADST16 2150cabdff1aSopenharmony_ciIADST16_YMM_FN iadst, IADST16, idct, IDCT16 2151cabdff1aSopenharmony_ciIADST16_YMM_FN iadst, IADST16, iadst, IADST16 2152cabdff1aSopenharmony_ci%endif 2153cabdff1aSopenharmony_ci 2154cabdff1aSopenharmony_ci;--------------------------------------------------------------------------------------------- 2155cabdff1aSopenharmony_ci; void vp9_idct_idct_32x32_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); 2156cabdff1aSopenharmony_ci;--------------------------------------------------------------------------------------------- 2157cabdff1aSopenharmony_ci 2158cabdff1aSopenharmony_ci%macro VP9_IDCT32_1D 2-3 32 ; src, pass, nnzc 2159cabdff1aSopenharmony_ci%if %2 == 1 2160cabdff1aSopenharmony_ci%assign %%str mmsize 2161cabdff1aSopenharmony_ci%else 2162cabdff1aSopenharmony_ci%assign %%str 64 2163cabdff1aSopenharmony_ci%endif 
2164cabdff1aSopenharmony_ci 2165cabdff1aSopenharmony_ci ; first do t0-15, this can be done identical to idct16x16 2166cabdff1aSopenharmony_ci VP9_IDCT16_1D_START %1, %3/2, 64*2, tmpq, 2*%%str, 1 2167cabdff1aSopenharmony_ci 2168cabdff1aSopenharmony_ci ; store everything on stack to make space available for t16-31 2169cabdff1aSopenharmony_ci ; we store interleaved with the output of the second half (t16-31) 2170cabdff1aSopenharmony_ci ; so we don't need to allocate extra stack space 2171cabdff1aSopenharmony_ci mova [tmpq+ 0*%%str], m0 ; t0 2172cabdff1aSopenharmony_ci mova [tmpq+ 4*%%str], m1 ; t1 2173cabdff1aSopenharmony_ci mova [tmpq+ 8*%%str], m2 ; t2 2174cabdff1aSopenharmony_ci mova [tmpq+12*%%str], m3 ; t3 2175cabdff1aSopenharmony_ci mova [tmpq+16*%%str], m4 ; t4 2176cabdff1aSopenharmony_ci mova [tmpq+20*%%str], m5 ; t5 2177cabdff1aSopenharmony_ci%if ARCH_X86_64 2178cabdff1aSopenharmony_ci mova [tmpq+22*%%str], m10 ; t10 2179cabdff1aSopenharmony_ci mova [tmpq+18*%%str], m11 ; t11 2180cabdff1aSopenharmony_ci mova [tmpq+14*%%str], m12 ; t12 2181cabdff1aSopenharmony_ci mova [tmpq+10*%%str], m13 ; t13 2182cabdff1aSopenharmony_ci mova [tmpq+ 6*%%str], m14 ; t14 2183cabdff1aSopenharmony_ci mova [tmpq+ 2*%%str], m15 ; t15 2184cabdff1aSopenharmony_ci%endif 2185cabdff1aSopenharmony_ci 2186cabdff1aSopenharmony_ci mova m0, [tmpq+ 30*%%str] 2187cabdff1aSopenharmony_ci UNSCRATCH 1, 6, tmpq+26*%%str 2188cabdff1aSopenharmony_ci UNSCRATCH 2, 8, tmpq+24*%%str 2189cabdff1aSopenharmony_ci UNSCRATCH 3, 9, tmpq+28*%%str 2190cabdff1aSopenharmony_ci SUMSUB_BA w, 1, 3, 4 ; t6, t9 2191cabdff1aSopenharmony_ci SUMSUB_BA w, 0, 2, 4 ; t7, t8 2192cabdff1aSopenharmony_ci 2193cabdff1aSopenharmony_ci mova [tmpq+24*%%str], m1 ; t6 2194cabdff1aSopenharmony_ci mova [tmpq+28*%%str], m0 ; t7 2195cabdff1aSopenharmony_ci mova [tmpq+30*%%str], m2 ; t8 2196cabdff1aSopenharmony_ci mova [tmpq+26*%%str], m3 ; t9 2197cabdff1aSopenharmony_ci 2198cabdff1aSopenharmony_ci ; then, secondly, do t16-31 
2199cabdff1aSopenharmony_ci%if %3 <= 8 2200cabdff1aSopenharmony_ci mova m4, [%1+ 1*64] 2201cabdff1aSopenharmony_ci mova m7, [%1+ 7*64] 2202cabdff1aSopenharmony_ci 2203cabdff1aSopenharmony_ci pmulhrsw m1, m4, [pw_16364x2] ;t31 2204cabdff1aSopenharmony_ci pmulhrsw m4, [pw_804x2] ;t16 2205cabdff1aSopenharmony_ci 2206cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 5, 0, 1, 4, 16069, 3196, [pd_8192], 6, 2 ; t17, t30 2207cabdff1aSopenharmony_ci 2208cabdff1aSopenharmony_ci pmulhrsw m3, m7, [pw_m5520x2] ;t19 2209cabdff1aSopenharmony_ci pmulhrsw m7, [pw_15426x2] ;t28 2210cabdff1aSopenharmony_ci 2211cabdff1aSopenharmony_ci SCRATCH 4, 13, tmpq+ 1*%%str 2212cabdff1aSopenharmony_ci SCRATCH 5, 12, tmpq+15*%%str 2213cabdff1aSopenharmony_ci 2214cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 2, 6, 7, 3, 3196, m16069, [pd_8192], 4, 5 ; t18, t29 2215cabdff1aSopenharmony_ci%else 2216cabdff1aSopenharmony_ci mova m0, [%1+ 1*64] 2217cabdff1aSopenharmony_ci mova m1, [%1+15*64] 2218cabdff1aSopenharmony_ci%if %3 <= 16 2219cabdff1aSopenharmony_ci pmulhrsw m5, m0, [pw_16364x2] 2220cabdff1aSopenharmony_ci pmulhrsw m0, [pw_804x2] 2221cabdff1aSopenharmony_ci pmulhrsw m4, m1, [pw_m11003x2] 2222cabdff1aSopenharmony_ci pmulhrsw m1, [pw_12140x2] 2223cabdff1aSopenharmony_ci%else 2224cabdff1aSopenharmony_ci mova m4, [%1+17*64] 2225cabdff1aSopenharmony_ci mova m5, [%1+31*64] 2226cabdff1aSopenharmony_ci 2227cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 0, 5, 16364, 804, [pd_8192], 2, 3 ; t16, t31 2228cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 4, 1, 11003, 12140, [pd_8192], 2, 3 ; t17, t30 2229cabdff1aSopenharmony_ci%endif 2230cabdff1aSopenharmony_ci SUMSUB_BA w, 4, 0, 2 2231cabdff1aSopenharmony_ci SUMSUB_BA w, 1, 5, 2 2232cabdff1aSopenharmony_ci 2233cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 5, 0, 16069, 3196, [pd_8192], 2, 3 ; t17, t30 2234cabdff1aSopenharmony_ci 2235cabdff1aSopenharmony_ci SCRATCH 4, 13, tmpq+ 1*%%str 2236cabdff1aSopenharmony_ci SCRATCH 5, 12, tmpq+15*%%str 
2237cabdff1aSopenharmony_ci 2238cabdff1aSopenharmony_ci mova m2, [%1+ 7*64] 2239cabdff1aSopenharmony_ci mova m3, [%1+ 9*64] 2240cabdff1aSopenharmony_ci%if %3 <= 16 2241cabdff1aSopenharmony_ci pmulhrsw m7, m3, [pw_14811x2] 2242cabdff1aSopenharmony_ci pmulhrsw m3, [pw_7005x2] 2243cabdff1aSopenharmony_ci pmulhrsw m6, m2, [pw_m5520x2] 2244cabdff1aSopenharmony_ci pmulhrsw m2, [pw_15426x2] 2245cabdff1aSopenharmony_ci%else 2246cabdff1aSopenharmony_ci mova m7, [%1+23*64] 2247cabdff1aSopenharmony_ci mova m6, [%1+25*64] 2248cabdff1aSopenharmony_ci 2249cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 3, 7, 14811, 7005, [pd_8192], 4, 5 ; t18, t29 2250cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 6, 2, 5520, 15426, [pd_8192], 4, 5 ; t19, t28 2251cabdff1aSopenharmony_ci%endif 2252cabdff1aSopenharmony_ci SUMSUB_BA w, 3, 6, 4 2253cabdff1aSopenharmony_ci SUMSUB_BA w, 7, 2, 4 2254cabdff1aSopenharmony_ci 2255cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 2, 6, 3196, m16069, [pd_8192], 4, 5 ; t18, t29 2256cabdff1aSopenharmony_ci%endif 2257cabdff1aSopenharmony_ci 2258cabdff1aSopenharmony_ci UNSCRATCH 5, 12, tmpq+15*%%str 2259cabdff1aSopenharmony_ci SUMSUB_BA w, 6, 0, 4 2260cabdff1aSopenharmony_ci mova [tmpq+25*%%str], m6 ; t19 2261cabdff1aSopenharmony_ci UNSCRATCH 4, 13, tmpq+ 1*%%str 2262cabdff1aSopenharmony_ci SUMSUB_BA w, 7, 1, 6 2263cabdff1aSopenharmony_ci SUMSUB_BA w, 3, 4, 6 2264cabdff1aSopenharmony_ci mova [tmpq+23*%%str], m3 ; t16 2265cabdff1aSopenharmony_ci SUMSUB_BA w, 2, 5, 6 2266cabdff1aSopenharmony_ci 2267cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 0, 5, 15137, 6270, [pd_8192], 6, 3 ; t18, t29 2268cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 1, 4, 15137, 6270, [pd_8192], 6, 3 ; t19, t28 2269cabdff1aSopenharmony_ci 2270cabdff1aSopenharmony_ci SCRATCH 0, 10, tmpq+ 1*%%str 2271cabdff1aSopenharmony_ci SCRATCH 1, 11, tmpq+ 7*%%str 2272cabdff1aSopenharmony_ci SCRATCH 2, 9, tmpq+ 9*%%str 2273cabdff1aSopenharmony_ci SCRATCH 4, 14, tmpq+15*%%str 2274cabdff1aSopenharmony_ci 
SCRATCH 5, 15, tmpq+17*%%str 2275cabdff1aSopenharmony_ci SCRATCH 7, 13, tmpq+31*%%str 2276cabdff1aSopenharmony_ci 2277cabdff1aSopenharmony_ci%if %3 <= 8 2278cabdff1aSopenharmony_ci mova m0, [%1+ 5*64] 2279cabdff1aSopenharmony_ci mova m3, [%1+ 3*64] 2280cabdff1aSopenharmony_ci 2281cabdff1aSopenharmony_ci pmulhrsw m5, m0, [pw_15893x2] ;t27 2282cabdff1aSopenharmony_ci pmulhrsw m0, [pw_3981x2] ;t20 2283cabdff1aSopenharmony_ci 2284cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 1, 4, 5, 0, 9102, 13623, [pd_8192], 7, 2 ; t21, t26 2285cabdff1aSopenharmony_ci 2286cabdff1aSopenharmony_ci pmulhrsw m6, m3, [pw_m2404x2] ;t23 2287cabdff1aSopenharmony_ci pmulhrsw m3, [pw_16207x2] ;t24 2288cabdff1aSopenharmony_ci 2289cabdff1aSopenharmony_ci SCRATCH 5, 8, tmpq+ 5*%%str 2290cabdff1aSopenharmony_ci SCRATCH 4, 12, tmpq+11*%%str 2291cabdff1aSopenharmony_ci 2292cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 7, 2, 3, 6, 13623, m9102, [pd_8192], 4, 5 ; t22, t25 2293cabdff1aSopenharmony_ci%else 2294cabdff1aSopenharmony_ci mova m4, [%1+ 5*64] 2295cabdff1aSopenharmony_ci mova m5, [%1+11*64] 2296cabdff1aSopenharmony_ci%if %3 <= 16 2297cabdff1aSopenharmony_ci pmulhrsw m1, m4, [pw_15893x2] 2298cabdff1aSopenharmony_ci pmulhrsw m4, [pw_3981x2] 2299cabdff1aSopenharmony_ci pmulhrsw m0, m5, [pw_m8423x2] 2300cabdff1aSopenharmony_ci pmulhrsw m5, [pw_14053x2] 2301cabdff1aSopenharmony_ci%else 2302cabdff1aSopenharmony_ci mova m0, [%1+21*64] 2303cabdff1aSopenharmony_ci mova m1, [%1+27*64] 2304cabdff1aSopenharmony_ci 2305cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 4, 1, 15893, 3981, [pd_8192], 2, 3 ; t20, t27 2306cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 0, 5, 8423, 14053, [pd_8192], 2, 3 ; t21, t26 2307cabdff1aSopenharmony_ci%endif 2308cabdff1aSopenharmony_ci SUMSUB_BA w, 0, 4, 2 2309cabdff1aSopenharmony_ci SUMSUB_BA w, 5, 1, 2 2310cabdff1aSopenharmony_ci 2311cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 1, 4, 9102, 13623, [pd_8192], 2, 3 ; t21, t26 2312cabdff1aSopenharmony_ci 
2313cabdff1aSopenharmony_ci SCRATCH 5, 8, tmpq+ 5*%%str 2314cabdff1aSopenharmony_ci SCRATCH 4, 12, tmpq+11*%%str 2315cabdff1aSopenharmony_ci 2316cabdff1aSopenharmony_ci mova m7, [%1+ 3*64] 2317cabdff1aSopenharmony_ci mova m6, [%1+13*64] 2318cabdff1aSopenharmony_ci%if %3 <= 16 2319cabdff1aSopenharmony_ci pmulhrsw m3, m6, [pw_13160x2] 2320cabdff1aSopenharmony_ci pmulhrsw m6, [pw_9760x2] 2321cabdff1aSopenharmony_ci pmulhrsw m2, m7, [pw_m2404x2] 2322cabdff1aSopenharmony_ci pmulhrsw m7, [pw_16207x2] 2323cabdff1aSopenharmony_ci%else 2324cabdff1aSopenharmony_ci mova m2, [%1+29*64] 2325cabdff1aSopenharmony_ci mova m3, [%1+19*64] 2326cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 6, 3, 13160, 9760, [pd_8192], 4, 5 ; t22, t25 2327cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 2, 7, 2404, 16207, [pd_8192], 4, 5 ; t23, t24 2328cabdff1aSopenharmony_ci%endif 2329cabdff1aSopenharmony_ci SUMSUB_BA w, 6, 2, 4 2330cabdff1aSopenharmony_ci SUMSUB_BA w, 3, 7, 4 2331cabdff1aSopenharmony_ci 2332cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 7, 2, 13623, m9102, [pd_8192], 4, 5 ; t22, t25 2333cabdff1aSopenharmony_ci%endif 2334cabdff1aSopenharmony_ci 2335cabdff1aSopenharmony_ci ; m4=t16, m5=t17, m9=t18, m8=t19, m0=t20, m1=t21, m13=t22, m12=t23, 2336cabdff1aSopenharmony_ci ; m3=t24, m2=t25, m14=t26, m15=t27, m7=t28, m6=t29, m10=t30, m11=t31 2337cabdff1aSopenharmony_ci 2338cabdff1aSopenharmony_ci UNSCRATCH 4, 12, tmpq+11*%%str 2339cabdff1aSopenharmony_ci SUMSUB_BA w, 0, 6, 5 2340cabdff1aSopenharmony_ci SUMSUB_BA w, 4, 2, 5 2341cabdff1aSopenharmony_ci UNSCRATCH 5, 8, tmpq+ 5*%%str 2342cabdff1aSopenharmony_ci SCRATCH 4, 8, tmpq+11*%%str 2343cabdff1aSopenharmony_ci SUMSUB_BA w, 1, 7, 4 2344cabdff1aSopenharmony_ci SUMSUB_BA w, 5, 3, 4 2345cabdff1aSopenharmony_ci SCRATCH 5, 12, tmpq+ 5*%%str 2346cabdff1aSopenharmony_ci 2347cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 3, 6, 6270, m15137, [pd_8192], 4, 5 ; t20, t27 2348cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 2, 7, 6270, m15137, 
[pd_8192], 4, 5 ; t21, t26 2349cabdff1aSopenharmony_ci 2350cabdff1aSopenharmony_ci ; m8[s]=t16, m9=t17, m5=t18, m4[s]=t19, m12=t20, m13=t21, m1=t22, m0=t23, 2351cabdff1aSopenharmony_ci ; m15=t24, m14=t25, m2=t26, m3=t27, m11=t28, m10=t29, m6=t30, m7=t31 2352cabdff1aSopenharmony_ci 2353cabdff1aSopenharmony_ci UNSCRATCH 5, 9, tmpq+ 9*%%str 2354cabdff1aSopenharmony_ci mova m4, [tmpq+23*%%str] ; t16 2355cabdff1aSopenharmony_ci%if ARCH_X86_64 2356cabdff1aSopenharmony_ci SUMSUB_BA w, 1, 5, 9 2357cabdff1aSopenharmony_ci SUMSUB_BA w, 0, 4, 9 2358cabdff1aSopenharmony_ci%else 2359cabdff1aSopenharmony_ci SUMSUB_BADC w, 1, 5, 0, 4 2360cabdff1aSopenharmony_ci%endif 2361cabdff1aSopenharmony_ci mova [tmpq+29*%%str], m1 ; t17 2362cabdff1aSopenharmony_ci mova [tmpq+21*%%str], m0 ; t16 2363cabdff1aSopenharmony_ci UNSCRATCH 0, 10, tmpq+ 1*%%str 2364cabdff1aSopenharmony_ci UNSCRATCH 1, 11, tmpq+ 7*%%str 2365cabdff1aSopenharmony_ci%if ARCH_X86_64 2366cabdff1aSopenharmony_ci SUMSUB_BA w, 2, 0, 9 2367cabdff1aSopenharmony_ci SUMSUB_BA w, 3, 1, 9 2368cabdff1aSopenharmony_ci%else 2369cabdff1aSopenharmony_ci SUMSUB_BADC w, 2, 0, 3, 1 2370cabdff1aSopenharmony_ci%endif 2371cabdff1aSopenharmony_ci mova [tmpq+ 9*%%str], m2 ; t18 2372cabdff1aSopenharmony_ci mova [tmpq+13*%%str], m3 ; t19 2373cabdff1aSopenharmony_ci SCRATCH 0, 10, tmpq+23*%%str 2374cabdff1aSopenharmony_ci SCRATCH 1, 11, tmpq+27*%%str 2375cabdff1aSopenharmony_ci 2376cabdff1aSopenharmony_ci UNSCRATCH 2, 14, tmpq+15*%%str 2377cabdff1aSopenharmony_ci UNSCRATCH 3, 15, tmpq+17*%%str 2378cabdff1aSopenharmony_ci SUMSUB_BA w, 6, 2, 0 2379cabdff1aSopenharmony_ci SUMSUB_BA w, 7, 3, 0 2380cabdff1aSopenharmony_ci SCRATCH 6, 14, tmpq+ 3*%%str 2381cabdff1aSopenharmony_ci SCRATCH 7, 15, tmpq+ 7*%%str 2382cabdff1aSopenharmony_ci 2383cabdff1aSopenharmony_ci UNSCRATCH 0, 8, tmpq+11*%%str 2384cabdff1aSopenharmony_ci mova m1, [tmpq+25*%%str] ; t19 2385cabdff1aSopenharmony_ci UNSCRATCH 6, 12, tmpq+ 5*%%str 2386cabdff1aSopenharmony_ci UNSCRATCH 7, 13, 
tmpq+31*%%str 2387cabdff1aSopenharmony_ci%if ARCH_X86_64 2388cabdff1aSopenharmony_ci SUMSUB_BA w, 0, 1, 9 2389cabdff1aSopenharmony_ci SUMSUB_BA w, 6, 7, 9 2390cabdff1aSopenharmony_ci%else 2391cabdff1aSopenharmony_ci SUMSUB_BADC w, 0, 1, 6, 7 2392cabdff1aSopenharmony_ci%endif 2393cabdff1aSopenharmony_ci 2394cabdff1aSopenharmony_ci ; m0=t16, m1=t17, m2=t18, m3=t19, m11=t20, m10=t21, m9=t22, m8=t23, 2395cabdff1aSopenharmony_ci ; m7=t24, m6=t25, m5=t26, m4=t27, m12=t28, m13=t29, m14=t30, m15=t31 2396cabdff1aSopenharmony_ci 2397cabdff1aSopenharmony_ci%if 0; cpuflag(ssse3) 2398cabdff1aSopenharmony_ci%if ARCH_X86_64 2399cabdff1aSopenharmony_ci SUMSUB_BA w, 4, 7, 8 2400cabdff1aSopenharmony_ci SUMSUB_BA w, 5, 1, 8 2401cabdff1aSopenharmony_ci%else 2402cabdff1aSopenharmony_ci SUMSUB_BADC w, 4, 7, 5, 1 2403cabdff1aSopenharmony_ci%endif 2404cabdff1aSopenharmony_ci 2405cabdff1aSopenharmony_ci pmulhrsw m7, [pw_11585x2] 2406cabdff1aSopenharmony_ci pmulhrsw m4, [pw_11585x2] 2407cabdff1aSopenharmony_ci pmulhrsw m1, [pw_11585x2] 2408cabdff1aSopenharmony_ci pmulhrsw m5, [pw_11585x2] 2409cabdff1aSopenharmony_ci 2410cabdff1aSopenharmony_ci mova [tmpq+ 5*%%str], m7 ; t23 2411cabdff1aSopenharmony_ci SCRATCH 1, 13, tmpq+25*%%str 2412cabdff1aSopenharmony_ci UNSCRATCH 7, 10, tmpq+23*%%str 2413cabdff1aSopenharmony_ci UNSCRATCH 1, 11, tmpq+27*%%str 2414cabdff1aSopenharmony_ci 2415cabdff1aSopenharmony_ci%if ARCH_X86_64 2416cabdff1aSopenharmony_ci SUMSUB_BA w, 7, 3, 10 2417cabdff1aSopenharmony_ci SUMSUB_BA w, 1, 2, 10 2418cabdff1aSopenharmony_ci%else 2419cabdff1aSopenharmony_ci SUMSUB_BADC w, 7, 3, 1, 2 2420cabdff1aSopenharmony_ci%endif 2421cabdff1aSopenharmony_ci 2422cabdff1aSopenharmony_ci pmulhrsw m3, [pw_11585x2] 2423cabdff1aSopenharmony_ci pmulhrsw m7, [pw_11585x2] 2424cabdff1aSopenharmony_ci pmulhrsw m2, [pw_11585x2] 2425cabdff1aSopenharmony_ci pmulhrsw m1, [pw_11585x2] 2426cabdff1aSopenharmony_ci%else 2427cabdff1aSopenharmony_ci SCRATCH 0, 8, tmpq+15*%%str 2428cabdff1aSopenharmony_ci 
SCRATCH 6, 9, tmpq+17*%%str 2429cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 7, 4, 11585, 11585, [pd_8192], 0, 6 2430cabdff1aSopenharmony_ci mova [tmpq+ 5*%%str], m7 ; t23 2431cabdff1aSopenharmony_ci UNSCRATCH 7, 10, tmpq+23*%%str 2432cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 1, 5, 11585, 11585, [pd_8192], 0, 6 2433cabdff1aSopenharmony_ci SCRATCH 1, 13, tmpq+25*%%str 2434cabdff1aSopenharmony_ci UNSCRATCH 1, 11, tmpq+27*%%str 2435cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 3, 7, 11585, 11585, [pd_8192], 0, 6 2436cabdff1aSopenharmony_ci VP9_UNPACK_MULSUB_2W_4X 2, 1, 11585, 11585, [pd_8192], 0, 6 2437cabdff1aSopenharmony_ci UNSCRATCH 0, 8, tmpq+15*%%str 2438cabdff1aSopenharmony_ci UNSCRATCH 6, 9, tmpq+17*%%str 2439cabdff1aSopenharmony_ci%endif 2440cabdff1aSopenharmony_ci 2441cabdff1aSopenharmony_ci ; m0=t16, m1=t17, m2=t18, m3=t19, m4=t20, m5=t21, m6=t22, m7=t23, 2442cabdff1aSopenharmony_ci ; m8=t24, m9=t25, m10=t26, m11=t27, m12=t28, m13=t29, m14=t30, m15=t31 2443cabdff1aSopenharmony_ci 2444cabdff1aSopenharmony_ci ; then do final pass to sumsub+store the two halves 2445cabdff1aSopenharmony_ci%if %2 == 1 2446cabdff1aSopenharmony_ci mova [tmpq+17*%%str], m2 ; t20 2447cabdff1aSopenharmony_ci mova [tmpq+ 1*%%str], m3 ; t21 2448cabdff1aSopenharmony_ci%if ARCH_X86_64 2449cabdff1aSopenharmony_ci mova [tmpq+25*%%str], m13 ; t22 2450cabdff1aSopenharmony_ci 2451cabdff1aSopenharmony_ci mova m8, [tmpq+ 0*%%str] ; t0 2452cabdff1aSopenharmony_ci mova m9, [tmpq+ 4*%%str] ; t1 2453cabdff1aSopenharmony_ci mova m12, [tmpq+ 8*%%str] ; t2 2454cabdff1aSopenharmony_ci mova m11, [tmpq+12*%%str] ; t3 2455cabdff1aSopenharmony_ci mova m2, [tmpq+16*%%str] ; t4 2456cabdff1aSopenharmony_ci mova m3, [tmpq+20*%%str] ; t5 2457cabdff1aSopenharmony_ci mova m13, [tmpq+24*%%str] ; t6 2458cabdff1aSopenharmony_ci 2459cabdff1aSopenharmony_ci SUMSUB_BA w, 6, 8, 10 2460cabdff1aSopenharmony_ci mova [tmpq+ 3*%%str], m8 ; t15 2461cabdff1aSopenharmony_ci SUMSUB_BA w, 0, 9, 8 
2462cabdff1aSopenharmony_ci SUMSUB_BA w, 15, 12, 8 2463cabdff1aSopenharmony_ci SUMSUB_BA w, 14, 11, 8 2464cabdff1aSopenharmony_ci SUMSUB_BA w, 1, 2, 8 2465cabdff1aSopenharmony_ci SUMSUB_BA w, 7, 3, 8 2466cabdff1aSopenharmony_ci SUMSUB_BA w, 5, 13, 8 2467cabdff1aSopenharmony_ci mova m10, [tmpq+28*%%str] ; t7 2468cabdff1aSopenharmony_ci SUMSUB_BA w, 4, 10, 8 2469cabdff1aSopenharmony_ci%if cpuflag(avx2) 2470cabdff1aSopenharmony_ci ; the "shitty" about this idct is that the final pass does the outermost 2471cabdff1aSopenharmony_ci ; interleave sumsubs (t0/31, t1/30, etc) but the tN for the 16x16 need 2472cabdff1aSopenharmony_ci ; to be sequential, which means I need to load/store half of the sumsub 2473cabdff1aSopenharmony_ci ; intermediates back to/from memory to get a 16x16 transpose going... 2474cabdff1aSopenharmony_ci ; This would be easier if we had more (e.g. 32) YMM regs here. 2475cabdff1aSopenharmony_ci mova [tmpq+ 7*%%str], m9 2476cabdff1aSopenharmony_ci mova [tmpq+11*%%str], m12 2477cabdff1aSopenharmony_ci mova [tmpq+15*%%str], m11 2478cabdff1aSopenharmony_ci mova [tmpq+19*%%str], m2 2479cabdff1aSopenharmony_ci mova [tmpq+23*%%str], m3 2480cabdff1aSopenharmony_ci mova [tmpq+27*%%str], m13 2481cabdff1aSopenharmony_ci mova [tmpq+31*%%str], m10 2482cabdff1aSopenharmony_ci mova [tmpq+12*%%str], m5 2483cabdff1aSopenharmony_ci 2484cabdff1aSopenharmony_ci mova m13, [tmpq+30*%%str] ; t8 2485cabdff1aSopenharmony_ci mova m12, [tmpq+26*%%str] ; t9 2486cabdff1aSopenharmony_ci mova m11, [tmpq+22*%%str] ; t10 2487cabdff1aSopenharmony_ci mova m10, [tmpq+18*%%str] ; t11 2488cabdff1aSopenharmony_ci mova m9, [tmpq+17*%%str] ; t20 2489cabdff1aSopenharmony_ci mova m8, [tmpq+ 1*%%str] ; t21 2490cabdff1aSopenharmony_ci mova m3, [tmpq+25*%%str] ; t22 2491cabdff1aSopenharmony_ci mova m2, [tmpq+ 5*%%str] ; t23 2492cabdff1aSopenharmony_ci 2493cabdff1aSopenharmony_ci SUMSUB_BA w, 9, 10, 5 2494cabdff1aSopenharmony_ci SUMSUB_BA w, 8, 11, 5 2495cabdff1aSopenharmony_ci SUMSUB_BA w, 3, 12, 
5 2496cabdff1aSopenharmony_ci SUMSUB_BA w, 2, 13, 5 2497cabdff1aSopenharmony_ci mova [tmpq+ 1*%%str], m10 2498cabdff1aSopenharmony_ci mova [tmpq+ 5*%%str], m11 2499cabdff1aSopenharmony_ci mova [tmpq+17*%%str], m12 2500cabdff1aSopenharmony_ci mova [tmpq+25*%%str], m13 2501cabdff1aSopenharmony_ci 2502cabdff1aSopenharmony_ci mova m13, [tmpq+14*%%str] ; t12 2503cabdff1aSopenharmony_ci mova m12, [tmpq+10*%%str] ; t13 2504cabdff1aSopenharmony_ci mova m11, [tmpq+ 9*%%str] ; t18 2505cabdff1aSopenharmony_ci mova m10, [tmpq+13*%%str] ; t19 2506cabdff1aSopenharmony_ci 2507cabdff1aSopenharmony_ci SUMSUB_BA w, 11, 12, 5 2508cabdff1aSopenharmony_ci SUMSUB_BA w, 10, 13, 5 2509cabdff1aSopenharmony_ci mova [tmpq+ 9*%%str], m13 2510cabdff1aSopenharmony_ci mova [tmpq+13*%%str], m12 2511cabdff1aSopenharmony_ci mova [tmpq+10*%%str], m10 2512cabdff1aSopenharmony_ci mova [tmpq+14*%%str], m11 2513cabdff1aSopenharmony_ci 2514cabdff1aSopenharmony_ci mova m13, [tmpq+ 6*%%str] ; t14 2515cabdff1aSopenharmony_ci mova m12, [tmpq+ 2*%%str] ; t15 2516cabdff1aSopenharmony_ci mova m11, [tmpq+21*%%str] ; t16 2517cabdff1aSopenharmony_ci mova m10, [tmpq+29*%%str] ; t17 2518cabdff1aSopenharmony_ci SUMSUB_BA w, 11, 12, 5 2519cabdff1aSopenharmony_ci SUMSUB_BA w, 10, 13, 5 2520cabdff1aSopenharmony_ci mova [tmpq+21*%%str], m12 2521cabdff1aSopenharmony_ci mova [tmpq+29*%%str], m13 2522cabdff1aSopenharmony_ci mova m12, [tmpq+10*%%str] 2523cabdff1aSopenharmony_ci mova m13, [tmpq+14*%%str] 2524cabdff1aSopenharmony_ci 2525cabdff1aSopenharmony_ci TRANSPOSE16x16W 6, 0, 15, 14, 1, 7, 5, 4, \ 2526cabdff1aSopenharmony_ci 2, 3, 8, 9, 12, 13, 10, 11, \ 2527cabdff1aSopenharmony_ci [tmpq+12*%%str], [tmpq+ 8*%%str], 1 2528cabdff1aSopenharmony_ci mova [tmpq+ 0*%%str], m6 2529cabdff1aSopenharmony_ci mova [tmpq+ 2*%%str], m0 2530cabdff1aSopenharmony_ci mova [tmpq+ 4*%%str], m15 2531cabdff1aSopenharmony_ci mova [tmpq+ 6*%%str], m14 2532cabdff1aSopenharmony_ci mova [tmpq+10*%%str], m7 2533cabdff1aSopenharmony_ci mova 
[tmpq+12*%%str], m5 2534cabdff1aSopenharmony_ci mova [tmpq+14*%%str], m4 2535cabdff1aSopenharmony_ci mova [tmpq+16*%%str], m2 2536cabdff1aSopenharmony_ci mova [tmpq+18*%%str], m3 2537cabdff1aSopenharmony_ci mova [tmpq+20*%%str], m8 2538cabdff1aSopenharmony_ci mova [tmpq+22*%%str], m9 2539cabdff1aSopenharmony_ci mova [tmpq+24*%%str], m12 2540cabdff1aSopenharmony_ci mova [tmpq+26*%%str], m13 2541cabdff1aSopenharmony_ci mova [tmpq+28*%%str], m10 2542cabdff1aSopenharmony_ci mova [tmpq+30*%%str], m11 2543cabdff1aSopenharmony_ci 2544cabdff1aSopenharmony_ci mova m0, [tmpq+21*%%str] 2545cabdff1aSopenharmony_ci mova m1, [tmpq+29*%%str] 2546cabdff1aSopenharmony_ci mova m2, [tmpq+13*%%str] 2547cabdff1aSopenharmony_ci mova m3, [tmpq+ 9*%%str] 2548cabdff1aSopenharmony_ci mova m4, [tmpq+ 1*%%str] 2549cabdff1aSopenharmony_ci mova m5, [tmpq+ 5*%%str] 2550cabdff1aSopenharmony_ci mova m7, [tmpq+25*%%str] 2551cabdff1aSopenharmony_ci mova m8, [tmpq+31*%%str] 2552cabdff1aSopenharmony_ci mova m9, [tmpq+27*%%str] 2553cabdff1aSopenharmony_ci mova m10, [tmpq+23*%%str] 2554cabdff1aSopenharmony_ci mova m11, [tmpq+19*%%str] 2555cabdff1aSopenharmony_ci mova m12, [tmpq+15*%%str] 2556cabdff1aSopenharmony_ci mova m13, [tmpq+11*%%str] 2557cabdff1aSopenharmony_ci mova m14, [tmpq+ 7*%%str] 2558cabdff1aSopenharmony_ci mova m15, [tmpq+ 3*%%str] 2559cabdff1aSopenharmony_ci TRANSPOSE16x16W 0, 1, 2, 3, 4, 5, 6, 7, \ 2560cabdff1aSopenharmony_ci 8, 9, 10, 11, 12, 13, 14, 15, \ 2561cabdff1aSopenharmony_ci [tmpq+17*%%str], [tmpq+ 9*%%str], 1 2562cabdff1aSopenharmony_ci mova [tmpq+ 1*%%str], m0 2563cabdff1aSopenharmony_ci mova [tmpq+ 3*%%str], m1 2564cabdff1aSopenharmony_ci mova [tmpq+ 5*%%str], m2 2565cabdff1aSopenharmony_ci mova [tmpq+ 7*%%str], m3 2566cabdff1aSopenharmony_ci mova [tmpq+11*%%str], m5 2567cabdff1aSopenharmony_ci mova [tmpq+13*%%str], m6 2568cabdff1aSopenharmony_ci mova [tmpq+15*%%str], m7 2569cabdff1aSopenharmony_ci mova [tmpq+17*%%str], m8 2570cabdff1aSopenharmony_ci mova [tmpq+19*%%str], 
m9 2571cabdff1aSopenharmony_ci mova [tmpq+21*%%str], m10 2572cabdff1aSopenharmony_ci mova [tmpq+23*%%str], m11 2573cabdff1aSopenharmony_ci mova [tmpq+25*%%str], m12 2574cabdff1aSopenharmony_ci mova [tmpq+27*%%str], m13 2575cabdff1aSopenharmony_ci mova [tmpq+29*%%str], m14 2576cabdff1aSopenharmony_ci mova [tmpq+31*%%str], m15 2577cabdff1aSopenharmony_ci%else ; !avx2 2578cabdff1aSopenharmony_ci TRANSPOSE8x8W 6, 0, 15, 14, 1, 7, 5, 4, 8 2579cabdff1aSopenharmony_ci mova [tmpq+ 0*%%str], m6 2580cabdff1aSopenharmony_ci mova [tmpq+ 4*%%str], m0 2581cabdff1aSopenharmony_ci mova [tmpq+ 8*%%str], m15 2582cabdff1aSopenharmony_ci mova [tmpq+12*%%str], m14 2583cabdff1aSopenharmony_ci mova [tmpq+16*%%str], m1 2584cabdff1aSopenharmony_ci mova [tmpq+20*%%str], m7 2585cabdff1aSopenharmony_ci mova [tmpq+24*%%str], m5 2586cabdff1aSopenharmony_ci mova [tmpq+28*%%str], m4 2587cabdff1aSopenharmony_ci 2588cabdff1aSopenharmony_ci mova m8, [tmpq+ 3*%%str] ; t15 2589cabdff1aSopenharmony_ci TRANSPOSE8x8W 10, 13, 3, 2, 11, 12, 9, 8, 0 2590cabdff1aSopenharmony_ci mova [tmpq+ 3*%%str], m10 2591cabdff1aSopenharmony_ci mova [tmpq+ 7*%%str], m13 2592cabdff1aSopenharmony_ci mova [tmpq+11*%%str], m3 2593cabdff1aSopenharmony_ci mova [tmpq+15*%%str], m2 2594cabdff1aSopenharmony_ci mova [tmpq+19*%%str], m11 2595cabdff1aSopenharmony_ci mova [tmpq+23*%%str], m12 2596cabdff1aSopenharmony_ci mova [tmpq+27*%%str], m9 2597cabdff1aSopenharmony_ci mova [tmpq+31*%%str], m8 2598cabdff1aSopenharmony_ci 2599cabdff1aSopenharmony_ci mova m15, [tmpq+30*%%str] ; t8 2600cabdff1aSopenharmony_ci mova m14, [tmpq+26*%%str] ; t9 2601cabdff1aSopenharmony_ci mova m13, [tmpq+22*%%str] ; t10 2602cabdff1aSopenharmony_ci mova m12, [tmpq+18*%%str] ; t11 2603cabdff1aSopenharmony_ci mova m11, [tmpq+14*%%str] ; t12 2604cabdff1aSopenharmony_ci mova m10, [tmpq+10*%%str] ; t13 2605cabdff1aSopenharmony_ci mova m9, [tmpq+ 6*%%str] ; t14 2606cabdff1aSopenharmony_ci mova m8, [tmpq+ 2*%%str] ; t15 2607cabdff1aSopenharmony_ci mova m7, 
[tmpq+21*%%str] ; t16 2608cabdff1aSopenharmony_ci mova m6, [tmpq+29*%%str] ; t17 2609cabdff1aSopenharmony_ci mova m5, [tmpq+ 9*%%str] ; t18 2610cabdff1aSopenharmony_ci mova m4, [tmpq+13*%%str] ; t19 2611cabdff1aSopenharmony_ci mova m3, [tmpq+17*%%str] ; t20 2612cabdff1aSopenharmony_ci mova m2, [tmpq+ 1*%%str] ; t21 2613cabdff1aSopenharmony_ci mova m1, [tmpq+25*%%str] ; t22 2614cabdff1aSopenharmony_ci 2615cabdff1aSopenharmony_ci SUMSUB_BA w, 7, 8, 0 2616cabdff1aSopenharmony_ci mova [tmpq+ 2*%%str], m8 2617cabdff1aSopenharmony_ci mova m0, [tmpq+ 5*%%str] ; t23 2618cabdff1aSopenharmony_ci SUMSUB_BA w, 6, 9, 8 2619cabdff1aSopenharmony_ci SUMSUB_BA w, 5, 10, 8 2620cabdff1aSopenharmony_ci SUMSUB_BA w, 4, 11, 8 2621cabdff1aSopenharmony_ci SUMSUB_BA w, 3, 12, 8 2622cabdff1aSopenharmony_ci SUMSUB_BA w, 2, 13, 8 2623cabdff1aSopenharmony_ci SUMSUB_BA w, 1, 14, 8 2624cabdff1aSopenharmony_ci SUMSUB_BA w, 0, 15, 8 2625cabdff1aSopenharmony_ci 2626cabdff1aSopenharmony_ci TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 2627cabdff1aSopenharmony_ci mova [tmpq+ 1*%%str], m0 2628cabdff1aSopenharmony_ci mova [tmpq+ 5*%%str], m1 2629cabdff1aSopenharmony_ci mova [tmpq+ 9*%%str], m2 2630cabdff1aSopenharmony_ci mova [tmpq+13*%%str], m3 2631cabdff1aSopenharmony_ci mova [tmpq+17*%%str], m4 2632cabdff1aSopenharmony_ci mova [tmpq+21*%%str], m5 2633cabdff1aSopenharmony_ci mova [tmpq+25*%%str], m6 2634cabdff1aSopenharmony_ci mova [tmpq+29*%%str], m7 2635cabdff1aSopenharmony_ci 2636cabdff1aSopenharmony_ci mova m8, [tmpq+ 2*%%str] 2637cabdff1aSopenharmony_ci TRANSPOSE8x8W 8, 9, 10, 11, 12, 13, 14, 15, 0 2638cabdff1aSopenharmony_ci mova [tmpq+ 2*%%str], m8 2639cabdff1aSopenharmony_ci mova [tmpq+ 6*%%str], m9 2640cabdff1aSopenharmony_ci mova [tmpq+10*%%str], m10 2641cabdff1aSopenharmony_ci mova [tmpq+14*%%str], m11 2642cabdff1aSopenharmony_ci mova [tmpq+18*%%str], m12 2643cabdff1aSopenharmony_ci mova [tmpq+22*%%str], m13 2644cabdff1aSopenharmony_ci mova [tmpq+26*%%str], m14 2645cabdff1aSopenharmony_ci mova 
[tmpq+30*%%str], m15 2646cabdff1aSopenharmony_ci%endif ; avx2 2647cabdff1aSopenharmony_ci%else 2648cabdff1aSopenharmony_ci mova m2, [tmpq+24*%%str] ; t6 2649cabdff1aSopenharmony_ci mova m3, [tmpq+28*%%str] ; t7 2650cabdff1aSopenharmony_ci SUMSUB_BADC w, 5, 2, 4, 3 2651cabdff1aSopenharmony_ci mova [tmpq+24*%%str], m5 2652cabdff1aSopenharmony_ci mova [tmpq+23*%%str], m2 2653cabdff1aSopenharmony_ci mova [tmpq+28*%%str], m4 2654cabdff1aSopenharmony_ci mova [tmpq+19*%%str], m3 2655cabdff1aSopenharmony_ci 2656cabdff1aSopenharmony_ci mova m2, [tmpq+16*%%str] ; t4 2657cabdff1aSopenharmony_ci mova m3, [tmpq+20*%%str] ; t5 2658cabdff1aSopenharmony_ci SUMSUB_BA w, 1, 2, 5 2659cabdff1aSopenharmony_ci SUMSUB_BA w, 7, 3, 5 2660cabdff1aSopenharmony_ci mova [tmpq+15*%%str], m2 2661cabdff1aSopenharmony_ci mova [tmpq+11*%%str], m3 2662cabdff1aSopenharmony_ci 2663cabdff1aSopenharmony_ci mova m2, [tmpq+ 0*%%str] ; t0 2664cabdff1aSopenharmony_ci mova m3, [tmpq+ 4*%%str] ; t1 2665cabdff1aSopenharmony_ci SUMSUB_BA w, 6, 2, 5 2666cabdff1aSopenharmony_ci SUMSUB_BA w, 0, 3, 5 2667cabdff1aSopenharmony_ci mova [tmpq+31*%%str], m2 2668cabdff1aSopenharmony_ci mova [tmpq+27*%%str], m3 2669cabdff1aSopenharmony_ci 2670cabdff1aSopenharmony_ci mova m2, [tmpq+ 8*%%str] ; t2 2671cabdff1aSopenharmony_ci mova m3, [tmpq+12*%%str] ; t3 2672cabdff1aSopenharmony_ci mova m5, [tmpq+ 7*%%str] 2673cabdff1aSopenharmony_ci mova m4, [tmpq+ 3*%%str] 2674cabdff1aSopenharmony_ci SUMSUB_BADC w, 5, 2, 4, 3 2675cabdff1aSopenharmony_ci mova [tmpq+ 7*%%str], m2 2676cabdff1aSopenharmony_ci mova [tmpq+ 3*%%str], m3 2677cabdff1aSopenharmony_ci 2678cabdff1aSopenharmony_ci mova m3, [tmpq+28*%%str] 2679cabdff1aSopenharmony_ci TRANSPOSE8x8W 6, 0, 5, 4, 1, 7, 2, 3, [tmpq+24*%%str], [tmpq+16*%%str], 1 2680cabdff1aSopenharmony_ci mova [tmpq+ 0*%%str], m6 2681cabdff1aSopenharmony_ci mova [tmpq+ 4*%%str], m0 2682cabdff1aSopenharmony_ci mova [tmpq+ 8*%%str], m5 2683cabdff1aSopenharmony_ci mova [tmpq+12*%%str], m4 
2684cabdff1aSopenharmony_ci mova [tmpq+20*%%str], m7 2685cabdff1aSopenharmony_ci mova [tmpq+24*%%str], m2 2686cabdff1aSopenharmony_ci mova [tmpq+28*%%str], m3 2687cabdff1aSopenharmony_ci 2688cabdff1aSopenharmony_ci mova m6, [tmpq+19*%%str] 2689cabdff1aSopenharmony_ci mova m0, [tmpq+23*%%str] 2690cabdff1aSopenharmony_ci mova m5, [tmpq+11*%%str] 2691cabdff1aSopenharmony_ci mova m4, [tmpq+15*%%str] 2692cabdff1aSopenharmony_ci mova m1, [tmpq+ 3*%%str] 2693cabdff1aSopenharmony_ci mova m7, [tmpq+ 7*%%str] 2694cabdff1aSopenharmony_ci mova m3, [tmpq+31*%%str] 2695cabdff1aSopenharmony_ci TRANSPOSE8x8W 6, 0, 5, 4, 1, 7, 2, 3, [tmpq+27*%%str], [tmpq+19*%%str], 1 2696cabdff1aSopenharmony_ci mova [tmpq+ 3*%%str], m6 2697cabdff1aSopenharmony_ci mova [tmpq+ 7*%%str], m0 2698cabdff1aSopenharmony_ci mova [tmpq+11*%%str], m5 2699cabdff1aSopenharmony_ci mova [tmpq+15*%%str], m4 2700cabdff1aSopenharmony_ci mova [tmpq+23*%%str], m7 2701cabdff1aSopenharmony_ci mova [tmpq+27*%%str], m2 2702cabdff1aSopenharmony_ci mova [tmpq+31*%%str], m3 2703cabdff1aSopenharmony_ci 2704cabdff1aSopenharmony_ci mova m1, [tmpq+ 6*%%str] ; t14 2705cabdff1aSopenharmony_ci mova m0, [tmpq+ 2*%%str] ; t15 2706cabdff1aSopenharmony_ci mova m7, [tmpq+21*%%str] ; t16 2707cabdff1aSopenharmony_ci mova m6, [tmpq+29*%%str] ; t17 2708cabdff1aSopenharmony_ci SUMSUB_BA w, 7, 0, 2 2709cabdff1aSopenharmony_ci SUMSUB_BA w, 6, 1, 2 2710cabdff1aSopenharmony_ci mova [tmpq+29*%%str], m7 2711cabdff1aSopenharmony_ci mova [tmpq+ 2*%%str], m0 2712cabdff1aSopenharmony_ci mova [tmpq+21*%%str], m6 2713cabdff1aSopenharmony_ci mova [tmpq+ 6*%%str], m1 2714cabdff1aSopenharmony_ci 2715cabdff1aSopenharmony_ci mova m1, [tmpq+14*%%str] ; t12 2716cabdff1aSopenharmony_ci mova m0, [tmpq+10*%%str] ; t13 2717cabdff1aSopenharmony_ci mova m5, [tmpq+ 9*%%str] ; t18 2718cabdff1aSopenharmony_ci mova m4, [tmpq+13*%%str] ; t19 2719cabdff1aSopenharmony_ci SUMSUB_BA w, 5, 0, 2 2720cabdff1aSopenharmony_ci SUMSUB_BA w, 4, 1, 2 2721cabdff1aSopenharmony_ci mova 
[tmpq+10*%%str], m0 2722cabdff1aSopenharmony_ci mova [tmpq+14*%%str], m1 2723cabdff1aSopenharmony_ci 2724cabdff1aSopenharmony_ci mova m1, [tmpq+22*%%str] ; t10 2725cabdff1aSopenharmony_ci mova m0, [tmpq+18*%%str] ; t11 2726cabdff1aSopenharmony_ci mova m3, [tmpq+17*%%str] ; t20 2727cabdff1aSopenharmony_ci mova m2, [tmpq+ 1*%%str] ; t21 2728cabdff1aSopenharmony_ci SUMSUB_BA w, 3, 0, 6 2729cabdff1aSopenharmony_ci SUMSUB_BA w, 2, 1, 6 2730cabdff1aSopenharmony_ci mova [tmpq+18*%%str], m0 2731cabdff1aSopenharmony_ci mova [tmpq+22*%%str], m1 2732cabdff1aSopenharmony_ci 2733cabdff1aSopenharmony_ci mova m7, [tmpq+30*%%str] ; t8 2734cabdff1aSopenharmony_ci mova m6, [tmpq+26*%%str] ; t9 2735cabdff1aSopenharmony_ci mova m1, [tmpq+25*%%str] ; t22 2736cabdff1aSopenharmony_ci mova m0, [tmpq+ 5*%%str] ; t23 2737cabdff1aSopenharmony_ci SUMSUB_BADC w, 1, 6, 0, 7 2738cabdff1aSopenharmony_ci mova [tmpq+26*%%str], m6 2739cabdff1aSopenharmony_ci mova [tmpq+30*%%str], m7 2740cabdff1aSopenharmony_ci 2741cabdff1aSopenharmony_ci mova m7, [tmpq+29*%%str] 2742cabdff1aSopenharmony_ci TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [tmpq+21*%%str], [tmpq+17*%%str], 1 2743cabdff1aSopenharmony_ci mova [tmpq+ 1*%%str], m0 2744cabdff1aSopenharmony_ci mova [tmpq+ 5*%%str], m1 2745cabdff1aSopenharmony_ci mova [tmpq+ 9*%%str], m2 2746cabdff1aSopenharmony_ci mova [tmpq+13*%%str], m3 2747cabdff1aSopenharmony_ci mova [tmpq+21*%%str], m5 2748cabdff1aSopenharmony_ci mova [tmpq+25*%%str], m6 2749cabdff1aSopenharmony_ci mova [tmpq+29*%%str], m7 2750cabdff1aSopenharmony_ci 2751cabdff1aSopenharmony_ci mova m0, [tmpq+ 2*%%str] 2752cabdff1aSopenharmony_ci mova m1, [tmpq+ 6*%%str] 2753cabdff1aSopenharmony_ci mova m2, [tmpq+10*%%str] 2754cabdff1aSopenharmony_ci mova m3, [tmpq+14*%%str] 2755cabdff1aSopenharmony_ci mova m4, [tmpq+18*%%str] 2756cabdff1aSopenharmony_ci mova m5, [tmpq+22*%%str] 2757cabdff1aSopenharmony_ci mova m7, [tmpq+30*%%str] 2758cabdff1aSopenharmony_ci TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [tmpq+26*%%str], 
[tmpq+18*%%str], 1 2759cabdff1aSopenharmony_ci mova [tmpq+ 2*%%str], m0 2760cabdff1aSopenharmony_ci mova [tmpq+ 6*%%str], m1 2761cabdff1aSopenharmony_ci mova [tmpq+10*%%str], m2 2762cabdff1aSopenharmony_ci mova [tmpq+14*%%str], m3 2763cabdff1aSopenharmony_ci mova [tmpq+22*%%str], m5 2764cabdff1aSopenharmony_ci mova [tmpq+26*%%str], m6 2765cabdff1aSopenharmony_ci mova [tmpq+30*%%str], m7 2766cabdff1aSopenharmony_ci%endif 2767cabdff1aSopenharmony_ci%else 2768cabdff1aSopenharmony_ci ; t0-7 is in [tmpq+{0,4,8,12,16,20,24,28}*%%str] 2769cabdff1aSopenharmony_ci ; t8-15 is in [tmpq+{2,6,10,14,18,22,26,30}*%%str] 2770cabdff1aSopenharmony_ci ; t16-19 and t23 is in [tmpq+{1,5,9,13,29}*%%str] 2771cabdff1aSopenharmony_ci ; t20-22 is in m4-6 2772cabdff1aSopenharmony_ci ; t24-31 is in m8-15 2773cabdff1aSopenharmony_ci 2774cabdff1aSopenharmony_ci%if cpuflag(ssse3) 2775cabdff1aSopenharmony_ci%define ROUND_REG [pw_512] 2776cabdff1aSopenharmony_ci%else 2777cabdff1aSopenharmony_ci%define ROUND_REG [pw_32] 2778cabdff1aSopenharmony_ci%endif 2779cabdff1aSopenharmony_ci 2780cabdff1aSopenharmony_ci%macro %%STORE_2X2 7-8 1 ; src[1-4], tmp[1-2], zero, inc_dst_ptrs 2781cabdff1aSopenharmony_ci SUMSUB_BA w, %4, %1, %5 2782cabdff1aSopenharmony_ci SUMSUB_BA w, %3, %2, %5 2783cabdff1aSopenharmony_ci VP9_IDCT8_WRITEx2 %4, %3, %5, %6, %7, ROUND_REG, 6 2784cabdff1aSopenharmony_ci%if %8 == 1 2785cabdff1aSopenharmony_ci add dstq, stride2q 2786cabdff1aSopenharmony_ci%endif 2787cabdff1aSopenharmony_ci VP9_IDCT8_WRITEx2 %2, %1, %5, %6, %7, ROUND_REG, 6, dst_endq 2788cabdff1aSopenharmony_ci%if %8 == 1 2789cabdff1aSopenharmony_ci sub dst_endq, stride2q 2790cabdff1aSopenharmony_ci%endif 2791cabdff1aSopenharmony_ci%endmacro 2792cabdff1aSopenharmony_ci 2793cabdff1aSopenharmony_ci%if ARCH_X86_64 2794cabdff1aSopenharmony_ci pxor m10, m10 2795cabdff1aSopenharmony_ci 2796cabdff1aSopenharmony_ci ; store t0-1 and t30-31 2797cabdff1aSopenharmony_ci mova m8, [tmpq+ 0*%%str] 2798cabdff1aSopenharmony_ci mova m9, [tmpq+ 
4*%%str] 2799cabdff1aSopenharmony_ci %%STORE_2X2 8, 9, 0, 6, 12, 11, 10 2800cabdff1aSopenharmony_ci 2801cabdff1aSopenharmony_ci ; store t2-3 and t28-29 2802cabdff1aSopenharmony_ci mova m8, [tmpq+ 8*%%str] 2803cabdff1aSopenharmony_ci mova m9, [tmpq+12*%%str] 2804cabdff1aSopenharmony_ci %%STORE_2X2 8, 9, 14, 15, 12, 11, 10 2805cabdff1aSopenharmony_ci 2806cabdff1aSopenharmony_ci ; store t4-5 and t26-27 2807cabdff1aSopenharmony_ci mova m8, [tmpq+16*%%str] 2808cabdff1aSopenharmony_ci mova m9, [tmpq+20*%%str] 2809cabdff1aSopenharmony_ci %%STORE_2X2 8, 9, 7, 1, 12, 11, 10 2810cabdff1aSopenharmony_ci 2811cabdff1aSopenharmony_ci ; store t6-7 and t24-25 2812cabdff1aSopenharmony_ci mova m8, [tmpq+24*%%str] 2813cabdff1aSopenharmony_ci mova m9, [tmpq+28*%%str] 2814cabdff1aSopenharmony_ci %%STORE_2X2 8, 9, 4, 5, 12, 11, 10 2815cabdff1aSopenharmony_ci 2816cabdff1aSopenharmony_ci ; store t8-9 and t22-23 2817cabdff1aSopenharmony_ci mova m8, [tmpq+30*%%str] 2818cabdff1aSopenharmony_ci mova m9, [tmpq+26*%%str] 2819cabdff1aSopenharmony_ci mova m0, [tmpq+ 5*%%str] 2820cabdff1aSopenharmony_ci %%STORE_2X2 8, 9, 13, 0, 12, 11, 10 2821cabdff1aSopenharmony_ci 2822cabdff1aSopenharmony_ci ; store t10-11 and t20-21 2823cabdff1aSopenharmony_ci mova m8, [tmpq+22*%%str] 2824cabdff1aSopenharmony_ci mova m9, [tmpq+18*%%str] 2825cabdff1aSopenharmony_ci %%STORE_2X2 8, 9, 2, 3, 12, 11, 10 2826cabdff1aSopenharmony_ci 2827cabdff1aSopenharmony_ci ; store t12-13 and t18-19 2828cabdff1aSopenharmony_ci mova m8, [tmpq+14*%%str] 2829cabdff1aSopenharmony_ci mova m9, [tmpq+10*%%str] 2830cabdff1aSopenharmony_ci mova m5, [tmpq+13*%%str] 2831cabdff1aSopenharmony_ci mova m4, [tmpq+ 9*%%str] 2832cabdff1aSopenharmony_ci %%STORE_2X2 8, 9, 4, 5, 12, 11, 10 2833cabdff1aSopenharmony_ci 2834cabdff1aSopenharmony_ci ; store t14-17 2835cabdff1aSopenharmony_ci mova m8, [tmpq+ 6*%%str] 2836cabdff1aSopenharmony_ci mova m9, [tmpq+ 2*%%str] 2837cabdff1aSopenharmony_ci mova m5, [tmpq+29*%%str] 2838cabdff1aSopenharmony_ci mova m4, 
[tmpq+21*%%str] 2839cabdff1aSopenharmony_ci %%STORE_2X2 8, 9, 4, 5, 12, 11, 10, 0 2840cabdff1aSopenharmony_ci 2841cabdff1aSopenharmony_ci SWAP 1, 10 ; zero 2842cabdff1aSopenharmony_ci%else 2843cabdff1aSopenharmony_ci mova [tmpq+ 1*%%str], m1 2844cabdff1aSopenharmony_ci mova [tmpq+11*%%str], m2 2845cabdff1aSopenharmony_ci mova [tmpq+15*%%str], m3 2846cabdff1aSopenharmony_ci mova [tmpq+17*%%str], m4 2847cabdff1aSopenharmony_ci mova [tmpq+19*%%str], m5 2848cabdff1aSopenharmony_ci pxor m1, m1 2849cabdff1aSopenharmony_ci 2850cabdff1aSopenharmony_ci ; store t0-1 and t30-31 2851cabdff1aSopenharmony_ci mova m2, [tmpq+ 0*%%str] 2852cabdff1aSopenharmony_ci mova m3, [tmpq+ 4*%%str] 2853cabdff1aSopenharmony_ci %%STORE_2X2 2, 3, 0, 6, 4, 5, 1 2854cabdff1aSopenharmony_ci 2855cabdff1aSopenharmony_ci ; store t2-3 and t28-29 2856cabdff1aSopenharmony_ci mova m2, [tmpq+ 8*%%str] 2857cabdff1aSopenharmony_ci mova m3, [tmpq+12*%%str] 2858cabdff1aSopenharmony_ci mova m0, [tmpq+ 3*%%str] 2859cabdff1aSopenharmony_ci mova m6, [tmpq+ 7*%%str] 2860cabdff1aSopenharmony_ci %%STORE_2X2 2, 3, 0, 6, 4, 5, 1 2861cabdff1aSopenharmony_ci 2862cabdff1aSopenharmony_ci ; store t4-5 and t26-27 2863cabdff1aSopenharmony_ci mova m2, [tmpq+16*%%str] 2864cabdff1aSopenharmony_ci mova m3, [tmpq+20*%%str] 2865cabdff1aSopenharmony_ci mova m0, [tmpq+ 1*%%str] 2866cabdff1aSopenharmony_ci %%STORE_2X2 2, 3, 7, 0, 4, 5, 1 2867cabdff1aSopenharmony_ci 2868cabdff1aSopenharmony_ci ; store t6-7 and t24-25 2869cabdff1aSopenharmony_ci mova m2, [tmpq+24*%%str] 2870cabdff1aSopenharmony_ci mova m3, [tmpq+28*%%str] 2871cabdff1aSopenharmony_ci mova m0, [tmpq+17*%%str] 2872cabdff1aSopenharmony_ci mova m6, [tmpq+19*%%str] 2873cabdff1aSopenharmony_ci %%STORE_2X2 2, 3, 0, 6, 4, 5, 1 2874cabdff1aSopenharmony_ci 2875cabdff1aSopenharmony_ci ; store t8-9 and t22-23 2876cabdff1aSopenharmony_ci mova m2, [tmpq+30*%%str] 2877cabdff1aSopenharmony_ci mova m3, [tmpq+26*%%str] 2878cabdff1aSopenharmony_ci mova m0, [tmpq+25*%%str] 
; VP9_IDCT32_1D (continued): remaining stores of the non-x86-64 branch,
; starting with the last load of the "t8-9 and t22-23" group begun above
    mova                m6, [tmpq+ 5*%%str]
    %%STORE_2X2          2, 3, 0, 6, 4, 5, 1

    ; store t10-11 and t20-21
    mova                m2, [tmpq+22*%%str]
    mova                m3, [tmpq+18*%%str]
    mova                m0, [tmpq+11*%%str]
    mova                m6, [tmpq+15*%%str]
    %%STORE_2X2          2, 3, 0, 6, 4, 5, 1

    ; store t12-13 and t18-19
    mova                m2, [tmpq+14*%%str]
    mova                m3, [tmpq+10*%%str]
    mova                m6, [tmpq+13*%%str]
    mova                m0, [tmpq+ 9*%%str]
    %%STORE_2X2          2, 3, 0, 6, 4, 5, 1

    ; store t14-17
    mova                m2, [tmpq+ 6*%%str]
    mova                m3, [tmpq+ 2*%%str]
    mova                m6, [tmpq+29*%%str]
    mova                m0, [tmpq+21*%%str]
    %%STORE_2X2          2, 3, 0, 6, 4, 5, 1, 0
%endif
%undef ROUND_REG
%endif
%endmacro

;-----------------------------------------------------------------------------
; VP9_IDCT_IDCT_32x32_ADD_XMM cpu
;
; Emits vp9_idct_idct_32x32_add(dst, stride, block, eob) for the XMM
; instruction set named by %1.
;
; Path selection on eob:
;   ssse3 and later: eob > 135 -> .idctfull
;                    eob >  34 -> .idct16x16
;                    eob >   1 -> .idct8x8
;                    otherwise    dc-only
;   otherwise:       eob >   1 -> .idctfull, otherwise dc-only
;
; dc-only: the first coefficient is scaled by 11585 twice (pmulhrsw with
; pw_11585x2, or imul/add 8192/sar 14 in the scalar path), rounded down by a
; further 6 bits (pmulhrsw with pw_512, or the (32 << 14) bias and the
; 14 + 6 shift), splatted to all words and added to 32 rows through
; VP9_STORE_2XFULL. The coefficient is cleared with a 4-byte store.
;
; Other paths: VP9_IDCT32_1D is invoked with pass argument 1 on block, with
; tmp pointing at rsp, and then with pass argument 2 on tmp while dst/dst_end
; point at the current column group and at 30*stride below it. The full first
; pass advances tmp by 512 bytes four times, i.e. it spans the 2048 bytes given
; on the cglobal line (the stack size in x86inc's cglobal syntax). The second
; pass handles 8 output bytes per iteration (dst_bak += 8, tmp += 16).
; Finally ZERO_BLOCK clears the coefficients that were read.
;
; On x86-32 arguments are reloaded from their stack slots (r0mp..r3m) and
; DEFINE_ARGS is re-issued between phases; the second-pass loop counter lives
; in the eob stack slot (cntd = dword r3m) and dst_bak in the dst slot (r0mp).
;-----------------------------------------------------------------------------
%macro VP9_IDCT_IDCT_32x32_ADD_XMM 1
INIT_XMM %1
cglobal vp9_idct_idct_32x32_add, 0, 6 + ARCH_X86_64 * 3, 16, 2048, dst, stride, block, eob
    movifnidn         eobd, dword eobm
%if cpuflag(ssse3)
    cmp               eobd, 135
    jg .idctfull
    cmp               eobd, 34
    jg .idct16x16
    cmp               eobd, 1
    jg .idct8x8
%else
    cmp               eobd, 1
    jg .idctfull
%endif

    ; dc-only case
    movifnidn       blockq, blockmp
    movifnidn         dstq, dstmp
    movifnidn      strideq, stridemp
%if cpuflag(ssse3)
    movd                m0, [blockq]
    mova                m1, [pw_11585x2]
    pmulhrsw            m0, m1
    pmulhrsw            m0, m1
%else
    DEFINE_ARGS dst, stride, block, coef
    movsx            coefd, word [blockq]
    imul             coefd, 11585
    add              coefd, 8192
    sar              coefd, 14
    imul             coefd, 11585
    add              coefd, (32 << 14) + 8192   ; second rounding plus final +32
    sar              coefd, 14 + 6
    movd                m0, coefd
%endif
    SPLATW              m0, m0, q0000
%if cpuflag(ssse3)
    pmulhrsw            m0, [pw_512]
%endif
    pxor                m5, m5
    movd          [blockq], m5                  ; clear the dc coefficient
    ; 31 + 1 invocations, advancing dst by one row in between
%rep 31
    VP9_STORE_2XFULL    0, 1, 2, 3, 4, 5, mmsize
    add               dstq, strideq
%endrep
    VP9_STORE_2XFULL    0, 1, 2, 3, 4, 5, mmsize
    RET

%if ARCH_X86_64
    DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2, tmp
%else
%define dst_bakq r0mp
%endif
%if cpuflag(ssse3)
.idct8x8:
%if ARCH_X86_32
    DEFINE_ARGS block, u1, u2, u3, u4, tmp
    mov             blockq, r2mp
%endif
    mov               tmpq, rsp
    VP9_IDCT32_1D   blockq, 1, 8

%if ARCH_X86_32
    DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
    mov            strideq, r1mp
%define cntd dword r3m
%endif
    mov          stride30q, strideq         ; stride
    lea           stride2q, [strideq*2]     ; stride*2
    shl          stride30q, 5               ; stride*32
    mov               cntd, 4
    sub          stride30q, stride2q        ; stride*30
.loop2_8x8:
    mov               dstq, dst_bakq
    lea           dst_endq, [dstq+stride30q]
    VP9_IDCT32_1D     tmpq, 2, 8
    add           dst_bakq, 8
    add               tmpq, 16
    dec               cntd
    jg .loop2_8x8

    ; at the end of the loop, m1 should still be zero
    ; use that to zero out block coefficients
%if ARCH_X86_32
    DEFINE_ARGS block
    mov             blockq, r2mp
%endif
    ZERO_BLOCK      blockq, 64,  8, m1
    RET

.idct16x16:
%if ARCH_X86_32
    DEFINE_ARGS block, tmp, cnt
    mov             blockq, r2mp
%endif
    mov               cntd, 2
    mov               tmpq, rsp
.loop1_16x16:
    VP9_IDCT32_1D   blockq, 1, 16
    add             blockq, 16
    add               tmpq, 512
    dec               cntd
    jg .loop1_16x16

%if ARCH_X86_64
    sub             blockq, 32              ; undo the 2 * 16 advance above
%else
    DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
    mov            strideq, r1mp
%define cntd dword r3m
%endif

    mov          stride30q, strideq         ; stride
    lea           stride2q, [strideq*2]     ; stride*2
    shl          stride30q, 5               ; stride*32
    mov               cntd, 4
    mov               tmpq, rsp
    sub          stride30q, stride2q        ; stride*30
.loop2_16x16:
    mov               dstq, dst_bakq
    lea           dst_endq, [dstq+stride30q]
    VP9_IDCT32_1D     tmpq, 2, 16
    add           dst_bakq, 8
    add               tmpq, 16
    dec               cntd
    jg .loop2_16x16

    ; at the end of the loop, m1 should still be zero
    ; use that to zero out block coefficients
%if ARCH_X86_32
    DEFINE_ARGS block
    mov             blockq, r2mp
%endif
    ZERO_BLOCK      blockq, 64, 16, m1
    RET
%endif

.idctfull:
%if ARCH_X86_32
    DEFINE_ARGS block, tmp, cnt
    mov             blockq, r2mp
%endif
    mov               cntd, 4
    mov               tmpq, rsp
.loop1_full:
    VP9_IDCT32_1D   blockq, 1
    add             blockq, 16
    add               tmpq, 512
    dec               cntd
    jg .loop1_full

%if ARCH_X86_64
    sub             blockq, 64              ; undo the 4 * 16 advance above
%else
    DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
    mov            strideq, r1mp
%define cntd dword r3m
%endif

    mov          stride30q, strideq         ; stride
    lea           stride2q, [strideq*2]     ; stride*2
    shl          stride30q, 5               ; stride*32
    mov               cntd, 4
    mov               tmpq, rsp
    sub          stride30q, stride2q        ; stride*30
.loop2_full:
    mov               dstq, dst_bakq
    lea           dst_endq, [dstq+stride30q]
    VP9_IDCT32_1D     tmpq, 2
    add           dst_bakq, 8
    add               tmpq, 16
    dec               cntd
    jg .loop2_full

    ; at the end of the loop, m1 should still be zero
    ; use that to zero out block coefficients
%if ARCH_X86_32
    DEFINE_ARGS block
    mov             blockq, r2mp
%endif
    ZERO_BLOCK      blockq, 64, 32, m1
    RET
%endmacro

VP9_IDCT_IDCT_32x32_ADD_XMM sse2
VP9_IDCT_IDCT_32x32_ADD_XMM ssse3
VP9_IDCT_IDCT_32x32_ADD_XMM avx

; this is almost identical to VP9_STORE_2X, but it does two rows
; for slightly improved interleaving, and it omits vpermq since the
; input is DC so all values are identical
;
; Arguments are register numbers:
;   %1      words to add to every pixel (read only)
;   %2-%5   temporaries, overwritten
;   %6      register holding zero, used to widen bytes to words (read only)
; Reads and rewrites one full vector at [dstq] and at [dstq+strideq]; dstq is
; not advanced. The sums are packed back to bytes with unsigned saturation.
; NOTE(review): mova is used on dst, so rows are presumably expected to be
; mmsize-aligned -- confirm against the callers.
%macro VP9_STORE_YMM_DC_2X2 6 ; reg, tmp1, tmp2, tmp3, tmp4, zero
    mova               m%2, [dstq]
    mova               m%4, [dstq+strideq]
    punpckhbw          m%3, m%2, m%6
    punpcklbw          m%2, m%6
    punpckhbw          m%5, m%4, m%6
    punpcklbw          m%4, m%6
    paddw              m%3, m%1
    paddw              m%2, m%1
    paddw              m%5, m%1
    paddw              m%4, m%1
    packuswb           m%2, m%3
    packuswb           m%4, m%5
    mova  [dstq+strideq*0], m%2
    mova  [dstq+strideq*1], m%4
%endmacro

;-----------------------------------------------------------------------------
; vp9_idct_idct_32x32_add(dst, stride, block, eob), AVX2 (YMM) version,
; built only when ARCH_X86_64 && HAVE_AVX2_EXTERNAL.
;
; Path selection on eob: > 135 -> .idctfull, > 1 -> .idct16x16,
; otherwise dc-only.
;
; dc-only: the first coefficient is broadcast, scaled with pw_11585x2 twice
; and with pw_512 once (pmulhrsw), the coefficient is cleared with a 4-byte
; store, and the value is added to 16 * 2 = 32 rows by VP9_STORE_YMM_DC_2X2.
;
; Other paths: VP9_IDCT32_1D is invoked with pass argument 1 on block, with
; tmp pointing at rsp, and then with pass argument 2 on tmp while dst/dst_end
; point at the current column group and at 30*stride below it. The full first
; pass advances tmp by 1024 bytes twice, i.e. it spans the 2048 bytes given on
; the cglobal line (the stack size in x86inc's cglobal syntax). The second
; pass handles 16 output bytes per iteration (dst_bak += 16, tmp += 32).
; Finally ZERO_BLOCK clears the coefficients that were read.
;-----------------------------------------------------------------------------
%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
INIT_YMM avx2
cglobal vp9_idct_idct_32x32_add, 4, 9, 16, 2048, dst, stride, block, eob
    cmp               eobd, 135
    jg .idctfull
    cmp               eobd, 1
    jg .idct16x16

    ; dc-only case
    mova                m1, [pw_11585x2]
    vpbroadcastw        m0, [blockq]
    pmulhrsw            m0, m1
    pmulhrsw            m0, m1
    pxor                m5, m5
    pmulhrsw            m0, [pw_512]
    movd          [blockq], xm5             ; clear the dc coefficient

    DEFINE_ARGS dst, stride, cnt
    mov               cntd, 16              ; 16 iterations of two rows each
.loop_dc:
    VP9_STORE_YMM_DC_2X2 0, 1, 2, 3, 4, 5
    lea               dstq, [dstq+2*strideq]
    dec               cntd
    jg .loop_dc
    RET

    DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2, tmp
.idct16x16:
    mov               tmpq, rsp
    VP9_IDCT32_1D   blockq, 1, 16

    mov          stride30q, strideq         ; stride
    lea           stride2q, [strideq*2]     ; stride*2
    shl          stride30q, 5               ; stride*32
    mov               cntd, 2
    sub          stride30q, stride2q        ; stride*30
.loop2_16x16:
    mov               dstq, dst_bakq
    lea           dst_endq, [dstq+stride30q]
    VP9_IDCT32_1D     tmpq, 2, 16
    add           dst_bakq, 16
    add               tmpq, 32
    dec               cntd
    jg .loop2_16x16

    ; at the end of the loop, m1 should still be zero
    ; use that to zero out block coefficients
    ZERO_BLOCK      blockq, 64, 16, m1
    RET

.idctfull:
    mov               cntd, 2
    mov               tmpq, rsp
.loop1_full:
    VP9_IDCT32_1D   blockq, 1
    add             blockq, 32
    add               tmpq, 1024
    dec               cntd
    jg .loop1_full

    sub             blockq, 64              ; undo the 2 * 32 advance above

    mov          stride30q, strideq         ; stride
    lea           stride2q, [strideq*2]     ; stride*2
    shl          stride30q, 5               ; stride*32
    mov               cntd, 2
    mov               tmpq, rsp
    sub          stride30q, stride2q        ; stride*30
.loop2_full:
    mov               dstq, dst_bakq
    lea           dst_endq, [dstq+stride30q]
    VP9_IDCT32_1D     tmpq, 2
    add           dst_bakq, 16
    add               tmpq, 32
    dec               cntd
    jg .loop2_full

    ; at the end of the loop, m1 should still be zero
    ; use that to zero out block coefficients
    ZERO_BLOCK      blockq, 64, 32, m1
    RET
%endif