1cabdff1aSopenharmony_ci;******************************************************************************
2cabdff1aSopenharmony_ci;* VP9 IDCT SIMD optimizations
3cabdff1aSopenharmony_ci;*
4cabdff1aSopenharmony_ci;* Copyright (C) 2013 Clément Bœsch <u pkh me>
5cabdff1aSopenharmony_ci;* Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
6cabdff1aSopenharmony_ci;*
7cabdff1aSopenharmony_ci;* This file is part of FFmpeg.
8cabdff1aSopenharmony_ci;*
9cabdff1aSopenharmony_ci;* FFmpeg is free software; you can redistribute it and/or
10cabdff1aSopenharmony_ci;* modify it under the terms of the GNU Lesser General Public
11cabdff1aSopenharmony_ci;* License as published by the Free Software Foundation; either
12cabdff1aSopenharmony_ci;* version 2.1 of the License, or (at your option) any later version.
13cabdff1aSopenharmony_ci;*
14cabdff1aSopenharmony_ci;* FFmpeg is distributed in the hope that it will be useful,
15cabdff1aSopenharmony_ci;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16cabdff1aSopenharmony_ci;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17cabdff1aSopenharmony_ci;* Lesser General Public License for more details.
18cabdff1aSopenharmony_ci;*
19cabdff1aSopenharmony_ci;* You should have received a copy of the GNU Lesser General Public
20cabdff1aSopenharmony_ci;* License along with FFmpeg; if not, write to the Free Software
21cabdff1aSopenharmony_ci;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22cabdff1aSopenharmony_ci;******************************************************************************
23cabdff1aSopenharmony_ci
24cabdff1aSopenharmony_ci%include "libavutil/x86/x86util.asm"
25cabdff1aSopenharmony_ci%include "vp9itxfm_template.asm"
26cabdff1aSopenharmony_ci
27cabdff1aSopenharmony_ciSECTION_RODATA 32
28cabdff1aSopenharmony_ci
; VP9_IDCT_COEFFS a, b [, variants=0]
; Emits the 16-bit constant tables for the coefficient pair (a, b):
;   pw_m<a>_<b>   : 8 word pairs (-a,  b)
;   pw_<b>_<a>    : 8 word pairs ( b,  a)
; When the optional third argument is 1 it additionally emits
;   pw_m<b>_m<a>  : 8 word pairs (-b, -a)
; and, unless a == b,
;   pw_m<b>_<a>   : 8 word pairs (-b,  a)
;   pw_<a>_<b>    : 8 word pairs ( a,  b)
; Finally it emits 16-word tables of doubled values (this file uses the x2
; tables as pmulhrsw multipliers):
;   a <  11585 -> pw_m<a>x2 holding -2*a
;   a >  11585 -> pw_<a>x2  holding  2*a
;   a == 11585 -> pw_<a>x2  holding  2*a, declared through `const`
;   b != a     -> pw_<b>x2  holding  2*b
; Note that only the 11585 doubled table goes through `const`; the other x2
; tables are plain labels.
29cabdff1aSopenharmony_ci%macro VP9_IDCT_COEFFS 2-3 0
30cabdff1aSopenharmony_ciconst pw_m%1_%2
31cabdff1aSopenharmony_citimes 8 dw -%1,  %2
32cabdff1aSopenharmony_ciconst pw_%2_%1
33cabdff1aSopenharmony_citimes 8 dw  %2,  %1
34cabdff1aSopenharmony_ci
; optional sign/order variants, requested with a third argument of 1
35cabdff1aSopenharmony_ci%if %3 == 1
36cabdff1aSopenharmony_ciconst pw_m%2_m%1
37cabdff1aSopenharmony_citimes 8 dw -%2, -%1
; for a == b these two names would duplicate tables already emitted above
38cabdff1aSopenharmony_ci%if %1 != %2
39cabdff1aSopenharmony_ciconst pw_m%2_%1
40cabdff1aSopenharmony_citimes 8 dw -%2,  %1
41cabdff1aSopenharmony_ciconst pw_%1_%2
42cabdff1aSopenharmony_citimes 8 dw  %1,  %2
43cabdff1aSopenharmony_ci%endif
44cabdff1aSopenharmony_ci%endif
45cabdff1aSopenharmony_ci
; doubled first coefficient: stored negated when it is below 11585
46cabdff1aSopenharmony_ci%if %1 < 11585
47cabdff1aSopenharmony_cipw_m%1x2:   times 16 dw -%1*2
48cabdff1aSopenharmony_ci%elif %1 > 11585
49cabdff1aSopenharmony_cipw_%1x2:    times 16 dw  %1*2
50cabdff1aSopenharmony_ci%else
51cabdff1aSopenharmony_ciconst pw_%1x2
52cabdff1aSopenharmony_citimes 16 dw %1*2
53cabdff1aSopenharmony_ci%endif
54cabdff1aSopenharmony_ci
; doubled second coefficient, skipped when it would redefine the label above
55cabdff1aSopenharmony_ci%if %2 != %1
56cabdff1aSopenharmony_cipw_%2x2:    times 16 dw  %2*2
57cabdff1aSopenharmony_ci%endif
58cabdff1aSopenharmony_ci%endmacro
59cabdff1aSopenharmony_ci
; Coefficient pairs instantiated through VP9_IDCT_COEFFS. A third argument of 1
; additionally requests the sign/order variants of the pair.
; NOTE(review): every pair satisfies a^2 + b^2 ~= 2^28, so these look like
; cosine/sine values in Q14 fixed point -- confirm against the C reference.
60cabdff1aSopenharmony_ciVP9_IDCT_COEFFS 16364,   804
61cabdff1aSopenharmony_ciVP9_IDCT_COEFFS 16305,  1606
62cabdff1aSopenharmony_ciVP9_IDCT_COEFFS 16069,  3196, 1
63cabdff1aSopenharmony_ciVP9_IDCT_COEFFS 15893,  3981
64cabdff1aSopenharmony_ciVP9_IDCT_COEFFS 15137,  6270, 1
65cabdff1aSopenharmony_ciVP9_IDCT_COEFFS 14811,  7005
66cabdff1aSopenharmony_ciVP9_IDCT_COEFFS 14449,  7723
67cabdff1aSopenharmony_ciVP9_IDCT_COEFFS 13160,  9760
68cabdff1aSopenharmony_ciVP9_IDCT_COEFFS 11585, 11585, 1
69cabdff1aSopenharmony_ciVP9_IDCT_COEFFS 11003, 12140
70cabdff1aSopenharmony_ciVP9_IDCT_COEFFS 10394, 12665
71cabdff1aSopenharmony_ciVP9_IDCT_COEFFS  9102, 13623, 1
72cabdff1aSopenharmony_ciVP9_IDCT_COEFFS  8423, 14053
73cabdff1aSopenharmony_ciVP9_IDCT_COEFFS  5520, 15426
74cabdff1aSopenharmony_ciVP9_IDCT_COEFFS  4756, 15679
75cabdff1aSopenharmony_ciVP9_IDCT_COEFFS  2404, 16207
76cabdff1aSopenharmony_ci
; Hand-written tables: 4 word pairs (16 bytes) each, except pw_13377x2 which
; holds 8 copies of the single doubled word 2*13377.
77cabdff1aSopenharmony_ciconst pw_5283_13377
78cabdff1aSopenharmony_citimes 4 dw 5283, 13377
79cabdff1aSopenharmony_ciconst pw_9929_13377
80cabdff1aSopenharmony_citimes 4 dw 9929, 13377
81cabdff1aSopenharmony_ciconst pw_15212_m13377
82cabdff1aSopenharmony_citimes 4 dw 15212, -13377
83cabdff1aSopenharmony_ciconst pw_15212_9929
84cabdff1aSopenharmony_citimes 4 dw 15212, 9929
85cabdff1aSopenharmony_ciconst pw_m5283_m15212
86cabdff1aSopenharmony_citimes 4 dw -5283, -15212
87cabdff1aSopenharmony_ciconst pw_13377x2
88cabdff1aSopenharmony_citimes 8 dw 13377*2
89cabdff1aSopenharmony_ciconst pw_m13377_13377
90cabdff1aSopenharmony_citimes 4 dw -13377, 13377
91cabdff1aSopenharmony_ciconst pw_13377_0
92cabdff1aSopenharmony_citimes 4 dw 13377, 0
93cabdff1aSopenharmony_ci
; Constants that are not defined in this file; they are declared as external
; symbols through cextern.
94cabdff1aSopenharmony_cicextern pw_8
95cabdff1aSopenharmony_cicextern pw_16
96cabdff1aSopenharmony_cicextern pw_32
97cabdff1aSopenharmony_cicextern pw_512
98cabdff1aSopenharmony_cicextern pw_1024
99cabdff1aSopenharmony_cicextern pw_2048
100cabdff1aSopenharmony_cicextern pw_m1
101cabdff1aSopenharmony_cicextern pd_8192
102cabdff1aSopenharmony_ci
103cabdff1aSopenharmony_ciSECTION .text
104cabdff1aSopenharmony_ci
; VP9_UNPACK_MULSUB_2D_4X: word-interleaves src2 with src1 and multiplies the
; pairs with pmaddwd, leaving 32-bit results that are neither rounded nor
; shifted (arguments are register numbers, mul1/mul2 are literal coefficients):
;   dst1 = src1*mul2 - src2*mul1   (low-half words)
;   dst2 = src1*mul1 + src2*mul2   (low-half words)
;   dst3 = src1*mul2 - src2*mul1   (high-half words)
;   dst4 = src1*mul1 + src2*mul2   (high-half words)
; The tables pw_m<mul1>_<mul2> and pw_<mul2>_<mul1> must exist for the pair.
105cabdff1aSopenharmony_ci%macro VP9_UNPACK_MULSUB_2D_4X 6 ; dst1 [src1], dst2 [src2], dst3, dst4, mul1, mul2
106cabdff1aSopenharmony_ci    punpckhwd          m%4, m%2, m%1            ; high halves: (src2, src1) word pairs
107cabdff1aSopenharmony_ci    punpcklwd          m%2, m%1                 ; low halves:  (src2, src1) word pairs
108cabdff1aSopenharmony_ci    pmaddwd            m%3, m%4, [pw_m%5_%6]    ; high: src1*mul2 - src2*mul1
109cabdff1aSopenharmony_ci    pmaddwd            m%4, [pw_%6_%5]          ; high: src1*mul1 + src2*mul2
110cabdff1aSopenharmony_ci    pmaddwd            m%1, m%2, [pw_m%5_%6]    ; low:  src1*mul2 - src2*mul1
111cabdff1aSopenharmony_ci    pmaddwd            m%2, [pw_%6_%5]          ; low:  src1*mul1 + src2*mul2
112cabdff1aSopenharmony_ci%endmacro
113cabdff1aSopenharmony_ci
; VP9_RND_SH_SUMSUB_BA: dword butterfly on two register pairs, followed by
; rounding, a 14-bit arithmetic right shift and a signed-saturating pack to
; words. The first five arguments are register numbers; `round` is used as a
; complete operand (register name or memory reference) added to every dword.
; Per the register comments on other SUMSUB_BA uses in this file, SUMSUB_BA
; leaves the sum in its first register and (second - first) in its second.
; Result: dst1 = pack(dst1-part, src3-part), dst2 = pack(dst2-part, src4-part);
; src3/src4 and tmp are clobbered.
114cabdff1aSopenharmony_ci%macro VP9_RND_SH_SUMSUB_BA 6 ; dst1 [src1], dst2 [src2], src3, src4, tmp, round
115cabdff1aSopenharmony_ci    SUMSUB_BA            d, %1, %2, %5
116cabdff1aSopenharmony_ci    SUMSUB_BA            d, %3, %4, %5
117cabdff1aSopenharmony_ci    paddd              m%1, %6                  ; add the rounding term before shifting
118cabdff1aSopenharmony_ci    paddd              m%2, %6
119cabdff1aSopenharmony_ci    paddd              m%3, %6
120cabdff1aSopenharmony_ci    paddd              m%4, %6
121cabdff1aSopenharmony_ci    psrad              m%1, 14                  ; drop the 14 fractional bits
122cabdff1aSopenharmony_ci    psrad              m%2, 14
123cabdff1aSopenharmony_ci    psrad              m%3, 14
124cabdff1aSopenharmony_ci    psrad              m%4, 14
125cabdff1aSopenharmony_ci    packssdw           m%1, m%3                 ; dwords -> words, signed saturation
126cabdff1aSopenharmony_ci    packssdw           m%2, m%4
127cabdff1aSopenharmony_ci%endmacro
128cabdff1aSopenharmony_ci
; VP9_STORE_2X: adds two rows of 16-bit residuals (reg1, reg2) to the pixel rows
; at [dst] and [dst+strideq] and stores the sums back as bytes with unsigned
; saturation. dst defaults to dstq; strideq is always taken from the caller.
; tmp1/tmp2 are clobbered. `zero` must name an all-zero register for the
; non-32-byte paths (byte->word unpack and MMX pack); the 32-byte path does
; not read it.
129cabdff1aSopenharmony_ci%macro VP9_STORE_2X 5-6 dstq ; reg1, reg2, tmp1, tmp2, zero, dst
; load both pixel rows and widen the bytes to words
130cabdff1aSopenharmony_ci%if mmsize == 32
131cabdff1aSopenharmony_ci    pmovzxbw           m%3, [%6]
132cabdff1aSopenharmony_ci    pmovzxbw           m%4, [%6+strideq]
133cabdff1aSopenharmony_ci%else
134cabdff1aSopenharmony_ci    movh               m%3, [%6]
135cabdff1aSopenharmony_ci    movh               m%4, [%6+strideq]
136cabdff1aSopenharmony_ci    punpcklbw          m%3, m%5
137cabdff1aSopenharmony_ci    punpcklbw          m%4, m%5
138cabdff1aSopenharmony_ci%endif
139cabdff1aSopenharmony_ci    paddw              m%3, m%1                 ; pixel + residual, row 0
140cabdff1aSopenharmony_ci    paddw              m%4, m%2                 ; pixel + residual, row 1
; narrow back to bytes and store; the layout depends on the register width
141cabdff1aSopenharmony_ci%if mmsize == 32
142cabdff1aSopenharmony_ci    packuswb           m%3, m%4
143cabdff1aSopenharmony_ci    ; Intel...
    ; (the pack above works per 128-bit lane, so the qwords of the two rows
    ; come out interleaved; the permute regroups them as row 0 / row 1)
144cabdff1aSopenharmony_ci    vpermq             m%3, m%3, q3120
145cabdff1aSopenharmony_ci    mova              [%6], xm%3
146cabdff1aSopenharmony_ci    vextracti128 [%6+strideq], m%3, 1
147cabdff1aSopenharmony_ci%elif mmsize == 16
148cabdff1aSopenharmony_ci    packuswb           m%3, m%4                 ; row 0 in the low half, row 1 in the high half
149cabdff1aSopenharmony_ci    movh              [%6], m%3
150cabdff1aSopenharmony_ci    movhps    [%6+strideq], m%3
151cabdff1aSopenharmony_ci%else
152cabdff1aSopenharmony_ci    packuswb           m%3, m%5                 ; each row is packed and stored separately
153cabdff1aSopenharmony_ci    packuswb           m%4, m%5
154cabdff1aSopenharmony_ci    movh              [%6], m%3
155cabdff1aSopenharmony_ci    movh      [%6+strideq], m%4
156cabdff1aSopenharmony_ci%endif
157cabdff1aSopenharmony_ci%endmacro
158cabdff1aSopenharmony_ci
; ZERO_BLOCK: unrolled at assembly time. Stores zero_reg over nnzcpl rows that
; start `stride` bytes apart, covering nnzcpl*2 bytes of each row in
; mmsize-byte stores. `stride` and `nnzcpl` must be assemble-time constants.
; The store count per row uses integer division, so a row narrower than
; mmsize bytes emits no store at all.
159cabdff1aSopenharmony_ci%macro ZERO_BLOCK 4 ; mem, stride, nnzcpl, zero_reg
160cabdff1aSopenharmony_ci%assign %%y 0
161cabdff1aSopenharmony_ci%rep %3
162cabdff1aSopenharmony_ci%assign %%x 0
163cabdff1aSopenharmony_ci%rep %3*2/mmsize
164cabdff1aSopenharmony_ci    mova      [%1+%%y+%%x], %4
165cabdff1aSopenharmony_ci%assign %%x (%%x+mmsize)
166cabdff1aSopenharmony_ci%endrep
167cabdff1aSopenharmony_ci%assign %%y (%%y+%2)
168cabdff1aSopenharmony_ci%endrep
169cabdff1aSopenharmony_ci%endmacro
170cabdff1aSopenharmony_ci
171cabdff1aSopenharmony_ci;-------------------------------------------------------------------------------------------
172cabdff1aSopenharmony_ci; void vp9_iwht_iwht_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
173cabdff1aSopenharmony_ci;-------------------------------------------------------------------------------------------
174cabdff1aSopenharmony_ci
; 4x4 inverse WHT in both directions, added to dst. The eob argument is named
; in the prototype but never referenced in the body. VP9_IWHT4_1D is defined
; outside this chunk.
175cabdff1aSopenharmony_ciINIT_MMX mmx
176cabdff1aSopenharmony_cicglobal vp9_iwht_iwht_4x4_add, 3, 3, 0, dst, stride, block, eob
177cabdff1aSopenharmony_ci    mova                m0, [blockq+0*8]        ; one 8-byte coefficient row per register
178cabdff1aSopenharmony_ci    mova                m1, [blockq+1*8]
179cabdff1aSopenharmony_ci    mova                m2, [blockq+2*8]
180cabdff1aSopenharmony_ci    mova                m3, [blockq+3*8]
181cabdff1aSopenharmony_ci    psraw               m0, 2                   ; arithmetic >> 2 on every coefficient
182cabdff1aSopenharmony_ci    psraw               m1, 2
183cabdff1aSopenharmony_ci    psraw               m2, 2
184cabdff1aSopenharmony_ci    psraw               m3, 2
185cabdff1aSopenharmony_ci
186cabdff1aSopenharmony_ci    VP9_IWHT4_1D
187cabdff1aSopenharmony_ci    TRANSPOSE4x4W        0, 1, 2, 3, 4
188cabdff1aSopenharmony_ci    VP9_IWHT4_1D
189cabdff1aSopenharmony_ci
190cabdff1aSopenharmony_ci    pxor                m4, m4                  ; zero: used by VP9_STORE_2X and the block reset
191cabdff1aSopenharmony_ci    VP9_STORE_2X         0, 1, 5, 6, 4
192cabdff1aSopenharmony_ci    lea               dstq, [dstq+strideq*2]
193cabdff1aSopenharmony_ci    VP9_STORE_2X         2, 3, 5, 6, 4
194cabdff1aSopenharmony_ci    ZERO_BLOCK      blockq, 8, 4, m4            ; clear the 4 rows x 8 bytes of coefficients
195cabdff1aSopenharmony_ci    RET
196cabdff1aSopenharmony_ci
197cabdff1aSopenharmony_ci;-------------------------------------------------------------------------------------------
198cabdff1aSopenharmony_ci; void vp9_idct_idct_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
199cabdff1aSopenharmony_ci;-------------------------------------------------------------------------------------------
200cabdff1aSopenharmony_ci
201cabdff1aSopenharmony_ci; 2x2 top left corner
; One 1D pass of the 4-point IDCT that reads only m0 and m1 as inputs.
; Register contract (see the call site in IDCT_4x4_FN): m5 = pw_11585x2,
; m6 = pw_6270x2, m7 = pw_15137x2. m2 and m3 are overwritten with the
; intermediate terms before VP9_IDCT4_1D_FINALIZE (defined outside this chunk)
; combines them.
202cabdff1aSopenharmony_ci%macro VP9_IDCT4_2x2_1D 0
203cabdff1aSopenharmony_ci    pmulhrsw            m0, m5                              ; m0=t1
204cabdff1aSopenharmony_ci    mova                m2, m0                              ; m2=t0
205cabdff1aSopenharmony_ci    mova                m3, m1
206cabdff1aSopenharmony_ci    pmulhrsw            m1, m6                              ; m1=t2
207cabdff1aSopenharmony_ci    pmulhrsw            m3, m7                              ; m3=t3
208cabdff1aSopenharmony_ci    VP9_IDCT4_1D_FINALIZE
209cabdff1aSopenharmony_ci%endmacro
210cabdff1aSopenharmony_ci
; VP9_IDCT4_WRITEOUT: applies the final (x+8)>>4 rounding to rows m0-m3 and adds
; them to four rows of dst, two rows per VP9_STORE_2X.
; Expects m4 to be zero (it is passed as the zero register to VP9_STORE_2X).
; Clobbers m5 (rounding constant), m6 and m7 (temporaries), and leaves dstq
; advanced by 2*stride.
211cabdff1aSopenharmony_ci%macro VP9_IDCT4_WRITEOUT 0
212cabdff1aSopenharmony_ci%if cpuflag(ssse3)
213cabdff1aSopenharmony_ci    mova                m5, [pw_2048]
214cabdff1aSopenharmony_ci    pmulhrsw            m0, m5              ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
215cabdff1aSopenharmony_ci    pmulhrsw            m1, m5
216cabdff1aSopenharmony_ci%else
217cabdff1aSopenharmony_ci    mova                m5, [pw_8]
218cabdff1aSopenharmony_ci    paddw               m0, m5
219cabdff1aSopenharmony_ci    paddw               m1, m5
220cabdff1aSopenharmony_ci    psraw               m0, 4
221cabdff1aSopenharmony_ci    psraw               m1, 4
222cabdff1aSopenharmony_ci%endif
223cabdff1aSopenharmony_ci    VP9_STORE_2X         0,  1,  6,  7,  4
224cabdff1aSopenharmony_ci    lea               dstq, [dstq+2*strideq]
; rows 2 and 3: m5 still holds the rounding constant loaded above
225cabdff1aSopenharmony_ci%if cpuflag(ssse3)
226cabdff1aSopenharmony_ci    pmulhrsw            m2, m5
227cabdff1aSopenharmony_ci    pmulhrsw            m3, m5
228cabdff1aSopenharmony_ci%else
229cabdff1aSopenharmony_ci    paddw               m2, m5
230cabdff1aSopenharmony_ci    paddw               m3, m5
231cabdff1aSopenharmony_ci    psraw               m2, 4
232cabdff1aSopenharmony_ci    psraw               m3, 4
233cabdff1aSopenharmony_ci%endif
234cabdff1aSopenharmony_ci    VP9_STORE_2X         2,  3,  6,  7,  4
235cabdff1aSopenharmony_ci%endmacro
236cabdff1aSopenharmony_ci
; IDCT_4x4_FN cpuflags: instantiates vp9_idct_idct_4x4_add for the given
; instruction set (passed to INIT_MMX). Three code paths are selected by eob:
;   ssse3 : eob > 4 -> full 4x4, eob == 1 -> DC only, otherwise -> 2x2 corner
;   other : eob > 1 -> full 4x4, otherwise -> DC only
; Every path zeroes the coefficients it consumed before returning.
237cabdff1aSopenharmony_ci%macro IDCT_4x4_FN 1
238cabdff1aSopenharmony_ciINIT_MMX %1
239cabdff1aSopenharmony_cicglobal vp9_idct_idct_4x4_add, 4, 4, 0, dst, stride, block, eob
240cabdff1aSopenharmony_ci
241cabdff1aSopenharmony_ci%if cpuflag(ssse3)
242cabdff1aSopenharmony_ci    cmp eobd, 4 ; 2x2 or smaller
243cabdff1aSopenharmony_ci    jg .idctfull
244cabdff1aSopenharmony_ci
245cabdff1aSopenharmony_ci    cmp eobd, 1 ; faster path for when only DC is set
246cabdff1aSopenharmony_ci    jne .idct2x2
247cabdff1aSopenharmony_ci%else
248cabdff1aSopenharmony_ci    cmp eobd, 1
249cabdff1aSopenharmony_ci    jg .idctfull
250cabdff1aSopenharmony_ci%endif
251cabdff1aSopenharmony_ci
; DC-only path: the single coefficient is scaled by 11585/2^14 twice (once per
; transform direction) and the result is added to all 16 pixels.
252cabdff1aSopenharmony_ci%if cpuflag(ssse3)
253cabdff1aSopenharmony_ci    movd                m0, [blockq]
254cabdff1aSopenharmony_ci    mova                m5, [pw_11585x2]
255cabdff1aSopenharmony_ci    pmulhrsw            m0, m5              ; (x*2*11585 + (1<<14))>>15 <=> (x*11585 + 8192)>>14
256cabdff1aSopenharmony_ci    pmulhrsw            m0, m5
257cabdff1aSopenharmony_ci%else
    ; scalar version; the eob register is reused as `coef` from here on
258cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, block, coef
259cabdff1aSopenharmony_ci    movsx            coefd, word [blockq]
260cabdff1aSopenharmony_ci    imul             coefd, 11585
261cabdff1aSopenharmony_ci    add              coefd, 8192
262cabdff1aSopenharmony_ci    sar              coefd, 14
263cabdff1aSopenharmony_ci    imul             coefd, 11585
    ; the output rounding (x+8)>>4 is folded into the second rounding/shift
264cabdff1aSopenharmony_ci    add              coefd, (8 << 14) + 8192
265cabdff1aSopenharmony_ci    sar              coefd, 14 + 4
266cabdff1aSopenharmony_ci    movd                m0, coefd
267cabdff1aSopenharmony_ci%endif
268cabdff1aSopenharmony_ci    pshufw              m0, m0, 0           ; broadcast word 0 to all four words
269cabdff1aSopenharmony_ci    pxor                m4, m4
270cabdff1aSopenharmony_ci    movh          [blockq], m4              ; reset the consumed DC coefficient
271cabdff1aSopenharmony_ci%if cpuflag(ssse3)
272cabdff1aSopenharmony_ci    pmulhrsw            m0, [pw_2048]       ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
273cabdff1aSopenharmony_ci%endif
274cabdff1aSopenharmony_ci    VP9_STORE_2X         0,  0,  6,  7,  4
275cabdff1aSopenharmony_ci    lea               dstq, [dstq+2*strideq]
276cabdff1aSopenharmony_ci    VP9_STORE_2X         0,  0,  6,  7,  4
277cabdff1aSopenharmony_ci    RET
278cabdff1aSopenharmony_ci
279cabdff1aSopenharmony_ci%if cpuflag(ssse3)
280cabdff1aSopenharmony_ci; faster path for when only top left 2x2 block is set
281cabdff1aSopenharmony_ci.idct2x2:
282cabdff1aSopenharmony_ci    movd                m0, [blockq+0]      ; first two coefficients of row 0
283cabdff1aSopenharmony_ci    movd                m1, [blockq+8]      ; first two coefficients of row 1
284cabdff1aSopenharmony_ci    mova                m5, [pw_11585x2]    ; multipliers expected by VP9_IDCT4_2x2_1D
285cabdff1aSopenharmony_ci    mova                m6, [pw_6270x2]
286cabdff1aSopenharmony_ci    mova                m7, [pw_15137x2]
287cabdff1aSopenharmony_ci    VP9_IDCT4_2x2_1D
288cabdff1aSopenharmony_ci    ; partial 2x4 transpose
289cabdff1aSopenharmony_ci    punpcklwd           m0, m1
290cabdff1aSopenharmony_ci    punpcklwd           m2, m3
291cabdff1aSopenharmony_ci    SBUTTERFLY          dq, 0, 2, 1
292cabdff1aSopenharmony_ci    SWAP                1, 2
293cabdff1aSopenharmony_ci    VP9_IDCT4_2x2_1D
294cabdff1aSopenharmony_ci    pxor                m4, m4  ; used for the block reset, and VP9_STORE_2X
295cabdff1aSopenharmony_ci    movh       [blockq+ 0], m4
296cabdff1aSopenharmony_ci    movh       [blockq+ 8], m4
297cabdff1aSopenharmony_ci    VP9_IDCT4_WRITEOUT
298cabdff1aSopenharmony_ci    RET
299cabdff1aSopenharmony_ci%endif
300cabdff1aSopenharmony_ci
301cabdff1aSopenharmony_ci.idctfull: ; generic full 4x4 idct/idct
302cabdff1aSopenharmony_ci    mova                m0, [blockq+ 0]
303cabdff1aSopenharmony_ci    mova                m1, [blockq+ 8]
304cabdff1aSopenharmony_ci    mova                m2, [blockq+16]
305cabdff1aSopenharmony_ci    mova                m3, [blockq+24]
306cabdff1aSopenharmony_ci%if cpuflag(ssse3)
307cabdff1aSopenharmony_ci    mova                m6, [pw_11585x2]
308cabdff1aSopenharmony_ci%endif
309cabdff1aSopenharmony_ci    mova                m7, [pd_8192]       ; rounding
    ; VP9_IDCT4_1D is defined outside this chunk; m6/m7 are set up for it above
310cabdff1aSopenharmony_ci    VP9_IDCT4_1D
311cabdff1aSopenharmony_ci    TRANSPOSE4x4W  0, 1, 2, 3, 4
312cabdff1aSopenharmony_ci    VP9_IDCT4_1D
313cabdff1aSopenharmony_ci    pxor                m4, m4  ; used for the block reset, and VP9_STORE_2X
314cabdff1aSopenharmony_ci    mova       [blockq+ 0], m4
315cabdff1aSopenharmony_ci    mova       [blockq+ 8], m4
316cabdff1aSopenharmony_ci    mova       [blockq+16], m4
317cabdff1aSopenharmony_ci    mova       [blockq+24], m4
318cabdff1aSopenharmony_ci    VP9_IDCT4_WRITEOUT
319cabdff1aSopenharmony_ci    RET
320cabdff1aSopenharmony_ci%endmacro
321cabdff1aSopenharmony_ci
322cabdff1aSopenharmony_ciIDCT_4x4_FN mmxext
323cabdff1aSopenharmony_ciIDCT_4x4_FN ssse3
324cabdff1aSopenharmony_ci
325cabdff1aSopenharmony_ci;-------------------------------------------------------------------------------------------
326cabdff1aSopenharmony_ci; void vp9_iadst_iadst_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
327cabdff1aSopenharmony_ci;-------------------------------------------------------------------------------------------
328cabdff1aSopenharmony_ci
; IADST4_FN type1, MACRO1, type2, MACRO2, cpuflags
; Instantiates vp9_<type1>_<type2>_4x4_add. The first 1D pass is VP9_<MACRO1>_1D,
; the rows are transposed, and the second pass is VP9_<MACRO2>_1D; both 1D
; macros are defined outside this chunk. There is no eob-based shortcut here:
; the full 4x4 block is always transformed and then zeroed.
329cabdff1aSopenharmony_ci%macro IADST4_FN 5
330cabdff1aSopenharmony_ciINIT_MMX %5
331cabdff1aSopenharmony_cicglobal vp9_%1_%3_4x4_add, 3, 3, 0, dst, stride, block, eob
; NOTE(review): presumably needed because the non-ssse3 IADST4 code touches
; xmm6/xmm7, which are callee-saved on Win64 -- confirm against VP9_IADST4_1D.
332cabdff1aSopenharmony_ci%if WIN64 && notcpuflag(ssse3)
333cabdff1aSopenharmony_ci    WIN64_SPILL_XMM 8
334cabdff1aSopenharmony_ci%endif
    ; the rounding constant is loaded into a full XMM register even though the
    ; function is set up with INIT_MMX
335cabdff1aSopenharmony_ci    movdqa            xmm5, [pd_8192]
336cabdff1aSopenharmony_ci    mova                m0, [blockq+ 0]
337cabdff1aSopenharmony_ci    mova                m1, [blockq+ 8]
338cabdff1aSopenharmony_ci    mova                m2, [blockq+16]
339cabdff1aSopenharmony_ci    mova                m3, [blockq+24]
340cabdff1aSopenharmony_ci%if cpuflag(ssse3)
341cabdff1aSopenharmony_ci    mova                m6, [pw_11585x2]
342cabdff1aSopenharmony_ci%endif
; when one of the two passes is an IDCT, copy the low 64 bits of the rounding
; constant into m7, the register IDCT_4x4_FN also uses for pd_8192
343cabdff1aSopenharmony_ci%ifnidn %1%3, iadstiadst
344cabdff1aSopenharmony_ci    movdq2q             m7, xmm5
345cabdff1aSopenharmony_ci%endif
346cabdff1aSopenharmony_ci    VP9_%2_1D
347cabdff1aSopenharmony_ci    TRANSPOSE4x4W  0, 1, 2, 3, 4
348cabdff1aSopenharmony_ci    VP9_%4_1D
349cabdff1aSopenharmony_ci    pxor                m4, m4  ; used for the block reset, and VP9_STORE_2X
350cabdff1aSopenharmony_ci    mova       [blockq+ 0], m4
351cabdff1aSopenharmony_ci    mova       [blockq+ 8], m4
352cabdff1aSopenharmony_ci    mova       [blockq+16], m4
353cabdff1aSopenharmony_ci    mova       [blockq+24], m4
354cabdff1aSopenharmony_ci    VP9_IDCT4_WRITEOUT
355cabdff1aSopenharmony_ci    RET
356cabdff1aSopenharmony_ci%endmacro
357cabdff1aSopenharmony_ci
; sse2 variants
358cabdff1aSopenharmony_ciIADST4_FN idct,  IDCT4,  iadst, IADST4, sse2
359cabdff1aSopenharmony_ciIADST4_FN iadst, IADST4, idct,  IDCT4,  sse2
360cabdff1aSopenharmony_ciIADST4_FN iadst, IADST4, iadst, IADST4, sse2
361cabdff1aSopenharmony_ci
; ssse3 variants
362cabdff1aSopenharmony_ciIADST4_FN idct,  IDCT4,  iadst, IADST4, ssse3
363cabdff1aSopenharmony_ciIADST4_FN iadst, IADST4, idct,  IDCT4,  ssse3
364cabdff1aSopenharmony_ciIADST4_FN iadst, IADST4, iadst, IADST4, ssse3
365cabdff1aSopenharmony_ci
; SCRATCH reg, spare, mem: parks the value of m<reg> so the register can be
; reused. On x86-64 the value is moved aside by SWAPping register numbers
; <reg> and <spare> (no instruction is written here; SWAP comes from the
; included macro package); on x86-32 it is stored to [mem] instead.
; Undo with UNSCRATCH using the same arguments.
366cabdff1aSopenharmony_ci%macro SCRATCH 3
367cabdff1aSopenharmony_ci%if ARCH_X86_64
368cabdff1aSopenharmony_ci    SWAP                %1, %2
369cabdff1aSopenharmony_ci%else
370cabdff1aSopenharmony_ci    mova              [%3], m%1
371cabdff1aSopenharmony_ci%endif
372cabdff1aSopenharmony_ci%endmacro
373cabdff1aSopenharmony_ci
; UNSCRATCH reg, spare, mem: counterpart of SCRATCH. On x86-64 it SWAPs
; register numbers <reg> and <spare> again; on x86-32 it reloads m<reg> from
; [mem]. Whatever m<reg> held before is lost on x86-32.
374cabdff1aSopenharmony_ci%macro UNSCRATCH 3
375cabdff1aSopenharmony_ci%if ARCH_X86_64
376cabdff1aSopenharmony_ci    SWAP                %1, %2
377cabdff1aSopenharmony_ci%else
378cabdff1aSopenharmony_ci    mova               m%1, [%3]
379cabdff1aSopenharmony_ci%endif
380cabdff1aSopenharmony_ci%endmacro
381cabdff1aSopenharmony_ci
382cabdff1aSopenharmony_ci;-------------------------------------------------------------------------------------------
383cabdff1aSopenharmony_ci; void vp9_idct_idct_8x8_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
384cabdff1aSopenharmony_ci;-------------------------------------------------------------------------------------------
385cabdff1aSopenharmony_ci
; VP9_IDCT8_1D_FINALIZE: last butterfly stage of the 8-point IDCT, forming
; t0+-t7, t1+-t6, t2+-t5 and t3+-t4.
; On entry (per the callers' register comments): m6=t0, m2=t1, m0=t2, m4=t3,
; m1=t6, m3=t7, m7=t5, with t4 parked via SCRATCH 5, 8, blockq+0.
; On exit one result (t1-t6) is parked in m8 on x86-64 or at [blockq+0] on
; x86-32. NOTE(review): the trailing SWAPs only rename registers, presumably
; to put the outputs in row order -- confirm the final mapping before relying
; on it.
386cabdff1aSopenharmony_ci%macro VP9_IDCT8_1D_FINALIZE 0
387cabdff1aSopenharmony_ci    SUMSUB_BA            w,  3,  6, 5                       ; m3=t0+t7, m6=t0-t7
388cabdff1aSopenharmony_ci    SUMSUB_BA            w,  1,  2, 5                       ; m1=t1+t6, m2=t1-t6
389cabdff1aSopenharmony_ci    SUMSUB_BA            w,  7,  0, 5                       ; m7=t2+t5, m0=t2-t5
390cabdff1aSopenharmony_ci
    ; fetch the parked t4 into m5 and park t1-t6 (m2) in its place
391cabdff1aSopenharmony_ci    UNSCRATCH            5, 8, blockq+ 0
392cabdff1aSopenharmony_ci    SCRATCH              2, 8, blockq+ 0
393cabdff1aSopenharmony_ci
394cabdff1aSopenharmony_ci    SUMSUB_BA            w,  5,  4, 2                       ; m5=t3+t4, m4=t3-t4
395cabdff1aSopenharmony_ci    SWAP                 7,  6,  2
396cabdff1aSopenharmony_ci    SWAP                 3,  5,  0
397cabdff1aSopenharmony_ci
398cabdff1aSopenharmony_ci%if ARCH_X86_64
399cabdff1aSopenharmony_ci    SWAP                 6, 8
400cabdff1aSopenharmony_ci%endif
401cabdff1aSopenharmony_ci%endmacro
402cabdff1aSopenharmony_ci
; VP9_IDCT8_1D: one full 8-point 1D IDCT pass over m0-m7.
; The caller must %define D_8192_REG (rounding operand handed to
; VP9_UNPACK_MULSUB_2W_4X, which is defined outside this chunk) and, for
; ssse3, W_11585x2_REG (pmulhrsw multiplier).
403cabdff1aSopenharmony_ci; x86-32:
404cabdff1aSopenharmony_ci; - in: m0/m4 are in mem ([blockq+0] / [blockq+64])
405cabdff1aSopenharmony_ci; - out: m6 is in mem
406cabdff1aSopenharmony_ci; x86-64:
407cabdff1aSopenharmony_ci; - everything is in registers (m0-7)
408cabdff1aSopenharmony_ci%macro VP9_IDCT8_1D 0
; x86-64: move IN(0)/IN(4) out of the way so m0/m4 can serve as temporaries
409cabdff1aSopenharmony_ci%if ARCH_X86_64
410cabdff1aSopenharmony_ci    SWAP                 0, 8
411cabdff1aSopenharmony_ci    SWAP                 4, 9
412cabdff1aSopenharmony_ci%endif
413cabdff1aSopenharmony_ci
414cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X 5,  3,  9102, 13623, D_8192_REG, 0, 4  ; m5=t5a, m3=t6a
415cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X 1,  7, 16069,  3196, D_8192_REG, 0, 4  ; m1=t4a, m7=t7a
416cabdff1aSopenharmony_ci    SUMSUB_BA            w,  5,  1, 0                       ; m5=t4a+t5a (t4), m1=t4a-t5a (t5a)
417cabdff1aSopenharmony_ci    SUMSUB_BA            w,  3,  7, 0                       ; m3=t7a+t6a (t7), m7=t7a-t6a (t6a)
418cabdff1aSopenharmony_ci%if cpuflag(ssse3)
419cabdff1aSopenharmony_ci    SUMSUB_BA            w,  1,  7, 0                       ; m1=t6a+t5a (t6), m7=t6a-t5a (t5)
420cabdff1aSopenharmony_ci    pmulhrsw            m1, W_11585x2_REG                   ; m1=t6
421cabdff1aSopenharmony_ci    pmulhrsw            m7, W_11585x2_REG                   ; m7=t5
422cabdff1aSopenharmony_ci%else
423cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X 7,  1, 11585, 11585, D_8192_REG, 0, 4
424cabdff1aSopenharmony_ci%endif
425cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X 2,  6, 15137,  6270, D_8192_REG, 0, 4  ; m2=t2a, m6=t3a
426cabdff1aSopenharmony_ci
427cabdff1aSopenharmony_ci    UNSCRATCH            0, 8, blockq+ 0    ; IN(0)
428cabdff1aSopenharmony_ci    UNSCRATCH            4, 9, blockq+64    ; IN(4)
429cabdff1aSopenharmony_ci    SCRATCH              5, 8, blockq+ 0    ; park t4 until VP9_IDCT8_1D_FINALIZE
430cabdff1aSopenharmony_ci
431cabdff1aSopenharmony_ci%if cpuflag(ssse3)
432cabdff1aSopenharmony_ci    SUMSUB_BA            w, 4, 0, 5                         ; m4=IN(0)+IN(4) m0=IN(0)-IN(4)
433cabdff1aSopenharmony_ci    pmulhrsw            m4, W_11585x2_REG                   ; m4=t0a
434cabdff1aSopenharmony_ci    pmulhrsw            m0, W_11585x2_REG                   ; m0=t1a
435cabdff1aSopenharmony_ci%else
    ; m7 is needed as a temporary here, so park it around the multiply
436cabdff1aSopenharmony_ci    SCRATCH              7, 9, blockq+64
437cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X 0,  4, 11585, 11585, D_8192_REG, 5, 7
438cabdff1aSopenharmony_ci    UNSCRATCH            7, 9, blockq+64
439cabdff1aSopenharmony_ci%endif
440cabdff1aSopenharmony_ci    SUMSUB_BA            w,  6,  4, 5                       ; m6=t0a+t3a (t0), m4=t0a-t3a (t3)
441cabdff1aSopenharmony_ci    SUMSUB_BA            w,  2,  0, 5                       ; m2=t1a+t2a (t1), m0=t1a-t2a (t2)
442cabdff1aSopenharmony_ci
443cabdff1aSopenharmony_ci    VP9_IDCT8_1D_FINALIZE
444cabdff1aSopenharmony_ci%endmacro
445cabdff1aSopenharmony_ci
; VP9_IDCT8_4x4_1D: reduced 8-point 1D IDCT pass that reads only m0-m3 as
; inputs; each first-stage term is a single pmulhrsw by a doubled coefficient
; rather than a two-input multiply/add. The caller must %define W_11585x2_REG.
; Ends with the shared VP9_IDCT8_1D_FINALIZE, so the outputs are laid out as
; after VP9_IDCT8_1D.
446cabdff1aSopenharmony_ci%macro VP9_IDCT8_4x4_1D 0
447cabdff1aSopenharmony_ci    pmulhrsw            m0, W_11585x2_REG                   ; m0=t1a/t0a
448cabdff1aSopenharmony_ci    pmulhrsw            m6, m2, [pw_15137x2]                ; m6=t3a
449cabdff1aSopenharmony_ci    pmulhrsw            m2, [pw_6270x2]                     ; m2=t2a
450cabdff1aSopenharmony_ci    pmulhrsw            m7, m1, [pw_16069x2]                ; m7=t7a
451cabdff1aSopenharmony_ci    pmulhrsw            m1, [pw_3196x2]                     ; m1=t4a
452cabdff1aSopenharmony_ci    pmulhrsw            m5, m3, [pw_m9102x2]                ; m5=t5a
453cabdff1aSopenharmony_ci    pmulhrsw            m3, [pw_13623x2]                    ; m3=t6a
454cabdff1aSopenharmony_ci    SUMSUB_BA            w,  5,  1, 4                       ; m5=t4a+t5a (t4), m1=t4a-t5a (t5a)
455cabdff1aSopenharmony_ci    SUMSUB_BA            w,  3,  7, 4                       ; m3=t7a+t6a (t7), m7=t7a-t6a (t6a)
456cabdff1aSopenharmony_ci    SUMSUB_BA            w,  1,  7, 4                       ; m1=t6a+t5a (t6), m7=t6a-t5a (t5)
457cabdff1aSopenharmony_ci    pmulhrsw            m1, W_11585x2_REG                   ; m1=t6
458cabdff1aSopenharmony_ci    pmulhrsw            m7, W_11585x2_REG                   ; m7=t5
459cabdff1aSopenharmony_ci    psubw               m4, m0, m6                          ; m4=t0a-t3a (t3)
460cabdff1aSopenharmony_ci    paddw               m6, m0                              ; m6=t0a+t3a (t0)
461cabdff1aSopenharmony_ci    SCRATCH              5,  8, blockq+ 0                   ; park t4 until VP9_IDCT8_1D_FINALIZE
462cabdff1aSopenharmony_ci    SUMSUB_BA            w,  2,  0, 5                       ; m2=t1a+t2a (t1), m0=t1a-t2a (t2)
463cabdff1aSopenharmony_ci    VP9_IDCT8_1D_FINALIZE
464cabdff1aSopenharmony_ci%endmacro
465cabdff1aSopenharmony_ci
; VP9_IDCT8_2x2_1D pass: reduced 8-point 1D IDCT pass that reads only m0 and m1
; as inputs. The caller must %define W_11585x2_REG, W_16069x2_REG and
; W_3196x2_REG (pmulhrsw multipliers).
; The argument selects how a scratch register is obtained for the last
; butterfly:
;   pass == 1        : m3 and m1 are word-interleaved into m3, which frees m1
;   otherwise, x86-32: m2 is stored to [blockq+0] and reused as scratch
;   otherwise, x86-64: m8 is used as scratch
466cabdff1aSopenharmony_ci%macro VP9_IDCT8_2x2_1D 1
467cabdff1aSopenharmony_ci    pmulhrsw            m0, W_11585x2_REG                   ; m0=t0
468cabdff1aSopenharmony_ci    pmulhrsw            m3, m1, W_16069x2_REG               ; m3=t7
469cabdff1aSopenharmony_ci    pmulhrsw            m1, W_3196x2_REG                    ; m1=t4
470cabdff1aSopenharmony_ci    psubw               m7, m3, m1                          ; t5 = t7a - t4a
471cabdff1aSopenharmony_ci    paddw               m5, m3, m1                          ; t6 = t7a + t4a
472cabdff1aSopenharmony_ci    pmulhrsw            m7, W_11585x2_REG                   ; m7=t5
473cabdff1aSopenharmony_ci    pmulhrsw            m5, W_11585x2_REG                   ; m5=t6
474cabdff1aSopenharmony_ci    SWAP                 5,  1
475cabdff1aSopenharmony_ci    ; merged VP9_IDCT8_1D_FINALIZE to make register-sharing w/ avx easier
476cabdff1aSopenharmony_ci    psubw               m6, m0, m3                          ; m6=t0-t7
477cabdff1aSopenharmony_ci    paddw               m3, m0                              ; m3=t0+t7
478cabdff1aSopenharmony_ci    psubw               m2, m0, m1                          ; m2=t1-t6
479cabdff1aSopenharmony_ci    paddw               m1, m0                              ; m1=t1+t6
480cabdff1aSopenharmony_ci%if %1 == 1
481cabdff1aSopenharmony_ci    punpcklwd           m3, m1
482cabdff1aSopenharmony_ci%define SCRATCH_REG 1
483cabdff1aSopenharmony_ci%elif ARCH_X86_32
484cabdff1aSopenharmony_ci    mova       [blockq+ 0], m2
485cabdff1aSopenharmony_ci%define SCRATCH_REG 2
486cabdff1aSopenharmony_ci%else
487cabdff1aSopenharmony_ci%define SCRATCH_REG 8
488cabdff1aSopenharmony_ci%endif
489cabdff1aSopenharmony_ci    psubw               m4, m0, m5                          ; m4=t3-t4
490cabdff1aSopenharmony_ci    paddw               m5, m0                              ; m5=t3+t4
491cabdff1aSopenharmony_ci    SUMSUB_BA            w,  7,  0, SCRATCH_REG             ; m7=t2+t5, m0=t2-t5
492cabdff1aSopenharmony_ci    SWAP                 7,  6,  2
493cabdff1aSopenharmony_ci    SWAP                 3,  5,  0
494cabdff1aSopenharmony_ci%undef SCRATCH_REG
495cabdff1aSopenharmony_ci%endmacro
496cabdff1aSopenharmony_ci
; VP9_IDCT8_WRITEx2: rounds two rows of residuals and adds them to two rows of
; destination pixels through VP9_STORE_2X.
;   line1, line2 : register numbers holding the residual rows (modified)
;   tmp1, tmp2   : register numbers clobbered by VP9_STORE_2X
;   zero         : register number of the zero register for VP9_STORE_2X
;   round        : complete operand; a pmulhrsw multiplier on ssse3 (pw_1024),
;                  otherwise the value added before the shift (pw_16)
;   shift        : right-shift count for the non-ssse3 path (default 5)
;   8th argument : optional destination pointer handed to VP9_STORE_2X; when
;                  it is absent VP9_STORE_2X falls back to its own default
497cabdff1aSopenharmony_ci%macro VP9_IDCT8_WRITEx2 6-8 5 ; line1, line2, tmp1, tmp2, zero, pw_1024/pw_16, shift
498cabdff1aSopenharmony_ci%if cpuflag(ssse3)
499cabdff1aSopenharmony_ci    pmulhrsw           m%1, %6              ; (x*1024 + (1<<14))>>15 <=> (x+16)>>5
500cabdff1aSopenharmony_ci    pmulhrsw           m%2, %6
501cabdff1aSopenharmony_ci%else
502cabdff1aSopenharmony_ci    paddw              m%1, %6
503cabdff1aSopenharmony_ci    paddw              m%2, %6
504cabdff1aSopenharmony_ci    psraw              m%1, %7
505cabdff1aSopenharmony_ci    psraw              m%2, %7
506cabdff1aSopenharmony_ci%endif
; only forward a destination when an 8th argument is present
507cabdff1aSopenharmony_ci%if %0 <= 7
508cabdff1aSopenharmony_ci    VP9_STORE_2X        %1, %2, %3, %4, %5
509cabdff1aSopenharmony_ci%else
510cabdff1aSopenharmony_ci    VP9_STORE_2X        %1, %2, %3, %4, %5, %8
511cabdff1aSopenharmony_ci%endif
512cabdff1aSopenharmony_ci%endmacro
513cabdff1aSopenharmony_ci
; VP9_IDCT8_WRITEOUT: rounds the eight residual rows and adds them to eight
; rows of dst, two rows per VP9_IDCT8_WRITEx2 call. dstq is left advanced by
; 6*stride. Register state expected on entry:
514cabdff1aSopenharmony_ci; x86-32:
515cabdff1aSopenharmony_ci; - m6 is in mem
516cabdff1aSopenharmony_ci; x86-64:
517cabdff1aSopenharmony_ci; - m8 holds m6 (SWAP)
518cabdff1aSopenharmony_ci; m6 holds zero
519cabdff1aSopenharmony_ci%macro VP9_IDCT8_WRITEOUT 0
; x86-64 keeps the rounding constant in m9; x86-32 uses it straight from memory
520cabdff1aSopenharmony_ci%if ARCH_X86_64
521cabdff1aSopenharmony_ci%if cpuflag(ssse3)
522cabdff1aSopenharmony_ci    mova                m9, [pw_1024]
523cabdff1aSopenharmony_ci%else
524cabdff1aSopenharmony_ci    mova                m9, [pw_16]
525cabdff1aSopenharmony_ci%endif
526cabdff1aSopenharmony_ci%define ROUND_REG m9
527cabdff1aSopenharmony_ci%else
528cabdff1aSopenharmony_ci%if cpuflag(ssse3)
529cabdff1aSopenharmony_ci%define ROUND_REG [pw_1024]
530cabdff1aSopenharmony_ci%else
531cabdff1aSopenharmony_ci%define ROUND_REG [pw_16]
532cabdff1aSopenharmony_ci%endif
533cabdff1aSopenharmony_ci%endif
    ; park m5/m7 so they can serve as temporaries for the first two row pairs
534cabdff1aSopenharmony_ci    SCRATCH              5, 10, blockq+16
535cabdff1aSopenharmony_ci    SCRATCH              7, 11, blockq+32
536cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2    0,  1, 5, 7, 6, ROUND_REG
537cabdff1aSopenharmony_ci    lea               dstq, [dstq+2*strideq]
538cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2    2,  3, 5, 7, 6, ROUND_REG
539cabdff1aSopenharmony_ci    lea               dstq, [dstq+2*strideq]
    ; rows 0-3 are written, so m0/m1 take over as temporaries from here on
540cabdff1aSopenharmony_ci    UNSCRATCH            5, 10, blockq+16
541cabdff1aSopenharmony_ci    UNSCRATCH            7, 11, blockq+32
542cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2    4,  5, 0, 1, 6, ROUND_REG
543cabdff1aSopenharmony_ci    lea               dstq, [dstq+2*strideq]
    ; fetch the row that the 1D transform left in m8 / at [blockq+0]
544cabdff1aSopenharmony_ci    UNSCRATCH            5, 8, blockq+ 0
545cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2    5,  7, 0, 1, 6, ROUND_REG
546cabdff1aSopenharmony_ci
547cabdff1aSopenharmony_ci%undef ROUND_REG
548cabdff1aSopenharmony_ci%endmacro
549cabdff1aSopenharmony_ci
; VP9_IDCT_IDCT_8x8_ADD_XMM cpu, nregs
;
; Emits vp9_idct_idct_8x8_add (arguments: dst, stride, block, eob) for one
; instruction set.
;   %1 - instruction-set name handed to INIT_XMM (sse2, ssse3, avx)
;   %2 - forwarded as the third numeric cglobal argument (the xmm register
;        count in x86inc's cglobal convention)
;
; Code path, selected on eob:
;   ssse3 and newer:  eob > 12 -> .idctfull
;                     eob >  3 -> .idcthalf
;                     eob != 1 -> .idcttopleftcorner
;                     eob == 1 -> DC-only code right after the compares
;   sse2:             eob >  1 -> .idctfull, otherwise DC-only
;
; Each path clears the coefficients it loaded from block before returning;
; on x86-32 the partial paths also clear rows 0-2 in full, since
; VP9_IDCT8_WRITEOUT uses blockq+0/16/32 as SCRATCH/UNSCRATCH slots.
;
; NOTE(review): only W_11585x2_REG is %undef'd at the end; W_3196x2_REG,
; W_16069x2_REG and D_8192_REG stay defined after the macro body.
550cabdff1aSopenharmony_ci%macro VP9_IDCT_IDCT_8x8_ADD_XMM 2
551cabdff1aSopenharmony_ciINIT_XMM %1
552cabdff1aSopenharmony_cicglobal vp9_idct_idct_8x8_add, 4, 4, %2, dst, stride, block, eob
553cabdff1aSopenharmony_ci
554cabdff1aSopenharmony_ci%if cpuflag(ssse3)
; x86-64 keeps the constant in m12; x86-32 uses it as a memory operand
555cabdff1aSopenharmony_ci%if ARCH_X86_64
556cabdff1aSopenharmony_ci    mova               m12, [pw_11585x2]    ; often used
557cabdff1aSopenharmony_ci%define W_11585x2_REG m12
558cabdff1aSopenharmony_ci%else
559cabdff1aSopenharmony_ci%define W_11585x2_REG [pw_11585x2]
560cabdff1aSopenharmony_ci%endif
561cabdff1aSopenharmony_ci
562cabdff1aSopenharmony_ci    cmp eobd, 12 ; top left half or less
563cabdff1aSopenharmony_ci    jg .idctfull
564cabdff1aSopenharmony_ci
565cabdff1aSopenharmony_ci    cmp eobd, 3  ; top left corner or less
566cabdff1aSopenharmony_ci    jg .idcthalf
567cabdff1aSopenharmony_ci
568cabdff1aSopenharmony_ci    cmp eobd, 1 ; faster path for when only DC is set
569cabdff1aSopenharmony_ci    jne .idcttopleftcorner
570cabdff1aSopenharmony_ci%else
; without ssse3 only two paths exist: DC-only (fall through) and the full idct
571cabdff1aSopenharmony_ci    cmp eobd, 1
572cabdff1aSopenharmony_ci    jg .idctfull
573cabdff1aSopenharmony_ci%endif
574cabdff1aSopenharmony_ci
; ---- DC-only path ----
575cabdff1aSopenharmony_ci%if cpuflag(ssse3)
; the DC coefficient is multiplied by W_11585x2_REG twice (once per 1-D pass)
576cabdff1aSopenharmony_ci    movd                m0, [blockq]
577cabdff1aSopenharmony_ci    pmulhrsw            m0, W_11585x2_REG
578cabdff1aSopenharmony_ci    pmulhrsw            m0, W_11585x2_REG
579cabdff1aSopenharmony_ci%else
; scalar fallback: coef = (coef * 11585 + 8192) >> 14, applied twice; the
; second application also folds in the final (x + 16) >> 5 rounding by
; adding 16 << 14 and shifting by 14 + 5. eob is not needed any more, so its
; register is renamed to coef.
580cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, block, coef
581cabdff1aSopenharmony_ci    movsx            coefd, word [blockq]
582cabdff1aSopenharmony_ci    imul             coefd, 11585
583cabdff1aSopenharmony_ci    add              coefd, 8192
584cabdff1aSopenharmony_ci    sar              coefd, 14
585cabdff1aSopenharmony_ci    imul             coefd, 11585
586cabdff1aSopenharmony_ci    add              coefd, (16 << 14) + 8192
587cabdff1aSopenharmony_ci    sar              coefd, 14 + 5
588cabdff1aSopenharmony_ci    movd                m0, coefd
589cabdff1aSopenharmony_ci%endif
590cabdff1aSopenharmony_ci    SPLATW              m0, m0, 0
591cabdff1aSopenharmony_ci    pxor                m4, m4
592cabdff1aSopenharmony_ci    movd          [blockq], m4              ; clear the first 4 bytes of block
593cabdff1aSopenharmony_ci%if cpuflag(ssse3)
594cabdff1aSopenharmony_ci    pmulhrsw            m0, [pw_1024]       ; (x*1024 + (1<<14))>>15 <=> (x+16)>>5
595cabdff1aSopenharmony_ci%endif
; four VP9_STORE_2X calls, dst advanced by two rows between them
; (VP9_STORE_2X is defined elsewhere; presumably it adds m0 to two dst rows)
596cabdff1aSopenharmony_ci%rep 3
597cabdff1aSopenharmony_ci    VP9_STORE_2X         0,  0,  6,  7,  4
598cabdff1aSopenharmony_ci    lea               dstq, [dstq+2*strideq]
599cabdff1aSopenharmony_ci%endrep
600cabdff1aSopenharmony_ci    VP9_STORE_2X         0,  0,  6,  7,  4
601cabdff1aSopenharmony_ci    RET
602cabdff1aSopenharmony_ci
603cabdff1aSopenharmony_ci%if cpuflag(ssse3)
604cabdff1aSopenharmony_ci; faster path for when only the top-left corner is set (3 inputs: DC, right of
605cabdff1aSopenharmony_ci; DC, below DC). Note: this also works for a 2x2 block
606cabdff1aSopenharmony_ci.idcttopleftcorner:
; movd: only the first 4 bytes of rows 0 and 1 are read
607cabdff1aSopenharmony_ci    movd                m0, [blockq+0]
608cabdff1aSopenharmony_ci    movd                m1, [blockq+16]
609cabdff1aSopenharmony_ci%if ARCH_X86_64
610cabdff1aSopenharmony_ci    mova               m10, [pw_3196x2]
611cabdff1aSopenharmony_ci    mova               m11, [pw_16069x2]
612cabdff1aSopenharmony_ci%define W_3196x2_REG m10
613cabdff1aSopenharmony_ci%define W_16069x2_REG m11
614cabdff1aSopenharmony_ci%else
615cabdff1aSopenharmony_ci%define W_3196x2_REG [pw_3196x2]
616cabdff1aSopenharmony_ci%define W_16069x2_REG [pw_16069x2]
617cabdff1aSopenharmony_ci%endif
618cabdff1aSopenharmony_ci    VP9_IDCT8_2x2_1D 1
619cabdff1aSopenharmony_ci    ; partial 2x8 transpose
620cabdff1aSopenharmony_ci    ; punpcklwd m0, m1 already done inside idct
621cabdff1aSopenharmony_ci    punpcklwd           m2, m3
622cabdff1aSopenharmony_ci    punpcklwd           m4, m5
623cabdff1aSopenharmony_ci    punpcklwd           m6, m7
624cabdff1aSopenharmony_ci    punpckldq           m0, m2
625cabdff1aSopenharmony_ci    punpckldq           m4, m6
626cabdff1aSopenharmony_ci    SBUTTERFLY         qdq, 0, 4, 1
627cabdff1aSopenharmony_ci    SWAP                 1, 4
628cabdff1aSopenharmony_ci    VP9_IDCT8_2x2_1D 2
; m6 is about to be zeroed, so its current value is renamed to m8 first;
; VP9_IDCT8_WRITEOUT ends with UNSCRATCH 5, 8, blockq+0, which picks it up
; again (presumably from [blockq+0] on x86-32 -- confirm in the 1-D macro)
629cabdff1aSopenharmony_ci%if ARCH_X86_64
630cabdff1aSopenharmony_ci    SWAP                 6, 8
631cabdff1aSopenharmony_ci%endif
632cabdff1aSopenharmony_ci    pxor                m6, m6  ; used for the block reset, and VP9_STORE_2X
633cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEOUT
; x86-64 clears just the 4 bytes read from rows 0 and 1; x86-32 clears rows
; 0-2 in full because blockq+0/16/32 served as spill slots during writeout
634cabdff1aSopenharmony_ci%if ARCH_X86_64
635cabdff1aSopenharmony_ci    movd       [blockq+ 0], m6
636cabdff1aSopenharmony_ci    movd       [blockq+16], m6
637cabdff1aSopenharmony_ci%else
638cabdff1aSopenharmony_ci    mova       [blockq+ 0], m6
639cabdff1aSopenharmony_ci    mova       [blockq+16], m6
640cabdff1aSopenharmony_ci    mova       [blockq+32], m6
641cabdff1aSopenharmony_ci%endif
642cabdff1aSopenharmony_ci    RET
643cabdff1aSopenharmony_ci
; path for eob 4..12: only the low 8 bytes (movh) of rows 0-3 are read
644cabdff1aSopenharmony_ci.idcthalf:
645cabdff1aSopenharmony_ci    movh                m0, [blockq + 0]
646cabdff1aSopenharmony_ci    movh                m1, [blockq +16]
647cabdff1aSopenharmony_ci    movh                m2, [blockq +32]
648cabdff1aSopenharmony_ci    movh                m3, [blockq +48]
649cabdff1aSopenharmony_ci    VP9_IDCT8_4x4_1D
650cabdff1aSopenharmony_ci    ; partial 4x8 transpose
; x86-32: m6 is reloaded from [blockq+0] before the transpose
651cabdff1aSopenharmony_ci%if ARCH_X86_32
652cabdff1aSopenharmony_ci    mova                m6, [blockq+ 0]
653cabdff1aSopenharmony_ci%endif
654cabdff1aSopenharmony_ci    punpcklwd           m0, m1
655cabdff1aSopenharmony_ci    punpcklwd           m2, m3
656cabdff1aSopenharmony_ci    punpcklwd           m4, m5
657cabdff1aSopenharmony_ci    punpcklwd           m6, m7
658cabdff1aSopenharmony_ci    SBUTTERFLY          dq, 0, 2, 1
659cabdff1aSopenharmony_ci    SBUTTERFLY          dq, 4, 6, 5
660cabdff1aSopenharmony_ci    SBUTTERFLY         qdq, 0, 4, 1
661cabdff1aSopenharmony_ci    SBUTTERFLY         qdq, 2, 6, 5
662cabdff1aSopenharmony_ci    SWAP                 1, 4
663cabdff1aSopenharmony_ci    SWAP                 3, 6
664cabdff1aSopenharmony_ci    VP9_IDCT8_4x4_1D
; same m6 -> m8 renaming as in .idcttopleftcorner
665cabdff1aSopenharmony_ci%if ARCH_X86_64
666cabdff1aSopenharmony_ci    SWAP                 6, 8
667cabdff1aSopenharmony_ci%endif
668cabdff1aSopenharmony_ci    pxor                m6, m6  ; used for the block reset, and VP9_STORE_2X
669cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEOUT
; clear what was read (8 bytes of rows 0-3); x86-32 clears rows 0-2 in full
; because they were used as spill slots
670cabdff1aSopenharmony_ci%if ARCH_X86_64
671cabdff1aSopenharmony_ci    movh       [blockq+ 0], m6
672cabdff1aSopenharmony_ci    movh       [blockq+16], m6
673cabdff1aSopenharmony_ci    movh       [blockq+32], m6
674cabdff1aSopenharmony_ci%else
675cabdff1aSopenharmony_ci    mova       [blockq+ 0], m6
676cabdff1aSopenharmony_ci    mova       [blockq+16], m6
677cabdff1aSopenharmony_ci    mova       [blockq+32], m6
678cabdff1aSopenharmony_ci%endif
679cabdff1aSopenharmony_ci    movh       [blockq+48], m6
680cabdff1aSopenharmony_ci    RET
681cabdff1aSopenharmony_ci%endif
682cabdff1aSopenharmony_ci
683cabdff1aSopenharmony_ci.idctfull: ; generic full 8x8 idct/idct
; x86-32 does not load IN(0) and IN(4) into registers here (presumably
; VP9_IDCT8_1D reads them from block memory on that target -- confirm there)
684cabdff1aSopenharmony_ci%if ARCH_X86_64
685cabdff1aSopenharmony_ci    mova                m0, [blockq+  0]    ; IN(0)
686cabdff1aSopenharmony_ci%endif
687cabdff1aSopenharmony_ci    mova                m1, [blockq+ 16]    ; IN(1)
688cabdff1aSopenharmony_ci    mova                m2, [blockq+ 32]    ; IN(2)
689cabdff1aSopenharmony_ci    mova                m3, [blockq+ 48]    ; IN(3)
690cabdff1aSopenharmony_ci%if ARCH_X86_64
691cabdff1aSopenharmony_ci    mova                m4, [blockq+ 64]    ; IN(4)
692cabdff1aSopenharmony_ci%endif
693cabdff1aSopenharmony_ci    mova                m5, [blockq+ 80]    ; IN(5)
694cabdff1aSopenharmony_ci    mova                m6, [blockq+ 96]    ; IN(6)
695cabdff1aSopenharmony_ci    mova                m7, [blockq+112]    ; IN(7)
696cabdff1aSopenharmony_ci%if ARCH_X86_64
697cabdff1aSopenharmony_ci    mova               m11, [pd_8192]       ; rounding
698cabdff1aSopenharmony_ci%define D_8192_REG m11
699cabdff1aSopenharmony_ci%else
700cabdff1aSopenharmony_ci%define D_8192_REG [pd_8192]
701cabdff1aSopenharmony_ci%endif
; first 1-D pass, transpose, second 1-D pass
702cabdff1aSopenharmony_ci    VP9_IDCT8_1D
703cabdff1aSopenharmony_ci%if ARCH_X86_64
704cabdff1aSopenharmony_ci    TRANSPOSE8x8W  0, 1, 2, 3, 4, 5, 6, 7, 8
705cabdff1aSopenharmony_ci%else
; x86-32: the transpose works through [blockq+0]/[blockq+64]; row 0 is then
; written back to block memory for the second pass
706cabdff1aSopenharmony_ci    TRANSPOSE8x8W  0, 1, 2, 3, 4, 5, 6, 7, [blockq+0], [blockq+64], 1
707cabdff1aSopenharmony_ci    mova        [blockq+0], m0
708cabdff1aSopenharmony_ci%endif
709cabdff1aSopenharmony_ci    VP9_IDCT8_1D
710cabdff1aSopenharmony_ci
; same m6 -> m8 renaming as in the partial paths
711cabdff1aSopenharmony_ci%if ARCH_X86_64
712cabdff1aSopenharmony_ci    SWAP                 6, 8
713cabdff1aSopenharmony_ci%endif
714cabdff1aSopenharmony_ci    pxor                m6, m6  ; used for the block reset, and VP9_STORE_2X
715cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEOUT
716cabdff1aSopenharmony_ci    ZERO_BLOCK      blockq, 16, 8, m6
717cabdff1aSopenharmony_ci    RET
718cabdff1aSopenharmony_ci%undef W_11585x2_REG
719cabdff1aSopenharmony_ci%endmacro
720cabdff1aSopenharmony_ci
; Instantiate vp9_idct_idct_8x8_add once per instruction set. The second
; argument is the register count forwarded to cglobal: the ssse3/avx builds
; ask for one more than sse2 because they load pw_11585x2 into m12 on x86-64.
721cabdff1aSopenharmony_ciVP9_IDCT_IDCT_8x8_ADD_XMM sse2, 12
722cabdff1aSopenharmony_ciVP9_IDCT_IDCT_8x8_ADD_XMM ssse3, 13
723cabdff1aSopenharmony_ciVP9_IDCT_IDCT_8x8_ADD_XMM avx, 13
724cabdff1aSopenharmony_ci
725cabdff1aSopenharmony_ci;---------------------------------------------------------------------------------------------
726cabdff1aSopenharmony_ci; void vp9_iadst_iadst_8x8_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
727cabdff1aSopenharmony_ci;---------------------------------------------------------------------------------------------
728cabdff1aSopenharmony_ci
; VP9_IADST8_1D: one 1-D pass of the 8-point inverse ADST over rows m0-m7.
;
; The caller must have defined D_8192_REG and W_M1_REG (IADST8_FN does);
; W_11585x2_REG is only referenced by the branch disabled with "%if 0".
; On x86-64 the macro also uses m8-m12 as temporaries (SWAP targets and
; SCRATCH register numbers); on x86-32 the corresponding SCRATCH/UNSCRATCH
; slots are rows of block memory.
;
729cabdff1aSopenharmony_ci; x86-32:
730cabdff1aSopenharmony_ci; - in: m0/3/4/7 are in mem [blockq+N*16]
731cabdff1aSopenharmony_ci; - out: m6 is in mem [blockq+0]
732cabdff1aSopenharmony_ci; x86-64:
733cabdff1aSopenharmony_ci; - everything is in registers
734cabdff1aSopenharmony_ci%macro VP9_IADST8_1D 0 ; input/output=m0/1/2/3/4/5/6/7
; x86-64: move inputs 0/3/4/7 out of the way into m8-m11 so that m0/m3/m4/m7
; are free as temporaries; they come back via the UNSCRATCH group below
735cabdff1aSopenharmony_ci%if ARCH_X86_64
736cabdff1aSopenharmony_ci    SWAP                     0, 8
737cabdff1aSopenharmony_ci    SWAP                     3, 9
738cabdff1aSopenharmony_ci    SWAP                     4, 10
739cabdff1aSopenharmony_ci    SWAP                     7, 11
740cabdff1aSopenharmony_ci%endif
741cabdff1aSopenharmony_ci
; t2/t3/t6/t7 from inputs 5,2 and 1,6
742cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2D_4X  5,  2,  0,  3, 14449,  7723    ; m5/2=t3[d], m2/4=t2[d]
743cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2D_4X  1,  6,  4,  7,  4756, 15679    ; m1/4=t7[d], m6/7=t6[d]
; m4 is needed as a temporary by the next macro, so it is spilled around it
744cabdff1aSopenharmony_ci    SCRATCH                  4, 12, blockq+1*16
745cabdff1aSopenharmony_ci    VP9_RND_SH_SUMSUB_BA     6,  2,  7,  3, 4, D_8192_REG  ; m6=t2[w], m2=t6[w]
746cabdff1aSopenharmony_ci    UNSCRATCH                4, 12, blockq+1*16
747cabdff1aSopenharmony_ci    VP9_RND_SH_SUMSUB_BA     1,  5,  4,  0, 3, D_8192_REG  ; m1=t3[w], m5=t7[w]
748cabdff1aSopenharmony_ci
; bring inputs 0/3/4/7 back and park the four results just computed
749cabdff1aSopenharmony_ci    UNSCRATCH                0,  8, blockq+16*0
750cabdff1aSopenharmony_ci    UNSCRATCH                3,  9, blockq+16*3
751cabdff1aSopenharmony_ci    UNSCRATCH                4, 10, blockq+16*4
752cabdff1aSopenharmony_ci    UNSCRATCH                7, 11, blockq+16*7
753cabdff1aSopenharmony_ci    SCRATCH                  1,  8, blockq+16*1
754cabdff1aSopenharmony_ci    SCRATCH                  2,  9, blockq+16*2
755cabdff1aSopenharmony_ci    SCRATCH                  5, 10, blockq+16*5
756cabdff1aSopenharmony_ci    SCRATCH                  6, 11, blockq+16*6
757cabdff1aSopenharmony_ci
; t0/t1/t4/t5 from inputs 7,0 and 3,4
758cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2D_4X  7,  0,  1,  2, 16305,  1606    ; m7/1=t1[d], m0/2=t0[d]
759cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2D_4X  3,  4,  5,  6, 10394, 12665    ; m3/5=t5[d], m4/6=t4[d]
760cabdff1aSopenharmony_ci    SCRATCH                  1, 12, blockq+ 0*16
761cabdff1aSopenharmony_ci    VP9_RND_SH_SUMSUB_BA     4,  0,  6,  2, 1, D_8192_REG  ; m4=t0[w], m0=t4[w]
762cabdff1aSopenharmony_ci    UNSCRATCH                1, 12, blockq+ 0*16
763cabdff1aSopenharmony_ci    VP9_RND_SH_SUMSUB_BA     3,  7,  5,  1, 2, D_8192_REG  ; m3=t1[w], m7=t5[w]
764cabdff1aSopenharmony_ci
; fetch t6/t7 for the next stage, park t1/t0 in the slots just freed
765cabdff1aSopenharmony_ci    UNSCRATCH                2,  9, blockq+16*2
766cabdff1aSopenharmony_ci    UNSCRATCH                5, 10, blockq+16*5
767cabdff1aSopenharmony_ci    SCRATCH                  3,  9, blockq+16*3
768cabdff1aSopenharmony_ci    SCRATCH                  4, 10, blockq+16*4
769cabdff1aSopenharmony_ci
770cabdff1aSopenharmony_ci    ; m4=t0, m3=t1, m6=t2, m1=t3, m0=t4, m7=t5, m2=t6, m5=t7
771cabdff1aSopenharmony_ci
772cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2D_4X  0,  7,  1,  3, 15137,  6270    ; m0/1=t5[d], m7/3=t4[d]
773cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2D_4X  5,  2,  4,  6,  6270, 15137    ; m5/4=t6[d], m2/6=t7[d]
774cabdff1aSopenharmony_ci    SCRATCH                  1, 12, blockq+ 0*16
775cabdff1aSopenharmony_ci    VP9_RND_SH_SUMSUB_BA     5,  7,  4,  3, 1, D_8192_REG
776cabdff1aSopenharmony_ci    UNSCRATCH                1, 12, blockq+ 0*16
777cabdff1aSopenharmony_ci    PSIGNW                  m5, W_M1_REG                    ; m5=out1[w], m7=t6[w]
778cabdff1aSopenharmony_ci    VP9_RND_SH_SUMSUB_BA     2,  0,  6,  1, 3, D_8192_REG   ; m2=out6[w], m0=t7[w]
779cabdff1aSopenharmony_ci
; reload t3/t1/t0/t2; out6 is parked in m8 (x86-64) or [blockq+0] (x86-32),
; which is where the x86-32 output convention in the header expects it
780cabdff1aSopenharmony_ci    UNSCRATCH                1,  8, blockq+16*1
781cabdff1aSopenharmony_ci    UNSCRATCH                3,  9, blockq+16*3
782cabdff1aSopenharmony_ci    UNSCRATCH                4, 10, blockq+16*4
783cabdff1aSopenharmony_ci    UNSCRATCH                6, 11, blockq+16*6
784cabdff1aSopenharmony_ci    SCRATCH                  2,  8, blockq+16*0
785cabdff1aSopenharmony_ci
786cabdff1aSopenharmony_ci    SUMSUB_BA                w,  6,  4, 2                   ; m6=out0[w], m4=t2[w]
787cabdff1aSopenharmony_ci    SUMSUB_BA                w,  1,  3, 2
788cabdff1aSopenharmony_ci    PSIGNW                  m1, W_M1_REG                    ; m1=out7[w], m3=t3[w]
789cabdff1aSopenharmony_ci
790cabdff1aSopenharmony_ci    ; m6=out0, m5=out1, m4=t2, m3=t3, m7=t6, m0=t7, m2=out6, m1=out7
791cabdff1aSopenharmony_ci
792cabdff1aSopenharmony_ci    ; unfortunately, the code below overflows in some cases
; (hence the pmulhrsw variant is compiled out with "%if 0" and the
; VP9_UNPACK_MULSUB_2W_4X variant in the %else branch is always used)
793cabdff1aSopenharmony_ci%if 0; cpuflag(ssse3)
794cabdff1aSopenharmony_ci    SUMSUB_BA                w,  3,  4,  2
795cabdff1aSopenharmony_ci    SUMSUB_BA                w,  0,  7,  2
796cabdff1aSopenharmony_ci    pmulhrsw                m3, W_11585x2_REG
797cabdff1aSopenharmony_ci    pmulhrsw                m7, W_11585x2_REG
798cabdff1aSopenharmony_ci    pmulhrsw                m4, W_11585x2_REG               ; out4
799cabdff1aSopenharmony_ci    pmulhrsw                m0, W_11585x2_REG               ; out2
800cabdff1aSopenharmony_ci%else
; m5 (out1) is spilled because the two macros below use m2 and m5 as scratch
801cabdff1aSopenharmony_ci    SCRATCH                  5,  9, blockq+16*1
802cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X  4, 3, 11585, 11585, D_8192_REG, 2, 5
803cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X  7, 0, 11585, 11585, D_8192_REG, 2, 5
804cabdff1aSopenharmony_ci    UNSCRATCH                5,  9, blockq+16*1
805cabdff1aSopenharmony_ci%endif
806cabdff1aSopenharmony_ci    PSIGNW                  m3, W_M1_REG                    ; out3
807cabdff1aSopenharmony_ci    PSIGNW                  m7, W_M1_REG                    ; out5
808cabdff1aSopenharmony_ci
809cabdff1aSopenharmony_ci    ; m6=out0, m5=out1, m0=out2, m3=out3, m4=out4, m7=out5, m2=out6, m1=out7
810cabdff1aSopenharmony_ci
; x86-64: pull out6 back from m8 into m2; the two SWAP chains then rename the
; registers so that outN ends up in mN (on x86-32 out6 stays in [blockq+0])
811cabdff1aSopenharmony_ci%if ARCH_X86_64
812cabdff1aSopenharmony_ci    SWAP                     2, 8
813cabdff1aSopenharmony_ci%endif
814cabdff1aSopenharmony_ci    SWAP                     0, 6, 2
815cabdff1aSopenharmony_ci    SWAP                     7, 1, 5
816cabdff1aSopenharmony_ci%endmacro
817cabdff1aSopenharmony_ci
; IADST8_FN type1, MACRO1, type2, MACRO2, cpu, nregs
;
; Emits vp9_<type1>_<type2>_8x8_add for one instruction set.
;   %1 - name of the first transform (idct/iadst); used in the symbol name and
;        to set first_is_idct
;   %2 - stem of the first 1-D macro, expanded as VP9_%2_1D
;   %3 - name of the second transform; used in the symbol name and to set
;        second_is_idct
;   %4 - stem of the second 1-D macro, expanded as VP9_%4_1D
;   %5 - instruction-set name handed to INIT_XMM
;   %6 - forwarded as the third numeric cglobal argument (xmm register count
;        in x86inc's cglobal convention)
;
; Flow: load the block rows, run VP9_%2_1D, transpose, run VP9_%4_1D, add the
; result to dst through VP9_IDCT8_WRITEOUT and clear the block with ZERO_BLOCK.
; eob appears in the argument name list but is never read here.
;
; NOTE(review): on x86-64 W_11585x2_REG is defined as m15 unconditionally,
; while m15 is only loaded when cpuflag(ssse3) is set -- sse2 users of that
; name would read an uninitialised register; confirm none exist.
; NOTE(review): D_8192_REG and W_M1_REG are not %undef'd at the end.
818cabdff1aSopenharmony_ci%macro IADST8_FN 6
819cabdff1aSopenharmony_ciINIT_XMM %5
820cabdff1aSopenharmony_cicglobal vp9_%1_%3_8x8_add, 3, 3, %6, dst, stride, block, eob
821cabdff1aSopenharmony_ci
822cabdff1aSopenharmony_ci%ifidn %1, idct
823cabdff1aSopenharmony_ci%define first_is_idct 1
824cabdff1aSopenharmony_ci%else
825cabdff1aSopenharmony_ci%define first_is_idct 0
826cabdff1aSopenharmony_ci%endif
827cabdff1aSopenharmony_ci
828cabdff1aSopenharmony_ci%ifidn %3, idct
829cabdff1aSopenharmony_ci%define second_is_idct 1
830cabdff1aSopenharmony_ci%else
831cabdff1aSopenharmony_ci%define second_is_idct 0
832cabdff1aSopenharmony_ci%endif
833cabdff1aSopenharmony_ci
; Row loads. x86-64 loads all eight rows. x86-32 never loads rows 0 and 4 and
; loads rows 3 and 7 only when the first pass is the idct: VP9_IADST8_1D takes
; m0/3/4/7 from block memory on x86-32.
834cabdff1aSopenharmony_ci%if ARCH_X86_64
835cabdff1aSopenharmony_ci    mova                m0, [blockq+  0]    ; IN(0)
836cabdff1aSopenharmony_ci%endif
837cabdff1aSopenharmony_ci    mova                m1, [blockq+ 16]    ; IN(1)
838cabdff1aSopenharmony_ci    mova                m2, [blockq+ 32]    ; IN(2)
839cabdff1aSopenharmony_ci%if ARCH_X86_64 || first_is_idct
840cabdff1aSopenharmony_ci    mova                m3, [blockq+ 48]    ; IN(3)
841cabdff1aSopenharmony_ci%endif
842cabdff1aSopenharmony_ci%if ARCH_X86_64
843cabdff1aSopenharmony_ci    mova                m4, [blockq+ 64]    ; IN(4)
844cabdff1aSopenharmony_ci%endif
845cabdff1aSopenharmony_ci    mova                m5, [blockq+ 80]    ; IN(5)
846cabdff1aSopenharmony_ci    mova                m6, [blockq+ 96]    ; IN(6)
847cabdff1aSopenharmony_ci%if ARCH_X86_64 || first_is_idct
848cabdff1aSopenharmony_ci    mova                m7, [blockq+112]    ; IN(7)
849cabdff1aSopenharmony_ci%endif
; Constants: cached in m13-m15 on x86-64, used as memory operands on x86-32
850cabdff1aSopenharmony_ci%if ARCH_X86_64
851cabdff1aSopenharmony_ci%if cpuflag(ssse3)
852cabdff1aSopenharmony_ci    mova               m15, [pw_11585x2]    ; often used
853cabdff1aSopenharmony_ci%endif
854cabdff1aSopenharmony_ci    mova               m13, [pd_8192]       ; rounding
855cabdff1aSopenharmony_ci    mova               m14, [pw_m1]
856cabdff1aSopenharmony_ci%define W_11585x2_REG m15
857cabdff1aSopenharmony_ci%define D_8192_REG m13
858cabdff1aSopenharmony_ci%define W_M1_REG m14
859cabdff1aSopenharmony_ci%else
860cabdff1aSopenharmony_ci%define W_11585x2_REG [pw_11585x2]
861cabdff1aSopenharmony_ci%define D_8192_REG [pd_8192]
862cabdff1aSopenharmony_ci%define W_M1_REG [pw_m1]
863cabdff1aSopenharmony_ci%endif
864cabdff1aSopenharmony_ci
865cabdff1aSopenharmony_ci    ; note different calling conventions for idct8 vs. iadst8 on x86-32
866cabdff1aSopenharmony_ci    VP9_%2_1D
867cabdff1aSopenharmony_ci%if ARCH_X86_64
868cabdff1aSopenharmony_ci    TRANSPOSE8x8W  0, 1, 2, 3, 4, 5, 6, 7, 8
869cabdff1aSopenharmony_ci%else
; x86-32: the transpose works through [blockq+0]/[blockq+64]. Row 0 is always
; written back to memory; rows 3 and 7 are written back as well when the
; second pass is the iadst, which reads m0/3/4/7 from memory on x86-32.
870cabdff1aSopenharmony_ci    TRANSPOSE8x8W  0, 1, 2, 3, 4, 5, 6, 7, [blockq+0], [blockq+64], 1
871cabdff1aSopenharmony_ci    mova      [blockq+  0], m0
872cabdff1aSopenharmony_ci%if second_is_idct == 0
873cabdff1aSopenharmony_ci    mova      [blockq+ 48], m3
874cabdff1aSopenharmony_ci    mova      [blockq+112], m7
875cabdff1aSopenharmony_ci%endif
876cabdff1aSopenharmony_ci%endif
877cabdff1aSopenharmony_ci    VP9_%4_1D
878cabdff1aSopenharmony_ci
; m6 is about to be zeroed, so on x86-64 its value is renamed to m8 first;
; VP9_IDCT8_WRITEOUT retrieves it with UNSCRATCH 5, 8, blockq+0
879cabdff1aSopenharmony_ci%if ARCH_X86_64
880cabdff1aSopenharmony_ci    SWAP                 6, 8
881cabdff1aSopenharmony_ci%endif
882cabdff1aSopenharmony_ci    pxor                m6, m6  ; used for the block reset, and VP9_STORE_2X
883cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEOUT
884cabdff1aSopenharmony_ci    ZERO_BLOCK      blockq, 16, 8, m6
885cabdff1aSopenharmony_ci    RET
886cabdff1aSopenharmony_ci
887cabdff1aSopenharmony_ci%undef W_11585x2_REG
888cabdff1aSopenharmony_ci%undef first_is_idct
889cabdff1aSopenharmony_ci%undef second_is_idct
890cabdff1aSopenharmony_ci
891cabdff1aSopenharmony_ci%endmacro
892cabdff1aSopenharmony_ci
; Instantiate the three mixed 8x8 transforms (idct/iadst, iadst/idct,
; iadst/iadst) for sse2, ssse3 and avx. The last argument is the register
; count forwarded to cglobal: ssse3/avx ask for 16 because they load
; pw_11585x2 into m15 on x86-64, sse2 stops at m14 and asks for 15.
893cabdff1aSopenharmony_ciIADST8_FN idct,  IDCT8,  iadst, IADST8, sse2, 15
894cabdff1aSopenharmony_ciIADST8_FN iadst, IADST8, idct,  IDCT8,  sse2, 15
895cabdff1aSopenharmony_ciIADST8_FN iadst, IADST8, iadst, IADST8, sse2, 15
896cabdff1aSopenharmony_ciIADST8_FN idct,  IDCT8,  iadst, IADST8, ssse3, 16
897cabdff1aSopenharmony_ciIADST8_FN idct,  IDCT8,  iadst, IADST8, avx, 16
898cabdff1aSopenharmony_ciIADST8_FN iadst, IADST8, idct,  IDCT8,  ssse3, 16
899cabdff1aSopenharmony_ciIADST8_FN iadst, IADST8, idct,  IDCT8,  avx, 16
900cabdff1aSopenharmony_ciIADST8_FN iadst, IADST8, iadst, IADST8, ssse3, 16
901cabdff1aSopenharmony_ciIADST8_FN iadst, IADST8, iadst, IADST8, avx, 16
902cabdff1aSopenharmony_ci
903cabdff1aSopenharmony_ci;---------------------------------------------------------------------------------------------
904cabdff1aSopenharmony_ci; void vp9_idct_idct_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
905cabdff1aSopenharmony_ci;---------------------------------------------------------------------------------------------
906cabdff1aSopenharmony_ci
907cabdff1aSopenharmony_ci; x86-64:
908cabdff1aSopenharmony_ci; at the end of this macro, m7 is stored in [%4+15*%5]
909cabdff1aSopenharmony_ci; everything else (t0-6 and t8-15) is stored in m0-6 and m8-15
910cabdff1aSopenharmony_ci; the following sumsubs have not been done yet:
911cabdff1aSopenharmony_ci;    SUMSUB_BA            w,  6,  9, 15      ; t6, t9
912cabdff1aSopenharmony_ci;    SUMSUB_BA            w,  7,  8, 15      ; t7, t8
913cabdff1aSopenharmony_ci; or (x86-32) t0-t5 are in m0-m5, t10-t15 are in x11/9/7/5/3/1,
914cabdff1aSopenharmony_ci; and the following sumsubs have not been done yet:
915cabdff1aSopenharmony_ci;    SUMSUB_BA            w, x13, x14, 7       ; t6, t9
916cabdff1aSopenharmony_ci;    SUMSUB_BA            w, x15, x12, 7       ; t7, t8
917cabdff1aSopenharmony_ci
918cabdff1aSopenharmony_ci%macro VP9_IDCT16_1D_START 6 ; src, nnzc, stride, scratch, scratch_stride, is_iadst
919cabdff1aSopenharmony_ci%if %2 <= 4
920cabdff1aSopenharmony_ci    mova                m3, [%1+ 1*%3]      ; IN(1)
921cabdff1aSopenharmony_ci    mova                m0, [%1+ 3*%3]      ; IN(3)
922cabdff1aSopenharmony_ci
923cabdff1aSopenharmony_ci    pmulhrsw            m4, m3,  [pw_16305x2]       ; t14-15
924cabdff1aSopenharmony_ci    pmulhrsw            m3, [pw_1606x2]             ; t8-9
925cabdff1aSopenharmony_ci    pmulhrsw            m7, m0,  [pw_m4756x2]       ; t10-11
926cabdff1aSopenharmony_ci    pmulhrsw            m0, [pw_15679x2]            ; t12-13
927cabdff1aSopenharmony_ci
928cabdff1aSopenharmony_ci    ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m14=t5, m13=t6, m15=t7
929cabdff1aSopenharmony_ci    ; m3=t8, m5=t9, m1=t10, m7=t11, m0=t12, m6=t13, m2=t14, m4=t15
930cabdff1aSopenharmony_ci
931cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X 2, 5, 4, 3, 15137,  6270, [pd_8192], 1, 6 ; t9,  t14
932cabdff1aSopenharmony_ci    SCRATCH              4, 10, %4+ 1*%5
933cabdff1aSopenharmony_ci    SCRATCH              5, 11, %4+ 7*%5
934cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X 6, 1, 0, 7, 6270, m15137, [pd_8192], 4, 5 ; t10, t13
935cabdff1aSopenharmony_ci    UNSCRATCH            5, 11, %4+ 7*%5
936cabdff1aSopenharmony_ci
937cabdff1aSopenharmony_ci    ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
938cabdff1aSopenharmony_ci    ; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15
939cabdff1aSopenharmony_ci%else
940cabdff1aSopenharmony_ci    mova                m5, [%1+ 1*%3]      ; IN(1)
941cabdff1aSopenharmony_ci    mova                m4, [%1+ 7*%3]      ; IN(7)
942cabdff1aSopenharmony_ci%if %2 <= 8
943cabdff1aSopenharmony_ci    pmulhrsw            m2, m5,  [pw_16305x2]       ; t15
944cabdff1aSopenharmony_ci    pmulhrsw            m5, [pw_1606x2]             ; t8
945cabdff1aSopenharmony_ci    pmulhrsw            m3, m4,  [pw_m10394x2]      ; t9
946cabdff1aSopenharmony_ci    pmulhrsw            m4, [pw_12665x2]            ; t14
947cabdff1aSopenharmony_ci%else
948cabdff1aSopenharmony_ci    mova                m3, [%1+ 9*%3]      ; IN(9)
949cabdff1aSopenharmony_ci    mova                m2, [%1+15*%3]      ; IN(15)
950cabdff1aSopenharmony_ci
951cabdff1aSopenharmony_ci    ; m10=in0, m5=in1, m14=in2, m6=in3, m9=in4, m7=in5, m15=in6, m4=in7
952cabdff1aSopenharmony_ci    ; m11=in8, m3=in9, m12=in10 m0=in11, m8=in12, m1=in13, m13=in14, m2=in15
953cabdff1aSopenharmony_ci
954cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X   5,   2, 16305,  1606, [pd_8192], 0, 1 ; t8,  t15
955cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X   3,   4, 10394, 12665, [pd_8192], 0, 1 ; t9,  t14
956cabdff1aSopenharmony_ci%endif
957cabdff1aSopenharmony_ci
958cabdff1aSopenharmony_ci    SUMSUB_BA            w,  3,  5, 0       ; t8,  t9
959cabdff1aSopenharmony_ci    SUMSUB_BA            w,  4,  2, 0       ; t15, t14
960cabdff1aSopenharmony_ci
961cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X   2,   5, 15137,  6270, [pd_8192], 0, 1 ; t9,  t14
962cabdff1aSopenharmony_ci
963cabdff1aSopenharmony_ci    SCRATCH              4, 10, %4+ 1*%5
964cabdff1aSopenharmony_ci    SCRATCH              5, 11, %4+ 7*%5
965cabdff1aSopenharmony_ci
966cabdff1aSopenharmony_ci    mova                m6, [%1+ 3*%3]      ; IN(3)
967cabdff1aSopenharmony_ci    mova                m7, [%1+ 5*%3]      ; IN(5)
968cabdff1aSopenharmony_ci%if %2 <= 8
969cabdff1aSopenharmony_ci    pmulhrsw            m0, m7,  [pw_14449x2]       ; t13
970cabdff1aSopenharmony_ci    pmulhrsw            m7, [pw_7723x2]             ; t10
971cabdff1aSopenharmony_ci    pmulhrsw            m1, m6,  [pw_m4756x2]       ; t11
972cabdff1aSopenharmony_ci    pmulhrsw            m6, [pw_15679x2]            ; t12
973cabdff1aSopenharmony_ci%else
974cabdff1aSopenharmony_ci    mova                m0, [%1+11*%3]      ; IN(11)
975cabdff1aSopenharmony_ci    mova                m1, [%1+13*%3]      ; IN(13)
976cabdff1aSopenharmony_ci
977cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X   7,   0, 14449,  7723, [pd_8192], 4, 5 ; t10, t13
978cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X   1,   6,  4756, 15679, [pd_8192], 4, 5 ; t11, t12
979cabdff1aSopenharmony_ci%endif
980cabdff1aSopenharmony_ci
981cabdff1aSopenharmony_ci    ; m11=t0, m10=t1, m9=t2, m8=t3, m14=t4, m12=t5, m15=t6, m13=t7
982cabdff1aSopenharmony_ci    ; m5=t8, m3=t9, m7=t10, m1=t11, m6=t12, m0=t13, m4=t14, m2=t15
983cabdff1aSopenharmony_ci
984cabdff1aSopenharmony_ci    SUMSUB_BA            w,  7,  1, 4       ; t11, t10
985cabdff1aSopenharmony_ci    SUMSUB_BA            w,  0,  6, 4       ; t12, t13
986cabdff1aSopenharmony_ci
987cabdff1aSopenharmony_ci    ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m14=t5, m13=t6, m15=t7
988cabdff1aSopenharmony_ci    ; m3=t8, m5=t9, m1=t10, m7=t11, m0=t12, m6=t13, m2=t14, m4=t15
989cabdff1aSopenharmony_ci
990cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X   6,   1, 6270, m15137, [pd_8192], 4, 5 ; t10, t13
991cabdff1aSopenharmony_ci
992cabdff1aSopenharmony_ci    UNSCRATCH            5, 11, %4+ 7*%5
993cabdff1aSopenharmony_ci%endif
994cabdff1aSopenharmony_ci
995cabdff1aSopenharmony_ci    ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m13=t5, m14=t6, m15=t7
996cabdff1aSopenharmony_ci    ; m3=t8, m2=t9, m6=t10, m7=t11, m0=t12, m1=t13, m5=t14, m4=t15
997cabdff1aSopenharmony_ci
998cabdff1aSopenharmony_ci    SUMSUB_BA            w,  7,  3, 4       ; t8,  t11
999cabdff1aSopenharmony_ci
1000cabdff1aSopenharmony_ci    ; backup first register
1001cabdff1aSopenharmony_ci    mova        [%4+15*%5], m7
1002cabdff1aSopenharmony_ci
1003cabdff1aSopenharmony_ci    SUMSUB_BA            w,  6,  2, 7       ; t9,  t10
1004cabdff1aSopenharmony_ci    UNSCRATCH            4, 10, %4+ 1*%5
1005cabdff1aSopenharmony_ci    SUMSUB_BA            w,  0,  4, 7       ; t15, t12
1006cabdff1aSopenharmony_ci    SUMSUB_BA            w,  1,  5, 7       ; t14. t13
1007cabdff1aSopenharmony_ci
1008cabdff1aSopenharmony_ci    ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
1009cabdff1aSopenharmony_ci    ; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15
1010cabdff1aSopenharmony_ci
1011cabdff1aSopenharmony_ci%if cpuflag(ssse3) && %6 == 0
1012cabdff1aSopenharmony_ci    SUMSUB_BA            w,  2,  5, 7
1013cabdff1aSopenharmony_ci    SUMSUB_BA            w,  3,  4, 7
1014cabdff1aSopenharmony_ci    pmulhrsw            m5, [pw_11585x2]    ; t10
1015cabdff1aSopenharmony_ci    pmulhrsw            m4, [pw_11585x2]    ; t11
1016cabdff1aSopenharmony_ci    pmulhrsw            m3, [pw_11585x2]    ; t12
1017cabdff1aSopenharmony_ci    pmulhrsw            m2, [pw_11585x2]    ; t13
1018cabdff1aSopenharmony_ci%else
1019cabdff1aSopenharmony_ci    SCRATCH              6, 10, %4+ 1*%5
1020cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X   5,   2, 11585, 11585, [pd_8192], 6, 7 ; t10, t13
1021cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X   4,   3, 11585, 11585, [pd_8192], 6, 7 ; t11, t12
1022cabdff1aSopenharmony_ci    UNSCRATCH            6, 10, %4+ 1*%5
1023cabdff1aSopenharmony_ci%endif
1024cabdff1aSopenharmony_ci
1025cabdff1aSopenharmony_ci    ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
1026cabdff1aSopenharmony_ci    ; m7=t8, m6=t9, m5=t10, m4=t11, m3=t12, m2=t13, m1=t14, m0=t15
1027cabdff1aSopenharmony_ci
1028cabdff1aSopenharmony_ci    SCRATCH              0,  8, %4+ 1*%5
1029cabdff1aSopenharmony_ci    SCRATCH              1,  9, %4+ 3*%5
1030cabdff1aSopenharmony_ci    SCRATCH              2, 10, %4+ 5*%5
1031cabdff1aSopenharmony_ci    SCRATCH              3, 11, %4+ 7*%5
1032cabdff1aSopenharmony_ci    SCRATCH              4, 12, %4+ 9*%5
1033cabdff1aSopenharmony_ci    SCRATCH              5, 13, %4+11*%5
1034cabdff1aSopenharmony_ci    SCRATCH              6, 14, %4+13*%5
1035cabdff1aSopenharmony_ci
1036cabdff1aSopenharmony_ci    ; even (tx8x8)
1037cabdff1aSopenharmony_ci%if %2 <= 4
1038cabdff1aSopenharmony_ci    mova                m3, [%1+ 0*%3]      ; IN(0)
1039cabdff1aSopenharmony_ci    mova                m4, [%1+ 2*%3]      ; IN(2)
1040cabdff1aSopenharmony_ci
1041cabdff1aSopenharmony_ci    pmulhrsw            m3, [pw_11585x2]    ; t0-t3
1042cabdff1aSopenharmony_ci    pmulhrsw            m7, m4, [pw_16069x2]        ; t6-7
1043cabdff1aSopenharmony_ci    pmulhrsw            m4, [pw_3196x2]             ; t4-5
1044cabdff1aSopenharmony_ci
1045cabdff1aSopenharmony_ci%if 0 ; overflows :(
1046cabdff1aSopenharmony_ci    paddw               m6, m7, m4
1047cabdff1aSopenharmony_ci    psubw               m5, m7, m4
1048cabdff1aSopenharmony_ci    pmulhrsw            m5, [pw_11585x2]            ; t5
1049cabdff1aSopenharmony_ci    pmulhrsw            m6, [pw_11585x2]            ; t6
1050cabdff1aSopenharmony_ci%else
1051cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X  5, 6, 7, 4, 11585, 11585, [pd_8192], 0, 1 ; t5,  t6
1052cabdff1aSopenharmony_ci%endif
1053cabdff1aSopenharmony_ci
1054cabdff1aSopenharmony_ci    psubw               m0, m3, m7
1055cabdff1aSopenharmony_ci    paddw               m7, m3
1056cabdff1aSopenharmony_ci    psubw               m1, m3, m6
1057cabdff1aSopenharmony_ci    paddw               m6, m3
1058cabdff1aSopenharmony_ci    psubw               m2, m3, m5
1059cabdff1aSopenharmony_ci    paddw               m5, m3
1060cabdff1aSopenharmony_ci
1061cabdff1aSopenharmony_ci%if ARCH_X86_32
1062cabdff1aSopenharmony_ci    SWAP                 0, 7
1063cabdff1aSopenharmony_ci%endif
1064cabdff1aSopenharmony_ci    SCRATCH              7, 15, %4+12*%5
1065cabdff1aSopenharmony_ci%else
1066cabdff1aSopenharmony_ci    mova                m6, [%1+ 2*%3]      ; IN(2)
1067cabdff1aSopenharmony_ci    mova                m1, [%1+ 4*%3]      ; IN(4)
1068cabdff1aSopenharmony_ci    mova                m7, [%1+ 6*%3]      ; IN(6)
1069cabdff1aSopenharmony_ci%if %2 <= 8
1070cabdff1aSopenharmony_ci    pmulhrsw            m0, m1,  [pw_15137x2]       ; t3
1071cabdff1aSopenharmony_ci    pmulhrsw            m1, [pw_6270x2]             ; t2
1072cabdff1aSopenharmony_ci    pmulhrsw            m5, m6, [pw_16069x2]        ; t7
1073cabdff1aSopenharmony_ci    pmulhrsw            m6, [pw_3196x2]             ; t4
1074cabdff1aSopenharmony_ci    pmulhrsw            m4, m7, [pw_m9102x2]        ; t5
1075cabdff1aSopenharmony_ci    pmulhrsw            m7, [pw_13623x2]            ; t6
1076cabdff1aSopenharmony_ci%else
1077cabdff1aSopenharmony_ci    mova                m4, [%1+10*%3]      ; IN(10)
1078cabdff1aSopenharmony_ci    mova                m0, [%1+12*%3]      ; IN(12)
1079cabdff1aSopenharmony_ci    mova                m5, [%1+14*%3]      ; IN(14)
1080cabdff1aSopenharmony_ci
1081cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X   1,   0, 15137,  6270, [pd_8192], 2, 3 ; t2,  t3
1082cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X   6,   5, 16069,  3196, [pd_8192], 2, 3 ; t4,  t7
1083cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X   4,   7,  9102, 13623, [pd_8192], 2, 3 ; t5,  t6
1084cabdff1aSopenharmony_ci%endif
1085cabdff1aSopenharmony_ci
1086cabdff1aSopenharmony_ci    SUMSUB_BA            w,  4,  6, 2       ; t4,  t5
1087cabdff1aSopenharmony_ci    SUMSUB_BA            w,  7,  5, 2       ; t7,  t6
1088cabdff1aSopenharmony_ci
1089cabdff1aSopenharmony_ci%if cpuflag(ssse3) && %6 == 0
1090cabdff1aSopenharmony_ci    SUMSUB_BA            w,  6,  5, 2
1091cabdff1aSopenharmony_ci    pmulhrsw            m5, [pw_11585x2]                              ; t5
1092cabdff1aSopenharmony_ci    pmulhrsw            m6, [pw_11585x2]                              ; t6
1093cabdff1aSopenharmony_ci%else
1094cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X  5,  6, 11585, 11585, [pd_8192], 2, 3 ; t5,  t6
1095cabdff1aSopenharmony_ci%endif
1096cabdff1aSopenharmony_ci
1097cabdff1aSopenharmony_ci    SCRATCH              5, 15, %4+10*%5
1098cabdff1aSopenharmony_ci    mova                m2, [%1+ 0*%3]      ; IN(0)
1099cabdff1aSopenharmony_ci%if %2 <= 8
1100cabdff1aSopenharmony_ci    pmulhrsw            m2, [pw_11585x2]    ; t0 and t1
1101cabdff1aSopenharmony_ci    psubw               m3, m2, m0
1102cabdff1aSopenharmony_ci    paddw               m0, m2
1103cabdff1aSopenharmony_ci
1104cabdff1aSopenharmony_ci    SUMSUB_BA            w,  7,  0, 5       ; t0,  t7
1105cabdff1aSopenharmony_ci%else
1106cabdff1aSopenharmony_ci    mova                m3, [%1+ 8*%3]      ; IN(8)
1107cabdff1aSopenharmony_ci
1108cabdff1aSopenharmony_ci    ; from 3 stages back
1109cabdff1aSopenharmony_ci%if cpuflag(ssse3) && %6 == 0
1110cabdff1aSopenharmony_ci    SUMSUB_BA            w,  3,  2, 5
1111cabdff1aSopenharmony_ci    pmulhrsw            m3, [pw_11585x2]    ; t0
1112cabdff1aSopenharmony_ci    pmulhrsw            m2, [pw_11585x2]    ; t1
1113cabdff1aSopenharmony_ci%else
1114cabdff1aSopenharmony_ci    mova        [%1+ 0*%3], m0
1115cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X  2,  3, 11585,  11585, [pd_8192], 5, 0 ; t0, t1
1116cabdff1aSopenharmony_ci    mova                m0, [%1+ 0*%3]
1117cabdff1aSopenharmony_ci%endif
1118cabdff1aSopenharmony_ci
1119cabdff1aSopenharmony_ci    ; from 2 stages back
1120cabdff1aSopenharmony_ci    SUMSUB_BA            w,  0,  3, 5      ; t0,  t3
1121cabdff1aSopenharmony_ci
1122cabdff1aSopenharmony_ci    SUMSUB_BA            w,  7,  0, 5      ; t0,  t7
1123cabdff1aSopenharmony_ci%endif
1124cabdff1aSopenharmony_ci    UNSCRATCH            5, 15, %4+10*%5
1125cabdff1aSopenharmony_ci%if ARCH_X86_32
1126cabdff1aSopenharmony_ci    SWAP                 0, 7
1127cabdff1aSopenharmony_ci%endif
1128cabdff1aSopenharmony_ci    SCRATCH              7, 15, %4+12*%5
1129cabdff1aSopenharmony_ci    SUMSUB_BA            w,  1,  2, 7       ; t1,  t2
1130cabdff1aSopenharmony_ci
1131cabdff1aSopenharmony_ci    ; from 1 stage back
1132cabdff1aSopenharmony_ci    SUMSUB_BA            w,  6,  1, 7       ; t1,  t6
1133cabdff1aSopenharmony_ci    SUMSUB_BA            w,  5,  2, 7       ; t2,  t5
1134cabdff1aSopenharmony_ci%endif
1135cabdff1aSopenharmony_ci    SUMSUB_BA            w,  4,  3, 7       ; t3,  t4
1136cabdff1aSopenharmony_ci
1137cabdff1aSopenharmony_ci%if ARCH_X86_64
1138cabdff1aSopenharmony_ci    SWAP                 0, 8
1139cabdff1aSopenharmony_ci    SWAP                 1, 9
1140cabdff1aSopenharmony_ci    SWAP                 2, 10
1141cabdff1aSopenharmony_ci    SWAP                 3, 11
1142cabdff1aSopenharmony_ci    SWAP                 4, 12
1143cabdff1aSopenharmony_ci    SWAP                 5, 13
1144cabdff1aSopenharmony_ci    SWAP                 6, 14
1145cabdff1aSopenharmony_ci
1146cabdff1aSopenharmony_ci    SUMSUB_BA            w,  0, 15, 7       ; t0, t15
1147cabdff1aSopenharmony_ci    SUMSUB_BA            w,  1, 14, 7       ; t1, t14
1148cabdff1aSopenharmony_ci    SUMSUB_BA            w,  2, 13, 7       ; t2, t13
1149cabdff1aSopenharmony_ci    SUMSUB_BA            w,  3, 12, 7       ; t3, t12
1150cabdff1aSopenharmony_ci    SUMSUB_BA            w,  4, 11, 7       ; t4, t11
1151cabdff1aSopenharmony_ci    SUMSUB_BA            w,  5, 10, 7       ; t5, t10
1152cabdff1aSopenharmony_ci%else
1153cabdff1aSopenharmony_ci    SWAP                 1, 6
1154cabdff1aSopenharmony_ci    SWAP                 2, 5
1155cabdff1aSopenharmony_ci    SWAP                 3, 4
1156cabdff1aSopenharmony_ci    mova        [%4+14*%5], m6
1157cabdff1aSopenharmony_ci
1158cabdff1aSopenharmony_ci%macro %%SUMSUB_BA_STORE 5 ; reg, from_mem, to_mem, scratch, scratch_stride
1159cabdff1aSopenharmony_ci    mova                m6, [%4+%2*%5]
1160cabdff1aSopenharmony_ci    SUMSUB_BA            w,  6, %1, 7
1161cabdff1aSopenharmony_ci    SWAP                %1, 6
1162cabdff1aSopenharmony_ci    mova        [%4+%3*%5], m6
1163cabdff1aSopenharmony_ci%endmacro
1164cabdff1aSopenharmony_ci
1165cabdff1aSopenharmony_ci    %%SUMSUB_BA_STORE    0,  1,  1, %4, %5  ; t0, t15
1166cabdff1aSopenharmony_ci    %%SUMSUB_BA_STORE    1,  3,  3, %4, %5  ; t1, t14
1167cabdff1aSopenharmony_ci    %%SUMSUB_BA_STORE    2,  5,  5, %4, %5  ; t2, t13
1168cabdff1aSopenharmony_ci    %%SUMSUB_BA_STORE    3,  7,  7, %4, %5  ; t3, t12
1169cabdff1aSopenharmony_ci    %%SUMSUB_BA_STORE    4,  9,  9, %4, %5  ; t4, t11
1170cabdff1aSopenharmony_ci    %%SUMSUB_BA_STORE    5, 11, 11, %4, %5  ; t5, t10
1171cabdff1aSopenharmony_ci%endif
1172cabdff1aSopenharmony_ci%endmacro
1173cabdff1aSopenharmony_ci
; VP9_IDCT16_1D src, pass[, nnzc = 16[, is_iadst = 1]]
;
; Runs VP9_IDCT16_1D_START on src and then finishes the pass.
;   pass == 1: scratch slots are at tmpq with a 16-byte stride. The remaining
;              t6/t9 and t7/t8 butterflies are computed, the result is
;              transposed as two 8x8 word blocks and written to tmpq.
;   pass == 2: scratch slots are at src with a 32-byte stride. The rows are
;              handed to VP9_IDCT8_WRITEx2 two at a time together with
;              ROUND_REG, and dstq is advanced by 2*stride between pairs.
; nnzc and is_iadst are forwarded unchanged to VP9_IDCT16_1D_START.
; Uses tmpq in pass 1 and dstq/strideq in pass 2; the caller must have them
; defined.
1174cabdff1aSopenharmony_ci%macro VP9_IDCT16_1D 2-4 16, 1 ; src, pass, nnzc, is_iadst
1175cabdff1aSopenharmony_ci%if %2 == 1
1176cabdff1aSopenharmony_ci    VP9_IDCT16_1D_START %1, %3, 32, tmpq, 16, %4
1177cabdff1aSopenharmony_ci
1178cabdff1aSopenharmony_ci%if ARCH_X86_64
1179cabdff1aSopenharmony_ci    ; backup a different register
    ; m7 is (re)loaded from scratch slot 15 and m15 is parked in slot 1; m15
    ; is then passed as the last operand of SUMSUB_BA and TRANSPOSE8x8W below
    ; and reloaded from slot 1 before the second transpose.
1180cabdff1aSopenharmony_ci    mova                m7, [tmpq+15*16]
1181cabdff1aSopenharmony_ci    mova      [tmpq+ 1*16], m15
1182cabdff1aSopenharmony_ci
1183cabdff1aSopenharmony_ci    SUMSUB_BA            w,  6,  9, 15      ; t6, t9
1184cabdff1aSopenharmony_ci    SUMSUB_BA            w,  7,  8, 15      ; t7, t8
1185cabdff1aSopenharmony_ci
    ; first 8x8 block (m0-m7) goes to the even 16-byte slots of tmpq
1186cabdff1aSopenharmony_ci    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, 15
1187cabdff1aSopenharmony_ci    mova        [tmpq+  0], m0
1188cabdff1aSopenharmony_ci    mova        [tmpq+ 32], m1
1189cabdff1aSopenharmony_ci    mova        [tmpq+ 64], m2
1190cabdff1aSopenharmony_ci    mova        [tmpq+ 96], m3
1191cabdff1aSopenharmony_ci    mova        [tmpq+128], m4
1192cabdff1aSopenharmony_ci    mova        [tmpq+160], m5
1193cabdff1aSopenharmony_ci    mova        [tmpq+192], m6
1194cabdff1aSopenharmony_ci    mova        [tmpq+224], m7
1195cabdff1aSopenharmony_ci
    ; second 8x8 block (m8-m15) goes to the odd 16-byte slots of tmpq
1196cabdff1aSopenharmony_ci    mova               m15, [tmpq+ 1*16]
1197cabdff1aSopenharmony_ci    TRANSPOSE8x8W        8, 9, 10, 11, 12, 13, 14, 15, 0
1198cabdff1aSopenharmony_ci    mova        [tmpq+ 16], m8
1199cabdff1aSopenharmony_ci    mova        [tmpq+ 48], m9
1200cabdff1aSopenharmony_ci    mova        [tmpq+ 80], m10
1201cabdff1aSopenharmony_ci    mova        [tmpq+112], m11
1202cabdff1aSopenharmony_ci    mova        [tmpq+144], m12
1203cabdff1aSopenharmony_ci    mova        [tmpq+176], m13
1204cabdff1aSopenharmony_ci    mova        [tmpq+208], m14
1205cabdff1aSopenharmony_ci    mova        [tmpq+240], m15
1206cabdff1aSopenharmony_ci%else
    ; x86-32 only has eight vector registers, so the butterfly operands are
    ; fetched from and written back to the tmpq scratch slots.
1207cabdff1aSopenharmony_ci    mova                m6, [tmpq+13*16]
1208cabdff1aSopenharmony_ci    mova                m7, [tmpq+14*16]
1209cabdff1aSopenharmony_ci    SUMSUB_BA            w, 6, 7                ; t6, t9
1210cabdff1aSopenharmony_ci    mova      [tmpq+14*16], m6
1211cabdff1aSopenharmony_ci    mova      [tmpq+13*16], m7
1212cabdff1aSopenharmony_ci    mova                m7, [tmpq+15*16]
1213cabdff1aSopenharmony_ci    mova                m6, [tmpq+12*16]
1214cabdff1aSopenharmony_ci    SUMSUB_BA            w, 7, 6                ; t7, t8
1215cabdff1aSopenharmony_ci    mova      [tmpq+15*16], m6
1216cabdff1aSopenharmony_ci
    ; NOTE(review): m6 is not loaded before and m4 is not stored after this
    ; transpose; presumably TRANSPOSE8x8W takes row 6 from its first memory
    ; operand and writes row 4 to its second one - confirm in x86util.asm.
1217cabdff1aSopenharmony_ci    TRANSPOSE8x8W       0, 1, 2, 3, 4, 5, 6, 7, [tmpq+14*16], [tmpq+ 8*16], 1
1218cabdff1aSopenharmony_ci    mova     [tmpq+ 0*16], m0
1219cabdff1aSopenharmony_ci    mova     [tmpq+ 2*16], m1
1220cabdff1aSopenharmony_ci    mova     [tmpq+ 4*16], m2
1221cabdff1aSopenharmony_ci    mova     [tmpq+ 6*16], m3
1222cabdff1aSopenharmony_ci    mova     [tmpq+10*16], m5
1223cabdff1aSopenharmony_ci    mova     [tmpq+12*16], m6
1224cabdff1aSopenharmony_ci    mova     [tmpq+14*16], m7
1225cabdff1aSopenharmony_ci
    ; second half: operands are read from the odd slots in descending order
    ; (15, 13, ..., 1), slot 3 being supplied as the transpose memory operand
1226cabdff1aSopenharmony_ci    mova                m0, [tmpq+15*16]
1227cabdff1aSopenharmony_ci    mova                m1, [tmpq+13*16]
1228cabdff1aSopenharmony_ci    mova                m2, [tmpq+11*16]
1229cabdff1aSopenharmony_ci    mova                m3, [tmpq+ 9*16]
1230cabdff1aSopenharmony_ci    mova                m4, [tmpq+ 7*16]
1231cabdff1aSopenharmony_ci    mova                m5, [tmpq+ 5*16]
1232cabdff1aSopenharmony_ci    mova                m7, [tmpq+ 1*16]
1233cabdff1aSopenharmony_ci    TRANSPOSE8x8W       0, 1, 2, 3, 4, 5, 6, 7, [tmpq+ 3*16], [tmpq+ 9*16], 1
1234cabdff1aSopenharmony_ci    mova     [tmpq+ 1*16], m0
1235cabdff1aSopenharmony_ci    mova     [tmpq+ 3*16], m1
1236cabdff1aSopenharmony_ci    mova     [tmpq+ 5*16], m2
1237cabdff1aSopenharmony_ci    mova     [tmpq+ 7*16], m3
1238cabdff1aSopenharmony_ci    mova     [tmpq+11*16], m5
1239cabdff1aSopenharmony_ci    mova     [tmpq+13*16], m6
1240cabdff1aSopenharmony_ci    mova     [tmpq+15*16], m7
1241cabdff1aSopenharmony_ci%endif
1242cabdff1aSopenharmony_ci%else ; %2 == 2
1243cabdff1aSopenharmony_ci    VP9_IDCT16_1D_START %1, %3, 32, %1, 32, %4
1244cabdff1aSopenharmony_ci
    ; rounding constant handed to VP9_IDCT8_WRITEx2: pw_512 when ssse3 is
    ; available, pw_32 otherwise
1245cabdff1aSopenharmony_ci%if cpuflag(ssse3)
1246cabdff1aSopenharmony_ci%define ROUND_REG [pw_512]
1247cabdff1aSopenharmony_ci%else
1248cabdff1aSopenharmony_ci%define ROUND_REG [pw_32]
1249cabdff1aSopenharmony_ci%endif
1250cabdff1aSopenharmony_ci
    ; m7 = 0, used as the zero operand of VP9_IDCT8_WRITEx2
1251cabdff1aSopenharmony_ci    pxor                m7, m7
1252cabdff1aSopenharmony_ci%if ARCH_X86_64
1253cabdff1aSopenharmony_ci    ; backup more registers
    ; m8/m9 are parked in slots 2 and 3 of src so they can serve as the
    ; temporaries of the first three write calls
1254cabdff1aSopenharmony_ci    mova        [%1+ 2*32], m8
1255cabdff1aSopenharmony_ci    mova        [%1+ 3*32], m9
1256cabdff1aSopenharmony_ci
1257cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2    0,  1, 8, 9, 7, ROUND_REG, 6
1258cabdff1aSopenharmony_ci    lea               dstq, [dstq+strideq*2]
1259cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2    2,  3, 8, 9, 7, ROUND_REG, 6
1260cabdff1aSopenharmony_ci    lea               dstq, [dstq+strideq*2]
1261cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2    4,  5, 8, 9, 7, ROUND_REG, 6
1262cabdff1aSopenharmony_ci    lea               dstq, [dstq+strideq*2]
1263cabdff1aSopenharmony_ci
1264cabdff1aSopenharmony_ci    ; restore from cache
1265cabdff1aSopenharmony_ci    SWAP                 0, 7               ; move zero from m7 to m0
1266cabdff1aSopenharmony_ci    mova                m7, [%1+15*32]
1267cabdff1aSopenharmony_ci    mova                m8, [%1+ 2*32]
1268cabdff1aSopenharmony_ci    mova                m9, [%1+ 3*32]
1269cabdff1aSopenharmony_ci
1270cabdff1aSopenharmony_ci    SUMSUB_BA            w,  6,  9, 3       ; t6, t9
1271cabdff1aSopenharmony_ci    SUMSUB_BA            w,  7,  8, 3       ; t7, t8
1272cabdff1aSopenharmony_ci
    ; remaining ten rows; m0 is the zero operand from here on, and dstq is
    ; not advanced after the last pair
1273cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2    6,  7, 3, 4, 0, ROUND_REG, 6
1274cabdff1aSopenharmony_ci    lea               dstq, [dstq+strideq*2]
1275cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2    8,  9, 3, 4, 0, ROUND_REG, 6
1276cabdff1aSopenharmony_ci    lea               dstq, [dstq+strideq*2]
1277cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2   10, 11, 1, 2, 0, ROUND_REG, 6
1278cabdff1aSopenharmony_ci    lea               dstq, [dstq+strideq*2]
1279cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2   12, 13, 1, 2, 0, ROUND_REG, 6
1280cabdff1aSopenharmony_ci    lea               dstq, [dstq+strideq*2]
1281cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2   14, 15, 1, 2, 0, ROUND_REG, 6
1282cabdff1aSopenharmony_ci%else
    ; NOTE(review): this branch addresses the scratch slots through tmpq,
    ; while VP9_IDCT16_1D_START above was given src as the scratch base. The
    ; pass-2 invocations in VP9_IDCT_IDCT_16x16_ADD_XMM pass tmpq as src, so
    ; both name the same memory there.
    ; m5 is parked in slot 0 so that m5/m6 can be used as temporaries
1283cabdff1aSopenharmony_ci    mova      [tmpq+ 0*32], m5
1284cabdff1aSopenharmony_ci
1285cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2    0,  1, 5, 6, 7, ROUND_REG, 6
1286cabdff1aSopenharmony_ci    lea               dstq, [dstq+strideq*2]
1287cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2    2,  3, 5, 6, 7, ROUND_REG, 6
1288cabdff1aSopenharmony_ci    lea               dstq, [dstq+strideq*2]
1289cabdff1aSopenharmony_ci
1290cabdff1aSopenharmony_ci    SWAP                 0, 7               ; move zero from m7 to m0
1291cabdff1aSopenharmony_ci    mova                m5, [tmpq+ 0*32]
1292cabdff1aSopenharmony_ci
1293cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2    4,  5, 1, 2, 0, ROUND_REG, 6
1294cabdff1aSopenharmony_ci    lea               dstq, [dstq+strideq*2]
1295cabdff1aSopenharmony_ci
    ; fetch slots 12-15 and do both remaining butterflies in one macro call
1296cabdff1aSopenharmony_ci    mova                m4, [tmpq+13*32]
1297cabdff1aSopenharmony_ci    mova                m7, [tmpq+14*32]
1298cabdff1aSopenharmony_ci    mova                m5, [tmpq+15*32]
1299cabdff1aSopenharmony_ci    mova                m6, [tmpq+12*32]
1300cabdff1aSopenharmony_ci    SUMSUB_BADC w, 4, 7, 5, 6, 1
1301cabdff1aSopenharmony_ci
1302cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2    4,  5, 1, 2, 0, ROUND_REG, 6
1303cabdff1aSopenharmony_ci    lea               dstq, [dstq+strideq*2]
1304cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2    6,  7, 1, 2, 0, ROUND_REG, 6
1305cabdff1aSopenharmony_ci    lea               dstq, [dstq+strideq*2]
1306cabdff1aSopenharmony_ci
    ; remaining rows are read from the odd slots in descending order
1307cabdff1aSopenharmony_ci    mova                m4, [tmpq+11*32]
1308cabdff1aSopenharmony_ci    mova                m5, [tmpq+ 9*32]
1309cabdff1aSopenharmony_ci    mova                m6, [tmpq+ 7*32]
1310cabdff1aSopenharmony_ci    mova                m7, [tmpq+ 5*32]
1311cabdff1aSopenharmony_ci
1312cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2    4,  5, 1, 2, 0, ROUND_REG, 6
1313cabdff1aSopenharmony_ci    lea               dstq, [dstq+strideq*2]
1314cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2    6,  7, 1, 2, 0, ROUND_REG, 6
1315cabdff1aSopenharmony_ci    lea               dstq, [dstq+strideq*2]
1316cabdff1aSopenharmony_ci
1317cabdff1aSopenharmony_ci    mova                m4, [tmpq+ 3*32]
1318cabdff1aSopenharmony_ci    mova                m5, [tmpq+ 1*32]
1319cabdff1aSopenharmony_ci
1320cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2    4,  5, 1, 2, 0, ROUND_REG, 6
1321cabdff1aSopenharmony_ci    lea               dstq, [dstq+strideq*2]
1322cabdff1aSopenharmony_ci%endif
1323cabdff1aSopenharmony_ci
1324cabdff1aSopenharmony_ci%undef ROUND_REG
1325cabdff1aSopenharmony_ci%endif ; %2 == 1/2
1326cabdff1aSopenharmony_ci%endmacro
1327cabdff1aSopenharmony_ci
; VP9_STORE_2XFULL dc, tmp1, tmp2, tmp3, tmp4, zero[, stride = strideq]
;
; Adds the word vector m<dc> to two full-register-wide rows of 8-bit pixels at
; [dstq] and [dstq+stride] and writes the rows back:
;   - each row is loaded and widened to words by interleaving with m<zero>
;   - m<dc> is added to the low and high halves
;   - the halves are packed back to bytes with unsigned saturation and stored
; All arguments except stride are register numbers. tmp1-tmp4 are clobbered;
; dc and zero are only read. Aligned loads/stores (mova) are used on dstq.
1328cabdff1aSopenharmony_ci%macro VP9_STORE_2XFULL 6-7 strideq; dc, tmp1, tmp2, tmp3, tmp4, zero, stride
1329cabdff1aSopenharmony_ci    mova               m%3, [dstq]
1330cabdff1aSopenharmony_ci    mova               m%5, [dstq+%7]
    ; row 0 -> tmp1 (low 8 pixels) / tmp2 (high 8 pixels)
1331cabdff1aSopenharmony_ci    punpcklbw          m%2, m%3, m%6
1332cabdff1aSopenharmony_ci    punpckhbw          m%3, m%6
    ; row 1 -> tmp3 (low 8 pixels) / tmp4 (high 8 pixels)
1333cabdff1aSopenharmony_ci    punpcklbw          m%4, m%5, m%6
1334cabdff1aSopenharmony_ci    punpckhbw          m%5, m%6
1335cabdff1aSopenharmony_ci    paddw              m%2, m%1
1336cabdff1aSopenharmony_ci    paddw              m%3, m%1
1337cabdff1aSopenharmony_ci    paddw              m%4, m%1
1338cabdff1aSopenharmony_ci    paddw              m%5, m%1
    ; packuswb clamps each sum to 0..255
1339cabdff1aSopenharmony_ci    packuswb           m%2, m%3
1340cabdff1aSopenharmony_ci    packuswb           m%4, m%5
1341cabdff1aSopenharmony_ci    mova            [dstq], m%2
1342cabdff1aSopenharmony_ci    mova         [dstq+%7], m%4
1343cabdff1aSopenharmony_ci%endmacro
1344cabdff1aSopenharmony_ci
; VP9_IDCT_IDCT_16x16_ADD_XMM opt
;
; Emits vp9_idct_idct_16x16_add(dst, stride, block, eob) for the XMM
; instruction set named by opt. The cglobal line requests 512 bytes of stack,
; which is used through tmpq (= rsp) as the buffer between the two 1-D passes.
;
; Dispatch on eob:
;   cpuflag(ssse3): eob > 38 -> .idctfull, eob == 1 -> DC-only path,
;                   any other value -> .idct8x8
;   otherwise:      eob > 1  -> .idctfull, else DC-only path
; Every path clears the coefficients it consumed in block before returning.
1345cabdff1aSopenharmony_ci%macro VP9_IDCT_IDCT_16x16_ADD_XMM 1
1346cabdff1aSopenharmony_ciINIT_XMM %1
1347cabdff1aSopenharmony_cicglobal vp9_idct_idct_16x16_add, 4, 6, 16, 512, dst, stride, block, eob
1348cabdff1aSopenharmony_ci%if cpuflag(ssse3)
1349cabdff1aSopenharmony_ci    ; 2x2=eob=3, 4x4=eob=10
    ; (38 is presumably the corresponding limit for an 8x8 sub-block)
1350cabdff1aSopenharmony_ci    cmp eobd, 38
1351cabdff1aSopenharmony_ci    jg .idctfull
1352cabdff1aSopenharmony_ci    cmp eobd, 1 ; faster path for when only DC is set
1353cabdff1aSopenharmony_ci    jne .idct8x8
1354cabdff1aSopenharmony_ci%else
1355cabdff1aSopenharmony_ci    cmp eobd, 1 ; faster path for when only DC is set
1356cabdff1aSopenharmony_ci    jg .idctfull
1357cabdff1aSopenharmony_ci%endif
1358cabdff1aSopenharmony_ci
1359cabdff1aSopenharmony_ci    ; dc-only
1360cabdff1aSopenharmony_ci%if cpuflag(ssse3)
    ; scale the DC coefficient twice (once per 1-D pass) with pmulhrsw
1361cabdff1aSopenharmony_ci    movd                m0, [blockq]
1362cabdff1aSopenharmony_ci    mova                m1, [pw_11585x2]
1363cabdff1aSopenharmony_ci    pmulhrsw            m0, m1
1364cabdff1aSopenharmony_ci    pmulhrsw            m0, m1
1365cabdff1aSopenharmony_ci%else
    ; scalar equivalent; the eob register is reused as coef:
    ;   coef = (dc * 11585 + 8192) >> 14
    ;   coef = (coef * 11585 + 8192 + (32 << 14)) >> (14 + 6)
    ; i.e. the second scaling and the final (x + 32) >> 6 are folded together
1366cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, block, coef
1367cabdff1aSopenharmony_ci    movsx            coefd, word [blockq]
1368cabdff1aSopenharmony_ci    imul             coefd, 11585
1369cabdff1aSopenharmony_ci    add              coefd, 8192
1370cabdff1aSopenharmony_ci    sar              coefd, 14
1371cabdff1aSopenharmony_ci    imul             coefd, 11585
1372cabdff1aSopenharmony_ci    add              coefd, (32 << 14) + 8192
1373cabdff1aSopenharmony_ci    sar              coefd, 14 + 6
1374cabdff1aSopenharmony_ci    movd                m0, coefd
1375cabdff1aSopenharmony_ci%endif
    ; broadcast the DC value to all eight words of m0
1376cabdff1aSopenharmony_ci    SPLATW              m0, m0, q0000
1377cabdff1aSopenharmony_ci%if cpuflag(ssse3)
    ; pmulhrsw by 512 computes (x + 32) >> 6
1378cabdff1aSopenharmony_ci    pmulhrsw            m0, [pw_512]
1379cabdff1aSopenharmony_ci%endif
    ; m5 = 0: unpack operand for the stores, and clears the DC coefficient
    ; (movd writes 4 bytes at blockq)
1380cabdff1aSopenharmony_ci    pxor                m5, m5
1381cabdff1aSopenharmony_ci    movd          [blockq], m5
    ; 8 x 2 rows = 16 rows; dstq is not advanced after the last pair
1382cabdff1aSopenharmony_ci%rep 7
1383cabdff1aSopenharmony_ci    VP9_STORE_2XFULL    0, 1, 2, 3, 4, 5
1384cabdff1aSopenharmony_ci    lea               dstq, [dstq+2*strideq]
1385cabdff1aSopenharmony_ci%endrep
1386cabdff1aSopenharmony_ci    VP9_STORE_2XFULL    0, 1, 2, 3, 4, 5
1387cabdff1aSopenharmony_ci    RET
1388cabdff1aSopenharmony_ci
    ; from here on the eob register is reused as the loop counter
1389cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, block, cnt, dst_bak, tmp
1390cabdff1aSopenharmony_ci%if cpuflag(ssse3)
1391cabdff1aSopenharmony_ci.idct8x8:
    ; pass 1 runs once (nnzc = 8); pass 2 runs twice, once per 8-pixel-wide
    ; half of dst (tmpq += 16 bytes, dstq = dst_bakq + 8)
1392cabdff1aSopenharmony_ci    mov               tmpq, rsp
1393cabdff1aSopenharmony_ci    VP9_IDCT16_1D   blockq, 1, 8, 0
1394cabdff1aSopenharmony_ci
1395cabdff1aSopenharmony_ci    mov               cntd, 2
1396cabdff1aSopenharmony_ci    mov           dst_bakq, dstq
1397cabdff1aSopenharmony_ci.loop2_8x8:
1398cabdff1aSopenharmony_ci    VP9_IDCT16_1D     tmpq, 2, 8, 0
1399cabdff1aSopenharmony_ci    lea               dstq, [dst_bakq+8]
1400cabdff1aSopenharmony_ci    add               tmpq, 16
1401cabdff1aSopenharmony_ci    dec               cntd
1402cabdff1aSopenharmony_ci    jg .loop2_8x8
1403cabdff1aSopenharmony_ci
1404cabdff1aSopenharmony_ci    ; at the end of the loop, m0 should still be zero
1405cabdff1aSopenharmony_ci    ; use that to zero out block coefficients
1406cabdff1aSopenharmony_ci    ZERO_BLOCK      blockq, 32, 8, m0
1407cabdff1aSopenharmony_ci    RET
1408cabdff1aSopenharmony_ci%endif
1409cabdff1aSopenharmony_ci
1410cabdff1aSopenharmony_ci.idctfull:
    ; pass 1: two iterations, each reading from blockq (advanced by 16 bytes)
    ; and filling 256 bytes of the stack buffer
1411cabdff1aSopenharmony_ci    mov               cntd, 2
1412cabdff1aSopenharmony_ci    mov               tmpq, rsp
1413cabdff1aSopenharmony_ci.loop1_full:
1414cabdff1aSopenharmony_ci    VP9_IDCT16_1D   blockq, 1, 16, 0
1415cabdff1aSopenharmony_ci    add             blockq, 16
1416cabdff1aSopenharmony_ci    add               tmpq, 256
1417cabdff1aSopenharmony_ci    dec               cntd
1418cabdff1aSopenharmony_ci    jg .loop1_full
    ; rewind blockq to the start of the block for ZERO_BLOCK below
1419cabdff1aSopenharmony_ci    sub             blockq, 32
1420cabdff1aSopenharmony_ci
    ; pass 2: two iterations, one per 8-pixel-wide half of dst
1421cabdff1aSopenharmony_ci    mov               cntd, 2
1422cabdff1aSopenharmony_ci    mov               tmpq, rsp
1423cabdff1aSopenharmony_ci    mov           dst_bakq, dstq
1424cabdff1aSopenharmony_ci.loop2_full:
1425cabdff1aSopenharmony_ci    VP9_IDCT16_1D     tmpq, 2, 16, 0
1426cabdff1aSopenharmony_ci    lea               dstq, [dst_bakq+8]
1427cabdff1aSopenharmony_ci    add               tmpq, 16
1428cabdff1aSopenharmony_ci    dec               cntd
1429cabdff1aSopenharmony_ci    jg .loop2_full
1430cabdff1aSopenharmony_ci
1431cabdff1aSopenharmony_ci    ; at the end of the loop, m0 should still be zero
1432cabdff1aSopenharmony_ci    ; use that to zero out block coefficients
1433cabdff1aSopenharmony_ci    ZERO_BLOCK      blockq, 32, 16, m0
1434cabdff1aSopenharmony_ci    RET
1435cabdff1aSopenharmony_ci%endmacro
1436cabdff1aSopenharmony_ci
; Instantiate the XMM 16x16 idct/idct function once per instruction set.
1437cabdff1aSopenharmony_ciVP9_IDCT_IDCT_16x16_ADD_XMM sse2
1438cabdff1aSopenharmony_ciVP9_IDCT_IDCT_16x16_ADD_XMM ssse3
1439cabdff1aSopenharmony_ciVP9_IDCT_IDCT_16x16_ADD_XMM avx
1440cabdff1aSopenharmony_ci
; VP9_IDCT16_YMM_1D (no parameters)
;
; One 1-D 16-point IDCT pass carried out entirely in m0-m15.
; On entry m1-m3 and m5-m15 hold their input rows; the inputs for m4 and m0
; are loaded inside the macro from [blockq+128] and [blockq+0].
; Memory used as spill space:
;   [blockq+128]  holds m5 from the end of the odd half until the final stage
;   [blockq+192]  receives the out6 row, whose register is then reused as the
;                 temporary of the last butterfly
; The two SWAP chains at the end only rename registers.
; NOTE(review): presumably the renaming leaves the outputs in row order in
; m0-m15, except row 6 which lives in [blockq+192] - confirm against the caller.
1441cabdff1aSopenharmony_ci%macro VP9_IDCT16_YMM_1D 0
    ; odd half (t8-t15)
1442cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X  1,  15, 16305,  1606, [pd_8192], 0, 4 ; t8,  t15
1443cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X  9,   7, 10394, 12665, [pd_8192], 0, 4 ; t9,  t14
1444cabdff1aSopenharmony_ci
1445cabdff1aSopenharmony_ci    SUMSUB_BA            w,  9,   1, 0      ; t8,  t9
1446cabdff1aSopenharmony_ci    SUMSUB_BA            w,  7,  15, 0      ; t15, t14
1447cabdff1aSopenharmony_ci
1448cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X 15,   1, 15137,  6270, [pd_8192], 0, 4 ; t9,  t14
1449cabdff1aSopenharmony_ci
1450cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X  5,  11, 14449,  7723, [pd_8192], 0, 4 ; t10, t13
1451cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X 13,   3,  4756, 15679, [pd_8192], 0, 4 ; t11, t12
1452cabdff1aSopenharmony_ci
1453cabdff1aSopenharmony_ci    SUMSUB_BA            w,  5,  13, 0      ; t11, t10
1454cabdff1aSopenharmony_ci    SUMSUB_BA            w, 11,   3, 0      ; t12, t13
1455cabdff1aSopenharmony_ci
1456cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X  3,  13, 6270, m15137, [pd_8192], 0, 4 ; t10, t13
1457cabdff1aSopenharmony_ci
1458cabdff1aSopenharmony_ci    SUMSUB_BA            w,  5,   9, 0      ; t8,  t11
1459cabdff1aSopenharmony_ci    SUMSUB_BA            w,  3,  15, 0      ; t9,  t10
1460cabdff1aSopenharmony_ci    SUMSUB_BA            w, 11,   7, 0      ; t15, t12
1461cabdff1aSopenharmony_ci    SUMSUB_BA            w, 13,   1, 0      ; t14, t13
1462cabdff1aSopenharmony_ci
1463cabdff1aSopenharmony_ci    SUMSUB_BA            w, 15,   1, 0
1464cabdff1aSopenharmony_ci    SUMSUB_BA            w,  9,   7, 0
1465cabdff1aSopenharmony_ci    pmulhrsw            m1, [pw_11585x2]    ; t10
1466cabdff1aSopenharmony_ci    pmulhrsw            m7, [pw_11585x2]    ; t11
1467cabdff1aSopenharmony_ci    pmulhrsw            m9, [pw_11585x2]    ; t12
1468cabdff1aSopenharmony_ci    pmulhrsw           m15, [pw_11585x2]    ; t13
1469cabdff1aSopenharmony_ci
1470cabdff1aSopenharmony_ci    ; even (tx8x8)
    ; load m4's input and reuse its memory slot to park m5, which becomes the
    ; scratch register of the even half
1471cabdff1aSopenharmony_ci    mova                m4, [blockq+128]
1472cabdff1aSopenharmony_ci    mova      [blockq+128], m5
1473cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X   4,  12, 15137,  6270, [pd_8192], 0, 5 ; t2,  t3
1474cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X   2,  14, 16069,  3196, [pd_8192], 0, 5 ; t4,  t7
1475cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X  10,   6,  9102, 13623, [pd_8192], 0, 5 ; t5,  t6
1476cabdff1aSopenharmony_ci    mova                m0, [blockq+  0]
1477cabdff1aSopenharmony_ci    SUMSUB_BA            w,   8,   0, 5
1478cabdff1aSopenharmony_ci    pmulhrsw            m8, [pw_11585x2]    ; t0
1479cabdff1aSopenharmony_ci    pmulhrsw            m0, [pw_11585x2]    ; t1
1480cabdff1aSopenharmony_ci
1481cabdff1aSopenharmony_ci    SUMSUB_BA            w,  10,   2, 5     ; t4,  t5
1482cabdff1aSopenharmony_ci    SUMSUB_BA            w,   6,  14, 5     ; t7,  t6
1483cabdff1aSopenharmony_ci    SUMSUB_BA            w,  12,   8, 5     ; t0,  t3
1484cabdff1aSopenharmony_ci    SUMSUB_BA            w,   4,   0, 5     ; t1,  t2
1485cabdff1aSopenharmony_ci
1486cabdff1aSopenharmony_ci    SUMSUB_BA            w,   2,  14, 5
1487cabdff1aSopenharmony_ci    pmulhrsw           m14, [pw_11585x2]    ; t5
1488cabdff1aSopenharmony_ci    pmulhrsw            m2, [pw_11585x2]    ; t6
1489cabdff1aSopenharmony_ci
1490cabdff1aSopenharmony_ci    SUMSUB_BA            w,   6,  12, 5     ; t0,  t7
1491cabdff1aSopenharmony_ci    SUMSUB_BA            w,   2,   4, 5     ; t1,  t6
1492cabdff1aSopenharmony_ci    SUMSUB_BA            w,  14,   0, 5     ; t2,  t5
1493cabdff1aSopenharmony_ci    SUMSUB_BA            w,  10,   8, 5     ; t3,  t4
1494cabdff1aSopenharmony_ci
1495cabdff1aSopenharmony_ci    ; final stage
1496cabdff1aSopenharmony_ci    SUMSUB_BA            w, 11,  6,  5      ; out0, out15
1497cabdff1aSopenharmony_ci    SUMSUB_BA            w, 13,  2,  5      ; out1, out14
1498cabdff1aSopenharmony_ci    SUMSUB_BA            w, 15, 14,  5      ; out2, out13
1499cabdff1aSopenharmony_ci    SUMSUB_BA            w,  9, 10,  5      ; out3, out12
1500cabdff1aSopenharmony_ci    SUMSUB_BA            w,  7,  8,  5      ; out4, out11
1501cabdff1aSopenharmony_ci    SUMSUB_BA            w,  1,  0,  5      ; out5, out10
1502cabdff1aSopenharmony_ci    SUMSUB_BA            w,  3,  4,  5      ; out6, out9
    ; bring back the value parked in [blockq+128]; out6 (m3) is spilled so
    ; that m3 can be the temporary of the last butterfly
1503cabdff1aSopenharmony_ci    mova                m5, [blockq+128]
1504cabdff1aSopenharmony_ci    mova      [blockq+192], m3
1505cabdff1aSopenharmony_ci    SUMSUB_BA            w,  5, 12,  3      ; out7, out8
1506cabdff1aSopenharmony_ci
    ; register renaming only, no data is moved
1507cabdff1aSopenharmony_ci    SWAP  0, 11,  8, 12, 10
1508cabdff1aSopenharmony_ci    SWAP  1, 13, 14,  2, 15,  6,  3,  9,  4,  7,  5
1509cabdff1aSopenharmony_ci%endmacro
1510cabdff1aSopenharmony_ci
1511cabdff1aSopenharmony_ci; this is almost identical to VP9_STORE_2X, but it handles four rows per
1512cabdff1aSopenharmony_ci; invocation (two rows per ymm register) for slightly improved interleaving,
1513cabdff1aSopenharmony_ci; and it omits vpermq since the input is DC so all values are identical
;
; Adds the word vector m<reg> to the 16-byte rows at dstq + {0, 1, 2, 3}*stride
; and stores them back with unsigned saturation. Rows 0/1 share one ymm
; register (low/high lane) and rows 2/3 another. tmp1-tmp4 are clobbered;
; zero must be an all-zero register. Requires strideq and stride3q.
1514cabdff1aSopenharmony_ci%macro VP9_STORE_YMM_DC_4X 6 ; reg, tmp1, tmp2, tmp3, tmp4, zero
    ; gather: rows 0 and 2 into the low lanes, rows 1 and 3 into the high lanes
1515cabdff1aSopenharmony_ci    mova              xm%2, [dstq]
1516cabdff1aSopenharmony_ci    mova              xm%4, [dstq+strideq*2]
1517cabdff1aSopenharmony_ci    vinserti128        m%2, m%2, [dstq+strideq], 1
1518cabdff1aSopenharmony_ci    vinserti128        m%4, m%4, [dstq+stride3q], 1
    ; widen bytes to words (per lane) using the zero register
1519cabdff1aSopenharmony_ci    punpckhbw          m%3, m%2, m%6
1520cabdff1aSopenharmony_ci    punpcklbw          m%2, m%6
1521cabdff1aSopenharmony_ci    punpckhbw          m%5, m%4, m%6
1522cabdff1aSopenharmony_ci    punpcklbw          m%4, m%6
1523cabdff1aSopenharmony_ci    paddw              m%3, m%1
1524cabdff1aSopenharmony_ci    paddw              m%2, m%1
1525cabdff1aSopenharmony_ci    paddw              m%5, m%1
1526cabdff1aSopenharmony_ci    paddw              m%4, m%1
    ; pack back to bytes, clamping to 0..255
1527cabdff1aSopenharmony_ci    packuswb           m%2, m%3
1528cabdff1aSopenharmony_ci    packuswb           m%4, m%5
    ; scatter the lanes back to their rows
1529cabdff1aSopenharmony_ci    mova            [dstq], xm%2
1530cabdff1aSopenharmony_ci    mova        [dstq+strideq*2], xm%4
1531cabdff1aSopenharmony_ci    vextracti128  [dstq+strideq], m%2, 1
1532cabdff1aSopenharmony_ci    vextracti128 [dstq+stride3q], m%4, 1
1533cabdff1aSopenharmony_ci%endmacro
1534cabdff1aSopenharmony_ci
; AVX2 (YMM) version of vp9_idct_idct_16x16_add(dst, stride, block, eob),
; built only for x86-64 when external AVX2 assembly is available.
;   eob <= 1: DC-only path, the scaled DC value is added to all 16 rows
;   eob >  1: full path, both 1-D passes are done in m0-m15
; No stack buffer is requested; the full path borrows parts of block
; ([blockq+0], +128, +192, +224) as spill slots before the block is cleared.
1535cabdff1aSopenharmony_ci%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
1536cabdff1aSopenharmony_ciINIT_YMM avx2
1537cabdff1aSopenharmony_cicglobal vp9_idct_idct_16x16_add, 4, 4, 16, dst, stride, block, eob
1538cabdff1aSopenharmony_ci    cmp eobd, 1 ; faster path for when only DC is set
1539cabdff1aSopenharmony_ci    jg .idctfull
1540cabdff1aSopenharmony_ci
1541cabdff1aSopenharmony_ci    ; dc-only
    ; broadcast the DC coefficient, scale it twice (once per 1-D pass), then
    ; apply the final rounding: pmulhrsw by 512 computes (x + 32) >> 6
1542cabdff1aSopenharmony_ci    mova                m1, [pw_11585x2]
1543cabdff1aSopenharmony_ci    vpbroadcastw        m0, [blockq]
1544cabdff1aSopenharmony_ci    pmulhrsw            m0, m1
1545cabdff1aSopenharmony_ci    pmulhrsw            m0, m1
1546cabdff1aSopenharmony_ci    pxor                m5, m5
1547cabdff1aSopenharmony_ci    pmulhrsw            m0, [pw_512]
    ; clear the DC coefficient (movd writes 4 bytes at blockq)
1548cabdff1aSopenharmony_ci    movd          [blockq], xm5
1549cabdff1aSopenharmony_ci
    ; block/eob are dead now; their registers become stride3 and cnt
1550cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, stride3, cnt
1551cabdff1aSopenharmony_ci    mov               cntd, 4
1552cabdff1aSopenharmony_ci    lea           stride3q, [strideq*3]
    ; 4 iterations x 4 rows = 16 rows
1553cabdff1aSopenharmony_ci.loop_dc:
1554cabdff1aSopenharmony_ci    VP9_STORE_YMM_DC_4X  0, 1, 2, 3, 4, 5
1555cabdff1aSopenharmony_ci    lea               dstq, [dstq+4*strideq]
1556cabdff1aSopenharmony_ci    dec               cntd
1557cabdff1aSopenharmony_ci    jg .loop_dc
1558cabdff1aSopenharmony_ci    RET
1559cabdff1aSopenharmony_ci
1560cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, block, eob
1561cabdff1aSopenharmony_ci.idctfull:
    ; load row k (32 bytes each) into m<k>; rows 0 and 4 are not loaded here
    ; because VP9_IDCT16_YMM_1D reads them from [blockq+0] and [blockq+128]
1562cabdff1aSopenharmony_ci    mova                m1, [blockq+ 32]
1563cabdff1aSopenharmony_ci    mova                m2, [blockq+ 64]
1564cabdff1aSopenharmony_ci    mova                m3, [blockq+ 96]
1565cabdff1aSopenharmony_ci    mova                m5, [blockq+160]
1566cabdff1aSopenharmony_ci    mova                m6, [blockq+192]
1567cabdff1aSopenharmony_ci    mova                m7, [blockq+224]
1568cabdff1aSopenharmony_ci    mova                m8, [blockq+256]
1569cabdff1aSopenharmony_ci    mova                m9, [blockq+288]
1570cabdff1aSopenharmony_ci    mova               m10, [blockq+320]
1571cabdff1aSopenharmony_ci    mova               m11, [blockq+352]
1572cabdff1aSopenharmony_ci    mova               m12, [blockq+384]
1573cabdff1aSopenharmony_ci    mova               m13, [blockq+416]
1574cabdff1aSopenharmony_ci    mova               m14, [blockq+448]
1575cabdff1aSopenharmony_ci    mova               m15, [blockq+480]
1576cabdff1aSopenharmony_ci
    ; pass 1, transpose, pass 2. The transpose is given the two slots the 1-D
    ; macro also uses ([blockq+192], [blockq+128]); m0 is written back to
    ; [blockq+0] because the second pass reloads it from there.
1577cabdff1aSopenharmony_ci    VP9_IDCT16_YMM_1D
1578cabdff1aSopenharmony_ci    TRANSPOSE16x16W      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
1579cabdff1aSopenharmony_ci                         [blockq+192], [blockq+128], 1
1580cabdff1aSopenharmony_ci    mova      [blockq+  0], m0
1581cabdff1aSopenharmony_ci    VP9_IDCT16_YMM_1D
1582cabdff1aSopenharmony_ci
    ; park m7 so that m6/m7 can be used as temporaries for the first three
    ; write calls; both are reloaded from [blockq+192]/[blockq+224] afterwards
1583cabdff1aSopenharmony_ci    mova      [blockq+224], m7
1584cabdff1aSopenharmony_ci
1585cabdff1aSopenharmony_ci    ; store
1586cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2    0,  1, 6, 7, unused, [pw_512], 6
1587cabdff1aSopenharmony_ci    lea               dstq, [dstq+2*strideq]
1588cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2    2,  3, 6, 7, unused, [pw_512], 6
1589cabdff1aSopenharmony_ci    lea               dstq, [dstq+2*strideq]
1590cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2    4,  5, 6, 7, unused, [pw_512], 6
1591cabdff1aSopenharmony_ci    lea               dstq, [dstq+2*strideq]
1592cabdff1aSopenharmony_ci    mova                m6, [blockq+192]
1593cabdff1aSopenharmony_ci    mova                m7, [blockq+224]
1594cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2    6,  7, 1, 2, unused, [pw_512], 6
1595cabdff1aSopenharmony_ci    lea               dstq, [dstq+2*strideq]
1596cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2    8,  9, 1, 2, unused, [pw_512], 6
1597cabdff1aSopenharmony_ci    lea               dstq, [dstq+2*strideq]
1598cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2   10, 11, 1, 2, unused, [pw_512], 6
1599cabdff1aSopenharmony_ci    lea               dstq, [dstq+2*strideq]
1600cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2   12, 13, 1, 2, unused, [pw_512], 6
1601cabdff1aSopenharmony_ci    lea               dstq, [dstq+2*strideq]
1602cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2   14, 15, 1, 2, unused, [pw_512], 6
1603cabdff1aSopenharmony_ci    lea               dstq, [dstq+2*strideq]
1604cabdff1aSopenharmony_ci
1605cabdff1aSopenharmony_ci    ; unlike the XMM versions, m0 is not known to be zero here, so clear it
1606cabdff1aSopenharmony_ci    ; explicitly and use it to zero out block coefficients
1607cabdff1aSopenharmony_ci    pxor                m0, m0
1608cabdff1aSopenharmony_ci    ZERO_BLOCK      blockq, 32, 16, m0
1609cabdff1aSopenharmony_ci    RET
1610cabdff1aSopenharmony_ci%endif
1611cabdff1aSopenharmony_ci
1612cabdff1aSopenharmony_ci;---------------------------------------------------------------------------------------------
1613cabdff1aSopenharmony_ci; void vp9_iadst_iadst_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
1614cabdff1aSopenharmony_ci;---------------------------------------------------------------------------------------------
1615cabdff1aSopenharmony_ci
; One 1-D pass of the 16-point inverse ADST over one register-wide column slice.
; Arguments:
;   %1 (src)  - base address of the input; input row k is loaded from [%1+k*32]
;   %2 (pass) - 1: the 16 result rows are transposed as two 8x8 word blocks and
;                  stored to [tmpq+k*16]
;               2: the 16 result rows are handed to VP9_IDCT8_WRITEx2 two at a
;                  time, with dstq advanced by 2*strideq between calls
; tmpq is used as spill space, addressed in slots of %%str bytes.
; IADST16_FN invokes pass 2 with src = tmpq, so in that pass the input rows and
; the spill slots share memory (%%str = 32 = input row stride).  Every slot k
; named in a store or SCRATCH below is first used only after input row k has
; been loaded, so the in-place reuse never destroys unread input.
; NOTE(review): SCRATCH/UNSCRATCH are defined outside this chunk; the "mN|rK"
; notation in the comments suggests a value lives in register N on x86-64 and
; in tmp slot K otherwise -- confirm against their definition.
; On exit from pass 2, m0 names the register cleared by "pxor m4, m4"
; (see the final "SWAP 0, 4").
1616cabdff1aSopenharmony_ci%macro VP9_IADST16_1D 2 ; src, pass
; spill-slot stride in bytes: 16 for pass 1, 32 for pass 2
1617cabdff1aSopenharmony_ci%assign %%str 16*%2
1618cabdff1aSopenharmony_ci    mova                m0, [%1+ 0*32]  ; in0
1619cabdff1aSopenharmony_ci    mova                m1, [%1+15*32]  ; in15
1620cabdff1aSopenharmony_ci    mova                m2, [%1+ 7*32]  ; in7
1621cabdff1aSopenharmony_ci    mova                m3, [%1+ 8*32]  ; in8
1622cabdff1aSopenharmony_ci
1623cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2D_4X  1,  0,  4,  5, 16364,   804    ; m1/4=t1[d], m0/5=t0[d]
1624cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2D_4X  2,  3,  7,  6, 11003, 12140    ; m2/7=t9[d], m3/6=t8[d]
1625cabdff1aSopenharmony_ci    SCRATCH              4, 8, tmpq+ 0*%%str
1626cabdff1aSopenharmony_ci    VP9_RND_SH_SUMSUB_BA     3,  0,  6,  5,  4, [pd_8192]   ; m3=t0[w], m0=t8[w]
1627cabdff1aSopenharmony_ci    UNSCRATCH            4, 8, tmpq+ 0*%%str
1628cabdff1aSopenharmony_ci    VP9_RND_SH_SUMSUB_BA     2,  1,  7,  4,  5, [pd_8192]   ; m2=t1[w], m1=t9[w]
1629cabdff1aSopenharmony_ci
1630cabdff1aSopenharmony_ci    SCRATCH              0, 10, tmpq+ 0*%%str
1631cabdff1aSopenharmony_ci    SCRATCH              1, 11, tmpq+15*%%str
1632cabdff1aSopenharmony_ci    mova   [tmpq+ 7*%%str], m2
1633cabdff1aSopenharmony_ci    mova   [tmpq+ 8*%%str], m3
1634cabdff1aSopenharmony_ci
1635cabdff1aSopenharmony_ci    mova                m1, [%1+ 2*32]  ; in2
1636cabdff1aSopenharmony_ci    mova                m0, [%1+13*32]  ; in13
1637cabdff1aSopenharmony_ci    mova                m3, [%1+ 5*32]  ; in5
1638cabdff1aSopenharmony_ci    mova                m2, [%1+10*32]  ; in10
1639cabdff1aSopenharmony_ci
1640cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2D_4X  0,  1,  6,  7, 15893,  3981    ; m0/6=t3[d], m1/7=t2[d]
1641cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2D_4X  3,  2,  4,  5,  8423, 14053    ; m3/4=t11[d], m2/5=t10[d]
1642cabdff1aSopenharmony_ci    SCRATCH              4, 12, tmpq+ 2*%%str
1643cabdff1aSopenharmony_ci    VP9_RND_SH_SUMSUB_BA     2,  1,  5,  7,  4, [pd_8192]   ; m2=t2[w], m1=t10[w]
1644cabdff1aSopenharmony_ci    UNSCRATCH            4, 12, tmpq+ 2*%%str
1645cabdff1aSopenharmony_ci    VP9_RND_SH_SUMSUB_BA     3,  0,  4,  6,  5, [pd_8192]   ; m3=t3[w], m0=t11[w]
1646cabdff1aSopenharmony_ci
1647cabdff1aSopenharmony_ci    SCRATCH              0, 12, tmpq+ 2*%%str
1648cabdff1aSopenharmony_ci    SCRATCH              1, 13, tmpq+13*%%str
1649cabdff1aSopenharmony_ci    mova   [tmpq+ 5*%%str], m2
1650cabdff1aSopenharmony_ci    mova   [tmpq+10*%%str], m3
1651cabdff1aSopenharmony_ci
1652cabdff1aSopenharmony_ci    mova                m2, [%1+ 4*32]  ; in4
1653cabdff1aSopenharmony_ci    mova                m3, [%1+11*32]  ; in11
1654cabdff1aSopenharmony_ci    mova                m0, [%1+ 3*32]  ; in3
1655cabdff1aSopenharmony_ci    mova                m1, [%1+12*32]  ; in12
1656cabdff1aSopenharmony_ci
1657cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2D_4X  3,  2,  7,  6, 14811,  7005    ; m3/7=t5[d], m2/6=t4[d]
1658cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2D_4X  0,  1,  4,  5,  5520, 15426    ; m0/4=t13[d], m1/5=t12[d]
1659cabdff1aSopenharmony_ci    SCRATCH              4, 9, tmpq+ 4*%%str
1660cabdff1aSopenharmony_ci    VP9_RND_SH_SUMSUB_BA     1,  2,  5,  6,  4, [pd_8192]   ; m1=t4[w], m2=t12[w]
1661cabdff1aSopenharmony_ci    UNSCRATCH            4, 9, tmpq+ 4*%%str
1662cabdff1aSopenharmony_ci    VP9_RND_SH_SUMSUB_BA     0,  3,  4,  7,  6, [pd_8192]   ; m0=t5[w], m3=t13[w]
1663cabdff1aSopenharmony_ci
1664cabdff1aSopenharmony_ci    SCRATCH              0,  8, tmpq+ 4*%%str
1665cabdff1aSopenharmony_ci    mova   [tmpq+11*%%str], m1          ; t4:m1->r11
1666cabdff1aSopenharmony_ci    UNSCRATCH            0, 10, tmpq+ 0*%%str
1667cabdff1aSopenharmony_ci    UNSCRATCH            1, 11, tmpq+15*%%str
1668cabdff1aSopenharmony_ci
1669cabdff1aSopenharmony_ci    ; round 2 interleaved part 1
1670cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2D_4X  0,  1,  6,  7, 16069,  3196    ; m1/7=t8[d], m0/6=t9[d]
1671cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2D_4X  3,  2,  5,  4,  3196, 16069    ; m3/5=t12[d], m2/4=t13[d]
1672cabdff1aSopenharmony_ci    SCRATCH              4, 9, tmpq+ 3*%%str
1673cabdff1aSopenharmony_ci    VP9_RND_SH_SUMSUB_BA     3,  1,  5,  7,  4, [pd_8192]   ; m3=t8[w], m1=t12[w]
1674cabdff1aSopenharmony_ci    UNSCRATCH            4, 9, tmpq+ 3*%%str
1675cabdff1aSopenharmony_ci    VP9_RND_SH_SUMSUB_BA     2,  0,  4,  6,  5, [pd_8192]   ; m2=t9[w], m0=t13[w]
1676cabdff1aSopenharmony_ci
1677cabdff1aSopenharmony_ci    SCRATCH              0, 10, tmpq+ 0*%%str
1678cabdff1aSopenharmony_ci    SCRATCH              1, 11, tmpq+15*%%str
1679cabdff1aSopenharmony_ci    SCRATCH              2, 14, tmpq+ 3*%%str
1680cabdff1aSopenharmony_ci    SCRATCH              3, 15, tmpq+12*%%str
1681cabdff1aSopenharmony_ci
1682cabdff1aSopenharmony_ci    mova                m2, [%1+ 6*32]  ; in6
1683cabdff1aSopenharmony_ci    mova                m3, [%1+ 9*32]  ; in9
1684cabdff1aSopenharmony_ci    mova                m0, [%1+ 1*32]  ; in1
1685cabdff1aSopenharmony_ci    mova                m1, [%1+14*32]  ; in14
1686cabdff1aSopenharmony_ci
1687cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2D_4X  3,  2,  7,  6, 13160,  9760    ; m3/7=t7[d], m2/6=t6[d]
1688cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2D_4X  0,  1,  4,  5,  2404, 16207    ; m0/4=t15[d], m1/5=t14[d]
1689cabdff1aSopenharmony_ci    SCRATCH              4, 9, tmpq+ 6*%%str
1690cabdff1aSopenharmony_ci    VP9_RND_SH_SUMSUB_BA     1,  2,  5,  6,  4, [pd_8192]   ; m1=t6[w], m2=t14[w]
1691cabdff1aSopenharmony_ci    UNSCRATCH            4, 9, tmpq+ 6*%%str
1692cabdff1aSopenharmony_ci    VP9_RND_SH_SUMSUB_BA     0,  3,  4,  7,  6, [pd_8192]   ; m0=t7[w], m3=t15[w]
1693cabdff1aSopenharmony_ci
1694cabdff1aSopenharmony_ci    ; r8=t0, r7=t1, r5=t2, r10=t3, r11=t4, m8|r4=t5, m1=t6, m0=t7
1695cabdff1aSopenharmony_ci    ; m10|r0=t8, m11|r15=t9, m13|r13=t10, m12|r2=t11, m14|r3=t12, m15|r12=t13, m2=t14, m3=t15
1696cabdff1aSopenharmony_ci
1697cabdff1aSopenharmony_ci    UNSCRATCH            4, 12, tmpq+ 2*%%str
1698cabdff1aSopenharmony_ci    UNSCRATCH            5, 13, tmpq+13*%%str
1699cabdff1aSopenharmony_ci    SCRATCH              0, 12, tmpq+ 1*%%str
1700cabdff1aSopenharmony_ci    SCRATCH              1, 13, tmpq+14*%%str
1701cabdff1aSopenharmony_ci
1702cabdff1aSopenharmony_ci    ; remainder of round 2 (rest of t8-15)
1703cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2D_4X  5,  4,  6,  7,  9102, 13623    ; m5/6=t11[d], m4/7=t10[d]
1704cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2D_4X  3,  2,  1,  0, 13623,  9102    ; m3/1=t14[d], m2/0=t15[d]
1705cabdff1aSopenharmony_ci    SCRATCH              0, 9, tmpq+ 6*%%str
1706cabdff1aSopenharmony_ci    VP9_RND_SH_SUMSUB_BA     3,  4,  1,  7,  0, [pd_8192]   ; m3=t10[w], m4=t14[w]
1707cabdff1aSopenharmony_ci    UNSCRATCH            0, 9, tmpq+ 6*%%str
1708cabdff1aSopenharmony_ci    VP9_RND_SH_SUMSUB_BA     2,  5,  0,  6,  1, [pd_8192]   ; m2=t11[w], m5=t15[w]
1709cabdff1aSopenharmony_ci
1710cabdff1aSopenharmony_ci    ; m15|r12=t8, m14|r3=t9, m3=t10, m2=t11, m11|r15=t12, m10|r0=t13, m4=t14, m5=t15
1711cabdff1aSopenharmony_ci
1712cabdff1aSopenharmony_ci    UNSCRATCH            6, 14, tmpq+ 3*%%str
1713cabdff1aSopenharmony_ci    UNSCRATCH            7, 15, tmpq+12*%%str
1714cabdff1aSopenharmony_ci
1715cabdff1aSopenharmony_ci    SUMSUB_BA                w,  3,  7,  1
1716cabdff1aSopenharmony_ci    PSIGNW                  m3, [pw_m1]                     ; m3=out1[w], m7=t10[w]
1717cabdff1aSopenharmony_ci    SUMSUB_BA                w,  2,  6,  1                  ; m2=out14[w], m6=t11[w]
1718cabdff1aSopenharmony_ci
1719cabdff1aSopenharmony_ci    ; unfortunately, the code below overflows in some cases, e.g.
1720cabdff1aSopenharmony_ci    ; http://downloads.webmproject.org/test_data/libvpx/vp90-2-14-resize-fp-tiles-16-8.webm
; the "%if 0" keeps the pmulhrsw variant permanently disabled; the %else
; branch is what gets assembled for every instruction set
1721cabdff1aSopenharmony_ci%if 0; cpuflag(ssse3)
1722cabdff1aSopenharmony_ci    SUMSUB_BA                w,  7,  6,  1
1723cabdff1aSopenharmony_ci    pmulhrsw                m7, [pw_11585x2]                ; m7=out6[w]
1724cabdff1aSopenharmony_ci    pmulhrsw                m6, [pw_11585x2]                ; m6=out9[w]
1725cabdff1aSopenharmony_ci%else
1726cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X  6,  7, 11585, 11585, [pd_8192], 1, 0
1727cabdff1aSopenharmony_ci%endif
1728cabdff1aSopenharmony_ci
1729cabdff1aSopenharmony_ci    mova       [tmpq+ 3*%%str], m6
1730cabdff1aSopenharmony_ci    mova       [tmpq+ 6*%%str], m7
1731cabdff1aSopenharmony_ci    UNSCRATCH                6, 10, tmpq+ 0*%%str
1732cabdff1aSopenharmony_ci    UNSCRATCH                7, 11, tmpq+15*%%str
1733cabdff1aSopenharmony_ci    mova       [tmpq+13*%%str], m2
1734cabdff1aSopenharmony_ci    SCRATCH                  3, 11, tmpq+ 9*%%str
1735cabdff1aSopenharmony_ci
1736cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2D_4X  7,  6,  2,  3, 15137,  6270    ; m6/3=t13[d], m7/2=t12[d]
1737cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2D_4X  5,  4,  1,  0,  6270, 15137    ; m5/1=t14[d], m4/0=t15[d]
1738cabdff1aSopenharmony_ci    SCRATCH              0, 9, tmpq+ 2*%%str
1739cabdff1aSopenharmony_ci    VP9_RND_SH_SUMSUB_BA     5,  6,  1,  3,  0, [pd_8192]   ; m5=out2[w], m6=t14[w]
1740cabdff1aSopenharmony_ci    UNSCRATCH            0, 9, tmpq+ 2*%%str
1741cabdff1aSopenharmony_ci    VP9_RND_SH_SUMSUB_BA     4,  7,  0,  2,  1, [pd_8192]
1742cabdff1aSopenharmony_ci    PSIGNW                  m4, [pw_m1]                     ; m4=out13[w], m7=t15[w]
1743cabdff1aSopenharmony_ci
1744cabdff1aSopenharmony_ci    ; unfortunately, the code below overflows in some cases
1745cabdff1aSopenharmony_ci%if 0; cpuflag(ssse3)
1746cabdff1aSopenharmony_ci    SUMSUB_BA                w,  7,  6,  1
1747cabdff1aSopenharmony_ci    pmulhrsw                m7, [pw_m11585x2]               ; m7=out5[w]
1748cabdff1aSopenharmony_ci    pmulhrsw                m6, [pw_11585x2]                ; m6=out10[w]
1749cabdff1aSopenharmony_ci%else
1750cabdff1aSopenharmony_ci    PSIGNW                  m7, [pw_m1]
1751cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X  7,  6, 11585, 11585, [pd_8192], 1, 0
1752cabdff1aSopenharmony_ci%endif
1753cabdff1aSopenharmony_ci
; NOTE(review): the slot numbers in the summary comments of this section look
; stale.  The stores above use slots 3, 6 and 13 (plus slot 9 via SCRATCH), the
; stores below use slots 7 and 11, and the final stage reloads from slots
; 6, 3, 7, 11 and 13 (and 9 on the non-x86-64 path).  Likewise, register
; numbers in the trailing comments from here on do not always match the
; operands on the same line (e.g. "SUMSUB_BA w, 6, 3, 1" is annotated m8/m3).
; Treat the t*/out* labels as the intent and check registers against the code.
1754cabdff1aSopenharmony_ci    ; m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, m6=out10, m4=out13, r2=out14
1755cabdff1aSopenharmony_ci
1756cabdff1aSopenharmony_ci    mova                    m2, [tmpq+ 8*%%str]
1757cabdff1aSopenharmony_ci    mova                    m3, [tmpq+ 7*%%str]
1758cabdff1aSopenharmony_ci    mova                    m1, [tmpq+11*%%str]
1759cabdff1aSopenharmony_ci    mova       [tmpq+ 7*%%str], m6
1760cabdff1aSopenharmony_ci    mova       [tmpq+11*%%str], m4
1761cabdff1aSopenharmony_ci    mova                    m4, [tmpq+ 5*%%str]
1762cabdff1aSopenharmony_ci    SCRATCH                  5, 14, tmpq+ 5*%%str
1763cabdff1aSopenharmony_ci    SCRATCH                  7, 15, tmpq+ 8*%%str
1764cabdff1aSopenharmony_ci    UNSCRATCH                6,  8, tmpq+ 4*%%str
1765cabdff1aSopenharmony_ci    UNSCRATCH                5, 12, tmpq+ 1*%%str
1766cabdff1aSopenharmony_ci    UNSCRATCH                7, 13, tmpq+14*%%str
1767cabdff1aSopenharmony_ci
1768cabdff1aSopenharmony_ci    ; m2=t0, m3=t1, m9=t2, m0=t3, m1=t4, m8=t5, m13=t6, m12=t7
1769cabdff1aSopenharmony_ci    ; m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, r10=out10, r11=out13, r2=out14
1770cabdff1aSopenharmony_ci
1771cabdff1aSopenharmony_ci    SUMSUB_BA                w,  1,  2, 0                   ; m1=t0[w], m2=t4[w]
1772cabdff1aSopenharmony_ci    mova                    m0, [tmpq+10*%%str]
1773cabdff1aSopenharmony_ci    SCRATCH                  1, 12, tmpq+ 1*%%str
1774cabdff1aSopenharmony_ci    SUMSUB_BA                w,  6,  3, 1                   ; m8=t1[w], m3=t5[w]
1775cabdff1aSopenharmony_ci    SCRATCH                  6, 13, tmpq+ 4*%%str
1776cabdff1aSopenharmony_ci    SUMSUB_BA                w,  7,  4, 1                   ; m13=t2[w], m9=t6[w]
1777cabdff1aSopenharmony_ci    SCRATCH                  7,  8, tmpq+10*%%str
1778cabdff1aSopenharmony_ci    SUMSUB_BA                w,  5,  0, 1                   ; m12=t3[w], m0=t7[w]
1779cabdff1aSopenharmony_ci    SCRATCH                  5,  9, tmpq+14*%%str
1780cabdff1aSopenharmony_ci
1781cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2D_4X  2,  3,  7,  5, 15137,  6270    ; m2/6=t5[d], m3/10=t4[d]
1782cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2D_4X  0,  4,  1,  6,  6270, 15137    ; m0/14=t6[d], m9/15=t7[d]
1783cabdff1aSopenharmony_ci    SCRATCH                  6, 10, tmpq+ 0*%%str
1784cabdff1aSopenharmony_ci    VP9_RND_SH_SUMSUB_BA     0,  3,  1,  5,  6, [pd_8192]
1785cabdff1aSopenharmony_ci    UNSCRATCH                6, 10, tmpq+ 0*%%str
1786cabdff1aSopenharmony_ci    PSIGNW                  m0, [pw_m1]                     ; m0=out3[w], m3=t6[w]
1787cabdff1aSopenharmony_ci    VP9_RND_SH_SUMSUB_BA     4,  2,  6,  7,  5, [pd_8192]   ; m9=out12[w], m2=t7[w]
1788cabdff1aSopenharmony_ci
1789cabdff1aSopenharmony_ci    UNSCRATCH                1,  8, tmpq+10*%%str
1790cabdff1aSopenharmony_ci    UNSCRATCH                5,  9, tmpq+14*%%str
1791cabdff1aSopenharmony_ci    UNSCRATCH                6, 12, tmpq+ 1*%%str
1792cabdff1aSopenharmony_ci    UNSCRATCH                7, 13, tmpq+ 4*%%str
1793cabdff1aSopenharmony_ci    SCRATCH                  4,  9, tmpq+14*%%str
1794cabdff1aSopenharmony_ci
1795cabdff1aSopenharmony_ci    SUMSUB_BA                w,  1,  6,  4                  ; m13=out0[w], m1=t2[w]
1796cabdff1aSopenharmony_ci    SUMSUB_BA                w,  5,  7,  4
1797cabdff1aSopenharmony_ci    PSIGNW                  m5, [pw_m1]                     ; m12=out15[w], m8=t3[w]
1798cabdff1aSopenharmony_ci
1799cabdff1aSopenharmony_ci    ; unfortunately, the code below overflows in some cases, e.g.
1800cabdff1aSopenharmony_ci    ; http://downloads.webmproject.org/test_data/libvpx/vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm
1801cabdff1aSopenharmony_ci%if 0 ; cpuflag(ssse3)
1802cabdff1aSopenharmony_ci    SUMSUB_BA               w,   7,  6,  4
1803cabdff1aSopenharmony_ci    pmulhrsw                m7, [pw_m11585x2]               ; m8=out7[w]
1804cabdff1aSopenharmony_ci    pmulhrsw                m6, [pw_11585x2]                ; m1=out8[w]
1805cabdff1aSopenharmony_ci    SWAP                     6,  7
1806cabdff1aSopenharmony_ci    SUMSUB_BA                w,  3,  2,  4
1807cabdff1aSopenharmony_ci    pmulhrsw                m3, [pw_11585x2]                ; m3=out4[w]
1808cabdff1aSopenharmony_ci    pmulhrsw                m2, [pw_11585x2]                ; m2=out11[w]
1809cabdff1aSopenharmony_ci%else
1810cabdff1aSopenharmony_ci    SCRATCH                  5,  8, tmpq+10*%%str
1811cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X  6,  7, 11585, m11585, [pd_8192],  5,  4
1812cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X  2,  3, 11585, 11585, [pd_8192],  5,  4
1813cabdff1aSopenharmony_ci    UNSCRATCH                5,  8, tmpq+10*%%str
1814cabdff1aSopenharmony_ci%endif
1815cabdff1aSopenharmony_ci
1816cabdff1aSopenharmony_ci    ; m13=out0, m0=out3, m3=out4, m8=out7, m1=out8, m2=out11, m9=out12, m12=out15
1817cabdff1aSopenharmony_ci    ; m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, r10=out10, r11=out13, r2=out14
1818cabdff1aSopenharmony_ci
; pass 1: transpose the results as two 8x8 word blocks and store them to tmpq;
; the first block lands in the even 16-byte slots, the second in the odd ones
1819cabdff1aSopenharmony_ci%if %2 == 1
1820cabdff1aSopenharmony_ci%if ARCH_X86_64
1821cabdff1aSopenharmony_ci    mova                   m13, [tmpq+ 6*%%str]
1822cabdff1aSopenharmony_ci    TRANSPOSE8x8W            1, 11, 14, 0, 3, 15, 13, 6, 10
1823cabdff1aSopenharmony_ci    mova          [tmpq+ 0*16], m1
1824cabdff1aSopenharmony_ci    mova          [tmpq+ 2*16], m11
1825cabdff1aSopenharmony_ci    mova          [tmpq+ 4*16], m14
1826cabdff1aSopenharmony_ci    mova          [tmpq+ 6*16], m0
1827cabdff1aSopenharmony_ci    mova                    m1, [tmpq+ 3*%%str]
1828cabdff1aSopenharmony_ci    mova                   m11, [tmpq+ 7*%%str]
1829cabdff1aSopenharmony_ci    mova                   m14, [tmpq+11*%%str]
1830cabdff1aSopenharmony_ci    mova                    m0, [tmpq+13*%%str]
1831cabdff1aSopenharmony_ci    mova          [tmpq+ 8*16], m3
1832cabdff1aSopenharmony_ci    mova          [tmpq+10*16], m15
1833cabdff1aSopenharmony_ci    mova          [tmpq+12*16], m13
1834cabdff1aSopenharmony_ci    mova          [tmpq+14*16], m6
1835cabdff1aSopenharmony_ci
1836cabdff1aSopenharmony_ci    TRANSPOSE8x8W            7, 1, 11, 2, 9, 14, 0, 5, 10
1837cabdff1aSopenharmony_ci    mova          [tmpq+ 1*16], m7
1838cabdff1aSopenharmony_ci    mova          [tmpq+ 3*16], m1
1839cabdff1aSopenharmony_ci    mova          [tmpq+ 5*16], m11
1840cabdff1aSopenharmony_ci    mova          [tmpq+ 7*16], m2
1841cabdff1aSopenharmony_ci    mova          [tmpq+ 9*16], m9
1842cabdff1aSopenharmony_ci    mova          [tmpq+11*16], m14
1843cabdff1aSopenharmony_ci    mova          [tmpq+13*16], m0
1844cabdff1aSopenharmony_ci    mova          [tmpq+15*16], m5
1845cabdff1aSopenharmony_ci%else
; NOTE(review): on this path slots 8*16 and 9*16 are never stored explicitly;
; presumably TRANSPOSE8x8W writes them through its second memory operand
; ([tmpq+8*%%str] / [tmpq+9*%%str], with %%str = 16 here) -- confirm against
; the macro definition.
1846cabdff1aSopenharmony_ci    mova       [tmpq+12*%%str], m2
1847cabdff1aSopenharmony_ci    mova       [tmpq+ 1*%%str], m5
1848cabdff1aSopenharmony_ci    mova       [tmpq+15*%%str], m7
1849cabdff1aSopenharmony_ci    mova                    m2, [tmpq+ 9*%%str]
1850cabdff1aSopenharmony_ci    mova                    m5, [tmpq+ 5*%%str]
1851cabdff1aSopenharmony_ci    mova                    m7, [tmpq+ 8*%%str]
1852cabdff1aSopenharmony_ci    TRANSPOSE8x8W            1, 2, 5, 0, 3, 7, 4, 6, [tmpq+ 6*%%str], [tmpq+ 8*%%str], 1
1853cabdff1aSopenharmony_ci    mova          [tmpq+ 0*16], m1
1854cabdff1aSopenharmony_ci    mova          [tmpq+ 2*16], m2
1855cabdff1aSopenharmony_ci    mova          [tmpq+ 4*16], m5
1856cabdff1aSopenharmony_ci    mova          [tmpq+ 6*16], m0
1857cabdff1aSopenharmony_ci    mova          [tmpq+10*16], m7
1858cabdff1aSopenharmony_ci    mova                    m3, [tmpq+12*%%str]
1859cabdff1aSopenharmony_ci    mova          [tmpq+12*16], m4
1860cabdff1aSopenharmony_ci    mova                    m4, [tmpq+14*%%str]
1861cabdff1aSopenharmony_ci    mova          [tmpq+14*16], m6
1862cabdff1aSopenharmony_ci
1863cabdff1aSopenharmony_ci    mova                    m0, [tmpq+15*%%str]
1864cabdff1aSopenharmony_ci    mova                    m1, [tmpq+ 3*%%str]
1865cabdff1aSopenharmony_ci    mova                    m2, [tmpq+ 7*%%str]
1866cabdff1aSopenharmony_ci    mova                    m5, [tmpq+11*%%str]
1867cabdff1aSopenharmony_ci    mova                    m7, [tmpq+ 1*%%str]
1868cabdff1aSopenharmony_ci    TRANSPOSE8x8W            0, 1, 2, 3, 4, 5, 6, 7, [tmpq+13*%%str], [tmpq+ 9*%%str], 1
1869cabdff1aSopenharmony_ci    mova          [tmpq+ 1*16], m0
1870cabdff1aSopenharmony_ci    mova          [tmpq+ 3*16], m1
1871cabdff1aSopenharmony_ci    mova          [tmpq+ 5*16], m2
1872cabdff1aSopenharmony_ci    mova          [tmpq+ 7*16], m3
1873cabdff1aSopenharmony_ci    mova          [tmpq+11*16], m5
1874cabdff1aSopenharmony_ci    mova          [tmpq+13*16], m6
1875cabdff1aSopenharmony_ci    mova          [tmpq+15*16], m7
1876cabdff1aSopenharmony_ci%endif
1877cabdff1aSopenharmony_ci%else
; pass 2: clear m4 (passed as the fifth argument of every VP9_IDCT8_WRITEx2
; below) and write the 16 rows out two at a time
1878cabdff1aSopenharmony_ci    pxor                    m4, m4
1879cabdff1aSopenharmony_ci
; rounding operand handed to VP9_IDCT8_WRITEx2: pw_512 when assembling for
; ssse3 or later, pw_32 otherwise
1880cabdff1aSopenharmony_ci%if cpuflag(ssse3)
1881cabdff1aSopenharmony_ci%define ROUND_REG [pw_512]
1882cabdff1aSopenharmony_ci%else
1883cabdff1aSopenharmony_ci%define ROUND_REG [pw_32]
1884cabdff1aSopenharmony_ci%endif
1885cabdff1aSopenharmony_ci
1886cabdff1aSopenharmony_ci%if ARCH_X86_64
1887cabdff1aSopenharmony_ci    mova                   m12, [tmpq+ 6*%%str]
1888cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2        1, 11, 10,  8,  4, ROUND_REG, 6
1889cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*2]
1890cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2       14,  0, 10,  8,  4, ROUND_REG, 6
1891cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*2]
1892cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2        3, 15, 10,  8,  4, ROUND_REG, 6
1893cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*2]
1894cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2       12,  6, 10,  8,  4, ROUND_REG, 6
1895cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*2]
1896cabdff1aSopenharmony_ci
1897cabdff1aSopenharmony_ci    mova                    m1, [tmpq+ 3*%%str]
1898cabdff1aSopenharmony_ci    mova                   m11, [tmpq+ 7*%%str]
1899cabdff1aSopenharmony_ci    mova                   m14, [tmpq+11*%%str]
1900cabdff1aSopenharmony_ci    mova                    m0, [tmpq+13*%%str]
1901cabdff1aSopenharmony_ci
1902cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2        7,  1, 10,  8,  4, ROUND_REG, 6
1903cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*2]
1904cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2       11,  2, 10,  8,  4, ROUND_REG, 6
1905cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*2]
1906cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2        9, 14, 10,  8,  4, ROUND_REG, 6
1907cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*2]
1908cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2        0,  5, 10,  8,  4, ROUND_REG, 6
1909cabdff1aSopenharmony_ci%else
1910cabdff1aSopenharmony_ci    mova       [tmpq+ 0*%%str], m2
1911cabdff1aSopenharmony_ci    mova       [tmpq+ 1*%%str], m5
1912cabdff1aSopenharmony_ci    mova       [tmpq+ 2*%%str], m7
1913cabdff1aSopenharmony_ci    mova                    m2, [tmpq+ 9*%%str]
1914cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2        1,  2,  5,  7,  4, ROUND_REG, 6
1915cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*2]
1916cabdff1aSopenharmony_ci    mova                    m5, [tmpq+ 5*%%str]
1917cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2        5,  0,  1,  2,  4, ROUND_REG, 6
1918cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*2]
1919cabdff1aSopenharmony_ci    mova                    m5, [tmpq+ 8*%%str]
1920cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2        3,  5,  1,  2,  4, ROUND_REG, 6
1921cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*2]
1922cabdff1aSopenharmony_ci    mova                    m5, [tmpq+ 6*%%str]
1923cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2        5,  6,  1,  2,  4, ROUND_REG, 6
1924cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*2]
1925cabdff1aSopenharmony_ci
1926cabdff1aSopenharmony_ci    mova                    m0, [tmpq+ 2*%%str]
1927cabdff1aSopenharmony_ci    mova                    m3, [tmpq+ 3*%%str]
1928cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2        0,  3,  1,  2,  4, ROUND_REG, 6
1929cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*2]
1930cabdff1aSopenharmony_ci    mova                    m0, [tmpq+ 7*%%str]
1931cabdff1aSopenharmony_ci    mova                    m3, [tmpq+ 0*%%str]
1932cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2        0,  3,  1,  2,  4, ROUND_REG, 6
1933cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*2]
1934cabdff1aSopenharmony_ci    mova                    m0, [tmpq+14*%%str]
1935cabdff1aSopenharmony_ci    mova                    m3, [tmpq+11*%%str]
1936cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2        0,  3,  1,  2,  4, ROUND_REG, 6
1937cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*2]
1938cabdff1aSopenharmony_ci    mova                    m0, [tmpq+13*%%str]
1939cabdff1aSopenharmony_ci    mova                    m3, [tmpq+ 1*%%str]
1940cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2        0,  3,  1,  2,  4, ROUND_REG, 6
1941cabdff1aSopenharmony_ci%endif
1942cabdff1aSopenharmony_ci
; rename so that m0 refers to the register cleared above; IADST16_FN passes m0
; to ZERO_BLOCK after the second-pass loop
1943cabdff1aSopenharmony_ci    SWAP                     0,  4 ; zero
1944cabdff1aSopenharmony_ci%undef ROUND_REG
1945cabdff1aSopenharmony_ci%endif
1946cabdff1aSopenharmony_ci%endmacro
1947cabdff1aSopenharmony_ci
; Emits one 16x16 inverse-transform-and-add function named
; vp9_<%1>_<%3>_16x16_add for the instruction set given in %5.
; Arguments:
;   %1 - first transform name, used only in the function name
;   %2 - first-pass 1-D macro: VP9_%2_1D is invoked as (blockq, 1)
;   %3 - second transform name, used only in the function name
;   %4 - second-pass 1-D macro: VP9_%4_1D is invoked as (tmpq, 2)
;   %5 - cpu flags handed to INIT_XMM
; The function runs each pass twice, once per 16-byte-wide half, using the
; stack (tmpq = rsp) as the intermediate buffer between the passes.
; NOTE(review): the cglobal operands "3, 6, 16, 512" are presumably x86inc's
; argument count, GPR count, SIMD register count and stack size; 512 matches
; the two first-pass iterations advancing tmpq by 256 bytes each.
1948cabdff1aSopenharmony_ci%macro IADST16_FN 5
1949cabdff1aSopenharmony_ciINIT_XMM %5
1950cabdff1aSopenharmony_cicglobal vp9_%1_%3_16x16_add, 3, 6, 16, 512, dst, stride, block, cnt, dst_bak, tmp
1951cabdff1aSopenharmony_ci    mov               cntd, 2                  ; two first-pass iterations
1952cabdff1aSopenharmony_ci    mov               tmpq, rsp                ; intermediate buffer lives on the stack
1953cabdff1aSopenharmony_ci.loop1_full:
1954cabdff1aSopenharmony_ci    VP9_%2_1D       blockq, 1
1955cabdff1aSopenharmony_ci    add             blockq, 16                 ; next 16-byte-wide group of input
1956cabdff1aSopenharmony_ci    add               tmpq, 256                ; next 256-byte half of the intermediate buffer
1957cabdff1aSopenharmony_ci    dec               cntd
1958cabdff1aSopenharmony_ci    jg .loop1_full
1959cabdff1aSopenharmony_ci    sub             blockq, 32                 ; undo the two +16 steps: blockq is needed again by ZERO_BLOCK
1960cabdff1aSopenharmony_ci
1961cabdff1aSopenharmony_ci    mov               cntd, 2                  ; two second-pass iterations
1962cabdff1aSopenharmony_ci    mov               tmpq, rsp                ; rewind to the start of the intermediate buffer
1963cabdff1aSopenharmony_ci    mov           dst_bakq, dstq               ; the pass-2 macro moves dstq, so keep the original
1964cabdff1aSopenharmony_ci.loop2_full:
1965cabdff1aSopenharmony_ci    VP9_%4_1D         tmpq, 2
1966cabdff1aSopenharmony_ci    lea               dstq, [dst_bakq+8]       ; second iteration writes 8 bytes to the right of the original dst
1967cabdff1aSopenharmony_ci    add               tmpq, 16                 ; next 16-byte-wide group of intermediate data
1968cabdff1aSopenharmony_ci    dec               cntd
1969cabdff1aSopenharmony_ci    jg .loop2_full
1970cabdff1aSopenharmony_ci
1971cabdff1aSopenharmony_ci    ; at the end of the loop, m0 should still be zero
1972cabdff1aSopenharmony_ci    ; use that to zero out block coefficients
1973cabdff1aSopenharmony_ci    ; (VP9_IADST16_1D pass 2 arranges this with "pxor m4, m4" ... "SWAP 0, 4")
1974cabdff1aSopenharmony_ci    ZERO_BLOCK      blockq, 32, 16, m0
1975cabdff1aSopenharmony_ci    RET
1976cabdff1aSopenharmony_ci%endmacro
1976cabdff1aSopenharmony_ci
; Instantiate the three 16x16 combinations that involve an IADST pass
; (idct+iadst, iadst+idct, iadst+iadst) for sse2, ssse3 and avx.
; Argument order: <first name>, <pass-1 1-D macro>, <second name>,
; <pass-2 1-D macro>, <cpu flags>; the generated function is
; vp9_<first name>_<second name>_16x16_add.
1977cabdff1aSopenharmony_ciIADST16_FN idct,  IDCT16,  iadst, IADST16, sse2
1978cabdff1aSopenharmony_ciIADST16_FN iadst, IADST16, idct,  IDCT16,  sse2
1979cabdff1aSopenharmony_ciIADST16_FN iadst, IADST16, iadst, IADST16, sse2
1980cabdff1aSopenharmony_ciIADST16_FN idct,  IDCT16,  iadst, IADST16, ssse3
1981cabdff1aSopenharmony_ciIADST16_FN iadst, IADST16, idct,  IDCT16,  ssse3
1982cabdff1aSopenharmony_ciIADST16_FN iadst, IADST16, iadst, IADST16, ssse3
1983cabdff1aSopenharmony_ciIADST16_FN idct,  IDCT16,  iadst, IADST16, avx
1984cabdff1aSopenharmony_ciIADST16_FN iadst, IADST16, idct,  IDCT16,  avx
1985cabdff1aSopenharmony_ciIADST16_FN iadst, IADST16, iadst, IADST16, avx
1986cabdff1aSopenharmony_ci
1987cabdff1aSopenharmony_ci; in: data in m[0-15] except m0/m4, which are in [blockq+0] and [blockq+128]
1988cabdff1aSopenharmony_ci; out: m[0-15] except m6, which is in [blockq+192]
1989cabdff1aSopenharmony_ci; uses blockq as scratch space
1990cabdff1aSopenharmony_ci%macro VP9_IADST16_YMM_1D 0
1991cabdff1aSopenharmony_ci    mova          [blockq+ 32], m3
1992cabdff1aSopenharmony_ci    mova          [blockq+ 64], m7
1993cabdff1aSopenharmony_ci    mova          [blockq+ 96], m8
1994cabdff1aSopenharmony_ci
1995cabdff1aSopenharmony_ci    ; first half of round 1
1996cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2D_4X  9,  6,  0,  3, 13160,  9760    ; m9/x=t7[d], m6/x=t6[d]
1997cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2D_4X  1, 14,  4,  7,  2404, 16207    ; m1/x=t15[d], m14/x=t14[d]
1998cabdff1aSopenharmony_ci    VP9_RND_SH_SUMSUB_BA    14,  6,  7,  3,  8, [pd_8192]   ; m14=t6[w], m6=t14[w]
1999cabdff1aSopenharmony_ci    VP9_RND_SH_SUMSUB_BA     1,  9,  4,  0,  8, [pd_8192]   ; m1=t7[w], m9=t15[w]
2000cabdff1aSopenharmony_ci
2001cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2D_4X 13,  2,  4,  7, 15893,  3981    ; m13/x=t3[d], m2/x=t2[d]
2002cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2D_4X  5, 10,  0,  3,  8423, 14053    ; m5/x=t11[d], m10/x=t10[d]
2003cabdff1aSopenharmony_ci    VP9_RND_SH_SUMSUB_BA    10,  2,  3,  7,  8, [pd_8192]   ; m10=t2[w], m2=t10[w]
2004cabdff1aSopenharmony_ci    VP9_RND_SH_SUMSUB_BA     5, 13,  0,  4,  8, [pd_8192]   ; m5=t3[w], m13=t11[w]
2005cabdff1aSopenharmony_ci
2006cabdff1aSopenharmony_ci    ; half of round 2 t8-15
2007cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2D_4X  2, 13,  4,  7,  9102, 13623    ; m2/x=t11[d], m13/x=t10[d]
2008cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2D_4X  9,  6,  3,  0, 13623,  9102    ; m9/x=t14[d], m6/x=t15[d]
2009cabdff1aSopenharmony_ci    VP9_RND_SH_SUMSUB_BA     9, 13,  3,  7,  8, [pd_8192]   ; m9=t10[w], m13=t14[w]
2010cabdff1aSopenharmony_ci    VP9_RND_SH_SUMSUB_BA     6,  2,  0,  4,  8, [pd_8192]   ; m6=t11[w], m2=t15[w]
2011cabdff1aSopenharmony_ci
2012cabdff1aSopenharmony_ci    SUMSUB_BA            w, 14, 10,  8                      ; m14=t2, m10=t6
2013cabdff1aSopenharmony_ci    SUMSUB_BA            w,  1,  5,  8                      ; m1=t3, m5=t7
2014cabdff1aSopenharmony_ci
2015cabdff1aSopenharmony_ci    mova                    m0, [blockq+  0]
2016cabdff1aSopenharmony_ci    mova                    m4, [blockq+128]
2017cabdff1aSopenharmony_ci    mova                    m3, [blockq+ 32]
2018cabdff1aSopenharmony_ci    mova                    m7, [blockq+ 64]
2019cabdff1aSopenharmony_ci    mova                    m8, [blockq+ 96]
2020cabdff1aSopenharmony_ci    mova          [blockq+  0], m1
2021cabdff1aSopenharmony_ci    mova          [blockq+128], m14
2022cabdff1aSopenharmony_ci    mova          [blockq+ 32], m6
2023cabdff1aSopenharmony_ci    mova          [blockq+ 64], m9
2024cabdff1aSopenharmony_ci    mova          [blockq+ 96], m10
2025cabdff1aSopenharmony_ci
2026cabdff1aSopenharmony_ci    ; second half of round 1
2027cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2D_4X 15,  0,  1,  9, 16364,   804    ; m15/x=t1[d], m0/x=t0[d]
2028cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2D_4X  7,  8, 10,  6, 11003, 12140    ; m7/x=t9[d], m8/x=t8[d]
2029cabdff1aSopenharmony_ci    VP9_RND_SH_SUMSUB_BA     8,  0,  6,  9, 14, [pd_8192]   ; m8=t0[w], m0=t8[w]
2030cabdff1aSopenharmony_ci    VP9_RND_SH_SUMSUB_BA     7, 15, 10,  1, 14, [pd_8192]   ; m7=t1[w], m15=t9[w]
2031cabdff1aSopenharmony_ci
2032cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2D_4X 11,  4, 10,  6, 14811,  7005    ; m11/x=t5[d], m4/x=t4[d]
2033cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2D_4X  3, 12,  1,  9,  5520, 15426    ; m3/x=t13[d], m12/x=t12[d]
2034cabdff1aSopenharmony_ci    VP9_RND_SH_SUMSUB_BA    12,  4,  9,  6, 14, [pd_8192]   ; m12=t4[w], m4=t12[w]
2035cabdff1aSopenharmony_ci    VP9_RND_SH_SUMSUB_BA     3, 11,  1, 10, 14, [pd_8192]   ; m3=t5[w], m11=t13[w]
2036cabdff1aSopenharmony_ci
2037cabdff1aSopenharmony_ci    ; second half of round 2 t8-15
2038cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2D_4X  0, 15,  6, 10, 16069,  3196    ; m15/x=t8[d], m0/x=t9[d]
2039cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2D_4X 11,  4,  9,  1,  3196, 16069    ; m11/x=t12[d], m4/x=t13[d]
2040cabdff1aSopenharmony_ci    VP9_RND_SH_SUMSUB_BA    11, 15,  9, 10, 14, [pd_8192]   ; m11=t8[w], m15=t12[w]
2041cabdff1aSopenharmony_ci    VP9_RND_SH_SUMSUB_BA     4,  0,  1,  6, 14, [pd_8192]   ; m4=t9[w], m0=t13[w]
2042cabdff1aSopenharmony_ci
2043cabdff1aSopenharmony_ci    SUMSUB_BA            w, 12,  8, 14                      ; m12=t0, m8=t4
2044cabdff1aSopenharmony_ci    SUMSUB_BA            w,  3,  7, 14                      ; m3=t1, m7=t5
2045cabdff1aSopenharmony_ci
2046cabdff1aSopenharmony_ci    mova                   m10, [blockq+ 96]
2047cabdff1aSopenharmony_ci    mova          [blockq+ 96], m12
2048cabdff1aSopenharmony_ci
2049cabdff1aSopenharmony_ci    ; round 3
2050cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2D_4X 15,  0,  9, 12, 15137,  6270    ; m15/x=t13[d], m0/x=t12[d]
2051cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2D_4X  2, 13,  1,  6,  6270, 15137    ; m2/x=t14[d], m13/x=t15[d]
2052cabdff1aSopenharmony_ci    VP9_RND_SH_SUMSUB_BA     2,  0,  1, 12, 14, [pd_8192]   ; m2=out2[w], m0=t14a[w]
2053cabdff1aSopenharmony_ci    VP9_RND_SH_SUMSUB_BA    13, 15,  6,  9, 14, [pd_8192]
2054cabdff1aSopenharmony_ci    PSIGNW                 m13, [pw_m1]                     ; m13=out13[w], m15=t15a[w]
2055cabdff1aSopenharmony_ci
2056cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2D_4X  8,  7, 12,  9, 15137,  6270    ; m8/x=t5[d], m7/x=t4[d]
2057cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2D_4X  5, 10,  1,  6,  6270, 15137    ; m5/x=t6[d], m10/x=t7[d]
2058cabdff1aSopenharmony_ci    VP9_RND_SH_SUMSUB_BA     5,  7,  1,  9, 14, [pd_8192]
2059cabdff1aSopenharmony_ci    PSIGNW                  m5, [pw_m1]                     ; m5=out3[w], m7=t6[w]
2060cabdff1aSopenharmony_ci    VP9_RND_SH_SUMSUB_BA    10,  8,  6, 12, 14, [pd_8192]   ; m10=out12[w], m8=t7[w]
2061cabdff1aSopenharmony_ci
2062cabdff1aSopenharmony_ci    mova                    m1, [blockq+  0]
2063cabdff1aSopenharmony_ci    mova                   m14, [blockq+128]
2064cabdff1aSopenharmony_ci    mova                    m6, [blockq+ 32]
2065cabdff1aSopenharmony_ci    mova                    m9, [blockq+ 64]
2066cabdff1aSopenharmony_ci    mova                   m12, [blockq+ 96]
2067cabdff1aSopenharmony_ci    mova          [blockq+  0], m10
2068cabdff1aSopenharmony_ci    mova          [blockq+128], m5
2069cabdff1aSopenharmony_ci
2070cabdff1aSopenharmony_ci    SUMSUB_BA            w, 14, 12,  5                      ; m14=out0, m12=t2a
2071cabdff1aSopenharmony_ci    SUMSUB_BA            w,  1,  3,  5
2072cabdff1aSopenharmony_ci    PSIGNW                  m1, [pw_m1]                     ; m1=out15, m3=t3a
2073cabdff1aSopenharmony_ci
2074cabdff1aSopenharmony_ci    SUMSUB_BA            w,  9, 11,  5
2075cabdff1aSopenharmony_ci    PSIGNW                  m9, [pw_m1]                     ; m9=out1, m11=t10
2076cabdff1aSopenharmony_ci    SUMSUB_BA            w,  6,  4,  5                      ; m6=out14, m4=t11
2077cabdff1aSopenharmony_ci
2078cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X  4, 11, 11585, 11585, [pd_8192],  5, 10 ; m4=out9, m11=out6
2079cabdff1aSopenharmony_ci    mova                    m5, [blockq+128]
2080cabdff1aSopenharmony_ci    mova          [blockq+192], m11
2081cabdff1aSopenharmony_ci    PSIGNW                 m15, [pw_m1]
2082cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X 15,  0, 11585, 11585, [pd_8192], 10, 11 ; m15=out5, m0=out10
2083cabdff1aSopenharmony_ci
2084cabdff1aSopenharmony_ci    PSIGNW                  m3, [pw_m1]
2085cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X  3, 12, 11585, 11585, [pd_8192], 10, 11 ; m3=out7,m12=out8
2086cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X  8,  7, 11585, 11585, [pd_8192], 10, 11 ; m8=out11,m7=out4
2087cabdff1aSopenharmony_ci
2088cabdff1aSopenharmony_ci    mova                   m10, [blockq+  0]
2089cabdff1aSopenharmony_ci
2090cabdff1aSopenharmony_ci    SWAP                     0, 14,  6, 11,  8, 12, 10
2091cabdff1aSopenharmony_ci    SWAP                     1,  9, 15,  4,  7,  3,  5
2092cabdff1aSopenharmony_ci    SWAP                     5,  9, 15
2093cabdff1aSopenharmony_ci%endmacro
2094cabdff1aSopenharmony_ci
2095cabdff1aSopenharmony_ci%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
;---------------------------------------------------------------------------------------------
; IADST16_YMM_FN type1, name1, type2, name2
;
; Emits one AVX2 function named vp9_<type1>_<type2>_16x16_add, declared with the arguments
; dst, stride, block, eob.
;   %1, %3: transform type names; used only to build the function name
;   %2, %4: pick the 1-D macros: VP9_<%2>_YMM_1D runs first, VP9_<%4>_YMM_1D runs after the
;           16x16 word transpose
; The function body: loads 14 of the 16 ymm-sized chunks of block, runs the first 1-D macro,
; transposes, runs the second 1-D macro, hands the 16 result registers pairwise to
; VP9_IDCT8_WRITEx2 (advancing dstq by 2*stride after each pair), then clears block.
; cglobal is given "4, 4, 16" (x86inc convention: 4 arguments, 4 GPRs, 16 vector registers).
; NOTE(review): eob is declared but never referenced directly in this macro body.
;---------------------------------------------------------------------------------------------
2096cabdff1aSopenharmony_ci%macro IADST16_YMM_FN 4 ; type1, 1-D macro name1, type2, 1-D macro name2
2097cabdff1aSopenharmony_ciINIT_YMM avx2
2098cabdff1aSopenharmony_cicglobal vp9_%1_%3_16x16_add, 4, 4, 16, dst, stride, block, eob
2099cabdff1aSopenharmony_ci    mova                m1, [blockq+ 32]   ; load mN from blockq+N*32 for N=1-3 and 5-15
2100cabdff1aSopenharmony_ci    mova                m2, [blockq+ 64]
2101cabdff1aSopenharmony_ci    mova                m3, [blockq+ 96]
2102cabdff1aSopenharmony_ci    mova                m5, [blockq+160]   ; offsets 0 and 128 (m0/m4) are skipped here; presumably the 1-D macro loads them itself - TODO confirm
2103cabdff1aSopenharmony_ci    mova                m6, [blockq+192]
2104cabdff1aSopenharmony_ci    mova                m7, [blockq+224]
2105cabdff1aSopenharmony_ci    mova                m8, [blockq+256]
2106cabdff1aSopenharmony_ci    mova                m9, [blockq+288]
2107cabdff1aSopenharmony_ci    mova               m10, [blockq+320]
2108cabdff1aSopenharmony_ci    mova               m11, [blockq+352]
2109cabdff1aSopenharmony_ci    mova               m12, [blockq+384]
2110cabdff1aSopenharmony_ci    mova               m13, [blockq+416]
2111cabdff1aSopenharmony_ci    mova               m14, [blockq+448]
2112cabdff1aSopenharmony_ci    mova               m15, [blockq+480]
2113cabdff1aSopenharmony_ci
2114cabdff1aSopenharmony_ci    VP9_%2_YMM_1D                          ; first 1-D pass
2115cabdff1aSopenharmony_ci    TRANSPOSE16x16W      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
2116cabdff1aSopenharmony_ci                         [blockq+192], [blockq+128], 1 ; block memory operands; presumably scratch slots for the transpose - TODO confirm against TRANSPOSE16x16W
2117cabdff1aSopenharmony_ci    mova      [blockq+  0], m0             ; park m0 in block memory before the second pass (presumably reloaded by the 1-D macro - confirm)
2118cabdff1aSopenharmony_ci    VP9_%4_YMM_1D                          ; second 1-D pass
2119cabdff1aSopenharmony_ci
2120cabdff1aSopenharmony_ci    mova      [blockq+224], m7             ; save m7: the next three write calls receive m6/m7 in the slots later calls fill with m1/m2 (apparently temporaries)
2121cabdff1aSopenharmony_ci
2122cabdff1aSopenharmony_ci    ; store: each VP9_IDCT8_WRITEx2 call takes two result registers, then dstq moves on by 2*stride
2123cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2    0,  1, 6, 7, unused, [pw_512], 6
2124cabdff1aSopenharmony_ci    lea               dstq, [dstq+2*strideq]
2125cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2    2,  3, 6, 7, unused, [pw_512], 6
2126cabdff1aSopenharmony_ci    lea               dstq, [dstq+2*strideq]
2127cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2    4,  5, 6, 7, unused, [pw_512], 6
2128cabdff1aSopenharmony_ci    lea               dstq, [dstq+2*strideq]
2129cabdff1aSopenharmony_ci    mova                m6, [blockq+192]   ; NOTE(review): nothing in this macro body stores m6 here after the transpose; relies on the second 1-D macro having left it at blockq+192 - confirm
2130cabdff1aSopenharmony_ci    mova                m7, [blockq+224]   ; reload the m7 saved before the store sequence
2131cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2    6,  7, 1, 2, unused, [pw_512], 6
2132cabdff1aSopenharmony_ci    lea               dstq, [dstq+2*strideq]
2133cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2    8,  9, 1, 2, unused, [pw_512], 6
2134cabdff1aSopenharmony_ci    lea               dstq, [dstq+2*strideq]
2135cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2   10, 11, 1, 2, unused, [pw_512], 6
2136cabdff1aSopenharmony_ci    lea               dstq, [dstq+2*strideq]
2137cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2   12, 13, 1, 2, unused, [pw_512], 6
2138cabdff1aSopenharmony_ci    lea               dstq, [dstq+2*strideq]
2139cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2   14, 15, 1, 2, unused, [pw_512], 6
2140cabdff1aSopenharmony_ci    lea               dstq, [dstq+2*strideq] ; NOTE(review): dstq is not referenced again below, so this last advance looks redundant
2141cabdff1aSopenharmony_ci
2142cabdff1aSopenharmony_ci    ; there is no loop in this function and m0 has been used for data above,
2143cabdff1aSopenharmony_ci    ; so clear it explicitly and use it to zero out block coefficients
2144cabdff1aSopenharmony_ci    pxor                m0, m0
2145cabdff1aSopenharmony_ci    ZERO_BLOCK      blockq, 32, 16, m0
2146cabdff1aSopenharmony_ci    RET
2147cabdff1aSopenharmony_ci%endmacro
2148cabdff1aSopenharmony_ci
2149cabdff1aSopenharmony_ciIADST16_YMM_FN idct,  IDCT16,  iadst, IADST16 ; vp9_idct_iadst_16x16_add
2150cabdff1aSopenharmony_ciIADST16_YMM_FN iadst, IADST16, idct,  IDCT16 ; vp9_iadst_idct_16x16_add
2151cabdff1aSopenharmony_ciIADST16_YMM_FN iadst, IADST16, iadst, IADST16 ; vp9_iadst_iadst_16x16_add
2152cabdff1aSopenharmony_ci%endif ; ARCH_X86_64 && HAVE_AVX2_EXTERNAL
2153cabdff1aSopenharmony_ci
2154cabdff1aSopenharmony_ci;---------------------------------------------------------------------------------------------
2155cabdff1aSopenharmony_ci; void vp9_idct_idct_32x32_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
2156cabdff1aSopenharmony_ci;---------------------------------------------------------------------------------------------
2157cabdff1aSopenharmony_ci
2158cabdff1aSopenharmony_ci%macro VP9_IDCT32_1D 2-3 32 ; src, pass, nnzc
2159cabdff1aSopenharmony_ci%if %2 == 1
2160cabdff1aSopenharmony_ci%assign %%str mmsize
2161cabdff1aSopenharmony_ci%else
2162cabdff1aSopenharmony_ci%assign %%str 64
2163cabdff1aSopenharmony_ci%endif
2164cabdff1aSopenharmony_ci
2165cabdff1aSopenharmony_ci    ; first do t0-15, this can be done identical to idct16x16
2166cabdff1aSopenharmony_ci    VP9_IDCT16_1D_START %1, %3/2, 64*2, tmpq, 2*%%str, 1
2167cabdff1aSopenharmony_ci
2168cabdff1aSopenharmony_ci    ; store everything on stack to make space available for t16-31
2169cabdff1aSopenharmony_ci    ; we store interleaved with the output of the second half (t16-31)
2170cabdff1aSopenharmony_ci    ; so we don't need to allocate extra stack space
2171cabdff1aSopenharmony_ci    mova    [tmpq+ 0*%%str], m0     ; t0
2172cabdff1aSopenharmony_ci    mova    [tmpq+ 4*%%str], m1     ; t1
2173cabdff1aSopenharmony_ci    mova    [tmpq+ 8*%%str], m2     ; t2
2174cabdff1aSopenharmony_ci    mova    [tmpq+12*%%str], m3     ; t3
2175cabdff1aSopenharmony_ci    mova    [tmpq+16*%%str], m4     ; t4
2176cabdff1aSopenharmony_ci    mova    [tmpq+20*%%str], m5     ; t5
2177cabdff1aSopenharmony_ci%if ARCH_X86_64
2178cabdff1aSopenharmony_ci    mova    [tmpq+22*%%str], m10    ; t10
2179cabdff1aSopenharmony_ci    mova    [tmpq+18*%%str], m11    ; t11
2180cabdff1aSopenharmony_ci    mova    [tmpq+14*%%str], m12    ; t12
2181cabdff1aSopenharmony_ci    mova    [tmpq+10*%%str], m13    ; t13
2182cabdff1aSopenharmony_ci    mova    [tmpq+ 6*%%str], m14    ; t14
2183cabdff1aSopenharmony_ci    mova    [tmpq+ 2*%%str], m15    ; t15
2184cabdff1aSopenharmony_ci%endif
2185cabdff1aSopenharmony_ci
2186cabdff1aSopenharmony_ci    mova                m0, [tmpq+ 30*%%str]
2187cabdff1aSopenharmony_ci    UNSCRATCH            1,  6, tmpq+26*%%str
2188cabdff1aSopenharmony_ci    UNSCRATCH            2,  8, tmpq+24*%%str
2189cabdff1aSopenharmony_ci    UNSCRATCH            3,  9, tmpq+28*%%str
2190cabdff1aSopenharmony_ci    SUMSUB_BA            w,  1,  3, 4       ; t6, t9
2191cabdff1aSopenharmony_ci    SUMSUB_BA            w,  0,  2, 4       ; t7, t8
2192cabdff1aSopenharmony_ci
2193cabdff1aSopenharmony_ci    mova    [tmpq+24*%%str], m1     ; t6
2194cabdff1aSopenharmony_ci    mova    [tmpq+28*%%str], m0     ; t7
2195cabdff1aSopenharmony_ci    mova    [tmpq+30*%%str], m2     ; t8
2196cabdff1aSopenharmony_ci    mova    [tmpq+26*%%str], m3     ; t9
2197cabdff1aSopenharmony_ci
2198cabdff1aSopenharmony_ci    ; then, secondly, do t16-31
2199cabdff1aSopenharmony_ci%if %3 <= 8
2200cabdff1aSopenharmony_ci    mova                 m4, [%1+ 1*64]
2201cabdff1aSopenharmony_ci    mova                 m7, [%1+ 7*64]
2202cabdff1aSopenharmony_ci
2203cabdff1aSopenharmony_ci    pmulhrsw             m1,  m4, [pw_16364x2] ;t31
2204cabdff1aSopenharmony_ci    pmulhrsw             m4, [pw_804x2] ;t16
2205cabdff1aSopenharmony_ci
2206cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X   5,  0,  1,  4, 16069,  3196, [pd_8192], 6,  2 ; t17, t30
2207cabdff1aSopenharmony_ci
2208cabdff1aSopenharmony_ci    pmulhrsw             m3,  m7, [pw_m5520x2] ;t19
2209cabdff1aSopenharmony_ci    pmulhrsw             m7, [pw_15426x2] ;t28
2210cabdff1aSopenharmony_ci
2211cabdff1aSopenharmony_ci    SCRATCH               4, 13, tmpq+ 1*%%str
2212cabdff1aSopenharmony_ci    SCRATCH               5, 12, tmpq+15*%%str
2213cabdff1aSopenharmony_ci
2214cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X   2,  6,  7,  3, 3196, m16069, [pd_8192], 4,  5 ; t18, t29
2215cabdff1aSopenharmony_ci%else
2216cabdff1aSopenharmony_ci    mova                 m0, [%1+ 1*64]
2217cabdff1aSopenharmony_ci    mova                 m1, [%1+15*64]
2218cabdff1aSopenharmony_ci%if %3 <= 16
2219cabdff1aSopenharmony_ci    pmulhrsw             m5, m0, [pw_16364x2]
2220cabdff1aSopenharmony_ci    pmulhrsw             m0, [pw_804x2]
2221cabdff1aSopenharmony_ci    pmulhrsw             m4, m1, [pw_m11003x2]
2222cabdff1aSopenharmony_ci    pmulhrsw             m1, [pw_12140x2]
2223cabdff1aSopenharmony_ci%else
2224cabdff1aSopenharmony_ci    mova                 m4, [%1+17*64]
2225cabdff1aSopenharmony_ci    mova                 m5, [%1+31*64]
2226cabdff1aSopenharmony_ci
2227cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X   0,  5, 16364,   804, [pd_8192], 2, 3 ; t16, t31
2228cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X   4,  1, 11003, 12140, [pd_8192], 2, 3 ; t17, t30
2229cabdff1aSopenharmony_ci%endif
2230cabdff1aSopenharmony_ci    SUMSUB_BA             w,  4,  0,  2
2231cabdff1aSopenharmony_ci    SUMSUB_BA             w,  1,  5,  2
2232cabdff1aSopenharmony_ci
2233cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X   5,  0, 16069,  3196, [pd_8192], 2, 3 ; t17, t30
2234cabdff1aSopenharmony_ci
2235cabdff1aSopenharmony_ci    SCRATCH               4, 13, tmpq+ 1*%%str
2236cabdff1aSopenharmony_ci    SCRATCH               5, 12, tmpq+15*%%str
2237cabdff1aSopenharmony_ci
2238cabdff1aSopenharmony_ci    mova                 m2, [%1+ 7*64]
2239cabdff1aSopenharmony_ci    mova                 m3, [%1+ 9*64]
2240cabdff1aSopenharmony_ci%if %3 <= 16
2241cabdff1aSopenharmony_ci    pmulhrsw             m7,  m3, [pw_14811x2]
2242cabdff1aSopenharmony_ci    pmulhrsw             m3, [pw_7005x2]
2243cabdff1aSopenharmony_ci    pmulhrsw             m6,  m2, [pw_m5520x2]
2244cabdff1aSopenharmony_ci    pmulhrsw             m2, [pw_15426x2]
2245cabdff1aSopenharmony_ci%else
2246cabdff1aSopenharmony_ci    mova                 m7, [%1+23*64]
2247cabdff1aSopenharmony_ci    mova                 m6, [%1+25*64]
2248cabdff1aSopenharmony_ci
2249cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X   3,  7, 14811,  7005, [pd_8192], 4, 5 ; t18, t29
2250cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X   6,  2,  5520, 15426, [pd_8192], 4, 5 ; t19, t28
2251cabdff1aSopenharmony_ci%endif
2252cabdff1aSopenharmony_ci    SUMSUB_BA             w,  3,  6,  4
2253cabdff1aSopenharmony_ci    SUMSUB_BA             w,  7,  2,  4
2254cabdff1aSopenharmony_ci
2255cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X   2,  6, 3196, m16069, [pd_8192], 4, 5 ; t18, t29
2256cabdff1aSopenharmony_ci%endif
2257cabdff1aSopenharmony_ci
2258cabdff1aSopenharmony_ci    UNSCRATCH             5, 12, tmpq+15*%%str
2259cabdff1aSopenharmony_ci    SUMSUB_BA             w,  6,  0,  4
2260cabdff1aSopenharmony_ci    mova    [tmpq+25*%%str], m6             ; t19
2261cabdff1aSopenharmony_ci    UNSCRATCH             4, 13, tmpq+ 1*%%str
2262cabdff1aSopenharmony_ci    SUMSUB_BA             w,  7,  1,  6
2263cabdff1aSopenharmony_ci    SUMSUB_BA             w,  3,  4,  6
2264cabdff1aSopenharmony_ci    mova    [tmpq+23*%%str], m3             ; t16
2265cabdff1aSopenharmony_ci    SUMSUB_BA             w,  2,  5,  6
2266cabdff1aSopenharmony_ci
2267cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X   0,  5, 15137,  6270, [pd_8192], 6, 3 ; t18, t29
2268cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X   1,  4, 15137,  6270, [pd_8192], 6, 3 ; t19, t28
2269cabdff1aSopenharmony_ci
2270cabdff1aSopenharmony_ci    SCRATCH               0, 10, tmpq+ 1*%%str
2271cabdff1aSopenharmony_ci    SCRATCH               1, 11, tmpq+ 7*%%str
2272cabdff1aSopenharmony_ci    SCRATCH               2,  9, tmpq+ 9*%%str
2273cabdff1aSopenharmony_ci    SCRATCH               4, 14, tmpq+15*%%str
2274cabdff1aSopenharmony_ci    SCRATCH               5, 15, tmpq+17*%%str
2275cabdff1aSopenharmony_ci    SCRATCH               7, 13, tmpq+31*%%str
2276cabdff1aSopenharmony_ci
2277cabdff1aSopenharmony_ci%if %3 <= 8
2278cabdff1aSopenharmony_ci    mova                 m0, [%1+ 5*64]
2279cabdff1aSopenharmony_ci    mova                 m3, [%1+ 3*64]
2280cabdff1aSopenharmony_ci
2281cabdff1aSopenharmony_ci    pmulhrsw             m5,  m0, [pw_15893x2] ;t27
2282cabdff1aSopenharmony_ci    pmulhrsw             m0, [pw_3981x2] ;t20
2283cabdff1aSopenharmony_ci
2284cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X   1,  4,  5,  0,  9102, 13623, [pd_8192], 7,  2 ; t21, t26
2285cabdff1aSopenharmony_ci
2286cabdff1aSopenharmony_ci    pmulhrsw             m6,  m3, [pw_m2404x2] ;t23
2287cabdff1aSopenharmony_ci    pmulhrsw             m3, [pw_16207x2] ;t24
2288cabdff1aSopenharmony_ci
2289cabdff1aSopenharmony_ci    SCRATCH               5,  8, tmpq+ 5*%%str
2290cabdff1aSopenharmony_ci    SCRATCH               4, 12, tmpq+11*%%str
2291cabdff1aSopenharmony_ci
2292cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X   7,  2,  3,  6, 13623, m9102, [pd_8192], 4, 5 ; t22, t25
2293cabdff1aSopenharmony_ci%else
2294cabdff1aSopenharmony_ci    mova                 m4, [%1+ 5*64]
2295cabdff1aSopenharmony_ci    mova                 m5, [%1+11*64]
2296cabdff1aSopenharmony_ci%if %3 <= 16
2297cabdff1aSopenharmony_ci    pmulhrsw             m1, m4, [pw_15893x2]
2298cabdff1aSopenharmony_ci    pmulhrsw             m4, [pw_3981x2]
2299cabdff1aSopenharmony_ci    pmulhrsw             m0, m5, [pw_m8423x2]
2300cabdff1aSopenharmony_ci    pmulhrsw             m5, [pw_14053x2]
2301cabdff1aSopenharmony_ci%else
2302cabdff1aSopenharmony_ci    mova                 m0, [%1+21*64]
2303cabdff1aSopenharmony_ci    mova                 m1, [%1+27*64]
2304cabdff1aSopenharmony_ci
2305cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X   4,  1, 15893,  3981, [pd_8192], 2, 3 ; t20, t27
2306cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X   0,  5,  8423, 14053, [pd_8192], 2, 3 ; t21, t26
2307cabdff1aSopenharmony_ci%endif
2308cabdff1aSopenharmony_ci    SUMSUB_BA             w,  0,  4,  2
2309cabdff1aSopenharmony_ci    SUMSUB_BA             w,  5,  1,  2
2310cabdff1aSopenharmony_ci
2311cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X   1,  4,  9102, 13623, [pd_8192], 2, 3 ; t21, t26
2312cabdff1aSopenharmony_ci
2313cabdff1aSopenharmony_ci    SCRATCH               5,  8, tmpq+ 5*%%str
2314cabdff1aSopenharmony_ci    SCRATCH               4, 12, tmpq+11*%%str
2315cabdff1aSopenharmony_ci
2316cabdff1aSopenharmony_ci    mova                 m7, [%1+ 3*64]
2317cabdff1aSopenharmony_ci    mova                 m6, [%1+13*64]
2318cabdff1aSopenharmony_ci%if %3 <= 16
2319cabdff1aSopenharmony_ci    pmulhrsw             m3, m6, [pw_13160x2]
2320cabdff1aSopenharmony_ci    pmulhrsw             m6, [pw_9760x2]
2321cabdff1aSopenharmony_ci    pmulhrsw             m2, m7, [pw_m2404x2]
2322cabdff1aSopenharmony_ci    pmulhrsw             m7, [pw_16207x2]
2323cabdff1aSopenharmony_ci%else
2324cabdff1aSopenharmony_ci    mova                 m2, [%1+29*64]
2325cabdff1aSopenharmony_ci    mova                 m3, [%1+19*64]
2326cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X   6,  3, 13160,  9760, [pd_8192], 4, 5 ; t22, t25
2327cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X   2,  7,  2404, 16207, [pd_8192], 4, 5 ; t23, t24
2328cabdff1aSopenharmony_ci%endif
2329cabdff1aSopenharmony_ci    SUMSUB_BA             w,  6,  2,  4
2330cabdff1aSopenharmony_ci    SUMSUB_BA             w,  3,  7,  4
2331cabdff1aSopenharmony_ci
2332cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X   7,  2, 13623, m9102, [pd_8192], 4, 5 ; t22, t25
2333cabdff1aSopenharmony_ci%endif
2334cabdff1aSopenharmony_ci
2335cabdff1aSopenharmony_ci    ; m4=t16, m5=t17, m9=t18, m8=t19, m0=t20, m1=t21, m13=t22, m12=t23,
2336cabdff1aSopenharmony_ci    ; m3=t24, m2=t25, m14=t26, m15=t27, m7=t28, m6=t29, m10=t30, m11=t31
2337cabdff1aSopenharmony_ci
2338cabdff1aSopenharmony_ci    UNSCRATCH             4, 12, tmpq+11*%%str
2339cabdff1aSopenharmony_ci    SUMSUB_BA             w,  0,  6, 5
2340cabdff1aSopenharmony_ci    SUMSUB_BA             w,  4,  2, 5
2341cabdff1aSopenharmony_ci    UNSCRATCH             5,  8, tmpq+ 5*%%str
2342cabdff1aSopenharmony_ci    SCRATCH               4,  8, tmpq+11*%%str
2343cabdff1aSopenharmony_ci    SUMSUB_BA             w,  1,  7, 4
2344cabdff1aSopenharmony_ci    SUMSUB_BA             w,  5,  3, 4
2345cabdff1aSopenharmony_ci    SCRATCH               5, 12, tmpq+ 5*%%str
2346cabdff1aSopenharmony_ci
2347cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X   3,  6, 6270, m15137, [pd_8192], 4, 5 ; t20, t27
2348cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X   2,  7, 6270, m15137, [pd_8192], 4, 5 ; t21, t26
2349cabdff1aSopenharmony_ci
2350cabdff1aSopenharmony_ci    ; m8[s]=t16, m9=t17, m5=t18, m4[s]=t19, m12=t20, m13=t21, m1=t22, m0=t23,
2351cabdff1aSopenharmony_ci    ; m15=t24, m14=t25, m2=t26, m3=t27, m11=t28, m10=t29, m6=t30, m7=t31
2352cabdff1aSopenharmony_ci
2353cabdff1aSopenharmony_ci    UNSCRATCH             5,  9, tmpq+ 9*%%str
2354cabdff1aSopenharmony_ci    mova                 m4, [tmpq+23*%%str] ; t16
2355cabdff1aSopenharmony_ci%if ARCH_X86_64
2356cabdff1aSopenharmony_ci    SUMSUB_BA             w,  1,  5,  9
2357cabdff1aSopenharmony_ci    SUMSUB_BA             w,  0,  4,  9
2358cabdff1aSopenharmony_ci%else
2359cabdff1aSopenharmony_ci    SUMSUB_BADC           w,  1,  5,  0,  4
2360cabdff1aSopenharmony_ci%endif
2361cabdff1aSopenharmony_ci    mova    [tmpq+29*%%str], m1     ; t17
2362cabdff1aSopenharmony_ci    mova    [tmpq+21*%%str], m0     ; t16
2363cabdff1aSopenharmony_ci    UNSCRATCH             0, 10, tmpq+ 1*%%str
2364cabdff1aSopenharmony_ci    UNSCRATCH             1, 11, tmpq+ 7*%%str
2365cabdff1aSopenharmony_ci%if ARCH_X86_64
2366cabdff1aSopenharmony_ci    SUMSUB_BA             w,  2,  0,  9
2367cabdff1aSopenharmony_ci    SUMSUB_BA             w,  3,  1,  9
2368cabdff1aSopenharmony_ci%else
2369cabdff1aSopenharmony_ci    SUMSUB_BADC           w,  2,  0,  3,  1
2370cabdff1aSopenharmony_ci%endif
2371cabdff1aSopenharmony_ci    mova    [tmpq+ 9*%%str], m2     ; t18
2372cabdff1aSopenharmony_ci    mova    [tmpq+13*%%str], m3     ; t19
2373cabdff1aSopenharmony_ci    SCRATCH               0, 10, tmpq+23*%%str
2374cabdff1aSopenharmony_ci    SCRATCH               1, 11, tmpq+27*%%str
2375cabdff1aSopenharmony_ci
2376cabdff1aSopenharmony_ci    UNSCRATCH             2, 14, tmpq+15*%%str
2377cabdff1aSopenharmony_ci    UNSCRATCH             3, 15, tmpq+17*%%str
2378cabdff1aSopenharmony_ci    SUMSUB_BA             w,  6,  2, 0
2379cabdff1aSopenharmony_ci    SUMSUB_BA             w,  7,  3, 0
2380cabdff1aSopenharmony_ci    SCRATCH               6, 14, tmpq+ 3*%%str
2381cabdff1aSopenharmony_ci    SCRATCH               7, 15, tmpq+ 7*%%str
2382cabdff1aSopenharmony_ci
2383cabdff1aSopenharmony_ci    UNSCRATCH             0,  8, tmpq+11*%%str
2384cabdff1aSopenharmony_ci    mova                 m1, [tmpq+25*%%str] ; t19
2385cabdff1aSopenharmony_ci    UNSCRATCH             6, 12, tmpq+ 5*%%str
2386cabdff1aSopenharmony_ci    UNSCRATCH             7, 13, tmpq+31*%%str
2387cabdff1aSopenharmony_ci%if ARCH_X86_64
2388cabdff1aSopenharmony_ci    SUMSUB_BA             w,  0,  1,  9
2389cabdff1aSopenharmony_ci    SUMSUB_BA             w,  6,  7,  9
2390cabdff1aSopenharmony_ci%else
2391cabdff1aSopenharmony_ci    SUMSUB_BADC           w,  0,  1,  6,  7
2392cabdff1aSopenharmony_ci%endif
2393cabdff1aSopenharmony_ci
2394cabdff1aSopenharmony_ci    ; m0=t16, m1=t17, m2=t18, m3=t19, m11=t20, m10=t21, m9=t22, m8=t23,
2395cabdff1aSopenharmony_ci    ; m7=t24, m6=t25, m5=t26, m4=t27, m12=t28, m13=t29, m14=t30, m15=t31
2396cabdff1aSopenharmony_ci
2397cabdff1aSopenharmony_ci%if 0; cpuflag(ssse3)
2398cabdff1aSopenharmony_ci%if ARCH_X86_64
2399cabdff1aSopenharmony_ci    SUMSUB_BA             w,  4,  7,  8
2400cabdff1aSopenharmony_ci    SUMSUB_BA             w,  5,  1,  8
2401cabdff1aSopenharmony_ci%else
2402cabdff1aSopenharmony_ci    SUMSUB_BADC           w,  4,  7,  5,  1
2403cabdff1aSopenharmony_ci%endif
2404cabdff1aSopenharmony_ci
2405cabdff1aSopenharmony_ci    pmulhrsw             m7, [pw_11585x2]
2406cabdff1aSopenharmony_ci    pmulhrsw             m4, [pw_11585x2]
2407cabdff1aSopenharmony_ci    pmulhrsw             m1, [pw_11585x2]
2408cabdff1aSopenharmony_ci    pmulhrsw             m5, [pw_11585x2]
2409cabdff1aSopenharmony_ci
2410cabdff1aSopenharmony_ci    mova    [tmpq+ 5*%%str], m7     ; t23
2411cabdff1aSopenharmony_ci    SCRATCH               1, 13, tmpq+25*%%str
2412cabdff1aSopenharmony_ci    UNSCRATCH             7, 10, tmpq+23*%%str
2413cabdff1aSopenharmony_ci    UNSCRATCH             1, 11, tmpq+27*%%str
2414cabdff1aSopenharmony_ci
2415cabdff1aSopenharmony_ci%if ARCH_X86_64
2416cabdff1aSopenharmony_ci    SUMSUB_BA             w,  7,  3, 10
2417cabdff1aSopenharmony_ci    SUMSUB_BA             w,  1,  2, 10
2418cabdff1aSopenharmony_ci%else
2419cabdff1aSopenharmony_ci    SUMSUB_BADC           w,  7,  3,  1,  2
2420cabdff1aSopenharmony_ci%endif
2421cabdff1aSopenharmony_ci
2422cabdff1aSopenharmony_ci    pmulhrsw             m3, [pw_11585x2]
2423cabdff1aSopenharmony_ci    pmulhrsw             m7, [pw_11585x2]
2424cabdff1aSopenharmony_ci    pmulhrsw             m2, [pw_11585x2]
2425cabdff1aSopenharmony_ci    pmulhrsw             m1, [pw_11585x2]
2426cabdff1aSopenharmony_ci%else
2427cabdff1aSopenharmony_ci    SCRATCH               0,  8, tmpq+15*%%str
2428cabdff1aSopenharmony_ci    SCRATCH               6,  9, tmpq+17*%%str
2429cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X  7,  4, 11585, 11585, [pd_8192], 0, 6
2430cabdff1aSopenharmony_ci    mova    [tmpq+ 5*%%str], m7     ; t23
2431cabdff1aSopenharmony_ci    UNSCRATCH             7, 10, tmpq+23*%%str
2432cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X  1,  5, 11585, 11585, [pd_8192], 0, 6
2433cabdff1aSopenharmony_ci    SCRATCH               1, 13, tmpq+25*%%str
2434cabdff1aSopenharmony_ci    UNSCRATCH             1, 11, tmpq+27*%%str
2435cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X  3,  7, 11585, 11585, [pd_8192], 0, 6
2436cabdff1aSopenharmony_ci    VP9_UNPACK_MULSUB_2W_4X  2,  1, 11585, 11585, [pd_8192], 0, 6
2437cabdff1aSopenharmony_ci    UNSCRATCH             0,  8, tmpq+15*%%str
2438cabdff1aSopenharmony_ci    UNSCRATCH             6,  9, tmpq+17*%%str
2439cabdff1aSopenharmony_ci%endif
2440cabdff1aSopenharmony_ci
2441cabdff1aSopenharmony_ci    ; m0=t16, m1=t17, m2=t18, m3=t19, m4=t20, m5=t21, m6=t22, m7=t23,
2442cabdff1aSopenharmony_ci    ; m8=t24, m9=t25, m10=t26, m11=t27, m12=t28, m13=t29, m14=t30, m15=t31
2443cabdff1aSopenharmony_ci
2444cabdff1aSopenharmony_ci    ; then do final pass to sumsub+store the two halves
2445cabdff1aSopenharmony_ci%if %2 == 1
2446cabdff1aSopenharmony_ci    mova    [tmpq+17*%%str], m2     ; t20
2447cabdff1aSopenharmony_ci    mova    [tmpq+ 1*%%str], m3     ; t21
2448cabdff1aSopenharmony_ci%if ARCH_X86_64
2449cabdff1aSopenharmony_ci    mova    [tmpq+25*%%str], m13    ; t22
2450cabdff1aSopenharmony_ci
2451cabdff1aSopenharmony_ci    mova                 m8, [tmpq+ 0*%%str] ; t0
2452cabdff1aSopenharmony_ci    mova                 m9, [tmpq+ 4*%%str] ; t1
2453cabdff1aSopenharmony_ci    mova                m12, [tmpq+ 8*%%str] ; t2
2454cabdff1aSopenharmony_ci    mova                m11, [tmpq+12*%%str] ; t3
2455cabdff1aSopenharmony_ci    mova                 m2, [tmpq+16*%%str] ; t4
2456cabdff1aSopenharmony_ci    mova                 m3, [tmpq+20*%%str] ; t5
2457cabdff1aSopenharmony_ci    mova                m13, [tmpq+24*%%str] ; t6
2458cabdff1aSopenharmony_ci
2459cabdff1aSopenharmony_ci    SUMSUB_BA             w,  6,  8, 10
2460cabdff1aSopenharmony_ci    mova    [tmpq+ 3*%%str], m8              ; t15
2461cabdff1aSopenharmony_ci    SUMSUB_BA             w,  0,  9,  8
2462cabdff1aSopenharmony_ci    SUMSUB_BA             w, 15, 12,  8
2463cabdff1aSopenharmony_ci    SUMSUB_BA             w, 14, 11,  8
2464cabdff1aSopenharmony_ci    SUMSUB_BA             w,  1,  2,  8
2465cabdff1aSopenharmony_ci    SUMSUB_BA             w,  7,  3,  8
2466cabdff1aSopenharmony_ci    SUMSUB_BA             w,  5, 13,  8
2467cabdff1aSopenharmony_ci    mova                m10, [tmpq+28*%%str] ; t7
2468cabdff1aSopenharmony_ci    SUMSUB_BA             w,  4, 10,  8
2469cabdff1aSopenharmony_ci%if cpuflag(avx2)
2470cabdff1aSopenharmony_ci    ; the awkward part of this idct is that the final pass does the outermost
2471cabdff1aSopenharmony_ci    ; interleaved sumsubs (t0/31, t1/30, etc.) but the tN for the 16x16 need
2472cabdff1aSopenharmony_ci    ; to be sequential, which means half of the sumsub intermediates have to
2473cabdff1aSopenharmony_ci    ; be stored to and reloaded from memory to get a 16x16 transpose going.
2474cabdff1aSopenharmony_ci    ; This would be easier if we had more (e.g. 32) YMM regs here.
2475cabdff1aSopenharmony_ci    mova    [tmpq+ 7*%%str], m9
2476cabdff1aSopenharmony_ci    mova    [tmpq+11*%%str], m12
2477cabdff1aSopenharmony_ci    mova    [tmpq+15*%%str], m11
2478cabdff1aSopenharmony_ci    mova    [tmpq+19*%%str], m2
2479cabdff1aSopenharmony_ci    mova    [tmpq+23*%%str], m3
2480cabdff1aSopenharmony_ci    mova    [tmpq+27*%%str], m13
2481cabdff1aSopenharmony_ci    mova    [tmpq+31*%%str], m10
2482cabdff1aSopenharmony_ci    mova    [tmpq+12*%%str], m5
2483cabdff1aSopenharmony_ci
2484cabdff1aSopenharmony_ci    mova                m13, [tmpq+30*%%str] ; t8
2485cabdff1aSopenharmony_ci    mova                m12, [tmpq+26*%%str] ; t9
2486cabdff1aSopenharmony_ci    mova                m11, [tmpq+22*%%str] ; t10
2487cabdff1aSopenharmony_ci    mova                m10, [tmpq+18*%%str] ; t11
2488cabdff1aSopenharmony_ci    mova                 m9, [tmpq+17*%%str] ; t20
2489cabdff1aSopenharmony_ci    mova                 m8, [tmpq+ 1*%%str] ; t21
2490cabdff1aSopenharmony_ci    mova                 m3, [tmpq+25*%%str] ; t22
2491cabdff1aSopenharmony_ci    mova                 m2, [tmpq+ 5*%%str] ; t23
2492cabdff1aSopenharmony_ci
2493cabdff1aSopenharmony_ci    SUMSUB_BA             w,  9, 10, 5
2494cabdff1aSopenharmony_ci    SUMSUB_BA             w,  8, 11, 5
2495cabdff1aSopenharmony_ci    SUMSUB_BA             w,  3, 12, 5
2496cabdff1aSopenharmony_ci    SUMSUB_BA             w,  2, 13, 5
2497cabdff1aSopenharmony_ci    mova    [tmpq+ 1*%%str], m10
2498cabdff1aSopenharmony_ci    mova    [tmpq+ 5*%%str], m11
2499cabdff1aSopenharmony_ci    mova    [tmpq+17*%%str], m12
2500cabdff1aSopenharmony_ci    mova    [tmpq+25*%%str], m13
2501cabdff1aSopenharmony_ci
2502cabdff1aSopenharmony_ci    mova                m13, [tmpq+14*%%str] ; t12
2503cabdff1aSopenharmony_ci    mova                m12, [tmpq+10*%%str] ; t13
2504cabdff1aSopenharmony_ci    mova                m11, [tmpq+ 9*%%str] ; t18
2505cabdff1aSopenharmony_ci    mova                m10, [tmpq+13*%%str] ; t19
2506cabdff1aSopenharmony_ci
2507cabdff1aSopenharmony_ci    SUMSUB_BA             w, 11, 12, 5
2508cabdff1aSopenharmony_ci    SUMSUB_BA             w, 10, 13, 5
2509cabdff1aSopenharmony_ci    mova    [tmpq+ 9*%%str], m13
2510cabdff1aSopenharmony_ci    mova    [tmpq+13*%%str], m12
2511cabdff1aSopenharmony_ci    mova    [tmpq+10*%%str], m10
2512cabdff1aSopenharmony_ci    mova    [tmpq+14*%%str], m11
2513cabdff1aSopenharmony_ci
2514cabdff1aSopenharmony_ci    mova                m13, [tmpq+ 6*%%str] ; t14
2515cabdff1aSopenharmony_ci    mova                m12, [tmpq+ 2*%%str] ; t15
2516cabdff1aSopenharmony_ci    mova                m11, [tmpq+21*%%str] ; t16
2517cabdff1aSopenharmony_ci    mova                m10, [tmpq+29*%%str] ; t17
2518cabdff1aSopenharmony_ci    SUMSUB_BA             w, 11, 12, 5
2519cabdff1aSopenharmony_ci    SUMSUB_BA             w, 10, 13, 5
2520cabdff1aSopenharmony_ci    mova    [tmpq+21*%%str], m12
2521cabdff1aSopenharmony_ci    mova    [tmpq+29*%%str], m13
2522cabdff1aSopenharmony_ci    mova                m12, [tmpq+10*%%str]
2523cabdff1aSopenharmony_ci    mova                m13, [tmpq+14*%%str]
2524cabdff1aSopenharmony_ci
2525cabdff1aSopenharmony_ci    TRANSPOSE16x16W       6,  0, 15, 14,  1,  7,  5,  4, \
2526cabdff1aSopenharmony_ci                          2,  3,  8,  9, 12, 13, 10, 11, \
2527cabdff1aSopenharmony_ci            [tmpq+12*%%str], [tmpq+ 8*%%str], 1
2528cabdff1aSopenharmony_ci    mova    [tmpq+ 0*%%str], m6
2529cabdff1aSopenharmony_ci    mova    [tmpq+ 2*%%str], m0
2530cabdff1aSopenharmony_ci    mova    [tmpq+ 4*%%str], m15
2531cabdff1aSopenharmony_ci    mova    [tmpq+ 6*%%str], m14
2532cabdff1aSopenharmony_ci    mova    [tmpq+10*%%str], m7
2533cabdff1aSopenharmony_ci    mova    [tmpq+12*%%str], m5
2534cabdff1aSopenharmony_ci    mova    [tmpq+14*%%str], m4
2535cabdff1aSopenharmony_ci    mova    [tmpq+16*%%str], m2
2536cabdff1aSopenharmony_ci    mova    [tmpq+18*%%str], m3
2537cabdff1aSopenharmony_ci    mova    [tmpq+20*%%str], m8
2538cabdff1aSopenharmony_ci    mova    [tmpq+22*%%str], m9
2539cabdff1aSopenharmony_ci    mova    [tmpq+24*%%str], m12
2540cabdff1aSopenharmony_ci    mova    [tmpq+26*%%str], m13
2541cabdff1aSopenharmony_ci    mova    [tmpq+28*%%str], m10
2542cabdff1aSopenharmony_ci    mova    [tmpq+30*%%str], m11
2543cabdff1aSopenharmony_ci
2544cabdff1aSopenharmony_ci    mova                 m0, [tmpq+21*%%str]
2545cabdff1aSopenharmony_ci    mova                 m1, [tmpq+29*%%str]
2546cabdff1aSopenharmony_ci    mova                 m2, [tmpq+13*%%str]
2547cabdff1aSopenharmony_ci    mova                 m3, [tmpq+ 9*%%str]
2548cabdff1aSopenharmony_ci    mova                 m4, [tmpq+ 1*%%str]
2549cabdff1aSopenharmony_ci    mova                 m5, [tmpq+ 5*%%str]
2550cabdff1aSopenharmony_ci    mova                 m7, [tmpq+25*%%str]
2551cabdff1aSopenharmony_ci    mova                 m8, [tmpq+31*%%str]
2552cabdff1aSopenharmony_ci    mova                 m9, [tmpq+27*%%str]
2553cabdff1aSopenharmony_ci    mova                m10, [tmpq+23*%%str]
2554cabdff1aSopenharmony_ci    mova                m11, [tmpq+19*%%str]
2555cabdff1aSopenharmony_ci    mova                m12, [tmpq+15*%%str]
2556cabdff1aSopenharmony_ci    mova                m13, [tmpq+11*%%str]
2557cabdff1aSopenharmony_ci    mova                m14, [tmpq+ 7*%%str]
2558cabdff1aSopenharmony_ci    mova                m15, [tmpq+ 3*%%str]
2559cabdff1aSopenharmony_ci    TRANSPOSE16x16W       0,  1,  2,  3,  4,  5,  6,  7, \
2560cabdff1aSopenharmony_ci                          8,  9, 10, 11, 12, 13, 14, 15, \
2561cabdff1aSopenharmony_ci            [tmpq+17*%%str], [tmpq+ 9*%%str], 1
2562cabdff1aSopenharmony_ci    mova    [tmpq+ 1*%%str], m0
2563cabdff1aSopenharmony_ci    mova    [tmpq+ 3*%%str], m1
2564cabdff1aSopenharmony_ci    mova    [tmpq+ 5*%%str], m2
2565cabdff1aSopenharmony_ci    mova    [tmpq+ 7*%%str], m3
2566cabdff1aSopenharmony_ci    mova    [tmpq+11*%%str], m5
2567cabdff1aSopenharmony_ci    mova    [tmpq+13*%%str], m6
2568cabdff1aSopenharmony_ci    mova    [tmpq+15*%%str], m7
2569cabdff1aSopenharmony_ci    mova    [tmpq+17*%%str], m8
2570cabdff1aSopenharmony_ci    mova    [tmpq+19*%%str], m9
2571cabdff1aSopenharmony_ci    mova    [tmpq+21*%%str], m10
2572cabdff1aSopenharmony_ci    mova    [tmpq+23*%%str], m11
2573cabdff1aSopenharmony_ci    mova    [tmpq+25*%%str], m12
2574cabdff1aSopenharmony_ci    mova    [tmpq+27*%%str], m13
2575cabdff1aSopenharmony_ci    mova    [tmpq+29*%%str], m14
2576cabdff1aSopenharmony_ci    mova    [tmpq+31*%%str], m15
2577cabdff1aSopenharmony_ci%else ; !avx2
2578cabdff1aSopenharmony_ci    TRANSPOSE8x8W         6, 0, 15, 14, 1, 7, 5, 4, 8
2579cabdff1aSopenharmony_ci    mova    [tmpq+ 0*%%str], m6
2580cabdff1aSopenharmony_ci    mova    [tmpq+ 4*%%str], m0
2581cabdff1aSopenharmony_ci    mova    [tmpq+ 8*%%str], m15
2582cabdff1aSopenharmony_ci    mova    [tmpq+12*%%str], m14
2583cabdff1aSopenharmony_ci    mova    [tmpq+16*%%str], m1
2584cabdff1aSopenharmony_ci    mova    [tmpq+20*%%str], m7
2585cabdff1aSopenharmony_ci    mova    [tmpq+24*%%str], m5
2586cabdff1aSopenharmony_ci    mova    [tmpq+28*%%str], m4
2587cabdff1aSopenharmony_ci
2588cabdff1aSopenharmony_ci    mova                  m8, [tmpq+ 3*%%str] ; t15
2589cabdff1aSopenharmony_ci    TRANSPOSE8x8W         10, 13, 3, 2, 11, 12, 9, 8, 0
2590cabdff1aSopenharmony_ci    mova    [tmpq+ 3*%%str], m10
2591cabdff1aSopenharmony_ci    mova    [tmpq+ 7*%%str], m13
2592cabdff1aSopenharmony_ci    mova    [tmpq+11*%%str], m3
2593cabdff1aSopenharmony_ci    mova    [tmpq+15*%%str], m2
2594cabdff1aSopenharmony_ci    mova    [tmpq+19*%%str], m11
2595cabdff1aSopenharmony_ci    mova    [tmpq+23*%%str], m12
2596cabdff1aSopenharmony_ci    mova    [tmpq+27*%%str], m9
2597cabdff1aSopenharmony_ci    mova    [tmpq+31*%%str], m8
2598cabdff1aSopenharmony_ci
2599cabdff1aSopenharmony_ci    mova                m15, [tmpq+30*%%str] ; t8
2600cabdff1aSopenharmony_ci    mova                m14, [tmpq+26*%%str] ; t9
2601cabdff1aSopenharmony_ci    mova                m13, [tmpq+22*%%str] ; t10
2602cabdff1aSopenharmony_ci    mova                m12, [tmpq+18*%%str] ; t11
2603cabdff1aSopenharmony_ci    mova                m11, [tmpq+14*%%str] ; t12
2604cabdff1aSopenharmony_ci    mova                m10, [tmpq+10*%%str] ; t13
2605cabdff1aSopenharmony_ci    mova                 m9, [tmpq+ 6*%%str] ; t14
2606cabdff1aSopenharmony_ci    mova                 m8, [tmpq+ 2*%%str] ; t15
2607cabdff1aSopenharmony_ci    mova                 m7, [tmpq+21*%%str] ; t16
2608cabdff1aSopenharmony_ci    mova                 m6, [tmpq+29*%%str] ; t17
2609cabdff1aSopenharmony_ci    mova                 m5, [tmpq+ 9*%%str] ; t18
2610cabdff1aSopenharmony_ci    mova                 m4, [tmpq+13*%%str] ; t19
2611cabdff1aSopenharmony_ci    mova                 m3, [tmpq+17*%%str] ; t20
2612cabdff1aSopenharmony_ci    mova                 m2, [tmpq+ 1*%%str] ; t21
2613cabdff1aSopenharmony_ci    mova                 m1, [tmpq+25*%%str] ; t22
2614cabdff1aSopenharmony_ci
2615cabdff1aSopenharmony_ci    SUMSUB_BA             w,  7,  8, 0
2616cabdff1aSopenharmony_ci    mova    [tmpq+ 2*%%str], m8
2617cabdff1aSopenharmony_ci    mova                 m0, [tmpq+ 5*%%str] ; t23
2618cabdff1aSopenharmony_ci    SUMSUB_BA             w,  6,  9, 8
2619cabdff1aSopenharmony_ci    SUMSUB_BA             w,  5, 10, 8
2620cabdff1aSopenharmony_ci    SUMSUB_BA             w,  4, 11, 8
2621cabdff1aSopenharmony_ci    SUMSUB_BA             w,  3, 12, 8
2622cabdff1aSopenharmony_ci    SUMSUB_BA             w,  2, 13, 8
2623cabdff1aSopenharmony_ci    SUMSUB_BA             w,  1, 14, 8
2624cabdff1aSopenharmony_ci    SUMSUB_BA             w,  0, 15, 8
2625cabdff1aSopenharmony_ci
2626cabdff1aSopenharmony_ci    TRANSPOSE8x8W         0, 1, 2, 3, 4, 5, 6, 7, 8
2627cabdff1aSopenharmony_ci    mova    [tmpq+ 1*%%str], m0
2628cabdff1aSopenharmony_ci    mova    [tmpq+ 5*%%str], m1
2629cabdff1aSopenharmony_ci    mova    [tmpq+ 9*%%str], m2
2630cabdff1aSopenharmony_ci    mova    [tmpq+13*%%str], m3
2631cabdff1aSopenharmony_ci    mova    [tmpq+17*%%str], m4
2632cabdff1aSopenharmony_ci    mova    [tmpq+21*%%str], m5
2633cabdff1aSopenharmony_ci    mova    [tmpq+25*%%str], m6
2634cabdff1aSopenharmony_ci    mova    [tmpq+29*%%str], m7
2635cabdff1aSopenharmony_ci
2636cabdff1aSopenharmony_ci    mova                 m8, [tmpq+ 2*%%str]
2637cabdff1aSopenharmony_ci    TRANSPOSE8x8W         8, 9, 10, 11, 12, 13, 14, 15, 0
2638cabdff1aSopenharmony_ci    mova    [tmpq+ 2*%%str], m8
2639cabdff1aSopenharmony_ci    mova    [tmpq+ 6*%%str], m9
2640cabdff1aSopenharmony_ci    mova    [tmpq+10*%%str], m10
2641cabdff1aSopenharmony_ci    mova    [tmpq+14*%%str], m11
2642cabdff1aSopenharmony_ci    mova    [tmpq+18*%%str], m12
2643cabdff1aSopenharmony_ci    mova    [tmpq+22*%%str], m13
2644cabdff1aSopenharmony_ci    mova    [tmpq+26*%%str], m14
2645cabdff1aSopenharmony_ci    mova    [tmpq+30*%%str], m15
2646cabdff1aSopenharmony_ci%endif ; avx2
2647cabdff1aSopenharmony_ci%else
2648cabdff1aSopenharmony_ci    mova                 m2, [tmpq+24*%%str] ; t6
2649cabdff1aSopenharmony_ci    mova                 m3, [tmpq+28*%%str] ; t7
2650cabdff1aSopenharmony_ci    SUMSUB_BADC           w,  5,  2,  4,  3
2651cabdff1aSopenharmony_ci    mova    [tmpq+24*%%str], m5
2652cabdff1aSopenharmony_ci    mova    [tmpq+23*%%str], m2
2653cabdff1aSopenharmony_ci    mova    [tmpq+28*%%str], m4
2654cabdff1aSopenharmony_ci    mova    [tmpq+19*%%str], m3
2655cabdff1aSopenharmony_ci
2656cabdff1aSopenharmony_ci    mova                 m2, [tmpq+16*%%str] ; t4
2657cabdff1aSopenharmony_ci    mova                 m3, [tmpq+20*%%str] ; t5
2658cabdff1aSopenharmony_ci    SUMSUB_BA             w,  1,  2,  5
2659cabdff1aSopenharmony_ci    SUMSUB_BA             w,  7,  3,  5
2660cabdff1aSopenharmony_ci    mova    [tmpq+15*%%str], m2
2661cabdff1aSopenharmony_ci    mova    [tmpq+11*%%str], m3
2662cabdff1aSopenharmony_ci
2663cabdff1aSopenharmony_ci    mova                 m2, [tmpq+ 0*%%str] ; t0
2664cabdff1aSopenharmony_ci    mova                 m3, [tmpq+ 4*%%str] ; t1
2665cabdff1aSopenharmony_ci    SUMSUB_BA             w,  6,  2,  5
2666cabdff1aSopenharmony_ci    SUMSUB_BA             w,  0,  3,  5
2667cabdff1aSopenharmony_ci    mova    [tmpq+31*%%str], m2
2668cabdff1aSopenharmony_ci    mova    [tmpq+27*%%str], m3
2669cabdff1aSopenharmony_ci
2670cabdff1aSopenharmony_ci    mova                 m2, [tmpq+ 8*%%str] ; t2
2671cabdff1aSopenharmony_ci    mova                 m3, [tmpq+12*%%str] ; t3
2672cabdff1aSopenharmony_ci    mova                 m5, [tmpq+ 7*%%str]
2673cabdff1aSopenharmony_ci    mova                 m4, [tmpq+ 3*%%str]
2674cabdff1aSopenharmony_ci    SUMSUB_BADC           w,  5,  2,  4,  3
2675cabdff1aSopenharmony_ci    mova    [tmpq+ 7*%%str], m2
2676cabdff1aSopenharmony_ci    mova    [tmpq+ 3*%%str], m3
2677cabdff1aSopenharmony_ci
2678cabdff1aSopenharmony_ci    mova                 m3, [tmpq+28*%%str]
2679cabdff1aSopenharmony_ci    TRANSPOSE8x8W         6, 0, 5, 4, 1, 7, 2, 3, [tmpq+24*%%str], [tmpq+16*%%str], 1
2680cabdff1aSopenharmony_ci    mova    [tmpq+ 0*%%str], m6
2681cabdff1aSopenharmony_ci    mova    [tmpq+ 4*%%str], m0
2682cabdff1aSopenharmony_ci    mova    [tmpq+ 8*%%str], m5
2683cabdff1aSopenharmony_ci    mova    [tmpq+12*%%str], m4
2684cabdff1aSopenharmony_ci    mova    [tmpq+20*%%str], m7
2685cabdff1aSopenharmony_ci    mova    [tmpq+24*%%str], m2
2686cabdff1aSopenharmony_ci    mova    [tmpq+28*%%str], m3
2687cabdff1aSopenharmony_ci
2688cabdff1aSopenharmony_ci    mova                 m6, [tmpq+19*%%str]
2689cabdff1aSopenharmony_ci    mova                 m0, [tmpq+23*%%str]
2690cabdff1aSopenharmony_ci    mova                 m5, [tmpq+11*%%str]
2691cabdff1aSopenharmony_ci    mova                 m4, [tmpq+15*%%str]
2692cabdff1aSopenharmony_ci    mova                 m1, [tmpq+ 3*%%str]
2693cabdff1aSopenharmony_ci    mova                 m7, [tmpq+ 7*%%str]
2694cabdff1aSopenharmony_ci    mova                 m3, [tmpq+31*%%str]
2695cabdff1aSopenharmony_ci    TRANSPOSE8x8W         6, 0, 5, 4, 1, 7, 2, 3, [tmpq+27*%%str], [tmpq+19*%%str], 1
2696cabdff1aSopenharmony_ci    mova    [tmpq+ 3*%%str], m6
2697cabdff1aSopenharmony_ci    mova    [tmpq+ 7*%%str], m0
2698cabdff1aSopenharmony_ci    mova    [tmpq+11*%%str], m5
2699cabdff1aSopenharmony_ci    mova    [tmpq+15*%%str], m4
2700cabdff1aSopenharmony_ci    mova    [tmpq+23*%%str], m7
2701cabdff1aSopenharmony_ci    mova    [tmpq+27*%%str], m2
2702cabdff1aSopenharmony_ci    mova    [tmpq+31*%%str], m3
2703cabdff1aSopenharmony_ci
2704cabdff1aSopenharmony_ci    mova                 m1, [tmpq+ 6*%%str] ; t14
2705cabdff1aSopenharmony_ci    mova                 m0, [tmpq+ 2*%%str] ; t15
2706cabdff1aSopenharmony_ci    mova                 m7, [tmpq+21*%%str] ; t16
2707cabdff1aSopenharmony_ci    mova                 m6, [tmpq+29*%%str] ; t17
2708cabdff1aSopenharmony_ci    SUMSUB_BA             w,  7,  0,  2
2709cabdff1aSopenharmony_ci    SUMSUB_BA             w,  6,  1,  2
2710cabdff1aSopenharmony_ci    mova    [tmpq+29*%%str], m7
2711cabdff1aSopenharmony_ci    mova    [tmpq+ 2*%%str], m0
2712cabdff1aSopenharmony_ci    mova    [tmpq+21*%%str], m6
2713cabdff1aSopenharmony_ci    mova    [tmpq+ 6*%%str], m1
2714cabdff1aSopenharmony_ci
2715cabdff1aSopenharmony_ci    mova                 m1, [tmpq+14*%%str] ; t12
2716cabdff1aSopenharmony_ci    mova                 m0, [tmpq+10*%%str] ; t13
2717cabdff1aSopenharmony_ci    mova                 m5, [tmpq+ 9*%%str] ; t18
2718cabdff1aSopenharmony_ci    mova                 m4, [tmpq+13*%%str] ; t19
2719cabdff1aSopenharmony_ci    SUMSUB_BA             w,  5,  0,  2
2720cabdff1aSopenharmony_ci    SUMSUB_BA             w,  4,  1,  2
2721cabdff1aSopenharmony_ci    mova     [tmpq+10*%%str], m0
2722cabdff1aSopenharmony_ci    mova     [tmpq+14*%%str], m1
2723cabdff1aSopenharmony_ci
2724cabdff1aSopenharmony_ci    mova                 m1, [tmpq+22*%%str] ; t10
2725cabdff1aSopenharmony_ci    mova                 m0, [tmpq+18*%%str] ; t11
2726cabdff1aSopenharmony_ci    mova                 m3, [tmpq+17*%%str] ; t20
2727cabdff1aSopenharmony_ci    mova                 m2, [tmpq+ 1*%%str] ; t21
2728cabdff1aSopenharmony_ci    SUMSUB_BA             w,  3,  0,  6
2729cabdff1aSopenharmony_ci    SUMSUB_BA             w,  2,  1,  6
2730cabdff1aSopenharmony_ci    mova     [tmpq+18*%%str], m0
2731cabdff1aSopenharmony_ci    mova     [tmpq+22*%%str], m1
2732cabdff1aSopenharmony_ci
2733cabdff1aSopenharmony_ci    mova                 m7, [tmpq+30*%%str] ; t8
2734cabdff1aSopenharmony_ci    mova                 m6, [tmpq+26*%%str] ; t9
2735cabdff1aSopenharmony_ci    mova                 m1, [tmpq+25*%%str] ; t22
2736cabdff1aSopenharmony_ci    mova                 m0, [tmpq+ 5*%%str] ; t23
2737cabdff1aSopenharmony_ci    SUMSUB_BADC           w,  1,  6,  0,  7
2738cabdff1aSopenharmony_ci    mova     [tmpq+26*%%str], m6
2739cabdff1aSopenharmony_ci    mova     [tmpq+30*%%str], m7
2740cabdff1aSopenharmony_ci
2741cabdff1aSopenharmony_ci    mova                 m7, [tmpq+29*%%str]
2742cabdff1aSopenharmony_ci    TRANSPOSE8x8W         0, 1, 2, 3, 4, 5, 6, 7, [tmpq+21*%%str], [tmpq+17*%%str], 1
2743cabdff1aSopenharmony_ci    mova    [tmpq+ 1*%%str], m0
2744cabdff1aSopenharmony_ci    mova    [tmpq+ 5*%%str], m1
2745cabdff1aSopenharmony_ci    mova    [tmpq+ 9*%%str], m2
2746cabdff1aSopenharmony_ci    mova    [tmpq+13*%%str], m3
2747cabdff1aSopenharmony_ci    mova    [tmpq+21*%%str], m5
2748cabdff1aSopenharmony_ci    mova    [tmpq+25*%%str], m6
2749cabdff1aSopenharmony_ci    mova    [tmpq+29*%%str], m7
2750cabdff1aSopenharmony_ci
2751cabdff1aSopenharmony_ci    mova                 m0, [tmpq+ 2*%%str]
2752cabdff1aSopenharmony_ci    mova                 m1, [tmpq+ 6*%%str]
2753cabdff1aSopenharmony_ci    mova                 m2, [tmpq+10*%%str]
2754cabdff1aSopenharmony_ci    mova                 m3, [tmpq+14*%%str]
2755cabdff1aSopenharmony_ci    mova                 m4, [tmpq+18*%%str]
2756cabdff1aSopenharmony_ci    mova                 m5, [tmpq+22*%%str]
2757cabdff1aSopenharmony_ci    mova                 m7, [tmpq+30*%%str]
2758cabdff1aSopenharmony_ci    TRANSPOSE8x8W         0, 1, 2, 3, 4, 5, 6, 7, [tmpq+26*%%str], [tmpq+18*%%str], 1
2759cabdff1aSopenharmony_ci    mova    [tmpq+ 2*%%str], m0
2760cabdff1aSopenharmony_ci    mova    [tmpq+ 6*%%str], m1
2761cabdff1aSopenharmony_ci    mova    [tmpq+10*%%str], m2
2762cabdff1aSopenharmony_ci    mova    [tmpq+14*%%str], m3
2763cabdff1aSopenharmony_ci    mova    [tmpq+22*%%str], m5
2764cabdff1aSopenharmony_ci    mova    [tmpq+26*%%str], m6
2765cabdff1aSopenharmony_ci    mova    [tmpq+30*%%str], m7
2766cabdff1aSopenharmony_ci%endif
2767cabdff1aSopenharmony_ci%else
2768cabdff1aSopenharmony_ci    ; t0-7 is in [tmpq+{0,4,8,12,16,20,24,28}*%%str]
2769cabdff1aSopenharmony_ci    ; t8-15 is in [tmpq+{2,6,10,14,18,22,26,30}*%%str]
2770cabdff1aSopenharmony_ci    ; t16-19 and t23 is in [tmpq+{1,5,9,13,29}*%%str]
2771cabdff1aSopenharmony_ci    ; t20-22 is in m4-6
2772cabdff1aSopenharmony_ci    ; t24-31 is in m8-15
2773cabdff1aSopenharmony_ci
2774cabdff1aSopenharmony_ci%if cpuflag(ssse3)
2775cabdff1aSopenharmony_ci%define ROUND_REG [pw_512]
2776cabdff1aSopenharmony_ci%else
2777cabdff1aSopenharmony_ci%define ROUND_REG [pw_32]
2778cabdff1aSopenharmony_ci%endif
2779cabdff1aSopenharmony_ci
; STORE_2X2: combine two register pairs with SUMSUB_BA and write the four
; results through VP9_IDCT8_WRITEx2, one pair via the default destination
; and the other pair via dst_endq.
;   src1-src4    - register indices of the inputs (overwritten by SUMSUB_BA)
;   tmp1, tmp2   - scratch register indices; tmp1 is also the SUMSUB_BA temp
;   zero         - register index handed to VP9_IDCT8_WRITEx2 as its zero reg
;   inc_dst_ptrs - optional, defaults to 1: when 1, dstq is advanced by
;                  stride2q and dst_endq is moved back by stride2q
; NOTE(review): SUMSUB_BA and VP9_IDCT8_WRITEx2 are defined outside this
; chunk; presumably SUMSUB_BA leaves the sum in its first register operand
; and the difference in its second, so the sums go to the rows at dstq and
; the differences to the rows at dst_endq - confirm against x86util.asm.
2780cabdff1aSopenharmony_ci%macro %%STORE_2X2 7-8 1 ; src[1-4], tmp[1-2], zero, inc_dst_ptrs
2781cabdff1aSopenharmony_ci    SUMSUB_BA            w, %4, %1, %5
2782cabdff1aSopenharmony_ci    SUMSUB_BA            w, %3, %2, %5
    ; first pair: src4/src3, rounded with ROUND_REG and a shift argument of 6
2783cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2   %4, %3, %5, %6, %7, ROUND_REG, 6
2784cabdff1aSopenharmony_ci%if %8 == 1
2785cabdff1aSopenharmony_ci    add               dstq, stride2q
2786cabdff1aSopenharmony_ci%endif
    ; second pair: src2/src1, written through dst_endq instead
2787cabdff1aSopenharmony_ci    VP9_IDCT8_WRITEx2   %2, %1, %5, %6, %7, ROUND_REG, 6, dst_endq
2788cabdff1aSopenharmony_ci%if %8 == 1
2789cabdff1aSopenharmony_ci    sub           dst_endq, stride2q
2790cabdff1aSopenharmony_ci%endif
2791cabdff1aSopenharmony_ci%endmacro
2792cabdff1aSopenharmony_ci
2793cabdff1aSopenharmony_ci%if ARCH_X86_64
2794cabdff1aSopenharmony_ci    pxor               m10, m10
2795cabdff1aSopenharmony_ci
2796cabdff1aSopenharmony_ci    ; store t0-1 and t30-31
2797cabdff1aSopenharmony_ci    mova                m8, [tmpq+ 0*%%str]
2798cabdff1aSopenharmony_ci    mova                m9, [tmpq+ 4*%%str]
2799cabdff1aSopenharmony_ci    %%STORE_2X2          8,  9,  0,  6, 12, 11, 10
2800cabdff1aSopenharmony_ci
2801cabdff1aSopenharmony_ci    ; store t2-3 and t28-29
2802cabdff1aSopenharmony_ci    mova                m8, [tmpq+ 8*%%str]
2803cabdff1aSopenharmony_ci    mova                m9, [tmpq+12*%%str]
2804cabdff1aSopenharmony_ci    %%STORE_2X2          8,  9, 14, 15, 12, 11, 10
2805cabdff1aSopenharmony_ci
2806cabdff1aSopenharmony_ci    ; store t4-5 and t26-27
2807cabdff1aSopenharmony_ci    mova                m8, [tmpq+16*%%str]
2808cabdff1aSopenharmony_ci    mova                m9, [tmpq+20*%%str]
2809cabdff1aSopenharmony_ci    %%STORE_2X2          8,  9,  7,  1, 12, 11, 10
2810cabdff1aSopenharmony_ci
2811cabdff1aSopenharmony_ci    ; store t6-7 and t24-25
2812cabdff1aSopenharmony_ci    mova                m8, [tmpq+24*%%str]
2813cabdff1aSopenharmony_ci    mova                m9, [tmpq+28*%%str]
2814cabdff1aSopenharmony_ci    %%STORE_2X2          8,  9,  4,  5, 12, 11, 10
2815cabdff1aSopenharmony_ci
2816cabdff1aSopenharmony_ci    ; store t8-9 and t22-23
2817cabdff1aSopenharmony_ci    mova                m8, [tmpq+30*%%str]
2818cabdff1aSopenharmony_ci    mova                m9, [tmpq+26*%%str]
2819cabdff1aSopenharmony_ci    mova                m0, [tmpq+ 5*%%str]
2820cabdff1aSopenharmony_ci    %%STORE_2X2          8,  9, 13,  0, 12, 11, 10
2821cabdff1aSopenharmony_ci
2822cabdff1aSopenharmony_ci    ; store t10-11 and t20-21
2823cabdff1aSopenharmony_ci    mova                m8, [tmpq+22*%%str]
2824cabdff1aSopenharmony_ci    mova                m9, [tmpq+18*%%str]
2825cabdff1aSopenharmony_ci    %%STORE_2X2          8,  9,  2,  3, 12, 11, 10
2826cabdff1aSopenharmony_ci
2827cabdff1aSopenharmony_ci    ; store t12-13 and t18-19
2828cabdff1aSopenharmony_ci    mova                m8, [tmpq+14*%%str]
2829cabdff1aSopenharmony_ci    mova                m9, [tmpq+10*%%str]
2830cabdff1aSopenharmony_ci    mova                m5, [tmpq+13*%%str]
2831cabdff1aSopenharmony_ci    mova                m4, [tmpq+ 9*%%str]
2832cabdff1aSopenharmony_ci    %%STORE_2X2          8,  9,  4,  5, 12, 11, 10
2833cabdff1aSopenharmony_ci
2834cabdff1aSopenharmony_ci    ; store t14-17
2835cabdff1aSopenharmony_ci    mova                m8, [tmpq+ 6*%%str]
2836cabdff1aSopenharmony_ci    mova                m9, [tmpq+ 2*%%str]
2837cabdff1aSopenharmony_ci    mova                m5, [tmpq+29*%%str]
2838cabdff1aSopenharmony_ci    mova                m4, [tmpq+21*%%str]
2839cabdff1aSopenharmony_ci    %%STORE_2X2          8,  9,  4,  5, 12, 11, 10, 0
2840cabdff1aSopenharmony_ci
2841cabdff1aSopenharmony_ci    SWAP                 1, 10 ; zero
2842cabdff1aSopenharmony_ci%else
2843cabdff1aSopenharmony_ci    mova   [tmpq+ 1*%%str], m1
2844cabdff1aSopenharmony_ci    mova   [tmpq+11*%%str], m2
2845cabdff1aSopenharmony_ci    mova   [tmpq+15*%%str], m3
2846cabdff1aSopenharmony_ci    mova   [tmpq+17*%%str], m4
2847cabdff1aSopenharmony_ci    mova   [tmpq+19*%%str], m5
2848cabdff1aSopenharmony_ci    pxor                m1, m1
2849cabdff1aSopenharmony_ci
2850cabdff1aSopenharmony_ci    ; store t0-1 and t30-31
2851cabdff1aSopenharmony_ci    mova                m2, [tmpq+ 0*%%str]
2852cabdff1aSopenharmony_ci    mova                m3, [tmpq+ 4*%%str]
2853cabdff1aSopenharmony_ci    %%STORE_2X2          2,  3,  0,  6, 4, 5, 1
2854cabdff1aSopenharmony_ci
2855cabdff1aSopenharmony_ci    ; store t2-3 and t28-29
2856cabdff1aSopenharmony_ci    mova                m2, [tmpq+ 8*%%str]
2857cabdff1aSopenharmony_ci    mova                m3, [tmpq+12*%%str]
2858cabdff1aSopenharmony_ci    mova                m0, [tmpq+ 3*%%str]
2859cabdff1aSopenharmony_ci    mova                m6, [tmpq+ 7*%%str]
2860cabdff1aSopenharmony_ci    %%STORE_2X2          2,  3,  0,  6, 4, 5, 1
2861cabdff1aSopenharmony_ci
2862cabdff1aSopenharmony_ci    ; store t4-5 and t26-27
2863cabdff1aSopenharmony_ci    mova                m2, [tmpq+16*%%str]
2864cabdff1aSopenharmony_ci    mova                m3, [tmpq+20*%%str]
2865cabdff1aSopenharmony_ci    mova                m0, [tmpq+ 1*%%str]
2866cabdff1aSopenharmony_ci    %%STORE_2X2          2,  3,  7,  0, 4, 5, 1
2867cabdff1aSopenharmony_ci
2868cabdff1aSopenharmony_ci    ; store t6-7 and t24-25
2869cabdff1aSopenharmony_ci    mova                m2, [tmpq+24*%%str]
2870cabdff1aSopenharmony_ci    mova                m3, [tmpq+28*%%str]
2871cabdff1aSopenharmony_ci    mova                m0, [tmpq+17*%%str]
2872cabdff1aSopenharmony_ci    mova                m6, [tmpq+19*%%str]
2873cabdff1aSopenharmony_ci    %%STORE_2X2          2,  3,  0,  6, 4, 5, 1
2874cabdff1aSopenharmony_ci
2875cabdff1aSopenharmony_ci    ; store t8-9 and t22-23
2876cabdff1aSopenharmony_ci    mova                m2, [tmpq+30*%%str]
2877cabdff1aSopenharmony_ci    mova                m3, [tmpq+26*%%str]
2878cabdff1aSopenharmony_ci    mova                m0, [tmpq+25*%%str]
2879cabdff1aSopenharmony_ci    mova                m6, [tmpq+ 5*%%str]
2880cabdff1aSopenharmony_ci    %%STORE_2X2          2,  3,  0,  6, 4, 5, 1
2881cabdff1aSopenharmony_ci
2882cabdff1aSopenharmony_ci    ; store t10-11 and t20-21
2883cabdff1aSopenharmony_ci    mova                m2, [tmpq+22*%%str]
2884cabdff1aSopenharmony_ci    mova                m3, [tmpq+18*%%str]
2885cabdff1aSopenharmony_ci    mova                m0, [tmpq+11*%%str]
2886cabdff1aSopenharmony_ci    mova                m6, [tmpq+15*%%str]
2887cabdff1aSopenharmony_ci    %%STORE_2X2          2,  3,  0,  6, 4, 5, 1
2888cabdff1aSopenharmony_ci
2889cabdff1aSopenharmony_ci    ; store t12-13 and t18-19
2890cabdff1aSopenharmony_ci    mova                m2, [tmpq+14*%%str]
2891cabdff1aSopenharmony_ci    mova                m3, [tmpq+10*%%str]
2892cabdff1aSopenharmony_ci    mova                m6, [tmpq+13*%%str]
2893cabdff1aSopenharmony_ci    mova                m0, [tmpq+ 9*%%str]
2894cabdff1aSopenharmony_ci    %%STORE_2X2          2,  3,  0,  6, 4, 5, 1
2895cabdff1aSopenharmony_ci
2896cabdff1aSopenharmony_ci    ; store t14-17
2897cabdff1aSopenharmony_ci    mova                m2, [tmpq+ 6*%%str]
2898cabdff1aSopenharmony_ci    mova                m3, [tmpq+ 2*%%str]
2899cabdff1aSopenharmony_ci    mova                m6, [tmpq+29*%%str]
2900cabdff1aSopenharmony_ci    mova                m0, [tmpq+21*%%str]
2901cabdff1aSopenharmony_ci    %%STORE_2X2          2,  3,  0,  6, 4, 5, 1, 0
2902cabdff1aSopenharmony_ci%endif
2903cabdff1aSopenharmony_ci%undef ROUND_REG
2904cabdff1aSopenharmony_ci%endif
2905cabdff1aSopenharmony_ci%endmacro
2906cabdff1aSopenharmony_ci
;------------------------------------------------------------------------------
; VP9_IDCT_IDCT_32x32_ADD_XMM cpu
;
; Instantiates vp9_idct_idct_32x32_add (args: dst, stride, block, eob) for
; the given XMM instruction set. The function dispatches on eob:
;   builds with the ssse3 cpuflag: eob > 135 -> .idctfull
;                                  eob >  34 -> .idct16x16
;                                  eob >   1 -> .idct8x8
;                                  otherwise -> dc-only
;   builds without it (sse2):      eob >   1 -> .idctfull
;                                  otherwise -> dc-only
; The non-dc paths invoke VP9_IDCT32_1D twice: pass 1 reads from block and
; fills the 2048-byte stack buffer, pass 2 reads that buffer and writes
; through dst/dst_end. ZERO_BLOCK is then run on block before returning.
; NOTE(review): the element types of dst and block are not declared here
; (block[0] is read as a signed 16-bit word in the sse2 dc path); confirm
; against the C prototype.
;------------------------------------------------------------------------------
2907cabdff1aSopenharmony_ci%macro VP9_IDCT_IDCT_32x32_ADD_XMM 1
2908cabdff1aSopenharmony_ciINIT_XMM %1
; cglobal operands: 0 arguments auto-loaded, 6 GPRs (9 on x86-64), 16 vector
; registers requested, 2048 bytes of stack (tmpq is pointed at rsp below)
2909cabdff1aSopenharmony_cicglobal vp9_idct_idct_32x32_add, 0, 6 + ARCH_X86_64 * 3, 16, 2048, dst, stride, block, eob
2910cabdff1aSopenharmony_ci    movifnidn         eobd, dword eobm
    ; dispatch on eob; if no branch is taken we fall through to the dc-only case
2911cabdff1aSopenharmony_ci%if cpuflag(ssse3)
2912cabdff1aSopenharmony_ci    cmp eobd, 135
2913cabdff1aSopenharmony_ci    jg .idctfull
2914cabdff1aSopenharmony_ci    cmp eobd, 34
2915cabdff1aSopenharmony_ci    jg .idct16x16
2916cabdff1aSopenharmony_ci    cmp eobd, 1
2917cabdff1aSopenharmony_ci    jg .idct8x8
2918cabdff1aSopenharmony_ci%else
2919cabdff1aSopenharmony_ci    cmp eobd, 1
2920cabdff1aSopenharmony_ci    jg .idctfull
2921cabdff1aSopenharmony_ci%endif
2922cabdff1aSopenharmony_ci
2923cabdff1aSopenharmony_ci    ; dc-only case
2924cabdff1aSopenharmony_ci    movifnidn       blockq, blockmp
2925cabdff1aSopenharmony_ci    movifnidn         dstq, dstmp
2926cabdff1aSopenharmony_ci    movifnidn      strideq, stridemp
2927cabdff1aSopenharmony_ci%if cpuflag(ssse3)
    ; scale block[0] twice with pmulhrsw
    ; NOTE(review): pw_11585x2 presumably holds 2*11585, which would make
    ; each pmulhrsw the same rounded (x*11585 + 8192) >> 14 step as the
    ; scalar branch below - confirm against the constant's definition
2928cabdff1aSopenharmony_ci    movd                m0, [blockq]
2929cabdff1aSopenharmony_ci    mova                m1, [pw_11585x2]
2930cabdff1aSopenharmony_ci    pmulhrsw            m0, m1
2931cabdff1aSopenharmony_ci    pmulhrsw            m0, m1
2932cabdff1aSopenharmony_ci%else
    ; scalar version: the fourth register (eob so far) is renamed to coef
2933cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, block, coef
2934cabdff1aSopenharmony_ci    movsx            coefd, word [blockq]
    ; first step: (coef * 11585 + 8192) >> 14
2935cabdff1aSopenharmony_ci    imul             coefd, 11585
2936cabdff1aSopenharmony_ci    add              coefd, 8192
2937cabdff1aSopenharmony_ci    sar              coefd, 14
    ; second step with the final (+32) >> 6 rounding folded into the same
    ; add and shift
2938cabdff1aSopenharmony_ci    imul             coefd, 11585
2939cabdff1aSopenharmony_ci    add              coefd, (32 << 14) + 8192
2940cabdff1aSopenharmony_ci    sar              coefd, 14 + 6
2941cabdff1aSopenharmony_ci    movd                m0, coefd
2942cabdff1aSopenharmony_ci%endif
    ; replicate the low word of m0 across the register
2943cabdff1aSopenharmony_ci    SPLATW              m0, m0, q0000
2944cabdff1aSopenharmony_ci%if cpuflag(ssse3)
    ; NOTE(review): with pw_512 == 512 this is (x + 32) >> 6, i.e. the
    ; rounding the scalar branch already applied - confirm the constant
2945cabdff1aSopenharmony_ci    pmulhrsw            m0, [pw_512]
2946cabdff1aSopenharmony_ci%endif
    ; m5 = 0: stored over the first 32 bits of block (which include the dc
    ; coefficient) and passed as the sixth operand of VP9_STORE_2XFULL
2947cabdff1aSopenharmony_ci    pxor                m5, m5
2948cabdff1aSopenharmony_ci    movd          [blockq], m5
    ; 31 + 1 = 32 invocations of VP9_STORE_2XFULL, advancing dstq by one
    ; stride between them
2949cabdff1aSopenharmony_ci%rep 31
2950cabdff1aSopenharmony_ci    VP9_STORE_2XFULL    0, 1, 2, 3, 4, 5, mmsize
2951cabdff1aSopenharmony_ci    add               dstq, strideq
2952cabdff1aSopenharmony_ci%endrep
2953cabdff1aSopenharmony_ci    VP9_STORE_2XFULL    0, 1, 2, 3, 4, 5, mmsize
2954cabdff1aSopenharmony_ci    RET
2955cabdff1aSopenharmony_ci
    ; register naming for the non-dc paths: on x86-64 the first register is
    ; renamed dst_bak and dst becomes a separate working register; on
    ; x86-32 dst_bakq is the stack slot of the first argument
2956cabdff1aSopenharmony_ci%if ARCH_X86_64
2957cabdff1aSopenharmony_ci    DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2, tmp
2958cabdff1aSopenharmony_ci%else
2959cabdff1aSopenharmony_ci%define dst_bakq r0mp
2960cabdff1aSopenharmony_ci%endif
2961cabdff1aSopenharmony_ci%if cpuflag(ssse3)
2962cabdff1aSopenharmony_ci.idct8x8:
2963cabdff1aSopenharmony_ci%if ARCH_X86_32
    ; x86-32: u1-u4 are placeholder names so that tmp lands in the sixth
    ; register; block is loaded from the stack slot of the third argument
2964cabdff1aSopenharmony_ci    DEFINE_ARGS block, u1, u2, u3, u4, tmp
2965cabdff1aSopenharmony_ci    mov             blockq, r2mp
2966cabdff1aSopenharmony_ci%endif
    ; pass 1 is invoked once, with 8 as its third operand, writing to the
    ; stack buffer
2967cabdff1aSopenharmony_ci    mov               tmpq, rsp
2968cabdff1aSopenharmony_ci    VP9_IDCT32_1D   blockq, 1, 8
2969cabdff1aSopenharmony_ci
2970cabdff1aSopenharmony_ci%if ARCH_X86_32
    ; x86-32: rename registers for pass 2 (tmp is again the sixth name, so
    ; it stays in the same register); the loop counter lives in the stack
    ; slot of the fourth argument
2971cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
2972cabdff1aSopenharmony_ci    mov            strideq, r1mp
2973cabdff1aSopenharmony_ci%define cntd dword r3m
2974cabdff1aSopenharmony_ci%endif
    ; stride30 = stride*32 - stride*2; cnt = 4 pass-2 iterations
    ; tmpq is not reloaded here: it still equals rsp from pass 1
2975cabdff1aSopenharmony_ci    mov          stride30q, strideq         ; stride
2976cabdff1aSopenharmony_ci    lea           stride2q, [strideq*2]     ; stride*2
2977cabdff1aSopenharmony_ci    shl          stride30q, 5               ; stride*32
2978cabdff1aSopenharmony_ci    mov               cntd, 4
2979cabdff1aSopenharmony_ci    sub          stride30q, stride2q        ; stride*30
2980cabdff1aSopenharmony_ci.loop2_8x8:
    ; dst = current column base, dst_end = dst + 30 * stride; each
    ; iteration then advances dst_bak by 8 bytes and tmp by 16 bytes
2981cabdff1aSopenharmony_ci    mov               dstq, dst_bakq
2982cabdff1aSopenharmony_ci    lea           dst_endq, [dstq+stride30q]
2983cabdff1aSopenharmony_ci    VP9_IDCT32_1D     tmpq, 2, 8
2984cabdff1aSopenharmony_ci    add           dst_bakq, 8
2985cabdff1aSopenharmony_ci    add               tmpq, 16
2986cabdff1aSopenharmony_ci    dec               cntd
2987cabdff1aSopenharmony_ci    jg .loop2_8x8
2988cabdff1aSopenharmony_ci
2989cabdff1aSopenharmony_ci    ; at the end of the loop, m1 should still be zero
2990cabdff1aSopenharmony_ci    ; use that to zero out block coefficients
    ; (m1 is the register handed to ZERO_BLOCK below; the pass-2 store code
    ; ends with its zero register named m1)
2991cabdff1aSopenharmony_ci%if ARCH_X86_32
2992cabdff1aSopenharmony_ci    DEFINE_ARGS block
2993cabdff1aSopenharmony_ci    mov             blockq, r2mp
2994cabdff1aSopenharmony_ci%endif
    ; NOTE(review): ZERO_BLOCK is defined elsewhere; presumably the operands
    ; are pointer, row pitch in bytes, row count, zero register - confirm
2995cabdff1aSopenharmony_ci    ZERO_BLOCK      blockq, 64,  8, m1
2996cabdff1aSopenharmony_ci    RET
2997cabdff1aSopenharmony_ci
2998cabdff1aSopenharmony_ci.idct16x16:
2999cabdff1aSopenharmony_ci%if ARCH_X86_32
3000cabdff1aSopenharmony_ci    DEFINE_ARGS block, tmp, cnt
3001cabdff1aSopenharmony_ci    mov             blockq, r2mp
3002cabdff1aSopenharmony_ci%endif
    ; pass 1: 2 iterations, each advancing block by 16 bytes and tmp by
    ; 512 bytes
3003cabdff1aSopenharmony_ci    mov               cntd, 2
3004cabdff1aSopenharmony_ci    mov               tmpq, rsp
3005cabdff1aSopenharmony_ci.loop1_16x16:
3006cabdff1aSopenharmony_ci    VP9_IDCT32_1D   blockq, 1, 16
3007cabdff1aSopenharmony_ci    add             blockq, 16
3008cabdff1aSopenharmony_ci    add               tmpq, 512
3009cabdff1aSopenharmony_ci    dec               cntd
3010cabdff1aSopenharmony_ci    jg .loop1_16x16
3011cabdff1aSopenharmony_ci
3012cabdff1aSopenharmony_ci%if ARCH_X86_64
    ; undo the 2 * 16 byte advance so blockq points at the block start
    ; again (x86-32 reloads it from the stack before ZERO_BLOCK instead)
3013cabdff1aSopenharmony_ci    sub             blockq, 32
3014cabdff1aSopenharmony_ci%else
3015cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
3016cabdff1aSopenharmony_ci    mov            strideq, r1mp
3017cabdff1aSopenharmony_ci%define cntd dword r3m
3018cabdff1aSopenharmony_ci%endif
3019cabdff1aSopenharmony_ci
    ; pass-2 setup as in .idct8x8, plus resetting tmp to the buffer start
3020cabdff1aSopenharmony_ci    mov          stride30q, strideq         ; stride
3021cabdff1aSopenharmony_ci    lea           stride2q, [strideq*2]     ; stride*2
3022cabdff1aSopenharmony_ci    shl          stride30q, 5               ; stride*32
3023cabdff1aSopenharmony_ci    mov               cntd, 4
3024cabdff1aSopenharmony_ci    mov               tmpq, rsp
3025cabdff1aSopenharmony_ci    sub          stride30q, stride2q        ; stride*30
3026cabdff1aSopenharmony_ci.loop2_16x16:
3027cabdff1aSopenharmony_ci    mov               dstq, dst_bakq
3028cabdff1aSopenharmony_ci    lea           dst_endq, [dstq+stride30q]
3029cabdff1aSopenharmony_ci    VP9_IDCT32_1D     tmpq, 2, 16
3030cabdff1aSopenharmony_ci    add           dst_bakq, 8
3031cabdff1aSopenharmony_ci    add               tmpq, 16
3032cabdff1aSopenharmony_ci    dec               cntd
3033cabdff1aSopenharmony_ci    jg .loop2_16x16
3034cabdff1aSopenharmony_ci
3035cabdff1aSopenharmony_ci    ; at the end of the loop, m1 should still be zero
3036cabdff1aSopenharmony_ci    ; use that to zero out block coefficients
    ; (m1 is the register handed to ZERO_BLOCK below)
3037cabdff1aSopenharmony_ci%if ARCH_X86_32
3038cabdff1aSopenharmony_ci    DEFINE_ARGS block
3039cabdff1aSopenharmony_ci    mov             blockq, r2mp
3040cabdff1aSopenharmony_ci%endif
3041cabdff1aSopenharmony_ci    ZERO_BLOCK      blockq, 64, 16, m1
3042cabdff1aSopenharmony_ci    RET
3043cabdff1aSopenharmony_ci%endif
3044cabdff1aSopenharmony_ci
3045cabdff1aSopenharmony_ci.idctfull:
3046cabdff1aSopenharmony_ci%if ARCH_X86_32
3047cabdff1aSopenharmony_ci    DEFINE_ARGS block, tmp, cnt
3048cabdff1aSopenharmony_ci    mov             blockq, r2mp
3049cabdff1aSopenharmony_ci%endif
    ; pass 1: 4 iterations (no size operand given to VP9_IDCT32_1D), each
    ; advancing block by 16 bytes and tmp by 512 bytes; 4 * 512 fills the
    ; whole 2048-byte stack buffer
3050cabdff1aSopenharmony_ci    mov               cntd, 4
3051cabdff1aSopenharmony_ci    mov               tmpq, rsp
3052cabdff1aSopenharmony_ci.loop1_full:
3053cabdff1aSopenharmony_ci    VP9_IDCT32_1D   blockq, 1
3054cabdff1aSopenharmony_ci    add             blockq, 16
3055cabdff1aSopenharmony_ci    add               tmpq, 512
3056cabdff1aSopenharmony_ci    dec               cntd
3057cabdff1aSopenharmony_ci    jg .loop1_full
3058cabdff1aSopenharmony_ci
3059cabdff1aSopenharmony_ci%if ARCH_X86_64
    ; undo the 4 * 16 byte advance so blockq points at the block start again
3060cabdff1aSopenharmony_ci    sub             blockq, 64
3061cabdff1aSopenharmony_ci%else
3062cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
3063cabdff1aSopenharmony_ci    mov            strideq, r1mp
3064cabdff1aSopenharmony_ci%define cntd dword r3m
3065cabdff1aSopenharmony_ci%endif
3066cabdff1aSopenharmony_ci
3067cabdff1aSopenharmony_ci    mov          stride30q, strideq         ; stride
3068cabdff1aSopenharmony_ci    lea           stride2q, [strideq*2]     ; stride*2
3069cabdff1aSopenharmony_ci    shl          stride30q, 5               ; stride*32
3070cabdff1aSopenharmony_ci    mov               cntd, 4
3071cabdff1aSopenharmony_ci    mov               tmpq, rsp
3072cabdff1aSopenharmony_ci    sub          stride30q, stride2q        ; stride*30
3073cabdff1aSopenharmony_ci.loop2_full:
3074cabdff1aSopenharmony_ci    mov               dstq, dst_bakq
3075cabdff1aSopenharmony_ci    lea           dst_endq, [dstq+stride30q]
3076cabdff1aSopenharmony_ci    VP9_IDCT32_1D     tmpq, 2
3077cabdff1aSopenharmony_ci    add           dst_bakq, 8
3078cabdff1aSopenharmony_ci    add               tmpq, 16
3079cabdff1aSopenharmony_ci    dec               cntd
3080cabdff1aSopenharmony_ci    jg .loop2_full
3081cabdff1aSopenharmony_ci
3082cabdff1aSopenharmony_ci    ; at the end of the loop, m1 should still be zero
3083cabdff1aSopenharmony_ci    ; use that to zero out block coefficients
    ; (m1 is the register handed to ZERO_BLOCK below)
3084cabdff1aSopenharmony_ci%if ARCH_X86_32
3085cabdff1aSopenharmony_ci    DEFINE_ARGS block
3086cabdff1aSopenharmony_ci    mov             blockq, r2mp
3087cabdff1aSopenharmony_ci%endif
3088cabdff1aSopenharmony_ci    ZERO_BLOCK      blockq, 64, 32, m1
3089cabdff1aSopenharmony_ci    RET
3090cabdff1aSopenharmony_ci%endmacro
3091cabdff1aSopenharmony_ci
; emit one vp9_idct_idct_32x32_add function per XMM instruction set; the
; sse2 build only has the dc-only and full paths, the other two also get
; the .idct8x8 and .idct16x16 shortcuts (selected by cpuflag(ssse3))
3092cabdff1aSopenharmony_ciVP9_IDCT_IDCT_32x32_ADD_XMM sse2
3093cabdff1aSopenharmony_ciVP9_IDCT_IDCT_32x32_ADD_XMM ssse3
3094cabdff1aSopenharmony_ciVP9_IDCT_IDCT_32x32_ADD_XMM avx
3095cabdff1aSopenharmony_ci
3096cabdff1aSopenharmony_ci; this is almost identical to VP9_STORE_2X, but it does two rows
3097cabdff1aSopenharmony_ci; for slightly improved interleaving, and it omits vpermq since the
3098cabdff1aSopenharmony_ci; input is DC so all values are identical
;
; Adds the words of m<reg> to two rows of bytes, at [dstq] and
; [dstq+strideq], and writes both rows back with unsigned saturation.
;   reg       - register index holding the words to add
;   tmp1-tmp4 - scratch register indices (all four are overwritten)
;   zero      - register index used as the second unpack operand to widen
;               bytes to words; the caller is expected to have zeroed it
; dstq and strideq are referenced by name and are not modified. Both rows
; are accessed with mova.
3099cabdff1aSopenharmony_ci%macro VP9_STORE_YMM_DC_2X2 6 ; reg, tmp1, tmp2, tmp3, tmp4, zero
    ; load row 0 into tmp1 and row 1 into tmp3
3100cabdff1aSopenharmony_ci    mova               m%2, [dstq]
3101cabdff1aSopenharmony_ci    mova               m%4, [dstq+strideq]
    ; widen bytes to words: high halves go to tmp2/tmp4, low halves stay in
    ; tmp1/tmp3
3102cabdff1aSopenharmony_ci    punpckhbw          m%3, m%2, m%6
3103cabdff1aSopenharmony_ci    punpcklbw          m%2, m%6
3104cabdff1aSopenharmony_ci    punpckhbw          m%5, m%4, m%6
3105cabdff1aSopenharmony_ci    punpcklbw          m%4, m%6
    ; add the value words to all four halves
3106cabdff1aSopenharmony_ci    paddw              m%3, m%1
3107cabdff1aSopenharmony_ci    paddw              m%2, m%1
3108cabdff1aSopenharmony_ci    paddw              m%5, m%1
3109cabdff1aSopenharmony_ci    paddw              m%4, m%1
    ; narrow back to bytes with unsigned saturation, low half first, so the
    ; bytes return to the positions they were loaded from
3110cabdff1aSopenharmony_ci    packuswb           m%2, m%3
3111cabdff1aSopenharmony_ci    packuswb           m%4, m%5
    ; store both rows back
3112cabdff1aSopenharmony_ci    mova  [dstq+strideq*0], m%2
3113cabdff1aSopenharmony_ci    mova  [dstq+strideq*1], m%4
3114cabdff1aSopenharmony_ci%endmacro
3115cabdff1aSopenharmony_ci
%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
;------------------------------------------------------------------------------
; vp9_idct_idct_32x32_add, AVX2 (ymm) version, x86-64 only
;
; cglobal declaration: 4 arguments (dst, stride, block, eob), 9 general
; purpose registers, 16 vector registers, 2048 bytes of stack scratch.
;
; eob selects one of three code paths:
;   eob > 135      -> .idctfull   (two first-pass calls, two second-pass calls)
;   1 < eob <= 135 -> .idct16x16  (one first-pass call, VP9_IDCT32_1D invoked
;                                  with the extra "16" argument)
;   eob <= 1       -> DC only     (one value added to all 32x32 pixels)
;
; Every path clears the coefficients it consumed before returning: the DC
; path with a single 32-bit store, the other two through ZERO_BLOCK.
;------------------------------------------------------------------------------
INIT_YMM avx2
cglobal vp9_idct_idct_32x32_add, 4, 9, 16, 2048, dst, stride, block, eob
    cmp eobd, 135
    jg .idctfull
    cmp eobd, 1
    jg .idct16x16

    ; dc-only case
    ; broadcast the first coefficient and scale it twice by pw_11585x2
    ; (presumably once per 1-D pass), then by pw_512 (a rounded >>6 if the
    ; constant holds 512, as its name suggests)
    mova                m1, [pw_11585x2]
    vpbroadcastw        m0, [blockq]
    pmulhrsw            m0, m1
    pmulhrsw            m0, m1
    pxor                m5, m5              ; m5 = 0: zero for the store macro
    pmulhrsw            m0, [pw_512]
    movd          [blockq], xm5             ; clear the first 32 bits of block

    ; block/eob are dead from here on; the third register becomes the counter
    DEFINE_ARGS dst, stride, cnt
    mov               cntd, 16              ; 16 iterations * 2 rows = 32 rows
.loop_dc:
    VP9_STORE_YMM_DC_2X2 0, 1, 2, 3, 4, 5
    lea               dstq, [dstq+2*strideq]
    dec               cntd
    jg .loop_dc
    RET

    ; DEFINE_ARGS only renames registers at assembly time, so it applies by
    ; textual position to everything below even though it follows a RET:
    ; the dst argument is now dst_bak and the eob register is reused as cnt
    DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2, tmp
.idct16x16:
    ; first pass: a single call, with tmpq pointing at the stack scratch
    mov               tmpq, rsp
    VP9_IDCT32_1D   blockq, 1, 16

    mov          stride30q, strideq         ; stride
    lea           stride2q, [strideq*2]     ; stride*2
    shl          stride30q, 5               ; stride*32
    mov               cntd, 2
    sub          stride30q, stride2q        ; stride*30
    ; second pass: two calls, advancing dst by 16 bytes and tmp by 32 bytes
.loop2_16x16:
    mov               dstq, dst_bakq
    lea           dst_endq, [dstq+stride30q]
    VP9_IDCT32_1D     tmpq, 2, 16
    add           dst_bakq, 16
    add               tmpq, 32
    dec               cntd
    jg .loop2_16x16

    ; at the end of the loop, m1 should still be zero
    ; use that to zero out block coefficients
    ZERO_BLOCK      blockq, 64, 16, m1
    RET

.idctfull:
    mov               cntd, 2
    mov               tmpq, rsp
    ; first pass: two calls, advancing block by 32 bytes and tmp by 1024 bytes
.loop1_full:
    VP9_IDCT32_1D   blockq, 1
    add             blockq, 32
    add               tmpq, 1024
    dec               cntd
    jg .loop1_full

    sub             blockq, 64              ; undo the 2*32 bytes added above

    mov          stride30q, strideq         ; stride
    lea           stride2q, [strideq*2]     ; stride*2
    shl          stride30q, 5               ; stride*32
    mov               cntd, 2
    mov               tmpq, rsp             ; rewind tmp to the scratch start
    sub          stride30q, stride2q        ; stride*30
    ; second pass: two calls, advancing dst by 16 bytes and tmp by 32 bytes
.loop2_full:
    mov               dstq, dst_bakq
    lea           dst_endq, [dstq+stride30q]
    VP9_IDCT32_1D     tmpq, 2
    add           dst_bakq, 16
    add               tmpq, 32
    dec               cntd
    jg .loop2_full

    ; at the end of the loop, m1 should still be zero
    ; use that to zero out block coefficients
    ZERO_BLOCK      blockq, 64, 32, m1
    RET
%endif
3198