;*******************************************************************************
;* SIMD-optimized IDCT functions for HEVC decoding
;* Copyright (c) 2014 Pierre-Edouard LEPERE
;* Copyright (c) 2014 James Almer
;* Copyright (c) 2016 Alexandra Hájková
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Rounding biases, four dwords each (16 bytes). The label encodes the value:
; TR_4x4 and DEFINE_BIAS build the name as pd_<1 << (shift - 1)>, so a table
; must exist here for every shift that is used with rounding.
pd_64:   times 4 dd 64      ; shift  7
pd_2048: times 4 dd 2048    ; shift 12
pd_512:  times 4 dd 512     ; shift 10

; 4x4 transform coeffs
; Every pw_* entry is one word pair repeated four times (16 bytes), laid out
; as a pmaddwd operand for two interleaved input rows.
cextern pw_64
pw_64_m64: times 4 dw 64, -64
pw_83_36:  times 4 dw 83, 36
pw_36_m83: times 4 dw 36, -83

; 8x8 transform coeffs
; Used in pairs by TR_8x4: the first constant of each pair multiplies m4,
; the second multiplies m5.
pw_89_75: times 4 dw 89, 75
pw_50_18: times 4 dw 50, 18

pw_75_m18:  times 4 dw 75, -18
pw_m89_m50: times 4 dw -89, -50

pw_50_m89: times 4 dw 50, -89
pw_18_75:  times 4 dw 18, 75

pw_18_m50: times 4 dw 18, -50
pw_75_m89: times 4 dw 75, -89

; 16x16 transformation coeffs
; Eight groups of four 16-byte rows (64 bytes per group). TR_16x4 passes
; trans_coeffs16 + k * 64 to E16_O16, which multiplies the four rows of the
; group with m0, m1, m2 and m3 respectively; group k yields output rows
; k and 15 - k.

; group 0
trans_coeffs16: times 4 dw 90, 87
times 4 dw 80, 70
times 4 dw 57, 43
times 4 dw 25, 9

; group 1
times 4 dw 87, 57
times 4 dw 9, -43
times 4 dw -80, -90
times 4 dw -70, -25

; group 2
times 4 dw 80, 9
times 4 dw -70, -87
times 4 dw -25, 57
times 4 dw 90, 43

; group 3
times 4 dw 70, -43
times 4 dw -87, 9
times 4 dw 90, 25
times 4 dw -80, -57

; group 4
times 4 dw 57, -80
times 4 dw -25, 90
times 4 dw -9, -87
times 4 dw 43, 70

; group 5
times 4 dw 43, -90
times 4 dw 57, 25
times 4 dw -87, 70
times 4 dw 9, -80

; group 6
times 4 dw 25, -70
times 4 dw 90, -80
times 4 dw 43, 9
times 4 dw -57, 87

; group 7
times 4 dw 9, -25
times 4 dw 43, -57
times 4 dw 70, -80
times 4 dw 87, -90

; 32x32 transform coeffs
; Sixteen groups of eight 16-byte rows (128 bytes per group). E32_O32 reads
; eight consecutive rows starting at the address it is given, pairing them
; with m0..m7 in order.

; group 0
trans_coeff32: times 8 dw 90
times 4 dw 88, 85
times 4 dw 82, 78
times 4 dw 73, 67
times 4 dw 61, 54
times 4 dw 46, 38
times 4 dw 31, 22
times 4 dw 13, 4

; group 1
times 4 dw 90, 82
times 4 dw 67, 46
times 4 dw 22, -4
times 4 dw -31, -54
times 4 dw -73, -85
times 4 dw -90, -88
times 4 dw -78, -61
times 4 dw -38, -13

; group 2
times 4 dw 88, 67
times 4 dw 31, -13
times 4 dw -54, -82
times 4 dw -90, -78
times 4 dw -46, -4
times 4 dw 38, 73
times 4 dw 90, 85
times 4 dw 61, 22

; group 3
times 4 dw 85, 46
times 4 dw -13, -67
times 4 dw -90, -73
times 4 dw -22, 38
times 4 dw 82, 88
times 4 dw 54, -4
times 4 dw -61, -90
times 4 dw -78, -31

; group 4
times 4 dw 82, 22
times 4 dw -54, -90
times 4 dw -61, 13
times 4 dw 78, 85
times 4 dw 31, -46
times 4 dw -90, -67
times 4 dw 4, 73
times 4 dw 88, 38

; group 5
times 4 dw 78, -4
times 4 dw -82, -73
times 4 dw 13, 85
times 4 dw 67, -22
times 4 dw -88, -61
times 4 dw 31, 90
times 4 dw 54, -38
times 4 dw -90, -46

; group 6
times 4 dw 73, -31
times 4 dw -90, -22
times 4 dw 78, 67
times 4 dw -38, -90
times 4 dw -13, 82
times 4 dw 61, -46
times 4 dw -88, -4
times 4 dw 85, 54

; group 7
times 4 dw 67, -54
times 4 dw -78, 38
times 4 dw 85, -22
times 4 dw -90, 4
times 4 dw 90, 13
times 4 dw -88, -31
times 4 dw 82, 46
times 4 dw -73, -61

; group 8
times 4 dw 61, -73
times 4 dw -46, 82
times 4 dw 31, -88
times 4 dw -13, 90
times 4 dw -4, -90
times 4 dw 22, 85
times 4 dw -38, -78
times 4 dw 54, 67

; group 9
times 4 dw 54, -85
times 4 dw -4, 88
times 4 dw -46, -61
times 4 dw 82, 13
times 4 dw -90, 38
times 4 dw 67, -78
times 4 dw -22, 90
times 4 dw -31, -73

; group 10
times 4 dw 46, -90
times 4 dw 38, 54
times 4 dw -90, 31
times 4 dw 61, -88
times 4 dw 22, 67
times 4 dw -85, 13
times 4 dw 73, -82
times 4 dw 4, 78

; group 11
times 4 dw 38, -88
times 4 dw 73, -4
times 4 dw -67, 90
times 4 dw -46, -31
times 4 dw 85, -78
times 4 dw 13, 61
times 4 dw -90, 54
times 4 dw 22, -82

; group 12
times 4 dw 31, -78
times 4 dw 90, -61
times 4 dw 4, 54
times 4 dw -88, 82
times 4 dw -38, -22
times 4 dw 73, -90
times 4 dw 67, -13
times 4 dw -46, 85

; group 13
times 4 dw 22, -61
times 4 dw 85, -90
times 4 dw 73, -38
times 4 dw -4, 46
times 4 dw -78, 90
times 4 dw -82, 54
times 4 dw -13, -31
times 4 dw 67, -88

; group 14
times 4 dw 13, -38
times 4 dw 61, -78
times 4 dw 88, -90
times 4 dw 85, -73
times 4 dw 54, -31
times 4 dw 4, 22
times 4 dw -46, 67
times 4 dw -82, 90

; group 15
times 4 dw 4, -13
times 4 dw 22, -31
times 4 dw 38, -46
times 4 dw 54, -61
times 4 dw 67, -73
times 4 dw 78, -82
times 4 dw 85, -88
times 4 dw 90, -90

SECTION .text

; void ff_hevc_idct_HxW_dc_{8,10}_<opt>(int16_t *coeffs)
; DC-only inverse transform: every coefficient of the block is overwritten
; with  (coeffs[0] + (1 << (14 - bitdepth)) + 1) >> (15 - bitdepth).
; %1 = HxW
; %2 = number of loops; each iteration stores 8 * mmsize bytes
; %3 = bitdepth
%macro IDCT_DC 3
cglobal hevc_idct_%1x%1_dc_%3, 1, 2, 1, coeff, tmp
    movsx             tmpd, word [coeffq]
    add               tmpd, (1 << (14 - %3)) + 1
    sar               tmpd, (15 - %3)
    movd               xm0, tmpd
    SPLATW              m0, xm0
    ; tmp is dead from here on; reuse its register as the loop counter
    DEFINE_ARGS coeff, cnt
    mov               cntd, %2
.loop:
    mova [coeffq+mmsize*0], m0
    mova [coeffq+mmsize*1], m0
    mova [coeffq+mmsize*2], m0
    mova [coeffq+mmsize*3], m0
    ; advance past all eight vectors, then address the second half with
    ; negative displacements
    add  coeffq, mmsize*8
    mova [coeffq+mmsize*-4], m0
    mova [coeffq+mmsize*-3], m0
    mova [coeffq+mmsize*-2], m0
    mova [coeffq+mmsize*-1], m0
    dec  cntd
    jg  .loop
    RET
%endmacro

; void ff_hevc_idct_HxW_dc_{8,10}_<opt>(int16_t *coeffs)
; Fully unrolled DC-only inverse transform: fills the block with
;   (coeffs[0] + (1 << (14 - bitdepth)) + 1) >> (15 - bitdepth).
; Stores 4 vectors, or 8 vectors when mmsize == 16.
; %1 = HxW
; %2 = bitdepth
%macro IDCT_DC_NL 2 ; No loop
cglobal hevc_idct_%1x%1_dc_%2, 1, 2, 1, coeff, tmp
    movsx             tmpd, word [coeffq]
    add               tmpd, (1 << (14 - %2)) + 1
    sar               tmpd, (15 - %2)
    ; xm0 rather than m0, as in IDCT_DC: movd has no form wider than 128 bits
    movd               xm0, tmpd
    SPLATW              m0, xm0
    mova [coeffq+mmsize*0], m0
    mova [coeffq+mmsize*1], m0
    mova [coeffq+mmsize*2], m0
    mova [coeffq+mmsize*3], m0
%if mmsize == 16
    mova [coeffq+mmsize*4], m0
    mova [coeffq+mmsize*5], m0
    mova [coeffq+mmsize*6], m0
    mova [coeffq+mmsize*7], m0
%endif
    RET
%endmacro

; IDCT 4x4, expects input in m0, m1 (m0 = src0 | src1, m1 = src2 | src3)
; %1 - shift
; %2 - 1/0 - SCALE and Transpose or not
; %3 - 1/0 add constant or not; the constant is 1 << (%1 - 1) and is read
;      from the table pd_<constant>, which must exist for the given shift
; Result: %2 == 1 -> packed, transposed words in m0, m1
;         %2 == 0 -> unscaled dwords in m0..m3
; m2, m3 and m4 are used as scratch.
%macro TR_4x4 3
    ; interleaves src0 with src2 to m0
    ;         and src1 with src3 to m1
    ; src0: 00 01 02 03     m0: 00 20 01 21 02 22 03 23
    ; src1: 10 11 12 13 -->
    ; src2: 20 21 22 23     m1: 10 30 11 31 12 32 13 33
    ; src3: 30 31 32 33

    SBUTTERFLY wd, 0, 1, 2

    pmaddwd m2, m0, [pw_64]    ; e0
    pmaddwd m3, m1, [pw_83_36] ; o0
    pmaddwd m0, [pw_64_m64]    ; e1
    pmaddwd m1, [pw_36_m83]    ; o1

%if %3 == 1
    ; bias only the even parts; it reaches both the sums and the differences
    %assign %%add 1 << (%1 - 1)
    mova  m4, [pd_ %+ %%add]
    paddd m2, m4
    paddd m0, m4
%endif

    SUMSUB_BADC d, 3, 2, 1, 0, 4

%if %2 == 1
    psrad m3, %1 ; e0 + o0
    psrad m1, %1 ; e1 + o1
    psrad m2, %1 ; e0 - o0
    psrad m0, %1 ; e1 - o1
    ;clip16
    packssdw m3, m1
    packssdw m0, m2
    ; Transpose
    SBUTTERFLY wd, 3, 0, 1
    SBUTTERFLY wd, 3, 0, 1
    SWAP 3, 1, 0
%else
    SWAP 3, 2, 0
%endif
%endmacro

; Derive the second-pass scaling parameters for a bit depth.
; %1 - bit_depth
; Defines (visible to the code that follows the invocation):
;   shift   = 20 - bit_depth
;   c_add   = 1 << (shift - 1)
;   arr_add = pd_<c_add>, the name of the matching rounding-bias table
%macro DEFINE_BIAS 1
    %assign shift (20 - %1)
    %assign c_add (1 << (shift - 1))
    %define arr_add pd_ %+ c_add
%endmacro

; Load the second-pass rounding bias for a bit depth into a register.
; %1 - bit_depth (shift = 20 - bit_depth)
; %2 - register the add constant is loaded to
; Also (re)defines shift, c_add and arr_add through DEFINE_BIAS.
%macro LOAD_BIAS 2
    DEFINE_BIAS %1
    mova %2, [arr_add]
%endmacro

; Load four 8-byte rows (4 words each) relative to r0 into two registers.
; %1, %2 - registers to load packed 16 bit values to
; %3, %4, %5, %6 - vertical offsets
; %7 - horizontal offset
; Note the pairing: %1 = row %3 (low half) | row %5 (high half)
;                   %2 = row %4 (low half) | row %6 (high half)
%macro LOAD_BLOCK 7
    movq   %1, [r0 + %3 + %7]
    movhps %1, [r0 + %5 + %7]
    movq   %2, [r0 + %4 + %7]
    movhps %2, [r0 + %6 + %7]
%endmacro

; void ff_hevc_idct_4x4_{8,10}_<opt>(int16_t *coeffs, int col_limit)
; In-place 4x4 inverse transform. Only the first argument is declared in
; cglobal, so col_limit is not read.
; %1 = bitdepth
%macro IDCT_4x4 1
cglobal hevc_idct_4x4_%1, 1, 1, 5, coeffs
    mova m0, [coeffsq]          ; rows 0 and 1
    mova m1, [coeffsq + 16]     ; rows 2 and 3

    ; each pass scales, clips and transposes, so two passes cover both
    ; dimensions
    TR_4x4 7, 1, 1
    TR_4x4 20 - %1, 1, 1

    mova [coeffsq],      m0
    mova [coeffsq + 16], m1
    RET
%endmacro

; scale, pack (clip16) and store the residuals, 4 columns at one time
; %1 - offset from coeffsq of the row receiving e + o
; %2 - offset from coeffsq of the row receiving e - o
; %3 - register with e - o (dwords); shifted in place
; %4 - shift
; %5 - register with e + o (dwords); overwritten with the packed result
; %6, %7 - not used; E16_O16 invokes STORE_8 and STORE_16 with the same
;          seven arguments
%macro STORE_8 7
    psrad    %5, %4
    psrad    %3, %4
    packssdw %5, %3             ; low half: e + o, high half: e - o
    movq     [coeffsq + %1], %5
    movhps   [coeffsq + %2], %5
%endmacro

; Combine one e8 row with the matching o8 row computed from m4 and m5.
; %1 - horizontal offset
; %2 - shift
; %3, %4 - transform coeffs (multiplied with m4 and m5 respectively)
; %5 - vertical offset for e8 + o8
; %6 - vertical offset for e8 - o8
; %7 - register with e8 inside
; %8 - block_size
; %9 - register to store e8 + o8
; %10 - register to store e8 - o8
; block_size == 8: both rows are scaled, packed and stored by STORE_8.
; otherwise:       nothing is stored; the results are left in %9 and %10.
; m6 and m7 are used as scratch.
%macro E8_O8 10
    pmaddwd m6, m4, %3
    pmaddwd m7, m5, %4

    paddd m6, m7     ; o8
    paddd m7, m6, %7 ; o8 + e8
    psubd %7, m6     ; e8 - o8
%if %8 == 8
    STORE_8 %5 + %1, %6 + %1, %7, %2, m7, 0, 0
%else
    SWAP m7, %9
    SWAP %7, %10
%endif
%endmacro

; 8x4 residuals are processed and stored
; %1 - horizontal offset
; %2 - shift
; %3 - offset of the even row
; %4 - step: 1 for 8x8, 2 for 16x16, 4 for 32x32
; %5 - offset of the odd row
; %6 - block size
; %7 - 1/0 add a constant in TR_4x4 or not
; The constant is added for the 8x8 transform but not for 16x16 and 32x32.
; With a block size other than 8, E8_O8 stores nothing and leaves the
; results in m8..m15.
%macro TR_8x4 7
    ; load 4 columns of even rows
    LOAD_BLOCK  m0, m1, 0, 2 * %4 * %3, %4 * %3, 3 * %4 * %3, %1

    TR_4x4 %2, 0, %7 ; e8: m0, m1, m2, m3, for 4 columns only

    ; load 4 columns of odd rows
    LOAD_BLOCK m4, m5, %4 * %5, 3 * %4 * %5, 5 * %4 * %5, 7 * %4 * %5, %1

    ; 00 01 02 03
    ; 10 11 12 13      m4: 10 30 11 31 12 32 13 33

    ; ...        -- >
    ;                  m5: 50 70 51 71 52 72 53 73
    ; 70 71 72 73
    SBUTTERFLY wd, 4, 5, 6

    ; output row pairs 0/7, 1/6, 2/5 and 3/4
    E8_O8 %1, %2, [pw_89_75],  [pw_50_18],   0,      %5 * 7, m0, %6, m8, m15
    E8_O8 %1, %2, [pw_75_m18], [pw_m89_m50], %5,     %5 * 6, m1, %6, m9, m14
    E8_O8 %1, %2, [pw_50_m89], [pw_18_75],   %5 * 2, %5 * 5, m2, %6, m10, m13
    E8_O8 %1, %2, [pw_18_m50], [pw_75_m89],  %5 * 3, %5 * 4, m3, %6, m11, m12
%endmacro

; Store two registers as four 8-byte rows relative to r0.
; %1, %2 - registers holding the packed 16 bit values
; %3, %4, %5, %6 - vertical offsets
; %7 - horizontal offset
; Pairing (differs from LOAD_BLOCK): low(%1) -> %3, high(%1) -> %4,
;                                    low(%2) -> %5, high(%2) -> %6
%macro STORE_PACKED 7
    movq   [r0 + %3 + %7], %1
    movhps [r0 + %4 + %7], %1
    movq   [r0 + %5 + %7], %2
    movhps [r0 + %6 + %7], %2
%endmacro

; transpose 4x4 block of words packed in two registers
; %1, %2 - numbers of the registers holding the block (transposed in place)
; %3 - number of a temporary register
%macro TRANSPOSE_4x4 3
    SBUTTERFLY wd, %1, %2, %3
    SBUTTERFLY dq, %1, %2, %3
%endmacro

; Exchange two 4x4 blocks of the matrix at r0, transposing each on the way:
; M_i receives M_j^T and M_j receives M_i^T.
; %1 - horizontal offset of the block i
; %2 - vertical offset of the block i
; %3 - width in bytes
; %4 - vertical offset for the block j
; %5 - horizontal offset for the block j
; m4..m7 are used as scratch.
%macro SWAP_BLOCKS 5
    ; M_j
    LOAD_BLOCK m4, m5, %4, %4 + %3, %4 + 2 * %3, %4 + 3 * %3, %5
    TRANSPOSE_4x4 4, 5, 6

    ; M_i (read before its location is overwritten below)
    LOAD_BLOCK m6, m7, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1

    STORE_PACKED m4, m5, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1

    ; transpose and store M_i
    SWAP m6, m4
    SWAP m7, m5
    TRANSPOSE_4x4 4, 5, 6
    STORE_PACKED m4, m5, %4, %4 + %3, %4 + 2 * %3, %4 + 3 * %3, %5
%endmacro

; Transpose one 4x4 block of the matrix at r0 in place.
; %1 - horizontal offset
; %2 - vertical offset of the block
; %3 - width in bytes
; m4, m5 and m6 are used as scratch.
%macro TRANSPOSE_BLOCK 3
    LOAD_BLOCK m4, m5, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1
    TRANSPOSE_4x4 4, 5, 6
    STORE_PACKED m4, m5, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1
%endmacro

; Internal helper: transpose the 8x8 matrix of words at r0 in place
; (row width 16 bytes), one 4x4 block at a time.
; Declared with no arguments and no registers: it expects r0 to be set up by
; the caller, uses m4..m7 as scratch and returns with a plain ret.
%macro TRANSPOSE_8x8 0
cglobal hevc_idct_transpose_8x8, 0, 0, 0
    ; M1 M2 ^T = M1^t M3^t
    ; M3 M4      M2^t M4^t

    ; M1 4x4 block
    TRANSPOSE_BLOCK 0, 0, 16

    ; M2 and M3
    SWAP_BLOCKS 0, 64, 16, 0, 8

    ; M4
    TRANSPOSE_BLOCK 8, 64, 16

    ret
%endmacro

; void ff_hevc_idct_8x8_{8,10}_<opt>(int16_t *coeffs, int col_limit)
; In-place 8x8 inverse transform in two passes, each followed by a transpose.
; Only the first argument is declared in cglobal, so col_limit is not read.
; %1 = bitdepth
%macro IDCT_8x8 1
cglobal hevc_idct_8x8_%1, 1, 1, 8, coeffs
    ; first pass, shift 7; columns 0-3 then columns 4-7
    TR_8x4 0, 7, 32, 1, 16, 8, 1
    TR_8x4 8, 7, 32, 1, 16, 8, 1

    call hevc_idct_transpose_8x8_ %+ cpuname

    ; second pass, shift = 20 - bitdepth
    DEFINE_BIAS %1
    TR_8x4 0, shift, 32, 1, 16, 8, 1
    TR_8x4 8, shift, 32, 1, 16, 8, 1

    TAIL_CALL hevc_idct_transpose_8x8_ %+ cpuname, 1
%endmacro

; store intermediate results on the stack, unscaled
; %1, %2 - not used here (coeffsq offsets in STORE_8)
; %3 - register with e - o, stored at rsp + %7
; %4 - shift, unused here
; %5 - register with e + o, stored at rsp + %6
; %6, %7 - stack offsets
%macro STORE_16 7
    mova [rsp + %6], %5
    mova [rsp + %7], %3
%endmacro

; Multiply-accumulate two interleaved row registers into an accumulator.
; %1, %2 - transform constants
; %3, %4 - regs with interleaved coeffs
; %5 - 1/0 SWAP or add: 1 starts a new sum in %8, 0 adds to the sum in %8
; %6, %7 - registers for intermediate sums
; %8 - accumulator register
%macro ADD_ROWS 8
    pmaddwd %6, %3, %1
    pmaddwd %7, %4, %2
    paddd   %6, %7
%if %5 == 1
    SWAP %6, %8
%else
    paddd %8, %6
%endif
%endmacro

; Produce one pair of 16-point outputs: o16 is accumulated from the
; interleaved rows in m0..m3 and combined with the e16 row in %7.
; %1 - transform coeffs (address of four consecutive 16-byte rows)
; %2, %3 offsets for storing e+o/e-o back to coeffsq
; %4 - shift
; %5 - add (rounding constant; applied only when %6 == 8)
; %6 - block_size; also selects the store macro, STORE_%6
; %7 - register with e16
; %8, %9 - stack offsets for storing e+o/e-o
; m4..m7 are used as scratch; %7 is overwritten.
%macro E16_O16 9
    ADD_ROWS [%1],          [%1 +     16], m0, m1, 1, m5, m6, m7
    ADD_ROWS [%1 + 2 * 16], [%1 + 3 * 16], m2, m3, 0, m5, m6, m7

%if %6 == 8
    paddd %7, %5
%endif

    paddd m4, m7, %7 ; o16 + e16
    psubd %7, m7     ; e16 - o16
    STORE_%6 %2, %3, %7, %4, m4, %8, %9
%endmacro

; 16-point transform of 4 columns.
; %1 - horizontal offset
; %2 - shift
; %3 - add constant (operand holding the rounding bias)
; %4 - offset of the even row, passed to TR_8x4
; %5 - step, passed to TR_8x4
; %6 - row stride in bytes: odd row offset for TR_8x4, unit of the odd-row
;      loads below and of the offsets results are stored at
; %7 - block size for E16_O16 (selects STORE_%7; 8 also adds the bias)
; %8 - block size for TR_8x4
; %9 - multiplier applied to the odd-row offsets
; %10 - not referenced in this macro
%macro TR_16x4 10
    ; produce the 8 rows of e16 coeffs for 4 columns; TR_8x4 leaves them in
    ; m8..m15 when its block size is not 8. The shift passed here only
    ; matters for block size 8.
    TR_8x4 %1, 7, %4, %5, %6, %8, 0

    ; load 8 odd rows (1, 3, ..., 15 times %9 * %6)
    LOAD_BLOCK m0, m1, %9 * %6, %9 * 3 * %6, %9 * 5 * %6, %9 * 7 * %6, %1
    LOAD_BLOCK m2, m3, %9 * 9 * %6, %9 * 11 * %6, %9 * 13 * %6, %9 * 15 * %6, %1

    SBUTTERFLY wd, 0, 1, 4
    SBUTTERFLY wd, 2, 3, 4

    E16_O16 trans_coeffs16,               0 + %1, 15 * %6 + %1, %2, %3, %7, m8,       0, 15 * 16
    ; m8 (e16 row 0) has been consumed: reuse it to hold the add constant
    mova m8, %3
    E16_O16 trans_coeffs16 +     64,     %6 + %1, 14 * %6 + %1, %2, m8, %7, m9,      16, 14 * 16
    E16_O16 trans_coeffs16 + 2 * 64, 2 * %6 + %1, 13 * %6 + %1, %2, m8, %7, m10, 2 * 16, 13 * 16
    E16_O16 trans_coeffs16 + 3 * 64, 3 * %6 + %1, 12 * %6 + %1, %2, m8, %7, m11, 3 * 16, 12 * 16
    E16_O16 trans_coeffs16 + 4 * 64, 4 * %6 + %1, 11 * %6 + %1, %2, m8, %7, m12, 4 * 16, 11 * 16
    E16_O16 trans_coeffs16 + 5 * 64, 5 * %6 + %1, 10 * %6 + %1, %2, m8, %7, m13, 5 * 16, 10 * 16
    E16_O16 trans_coeffs16 + 6 * 64, 6 * %6 + %1,  9 * %6 + %1, %2, m8, %7, m14, 6 * 16,  9 * 16
    E16_O16 trans_coeffs16 + 7 * 64, 7 * %6 + %1,  8 * %6 + %1, %2, m8, %7, m15, 7 * 16,  8 * 16
%endmacro

; Internal helper: transpose the 16x16 matrix of words at r0 in place
; (row width 32 bytes, so four rows span 128 bytes), one 4x4 block at a time.
; Declared with no arguments and no registers: it expects r0 to be set up by
; the caller, uses m4..m7 as scratch and returns with a plain ret.
%macro TRANSPOSE_16x16 0
cglobal hevc_idct_transpose_16x16, 0, 0, 0
; M1  M2  M3  M4 ^T      m1 m5 m9  m13   M_i^T = m_i
; M5  M6  M7  M8    -->  m2 m6 m10 m14
; M9  M10 M11 M12        m3 m7 m11 m15
; M13 M14 M15 M16        m4 m8 m12 m16

    ; M1 4x4 block
    TRANSPOSE_BLOCK 0, 0, 32

    ; M5, M2
    SWAP_BLOCKS 0, 128, 32, 0, 8
    ; M9, M3
    SWAP_BLOCKS 0, 256, 32, 0, 16
    ; M13, M4
    SWAP_BLOCKS 0, 384, 32, 0, 24

    ;M6
    TRANSPOSE_BLOCK 8, 128, 32

    ; M10, M7
    SWAP_BLOCKS 8, 256, 32, 128, 16
    ; M14, M8
    SWAP_BLOCKS 8, 384, 32, 128, 24

    ;M11
    TRANSPOSE_BLOCK 16, 256, 32

    ; M15, M12
    SWAP_BLOCKS 16, 384, 32, 256, 24

    ;M16
    TRANSPOSE_BLOCK 24, 384, 32

    ret
%endmacro

; void ff_hevc_idct_16x16_{8,10}_<opt>(int16_t *coeffs, int col_limit)
; In-place 16x16 inverse transform in two passes, each followed by a
; transpose. Each pass handles the matrix in four strips of 4 columns;
; r1 counts the strips down from 3 to 0 and 8 * r1 is the strip's byte offset.
; Only the first argument is declared in cglobal, so col_limit is not read.
; %1 = bitdepth
%macro IDCT_16x16 1
cglobal hevc_idct_16x16_%1, 1, 2, 16, coeffs
    ; first pass: shift 7, bias pd_64
    mov r1d, 3
.loop16:
    TR_16x4 8 * r1, 7, [pd_64], 64, 2, 32, 8, 16, 1, 0
    dec r1d
    jge .loop16

    call hevc_idct_transpose_16x16_ %+ cpuname

    ; second pass: shift = 20 - bitdepth, bias arr_add
    DEFINE_BIAS %1
    mov r1d, 3
.loop16_2:
    TR_16x4 8 * r1, shift, [arr_add], 64, 2, 32, 8, 16, 1, 1
    dec r1d
    jge .loop16_2

    TAIL_CALL hevc_idct_transpose_16x16_ %+ cpuname, 1
%endmacro

650cabdff1aSopenharmony_ci; scale, pack (clip16) and store the residuals     0 e32[0] + o32[0] --> %1
651cabdff1aSopenharmony_ci; 4 at one time (4 columns)                        1 e32[1] + o32[1]
652cabdff1aSopenharmony_ci; %1 - address to store e32 + o32
653cabdff1aSopenharmony_ci; %2 - address to store e32 - o32
654cabdff1aSopenharmony_ci; %3 - reg with e32 - o32                                  ...
655cabdff1aSopenharmony_ci; %4 - shift                                       30 e32[1] - o32[1]
656cabdff1aSopenharmony_ci; %5 - reg with e32 + o32                          31 e32[0] - o32[0] --> %2
657cabdff1aSopenharmony_ci%macro STORE_32 5 ; both input registers (%3 and %5) are overwritten
658cabdff1aSopenharmony_ci    psrad    %5, %4 ; arithmetic right shift of the four 32-bit sums
659cabdff1aSopenharmony_ci    psrad    %3, %4 ; same shift for the four 32-bit differences
660cabdff1aSopenharmony_ci    packssdw %5, %3 ; signed saturation to int16: low half of %5 = sums, high half = differences
661cabdff1aSopenharmony_ci    movq     [%1], %5 ; low 64 bits (4 x int16 sums) -> %1
662cabdff1aSopenharmony_ci    movhps   [%2], %5 ; high 64 bits (4 x int16 differences) -> %2
663cabdff1aSopenharmony_ci%endmacro
664cabdff1aSopenharmony_ci
665cabdff1aSopenharmony_ci; %1 - transform coeffs (base address; the eight 16-byte rows %1 .. %1 + 7 * 16 are passed to ADD_ROWS)
666cabdff1aSopenharmony_ci; %2 - address to store e32 + o32
667cabdff1aSopenharmony_ci; %3 - address to store e32 - o32
668cabdff1aSopenharmony_ci; %4 - shift
669cabdff1aSopenharmony_ci; %5 - stack offset of e32
670cabdff1aSopenharmony_ci%macro E32_O32 5 ; reads m0-m7 and the bias in m14; overwrites m11, m12 and the m8-m10 handed to ADD_ROWS
671cabdff1aSopenharmony_ci    ADD_ROWS [%1],          [%1 +     16], m0, m1, 1, m8, m9, m10 ; 5th argument is 1 only on this first call (presumably starts the sum - confirm at ADD_ROWS)
672cabdff1aSopenharmony_ci    ADD_ROWS [%1 + 2 * 16], [%1 + 3 * 16], m2, m3, 0, m8, m9, m10
673cabdff1aSopenharmony_ci    ADD_ROWS [%1 + 4 * 16], [%1 + 5 * 16], m4, m5, 0, m8, m9, m10
674cabdff1aSopenharmony_ci    ADD_ROWS [%1 + 6 * 16], [%1 + 7 * 16], m6, m7, 0, m8, m9, m10
675cabdff1aSopenharmony_ci
676cabdff1aSopenharmony_ci    paddd m11, m14, [rsp + %5] ; m11 = bias (m14) + e32 read from the stack
677cabdff1aSopenharmony_ci    paddd m12, m10, m11 ; o32 + e32 (m10 is used as o32, presumably accumulated by the ADD_ROWS calls above)
678cabdff1aSopenharmony_ci    psubd m11, m10      ; e32 - o32
679cabdff1aSopenharmony_ci    STORE_32 %2, %3, m11, %4, m12 ; shifted and packed: m12 (sum) -> [%2], m11 (difference) -> [%3]
680cabdff1aSopenharmony_ci%endmacro
681cabdff1aSopenharmony_ci
682cabdff1aSopenharmony_ci; %1 - horizontal offset
683cabdff1aSopenharmony_ci; %2 - bitdepth (only used when %3 != 1)
684cabdff1aSopenharmony_ci%macro TR_32x4 3 ; %3 - 1: shift 7 with bias pd_64; any other value: bias loaded by LOAD_BIAS %2
685cabdff1aSopenharmony_ci    TR_16x4 %1, 7, [pd_64], 128, 4, 64, 16, 16, 2, 0 ; NOTE(review): presumably leaves the e32 terms on the stack, where E32_O32 reads them - confirm at TR_16x4
686cabdff1aSopenharmony_ci
687cabdff1aSopenharmony_ci    LOAD_BLOCK m0, m1,      64,  3 * 64,  5 * 64,  7 * 64, %1 ; offsets are odd multiples of 64 bytes (64 bytes = one row of 32 int16): rows 1, 3, 5, 7
688cabdff1aSopenharmony_ci    LOAD_BLOCK m2, m3,  9 * 64, 11 * 64, 13 * 64, 15 * 64, %1 ; rows 9, 11, 13, 15
689cabdff1aSopenharmony_ci    LOAD_BLOCK m4, m5, 17 * 64, 19 * 64, 21 * 64, 23 * 64, %1 ; rows 17, 19, 21, 23
690cabdff1aSopenharmony_ci    LOAD_BLOCK m6, m7, 25 * 64, 27 * 64, 29 * 64, 31 * 64, %1 ; rows 25, 27, 29, 31
691cabdff1aSopenharmony_ci
692cabdff1aSopenharmony_ci    SBUTTERFLY wd, 0, 1, 8 ; interleave the 16-bit words of each register pair; m8 is the temporary
693cabdff1aSopenharmony_ci    SBUTTERFLY wd, 2, 3, 8
694cabdff1aSopenharmony_ci    SBUTTERFLY wd, 4, 5, 8
695cabdff1aSopenharmony_ci    SBUTTERFLY wd, 6, 7, 8
696cabdff1aSopenharmony_ci
697cabdff1aSopenharmony_ci%if %3 == 1
698cabdff1aSopenharmony_ci    %assign shift 7
699cabdff1aSopenharmony_ci    mova m14, [pd_64] ; first pass: shift 7 (assigned above) and a bias of 64 in every dword
700cabdff1aSopenharmony_ci%else
701cabdff1aSopenharmony_ci    LOAD_BIAS %2, m14 ; NOTE(review): shift is not assigned in this branch; presumably LOAD_BIAS defines it along with m14 - confirm
702cabdff1aSopenharmony_ci%endif
703cabdff1aSopenharmony_ci
704cabdff1aSopenharmony_ci    lea r2, [trans_coeff32 + 15 * 128] ; r2 = last 128-byte group of trans_coeff32, walked backwards
705cabdff1aSopenharmony_ci    lea r3, [coeffsq + %1] ; r3 = row 0 of this 4-column group
706cabdff1aSopenharmony_ci    lea r4, [r3 + 16 * 64] ; r4 = row 16 of the group, walked forwards
707cabdff1aSopenharmony_ci    mov r5d, 15 * 16 ; r5 = e32 stack offset 16 * k; r5 * 4 = 64 * k is also the byte offset of row k
708cabdff1aSopenharmony_ci%%loop:
709cabdff1aSopenharmony_ci    E32_O32 r2, r3 + r5 * 4, r4, shift, r5 ; e32 + o32 -> row k = r5 / 16, e32 - o32 -> row 31 - k (r4)
710cabdff1aSopenharmony_ci    sub r2, 128
711cabdff1aSopenharmony_ci    add r4, 64
712cabdff1aSopenharmony_ci    sub r5d, 16 ; must stay the last flag-setting instruction before the branch
713cabdff1aSopenharmony_ci    jge %%loop ; 16 iterations: r5 = 240, 224, ..., 0
714cabdff1aSopenharmony_ci%endmacro
715cabdff1aSopenharmony_ci
716cabdff1aSopenharmony_ci%macro TRANSPOSE_32x32 0
717cabdff1aSopenharmony_cicglobal hevc_idct_transpose_32x32, 0, 0, 0 ; helper reached via call / TAIL_CALL from IDCT_32x32; declares no arguments or registers, yet overwrites r1-r3
718cabdff1aSopenharmony_ci    ; The 32x32 block is treated as an 8x8 grid of 4x4 sub-blocks, numbered row by row:
719cabdff1aSopenharmony_ci    ; M0  M1 ... M7
720cabdff1aSopenharmony_ci    ; M8         M15
721cabdff1aSopenharmony_ci    ; ...
722cabdff1aSopenharmony_ci    ; M56        M63
723cabdff1aSopenharmony_ci    ; Offsets step by 8 bytes per grid column (4 int16) and 256 bytes per grid row (4 rows * 64) - as read from the arguments; confirm at TRANSPOSE_BLOCK / SWAP_BLOCKS
724cabdff1aSopenharmony_ci
725cabdff1aSopenharmony_ci    TRANSPOSE_BLOCK 0, 0, 64 ; M0
726cabdff1aSopenharmony_ci    mov r1d, 7 ; j = 7 down to 1
727cabdff1aSopenharmony_ci    mov r2d, 7 * 256 ; r2 = j * 256
728cabdff1aSopenharmony_ci.loop_transpose:
729cabdff1aSopenharmony_ci    SWAP_BLOCKS 0, r2, 64, 0, r1 * 8 ; pairs the block at grid row j, column 0 with its mirror at row 0, column j
730cabdff1aSopenharmony_ci    sub r2d, 256
731cabdff1aSopenharmony_ci    dec r1d
732cabdff1aSopenharmony_ci    jg .loop_transpose ; stops at j = 0: M0 was already handled above
733cabdff1aSopenharmony_ci
734cabdff1aSopenharmony_ci    TRANSPOSE_BLOCK 8, 256, 64 ; M9
735cabdff1aSopenharmony_ci    mov r1d, 6 ; six mirror pairs remain for grid row/column 1
736cabdff1aSopenharmony_ci    mov r2d, 512 ; starts at grid row 2
737cabdff1aSopenharmony_ci    mov r3d, 16 ; starts at grid column 2
738cabdff1aSopenharmony_ci.loop_transpose2:
739cabdff1aSopenharmony_ci    SWAP_BLOCKS 8, r2, 64, 256, r3 ; (row j, column 1) paired with (row 1, column j), j = 2 .. 7
740cabdff1aSopenharmony_ci    add r3d, 8
741cabdff1aSopenharmony_ci    add r2d, 256
742cabdff1aSopenharmony_ci    dec r1d
743cabdff1aSopenharmony_ci    jg .loop_transpose2
744cabdff1aSopenharmony_ci
745cabdff1aSopenharmony_ci    TRANSPOSE_BLOCK 2 * 8, 2 * 256, 64 ; M18
746cabdff1aSopenharmony_ci    mov r1d, 5 ; five mirror pairs remain for grid row/column 2
747cabdff1aSopenharmony_ci    mov r2d, 768 ; starts at grid row 3
748cabdff1aSopenharmony_ci    mov r3d, 24 ; starts at grid column 3
749cabdff1aSopenharmony_ci.loop_transpose3:
750cabdff1aSopenharmony_ci    SWAP_BLOCKS 2 * 8, r2, 64, 2 * 256, r3 ; (row j, column 2) paired with (row 2, column j), j = 3 .. 7
751cabdff1aSopenharmony_ci    add r3d, 8
752cabdff1aSopenharmony_ci    add r2d, 256
753cabdff1aSopenharmony_ci    dec r1d
754cabdff1aSopenharmony_ci    jg .loop_transpose3
755cabdff1aSopenharmony_ci
756cabdff1aSopenharmony_ci    TRANSPOSE_BLOCK 3 * 8, 3 * 256, 64 ; M27
757cabdff1aSopenharmony_ci    mov r1d, 4 ; four mirror pairs remain for grid row/column 3
758cabdff1aSopenharmony_ci    mov r2d, 1024 ; starts at grid row 4
759cabdff1aSopenharmony_ci    mov r3d, 32 ; starts at grid column 4
760cabdff1aSopenharmony_ci.loop_transpose4:
761cabdff1aSopenharmony_ci    SWAP_BLOCKS 3 * 8, r2, 64, 3 * 256, r3 ; (row j, column 3) paired with (row 3, column j), j = 4 .. 7
762cabdff1aSopenharmony_ci    add r3d, 8
763cabdff1aSopenharmony_ci    add r2d, 256
764cabdff1aSopenharmony_ci    dec r1d
765cabdff1aSopenharmony_ci    jg .loop_transpose4
766cabdff1aSopenharmony_ci
767cabdff1aSopenharmony_ci    TRANSPOSE_BLOCK 4 * 8, 4 * 256, 64 ; M36
768cabdff1aSopenharmony_ci    mov r1d, 3 ; three mirror pairs remain for grid row/column 4
769cabdff1aSopenharmony_ci    mov r2d, 1280 ; starts at grid row 5
770cabdff1aSopenharmony_ci    mov r3d, 40 ; starts at grid column 5
771cabdff1aSopenharmony_ci.loop_transpose5:
772cabdff1aSopenharmony_ci    SWAP_BLOCKS 4 * 8, r2, 64, 4 * 256, r3 ; (row j, column 4) paired with (row 4, column j), j = 5 .. 7
773cabdff1aSopenharmony_ci    add r3d, 8
774cabdff1aSopenharmony_ci    add r2d, 256
775cabdff1aSopenharmony_ci    dec r1d
776cabdff1aSopenharmony_ci    jg .loop_transpose5
777cabdff1aSopenharmony_ci
778cabdff1aSopenharmony_ci    TRANSPOSE_BLOCK 5 * 8, 5 * 256, 64 ; M45
779cabdff1aSopenharmony_ci    SWAP_BLOCKS 5 * 8, 6 * 256, 64, 5 * 256, 6 * 8 ; the last rows are short enough to be written out: row/column 5 with j = 6
780cabdff1aSopenharmony_ci    SWAP_BLOCKS 5 * 8, 7 * 256, 64, 5 * 256, 7 * 8 ; row/column 5 with j = 7
781cabdff1aSopenharmony_ci
782cabdff1aSopenharmony_ci    TRANSPOSE_BLOCK 6 * 8, 6 * 256, 64 ; M54
783cabdff1aSopenharmony_ci    SWAP_BLOCKS 6 * 8, 7 * 256, 64, 6 * 256, 7 * 8 ; row/column 6 with j = 7
784cabdff1aSopenharmony_ci
785cabdff1aSopenharmony_ci    TRANSPOSE_BLOCK 7 * 8, 7 * 256, 64 ; M63
786cabdff1aSopenharmony_ci
787cabdff1aSopenharmony_ci    ret
788cabdff1aSopenharmony_ci%endmacro
789cabdff1aSopenharmony_ci
790cabdff1aSopenharmony_ci; void ff_hevc_idct_32x32_{8,10}_<opt>(int16_t *coeffs, int col_limit)
791cabdff1aSopenharmony_ci; %1 = bitdepth; col_limit is not read: only coeffs is declared to cglobal and r1 is reused as the loop counter
792cabdff1aSopenharmony_ci%macro IDCT_32x32 1
793cabdff1aSopenharmony_cicglobal hevc_idct_32x32_%1, 1, 6, 16, 256, coeffs ; x86inc: 1 argument, 6 GPRs, 16 XMM registers, 256 bytes of stack (E32_O32 reads 16-byte slots at [rsp + 0 .. 240])
794cabdff1aSopenharmony_ci    mov r1d, 7 ; r1 = group counter, 7 down to 0; each TR_32x4 call works at byte offset 8 * r1 (4 int16 columns)
795cabdff1aSopenharmony_ci.loop32: ; pass 1: third TR_32x4 argument 1 selects shift 7 with bias pd_64
796cabdff1aSopenharmony_ci    TR_32x4 8 * r1, %1, 1
797cabdff1aSopenharmony_ci    dec r1d
798cabdff1aSopenharmony_ci    jge .loop32 ; signed test on the dec result: the r1 = 0 iteration still runs, the loop exits once r1 = -1
799cabdff1aSopenharmony_ci
800cabdff1aSopenharmony_ci    call hevc_idct_transpose_32x32_ %+ cpuname
801cabdff1aSopenharmony_ci
802cabdff1aSopenharmony_ci    mov r1d, 7
803cabdff1aSopenharmony_ci.loop32_2: ; pass 2, run after the transpose call above: third argument 0 takes the bias from LOAD_BIAS for this bitdepth
804cabdff1aSopenharmony_ci    TR_32x4 8 * r1, %1, 0
805cabdff1aSopenharmony_ci    dec r1d
806cabdff1aSopenharmony_ci    jge .loop32_2
807cabdff1aSopenharmony_ci
808cabdff1aSopenharmony_ci    TAIL_CALL hevc_idct_transpose_32x32_ %+ cpuname, 1 ; second transpose; x86inc's TAIL_CALL makes the helper the function's exit path
809cabdff1aSopenharmony_ci%endmacro
810cabdff1aSopenharmony_ci
811cabdff1aSopenharmony_ci%macro INIT_IDCT_DC 1 ; %1 = bitdepth, forwarded as the last argument of every IDCT_DC / IDCT_DC_NL instantiation below
812cabdff1aSopenharmony_ciINIT_MMX mmxext ; the next kernel is assembled for MMXEXT
813cabdff1aSopenharmony_ciIDCT_DC_NL  4,      %1 ; NOTE(review): first argument is presumably the block size, _NL presumably "no loop" - confirm at IDCT_DC_NL
814cabdff1aSopenharmony_ci
815cabdff1aSopenharmony_ciINIT_XMM sse2 ; the next three kernels are assembled for SSE2
816cabdff1aSopenharmony_ciIDCT_DC_NL  8,      %1
817cabdff1aSopenharmony_ciIDCT_DC    16,  4,  %1
818cabdff1aSopenharmony_ciIDCT_DC    32, 16,  %1
819cabdff1aSopenharmony_ci
820cabdff1aSopenharmony_ci%if HAVE_AVX2_EXTERNAL
821cabdff1aSopenharmony_ci    INIT_YMM avx2 ; AVX2 variants are assembled only when HAVE_AVX2_EXTERNAL is set
822cabdff1aSopenharmony_ci    IDCT_DC    16,  2,  %1 ; middle argument is half the SSE2 value (2 vs 4) - presumably because YMM registers are twice as wide; confirm at IDCT_DC
823cabdff1aSopenharmony_ci    IDCT_DC    32,  8,  %1 ; likewise 8 vs 16
824cabdff1aSopenharmony_ci%endif ;HAVE_AVX2_EXTERNAL
825cabdff1aSopenharmony_ci%endmacro
826cabdff1aSopenharmony_ci
827cabdff1aSopenharmony_ci%macro INIT_IDCT 2 ; %1 = bitdepth, %2 = instruction set handed to INIT_XMM
828cabdff1aSopenharmony_ciINIT_XMM %2
829cabdff1aSopenharmony_ci%if %1 == 8 ; the 32x32 transpose helper has no bitdepth in its cglobal name (the others presumably likewise), so the helpers are emitted once per instruction set
830cabdff1aSopenharmony_ci    TRANSPOSE_8x8
831cabdff1aSopenharmony_ci    %if ARCH_X86_64 ; these two helpers are called by IDCT_16x16 / IDCT_32x32, which are x86-64 only as well
832cabdff1aSopenharmony_ci        TRANSPOSE_16x16
833cabdff1aSopenharmony_ci        TRANSPOSE_32x32
834cabdff1aSopenharmony_ci    %endif
835cabdff1aSopenharmony_ci%endif
836cabdff1aSopenharmony_ci%if ARCH_X86_64 ; both functions request 16 XMM registers in their cglobal lines
837cabdff1aSopenharmony_ci    IDCT_32x32 %1
838cabdff1aSopenharmony_ci    IDCT_16x16 %1
839cabdff1aSopenharmony_ci%endif
840cabdff1aSopenharmony_ciIDCT_8x8 %1
841cabdff1aSopenharmony_ciIDCT_4x4 %1
842cabdff1aSopenharmony_ci%endmacro
843cabdff1aSopenharmony_ci
844cabdff1aSopenharmony_ciINIT_IDCT_DC 8 ; IDCT_DC / IDCT_DC_NL kernels for bitdepths 8, 10 and 12
845cabdff1aSopenharmony_ciINIT_IDCT_DC 10
846cabdff1aSopenharmony_ciINIT_IDCT_DC 12
847cabdff1aSopenharmony_ciINIT_IDCT 8, sse2 ; full IDCTs for bitdepths 8 and 10, SSE2 and AVX builds; the two 8-bit lines also emit the transpose helpers
848cabdff1aSopenharmony_ciINIT_IDCT 8, avx
849cabdff1aSopenharmony_ciINIT_IDCT 10, sse2
850cabdff1aSopenharmony_ciINIT_IDCT 10, avx
851cabdff1aSopenharmony_ci;INIT_IDCT 12, sse2 ; 12-bit full IDCTs are left disabled
852cabdff1aSopenharmony_ci;INIT_IDCT 12, avx
853