1cabdff1aSopenharmony_ci/*
2cabdff1aSopenharmony_ci * ARM NEON optimised IDCT functions for HEVC decoding
3cabdff1aSopenharmony_ci * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
4cabdff1aSopenharmony_ci * Copyright (c) 2017 Alexandra Hájková
5cabdff1aSopenharmony_ci *
6cabdff1aSopenharmony_ci * This file is part of FFmpeg.
7cabdff1aSopenharmony_ci *
8cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or
9cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public
10cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either
11cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version.
12cabdff1aSopenharmony_ci *
13cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful,
14cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of
15cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16cabdff1aSopenharmony_ci * Lesser General Public License for more details.
17cabdff1aSopenharmony_ci *
18cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public
19cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software
20cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21cabdff1aSopenharmony_ci */
22cabdff1aSopenharmony_ci
23cabdff1aSopenharmony_ci#include "libavutil/arm/asm.S"
24cabdff1aSopenharmony_ci
@ Coefficient table shared by the inverse-transform macros in this file,
@ stored as 16-bit values, four per row (8 bytes per row).
@ Row 0 (64, 83, 64, 36) is what idct_4x4 loads into d4 for tr_4x4, and what
@ the first load in tr_4x4_8 fetches when r1 points at the table.
@ Row 1 (89, 75, 50, 18) is what the second load in tr_4x4_8 fetches in that
@ case; tr_8x4 then uses it for the odd half of the 8-point transform.
@ NOTE(review): the remaining rows are presumably the odd-half coefficients
@ of the larger transforms; their readers are outside this excerpt - confirm.
const trans, align=4
        .short 64, 83, 64, 36
        .short 89, 75, 50, 18
        .short 90, 87, 80, 70
        .short 57, 43, 25, 9
        .short 90, 90, 88, 85
        .short 82, 78, 73, 67
        .short 61, 54, 46, 38
        .short 31, 22, 13, 4
endconst
35cabdff1aSopenharmony_ci
@ Clamp every signed 16-bit lane of in1 and in2 to the range [c1, c2].
@   in1, in2 - vectors clamped in place
@   c1       - vector holding the lower bound in every lane
@   c2       - vector holding the upper bound in every lane
@ The callers in this file pass 0 for c1 and 0x3FF for c2.
.macro clip10 in1, in2, c1, c2
        vmax.s16        \in1, \in1, \c1
        vmax.s16        \in2, \in2, \c1
        vmin.s16        \in1, \in1, \c2
        vmin.s16        \in2, \in2, \c2
.endm
42cabdff1aSopenharmony_ci
@ Add a 4x4 block of 16-bit residuals to 8-bit pixels, in place.
@   r0 - destination pixels: four rows of 4 bytes, 32-bit aligned, r2 apart
@   r1 - 16 residuals of 16 bits each, 128-bit aligned
@   r2 - distance between destination rows in bytes
@ Each sum is saturated to the unsigned 8-bit range when it is narrowed.
function ff_hevc_add_residual_4x4_8_neon, export=1
        mov             r12, r0                    @ r12 walks the rows for reading
        vld1.16         {d0-d3}, [r1, :128]        @ q0 = residual rows 0-1, q1 = rows 2-3
        vld1.32         {d4[0]}, [r12, :32], r2
        vld1.32         {d4[1]}, [r12, :32], r2
        vld1.32         {d5[0]}, [r12, :32], r2
        vld1.32         {d5[1]}, [r12, :32], r2
        vmovl.u8        q8,  d4                    @ widen the pixels to 16 bits
        vmovl.u8        q9,  d5
        vqadd.s16       q0,  q0,  q8
        vqadd.s16       q1,  q1,  q9
        vqmovun.s16     d0,  q0                    @ saturate to 0..255 and narrow
        vqmovun.s16     d1,  q1
        vst1.32         {d0[0]}, [r0, :32], r2     @ r0 walks the rows for writing
        vst1.32         {d0[1]}, [r0, :32], r2
        vst1.32         {d1[0]}, [r0, :32], r2
        vst1.32         {d1[1]}, [r0, :32], r2
        bx              lr
endfunc
62cabdff1aSopenharmony_ci
@ Add a 4x4 block of 16-bit residuals to 16-bit pixels, in place, and clamp
@ the result to the range 0..0x3FF.
@   r0 - destination pixels: four rows of 8 bytes, 64-bit aligned, r2 apart
@   r1 - 16 residuals of 16 bits each, 128-bit aligned
@   r2 - distance between destination rows in bytes
function ff_hevc_add_residual_4x4_10_neon, export=1
        vmov.i16        q12, #0                    @ lower clamp bound
        vmvn.i16        q13, #0xFC00               @ upper clamp bound, 0x03FF
        mov             r12, r0                    @ r12 reads, r0 writes
        vld1.16         {d0-d3}, [r1, :128]
        vld1.16         {d4}, [r12, :64], r2
        vld1.16         {d5}, [r12, :64], r2
        vld1.16         {d6}, [r12, :64], r2
        vld1.16         {d7}, [r12, :64], r2
        vqadd.s16       q0,  q0,  q2
        vqadd.s16       q1,  q1,  q3
        clip10          q0, q1, q12, q13
        vst1.16         {d0}, [r0, :64], r2
        vst1.16         {d1}, [r0, :64], r2
        vst1.16         {d2}, [r0, :64], r2
        vst1.16         {d3}, [r0, :64], r2
        bx              lr
endfunc
81cabdff1aSopenharmony_ci
@ Add an 8x8 block of 16-bit residuals to 8-bit pixels, in place.
@   r0 - destination pixels: eight rows of 8 bytes, 64-bit aligned, r2 apart
@   r1 - 64 residuals of 16 bits each, 128-bit aligned, read sequentially
@   r2 - distance between destination rows in bytes
@ Two rows are handled per iteration: r0 walks the even rows, r12 the odd ones.
function ff_hevc_add_residual_8x8_8_neon, export=1
        mov             r3,  #8                    @ rows left to process
        add             r12, r0,  r2               @ r12 = first odd row
        lsl             r2,  r2,  #1               @ advance two rows per iteration
1:      subs            r3,  r3,  #2
        vld1.16         {d0-d3}, [r1, :128]!       @ q0 = even-row residuals, q1 = odd-row
        vld1.8          {d4}, [r0,  :64]
        vld1.8          {d5}, [r12, :64]
        vmovl.u8        q8,  d4                    @ widen the pixels to 16 bits
        vmovl.u8        q9,  d5
        vqadd.s16       q0,  q0,  q8
        vqadd.s16       q1,  q1,  q9
        vqmovun.s16     d0,  q0                    @ saturate to 0..255 and narrow
        vqmovun.s16     d1,  q1
        vst1.8          {d0}, [r0,  :64], r2
        vst1.8          {d1}, [r12, :64], r2
        bne             1b
        bx              lr
endfunc
101cabdff1aSopenharmony_ci
@ Add an 8x8 block of 16-bit residuals to 16-bit pixels, in place, and clamp
@ the result to the range 0..0x3FF.
@   r0 - destination pixels: eight rows of 16 bytes, 128-bit aligned, r2 apart
@   r1 - 64 residuals of 16 bits each, 128-bit aligned, read sequentially
@   r2 - distance between destination rows in bytes
@ Two rows are handled per iteration: r0 walks the even rows, r12 the odd ones.
function ff_hevc_add_residual_8x8_10_neon, export=1
        vmov.i16        q12, #0                    @ lower clamp bound
        vmvn.i16        q13, #0xFC00               @ upper clamp bound, 0x03FF
        mov             r3,  #8                    @ rows left to process
        add             r12, r0,  r2               @ r12 = first odd row
        lsl             r2,  r2,  #1               @ advance two rows per iteration
1:      subs            r3,  r3,  #2
        vld1.16         {d16-d17}, [r0,  :128]
        vld1.16         {d18-d19}, [r12, :128]
        vld1.16         {d0-d3},   [r1,  :128]!
        vqadd.s16       q0,  q0,  q8
        vqadd.s16       q1,  q1,  q9
        clip10          q0, q1, q12, q13
        vst1.16         {d0-d1}, [r0,  :128], r2
        vst1.16         {d2-d3}, [r12, :128], r2
        bne             1b
        bx              lr
endfunc
120cabdff1aSopenharmony_ci
@ Add a 16x16 block of 16-bit residuals to 8-bit pixels, in place.
@   r0 - destination pixels: sixteen rows of 16 bytes, 128-bit aligned, r2 apart
@   r1 - 256 residuals of 16 bits each, 128-bit aligned, read sequentially
@   r2 - distance between destination rows in bytes
@ Two rows are handled per iteration: r0 walks the even rows, r12 the odd ones.
function ff_hevc_add_residual_16x16_8_neon, export=1
        mov             r3,  #16                   @ rows left to process
        add             r12, r0,  r2               @ r12 = first odd row
        lsl             r2,  r2,  #1               @ advance two rows per iteration
1:      subs            r3,  r3,  #2
        vld1.8          {d16-d17}, [r0,  :128]     @ even-row pixels
        vld1.8          {d18-d19}, [r12, :128]     @ odd-row pixels
        vld1.16         {d0-d3},   [r1,  :128]!    @ even-row residuals
        vld1.16         {d4-d7},   [r1,  :128]!    @ odd-row residuals
        vmovl.u8        q10, d16
        vmovl.u8        q11, d17
        vmovl.u8        q12, d18
        vmovl.u8        q13, d19
        vqadd.s16       q0,  q0,  q10
        vqadd.s16       q1,  q1,  q11
        vqadd.s16       q2,  q2,  q12
        vqadd.s16       q3,  q3,  q13
        @ Narrow in ascending order so each source is read before the
        @ destination overlapping it is written.
        vqmovun.s16     d0,  q0
        vqmovun.s16     d1,  q1
        vqmovun.s16     d2,  q2
        vqmovun.s16     d3,  q3
        vst1.8          {d0-d1}, [r0,  :128], r2
        vst1.8          {d2-d3}, [r12, :128], r2
        bne             1b
        bx              lr
endfunc
147cabdff1aSopenharmony_ci
@ Add a 16x16 block of 16-bit residuals to 16-bit pixels, in place, and clamp
@ the result to the range 0..0x3FF.
@   r0 - destination pixels: sixteen rows of 32 bytes, 128-bit aligned, r2 apart
@   r1 - 256 residuals of 16 bits each, 128-bit aligned, read sequentially
@   r2 - distance between destination rows in bytes
@ Two rows are handled per iteration: r0 walks the even rows, r12 the odd ones.
function ff_hevc_add_residual_16x16_10_neon, export=1
        vmov.i16        q12, #0                    @ lower clamp bound
        vmvn.i16        q13, #0xFC00               @ upper clamp bound, 0x03FF
        mov             r3,  #16                   @ rows left to process
        add             r12, r0,  r2               @ r12 = first odd row
        lsl             r2,  r2,  #1               @ advance two rows per iteration
1:      subs            r3,  r3,  #2
        vld1.16         {d16-d19}, [r0,  :128]     @ even-row pixels
        vld1.16         {d20-d23}, [r12, :128]     @ odd-row pixels
        vld1.16         {d0-d3},   [r1,  :128]!    @ even-row residuals
        vld1.16         {d4-d7},   [r1,  :128]!    @ odd-row residuals
        vqadd.s16       q0,  q0,  q8
        vqadd.s16       q1,  q1,  q9
        vqadd.s16       q2,  q2,  q10
        vqadd.s16       q3,  q3,  q11
        clip10          q0, q1, q12, q13
        clip10          q2, q3, q12, q13
        vst1.16         {d0-d3}, [r0,  :128], r2
        vst1.16         {d4-d7}, [r12, :128], r2
        bne             1b
        bx              lr
endfunc
170cabdff1aSopenharmony_ci
@ Add a 32x32 block of 16-bit residuals to 8-bit pixels, in place.
@   r0 - destination pixels: 32 rows of 32 bytes, 128-bit aligned, r2 apart
@   r1 - 1024 residuals of 16 bits each, read sequentially
@   r2 - distance between destination rows in bytes
@ Two rows are handled per iteration: r0 walks the even rows, r12 the odd ones.
@ q4-q7 are used as residual registers, so they are saved and restored here.
function ff_hevc_add_residual_32x32_8_neon, export=1
        vpush           {q4-q7}
        mov             r3,  #32                   @ rows left to process
        add             r12, r0,  r2               @ r12 = first odd row
        lsl             r2,  r2,  #1               @ advance two rows per iteration
1:      subs            r3,  r3,  #2
        vldm            r1!, {q0-q7}               @ q0-q3 = even-row, q4-q7 = odd-row residuals
        vld1.8          {d16-d19}, [r0,  :128]     @ even-row pixels
        vld1.8          {d20-d23}, [r12, :128]     @ odd-row pixels

        @ even row: widen the pixels and add
        vmovl.u8        q12, d16
        vmovl.u8        q13, d17
        vmovl.u8        q14, d18
        vmovl.u8        q15, d19
        vqadd.s16       q0,  q0,  q12
        vqadd.s16       q1,  q1,  q13
        vqadd.s16       q2,  q2,  q14
        vqadd.s16       q3,  q3,  q15

        @ odd row: widen the pixels and add
        vmovl.u8        q12, d20
        vmovl.u8        q13, d21
        vmovl.u8        q14, d22
        vmovl.u8        q15, d23
        vqadd.s16       q4,  q4,  q12
        vqadd.s16       q5,  q5,  q13
        vqadd.s16       q6,  q6,  q14
        vqadd.s16       q7,  q7,  q15

        @ Narrow the even row into d0-d3 first; q2 and q3 (d4-d7) are
        @ consumed before d4-d7 receive the odd row.
        vqmovun.s16     d0,  q0
        vqmovun.s16     d1,  q1
        vqmovun.s16     d2,  q2
        vqmovun.s16     d3,  q3
        vst1.8          {d0-d3}, [r0,  :128], r2
        vqmovun.s16     d4,  q4
        vqmovun.s16     d5,  q5
        vqmovun.s16     d6,  q6
        vqmovun.s16     d7,  q7
        vst1.8          {d4-d7}, [r12, :128], r2
        bne             1b
        vpop            {q4-q7}
        bx              lr
endfunc
210cabdff1aSopenharmony_ci
@ Add a 32x32 block of 16-bit residuals to 16-bit pixels, in place, and clamp
@ the result to the range 0..0x3FF.
@   r0 - destination pixels: 32 rows of 64 bytes, 128-bit aligned, r2 apart
@   r1 - 1024 residuals of 16 bits each, read sequentially
@   r2 - distance between destination rows in bytes
@ One row is handled per iteration: r0 addresses its first 32 bytes and r12
@ its last 32 bytes.
function ff_hevc_add_residual_32x32_10_neon, export=1
        vmov.i16        q12, #0                    @ lower clamp bound
        vmvn.i16        q13, #0xFC00               @ upper clamp bound, 0x03FF
        mov             r3,  #32                   @ rows left to process
        add             r12, r0,  #32              @ second half of the row
1:      subs            r3,  r3,  #1
        vld1.16         {d16-d19}, [r0,  :128]
        vld1.16         {d20-d23}, [r12, :128]
        vldm            r1!, {q0-q3}               @ residuals for the whole row
        vqadd.s16       q0,  q0,  q8
        vqadd.s16       q1,  q1,  q9
        vqadd.s16       q2,  q2,  q10
        vqadd.s16       q3,  q3,  q11
        clip10          q0, q1, q12, q13
        clip10          q2, q3, q12, q13
        vst1.16         {d0-d3}, [r0,  :128], r2
        vst1.16         {d4-d7}, [r12, :128], r2
        bne             1b
        bx              lr
endfunc
231cabdff1aSopenharmony_ci
@ Expands to ff_hevc_idct_4x4_dc_<bitdepth>_neon.
@   r0 - 16 coefficients of 16 bits each, 128-bit aligned
@ Only the first coefficient c is read (sign-extended). All 16 coefficients
@ are then overwritten with
@   (((c + 1) >> 1) + (1 << (13 - bitdepth))) >> (14 - bitdepth)
.macro idct_4x4_dc bitdepth
function ff_hevc_idct_4x4_dc_\bitdepth\()_neon, export=1
        ldrsh           r1,  [r0]
        add             r1,  r1,  #1
        asr             r1,  r1,  #1
        add             r1,  r1,  #(1 << (13 - \bitdepth))  @ rounding term
        asr             r1,  r1,  #(14 - \bitdepth)
        vdup.16         q0,  r1
        vmov            q1,  q0
        vst1.16         {d0-d3}, [r0, :128]
        bx              lr
endfunc
.endm
246cabdff1aSopenharmony_ci
@ Expands to ff_hevc_idct_8x8_dc_<bitdepth>_neon.
@   r0 - 64 coefficients of 16 bits each (128 bytes)
@ Only the first coefficient c is read (sign-extended). All 64 coefficients
@ are then overwritten with
@   (((c + 1) >> 1) + (1 << (13 - bitdepth))) >> (14 - bitdepth)
.macro idct_8x8_dc bitdepth
function ff_hevc_idct_8x8_dc_\bitdepth\()_neon, export=1
        ldrsh           r1,  [r0]
        add             r1,  r1,  #1
        asr             r1,  r1,  #1
        add             r1,  r1,  #(1 << (13 - \bitdepth))  @ rounding term
        asr             r1,  r1,  #(14 - \bitdepth)
        vdup.16         q8,  r1
        vmov            q9,  q8
        vmov            q10, q8
        vmov            q11, q8
        vmov            q12, q8
        vmov            q13, q8
        vmov            q14, q8
        vmov            q15, q8
        vstm            r0, {q8-q15}               @ 128 bytes in one store
        bx              lr
endfunc
.endm
267cabdff1aSopenharmony_ci
@ Expands to ff_hevc_idct_16x16_dc_<bitdepth>_neon.
@   r0 - 256 coefficients of 16 bits each (512 bytes)
@ Only the first coefficient c is read (sign-extended). All 256 coefficients
@ are then overwritten with
@   (((c + 1) >> 1) + (1 << (13 - bitdepth))) >> (14 - bitdepth)
.macro idct_16x16_dc bitdepth
function ff_hevc_idct_16x16_dc_\bitdepth\()_neon, export=1
        ldrsh           r1,  [r0]
        add             r1,  r1,  #1
        asr             r1,  r1,  #1
        add             r1,  r1,  #(1 << (13 - \bitdepth))  @ rounding term
        asr             r1,  r1,  #(14 - \bitdepth)
        vdup.16         q8,  r1
        vmov            q9,  q8
        vmov            q10, q8
        vmov            q11, q8
        vmov            q12, q8
        vmov            q13, q8
        vmov            q14, q8
        vmov            q15, q8
        @ Four stores of 128 bytes each; the last one leaves r0 unchanged.
  .rept 3
        vstm            r0!, {q8-q15}
  .endr
        vstm            r0, {q8-q15}
        bx              lr
endfunc
.endm
291cabdff1aSopenharmony_ci
@ Expands to ff_hevc_idct_32x32_dc_<bitdepth>_neon.
@   r0 - 1024 coefficients of 16 bits each (2048 bytes)
@ Only the first coefficient c is read (sign-extended). All 1024 coefficients
@ are then overwritten with
@   (((c + 1) >> 1) + (1 << (13 - bitdepth))) >> (14 - bitdepth)
.macro idct_32x32_dc bitdepth
function ff_hevc_idct_32x32_dc_\bitdepth\()_neon, export=1
        ldrsh           r1,  [r0]
        add             r1,  r1,  #1
        asr             r1,  r1,  #1
        add             r1,  r1,  #(1 << (13 - \bitdepth))  @ rounding term
        asr             r1,  r1,  #(14 - \bitdepth)
        vdup.16         q8,  r1
        vmov            q9,  q8
        vmov            q10, q8
        vmov            q11, q8
        vmov            q12, q8
        vmov            q13, q8
        vmov            q14, q8
        vmov            q15, q8
        mov             r3,  #16                   @ 16 stores of 128 bytes
1:      vstm            r0!, {q8-q15}
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc
.endm
315cabdff1aSopenharmony_ci
@ Widening signed 16-bit multiply-accumulate with a selectable sign.
@   out - 32-bit accumulator vector
@   in  - 16-bit input vector
@   c   - 16-bit multiplier (the callers in this file pass a scalar lane)
@   op  - "+" adds the product to out; anything else subtracts it
.macro sum_sub out, in, c, op
  .ifnc \op, +
        vmlsl.s16       \out, \in, \c
  .else
        vmlal.s16       \out, \in, \c
  .endif
.endm
323cabdff1aSopenharmony_ci
@ One 4-point inverse-transform pass over the four 16-bit lanes of in0-in3.
@   in0-in3   - d registers holding the four input rows
@   out0-out3 - d registers receiving the four output rows
@   shift     - rounding right shift applied when narrowing to 16 bits
@   tmp0-tmp4 - q scratch registers
@ d4 supplies the multipliers through lanes 0, 1 and 3; idct_4x4 loads it
@ from the start of the trans table. With c0 = d4[0], c1 = d4[1], c3 = d4[3]:
@   e0 = 64 * in0 + c0 * in2        o0 = c1 * in1 + c3 * in3
@   e1 = 64 * in0 - c0 * in2        o1 = c3 * in1 - c1 * in3
@   out0 = e0 + o0, out1 = e1 + o1, out2 = e1 - o1, out3 = e0 - o0
@ Each output is rounded, shifted right by shift and saturated to 16 bits.
@ tmp4 is first written only after the last read of the inputs, so it may
@ overlap in0/in1; idct_4x4 relies on this by passing q0 as tmp4.
.macro tr_4x4 in0, in1, in2, in3, out0, out1, out2, out3, shift, tmp0, tmp1, tmp2, tmp3, tmp4
         vshll.s16      \tmp0, \in0, #6
         vmull.s16      \tmp2, \in1, d4[1]
         vmov           \tmp1, \tmp0
         vmull.s16      \tmp3, \in1, d4[3]
         vmlal.s16      \tmp0, \in2, d4[0] @e0
         vmlsl.s16      \tmp1, \in2, d4[0] @e1
         vmlal.s16      \tmp2, \in3, d4[3] @o0
         vmlsl.s16      \tmp3, \in3, d4[1] @o1

         vadd.s32       \tmp4, \tmp0, \tmp2
         vsub.s32       \tmp0, \tmp0, \tmp2
         vadd.s32       \tmp2, \tmp1, \tmp3
         vsub.s32       \tmp1, \tmp1, \tmp3
         vqrshrn.s32    \out0, \tmp4, #\shift
         vqrshrn.s32    \out3, \tmp0, #\shift
         vqrshrn.s32    \out1, \tmp2, #\shift
         vqrshrn.s32    \out2, \tmp1, #\shift
.endm
343cabdff1aSopenharmony_ci
@ 4-point inverse-transform pass that keeps its results at 32 bits. It is
@ used as the even half of the larger transforms (see tr_8x4 and tr16_8x4).
@   in0-in3   - d registers holding the four 16-bit input rows
@   out0-out3 - q registers receiving the unshifted 32-bit results
@   tmp0-tmp3 - q scratch registers
@   r1        - points to the coefficients; 16 bytes are read from it and
@               the pointer has its original value again on exit
@ in0 is destroyed: once it has been scaled into tmp0 it is reloaded with the
@ four coefficients at r1 and used as the multiplier register. On exit it
@ holds the four coefficients at r1 + 8, which the callers use next.
@ With c0, c1, c3 being lanes 0, 1 and 3 of the first coefficient row:
@   e0 = 64 * in0 + c0 * in2        o0 = c1 * in1 + c3 * in3
@   e1 = 64 * in0 - c0 * in2        o1 = c3 * in1 - c1 * in3
@   out0 = e0 + o0, out1 = e1 + o1, out2 = e1 - o1, out3 = e0 - o0
.macro tr_4x4_8 in0, in1, in2, in3, out0, out1, out2, out3, tmp0, tmp1, tmp2, tmp3
         vshll.s16      \tmp0, \in0, #6
         vld1.s16       {\in0}, [r1, :64]!
         vmov           \tmp1, \tmp0
         vmull.s16      \tmp2, \in1, \in0[1]
         vmull.s16      \tmp3, \in1, \in0[3]
         vmlal.s16      \tmp0, \in2, \in0[0] @e0
         vmlsl.s16      \tmp1, \in2, \in0[0] @e1
         vmlal.s16      \tmp2, \in3, \in0[3] @o0
         vmlsl.s16      \tmp3, \in3, \in0[1] @o1

         @ fetch the next coefficient row for the caller; r1 is now 8 bytes
         @ past its entry value
         vld1.s16       {\in0}, [r1, :64]

         vadd.s32       \out0, \tmp0, \tmp2
         vadd.s32       \out1, \tmp1, \tmp3
         vsub.s32       \out2, \tmp1, \tmp3
         vsub.s32       \out3, \tmp0, \tmp2

         @ undo the post-increment of the first load
         sub            r1,  r1,  #8
.endm
364cabdff1aSopenharmony_ci
@ Transpose a 4x4 matrix of 16-bit elements held in two q registers.
@ The 32-bit exchange operates on the q registers as a whole; only the
@ 16-bit exchanges need to address the individual d registers.
@ r0,r1 == rq0, r2,r3 == rq1
.macro transpose_4x4 rq0, rq1, r0, r1, r2, r3
        vtrn.32         \rq0, \rq1
        vtrn.16         \r2,  \r3
        vtrn.16         \r0,  \r1
.endm
373cabdff1aSopenharmony_ci
@ Expands to ff_hevc_idct_4x4_<bitdepth>_neon: a two-pass 4x4 inverse
@ transform done in place. Each pass is a tr_4x4 followed by a transpose;
@ the first pass shifts by 7, the second by 20 - bitdepth.
.macro idct_4x4 bitdepth
function ff_hevc_idct_4x4_\bitdepth\()_neon, export=1
@r0 - coeffs
        movrel          r1, trans
        vld1.16         {d4}, [r1, :64]            @ multipliers read by tr_4x4
        vld1.16         {d0-d3}, [r0, :128]        @ the four input rows

        @ first pass: d0-d3 -> d16-d19
        tr_4x4          d0, d1, d2, d3, d16, d17, d18, d19, 7, q10, q11, q12, q13, q0
        transpose_4x4   q8, q9, d16, d17, d18, d19

        @ second pass: d16-d19 -> d0-d3
        tr_4x4          d16, d17, d18, d19, d0, d1, d2, d3, 20 - \bitdepth, q10, q11, q12, q13, q0
        transpose_4x4   q0, q1, d0, d1, d2, d3

        vst1.16         {d0-d3}, [r0, :128]
        bx              lr
endfunc
.endm
391cabdff1aSopenharmony_ci
@ Transpose a 4x4 matrix of 16-bit elements whose rows are the four d
@ registers r0-r3. The registers must all be distinct.
.macro transpose8_4x4 r0, r1, r2, r3
        @ exchange 16-bit elements within each pair of adjacent rows
        vtrn.16         \r2,  \r3
        vtrn.16         \r0,  \r1
        @ exchange 32-bit elements between the two pairs
        vtrn.32         \r1,  \r3
        vtrn.32         \r0,  \r2
.endm
398cabdff1aSopenharmony_ci
@ Transpose each of four 4x4 blocks of 16-bit elements independently.
@ The blocks are r0-r3, r4-r7, l0-l3 and l4-l7, one d register per row.
@ Whole blocks are not moved between registers; the caller accounts for
@ that by changing which registers it treats as which part of the matrix
@ (see the layout comments in idct_8x8).
.macro transpose_8x8 r0, r1, r2, r3, r4, r5, r6, r7, l0, l1, l2, l3, l4, l5, l6, l7
        transpose8_4x4  \l0, \l1, \l2, \l3
        transpose8_4x4  \l4, \l5, \l6, \l7

        transpose8_4x4  \r0, \r1, \r2, \r3
        transpose8_4x4  \r4, \r5, \r6, \r7
.endm
406cabdff1aSopenharmony_ci
@ One 8-point inverse-transform pass over the four 16-bit lanes of in0-in7,
@ done in place: on exit in0-in7 hold output rows 0-7.
@   shift   - rounding right shift applied when narrowing to 16 bits
@   in0-in7 - d registers holding the eight input rows
@   r1      - points to the coefficients (idct_8x8 passes trans); unchanged
@             on exit
@ Clobbers q8-q15.
@
@ The even rows go through tr_4x4_8, which leaves the even-half results
@ E0-E3 in q8-q11 and the second coefficient row in in0. With c0-c3 being
@ the lanes of that row, the odd half is
@   O0 (q12) = c0 * in1 + c1 * in3 + c2 * in5 + c3 * in7
@   O1 (q13) = c1 * in1 - c3 * in3 - c0 * in5 - c2 * in7
@   O2 (q14) = c2 * in1 - c0 * in3 + c3 * in5 + c1 * in7
@   O3 (q15) = c3 * in1 - c2 * in3 + c1 * in5 - c0 * in7
@ and row k = Ek + Ok, row 7 - k = Ek - Ok, each rounded, shifted right by
@ shift and saturated to 16 bits.
.macro tr_8x4 shift, in0, in1, in2, in3, in4, in5, in6, in7
        tr_4x4_8        \in0, \in2, \in4, \in6, q8, q9, q10, q11, q12, q13, q14, q15

        @ from here on in0 holds coefficients, not input data
        vmull.s16       q14, \in1, \in0[2]
        vmull.s16       q12, \in1, \in0[0]
        vmull.s16       q13, \in1, \in0[1]
        sum_sub         q14, \in3, \in0[0], -
        sum_sub         q12, \in3, \in0[1], +
        sum_sub         q13, \in3, \in0[3], -

        sum_sub         q14, \in5, \in0[3], +
        sum_sub         q12, \in5, \in0[2], +
        sum_sub         q13, \in5, \in0[0], -

        sum_sub         q14, \in7, \in0[1], +
        sum_sub         q12, \in7, \in0[3], +
        sum_sub         q13, \in7, \in0[2], -

        @ Row 2 is finished early to free q15 for O3. Overwriting in2 is
        @ safe because the even rows were consumed by tr_4x4_8.
        vadd.s32        q15, q10, q14
        vsub.s32        q10, q10, q14
        vqrshrn.s32     \in2, q15, \shift

        vmull.s16       q15, \in1, \in0[3]
        sum_sub         q15, \in3, \in0[2], -
        sum_sub         q15, \in5, \in0[1], +
        sum_sub         q15, \in7, \in0[0], -

        @ Row 5 may only be written now, after the last read of in5 above.
        vqrshrn.s32     \in5, q10,  \shift

        vadd.s32        q10, q8, q12
        vsub.s32        q8,  q8, q12
        vadd.s32        q12, q9, q13
        vsub.s32        q9,  q9, q13
        vadd.s32        q14, q11, q15
        vsub.s32        q11, q11, q15

        vqrshrn.s32     \in0, q10, \shift
        vqrshrn.s32     \in7, q8,  \shift
        vqrshrn.s32     \in1, q12, \shift
        vqrshrn.s32     \in6, q9,  \shift
        vqrshrn.s32     \in3, q14, \shift
        vqrshrn.s32     \in4, q11, \shift
.endm
450cabdff1aSopenharmony_ci
@ Expands to ff_hevc_idct_8x8_<bitdepth>_neon: a two-pass 8x8 inverse
@ transform done in place on the 128 bytes at r0 (128-bit aligned).
@ Each pass runs tr_8x4 twice, once per group of four lanes, and is
@ followed by transpose_8x8. The first pass shifts by 7, the second by
@ 20 - bitdepth.
@ q4-q7 are used as data registers, so they are saved and restored here.
@ r1-r3 and q8-q15 are clobbered.
.macro idct_8x8 bitdepth
function ff_hevc_idct_8x8_\bitdepth\()_neon, export=1
@r0 - coeffs
        vpush           {q4-q7}

        @ Load the block in 32-byte pieces: r1 covers byte offsets 0 and 64,
        @ r3 covers 32 and 96, so q0-q7 hold the data in memory order.
        mov             r1,  r0
        mov             r2,  #64
        add             r3,  r0,  #32
        vld1.s16        {q0-q1}, [r1,:128], r2
        vld1.s16        {q2-q3}, [r3,:128], r2
        vld1.s16        {q4-q5}, [r1,:128], r2
        vld1.s16        {q6-q7}, [r3,:128], r2

        @ tr_8x4 reads its coefficients through r1
        movrel          r1, trans

        @ first pass: even d registers, then odd d registers
        tr_8x4          7, d0, d2, d4, d6, d8, d10, d12, d14
        tr_8x4          7, d1, d3, d5, d7, d9, d11, d13, d15

        @ Transpose each 4x4 block, and swap how d4-d7 and d8-d11 are used.
        @ Layout before:
        @ d0  d1
        @ d2  d3
        @ d4  d5
        @ d6  d7
        @ d8  d9
        @ d10 d11
        @ d12 d13
        @ d14 d15
        transpose_8x8   d0, d2, d4, d6, d8, d10, d12, d14, d1, d3, d5, d7, d9, d11, d13, d15
        @ Now the layout is:
        @ d0  d8
        @ d2  d10
        @ d4  d12
        @ d6  d14
        @ d1  d9
        @ d3  d11
        @ d5  d13
        @ d7  d15

        @ second pass over the new layout
        tr_8x4          20 - \bitdepth, d0, d2, d4, d6, d1, d3, d5, d7
        @ tr_8x4 reloads its in0 operand with coefficients and uses it as
        @ the by-scalar multiplier, so d8 is processed through d0 and the
        @ two registers are swapped back afterwards.
        @ NOTE(review): presumably needed because 16-bit by-scalar operands
        @ must be in d0-d7 - confirm against the architecture manual.
        vswp            d0, d8
        tr_8x4          20 - \bitdepth, d0, d10, d12, d14, d9, d11, d13, d15
        vswp            d0, d8

        transpose_8x8   d0, d2, d4, d6, d8, d10, d12, d14, d1, d3, d5, d7, d9, d11, d13, d15

        @ store with the same addressing pattern as the load
        mov             r1,  r0
        mov             r2,  #64
        add             r3,  r0,  #32
        vst1.s16        {q0-q1}, [r1,:128], r2
        vst1.s16        {q2-q3}, [r3,:128], r2
        vst1.s16        {q4-q5}, [r1,:128], r2
        vst1.s16        {q6-q7}, [r3,:128], r2

        vpop            {q4-q7}
        bx              lr
endfunc
.endm
509cabdff1aSopenharmony_ci
@ 32-bit butterfly: tmp_p = e + o and tmp_m = e - o.
@ tmp_p is written before e and o are read for the difference, so tmp_p
@ must not be the same register as e or o.
.macro butterfly e, o, tmp_p, tmp_m
        vadd.s32        \tmp_p, \e, \o
        vsub.s32        \tmp_m, \e, \o
.endm
514cabdff1aSopenharmony_ci
@ 8-point transform step that keeps 32-bit precision and writes its results
@ to the stack instead of narrowing them.
@   in0-in7 - d registers holding the eight 16-bit input rows; in0 is
@             overwritten with coefficients by tr_4x4_8
@   offset  - byte offset from sp of the 128-byte, 128-bit aligned output
@   r1      - points to the coefficients read by tr_4x4_8; unchanged on exit
@ The arithmetic matches tr_8x4: E0-E3 come from tr_4x4_8 in q8-q11 and
@ O0-O3 are accumulated in q12-q15. The eight results Ek + Ok (q0-q3) and
@ Ek - Ok (q7-q4) are stored in the order q0, q1, ..., q7.
@ Clobbers r4, q0-q7 and q8-q15.
@ NOTE(review): the enclosing function is presumably expected to preserve r4
@ and q4-q7 and to reserve the stack area; that code is outside this
@ excerpt - confirm.
.macro tr16_8x4 in0, in1, in2, in3, in4, in5, in6, in7, offset
        tr_4x4_8        \in0, \in2, \in4, \in6, q8, q9, q10, q11, q12, q13, q14, q15

        @ from here on in0 holds coefficients, not input data
        vmull.s16       q12, \in1, \in0[0]
        vmull.s16       q13, \in1, \in0[1]
        vmull.s16       q14, \in1, \in0[2]
        vmull.s16       q15, \in1, \in0[3]
        sum_sub         q12, \in3, \in0[1], +
        sum_sub         q13, \in3, \in0[3], -
        sum_sub         q14, \in3, \in0[0], -
        sum_sub         q15, \in3, \in0[2], -

        sum_sub         q12, \in5, \in0[2], +
        sum_sub         q13, \in5, \in0[0], -
        sum_sub         q14, \in5, \in0[3], +
        sum_sub         q15, \in5, \in0[1], +

        sum_sub         q12, \in7, \in0[3], +
        sum_sub         q13, \in7, \in0[2], -
        sum_sub         q14, \in7, \in0[1], +
        sum_sub         q15, \in7, \in0[0], -

        @ q0-q7 are only written here, after the last read of the inputs
        butterfly       q8,  q12, q0, q7
        butterfly       q9,  q13, q1, q6
        butterfly       q10, q14, q2, q5
        butterfly       q11, q15, q3, q4
        add             r4,  sp,  #\offset
        vst1.s32        {q0-q1}, [r4, :128]!
        vst1.s32        {q2-q3}, [r4, :128]!
        vst1.s32        {q4-q5}, [r4, :128]!
        vst1.s32        {q6-q7}, [r4, :128]
.endm
547cabdff1aSopenharmony_ci
@ load16: load eight d registers through two interleaved row pointers.
@ in0/in2/in4/in6 are read through r1 and in1/in3/in5/in7 through r3. Both
@ pointers are post-incremented by r2 after every load, so each ends up four
@ strides past its starting value. The :64 hint requires 8-byte aligned
@ addresses.
548cabdff1aSopenharmony_ci.macro load16 in0, in1, in2, in3, in4, in5, in6, in7
549cabdff1aSopenharmony_ci        vld1.s16        {\in0}, [r1, :64], r2
550cabdff1aSopenharmony_ci        vld1.s16        {\in1}, [r3, :64], r2
551cabdff1aSopenharmony_ci        vld1.s16        {\in2}, [r1, :64], r2
552cabdff1aSopenharmony_ci        vld1.s16        {\in3}, [r3, :64], r2
553cabdff1aSopenharmony_ci        vld1.s16        {\in4}, [r1, :64], r2
554cabdff1aSopenharmony_ci        vld1.s16        {\in5}, [r3, :64], r2
555cabdff1aSopenharmony_ci        vld1.s16        {\in6}, [r1, :64], r2
556cabdff1aSopenharmony_ci        vld1.s16        {\in7}, [r3, :64], r2
557cabdff1aSopenharmony_ci.endm
558cabdff1aSopenharmony_ci
@ add_member: fold one input row into the eight accumulators q5-q12.
@   in        - d register holding the row
@   t0..t7    - coefficient scalar (d register lane) for q5..q12 respectively
@   op0..op7  - + or - for each accumulator, forwarded to sum_sub
@ sum_sub is defined elsewhere in this file; presumably it adds (+) or
@ subtracts (-) the widened product in * t to or from the accumulator.
559cabdff1aSopenharmony_ci.macro add_member in, t0, t1, t2, t3, t4, t5, t6, t7, op0, op1, op2, op3, op4, op5, op6, op7
560cabdff1aSopenharmony_ci        sum_sub q5,     \in, \t0, \op0
561cabdff1aSopenharmony_ci        sum_sub q6,     \in, \t1, \op1
562cabdff1aSopenharmony_ci        sum_sub q7,     \in, \t2, \op2
563cabdff1aSopenharmony_ci        sum_sub q8,     \in, \t3, \op3
564cabdff1aSopenharmony_ci        sum_sub q9,     \in, \t4, \op4
565cabdff1aSopenharmony_ci        sum_sub q10,    \in, \t5, \op5
566cabdff1aSopenharmony_ci        sum_sub q11,    \in, \t6, \op6
567cabdff1aSopenharmony_ci        sum_sub q12,    \in, \t7, \op7
568cabdff1aSopenharmony_ci.endm
569cabdff1aSopenharmony_ci
@ butterfly16: four sum/difference pairs over (in0,in1) (in2,in3) (in4,in5)
@ (in6,in7), computed in place so that only q4 is needed as an extra register.
@ With every right-hand side taken as the value on entry:
@           q4  = in0 + in1    in0 = in0 - in1
@           in1 = in2 + in3    in2 = in2 - in3
@           in3 = in4 + in5    in4 = in4 - in5
@           in5 = in6 + in7    in6 = in6 - in7
@ Each odd operand is overwritten only after its own pair has consumed it, so
@ the instruction order must be kept. q4 is clobbered; in7 is left unchanged.
570cabdff1aSopenharmony_ci.macro butterfly16 in0, in1, in2, in3, in4, in5, in6, in7
571cabdff1aSopenharmony_ci        vadd.s32        q4, \in0, \in1
572cabdff1aSopenharmony_ci        vsub.s32        \in0, \in0, \in1
573cabdff1aSopenharmony_ci        vadd.s32        \in1, \in2, \in3
574cabdff1aSopenharmony_ci        vsub.s32        \in2, \in2, \in3
575cabdff1aSopenharmony_ci        vadd.s32        \in3, \in4, \in5
576cabdff1aSopenharmony_ci        vsub.s32        \in4, \in4, \in5
577cabdff1aSopenharmony_ci        vadd.s32        \in5, \in6, \in7
578cabdff1aSopenharmony_ci        vsub.s32        \in6, \in6, \in7
579cabdff1aSopenharmony_ci.endm
580cabdff1aSopenharmony_ci
@ store16: store eight operands through two interleaved pointers.
@ in0/in2/in4/in6 go through r1, post-incremented by r2; in1/in3/in5/in7 go
@ through r3, post-incremented by the register passed as rx. The callers in
@ this file pass single d registers and a negative rx so that r3 walks
@ backwards. The :64 hint requires 8-byte aligned addresses.
581cabdff1aSopenharmony_ci.macro store16 in0, in1, in2, in3, in4, in5, in6, in7, rx
582cabdff1aSopenharmony_ci        vst1.s16        \in0, [r1, :64], r2
583cabdff1aSopenharmony_ci        vst1.s16        \in1, [r3, :64], \rx
584cabdff1aSopenharmony_ci        vst1.s16        \in2, [r1, :64], r2
585cabdff1aSopenharmony_ci        vst1.s16        \in3, [r3, :64], \rx
586cabdff1aSopenharmony_ci        vst1.s16        \in4, [r1, :64], r2
587cabdff1aSopenharmony_ci        vst1.s16        \in5, [r3, :64], \rx
588cabdff1aSopenharmony_ci        vst1.s16        \in6, [r1, :64], r2
589cabdff1aSopenharmony_ci        vst1.s16        \in7, [r3, :64], \rx
590cabdff1aSopenharmony_ci.endm
591cabdff1aSopenharmony_ci
@ scale: narrow eight 32-bit q registers to eight 16-bit d registers.
@   out0..out7 - destination d registers
@   in0..in7   - source q registers, paired with out0..out7 in order
@   shift      - right shift amount, passed through to vqrshrn
@ vqrshrn rounds, shifts right and saturates to the signed 16-bit range.
592cabdff1aSopenharmony_ci.macro scale out0, out1, out2, out3, out4, out5, out6, out7, in0, in1, in2, in3, in4, in5, in6, in7, shift
593cabdff1aSopenharmony_ci        vqrshrn.s32     \out0, \in0, \shift
594cabdff1aSopenharmony_ci        vqrshrn.s32     \out1, \in1, \shift
595cabdff1aSopenharmony_ci        vqrshrn.s32     \out2, \in2, \shift
596cabdff1aSopenharmony_ci        vqrshrn.s32     \out3, \in3, \shift
597cabdff1aSopenharmony_ci        vqrshrn.s32     \out4, \in4, \shift
598cabdff1aSopenharmony_ci        vqrshrn.s32     \out5, \in5, \shift
599cabdff1aSopenharmony_ci        vqrshrn.s32     \out6, \in6, \shift
600cabdff1aSopenharmony_ci        vqrshrn.s32     \out7, \in7, \shift
601cabdff1aSopenharmony_ci.endm
602cabdff1aSopenharmony_ci
603cabdff1aSopenharmony_ci@stores in0, in2, in4, in6 ascending from off1 and
604cabdff1aSopenharmony_ci@stores in1, in3, in5, in7 descending from off2
@ off1 and off2 are byte offsets from sp; every store is one q register
@ (16 bytes) and the :128 hint requires 16-byte aligned addresses.
@ Note the parameter order: the even names come first, then the odd names in
@ reverse (in7, in5, in3, in1). Clobbers r1, r2 (set to -16) and r3.
605cabdff1aSopenharmony_ci.macro store_to_stack off1, off2, in0, in2, in4, in6, in7, in5, in3, in1
606cabdff1aSopenharmony_ci        add             r1, sp, #\off1
607cabdff1aSopenharmony_ci        add             r3, sp, #\off2
608cabdff1aSopenharmony_ci        mov             r2, #-16
609cabdff1aSopenharmony_ci        vst1.s32        {\in0}, [r1, :128]!
610cabdff1aSopenharmony_ci        vst1.s32        {\in1}, [r3, :128], r2
611cabdff1aSopenharmony_ci        vst1.s32        {\in2}, [r1, :128]!
612cabdff1aSopenharmony_ci        vst1.s32        {\in3}, [r3, :128], r2
613cabdff1aSopenharmony_ci        vst1.s32        {\in4}, [r1, :128]!
614cabdff1aSopenharmony_ci        vst1.s32        {\in5}, [r3, :128], r2
615cabdff1aSopenharmony_ci        vst1.s32        {\in6}, [r1, :128]
616cabdff1aSopenharmony_ci        vst1.s32        {\in7}, [r3, :128]
617cabdff1aSopenharmony_ci.endm
618cabdff1aSopenharmony_ci
@ tr_16x4: emit function func_tr_16x4_<name>, which processes sixteen input
@ rows of four 16-bit values each.
@   name   - suffix of the generated function
@   shift  - when > 0 the results are rounded and narrowed to 16 bit with this
@            shift, transposed and stored through r6; when 0 the 32-bit
@            results are left on the stack instead
@   offset - byte offset from sp of a 16-byte aligned scratch area
@            (128 bytes are used when shift > 0, 256 bytes when shift is 0)
@   step   - multiplier applied to the input row offsets
@ Register interface of the generated function:
@   r5 - input pointer
@   r6 - output pointer (only read when shift > 0)
@   clobbers r1-r4 and q0-q15 and returns with bx lr; q4-q7 are not saved
@   here, the exported callers in this file push them
619cabdff1aSopenharmony_ci.macro tr_16x4 name, shift, offset, step
620cabdff1aSopenharmony_cifunction func_tr_16x4_\name
        @ first group: rows at byte offsets step * 64 * k from r5, k = 0..7
621cabdff1aSopenharmony_ci        mov             r1,  r5
622cabdff1aSopenharmony_ci        add             r3, r5, #(\step * 64)
623cabdff1aSopenharmony_ci        mov             r2, #(\step * 128)
624cabdff1aSopenharmony_ci        load16          d0, d1, d2, d3, d4, d5, d6, d7
        @ r1 = address of the trans table, left there for tr16_8x4
625cabdff1aSopenharmony_ci        movrel          r1, trans
626cabdff1aSopenharmony_ci
        @ partial result of the first group is spilled to sp + offset (128 bytes)
627cabdff1aSopenharmony_ci        tr16_8x4        d0, d1, d2, d3, d4, d5, d6, d7, \offset
628cabdff1aSopenharmony_ci
        @ second group: rows at byte offsets step * 32 * (2k + 1), k = 0..7.
        @ The first two rows go to d8/d9 because q0 is loaded with
        @ coefficients right below.
629cabdff1aSopenharmony_ci        add             r1,  r5, #(\step * 32)
630cabdff1aSopenharmony_ci        add             r3,  r5, #(\step * 3 *32)
631cabdff1aSopenharmony_ci        mov             r2,  #(\step * 128)
632cabdff1aSopenharmony_ci        load16          d8, d9, d2, d3, d4, d5, d6, d7
        @ q0 = the eight coefficients at trans + 16: 90, 87, 80, 70, 57, 43, 25, 9
633cabdff1aSopenharmony_ci        movrel          r1, trans + 16
634cabdff1aSopenharmony_ci        vld1.s16        {q0}, [r1, :128]
        @ q5..q12 = d8 * coefficient lanes 0..7 (widening 16 -> 32 bit)
635cabdff1aSopenharmony_ci        vmull.s16       q5, d8, d0[0]
636cabdff1aSopenharmony_ci        vmull.s16       q6, d8, d0[1]
637cabdff1aSopenharmony_ci        vmull.s16       q7, d8, d0[2]
638cabdff1aSopenharmony_ci        vmull.s16       q8, d8, d0[3]
639cabdff1aSopenharmony_ci        vmull.s16       q9, d8, d1[0]
640cabdff1aSopenharmony_ci        vmull.s16       q10, d8, d1[1]
641cabdff1aSopenharmony_ci        vmull.s16       q11, d8, d1[2]
642cabdff1aSopenharmony_ci        vmull.s16       q12, d8, d1[3]
643cabdff1aSopenharmony_ci
        @ fold the remaining seven rows of the second group into q5-q12
644cabdff1aSopenharmony_ci        add_member      d9, d0[1], d1[0], d1[3], d1[1], d0[2], d0[0], d0[3], d1[2], +, +, +, -, -, -, -, -
645cabdff1aSopenharmony_ci        add_member      d2, d0[2], d1[3], d0[3], d0[1], d1[2], d1[0], d0[0], d1[1], +, +, -, -, -, +, +, +
646cabdff1aSopenharmony_ci        add_member      d3, d0[3], d1[1], d0[1], d1[3], d0[0], d1[2], d0[2], d1[0], +, -, -, +, +, +, -, -
647cabdff1aSopenharmony_ci        add_member      d4, d1[0], d0[2], d1[2], d0[0], d1[3], d0[1], d1[1], d0[3], +, -, -, +, -, -, +, +
648cabdff1aSopenharmony_ci        add_member      d5, d1[1], d0[0], d1[0], d1[2], d0[1], d0[3], d1[3], d0[2], +, -, +, +, -, +, +, -
649cabdff1aSopenharmony_ci        add_member      d6, d1[2], d0[3], d0[0], d0[2], d1[1], d1[3], d1[0], d0[1], +, -, +, -, +, +, -, +
650cabdff1aSopenharmony_ci        add_member      d7, d1[3], d1[2], d1[1], d1[0], d0[3], d0[2], d0[1], d0[0], +, -, +, -, +, -, +, -
651cabdff1aSopenharmony_ci
        @ q0-q3 = the first four vectors spilled by tr16_8x4
652cabdff1aSopenharmony_ci        add             r4, sp, #\offset
653cabdff1aSopenharmony_ci        vld1.s32        {q0-q1}, [r4, :128]!
654cabdff1aSopenharmony_ci        vld1.s32        {q2-q3}, [r4, :128]!
655cabdff1aSopenharmony_ci
        @ sums end up in q4, q5, q6, q7 and differences in q0, q1, q2, q3
656cabdff1aSopenharmony_ci        butterfly16     q0, q5, q1, q6, q2, q7, q3, q8
657cabdff1aSopenharmony_ci    .if \shift > 0
        @ narrow, transpose sums and differences as two 4x4 blocks, then store
        @ the sums forwards from r6 and the differences backwards from
        @ r6 + 24 + 3 * 32, both with a 32-byte row stride
658cabdff1aSopenharmony_ci        scale           d26, d27, d28, d29, d30, d31, d16, d17, q4, q0, q5, q1, q6, q2, q7, q3, \shift
659cabdff1aSopenharmony_ci        transpose8_4x4  d26, d28, d30, d16
660cabdff1aSopenharmony_ci        transpose8_4x4  d17, d31, d29, d27
661cabdff1aSopenharmony_ci        mov             r1, r6
662cabdff1aSopenharmony_ci        add             r3, r6, #(24 +3*32)
663cabdff1aSopenharmony_ci        mov             r2, #32
664cabdff1aSopenharmony_ci        mov             r4, #-32
665cabdff1aSopenharmony_ci        store16         d26, d27, d28, d29, d30, d31, d16, d17, r4
666cabdff1aSopenharmony_ci    .else
        @ keep 32-bit precision: sums to offset .. offset + 63, differences to
        @ offset + 240 down to offset + 192. The bytes at offset + 64 ..
        @ offset + 127 are not overwritten; they are still needed below.
667cabdff1aSopenharmony_ci        store_to_stack  \offset, (\offset + 240), q4, q5, q6, q7, q3, q2, q1, q0
668cabdff1aSopenharmony_ci    .endif
669cabdff1aSopenharmony_ci
        @ second half: q0-q3 = the last four vectors spilled by tr16_8x4,
        @ combined with accumulators q9-q12
670cabdff1aSopenharmony_ci        add             r4, sp, #(\offset + 64)
671cabdff1aSopenharmony_ci        vld1.s32        {q0-q1}, [r4, :128]!
672cabdff1aSopenharmony_ci        vld1.s32        {q2-q3}, [r4, :128]
673cabdff1aSopenharmony_ci        butterfly16     q0, q9, q1, q10, q2, q11, q3, q12
674cabdff1aSopenharmony_ci    .if \shift > 0
        @ same as above, 8 bytes further in for the sums and 8 bytes earlier
        @ for the differences
675cabdff1aSopenharmony_ci        scale           d26, d27, d28, d29, d30, d31, d8, d9, q4, q0, q9, q1, q10, q2, q11, q3, \shift
676cabdff1aSopenharmony_ci        transpose8_4x4  d26, d28, d30, d8
677cabdff1aSopenharmony_ci        transpose8_4x4  d9, d31, d29, d27
678cabdff1aSopenharmony_ci
679cabdff1aSopenharmony_ci        add             r1, r6, #8
680cabdff1aSopenharmony_ci        add             r3, r6, #(16 + 3 * 32)
681cabdff1aSopenharmony_ci        mov             r2, #32
682cabdff1aSopenharmony_ci        mov             r4, #-32
683cabdff1aSopenharmony_ci        store16         d26, d27, d28, d29, d30, d31, d8, d9, r4
684cabdff1aSopenharmony_ci    .else
        @ sums to offset + 64 .. offset + 127, differences to offset + 176
        @ down to offset + 128
685cabdff1aSopenharmony_ci        store_to_stack (\offset + 64), (\offset + 176), q4, q9, q10, q11, q3, q2, q1, q0
686cabdff1aSopenharmony_ci    .endif
687cabdff1aSopenharmony_ci
688cabdff1aSopenharmony_ci        bx              lr
689cabdff1aSopenharmony_ciendfunc
690cabdff1aSopenharmony_ci.endm
691cabdff1aSopenharmony_ci
@ idct_16x16: emit the exported function ff_hevc_idct_16x16_<bitdepth>_neon.
@   bitdepth - selects func_tr_16x4_secondpass_<bitdepth> for the second pass
@ The function transforms the block at r0 in place in two passes of four
@ calls each, going through a temporary buffer on the stack:
@   bytes   0-511 of the buffer: output of the first pass (4 x 128 bytes)
@   bytes 512-639 of the buffer: scratch of func_tr_16x4_*, which are
@                                instantiated with offset 512 in this file
@ Saves and restores r4-r7, lr and q4-q7.
692cabdff1aSopenharmony_ci.macro idct_16x16 bitdepth
693cabdff1aSopenharmony_cifunction ff_hevc_idct_16x16_\bitdepth\()_neon, export=1
694cabdff1aSopenharmony_ci@r0 - coeffs
695cabdff1aSopenharmony_ci        push            {r4-r7, lr}
696cabdff1aSopenharmony_ci        vpush           {q4-q7}
697cabdff1aSopenharmony_ci
698cabdff1aSopenharmony_ci        @ Align the stack, allocate a temp buffer
        @ r7 = (sp & 15) + 640; 640 is a multiple of 16, so sp - r7 is 16-byte
        @ aligned. r7 is kept until the epilogue to undo the adjustment.
        @ NOTE(review): the T and A line prefixes presumably select the Thumb
        @ and ARM variants via macros from asm.S - confirm there.
699cabdff1aSopenharmony_ciT       mov             r7,  sp
700cabdff1aSopenharmony_ciT       and             r7,  r7,  #15
701cabdff1aSopenharmony_ciA       and             r7,  sp,  #15
702cabdff1aSopenharmony_ci        add             r7,  r7,  #640
703cabdff1aSopenharmony_ci        sub             sp,  sp,  r7
704cabdff1aSopenharmony_ci
        @ first pass: input slice i starts 8 bytes (four coefficients) further
        @ into coeffs, output slice i starts 128 bytes further into the buffer
705cabdff1aSopenharmony_ci.irp i, 0, 1, 2, 3
706cabdff1aSopenharmony_ci        add             r5, r0, #(8 * \i)
707cabdff1aSopenharmony_ci        add             r6, sp, #(8 * \i * 16)
708cabdff1aSopenharmony_ci        bl              func_tr_16x4_firstpass
709cabdff1aSopenharmony_ci.endr
710cabdff1aSopenharmony_ci
        @ second pass: same addressing with the roles swapped, reading the
        @ buffer and writing the final result back to coeffs
711cabdff1aSopenharmony_ci.irp i, 0, 1, 2, 3
712cabdff1aSopenharmony_ci        add             r5, sp, #(8 * \i)
713cabdff1aSopenharmony_ci        add             r6, r0, #(8 * \i * 16)
714cabdff1aSopenharmony_ci        bl              func_tr_16x4_secondpass_\bitdepth
715cabdff1aSopenharmony_ci.endr
716cabdff1aSopenharmony_ci
        @ release the buffer together with the alignment padding
717cabdff1aSopenharmony_ci        add             sp,  sp,  r7
718cabdff1aSopenharmony_ci
719cabdff1aSopenharmony_ci        vpop            {q4-q7}
720cabdff1aSopenharmony_ci        pop             {r4-r7, pc}
721cabdff1aSopenharmony_ciendfunc
722cabdff1aSopenharmony_ci.endm
723cabdff1aSopenharmony_ci
@ load32: load sixteen rows of four 16-bit values into d4-d19.
@ r1 starts at r5 + 64 and r3 at r5 + 192, both stepping by 256 bytes, so the
@ rows come from byte offsets 64 * (2k + 1) from r5, k = 0..15, in ascending
@ order. Clobbers r1, r2, r3 and d4-d19 (q2-q9).
724cabdff1aSopenharmony_ci.macro load32
725cabdff1aSopenharmony_ci        add             r1,  r5, #64
726cabdff1aSopenharmony_ci        add             r3,  r1, #128
727cabdff1aSopenharmony_ci        mov             r2,  #256
728cabdff1aSopenharmony_ci        vld1.s16        {d4}, [r1, :64], r2
729cabdff1aSopenharmony_ci        vld1.s16        {d5}, [r3, :64], r2
730cabdff1aSopenharmony_ci        vld1.s16        {d6}, [r1, :64], r2
731cabdff1aSopenharmony_ci        vld1.s16        {d7}, [r3, :64], r2
732cabdff1aSopenharmony_ci        vld1.s16        {d8}, [r1, :64], r2
733cabdff1aSopenharmony_ci        vld1.s16        {d9}, [r3, :64], r2
734cabdff1aSopenharmony_ci        vld1.s16        {d10}, [r1, :64], r2
735cabdff1aSopenharmony_ci        vld1.s16        {d11}, [r3, :64], r2
736cabdff1aSopenharmony_ci        vld1.s16        {d12}, [r1, :64], r2
737cabdff1aSopenharmony_ci        vld1.s16        {d13}, [r3, :64], r2
738cabdff1aSopenharmony_ci        vld1.s16        {d14}, [r1, :64], r2
739cabdff1aSopenharmony_ci        vld1.s16        {d15}, [r3, :64], r2
740cabdff1aSopenharmony_ci        vld1.s16        {d16}, [r1, :64], r2
741cabdff1aSopenharmony_ci        vld1.s16        {d17}, [r3, :64], r2
742cabdff1aSopenharmony_ci        vld1.s16        {d18}, [r1, :64], r2
743cabdff1aSopenharmony_ci        vld1.s16        {d19}, [r3, :64], r2
744cabdff1aSopenharmony_ci.endm
745cabdff1aSopenharmony_ci
@ add_member32: fold one input row into the four accumulators q10-q13.
@   in        - d register holding the row
@   t0..t3    - coefficient scalar (d register lane) for q10..q13 respectively
@   op0..op3  - + or - for each accumulator, forwarded to sum_sub
@ sum_sub is defined elsewhere in this file; presumably it adds (+) or
@ subtracts (-) the widened product in * t to or from the accumulator.
746cabdff1aSopenharmony_ci.macro add_member32 in, t0, t1, t2, t3, op0, op1, op2, op3
747cabdff1aSopenharmony_ci        sum_sub q10,     \in, \t0, \op0
748cabdff1aSopenharmony_ci        sum_sub q11,     \in, \t1, \op1
749cabdff1aSopenharmony_ci        sum_sub q12,     \in, \t2, \op2
750cabdff1aSopenharmony_ci        sum_sub q13,     \in, \t3, \op3
751cabdff1aSopenharmony_ci.endm
752cabdff1aSopenharmony_ci
@ butterfly32: two sum/difference pairs over (in0,in1) and (in2,in3), in place.
@ With every right-hand side taken as the value on entry:
@           q1  = in0 + in1    in0 = in0 - in1
@           in1 = in2 + in3    in2 = in2 - in3
@ in1 is overwritten only after the first pair has consumed it, so the
@ instruction order must be kept. q1 is clobbered; in3 is left unchanged.
753cabdff1aSopenharmony_ci.macro butterfly32 in0, in1, in2, in3
754cabdff1aSopenharmony_ci        vadd.s32        q1, \in0, \in1
755cabdff1aSopenharmony_ci        vsub.s32        \in0, \in0, \in1
756cabdff1aSopenharmony_ci        vadd.s32        \in1, \in2, \in3
757cabdff1aSopenharmony_ci        vsub.s32        \in2, \in2, \in3
758cabdff1aSopenharmony_ci.endm
759cabdff1aSopenharmony_ci
@ scale32: narrow four 32-bit q registers to four 16-bit d registers.
@   out0..out3 - destination d registers
@   in0..in3   - source q registers, paired with out0..out3 in order
@   shift      - right shift amount, passed through to vqrshrn
@ vqrshrn rounds, shifts right and saturates to the signed 16-bit range.
@ The narrowing runs in argument order, so a destination may overlap a source
@ that has already been consumed (the callers in this file rely on that).
760cabdff1aSopenharmony_ci.macro scale32 out0, out1, out2, out3, in0, in1, in2, in3, shift
761cabdff1aSopenharmony_ci        vqrshrn.s32     \out0, \in0, \shift
762cabdff1aSopenharmony_ci        vqrshrn.s32     \out1, \in1, \shift
763cabdff1aSopenharmony_ci        vqrshrn.s32     \out2, \in2, \shift
764cabdff1aSopenharmony_ci        vqrshrn.s32     \out3, \in3, \shift
765cabdff1aSopenharmony_ci.endm
766cabdff1aSopenharmony_ci
@ multiply: start the four accumulators q10-q13 from the row in d4.
@   in - d register with four 16-bit coefficients; q10..q13 = d4 * lane 0..3
@ The products are widened from 16 to 32 bit.
767cabdff1aSopenharmony_ci.macro multiply in
768cabdff1aSopenharmony_ci        vmull.s16       q10, d4, \in[0]
769cabdff1aSopenharmony_ci        vmull.s16       q11, d4, \in[1]
770cabdff1aSopenharmony_ci        vmull.s16       q12, d4, \in[2]
771cabdff1aSopenharmony_ci        vmull.s16       q13, d4, \in[3]
772cabdff1aSopenharmony_ci.endm
773cabdff1aSopenharmony_ci
@ scale_store: combine the accumulators q10-q13 with four 32-bit vectors read
@ from the stack, narrow, transpose and store the result.
@   shift - right shift amount for the narrowing
@ Expects from the caller:
@   r4     - read pointer into the 32-bit intermediate data, advanced by 64
@   r1, r2 - forward store pointer and its stride
@   r3, r8 - backward store pointer and its (negative) stride
@   r9     - address from which q1 is reloaded at the end
@ q1 is used as butterfly temporary and as narrowing destination (d2, d3),
@ which is why it has to be reloaded. Clobbers q1, q10-q15.
774cabdff1aSopenharmony_ci.macro scale_store shift
775cabdff1aSopenharmony_ci        vld1.s16        {q14-q15}, [r4, :128]!
776cabdff1aSopenharmony_ci        butterfly32     q14, q10, q15, q11
777cabdff1aSopenharmony_ci        scale32         d22, d23, d20, d21, q1, q14, q10, q15, \shift
778cabdff1aSopenharmony_ci
779cabdff1aSopenharmony_ci        vld1.s16        {q14-q15}, [r4, :128]!
780cabdff1aSopenharmony_ci        butterfly32     q14, q12, q15, q13
781cabdff1aSopenharmony_ci        scale32         d2, d3, d28, d29, q1, q14, q12, q15, \shift
        @ d22, d20, d2, d28 hold the narrowed sums and d23, d21, d3, d29 the
        @ narrowed differences; each set is transposed as a 4x4 block
782cabdff1aSopenharmony_ci        transpose8_4x4  d22, d20, d2, d28
783cabdff1aSopenharmony_ci        transpose8_4x4  d29, d3, d21, d23
784cabdff1aSopenharmony_ci        store16         d22, d23, d20, d21, d2, d3, d28, d29, r8
785cabdff1aSopenharmony_ci
786cabdff1aSopenharmony_ci        @ reload multiplication coefficients to q1
787cabdff1aSopenharmony_ci        vld1.s16        {q1}, [r9, :128]
788cabdff1aSopenharmony_ci.endm
789cabdff1aSopenharmony_ci
@ tr_block1: accumulate the sixteen rows held in d4-d19 into q10-q13.
@ The first row (d4) is multiplied by the four lanes of d0; every following
@ add_member32 line folds one more row in, using the listed coefficient lanes
@ of d0-d3 (q0, q1) and signs.
@ Expects d4-d19 and q0-q1 to be set up by the caller (func_tr_32x4_* uses
@ load32 and the table at trans + 32). No loads or stores are visible here;
@ returns with bx lr.
790cabdff1aSopenharmony_cifunction tr_block1
791cabdff1aSopenharmony_ci        multiply        d0
792cabdff1aSopenharmony_ci        add_member32    d5,  d0[1], d1[0], d1[3], d2[2], +, +, +, +
793cabdff1aSopenharmony_ci        add_member32    d6,  d0[2], d1[3], d3[0], d3[2], +, +, +, -
794cabdff1aSopenharmony_ci        add_member32    d7,  d0[3], d2[2], d3[2], d1[3], +, +, -, -
795cabdff1aSopenharmony_ci        add_member32    d8,  d1[0], d3[1], d2[1], d0[0], +, +, -, -
796cabdff1aSopenharmony_ci        add_member32    d9,  d1[1], d3[3], d1[0], d1[2], +, -, -, -
797cabdff1aSopenharmony_ci        add_member32    d10, d1[2], d3[0], d0[0], d3[1], +, -, -, -
798cabdff1aSopenharmony_ci        add_member32    d11, d1[3], d2[1], d1[1], d2[3], +, -, -, +
799cabdff1aSopenharmony_ci        add_member32    d12, d2[0], d1[2], d2[2], d1[0], +, -, -, +
800cabdff1aSopenharmony_ci        add_member32    d13, d2[1], d0[3], d3[3], d0[2], +, -, -, +
801cabdff1aSopenharmony_ci        add_member32    d14, d2[2], d0[1], d2[3], d2[1], +, -, +, +
802cabdff1aSopenharmony_ci        add_member32    d15, d2[3], d0[2], d1[2], d3[3], +, -, +, -
803cabdff1aSopenharmony_ci        add_member32    d16, d3[0], d1[1], d0[1], d2[0], +, -, +, -
804cabdff1aSopenharmony_ci        add_member32    d17, d3[1], d2[0], d0[3], d0[1], +, -, +, -
805cabdff1aSopenharmony_ci        add_member32    d18, d3[2], d2[3], d2[0], d1[1], +, -, +, -
806cabdff1aSopenharmony_ci        add_member32    d19, d3[3], d3[2], d3[1], d3[0], +, -, +, -
807cabdff1aSopenharmony_ci        bx              lr
808cabdff1aSopenharmony_ciendfunc
809cabdff1aSopenharmony_ci
@ tr_block2: accumulate the sixteen rows held in d4-d19 into q10-q13.
@ Same structure as tr_block1 with a different coefficient and sign table;
@ the first row (d4) is multiplied by the four lanes of d1.
@ Expects d4-d19 and q0-q1 to be set up by the caller. No loads or stores are
@ visible here; returns with bx lr.
810cabdff1aSopenharmony_cifunction tr_block2
811cabdff1aSopenharmony_ci        multiply        d1
812cabdff1aSopenharmony_ci        add_member32    d5,  d3[1], d3[3], d3[0], d2[1], +, -, -, -
813cabdff1aSopenharmony_ci        add_member32    d6,  d2[1], d1[0], d0[0], d1[1], -, -, -, -
814cabdff1aSopenharmony_ci        add_member32    d7,  d0[0], d1[2], d3[1], d2[3], -, -, -, +
815cabdff1aSopenharmony_ci        add_member32    d8,  d2[0], d3[2], d1[1], d0[3], -, +, +, +
816cabdff1aSopenharmony_ci        add_member32    d9,  d3[2], d0[3], d1[3], d3[1], +, +, +, -
817cabdff1aSopenharmony_ci        add_member32    d10, d1[1], d1[3], d2[3], d0[0], +, +, -, -
818cabdff1aSopenharmony_ci        add_member32    d11, d0[3], d3[1], d0[1], d3[3], +, -, -, +
819cabdff1aSopenharmony_ci        add_member32    d12, d3[0], d0[2], d3[2], d0[1], +, -, -, +
820cabdff1aSopenharmony_ci        add_member32    d13, d2[2], d2[0], d1[0], d3[2], -, -, +, +
821cabdff1aSopenharmony_ci        add_member32    d14, d0[1], d3[0], d2[0], d0[2], -, +, +, -
822cabdff1aSopenharmony_ci        add_member32    d15, d1[3], d0[1], d2[2], d3[0], -, +, -, -
823cabdff1aSopenharmony_ci        add_member32    d16, d3[3], d2[1], d0[2], d1[0], +, +, -, +
824cabdff1aSopenharmony_ci        add_member32    d17, d1[2], d2[3], d3[3], d2[2], +, -, -, +
825cabdff1aSopenharmony_ci        add_member32    d18, d0[2], d0[1], d0[3], d1[2], +, -, +, -
826cabdff1aSopenharmony_ci        add_member32    d19, d2[3], d2[2], d2[1], d2[0], +, -, +, -
827cabdff1aSopenharmony_ci        bx              lr
828cabdff1aSopenharmony_ciendfunc
829cabdff1aSopenharmony_ci
@ tr_block3: accumulate the sixteen rows held in d4-d19 into q10-q13.
@ Same structure as tr_block1 with a different coefficient and sign table;
@ the first row (d4) is multiplied by the four lanes of d2.
@ Expects d4-d19 and q0-q1 to be set up by the caller. No loads or stores are
@ visible here; returns with bx lr.
830cabdff1aSopenharmony_cifunction tr_block3
831cabdff1aSopenharmony_ci        multiply        d2
832cabdff1aSopenharmony_ci        add_member32    d5,  d1[2], d0[3], d0[0], d0[2], -, -, -, -
833cabdff1aSopenharmony_ci        add_member32    d6,  d2[2], d3[3], d2[3], d1[2], -, -, +, +
834cabdff1aSopenharmony_ci        add_member32    d7,  d1[0], d0[2], d2[1], d3[3], +, +, +, -
835cabdff1aSopenharmony_ci        add_member32    d8,  d3[0], d2[2], d0[1], d1[3], +, -, -, -
836cabdff1aSopenharmony_ci        add_member32    d9,  d0[2], d2[0], d3[0], d0[0], -, -, +, +
837cabdff1aSopenharmony_ci        add_member32    d10, d3[2], d1[0], d2[0], d2[2], -, +, +, -
838cabdff1aSopenharmony_ci        add_member32    d11, d0[0], d3[2], d0[2], d3[0], +, +, -, -
839cabdff1aSopenharmony_ci        add_member32    d12, d3[3], d0[1], d3[1], d0[3], -, -, +, +
840cabdff1aSopenharmony_ci        add_member32    d13, d0[1], d2[3], d1[3], d1[1], -, +, +, -
841cabdff1aSopenharmony_ci        add_member32    d14, d3[1], d1[3], d0[3], d3[2], +, +, -, +
842cabdff1aSopenharmony_ci        add_member32    d15, d0[3], d1[1], d3[2], d2[0], +, -, +, +
843cabdff1aSopenharmony_ci        add_member32    d16, d2[3], d3[1], d1[2], d0[1], -, -, +, -
844cabdff1aSopenharmony_ci        add_member32    d17, d1[1], d0[0], d1[0], d2[1], -, +, -, +
845cabdff1aSopenharmony_ci        add_member32    d18, d2[1], d3[0], d3[3], d3[1], +, -, +, +
846cabdff1aSopenharmony_ci        add_member32    d19, d1[3], d1[2], d1[1], d1[0], +, -, +, -
847cabdff1aSopenharmony_ci        bx              lr
848cabdff1aSopenharmony_ciendfunc
849cabdff1aSopenharmony_ci
@ tr_block4: accumulate the sixteen rows held in d4-d19 into q10-q13.
@ Same structure as tr_block1 with a different coefficient and sign table;
@ the first row (d4) is multiplied by the four lanes of d3.
@ Expects d4-d19 and q0-q1 to be set up by the caller. No loads or stores are
@ visible here; returns with bx lr.
850cabdff1aSopenharmony_cifunction tr_block4
851cabdff1aSopenharmony_ci        multiply        d3
852cabdff1aSopenharmony_ci        add_member32    d5,  d1[1], d2[0], d2[3], d3[2], -, -, -, -
853cabdff1aSopenharmony_ci        add_member32    d6,  d0[0], d0[3], d2[0], d3[1], +, +, +, +
854cabdff1aSopenharmony_ci        add_member32    d7,  d2[0], d0[0], d1[1], d3[0], -, -, -, -
855cabdff1aSopenharmony_ci        add_member32    d8,  d3[3], d1[2], d0[2], d2[3], +, +, +, +
856cabdff1aSopenharmony_ci        add_member32    d9,  d2[1], d2[3], d0[0], d2[2], +, -, -, -
857cabdff1aSopenharmony_ci        add_member32    d10, d0[2], d3[3], d0[3], d2[1], -, -, +, +
858cabdff1aSopenharmony_ci        add_member32    d11, d1[0], d2[2], d1[2], d2[0], +, +, -, -
859cabdff1aSopenharmony_ci        add_member32    d12, d2[3], d1[1], d2[1], d1[3], -, -, +, +
860cabdff1aSopenharmony_ci        add_member32    d13, d3[1], d0[1], d3[0], d1[2], -, +, -, -
861cabdff1aSopenharmony_ci        add_member32    d14, d1[2], d1[0], d3[3], d1[1], +, -, +, +
862cabdff1aSopenharmony_ci        add_member32    d15, d0[1], d2[1], d3[1], d1[0], -, +, +, -
863cabdff1aSopenharmony_ci        add_member32    d16, d1[3], d3[2], d2[2], d0[3], +, -, -, +
864cabdff1aSopenharmony_ci        add_member32    d17, d3[2], d3[0], d1[3], d0[2], -, -, +, -
865cabdff1aSopenharmony_ci        add_member32    d18, d2[2], d1[3], d1[0], d0[1], -, +, -, +
866cabdff1aSopenharmony_ci        add_member32    d19, d0[3], d0[2], d0[1], d0[0], +, -, +, -
867cabdff1aSopenharmony_ci        bx              lr
868cabdff1aSopenharmony_ciendfunc
869cabdff1aSopenharmony_ci
@ tr_32x4: emit function func_tr_32x4_<name>, which processes thirty-two
@ input rows of four 16-bit values each.
@   name  - suffix of the generated function
@   shift - right shift amount used when narrowing the results to 16 bit
@ Register interface of the generated function:
@   r5  - input pointer
@   r11 - output pointer; four rows of 64 bytes are written
@   sp  - must be set up by the caller so that sp + 2048 is the scratch area
@         of func_tr_16x4_noscale (instantiated with offset 2048 in this file)
@   clobbers r1-r4, r8, r9, r10 and q0-q15
@ The function calls other functions with bl, so it parks its own return
@ address in r10 and returns with bx r10.
870cabdff1aSopenharmony_ci.macro tr_32x4 name, shift
871cabdff1aSopenharmony_cifunction func_tr_32x4_\name
872cabdff1aSopenharmony_ci        mov             r10, lr
        @ leaves sixteen 32-bit vectors at sp + 2048 .. sp + 2303
873cabdff1aSopenharmony_ci        bl              func_tr_16x4_noscale
874cabdff1aSopenharmony_ci
875cabdff1aSopenharmony_ci        load32
        @ q0 = coefficients at trans + 32, q1 = coefficients at trans + 48;
        @ r9 stays at trans + 48 so that q1 can be reloaded later
876cabdff1aSopenharmony_ci        movrel          r9, trans + 32
877cabdff1aSopenharmony_ci        vld1.s16        {q0}, [r9, :128]!
878cabdff1aSopenharmony_ci        vld1.s16        {q1}, [r9, :128]
879cabdff1aSopenharmony_ci
880cabdff1aSopenharmony_ci        bl              tr_block1
881cabdff1aSopenharmony_ci
        @ first block: same sequence as the scale_store macro, written out
        @ with the store pointer setup in the middle. r4 starts at the
        @ beginning of the 32-bit data and advances by 64 bytes per block.
882cabdff1aSopenharmony_ci        add             r4, sp, #2048
883cabdff1aSopenharmony_ci        vld1.s16        {q14-q15}, [r4, :128]!
884cabdff1aSopenharmony_ci        butterfly32     q14, q10, q15, q11
885cabdff1aSopenharmony_ci        scale32         d22, d23, d20, d21, q1, q14, q10, q15, \shift
886cabdff1aSopenharmony_ci
887cabdff1aSopenharmony_ci        vld1.s16        {q14-q15}, [r4, :128]!
888cabdff1aSopenharmony_ci        butterfly32     q14, q12, q15, q13
889cabdff1aSopenharmony_ci        scale32         d2, d3, d28, d29, q1, q14, q12, q15, \shift
890cabdff1aSopenharmony_ci
891cabdff1aSopenharmony_ci        transpose8_4x4  d22, d20, d2, d28
892cabdff1aSopenharmony_ci        transpose8_4x4  d29, d3, d21, d23
        @ sums are stored forwards from r11, differences backwards from
        @ r11 + 56 + 3 * 64, with a 64-byte row stride
893cabdff1aSopenharmony_ci        mov             r1, r11
894cabdff1aSopenharmony_ci        mov             r2, #64
895cabdff1aSopenharmony_ci        mov             r8, #-64
896cabdff1aSopenharmony_ci        add             r3, r11, #(56 + 3 * 64)
897cabdff1aSopenharmony_ci        store16         d22, d23, d20, d21, d2, d3, d28, d29, r8
898cabdff1aSopenharmony_ci
899cabdff1aSopenharmony_ci        @ reload multiplication coefficients to q1
900cabdff1aSopenharmony_ci        vld1.s16        {q1}, [r9, :128]
901cabdff1aSopenharmony_ci
        @ blocks 2-4: the forward pointer moves 8 bytes to the right and the
        @ backward pointer 8 bytes to the left for each block
902cabdff1aSopenharmony_ci        bl              tr_block2
903cabdff1aSopenharmony_ci        add             r1, r11, #8
904cabdff1aSopenharmony_ci        add             r3, r11, #(48 + 3 * 64)
905cabdff1aSopenharmony_ci        mov             r2, #64
906cabdff1aSopenharmony_ci        mov             r8, #-64
907cabdff1aSopenharmony_ci        scale_store     \shift
908cabdff1aSopenharmony_ci
909cabdff1aSopenharmony_ci        bl              tr_block3
910cabdff1aSopenharmony_ci        add             r1, r11, #16
911cabdff1aSopenharmony_ci        add             r3, r11, #(40 + 3 * 64)
912cabdff1aSopenharmony_ci        mov             r2, #64
913cabdff1aSopenharmony_ci        mov             r8, #-64
914cabdff1aSopenharmony_ci        scale_store     \shift
915cabdff1aSopenharmony_ci
916cabdff1aSopenharmony_ci        bl              tr_block4
917cabdff1aSopenharmony_ci        add             r1, r11, #24
918cabdff1aSopenharmony_ci        add             r3, r11, #(32 + 3 * 64)
919cabdff1aSopenharmony_ci        mov             r2, #64
920cabdff1aSopenharmony_ci        mov             r8, #-64
921cabdff1aSopenharmony_ci        scale_store     \shift
922cabdff1aSopenharmony_ci
923cabdff1aSopenharmony_ci        bx               r10
924cabdff1aSopenharmony_ciendfunc
925cabdff1aSopenharmony_ci.endm
926cabdff1aSopenharmony_ci
@ idct_32x32: emit the exported function ff_hevc_idct_32x32_<bitdepth>_neon.
@   bitdepth - selects func_tr_32x4_secondpass_<bitdepth> for the second pass
@ The function transforms the block at r0 in place in two passes of eight
@ calls each, going through a temporary buffer on the stack:
@   bytes 0-2047 of the buffer: output of the first pass (8 x 256 bytes)
@   bytes from 2048 on:         scratch of func_tr_16x4_noscale, which uses
@                               256 bytes there as instantiated in this file
@ Saves and restores r4-r11, lr and q4-q7.
927cabdff1aSopenharmony_ci.macro idct_32x32 bitdepth
928cabdff1aSopenharmony_cifunction ff_hevc_idct_32x32_\bitdepth\()_neon, export=1
929cabdff1aSopenharmony_ci@r0 - coeffs
930cabdff1aSopenharmony_ci        push            {r4-r11, lr}
931cabdff1aSopenharmony_ci        vpush           {q4-q7}
932cabdff1aSopenharmony_ci
933cabdff1aSopenharmony_ci        @ Align the stack, allocate a temp buffer
        @ r7 = (sp & 15) + 2432; 2432 is a multiple of 16, so sp - r7 is
        @ 16-byte aligned. r7 is kept until the epilogue to undo the adjustment.
        @ NOTE(review): the T and A line prefixes presumably select the Thumb
        @ and ARM variants via macros from asm.S - confirm there.
934cabdff1aSopenharmony_ciT       mov             r7,  sp
935cabdff1aSopenharmony_ciT       and             r7,  r7,  #15
936cabdff1aSopenharmony_ciA       and             r7,  sp,  #15
937cabdff1aSopenharmony_ci        add             r7,  r7,  #2432
938cabdff1aSopenharmony_ci        sub             sp,  sp,  r7
939cabdff1aSopenharmony_ci
        @ first pass: input slice i starts 8 bytes (four coefficients) further
        @ into coeffs, output slice i starts 256 bytes further into the buffer
940cabdff1aSopenharmony_ci.irp i, 0, 1, 2, 3, 4, 5, 6, 7
941cabdff1aSopenharmony_ci        add             r5, r0, #(8 * \i)
942cabdff1aSopenharmony_ci        add             r11, sp, #(8 * \i * 32)
943cabdff1aSopenharmony_ci        bl              func_tr_32x4_firstpass
944cabdff1aSopenharmony_ci.endr
945cabdff1aSopenharmony_ci
        @ second pass: same addressing with the roles swapped, reading the
        @ buffer and writing the final result back to coeffs
946cabdff1aSopenharmony_ci.irp i, 0, 1, 2, 3, 4, 5, 6, 7
947cabdff1aSopenharmony_ci        add             r5, sp, #(8 * \i)
948cabdff1aSopenharmony_ci        add             r11, r0, #(8 * \i * 32)
949cabdff1aSopenharmony_ci        bl              func_tr_32x4_secondpass_\bitdepth
950cabdff1aSopenharmony_ci.endr
951cabdff1aSopenharmony_ci
        @ release the buffer together with the alignment padding
952cabdff1aSopenharmony_ci        add             sp,  sp,  r7
953cabdff1aSopenharmony_ci        vpop            {q4-q7}
954cabdff1aSopenharmony_ci        pop             {r4-r11, pc}
955cabdff1aSopenharmony_ciendfunc
956cabdff1aSopenharmony_ci.endm
957cabdff1aSopenharmony_ci
@ Instantiate the internal 16x4 helpers: name, shift, stack offset, step.
@ The first pass narrows with shift 7, the second pass with 20 - bitdepth.
958cabdff1aSopenharmony_citr_16x4 firstpass, 7, 512, 1
959cabdff1aSopenharmony_citr_16x4 secondpass_8, 20 - 8, 512, 1
960cabdff1aSopenharmony_citr_16x4 secondpass_10, 20 - 10, 512, 1
@ shift 0 selects the store_to_stack path: the 32-bit results stay on the
@ stack at sp + 2048 for func_tr_32x4_* to pick up; step 4 scales the input
@ row offsets by four.
961cabdff1aSopenharmony_citr_16x4 noscale, 0, 2048, 4
@ .ltorg emits any pending literal pool at this point
962cabdff1aSopenharmony_ci.ltorg
@ Instantiate the internal 32x4 helpers: name, shift.
963cabdff1aSopenharmony_citr_32x4 firstpass, 7
964cabdff1aSopenharmony_citr_32x4 secondpass_8, 20 - 8
965cabdff1aSopenharmony_citr_32x4 secondpass_10, 20 - 10
966cabdff1aSopenharmony_ci.ltorg
967cabdff1aSopenharmony_ci
@ Instantiate the exported entry points for 8 and 10 bit. Only idct_16x16 and
@ idct_32x32 are defined in this part of the file; the other macros are
@ defined further up.
968cabdff1aSopenharmony_ciidct_4x4 8
969cabdff1aSopenharmony_ciidct_4x4_dc 8
970cabdff1aSopenharmony_ciidct_4x4 10
971cabdff1aSopenharmony_ciidct_4x4_dc 10
972cabdff1aSopenharmony_ciidct_8x8 8
973cabdff1aSopenharmony_ciidct_8x8_dc 8
974cabdff1aSopenharmony_ciidct_8x8 10
975cabdff1aSopenharmony_ciidct_8x8_dc 10
976cabdff1aSopenharmony_ciidct_16x16 8
977cabdff1aSopenharmony_ciidct_16x16_dc 8
978cabdff1aSopenharmony_ciidct_16x16 10
979cabdff1aSopenharmony_ciidct_16x16_dc 10
980cabdff1aSopenharmony_ciidct_32x32 8
981cabdff1aSopenharmony_ciidct_32x32_dc 8
982cabdff1aSopenharmony_ciidct_32x32 10
983cabdff1aSopenharmony_ciidct_32x32_dc 10
984cabdff1aSopenharmony_ci
985cabdff1aSopenharmony_ci/* uses registers q2 and q4 - q9 for temp values */
986cabdff1aSopenharmony_ci/* TODO: reorder */
/* One pass of the 4-point luma transform over four d registers, in place.
 *   r0..r3 - d registers holding src0..src3 on entry (4 x s16 each) and
 *            dst0..dst3 on exit
 *   shift  - immediate right shift for the final rounding, saturating
 *            narrowing back to 16 bit
 * Expects the constants in 32-bit lanes of q0: d0[0] = 74, d0[1] = 29 and
 * d1[0] = 55. Note that r0..r3 are macro parameter names here, not the core
 * registers. */
987cabdff1aSopenharmony_ci.macro tr4_luma_shift r0, r1, r2, r3, shift
988cabdff1aSopenharmony_ci        vaddl.s16   q5, \r0, \r2    // c0 = src0 + src2
989cabdff1aSopenharmony_ci        vaddl.s16   q2, \r2, \r3    // c1 = src2 + src3
990cabdff1aSopenharmony_ci        vsubl.s16   q4, \r0, \r3    // c2 = src0 - src3
991cabdff1aSopenharmony_ci        vmull.s16   q6, \r1, d0[0]  // c3 = 74 * src1
992cabdff1aSopenharmony_ci
993cabdff1aSopenharmony_ci        vaddl.s16   q7, \r0, \r3    // src0 + src3
994cabdff1aSopenharmony_ci        vsubw.s16   q7, q7, \r2     // src0 - src2 + src3
995cabdff1aSopenharmony_ci        vmul.s32    q7, q7, d0[0]   // dst2 = 74 * (src0 - src2 + src3)
996cabdff1aSopenharmony_ci
997cabdff1aSopenharmony_ci        vmul.s32    q8, q5, d0[1]   // 29 * c0
998cabdff1aSopenharmony_ci        vmul.s32    q9, q2, d1[0]   // 55 * c1
999cabdff1aSopenharmony_ci        vadd.s32    q8, q9          // 29 * c0 + 55 * c1
1000cabdff1aSopenharmony_ci        vadd.s32    q8, q6          // dst0 = 29 * c0 + 55 * c1 + c3
1001cabdff1aSopenharmony_ci
1002cabdff1aSopenharmony_ci        vmul.s32    q2, q2, d0[1]   // 29 * c1
1003cabdff1aSopenharmony_ci        vmul.s32    q9, q4, d1[0]   // 55 * c2
1004cabdff1aSopenharmony_ci        vsub.s32    q9, q2          // 55 * c2 - 29 * c1
1005cabdff1aSopenharmony_ci        vadd.s32    q9, q6          // dst1 = 55 * c2 - 29 * c1 + c3
1006cabdff1aSopenharmony_ci
1007cabdff1aSopenharmony_ci        vmul.s32    q5, q5, d1[0]   // 55 * c0
1008cabdff1aSopenharmony_ci        vmul.s32    q4, q4, d0[1]   // 29 * c2
1009cabdff1aSopenharmony_ci        vadd.s32    q5, q4          // 55 * c0 + 29 * c2
1010cabdff1aSopenharmony_ci        vsub.s32    q5, q6          // dst3 = 55 * c0 + 29 * c2 - c3
1011cabdff1aSopenharmony_ci
1012cabdff1aSopenharmony_ci        vqrshrn.s32   \r0, q8, \shift
1013cabdff1aSopenharmony_ci        vqrshrn.s32   \r1, q9, \shift
1014cabdff1aSopenharmony_ci        vqrshrn.s32   \r2, q7, \shift
1015cabdff1aSopenharmony_ci        vqrshrn.s32   \r3, q5, \shift
1016cabdff1aSopenharmony_ci.endm
1017cabdff1aSopenharmony_ci
1018cabdff1aSopenharmony_ci.ltorg
/* ff_hevc_transform_luma_4x4_neon_8: 4x4 luma transform, in place.
 *   r0 - pointer to the 16 coefficients (32 bytes), read and written
 * Runs tr4_luma_shift twice, first with shift 7 and then with shift 12, and
 * transposes the 4x4 block after each pass.
 * Saves and restores d8-d15 (q4-q7), which tr4_luma_shift overwrites;
 * clobbers r3, q0, q2, q8, q9, q14 and q15. */
1019cabdff1aSopenharmony_cifunction ff_hevc_transform_luma_4x4_neon_8, export=1
1020cabdff1aSopenharmony_ci        vpush       {d8-d15}
1021cabdff1aSopenharmony_ci        vld1.16     {q14, q15}, [r0]  // coeffs
        /* constants go into 32-bit lanes of q0, as tr4_luma_shift expects */
1022cabdff1aSopenharmony_ci        ldr         r3, =0x4a  // 74
1023cabdff1aSopenharmony_ci        vmov.32     d0[0], r3
1024cabdff1aSopenharmony_ci        ldr         r3, =0x1d  // 29
1025cabdff1aSopenharmony_ci        vmov.32     d0[1], r3
1026cabdff1aSopenharmony_ci        ldr         r3, =0x37  // 55
1027cabdff1aSopenharmony_ci        vmov.32     d1[0], r3
1028cabdff1aSopenharmony_ci
        /* first pass */
1029cabdff1aSopenharmony_ci        tr4_luma_shift d28, d29, d30, d31, #7
1030cabdff1aSopenharmony_ci
        /* 4x4 transpose of the 16-bit elements in d28-d31 */
1031cabdff1aSopenharmony_ci        vtrn.16     d28, d29
1032cabdff1aSopenharmony_ci        vtrn.16     d30, d31
1033cabdff1aSopenharmony_ci        vtrn.32     q14, q15
1034cabdff1aSopenharmony_ci
        /* second pass */
1035cabdff1aSopenharmony_ci        tr4_luma_shift d28, d29, d30, d31, #12
1036cabdff1aSopenharmony_ci
        /* transpose again, then write the block back */
1037cabdff1aSopenharmony_ci        vtrn.16     d28, d29
1038cabdff1aSopenharmony_ci        vtrn.16     d30, d31
1039cabdff1aSopenharmony_ci        vtrn.32     q14, q15
1040cabdff1aSopenharmony_ci        vst1.16     {q14, q15}, [r0]
1041cabdff1aSopenharmony_ci        vpop        {d8-d15}
1042cabdff1aSopenharmony_ci        bx lr
1043cabdff1aSopenharmony_ciendfunc
1044