1cabdff1aSopenharmony_ci/*
2cabdff1aSopenharmony_ci * Copyright (c) 2017 Google Inc.
3cabdff1aSopenharmony_ci *
4cabdff1aSopenharmony_ci * This file is part of FFmpeg.
5cabdff1aSopenharmony_ci *
6cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or
7cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public
8cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either
9cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version.
10cabdff1aSopenharmony_ci *
11cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful,
12cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of
13cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14cabdff1aSopenharmony_ci * Lesser General Public License for more details.
15cabdff1aSopenharmony_ci *
16cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public
17cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software
18cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19cabdff1aSopenharmony_ci */
20cabdff1aSopenharmony_ci
21cabdff1aSopenharmony_ci#include "libavutil/arm/asm.S"
22cabdff1aSopenharmony_ci#include "neon.S"
23cabdff1aSopenharmony_ci
@ Multipliers for the 4 point transforms, stored as 16 bit values. The
@ users widen them to 32 bit lanes and round-shift the products right
@ by 14 bits.
@ When all eight values are loaded and widened into q0-q1 (as done in
@ itxfm_func4x4 for mixed transforms) the lanes are:
@   d0[0] = 11585, d0[1] = 0,     d1[0] = 6270, d1[1] = 15137
@   d2[0] = 5283,  d2[1] = 15212, d3[0] = 9929, d3[1] = 13377
const itxfm4_coeffs, align=4
        .short  11585, 0
        .short  6270, 15137
iadst4_coeffs:
        .short  5283, 15212
        .short  9929, 13377
endconst
29cabdff1aSopenharmony_ci
@ iadst8_coeffs: the eight multipliers used by the first half of the
@ iadst8 macro. It loads them with one 128 bit load and widens them so
@ that the first line below ends up in q0 and the second one in q1.
@ idct_coeffs follows directly afterwards. Its first eight values are
@ reloaded into q0-q1 by the second half of iadst8. The users of the
@ remaining rows are outside this part of the file.
const iadst8_coeffs, align=4
        .short  16305, 1606, 14449, 7723
        .short  10394, 12665, 4756, 15679
idct_coeffs:
        .short  11585, 0, 6270, 15137
        .short  3196, 16069, 13623, 9102
        .short  1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
        .short  804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
        .short  3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
endconst
38cabdff1aSopenharmony_ci
@ Sixteen 16 bit multipliers, presumably for the 16 point iadst (the
@ code using them is not in this part of the file).
const iadst16_coeffs, align=4
        .short  16364, 804, 15893, 3981
        .short  11003, 12140, 8423, 14053
        .short  14811, 7005, 13160, 9760
        .short  5520, 15426, 2404, 16207
endconst
43cabdff1aSopenharmony_ci
@ Transpose two 4x4 matrices of 32 bit elements.
@ rq0-rq3 hold the rows of the first matrix, rq4-rq7 those of the second.
@ r0-r15 name the d register halves of the same registers, in order:
@ r0,r1 == rq0, r2,r3 == rq1, etc.
@ The 64 bit exchange is done by swapping d registers (equivalent to the
@ vtrn.64 noted next to each vswp), the 32 bit step works on whole
@ q registers.
.macro transpose32_q_2x_4x4 rq0, rq1, rq2, rq3, rq4, rq5, rq6, rq7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15
        @ Step 1: exchange the off-diagonal 2x2 quadrants of both matrices
        vswp             \r1,  \r4  @ vtrn.64 \rq0, \rq2
        vswp             \r3,  \r6  @ vtrn.64 \rq1, \rq3
        vswp             \r9,  \r12 @ vtrn.64 \rq4, \rq6
        vswp             \r11, \r14 @ vtrn.64 \rq5, \rq7
        @ Step 2: transpose the 32 bit elements within each row pair
        vtrn.32          \rq0, \rq1
        vtrn.32          \rq2, \rq3
        vtrn.32          \rq4, \rq5
        vtrn.32          \rq6, \rq7
.endm
57cabdff1aSopenharmony_ci
@ Transpose eight independent 2x2 blocks of 32 bit elements.
@ Each consecutive pair of arguments (r0,r1), (r2,r3), ... (r14,r15) is
@ handed to one vtrn.32.
.macro transpose32_8x_2x2 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15
        vtrn.32         \r0,  \r1
        vtrn.32         \r2,  \r3
        vtrn.32         \r4,  \r5
        vtrn.32         \r6,  \r7
        vtrn.32         \r8,  \r9
        vtrn.32         \r10, \r11
        vtrn.32         \r12, \r13
        vtrn.32         \r14, \r15
.endm
69cabdff1aSopenharmony_ci
@ Sum and difference of two inputs, both scaled by d0[0]:
@   out1 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
@   out2 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
@ With neg > 0 the sum is negated before the multiplication, so out1 is
@ computed from -(in1 + in2) instead.
@ in1, in2, out1, out2, tmpd1 and tmpd2 are d registers. tmpq3 and tmpq4
@ are q registers receiving the 64 bit products.
.macro mbutterfly0 out1, out2, in1, in2, tmpd1, tmpd2, tmpq3, tmpq4, neg=0
        vadd.s32        \tmpd1, \in1,  \in2
        vsub.s32        \tmpd2, \in1,  \in2
.if \neg > 0
        vneg.s32        \tmpd1, \tmpd1
.endif
        @ Widening multiplies, then rounding narrowing shifts back to 32 bit
        vmull.s32       \tmpq3, \tmpd1, d0[0]
        vmull.s32       \tmpq4, \tmpd2, d0[0]
        vrshrn.s64      \out1, \tmpq3, #14
        vrshrn.s64      \out2, \tmpq4, #14
.endm
84cabdff1aSopenharmony_ci
@ Variant of mbutterfly0 for the case where in2 is known to be zero.
@ Sum and difference are then both equal to in1, so a single product
@   (in1 * d0[0] + (1 << 13)) >> 14
@ is written to both out1 and out2.
@ The parameter list matches mbutterfly0. in2, tmpd1, tmpd2 and tmpq4
@ are accepted but not used.
.macro mbutterfly0_h out1, out2, in1, in2, tmpd1, tmpd2, tmpq3, tmpq4
        vmull.s32       \tmpq3, \in1, d0[0]
        vrshrn.s64      \out1, \tmpq3, #14
        vrshrn.s64      \out2, \tmpq3, #14
.endm
92cabdff1aSopenharmony_ci
@ Double width version of mbutterfly0: the inputs are 2 q registers and
@ the outputs 4 d registers.
@   out1,out2 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
@   out3,out4 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
@ tmpq1 receives the sum and tmpq2 the difference. tmpd11,tmpd12 must be
@ the d register halves of tmpq1 and tmpd21,tmpd22 those of tmpq2.
@ tmpq3 and tmpq4 hold 64 bit products. tmpq5 and tmpq6 are optional:
@ - without them (4 temporaries), tmpq3/tmpq4 are narrowed and then
@   reused for the second pair of products
@ - with them (6 temporaries), all four multiplications are issued
@   before any narrowing
@ Callers pass overlapping temporaries, so the instruction order below
@ must be kept.
.macro dmbutterfly0 out1, out2, out3, out4, in1, in2, tmpq1, tmpq2, tmpd11, tmpd12, tmpd21, tmpd22, tmpq3, tmpq4, tmpq5, tmpq6
        vadd.s32        \tmpq1, \in1,  \in2
        vsub.s32        \tmpq2, \in1,  \in2
        vmull.s32       \tmpq3, \tmpd11, d0[0]
        vmull.s32       \tmpq4, \tmpd12, d0[0]
.ifb \tmpq5
        @ 4 temporaries: finish the sum half before starting the difference
        vrshrn.s64      \out1, \tmpq3, #14
        vrshrn.s64      \out2, \tmpq4, #14
        vmull.s32       \tmpq3, \tmpd21, d0[0]
        vmull.s32       \tmpq4, \tmpd22, d0[0]
        vrshrn.s64      \out3, \tmpq3, #14
        vrshrn.s64      \out4, \tmpq4, #14
.else
        @ 6 temporaries: all products first, then all narrowing shifts
        vmull.s32       \tmpq5, \tmpd21, d0[0]
        vmull.s32       \tmpq6, \tmpd22, d0[0]
        vrshrn.s64      \out1, \tmpq3, #14
        vrshrn.s64      \out2, \tmpq4, #14
        vrshrn.s64      \out3, \tmpq5, #14
        vrshrn.s64      \out4, \tmpq6, #14
.endif
.endm
119cabdff1aSopenharmony_ci
@ Rotation with full 64 bit results, no rounding or narrowing:
@   out1 = in1 * coef1 - in2 * coef2
@   out2 = in1 * coef2 + in2 * coef1
@ With a nonzero neg, out2 is negated:
@   out2 = -(in1 * coef2 + in2 * coef1)
@ in1 and in2 are d registers, out1 and out2 are q registers, coef1 and
@ coef2 are scalar lanes such as d1[0].
.macro mbutterfly_l out1, out2, in1, in2, coef1, coef2, neg=0
        vmull.s32       \out1, \in1, \coef1
        vmlsl.s32       \out1, \in2, \coef2
.if \neg
        @ Start from zero and subtract both products
        vmov.s64        \out2, #0
        vmlsl.s32       \out2, \in1, \coef2
        vmlsl.s32       \out2, \in2, \coef1
.else
        vmull.s32       \out2, \in1, \coef2
        vmlal.s32       \out2, \in2, \coef1
.endif
.endm
135cabdff1aSopenharmony_ci
@ Double width version of mbutterfly_l (without the neg option):
@   out1,out2 = in1,in2 * coef1 - in3,in4 * coef2
@   out3,out4 = in1,in2 * coef2 + in3,in4 * coef1
@ in1-in4 are d registers, out1-out4 are q registers holding the 64 bit
@ results. All four outputs are written before in3 and in4 are read.
.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, in3, in4, coef1, coef2
        @ Products of the first input pair
        vmull.s32       \out1, \in1, \coef1
        vmull.s32       \out2, \in2, \coef1
        vmull.s32       \out3, \in1, \coef2
        vmull.s32       \out4, \in2, \coef2
        @ Accumulate the products of the second input pair
        vmlsl.s32       \out1, \in3, \coef2
        vmlsl.s32       \out2, \in4, \coef2
        vmlal.s32       \out3, \in3, \coef1
        vmlal.s32       \out4, \in4, \coef1
.endm
149cabdff1aSopenharmony_ci
@ In-place rotation with rounding. With a and b being the values of
@ inout1 and inout2 on entry:
@   inout1 = (a * coef1 - b * coef2 + (1 << 13)) >> 14
@   inout2 = (a * coef2 + b * coef1 + (1 << 13)) >> 14
@ inout1 and inout2 are d registers, tmp1 and tmp2 are q registers.
@ neg is passed on to mbutterfly_l, where a nonzero value negates the
@ second result before rounding.
.macro mbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, neg=0
        mbutterfly_l    \tmp1, \tmp2, \inout1, \inout2, \coef1, \coef2, \neg
        vrshrn.s64      \inout1, \tmp1,  #14
        vrshrn.s64      \inout2, \tmp2,  #14
.endm
158cabdff1aSopenharmony_ci
@ Variant of mbutterfly for the case where inout2 is known to be zero
@ on entry. With a being the value of inout1 on entry:
@   inout1 = (a * coef1 + (1 << 13)) >> 14
@   inout2 = (a * coef2 + (1 << 13)) >> 14
.macro mbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2
        vmull.s32       \tmp1,   \inout1, \coef1
        vmull.s32       \tmp2,   \inout1, \coef2
        vrshrn.s64      \inout1, \tmp1,   #14
        vrshrn.s64      \inout2, \tmp2,   #14
.endm
166cabdff1aSopenharmony_ci
@ Variant of mbutterfly for the case where inout1 is known to be zero
@ on entry. With b being the value of inout2 on entry:
@   inout1 = (-b * coef2 + (1 << 13)) >> 14
@   inout2 = ( b * coef1 + (1 << 13)) >> 14
@ Both products are formed before inout2 is overwritten.
.macro mbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2
        @ tmp1 starts at zero so that the product can be subtracted from it
        vmov.s64        \tmp1,   #0
        vmull.s32       \tmp2,   \inout2, \coef1
        vmlsl.s32       \tmp1,   \inout2, \coef2
        vrshrn.s64      \inout2, \tmp2,   #14
        vrshrn.s64      \inout1, \tmp1,   #14
.endm
175cabdff1aSopenharmony_ci
@ Double width version of mbutterfly (without the neg option). With a
@ being the entry value of inout1,inout2 and b that of inout3,inout4:
@   inout1,inout2 = (a * coef1 - b * coef2 + (1 << 13)) >> 14
@   inout3,inout4 = (a * coef2 + b * coef1 + (1 << 13)) >> 14
@ inout1-inout4 are d registers, tmp1-tmp4 are q registers.
.macro dmbutterfly inout1, inout2, inout3, inout4, coef1, coef2, tmp1, tmp2, tmp3, tmp4
        @ 64 bit results into tmp1-tmp4, then round and narrow in place
        dmbutterfly_l   \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \inout3, \inout4, \coef1, \coef2
        vrshrn.s64      \inout1, \tmp1,  #14
        vrshrn.s64      \inout2, \tmp2,  #14
        vrshrn.s64      \inout3, \tmp3,  #14
        vrshrn.s64      \inout4, \tmp4,  #14
.endm
186cabdff1aSopenharmony_ci
@ Plain 32 bit butterfly:
@   out1 = in1 + in2
@   out2 = in1 - in2
@ The sum is written first, so out1 must not be the same register as
@ in1 or in2. out2 may alias an input (idct8 passes the same register
@ as in2 and out2).
.macro butterfly out1, out2, in1, in2
        vadd.s32        \out1, \in1, \in2
        vsub.s32        \out2, \in1, \in2
.endm
193cabdff1aSopenharmony_ci
@ Reversed 32 bit butterfly:
@   out1 = in1 - in2
@   out2 = in1 + in2
@ The difference is written first, so out1 must not be the same
@ register as in1 or in2. out2 may alias an input (idct8 passes the
@ same register as in1 and out2).
.macro butterfly_r out1, out2, in1, in2
        vsub.s32        \out1, \in1, \in2
        vadd.s32        \out2, \in1, \in2
.endm
200cabdff1aSopenharmony_ci
@ Butterfly on 64 bit values followed by rounding and narrowing:
@   out1 = (in1 + in2 + (1 << 13)) >> 14
@   out2 = (in1 - in2 + (1 << 13)) >> 14
@ in1, in2, tmp1 and tmp2 are q registers, out1 and out2 are
@ d registers.
.macro butterfly_n out1, out2, in1, in2, tmp1, tmp2
        vadd.s64        \tmp1, \in1, \in2
        vsub.s64        \tmp2, \in1, \in2
        vrshrn.s64      \out1, \tmp1,  #14
        vrshrn.s64      \out2, \tmp2,  #14
.endm
210cabdff1aSopenharmony_ci
@ Double width version of butterfly_n:
@   out1,out2 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
@   out3,out4 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
@ in1-in4 and tmp1-tmp4 are q registers, out1-out4 are d registers.
@ The temporaries are written in the order tmp1, tmp2, tmp3, tmp4.
@ Callers reuse input registers as temporaries, so this order must be
@ kept.
.macro dbutterfly_n out1, out2, out3, out4, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
        @ 64 bit sums and differences
        vadd.s64        \tmp1, \in1, \in3
        vadd.s64        \tmp2, \in2, \in4
        vsub.s64        \tmp3, \in1, \in3
        vsub.s64        \tmp4, \in2, \in4
        @ Round and narrow to 32 bit
        vrshrn.s64      \out1, \tmp1,  #14
        vrshrn.s64      \out2, \tmp2,  #14
        vrshrn.s64      \out3, \tmp3,  #14
        vrshrn.s64      \out4, \tmp4,  #14
.endm
224cabdff1aSopenharmony_ci
225cabdff1aSopenharmony_ci
@ One pass of the 4 point inverse Walsh-Hadamard transform on four
@ q registers of 32 bit elements, in place.
@ c0-c3 are the q registers. cd0-cd7 (their d register halves) are
@ accepted so that all 4 point transform macros can be invoked with the
@ same argument list, but they are not used here.
@ Scratch registers: q10, q11.
.macro iwht4_10 c0, c1, c2, c3, cd0, cd1, cd2, cd3, cd4, cd5, cd6, cd7
        vadd.i32        \c0,  \c0,  \c1   @ c0  = c0 + c1
        vsub.i32        q11,  \c2,  \c3   @ q11 = c2 - c3
        vsub.i32        q10,  \c0,  q11
        vshr.s32        q10,  q10,  #1    @ q10 = (c0 - q11) >> 1
        vsub.i32        \c2,  q10,  \c1   @ c2  = q10 - c1
        vsub.i32        \c1,  q10,  \c3   @ c1  = q10 - c3
        vadd.i32        \c3,  q11,  \c2   @ c3  = q11 + c2
        vsub.i32        \c0,  \c0,  \c1   @ c0  = c0 - c1
.endm
236cabdff1aSopenharmony_ci
@ The 12 bit variant of the inverse WHT is the same code as the 10 bit
@ one. This wrapper only exists so that the name can be formed from the
@ bit depth.
.macro iwht4_12 c0, c1, c2, c3, cd0, cd1, cd2, cd3, cd4, cd5, cd6, cd7
        iwht4_10        \c0, \c1, \c2, \c3, \cd0, \cd1, \cd2, \cd3, \cd4, \cd5, \cd6, \cd7
.endm
240cabdff1aSopenharmony_ci
@ One pass of the 4 point inverse DCT for 10 bit content, in place on
@ c0-c3, using 32 bit products only.
@ c0 == cd0,cd1, c1 == cd2,cd3, etc. The d register names are not used
@ by this variant.
@ Multipliers are taken from d0[0], d1[0] and d1[1].
@ Scratch registers: q10-q15.
@   q12 = ((c0 + c2) * d0[0] + (1 << 13)) >> 14
@   q10 = ((c0 - c2) * d0[0] + (1 << 13)) >> 14
@   q13 = (c1 * d1[1] + c3 * d1[0] + (1 << 13)) >> 14
@   q11 = (c1 * d1[0] - c3 * d1[1] + (1 << 13)) >> 14
@   c0 = q12 + q13, c1 = q10 + q11, c2 = q10 - q11, c3 = q12 - q13
.macro idct4_10 c0, c1, c2, c3, cd0, cd1, cd2, cd3, cd4, cd5, cd6, cd7
        vmul.s32        q13,  \c1,  d1[1]
        vmul.s32        q11,  \c1,  d1[0]
        vadd.i32        q14,  \c0,  \c2
        vsub.i32        q15,  \c0,  \c2
        vmla.s32        q13,  \c3,  d1[0]
        vmul.s32        q12,  q14,  d0[0]
        vmul.s32        q10,  q15,  d0[0]
        vmls.s32        q11,  \c3,  d1[1]
        @ Rounding shifts of the four intermediate values
        vrshr.s32       q13,  q13,  #14
        vrshr.s32       q12,  q12,  #14
        vrshr.s32       q10,  q10,  #14
        vrshr.s32       q11,  q11,  #14
        @ Final butterflies
        vadd.i32        \c0,  q12,  q13
        vsub.i32        \c3,  q12,  q13
        vadd.i32        \c1,  q10,  q11
        vsub.i32        \c2,  q10,  q11
.endm
260cabdff1aSopenharmony_ci
@ One pass of the 4 point inverse DCT for 12 bit content. It computes
@ the same formulas as idct4_10, but forms the products with widening
@ multiplies (64 bit) and narrows them with rounding afterwards.
@ c0-c3 are q registers and cd0-cd7 their d register halves
@ (c0 == cd0,cd1, c1 == cd2,cd3, etc).
@ Multipliers are taken from d0[0], d1[0] and d1[1].
@ Scratch registers, all hard coded: q2, q3, q8 and q10-q15 (also
@ addressed as d4, d5, d20-d29).
@ NOTE: itxfm_func4x4 invokes this with c0-c2 being q2, q3 and q8, the
@ same registers that are used as scratch here. The instruction order
@ makes sure each input is consumed before it is overwritten. Recheck
@ this before reordering or using other registers.
.macro idct4_12 c0, c1, c2, c3, cd0, cd1, cd2, cd3, cd4, cd5, cd6, cd7
        vmull.s32       q13,  \cd2, d1[1]
        vmull.s32       q15,  \cd3, d1[1]
        vmull.s32       q11,  \cd2, d1[0]
        vmull.s32       q3,   \cd3, d1[0]
        vadd.i32        q14,  \c0,  \c2
        vsub.i32        q2,   \c0,  \c2
        vmlal.s32       q13,  \cd6, d1[0]
        vmlal.s32       q15,  \cd7, d1[0]
        @ d28,d29 are the halves of q14 (sum), d4,d5 those of q2 (difference)
        vmull.s32       q12,  d28,  d0[0]
        vmull.s32       q14,  d29,  d0[0]
        vmull.s32       q10,  d4,   d0[0]
        vmull.s32       q8,   d5,   d0[0]
        vmlsl.s32       q11,  \cd6, d1[1]
        vmlsl.s32       q3,   \cd7, d1[1]
        @ Round and narrow the 64 bit values into q13, q12, q10 and q11
        vrshrn.s64      d26,  q13,  #14
        vrshrn.s64      d27,  q15,  #14
        vrshrn.s64      d24,  q12,  #14
        vrshrn.s64      d25,  q14,  #14
        vrshrn.s64      d20,  q10,  #14
        vrshrn.s64      d21,  q8,   #14
        vrshrn.s64      d22,  q11,  #14
        vrshrn.s64      d23,  q3,   #14
        @ Final butterflies
        vadd.i32        \c0,  q12,  q13
        vsub.i32        \c3,  q12,  q13
        vadd.i32        \c1,  q10,  q11
        vsub.i32        \c2,  q10,  q11
.endm
289cabdff1aSopenharmony_ci
@ One pass of the 4 point inverse ADST for 10 bit content, in place on
@ c0-c3, using 32 bit products only. cd0-cd7 are not used by this
@ variant.
@ Multipliers are taken from d2[0], d2[1], d3[0] and d3[1].
@ Scratch registers: q10-q15.
@ With the entry values of c0-c3:
@   q10 = c0 * d2[0] + c2 * d2[1] + c3 * d3[0]
@   q11 = c0 * d3[0] - c2 * d2[0] - c3 * d2[1]
@   q13 = c1 * d3[1]
@   q12 = (c0 - c2 + c3) * d3[1]
@ and the outputs, each rounded and shifted right by 14:
@   c0 = q10 + q13, c1 = q11 + q13, c2 = q12, c3 = q10 + q11 - q13
.macro iadst4_10 c0, c1, c2, c3, cd0, cd1, cd2, cd3, cd4, cd5, cd6, cd7
        vmul.s32        q10,  \c0,  d2[0]
        vmla.s32        q10,  \c2,  d2[1]
        vmla.s32        q10,  \c3,  d3[0]
        vmul.s32        q11,  \c0,  d3[0]
        vmls.s32        q11,  \c2,  d2[0]
        @ c0 is turned into c0 - c2 + c3 once its last product is issued
        vsub.s32        \c0,  \c0,  \c2
        vmls.s32        q11,  \c3,  d2[1]
        vadd.s32        \c0,  \c0,  \c3
        vmul.s32        q13,  \c1,  d3[1]
        vmul.s32        q12,  \c0,  d3[1]
        vadd.s32        q14,  q10,  q13
        vadd.s32        q15,  q11,  q13
        vrshr.s32       \c0,  q14,  #14
        vadd.s32        q10,  q10,  q11
        vrshr.s32       \c1,  q15,  #14
        vsub.s32        q10,  q10,  q13
        vrshr.s32       \c2,  q12,  #14
        vrshr.s32       \c3,  q10,  #14
.endm
310cabdff1aSopenharmony_ci
@ One pass of the 4 point inverse ADST for 12 bit content. It computes
@ the same formulas as iadst4_10, but with 64 bit intermediates: every
@ q register of 32 bit lanes is handled as two d registers, each giving
@ one q register of 64 bit products.
@ c0-c3 are q registers and cd0-cd7 their d register halves
@ (c0 == cd0,cd1, c1 == cd2,cd3, etc).
@ Multipliers are taken from d2[0], d2[1], d3[0] and d3[1].
@ Scratch registers, all hard coded: q2-q7 and q10-q15. q4-q7 are
@ callee saved, so the function using this macro has to preserve them
@ (itxfm_func4x4 does so with vpush and vpop).
@ NOTE: itxfm_func4x4 invokes this with c0 == q2 and c1 == q3, which
@ are also used as scratch here. They are only overwritten after their
@ last use as inputs, and the upper half of each result (cd1, cd3) is
@ narrowed out of q2/q3 before the lower half is written. Recheck this
@ before reordering or using other registers.
.macro iadst4_12 c0, c1, c2, c3, cd0, cd1, cd2, cd3, cd4, cd5, cd6, cd7
        vmull.s32       q10,  \cd0, d2[0]
        vmull.s32       q4,   \cd1, d2[0]
        vmlal.s32       q10,  \cd4, d2[1]
        vmlal.s32       q4,   \cd5, d2[1]
        vmlal.s32       q10,  \cd6, d3[0]
        vmlal.s32       q4,   \cd7, d3[0]
        vmull.s32       q11,  \cd0, d3[0]
        vmull.s32       q5,   \cd1, d3[0]
        vmlsl.s32       q11,  \cd4, d2[0]
        vmlsl.s32       q5,   \cd5, d2[0]
        @ c0 is turned into c0 - c2 + c3 once its last product is issued
        vsub.s32        \c0,  \c0,  \c2
        vmlsl.s32       q11,  \cd6, d2[1]
        vmlsl.s32       q5,   \cd7, d2[1]
        vadd.s32        \c0,  \c0,  \c3
        vmull.s32       q13,  \cd2, d3[1]
        vmull.s32       q6,   \cd3, d3[1]
        vmull.s32       q12,  \cd0, d3[1]
        vmull.s32       q7,   \cd1, d3[1]
        vadd.s64        q14,  q10,  q13
        vadd.s64        q2,   q4,   q6
        vadd.s64        q15,  q11,  q13
        vadd.s64        q3,   q5,   q6
        vrshrn.s64      \cd1, q2,   #14
        vrshrn.s64      \cd0, q14,  #14
        vadd.s64        q10,  q10,  q11
        vadd.s64        q4,   q4,   q5
        vrshrn.s64      \cd3, q3,   #14
        vrshrn.s64      \cd2, q15,  #14
        vsub.s64        q10,  q10,  q13
        vsub.s64        q4,   q4,   q6
        vrshrn.s64      \cd4, q12,  #14
        vrshrn.s64      \cd5, q7,   #14
        vrshrn.s64      \cd6, q10,  #14
        vrshrn.s64      \cd7, q4,   #14
.endm
347cabdff1aSopenharmony_ci
@ The public functions in this file have got the following signature:
@ void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
@
@ itxfm_func4x4 emits ff_vp9_<txfm1>_<txfm2>_4x4_add_<bpp>_neon, where
@ txfm1 and txfm2 are idct, iadst or iwht and bpp is 10 or 12.
@ Register use inside the function:
@   r0 = dst (rows of four 16 bit pixels), r1 = stride (added to r0
@   after every row), r2 = block (sixteen coefficients, read as 32 bit
@   values and cleared to zero), r3 = eob
@   r12 = address of the coefficient table
@   q0-q1 = transform multipliers, reused for the pixels at the end
@   q2, q3, q8, q9 = the four coefficient vectors
@   q14-q15 = zero, q15 later the maximum pixel value
@ The function applies txfm1, transposes, applies txfm2, round-shifts
@ the result right by 4 (not for iwht), adds it to the pixels at dst
@ and stores the sum clamped to 0 .. (1 << bpp) - 1.
@ q4-q7 are only saved and restored in the 12 bit functions that
@ involve iadst, since iadst4_12 is the only transform touching them.
.macro itxfm_func4x4 txfm1, txfm2, bpp
function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_\bpp\()_neon, export=1
        @ Load and widen only the multipliers that are needed: idct ones
        @ into q0, iadst ones into q1, or both for mixed transforms.
.ifc \txfm1,\txfm2
.ifc \txfm1,idct
        movrel          r12, itxfm4_coeffs
        vld1.16         {d0}, [r12,:64]
        vmovl.s16       q0,  d0
.endif
.ifc \txfm1,iadst
        movrel          r12, iadst4_coeffs
        vld1.16         {d1}, [r12,:64]
        vmovl.s16       q1,  d1
.endif
.else
        movrel          r12, itxfm4_coeffs
        vld1.16         {q0}, [r12,:128]
        vmovl.s16       q1,  d1
        vmovl.s16       q0,  d0
.endif
.if \bpp > 10
.ifnc \txfm1\()_\txfm2,idct_idct
.ifnc \txfm1,iwht
        @ iadst4_12 needs q4-q7
        vpush           {q4-q7}
.endif
.endif
.endif

        vmov.i32        q14, #0
        vmov.i32        q15, #0
.ifc \txfm1\()_\txfm2,idct_idct
        cmp             r3,  #1
        bne             1f
        @ DC-only for idct/idct
        @ The DC value is scaled by d0[0] once per pass, the coefficient
        @ is cleared and the result is broadcast to all sixteen outputs.
        vld1.32         {d4[]},   [r2,:32]
        vmull.s32       q2,  d4,  d0[0]
        vrshrn.s64      d4,  q2,  #14
        vmull.s32       q2,  d4,  d0[0]
        vrshrn.s64      d4,  q2,  #14
        vst1.32         {d30[0]}, [r2,:32]
        vdup.32         q2,  d4[0]
        vmov            q3,  q2
        vmov            q8,  q2
        vmov            q9,  q2
        b               2f
.endif

1:
        @ Load all sixteen coefficients and overwrite them with zeros
        vld1.32         {q2-q3},   [r2,:128]
        vst1.32         {q14-q15}, [r2,:128]!
        vld1.32         {q8-q9},   [r2,:128]

.ifc \txfm1,iwht
        @ The iwht input is scaled down by 4 first
        vshr.s32        q2,  q2,  #2
        vshr.s32        q3,  q3,  #2
        vshr.s32        q8,  q8,  #2
        vshr.s32        q9,  q9,  #2
.endif

        vst1.16         {q14-q15}, [r2,:128]!
        \txfm1\()4_\bpp q2,  q3,  q8,  q9,  d4,  d5,  d6,  d7,  d16, d17, d18, d19

        @ Transpose 4x4 with 32 bit elements
        vtrn.32         q2,  q3
        vtrn.32         q8,  q9
        vswp            d5,  d16
        vswp            d7,  d18

        \txfm2\()4_\bpp q2,  q3,  q8,  q9,  d4,  d5,  d6,  d7,  d16, d17, d18, d19
2:
        @ q15 = (1 << bpp) - 1 in every 16 bit lane, the maximum pixel value
        vmvn.u16        q15, #((0xffff << \bpp) & 0xffff)
        vld1.16         {d0},  [r0,:64], r1
        vld1.16         {d1},  [r0,:64], r1
.ifnc \txfm1,iwht
        vrshr.s32       q2,  q2,  #4
        vrshr.s32       q3,  q3,  #4
        vrshr.s32       q8,  q8,  #4
        vrshr.s32       q9,  q9,  #4
.endif
        @ Add the widened pixels, saturate back to unsigned 16 bit and
        @ clamp to the maximum pixel value
        vaddw.u16       q2,  q2,  d0
        vaddw.u16       q3,  q3,  d1
        vld1.16         {d2},  [r0,:64], r1
        vld1.16         {d3},  [r0,:64], r1
        vqmovun.s32     d0,  q2
        vqmovun.s32     d1,  q3
        @ Rewind dst by four rows for the stores
        sub             r0,  r0,  r1, lsl #2

        vaddw.u16       q8,  q8,  d2
        vmin.u16        q0,  q0,  q15
        vaddw.u16       q9,  q9,  d3
        vst1.16         {d0},  [r0,:64], r1
        vqmovun.s32     d2,  q8
        vqmovun.s32     d3,  q9
        vmin.u16        q1,  q1,  q15

        vst1.16         {d1},  [r0,:64], r1
        vst1.16         {d2},  [r0,:64], r1
        vst1.16         {d3},  [r0,:64], r1

        @ Must mirror the condition used for the vpush above
.if \bpp > 10
.ifnc \txfm1\()_\txfm2,idct_idct
.ifnc \txfm1,iwht
        vpop            {q4-q7}
.endif
.endif
.endif
        bx              lr
endfunc
.endm
456cabdff1aSopenharmony_ci
@ Emit all 4x4 transform functions for one bit depth: the four
@ combinations of idct and iadst, plus iwht/iwht.
.macro itxfm_funcs4x4 bpp
itxfm_func4x4 idct,  idct,  \bpp
itxfm_func4x4 iadst, idct,  \bpp
itxfm_func4x4 idct,  iadst, \bpp
itxfm_func4x4 iadst, iadst, \bpp
itxfm_func4x4 iwht,  iwht,  \bpp
.endm

@ Instantiate the 10 bit and then the 12 bit functions
.irp depth, 10, 12
itxfm_funcs4x4 \depth
.endr
467cabdff1aSopenharmony_ci
@ 8 point inverse DCT, in place on q8-q15 (32 bit elements).
@ On exit q8 = out[0], q9 = out[1], ... q15 = out[7].
@ The multipliers are expected in q0-q1 and are read from d0[0], d1[0],
@ d1[1], d2[0], d2[1], d3[0] and d3[1]. Presumably these are the first
@ eight values of idct_coeffs, widened to 32 bit (the code loading them
@ for this macro is not in this part of the file).
@ Scratch registers: q2-q5.
@ Temporaries are shared between the steps, so the order of the
@ invocations must be kept.
.macro idct8
        @ q8 = t0a, q12 = t1a
        dmbutterfly0    d16, d17, d24, d25, q8,  q12, q2, q4, d4, d5, d8, d9, q3, q2, q5, q4
        @ q10 = t2a, q14 = t3a
        dmbutterfly     d20, d21, d28, d29, d1[0], d1[1], q2,  q3,  q4,  q5
        @ q9  = t4a, q15 = t7a
        dmbutterfly     d18, d19, d30, d31, d2[0], d2[1], q2,  q3,  q4,  q5
        @ q13 = t5a, q11 = t6a
        dmbutterfly     d26, d27, d22, d23, d3[0], d3[1], q2,  q3,  q4,  q5

        @ q2 = t0, q14 = t3
        butterfly       q2,  q14, q8,  q14
        @ q3 = t1, q10 = t2
        butterfly       q3,  q10, q12, q10
        @ q4 = t4, q13 = t5a
        butterfly       q4,  q13, q9,  q13
        @ q5 = t7, q11 = t6a
        butterfly       q5,  q11, q15, q11

        @ q8 = out[0], q15 = out[7]
        butterfly       q8,  q15, q2,  q5

        @ q2 = t6, q5 = t5
        dmbutterfly0    d4,  d5,  d10, d11, q11, q13, q9,  q13, d18, d19, d26, d27, q2,  q5, q11, q12

        @ q11 = out[3], q12 = out[4]
        butterfly       q11, q12, q14, q4
        @ q9 = out[1],  q14 = out[6]
        butterfly       q9,  q14, q3,  q2
        @ q13 = out[5], q10 = out[2]
        butterfly_r     q13, q10, q10, q5
.endm
487cabdff1aSopenharmony_ci
@ 8 point inverse ADST, in place on q8-q15 (32 bit elements).
@ On exit q8 = out[0], q9 = out[1], ... q15 = out[7].
@ The macro loads its own multipliers: first iadst8_coeffs, then the
@ first eight values of idct_coeffs. The latter are left in q0-q1
@ (widened to 32 bit) on exit.
@ Clobbers r12 and q0-q7.
@ Temporaries are shared between the steps, so the order of the
@ invocations must be kept.
.macro iadst8
        @ q0,q1 = the eight iadst8 multipliers, widened to 32 bit
        movrel          r12, iadst8_coeffs
        vld1.16         {q1}, [r12,:128]!
        vmovl.s16       q0,  d2
        vmovl.s16       q1,  d3

        @ q4,q5  = t1a, q2,q3 = t0a
        dmbutterfly_l   q4,  q5,  q2,  q3,  d30, d31, d16, d17, d0[1], d0[0]
        @ q8,q15 = t5a, q6,q7 = t4a
        dmbutterfly_l   q8,  q15, q6,  q7,  d22, d23, d24, d25, d2[1], d2[0]

        @ q11 = t0, q2 = t4
        dbutterfly_n    d22, d23, d4,  d5,  q2,  q3,  q6,  q7,  q11, q12, q2,  q3

        @ q12 = t1, q3 = t5
        dbutterfly_n    d24, d25, d6,  d7,  q4,  q5,  q8,  q15, q12, q3,  q6,  q7

        @ q6,q7 = t3a, q4,q5 = t2a
        dmbutterfly_l   q6,  q7,  q4,  q5,  d26, d27, d20, d21, d1[1], d1[0]
        @ q10,q13 = t7a, q8,q15 = t6a
        dmbutterfly_l   q10, q13, q8,  q15, d18, d19, d28, d29, d3[1], d3[0]

        @ q9 = t2, q4 = t6
        dbutterfly_n    d18, d19, d8,  d9,  q4,  q5,  q8,  q15, q9,  q14, q4, q5
        @ q8 = t3, q6 = t7
        dbutterfly_n    d16, d17, d12, d13, q6,  q7,  q10, q13, q8,  q15, q6, q7

        @ The remaining steps use the idct multipliers: q0,q1 = the first
        @ eight values of idct_coeffs, widened to 32 bit
        movrel          r12, idct_coeffs
        vld1.16         {q0}, [r12,:128]
        vmovl.s16       q1,  d1
        vmovl.s16       q0,  d0

        @ q15 = -out[7], q12 = t3
        butterfly       q15, q12, q12, q8
        @ q15 = out[7]
        vneg.s32        q15, q15
        @ q8 = out[0], q9 = t2
        butterfly       q8,  q9,  q11, q9

        @ q10,q11 = t5a, q5,q7 = t4a
        dmbutterfly_l   q10, q11, q5,  q7,  d4,  d5,  d6,  d7,  d1[0], d1[1]
        @ q2,q3 = t6a, q13,q14 = t7a
        dmbutterfly_l   q2,  q3,  q13, q14, d12, d13, d8,  d9,  d1[1], d1[0]

        @ q14 = out[6], q4 = t7
        dbutterfly_n    d28, d29, d8,  d9,  q10, q11, q13, q14, q4,  q6,  q10, q11

        @ q11 = -out[3], q12 = out[4]
        dmbutterfly0    d22, d23, d24, d25, q9,  q12, q6, q13, d12, d13, d26, d27, q9, q10
        @ q11 = out[3]
        vneg.s32        q11, q11

        @ q9 = -out[1], q2 = t6
        dbutterfly_n    d18, d19, d4,  d5,  q5,  q7,  q2,  q3,  q9, q10, q2,  q3
        @ q9 = out[1]
        vneg.s32        q9,  q9

        @ q10 = out[2], q13 = -out[5]
        dmbutterfly0    d20, d21, d26, d27, q2,  q4,  q3, q5,  d6,  d7,  d10, d11, q6,  q7
        @ q13 = out[5]
        vneg.s32        q13, q13
.endm
530cabdff1aSopenharmony_ci
@ DC-only 8x8 inverse DCT + add on 16-bit pixels.
@ r0 = dst
@ r1 = dst stride
@ r2 = coefficients (only the first 32-bit value is read, then zeroed)
@ r8 = maximum pixel value, used to clamp the output
@ Reached by a branch from vp9_idct_idct_8x8_add_16_neon, so it returns by
@ popping the r4-r8/lr frame that the exported wrapper pushed.
531cabdff1aSopenharmony_cifunction idct8x8_dc_add_neon
532cabdff1aSopenharmony_ci        movrel          r12, idct_coeffs
533cabdff1aSopenharmony_ci        vld1.16         {d0}, [r12,:64]
534cabdff1aSopenharmony_ci
535cabdff1aSopenharmony_ci        vmov.i32        q2,  #0
536cabdff1aSopenharmony_ci        vmovl.s16       q0,  d0
537cabdff1aSopenharmony_ci
        @ Scale the DC coefficient twice by idct_coeffs[0] (11585), each
        @ time with a rounding narrowing shift by 14, broadcast it to all
        @ lanes of q8 and clear it in the coefficient buffer.
538cabdff1aSopenharmony_ci        vld1.32         {d16[]}, [r2,:32]
539cabdff1aSopenharmony_ci        vmull.s32       q8,  d16, d0[0]
540cabdff1aSopenharmony_ci        vrshrn.s64      d16, q8,  #14
541cabdff1aSopenharmony_ci        vmull.s32       q8,  d16, d0[0]
542cabdff1aSopenharmony_ci        vrshrn.s64      d16, q8,  #14
543cabdff1aSopenharmony_ci        vdup.32         q8,  d16[0]
544cabdff1aSopenharmony_ci        vst1.32         {d4[0]}, [r2,:32]
545cabdff1aSopenharmony_ci
        @ Final rounding shift of the DC term; q15 = per-lane pixel maximum.
546cabdff1aSopenharmony_ci        vrshr.s32       q8,  q8,  #5
547cabdff1aSopenharmony_ci        vdup.s16        q15, r8
548cabdff1aSopenharmony_ci
        @ r0 is the read pointer, r3 the write pointer; r12 counts the
        @ remaining rows (two rows are handled per iteration).
549cabdff1aSopenharmony_ci        mov             r3,  r0
550cabdff1aSopenharmony_ci        mov             r12, #8
551cabdff1aSopenharmony_ci1:
552cabdff1aSopenharmony_ci        @ Loop to add the constant from q8 into all 8x8 outputs
553cabdff1aSopenharmony_ci        subs            r12, r12, #2
554cabdff1aSopenharmony_ci        vld1.16         {q2},  [r0,:128], r1
555cabdff1aSopenharmony_ci        vaddw.u16       q10, q8,  d4
556cabdff1aSopenharmony_ci        vld1.16         {q3},  [r0,:128], r1
557cabdff1aSopenharmony_ci        vaddw.u16       q11, q8,  d5
558cabdff1aSopenharmony_ci        vaddw.u16       q12, q8,  d6
559cabdff1aSopenharmony_ci        vaddw.u16       q13, q8,  d7
        @ Narrow back to 16 bit with unsigned saturation (negative sums
        @ become 0), then clamp to the pixel maximum in q15.
560cabdff1aSopenharmony_ci        vqmovun.s32     d4,  q10
561cabdff1aSopenharmony_ci        vqmovun.s32     d5,  q11
562cabdff1aSopenharmony_ci        vqmovun.s32     d6,  q12
563cabdff1aSopenharmony_ci        vqmovun.s32     d7,  q13
564cabdff1aSopenharmony_ci        vmin.u16        q2,  q2,  q15
565cabdff1aSopenharmony_ci        vst1.16         {q2},  [r3,:128], r1
566cabdff1aSopenharmony_ci        vmin.u16        q3,  q3,  q15
567cabdff1aSopenharmony_ci        vst1.16         {q3},  [r3,:128], r1
568cabdff1aSopenharmony_ci        bne             1b
569cabdff1aSopenharmony_ci
570cabdff1aSopenharmony_ci        pop             {r4-r8,pc}
571cabdff1aSopenharmony_ciendfunc
572cabdff1aSopenharmony_ci.ltorg
573cabdff1aSopenharmony_ci
@ Emits the two 1-D helper functions (pass 1 and pass 2) for the 8-point
@ transform named by the txfm argument; the macro with that name plus an
@ "8" suffix (idct8 / iadst8) is expanded inline in each of them.
574cabdff1aSopenharmony_ci.macro itxfm8_1d_funcs txfm
575cabdff1aSopenharmony_ci@ Read a vertical 4x8 slice out of an 8x8 matrix, do a transform on it,
576cabdff1aSopenharmony_ci@ transpose into a horizontal 8x4 slice and store.
577cabdff1aSopenharmony_ci@ r0 = dst (temp buffer)
578cabdff1aSopenharmony_ci@ r1 = slice offset
579cabdff1aSopenharmony_ci@ r2 = src
580cabdff1aSopenharmony_cifunction \txfm\()8_1d_4x8_pass1_neon
        @ r12 = 32 bytes = one row of eight 32-bit coefficients. Each of the
        @ eight rows contributes four coefficients to q8-q15, and the source
        @ is overwritten with zeros (q2) as it is read.
581cabdff1aSopenharmony_ci        mov             r12, #32
582cabdff1aSopenharmony_ci        vmov.s32        q2,  #0
583cabdff1aSopenharmony_ci.irp i, 8, 9, 10, 11, 12, 13, 14, 15
584cabdff1aSopenharmony_ci        vld1.32         {q\i}, [r2,:128]
585cabdff1aSopenharmony_ci        vst1.32         {q2},  [r2,:128], r12
586cabdff1aSopenharmony_ci.endr
587cabdff1aSopenharmony_ci
588cabdff1aSopenharmony_ci        \txfm\()8
589cabdff1aSopenharmony_ci
590cabdff1aSopenharmony_ci        @ Do two 4x4 transposes. Originally, q8-q15 contain the
591cabdff1aSopenharmony_ci        @ 8 rows. Afterwards, q8-q11, q12-q15 contain the transposed
592cabdff1aSopenharmony_ci        @ 4x4 blocks.
593cabdff1aSopenharmony_ci        transpose32_q_2x_4x4 q8,  q9,  q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
594cabdff1aSopenharmony_ci
595cabdff1aSopenharmony_ci        @ Store the transposed 4x4 blocks horizontally.
        @ The interleaved register order below makes each 32-byte output
        @ row consist of one register from q8-q11 followed by the matching
        @ one from q12-q15.
596cabdff1aSopenharmony_ci        cmp             r1,  #4
597cabdff1aSopenharmony_ci        beq             1f
598cabdff1aSopenharmony_ci.irp i, 8, 12, 9, 13, 10, 14, 11, 15
599cabdff1aSopenharmony_ci        vst1.32         {q\i}, [r0,:128]!
600cabdff1aSopenharmony_ci.endr
601cabdff1aSopenharmony_ci        bx              lr
602cabdff1aSopenharmony_ci1:
603cabdff1aSopenharmony_ci        @ Special case: For the last input column (r1 == 4),
604cabdff1aSopenharmony_ci        @ which would be stored as the last row in the temp buffer,
605cabdff1aSopenharmony_ci        @ don't store the first 4x4 block, but keep it in registers
606cabdff1aSopenharmony_ci        @ for the first slice of the second pass (where it is the
607cabdff1aSopenharmony_ci        @ last 4x4 block).
608cabdff1aSopenharmony_ci.irp i, 12, 13, 14, 15
609cabdff1aSopenharmony_ci        add             r0,  r0,  #16
610cabdff1aSopenharmony_ci        vst1.32         {q\i}, [r0,:128]!
611cabdff1aSopenharmony_ci.endr
        @ Hand the unstored block over in q12-q15, which is where pass 2
        @ expects it when its slice offset is 0.
612cabdff1aSopenharmony_ci        vmov            q12, q8
613cabdff1aSopenharmony_ci        vmov            q13, q9
614cabdff1aSopenharmony_ci        vmov            q14, q10
615cabdff1aSopenharmony_ci        vmov            q15, q11
616cabdff1aSopenharmony_ci        bx              lr
617cabdff1aSopenharmony_ciendfunc
618cabdff1aSopenharmony_ci
619cabdff1aSopenharmony_ci@ Read a vertical 4x8 slice out of an 8x8 matrix, do a transform on it,
620cabdff1aSopenharmony_ci@ load the destination pixels (from a similar 4x8 slice), add and store back.
621cabdff1aSopenharmony_ci@ r0 = dst
622cabdff1aSopenharmony_ci@ r1 = dst stride
623cabdff1aSopenharmony_ci@ r2 = src (temp buffer)
624cabdff1aSopenharmony_ci@ r3 = slice offset
@ r8 = maximum pixel value, used to clamp the output
625cabdff1aSopenharmony_cifunction \txfm\()8_1d_4x8_pass2_neon
626cabdff1aSopenharmony_ci        mov             r12, #32
627cabdff1aSopenharmony_ci.irp i, 8, 9, 10, 11
628cabdff1aSopenharmony_ci        vld1.32         {q\i}, [r2,:128], r12
629cabdff1aSopenharmony_ci.endr
        @ For slice offset 0 the last four rows are not loaded; q12-q15
        @ are used as they were left by the preceding code.
630cabdff1aSopenharmony_ci        cmp             r3,  #0
631cabdff1aSopenharmony_ci        beq             1f
632cabdff1aSopenharmony_ci.irp i, 12, 13, 14, 15
633cabdff1aSopenharmony_ci        vld1.32         {q\i}, [r2,:128], r12
634cabdff1aSopenharmony_ci.endr
635cabdff1aSopenharmony_ci1:
636cabdff1aSopenharmony_ci
        @ From here on r3 points one row below r0 and r1 is twice the
        @ stride, so r0 and r3 walk the even and odd rows respectively.
637cabdff1aSopenharmony_ci        add             r3,  r0,  r1
638cabdff1aSopenharmony_ci        lsl             r1,  r1,  #1
639cabdff1aSopenharmony_ci        \txfm\()8
640cabdff1aSopenharmony_ci
641cabdff1aSopenharmony_ci        vdup.s16        q4,  r8
        @ load_add_store: load four rows of four pixels, round the four
        @ coefficient vectors with a shift by 5, add the pixels, saturate
        @ to unsigned 16 bit, clamp to q4 and store back to the same rows.
642cabdff1aSopenharmony_ci.macro load_add_store coef0, coef1, coef2, coef3
643cabdff1aSopenharmony_ci        vld1.16         {d4},   [r0,:64], r1
644cabdff1aSopenharmony_ci        vld1.16         {d5},   [r3,:64], r1
645cabdff1aSopenharmony_ci        vld1.16         {d6},   [r0,:64], r1
646cabdff1aSopenharmony_ci        vld1.16         {d7},   [r3,:64], r1
647cabdff1aSopenharmony_ci
648cabdff1aSopenharmony_ci        vrshr.s32       \coef0, \coef0, #5
649cabdff1aSopenharmony_ci        vrshr.s32       \coef1, \coef1, #5
650cabdff1aSopenharmony_ci        vrshr.s32       \coef2, \coef2, #5
651cabdff1aSopenharmony_ci        vrshr.s32       \coef3, \coef3, #5
652cabdff1aSopenharmony_ci
653cabdff1aSopenharmony_ci        vaddw.u16       \coef0, \coef0, d4
654cabdff1aSopenharmony_ci        vaddw.u16       \coef1, \coef1, d5
655cabdff1aSopenharmony_ci        vaddw.u16       \coef2, \coef2, d6
656cabdff1aSopenharmony_ci        vaddw.u16       \coef3, \coef3, d7
657cabdff1aSopenharmony_ci
        @ Rewind both pointers to the rows that were just loaded.
658cabdff1aSopenharmony_ci        sub             r0,  r0,  r1, lsl #1
659cabdff1aSopenharmony_ci        sub             r3,  r3,  r1, lsl #1
660cabdff1aSopenharmony_ci
661cabdff1aSopenharmony_ci        vqmovun.s32     d4,  \coef0
662cabdff1aSopenharmony_ci        vqmovun.s32     d5,  \coef1
663cabdff1aSopenharmony_ci        vqmovun.s32     d6,  \coef2
664cabdff1aSopenharmony_ci        vqmovun.s32     d7,  \coef3
665cabdff1aSopenharmony_ci
666cabdff1aSopenharmony_ci        vmin.u16        q2,  q2,  q4
667cabdff1aSopenharmony_ci        vmin.u16        q3,  q3,  q4
668cabdff1aSopenharmony_ci
669cabdff1aSopenharmony_ci        vst1.16         {d4},  [r0,:64], r1
670cabdff1aSopenharmony_ci        vst1.16         {d5},  [r3,:64], r1
671cabdff1aSopenharmony_ci        vst1.16         {d6},  [r0,:64], r1
672cabdff1aSopenharmony_ci        vst1.16         {d7},  [r3,:64], r1
673cabdff1aSopenharmony_ci.endm
674cabdff1aSopenharmony_ci        load_add_store  q8,  q9,  q10, q11
675cabdff1aSopenharmony_ci        load_add_store  q12, q13, q14, q15
676cabdff1aSopenharmony_ci.purgem load_add_store
677cabdff1aSopenharmony_ci
678cabdff1aSopenharmony_ci        bx              lr
679cabdff1aSopenharmony_ciendfunc
680cabdff1aSopenharmony_ci.endm
681cabdff1aSopenharmony_ci
@ Instantiate the 8-point pass 1 / pass 2 helpers for both transform types.
682cabdff1aSopenharmony_ciitxfm8_1d_funcs idct
683cabdff1aSopenharmony_ciitxfm8_1d_funcs iadst
684cabdff1aSopenharmony_ci
@ Emits the 8x8 inverse transform + add for one txfm1/txfm2 combination:
@ a shared core function plus the exported 10 and 12 bit entry points.
@ The entry points push {r4-r8,lr}, put the maximum pixel value for the
@ bit depth in r8 and branch to the core, which pops that frame on return.
@ r0 = dst
@ r1 = dst stride
@ r2 = coefficients
@ r3 = presumably the eob (end of block) value from the caller: 1 selects
@      the DC-only path and, for idct_idct, a value of 12 or less skips
@      the second pass 1 slice -- TODO confirm against the C prototype
@ r8 = maximum pixel value
685cabdff1aSopenharmony_ci.macro itxfm_func8x8 txfm1, txfm2
686cabdff1aSopenharmony_cifunction vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
687cabdff1aSopenharmony_ci.ifc \txfm1\()_\txfm2,idct_idct
688cabdff1aSopenharmony_ci        cmp             r3,  #1
689cabdff1aSopenharmony_ci        beq             idct8x8_dc_add_neon
690cabdff1aSopenharmony_ci.endif
        @ Variants involving iadst use q6/q7 as well (see the iadst8 macro),
        @ so they save q4-q7; pure idct only saves q4-q5.
691cabdff1aSopenharmony_ci.ifnc \txfm1\()_\txfm2,idct_idct
692cabdff1aSopenharmony_ci        vpush           {q4-q7}
693cabdff1aSopenharmony_ci.else
694cabdff1aSopenharmony_ci        vpush           {q4-q5}
695cabdff1aSopenharmony_ci.endif
696cabdff1aSopenharmony_ci
697cabdff1aSopenharmony_ci        @ Align the stack, allocate a temp buffer
        @ r7 = 256 bytes (8x8 32-bit coefficients) plus the low four bits of
        @ sp, so that sp is 16-byte aligned afterwards; r7 is kept to undo
        @ the adjustment at the end.
698cabdff1aSopenharmony_ciT       mov             r7,  sp
699cabdff1aSopenharmony_ciT       and             r7,  r7,  #15
700cabdff1aSopenharmony_ciA       and             r7,  sp,  #15
701cabdff1aSopenharmony_ci        add             r7,  r7,  #256
702cabdff1aSopenharmony_ci        sub             sp,  sp,  r7
703cabdff1aSopenharmony_ci
        @ Keep dst, stride and the coefficient pointer in r4-r6, since r0-r2
        @ are reused as arguments for the pass functions.
704cabdff1aSopenharmony_ci        mov             r4,  r0
705cabdff1aSopenharmony_ci        mov             r5,  r1
706cabdff1aSopenharmony_ci        mov             r6,  r2
707cabdff1aSopenharmony_ci
708cabdff1aSopenharmony_ci.ifc \txfm1,idct
709cabdff1aSopenharmony_ci        movrel          r12, idct_coeffs
710cabdff1aSopenharmony_ci        vld1.16         {q0}, [r12,:128]
711cabdff1aSopenharmony_ci        vmovl.s16       q1,  d1
712cabdff1aSopenharmony_ci        vmovl.s16       q0,  d0
713cabdff1aSopenharmony_ci.endif
714cabdff1aSopenharmony_ci
        @ First pass: two slices of four input columns each, written to
        @ the temp buffer at byte offsets 0 and 128.
715cabdff1aSopenharmony_ci.irp i, 0, 4
716cabdff1aSopenharmony_ci        add             r0,  sp,  #(\i*32)
717cabdff1aSopenharmony_ci.ifc \txfm1\()_\txfm2,idct_idct
718cabdff1aSopenharmony_ci.if \i == 4
719cabdff1aSopenharmony_ci        cmp             r3,  #12
720cabdff1aSopenharmony_ci        ble             1f
721cabdff1aSopenharmony_ci.endif
722cabdff1aSopenharmony_ci.endif
723cabdff1aSopenharmony_ci        mov             r1,  #\i
724cabdff1aSopenharmony_ci        add             r2,  r6,  #(\i*4)
725cabdff1aSopenharmony_ci        bl              \txfm1\()8_1d_4x8_pass1_neon
726cabdff1aSopenharmony_ci.endr
727cabdff1aSopenharmony_ci.ifc \txfm1\()_\txfm2,idct_idct
728cabdff1aSopenharmony_ci        b               3f
729cabdff1aSopenharmony_ci1:
730cabdff1aSopenharmony_ci        @ For all-zero slices in pass 1, set q12-q15 to zero, for the in-register
731cabdff1aSopenharmony_ci        @ passthrough of coefficients to pass 2 and clear the end of the temp buffer
732cabdff1aSopenharmony_ci        vmov.i32        q12, #0
733cabdff1aSopenharmony_ci        vmov.i32        q13, #0
734cabdff1aSopenharmony_ci        vmov.i32        q14, #0
735cabdff1aSopenharmony_ci        vmov.i32        q15, #0
736cabdff1aSopenharmony_ci.rept 4
737cabdff1aSopenharmony_ci        vst1.32         {q12-q13}, [r0,:128]!
738cabdff1aSopenharmony_ci.endr
739cabdff1aSopenharmony_ci3:
740cabdff1aSopenharmony_ci.endif
        @ iadst8 finishes with idct_coeffs[0..7] in q0/q1 but the first pass
        @ was iadst here, so reload the full table before an idct second pass.
741cabdff1aSopenharmony_ci.ifc \txfm1\()_\txfm2,iadst_idct
742cabdff1aSopenharmony_ci        movrel          r12, idct_coeffs
743cabdff1aSopenharmony_ci        vld1.16         {q0}, [r12,:128]
744cabdff1aSopenharmony_ci        vmovl.s16       q1,  d1
745cabdff1aSopenharmony_ci        vmovl.s16       q0,  d0
746cabdff1aSopenharmony_ci.endif
        @ Second pass: two slices of four columns each, read from the temp
        @ buffer and added to the destination pixels.
747cabdff1aSopenharmony_ci.irp i, 0, 4
748cabdff1aSopenharmony_ci        add             r0,  r4,  #(\i*2)
749cabdff1aSopenharmony_ci        mov             r1,  r5
750cabdff1aSopenharmony_ci        add             r2,  sp,  #(\i*4)
751cabdff1aSopenharmony_ci        mov             r3,  #\i
752cabdff1aSopenharmony_ci        bl              \txfm2\()8_1d_4x8_pass2_neon
753cabdff1aSopenharmony_ci.endr
754cabdff1aSopenharmony_ci
755cabdff1aSopenharmony_ci        add             sp,  sp,  r7
756cabdff1aSopenharmony_ci.ifnc \txfm1\()_\txfm2,idct_idct
757cabdff1aSopenharmony_ci        vpop            {q4-q7}
758cabdff1aSopenharmony_ci.else
759cabdff1aSopenharmony_ci        vpop            {q4-q5}
760cabdff1aSopenharmony_ci.endif
761cabdff1aSopenharmony_ci        pop             {r4-r8,pc}
762cabdff1aSopenharmony_ciendfunc
763cabdff1aSopenharmony_ci
        @ Exported entry point for 10 bit content: r8 = 0x03ff pixel maximum.
764cabdff1aSopenharmony_cifunction ff_vp9_\txfm1\()_\txfm2\()_8x8_add_10_neon, export=1
765cabdff1aSopenharmony_ci        push            {r4-r8,lr}
766cabdff1aSopenharmony_ci        movw            r8,  #0x03ff
767cabdff1aSopenharmony_ci        b               vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
768cabdff1aSopenharmony_ciendfunc
769cabdff1aSopenharmony_ci
        @ Exported entry point for 12 bit content: r8 = 0x0fff pixel maximum.
770cabdff1aSopenharmony_cifunction ff_vp9_\txfm1\()_\txfm2\()_8x8_add_12_neon, export=1
771cabdff1aSopenharmony_ci        push            {r4-r8,lr}
772cabdff1aSopenharmony_ci        movw            r8,  #0x0fff
773cabdff1aSopenharmony_ci        b               vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
774cabdff1aSopenharmony_ciendfunc
775cabdff1aSopenharmony_ci.endm
776cabdff1aSopenharmony_ci
@ Instantiate the 8x8 functions for all four first/second pass combinations.
777cabdff1aSopenharmony_ciitxfm_func8x8 idct,  idct
778cabdff1aSopenharmony_ciitxfm_func8x8 iadst, idct
779cabdff1aSopenharmony_ciitxfm_func8x8 idct,  iadst
780cabdff1aSopenharmony_ciitxfm_func8x8 iadst, iadst
781cabdff1aSopenharmony_ci
@ DC-only 16x16 inverse DCT + add on 16-bit pixels.
@ r0 = dst
@ r1 = dst stride
@ r2 = coefficients (only the first 32-bit value is read, then zeroed)
@ r9 = maximum pixel value, used to clamp the output
@ Returns with pop {r4-r9,pc}, so it presumably is reached by a branch from
@ a function that pushed {r4-r9,lr} -- the caller is not in this part of
@ the file, confirm there.
782cabdff1aSopenharmony_cifunction idct16x16_dc_add_neon
783cabdff1aSopenharmony_ci        movrel          r12, idct_coeffs
784cabdff1aSopenharmony_ci        vld1.16         {d0}, [r12,:64]
785cabdff1aSopenharmony_ci
786cabdff1aSopenharmony_ci        vmov.i32        q2,  #0
787cabdff1aSopenharmony_ci        vmovl.s16       q0,  d0
788cabdff1aSopenharmony_ci
        @ Scale the DC coefficient twice by idct_coeffs[0] (11585), each
        @ time with a rounding narrowing shift by 14, broadcast it to all
        @ lanes of q8 and clear it in the coefficient buffer.
789cabdff1aSopenharmony_ci        vld1.32         {d16[]}, [r2,:32]
790cabdff1aSopenharmony_ci        vmull.s32       q8,  d16, d0[0]
791cabdff1aSopenharmony_ci        vrshrn.s64      d16, q8,  #14
792cabdff1aSopenharmony_ci        vmull.s32       q8,  d16, d0[0]
793cabdff1aSopenharmony_ci        vrshrn.s64      d16, q8,  #14
794cabdff1aSopenharmony_ci        vdup.32         q8,  d16[0]
795cabdff1aSopenharmony_ci        vst1.32         {d4[0]}, [r2,:32]
796cabdff1aSopenharmony_ci
        @ Final rounding shift of the DC term; q15 = per-lane pixel maximum.
797cabdff1aSopenharmony_ci        vrshr.s32       q8,  q8,  #6
798cabdff1aSopenharmony_ci        vdup.s16        q15, r9
799cabdff1aSopenharmony_ci
        @ r0 is the read pointer, r3 the write pointer; r12 counts the
        @ remaining rows (two rows of 16 pixels are handled per iteration).
        @ q0-q3 are free to hold pixels now, the coefficient in d0 is no
        @ longer needed.
800cabdff1aSopenharmony_ci        mov             r3,  r0
801cabdff1aSopenharmony_ci        mov             r12, #16
802cabdff1aSopenharmony_ci1:
803cabdff1aSopenharmony_ci        @ Loop to add the constant from q8 into all 16x16 outputs
804cabdff1aSopenharmony_ci        subs            r12, r12, #2
805cabdff1aSopenharmony_ci        vld1.16         {q0-q1},  [r0,:128], r1
806cabdff1aSopenharmony_ci        vaddw.u16       q9,  q8,  d0
807cabdff1aSopenharmony_ci        vaddw.u16       q10, q8,  d1
808cabdff1aSopenharmony_ci        vld1.16         {q2-q3},  [r0,:128], r1
809cabdff1aSopenharmony_ci        vaddw.u16       q11, q8,  d2
810cabdff1aSopenharmony_ci        vaddw.u16       q12, q8,  d3
811cabdff1aSopenharmony_ci        vaddw.u16       q13, q8,  d4
812cabdff1aSopenharmony_ci        vaddw.u16       q14, q8,  d5
        @ q9/q10 are reused for the last two sums (d6, d7) as soon as
        @ their first results have been narrowed.
813cabdff1aSopenharmony_ci        vqmovun.s32     d0,  q9
814cabdff1aSopenharmony_ci        vaddw.u16       q9,  q8,  d6
815cabdff1aSopenharmony_ci        vqmovun.s32     d1,  q10
816cabdff1aSopenharmony_ci        vaddw.u16       q10, q8,  d7
817cabdff1aSopenharmony_ci        vqmovun.s32     d2,  q11
818cabdff1aSopenharmony_ci        vqmovun.s32     d3,  q12
819cabdff1aSopenharmony_ci        vqmovun.s32     d4,  q13
820cabdff1aSopenharmony_ci        vqmovun.s32     d5,  q14
821cabdff1aSopenharmony_ci        vmin.u16        q0,  q0,  q15
822cabdff1aSopenharmony_ci        vmin.u16        q1,  q1,  q15
823cabdff1aSopenharmony_ci        vqmovun.s32     d6,  q9
824cabdff1aSopenharmony_ci        vqmovun.s32     d7,  q10
825cabdff1aSopenharmony_ci        vst1.16         {q0-q1},  [r3,:128], r1
826cabdff1aSopenharmony_ci        vmin.u16        q2,  q2,  q15
827cabdff1aSopenharmony_ci        vmin.u16        q3,  q3,  q15
828cabdff1aSopenharmony_ci        vst1.16         {q2-q3},  [r3,:128], r1
829cabdff1aSopenharmony_ci        bne             1b
830cabdff1aSopenharmony_ci
831cabdff1aSopenharmony_ci        pop             {r4-r9,pc}
832cabdff1aSopenharmony_ciendfunc
833cabdff1aSopenharmony_ci.ltorg
834cabdff1aSopenharmony_ci
@ Common tail of idct16, idct16_half and idct16_quarter: the final
@ butterfly stages of the 16-point inverse DCT, followed by the return
@ (bx lr) of the function it is expanded in.
@ Per the inline annotations the results are left as out[0]..out[15] in
@ d16..d31. Uses d8-d11 and q8/q15 as temporaries on the way.
835cabdff1aSopenharmony_ci.macro idct16_end
836cabdff1aSopenharmony_ci        butterfly       d18, d11, d8,  d11               @ d18 = t0a,  d11 = t7a
837cabdff1aSopenharmony_ci        butterfly       d19, d22, d9,  d22               @ d19 = t1a,  d22 = t6
838cabdff1aSopenharmony_ci        butterfly       d8,  d26, d20, d26               @ d8  = t2a,  d26 = t5
839cabdff1aSopenharmony_ci        butterfly       d9,  d10, d28, d10               @ d9  = t3a,  d10 = t4
840cabdff1aSopenharmony_ci        butterfly       d20, d28, d16, d24               @ d20 = t8a,  d28 = t11a
841cabdff1aSopenharmony_ci        butterfly       d24, d21, d23, d21               @ d24 = t9,   d21 = t10
842cabdff1aSopenharmony_ci        butterfly       d23, d27, d25, d27               @ d23 = t14,  d27 = t13
843cabdff1aSopenharmony_ci        butterfly       d25, d29, d29, d17               @ d25 = t15a, d29 = t12a
844cabdff1aSopenharmony_ci
845cabdff1aSopenharmony_ci        mbutterfly0     d27, d21, d27, d21, d16, d30, q8, q15 @ d27 = t13a, d21 = t10a
846cabdff1aSopenharmony_ci        mbutterfly0     d29, d28, d29, d28, d16, d30, q8, q15 @ d29 = t12,  d28 = t11
847cabdff1aSopenharmony_ci
848cabdff1aSopenharmony_ci        vswp            d27, d29                         @ d27 = t12, d29 = t13a
849cabdff1aSopenharmony_ci        vswp            d28, d27                         @ d28 = t12, d27 = t11
850cabdff1aSopenharmony_ci        butterfly       d16, d31, d18, d25               @ d16 = out[0], d31 = out[15]
851cabdff1aSopenharmony_ci        butterfly       d17, d30, d19, d23               @ d17 = out[1], d30 = out[14]
852cabdff1aSopenharmony_ci        butterfly_r     d25, d22, d22, d24               @ d25 = out[9], d22 = out[6]
853cabdff1aSopenharmony_ci        butterfly       d23, d24, d11, d20               @ d23 = out[7], d24 = out[8]
854cabdff1aSopenharmony_ci        butterfly       d18, d29, d8,  d29               @ d18 = out[2], d29 = out[13]
855cabdff1aSopenharmony_ci        butterfly       d19, d28, d9,  d28               @ d19 = out[3], d28 = out[12]
        @ d21 is overwritten by the next butterfly but still needed by the
        @ one after it, so keep a copy in d8 (t2a in d8 was consumed above).
856cabdff1aSopenharmony_ci        vmov            d8,  d21                         @ d8  = t10a
857cabdff1aSopenharmony_ci        butterfly       d20, d27, d10, d27               @ d20 = out[4], d27 = out[11]
858cabdff1aSopenharmony_ci        butterfly       d21, d26, d26, d8                @ d21 = out[5], d26 = out[10]
859cabdff1aSopenharmony_ci        bx              lr
860cabdff1aSopenharmony_ci.endm
861cabdff1aSopenharmony_ci
@ Full 16-point inverse DCT on the values in d16-d31; results are left in
@ d16-d31 by the shared idct16_end tail, which also performs the return.
@ The coefficient lanes d0[0]..d7[1] are not loaded here and must already
@ be set up by the caller. d8-d11 (q4/q5) are used as temporaries.
862cabdff1aSopenharmony_cifunction idct16
863cabdff1aSopenharmony_ci        mbutterfly0     d16, d24, d16, d24, d8, d10, q4,  q5 @ d16 = t0a,  d24 = t1a
864cabdff1aSopenharmony_ci        mbutterfly      d20, d28, d1[0], d1[1], q4,  q5  @ d20 = t2a,  d28 = t3a
865cabdff1aSopenharmony_ci        mbutterfly      d18, d30, d2[0], d2[1], q4,  q5  @ d18 = t4a,  d30 = t7a
866cabdff1aSopenharmony_ci        mbutterfly      d26, d22, d3[0], d3[1], q4,  q5  @ d26 = t5a,  d22 = t6a
867cabdff1aSopenharmony_ci        mbutterfly      d17, d31, d4[0], d4[1], q4,  q5  @ d17 = t8a,  d31 = t15a
868cabdff1aSopenharmony_ci        mbutterfly      d25, d23, d5[0], d5[1], q4,  q5  @ d25 = t9a,  d23 = t14a
869cabdff1aSopenharmony_ci        mbutterfly      d21, d27, d6[0], d6[1], q4,  q5  @ d21 = t10a, d27 = t13a
870cabdff1aSopenharmony_ci        mbutterfly      d29, d19, d7[0], d7[1], q4,  q5  @ d29 = t11a, d19 = t12a
871cabdff1aSopenharmony_ci
872cabdff1aSopenharmony_ci        butterfly       d8,  d28, d16, d28               @ d8  = t0,   d28 = t3
873cabdff1aSopenharmony_ci        butterfly       d9,  d20, d24, d20               @ d9  = t1,   d20 = t2
874cabdff1aSopenharmony_ci        butterfly       d10, d26, d18, d26               @ d10 = t4,   d26 = t5
875cabdff1aSopenharmony_ci        butterfly       d11, d22, d30, d22               @ d11 = t7,   d22 = t6
876cabdff1aSopenharmony_ci        butterfly       d16, d25, d17, d25               @ d16 = t8,   d25 = t9
877cabdff1aSopenharmony_ci        butterfly       d24, d21, d29, d21               @ d24 = t11,  d21 = t10
878cabdff1aSopenharmony_ci        butterfly       d17, d27, d19, d27               @ d17 = t12,  d27 = t13
879cabdff1aSopenharmony_ci        butterfly       d29, d23, d31, d23               @ d29 = t15,  d23 = t14
880cabdff1aSopenharmony_ci
881cabdff1aSopenharmony_ci        mbutterfly0     d22, d26, d22, d26, d18, d30, q9,  q15  @ d22 = t6a, d26 = t5a
882cabdff1aSopenharmony_ci        mbutterfly      d23, d25, d1[0], d1[1], q9,  q15        @ d23 = t9a,  d25 = t14a
883cabdff1aSopenharmony_ci        mbutterfly      d27, d21, d1[0], d1[1], q9,  q15, neg=1 @ d27 = t13a, d21 = t10a
        @ Shared final stages; this expansion ends with bx lr.
884cabdff1aSopenharmony_ci        idct16_end
885cabdff1aSopenharmony_ciendfunc
886cabdff1aSopenharmony_ci
@ Variant of idct16 whose first stage uses the _h forms of the butterfly
@ macros (mbutterfly0_h, mbutterfly_h1, mbutterfly_h2); everything from the
@ second stage on is identical to idct16.
@ NOTE(review): the _h macros are defined outside this part of the file;
@ presumably they skip the multiplies for inputs known to be zero when only
@ the first half of the coefficients is nonzero -- confirm at their definition.
@ Register interface as for idct16: data in d16-d31, coefficients preloaded
@ in d0-d7 by the caller, d8-d11 used as temporaries, return via idct16_end.
887cabdff1aSopenharmony_cifunction idct16_half
888cabdff1aSopenharmony_ci        mbutterfly0_h   d16, d24, d16, d24, d8, d10, q4,  q5 @ d16 = t0a,  d24 = t1a
889cabdff1aSopenharmony_ci        mbutterfly_h1   d20, d28, d1[0], d1[1], q4,  q5  @ d20 = t2a,  d28 = t3a
890cabdff1aSopenharmony_ci        mbutterfly_h1   d18, d30, d2[0], d2[1], q4,  q5  @ d18 = t4a,  d30 = t7a
891cabdff1aSopenharmony_ci        mbutterfly_h2   d26, d22, d3[0], d3[1], q4,  q5  @ d26 = t5a,  d22 = t6a
892cabdff1aSopenharmony_ci        mbutterfly_h1   d17, d31, d4[0], d4[1], q4,  q5  @ d17 = t8a,  d31 = t15a
893cabdff1aSopenharmony_ci        mbutterfly_h2   d25, d23, d5[0], d5[1], q4,  q5  @ d25 = t9a,  d23 = t14a
894cabdff1aSopenharmony_ci        mbutterfly_h1   d21, d27, d6[0], d6[1], q4,  q5  @ d21 = t10a, d27 = t13a
895cabdff1aSopenharmony_ci        mbutterfly_h2   d29, d19, d7[0], d7[1], q4,  q5  @ d29 = t11a, d19 = t12a
896cabdff1aSopenharmony_ci
897cabdff1aSopenharmony_ci        butterfly       d8,  d28, d16, d28               @ d8  = t0,   d28 = t3
898cabdff1aSopenharmony_ci        butterfly       d9,  d20, d24, d20               @ d9  = t1,   d20 = t2
899cabdff1aSopenharmony_ci        butterfly       d10, d26, d18, d26               @ d10 = t4,   d26 = t5
900cabdff1aSopenharmony_ci        butterfly       d11, d22, d30, d22               @ d11 = t7,   d22 = t6
901cabdff1aSopenharmony_ci        butterfly       d16, d25, d17, d25               @ d16 = t8,   d25 = t9
902cabdff1aSopenharmony_ci        butterfly       d24, d21, d29, d21               @ d24 = t11,  d21 = t10
903cabdff1aSopenharmony_ci        butterfly       d17, d27, d19, d27               @ d17 = t12,  d27 = t13
904cabdff1aSopenharmony_ci        butterfly       d29, d23, d31, d23               @ d29 = t15,  d23 = t14
905cabdff1aSopenharmony_ci
906cabdff1aSopenharmony_ci        mbutterfly0     d22, d26, d22, d26, d18, d30, q9,  q15  @ d22 = t6a, d26 = t5a
907cabdff1aSopenharmony_ci        mbutterfly      d23, d25, d1[0], d1[1], q9,  q15        @ d23 = t9a,  d25 = t14a
908cabdff1aSopenharmony_ci        mbutterfly      d27, d21, d1[0], d1[1], q9,  q15, neg=1 @ d27 = t13a, d21 = t10a
        @ Shared final stages; this expansion ends with bx lr.
909cabdff1aSopenharmony_ci        idct16_end
910cabdff1aSopenharmony_ciendfunc
911cabdff1aSopenharmony_ci
@ Variant of idct16 that reads only d16-d19 as inputs. The first stage is
@ done with single multiplies instead of butterflies, the intermediate
@ values are placed in the registers idct16_end expects, and that shared
@ tail then produces the sixteen outputs in d16-d31 and returns.
@ NOTE(review): presumably used when only the first quarter of the
@ coefficients is nonzero -- confirm against the callers.
@ Coefficients must be preloaded in d0-d7 by the caller; q4/q5 and most of
@ q8-q15 are overwritten.
912cabdff1aSopenharmony_cifunction idct16_quarter
        @ q12 starts at zero so that the vmlsl below yields -(d19 * d7[1]).
913cabdff1aSopenharmony_ci        vmov.s64        q12, #0
914cabdff1aSopenharmony_ci        vmull.s32       q4,  d17, d4[0]
915cabdff1aSopenharmony_ci        vmull.s32       q5,  d18, d2[1]
916cabdff1aSopenharmony_ci        vmull.s32       q15, d18, d2[0]
917cabdff1aSopenharmony_ci        vmlsl.s32       q12, d19, d7[1]
918cabdff1aSopenharmony_ci        vmull.s32       q14, d17, d4[1]
919cabdff1aSopenharmony_ci        vmull.s32       q13, d19, d7[0]
920cabdff1aSopenharmony_ci        vmull.s32       q11, d16, d0[0]
        @ Narrow all 64-bit products back to 32 bit with a rounding shift by 14.
921cabdff1aSopenharmony_ci        vrshrn.s64      d16, q4,  #14
922cabdff1aSopenharmony_ci        vrshrn.s64      d11, q5,  #14
923cabdff1aSopenharmony_ci        vrshrn.s64      d10, q15, #14
924cabdff1aSopenharmony_ci        vrshrn.s64      d24, q12, #14
925cabdff1aSopenharmony_ci        vrshrn.s64      d29, q14, #14
926cabdff1aSopenharmony_ci        vrshrn.s64      d17, q13, #14
927cabdff1aSopenharmony_ci        vrshrn.s64      d28, q11, #14
928cabdff1aSopenharmony_ci
929cabdff1aSopenharmony_ci        mbutterfly_l    q10, q11, d17, d24, d1[0], d1[1], neg=1
930cabdff1aSopenharmony_ci        mbutterfly_l    q9,  q15, d29, d16, d1[0], d1[1]
931cabdff1aSopenharmony_ci        vrshrn.s64      d27, q10, #14
932cabdff1aSopenharmony_ci        vrshrn.s64      d21, q11, #14
933cabdff1aSopenharmony_ci        vrshrn.s64      d23, q9,  #14
934cabdff1aSopenharmony_ci        vrshrn.s64      d25, q15, #14
        @ The scaled first input (d28) is copied into d8, d9 and d20, so
        @ d8, d9, d20 and d28 all hold the same value on entry to idct16_end.
935cabdff1aSopenharmony_ci        vmov            d8,  d28
936cabdff1aSopenharmony_ci        vmov            d9,  d28
937cabdff1aSopenharmony_ci        mbutterfly0     d22, d26, d11, d10, d18, d30, q9,  q15
938cabdff1aSopenharmony_ci        vmov            d20, d28
        @ Shared final stages; this expansion ends with bx lr.
939cabdff1aSopenharmony_ci        idct16_end
940cabdff1aSopenharmony_ciendfunc
941cabdff1aSopenharmony_ci
942cabdff1aSopenharmony_cifunction iadst16
943cabdff1aSopenharmony_ci        movrel          r12, iadst16_coeffs
944cabdff1aSopenharmony_ci        vld1.16         {q0},  [r12,:128]!
945cabdff1aSopenharmony_ci        vmovl.s16       q1,  d1
946cabdff1aSopenharmony_ci        vmovl.s16       q0,  d0
947cabdff1aSopenharmony_ci
948cabdff1aSopenharmony_ci        mbutterfly_l    q3,  q2,  d31, d16, d0[1], d0[0] @ q3  = t1,   q2  = t0
949cabdff1aSopenharmony_ci        mbutterfly_l    q5,  q4,  d23, d24, d2[1], d2[0] @ q5  = t9,   q4  = t8
950cabdff1aSopenharmony_ci        butterfly_n     d31, d24, q3,  q5,  q6,  q5      @ d31 = t1a,  d24 = t9a
951cabdff1aSopenharmony_ci        mbutterfly_l    q7,  q6,  d29, d18, d1[1], d1[0] @ q7  = t3,   q6  = t2
952cabdff1aSopenharmony_ci        butterfly_n     d16, d23, q2,  q4,  q3,  q4      @ d16 = t0a,  d23 = t8a
953cabdff1aSopenharmony_ci        mbutterfly_l    q3,  q2,  d21, d26, d3[1], d3[0] @ q3  = t11,  q2  = t10
954cabdff1aSopenharmony_ci
955cabdff1aSopenharmony_ci        vld1.16         {q0},  [r12,:128]!
956cabdff1aSopenharmony_ci        butterfly_n     d29, d26, q7,  q3,  q4,  q3      @ d29 = t3a,  d26 = t11a
957cabdff1aSopenharmony_ci        vmovl.s16       q1,  d1
958cabdff1aSopenharmony_ci        vmovl.s16       q0,  d0
959cabdff1aSopenharmony_ci        mbutterfly_l    q5,  q4,  d27, d20, d0[1], d0[0] @ q5  = t5,   q4  = t4
960cabdff1aSopenharmony_ci        butterfly_n     d18, d21, q6,  q2,  q3,  q2      @ d18 = t2a,  d21 = t10a
961cabdff1aSopenharmony_ci
962cabdff1aSopenharmony_ci        mbutterfly_l    q7,  q6,  d19, d28, d2[1], d2[0] @ q7  = t13,  q6  = t12
963cabdff1aSopenharmony_ci        butterfly_n     d20, d28, q5,  q7,  q2,  q7      @ d20 = t5a,  d28 = t13a
964cabdff1aSopenharmony_ci        mbutterfly_l    q3,  q2,  d25, d22, d1[1], d1[0] @ q3  = t7,   q2  = t6
965cabdff1aSopenharmony_ci        butterfly_n     d27, d19, q4,  q6,  q5,  q6      @ d27 = t4a,  d19 = t12a
966cabdff1aSopenharmony_ci
967cabdff1aSopenharmony_ci        mbutterfly_l    q5,  q4,  d17, d30, d3[1], d3[0] @ q5  = t15,  q4  = t14
968cabdff1aSopenharmony_ci        movrel          r12, idct_coeffs
969cabdff1aSopenharmony_ci        vld1.16         {q0}, [r12,:128]
970cabdff1aSopenharmony_ci        vmovl.s16       q1,  d1
971cabdff1aSopenharmony_ci        vmovl.s16       q0,  d0
972cabdff1aSopenharmony_ci        butterfly_n     d22, d30, q3,  q5,  q6,  q5      @ d22 = t7a,  d30 = t15a
973cabdff1aSopenharmony_ci        mbutterfly_l    q7,  q6,  d23, d24, d2[0], d2[1] @ q7  = t9,   q6  = t8
974cabdff1aSopenharmony_ci        butterfly_n     d25, d17, q2,  q4,  q3,  q4      @ d25 = t6a,  d17 = t14a
975cabdff1aSopenharmony_ci
976cabdff1aSopenharmony_ci        mbutterfly_l    q2,  q3,  d28, d19, d2[1], d2[0] @ q2  = t12,  q3  = t13
977cabdff1aSopenharmony_ci        butterfly_n     d23, d19, q6,  q2,  q4,  q2      @ d23 = t8a,  d19 = t12a
978cabdff1aSopenharmony_ci        mbutterfly_l    q5,  q4,  d21, d26, d3[0], d3[1] @ q5  = t11,  q4  = t10
979cabdff1aSopenharmony_ci        butterfly_r     d4,  d27, d16, d27               @ d4  = t4,   d27 = t0
980cabdff1aSopenharmony_ci        butterfly_n     d24, d28, q7,  q3,  q6,  q3      @ d24 = t9a,  d28 = t13a
981cabdff1aSopenharmony_ci
982cabdff1aSopenharmony_ci        mbutterfly_l    q6,  q7,  d30, d17, d3[1], d3[0] @ q6  = t14,  q7  = t15
983cabdff1aSopenharmony_ci        butterfly_r     d5,  d20, d31, d20               @ d5  = t5,   d20 = t1
984cabdff1aSopenharmony_ci        butterfly_n     d21, d17, q4,  q6,  q3,  q6      @ d21 = t10a, d17 = t14a
985cabdff1aSopenharmony_ci        butterfly_n     d26, d30, q5,  q7,  q4,  q7      @ d26 = t11a, d30 = t15a
986cabdff1aSopenharmony_ci
987cabdff1aSopenharmony_ci        butterfly_r     d6,  d25, d18, d25               @ d6  = t6,   d25 = t2
988cabdff1aSopenharmony_ci        butterfly_r     d7,  d22, d29, d22               @ d7  = t7,   d22 = t3
989cabdff1aSopenharmony_ci
990cabdff1aSopenharmony_ci        mbutterfly_l    q5,  q4,  d19, d28, d1[0], d1[1] @ q5  = t13,  q4  = t12
991cabdff1aSopenharmony_ci        mbutterfly_l    q6,  q7,  d30, d17, d1[1], d1[0] @ q6  = t14,  q7  = t15
992cabdff1aSopenharmony_ci
993cabdff1aSopenharmony_ci        butterfly_n     d18, d30, q4,  q6,  q8,  q6      @ d18 = out[2],   d30 = t14a
994cabdff1aSopenharmony_ci        butterfly_n     d29, d17, q5,  q7,  q6,  q7      @ d29 = -out[13], d17 = t15a
995cabdff1aSopenharmony_ci        vneg.s32        d29, d29                         @ d29 = out[13]
996cabdff1aSopenharmony_ci
997cabdff1aSopenharmony_ci        mbutterfly_l    q5,  q4,  d4,  d5,  d1[0], d1[1] @ q5  = t5a,  q4  = t4a
998cabdff1aSopenharmony_ci        mbutterfly_l    q6,  q7,  d7,  d6,  d1[1], d1[0] @ q6  = t6a,  q7  = t7a
999cabdff1aSopenharmony_ci
1000cabdff1aSopenharmony_ci        butterfly       d2,  d6,  d27, d25               @ d2 = out[0], d6 = t2a
1001cabdff1aSopenharmony_ci        butterfly       d3,  d7,  d23, d21               @ d3 =-out[1], d7 = t10
1002cabdff1aSopenharmony_ci
1003cabdff1aSopenharmony_ci        butterfly_n     d19, d31, q4,  q6,  q2,  q4      @ d19 = -out[3],  d31 = t6
1004cabdff1aSopenharmony_ci        vneg.s32        d19, d19                         @ d19 = out[3]
1005cabdff1aSopenharmony_ci        butterfly_n     d28, d16, q5,  q7,  q2,  q5      @ d28 = out[12],  d16 = t7
1006cabdff1aSopenharmony_ci
1007cabdff1aSopenharmony_ci        butterfly       d5,  d8,  d20, d22               @ d5 =-out[15],d8 = t3a
1008cabdff1aSopenharmony_ci        butterfly       d4,  d9,  d24, d26               @ d4 = out[14],d9 = t11
1009cabdff1aSopenharmony_ci
1010cabdff1aSopenharmony_ci        mbutterfly0     d23, d24, d6,  d8,  d10, d11, q6,  q7, 1 @ d23 = out[7], d24 = out[8]
1011cabdff1aSopenharmony_ci        mbutterfly0     d20, d27, d16, d31, d10, d11, q6,  q7    @ d20 = out[4], d27 = out[11]
1012cabdff1aSopenharmony_ci        mbutterfly0     d22, d25, d9,  d7,  d10, d11, q6,  q7    @ d22 = out[6], d25 = out[9]
1013cabdff1aSopenharmony_ci        mbutterfly0     d21, d26, d30, d17, d10, d11, q6,  q7, 1 @ d21 = out[5], d26 = out[10]
1014cabdff1aSopenharmony_ci
1015cabdff1aSopenharmony_ci        vneg.s32        d31, d5                          @ d31 = out[15]
1016cabdff1aSopenharmony_ci        vneg.s32        d17, d3                          @ d17 = out[1]
1017cabdff1aSopenharmony_ci
1018cabdff1aSopenharmony_ci        vmov            d16, d2
1019cabdff1aSopenharmony_ci        vmov            d30, d4
1020cabdff1aSopenharmony_ci        bx              lr
1021cabdff1aSopenharmony_ciendfunc
1022cabdff1aSopenharmony_ci
@ Generates the two per-slice pass functions used by the 16x16 inverse
@ transforms. txfm names the 16-point core that gets called (txfm, then 16,
@ then suffix). suffix is blank, _quarter or _half and selects how many of
@ the registers d16-d31 are loaded before the core runs.
1023cabdff1aSopenharmony_ci.macro itxfm16_1d_funcs txfm, suffix
1024cabdff1aSopenharmony_ci@ Read a vertical 2x16 slice out of a 16x16 matrix, do a transform on it,
1025cabdff1aSopenharmony_ci@ transpose into a horizontal 16x2 slice and store.
1026cabdff1aSopenharmony_ci@ r0 = dst (temp buffer)
1027cabdff1aSopenharmony_ci@ r2 = src
1028cabdff1aSopenharmony_cifunction \txfm\()16_1d_2x16_pass1\suffix\()_neon
1029cabdff1aSopenharmony_ci        push            {lr}
1030cabdff1aSopenharmony_ci
        @ r12 = byte step applied to r2 after each element. q4 is zeroed so
        @ that d8 can be stored over every element right after it has been
        @ loaded, i.e. the source is cleared as it is consumed.
1031cabdff1aSopenharmony_ci        mov             r12, #64
1032cabdff1aSopenharmony_ci        vmov.s32        q4,  #0
        @ Blank suffix: load (and clear) all 16 registers d16-d31.
1033cabdff1aSopenharmony_ci.ifb \suffix
1034cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1035cabdff1aSopenharmony_ci        vld1.32         {d\i}, [r2,:64]
1036cabdff1aSopenharmony_ci        vst1.32         {d8},  [r2,:64], r12
1037cabdff1aSopenharmony_ci.endr
1038cabdff1aSopenharmony_ci.endif
        @ _quarter: only d16-d19 are loaded (and cleared).
1039cabdff1aSopenharmony_ci.ifc \suffix,_quarter
1040cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19
1041cabdff1aSopenharmony_ci        vld1.32         {d\i}, [r2,:64]
1042cabdff1aSopenharmony_ci        vst1.32         {d8},  [r2,:64], r12
1043cabdff1aSopenharmony_ci.endr
1044cabdff1aSopenharmony_ci.endif
        @ _half: only d16-d23 are loaded (and cleared).
1045cabdff1aSopenharmony_ci.ifc \suffix,_half
1046cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23
1047cabdff1aSopenharmony_ci        vld1.32         {d\i}, [r2,:64]
1048cabdff1aSopenharmony_ci        vst1.32         {d8},  [r2,:64], r12
1049cabdff1aSopenharmony_ci.endr
1050cabdff1aSopenharmony_ci.endif
1051cabdff1aSopenharmony_ci
        @ Run the 16-point core (defined outside this macro) on the
        @ registers loaded above.
1052cabdff1aSopenharmony_ci        bl              \txfm\()16\suffix
1053cabdff1aSopenharmony_ci
1054cabdff1aSopenharmony_ci        @ Do eight 2x2 transposes. Originally, d16-d31 contain the
1055cabdff1aSopenharmony_ci        @ 16 rows. Afterwards, d16-d17, d18-d19 etc contain the eight
1056cabdff1aSopenharmony_ci        @ transposed 2x2 blocks.
1057cabdff1aSopenharmony_ci        transpose32_8x_2x2 d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
1058cabdff1aSopenharmony_ci
1059cabdff1aSopenharmony_ci        @ Store the transposed 2x2 blocks horizontally.
        @ All even-numbered registers are written first, then all
        @ odd-numbered ones, as one contiguous run of 16 x 8 bytes from r0.
1060cabdff1aSopenharmony_ci.irp i, 16, 18, 20, 22, 24, 26, 28, 30, 17, 19, 21, 23, 25, 27, 29, 31
1061cabdff1aSopenharmony_ci        vst1.32         {d\i}, [r0,:64]!
1062cabdff1aSopenharmony_ci.endr
1063cabdff1aSopenharmony_ci        pop             {pc}
1064cabdff1aSopenharmony_ciendfunc
1065cabdff1aSopenharmony_ci
1066cabdff1aSopenharmony_ci@ Read a vertical 2x16 slice out of a 16x16 matrix, do a transform on it,
1067cabdff1aSopenharmony_ci@ load the destination pixels (from a similar 2x16 slice), add and store back.
1068cabdff1aSopenharmony_ci@ r0 = dst
1069cabdff1aSopenharmony_ci@ r1 = dst stride
1070cabdff1aSopenharmony_ci@ r2 = src (temp buffer)
@ r9 is also read: it is broadcast and used as the upper clamp for the
@ stored pixels. r1, r3 and r12 are overwritten.
1071cabdff1aSopenharmony_cifunction \txfm\()16_1d_2x16_pass2\suffix\()_neon
1072cabdff1aSopenharmony_ci        push            {lr}
1073cabdff1aSopenharmony_ci
        @ r12 = byte step applied to r2 after each load. Unlike pass1 the
        @ source is only read here, not cleared.
1074cabdff1aSopenharmony_ci        mov             r12, #64
1075cabdff1aSopenharmony_ci.ifb \suffix
1076cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1077cabdff1aSopenharmony_ci        vld1.16         {d\i}, [r2,:64], r12
1078cabdff1aSopenharmony_ci.endr
1079cabdff1aSopenharmony_ci.endif
        @ NOTE(review): this loads five registers (d16-d20) while the
        @ _quarter pass1 loads four (d16-d19). The load of d20 looks
        @ redundant; confirm against the _quarter core before changing it.
1080cabdff1aSopenharmony_ci.ifc \suffix,_quarter
1081cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20
1082cabdff1aSopenharmony_ci        vld1.16         {d\i}, [r2,:64], r12
1083cabdff1aSopenharmony_ci.endr
1084cabdff1aSopenharmony_ci.endif
1085cabdff1aSopenharmony_ci.ifc \suffix,_half
1086cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23
1087cabdff1aSopenharmony_ci        vld1.16         {d\i}, [r2,:64], r12
1088cabdff1aSopenharmony_ci.endr
1089cabdff1aSopenharmony_ci.endif
1090cabdff1aSopenharmony_ci
        @ r3 = pointer to the second destination row, and the stride is
        @ doubled, so r0 and r3 each step over alternate rows.
1091cabdff1aSopenharmony_ci        add             r3,  r0,  r1
1092cabdff1aSopenharmony_ci        lsl             r1,  r1,  #1
1093cabdff1aSopenharmony_ci        bl              \txfm\()16\suffix
1094cabdff1aSopenharmony_ci
        @ Add four q registers of transform output to the destination and
        @ store the clamped result. Each of d8-d11 receives one 32-bit
        @ chunk from r0 in lane 0 and one from r3 in lane 1, which matches
        @ the two d halves of the corresponding coef register. The output
        @ is rounded down by 6 bits, added to the widened u16 pixels,
        @ narrowed with unsigned saturation and limited to r9.
1095cabdff1aSopenharmony_ci.macro load_add_store coef0, coef1, coef2, coef3
1096cabdff1aSopenharmony_ci        vrshr.s32       \coef0, \coef0, #6
1097cabdff1aSopenharmony_ci        vrshr.s32       \coef1, \coef1, #6
1098cabdff1aSopenharmony_ci
1099cabdff1aSopenharmony_ci        vld1.32         {d8[]},   [r0,:32], r1
1100cabdff1aSopenharmony_ci        vld1.32         {d8[1]},  [r3,:32], r1
1101cabdff1aSopenharmony_ci        vrshr.s32       \coef2, \coef2, #6
1102cabdff1aSopenharmony_ci        vrshr.s32       \coef3, \coef3, #6
1103cabdff1aSopenharmony_ci        vld1.32         {d9[]},   [r0,:32], r1
1104cabdff1aSopenharmony_ci        vld1.32         {d9[1]},  [r3,:32], r1
1105cabdff1aSopenharmony_ci        vaddw.u16       \coef0, \coef0, d8
1106cabdff1aSopenharmony_ci        vld1.32         {d10[]},  [r0,:32], r1
1107cabdff1aSopenharmony_ci        vld1.32         {d10[1]}, [r3,:32], r1
1108cabdff1aSopenharmony_ci        vaddw.u16       \coef1, \coef1, d9
1109cabdff1aSopenharmony_ci        vld1.32         {d11[]},  [r0,:32], r1
1110cabdff1aSopenharmony_ci        vld1.32         {d11[1]}, [r3,:32], r1
1111cabdff1aSopenharmony_ci
1112cabdff1aSopenharmony_ci        vqmovun.s32     d8,  \coef0
        @ q8 becomes the clamp vector from here on. In the first expansion
        @ q8 is also coef0, which has just been narrowed into d8, so no
        @ live data is lost.
1113cabdff1aSopenharmony_ci        vdup.s16        q8,  r9
1114cabdff1aSopenharmony_ci        vqmovun.s32     d9,  \coef1
        @ Rewind both row pointers by the four steps taken by the loads so
        @ the stores hit the same rows.
1115cabdff1aSopenharmony_ci        sub             r0,  r0,  r1, lsl #2
1116cabdff1aSopenharmony_ci        sub             r3,  r3,  r1, lsl #2
1117cabdff1aSopenharmony_ci        vaddw.u16       \coef2, \coef2, d10
1118cabdff1aSopenharmony_ci        vaddw.u16       \coef3, \coef3, d11
1119cabdff1aSopenharmony_ci        vmin.u16        q4,  q4,  q8
1120cabdff1aSopenharmony_ci        vst1.32         {d8[0]},  [r0,:32], r1
1121cabdff1aSopenharmony_ci        vst1.32         {d8[1]},  [r3,:32], r1
1122cabdff1aSopenharmony_ci        vqmovun.s32     d10, \coef2
1123cabdff1aSopenharmony_ci        vst1.32         {d9[0]},  [r0,:32], r1
1124cabdff1aSopenharmony_ci        vst1.32         {d9[1]},  [r3,:32], r1
1125cabdff1aSopenharmony_ci        vqmovun.s32     d11, \coef3
1126cabdff1aSopenharmony_ci        vmin.u16        q5,  q5,  q8
1127cabdff1aSopenharmony_ci
1128cabdff1aSopenharmony_ci        vst1.32         {d10[0]}, [r0,:32], r1
1129cabdff1aSopenharmony_ci        vst1.32         {d10[1]}, [r3,:32], r1
1130cabdff1aSopenharmony_ci        vst1.32         {d11[0]}, [r0,:32], r1
1131cabdff1aSopenharmony_ci        vst1.32         {d11[1]}, [r3,:32], r1
1132cabdff1aSopenharmony_ci.endm
        @ Two expansions cover all 16 output registers d16-d31. The helper
        @ macro is purged afterwards so that the next expansion of the
        @ enclosing macro can define it again.
1133cabdff1aSopenharmony_ci        load_add_store  q8,  q9,  q10, q11
1134cabdff1aSopenharmony_ci        load_add_store  q12, q13, q14, q15
1135cabdff1aSopenharmony_ci.purgem load_add_store
1136cabdff1aSopenharmony_ci
1137cabdff1aSopenharmony_ci        pop             {pc}
1138cabdff1aSopenharmony_ciendfunc
1139cabdff1aSopenharmony_ci.endm
1140cabdff1aSopenharmony_ci
@ Instantiate the pass1/pass2 functions: full idct and iadst, plus the
@ reduced _quarter and _half variants, which exist for idct only.
1141cabdff1aSopenharmony_ciitxfm16_1d_funcs idct
1142cabdff1aSopenharmony_ciitxfm16_1d_funcs iadst
1143cabdff1aSopenharmony_ciitxfm16_1d_funcs idct, _quarter
1144cabdff1aSopenharmony_ciitxfm16_1d_funcs idct, _half
@ Emit any pending literal pool here, after the code generated above.
1145cabdff1aSopenharmony_ci.ltorg
1146cabdff1aSopenharmony_ci
1147cabdff1aSopenharmony_ci@ This is the minimum eob value for each subpartition, in increments of 2
@ The idct_idct 16x16 body starts reading at the second entry (table
@ address + 2) and fetches one entry per 2-column slice after the first.
@ When r3 is less than or equal to the fetched entry, the remaining slices
@ are not transformed and their part of the temp buffer is zero-filled.
1148cabdff1aSopenharmony_ciconst min_eob_idct_idct_16, align=4
1149cabdff1aSopenharmony_ci        .short  0, 3, 10, 22, 38, 62, 89, 121
1150cabdff1aSopenharmony_ciendconst
1151cabdff1aSopenharmony_ci
@ Generates the shared 16x16 inverse transform and add body for the
@ txfm1 (first pass) / txfm2 (second pass) combination, followed by the
@ exported 10 and 12 bit entry points that tail-branch into it.
@ On entry to the shared body:
@   r0 = dst, r1 = dst stride, r2 = coefficients,
@   r3 = value compared against the eob thresholds,
@   r9 = upper clamp for the output pixels,
@   r4-r9 and lr have already been pushed by the entry point.
1152cabdff1aSopenharmony_ci.macro itxfm_func16x16 txfm1, txfm2
1153cabdff1aSopenharmony_cifunction vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
        @ idct_idct only: r3 == 1 is handed to the dedicated dc function
        @ before any NEON register is saved or stack is reserved.
1154cabdff1aSopenharmony_ci.ifc \txfm1\()_\txfm2,idct_idct
1155cabdff1aSopenharmony_ci        cmp             r3,  #1
1156cabdff1aSopenharmony_ci        beq             idct16x16_dc_add_neon
1157cabdff1aSopenharmony_ci.endif
        @ Save q4-q7 when an iadst is involved, only q4-q5 for idct_idct.
        @ The epilogue restores the same set.
1158cabdff1aSopenharmony_ci.ifnc \txfm1\()_\txfm2,idct_idct
1159cabdff1aSopenharmony_ci        vpush           {q4-q7}
1160cabdff1aSopenharmony_ci.else
1161cabdff1aSopenharmony_ci        vpush           {q4-q5}
1162cabdff1aSopenharmony_ci.endif
1163cabdff1aSopenharmony_ci
1164cabdff1aSopenharmony_ci        @ Align the stack, allocate a temp buffer
        @ r7 = (sp & 15) + 1024, so that sp - r7 is 16-byte aligned with
        @ 1024 bytes of buffer above it. r7 is kept to undo the adjustment.
        @ The T / A prefixed lines presumably select the Thumb / ARM
        @ encodings of the same computation (prefix macros come from the
        @ included asm.S).
1165cabdff1aSopenharmony_ciT       mov             r7,  sp
1166cabdff1aSopenharmony_ciT       and             r7,  r7,  #15
1167cabdff1aSopenharmony_ciA       and             r7,  sp,  #15
1168cabdff1aSopenharmony_ci        add             r7,  r7,  #1024
1169cabdff1aSopenharmony_ci        sub             sp,  sp,  r7
1170cabdff1aSopenharmony_ci
        @ Keep dst, stride and coefficient pointer in r4-r6, since r0-r2
        @ are rewritten as arguments for every pass call below.
1171cabdff1aSopenharmony_ci        mov             r4,  r0
1172cabdff1aSopenharmony_ci        mov             r5,  r1
1173cabdff1aSopenharmony_ci        mov             r6,  r2
1174cabdff1aSopenharmony_ci
        @ When the first pass is an idct, load the first 16 idct_coeffs and
        @ widen them to 32 bit in q0-q3. The upper parts are widened first
        @ because d0-d3 (q0-q1) are both the source and part of the
        @ destination.
1175cabdff1aSopenharmony_ci.ifc \txfm1,idct
1176cabdff1aSopenharmony_ci        movrel          r12, idct_coeffs
1177cabdff1aSopenharmony_ci        vld1.16         {q0-q1}, [r12,:128]
1178cabdff1aSopenharmony_ci        vmovl.s16       q2,  d2
1179cabdff1aSopenharmony_ci        vmovl.s16       q3,  d3
1180cabdff1aSopenharmony_ci        vmovl.s16       q1,  d1
1181cabdff1aSopenharmony_ci        vmovl.s16       q0,  d0
1182cabdff1aSopenharmony_ci.endif
1183cabdff1aSopenharmony_ci
        @ idct_idct only: small r3 values are dispatched to the reduced
        @ variants, which finish with their own copy of the epilogue.
        @ Otherwise r8 points at the second threshold table entry.
1184cabdff1aSopenharmony_ci.ifc \txfm1\()_\txfm2,idct_idct
1185cabdff1aSopenharmony_ci        cmp             r3,  #10
1186cabdff1aSopenharmony_ci        ble             idct16x16_quarter_add_16_neon
1187cabdff1aSopenharmony_ci        cmp             r3,  #38
1188cabdff1aSopenharmony_ci        ble             idct16x16_half_add_16_neon
1189cabdff1aSopenharmony_ci
1190cabdff1aSopenharmony_ci        movrel          r8,  min_eob_idct_idct_16 + 2
1191cabdff1aSopenharmony_ci.endif
1192cabdff1aSopenharmony_ci
        @ First pass: one call per 2-column slice i. The destination is
        @ the temp buffer at sp + i*64, the source is r6 + i*4.
1193cabdff1aSopenharmony_ci.irp i, 0, 2, 4, 6, 8, 10, 12, 14
1194cabdff1aSopenharmony_ci        add             r0,  sp,  #(\i*64)
1195cabdff1aSopenharmony_ci.ifc \txfm1\()_\txfm2,idct_idct
1196cabdff1aSopenharmony_ci.if \i > 0
        @ Fetch the threshold for this slice. If r3 does not exceed it,
        @ stop transforming and zero-fill the rest of the temp buffer,
        @ with r1 = number of remaining slices, (16 - i)/2.
1197cabdff1aSopenharmony_ci        ldrh_post       r1,  r8,  #2
1198cabdff1aSopenharmony_ci        cmp             r3,  r1
1199cabdff1aSopenharmony_ci        it              le
1200cabdff1aSopenharmony_ci        movle           r1,  #(16 - \i)/2
1201cabdff1aSopenharmony_ci        ble             1f
1202cabdff1aSopenharmony_ci.endif
1203cabdff1aSopenharmony_ci.endif
1204cabdff1aSopenharmony_ci        add             r2,  r6,  #(\i*4)
1205cabdff1aSopenharmony_ci        bl              \txfm1\()16_1d_2x16_pass1_neon
1206cabdff1aSopenharmony_ci.endr
1207cabdff1aSopenharmony_ci
1208cabdff1aSopenharmony_ci.ifc \txfm1\()_\txfm2,idct_idct
        @ All eight slices were transformed: skip the zero-fill.
1209cabdff1aSopenharmony_ci        b               3f
1210cabdff1aSopenharmony_ci1:
1211cabdff1aSopenharmony_ci        vmov.i32        q14, #0
1212cabdff1aSopenharmony_ci        vmov.i32        q15, #0
1213cabdff1aSopenharmony_ci2:
        @ Each iteration clears one slice of the temp buffer from r0 on:
        @ 4 stores of 32 bytes = two 64-byte lines. The flags set by subs
        @ are consumed by the bne after the stores.
1214cabdff1aSopenharmony_ci        subs            r1,  r1,  #1
1215cabdff1aSopenharmony_ci        @ Unroll for 2 lines
1216cabdff1aSopenharmony_ci.rept 2
1217cabdff1aSopenharmony_ci        @ Fill one line with zeros
1218cabdff1aSopenharmony_ci        vst1.32         {q14-q15}, [r0,:128]!
1219cabdff1aSopenharmony_ci        vst1.32         {q14-q15}, [r0,:128]!
1220cabdff1aSopenharmony_ci.endr
1221cabdff1aSopenharmony_ci        bne             2b
1222cabdff1aSopenharmony_ci3:
1223cabdff1aSopenharmony_ci.endif
1224cabdff1aSopenharmony_ci
        @ iadst first pass followed by idct second pass: the idct
        @ coefficients were not loaded above, so load and widen them now.
1225cabdff1aSopenharmony_ci.ifc \txfm1\()_\txfm2,iadst_idct
1226cabdff1aSopenharmony_ci        movrel          r12, idct_coeffs
1227cabdff1aSopenharmony_ci        vld1.16         {q0-q1}, [r12,:128]
1228cabdff1aSopenharmony_ci        vmovl.s16       q2,  d2
1229cabdff1aSopenharmony_ci        vmovl.s16       q3,  d3
1230cabdff1aSopenharmony_ci        vmovl.s16       q1,  d1
1231cabdff1aSopenharmony_ci        vmovl.s16       q0,  d0
1232cabdff1aSopenharmony_ci.endif
        @ Second pass: one call per 2-column slice i. The destination is
        @ r4 + i*2, the source is the temp buffer at sp + i*4. r1 is
        @ reloaded each time because the pass2 function modifies it.
1233cabdff1aSopenharmony_ci.irp i, 0, 2, 4, 6, 8, 10, 12, 14
1234cabdff1aSopenharmony_ci        add             r0,  r4,  #(\i*2)
1235cabdff1aSopenharmony_ci        mov             r1,  r5
1236cabdff1aSopenharmony_ci        add             r2,  sp,  #(\i*4)
1237cabdff1aSopenharmony_ci        bl              \txfm2\()16_1d_2x16_pass2_neon
1238cabdff1aSopenharmony_ci.endr
1239cabdff1aSopenharmony_ci
        @ Release the temp buffer, restore the saved NEON registers and
        @ return, popping what the exported entry point pushed.
1240cabdff1aSopenharmony_ci        add             sp,  sp,  r7
1241cabdff1aSopenharmony_ci.ifnc \txfm1\()_\txfm2,idct_idct
1242cabdff1aSopenharmony_ci        vpop            {q4-q7}
1243cabdff1aSopenharmony_ci.else
1244cabdff1aSopenharmony_ci        vpop            {q4-q5}
1245cabdff1aSopenharmony_ci.endif
1246cabdff1aSopenharmony_ci        pop             {r4-r9,pc}
1247cabdff1aSopenharmony_ciendfunc
1248cabdff1aSopenharmony_ci
@ Exported 10 bit entry point: saves r4-r9 and lr, sets the clamp value
@ r9 = 0x03ff (2^10 - 1) and continues in the shared body, which returns
@ to the caller directly.
1249cabdff1aSopenharmony_cifunction ff_vp9_\txfm1\()_\txfm2\()_16x16_add_10_neon, export=1
1250cabdff1aSopenharmony_ci        push            {r4-r9,lr}
1251cabdff1aSopenharmony_ci        movw            r9,  #0x03ff
1252cabdff1aSopenharmony_ci        b               vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
1253cabdff1aSopenharmony_ciendfunc
1254cabdff1aSopenharmony_ci
@ Exported 12 bit entry point: same as above with r9 = 0x0fff (2^12 - 1).
1255cabdff1aSopenharmony_cifunction ff_vp9_\txfm1\()_\txfm2\()_16x16_add_12_neon, export=1
1256cabdff1aSopenharmony_ci        push            {r4-r9,lr}
1257cabdff1aSopenharmony_ci        movw            r9,  #0x0fff
1258cabdff1aSopenharmony_ci        b               vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
1259cabdff1aSopenharmony_ciendfunc
1260cabdff1aSopenharmony_ci.endm
1261cabdff1aSopenharmony_ci
@ Instantiate the shared body and the 10/12 bit entry points for all four
@ idct/iadst combinations (first pass, second pass).
1262cabdff1aSopenharmony_ciitxfm_func16x16 idct,  idct
1263cabdff1aSopenharmony_ciitxfm_func16x16 iadst, idct
1264cabdff1aSopenharmony_ciitxfm_func16x16 idct,  iadst
1265cabdff1aSopenharmony_ciitxfm_func16x16 iadst, iadst
@ Emit any pending literal pool here, after the code generated above.
1266cabdff1aSopenharmony_ci.ltorg
1267cabdff1aSopenharmony_ci
@ Generates idct16x16_quarter_add_16_neon or idct16x16_half_add_16_neon
@ (size = quarter or half). These have no prologue of their own: the
@ idct_idct 16x16 body branches here after it has pushed q4-q5, reserved
@ the temp buffer at sp (adjustment kept in r7), saved dst / stride /
@ coefficient pointer in r4-r6 and loaded the idct coefficients. They end
@ with the matching epilogue. r3 is still the value that was compared
@ against the thresholds by the caller.
1268cabdff1aSopenharmony_ci.macro idct16_partial size
1269cabdff1aSopenharmony_cifunction idct16x16_\size\()_add_16_neon
        @ First pass over slices 0 and 2. For the quarter variant the
        @ second slice is skipped and zero-filled instead when r3 <= 3.
1270cabdff1aSopenharmony_ci.irp i, 0, 2
1271cabdff1aSopenharmony_ci        add             r0,  sp,  #(\i*64)
1272cabdff1aSopenharmony_ci.ifc \size,quarter
1273cabdff1aSopenharmony_ci.if \i == 2
1274cabdff1aSopenharmony_ci        cmp             r3,  #3
1275cabdff1aSopenharmony_ci        ble             1f
1276cabdff1aSopenharmony_ci.endif
1277cabdff1aSopenharmony_ci.endif
1278cabdff1aSopenharmony_ci        add             r2,  r6,  #(\i*4)
1279cabdff1aSopenharmony_ci        bl              idct16_1d_2x16_pass1_\size\()_neon
1280cabdff1aSopenharmony_ci.endr
1281cabdff1aSopenharmony_ci
        @ The half variant also runs slices 4 and 6. The last one is
        @ skipped and zero-filled instead when r3 <= 22.
1282cabdff1aSopenharmony_ci.ifc \size,half
1283cabdff1aSopenharmony_ci.irp i, 4, 6
1284cabdff1aSopenharmony_ci        add             r0,  sp,  #(\i*64)
1285cabdff1aSopenharmony_ci.if \i == 6
1286cabdff1aSopenharmony_ci        cmp             r3,  #22
1287cabdff1aSopenharmony_ci        ble             1f
1288cabdff1aSopenharmony_ci.endif
1289cabdff1aSopenharmony_ci        add             r2,  r6,  #(\i*4)
1290cabdff1aSopenharmony_ci        bl              idct16_1d_2x16_pass1_\size\()_neon
1291cabdff1aSopenharmony_ci.endr
1292cabdff1aSopenharmony_ci.endif
1293cabdff1aSopenharmony_ci
1294cabdff1aSopenharmony_ci        b               3f
1295cabdff1aSopenharmony_ci1:
        @ Zero-fill the one skipped slice: r0 already points at it, and
        @ 4 stores of 32 bytes cover its two 64-byte lines.
1296cabdff1aSopenharmony_ci        vmov.i32        q14, #0
1297cabdff1aSopenharmony_ci        vmov.i32        q15, #0
1298cabdff1aSopenharmony_ci
1299cabdff1aSopenharmony_ci        @ Unroll for 2 lines
1300cabdff1aSopenharmony_ci.rept 2
1301cabdff1aSopenharmony_ci        @ Fill one line with zeros
1302cabdff1aSopenharmony_ci        vst1.32         {q14-q15}, [r0,:128]!
1303cabdff1aSopenharmony_ci        vst1.32         {q14-q15}, [r0,:128]!
1304cabdff1aSopenharmony_ci.endr
1305cabdff1aSopenharmony_ci
1306cabdff1aSopenharmony_ci3:
1307cabdff1aSopenharmony_ci
        @ Second pass over all eight output slices, using the reduced
        @ pass2 function. r1 is reloaded because pass2 modifies it.
1308cabdff1aSopenharmony_ci.irp i, 0, 2, 4, 6, 8, 10, 12, 14
1309cabdff1aSopenharmony_ci        add             r0,  r4,  #(\i*2)
1310cabdff1aSopenharmony_ci        mov             r1,  r5
1311cabdff1aSopenharmony_ci        add             r2,  sp,  #(\i*4)
1312cabdff1aSopenharmony_ci        bl              idct16_1d_2x16_pass2_\size\()_neon
1313cabdff1aSopenharmony_ci.endr
1314cabdff1aSopenharmony_ci
        @ Same epilogue as the idct_idct path of the full function.
1315cabdff1aSopenharmony_ci        add             sp,  sp,  r7
1316cabdff1aSopenharmony_ci        vpop            {q4-q5}
1317cabdff1aSopenharmony_ci        pop             {r4-r9,pc}
1318cabdff1aSopenharmony_ciendfunc
1319cabdff1aSopenharmony_ci.endm
1320cabdff1aSopenharmony_ci
@ Instantiate the two reduced 16x16 idct bodies that the full idct_idct
@ function branches to.
1321cabdff1aSopenharmony_ciidct16_partial quarter
1322cabdff1aSopenharmony_ciidct16_partial half
1323cabdff1aSopenharmony_ci
@ DC-only 32x32 inverse transform and add.
@ r0 = dst, r1 = dst stride in bytes, r2 = pointer to the coefficient,
@ r9 = upper clamp for the output pixels.
@ The single 32-bit coefficient at r2 is scaled, cleared in memory, and the
@ resulting constant is added to 32 rows of 32 16-bit pixels.
@ NOTE(review): the closing pop has no matching push in this function;
@ presumably the entry point that branches here has pushed r4-r9, lr (as
@ the 16x16 entry points do) - confirm against the caller.
1324cabdff1aSopenharmony_cifunction idct32x32_dc_add_neon
1325cabdff1aSopenharmony_ci        movrel          r12, idct_coeffs
1326cabdff1aSopenharmony_ci        vld1.16         {d0}, [r12,:64]
1327cabdff1aSopenharmony_ci
        @ q2 = 0 (used to clear the coefficient), q0 = first four
        @ idct_coeffs widened to 32 bit. d0[0] is the first table entry.
1328cabdff1aSopenharmony_ci        vmov.i32        q2,  #0
1329cabdff1aSopenharmony_ci        vmovl.s16       q0,  d0
1330cabdff1aSopenharmony_ci
        @ Multiply the coefficient by d0[0] and round-shift by 14, twice,
        @ then broadcast the result to all lanes of q8.
1331cabdff1aSopenharmony_ci        vld1.32         {d16[]}, [r2,:32]
1332cabdff1aSopenharmony_ci        vmull.s32       q8,  d16, d0[0]
1333cabdff1aSopenharmony_ci        vrshrn.s64      d16, q8,  #14
1334cabdff1aSopenharmony_ci        vmull.s32       q8,  d16, d0[0]
1335cabdff1aSopenharmony_ci        vrshrn.s64      d16, q8,  #14
1336cabdff1aSopenharmony_ci        vdup.32         q8,  d16[0]
        @ Write zero back over the coefficient that was just consumed.
1337cabdff1aSopenharmony_ci        vst1.32         {d4[0]}, [r2,:32]
1338cabdff1aSopenharmony_ci
        @ Final rounding shift by 6; q15 = clamp value in every u16 lane.
1339cabdff1aSopenharmony_ci        vrshr.s32       q8,  q8,  #6
1340cabdff1aSopenharmony_ci        vdup.s16        q15, r9
1341cabdff1aSopenharmony_ci
        @ r0 is the load pointer, r3 the store pointer trailing it, r12 the
        @ row counter. Each row is accessed as 32 + 32 bytes: the first
        @ access post-increments by 32, so the second one advances by
        @ stride - 32 to land on the next row.
1342cabdff1aSopenharmony_ci        mov             r3,  r0
1343cabdff1aSopenharmony_ci        mov             r12, #32
1344cabdff1aSopenharmony_ci        sub             r1,  r1,  #32
1345cabdff1aSopenharmony_ci1:
1346cabdff1aSopenharmony_ci        @ Loop to add the constant from q8 into all 32x32 outputs
        @ The flags set by subs are consumed by the bne at the end. The
        @ widening adds, saturating narrows and clamps in between are
        @ interleaved with the loads and stores.
1347cabdff1aSopenharmony_ci        subs            r12, r12, #1
1348cabdff1aSopenharmony_ci        vld1.16         {q0-q1},  [r0,:128]!
1349cabdff1aSopenharmony_ci        vaddw.u16       q9,  q8,  d0
1350cabdff1aSopenharmony_ci        vaddw.u16       q10, q8,  d1
1351cabdff1aSopenharmony_ci        vld1.16         {q2-q3},  [r0,:128], r1
1352cabdff1aSopenharmony_ci        vaddw.u16       q11, q8,  d2
1353cabdff1aSopenharmony_ci        vaddw.u16       q12, q8,  d3
1354cabdff1aSopenharmony_ci        vaddw.u16       q13, q8,  d4
1355cabdff1aSopenharmony_ci        vaddw.u16       q14, q8,  d5
1356cabdff1aSopenharmony_ci        vqmovun.s32     d0,  q9
1357cabdff1aSopenharmony_ci        vaddw.u16       q9,  q8,  d6
1358cabdff1aSopenharmony_ci        vqmovun.s32     d1,  q10
1359cabdff1aSopenharmony_ci        vaddw.u16       q10, q8,  d7
1360cabdff1aSopenharmony_ci        vqmovun.s32     d2,  q11
1361cabdff1aSopenharmony_ci        vqmovun.s32     d3,  q12
1362cabdff1aSopenharmony_ci        vqmovun.s32     d4,  q13
1363cabdff1aSopenharmony_ci        vqmovun.s32     d5,  q14
1364cabdff1aSopenharmony_ci        vmin.u16        q0,  q0,  q15
1365cabdff1aSopenharmony_ci        vmin.u16        q1,  q1,  q15
1366cabdff1aSopenharmony_ci        vqmovun.s32     d6,  q9
1367cabdff1aSopenharmony_ci        vqmovun.s32     d7,  q10
1368cabdff1aSopenharmony_ci        vst1.16         {q0-q1},  [r3,:128]!
1369cabdff1aSopenharmony_ci        vmin.u16        q2,  q2,  q15
1370cabdff1aSopenharmony_ci        vmin.u16        q3,  q3,  q15
1371cabdff1aSopenharmony_ci        vst1.16         {q2-q3},  [r3,:128], r1
1372cabdff1aSopenharmony_ci        bne             1b
1373cabdff1aSopenharmony_ci
1374cabdff1aSopenharmony_ci        pop             {r4-r9,pc}
1375cabdff1aSopenharmony_ciendfunc
1376cabdff1aSopenharmony_ci
@ Common tail shared by the idct32_odd functions: the remaining butterfly
@ stages on the t16-t31 terms (register roles are given per line). It ends
@ with the return (bx lr), so it has to be the last thing expanded in the
@ function body. The multiplies read coefficient lanes d1[0] and d1[1].
1377cabdff1aSopenharmony_ci.macro idct32_end
1378cabdff1aSopenharmony_ci        butterfly       d16, d9,  d8,  d9  @ d16 = t16a, d9  = t19a
1379cabdff1aSopenharmony_ci        butterfly       d17, d20, d23, d20 @ d17 = t17,  d20 = t18
1380cabdff1aSopenharmony_ci        butterfly       d18, d10, d11, d10 @ d18 = t23a, d10 = t20a
1381cabdff1aSopenharmony_ci        butterfly       d19, d21, d22, d21 @ d19 = t22,  d21 = t21
1382cabdff1aSopenharmony_ci        butterfly       d8,  d28, d28, d30 @ d8  = t24a, d28 = t27a
1383cabdff1aSopenharmony_ci        butterfly       d23, d26, d25, d26 @ d23 = t25,  d26 = t26
1384cabdff1aSopenharmony_ci        butterfly       d11, d29, d29, d31 @ d11 = t31a, d29 = t28a
1385cabdff1aSopenharmony_ci        butterfly       d22, d27, d24, d27 @ d22 = t30,  d27 = t29
1386cabdff1aSopenharmony_ci
        @ q12 and q15 are handed to mbutterfly as extra registers
        @ (presumably scratch: d24, d25, d30 and d31 have all been consumed
        @ by the butterflies above).
1387cabdff1aSopenharmony_ci        mbutterfly      d27, d20, d1[0], d1[1], q12, q15        @ d27 = t18a, d20 = t29a
1388cabdff1aSopenharmony_ci        mbutterfly      d29, d9,  d1[0], d1[1], q12, q15        @ d29 = t19,  d9  = t28
1389cabdff1aSopenharmony_ci        mbutterfly      d28, d10, d1[0], d1[1], q12, q15, neg=1 @ d28 = t27,  d10 = t20
1390cabdff1aSopenharmony_ci        mbutterfly      d26, d21, d1[0], d1[1], q12, q15, neg=1 @ d26 = t26a, d21 = t21a
1391cabdff1aSopenharmony_ci
1392cabdff1aSopenharmony_ci        butterfly       d31, d24, d11, d8  @ d31 = t31,  d24 = t24
1393cabdff1aSopenharmony_ci        butterfly       d30, d25, d22, d23 @ d30 = t30a, d25 = t25a
1394cabdff1aSopenharmony_ci        butterfly_r     d23, d16, d16, d18 @ d23 = t23,  d16 = t16
1395cabdff1aSopenharmony_ci        butterfly_r     d22, d17, d17, d19 @ d22 = t22a, d17 = t17a
1396cabdff1aSopenharmony_ci        butterfly       d18, d21, d27, d21 @ d18 = t18,  d21 = t21
1397cabdff1aSopenharmony_ci        butterfly_r     d27, d28, d9,  d28 @ d27 = t27a, d28 = t28a
1398cabdff1aSopenharmony_ci        butterfly       d8,  d26, d20, d26 @ d8  = t29,  d26 = t26
1399cabdff1aSopenharmony_ci        butterfly       d19, d20, d29, d10 @ d19 = t19a, d20 = t20
        @ Move t29 into its final register; d8 and d10 are passed to the
        @ mbutterfly0 calls below together with q4 and q5.
1400cabdff1aSopenharmony_ci        vmov            d29, d8            @ d29 = t29
1401cabdff1aSopenharmony_ci
1402cabdff1aSopenharmony_ci        mbutterfly0     d27, d20, d27, d20, d8, d10, q4, q5 @ d27 = t27,  d20 = t20
1403cabdff1aSopenharmony_ci        mbutterfly0     d26, d21, d26, d21, d8, d10, q4, q5 @ d26 = t26a, d21 = t21a
1404cabdff1aSopenharmony_ci        mbutterfly0     d25, d22, d25, d22, d8, d10, q4, q5 @ d25 = t25,  d22 = t22
1405cabdff1aSopenharmony_ci        mbutterfly0     d24, d23, d24, d23, d8, d10, q4, q5 @ d24 = t24a, d23 = t23a
1406cabdff1aSopenharmony_ci        bx              lr
1407cabdff1aSopenharmony_ci.endm
1408cabdff1aSopenharmony_ci
@ Odd part of the 32-point idct, operating in place on d16-d31 (register
@ roles are given by the per-line comments).
@ Expects 16 packed 16-bit coefficients in d12-d15. They are widened into
@ q0-q3 for the first stage, after which q0-q3 are reloaded with the widened
@ first 16 idct_coeffs. Also uses r12, and q4, q5, q8 and q9 are passed to
@ mbutterfly as extra registers. Returns through the bx lr in idct32_end.
1409cabdff1aSopenharmony_cifunction idct32_odd
1410cabdff1aSopenharmony_ci        movrel          r12, idct_coeffs
1411cabdff1aSopenharmony_ci
1412cabdff1aSopenharmony_ci        @ Overwrite the idct16 coeffs with the stored ones for idct32
1413cabdff1aSopenharmony_ci        vmovl.s16       q0,  d12
1414cabdff1aSopenharmony_ci        vmovl.s16       q1,  d13
1415cabdff1aSopenharmony_ci        vmovl.s16       q2,  d14
1416cabdff1aSopenharmony_ci        vmovl.s16       q3,  d15
1417cabdff1aSopenharmony_ci
1418cabdff1aSopenharmony_ci        mbutterfly      d16, d31, d0[0], d0[1], q4, q5 @ d16 = t16a, d31 = t31a
1419cabdff1aSopenharmony_ci        mbutterfly      d24, d23, d1[0], d1[1], q4, q5 @ d24 = t17a, d23 = t30a
1420cabdff1aSopenharmony_ci        mbutterfly      d20, d27, d2[0], d2[1], q4, q5 @ d20 = t18a, d27 = t29a
1421cabdff1aSopenharmony_ci        mbutterfly      d28, d19, d3[0], d3[1], q4, q5 @ d28 = t19a, d19 = t28a
1422cabdff1aSopenharmony_ci        mbutterfly      d18, d29, d4[0], d4[1], q4, q5 @ d18 = t20a, d29 = t27a
1423cabdff1aSopenharmony_ci        mbutterfly      d26, d21, d5[0], d5[1], q4, q5 @ d26 = t21a, d21 = t26a
1424cabdff1aSopenharmony_ci        mbutterfly      d22, d25, d6[0], d6[1], q4, q5 @ d22 = t22a, d25 = t25a
1425cabdff1aSopenharmony_ci        mbutterfly      d30, d17, d7[0], d7[1], q4, q5 @ d30 = t23a, d17 = t24a
1426cabdff1aSopenharmony_ci
1427cabdff1aSopenharmony_ci        @ Reload the idct16 coefficients. We could swap the coefficients between
1428cabdff1aSopenharmony_ci        @ q0-q3 and q6-q7 by narrowing/lengthening, but that's slower than just
1429cabdff1aSopenharmony_ci        @ loading and lengthening.
1430cabdff1aSopenharmony_ci        vld1.16         {q0-q1}, [r12,:128]
1431cabdff1aSopenharmony_ci
1432cabdff1aSopenharmony_ci        butterfly       d8,  d24, d16, d24 @ d8  = t16, d24 = t17
1433cabdff1aSopenharmony_ci        butterfly       d9,  d20, d28, d20 @ d9  = t19, d20 = t18
1434cabdff1aSopenharmony_ci        butterfly       d10, d26, d18, d26 @ d10 = t20, d26 = t21
1435cabdff1aSopenharmony_ci        butterfly       d11, d22, d30, d22 @ d11 = t23, d22 = t22
        @ Widen the reloaded coefficients in between the butterflies. The
        @ upper parts go first because d0-d3 are both source and part of
        @ the destination.
1436cabdff1aSopenharmony_ci        vmovl.s16       q2,  d2
1437cabdff1aSopenharmony_ci        vmovl.s16       q3,  d3
1438cabdff1aSopenharmony_ci        vmovl.s16       q1,  d1
1439cabdff1aSopenharmony_ci        vmovl.s16       q0,  d0
1440cabdff1aSopenharmony_ci        butterfly       d28, d25, d17, d25 @ d28 = t24, d25 = t25
1441cabdff1aSopenharmony_ci        butterfly       d30, d21, d29, d21 @ d30 = t27, d21 = t26
1442cabdff1aSopenharmony_ci        butterfly       d29, d23, d31, d23 @ d29 = t31, d23 = t30
1443cabdff1aSopenharmony_ci        butterfly       d31, d27, d19, d27 @ d31 = t28, d27 = t29
1444cabdff1aSopenharmony_ci
1445cabdff1aSopenharmony_ci        mbutterfly      d23, d24, d2[0], d2[1], q8, q9        @ d23 = t17a, d24 = t30a
1446cabdff1aSopenharmony_ci        mbutterfly      d27, d20, d2[0], d2[1], q8, q9, neg=1 @ d27 = t29a, d20 = t18a
1447cabdff1aSopenharmony_ci        mbutterfly      d21, d26, d3[0], d3[1], q8, q9        @ d21 = t21a, d26 = t26a
1448cabdff1aSopenharmony_ci        mbutterfly      d25, d22, d3[0], d3[1], q8, q9, neg=1 @ d25 = t25a, d22 = t22a
        @ Shared final stages. The macro ends with bx lr.
1449cabdff1aSopenharmony_ci        idct32_end
1450cabdff1aSopenharmony_ciendfunc
1451cabdff1aSopenharmony_ci
@ Variant of idct32_odd with the same structure and register roles, except
@ that the first stage uses the mbutterfly_h1 / mbutterfly_h2 macros in
@ place of mbutterfly (presumably single-input forms for the case where
@ half of the inputs are zero - confirm against the macro definitions).
@ Expects 16 packed 16-bit coefficients in d12-d15. q0-q3 are left holding
@ the widened first 16 idct_coeffs for the later stages. Returns through
@ the bx lr in idct32_end.
1452cabdff1aSopenharmony_cifunction idct32_odd_half
1453cabdff1aSopenharmony_ci        movrel          r12, idct_coeffs
1454cabdff1aSopenharmony_ci
        @ Widen the coefficients kept in d12-d15 into q0-q3.
1455cabdff1aSopenharmony_ci        vmovl.s16       q0,  d12
1456cabdff1aSopenharmony_ci        vmovl.s16       q1,  d13
1457cabdff1aSopenharmony_ci        vmovl.s16       q2,  d14
1458cabdff1aSopenharmony_ci        vmovl.s16       q3,  d15
1459cabdff1aSopenharmony_ci
1460cabdff1aSopenharmony_ci        mbutterfly_h1   d16, d31, d0[0], d0[1], q4, q5 @ d16 = t16a, d31 = t31a
1461cabdff1aSopenharmony_ci        mbutterfly_h2   d24, d23, d1[0], d1[1], q4, q5 @ d24 = t17a, d23 = t30a
1462cabdff1aSopenharmony_ci        mbutterfly_h1   d20, d27, d2[0], d2[1], q4, q5 @ d20 = t18a, d27 = t29a
1463cabdff1aSopenharmony_ci        mbutterfly_h2   d28, d19, d3[0], d3[1], q4, q5 @ d28 = t19a, d19 = t28a
1464cabdff1aSopenharmony_ci        mbutterfly_h1   d18, d29, d4[0], d4[1], q4, q5 @ d18 = t20a, d29 = t27a
1465cabdff1aSopenharmony_ci        mbutterfly_h2   d26, d21, d5[0], d5[1], q4, q5 @ d26 = t21a, d21 = t26a
1466cabdff1aSopenharmony_ci        mbutterfly_h1   d22, d25, d6[0], d6[1], q4, q5 @ d22 = t22a, d25 = t25a
1467cabdff1aSopenharmony_ci        mbutterfly_h2   d30, d17, d7[0], d7[1], q4, q5 @ d30 = t23a, d17 = t24a
1468cabdff1aSopenharmony_ci
        @ Reload the first 16 idct_coeffs. They are widened further down,
        @ in between the butterflies.
1469cabdff1aSopenharmony_ci        vld1.16         {q0-q1}, [r12,:128]
1470cabdff1aSopenharmony_ci
1471cabdff1aSopenharmony_ci        butterfly       d8,  d24, d16, d24 @ d8  = t16, d24 = t17
1472cabdff1aSopenharmony_ci        butterfly       d9,  d20, d28, d20 @ d9  = t19, d20 = t18
1473cabdff1aSopenharmony_ci        butterfly       d10, d26, d18, d26 @ d10 = t20, d26 = t21
1474cabdff1aSopenharmony_ci        butterfly       d11, d22, d30, d22 @ d11 = t23, d22 = t22
        @ The upper parts go first because d0-d3 are both source and part
        @ of the destination.
1475cabdff1aSopenharmony_ci        vmovl.s16       q2,  d2
1476cabdff1aSopenharmony_ci        vmovl.s16       q3,  d3
1477cabdff1aSopenharmony_ci        vmovl.s16       q1,  d1
1478cabdff1aSopenharmony_ci        vmovl.s16       q0,  d0
1479cabdff1aSopenharmony_ci        butterfly       d28, d25, d17, d25 @ d28 = t24, d25 = t25
1480cabdff1aSopenharmony_ci        butterfly       d30, d21, d29, d21 @ d30 = t27, d21 = t26
1481cabdff1aSopenharmony_ci        butterfly       d29, d23, d31, d23 @ d29 = t31, d23 = t30
1482cabdff1aSopenharmony_ci        butterfly       d31, d27, d19, d27 @ d31 = t28, d27 = t29
1483cabdff1aSopenharmony_ci
1484cabdff1aSopenharmony_ci        mbutterfly      d23, d24, d2[0], d2[1], q8, q9        @ d23 = t17a, d24 = t30a
1485cabdff1aSopenharmony_ci        mbutterfly      d27, d20, d2[0], d2[1], q8, q9, neg=1 @ d27 = t29a, d20 = t18a
1486cabdff1aSopenharmony_ci        mbutterfly      d21, d26, d3[0], d3[1], q8, q9        @ d21 = t21a, d26 = t26a
1487cabdff1aSopenharmony_ci        mbutterfly      d25, d22, d3[0], d3[1], q8, q9, neg=1 @ d25 = t25a, d22 = t22a
        @ Shared final stages. The macro ends with bx lr.
1488cabdff1aSopenharmony_ci        idct32_end
1489cabdff1aSopenharmony_ciendfunc
1490cabdff1aSopenharmony_ci
@ Odd-coefficient part of the 32-point IDCT, "quarter" variant.
@ Reached through "bl idct32_odd\suffix" (suffix = _quarter) from the
@ idct32_1d_2x32_pass1/pass2 functions in this file.
@ In:  d16-d19 = the only inputs read here; the _quarter callers load
@                IN(1), IN(3), IN(5), IN(7) into them
@      d12-d15 = 16-bit coefficients (q6-q7); the 32x32 entry point in
@                this file loads them from idct_coeffs + 32 bytes
@ Scratch: r12, q4-q5 (d8-d11), plus q8-q15 as intermediates.
@ q0-q3 are reloaded from the start of idct_coeffs and widened to 32 bit
@ before idct32_end runs. idct32_end and mbutterfly_l are macros defined
@ outside this section; the final output layout is whatever idct32_end
@ produces.
1491cabdff1aSopenharmony_cifunction idct32_odd_quarter
1492cabdff1aSopenharmony_ci        movrel          r12, idct_coeffs
1493cabdff1aSopenharmony_ci
        @ Widen the 16-bit coefficients in d12-d15 to 32 bit in q0-q3
1494cabdff1aSopenharmony_ci        vmovl.s16       q0,  d12
1495cabdff1aSopenharmony_ci        vmovl.s16       q1,  d13
1496cabdff1aSopenharmony_ci        vmovl.s16       q2,  d14
1497cabdff1aSopenharmony_ci        vmovl.s16       q3,  d15
1498cabdff1aSopenharmony_ci
        @ Zeroed accumulators: the vmlsl below then yields a negated product
1499cabdff1aSopenharmony_ci        vmov.s64        q14, #0
1500cabdff1aSopenharmony_ci        vmov.s64        q5,  #0
1501cabdff1aSopenharmony_ci
        @ Each input is multiplied by a pair of coefficients, giving
        @ 64-bit products (two of them negated via vmlsl)
1502cabdff1aSopenharmony_ci        vmull.s32       q4,  d16, d0[0]
1503cabdff1aSopenharmony_ci        vmlsl.s32       q14, d19, d3[1]
1504cabdff1aSopenharmony_ci        vmull.s32       q15, d16, d0[1]
1505cabdff1aSopenharmony_ci        vmull.s32       q11, d17, d7[0]
1506cabdff1aSopenharmony_ci        vmlsl.s32       q5,  d17, d7[1]
1507cabdff1aSopenharmony_ci        vmull.s32       q13, d19, d3[0]
1508cabdff1aSopenharmony_ci        vmull.s32       q10, d18, d4[0]
1509cabdff1aSopenharmony_ci        vmull.s32       q12, d18, d4[1]
1510cabdff1aSopenharmony_ci
        @ All multiplies using the widened d12-d15 coefficients are issued;
        @ reload the first 16 entries of idct_coeffs into q0-q1
1511cabdff1aSopenharmony_ci        vld1.16         {q0-q1}, [r12,:128]
1512cabdff1aSopenharmony_ci
        @ Round, shift right by 14 and narrow the 64-bit products to 32 bit
1513cabdff1aSopenharmony_ci        vrshrn.s64      d8,  q4,  #14
1514cabdff1aSopenharmony_ci        vrshrn.s64      d9,  q14, #14
1515cabdff1aSopenharmony_ci        vrshrn.s64      d29, q15, #14
1516cabdff1aSopenharmony_ci        vrshrn.s64      d28, q11, #14
1517cabdff1aSopenharmony_ci
        @ Widen the reloaded coefficients to 32 bit in q0-q3; q2/q3 are
        @ written first since they read d2/d3, which widening q1 overwrites
1518cabdff1aSopenharmony_ci        vmovl.s16       q2,  d2
1519cabdff1aSopenharmony_ci        vmovl.s16       q3,  d3
1520cabdff1aSopenharmony_ci        vmovl.s16       q1,  d1
1521cabdff1aSopenharmony_ci        vmovl.s16       q0,  d0
1522cabdff1aSopenharmony_ci
1523cabdff1aSopenharmony_ci        vrshrn.s64      d11, q5,  #14
1524cabdff1aSopenharmony_ci        vrshrn.s64      d31, q13, #14
1525cabdff1aSopenharmony_ci        vrshrn.s64      d10, q10, #14
1526cabdff1aSopenharmony_ci        vrshrn.s64      d30, q12, #14
1527cabdff1aSopenharmony_ci
        @ Butterfly multiplies with 64-bit results, each narrowed with the
        @ same rounding shift by 14.
        @ NOTE(review): the result registers (d23/d24, d27/d20, d21/d26,
        @ d25/d22) match those of the mbutterfly calls at the end of the
        @ preceding function (t17a/t30a, t29a/t18a, t21a/t26a, t25a/t22a);
        @ presumably the same terms - confirm against mbutterfly_l.
1528cabdff1aSopenharmony_ci        mbutterfly_l    q8,  q9,  d29, d8,  d2[0], d2[1]
1529cabdff1aSopenharmony_ci        mbutterfly_l    q13, q10, d31, d9,  d2[0], d2[1], neg=1
1530cabdff1aSopenharmony_ci        vrshrn.s64      d23, q8,  #14
1531cabdff1aSopenharmony_ci        vrshrn.s64      d24, q9,  #14
1532cabdff1aSopenharmony_ci        vrshrn.s64      d27, q13, #14
1533cabdff1aSopenharmony_ci        vrshrn.s64      d20, q10, #14
1534cabdff1aSopenharmony_ci        mbutterfly_l    q8,  q9,  d30, d10, d3[0], d3[1]
1535cabdff1aSopenharmony_ci        vrshrn.s64      d21, q8,  #14
1536cabdff1aSopenharmony_ci        vrshrn.s64      d26, q9,  #14
1537cabdff1aSopenharmony_ci        mbutterfly_l    q8,  q9,  d28, d11, d3[0], d3[1], neg=1
1538cabdff1aSopenharmony_ci        vrshrn.s64      d25, q8,  #14
1539cabdff1aSopenharmony_ci        vrshrn.s64      d22, q9,  #14
1540cabdff1aSopenharmony_ci
1541cabdff1aSopenharmony_ci        idct32_end
1542cabdff1aSopenharmony_ciendfunc
1543cabdff1aSopenharmony_ci
@ Emits idct32_1d_2x32_pass1\suffix\()_neon and
@ idct32_1d_2x32_pass2\suffix\()_neon. \suffix is blank, _quarter or
@ _half; it selects how many input rows are loaded (16, 4 or 8 per
@ even/odd set) and which idct16\suffix / idct32_odd\suffix routines
@ are called.
1544cabdff1aSopenharmony_ci.macro idct32_funcs suffix
1545cabdff1aSopenharmony_ci@ Do a 32-point IDCT of a 2x32 slice out of a 32x32 matrix.
1546cabdff1aSopenharmony_ci@ We don't have register space to do a single pass IDCT of 2x32 though,
1547cabdff1aSopenharmony_ci@ but the 32-point IDCT can be decomposed into two 16-point IDCTs;
1548cabdff1aSopenharmony_ci@ a normal IDCT16 with every other input component (the even ones, with
1549cabdff1aSopenharmony_ci@ each output written twice), followed by a separate 16-point IDCT
1550cabdff1aSopenharmony_ci@ of the odd inputs, added/subtracted onto the outputs of the first idct16.
1551cabdff1aSopenharmony_ci@ r0 = dst (temp buffer)
1552cabdff1aSopenharmony_ci@ r1 = unused
1553cabdff1aSopenharmony_ci@ r2 = src
@ The input rows that are read are overwritten with zeros.
@ On return r0 has advanced by 256 bytes; r12, r2 and d8 are clobbered.
1554cabdff1aSopenharmony_cifunction idct32_1d_2x32_pass1\suffix\()_neon
1555cabdff1aSopenharmony_ci        push            {lr}
1556cabdff1aSopenharmony_ci
1557cabdff1aSopenharmony_ci        @ Double stride of the input, since we only read every other line
1558cabdff1aSopenharmony_ci        mov             r12, #256
        @ d8 = 0, stored over each input row right after it has been loaded
1559cabdff1aSopenharmony_ci        vmov.s32        d8,  #0
1560cabdff1aSopenharmony_ci
1561cabdff1aSopenharmony_ci        @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
1562cabdff1aSopenharmony_ci.ifb \suffix
1563cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1564cabdff1aSopenharmony_ci        vld1.32         {d\i}, [r2,:64]
1565cabdff1aSopenharmony_ci        vst1.32         {d8},  [r2,:64], r12
1566cabdff1aSopenharmony_ci.endr
1567cabdff1aSopenharmony_ci.endif
1568cabdff1aSopenharmony_ci.ifc \suffix,_quarter
1569cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19
1570cabdff1aSopenharmony_ci        vld1.32         {d\i}, [r2,:64]
1571cabdff1aSopenharmony_ci        vst1.32         {d8},  [r2,:64], r12
1572cabdff1aSopenharmony_ci.endr
1573cabdff1aSopenharmony_ci.endif
1574cabdff1aSopenharmony_ci.ifc \suffix,_half
1575cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23
1576cabdff1aSopenharmony_ci        vld1.32         {d\i}, [r2,:64]
1577cabdff1aSopenharmony_ci        vst1.32         {d8},  [r2,:64], r12
1578cabdff1aSopenharmony_ci.endr
1579cabdff1aSopenharmony_ci.endif
1580cabdff1aSopenharmony_ci
1581cabdff1aSopenharmony_ci        bl              idct16\suffix
1582cabdff1aSopenharmony_ci
1583cabdff1aSopenharmony_ci        @ Do eight 2x2 transposes. Originally, d16-d31 contain the
1584cabdff1aSopenharmony_ci        @ 16 rows. Afterwards, d16-d17, d18-d19 etc contain the eight
1585cabdff1aSopenharmony_ci        @ transposed 2x2 blocks.
1586cabdff1aSopenharmony_ci        transpose32_8x_2x2 d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
1587cabdff1aSopenharmony_ci
1588cabdff1aSopenharmony_ci        @ Store the registers a, b, c, d, e, f, g, h horizontally, followed
1589cabdff1aSopenharmony_ci        @ by the same registers h, g, f, e, d, c, b, a mirrored.
        @ Each register is lane-swapped (vrev64.32) after its first store,
        @ so the mirrored half is also reversed within each register.
        @ One invocation writes 16 * 8 = 128 bytes.
1590cabdff1aSopenharmony_ci.macro store_rev a, b, c, d, e, f, g, h
1591cabdff1aSopenharmony_ci.irp i, \a, \b, \c, \d, \e, \f, \g, \h
1592cabdff1aSopenharmony_ci        vst1.32         {d\i}, [r0,:64]!
1593cabdff1aSopenharmony_ci        vrev64.32       d\i, d\i
1594cabdff1aSopenharmony_ci.endr
1595cabdff1aSopenharmony_ci.irp i, \h, \g, \f, \e, \d, \c, \b, \a
1596cabdff1aSopenharmony_ci        vst1.32         {d\i}, [r0,:64]!
1597cabdff1aSopenharmony_ci.endr
1598cabdff1aSopenharmony_ci.endm
1599cabdff1aSopenharmony_ci        store_rev       16, 18, 20, 22, 24, 26, 28, 30
1600cabdff1aSopenharmony_ci        store_rev       17, 19, 21, 23, 25, 27, 29, 31
        @ Rewind r0 over the 2 * 128 bytes just written, so the odd half
        @ below can accumulate onto them
1601cabdff1aSopenharmony_ci        sub             r0,  r0,  #256
1602cabdff1aSopenharmony_ci.purgem store_rev
1603cabdff1aSopenharmony_ci
1604cabdff1aSopenharmony_ci        @ Move r2 back to the start of the input, and move
1605cabdff1aSopenharmony_ci        @ to the first odd row
        @ (16, 4 or 8 strides back, matching the number of rows loaded above,
        @ then 128 bytes = one row forward)
1606cabdff1aSopenharmony_ci.ifb \suffix
1607cabdff1aSopenharmony_ci        sub             r2,  r2,  r12, lsl #4
1608cabdff1aSopenharmony_ci.endif
1609cabdff1aSopenharmony_ci.ifc \suffix,_quarter
1610cabdff1aSopenharmony_ci        sub             r2,  r2,  r12, lsl #2
1611cabdff1aSopenharmony_ci.endif
1612cabdff1aSopenharmony_ci.ifc \suffix,_half
1613cabdff1aSopenharmony_ci        sub             r2,  r2,  r12, lsl #3
1614cabdff1aSopenharmony_ci.endif
1615cabdff1aSopenharmony_ci        add             r2,  r2,  #128
1616cabdff1aSopenharmony_ci
1617cabdff1aSopenharmony_ci        vmov.s32        d8,  #0
1618cabdff1aSopenharmony_ci        @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
        @ NOTE(review): the loads/stores below use .16 element size while
        @ the even-row ones above use .32, although the data is handled as
        @ 32-bit elements everywhere else - confirm whether this is intentional.
1619cabdff1aSopenharmony_ci.ifb \suffix
1620cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1621cabdff1aSopenharmony_ci        vld1.16         {d\i}, [r2,:64]
1622cabdff1aSopenharmony_ci        vst1.16         {d8},  [r2,:64], r12
1623cabdff1aSopenharmony_ci.endr
1624cabdff1aSopenharmony_ci.endif
1625cabdff1aSopenharmony_ci.ifc \suffix,_quarter
1626cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19
1627cabdff1aSopenharmony_ci        vld1.16         {d\i}, [r2,:64]
1628cabdff1aSopenharmony_ci        vst1.16         {d8},  [r2,:64], r12
1629cabdff1aSopenharmony_ci.endr
1630cabdff1aSopenharmony_ci.endif
1631cabdff1aSopenharmony_ci.ifc \suffix,_half
1632cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23
1633cabdff1aSopenharmony_ci        vld1.16         {d\i}, [r2,:64]
1634cabdff1aSopenharmony_ci        vst1.16         {d8},  [r2,:64], r12
1635cabdff1aSopenharmony_ci.endr
1636cabdff1aSopenharmony_ci.endif
1637cabdff1aSopenharmony_ci
1638cabdff1aSopenharmony_ci        bl              idct32_odd\suffix
1639cabdff1aSopenharmony_ci
        @ Same 2x2 transposes as above, with the register list in reverse order
1640cabdff1aSopenharmony_ci        transpose32_8x_2x2 d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16
1641cabdff1aSopenharmony_ci
1642cabdff1aSopenharmony_ci        @ Store the registers a, b, c, d, e, f, g, h horizontally,
1643cabdff1aSopenharmony_ci        @ adding into the output first, and then mirrored, subtracted
1644cabdff1aSopenharmony_ci        @ from the output.
        @ d8 is the scratch register for the read-modify-write of the
        @ temp buffer.
1645cabdff1aSopenharmony_ci.macro store_rev a, b, c, d, e, f, g, h
1646cabdff1aSopenharmony_ci.irp i, \a, \b, \c, \d, \e, \f, \g, \h
1647cabdff1aSopenharmony_ci        vld1.32         {d8},  [r0,:64]
1648cabdff1aSopenharmony_ci        vadd.s32        d8, d8, d\i
1649cabdff1aSopenharmony_ci        vst1.32         {d8},  [r0,:64]!
1650cabdff1aSopenharmony_ci        vrev64.32       d\i, d\i
1651cabdff1aSopenharmony_ci.endr
1652cabdff1aSopenharmony_ci.irp i, \h, \g, \f, \e, \d, \c, \b, \a
1653cabdff1aSopenharmony_ci        vld1.32         {d8},  [r0,:64]
1654cabdff1aSopenharmony_ci        vsub.s32        d8, d8, d\i
1655cabdff1aSopenharmony_ci        vst1.32         {d8},  [r0,:64]!
1656cabdff1aSopenharmony_ci.endr
1657cabdff1aSopenharmony_ci.endm
1658cabdff1aSopenharmony_ci
1659cabdff1aSopenharmony_ci        store_rev       31, 29, 27, 25, 23, 21, 19, 17
1660cabdff1aSopenharmony_ci        store_rev       30, 28, 26, 24, 22, 20, 18, 16
1661cabdff1aSopenharmony_ci.purgem store_rev
1662cabdff1aSopenharmony_ci        pop             {pc}
1663cabdff1aSopenharmony_ciendfunc
1664cabdff1aSopenharmony_ci.ltorg
1665cabdff1aSopenharmony_ci
1666cabdff1aSopenharmony_ci@ This is mostly the same as 2x32_pass1, but without the transpose,
1667cabdff1aSopenharmony_ci@ and use the source as temp buffer between the two idct passes, and
1668cabdff1aSopenharmony_ci@ add into the destination.
1669cabdff1aSopenharmony_ci@ r0 = dst
1670cabdff1aSopenharmony_ci@ r1 = dst stride
1671cabdff1aSopenharmony_ci@ r2 = src (temp buffer)
@ r9 = maximum pixel value, used as the upper clamp for the output
@ Clobbers r12, q2-q5 and r0/r2; q0-q3 hold the 32-bit idct16
@ coefficients again on return.
1672cabdff1aSopenharmony_cifunction idct32_1d_2x32_pass2\suffix\()_neon
1673cabdff1aSopenharmony_ci        push            {lr}
1674cabdff1aSopenharmony_ci
1675cabdff1aSopenharmony_ci        mov             r12, #256
1676cabdff1aSopenharmony_ci        @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
        @ Unlike pass 1 the input is not cleared while loading; r2 is
        @ rewound to the first row after each load sequence.
1677cabdff1aSopenharmony_ci.ifb \suffix
1678cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1679cabdff1aSopenharmony_ci        vld1.32         {d\i}, [r2,:64], r12
1680cabdff1aSopenharmony_ci.endr
1681cabdff1aSopenharmony_ci        sub             r2,  r2,  r12, lsl #4
1682cabdff1aSopenharmony_ci.endif
1683cabdff1aSopenharmony_ci.ifc \suffix,_quarter
1684cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19
1685cabdff1aSopenharmony_ci        vld1.32         {d\i}, [r2,:64], r12
1686cabdff1aSopenharmony_ci.endr
1687cabdff1aSopenharmony_ci        sub             r2,  r2,  r12, lsl #2
1688cabdff1aSopenharmony_ci.endif
1689cabdff1aSopenharmony_ci.ifc \suffix,_half
1690cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23
1691cabdff1aSopenharmony_ci        vld1.32         {d\i}, [r2,:64], r12
1692cabdff1aSopenharmony_ci.endr
1693cabdff1aSopenharmony_ci        sub             r2,  r2,  r12, lsl #3
1694cabdff1aSopenharmony_ci.endif
1695cabdff1aSopenharmony_ci
1696cabdff1aSopenharmony_ci        bl              idct16\suffix
1697cabdff1aSopenharmony_ci
        @ Write all 16 idct16 outputs back over the even rows of the temp
        @ buffer; load_acc_store below reads them again.
1698cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1699cabdff1aSopenharmony_ci        vst1.32         {d\i}, [r2,:64], r12
1700cabdff1aSopenharmony_ci.endr
1701cabdff1aSopenharmony_ci
        @ Back to the first row, then one row (128 bytes) forward to the
        @ first odd row
1702cabdff1aSopenharmony_ci        sub             r2,  r2,  r12, lsl #4
1703cabdff1aSopenharmony_ci        add             r2,  r2,  #128
1704cabdff1aSopenharmony_ci
1705cabdff1aSopenharmony_ci        @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
1706cabdff1aSopenharmony_ci.ifb \suffix
1707cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1708cabdff1aSopenharmony_ci        vld1.32         {d\i}, [r2,:64], r12
1709cabdff1aSopenharmony_ci.endr
1710cabdff1aSopenharmony_ci        sub             r2,  r2,  r12, lsl #4
1711cabdff1aSopenharmony_ci.endif
1712cabdff1aSopenharmony_ci.ifc \suffix,_quarter
1713cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19
1714cabdff1aSopenharmony_ci        vld1.32         {d\i}, [r2,:64], r12
1715cabdff1aSopenharmony_ci.endr
1716cabdff1aSopenharmony_ci        sub             r2,  r2,  r12, lsl #2
1717cabdff1aSopenharmony_ci.endif
1718cabdff1aSopenharmony_ci.ifc \suffix,_half
1719cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23
1720cabdff1aSopenharmony_ci        vld1.32         {d\i}, [r2,:64], r12
1721cabdff1aSopenharmony_ci.endr
1722cabdff1aSopenharmony_ci        sub             r2,  r2,  r12, lsl #3
1723cabdff1aSopenharmony_ci.endif
        @ r2 back to the first (even) row, where the idct16 outputs start
1724cabdff1aSopenharmony_ci        sub             r2,  r2,  #128
1725cabdff1aSopenharmony_ci
1726cabdff1aSopenharmony_ci        bl              idct32_odd\suffix
1727cabdff1aSopenharmony_ci
1728cabdff1aSopenharmony_ci        @ Narrow the idct16 coefficients in q0-q3 into q0-q1, to
1729cabdff1aSopenharmony_ci        @ allow clobbering q2-q3 below.
1730cabdff1aSopenharmony_ci        vmovn.s32       d0,  q0
1731cabdff1aSopenharmony_ci        vmovn.s32       d1,  q1
1732cabdff1aSopenharmony_ci        vmovn.s32       d2,  q2
1733cabdff1aSopenharmony_ci        vmovn.s32       d3,  q3
1734cabdff1aSopenharmony_ci
1735cabdff1aSopenharmony_ci        mov             r12, #256
        @ q4 = r9 replicated into every 16-bit lane; the vmin.u16 in
        @ load_acc_store uses it as the upper clamp
1736cabdff1aSopenharmony_ci        vdup.s16        q4,  r9
        @ Load four idct16 outputs from the temp buffer (stride r12), add
        @ (neg=0) or subtract (neg=1) d\a-d\d, apply a rounding shift by 6,
        @ add the result onto two 16-bit destination pixels per row (row
        @ stride r1), saturate to unsigned 16 bit, clamp against q4 and
        @ store back. r0 ends up advanced by four destination rows.
1737cabdff1aSopenharmony_ci.macro load_acc_store a, b, c, d, neg=0
1738cabdff1aSopenharmony_ci        vld1.32         {d4},  [r2,:64], r12
1739cabdff1aSopenharmony_ci        vld1.32         {d5},  [r2,:64], r12
1740cabdff1aSopenharmony_ci.if \neg == 0
1741cabdff1aSopenharmony_ci        vadd.s32        d4,  d4,  d\a
1742cabdff1aSopenharmony_ci        vld1.32         {d6},  [r2,:64], r12
1743cabdff1aSopenharmony_ci        vadd.s32        d5,  d5,  d\b
1744cabdff1aSopenharmony_ci        vld1.32         {d7},  [r2,:64], r12
1745cabdff1aSopenharmony_ci        vadd.s32        d6,  d6,  d\c
1746cabdff1aSopenharmony_ci        vadd.s32        d7,  d7,  d\d
1747cabdff1aSopenharmony_ci.else
1748cabdff1aSopenharmony_ci        vsub.s32        d4,  d4,  d\a
1749cabdff1aSopenharmony_ci        vld1.32         {d6},  [r2,:64], r12
1750cabdff1aSopenharmony_ci        vsub.s32        d5,  d5,  d\b
1751cabdff1aSopenharmony_ci        vld1.32         {d7},  [r2,:64], r12
1752cabdff1aSopenharmony_ci        vsub.s32        d6,  d6,  d\c
1753cabdff1aSopenharmony_ci        vsub.s32        d7,  d7,  d\d
1754cabdff1aSopenharmony_ci.endif
1755cabdff1aSopenharmony_ci        vld1.32         {d10[]},  [r0,:32], r1
1756cabdff1aSopenharmony_ci        vld1.32         {d10[1]}, [r0,:32], r1
1757cabdff1aSopenharmony_ci        vrshr.s32       q2,  q2,  #6
1758cabdff1aSopenharmony_ci        vld1.32         {d11[]},  [r0,:32], r1
1759cabdff1aSopenharmony_ci        vrshr.s32       q3,  q3,  #6
1760cabdff1aSopenharmony_ci        vld1.32         {d11[1]}, [r0,:32], r1
        @ Rewind r0 over the four rows just read, so the stores below hit
        @ the same rows
1761cabdff1aSopenharmony_ci        sub             r0,  r0,  r1, lsl #2
1762cabdff1aSopenharmony_ci        vaddw.u16       q2,  q2,  d10
1763cabdff1aSopenharmony_ci        vaddw.u16       q3,  q3,  d11
1764cabdff1aSopenharmony_ci        vqmovun.s32     d4,  q2
1765cabdff1aSopenharmony_ci        vqmovun.s32     d5,  q3
1766cabdff1aSopenharmony_ci        vmin.u16        q2,  q2,  q4
1767cabdff1aSopenharmony_ci        vst1.32         {d4[0]},  [r0,:32], r1
1768cabdff1aSopenharmony_ci        vst1.32         {d4[1]},  [r0,:32], r1
1769cabdff1aSopenharmony_ci        vst1.32         {d5[0]},  [r0,:32], r1
1770cabdff1aSopenharmony_ci        vst1.32         {d5[1]},  [r0,:32], r1
1771cabdff1aSopenharmony_ci.endm
        @ First 16 output rows: idct16 outputs in forward order, plus the
        @ odd-half results d31 down to d16
1772cabdff1aSopenharmony_ci        load_acc_store  31, 30, 29, 28
1773cabdff1aSopenharmony_ci        load_acc_store  27, 26, 25, 24
1774cabdff1aSopenharmony_ci        load_acc_store  23, 22, 21, 20
1775cabdff1aSopenharmony_ci        load_acc_store  19, 18, 17, 16
        @ Step back onto the last idct16 output and negate the stride, so
        @ the last 16 output rows read the temp buffer backwards while
        @ subtracting d16 up to d31
1776cabdff1aSopenharmony_ci        sub             r2,  r2,  r12
1777cabdff1aSopenharmony_ci        neg             r12, r12
1778cabdff1aSopenharmony_ci        load_acc_store  16, 17, 18, 19, 1
1779cabdff1aSopenharmony_ci        load_acc_store  20, 21, 22, 23, 1
1780cabdff1aSopenharmony_ci        load_acc_store  24, 25, 26, 27, 1
1781cabdff1aSopenharmony_ci        load_acc_store  28, 29, 30, 31, 1
1782cabdff1aSopenharmony_ci.purgem load_acc_store
1783cabdff1aSopenharmony_ci        @ Lengthen the idct16 coeffs back into 32 bit form
1784cabdff1aSopenharmony_ci        vmovl.s16       q2,  d2
1785cabdff1aSopenharmony_ci        vmovl.s16       q3,  d3
1786cabdff1aSopenharmony_ci        vmovl.s16       q1,  d1
1787cabdff1aSopenharmony_ci        vmovl.s16       q0,  d0
1788cabdff1aSopenharmony_ci        pop             {pc}
1789cabdff1aSopenharmony_ciendfunc
1790cabdff1aSopenharmony_ci.endm
1791cabdff1aSopenharmony_ci
@ Instantiate the pass1/pass2 functions for the full (blank suffix),
@ _quarter and _half variants.
1792cabdff1aSopenharmony_ciidct32_funcs
1793cabdff1aSopenharmony_ciidct32_funcs _quarter
1794cabdff1aSopenharmony_ciidct32_funcs _half
1795cabdff1aSopenharmony_ci
@ Thresholds compared against r3 (presumably the eob argument - confirm
@ against the C prototype) by the 32x32 functions below. r8 starts at the
@ second entry and is advanced with ldrh_post before each 2-wide pass 1
@ slice after the first; when r3 <= the entry read, the remaining slices
@ are skipped and their part of the temp buffer is zero-filled instead.
@ The entries 34 and 135 are also the hard-coded limits used to select
@ the quarter and half code paths.
1796cabdff1aSopenharmony_ciconst min_eob_idct_idct_32, align=4
1797cabdff1aSopenharmony_ci        .short  0, 3, 9, 21, 34, 51, 70, 98, 135, 176, 240, 258, 336, 357, 448, 472
1798cabdff1aSopenharmony_ciendconst
1799cabdff1aSopenharmony_ci
@ Shared body of the 10 and 12 bit 32x32 IDCT+add entry points, which
@ branch here after pushing {r4-r9,lr} and setting r9.
@ r0 = dst, r1 = dst stride, r2 = src coefficients (as passed on to the
@ pass1/pass2 functions), r3 = value compared against the
@ min_eob_idct_idct_32 thresholds (presumably eob), r9 = pixel clamp value.
@ Register roles after the prologue:
@   r4/r5/r6 = saved dst / stride / src
@   r7       = number of bytes subtracted from sp
@   r8       = pointer into min_eob_idct_idct_32
@   q0-q3    = first 16 idct_coeffs entries widened to 32 bit
@   q6-q7    = the following 16 idct_coeffs entries (16 bit)
@ sp points at a 4096 byte temp buffer holding the pass 1 output.
1800cabdff1aSopenharmony_cifunction vp9_idct_idct_32x32_add_16_neon
        @ r3 == 1 is handed off to idct32x32_dc_add_neon (defined outside
        @ this section) before anything beyond the entry point's push has
        @ been saved
1801cabdff1aSopenharmony_ci        cmp             r3,  #1
1802cabdff1aSopenharmony_ci        beq             idct32x32_dc_add_neon
1803cabdff1aSopenharmony_ci        vpush           {q4-q7}
        @ Start at the second table entry (+2 bytes = one .short)
1804cabdff1aSopenharmony_ci        movrel          r8,  min_eob_idct_idct_32 + 2
1805cabdff1aSopenharmony_ci
1806cabdff1aSopenharmony_ci        @ Align the stack, allocate a temp buffer
        @ r7 = (sp & 15) + 4096, so sp - r7 is 16-byte aligned with 4096
        @ bytes (32 * 32 * 4) available above it.
        @ NOTE(review): the T/A line prefixes presumably select the Thumb
        @ and ARM variants (macros from asm.S) - confirm.
1807cabdff1aSopenharmony_ciT       mov             r7,  sp
1808cabdff1aSopenharmony_ciT       and             r7,  r7,  #15
1809cabdff1aSopenharmony_ciA       and             r7,  sp,  #15
1810cabdff1aSopenharmony_ci        add             r7,  r7,  #4096
1811cabdff1aSopenharmony_ci        sub             sp,  sp,  r7
1812cabdff1aSopenharmony_ci
        @ Keep the arguments in registers that survive the calls below
1813cabdff1aSopenharmony_ci        mov             r4,  r0
1814cabdff1aSopenharmony_ci        mov             r5,  r1
1815cabdff1aSopenharmony_ci        mov             r6,  r2
1816cabdff1aSopenharmony_ci
        @ q0-q1 = first 32 bytes of idct_coeffs (widened into q0-q3 below),
        @ q6-q7 = the next 32 bytes, kept in 16-bit form
1817cabdff1aSopenharmony_ci        movrel          r12, idct_coeffs
1818cabdff1aSopenharmony_ci        vld1.16         {q0-q1}, [r12,:128]!
1819cabdff1aSopenharmony_ci        vld1.16         {q6-q7}, [r12,:128]
1820cabdff1aSopenharmony_ci        vmovl.s16       q2,  d2
1821cabdff1aSopenharmony_ci        vmovl.s16       q3,  d3
1822cabdff1aSopenharmony_ci        vmovl.s16       q1,  d1
1823cabdff1aSopenharmony_ci        vmovl.s16       q0,  d0
1824cabdff1aSopenharmony_ci
        @ Small r3: continue in the reduced variants, which share this
        @ prologue and have their own copy of the epilogue
1825cabdff1aSopenharmony_ci        cmp             r3,  #34
1826cabdff1aSopenharmony_ci        ble             idct32x32_quarter_add_16_neon
1827cabdff1aSopenharmony_ci        cmp             r3,  #135
1828cabdff1aSopenharmony_ci        ble             idct32x32_half_add_16_neon
1829cabdff1aSopenharmony_ci
        @ Pass 1, one 2-wide slice per iteration: r0 = temp buffer row \i
        @ (128 bytes per row), r2 = src advanced by \i 32-bit coefficients.
        @ From the second slice on, stop early when r3 <= the next table
        @ entry, with r1 = number of zero-fill iterations still needed.
1830cabdff1aSopenharmony_ci.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
1831cabdff1aSopenharmony_ci        add             r0,  sp,  #(\i*128)
1832cabdff1aSopenharmony_ci.if \i > 0
1833cabdff1aSopenharmony_ci        ldrh_post       r1,  r8,  #2
1834cabdff1aSopenharmony_ci        cmp             r3,  r1
1835cabdff1aSopenharmony_ci        it              le
1836cabdff1aSopenharmony_ci        movle           r1,  #(32 - \i)/2
1837cabdff1aSopenharmony_ci        ble             1f
1838cabdff1aSopenharmony_ci.endif
1839cabdff1aSopenharmony_ci        add             r2,  r6,  #(\i*4)
1840cabdff1aSopenharmony_ci        bl              idct32_1d_2x32_pass1_neon
1841cabdff1aSopenharmony_ci.endr
1842cabdff1aSopenharmony_ci        b               3f
1843cabdff1aSopenharmony_ci
1844cabdff1aSopenharmony_ci1:
1845cabdff1aSopenharmony_ci        @ Write zeros to the temp buffer for pass 2
        @ r0 = first temp buffer row not produced by pass 1; every loop
        @ iteration clears two 128 byte rows, r1 iterations in total
1846cabdff1aSopenharmony_ci        vmov.i16        q14, #0
1847cabdff1aSopenharmony_ci        vmov.i16        q15, #0
1848cabdff1aSopenharmony_ci2:
1849cabdff1aSopenharmony_ci        subs            r1,  r1,  #1
1850cabdff1aSopenharmony_ci.rept 2
1851cabdff1aSopenharmony_ci        @ Fill one line with zeros
1852cabdff1aSopenharmony_ci        vst1.16         {q14-q15}, [r0,:128]!
1853cabdff1aSopenharmony_ci        vst1.16         {q14-q15}, [r0,:128]!
1854cabdff1aSopenharmony_ci        vst1.16         {q14-q15}, [r0,:128]!
1855cabdff1aSopenharmony_ci        vst1.16         {q14-q15}, [r0,:128]!
1856cabdff1aSopenharmony_ci.endr
1857cabdff1aSopenharmony_ci        bne             2b
1858cabdff1aSopenharmony_ci3:
        @ Pass 2, one 2-wide slice per iteration: r0 = dst advanced by \i
        @ 16-bit pixels, r1 = dst stride, r2 = temp buffer advanced by \i
        @ 32-bit values
1859cabdff1aSopenharmony_ci.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
1860cabdff1aSopenharmony_ci        add             r0,  r4,  #(\i*2)
1861cabdff1aSopenharmony_ci        mov             r1,  r5
1862cabdff1aSopenharmony_ci        add             r2,  sp,  #(\i*4)
1863cabdff1aSopenharmony_ci        bl              idct32_1d_2x32_pass2_neon
1864cabdff1aSopenharmony_ci.endr
1865cabdff1aSopenharmony_ci
        @ Release the temp buffer and undo the pushes done here and in the
        @ entry point
1866cabdff1aSopenharmony_ci        add             sp,  sp,  r7
1867cabdff1aSopenharmony_ci        vpop            {q4-q7}
1868cabdff1aSopenharmony_ci        pop             {r4-r9,pc}
1869cabdff1aSopenharmony_ciendfunc
1870cabdff1aSopenharmony_ci
@ Exported 10 bit entry point: saves r4-r9 and lr, sets r9 = 0x03ff
@ (largest 10 bit value, used as the pixel clamp in pass 2) and continues
@ in the shared body, which pops the registers pushed here.
1871cabdff1aSopenharmony_cifunction ff_vp9_idct_idct_32x32_add_10_neon, export=1
1872cabdff1aSopenharmony_ci        push            {r4-r9,lr}
1873cabdff1aSopenharmony_ci        movw            r9,  #0x03ff
1874cabdff1aSopenharmony_ci        b               vp9_idct_idct_32x32_add_16_neon
1875cabdff1aSopenharmony_ciendfunc
1876cabdff1aSopenharmony_ci
@ Exported 12 bit entry point: saves r4-r9 and lr, sets r9 = 0x0fff
@ (largest 12 bit value, used as the pixel clamp in pass 2) and continues
@ in the shared body, which pops the registers pushed here.
1877cabdff1aSopenharmony_cifunction ff_vp9_idct_idct_32x32_add_12_neon, export=1
1878cabdff1aSopenharmony_ci        push            {r4-r9,lr}
1879cabdff1aSopenharmony_ci        movw            r9,  #0x0fff
1880cabdff1aSopenharmony_ci        b               vp9_idct_idct_32x32_add_16_neon
1881cabdff1aSopenharmony_ciendfunc
1882cabdff1aSopenharmony_ci
@ Emits idct32x32_\size\()_add_16_neon (\size = quarter or half), the
@ reduced variants branched to from vp9_idct_idct_32x32_add_16_neon once
@ its prologue has run. They rely on the state set up there (r3-r9,
@ q0-q3, q6-q7, temp buffer at sp) and carry their own copy of the
@ epilogue.
@ \rows is the number of temp buffer rows pass 1 has to provide (8 or
@ 16); it only enters the zero-fill iteration count.
1883cabdff1aSopenharmony_ci.macro idct32_partial size, rows
1884cabdff1aSopenharmony_cifunction idct32x32_\size\()_add_16_neon
        @ Pass 1 for the first four 2-wide slices. Only the quarter
        @ variant checks r3 against the threshold table here; the half
        @ variant always runs all four.
1885cabdff1aSopenharmony_ci.irp i, 0, 2, 4, 6
1886cabdff1aSopenharmony_ci        add             r0,  sp,  #(\i*128)
1887cabdff1aSopenharmony_ci.ifc \size,quarter
1888cabdff1aSopenharmony_ci.if \i > 0
1889cabdff1aSopenharmony_ci        ldrh_post       r1,  r8,  #2
1890cabdff1aSopenharmony_ci        cmp             r3,  r1
1891cabdff1aSopenharmony_ci        it              le
1892cabdff1aSopenharmony_ci        movle           r1,  #(\rows - \i)/2
1893cabdff1aSopenharmony_ci        ble             1f
1894cabdff1aSopenharmony_ci.endif
1895cabdff1aSopenharmony_ci.endif
1896cabdff1aSopenharmony_ci        add             r2,  r6,  #(\i*4)
1897cabdff1aSopenharmony_ci        bl              idct32_1d_2x32_pass1_\size\()_neon
1898cabdff1aSopenharmony_ci.endr
1899cabdff1aSopenharmony_ci.ifc \size,half
        @ Skip the four table entries (8 bytes) that belong to the slices
        @ handled unconditionally, so the next entry read is the one for
        @ slice 10
1900cabdff1aSopenharmony_ci        add             r8,  r8,  #8
1901cabdff1aSopenharmony_ci.irp i, 8, 10, 12, 14
1902cabdff1aSopenharmony_ci        add             r0,  sp,  #(\i*128)
1903cabdff1aSopenharmony_ci.if \i > 8
1904cabdff1aSopenharmony_ci        ldrh_post       r1,  r8,  #2
1905cabdff1aSopenharmony_ci        cmp             r3,  r1
1906cabdff1aSopenharmony_ci        it              le
1907cabdff1aSopenharmony_ci        movle           r1,  #(\rows - \i)/2
1908cabdff1aSopenharmony_ci        ble             1f
1909cabdff1aSopenharmony_ci.endif
1910cabdff1aSopenharmony_ci        add             r2,  r6,  #(\i*4)
1911cabdff1aSopenharmony_ci        bl              idct32_1d_2x32_pass1_\size\()_neon
1912cabdff1aSopenharmony_ci.endr
1913cabdff1aSopenharmony_ci.endif
1914cabdff1aSopenharmony_ci        b               3f
1915cabdff1aSopenharmony_ci
1916cabdff1aSopenharmony_ci1:
1917cabdff1aSopenharmony_ci        @ Write zeros to the temp buffer for pass 2
        @ Only rows below \rows are cleared: r1 = (\rows - \i)/2
        @ iterations of two 128 byte rows each, starting at r0
1918cabdff1aSopenharmony_ci        vmov.i16        q14, #0
1919cabdff1aSopenharmony_ci        vmov.i16        q15, #0
1920cabdff1aSopenharmony_ci2:
1921cabdff1aSopenharmony_ci        subs            r1,  r1,  #1
1922cabdff1aSopenharmony_ci.rept 2
1923cabdff1aSopenharmony_ci        @ Fill one line with zeros
1924cabdff1aSopenharmony_ci        vst1.16         {q14-q15}, [r0,:128]!
1925cabdff1aSopenharmony_ci        vst1.16         {q14-q15}, [r0,:128]!
1926cabdff1aSopenharmony_ci        vst1.16         {q14-q15}, [r0,:128]!
1927cabdff1aSopenharmony_ci        vst1.16         {q14-q15}, [r0,:128]!
1928cabdff1aSopenharmony_ci.endr
1929cabdff1aSopenharmony_ci        bne             2b
1930cabdff1aSopenharmony_ci3:
        @ Pass 2 still covers all 16 2-wide output slices; only the number
        @ of temp buffer rows read per slice is reduced in the \size variant
1931cabdff1aSopenharmony_ci.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
1932cabdff1aSopenharmony_ci        add             r0,  r4,  #(\i*2)
1933cabdff1aSopenharmony_ci        mov             r1,  r5
1934cabdff1aSopenharmony_ci        add             r2,  sp,  #(\i*4)
1935cabdff1aSopenharmony_ci        bl              idct32_1d_2x32_pass2_\size\()_neon
1936cabdff1aSopenharmony_ci.endr
1937cabdff1aSopenharmony_ci
        @ Same epilogue as vp9_idct_idct_32x32_add_16_neon
1938cabdff1aSopenharmony_ci        add             sp,  sp,  r7
1939cabdff1aSopenharmony_ci        vpop            {q4-q7}
1940cabdff1aSopenharmony_ci        pop             {r4-r9,pc}
1941cabdff1aSopenharmony_ciendfunc
1942cabdff1aSopenharmony_ci.endm
1943cabdff1aSopenharmony_ci
@ Generate idct32x32_quarter_add_16_neon (rows = 8) and
@ idct32x32_half_add_16_neon (rows = 16).
1944cabdff1aSopenharmony_ciidct32_partial quarter, 8
1945cabdff1aSopenharmony_ciidct32_partial half, 16
1946