1cabdff1aSopenharmony_ci/*
2cabdff1aSopenharmony_ci * Copyright (c) 2016 Google Inc.
3cabdff1aSopenharmony_ci *
4cabdff1aSopenharmony_ci * This file is part of FFmpeg.
5cabdff1aSopenharmony_ci *
6cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or
7cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public
8cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either
9cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version.
10cabdff1aSopenharmony_ci *
11cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful,
12cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of
13cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14cabdff1aSopenharmony_ci * Lesser General Public License for more details.
15cabdff1aSopenharmony_ci *
16cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public
17cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software
18cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19cabdff1aSopenharmony_ci */
20cabdff1aSopenharmony_ci
21cabdff1aSopenharmony_ci#include "libavutil/arm/asm.S"
22cabdff1aSopenharmony_ci#include "neon.S"
23cabdff1aSopenharmony_ci
@ Q14 fixed-point constants for the 4x4 transforms; the macros that multiply
@ by them narrow the 32 bit products with a rounding shift by 14.
@ idct4 reads the first four shorts as d0[0], d0[2] and d0[3] (d0[1] is not
@ referenced by it); iadst4 reads the four shorts at iadst4_coeffs as d1[0..3].
@ The two groups are adjacent so that one 128 bit load from itxfm4_coeffs
@ fills q0 (d0 and d1) for the 4x4 functions that mix two transforms.
24cabdff1aSopenharmony_ciconst itxfm4_coeffs, align=4
25cabdff1aSopenharmony_ci        .short  11585, 0, 6270, 15137
26cabdff1aSopenharmony_ciiadst4_coeffs:
27cabdff1aSopenharmony_ci        .short  5283, 15212, 9929, 13377
28cabdff1aSopenharmony_ciendconst
29cabdff1aSopenharmony_ci
@ Coefficients for the 8-point iadst, directly followed by the idct table.
@ The 8x8 functions that involve iadst load q1 from iadst8_coeffs with
@ post-increment and then load q0 from the advanced pointer, i.e. from
@ idct_coeffs, so the two tables must stay contiguous and in this order.
30cabdff1aSopenharmony_ciconst iadst8_coeffs, align=4
31cabdff1aSopenharmony_ci        .short  16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
32cabdff1aSopenharmony_ciidct_coeffs:
@ The first row below is what the 8x8 functions load into q0.
@ NOTE(review): the remaining rows are not referenced in this part of the
@ file; presumably they feed the larger idct sizes - confirm.
33cabdff1aSopenharmony_ci        .short  11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
34cabdff1aSopenharmony_ci        .short  1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
35cabdff1aSopenharmony_ci        .short  804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
36cabdff1aSopenharmony_ci        .short  3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
37cabdff1aSopenharmony_ciendconst
38cabdff1aSopenharmony_ci
@ Sixteen 16 bit constants, 16 byte aligned like the tables above.
@ NOTE(review): not referenced in this part of the file; going by the name
@ they are the 16-point iadst coefficients used further down - confirm.
39cabdff1aSopenharmony_ciconst iadst16_coeffs, align=4
40cabdff1aSopenharmony_ci        .short  16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
41cabdff1aSopenharmony_ci        .short  14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
42cabdff1aSopenharmony_ciendconst
43cabdff1aSopenharmony_ci
44cabdff1aSopenharmony_ci@ Do four 4x4 transposes, using q registers for the subtransposes that don't
45cabdff1aSopenharmony_ci@ need to address the individual d registers.
46cabdff1aSopenharmony_ci@ r0,r1 == rq0, r2,r3 == rq1, etc
@ rq0-rq7 are the eight q registers holding the data; r0-r15 are the d
@ register names of the halves of those same q registers, in order.
@ All registers are transposed in place; nothing else is clobbered.
47cabdff1aSopenharmony_ci.macro transpose16_q_4x_4x4 rq0, rq1, rq2, rq3, rq4, rq5, rq6, rq7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15
        @ First exchange 32 bit element pairs between neighbouring q registers
48cabdff1aSopenharmony_ci        vtrn.32          \rq0, \rq1
49cabdff1aSopenharmony_ci        vtrn.32          \rq2, \rq3
50cabdff1aSopenharmony_ci        vtrn.32          \rq4, \rq5
51cabdff1aSopenharmony_ci        vtrn.32          \rq6, \rq7
        @ Then exchange 16 bit elements between neighbouring d registers
52cabdff1aSopenharmony_ci        vtrn.16          \r0,  \r1
53cabdff1aSopenharmony_ci        vtrn.16          \r2,  \r3
54cabdff1aSopenharmony_ci        vtrn.16          \r4,  \r5
55cabdff1aSopenharmony_ci        vtrn.16          \r6,  \r7
56cabdff1aSopenharmony_ci        vtrn.16          \r8,  \r9
57cabdff1aSopenharmony_ci        vtrn.16          \r10, \r11
58cabdff1aSopenharmony_ci        vtrn.16          \r12, \r13
59cabdff1aSopenharmony_ci        vtrn.16          \r14, \r15
60cabdff1aSopenharmony_ci.endm
61cabdff1aSopenharmony_ci
62cabdff1aSopenharmony_ci@ out1 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
63cabdff1aSopenharmony_ci@ out2 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
64cabdff1aSopenharmony_ci@ in/out are d registers
@ tmpd1, tmpd2 are d register temporaries and tmpq3, tmpq4 are q register
@ temporaries; all four are clobbered. d0[0] must hold the multiplier.
@ If neg > 0, the widened product of the sum is negated before narrowing, so
@ out1 = (-(in1 + in2) * d0[0] + (1 << 13)) >> 14; out2 is not affected.
65cabdff1aSopenharmony_ci.macro mbutterfly0 out1, out2, in1, in2, tmpd1, tmpd2, tmpq3, tmpq4, neg=0
66cabdff1aSopenharmony_ci        vadd.s16        \tmpd1, \in1,  \in2
67cabdff1aSopenharmony_ci        vsub.s16        \tmpd2, \in1,  \in2
68cabdff1aSopenharmony_ci        vmull.s16       \tmpq3, \tmpd1, d0[0]
69cabdff1aSopenharmony_ci        vmull.s16       \tmpq4, \tmpd2, d0[0]
70cabdff1aSopenharmony_ci.if \neg > 0
71cabdff1aSopenharmony_ci        vneg.s32        \tmpq3, \tmpq3
72cabdff1aSopenharmony_ci.endif
        @ The rounding narrowing shift supplies the (1 << 13) term
73cabdff1aSopenharmony_ci        vrshrn.s32      \out1, \tmpq3, #14
74cabdff1aSopenharmony_ci        vrshrn.s32      \out2, \tmpq4, #14
75cabdff1aSopenharmony_ci.endm
76cabdff1aSopenharmony_ci
77cabdff1aSopenharmony_ci@ Same as mbutterfly0 above, but treating the input in in2 as zero,
78cabdff1aSopenharmony_ci@ writing the same output into both out1 and out2.
@ out1 = out2 = (in1 * d0[0] + (1 << 13)) >> 14
@ The parameter list mirrors mbutterfly0 (without neg); in2, tmpd1, tmpd2 and
@ tmpq4 are accepted but never read or written. Only tmpq3 is clobbered.
79cabdff1aSopenharmony_ci.macro mbutterfly0_h out1, out2, in1, in2, tmpd1, tmpd2, tmpq3, tmpq4
80cabdff1aSopenharmony_ci        vmull.s16       \tmpq3, \in1, d0[0]
81cabdff1aSopenharmony_ci        vrshrn.s32      \out1,  \tmpq3, #14
82cabdff1aSopenharmony_ci        vrshrn.s32      \out2,  \tmpq3, #14
83cabdff1aSopenharmony_ci.endm
84cabdff1aSopenharmony_ci
85cabdff1aSopenharmony_ci@ out1,out2 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
86cabdff1aSopenharmony_ci@ out3,out4 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
87cabdff1aSopenharmony_ci@ Same as mbutterfly0, but with input being 2 q registers, output
88cabdff1aSopenharmony_ci@ being 4 d registers.
89cabdff1aSopenharmony_ci@ This can do with either 4 or 6 temporary q registers.
@ tmpq1 and tmpq2 receive the 16 bit sum and difference. tmpd11,tmpd12 must be
@ the d halves of tmpq1 and tmpd21,tmpd22 the d halves of tmpq2, since the
@ products are formed from those d registers.
@ tmpq3, tmpq4 hold widened products. tmpq5, tmpq6 are optional: when they are
@ left out, tmpq3 and tmpq4 are reused for the difference half once the sum
@ half has been narrowed. All temporaries that are passed are clobbered.
90cabdff1aSopenharmony_ci.macro dmbutterfly0 out1, out2, out3, out4, in1, in2, tmpq1, tmpq2, tmpd11, tmpd12, tmpd21, tmpd22, tmpq3, tmpq4, tmpq5, tmpq6
91cabdff1aSopenharmony_ci        vadd.s16        \tmpq1, \in1,  \in2
92cabdff1aSopenharmony_ci        vsub.s16        \tmpq2, \in1,  \in2
93cabdff1aSopenharmony_ci        vmull.s16       \tmpq3, \tmpd11, d0[0]
94cabdff1aSopenharmony_ci        vmull.s16       \tmpq4, \tmpd12, d0[0]
        @ Four temporaries only: finish the sum half before starting the other
95cabdff1aSopenharmony_ci.ifb \tmpq5
96cabdff1aSopenharmony_ci        vrshrn.s32      \out1, \tmpq3, #14
97cabdff1aSopenharmony_ci        vrshrn.s32      \out2, \tmpq4, #14
98cabdff1aSopenharmony_ci        vmull.s16       \tmpq3, \tmpd21, d0[0]
99cabdff1aSopenharmony_ci        vmull.s16       \tmpq4, \tmpd22, d0[0]
100cabdff1aSopenharmony_ci        vrshrn.s32      \out3, \tmpq3, #14
101cabdff1aSopenharmony_ci        vrshrn.s32      \out4, \tmpq4, #14
        @ Six temporaries: issue all four multiplies before any narrowing
102cabdff1aSopenharmony_ci.else
103cabdff1aSopenharmony_ci        vmull.s16       \tmpq5, \tmpd21, d0[0]
104cabdff1aSopenharmony_ci        vmull.s16       \tmpq6, \tmpd22, d0[0]
105cabdff1aSopenharmony_ci        vrshrn.s32      \out1, \tmpq3, #14
106cabdff1aSopenharmony_ci        vrshrn.s32      \out2, \tmpq4, #14
107cabdff1aSopenharmony_ci        vrshrn.s32      \out3, \tmpq5, #14
108cabdff1aSopenharmony_ci        vrshrn.s32      \out4, \tmpq6, #14
109cabdff1aSopenharmony_ci.endif
110cabdff1aSopenharmony_ci.endm
111cabdff1aSopenharmony_ci
112cabdff1aSopenharmony_ci@ out1 = in1 * coef1 - in2 * coef2
113cabdff1aSopenharmony_ci@ out2 = in1 * coef2 + in2 * coef1
114cabdff1aSopenharmony_ci@ out are 2 q registers, in are 2 d registers
@ coef1, coef2 are passed straight through as the multiplier operand.
@ The results are left as full 32 bit values: no rounding or shift is applied
@ here, the caller narrows them.
115cabdff1aSopenharmony_ci.macro mbutterfly_l out1, out2, in1, in2, coef1, coef2
116cabdff1aSopenharmony_ci        vmull.s16       \out1, \in1, \coef1
117cabdff1aSopenharmony_ci        vmlsl.s16       \out1, \in2, \coef2
118cabdff1aSopenharmony_ci        vmull.s16       \out2, \in1, \coef2
119cabdff1aSopenharmony_ci        vmlal.s16       \out2, \in2, \coef1
120cabdff1aSopenharmony_ci.endm
121cabdff1aSopenharmony_ci
122cabdff1aSopenharmony_ci@ out1,out2 = in1,in2 * coef1 - in3,in4 * coef2
123cabdff1aSopenharmony_ci@ out3,out4 = in1,in2 * coef2 + in3,in4 * coef1
124cabdff1aSopenharmony_ci@ out are 4 q registers, in are 4 d registers
@ coef1, coef2 are scalar lanes such as d0[2] at the call sites in this file.
@ The results are left as full 32 bit values: no rounding or shift is applied.
@ All four multiplies are issued before the accumulate steps, so none of the
@ out registers may overlap a d register that is still needed as input.
125cabdff1aSopenharmony_ci.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, in3, in4, coef1, coef2
126cabdff1aSopenharmony_ci        vmull.s16       \out1, \in1, \coef1
127cabdff1aSopenharmony_ci        vmull.s16       \out2, \in2, \coef1
128cabdff1aSopenharmony_ci        vmull.s16       \out3, \in1, \coef2
129cabdff1aSopenharmony_ci        vmull.s16       \out4, \in2, \coef2
130cabdff1aSopenharmony_ci        vmlsl.s16       \out1, \in3, \coef2
131cabdff1aSopenharmony_ci        vmlsl.s16       \out2, \in4, \coef2
132cabdff1aSopenharmony_ci        vmlal.s16       \out3, \in3, \coef1
133cabdff1aSopenharmony_ci        vmlal.s16       \out4, \in4, \coef1
134cabdff1aSopenharmony_ci.endm
135cabdff1aSopenharmony_ci
136cabdff1aSopenharmony_ci@ inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
137cabdff1aSopenharmony_ci@ inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
138cabdff1aSopenharmony_ci@ inout are 2 d registers, tmp are 2 q registers
@ Both formulas use the values inout1 and inout2 had on entry; the outputs are
@ only written after both 32 bit products have been formed in tmp1, tmp2.
@ If neg > 0, the second product is negated before narrowing, i.e.
@ inout2 = (-(inout1 * coef2 + inout2 * coef1) + (1 << 13)) >> 14.
139cabdff1aSopenharmony_ci.macro mbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, neg=0
140cabdff1aSopenharmony_ci        mbutterfly_l    \tmp1, \tmp2, \inout1, \inout2, \coef1, \coef2
141cabdff1aSopenharmony_ci.if \neg > 0
142cabdff1aSopenharmony_ci        vneg.s32        \tmp2, \tmp2
143cabdff1aSopenharmony_ci.endif
144cabdff1aSopenharmony_ci        vrshrn.s32      \inout1, \tmp1,  #14
145cabdff1aSopenharmony_ci        vrshrn.s32      \inout2, \tmp2,  #14
146cabdff1aSopenharmony_ci.endm
147cabdff1aSopenharmony_ci
148cabdff1aSopenharmony_ci@ Same as mbutterfly above, but treating the input in inout2 as zero
@ inout1 = (inout1 * coef1 + (1 << 13)) >> 14
@ inout2 = (inout1 * coef2 + (1 << 13)) >> 14   (inout1 as it was on entry)
@ inout2 is write-only here; tmp1, tmp2 are clobbered q registers.
149cabdff1aSopenharmony_ci.macro mbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2
150cabdff1aSopenharmony_ci        vmull.s16       \tmp1,   \inout1, \coef1
151cabdff1aSopenharmony_ci        vmull.s16       \tmp2,   \inout1, \coef2
152cabdff1aSopenharmony_ci        vrshrn.s32      \inout1, \tmp1,   #14
153cabdff1aSopenharmony_ci        vrshrn.s32      \inout2, \tmp2,   #14
154cabdff1aSopenharmony_ci.endm
155cabdff1aSopenharmony_ci
156cabdff1aSopenharmony_ci@ Same as mbutterfly above, but treating the input in inout1 as zero
@ inout1 = (-(inout2 * coef2) + (1 << 13)) >> 14   (inout2 as it was on entry)
@ inout2 = (inout2 * coef1 + (1 << 13)) >> 14
@ inout1 is write-only here; tmp1, tmp2 are clobbered q registers.
157cabdff1aSopenharmony_ci.macro mbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2
158cabdff1aSopenharmony_ci        vmull.s16       \tmp1,   \inout2, \coef2
159cabdff1aSopenharmony_ci        vmull.s16       \tmp2,   \inout2, \coef1
        @ The inout2 * coef2 term enters the first formula with a minus sign
160cabdff1aSopenharmony_ci        vneg.s32        \tmp1,   \tmp1
161cabdff1aSopenharmony_ci        vrshrn.s32      \inout2, \tmp2,   #14
162cabdff1aSopenharmony_ci        vrshrn.s32      \inout1, \tmp1,   #14
163cabdff1aSopenharmony_ci.endm
164cabdff1aSopenharmony_ci
165cabdff1aSopenharmony_ci@ inout1,inout2 = (inout1,inout2 * coef1 - inout3,inout4 * coef2 + (1 << 13)) >> 14
166cabdff1aSopenharmony_ci@ inout3,inout4 = (inout1,inout2 * coef2 + inout3,inout4 * coef1 + (1 << 13)) >> 14
167cabdff1aSopenharmony_ci@ inout are 4 d registers, tmp are 4 q registers
@ Both formulas use the values the inout registers had on entry: the 32 bit
@ products are completed in tmp1-tmp4 (which are clobbered) before any inout
@ register is overwritten by the rounding narrowing shifts.
168cabdff1aSopenharmony_ci.macro dmbutterfly inout1, inout2, inout3, inout4, coef1, coef2, tmp1, tmp2, tmp3, tmp4
169cabdff1aSopenharmony_ci        dmbutterfly_l   \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \inout3, \inout4, \coef1, \coef2
170cabdff1aSopenharmony_ci        vrshrn.s32      \inout1, \tmp1,  #14
171cabdff1aSopenharmony_ci        vrshrn.s32      \inout2, \tmp2,  #14
172cabdff1aSopenharmony_ci        vrshrn.s32      \inout3, \tmp3,  #14
173cabdff1aSopenharmony_ci        vrshrn.s32      \inout4, \tmp4,  #14
174cabdff1aSopenharmony_ci.endm
175cabdff1aSopenharmony_ci
176cabdff1aSopenharmony_ci@ out1 = in1 + in2
177cabdff1aSopenharmony_ci@ out2 = in1 - in2
@ 16 bit lanes, no saturation. The register width is whatever is passed; the
@ 8-point transforms in this file pass q registers.
@ The sum is written before the difference is computed, so out1 must not be
@ the same register as in1 or in2. out2 may be the same register as either
@ input, and idct8/iadst8 call it that way.
178cabdff1aSopenharmony_ci.macro butterfly out1, out2, in1, in2
179cabdff1aSopenharmony_ci        vadd.s16        \out1, \in1, \in2
180cabdff1aSopenharmony_ci        vsub.s16        \out2, \in1, \in2
181cabdff1aSopenharmony_ci.endm
182cabdff1aSopenharmony_ci
183cabdff1aSopenharmony_ci@ out1 = in1 - in2
184cabdff1aSopenharmony_ci@ out2 = in1 + in2
@ Same as butterfly with the two results swapped: the difference is written
@ first. out1 must therefore not be the same register as in1 or in2, while
@ out2 may be (idct8 passes the same register as out2 and in1).
185cabdff1aSopenharmony_ci.macro butterfly_r out1, out2, in1, in2
186cabdff1aSopenharmony_ci        vsub.s16        \out1, \in1, \in2
187cabdff1aSopenharmony_ci        vadd.s16        \out2, \in1, \in2
188cabdff1aSopenharmony_ci.endm
189cabdff1aSopenharmony_ci
190cabdff1aSopenharmony_ci@ out1 = (in1 + in2 + (1 << 13)) >> 14
191cabdff1aSopenharmony_ci@ out2 = (in1 - in2 + (1 << 13)) >> 14
192cabdff1aSopenharmony_ci@ out are 2 d registers, in are 2 q registers, tmp are 2 q registers
@ The inputs are 32 bit lanes; the sum and difference are formed in tmp1 and
@ tmp2 and then narrowed to 16 bit with a rounding shift.
193cabdff1aSopenharmony_ci.macro butterfly_n out1, out2, in1, in2, tmp1, tmp2
194cabdff1aSopenharmony_ci        vadd.s32        \tmp1, \in1, \in2
195cabdff1aSopenharmony_ci        vsub.s32        \tmp2, \in1, \in2
196cabdff1aSopenharmony_ci        vrshrn.s32      \out1, \tmp1,  #14
197cabdff1aSopenharmony_ci        vrshrn.s32      \out2, \tmp2,  #14
198cabdff1aSopenharmony_ci.endm
199cabdff1aSopenharmony_ci
200cabdff1aSopenharmony_ci@ out1,out2 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
201cabdff1aSopenharmony_ci@ out3,out4 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
202cabdff1aSopenharmony_ci@ out are 4 d registers, in are 4 q registers, tmp are 4 q registers
@ The inputs are 32 bit lanes. Both sums are formed before both differences,
@ so tmp1 and tmp2 must not be registers that are still needed as inputs;
@ iadst8 passes in1, in2 again as tmp3, tmp4, which this order allows.
203cabdff1aSopenharmony_ci.macro dbutterfly_n out1, out2, out3, out4, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
204cabdff1aSopenharmony_ci        vadd.s32        \tmp1, \in1, \in3
205cabdff1aSopenharmony_ci        vadd.s32        \tmp2, \in2, \in4
206cabdff1aSopenharmony_ci        vsub.s32        \tmp3, \in1, \in3
207cabdff1aSopenharmony_ci        vsub.s32        \tmp4, \in2, \in4
208cabdff1aSopenharmony_ci        vrshrn.s32      \out1, \tmp1,  #14
209cabdff1aSopenharmony_ci        vrshrn.s32      \out2, \tmp2,  #14
210cabdff1aSopenharmony_ci        vrshrn.s32      \out3, \tmp3,  #14
211cabdff1aSopenharmony_ci        vrshrn.s32      \out4, \tmp4,  #14
212cabdff1aSopenharmony_ci.endm
213cabdff1aSopenharmony_ci
214cabdff1aSopenharmony_ci
@ 4-point iwht step, in place on the four d registers c0-c3 (four 16 bit
@ lanes each, so four transforms are done in parallel).
@ Uses only adds, subtracts and one arithmetic shift; no coefficient table.
@ Clobbers d16 and d17. Later steps read results of earlier ones, so the
@ statement order is part of the algorithm.
215cabdff1aSopenharmony_ci.macro iwht4 c0, c1, c2, c3
216cabdff1aSopenharmony_ci        vadd.i16        \c0,  \c0,  \c1
217cabdff1aSopenharmony_ci        vsub.i16        d17,  \c2,  \c3
218cabdff1aSopenharmony_ci        vsub.i16        d16,  \c0,  d17
219cabdff1aSopenharmony_ci        vshr.s16        d16,  d16,  #1
        @ c2 and c1 are derived from the halved value and the original c1, c3
220cabdff1aSopenharmony_ci        vsub.i16        \c2,  d16,  \c1
221cabdff1aSopenharmony_ci        vsub.i16        \c1,  d16,  \c3
        @ c3 and c0 use the c2 and c1 values that were just written
222cabdff1aSopenharmony_ci        vadd.i16        \c3,  d17,  \c2
223cabdff1aSopenharmony_ci        vsub.i16        \c0,  \c0,  \c1
224cabdff1aSopenharmony_ci.endm
225cabdff1aSopenharmony_ci
@ 4-point idct, in place on the four d registers c0-c3 (four 16 bit lanes
@ each). Expects the coefficients in d0 and reads d0[0], d0[2] and d0[3].
@ Clobbers d16, d17, q9, q10, q11 and q13.
@ With r(x) = (x + (1 << 13)) >> 14 and all inputs taken as on entry:
@   d18 = r((c0 + c2) * d0[0])          d20 = r((c0 - c2) * d0[0])
@   d26 = r(c1 * d0[3] + c3 * d0[2])    d22 = r(c1 * d0[2] - c3 * d0[3])
@   c0 = d18 + d26, c3 = d18 - d26, c1 = d20 + d22, c2 = d20 - d22
@ The multiplies of the two halves are interleaved.
226cabdff1aSopenharmony_ci.macro idct4 c0, c1, c2, c3
227cabdff1aSopenharmony_ci        vmull.s16       q13,  \c1,  d0[3]
228cabdff1aSopenharmony_ci        vmull.s16       q11,  \c1,  d0[2]
229cabdff1aSopenharmony_ci        vadd.i16        d16,  \c0,  \c2
230cabdff1aSopenharmony_ci        vsub.i16        d17,  \c0,  \c2
231cabdff1aSopenharmony_ci        vmlal.s16       q13,  \c3,  d0[2]
232cabdff1aSopenharmony_ci        vmull.s16       q9,   d16,  d0[0]
233cabdff1aSopenharmony_ci        vmull.s16       q10,  d17,  d0[0]
234cabdff1aSopenharmony_ci        vmlsl.s16       q11,  \c3,  d0[3]
        @ Narrow each 32 bit product into the low d half of its own q register
235cabdff1aSopenharmony_ci        vrshrn.s32      d26,  q13,  #14
236cabdff1aSopenharmony_ci        vrshrn.s32      d18,  q9,   #14
237cabdff1aSopenharmony_ci        vrshrn.s32      d20,  q10,  #14
238cabdff1aSopenharmony_ci        vrshrn.s32      d22,  q11,  #14
239cabdff1aSopenharmony_ci        vadd.i16        \c0,  d18,  d26
240cabdff1aSopenharmony_ci        vsub.i16        \c3,  d18,  d26
241cabdff1aSopenharmony_ci        vadd.i16        \c1,  d20,  d22
242cabdff1aSopenharmony_ci        vsub.i16        \c2,  d20,  d22
243cabdff1aSopenharmony_ci.endm
244cabdff1aSopenharmony_ci
@ 4-point iadst, in place on the four d registers c0-c3 (four 16 bit lanes
@ each). Expects the coefficients in d1 and reads d1[0..3].
@ Clobbers q1, q10, q11, q12, q13 and q14.
@ With r(x) = (x + (1 << 13)) >> 14 and a,b,c,d the entry values of c0-c3:
@   q10 = a * d1[0] + c * d1[1] + d * d1[2]
@   q11 = a * d1[2] - c * d1[0] - d * d1[1]
@   q13 = b * d1[3]
@   q12 = (a - c + d) * d1[3]
@   c0 = r(q10 + q13), c1 = r(q11 + q13), c2 = r(q12),
@   c3 = r(q10 + q11 - q13)
245cabdff1aSopenharmony_ci.macro iadst4 c0, c1, c2, c3
246cabdff1aSopenharmony_ci        vmull.s16       q10,  \c0,  d1[0]
247cabdff1aSopenharmony_ci        vmlal.s16       q10,  \c2,  d1[1]
248cabdff1aSopenharmony_ci        vmlal.s16       q10,  \c3,  d1[2]
249cabdff1aSopenharmony_ci        vmull.s16       q11,  \c0,  d1[2]
250cabdff1aSopenharmony_ci        vmlsl.s16       q11,  \c2,  d1[0]
        @ c0 is reused as scratch for a - c + d once its last product is issued
251cabdff1aSopenharmony_ci        vsub.s16        \c0,  \c0,  \c2
252cabdff1aSopenharmony_ci        vmlsl.s16       q11,  \c3,  d1[1]
253cabdff1aSopenharmony_ci        vadd.s16        \c0,  \c0,  \c3
254cabdff1aSopenharmony_ci        vmull.s16       q13,  \c1,  d1[3]
255cabdff1aSopenharmony_ci        vmull.s16       q12,  \c0,  d1[3]
256cabdff1aSopenharmony_ci        vadd.s32        q14,  q10,  q13
257cabdff1aSopenharmony_ci        vadd.s32        q1,   q11,  q13
258cabdff1aSopenharmony_ci        vrshrn.s32      \c0,  q14,  #14
259cabdff1aSopenharmony_ci        vadd.s32        q10,  q10,  q11
260cabdff1aSopenharmony_ci        vrshrn.s32      \c1,  q1,   #14
261cabdff1aSopenharmony_ci        vsub.s32        q10,  q10,  q13
262cabdff1aSopenharmony_ci        vrshrn.s32      \c2,  q12,  #14
263cabdff1aSopenharmony_ci        vrshrn.s32      \c3,  q10,  #14
264cabdff1aSopenharmony_ci.endm
265cabdff1aSopenharmony_ci
266cabdff1aSopenharmony_ci@ The public functions in this file have got the following signature:
267cabdff1aSopenharmony_ci@ void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
268cabdff1aSopenharmony_ci
@ Emits ff_vp9_<txfm1>_<txfm2>_4x4_add_neon, where txfm1 and txfm2 are each
@ one of idct, iadst or iwht and select the matching <name>4 macro.
@ The function runs txfm1 on the 4x4 block, transposes it, runs txfm2, and
@ adds the result to the 4x4 destination pixels with unsigned saturation.
@ Registers, following the itxfm_add signature comment above:
@   r0 = dst, r1 = stride, r2 = block, r3 = eob (only tested for idct_idct),
@   r12 = scratch for the coefficient table address.
@ The coefficients that are read from block are overwritten with zeros.
@ q4-q7 are not used by this function or by the 4-point macros, so nothing
@ is pushed.
269cabdff1aSopenharmony_ci.macro itxfm_func4x4 txfm1, txfm2
270cabdff1aSopenharmony_cifunction ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1
        @ Same transform twice: load only the half of the table it needs
        @ (d0 for idct, d1 for iadst, nothing for iwht). Otherwise load both.
271cabdff1aSopenharmony_ci.ifc \txfm1,\txfm2
272cabdff1aSopenharmony_ci.ifc \txfm1,idct
273cabdff1aSopenharmony_ci        movrel          r12, itxfm4_coeffs
274cabdff1aSopenharmony_ci        vld1.16         {d0}, [r12,:64]
275cabdff1aSopenharmony_ci.endif
276cabdff1aSopenharmony_ci.ifc \txfm1,iadst
277cabdff1aSopenharmony_ci        movrel          r12, iadst4_coeffs
278cabdff1aSopenharmony_ci        vld1.16         {d1}, [r12,:64]
279cabdff1aSopenharmony_ci.endif
280cabdff1aSopenharmony_ci.else
281cabdff1aSopenharmony_ci        movrel          r12, itxfm4_coeffs
282cabdff1aSopenharmony_ci        vld1.16         {q0}, [r12,:128]
283cabdff1aSopenharmony_ci.endif
284cabdff1aSopenharmony_ci
        @ q15 = 0, used below to clear the coefficient block
285cabdff1aSopenharmony_ci        vmov.i16        q15, #0
286cabdff1aSopenharmony_ci.ifc \txfm1\()_\txfm2,idct_idct
287cabdff1aSopenharmony_ci        cmp             r3,  #1
288cabdff1aSopenharmony_ci        bne             1f
289cabdff1aSopenharmony_ci        @ DC-only for idct/idct
        @ Scale the DC coefficient by d0[0] once per pass, clear it in
        @ memory, and replicate it into all 16 output lanes (q2, q3).
290cabdff1aSopenharmony_ci        vld1.16         {d4[]},   [r2,:16]
291cabdff1aSopenharmony_ci        vmull.s16       q2,  d4,  d0[0]
292cabdff1aSopenharmony_ci        vrshrn.s32      d4,  q2,  #14
293cabdff1aSopenharmony_ci        vmull.s16       q2,  d4,  d0[0]
294cabdff1aSopenharmony_ci        vrshrn.s32      d4,  q2,  #14
295cabdff1aSopenharmony_ci        vst1.16         {d30[0]}, [r2,:16]
296cabdff1aSopenharmony_ci        vdup.16         q2,  d4[0]
297cabdff1aSopenharmony_ci        vmov            q3,  q2
298cabdff1aSopenharmony_ci        b               2f
299cabdff1aSopenharmony_ci.endif
300cabdff1aSopenharmony_ci
301cabdff1aSopenharmony_ci1:
        @ Load all 16 coefficients into d4-d7 and clear the first 16 bytes
302cabdff1aSopenharmony_ci        vld1.16         {d4-d7},  [r2,:128]
303cabdff1aSopenharmony_ci        vst1.16         {q15}, [r2,:128]!
304cabdff1aSopenharmony_ci
        @ For iwht the input is first shifted right arithmetically by 2
305cabdff1aSopenharmony_ci.ifc \txfm1,iwht
306cabdff1aSopenharmony_ci        vshr.s16        q2,  q2,  #2
307cabdff1aSopenharmony_ci        vshr.s16        q3,  q3,  #2
308cabdff1aSopenharmony_ci.endif
309cabdff1aSopenharmony_ci
310cabdff1aSopenharmony_ci        \txfm1\()4      d4,  d5,  d6,  d7
311cabdff1aSopenharmony_ci
        @ Clear the remaining 16 bytes of the block
312cabdff1aSopenharmony_ci        vst1.16         {q15}, [r2,:128]!
313cabdff1aSopenharmony_ci        @ Transpose 4x4 with 16 bit elements
314cabdff1aSopenharmony_ci        vtrn.16         d4,  d5
315cabdff1aSopenharmony_ci        vtrn.16         d6,  d7
316cabdff1aSopenharmony_ci        vtrn.32         q2,  q3
317cabdff1aSopenharmony_ci
318cabdff1aSopenharmony_ci        \txfm2\()4      d4,  d5,  d6,  d7
319cabdff1aSopenharmony_ci2:
        @ The coefficients in q0 are no longer needed; d0 and d1 now receive
        @ the four destination rows, two 32 bit rows per d register.
320cabdff1aSopenharmony_ci        vld1.32         {d0[]},   [r0,:32], r1
321cabdff1aSopenharmony_ci        vld1.32         {d0[1]},  [r0,:32], r1
        @ Final rounding shift by 4, skipped for iwht
322cabdff1aSopenharmony_ci.ifnc \txfm1,iwht
323cabdff1aSopenharmony_ci        vrshr.s16       q2,  q2,  #4
324cabdff1aSopenharmony_ci        vrshr.s16       q3,  q3,  #4
325cabdff1aSopenharmony_ci.endif
326cabdff1aSopenharmony_ci        vaddw.u8        q2,  q2,  d0
327cabdff1aSopenharmony_ci        vld1.32         {d1[]},   [r0,:32], r1
328cabdff1aSopenharmony_ci        vld1.32         {d1[1]},  [r0,:32], r1
329cabdff1aSopenharmony_ci        vqmovun.s16     d0,  q2
        @ Rewind r0 by the four rows just read so the stores start at row 0
330cabdff1aSopenharmony_ci        sub             r0,  r0,  r1, lsl #2
331cabdff1aSopenharmony_ci
332cabdff1aSopenharmony_ci        vaddw.u8        q3,  q3,  d1
333cabdff1aSopenharmony_ci        vst1.32         {d0[0]},  [r0,:32], r1
334cabdff1aSopenharmony_ci        vqmovun.s16     d1,  q3
335cabdff1aSopenharmony_ci
336cabdff1aSopenharmony_ci        vst1.32         {d0[1]},  [r0,:32], r1
337cabdff1aSopenharmony_ci        vst1.32         {d1[0]},  [r0,:32], r1
338cabdff1aSopenharmony_ci        vst1.32         {d1[1]},  [r0,:32], r1
339cabdff1aSopenharmony_ci
340cabdff1aSopenharmony_ci        bx              lr
341cabdff1aSopenharmony_ciendfunc
342cabdff1aSopenharmony_ci.endm
343cabdff1aSopenharmony_ci
@ Instantiate the 4x4 functions: all four idct/iadst combinations, plus
@ iwht/iwht. Each line emits ff_vp9_<first>_<second>_4x4_add_neon.
344cabdff1aSopenharmony_ciitxfm_func4x4 idct,  idct
345cabdff1aSopenharmony_ciitxfm_func4x4 iadst, idct
346cabdff1aSopenharmony_ciitxfm_func4x4 idct,  iadst
347cabdff1aSopenharmony_ciitxfm_func4x4 iadst, iadst
348cabdff1aSopenharmony_ciitxfm_func4x4 iwht,  iwht
349cabdff1aSopenharmony_ci
350cabdff1aSopenharmony_ci
@ 8-point idct, in place on q8-q15 (eight 16 bit lanes each). As the trailing
@ comments show, out[i] ends up in q(8+i).
@ Expects the first row of idct_coeffs in q0; reads d0[0], d0[2], d0[3] and
@ d1[0..3]. Clobbers q2-q5.
@ The register assignments overlap heavily between steps (temporaries of one
@ step are inputs or outputs of another), so the statement order matters.
351cabdff1aSopenharmony_ci.macro idct8
352cabdff1aSopenharmony_ci        dmbutterfly0    d16, d17, d24, d25, q8,  q12, q2, q4, d4, d5, d8, d9, q3, q2, q5, q4 @ q8 = t0a, q12 = t1a
353cabdff1aSopenharmony_ci        dmbutterfly     d20, d21, d28, d29, d0[2], d0[3], q2,  q3,  q4,  q5 @ q10 = t2a, q14 = t3a
354cabdff1aSopenharmony_ci        dmbutterfly     d18, d19, d30, d31, d1[0], d1[1], q2,  q3,  q4,  q5 @ q9  = t4a, q15 = t7a
355cabdff1aSopenharmony_ci        dmbutterfly     d26, d27, d22, d23, d1[2], d1[3], q2,  q3,  q4,  q5 @ q13 = t5a, q11 = t6a
356cabdff1aSopenharmony_ci
357cabdff1aSopenharmony_ci        butterfly       q2,  q14, q8,  q14 @ q2 = t0, q14 = t3
358cabdff1aSopenharmony_ci        butterfly       q3,  q10, q12, q10 @ q3 = t1, q10 = t2
359cabdff1aSopenharmony_ci        butterfly       q4,  q13, q9,  q13 @ q4 = t4, q13 = t5a
360cabdff1aSopenharmony_ci        butterfly       q5,  q11, q15, q11 @ q5 = t7, q11 = t6a
361cabdff1aSopenharmony_ci
362cabdff1aSopenharmony_ci        butterfly       q8,  q15, q2,  q5  @ q8 = out[0], q15 = out[7]
363cabdff1aSopenharmony_ci
364cabdff1aSopenharmony_ci        dmbutterfly0    d4,  d5,  d10, d11, q11, q13, q9,  q13, d18, d19, d26, d27, q2,  q5, q11, q12 @ q2 = t6, q5 = t5
365cabdff1aSopenharmony_ci
366cabdff1aSopenharmony_ci        butterfly       q11, q12, q14, q4  @ q11 = out[3], q12 = out[4]
367cabdff1aSopenharmony_ci        butterfly       q9,  q14, q3,  q2  @ q9 = out[1],  q14 = out[6]
368cabdff1aSopenharmony_ci        butterfly_r     q13, q10, q10, q5  @ q13 = out[5], q10 = out[2]
369cabdff1aSopenharmony_ci.endm
370cabdff1aSopenharmony_ci
@ 8-point iadst, in place on q8-q15 (eight 16 bit lanes each). As the
@ trailing comments show, out[i] ends up in q(8+i); out[1], out[3], out[5]
@ and out[7] are first produced negated and then fixed up with vneg.
@ Expects iadst8_coeffs in q1 (read as d2[0..3], d3[0..3]) and the first row
@ of idct_coeffs in q0 (reads d0[0], d0[2], d0[3]). Clobbers q2-q7.
@ Registers are recycled between steps, so the statement order matters.
371cabdff1aSopenharmony_ci.macro iadst8
372cabdff1aSopenharmony_ci        dmbutterfly_l   q4,  q5,  q2,  q3,  d30, d31, d16, d17, d2[1], d2[0] @ q4,q5  = t1a, q2,q3 = t0a
373cabdff1aSopenharmony_ci        dmbutterfly_l   q8,  q15, q6,  q7,  d22, d23, d24, d25, d3[1], d3[0] @ q8,q15 = t5a, q6,q7 = t4a
374cabdff1aSopenharmony_ci
375cabdff1aSopenharmony_ci        dbutterfly_n    d22, d23, d4,  d5,  q2,  q3,  q6,  q7,  q11, q12, q2,  q3 @ q11 = t0, q2 = t4
376cabdff1aSopenharmony_ci
377cabdff1aSopenharmony_ci        dbutterfly_n    d24, d25, d6,  d7,  q4,  q5,  q8,  q15, q12, q3,  q6,  q7 @ q12 = t1, q3 = t5
378cabdff1aSopenharmony_ci
379cabdff1aSopenharmony_ci        dmbutterfly_l   q6,  q7,  q4,  q5,  d26, d27, d20, d21, d2[3], d2[2] @ q6,q7 = t3a, q4,q5 = t2a
380cabdff1aSopenharmony_ci        dmbutterfly_l   q10, q13, q8,  q15, d18, d19, d28, d29, d3[3], d3[2] @ q10,q13 = t7a, q8,q15 = t6a
381cabdff1aSopenharmony_ci
382cabdff1aSopenharmony_ci        dbutterfly_n    d18, d19, d8,  d9,  q4,  q5,  q8,  q15, q9,  q14, q4, q5 @ q9 = t2, q4 = t6
383cabdff1aSopenharmony_ci        dbutterfly_n    d16, d17, d12, d13, q6,  q7,  q10, q13, q8,  q15, q6, q7 @ q8 = t3, q6 = t7
384cabdff1aSopenharmony_ci
385cabdff1aSopenharmony_ci        butterfly       q15, q12, q12, q8 @ q15 = -out[7], q12 = t3
386cabdff1aSopenharmony_ci        vneg.s16        q15, q15          @ q15 = out[7]
387cabdff1aSopenharmony_ci        butterfly       q8,  q9,  q11, q9 @ q8 = out[0], q9 = t2
388cabdff1aSopenharmony_ci
389cabdff1aSopenharmony_ci        dmbutterfly_l   q10, q11, q5,  q7,  d4,  d5,  d6,  d7,  d0[2], d0[3] @ q10,q11 = t5a, q5,q7 = t4a
390cabdff1aSopenharmony_ci        dmbutterfly_l   q2,  q3,  q13, q14, d12, d13, d8,  d9,  d0[3], d0[2] @ q2,q3 = t6a, q13,q14 = t7a
391cabdff1aSopenharmony_ci
392cabdff1aSopenharmony_ci        dbutterfly_n    d28, d29, d8,  d9,  q10, q11, q13, q14, q4,  q6,  q10, q11 @ q14 = out[6], q4 = t7
393cabdff1aSopenharmony_ci
394cabdff1aSopenharmony_ci        dmbutterfly0    d22, d23, d24, d25, q9,  q12, q6, q13, d12, d13, d26, d27, q9, q10 @ q11 = -out[3], q12 = out[4]
395cabdff1aSopenharmony_ci        vneg.s16        q11, q11      @ q11 = out[3]
396cabdff1aSopenharmony_ci
397cabdff1aSopenharmony_ci        dbutterfly_n    d18, d19, d4,  d5,  q5,  q7,  q2,  q3,  q9, q10, q2,  q3 @ q9 = -out[1], q2 = t6
398cabdff1aSopenharmony_ci        vneg.s16        q9,  q9       @ q9 = out[1]
399cabdff1aSopenharmony_ci
400cabdff1aSopenharmony_ci        dmbutterfly0    d20, d21, d26, d27, q2,  q4,  q3, q5,  d6,  d7,  d10, d11, q6,  q7 @ q10 = out[2], q13 = -out[5]
401cabdff1aSopenharmony_ci        vneg.s16        q13, q13      @ q13 = out[5]
402cabdff1aSopenharmony_ci.endm
403cabdff1aSopenharmony_ci
404cabdff1aSopenharmony_ci
@ Emits ff_vp9_<txfm1>_<txfm2>_8x8_add_neon, where txfm1 and txfm2 are each
@ idct or iadst and select the idct8 / iadst8 macro.
@ The function runs txfm1 on the 8x8 block held in q8-q15, transposes it,
@ runs txfm2, applies a rounding shift by 5 and adds the result to the 8x8
@ destination pixels with unsigned saturation.
@ Registers, following the itxfm_add signature comment above:
@   r0 = dst, r1 = stride, r2 = block, r3 = eob (only tested for idct_idct,
@   later reused as the store pointer), r12 = coefficient table address.
@ The coefficients that are read from block are overwritten with zeros.
405cabdff1aSopenharmony_ci.macro itxfm_func8x8 txfm1, txfm2
406cabdff1aSopenharmony_cifunction ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1
407cabdff1aSopenharmony_ci        @ Push q4-q7 if iadst is used, idct requires
408cabdff1aSopenharmony_ci        @ a few scratch registers less, so only push q4-q5
409cabdff1aSopenharmony_ci        @ if only idct is involved.
410cabdff1aSopenharmony_ci        @ The iadst also uses a few coefficients from
411cabdff1aSopenharmony_ci        @ idct, so those always need to be loaded.
412cabdff1aSopenharmony_ci.ifc \txfm1\()_\txfm2,idct_idct
413cabdff1aSopenharmony_ci        movrel          r12, idct_coeffs
414cabdff1aSopenharmony_ci        vpush           {q4-q5}
415cabdff1aSopenharmony_ci.else
        @ q1 = iadst8_coeffs; the post-increment leaves r12 at idct_coeffs
416cabdff1aSopenharmony_ci        movrel          r12, iadst8_coeffs
417cabdff1aSopenharmony_ci        vld1.16         {q1}, [r12,:128]!
418cabdff1aSopenharmony_ci        vpush           {q4-q7}
419cabdff1aSopenharmony_ci.endif
        @ q0 = first eight shorts of idct_coeffs
420cabdff1aSopenharmony_ci        vld1.16         {q0}, [r12,:128]
421cabdff1aSopenharmony_ci
        @ q2, q3 = 0, used below to clear the coefficient block
422cabdff1aSopenharmony_ci        vmov.i16        q2, #0
423cabdff1aSopenharmony_ci        vmov.i16        q3, #0
424cabdff1aSopenharmony_ci
425cabdff1aSopenharmony_ci.ifc \txfm1\()_\txfm2,idct_idct
426cabdff1aSopenharmony_ci        cmp             r3,  #1
427cabdff1aSopenharmony_ci        bne             1f
428cabdff1aSopenharmony_ci        @ DC-only for idct/idct
        @ Scale the DC coefficient by d0[0] once per pass, replicate it into
        @ all of q8-q15 and clear it in memory.
429cabdff1aSopenharmony_ci        vld1.16         {d16[]}, [r2,:16]
430cabdff1aSopenharmony_ci        vmull.s16       q8,  d16, d0[0]
431cabdff1aSopenharmony_ci        vrshrn.s32      d16, q8,  #14
432cabdff1aSopenharmony_ci        vmull.s16       q8,  d16, d0[0]
433cabdff1aSopenharmony_ci        vrshrn.s32      d16, q8,  #14
434cabdff1aSopenharmony_ci        vdup.16         q8,  d16[0]
435cabdff1aSopenharmony_ci        vmov            q9,  q8
436cabdff1aSopenharmony_ci        vmov            q10, q8
437cabdff1aSopenharmony_ci        vmov            q11, q8
438cabdff1aSopenharmony_ci        vmov            q12, q8
439cabdff1aSopenharmony_ci        vmov            q13, q8
440cabdff1aSopenharmony_ci        vmov            q14, q8
441cabdff1aSopenharmony_ci        vmov            q15, q8
442cabdff1aSopenharmony_ci        vst1.16         {d4[0]}, [r2,:16]
443cabdff1aSopenharmony_ci        b               2f
444cabdff1aSopenharmony_ci.endif
445cabdff1aSopenharmony_ci1:
        @ Load all 64 coefficients (128 bytes) into q8-q15, then rewind r2
        @ and overwrite the same 128 bytes with zeros.
446cabdff1aSopenharmony_ci        vld1.16         {q8-q9},    [r2,:128]!
447cabdff1aSopenharmony_ci        vld1.16         {q10-q11},  [r2,:128]!
448cabdff1aSopenharmony_ci        vld1.16         {q12-q13},  [r2,:128]!
449cabdff1aSopenharmony_ci        vld1.16         {q14-q15},  [r2,:128]!
450cabdff1aSopenharmony_ci        sub             r2,  r2,  #128
451cabdff1aSopenharmony_ci        vst1.16         {q2-q3}, [r2,:128]!
452cabdff1aSopenharmony_ci        vst1.16         {q2-q3}, [r2,:128]!
453cabdff1aSopenharmony_ci        vst1.16         {q2-q3}, [r2,:128]!
454cabdff1aSopenharmony_ci        vst1.16         {q2-q3}, [r2,:128]!
455cabdff1aSopenharmony_ci
456cabdff1aSopenharmony_ci        \txfm1\()8
457cabdff1aSopenharmony_ci
458cabdff1aSopenharmony_ci        @ Transpose 8x8 with 16 bit elements
        @ NOTE(review): transpose16_4x4 is not defined in this part of the
        @ file; presumably it comes from neon.S - confirm.
459cabdff1aSopenharmony_ci        vswp            d17, d24
460cabdff1aSopenharmony_ci        vswp            d19, d26
461cabdff1aSopenharmony_ci        vswp            d21, d28
462cabdff1aSopenharmony_ci        vswp            d23, d30
463cabdff1aSopenharmony_ci        transpose16_4x4 q8, q9, q10, q11, q12, q13, q14, q15
464cabdff1aSopenharmony_ci
465cabdff1aSopenharmony_ci        \txfm2\()8
466cabdff1aSopenharmony_ci2:
        @ r3 (eob) is no longer needed; use it as the store pointer so that
        @ r0 can run ahead loading destination rows.
467cabdff1aSopenharmony_ci        mov             r3,  r0
468cabdff1aSopenharmony_ci        @ Add into the destination
        @ Destination rows are loaded into d4-d11. Loads, rounding shifts,
        @ widening adds and saturating narrows of different rows are
        @ interleaved.
469cabdff1aSopenharmony_ci        vld1.8          {d4},  [r0,:64], r1
470cabdff1aSopenharmony_ci        vrshr.s16       q8,  q8,  #5
471cabdff1aSopenharmony_ci        vld1.8          {d5},  [r0,:64], r1
472cabdff1aSopenharmony_ci        vrshr.s16       q9,  q9,  #5
473cabdff1aSopenharmony_ci        vld1.8          {d6},  [r0,:64], r1
474cabdff1aSopenharmony_ci        vrshr.s16       q10, q10, #5
475cabdff1aSopenharmony_ci        vaddw.u8        q8,  q8,  d4
476cabdff1aSopenharmony_ci        vld1.8          {d7},  [r0,:64], r1
477cabdff1aSopenharmony_ci        vrshr.s16       q11, q11, #5
478cabdff1aSopenharmony_ci        vaddw.u8        q9,  q9,  d5
479cabdff1aSopenharmony_ci        vld1.8          {d8},  [r0,:64], r1
480cabdff1aSopenharmony_ci        vrshr.s16       q12, q12, #5
481cabdff1aSopenharmony_ci        vaddw.u8        q10, q10, d6
482cabdff1aSopenharmony_ci        vqmovun.s16     d4,  q8
483cabdff1aSopenharmony_ci        vld1.8          {d9},  [r0,:64], r1
484cabdff1aSopenharmony_ci        vrshr.s16       q13, q13, #5
485cabdff1aSopenharmony_ci        vaddw.u8        q11, q11, d7
486cabdff1aSopenharmony_ci        vqmovun.s16     d5,  q9
487cabdff1aSopenharmony_ci        vld1.8          {d10}, [r0,:64], r1
488cabdff1aSopenharmony_ci        vrshr.s16       q14, q14, #5
489cabdff1aSopenharmony_ci        vaddw.u8        q12, q12, d8
490cabdff1aSopenharmony_ci        vqmovun.s16     d6,  q10
491cabdff1aSopenharmony_ci        vld1.8          {d11}, [r0,:64], r1
492cabdff1aSopenharmony_ci        vrshr.s16       q15, q15, #5
493cabdff1aSopenharmony_ci        vaddw.u8        q13, q13, d9
494cabdff1aSopenharmony_ci        vqmovun.s16     d7,  q11
495cabdff1aSopenharmony_ci
496cabdff1aSopenharmony_ci
        @ All eight rows have been read; store the results through r3
497cabdff1aSopenharmony_ci        vst1.8          {d4},  [r3,:64], r1
498cabdff1aSopenharmony_ci        vaddw.u8        q14, q14, d10
499cabdff1aSopenharmony_ci        vst1.8          {d5},  [r3,:64], r1
500cabdff1aSopenharmony_ci        vqmovun.s16     d8,  q12
501cabdff1aSopenharmony_ci        vst1.8          {d6},  [r3,:64], r1
502cabdff1aSopenharmony_ci        vaddw.u8        q15, q15, d11
503cabdff1aSopenharmony_ci        vst1.8          {d7},  [r3,:64], r1
504cabdff1aSopenharmony_ci        vqmovun.s16     d9,  q13
505cabdff1aSopenharmony_ci        vst1.8          {d8},  [r3,:64], r1
506cabdff1aSopenharmony_ci        vqmovun.s16     d10, q14
507cabdff1aSopenharmony_ci        vst1.8          {d9},  [r3,:64], r1
508cabdff1aSopenharmony_ci        vqmovun.s16     d11, q15
509cabdff1aSopenharmony_ci
510cabdff1aSopenharmony_ci        vst1.8          {d10}, [r3,:64], r1
511cabdff1aSopenharmony_ci        vst1.8          {d11}, [r3,:64], r1
512cabdff1aSopenharmony_ci
        @ Restore exactly the registers that were pushed on entry
513cabdff1aSopenharmony_ci.ifc \txfm1\()_\txfm2,idct_idct
514cabdff1aSopenharmony_ci        vpop            {q4-q5}
515cabdff1aSopenharmony_ci.else
516cabdff1aSopenharmony_ci        vpop            {q4-q7}
517cabdff1aSopenharmony_ci.endif
518cabdff1aSopenharmony_ci        bx              lr
519cabdff1aSopenharmony_ciendfunc
520cabdff1aSopenharmony_ci.endm
521cabdff1aSopenharmony_ci
@ Instantiate the 8x8 transform function for each of the four txfm1/txfm2
@ combinations of idct and iadst.
522cabdff1aSopenharmony_ciitxfm_func8x8 idct,  idct
523cabdff1aSopenharmony_ciitxfm_func8x8 iadst, idct
@ NOTE(review): .ltorg emits the pending literal pool here; presumably it is
@ placed between the instantiations to keep literals within range - confirm.
524cabdff1aSopenharmony_ci.ltorg
525cabdff1aSopenharmony_ciitxfm_func8x8 idct,  iadst
526cabdff1aSopenharmony_ciitxfm_func8x8 iadst, iadst
527cabdff1aSopenharmony_ci
528cabdff1aSopenharmony_ci
@ DC-only path of the 16x16 idct_idct transform.
@ Reached by a tail branch from ff_vp9_idct_idct_16x16_add_neon when r3 == 1,
@ before that function pushes anything, so the final bx lr returns straight
@ to the original caller.
@ r0 = dst (read and written 16 bytes per row, 128-bit aligned accesses)
@ r1 = dst stride
@ r2 = pointer to the 16-bit DC coefficient (read, then cleared to zero)
@ Clobbers r3, r12, q0 (d0), q2, q3, q8, q10-q13 and the flags.
529cabdff1aSopenharmony_cifunction idct16x16_dc_add_neon
530cabdff1aSopenharmony_ci        movrel          r12, idct_coeffs
531cabdff1aSopenharmony_ci        vld1.16         {d0}, [r12,:64]
532cabdff1aSopenharmony_ci
533cabdff1aSopenharmony_ci        vmov.i16        q2,  #0
534cabdff1aSopenharmony_ci
        @ Broadcast the DC coefficient, then scale it twice by d0[0]
        @ (11585, the first idct_coeffs entry) with a rounding shift by 14;
        @ presumably one scaling per 1-D pass - confirm.
535cabdff1aSopenharmony_ci        vld1.16         {d16[]}, [r2,:16]
536cabdff1aSopenharmony_ci        vmull.s16       q8,  d16, d0[0]
537cabdff1aSopenharmony_ci        vrshrn.s32      d16, q8,  #14
538cabdff1aSopenharmony_ci        vmull.s16       q8,  d16, d0[0]
539cabdff1aSopenharmony_ci        vrshrn.s32      d16, q8,  #14
540cabdff1aSopenharmony_ci        vdup.16         q8,  d16[0]
        @ d4[0] is zero (q2 was cleared above): reset the coefficient in memory.
541cabdff1aSopenharmony_ci        vst1.16         {d4[0]}, [r2,:16]
542cabdff1aSopenharmony_ci
        @ Final rounding shift by 6; q8 now holds the value added to every pixel.
543cabdff1aSopenharmony_ci        vrshr.s16       q8,  q8,  #6
544cabdff1aSopenharmony_ci
        @ r0 is the load pointer, r3 the store pointer; r12 counts the 16 rows,
        @ two of which are handled per loop iteration.
545cabdff1aSopenharmony_ci        mov             r3,  r0
546cabdff1aSopenharmony_ci        mov             r12, #16
547cabdff1aSopenharmony_ci1:
548cabdff1aSopenharmony_ci        @ Loop to add the constant from q8 into all 16x16 outputs
549cabdff1aSopenharmony_ci        subs            r12, r12, #2
550cabdff1aSopenharmony_ci        vld1.8          {q2},  [r0,:128], r1
551cabdff1aSopenharmony_ci        vaddw.u8        q10, q8,  d4
552cabdff1aSopenharmony_ci        vld1.8          {q3},  [r0,:128], r1
553cabdff1aSopenharmony_ci        vaddw.u8        q11, q8,  d5
554cabdff1aSopenharmony_ci        vaddw.u8        q12, q8,  d6
555cabdff1aSopenharmony_ci        vaddw.u8        q13, q8,  d7
        @ Narrow the 16-bit sums back to 8 bits with unsigned saturation.
556cabdff1aSopenharmony_ci        vqmovun.s16     d4,  q10
557cabdff1aSopenharmony_ci        vqmovun.s16     d5,  q11
558cabdff1aSopenharmony_ci        vqmovun.s16     d6,  q12
559cabdff1aSopenharmony_ci        vst1.8          {q2},  [r3,:128], r1
560cabdff1aSopenharmony_ci        vqmovun.s16     d7,  q13
561cabdff1aSopenharmony_ci        vst1.8          {q3},  [r3,:128], r1
        @ Flags are still those of the subs at the top of the loop; none of the
        @ NEON instructions in between modifies them.
562cabdff1aSopenharmony_ci        bne             1b
563cabdff1aSopenharmony_ci
564cabdff1aSopenharmony_ci        bx              lr
565cabdff1aSopenharmony_ciendfunc
566cabdff1aSopenharmony_ci.ltorg
567cabdff1aSopenharmony_ci
@ Shared tail of idct16, idct16_half and idct16_quarter: the last butterfly
@ stages of the 16-point IDCT, followed by the return from the enclosing
@ function (the macro itself ends in bx lr).
@ Reads the intermediates left by the caller in d4-d7, d16, d17 and d20-d29
@ (named in the per-line comments of the callers) and leaves out[0]..out[15]
@ in d16..d31 in order. d4, d5 and q8/q15 are used as scratch along the way.
@ butterfly, butterfly_r and mbutterfly0 are macros defined earlier in the file.
568cabdff1aSopenharmony_ci.macro idct16_end
569cabdff1aSopenharmony_ci        butterfly       d18, d7,  d4,  d7                @ d18 = t0a,  d7  = t7a
570cabdff1aSopenharmony_ci        butterfly       d19, d22, d5,  d22               @ d19 = t1a,  d22 = t6
571cabdff1aSopenharmony_ci        butterfly       d4,  d26, d20, d26               @ d4  = t2a,  d26 = t5
572cabdff1aSopenharmony_ci        butterfly       d5,  d6,  d28, d6                @ d5  = t3a,  d6  = t4
573cabdff1aSopenharmony_ci        butterfly       d20, d28, d16, d24               @ d20 = t8a,  d28 = t11a
574cabdff1aSopenharmony_ci        butterfly       d24, d21, d23, d21               @ d24 = t9,   d21 = t10
575cabdff1aSopenharmony_ci        butterfly       d23, d27, d25, d27               @ d23 = t14,  d27 = t13
576cabdff1aSopenharmony_ci        butterfly       d25, d29, d29, d17               @ d25 = t15a, d29 = t12a
577cabdff1aSopenharmony_ci
578cabdff1aSopenharmony_ci        mbutterfly0     d27, d21, d27, d21, d16, d30, q8, q15 @ d27 = t13a, d21 = t10a
579cabdff1aSopenharmony_ci        mbutterfly0     d29, d28, d29, d28, d16, d30, q8, q15 @ d29 = t12,  d28 = t11
580cabdff1aSopenharmony_ci
        @ Two swaps rotate t12/t13a/t11 into the registers that the in-place
        @ output butterflies below expect.
581cabdff1aSopenharmony_ci        vswp            d27, d29                         @ d27 = t12, d29 = t13a
582cabdff1aSopenharmony_ci        vswp            d28, d27                         @ d28 = t12, d27 = t11
583cabdff1aSopenharmony_ci        butterfly       d16, d31, d18, d25               @ d16 = out[0], d31 = out[15]
584cabdff1aSopenharmony_ci        butterfly       d17, d30, d19, d23               @ d17 = out[1], d30 = out[14]
585cabdff1aSopenharmony_ci        butterfly_r     d25, d22, d22, d24               @ d25 = out[9], d22 = out[6]
586cabdff1aSopenharmony_ci        butterfly       d23, d24, d7,  d20               @ d23 = out[7], d24 = out[8]
587cabdff1aSopenharmony_ci        butterfly       d18, d29, d4,  d29               @ d18 = out[2], d29 = out[13]
588cabdff1aSopenharmony_ci        butterfly       d19, d28, d5,  d28               @ d19 = out[3], d28 = out[12]
        @ Save t10a in d4 (no longer needed as t2a) because d21 is about to be
        @ overwritten with out[5] while t10a is still an input.
589cabdff1aSopenharmony_ci        vmov            d4,  d21                         @ d4  = t10a
590cabdff1aSopenharmony_ci        butterfly       d20, d27, d6,  d27               @ d20 = out[4], d27 = out[11]
591cabdff1aSopenharmony_ci        butterfly       d21, d26, d26, d4                @ d21 = out[5], d26 = out[10]
592cabdff1aSopenharmony_ci        bx              lr
593cabdff1aSopenharmony_ci.endm
594cabdff1aSopenharmony_ci
@ 16-point inverse DCT on a slice held in registers.
@ In:  d16-d31 = the 16 input rows (four 16-bit lanes each),
@      q0-q1   = idct_coeffs, used as the scalars d0[..]-d3[..]
@                (the 16x16 entry point loads them before calling).
@ Out: d16-d31 = out[0]..out[15], produced by idct16_end.
@ Uses q2, q3 (d4-d7) and q9/q15 as scratch. Internal helper called with bl;
@ the return (bx lr) is inside the idct16_end macro.
595cabdff1aSopenharmony_cifunction idct16
        @ Stage 1: rotations of the input pairs by the idct coefficients.
596cabdff1aSopenharmony_ci        mbutterfly0     d16, d24, d16, d24, d4, d6,  q2,  q3 @ d16 = t0a,  d24 = t1a
597cabdff1aSopenharmony_ci        mbutterfly      d20, d28, d0[2], d0[3], q2,  q3  @ d20 = t2a,  d28 = t3a
598cabdff1aSopenharmony_ci        mbutterfly      d18, d30, d1[0], d1[1], q2,  q3  @ d18 = t4a,  d30 = t7a
599cabdff1aSopenharmony_ci        mbutterfly      d26, d22, d1[2], d1[3], q2,  q3  @ d26 = t5a,  d22 = t6a
600cabdff1aSopenharmony_ci        mbutterfly      d17, d31, d2[0], d2[1], q2,  q3  @ d17 = t8a,  d31 = t15a
601cabdff1aSopenharmony_ci        mbutterfly      d25, d23, d2[2], d2[3], q2,  q3  @ d25 = t9a,  d23 = t14a
602cabdff1aSopenharmony_ci        mbutterfly      d21, d27, d3[0], d3[1], q2,  q3  @ d21 = t10a, d27 = t13a
603cabdff1aSopenharmony_ci        mbutterfly      d29, d19, d3[2], d3[3], q2,  q3  @ d29 = t11a, d19 = t12a
604cabdff1aSopenharmony_ci
        @ Stage 2: plain butterflies. From here on d4-d7 hold t0, t1, t4, t7
        @ rather than scratch data.
605cabdff1aSopenharmony_ci        butterfly       d4,  d28, d16, d28               @ d4  = t0,   d28 = t3
606cabdff1aSopenharmony_ci        butterfly       d5,  d20, d24, d20               @ d5  = t1,   d20 = t2
607cabdff1aSopenharmony_ci        butterfly       d6,  d26, d18, d26               @ d6  = t4,   d26 = t5
608cabdff1aSopenharmony_ci        butterfly       d7,  d22, d30, d22               @ d7  = t7,   d22 = t6
609cabdff1aSopenharmony_ci        butterfly       d16, d25, d17, d25               @ d16 = t8,   d25 = t9
610cabdff1aSopenharmony_ci        butterfly       d24, d21, d29, d21               @ d24 = t11,  d21 = t10
611cabdff1aSopenharmony_ci        butterfly       d17, d27, d19, d27               @ d17 = t12,  d27 = t13
612cabdff1aSopenharmony_ci        butterfly       d29, d23, d31, d23               @ d29 = t15,  d23 = t14
613cabdff1aSopenharmony_ci
        @ Stage 3: q9/q15 serve as scratch now that d4-d7 carry live values.
614cabdff1aSopenharmony_ci        mbutterfly0     d22, d26, d22, d26, d18, d30, q9,  q15  @ d22 = t6a, d26 = t5a
615cabdff1aSopenharmony_ci        mbutterfly      d23, d25, d0[2], d0[3], q9,  q15        @ d23 = t9a,  d25 = t14a
616cabdff1aSopenharmony_ci        mbutterfly      d27, d21, d0[2], d0[3], q9,  q15, neg=1 @ d27 = t13a, d21 = t10a
617cabdff1aSopenharmony_ci        idct16_end
618cabdff1aSopenharmony_ciendfunc
619cabdff1aSopenharmony_ci
@ Variant of idct16 whose first stage uses the _h forms of the rotation
@ macros (mbutterfly0_h, mbutterfly_h1, mbutterfly_h2); the later stages are
@ identical to idct16.
@ NOTE(review): presumably for slices where only the first 8 inputs
@ (d16-d23) can be nonzero - confirm against the _h macro definitions and
@ the idct16x16_half_add_neon caller.
@ In:  d16-d31 = inputs, q0-q1 = idct_coeffs.
@ Out: d16-d31 = out[0]..out[15]; returns through idct16_end.
620cabdff1aSopenharmony_cifunction idct16_half
621cabdff1aSopenharmony_ci        mbutterfly0_h   d16, d24, d16, d24, d4, d6,  q2,  q3 @ d16 = t0a,  d24 = t1a
622cabdff1aSopenharmony_ci        mbutterfly_h1   d20, d28, d0[2], d0[3], q2,  q3  @ d20 = t2a,  d28 = t3a
623cabdff1aSopenharmony_ci        mbutterfly_h1   d18, d30, d1[0], d1[1], q2,  q3  @ d18 = t4a,  d30 = t7a
624cabdff1aSopenharmony_ci        mbutterfly_h2   d26, d22, d1[2], d1[3], q2,  q3  @ d26 = t5a,  d22 = t6a
625cabdff1aSopenharmony_ci        mbutterfly_h1   d17, d31, d2[0], d2[1], q2,  q3  @ d17 = t8a,  d31 = t15a
626cabdff1aSopenharmony_ci        mbutterfly_h2   d25, d23, d2[2], d2[3], q2,  q3  @ d25 = t9a,  d23 = t14a
627cabdff1aSopenharmony_ci        mbutterfly_h1   d21, d27, d3[0], d3[1], q2,  q3  @ d21 = t10a, d27 = t13a
628cabdff1aSopenharmony_ci        mbutterfly_h2   d29, d19, d3[2], d3[3], q2,  q3  @ d29 = t11a, d19 = t12a
629cabdff1aSopenharmony_ci
        @ From here on the code is the same as the full idct16.
630cabdff1aSopenharmony_ci        butterfly       d4,  d28, d16, d28               @ d4  = t0,   d28 = t3
631cabdff1aSopenharmony_ci        butterfly       d5,  d20, d24, d20               @ d5  = t1,   d20 = t2
632cabdff1aSopenharmony_ci        butterfly       d6,  d26, d18, d26               @ d6  = t4,   d26 = t5
633cabdff1aSopenharmony_ci        butterfly       d7,  d22, d30, d22               @ d7  = t7,   d22 = t6
634cabdff1aSopenharmony_ci        butterfly       d16, d25, d17, d25               @ d16 = t8,   d25 = t9
635cabdff1aSopenharmony_ci        butterfly       d24, d21, d29, d21               @ d24 = t11,  d21 = t10
636cabdff1aSopenharmony_ci        butterfly       d17, d27, d19, d27               @ d17 = t12,  d27 = t13
637cabdff1aSopenharmony_ci        butterfly       d29, d23, d31, d23               @ d29 = t15,  d23 = t14
638cabdff1aSopenharmony_ci
639cabdff1aSopenharmony_ci        mbutterfly0     d22, d26, d22, d26, d18, d30, q9,  q15  @ d22 = t6a, d26 = t5a
640cabdff1aSopenharmony_ci        mbutterfly      d23, d25, d0[2], d0[3], q9,  q15        @ d23 = t9a,  d25 = t14a
641cabdff1aSopenharmony_ci        mbutterfly      d27, d21, d0[2], d0[3], q9,  q15, neg=1 @ d27 = t13a, d21 = t10a
642cabdff1aSopenharmony_ci        idct16_end
643cabdff1aSopenharmony_ciendfunc
644cabdff1aSopenharmony_ci
@ Reduced 16-point IDCT that reads only the first four inputs (d16-d19);
@ no other input register is consulted.
@ In:  d16-d19 = inputs 0..3, q0-q1 = idct_coeffs.
@ Out: d16-d31 = out[0]..out[15]; returns through idct16_end.
@ The code sets up exactly the intermediates idct16_end consumes, in the same
@ registers as the full idct16 leaves them.
645cabdff1aSopenharmony_cifunction idct16_quarter
        @ Each input is multiplied by a single coefficient, since the second
        @ operand of the corresponding full-size rotation is not read here.
646cabdff1aSopenharmony_ci        vmull.s16       q12, d19, d3[3]
647cabdff1aSopenharmony_ci        vmull.s16       q2,  d17, d2[0]
648cabdff1aSopenharmony_ci        vmull.s16       q3,  d18, d1[1]
649cabdff1aSopenharmony_ci        vmull.s16       q15, d18, d1[0]
650cabdff1aSopenharmony_ci        vneg.s32        q12, q12
651cabdff1aSopenharmony_ci        vmull.s16       q14, d17, d2[1]
652cabdff1aSopenharmony_ci        vmull.s16       q13, d19, d3[2]
653cabdff1aSopenharmony_ci        vmull.s16       q11, d16, d0[0]
        @ Rounding narrowing shift by 14 back to 16-bit lanes.
654cabdff1aSopenharmony_ci        vrshrn.s32      d24, q12, #14
655cabdff1aSopenharmony_ci        vrshrn.s32      d16, q2,  #14
656cabdff1aSopenharmony_ci        vrshrn.s32      d7,  q3,  #14
657cabdff1aSopenharmony_ci        vrshrn.s32      d6,  q15, #14
658cabdff1aSopenharmony_ci        vrshrn.s32      d29, q14, #14
659cabdff1aSopenharmony_ci        vrshrn.s32      d17, q13, #14
660cabdff1aSopenharmony_ci        vrshrn.s32      d28, q11, #14
661cabdff1aSopenharmony_ci
662cabdff1aSopenharmony_ci        mbutterfly_l    q10, q11, d17, d24, d0[2], d0[3]
663cabdff1aSopenharmony_ci        mbutterfly_l    q9,  q15, d29, d16, d0[2], d0[3]
664cabdff1aSopenharmony_ci        vneg.s32        q11, q11
665cabdff1aSopenharmony_ci        vrshrn.s32      d27, q10, #14
666cabdff1aSopenharmony_ci        vrshrn.s32      d21, q11, #14
667cabdff1aSopenharmony_ci        vrshrn.s32      d23, q9,  #14
668cabdff1aSopenharmony_ci        vrshrn.s32      d25, q15, #14
        @ The scaled input 0 (d28) is replicated into d4, d5 and d20, the
        @ registers that idct16_end reads together with d28 for its first
        @ four butterflies.
669cabdff1aSopenharmony_ci        vmov            d4,  d28
670cabdff1aSopenharmony_ci        vmov            d5,  d28
671cabdff1aSopenharmony_ci        mbutterfly0     d22, d26, d7,  d6,  d18, d30, q9,  q15
672cabdff1aSopenharmony_ci        vmov            d20, d28
673cabdff1aSopenharmony_ci        idct16_end
674cabdff1aSopenharmony_ciendfunc
675cabdff1aSopenharmony_ci
@ 16-point inverse ADST on a slice held in registers.
@ In:  d16-d31 = the 16 input rows (four 16-bit lanes each).
@ Out: d16-d31 = out[0]..out[15].
@ Loads its own coefficients: q0-q1 from iadst16_coeffs first, then q0 from
@ idct_coeffs for the later stages. On return q0 still holds the first eight
@ idct_coeffs, but q1 (d2/d3) has been reused as scratch, so callers that
@ need the full idct coefficient set afterwards must reload it.
@ Clobbers r12 and q0-q7. q4-q7 are written here, which is why the 16x16
@ entry points that use this function save and restore them with
@ vpush/vpop {q4-q7}.
@ Internal helper called with bl; returns with bx lr.
676cabdff1aSopenharmony_cifunction iadst16
677cabdff1aSopenharmony_ci        movrel          r12, iadst16_coeffs
678cabdff1aSopenharmony_ci        vld1.16         {q0-q1}, [r12,:128]
679cabdff1aSopenharmony_ci
        @ First stage: 32-bit rotations (mbutterfly_l) interleaved with the
        @ narrowing butterflies (butterfly_n) that consume them.
680cabdff1aSopenharmony_ci        mbutterfly_l    q3,  q2,  d31, d16, d0[1], d0[0] @ q3  = t1,   q2  = t0
681cabdff1aSopenharmony_ci        mbutterfly_l    q5,  q4,  d23, d24, d1[1], d1[0] @ q5  = t9,   q4  = t8
682cabdff1aSopenharmony_ci        butterfly_n     d31, d24, q3,  q5,  q6,  q5      @ d31 = t1a,  d24 = t9a
683cabdff1aSopenharmony_ci        mbutterfly_l    q7,  q6,  d29, d18, d0[3], d0[2] @ q7  = t3,   q6  = t2
684cabdff1aSopenharmony_ci        butterfly_n     d16, d23, q2,  q4,  q3,  q4      @ d16 = t0a,  d23 = t8a
685cabdff1aSopenharmony_ci
686cabdff1aSopenharmony_ci        mbutterfly_l    q3,  q2,  d21, d26, d1[3], d1[2] @ q3  = t11,  q2  = t10
687cabdff1aSopenharmony_ci        butterfly_n     d29, d26, q7,  q3,  q4,  q3      @ d29 = t3a,  d26 = t11a
688cabdff1aSopenharmony_ci        mbutterfly_l    q5,  q4,  d27, d20, d2[1], d2[0] @ q5  = t5,   q4  = t4
689cabdff1aSopenharmony_ci        butterfly_n     d18, d21, q6,  q2,  q3,  q2      @ d18 = t2a,  d21 = t10a
690cabdff1aSopenharmony_ci
691cabdff1aSopenharmony_ci        mbutterfly_l    q7,  q6,  d19, d28, d3[1], d3[0] @ q7  = t13,  q6  = t12
692cabdff1aSopenharmony_ci        butterfly_n     d20, d28, q5,  q7,  q2,  q7      @ d20 = t5a,  d28 = t13a
693cabdff1aSopenharmony_ci        mbutterfly_l    q3,  q2,  d25, d22, d2[3], d2[2] @ q3  = t7,   q2  = t6
694cabdff1aSopenharmony_ci        butterfly_n     d27, d19, q4,  q6,  q5,  q6      @ d27 = t4a,  d19 = t12a
695cabdff1aSopenharmony_ci
696cabdff1aSopenharmony_ci        mbutterfly_l    q5,  q4,  d17, d30, d3[3], d3[2] @ q5  = t15,  q4  = t14
        @ The last use of the iadst16 coefficients is the line above; switch
        @ q0 to the first eight idct_coeffs for the remaining stages.
697cabdff1aSopenharmony_ci        movrel          r12, idct_coeffs
698cabdff1aSopenharmony_ci        vld1.16         {q0}, [r12,:128]
699cabdff1aSopenharmony_ci        butterfly_n     d22, d30, q3,  q5,  q6,  q5      @ d22 = t7a,  d30 = t15a
700cabdff1aSopenharmony_ci        mbutterfly_l    q7,  q6,  d23, d24, d1[0], d1[1] @ q7  = t9,   q6  = t8
701cabdff1aSopenharmony_ci        butterfly_n     d25, d17, q2,  q4,  q3,  q4      @ d25 = t6a,  d17 = t14a
702cabdff1aSopenharmony_ci
703cabdff1aSopenharmony_ci        mbutterfly_l    q2,  q3,  d28, d19, d1[1], d1[0] @ q2  = t12,  q3  = t13
704cabdff1aSopenharmony_ci        butterfly_n     d23, d19, q6,  q2,  q4,  q2      @ d23 = t8a,  d19 = t12a
705cabdff1aSopenharmony_ci        mbutterfly_l    q5,  q4,  d21, d26, d1[2], d1[3] @ q5  = t11,  q4  = t10
706cabdff1aSopenharmony_ci        butterfly_r     d4,  d27, d16, d27               @ d4  = t4,   d27 = t0
707cabdff1aSopenharmony_ci        butterfly_n     d24, d28, q7,  q3,  q6,  q3      @ d24 = t9a,  d28 = t13a
708cabdff1aSopenharmony_ci
709cabdff1aSopenharmony_ci        mbutterfly_l    q6,  q7,  d30, d17, d1[3], d1[2] @ q6  = t14,  q7  = t15
710cabdff1aSopenharmony_ci        butterfly_r     d5,  d20, d31, d20               @ d5  = t5,   d20 = t1
711cabdff1aSopenharmony_ci        butterfly_n     d21, d17, q4,  q6,  q3,  q6      @ d21 = t10a, d17 = t14a
712cabdff1aSopenharmony_ci        butterfly_n     d26, d30, q5,  q7,  q4,  q7      @ d26 = t11a, d30 = t15a
713cabdff1aSopenharmony_ci
714cabdff1aSopenharmony_ci        butterfly_r     d6,  d25, d18, d25               @ d6  = t6,   d25 = t2
715cabdff1aSopenharmony_ci        butterfly_r     d7,  d22, d29, d22               @ d7  = t7,   d22 = t3
716cabdff1aSopenharmony_ci
717cabdff1aSopenharmony_ci        mbutterfly_l    q5,  q4,  d19, d28, d0[2], d0[3] @ q5  = t13,  q4  = t12
718cabdff1aSopenharmony_ci        mbutterfly_l    q6,  q7,  d30, d17, d0[3], d0[2] @ q6  = t14,  q7  = t15
719cabdff1aSopenharmony_ci
720cabdff1aSopenharmony_ci        butterfly_n     d18, d30, q4,  q6,  q8,  q6      @ d18 = out[2],   d30 = t14a
721cabdff1aSopenharmony_ci        butterfly_n     d29, d17, q5,  q7,  q6,  q7      @ d29 = -out[13], d17 = t15a
722cabdff1aSopenharmony_ci        vneg.s16        d29, d29                         @ d29 = out[13]
723cabdff1aSopenharmony_ci
724cabdff1aSopenharmony_ci        mbutterfly_l    q5,  q4,  d4,  d5,  d0[2], d0[3] @ q5  = t5a,  q4  = t4a
725cabdff1aSopenharmony_ci        mbutterfly_l    q6,  q7,  d7,  d6,  d0[3], d0[2] @ q6  = t6a,  q7  = t7a
726cabdff1aSopenharmony_ci
        @ d2-d5 are used as holding registers for out[0], -out[1], out[14]
        @ and -out[15], because their final homes (d16, d17, d30, d31) are
        @ still read or written as temporaries below.
727cabdff1aSopenharmony_ci        butterfly       d2,  d6,  d27, d25               @ d2 = out[0], d6 = t2a
728cabdff1aSopenharmony_ci        butterfly       d3,  d7,  d23, d21               @ d3 =-out[1], d7 = t10
729cabdff1aSopenharmony_ci
730cabdff1aSopenharmony_ci        butterfly_n     d19, d31, q4,  q6,  q2,  q4      @ d19 = -out[3],  d31 = t6
731cabdff1aSopenharmony_ci        vneg.s16        d19, d19                         @ d19 = out[3]
732cabdff1aSopenharmony_ci        butterfly_n     d28, d16, q5,  q7,  q2,  q5      @ d28 = out[12],  d16 = t7
733cabdff1aSopenharmony_ci
734cabdff1aSopenharmony_ci        butterfly       d5,  d8,  d20, d22               @ d5 =-out[15],d8 = t3a
735cabdff1aSopenharmony_ci        butterfly       d4,  d9,  d24, d26               @ d4 = out[14],d9 = t11
736cabdff1aSopenharmony_ci
737cabdff1aSopenharmony_ci        mbutterfly0     d23, d24, d6,  d8,  d10, d11, q6,  q7, 1 @ d23 = out[7], d24 = out[8]
738cabdff1aSopenharmony_ci        mbutterfly0     d20, d27, d16, d31, d10, d11, q6,  q7    @ d20 = out[4], d27 = out[11]
739cabdff1aSopenharmony_ci        mbutterfly0     d22, d25, d9,  d7,  d10, d11, q6,  q7    @ d22 = out[6], d25 = out[9]
740cabdff1aSopenharmony_ci        mbutterfly0     d21, d26, d30, d17, d10, d11, q6,  q7, 1 @ d21 = out[5], d26 = out[10]
741cabdff1aSopenharmony_ci
        @ Move the held outputs into place, negating the two that were
        @ computed with inverted sign.
742cabdff1aSopenharmony_ci        vneg.s16        d31, d5                          @ d31 = out[15]
743cabdff1aSopenharmony_ci        vneg.s16        d17, d3                          @ d17 = out[1]
744cabdff1aSopenharmony_ci
745cabdff1aSopenharmony_ci        vmov            d16, d2
746cabdff1aSopenharmony_ci        vmov            d30, d4
747cabdff1aSopenharmony_ci        bx              lr
748cabdff1aSopenharmony_ciendfunc
749cabdff1aSopenharmony_ci
@ Add eight rows of four residuals to the destination pixels and store them.
@ coef0-coef3: four q registers of 16-bit residuals. The low half of each is
@              added to a 4-pixel row addressed by r0, the high half to a
@              row addressed by r3.
@ r0, r3:      pointers to two 4-byte rows; both advance by r1 per access.
@              The only use in this file sets r3 = r0 + stride and
@              r1 = 2 * stride, so r0 walks even rows and r3 odd rows.
@ Steps: rounding shift right by 6, widening add of the u8 pixels, narrowing
@ to u8 with unsigned saturation, store back to the same rows.
@ On exit r0 and r3 have each advanced by 4 * r1, ready for the next
@ invocation. Clobbers q2-q3 (d4-d7) and the four coef registers.
750cabdff1aSopenharmony_ci.macro load_add_store coef0, coef1, coef2, coef3
751cabdff1aSopenharmony_ci        vrshr.s16       \coef0, \coef0, #6
752cabdff1aSopenharmony_ci        vrshr.s16       \coef1, \coef1, #6
753cabdff1aSopenharmony_ci
        @ The first load of each pair fills both lanes of the d register from
        @ the r0 row; the second then replaces lane 1 with the r3 row.
754cabdff1aSopenharmony_ci        vld1.32         {d4[]},   [r0,:32], r1
755cabdff1aSopenharmony_ci        vld1.32         {d4[1]},  [r3,:32], r1
756cabdff1aSopenharmony_ci        vrshr.s16       \coef2, \coef2, #6
757cabdff1aSopenharmony_ci        vrshr.s16       \coef3, \coef3, #6
758cabdff1aSopenharmony_ci        vld1.32         {d5[]},   [r0,:32], r1
759cabdff1aSopenharmony_ci        vld1.32         {d5[1]},  [r3,:32], r1
760cabdff1aSopenharmony_ci        vaddw.u8        \coef0, \coef0, d4
761cabdff1aSopenharmony_ci        vld1.32         {d6[]},   [r0,:32], r1
762cabdff1aSopenharmony_ci        vld1.32         {d6[1]},  [r3,:32], r1
763cabdff1aSopenharmony_ci        vaddw.u8        \coef1, \coef1, d5
764cabdff1aSopenharmony_ci        vld1.32         {d7[]},   [r0,:32], r1
765cabdff1aSopenharmony_ci        vld1.32         {d7[1]},  [r3,:32], r1
766cabdff1aSopenharmony_ci
767cabdff1aSopenharmony_ci        vqmovun.s16     d4,  \coef0
768cabdff1aSopenharmony_ci        vqmovun.s16     d5,  \coef1
        @ Rewind both pointers over the four rows just loaded so that the
        @ stores below hit the same addresses.
769cabdff1aSopenharmony_ci        sub             r0,  r0,  r1, lsl #2
770cabdff1aSopenharmony_ci        sub             r3,  r3,  r1, lsl #2
771cabdff1aSopenharmony_ci        vaddw.u8        \coef2, \coef2, d6
772cabdff1aSopenharmony_ci        vaddw.u8        \coef3, \coef3, d7
773cabdff1aSopenharmony_ci        vst1.32         {d4[0]},  [r0,:32], r1
774cabdff1aSopenharmony_ci        vst1.32         {d4[1]},  [r3,:32], r1
775cabdff1aSopenharmony_ci        vqmovun.s16     d6,  \coef2
776cabdff1aSopenharmony_ci        vst1.32         {d5[0]},  [r0,:32], r1
777cabdff1aSopenharmony_ci        vst1.32         {d5[1]},  [r3,:32], r1
778cabdff1aSopenharmony_ci        vqmovun.s16     d7,  \coef3
779cabdff1aSopenharmony_ci
780cabdff1aSopenharmony_ci        vst1.32         {d6[0]},  [r0,:32], r1
781cabdff1aSopenharmony_ci        vst1.32         {d6[1]},  [r3,:32], r1
782cabdff1aSopenharmony_ci        vst1.32         {d7[0]},  [r0,:32], r1
783cabdff1aSopenharmony_ci        vst1.32         {d7[1]},  [r3,:32], r1
784cabdff1aSopenharmony_ci.endm
785cabdff1aSopenharmony_ci
@ Generate the two per-slice 1-D pass functions for one transform type.
@ txfm: idct or iadst; selects which 16-point kernel (idct16 / iadst16) is
@       called and forms the prefix of the generated function names.
@ Both functions push lr, because they call the kernel with bl, and return
@ with pop {pc}.
786cabdff1aSopenharmony_ci.macro itxfm16_1d_funcs txfm
787cabdff1aSopenharmony_ci@ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
788cabdff1aSopenharmony_ci@ transpose into a horizontal 16x4 slice and store.
789cabdff1aSopenharmony_ci@ r0 = dst (temp buffer)
790cabdff1aSopenharmony_ci@ r1 = slice offset
791cabdff1aSopenharmony_ci@ r2 = src
792cabdff1aSopenharmony_cifunction \txfm\()16_1d_4x16_pass1_neon
793cabdff1aSopenharmony_ci        push            {lr}
794cabdff1aSopenharmony_ci
        @ r12 = 32 bytes = one row of 16 coefficients of 16 bits each.
        @ Each row of the slice is loaded and then overwritten with zero
        @ (d4), so the coefficient buffer is cleared as it is consumed.
795cabdff1aSopenharmony_ci        mov             r12, #32
796cabdff1aSopenharmony_ci        vmov.s16        q2,  #0
797cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
798cabdff1aSopenharmony_ci        vld1.16         {d\i}, [r2,:64]
799cabdff1aSopenharmony_ci        vst1.16         {d4},  [r2,:64], r12
800cabdff1aSopenharmony_ci.endr
801cabdff1aSopenharmony_ci
802cabdff1aSopenharmony_ci        bl              \txfm\()16
803cabdff1aSopenharmony_ci
804cabdff1aSopenharmony_ci        @ Do four 4x4 transposes. Originally, d16-d31 contain the
805cabdff1aSopenharmony_ci        @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
806cabdff1aSopenharmony_ci        @ contain the transposed 4x4 blocks.
807cabdff1aSopenharmony_ci        transpose16_q_4x_4x4 q8,  q9,  q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
808cabdff1aSopenharmony_ci
809cabdff1aSopenharmony_ci        @ Store the transposed 4x4 blocks horizontally.
810cabdff1aSopenharmony_ci        cmp             r1,  #12
811cabdff1aSopenharmony_ci        beq             1f
        @ Common case: write four full 32-byte rows, taking one d register
        @ from each transposed block per row.
812cabdff1aSopenharmony_ci.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
813cabdff1aSopenharmony_ci        vst1.16         {d\i}, [r0,:64]!
814cabdff1aSopenharmony_ci.endr
815cabdff1aSopenharmony_ci        pop             {pc}
816cabdff1aSopenharmony_ci1:
817cabdff1aSopenharmony_ci        @ Special case: For the last input column (r1 == 12),
818cabdff1aSopenharmony_ci        @ which would be stored as the last row in the temp buffer,
819cabdff1aSopenharmony_ci        @ don't store the first 4x4 block, but keep it in registers
820cabdff1aSopenharmony_ci        @ for the first slice of the second pass (where it is the
821cabdff1aSopenharmony_ci        @ last 4x4 block).
        @ Each add of 8 bytes skips the slot of the block that is not stored.
822cabdff1aSopenharmony_ci        add             r0,  r0,  #8
823cabdff1aSopenharmony_ci        vst1.16         {d20}, [r0,:64]!
824cabdff1aSopenharmony_ci        vst1.16         {d24}, [r0,:64]!
825cabdff1aSopenharmony_ci        vst1.16         {d28}, [r0,:64]!
826cabdff1aSopenharmony_ci        add             r0,  r0,  #8
827cabdff1aSopenharmony_ci        vst1.16         {d21}, [r0,:64]!
828cabdff1aSopenharmony_ci        vst1.16         {d25}, [r0,:64]!
829cabdff1aSopenharmony_ci        vst1.16         {d29}, [r0,:64]!
830cabdff1aSopenharmony_ci        add             r0,  r0,  #8
831cabdff1aSopenharmony_ci        vst1.16         {d22}, [r0,:64]!
832cabdff1aSopenharmony_ci        vst1.16         {d26}, [r0,:64]!
833cabdff1aSopenharmony_ci        vst1.16         {d30}, [r0,:64]!
834cabdff1aSopenharmony_ci        add             r0,  r0,  #8
835cabdff1aSopenharmony_ci        vst1.16         {d23}, [r0,:64]!
836cabdff1aSopenharmony_ci        vst1.16         {d27}, [r0,:64]!
837cabdff1aSopenharmony_ci        vst1.16         {d31}, [r0,:64]!
        @ Hand the unstored block over in d28-d31, the registers that pass 2
        @ leaves unloaded for its first slice.
838cabdff1aSopenharmony_ci        vmov            d28, d16
839cabdff1aSopenharmony_ci        vmov            d29, d17
840cabdff1aSopenharmony_ci        vmov            d30, d18
841cabdff1aSopenharmony_ci        vmov            d31, d19
842cabdff1aSopenharmony_ci        pop             {pc}
843cabdff1aSopenharmony_ciendfunc
844cabdff1aSopenharmony_ci
845cabdff1aSopenharmony_ci@ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
846cabdff1aSopenharmony_ci@ load the destination pixels (from a similar 4x16 slice), add and store back.
847cabdff1aSopenharmony_ci@ r0 = dst
848cabdff1aSopenharmony_ci@ r1 = dst stride
849cabdff1aSopenharmony_ci@ r2 = src (temp buffer)
850cabdff1aSopenharmony_ci@ r3 = slice offset
851cabdff1aSopenharmony_cifunction \txfm\()16_1d_4x16_pass2_neon
852cabdff1aSopenharmony_ci        push            {lr}
853cabdff1aSopenharmony_ci        mov             r12, #32
854cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
855cabdff1aSopenharmony_ci        vld1.16         {d\i}, [r2,:64], r12
856cabdff1aSopenharmony_ci.endr
        @ For the first slice (r3 == 0) the last four rows are not loaded;
        @ d28-d31 are used as they were left by pass 1.
857cabdff1aSopenharmony_ci        cmp             r3,  #0
858cabdff1aSopenharmony_ci        beq             1f
859cabdff1aSopenharmony_ci.irp i, 28, 29, 30, 31
860cabdff1aSopenharmony_ci        vld1.16         {d\i}, [r2,:64], r12
861cabdff1aSopenharmony_ci.endr
862cabdff1aSopenharmony_ci1:
863cabdff1aSopenharmony_ci
        @ Set up the pointers for load_add_store: r3 = second row of dst and
        @ r1 = twice the stride, so r0 and r3 address alternating rows.
864cabdff1aSopenharmony_ci        add             r3,  r0,  r1
865cabdff1aSopenharmony_ci        lsl             r1,  r1,  #1
866cabdff1aSopenharmony_ci        bl              \txfm\()16
867cabdff1aSopenharmony_ci
        @ Rows 0-7, then rows 8-15.
868cabdff1aSopenharmony_ci        load_add_store  q8,  q9,  q10, q11
869cabdff1aSopenharmony_ci        load_add_store  q12, q13, q14, q15
870cabdff1aSopenharmony_ci
871cabdff1aSopenharmony_ci        pop             {pc}
872cabdff1aSopenharmony_ciendfunc
873cabdff1aSopenharmony_ci.endm
874cabdff1aSopenharmony_ci
@ Emit idct16_1d_4x16_pass1_neon / idct16_1d_4x16_pass2_neon and the
@ corresponding iadst16_1d_4x16_pass1_neon / iadst16_1d_4x16_pass2_neon.
875cabdff1aSopenharmony_ciitxfm16_1d_funcs idct
876cabdff1aSopenharmony_ciitxfm16_1d_funcs iadst
877cabdff1aSopenharmony_ci
878cabdff1aSopenharmony_ci@ This is the minimum eob value for each subpartition, in increments of 4
@ One 16-bit entry per 4-column slice of pass 1 (slice offsets 0, 4, 8, 12).
@ The idct_idct 16x16 function starts reading at the second entry: before
@ transforming slice 4, 8 or 12 it compares eob with that slice's entry and,
@ if eob is less than or equal to it, treats this slice and all later ones
@ as all-zero. The entry for slice 0 is never read by that code.
879cabdff1aSopenharmony_ciconst min_eob_idct_idct_16, align=4
880cabdff1aSopenharmony_ci        .short  0, 10, 38, 89
881cabdff1aSopenharmony_ciendconst
882cabdff1aSopenharmony_ci
@ Generate the exported 16x16 inverse transform + add function
@ ff_vp9_<txfm1>_<txfm2>_16x16_add_neon.
@ txfm1 selects the transform used for pass 1 (coefficients into the temp
@ buffer), txfm2 the one used for pass 2 (temp buffer added onto dst).
@ r0 = dst, r1 = dst stride, r2 = coefficient block,
@ r3 = eob (only examined in the idct_idct variant).
@ idct_idct has three shortcuts: eob == 1 goes to the DC-only function,
@ eob <= 10 and eob <= 38 go to the quarter/half variants (defined outside
@ this macro), and trailing pass 1 slices below the min_eob_idct_idct_16
@ thresholds are replaced by zero fill.
@ The other variants save q4-q7 because iadst16 writes them; idct_idct does
@ not touch q4-q7 on the paths visible in this macro.
883cabdff1aSopenharmony_ci.macro itxfm_func16x16 txfm1, txfm2
884cabdff1aSopenharmony_cifunction ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
885cabdff1aSopenharmony_ci.ifc \txfm1\()_\txfm2,idct_idct
        @ Tail branch taken before anything is pushed, so the DC function
        @ returns directly to our caller.
886cabdff1aSopenharmony_ci        cmp             r3,  #1
887cabdff1aSopenharmony_ci        beq             idct16x16_dc_add_neon
888cabdff1aSopenharmony_ci.endif
889cabdff1aSopenharmony_ci        push            {r4-r8,lr}
890cabdff1aSopenharmony_ci.ifnc \txfm1\()_\txfm2,idct_idct
891cabdff1aSopenharmony_ci        vpush           {q4-q7}
892cabdff1aSopenharmony_ci.endif
893cabdff1aSopenharmony_ci
894cabdff1aSopenharmony_ci        @ Align the stack, allocate a temp buffer
        @ r7 = 512 + (sp & 15): subtracting it leaves sp 16-byte aligned with
        @ room for 16x16 16-bit values; r7 is kept to restore sp at the end.
        @ NOTE(review): the T / A prefixes presumably select the Thumb / ARM
        @ encodings of this sequence (macros from asm.S) - confirm.
895cabdff1aSopenharmony_ciT       mov             r7,  sp
896cabdff1aSopenharmony_ciT       and             r7,  r7,  #15
897cabdff1aSopenharmony_ciA       and             r7,  sp,  #15
898cabdff1aSopenharmony_ci        add             r7,  r7,  #512
899cabdff1aSopenharmony_ci        sub             sp,  sp,  r7
900cabdff1aSopenharmony_ci
        @ Keep dst, stride and src in callee-saved registers across the calls.
901cabdff1aSopenharmony_ci        mov             r4,  r0
902cabdff1aSopenharmony_ci        mov             r5,  r1
903cabdff1aSopenharmony_ci        mov             r6,  r2
904cabdff1aSopenharmony_ci
905cabdff1aSopenharmony_ci.ifc \txfm1,idct
906cabdff1aSopenharmony_ci        movrel          r12, idct_coeffs
907cabdff1aSopenharmony_ci        vld1.16         {q0-q1}, [r12,:128]
908cabdff1aSopenharmony_ci.endif
909cabdff1aSopenharmony_ci
910cabdff1aSopenharmony_ci.ifc \txfm1\()_\txfm2,idct_idct
        @ Small eob: branch (not call) to the reduced variants, which take
        @ over with the stack frame set up above.
911cabdff1aSopenharmony_ci        cmp             r3,  #10
912cabdff1aSopenharmony_ci        ble             idct16x16_quarter_add_neon
913cabdff1aSopenharmony_ci        cmp             r3,  #38
914cabdff1aSopenharmony_ci        ble             idct16x16_half_add_neon
915cabdff1aSopenharmony_ci
        @ r8 walks the threshold table, starting at its second entry.
916cabdff1aSopenharmony_ci        movrel          r8,  min_eob_idct_idct_16 + 2
917cabdff1aSopenharmony_ci.endif
918cabdff1aSopenharmony_ci
        @ Pass 1: one call per 4-column slice. r0 = temp buffer row block
        @ (4 rows of 32 bytes), r1 = slice offset, r2 = src column offset.
        @ In the idct_idct variant, slices after the first are skipped once
        @ eob is at or below the slice threshold; r1 then holds the number
        @ of slices left to zero-fill at label 1.
919cabdff1aSopenharmony_ci.irp i, 0, 4, 8, 12
920cabdff1aSopenharmony_ci        add             r0,  sp,  #(\i*32)
921cabdff1aSopenharmony_ci.ifc \txfm1\()_\txfm2,idct_idct
922cabdff1aSopenharmony_ci.if \i > 0
923cabdff1aSopenharmony_ci        ldrh_post       r1,  r8,  #2
924cabdff1aSopenharmony_ci        cmp             r3,  r1
925cabdff1aSopenharmony_ci        it              le
926cabdff1aSopenharmony_ci        movle           r1,  #(16 - \i)/4
927cabdff1aSopenharmony_ci        ble             1f
928cabdff1aSopenharmony_ci.endif
929cabdff1aSopenharmony_ci.endif
930cabdff1aSopenharmony_ci        mov             r1,  #\i
931cabdff1aSopenharmony_ci        add             r2,  r6,  #(\i*2)
932cabdff1aSopenharmony_ci        bl              \txfm1\()16_1d_4x16_pass1_neon
933cabdff1aSopenharmony_ci.endr
934cabdff1aSopenharmony_ci
935cabdff1aSopenharmony_ci.ifc \txfm1\()_\txfm2,idct_idct
936cabdff1aSopenharmony_ci        b               3f
937cabdff1aSopenharmony_ci1:
938cabdff1aSopenharmony_ci        @ For all-zero slices in pass 1, set d28-d31 to zero, for the in-register
939cabdff1aSopenharmony_ci        @ passthrough of coefficients to pass 2 and clear the end of the temp buffer
940cabdff1aSopenharmony_ci        vmov.i16        q14, #0
941cabdff1aSopenharmony_ci        vmov.i16        q15, #0
942cabdff1aSopenharmony_ci2:
        @ Each iteration writes 4 x 32 = 128 zero bytes, one slice worth of
        @ the temp buffer; the flags from subs survive the NEON stores.
943cabdff1aSopenharmony_ci        subs            r1,  r1,  #1
944cabdff1aSopenharmony_ci.rept 4
945cabdff1aSopenharmony_ci        vst1.16         {q14-q15}, [r0,:128]!
946cabdff1aSopenharmony_ci.endr
947cabdff1aSopenharmony_ci        bne             2b
948cabdff1aSopenharmony_ci3:
949cabdff1aSopenharmony_ci.endif
950cabdff1aSopenharmony_ci
951cabdff1aSopenharmony_ci.ifc \txfm1\()_\txfm2,iadst_idct
        @ Pass 1 (iadst16) loaded its own coefficients into q0-q1, so the
        @ idct coefficients needed by pass 2 are loaded here.
952cabdff1aSopenharmony_ci        movrel          r12, idct_coeffs
953cabdff1aSopenharmony_ci        vld1.16         {q0-q1}, [r12,:128]
954cabdff1aSopenharmony_ci.endif
        @ Pass 2: one call per 4-column slice. r0 = dst column offset,
        @ r1 = stride, r2 = temp buffer column offset, r3 = slice offset.
955cabdff1aSopenharmony_ci.irp i, 0, 4, 8, 12
956cabdff1aSopenharmony_ci        add             r0,  r4,  #(\i)
957cabdff1aSopenharmony_ci        mov             r1,  r5
958cabdff1aSopenharmony_ci        add             r2,  sp,  #(\i*2)
959cabdff1aSopenharmony_ci        mov             r3,  #\i
960cabdff1aSopenharmony_ci        bl              \txfm2\()16_1d_4x16_pass2_neon
961cabdff1aSopenharmony_ci.endr
962cabdff1aSopenharmony_ci
        @ Release the temp buffer together with the alignment padding.
963cabdff1aSopenharmony_ci        add             sp,  sp,  r7
964cabdff1aSopenharmony_ci.ifnc \txfm1\()_\txfm2,idct_idct
965cabdff1aSopenharmony_ci        vpop            {q4-q7}
966cabdff1aSopenharmony_ci.endif
967cabdff1aSopenharmony_ci        pop             {r4-r8,pc}
968cabdff1aSopenharmony_ciendfunc
969cabdff1aSopenharmony_ci.endm
970cabdff1aSopenharmony_ci
@ Emit the four exported 16x16 functions, one per combination of pass 1
@ transform (first argument) and pass 2 transform (second argument), then
@ flush the literal pool.
971cabdff1aSopenharmony_ciitxfm_func16x16 idct,  idct
972cabdff1aSopenharmony_ciitxfm_func16x16 iadst, idct
973cabdff1aSopenharmony_ciitxfm_func16x16 idct,  iadst
974cabdff1aSopenharmony_ciitxfm_func16x16 iadst, iadst
975cabdff1aSopenharmony_ci.ltorg
976cabdff1aSopenharmony_ci
@ Pass 1 for the quarter case: only the first four rows of the slice are
@ read, and cleared in the source, before idct16_quarter is run on them.
@ r0 = dst (temp buffer)
@ r2 = src
@ Expects q0-q1 = idct_coeffs, which idct16_quarter uses.
@ On return the first transposed 4x4 block is left in d16-d19 and is not
@ written to the temp buffer; the other three blocks are stored.
977cabdff1aSopenharmony_cifunction idct16_1d_4x16_pass1_quarter_neon
978cabdff1aSopenharmony_ci        push            {lr}
        @ r12 = 32 bytes = one row of coefficients; d4 = 0 clears each row
        @ right after it has been loaded.
979cabdff1aSopenharmony_ci        mov             r12, #32
980cabdff1aSopenharmony_ci        vmov.s16        q2, #0
981cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19
982cabdff1aSopenharmony_ci        vld1.16         {d\i}, [r2,:64]
983cabdff1aSopenharmony_ci        vst1.16         {d4},  [r2,:64], r12
984cabdff1aSopenharmony_ci.endr
985cabdff1aSopenharmony_ci
986cabdff1aSopenharmony_ci        bl              idct16_quarter
987cabdff1aSopenharmony_ci
988cabdff1aSopenharmony_ci        @ Do four 4x4 transposes. Originally, d16-d31 contain the
989cabdff1aSopenharmony_ci        @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
990cabdff1aSopenharmony_ci        @ contain the transposed 4x4 blocks.
991cabdff1aSopenharmony_ci        transpose16_q_4x_4x4 q8,  q9,  q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
992cabdff1aSopenharmony_ci
993cabdff1aSopenharmony_ci        @ Store the transposed 4x4 blocks horizontally.
994cabdff1aSopenharmony_ci        @ The first 4x4 block is kept in registers for the second pass,
995cabdff1aSopenharmony_ci        @ store the rest in the temp buffer.
        @ Each add of 8 bytes skips the slot of the block kept in registers.
996cabdff1aSopenharmony_ci        add             r0,  r0,  #8
997cabdff1aSopenharmony_ci        vst1.16         {d20}, [r0,:64]!
998cabdff1aSopenharmony_ci        vst1.16         {d24}, [r0,:64]!
999cabdff1aSopenharmony_ci        vst1.16         {d28}, [r0,:64]!
1000cabdff1aSopenharmony_ci        add             r0,  r0,  #8
1001cabdff1aSopenharmony_ci        vst1.16         {d21}, [r0,:64]!
1002cabdff1aSopenharmony_ci        vst1.16         {d25}, [r0,:64]!
1003cabdff1aSopenharmony_ci        vst1.16         {d29}, [r0,:64]!
1004cabdff1aSopenharmony_ci        add             r0,  r0,  #8
1005cabdff1aSopenharmony_ci        vst1.16         {d22}, [r0,:64]!
1006cabdff1aSopenharmony_ci        vst1.16         {d26}, [r0,:64]!
1007cabdff1aSopenharmony_ci        vst1.16         {d30}, [r0,:64]!
1008cabdff1aSopenharmony_ci        add             r0,  r0,  #8
1009cabdff1aSopenharmony_ci        vst1.16         {d23}, [r0,:64]!
1010cabdff1aSopenharmony_ci        vst1.16         {d27}, [r0,:64]!
1011cabdff1aSopenharmony_ci        vst1.16         {d31}, [r0,:64]!
1012cabdff1aSopenharmony_ci        pop             {pc}
1013cabdff1aSopenharmony_ciendfunc
1014cabdff1aSopenharmony_ci
@ Pass 2 of the "quarter" 16x16 IDCT for one 4-column slice.
@ r0, r1 = passed through to load_add_store after the adjustment below;
@          presumably dst and dst stride - confirm against that macro,
@          which is defined outside this view
@ r2 = src (temp buffer), read with a 32-byte step
@ r3 = slice selector on entry: 0 means d16-d19 are already in registers
@      and nothing is loaded; r3 is then reused as a second pointer
@ Modifies r1, r2, r3 and r12 directly.
1015cabdff1aSopenharmony_cifunction idct16_1d_4x16_pass2_quarter_neon
1016cabdff1aSopenharmony_ci        push            {lr}
1017cabdff1aSopenharmony_ci        @ Only load the top 4 lines, and only do it for the later slices.
1018cabdff1aSopenharmony_ci        @ For the first slice, d16-d19 is kept in registers from the first pass.
1019cabdff1aSopenharmony_ci        cmp             r3,  #0
1020cabdff1aSopenharmony_ci        beq             1f
1021cabdff1aSopenharmony_ci        mov             r12, #32                @ byte step between temp buffer rows
1022cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19
1023cabdff1aSopenharmony_ci        vld1.16         {d\i}, [r2,:64], r12
1024cabdff1aSopenharmony_ci.endr
1025cabdff1aSopenharmony_ci1:
1026cabdff1aSopenharmony_ci
1027cabdff1aSopenharmony_ci        add             r3,  r0,  r1            @ r3 = r0 advanced by one r1
1028cabdff1aSopenharmony_ci        lsl             r1,  r1,  #1            @ r1 doubled
1029cabdff1aSopenharmony_ci        bl              idct16_quarter
1030cabdff1aSopenharmony_ci
1031cabdff1aSopenharmony_ci        load_add_store  q8,  q9,  q10, q11
1032cabdff1aSopenharmony_ci        load_add_store  q12, q13, q14, q15
1033cabdff1aSopenharmony_ci
1034cabdff1aSopenharmony_ci        pop             {pc}
1035cabdff1aSopenharmony_ciendfunc
1036cabdff1aSopenharmony_ci
@ Pass 1 of the "half" 16x16 IDCT for one 4-column slice.
@ Loads eight 64-bit vectors from r2, 32 bytes apart, zeroing each one in
@ memory after it has been read, runs idct16_half and transposes the
@ result as four 4x4 blocks.
@ r0 = dst (temp buffer)
@ r1 = selects the store layout: 4 takes the special case below, any
@      other value stores all four blocks
@ r2 = src
@ When r1 == 4, the first transposed block is not stored; it is returned
@ in d20-d23 (and is still present in d16-d19).
@ Modifies r0, r2, r12 and q2 directly; idct16_half and the transpose
@ macro are defined elsewhere and may clobber further registers.
1037cabdff1aSopenharmony_cifunction idct16_1d_4x16_pass1_half_neon
1038cabdff1aSopenharmony_ci        push            {lr}
1039cabdff1aSopenharmony_ci        mov             r12, #32                @ byte step between input loads
1040cabdff1aSopenharmony_ci        vmov.s16        q2, #0                  @ d4 = zero used to clear the input
1041cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23
1042cabdff1aSopenharmony_ci        vld1.16         {d\i}, [r2,:64]
1043cabdff1aSopenharmony_ci        vst1.16         {d4},  [r2,:64], r12    @ clear what was just read, then step r2
1044cabdff1aSopenharmony_ci.endr
1045cabdff1aSopenharmony_ci
1046cabdff1aSopenharmony_ci        bl              idct16_half
1047cabdff1aSopenharmony_ci
1048cabdff1aSopenharmony_ci        @ Do four 4x4 transposes. Originally, d16-d31 contain the
1049cabdff1aSopenharmony_ci        @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
1050cabdff1aSopenharmony_ci        @ contain the transposed 4x4 blocks.
1051cabdff1aSopenharmony_ci        transpose16_q_4x_4x4 q8,  q9,  q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
1052cabdff1aSopenharmony_ci
1053cabdff1aSopenharmony_ci        @ Store the transposed 4x4 blocks horizontally.
1054cabdff1aSopenharmony_ci        cmp             r1,  #4
1055cabdff1aSopenharmony_ci        beq             1f
        @ General case: write all 16 registers back to back, taking one
        @ register from each block in turn so every 32-byte row holds
        @ one line of each of the four blocks.
1056cabdff1aSopenharmony_ci.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
1057cabdff1aSopenharmony_ci        vst1.16         {d\i}, [r0,:64]!
1058cabdff1aSopenharmony_ci.endr
1059cabdff1aSopenharmony_ci        pop             {pc}
1060cabdff1aSopenharmony_ci1:
1061cabdff1aSopenharmony_ci        @ Special case: For the second input column (r1 == 4),
1062cabdff1aSopenharmony_ci        @ which would be stored as the second row in the temp buffer,
1063cabdff1aSopenharmony_ci        @ don't store the first 4x4 block, but keep it in registers
1064cabdff1aSopenharmony_ci        @ for the first slice of the second pass (where it is the
1065cabdff1aSopenharmony_ci        @ second 4x4 block).
1066cabdff1aSopenharmony_ci        add             r0,  r0,  #8            @ gap in place of d16
1067cabdff1aSopenharmony_ci        vst1.16         {d20}, [r0,:64]!
1068cabdff1aSopenharmony_ci        vst1.16         {d24}, [r0,:64]!
1069cabdff1aSopenharmony_ci        vst1.16         {d28}, [r0,:64]!
1070cabdff1aSopenharmony_ci        add             r0,  r0,  #8            @ gap in place of d17
1071cabdff1aSopenharmony_ci        vst1.16         {d21}, [r0,:64]!
1072cabdff1aSopenharmony_ci        vst1.16         {d25}, [r0,:64]!
1073cabdff1aSopenharmony_ci        vst1.16         {d29}, [r0,:64]!
1074cabdff1aSopenharmony_ci        add             r0,  r0,  #8            @ gap in place of d18
1075cabdff1aSopenharmony_ci        vst1.16         {d22}, [r0,:64]!
1076cabdff1aSopenharmony_ci        vst1.16         {d26}, [r0,:64]!
1077cabdff1aSopenharmony_ci        vst1.16         {d30}, [r0,:64]!
1078cabdff1aSopenharmony_ci        add             r0,  r0,  #8            @ gap in place of d19
1079cabdff1aSopenharmony_ci        vst1.16         {d23}, [r0,:64]!
1080cabdff1aSopenharmony_ci        vst1.16         {d27}, [r0,:64]!
1081cabdff1aSopenharmony_ci        vst1.16         {d31}, [r0,:64]!
        @ d20-d23 have been stored above, so they can now take the copy
        @ of the unstored first block.
1082cabdff1aSopenharmony_ci        vmov            d20, d16
1083cabdff1aSopenharmony_ci        vmov            d21, d17
1084cabdff1aSopenharmony_ci        vmov            d22, d18
1085cabdff1aSopenharmony_ci        vmov            d23, d19
1086cabdff1aSopenharmony_ci        pop             {pc}
1087cabdff1aSopenharmony_ciendfunc
1088cabdff1aSopenharmony_ci
@ Pass 2 of the "half" 16x16 IDCT for one 4-column slice.
@ r0, r1 = passed through to load_add_store after the adjustment below;
@          presumably dst and dst stride - confirm against that macro,
@          which is defined outside this view
@ r2 = src (temp buffer), read with a 32-byte step
@ r3 = slice selector on entry: d16-d19 are always loaded; when r3 == 0
@      d20-d23 are not loaded and must already be in registers
@ Modifies r1, r2, r3 and r12 directly.
1089cabdff1aSopenharmony_cifunction idct16_1d_4x16_pass2_half_neon
1090cabdff1aSopenharmony_ci        push            {lr}
1091cabdff1aSopenharmony_ci        mov             r12, #32                @ byte step between temp buffer rows
1092cabdff1aSopenharmony_ci        cmp             r3,  #0                 @ flags are consumed by the beq below;
                                                @ the NEON loads in between leave them intact
1093cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19
1094cabdff1aSopenharmony_ci        vld1.16         {d\i}, [r2,:64], r12
1095cabdff1aSopenharmony_ci.endr
1096cabdff1aSopenharmony_ci        beq             1f
1097cabdff1aSopenharmony_ci.irp i, 20, 21, 22, 23
1098cabdff1aSopenharmony_ci        vld1.16         {d\i}, [r2,:64], r12
1099cabdff1aSopenharmony_ci.endr
1100cabdff1aSopenharmony_ci1:
1101cabdff1aSopenharmony_ci
1102cabdff1aSopenharmony_ci        add             r3,  r0,  r1            @ r3 = r0 advanced by one r1
1103cabdff1aSopenharmony_ci        lsl             r1,  r1,  #1            @ r1 doubled
1104cabdff1aSopenharmony_ci        bl              idct16_half
1105cabdff1aSopenharmony_ci
1106cabdff1aSopenharmony_ci        load_add_store  q8,  q9,  q10, q11
1107cabdff1aSopenharmony_ci        load_add_store  q12, q13, q14, q15
1108cabdff1aSopenharmony_ci
1109cabdff1aSopenharmony_ci        pop             {pc}
1110cabdff1aSopenharmony_ciendfunc
@ Drop the load_add_store macro definition; the name is free for reuse
@ from here on.
1111cabdff1aSopenharmony_ci.purgem load_add_store
1112cabdff1aSopenharmony_ci
@ Emits idct16x16_<size>_add_neon, where <size> is "quarter" or "half".
@ The generated function has no prologue of its own: it reads r4, r5, r6,
@ r7 and sp without setting them up, and finishes by adding r7 to sp and
@ popping r4-r8 and pc. NOTE(review): presumably it is branched to from a
@ function that has already pushed r4-r8,lr and reserved the temp buffer
@ on the stack - confirm against the caller, which is outside this view.
@ Register use visible here:
@   sp = base of the temp buffer shared by both passes
@   r6 = base of the input handed to pass 1 as src
@   r4 = base handed to pass 2 in r0 (offset by the slice index)
@   r5 = value handed to pass 2 in r1
1113cabdff1aSopenharmony_ci.macro idct16_partial size
1114cabdff1aSopenharmony_cifunction idct16x16_\size\()_add_neon
        @ Pass 1, first slice: temp buffer offset 0, input offset 0.
1115cabdff1aSopenharmony_ci        add             r0,  sp,  #(0*32)
1116cabdff1aSopenharmony_ci        mov             r1,  #0
1117cabdff1aSopenharmony_ci        add             r2,  r6,  #(0*2)
1118cabdff1aSopenharmony_ci        bl              idct16_1d_4x16_pass1_\size\()_neon
        @ The half variant runs pass 1 on a second slice as well
        @ (temp buffer offset 4*32 bytes, input offset 4*2 bytes).
1119cabdff1aSopenharmony_ci.ifc \size,half
1120cabdff1aSopenharmony_ci        add             r0,  sp,  #(4*32)
1121cabdff1aSopenharmony_ci        mov             r1,  #4
1122cabdff1aSopenharmony_ci        add             r2,  r6,  #(4*2)
1123cabdff1aSopenharmony_ci        bl              idct16_1d_4x16_pass1_\size\()_neon
1124cabdff1aSopenharmony_ci.endif
        @ Pass 2 over all four slices: r0 advances by \i bytes from r4,
        @ the temp buffer pointer by \i*2 bytes, and r3 carries \i so the
        @ callee can recognise the first slice.
1125cabdff1aSopenharmony_ci.irp i, 0, 4, 8, 12
1126cabdff1aSopenharmony_ci        add             r0,  r4,  #(\i)
1127cabdff1aSopenharmony_ci        mov             r1,  r5
1128cabdff1aSopenharmony_ci        add             r2,  sp,  #(\i*2)
1129cabdff1aSopenharmony_ci        mov             r3,  #\i
1130cabdff1aSopenharmony_ci        bl              idct16_1d_4x16_pass2_\size\()_neon
1131cabdff1aSopenharmony_ci.endr
1132cabdff1aSopenharmony_ci
1133cabdff1aSopenharmony_ci        add             sp,  sp,  r7            @ release r7 bytes of stack
1134cabdff1aSopenharmony_ci        pop             {r4-r8,pc}
1135cabdff1aSopenharmony_ciendfunc
1136cabdff1aSopenharmony_ci.endm
1137cabdff1aSopenharmony_ci
@ Generate the quarter and half variants.
1138cabdff1aSopenharmony_ciidct16_partial quarter
1139cabdff1aSopenharmony_ciidct16_partial half
1140cabdff1aSopenharmony_ci
@ DC-only 32x32 inverse transform with add.
@ Reads a single 16-bit coefficient from r2, scales it twice by the first
@ idct_coeffs entry (rounding shift right by 14 each time), clears the
@ coefficient in memory, applies a final rounding shift right by 6 and
@ adds the resulting constant to 32 rows of 32 bytes, saturating every
@ result to unsigned 8 bit.
@ r0 = dst (rows are accessed with a 128-bit alignment hint)
@ r1 = dst stride
@ r2 = pointer to the coefficient
@ Leaf function, returns with bx lr.
@ Modifies r0, r3, r12, q0-q3 and q8-q15.
1141cabdff1aSopenharmony_cifunction idct32x32_dc_add_neon
1142cabdff1aSopenharmony_ci        movrel          r12, idct_coeffs
1143cabdff1aSopenharmony_ci        vld1.16         {d0}, [r12,:64]         @ d0[0] = first idct_coeffs entry
1144cabdff1aSopenharmony_ci
1145cabdff1aSopenharmony_ci        vmov.i16        q2,  #0                 @ d4 = zero for clearing the coefficient
1146cabdff1aSopenharmony_ci
1147cabdff1aSopenharmony_ci        vld1.16         {d16[]}, [r2,:16]       @ broadcast the coefficient to all lanes
1148cabdff1aSopenharmony_ci        vmull.s16       q8,  d16, d0[0]         @ first scaling, widened to 32 bit
1149cabdff1aSopenharmony_ci        vrshrn.s32      d16, q8,  #14           @ round, shift and narrow back to 16 bit
1150cabdff1aSopenharmony_ci        vmull.s16       q8,  d16, d0[0]         @ second scaling
1151cabdff1aSopenharmony_ci        vrshrn.s32      d16, q8,  #14
1152cabdff1aSopenharmony_ci        vdup.16         q8,  d16[0]             @ fill all eight lanes of q8
1153cabdff1aSopenharmony_ci        vst1.16         {d4[0]}, [r2,:16]       @ zero the coefficient in memory
1154cabdff1aSopenharmony_ci
1155cabdff1aSopenharmony_ci        vrshr.s16       q8,  q8,  #6            @ final rounding shift
1156cabdff1aSopenharmony_ci
1157cabdff1aSopenharmony_ci        mov             r3,  r0                 @ r0 = load pointer, r3 = store pointer
1158cabdff1aSopenharmony_ci        mov             r12, #32                @ rows left to process
1159cabdff1aSopenharmony_ci1:
1160cabdff1aSopenharmony_ci        @ Loop to add the constant from q8 into all 32x32 outputs
        @ Two 32-byte rows are handled per iteration. The flags set by
        @ subs are consumed by the bne at the bottom; the NEON
        @ instructions in between do not alter them.
1161cabdff1aSopenharmony_ci        subs            r12, r12, #2
1162cabdff1aSopenharmony_ci        vld1.8          {q0-q1},  [r0,:128], r1 @ first row
1163cabdff1aSopenharmony_ci        vaddw.u8        q9,  q8,  d0
1164cabdff1aSopenharmony_ci        vaddw.u8        q10, q8,  d1
1165cabdff1aSopenharmony_ci        vld1.8          {q2-q3},  [r0,:128], r1 @ second row
1166cabdff1aSopenharmony_ci        vaddw.u8        q11, q8,  d2
1167cabdff1aSopenharmony_ci        vaddw.u8        q12, q8,  d3
1168cabdff1aSopenharmony_ci        vaddw.u8        q13, q8,  d4
1169cabdff1aSopenharmony_ci        vaddw.u8        q14, q8,  d5
1170cabdff1aSopenharmony_ci        vaddw.u8        q15, q8,  d6
1171cabdff1aSopenharmony_ci        vqmovun.s16     d0,  q9                 @ saturate to u8; frees q9
1172cabdff1aSopenharmony_ci        vaddw.u8        q9,  q8,  d7            @ q9 reused for the last 8 pixels
1173cabdff1aSopenharmony_ci        vqmovun.s16     d1,  q10
1174cabdff1aSopenharmony_ci        vqmovun.s16     d2,  q11
1175cabdff1aSopenharmony_ci        vqmovun.s16     d3,  q12
1176cabdff1aSopenharmony_ci        vqmovun.s16     d4,  q13
1177cabdff1aSopenharmony_ci        vqmovun.s16     d5,  q14
1178cabdff1aSopenharmony_ci        vst1.8          {q0-q1},  [r3,:128], r1
1179cabdff1aSopenharmony_ci        vqmovun.s16     d6,  q15
1180cabdff1aSopenharmony_ci        vqmovun.s16     d7,  q9
1181cabdff1aSopenharmony_ci        vst1.8          {q2-q3},  [r3,:128], r1
1182cabdff1aSopenharmony_ci        bne             1b
1183cabdff1aSopenharmony_ci
1184cabdff1aSopenharmony_ci        bx              lr
1185cabdff1aSopenharmony_ciendfunc
1186cabdff1aSopenharmony_ci
@ Shared tail of idct32_odd, idct32_odd_half and idct32_odd_quarter.
@ Expects the intermediate values in d8-d11 and d20-d31 as left by those
@ functions, and takes its coefficient lanes from d0 (d0[2] and d0[3]),
@ which the caller must have loaded.
@ Results are left in d16-d31 as named in the per-line comments.
@ The macro ends with bx lr, so it also performs the return of the
@ function it is expanded in.
1187cabdff1aSopenharmony_ci.macro idct32_end
1188cabdff1aSopenharmony_ci        butterfly       d16, d9,  d8,  d9  @ d16 = t16a, d9  = t19a
1189cabdff1aSopenharmony_ci        butterfly       d17, d20, d23, d20 @ d17 = t17,  d20 = t18
1190cabdff1aSopenharmony_ci        butterfly       d18, d10, d11, d10 @ d18 = t23a, d10 = t20a
1191cabdff1aSopenharmony_ci        butterfly       d19, d21, d22, d21 @ d19 = t22,  d21 = t21
1192cabdff1aSopenharmony_ci        butterfly       d8,  d28, d28, d30 @ d8  = t24a, d28 = t27a
1193cabdff1aSopenharmony_ci        butterfly       d23, d26, d25, d26 @ d23 = t25,  d26 = t26
1194cabdff1aSopenharmony_ci        butterfly       d11, d29, d29, d31 @ d11 = t31a, d29 = t28a
1195cabdff1aSopenharmony_ci        butterfly       d22, d27, d24, d27 @ d22 = t30,  d27 = t29
1196cabdff1aSopenharmony_ci
        @ d24, d25, d30 and d31 hold no live value after the stage above,
        @ which is what allows q12 and q15 to be handed to mbutterfly here
        @ (presumably as scratch - the macro is defined elsewhere).
1197cabdff1aSopenharmony_ci        mbutterfly      d27, d20, d0[2], d0[3], q12, q15        @ d27 = t18a, d20 = t29a
1198cabdff1aSopenharmony_ci        mbutterfly      d29, d9,  d0[2], d0[3], q12, q15        @ d29 = t19,  d9  = t28
1199cabdff1aSopenharmony_ci        mbutterfly      d28, d10, d0[2], d0[3], q12, q15, neg=1 @ d28 = t27,  d10 = t20
1200cabdff1aSopenharmony_ci        mbutterfly      d26, d21, d0[2], d0[3], q12, q15, neg=1 @ d26 = t26a, d21 = t21a
1201cabdff1aSopenharmony_ci
1202cabdff1aSopenharmony_ci        butterfly       d31, d24, d11, d8  @ d31 = t31,  d24 = t24
1203cabdff1aSopenharmony_ci        butterfly       d30, d25, d22, d23 @ d30 = t30a, d25 = t25a
1204cabdff1aSopenharmony_ci        butterfly_r     d23, d16, d16, d18 @ d23 = t23,  d16 = t16
1205cabdff1aSopenharmony_ci        butterfly_r     d22, d17, d17, d19 @ d22 = t22a, d17 = t17a
1206cabdff1aSopenharmony_ci        butterfly       d18, d21, d27, d21 @ d18 = t18,  d21 = t21
1207cabdff1aSopenharmony_ci        butterfly_r     d27, d28, d9,  d28 @ d27 = t27a, d28 = t28a
1208cabdff1aSopenharmony_ci        butterfly       d8,  d26, d20, d26 @ d8  = t29,  d26 = t26
1209cabdff1aSopenharmony_ci        butterfly       d19, d20, d29, d10 @ d19 = t19a, d20 = t20
1210cabdff1aSopenharmony_ci        vmov            d29, d8            @ d29 = t29
1211cabdff1aSopenharmony_ci
        @ t29 now lives in d29, so d8 and d10 (and q4/q5) are available
        @ to be passed to mbutterfly0.
1212cabdff1aSopenharmony_ci        mbutterfly0     d27, d20, d27, d20, d8, d10, q4, q5 @ d27 = t27,  d20 = t20
1213cabdff1aSopenharmony_ci        mbutterfly0     d26, d21, d26, d21, d8, d10, q4, q5 @ d26 = t26a, d21 = t21a
1214cabdff1aSopenharmony_ci        mbutterfly0     d25, d22, d25, d22, d8, d10, q4, q5 @ d25 = t25,  d22 = t22
1215cabdff1aSopenharmony_ci        mbutterfly0     d24, d23, d24, d23, d8, d10, q4, q5 @ d24 = t24a, d23 = t23a
1216cabdff1aSopenharmony_ci        bx              lr
1217cabdff1aSopenharmony_ci.endm
1218cabdff1aSopenharmony_ci
@ Odd-input part of the 32-point IDCT, full version: all of d16-d31 are
@ taken as inputs.
@ Coefficient lanes are read from d4-d7 (q2-q3) in the first stage, from
@ d1 in the third stage and from d0 inside idct32_end; the caller must
@ have loaded all of them.
@ Uses d8-d11 for intermediate values and passes q4/q5 and q8/q9 to the
@ mbutterfly macro as extra operands.
@ There is no explicit return here: the bx lr is part of idct32_end.
1219cabdff1aSopenharmony_cifunction idct32_odd
1220cabdff1aSopenharmony_ci        mbutterfly      d16, d31, d4[0], d4[1], q4, q5 @ d16 = t16a, d31 = t31a
1221cabdff1aSopenharmony_ci        mbutterfly      d24, d23, d4[2], d4[3], q4, q5 @ d24 = t17a, d23 = t30a
1222cabdff1aSopenharmony_ci        mbutterfly      d20, d27, d5[0], d5[1], q4, q5 @ d20 = t18a, d27 = t29a
1223cabdff1aSopenharmony_ci        mbutterfly      d28, d19, d5[2], d5[3], q4, q5 @ d28 = t19a, d19 = t28a
1224cabdff1aSopenharmony_ci        mbutterfly      d18, d29, d6[0], d6[1], q4, q5 @ d18 = t20a, d29 = t27a
1225cabdff1aSopenharmony_ci        mbutterfly      d26, d21, d6[2], d6[3], q4, q5 @ d26 = t21a, d21 = t26a
1226cabdff1aSopenharmony_ci        mbutterfly      d22, d25, d7[0], d7[1], q4, q5 @ d22 = t22a, d25 = t25a
1227cabdff1aSopenharmony_ci        mbutterfly      d30, d17, d7[2], d7[3], q4, q5 @ d30 = t23a, d17 = t24a
1228cabdff1aSopenharmony_ci
1229cabdff1aSopenharmony_ci        butterfly       d8,  d24, d16, d24 @ d8  = t16, d24 = t17
1230cabdff1aSopenharmony_ci        butterfly       d9,  d20, d28, d20 @ d9  = t19, d20 = t18
1231cabdff1aSopenharmony_ci        butterfly       d10, d26, d18, d26 @ d10 = t20, d26 = t21
1232cabdff1aSopenharmony_ci        butterfly       d11, d22, d30, d22 @ d11 = t23, d22 = t22
1233cabdff1aSopenharmony_ci        butterfly       d28, d25, d17, d25 @ d28 = t24, d25 = t25
1234cabdff1aSopenharmony_ci        butterfly       d30, d21, d29, d21 @ d30 = t27, d21 = t26
1235cabdff1aSopenharmony_ci        butterfly       d29, d23, d31, d23 @ d29 = t31, d23 = t30
1236cabdff1aSopenharmony_ci        butterfly       d31, d27, d19, d27 @ d31 = t28, d27 = t29
1237cabdff1aSopenharmony_ci
        @ d16-d19 (q8, q9) have all been consumed by the stage above, so
        @ q8/q9 replace q4/q5 (now holding t16, t19, t20, t23) as the
        @ extra mbutterfly operands.
1238cabdff1aSopenharmony_ci        mbutterfly      d23, d24, d1[0], d1[1], q8, q9        @ d23 = t17a, d24 = t30a
1239cabdff1aSopenharmony_ci        mbutterfly      d27, d20, d1[0], d1[1], q8, q9, neg=1 @ d27 = t29a, d20 = t18a
1240cabdff1aSopenharmony_ci        mbutterfly      d21, d26, d1[2], d1[3], q8, q9        @ d21 = t21a, d26 = t26a
1241cabdff1aSopenharmony_ci        mbutterfly      d25, d22, d1[2], d1[3], q8, q9, neg=1 @ d25 = t25a, d22 = t22a
1242cabdff1aSopenharmony_ci        idct32_end
1243cabdff1aSopenharmony_ciendfunc
1244cabdff1aSopenharmony_ci
@ Odd-input part of the 32-point IDCT for the "half" case.
@ Identical to idct32_odd except that the first stage uses the
@ mbutterfly_h1 / mbutterfly_h2 macros. Every first-stage line names
@ exactly one register out of d16-d23: as the first operand for _h1 and
@ as the second operand for _h2. NOTE(review): presumably those macros
@ only read that operand and treat the other input as zero - confirm
@ against their definitions, which are outside this view.
@ Coefficient lanes come from d4-d7, d1 and (inside idct32_end) d0.
@ There is no explicit return here: the bx lr is part of idct32_end.
1245cabdff1aSopenharmony_cifunction idct32_odd_half
1246cabdff1aSopenharmony_ci        mbutterfly_h1   d16, d31, d4[0], d4[1], q4, q5 @ d16 = t16a, d31 = t31a
1247cabdff1aSopenharmony_ci        mbutterfly_h2   d24, d23, d4[2], d4[3], q4, q5 @ d24 = t17a, d23 = t30a
1248cabdff1aSopenharmony_ci        mbutterfly_h1   d20, d27, d5[0], d5[1], q4, q5 @ d20 = t18a, d27 = t29a
1249cabdff1aSopenharmony_ci        mbutterfly_h2   d28, d19, d5[2], d5[3], q4, q5 @ d28 = t19a, d19 = t28a
1250cabdff1aSopenharmony_ci        mbutterfly_h1   d18, d29, d6[0], d6[1], q4, q5 @ d18 = t20a, d29 = t27a
1251cabdff1aSopenharmony_ci        mbutterfly_h2   d26, d21, d6[2], d6[3], q4, q5 @ d26 = t21a, d21 = t26a
1252cabdff1aSopenharmony_ci        mbutterfly_h1   d22, d25, d7[0], d7[1], q4, q5 @ d22 = t22a, d25 = t25a
1253cabdff1aSopenharmony_ci        mbutterfly_h2   d30, d17, d7[2], d7[3], q4, q5 @ d30 = t23a, d17 = t24a
1254cabdff1aSopenharmony_ci
1255cabdff1aSopenharmony_ci        butterfly       d8,  d24, d16, d24 @ d8  = t16, d24 = t17
1256cabdff1aSopenharmony_ci        butterfly       d9,  d20, d28, d20 @ d9  = t19, d20 = t18
1257cabdff1aSopenharmony_ci        butterfly       d10, d26, d18, d26 @ d10 = t20, d26 = t21
1258cabdff1aSopenharmony_ci        butterfly       d11, d22, d30, d22 @ d11 = t23, d22 = t22
1259cabdff1aSopenharmony_ci        butterfly       d28, d25, d17, d25 @ d28 = t24, d25 = t25
1260cabdff1aSopenharmony_ci        butterfly       d30, d21, d29, d21 @ d30 = t27, d21 = t26
1261cabdff1aSopenharmony_ci        butterfly       d29, d23, d31, d23 @ d29 = t31, d23 = t30
1262cabdff1aSopenharmony_ci        butterfly       d31, d27, d19, d27 @ d31 = t28, d27 = t29
1263cabdff1aSopenharmony_ci
        @ d16-d19 (q8, q9) have all been consumed by the stage above, so
        @ q8/q9 replace q4/q5 as the extra mbutterfly operands.
1264cabdff1aSopenharmony_ci        mbutterfly      d23, d24, d1[0], d1[1], q8, q9        @ d23 = t17a, d24 = t30a
1265cabdff1aSopenharmony_ci        mbutterfly      d27, d20, d1[0], d1[1], q8, q9, neg=1 @ d27 = t29a, d20 = t18a
1266cabdff1aSopenharmony_ci        mbutterfly      d21, d26, d1[2], d1[3], q8, q9        @ d21 = t21a, d26 = t26a
1267cabdff1aSopenharmony_ci        mbutterfly      d25, d22, d1[2], d1[3], q8, q9, neg=1 @ d25 = t25a, d22 = t22a
1268cabdff1aSopenharmony_ci
1269cabdff1aSopenharmony_ci        idct32_end
1270cabdff1aSopenharmony_ciendfunc
1271cabdff1aSopenharmony_ci
@ Odd-input part of the 32-point IDCT for the "quarter" case: only
@ d16-d19 are read as inputs.
@ Each input is multiplied by two coefficient lanes from d4-d7, two of
@ the products are negated, and everything is narrowed with a rounding
@ shift right by 14 into d8-d11 and d28-d31. These are the registers the
@ full idct32_odd fills with t16/t19/t20/t23 and t24/t27/t31/t28 before
@ its third stage, so this block appears to be the collapsed form of its
@ first two stages.
@ The second part pairs those values through mbutterfly_l with the d1
@ lanes and narrows the results explicitly into d20-d27, the same
@ destination registers idct32_odd uses before idct32_end.
@ There is no explicit return here: the bx lr is part of idct32_end.
1272cabdff1aSopenharmony_cifunction idct32_odd_quarter
1273cabdff1aSopenharmony_ci        vmull.s16       q4,  d16, d4[0]
1274cabdff1aSopenharmony_ci        vmull.s16       q14, d19, d5[3]
1275cabdff1aSopenharmony_ci        vmull.s16       q15, d16, d4[1]
1276cabdff1aSopenharmony_ci        vmull.s16       q11, d17, d7[2]
1277cabdff1aSopenharmony_ci        vmull.s16       q5,  d17, d7[3]
1278cabdff1aSopenharmony_ci        vmull.s16       q13, d19, d5[2]
1279cabdff1aSopenharmony_ci        vmull.s16       q10, d18, d6[0]
1280cabdff1aSopenharmony_ci        vmull.s16       q12, d18, d6[1]
1281cabdff1aSopenharmony_ci
        @ Negate the d19*d5[3] and d17*d7[3] products.
1282cabdff1aSopenharmony_ci        vneg.s32        q14, q14
1283cabdff1aSopenharmony_ci        vneg.s32        q5,  q5
1284cabdff1aSopenharmony_ci
        @ Round and narrow the 32-bit products back to 16 bit.
1285cabdff1aSopenharmony_ci        vrshrn.s32      d8,  q4,  #14
1286cabdff1aSopenharmony_ci        vrshrn.s32      d9,  q14, #14
1287cabdff1aSopenharmony_ci        vrshrn.s32      d29, q15, #14
1288cabdff1aSopenharmony_ci        vrshrn.s32      d28, q11, #14
1289cabdff1aSopenharmony_ci        vrshrn.s32      d11, q5,  #14
1290cabdff1aSopenharmony_ci        vrshrn.s32      d31, q13, #14
1291cabdff1aSopenharmony_ci        vrshrn.s32      d10, q10, #14
1292cabdff1aSopenharmony_ci        vrshrn.s32      d30, q12, #14
1293cabdff1aSopenharmony_ci
1294cabdff1aSopenharmony_ci        mbutterfly_l    q8,  q9,  d29, d8,  d1[0], d1[1]
1295cabdff1aSopenharmony_ci        mbutterfly_l    q13, q10, d31, d9,  d1[0], d1[1]
1296cabdff1aSopenharmony_ci        vrshrn.s32      d23, q8,  #14
1297cabdff1aSopenharmony_ci        vrshrn.s32      d24, q9,  #14
1298cabdff1aSopenharmony_ci        vneg.s32        q10, q10
1299cabdff1aSopenharmony_ci        vrshrn.s32      d27, q13, #14
1300cabdff1aSopenharmony_ci        vrshrn.s32      d20, q10, #14
1301cabdff1aSopenharmony_ci        mbutterfly_l    q8,  q9,  d30, d10, d1[2], d1[3]
1302cabdff1aSopenharmony_ci        vrshrn.s32      d21, q8,  #14
1303cabdff1aSopenharmony_ci        vrshrn.s32      d26, q9,  #14
1304cabdff1aSopenharmony_ci        mbutterfly_l    q8,  q9,  d28, d11, d1[2], d1[3]
1305cabdff1aSopenharmony_ci        vrshrn.s32      d25, q8,  #14
1306cabdff1aSopenharmony_ci        vneg.s32        q9,  q9
1307cabdff1aSopenharmony_ci        vrshrn.s32      d22, q9,  #14
1308cabdff1aSopenharmony_ci
1309cabdff1aSopenharmony_ci        idct32_end
1310cabdff1aSopenharmony_ciendfunc
1311cabdff1aSopenharmony_ci
1312cabdff1aSopenharmony_ci.macro idct32_funcs suffix
1313cabdff1aSopenharmony_ci@ Do a 32-point IDCT of a 4x32 slice out of a 32x32 matrix.
1314cabdff1aSopenharmony_ci@ We don't have register space to do a single pass IDCT of 4x32 though,
1315cabdff1aSopenharmony_ci@ but the 32-point IDCT can be decomposed into two 16-point IDCTs;
1316cabdff1aSopenharmony_ci@ a normal IDCT16 with every other input component (the even ones, with
1317cabdff1aSopenharmony_ci@ each output written twice), followed by a separate 16-point IDCT
1318cabdff1aSopenharmony_ci@ of the odd inputs, added/subtracted onto the outputs of the first idct16.
1319cabdff1aSopenharmony_ci@ r0 = dst (temp buffer)
1320cabdff1aSopenharmony_ci@ r1 = unused
1321cabdff1aSopenharmony_ci@ r2 = src
1322cabdff1aSopenharmony_cifunction idct32_1d_4x32_pass1\suffix\()_neon
1323cabdff1aSopenharmony_ci        push            {lr}
1324cabdff1aSopenharmony_ci
1325cabdff1aSopenharmony_ci        @ idct16 clobbers q2-q3 (since it doesn't clobber q4-q7 at all
1326cabdff1aSopenharmony_ci        @ when doing the normal 16x16 idct), so move the idct32_odd coeffs
1327cabdff1aSopenharmony_ci        @ to q4-q5
1328cabdff1aSopenharmony_ci        vmov            q4,  q2
1329cabdff1aSopenharmony_ci        vmov            q5,  q3
1330cabdff1aSopenharmony_ci
1331cabdff1aSopenharmony_ci        @ Double stride of the input, since we only read every other line
1332cabdff1aSopenharmony_ci        mov             r12, #128
1333cabdff1aSopenharmony_ci        vmov.s16        d4,  #0                 @ zero written back over each input read
1334cabdff1aSopenharmony_ci
1335cabdff1aSopenharmony_ci        @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
        @ The number of lines loaded depends on the variant: 16 for the
        @ full version, 4 for _quarter and 8 for _half.
1336cabdff1aSopenharmony_ci.ifb \suffix
1337cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1338cabdff1aSopenharmony_ci        vld1.16         {d\i}, [r2,:64]
1339cabdff1aSopenharmony_ci        vst1.16         {d4},  [r2,:64], r12
1340cabdff1aSopenharmony_ci.endr
1341cabdff1aSopenharmony_ci.endif
1342cabdff1aSopenharmony_ci.ifc \suffix,_quarter
1343cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19
1344cabdff1aSopenharmony_ci        vld1.16         {d\i}, [r2,:64]
1345cabdff1aSopenharmony_ci        vst1.16         {d4},  [r2,:64], r12
1346cabdff1aSopenharmony_ci.endr
1347cabdff1aSopenharmony_ci.endif
1348cabdff1aSopenharmony_ci.ifc \suffix,_half
1349cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23
1350cabdff1aSopenharmony_ci        vld1.16         {d\i}, [r2,:64]
1351cabdff1aSopenharmony_ci        vst1.16         {d4},  [r2,:64], r12
1352cabdff1aSopenharmony_ci.endr
1353cabdff1aSopenharmony_ci.endif
1354cabdff1aSopenharmony_ci
1355cabdff1aSopenharmony_ci        bl              idct16\suffix
1356cabdff1aSopenharmony_ci
1357cabdff1aSopenharmony_ci        @ Move the idct32_odd coeffs back into q2-q3 for idct32_odd;
1358cabdff1aSopenharmony_ci        @ the constants for a vmul with a lane must be in q0-q3.
1359cabdff1aSopenharmony_ci        vmov            q2,  q4
1360cabdff1aSopenharmony_ci        vmov            q3,  q5
1361cabdff1aSopenharmony_ci
1362cabdff1aSopenharmony_ci        @ Do four 4x4 transposes. Originally, d16-d31 contain the
1363cabdff1aSopenharmony_ci        @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
1364cabdff1aSopenharmony_ci        @ contain the transposed 4x4 blocks.
1365cabdff1aSopenharmony_ci        transpose16_q_4x_4x4 q8,  q9,  q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
1366cabdff1aSopenharmony_ci        @ Store the registers a, b, c, d horizontally, followed
1367cabdff1aSopenharmony_ci        @ by the same registers d, c, b, a mirrored.
1368cabdff1aSopenharmony_ci.macro store_rev a, b, c, d
1369cabdff1aSopenharmony_ci.irp i, \a, \b, \c, \d
1370cabdff1aSopenharmony_ci        vst1.16         {d\i}, [r0,:64]!
1371cabdff1aSopenharmony_ci        vrev64.16       d\i, d\i
1372cabdff1aSopenharmony_ci.endr
1373cabdff1aSopenharmony_ci.irp i, \d, \c, \b, \a
1374cabdff1aSopenharmony_ci        vst1.16         {d\i}, [r0,:64]!
1375cabdff1aSopenharmony_ci.endr
1376cabdff1aSopenharmony_ci.endm
1377cabdff1aSopenharmony_ci        store_rev       16, 20, 24, 28
1378cabdff1aSopenharmony_ci        store_rev       17, 21, 25, 29
1379cabdff1aSopenharmony_ci        store_rev       18, 22, 26, 30
1380cabdff1aSopenharmony_ci        store_rev       19, 23, 27, 31
        @ Rewind r0 to where it started: four store_rev expansions of
        @ eight 8-byte stores each advanced it by 256 bytes.
1381cabdff1aSopenharmony_ci        sub             r0,  r0,  #256
1382cabdff1aSopenharmony_ci.purgem store_rev
1383cabdff1aSopenharmony_ci
1384cabdff1aSopenharmony_ci        @ Move r2 back to the start of the input, and move
1385cabdff1aSopenharmony_ci        @ to the first odd row
        @ The amount rewound is r12 times the number of lines that were
        @ loaded above (16, 4 or 8).
1386cabdff1aSopenharmony_ci.ifb \suffix
1387cabdff1aSopenharmony_ci        sub             r2,  r2,  r12, lsl #4
1388cabdff1aSopenharmony_ci.endif
1389cabdff1aSopenharmony_ci.ifc \suffix,_quarter
1390cabdff1aSopenharmony_ci        sub             r2,  r2,  r12, lsl #2
1391cabdff1aSopenharmony_ci.endif
1392cabdff1aSopenharmony_ci.ifc \suffix,_half
1393cabdff1aSopenharmony_ci        sub             r2,  r2,  r12, lsl #3
1394cabdff1aSopenharmony_ci.endif
1395cabdff1aSopenharmony_ci        add             r2,  r2,  #64           @ half of the doubled stride in r12
1396cabdff1aSopenharmony_ci
1397cabdff1aSopenharmony_ci        vmov.s16        d8,  #0                 @ zero written back over each input read
1398cabdff1aSopenharmony_ci        @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
1399cabdff1aSopenharmony_ci.ifb \suffix
1400cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1401cabdff1aSopenharmony_ci        vld1.16         {d\i}, [r2,:64]
1402cabdff1aSopenharmony_ci        vst1.16         {d8},  [r2,:64], r12
1403cabdff1aSopenharmony_ci.endr
1404cabdff1aSopenharmony_ci.endif
1405cabdff1aSopenharmony_ci.ifc \suffix,_quarter
1406cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19
1407cabdff1aSopenharmony_ci        vld1.16         {d\i}, [r2,:64]
1408cabdff1aSopenharmony_ci        vst1.16         {d8},  [r2,:64], r12
1409cabdff1aSopenharmony_ci.endr
1410cabdff1aSopenharmony_ci.endif
1411cabdff1aSopenharmony_ci.ifc \suffix,_half
1412cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23
1413cabdff1aSopenharmony_ci        vld1.16         {d\i}, [r2,:64]
1414cabdff1aSopenharmony_ci        vst1.16         {d8},  [r2,:64], r12
1415cabdff1aSopenharmony_ci.endr
1416cabdff1aSopenharmony_ci.endif
1417cabdff1aSopenharmony_ci
1418cabdff1aSopenharmony_ci        bl              idct32_odd\suffix
1419cabdff1aSopenharmony_ci
        @ Same transpose macro as for the even half, but with the
        @ register list given in reverse order (q15 down to q8).
1420cabdff1aSopenharmony_ci        transpose16_q_4x_4x4 q15, q14, q13, q12, q11, q10, q9,  q8,  d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16
1421cabdff1aSopenharmony_ci
1422cabdff1aSopenharmony_ci        @ Store the registers a, b, c, d horizontally,
1423cabdff1aSopenharmony_ci        @ adding into the output first, and then mirrored, subtracted
1424cabdff1aSopenharmony_ci        @ from the output.
        @ d8 serves as the accumulator for each read-modify-write of the
        @ temp buffer.
1425cabdff1aSopenharmony_ci.macro store_rev a, b, c, d
1426cabdff1aSopenharmony_ci.irp i, \a, \b, \c, \d
1427cabdff1aSopenharmony_ci        vld1.16         {d8},  [r0,:64]
1428cabdff1aSopenharmony_ci        vadd.s16        d8,  d8,  d\i
1429cabdff1aSopenharmony_ci        vst1.16         {d8},  [r0,:64]!
1430cabdff1aSopenharmony_ci        vrev64.16       d\i, d\i
1431cabdff1aSopenharmony_ci.endr
1432cabdff1aSopenharmony_ci.irp i, \d, \c, \b, \a
1433cabdff1aSopenharmony_ci        vld1.16         {d8},  [r0,:64]
1434cabdff1aSopenharmony_ci        vsub.s16        d8,  d8,  d\i
1435cabdff1aSopenharmony_ci        vst1.16         {d8},  [r0,:64]!
1436cabdff1aSopenharmony_ci.endr
1437cabdff1aSopenharmony_ci.endm
1438cabdff1aSopenharmony_ci
1439cabdff1aSopenharmony_ci        store_rev       31, 27, 23, 19
1440cabdff1aSopenharmony_ci        store_rev       30, 26, 22, 18
1441cabdff1aSopenharmony_ci        store_rev       29, 25, 21, 17
1442cabdff1aSopenharmony_ci        store_rev       28, 24, 20, 16
1443cabdff1aSopenharmony_ci.purgem store_rev
1444cabdff1aSopenharmony_ci        pop             {pc}
1445cabdff1aSopenharmony_ciendfunc
1446cabdff1aSopenharmony_ci.ltorg
1447cabdff1aSopenharmony_ci
1448cabdff1aSopenharmony_ci@ This is mostly the same as 4x32_pass1, but without the transpose,
1449cabdff1aSopenharmony_ci@ and use the source as temp buffer between the two idct passes, and
1450cabdff1aSopenharmony_ci@ add into the destination.
1451cabdff1aSopenharmony_ci@ r0 = dst
1452cabdff1aSopenharmony_ci@ r1 = dst stride
1453cabdff1aSopenharmony_ci@ r2 = src (temp buffer)
1454cabdff1aSopenharmony_cifunction idct32_1d_4x32_pass2\suffix\()_neon
1455cabdff1aSopenharmony_ci        push            {lr}
1456cabdff1aSopenharmony_ci        vmov            q4,  q2
1457cabdff1aSopenharmony_ci        vmov            q5,  q3
1458cabdff1aSopenharmony_ci
1459cabdff1aSopenharmony_ci        mov             r12, #128
1460cabdff1aSopenharmony_ci        @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
1461cabdff1aSopenharmony_ci.ifb \suffix
1462cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1463cabdff1aSopenharmony_ci        vld1.16         {d\i}, [r2,:64], r12
1464cabdff1aSopenharmony_ci.endr
1465cabdff1aSopenharmony_ci        sub             r2,  r2,  r12, lsl #4
1466cabdff1aSopenharmony_ci.endif
1467cabdff1aSopenharmony_ci.ifc \suffix,_quarter
1468cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19
1469cabdff1aSopenharmony_ci        vld1.16         {d\i}, [r2,:64], r12
1470cabdff1aSopenharmony_ci.endr
1471cabdff1aSopenharmony_ci        sub             r2,  r2,  r12, lsl #2
1472cabdff1aSopenharmony_ci.endif
1473cabdff1aSopenharmony_ci.ifc \suffix,_half
1474cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23
1475cabdff1aSopenharmony_ci        vld1.16         {d\i}, [r2,:64], r12
1476cabdff1aSopenharmony_ci.endr
1477cabdff1aSopenharmony_ci        sub             r2,  r2,  r12, lsl #3
1478cabdff1aSopenharmony_ci.endif
1479cabdff1aSopenharmony_ci
1480cabdff1aSopenharmony_ci        bl              idct16\suffix
1481cabdff1aSopenharmony_ci
1482cabdff1aSopenharmony_ci        vmov            q2,  q4
1483cabdff1aSopenharmony_ci        vmov            q3,  q5
1484cabdff1aSopenharmony_ci
1485cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1486cabdff1aSopenharmony_ci        vst1.16         {d\i}, [r2,:64], r12
1487cabdff1aSopenharmony_ci.endr
1488cabdff1aSopenharmony_ci
1489cabdff1aSopenharmony_ci        sub             r2,  r2,  r12, lsl #4
1490cabdff1aSopenharmony_ci        add             r2,  r2,  #64
1491cabdff1aSopenharmony_ci
1492cabdff1aSopenharmony_ci        @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
1493cabdff1aSopenharmony_ci.ifb \suffix
1494cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1495cabdff1aSopenharmony_ci        vld1.16         {d\i}, [r2,:64], r12
1496cabdff1aSopenharmony_ci.endr
1497cabdff1aSopenharmony_ci        sub             r2,  r2,  r12, lsl #4
1498cabdff1aSopenharmony_ci.endif
1499cabdff1aSopenharmony_ci.ifc \suffix,_quarter
1500cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19
1501cabdff1aSopenharmony_ci        vld1.16         {d\i}, [r2,:64], r12
1502cabdff1aSopenharmony_ci.endr
1503cabdff1aSopenharmony_ci        sub             r2,  r2,  r12, lsl #2
1504cabdff1aSopenharmony_ci.endif
1505cabdff1aSopenharmony_ci.ifc \suffix,_half
1506cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23
1507cabdff1aSopenharmony_ci        vld1.16         {d\i}, [r2,:64], r12
1508cabdff1aSopenharmony_ci.endr
1509cabdff1aSopenharmony_ci        sub             r2,  r2,  r12, lsl #3
1510cabdff1aSopenharmony_ci.endif
1511cabdff1aSopenharmony_ci        sub             r2,  r2,  #64
1512cabdff1aSopenharmony_ci
1513cabdff1aSopenharmony_ci        bl              idct32_odd\suffix
1514cabdff1aSopenharmony_ci
1515cabdff1aSopenharmony_ci        mov             r12, #128
@ load_acc_store a, b, c, d, neg=0
@ Combine four stored rows of 16-bit values with four d registers, round,
@ and add the result onto four rows of four 8-bit pixels.
@   a, b, c, d = numbers of the d registers used as second operand
@   neg        = 0: stored row + register, otherwise: stored row - register
@   r2         = address of the 16-bit rows, stepped by r12 after each row
@   r12        = signed byte step for r2
@   r0         = address of the 8-bit rows, r1 = byte step between them
@ On exit r2 has moved by 4 steps of r12 and r0 by 4 steps of r1.
@ Clobbers d8-d13 (q4-q6).
.macro load_acc_store a, b, c, d, neg=0
        vld1.16         {d8},  [r2,:64], r12
        vld1.16         {d9},  [r2,:64], r12
        @ The remaining two row loads are interleaved with the arithmetic
.if \neg == 0
        vadd.s16        d8,  d8,  d\a
        vld1.16         {d10}, [r2,:64], r12
        vadd.s16        d9,  d9,  d\b
        vld1.16         {d11}, [r2,:64], r12
        vadd.s16        d10, d10, d\c
        vadd.s16        d11, d11, d\d
.else
        vsub.s16        d8,  d8,  d\a
        vld1.16         {d10}, [r2,:64], r12
        vsub.s16        d9,  d9,  d\b
        vld1.16         {d11}, [r2,:64], r12
        vsub.s16        d10, d10, d\c
        vsub.s16        d11, d11, d\d
.endif
        @ Fetch the four 4-pixel rows into d12/d13 (lane 0 = first row of
        @ the pair, lane 1 = second) while q4/q5 get the rounding shift
        @ right by 6.
        vld1.32         {d12[]},  [r0,:32], r1
        vld1.32         {d12[1]}, [r0,:32], r1
        vrshr.s16       q4,  q4,  #6
        vld1.32         {d13[]},  [r0,:32], r1
        vrshr.s16       q5,  q5,  #6
        vld1.32         {d13[1]}, [r0,:32], r1
        @ Step r0 back to the first of the four rows for the write-back
        sub             r0,  r0,  r1, lsl #2
        @ Widen the pixels, add, then narrow with unsigned saturation
        vaddw.u8        q4,  q4,  d12
        vaddw.u8        q5,  q5,  d13
        vqmovun.s16     d8,  q4
        vqmovun.s16     d9,  q5
        vst1.32         {d8[0]},  [r0,:32], r1
        vst1.32         {d8[1]},  [r0,:32], r1
        vst1.32         {d9[0]},  [r0,:32], r1
        vst1.32         {d9[1]},  [r0,:32], r1
.endm
1550cabdff1aSopenharmony_ci        load_acc_store  31, 30, 29, 28
1551cabdff1aSopenharmony_ci        load_acc_store  27, 26, 25, 24
1552cabdff1aSopenharmony_ci        load_acc_store  23, 22, 21, 20
1553cabdff1aSopenharmony_ci        load_acc_store  19, 18, 17, 16
1554cabdff1aSopenharmony_ci        sub             r2,  r2,  r12
1555cabdff1aSopenharmony_ci        neg             r12, r12
1556cabdff1aSopenharmony_ci        load_acc_store  16, 17, 18, 19, 1
1557cabdff1aSopenharmony_ci        load_acc_store  20, 21, 22, 23, 1
1558cabdff1aSopenharmony_ci        load_acc_store  24, 25, 26, 27, 1
1559cabdff1aSopenharmony_ci        load_acc_store  28, 29, 30, 31, 1
1560cabdff1aSopenharmony_ci.purgem load_acc_store
1561cabdff1aSopenharmony_ci        pop             {pc}
1562cabdff1aSopenharmony_ciendfunc
1563cabdff1aSopenharmony_ci.endm
1564cabdff1aSopenharmony_ci
@ Expand idct32_funcs three times: with an empty suffix, and with the
@ _quarter and _half suffixes.
idct32_funcs
idct32_funcs _quarter
idct32_funcs _half
1568cabdff1aSopenharmony_ci
@ Thresholds for r3 used by ff_vp9_idct_idct_32x32_add_neon.
@ Entry k (k = 1..7) is read before pass 1 of the k-th 4-wide slice; when
@ r3 <= entry k, that slice and all later ones are zero-filled instead of
@ being transformed. Entry 0 is not read there (the walk starts 2 bytes in).
@ NOTE(review): going by the table name, r3 is the eob value - confirm
@ against the C prototype.
const min_eob_idct_idct_32, align=4
        .short  0, 9, 34, 70, 135, 240, 336, 448
endconst
1572cabdff1aSopenharmony_ci
@ ff_vp9_idct_idct_32x32_add_neon
@ 32x32 inverse transform done as two 1-D passes over 4-wide slices, with
@ the result added onto 8-bit pixels.
@
@ In:
@   r0 = destination pixels              (kept in r4)
@   r1 = destination row step in bytes   (kept in r5)
@   r2 = 16-bit input coefficients       (kept in r6)
@   r3 = value compared against 1, 34, 135 and min_eob_idct_idct_32
@        (presumably the eob count - confirm against the C prototype)
@
@ Frame: r4-r8, lr and q4-q6 are saved, then a 2048 byte temp buffer is
@ placed at a 16-byte aligned sp. r7 holds the total sp adjustment.
@ Pass 1 writes the buffer in 256 byte slices (sp + i*64), pass 2 reads it
@ at sp + i*2.
function ff_vp9_idct_idct_32x32_add_neon, export=1
        @ r3 == 1 is handed to idct32x32_dc_add_neon with a plain branch;
        @ nothing has been pushed and lr is untouched at this point.
        cmp             r3,  #1
        beq             idct32x32_dc_add_neon
        push            {r4-r8,lr}
        vpush           {q4-q6}

        @ Align the stack, allocate a temp buffer.
        @ r7 = (sp & 15) + 2048, so sp - r7 is a multiple of 16.
        @ NOTE(review): the T and A line prefixes presumably select the
        @ Thumb and ARM variants via macros from asm.S - confirm.
T       mov             r7,  sp
T       and             r7,  r7,  #15
A       and             r7,  sp,  #15
        add             r7,  r7,  #2048
        sub             sp,  sp,  r7

        @ r0-r2 are reused as arguments for the 1-D passes below
        mov             r4,  r0
        mov             r5,  r1
        mov             r6,  r2

        @ q0-q3 = 64 bytes of constants starting at idct_coeffs
        movrel          r12, idct_coeffs
        vld1.16         {q0-q1}, [r12,:128]!
        vld1.16         {q2-q3}, [r12,:128]

        @ Low r3 values use the reduced variants. These are branches, not
        @ calls: the variants run on this frame and do the epilogue
        @ themselves. 34 and 135 equal entries 2 and 4 of
        @ min_eob_idct_idct_32.
        cmp             r3,  #34
        ble             idct32x32_quarter_add_neon
        cmp             r3,  #135
        ble             idct32x32_half_add_neon

        @ r8 walks the threshold table, starting at entry 1
        movrel          r8,  min_eob_idct_idct_32 + 2

        @ Pass 1: one 4-wide slice per expansion. r0 = output slice in the
        @ temp buffer, r2 = input advanced by i coefficients (2 bytes each).
.irp i, 0, 4, 8, 12, 16, 20, 24, 28
        add             r0,  sp,  #(\i*64)
.if \i > 0
        @ When r3 <= next threshold, stop transforming. r1 (free as
        @ scratch, the row step lives in r5) becomes the number of 128 byte
        @ chunks from r0 to the end of the temp buffer.
        ldrh_post       r1,  r8,  #2
        cmp             r3,  r1
        it              le
        movle           r1,  #(32 - \i)/2
        ble             1f
.endif
        add             r2,  r6,  #(\i*2)
        bl              idct32_1d_4x32_pass1_neon
.endr
        b               3f

1:
        @ Write zeros to the temp buffer for pass 2
        vmov.i16        q14, #0
        vmov.i16        q15, #0
2:
        @ 4 stores of 32 bytes per iteration. The flags set by subs are
        @ still valid at the bne because the stores do not change them.
        subs            r1,  r1,  #1
.rept 4
        vst1.16         {q14-q15}, [r0,:128]!
.endr
        bne             2b
3:
        @ Pass 2: r0 = destination advanced by i pixels, r1 = row step,
        @ r2 = temp buffer advanced by i coefficients.
.irp i, 0, 4, 8, 12, 16, 20, 24, 28
        add             r0,  r4,  #(\i)
        mov             r1,  r5
        add             r2,  sp,  #(\i*2)
        bl              idct32_1d_4x32_pass2_neon
.endr

        @ Drop the temp buffer and alignment padding, then restore
        add             sp,  sp,  r7
        vpop            {q4-q6}
        pop             {r4-r8,pc}
endfunc
1637cabdff1aSopenharmony_ci
@ idct32_partial size
@ Defines idct32x32_<size>_add_neon for size = quarter or half.
@
@ The generated function is not called, it is branched into from
@ ff_vp9_idct_idct_32x32_add_neon after that function has set up its frame.
@ It therefore expects:
@   r4 = destination pixels, r5 = destination row step,
@   r6 = input coefficients, r3 = the value tested against the thresholds,
@   r7 = total sp adjustment, sp = temp buffer,
@   r4-r8, lr and q4-q6 already pushed,
@   q0-q3 already loaded with the idct_coeffs constants,
@ and finishes with the matching epilogue.
@
@ Pass 1 covers slices 0 and 4 (quarter) or 0, 4, 8 and 12 (half). The last
@ of those is replaced by 256 bytes of zeros when r3 <= 9 (quarter) or
@ r3 <= 70 (half); 9 and 70 equal entries 1 and 3 of min_eob_idct_idct_32.
.macro idct32_partial size
function idct32x32_\size\()_add_neon
.irp i, 0, 4
        add             r0,  sp,  #(\i*64)
.ifc \size,quarter
.if \i == 4
        @ quarter only: second slice is optional
        cmp             r3,  #9
        ble             1f
.endif
.endif
        add             r2,  r6,  #(\i*2)
        bl              idct32_1d_4x32_pass1_\size\()_neon
.endr

.ifc \size,half
.irp i, 8, 12
        add             r0,  sp,  #(\i*64)
.if \i == 12
        @ half only: fourth slice is optional
        cmp             r3,  #70
        ble             1f
.endif
        add             r2,  r6,  #(\i*2)
        bl              idct32_1d_4x32_pass1_\size\()_neon
.endr
.endif
        b               3f

1:
        @ Write zeros to the temp buffer for pass 2.
        @ r0 still points at the skipped slice; 8 stores of 32 bytes
        @ cover exactly one 256 byte slice.
        vmov.i16        q14, #0
        vmov.i16        q15, #0
.rept 8
        vst1.16         {q14-q15}, [r0,:128]!
.endr

3:
        @ Pass 2 over all eight 4-wide output strips
.irp i, 0, 4, 8, 12, 16, 20, 24, 28
        add             r0,  r4,  #(\i)
        mov             r1,  r5
        add             r2,  sp,  #(\i*2)
        bl              idct32_1d_4x32_pass2_\size\()_neon
.endr

        @ Epilogue for the frame built by ff_vp9_idct_idct_32x32_add_neon
        add             sp,  sp,  r7
        vpop            {q4-q6}
        pop             {r4-r8,pc}
endfunc
.endm
1686cabdff1aSopenharmony_ci
@ Emit idct32x32_quarter_add_neon and idct32x32_half_add_neon
idct32_partial quarter
idct32_partial half
1689