1cabdff1aSopenharmony_ci/*
2cabdff1aSopenharmony_ci * Copyright (c) 2017 Google Inc.
3cabdff1aSopenharmony_ci *
4cabdff1aSopenharmony_ci * This file is part of FFmpeg.
5cabdff1aSopenharmony_ci *
6cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or
7cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public
8cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either
9cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version.
10cabdff1aSopenharmony_ci *
11cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful,
12cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of
13cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14cabdff1aSopenharmony_ci * Lesser General Public License for more details.
15cabdff1aSopenharmony_ci *
16cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public
17cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software
18cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19cabdff1aSopenharmony_ci */
20cabdff1aSopenharmony_ci
21cabdff1aSopenharmony_ci#include "libavutil/aarch64/asm.S"
22cabdff1aSopenharmony_ci#include "neon.S"
23cabdff1aSopenharmony_ci
// Constants for the 4x4 transforms, stored as 16 bit values. The 4x4
// functions widen them to 32 bit lanes when loading them, and the
// products formed with them are rounded and shifted right by 14 bits.
const itxfm4_coeffs, align=4
        // idct4 constants (loaded into v0)
        .short  11585, 0
        .short  6270, 15137
iadst4_coeffs:
        // iadst4 constants (loaded into v1)
        .short  5283, 15212
        .short  9929, 13377
endconst
29cabdff1aSopenharmony_ci
// Constants for the larger transforms, stored as 16 bit values.
// idct_coeffs starts right after the eight iadst8 values, within the
// same constant block.
const iadst8_coeffs, align=4
        .short  16305, 1606, 14449, 7723
        .short  10394, 12665, 4756, 15679
idct_coeffs:
        .short  11585, 0, 6270, 15137
        .short  3196, 16069, 13623, 9102
        .short  1606, 16305, 12665, 10394
        .short  7723, 14449, 15679, 4756
        .short  804, 16364, 12140, 11003
        .short  7005, 14811, 15426, 5520
        .short  3981, 15893, 14053, 8423
        .short  9760, 13160, 16207, 2404
endconst
38cabdff1aSopenharmony_ci
// Sixteen 16 bit constants for the 16 point iadst.
const iadst16_coeffs, align=4
        .short  16364, 804, 15893, 3981
        .short  11003, 12140, 8423, 14053
        .short  14811, 7005, 13160, 9760
        .short  5520, 15426, 2404, 16207
endconst
43cabdff1aSopenharmony_ci
// Transpose a 4x4 matrix of 32 bit elements in place.
// r0-r3 hold one row each on input and one column each on output;
// r4-r7 are scratch registers that get overwritten.
.macro transpose_4x4s r0, r1, r2, r3, r4, r5, r6, r7
        // Step 1: interleave the 32 bit elements of rows 0/1 and rows 2/3
        trn2            \r5\().4s,  \r0\().4s,  \r1\().4s
        trn1            \r4\().4s,  \r0\().4s,  \r1\().4s
        trn1            \r6\().4s,  \r2\().4s,  \r3\().4s
        trn2            \r7\().4s,  \r2\().4s,  \r3\().4s
        // Step 2: combine the 64 bit halves into the transposed rows
        trn2            \r2\().2d,  \r4\().2d,  \r6\().2d
        trn1            \r0\().2d,  \r4\().2d,  \r6\().2d
        trn1            \r1\().2d,  \r5\().2d,  \r7\().2d
        trn2            \r3\().2d,  \r5\().2d,  \r7\().2d
.endm
54cabdff1aSopenharmony_ci
// Transpose a 8x8 matrix of 32 bit elements, where each row is spread out
// over two registers.
// Row i occupies r(2*i) (left four elements) and r(2*i+1) (right four
// elements), so the matrix consists of four 4x4 quadrants:
//   top left:     r0, r2,  r4,  r6      top right:    r1, r3,  r5,  r7
//   bottom left:  r8, r10, r12, r14     bottom right: r9, r11, r13, r15
// The two quadrants on the diagonal are transposed in place; the other
// two are transposed and exchanged with each other.
// t0-t3 are scratch registers that get overwritten.
.macro transpose_8x8s r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15, t0, t1, t2, t3
        transpose_4x4s  \r0,  \r2,  \r4,  \r6,  \t0, \t1, \t2, \t3
        transpose_4x4s  \r9,  \r11, \r13, \r15, \t0, \t1, \t2, \t3

        // Do 4x4 transposes of r1,r3,r5,r7 and r8,r10,r12,r14
        // while swapping the two 4x4 matrices between each other

        // First step of the 4x4 transpose of r1-r7, into t0-t3
        trn1            \t0\().4s,  \r1\().4s,  \r3\().4s
        trn2            \t1\().4s,  \r1\().4s,  \r3\().4s
        trn1            \t2\().4s,  \r5\().4s,  \r7\().4s
        trn2            \t3\().4s,  \r5\().4s,  \r7\().4s

        // First step of the 4x4 transpose of r8-r14, into r1-r7
        // (r1-r7 are free to overwrite now, as their contents are in t0-t3)
        trn1            \r1\().4s,  \r8\().4s,  \r10\().4s
        trn2            \r3\().4s,  \r8\().4s,  \r10\().4s
        trn1            \r5\().4s,  \r12\().4s, \r14\().4s
        trn2            \r7\().4s,  \r12\().4s, \r14\().4s

        // Second step of the 4x4 transpose of r1-r7 (now in t0-t3), into r8-r14
        trn1            \r8\().2d,  \t0\().2d,  \t2\().2d
        trn2            \r12\().2d, \t0\().2d,  \t2\().2d
        trn1            \r10\().2d, \t1\().2d,  \t3\().2d
        trn2            \r14\().2d, \t1\().2d,  \t3\().2d

        // Second step of the 4x4 transpose of r8-r14 (now in r1-r7), in place as far as possible
        // (each trn1 result goes via t0/t1, since the following trn2 still
        // needs to read r1/r3)
        trn1            \t0\().2d,  \r1\().2d,  \r5\().2d
        trn2            \r5\().2d,  \r1\().2d,  \r5\().2d
        trn1            \t1\().2d,  \r3\().2d,  \r7\().2d
        trn2            \r7\().2d,  \r3\().2d,  \r7\().2d

        // Move the outputs of trn1 back in place
        mov             \r1\().16b,  \t0\().16b
        mov             \r3\().16b,  \t1\().16b
.endm
92cabdff1aSopenharmony_ci
// out1 = ((in1 + in2) * v0.s[0] + (1 << 13)) >> 14
// out2 = ((in1 - in2) * v0.s[0] + (1 << 13)) >> 14
// in/out are .4s registers. tmp1-tmp4 are always needed; tmp5 and tmp6
// are optional, but passing them as well is more efficient, since both
// products can then be formed before anything is narrowed.
// If neg > 0, out1 (and only out1) is computed with -v0.s[0] as factor;
// the negated copy of v0 is kept in tmp4 until it is consumed.
.macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0
.if \neg > 0
        neg             \tmp4\().4s, v0.4s
.endif
        // tmp1 = sum, tmp2 = difference
        add             \tmp1\().4s, \in1\().4s,  \in2\().4s
        sub             \tmp2\().4s, \in1\().4s,  \in2\().4s
        // tmp3,tmp4 = 64 bit products for out1
.if \neg > 0
        smull           \tmp3\().2d, \tmp1\().2s, \tmp4\().s[0]
        smull2          \tmp4\().2d, \tmp1\().4s, \tmp4\().s[0]
.else
        smull           \tmp3\().2d, \tmp1\().2s, v0.s[0]
        smull2          \tmp4\().2d, \tmp1\().4s, v0.s[0]
.endif
.ifnb \tmp5
        // Six temporaries: form the products for out2 in tmp5,tmp6 first,
        // then narrow both results
        smull           \tmp5\().2d, \tmp2\().2s, v0.s[0]
        smull2          \tmp6\().2d, \tmp2\().4s, v0.s[0]
        rshrn           \out1\().2s, \tmp3\().2d, #14
        rshrn2          \out1\().4s, \tmp4\().2d, #14
        rshrn           \out2\().2s, \tmp5\().2d, #14
        rshrn2          \out2\().4s, \tmp6\().2d, #14
.else
        // Four temporaries: finish out1 before tmp3,tmp4 are reused for out2
        rshrn           \out1\().2s, \tmp3\().2d, #14
        rshrn2          \out1\().4s, \tmp4\().2d, #14
        smull           \tmp3\().2d, \tmp2\().2s, v0.s[0]
        smull2          \tmp4\().2d, \tmp2\().4s, v0.s[0]
        rshrn           \out2\().2s, \tmp3\().2d, #14
        rshrn2          \out2\().4s, \tmp4\().2d, #14
.endif
.endm
126cabdff1aSopenharmony_ci
// Variant of dmbutterfly0 for the case where in2 is known to be zero:
// out1 = out2 = (in1 * v0.s[0] + (1 << 13)) >> 14
// The parameter list mirrors dmbutterfly0; in2 and tmp3-tmp6 are not
// touched.
.macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6
        dsmull_h        \tmp1, \tmp2, \in1, v0.s[0]
        // The same product is narrowed into both outputs
        drshrn_h        \out1, \tmp1, \tmp2, #14
        drshrn_h        \out2, \tmp1, \tmp2, #14
.endm
137cabdff1aSopenharmony_ci
// out1,out2 = in1 * coef1 - in2 * coef2
// out3,out4 = in1 * coef2 + in2 * coef1
// in1/in2 are .4s registers. Each result is a pair of .2d registers
// (low half of the lanes in the first, high half in the second); the
// results are left at full 64 bit width, without rounding.
.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, coef1, coef2
        // Start with the in1 products ...
        dsmull_h        \out1, \out2, \in1, \coef1
        dsmull_h        \out3, \out4, \in1, \coef2
        // ... then subtract/accumulate the in2 products
        smlsl           \out1\().2d, \in2\().2s, \coef2
        smlsl2          \out2\().2d, \in2\().4s, \coef2
        smlal           \out3\().2d, \in2\().2s, \coef1
        smlal2          \out4\().2d, \in2\().4s, \coef1
.endm
151cabdff1aSopenharmony_ci
// inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
// inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
// (both computed from the original input values)
// inout are 2 x .4s registers, tmp1-tmp4 hold the 64 bit intermediates.
// If neg > 0, the 64 bit sum for inout2 is negated before it is rounded
// and narrowed.
.macro dmbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4, neg=0
        dmbutterfly_l   \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \coef1, \coef2
.if \neg > 0
        neg             \tmp3\().2d, \tmp3\().2d
        neg             \tmp4\().2d, \tmp4\().2d
.endif
        drshrn_h        \inout1, \tmp1, \tmp2, #14
        drshrn_h        \inout2, \tmp3, \tmp4, #14
.endm
166cabdff1aSopenharmony_ci
// Variant of dmbutterfly for the case where inout2 is known to be zero
// on input:
// inout1 = (inout1 * coef1 + (1 << 13)) >> 14
// inout2 = (inout1 * coef2 + (1 << 13)) >> 14
.macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
        // Both products are formed before inout1 is overwritten
        dsmull_h        \tmp1, \tmp2, \inout1, \coef1
        dsmull_h        \tmp3, \tmp4, \inout1, \coef2
        drshrn_h        \inout1, \tmp1, \tmp2, #14
        drshrn_h        \inout2, \tmp3, \tmp4, #14
.endm
178cabdff1aSopenharmony_ci
// Variant of dmbutterfly for the case where inout1 is known to be zero
// on input:
// inout1 = (-(inout2 * coef2) + (1 << 13)) >> 14
// inout2 = (  inout2 * coef1  + (1 << 13)) >> 14
.macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
        // Both products are formed before inout2 is overwritten
        dsmull_h        \tmp1, \tmp2, \inout2, \coef2
        dsmull_h        \tmp3, \tmp4, \inout2, \coef1
        // The coef2 product enters with a negative sign
        neg             \tmp1\().2d, \tmp1\().2d
        neg             \tmp2\().2d, \tmp2\().2d
        drshrn_h        \inout2, \tmp3, \tmp4, #14
        drshrn_h        \inout1, \tmp1, \tmp2, #14
.endm
192cabdff1aSopenharmony_ci
// out1,out2 = in * coef, as full 64 bit products.
// in is a .4s register; out1 receives the products of its two low
// lanes, out2 those of its two high lanes. coef is a 32 bit lane
// reference such as v0.s[0].
.macro dsmull_h out1, out2, in, coef
        smull           \out1\().2d,  \in\().2s,  \coef
        smull2          \out2\().2d,  \in\().4s,  \coef
.endm
197cabdff1aSopenharmony_ci
// out = in1,in2 shifted right by shift with rounding, narrowed from
// 64 to 32 bits. in1 supplies the two low lanes of the .4s register
// out, in2 the two high lanes. shift is an immediate such as #14.
.macro drshrn_h out, in1, in2, shift
        rshrn           \out\().2s,  \in1\().2d,  \shift
        rshrn2          \out\().4s,  \in2\().2d,  \shift
.endm
202cabdff1aSopenharmony_ci
203cabdff1aSopenharmony_ci
// out1 = in1 + in2
// out2 = in1 - in2
// All operands are .4s registers. The sum is written first, so out2
// (but not out1) can be the same register as one of the inputs.
.macro butterfly_4s out1, out2, in1, in2
        add             \out1\().4s,  \in1\().4s,  \in2\().4s
        sub             \out2\().4s,  \in1\().4s,  \in2\().4s
.endm
210cabdff1aSopenharmony_ci
// out1 = in1 - in2
// out2 = in1 + in2
// Same as butterfly_4s with the roles of the two outputs reversed: the
// difference is written first, so out2 (but not out1) can be the same
// register as one of the inputs.
.macro butterfly_4s_r out1, out2, in1, in2
        sub             \out1\().4s,  \in1\().4s,  \in2\().4s
        add             \out2\().4s,  \in1\().4s,  \in2\().4s
.endm
217cabdff1aSopenharmony_ci
// out1 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
// out2 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
// in1,in2 and in3,in4 are two 64 bit values spread over a pair of .2d
// registers each; out1/out2 are .4s registers. tmp1-tmp4 receive the
// 64 bit sums and differences before they are narrowed.
.macro dbutterfly_n out1, out2, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
        add             \tmp1\().2d, \in1\().2d, \in3\().2d
        add             \tmp2\().2d, \in2\().2d, \in4\().2d
        sub             \tmp3\().2d, \in1\().2d, \in3\().2d
        sub             \tmp4\().2d, \in2\().2d, \in4\().2d
        drshrn_h        \out1, \tmp1, \tmp2, #14
        drshrn_h        \out2, \tmp3, \tmp4, #14
.endm
231cabdff1aSopenharmony_ci
// One 4 point pass of the "iwht" transform used by the 4x4 functions,
// done in place on four .4s registers. Only additions, subtractions and
// one arithmetic shift are involved; no constants are needed.
// Clobbers v16 and v17.
.macro iwht4_10 c0, c1, c2, c3
        sub             v17.4s,    \c2\().4s, \c3\().4s    // v17 = c2 - c3
        add             \c0\().4s, \c0\().4s, \c1\().4s    // c0 += c1
        sub             v16.4s,    \c0\().4s, v17.4s
        sshr            v16.4s,    v16.4s,    #1           // v16 = (c0 - v17) >> 1
        sub             \c2\().4s, v16.4s,    \c1\().4s    // c2 = v16 - c1
        sub             \c1\().4s, v16.4s,    \c3\().4s    // c1 = v16 - c3
        add             \c3\().4s, v17.4s,    \c2\().4s    // c3 = v17 + c2
        sub             \c0\().4s, \c0\().4s, \c1\().4s    // c0 -= c1
.endm
242cabdff1aSopenharmony_ci
// The 12 bpp functions look up their transform passes under the _12
// suffix; for iwht that pass simply forwards to the _10 implementation.
.macro iwht4_12 c0, c1, c2, c3
        iwht4_10        \c0,  \c1,  \c2,  \c3
.endm
246cabdff1aSopenharmony_ci
// 4 point idct pass on four .4s registers, in place, using 32 bit
// multiplications. Reads the idct4 constants from lanes 0, 2 and 3 of v0.
// Clobbers v16-v18, v20, v22 and v24.
.macro idct4_10 c0, c1, c2, c3
        // v22 = c1 * v0.s[3] + c3 * v0.s[2]
        mul             v22.4s,    \c1\().4s, v0.s[3]
        mla             v22.4s,    \c3\().4s, v0.s[2]
        // v20 = c1 * v0.s[2] - c3 * v0.s[3]
        mul             v20.4s,    \c1\().4s, v0.s[2]
        mls             v20.4s,    \c3\().4s, v0.s[3]
        // v18 = (c0 + c2) * v0.s[0], v24 = (c0 - c2) * v0.s[0]
        butterfly_4s    v16, v17, \c0, \c2
        mul             v18.4s,    v16.4s,    v0.s[0]
        mul             v24.4s,    v17.4s,    v0.s[0]
        // Round all four terms and shift out the 14 fractional bits
        srshr           v18.4s,    v18.4s,    #14
        srshr           v24.4s,    v24.4s,    #14
        srshr           v22.4s,    v22.4s,    #14
        srshr           v20.4s,    v20.4s,    #14
        // Final butterflies produce the four outputs
        butterfly_4s    \c0, \c3, v18, v22
        butterfly_4s    \c1, \c2, v24, v20
.endm
265cabdff1aSopenharmony_ci
// 4 point idct pass on four .4s registers, in place. Computes the same
// terms as idct4_10, but every product is formed at 64 bit width and
// narrowed back to 32 bit by the rounding shift.
// Reads the idct4 constants from lanes 0, 2 and 3 of v0.
// Clobbers v16-v25.
.macro idct4_12 c0, c1, c2, c3
        // v22,v23 = c1 * v0.s[3];  v20,v21 = c1 * v0.s[2]
        dsmull_h        v22, v23, \c1, v0.s[3]
        dsmull_h        v20, v21, \c1, v0.s[2]
        // v16 = c0 + c2, v17 = c0 - c2
        butterfly_4s    v16, v17, \c0, \c2
        // v22,v23 += c3 * v0.s[2]
        smlal           v22.2d,    \c3\().2s, v0.s[2]
        smlal2          v23.2d,    \c3\().4s, v0.s[2]
        // v18,v19 = (c0 + c2) * v0.s[0];  v24,v25 = (c0 - c2) * v0.s[0]
        dsmull_h        v18, v19, v16, v0.s[0]
        dsmull_h        v24, v25, v17, v0.s[0]
        // v20,v21 -= c3 * v0.s[3]
        smlsl           v20.2d,    \c3\().2s, v0.s[3]
        smlsl2          v21.2d,    \c3\().4s, v0.s[3]
        // Round, shift out the 14 fractional bits and narrow to 32 bit
        drshrn_h        v22, v22, v23, #14
        drshrn_h        v18, v18, v19, #14
        drshrn_h        v24, v24, v25, #14
        drshrn_h        v20, v20, v21, #14
        // Final butterflies produce the four outputs
        butterfly_4s    \c0, \c3, v18, v22
        butterfly_4s    \c1, \c2, v24, v20
.endm
294cabdff1aSopenharmony_ci
// 4 point iadst pass on four .4s registers, in place, using 32 bit
// multiplications. Reads the iadst4 constants from the four lanes of v1.
// Clobbers v16, v18, v20, v22, v24 and v26.
.macro iadst4_10 c0, c1, c2, c3
        // v16 = c0 * v1.s[0] + c2 * v1.s[1] + c3 * v1.s[2]
        mul             v16.4s,    \c0\().4s, v1.s[0]
        mla             v16.4s,    \c2\().4s, v1.s[1]
        mla             v16.4s,    \c3\().4s, v1.s[2]
        // v18 = c0 * v1.s[2] - c2 * v1.s[0] - c3 * v1.s[1]
        mul             v18.4s,    \c0\().4s, v1.s[2]
        mls             v18.4s,    \c2\().4s, v1.s[0]
        mls             v18.4s,    \c3\().4s, v1.s[1]
        // v22 = c1 * v1.s[3]
        mul             v22.4s,    \c1\().4s, v1.s[3]
        // v20 = (c0 - c2 + c3) * v1.s[3]; c0 itself serves as the scratch
        // register for the sum, its original value is not needed any more
        sub             \c0\().4s, \c0\().4s, \c2\().4s
        add             \c0\().4s, \c0\().4s, \c3\().4s
        mul             v20.4s,    \c0\().4s, v1.s[3]
        // Combine the terms: v24 = v16 + v22, v26 = v18 + v22,
        // v16 = v16 + v18 - v22
        add             v24.4s,    v16.4s,    v22.4s
        add             v26.4s,    v18.4s,    v22.4s
        add             v16.4s,    v16.4s,    v18.4s
        sub             v16.4s,    v16.4s,    v22.4s
        // Round and shift out the 14 fractional bits
        srshr           \c0\().4s, v24.4s,    #14
        srshr           \c1\().4s, v26.4s,    #14
        srshr           \c2\().4s, v20.4s,    #14
        srshr           \c3\().4s, v16.4s,    #14
.endm
315cabdff1aSopenharmony_ci
// 4 point iadst pass on four .4s registers, in place. Computes the same
// terms as iadst4_10, but all products and sums are kept at 64 bit width
// until the final rounding shift narrows them back to 32 bit.
// Reads the iadst4 constants from the four lanes of v1.
// Clobbers v16-v27.
.macro iadst4_12 c0, c1, c2, c3
        // v16,v17 = c0 * v1.s[0] + c2 * v1.s[1] + c3 * v1.s[2]
        dsmull_h        v16, v17, \c0, v1.s[0]
        smlal           v16.2d,    \c2\().2s, v1.s[1]
        smlal2          v17.2d,    \c2\().4s, v1.s[1]
        smlal           v16.2d,    \c3\().2s, v1.s[2]
        smlal2          v17.2d,    \c3\().4s, v1.s[2]
        // v18,v19 = c0 * v1.s[2] - c2 * v1.s[0] - c3 * v1.s[1]
        // (interleaved with c0 = c0 - c2 + c3, which is done in 32 bit)
        dsmull_h        v18, v19, \c0, v1.s[2]
        smlsl           v18.2d,    \c2\().2s, v1.s[0]
        smlsl2          v19.2d,    \c2\().4s, v1.s[0]
        sub             \c0\().4s, \c0\().4s, \c2\().4s
        smlsl           v18.2d,    \c3\().2s, v1.s[1]
        smlsl2          v19.2d,    \c3\().4s, v1.s[1]
        add             \c0\().4s, \c0\().4s, \c3\().4s
        // v22,v23 = c1 * v1.s[3];  v20,v21 = (c0 - c2 + c3) * v1.s[3]
        dsmull_h        v22, v23, \c1, v1.s[3]
        dsmull_h        v20, v21, \c0, v1.s[3]
        // v24,v25 = v16,v17 + v22,v23;  v26,v27 = v18,v19 + v22,v23
        add             v24.2d,    v16.2d,    v22.2d
        add             v25.2d,    v17.2d,    v23.2d
        add             v26.2d,    v18.2d,    v22.2d
        add             v27.2d,    v19.2d,    v23.2d
        drshrn_h        \c0, v24, v25, #14
        // v16,v17 = v16,v17 + v18,v19 - v22,v23 (finished further below)
        add             v16.2d,    v16.2d,    v18.2d
        add             v17.2d,    v17.2d,    v19.2d
        drshrn_h        \c1, v26, v27, #14
        sub             v16.2d,    v16.2d,    v22.2d
        sub             v17.2d,    v17.2d,    v23.2d
        drshrn_h        \c2, v20, v21, #14
        drshrn_h        \c3, v16, v17, #14
.endm
352cabdff1aSopenharmony_ci
// The public functions in this file have the following signature:
// void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
// (Regardless of the pointer types in that prototype, the code in this
// file reads dst as 16 bit pixels and block as 32 bit coefficients.)
355cabdff1aSopenharmony_ci
// Emits the function ff_vp9_<txfm1>_<txfm2>_4x4_add_<bpp>_neon.
//   x0 = dst (rows of four 16 bit pixels), x1 = stride in bytes,
//   x2 = block (4x4 coefficients of 32 bit each), w3 = eob
// The <txfm1>4_<bpp> pass is applied to the coefficients, the 4x4
// intermediate is transposed and the <txfm2>4_<bpp> pass is applied.
// The result is rounded by 4 bits (not for iwht), added to the four dst
// rows and clamped to the range 0 .. (1 << bpp) - 1. The coefficients
// that were read are overwritten with zeros.
.macro itxfm_func4x4 txfm1, txfm2, bpp
function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_\bpp\()_neon, export=1
        // Load just the constants the chosen passes use, widened to 32 bit:
        // v0 = idct4 constants, v1 = iadst4 constants. iwht needs none.
.ifc \txfm1\()_\txfm2,idct_idct
        movrel          x4,  itxfm4_coeffs
        ld1             {v0.4h}, [x4]
        sxtl            v0.4s,  v0.4h
.endif
.ifc \txfm1\()_\txfm2,iadst_iadst
        movrel          x4,  iadst4_coeffs
        ld1             {v0.d}[1], [x4]
        sxtl2           v1.4s,  v0.8h
.endif
.ifnc \txfm1,\txfm2
        // Mixed idct/iadst: both sets are adjacent in memory
        movrel          x4,  itxfm4_coeffs
        ld1             {v0.8h}, [x4]
        sxtl2           v1.4s,  v0.8h
        sxtl            v0.4s,  v0.4h
.endif

        // v30/v31 = 0, used for clearing the coefficient buffer
        movi            v31.4s, #0
        movi            v30.4s, #0
.ifc \txfm1\()_\txfm2,idct_idct
        cmp             w3,  #1
        b.ne            1f
        // eob == 1 for idct/idct: only the first coefficient is read. It is
        // scaled by v0.s[0] with rounding twice (once per pass), cleared in
        // the buffer, and the result is replicated to all 16 outputs.
        ld1             {v2.s}[0],  [x2]
        smull           v2.2d,  v2.2s, v0.s[0]
        rshrn           v2.2s,  v2.2d, #14
        smull           v2.2d,  v2.2s, v0.s[0]
        rshrn           v2.2s,  v2.2d, #14
        st1             {v31.s}[0], [x2]
        dup             v4.4s,  v2.s[0]
        mov             v5.16b, v4.16b
        mov             v6.16b, v4.16b
        mov             v7.16b, v4.16b
        b               2f
.endif

1:
        // Fetch all 16 coefficients, then zero the 64 bytes they occupied
        ld1             {v4.4s,v5.4s,v6.4s,v7.4s},  [x2]
        st1             {v30.4s,v31.4s}, [x2], #32
        st1             {v30.4s,v31.4s}, [x2], #32

.ifc \txfm1,iwht
        // The iwht input is scaled down by 2 bits first
        sshr            v4.4s,  v4.4s,  #2
        sshr            v5.4s,  v5.4s,  #2
        sshr            v6.4s,  v6.4s,  #2
        sshr            v7.4s,  v7.4s,  #2
.endif

        // First pass, transpose (32 bit elements), second pass
        \txfm1\()4_\bpp v4,  v5,  v6,  v7
        transpose_4x4s  v4,  v5,  v6,  v7,  v16, v17, v18, v19
        \txfm2\()4_\bpp v4,  v5,  v6,  v7
2:
        // v31 = (1 << bpp) - 1 in every 16 bit lane, the upper clamping bound
        mvni            v31.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8

        // Read the four destination rows, then rewind x0 to the first row
        ld1             {v0.4h},   [x0], x1
        ld1             {v1.4h},   [x0], x1
        ld1             {v2.4h},   [x0], x1
        ld1             {v3.4h},   [x0], x1
        sub             x0,  x0,  x1, lsl #2

.ifnc \txfm1,iwht
        // Final rounding of the transform output
        srshr           v4.4s,  v4.4s,  #4
        srshr           v5.4s,  v5.4s,  #4
        srshr           v6.4s,  v6.4s,  #4
        srshr           v7.4s,  v7.4s,  #4
.endif
        // Add the (zero extended) pixels to the residual
        uaddw           v4.4s,  v4.4s,  v0.4h
        uaddw           v5.4s,  v5.4s,  v1.4h
        uaddw           v6.4s,  v6.4s,  v2.4h
        uaddw           v7.4s,  v7.4s,  v3.4h
        // Narrow with unsigned saturation (clamps at 0); rows 0/1 are
        // packed into v0 and rows 2/3 into v2
        sqxtun          v0.4h,  v4.4s
        sqxtun2         v0.8h,  v5.4s
        sqxtun          v2.4h,  v6.4s
        sqxtun2         v2.8h,  v7.4s
        // Clamp at the largest pixel value
        umin            v0.8h,  v0.8h,  v31.8h
        umin            v2.8h,  v2.8h,  v31.8h

        // Write the four rows back
        st1             {v0.4h},   [x0], x1
        st1             {v0.d}[1], [x0], x1
        st1             {v2.4h},   [x0], x1
        st1             {v2.d}[1], [x0], x1

        ret
endfunc
.endm
446cabdff1aSopenharmony_ci
447cabdff1aSopenharmony_ci.macro itxfm_funcs4x4 bpp
448cabdff1aSopenharmony_ciitxfm_func4x4 idct,  idct,  \bpp
449cabdff1aSopenharmony_ciitxfm_func4x4 iadst, idct,  \bpp
450cabdff1aSopenharmony_ciitxfm_func4x4 idct,  iadst, \bpp
451cabdff1aSopenharmony_ciitxfm_func4x4 iadst, iadst, \bpp
452cabdff1aSopenharmony_ciitxfm_func4x4 iwht,  iwht,  \bpp
453cabdff1aSopenharmony_ci.endm
454cabdff1aSopenharmony_ci
455cabdff1aSopenharmony_ciitxfm_funcs4x4 10
456cabdff1aSopenharmony_ciitxfm_funcs4x4 12
457cabdff1aSopenharmony_ci
// 8x8 idct + add for the case where only the first coefficient is used
// (declared without export=1, unlike the public 4x4 functions).
//   x0 = dst (rows of eight 16 bit pixels), x1 = stride in bytes,
//   x2 = block (the first 32 bit coefficient is read and then cleared)
//   w5 = upper clamping bound for the pixels; it is only read here, so
//        the caller has to set it up (presumably (1 << bpp) - 1)
// Overwrites x3, x4, v0-v4, v16-v19, v31 and the condition flags; x0 is
// left pointing 8 rows past the initial dst.
function idct8x8_dc_add_neon
        // v0 = first four idct constants, widened to 32 bit
        movrel          x4,  idct_coeffs
        ld1             {v0.4h}, [x4]
        sxtl            v0.4s,  v0.4h

        dup             v31.8h, w5
        movi            v1.4h,  #0

        // Scale the coefficient by v0.s[0] with rounding, twice (once per
        // pass), and clear it in the buffer
        ld1             {v2.s}[0],  [x2]
        smull           v2.2d,  v2.2s,  v0.s[0]
        rshrn           v2.2s,  v2.2d,  #14
        smull           v2.2d,  v2.2s,  v0.s[0]
        rshrn           v2.2s,  v2.2d,  #14
        st1             {v1.s}[0],  [x2]

        // v2 = the final (rounded by 5 bits) value, in all four lanes
        dup             v2.4s,  v2.s[0]
        srshr           v2.4s,  v2.4s,  #5

        // x0 is used for reading rows and x3 for writing them, x4 counts
        // the remaining rows
        mov             x3,  x0
        mov             x4,  #8
1:
        // Loop to add the constant from v2 into all 8x8 outputs,
        // two rows per iteration
        ld1             {v3.8h},  [x0], x1
        ld1             {v4.8h},  [x0], x1
        uaddw           v16.4s, v2.4s,  v3.4h
        uaddw2          v17.4s, v2.4s,  v3.8h
        uaddw           v18.4s, v2.4s,  v4.4h
        uaddw2          v19.4s, v2.4s,  v4.8h
        // Narrow with unsigned saturation (clamps at 0), then clamp at v31
        sqxtun          v3.4h,  v16.4s
        sqxtun2         v3.8h,  v17.4s
        sqxtun          v4.4h,  v18.4s
        sqxtun2         v4.8h,  v19.4s
        umin            v3.8h,  v3.8h,  v31.8h
        umin            v4.8h,  v4.8h,  v31.8h
        st1             {v3.8h},  [x3], x1
        st1             {v4.8h},  [x3], x1
        subs            x4,  x4,  #2
        b.ne            1b

        ret
endfunc
499cabdff1aSopenharmony_ci
// 8 point inverse DCT over eight vector registers, computed in place.
//   r0-r7: names of the registers holding the eight inputs; on exit they
//          hold out[0]-out[7] as listed in the trailing comments
//   t0-t5: names of registers used as temporaries
// Coefficients are read from the 32 bit lanes v0.s[2], v0.s[3] and
// v1.s[0]-v1.s[3]; the 8x8 functions in this file load them from idct_coeffs
// and widen them before expanding this macro.
// The butterfly helper macros used here are defined outside this section.
500cabdff1aSopenharmony_ci.macro idct8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1, t2, t3, t4, t5
501cabdff1aSopenharmony_ci        dmbutterfly0    \r0, \r4, \r0, \r4, \t0, \t1, \t2, \t3, \t4, \t5 // r0 = t0a, r4 = t1a
502cabdff1aSopenharmony_ci        dmbutterfly     \r2, \r6, v0.s[2], v0.s[3], \t0, \t1, \t2, \t3   // r2 = t2a, r6 = t3a
503cabdff1aSopenharmony_ci        dmbutterfly     \r1, \r7, v1.s[0], v1.s[1], \t0, \t1, \t2, \t3   // r1 = t4a, r7 = t7a
504cabdff1aSopenharmony_ci        dmbutterfly     \r5, \r3, v1.s[2], v1.s[3], \t0, \t1, \t2, \t3   // r5 = t5a, r3 = t6a
505cabdff1aSopenharmony_ci
506cabdff1aSopenharmony_ci        butterfly_4s    \t0, \t1, \r0, \r6 // t0 = t0, t1 = t3
507cabdff1aSopenharmony_ci        butterfly_4s    \t2, \r5, \r1, \r5 // t2 = t4, r5 = t5a
508cabdff1aSopenharmony_ci        butterfly_4s    \t3, \r6, \r7, \r3 // t3 = t7, r6 = t6a
509cabdff1aSopenharmony_ci        butterfly_4s    \r7, \r4, \r4, \r2 // r7 = t1, r4 = t2
510cabdff1aSopenharmony_ci
        // r0-r3 are passed in the temporary positions here: their previous
        // contents have already been consumed by the butterflies above.
511cabdff1aSopenharmony_ci        dmbutterfly0    \r6, \r5, \r6, \r5, \r0, \r1, \r2, \r3, \t4, \t5 // r6 = t6, r5 = t5
512cabdff1aSopenharmony_ci
513cabdff1aSopenharmony_ci        butterfly_4s    \r1, \r6, \r7, \r6 // r1 = out[1], r6 = out[6]
514cabdff1aSopenharmony_ci        butterfly_4s    \r0, \r7, \t0, \t3 // r0 = out[0], r7 = out[7]
515cabdff1aSopenharmony_ci        butterfly_4s    \r2, \r5, \r4, \r5 // r2 = out[2], r5 = out[5]
516cabdff1aSopenharmony_ci        butterfly_4s    \r3, \r4, \t1, \t2 // r3 = out[3], r4 = out[4]
517cabdff1aSopenharmony_ci.endm
518cabdff1aSopenharmony_ci
// 8 point inverse ADST over eight vector registers, computed in place.
//   r0-r7: names of the registers holding the eight inputs; on exit they
//          hold out[0]-out[7] as listed in the trailing comments
//   t0-t5: names of registers used as temporaries
// Coefficients are read from the 32 bit lanes of v2 and v3 (loaded from
// iadst8_coeffs by the 8x8 functions in this file) and from v0.s[2]/v0.s[3]
// (idct coefficients).
// out[1], out[3], out[5] and out[7] are first produced negated and are then
// corrected with an explicit neg.
// The butterfly helper macros used here are defined outside this section.
519cabdff1aSopenharmony_ci.macro iadst8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1, t2, t3, t4, t5
520cabdff1aSopenharmony_ci        dmbutterfly_l   \t2, \t3, \t0, \t1, \r7, \r0, v2.s[1], v2.s[0]   // t2,t3 = t1a, t0,t1 = t0a
521cabdff1aSopenharmony_ci        dmbutterfly_l   \r0, \r7, \t4, \t5, \r3, \r4, v3.s[1], v3.s[0]   // r0,r7 = t5a, t4,t5 = t4a
522cabdff1aSopenharmony_ci
523cabdff1aSopenharmony_ci        dbutterfly_n    \r3, \t0, \t0, \t1, \t4, \t5, \r3, \r4, \t0, \t1 // r3 = t0, t0 = t4
524cabdff1aSopenharmony_ci        dbutterfly_n    \r4, \t1, \t2, \t3, \r0, \r7, \r4, \t1, \t4, \t5 // r4 = t1, t1 = t5
525cabdff1aSopenharmony_ci
526cabdff1aSopenharmony_ci        dmbutterfly_l   \t4, \t5, \t2, \t3, \r5, \r2, v2.s[3], v2.s[2]   // t4,t5 = t3a, t2,t3 = t2a
527cabdff1aSopenharmony_ci        dmbutterfly_l   \r2, \r5, \r0, \r7, \r1, \r6, v3.s[3], v3.s[2]   // r2,r5 = t7a, r0,r7 = t6a
528cabdff1aSopenharmony_ci
529cabdff1aSopenharmony_ci        dbutterfly_n    \r1, \t2, \t2, \t3, \r0, \r7, \r1, \r6, \t2, \t3 // r1 = t2, t2 = t6
530cabdff1aSopenharmony_ci        dbutterfly_n    \r0, \t4, \t4, \t5, \r2, \r5, \r0, \r7, \t4, \t5 // r0 = t3, t4 = t7
531cabdff1aSopenharmony_ci
532cabdff1aSopenharmony_ci        butterfly_4s    \r7, \r4, \r4, \r0   // r7 = -out[7], r4 = t3
533cabdff1aSopenharmony_ci        neg             \r7\().4s, \r7\().4s // r7 = out[7]
534cabdff1aSopenharmony_ci        butterfly_4s    \r0, \r1, \r3, \r1   // r0 = out[0],  r1 = t2
535cabdff1aSopenharmony_ci
536cabdff1aSopenharmony_ci        dmbutterfly_l   \r2, \r3, \t3, \t5, \t0, \t1, v0.s[2], v0.s[3]   // r2,r3 = t5a, t3,t5 = t4a
537cabdff1aSopenharmony_ci        dmbutterfly_l   \t0, \t1, \r5, \r6, \t4, \t2, v0.s[3], v0.s[2]   // t0,t1 = t6a, r5,r6 = t7a
538cabdff1aSopenharmony_ci
539cabdff1aSopenharmony_ci        dbutterfly_n    \r6, \t2, \r2, \r3, \r5, \r6, \t2, \t4, \r2, \r3 // r6 = out[6],  t2 = t7
540cabdff1aSopenharmony_ci
541cabdff1aSopenharmony_ci        dmbutterfly0    \r3, \r4, \r1, \r4, \t4, \r5, \r1, \r2           // r3 = -out[3], r4 = out[4]
542cabdff1aSopenharmony_ci        neg             \r3\().4s, \r3\().4s  // r3 = out[3]
543cabdff1aSopenharmony_ci
544cabdff1aSopenharmony_ci        dbutterfly_n    \r1, \t0, \t3, \t5, \t0, \t1, \r1, \r2, \t0, \t1 // r1 = -out[1], t0 = t6
545cabdff1aSopenharmony_ci        neg             \r1\().4s, \r1\().4s  // r1 = out[1]
546cabdff1aSopenharmony_ci
547cabdff1aSopenharmony_ci        dmbutterfly0    \r2, \r5, \t0, \t2, \t1, \t3, \t4, \t5           // r2 = out[2],  r5 = -out[5]
548cabdff1aSopenharmony_ci        neg             \r5\().4s, \r5\().4s  // r5 = out[5]
549cabdff1aSopenharmony_ci.endm
550cabdff1aSopenharmony_ci
551cabdff1aSopenharmony_ci
// Emit the 8x8 inverse transform + add functions for one txfm1/txfm2 pair:
//   vp9_<txfm1>_<txfm2>_8x8_add_16_neon      shared worker (not exported)
//   ff_vp9_<txfm1>_<txfm2>_8x8_add_10_neon   exported, sets x5 = 0x03ff
//   ff_vp9_<txfm1>_<txfm2>_8x8_add_12_neon   exported, sets x5 = 0x0fff
//
// Register usage of the worker, as read from the code below:
//   x0 = pixel rows to update (8 halfword pixels are read/written per row)
//   x1 = step added to the row pointer after each row
//   x2 = 8x8 block of 32 bit coefficients (256 bytes; loaded, then zeroed)
//   w3 = compared against 1 to select the DC-only path (idct_idct only)
//   w5 = upper clamp for the output pixels
// Every variant except idct_idct uses v8/v9 as temporaries and therefore
// saves and restores d8/d9 on the stack.
552cabdff1aSopenharmony_ci.macro itxfm_func8x8 txfm1, txfm2
553cabdff1aSopenharmony_cifunction vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
554cabdff1aSopenharmony_ci.ifc \txfm1\()_\txfm2,idct_idct
        // Tail branch to the DC-only routine; nothing has been pushed on
        // the stack at this point.
555cabdff1aSopenharmony_ci        cmp             w3,  #1
556cabdff1aSopenharmony_ci        b.eq            idct8x8_dc_add_neon
557cabdff1aSopenharmony_ci.endif
558cabdff1aSopenharmony_ci        // The iadst also uses a few coefficients from
559cabdff1aSopenharmony_ci        // idct, so those always need to be loaded.
560cabdff1aSopenharmony_ci.ifc \txfm1\()_\txfm2,idct_idct
561cabdff1aSopenharmony_ci        movrel          x4,  idct_coeffs
562cabdff1aSopenharmony_ci.else
        // Load the 8 iadst8 coefficients and widen them into v2/v3. The
        // post-increment by 16 bytes leaves x4 pointing at idct_coeffs,
        // which directly follows the 8 halfwords of iadst8_coeffs.
563cabdff1aSopenharmony_ci        movrel          x4,  iadst8_coeffs
564cabdff1aSopenharmony_ci        ld1             {v1.8h}, [x4], #16
        // v8/v9 are used as temporaries below; save their low halves.
565cabdff1aSopenharmony_ci        stp             d8,  d9,  [sp, #-0x10]!
566cabdff1aSopenharmony_ci        sxtl2           v3.4s,  v1.8h
567cabdff1aSopenharmony_ci        sxtl            v2.4s,  v1.4h
568cabdff1aSopenharmony_ci.endif
        // Load the first 8 idct coefficients and widen them into v0/v1.
569cabdff1aSopenharmony_ci        ld1             {v0.8h}, [x4]
570cabdff1aSopenharmony_ci        sxtl2           v1.4s,  v0.8h
571cabdff1aSopenharmony_ci        sxtl            v0.4s,  v0.4h
572cabdff1aSopenharmony_ci
        // v4-v7 = 0, used to clear the coefficient block after loading it.
573cabdff1aSopenharmony_ci        movi            v4.4s, #0
574cabdff1aSopenharmony_ci        movi            v5.4s, #0
575cabdff1aSopenharmony_ci        movi            v6.4s, #0
576cabdff1aSopenharmony_ci        movi            v7.4s, #0
577cabdff1aSopenharmony_ci
        // Note: nothing inside this macro branches to the 1: label.
578cabdff1aSopenharmony_ci1:
        // Load all 64 coefficients (32 bit each) into v16-v31, rewind x2
        // and overwrite the block with zeros.
579cabdff1aSopenharmony_ci        ld1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x2], #64
580cabdff1aSopenharmony_ci        ld1             {v20.4s,v21.4s,v22.4s,v23.4s},  [x2], #64
581cabdff1aSopenharmony_ci        ld1             {v24.4s,v25.4s,v26.4s,v27.4s},  [x2], #64
582cabdff1aSopenharmony_ci        ld1             {v28.4s,v29.4s,v30.4s,v31.4s},  [x2], #64
583cabdff1aSopenharmony_ci        sub             x2,  x2,  #256
584cabdff1aSopenharmony_ci        st1             {v4.4s,v5.4s,v6.4s,v7.4s},      [x2], #64
585cabdff1aSopenharmony_ci        st1             {v4.4s,v5.4s,v6.4s,v7.4s},      [x2], #64
586cabdff1aSopenharmony_ci        st1             {v4.4s,v5.4s,v6.4s,v7.4s},      [x2], #64
587cabdff1aSopenharmony_ci        st1             {v4.4s,v5.4s,v6.4s,v7.4s},      [x2], #64
588cabdff1aSopenharmony_ci
        // First pass, run once on the even and once on the odd numbered
        // registers. idct_idct can use v2-v7 as temporaries; the other
        // variants keep iadst coefficients in v2/v3 and use v4-v9 instead.
589cabdff1aSopenharmony_ci.ifc \txfm1\()_\txfm2,idct_idct
590cabdff1aSopenharmony_ci        idct8           v16, v18, v20, v22, v24, v26, v28, v30, v2,  v3,  v4,  v5,  v6,  v7
591cabdff1aSopenharmony_ci        idct8           v17, v19, v21, v23, v25, v27, v29, v31, v2,  v3,  v4,  v5,  v6,  v7
592cabdff1aSopenharmony_ci.else
593cabdff1aSopenharmony_ci        \txfm1\()8      v16, v18, v20, v22, v24, v26, v28, v30, v4,  v5,  v6,  v7,  v8,  v9
594cabdff1aSopenharmony_ci        \txfm1\()8      v17, v19, v21, v23, v25, v27, v29, v31, v4,  v5,  v6,  v7,  v8,  v9
595cabdff1aSopenharmony_ci.endif
596cabdff1aSopenharmony_ci
597cabdff1aSopenharmony_ci        // Transpose the 8x8 block of 32 bit elements held in v16-v31
598cabdff1aSopenharmony_ci        transpose_8x8s  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v4, v5, v6, v7
599cabdff1aSopenharmony_ci
        // Second pass, same register layout as the first one.
600cabdff1aSopenharmony_ci.ifc \txfm1\()_\txfm2,idct_idct
601cabdff1aSopenharmony_ci        idct8           v16, v18, v20, v22, v24, v26, v28, v30, v2,  v3,  v4,  v5,  v6,  v7
602cabdff1aSopenharmony_ci        idct8           v17, v19, v21, v23, v25, v27, v29, v31, v2,  v3,  v4,  v5,  v6,  v7
603cabdff1aSopenharmony_ci.else
604cabdff1aSopenharmony_ci        \txfm2\()8      v16, v18, v20, v22, v24, v26, v28, v30, v4,  v5,  v6,  v7,  v8,  v9
605cabdff1aSopenharmony_ci        \txfm2\()8      v17, v19, v21, v23, v25, v27, v29, v31, v4,  v5,  v6,  v7,  v8,  v9
606cabdff1aSopenharmony_ci.endif
        // Note: nothing inside this macro branches to the 2: label.
607cabdff1aSopenharmony_ci2:
        // x0 is the read pointer, x3 the write pointer for the pixel rows.
608cabdff1aSopenharmony_ci        mov             x3,  x0
609cabdff1aSopenharmony_ci        // Add into the destination
        // Per row (register pair v16/v17, v18/v19, ...): rounding shift by 5,
        // add the zero-extended pixels, saturating narrow to unsigned 16 bit,
        // clamp against the w5 value and store. The steps of consecutive
        // rows are interleaved.
610cabdff1aSopenharmony_ci        ld1             {v0.8h},  [x0], x1
611cabdff1aSopenharmony_ci        srshr           v16.4s, v16.4s, #5
612cabdff1aSopenharmony_ci        srshr           v17.4s, v17.4s, #5
613cabdff1aSopenharmony_ci        ld1             {v1.8h},  [x0], x1
614cabdff1aSopenharmony_ci        srshr           v18.4s, v18.4s, #5
615cabdff1aSopenharmony_ci        srshr           v19.4s, v19.4s, #5
616cabdff1aSopenharmony_ci        ld1             {v2.8h},  [x0], x1
617cabdff1aSopenharmony_ci        srshr           v20.4s, v20.4s, #5
618cabdff1aSopenharmony_ci        srshr           v21.4s, v21.4s, #5
619cabdff1aSopenharmony_ci        uaddw           v16.4s, v16.4s, v0.4h
620cabdff1aSopenharmony_ci        uaddw2          v17.4s, v17.4s, v0.8h
621cabdff1aSopenharmony_ci        ld1             {v3.8h},  [x0], x1
622cabdff1aSopenharmony_ci        srshr           v22.4s, v22.4s, #5
623cabdff1aSopenharmony_ci        srshr           v23.4s, v23.4s, #5
624cabdff1aSopenharmony_ci        uaddw           v18.4s, v18.4s, v1.4h
625cabdff1aSopenharmony_ci        uaddw2          v19.4s, v19.4s, v1.8h
626cabdff1aSopenharmony_ci        ld1             {v4.8h},  [x0], x1
627cabdff1aSopenharmony_ci        srshr           v24.4s, v24.4s, #5
628cabdff1aSopenharmony_ci        srshr           v25.4s, v25.4s, #5
629cabdff1aSopenharmony_ci        uaddw           v20.4s, v20.4s, v2.4h
630cabdff1aSopenharmony_ci        uaddw2          v21.4s, v21.4s, v2.8h
631cabdff1aSopenharmony_ci        sqxtun          v0.4h,  v16.4s
632cabdff1aSopenharmony_ci        sqxtun2         v0.8h,  v17.4s
        // Row 0 has been narrowed into v0, so v16 is free to hold the
        // clamp value for the remainder of the function.
633cabdff1aSopenharmony_ci        dup             v16.8h, w5
634cabdff1aSopenharmony_ci        ld1             {v5.8h},  [x0], x1
635cabdff1aSopenharmony_ci        srshr           v26.4s, v26.4s, #5
636cabdff1aSopenharmony_ci        srshr           v27.4s, v27.4s, #5
637cabdff1aSopenharmony_ci        uaddw           v22.4s, v22.4s, v3.4h
638cabdff1aSopenharmony_ci        uaddw2          v23.4s, v23.4s, v3.8h
639cabdff1aSopenharmony_ci        sqxtun          v1.4h,  v18.4s
640cabdff1aSopenharmony_ci        sqxtun2         v1.8h,  v19.4s
641cabdff1aSopenharmony_ci        umin            v0.8h,  v0.8h,  v16.8h
642cabdff1aSopenharmony_ci        ld1             {v6.8h},  [x0], x1
643cabdff1aSopenharmony_ci        srshr           v28.4s, v28.4s, #5
644cabdff1aSopenharmony_ci        srshr           v29.4s, v29.4s, #5
645cabdff1aSopenharmony_ci        uaddw           v24.4s, v24.4s, v4.4h
646cabdff1aSopenharmony_ci        uaddw2          v25.4s, v25.4s, v4.8h
647cabdff1aSopenharmony_ci        sqxtun          v2.4h,  v20.4s
648cabdff1aSopenharmony_ci        sqxtun2         v2.8h,  v21.4s
649cabdff1aSopenharmony_ci        umin            v1.8h,  v1.8h,  v16.8h
650cabdff1aSopenharmony_ci        ld1             {v7.8h},  [x0], x1
651cabdff1aSopenharmony_ci        srshr           v30.4s, v30.4s, #5
652cabdff1aSopenharmony_ci        srshr           v31.4s, v31.4s, #5
653cabdff1aSopenharmony_ci        uaddw           v26.4s, v26.4s, v5.4h
654cabdff1aSopenharmony_ci        uaddw2          v27.4s, v27.4s, v5.8h
655cabdff1aSopenharmony_ci        sqxtun          v3.4h,  v22.4s
656cabdff1aSopenharmony_ci        sqxtun2         v3.8h,  v23.4s
657cabdff1aSopenharmony_ci        umin            v2.8h,  v2.8h,  v16.8h
658cabdff1aSopenharmony_ci
        // All eight rows have been loaded through x0; the stores below go
        // through x3.
659cabdff1aSopenharmony_ci        st1             {v0.8h},  [x3], x1
660cabdff1aSopenharmony_ci        uaddw           v28.4s, v28.4s, v6.4h
661cabdff1aSopenharmony_ci        uaddw2          v29.4s, v29.4s, v6.8h
662cabdff1aSopenharmony_ci        st1             {v1.8h},  [x3], x1
663cabdff1aSopenharmony_ci        sqxtun          v4.4h,  v24.4s
664cabdff1aSopenharmony_ci        sqxtun2         v4.8h,  v25.4s
665cabdff1aSopenharmony_ci        umin            v3.8h,  v3.8h,  v16.8h
666cabdff1aSopenharmony_ci        st1             {v2.8h},  [x3], x1
667cabdff1aSopenharmony_ci        uaddw           v30.4s, v30.4s, v7.4h
668cabdff1aSopenharmony_ci        uaddw2          v31.4s, v31.4s, v7.8h
669cabdff1aSopenharmony_ci        st1             {v3.8h},  [x3], x1
670cabdff1aSopenharmony_ci        sqxtun          v5.4h,  v26.4s
671cabdff1aSopenharmony_ci        sqxtun2         v5.8h,  v27.4s
672cabdff1aSopenharmony_ci        umin            v4.8h,  v4.8h,  v16.8h
673cabdff1aSopenharmony_ci        st1             {v4.8h},  [x3], x1
674cabdff1aSopenharmony_ci        sqxtun          v6.4h,  v28.4s
675cabdff1aSopenharmony_ci        sqxtun2         v6.8h,  v29.4s
676cabdff1aSopenharmony_ci        umin            v5.8h,  v5.8h,  v16.8h
677cabdff1aSopenharmony_ci        st1             {v5.8h},  [x3], x1
678cabdff1aSopenharmony_ci        sqxtun          v7.4h,  v30.4s
679cabdff1aSopenharmony_ci        sqxtun2         v7.8h,  v31.4s
680cabdff1aSopenharmony_ci        umin            v6.8h,  v6.8h,  v16.8h
681cabdff1aSopenharmony_ci
682cabdff1aSopenharmony_ci        st1             {v6.8h},  [x3], x1
683cabdff1aSopenharmony_ci        umin            v7.8h,  v7.8h,  v16.8h
684cabdff1aSopenharmony_ci        st1             {v7.8h},  [x3], x1
685cabdff1aSopenharmony_ci
        // Undo the d8/d9 save made in the non idct_idct setup above.
686cabdff1aSopenharmony_ci.ifnc \txfm1\()_\txfm2,idct_idct
687cabdff1aSopenharmony_ci        ldp             d8,  d9,  [sp], 0x10
688cabdff1aSopenharmony_ci.endif
689cabdff1aSopenharmony_ci        ret
690cabdff1aSopenharmony_ciendfunc
691cabdff1aSopenharmony_ci
        // Exported entry point: load the clamp value 0x03ff into x5 and
        // continue in the shared worker.
692cabdff1aSopenharmony_cifunction ff_vp9_\txfm1\()_\txfm2\()_8x8_add_10_neon, export=1
693cabdff1aSopenharmony_ci        mov             x5,  #0x03ff
694cabdff1aSopenharmony_ci        b               vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
695cabdff1aSopenharmony_ciendfunc
696cabdff1aSopenharmony_ci
        // Exported entry point: load the clamp value 0x0fff into x5 and
        // continue in the shared worker.
697cabdff1aSopenharmony_cifunction ff_vp9_\txfm1\()_\txfm2\()_8x8_add_12_neon, export=1
698cabdff1aSopenharmony_ci        mov             x5,  #0x0fff
699cabdff1aSopenharmony_ci        b               vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
700cabdff1aSopenharmony_ciendfunc
701cabdff1aSopenharmony_ci.endm
702cabdff1aSopenharmony_ci
// Emit the 8x8 functions for all four idct/iadst pairings. Each expansion
// produces one shared worker plus its _10 and _12 exported entry points.
703cabdff1aSopenharmony_ciitxfm_func8x8 idct,  idct
704cabdff1aSopenharmony_ciitxfm_func8x8 iadst, idct
705cabdff1aSopenharmony_ciitxfm_func8x8 idct,  iadst
706cabdff1aSopenharmony_ciitxfm_func8x8 iadst, iadst
707cabdff1aSopenharmony_ci
708cabdff1aSopenharmony_ci
// DC-only path for a 16x16 block: scale the single DC coefficient and add
// the resulting constant to all 16x16 pixels.
//
// In:    x0  = pixel rows to update (16 halfword pixels are read/written per row)
//        x1  = step added to the row pointer after each row
//        x2  = pointer to the 32 bit DC coefficient (overwritten with zero)
//        w13 = upper clamp for the output pixels; it is not set in this
//              function, so the caller (outside this section) must provide it
// Clobb: x0, x3, x4, v0-v4, v16-v23, v31, flags
709cabdff1aSopenharmony_cifunction idct16x16_dc_add_neon
710cabdff1aSopenharmony_ci        movrel          x4,  idct_coeffs
711cabdff1aSopenharmony_ci        ld1             {v0.4h}, [x4]
        // Widen the coefficients to 32 bit; v0.s[0] is the first entry of
        // idct_coeffs (11585).
712cabdff1aSopenharmony_ci        sxtl            v0.4s,  v0.4h
713cabdff1aSopenharmony_ci
714cabdff1aSopenharmony_ci        movi            v1.4h,  #0
715cabdff1aSopenharmony_ci
        // Multiply the DC value by v0.s[0] twice, with a rounding narrowing
        // shift by 14 after each multiply.
716cabdff1aSopenharmony_ci        ld1             {v2.s}[0],  [x2]
717cabdff1aSopenharmony_ci        smull           v2.2d,  v2.2s,  v0.s[0]
718cabdff1aSopenharmony_ci        rshrn           v2.2s,  v2.2d,  #14
719cabdff1aSopenharmony_ci        smull           v2.2d,  v2.2s,  v0.s[0]
720cabdff1aSopenharmony_ci        rshrn           v2.2s,  v2.2d,  #14
        // Clear the coefficient in memory (v1 is zero).
721cabdff1aSopenharmony_ci        st1             {v1.s}[0],  [x2]
722cabdff1aSopenharmony_ci        dup             v2.4s,  v2.s[0]
723cabdff1aSopenharmony_ci
        // Final rounding shift; the result goes to v0 because v1-v4 are
        // reused for pixel data inside the loop.
724cabdff1aSopenharmony_ci        srshr           v0.4s,  v2.4s,  #6
725cabdff1aSopenharmony_ci
        // x0 = read pointer, x3 = write pointer, x4 = rows remaining,
        // v31 = clamp value broadcast to all halfword lanes.
726cabdff1aSopenharmony_ci        mov             x3, x0
727cabdff1aSopenharmony_ci        mov             x4, #16
728cabdff1aSopenharmony_ci        dup             v31.8h, w13
729cabdff1aSopenharmony_ci1:
730cabdff1aSopenharmony_ci        // Loop to add the constant from v0 into all 16x16 outputs
        // Two rows of 16 pixels are handled per iteration.
731cabdff1aSopenharmony_ci        subs            x4,  x4,  #2
732cabdff1aSopenharmony_ci        ld1             {v1.8h,v2.8h},  [x0], x1
733cabdff1aSopenharmony_ci        uaddw           v16.4s, v0.4s,  v1.4h
734cabdff1aSopenharmony_ci        uaddw2          v17.4s, v0.4s,  v1.8h
735cabdff1aSopenharmony_ci        ld1             {v3.8h,v4.8h},  [x0], x1
736cabdff1aSopenharmony_ci        uaddw           v18.4s, v0.4s,  v2.4h
737cabdff1aSopenharmony_ci        uaddw2          v19.4s, v0.4s,  v2.8h
738cabdff1aSopenharmony_ci        uaddw           v20.4s, v0.4s,  v3.4h
739cabdff1aSopenharmony_ci        uaddw2          v21.4s, v0.4s,  v3.8h
740cabdff1aSopenharmony_ci        uaddw           v22.4s, v0.4s,  v4.4h
741cabdff1aSopenharmony_ci        uaddw2          v23.4s, v0.4s,  v4.8h
        // Saturating narrow to unsigned 16 bit (negative sums become 0),
        // then clamp to the maximum in v31.
742cabdff1aSopenharmony_ci        sqxtun          v1.4h,  v16.4s
743cabdff1aSopenharmony_ci        sqxtun2         v1.8h,  v17.4s
744cabdff1aSopenharmony_ci        sqxtun          v2.4h,  v18.4s
745cabdff1aSopenharmony_ci        sqxtun2         v2.8h,  v19.4s
746cabdff1aSopenharmony_ci        sqxtun          v3.4h,  v20.4s
747cabdff1aSopenharmony_ci        sqxtun2         v3.8h,  v21.4s
748cabdff1aSopenharmony_ci        sqxtun          v4.4h,  v22.4s
749cabdff1aSopenharmony_ci        sqxtun2         v4.8h,  v23.4s
750cabdff1aSopenharmony_ci        umin            v1.8h,  v1.8h,  v31.8h
751cabdff1aSopenharmony_ci        umin            v2.8h,  v2.8h,  v31.8h
752cabdff1aSopenharmony_ci        st1             {v1.8h,v2.8h},  [x3], x1
753cabdff1aSopenharmony_ci        umin            v3.8h,  v3.8h,  v31.8h
754cabdff1aSopenharmony_ci        umin            v4.8h,  v4.8h,  v31.8h
755cabdff1aSopenharmony_ci        st1             {v3.8h,v4.8h},  [x3], x1
        // Flags are still those set by the subs at the top of the loop.
756cabdff1aSopenharmony_ci        b.ne            1b
757cabdff1aSopenharmony_ci
758cabdff1aSopenharmony_ci        ret
759cabdff1aSopenharmony_ciendfunc
760cabdff1aSopenharmony_ci
// Shared final stages of the 16 point idct. Expanded as the last statement
// of idct16, idct16_half and idct16_quarter; it finishes with ret, so it
// also returns from the function that expands it.
// The intermediate values expected in each register and the outputs produced
// are listed in the trailing comments; on exit v16-v31 hold out[0]-out[15].
// v8/v9 are used as temporaries and are not saved here, so d8/d9 have to be
// preserved further up the call chain (not visible in this section).
761cabdff1aSopenharmony_ci.macro idct16_end
762cabdff1aSopenharmony_ci        butterfly_4s    v18, v7,  v4,  v7                // v18 = t0a,  v7  = t7a
763cabdff1aSopenharmony_ci        butterfly_4s    v19, v22, v5,  v22               // v19 = t1a,  v22 = t6
764cabdff1aSopenharmony_ci        butterfly_4s    v4,  v26, v20, v26               // v4  = t2a,  v26 = t5
765cabdff1aSopenharmony_ci        butterfly_4s    v5,  v6,  v28, v6                // v5  = t3a,  v6  = t4
766cabdff1aSopenharmony_ci        butterfly_4s    v20, v28, v16, v24               // v20 = t8a,  v28 = t11a
767cabdff1aSopenharmony_ci        butterfly_4s    v24, v21, v23, v21               // v24 = t9,   v21 = t10
768cabdff1aSopenharmony_ci        butterfly_4s    v23, v27, v25, v27               // v23 = t14,  v27 = t13
769cabdff1aSopenharmony_ci        butterfly_4s    v25, v29, v29, v17               // v25 = t15a, v29 = t12a
770cabdff1aSopenharmony_ci
771cabdff1aSopenharmony_ci        dmbutterfly0    v8,  v9,  v27, v21, v8,  v9,  v16, v17, v30, v31 // v8  = t13a, v9  = t10a
772cabdff1aSopenharmony_ci        dmbutterfly0    v28, v27, v29, v28, v21, v29, v16, v17, v30, v31 // v28 = t12,  v27 = t11
773cabdff1aSopenharmony_ci
774cabdff1aSopenharmony_ci        butterfly_4s    v16, v31, v18, v25               // v16 = out[0], v31 = out[15]
775cabdff1aSopenharmony_ci        butterfly_4s    v17, v30, v19, v23               // v17 = out[1], v30 = out[14]
776cabdff1aSopenharmony_ci        butterfly_4s_r  v25, v22, v22, v24               // v25 = out[9], v22 = out[6]
777cabdff1aSopenharmony_ci        butterfly_4s    v23, v24, v7,  v20               // v23 = out[7], v24 = out[8]
778cabdff1aSopenharmony_ci        butterfly_4s    v18, v29, v4,  v8                // v18 = out[2], v29 = out[13]
779cabdff1aSopenharmony_ci        butterfly_4s    v19, v28, v5,  v28               // v19 = out[3], v28 = out[12]
780cabdff1aSopenharmony_ci        butterfly_4s    v20, v27, v6,  v27               // v20 = out[4], v27 = out[11]
781cabdff1aSopenharmony_ci        butterfly_4s    v21, v26, v26, v9                // v21 = out[5], v26 = out[10]
782cabdff1aSopenharmony_ci        ret
783cabdff1aSopenharmony_ci.endm
784cabdff1aSopenharmony_ci
// Full 16 point inverse DCT, operating in place on v16-v31.
// Coefficients are read from the 32 bit lanes of v0-v3, which must be set up
// by the caller (the load is not part of this function).
// v4-v9 are used as temporaries. The function performs no stack accesses and
// has no explicit ret: the return comes from the idct16_end expansion.
// On exit v16-v31 hold out[0]-out[15].
785cabdff1aSopenharmony_cifunction idct16
786cabdff1aSopenharmony_ci        dmbutterfly0    v16, v24, v16, v24, v4, v5, v6, v7, v8, v9 // v16 = t0a,  v24 = t1a
787cabdff1aSopenharmony_ci        dmbutterfly     v20, v28, v0.s[2], v0.s[3], v4, v5, v6, v7 // v20 = t2a,  v28 = t3a
788cabdff1aSopenharmony_ci        dmbutterfly     v18, v30, v1.s[0], v1.s[1], v4, v5, v6, v7 // v18 = t4a,  v30 = t7a
789cabdff1aSopenharmony_ci        dmbutterfly     v26, v22, v1.s[2], v1.s[3], v4, v5, v6, v7 // v26 = t5a,  v22 = t6a
790cabdff1aSopenharmony_ci        dmbutterfly     v17, v31, v2.s[0], v2.s[1], v4, v5, v6, v7 // v17 = t8a,  v31 = t15a
791cabdff1aSopenharmony_ci        dmbutterfly     v25, v23, v2.s[2], v2.s[3], v4, v5, v6, v7 // v25 = t9a,  v23 = t14a
792cabdff1aSopenharmony_ci        dmbutterfly     v21, v27, v3.s[0], v3.s[1], v4, v5, v6, v7 // v21 = t10a, v27 = t13a
793cabdff1aSopenharmony_ci        dmbutterfly     v29, v19, v3.s[2], v3.s[3], v4, v5, v6, v7 // v29 = t11a, v19 = t12a
794cabdff1aSopenharmony_ci
795cabdff1aSopenharmony_ci        butterfly_4s    v4,  v28, v16, v28               // v4  = t0,   v28 = t3
796cabdff1aSopenharmony_ci        butterfly_4s    v5,  v20, v24, v20               // v5  = t1,   v20 = t2
797cabdff1aSopenharmony_ci        butterfly_4s    v6,  v26, v18, v26               // v6  = t4,   v26 = t5
798cabdff1aSopenharmony_ci        butterfly_4s    v7,  v22, v30, v22               // v7  = t7,   v22 = t6
799cabdff1aSopenharmony_ci        butterfly_4s    v16, v25, v17, v25               // v16 = t8,   v25 = t9
800cabdff1aSopenharmony_ci        butterfly_4s    v24, v21, v29, v21               // v24 = t11,  v21 = t10
801cabdff1aSopenharmony_ci        butterfly_4s    v17, v27, v19, v27               // v17 = t12,  v27 = t13
802cabdff1aSopenharmony_ci        butterfly_4s    v29, v23, v31, v23               // v29 = t15,  v23 = t14
803cabdff1aSopenharmony_ci
804cabdff1aSopenharmony_ci        dmbutterfly0    v22, v26, v22, v26, v8, v9, v18, v19, v30, v31        // v22 = t6a,  v26 = t5a
805cabdff1aSopenharmony_ci        dmbutterfly     v23, v25, v0.s[2], v0.s[3], v18, v19, v30, v31        // v23 = t9a,  v25 = t14a
806cabdff1aSopenharmony_ci        dmbutterfly     v27, v21, v0.s[2], v0.s[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
        // Final stages shared with the other idct16 variants; includes the ret.
807cabdff1aSopenharmony_ci        idct16_end
808cabdff1aSopenharmony_ciendfunc
809cabdff1aSopenharmony_ci
// Variant of idct16 whose first stage uses the _h, _h1 and _h2 forms of the
// butterfly macros; everything after the first stage is identical to idct16.
// NOTE(review): presumably intended for inputs where only part of the
// coefficients can be non-zero - the _h* macro definitions are outside this
// section, confirm against them.
// Coefficients are read from the 32 bit lanes of v0-v3, v4-v9 are used as
// temporaries, and the return comes from the idct16_end expansion.
// On exit v16-v31 hold out[0]-out[15].
810cabdff1aSopenharmony_cifunction idct16_half
811cabdff1aSopenharmony_ci        dmbutterfly0_h  v16, v24, v16, v24, v4, v5, v6, v7, v8, v9 // v16 = t0a,  v24 = t1a
812cabdff1aSopenharmony_ci        dmbutterfly_h1  v20, v28, v0.s[2], v0.s[3], v4, v5, v6, v7 // v20 = t2a,  v28 = t3a
813cabdff1aSopenharmony_ci        dmbutterfly_h1  v18, v30, v1.s[0], v1.s[1], v4, v5, v6, v7 // v18 = t4a,  v30 = t7a
814cabdff1aSopenharmony_ci        dmbutterfly_h2  v26, v22, v1.s[2], v1.s[3], v4, v5, v6, v7 // v26 = t5a,  v22 = t6a
815cabdff1aSopenharmony_ci        dmbutterfly_h1  v17, v31, v2.s[0], v2.s[1], v4, v5, v6, v7 // v17 = t8a,  v31 = t15a
816cabdff1aSopenharmony_ci        dmbutterfly_h2  v25, v23, v2.s[2], v2.s[3], v4, v5, v6, v7 // v25 = t9a,  v23 = t14a
817cabdff1aSopenharmony_ci        dmbutterfly_h1  v21, v27, v3.s[0], v3.s[1], v4, v5, v6, v7 // v21 = t10a, v27 = t13a
818cabdff1aSopenharmony_ci        dmbutterfly_h2  v29, v19, v3.s[2], v3.s[3], v4, v5, v6, v7 // v29 = t11a, v19 = t12a
819cabdff1aSopenharmony_ci
820cabdff1aSopenharmony_ci        butterfly_4s    v4,  v28, v16, v28               // v4  = t0,   v28 = t3
821cabdff1aSopenharmony_ci        butterfly_4s    v5,  v20, v24, v20               // v5  = t1,   v20 = t2
822cabdff1aSopenharmony_ci        butterfly_4s    v6,  v26, v18, v26               // v6  = t4,   v26 = t5
823cabdff1aSopenharmony_ci        butterfly_4s    v7,  v22, v30, v22               // v7  = t7,   v22 = t6
824cabdff1aSopenharmony_ci        butterfly_4s    v16, v25, v17, v25               // v16 = t8,   v25 = t9
825cabdff1aSopenharmony_ci        butterfly_4s    v24, v21, v29, v21               // v24 = t11,  v21 = t10
826cabdff1aSopenharmony_ci        butterfly_4s    v17, v27, v19, v27               // v17 = t12,  v27 = t13
827cabdff1aSopenharmony_ci        butterfly_4s    v29, v23, v31, v23               // v29 = t15,  v23 = t14
828cabdff1aSopenharmony_ci
829cabdff1aSopenharmony_ci        dmbutterfly0    v22, v26, v22, v26, v8, v9, v18, v19, v30, v31        // v22 = t6a,  v26 = t5a
830cabdff1aSopenharmony_ci        dmbutterfly     v23, v25, v0.s[2], v0.s[3], v18, v19, v30, v31        // v23 = t9a,  v25 = t14a
831cabdff1aSopenharmony_ci        dmbutterfly     v27, v21, v0.s[2], v0.s[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
        // Final stages shared with the other idct16 variants; includes the ret.
832cabdff1aSopenharmony_ci        idct16_end
833cabdff1aSopenharmony_ciendfunc
834cabdff1aSopenharmony_ci
// Reduced 16 point idct that reads only v16-v19 as data inputs; every other
// register in v20-v31 is written before it is read.
// Coefficients are read from the 32 bit lanes of v0-v3. The dsmull_h and
// drshrn_h helper macros are defined outside this section.
// NOTE(review): the operand order of dsmull_h/drshrn_h is assumed to be
// destination(s) first, as for the underlying instructions - confirm
// against the macro definitions.
// The return comes from the idct16_end expansion; on exit v16-v31 hold
// out[0]-out[15].
835cabdff1aSopenharmony_cifunction idct16_quarter
        // First stage: each input is multiplied by single coefficient lanes
        // instead of going through a two-input butterfly.
836cabdff1aSopenharmony_ci        dsmull_h        v24, v25, v19, v3.s[3]
837cabdff1aSopenharmony_ci        dsmull_h        v4,  v5,  v17, v2.s[0]
838cabdff1aSopenharmony_ci        dsmull_h        v7,  v6,  v18, v1.s[1]
839cabdff1aSopenharmony_ci        dsmull_h        v30, v31, v18, v1.s[0]
        // Negate the 64 bit products of v19 * v3.s[3] before narrowing.
840cabdff1aSopenharmony_ci        neg             v24.2d,  v24.2d
841cabdff1aSopenharmony_ci        neg             v25.2d,  v25.2d
842cabdff1aSopenharmony_ci        dsmull_h        v29, v28, v17, v2.s[1]
843cabdff1aSopenharmony_ci        dsmull_h        v26, v27, v19, v3.s[2]
844cabdff1aSopenharmony_ci        dsmull_h        v22, v23, v16, v0.s[0]
        // Narrow all products back with a rounding shift by 14.
845cabdff1aSopenharmony_ci        drshrn_h        v24, v24, v25, #14
846cabdff1aSopenharmony_ci        drshrn_h        v16, v4,  v5,  #14
847cabdff1aSopenharmony_ci        drshrn_h        v7,  v7,  v6,  #14
848cabdff1aSopenharmony_ci        drshrn_h        v6,  v30, v31, #14
849cabdff1aSopenharmony_ci        drshrn_h        v29, v29, v28, #14
850cabdff1aSopenharmony_ci        drshrn_h        v17, v26, v27, #14
851cabdff1aSopenharmony_ci        drshrn_h        v28, v22, v23, #14
852cabdff1aSopenharmony_ci
853cabdff1aSopenharmony_ci        dmbutterfly_l   v20, v21, v22, v23, v17, v24, v0.s[2], v0.s[3]
854cabdff1aSopenharmony_ci        dmbutterfly_l   v18, v19, v30, v31, v29, v16, v0.s[2], v0.s[3]
855cabdff1aSopenharmony_ci        neg             v22.2d,  v22.2d
856cabdff1aSopenharmony_ci        neg             v23.2d,  v23.2d
857cabdff1aSopenharmony_ci        drshrn_h        v27, v20, v21, #14
858cabdff1aSopenharmony_ci        drshrn_h        v21, v22, v23, #14
859cabdff1aSopenharmony_ci        drshrn_h        v23, v18, v19, #14
860cabdff1aSopenharmony_ci        drshrn_h        v25, v30, v31, #14
        // v28 (the scaled v16 input) is copied into v4, v5 and v20, so
        // idct16_end sees the same value in v4, v5, v20 and v28.
861cabdff1aSopenharmony_ci        mov             v4.16b,  v28.16b
862cabdff1aSopenharmony_ci        mov             v5.16b,  v28.16b
863cabdff1aSopenharmony_ci        dmbutterfly0    v22, v26, v7,  v6,  v18, v19, v30, v31
864cabdff1aSopenharmony_ci        mov             v20.16b, v28.16b
        // Final stages shared with the other idct16 variants; includes the ret.
865cabdff1aSopenharmony_ci        idct16_end
866cabdff1aSopenharmony_ciendfunc
867cabdff1aSopenharmony_ci
868cabdff1aSopenharmony_cifunction iadst16
869cabdff1aSopenharmony_ci        ld1             {v0.8h,v1.8h}, [x11]
870cabdff1aSopenharmony_ci        sxtl            v2.4s,  v1.4h
871cabdff1aSopenharmony_ci        sxtl2           v3.4s,  v1.8h
872cabdff1aSopenharmony_ci        sxtl2           v1.4s,  v0.8h
873cabdff1aSopenharmony_ci        sxtl            v0.4s,  v0.4h
874cabdff1aSopenharmony_ci
875cabdff1aSopenharmony_ci        dmbutterfly_l   v6,  v7,  v4,  v5,  v31, v16, v0.s[1], v0.s[0]   // v6,v7   = t1,   v4,v5   = t0
876cabdff1aSopenharmony_ci        dmbutterfly_l   v10, v11, v8,  v9,  v23, v24, v1.s[1], v1.s[0]   // v10,v11 = t9,   v8,v9   = t8
877cabdff1aSopenharmony_ci        dbutterfly_n    v31, v24, v6,  v7,  v10, v11, v12, v13, v10, v11 // v31     = t1a,  v24     = t9a
878cabdff1aSopenharmony_ci        dmbutterfly_l   v14, v15, v12, v13, v29, v18, v0.s[3], v0.s[2]   // v14,v15 = t3,   v12,v13 = t2
879cabdff1aSopenharmony_ci        dbutterfly_n    v16, v23, v4,  v5,  v8,  v9,  v6,  v7,  v8,  v9  // v16     = t0a,  v23     = t8a
880cabdff1aSopenharmony_ci
881cabdff1aSopenharmony_ci        dmbutterfly_l   v6,  v7,  v4,  v5,  v21, v26, v1.s[3], v1.s[2]   // v6,v7   = t11,  v4,v5   = t10
882cabdff1aSopenharmony_ci        dbutterfly_n    v29, v26, v14, v15, v6,  v7,  v8,  v9,  v6,  v7  // v29     = t3a,  v26     = t11a
883cabdff1aSopenharmony_ci        dmbutterfly_l   v10, v11, v8,  v9,  v27, v20, v2.s[1], v2.s[0]   // v10,v11 = t5,   v8,v9   = t4
884cabdff1aSopenharmony_ci        dbutterfly_n    v18, v21, v12, v13, v4,  v5,  v6,  v7,  v4,  v5  // v18     = t2a,  v21     = t10a
885cabdff1aSopenharmony_ci
886cabdff1aSopenharmony_ci        dmbutterfly_l   v14, v15, v12, v13, v19, v28, v3.s[1], v3.s[0]   // v14,v15 = t13,  v12,v13 = t12
887cabdff1aSopenharmony_ci        dbutterfly_n    v20, v28, v10, v11, v14, v15, v4,  v5,  v14, v15 // v20     = t5a,  v28     = t13a
888cabdff1aSopenharmony_ci        dmbutterfly_l   v6,  v7,  v4,  v5,  v25, v22, v2.s[3], v2.s[2]   // v6,v7   = t7,   v4,v5   = t6
889cabdff1aSopenharmony_ci        dbutterfly_n    v27, v19, v8,  v9,  v12, v13, v10, v11, v12, v13 // v27     = t4a,  v19     = t12a
890cabdff1aSopenharmony_ci
891cabdff1aSopenharmony_ci        dmbutterfly_l   v10, v11, v8,  v9,  v17, v30, v3.s[3], v3.s[2]   // v10,v11 = t15,  v8,v9   = t14
892cabdff1aSopenharmony_ci        ld1             {v0.8h}, [x10]
893cabdff1aSopenharmony_ci        dbutterfly_n    v22, v30, v6,  v7,  v10, v11, v12, v13, v10, v11 // v22     = t7a,  v30     = t15a
894cabdff1aSopenharmony_ci        sxtl2           v1.4s,  v0.8h
895cabdff1aSopenharmony_ci        sxtl            v0.4s,  v0.4h
896cabdff1aSopenharmony_ci        dmbutterfly_l   v14, v15, v12, v13, v23, v24, v1.s[0], v1.s[1]   // v14,v15 = t9,   v12,v13 = t8
897cabdff1aSopenharmony_ci        dbutterfly_n    v25, v17, v4,  v5,  v8,  v9,  v6,  v7,  v8,  v9  // v25     = t6a,  v17     = t14a
898cabdff1aSopenharmony_ci
899cabdff1aSopenharmony_ci        dmbutterfly_l   v4,  v5,  v6,  v7,  v28, v19, v1.s[1], v1.s[0]   // v4,v5   = t12,  v6,v7   = t13
900cabdff1aSopenharmony_ci        dbutterfly_n    v23, v19, v12, v13, v4,  v5,  v8,  v9,  v4,  v5  // v23     = t8a,  v19     = t12a
901cabdff1aSopenharmony_ci        dmbutterfly_l   v10, v11, v8,  v9,  v21, v26, v1.s[2], v1.s[3]   // v10,v11 = t11,  v8,v9   = t10
902cabdff1aSopenharmony_ci        butterfly_4s_r  v4,  v27, v16, v27               // v4  = t4,   v27 = t0
903cabdff1aSopenharmony_ci        dbutterfly_n    v24, v28, v14, v15, v6,  v7,  v12, v13, v6,  v7  // v24     = t9a,  v28     = t13a
904cabdff1aSopenharmony_ci
905cabdff1aSopenharmony_ci        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v1.s[3], v1.s[2]   // v12,v13 = t14,  v14,v15 = t15
906cabdff1aSopenharmony_ci        butterfly_4s_r  v5,  v20, v31, v20               // v5  = t5, v20 = t1
907cabdff1aSopenharmony_ci        dbutterfly_n    v21, v17, v8,  v9,  v12, v13, v6,  v7,  v12, v13 // v21     = t10a, v17     = t14a
908cabdff1aSopenharmony_ci        dbutterfly_n    v26, v30, v10, v11, v14, v15, v8,  v9,  v14, v15 // v26     = t11a, v30     = t15a
909cabdff1aSopenharmony_ci
910cabdff1aSopenharmony_ci        butterfly_4s_r  v6,  v25, v18, v25               // v6  = t6, v25 = t2
911cabdff1aSopenharmony_ci        butterfly_4s_r  v7,  v22, v29, v22               // v7  = t7, v22 = t3
912cabdff1aSopenharmony_ci
913cabdff1aSopenharmony_ci        dmbutterfly_l   v10, v11, v8,  v9,  v19, v28, v0.s[2], v0.s[3]   // v10,v11 = t13,  v8,v9   = t12
914cabdff1aSopenharmony_ci        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.s[3], v0.s[2]   // v12,v13 = t14,  v14,v15 = t15
915cabdff1aSopenharmony_ci
916cabdff1aSopenharmony_ci        dbutterfly_n    v18, v30, v8,  v9,  v12, v13, v16, v17, v12, v13 // v18   = out[2], v30     = t14a
917cabdff1aSopenharmony_ci        dbutterfly_n    v29, v17, v10, v11, v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17     = t15a
918cabdff1aSopenharmony_ci        neg             v29.4s, v29.4s                   // v29 = out[13]
919cabdff1aSopenharmony_ci
920cabdff1aSopenharmony_ci        dmbutterfly_l   v10, v11, v8,  v9,  v4,  v5,  v0.s[2], v0.s[3]   // v10,v11 = t5a,  v8,v9   = t4a
921cabdff1aSopenharmony_ci        dmbutterfly_l   v12, v13, v14, v15, v7,  v6,  v0.s[3], v0.s[2]   // v12,v13 = t6a,  v14,v15 = t7a
922cabdff1aSopenharmony_ci
923cabdff1aSopenharmony_ci        butterfly_4s    v2,  v6,  v27, v25               // v2 = out[0], v6 = t2a
924cabdff1aSopenharmony_ci        butterfly_4s    v3,  v7,  v23, v21               // v3 =-out[1], v7 = t10
925cabdff1aSopenharmony_ci
926cabdff1aSopenharmony_ci        dbutterfly_n    v19, v31, v8,  v9,  v12, v13, v4,  v5,  v8,  v9  // v19 = -out[3],  v31 = t6
927cabdff1aSopenharmony_ci        neg             v19.4s, v19.4s                   // v19 = out[3]
928cabdff1aSopenharmony_ci        dbutterfly_n    v28, v16, v10, v11, v14, v15, v4,  v5,  v10, v11 // v28 = out[12],  v16 = t7
929cabdff1aSopenharmony_ci
930cabdff1aSopenharmony_ci        butterfly_4s    v5,  v8,  v20, v22               // v5 =-out[15],v8 = t3a
931cabdff1aSopenharmony_ci        butterfly_4s    v4,  v9,  v24, v26               // v4 = out[14],v9 = t11
932cabdff1aSopenharmony_ci
933cabdff1aSopenharmony_ci        dmbutterfly0    v23, v24, v6,  v8,  v10, v11, v12, v13, v14, v15, 1 // v23 = out[7], v24 = out[8]
934cabdff1aSopenharmony_ci        dmbutterfly0    v21, v26, v30, v17, v10, v11, v12, v13, v14, v15, 1 // v21 = out[5], v26 = out[10]
935cabdff1aSopenharmony_ci        dmbutterfly0    v20, v27, v16, v31, v10, v11, v12, v13, v14, v15    // v20 = out[4], v27 = out[11]
936cabdff1aSopenharmony_ci        dmbutterfly0    v22, v25, v9,  v7,  v10, v11, v12, v13, v14, v15    // v22 = out[6], v25 = out[9]
937cabdff1aSopenharmony_ci
938cabdff1aSopenharmony_ci        neg             v31.4s,  v5.4s                    // v31 = out[15]
939cabdff1aSopenharmony_ci        neg             v17.4s,  v3.4s                    // v17 = out[1]
940cabdff1aSopenharmony_ci
941cabdff1aSopenharmony_ci        mov             v16.16b, v2.16b
942cabdff1aSopenharmony_ci        mov             v30.16b, v4.16b
943cabdff1aSopenharmony_ci        ret
944cabdff1aSopenharmony_ciendfunc
945cabdff1aSopenharmony_ci
// Helper macros; we can't use these expressions directly within
// e.g. .irp due to the extra concatenation \(). Therefore wrap
// them in macros to allow using .irp below.

// load: read one 128-bit vector (four 32-bit lanes) from [src] into v<i>,
// then advance src by inc (post-indexed addressing).
//   i   = vector register number (bare number, without the "v")
//   src = base address register, updated
//   inc = post-increment, passed through unchanged to ld1
.macro load i, src, inc
        ld1             {v\i\().4s},  [\src], \inc
.endm
// store: write the 128-bit vector v<i> (four 32-bit lanes) to [dst], then
// advance dst by inc (post-indexed addressing). Wrapped in a macro so that
// the register number can be supplied from an .irp list.
//   i   = vector register number (bare number, without the "v")
//   dst = base address register, updated
//   inc = post-increment, passed through unchanged to st1
.macro store i, dst, inc
        st1             {v\i\().4s},  [\dst], \inc
.endm
// movi_v: emit "movi v<i><size>, imm".
//   i    = vector register number (bare number, without the "v")
//   size = arrangement suffix; it is pasted directly after the register
//          name, so it has to include its leading dot
//   imm  = immediate operand, passed through unchanged to movi
.macro movi_v i, size, imm
        movi            v\i\()\size,  \imm
.endm
// load_clear: read one 128-bit vector (four 32-bit lanes) from [src] into
// v<i>, then overwrite the same 16 bytes with the contents of v4 and advance
// src by inc.
//
// The macro does not set v4 itself: the caller has to zero it first (the
// pass1 functions in this file do "movi v4.4s, #0") for the source to end up
// cleared. For the same reason i must not be 4.
//   i   = vector register number (bare number, without the "v")
//   src = base address register, updated
//   inc = post-increment applied by the st1
.macro load_clear i, src, inc
        ld1             {v\i\().4s}, [\src]
        st1             {v4.4s},  [\src], \inc
.endm
962cabdff1aSopenharmony_ci
// load_add_store: add eight vectors of 32-bit values to eight rows of four
// 16-bit pixels each and write the clamped result back.
//
// Every coefN argument is a complete .4s register operand. Per row:
//   srshr  #6        rounding shift right of the 32-bit value by 6
//   uaddw/uaddw2     add the zero-extended 16-bit pixels
//   sqxtun/sqxtun2   narrow to 16 bit with unsigned saturation
//   umin   v8        clamp to the per-lane maximum held in v8
//
// Implicit register contract:
//   x0 = pixel pointer for coef0, coef2, coef4, coef6
//   x3 = pixel pointer for coef1, coef3, coef5, coef7
//   x1 = post-increment applied to x0 and x3 after each row
//   v8 = upper clamp value in every .8h lane
// (the pass2 functions in this file set x3 = x0 + stride, double x1 and
// fill v8 from w13 before using the macro).
//
// Each pointer is advanced over four rows while loading, rewound by 4*x1 and
// advanced again while storing, so on exit x0 and x3 are 4*x1 past their
// entry values and the macro can be invoked again for the next eight rows.
// Clobbers v4-v7 and all eight coef registers.
//
// Loads, arithmetic and stores belonging to different rows are interleaved
// (presumably to hide load latency); the instruction order is unchanged.
.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7
        srshr           \coef0, \coef0, #6
        ld1             {v4.4h},   [x0], x1      // v4 low half:  row from x0
        srshr           \coef1, \coef1, #6
        ld1             {v4.d}[1], [x3], x1      // v4 high half: row from x3
        srshr           \coef2, \coef2, #6
        ld1             {v5.4h},   [x0], x1
        srshr           \coef3, \coef3, #6
        uaddw           \coef0, \coef0, v4.4h
        ld1             {v5.d}[1], [x3], x1
        srshr           \coef4, \coef4, #6
        uaddw2          \coef1, \coef1, v4.8h
        ld1             {v6.4h},   [x0], x1
        srshr           \coef5, \coef5, #6
        uaddw           \coef2, \coef2, v5.4h
        ld1             {v6.d}[1], [x3], x1
        sqxtun          v4.4h,  \coef0
        srshr           \coef6, \coef6, #6
        uaddw2          \coef3, \coef3, v5.8h
        ld1             {v7.4h},   [x0], x1
        sqxtun2         v4.8h,  \coef1
        srshr           \coef7, \coef7, #6
        uaddw           \coef4, \coef4, v6.4h
        ld1             {v7.d}[1], [x3], x1
        umin            v4.8h,  v4.8h,  v8.8h
        // All eight rows are loaded; step both pointers back over the four
        // rows each of them covered so the stores hit the same addresses.
        sub             x0,  x0,  x1, lsl #2
        sub             x3,  x3,  x1, lsl #2
        sqxtun          v5.4h,  \coef2
        uaddw2          \coef5, \coef5, v6.8h
        st1             {v4.4h},   [x0], x1
        sqxtun2         v5.8h,  \coef3
        uaddw           \coef6, \coef6, v7.4h
        st1             {v4.d}[1], [x3], x1
        umin            v5.8h,  v5.8h,  v8.8h
        sqxtun          v6.4h,  \coef4
        uaddw2          \coef7, \coef7, v7.8h
        st1             {v5.4h},   [x0], x1
        sqxtun2         v6.8h,  \coef5
        st1             {v5.d}[1], [x3], x1
        umin            v6.8h,  v6.8h,  v8.8h
        sqxtun          v7.4h,  \coef6
        st1             {v6.4h},   [x0], x1
        sqxtun2         v7.8h,  \coef7
        st1             {v6.d}[1], [x3], x1
        umin            v7.8h,  v7.8h,  v8.8h
        st1             {v7.4h},   [x0], x1
        st1             {v7.d}[1], [x3], x1
.endm
1011cabdff1aSopenharmony_ci
// Generate the two per-slice 1-D helpers for a 16-point transform. The
// argument names the transform: the helpers call <txfm>16 and are named
// <txfm>16_1d_4x16_pass1_neon and <txfm>16_1d_4x16_pass2_neon.
//
// Both helpers are internal. They are entered with "bl" and leave with
// "ret x14"; x30 is copied to x14 first because the nested "bl" to the
// transform overwrites it. Any registers the transform routine itself reads
// (it is not part of this macro) must already be set up by the caller.
.macro itxfm16_1d_funcs txfm
// Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
// transpose into a horizontal 16x4 slice and store.
// x0 = dst (temp buffer)
// x1 = slice offset
// x2 = src
// x9 = input stride
// The source slice is overwritten with zeros while it is read. On return x0
// is 256 bytes past its entry value and x2 has been advanced by 16 strides.
// For x1 == 12 the first transposed 4x4 block is returned in v28-v31
// instead of being written to the temp buffer.
function \txfm\()16_1d_4x16_pass1_neon
        mov             x14, x30

        // v4 is the zero vector that load_clear writes back over the source.
        movi            v4.4s, #0
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        load_clear      \i,  x2,  x9
.endr

        bl              \txfm\()16

        // Do four 4x4 transposes. Originally, v16-v31 contain the
        // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
        // contain the four transposed 4x4 blocks.
        transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
        transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
        transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
        transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7

        // Store the transposed 4x4 blocks horizontally: each output row is
        // made of one vector from each block (16 bytes apiece), so four
        // consecutive stores fill one 64 byte row of the temp buffer.
        cmp             x1,  #12
        b.eq            1f
.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
        store           \i,  x0,  #16
.endr
        ret             x14
1:
        // Special case: For the last input column (x1 == 12),
        // which would be stored as the last row in the temp buffer,
        // don't store the first 4x4 block, but keep it in registers
        // for the first slice of the second pass (where it is the
        // last 4x4 block).
        // The "add x0, x0, #16" at the start of every row steps over the
        // 16 bytes that the skipped vector would have occupied.
        add             x0,  x0,  #16
        st1             {v20.4s},  [x0], #16
        st1             {v24.4s},  [x0], #16
        st1             {v28.4s},  [x0], #16
        add             x0,  x0,  #16
        st1             {v21.4s},  [x0], #16
        st1             {v25.4s},  [x0], #16
        st1             {v29.4s},  [x0], #16
        add             x0,  x0,  #16
        st1             {v22.4s},  [x0], #16
        st1             {v26.4s},  [x0], #16
        st1             {v30.4s},  [x0], #16
        add             x0,  x0,  #16
        st1             {v23.4s},  [x0], #16
        st1             {v27.4s},  [x0], #16
        st1             {v31.4s},  [x0], #16

        // Hand the unstored block over in v28-v31, the registers that the
        // second pass leaves unloaded for its first slice.
        mov             v28.16b, v16.16b
        mov             v29.16b, v17.16b
        mov             v30.16b, v18.16b
        mov             v31.16b, v19.16b
        ret             x14
endfunc

// Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
// load the destination pixels (from a similar 4x16 slice), add and store back.
// x0 = dst
// x1 = dst stride
// x2 = src (temp buffer)
// x3 = slice offset
// x9 = temp buffer stride
// w13 = upper clamp value for the output pixels (copied into v8)
// For x3 == 0 the last four input rows are not loaded; v28-v31 must still
// hold them from the first pass. x1 and x3 are overwritten: they become the
// doubled stride and the second row pointer used by load_add_store.
function \txfm\()16_1d_4x16_pass2_neon
        mov             x14, x30

.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
        load            \i,  x2,  x9
.endr
        cbz             x3,  1f
.irp i, 28, 29, 30, 31
        load            \i,  x2,  x9
.endr
1:

        // x0 walks the even rows and x3 the odd rows, both with a step of
        // two strides, as load_add_store expects.
        add             x3,  x0,  x1
        lsl             x1,  x1,  #1
        bl              \txfm\()16

        dup             v8.8h, w13
        load_add_store  v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
        load_add_store  v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s

        ret             x14
endfunc
.endm

itxfm16_1d_funcs idct
itxfm16_1d_funcs iadst
1107cabdff1aSopenharmony_ci
// This is the minimum eob value for each subpartition, in increments of 4.
//
// Entry k belongs to the first-pass slice that starts at column 4*k. The
// idct_idct 16x16 function runs a slice other than the first one only when
// eob is greater than the slice's entry; otherwise that slice and all later
// ones are filled with zeros instead of being transformed. The function
// starts reading at the second entry, so entry 0 is never loaded there.
// The values 10 and 38 also appear as immediates in that function's
// dispatch to the quarter and half variants.
const min_eob_idct_idct_16, align=4
        .short  0, 10, 38, 89
endconst
1112cabdff1aSopenharmony_ci
// Generate the 16x16 inverse transform + add functions for one transform
// pair (txfm1 = first pass, txfm2 = second pass):
//   vp9_<txfm1>_<txfm2>_16x16_add_16_neon      shared body, no export=1
//   ff_vp9_<txfm1>_<txfm2>_16x16_add_10_neon   exported; x13 = 0x03ff
//   ff_vp9_<txfm1>_<txfm2>_16x16_add_12_neon   exported; x13 = 0x0fff
//
// Shared body, on entry:
//   x0  = dst, x1 = dst stride, x2 = coefficients (zeroed as they are read)
//   w3  = eob (compared against 1, 10, 38 and min_eob_idct_idct_16)
//   x13 = pixel clamp value, copied into v8 by the pass2 helper
// Registers held across the helper calls:
//   x4/x5/x6 = saved dst / dst stride / coefficients
//   x9  = 64, the row stride of both the coefficient block and the temp
//         buffer (16 entries of 4 bytes)
//   x10 = idct_coeffs, x11 = iadst16_coeffs (only when an iadst is involved)
//   x12 = read cursor into min_eob_idct_idct_16 (idct_idct only)
//   x15 = return address (the helpers use x14, "bl" uses x30)
// Stack: d8-d9 (plus d10-d15 unless idct_idct) are saved, then 1024 bytes
// are reserved at sp as the 16 rows x 64 bytes intermediate buffer.
.macro itxfm_func16x16 txfm1, txfm2
function vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
.ifc \txfm1\()_\txfm2,idct_idct
        // eob == 1: leave for the DC-only routine before any state is
        // saved or any stack is allocated.
        cmp             w3,  #1
        b.eq            idct16x16_dc_add_neon
.endif
        mov             x15, x30
        // iadst16 requires clobbering v8-v15, idct16 only clobbers v8-v9.
.ifnc \txfm1\()_\txfm2,idct_idct
        stp             d14, d15, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
.endif
        stp             d8,  d9,  [sp, #-0x10]!

        sub             sp,  sp,  #1024

        // x0-x2 are reused as helper arguments; keep the originals.
        mov             x4,  x0
        mov             x5,  x1
        mov             x6,  x2

        movrel          x10, idct_coeffs
.ifnc \txfm1\()_\txfm2,idct_idct
        movrel          x11, iadst16_coeffs
.endif
.ifc \txfm1,idct
        // Load the 16 idct constants and sign-extend them to 32 bit in
        // v0-v3 for the first pass.
        ld1             {v0.8h,v1.8h}, [x10]
        sxtl            v2.4s,  v1.4h
        sxtl2           v3.4s,  v1.8h
        sxtl2           v1.4s,  v0.8h
        sxtl            v0.4s,  v0.4h
.endif
        mov             x9,  #64

.ifc \txfm1\()_\txfm2,idct_idct
        // Small eob: continue in the reduced variants. They are entered by
        // branch, reuse this frame and perform the epilogue themselves.
        cmp             w3,  #10
        b.le            idct16x16_quarter_add_16_neon
        cmp             w3,  #38
        b.le            idct16x16_half_add_16_neon

        // Start at the second table entry; the first slice always runs.
        movrel          x12, min_eob_idct_idct_16, 2
.endif

        // First pass: four slices of four input columns each; slice i
        // produces rows i..i+3 of the temp buffer.
.irp i, 0, 4, 8, 12
        add             x0,  sp,  #(\i*64)
.ifc \txfm1\()_\txfm2,idct_idct
.if \i > 0
        // Skip this and all following slices when eob is not above the
        // slice's threshold. x1 is preloaded with the number of slices that
        // remain for the zero-fill loop at 1:; mov leaves the flags intact.
        ldrh            w1,  [x12], #2
        cmp             w3,  w1
        mov             x1,  #(16 - \i)/4
        b.le            1f
.endif
.endif
        mov             x1,  #\i
        add             x2,  x6,  #(\i*4)
        bl              \txfm1\()16_1d_4x16_pass1_neon
.endr
.ifc \txfm1\()_\txfm2,iadst_idct
        // The idct constants were not loaded before an iadst first pass;
        // set v0-v3 up now for the idct second pass.
        ld1             {v0.8h,v1.8h}, [x10]
        sxtl            v2.4s,  v1.4h
        sxtl2           v3.4s,  v1.8h
        sxtl2           v1.4s,  v0.8h
        sxtl            v0.4s,  v0.4h
.endif

.ifc \txfm1\()_\txfm2,idct_idct
        b               3f
1:
        // Set v28-v31 to zero, for the in-register passthrough of
        // coefficients to pass 2.
        movi            v28.4s,  #0
        movi            v29.4s,  #0
        movi            v30.4s,  #0
        movi            v31.4s,  #0
2:
        // Zero the temp buffer rows of the skipped slices: every iteration
        // clears one slice (4 rows of 64 bytes), x1 counts the slices.
        subs            x1,  x1,  #1
.rept 4
        st1             {v28.4s,v29.4s,v30.4s,v31.4s}, [x0], x9
.endr
        b.ne            2b
3:
.endif

        // Second pass: four slices of four columns each, read from the temp
        // buffer and added to the destination pixels (2 bytes per pixel).
.irp i, 0, 4, 8, 12
        add             x0,  x4,  #(\i*2)
        mov             x1,  x5
        add             x2,  sp,  #(\i*4)
        mov             x3,  #\i
        bl              \txfm2\()16_1d_4x16_pass2_neon
.endr

        add             sp,  sp,  #1024
        ldp             d8,  d9,  [sp], #0x10
.ifnc \txfm1\()_\txfm2,idct_idct
        ldp             d10, d11, [sp], #0x10
        ldp             d12, d13, [sp], #0x10
        ldp             d14, d15, [sp], #0x10
.endif
        ret             x15
endfunc

// Exported entry point: set the clamp value to 0x03ff and continue in the
// shared body.
function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_10_neon, export=1
        mov             x13, #0x03ff
        b               vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
endfunc

// Exported entry point: set the clamp value to 0x0fff and continue in the
// shared body.
function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_12_neon, export=1
        mov             x13, #0x0fff
        b               vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
endfunc
.endm

itxfm_func16x16 idct,  idct
itxfm_func16x16 iadst, idct
itxfm_func16x16 idct,  iadst
itxfm_func16x16 iadst, iadst
1229cabdff1aSopenharmony_ci
// First pass for the "quarter" case: only the first four rows of the input
// slice are loaded (and cleared) before idct16_quarter is called.
// x0 = dst (temp buffer)
// x2 = src
// x9 = input stride
// Returns through x14. The first transposed 4x4 block is never stored; it
// stays in v16-v19 for idct16_1d_4x16_pass2_quarter_neon. x0 ends 256 bytes
// past its entry value and x2 is advanced by four strides.
function idct16_1d_4x16_pass1_quarter_neon
        mov             x14, x30

        // v4 is the zero vector that load_clear writes back over the source.
        movi            v4.4s, #0
.irp i, 16, 17, 18, 19
        load_clear      \i,  x2,  x9
.endr

        bl              idct16_quarter

        // Do four 4x4 transposes. Originally, v16-v31 contain the
        // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
        // contain the four transposed 4x4 blocks.
        transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
        transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
        transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
        transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7

        // Store the transposed 4x4 blocks horizontally.
        // The first 4x4 block is kept in registers for the second pass,
        // store the rest in the temp buffer.
        // The "add x0, x0, #16" at the start of every row steps over the
        // 16 bytes that the unstored vector would have occupied.
        add             x0,  x0,  #16
        st1             {v20.4s},  [x0], #16
        st1             {v24.4s},  [x0], #16
        st1             {v28.4s},  [x0], #16
        add             x0,  x0,  #16
        st1             {v21.4s},  [x0], #16
        st1             {v25.4s},  [x0], #16
        st1             {v29.4s},  [x0], #16
        add             x0,  x0,  #16
        st1             {v22.4s},  [x0], #16
        st1             {v26.4s},  [x0], #16
        st1             {v30.4s},  [x0], #16
        add             x0,  x0,  #16
        st1             {v23.4s},  [x0], #16
        st1             {v27.4s},  [x0], #16
        st1             {v31.4s},  [x0], #16
        ret             x14
endfunc
1269cabdff1aSopenharmony_ci
// Second pass for the "quarter" case: transform four input rows with
// idct16_quarter, then add the result to the destination pixels.
// x0 = dst
// x1 = dst stride
// x2 = src (temp buffer)
// x3 = slice offset
// x9 = temp buffer stride
// w13 = upper clamp value for the output pixels (copied into v8)
// Returns through x14. x1 and x3 are overwritten: they become the doubled
// stride and the second row pointer used by load_add_store.
function idct16_1d_4x16_pass2_quarter_neon
        mov             x14, x30

        // Only load the top 4 lines, and only do it for the later slices.
        // For the first slice, v16-v19 is kept in registers from the first pass.
        cbz             x3,  1f
.irp i, 16, 17, 18, 19
        load            \i,  x2,  x9
.endr
1:

        // x0 walks the even rows and x3 the odd rows, both with a step of
        // two strides, as load_add_store expects.
        add             x3,  x0,  x1
        lsl             x1,  x1,  #1
        bl              idct16_quarter

        dup             v8.8h, w13
        load_add_store  v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
        load_add_store  v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s

        ret             x14
endfunc
1291cabdff1aSopenharmony_ci
// First pass for the "half" case: only the first eight rows of the input
// slice are loaded (and cleared) before idct16_half is called.
// x0 = dst (temp buffer)
// x1 = slice offset
// x2 = src
// x9 = input stride
// Returns through x14. For x1 == 4 the first transposed 4x4 block is
// returned in v20-v23 instead of being written to the temp buffer. x0 ends
// 256 bytes past its entry value and x2 is advanced by eight strides.
function idct16_1d_4x16_pass1_half_neon
        mov             x14, x30

        // v4 is the zero vector that load_clear writes back over the source.
        movi            v4.4s, #0
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load_clear      \i,  x2,  x9
.endr

        bl              idct16_half

        // Do four 4x4 transposes. Originally, v16-v31 contain the
        // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
        // contain the four transposed 4x4 blocks.
        transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
        transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
        transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
        transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7

        // Store the transposed 4x4 blocks horizontally.
        cmp             x1,  #4
        b.eq            1f
.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
        store           \i,  x0,  #16
.endr
        ret             x14
1:
        // Special case: For the second input column (x1 == 4),
        // which would be stored as the second row in the temp buffer,
        // don't store the first 4x4 block, but keep it in registers
        // for the first slice of the second pass (where it is the
        // second 4x4 block).
        // The "add x0, x0, #16" at the start of every row steps over the
        // 16 bytes that the skipped vector would have occupied.
        add             x0,  x0,  #16
        st1             {v20.4s},  [x0], #16
        st1             {v24.4s},  [x0], #16
        st1             {v28.4s},  [x0], #16
        add             x0,  x0,  #16
        st1             {v21.4s},  [x0], #16
        st1             {v25.4s},  [x0], #16
        st1             {v29.4s},  [x0], #16
        add             x0,  x0,  #16
        st1             {v22.4s},  [x0], #16
        st1             {v26.4s},  [x0], #16
        st1             {v30.4s},  [x0], #16
        add             x0,  x0,  #16
        st1             {v23.4s},  [x0], #16
        st1             {v27.4s},  [x0], #16
        st1             {v31.4s},  [x0], #16

        // Hand the unstored block over in v20-v23, the registers that the
        // second pass leaves unloaded for its first slice.
        mov             v20.16b, v16.16b
        mov             v21.16b, v17.16b
        mov             v22.16b, v18.16b
        mov             v23.16b, v19.16b
        ret             x14
endfunc
1346cabdff1aSopenharmony_ci
// Second pass for the "half" case: transform eight input rows with
// idct16_half, then add the result to the destination pixels.
// x0 = dst
// x1 = dst stride
// x2 = src (temp buffer)
// x3 = slice offset
// x9 = temp buffer stride
// w13 = upper clamp value for the output pixels (copied into v8)
// Returns through x14. For x3 == 0 rows 4-7 are not loaded; v20-v23 must
// still hold them from the first pass. x1 and x3 are overwritten: they
// become the doubled stride and the second row pointer used by
// load_add_store.
function idct16_1d_4x16_pass2_half_neon
        mov             x14, x30

.irp i, 16, 17, 18, 19
        load            \i,  x2,  x9
.endr
        cbz             x3,  1f
.irp i, 20, 21, 22, 23
        load            \i,  x2,  x9
.endr
1:

        // x0 walks the even rows and x3 the odd rows, both with a step of
        // two strides, as load_add_store expects.
        add             x3,  x0,  x1
        lsl             x1,  x1,  #1
        bl              idct16_half

        dup             v8.8h, w13
        load_add_store  v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
        load_add_store  v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s

        ret             x14
endfunc
1369cabdff1aSopenharmony_ci
// Generate idct16x16_<size>_add_16_neon for size = quarter or half.
//
// These are continuations of vp9_idct_idct_16x16_add_16_neon, not
// stand-alone functions: they are reached by a branch from it and rely on
// the state it has already set up:
//   sp       = 1024 byte temp buffer, with d8/d9 saved directly above it
//   x4/x5/x6 = dst / dst stride / coefficients
//   x9       = 64 (row stride), x13 = pixel clamp value
//   x15      = return address of the original caller
//   plus the registers read by the idct16 routines themselves
// They release that frame and return to the original caller via x15.
//
// The first pass runs one slice (quarter) or two slices (half); the second
// pass always runs all four slices.
.macro idct16_partial size
function idct16x16_\size\()_add_16_neon
        add             x0,  sp,  #(0*64)
        mov             x1,  #0
        add             x2,  x6,  #(0*4)
        bl              idct16_1d_4x16_pass1_\size\()_neon
.ifc \size,half
        add             x0,  sp,  #(4*64)
        mov             x1,  #4
        add             x2,  x6,  #(4*4)
        bl              idct16_1d_4x16_pass1_\size\()_neon
.endif

.irp i, 0, 4, 8, 12
        add             x0,  x4,  #(\i*2)
        mov             x1,  x5
        add             x2,  sp,  #(\i*4)
        mov             x3,  #\i
        bl              idct16_1d_4x16_pass2_\size\()_neon
.endr

        // Same epilogue as the idct_idct variant of the function that
        // branched here (only d8/d9 were saved on that path).
        add             sp,  sp,  #1024
        ldp             d8,  d9,  [sp], #0x10
        ret             x15
endfunc
.endm

idct16_partial quarter
idct16_partial half
1399cabdff1aSopenharmony_ci
// DC-only shortcut for the 32x32 inverse DCT: scales the single DC
// coefficient and adds the resulting constant to all 32x32 output pixels.
// x0  = dst (rows of 32 16-bit pixels)
// x1  = dst stride in bytes
// x2  = coefficient block; the 32-bit DC value at [x2] is read, then zeroed
// w13 = upper clamp applied to every output pixel. It is not set in this
//       function; presumably the maximum pixel value for the bit depth,
//       provided by the caller - TODO confirm.
// Clobbers x1, x3, x4, v0-v4, v16-v23, v31 and the flags. x0 and x3 end up
// 32 rows past the start of dst.
1400cabdff1aSopenharmony_cifunction idct32x32_dc_add_neon
1401cabdff1aSopenharmony_ci        movrel          x4,  idct_coeffs
1402cabdff1aSopenharmony_ci        ld1             {v0.4h}, [x4]
1403cabdff1aSopenharmony_ci        sxtl            v0.4s,  v0.4h   // only v0.s[0] (= 11585) is used below
1404cabdff1aSopenharmony_ci
1405cabdff1aSopenharmony_ci        movi            v1.4h,  #0      // zero, written back over the DC coefficient
1406cabdff1aSopenharmony_ci
        // Scale the DC coefficient twice by 11585 / 2^14 (with rounding),
        // using a 64-bit intermediate product each time.
1407cabdff1aSopenharmony_ci        ld1             {v2.s}[0],  [x2]
1408cabdff1aSopenharmony_ci        smull           v2.2d,  v2.2s,  v0.s[0]
1409cabdff1aSopenharmony_ci        rshrn           v2.2s,  v2.2d,  #14
1410cabdff1aSopenharmony_ci        smull           v2.2d,  v2.2s,  v0.s[0]
1411cabdff1aSopenharmony_ci        rshrn           v2.2s,  v2.2d,  #14
1412cabdff1aSopenharmony_ci        st1             {v1.s}[0],  [x2] // clear the consumed coefficient
1413cabdff1aSopenharmony_ci        dup             v2.4s,  v2.s[0]
1414cabdff1aSopenharmony_ci
        // Final rounding shift; v0 now holds the per-pixel constant in
        // all four 32-bit lanes.
1415cabdff1aSopenharmony_ci        srshr           v0.4s,  v2.4s,  #6
1416cabdff1aSopenharmony_ci
1417cabdff1aSopenharmony_ci        mov             x3,  x0         // x3 = store pointer, x0 = load pointer
1418cabdff1aSopenharmony_ci        mov             x4,  #32        // row counter
        // Each row is accessed as two 32-byte halves; the first access
        // post-increments by 32, so the second one only needs stride - 32.
1419cabdff1aSopenharmony_ci        sub             x1,  x1,  #32
1420cabdff1aSopenharmony_ci        dup             v31.8h, w13     // v31 = upper clamp in every 16-bit lane
1421cabdff1aSopenharmony_ci1:
1422cabdff1aSopenharmony_ci        // Loop to add the constant v0 into all 32x32 outputs
1423cabdff1aSopenharmony_ci        subs            x4,  x4,  #1
        // Load one row of 32 pixels (v1-v4) and widen-add the constant
        // into 32-bit lanes (v16-v23).
1424cabdff1aSopenharmony_ci        ld1             {v1.8h,v2.8h},  [x0], #32
1425cabdff1aSopenharmony_ci        uaddw           v16.4s, v0.4s,  v1.4h
1426cabdff1aSopenharmony_ci        uaddw2          v17.4s, v0.4s,  v1.8h
1427cabdff1aSopenharmony_ci        ld1             {v3.8h,v4.8h},  [x0], x1
1428cabdff1aSopenharmony_ci        uaddw           v18.4s, v0.4s,  v2.4h
1429cabdff1aSopenharmony_ci        uaddw2          v19.4s, v0.4s,  v2.8h
1430cabdff1aSopenharmony_ci        uaddw           v20.4s, v0.4s,  v3.4h
1431cabdff1aSopenharmony_ci        uaddw2          v21.4s, v0.4s,  v3.8h
1432cabdff1aSopenharmony_ci        uaddw           v22.4s, v0.4s,  v4.4h
1433cabdff1aSopenharmony_ci        uaddw2          v23.4s, v0.4s,  v4.8h
        // Narrow back to 16 bit with signed-to-unsigned saturation
        // (negative sums become 0).
1434cabdff1aSopenharmony_ci        sqxtun          v1.4h,  v16.4s
1435cabdff1aSopenharmony_ci        sqxtun2         v1.8h,  v17.4s
1436cabdff1aSopenharmony_ci        sqxtun          v2.4h,  v18.4s
1437cabdff1aSopenharmony_ci        sqxtun2         v2.8h,  v19.4s
1438cabdff1aSopenharmony_ci        sqxtun          v3.4h,  v20.4s
1439cabdff1aSopenharmony_ci        sqxtun2         v3.8h,  v21.4s
1440cabdff1aSopenharmony_ci        sqxtun          v4.4h,  v22.4s
1441cabdff1aSopenharmony_ci        sqxtun2         v4.8h,  v23.4s
        // Clamp against the upper bound in v31 and store the row.
1442cabdff1aSopenharmony_ci        umin            v1.8h,  v1.8h,  v31.8h
1443cabdff1aSopenharmony_ci        umin            v2.8h,  v2.8h,  v31.8h
1444cabdff1aSopenharmony_ci        st1             {v1.8h,v2.8h},  [x3], #32
1445cabdff1aSopenharmony_ci        umin            v3.8h,  v3.8h,  v31.8h
1446cabdff1aSopenharmony_ci        umin            v4.8h,  v4.8h,  v31.8h
1447cabdff1aSopenharmony_ci        st1             {v3.8h,v4.8h},  [x3], x1
        // Flags still come from the subs at the top of the loop; none of
        // the instructions in between modify them.
1448cabdff1aSopenharmony_ci        b.ne            1b
1449cabdff1aSopenharmony_ci
1450cabdff1aSopenharmony_ci        ret
1451cabdff1aSopenharmony_ciendfunc
1452cabdff1aSopenharmony_ci
// Common tail shared by idct32_odd, idct32_odd_half and idct32_odd_quarter.
// It expands to the remaining butterfly stages of the odd half and ends
// with `ret`, so it has to be the last thing in the function that uses it.
// In:  intermediate terms in v4-v7 and v20-v31, as left by the code that
//      precedes the macro in each of those functions.
// Out: v16-v31 hold the terms named in the trailing comments (t16 ... t31).
// v0.s[2]/v0.s[3] are passed as coefficients. v24, v25, v30, v31 and v4-v9
// are passed as the trailing (presumably scratch) operands of the
// dmbutterfly/dmbutterfly0 macros, which are defined elsewhere in the file.
// v8 and v9 are written here; the visible entry point
// vp9_idct_idct_32x32_add_16_neon saves d8-d15 before calling into this code.
1453cabdff1aSopenharmony_ci.macro idct32_end
1454cabdff1aSopenharmony_ci        butterfly_4s    v16, v5,  v4,  v5  // v16 = t16a, v5  = t19a
1455cabdff1aSopenharmony_ci        butterfly_4s    v17, v20, v23, v20 // v17 = t17,  v20 = t18
1456cabdff1aSopenharmony_ci        butterfly_4s    v18, v6,  v7,  v6  // v18 = t23a, v6  = t20a
1457cabdff1aSopenharmony_ci        butterfly_4s    v19, v21, v22, v21 // v19 = t22,  v21 = t21
1458cabdff1aSopenharmony_ci        butterfly_4s    v4,  v28, v28, v30 // v4  = t24a, v28 = t27a
1459cabdff1aSopenharmony_ci        butterfly_4s    v23, v26, v25, v26 // v23 = t25,  v26 = t26
1460cabdff1aSopenharmony_ci        butterfly_4s    v7,  v8,  v29, v31 // v7  = t31a, v8  = t28a
1461cabdff1aSopenharmony_ci        butterfly_4s    v22, v27, v24, v27 // v22 = t30,  v27 = t29
1462cabdff1aSopenharmony_ci
1463cabdff1aSopenharmony_ci        dmbutterfly     v27, v20, v0.s[2], v0.s[3], v24, v25, v30, v31        // v27 = t18a, v20 = t29a
1464cabdff1aSopenharmony_ci        dmbutterfly     v8,  v5,  v0.s[2], v0.s[3], v24, v25, v30, v31        // v8  = t19,  v5  = t28
1465cabdff1aSopenharmony_ci        dmbutterfly     v28, v6,  v0.s[2], v0.s[3], v24, v25, v30, v31, neg=1 // v28 = t27,  v6  = t20
1466cabdff1aSopenharmony_ci        dmbutterfly     v26, v21, v0.s[2], v0.s[3], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a
1467cabdff1aSopenharmony_ci
1468cabdff1aSopenharmony_ci        butterfly_4s    v31, v24, v7,  v4  // v31 = t31,  v24 = t24
1469cabdff1aSopenharmony_ci        butterfly_4s    v30, v25, v22, v23 // v30 = t30a, v25 = t25a
1470cabdff1aSopenharmony_ci        butterfly_4s_r  v23, v16, v16, v18 // v23 = t23,  v16 = t16
1471cabdff1aSopenharmony_ci        butterfly_4s_r  v22, v17, v17, v19 // v22 = t22a, v17 = t17a
1472cabdff1aSopenharmony_ci        butterfly_4s    v18, v21, v27, v21 // v18 = t18,  v21 = t21
1473cabdff1aSopenharmony_ci        butterfly_4s_r  v27, v28, v5,  v28 // v27 = t27a, v28 = t28a
1474cabdff1aSopenharmony_ci        butterfly_4s    v29, v26, v20, v26 // v29 = t29,  v26 = t26
1475cabdff1aSopenharmony_ci        butterfly_4s    v19, v20, v8,  v6  // v19 = t19a, v20 = t20
1476cabdff1aSopenharmony_ci
1477cabdff1aSopenharmony_ci        dmbutterfly0    v27, v20, v27, v20, v4, v5, v6, v7, v8, v9 // v27 = t27,  v20 = t20
1478cabdff1aSopenharmony_ci        dmbutterfly0    v26, v21, v26, v21, v4, v5, v6, v7, v8, v9 // v26 = t26a, v21 = t21a
1479cabdff1aSopenharmony_ci        dmbutterfly0    v25, v22, v25, v22, v4, v5, v6, v7, v8, v9 // v25 = t25,  v22 = t22
1480cabdff1aSopenharmony_ci        dmbutterfly0    v24, v23, v24, v23, v4, v5, v6, v7, v8, v9 // v24 = t24a, v23 = t23a
        // Returns from the function that expanded this macro.
1481cabdff1aSopenharmony_ci        ret
1482cabdff1aSopenharmony_ci.endm
1483cabdff1aSopenharmony_ci
// Odd half of the 32-point IDCT, full-input variant (reached through
// `bl idct32_odd\suffix` with an empty suffix).
// In:  v16-v31 = IN(1), IN(3), ... IN(31), as loaded by the pass1/pass2
//      callers. v10-v13 and v1 supply the 32-bit coefficients for the
//      stages below; v0 is used inside idct32_end.
// Out: v16-v31 = t16 ... t31 (see the comments in idct32_end).
// v4-v7 are used for intermediate terms, and v16-v19 are reused as trailing
// (presumably scratch) operands once their inputs have been consumed.
// There is no `ret` here: the function returns through the one at the end
// of idct32_end.
1484cabdff1aSopenharmony_cifunction idct32_odd
        // First stage: one rotation per input pair, each with its own
        // coefficient pair from v10-v13.
1485cabdff1aSopenharmony_ci        dmbutterfly     v16, v31, v10.s[0], v10.s[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
1486cabdff1aSopenharmony_ci        dmbutterfly     v24, v23, v10.s[2], v10.s[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
1487cabdff1aSopenharmony_ci        dmbutterfly     v20, v27, v11.s[0], v11.s[1], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
1488cabdff1aSopenharmony_ci        dmbutterfly     v28, v19, v11.s[2], v11.s[3], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
1489cabdff1aSopenharmony_ci        dmbutterfly     v18, v29, v12.s[0], v12.s[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
1490cabdff1aSopenharmony_ci        dmbutterfly     v26, v21, v12.s[2], v12.s[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
1491cabdff1aSopenharmony_ci        dmbutterfly     v22, v25, v13.s[0], v13.s[1], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
1492cabdff1aSopenharmony_ci        dmbutterfly     v30, v17, v13.s[2], v13.s[3], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
1493cabdff1aSopenharmony_ci
        // Second stage: butterflies between neighbouring terms. Some
        // results move into v4-v7, which frees v16-v19 for the next stage.
1494cabdff1aSopenharmony_ci        butterfly_4s    v4,  v24, v16, v24 // v4  = t16, v24 = t17
1495cabdff1aSopenharmony_ci        butterfly_4s    v5,  v20, v28, v20 // v5  = t19, v20 = t18
1496cabdff1aSopenharmony_ci        butterfly_4s    v6,  v26, v18, v26 // v6  = t20, v26 = t21
1497cabdff1aSopenharmony_ci        butterfly_4s    v7,  v22, v30, v22 // v7  = t23, v22 = t22
1498cabdff1aSopenharmony_ci        butterfly_4s    v28, v25, v17, v25 // v28 = t24, v25 = t25
1499cabdff1aSopenharmony_ci        butterfly_4s    v30, v21, v29, v21 // v30 = t27, v21 = t26
1500cabdff1aSopenharmony_ci        butterfly_4s    v29, v23, v31, v23 // v29 = t31, v23 = t30
1501cabdff1aSopenharmony_ci        butterfly_4s    v31, v27, v19, v27 // v31 = t28, v27 = t29
1502cabdff1aSopenharmony_ci
        // Third stage: rotations with the coefficient pairs held in v1.
1503cabdff1aSopenharmony_ci        dmbutterfly     v23, v24, v1.s[0], v1.s[1], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
1504cabdff1aSopenharmony_ci        dmbutterfly     v27, v20, v1.s[0], v1.s[1], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
1505cabdff1aSopenharmony_ci        dmbutterfly     v21, v26, v1.s[2], v1.s[3], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
1506cabdff1aSopenharmony_ci        dmbutterfly     v25, v22, v1.s[2], v1.s[3], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
        // Remaining stages and the function return.
1507cabdff1aSopenharmony_ci        idct32_end
1508cabdff1aSopenharmony_ciendfunc
1509cabdff1aSopenharmony_ci
// Odd half of the 32-point IDCT for the _half variants of
// idct32_1d_4x32_pass1/pass2, which load only v16-v23 (the first eight odd
// inputs) before calling this function.
// It differs from idct32_odd only in the first stage, which uses the
// dmbutterfly_h1/dmbutterfly_h2 macros (defined elsewhere; presumably the
// rotation specialised for one input being zero - TODO confirm).
// In:  v16-v23 = IN(1), IN(3), ... IN(15); coefficients in v10-v13 and v1.
// Out: v16-v31 = t16 ... t31 (see the comments in idct32_end).
// Returns through the `ret` at the end of idct32_end.
1510cabdff1aSopenharmony_cifunction idct32_odd_half
1511cabdff1aSopenharmony_ci        dmbutterfly_h1  v16, v31, v10.s[0], v10.s[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
1512cabdff1aSopenharmony_ci        dmbutterfly_h2  v24, v23, v10.s[2], v10.s[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
1513cabdff1aSopenharmony_ci        dmbutterfly_h1  v20, v27, v11.s[0], v11.s[1], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
1514cabdff1aSopenharmony_ci        dmbutterfly_h2  v28, v19, v11.s[2], v11.s[3], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
1515cabdff1aSopenharmony_ci        dmbutterfly_h1  v18, v29, v12.s[0], v12.s[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
1516cabdff1aSopenharmony_ci        dmbutterfly_h2  v26, v21, v12.s[2], v12.s[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
1517cabdff1aSopenharmony_ci        dmbutterfly_h1  v22, v25, v13.s[0], v13.s[1], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
1518cabdff1aSopenharmony_ci        dmbutterfly_h2  v30, v17, v13.s[2], v13.s[3], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
1519cabdff1aSopenharmony_ci
        // From here on the sequence is the same as in idct32_odd.
1520cabdff1aSopenharmony_ci        butterfly_4s    v4,  v24, v16, v24 // v4  = t16, v24 = t17
1521cabdff1aSopenharmony_ci        butterfly_4s    v5,  v20, v28, v20 // v5  = t19, v20 = t18
1522cabdff1aSopenharmony_ci        butterfly_4s    v6,  v26, v18, v26 // v6  = t20, v26 = t21
1523cabdff1aSopenharmony_ci        butterfly_4s    v7,  v22, v30, v22 // v7  = t23, v22 = t22
1524cabdff1aSopenharmony_ci        butterfly_4s    v28, v25, v17, v25 // v28 = t24, v25 = t25
1525cabdff1aSopenharmony_ci        butterfly_4s    v30, v21, v29, v21 // v30 = t27, v21 = t26
1526cabdff1aSopenharmony_ci        butterfly_4s    v29, v23, v31, v23 // v29 = t31, v23 = t30
1527cabdff1aSopenharmony_ci        butterfly_4s    v31, v27, v19, v27 // v31 = t28, v27 = t29
1528cabdff1aSopenharmony_ci
1529cabdff1aSopenharmony_ci        dmbutterfly     v23, v24, v1.s[0], v1.s[1], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
1530cabdff1aSopenharmony_ci        dmbutterfly     v27, v20, v1.s[0], v1.s[1], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
1531cabdff1aSopenharmony_ci        dmbutterfly     v21, v26, v1.s[2], v1.s[3], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
1532cabdff1aSopenharmony_ci        dmbutterfly     v25, v22, v1.s[2], v1.s[3], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
1533cabdff1aSopenharmony_ci        idct32_end
1534cabdff1aSopenharmony_ciendfunc
1535cabdff1aSopenharmony_ci
// Odd half of the 32-point IDCT for the _quarter variants of
// idct32_1d_4x32_pass1/pass2, which load only v16-v19 (the first four odd
// inputs) before calling this function.
// In:  v16-v19 = IN(1), IN(3), IN(5), IN(7); coefficients in v10-v13 and v1.
// Out: v16-v31 = t16 ... t31 (see the comments in idct32_end).
// dsmull_h, drshrn_h and dmbutterfly_l are macros defined elsewhere in the
// file; judging by their use here they produce / consume pairs of 64-bit
// product registers - TODO confirm against their definitions.
// Returns through the `ret` at the end of idct32_end.
1536cabdff1aSopenharmony_cifunction idct32_odd_quarter
        // Multiply each of the four inputs by two coefficients. The eight
        // product pairs replace the first two stages of idct32_odd.
1537cabdff1aSopenharmony_ci        dsmull_h        v4,  v5,  v16, v10.s[0]
1538cabdff1aSopenharmony_ci        dsmull_h        v28, v29, v19, v11.s[3]
1539cabdff1aSopenharmony_ci        dsmull_h        v30, v31, v16, v10.s[1]
1540cabdff1aSopenharmony_ci        dsmull_h        v22, v23, v17, v13.s[2]
1541cabdff1aSopenharmony_ci        dsmull_h        v7,  v6,  v17, v13.s[3]
1542cabdff1aSopenharmony_ci        dsmull_h        v26, v27, v19, v11.s[2]
1543cabdff1aSopenharmony_ci        dsmull_h        v20, v21, v18, v12.s[0]
1544cabdff1aSopenharmony_ci        dsmull_h        v24, v25, v18, v12.s[1]
1545cabdff1aSopenharmony_ci
        // Two of the products enter with a negative sign; negate them
        // while they are still 64-bit values.
1546cabdff1aSopenharmony_ci        neg             v28.2d, v28.2d
1547cabdff1aSopenharmony_ci        neg             v29.2d, v29.2d
1548cabdff1aSopenharmony_ci        neg             v7.2d,  v7.2d
1549cabdff1aSopenharmony_ci        neg             v6.2d,  v6.2d
1550cabdff1aSopenharmony_ci
        // Narrow with a rounding shift by 14. The destinations are the
        // registers in which idct32_odd holds t16, t19, t31, t24, t23,
        // t28, t20 and t27 after its second stage.
1551cabdff1aSopenharmony_ci        drshrn_h        v4,  v4,  v5,  #14
1552cabdff1aSopenharmony_ci        drshrn_h        v5,  v28, v29, #14
1553cabdff1aSopenharmony_ci        drshrn_h        v29, v30, v31, #14
1554cabdff1aSopenharmony_ci        drshrn_h        v28, v22, v23, #14
1555cabdff1aSopenharmony_ci        drshrn_h        v7,  v7,  v6,  #14
1556cabdff1aSopenharmony_ci        drshrn_h        v31, v26, v27, #14
1557cabdff1aSopenharmony_ci        drshrn_h        v6,  v20, v21, #14
1558cabdff1aSopenharmony_ci        drshrn_h        v30, v24, v25, #14
1559cabdff1aSopenharmony_ci
        // Rotations with the coefficient pairs in v1. Results go to v20-v27,
        // the registers that idct32_odd's third stage fills in with
        // t18a, t21a, t22a, t17a, t30a, t25a, t26a and t29a.
1560cabdff1aSopenharmony_ci        dmbutterfly_l   v16, v17, v18, v19, v29, v4,  v1.s[0], v1.s[1]
1561cabdff1aSopenharmony_ci        dmbutterfly_l   v27, v26, v20, v21, v31, v5,  v1.s[0], v1.s[1]
1562cabdff1aSopenharmony_ci        drshrn_h        v23, v16, v17, #14
1563cabdff1aSopenharmony_ci        drshrn_h        v24, v18, v19, #14
1564cabdff1aSopenharmony_ci        neg             v20.2d, v20.2d
1565cabdff1aSopenharmony_ci        neg             v21.2d, v21.2d
1566cabdff1aSopenharmony_ci        drshrn_h        v27, v27, v26, #14
1567cabdff1aSopenharmony_ci        drshrn_h        v20, v20, v21, #14
1568cabdff1aSopenharmony_ci        dmbutterfly_l   v16, v17, v18, v19, v30, v6,  v1.s[2], v1.s[3]
1569cabdff1aSopenharmony_ci        drshrn_h        v21, v16, v17, #14
1570cabdff1aSopenharmony_ci        drshrn_h        v26, v18, v19, #14
1571cabdff1aSopenharmony_ci        dmbutterfly_l   v16, v17, v18, v19, v28, v7,  v1.s[2], v1.s[3]
1572cabdff1aSopenharmony_ci        drshrn_h        v25, v16, v17, #14
1573cabdff1aSopenharmony_ci        neg             v18.2d, v18.2d
1574cabdff1aSopenharmony_ci        neg             v19.2d, v19.2d
1575cabdff1aSopenharmony_ci        drshrn_h        v22, v18, v19, #14
1576cabdff1aSopenharmony_ci
        // Remaining stages and the function return.
1577cabdff1aSopenharmony_ci        idct32_end
1578cabdff1aSopenharmony_ciendfunc
1579cabdff1aSopenharmony_ci
1580cabdff1aSopenharmony_ci.macro idct32_funcs suffix
1581cabdff1aSopenharmony_ci// Do a 32-point IDCT of a 4x32 slice out of a 32x32 matrix.
1582cabdff1aSopenharmony_ci// The 32-point IDCT can be decomposed into two 16-point IDCTs;
1583cabdff1aSopenharmony_ci// a normal IDCT16 with every other input component (the even ones, with
1584cabdff1aSopenharmony_ci// each output written twice), followed by a separate 16-point IDCT
1585cabdff1aSopenharmony_ci// of the odd inputs, added/subtracted onto the outputs of the first idct16.
1586cabdff1aSopenharmony_ci// x0 = dst (temp buffer)
1587cabdff1aSopenharmony_ci// x1 = unused
1588cabdff1aSopenharmony_ci// x2 = src
1589cabdff1aSopenharmony_ci// x9 = double input stride
// \suffix selects how many input rows are loaded: all 16 of each parity
// (empty suffix), the first 8 (_half) or the first 4 (_quarter).
// x14 is used to hold the return address. v4-v9 are working registers.
// On return x0 points 512 bytes past its value on entry.
1590cabdff1aSopenharmony_cifunction idct32_1d_4x32_pass1\suffix\()_neon
        // The nested bl calls overwrite x30, so keep the return address in x14.
1591cabdff1aSopenharmony_ci        mov             x14, x30
1592cabdff1aSopenharmony_ci
        // load_clear is defined elsewhere; v4 is zeroed before it is used,
        // presumably because the macro stores v4 over each row it loads to
        // clear the coefficients - TODO confirm against the macro.
1593cabdff1aSopenharmony_ci        movi            v4.4s,  #0
1594cabdff1aSopenharmony_ci
1595cabdff1aSopenharmony_ci        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
1596cabdff1aSopenharmony_ci.ifb \suffix
1597cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1598cabdff1aSopenharmony_ci        load_clear      \i, x2, x9
1599cabdff1aSopenharmony_ci.endr
1600cabdff1aSopenharmony_ci.endif
1601cabdff1aSopenharmony_ci.ifc \suffix,_quarter
1602cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19
1603cabdff1aSopenharmony_ci        load_clear      \i, x2, x9
1604cabdff1aSopenharmony_ci.endr
1605cabdff1aSopenharmony_ci.endif
1606cabdff1aSopenharmony_ci.ifc \suffix,_half
1607cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23
1608cabdff1aSopenharmony_ci        load_clear      \i, x2, x9
1609cabdff1aSopenharmony_ci.endr
1610cabdff1aSopenharmony_ci.endif
1611cabdff1aSopenharmony_ci
        // 16-point IDCT of the even inputs (idct16, idct16_half or
        // idct16_quarter, defined elsewhere in the file).
1612cabdff1aSopenharmony_ci        bl              idct16\suffix
1613cabdff1aSopenharmony_ci
1614cabdff1aSopenharmony_ci        // Do four 4x4 transposes. Originally, v16-v31 contain the
1615cabdff1aSopenharmony_ci        // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
1616cabdff1aSopenharmony_ci        // contain the four transposed 4x4 blocks.
1617cabdff1aSopenharmony_ci        transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
1618cabdff1aSopenharmony_ci        transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
1619cabdff1aSopenharmony_ci        transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
1620cabdff1aSopenharmony_ci        transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7
1621cabdff1aSopenharmony_ci
1622cabdff1aSopenharmony_ci        // Store the registers a, b, c, d horizontally, followed by the
1623cabdff1aSopenharmony_ci        // same registers d, c, b, a mirrored.
        // Each expansion writes eight 16-byte vectors, i.e. 128 bytes, and
        // uses v4-v7 for the reversed copies.
1624cabdff1aSopenharmony_ci.macro store_rev a, b, c, d
1625cabdff1aSopenharmony_ci        // There's no rev128 instruction, but we reverse each 64 bit
1626cabdff1aSopenharmony_ci        // half, and then flip them using an ext with 8 bytes offset.
1627cabdff1aSopenharmony_ci        rev64           v7.4s, \d
1628cabdff1aSopenharmony_ci        st1             {\a},  [x0], #16
1629cabdff1aSopenharmony_ci        ext             v7.16b, v7.16b, v7.16b, #8
1630cabdff1aSopenharmony_ci        st1             {\b},  [x0], #16
1631cabdff1aSopenharmony_ci        rev64           v6.4s, \c
1632cabdff1aSopenharmony_ci        st1             {\c},  [x0], #16
1633cabdff1aSopenharmony_ci        ext             v6.16b, v6.16b, v6.16b, #8
1634cabdff1aSopenharmony_ci        st1             {\d},  [x0], #16
1635cabdff1aSopenharmony_ci        rev64           v5.4s, \b
1636cabdff1aSopenharmony_ci        st1             {v7.4s},  [x0], #16
1637cabdff1aSopenharmony_ci        ext             v5.16b, v5.16b, v5.16b, #8
1638cabdff1aSopenharmony_ci        st1             {v6.4s},  [x0], #16
1639cabdff1aSopenharmony_ci        rev64           v4.4s, \a
1640cabdff1aSopenharmony_ci        st1             {v5.4s},  [x0], #16
1641cabdff1aSopenharmony_ci        ext             v4.16b, v4.16b, v4.16b, #8
1642cabdff1aSopenharmony_ci        st1             {v4.4s},  [x0], #16
1643cabdff1aSopenharmony_ci.endm
1644cabdff1aSopenharmony_ci        store_rev       v16.4s, v20.4s, v24.4s, v28.4s
1645cabdff1aSopenharmony_ci        store_rev       v17.4s, v21.4s, v25.4s, v29.4s
1646cabdff1aSopenharmony_ci        store_rev       v18.4s, v22.4s, v26.4s, v30.4s
1647cabdff1aSopenharmony_ci        store_rev       v19.4s, v23.4s, v27.4s, v31.4s
        // Four expansions advanced x0 by 4 * 128 bytes; rewind it so the
        // odd half can accumulate into the same locations.
1648cabdff1aSopenharmony_ci        sub             x0,  x0,  #512
        // Drop this definition so that store_rev can be redefined below
        // with a different body.
1649cabdff1aSopenharmony_ci.purgem store_rev
1650cabdff1aSopenharmony_ci
1651cabdff1aSopenharmony_ci        // Move x2 back to the start of the input, and move
1652cabdff1aSopenharmony_ci        // to the first odd row
        // The rewind is x9 times the number of rows loaded above
        // (16, 4 or 8); the odd row is 128 bytes further on.
1653cabdff1aSopenharmony_ci.ifb \suffix
1654cabdff1aSopenharmony_ci        sub             x2,  x2,  x9, lsl #4
1655cabdff1aSopenharmony_ci.endif
1656cabdff1aSopenharmony_ci.ifc \suffix,_quarter
1657cabdff1aSopenharmony_ci        sub             x2,  x2,  x9, lsl #2
1658cabdff1aSopenharmony_ci.endif
1659cabdff1aSopenharmony_ci.ifc \suffix,_half
1660cabdff1aSopenharmony_ci        sub             x2,  x2,  x9, lsl #3
1661cabdff1aSopenharmony_ci.endif
1662cabdff1aSopenharmony_ci        add             x2,  x2,  #128
1663cabdff1aSopenharmony_ci
        // v4 was used as a working register above, so zero it again
        // before the next round of load_clear.
1664cabdff1aSopenharmony_ci        movi            v4.4s,  #0
1665cabdff1aSopenharmony_ci        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
1666cabdff1aSopenharmony_ci.ifb \suffix
1667cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1668cabdff1aSopenharmony_ci        load_clear      \i, x2, x9
1669cabdff1aSopenharmony_ci.endr
1670cabdff1aSopenharmony_ci.endif
1671cabdff1aSopenharmony_ci.ifc \suffix,_quarter
1672cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19
1673cabdff1aSopenharmony_ci        load_clear      \i, x2, x9
1674cabdff1aSopenharmony_ci.endr
1675cabdff1aSopenharmony_ci.endif
1676cabdff1aSopenharmony_ci.ifc \suffix,_half
1677cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23
1678cabdff1aSopenharmony_ci        load_clear      \i, x2, x9
1679cabdff1aSopenharmony_ci.endr
1680cabdff1aSopenharmony_ci.endif
1681cabdff1aSopenharmony_ci
        // Odd half of the transform; results come back in v16-v31.
1682cabdff1aSopenharmony_ci        bl              idct32_odd\suffix
1683cabdff1aSopenharmony_ci
        // Same 4x4 transposes as above, but with each group's registers
        // passed in descending order.
1684cabdff1aSopenharmony_ci        transpose_4x4s  v31, v30, v29, v28, v4, v5, v6, v7
1685cabdff1aSopenharmony_ci        transpose_4x4s  v27, v26, v25, v24, v4, v5, v6, v7
1686cabdff1aSopenharmony_ci        transpose_4x4s  v23, v22, v21, v20, v4, v5, v6, v7
1687cabdff1aSopenharmony_ci        transpose_4x4s  v19, v18, v17, v16, v4, v5, v6, v7
1688cabdff1aSopenharmony_ci
1689cabdff1aSopenharmony_ci        // Store the registers a, b, c, d horizontally,
1690cabdff1aSopenharmony_ci        // adding into the output first, and the mirrored,
1691cabdff1aSopenharmony_ci        // subtracted from the output.
        // a16b/b16b name the same registers as a/b with a .16b arrangement,
        // as needed by ext. \a and \b are reversed in place; \c and \d are
        // reversed into v8/v9. v4 is the accumulator for each
        // load/modify/store step.
1692cabdff1aSopenharmony_ci.macro store_rev a, b, c, d, a16b, b16b
1693cabdff1aSopenharmony_ci        ld1             {v4.4s},  [x0]
1694cabdff1aSopenharmony_ci        rev64           v9.4s, \d
1695cabdff1aSopenharmony_ci        add             v4.4s, v4.4s, \a
1696cabdff1aSopenharmony_ci        st1             {v4.4s},  [x0], #16
1697cabdff1aSopenharmony_ci        rev64           v8.4s, \c
1698cabdff1aSopenharmony_ci        ld1             {v4.4s},  [x0]
1699cabdff1aSopenharmony_ci        ext             v9.16b, v9.16b, v9.16b, #8
1700cabdff1aSopenharmony_ci        add             v4.4s, v4.4s, \b
1701cabdff1aSopenharmony_ci        st1             {v4.4s},  [x0], #16
1702cabdff1aSopenharmony_ci        ext             v8.16b, v8.16b, v8.16b, #8
1703cabdff1aSopenharmony_ci        ld1             {v4.4s},  [x0]
1704cabdff1aSopenharmony_ci        rev64           \b, \b
1705cabdff1aSopenharmony_ci        add             v4.4s, v4.4s, \c
1706cabdff1aSopenharmony_ci        st1             {v4.4s},  [x0], #16
1707cabdff1aSopenharmony_ci        rev64           \a, \a
1708cabdff1aSopenharmony_ci        ld1             {v4.4s},  [x0]
1709cabdff1aSopenharmony_ci        ext             \b16b, \b16b, \b16b, #8
1710cabdff1aSopenharmony_ci        add             v4.4s, v4.4s, \d
1711cabdff1aSopenharmony_ci        st1             {v4.4s},  [x0], #16
1712cabdff1aSopenharmony_ci        ext             \a16b, \a16b, \a16b, #8
1713cabdff1aSopenharmony_ci        ld1             {v4.4s},  [x0]
1714cabdff1aSopenharmony_ci        sub             v4.4s, v4.4s, v9.4s
1715cabdff1aSopenharmony_ci        st1             {v4.4s},  [x0], #16
1716cabdff1aSopenharmony_ci        ld1             {v4.4s},  [x0]
1717cabdff1aSopenharmony_ci        sub             v4.4s, v4.4s, v8.4s
1718cabdff1aSopenharmony_ci        st1             {v4.4s},  [x0], #16
1719cabdff1aSopenharmony_ci        ld1             {v4.4s},  [x0]
1720cabdff1aSopenharmony_ci        sub             v4.4s, v4.4s, \b
1721cabdff1aSopenharmony_ci        st1             {v4.4s},  [x0], #16
1722cabdff1aSopenharmony_ci        ld1             {v4.4s},  [x0]
1723cabdff1aSopenharmony_ci        sub             v4.4s, v4.4s, \a
1724cabdff1aSopenharmony_ci        st1             {v4.4s},  [x0], #16
1725cabdff1aSopenharmony_ci.endm
1726cabdff1aSopenharmony_ci
1727cabdff1aSopenharmony_ci        store_rev       v31.4s, v27.4s, v23.4s, v19.4s, v31.16b, v27.16b
1728cabdff1aSopenharmony_ci        store_rev       v30.4s, v26.4s, v22.4s, v18.4s, v30.16b, v26.16b
1729cabdff1aSopenharmony_ci        store_rev       v29.4s, v25.4s, v21.4s, v17.4s, v29.16b, v25.16b
1730cabdff1aSopenharmony_ci        store_rev       v28.4s, v24.4s, v20.4s, v16.4s, v28.16b, v24.16b
1731cabdff1aSopenharmony_ci.purgem store_rev
        // Return to the address saved on entry. x0 is not rewound here.
1732cabdff1aSopenharmony_ci        ret             x14
1733cabdff1aSopenharmony_ciendfunc
1734cabdff1aSopenharmony_ci
1735cabdff1aSopenharmony_ci// This is mostly the same as 4x32_pass1, but without the transpose,
1736cabdff1aSopenharmony_ci// and use the source as temp buffer between the two idct passes, and
1737cabdff1aSopenharmony_ci// add into the destination.
1738cabdff1aSopenharmony_ci// x0 = dst
1739cabdff1aSopenharmony_ci// x1 = dst stride
1740cabdff1aSopenharmony_ci// x2 = src (temp buffer)
1741cabdff1aSopenharmony_ci// x7 = negative double temp buffer stride
1742cabdff1aSopenharmony_ci// x9 = double temp buffer stride
// v15 = upper clamp applied to the output pixels (16-bit lanes).
// \suffix selects how many input rows are loaded, as in pass1.
// x14 is used to hold the return address. v4-v9 are working registers.
// The `load` and `store` macros are defined elsewhere in the file; the
// pointer rewinds below rely on each use advancing x2 by x9.
1743cabdff1aSopenharmony_cifunction idct32_1d_4x32_pass2\suffix\()_neon
        // The nested bl calls overwrite x30, so keep the return address in x14.
1744cabdff1aSopenharmony_ci        mov             x14, x30
1745cabdff1aSopenharmony_ci
1746cabdff1aSopenharmony_ci        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
        // After loading, x2 is moved back to the first even row.
1747cabdff1aSopenharmony_ci.ifb \suffix
1748cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1749cabdff1aSopenharmony_ci        load            \i, x2, x9
1750cabdff1aSopenharmony_ci.endr
1751cabdff1aSopenharmony_ci        sub             x2,  x2,  x9, lsl #4
1752cabdff1aSopenharmony_ci.endif
1753cabdff1aSopenharmony_ci.ifc \suffix,_quarter
1754cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19
1755cabdff1aSopenharmony_ci        load            \i, x2, x9
1756cabdff1aSopenharmony_ci.endr
1757cabdff1aSopenharmony_ci        sub             x2,  x2,  x9, lsl #2
1758cabdff1aSopenharmony_ci.endif
1759cabdff1aSopenharmony_ci.ifc \suffix,_half
1760cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23
1761cabdff1aSopenharmony_ci        load            \i, x2, x9
1762cabdff1aSopenharmony_ci.endr
1763cabdff1aSopenharmony_ci        sub             x2,  x2,  x9, lsl #3
1764cabdff1aSopenharmony_ci.endif
1765cabdff1aSopenharmony_ci
        // 16-point IDCT of the even inputs.
1766cabdff1aSopenharmony_ci        bl              idct16\suffix
1767cabdff1aSopenharmony_ci
        // Park all 16 idct16 outputs in the temp buffer, in the rows the
        // even inputs were read from, because v16-v31 are needed for the
        // odd half.
1768cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1769cabdff1aSopenharmony_ci        store           \i, x2, x9
1770cabdff1aSopenharmony_ci.endr
1771cabdff1aSopenharmony_ci
        // Rewind over the 16 rows just written and step 128 bytes forward
        // to the first odd row.
1772cabdff1aSopenharmony_ci        sub             x2,  x2,  x9, lsl #4
1773cabdff1aSopenharmony_ci        add             x2,  x2,  #128
1774cabdff1aSopenharmony_ci
1775cabdff1aSopenharmony_ci        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
1776cabdff1aSopenharmony_ci.ifb \suffix
1777cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1778cabdff1aSopenharmony_ci        load            \i, x2, x9
1779cabdff1aSopenharmony_ci.endr
1780cabdff1aSopenharmony_ci        sub             x2,  x2,  x9, lsl #4
1781cabdff1aSopenharmony_ci.endif
1782cabdff1aSopenharmony_ci.ifc \suffix,_quarter
1783cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19
1784cabdff1aSopenharmony_ci        load            \i, x2, x9
1785cabdff1aSopenharmony_ci.endr
1786cabdff1aSopenharmony_ci        sub             x2,  x2,  x9, lsl #2
1787cabdff1aSopenharmony_ci.endif
1788cabdff1aSopenharmony_ci.ifc \suffix,_half
1789cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23
1790cabdff1aSopenharmony_ci        load            \i, x2, x9
1791cabdff1aSopenharmony_ci.endr
1792cabdff1aSopenharmony_ci        sub             x2,  x2,  x9, lsl #3
1793cabdff1aSopenharmony_ci.endif
        // Back to the first even row, where the idct16 outputs were parked.
1794cabdff1aSopenharmony_ci        sub             x2,  x2,  #128
1795cabdff1aSopenharmony_ci
        // Odd half of the transform; results come back in v16-v31.
1796cabdff1aSopenharmony_ci        bl              idct32_odd\suffix
1797cabdff1aSopenharmony_ci
        // Produce four output rows of four pixels each:
        //  - load four parked idct16 rows from [x2], walking forwards with
        //    x9 (neg == 0) or backwards with x7 (neg != 0);
        //  - add (neg == 0) or subtract (neg != 0) the odd-half values a-d;
        //  - apply a rounding shift right by 6;
        //  - add the existing dst pixels, narrow with unsigned saturation
        //    and clamp against v15;
        //  - store the four rows back to dst.
        // x0 is rewound by four rows between the dst loads and the stores,
        // so the net advance per expansion is four rows.
        // v4-v7 hold the 32-bit sums; v8/v9 hold the loaded dst pixels.
1798cabdff1aSopenharmony_ci.macro load_acc_store a, b, c, d, neg=0
1799cabdff1aSopenharmony_ci.if \neg == 0
1800cabdff1aSopenharmony_ci        ld1             {v4.4s},  [x2], x9
1801cabdff1aSopenharmony_ci        ld1             {v5.4s},  [x2], x9
1802cabdff1aSopenharmony_ci        add             v4.4s, v4.4s, \a
1803cabdff1aSopenharmony_ci        ld1             {v6.4s},  [x2], x9
1804cabdff1aSopenharmony_ci        add             v5.4s, v5.4s, \b
1805cabdff1aSopenharmony_ci        ld1             {v7.4s},  [x2], x9
1806cabdff1aSopenharmony_ci        add             v6.4s, v6.4s, \c
1807cabdff1aSopenharmony_ci        add             v7.4s, v7.4s, \d
1808cabdff1aSopenharmony_ci.else
1809cabdff1aSopenharmony_ci        ld1             {v4.4s},  [x2], x7
1810cabdff1aSopenharmony_ci        ld1             {v5.4s},  [x2], x7
1811cabdff1aSopenharmony_ci        sub             v4.4s, v4.4s, \a
1812cabdff1aSopenharmony_ci        ld1             {v6.4s},  [x2], x7
1813cabdff1aSopenharmony_ci        sub             v5.4s, v5.4s, \b
1814cabdff1aSopenharmony_ci        ld1             {v7.4s},  [x2], x7
1815cabdff1aSopenharmony_ci        sub             v6.4s, v6.4s, \c
1816cabdff1aSopenharmony_ci        sub             v7.4s, v7.4s, \d
1817cabdff1aSopenharmony_ci.endif
1818cabdff1aSopenharmony_ci        ld1             {v8.4h},   [x0], x1
1819cabdff1aSopenharmony_ci        ld1             {v8.d}[1], [x0], x1
1820cabdff1aSopenharmony_ci        srshr           v4.4s, v4.4s, #6
1821cabdff1aSopenharmony_ci        ld1             {v9.4h},   [x0], x1
1822cabdff1aSopenharmony_ci        srshr           v5.4s, v5.4s, #6
1823cabdff1aSopenharmony_ci        uaddw           v4.4s, v4.4s, v8.4h
1824cabdff1aSopenharmony_ci        ld1             {v9.d}[1], [x0], x1
1825cabdff1aSopenharmony_ci        srshr           v6.4s, v6.4s, #6
1826cabdff1aSopenharmony_ci        uaddw2          v5.4s, v5.4s, v8.8h
1827cabdff1aSopenharmony_ci        srshr           v7.4s, v7.4s, #6
1828cabdff1aSopenharmony_ci        sub             x0,  x0,  x1, lsl #2
1829cabdff1aSopenharmony_ci        uaddw           v6.4s, v6.4s, v9.4h
1830cabdff1aSopenharmony_ci        sqxtun          v4.4h, v4.4s
1831cabdff1aSopenharmony_ci        uaddw2          v7.4s, v7.4s, v9.8h
1832cabdff1aSopenharmony_ci        sqxtun2         v4.8h, v5.4s
1833cabdff1aSopenharmony_ci        umin            v4.8h, v4.8h, v15.8h
1834cabdff1aSopenharmony_ci        st1             {v4.4h},   [x0], x1
1835cabdff1aSopenharmony_ci        sqxtun          v5.4h, v6.4s
1836cabdff1aSopenharmony_ci        st1             {v4.d}[1], [x0], x1
1837cabdff1aSopenharmony_ci        sqxtun2         v5.8h, v7.4s
1838cabdff1aSopenharmony_ci        umin            v5.8h, v5.8h, v15.8h
1839cabdff1aSopenharmony_ci        st1             {v5.4h},   [x0], x1
1840cabdff1aSopenharmony_ci        st1             {v5.d}[1], [x0], x1
1841cabdff1aSopenharmony_ci.endm
        // Output rows 0-15: parked rows 0..15 plus the odd-half registers
        // taken in descending order (v31 down to v16).
1842cabdff1aSopenharmony_ci        load_acc_store  v31.4s, v30.4s, v29.4s, v28.4s
1843cabdff1aSopenharmony_ci        load_acc_store  v27.4s, v26.4s, v25.4s, v24.4s
1844cabdff1aSopenharmony_ci        load_acc_store  v23.4s, v22.4s, v21.4s, v20.4s
1845cabdff1aSopenharmony_ci        load_acc_store  v19.4s, v18.4s, v17.4s, v16.4s
        // x2 is now one row past the last parked row; step back onto it
        // before walking the parked rows in reverse.
1846cabdff1aSopenharmony_ci        sub             x2,  x2,  x9
        // Output rows 16-31: parked rows 15..0 minus the odd-half registers
        // taken in ascending order (v16 up to v31).
1847cabdff1aSopenharmony_ci        load_acc_store  v16.4s, v17.4s, v18.4s, v19.4s, 1
1848cabdff1aSopenharmony_ci        load_acc_store  v20.4s, v21.4s, v22.4s, v23.4s, 1
1849cabdff1aSopenharmony_ci        load_acc_store  v24.4s, v25.4s, v26.4s, v27.4s, 1
1850cabdff1aSopenharmony_ci        load_acc_store  v28.4s, v29.4s, v30.4s, v31.4s, 1
1851cabdff1aSopenharmony_ci.purgem load_acc_store
        // Return to the address saved on entry.
1852cabdff1aSopenharmony_ci        ret             x14
1853cabdff1aSopenharmony_ciendfunc
1854cabdff1aSopenharmony_ci.endm
1855cabdff1aSopenharmony_ci
// Emit idct32_1d_4x32_pass1/pass2 three times: with an empty suffix (all
// input rows loaded), and with the _quarter and _half suffixes (only the
// first 4 or 8 rows of each parity loaded).
1856cabdff1aSopenharmony_ciidct32_funcs
1857cabdff1aSopenharmony_ciidct32_funcs _quarter
1858cabdff1aSopenharmony_ciidct32_funcs _half
1859cabdff1aSopenharmony_ci
// eob thresholds used by vp9_idct_idct_32x32_add_16_neon, one 16-bit entry
// per 4x32 pass-1 slice. The entry point starts reading at the second entry
// (movrel with a byte offset of 2). Before every slice after the first it
// compares eob (w3) with the next entry, and when eob <= entry it skips
// pass 1 for the remaining slices and writes zeros to the temp buffer.
1860cabdff1aSopenharmony_ciconst min_eob_idct_idct_32, align=4
1861cabdff1aSopenharmony_ci        .short  0, 9, 34, 70, 135, 240, 336, 448
1862cabdff1aSopenharmony_ciendconst
1863cabdff1aSopenharmony_ci
// Shared body of the 32x32 IDCT + add for the 10 and 12 bit entry points
// (ff_vp9_idct_idct_32x32_add_10_neon / _12_neon branch here).
//
// Register usage as visible in this function:
//   x0, x1  saved in x4, x5 and handed to the pass 2 helper as x0 (plus a
//           per-slice offset) and x1 - presumably dst pointer and stride
//   x2      saved in x6, base of the pass 1 input - presumably the
//           coefficient block
//   w3      compared against fixed thresholds to pick the code path -
//           presumably eob
//   w13     set by the entry points (0x03ff or 0x0fff), broadcast to v15.8h
//   x30     return address, parked in x15 because the bl calls below
//           overwrite x30
//
// A 4096 byte temp buffer is carved out of the stack between the two passes
// and d8-d15 are saved and restored around the whole transform.
1864cabdff1aSopenharmony_cifunction vp9_idct_idct_32x32_add_16_neon
// w3 == 1 is handled by idct32x32_dc_add_neon. Nothing has been pushed and
// x30 is untouched at this point, so a plain branch is enough.
1865cabdff1aSopenharmony_ci        cmp             w3,  #1
1866cabdff1aSopenharmony_ci        b.eq            idct32x32_dc_add_neon
1867cabdff1aSopenharmony_ci
1868cabdff1aSopenharmony_ci        movrel          x10, idct_coeffs
1869cabdff1aSopenharmony_ci
// Keep the return address in x15; this function returns with ret x15.
// NOTE(review): relies on the pass helpers not touching x15 - they are
// outside this view, confirm there.
1870cabdff1aSopenharmony_ci        mov             x15, x30
// v10-v13 and v15 are written below; the low halves of v8-v15 are
// callee-saved, so push d8-d15 (16 bytes per stp, sp stays 16-byte aligned).
1871cabdff1aSopenharmony_ci        stp             d8,  d9,  [sp, #-0x10]!
1872cabdff1aSopenharmony_ci        stp             d10, d11, [sp, #-0x10]!
1873cabdff1aSopenharmony_ci        stp             d12, d13, [sp, #-0x10]!
1874cabdff1aSopenharmony_ci        stp             d14, d15, [sp, #-0x10]!
1875cabdff1aSopenharmony_ci
// Temp buffer between the passes. Pass 1 output for slice i goes to
// sp + i*128, pass 2 input for slice i is read from sp + i*4.
1876cabdff1aSopenharmony_ci        sub             sp,  sp,  #4096
1877cabdff1aSopenharmony_ci
// x0-x2 are reused as helper arguments, so keep the incoming values in x4-x6.
1878cabdff1aSopenharmony_ci        mov             x4,  x0
1879cabdff1aSopenharmony_ci        mov             x5,  x1
1880cabdff1aSopenharmony_ci        mov             x6,  x2
1881cabdff1aSopenharmony_ci
1882cabdff1aSopenharmony_ci        // Double stride of the input, since we only read every other line
// x9 = 256 and x7 = -256 are not used in this function; presumably they are
// consumed by the pass helpers.
1883cabdff1aSopenharmony_ci        mov             x9,  #256
1884cabdff1aSopenharmony_ci        neg             x7,  x9
1885cabdff1aSopenharmony_ci
// Load the first 16 16-bit constants from idct_coeffs and sign-extend them
// to 32 bit: v0-v3 end up holding constants 0-15. v1 is widened into v2/v3
// before it is overwritten with the upper half of v0, and v0 is widened in
// place last.
1886cabdff1aSopenharmony_ci        ld1             {v0.8h,v1.8h},   [x10], #32
1887cabdff1aSopenharmony_ci        sxtl            v2.4s,  v1.4h
1888cabdff1aSopenharmony_ci        sxtl2           v3.4s,  v1.8h
1889cabdff1aSopenharmony_ci        sxtl2           v1.4s,  v0.8h
1890cabdff1aSopenharmony_ci        sxtl            v0.4s,  v0.4h
// Same for the next 16 constants, which end up in v10-v13.
1891cabdff1aSopenharmony_ci        ld1             {v10.8h,v11.8h}, [x10]
1892cabdff1aSopenharmony_ci        sxtl            v12.4s, v11.4h
1893cabdff1aSopenharmony_ci        sxtl2           v13.4s, v11.8h
1894cabdff1aSopenharmony_ci        sxtl2           v11.4s, v10.8h
1895cabdff1aSopenharmony_ci        sxtl            v10.4s, v10.4h
1896cabdff1aSopenharmony_ci
// Broadcast the low 16 bits of w13 to all eight lanes of v15 (presumably the
// maximum pixel value used for clamping in pass 2 - TODO confirm).
1897cabdff1aSopenharmony_ci        dup             v15.8h, w13
1898cabdff1aSopenharmony_ci
// Small w3: continue in the reduced variants. These are plain branches, not
// calls; the partial functions run their own pass 1/pass 2, release the
// stack frame set up above and return through x15 themselves.
1899cabdff1aSopenharmony_ci        cmp             w3,  #34
1900cabdff1aSopenharmony_ci        b.le            idct32x32_quarter_add_16_neon
1901cabdff1aSopenharmony_ci        cmp             w3,  #135
1902cabdff1aSopenharmony_ci        b.le            idct32x32_half_add_16_neon
1903cabdff1aSopenharmony_ci
// x12 walks the threshold table (extra offset 2, presumably skipping the
// first 16-bit entry).
1904cabdff1aSopenharmony_ci        movrel          x12, min_eob_idct_idct_32, 2
1905cabdff1aSopenharmony_ci
// Pass 1, unrolled over eight slices. For every slice after the first the
// next threshold is fetched; if w3 is not above it, the rest of the temp
// buffer is zero-filled instead.
1906cabdff1aSopenharmony_ci.irp i, 0, 4, 8, 12, 16, 20, 24, 28
// x0 = output position of this slice in the temp buffer. It is also the
// start address for the zero fill if we bail out at label 1.
1907cabdff1aSopenharmony_ci        add             x0,  sp,  #(\i*128)
1908cabdff1aSopenharmony_ci.if \i > 0
1909cabdff1aSopenharmony_ci        ldrh            w1,  [x12], #2
1910cabdff1aSopenharmony_ci        cmp             w3,  w1
// x1 = number of remaining slices (512 bytes each) to clear. mov does not
// alter the flags, so b.le still tests the cmp above.
1911cabdff1aSopenharmony_ci        mov             x1,  #(32 - \i)/4
1912cabdff1aSopenharmony_ci        b.le            1f
1913cabdff1aSopenharmony_ci.endif
// x2 = input position of this slice.
1914cabdff1aSopenharmony_ci        add             x2,  x6,  #(\i*4)
1915cabdff1aSopenharmony_ci        bl              idct32_1d_4x32_pass1_neon
1916cabdff1aSopenharmony_ci.endr
// All eight slices were transformed, nothing to clear.
1917cabdff1aSopenharmony_ci        b               3f
1918cabdff1aSopenharmony_ci
1919cabdff1aSopenharmony_ci1:
1920cabdff1aSopenharmony_ci        // Write zeros to the temp buffer for pass 2
1921cabdff1aSopenharmony_ci        movi            v16.4s,  #0
1922cabdff1aSopenharmony_ci        movi            v17.4s,  #0
1923cabdff1aSopenharmony_ci        movi            v18.4s,  #0
1924cabdff1aSopenharmony_ci        movi            v19.4s,  #0
// Each iteration clears 8 * 64 = 512 bytes starting at x0 and runs x1 times.
// The counter is decremented before the stores; st1 leaves the flags alone,
// so b.ne tests the result of the subs.
1925cabdff1aSopenharmony_ci2:
1926cabdff1aSopenharmony_ci        subs            x1,  x1,  #1
1927cabdff1aSopenharmony_ci.rept 4
1928cabdff1aSopenharmony_ci        st1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x0], #64
1929cabdff1aSopenharmony_ci        st1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x0], #64
1930cabdff1aSopenharmony_ci.endr
1931cabdff1aSopenharmony_ci        b.ne            2b
// Pass 2, always over all eight slices: x0 = saved x4 + i*2, x1 = saved x5,
// x2 = temp buffer + i*4.
1932cabdff1aSopenharmony_ci3:
1933cabdff1aSopenharmony_ci.irp i, 0, 4, 8, 12, 16, 20, 24, 28
1934cabdff1aSopenharmony_ci        add             x0,  x4,  #(\i*2)
1935cabdff1aSopenharmony_ci        mov             x1,  x5
1936cabdff1aSopenharmony_ci        add             x2,  sp,  #(\i*4)
1937cabdff1aSopenharmony_ci        bl              idct32_1d_4x32_pass2_neon
1938cabdff1aSopenharmony_ci.endr
1939cabdff1aSopenharmony_ci
// Release the temp buffer and pop d8-d15 in reverse push order.
1940cabdff1aSopenharmony_ci        add             sp,  sp,  #4096
1941cabdff1aSopenharmony_ci        ldp             d14, d15, [sp], 0x10
1942cabdff1aSopenharmony_ci        ldp             d12, d13, [sp], 0x10
1943cabdff1aSopenharmony_ci        ldp             d10, d11, [sp], 0x10
1944cabdff1aSopenharmony_ci        ldp             d8,  d9,  [sp], 0x10
1945cabdff1aSopenharmony_ci
// Return to the address saved from x30 on entry.
1946cabdff1aSopenharmony_ci        ret             x15
1947cabdff1aSopenharmony_ciendfunc
1948cabdff1aSopenharmony_ci
// Exported 10-bit entry point. Loads x13 with 0x03ff (2^10 - 1) and
// tail-branches into the shared implementation, which broadcasts w13 into
// v15.8h. x0-x3 and x30 are left untouched, so the shared function sees the
// caller's arguments and returns directly to the caller.
1949cabdff1aSopenharmony_cifunction ff_vp9_idct_idct_32x32_add_10_neon, export=1
1950cabdff1aSopenharmony_ci        mov             x13, #0x03ff
1951cabdff1aSopenharmony_ci        b               vp9_idct_idct_32x32_add_16_neon
1952cabdff1aSopenharmony_ciendfunc
1953cabdff1aSopenharmony_ci
// Exported 12-bit entry point. Loads x13 with 0x0fff (2^12 - 1) and
// tail-branches into the shared implementation, which broadcasts w13 into
// v15.8h. x0-x3 and x30 are left untouched, so the shared function sees the
// caller's arguments and returns directly to the caller.
1954cabdff1aSopenharmony_cifunction ff_vp9_idct_idct_32x32_add_12_neon, export=1
1955cabdff1aSopenharmony_ci        mov             x13, #0x0fff
1956cabdff1aSopenharmony_ci        b               vp9_idct_idct_32x32_add_16_neon
1957cabdff1aSopenharmony_ciendfunc
1958cabdff1aSopenharmony_ci
// idct32_partial size
//
// Emits idct32x32_<size>_add_16_neon for size = quarter or half. These are
// not called but branched to from vp9_idct_idct_32x32_add_16_neon (quarter
// when w3 <= 34, half when w3 <= 135) after that function has already
//   - saved x30 in x15 and pushed d8-d15,
//   - reserved the 4096 byte temp buffer at sp,
//   - copied x0, x1, x2 to x4, x5, x6,
//   - loaded the constants into v0-v3, v10-v13 and v15.
// Hence there is no prologue here, only the matching epilogue.
//
// quarter: pass 1 on slices 0 and 4; slice 4 is zero-filled when w3 <= 9.
// half:    pass 1 on slices 0, 4, 8 and 12; slice 12 is zero-filled when
//          w3 <= 70.
// Pass 2 always covers all eight output slices using the size-specific
// pass 2 helper.
1959cabdff1aSopenharmony_ci.macro idct32_partial size
1960cabdff1aSopenharmony_cifunction idct32x32_\size\()_add_16_neon
// Slices 0 and 4 are processed by both variants.
1961cabdff1aSopenharmony_ci.irp i, 0, 4
// x0 = output position in the temp buffer; also the zero-fill address if we
// jump to label 1.
1962cabdff1aSopenharmony_ci        add             x0,  sp,  #(\i*128)
1963cabdff1aSopenharmony_ci.ifc \size,quarter
1964cabdff1aSopenharmony_ci.if \i == 4
// Quarter only: the second slice can be skipped for very small w3.
1965cabdff1aSopenharmony_ci        cmp             w3,  #9
1966cabdff1aSopenharmony_ci        b.le            1f
1967cabdff1aSopenharmony_ci.endif
1968cabdff1aSopenharmony_ci.endif
1969cabdff1aSopenharmony_ci        add             x2,  x6,  #(\i*4)
1970cabdff1aSopenharmony_ci        bl              idct32_1d_4x32_pass1_\size\()_neon
1971cabdff1aSopenharmony_ci.endr
1972cabdff1aSopenharmony_ci
// Half only: two more slices, the last of which can be skipped.
1973cabdff1aSopenharmony_ci.ifc \size,half
1974cabdff1aSopenharmony_ci.irp i, 8, 12
1975cabdff1aSopenharmony_ci        add             x0,  sp,  #(\i*128)
1976cabdff1aSopenharmony_ci.if \i == 12
1977cabdff1aSopenharmony_ci        cmp             w3,  #70
1978cabdff1aSopenharmony_ci        b.le            1f
1979cabdff1aSopenharmony_ci.endif
1980cabdff1aSopenharmony_ci        add             x2,  x6,  #(\i*4)
1981cabdff1aSopenharmony_ci        bl              idct32_1d_4x32_pass1_\size\()_neon
1982cabdff1aSopenharmony_ci.endr
1983cabdff1aSopenharmony_ci.endif
// Every pass 1 slice of this variant was transformed, skip the zero fill.
1984cabdff1aSopenharmony_ci        b               3f
1985cabdff1aSopenharmony_ci
1986cabdff1aSopenharmony_ci1:
1987cabdff1aSopenharmony_ci        // Write zeros to the temp buffer for pass 2
1988cabdff1aSopenharmony_ci        movi            v16.4s,  #0
1989cabdff1aSopenharmony_ci        movi            v17.4s,  #0
1990cabdff1aSopenharmony_ci        movi            v18.4s,  #0
1991cabdff1aSopenharmony_ci        movi            v19.4s,  #0
1992cabdff1aSopenharmony_ci
// No loop here: only the last slice of a variant can be skipped, so exactly
// one slice (8 * 64 = 512 bytes at x0) is cleared.
1993cabdff1aSopenharmony_ci.rept 4
1994cabdff1aSopenharmony_ci        st1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x0], #64
1995cabdff1aSopenharmony_ci        st1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x0], #64
1996cabdff1aSopenharmony_ci.endr
1997cabdff1aSopenharmony_ci
// Pass 2: x0 = saved x4 + i*2, x1 = saved x5, x2 = temp buffer + i*4.
1998cabdff1aSopenharmony_ci3:
1999cabdff1aSopenharmony_ci.irp i, 0, 4, 8, 12, 16, 20, 24, 28
2000cabdff1aSopenharmony_ci        add             x0,  x4,  #(\i*2)
2001cabdff1aSopenharmony_ci        mov             x1,  x5
2002cabdff1aSopenharmony_ci        add             x2,  sp,  #(\i*4)
2003cabdff1aSopenharmony_ci        bl              idct32_1d_4x32_pass2_\size\()_neon
2004cabdff1aSopenharmony_ci.endr
2005cabdff1aSopenharmony_ci
// Epilogue matching the prologue of vp9_idct_idct_32x32_add_16_neon: drop the
// temp buffer, pop d8-d15 and return via the address kept in x15.
2006cabdff1aSopenharmony_ci        add             sp,  sp,  #4096
2007cabdff1aSopenharmony_ci        ldp             d14, d15, [sp], 0x10
2008cabdff1aSopenharmony_ci        ldp             d12, d13, [sp], 0x10
2009cabdff1aSopenharmony_ci        ldp             d10, d11, [sp], 0x10
2010cabdff1aSopenharmony_ci        ldp             d8,  d9,  [sp], 0x10
2011cabdff1aSopenharmony_ci
2012cabdff1aSopenharmony_ci        ret             x15
2013cabdff1aSopenharmony_ciendfunc
2014cabdff1aSopenharmony_ci.endm
2015cabdff1aSopenharmony_ci
// Instantiate idct32x32_quarter_add_16_neon and idct32x32_half_add_16_neon,
// the two branch targets used by vp9_idct_idct_32x32_add_16_neon for small w3.
2016cabdff1aSopenharmony_ciidct32_partial quarter
2017cabdff1aSopenharmony_ciidct32_partial half
2018