1cabdff1aSopenharmony_ci/*
2cabdff1aSopenharmony_ci * Copyright (c) 2016 Google Inc.
3cabdff1aSopenharmony_ci *
4cabdff1aSopenharmony_ci * This file is part of FFmpeg.
5cabdff1aSopenharmony_ci *
6cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or
7cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public
8cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either
9cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version.
10cabdff1aSopenharmony_ci *
11cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful,
12cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of
13cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14cabdff1aSopenharmony_ci * Lesser General Public License for more details.
15cabdff1aSopenharmony_ci *
16cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public
17cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software
18cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19cabdff1aSopenharmony_ci */
20cabdff1aSopenharmony_ci
21cabdff1aSopenharmony_ci#include "libavutil/aarch64/asm.S"
22cabdff1aSopenharmony_ci#include "neon.S"
23cabdff1aSopenharmony_ci
// Coefficient table for the 4-point transforms.
// The first four halfwords are the ones idct4 reads as v0.h[0..3]; the four
// halfwords at iadst4_coeffs are the ones iadst4 reads as v0.h[4..7].
// The two groups are adjacent so that the mixed idct/iadst 4x4 functions can
// fetch both with a single 8-halfword load starting at itxfm4_coeffs.
// All coefficients are applied with a rounding right shift by 14.
24cabdff1aSopenharmony_ciconst itxfm4_coeffs, align=4
25cabdff1aSopenharmony_ci        .short  11585, 0, 6270, 15137
26cabdff1aSopenharmony_ciiadst4_coeffs:
27cabdff1aSopenharmony_ci        .short  5283, 15212, 9929, 13377
28cabdff1aSopenharmony_ciendconst
29cabdff1aSopenharmony_ci
// Coefficients for the 8-point iadst, immediately followed by the idct
// coefficient table. The adjacency is relied upon: the 8x8 functions that
// use iadst load eight halfwords from iadst8_coeffs into v1 with a 16 byte
// post-increment, which leaves the pointer at idct_coeffs for the load
// into v0 that follows.
// Only the first row of idct_coeffs is loaded by the code in this part of
// the file; the remaining rows are presumably for the larger transforms
// (their readers are not visible here).
30cabdff1aSopenharmony_ciconst iadst8_coeffs, align=4
31cabdff1aSopenharmony_ci        .short  16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
32cabdff1aSopenharmony_ciidct_coeffs:
33cabdff1aSopenharmony_ci        .short  11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
34cabdff1aSopenharmony_ci        .short  1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
35cabdff1aSopenharmony_ci        .short  804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
36cabdff1aSopenharmony_ci        .short  3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
37cabdff1aSopenharmony_ciendconst
38cabdff1aSopenharmony_ci
// Sixteen halfword coefficients; going by the name these are for the
// 16-point iadst. NOTE(review): no reader of this table is visible in this
// part of the file, confirm against the 16x16 code.
39cabdff1aSopenharmony_ciconst iadst16_coeffs, align=4
40cabdff1aSopenharmony_ci        .short  16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
41cabdff1aSopenharmony_ci        .short  14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
42cabdff1aSopenharmony_ciendconst
43cabdff1aSopenharmony_ci
44cabdff1aSopenharmony_ci// out1 = ((in1 + in2) * v0[0] + (1 << 13)) >> 14
45cabdff1aSopenharmony_ci// out2 = ((in1 - in2) * v0[0] + (1 << 13)) >> 14
46cabdff1aSopenharmony_ci// in/out are .8h registers; this can do with 4 temp registers, but is
47cabdff1aSopenharmony_ci// more efficient if 6 temp registers are available.
// With neg > 0, out1 is computed with the negated coefficient -v0[0]
// instead (tmp4 temporarily holds the negated copy of v0.4h); out2 is
// not affected by neg.
// If tmp5/tmp6 are left blank, tmp3/tmp4 are reused for the second pair of
// products, so those are only computed after out1 has been narrowed.
// The inputs are only read by the initial add/sub, so out1/out2 may name
// the same registers as in1/in2.
48cabdff1aSopenharmony_ci.macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0
49cabdff1aSopenharmony_ci.if \neg > 0
50cabdff1aSopenharmony_ci        neg             \tmp4\().4h, v0.4h
51cabdff1aSopenharmony_ci.endif
52cabdff1aSopenharmony_ci        add             \tmp1\().8h, \in1\().8h,  \in2\().8h
53cabdff1aSopenharmony_ci        sub             \tmp2\().8h, \in1\().8h,  \in2\().8h
54cabdff1aSopenharmony_ci.if \neg > 0
        // Multiply the sum by the negated coefficient. The second multiply
        // overwrites tmp4, which is fine as it is the last reader of it.
55cabdff1aSopenharmony_ci        smull           \tmp3\().4s, \tmp1\().4h, \tmp4\().h[0]
56cabdff1aSopenharmony_ci        smull2          \tmp4\().4s, \tmp1\().8h, \tmp4\().h[0]
57cabdff1aSopenharmony_ci.else
58cabdff1aSopenharmony_ci        smull           \tmp3\().4s, \tmp1\().4h, v0.h[0]
59cabdff1aSopenharmony_ci        smull2          \tmp4\().4s, \tmp1\().8h, v0.h[0]
60cabdff1aSopenharmony_ci.endif
61cabdff1aSopenharmony_ci.ifb \tmp5
        // Four-temp variant: narrow out1 first to free tmp3/tmp4.
62cabdff1aSopenharmony_ci        rshrn           \out1\().4h, \tmp3\().4s, #14
63cabdff1aSopenharmony_ci        rshrn2          \out1\().8h, \tmp4\().4s, #14
64cabdff1aSopenharmony_ci        smull           \tmp3\().4s, \tmp2\().4h, v0.h[0]
65cabdff1aSopenharmony_ci        smull2          \tmp4\().4s, \tmp2\().8h, v0.h[0]
66cabdff1aSopenharmony_ci        rshrn           \out2\().4h, \tmp3\().4s, #14
67cabdff1aSopenharmony_ci        rshrn2          \out2\().8h, \tmp4\().4s, #14
68cabdff1aSopenharmony_ci.else
        // Six-temp variant: all four products are issued before narrowing.
69cabdff1aSopenharmony_ci        smull           \tmp5\().4s, \tmp2\().4h, v0.h[0]
70cabdff1aSopenharmony_ci        smull2          \tmp6\().4s, \tmp2\().8h, v0.h[0]
71cabdff1aSopenharmony_ci        rshrn           \out1\().4h, \tmp3\().4s, #14
72cabdff1aSopenharmony_ci        rshrn2          \out1\().8h, \tmp4\().4s, #14
73cabdff1aSopenharmony_ci        rshrn           \out2\().4h, \tmp5\().4s, #14
74cabdff1aSopenharmony_ci        rshrn2          \out2\().8h, \tmp6\().4s, #14
75cabdff1aSopenharmony_ci.endif
76cabdff1aSopenharmony_ci.endm
77cabdff1aSopenharmony_ci
78cabdff1aSopenharmony_ci// Same as dmbutterfly0 above, but treating the input in in2 as zero,
79cabdff1aSopenharmony_ci// writing the same output into both out1 and out2.
// out1 = out2 = (in1 * v0[0] + (1 << 13)) >> 14
// The parameter list mirrors dmbutterfly0 (without neg); in2 and tmp3-tmp6
// are not referenced in the body.
80cabdff1aSopenharmony_ci.macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6
81cabdff1aSopenharmony_ci        smull           \tmp1\().4s,  \in1\().4h,  v0.h[0]
82cabdff1aSopenharmony_ci        smull2          \tmp2\().4s,  \in1\().8h,  v0.h[0]
83cabdff1aSopenharmony_ci        rshrn           \out1\().4h,  \tmp1\().4s, #14
84cabdff1aSopenharmony_ci        rshrn2          \out1\().8h,  \tmp2\().4s, #14
85cabdff1aSopenharmony_ci        rshrn           \out2\().4h,  \tmp1\().4s, #14
86cabdff1aSopenharmony_ci        rshrn2          \out2\().8h,  \tmp2\().4s, #14
87cabdff1aSopenharmony_ci.endm
88cabdff1aSopenharmony_ci
89cabdff1aSopenharmony_ci// out1,out2 = in1 * coef1 - in2 * coef2
90cabdff1aSopenharmony_ci// out3,out4 = in1 * coef2 + in2 * coef1
91cabdff1aSopenharmony_ci// out are 4 x .4s registers, in are 2 x .8h registers
// The results are left as unrounded 32 bit products: out1/out3 are computed
// from the low four lanes of the inputs, out2/out4 from the high four lanes.
// coef1/coef2 are by-element operands (the call sites pass e.g. v0.h[2]).
// The out registers are written before the inputs have been fully read, so
// they must not overlap in1 or in2.
92cabdff1aSopenharmony_ci.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, coef1, coef2
93cabdff1aSopenharmony_ci        smull           \out1\().4s, \in1\().4h, \coef1
94cabdff1aSopenharmony_ci        smull2          \out2\().4s, \in1\().8h, \coef1
95cabdff1aSopenharmony_ci        smull           \out3\().4s, \in1\().4h, \coef2
96cabdff1aSopenharmony_ci        smull2          \out4\().4s, \in1\().8h, \coef2
97cabdff1aSopenharmony_ci        smlsl           \out1\().4s, \in2\().4h, \coef2
98cabdff1aSopenharmony_ci        smlsl2          \out2\().4s, \in2\().8h, \coef2
99cabdff1aSopenharmony_ci        smlal           \out3\().4s, \in2\().4h, \coef1
100cabdff1aSopenharmony_ci        smlal2          \out4\().4s, \in2\().8h, \coef1
101cabdff1aSopenharmony_ci.endm
102cabdff1aSopenharmony_ci
103cabdff1aSopenharmony_ci// inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
104cabdff1aSopenharmony_ci// inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
105cabdff1aSopenharmony_ci// inout are 2 x .8h registers
// Both results are computed from the original values of inout1/inout2; the
// 32 bit intermediates live in tmp1-tmp4 (used as .4s registers).
// With neg > 0, the second result (inout2) is negated before it is rounded
// and narrowed.
106cabdff1aSopenharmony_ci.macro dmbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4, neg=0
107cabdff1aSopenharmony_ci        dmbutterfly_l   \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \coef1, \coef2
108cabdff1aSopenharmony_ci.if \neg > 0
109cabdff1aSopenharmony_ci        neg             \tmp3\().4s, \tmp3\().4s
110cabdff1aSopenharmony_ci        neg             \tmp4\().4s, \tmp4\().4s
111cabdff1aSopenharmony_ci.endif
112cabdff1aSopenharmony_ci        rshrn           \inout1\().4h, \tmp1\().4s,  #14
113cabdff1aSopenharmony_ci        rshrn2          \inout1\().8h, \tmp2\().4s,  #14
114cabdff1aSopenharmony_ci        rshrn           \inout2\().4h, \tmp3\().4s,  #14
115cabdff1aSopenharmony_ci        rshrn2          \inout2\().8h, \tmp4\().4s,  #14
116cabdff1aSopenharmony_ci.endm
117cabdff1aSopenharmony_ci
118cabdff1aSopenharmony_ci// Same as dmbutterfly above, but treating the input in inout2 as zero
// inout1 = (inout1 * coef1 + (1 << 13)) >> 14
// inout2 = (inout1 * coef2 + (1 << 13)) >> 14
// inout2 is only written, never read. All four products are formed before
// inout1 is overwritten.
119cabdff1aSopenharmony_ci.macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
120cabdff1aSopenharmony_ci        smull           \tmp1\().4s, \inout1\().4h, \coef1
121cabdff1aSopenharmony_ci        smull2          \tmp2\().4s, \inout1\().8h, \coef1
122cabdff1aSopenharmony_ci        smull           \tmp3\().4s, \inout1\().4h, \coef2
123cabdff1aSopenharmony_ci        smull2          \tmp4\().4s, \inout1\().8h, \coef2
124cabdff1aSopenharmony_ci        rshrn           \inout1\().4h, \tmp1\().4s, #14
125cabdff1aSopenharmony_ci        rshrn2          \inout1\().8h, \tmp2\().4s, #14
126cabdff1aSopenharmony_ci        rshrn           \inout2\().4h, \tmp3\().4s, #14
127cabdff1aSopenharmony_ci        rshrn2          \inout2\().8h, \tmp4\().4s, #14
128cabdff1aSopenharmony_ci.endm
129cabdff1aSopenharmony_ci
130cabdff1aSopenharmony_ci// Same as dmbutterfly above, but treating the input in inout1 as zero
// inout1 = (-(inout2 * coef2) + (1 << 13)) >> 14
// inout2 = (  inout2 * coef1  + (1 << 13)) >> 14
// inout1 is only written, never read. All four products are formed before
// inout2 is overwritten.
131cabdff1aSopenharmony_ci.macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
132cabdff1aSopenharmony_ci        smull           \tmp1\().4s, \inout2\().4h, \coef2
133cabdff1aSopenharmony_ci        smull2          \tmp2\().4s, \inout2\().8h, \coef2
134cabdff1aSopenharmony_ci        smull           \tmp3\().4s, \inout2\().4h, \coef1
135cabdff1aSopenharmony_ci        smull2          \tmp4\().4s, \inout2\().8h, \coef1
        // The coef2 product enters the first output with a minus sign.
136cabdff1aSopenharmony_ci        neg             \tmp1\().4s, \tmp1\().4s
137cabdff1aSopenharmony_ci        neg             \tmp2\().4s, \tmp2\().4s
138cabdff1aSopenharmony_ci        rshrn           \inout2\().4h, \tmp3\().4s, #14
139cabdff1aSopenharmony_ci        rshrn2          \inout2\().8h, \tmp4\().4s, #14
140cabdff1aSopenharmony_ci        rshrn           \inout1\().4h, \tmp1\().4s, #14
141cabdff1aSopenharmony_ci        rshrn2          \inout1\().8h, \tmp2\().4s, #14
142cabdff1aSopenharmony_ci.endm
143cabdff1aSopenharmony_ci
// Widening multiply of one .8h register by a single coefficient:
// out1 = low four lanes of in * coef, out2 = high four lanes of in * coef.
// out1/out2 are .4s registers, coef is a by-element operand.
144cabdff1aSopenharmony_ci.macro dsmull_h out1, out2, in, coef
145cabdff1aSopenharmony_ci        smull           \out1\().4s, \in\().4h, \coef
146cabdff1aSopenharmony_ci        smull2          \out2\().4s, \in\().8h, \coef
147cabdff1aSopenharmony_ci.endm
148cabdff1aSopenharmony_ci
// Rounding, narrowing right shift of two .4s registers into one .8h
// register: in1 fills the low four lanes of out, in2 the high four lanes.
// shift is pasted verbatim as the immediate operand of rshrn/rshrn2.
149cabdff1aSopenharmony_ci.macro drshrn_h out, in1, in2, shift
150cabdff1aSopenharmony_ci        rshrn           \out\().4h, \in1\().4s, \shift
151cabdff1aSopenharmony_ci        rshrn2          \out\().8h, \in2\().4s, \shift
152cabdff1aSopenharmony_ci.endm
153cabdff1aSopenharmony_ci
154cabdff1aSopenharmony_ci
155cabdff1aSopenharmony_ci// out1 = in1 + in2
156cabdff1aSopenharmony_ci// out2 = in1 - in2
// All operands are .8h registers. The add is issued first, so out1 must not
// be one of the inputs, while out2 may be (the sub is the last reader).
157cabdff1aSopenharmony_ci.macro butterfly_8h out1, out2, in1, in2
158cabdff1aSopenharmony_ci        add             \out1\().8h, \in1\().8h, \in2\().8h
159cabdff1aSopenharmony_ci        sub             \out2\().8h, \in1\().8h, \in2\().8h
160cabdff1aSopenharmony_ci.endm
161cabdff1aSopenharmony_ci
162cabdff1aSopenharmony_ci// out1 = in1 - in2
163cabdff1aSopenharmony_ci// out2 = in1 + in2
// Same operation as butterfly_8h with the roles of the two outputs swapped.
// The sub is issued first, so out1 must not be one of the inputs, while
// out2 may be (the add is the last reader).
164cabdff1aSopenharmony_ci.macro butterfly_8h_r out1, out2, in1, in2
165cabdff1aSopenharmony_ci        sub             \out1\().8h, \in1\().8h, \in2\().8h
166cabdff1aSopenharmony_ci        add             \out2\().8h, \in1\().8h, \in2\().8h
167cabdff1aSopenharmony_ci.endm
168cabdff1aSopenharmony_ci
169cabdff1aSopenharmony_ci// out1 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
170cabdff1aSopenharmony_ci// out2 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
171cabdff1aSopenharmony_ci// out are 2 x .8h registers, in are 4 x .4s registers
// in1/in3 carry the low four lanes and in2/in4 the high four lanes of the
// two 32 bit operands. Both sums are formed before the differences, and the
// outputs are written last; the call sites rely on this to pass in1/in2 as
// tmp3/tmp4 and to let out1/out2 overlap in3/in4.
172cabdff1aSopenharmony_ci.macro dbutterfly_n out1, out2, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
173cabdff1aSopenharmony_ci        add             \tmp1\().4s, \in1\().4s, \in3\().4s
174cabdff1aSopenharmony_ci        add             \tmp2\().4s, \in2\().4s, \in4\().4s
175cabdff1aSopenharmony_ci        sub             \tmp3\().4s, \in1\().4s, \in3\().4s
176cabdff1aSopenharmony_ci        sub             \tmp4\().4s, \in2\().4s, \in4\().4s
177cabdff1aSopenharmony_ci        rshrn           \out1\().4h, \tmp1\().4s,  #14
178cabdff1aSopenharmony_ci        rshrn2          \out1\().8h, \tmp2\().4s,  #14
179cabdff1aSopenharmony_ci        rshrn           \out2\().4h, \tmp3\().4s,  #14
180cabdff1aSopenharmony_ci        rshrn2          \out2\().8h, \tmp4\().4s,  #14
181cabdff1aSopenharmony_ci.endm
182cabdff1aSopenharmony_ci
// One in-place pass of the 4-point transform used by the iwht/iwht 4x4
// function, on four .4h registers c0-c3.
// Uses only adds, subtracts and one arithmetic shift; no coefficients.
// Clobbers v16 and v17.
183cabdff1aSopenharmony_ci.macro iwht4 c0, c1, c2, c3
184cabdff1aSopenharmony_ci        add             \c0\().4h, \c0\().4h, \c1\().4h // c0 += c1
185cabdff1aSopenharmony_ci        sub             v17.4h,    \c2\().4h, \c3\().4h // v17 = c2 - c3
186cabdff1aSopenharmony_ci        sub             v16.4h,    \c0\().4h, v17.4h    // v16 = c0 - v17
187cabdff1aSopenharmony_ci        sshr            v16.4h,    v16.4h,    #1        // v16 >>= 1
188cabdff1aSopenharmony_ci        sub             \c2\().4h, v16.4h,    \c1\().4h // c2 = v16 - c1
189cabdff1aSopenharmony_ci        sub             \c1\().4h, v16.4h,    \c3\().4h // c1 = v16 - c3
190cabdff1aSopenharmony_ci        add             \c3\().4h, v17.4h,    \c2\().4h // c3 = v17 + c2
191cabdff1aSopenharmony_ci        sub             \c0\().4h, \c0\().4h, \c1\().4h // c0 -= c1
192cabdff1aSopenharmony_ci.endm
193cabdff1aSopenharmony_ci
// One in-place pass of the 4-point idct on four .4h registers c0-c3.
// With R(x) = (x + (1 << 13)) >> 14 and k = v0.h[k]:
//   a = R((c0 + c2) * 0)             (v18)
//   b = R((c0 - c2) * 0)             (v19)
//   c = R(c1 * 2 - c3 * 3)           (v20)
//   d = R(c1 * 3 + c3 * 2)           (v22)
//   c0 = a + d, c1 = b + c, c2 = b - c, c3 = a - d
// Expects the idct coefficients in v0.h[0..3] (v0.h[1] is not used).
// Clobbers v16-v20 and v22. The two multiply chains are interleaved.
194cabdff1aSopenharmony_ci.macro idct4 c0, c1, c2, c3
195cabdff1aSopenharmony_ci        smull           v22.4s,    \c1\().4h, v0.h[3]
196cabdff1aSopenharmony_ci        smull           v20.4s,    \c1\().4h, v0.h[2]
197cabdff1aSopenharmony_ci        add             v16.4h,    \c0\().4h, \c2\().4h
198cabdff1aSopenharmony_ci        sub             v17.4h,    \c0\().4h, \c2\().4h
199cabdff1aSopenharmony_ci        smlal           v22.4s,    \c3\().4h, v0.h[2]
200cabdff1aSopenharmony_ci        smull           v18.4s,    v16.4h,    v0.h[0]
201cabdff1aSopenharmony_ci        smull           v19.4s,    v17.4h,    v0.h[0]
202cabdff1aSopenharmony_ci        smlsl           v20.4s,    \c3\().4h, v0.h[3]
        // Round and narrow each 32 bit intermediate in place.
203cabdff1aSopenharmony_ci        rshrn           v22.4h,    v22.4s,    #14
204cabdff1aSopenharmony_ci        rshrn           v18.4h,    v18.4s,    #14
205cabdff1aSopenharmony_ci        rshrn           v19.4h,    v19.4s,    #14
206cabdff1aSopenharmony_ci        rshrn           v20.4h,    v20.4s,    #14
207cabdff1aSopenharmony_ci        add             \c0\().4h, v18.4h,    v22.4h
208cabdff1aSopenharmony_ci        sub             \c3\().4h, v18.4h,    v22.4h
209cabdff1aSopenharmony_ci        add             \c1\().4h, v19.4h,    v20.4h
210cabdff1aSopenharmony_ci        sub             \c2\().4h, v19.4h,    v20.4h
211cabdff1aSopenharmony_ci.endm
212cabdff1aSopenharmony_ci
// One in-place pass of the 4-point iadst on four .4h registers c0-c3.
// With R(x) = (x + (1 << 13)) >> 14 and k = v0.h[k]:
//   p = c0 * 4 + c2 * 5 + c3 * 6     (v16)
//   q = c0 * 6 - c2 * 4 - c3 * 5     (v17)
//   r = c1 * 7                       (v19)
//   s = (c0 - c2 + c3) * 7           (v18)
//   c0 = R(p + r), c1 = R(q + r), c2 = R(s), c3 = R(p + q - r)
// Expects the iadst coefficients in v0.h[4..7].
// Clobbers v16-v21.
213cabdff1aSopenharmony_ci.macro iadst4 c0, c1, c2, c3
214cabdff1aSopenharmony_ci        smull           v16.4s,    \c0\().4h, v0.h[4]
215cabdff1aSopenharmony_ci        smlal           v16.4s,    \c2\().4h, v0.h[5]
216cabdff1aSopenharmony_ci        smlal           v16.4s,    \c3\().4h, v0.h[6]
217cabdff1aSopenharmony_ci        smull           v17.4s,    \c0\().4h, v0.h[6]
218cabdff1aSopenharmony_ci        smlsl           v17.4s,    \c2\().4h, v0.h[4]
        // c0 has been consumed by both products above, so it can now be
        // reused to build the 16 bit term c0 - c2 + c3.
219cabdff1aSopenharmony_ci        sub             \c0\().4h, \c0\().4h, \c2\().4h
220cabdff1aSopenharmony_ci        smlsl           v17.4s,    \c3\().4h, v0.h[5]
221cabdff1aSopenharmony_ci        add             \c0\().4h, \c0\().4h, \c3\().4h
222cabdff1aSopenharmony_ci        smull           v19.4s,    \c1\().4h, v0.h[7]
223cabdff1aSopenharmony_ci        smull           v18.4s,    \c0\().4h, v0.h[7]
224cabdff1aSopenharmony_ci        add             v20.4s,    v16.4s,    v19.4s
225cabdff1aSopenharmony_ci        add             v21.4s,    v17.4s,    v19.4s
226cabdff1aSopenharmony_ci        rshrn           \c0\().4h, v20.4s,    #14
227cabdff1aSopenharmony_ci        add             v16.4s,    v16.4s,    v17.4s
228cabdff1aSopenharmony_ci        rshrn           \c1\().4h, v21.4s,    #14
229cabdff1aSopenharmony_ci        sub             v16.4s,    v16.4s,    v19.4s
230cabdff1aSopenharmony_ci        rshrn           \c2\().4h, v18.4s,    #14
231cabdff1aSopenharmony_ci        rshrn           \c3\().4h, v16.4s,    #14
232cabdff1aSopenharmony_ci.endm
233cabdff1aSopenharmony_ci
234cabdff1aSopenharmony_ci// The public functions in this file have got the following signature:
235cabdff1aSopenharmony_ci// void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
// As used below: x0 = dst, x1 = stride, x2 = block, w3 = eob.
236cabdff1aSopenharmony_ci
// Emits ff_vp9_<txfm1>_<txfm2>_4x4_add_neon: applies txfm1 to the 4x4
// coefficient block, transposes, applies txfm2, then adds the result to
// the 4x4 destination pixels with unsigned saturation.
// The coefficient block is zeroed as it is read.
237cabdff1aSopenharmony_ci.macro itxfm_func4x4 txfm1, txfm2
238cabdff1aSopenharmony_cifunction ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1
        // Load only the coefficients that are needed: idct4 reads v0.h[0..3],
        // iadst4 reads v0.h[4..7]. The iwht/iwht variant loads none; the
        // mixed variants load all eight.
239cabdff1aSopenharmony_ci.ifc \txfm1,\txfm2
240cabdff1aSopenharmony_ci.ifc \txfm1,idct
241cabdff1aSopenharmony_ci        movrel          x4,  itxfm4_coeffs
242cabdff1aSopenharmony_ci        ld1             {v0.4h}, [x4]
243cabdff1aSopenharmony_ci.endif
244cabdff1aSopenharmony_ci.ifc \txfm1,iadst
245cabdff1aSopenharmony_ci        movrel          x4,  iadst4_coeffs
246cabdff1aSopenharmony_ci        ld1             {v0.d}[1], [x4]
247cabdff1aSopenharmony_ci.endif
248cabdff1aSopenharmony_ci.else
249cabdff1aSopenharmony_ci        movrel          x4,  itxfm4_coeffs
250cabdff1aSopenharmony_ci        ld1             {v0.8h}, [x4]
251cabdff1aSopenharmony_ci.endif
252cabdff1aSopenharmony_ci
        // v31 = 0, used for clearing the coefficient block
253cabdff1aSopenharmony_ci        movi            v31.8h, #0
254cabdff1aSopenharmony_ci.ifc \txfm1\()_\txfm2,idct_idct
255cabdff1aSopenharmony_ci        cmp             w3,  #1
256cabdff1aSopenharmony_ci        b.ne            1f
257cabdff1aSopenharmony_ci        // DC-only for idct/idct
        // The DC coefficient is scaled by v0.h[0] with rounding once per
        // pass (twice in total) and then broadcast to all four rows.
258cabdff1aSopenharmony_ci        ld1             {v2.h}[0], [x2]
259cabdff1aSopenharmony_ci        smull           v2.4s,  v2.4h, v0.h[0]
260cabdff1aSopenharmony_ci        rshrn           v2.4h,  v2.4s, #14
261cabdff1aSopenharmony_ci        smull           v2.4s,  v2.4h, v0.h[0]
262cabdff1aSopenharmony_ci        rshrn           v2.4h,  v2.4s, #14
263cabdff1aSopenharmony_ci        st1             {v31.h}[0], [x2]
264cabdff1aSopenharmony_ci        dup             v4.4h,  v2.h[0]
265cabdff1aSopenharmony_ci        mov             v5.16b, v4.16b
266cabdff1aSopenharmony_ci        mov             v6.16b, v4.16b
267cabdff1aSopenharmony_ci        mov             v7.16b, v4.16b
268cabdff1aSopenharmony_ci        b               2f
269cabdff1aSopenharmony_ci.endif
270cabdff1aSopenharmony_ci
271cabdff1aSopenharmony_ci1:
        // Load all 16 coefficients (32 bytes), then clear the first 16 bytes
        // of the block; the second 16 bytes are cleared further down.
272cabdff1aSopenharmony_ci        ld1             {v4.4h,v5.4h,v6.4h,v7.4h},  [x2]
273cabdff1aSopenharmony_ci        st1             {v31.8h}, [x2], #16
274cabdff1aSopenharmony_ci
        // The iwht variant shifts its input right by 2 before the first pass
275cabdff1aSopenharmony_ci.ifc \txfm1,iwht
276cabdff1aSopenharmony_ci        sshr            v4.4h,  v4.4h,  #2
277cabdff1aSopenharmony_ci        sshr            v5.4h,  v5.4h,  #2
278cabdff1aSopenharmony_ci        sshr            v6.4h,  v6.4h,  #2
279cabdff1aSopenharmony_ci        sshr            v7.4h,  v7.4h,  #2
280cabdff1aSopenharmony_ci.endif
281cabdff1aSopenharmony_ci
282cabdff1aSopenharmony_ci        \txfm1\()4      v4,  v5,  v6,  v7
283cabdff1aSopenharmony_ci
284cabdff1aSopenharmony_ci        st1             {v31.8h}, [x2], #16
285cabdff1aSopenharmony_ci        // Transpose 4x4 with 16 bit elements
286cabdff1aSopenharmony_ci        transpose_4x4H  v4,  v5,  v6,  v7,  v16, v17, v18, v19
287cabdff1aSopenharmony_ci
288cabdff1aSopenharmony_ci        \txfm2\()4      v4,  v5,  v6,  v7
289cabdff1aSopenharmony_ci2:
        // Add v4-v7 into the four destination rows. The coefficients in v0
        // are no longer needed, so v0-v3 are reused for the pixels.
290cabdff1aSopenharmony_ci        ld1             {v0.s}[0],   [x0], x1
291cabdff1aSopenharmony_ci        ld1             {v1.s}[0],   [x0], x1
        // Final rounding shift by 4; skipped for the iwht variant
292cabdff1aSopenharmony_ci.ifnc \txfm1,iwht
293cabdff1aSopenharmony_ci        srshr           v4.4h,  v4.4h,  #4
294cabdff1aSopenharmony_ci        srshr           v5.4h,  v5.4h,  #4
295cabdff1aSopenharmony_ci        srshr           v6.4h,  v6.4h,  #4
296cabdff1aSopenharmony_ci        srshr           v7.4h,  v7.4h,  #4
297cabdff1aSopenharmony_ci.endif
298cabdff1aSopenharmony_ci        uaddw           v4.8h,  v4.8h,  v0.8b
299cabdff1aSopenharmony_ci        uaddw           v5.8h,  v5.8h,  v1.8b
300cabdff1aSopenharmony_ci        ld1             {v2.s}[0],   [x0], x1
301cabdff1aSopenharmony_ci        ld1             {v3.s}[0],   [x0], x1
302cabdff1aSopenharmony_ci        sqxtun          v0.8b,  v4.8h
303cabdff1aSopenharmony_ci        sqxtun          v1.8b,  v5.8h
        // Rewind dst by the four rows read above before storing
304cabdff1aSopenharmony_ci        sub             x0,  x0,  x1, lsl #2
305cabdff1aSopenharmony_ci
306cabdff1aSopenharmony_ci        uaddw           v6.8h,  v6.8h,  v2.8b
307cabdff1aSopenharmony_ci        uaddw           v7.8h,  v7.8h,  v3.8b
308cabdff1aSopenharmony_ci        st1             {v0.s}[0],  [x0], x1
309cabdff1aSopenharmony_ci        sqxtun          v2.8b,  v6.8h
310cabdff1aSopenharmony_ci        sqxtun          v3.8b,  v7.8h
311cabdff1aSopenharmony_ci
312cabdff1aSopenharmony_ci        st1             {v1.s}[0],  [x0], x1
313cabdff1aSopenharmony_ci        st1             {v2.s}[0],  [x0], x1
314cabdff1aSopenharmony_ci        st1             {v3.s}[0],  [x0], x1
315cabdff1aSopenharmony_ci
316cabdff1aSopenharmony_ci        ret
317cabdff1aSopenharmony_ciendfunc
318cabdff1aSopenharmony_ci.endm
319cabdff1aSopenharmony_ci
// Instantiate the 4x4 functions: all four idct/iadst combinations, plus
// iwht/iwht. The first argument is the transform of the first pass.
320cabdff1aSopenharmony_ciitxfm_func4x4 idct,  idct
321cabdff1aSopenharmony_ciitxfm_func4x4 iadst, idct
322cabdff1aSopenharmony_ciitxfm_func4x4 idct,  iadst
323cabdff1aSopenharmony_ciitxfm_func4x4 iadst, iadst
324cabdff1aSopenharmony_ciitxfm_func4x4 iwht,  iwht
325cabdff1aSopenharmony_ci
326cabdff1aSopenharmony_ci
// One pass of the 8-point idct, in place on v16-v23 (eight .8h registers).
// Expects the first row of idct_coeffs in v0.
// Clobbers v2-v7 and v24-v31.
327cabdff1aSopenharmony_ci.macro idct8
328cabdff1aSopenharmony_ci        dmbutterfly0    v16, v20, v16, v20, v2, v3, v4, v5, v6, v7 // v16 = t0a, v20 = t1a
329cabdff1aSopenharmony_ci        dmbutterfly     v18, v22, v0.h[2], v0.h[3], v2, v3, v4, v5 // v18 = t2a, v22 = t3a
330cabdff1aSopenharmony_ci        dmbutterfly     v17, v23, v0.h[4], v0.h[5], v2, v3, v4, v5 // v17 = t4a, v23 = t7a
331cabdff1aSopenharmony_ci        dmbutterfly     v21, v19, v0.h[6], v0.h[7], v2, v3, v4, v5 // v21 = t5a, v19 = t6a
332cabdff1aSopenharmony_ci
333cabdff1aSopenharmony_ci        butterfly_8h    v24, v25, v16, v22 // v24 = t0, v25 = t3
334cabdff1aSopenharmony_ci        butterfly_8h    v28, v29, v17, v21 // v28 = t4, v29 = t5a
335cabdff1aSopenharmony_ci        butterfly_8h    v30, v31, v23, v19 // v30 = t7, v31 = t6a
336cabdff1aSopenharmony_ci        butterfly_8h    v26, v27, v20, v18 // v26 = t1, v27 = t2
337cabdff1aSopenharmony_ci
338cabdff1aSopenharmony_ci        dmbutterfly0    v31, v29, v31, v29, v2, v3, v4, v5, v6, v7 // v31 = t6, v29 = t5
339cabdff1aSopenharmony_ci
340cabdff1aSopenharmony_ci        butterfly_8h    v16, v23, v24, v30 // v16 = out[0], v23 = out[7]
341cabdff1aSopenharmony_ci        butterfly_8h    v17, v22, v26, v31 // v17 = out[1], v22 = out[6]
342cabdff1aSopenharmony_ci        butterfly_8h    v18, v21, v27, v29 // v18 = out[2], v21 = out[5]
343cabdff1aSopenharmony_ci        butterfly_8h    v19, v20, v25, v28 // v19 = out[3], v20 = out[4]
344cabdff1aSopenharmony_ci.endm
345cabdff1aSopenharmony_ci
// One pass of the 8-point iadst, in place on v16-v23 (eight .8h registers).
// Expects iadst8_coeffs in v1 and the first row of idct_coeffs in v0
// (v0.h[0], v0.h[2] and v0.h[3] are used).
// Clobbers v2-v7 and v24-v31.
// Several outputs are first produced with the opposite sign and then
// negated, as noted on the individual lines.
346cabdff1aSopenharmony_ci.macro iadst8
347cabdff1aSopenharmony_ci        dmbutterfly_l   v24, v25, v26, v27, v23, v16, v1.h[1], v1.h[0]   // v24,v25 = t1a, v26,v27 = t0a
348cabdff1aSopenharmony_ci        dmbutterfly_l   v28, v29, v30, v31, v21, v18, v1.h[3], v1.h[2]   // v28,v29 = t3a, v30,v31 = t2a
349cabdff1aSopenharmony_ci        dmbutterfly_l   v2,  v3,  v4,  v5,  v19, v20, v1.h[5], v1.h[4]   // v2,v3   = t5a, v4,v5   = t4a
350cabdff1aSopenharmony_ci        dmbutterfly_l   v16, v18, v21, v23, v17, v22, v1.h[7], v1.h[6]   // v16,v18 = t7a, v21,v23 = t6a
351cabdff1aSopenharmony_ci
        // From here on, registers are reused heavily: outputs and temps of
        // dbutterfly_n overlap its own inputs.
352cabdff1aSopenharmony_ci        dbutterfly_n    v4,  v5,  v26, v27, v4,  v5,  v6,  v7, v26, v27  // v4  = t0, v5  = t4
353cabdff1aSopenharmony_ci        dbutterfly_n    v2,  v3,  v24, v25, v2,  v3,  v6,  v7, v26, v27  // v2  = t1, v3  = t5
354cabdff1aSopenharmony_ci        dbutterfly_n    v24, v25, v30, v31, v21, v23, v6,  v7, v26, v27  // v24 = t2, v25 = t6
355cabdff1aSopenharmony_ci        dbutterfly_n    v30, v31, v28, v29, v16, v18, v6,  v7, v26, v27  // v30 = t3, v31 = t7
356cabdff1aSopenharmony_ci
357cabdff1aSopenharmony_ci        butterfly_8h    v16, v6,  v4, v24 // v16 = out[0],  v6 = t2
358cabdff1aSopenharmony_ci        butterfly_8h    v23, v7,  v2, v30 // v23 = -out[7], v7 = t3
359cabdff1aSopenharmony_ci        neg             v23.8h,   v23.8h  // v23 = out[7]
360cabdff1aSopenharmony_ci
361cabdff1aSopenharmony_ci        dmbutterfly0    v19, v20, v6, v7, v24, v26, v27, v28, v29, v30   // v19 = -out[3], v20 = out[4]
362cabdff1aSopenharmony_ci        neg             v19.8h,   v19.8h  // v19 = out[3]
363cabdff1aSopenharmony_ci
364cabdff1aSopenharmony_ci        dmbutterfly_l   v26, v27, v28, v29, v5,  v3,  v0.h[2], v0.h[3]   // v26,v27 = t5a, v28,v29 = t4a
365cabdff1aSopenharmony_ci        dmbutterfly_l   v2,  v3,  v4,  v5,  v31, v25, v0.h[3], v0.h[2]   // v2,v3   = t6a, v4,v5   = t7a
366cabdff1aSopenharmony_ci
367cabdff1aSopenharmony_ci        dbutterfly_n    v17, v30, v28, v29, v2,  v3,  v6,  v7,  v24, v25 // v17 = -out[1], v30 = t6
368cabdff1aSopenharmony_ci        dbutterfly_n    v22, v31, v26, v27, v4,  v5,  v6,  v7,  v24, v25 // v22 = out[6],  v31 = t7
369cabdff1aSopenharmony_ci        neg             v17.8h,   v17.8h  // v17 = out[1]
370cabdff1aSopenharmony_ci
371cabdff1aSopenharmony_ci        dmbutterfly0    v18, v21, v30, v31, v2,  v3,  v4,  v5,  v6,  v7  // v18 = out[2], v21 = -out[5]
372cabdff1aSopenharmony_ci        neg             v21.8h,   v21.8h  // v21 = out[5]
373cabdff1aSopenharmony_ci.endm
374cabdff1aSopenharmony_ci
375cabdff1aSopenharmony_ci
// Emits ff_vp9_<txfm1>_<txfm2>_8x8_add_neon: applies txfm1 to the 8x8
// coefficient block, transposes, applies txfm2, then adds the result,
// rounded and shifted right by 5, to the 8x8 destination pixels with
// unsigned saturation. The coefficient block is zeroed as it is read.
// As used below: x0 = dst, x1 = stride, x2 = block, w3 = eob.
376cabdff1aSopenharmony_ci.macro itxfm_func8x8 txfm1, txfm2
377cabdff1aSopenharmony_cifunction ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1
378cabdff1aSopenharmony_ci        // The iadst also uses a few coefficients from
379cabdff1aSopenharmony_ci        // idct, so those always need to be loaded.
        // For the non idct/idct variants the iadst coefficients go into v1,
        // and the post-increment leaves x4 pointing at idct_coeffs, which
        // directly follows iadst8_coeffs in memory.
380cabdff1aSopenharmony_ci.ifc \txfm1\()_\txfm2,idct_idct
381cabdff1aSopenharmony_ci        movrel          x4,  idct_coeffs
382cabdff1aSopenharmony_ci.else
383cabdff1aSopenharmony_ci        movrel          x4,  iadst8_coeffs
384cabdff1aSopenharmony_ci        ld1             {v1.8h}, [x4], #16
385cabdff1aSopenharmony_ci.endif
386cabdff1aSopenharmony_ci        ld1             {v0.8h}, [x4]
387cabdff1aSopenharmony_ci
        // v2-v5 = 0, used for clearing the coefficient block
388cabdff1aSopenharmony_ci        movi            v2.8h, #0
389cabdff1aSopenharmony_ci        movi            v3.8h, #0
390cabdff1aSopenharmony_ci        movi            v4.8h, #0
391cabdff1aSopenharmony_ci        movi            v5.8h, #0
392cabdff1aSopenharmony_ci
393cabdff1aSopenharmony_ci.ifc \txfm1\()_\txfm2,idct_idct
394cabdff1aSopenharmony_ci        cmp             w3,  #1
395cabdff1aSopenharmony_ci        b.ne            1f
396cabdff1aSopenharmony_ci        // DC-only for idct/idct
        // The DC coefficient is scaled by v0.h[0] with rounding once per
        // pass (twice in total) and then broadcast to all eight rows.
397cabdff1aSopenharmony_ci        ld1             {v2.h}[0],  [x2]
398cabdff1aSopenharmony_ci        smull           v2.4s,  v2.4h, v0.h[0]
399cabdff1aSopenharmony_ci        rshrn           v2.4h,  v2.4s, #14
400cabdff1aSopenharmony_ci        smull           v2.4s,  v2.4h, v0.h[0]
401cabdff1aSopenharmony_ci        rshrn           v2.4h,  v2.4s, #14
        // v2 now holds the DC value; v3 is still zero and clears the
        // coefficient that was read.
402cabdff1aSopenharmony_ci        st1             {v3.h}[0],  [x2]
403cabdff1aSopenharmony_ci        dup             v16.8h,  v2.h[0]
404cabdff1aSopenharmony_ci        mov             v17.16b, v16.16b
405cabdff1aSopenharmony_ci        mov             v18.16b, v16.16b
406cabdff1aSopenharmony_ci        mov             v19.16b, v16.16b
407cabdff1aSopenharmony_ci        mov             v20.16b, v16.16b
408cabdff1aSopenharmony_ci        mov             v21.16b, v16.16b
409cabdff1aSopenharmony_ci        mov             v22.16b, v16.16b
410cabdff1aSopenharmony_ci        mov             v23.16b, v16.16b
411cabdff1aSopenharmony_ci        b               2f
412cabdff1aSopenharmony_ci.endif
413cabdff1aSopenharmony_ci1:
        // Load the 64 coefficients (128 bytes) into v16-v23, rewind x2 and
        // overwrite the whole block with zeros.
414cabdff1aSopenharmony_ci        ld1             {v16.8h,v17.8h,v18.8h,v19.8h},  [x2], #64
415cabdff1aSopenharmony_ci        ld1             {v20.8h,v21.8h,v22.8h,v23.8h},  [x2], #64
416cabdff1aSopenharmony_ci        sub             x2,  x2,  #128
417cabdff1aSopenharmony_ci        st1             {v2.8h,v3.8h,v4.8h,v5.8h},      [x2], #64
418cabdff1aSopenharmony_ci        st1             {v2.8h,v3.8h,v4.8h,v5.8h},      [x2], #64
419cabdff1aSopenharmony_ci
420cabdff1aSopenharmony_ci        \txfm1\()8
421cabdff1aSopenharmony_ci
422cabdff1aSopenharmony_ci        // Transpose 8x8 with 16 bit elements
423cabdff1aSopenharmony_ci        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
424cabdff1aSopenharmony_ci
425cabdff1aSopenharmony_ci        \txfm2\()8
426cabdff1aSopenharmony_ci2:
        // x0 is advanced as the read pointer for dst, x3 as the write
        // pointer. The coefficient registers v0/v1 are no longer needed,
        // so v0-v7 are reused for the pixel rows.
427cabdff1aSopenharmony_ci        mov             x3,  x0
428cabdff1aSopenharmony_ci        // Add into the destination
429cabdff1aSopenharmony_ci        ld1             {v0.8b},  [x0], x1
430cabdff1aSopenharmony_ci        srshr           v16.8h, v16.8h, #5
431cabdff1aSopenharmony_ci        ld1             {v1.8b},  [x0], x1
432cabdff1aSopenharmony_ci        srshr           v17.8h, v17.8h, #5
433cabdff1aSopenharmony_ci        ld1             {v2.8b},  [x0], x1
434cabdff1aSopenharmony_ci        srshr           v18.8h, v18.8h, #5
435cabdff1aSopenharmony_ci        uaddw           v16.8h, v16.8h, v0.8b
436cabdff1aSopenharmony_ci        ld1             {v3.8b},  [x0], x1
437cabdff1aSopenharmony_ci        srshr           v19.8h, v19.8h, #5
438cabdff1aSopenharmony_ci        uaddw           v17.8h, v17.8h, v1.8b
439cabdff1aSopenharmony_ci        ld1             {v4.8b},  [x0], x1
440cabdff1aSopenharmony_ci        srshr           v20.8h, v20.8h, #5
441cabdff1aSopenharmony_ci        uaddw           v18.8h, v18.8h, v2.8b
442cabdff1aSopenharmony_ci        sqxtun          v0.8b,  v16.8h
443cabdff1aSopenharmony_ci        ld1             {v5.8b},  [x0], x1
444cabdff1aSopenharmony_ci        srshr           v21.8h, v21.8h, #5
445cabdff1aSopenharmony_ci        uaddw           v19.8h, v19.8h, v3.8b
446cabdff1aSopenharmony_ci        sqxtun          v1.8b,  v17.8h
447cabdff1aSopenharmony_ci        ld1             {v6.8b},  [x0], x1
448cabdff1aSopenharmony_ci        srshr           v22.8h, v22.8h, #5
449cabdff1aSopenharmony_ci        uaddw           v20.8h, v20.8h, v4.8b
450cabdff1aSopenharmony_ci        sqxtun          v2.8b,  v18.8h
451cabdff1aSopenharmony_ci        ld1             {v7.8b},  [x0], x1
452cabdff1aSopenharmony_ci        srshr           v23.8h, v23.8h, #5
453cabdff1aSopenharmony_ci        uaddw           v21.8h, v21.8h, v5.8b
454cabdff1aSopenharmony_ci        sqxtun          v3.8b,  v19.8h
455cabdff1aSopenharmony_ci
        // All eight rows have been loaded; store while finishing the
        // remaining adds and narrowing.
456cabdff1aSopenharmony_ci        st1             {v0.8b},  [x3], x1
457cabdff1aSopenharmony_ci        uaddw           v22.8h, v22.8h, v6.8b
458cabdff1aSopenharmony_ci        st1             {v1.8b},  [x3], x1
459cabdff1aSopenharmony_ci        sqxtun          v4.8b,  v20.8h
460cabdff1aSopenharmony_ci        st1             {v2.8b},  [x3], x1
461cabdff1aSopenharmony_ci        uaddw           v23.8h, v23.8h, v7.8b
462cabdff1aSopenharmony_ci        st1             {v3.8b},  [x3], x1
463cabdff1aSopenharmony_ci        sqxtun          v5.8b,  v21.8h
464cabdff1aSopenharmony_ci        st1             {v4.8b},  [x3], x1
465cabdff1aSopenharmony_ci        sqxtun          v6.8b,  v22.8h
466cabdff1aSopenharmony_ci        st1             {v5.8b},  [x3], x1
467cabdff1aSopenharmony_ci        sqxtun          v7.8b,  v23.8h
468cabdff1aSopenharmony_ci
469cabdff1aSopenharmony_ci        st1             {v6.8b},  [x3], x1
470cabdff1aSopenharmony_ci        st1             {v7.8b},  [x3], x1
471cabdff1aSopenharmony_ci
472cabdff1aSopenharmony_ci        ret
473cabdff1aSopenharmony_ciendfunc
474cabdff1aSopenharmony_ci.endm
475cabdff1aSopenharmony_ci
// Instantiate the 8x8 functions for all four idct/iadst combinations.
// The first argument is the transform of the first pass.
476cabdff1aSopenharmony_ciitxfm_func8x8 idct,  idct
477cabdff1aSopenharmony_ciitxfm_func8x8 iadst, idct
478cabdff1aSopenharmony_ciitxfm_func8x8 idct,  iadst
479cabdff1aSopenharmony_ciitxfm_func8x8 iadst, iadst
480cabdff1aSopenharmony_ci
481cabdff1aSopenharmony_ci
// DC-only 16x16 idct: scales the single coefficient at [x2], clears it, and
// adds the resulting constant to every pixel of the 16x16 destination with
// unsigned saturation.
// In: x0 = dst, x1 = stride, x2 = block. Uses x3, x4, v0-v4 and v16-v19.
// Declared without export=1; the callers are not visible in this part of
// the file.
482cabdff1aSopenharmony_cifunction idct16x16_dc_add_neon
483cabdff1aSopenharmony_ci        movrel          x4,  idct_coeffs
484cabdff1aSopenharmony_ci        ld1             {v0.4h}, [x4]
485cabdff1aSopenharmony_ci
        // v1 = 0, used for clearing the coefficient
486cabdff1aSopenharmony_ci        movi            v1.4h,  #0
487cabdff1aSopenharmony_ci
        // Scale the DC coefficient by v0.h[0] with rounding, once per pass
488cabdff1aSopenharmony_ci        ld1             {v2.h}[0], [x2]
489cabdff1aSopenharmony_ci        smull           v2.4s,  v2.4h,  v0.h[0]
490cabdff1aSopenharmony_ci        rshrn           v2.4h,  v2.4s,  #14
491cabdff1aSopenharmony_ci        smull           v2.4s,  v2.4h,  v0.h[0]
492cabdff1aSopenharmony_ci        rshrn           v2.4h,  v2.4s,  #14
493cabdff1aSopenharmony_ci        dup             v2.8h,  v2.h[0]
494cabdff1aSopenharmony_ci        st1             {v1.h}[0], [x2]
495cabdff1aSopenharmony_ci
        // Final rounding shift by 6
496cabdff1aSopenharmony_ci        srshr           v2.8h,  v2.8h,  #6
497cabdff1aSopenharmony_ci
        // x0 = read pointer, x3 = write pointer, x4 = rows remaining
498cabdff1aSopenharmony_ci        mov             x3,  x0
499cabdff1aSopenharmony_ci        mov             x4,  #16
500cabdff1aSopenharmony_ci1:
501cabdff1aSopenharmony_ci        // Loop to add the constant from v2 into all 16x16 outputs
        // Two rows are handled per iteration. The flags set by subs are
        // consumed by the b.ne at the end of the loop body.
502cabdff1aSopenharmony_ci        subs            x4,  x4,  #2
503cabdff1aSopenharmony_ci        ld1             {v3.16b},  [x0], x1
504cabdff1aSopenharmony_ci        ld1             {v4.16b},  [x0], x1
505cabdff1aSopenharmony_ci        uaddw           v16.8h, v2.8h,  v3.8b
506cabdff1aSopenharmony_ci        uaddw2          v17.8h, v2.8h,  v3.16b
507cabdff1aSopenharmony_ci        uaddw           v18.8h, v2.8h,  v4.8b
508cabdff1aSopenharmony_ci        uaddw2          v19.8h, v2.8h,  v4.16b
509cabdff1aSopenharmony_ci        sqxtun          v3.8b,  v16.8h
510cabdff1aSopenharmony_ci        sqxtun2         v3.16b, v17.8h
511cabdff1aSopenharmony_ci        sqxtun          v4.8b,  v18.8h
512cabdff1aSopenharmony_ci        sqxtun2         v4.16b, v19.8h
513cabdff1aSopenharmony_ci        st1             {v3.16b},  [x3], x1
514cabdff1aSopenharmony_ci        st1             {v4.16b},  [x3], x1
515cabdff1aSopenharmony_ci        b.ne            1b
516cabdff1aSopenharmony_ci
517cabdff1aSopenharmony_ci        ret
518cabdff1aSopenharmony_ciendfunc
519cabdff1aSopenharmony_ci
// Final stages shared by idct16, idct16_half and idct16_quarter.
//
// Reads the intermediate t-values that those functions leave in v4-v7, v16,
// v17 and v20-v29 and produces out[0..15] in v16-v31, as tracked by the
// per-line register comments. v2-v7 are overwritten along the way.
//
// The macro ends in ret, so it must be the last thing expanded in a function;
// nothing placed after it is executed.
//
// butterfly_8h, butterfly_8h_r and dmbutterfly0 are defined outside this
// part of the file. NOTE(review): dmbutterfly0 takes no coefficient argument
// here, so it presumably relies on v0 still holding the idct coefficients -
// confirm against its definition.
520cabdff1aSopenharmony_ci.macro idct16_end
521cabdff1aSopenharmony_ci        butterfly_8h    v18, v7,  v4,  v7                // v18 = t0a,  v7  = t7a
522cabdff1aSopenharmony_ci        butterfly_8h    v19, v22, v5,  v22               // v19 = t1a,  v22 = t6
523cabdff1aSopenharmony_ci        butterfly_8h    v4,  v26, v20, v26               // v4  = t2a,  v26 = t5
524cabdff1aSopenharmony_ci        butterfly_8h    v5,  v6,  v28, v6                // v5  = t3a,  v6  = t4
525cabdff1aSopenharmony_ci        butterfly_8h    v20, v28, v16, v24               // v20 = t8a,  v28 = t11a
526cabdff1aSopenharmony_ci        butterfly_8h    v24, v21, v23, v21               // v24 = t9,   v21 = t10
527cabdff1aSopenharmony_ci        butterfly_8h    v23, v27, v25, v27               // v23 = t14,  v27 = t13
528cabdff1aSopenharmony_ci        butterfly_8h    v25, v29, v29, v17               // v25 = t15a, v29 = t12a
529cabdff1aSopenharmony_ci
530cabdff1aSopenharmony_ci        dmbutterfly0    v2,  v3,  v27, v21, v2,  v3,  v16, v17, v30, v31 // v2  = t13a, v3  = t10a
531cabdff1aSopenharmony_ci        dmbutterfly0    v28, v27, v29, v28, v21, v29, v16, v17, v30, v31 // v28 = t12,  v27 = t11
532cabdff1aSopenharmony_ci
        // Last butterfly stage: pairs up the two halves into the outputs.
533cabdff1aSopenharmony_ci        butterfly_8h    v16, v31, v18, v25               // v16 = out[0], v31 = out[15]
534cabdff1aSopenharmony_ci        butterfly_8h    v17, v30, v19, v23               // v17 = out[1], v30 = out[14]
535cabdff1aSopenharmony_ci        butterfly_8h_r  v25, v22, v22, v24               // v25 = out[9], v22 = out[6]
536cabdff1aSopenharmony_ci        butterfly_8h    v23, v24, v7,  v20               // v23 = out[7], v24 = out[8]
537cabdff1aSopenharmony_ci        butterfly_8h    v18, v29, v4,  v2                // v18 = out[2], v29 = out[13]
538cabdff1aSopenharmony_ci        butterfly_8h    v19, v28, v5,  v28               // v19 = out[3], v28 = out[12]
539cabdff1aSopenharmony_ci        butterfly_8h    v20, v27, v6,  v27               // v20 = out[4], v27 = out[11]
540cabdff1aSopenharmony_ci        butterfly_8h    v21, v26, v26, v3                // v21 = out[5], v26 = out[10]
541cabdff1aSopenharmony_ci        ret
542cabdff1aSopenharmony_ci.endm
543cabdff1aSopenharmony_ci
// 16-point inverse DCT over the sixteen rows held in v16-v31, each register
// carrying eight 16-bit lanes.
//
// In:    v16-v31 = input rows
//        v0, v1  = the 16 halfwords of idct_coeffs, loaded by the caller
// Out:   v16-v31 = out[0..15], produced by idct16_end, which also returns
// Clobb: v2-v7
//
// Internal helper: it is called with bl from the 1-D pass functions and does
// not touch any general purpose register.
544cabdff1aSopenharmony_cifunction idct16
        // Stage 1: rotations of the input pairs by the idct coefficients.
545cabdff1aSopenharmony_ci        dmbutterfly0    v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a,  v24 = t1a
546cabdff1aSopenharmony_ci        dmbutterfly     v20, v28, v0.h[2], v0.h[3], v2, v3, v4, v5 // v20 = t2a,  v28 = t3a
547cabdff1aSopenharmony_ci        dmbutterfly     v18, v30, v0.h[4], v0.h[5], v2, v3, v4, v5 // v18 = t4a,  v30 = t7a
548cabdff1aSopenharmony_ci        dmbutterfly     v26, v22, v0.h[6], v0.h[7], v2, v3, v4, v5 // v26 = t5a,  v22 = t6a
549cabdff1aSopenharmony_ci        dmbutterfly     v17, v31, v1.h[0], v1.h[1], v2, v3, v4, v5 // v17 = t8a,  v31 = t15a
550cabdff1aSopenharmony_ci        dmbutterfly     v25, v23, v1.h[2], v1.h[3], v2, v3, v4, v5 // v25 = t9a,  v23 = t14a
551cabdff1aSopenharmony_ci        dmbutterfly     v21, v27, v1.h[4], v1.h[5], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
552cabdff1aSopenharmony_ci        dmbutterfly     v29, v19, v1.h[6], v1.h[7], v2, v3, v4, v5 // v29 = t11a, v19 = t12a
553cabdff1aSopenharmony_ci
        // Stage 2: plain butterflies.
554cabdff1aSopenharmony_ci        butterfly_8h    v4,  v28, v16, v28               // v4  = t0,   v28 = t3
555cabdff1aSopenharmony_ci        butterfly_8h    v5,  v20, v24, v20               // v5  = t1,   v20 = t2
556cabdff1aSopenharmony_ci        butterfly_8h    v6,  v26, v18, v26               // v6  = t4,   v26 = t5
557cabdff1aSopenharmony_ci        butterfly_8h    v7,  v22, v30, v22               // v7  = t7,   v22 = t6
558cabdff1aSopenharmony_ci        butterfly_8h    v16, v25, v17, v25               // v16 = t8,   v25 = t9
559cabdff1aSopenharmony_ci        butterfly_8h    v24, v21, v29, v21               // v24 = t11,  v21 = t10
560cabdff1aSopenharmony_ci        butterfly_8h    v17, v27, v19, v27               // v17 = t12,  v27 = t13
561cabdff1aSopenharmony_ci        butterfly_8h    v29, v23, v31, v23               // v29 = t15,  v23 = t14
562cabdff1aSopenharmony_ci
        // Stage 3, then the shared tail. idct16_end contains the ret.
563cabdff1aSopenharmony_ci        dmbutterfly0    v22, v26, v22, v26, v2, v3, v18, v19, v30, v31        // v22 = t6a,  v26 = t5a
564cabdff1aSopenharmony_ci        dmbutterfly     v23, v25, v0.h[2], v0.h[3], v18, v19, v30, v31        // v23 = t9a,  v25 = t14a
565cabdff1aSopenharmony_ci        dmbutterfly     v27, v21, v0.h[2], v0.h[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
566cabdff1aSopenharmony_ci        idct16_end
567cabdff1aSopenharmony_ciendfunc
568cabdff1aSopenharmony_ci
// Variant of idct16 whose first stage uses the _h, _h1 and _h2 forms of the
// butterfly macros. Everything from the second stage on is identical to
// idct16, including the register assignment.
//
// In:    v0, v1 = the 16 halfwords of idct_coeffs, loaded by the caller
// Out:   v16-v31 = out[0..15], produced by idct16_end, which also returns
// Clobb: v2-v7
//
// NOTE(review): presumably meant for input where only v16-v23 are nonzero
// (each _h1 call has a register from that range as its first operand, each
// _h2 call as its second) - confirm against the macro definitions, which are
// outside this part of the file.
569cabdff1aSopenharmony_cifunction idct16_half
570cabdff1aSopenharmony_ci        dmbutterfly0_h  v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a,  v24 = t1a
571cabdff1aSopenharmony_ci        dmbutterfly_h1  v20, v28, v0.h[2], v0.h[3], v2, v3, v4, v5 // v20 = t2a,  v28 = t3a
572cabdff1aSopenharmony_ci        dmbutterfly_h1  v18, v30, v0.h[4], v0.h[5], v2, v3, v4, v5 // v18 = t4a,  v30 = t7a
573cabdff1aSopenharmony_ci        dmbutterfly_h2  v26, v22, v0.h[6], v0.h[7], v2, v3, v4, v5 // v26 = t5a,  v22 = t6a
574cabdff1aSopenharmony_ci        dmbutterfly_h1  v17, v31, v1.h[0], v1.h[1], v2, v3, v4, v5 // v17 = t8a,  v31 = t15a
575cabdff1aSopenharmony_ci        dmbutterfly_h2  v25, v23, v1.h[2], v1.h[3], v2, v3, v4, v5 // v25 = t9a,  v23 = t14a
576cabdff1aSopenharmony_ci        dmbutterfly_h1  v21, v27, v1.h[4], v1.h[5], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
577cabdff1aSopenharmony_ci        dmbutterfly_h2  v29, v19, v1.h[6], v1.h[7], v2, v3, v4, v5 // v29 = t11a, v19 = t12a
578cabdff1aSopenharmony_ci
        // From here on the code matches idct16 line for line.
579cabdff1aSopenharmony_ci        butterfly_8h    v4,  v28, v16, v28               // v4  = t0,   v28 = t3
580cabdff1aSopenharmony_ci        butterfly_8h    v5,  v20, v24, v20               // v5  = t1,   v20 = t2
581cabdff1aSopenharmony_ci        butterfly_8h    v6,  v26, v18, v26               // v6  = t4,   v26 = t5
582cabdff1aSopenharmony_ci        butterfly_8h    v7,  v22, v30, v22               // v7  = t7,   v22 = t6
583cabdff1aSopenharmony_ci        butterfly_8h    v16, v25, v17, v25               // v16 = t8,   v25 = t9
584cabdff1aSopenharmony_ci        butterfly_8h    v24, v21, v29, v21               // v24 = t11,  v21 = t10
585cabdff1aSopenharmony_ci        butterfly_8h    v17, v27, v19, v27               // v17 = t12,  v27 = t13
586cabdff1aSopenharmony_ci        butterfly_8h    v29, v23, v31, v23               // v29 = t15,  v23 = t14
587cabdff1aSopenharmony_ci
588cabdff1aSopenharmony_ci        dmbutterfly0    v22, v26, v22, v26, v2, v3, v18, v19, v30, v31        // v22 = t6a,  v26 = t5a
589cabdff1aSopenharmony_ci        dmbutterfly     v23, v25, v0.h[2], v0.h[3], v18, v19, v30, v31        // v23 = t9a,  v25 = t14a
590cabdff1aSopenharmony_ci        dmbutterfly     v27, v21, v0.h[2], v0.h[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
591cabdff1aSopenharmony_ci        idct16_end
592cabdff1aSopenharmony_ciendfunc
593cabdff1aSopenharmony_ci
// Variant of idct16 that reads only v16-v19 as input rows. Every other value
// that idct16_end expects is derived from those four registers.
//
// In:    v16-v19 = input rows
//        v0, v1  = the 16 halfwords of idct_coeffs, loaded by the caller
// Out:   v16-v31 = out[0..15], produced by idct16_end, which also returns
// Clobb: v2-v7
//
// dsmull_h, drshrn_h and dmbutterfly_l are defined outside this part of the
// file. NOTE(review): from their use here dsmull_h looks like a widening
// multiply into a register pair and drshrn_h like the matching rounding
// narrow - confirm against the definitions.
594cabdff1aSopenharmony_cifunction idct16_quarter
        // First stage: each value is a single product of one input row with
        // one coefficient; two of the products are negated before narrowing.
595cabdff1aSopenharmony_ci        dsmull_h        v24, v25, v19, v1.h[7]
596cabdff1aSopenharmony_ci        dsmull_h        v4,  v5,  v17, v1.h[0]
597cabdff1aSopenharmony_ci        dsmull_h        v7,  v6,  v18, v0.h[5]
598cabdff1aSopenharmony_ci        dsmull_h        v30, v31, v18, v0.h[4]
599cabdff1aSopenharmony_ci        neg             v24.4s,  v24.4s
600cabdff1aSopenharmony_ci        neg             v25.4s,  v25.4s
601cabdff1aSopenharmony_ci        dsmull_h        v29, v28, v17, v1.h[1]
602cabdff1aSopenharmony_ci        dsmull_h        v26, v27, v19, v1.h[6]
603cabdff1aSopenharmony_ci        dsmull_h        v22, v23, v16, v0.h[0]
604cabdff1aSopenharmony_ci        drshrn_h        v24, v24, v25, #14
605cabdff1aSopenharmony_ci        drshrn_h        v16, v4,  v5,  #14
606cabdff1aSopenharmony_ci        drshrn_h        v7,  v7,  v6,  #14
607cabdff1aSopenharmony_ci        drshrn_h        v6,  v30, v31, #14
608cabdff1aSopenharmony_ci        drshrn_h        v29, v29, v28, #14
609cabdff1aSopenharmony_ci        drshrn_h        v17, v26, v27, #14
610cabdff1aSopenharmony_ci        drshrn_h        v28, v22, v23, #14
611cabdff1aSopenharmony_ci
612cabdff1aSopenharmony_ci        dmbutterfly_l   v20, v21, v22, v23, v17, v24, v0.h[2], v0.h[3]
613cabdff1aSopenharmony_ci        dmbutterfly_l   v18, v19, v30, v31, v29, v16, v0.h[2], v0.h[3]
614cabdff1aSopenharmony_ci        neg             v22.4s,  v22.4s
615cabdff1aSopenharmony_ci        neg             v23.4s,  v23.4s
616cabdff1aSopenharmony_ci        drshrn_h        v27, v20, v21, #14
617cabdff1aSopenharmony_ci        drshrn_h        v21, v22, v23, #14
618cabdff1aSopenharmony_ci        drshrn_h        v23, v18, v19, #14
619cabdff1aSopenharmony_ci        drshrn_h        v25, v30, v31, #14
        // v4, v5, v20 and v28 (t0, t1, t2 and t3 in the idct16 register
        // comments) all receive the same value, the scaled first input row.
        // v20 is only written after the dmbutterfly_l above is done using it
        // as a temporary.
620cabdff1aSopenharmony_ci        mov             v4.16b,  v28.16b
621cabdff1aSopenharmony_ci        mov             v5.16b,  v28.16b
622cabdff1aSopenharmony_ci        dmbutterfly0    v22, v26, v7,  v6,  v18, v19, v30, v31
623cabdff1aSopenharmony_ci        mov             v20.16b, v28.16b
        // Shared tail; idct16_end contains the ret.
624cabdff1aSopenharmony_ci        idct16_end
625cabdff1aSopenharmony_ciendfunc
626cabdff1aSopenharmony_ci
// 16-point inverse ADST over the sixteen rows held in v16-v31, each register
// carrying eight 16-bit lanes.
//
// In:    v16-v31 = input rows
//        x11     = address of 16 halfword coefficients, loaded into v0/v1 on
//                  entry (the 16x16 wrapper points it at iadst16_coeffs)
//        x10     = address of 8 halfword coefficients, loaded into v0 midway
//                  (the 16x16 wrapper points it at idct_coeffs)
// Out:   v16-v31 = out[0..15]
// Clobb: v0-v15. On return v0 holds the values loaded from [x10] and v1 the
//        second half of the table at [x11], so a caller that runs idct16
//        afterwards has to reload v0/v1 first. v8-v15 are callee-saved in
//        AAPCS64 (low 64 bits), which is why the exported wrappers save
//        d8-d15 around any transform pair that involves iadst16.
//
// Leaf function: returns with a plain ret and touches no general purpose
// register.
627cabdff1aSopenharmony_cifunction iadst16
628cabdff1aSopenharmony_ci        ld1             {v0.8h,v1.8h}, [x11]
629cabdff1aSopenharmony_ci
630cabdff1aSopenharmony_ci        dmbutterfly_l   v6,  v7,  v4,  v5,  v31, v16, v0.h[1], v0.h[0]   // v6,v7   = t1,   v4,v5   = t0
631cabdff1aSopenharmony_ci        dmbutterfly_l   v10, v11, v8,  v9,  v23, v24, v0.h[5], v0.h[4]   // v10,v11 = t9,   v8,v9   = t8
632cabdff1aSopenharmony_ci        dbutterfly_n    v31, v24, v6,  v7,  v10, v11, v12, v13, v10, v11 // v31     = t1a,  v24     = t9a
633cabdff1aSopenharmony_ci        dmbutterfly_l   v14, v15, v12, v13, v29, v18, v0.h[3], v0.h[2]   // v14,v15 = t3,   v12,v13 = t2
634cabdff1aSopenharmony_ci        dbutterfly_n    v16, v23, v4,  v5,  v8,  v9,  v6,  v7,  v8,  v9  // v16     = t0a,  v23     = t8a
635cabdff1aSopenharmony_ci
636cabdff1aSopenharmony_ci        dmbutterfly_l   v6,  v7,  v4,  v5,  v21, v26, v0.h[7], v0.h[6]   // v6,v7   = t11,  v4,v5   = t10
637cabdff1aSopenharmony_ci        dbutterfly_n    v29, v26, v14, v15, v6,  v7,  v8,  v9,  v6,  v7  // v29     = t3a,  v26     = t11a
638cabdff1aSopenharmony_ci        dmbutterfly_l   v10, v11, v8,  v9,  v27, v20, v1.h[1], v1.h[0]   // v10,v11 = t5,   v8,v9   = t4
639cabdff1aSopenharmony_ci        dbutterfly_n    v18, v21, v12, v13, v4,  v5,  v6,  v7,  v4,  v5  // v18     = t2a,  v21     = t10a
640cabdff1aSopenharmony_ci
641cabdff1aSopenharmony_ci        dmbutterfly_l   v14, v15, v12, v13, v19, v28, v1.h[5], v1.h[4]   // v14,v15 = t13,  v12,v13 = t12
642cabdff1aSopenharmony_ci        dbutterfly_n    v20, v28, v10, v11, v14, v15, v4,  v5,  v14, v15 // v20     = t5a,  v28     = t13a
643cabdff1aSopenharmony_ci        dmbutterfly_l   v6,  v7,  v4,  v5,  v25, v22, v1.h[3], v1.h[2]   // v6,v7   = t7,   v4,v5   = t6
644cabdff1aSopenharmony_ci        dbutterfly_n    v27, v19, v8,  v9,  v12, v13, v10, v11, v12, v13 // v27     = t4a,  v19     = t12a
645cabdff1aSopenharmony_ci
646cabdff1aSopenharmony_ci        dmbutterfly_l   v10, v11, v8,  v9,  v17, v30, v1.h[7], v1.h[6]   // v10,v11 = t15,  v8,v9   = t14
        // The last use of the table from [x11] is the line above; every
        // v0.h[n] below refers to the coefficients loaded from [x10].
647cabdff1aSopenharmony_ci        ld1             {v0.8h}, [x10]
648cabdff1aSopenharmony_ci        dbutterfly_n    v22, v30, v6,  v7,  v10, v11, v12, v13, v10, v11 // v22     = t7a,  v30     = t15a
649cabdff1aSopenharmony_ci        dmbutterfly_l   v14, v15, v12, v13, v23, v24, v0.h[4], v0.h[5]   // v14,v15 = t9,   v12,v13 = t8
650cabdff1aSopenharmony_ci        dbutterfly_n    v25, v17, v4,  v5,  v8,  v9,  v6,  v7,  v8,  v9  // v25     = t6a,  v17     = t14a
651cabdff1aSopenharmony_ci
652cabdff1aSopenharmony_ci        dmbutterfly_l   v4,  v5,  v6,  v7,  v28, v19, v0.h[5], v0.h[4]   // v4,v5   = t12,  v6,v7   = t13
653cabdff1aSopenharmony_ci        dbutterfly_n    v23, v19, v12, v13, v4,  v5,  v8,  v9,  v4,  v5  // v23     = t8a,  v19     = t12a
654cabdff1aSopenharmony_ci        dmbutterfly_l   v10, v11, v8,  v9,  v21, v26, v0.h[6], v0.h[7]   // v10,v11 = t11,  v8,v9   = t10
655cabdff1aSopenharmony_ci        butterfly_8h_r  v4,  v27, v16, v27               // v4  = t4,   v27 = t0
656cabdff1aSopenharmony_ci        dbutterfly_n    v24, v28, v14, v15, v6,  v7,  v12, v13, v6,  v7  // v24     = t9a,  v28     = t13a
657cabdff1aSopenharmony_ci
658cabdff1aSopenharmony_ci        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.h[7], v0.h[6]   // v12,v13 = t14,  v14,v15 = t15
659cabdff1aSopenharmony_ci        butterfly_8h_r  v5,  v20, v31, v20               // v5  = t5, v20 = t1
660cabdff1aSopenharmony_ci        dbutterfly_n    v21, v17, v8,  v9,  v12, v13, v6,  v7,  v12, v13 // v21     = t10a, v17     = t14a
661cabdff1aSopenharmony_ci        dbutterfly_n    v26, v30, v10, v11, v14, v15, v8,  v9,  v14, v15 // v26     = t11a, v30     = t15a
662cabdff1aSopenharmony_ci
663cabdff1aSopenharmony_ci        butterfly_8h_r  v6,  v25, v18, v25               // v6  = t6, v25 = t2
664cabdff1aSopenharmony_ci        butterfly_8h_r  v7,  v22, v29, v22               // v7  = t7, v22 = t3
665cabdff1aSopenharmony_ci
666cabdff1aSopenharmony_ci        dmbutterfly_l   v10, v11, v8,  v9,  v19, v28, v0.h[2], v0.h[3]   // v10,v11 = t13,  v8,v9   = t12
667cabdff1aSopenharmony_ci        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.h[3], v0.h[2]   // v12,v13 = t14,  v14,v15 = t15
668cabdff1aSopenharmony_ci
669cabdff1aSopenharmony_ci        dbutterfly_n    v18, v30, v8,  v9,  v12, v13, v16, v17, v12, v13 // v18   = out[2], v30     = t14a
670cabdff1aSopenharmony_ci        dbutterfly_n    v29, v17, v10, v11, v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17     = t15a
671cabdff1aSopenharmony_ci        neg             v29.8h, v29.8h                   // v29 = out[13]
672cabdff1aSopenharmony_ci
673cabdff1aSopenharmony_ci        dmbutterfly_l   v10, v11, v8,  v9,  v4,  v5,  v0.h[2], v0.h[3]   // v10,v11 = t5a,  v8,v9   = t4a
674cabdff1aSopenharmony_ci        dmbutterfly_l   v12, v13, v14, v15, v7,  v6,  v0.h[3], v0.h[2]   // v12,v13 = t6a,  v14,v15 = t7a
675cabdff1aSopenharmony_ci
        // out[0], -out[1], -out[15] and out[14] are parked in v2-v5 because
        // v16, v17, v30 and v31 are still needed as inputs further down.
676cabdff1aSopenharmony_ci        butterfly_8h    v2,  v6,  v27, v25               // v2 = out[0], v6 = t2a
677cabdff1aSopenharmony_ci        butterfly_8h    v3,  v7,  v23, v21               // v3 =-out[1], v7 = t10
678cabdff1aSopenharmony_ci
679cabdff1aSopenharmony_ci        dbutterfly_n    v19, v31, v8,  v9,  v12, v13, v4,  v5,  v8,  v9  // v19 = -out[3],  v31 = t6
680cabdff1aSopenharmony_ci        neg             v19.8h, v19.8h                   // v19 = out[3]
681cabdff1aSopenharmony_ci        dbutterfly_n    v28, v16, v10, v11, v14, v15, v4,  v5,  v10, v11 // v28 = out[12],  v16 = t7
682cabdff1aSopenharmony_ci
683cabdff1aSopenharmony_ci        butterfly_8h    v5,  v8,  v20, v22               // v5 =-out[15],v8 = t3a
684cabdff1aSopenharmony_ci        butterfly_8h    v4,  v9,  v24, v26               // v4 = out[14],v9 = t11
685cabdff1aSopenharmony_ci
686cabdff1aSopenharmony_ci        dmbutterfly0    v23, v24, v6,  v8,  v10, v11, v12, v13, v14, v15, 1 // v23 = out[7], v24 = out[8]
687cabdff1aSopenharmony_ci        dmbutterfly0    v21, v26, v30, v17, v10, v11, v12, v13, v14, v15, 1 // v21 = out[5], v26 = out[10]
688cabdff1aSopenharmony_ci        dmbutterfly0    v20, v27, v16, v31, v10, v11, v12, v13, v14, v15    // v20 = out[4], v27 = out[11]
689cabdff1aSopenharmony_ci        dmbutterfly0    v22, v25, v9,  v7,  v10, v11, v12, v13, v14, v15    // v22 = out[6], v25 = out[9]
690cabdff1aSopenharmony_ci
        // Move the parked outputs into their final registers, fixing the
        // sign of the two that were computed negated.
691cabdff1aSopenharmony_ci        neg             v31.8h,  v5.8h                    // v31 = out[15]
692cabdff1aSopenharmony_ci        neg             v17.8h,  v3.8h                    // v17 = out[1]
693cabdff1aSopenharmony_ci
694cabdff1aSopenharmony_ci        mov             v16.16b, v2.16b
695cabdff1aSopenharmony_ci        mov             v30.16b, v4.16b
696cabdff1aSopenharmony_ci        ret
697cabdff1aSopenharmony_ciendfunc
698cabdff1aSopenharmony_ci
699cabdff1aSopenharmony_ci// Helper macros; we can't use these expressions directly within
700cabdff1aSopenharmony_ci// e.g. .irp due to the extra concatenation \(). Therefore wrap
701cabdff1aSopenharmony_ci// them in macros to allow using .irp below.
// load i, src, inc: load eight halfwords from [src] into vector register
// number i, then post-increment src by inc (a register or an immediate).
702cabdff1aSopenharmony_ci.macro load i, src, inc
703cabdff1aSopenharmony_ci        ld1             {v\i\().8h},  [\src], \inc
704cabdff1aSopenharmony_ci.endm
// store i, dst, inc: store the eight halfwords of vector register number i
// to [dst], then post-increment dst by inc (a register or an immediate).
705cabdff1aSopenharmony_ci.macro store i, dst, inc
706cabdff1aSopenharmony_ci        st1             {v\i\().8h},  [\dst], \inc
707cabdff1aSopenharmony_ci.endm
// movi_v i, size, imm: movi into vector register number i. size is the
// arrangement suffix including the dot (for example .16b) and imm is the
// immediate including the hash sign.
708cabdff1aSopenharmony_ci.macro movi_v i, size, imm
709cabdff1aSopenharmony_ci        movi            v\i\()\size,  \imm
710cabdff1aSopenharmony_ci.endm
// load_clear i, src, inc: load eight halfwords from [src] into vector
// register number i, overwrite the same 16 bytes with the contents of v2 and
// post-increment src by inc. The caller has to zero v2 beforehand for this to
// clear the coefficients that were just read.
711cabdff1aSopenharmony_ci.macro load_clear i, src, inc
712cabdff1aSopenharmony_ci        ld1             {v\i\().8h}, [\src]
713cabdff1aSopenharmony_ci        st1             {v2.8h},  [\src], \inc
714cabdff1aSopenharmony_ci.endm
715cabdff1aSopenharmony_ci
// Round eight rows of transform output, add them to eight rows of 8 pixels
// and store the saturated sums back to the same pixel rows.
//
//   coef0-coef7  .8h registers holding the rows; modified in place
//   tmp1, tmp2   .8b registers that receive the pixels of the last two rows
//
// Implicit register contract (not macro arguments):
//   x0 = address of the first pixel row, x3 = address of the second one
//   x1 = byte distance between successive rows reached through one pointer
//   On exit x0 and x3 have each advanced by four such steps.
//   v2-v7 are overwritten.
//
// Rows alternate between the two pointers: coef0, coef2, coef4 and coef6 go
// through x0, the others through x3. Loads, arithmetic and stores of
// different rows are interleaved on purpose, so the statement order matters.
//
// tmp1 and tmp2 may alias coef0 and coef1, as the invocations in the pass 2
// function do: they are only loaded after coef0 and coef1 have been narrowed
// into v2 and v3.
716cabdff1aSopenharmony_ci.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7, tmp1, tmp2
717cabdff1aSopenharmony_ci        srshr           \coef0, \coef0, #6
718cabdff1aSopenharmony_ci        ld1             {v2.8b},  [x0], x1
719cabdff1aSopenharmony_ci        srshr           \coef1, \coef1, #6
720cabdff1aSopenharmony_ci        ld1             {v3.8b},  [x3], x1
721cabdff1aSopenharmony_ci        srshr           \coef2, \coef2, #6
722cabdff1aSopenharmony_ci        ld1             {v4.8b},  [x0], x1
723cabdff1aSopenharmony_ci        srshr           \coef3, \coef3, #6
724cabdff1aSopenharmony_ci        uaddw           \coef0, \coef0, v2.8b
725cabdff1aSopenharmony_ci        ld1             {v5.8b},  [x3], x1
726cabdff1aSopenharmony_ci        uaddw           \coef1, \coef1, v3.8b
727cabdff1aSopenharmony_ci        srshr           \coef4, \coef4, #6
728cabdff1aSopenharmony_ci        ld1             {v6.8b},  [x0], x1
729cabdff1aSopenharmony_ci        srshr           \coef5, \coef5, #6
730cabdff1aSopenharmony_ci        ld1             {v7.8b},  [x3], x1
731cabdff1aSopenharmony_ci        sqxtun          v2.8b,  \coef0
732cabdff1aSopenharmony_ci        srshr           \coef6, \coef6, #6
733cabdff1aSopenharmony_ci        sqxtun          v3.8b,  \coef1
734cabdff1aSopenharmony_ci        srshr           \coef7, \coef7, #6
735cabdff1aSopenharmony_ci        uaddw           \coef2, \coef2, v4.8b
736cabdff1aSopenharmony_ci        ld1             {\tmp1},  [x0], x1
737cabdff1aSopenharmony_ci        uaddw           \coef3, \coef3, v5.8b
738cabdff1aSopenharmony_ci        ld1             {\tmp2},  [x3], x1
739cabdff1aSopenharmony_ci        sqxtun          v4.8b,  \coef2
        // All eight rows are loaded; rewind both pointers by the four steps
        // each of them took so that the stores hit the same rows.
740cabdff1aSopenharmony_ci        sub             x0,  x0,  x1, lsl #2
741cabdff1aSopenharmony_ci        sub             x3,  x3,  x1, lsl #2
742cabdff1aSopenharmony_ci        sqxtun          v5.8b,  \coef3
743cabdff1aSopenharmony_ci        uaddw           \coef4, \coef4, v6.8b
744cabdff1aSopenharmony_ci        st1             {v2.8b},  [x0], x1
745cabdff1aSopenharmony_ci        uaddw           \coef5, \coef5, v7.8b
746cabdff1aSopenharmony_ci        st1             {v3.8b},  [x3], x1
747cabdff1aSopenharmony_ci        sqxtun          v6.8b,  \coef4
748cabdff1aSopenharmony_ci        st1             {v4.8b},  [x0], x1
749cabdff1aSopenharmony_ci        sqxtun          v7.8b,  \coef5
750cabdff1aSopenharmony_ci        st1             {v5.8b},  [x3], x1
751cabdff1aSopenharmony_ci        uaddw           \coef6, \coef6, \tmp1
752cabdff1aSopenharmony_ci        st1             {v6.8b},  [x0], x1
753cabdff1aSopenharmony_ci        uaddw           \coef7, \coef7, \tmp2
754cabdff1aSopenharmony_ci        st1             {v7.8b},  [x3], x1
755cabdff1aSopenharmony_ci        sqxtun          \tmp1,  \coef6
756cabdff1aSopenharmony_ci        sqxtun          \tmp2,  \coef7
757cabdff1aSopenharmony_ci        st1             {\tmp1},  [x0], x1
758cabdff1aSopenharmony_ci        st1             {\tmp2},  [x3], x1
759cabdff1aSopenharmony_ci.endm
760cabdff1aSopenharmony_ci
761cabdff1aSopenharmony_ci// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
762cabdff1aSopenharmony_ci// transpose into a horizontal 16x8 slice and store.
763cabdff1aSopenharmony_ci// x0 = dst (temp buffer)
764cabdff1aSopenharmony_ci// x1 = slice offset
765cabdff1aSopenharmony_ci// x2 = src
766cabdff1aSopenharmony_ci// x9 = input stride
// The macro below defines both 1-D pass functions for one transform type;
// txfm selects which 16-point kernel (idct16 or iadst16) they call.
// Both functions keep their return address in x14 because the bl to the
// kernel overwrites x30, and both return with ret x14.
// Pass 1 additionally writes zeros over every coefficient it reads from src
// and clobbers v2, v3 plus whatever the kernel clobbers.
767cabdff1aSopenharmony_ci.macro itxfm16_1d_funcs txfm
768cabdff1aSopenharmony_cifunction \txfm\()16_1d_8x16_pass1_neon
769cabdff1aSopenharmony_ci        mov             x14, x30
770cabdff1aSopenharmony_ci
        // v2 = 0 is what load_clear stores back over each row it has read.
771cabdff1aSopenharmony_ci        movi            v2.8h, #0
772cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
773cabdff1aSopenharmony_ci        load_clear      \i,  x2,  x9
774cabdff1aSopenharmony_ci.endr
775cabdff1aSopenharmony_ci
776cabdff1aSopenharmony_ci        bl              \txfm\()16
777cabdff1aSopenharmony_ci
778cabdff1aSopenharmony_ci        // Do two 8x8 transposes. Originally, v16-v31 contain the
779cabdff1aSopenharmony_ci        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
780cabdff1aSopenharmony_ci        // transposed 8x8 blocks.
781cabdff1aSopenharmony_ci        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
782cabdff1aSopenharmony_ci        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
783cabdff1aSopenharmony_ci
784cabdff1aSopenharmony_ci        // Store the transposed 8x8 blocks horizontally.
785cabdff1aSopenharmony_ci        cmp             x1,  #8
786cabdff1aSopenharmony_ci        b.eq            1f
        // Interleaved order: each 32 byte row of the temp buffer gets 16
        // bytes from the first block followed by 16 from the second.
787cabdff1aSopenharmony_ci.irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
788cabdff1aSopenharmony_ci        store           \i,  x0,  #16
789cabdff1aSopenharmony_ci.endr
790cabdff1aSopenharmony_ci        ret             x14
791cabdff1aSopenharmony_ci1:
792cabdff1aSopenharmony_ci        // Special case: For the last input column (x1 == 8),
793cabdff1aSopenharmony_ci        // which would be stored as the last row in the temp buffer,
794cabdff1aSopenharmony_ci        // don't store the first 8x8 block, but keep it in registers
795cabdff1aSopenharmony_ci        // for the first slice of the second pass (where it is the
796cabdff1aSopenharmony_ci        // last 8x8 block).
        // Each iteration skips the first 16 bytes of a row and writes only
        // its second 16 bytes.
797cabdff1aSopenharmony_ci.irp i, 24, 25, 26, 27, 28, 29, 30, 31
798cabdff1aSopenharmony_ci        add             x0,  x0,  #16
799cabdff1aSopenharmony_ci        store           \i,  x0,  #16
800cabdff1aSopenharmony_ci.endr
        // Hand the unstored block over in v24-v31, which is where pass 2
        // expects the second half of its input when its slice offset is 0.
801cabdff1aSopenharmony_ci        mov             v24.16b, v16.16b
802cabdff1aSopenharmony_ci        mov             v25.16b, v17.16b
803cabdff1aSopenharmony_ci        mov             v26.16b, v18.16b
804cabdff1aSopenharmony_ci        mov             v27.16b, v19.16b
805cabdff1aSopenharmony_ci        mov             v28.16b, v20.16b
806cabdff1aSopenharmony_ci        mov             v29.16b, v21.16b
807cabdff1aSopenharmony_ci        mov             v30.16b, v22.16b
808cabdff1aSopenharmony_ci        mov             v31.16b, v23.16b
809cabdff1aSopenharmony_ci        ret             x14
810cabdff1aSopenharmony_ciendfunc
811cabdff1aSopenharmony_ci
812cabdff1aSopenharmony_ci// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
813cabdff1aSopenharmony_ci// load the destination pixels (from a similar 8x16 slice), add and store back.
814cabdff1aSopenharmony_ci// x0 = dst
815cabdff1aSopenharmony_ci// x1 = dst stride
816cabdff1aSopenharmony_ci// x2 = src (temp buffer)
817cabdff1aSopenharmony_ci// x3 = slice offset
818cabdff1aSopenharmony_ci// x9 = temp buffer stride
// On return x0, x1, x2 and x3 no longer hold their input values, so the
// caller has to set them up again before every call.
819cabdff1aSopenharmony_cifunction \txfm\()16_1d_8x16_pass2_neon
820cabdff1aSopenharmony_ci        mov             x14, x30
821cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23
822cabdff1aSopenharmony_ci        load            \i,  x2,  x9
823cabdff1aSopenharmony_ci.endr
        // For slice offset 0 the second half of the input is taken from
        // v24-v31 as left there by the code that ran before this call,
        // instead of being loaded from the temp buffer.
824cabdff1aSopenharmony_ci        cbz             x3,  1f
825cabdff1aSopenharmony_ci.irp i, 24, 25, 26, 27, 28, 29, 30, 31
826cabdff1aSopenharmony_ci        load            \i,  x2,  x9
827cabdff1aSopenharmony_ci.endr
828cabdff1aSopenharmony_ci1:
829cabdff1aSopenharmony_ci
        // Set up the implicit arguments of load_add_store: x3 is reused as
        // the pointer to the second dst row and x1 becomes twice the stride,
        // so that x0 and x3 each step over every other row.
830cabdff1aSopenharmony_ci        add             x3,  x0,  x1
831cabdff1aSopenharmony_ci        lsl             x1,  x1,  #1
832cabdff1aSopenharmony_ci        bl              \txfm\()16
833cabdff1aSopenharmony_ci
        // Rows 0-7, then rows 8-15. v16 and v17 serve as tmp registers in
        // both expansions; in the second one they are free because the first
        // one has already stored their rows.
834cabdff1aSopenharmony_ci        load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
835cabdff1aSopenharmony_ci        load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
836cabdff1aSopenharmony_ci
837cabdff1aSopenharmony_ci        ret             x14
838cabdff1aSopenharmony_ciendfunc
839cabdff1aSopenharmony_ci.endm
840cabdff1aSopenharmony_ci
// Emit the pass 1 and pass 2 functions for both 16-point transform kernels.
841cabdff1aSopenharmony_ciitxfm16_1d_funcs idct
842cabdff1aSopenharmony_ciitxfm16_1d_funcs iadst
843cabdff1aSopenharmony_ci
// Generate one exported 16x16 inverse transform and add function. The first
// pass uses the txfm1 kernel and the second pass the txfm2 kernel.
//
// In:  x0 = dst pixels, x1 = dst stride, x2 = coefficient block
//      w3 = count compared against 1, 10 and 38 to pick a reduced path in the
//           idct_idct variant (presumably the eob value - TODO confirm
//           against the C prototype)
//
// Register use inside the function:
//      x4, x5, x6 = copies of dst, stride and coefficient pointer
//      x9         = 32, the row stride in bytes used for both the input
//                   block and the temp buffer
//      x10, x11   = addresses of idct_coeffs and iadst16_coeffs
//      x15        = return address; the pass functions called with bl keep
//                   theirs in x14
//      sp         = 512 byte temp buffer (16 rows of 32 bytes) that carries
//                   the pass 1 output to pass 2
//
// Variants that involve iadst16 save and restore d8-d15 around the body.
844cabdff1aSopenharmony_ci.macro itxfm_func16x16 txfm1, txfm2
845cabdff1aSopenharmony_cifunction ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
846cabdff1aSopenharmony_ci.ifc \txfm1\()_\txfm2,idct_idct
        // DC-only shortcut. This is a plain branch taken before anything is
        // saved, so the target returns directly to our caller.
847cabdff1aSopenharmony_ci        cmp             w3,  #1
848cabdff1aSopenharmony_ci        b.eq            idct16x16_dc_add_neon
849cabdff1aSopenharmony_ci.endif
850cabdff1aSopenharmony_ci        mov             x15, x30
851cabdff1aSopenharmony_ci        // iadst16 requires clobbering v8-v15, but idct16 doesn't need to.
852cabdff1aSopenharmony_ci.ifnc \txfm1\()_\txfm2,idct_idct
853cabdff1aSopenharmony_ci        stp             d14, d15, [sp, #-0x10]!
854cabdff1aSopenharmony_ci        stp             d12, d13, [sp, #-0x10]!
855cabdff1aSopenharmony_ci        stp             d10, d11, [sp, #-0x10]!
856cabdff1aSopenharmony_ci        stp             d8,  d9,  [sp, #-0x10]!
857cabdff1aSopenharmony_ci.endif
858cabdff1aSopenharmony_ci
859cabdff1aSopenharmony_ci        sub             sp,  sp,  #512
860cabdff1aSopenharmony_ci
        // The pass functions overwrite x0-x3, so keep the arguments aside.
861cabdff1aSopenharmony_ci        mov             x4,  x0
862cabdff1aSopenharmony_ci        mov             x5,  x1
863cabdff1aSopenharmony_ci        mov             x6,  x2
864cabdff1aSopenharmony_ci
865cabdff1aSopenharmony_ci        movrel          x10, idct_coeffs
866cabdff1aSopenharmony_ci.ifnc \txfm1\()_\txfm2,idct_idct
867cabdff1aSopenharmony_ci        movrel          x11, iadst16_coeffs
868cabdff1aSopenharmony_ci.endif
        // idct16 expects its coefficients in v0/v1 on entry, whereas iadst16
        // loads its own tables from x11 and x10.
869cabdff1aSopenharmony_ci.ifc \txfm1,idct
870cabdff1aSopenharmony_ci        ld1             {v0.8h,v1.8h}, [x10]
871cabdff1aSopenharmony_ci.endif
872cabdff1aSopenharmony_ci        mov             x9,  #32
873cabdff1aSopenharmony_ci
874cabdff1aSopenharmony_ci.ifc \txfm1\()_\txfm2,idct_idct
        // NOTE(review): both targets are defined outside this part of the
        // file. They are entered with a plain branch while the temp buffer is
        // allocated and the return address sits in x15, so they presumably
        // release the buffer and return through x15 themselves - confirm.
875cabdff1aSopenharmony_ci        cmp             w3,  #10
876cabdff1aSopenharmony_ci        b.le            idct16x16_quarter_add_neon
877cabdff1aSopenharmony_ci        cmp             w3,  #38
878cabdff1aSopenharmony_ci        b.le            idct16x16_half_add_neon
879cabdff1aSopenharmony_ci.endif
880cabdff1aSopenharmony_ci
        // Pass 1 over the two 8 column wide input slices. The second call has
        // slice offset 8, which makes pass 1 keep half of its result in
        // v24-v31 for the first pass 2 call.
881cabdff1aSopenharmony_ci.irp i, 0, 8
882cabdff1aSopenharmony_ci        add             x0,  sp,  #(\i*32)
883cabdff1aSopenharmony_ci.ifc \txfm1\()_\txfm2,idct_idct
884cabdff1aSopenharmony_ci.if \i == 8
        // NOTE(review): w3 <= 38 has already branched to the half variant
        // above and w3 is not modified in between, so this branch and the
        // block at label 1 look unreachable - confirm before relying on or
        // removing them.
885cabdff1aSopenharmony_ci        cmp             w3,  #38
886cabdff1aSopenharmony_ci        b.le            1f
887cabdff1aSopenharmony_ci.endif
888cabdff1aSopenharmony_ci.endif
889cabdff1aSopenharmony_ci        mov             x1,  #\i
890cabdff1aSopenharmony_ci        add             x2,  x6,  #(\i*2)
891cabdff1aSopenharmony_ci        bl              \txfm1\()16_1d_8x16_pass1_neon
892cabdff1aSopenharmony_ci.endr
        // iadst16 overwrote v0/v1 with its own tables during pass 1, so
        // reload the idct coefficients that the idct16 kernel of pass 2
        // expects there.
893cabdff1aSopenharmony_ci.ifc \txfm1\()_\txfm2,iadst_idct
894cabdff1aSopenharmony_ci        ld1             {v0.8h,v1.8h}, [x10]
895cabdff1aSopenharmony_ci.endif
896cabdff1aSopenharmony_ci
897cabdff1aSopenharmony_ci.ifc \txfm1\()_\txfm2,idct_idct
898cabdff1aSopenharmony_ci        b               3f
899cabdff1aSopenharmony_ci1:
900cabdff1aSopenharmony_ci        // Set v24-v31 to zero, for the in-register passthrough of
901cabdff1aSopenharmony_ci        // coefficients to pass 2. Since we only do two slices, this can
902cabdff1aSopenharmony_ci        // only ever happen for the second slice. So we only need to store
903cabdff1aSopenharmony_ci        // zeros to the temp buffer for the second half of the buffer.
904cabdff1aSopenharmony_ci        // Move x0 to the second half, and use x9 == 32 as increment.
905cabdff1aSopenharmony_ci        add             x0,  x0,  #16
        // v24 is zeroed in the first iteration, so storing v24 every time
        // writes zeros to all eight rows.
906cabdff1aSopenharmony_ci.irp i, 24, 25, 26, 27, 28, 29, 30, 31
907cabdff1aSopenharmony_ci        movi_v          \i,  .16b, #0
908cabdff1aSopenharmony_ci        st1             {v24.8h},  [x0], x9
909cabdff1aSopenharmony_ci.endr
910cabdff1aSopenharmony_ci3:
911cabdff1aSopenharmony_ci.endif
912cabdff1aSopenharmony_ci
        // Pass 2 over the two 8 pixel wide dst slices. All four argument
        // registers are set up again for each call because the pass function
        // overwrites them.
913cabdff1aSopenharmony_ci.irp i, 0, 8
914cabdff1aSopenharmony_ci        add             x0,  x4,  #(\i)
915cabdff1aSopenharmony_ci        mov             x1,  x5
916cabdff1aSopenharmony_ci        add             x2,  sp,  #(\i*2)
917cabdff1aSopenharmony_ci        mov             x3,  #\i
918cabdff1aSopenharmony_ci        bl              \txfm2\()16_1d_8x16_pass2_neon
919cabdff1aSopenharmony_ci.endr
920cabdff1aSopenharmony_ci
        // Release the temp buffer and restore d8-d15 in the reverse order of
        // the saves above.
921cabdff1aSopenharmony_ci        add             sp,  sp,  #512
922cabdff1aSopenharmony_ci.ifnc \txfm1\()_\txfm2,idct_idct
923cabdff1aSopenharmony_ci        ldp             d8,  d9,  [sp], 0x10
924cabdff1aSopenharmony_ci        ldp             d10, d11, [sp], 0x10
925cabdff1aSopenharmony_ci        ldp             d12, d13, [sp], 0x10
926cabdff1aSopenharmony_ci        ldp             d14, d15, [sp], 0x10
927cabdff1aSopenharmony_ci.endif
928cabdff1aSopenharmony_ci        ret             x15
929cabdff1aSopenharmony_ciendfunc
930cabdff1aSopenharmony_ci.endm
931cabdff1aSopenharmony_ci
// Emit the exported 16x16 add function for each of the four transform pairs.
932cabdff1aSopenharmony_ciitxfm_func16x16 idct,  idct
933cabdff1aSopenharmony_ciitxfm_func16x16 iadst, idct
934cabdff1aSopenharmony_ciitxfm_func16x16 idct,  iadst
935cabdff1aSopenharmony_ciitxfm_func16x16 iadst, iadst
936cabdff1aSopenharmony_ci
// Pass 1 of the 16x16 IDCT for the "quarter" case, i.e. when only a 4x4
// part of the input is nonzero (see the store comment further down).
// Registers as used in this routine:
//   x0  = temp buffer destination; advanced by 16 bytes before storing
//   x2  = coefficient source, consumed through load_clear
//   x9  = stride handed to load_clear and store
//   x14 = copy of the return address (clobbered)
//   v16-v19 are loaded; v16-v31 are transposed afterwards.
// NOTE(review): load_clear, store, transpose_8x8H and idct16_quarter are
// defined outside this chunk; check there for their exact side effects.
937cabdff1aSopenharmony_cifunction idct16_1d_8x16_pass1_quarter_neon
938cabdff1aSopenharmony_ci        mov             x14, x30 // the bl below overwrites x30, so return via x14
939cabdff1aSopenharmony_ci        movi            v2.8h, #0 // all-zero vector; presumably what load_clear writes back (macro not visible here)
        // Only four input rows are fetched for the quarter transform.
940cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19
941cabdff1aSopenharmony_ci        load_clear      \i,  x2,  x9
942cabdff1aSopenharmony_ci.endr
943cabdff1aSopenharmony_ci
944cabdff1aSopenharmony_ci        bl              idct16_quarter
945cabdff1aSopenharmony_ci
946cabdff1aSopenharmony_ci        // Do two 8x8 transposes. Originally, v16-v31 contain the
947cabdff1aSopenharmony_ci        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
948cabdff1aSopenharmony_ci        // transposed 8x8 blocks.
        // v2 and v3 are passed as the trailing arguments, presumably as
        // scratch; the zero in v2 is no longer needed at this point.
949cabdff1aSopenharmony_ci        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
950cabdff1aSopenharmony_ci        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
951cabdff1aSopenharmony_ci
952cabdff1aSopenharmony_ci        // Store the transposed 8x8 blocks horizontally.
953cabdff1aSopenharmony_ci        // The first 8x8 block is kept in registers for the second pass,
954cabdff1aSopenharmony_ci        // store the rest in the temp buffer.
955cabdff1aSopenharmony_ci        // Since only a 4x4 part of the input was nonzero, this means that
956cabdff1aSopenharmony_ci        // only 4 rows are nonzero after transposing, and the second pass
957cabdff1aSopenharmony_ci        // only reads the topmost 4 rows. Therefore only store the topmost
958cabdff1aSopenharmony_ci        // 4 rows.
959cabdff1aSopenharmony_ci        add             x0,  x0,  #16 // step past the first 16 bytes of each temp-buffer row
960cabdff1aSopenharmony_ci.irp i, 24, 25, 26, 27
961cabdff1aSopenharmony_ci        store           \i,  x0,  x9
962cabdff1aSopenharmony_ci.endr
963cabdff1aSopenharmony_ci        ret             x14
964cabdff1aSopenharmony_ciendfunc
965cabdff1aSopenharmony_ci
// Pass 2 of the 16x16 IDCT for the "quarter" case. Runs idct16_quarter on
// four input rows and hands all sixteen result registers to
// load_add_store.
// Registers as used in this routine:
//   x0  = destination pointer (presumably the picture; set from x4 by the
//         caller in idct16_partial)
//   x1  = destination stride on entry, doubled before the transform
//   x2  = temp buffer source, x9 = stride handed to load
//   x3  = slice offset on entry; overwritten with x0 + x1
//   x14 = copy of the return address (clobbered)
966cabdff1aSopenharmony_cifunction idct16_1d_8x16_pass2_quarter_neon
967cabdff1aSopenharmony_ci        mov             x14, x30 // the bl below overwrites x30, so return via x14
        // For slice 0 (x3 == 0) the reload is skipped: pass 1 keeps the
        // first transposed 8x8 block in registers for this pass.
968cabdff1aSopenharmony_ci        cbz             x3,  1f
969cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19
970cabdff1aSopenharmony_ci        load            \i,  x2,  x9
971cabdff1aSopenharmony_ci.endr
972cabdff1aSopenharmony_ci1:
973cabdff1aSopenharmony_ci
        // x3 = address one stride after x0, x1 = two strides.
        // NOTE(review): presumably load_add_store walks x0/x3 as two
        // interleaved row pointers; confirm in the macro definition.
974cabdff1aSopenharmony_ci        add             x3,  x0,  x1
975cabdff1aSopenharmony_ci        lsl             x1,  x1,  #1
976cabdff1aSopenharmony_ci        bl              idct16_quarter
977cabdff1aSopenharmony_ci
978cabdff1aSopenharmony_ci        load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
979cabdff1aSopenharmony_ci        load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
980cabdff1aSopenharmony_ci
981cabdff1aSopenharmony_ci        ret             x14
982cabdff1aSopenharmony_ciendfunc
983cabdff1aSopenharmony_ci
// Pass 1 of the 16x16 IDCT for the "half" case. Same shape as the quarter
// variant, but eight input rows (v16-v23) are loaded, idct16_half is
// called, and all eight rows of the second transposed block are stored.
// Registers as used in this routine:
//   x0  = temp buffer destination; advanced by 16 bytes before storing
//   x2  = coefficient source, consumed through load_clear
//   x9  = stride handed to load_clear and store
//   x14 = copy of the return address (clobbered)
// NOTE(review): load_clear, store, transpose_8x8H and idct16_half are
// defined outside this chunk; check there for their exact side effects.
984cabdff1aSopenharmony_cifunction idct16_1d_8x16_pass1_half_neon
985cabdff1aSopenharmony_ci        mov             x14, x30 // the bl below overwrites x30, so return via x14
986cabdff1aSopenharmony_ci        movi            v2.8h, #0 // all-zero vector; presumably what load_clear writes back (macro not visible here)
987cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23
988cabdff1aSopenharmony_ci        load_clear      \i,  x2,  x9
989cabdff1aSopenharmony_ci.endr
990cabdff1aSopenharmony_ci
991cabdff1aSopenharmony_ci        bl              idct16_half
992cabdff1aSopenharmony_ci
993cabdff1aSopenharmony_ci        // Do two 8x8 transposes. Originally, v16-v31 contain the
994cabdff1aSopenharmony_ci        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
995cabdff1aSopenharmony_ci        // transposed 8x8 blocks.
996cabdff1aSopenharmony_ci        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
997cabdff1aSopenharmony_ci        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
998cabdff1aSopenharmony_ci
999cabdff1aSopenharmony_ci        // Store the transposed 8x8 blocks horizontally.
1000cabdff1aSopenharmony_ci        // The first 8x8 block is kept in registers for the second pass,
1001cabdff1aSopenharmony_ci        // store the rest in the temp buffer.
1002cabdff1aSopenharmony_ci        add             x0,  x0,  #16 // step past the first 16 bytes of each temp-buffer row
1003cabdff1aSopenharmony_ci.irp i, 24, 25, 26, 27, 28, 29, 30, 31
1004cabdff1aSopenharmony_ci        store           \i,  x0,  x9
1005cabdff1aSopenharmony_ci.endr
1006cabdff1aSopenharmony_ci        ret             x14
1007cabdff1aSopenharmony_ciendfunc
1008cabdff1aSopenharmony_ci
// Pass 2 of the 16x16 IDCT for the "half" case. Runs idct16_half on eight
// input rows and hands all sixteen result registers to load_add_store.
// Registers as used in this routine:
//   x0  = destination pointer (presumably the picture; set from x4 by the
//         caller in idct16_partial)
//   x1  = destination stride on entry, doubled before the transform
//   x2  = temp buffer source, x9 = stride handed to load
//   x3  = slice offset on entry; overwritten with x0 + x1
//   x14 = copy of the return address (clobbered)
1009cabdff1aSopenharmony_cifunction idct16_1d_8x16_pass2_half_neon
1010cabdff1aSopenharmony_ci        mov             x14, x30 // the bl below overwrites x30, so return via x14
        // For slice 0 (x3 == 0) the reload is skipped: pass 1 keeps the
        // first transposed 8x8 block in registers for this pass.
1011cabdff1aSopenharmony_ci        cbz             x3,  1f
1012cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23
1013cabdff1aSopenharmony_ci        load            \i,  x2,  x9
1014cabdff1aSopenharmony_ci.endr
1015cabdff1aSopenharmony_ci1:
1016cabdff1aSopenharmony_ci
        // x3 = address one stride after x0, x1 = two strides.
        // NOTE(review): presumably load_add_store walks x0/x3 as two
        // interleaved row pointers; confirm in the macro definition.
1017cabdff1aSopenharmony_ci        add             x3,  x0,  x1
1018cabdff1aSopenharmony_ci        lsl             x1,  x1,  #1
1019cabdff1aSopenharmony_ci        bl              idct16_half
1020cabdff1aSopenharmony_ci
1021cabdff1aSopenharmony_ci        load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
1022cabdff1aSopenharmony_ci        load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
1023cabdff1aSopenharmony_ci
1024cabdff1aSopenharmony_ci        ret             x14
1025cabdff1aSopenharmony_ciendfunc
1026cabdff1aSopenharmony_ci
// Emit idct16x16_<size>_add_neon: one pass-1 call on the first slice,
// followed by two pass-2 calls for byte offsets 0 and 8 into the output.
//
// The generated function has no prologue of its own. It relies on state
// that is not set up anywhere in this macro:
//   sp     = base of a 512-byte temp buffer (released here before return)
//   x4/x5  = copied into x0 (plus offset) / x1 for pass 2
//   x6     = base used for the pass-1 source pointer
//   x15    = return address (the function returns with ret x15)
// NOTE(review): presumably all of this, plus x9, is established by the
// 16x16 idct_idct entry code before it branches here; confirm at the
// branch site, which is outside this chunk.
1027cabdff1aSopenharmony_ci.macro idct16_partial size
1028cabdff1aSopenharmony_cifunction idct16x16_\size\()_add_neon
1029cabdff1aSopenharmony_ci        add             x0,  sp,  #(0*32)
1030cabdff1aSopenharmony_ci        add             x2,  x6,  #(0*2)
1031cabdff1aSopenharmony_ci        bl              idct16_1d_8x16_pass1_\size\()_neon
        // For \i == 8 the pass-2 source is sp + 16, which is the offset at
        // which the pass-1 routines store their second transposed block.
        // x3 = \i lets pass 2 skip the reload for the first slice.
1032cabdff1aSopenharmony_ci.irp i, 0, 8
1033cabdff1aSopenharmony_ci        add             x0,  x4,  #(\i)
1034cabdff1aSopenharmony_ci        mov             x1,  x5
1035cabdff1aSopenharmony_ci        add             x2,  sp,  #(\i*2)
1036cabdff1aSopenharmony_ci        mov             x3,  #\i
1037cabdff1aSopenharmony_ci        bl              idct16_1d_8x16_pass2_\size\()_neon
1038cabdff1aSopenharmony_ci.endr
1039cabdff1aSopenharmony_ci
1040cabdff1aSopenharmony_ci        add             sp,  sp,  #512
1041cabdff1aSopenharmony_ci        ret             x15
1042cabdff1aSopenharmony_ciendfunc
1043cabdff1aSopenharmony_ci.endm
1044cabdff1aSopenharmony_ci
// Generate idct16x16_quarter_add_neon and idct16x16_half_add_neon.
1045cabdff1aSopenharmony_ciidct16_partial quarter
1046cabdff1aSopenharmony_ciidct16_partial half
1047cabdff1aSopenharmony_ci
// DC-only 32x32 inverse transform: derive one constant from the single
// coefficient at [x2] and add it to a 32x32 block of unsigned bytes.
//   x0 = destination (read pointer, post-incremented by x1 per row)
//   x1 = destination stride in bytes
//   x2 = coefficient pointer; the halfword read there is zeroed afterwards
// Clobbers x0, x3, x4, the condition flags, v0-v4 and v16-v23.
// Leaf routine: returns through x30 with a plain ret.
1048cabdff1aSopenharmony_cifunction idct32x32_dc_add_neon
1049cabdff1aSopenharmony_ci        movrel          x4,  idct_coeffs
1050cabdff1aSopenharmony_ci        ld1             {v0.4h}, [x4]
1051cabdff1aSopenharmony_ci
1052cabdff1aSopenharmony_ci        movi            v1.4h,  #0
1053cabdff1aSopenharmony_ci
        // Scale the coefficient twice by idct_coeffs[0] (11585), each time
        // with a rounding narrowing shift by 14. Only lane 0 is
        // meaningful; it is broadcast by the dup.
1054cabdff1aSopenharmony_ci        ld1             {v2.h}[0], [x2]
1055cabdff1aSopenharmony_ci        smull           v2.4s,  v2.4h,  v0.h[0]
1056cabdff1aSopenharmony_ci        rshrn           v2.4h,  v2.4s,  #14
1057cabdff1aSopenharmony_ci        smull           v2.4s,  v2.4h,  v0.h[0]
1058cabdff1aSopenharmony_ci        rshrn           v2.4h,  v2.4s,  #14
1059cabdff1aSopenharmony_ci        dup             v2.8h,  v2.h[0]
1060cabdff1aSopenharmony_ci        st1             {v1.h}[0], [x2] // clear the consumed coefficient
1061cabdff1aSopenharmony_ci
1062cabdff1aSopenharmony_ci        srshr           v0.8h,  v2.8h,  #6 // final rounding shift; v0 is the per-pixel addend
1063cabdff1aSopenharmony_ci
1064cabdff1aSopenharmony_ci        mov             x3,  x0 // x3 = store pointer, trailing the x0 load pointer
1065cabdff1aSopenharmony_ci        mov             x4,  #32 // rows remaining
1066cabdff1aSopenharmony_ci1:
1067cabdff1aSopenharmony_ci        // Loop to add the constant v0 into all 32x32 outputs
        // Two rows of 32 bytes are handled per iteration. The flags set
        // by subs are still intact at the b.ne below, since none of the
        // loads, stores or vector ops in between write the flags.
1068cabdff1aSopenharmony_ci        subs            x4,  x4,  #2
1069cabdff1aSopenharmony_ci        ld1             {v1.16b,v2.16b},  [x0], x1
1070cabdff1aSopenharmony_ci        uaddw           v16.8h, v0.8h,  v1.8b
1071cabdff1aSopenharmony_ci        uaddw2          v17.8h, v0.8h,  v1.16b
1072cabdff1aSopenharmony_ci        ld1             {v3.16b,v4.16b},  [x0], x1
1073cabdff1aSopenharmony_ci        uaddw           v18.8h, v0.8h,  v2.8b
1074cabdff1aSopenharmony_ci        uaddw2          v19.8h, v0.8h,  v2.16b
1075cabdff1aSopenharmony_ci        uaddw           v20.8h, v0.8h,  v3.8b
1076cabdff1aSopenharmony_ci        uaddw2          v21.8h, v0.8h,  v3.16b
1077cabdff1aSopenharmony_ci        uaddw           v22.8h, v0.8h,  v4.8b
1078cabdff1aSopenharmony_ci        uaddw2          v23.8h, v0.8h,  v4.16b
        // Narrow back to bytes with signed-to-unsigned saturation.
1079cabdff1aSopenharmony_ci        sqxtun          v1.8b,  v16.8h
1080cabdff1aSopenharmony_ci        sqxtun2         v1.16b, v17.8h
1081cabdff1aSopenharmony_ci        sqxtun          v2.8b,  v18.8h
1082cabdff1aSopenharmony_ci        sqxtun2         v2.16b, v19.8h
1083cabdff1aSopenharmony_ci        sqxtun          v3.8b,  v20.8h
1084cabdff1aSopenharmony_ci        sqxtun2         v3.16b, v21.8h
1085cabdff1aSopenharmony_ci        st1             {v1.16b,v2.16b},  [x3], x1
1086cabdff1aSopenharmony_ci        sqxtun          v4.8b,  v22.8h
1087cabdff1aSopenharmony_ci        sqxtun2         v4.16b, v23.8h
1088cabdff1aSopenharmony_ci        st1             {v3.16b,v4.16b},  [x3], x1
1089cabdff1aSopenharmony_ci        b.ne            1b
1090cabdff1aSopenharmony_ci
1091cabdff1aSopenharmony_ci        ret
1092cabdff1aSopenharmony_ciendfunc
1093cabdff1aSopenharmony_ci
// Common tail of the idct32_odd* routines. It is expanded as the last
// statement of each of them and contains the ret, so it returns from the
// enclosing function through x30.
// On entry it reads the intermediates left in v4-v7 and v20-v31; per the
// line comments the results end up as t16..t31 in v16..v31.
// The rotations use lanes v0.h[2] and v0.h[3] as coefficients.
// NOTE(review): the trailing register arguments of dmbutterfly
// (v24, v25, v30, v31) and dmbutterfly0 (v2-v7) are presumably scratch;
// those macros are defined outside this chunk.
1094cabdff1aSopenharmony_ci.macro idct32_end
1095cabdff1aSopenharmony_ci        butterfly_8h    v16, v5,  v4,  v5  // v16 = t16a, v5  = t19a
1096cabdff1aSopenharmony_ci        butterfly_8h    v17, v20, v23, v20 // v17 = t17,  v20 = t18
1097cabdff1aSopenharmony_ci        butterfly_8h    v18, v6,  v7,  v6  // v18 = t23a, v6  = t20a
1098cabdff1aSopenharmony_ci        butterfly_8h    v19, v21, v22, v21 // v19 = t22,  v21 = t21
1099cabdff1aSopenharmony_ci        butterfly_8h    v4,  v28, v28, v30 // v4  = t24a, v28 = t27a
1100cabdff1aSopenharmony_ci        butterfly_8h    v23, v26, v25, v26 // v23 = t25,  v26 = t26
1101cabdff1aSopenharmony_ci        butterfly_8h    v7,  v3,  v29, v31 // v7  = t31a, v3  = t28a
1102cabdff1aSopenharmony_ci        butterfly_8h    v22, v27, v24, v27 // v22 = t30,  v27 = t29
1103cabdff1aSopenharmony_ci
1104cabdff1aSopenharmony_ci        dmbutterfly     v27, v20, v0.h[2], v0.h[3], v24, v25, v30, v31        // v27 = t18a, v20 = t29a
1105cabdff1aSopenharmony_ci        dmbutterfly     v3,  v5,  v0.h[2], v0.h[3], v24, v25, v30, v31        // v3  = t19,  v5  = t28
1106cabdff1aSopenharmony_ci        dmbutterfly     v28, v6,  v0.h[2], v0.h[3], v24, v25, v30, v31, neg=1 // v28 = t27,  v6  = t20
1107cabdff1aSopenharmony_ci        dmbutterfly     v26, v21, v0.h[2], v0.h[3], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a
1108cabdff1aSopenharmony_ci
1109cabdff1aSopenharmony_ci        butterfly_8h    v31, v24, v7,  v4  // v31 = t31,  v24 = t24
1110cabdff1aSopenharmony_ci        butterfly_8h    v30, v25, v22, v23 // v30 = t30a, v25 = t25a
1111cabdff1aSopenharmony_ci        butterfly_8h_r  v23, v16, v16, v18 // v23 = t23,  v16 = t16
1112cabdff1aSopenharmony_ci        butterfly_8h_r  v22, v17, v17, v19 // v22 = t22a, v17 = t17a
1113cabdff1aSopenharmony_ci        butterfly_8h    v18, v21, v27, v21 // v18 = t18,  v21 = t21
1114cabdff1aSopenharmony_ci        butterfly_8h_r  v27, v28, v5,  v28 // v27 = t27a, v28 = t28a
1115cabdff1aSopenharmony_ci        butterfly_8h    v29, v26, v20, v26 // v29 = t29,  v26 = t26
1116cabdff1aSopenharmony_ci        butterfly_8h    v19, v20, v3,  v6  // v19 = t19a, v20 = t20
1117cabdff1aSopenharmony_ci
1118cabdff1aSopenharmony_ci        dmbutterfly0    v27, v20, v27, v20, v2, v3, v4, v5, v6, v7 // v27 = t27,  v20 = t20
1119cabdff1aSopenharmony_ci        dmbutterfly0    v26, v21, v26, v21, v2, v3, v4, v5, v6, v7 // v26 = t26a, v21 = t21a
1120cabdff1aSopenharmony_ci        dmbutterfly0    v25, v22, v25, v22, v2, v3, v4, v5, v6, v7 // v25 = t25,  v22 = t22
1121cabdff1aSopenharmony_ci        dmbutterfly0    v24, v23, v24, v23, v2, v3, v4, v5, v6, v7 // v24 = t24a, v23 = t23a
1122cabdff1aSopenharmony_ci        ret
1123cabdff1aSopenharmony_ci.endm
1124cabdff1aSopenharmony_ci
// Odd half of the 32-point IDCT, full-input variant.
// Inputs are v16-v31. Coefficients are taken from all eight lanes of v8
// and v9 and from v0.h[4]..v0.h[7]; idct32_end additionally uses v0.h[2]
// and v0.h[3]. v4-v7 hold intermediates.
// There is no explicit ret here: the idct32_end expansion at the bottom
// supplies it, so this is called with bl and returns through x30.
// NOTE(review): the trailing register arguments of dmbutterfly are
// presumably scratch; the macro is defined outside this chunk.
1125cabdff1aSopenharmony_cifunction idct32_odd
1126cabdff1aSopenharmony_ci        dmbutterfly     v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
1127cabdff1aSopenharmony_ci        dmbutterfly     v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
1128cabdff1aSopenharmony_ci        dmbutterfly     v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
1129cabdff1aSopenharmony_ci        dmbutterfly     v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
1130cabdff1aSopenharmony_ci        dmbutterfly     v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
1131cabdff1aSopenharmony_ci        dmbutterfly     v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
1132cabdff1aSopenharmony_ci        dmbutterfly     v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
1133cabdff1aSopenharmony_ci        dmbutterfly     v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
1134cabdff1aSopenharmony_ci
1135cabdff1aSopenharmony_ci        butterfly_8h    v4,  v24, v16, v24 // v4  = t16, v24 = t17
1136cabdff1aSopenharmony_ci        butterfly_8h    v5,  v20, v28, v20 // v5  = t19, v20 = t18
1137cabdff1aSopenharmony_ci        butterfly_8h    v6,  v26, v18, v26 // v6  = t20, v26 = t21
1138cabdff1aSopenharmony_ci        butterfly_8h    v7,  v22, v30, v22 // v7  = t23, v22 = t22
1139cabdff1aSopenharmony_ci        butterfly_8h    v28, v25, v17, v25 // v28 = t24, v25 = t25
1140cabdff1aSopenharmony_ci        butterfly_8h    v30, v21, v29, v21 // v30 = t27, v21 = t26
1141cabdff1aSopenharmony_ci        butterfly_8h    v29, v23, v31, v23 // v29 = t31, v23 = t30
1142cabdff1aSopenharmony_ci        butterfly_8h    v31, v27, v19, v27 // v31 = t28, v27 = t29
1143cabdff1aSopenharmony_ci
1144cabdff1aSopenharmony_ci        dmbutterfly     v23, v24, v0.h[4], v0.h[5], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
1145cabdff1aSopenharmony_ci        dmbutterfly     v27, v20, v0.h[4], v0.h[5], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
1146cabdff1aSopenharmony_ci        dmbutterfly     v21, v26, v0.h[6], v0.h[7], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
1147cabdff1aSopenharmony_ci        dmbutterfly     v25, v22, v0.h[6], v0.h[7], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
1148cabdff1aSopenharmony_ci        idct32_end
1149cabdff1aSopenharmony_ciendfunc
1150cabdff1aSopenharmony_ci
// Odd half of the 32-point IDCT, "half" variant. Identical to idct32_odd
// from the second stage on; only the first stage differs, using
// dmbutterfly_h1 / dmbutterfly_h2 alternately in place of dmbutterfly.
// NOTE(review): presumably _h1 assumes the second operand is zero and _h2
// that the first is zero, which matches callers that only load v16-v23;
// confirm against the macro definitions, which are outside this chunk.
// There is no explicit ret here: the idct32_end expansion at the bottom
// supplies it, so this is called with bl and returns through x30.
1151cabdff1aSopenharmony_cifunction idct32_odd_half
1152cabdff1aSopenharmony_ci        dmbutterfly_h1  v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
1153cabdff1aSopenharmony_ci        dmbutterfly_h2  v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
1154cabdff1aSopenharmony_ci        dmbutterfly_h1  v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
1155cabdff1aSopenharmony_ci        dmbutterfly_h2  v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
1156cabdff1aSopenharmony_ci        dmbutterfly_h1  v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
1157cabdff1aSopenharmony_ci        dmbutterfly_h2  v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
1158cabdff1aSopenharmony_ci        dmbutterfly_h1  v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
1159cabdff1aSopenharmony_ci        dmbutterfly_h2  v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
1160cabdff1aSopenharmony_ci
1161cabdff1aSopenharmony_ci        butterfly_8h    v4,  v24, v16, v24 // v4  = t16, v24 = t17
1162cabdff1aSopenharmony_ci        butterfly_8h    v5,  v20, v28, v20 // v5  = t19, v20 = t18
1163cabdff1aSopenharmony_ci        butterfly_8h    v6,  v26, v18, v26 // v6  = t20, v26 = t21
1164cabdff1aSopenharmony_ci        butterfly_8h    v7,  v22, v30, v22 // v7  = t23, v22 = t22
1165cabdff1aSopenharmony_ci        butterfly_8h    v28, v25, v17, v25 // v28 = t24, v25 = t25
1166cabdff1aSopenharmony_ci        butterfly_8h    v30, v21, v29, v21 // v30 = t27, v21 = t26
1167cabdff1aSopenharmony_ci        butterfly_8h    v29, v23, v31, v23 // v29 = t31, v23 = t30
1168cabdff1aSopenharmony_ci        butterfly_8h    v31, v27, v19, v27 // v31 = t28, v27 = t29
1169cabdff1aSopenharmony_ci
1170cabdff1aSopenharmony_ci        dmbutterfly     v23, v24, v0.h[4], v0.h[5], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
1171cabdff1aSopenharmony_ci        dmbutterfly     v27, v20, v0.h[4], v0.h[5], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
1172cabdff1aSopenharmony_ci        dmbutterfly     v21, v26, v0.h[6], v0.h[7], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
1173cabdff1aSopenharmony_ci        dmbutterfly     v25, v22, v0.h[6], v0.h[7], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
1174cabdff1aSopenharmony_ci        idct32_end
1175cabdff1aSopenharmony_ciendfunc
1176cabdff1aSopenharmony_ci
// Odd half of the 32-point IDCT, "quarter" variant. Only v16-v19 are read
// as inputs. Each one is multiplied by the same two v8/v9 lanes that
// idct32_odd pairs with that register, so the first-stage rotations appear
// to be reduced to plain multiplies here.
// The sequence leaves its intermediates in v4-v7 and v20-v31, the
// registers idct32_end reads, and idct32_end supplies the ret.
// NOTE(review): dsmull_h, drshrn_h and dmbutterfly_l are defined outside
// this chunk. From their use here, dsmull_h/dmbutterfly_l appear to produce
// 32-bit results in register pairs (they are negated as .4s) that drshrn_h
// round-shifts by 14 back into one register; confirm in the definitions.
1177cabdff1aSopenharmony_cifunction idct32_odd_quarter
1178cabdff1aSopenharmony_ci        dsmull_h        v4,  v5,  v16, v8.h[0]
1179cabdff1aSopenharmony_ci        dsmull_h        v28, v29, v19, v8.h[7]
1180cabdff1aSopenharmony_ci        dsmull_h        v30, v31, v16, v8.h[1]
1181cabdff1aSopenharmony_ci        dsmull_h        v22, v23, v17, v9.h[6]
1182cabdff1aSopenharmony_ci        dsmull_h        v7,  v6,  v17, v9.h[7]
1183cabdff1aSopenharmony_ci        dsmull_h        v26, v27, v19, v8.h[6]
1184cabdff1aSopenharmony_ci        dsmull_h        v20, v21, v18, v9.h[0]
1185cabdff1aSopenharmony_ci        dsmull_h        v24, v25, v18, v9.h[1]
1186cabdff1aSopenharmony_ci
        // Two of the products (v19 * v8.h[7] and v17 * v9.h[7]) are
        // negated while still at 32-bit width, before narrowing.
1187cabdff1aSopenharmony_ci        neg             v28.4s, v28.4s
1188cabdff1aSopenharmony_ci        neg             v29.4s, v29.4s
1189cabdff1aSopenharmony_ci        neg             v7.4s,  v7.4s
1190cabdff1aSopenharmony_ci        neg             v6.4s,  v6.4s
1191cabdff1aSopenharmony_ci
1192cabdff1aSopenharmony_ci        drshrn_h        v4,  v4,  v5,  #14
1193cabdff1aSopenharmony_ci        drshrn_h        v5,  v28, v29, #14
1194cabdff1aSopenharmony_ci        drshrn_h        v29, v30, v31, #14
1195cabdff1aSopenharmony_ci        drshrn_h        v28, v22, v23, #14
1196cabdff1aSopenharmony_ci        drshrn_h        v7,  v7,  v6,  #14
1197cabdff1aSopenharmony_ci        drshrn_h        v31, v26, v27, #14
1198cabdff1aSopenharmony_ci        drshrn_h        v6,  v20, v21, #14
1199cabdff1aSopenharmony_ci        drshrn_h        v30, v24, v25, #14
1200cabdff1aSopenharmony_ci
        // Second stage, using the same v0.h[4..7] lanes as idct32_odd.
        // The statement order matters: v16-v19 are reused as wide
        // temporaries by three of the four dmbutterfly_l calls and must
        // be narrowed before the next call overwrites them.
1201cabdff1aSopenharmony_ci        dmbutterfly_l   v16, v17, v18, v19, v29, v4,  v0.h[4], v0.h[5]
1202cabdff1aSopenharmony_ci        dmbutterfly_l   v27, v26, v20, v21, v31, v5,  v0.h[4], v0.h[5]
1203cabdff1aSopenharmony_ci        drshrn_h        v23, v16, v17, #14
1204cabdff1aSopenharmony_ci        drshrn_h        v24, v18, v19, #14
1205cabdff1aSopenharmony_ci        neg             v20.4s, v20.4s
1206cabdff1aSopenharmony_ci        neg             v21.4s, v21.4s
1207cabdff1aSopenharmony_ci        drshrn_h        v27, v27, v26, #14
1208cabdff1aSopenharmony_ci        drshrn_h        v20, v20, v21, #14
1209cabdff1aSopenharmony_ci        dmbutterfly_l   v16, v17, v18, v19, v30, v6,  v0.h[6], v0.h[7]
1210cabdff1aSopenharmony_ci        drshrn_h        v21, v16, v17, #14
1211cabdff1aSopenharmony_ci        drshrn_h        v26, v18, v19, #14
1212cabdff1aSopenharmony_ci        dmbutterfly_l   v16, v17, v18, v19, v28, v7,  v0.h[6], v0.h[7]
1213cabdff1aSopenharmony_ci        drshrn_h        v25, v16, v17, #14
1214cabdff1aSopenharmony_ci        neg             v18.4s, v18.4s
1215cabdff1aSopenharmony_ci        neg             v19.4s, v19.4s
1216cabdff1aSopenharmony_ci        drshrn_h        v22, v18, v19, #14
1217cabdff1aSopenharmony_ci
1218cabdff1aSopenharmony_ci        idct32_end
1219cabdff1aSopenharmony_ciendfunc
1220cabdff1aSopenharmony_ci
1221cabdff1aSopenharmony_ci.macro idct32_funcs suffix
1222cabdff1aSopenharmony_ci// Do a 32-point IDCT of an 8x32 slice out of a 32x32 matrix.
1223cabdff1aSopenharmony_ci// The 32-point IDCT can be decomposed into two 16-point IDCTs;
1224cabdff1aSopenharmony_ci// a normal IDCT16 with every other input component (the even ones, with
1225cabdff1aSopenharmony_ci// each output written twice), followed by a separate 16-point IDCT
1226cabdff1aSopenharmony_ci// of the odd inputs, added/subtracted onto the outputs of the first idct16.
1227cabdff1aSopenharmony_ci// x0 = dst (temp buffer)
1228cabdff1aSopenharmony_ci// x1 = unused
1229cabdff1aSopenharmony_ci// x2 = src
1230cabdff1aSopenharmony_ci// x9 = double input stride
1231cabdff1aSopenharmony_cifunction idct32_1d_8x32_pass1\suffix\()_neon
1232cabdff1aSopenharmony_ci        mov             x14, x30
1233cabdff1aSopenharmony_ci        movi            v2.8h,  #0
1234cabdff1aSopenharmony_ci
1235cabdff1aSopenharmony_ci        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
1236cabdff1aSopenharmony_ci.ifb \suffix
1237cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1238cabdff1aSopenharmony_ci        load_clear      \i, x2, x9
1239cabdff1aSopenharmony_ci.endr
1240cabdff1aSopenharmony_ci.endif
1241cabdff1aSopenharmony_ci.ifc \suffix,_quarter
1242cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19
1243cabdff1aSopenharmony_ci        load_clear      \i, x2, x9
1244cabdff1aSopenharmony_ci.endr
1245cabdff1aSopenharmony_ci.endif
1246cabdff1aSopenharmony_ci.ifc \suffix,_half
1247cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23
1248cabdff1aSopenharmony_ci        load_clear      \i, x2, x9
1249cabdff1aSopenharmony_ci.endr
1250cabdff1aSopenharmony_ci.endif
1251cabdff1aSopenharmony_ci
1252cabdff1aSopenharmony_ci        bl              idct16\suffix
1253cabdff1aSopenharmony_ci
1254cabdff1aSopenharmony_ci        // Do two 8x8 transposes. Originally, v16-v31 contain the
1255cabdff1aSopenharmony_ci        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the
1256cabdff1aSopenharmony_ci        // two transposed 8x8 blocks.
1257cabdff1aSopenharmony_ci        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
1258cabdff1aSopenharmony_ci        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
1259cabdff1aSopenharmony_ci
1260cabdff1aSopenharmony_ci        // Store the registers a, b horizontally, followed by the
1261cabdff1aSopenharmony_ci        // same registers b, a mirrored.
1262cabdff1aSopenharmony_ci.macro store_rev a, b
1263cabdff1aSopenharmony_ci        // There's no rev128 instruction, but we reverse each 64 bit
1264cabdff1aSopenharmony_ci        // half, and then flip them using an ext with 8 bytes offset.
1265cabdff1aSopenharmony_ci        rev64           v3.8h, \b
1266cabdff1aSopenharmony_ci        st1             {\a},  [x0], #16
1267cabdff1aSopenharmony_ci        rev64           v2.8h, \a
1268cabdff1aSopenharmony_ci        ext             v3.16b, v3.16b, v3.16b, #8
1269cabdff1aSopenharmony_ci        st1             {\b},  [x0], #16
1270cabdff1aSopenharmony_ci        ext             v2.16b, v2.16b, v2.16b, #8
1271cabdff1aSopenharmony_ci        st1             {v3.8h},  [x0], #16
1272cabdff1aSopenharmony_ci        st1             {v2.8h},  [x0], #16
1273cabdff1aSopenharmony_ci.endm
1274cabdff1aSopenharmony_ci        store_rev       v16.8h, v24.8h
1275cabdff1aSopenharmony_ci        store_rev       v17.8h, v25.8h
1276cabdff1aSopenharmony_ci        store_rev       v18.8h, v26.8h
1277cabdff1aSopenharmony_ci        store_rev       v19.8h, v27.8h
1278cabdff1aSopenharmony_ci        store_rev       v20.8h, v28.8h
1279cabdff1aSopenharmony_ci        store_rev       v21.8h, v29.8h
1280cabdff1aSopenharmony_ci        store_rev       v22.8h, v30.8h
1281cabdff1aSopenharmony_ci        store_rev       v23.8h, v31.8h
1282cabdff1aSopenharmony_ci        sub             x0,  x0,  #512
1283cabdff1aSopenharmony_ci.purgem store_rev
1284cabdff1aSopenharmony_ci
1285cabdff1aSopenharmony_ci        // Move x2 back to the start of the input, and move
1286cabdff1aSopenharmony_ci        // to the first odd row
1287cabdff1aSopenharmony_ci.ifb \suffix
1288cabdff1aSopenharmony_ci        sub             x2,  x2,  x9, lsl #4
1289cabdff1aSopenharmony_ci.endif
1290cabdff1aSopenharmony_ci.ifc \suffix,_quarter
1291cabdff1aSopenharmony_ci        sub             x2,  x2,  x9, lsl #2
1292cabdff1aSopenharmony_ci.endif
1293cabdff1aSopenharmony_ci.ifc \suffix,_half
1294cabdff1aSopenharmony_ci        sub             x2,  x2,  x9, lsl #3
1295cabdff1aSopenharmony_ci.endif
1296cabdff1aSopenharmony_ci        add             x2,  x2,  #64
1297cabdff1aSopenharmony_ci
1298cabdff1aSopenharmony_ci        movi            v2.8h,  #0
1299cabdff1aSopenharmony_ci        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
1300cabdff1aSopenharmony_ci.ifb \suffix
1301cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1302cabdff1aSopenharmony_ci        load_clear      \i, x2, x9
1303cabdff1aSopenharmony_ci.endr
1304cabdff1aSopenharmony_ci.endif
1305cabdff1aSopenharmony_ci.ifc \suffix,_quarter
1306cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19
1307cabdff1aSopenharmony_ci        load_clear      \i, x2, x9
1308cabdff1aSopenharmony_ci.endr
1309cabdff1aSopenharmony_ci.endif
1310cabdff1aSopenharmony_ci.ifc \suffix,_half
1311cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23
1312cabdff1aSopenharmony_ci        load_clear      \i, x2, x9
1313cabdff1aSopenharmony_ci.endr
1314cabdff1aSopenharmony_ci.endif
1315cabdff1aSopenharmony_ci
1316cabdff1aSopenharmony_ci        bl              idct32_odd\suffix
1317cabdff1aSopenharmony_ci
1318cabdff1aSopenharmony_ci        transpose_8x8H  v31, v30, v29, v28, v27, v26, v25, v24, v2, v3
1319cabdff1aSopenharmony_ci        transpose_8x8H  v23, v22, v21, v20, v19, v18, v17, v16, v2, v3
1320cabdff1aSopenharmony_ci
1321cabdff1aSopenharmony_ci        // Store the registers a, b horizontally,
1322cabdff1aSopenharmony_ci        // adding into the output first, and the mirrored,
1323cabdff1aSopenharmony_ci        // subtracted from the output.
1324cabdff1aSopenharmony_ci.macro store_rev a, b
1325cabdff1aSopenharmony_ci        ld1             {v4.8h},  [x0]
1326cabdff1aSopenharmony_ci        rev64           v3.8h, \b
1327cabdff1aSopenharmony_ci        add             v4.8h, v4.8h, \a
1328cabdff1aSopenharmony_ci        rev64           v2.8h, \a
1329cabdff1aSopenharmony_ci        st1             {v4.8h},  [x0], #16
1330cabdff1aSopenharmony_ci        ext             v3.16b, v3.16b, v3.16b, #8
1331cabdff1aSopenharmony_ci        ld1             {v5.8h},  [x0]
1332cabdff1aSopenharmony_ci        ext             v2.16b, v2.16b, v2.16b, #8
1333cabdff1aSopenharmony_ci        add             v5.8h, v5.8h, \b
1334cabdff1aSopenharmony_ci        st1             {v5.8h},  [x0], #16
1335cabdff1aSopenharmony_ci        ld1             {v6.8h},  [x0]
1336cabdff1aSopenharmony_ci        sub             v6.8h, v6.8h, v3.8h
1337cabdff1aSopenharmony_ci        st1             {v6.8h},  [x0], #16
1338cabdff1aSopenharmony_ci        ld1             {v7.8h},  [x0]
1339cabdff1aSopenharmony_ci        sub             v7.8h, v7.8h, v2.8h
1340cabdff1aSopenharmony_ci        st1             {v7.8h},  [x0], #16
1341cabdff1aSopenharmony_ci.endm
1342cabdff1aSopenharmony_ci
1343cabdff1aSopenharmony_ci        store_rev       v31.8h, v23.8h
1344cabdff1aSopenharmony_ci        store_rev       v30.8h, v22.8h
1345cabdff1aSopenharmony_ci        store_rev       v29.8h, v21.8h
1346cabdff1aSopenharmony_ci        store_rev       v28.8h, v20.8h
1347cabdff1aSopenharmony_ci        store_rev       v27.8h, v19.8h
1348cabdff1aSopenharmony_ci        store_rev       v26.8h, v18.8h
1349cabdff1aSopenharmony_ci        store_rev       v25.8h, v17.8h
1350cabdff1aSopenharmony_ci        store_rev       v24.8h, v16.8h
1351cabdff1aSopenharmony_ci.purgem store_rev
1352cabdff1aSopenharmony_ci        ret             x14
1353cabdff1aSopenharmony_ciendfunc
1354cabdff1aSopenharmony_ci
1355cabdff1aSopenharmony_ci// This is mostly the same as 8x32_pass1, but without the transpose;
1356cabdff1aSopenharmony_ci// it uses the source as the temp buffer between the two idct passes and
1357cabdff1aSopenharmony_ci// adds the result into the destination.
1358cabdff1aSopenharmony_ci// x0 = dst
1359cabdff1aSopenharmony_ci// x1 = dst stride
1360cabdff1aSopenharmony_ci// x2 = src (temp buffer)
1361cabdff1aSopenharmony_ci// x7 = negative double temp buffer stride
1362cabdff1aSopenharmony_ci// x9 = double temp buffer stride
1363cabdff1aSopenharmony_cifunction idct32_1d_8x32_pass2\suffix\()_neon
1364cabdff1aSopenharmony_ci        mov             x14, x30
1365cabdff1aSopenharmony_ci        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
1366cabdff1aSopenharmony_ci.ifb \suffix
1367cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1368cabdff1aSopenharmony_ci        load            \i, x2, x9
1369cabdff1aSopenharmony_ci.endr
1370cabdff1aSopenharmony_ci        sub             x2,  x2,  x9, lsl #4
1371cabdff1aSopenharmony_ci.endif
1372cabdff1aSopenharmony_ci.ifc \suffix,_quarter
1373cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19
1374cabdff1aSopenharmony_ci        load            \i, x2, x9
1375cabdff1aSopenharmony_ci.endr
1376cabdff1aSopenharmony_ci        sub             x2,  x2,  x9, lsl #2
1377cabdff1aSopenharmony_ci.endif
1378cabdff1aSopenharmony_ci.ifc \suffix,_half
1379cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23
1380cabdff1aSopenharmony_ci        load            \i, x2, x9
1381cabdff1aSopenharmony_ci.endr
1382cabdff1aSopenharmony_ci        sub             x2,  x2,  x9, lsl #3
1383cabdff1aSopenharmony_ci.endif
1384cabdff1aSopenharmony_ci
1385cabdff1aSopenharmony_ci        bl              idct16\suffix
1386cabdff1aSopenharmony_ci
1387cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1388cabdff1aSopenharmony_ci        store           \i, x2, x9
1389cabdff1aSopenharmony_ci.endr
1390cabdff1aSopenharmony_ci
1391cabdff1aSopenharmony_ci        sub             x2,  x2,  x9, lsl #4
1392cabdff1aSopenharmony_ci        add             x2,  x2,  #64
1393cabdff1aSopenharmony_ci
1394cabdff1aSopenharmony_ci        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
1395cabdff1aSopenharmony_ci.ifb \suffix
1396cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1397cabdff1aSopenharmony_ci        load            \i, x2, x9
1398cabdff1aSopenharmony_ci.endr
1399cabdff1aSopenharmony_ci        sub             x2,  x2,  x9, lsl #4
1400cabdff1aSopenharmony_ci.endif
1401cabdff1aSopenharmony_ci.ifc \suffix,_quarter
1402cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19
1403cabdff1aSopenharmony_ci        load            \i, x2, x9
1404cabdff1aSopenharmony_ci.endr
1405cabdff1aSopenharmony_ci        sub             x2,  x2,  x9, lsl #2
1406cabdff1aSopenharmony_ci.endif
1407cabdff1aSopenharmony_ci.ifc \suffix,_half
1408cabdff1aSopenharmony_ci.irp i, 16, 17, 18, 19, 20, 21, 22, 23
1409cabdff1aSopenharmony_ci        load            \i, x2, x9
1410cabdff1aSopenharmony_ci.endr
1411cabdff1aSopenharmony_ci        sub             x2,  x2,  x9, lsl #3
1412cabdff1aSopenharmony_ci.endif
1413cabdff1aSopenharmony_ci        sub             x2,  x2,  #64
1414cabdff1aSopenharmony_ci
1415cabdff1aSopenharmony_ci        bl              idct32_odd\suffix
1416cabdff1aSopenharmony_ci
// load_acc_store a, b, c, d, neg=0
//
// Finish four output rows of 8 pixels each:
//   1. load four rows of 8 halfwords from [x2], post-incrementing x2 by
//      x9 (neg == 0) or by x7 (neg != 0) after each row
//   2. add (neg == 0) or subtract (neg != 0) the vector operands a, b, c, d
//   3. rounding shift right by 6 (srshr)
//   4. widen-add the four 8-byte rows read from [x0] with stride x1 (uaddw)
//   5. saturate to unsigned bytes (sqxtun) and store back to the same rows
//
// On exit x0 has moved forward by four rows of stride x1 and x2 by four
// steps of x9 or x7. Clobbers v2-v7, v10 and v11.
// Loads and arithmetic are interleaved; keep the order when editing.
.macro load_acc_store a, b, c, d, neg=0
.if \neg == 0
        // Additive half: walk x2 with stride x9.
        ld1             {v4.8h},  [x2], x9
        ld1             {v5.8h},  [x2], x9
        add             v4.8h, v4.8h, \a
        ld1             {v6.8h},  [x2], x9
        add             v5.8h, v5.8h, \b
        ld1             {v7.8h},  [x2], x9
        add             v6.8h, v6.8h, \c
        add             v7.8h, v7.8h, \d
.else
        // Subtractive half: walk x2 with stride x7.
        ld1             {v4.8h},  [x2], x7
        ld1             {v5.8h},  [x2], x7
        sub             v4.8h, v4.8h, \a
        ld1             {v6.8h},  [x2], x7
        sub             v5.8h, v5.8h, \b
        ld1             {v7.8h},  [x2], x7
        sub             v6.8h, v6.8h, \c
        sub             v7.8h, v7.8h, \d
.endif
        // Fetch the four destination rows while rounding the residuals.
        ld1             {v10.8b}, [x0], x1
        ld1             {v11.8b}, [x0], x1
        srshr           v4.8h, v4.8h, #6
        ld1             {v2.8b}, [x0], x1
        srshr           v5.8h, v5.8h, #6
        uaddw           v4.8h, v4.8h, v10.8b
        ld1             {v3.8b}, [x0], x1
        srshr           v6.8h, v6.8h, #6
        uaddw           v5.8h, v5.8h, v11.8b
        srshr           v7.8h, v7.8h, #6
        // Rewind x0 by four rows so the stores hit the rows just read.
        sub             x0,  x0,  x1, lsl #2
        uaddw           v6.8h, v6.8h, v2.8b
        sqxtun          v4.8b, v4.8h
        uaddw           v7.8h, v7.8h, v3.8b
        sqxtun          v5.8b, v5.8h
        st1             {v4.8b}, [x0], x1
        sqxtun          v6.8b, v6.8h
        st1             {v5.8b}, [x0], x1
        sqxtun          v7.8b, v7.8h
        st1             {v6.8b}, [x0], x1
        st1             {v7.8b}, [x0], x1
.endm
1459cabdff1aSopenharmony_ci        load_acc_store  v31.8h, v30.8h, v29.8h, v28.8h
1460cabdff1aSopenharmony_ci        load_acc_store  v27.8h, v26.8h, v25.8h, v24.8h
1461cabdff1aSopenharmony_ci        load_acc_store  v23.8h, v22.8h, v21.8h, v20.8h
1462cabdff1aSopenharmony_ci        load_acc_store  v19.8h, v18.8h, v17.8h, v16.8h
1463cabdff1aSopenharmony_ci        sub             x2,  x2,  x9
1464cabdff1aSopenharmony_ci        load_acc_store  v16.8h, v17.8h, v18.8h, v19.8h, 1
1465cabdff1aSopenharmony_ci        load_acc_store  v20.8h, v21.8h, v22.8h, v23.8h, 1
1466cabdff1aSopenharmony_ci        load_acc_store  v24.8h, v25.8h, v26.8h, v27.8h, 1
1467cabdff1aSopenharmony_ci        load_acc_store  v28.8h, v29.8h, v30.8h, v31.8h, 1
1468cabdff1aSopenharmony_ci.purgem load_acc_store
1469cabdff1aSopenharmony_ci        ret             x14
1470cabdff1aSopenharmony_ciendfunc
1471cabdff1aSopenharmony_ci.endm
1472cabdff1aSopenharmony_ci
// Expand idct32_funcs three times: with an empty suffix, with _quarter and
// with _half. The callers below branch to idct32_1d_8x32_pass1 and pass2
// routines with exactly these suffixes, presumably emitted here (the macro
// head is defined earlier in the file).
idct32_funcs
idct32_funcs _quarter
idct32_funcs _half
1476cabdff1aSopenharmony_ci
// Thresholds compared against w3 by ff_vp9_idct_idct_32x32_add_neon, one
// 16-bit entry per 8-column slice of pass 1. That function starts reading
// at byte offset 2, so entry 0 is not read there; entries 1..3 belong to
// the slices at column offsets 8, 16 and 24. When w3 is less than or equal
// to an entry, that slice and all later ones are skipped and their part of
// the temp buffer is zero-filled instead.
const min_eob_idct_idct_32, align=4
        .short  0, 34, 135, 336
endconst
1480cabdff1aSopenharmony_ci
// ff_vp9_idct_idct_32x32_add_neon
//
// Exported entry point. Picks a 32x32 inverse DCT variant from w3 and runs
// it in two passes through a 2048-byte stack buffer.
//
// In:
//   x0 = destination pointer (kept in x4; pass 2 gets x4 + 0/8/16/24)
//   x1 = destination stride  (kept in x5; handed to pass 2 in x1)
//   x2 = input coefficient pointer (kept in x6; rows are 64 bytes apart)
//   w3 = value compared against the min_eob_idct_idct_32 thresholds;
//        presumably the end-of-block index -- confirm against the C prototype
//
// Dispatch:
//   w3 == 1    -> tail-branch to idct32x32_dc_add_neon (nothing saved yet)
//   w3 <= 34   -> idct32x32_quarter_add_neon (shares and unwinds this frame)
//   w3 <= 135  -> idct32x32_half_add_neon    (shares and unwinds this frame)
//   otherwise  -> full transform below
//
// Register roles set up here for the pass routines:
//   x7  = -128, x9 = 128 (twice the input row stride)
//   x10 = walks idct_coeffs; v0, v1 = first 16 halfwords, v8, v9 = next 16
//   x15 = return address, since bl overwrites x30
//
// Stack: d8-d11 are pushed (32 bytes) because v8/v9 are loaded here and
// v10/v11 are used by the pass 2 code; 2048 bytes of temp storage follow.
function ff_vp9_idct_idct_32x32_add_neon, export=1
        cmp             w3,  #1
        b.eq            idct32x32_dc_add_neon

        movrel          x10, idct_coeffs

        mov             x15, x30

        stp             d10, d11, [sp, #-0x10]!
        stp             d8,  d9,  [sp, #-0x10]!

        // Temp buffer between pass 1 and pass 2: 32 rows of 64 bytes.
        sub             sp,  sp,  #2048

        mov             x4,  x0
        mov             x5,  x1
        mov             x6,  x2

        // Double stride of the input, since we only read every other line
        mov             x9,  #128
        neg             x7,  x9

        ld1             {v0.8h,v1.8h}, [x10], #32
        ld1             {v8.8h,v9.8h}, [x10]

        cmp             w3,  #34
        b.le            idct32x32_quarter_add_neon
        cmp             w3,  #135
        b.le            idct32x32_half_add_neon

        // Start at entry 1 of the table; slice 0 is always computed.
        movrel          x12, min_eob_idct_idct_32, 2

        // Pass 1, one 8-column slice per iteration. w3 > 135 holds here, so
        // only the last threshold (336) can still take the branch to 1f.
.irp i, 0, 8, 16, 24
        add             x0,  sp,  #(\i*64)
.if \i > 0
        ldrh            w1,  [x12], #2
        cmp             w3,  w1
        // x1 = number of 256-byte chunks left to clear from x0 onwards.
        // mov leaves the flags from cmp intact for the branch below.
        mov             x1,  #(32 - \i)/4
        b.le            1f
.endif
        add             x2,  x6,  #(\i*2)
        bl              idct32_1d_8x32_pass1_neon
.endr
        b               3f

1:
        // Write zeros to the temp buffer for pass 2
        movi            v16.8h,  #0
        movi            v17.8h,  #0
        movi            v18.8h,  #0
        movi            v19.8h,  #0
2:
        // Each iteration stores 4 * 64 = 256 bytes; st1 does not touch the
        // flags, so b.ne still tests the result of subs.
        subs            x1,  x1,  #1
.rept 4
        st1             {v16.8h,v17.8h,v18.8h,v19.8h},  [x0], #64
.endr
        b.ne            2b
3:
        // Pass 2, one 8-pixel-wide slice of the destination per iteration.
.irp i, 0, 8, 16, 24
        add             x0,  x4,  #(\i)
        mov             x1,  x5
        add             x2,  sp,  #(\i*2)
        bl              idct32_1d_8x32_pass2_neon
.endr

        add             sp,  sp,  #2048

        // Restore in the reverse order of the pushes above.
        ldp             d8,  d9,  [sp], 0x10
        ldp             d10, d11, [sp], 0x10

        ret             x15
endfunc
1552cabdff1aSopenharmony_ci
// idct32_partial size
//
// Emits idct32x32_<size>_add_neon for size = quarter or half. The emitted
// code is a continuation of ff_vp9_idct_idct_32x32_add_neon, reached by a
// plain branch after that function has built its frame. It is not callable
// on its own. On entry it relies on:
//   sp      = base of the 2048-byte temp buffer, with d8-d11 saved above it
//   x4, x5  = destination pointer and stride
//   x6      = input coefficient pointer
//   x15     = return address of the original caller
// The remaining registers prepared by the caller (x7, x9, v0, v1, v8, v9)
// are presumably consumed by the pass routines -- confirm in their bodies.
//
// Pass 1 runs for the slice at column offset 0 only, plus the slice at
// offset 8 when size is half. The rest of the temp buffer is not cleared
// here; the matching pass 2 variants are expected to read only what was
// written -- NOTE(review): verify against the pass 2 variants.
// Pass 2 runs for all four destination slices, then the caller frame is
// torn down and control returns through x15.
.macro idct32_partial size
function idct32x32_\size\()_add_neon
        add             x0,  sp,  #(0*64)
        add             x2,  x6,  #(0*2)
        bl              idct32_1d_8x32_pass1_\size\()_neon
.ifc \size,half
        add             x0,  sp,  #(8*64)
        add             x2,  x6,  #(8*2)
        bl              idct32_1d_8x32_pass1_\size\()_neon
.endif
.irp i, 0, 8, 16, 24
        add             x0,  x4,  #(\i)
        mov             x1,  x5
        add             x2,  sp,  #(\i*2)
        bl              idct32_1d_8x32_pass2_\size\()_neon
.endr

        // Undo the frame built by ff_vp9_idct_idct_32x32_add_neon.
        add             sp,  sp,  #2048

        ldp             d8,  d9,  [sp], 0x10
        ldp             d10, d11, [sp], 0x10

        ret             x15
endfunc
.endm
1578cabdff1aSopenharmony_ci
// Emit idct32x32_quarter_add_neon and idct32x32_half_add_neon, the two
// branch targets used by ff_vp9_idct_idct_32x32_add_neon.
idct32_partial quarter
idct32_partial half
1581