/*
 * ARM NEON IDCT
 *
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2017 Matthieu Bouron <matthieu.bouron@gmail.com>
 *
 * Based on Simple IDCT
 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
26cabdff1aSopenharmony_ci
#include "libavutil/aarch64/asm.S"

/*
 * Fixed-point multipliers for the IDCT, scaled by 1<<14.
 *
 * NOTE(review): for i = 4 the formula in the trailing comments evaluates to
 * 16384, while Z4 is defined as 16383. Presumably this is deliberate (to
 * match a reference implementation); confirm before touching the value.
 */
#define Z1  22725  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define Z2  21407  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define Z3  19266  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define Z4  16383  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define Z5  12873  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define Z6  8867   //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define Z7  4520   //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
/*
 * Bias added to the first input of the column pass before it is multiplied
 * by Z4, so that the product carries a rounding term close to
 * 1<<(COL_SHIFT-1). Evaluates to 32 with the values below.
 */
#define Z4c ((1<<(COL_SHIFT-1))/Z4)
/* Right shifts applied after the first (row) and second (column) pass. */
#define ROW_SHIFT 11
#define COL_SHIFT 20

/*
 * Lane aliases into v0. The exported functions load v0 from
 * idct_coeff_neon on entry, so lane n holds the n-th table entry.
 */
#define z1 v0.H[0]
#define z2 v0.H[1]
#define z3 v0.H[2]
#define z4 v0.H[3]
#define z5 v0.H[4]
#define z6 v0.H[5]
#define z7 v0.H[6]
#define z4c v0.H[7]
48cabdff1aSopenharmony_ci
/*
 * Eight 16-bit multipliers, loaded as one 128-bit vector into v0.
 * The entry order must stay in step with the z1..z4c lane aliases.
 */
const   idct_coeff_neon, align=4
        .short Z1, Z2, Z3, Z4, Z5, Z6, Z7, Z4c
endconst
52cabdff1aSopenharmony_ci
/*
 * idct_start data
 *
 * Common entry sequence of the exported IDCT functions.
 *   data - register holding the address to prefetch
 *
 * Clobbers x3, x10 and v0. The return address is parked in x10 because the
 * function bodies use bl, which overwrites x30; idct_end returns through x10.
 */
.macro idct_start data
        prfm            pldl1keep, [\data]
        mov             x10, x30                        // save the return address
        movrel          x3, idct_coeff_neon             // movrel comes from asm.S; expected to yield the table address
        ld1             {v0.2D}, [x3]                   // v0 = the eight multipliers (z1..z7, z4c)
.endm
59cabdff1aSopenharmony_ci
/*
 * idct_end
 *
 * Returns to the address held in x10, which idct_start copied from x30.
 */
.macro idct_end
        ret             x10
.endm
63cabdff1aSopenharmony_ci
/*
 * smull1 a, b, c
 *
 * Plain smull under a numbered name. idct_col4_top appends its i argument
 * to the mnemonic: i = 1 resolves to this macro, i = 2 to the native smull2.
 */
.macro smull1 a, b, c
        smull           \a, \b, \c
.endm
67cabdff1aSopenharmony_ci
/*
 * smlal1 a, b, c
 *
 * Plain smlal under a numbered name. idct_col4_top appends its i argument
 * to the mnemonic: i = 1 resolves to this macro, i = 2 to the native smlal2.
 */
.macro smlal1 a, b, c
        smlal           \a, \b, \c
.endm
71cabdff1aSopenharmony_ci
/*
 * smlsl1 a, b, c
 *
 * Plain smlsl under a numbered name. idct_col4_top appends its i argument
 * to the mnemonic: i = 1 resolves to this macro, i = 2 to the native smlsl2.
 */
.macro smlsl1 a, b, c
        smlsl           \a, \b, \c
.endm
75cabdff1aSopenharmony_ci
/*
 * idct_col4_top y1, y2, y3, y4, i, l
 *
 * First stage of a four-wide 1-D pass over the inputs y2, y3 and y4.
 *   y1..y4 - vector register names; y1 is accepted but not read here,
 *            the callers fold it into v23 beforehand
 *   i      - 1 or 2, appended to the multiply mnemonics (smull1/smlal1/
 *            smlsl1 wrapper macros, or the native "2" instructions)
 *   l      - lane arrangement suffix appended to the input registers
 *
 * In:  v23.4S = base term prepared by the caller, v0 = multipliers
 * Out: v19 = v23 + y3*z2      v20 = v23 + y3*z6
 *      v21 = v23 - y3*z6      v22 = v23 - y3*z2
 *      v17 = y2*z1 + y4*z3    v18 = y2*z3 - y4*z7
 *      v5  = y2*z5 - y4*z1    v6  = y2*z7 - y4*z5
 * Clobbers v7 and v16 (they hold y3*z2 and y3*z6 on exit).
 */
.macro idct_col4_top y1, y2, y3, y4, i, l
        smull\i         v7.4S,  \y3\l, z2
        smull\i         v16.4S, \y3\l, z6
        smull\i         v17.4S, \y2\l, z1
        add             v19.4S, v23.4S, v7.4S
        smull\i         v18.4S, \y2\l, z3
        add             v20.4S, v23.4S, v16.4S
        smull\i         v5.4S,  \y2\l, z5
        sub             v21.4S, v23.4S, v16.4S
        smull\i         v6.4S,  \y2\l, z7
        sub             v22.4S, v23.4S, v7.4S

        // fold the y4 products into the four y2 accumulators
        smlal\i         v17.4S, \y4\l, z3
        smlsl\i         v18.4S, \y4\l, z7
        smlsl\i         v5.4S,  \y4\l, z1
        smlsl\i         v6.4S,  \y4\l, z5
.endm
93cabdff1aSopenharmony_ci
/*
 * idct_row4_neon y1, y2, y3, y4, pass
 *
 * Loads 64 bytes from [x2] (post-incremented) into y1..y4, runs the first
 * 1-D pass on them and leaves the results in y1..y4, narrowed to 16 bit
 * with a right shift of ROW_SHIFT and transposed.
 *   y1..y4 - vector register names used for input and output
 *   pass   - number used as the local label of the recombination step;
 *            the shortcut branch below jumps forward to it
 *
 * In:  x2 = source pointer, v0 = multipliers
 * Out: y1..y4 as described, x2 advanced by 64
 * Clobbers x3, v5-v7, v16-v23 and the flags.
 */
.macro idct_row4_neon y1, y2, y3, y4, pass
        ld1             {\y1\().2D,\y2\().2D}, [x2], #32
        movi            v23.4S, #1<<2, lsl #8           // v23 = 1024 = 1 << (ROW_SHIFT - 1), the rounding term
        orr             v5.16B, \y1\().16B, \y2\().16B
        ld1             {\y3\().2D,\y4\().2D}, [x2], #32
        orr             v6.16B, \y3\().16B, \y4\().16B
        orr             v5.16B, v5.16B, v6.16B
        mov             x3, v5.D[1]                     // x3 = OR of the upper 64 bits of all four inputs
        smlal           v23.4S, \y1\().4H, z4           // fold the low half of y1 into the base term

        idct_col4_top   \y1, \y2, \y3, \y4, 1, .4H

        // When every upper-half lane is zero the products below would all
        // be zero, so the whole group is skipped.
        cmp             x3, #0
        b.eq            \pass\()f

        smull2          v7.4S, \y1\().8H, z4
        smlal2          v17.4S, \y2\().8H, z5
        smlsl2          v18.4S, \y2\().8H, z1
        smull2          v16.4S, \y3\().8H, z2
        smlal2          v5.4S, \y2\().8H, z7
        add             v19.4S, v19.4S, v7.4S
        sub             v20.4S, v20.4S, v7.4S
        sub             v21.4S, v21.4S, v7.4S
        add             v22.4S, v22.4S, v7.4S
        smlal2          v6.4S, \y2\().8H, z3
        smull2          v7.4S, \y3\().8H, z6
        smlal2          v17.4S, \y4\().8H, z7
        smlsl2          v18.4S, \y4\().8H, z5
        smlal2          v5.4S, \y4\().8H, z3
        smlsl2          v6.4S, \y4\().8H, z1
        add             v19.4S, v19.4S, v7.4S
        sub             v20.4S, v20.4S, v16.4S
        add             v21.4S, v21.4S, v16.4S
        sub             v22.4S, v22.4S, v7.4S

        // Recombination: sums go to the low halves of y1..y4, differences
        // to the high halves in reverse order. y3 and y4 double as scratch
        // for the first two sums; their input values are no longer needed.
\pass:  add             \y3\().4S, v19.4S, v17.4S
        add             \y4\().4S, v20.4S, v18.4S
        shrn            \y1\().4H, \y3\().4S, #ROW_SHIFT
        shrn            \y2\().4H, \y4\().4S, #ROW_SHIFT
        add             v7.4S, v21.4S, v5.4S
        add             v16.4S, v22.4S, v6.4S
        shrn            \y3\().4H, v7.4S, #ROW_SHIFT
        shrn            \y4\().4H, v16.4S, #ROW_SHIFT
        sub             v22.4S, v22.4S, v6.4S
        sub             v19.4S, v19.4S, v17.4S
        sub             v21.4S, v21.4S, v5.4S
        shrn2           \y1\().8H, v22.4S, #ROW_SHIFT
        sub             v20.4S, v20.4S, v18.4S
        shrn2           \y2\().8H, v21.4S, #ROW_SHIFT
        shrn2           \y3\().8H, v20.4S, #ROW_SHIFT
        shrn2           \y4\().8H, v19.4S, #ROW_SHIFT

        // 4x4 transpose of the 16-bit elements, done independently on the
        // low and the high 64-bit halves of y1..y4.
        trn1            v16.8H, \y1\().8H, \y2\().8H
        trn2            v17.8H, \y1\().8H, \y2\().8H
        trn1            v18.8H, \y3\().8H, \y4\().8H
        trn2            v19.8H, \y3\().8H, \y4\().8H
        trn1            \y1\().4S, v16.4S, v18.4S
        trn1            \y2\().4S, v17.4S, v19.4S
        trn2            \y3\().4S, v16.4S, v18.4S
        trn2            \y4\().4S, v17.4S, v19.4S
.endm
155cabdff1aSopenharmony_ci
/*
 * declare_idct_col4_neon i, l
 *
 * Emits the local function idct_col4_neon<i>, the second 1-D pass over one
 * 64-bit half of v24..v31.
 *   i - 1 selects the low halves, 2 the high halves
 *   l - matching lane arrangement suffix (.4H for i = 1, .8H for i = 2)
 *
 * Contract of the generated function (called with bl, returns with ret):
 * In:  v24..v31 = inputs, v0 = multipliers
 * Out: the upper 16 bits of each 32-bit result, packed as
 *        v7  = { v19+v17, v20+v18 }    v16 = { v21+v5, v22+v6 }
 *        v17 = { v22-v6,  v21-v5  }    v18 = { v20-v18, v19-v17 }
 *      (register names on the right refer to the accumulators before packing)
 * Clobbers x4, x5, v5, v6, v19-v23 and the flags.
 * Callers still have to shift right by COL_SHIFT - 16.
 */
.macro declare_idct_col4_neon i, l
function idct_col4_neon\i
        // v23 = (first input + z4c) * z4: the first term together with
        // its rounding bias.
        dup             v23.4H, z4c
.if \i == 1
        add             v23.4H, v23.4H, v24.4H
.else
        mov             v5.D[0], v24.D[1]               // bring the high half of v24 down for the 4H add
        add             v23.4H, v23.4H, v5.4H
.endif
        smull           v23.4S, v23.4H, z4

        idct_col4_top   v24, v25, v26, v27, \i, \l

        // Each of v28..v31 contributes only if its selected half is
        // non-zero. The 64-bit halves are moved to x4/x5 one step ahead
        // of the test that consumes them.
        mov             x4, v28.D[\i - 1]
        mov             x5, v29.D[\i - 1]
        cmp             x4, #0
        b.eq            1f

        smull\i         v7.4S,  v28\l,  z4
        add             v19.4S, v19.4S, v7.4S
        sub             v20.4S, v20.4S, v7.4S
        sub             v21.4S, v21.4S, v7.4S
        add             v22.4S, v22.4S, v7.4S

1:      mov             x4, v30.D[\i - 1]
        cmp             x5, #0
        b.eq            2f

        smlal\i         v17.4S, v29\l, z5
        smlsl\i         v18.4S, v29\l, z1
        smlal\i         v5.4S,  v29\l, z7
        smlal\i         v6.4S,  v29\l, z3

2:      mov             x5, v31.D[\i - 1]
        cmp             x4, #0
        b.eq            3f

        smull\i         v7.4S,  v30\l, z6
        smull\i         v16.4S, v30\l, z2
        add             v19.4S, v19.4S, v7.4S
        sub             v22.4S, v22.4S, v7.4S
        sub             v20.4S, v20.4S, v16.4S
        add             v21.4S, v21.4S, v16.4S

3:      cmp             x5, #0
        b.eq            4f

        smlal\i         v17.4S, v31\l, z7
        smlsl\i         v18.4S, v31\l, z5
        smlal\i         v5.4S,  v31\l, z3
        smlsl\i         v6.4S,  v31\l, z1

        // Narrowing add/sub keep only the upper 16 bits of each result.
        // The order matters: v18 and v17 are overwritten as outputs only
        // after their last use as inputs.
4:      addhn           v7.4H, v19.4S, v17.4S
        addhn2          v7.8H, v20.4S, v18.4S
        subhn           v18.4H, v20.4S, v18.4S
        subhn2          v18.8H, v19.4S, v17.4S

        addhn           v16.4H, v21.4S, v5.4S
        addhn2          v16.8H, v22.4S, v6.4S
        subhn           v17.4H, v22.4S, v6.4S
        subhn2          v17.8H, v21.4S, v5.4S

        ret
endfunc
.endm
221cabdff1aSopenharmony_ci
// idct_col4_neon1 works on the low 64-bit halves, idct_col4_neon2 on the high ones.
declare_idct_col4_neon 1, .4H
declare_idct_col4_neon 2, .8H
224cabdff1aSopenharmony_ci
/*
 * ff_simple_idct_put_neon
 *
 * Transforms the 128 bytes of coefficients at x2 and writes the result as
 * eight rows of eight unsigned bytes.
 * In:  x0 = destination pointer, x1 = byte distance between rows,
 *      x2 = coefficient pointer
 *      (presumably the C arguments dest, line_size and data; confirm
 *      against the prototype)
 * Clobbers x0, x2-x5, x10, v0-v7, v16-v31 and the flags.
 */
function ff_simple_idct_put_neon, export=1
        idct_start      x2

        // first pass over both 64-byte halves of the block
        idct_row4_neon  v24, v25, v26, v27, 1
        idct_row4_neon  v28, v29, v30, v31, 2
        bl              idct_col4_neon1

        // Finish the COL_SHIFT shift (16 bits were already dropped by the
        // narrowing in idct_col4_neon1) and saturate to unsigned 8 bit.
        sqshrun         v1.8B,  v7.8H, #COL_SHIFT-16
        sqshrun2        v1.16B, v16.8H, #COL_SHIFT-16
        sqshrun         v3.8B,  v17.8H, #COL_SHIFT-16
        sqshrun2        v3.16B, v18.8H, #COL_SHIFT-16

        bl              idct_col4_neon2

        sqshrun         v2.8B,  v7.8H, #COL_SHIFT-16
        sqshrun2        v2.16B, v16.8H, #COL_SHIFT-16
        sqshrun         v4.8B,  v17.8H, #COL_SHIFT-16
        sqshrun2        v4.16B, v18.8H, #COL_SHIFT-16

        // Interleave the 4-byte groups of both calls so that every 64-bit
        // lane is one complete 8-byte output row.
        zip1            v16.4S, v1.4S, v2.4S
        zip2            v17.4S, v1.4S, v2.4S

        st1             {v16.D}[0], [x0], x1
        st1             {v16.D}[1], [x0], x1

        zip1            v18.4S, v3.4S, v4.4S
        zip2            v19.4S, v3.4S, v4.4S

        st1             {v17.D}[0], [x0], x1
        st1             {v17.D}[1], [x0], x1
        st1             {v18.D}[0], [x0], x1
        st1             {v18.D}[1], [x0], x1
        st1             {v19.D}[0], [x0], x1
        st1             {v19.D}[1], [x0], x1

        idct_end
endfunc
262cabdff1aSopenharmony_ci
/*
 * ff_simple_idct_add_neon
 *
 * Transforms the 128 bytes of coefficients at x2, adds the result to the
 * eight rows of eight unsigned bytes at x0 and stores the sums back with
 * unsigned saturation.
 * In:  x0 = destination pointer, x1 = byte distance between rows,
 *      x2 = coefficient pointer
 *      (presumably the C arguments dest, line_size and data; confirm
 *      against the prototype)
 * Clobbers x0, x2-x5, x9, x10, v0-v7, v16-v31 and the flags.
 */
function ff_simple_idct_add_neon, export=1
        idct_start      x2

        // first pass over both 64-byte halves of the block
        idct_row4_neon  v24, v25, v26, v27, 1
        idct_row4_neon  v28, v29, v30, v31, 2
        bl              idct_col4_neon1

        // Finish the COL_SHIFT shift; park the results in v1-v4 because
        // the second call reuses v7 and v16-v18.
        sshr            v1.8H, v7.8H, #COL_SHIFT-16
        sshr            v2.8H, v16.8H, #COL_SHIFT-16
        sshr            v3.8H, v17.8H, #COL_SHIFT-16
        sshr            v4.8H, v18.8H, #COL_SHIFT-16

        bl              idct_col4_neon2

        sshr            v7.8H, v7.8H, #COL_SHIFT-16
        sshr            v16.8H, v16.8H, #COL_SHIFT-16
        sshr            v17.8H, v17.8H, #COL_SHIFT-16
        sshr            v18.8H, v18.8H, #COL_SHIFT-16

        // x0 walks the rows for loading, x9 (a copy of the original x0)
        // walks them again for storing. All eight loads are issued before
        // the first store. zip1/zip2 on 2D lanes join the two 4-wide
        // halves into full rows of eight 16-bit values (v23-v30).
        mov             x9,  x0
        ld1             {v19.D}[0], [x0], x1
        zip1            v23.2D, v1.2D, v7.2D
        zip2            v24.2D, v1.2D, v7.2D
        ld1             {v19.D}[1], [x0], x1
        zip1            v25.2D, v2.2D, v16.2D
        zip2            v26.2D, v2.2D, v16.2D
        ld1             {v20.D}[0], [x0], x1
        zip1            v27.2D, v3.2D, v17.2D
        zip2            v28.2D, v3.2D, v17.2D
        ld1             {v20.D}[1], [x0], x1
        zip1            v29.2D, v4.2D, v18.2D
        zip2            v30.2D, v4.2D, v18.2D
        ld1             {v21.D}[0], [x0], x1
        // widen the existing bytes, add, then saturate back to unsigned 8 bit
        uaddw           v23.8H, v23.8H, v19.8B
        uaddw2          v24.8H, v24.8H, v19.16B
        ld1             {v21.D}[1], [x0], x1
        sqxtun          v23.8B, v23.8H
        sqxtun2         v23.16B, v24.8H
        ld1             {v22.D}[0], [x0], x1
        uaddw           v24.8H, v25.8H, v20.8B
        uaddw2          v25.8H, v26.8H, v20.16B
        ld1             {v22.D}[1], [x0], x1
        sqxtun          v24.8B, v24.8H
        sqxtun2         v24.16B, v25.8H
        st1             {v23.D}[0], [x9], x1
        uaddw           v25.8H, v27.8H, v21.8B
        uaddw2          v26.8H, v28.8H, v21.16B
        st1             {v23.D}[1], [x9], x1
        sqxtun          v25.8B, v25.8H
        sqxtun2         v25.16B, v26.8H
        st1             {v24.D}[0], [x9], x1
        uaddw           v26.8H, v29.8H, v22.8B
        uaddw2          v27.8H, v30.8H, v22.16B
        st1             {v24.D}[1], [x9], x1
        sqxtun          v26.8B, v26.8H
        sqxtun2         v26.16B, v27.8H
        st1             {v25.D}[0], [x9], x1
        st1             {v25.D}[1], [x9], x1
        st1             {v26.D}[0], [x9], x1
        st1             {v26.D}[1], [x9], x1

        idct_end
endfunc
326cabdff1aSopenharmony_ci
/*
 * ff_simple_idct_neon
 *
 * Transforms the 128 bytes of 16-bit coefficients at x0 in place.
 * In:  x0 = coefficient pointer (presumably the single C argument data;
 *      confirm against the prototype)
 * Clobbers x2-x5, x10, v0-v7, v16-v31 and the flags.
 */
function ff_simple_idct_neon, export=1
        idct_start      x0

        mov             x2,  x0                         // idct_row4_neon reads through x2
        idct_row4_neon  v24, v25, v26, v27, 1
        idct_row4_neon  v28, v29, v30, v31, 2
        sub             x2, x2, #128                    // rewind x2 to the block start for the stores
        bl              idct_col4_neon1

        // Finish the COL_SHIFT shift; park the results in v1-v4 because
        // the second call reuses v7 and v16-v18.
        sshr            v1.8H, v7.8H, #COL_SHIFT-16
        sshr            v2.8H, v16.8H, #COL_SHIFT-16
        sshr            v3.8H, v17.8H, #COL_SHIFT-16
        sshr            v4.8H, v18.8H, #COL_SHIFT-16

        bl              idct_col4_neon2

        sshr            v7.8H, v7.8H, #COL_SHIFT-16
        sshr            v16.8H, v16.8H, #COL_SHIFT-16
        sshr            v17.8H, v17.8H, #COL_SHIFT-16
        sshr            v18.8H, v18.8H, #COL_SHIFT-16

        // Join the 4-wide halves of both calls into rows of eight 16-bit
        // values and write them back two rows (32 bytes) at a time.
        zip1            v23.2D, v1.2D, v7.2D
        zip2            v24.2D, v1.2D, v7.2D
        st1             {v23.2D,v24.2D}, [x2], #32
        zip1            v25.2D, v2.2D, v16.2D
        zip2            v26.2D, v2.2D, v16.2D
        st1             {v25.2D,v26.2D}, [x2], #32
        zip1            v27.2D, v3.2D, v17.2D
        zip2            v28.2D, v3.2D, v17.2D
        st1             {v27.2D,v28.2D}, [x2], #32
        zip1            v29.2D, v4.2D, v18.2D
        zip2            v30.2D, v4.2D, v18.2D
        st1             {v29.2D,v30.2D}, [x2], #32

        idct_end
endfunc
363