/*
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
20cabdff1aSopenharmony_ci
21cabdff1aSopenharmony_ci#include "libavutil/aarch64/asm.S"
22cabdff1aSopenharmony_ci
// ldcol.8 rd, rs, rt, n=8, hi=0
// Load single bytes into successive byte lanes of vector \rd.  Every load
// reads one byte at [\rs] and then advances \rs by \rt, so with \rt holding
// a row stride the lanes receive one column of samples.
//   lanes 0-3  : loaded when \n >= 8 or \hi == 0
//   lanes 4-7  : loaded when \n >= 8 or \hi == 1
//   lanes 8-15 : loaded when \n == 16
// Lanes that are not loaded keep whatever \rd held before.
.macro ldcol.8  rd,  rs,  rt,  n=8,  hi=0
.if \n >= 8 || \hi == 0
.irp lane, 0, 1, 2, 3
        ld1             {\rd\().b}[\lane],  [\rs], \rt
.endr
.endif
.if \n >= 8 || \hi == 1
.irp lane, 4, 5, 6, 7
        ld1             {\rd\().b}[\lane],  [\rs], \rt
.endr
.endif
.if \n == 16
.irp lane, 8, 9, 10, 11, 12, 13, 14, 15
        ld1             {\rd\().b}[\lane], [\rs], \rt
.endr
.endif
.endm
47cabdff1aSopenharmony_ci
// ff_pred16x16_128_dc_neon
// Fill a 16x16 block of bytes with the constant 128.
// In:  x0 = address of the first output row
//      x1 = byte offset from one row to the next
// Only v0 is prepared here; the rows are written by the shared store loop
// at .L_pred16x16_dc_end (inside ff_pred16x16_dc_neon), which also returns.
function ff_pred16x16_128_dc_neon, export=1
        movi            v0.16b,  #0x80          // 128 in every byte lane
        b               .L_pred16x16_dc_end
endfunc
52cabdff1aSopenharmony_ci
// ff_pred16x16_top_dc_neon
// Fill a 16x16 byte block with the rounded mean of the 16 bytes in the row
// directly above it: (sum + 8) >> 4.
// In:  x0 = address of the first output row
//      x1 = byte offset from one row to the next
// The value is broadcast into v0 and stored by the shared loop at
// .L_pred16x16_dc_end, which also returns.  Scratch: x2, v1.
function ff_pred16x16_top_dc_neon, export=1
        sub             x2,  x0,  x1            // x2 = row above the block
        ld1             {v1.16b},  [x2]
        uaddlv          h1,  v1.16b             // h1 = sum of the 16 bytes
        rshrn           v1.8b,  v1.8h,  #4      // rounding divide by 16
        dup             v0.16b, v1.b[0]
        b               .L_pred16x16_dc_end
endfunc
61cabdff1aSopenharmony_ci
// ff_pred16x16_left_dc_neon
// Fill a 16x16 byte block with the rounded mean of the 16 bytes in the
// column immediately to its left: (sum + 8) >> 4.
// In:  x0 = address of the first output row
//      x1 = byte offset from one row to the next
// The value is broadcast into v0 and stored by the shared loop at
// .L_pred16x16_dc_end, which also returns.  Scratch: x2, v1.
function ff_pred16x16_left_dc_neon, export=1
        sub             x2,  x0,  #1            // x2 = byte left of row 0
        ldcol.8         v1,  x2,  x1, 16        // gather that column, 16 rows
        uaddlv          h1,  v1.16b             // h1 = sum of the 16 bytes
        rshrn           v1.8b,  v1.8h,  #4      // rounding divide by 16
        dup             v0.16b, v1.b[0]
        b               .L_pred16x16_dc_end
endfunc
70cabdff1aSopenharmony_ci
// ff_pred16x16_dc_neon
// Fill a 16x16 byte block with the rounded mean of the 16 bytes above it
// and the 16 bytes to its left: (sum_top + sum_left + 16) >> 5.
// In:  x0 = address of the first output row
//      x1 = byte offset from one row to the next
// Scratch: x2, x3, v0, v1, flags.  x0 is advanced past the last row.
function ff_pred16x16_dc_neon, export=1
        sub             x2,  x0,  x1            // x2 = row above the block
        ld1             {v0.16b}, [x2]
        uaddlv          h0,  v0.16b             // h0 = sum of the top row
        sub             x3,  x0,  #1            // x3 = byte left of row 0
        ldcol.8         v1,  x3,  x1, 16
        uaddlv          h1,  v1.16b             // h1 = sum of the left column
        add             v0.4h,  v0.4h,  v1.4h
        rshrn           v0.8b,  v0.8h,  #5      // rounding divide by 32
        dup             v0.16b, v0.b[0]
.L_pred16x16_dc_end:
        // Shared tail: write v0 to 16 consecutive rows and return.  The
        // other 16x16 DC entry points branch here with v0 already filled.
        mov             w3,  #8                 // 8 iterations x 2 rows
1:      subs            w3,  w3,  #1
        st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x0], x1
        b.ne            1b
        ret
endfunc
89cabdff1aSopenharmony_ci
// ff_pred16x16_hor_neon
// For each of 16 rows, replicate the byte just left of the row across the
// row's 16 bytes.
// In:  x0 = address of the first output row
//      x1 = byte offset from one row to the next
// Scratch: x2, w3, v0, flags.
function ff_pred16x16_hor_neon, export=1
        mov             w3,  #16                // rows remaining
        sub             x2,  x0,  #1            // x2 = byte left of row 0
2:      ld1r            {v0.16b}, [x2], x1      // broadcast the left byte
        st1             {v0.16b}, [x0], x1
        subs            w3,  w3,  #1
        b.ne            2b
        ret
endfunc
99cabdff1aSopenharmony_ci
// ff_pred16x16_vert_neon
// Copy the 16 bytes of the row above the block into each of its 16 rows.
// In:  x0 = address of the first output row
//      x1 = byte offset from one row to the next (doubled internally)
// Two pointers are used, one for even and one for odd rows, each stepping
// two rows per store.  Scratch: x1, x2, w3, v0, flags.
function ff_pred16x16_vert_neon, export=1
        mov             w3,  #8                 // 8 iterations x 2 rows
        sub             x2,  x0,  x1            // x2 = row above the block
        lsl             x1,  x1,  #1            // step = two rows
        ld1             {v0.16b}, [x2], x1      // load top row; x2 -> row 1
2:      st1             {v0.16b}, [x0], x1      // even rows
        st1             {v0.16b}, [x2], x1      // odd rows
        subs            w3,  w3,  #1
        b.ne            2b
        ret
endfunc
111cabdff1aSopenharmony_ci
// ff_pred16x16_plane_neon
// Fill a 16x16 byte block with a linear gradient derived from the row above
// and the column to the left of the block.
// In:  x0 = address of the first output row
//      x1 = byte offset from one row to the next
// With top[i] / left[i] naming the neighbouring row / column (index -1 is
// the shared corner byte), the code evaluates
//      H = sum(k * (top[7+k]  - top[7-k])),   k = 1..8
//      V = sum(k * (left[7+k] - left[7-k])),  k = 1..8
//      b = (5*H + 32) >> 6,   c = (5*V + 32) >> 6
//      a = 16 * (top[15] + left[15] + 1) - 7 * (b + c)
//      out[y][x] = saturate_u8((a + b*x + c*y) >> 5)
// Scratch: x2-x4, v0-v7, flags.
function ff_pred16x16_plane_neon, export=1
        // ---- gather the neighbouring samples ----
        sub             x3,  x0,  x1            // x3 = row above the block
        add             x2,  x3,  #8            // x2 = &top[8]
        sub             x3,  x3,  #1            // x3 = &top[-1]
        ld1             {v0.8b},  [x3]          // top[-1..6]
        ld1             {v2.8b},  [x2], x1      // top[8..15]
        ldcol.8         v1,  x3,  x1            // left[-1..6]
        add             x3,  x3,  x1            // step over row 7
        ldcol.8         v3,  x3,  x1            // left[8..15]

        // ---- weighted differences H and V ----
        rev64           v0.8b,  v0.8b           // top[6], top[5], ..., top[-1]
        rev64           v1.8b,  v1.8b           // left[6], ..., left[-1]
        uaddl           v7.8h,  v2.8b,  v3.8b   // lane 7 = top[15] + left[15]
        ext             v7.16b, v7.16b, v7.16b, #14     // rotate it to lane 0
        usubl           v2.8h,  v2.8b,  v0.8b
        usubl           v3.8h,  v3.8b,  v1.8b
        movrel          x4,  p16weight
        ld1             {v0.8h},     [x4]       // weights 1..8
        add             v7.4h,  v7.4h,  v0.4h   // lane 0: + 1 (first weight)
        mul             v2.8h,  v2.8h,  v0.8h
        mul             v3.8h,  v3.8h,  v0.8h
        addp            v2.8h,  v2.8h,  v3.8h
        addp            v2.8h,  v2.8h,  v2.8h
        addp            v2.4h,  v2.4h,  v2.4h   // lane 0 = H, lane 1 = V

        // ---- slopes b, c and start value a ----
        sshll           v3.4s,  v2.4h,  #2
        saddw           v2.4s,  v3.4s,  v2.4h   // 5*H, 5*V
        rshrn           v4.4h,  v2.4s,  #6      // lane 0 = b, lane 1 = c
        trn2            v5.4h,  v4.4h,  v4.4h   // lane 0 = c
        add             v2.4h,  v4.4h,  v5.4h   // lane 0 = b + c
        shl             v3.4h,  v2.4h,  #3
        sub             v3.4h,  v3.4h,  v2.4h   // 7 * (b + c)
        shl             v2.4h,  v7.4h,  #4
        sub             v2.4h,  v2.4h,  v3.4h   // lane 0 = a
        shl             v3.4h,  v4.4h,  #4      // 16 * b
        sub             v6.4h,  v5.4h,  v3.4h   // lane 0 = c - 16*b

        // ---- per-row accumulators ----
        ext             v0.16b, v0.16b, v0.16b, #14
        mov             v0.h[0],  wzr           // v0 = 0, 1, ..., 7
        mul             v0.8h,  v0.8h,  v4.h[0] // b * x for x = 0..7
        dup             v1.8h,  v2.h[0]
        dup             v2.8h,  v4.h[0]
        dup             v3.8h,  v6.h[0]
        shl             v2.8h,  v2.8h,  #3      // 8*b: step to columns 8..15
        add             v1.8h,  v1.8h,  v0.8h   // a + b*x
        add             v3.8h,  v3.8h,  v2.8h   // c - 8*b: step to next row

        mov             w3,  #16                // rows remaining
2:      subs            w3,  w3,  #1
        sqshrun         v0.8b,  v1.8h,  #5      // columns 0..7
        add             v1.8h,  v1.8h,  v2.8h
        sqshrun2        v0.16b, v1.8h,  #5      // columns 8..15
        add             v1.8h,  v1.8h,  v3.8h
        st1             {v0.16b}, [x0], x1
        b.ne            2b
        ret
endfunc
166cabdff1aSopenharmony_ci
// Eight 16-bit weights 1..8.  Read by the 16x16 plane predictors; the 8x8
// plane predictor in this file also loads it as a 1..8 ramp.
// NOTE(review): align=4 is passed to the project's const macro; presumably
// a power-of-two exponent (16 bytes) - confirm against asm.S.
const   p16weight, align=4
        .short          1, 2, 3, 4, 5, 6, 7, 8
endconst
// Two copies of the 16-bit weights 1..4.  ff_pred8x8_plane_neon multiplies
// its four horizontal and four vertical differences by these in one go.
const   p8weight, align=4
        .short          1, 2, 3, 4, 1, 2, 3, 4
endconst
173cabdff1aSopenharmony_ci
// ff_pred8x8_hor_neon
// For each of 8 rows, replicate the byte just left of the row across the
// row's 8 bytes.
// In:  x0 = address of the first output row
//      x1 = byte offset from one row to the next
// Scratch: x2, w3, v0, flags.
function ff_pred8x8_hor_neon, export=1
        mov             w3,  #8                 // rows remaining
        sub             x2,  x0,  #1            // x2 = byte left of row 0
2:      ld1r            {v0.8b},  [x2], x1      // broadcast the left byte
        st1             {v0.8b},  [x0], x1
        subs            w3,  w3,  #1
        b.ne            2b
        ret
endfunc
183cabdff1aSopenharmony_ci
// ff_pred8x8_vert_neon
// Copy the 8 bytes of the row above the block into each of its 8 rows.
// In:  x0 = address of the first output row
//      x1 = byte offset from one row to the next (doubled internally)
// Two pointers are used, one for even and one for odd rows, each stepping
// two rows per store.  Scratch: x1, x2, w3, v0, flags.
function ff_pred8x8_vert_neon, export=1
        mov             w3,  #4                 // 4 iterations x 2 rows
        sub             x2,  x0,  x1            // x2 = row above the block
        add             x1,  x1,  x1            // step = two rows
        ld1             {v0.8b},  [x2], x1      // load top row; x2 -> row 1
2:      st1             {v0.8b},  [x0], x1      // even rows
        st1             {v0.8b},  [x2], x1      // odd rows
        subs            w3,  w3,  #1
        b.ne            2b
        ret
endfunc
195cabdff1aSopenharmony_ci
// ff_pred8x8_plane_neon
// Fill an 8x8 byte block with a linear gradient derived from the row above
// and the column to the left of the block.
// In:  x0 = address of the first output row
//      x1 = byte offset from one row to the next
// With top[i] / left[i] naming the neighbouring row / column (index -1 is
// the shared corner byte), the code evaluates
//      H = sum(k * (top[3+k]  - top[3-k])),   k = 1..4
//      V = sum(k * (left[3+k] - left[3-k])),  k = 1..4
//      b = (17*H + 16) >> 5,   c = (17*V + 16) >> 5
//      a = 16 * (top[7] + left[7] + 1) - 3 * (b + c)
//      out[y][x] = saturate_u8((a + b*x + c*y) >> 5)
// Scratch: x2-x5, v0-v7, flags.
function ff_pred8x8_plane_neon, export=1
        // ---- gather the neighbouring samples ----
        sub             x3,  x0,  x1            // x3 = row above the block
        add             x2,  x3,  #4            // x2 = &top[4]
        sub             x3,  x3,  #1            // x3 = &top[-1]
        ld1             {v0.s}[0],  [x3]        // v0 lanes 0-3 = top[-1..2]
        ld1             {v2.s}[0],  [x2], x1    // v2 lanes 0-3 = top[4..7]
        ldcol.8         v0,  x3,  x1,  4,  hi=1 // v0 lanes 4-7 = left[-1..2]
        add             x3,  x3,  x1            // step over row 3
        ldcol.8         v3,  x3,  x1,  4        // v3 lanes 0-3 = left[4..7]

        // ---- weighted differences H and V ----
        rev32           v0.8b,  v0.8b           // mirror each group of four
        uaddl           v7.8h,  v2.8b,  v3.8b   // lane 3 = top[7] + left[7]
        rev64           v7.4h,  v7.4h           // bring that sum to lane 0
        trn1            v2.2s,  v2.2s,  v3.2s   // top[4..7], left[4..7]
        usubl           v2.8h,  v2.8b,  v0.8b
        movrel          x4,  p8weight
        ld1             {v6.8h},  [x4]          // weights 1..4, 1..4
        mul             v2.8h,  v2.8h,  v6.8h
        movrel          x5,  p16weight
        ld1             {v0.8h},  [x5]          // 1..8
        add             v7.4h,  v7.4h,  v0.4h   // lane 0: + 1 (first weight)
        saddlp          v2.4s,  v2.8h
        addp            v2.4s,  v2.4s,  v2.4s   // lane 0 = H, lane 1 = V

        // ---- slopes b, c and start value a ----
        shl             v3.4s,  v2.4s,  #4
        add             v2.4s,  v3.4s,  v2.4s   // 17*H, 17*V
        rshrn           v5.4h,  v2.4s,  #5      // lane 0 = b, lane 1 = c
        addp            v2.4h,  v5.4h,  v5.4h   // lane 0 = b + c
        shl             v3.4h,  v2.4h,  #1
        add             v3.4h,  v3.4h,  v2.4h   // 3 * (b + c)
        shl             v2.4h,  v7.4h,  #4
        sub             v2.4h,  v2.4h,  v3.4h   // lane 0 = a

        // ---- per-row accumulator ----
        ext             v0.16b, v0.16b, v0.16b, #14
        mov             v0.h[0],  wzr           // v0 = 0, 1, ..., 7
        mul             v0.8h,  v0.8h,  v5.h[0] // b * x for x = 0..7
        dup             v1.8h,  v2.h[0]
        dup             v2.8h,  v5.h[1]         // c: step to next row
        add             v1.8h,  v1.8h,  v0.8h   // a + b*x

        mov             w3,  #8                 // rows remaining
2:      subs            w3,  w3,  #1
        sqshrun         v0.8b,  v1.8h,  #5
        add             v1.8h,  v1.8h,  v2.8h
        st1             {v0.8b},  [x0], x1
        b.ne            2b
        ret
endfunc
241cabdff1aSopenharmony_ci
// ff_pred8x8_128_dc_neon
// Fill an 8x8 block of bytes with the constant 128.
// In:  x0 = address of the first output row
//      x1 = byte offset from one row to the next
// v0 (rows 0-3) and v1 (rows 4-7) are prepared here; the shared loop at
// .L_pred8x8_dc_end (inside ff_pred8x8_dc_neon) stores them and returns.
function ff_pred8x8_128_dc_neon, export=1
        movi            v1.8b,  #0x80
        movi            v0.8b,  #0x80
        b               .L_pred8x8_dc_end
endfunc
247cabdff1aSopenharmony_ci
// ff_pred8x8_top_dc_neon
// Fill an 8x8 byte block from the 8 bytes above it: columns 0-3 of every
// row get (top[0]+..+top[3] + 2) >> 2, columns 4-7 get the same mean of
// top[4..7].
// In:  x0 = address of the first output row
//      x1 = byte offset from one row to the next
// Result rows are left in v0 / v1 for the shared store loop at
// .L_pred8x8_dc_end, which also returns.  Scratch: x2, v2.
function ff_pred8x8_top_dc_neon, export=1
        sub             x2,  x0,  x1            // x2 = row above the block
        ld1             {v0.8b},  [x2]
        uaddlp          v0.4h,  v0.8b           // sums of adjacent pairs
        addp            v0.4h,  v0.4h,  v0.4h   // s0, s1, s0, s1
        zip1            v0.8h,  v0.8h,  v0.8h   // s0, s0, s1, s1, ...
        rshrn           v2.8b,  v0.8h,  #2      // rounding divide by 4
        zip1            v1.8b,  v2.8b,  v2.8b   // rows 4-7: 4 x m0, 4 x m1
        zip1            v0.8b,  v2.8b,  v2.8b   // rows 0-3: same pattern
        b               .L_pred8x8_dc_end
endfunc
259cabdff1aSopenharmony_ci
// ff_pred8x8_left_dc_neon
// Fill an 8x8 byte block from the 8 bytes to its left: rows 0-3 get the
// rounded mean of left[0..3], rows 4-7 the rounded mean of left[4..7]
// (each (sum + 2) >> 2).
// In:  x0 = address of the first output row
//      x1 = byte offset from one row to the next
// Result rows are left in v0 / v1 for the shared store loop at
// .L_pred8x8_dc_end, which also returns.  Scratch: x2, v2.
function ff_pred8x8_left_dc_neon, export=1
        sub             x2,  x0,  #1            // x2 = byte left of row 0
        ldcol.8         v0,  x2,  x1            // left[0..7]
        uaddlp          v0.4h,  v0.8b           // sums of adjacent pairs
        addp            v0.4h,  v0.4h,  v0.4h   // lane 0 = rows 0-3, lane 1 = rows 4-7
        rshrn           v2.8b,  v0.8h,  #2      // rounding divide by 4
        dup             v0.8b,  v2.b[0]         // upper half of the block
        dup             v1.8b,  v2.b[1]         // lower half of the block
        b               .L_pred8x8_dc_end
endfunc
270cabdff1aSopenharmony_ci
// ff_pred8x8_dc_neon
// Fill an 8x8 byte block, one value per 4x4 quadrant, from the 8 bytes
// above (top) and the 8 bytes to the left (left).  With T0/T1 the sums of
// top[0..3]/top[4..7] and L0/L1 the sums of left[0..3]/left[4..7]:
//      top-left     = (T0 + L0 + 4) >> 3
//      top-right    = (T1 + 2) >> 2
//      bottom-left  = (L1 + 2) >> 2
//      bottom-right = (T1 + L1 + 4) >> 3
// In:  x0 = address of the first output row
//      x1 = byte offset from one row to the next
// Scratch: x2, x3, v0-v7, flags.
function ff_pred8x8_dc_neon, export=1
        sub             x2,  x0,  x1            // x2 = row above the block
        ld1             {v0.8b}, [x2]
        uaddlp          v0.4h,  v0.8b           // top pair sums
        sub             x3,  x0,  #1            // x3 = byte left of row 0
        ldcol.8         v1,  x3,  x1
        uaddlp          v1.4h,  v1.8b           // left pair sums
        trn1            v2.2s,  v0.2s,  v1.2s   // first halves of top, left
        trn2            v3.2s,  v0.2s,  v1.2s   // second halves of top, left
        addp            v4.4h,  v2.4h,  v3.4h   // T0, L0, T1, L1
        addp            v5.4h,  v4.4h,  v4.4h   // T0+L0, T1+L1
        rshrn           v7.8b,  v4.8h,  #2      // one-sided means
        rshrn           v6.8b,  v5.8h,  #3      // two-sided means
        dup             v0.8b,  v6.b[0]         // top-left
        dup             v1.8b,  v7.b[3]         // bottom-left
        dup             v2.8b,  v7.b[2]         // top-right
        dup             v3.8b,  v6.b[1]         // bottom-right
        zip1            v0.2s,  v0.2s,  v2.2s   // rows 0-3
        zip1            v1.2s,  v1.2s,  v3.2s   // rows 4-7
.L_pred8x8_dc_end:
        // Shared tail: store v0 to rows 0-3 and v1 to rows 4-7, then
        // return.  The other 8x8 DC entry points branch here.
        add             x2,  x0,  x1,  lsl #2   // x2 = row 4
        mov             w3,  #4
1:      st1             {v0.8b},  [x0], x1
        st1             {v1.8b},  [x2], x1
        subs            w3,  w3,  #1
        b.ne            1b
        ret
endfunc
299cabdff1aSopenharmony_ci
// ff_pred8x8_l0t_dc_neon
// 8x8 DC fill that reads the 8 bytes above the block and only rows 0-3 of
// the column to its left.  With T0/T1 the sums of top[0..3]/top[4..7] and
// L0 the sum of left[0..3]:
//      top-left     = (T0 + L0 + 4) >> 3
//      top-right    = (T1 + 2) >> 2
//      bottom-left  = (T0 + 2) >> 2
//      bottom-right = (T1 + 2) >> 2
// In:  x0 = address of the first output row
//      x1 = byte offset from one row to the next
// Result rows are left in v0 / v1 for the shared store loop at
// .L_pred8x8_dc_end, which also returns.  Scratch: x2, x3, v2-v6.
function ff_pred8x8_l0t_dc_neon, export=1
        sub             x2,  x0,  x1            // x2 = row above the block
        ld1             {v0.8b},  [x2]
        sub             x3,  x0,  #1            // x3 = byte left of row 0
        ldcol.8         v1,  x3,  x1,  4        // left[0..3] in lanes 0-3
        zip1            v0.4s,  v0.4s,  v1.4s   // top[0..3], left[0..3], top[4..7], -
        uaddlp          v0.8h,  v0.16b
        addp            v0.8h,  v0.8h,  v0.8h   // T0, L0, T1, -
        addp            v1.4h,  v0.4h,  v0.4h   // lane 0 = T0 + L0
        rshrn           v2.8b,  v0.8h,  #2      // one-sided means
        rshrn           v3.8b,  v1.8h,  #3      // two-sided mean
        dup             v4.8b,  v3.b[0]         // top-left
        dup             v5.8b,  v2.b[0]         // bottom-left
        dup             v6.8b,  v2.b[2]         // right-hand quadrants
        zip1            v0.2s,  v4.2s,  v6.2s   // rows 0-3
        zip1            v1.2s,  v5.2s,  v6.2s   // rows 4-7
        b               .L_pred8x8_dc_end
endfunc
318cabdff1aSopenharmony_ci
// ff_pred8x8_l00_dc_neon
// 8x8 DC fill that reads only rows 0-3 of the column left of the block:
// rows 0-3 get (left[0]+..+left[3] + 2) >> 2, rows 4-7 get 128.
// In:  x0 = address of the first output row
//      x1 = byte offset from one row to the next
// Result rows are left in v0 / v1 for the shared store loop at
// .L_pred8x8_dc_end, which also returns.  Scratch: x2.
function ff_pred8x8_l00_dc_neon, export=1
        movi            v1.8b,  #128            // rows 4-7
        sub             x2,  x0,  #1            // x2 = byte left of row 0
        ldcol.8         v0,  x2,  x1,  4        // left[0..3] in lanes 0-3
        uaddlp          v0.4h,  v0.8b
        addp            v0.4h,  v0.4h,  v0.4h   // lane 0 = sum of the four
        rshrn           v0.8b,  v0.8h,  #2      // rounding divide by 4
        dup             v0.8b,  v0.b[0]         // rows 0-3
        b               .L_pred8x8_dc_end
endfunc
329cabdff1aSopenharmony_ci
// ff_pred8x8_0lt_dc_neon
// 8x8 DC fill that reads the 8 bytes above the block and only rows 4-7 of
// the column to its left.  With T0/T1 the sums of top[0..3]/top[4..7] and
// L1 the sum of left[4..7]:
//      top-left     = (T0 + 2) >> 2
//      top-right    = (T1 + 2) >> 2
//      bottom-left  = (L1 + 2) >> 2
//      bottom-right = (T1 + L1 + 4) >> 3
// In:  x0 = address of the first output row
//      x1 = byte offset from one row to the next
// Result rows are left in v0 / v1 for the shared store loop at
// .L_pred8x8_dc_end, which also returns.  Scratch: x2, x3, v2-v7.
function ff_pred8x8_0lt_dc_neon, export=1
        sub             x2,  x0,  x1            // x2 = row above the block
        ld1             {v0.8b},  [x2]
        add             x3,  x0,  x1,  lsl #2   // row 4 ...
        sub             x3,  x3,  #1            // ... one byte to the left
        ldcol.8         v1,  x3,  x1,  4,  hi=1 // left[4..7] in lanes 4-7
        zip1            v0.4s,  v0.4s,  v1.4s   // top[0..3], -, top[4..7], left[4..7]
        uaddlp          v0.8h,  v0.16b
        addp            v0.8h,  v0.8h,  v0.8h   // T0, -, T1, L1
        addp            v1.4h,  v0.4h,  v0.4h   // lane 1 = T1 + L1
        rshrn           v2.8b,  v0.8h,  #2      // one-sided means
        rshrn           v3.8b,  v1.8h,  #3      // two-sided mean
        dup             v4.8b,  v2.b[0]         // top-left
        dup             v6.8b,  v2.b[2]         // top-right
        dup             v5.8b,  v2.b[3]         // bottom-left
        dup             v7.8b,  v3.b[1]         // bottom-right
        zip1            v0.2s,  v4.2s,  v6.2s   // rows 0-3
        zip1            v1.2s,  v5.2s,  v7.2s   // rows 4-7
        b               .L_pred8x8_dc_end
endfunc
350cabdff1aSopenharmony_ci
// ff_pred8x8_0l0_dc_neon
// 8x8 DC fill that reads only rows 4-7 of the column left of the block:
// rows 0-3 get 128, rows 4-7 get (left[4]+..+left[7] + 2) >> 2.
// In:  x0 = address of the first output row
//      x1 = byte offset from one row to the next
// Result rows are left in v0 / v1 for the shared store loop at
// .L_pred8x8_dc_end, which also returns.  Scratch: x2, v2.
function ff_pred8x8_0l0_dc_neon, export=1
        movi            v0.8b,  #128            // rows 0-3
        add             x2,  x0,  x1,  lsl #2   // row 4 ...
        sub             x2,  x2,  #1            // ... one byte to the left
        ldcol.8         v1,  x2,  x1,  4        // left[4..7] in lanes 0-3
        uaddlp          v2.4h,  v1.8b
        addp            v2.4h,  v2.4h,  v2.4h   // lane 0 = sum of the four
        rshrn           v1.8b,  v2.8h,  #2      // rounding divide by 4
        dup             v1.8b,  v1.b[0]         // rows 4-7
        b               .L_pred8x8_dc_end
endfunc
362cabdff1aSopenharmony_ci
// ldcol.16 rd, rs, rt, n=4, hi=0
// Load single halfwords into successive 16-bit lanes of vector \rd.  Every
// load reads one halfword at [\rs] and then advances \rs by \rt, so with
// \rt holding a row stride the lanes receive one column of samples.
//   lanes 0-3 : loaded when \n >= 4 and \hi == 0
//   lanes 4-7 : loaded when \n == 8 or \hi == 1
// Lanes that are not loaded keep whatever \rd held before.
.macro ldcol.16  rd,  rs,  rt,  n=4,  hi=0
.if \n >= 4 && \hi == 0
.irp lane, 0, 1, 2, 3
        ld1             {\rd\().h}[\lane],  [\rs], \rt
.endr
.endif
.if \n == 8 || \hi == 1
.irp lane, 4, 5, 6, 7
        ld1             {\rd\().h}[\lane],  [\rs], \rt
.endr
.endif
.endm
377cabdff1aSopenharmony_ci
// Disabled: slower than C
/*
function ff_pred16x16_128_dc_neon_10, export=1
        movi            v0.8h, #2, lsl #8 // 512, 1 << (bit_depth - 1)

        b               .L_pred16x16_dc_10_end
endfunc
*/
386cabdff1aSopenharmony_ci
// ff_pred16x16_top_dc_neon_10
// Fill a block of 16 rows x 16 halfword samples with the rounded mean of
// the 16 halfwords in the row directly above it: (sum + 8) >> 4.
// In:  x0 = address of the first output row
//      x1 = byte offset from one row to the next
// The value is broadcast into v0 and stored by the shared loop at
// .L_pred16x16_dc_10_end (inside ff_pred16x16_dc_neon_10), which also
// returns.  Scratch: x2, v2, v3.
function ff_pred16x16_top_dc_neon_10, export=1
        sub             x2,  x0,  x1            // x2 = row above the block

        ld1             {v2.8h, v3.8h}, [x2]    // 16 samples of that row

        add             v2.8h,  v2.8h,  v3.8h
        addv            h2,  v2.8h              // h2 = sum of all 16

        urshr           v2.4h,  v2.4h,  #4      // rounding divide by 16
        dup             v0.8h,  v2.h[0]
        b               .L_pred16x16_dc_10_end
endfunc
399cabdff1aSopenharmony_ci
// Disabled: slower than C
/*
function ff_pred16x16_left_dc_neon_10, export=1
        sub             x2,  x0,  #2 // access to the "left" column
        ldcol.16        v0,  x2,  x1,  8
        ldcol.16        v1,  x2,  x1,  8 // load "left" column

        add             v0.8h, v0.8h, v1.8h
        addv            h0,  v0.8h

        urshr           v0.4h,  v0.4h,  #4
        dup             v0.8h, v0.h[0]
        b               .L_pred16x16_dc_10_end
endfunc
*/
415cabdff1aSopenharmony_ci
// ff_pred16x16_dc_neon_10
// Fill a block of 16 rows x 16 halfword samples with the rounded mean of
// the 16 samples above it and the 16 samples to its left:
// (sum_top + sum_left + 16) >> 5.
// In:  x0 = address of the first output row
//      x1 = byte offset from one row to the next
// Scratch: x2, x3, v0-v3, flags.  x0 is advanced past the last row.
function ff_pred16x16_dc_neon_10, export=1
        sub             x2,  x0,  x1            // "top" row
        ld1             {v0.8h, v1.8h}, [x2]
        add             v0.8h,  v0.8h,  v1.8h   // fold the top row to 8 lanes

        sub             x3,  x0,  #2            // "left" column
        ldcol.16        v2,  x3,  x1,  8        // rows 0-7
        ldcol.16        v3,  x3,  x1,  8        // rows 8-15
        add             v2.8h,  v2.8h,  v3.8h   // fold the left column

        add             v0.8h,  v0.8h,  v2.8h
        addv            h0,  v0.8h              // h0 = sum of all 32 samples

        urshr           v0.4h,  v0.4h,  #5      // rounding divide by 32
        dup             v0.8h,  v0.h[0]
.L_pred16x16_dc_10_end:
        // Shared tail: write v0 to all 16 samples of 16 consecutive rows
        // and return.  ff_pred16x16_top_dc_neon_10 branches here.
        mov             v1.16b,  v0.16b         // second half of each row
        mov             w3,  #8                 // 8 iterations x 2 rows
1:      subs            w3,  w3,  #1
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x0], x1
        b.ne            1b
        ret
endfunc
440cabdff1aSopenharmony_ci
// ff_pred16x16_hor_neon_10
// For each of 16 rows, replicate the halfword just left of the row across
// the row's 16 halfword samples (written as two 16-byte stores).
// In:  x0 = address of the first output row
//      x1 = byte offset from one row to the next
// Scratch: x2, x3, w4, v0, flags.
function ff_pred16x16_hor_neon_10, export=1
        mov             w4,  #16                // rows remaining
        add             x3,  x0,  #16           // x3 = samples 8..15 of row 0
        sub             x2,  x0,  #2            // x2 = sample left of row 0

2:      ld1r            {v0.8h},  [x2],  x1     // broadcast the left sample
        st1             {v0.8h},  [x0],  x1     // samples 0..7
        st1             {v0.8h},  [x3],  x1     // samples 8..15
        subs            w4,  w4,  #1
        b.ne            2b
        ret
endfunc
453cabdff1aSopenharmony_ci
// ff_pred16x16_vert_neon_10
// Copy the 16 halfword samples of the row above the block into each of its
// 16 rows.
// In:  x0 = address of the first output row
//      x1 = byte offset from one row to the next (doubled internally)
// Two pointers are used, one for even and one for odd rows, each stepping
// two rows per store.  Scratch: x1, x2, w3, v0, v1, flags.
function ff_pred16x16_vert_neon_10, export=1
        mov             w3,  #8                 // 8 iterations x 2 rows
        sub             x2,  x0,  x1            // x2 = row above the block
        lsl             x1,  x1,  #1            // step = two rows

        ld1             {v0.8h, v1.8h},  [x2],  x1      // top row; x2 -> row 1

2:      st1             {v0.8h, v1.8h},  [x0],  x1      // even rows
        st1             {v0.8h, v1.8h},  [x2],  x1      // odd rows
        subs            w3,  w3,  #1
        b.ne            2b
        ret
endfunc
468cabdff1aSopenharmony_ci
// ff_pred16x16_plane_neon_10
// Fill a block of 16 rows x 16 halfword samples with a linear gradient
// derived from the row above and the column to the left of the block.
// In:  x0 = address of the first output row
//      x1 = byte offset from one row to the next
// With top[i] / left[i] naming the neighbouring row / column (index -1 is
// the shared corner sample), the code evaluates
//      H = sum(k * (top[7+k]  - top[7-k])),   k = 1..8
//      V = sum(k * (left[7+k] - left[7-k])),  k = 1..8
//      b = (5*H + 32) >> 6,   c = (5*V + 32) >> 6
//      a = 16 * (top[15] + left[15] + 1) - 7 * (b + c)
//      out[y][x] = min(saturate_u16((a + b*x + c*y) >> 5), 1023)
// The per-pixel accumulators are 32-bit; H, V, b, c and the products that
// feed them are formed in 16-bit lanes.
// NOTE(review): if samples span the full 0..1023 range implied by the clip
// constant, |H| or |V| can reach 36 * 1023 = 36828, which does not fit a
// signed 16-bit lane - confirm whether such inputs must be handled.
// Scratch: x2-x4, v0-v7, v16, v17, flags.
function ff_pred16x16_plane_neon_10, export=1
        // ---- gather the neighbouring samples ----
        sub             x3,  x0,  x1            // x3 = row above the block
        add             x2,  x3,  #16           // x2 = &top[8]
        sub             x3,  x3,  #2            // x3 = &top[-1]
        ld1             {v0.8h},  [x3]          // top[-1..6]
        ld1             {v2.8h},  [x2], x1      // top[8..15]
        ldcol.16        v1,  x3,  x1, 8         // left[-1..6]
        add             x3,  x3,  x1            // step over row 7
        ldcol.16        v3,  x3,  x1, 8         // left[8..15]

        // ---- reverse all eight lanes of the near-side samples ----
        rev64           v16.8h,  v0.8h
        rev64           v17.8h,  v1.8h
        ext             v0.16b, v16.16b, v16.16b, #8    // top[6], ..., top[-1]
        ext             v1.16b, v17.16b, v17.16b, #8    // left[6], ..., left[-1]

        // ---- weighted differences H and V ----
        add             v7.8h,  v2.8h,  v3.8h   // lane 7 = top[15] + left[15]
        ext             v7.16b, v7.16b, v7.16b, #14     // rotate it to lane 0
        sub             v2.8h,  v2.8h,  v0.8h
        sub             v3.8h,  v3.8h,  v1.8h
        movrel          x4,  p16weight
        ld1             {v0.8h},     [x4]       // weights 1..8
        add             v7.4h,  v7.4h,  v0.4h   // lane 0: + 1 (first weight)
        mul             v2.8h,  v2.8h,  v0.8h
        mul             v3.8h,  v3.8h,  v0.8h
        addp            v2.8h,  v2.8h,  v3.8h
        addp            v2.8h,  v2.8h,  v2.8h
        addp            v2.4h,  v2.4h,  v2.4h   // lane 0 = H, lane 1 = V

        // ---- slopes b, c and start value a ----
        sshll           v3.4s,  v2.4h,  #2
        saddw           v2.4s,  v3.4s,  v2.4h   // 5*H, 5*V
        rshrn           v4.4h,  v2.4s,  #6      // lane 0 = b, lane 1 = c
        trn2            v5.4h,  v4.4h,  v4.4h   // lane 0 = c
        add             v2.4h,  v4.4h,  v5.4h   // lane 0 = b + c
        shl             v3.4h,  v2.4h,  #3
        sub             v3.4h,  v3.4h,  v2.4h   // 7 * (b + c)
        shl             v2.4h,  v7.4h,  #4
        ssubl           v2.4s,  v2.4h,  v3.4h   // lane 0 = a (32-bit)
        shl             v3.4h,  v4.4h,  #4      // 16 * b
        ssubl           v6.4s,  v5.4h,  v3.4h   // lane 0 = c - 16*b (32-bit)

        // ---- per-row accumulators (v16 = columns 0-3, v17 = 4-7) ----
        ext             v0.16b, v0.16b, v0.16b, #14
        mov             v0.h[0],  wzr           // v0 = 0, 1, ..., 7
        mul             v0.8h,  v0.8h,  v4.h[0] // b * x for x = 0..7
        dup             v16.4s, v2.s[0]
        dup             v17.4s, v2.s[0]
        dup             v2.8h,  v4.h[0]
        dup             v3.4s,  v6.s[0]
        shl             v2.8h,  v2.8h,  #3      // 8*b: step to columns 8..15
        saddw           v16.4s, v16.4s, v0.4h   // a + b*x, x = 0..3
        saddw2          v17.4s, v17.4s, v0.8h   // a + b*x, x = 4..7
        saddw           v3.4s,  v3.4s,  v2.4h   // c - 8*b: step to next row

        mvni            v4.8h,   #0xFC, lsl #8  // 1023 for clipping
        mov             w3,      #16            // rows remaining
2:      subs            w3,  w3,  #1

        sqshrun         v0.4h,  v16.4s, #5      // columns 0..7
        sqshrun2        v0.8h,  v17.4s, #5
        smin            v0.8h,  v0.8h,  v4.8h
        saddw           v16.4s, v16.4s, v2.4h
        saddw           v17.4s, v17.4s, v2.4h

        sqshrun         v1.4h,  v16.4s, #5      // columns 8..15
        sqshrun2        v1.8h,  v17.4s, #5
        smin            v1.8h,  v1.8h,  v4.8h
        add             v16.4s, v16.4s, v3.4s
        add             v17.4s, v17.4s, v3.4s

        st1             {v0.8h, v1.8h}, [x0], x1
        b.ne            2b
        ret
endfunc
541cabdff1aSopenharmony_ci
// 8x8 horizontal prediction on 16-bit samples.
//   x0: address of the first sample of the block (advanced by x1 per row)
//   x1: byte step from one row to the next
// Each of the 8 rows is filled with the halfword stored 2 bytes before it.
// Scratch: x2, w3, v0, condition flags.
function ff_pred8x8_hor_neon_10, export=1
        mov             w3,  #8                 // rows still to write
        sub             x2,  x0,  #2            // x2 -> halfword just before row 0
1:
        ld1r            {v0.8h},  [x2], x1      // splat that halfword over 8 lanes, go down a row
        st1             {v0.8h},  [x0], x1      // write the row, go down a row
        subs            w3,  w3,  #1
        b.ne            1b
        ret
endfunc
552cabdff1aSopenharmony_ci
// 8x8 vertical prediction on 16-bit samples.
//   x0: address of the first sample of the block
//   x1: byte step from one row to the next (doubled inside this function)
// The 8 halfwords found at x0 - x1 are copied into all 8 rows of the block.
// Scratch: x2, w3, v0, condition flags; x0 and x1 are modified.
function ff_pred8x8_vert_neon_10, export=1
        mov             w3,  #4                 // 4 iterations, two rows each
        sub             x2,  x0,  x1            // x2 -> row above the block
        lsl             x1,  x1,  #1            // from here on step two rows at a time
        ld1             {v0.8h},  [x2], x1      // fetch the row above; x2 now -> row 1
1:
        st1             {v0.8h},  [x0], x1      // rows 0, 2, 4, 6
        st1             {v0.8h},  [x2], x1      // rows 1, 3, 5, 7
        subs            w3,  w3,  #1
        b.ne            1b
        ret
endfunc
565cabdff1aSopenharmony_ci
// 8x8 plane prediction on 16-bit samples, output clipped to [0, 1023].
//   x0: address of the first sample of the block (advanced by x1 per row)
//   x1: byte step from one row to the next
// Scratch: x2-x5, v0-v7, condition flags.
// NOTE(review): movrel, ldcol.16, p8weight and p16weight are defined outside
// this function. The lane remarks below assume ldcol.16 loads one halfword
// per x1 step (like the ldcol.8 macro at the top of the file) and that
// hi=1 targets lanes 4-7 -- confirm against the macro and table definitions.
function ff_pred8x8_plane_neon_10, export=1
        // ---- gather the neighbouring samples and the weight table ----
        sub             x3,  x0,  x1            // row above the block
        movrel          x4,  p8weight
        movrel          x5,  p16weight
        add             x2,  x3,  #8            // row above, 8 bytes in
        sub             x3,  x3,  #2            // row above, 2 bytes before its start
        ld1             {v2.d}[0],  [x2], x1    // v2.h[0..3] <- 4 halfwords at x0 - x1 + 8
        ld1             {v0.d}[0],  [x3]        // v0.h[0..3] <- 4 halfwords at x0 - x1 - 2
        ldcol.16        v0,  x3,  x1,  hi=1     // v0 upper lanes <- column walk starting at x3
        add             x3,  x3,  x1            // skip one row of the column
        ldcol.16        v3,  x3,  x1,  4        // v3 <- the next 4 column entries
        ld1             {v6.8h},  [x4]          // p8weight

        // ---- weighted differences -> two gradient terms in v5.h[0], v5.h[1] ----
        rev64           v0.8h,  v0.8h           // reverse the halfwords inside each 64-bit half
        add             v7.8h,  v2.8h,  v3.8h   // v2 + v3 (only the low half is used later)
        trn1            v2.2d,  v2.2d,  v3.2d   // v2 = { low half of v2, low half of v3 }
        rev64           v7.4h,  v7.4h           // lane 3 of the sums moves to lane 0
        sub             v2.8h,  v2.8h,  v0.8h   // differences against the reversed samples
        ld1             {v0.8h},  [x5]          // p16weight (the samples in v0 are consumed)
        mul             v2.8h,  v2.8h,  v6.8h   // apply p8weight
        saddlp          v2.4s,  v2.8h
        addp            v2.4s,  v2.4s,  v2.4s   // s[0] = sum of lanes 0-3, s[1] = sum of lanes 4-7
        shl             v3.4s,  v2.4s,  #4
        add             v2.4s,  v3.4s,  v2.4s   // 17 * sum
        rshrn           v5.4h,  v2.4s,  #5      // (17 * sum + 16) >> 5

        // ---- value for row 0 / column 0 ----
        addp            v2.4h,  v5.4h,  v5.4h   // h[0] = v5.h[0] + v5.h[1]
        shl             v3.4h,  v2.4h,  #1
        add             v3.4h,  v3.4h,  v2.4h   // 3 * (v5.h[0] + v5.h[1])
        add             v7.4h,  v7.4h,  v0.4h   // add the first p16weight entries to the sums
        shl             v2.4h,  v7.4h,  #4
        ssubl           v2.4s,  v2.4h,  v3.4h   // s[0] = 16 * v7.h[0] - 3 * (v5.h[0] + v5.h[1])

        // ---- per-column start values (v1: columns 0-3, v2: columns 4-7) ----
        dup             v1.4s,  v2.s[0]
        dup             v2.4s,  v2.s[0]
        dup             v3.8h,  v5.h[1]         // increment applied once per row
        ext             v0.16b, v0.16b, v0.16b, #14     // rotate the weights up by one lane
        mov             v0.h[0],  wzr           // column 0 gets no offset
        mul             v0.8h,  v0.8h,  v5.h[0] // per-column offsets
        saddw           v1.4s,  v1.4s,  v0.4h
        saddw2          v2.4s,  v2.4s,  v0.8h

        mvni            v4.8h,  #0xFC,  lsl #8  // 1023 for clipping
        mov             w3,  #8                 // row counter; x3 is done serving as a pointer
1:
        sqshrun         v0.4h,  v1.4s,  #5      // >> 5, negative values saturate to 0
        saddw           v1.4s,  v1.4s,  v3.4h   // columns 0-3: step to the next row
        sqshrun2        v0.8h,  v2.4s,  #5
        saddw           v2.4s,  v2.4s,  v3.4h   // columns 4-7: step to the next row
        smin            v0.8h,  v0.8h,  v4.8h   // upper clamp
        st1             {v0.8h},  [x0],  x1
        subs            w3,  w3,  #1
        b.ne            1b
        ret
endfunc
621cabdff1aSopenharmony_ci
// 8x8 DC prediction that reads no neighbouring samples: the whole block is
// set to 512. x0 and x1 are consumed by the shared store loop at
// .L_pred8x8_dc_10_end, which writes v0 to rows 0-3 and v1 to rows 4-7.
function ff_pred8x8_128_dc_neon_10, export=1
        movi            v1.8h,  #2, lsl #8      // rows 4-7: 512, 1 << (bit_depth - 1)
        movi            v0.8h,  #2, lsl #8      // rows 0-3: same value
        b               .L_pred8x8_dc_10_end
endfunc
627cabdff1aSopenharmony_ci
// 8x8 DC prediction from the row above only (16-bit samples).
//   x0: address of the first sample of the block, x1: byte step between rows
// Columns 0-3 of every row get the rounded mean of the 4 samples above them,
// columns 4-7 likewise. Storing is done by .L_pred8x8_dc_10_end (v0: rows
// 0-3, v1: rows 4-7).
function ff_pred8x8_top_dc_neon_10, export=1
        sub             x2,  x0,  x1            // row above the block
        ld1             {v0.8h},  [x2]

        addp            v0.8h,  v0.8h,  v0.8h   // pair sums
        addp            v0.4h,  v0.4h,  v0.4h   // h[0] = samples 0-3, h[1] = samples 4-7
        urshr           v2.4h,  v0.4h,  #2      // (sum + 2) >> 2
        zip1            v2.4h,  v2.4h,  v2.4h   // { lo, lo, hi, hi }
        zip1            v1.8h,  v2.8h,  v2.8h   // { lo x4, hi x4 } for rows 4-7
        zip1            v0.8h,  v2.8h,  v2.8h   // same pattern for rows 0-3
        b               .L_pred8x8_dc_10_end
endfunc
640cabdff1aSopenharmony_ci
// 8x8 DC prediction from the left column only (16-bit samples).
//   x0: address of the first sample of the block, x1: byte step between rows
// Rows 0-3 get the rounded mean of the first four column entries, rows 4-7
// the mean of the last four. Storing is done by .L_pred8x8_dc_10_end.
// NOTE(review): ldcol.16 is defined elsewhere; presumably it loads one
// halfword per x1 step into consecutive lanes -- confirm.
function ff_pred8x8_left_dc_neon_10, export=1
        sub             x2,  x0,  #2            // halfword just before row 0
        ldcol.16        v0,  x2,  x1,  8        // 8 column entries

        addp            v0.8h,  v0.8h,  v0.8h   // pair sums
        addp            v0.4h,  v0.4h,  v0.4h   // h[0] = lanes 0-3, h[1] = lanes 4-7
        urshr           v2.4h,  v0.4h,  #2      // (sum + 2) >> 2
        dup             v0.8h,  v2.h[0]         // value for rows 0-3
        dup             v1.8h,  v2.h[1]         // value for rows 4-7
        b               .L_pred8x8_dc_10_end
endfunc
652cabdff1aSopenharmony_ci
// 8x8 DC prediction from both the row above and the left column (16-bit
// samples), computed separately for each 4x4 quadrant.
//   x0: address of the first sample of the block, x1: byte step between rows
// Below, T0/T1 are the sums of samples 0-3 / 4-7 of the row above and L0/L1
// the sums of lanes 0-3 / 4-7 of the column loaded by ldcol.16.
// NOTE(review): ldcol.16 is defined elsewhere; presumably it loads one
// halfword per x1 step into consecutive lanes -- confirm.
// Scratch: x2, x3 (w3), v0-v7, condition flags.
function ff_pred8x8_dc_neon_10, export=1
        sub             x3,  x0,  #2            // halfword just before row 0
        sub             x2,  x0,  x1            // row above the block

        ld1             {v0.8h}, [x2]
        ldcol.16        v1,  x3,  x1, 8

        addp            v0.8h,  v0.8h, v0.8h    // pair sums of the row
        addp            v1.8h,  v1.8h, v1.8h    // pair sums of the column
        trn2            v3.2s,  v0.2s,  v1.2s   // second-half pair sums of row and column
        trn1            v2.2s,  v0.2s,  v1.2s   // first-half pair sums of row and column
        addp            v4.4h,  v2.4h,  v3.4h   // { T0, L0, T1, L1 }
        addp            v5.4h,  v4.4h,  v4.4h   // { T0 + L0, T1 + L1, ... }
        urshr           v7.4h,  v4.4h,  #2      // means over 4 samples
        urshr           v6.4h,  v5.4h,  #3      // means over 8 samples

        dup             v0.8h,  v6.h[0]         // top-left quadrant: (T0 + L0 + 4) >> 3
        dup             v2.8h,  v7.h[2]         // top-right quadrant: (T1 + 2) >> 2
        zip1            v0.2d,  v0.2d,  v2.2d   // rows 0-3

        dup             v1.8h,  v7.h[3]         // bottom-left quadrant: (L1 + 2) >> 2
        dup             v3.8h,  v6.h[1]         // bottom-right quadrant: (T1 + L1 + 4) >> 3
        zip1            v1.2d,  v1.2d,  v3.2d   // rows 4-7

        // Shared store loop, also branched to by the other pred8x8 *_dc_neon_10
        // functions: v0 is written to rows 0-3 and v1 to rows 4-7.
.L_pred8x8_dc_10_end:
        add             x2,  x0,  x1,  lsl #2   // x2 -> row 4
        mov             w3,  #4                 // 4 iterations, one upper and one lower row each
6:
        st1             {v0.8h},  [x0], x1
        st1             {v1.8h},  [x2], x1
        subs            w3,  w3,  #1
        b.ne            6b
        ret
endfunc
684cabdff1aSopenharmony_ci
// 8x8 DC prediction using the row above and only the first four entries of
// the left column (16-bit samples).
//   x0: address of the first sample of the block, x1: byte step between rows
// T0/T1 are the sums of samples 0-3 / 4-7 of the row above, L0 the sum of
// the four column entries. Storing is done by .L_pred8x8_dc_10_end.
// NOTE(review): ldcol.16 is defined elsewhere; presumably it loads one
// halfword per x1 step into lanes 0-3 here -- confirm.
function ff_pred8x8_l0t_dc_neon_10, export=1
        sub             x3,  x0,  #2            // halfword just before row 0
        sub             x2,  x0,  x1            // row above the block

        ld1             {v0.8h},  [x2]
        ldcol.16        v1,  x3,  x1, 4

        // row above
        addp            v0.8h,  v0.8h,  v0.8h
        addp            v0.4h,  v0.4h,  v0.4h   // h[0] = T0, h[1] = T1
        urshr           v2.4h,  v0.4h,  #2      // (T + 2) >> 2

        // left column combined with the row above
        addp            v1.4h,  v1.4h,  v1.4h
        addp            v1.4h,  v1.4h,  v1.4h   // every lane = L0
        add             v1.4h,  v1.4h,  v0.4h   // h[0] = L0 + T0
        urshr           v3.4h,  v1.4h,  #3      // the pred4x4-style mean over 8 samples

        dup             v6.4h,  v2.h[1]         // mean of T1
        dup             v5.4h,  v2.h[0]         // mean of T0
        dup             v4.4h,  v3.h[0]         // mean of L0 + T0

        zip1            v1.2d,  v5.2d,  v6.2d   // rows 4-7: { T0 mean, T1 mean }
        zip1            v0.2d,  v4.2d,  v6.2d   // rows 0-3: { L0+T0 mean, T1 mean }
        b               .L_pred8x8_dc_10_end
endfunc
709cabdff1aSopenharmony_ci
// 8x8 DC prediction using only the first four entries of the left column
// (16-bit samples): rows 0-3 get their rounded mean, rows 4-7 get 512.
//   x0: address of the first sample of the block, x1: byte step between rows
// Storing is done by .L_pred8x8_dc_10_end (v0: rows 0-3, v1: rows 4-7).
// NOTE(review): ldcol.16 is defined elsewhere; presumably it loads one
// halfword per x1 step into lanes 0-3 here -- confirm.
function ff_pred8x8_l00_dc_neon_10, export=1
        movi            v1.8h,  #2, lsl #8      // rows 4-7: 512
        sub             x2,  x0,  #2            // halfword just before row 0

        ldcol.16        v0,  x2,  x1,  4

        addp            v0.4h,  v0.4h,  v0.4h
        addp            v0.4h,  v0.4h,  v0.4h   // every lane = sum of the four entries
        urshr           v0.4h,  v0.4h,  #2      // (sum + 2) >> 2

        dup             v0.8h,  v0.h[0]         // rows 0-3
        b               .L_pred8x8_dc_10_end
endfunc
723cabdff1aSopenharmony_ci
// 8x8 DC prediction using the row above and only the left-column entries
// next to rows 4-7 (16-bit samples).
//   x0: address of the first sample of the block, x1: byte step between rows
// T0/T1 are the sums of samples 0-3 / 4-7 of the row above, L1 the sum of
// lanes 4-7 of v1. Storing is done by .L_pred8x8_dc_10_end.
// NOTE(review): ldcol.16 is defined elsewhere; presumably hi=1 makes it fill
// lanes 4-7 only, one halfword per x1 step. Lanes derived from v1 lanes 0-3
// are never selected below -- confirm against the macro.
function ff_pred8x8_0lt_dc_neon_10, export=1
        sub             x2,  x0,  x1            // row above the block
        add             x3,  x0,  x1,  lsl #2   // row 4 ...
        sub             x3,  x3,  #2            // ... one halfword before its start

        ld1             {v0.8h},  [x2]
        ldcol.16        v1,  x3,  x1,  hi=1

        // row above
        addp            v0.8h,  v0.8h,  v0.8h
        addp            v0.4h,  v0.4h,  v0.4h   // h[0] = T0, h[1] = T1
        // left column
        addp            v1.8h,  v1.8h,  v1.8h
        addp            v1.4h,  v1.4h,  v1.4h   // h[1] = L1
        zip1            v0.2s,  v0.2s,  v1.2s   // { T0, T1, v1.h[0], L1 }
        add             v1.4h,  v0.4h,  v1.4h   // h[1] = T1 + L1

        urshr           v3.4h,  v1.4h,  #3      // means over 8 samples
        urshr           v2.4h,  v0.4h,  #2      // means over 4 samples

        dup             v4.4h,  v2.h[0]         // mean of T0
        dup             v6.4h,  v2.h[1]         // mean of T1
        dup             v5.4h,  v2.h[3]         // mean of L1
        dup             v7.4h,  v3.h[1]         // mean of T1 + L1

        zip1            v0.2d,  v4.2d,  v6.2d   // rows 0-3: { T0 mean, T1 mean }
        zip1            v1.2d,  v5.2d,  v7.2d   // rows 4-7: { L1 mean, T1+L1 mean }
        b               .L_pred8x8_dc_10_end
endfunc
751cabdff1aSopenharmony_ci
// 8x8 DC prediction using only the four left-column entries next to rows
// 4-7 (16-bit samples): rows 0-3 get 512, rows 4-7 get the rounded mean.
//   x0: address of the first sample of the block, x1: byte step between rows
// Storing is done by .L_pred8x8_dc_10_end (v0: rows 0-3, v1: rows 4-7).
// NOTE(review): ldcol.16 is defined elsewhere; presumably it loads one
// halfword per x1 step into lanes 0-3 here -- confirm.
function ff_pred8x8_0l0_dc_neon_10, export=1
        movi            v0.8h,  #2,  lsl #8     // rows 0-3: 512
        add             x2,  x0,  x1,  lsl #2   // row 4 ...
        sub             x2,  x2,  #2            // ... one halfword before its start

        ldcol.16        v1,  x2,  x1,  4

        addp            v2.8h,  v1.8h,  v1.8h
        addp            v2.4h,  v2.4h,  v2.4h   // h[0] = sum of lanes 0-3 of v1
        urshr           v1.4h,  v2.4h,  #2      // (sum + 2) >> 2

        dup             v1.8h,  v1.h[0]         // rows 4-7
        b               .L_pred8x8_dc_10_end
endfunc
766