1/*
2 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21#include "libavutil/aarch64/asm.S"
22
// ldcol.8  rd, rs, rt, n=8, hi=0
// Gather a column of bytes into lanes of vector rd.  Every load reads one
// byte from [rs] and then post-increments rs by rt.
//   n >= 8        : lanes 0-7 are loaded; n == 16 additionally loads lanes 8-15
//   n <  8, hi=0  : only lanes 0-3 are loaded
//   n <  8, hi=1  : only lanes 4-7 are loaded
// Lanes that are not loaded are left unchanged (single-lane ld1).
// rs is advanced by rt once per loaded lane.
.macro ldcol.8  rd,  rs,  rt,  n=8,  hi=0
.if \n >= 8 || \hi == 0
.irp    lane, 0, 1, 2, 3
        ld1             {\rd\().b}[\lane],  [\rs], \rt
.endr
.endif
.if \n >= 8 || \hi == 1
.irp    lane, 4, 5, 6, 7
        ld1             {\rd\().b}[\lane],  [\rs], \rt
.endr
.endif
.if \n == 16
.irp    lane, 8, 9, 10, 11, 12, 13, 14, 15
        ld1             {\rd\().b}[\lane],  [\rs], \rt
.endr
.endif
.endm
47
// Fill a 16x16 block of bytes with the constant 128.
//   x0 = block pointer, x1 = row stride in bytes
// Only the fill value is prepared here (v0); the rows are written by the
// shared store loop at .L_pred16x16_dc_end.
function ff_pred16x16_128_dc_neon, export=1
        movi            v0.16b, #0x80                   // 128 in every byte lane
        b               .L_pred16x16_dc_end
endfunc
52
// Fill a 16x16 block of bytes with the rounded mean of the 16 bytes in the
// row directly above it: (sum + 8) >> 4.
//   x0 = block pointer, x1 = row stride in bytes
// Clobbers x4 and v0, then tail-branches to .L_pred16x16_dc_end.
function ff_pred16x16_top_dc_neon, export=1
        sub             x4,  x0,  x1                    // x4 -> row above the block
        ld1             {v0.16b}, [x4]
        uaddlv          h0,  v0.16b                     // h0 = sum of the 16 bytes
        rshrn           v0.8b,  v0.8h,  #4              // rounding shift: (sum + 8) >> 4
        dup             v0.16b, v0.b[0]                 // broadcast to all 16 lanes
        b               .L_pred16x16_dc_end
endfunc
61
// Fill a 16x16 block of bytes with the rounded mean of the 16 bytes in the
// column directly to its left: (sum + 8) >> 4.
//   x0 = block pointer, x1 = row stride in bytes
// Clobbers x3 and v0, then tail-branches to .L_pred16x16_dc_end.
function ff_pred16x16_left_dc_neon, export=1
        sub             x3,  x0,  #1                    // x3 -> byte left of row 0
        ldcol.8         v0,  x3,  x1, 16                // one byte per row, 16 rows
        uaddlv          h0,  v0.16b                     // h0 = sum of the column
        rshrn           v0.8b,  v0.8h,  #4              // rounding shift: (sum + 8) >> 4
        dup             v0.16b, v0.b[0]                 // broadcast to all 16 lanes
        b               .L_pred16x16_dc_end
endfunc
70
// Fill a 16x16 block of bytes with the rounded mean of the 16 bytes above it
// and the 16 bytes to its left: (sum_top + sum_left + 16) >> 5.
//   x0 = block pointer, x1 = row stride in bytes
// Clobbers x2, x3, v0, v1 and the flags.
//
// .L_pred16x16_dc_end is the store loop shared by the other 16x16 8-bit DC
// entry points: it expects the fill value in v0.16b and writes 16 rows of
// 16 bytes starting at x0, stepping by x1.
function ff_pred16x16_dc_neon, export=1
        sub             x3,  x0,  #1                    // x3 -> byte left of row 0
        sub             x2,  x0,  x1                    // x2 -> row above the block
        ld1             {v0.16b}, [x2]
        uaddlv          h0,  v0.16b                     // h0 = sum of the top row
        ldcol.8         v1,  x3,  x1, 16
        uaddlv          h1,  v1.16b                     // h1 = sum of the left column
        add             v0.4h,  v0.4h,  v1.4h
        rshrn           v0.8b,  v0.8h,  #5              // (sum + 16) >> 5
        dup             v0.16b, v0.b[0]
.L_pred16x16_dc_end:
        mov             w3,  #8                         // 8 iterations, two rows each
1:      subs            w3,  w3,  #1
        st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x0], x1
        b.ne            1b
        ret
endfunc
89
// Horizontal prediction for a 16x16 block of bytes: every row is filled with
// the byte immediately to the left of that row.
//   x0 = block pointer, x1 = row stride in bytes
// Clobbers x2, w3, v0 and the flags.
function ff_pred16x16_hor_neon, export=1
        mov             w3,  #16                        // row counter
        sub             x2,  x0,  #1                    // x2 -> byte left of row 0
2:      ld1r            {v0.16b}, [x2], x1              // replicate the left byte
        st1             {v0.16b}, [x0], x1
        subs            w3,  w3,  #1
        b.ne            2b
        ret
endfunc
99
// Vertical prediction for a 16x16 block of bytes: the row above the block is
// copied into all 16 rows.
//   x0 = block pointer, x1 = row stride in bytes (doubled internally)
// Two pointers are used, x0 for even rows and x2 for odd rows, each stepping
// by two rows.  Clobbers x1, x2, w3, v0 and the flags.
function ff_pred16x16_vert_neon, export=1
        mov             w3,  #8                         // 8 iterations, two rows each
        sub             x2,  x0,  x1                    // x2 -> row above the block
        lsl             x1,  x1,  #1                    // x1 = 2 * stride
        ld1             {v0.16b}, [x2], x1              // load top row; x2 -> row 1
2:      st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x2], x1
        subs            w3,  w3,  #1
        b.ne            2b
        ret
endfunc
111
// Plane (gradient) prediction for a 16x16 block of bytes.
//   x0 = block pointer, x1 = row stride in bytes
// With top[] the row above the block and left[] the column to its left
// (index -1 is the shared corner byte):
//   H = sum over i=0..7 of (i + 1) * (top[8 + i]  - top[6 - i])
//   V = sum over i=0..7 of (i + 1) * (left[8 + i] - left[6 - i])
//   b = (5 * H + 32) >> 6,  c = (5 * V + 32) >> 6
//   base = 16 * (top[15] + left[15] + 1) - 7 * (b + c)
// Each output byte at column x, row y is (base + b * x + c * y) >> 5,
// saturated to an unsigned byte by sqshrun.
// Clobbers x2-x4, v0-v7 and the flags.
function ff_pred16x16_plane_neon, export=1
        movrel          x4,  p16weight
        sub             x3,  x0,  x1                    // x3 -> row above the block
        add             x2,  x3,  #8                    // x2 -> top[8]
        sub             x3,  x3,  #1                    // x3 -> top[-1] (corner)
        ld1             {v0.8b},  [x3]                  // v0 = top[-1..6]
        ld1             {v2.8b},  [x2], x1              // v2 = top[8..15]
        ldcol.8         v1,  x3,  x1                    // v1 = left rows -1..6
        add             x3,  x3,  x1                    // skip row 7
        ldcol.8         v3,  x3,  x1                    // v3 = left rows 8..15
        rev64           v1.8b,  v1.8b                   // v1 = left rows 6..-1
        rev64           v0.8b,  v0.8b                   // v0 = top[6..-1]
        uaddl           v7.8h,  v2.8b,  v3.8b           // h[7] = top[15] + left[15]
        usubl           v2.8h,  v2.8b,  v0.8b           // top[8 + i] - top[6 - i]
        usubl           v3.8h,  v3.8b,  v1.8b           // left[8 + i] - left[6 - i]
        ld1             {v0.8h},  [x4]                  // v0 = weights 1..8
        mul             v2.8h,  v2.8h,  v0.8h
        mul             v3.8h,  v3.8h,  v0.8h
        addp            v2.8h,  v2.8h,  v3.8h
        addp            v2.8h,  v2.8h,  v2.8h
        addp            v2.4h,  v2.4h,  v2.4h           // h[0] = H, h[1] = V
        sshll           v3.4s,  v2.4h,  #2
        saddw           v2.4s,  v3.4s,  v2.4h           // 5 * H, 5 * V
        rshrn           v4.4h,  v2.4s,  #6              // h[0] = b, h[1] = c
        trn2            v5.4h,  v4.4h,  v4.4h           // v5.h[0] = c
        add             v2.4h,  v4.4h,  v5.4h           // h[0] = b + c
        shl             v3.4h,  v2.4h,  #3
        ext             v7.16b, v7.16b, v7.16b, #14     // h[0] = top[15] + left[15]
        sub             v3.4h,  v3.4h,  v2.4h           // 7 * (b + c)
        add             v7.4h,  v7.4h,  v0.4h           // lane 0: + 1 (first weight)
        shl             v2.4h,  v7.4h,  #4              // 16 * (top[15] + left[15] + 1)
        sub             v2.4h,  v2.4h,  v3.4h           // h[0] = base
        shl             v3.4h,  v4.4h,  #4              // h[0] = 16 * b
        ext             v0.16b, v0.16b, v0.16b, #14     // weights rotated: 8,1,...,7
        sub             v6.4h,  v5.4h,  v3.4h           // h[0] = c - 16 * b
        mov             v0.h[0],  wzr                   // v0 = 0,1,...,7
        mul             v0.8h,  v0.8h,  v4.h[0]         // b * x for x = 0..7
        dup             v1.8h,  v2.h[0]
        dup             v2.8h,  v4.h[0]
        dup             v3.8h,  v6.h[0]
        shl             v2.8h,  v2.8h,  #3              // 8 * b: step to columns 8..15
        add             v1.8h,  v1.8h,  v0.8h           // accumulator for x = 0..7
        add             v3.8h,  v3.8h,  v2.8h           // c - 8 * b: step to next row
        mov             w3,  #16                        // row counter
2:
        sqshrun         v0.8b,  v1.8h,  #5              // columns 0..7
        add             v1.8h,  v1.8h,  v2.8h
        sqshrun2        v0.16b, v1.8h,  #5              // columns 8..15
        add             v1.8h,  v1.8h,  v3.8h
        subs            w3,  w3,  #1
        st1             {v0.16b}, [x0], x1
        b.ne            2b
        ret
endfunc
166
// Eight 16-bit weights 1..8, loaded as one .8h vector by the plane
// prediction functions in this file.
const   p16weight, align=4
        .short          1, 2, 3, 4, 5, 6, 7, 8
endconst
// Weights 1..4 repeated twice (16-bit lanes), loaded by the 8x8 plane
// prediction functions.
const   p8weight, align=4
        .short          1, 2, 3, 4, 1, 2, 3, 4
endconst
173
// Horizontal prediction for an 8x8 block of bytes: every row is filled with
// the byte immediately to the left of that row.
//   x0 = block pointer, x1 = row stride in bytes
// Clobbers x2, w3, v0 and the flags.
function ff_pred8x8_hor_neon, export=1
        mov             w3,  #8                         // row counter
        sub             x2,  x0,  #1                    // x2 -> byte left of row 0
2:      ld1r            {v0.8b},  [x2], x1              // replicate the left byte
        st1             {v0.8b},  [x0], x1
        subs            w3,  w3,  #1
        b.ne            2b
        ret
endfunc
183
// Vertical prediction for an 8x8 block of bytes: the 8 bytes above the block
// are copied into all 8 rows.
//   x0 = block pointer, x1 = row stride in bytes (doubled internally)
// x0 writes the even rows and x2 the odd rows, each stepping by two rows.
// Clobbers x1, x2, w3, v0 and the flags.
function ff_pred8x8_vert_neon, export=1
        mov             w3,  #4                         // 4 iterations, two rows each
        sub             x2,  x0,  x1                    // x2 -> row above the block
        add             x1,  x1,  x1                    // x1 = 2 * stride
        ld1             {v0.8b},  [x2], x1              // load top row; x2 -> row 1
2:      st1             {v0.8b},  [x0], x1
        st1             {v0.8b},  [x2], x1
        subs            w3,  w3,  #1
        b.ne            2b
        ret
endfunc
195
// Plane (gradient) prediction for an 8x8 block of bytes.
//   x0 = block pointer, x1 = row stride in bytes
// With top[] the row above the block and left[] the column to its left
// (index -1 is the shared corner byte):
//   H = sum over i=0..3 of (i + 1) * (top[4 + i]  - top[2 - i])
//   V = sum over i=0..3 of (i + 1) * (left[4 + i] - left[2 - i])
//   b = (17 * H + 16) >> 5,  c = (17 * V + 16) >> 5
//   base = 16 * (top[7] + left[7] + 1) - 3 * (b + c)
// Each output byte at column x, row y is (base + b * x + c * y) >> 5,
// saturated to an unsigned byte by sqshrun.
// Clobbers x2-x5, v0-v3, v5-v7 and the flags.
function ff_pred8x8_plane_neon, export=1
        movrel          x4,  p8weight
        movrel          x5,  p16weight
        sub             x3,  x0,  x1                    // x3 -> row above the block
        add             x2,  x3,  #4                    // x2 -> top[4]
        sub             x3,  x3,  #1                    // x3 -> top[-1] (corner)
        ld1             {v0.s}[0],  [x3]                // v0.b[0..3] = top[-1..2]
        ld1             {v2.s}[0],  [x2], x1            // v2.b[0..3] = top[4..7]
        ldcol.8         v0,  x3,  x1,  4,  hi=1         // v0.b[4..7] = left rows -1..2
        add             x3,  x3,  x1                    // skip row 3
        ldcol.8         v3,  x3,  x1,  4                // v3.b[0..3] = left rows 4..7
        uaddl           v7.8h,  v2.8b,  v3.8b           // h[3] = top[7] + left[7]
        rev32           v0.8b,  v0.8b                   // top[2..-1], left rows 2..-1
        trn1            v2.2s,  v2.2s,  v3.2s           // v2 = top[4..7], left rows 4..7
        usubl           v2.8h,  v2.8b,  v0.8b           // the eight differences
        ld1             {v6.8h},  [x4]                  // weights 1..4, 1..4
        mul             v2.8h,  v2.8h,  v6.8h
        ld1             {v0.8h},  [x5]                  // weights 1..8
        saddlp          v2.4s,  v2.8h
        addp            v2.4s,  v2.4s,  v2.4s           // s[0] = H, s[1] = V
        shl             v3.4s,  v2.4s,  #4
        add             v2.4s,  v3.4s,  v2.4s           // 17 * H, 17 * V
        rshrn           v5.4h,  v2.4s,  #5              // h[0] = b, h[1] = c
        addp            v2.4h,  v5.4h,  v5.4h           // h[0] = b + c
        shl             v3.4h,  v2.4h,  #1
        add             v3.4h,  v3.4h,  v2.4h           // 3 * (b + c)
        rev64           v7.4h,  v7.4h                   // h[0] = top[7] + left[7]
        add             v7.4h,  v7.4h,  v0.4h           // lane 0: + 1 (first weight)
        shl             v2.4h,  v7.4h,  #4              // 16 * (top[7] + left[7] + 1)
        sub             v2.4h,  v2.4h,  v3.4h           // h[0] = base
        ext             v0.16b, v0.16b, v0.16b, #14     // weights rotated: 8,1,...,7
        mov             v0.h[0],  wzr                   // v0 = 0,1,...,7
        mul             v0.8h,  v0.8h,  v5.h[0]         // b * x for x = 0..7
        dup             v1.8h,  v2.h[0]
        dup             v2.8h,  v5.h[1]                 // per-row step c
        add             v1.8h,  v1.8h,  v0.8h           // accumulator for row 0
        mov             w3,  #8                         // row counter
2:
        sqshrun         v0.8b,  v1.8h,  #5
        add             v1.8h,  v1.8h,  v2.8h
        subs            w3,  w3,  #1
        st1             {v0.8b},  [x0], x1
        b.ne            2b
        ret
endfunc
241
// Fill an 8x8 block of bytes with the constant 128.
//   x0 = block pointer, x1 = row stride in bytes
// v0 is the fill for rows 0-3 and v1 for rows 4-7; the rows are written by
// the shared store loop at .L_pred8x8_dc_end.
function ff_pred8x8_128_dc_neon, export=1
        movi            v1.8b,  #0x80                   // rows 4-7
        movi            v0.8b,  #0x80                   // rows 0-3
        b               .L_pred8x8_dc_end
endfunc
247
// 8x8 DC prediction from the row above only (bytes).
//   x0 = block pointer, x1 = row stride in bytes
// Columns 0-3 of every row get (sum(top[0..3]) + 2) >> 2 and columns 4-7 get
// (sum(top[4..7]) + 2) >> 2.  v0 and v1 end up identical.
// Clobbers x2, v0-v2, then tail-branches to .L_pred8x8_dc_end.
function ff_pred8x8_top_dc_neon, export=1
        sub             x2,  x0,  x1                    // x2 -> row above the block
        ld1             {v0.8b},  [x2]
        uaddlp          v0.4h,  v0.8b                   // pairwise sums
        addp            v0.4h,  v0.4h,  v0.4h           // h[0] = left half, h[1] = right half
        zip1            v0.8h,  v0.8h,  v0.8h           // duplicate each sum
        rshrn           v2.8b,  v0.8h,  #2              // (sum + 2) >> 2
        zip1            v1.8b,  v2.8b,  v2.8b           // rows 4-7: 4x left dc, 4x right dc
        zip1            v0.8b,  v2.8b,  v2.8b           // rows 0-3: same pattern
        b               .L_pred8x8_dc_end
endfunc
259
// 8x8 DC prediction from the left column only (bytes).
//   x0 = block pointer, x1 = row stride in bytes
// Rows 0-3 are filled with (sum(left[0..3]) + 2) >> 2 and rows 4-7 with
// (sum(left[4..7]) + 2) >> 2.
// Clobbers x2, v0-v2, then tail-branches to .L_pred8x8_dc_end.
function ff_pred8x8_left_dc_neon, export=1
        sub             x2,  x0,  #1                    // x2 -> byte left of row 0
        ldcol.8         v0,  x2,  x1                    // 8 left bytes
        uaddlp          v0.4h,  v0.8b                   // pairwise sums
        addp            v0.4h,  v0.4h,  v0.4h           // h[0] = rows 0-3, h[1] = rows 4-7
        rshrn           v2.8b,  v0.8h,  #2              // (sum + 2) >> 2
        dup             v0.8b,  v2.b[0]                 // fill for rows 0-3
        dup             v1.8b,  v2.b[1]                 // fill for rows 4-7
        b               .L_pred8x8_dc_end
endfunc
270
// 8x8 DC prediction from both the row above and the left column (bytes).
//   x0 = block pointer, x1 = row stride in bytes
// The block is treated as four 4x4 quadrants:
//   top-left     : (sum(top[0..3]) + sum(left[0..3]) + 4) >> 3
//   top-right    : (sum(top[4..7]) + 2) >> 2
//   bottom-left  : (sum(left[4..7]) + 2) >> 2
//   bottom-right : (sum(top[4..7]) + sum(left[4..7]) + 4) >> 3
// Clobbers x2, x3, v0-v7 and the flags.
//
// .L_pred8x8_dc_end is the store loop shared by the other 8x8 8-bit DC entry
// points: it writes v0.8b to rows 0-3 and v1.8b to rows 4-7 of the block at
// x0, stepping by x1.
function ff_pred8x8_dc_neon, export=1
        sub             x3,  x0,  #1                    // x3 -> byte left of row 0
        sub             x2,  x0,  x1                    // x2 -> row above the block
        ld1             {v0.8b}, [x2]
        uaddlp          v0.4h,  v0.8b                   // pairwise sums of top
        ldcol.8         v1,  x3,  x1
        uaddlp          v1.4h,  v1.8b                   // pairwise sums of left
        trn1            v2.2s,  v0.2s,  v1.2s           // top[0..3] pairs, left[0..3] pairs
        trn2            v3.2s,  v0.2s,  v1.2s           // top[4..7] pairs, left[4..7] pairs
        addp            v4.4h,  v2.4h,  v3.4h           // t03, l03, t47, l47
        addp            v5.4h,  v4.4h,  v4.4h           // t03 + l03, t47 + l47
        rshrn           v6.8b,  v5.8h,  #3
        rshrn           v7.8b,  v4.8h,  #2
        dup             v0.8b,  v6.b[0]                 // top-left
        dup             v1.8b,  v7.b[3]                 // bottom-left
        dup             v2.8b,  v7.b[2]                 // top-right
        dup             v3.8b,  v6.b[1]                 // bottom-right
        zip1            v0.2s,  v0.2s,  v2.2s
        zip1            v1.2s,  v1.2s,  v3.2s
.L_pred8x8_dc_end:
        add             x2,  x0,  x1,  lsl #2           // x2 -> row 4
        mov             w3,  #4                         // 4 iterations, rows r and r + 4
1:      st1             {v0.8b},  [x0], x1
        st1             {v1.8b},  [x2], x1
        subs            w3,  w3,  #1
        b.ne            1b
        ret
endfunc
299
// 8x8 DC prediction (bytes) using the row above and only the upper four left
// bytes (rows 0-3).
//   x0 = block pointer, x1 = row stride in bytes
// Quadrant fills:
//   top-left     : (sum(top[0..3]) + sum(left[0..3]) + 4) >> 3
//   top-right    : (sum(top[4..7]) + 2) >> 2
//   bottom-left  : (sum(top[0..3]) + 2) >> 2
//   bottom-right : (sum(top[4..7]) + 2) >> 2
// Clobbers x2, x3, v0-v6, then tail-branches to .L_pred8x8_dc_end.
function ff_pred8x8_l0t_dc_neon, export=1
        sub             x3,  x0,  #1                    // x3 -> byte left of row 0
        sub             x2,  x0,  x1                    // x2 -> row above the block
        ld1             {v0.8b},  [x2]
        ldcol.8         v1,  x3,  x1,  4                // left rows 0-3 in lanes 0-3
        zip1            v0.4s,  v0.4s,  v1.4s           // top[0..3], left[0..3], top[4..7], unused
        uaddlp          v0.8h,  v0.16b
        addp            v0.8h,  v0.8h,  v0.8h           // h[0] = t03, h[1] = l03, h[2] = t47
        addp            v1.4h,  v0.4h,  v0.4h           // h[0] = t03 + l03
        rshrn           v2.8b,  v0.8h,  #2
        rshrn           v3.8b,  v1.8h,  #3
        dup             v4.8b,  v3.b[0]                 // top-left
        dup             v5.8b,  v2.b[0]                 // bottom-left
        dup             v6.8b,  v2.b[2]                 // right half, all rows
        zip1            v0.2s,  v4.2s,  v6.2s
        zip1            v1.2s,  v5.2s,  v6.2s
        b               .L_pred8x8_dc_end
endfunc
318
// 8x8 DC prediction (bytes) using only the upper four left bytes.
//   x0 = block pointer, x1 = row stride in bytes
// Rows 0-3 are filled with (sum(left[0..3]) + 2) >> 2, rows 4-7 with 128.
// Clobbers x2, v0, v1, then tail-branches to .L_pred8x8_dc_end.
function ff_pred8x8_l00_dc_neon, export=1
        movi            v1.8b,  #128                    // rows 4-7: constant fill
        sub             x2,  x0,  #1                    // x2 -> byte left of row 0
        ldcol.8         v0,  x2,  x1,  4                // left rows 0-3 in lanes 0-3
        uaddlp          v0.4h,  v0.8b
        addp            v0.4h,  v0.4h,  v0.4h           // h[0] = sum of the four bytes
        rshrn           v0.8b,  v0.8h,  #2              // (sum + 2) >> 2
        dup             v0.8b,  v0.b[0]                 // rows 0-3
        b               .L_pred8x8_dc_end
endfunc
329
// 8x8 DC prediction (bytes) using the row above and only the lower four left
// bytes (rows 4-7).
//   x0 = block pointer, x1 = row stride in bytes
// Quadrant fills:
//   top-left     : (sum(top[0..3]) + 2) >> 2
//   top-right    : (sum(top[4..7]) + 2) >> 2
//   bottom-left  : (sum(left[4..7]) + 2) >> 2
//   bottom-right : (sum(top[4..7]) + sum(left[4..7]) + 4) >> 3
// Lanes 0-3 of v1 are never loaded; every result derived from them is
// discarded.  Clobbers x2, x3, v0-v7, then tail-branches to
// .L_pred8x8_dc_end.
function ff_pred8x8_0lt_dc_neon, export=1
        sub             x2,  x0,  x1                    // x2 -> row above the block
        add             x3,  x0,  x1,  lsl #2           // x3 -> row 4
        sub             x3,  x3,  #1                    // x3 -> byte left of row 4
        ld1             {v0.8b},  [x2]
        ldcol.8         v1,  x3,  x1,  4,  hi=1         // left rows 4-7 in lanes 4-7
        zip1            v0.4s,  v0.4s,  v1.4s           // top[0..3], unused, top[4..7], left[4..7]
        uaddlp          v0.8h,  v0.16b
        addp            v0.8h,  v0.8h,  v0.8h           // h[0] = t03, h[2] = t47, h[3] = l47
        addp            v1.4h,  v0.4h,  v0.4h           // h[1] = t47 + l47
        rshrn           v2.8b,  v0.8h,  #2
        rshrn           v3.8b,  v1.8h,  #3
        dup             v4.8b,  v2.b[0]                 // top-left
        dup             v6.8b,  v2.b[2]                 // top-right
        dup             v5.8b,  v2.b[3]                 // bottom-left
        dup             v7.8b,  v3.b[1]                 // bottom-right
        zip1            v0.2s,  v4.2s,  v6.2s
        zip1            v1.2s,  v5.2s,  v7.2s
        b               .L_pred8x8_dc_end
endfunc
350
// 8x8 DC prediction (bytes) using only the lower four left bytes.
//   x0 = block pointer, x1 = row stride in bytes
// Rows 0-3 are filled with 128, rows 4-7 with (sum(left[4..7]) + 2) >> 2.
// Clobbers x2, v0-v2, then tail-branches to .L_pred8x8_dc_end.
function ff_pred8x8_0l0_dc_neon, export=1
        movi            v0.8b,  #128                    // rows 0-3: constant fill
        add             x2,  x0,  x1,  lsl #2           // x2 -> row 4
        sub             x2,  x2,  #1                    // x2 -> byte left of row 4
        ldcol.8         v1,  x2,  x1,  4                // left rows 4-7 in lanes 0-3
        uaddlp          v2.4h,  v1.8b
        addp            v2.4h,  v2.4h,  v2.4h           // h[0] = sum of the four bytes
        rshrn           v1.8b,  v2.8h,  #2              // (sum + 2) >> 2
        dup             v1.8b,  v1.b[0]                 // rows 4-7
        b               .L_pred8x8_dc_end
endfunc
362
// ldcol.16  rd, rs, rt, n=4, hi=0
// Gather a column of 16-bit values into halfword lanes of vector rd.  Every
// load reads one halfword from [rs] and then post-increments rs by rt.
//   hi=0, n >= 4 : lanes 0-3 are loaded
//   n == 8       : lanes 4-7 are loaded (after lanes 0-3 when hi=0)
//   hi=1         : lanes 4-7 are loaded and lanes 0-3 are skipped
// Lanes that are not loaded are left unchanged (single-lane ld1).
.macro ldcol.16  rd,  rs,  rt,  n=4,  hi=0
.if \n >= 4 && \hi == 0
.irp    lane, 0, 1, 2, 3
        ld1             {\rd\().h}[\lane],  [\rs], \rt
.endr
.endif
.if \n == 8 || \hi == 1
.irp    lane, 4, 5, 6, 7
        ld1             {\rd\().h}[\lane],  [\rs], \rt
.endr
.endif
.endm
377
378// slower than C
379/*
380function ff_pred16x16_128_dc_neon_10, export=1
381        movi            v0.8h, #2, lsl #8 // 512, 1 << (bit_depth - 1)
382
383        b               .L_pred16x16_dc_10_end
384endfunc
385*/
386
// Fill a 16x16 block of 16-bit samples with the rounded mean of the 16
// samples in the row directly above it: (sum + 8) >> 4.
//   x0 = block pointer, x1 = row stride in bytes
// Clobbers x2, v0, v1, then tail-branches to .L_pred16x16_dc_10_end with the
// fill value in v0.8h.
function ff_pred16x16_top_dc_neon_10, export=1
        sub             x2,  x0,  x1                    // x2 -> row above the block
        ld1             {v0.8h, v1.8h}, [x2]            // 16 samples
        add             v0.8h,  v1.8h,  v0.8h
        addv            h0,  v0.8h                      // h0 = sum of all 16 samples
        urshr           v0.4h,  v0.4h,  #4              // (sum + 8) >> 4
        dup             v0.8h,  v0.h[0]
        b               .L_pred16x16_dc_10_end
endfunc
399
400// slower than C
401/*
402function ff_pred16x16_left_dc_neon_10, export=1
403        sub             x2,  x0,  #2 // access to the "left" column
404        ldcol.16        v0,  x2,  x1,  8
405        ldcol.16        v1,  x2,  x1,  8 // load "left" column
406
407        add             v0.8h, v0.8h, v1.8h
408        addv            h0,  v0.8h
409
410        urshr           v0.4h,  v0.4h,  #4
411        dup             v0.8h, v0.h[0]
412        b               .L_pred16x16_dc_10_end
413endfunc
414*/
415
// Fill a 16x16 block of 16-bit samples with the rounded mean of the 16
// samples above it and the 16 samples to its left:
// (sum_top + sum_left + 16) >> 5.
//   x0 = block pointer, x1 = row stride in bytes
// Clobbers x2, x3, v0-v3 and the flags.
//
// .L_pred16x16_dc_10_end is the store loop shared with the other 16x16
// 10-bit DC entry point: it expects the fill value in v0.8h, copies it to v1
// and writes 16 rows of 16 samples starting at x0, stepping by x1.
function ff_pred16x16_dc_neon_10, export=1
        sub             x3,  x0,  #2                    // x3 -> sample left of row 0
        sub             x2,  x0,  x1                    // x2 -> row above the block
        ld1             {v0.8h, v1.8h}, [x2]            // top row
        add             v0.8h,  v0.8h,  v1.8h
        ldcol.16        v2,  x3,  x1,  8                // left rows 0-7
        ldcol.16        v3,  x3,  x1,  8                // left rows 8-15
        add             v2.8h,  v2.8h,  v3.8h
        add             v0.8h,  v0.8h,  v2.8h
        addv            h0,  v0.8h                      // h0 = sum of all 32 samples
        urshr           v0.4h,  v0.4h,  #5              // (sum + 16) >> 5
        dup             v0.8h,  v0.h[0]
.L_pred16x16_dc_10_end:
        mov             w3,  #8                         // 8 iterations, two rows each
        mov             v1.16b,  v0.16b                 // second half of each row
1:      subs            w3,  w3,  #1
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x0], x1
        b.ne            1b
        ret
endfunc
440
// Horizontal prediction for a 16x16 block of 16-bit samples: every row is
// filled with the sample immediately to the left of that row.
//   x0 = block pointer, x1 = row stride in bytes
// Each row is written as two 8-sample halves through x0 and x3 (x0 + 16
// bytes).  Clobbers x2, x3, w4, v0 and the flags.
function ff_pred16x16_hor_neon_10, export=1
        mov             w4,  #16                        // row counter
        add             x3,  x0,  #16                   // x3 -> second half of row 0
        sub             x2,  x0,  #2                    // x2 -> sample left of row 0
2:      ld1r            {v0.8h},  [x2],  x1             // replicate the left sample
        st1             {v0.8h},  [x0],  x1
        st1             {v0.8h},  [x3],  x1
        subs            w4,  w4,  #1
        b.ne            2b
        ret
endfunc
453
// Vertical prediction for a 16x16 block of 16-bit samples: the 16 samples
// above the block are copied into all 16 rows.
//   x0 = block pointer, x1 = row stride in bytes (doubled internally)
// x0 writes the even rows and x2 the odd rows, each stepping by two rows.
// Clobbers x1, x2, w3, v0, v1 and the flags.
function ff_pred16x16_vert_neon_10, export=1
        mov             w3,  #8                         // 8 iterations, two rows each
        sub             x2,  x0,  x1                    // x2 -> row above the block
        lsl             x1,  x1,  #1                    // x1 = 2 * stride
        ld1             {v0.8h, v1.8h},  [x2],  x1      // load top row; x2 -> row 1
2:      st1             {v0.8h, v1.8h},  [x0],  x1
        st1             {v0.8h, v1.8h},  [x2],  x1
        subs            w3,  w3,  #1
        b.ne            2b
        ret
endfunc
468
// Plane (gradient) prediction for a 16x16 block of 16-bit samples.
//   x0 = block pointer, x1 = row stride in bytes
// Same scheme as the 8-bit 16x16 plane function, with top[] / left[] holding
// 16-bit samples (index -1 is the shared corner):
//   H = sum over i=0..7 of (i + 1) * (top[8 + i]  - top[6 - i])
//   V = sum over i=0..7 of (i + 1) * (left[8 + i] - left[6 - i])
//   b = (5 * H + 32) >> 6,  c = (5 * V + 32) >> 6
//   base = 16 * (top[15] + left[15] + 1) - 7 * (b + c)
// The per-pixel value base + b * x + c * y is accumulated in 32-bit lanes
// (v16 for x = 0..3 / 8..11, v17 for x = 4..7 / 12..15), shifted right by 5
// with unsigned saturation and then limited to 1023 with smin.
// Clobbers x2-x4, v0-v7, v16, v17 and the flags.
function ff_pred16x16_plane_neon_10, export=1
        movrel          x4,  p16weight
        sub             x3,  x0,  x1                    // x3 -> row above the block
        add             x2,  x3,  #16                   // x2 -> top[8]
        sub             x3,  x3,  #2                    // x3 -> top[-1] (corner)
        ld1             {v0.8h},  [x3]                  // v0 = top[-1..6]
        ld1             {v2.8h},  [x2], x1              // v2 = top[8..15]
        ldcol.16        v1,  x3,  x1, 8                 // v1 = left rows -1..6
        add             x3,  x3,  x1                    // skip row 7
        ldcol.16        v3,  x3,  x1, 8                 // v3 = left rows 8..15

        // Full 8-lane reversal: swap within each half, then swap the halves.
        rev64           v16.8h,  v0.8h
        rev64           v17.8h,  v1.8h
        ext             v0.16b, v16.16b, v16.16b, #8    // v0 = top[6..-1]
        ext             v1.16b, v17.16b, v17.16b, #8    // v1 = left rows 6..-1

        add             v7.8h,  v2.8h,  v3.8h           // h[7] = top[15] + left[15]
        sub             v2.8h,  v2.8h,  v0.8h           // top[8 + i] - top[6 - i]
        sub             v3.8h,  v3.8h,  v1.8h           // left[8 + i] - left[6 - i]
        ld1             {v0.8h},  [x4]                  // v0 = weights 1..8
        mul             v2.8h,  v2.8h,  v0.8h
        mul             v3.8h,  v3.8h,  v0.8h
        addp            v2.8h,  v2.8h,  v3.8h
        addp            v2.8h,  v2.8h,  v2.8h
        addp            v2.4h,  v2.4h,  v2.4h           // h[0] = H, h[1] = V
        sshll           v3.4s,  v2.4h,  #2
        saddw           v2.4s,  v3.4s,  v2.4h           // 5 * H, 5 * V
        rshrn           v4.4h,  v2.4s,  #6              // h[0] = b, h[1] = c
        trn2            v5.4h,  v4.4h,  v4.4h           // v5.h[0] = c
        add             v2.4h,  v4.4h,  v5.4h           // h[0] = b + c
        shl             v3.4h,  v2.4h,  #3
        ext             v7.16b, v7.16b, v7.16b, #14     // h[0] = top[15] + left[15]
        sub             v3.4h,  v3.4h,  v2.4h           // 7 * (b + c)
        add             v7.4h,  v7.4h,  v0.4h           // lane 0: + 1 (first weight)
        shl             v2.4h,  v7.4h,  #4              // 16 * (top[15] + left[15] + 1)
        ssubl           v2.4s,  v2.4h,  v3.4h           // s[0] = base, widened to 32 bit
        shl             v3.4h,  v4.4h,  #4              // h[0] = 16 * b
        ext             v0.16b, v0.16b, v0.16b, #14     // weights rotated: 8,1,...,7
        ssubl           v6.4s,  v5.4h,  v3.4h           // s[0] = c - 16 * b

        mov             v0.h[0],  wzr                   // v0 = 0,1,...,7
        mul             v0.8h,  v0.8h,  v4.h[0]         // b * x for x = 0..7
        dup             v16.4s, v2.s[0]
        dup             v17.4s, v2.s[0]
        dup             v2.8h,  v4.h[0]
        dup             v3.4s,  v6.s[0]
        shl             v2.8h,  v2.8h,  #3              // 8 * b: step to columns 8..15
        saddw           v16.4s, v16.4s, v0.4h           // accumulators for x = 0..3
        saddw2          v17.4s, v17.4s, v0.8h           // accumulators for x = 4..7
        saddw           v3.4s,  v3.4s,  v2.4h           // c - 8 * b: step to next row

        mov             w3,  #16                        // row counter
        mvni            v4.8h,  #0xFC, lsl #8           // 0x03FF = 1023 for clipping
2:
        sqshrun         v0.4h,  v16.4s, #5              // columns 0..3
        sqshrun2        v0.8h,  v17.4s, #5              // columns 4..7
        smin            v0.8h,  v0.8h,  v4.8h
        saddw           v16.4s, v16.4s, v2.4h
        saddw           v17.4s, v17.4s, v2.4h
        sqshrun         v1.4h,  v16.4s, #5              // columns 8..11
        sqshrun2        v1.8h,  v17.4s, #5              // columns 12..15
        smin            v1.8h,  v1.8h,  v4.8h
        add             v16.4s, v16.4s, v3.4s
        add             v17.4s, v17.4s, v3.4s
        subs            w3,  w3,  #1
        st1             {v0.8h, v1.8h}, [x0], x1
        b.ne            2b
        ret
endfunc
541
// Horizontal prediction for an 8x8 block of 16-bit samples: every row is
// filled with the sample immediately to the left of that row.
//   x0 = block pointer, x1 = row stride in bytes
// Clobbers x2, w3, v0 and the flags.
function ff_pred8x8_hor_neon_10, export=1
        mov             w3,  #8                         // row counter
        sub             x2,  x0,  #2                    // x2 -> sample left of row 0
2:      ld1r            {v0.8h},  [x2], x1              // replicate the left sample
        st1             {v0.8h},  [x0], x1
        subs            w3,  w3,  #1
        b.ne            2b
        ret
endfunc
552
// Vertical prediction for an 8x8 block of 16-bit samples: the 8 samples above
// the block are copied into all 8 rows.
//   x0 = block pointer, x1 = row stride in bytes (doubled internally)
// x0 writes the even rows and x2 the odd rows, each stepping by two rows.
// Clobbers x1, x2, w3, v0 and the flags.
function ff_pred8x8_vert_neon_10, export=1
        mov             w3,  #4                         // 4 iterations, two rows each
        sub             x2,  x0,  x1                    // x2 -> row above the block
        add             x1,  x1,  x1                    // x1 = 2 * stride
        ld1             {v0.8h},  [x2], x1              // load top row; x2 -> row 1
2:      st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x2], x1
        subs            w3,  w3,  #1
        b.ne            2b
        ret
endfunc
565
// Plane (gradient) prediction for an 8x8 block of 16-bit samples.
//   x0 = block pointer, x1 = row stride in bytes
// With top[] the row above the block and left[] the column to its left
// (index -1 is the shared corner sample):
//   H = sum over i=0..3 of (i + 1) * (top[4 + i]  - top[2 - i])
//   V = sum over i=0..3 of (i + 1) * (left[4 + i] - left[2 - i])
//   b = (17 * H + 16) >> 5,  c = (17 * V + 16) >> 5
//   base = 16 * (top[7] + left[7] + 1) - 3 * (b + c)
// The per-pixel value base + b * x + c * y is accumulated in 32-bit lanes
// (v1 for x = 0..3, v2 for x = 4..7), shifted right by 5 with unsigned
// saturation and then limited to 1023 with smin.
// Clobbers x2-x5, v0-v7 and the flags.
function ff_pred8x8_plane_neon_10, export=1
        movrel          x4,  p8weight
        movrel          x5,  p16weight
        sub             x3,  x0,  x1                    // x3 -> row above the block
        add             x2,  x3,  #8                    // x2 -> top[4]
        sub             x3,  x3,  #2                    // x3 -> top[-1] (corner)
        ld1             {v0.d}[0],  [x3]                // v0.h[0..3] = top[-1..2]
        ld1             {v2.d}[0],  [x2], x1            // v2.h[0..3] = top[4..7]
        ldcol.16        v0,  x3,  x1,  hi=1             // v0.h[4..7] = left rows -1..2
        add             x3,  x3,  x1                    // skip row 3
        ldcol.16        v3,  x3,  x1,  4                // v3.h[0..3] = left rows 4..7
        add             v7.8h,  v2.8h,  v3.8h           // h[3] = top[7] + left[7]
        rev64           v0.8h,  v0.8h                   // top[2..-1], left rows 2..-1
        trn1            v2.2d,  v2.2d,  v3.2d           // v2 = top[4..7], left rows 4..7
        sub             v2.8h,  v2.8h,  v0.8h           // the eight differences
        ld1             {v6.8h},  [x4]                  // weights 1..4, 1..4
        mul             v2.8h,  v2.8h,  v6.8h
        ld1             {v0.8h},  [x5]                  // weights 1..8
        saddlp          v2.4s,  v2.8h
        addp            v2.4s,  v2.4s,  v2.4s           // s[0] = H, s[1] = V
        shl             v3.4s,  v2.4s,  #4
        add             v2.4s,  v3.4s,  v2.4s           // 17 * H, 17 * V
        rshrn           v5.4h,  v2.4s,  #5              // h[0] = b, h[1] = c
        addp            v2.4h,  v5.4h,  v5.4h           // h[0] = b + c
        shl             v3.4h,  v2.4h,  #1
        add             v3.4h,  v3.4h,  v2.4h           // 3 * (b + c)
        rev64           v7.4h,  v7.4h                   // h[0] = top[7] + left[7]
        add             v7.4h,  v7.4h,  v0.4h           // lane 0: + 1 (first weight)
        shl             v2.4h,  v7.4h,  #4              // 16 * (top[7] + left[7] + 1)
        ssubl           v2.4s,  v2.4h,  v3.4h           // s[0] = base, widened to 32 bit
        ext             v0.16b, v0.16b, v0.16b, #14     // weights rotated: 8,1,...,7
        mov             v0.h[0],  wzr                   // v0 = 0,1,...,7
        mul             v0.8h,  v0.8h,  v5.h[0]         // b * x for x = 0..7
        dup             v1.4s,  v2.s[0]
        dup             v2.4s,  v2.s[0]
        dup             v3.8h,  v5.h[1]                 // per-row step c
        saddw           v1.4s,  v1.4s,  v0.4h           // accumulators for x = 0..3
        saddw2          v2.4s,  v2.4s,  v0.8h           // accumulators for x = 4..7
        mov             w3,  #8                         // row counter
        mvni            v4.8h,  #0xFC,  lsl #8          // 0x03FF = 1023 for clipping
2:
        sqshrun         v0.4h,  v1.4s,  #5
        sqshrun2        v0.8h,  v2.4s,  #5
        smin            v0.8h,  v0.8h,  v4.8h
        saddw           v1.4s,  v1.4s,  v3.4h
        saddw           v2.4s,  v2.4s,  v3.4h
        subs            w3,  w3,  #1
        st1             {v0.8h},  [x0],  x1
        b.ne            2b
        ret
endfunc
621
// Fill an 8x8 block of 16-bit samples with the constant 512, i.e.
// 1 << (bit_depth - 1) for 10-bit samples.
//   x0 = block pointer, x1 = row stride in bytes
// v0 is the fill for rows 0-3 and v1 for rows 4-7; the rows are written by
// the shared store loop at .L_pred8x8_dc_10_end.
function ff_pred8x8_128_dc_neon_10, export=1
        movi            v1.8h,  #2, lsl #8              // rows 4-7: 512
        movi            v0.8h,  #2, lsl #8              // rows 0-3: 512
        b               .L_pred8x8_dc_10_end
endfunc
627
// 8x8 DC prediction from the row above only (16-bit samples).
//   x0 = block pointer, x1 = row stride in bytes
// Columns 0-3 of every row get (sum(top[0..3]) + 2) >> 2 and columns 4-7 get
// (sum(top[4..7]) + 2) >> 2.  v0 and v1 end up identical.
// Clobbers x2, v0-v2, then tail-branches to .L_pred8x8_dc_10_end.
function ff_pred8x8_top_dc_neon_10, export=1
        sub             x2,  x0,  x1                    // x2 -> row above the block
        ld1             {v0.8h},  [x2]
        addp            v0.8h,  v0.8h,  v0.8h           // pairwise sums
        addp            v0.4h,  v0.4h,  v0.4h           // h[0] = left half, h[1] = right half
        zip1            v0.4h,  v0.4h,  v0.4h           // duplicate each sum
        urshr           v2.4h,  v0.4h,  #2              // (sum + 2) >> 2
        zip1            v1.8h,  v2.8h,  v2.8h           // rows 4-7: 4x left dc, 4x right dc
        zip1            v0.8h,  v2.8h,  v2.8h           // rows 0-3: same pattern
        b               .L_pred8x8_dc_10_end
endfunc
640
// 8x8 DC prediction from the left column only (16-bit samples).
//   x0 = block pointer, x1 = row stride in bytes
// Rows 0-3 are filled with (sum(left[0..3]) + 2) >> 2 and rows 4-7 with
// (sum(left[4..7]) + 2) >> 2.
// Clobbers x2, v0-v2, then tail-branches to .L_pred8x8_dc_10_end.
function ff_pred8x8_left_dc_neon_10, export=1
        sub             x2,  x0,  #2                    // x2 -> sample left of row 0
        ldcol.16        v0,  x2,  x1,  8                // 8 left samples
        addp            v0.8h,  v0.8h,  v0.8h           // pairwise sums
        addp            v0.4h,  v0.4h,  v0.4h           // h[0] = rows 0-3, h[1] = rows 4-7
        urshr           v2.4h,  v0.4h,  #2              // (sum + 2) >> 2
        dup             v0.8h,  v2.h[0]                 // fill for rows 0-3
        dup             v1.8h,  v2.h[1]                 // fill for rows 4-7
        b               .L_pred8x8_dc_10_end
endfunc
652
// 8x8 DC prediction from both the row above and the left column (16-bit
// samples).
//   x0 = block pointer, x1 = row stride in bytes
// The block is treated as four 4x4 quadrants:
//   top-left     : (sum(top[0..3]) + sum(left[0..3]) + 4) >> 3
//   top-right    : (sum(top[4..7]) + 2) >> 2
//   bottom-left  : (sum(left[4..7]) + 2) >> 2
//   bottom-right : (sum(top[4..7]) + sum(left[4..7]) + 4) >> 3
// Clobbers x2, x3, v0-v7 and the flags.
//
// .L_pred8x8_dc_10_end is the store loop shared by the other 8x8 10-bit DC
// entry points: it writes v0.8h to rows 0-3 and v1.8h to rows 4-7 of the
// block at x0, stepping by x1.
function ff_pred8x8_dc_neon_10, export=1
        sub             x3,  x0,  #2                    // x3 -> sample left of row 0
        sub             x2,  x0,  x1                    // x2 -> row above the block
        ld1             {v0.8h}, [x2]
        addp            v0.8h,  v0.8h,  v0.8h           // pairwise sums of top
        ldcol.16        v1,  x3,  x1,  8
        addp            v1.8h,  v1.8h,  v1.8h           // pairwise sums of left
        trn1            v2.2s,  v0.2s,  v1.2s           // top[0..3] pairs, left[0..3] pairs
        trn2            v3.2s,  v0.2s,  v1.2s           // top[4..7] pairs, left[4..7] pairs
        addp            v4.4h,  v2.4h,  v3.4h           // t03, l03, t47, l47
        addp            v5.4h,  v4.4h,  v4.4h           // t03 + l03, t47 + l47
        urshr           v6.4h,  v5.4h,  #3
        urshr           v7.4h,  v4.4h,  #2
        dup             v0.8h,  v6.h[0]                 // top-left
        dup             v1.8h,  v7.h[3]                 // bottom-left
        dup             v2.8h,  v7.h[2]                 // top-right
        dup             v3.8h,  v6.h[1]                 // bottom-right
        zip1            v0.2d,  v0.2d,  v2.2d
        zip1            v1.2d,  v1.2d,  v3.2d
.L_pred8x8_dc_10_end:
        add             x2,  x0,  x1,  lsl #2           // x2 -> row 4
        mov             w3,  #4                         // 4 iterations, rows r and r + 4
1:      subs            w3,  w3,  #1
        st1             {v0.8h},  [x0], x1
        st1             {v1.8h},  [x2], x1
        b.ne            1b
        ret
endfunc
684
// 8x8 DC prediction (16-bit samples) using the row above and only the upper
// four left samples (rows 0-3).
//   x0 = block pointer, x1 = row stride in bytes
// Quadrant fills:
//   top-left     : (sum(top[0..3]) + sum(left[0..3]) + 4) >> 3
//   top-right    : (sum(top[4..7]) + 2) >> 2
//   bottom-left  : (sum(top[0..3]) + 2) >> 2
//   bottom-right : (sum(top[4..7]) + 2) >> 2
// Clobbers x2, x3, v0-v6, then tail-branches to .L_pred8x8_dc_10_end.
function ff_pred8x8_l0t_dc_neon_10, export=1
        sub             x3,  x0,  #2                    // x3 -> sample left of row 0
        sub             x2,  x0,  x1                    // x2 -> row above the block
        ld1             {v0.8h},  [x2]
        ldcol.16        v1,  x3,  x1,  4                // left rows 0-3 in lanes 0-3

        addp            v0.8h,  v0.8h,  v0.8h
        addp            v1.4h,  v1.4h,  v1.4h
        addp            v0.4h,  v0.4h,  v0.4h           // h[0] = t03, h[1] = t47
        addp            v1.4h,  v1.4h,  v1.4h           // h[0] = l03
        add             v1.4h,  v1.4h,  v0.4h           // h[0] = t03 + l03

        urshr           v2.4h,  v0.4h,  #2
        urshr           v3.4h,  v1.4h,  #3              // the top-left 4x4 quadrant

        dup             v6.4h,  v2.h[1]                 // right half, all rows
        dup             v5.4h,  v2.h[0]                 // bottom-left
        dup             v4.4h,  v3.h[0]                 // top-left

        zip1            v0.2d,  v4.2d,  v6.2d
        zip1            v1.2d,  v5.2d,  v6.2d
        b               .L_pred8x8_dc_10_end
endfunc
709
// 8x8 DC prediction (16-bit samples) using only the upper four left samples.
//   x0 = block pointer, x1 = row stride in bytes
// Rows 0-3 are filled with (sum(left[0..3]) + 2) >> 2, rows 4-7 with 512.
// Clobbers x2, v0, v1, then tail-branches to .L_pred8x8_dc_10_end.
function ff_pred8x8_l00_dc_neon_10, export=1
        movi            v1.8h,  #2, lsl #8              // rows 4-7: 512
        sub             x2,  x0,  #2                    // x2 -> sample left of row 0
        ldcol.16        v0,  x2,  x1,  4                // left rows 0-3 in lanes 0-3
        addp            v0.4h,  v0.4h,  v0.4h
        addp            v0.4h,  v0.4h,  v0.4h           // h[0] = sum of the four samples
        urshr           v0.4h,  v0.4h,  #2              // (sum + 2) >> 2
        dup             v0.8h,  v0.h[0]                 // rows 0-3
        b               .L_pred8x8_dc_10_end
endfunc
723
// 8x8 DC prediction (16-bit samples) using the row above and only the lower
// four left samples (rows 4-7).
//   x0 = block pointer, x1 = row stride in bytes
// Quadrant fills:
//   top-left     : (sum(top[0..3]) + 2) >> 2
//   top-right    : (sum(top[4..7]) + 2) >> 2
//   bottom-left  : (sum(left[4..7]) + 2) >> 2
//   bottom-right : (sum(top[4..7]) + sum(left[4..7]) + 4) >> 3
// Lanes 0-3 of v1 are never loaded; every result derived from them is
// discarded.  Clobbers x2, x3, v0-v7, then tail-branches to
// .L_pred8x8_dc_10_end.
function ff_pred8x8_0lt_dc_neon_10, export=1
        sub             x2,  x0,  x1                    // x2 -> row above the block
        add             x3,  x0,  x1,  lsl #2           // x3 -> row 4
        sub             x3,  x3,  #2                    // x3 -> sample left of row 4
        ld1             {v0.8h},  [x2]
        ldcol.16        v1,  x3,  x1,  hi=1             // left rows 4-7 in lanes 4-7

        addp            v0.8h,  v0.8h,  v0.8h
        addp            v1.8h,  v1.8h,  v1.8h
        addp            v0.4h,  v0.4h,  v0.4h           // h[0] = t03, h[1] = t47
        addp            v1.4h,  v1.4h,  v1.4h           // h[1] = l47
        zip1            v0.2s,  v0.2s,  v1.2s           // t03, t47, unused, l47
        add             v1.4h,  v0.4h,  v1.4h           // h[1] = t47 + l47

        urshr           v2.4h,  v0.4h,  #2
        urshr           v3.4h,  v1.4h,  #3

        dup             v4.4h,  v2.h[0]                 // top-left
        dup             v6.4h,  v2.h[1]                 // top-right
        dup             v5.4h,  v2.h[3]                 // bottom-left
        dup             v7.4h,  v3.h[1]                 // bottom-right

        zip1            v0.2d,  v4.2d,  v6.2d
        zip1            v1.2d,  v5.2d,  v7.2d
        b               .L_pred8x8_dc_10_end
endfunc
751
// 8x8 DC prediction (16-bit samples) using only the lower four left samples.
//   x0 = block pointer, x1 = row stride in bytes
// Rows 0-3 are filled with 512, rows 4-7 with (sum(left[4..7]) + 2) >> 2.
// Clobbers x2, v0-v2, then tail-branches to .L_pred8x8_dc_10_end.
function ff_pred8x8_0l0_dc_neon_10, export=1
        movi            v0.8h,  #2,  lsl #8             // rows 0-3: 512
        add             x2,  x0,  x1,  lsl #2           // x2 -> row 4
        sub             x2,  x2,  #2                    // x2 -> sample left of row 4
        ldcol.16        v1,  x2,  x1,  4                // left rows 4-7 in lanes 0-3
        addp            v2.8h,  v1.8h,  v1.8h
        addp            v2.4h,  v2.4h,  v2.4h           // h[0] = sum of the four samples
        urshr           v1.4h,  v2.4h,  #2              // (sum + 2) >> 2
        dup             v1.8h,  v1.h[0]                 // rows 4-7
        b               .L_pred8x8_dc_10_end
endfunc
766