1 /*
2  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3  * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 #include "libavutil/aarch64/asm.S"
23 #include "neon.S"
24 
25         /* H.264 qpel MC */
26 
// Load the two filter multipliers used by every lowpass_* macro into v6:
// afterwards v6.h[0] = 5 and v6.h[1] = 20. The other lanes of v6 keep their
// previous contents.
//   \r   32-bit general purpose scratch register, overwritten with 0x00140005
.macro  lowpass_const   r
        mov             \r, #5                  // bits 15:0  -> v6.h[0]
        movk            \r, #20, lsl #16        // bits 31:16 -> v6.h[1]
        mov             v6.s[0], \r
.endm
32 
// Horizontal 6-tap low-pass filter for two rows of 8 pixels.
//   \r0, \r1   first row: 16 consecutive source bytes (first 8, next 8)
//   \r2, \r3   second row, same layout
//   \d0, \d1   result registers for the first and the second row
//   \narrow    1 (default): round, shift right by 5 and saturate to unsigned
//              bytes. 0: leave the 16-bit sums in \d0 and \d1.
// Every output x is
//   src[x] + src[x+5] + 20 * (src[x+2] + src[x+3]) - 5 * (src[x+1] + src[x+4])
// Expects v6.h[0] = 5 and v6.h[1] = 20, as set up by lowpass_const.
// \d0 is first written after the last ext on \r0/\r1 and \d1 after the last
// ext on \r2/\r3, which is what lets the callers in this file reuse a source
// register as a destination.
// Overwrites v0-v5.
.macro  lowpass_8       r0,  r1,  r2,  r3,  d0,  d1,  narrow=1
        ext             v2.8b,      \r0\().8b, \r1\().8b, #2
        ext             v3.8b,      \r0\().8b, \r1\().8b, #3
        uaddl           v2.8h,      v2.8b,     v3.8b    // row 0: taps x+2 and x+3
        ext             v4.8b,      \r0\().8b, \r1\().8b, #1
        ext             v5.8b,      \r0\().8b, \r1\().8b, #4
        uaddl           v4.8h,      v4.8b,     v5.8b    // row 0: taps x+1 and x+4
        ext             v1.8b,      \r0\().8b, \r1\().8b, #5
        uaddl           \d0\().8h,  \r0\().8b, v1.8b    // row 0: taps x and x+5
        ext             v0.8b,      \r2\().8b, \r3\().8b, #2
        mla             \d0\().8h,  v2.8h,     v6.h[1]  // row 0: plus 20 * inner pair
        ext             v1.8b,      \r2\().8b, \r3\().8b, #3
        uaddl           v0.8h,      v0.8b,     v1.8b    // row 1: taps x+2 and x+3
        ext             v1.8b,      \r2\().8b, \r3\().8b, #1
        mls             \d0\().8h,  v4.8h,     v6.h[0]  // row 0: minus 5 * middle pair
        ext             v3.8b,      \r2\().8b, \r3\().8b, #4
        uaddl           v1.8h,      v1.8b,     v3.8b    // row 1: taps x+1 and x+4
        ext             v2.8b,      \r2\().8b, \r3\().8b, #5
        uaddl           \d1\().8h,  \r2\().8b, v2.8b    // row 1: taps x and x+5
        mla             \d1\().8h,  v0.8h,     v6.h[1]
        mls             \d1\().8h,  v1.8h,     v6.h[0]
  .if \narrow
        sqrshrun        \d0\().8b,  \d0\().8h, #5
        sqrshrun        \d1\().8b,  \d1\().8h, #5
  .endif
.endm
60 
// Vertical 6-tap low-pass filter producing two output rows of 8 pixels.
//   \r0 .. \r6   seven consecutive source rows, 8 bytes each
//   \d0          row filtered from \r0 .. \r5
//   \d1          row filtered from \r1 .. \r6
//   \narrow      1 (default): round, shift right by 5 and saturate to
//                unsigned bytes. 0: keep the 16-bit sums.
// Per row: outer pair + 20 * inner pair - 5 * middle pair, with the
// multipliers taken from v6.h[1] (20) and v6.h[0] (5), see lowpass_const.
// All pair sums that read \r1 .. \r5 are formed before \d0 or \d1 is written,
// so the callers in this file can pass \d0 = \r0 and \d1 = \r1.
// Overwrites v0, v1, v2 and v4.
.macro  lowpass_8_v     r0,  r1,  r2,  r3,  r4,  r5,  r6,  d0,  d1,  narrow=1
        uaddl           v2.8h,      \r2\().8b, \r3\().8b        // inner pair, first row
        uaddl           v0.8h,      \r3\().8b, \r4\().8b        // inner pair, second row
        uaddl           v4.8h,      \r1\().8b, \r4\().8b        // middle pair, first row
        uaddl           v1.8h,      \r2\().8b, \r5\().8b        // middle pair, second row
        uaddl           \d0\().8h,  \r0\().8b, \r5\().8b        // outer pair, first row
        uaddl           \d1\().8h,  \r1\().8b, \r6\().8b        // outer pair, second row
        mla             \d0\().8h,  v2.8h,     v6.h[1]
        mls             \d0\().8h,  v4.8h,     v6.h[0]
        mla             \d1\().8h,  v0.8h,     v6.h[1]
        mls             \d1\().8h,  v1.8h,     v6.h[0]
  .if \narrow
        sqrshrun        \d0\().8b,  \d0\().8h, #5
        sqrshrun        \d1\().8b,  \d1\().8h, #5
  .endif
.endm
78 
// In-place horizontal 6-tap filter on two rows, keeping 16-bit results.
//   \r0, \r1   each holds 16 source bytes of one row on entry and eight
//              unrounded 16-bit filter sums on exit
// Same tap weights as lowpass_8 (v6.h[0] = 5, v6.h[1] = 20), but nothing is
// rounded or narrowed here. The shifted copies of a row are produced by
// rotating the register with ext, and only their low 8 bytes are used.
// Overwrites v0-v5, v7, v30 and v31.
.macro  lowpass_8H      r0,  r1
        ext             v0.16b,     \r0\().16b, \r0\().16b, #2
        ext             v1.16b,     \r0\().16b, \r0\().16b, #3
        uaddl           v0.8h,      v0.8b,      v1.8b   // first row: taps x+2 and x+3
        ext             v2.16b,     \r0\().16b, \r0\().16b, #1
        ext             v3.16b,     \r0\().16b, \r0\().16b, #4
        uaddl           v2.8h,      v2.8b,      v3.8b   // first row: taps x+1 and x+4
        ext             v30.16b,    \r0\().16b, \r0\().16b, #5
        uaddl           \r0\().8h,  \r0\().8b,  v30.8b  // first row: taps x and x+5
        ext             v4.16b,     \r1\().16b, \r1\().16b, #2
        mla             \r0\().8h,  v0.8h,      v6.h[1]
        ext             v5.16b,     \r1\().16b, \r1\().16b, #3
        uaddl           v4.8h,      v4.8b,      v5.8b   // second row: taps x+2 and x+3
        ext             v7.16b,     \r1\().16b, \r1\().16b, #1
        mls             \r0\().8h,  v2.8h,      v6.h[0]
        ext             v0.16b,     \r1\().16b, \r1\().16b, #4
        uaddl           v7.8h,      v7.8b,      v0.8b   // second row: taps x+1 and x+4
        ext             v31.16b,    \r1\().16b, \r1\().16b, #5
        uaddl           \r1\().8h,  \r1\().8b,  v31.8b  // second row: taps x and x+5
        mla             \r1\().8h,  v4.8h,      v6.h[1]
        mls             \r1\().8h,  v7.8h,      v6.h[0]
.endm
102 
// Single-row version of lowpass_8: horizontal 6-tap filter for 8 pixels.
//   \r0, \r1   16 consecutive source bytes (first 8, next 8)
//   \d0        result register
//   \narrow    1 (default): round, shift right by 5 and saturate to unsigned
//              bytes. 0: keep the 16-bit sums.
// Expects v6.h[0] = 5 and v6.h[1] = 20 (see lowpass_const).
// Overwrites v2-v5 and v30.
.macro  lowpass_8_1     r0,  r1,  d0,  narrow=1
        ext             v2.8b,     \r0\().8b, \r1\().8b, #2
        ext             v3.8b,     \r0\().8b, \r1\().8b, #3
        uaddl           v2.8h,     v2.8b,     v3.8b     // taps x+2 and x+3
        ext             v4.8b,     \r0\().8b, \r1\().8b, #1
        ext             v5.8b,     \r0\().8b, \r1\().8b, #4
        uaddl           v4.8h,     v4.8b,     v5.8b     // taps x+1 and x+4
        ext             v30.8b,    \r0\().8b, \r1\().8b, #5
        uaddl           \d0\().8h, \r0\().8b, v30.8b    // taps x and x+5
        mla             \d0\().8h, v2.8h,     v6.h[1]   // plus 20 * inner pair
        mls             \d0\().8h, v4.8h,     v6.h[0]   // minus 5 * middle pair
  .if \narrow
        sqrshrun        \d0\().8b, \d0\().8h, #5
  .endif
.endm
119 
// Vertical 6-tap filter over six rows of signed 16-bit intermediates,
// evaluated in 32 bits.
//   \r0 .. \r5   six consecutive rows, eight 16-bit values each
//   result       eight unsigned bytes in the low half of \r0
// Computes (r0 + r5) + 20 * (r2 + r3) - 5 * (r1 + r4), applies a rounding
// shift right by 10 while narrowing to 16 bits, then saturates to unsigned
// 8 bits. The multiplications are done with shifts and adds:
//   20 * a = (a << 4) + (a << 2)       5 * a = a + (a << 2)
// Overwrites v0-v7. That includes v6, so the multipliers loaded by
// lowpass_const are gone after this macro.
.macro  lowpass_8.16    r0,  r1,  r2,  r3,  r4,  r5
        saddl           v5.4s,      \r2\().4h,  \r3\().4h       // inner pair, lanes 0-3
        saddl2          v1.4s,      \r2\().8h,  \r3\().8h       // inner pair, lanes 4-7
        saddl           v6.4s,      \r1\().4h,  \r4\().4h       // middle pair, lanes 0-3
        saddl2          v2.4s,      \r1\().8h,  \r4\().8h       // middle pair, lanes 4-7
        saddl           v0.4s,      \r0\().4h,  \r5\().4h       // outer pair, lanes 0-3
        saddl2          v4.4s,      \r0\().8h,  \r5\().8h       // outer pair, lanes 4-7

        // lanes 0-3: v5 = 20 * inner, v6 = 5 * middle
        shl             v3.4s,  v5.4s,  #4
        shl             v5.4s,  v5.4s,  #2
        shl             v7.4s,  v6.4s,  #2
        add             v5.4s,  v5.4s,  v3.4s
        add             v6.4s,  v6.4s,  v7.4s

        // lanes 4-7: v1 = 20 * inner, v2 = 5 * middle
        shl             v3.4s,  v1.4s,  #4
        shl             v1.4s,  v1.4s,  #2
        shl             v7.4s,  v2.4s,  #2
        add             v1.4s,  v1.4s,  v3.4s
        add             v2.4s,  v2.4s,  v7.4s

        add             v5.4s,  v5.4s,  v0.4s
        sub             v5.4s,  v5.4s,  v6.4s

        add             v1.4s,  v1.4s,  v4.4s
        sub             v1.4s,  v1.4s,  v2.4s

        rshrn           v5.4h,  v5.4s,  #10
        rshrn2          v5.8h,  v1.4s,  #10

        sqxtun          \r0\().8b,  v5.8h
.endm
152 
// 16-wide, 16-row horizontal low-pass into a packed buffer.
//   x0 = dst, written with a fixed stride of 8 bytes
//   x1 = src, x2 = src stride
// The left 8 columns are filtered first (16 rows), then src is rewound by
// 16 rows and moved 8 columns right for the second pass. x0 is not rewound,
// so the right half is stored directly behind the left half.
// Overwrites x3, x4 and x12 in addition to what the 8-wide routine uses.
function put_h264_qpel16_h_lowpass_neon_packed
        mov             x3,  #8                 // packed dst stride
        mov             x12, #16                // rows, left half
        mov             x4,  x30                // bl below overwrites x30
        bl              put_h264_qpel8_h_lowpass_neon
        add             x1,  x1,  #8            // 8 columns right ...
        sub             x1,  x1,  x2, lsl #4    // ... and 16 rows back up
        mov             x30, x4
        mov             x12, #16                // rows, right half
        b               put_h264_qpel8_h_lowpass_neon   // tail call, returns to our caller
endfunc
164 
.macro  h264_qpel_h_lowpass type
// 16-wide, 16-row horizontal low-pass. The macro argument (put or avg)
// selects which 8-wide routine is used.
//   x0 = dst, x3 = dst stride, x1 = src, x2 = src stride
// Filters the left 8 columns, rewinds both pointers by 16 rows, moves them
// 8 columns right and then runs off the end of this function into the
// 8-wide routine emitted directly below, which returns to the original
// caller. NOTE(review): this relies on function/endfunc (defined outside
// this file) placing nothing between the two bodies.
// Overwrites x12 and x13.
function \type\()_h264_qpel16_h_lowpass_neon
        mov             x12, #16                // rows, left half
        mov             x13, x30                // bl below overwrites x30
        bl              \type\()_h264_qpel8_h_lowpass_neon
        sub             x0,  x0,  x3, lsl #4    // dst: 16 rows back up
        add             x0,  x0,  #8            //      8 columns right
        sub             x1,  x1,  x2, lsl #4    // src: 16 rows back up
        add             x1,  x1,  #8            //      8 columns right
        mov             x30, x13
        mov             x12, #16                // rows, right half
        // no branch: execution continues in the function below
endfunc

// 8-wide horizontal low-pass, two rows per loop iteration.
//   x0 = dst, x3 = dst stride, x1 = src, x2 = src stride
//   x12 = row count. It is decremented by 2 per iteration and the loop
//         ends when it reaches zero. The body always runs at least once.
// The avg variant replaces each result with the rounded average of the
// result and the byte already stored in dst.
// Needs the multipliers from lowpass_const in v6.
// Overwrites v0-v5, v16, v17, v28, v29 and the flags.
function \type\()_h264_qpel8_h_lowpass_neon
1:
        ld1             {v28.8b, v29.8b}, [x1], x2
        ld1             {v16.8b, v17.8b}, [x1], x2
        subs            x12, x12, #2            // nothing below touches the flags before b.ne
        lowpass_8       v28, v29, v16, v17, v28, v16
  .ifc \type,avg
        ld1             {v2.8b},  [x0], x3
        urhadd          v28.8b, v28.8b, v2.8b
        ld1             {v3.8b},  [x0]
        urhadd          v16.8b, v16.8b, v3.8b
        sub             x0,  x0,  x3            // back to the first of the two rows
  .endif
        st1             {v28.8b}, [x0], x3
        st1             {v16.8b}, [x0], x3
        b.ne            1b
        ret
endfunc
.endm

        h264_qpel_h_lowpass put
        h264_qpel_h_lowpass avg
199 
.macro  h264_qpel_h_lowpass_l2 type
// 16-wide, 16-row horizontal low-pass averaged with a second source.
//   x0 = dst, x1 = src, x3 = second source, x2 = stride used for all three
// Processes the left 8 columns, rewinds the three pointers by 16 rows,
// moves them 8 columns right and then runs off the end of this function
// into the 8-wide routine emitted directly below, which returns to the
// original caller. NOTE(review): this relies on function/endfunc (defined
// outside this file) placing nothing between the two bodies.
// Overwrites x12 and x13.
function \type\()_h264_qpel16_h_lowpass_l2_neon
        mov             x12, #16                // rows, left half
        mov             x13, x30                // bl below overwrites x30
        bl              \type\()_h264_qpel8_h_lowpass_l2_neon
        sub             x0,  x0,  x2, lsl #4    // dst: 16 rows back up
        add             x0,  x0,  #8            //      8 columns right
        sub             x1,  x1,  x2, lsl #4    // src: same
        add             x1,  x1,  #8
        sub             x3,  x3,  x2, lsl #4    // second source: same
        add             x3,  x3,  #8
        mov             x30, x13
        mov             x12, #16                // rows, right half
        // no branch: execution continues in the function below
endfunc

// 8-wide horizontal low-pass averaged with a second source, two rows per
// loop iteration.
//   x0 = dst, x1 = src, x3 = second source, x2 = stride used for all three
//   x12 = row count. It is decremented by 2 per iteration and the loop
//         ends when it reaches zero. The body always runs at least once.
// Each filtered byte is replaced by its rounded average with the matching
// byte from x3. The avg variant then averages once more with the byte
// already stored in dst.
// Needs the multipliers from lowpass_const in v6.
// Overwrites v0-v5, v16, v17, v26-v29 and the flags.
function \type\()_h264_qpel8_h_lowpass_l2_neon
1:
        ld1             {v26.8b, v27.8b}, [x1], x2
        ld1             {v16.8b, v17.8b}, [x1], x2
        ld1             {v28.8b}, [x3], x2
        ld1             {v29.8b}, [x3], x2
        subs            x12, x12, #2            // nothing below touches the flags before b.ne
        lowpass_8       v26, v27, v16, v17, v26, v27
        urhadd          v26.8b, v26.8b, v28.8b
        urhadd          v27.8b, v27.8b, v29.8b
  .ifc \type,avg
        ld1             {v2.8b},  [x0], x2
        urhadd          v26.8b, v26.8b, v2.8b
        ld1             {v3.8b},  [x0]
        urhadd          v27.8b, v27.8b, v3.8b
        sub             x0,  x0,  x2            // back to the first of the two rows
  .endif
        st1             {v26.8b}, [x0], x2
        st1             {v27.8b}, [x0], x2
        b.ne            1b
        ret
endfunc
.endm

        h264_qpel_h_lowpass_l2 put
        h264_qpel_h_lowpass_l2 avg
240 
// 16x16 vertical low-pass into a packed buffer, done as four 8x8 passes in
// the order top-left, bottom-left, top-right, bottom-right.
//   x0 = dst, written with a fixed stride of 8 bytes and never rewound, so
//        the four 8x8 results end up one after another
//   x1 = src, x3 = src stride
// Every 8x8 pass leaves x1 twelve rows below where it started, hence the
// pointer corrections between the calls.
// Overwrites x2 and x4 in addition to what the 8x8 routine uses.
function put_h264_qpel16_v_lowpass_neon_packed
        mov             x2,  #8                 // packed dst stride
        mov             x4,  x30                // bl below overwrites x30
        bl              put_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #2    // 12 - 4: eight rows below the start
        bl              put_h264_qpel8_v_lowpass_neon
        add             x1,  x1,  #8            // 8 columns right ...
        sub             x1,  x1,  x3, lsl #4    // ... and 16 + 4 rows up:
        sub             x1,  x1,  x3, lsl #2    // back on the first row
        bl              put_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x4
        b               put_h264_qpel8_v_lowpass_neon   // tail call, returns to our caller
endfunc
255 
.macro  h264_qpel_v_lowpass type
// 16x16 vertical low-pass done as four 8x8 passes in the order top-left,
// bottom-left, top-right, bottom-right.
//   x0 = dst, x2 = dst stride, x1 = src, x3 = src stride
// Every 8x8 pass leaves x1 twelve rows and x0 eight rows below where they
// started, hence the pointer corrections between the calls. The fourth
// pass is reached by running off the end of this function into the 8x8
// routine emitted directly below. NOTE(review): this relies on
// function/endfunc (defined outside this file) placing nothing between
// the two bodies.
// Overwrites x4.
function \type\()_h264_qpel16_v_lowpass_neon
        mov             x4,  x30                // bl below overwrites x30
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #2    // 12 - 4: eight rows below the start
        bl              \type\()_h264_qpel8_v_lowpass_neon
        add             x0,  x0,  #8            // dst: 8 columns right,
        sub             x0,  x0,  x2, lsl #4    //      16 rows back up
        add             x1,  x1,  #8            // src: 8 columns right,
        sub             x1,  x1,  x3, lsl #4    //      16 + 4 rows back up
        sub             x1,  x1,  x3, lsl #2
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x4
        // no branch: execution continues in the function below
endfunc

// 8x8 vertical low-pass.
//   x0 = dst, x2 = dst stride, x1 = src, x3 = src stride
// Reads 13 source rows of 8 bytes (x1 ends up 12 rows further down) and
// writes 8 rows (x0 ends up 8 rows further down). The avg variant first
// averages every result, with rounding, with the row already in dst.
// Needs the multipliers from lowpass_const in v6.
// Overwrites v0-v2, v4 and v16-v28. The avg variant also overwrites v29-v31.
function \type\()_h264_qpel8_v_lowpass_neon
        ld1             {v16.8b}, [x1], x3
        ld1             {v17.8b}, [x1], x3
        ld1             {v18.8b}, [x1], x3
        ld1             {v19.8b}, [x1], x3
        ld1             {v20.8b}, [x1], x3
        ld1             {v21.8b}, [x1], x3
        ld1             {v22.8b}, [x1], x3
        ld1             {v23.8b}, [x1], x3
        ld1             {v24.8b}, [x1], x3
        ld1             {v25.8b}, [x1], x3
        ld1             {v26.8b}, [x1], x3
        ld1             {v27.8b}, [x1], x3
        ld1             {v28.8b}, [x1]

        // two output rows per macro call, written over the two oldest inputs
        lowpass_8_v     v16, v17, v18, v19, v20, v21, v22, v16, v17
        lowpass_8_v     v18, v19, v20, v21, v22, v23, v24, v18, v19
        lowpass_8_v     v20, v21, v22, v23, v24, v25, v26, v20, v21
        lowpass_8_v     v22, v23, v24, v25, v26, v27, v28, v22, v23
  .ifc \type,avg
        ld1             {v24.8b}, [x0], x2
        ld1             {v25.8b}, [x0], x2
        ld1             {v26.8b}, [x0], x2
        urhadd          v16.8b, v16.8b, v24.8b
        ld1             {v27.8b}, [x0], x2
        urhadd          v17.8b, v17.8b, v25.8b
        ld1             {v28.8b}, [x0], x2
        urhadd          v18.8b, v18.8b, v26.8b
        ld1             {v29.8b}, [x0], x2
        urhadd          v19.8b, v19.8b, v27.8b
        ld1             {v30.8b}, [x0], x2
        urhadd          v20.8b, v20.8b, v28.8b
        ld1             {v31.8b}, [x0], x2
        urhadd          v21.8b, v21.8b, v29.8b
        urhadd          v22.8b, v22.8b, v30.8b
        urhadd          v23.8b, v23.8b, v31.8b
        sub             x0,  x0,  x2,  lsl #3   // rewind dst by the 8 rows just read
  .endif

        st1             {v16.8b}, [x0], x2
        st1             {v17.8b}, [x0], x2
        st1             {v18.8b}, [x0], x2
        st1             {v19.8b}, [x0], x2
        st1             {v20.8b}, [x0], x2
        st1             {v21.8b}, [x0], x2
        st1             {v22.8b}, [x0], x2
        st1             {v23.8b}, [x0], x2

        ret
endfunc
.endm

        h264_qpel_v_lowpass put
        h264_qpel_v_lowpass avg
326 
.macro  h264_qpel_v_lowpass_l2 type
// 16x16 vertical low-pass averaged with a second source, done as four 8x8
// passes in the order top-left, bottom-left, top-right, bottom-right.
//   x0  = dst, x1 = src, x3 = stride of both dst and src
//   x12 = second source, x2 = its stride
// Every 8x8 pass leaves x1 twelve rows and x0 and x12 eight rows below
// where they started, hence the pointer corrections between the calls.
// The fourth pass is reached by running off the end of this function into
// the 8x8 routine emitted directly below. NOTE(review): this relies on
// function/endfunc (defined outside this file) placing nothing between
// the two bodies.
// Overwrites x4.
function \type\()_h264_qpel16_v_lowpass_l2_neon
        mov             x4,  x30                // bl below overwrites x30
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #2    // 12 - 4: eight rows below the start
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        add             x0,  x0,  #8            // dst: 8 columns right,
        sub             x0,  x0,  x3, lsl #4    //      16 rows back up
        add             x12, x12, #8            // second source: same
        sub             x12, x12, x2, lsl #4
        add             x1,  x1,  #8            // src: 8 columns right,
        sub             x1,  x1,  x3, lsl #4    //      16 + 4 rows back up
        sub             x1,  x1,  x3, lsl #2
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x4
        // no branch: execution continues in the function below
endfunc

// 8x8 vertical low-pass averaged with a second source.
//   x0  = dst, x1 = src, x3 = stride of both dst and src
//   x12 = second source, x2 = its stride
// Reads 13 source rows of 8 bytes (x1 ends up 12 rows further down), 8
// rows from x12 and writes 8 rows to x0 (both end up 8 rows further down).
// Each filtered byte is replaced by its rounded average with the matching
// byte from x12. The avg variant then averages once more with the row
// already in dst.
// Needs the multipliers from lowpass_const in v6.
// Overwrites v0-v2, v4 and v16-v31.
function \type\()_h264_qpel8_v_lowpass_l2_neon
        ld1             {v16.8b}, [x1], x3
        ld1             {v17.8b}, [x1], x3
        ld1             {v18.8b}, [x1], x3
        ld1             {v19.8b}, [x1], x3
        ld1             {v20.8b}, [x1], x3
        ld1             {v21.8b}, [x1], x3
        ld1             {v22.8b}, [x1], x3
        ld1             {v23.8b}, [x1], x3
        ld1             {v24.8b}, [x1], x3
        ld1             {v25.8b}, [x1], x3
        ld1             {v26.8b}, [x1], x3
        ld1             {v27.8b}, [x1], x3
        ld1             {v28.8b}, [x1]

        // two output rows per macro call, written over the two oldest inputs
        lowpass_8_v     v16, v17, v18, v19, v20, v21, v22, v16, v17
        lowpass_8_v     v18, v19, v20, v21, v22, v23, v24, v18, v19
        lowpass_8_v     v20, v21, v22, v23, v24, v25, v26, v20, v21
        lowpass_8_v     v22, v23, v24, v25, v26, v27, v28, v22, v23

        // average with the second source
        ld1             {v24.8b}, [x12], x2
        ld1             {v25.8b}, [x12], x2
        ld1             {v26.8b}, [x12], x2
        ld1             {v27.8b}, [x12], x2
        ld1             {v28.8b}, [x12], x2
        urhadd          v16.8b, v24.8b, v16.8b
        urhadd          v17.8b, v25.8b, v17.8b
        ld1             {v29.8b}, [x12], x2
        urhadd          v18.8b, v26.8b, v18.8b
        urhadd          v19.8b, v27.8b, v19.8b
        ld1             {v30.8b}, [x12], x2
        urhadd          v20.8b, v28.8b, v20.8b
        urhadd          v21.8b, v29.8b, v21.8b
        ld1             {v31.8b}, [x12], x2
        urhadd          v22.8b, v30.8b, v22.8b
        urhadd          v23.8b, v31.8b, v23.8b

  .ifc \type,avg
        ld1             {v24.8b}, [x0], x3
        ld1             {v25.8b}, [x0], x3
        ld1             {v26.8b}, [x0], x3
        urhadd          v16.8b, v16.8b, v24.8b
        ld1             {v27.8b}, [x0], x3
        urhadd          v17.8b, v17.8b, v25.8b
        ld1             {v28.8b}, [x0], x3
        urhadd          v18.8b, v18.8b, v26.8b
        ld1             {v29.8b}, [x0], x3
        urhadd          v19.8b, v19.8b, v27.8b
        ld1             {v30.8b}, [x0], x3
        urhadd          v20.8b, v20.8b, v28.8b
        ld1             {v31.8b}, [x0], x3
        urhadd          v21.8b, v21.8b, v29.8b
        urhadd          v22.8b, v22.8b, v30.8b
        urhadd          v23.8b, v23.8b, v31.8b
        sub             x0,  x0,  x3,  lsl #3   // rewind dst by the 8 rows just read
  .endif

        st1             {v16.8b}, [x0], x3
        st1             {v17.8b}, [x0], x3
        st1             {v18.8b}, [x0], x3
        st1             {v19.8b}, [x0], x3
        st1             {v20.8b}, [x0], x3
        st1             {v21.8b}, [x0], x3
        st1             {v22.8b}, [x0], x3
        st1             {v23.8b}, [x0], x3

        ret
endfunc
.endm

        h264_qpel_v_lowpass_l2 put
        h264_qpel_v_lowpass_l2 avg
417 
// Shared core of the 8x8 two-dimensional (horizontal then vertical)
// low-pass filter.
//   x1 = src, x3 = src stride
// Loads 13 rows of 16 bytes (x1 ends up 12 rows further down), runs the
// horizontal filter on each row keeping 16-bit sums, then runs the vertical
// 32-bit filter eight times. On return the eight result rows are in the low
// 8 bytes of v16 .. v23. Nothing is stored to memory here.
// The filter multipliers are loaded locally through w12, and they are
// destroyed again by the first lowpass_8.16.
// Overwrites w12, v0-v7 and v16-v31.
function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const   w12
        ld1             {v16.8h}, [x1], x3
        ld1             {v17.8h}, [x1], x3
        ld1             {v18.8h}, [x1], x3
        ld1             {v19.8h}, [x1], x3
        ld1             {v20.8h}, [x1], x3
        ld1             {v21.8h}, [x1], x3
        ld1             {v22.8h}, [x1], x3
        ld1             {v23.8h}, [x1], x3
        ld1             {v24.8h}, [x1], x3
        ld1             {v25.8h}, [x1], x3
        ld1             {v26.8h}, [x1], x3
        ld1             {v27.8h}, [x1], x3
        ld1             {v28.8h}, [x1]

        // horizontal pass, two rows per macro call
        lowpass_8H      v16, v17
        lowpass_8H      v18, v19
        lowpass_8H      v20, v21
        lowpass_8H      v22, v23
        lowpass_8H      v24, v25
        lowpass_8H      v26, v27
        // v29 was never loaded: only the v28 half of this call is used later
        lowpass_8H      v28, v29

        // vertical pass, each call overwrites its oldest input row
        lowpass_8.16    v16, v17, v18, v19, v20, v21
        lowpass_8.16    v17, v18, v19, v20, v21, v22

        lowpass_8.16    v18, v19, v20, v21, v22, v23
        lowpass_8.16    v19, v20, v21, v22, v23, v24

        lowpass_8.16    v20, v21, v22, v23, v24, v25
        lowpass_8.16    v21, v22, v23, v24, v25, v26

        lowpass_8.16    v22, v23, v24, v25, v26, v27
        lowpass_8.16    v23, v24, v25, v26, v27, v28

        ret
endfunc
455 
.macro  h264_qpel8_hv_lowpass type
// 8x8 two-dimensional low-pass: runs the shared core and stores its eight
// result rows.
//   x0 = dst, x2 = dst stride, x1 = src, x3 = src stride
// The avg variant first averages every result, with rounding, with the row
// already in dst. x0 ends up 8 rows further down, x1 12 rows further down.
// The return address is kept in x10 and used directly by the final ret.
// Overwrites x10 plus everything the shared core overwrites.
function \type\()_h264_qpel8_hv_lowpass_neon
        mov             x10, x30                // bl below overwrites x30
        bl              put_h264_qpel8_hv_lowpass_neon_top
  .ifc \type,avg
        ld1             {v0.8b}, [x0], x2
        ld1             {v1.8b}, [x0], x2
        ld1             {v2.8b}, [x0], x2
        urhadd          v16.8b, v16.8b, v0.8b
        ld1             {v3.8b}, [x0], x2
        urhadd          v17.8b, v17.8b, v1.8b
        ld1             {v4.8b}, [x0], x2
        urhadd          v18.8b, v18.8b, v2.8b
        ld1             {v5.8b}, [x0], x2
        urhadd          v19.8b, v19.8b, v3.8b
        ld1             {v6.8b}, [x0], x2
        urhadd          v20.8b, v20.8b, v4.8b
        ld1             {v7.8b}, [x0], x2
        urhadd          v21.8b, v21.8b, v5.8b
        urhadd          v22.8b, v22.8b, v6.8b
        urhadd          v23.8b, v23.8b, v7.8b
        sub             x0,  x0,  x2,  lsl #3   // rewind dst by the 8 rows just read
  .endif

        st1             {v16.8b}, [x0], x2
        st1             {v17.8b}, [x0], x2
        st1             {v18.8b}, [x0], x2
        st1             {v19.8b}, [x0], x2
        st1             {v20.8b}, [x0], x2
        st1             {v21.8b}, [x0], x2
        st1             {v22.8b}, [x0], x2
        st1             {v23.8b}, [x0], x2

        ret             x10
endfunc
.endm

        h264_qpel8_hv_lowpass put
        h264_qpel8_hv_lowpass avg
495 
.macro  h264_qpel8_hv_lowpass_l2 type
// 8x8 two-dimensional low-pass averaged with a second source.
//   x0 = dst, x3 = stride of both dst and src, x1 = src
//   x2 = second source: eight rows of 8 bytes stored back to back. It is
//        read 16 bytes at a time and ends up 64 bytes further on.
// Each result of the shared core is replaced by its rounded average with
// the matching byte from x2. The avg variant then averages once more with
// the row already in dst. x0 ends up 8 rows further down.
// The return address is kept in x10 and used directly by the final ret.
// Overwrites x10 plus everything the shared core overwrites.
function \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             x10, x30                // bl below overwrites x30
        bl              put_h264_qpel8_hv_lowpass_neon_top

        // v0-v7 are free again after the core, they take the second source
        ld1             {v0.8b, v1.8b}, [x2], #16
        ld1             {v2.8b, v3.8b}, [x2], #16
        urhadd          v0.8b,  v0.8b,  v16.8b
        urhadd          v1.8b,  v1.8b,  v17.8b
        ld1             {v4.8b, v5.8b}, [x2], #16
        urhadd          v2.8b,  v2.8b,  v18.8b
        urhadd          v3.8b,  v3.8b,  v19.8b
        ld1             {v6.8b, v7.8b}, [x2], #16
        urhadd          v4.8b,  v4.8b,  v20.8b
        urhadd          v5.8b,  v5.8b,  v21.8b
        urhadd          v6.8b,  v6.8b,  v22.8b
        urhadd          v7.8b,  v7.8b,  v23.8b
  .ifc \type,avg
        ld1             {v16.8b}, [x0], x3
        ld1             {v17.8b}, [x0], x3
        ld1             {v18.8b}, [x0], x3
        urhadd          v0.8b,  v0.8b,  v16.8b
        ld1             {v19.8b}, [x0], x3
        urhadd          v1.8b,  v1.8b,  v17.8b
        ld1             {v20.8b}, [x0], x3
        urhadd          v2.8b,  v2.8b,  v18.8b
        ld1             {v21.8b}, [x0], x3
        urhadd          v3.8b,  v3.8b,  v19.8b
        ld1             {v22.8b}, [x0], x3
        urhadd          v4.8b,  v4.8b,  v20.8b
        ld1             {v23.8b}, [x0], x3
        urhadd          v5.8b,  v5.8b,  v21.8b
        urhadd          v6.8b,  v6.8b,  v22.8b
        urhadd          v7.8b,  v7.8b,  v23.8b
        sub             x0,  x0,  x3,  lsl #3   // rewind dst by the 8 rows just read
  .endif
        st1             {v0.8b}, [x0], x3
        st1             {v1.8b}, [x0], x3
        st1             {v2.8b}, [x0], x3
        st1             {v3.8b}, [x0], x3
        st1             {v4.8b}, [x0], x3
        st1             {v5.8b}, [x0], x3
        st1             {v6.8b}, [x0], x3
        st1             {v7.8b}, [x0], x3

        ret             x10
endfunc
.endm

        h264_qpel8_hv_lowpass_l2 put
        h264_qpel8_hv_lowpass_l2 avg
547 
.macro  h264_qpel16_hv  type
// 16x16 two-dimensional low-pass done as four 8x8 passes in the order
// top-left, bottom-left, top-right, bottom-right.
//   x0 = dst, x2 = dst stride, x1 = src, x3 = src stride
// Every 8x8 pass leaves x1 twelve rows and x0 eight rows below where they
// started, hence the pointer corrections between the calls.
// The return address is parked in x13 because the 8x8 routine uses x10.
function \type\()_h264_qpel16_hv_lowpass_neon
        mov             x13, x30                // bl below overwrites x30
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             x1,  x1,  x3, lsl #2    // 12 - 4: eight rows below the start
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        add             x0,  x0,  #8            // dst: 8 columns right,
        sub             x0,  x0,  x2, lsl #4    //      16 rows back up
        add             x1,  x1,  #8            // src: 8 columns right,
        sub             x1,  x1,  x3, lsl #4    //      16 + 4 rows back up
        sub             x1,  x1,  x3, lsl #2
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x13
        b               \type\()_h264_qpel8_hv_lowpass_neon     // tail call
endfunc

// Same as above, with every 8x8 result averaged with a second source.
//   x0 = dst, x3 = stride of both dst and src, x1 = src
//   x4 = address 256 bytes past the start of the second source. x2 is set
//        to x4 - 256 here and then advances by 64 bytes per 8x8 pass, so
//        the second source is consumed as four consecutive 8x8 blocks.
// The return address is parked in x13 because the 8x8 routine uses x10.
function \type\()_h264_qpel16_hv_lowpass_l2_neon
        sub             x2,  x4,  #256          // start of the second source
        mov             x13, x30                // bl below overwrites x30
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #2    // 12 - 4: eight rows below the start
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        add             x0,  x0,  #8            // dst: 8 columns right,
        sub             x0,  x0,  x3, lsl #4    //      16 rows back up
        add             x1,  x1,  #8            // src: 8 columns right,
        sub             x1,  x1,  x3, lsl #4    //      16 + 4 rows back up
        sub             x1,  x1,  x3, lsl #2
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x13
        b               \type\()_h264_qpel8_hv_lowpass_l2_neon  // tail call
endfunc
.endm

        h264_qpel16_hv put
        h264_qpel16_hv avg
585 
// Exported 8x8 entry points, generated once for put and once for avg.
// As used by the code below: x0 = dst, x1 = src, x2 = stride shared by src
// and dst. Conventions inside this macro:
//   x14      return address for the entry points that use bl
//   x8, x9   saved dst and the column base used by the vertical pass
//   x11      stack pointer saved before a temporary buffer is carved out
//   w3       scratch for lowpass_const, always before x3 gets its real value
// Several entry points only set up registers and then branch to a label
// inside another entry point (the type_h264_qpel8_mcNN labels).
.macro  h264_qpel8      type
// Horizontal filter averaged with the unfiltered source pixels.
function ff_\type\()_h264_qpel8_mc10_neon, export=1
        mov             x12, #8                 // eight rows
        lowpass_const   w3
        mov             x3,  x1                 // second source: src itself
        sub             x1,  x1,  #2            // filter window starts 2 pixels left
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

// Plain horizontal filter.
function ff_\type\()_h264_qpel8_mc20_neon, export=1
        mov             x12, #8                 // eight rows
        lowpass_const   w3
        mov             x3,  x2                 // dst stride equals src stride
        sub             x1,  x1,  #2            // filter window starts 2 pixels left
        b               \type\()_h264_qpel8_h_lowpass_neon
endfunc

// Horizontal filter averaged with the source pixels one column to the right.
function ff_\type\()_h264_qpel8_mc30_neon, export=1
        mov             x12, #8                 // eight rows
        lowpass_const   w3
        add             x3,  x1,  #1            // second source: src + 1
        sub             x1,  x1,  #2            // filter window starts 2 pixels left
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

// Vertical filter averaged with the unfiltered source pixels.
// The inner label expects x14 = return address and x12 = second source.
function ff_\type\()_h264_qpel8_mc01_neon, export=1
        mov             x12, x1                 // second source: src itself
        mov             x14, x30
\type\()_h264_qpel8_mc01:
        lowpass_const   w3
        sub             x1,  x1,  x2, lsl #1    // filter window starts 2 rows up
        mov             x3,  x2
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        ret             x14
endfunc

// Horizontal filter result averaged with the vertical filter result.
// The inner label expects x14 = return address, x8 = dst, x9 = column base
// for the vertical pass and x1 = row base for the horizontal pass.
function ff_\type\()_h264_qpel8_mc11_neon, export=1
        mov             x8,  x0
        mov             x9,  x1
        mov             x14, x30
\type\()_h264_qpel8_mc11:
        mov             x11, sp
        sub             sp,  sp,  #64           // 8 rows of 8 bytes
        mov             x0,  sp
        lowpass_const   w3
        mov             x3,  #8                 // stride of the temporary buffer
        mov             x12, #8                 // eight rows
        sub             x1,  x1,  #2            // filter window starts 2 pixels left
        bl              put_h264_qpel8_h_lowpass_neon
        mov             x12, sp                 // second source: the buffer just filled
        mov             x0,  x8
        sub             x1,  x9,  x2, lsl #1    // vertical window starts 2 rows up
        mov             x3,  x2
        mov             x2,  #8                 // stride of the second source
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// Horizontal filter result averaged with the two-dimensional filter result.
// The inner label expects x14 = return address, x8 = dst, x9 = src and
// x1 = row base for the horizontal pass.
// Only the first 64 bytes of the stack area are written by the code
// visible here.
function ff_\type\()_h264_qpel8_mc21_neon, export=1
        mov             x8,  x0
        mov             x9,  x1
        mov             x14, x30
\type\()_h264_qpel8_mc21:
        mov             x11, sp
        sub             sp,  sp,  #(8*8+16*12)
        mov             x0,  sp
        lowpass_const   w3
        mov             x3,  #8                 // stride of the temporary buffer
        mov             x12, #8                 // eight rows
        sub             x1,  x1,  #2            // filter window starts 2 pixels left
        bl              put_h264_qpel8_h_lowpass_neon
        mov             x4,  x0                 // one past the 64 bytes just written
        mov             x0,  x8
        mov             x3,  x2
        sub             x1,  x9,  x2, lsl #1    // two rows up ...
        sub             x1,  x1,  #2            // ... and two pixels left
        sub             x2,  x4,  #64           // second source: start of the buffer
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// Like mc11, with the vertical pass reading one column to the right.
function ff_\type\()_h264_qpel8_mc31_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        add             x9,  x1,  #1            // vertical pass on src + 1, x1 stays at src
        b               \type\()_h264_qpel8_mc11
endfunc

// Plain vertical filter.
function ff_\type\()_h264_qpel8_mc02_neon, export=1
        mov             x14, x30
        lowpass_const   w3
        mov             x3,  x2
        sub             x1,  x1,  x2, lsl #1    // filter window starts 2 rows up
        bl              \type\()_h264_qpel8_v_lowpass_neon
        ret             x14
endfunc

// Vertical filter result averaged with the two-dimensional filter result.
// The inner label expects x14 = return address, x8 = dst, x9 = src and
// x1 = column base for the vertical pass.
// Only the first 64 bytes of the stack area are written by the code
// visible here.
function ff_\type\()_h264_qpel8_mc12_neon, export=1
        mov             x8,  x0
        mov             x9,  x1
        mov             x14, x30
\type\()_h264_qpel8_mc12:
        mov             x11, sp
        sub             sp,  sp,  #(8*8+16*12)
        mov             x0,  sp
        lowpass_const   w3
        sub             x1,  x1,  x2, lsl #1    // filter window starts 2 rows up
        mov             x3,  x2                 // src stride, kept for the second pass
        mov             x2,  #8                 // stride of the temporary buffer
        bl              put_h264_qpel8_v_lowpass_neon
        mov             x4,  x0                 // one past the 64 bytes just written
        mov             x0,  x8
        sub             x1,  x9,  x3, lsl #1    // two rows up ...
        sub             x1,  x1,  #2            // ... and two pixels left
        sub             x2,  x4,  #64           // second source: start of the buffer
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// Plain two-dimensional filter.
// sp is saved and restored around the call although nothing visible on
// this path changes it.
function ff_\type\()_h264_qpel8_mc22_neon, export=1
        mov             x11, sp
        mov             x14, x30
        mov             x3,  x2
        sub             x1,  x1,  x2, lsl #1    // two rows up ...
        sub             x1,  x1,  #2            // ... and two pixels left
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        mov             sp,  x11
        ret             x14
endfunc

// Like mc12, with the vertical pass reading one column to the right.
function ff_\type\()_h264_qpel8_mc32_neon, export=1
        mov             x8,  x0
        mov             x9,  x1
        mov             x14, x30
        add             x1,  x1,  #1
        b               \type\()_h264_qpel8_mc12
endfunc

// Like mc01, averaging with the source pixels one row further down.
function ff_\type\()_h264_qpel8_mc03_neon, export=1
        add             x12, x1,  x2            // second source: src + stride
        mov             x14, x30
        b               \type\()_h264_qpel8_mc01
endfunc

// Like mc11, with the horizontal pass reading one row further down.
function ff_\type\()_h264_qpel8_mc13_neon, export=1
        mov             x8,  x0
        mov             x9,  x1
        mov             x14, x30
        add             x1,  x1,  x2
        b               \type\()_h264_qpel8_mc11
endfunc

// Like mc21, with the horizontal pass reading one row further down.
function ff_\type\()_h264_qpel8_mc23_neon, export=1
        mov             x8,  x0
        mov             x9,  x1
        mov             x14, x30
        add             x1,  x1,  x2
        b               \type\()_h264_qpel8_mc21
endfunc

// Like mc11, with the vertical pass one column to the right and the
// horizontal pass one row further down.
function ff_\type\()_h264_qpel8_mc33_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        add             x9,  x1,  #1            // vertical pass on src + 1
        add             x1,  x1,  x2            // horizontal pass on src + stride
        b               \type\()_h264_qpel8_mc11
endfunc
.endm

        h264_qpel8 put
        h264_qpel8 avg
764 
765 .macro  h264_qpel16     type
766 function ff_\type\()_h264_qpel16_mc10_neon, export=1
767         lowpass_const   w3
768         mov             x3,  x1
769         sub             x1,  x1,  #2
770         b               \type\()_h264_qpel16_h_lowpass_l2_neon
771 endfunc
772 
773 function ff_\type\()_h264_qpel16_mc20_neon, export=1
774         lowpass_const   w3
775         sub             x1,  x1,  #2
776         mov             x3,  x2
777         b               \type\()_h264_qpel16_h_lowpass_neon
778 endfunc
779 
// qpel16 mc30: same shape as mc10, but the value handed over in x3 is the
// incoming x1 + 1 instead of the incoming x1.
// On the branch: x3 = incoming x1 + 1, x1 = incoming x1 - 2,
// v6.S[0] = filter constants.
// NOTE(review): presumably x3 is the second input the _l2 helper combines
// with the filter output -- the helper is outside this chunk, confirm there.
function ff_\type\()_h264_qpel16_mc30_neon, export=1
        lowpass_const   w3                      // v6.S[0] = (20 << 16) | 5; w3 is only scratch here
        add             x3,  x1,  #1            // must come after lowpass_const and before x1 changes
        sub             x1,  x1,  #2
        b               \type\()_h264_qpel16_h_lowpass_l2_neon  // plain branch: x30 still holds our caller
endfunc
786 
// qpel16 mc01: vertical lowpass "_l2" with x12 = incoming x1.
// The inner label is a second entry used by mc03, which arrives with x14 and
// x12 already set (x12 = x1 + x2 in that case).
// The helper is called with bl, so the return address is parked in x14 and
// the function returns through it.
// NOTE(review): presumably x12 is the second input the _l2 helper combines
// with the filter output -- the helper is outside this chunk, confirm there.
function ff_\type\()_h264_qpel16_mc01_neon, export=1
        mov             x12, x1                 // x12 = incoming x1
        mov             x14, x30                // bl below overwrites x30
\type\()_h264_qpel16_mc01:
        lowpass_const   w3                      // v6.S[0] = (20 << 16) | 5; w3 is only scratch here
        sub             x1,  x1,  x2, lsl #1    // x1 -= 2 * x2
        mov             x3,  x2                 // after lowpass_const, which overwrites w3
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
        ret             x14
endfunc
797 
// qpel16 mc11, and the shared body for mc31, mc13 and mc33, which branch to
// the inner label with x8, x9 and x14 already set and x1 adjusted.
// Two helper calls:
//   1. put horizontal lowpass, reading from x1 - 2 and writing to a 256-byte
//      buffer carved out of the stack;
//   2. vertical lowpass "_l2", writing to the original x0, reading from
//      x9 - 2*x2, with x12 pointing at the buffer from step 1.
// NOTE(review): presumably the _l2 helper averages its filter output with the
// data at x12 -- the helper is outside this chunk, confirm there.
function ff_\type\()_h264_qpel16_mc11_neon, export=1
        mov             x14, x30                // bl below overwrites x30; return goes through x14
        mov             x8,  x0                 // x8 = incoming x0, restored before the second call
        mov             x9,  x1                 // x9 = base pointer for the second call
\type\()_h264_qpel16_mc11:
        lowpass_const   w3                      // v6.S[0] = (20 << 16) | 5; w3 is only scratch here
        mov             x11, sp                 // remember sp so it can be restored on exit
        sub             sp,  sp,  #256          // 16*16 bytes of scratch
        mov             x0,  sp                 // first call writes into the scratch buffer
        sub             x1,  x1,  #2
        mov             x3,  #16                // presumably the scratch row stride -- TODO confirm
        bl              put_h264_qpel16_h_lowpass_neon
        mov             x0,  x8
        mov             x3,  x2                 // relies on the first helper leaving x2 intact
        mov             x12, sp                 // second input = output of the first call
        sub             x1,  x9,  x2, lsl #1    // x1 = x9 - 2 * x2
        mov             x2,  #16                // presumably the stride of the x12 buffer -- TODO confirm
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
        mov             sp,  x11                // release the scratch buffer
        ret             x14
endfunc
819 
// qpel16 mc21, and the shared body for mc23, which branches to the inner
// label with x8, x9 and x14 already set and x1 adjusted.
// Two helper calls:
//   1. put horizontal lowpass (packed variant), reading from x1 - 2 and
//      writing to the start of a 16*16+16*12 = 448 byte stack area;
//   2. hv lowpass "_l2", writing to the original x0, reading from
//      x9 - 2*x2 - 2, with x4 set to whatever the first helper left in x0.
// NOTE(review): the layout of the stack area and the meaning of x4 are
// defined by the helpers, which are outside this chunk -- confirm there.
function ff_\type\()_h264_qpel16_mc21_neon, export=1
        mov             x14, x30                // bl below overwrites x30; return goes through x14
        mov             x8,  x0                 // x8 = incoming x0, restored before the second call
        mov             x9,  x1                 // x9 = base pointer for the second call
\type\()_h264_qpel16_mc21:
        lowpass_const   w3                      // v6.S[0] = (20 << 16) | 5; w3 is only scratch here
        mov             x11, sp                 // remember sp so it can be restored on exit
        sub             sp,  sp,  #(16*16+16*12) // 448 bytes, a multiple of 16
        sub             x1,  x1,  #2
        mov             x0,  sp                 // first call writes at the bottom of the scratch area
        bl              put_h264_qpel16_h_lowpass_neon_packed
        mov             x4,  x0                 // x0 as left behind by the first helper
        mov             x0,  x8
        sub             x1,  x9,  x2, lsl #1    // with the next line: x1 = x9 - 2 * x2 - 2
        sub             x1,  x1,  #2
        mov             x3,  x2                 // relies on the first helper leaving x2 intact
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             sp,  x11                // release the scratch area
        ret             x14
endfunc
840 
// qpel16 mc31: thin entry point that reuses the shared mc11 body.
// Register state handed to the shared body:
//   x14 = return address (incoming x30; mc11 returns with ret x14)
//   x8  = incoming x0
//   x9  = incoming x1 + 1  (mc11 entered directly uses x9 = x1)
//   x1  = incoming x1, unchanged
// x9 is formed directly with a single add instead of bumping x1 by one and
// undoing it afterwards; the resulting register values are the same and no
// flags are involved.
function ff_\type\()_h264_qpel16_mc31_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        add             x9,  x1,  #1            // x9 = x1 + 1, x1 itself untouched
        b               \type\()_h264_qpel16_mc11
endfunc
849 
// qpel16 mc02: plain vertical lowpass, no second input.
// The helper is called with x1 moved back by two times x2 and with x3 = x2.
// It is reached with bl, so the return address is parked in x14 and the
// function returns through it.
function ff_\type\()_h264_qpel16_mc02_neon, export=1
        lowpass_const   w3                      // v6.S[0] = (20 << 16) | 5; w3 is only scratch here
        mov             x14, x30                // bl below overwrites x30
        mov             x3,  x2                 // after lowpass_const, which overwrites w3
        sub             x1,  x1,  x2, lsl #1    // x1 -= 2 * x2
        bl              \type\()_h264_qpel16_v_lowpass_neon
        ret             x14
endfunc
858 
// qpel16 mc12, and the shared body for mc32, which branches to the inner
// label with x8, x9 and x14 already set and x1 adjusted.
// Two helper calls:
//   1. put vertical lowpass (packed variant), reading from x1 - 2*x2 and
//      writing to the start of a 16*16+16*12 = 448 byte stack area;
//   2. hv lowpass "_l2", writing to the original x0, reading from
//      x9 - 2*x3 - 2, with x4 set to whatever the first helper left in x0.
// x3 is loaded with x2 before the first call and is used as the stride
// afterwards, so the code relies on the first helper preserving x3.
// NOTE(review): the layout of the stack area and the meaning of x4 are
// defined by the helpers, which are outside this chunk -- confirm there.
function ff_\type\()_h264_qpel16_mc12_neon, export=1
        mov             x14, x30                // bl below overwrites x30; return goes through x14
        mov             x8,  x0                 // x8 = incoming x0, restored before the second call
        mov             x9,  x1                 // x9 = base pointer for the second call
\type\()_h264_qpel16_mc12:
        lowpass_const   w3                      // v6.S[0] = (20 << 16) | 5; w3 is only scratch here
        mov             x11, sp                 // remember sp so it can be restored on exit
        sub             sp,  sp,  #(16*16+16*12) // 448 bytes, a multiple of 16
        sub             x1,  x1,  x2, lsl #1    // x1 -= 2 * x2
        mov             x0,  sp                 // first call writes at the bottom of the scratch area
        mov             x3,  x2                 // after lowpass_const, which overwrites w3
        bl              put_h264_qpel16_v_lowpass_neon_packed
        mov             x4,  x0                 // x0 as left behind by the first helper
        mov             x0,  x8
        sub             x1,  x9,  x3, lsl #1    // with the next line: x1 = x9 - 2 * x3 - 2
        sub             x1,  x1,  #2
        mov             x2,  x3                 // reload x2 from the copy kept in x3
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             sp,  x11                // release the scratch area
        ret             x14
endfunc
880 
// qpel16 mc22: single call to the hv lowpass helper, no second input.
// The helper is called with x1 = incoming x1 - 2*x2 - 2 and x3 = x2.
// sp is saved in x11 and restored after the call even though this function
// never moves sp itself.
// NOTE(review): presumably the hv helper takes its scratch space from the
// stack and leaves the cleanup to its caller -- the helper is outside this
// chunk, confirm there.
function ff_\type\()_h264_qpel16_mc22_neon, export=1
        mov             x14, x30                // bl below overwrites x30; return goes through x14
        lowpass_const   w3                      // v6.S[0] = (20 << 16) | 5; w3 is only scratch here
        mov             x11, sp                 // snapshot of sp taken before the helper runs
        sub             x1,  x1,  x2, lsl #1    // with the next line: x1 = x1 - 2 * x2 - 2
        sub             x1,  x1,  #2
        mov             x3,  x2                 // after lowpass_const, which overwrites w3
        bl              \type\()_h264_qpel16_hv_lowpass_neon
        mov             sp,  x11 // restore stack
        ret             x14
endfunc
892 
// qpel16 mc32: thin entry point that reuses the shared mc12 body.
// Register state handed to the shared body:
//   x14 = return address (mc12 returns with ret x14)
//   x8  = incoming x0
//   x9  = incoming x1
//   x1  = incoming x1 + 1  (mc12 entered directly uses the unmodified x1)
function ff_\type\()_h264_qpel16_mc32_neon, export=1
        mov             x8,  x0                 // x8 = incoming x0
        mov             x9,  x1                 // x9 = incoming x1 (before the adjustment)
        mov             x14, x30                // return address for the shared body
        add             x1,  x1,  #1            // x1 += 1
        b               \type\()_h264_qpel16_mc12
endfunc
900 
// qpel16 mc03: thin entry point that reuses the shared mc01 body.
// The only difference from entering through mc01 itself is the value handed
// over in x12, which here is x1 + x2 rather than x1. x14 carries the return
// address because the mc01 body returns with ret x14.
function ff_\type\()_h264_qpel16_mc03_neon, export=1
        add             x12, x1,  x2            // x12 = x1 + x2
        mov             x14, x30                // return address for the shared body
        b               \type\()_h264_qpel16_mc01
endfunc
906 
// qpel16 mc13: thin entry point that reuses the shared mc11 body.
// Register state handed to the shared body:
//   x14 = return address (mc11 returns with ret x14)
//   x8  = incoming x0
//   x9  = incoming x1
//   x1  = incoming x1 + x2  (mc11 entered directly uses the unmodified x1)
function ff_\type\()_h264_qpel16_mc13_neon, export=1
        mov             x8,  x0                 // x8 = incoming x0
        mov             x9,  x1                 // x9 = incoming x1 (before the adjustment)
        mov             x14, x30                // return address for the shared body
        add             x1,  x1,  x2            // x1 += x2
        b               \type\()_h264_qpel16_mc11
endfunc
914 
// qpel16 mc23: thin entry point that reuses the shared mc21 body.
// Register state handed to the shared body:
//   x14 = return address (mc21 returns with ret x14)
//   x8  = incoming x0
//   x9  = incoming x1
//   x1  = incoming x1 + x2  (mc21 entered directly uses the unmodified x1)
function ff_\type\()_h264_qpel16_mc23_neon, export=1
        mov             x8,  x0                 // x8 = incoming x0
        mov             x9,  x1                 // x9 = incoming x1 (before the adjustment)
        mov             x14, x30                // return address for the shared body
        add             x1,  x1,  x2            // x1 += x2
        b               \type\()_h264_qpel16_mc21
endfunc
922 
// qpel16 mc33: thin entry point that reuses the shared mc11 body.
// Register state handed to the shared body:
//   x14 = return address (incoming x30; mc11 returns with ret x14)
//   x8  = incoming x0
//   x9  = incoming x1 + 1
//   x1  = incoming x1 + x2
// x9 is formed directly with a single add instead of bumping x1 by one and
// undoing it afterwards; the resulting register values are the same and no
// flags are involved.
function ff_\type\()_h264_qpel16_mc33_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        add             x9,  x1,  #1            // x9 = x1 + 1, x1 itself untouched
        add             x1,  x1,  x2            // x1 += x2
        b               \type\()_h264_qpel16_mc11
endfunc
932 .endm
933 
        // Expand the qpel16 entry points twice, once with "put" and once with
        // "avg" substituted into the function and label names.
        h264_qpel16 put
        h264_qpel16 avg
936