/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
 * The helper macros used below but not defined in this file (function,
 * endfunc, A, T, transpose_8x8, swap4, transpose16_4x4) are presumably
 * provided by these two includes -- confirm when changing either header.
 */
#include "libavutil/arm/asm.S"
#include "neon.S"

        /* H.264 qpel MC */

/*
 * lowpass_const r
 *
 * Load the two filter multipliers into d6 by way of core register r:
 * r = (20 << 16) | 5, so 16-bit lane d6[0] = 5 and lane d6[1] = 20.
 * The lowpass_8 and lowpass_8_1 macros read both lanes.
 *
 * Clobbers: r, low 32 bits of d6.
 */
.macro  lowpass_const   r
        movw            \r,  #5
        movt            \r,  #20
        vmov.32         d6[0], \r
.endm

/*
 * lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
 *
 * Run the 6-tap filter along two rows at once.  For each of the eight
 * output positions i of a row this computes, in 16-bit lanes,
 *
 *   in[i] + in[i+5] + 20 * (in[i+2] + in[i+3]) - 5 * (in[i+1] + in[i+4])
 *
 * r0:r1  d registers holding 16 consecutive bytes of the first row
 * r2:r3  d registers holding 16 consecutive bytes of the second row
 * narrow=1: d0 and d1 are d registers; each sum is rounded, shifted right
 *           by 5 and saturated to unsigned 8 bit (vqrshrun).
 * narrow=0: d0 and d1 are q registers that receive the raw 16-bit sums.
 *
 * Requires d6[0] = 5 and d6[1] = 20 (see lowpass_const).
 * Clobbers: q1, q2, q9, q10, d30, d31, plus q0 and q8 when narrow=1.
 * The two rows are interleaved instruction by instruction; the order is
 * deliberate scheduling and the second row reuses no register of the first.
 */
.macro  lowpass_8       r0,  r1,  r2,  r3,  d0,  d1,  narrow=1
  .if \narrow
        t0 .req q0
        t1 .req q8
  .else
        t0 .req \d0
        t1 .req \d1
  .endif
        /* first row: q1 = in[2] + in[3], q2 = in[1] + in[4] */
        vext.8          d2,  \r0, \r1, #2
        vext.8          d3,  \r0, \r1, #3
        vaddl.u8        q1,  d2,  d3
        vext.8          d4,  \r0, \r1, #1
        vext.8          d5,  \r0, \r1, #4
        vaddl.u8        q2,  d4,  d5
        /* t0 = in[0] + in[5]; only now may t0 overwrite the row inputs */
        vext.8          d30, \r0, \r1, #5
        vaddl.u8        t0,  \r0, d30
        /* second row (q9, q10, t1) interleaved with the first row MACs */
        vext.8          d18, \r2, \r3, #2
        vmla.i16        t0,  q1,  d6[1]
        vext.8          d19, \r2, \r3, #3
        vaddl.u8        q9,  d18, d19
        vext.8          d20, \r2, \r3, #1
        vmls.i16        t0,  q2,  d6[0]
        vext.8          d21, \r2, \r3, #4
        vaddl.u8        q10, d20, d21
        vext.8          d31, \r2, \r3, #5
        vaddl.u8        t1,  \r2, d31
        vmla.i16        t1,  q9,  d6[1]
        vmls.i16        t1,  q10, d6[0]
  .if \narrow
        vqrshrun.s16    \d0, t0,  #5
        vqrshrun.s16    \d1, t1,  #5
  .endif
        .unreq  t0
        .unreq  t1
.endm

/*
 * lowpass_8_1 r0, r1, d0, narrow=1
 *
 * Single-row variant of lowpass_8: the same 6-tap sum
 *
 *   in[i] + in[i+5] + 20 * (in[i+2] + in[i+3]) - 5 * (in[i+1] + in[i+4])
 *
 * over the 16 bytes held in d registers r0:r1.
 * narrow=1: d0 is a d register, result rounded, shifted right by 5 and
 *           saturated to unsigned 8 bit.
 * narrow=0: d0 is a q register receiving the raw 16-bit sums.
 *
 * Requires d6[0] = 5 and d6[1] = 20 (see lowpass_const).
 * Clobbers: q1, q2, d30, plus q0 when narrow=1.
 */
.macro  lowpass_8_1     r0,  r1,  d0,  narrow=1
  .if \narrow
        t0 .req q0
  .else
        t0 .req \d0
  .endif
        /* q1 = in[2] + in[3] */
        vext.8          d2,  \r0, \r1, #2
        vext.8          d3,  \r0, \r1, #3
        vaddl.u8        q1,  d2,  d3
        /* q2 = in[1] + in[4] */
        vext.8          d4,  \r0, \r1, #1
        vext.8          d5,  \r0, \r1, #4
        vaddl.u8        q2,  d4,  d5
        /* t0 = in[0] + in[5] + 20 * q1 - 5 * q2 */
        vext.8          d30, \r0, \r1, #5
        vaddl.u8        t0,  \r0, d30
        vmla.i16        t0,  q1,  d6[1]
        vmls.i16        t0,  q2,  d6[0]
  .if \narrow
        vqrshrun.s16    \d0, t0,  #5
  .endif
        .unreq  t0
.endm

/*
 * lowpass_8.16 r0, r1, l0, h0, l1, h1, d
 *
 * Second filter pass over 16-bit intermediates.  r0:r1 are two q
 * registers holding 16 consecutive signed 16-bit values; l0/h0 and l1/h1
 * must name the low/high d halves of r0 and r1 (as every call site in this
 * file does).  For the eight positions i the macro evaluates in 32 bits
 *
 *   in[i] + in[i+5] + 20 * (in[i+2] + in[i+3]) - 5 * (in[i+1] + in[i+4])
 *
 * then rounds and shifts right by 10, narrows to 16 bit and finally
 * saturates to unsigned 8 bit into d register d.
 *
 * The multiplications are done with shifts and adds (20x = 16x + 4x,
 * 5x = x + 4x), so d6 is not needed as input.
 *
 * Clobbers: q0-q3, q8-q10, q15 and r1 (overwritten with the i+5 window).
 * Note that q3 contains d6, so the constants set by lowpass_const are
 * destroyed by this macro.
 */
.macro  lowpass_8.16    r0,  r1,  l0,  h0,  l1,  h1,  d
        /* q9 (low four) and q1 (high four) = in[2] + in[3] */
        vext.16         q1,  \r0, \r1, #2
        vext.16         q0,  \r0, \r1, #3
        vaddl.s16       q9,  d2,  d0
        vext.16         q2,  \r0, \r1, #1
        vaddl.s16       q1,  d3,  d1
        /* q10 (low four) and q2 (high four) = in[1] + in[4] */
        vext.16         q3,  \r0, \r1, #4
        vaddl.s16       q10, d4,  d6
        /* r1 now holds in[5..12]; q8 / q0 = in[0] + in[5] */
        vext.16         \r1, \r0, \r1, #5
        vaddl.s16       q2,  d5,  d7
        vaddl.s16       q0,  \h0, \h1
        vaddl.s16       q8,  \l0, \l1

        /* low half: q9 = 20 * q9, q10 = 5 * q10 */
        vshl.i32        q3,  q9,  #4
        vshl.i32        q9,  q9,  #2
        vshl.i32        q15, q10, #2
        vadd.i32        q9,  q9,  q3
        vadd.i32        q10, q10, q15

        /* high half: q1 = 20 * q1, q2 = 5 * q2 */
        vshl.i32        q3,  q1,  #4
        vshl.i32        q1,  q1,  #2
        vshl.i32        q15, q2,  #2
        vadd.i32        q1,  q1,  q3
        vadd.i32        q2,  q2,  q15

        vadd.i32        q9,  q9,  q8
        vsub.i32        q9,  q9,  q10

        vadd.i32        q1,  q1,  q0
        vsub.i32        q1,  q1,  q2

        vrshrn.s32      d18, q9,  #10
        vrshrn.s32      d19, q1,  #10

        vqmovun.s16     \d,  q9
.endm

/*
 * put_h264_qpel16_h_lowpass_neon_packed
 *
 * Horizontally filter a 16x16 block into a packed buffer as two 8x16
 * column blocks stored one after the other (destination stride forced
 * to 8, r0 is never rewound between the two passes).
 *
 * In:  r0 = destination buffer, r1 = source, r2 = source stride,
 *      d6 = filter constants (lowpass_const)
 * Out: r0 points just past the second column block
 * Clobbers: r3, r4 (holds the return address), r12, plus everything
 *           put_h264_qpel8_h_lowpass_neon clobbers.
 */
function put_h264_qpel16_h_lowpass_neon_packed
        mov             r4,  lr
        mov             r12, #16
        mov             r3,  #8
        bl              put_h264_qpel8_h_lowpass_neon
        /* back up 16 source rows and step to the right 8 columns */
        sub             r1,  r1,  r2, lsl #4
        add             r1,  r1,  #8
        mov             r12, #16
        /* tail call: the 8-wide routine returns straight to our caller */
        mov             lr,  r4
        b               put_h264_qpel8_h_lowpass_neon
endfunc

/*
 * h264_qpel_h_lowpass type      (type is put or avg)
 *
 * Emits two routines.
 *
 * <type>_h264_qpel16_h_lowpass_neon
 *   Filters the left 8 columns for 16 rows via the 8-wide routine, rewinds
 *   both pointers by 16 rows, steps 8 columns right, restores lr and then
 *   FALLS THROUGH into the 8-wide routine emitted directly below (there is
 *   no branch after the pop), which filters the right half and returns to
 *   the original caller.  The two functions must stay adjacent.
 *
 * <type>_h264_qpel8_h_lowpass_neon
 *   In:  r0 = destination (64-bit aligned per the :64 hints),
 *        r1 = source, r2 = source stride, r3 = destination stride,
 *        r12 = row count, consumed two rows per iteration,
 *        d6 = filter constants (lowpass_const)
 *   Out: r0 and r1 advanced by the rows processed, r12 = 0
 *   For avg the result is rounding-averaged with what is already at r0.
 *   Clobbers: q0-q2, q8-q10, d30, d31, flags.
 */
.macro  h264_qpel_h_lowpass type
function \type\()_h264_qpel16_h_lowpass_neon
        push            {lr}
        mov             r12, #16
        bl              \type\()_h264_qpel8_h_lowpass_neon
        sub             r0,  r0,  r3, lsl #4
        sub             r1,  r1,  r2, lsl #4
        add             r0,  r0,  #8
        add             r1,  r1,  #8
        mov             r12, #16
        pop             {lr}
        /* no return here: execution continues in the function below */
endfunc

function \type\()_h264_qpel8_h_lowpass_neon
1:      vld1.8          {d0, d1},  [r1], r2
        vld1.8          {d16,d17}, [r1], r2
        /* flags set here survive the NEON code and feed the bne below */
        subs            r12, r12, #2
        lowpass_8       d0,  d1,  d16, d17, d0,  d16
  .ifc \type,avg
        /* fetch the two existing destination rows, then undo the advance */
        vld1.8          {d2},     [r0,:64], r3
        vld1.8          {d3},     [r0,:64]
        vrhadd.u8       d0,  d0,  d2
        vrhadd.u8       d16, d16, d3
        sub             r0,  r0,  r3
  .endif
        vst1.8          {d0},     [r0,:64], r3
        vst1.8          {d16},    [r0,:64], r3
        bne             1b
        bx              lr
endfunc
.endm

        h264_qpel_h_lowpass put
        h264_qpel_h_lowpass avg

/*
 * h264_qpel_h_lowpass_l2 type      (type is put or avg)
 *
 * Horizontal filter whose result is rounding-averaged with a second
 * source before being stored.  Emits two routines.
 *
 * <type>_h264_qpel16_h_lowpass_l2_neon
 *   Runs the 8-wide routine on the left half for 16 rows, rewinds r0, r1
 *   and r3 by 16 rows (all three use stride r2), steps each 8 columns
 *   right, restores lr and FALLS THROUGH into the 8-wide routine emitted
 *   directly below.  The two functions must stay adjacent.
 *
 * <type>_h264_qpel8_h_lowpass_l2_neon
 *   In:  r0 = destination (64-bit aligned per the :64 hints),
 *        r1 = source for the filter, r3 = second source (8 bytes per row),
 *        r2 = stride shared by r0, r1 and r3,
 *        r12 = row count, consumed two rows per iteration,
 *        d6 = filter constants (lowpass_const)
 *   Out: r0, r1, r3 advanced by the rows processed, r12 = 0
 *   For avg the result is additionally averaged with what is at r0.
 *   Clobbers: q0-q2, q8-q10, q14, d30, d31, flags.
 */
.macro  h264_qpel_h_lowpass_l2 type
function \type\()_h264_qpel16_h_lowpass_l2_neon
        push            {lr}
        mov             r12, #16
        bl              \type\()_h264_qpel8_h_lowpass_l2_neon
        sub             r0,  r0,  r2, lsl #4
        sub             r1,  r1,  r2, lsl #4
        sub             r3,  r3,  r2, lsl #4
        add             r0,  r0,  #8
        add             r1,  r1,  #8
        add             r3,  r3,  #8
        mov             r12, #16
        pop             {lr}
        /* no return here: execution continues in the function below */
endfunc

function \type\()_h264_qpel8_h_lowpass_l2_neon
1:      vld1.8          {d0, d1},  [r1], r2
        vld1.8          {d16,d17}, [r1], r2
        /* q14 = two rows of the second source */
        vld1.8          {d28},     [r3], r2
        vld1.8          {d29},     [r3], r2
        subs            r12, r12, #2
        /* both filtered rows land in q0 (d0 = first row, d1 = second) */
        lowpass_8       d0,  d1,  d16, d17, d0,  d1
        vrhadd.u8       q0,  q0,  q14
  .ifc \type,avg
        /* fetch the two existing destination rows, then undo the advance */
        vld1.8          {d2},      [r0,:64], r2
        vld1.8          {d3},      [r0,:64]
        vrhadd.u8       q0,  q0,  q1
        sub             r0,  r0,  r2
  .endif
        vst1.8          {d0},      [r0,:64], r2
        vst1.8          {d1},      [r0,:64], r2
        bne             1b
        bx              lr
endfunc
.endm

        h264_qpel_h_lowpass_l2 put
        h264_qpel_h_lowpass_l2 avg

/*
 * put_h264_qpel16_v_lowpass_neon_packed
 *
 * Vertically filter a 16x16 block into a packed buffer as four 8x8
 * blocks stored back to back in the order top-left, bottom-left,
 * top-right, bottom-right (destination stride forced to 8, r0 is never
 * rewound).
 *
 * In:  r0 = destination buffer, r1 = source (first of the 13 rows read
 *      per 8x8 block), r3 = source stride, d6 = filter constants
 * Out: r0 points just past the fourth 8x8 block
 * Clobbers: r2, r4 (holds the return address), plus everything
 *           put_h264_qpel8_v_lowpass_neon clobbers.
 *
 * Each 8x8 call leaves r1 advanced by 12 rows, hence the 4-row rewinds
 * to land 8 rows below the previous block.
 */
function put_h264_qpel16_v_lowpass_neon_packed
        mov             r4,  lr
        mov             r2,  #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              put_h264_qpel8_v_lowpass_neon
        /* r1 is 20 rows past the start: go back to the top, 8 columns right */
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        /* tail call for the last block */
        mov             lr,  r4
        b               put_h264_qpel8_v_lowpass_neon
endfunc

/*
 * h264_qpel_v_lowpass type      (type is put or avg)
 *
 * Vertical 6-tap filter.  Emits two routines.
 *
 * <type>_h264_qpel16_v_lowpass_neon
 *   Calls the 8x8 routine for the top-left, bottom-left and top-right
 *   blocks, then restores lr from r4 and FALLS THROUGH into the 8x8
 *   routine emitted directly below for the bottom-right block.  The two
 *   functions must stay adjacent.  Clobbers r4.
 *
 * <type>_h264_qpel8_v_lowpass_neon
 *   In:  r0 = destination (64-bit aligned per the :64 hints),
 *        r1 = first of 13 source rows (8 bytes each), r3 = source stride,
 *        r2 = destination stride, d6 = filter constants (lowpass_const)
 *   Out: r0 advanced by 8 rows, r1 advanced by 12 rows
 *   The rows are transposed so that the row-wise lowpass_8 macro can be
 *   used on columns, then transposed back.  For avg the result is
 *   rounding-averaged with what is already at r0.
 *   Clobbers: q0-q2, q4-q15 (d8-d15 are used without being saved here;
 *   the entry points in this file vpush/vpop them around the call).
 */
.macro  h264_qpel_v_lowpass type
function \type\()_h264_qpel16_v_lowpass_neon
        mov             r4,  lr
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              \type\()_h264_qpel8_v_lowpass_neon
        /* destination: up 16 rows, right 8 columns */
        sub             r0,  r0,  r2, lsl #4
        add             r0,  r0,  #8
        /* source: up 20 rows, right 8 columns */
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
        /* no return here: execution continues in the function below */
endfunc

function \type\()_h264_qpel8_v_lowpass_neon
        /* rows 0-7 go to the low halves of q4-q7, q11-q14 ... */
        vld1.8          {d8},  [r1], r3
        vld1.8          {d10}, [r1], r3
        vld1.8          {d12}, [r1], r3
        vld1.8          {d14}, [r1], r3
        vld1.8          {d22}, [r1], r3
        vld1.8          {d24}, [r1], r3
        vld1.8          {d26}, [r1], r3
        vld1.8          {d28}, [r1], r3
        /* ... rows 8-12 to the high halves; the last load does not advance r1 */
        vld1.8          {d9},  [r1], r3
        vld1.8          {d11}, [r1], r3
        vld1.8          {d13}, [r1], r3
        vld1.8          {d15}, [r1], r3
        vld1.8          {d23}, [r1]

        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
        lowpass_8       d8,  d9,  d10, d11, d8,  d10
        lowpass_8       d12, d13, d14, d15, d12, d14
        lowpass_8       d22, d23, d24, d25, d22, d24
        lowpass_8       d26, d27, d28, d29, d26, d28
        transpose_8x8   d8,  d10, d12, d14, d22, d24, d26, d28

  .ifc \type,avg
        /* existing destination rows go into the free odd d registers */
        vld1.8          {d9},  [r0,:64], r2
        vld1.8          {d11}, [r0,:64], r2
        vld1.8          {d13}, [r0,:64], r2
        vrhadd.u8       d8,  d8,  d9
        vld1.8          {d15}, [r0,:64], r2
        vrhadd.u8       d10, d10, d11
        vld1.8          {d23}, [r0,:64], r2
        vrhadd.u8       d12, d12, d13
        vld1.8          {d25}, [r0,:64], r2
        vrhadd.u8       d14, d14, d15
        vld1.8          {d27}, [r0,:64], r2
        vrhadd.u8       d22, d22, d23
        vld1.8          {d29}, [r0,:64], r2
        vrhadd.u8       d24, d24, d25
        vrhadd.u8       d26, d26, d27
        vrhadd.u8       d28, d28, d29
        /* rewind the 8 rows just read */
        sub             r0,  r0,  r2,  lsl #3
  .endif

        vst1.8          {d8},  [r0,:64], r2
        vst1.8          {d10}, [r0,:64], r2
        vst1.8          {d12}, [r0,:64], r2
        vst1.8          {d14}, [r0,:64], r2
        vst1.8          {d22}, [r0,:64], r2
        vst1.8          {d24}, [r0,:64], r2
        vst1.8          {d26}, [r0,:64], r2
        vst1.8          {d28}, [r0,:64], r2

        bx              lr
endfunc
.endm

        h264_qpel_v_lowpass put
        h264_qpel_v_lowpass avg

/*
 * h264_qpel_v_lowpass_l2 type      (type is put or avg)
 *
 * Vertical filter whose result is rounding-averaged with a second source
 * before being stored.  Note the stride roles differ from the plain
 * v_lowpass routines: here r3 is used for both r1 and r0, r2 for r12.
 *
 * <type>_h264_qpel16_v_lowpass_l2_neon
 *   Calls the 8x8 routine for the top-left, bottom-left and top-right
 *   blocks, then restores lr from r4 and FALLS THROUGH into the 8x8
 *   routine emitted directly below for the bottom-right block.  The two
 *   functions must stay adjacent.  Clobbers r4.
 *
 * <type>_h264_qpel8_v_lowpass_l2_neon
 *   In:  r0 = destination (64-bit aligned per the :64 hints),
 *        r1 = first of 13 source rows (8 bytes each),
 *        r3 = stride for r1 and r0,
 *        r12 = second source (8 rows of 8 bytes), r2 = its stride,
 *        d6 = filter constants (lowpass_const)
 *   Out: r0 and r12 advanced by 8 rows, r1 advanced by 12 rows
 *   For avg the result is additionally averaged with what is at r0.
 *   Clobbers: q0-q2, q4-q15 (d8-d15 are used without being saved here;
 *   the entry points in this file vpush/vpop them around the call).
 */
.macro  h264_qpel_v_lowpass_l2 type
function \type\()_h264_qpel16_v_lowpass_l2_neon
        mov             r4,  lr
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        /* destination and second source: up 16 rows, right 8 columns */
        sub             r0,  r0,  r3, lsl #4
        sub             r12, r12, r2, lsl #4
        add             r0,  r0,  #8
        add             r12, r12, #8
        /* filter source: up 20 rows, right 8 columns */
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
        /* no return here: execution continues in the function below */
endfunc

function \type\()_h264_qpel8_v_lowpass_l2_neon
        /* rows 0-7 go to the low halves of q4-q7, q11-q14 ... */
        vld1.8          {d8},  [r1], r3
        vld1.8          {d10}, [r1], r3
        vld1.8          {d12}, [r1], r3
        vld1.8          {d14}, [r1], r3
        vld1.8          {d22}, [r1], r3
        vld1.8          {d24}, [r1], r3
        vld1.8          {d26}, [r1], r3
        vld1.8          {d28}, [r1], r3
        /* ... rows 8-12 to the high halves; the last load does not advance r1 */
        vld1.8          {d9},  [r1], r3
        vld1.8          {d11}, [r1], r3
        vld1.8          {d13}, [r1], r3
        vld1.8          {d15}, [r1], r3
        vld1.8          {d23}, [r1]

        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
        /* results are packed pairwise so that q4, q6, q11, q13 hold 2 rows each */
        lowpass_8       d8,  d9,  d10, d11, d8,  d9
        lowpass_8       d12, d13, d14, d15, d12, d13
        lowpass_8       d22, d23, d24, d25, d22, d23
        lowpass_8       d26, d27, d28, d29, d26, d27
        transpose_8x8   d8,  d9,  d12, d13, d22, d23, d26, d27

        /* average with the second source, two rows per q register */
        vld1.8          {d0},  [r12], r2
        vld1.8          {d1},  [r12], r2
        vld1.8          {d2},  [r12], r2
        vld1.8          {d3},  [r12], r2
        vld1.8          {d4},  [r12], r2
        vrhadd.u8       q0,  q0,  q4
        vld1.8          {d5},  [r12], r2
        vrhadd.u8       q1,  q1,  q6
        /* q4 was consumed above, so q5 is free for rows 6 and 7 */
        vld1.8          {d10}, [r12], r2
        vrhadd.u8       q2,  q2,  q11
        vld1.8          {d11}, [r12], r2
        vrhadd.u8       q5,  q5,  q13

  .ifc \type,avg
        /* d16/d17 alternate as landing registers for the destination rows */
        vld1.8          {d16}, [r0,:64], r3
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d0,  d0,  d16
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d1,  d1,  d17
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d2,  d2,  d16
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d3,  d3,  d17
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d4,  d4,  d16
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d5,  d5,  d17
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d10, d10, d16
        vrhadd.u8       d11, d11, d17
        /* rewind the 8 rows just read */
        sub             r0,  r0,  r3,  lsl #3
  .endif

        vst1.8          {d0},  [r0,:64], r3
        vst1.8          {d1},  [r0,:64], r3
        vst1.8          {d2},  [r0,:64], r3
        vst1.8          {d3},  [r0,:64], r3
        vst1.8          {d4},  [r0,:64], r3
        vst1.8          {d5},  [r0,:64], r3
        vst1.8          {d10}, [r0,:64], r3
        vst1.8          {d11}, [r0,:64], r3

        bx              lr
endfunc
.endm

        h264_qpel_v_lowpass_l2 put
        h264_qpel_v_lowpass_l2 avg

/*
 * put_h264_qpel8_hv_lowpass_neon_top
 *
 * Shared core of the 2-D (horizontal then vertical) filter for one 8x8
 * block.  13 source rows are filtered horizontally to 16-bit precision
 * (no narrowing), transposed, and filtered again with lowpass_8.16.
 * Nothing is stored to the destination; the caller does that.
 *
 * In:  r1 = first of 13 source rows (16 bytes read per row),
 *      r3 = source stride,
 *      r4 = scratch buffer, 16-byte aligned (:128 hints); 12 * 16 bytes
 *           are written
 * Out: output rows 0..7 in d12, d13, d14, d15, d8, d9, d10, d11
 *      r1 advanced by 12 rows, r4 back at the start of the scratch
 *      buffer, r12 = -16
 * Clobbers: q0-q15, r12, flags.  d6 is loaded here (lowpass_const) and
 * destroyed again by lowpass_8.16, so callers need not set it up.
 */
function put_h264_qpel8_hv_lowpass_neon_top
        /* r12 is only a temporary for the constant, then the row counter */
        lowpass_const   r12
        mov             r12, #12
        /* horizontal pass: rows 0-11, two per iteration, 16-bit results to scratch */
1:      vld1.8          {d0, d1},  [r1], r3
        vld1.8          {d16,d17}, [r1], r3
        subs            r12, r12, #2
        lowpass_8       d0,  d1,  d16, d17, q11, q12, narrow=0
        vst1.8          {d22-d25}, [r4,:128]!
        bne             1b

        /* row 12 stays in q12 and is never written to scratch */
        vld1.8          {d0, d1},  [r1]
        lowpass_8_1     d0,  d1,  q12, narrow=0

        /* reload rows 11 down to 0 walking backwards through the scratch */
        mov             r12, #-16
        add             r4,  r4,  r12
        vld1.8          {d30,d31}, [r4,:128], r12
        vld1.8          {d20,d21}, [r4,:128], r12
        vld1.8          {d18,d19}, [r4,:128], r12
        vld1.8          {d16,d17}, [r4,:128], r12
        vld1.8          {d14,d15}, [r4,:128], r12
        vld1.8          {d12,d13}, [r4,:128], r12
        vld1.8          {d10,d11}, [r4,:128], r12
        vld1.8          {d8, d9},  [r4,:128], r12
        vld1.8          {d6, d7},  [r4,:128], r12
        vld1.8          {d4, d5},  [r4,:128], r12
        vld1.8          {d2, d3},  [r4,:128], r12
        vld1.8          {d0, d1},  [r4,:128]

        /* transpose the 16-bit data so that columns become register rows */
        swap4           d1,  d3,  d5,  d7,  d8,  d10, d12, d14
        transpose16_4x4 q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        swap4           d17, d19, d21, d31, d24, d26, d28, d22
        transpose16_4x4 q8,  q9,  q10, q15, q12, q13, q14, q11

        /* park the first four transposed columns: lowpass_8.16 clobbers them */
        vst1.8          {d30,d31}, [r4,:128]!
        vst1.8          {d6, d7},  [r4,:128]!
        vst1.8          {d20,d21}, [r4,:128]!
        vst1.8          {d4, d5},  [r4,:128]!
        vst1.8          {d18,d19}, [r4,:128]!
        vst1.8          {d2, d3},  [r4,:128]!
        vst1.8          {d16,d17}, [r4,:128]!
        vst1.8          {d0, d1},  [r4,:128]

        /* vertical pass on the four columns still in registers */
        lowpass_8.16    q4,  q12, d8,  d9,  d24, d25, d8
        lowpass_8.16    q5,  q13, d10, d11, d26, d27, d9
        lowpass_8.16    q6,  q14, d12, d13, d28, d29, d10
        lowpass_8.16    q7,  q11, d14, d15, d22, d23, d11

        /* vertical pass on the parked columns, read back in reverse order */
        vld1.8          {d16,d17}, [r4,:128], r12
        vld1.8          {d30,d31}, [r4,:128], r12
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d12
        vld1.8          {d16,d17}, [r4,:128], r12
        vld1.8          {d30,d31}, [r4,:128], r12
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d13
        vld1.8          {d16,d17}, [r4,:128], r12
        vld1.8          {d30,d31}, [r4,:128], r12
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d14
        vld1.8          {d16,d17}, [r4,:128], r12
        vld1.8          {d30,d31}, [r4,:128]
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d15

        /* back to row-major pixel order */
        transpose_8x8   d12, d13, d14, d15, d8,  d9,  d10, d11

        bx              lr
endfunc

/*
 * h264_qpel8_hv_lowpass type      (type is put or avg)
 *
 * <type>_h264_qpel8_hv_lowpass_neon
 *   Runs the shared 2-D filter core and stores the 8x8 result.
 *
 *   In:  r0 = destination (64-bit aligned per the :64 hints),
 *        r2 = destination stride,
 *        r1, r3, r4 as required by put_h264_qpel8_hv_lowpass_neon_top
 *   Out: r0 advanced by 8 rows; r1, r4, r12 as left by the core
 *   For avg the result is rounding-averaged with what is already at r0.
 *   Clobbers: r10 (holds the return address), r12, q0-q15 (d8-d15 are
 *   not saved here; the entry points vpush/vpop them).
 */
.macro  h264_qpel8_hv_lowpass type
function \type\()_h264_qpel8_hv_lowpass_neon
        mov             r10, lr
        bl              put_h264_qpel8_hv_lowpass_neon_top
  .ifc \type,avg
        /* result rows 0-7 live in q6, q7, q4, q5; destination rows in q0-q3 */
        vld1.8          {d0},      [r0,:64], r2
        vld1.8          {d1},      [r0,:64], r2
        vld1.8          {d2},      [r0,:64], r2
        vld1.8          {d3},      [r0,:64], r2
        vrhadd.u8       q6,  q6,  q0
        vld1.8          {d4},      [r0,:64], r2
        vld1.8          {d5},      [r0,:64], r2
        vrhadd.u8       q7,  q7,  q1
        vld1.8          {d6},      [r0,:64], r2
        vld1.8          {d7},      [r0,:64], r2
        vrhadd.u8       q4,  q4,  q2
        vrhadd.u8       q5,  q5,  q3
        /* rewind the 8 rows just read */
        sub             r0,  r0,  r2,  lsl #3
  .endif

        vst1.8          {d12},     [r0,:64], r2
        vst1.8          {d13},     [r0,:64], r2
        vst1.8          {d14},     [r0,:64], r2
        vst1.8          {d15},     [r0,:64], r2
        vst1.8          {d8},      [r0,:64], r2
        vst1.8          {d9},      [r0,:64], r2
        vst1.8          {d10},     [r0,:64], r2
        vst1.8          {d11},     [r0,:64], r2

        mov             lr,  r10
        bx              lr
endfunc
.endm

        h264_qpel8_hv_lowpass put
        h264_qpel8_hv_lowpass avg

/*
 * h264_qpel8_hv_lowpass_l2 type      (type is put or avg)
 *
 * <type>_h264_qpel8_hv_lowpass_l2_neon
 *   Runs the shared 2-D filter core and rounding-averages its 8x8 result
 *   with a packed 8x8 second source before storing.  Note that the
 *   destination stride is r3 here (r2 in the non-l2 variant).
 *
 *   In:  r0 = destination (64-bit aligned per the :64 hints),
 *        r2 = second source: 64 contiguous bytes, 16-byte aligned
 *             (:128 hints),
 *        r3 = stride, used for the source rows by the core and for the
 *             destination here,
 *        r1, r4 as required by put_h264_qpel8_hv_lowpass_neon_top
 *   Out: r0 advanced by 8 rows, r2 advanced by 64 bytes
 *   For avg the result is additionally averaged with what is at r0.
 *   Clobbers: r10 (holds the return address), r12, q0-q15 (d8-d15 are
 *   not saved here; the entry points vpush/vpop them).
 */
.macro  h264_qpel8_hv_lowpass_l2 type
function \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             r10, lr
        bl              put_h264_qpel8_hv_lowpass_neon_top

        /* core result rows 0-7 are in q6, q7, q4, q5 (two rows each) */
        vld1.8          {d0, d1},  [r2,:128]!
        vld1.8          {d2, d3},  [r2,:128]!
        vrhadd.u8       q0,  q0,  q6
        vld1.8          {d4, d5},  [r2,:128]!
        vrhadd.u8       q1,  q1,  q7
        vld1.8          {d6, d7},  [r2,:128]!
        vrhadd.u8       q2,  q2,  q4
        vrhadd.u8       q3,  q3,  q5
  .ifc \type,avg
        vld1.8          {d16},     [r0,:64], r3
        vld1.8          {d17},     [r0,:64], r3
        vld1.8          {d18},     [r0,:64], r3
        vld1.8          {d19},     [r0,:64], r3
        vrhadd.u8       q0,  q0,  q8
        vld1.8          {d20},     [r0,:64], r3
        vld1.8          {d21},     [r0,:64], r3
        vrhadd.u8       q1,  q1,  q9
        vld1.8          {d22},     [r0,:64], r3
        vld1.8          {d23},     [r0,:64], r3
        vrhadd.u8       q2,  q2,  q10
        vrhadd.u8       q3,  q3,  q11
        /* rewind the 8 rows just read */
        sub             r0,  r0,  r3,  lsl #3
  .endif
        vst1.8          {d0},      [r0,:64], r3
        vst1.8          {d1},      [r0,:64], r3
        vst1.8          {d2},      [r0,:64], r3
        vst1.8          {d3},      [r0,:64], r3
        vst1.8          {d4},      [r0,:64], r3
        vst1.8          {d5},      [r0,:64], r3
        vst1.8          {d6},      [r0,:64], r3
        vst1.8          {d7},      [r0,:64], r3

        mov             lr,  r10
        bx              lr
endfunc
.endm

        h264_qpel8_hv_lowpass_l2 put
        h264_qpel8_hv_lowpass_l2 avg

/*
 * h264_qpel16_hv type      (type is put or avg)
 *
 * 16x16 wrappers that run the 8x8 2-D routines four times in the order
 * top-left, bottom-left, top-right, bottom-right.  Each 8x8 call leaves
 * r1 advanced by 12 rows and r0 by 8 rows, which is what the pointer
 * adjustments between the calls compensate for.  The last block is a
 * tail call.  Both routines keep the return address in r9 (the 8x8
 * routines use r10 for theirs).
 *
 * <type>_h264_qpel16_hv_lowpass_neon
 *   In: r0 = destination, r2 = destination stride, r1 = source,
 *       r3 = source stride, r4 = scratch for the 8x8 core
 *
 * <type>_h264_qpel16_hv_lowpass_l2_neon
 *   In: r0 = destination, r1 = source, r3 = stride for both,
 *       r4 = scratch for the 8x8 core; the packed second source is taken
 *       to start 256 bytes below r4 (r2 is overwritten with r4 - 256 and
 *       then consumed 64 bytes per 8x8 block)
 */
.macro  h264_qpel16_hv  type
function \type\()_h264_qpel16_hv_lowpass_neon
        mov             r9,  lr
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        /* source: up 20 rows, right 8 columns */
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        /* destination: up 16 rows, right 8 columns */
        sub             r0,  r0,  r2, lsl #4
        add             r0,  r0,  #8
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r9
        b               \type\()_h264_qpel8_hv_lowpass_neon
endfunc

function \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             r9,  lr
        /* second source sits immediately below the scratch area */
        sub             r2,  r4,  #256
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        /* source: up 20 rows, right 8 columns */
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        /* destination: up 16 rows, right 8 columns */
        sub             r0,  r0,  r3, lsl #4
        add             r0,  r0,  #8
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r9
        b               \type\()_h264_qpel8_hv_lowpass_l2_neon
endfunc
.endm

        h264_qpel16_hv put
        h264_qpel16_hv avg

/*
 * h264_qpel8 type      (type is put or avg)
 *
 * Exported 8x8 entry points ff_<type>_h264_qpel8_mcXY_neon.
 *
 * On entry to every entry point: r0 = destination, r1 = source,
 * r2 = stride (used for both).  Presumably these match a C prototype of
 * the form (dst, src, stride) -- confirm against the C-side declaration.
 *
 * As far as the code shows, X selects the horizontal and Y the vertical
 * treatment: 0 = none, 2 = 6-tap filter, 1 and 3 = filter result averaged
 * with a neighbouring sample plane.
 *
 * Several entry points only differ in their pointer setup and then jump
 * to a local label (<type>_h264_qpel8_mc01 / mc11 / mc21 / mc12) placed
 * right after the push in the canonical variant.  Every such entry point
 * must therefore push exactly the same register list as the function
 * that owns the label.
 *
 * The variants using a temporary buffer keep the original sp in r11,
 * align sp down to 16 bytes, allocate the buffer and only then vpush
 * d8-d15; "add r12, sp, #64" later skips those 64 bytes of saved NEON
 * registers to reach the buffer again.  "ldrd r0, r1, [r11], #8" reloads
 * the saved dst/src pair and bumps r11 past it so that "mov sp, r11"
 * followed by the final pop unwinds the frame.
 */
.macro  h264_qpel8      type
/* mc10: horizontal filter averaged with the unfiltered source */
function ff_\type\()_h264_qpel8_mc10_neon, export=1
        lowpass_const   r3
        mov             r3,  r1
        /* the filter window starts two pixels to the left */
        sub             r1,  r1,  #2
        mov             r12, #8
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

/* mc20: plain horizontal filter, destination stride = source stride */
function ff_\type\()_h264_qpel8_mc20_neon, export=1
        lowpass_const   r3
        sub             r1,  r1,  #2
        mov             r3,  r2
        mov             r12, #8
        b               \type\()_h264_qpel8_h_lowpass_neon
endfunc

/* mc30: horizontal filter averaged with the source one pixel to the right */
function ff_\type\()_h264_qpel8_mc30_neon, export=1
        lowpass_const   r3
        add             r3,  r1,  #1
        sub             r1,  r1,  #2
        mov             r12, #8
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

/* mc01: vertical filter averaged with the unfiltered source (r12) */
function ff_\type\()_h264_qpel8_mc01_neon, export=1
        push            {lr}
        mov             r12, r1
\type\()_h264_qpel8_mc01:
        lowpass_const   r3
        mov             r3,  r2
        /* the filter window starts two rows above */
        sub             r1,  r1,  r2, lsl #1
        vpush           {d8-d15}
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        vpop            {d8-d15}
        pop             {pc}
endfunc

/*
 * mc11: horizontal filter into a 64-byte stack buffer (stride 8), then
 * vertical filter of the saved source averaged with that buffer.
 */
function ff_\type\()_h264_qpel8_mc11_neon, export=1
        push            {r0, r1, r11, lr}
\type\()_h264_qpel8_mc11:
        lowpass_const   r3
        mov             r11, sp
A       bic             sp,  sp,  #15
T       bic             r0,  r11, #15
T       mov             sp,  r0
        sub             sp,  sp,  #64
        mov             r0,  sp
        sub             r1,  r1,  #2
        mov             r3,  #8
        mov             r12, #8
        vpush           {d8-d15}
        bl              put_h264_qpel8_h_lowpass_neon
        ldrd            r0,  r1,  [r11], #8
        mov             r3,  r2
        /* r12 = stack buffer, just above the vpushed registers */
        add             r12, sp,  #64
        sub             r1,  r1,  r2, lsl #1
        /* stride of the packed buffer */
        mov             r2,  #8
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r11, pc}
endfunc

/*
 * mc21: horizontal filter into a 64-byte stack buffer, then 2-D filter
 * averaged with that buffer.  The stack area is 8*8 bytes for the buffer
 * followed by 16*12 bytes of scratch for the 2-D core.
 */
function ff_\type\()_h264_qpel8_mc21_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
\type\()_h264_qpel8_mc21:
        lowpass_const   r3
        mov             r11, sp
A       bic             sp,  sp,  #15
T       bic             r0,  r11, #15
T       mov             sp,  r0
        sub             sp,  sp,  #(8*8+16*12)
        sub             r1,  r1,  #2
        mov             r3,  #8
        mov             r0,  sp
        mov             r12, #8
        vpush           {d8-d15}
        bl              put_h264_qpel8_h_lowpass_neon
        /* r0 now points past the 64-byte buffer: that is the 2-D scratch */
        mov             r4,  r0
        ldrd            r0,  r1,  [r11], #8
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        /* r2 = start of the horizontally filtered buffer */
        sub             r2,  r4,  #64
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r10, r11, pc}
endfunc

/*
 * mc31: like mc11, but the saved source (used for the vertical pass) is
 * src + 1 while the horizontal pass still starts from src.
 */
function ff_\type\()_h264_qpel8_mc31_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r11, lr}
        sub             r1,  r1,  #1
        b               \type\()_h264_qpel8_mc11
endfunc

/* mc02: plain vertical filter, source stride = destination stride */
function ff_\type\()_h264_qpel8_mc02_neon, export=1
        push            {lr}
        lowpass_const   r3
        sub             r1,  r1,  r2, lsl #1
        mov             r3,  r2
        vpush           {d8-d15}
        bl              \type\()_h264_qpel8_v_lowpass_neon
        vpop            {d8-d15}
        pop             {pc}
endfunc

/*
 * mc12: vertical filter into a 64-byte stack buffer (stride 8), then 2-D
 * filter averaged with that buffer.  Same stack layout as mc21.
 */
function ff_\type\()_h264_qpel8_mc12_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
\type\()_h264_qpel8_mc12:
        lowpass_const   r3
        mov             r11, sp
A       bic             sp,  sp,  #15
T       bic             r0,  r11, #15
T       mov             sp,  r0
        sub             sp,  sp,  #(8*8+16*12)
        sub             r1,  r1,  r2, lsl #1
        /* r3 keeps the real stride from here on, r2 becomes the buffer stride */
        mov             r3,  r2
        mov             r2,  #8
        mov             r0,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel8_v_lowpass_neon
        /* r0 now points past the 64-byte buffer: that is the 2-D scratch */
        mov             r4,  r0
        ldrd            r0,  r1,  [r11], #8
        sub             r1,  r1,  r3, lsl #1
        sub             r1,  r1,  #2
        sub             r2,  r4,  #64
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r10, r11, pc}
endfunc

/*
 * mc22: plain 2-D filter; only the 16*12 bytes of scratch are allocated.
 * No lowpass_const here: the 2-D core loads the constants itself.
 */
function ff_\type\()_h264_qpel8_mc22_neon, export=1
        push            {r4, r10, r11, lr}
        mov             r11, sp
A       bic             sp,  sp,  #15
T       bic             r4,  r11, #15
T       mov             sp,  r4
        /* window starts two rows up and two pixels left */
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        sub             sp,  sp,  #(16*12)
        mov             r4,  sp
        vpush           {d8-d15}
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r10, r11, pc}
endfunc

/*
 * mc32: like mc12, but the vertical pass reads from src + 1 while the
 * saved source (used for the 2-D pass) is the original src.
 */
function ff_\type\()_h264_qpel8_mc32_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
        add             r1,  r1,  #1
        b               \type\()_h264_qpel8_mc12
endfunc

/* mc03: like mc01, but averaged with the source one row down */
function ff_\type\()_h264_qpel8_mc03_neon, export=1
        push            {lr}
        add             r12, r1,  r2
        b               \type\()_h264_qpel8_mc01
endfunc

/* mc13: like mc11, but the horizontal pass reads one row down */
function ff_\type\()_h264_qpel8_mc13_neon, export=1
        push            {r0, r1, r11, lr}
        add             r1,  r1,  r2
        b               \type\()_h264_qpel8_mc11
endfunc

/* mc23: like mc21, but the horizontal pass reads one row down */
function ff_\type\()_h264_qpel8_mc23_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
        add             r1,  r1,  r2
        b               \type\()_h264_qpel8_mc21
endfunc

/*
 * mc33: like mc11 with the saved source set to src + 1 (vertical pass)
 * and the horizontal pass reading from src + stride.
 */
function ff_\type\()_h264_qpel8_mc33_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r11, lr}
        add             r1,  r1,  r2
        sub             r1,  r1,  #1
        b               \type\()_h264_qpel8_mc11
endfunc
.endm

        h264_qpel8 put
        h264_qpel8 avg

/*
 * h264_qpel16 type      (type is put or avg)
 *
 * Exported 16x16 entry points ff_<type>_h264_qpel16_mcXY_neon.
 *
 * On entry to every entry point: r0 = destination, r1 = source,
 * r2 = stride (used for both).  Presumably these match a C prototype of
 * the form (dst, src, stride) -- confirm against the C-side declaration.
 *
 * The structure mirrors the 8x8 entry points: X selects the horizontal
 * and Y the vertical treatment, and several entry points only adjust
 * pointers before jumping to a local label (<type>_h264_qpel16_mc01 /
 * mc11 / mc21 / mc12) placed right after the push of the canonical
 * variant, so their push lists must match that variant exactly.
 *
 * Differences from the 8x8 versions visible in the code:
 *  - r4 is saved everywhere a 16-wide helper is called, because those
 *    helpers keep their return address in r4; the 2-D paths additionally
 *    save r9 and r10 for the same reason;
 *  - the row count r12 is not set here, the 16-wide h_lowpass helpers
 *    load it themselves;
 *  - temporary buffers are 16*16 bytes, plus 16*12 bytes of scratch on
 *    the 2-D paths.
 *
 * "add r12, sp, #64" skips the 64 bytes of vpushed d8-d15 to reach the
 * temporary buffer; "ldrd r0, r1, [r11], #8" reloads the saved dst/src
 * pair and moves r11 past it for the frame teardown.
 */
.macro  h264_qpel16     type
/* mc10: horizontal filter averaged with the unfiltered source */
function ff_\type\()_h264_qpel16_mc10_neon, export=1
        lowpass_const   r3
        mov             r3,  r1
        sub             r1,  r1,  #2
        b               \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

/* mc20: plain horizontal filter, destination stride = source stride */
function ff_\type\()_h264_qpel16_mc20_neon, export=1
        lowpass_const   r3
        sub             r1,  r1,  #2
        mov             r3,  r2
        b               \type\()_h264_qpel16_h_lowpass_neon
endfunc

/* mc30: horizontal filter averaged with the source one pixel to the right */
function ff_\type\()_h264_qpel16_mc30_neon, export=1
        lowpass_const   r3
        add             r3,  r1,  #1
        sub             r1,  r1,  #2
        b               \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

/* mc01: vertical filter averaged with the unfiltered source (r12) */
function ff_\type\()_h264_qpel16_mc01_neon, export=1
        push            {r4, lr}
        mov             r12, r1
\type\()_h264_qpel16_mc01:
        lowpass_const   r3
        mov             r3,  r2
        sub             r1,  r1,  r2, lsl #1
        vpush           {d8-d15}
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

/*
 * mc11: horizontal filter into a 256-byte stack buffer (stride 16), then
 * vertical filter of the saved source averaged with that buffer.
 */
function ff_\type\()_h264_qpel16_mc11_neon, export=1
        push            {r0, r1, r4, r11, lr}
\type\()_h264_qpel16_mc11:
        lowpass_const   r3
        mov             r11, sp
A       bic             sp,  sp,  #15
T       bic             r0,  r11, #15
T       mov             sp,  r0
        sub             sp,  sp,  #256
        mov             r0,  sp
        sub             r1,  r1,  #2
        mov             r3,  #16
        vpush           {d8-d15}
        bl              put_h264_qpel16_h_lowpass_neon
        ldrd            r0,  r1,  [r11], #8
        mov             r3,  r2
        /* r12 = stack buffer, just above the vpushed registers */
        add             r12, sp,  #64
        sub             r1,  r1,  r2, lsl #1
        /* stride of the temporary buffer */
        mov             r2,  #16
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r11, pc}
endfunc

/*
 * mc21: horizontal filter into a packed 256-byte stack buffer, then 2-D
 * filter averaged with it.  After the packed helper returns, r0 points
 * just past the 256 bytes, i.e. at the 16*12 bytes of 2-D scratch; the
 * 16-wide l2 helper derives the buffer address as r4 - 256.
 */
function ff_\type\()_h264_qpel16_mc21_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
\type\()_h264_qpel16_mc21:
        lowpass_const   r3
        mov             r11, sp
A       bic             sp,  sp,  #15
T       bic             r0,  r11, #15
T       mov             sp,  r0
        sub             sp,  sp,  #(16*16+16*12)
        sub             r1,  r1,  #2
        mov             r0,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel16_h_lowpass_neon_packed
        mov             r4,  r0
        ldrd            r0,  r1,  [r11], #8
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4-r5, r9-r11, pc}
endfunc

/*
 * mc31: like mc11, but the saved source (used for the vertical pass) is
 * src + 1 while the horizontal pass still starts from src.
 */
function ff_\type\()_h264_qpel16_mc31_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r4, r11, lr}
        sub             r1,  r1,  #1
        b               \type\()_h264_qpel16_mc11
endfunc

/* mc02: plain vertical filter, source stride = destination stride */
function ff_\type\()_h264_qpel16_mc02_neon, export=1
        push            {r4, lr}
        lowpass_const   r3
        sub             r1,  r1,  r2, lsl #1
        mov             r3,  r2
        vpush           {d8-d15}
        bl              \type\()_h264_qpel16_v_lowpass_neon
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

/*
 * mc12: vertical filter into a packed 256-byte stack buffer, then 2-D
 * filter averaged with it.  Same stack layout as mc21.
 */
function ff_\type\()_h264_qpel16_mc12_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
\type\()_h264_qpel16_mc12:
        lowpass_const   r3
        mov             r11, sp
A       bic             sp,  sp,  #15
T       bic             r0,  r11, #15
T       mov             sp,  r0
        sub             sp,  sp,  #(16*16+16*12)
        sub             r1,  r1,  r2, lsl #1
        mov             r0,  sp
        /* the packed helper overwrites r2, so the stride lives in r3 */
        mov             r3,  r2
        vpush           {d8-d15}
        bl              put_h264_qpel16_v_lowpass_neon_packed
        mov             r4,  r0
        ldrd            r0,  r1,  [r11], #8
        sub             r1,  r1,  r3, lsl #1
        sub             r1,  r1,  #2
        /* the helper called next immediately replaces r2 with r4 - 256 */
        mov             r2,  r3
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4-r5, r9-r11, pc}
endfunc

/* mc22: plain 2-D filter; only the 16*12 bytes of scratch are allocated */
function ff_\type\()_h264_qpel16_mc22_neon, export=1
        push            {r4, r9-r11, lr}
        lowpass_const   r3
        mov             r11, sp
A       bic             sp,  sp,  #15
T       bic             r4,  r11, #15
T       mov             sp,  r4
        /* window starts two rows up and two pixels left */
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        sub             sp,  sp,  #(16*12)
        mov             r4,  sp
        vpush           {d8-d15}
        bl              \type\()_h264_qpel16_hv_lowpass_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r9-r11, pc}
endfunc

/*
 * mc32: like mc12, but the vertical pass reads from src + 1 while the
 * saved source (used for the 2-D pass) is the original src.
 */
function ff_\type\()_h264_qpel16_mc32_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
        add             r1,  r1,  #1
        b               \type\()_h264_qpel16_mc12
endfunc

/* mc03: like mc01, but averaged with the source one row down */
function ff_\type\()_h264_qpel16_mc03_neon, export=1
        push            {r4, lr}
        add             r12, r1,  r2
        b               \type\()_h264_qpel16_mc01
endfunc

/* mc13: like mc11, but the horizontal pass reads one row down */
function ff_\type\()_h264_qpel16_mc13_neon, export=1
        push            {r0, r1, r4, r11, lr}
        add             r1,  r1,  r2
        b               \type\()_h264_qpel16_mc11
endfunc

/* mc23: like mc21, but the horizontal pass reads one row down */
function ff_\type\()_h264_qpel16_mc23_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
        add             r1,  r1,  r2
        b               \type\()_h264_qpel16_mc21
endfunc

/*
 * mc33: like mc11 with the saved source set to src + 1 (vertical pass)
 * and the horizontal pass reading from src + stride.
 */
function ff_\type\()_h264_qpel16_mc33_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r4, r11, lr}
        add             r1,  r1,  r2
        sub             r1,  r1,  #1
        b               \type\()_h264_qpel16_mc11
endfunc
.endm

        h264_qpel16 put
        h264_qpel16 avg
