/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
22
23#include "libavutil/aarch64/asm.S"
24#include "neon.S"
25
// Entry check shared by the tc0-based (non-intra) 8-bit loop filters.
//
// In:       w2 = alpha, w3 = beta, x4 = address of four packed tc0 bytes
// Out:      w6 and v24.s[0] = the 32-bit word read from [x4]
// Clobbers: w8, condition flags
//
// Executes ret (leaving the enclosing function) when alpha is zero, when
// beta is zero, or when all four tc0 bytes have their sign bit set.
// Otherwise execution continues after local label 2, i.e. with the code
// following the macro invocation. Uses numeric local labels 1 and 2.
.macro  h264_loop_filter_start
        cmp             w2,  #0
        ldr             w6,  [x4]
        // If alpha == 0 the compare is skipped and NZCV comes from the
        // immediate; #4 sets Z so that the b.eq below fires for alpha == 0
        // as well as for beta == 0.
        ccmp            w3,  #0,  #4,  ne
        mov             v24.s[0], w6
        and             w8,  w6,  w6,  lsl #16          // bit 31 = sign(byte 3) & sign(byte 1)
        b.eq            1f
        ands            w8,  w8,  w8,  lsl #8           // N = AND of all four sign bits
        b.ge            2f                              // N clear: at least one tc0 >= 0
1:
        ret
2:
.endm
39
// tc0-clipped luma deblocking of 16 pixel positions across one edge.
//
// In:       v20 = p2, v18 = p1, v16 = p0, v0 = q0, v2 = q1, v4 = q2
//           w2 = alpha, w3 = beta (low byte of each is broadcast)
//           v24.s[0] = four packed tc0 bytes
// Out:      v17 = new p1, v16 = new p0, v0 = new q0, v19 = new q1
// Clobbers: v4, v20, v21, v22, v23, v24, v28, v30, x7
//
// Branches to local label 9, which the invoking code must provide, when no
// lane passes the alpha/beta/tc0 test; the outputs are not produced then.
// The instruction stream interleaves the tc0 expansion with the threshold
// tests, so statement order is significant.
.macro  h264_loop_filter_luma
        dup             v22.16b, w2                     // alpha
        uxtl            v24.8h,  v24.8b                 // tc0: bytes -> halfwords
        uabd            v21.16b, v16.16b, v0.16b        // abs(p0 - q0)
        uxtl            v24.4s,  v24.4h                 // tc0: halfwords -> words
        uabd            v28.16b, v18.16b, v16.16b       // abs(p1 - p0)
        sli             v24.8h,  v24.8h,  #8            // copy each tc0 into byte 1
        uabd            v30.16b, v2.16b,  v0.16b        // abs(q1 - q0)
        sli             v24.4s,  v24.4s,  #16           // each tc0 now fills 4 byte lanes
        cmhi            v21.16b, v22.16b, v21.16b       // < alpha
        dup             v22.16b, w3                     // beta
        cmlt            v23.16b, v24.16b, #0            // lanes whose tc0 is negative
        cmhi            v28.16b, v22.16b, v28.16b       // < beta
        cmhi            v30.16b, v22.16b, v30.16b       // < beta
        bic             v21.16b, v21.16b, v23.16b       // drop lanes with tc0 < 0
        uabd            v17.16b, v20.16b, v16.16b       // abs(p2 - p0)
        and             v21.16b, v21.16b, v28.16b
        uabd            v19.16b, v4.16b,  v0.16b        // abs(q2 - q0)
        and             v21.16b, v21.16b, v30.16b       // v21 = per-lane filter mask
        shrn            v30.8b,  v21.8h,  #4            // squeeze the mask to 4 bits per lane
        mov             x7,  v30.d[0]                   // ... so one GPR covers all 16 lanes
        cmhi            v17.16b, v22.16b, v17.16b       // < beta
        cmhi            v19.16b, v22.16b, v19.16b       // < beta
        cbz             x7,  9f                         // nothing to filter
        and             v17.16b, v17.16b, v21.16b       // mask: modify p1
        and             v19.16b, v19.16b, v21.16b       // mask: modify q1
        and             v24.16b, v24.16b, v21.16b       // tc0, zeroed in inactive lanes
        urhadd          v28.16b, v16.16b, v0.16b        // (p0 + q0 + 1) >> 1
        sub             v21.16b, v24.16b, v17.16b       // masks are -1, so this adds 1
        uqadd           v23.16b, v18.16b, v24.16b       // p1 + tc0
        uhadd           v20.16b, v20.16b, v28.16b       // (p2 + avg(p0, q0)) >> 1
        sub             v21.16b, v21.16b, v19.16b       // v21 = tc0 + (p1 mask) + (q1 mask)
        uhadd           v28.16b, v4.16b,  v28.16b       // (q2 + avg(p0, q0)) >> 1
        umin            v23.16b, v23.16b, v20.16b
        uqsub           v22.16b, v18.16b, v24.16b       // p1 - tc0
        uqadd           v4.16b,  v2.16b,  v24.16b       // q1 + tc0
        umax            v23.16b, v23.16b, v22.16b       // p1 candidate clamped to p1 +/- tc0
        uqsub           v22.16b, v2.16b,  v24.16b       // q1 - tc0
        umin            v28.16b, v4.16b,  v28.16b
        uxtl            v4.8h,   v0.8b
        umax            v28.16b, v28.16b, v22.16b       // q1 candidate clamped to q1 +/- tc0
        uxtl2           v20.8h,  v0.16b
        usubw           v4.8h,   v4.8h,   v16.8b        // q0 - p0
        usubw2          v20.8h,  v20.8h,  v16.16b
        shl             v4.8h,   v4.8h,   #2            // (q0 - p0) * 4
        shl             v20.8h,  v20.8h,  #2
        uaddw           v4.8h,   v4.8h,   v18.8b        // + p1
        uaddw2          v20.8h,  v20.8h,  v18.16b
        usubw           v4.8h,   v4.8h,   v2.8b         // - q1
        usubw2          v20.8h,  v20.8h,  v2.16b
        rshrn           v4.8b,   v4.8h,   #3            // rounded >> 3 = unclipped delta
        rshrn2          v4.16b,  v20.8h,  #3
        bsl             v17.16b, v23.16b, v18.16b       // new p1 where masked, else p1
        bsl             v19.16b, v28.16b, v2.16b        // new q1 where masked, else q1
        neg             v23.16b, v21.16b                // lower clip bound
        uxtl            v28.8h,  v16.8b
        smin            v4.16b,  v4.16b,  v21.16b
        uxtl2           v21.8h,  v16.16b
        smax            v4.16b,  v4.16b,  v23.16b       // delta clipped to +/- v21
        uxtl            v22.8h,  v0.8b
        uxtl2           v24.8h,  v0.16b
        saddw           v28.8h,  v28.8h,  v4.8b         // p0 + delta
        saddw2          v21.8h,  v21.8h,  v4.16b
        ssubw           v22.8h,  v22.8h,  v4.8b         // q0 - delta
        ssubw2          v24.8h,  v24.8h,  v4.16b
        sqxtun          v16.8b,  v28.8h                 // saturate back to unsigned bytes
        sqxtun2         v16.16b, v21.8h
        sqxtun          v0.8b,   v22.8h
        sqxtun2         v0.16b,  v24.8h
.endm
110
// Luma loop filter across a horizontal edge, 16 pixels wide.
//
// In:  x0 = address of the first row below the edge (q0), x1 = row stride,
//      w2 = alpha, w3 = beta, x4 = address of four tc0 bytes
// Presumably called from C as
//      (uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *tc0)
// -- confirm against the C-side declaration.
//
// Reads rows pix - 3*stride .. pix + 2*stride and rewrites the four rows
// pix - 2*stride .. pix + stride.
function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        ld1             {v0.16b},  [x0], x1             // q0
        ld1             {v2.16b},  [x0], x1             // q1
        ld1             {v4.16b},  [x0], x1             // q2
        sub             x0,  x0,  x1, lsl #2
        sub             x0,  x0,  x1, lsl #1            // x0 = pix - 3 * stride
        ld1             {v20.16b}, [x0], x1             // p2
        ld1             {v18.16b}, [x0], x1             // p1
        ld1             {v16.16b}, [x0], x1             // p0

        h264_loop_filter_luma

        sub             x0,  x0,  x1, lsl #1            // x0 = pix - 2 * stride
        st1             {v17.16b}, [x0], x1             // p1
        st1             {v16.16b}, [x0], x1             // p0
        st1             {v0.16b},  [x0], x1             // q0
        st1             {v19.16b}, [x0]                 // q1
9:
        ret
endfunc
133
// Luma loop filter across a vertical edge, 16 rows tall.
//
// In:  x0 = address of the first pixel right of the edge in row 0,
//      x1 = row stride, w2 = alpha, w3 = beta, x4 = address of four tc0 bytes
//
// Loads 8 bytes per row starting at pix - 4 for 16 rows, transposes them so
// that each vector holds one pixel column (transpose_8x16B comes from
// neon.S; presumably v6/v20/v18/v16 become p3..p0 and v0/v2/v4/v26 become
// q0..q3 -- confirm there), filters, transposes the four result vectors
// back and stores 4 bytes per row starting at pix - 2.
// v6 and v26 are loaded and transposed but not read by the filter macro.
function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        sub             x0,  x0,  #4
        ld1             {v6.8b},    [x0], x1            // rows 0-7 -> low halves
        ld1             {v20.8b},   [x0], x1
        ld1             {v18.8b},   [x0], x1
        ld1             {v16.8b},   [x0], x1
        ld1             {v0.8b},    [x0], x1
        ld1             {v2.8b},    [x0], x1
        ld1             {v4.8b},    [x0], x1
        ld1             {v26.8b},   [x0], x1
        ld1             {v6.d}[1],  [x0], x1            // rows 8-15 -> high halves
        ld1             {v20.d}[1], [x0], x1
        ld1             {v18.d}[1], [x0], x1
        ld1             {v16.d}[1], [x0], x1
        ld1             {v0.d}[1],  [x0], x1
        ld1             {v2.d}[1],  [x0], x1
        ld1             {v4.d}[1],  [x0], x1
        ld1             {v26.d}[1], [x0], x1

        transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23

        h264_loop_filter_luma

        transpose_4x16B v17, v16, v0, v19, v21, v23, v25, v27

        sub             x0,  x0,  x1, lsl #4            // back to row 0
        add             x0,  x0,  #2                    // x0 = pix - 2
        st1             {v17.s}[0], [x0], x1
        st1             {v16.s}[0], [x0], x1
        st1             {v0.s}[0],  [x0], x1
        st1             {v19.s}[0], [x0], x1
        st1             {v17.s}[1], [x0], x1
        st1             {v16.s}[1], [x0], x1
        st1             {v0.s}[1],  [x0], x1
        st1             {v19.s}[1], [x0], x1
        st1             {v17.s}[2], [x0], x1
        st1             {v16.s}[2], [x0], x1
        st1             {v0.s}[2],  [x0], x1
        st1             {v19.s}[2], [x0], x1
        st1             {v17.s}[3], [x0], x1
        st1             {v16.s}[3], [x0], x1
        st1             {v0.s}[3],  [x0], x1
        st1             {v19.s}[3], [x0], x1
9:
        ret
endfunc
182
183
// Entry check and threshold setup shared by the intra loop filters.
//
// In:       w2 = alpha, w3 = beta
// Out:      v30.16b = alpha in every byte, v31.16b = beta in every byte
// Clobbers: w4
//
// Executes ret (leaving the enclosing function) only when alpha and beta
// are both zero. Uses numeric local label 1.
.macro h264_loop_filter_start_intra
        orr             w4,  w2,  w3
        cbnz            w4,  1f
        ret
1:
        dup             v30.16b, w2                // alpha
        dup             v31.16b, w3                // beta
.endm
192
// Intra luma deblocking of 16 pixel positions across one edge.
//
// In:       v4 = p3, v5 = p2, v6 = p1, v7 = p0,
//           v0 = q0, v1 = q1, v2 = q2, v3 = q3,
//           v30 = alpha in every byte, v31 = beta in every byte
// Out:      v5, v6, v7 (p2, p1, p0) and v0, v1, v2 (q0, q1, q2) updated in
//           the lanes selected by the masks below
// Clobbers: v16-v31, x4 (v30/v31 no longer hold alpha/beta afterwards)
//
// Branches to local label 9, which the invoking code must provide, when no
// lane passes the alpha/beta test.
//
// Mask names used in the comments:
//   if_1 = abs(p0-q0) < alpha && abs(p1-p0) < beta && abs(q1-q0) < beta
//   if_2 = abs(p0-q0) < (alpha >> 2) + 2
//   if_3 = abs(p2-p0) < beta,  if_4 = abs(q2-q0) < beta
// Results suffixed _1 are the p0/q0-only filter, _2 the p0..p2 / q0..q2 one.
.macro h264_loop_filter_luma_intra
        uabd            v16.16b, v7.16b,  v0.16b        // abs(p0 - q0)
        uabd            v17.16b, v6.16b,  v7.16b        // abs(p1 - p0)
        uabd            v18.16b, v1.16b,  v0.16b        // abs(q1 - q0)
        cmhi            v19.16b, v30.16b, v16.16b       // < alpha
        cmhi            v17.16b, v31.16b, v17.16b       // < beta
        cmhi            v18.16b, v31.16b, v18.16b       // < beta

        movi            v29.16b, #2
        ushr            v30.16b, v30.16b, #2            // alpha >> 2
        add             v30.16b, v30.16b, v29.16b       // (alpha >> 2) + 2
        cmhi            v16.16b, v30.16b, v16.16b       // < (alpha >> 2) + 2

        and             v19.16b, v19.16b, v17.16b
        and             v19.16b, v19.16b, v18.16b       // v19 = if_1
        shrn            v20.8b,  v19.8h,  #4            // squeeze the mask to 4 bits per lane
        mov             x4, v20.d[0]
        cbz             x4, 9f                          // nothing to filter

        // v20/v21 = 2*p1 + p0 + q1, v22/v23 = 2*q1 + q0 + p1 (16-bit lanes)
        ushll           v20.8h,  v6.8b,   #1
        ushll           v22.8h,  v1.8b,   #1
        ushll2          v21.8h,  v6.16b,  #1
        ushll2          v23.8h,  v1.16b,  #1
        uaddw           v20.8h,  v20.8h,  v7.8b
        uaddw           v22.8h,  v22.8h,  v0.8b
        uaddw2          v21.8h,  v21.8h,  v7.16b
        uaddw2          v23.8h,  v23.8h,  v0.16b
        uaddw           v20.8h,  v20.8h,  v1.8b
        uaddw           v22.8h,  v22.8h,  v6.8b
        uaddw2          v21.8h,  v21.8h,  v1.16b
        uaddw2          v23.8h,  v23.8h,  v6.16b

        rshrn           v24.8b,  v20.8h,  #2 // p0'_1
        rshrn           v25.8b,  v22.8h,  #2 // q0'_1
        rshrn2          v24.16b, v21.8h,  #2 // p0'_1
        rshrn2          v25.16b, v23.8h,  #2 // q0'_1

        uabd            v17.16b, v5.16b,  v7.16b        // abs(p2 - p0)
        uabd            v18.16b, v2.16b,  v0.16b        // abs(q2 - q0)
        cmhi            v17.16b, v31.16b, v17.16b       // < beta
        cmhi            v18.16b, v31.16b, v18.16b       // < beta

        and             v17.16b, v16.16b, v17.16b  // if_2 && if_3
        and             v18.16b, v16.16b, v18.16b  // if_2 && if_4

        not             v30.16b, v17.16b
        not             v31.16b, v18.16b

        and             v30.16b, v30.16b, v19.16b  // if_1 && !(if_2 && if_3)
        and             v31.16b, v31.16b, v19.16b  // if_1 && !(if_2 && if_4)

        and             v17.16b, v19.16b, v17.16b  // if_1 && if_2 && if_3
        and             v18.16b, v19.16b, v18.16b  // if_1 && if_2 && if_4

        // p side: v26/v27 = p2 + p0 + q0, reused for all three results
        uaddl           v26.8h,  v5.8b,   v7.8b
        uaddl2          v27.8h,  v5.16b,  v7.16b
        uaddw           v26.8h,  v26.8h,  v0.8b
        uaddw2          v27.8h,  v27.8h,  v0.16b
        add             v20.8h,  v20.8h,  v26.8h
        add             v21.8h,  v21.8h,  v27.8h
        uaddw           v20.8h,  v20.8h,  v0.8b         // p2 + 2*p1 + 2*p0 + 2*q0 + q1
        uaddw2          v21.8h,  v21.8h,  v0.16b
        rshrn           v20.8b,  v20.8h,  #3 // p0'_2
        rshrn2          v20.16b, v21.8h,  #3 // p0'_2
        uaddw           v26.8h,  v26.8h,  v6.8b         // p2 + p1 + p0 + q0
        uaddw2          v27.8h,  v27.8h,  v6.16b
        rshrn           v21.8b,  v26.8h,  #2 // p1'_2
        rshrn2          v21.16b, v27.8h,  #2 // p1'_2
        uaddl           v28.8h,  v4.8b,   v5.8b
        uaddl2          v29.8h,  v4.16b,  v5.16b
        shl             v28.8h,  v28.8h,  #1            // 2 * (p3 + p2)
        shl             v29.8h,  v29.8h,  #1
        add             v28.8h,  v28.8h,  v26.8h
        add             v29.8h,  v29.8h,  v27.8h
        rshrn           v19.8b,  v28.8h,  #3 // p2'_2
        rshrn2          v19.16b, v29.8h,  #3 // p2'_2

        // q side: mirror image of the p side, v26/v27 = q2 + q0 + p0
        uaddl           v26.8h,  v2.8b,   v0.8b
        uaddl2          v27.8h,  v2.16b,  v0.16b
        uaddw           v26.8h,  v26.8h,  v7.8b
        uaddw2          v27.8h,  v27.8h,  v7.16b
        add             v22.8h,  v22.8h,  v26.8h
        add             v23.8h,  v23.8h,  v27.8h
        uaddw           v22.8h,  v22.8h,  v7.8b         // q2 + 2*q1 + 2*q0 + 2*p0 + p1
        uaddw2          v23.8h,  v23.8h,  v7.16b
        rshrn           v22.8b,  v22.8h,  #3 // q0'_2
        rshrn2          v22.16b, v23.8h,  #3 // q0'_2
        uaddw           v26.8h,  v26.8h,  v1.8b         // q2 + q1 + q0 + p0
        uaddw2          v27.8h,  v27.8h,  v1.16b
        rshrn           v23.8b,  v26.8h,  #2 // q1'_2
        rshrn2          v23.16b, v27.8h,  #2 // q1'_2
        uaddl           v28.8h,  v2.8b,   v3.8b
        uaddl2          v29.8h,  v2.16b,  v3.16b
        shl             v28.8h,  v28.8h,  #1            // 2 * (q3 + q2)
        shl             v29.8h,  v29.8h,  #1
        add             v28.8h,  v28.8h,  v26.8h
        add             v29.8h,  v29.8h,  v27.8h
        rshrn           v26.8b,  v28.8h,  #3 // q2'_2
        rshrn2          v26.16b, v29.8h,  #3 // q2'_2

        // All results were computed from the original pixels; merge them
        // into the pixel registers only now, under their masks.
        bit             v7.16b,  v24.16b, v30.16b  // p0'_1
        bit             v0.16b,  v25.16b, v31.16b  // q0'_1
        bit             v7.16b,  v20.16b, v17.16b  // p0'_2
        bit             v6.16b,  v21.16b, v17.16b  // p1'_2
        bit             v5.16b,  v19.16b, v17.16b  // p2'_2
        bit             v0.16b,  v22.16b, v18.16b  // q0'_2
        bit             v1.16b,  v23.16b, v18.16b  // q1'_2
        bit             v2.16b,  v26.16b, v18.16b  // q2'_2
.endm
304
// Intra luma loop filter across a horizontal edge, 16 pixels wide.
//
// In:  x0 = address of the first row below the edge (q0), x1 = row stride,
//      w2 = alpha, w3 = beta
//
// Reads rows pix - 4*stride .. pix + 3*stride and rewrites the six rows
// pix - 3*stride .. pix + 2*stride.
function ff_h264_v_loop_filter_luma_intra_neon, export=1
        h264_loop_filter_start_intra

        ld1             {v0.16b},  [x0], x1 // q0
        ld1             {v1.16b},  [x0], x1 // q1
        ld1             {v2.16b},  [x0], x1 // q2
        ld1             {v3.16b},  [x0], x1 // q3
        sub             x0,  x0,  x1, lsl #3            // x0 = pix - 4 * stride
        ld1             {v4.16b},  [x0], x1 // p3
        ld1             {v5.16b},  [x0], x1 // p2
        ld1             {v6.16b},  [x0], x1 // p1
        ld1             {v7.16b},  [x0]     // p0 (no post-increment: x0 stays on p0)

        h264_loop_filter_luma_intra

        sub             x0,  x0,  x1, lsl #1            // x0 = pix - 3 * stride
        st1             {v5.16b}, [x0], x1  // p2
        st1             {v6.16b}, [x0], x1  // p1
        st1             {v7.16b}, [x0], x1  // p0
        st1             {v0.16b}, [x0], x1  // q0
        st1             {v1.16b}, [x0], x1  // q1
        st1             {v2.16b}, [x0]      // q2
9:
        ret
endfunc
330
// Intra luma loop filter across a vertical edge, 16 rows tall.
//
// In:  x0 = address of the first pixel right of the edge in row 0,
//      x1 = row stride, w2 = alpha, w3 = beta
//
// Loads 8 bytes per row starting at pix - 4 for 16 rows, transposes
// (transpose_8x16B comes from neon.S; presumably this leaves p3..p0 in
// v4..v7 and q0..q3 in v0..v3 -- confirm there), filters, applies the same
// transpose again and writes all 8 bytes of every row back.
function ff_h264_h_loop_filter_luma_intra_neon, export=1
        h264_loop_filter_start_intra

        sub             x0,  x0,  #4
        ld1             {v4.8b},  [x0], x1              // rows 0-7 -> low halves
        ld1             {v5.8b},  [x0], x1
        ld1             {v6.8b},  [x0], x1
        ld1             {v7.8b},  [x0], x1
        ld1             {v0.8b},  [x0], x1
        ld1             {v1.8b},  [x0], x1
        ld1             {v2.8b},  [x0], x1
        ld1             {v3.8b},  [x0], x1
        ld1             {v4.d}[1],  [x0], x1            // rows 8-15 -> high halves
        ld1             {v5.d}[1],  [x0], x1
        ld1             {v6.d}[1],  [x0], x1
        ld1             {v7.d}[1],  [x0], x1
        ld1             {v0.d}[1],  [x0], x1
        ld1             {v1.d}[1],  [x0], x1
        ld1             {v2.d}[1],  [x0], x1
        ld1             {v3.d}[1],  [x0], x1

        transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23

        h264_loop_filter_luma_intra

        transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23

        sub             x0,  x0,  x1, lsl #4            // back to row 0 (still at pix - 4)
        st1             {v4.8b},  [x0], x1
        st1             {v5.8b},  [x0], x1
        st1             {v6.8b},  [x0], x1
        st1             {v7.8b},  [x0], x1
        st1             {v0.8b},  [x0], x1
        st1             {v1.8b},  [x0], x1
        st1             {v2.8b},  [x0], x1
        st1             {v3.8b},  [x0], x1
        st1             {v4.d}[1],  [x0], x1
        st1             {v5.d}[1],  [x0], x1
        st1             {v6.d}[1],  [x0], x1
        st1             {v7.d}[1],  [x0], x1
        st1             {v0.d}[1],  [x0], x1
        st1             {v1.d}[1],  [x0], x1
        st1             {v2.d}[1],  [x0], x1
        st1             {v3.d}[1],  [x0], x1
9:
        ret
endfunc
378
// tc0-clipped chroma deblocking of 8 pixel positions across one edge.
//
// In:       v18 = p1, v16 = p0, v0 = q0, v2 = q1 (8 bytes each)
//           w2 = alpha, w3 = beta (low byte of each is broadcast)
//           v24.s[0] = four packed tc0 bytes
// Out:      v16 = new p0, v0 = new q0
// Clobbers: v4, v22, v23, v24, v25, v26, v28, v30, x8
//
// Branches to local label 9, which the invoking code must provide, when no
// lane passes the alpha/beta test.
.macro  h264_loop_filter_chroma
        dup             v22.8b, w2              // alpha
        dup             v23.8b, w3              // beta
        uxtl            v24.8h, v24.8b          // tc0: bytes -> halfwords
        uabd            v26.8b, v16.8b, v0.8b   // abs(p0 - q0)
        uabd            v28.8b, v18.8b, v16.8b  // abs(p1 - p0)
        uabd            v30.8b, v2.8b,  v0.8b   // abs(q1 - q0)
        cmhi            v26.8b, v22.8b, v26.8b  // < alpha
        cmhi            v28.8b, v23.8b, v28.8b  // < beta
        cmhi            v30.8b, v23.8b, v30.8b  // < beta
        uxtl            v4.8h,  v0.8b
        and             v26.8b, v26.8b, v28.8b
        usubw           v4.8h,  v4.8h,  v16.8b  // q0 - p0
        and             v26.8b, v26.8b, v30.8b  // v26 = per-lane filter mask
        shl             v4.8h,  v4.8h,  #2      // (q0 - p0) * 4
        mov             x8,  v26.d[0]
        sli             v24.8h, v24.8h, #8      // each tc0 now fills 2 byte lanes
        uaddw           v4.8h,  v4.8h,  v18.8b  // + p1
        cbz             x8,  9f                 // nothing to filter
        usubw           v4.8h,  v4.8h,  v2.8b   // - q1
        rshrn           v4.8b,  v4.8h,  #3      // rounded >> 3 = unclipped delta
        smin            v4.8b,  v4.8b,  v24.8b
        neg             v25.8b, v24.8b
        smax            v4.8b,  v4.8b,  v25.8b  // delta clipped to +/- tc0
        uxtl            v22.8h, v0.8b
        and             v4.8b,  v4.8b,  v26.8b  // zero delta in inactive lanes
        uxtl            v28.8h, v16.8b
        saddw           v28.8h, v28.8h, v4.8b   // p0 + delta
        ssubw           v22.8h, v22.8h, v4.8b   // q0 - delta
        sqxtun          v16.8b, v28.8h          // saturate back to unsigned bytes
        sqxtun          v0.8b,  v22.8h
.endm
411
// Chroma loop filter across a horizontal edge, 8 pixels wide.
//
// In:  x0 = address of the first row below the edge (q0), x1 = row stride,
//      w2 = alpha, w3 = beta, x4 = address of four tc0 bytes
//
// Reads rows pix - 2*stride .. pix + stride and rewrites the two rows
// pix - stride and pix.
function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub             x0,  x0,  x1, lsl #1    // x0 = pix - 2 * stride
        ld1             {v18.8b}, [x0], x1      // p1
        ld1             {v16.8b}, [x0], x1      // p0
        ld1             {v0.8b},  [x0], x1      // q0
        ld1             {v2.8b},  [x0]          // q1 (x0 stays at pix + stride)

        h264_loop_filter_chroma

        sub             x0,  x0,  x1, lsl #1    // x0 = pix - stride
        st1             {v16.8b}, [x0], x1      // p0
        st1             {v0.8b},  [x0], x1      // q0
9:
        ret
endfunc
429
// Chroma loop filter across a vertical edge, 8 rows tall.
//
// In:  x0 = address of the first pixel right of the edge in row 0,
//      x1 = row stride, w2 = alpha, w3 = beta, x4 = address of four tc0 bytes
//
// Loads 4 bytes per row starting at pix - 2 for 8 rows, transposes
// (transpose_4x8B comes from neon.S; presumably v18/v16/v0/v2 then hold the
// p1/p0/q0/q1 columns -- confirm there), filters, transposes back and
// rewrites the same 4 bytes of every row.
//
// The file-local label h_loop_filter_chroma420 is a second entry point used
// by ff_h264_h_loop_filter_chroma422_neon. It expects x0 = pix - 2,
// x1 = stride, w2 = alpha, w3 = beta and v24.s[0] = packed tc0.
function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub             x0,  x0,  #2
h_loop_filter_chroma420:
        ld1             {v18.s}[0], [x0], x1    // rows 0-3 -> lane 0
        ld1             {v16.s}[0], [x0], x1
        ld1             {v0.s}[0],  [x0], x1
        ld1             {v2.s}[0],  [x0], x1
        ld1             {v18.s}[1], [x0], x1    // rows 4-7 -> lane 1
        ld1             {v16.s}[1], [x0], x1
        ld1             {v0.s}[1],  [x0], x1
        ld1             {v2.s}[1],  [x0], x1

        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31

        h264_loop_filter_chroma

        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31

        sub             x0,  x0,  x1, lsl #3    // back to row 0
        st1             {v18.s}[0], [x0], x1
        st1             {v16.s}[0], [x0], x1
        st1             {v0.s}[0],  [x0], x1
        st1             {v2.s}[0],  [x0], x1
        st1             {v18.s}[1], [x0], x1
        st1             {v16.s}[1], [x0], x1
        st1             {v0.s}[1],  [x0], x1
        st1             {v2.s}[1],  [x0], x1
9:
        ret
endfunc
462
// Chroma loop filter across a vertical edge, 16 rows tall.
//
// In:  x0 = address of the first pixel right of the edge in row 0,
//      x1 = row stride, w2 = alpha, w3 = beta, x4 = address of four tc0 bytes
//
// Runs the 8-row body at h_loop_filter_chroma420 (defined inside
// ff_h264_h_loop_filter_chroma_neon) twice with the stride doubled: first
// on rows 0, 2, .., 14, then on rows 1, 3, .., 15. The second pass is a
// tail call, so its ret returns to the original caller.
// Clobbers x5 and x7 in addition to whatever the 8-row body clobbers.
function ff_h264_h_loop_filter_chroma422_neon, export=1
        h264_loop_filter_start
        add             x5,  x0,  x1            // x5 = start of row 1
        sub             x0,  x0,  #2
        add             x1,  x1,  x1            // step two rows at a time
        mov             x7,  x30                // bl overwrites the link register
        bl              h_loop_filter_chroma420
        mov             x30, x7
        sub             x0,  x5,  #2
        mov             v24.s[0], w6            // first pass modified v24; reload packed tc0
        b               h_loop_filter_chroma420
endfunc
475
// Intra chroma deblocking of 8 pixel positions across one edge.
//
// In:       v18 = p1, v16 = p0, v17 = q0, v19 = q1 (8 bytes each)
//           v30 = alpha in every byte, v31 = beta in every byte
// Out:      v16 = new p0, v17 = new q0 (only in lanes passing the test)
// Clobbers: v4, v6, v20, v22, v24, v25, v26, v27, v28, x2
//
// Branches to local label 9, which the invoking code must provide, when no
// lane passes the alpha/beta test. v30/v31 are left untouched, so the macro
// can be executed again without re-running the start macro.
.macro h264_loop_filter_chroma_intra
        uabd            v26.8b,  v16.8b,  v17.8b  // abs(p0 - q0)
        uabd            v27.8b,  v18.8b,  v16.8b  // abs(p1 - p0)
        uabd            v28.8b,  v19.8b,  v17.8b  // abs(q1 - q0)
        cmhi            v26.8b,  v30.8b,  v26.8b  // < alpha
        cmhi            v27.8b,  v31.8b,  v27.8b  // < beta
        cmhi            v28.8b,  v31.8b,  v28.8b  // < beta
        and             v26.8b,  v26.8b,  v27.8b
        and             v26.8b,  v26.8b,  v28.8b  // v26 = per-lane filter mask
        mov             x2, v26.d[0]

        ushll           v4.8h,   v18.8b,  #1      // 2 * p1
        ushll           v6.8h,   v19.8b,  #1      // 2 * q1
        cbz             x2, 9f                    // nothing to filter
        uaddl           v20.8h,  v16.8b,  v19.8b  // p0 + q1
        uaddl           v22.8h,  v17.8b,  v18.8b  // q0 + p1
        add             v20.8h,  v20.8h,  v4.8h   // 2*p1 + p0 + q1
        add             v22.8h,  v22.8h,  v6.8h   // 2*q1 + q0 + p1
        uqrshrn         v24.8b,  v20.8h,  #2      // rounded >> 2
        uqrshrn         v25.8b,  v22.8h,  #2
        bit             v16.8b,  v24.8b,  v26.8b
        bit             v17.8b,  v25.8b,  v26.8b
.endm
499
// Intra chroma loop filter across a horizontal edge, 8 pixels wide.
//
// In:  x0 = address of the first row below the edge (q0), x1 = row stride,
//      w2 = alpha, w3 = beta
//
// Reads rows pix - 2*stride .. pix + stride and rewrites the two rows
// pix - stride and pix.
function ff_h264_v_loop_filter_chroma_intra_neon, export=1
        h264_loop_filter_start_intra

        sub             x0,  x0,  x1, lsl #1    // x0 = pix - 2 * stride
        ld1             {v18.8b}, [x0], x1      // p1
        ld1             {v16.8b}, [x0], x1      // p0
        ld1             {v17.8b}, [x0], x1      // q0
        ld1             {v19.8b}, [x0]          // q1 (x0 stays at pix + stride)

        h264_loop_filter_chroma_intra

        sub             x0,  x0,  x1, lsl #1    // x0 = pix - stride
        st1             {v16.8b}, [x0], x1      // p0
        st1             {v17.8b}, [x0], x1      // q0

9:
        ret
endfunc
518
// Intra chroma loop filter across a vertical edge, 4 rows tall.
//
// In:  x0 = address of the first pixel right of the edge in row 0,
//      x1 = row stride, w2 = alpha, w3 = beta
//
// x4 walks the rows for loading (from pix - 2) and x0 walks them for
// storing (from pix - 1). Each load fetches 8 bytes per row although only
// the two bytes at pix - 1 and pix are written back per row.
// transpose_4x8B comes from neon.S; presumably it leaves the p1/p0/q0/q1
// columns in v18/v16/v17/v19 -- confirm there.
function ff_h264_h_loop_filter_chroma_mbaff_intra_neon, export=1
        h264_loop_filter_start_intra

        sub             x4,  x0,  #2
        sub             x0,  x0,  #1
        ld1             {v18.8b}, [x4], x1
        ld1             {v16.8b}, [x4], x1
        ld1             {v17.8b}, [x4], x1
        ld1             {v19.8b}, [x4], x1

        transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29

        h264_loop_filter_chroma_intra

        st2             {v16.b,v17.b}[0], [x0], x1      // row n: lane n of p0, q0
        st2             {v16.b,v17.b}[1], [x0], x1
        st2             {v16.b,v17.b}[2], [x0], x1
        st2             {v16.b,v17.b}[3], [x0], x1

9:
        ret
endfunc
541
// Intra chroma loop filter across a vertical edge, 8 rows tall.
//
// In:  x0 = address of the first pixel right of the edge in row 0,
//      x1 = row stride, w2 = alpha, w3 = beta
//
// x4 walks the rows for loading (from pix - 2) and x0 walks them for
// storing (from pix - 1); two bytes (p0, q0) are written back per row.
// transpose_4x8B comes from neon.S; presumably it leaves the p1/p0/q0/q1
// columns in v18/v16/v17/v19 -- confirm there.
//
// The file-local label h_loop_filter_chroma420_intra is a second entry
// point used by ff_h264_h_loop_filter_chroma422_intra_neon. It expects
// x4 = pix - 2, x0 = pix - 1, x1 = stride and v30/v31 = alpha/beta vectors.
function ff_h264_h_loop_filter_chroma_intra_neon, export=1
        h264_loop_filter_start_intra

        sub             x4,  x0,  #2
        sub             x0,  x0,  #1
h_loop_filter_chroma420_intra:
        ld1             {v18.8b}, [x4], x1      // rows 0-3: 8 bytes read per row;
        ld1             {v16.8b}, [x4], x1      // the upper 4 are replaced below
        ld1             {v17.8b}, [x4], x1
        ld1             {v19.8b}, [x4], x1
        ld1             {v18.s}[1], [x4], x1    // rows 4-7 -> lane 1
        ld1             {v16.s}[1], [x4], x1
        ld1             {v17.s}[1], [x4], x1
        ld1             {v19.s}[1], [x4], x1

        transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29

        h264_loop_filter_chroma_intra

        st2             {v16.b,v17.b}[0], [x0], x1      // row n: lane n of p0, q0
        st2             {v16.b,v17.b}[1], [x0], x1
        st2             {v16.b,v17.b}[2], [x0], x1
        st2             {v16.b,v17.b}[3], [x0], x1
        st2             {v16.b,v17.b}[4], [x0], x1
        st2             {v16.b,v17.b}[5], [x0], x1
        st2             {v16.b,v17.b}[6], [x0], x1
        st2             {v16.b,v17.b}[7], [x0], x1

9:
        ret
endfunc
573
// Intra chroma loop filter across a vertical edge, 16 rows tall.
//
// In:  x0 = address of the first pixel right of the edge in row 0,
//      x1 = row stride, w2 = alpha, w3 = beta
//
// Runs the 8-row body at h_loop_filter_chroma420_intra (defined inside
// ff_h264_h_loop_filter_chroma_intra_neon) on rows 0-7 and then, as a tail
// call, on rows 8-15. The load pointer x4 is not reset between the passes:
// the eight post-incremented loads of the first pass leave it at the start
// of row 8. The store pointer x0 is recomputed because the first pass may
// exit before storing.
// Clobbers x5 and x7 in addition to whatever the 8-row body clobbers.
function ff_h264_h_loop_filter_chroma422_intra_neon, export=1
        h264_loop_filter_start_intra
        sub             x4,  x0,  #2
        add             x5,  x0,  x1, lsl #3    // x5 = pix + 8 * stride
        sub             x0,  x0,  #1
        mov             x7,  x30                // bl overwrites the link register
        bl              h_loop_filter_chroma420_intra
        sub             x0,  x5,  #1
        mov             x30, x7
        b               h_loop_filter_chroma420_intra
endfunc
585
// Row loop for 16-pixel-wide bidirectional weighting; two rows per
// iteration, terminated by ret.
//
// macs: widening multiply-accumulate mnemonic (umlal or umlsl) applied to
//       rows read through x1, using the byte weight taken from w6
// macd: the same for rows read through x0, using the byte weight from w5
//
// In:       x0, x1 = row read pointers, x7 = row write pointer,
//           x2 = stride, w3 = row count,
//           w5, w6 = weight magnitudes (low byte is broadcast),
//           v16.8h = accumulator start value, v18.8h = sshl shift count
// Clobbers: v0, v1, v4, v6, v20, v22, v24, v26, v28, v30,
//           x0, x1, w3, x7, flags
//
// The loop repeats until w3, decremented by 2 per iteration, reaches zero.
// The flags set by subs at the top are consumed by b.ne at the bottom;
// nothing in between modifies them.
.macro  biweight_16     macs, macd
        dup             v0.16b,  w5
        dup             v1.16b,  w6
        mov             v4.16b,  v16.16b
        mov             v6.16b,  v16.16b
1:      subs            w3,  w3,  #2
        ld1             {v20.16b}, [x0], x2
        \macd           v4.8h,   v0.8b,  v20.8b
        \macd\()2       v6.8h,   v0.16b, v20.16b
        ld1             {v22.16b}, [x1], x2
        \macs           v4.8h,   v1.8b,  v22.8b
        \macs\()2       v6.8h,   v1.16b, v22.16b
        mov             v24.16b, v16.16b
        ld1             {v28.16b}, [x0], x2
        mov             v26.16b, v16.16b
        \macd           v24.8h,  v0.8b,  v28.8b
        \macd\()2       v26.8h,  v0.16b, v28.16b
        ld1             {v30.16b}, [x1], x2
        \macs           v24.8h,  v1.8b,  v30.8b
        \macs\()2       v26.8h,  v1.16b, v30.16b
        sshl            v4.8h,   v4.8h,  v18.8h
        sshl            v6.8h,   v6.8h,  v18.8h
        sqxtun          v4.8b,   v4.8h
        sqxtun2         v4.16b,  v6.8h
        sshl            v24.8h,  v24.8h, v18.8h
        sshl            v26.8h,  v26.8h, v18.8h
        sqxtun          v24.8b,  v24.8h
        sqxtun2         v24.16b, v26.8h
        mov             v6.16b,  v16.16b                // re-seed accumulators for the next pass
        st1             {v4.16b},  [x7], x2
        mov             v4.16b,  v16.16b
        st1             {v24.16b}, [x7], x2
        b.ne            1b
        ret
.endm
621
// Row loop for 8-pixel-wide bidirectional weighting; two rows per
// iteration, terminated by ret.
//
// macs: widening multiply-accumulate mnemonic (umlal or umlsl) applied to
//       rows read through x1, using the byte weight taken from w6
// macd: the same for rows read through x0, using the byte weight from w5
//
// In:       x0, x1 = row read pointers, x7 = row write pointer,
//           x2 = stride, w3 = row count,
//           w5, w6 = weight magnitudes (low byte is broadcast),
//           v16.8h = accumulator start value, v18.8h = sshl shift count
// Clobbers: v0, v1, v2, v4, v5, v6, v7, v20, x0, x1, w3, x7, flags
//
// The loop repeats until w3, decremented by 2 per iteration, reaches zero.
.macro  biweight_8      macs, macd
        dup             v0.8b,  w5
        dup             v1.8b,  w6
        mov             v2.16b,  v16.16b
        mov             v20.16b, v16.16b
1:      subs            w3,  w3,  #2
        ld1             {v4.8b}, [x0], x2
        \macd           v2.8h,  v0.8b,  v4.8b
        ld1             {v5.8b}, [x1], x2
        \macs           v2.8h,  v1.8b,  v5.8b
        ld1             {v6.8b}, [x0], x2
        \macd           v20.8h, v0.8b,  v6.8b
        ld1             {v7.8b}, [x1], x2
        \macs           v20.8h, v1.8b,  v7.8b
        sshl            v2.8h,  v2.8h,  v18.8h
        sqxtun          v2.8b,  v2.8h
        sshl            v20.8h, v20.8h, v18.8h
        sqxtun          v4.8b,  v20.8h
        mov             v20.16b, v16.16b                // re-seed accumulators for the next pass
        st1             {v2.8b}, [x7], x2
        mov             v2.16b,  v16.16b
        st1             {v4.8b}, [x7], x2
        b.ne            1b
        ret
.endm
647
// Row loop for 4-pixel-wide bidirectional weighting; four rows per
// iteration (two rows packed per 8-byte vector), terminated by ret.
//
// macs: widening multiply-accumulate mnemonic (umlal or umlsl) applied to
//       rows read through x1, using the byte weight taken from w6
// macd: the same for rows read through x0, using the byte weight from w5
//
// In:       x0, x1 = row read pointers, x7 = row write pointer,
//           x2 = stride, w3 = row count,
//           w5, w6 = weight magnitudes (low byte is broadcast),
//           v16.8h = accumulator start value, v18.8h = sshl shift count
// Clobbers: v0, v1, v2, v4, v5, v6, v7, v20, x0, x1, w3, x7, flags
//
// w3 is decremented by 4 per iteration. If that makes it negative, only the
// first row pair of the iteration is finished (local label 2) before
// returning; otherwise the loop repeats until w3 reaches zero.
.macro  biweight_4      macs, macd
        dup             v0.8b,  w5
        dup             v1.8b,  w6
        mov             v2.16b, v16.16b
        mov             v20.16b,v16.16b
1:      subs            w3,  w3,  #4
        ld1             {v4.s}[0], [x0], x2
        ld1             {v4.s}[1], [x0], x2
        \macd           v2.8h,  v0.8b,  v4.8b
        ld1             {v5.s}[0], [x1], x2
        ld1             {v5.s}[1], [x1], x2
        \macs           v2.8h,  v1.8b,  v5.8b
        b.lt            2f                              // fewer than 4 rows were left
        ld1             {v6.s}[0], [x0], x2
        ld1             {v6.s}[1], [x0], x2
        \macd           v20.8h, v0.8b,  v6.8b
        ld1             {v7.s}[0], [x1], x2
        ld1             {v7.s}[1], [x1], x2
        \macs           v20.8h, v1.8b,  v7.8b
        sshl            v2.8h,  v2.8h,  v18.8h
        sqxtun          v2.8b,  v2.8h
        sshl            v20.8h, v20.8h, v18.8h
        sqxtun          v4.8b,  v20.8h
        mov             v20.16b, v16.16b                // re-seed accumulators for the next pass
        st1             {v2.s}[0], [x7], x2
        st1             {v2.s}[1], [x7], x2
        mov             v2.16b,  v16.16b
        st1             {v4.s}[0], [x7], x2
        st1             {v4.s}[1], [x7], x2
        b.ne            1b
        ret
2:      sshl            v2.8h,  v2.8h,  v18.8h
        sqxtun          v2.8b,  v2.8h
        st1             {v2.s}[0], [x7], x2
        st1             {v2.s}[1], [x7], x2
        ret
.endm
685
// Emits the exported bidirectional weighting function for one block width
// (the macro argument selects the matching biweight_ row-loop macro).
//
// Register use on entry, as read by this code:
//      x0 = first row pointer (also the destination: it is copied to x7,
//           which the row loops use as the write pointer)
//      x1 = second row pointer, x2 = stride, w3 = row count,
//      w4 = log2_denom, w5 = weight applied to rows read through x0,
//      w6 = weight applied to rows read through x1, w7 = offset
// Presumably called from C as (uint8_t *dst, uint8_t *src, ptrdiff_t stride,
// int height, int log2_denom, int weightd, int weights, int offset)
// -- confirm against the C-side declaration.
//
// Setup shared by all four variants:
//      v16.8h = ((offset + 1) | 1) << log2_denom   accumulator start value
//      v18.8h = ~log2_denom = -(log2_denom + 1)    sshl count, i.e. an
//               arithmetic right shift by log2_denom + 1
//
// The multiplies are unsigned, so a negative weight is negated and paired
// with umlsl instead of umlal. w8 encodes the sign combination:
//      0: both >= 0   1: w5 < 0 only   2: both < 0   3: w6 < 0 only
// Every row-loop macro ends in ret, so the four variants do not fall
// through into each other.
.macro  biweight_func   w
function ff_biweight_h264_pixels_\w\()_neon, export=1
        lsr             w8,  w5,  #31                   // 1 if w5 is negative
        add             w7,  w7,  #1
        eor             w8,  w8,  w6,  lsr #30          // xor with 3 if w6 is negative
        orr             w7,  w7,  #1
        dup             v18.8h,   w4
        lsl             w7,  w7,  w4
        not             v18.16b,  v18.16b
        dup             v16.8h,   w7
        mov             x7,  x0
        cbz             w8,  10f
        subs            w8,  w8,  #1
        b.eq            20f
        subs            w8,  w8,  #1
        b.eq            30f
        b               40f
10:     biweight_\w     umlal, umlal
20:     neg             w5, w5
        biweight_\w     umlal, umlsl
30:     neg             w5, w5
        neg             w6, w6
        biweight_\w     umlsl, umlsl
40:     neg             w6, w6
        biweight_\w     umlsl, umlal
endfunc
.endm

        biweight_func   16
        biweight_func   8
        biweight_func   4
717
// Row loop for 16-pixel-wide in-place weighting; two rows per iteration,
// terminated by ret.
//
// add: vector mnemonic (add, sub, shadd or shsub) that combines the offset
//      vector v16 (first source operand) with the weight * pixel product
//
// In:       x0 = row read pointer, x5 = row write pointer, x1 = stride,
//           w2 = row count, w4 = weight magnitude (low byte is broadcast),
//           v16.8h = offset term, v18.8h = srshl shift count
// Clobbers: v0, v4, v6, v20, v24, v26, v28, x0, w2, x5, flags
//
// The loop repeats until w2, decremented by 2 per iteration, reaches zero.
.macro  weight_16       add
        dup             v0.16b,  w4
1:      subs            w2,  w2,  #2
        ld1             {v20.16b}, [x0], x1
        umull           v4.8h,   v0.8b,  v20.8b
        umull2          v6.8h,   v0.16b, v20.16b
        ld1             {v28.16b}, [x0], x1
        umull           v24.8h,  v0.8b,  v28.8b
        umull2          v26.8h,  v0.16b, v28.16b
        \add            v4.8h,   v16.8h, v4.8h
        srshl           v4.8h,   v4.8h,  v18.8h
        \add            v6.8h,   v16.8h, v6.8h
        srshl           v6.8h,   v6.8h,  v18.8h
        sqxtun          v4.8b,   v4.8h
        sqxtun2         v4.16b,  v6.8h
        \add            v24.8h,  v16.8h, v24.8h
        srshl           v24.8h,  v24.8h, v18.8h
        \add            v26.8h,  v16.8h, v26.8h
        srshl           v26.8h,  v26.8h, v18.8h
        sqxtun          v24.8b,  v24.8h
        sqxtun2         v24.16b, v26.8h
        st1             {v4.16b},  [x5], x1
        st1             {v24.16b}, [x5], x1
        b.ne            1b
        ret
.endm
744
// Row loop for 8-pixel-wide in-place weighting; two rows per iteration,
// terminated by ret.
//
// add: vector mnemonic (add, sub, shadd or shsub) that combines the offset
//      vector v16 (first source operand) with the weight * pixel product
//
// In:       x0 = row read pointer, x5 = row write pointer, x1 = stride,
//           w2 = row count, w4 = weight magnitude (low byte is broadcast),
//           v16.8h = offset term, v18.8h = srshl shift count
// Clobbers: v0, v2, v4, v6, v20, x0, w2, x5, flags
//
// The loop repeats until w2, decremented by 2 per iteration, reaches zero.
.macro  weight_8        add
        dup             v0.8b,  w4
1:      subs            w2,  w2,  #2
        ld1             {v4.8b}, [x0], x1
        umull           v2.8h,  v0.8b,  v4.8b
        ld1             {v6.8b}, [x0], x1
        umull           v20.8h, v0.8b,  v6.8b
        \add            v2.8h,  v16.8h,  v2.8h
        srshl           v2.8h,  v2.8h,  v18.8h
        sqxtun          v2.8b,  v2.8h
        \add            v20.8h, v16.8h,  v20.8h
        srshl           v20.8h, v20.8h, v18.8h
        sqxtun          v4.8b,  v20.8h
        st1             {v2.8b}, [x5], x1
        st1             {v4.8b}, [x5], x1
        b.ne            1b
        ret
.endm
763
// Row loop for 4-pixel-wide in-place weighting; four rows per iteration
// (two rows packed per 8-byte vector), terminated by ret.
//
// add: vector mnemonic (add, sub, shadd or shsub) that combines the offset
//      vector v16 (first source operand) with the weight * pixel product
//
// In:       x0 = row read pointer, x5 = row write pointer, x1 = stride,
//           w2 = row count, w4 = weight magnitude (low byte is broadcast),
//           v16.8h = offset term, v18.8h = srshl shift count
// Clobbers: v0, v2, v4, v6, v20, x0, w2, x5, flags
//
// w2 is decremented by 4 per iteration. If that makes it negative, only the
// first row pair of the iteration is finished (local label 2) before
// returning; otherwise the loop repeats until w2 reaches zero.
.macro  weight_4        add
        dup             v0.8b,  w4
1:      subs            w2,  w2,  #4
        ld1             {v4.s}[0], [x0], x1
        ld1             {v4.s}[1], [x0], x1
        umull           v2.8h,  v0.8b,  v4.8b
        b.lt            2f                              // fewer than 4 rows were left
        ld1             {v6.s}[0], [x0], x1
        ld1             {v6.s}[1], [x0], x1
        umull           v20.8h, v0.8b,  v6.8b
        \add            v2.8h,  v16.8h,  v2.8h
        srshl           v2.8h,  v2.8h,  v18.8h
        sqxtun          v2.8b,  v2.8h
        \add            v20.8h, v16.8h,  v20.8h
        srshl           v20.8h, v20.8h, v18.8h
        sqxtun          v4.8b,  v20.8h
        st1             {v2.s}[0], [x5], x1
        st1             {v2.s}[1], [x5], x1
        st1             {v4.s}[0], [x5], x1
        st1             {v4.s}[1], [x5], x1
        b.ne            1b
        ret
2:      \add            v2.8h,  v16.8h,  v2.8h
        srshl           v2.8h,  v2.8h,  v18.8h
        sqxtun          v2.8b,  v2.8h
        st1             {v2.s}[0], [x5], x1
        st1             {v2.s}[1], [x5], x1
        ret
.endm
793
// weight_func w: emits ff_weight_h264_pixels_<w>_neon.
// Register use as seen in the code: x0 = pixel pointer (processed in place,
// x5 becomes the store pointer), x1 = row step, w2 = row count,
// w3 = shift amount, w4 = signed multiplier, w5 = value shifted left by w3.
// NOTE(review): presumably these are (block, stride, height, log2_denom,
// weight, offset) of the H.264 weight prototype - confirm against the C side.
//
// Four variants of the weight_<w> loop are expanded, each ending in ret:
//   w3 >  1, w4 >= 0 : shadd, then rounding shift right by (w3 - 1)
//   w3 >  1, w4 <  0 : shsub with the negated multiplier
//   w3 <= 1, w4 >= 0 : add, then rounding shift right by w3
//   w3 <= 1, w4 <  0 : sub with the negated multiplier
.macro  weight_func     w
function ff_weight_h264_pixels_\w\()_neon, export=1
        lsl             w5,  w5,  w3
        dup             v16.8h,  w5                     // term combined with every product
        mov             x5,  x0                         // loads via x0, stores via x5
        cmp             w3,  #1
        b.le            20f

        mov             w6,  #1
        sub             w6,  w6,  w3                    // 1 - w3: the halving op covers one bit
        dup             v18.8h,  w6                     // negative count, srshl shifts right
        tbnz            w4,  #31, 10f                   // sign bit set: negative multiplier
        weight_\w       shadd
10:     neg             w4,  w4
        weight_\w       shsub

20:     neg             w6,  w3
        dup             v18.8h,  w6
        tbnz            w4,  #31, 10f
        weight_\w       add
10:     neg             w4,  w4
        weight_\w       sub
endfunc
.endm
818
819        weight_func     16
820        weight_func     8
821        weight_func     4
822
// h264_loop_filter_start_10: common prologue of the high-bit-depth loop
// filters that take a tc0 array.
// In:  w2 = alpha, w3 = beta, x4 = pointer to four tc0 bytes.
// Out: w2 and w3 shifted left by 2, w6 = the four tc0 bytes,
//      v24.s[0] = the same four bytes. Clobbers w8 and the flags.
// Returns from the enclosing function when alpha == 0, when beta == 0,
// or when all four tc0 bytes have their sign bit set; otherwise execution
// continues after the macro.
.macro  h264_loop_filter_start_10
        cmp             w2,  #0
        ldr             w6,  [x4]
        // alpha != 0: flags = compare(beta, 0).
        // alpha == 0: load NZCV = 0b0100 (Z set) so that b.eq below also
        // takes the early return. With NZCV = 0 the alpha == 0 case fell
        // through and ran the whole filter on an all-false mask.
        ccmp            w3,  #0,  #4,  ne
        lsl             w2,  w2,  #2
        mov             v24.s[0], w6
        lsl             w3,  w3,  #2
        and             w8,  w6,  w6,  lsl #16          // bit 31 = sign(tc0[3]) & sign(tc0[1])
        b.eq            1f
        ands            w8,  w8,  w8,  lsl #8           // N = AND of all four tc0 sign bits
        b.ge            2f                              // at least one tc0 byte is non-negative
1:
        ret
2:
.endm
838
// h264_loop_filter_start_intra_10: prologue of the high-bit-depth intra
// filters. Returns from the enclosing function when both w2 (alpha) and
// w3 (beta) are zero. Otherwise both are shifted left by 2 and broadcast:
// v30 = alpha, v31 = beta. Clobbers w4.
.macro h264_loop_filter_start_intra_10
        orr             w4,  w3,  w2
        cbnz            w4,  1f
        ret
1:
        lsl             w2,  w2,  #2
        dup             v30.8h,   w2              // alpha
        lsl             w3,  w3,  #2
        dup             v31.8h,   w3              // beta
.endm
849
// h264_loop_filter_chroma_10: filters eight 16-bit chroma samples across
// one edge.
// In:  v18 = p1, v16 = p0, v0 = q0, v2 = q1, w2 = alpha, w3 = beta,
//      low four bytes of v24 = tc0.
// Out: v16 = filtered p0, v0 = filtered q0, both clipped to 0..1023.
// Branches to label 9 of the enclosing function when no lane passes the
// alpha/beta tests.
// Clobbers v4, v5, v22-v26, v28, v30, v31, x8, x9 and the flags.
.macro  h264_loop_filter_chroma_10
        // Lane mask: |p0 - q0| < alpha && |p1 - p0| < beta && |q1 - q0| < beta
        dup             v22.8h,  w2               // alpha
        dup             v23.8h,  w3               // beta
        uabd            v26.8h,  v16.8h,  v0.8h   // abs(p0 - q0)
        uabd            v28.8h,  v18.8h,  v16.8h  // abs(p1 - p0)
        uabd            v30.8h,  v2.8h,   v0.8h   // abs(q1 - q0)
        cmhi            v26.8h,  v22.8h,  v26.8h  // < alpha
        cmhi            v28.8h,  v23.8h,  v28.8h  // < beta
        cmhi            v30.8h,  v23.8h,  v30.8h  // < beta
        and             v26.16b, v26.16b, v28.16b
        and             v26.16b, v26.16b, v30.16b
        mov             x8,  v26.d[0]
        mov             x9,  v26.d[1]

        // tc0: widen the four bytes, copy each into both bytes of its lane,
        // widen again so every tc0 value fills two adjacent lanes, then << 2
        uxtl            v24.8h,  v24.8b           // tc0
        sli             v24.8h,  v24.8h,  #8
        uxtl            v24.8h,  v24.8b
        shl             v24.8h,  v24.8h,  #2

        // first part of the delta: ((q0 - p0) << 2) + p1
        sub             v4.8h,   v0.8h,   v16.8h
        shl             v4.8h,   v4.8h,   #2
        add             v4.8h,   v4.8h,   v18.8h

        adds            x8,  x8,  x9              // zero only if both mask halves are zero
        b.eq            9f

        movi            v31.8h,  #3               // (tc0 - 1) << (BIT_DEPTH - 8)) + 1
        sub             v4.8h,   v4.8h,   v2.8h   // ... - q1
        uqsub           v24.8h,  v24.8h,  v31.8h  // tc, saturating at 0
        srshr           v4.8h,   v4.8h,   #3      // rounding shift right by 3
        neg             v25.8h,  v24.8h
        smin            v4.8h,   v4.8h,   v24.8h  // clamp delta to [-tc, tc]
        smax            v4.8h,   v4.8h,   v25.8h
        and             v4.16b,  v4.16b,  v26.16b // zero delta in unfiltered lanes
        add             v16.8h,  v16.8h,  v4.8h   // p0 + delta
        sub             v0.8h,   v0.8h,   v4.8h   // q0 - delta

        movi            v5.8h,   #0
        mvni            v4.8h,   #0xFC, lsl #8    // 1023 for clipping
        smin            v16.8h,  v16.8h,  v4.8h
        smin            v0.8h,   v0.8h,   v4.8h
        smax            v16.8h,  v16.8h,  v5.8h
        smax            v0.8h,   v0.8h,   v5.8h
.endm
895
// Filters across a horizontal edge: x0 points at the q0 row, x1 is the row
// step in bytes, w2 = alpha, w3 = beta, x4 = tc0. Eight 16-bit samples per
// row are loaded from the two rows above and the two rows starting at x0;
// the p0 and q0 rows are written back.
function ff_h264_v_loop_filter_chroma_neon_10, export=1
        h264_loop_filter_start_10

        mov             x10, x0
        sub             x0,  x0,  x1, lsl #1
        ld1             {v18.8h}, [x0],  x1       // p1
        ld1             {v16.8h}, [x0],  x1       // p0, x0 is back at the q0 row
        ld1             {v0.8h},  [x10], x1       // q0
        ld1             {v2.8h},  [x10]           // q1

        h264_loop_filter_chroma_10

        sub             x0,  x0,  x1              // p0 row
        st1             {v16.8h}, [x0], x1
        st1             {v0.8h},  [x0], x1
9:
        ret
endfunc
914
// Filters across a vertical edge over eight rows: x0 points at q0 of the
// first row, x1 is the row step in bytes, w2 = alpha, w3 = beta, x4 = tc0.
//
// h_loop_filter_chroma420_10 is a secondary entry point used by the 4:2:2
// variant. It expects x0 already moved 4 bytes to the left, w2/w3 already
// scaled and tc0 in v24.s[0]. It leaves x1, x5, x7, w2, w3 and w6 untouched.
function ff_h264_h_loop_filter_chroma_neon_10, export=1
        h264_loop_filter_start_10

        sub             x0,  x0,  #4 // access the 2nd left pixel
h_loop_filter_chroma420_10:
        add             x10, x0,  x1,  lsl #2     // rows 4-7
        // rows 0-3 go to the low halves, rows 4-7 to the high halves
        ld1             {v18.d}[0], [x0],  x1
        ld1             {v16.d}[0], [x0],  x1
        ld1             {v0.d}[0],  [x0],  x1
        ld1             {v2.d}[0],  [x0],  x1
        ld1             {v18.d}[1], [x10], x1
        ld1             {v16.d}[1], [x10], x1
        ld1             {v0.d}[1],  [x10], x1
        ld1             {v2.d}[1],  [x10], x1

        // rows -> columns: v18 = p1, v16 = p0, v0 = q0, v2 = q1
        transpose_4x8H  v18, v16, v0, v2, v28, v29, v30, v31

        h264_loop_filter_chroma_10

        // columns -> rows
        transpose_4x8H  v18, v16, v0, v2, v28, v29, v30, v31

        sub             x0,  x0,  x1, lsl #2      // x0 advanced four rows while loading
        st1             {v18.d}[0], [x0], x1
        st1             {v16.d}[0], [x0], x1
        st1             {v0.d}[0],  [x0], x1
        st1             {v2.d}[0],  [x0], x1
        st1             {v18.d}[1], [x0], x1
        st1             {v16.d}[1], [x0], x1
        st1             {v0.d}[1],  [x0], x1
        st1             {v2.d}[1],  [x0], x1
9:
        ret
endfunc
948
// 4:2:2 variant: runs the eight-row filter twice with the row step doubled,
// first from row 0 and then from row 1, so the passes cover the even and
// the odd rows. Both passes use the same four tc0 bytes.
function ff_h264_h_loop_filter_chroma422_neon_10, export=1
        h264_loop_filter_start_10
        mov             x7,  x30                  // bl below overwrites the link register
        add             x5,  x0,  x1              // row 1, start of the second pass
        lsl             x1,  x1,  #1              // step over every other row
        sub             x0,  x0,  #4
        bl              h_loop_filter_chroma420_10
        sub             x0,  x5,  #4
        mov             v24.s[0], w6              // v24 was consumed by the first pass
        mov             x30, x7
        b               h_loop_filter_chroma420_10 // tail call, returns to our caller
endfunc
961
// h264_loop_filter_chroma_intra_10: intra filter for eight 16-bit chroma
// samples across one edge.
// In:  v18 = p1, v16 = p0, v17 = q0, v19 = q1, v30 = alpha, v31 = beta.
// Out: in lanes that pass the alpha/beta tests,
//      v16 = (2*p1 + p0 + q1 + 2) >> 2 and v17 = (2*q1 + q0 + p1 + 2) >> 2.
// Branches to label 9 of the enclosing function when no lane passes.
// Clobbers v4, v6, v20, v22, v24-v28, x2, x3 and the flags.
.macro h264_loop_filter_chroma_intra_10
        uabd            v26.8h,  v16.8h,  v17.8h  // abs(p0 - q0)
        uabd            v27.8h,  v18.8h,  v16.8h  // abs(p1 - p0)
        uabd            v28.8h,  v19.8h,  v17.8h  // abs(q1 - q0)
        cmhi            v26.8h,  v30.8h,  v26.8h  // < alpha
        cmhi            v27.8h,  v31.8h,  v27.8h  // < beta
        cmhi            v28.8h,  v31.8h,  v28.8h  // < beta
        and             v26.16b, v26.16b, v27.16b
        and             v26.16b, v26.16b, v28.16b
        mov             x2,  v26.d[0]
        mov             x3,  v26.d[1]

        add             v4.8h,   v18.8h,  v18.8h  // 2 * p1
        add             v6.8h,   v19.8h,  v19.8h  // 2 * q1

        adds            x2,  x2,  x3              // zero only if both mask halves are zero
        b.eq            9f

        add             v20.8h,  v4.8h,   v16.8h  // 2*p1 + p0
        add             v22.8h,  v6.8h,   v17.8h  // 2*q1 + q0
        add             v20.8h,  v20.8h,  v19.8h  // ... + q1
        add             v22.8h,  v22.8h,  v18.8h  // ... + p1
        urshr           v24.8h,  v20.8h,  #2
        urshr           v25.8h,  v22.8h,  #2
        bit             v16.16b, v24.16b, v26.16b // replace p0 where the mask is set
        bit             v17.16b, v25.16b, v26.16b // replace q0 where the mask is set
.endm
989
// Intra filter across a horizontal edge: x0 points at the q0 row, x1 is the
// row step in bytes, w2 = alpha, w3 = beta. Eight 16-bit samples per row;
// the p0 and q0 rows are written back.
function ff_h264_v_loop_filter_chroma_intra_neon_10, export=1
        h264_loop_filter_start_intra_10
        mov             x9,  x0
        sub             x0,  x0,  x1, lsl #1
        ld1             {v18.8h}, [x0], x1        // p1
        ld1             {v16.8h}, [x0], x1        // p0, x0 is back at the q0 row
        ld1             {v17.8h}, [x9], x1        // q0
        ld1             {v19.8h}, [x9]            // q1

        h264_loop_filter_chroma_intra_10

        sub             x0,  x0,  x1              // p0 row
        st1             {v16.8h}, [x0], x1
        st1             {v17.8h}, [x0], x1

9:
        ret
endfunc
1008
// Intra filter across a vertical edge over four rows: x0 points at q0 of
// the first row, x1 is the row step in bytes, w2 = alpha, w3 = beta.
// Only lanes 0-3 (one per row) are stored, as p0/q0 pairs.
function ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10, export=1
        h264_loop_filter_start_intra_10

        sub             x4,  x0,  #4              // load from two samples left of q0
        add             x9,  x4,  x1, lsl #1      // rows 2 and 3
        sub             x0,  x0,  #2              // store from p0
        ld1             {v18.8h}, [x4], x1        // row 0
        ld1             {v16.8h}, [x4], x1        // row 1
        ld1             {v17.8h}, [x9], x1        // row 2
        ld1             {v19.8h}, [x9], x1        // row 3

        // rows -> columns: v18 = p1, v16 = p0, v17 = q0, v19 = q1
        transpose_4x8H v18, v16, v17, v19, v26, v27, v28, v29

        h264_loop_filter_chroma_intra_10

        st2             {v16.h,v17.h}[0], [x0], x1
        st2             {v16.h,v17.h}[1], [x0], x1
        st2             {v16.h,v17.h}[2], [x0], x1
        st2             {v16.h,v17.h}[3], [x0], x1

9:
        ret
endfunc
1032
// Intra filter across a vertical edge over eight rows: x0 points at q0 of
// the first row, x1 is the row step in bytes, w2 = alpha, w3 = beta.
//
// h_loop_filter_chroma420_intra_10 is a secondary entry point used by the
// 4:2:2 variant. It expects x4 = load pointer (two samples left of q0),
// x0 = store pointer (p0), v30 = alpha and v31 = beta. On return x9 points
// eight rows past the initial x4; x1, x5 and x7 are untouched.
function ff_h264_h_loop_filter_chroma_intra_neon_10, export=1
        h264_loop_filter_start_intra_10
        sub             x4,  x0,  #4
        sub             x0,  x0,  #2
h_loop_filter_chroma420_intra_10:
        add             x9,  x4,  x1, lsl #2      // rows 4-7
        // rows 0-3 go to the low halves, rows 4-7 to the high halves
        ld1             {v18.d}[0], [x4], x1
        ld1             {v16.d}[0], [x4], x1
        ld1             {v17.d}[0], [x4], x1
        ld1             {v19.d}[0], [x4], x1
        ld1             {v18.d}[1], [x9], x1
        ld1             {v16.d}[1], [x9], x1
        ld1             {v17.d}[1], [x9], x1
        ld1             {v19.d}[1], [x9], x1

        // rows -> columns: v18 = p1, v16 = p0, v17 = q0, v19 = q1
        transpose_4x8H v18, v16, v17, v19, v26, v27, v28, v29

        h264_loop_filter_chroma_intra_10

        // one p0/q0 pair per row
        st2             {v16.h,v17.h}[0], [x0], x1
        st2             {v16.h,v17.h}[1], [x0], x1
        st2             {v16.h,v17.h}[2], [x0], x1
        st2             {v16.h,v17.h}[3], [x0], x1
        st2             {v16.h,v17.h}[4], [x0], x1
        st2             {v16.h,v17.h}[5], [x0], x1
        st2             {v16.h,v17.h}[6], [x0], x1
        st2             {v16.h,v17.h}[7], [x0], x1

9:
        ret
endfunc
1064
// 4:2:2 intra variant: runs the eight-row intra filter twice, first on
// rows 0-7 and then on rows 8-15.
function ff_h264_h_loop_filter_chroma422_intra_neon_10, export=1
        h264_loop_filter_start_intra_10
        mov             x7,  x30                  // bl below overwrites the link register
        add             x5,  x0,  x1, lsl #3      // q0 of row 8
        sub             x4,  x0,  #4
        sub             x0,  x0,  #2
        bl              h_loop_filter_chroma420_intra_10
        mov             x30, x7
        mov             x4,  x9                   // x9 ended eight rows below the first x4
        sub             x0,  x5,  #2
        b               h_loop_filter_chroma420_intra_10 // tail call, returns to our caller
endfunc
1077