/*
 * Copyright (c) 2017 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"


// The input to and output from this macro is in the registers v16-v31,
// and v0-v7 are used as scratch registers.
// p7 = v16 .. p3 = v20, p0 = v23, q0 = v24, q3 = v27, q7 = v31
// Depending on the width of the loop filter, we either use v16-v19
// and v28-v31 as temp registers, or v8-v15.
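// The scalar parameters are set up by the bpp frontends at the bottom
// of this file: w2 = E, w3 = I, w4 = H (each scaled to the bit depth),
// w5 = the flat threshold (1 << (BIT_DEPTH - 8)), w6 = 16 - BIT_DEPTH
// (the shift used for the saturating clips) and w7 = (1 << BIT_DEPTH) - 1
// (the max pixel value).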
.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
        dup             v0.8h,  w2                   // E
        dup             v2.8h,  w3                   // I
        dup             v3.8h,  w4                   // H

        uabd            v4.8h,  v20.8h, v21.8h       // abs(p3 - p2)
        uabd            v5.8h,  v21.8h, v22.8h       // abs(p2 - p1)
        uabd            v6.8h,  v22.8h, v23.8h       // abs(p1 - p0)
        uabd            v7.8h,  v24.8h, v25.8h       // abs(q0 - q1)
        uabd            \tmp1\().8h,  v25.8h, v26.8h // abs(q1 - q2)
        uabd            \tmp2\().8h,  v26.8h, v27.8h // abs(q2 - q3)
        umax            v4.8h,  v4.8h,  v5.8h
        umax            v5.8h,  v6.8h,  v7.8h
        umax            \tmp1\().8h,  \tmp1\().8h, \tmp2\().8h
        uabd            v6.8h,  v23.8h, v24.8h       // abs(p0 - q0)
        umax            v4.8h,  v4.8h,  v5.8h
        add             v6.8h,  v6.8h,  v6.8h        // abs(p0 - q0) * 2
        uabd            v5.8h,  v22.8h, v25.8h       // abs(p1 - q1)
        umax            v4.8h,  v4.8h,  \tmp1\().8h  // max(abs(p3 - p2), ..., abs(q2 - q3))
        ushr            v5.8h,  v5.8h,  #1
        cmhs            v4.8h,  v2.8h,  v4.8h        // max(abs()) <= I
        add             v6.8h,  v6.8h,  v5.8h        // abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1)
        cmhs            v6.8h,  v0.8h,  v6.8h
        and             v4.16b, v4.16b, v6.16b       // fm
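        // In C terms, the mask computed above is roughly (a sketch):
        // fm = max(abs(p3 - p2), abs(p2 - p1), abs(p1 - p0), abs(q0 - q1),
        //          abs(q1 - q2), abs(q2 - q3)) <= I &&
        //      abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E;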

        // If no pixels need filtering, just exit as soon as possible
        mov             x11, v4.d[0]
        mov             x12, v4.d[1]
        adds            x11, x11, x12
        b.ne            1f
        ret             x10
1:

.if \wd >= 8
        dup             v0.8h,  w5

        uabd            v6.8h,  v20.8h, v23.8h       // abs(p3 - p0)
        uabd            v2.8h,  v21.8h, v23.8h       // abs(p2 - p0)
        uabd            v1.8h,  v22.8h, v23.8h       // abs(p1 - p0)
        uabd            \tmp1\().8h,  v25.8h, v24.8h // abs(q1 - q0)
        uabd            \tmp2\().8h,  v26.8h, v24.8h // abs(q2 - q0)
        uabd            \tmp3\().8h,  v27.8h, v24.8h // abs(q3 - q0)
        umax            v6.8h,  v6.8h,  v2.8h
        umax            v1.8h,  v1.8h,  \tmp1\().8h
        umax            \tmp2\().8h,  \tmp2\().8h,  \tmp3\().8h
.if \wd == 16
        uabd            v7.8h,  v16.8h, v23.8h       // abs(p7 - p0)
        umax            v6.8h,  v6.8h,  v1.8h
        uabd            v2.8h,  v17.8h, v23.8h       // abs(p6 - p0)
        umax            v6.8h,  v6.8h,  \tmp2\().8h
        uabd            v1.8h,  v18.8h, v23.8h       // abs(p5 - p0)
        cmhs            v6.8h,  v0.8h,  v6.8h        // flat8in
        uabd            v8.8h,  v19.8h, v23.8h       // abs(p4 - p0)
        and             v6.16b, v6.16b, v4.16b       // flat8in && fm
        uabd            v9.8h,  v28.8h, v24.8h       // abs(q4 - q0)
        bic             v4.16b, v4.16b, v6.16b       // fm && !flat8in
        uabd            v10.8h, v29.8h, v24.8h       // abs(q5 - q0)
        uabd            v11.8h, v30.8h, v24.8h       // abs(q6 - q0)
        uabd            v12.8h, v31.8h, v24.8h       // abs(q7 - q0)

        umax            v7.8h,  v7.8h,  v2.8h
        umax            v1.8h,  v1.8h,  v8.8h
        umax            v9.8h,  v9.8h,  v10.8h
        umax            v11.8h, v11.8h, v12.8h
        // The rest of the calculation of flat8out is interleaved below
.else
        // The rest of the calculation of flat8in is interleaved below
.endif
.endif
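        // In C terms, the flatness masks computed above are roughly
        // (a sketch; F is the flat threshold from w5):
        // flat8in  = max(abs(p3 - p0), abs(p2 - p0), abs(p1 - p0),
        //                abs(q1 - q0), abs(q2 - q0), abs(q3 - q0)) <= F;
        // flat8out = max(abs(p7 - p0), abs(p6 - p0), abs(p5 - p0), abs(p4 - p0),
        //                abs(q4 - q0), abs(q5 - q0), abs(q6 - q0), abs(q7 - q0)) <= F; // wd == 16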

        // Calculate the normal inner loop filter for 2 or 4 pixels
        uabd            v5.8h,  v22.8h, v23.8h                  // abs(p1 - p0)
.if \wd == 16
        umax            v7.8h,  v7.8h,  v1.8h
        umax            v9.8h,  v9.8h,  v11.8h
.elseif \wd == 8
        umax            v6.8h,  v6.8h,  v1.8h
.endif
        uabd            v1.8h,  v25.8h, v24.8h                  // abs(q1 - q0)
.if \wd == 16
        umax            v7.8h,  v7.8h,  v9.8h
.elseif \wd == 8
        umax            v6.8h,  v6.8h,  \tmp2\().8h
.endif
        dup             \tmp2\().8h,  w6                        // left shift for saturation
        sub             \tmp1\().8h,  v22.8h,  v25.8h           // p1 - q1
        neg             \tmp6\().8h,  \tmp2\().8h               // negative left shift after saturation
        umax            v5.8h,  v5.8h,  v1.8h                   // max(abs(p1 - p0), abs(q1 - q0))
        sub             \tmp3\().8h,  v24.8h,  v23.8h           // q0 - p0
        movi            \tmp5\().8h,  #3
.if \wd == 8
        cmhs            v6.8h,  v0.8h,  v6.8h                   // flat8in
.endif
        cmhs            v5.8h,  v3.8h,  v5.8h                   // !hev
.if \wd == 8
        and             v6.16b, v6.16b, v4.16b                  // flat8in && fm
.endif
        sqshl           \tmp1\().8h,  \tmp1\().8h,  \tmp2\().8h
.if \wd == 16
        cmhs            v7.8h,  v0.8h,  v7.8h                   // flat8out
.elseif \wd == 8
        bic             v4.16b, v4.16b, v6.16b                  // fm && !flat8in
.endif
        and             v5.16b,  v5.16b,  v4.16b                // !hev && fm && !flat8in
.if \wd == 16
        and             v7.16b, v7.16b, v6.16b                  // flat8out && flat8in && fm
.endif
        sshl            \tmp1\().8h,  \tmp1\().8h,  \tmp6\().8h // av_clip_int2p(p1 - q1, BIT_DEPTH - 1)
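        // The sqshl/sshl pair implements the clip: saturating a left shift
        // by 16 - BIT_DEPTH against the 16 bit limit and shifting back
        // clamps the value to [-(1 << (BIT_DEPTH - 1)), (1 << (BIT_DEPTH - 1)) - 1].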

        mul             \tmp3\().8h,  \tmp3\().8h,  \tmp5\().8h // 3 * (q0 - p0)
        bic             \tmp1\().16b, \tmp1\().16b, v5.16b      // if (!hev) av_clip_int8 = 0
        movi            v2.8h,  #4
        add             \tmp3\().8h,  \tmp3\().8h,  \tmp1\().8h // 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
        movi            v3.8h,  #3
        sqshl           \tmp1\().8h,  \tmp3\().8h,  \tmp2\().8h
        movi            \tmp5\().8h,  #0
        sshl            \tmp1\().8h,  \tmp1\().8h,  \tmp6\().8h // av_clip_int2p(3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)], BIT_DEPTH - 1) = f
        dup             \tmp6\().8h,  w7                        // max pixel value
.if \wd == 16
        bic             v6.16b, v6.16b, v7.16b                  // fm && flat8in && !flat8out
.endif

        ushr            \tmp2\().8h,  \tmp6\().8h,  #1          // (1 << (BIT_DEPTH - 1)) - 1

        add             \tmp3\().8h,  \tmp1\().8h,  v2.8h       // f + 4
        add             \tmp4\().8h,  \tmp1\().8h,  v3.8h       // f + 3
        smin            \tmp3\().8h,  \tmp3\().8h,  \tmp2\().8h // FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1)
        smin            \tmp4\().8h,  \tmp4\().8h,  \tmp2\().8h // FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1)
        sshr            \tmp3\().8h,  \tmp3\().8h,  #3          // f1
        sshr            \tmp4\().8h,  \tmp4\().8h,  #3          // f2

        add             v0.8h,   v23.8h,  \tmp4\().8h           // p0 + f2
        sub             v2.8h,   v24.8h,  \tmp3\().8h           // q0 - f1
        smin            v0.8h,   v0.8h,   \tmp6\().8h
        smin            v2.8h,   v2.8h,   \tmp6\().8h
        srshr           \tmp3\().8h, \tmp3\().8h, #1            // f = (f1 + 1) >> 1
        smax            v0.8h,   v0.8h,   \tmp5\().8h           // out p0
        smax            v2.8h,   v2.8h,   \tmp5\().8h           // out q0
        bit             v23.16b, v0.16b,  v4.16b                // if (fm && !flat8in)
        bit             v24.16b, v2.16b,  v4.16b

        add             v0.8h,  v22.8h,  \tmp3\().8h            // p1 + f
        sub             v2.8h,  v25.8h,  \tmp3\().8h            // q1 - f
.if \wd >= 8
        mov             x11, v6.d[0]
.endif
        smin            v0.8h,  v0.8h,  \tmp6\().8h
        smin            v2.8h,  v2.8h,  \tmp6\().8h
.if \wd >= 8
        mov             x12, v6.d[1]
.endif
        smax            v0.8h,  v0.8h,  \tmp5\().8h             // out p1
        smax            v2.8h,  v2.8h,  \tmp5\().8h             // out q1
.if \wd >= 8
        adds            x11, x11, x12
.endif
        bit             v22.16b, v0.16b,  v5.16b                // if (!hev && fm && !flat8in)
        bit             v25.16b, v2.16b,  v5.16b
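
        // The scalar equivalent of the inner filter applied above is roughly
        // (a sketch; clip2p() stands for the av_clip_int2p() done with the
        // sqshl/sshl pairs, max for the max pixel value from w7):
        // f  = clip2p(3 * (q0 - p0) + (hev ? clip2p(p1 - q1) : 0));
        // f1 = FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1) >> 3;
        // f2 = FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1) >> 3;
        // p0 = av_clip(p0 + f2, 0, max);    q0 = av_clip(q0 - f1, 0, max);
        // if (!hev) {
        //     f  = (f1 + 1) >> 1;
        //     p1 = av_clip(p1 + f, 0, max); q1 = av_clip(q1 - f, 0, max);
        // }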

        // If no pixels need flat8in, jump to flat8out
        // (or to a writeout of the inner 4 pixels, for wd=8)
.if \wd >= 8
.if \wd == 16
        b.eq            6f
.else
        b.ne            1f
        ret             x13
1:
.endif

        // flat8in
        add             \tmp1\().8h, v20.8h, v21.8h
        add             \tmp3\().8h, v22.8h, v25.8h
        add             \tmp5\().8h, v20.8h, v22.8h
        add             \tmp7\().8h, v23.8h, v26.8h
        add             v0.8h,  \tmp1\().8h, \tmp1\().8h
        add             v0.8h,  v0.8h,  v23.8h
        add             v0.8h,  v0.8h,  v24.8h
        add             v0.8h,  v0.8h,  \tmp5\().8h
        sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
        sub             \tmp7\().8h, \tmp7\().8h, \tmp5\().8h
        urshr           v2.8h,  v0.8h,  #3                      // out p2

        add             v0.8h,  v0.8h,  \tmp3\().8h
        add             \tmp1\().8h, v20.8h,  v23.8h
        add             \tmp3\().8h, v24.8h,  v27.8h
        urshr           v3.8h,  v0.8h,  #3                      // out p1

        add             v0.8h,  v0.8h,  \tmp7\().8h
        sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
        add             \tmp5\().8h, v21.8h,  v24.8h
        add             \tmp7\().8h, v25.8h,  v27.8h
        urshr           v4.8h,  v0.8h,  #3                      // out p0

        add             v0.8h,  v0.8h,  \tmp3\().8h
        sub             \tmp7\().8h, \tmp7\().8h, \tmp5\().8h
        add             \tmp1\().8h, v22.8h,  v25.8h
        add             \tmp3\().8h, v26.8h,  v27.8h
        urshr           v5.8h,  v0.8h,  #3                      // out q0

        add             v0.8h,  v0.8h,  \tmp7\().8h
        sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
        urshr           \tmp5\().8h, v0.8h,  #3                 // out q1

        add             v0.8h,  v0.8h,  \tmp3\().8h
        // The output here is written back into the input registers. This doesn't
        // matter for the flat8out part below, since we only update those pixels
        // which won't be touched below.
        bit             v21.16b, v2.16b,  v6.16b
        bit             v22.16b, v3.16b,  v6.16b
        bit             v23.16b, v4.16b,  v6.16b
        urshr           \tmp6\().8h,  v0.8h,  #3                // out q2
        bit             v24.16b, v5.16b,  v6.16b
        bit             v25.16b, \tmp5\().16b,  v6.16b
        bit             v26.16b, \tmp6\().16b,  v6.16b
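
        // Written out in C, the outputs of the running sum v0 above are
        // roughly (a sketch):
        // p2' = (p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3;
        // p1' = (p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3;
        // p0' = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3;
        // q0' = (p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3;
        // q1' = (p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3 + 4) >> 3;
        // q2' = (p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3 + 4) >> 3;
        // Each step just adds the entering taps and subtracts the expired ones.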
.endif
.if \wd == 16
6:
        orr             v2.16b,  v6.16b,  v7.16b
        mov             x11, v2.d[0]
        mov             x12, v2.d[1]
        adds            x11, x11, x12
        b.ne            1f
        // If no pixels needed flat8in nor flat8out, jump to a
        // writeout of the inner 4 pixels
        ret             x14
1:

        mov             x11, v7.d[0]
        mov             x12, v7.d[1]
        adds            x11, x11, x12
        b.ne            1f
        // If no pixels need flat8out, jump to a writeout of the inner 6 pixels
        ret             x15

1:
        // flat8out
        // This writes all outputs into v2-v17 (skipping v7 and v16).
        // If this part is skipped, the output is read from v21-v26 (which is the input
        // to this section).
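        // As in the flat8in part, v0 carries a running sum, here for the
        // 16-tap smoothing filter; the first output below is roughly (a sketch)
        // p6' = (7 * p7 + 2 * p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4
        // and each following output slides the window one pixel towards q7,
        // adding the two entering taps and subtracting the two expired ones.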
        shl             v0.8h,   v16.8h,  #3     // 8 * v16
        sub             v0.8h,   v0.8h,   v16.8h // 7 * v16
        add             v0.8h,   v0.8h,   v17.8h
        add             v8.8h,   v17.8h,  v18.8h
        add             v10.8h,  v19.8h,  v20.8h
        add             v0.8h,   v0.8h,   v8.8h
        add             v8.8h,   v16.8h,  v17.8h
        add             v12.8h,  v21.8h,  v22.8h
        add             v0.8h,   v0.8h,   v10.8h
        add             v10.8h,  v18.8h,  v25.8h
        add             v14.8h,  v23.8h,  v24.8h
        sub             v10.8h,  v10.8h,  v8.8h
        add             v0.8h,   v0.8h,   v12.8h
        add             v0.8h,   v0.8h,   v14.8h
        add             v12.8h,  v16.8h,  v18.8h
        add             v14.8h,  v19.8h,  v26.8h
        urshr           v2.8h,   v0.8h,   #4

        add             v0.8h,   v0.8h,   v10.8h
        add             v8.8h,   v16.8h,  v19.8h
        add             v10.8h,  v20.8h,  v27.8h
        sub             v14.8h,  v14.8h,  v12.8h
        bif             v2.16b,  v17.16b, v7.16b
        urshr           v3.8h,   v0.8h,   #4

        add             v0.8h,   v0.8h,   v14.8h
        add             v12.8h,  v16.8h,  v20.8h
        add             v14.8h,  v21.8h,  v28.8h
        sub             v10.8h,  v10.8h,  v8.8h
        bif             v3.16b,  v18.16b, v7.16b
        urshr           v4.8h,   v0.8h,   #4

        add             v0.8h,   v0.8h,   v10.8h
        add             v8.8h,   v16.8h,  v21.8h
        add             v10.8h,  v22.8h,  v29.8h
        sub             v14.8h,  v14.8h,  v12.8h
        bif             v4.16b,  v19.16b, v7.16b
        urshr           v5.8h,   v0.8h,   #4

        add             v0.8h,   v0.8h,   v14.8h
        add             v12.8h,  v16.8h,  v22.8h
        add             v14.8h,  v23.8h,  v30.8h
        sub             v10.8h,  v10.8h,  v8.8h
        bif             v5.16b,  v20.16b, v7.16b
        urshr           v6.8h,   v0.8h,   #4

        add             v0.8h,   v0.8h,   v10.8h
        add             v10.8h,  v16.8h,  v23.8h
        sub             v14.8h,  v14.8h,  v12.8h
        add             v12.8h,  v24.8h,  v31.8h
        bif             v6.16b,  v21.16b, v7.16b
        urshr           v8.8h,   v0.8h,   #4

        add             v0.8h,   v0.8h,   v14.8h
        sub             v10.8h,  v12.8h,  v10.8h
        add             v12.8h,  v17.8h,  v24.8h
        add             v14.8h,  v25.8h,  v31.8h
        bif             v8.16b,  v22.16b, v7.16b
        urshr           v9.8h,   v0.8h,   #4

        add             v0.8h,   v0.8h,   v10.8h
        sub             v14.8h,  v14.8h,  v12.8h
        add             v12.8h,  v26.8h,  v31.8h
        bif             v9.16b,  v23.16b, v7.16b
        urshr           v10.8h,  v0.8h,   #4

        add             v0.8h,   v0.8h,   v14.8h
        add             v14.8h,  v18.8h,  v25.8h
        add             v18.8h,  v19.8h,  v26.8h
        sub             v12.8h,  v12.8h,  v14.8h
        add             v14.8h,  v27.8h,  v31.8h
        bif             v10.16b, v24.16b, v7.16b
        urshr           v11.8h,  v0.8h,   #4

        add             v0.8h,   v0.8h,   v12.8h
        add             v12.8h,  v20.8h,  v27.8h
        sub             v14.8h,  v14.8h,  v18.8h
        add             v18.8h,  v28.8h,  v31.8h
        bif             v11.16b, v25.16b, v7.16b
        sub             v18.8h,  v18.8h,  v12.8h
        urshr           v12.8h,  v0.8h,   #4

        add             v0.8h,   v0.8h,   v14.8h
        add             v14.8h,  v21.8h,  v28.8h
        add             v20.8h,  v29.8h,  v31.8h
        bif             v12.16b, v26.16b, v7.16b
        urshr           v13.8h,  v0.8h,   #4

        add             v0.8h,   v0.8h,   v18.8h
        sub             v20.8h,  v20.8h,  v14.8h
        add             v18.8h,  v22.8h,  v29.8h
        add             v22.8h,  v30.8h,  v31.8h
        bif             v13.16b, v27.16b, v7.16b
        urshr           v14.8h,  v0.8h,   #4

        add             v0.8h,   v0.8h,   v20.8h
        sub             v22.8h,  v22.8h,  v18.8h
        bif             v14.16b, v28.16b, v7.16b
        urshr           v15.8h,  v0.8h,   #4

        add             v0.8h,   v0.8h,   v22.8h
        bif             v15.16b, v29.16b, v7.16b
        urshr           v17.8h,  v0.8h,   #4
        bif             v17.16b, v30.16b, v7.16b
.endif
.endm

// For wd <= 8, we use v16-v19 and v28-v31 as temp registers,
// while for wd=16 we need those for inputs/outputs and use v8-v15
// as temp registers instead.
function vp9_loop_filter_4
        loop_filter     4,  v16, v17, v18, v19, v28, v29, v30, v31
        ret
endfunc

function vp9_loop_filter_8
        loop_filter     8,  v16, v17, v18, v19, v28, v29, v30, v31
        ret
endfunc

function vp9_loop_filter_16
        loop_filter     16, v8,  v9,  v10, v11, v12, v13, v14, v15
        ret
endfunc

.macro loop_filter_4
        bl              vp9_loop_filter_4
.endm

.macro loop_filter_8
        // calculate alternative 'return' targets
        adr             x13, 6f
        bl              vp9_loop_filter_8
.endm

.macro loop_filter_16
        // calculate alternative 'return' targets
        adr             x14, 7f
        adr             x15, 8f
        bl              vp9_loop_filter_16
.endm
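
// The shared cores above return through registers rather than plain
// branches: x10 holds the normal return address, while x13 (for wd=8)
// and x14/x15 (for wd=16) are set up here to point at alternative
// writeout paths in the caller (the 6:/7:/8: labels below), used when
// the flat8in/flat8out parts were skipped for all pixels.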


// The public functions in this file have the following signature:
// void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);
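// The 10/12 bpp entry points scale these parameters for the actual
// bit depth before calling the shared 16 bpp cores; in C terms,
// roughly (a sketch, with illustrative names):
// E <<= bpp - 8;  I <<= bpp - 8;  H <<= bpp - 8;  // w2, w3, w4
// flat_thresh = 1 << (bpp - 8);                   // x5
// sat_shift   = 16 - bpp;                         // x6
// max_pixel   = (1 << bpp) - 1;                   // x7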

.macro bpp_frontend func, bpp, push
function ff_\func\()_\bpp\()_neon, export=1
.if \push
        mov             x16, x30
        stp             d14, d15, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
        stp             d8,  d9,  [sp, #-0x10]!
.endif
        lsl             w2,  w2,  #\bpp - 8
        lsl             w3,  w3,  #\bpp - 8
        lsl             w4,  w4,  #\bpp - 8
        mov             x5,  #1 << (\bpp - 8)
        mov             x6,  #16 - \bpp
        mov             x7,  #((1 << \bpp) - 1)
.if \push
        bl              \func\()_16_neon
        ldp             d8,  d9,  [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10
        ret             x16
.else
        b               \func\()_16_neon
.endif
endfunc
.endm

.macro bpp_frontends func, push=0
        bpp_frontend    \func, 10, \push
        bpp_frontend    \func, 12, \push
.endm

.macro bpp_frontend_rep func, suffix, int_suffix, dir, bpp, push
function ff_\func\()_\suffix\()_\bpp\()_neon, export=1
        mov             x16, x30
.if \push
        stp             d14, d15, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
        stp             d8,  d9,  [sp, #-0x10]!
.endif
        lsl             w2,  w2,  #\bpp - 8
        lsl             w3,  w3,  #\bpp - 8
        lsl             w4,  w4,  #\bpp - 8
        mov             x5,  #1 << (\bpp - 8)
        mov             x6,  #16 - \bpp
        mov             x7,  #((1 << \bpp) - 1)
        bl              \func\()_\int_suffix\()_16_neon
.ifc \dir,h
        add             x0,  x0,  x1, lsl #3
.else
        add             x0,  x0,  #16
.endif
        bl              \func\()_\int_suffix\()_16_neon
.if \push
        ldp             d8,  d9,  [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10
.endif
        ret             x16
endfunc
.endm

.macro bpp_frontends_rep func, suffix, int_suffix, dir, push=0
        bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 10, \push
        bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 12, \push
.endm
.macro bpp_frontend_mix2 wd1, wd2, dir, bpp
function ff_vp9_loop_filter_\dir\()_\wd1\()\wd2\()_16_\bpp\()_neon, export=1
        mov             x16, x30
        lsr             w8,  w2,  #8
        lsr             w14, w3,  #8
        lsr             w15, w4,  #8
        and             w2,  w2,  #0xff
        and             w3,  w3,  #0xff
        and             w4,  w4,  #0xff
        lsl             w2,  w2,  #\bpp - 8
        lsl             w3,  w3,  #\bpp - 8
        lsl             w4,  w4,  #\bpp - 8
        mov             x5,  #1 << (\bpp - 8)
        mov             x6,  #16 - \bpp
        mov             x7,  #((1 << \bpp) - 1)
        bl              vp9_loop_filter_\dir\()_\wd1\()_8_16_neon
.ifc \dir,h
        add             x0,  x0,  x1, lsl #3
.else
        add             x0,  x0,  #16
.endif
        lsl             w2,  w8,  #\bpp - 8
        lsl             w3,  w14, #\bpp - 8
        lsl             w4,  w15, #\bpp - 8
        bl              vp9_loop_filter_\dir\()_\wd2\()_8_16_neon
        ret             x16
endfunc
.endm

.macro bpp_frontends_mix2 wd1, wd2
        bpp_frontend_mix2 \wd1, \wd2, v, 10
        bpp_frontend_mix2 \wd1, \wd2, v, 12
        bpp_frontend_mix2 \wd1, \wd2, h, 10
        bpp_frontend_mix2 \wd1, \wd2, h, 12
.endm

function vp9_loop_filter_v_4_8_16_neon
        mov             x10, x30
        sub             x9,  x0,  x1, lsl #2
        ld1             {v20.8h}, [x9], x1 // p3
        ld1             {v24.8h}, [x0], x1 // q0
        ld1             {v21.8h}, [x9], x1 // p2
        ld1             {v25.8h}, [x0], x1 // q1
        ld1             {v22.8h}, [x9], x1 // p1
        ld1             {v26.8h}, [x0], x1 // q2
        ld1             {v23.8h}, [x9], x1 // p0
        ld1             {v27.8h}, [x0], x1 // q3
        sub             x0,  x0,  x1, lsl #2
        sub             x9,  x9,  x1, lsl #1

        loop_filter_4

        st1             {v22.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        sub             x0,  x0,  x1, lsl #1

        ret             x10
endfunc

bpp_frontends vp9_loop_filter_v_4_8

function vp9_loop_filter_h_4_8_16_neon
        mov             x10, x30
        sub             x9,  x0,  #8
        add             x0,  x9,  x1, lsl #2
        ld1             {v20.8h}, [x9], x1
        ld1             {v24.8h}, [x0], x1
        ld1             {v21.8h}, [x9], x1
        ld1             {v25.8h}, [x0], x1
        ld1             {v22.8h}, [x9], x1
        ld1             {v26.8h}, [x0], x1
        ld1             {v23.8h}, [x9], x1
        ld1             {v27.8h}, [x0], x1

        sub             x9,  x9,  x1, lsl #2
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #8

        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        loop_filter_4

        // Move x9 forward by 2 pixels; we don't need to rewrite the
        // outermost 2 pixels since they aren't changed.
        add             x9,  x9,  #4
        add             x0,  x9,  x1, lsl #2

        // We will only write back the mid 4 pixels; after the loop filter,
        // these are in v22, v23, v24, v25, ordered as rows (8x4 pixels).
        // We need to transpose them to columns, done with a 4x8 transpose
        // (which in practice is two 4x4 transposes of the two 4x4 halves
        // of the 8x4 pixels, giving 4x8 pixels).
        transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
        st1             {v22.d}[0], [x9], x1
        st1             {v22.d}[1], [x0], x1
        st1             {v23.d}[0], [x9], x1
        st1             {v23.d}[1], [x0], x1
        st1             {v24.d}[0], [x9], x1
        st1             {v24.d}[1], [x0], x1
        st1             {v25.d}[0], [x9], x1
        st1             {v25.d}[1], [x0], x1
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #4

        ret             x10
endfunc

bpp_frontends vp9_loop_filter_h_4_8

function vp9_loop_filter_v_8_8_16_neon
        mov             x10, x30
        sub             x9,  x0,  x1, lsl #2
        ld1             {v20.8h}, [x9], x1 // p3
        ld1             {v24.8h}, [x0], x1 // q0
        ld1             {v21.8h}, [x9], x1 // p2
        ld1             {v25.8h}, [x0], x1 // q1
        ld1             {v22.8h}, [x9], x1 // p1
        ld1             {v26.8h}, [x0], x1 // q2
        ld1             {v23.8h}, [x9], x1 // p0
        ld1             {v27.8h}, [x0], x1 // q3
        sub             x9,  x9,  x1, lsl #2
        sub             x0,  x0,  x1, lsl #2
        add             x9,  x9,  x1

        loop_filter_8

        st1             {v21.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v22.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v26.8h}, [x0], x1
        sub             x0,  x0,  x1, lsl #1
        sub             x0,  x0,  x1

        ret             x10
6:
        sub             x9,  x0,  x1, lsl #1
        st1             {v22.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        sub             x0,  x0,  x1, lsl #1
        ret             x10
endfunc

bpp_frontends vp9_loop_filter_v_8_8

function vp9_loop_filter_h_8_8_16_neon
        mov             x10, x30
        sub             x9,  x0,  #8
        add             x0,  x9,  x1, lsl #2
        ld1             {v20.8h}, [x9], x1
        ld1             {v24.8h}, [x0], x1
        ld1             {v21.8h}, [x9], x1
        ld1             {v25.8h}, [x0], x1
        ld1             {v22.8h}, [x9], x1
        ld1             {v26.8h}, [x0], x1
        ld1             {v23.8h}, [x9], x1
        ld1             {v27.8h}, [x0], x1

        sub             x9,  x9,  x1, lsl #2
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #8

        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        loop_filter_8

        add             x0,  x9,  x1, lsl #2

        // Even though only 6 pixels per row have been changed, we write the
        // full 8 pixel registers.
        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        st1             {v20.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v21.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        st1             {v22.8h}, [x9], x1
        st1             {v26.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v27.8h}, [x0], x1
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #8

        ret             x10
6:
        // If we didn't need to do the flat8in part, we use the same writeback
        // as in loop_filter_h_4_8.
        add             x9,  x9,  #4
        add             x0,  x9,  x1, lsl #2
        transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
        st1             {v22.d}[0], [x9], x1
        st1             {v22.d}[1], [x0], x1
        st1             {v23.d}[0], [x9], x1
        st1             {v23.d}[1], [x0], x1
        st1             {v24.d}[0], [x9], x1
        st1             {v24.d}[1], [x0], x1
        st1             {v25.d}[0], [x9], x1
        st1             {v25.d}[1], [x0], x1
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #4
        ret             x10
endfunc

bpp_frontends vp9_loop_filter_h_8_8

bpp_frontends_mix2 4, 4
bpp_frontends_mix2 4, 8
bpp_frontends_mix2 8, 4
bpp_frontends_mix2 8, 8

function vp9_loop_filter_v_16_8_16_neon
        mov             x10, x30
        sub             x9,  x0,  x1, lsl #3
        ld1             {v16.8h}, [x9], x1 // p7
        ld1             {v24.8h}, [x0], x1 // q0
        ld1             {v17.8h}, [x9], x1 // p6
        ld1             {v25.8h}, [x0], x1 // q1
        ld1             {v18.8h}, [x9], x1 // p5
        ld1             {v26.8h}, [x0], x1 // q2
        ld1             {v19.8h}, [x9], x1 // p4
        ld1             {v27.8h}, [x0], x1 // q3
        ld1             {v20.8h}, [x9], x1 // p3
        ld1             {v28.8h}, [x0], x1 // q4
        ld1             {v21.8h}, [x9], x1 // p2
        ld1             {v29.8h}, [x0], x1 // q5
        ld1             {v22.8h}, [x9], x1 // p1
        ld1             {v30.8h}, [x0], x1 // q6
        ld1             {v23.8h}, [x9], x1 // p0
        ld1             {v31.8h}, [x0], x1 // q7
        sub             x9,  x9,  x1, lsl #3
        sub             x0,  x0,  x1, lsl #3
        add             x9,  x9,  x1

        loop_filter_16

        // If we did the flat8out part, we get the output in
        // v2-v17 (skipping v7 and v16). x9 points to x0 - 7 * stride;
        // store v2-v9 there, and v10-v17 at x0.
        st1             {v2.8h},  [x9], x1
        st1             {v10.8h}, [x0], x1
        st1             {v3.8h},  [x9], x1
        st1             {v11.8h}, [x0], x1
        st1             {v4.8h},  [x9], x1
        st1             {v12.8h}, [x0], x1
        st1             {v5.8h},  [x9], x1
        st1             {v13.8h}, [x0], x1
        st1             {v6.8h},  [x9], x1
        st1             {v14.8h}, [x0], x1
        st1             {v8.8h},  [x9], x1
        st1             {v15.8h}, [x0], x1
        st1             {v9.8h},  [x9], x1
        st1             {v17.8h}, [x0], x1
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  x1

        ret             x10
8:
        add             x9,  x9,  x1, lsl #2
        // If we didn't do the flat8out part, the output is left in the
        // input registers.
        st1             {v21.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v22.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v26.8h}, [x0], x1
        sub             x0,  x0,  x1, lsl #1
        sub             x0,  x0,  x1
        ret             x10
7:
        sub             x9,  x0,  x1, lsl #1
        st1             {v22.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        sub             x0,  x0,  x1, lsl #1
        ret             x10
endfunc

bpp_frontends vp9_loop_filter_v_16_8, push=1
bpp_frontends_rep vp9_loop_filter_v_16, 16, 8, v, push=1

function vp9_loop_filter_h_16_8_16_neon
        mov             x10, x30
        sub             x9,  x0,  #16
        ld1             {v16.8h}, [x9], x1
        ld1             {v24.8h}, [x0], x1
        ld1             {v17.8h}, [x9], x1
        ld1             {v25.8h}, [x0], x1
        ld1             {v18.8h}, [x9], x1
        ld1             {v26.8h}, [x0], x1
        ld1             {v19.8h}, [x9], x1
        ld1             {v27.8h}, [x0], x1
        ld1             {v20.8h}, [x9], x1
        ld1             {v28.8h}, [x0], x1
        ld1             {v21.8h}, [x9], x1
        ld1             {v29.8h}, [x0], x1
        ld1             {v22.8h}, [x9], x1
        ld1             {v30.8h}, [x0], x1
        ld1             {v23.8h}, [x9], x1
        ld1             {v31.8h}, [x0], x1
        sub             x0,  x0,  x1, lsl #3
        sub             x9,  x9,  x1, lsl #3

        // The 16x8 pixels read above are in two 8x8 blocks; the left
        // half in v16-v23, and the right half in v24-v31. Do two 8x8 transposes
        // of this, to get one column per register.
        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v0, v1

        loop_filter_16

        transpose_8x8H  v16, v2,  v3,  v4,  v5,  v6,  v8,  v9,  v0, v1
        transpose_8x8H  v10, v11, v12, v13, v14, v15, v17, v31, v0, v1

        st1             {v16.8h}, [x9], x1
        st1             {v10.8h}, [x0], x1
        st1             {v2.8h},  [x9], x1
        st1             {v11.8h}, [x0], x1
        st1             {v3.8h},  [x9], x1
        st1             {v12.8h}, [x0], x1
        st1             {v4.8h},  [x9], x1
        st1             {v13.8h}, [x0], x1
        st1             {v5.8h},  [x9], x1
        st1             {v14.8h}, [x0], x1
        st1             {v6.8h},  [x9], x1
        st1             {v15.8h}, [x0], x1
        st1             {v8.8h},  [x9], x1
        st1             {v17.8h}, [x0], x1
        st1             {v9.8h},  [x9], x1
        st1             {v31.8h}, [x0], x1
        sub             x0,  x0,  x1, lsl #3

        ret             x10
8:
        // The same writeback as in loop_filter_h_8_8
        sub             x9,  x0,  #8
        add             x0,  x9,  x1, lsl #2
        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        st1             {v20.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v21.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        st1             {v22.8h}, [x9], x1
        st1             {v26.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v27.8h}, [x0], x1
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #8
        ret             x10
7:
        // The same writeback as in loop_filter_h_4_8
        sub             x9,  x0,  #4
        add             x0,  x9,  x1, lsl #2
        transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
        st1             {v22.d}[0], [x9], x1
        st1             {v22.d}[1], [x0], x1
        st1             {v23.d}[0], [x9], x1
        st1             {v23.d}[1], [x0], x1
        st1             {v24.d}[0], [x9], x1
        st1             {v24.d}[1], [x0], x1
        st1             {v25.d}[0], [x9], x1
        st1             {v25.d}[1], [x0], x1
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #4
        ret             x10
endfunc

bpp_frontends vp9_loop_filter_h_16_8, push=1
bpp_frontends_rep vp9_loop_filter_h_16, 16, 8, h, push=1
