1 /*
2  * Copyright (c) 2017 Google Inc.
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include "libavutil/aarch64/asm.S"
22 #include "neon.S"
23 
24 
// The input to and output from this macro are in the registers v16-v31,
// and v0-v7 are used as scratch registers.
// p7 = v16 .. p3 = v20, p0 = v23, q0 = v24, q3 = v27, q7 = v31
// Depending on the width of the loop filter, we use either v16-v19
// and v28-v31 or v8-v15 as temp registers.
30 .macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
31         dup             v0.8h,  w2                   // E
32         dup             v2.8h,  w3                   // I
33         dup             v3.8h,  w4                   // H
34 
35         uabd            v4.8h,  v20.8h, v21.8h       // abs(p3 - p2)
36         uabd            v5.8h,  v21.8h, v22.8h       // abs(p2 - p1)
37         uabd            v6.8h,  v22.8h, v23.8h       // abs(p1 - p0)
38         uabd            v7.8h,  v24.8h, v25.8h       // abs(q0 - q1)
39         uabd            \tmp1\().8h,  v25.8h, v26.8h // abs(q1 - q2)
40         uabd            \tmp2\().8h,  v26.8h, v27.8h // abs(q2 - q3)
41         umax            v4.8h,  v4.8h,  v5.8h
42         umax            v5.8h,  v6.8h,  v7.8h
43         umax            \tmp1\().8h,  \tmp1\().8h, \tmp2\().8h
44         uabd            v6.8h,  v23.8h, v24.8h       // abs(p0 - q0)
45         umax            v4.8h,  v4.8h,  v5.8h
46         add             v6.8h,  v6.8h,  v6.8h        // abs(p0 - q0) * 2
47         uabd            v5.8h,  v22.8h, v25.8h       // abs(p1 - q1)
48         umax            v4.8h,  v4.8h,  \tmp1\().8h  // max(abs(p3 - p2), ..., abs(q2 - q3))
49         ushr            v5.8h,  v5.8h,  #1
50         cmhs            v4.8h,  v2.8h,  v4.8h        // max(abs()) <= I
        add             v6.8h,  v6.8h,  v5.8h        // abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1)
        cmhs            v6.8h,  v0.8h,  v6.8h        // abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E
53         and             v4.16b, v4.16b, v6.16b       // fm
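        // A scalar sketch of the mask computed above, following the per-line
        // comments (not literal decoder code):
        //   fm = max(abs(p3 - p2), abs(p2 - p1), abs(p1 - p0), abs(q0 - q1),
        //            abs(q1 - q2), abs(q2 - q3)) <= I &&
        //        abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E;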
54 
55         // If no pixels need filtering, just exit as soon as possible
56         mov             x11, v4.d[0]
57         mov             x12, v4.d[1]
58         adds            x11, x11, x12
59         b.ne            1f
60         ret             x10
61 1:
62 
63 .if \wd >= 8
64         dup             v0.8h,  w5
65 
66         uabd            v6.8h,  v20.8h, v23.8h       // abs(p3 - p0)
67         uabd            v2.8h,  v21.8h, v23.8h       // abs(p2 - p0)
68         uabd            v1.8h,  v22.8h, v23.8h       // abs(p1 - p0)
69         uabd            \tmp1\().8h,  v25.8h, v24.8h // abs(q1 - q0)
70         uabd            \tmp2\().8h,  v26.8h, v24.8h // abs(q2 - q0)
71         uabd            \tmp3\().8h,  v27.8h, v24.8h // abs(q3 - q0)
72         umax            v6.8h,  v6.8h,  v2.8h
73         umax            v1.8h,  v1.8h,  \tmp1\().8h
74         umax            \tmp2\().8h,  \tmp2\().8h,  \tmp3\().8h
75 .if \wd == 16
76         uabd            v7.8h,  v16.8h, v23.8h       // abs(p7 - p0)
77         umax            v6.8h,  v6.8h,  v1.8h
78         uabd            v2.8h,  v17.8h, v23.8h       // abs(p6 - p0)
79         umax            v6.8h,  v6.8h,  \tmp2\().8h
80         uabd            v1.8h,  v18.8h, v23.8h       // abs(p5 - p0)
81         cmhs            v6.8h,  v0.8h,  v6.8h        // flat8in
82         uabd            v8.8h,  v19.8h, v23.8h       // abs(p4 - p0)
83         and             v6.16b, v6.16b, v4.16b       // flat8in && fm
84         uabd            v9.8h,  v28.8h, v24.8h       // abs(q4 - q0)
85         bic             v4.16b, v4.16b, v6.16b       // fm && !flat8in
86         uabd            v10.8h, v29.8h, v24.8h       // abs(q5 - q0)
87         uabd            v11.8h, v30.8h, v24.8h       // abs(q6 - q0)
88         uabd            v12.8h, v31.8h, v24.8h       // abs(q7 - q0)
89 
90         umax            v7.8h,  v7.8h,  v2.8h
91         umax            v1.8h,  v1.8h,  v8.8h
92         umax            v9.8h,  v9.8h,  v10.8h
93         umax            v11.8h, v11.8h, v12.8h
94         // The rest of the calculation of flat8out is interleaved below
95 .else
96         // The rest of the calculation of flat8in is interleaved below
97 .endif
98 .endif
99 
        // Calculate the normal inner loop filter, which updates 2 or 4 pixels
        // across the edge (p0/q0 always, plus p1/q1 when !hev)
101         uabd            v5.8h,  v22.8h, v23.8h                  // abs(p1 - p0)
102 .if \wd == 16
103         umax            v7.8h,  v7.8h,  v1.8h
104         umax            v9.8h,  v9.8h,  v11.8h
105 .elseif \wd == 8
106         umax            v6.8h,  v6.8h,  v1.8h
107 .endif
108         uabd            v1.8h,  v25.8h, v24.8h                  // abs(q1 - q0)
109 .if \wd == 16
110         umax            v7.8h,  v7.8h,  v9.8h
111 .elseif \wd == 8
112         umax            v6.8h,  v6.8h,  \tmp2\().8h
113 .endif
114         dup             \tmp2\().8h,  w6                        // left shift for saturation
115         sub             \tmp1\().8h,  v22.8h,  v25.8h           // p1 - q1
116         neg             \tmp6\().8h,  \tmp2\().8h               // negative left shift after saturation
117         umax            v5.8h,  v5.8h,  v1.8h                   // max(abs(p1 - p0), abs(q1 - q0))
118         sub             \tmp3\().8h,  v24.8h,  v23.8h           // q0 - p0
119         movi            \tmp5\().8h,  #3
120 .if \wd == 8
121         cmhs            v6.8h,  v0.8h,  v6.8h                   // flat8in
122 .endif
123         cmhs            v5.8h,  v3.8h,  v5.8h                   // !hev
124 .if \wd == 8
125         and             v6.16b, v6.16b, v4.16b                  // flat8in && fm
126 .endif
127         sqshl           \tmp1\().8h,  \tmp1\().8h,  \tmp2\().8h
128 .if \wd == 16
129         cmhs            v7.8h,  v0.8h,  v7.8h                   // flat8out
130 .elseif \wd == 8
131         bic             v4.16b, v4.16b, v6.16b                  // fm && !flat8in
132 .endif
133         and             v5.16b,  v5.16b,  v4.16b                // !hev && fm && !flat8in
134 .if \wd == 16
135         and             v7.16b, v7.16b, v6.16b                  // flat8out && flat8in && fm
136 .endif
        sshl            \tmp1\().8h,  \tmp1\().8h,  \tmp6\().8h // av_clip_intp2(p1 - q1, BIT_DEPTH - 1)
138 
139         mul             \tmp3\().8h,  \tmp3\().8h,  \tmp5\().8h // 3 * (q0 - p0)
        bic             \tmp1\().16b, \tmp1\().16b, v5.16b      // if (!hev) av_clip_intp2(p1 - q1) = 0
        movi            v2.8h,  #4
        add             \tmp3\().8h,  \tmp3\().8h,  \tmp1\().8h // 3 * (q0 - p0) [+ av_clip_intp2(p1 - q1)]
143         movi            v3.8h,  #3
144         sqshl           \tmp1\().8h,  \tmp3\().8h,  \tmp2\().8h
145         movi            \tmp5\().8h,  #0
        sshl            \tmp1\().8h,  \tmp1\().8h,  \tmp6\().8h // av_clip_intp2(3 * (q0 - p0) [+ av_clip_intp2(p1 - q1)], BIT_DEPTH - 1) = f
147         dup             \tmp6\().8h,  w7                        // max pixel value
148 .if \wd == 16
149         bic             v6.16b, v6.16b, v7.16b                  // fm && flat8in && !flat8out
150 .endif
151 
152         ushr            \tmp2\().8h,  \tmp6\().8h,  #1          // (1 << (BIT_DEPTH - 1)) - 1
153 
154         add             \tmp3\().8h,  \tmp1\().8h,  v2.8h       // f + 4
155         add             \tmp4\().8h,  \tmp1\().8h,  v3.8h       // f + 3
156         smin            \tmp3\().8h,  \tmp3\().8h,  \tmp2\().8h // FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1)
157         smin            \tmp4\().8h,  \tmp4\().8h,  \tmp2\().8h // FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1)
158         sshr            \tmp3\().8h,  \tmp3\().8h,  #3          // f1
159         sshr            \tmp4\().8h,  \tmp4\().8h,  #3          // f2
160 
161         add             v0.8h,   v23.8h,  \tmp4\().8h           // p0 + f2
162         sub             v2.8h,   v24.8h,  \tmp3\().8h           // q0 - f1
163         smin            v0.8h,   v0.8h,   \tmp6\().8h
164         smin            v2.8h,   v2.8h,   \tmp6\().8h
165         srshr           \tmp3\().8h, \tmp3\().8h, #1            // f = (f1 + 1) >> 1
166         smax            v0.8h,   v0.8h,   \tmp5\().8h           // out p0
167         smax            v2.8h,   v2.8h,   \tmp5\().8h           // out q0
168         bit             v23.16b, v0.16b,  v4.16b                // if (fm && !flat8in)
169         bit             v24.16b, v2.16b,  v4.16b
170 
171         add             v0.8h,  v22.8h,  \tmp3\().8h            // p1 + f
172         sub             v2.8h,  v25.8h,  \tmp3\().8h            // q1 - f
173 .if \wd >= 8
174         mov             x11, v6.d[0]
175 .endif
176         smin            v0.8h,  v0.8h,  \tmp6\().8h
177         smin            v2.8h,  v2.8h,  \tmp6\().8h
178 .if \wd >= 8
179         mov             x12, v6.d[1]
180 .endif
181         smax            v0.8h,  v0.8h,  \tmp5\().8h             // out p1
182         smax            v2.8h,  v2.8h,  \tmp5\().8h             // out q1
183 .if \wd >= 8
184         adds            x11, x11, x12
185 .endif
186         bit             v22.16b, v0.16b,  v5.16b                // if (!hev && fm && !flat8in)
187         bit             v25.16b, v2.16b,  v5.16b
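        // Scalar sketch of the inner filter applied above, following the
        // comments (a rough reference, not literal decoder code; "clip()"
        // stands for av_clip_intp2(x, BIT_DEPTH - 1), and the outputs are
        // clamped to [0, max pixel value]):
        //   f  = clip(3 * (q0 - p0) + (hev ? clip(p1 - q1) : 0));
        //   f1 = FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1) >> 3;
        //   f2 = FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1) >> 3;
        //   p0 += f2;  q0 -= f1;                     // where fm && !flat8in
        //   if (!hev) { f = (f1 + 1) >> 1;  p1 += f;  q1 -= f; }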
188 
189         // If no pixels need flat8in, jump to flat8out
190         // (or to a writeout of the inner 4 pixels, for wd=8)
191 .if \wd >= 8
192 .if \wd == 16
193         b.eq            6f
194 .else
195         b.ne            1f
196         ret             x13
197 1:
198 .endif
199 
200         // flat8in
201         add             \tmp1\().8h, v20.8h, v21.8h
202         add             \tmp3\().8h, v22.8h, v25.8h
203         add             \tmp5\().8h, v20.8h, v22.8h
204         add             \tmp7\().8h, v23.8h, v26.8h
205         add             v0.8h,  \tmp1\().8h, \tmp1\().8h
206         add             v0.8h,  v0.8h,  v23.8h
207         add             v0.8h,  v0.8h,  v24.8h
208         add             v0.8h,  v0.8h,  \tmp5\().8h
209         sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
210         sub             \tmp7\().8h, \tmp7\().8h, \tmp5\().8h
211         urshr           v2.8h,  v0.8h,  #3                      // out p2
212 
213         add             v0.8h,  v0.8h,  \tmp3\().8h
214         add             \tmp1\().8h, v20.8h,  v23.8h
215         add             \tmp3\().8h, v24.8h,  v27.8h
216         urshr           v3.8h,  v0.8h,  #3                      // out p1
217 
218         add             v0.8h,  v0.8h,  \tmp7\().8h
219         sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
220         add             \tmp5\().8h, v21.8h,  v24.8h
221         add             \tmp7\().8h, v25.8h,  v27.8h
222         urshr           v4.8h,  v0.8h,  #3                      // out p0
223 
224         add             v0.8h,  v0.8h,  \tmp3\().8h
225         sub             \tmp7\().8h, \tmp7\().8h, \tmp5\().8h
226         add             \tmp1\().8h, v22.8h,  v25.8h
227         add             \tmp3\().8h, v26.8h,  v27.8h
228         urshr           v5.8h,  v0.8h,  #3                      // out q0
229 
230         add             v0.8h,  v0.8h,  \tmp7\().8h
231         sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
232         urshr           \tmp5\().8h, v0.8h,  #3                 // out q1
233 
234         add             v0.8h,  v0.8h,  \tmp3\().8h
        // The output here is written back into the input registers. This doesn't
        // matter for the flat8out part below, since we only update those pixels
        // which won't be touched below.
238         bit             v21.16b, v2.16b,  v6.16b
239         bit             v22.16b, v3.16b,  v6.16b
240         bit             v23.16b, v4.16b,  v6.16b
241         urshr           \tmp6\().8h,  v0.8h,  #3                // out q2
242         bit             v24.16b, v5.16b,  v6.16b
243         bit             v25.16b, \tmp5\().16b,  v6.16b
244         bit             v26.16b, \tmp6\().16b,  v6.16b
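        // The adds/subs above maintain a running sum in v0; a scalar sketch of
        // the outputs computed from it (urshr #3 being the "+ 4 >> 3" rounding):
        //   out p2 = (3*p3 + 2*p2 + p1 + p0 + q0 + 4) >> 3
        //   out p1 = (2*p3 + p2 + 2*p1 + p0 + q0 + q1 + 4) >> 3
        //   out p0 = (p3 + p2 + p1 + 2*p0 + q0 + q1 + q2 + 4) >> 3
        //   out q0 = (p2 + p1 + p0 + 2*q0 + q1 + q2 + q3 + 4) >> 3
        //   out q1 = (p1 + p0 + q0 + 2*q1 + q2 + 2*q3 + 4) >> 3
        //   out q2 = (p0 + q0 + q1 + 2*q2 + 3*q3 + 4) >> 3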
245 .endif
246 .if \wd == 16
247 6:
248         orr             v2.16b,  v6.16b,  v7.16b
249         mov             x11, v2.d[0]
250         mov             x12, v2.d[1]
251         adds            x11, x11, x12
252         b.ne            1f
253         // If no pixels needed flat8in nor flat8out, jump to a
254         // writeout of the inner 4 pixels
255         ret             x14
256 1:
257 
258         mov             x11, v7.d[0]
259         mov             x12, v7.d[1]
260         adds            x11, x11, x12
261         b.ne            1f
262         // If no pixels need flat8out, jump to a writeout of the inner 6 pixels
263         ret             x15
264 
265 1:
266         // flat8out
        // This writes all outputs into v2-v17 (skipping v7 and v16).
268         // If this part is skipped, the output is read from v21-v26 (which is the input
269         // to this section).
270         shl             v0.8h,   v16.8h,  #3     // 8 * v16
271         sub             v0.8h,   v0.8h,   v16.8h // 7 * v16
272         add             v0.8h,   v0.8h,   v17.8h
273         add             v8.8h,   v17.8h,  v18.8h
274         add             v10.8h,  v19.8h,  v20.8h
275         add             v0.8h,   v0.8h,   v8.8h
276         add             v8.8h,   v16.8h,  v17.8h
277         add             v12.8h,  v21.8h,  v22.8h
278         add             v0.8h,   v0.8h,   v10.8h
279         add             v10.8h,  v18.8h,  v25.8h
280         add             v14.8h,  v23.8h,  v24.8h
281         sub             v10.8h,  v10.8h,  v8.8h
282         add             v0.8h,   v0.8h,   v12.8h
283         add             v0.8h,   v0.8h,   v14.8h
284         add             v12.8h,  v16.8h,  v18.8h
285         add             v14.8h,  v19.8h,  v26.8h
286         urshr           v2.8h,   v0.8h,   #4
287 
288         add             v0.8h,   v0.8h,   v10.8h
289         add             v8.8h,   v16.8h,  v19.8h
290         add             v10.8h,  v20.8h,  v27.8h
291         sub             v14.8h,  v14.8h,  v12.8h
292         bif             v2.16b,  v17.16b, v7.16b
        urshr           v3.8h,   v0.8h,   #4
294 
295         add             v0.8h,   v0.8h,   v14.8h
296         add             v12.8h,  v16.8h,  v20.8h
297         add             v14.8h,  v21.8h,  v28.8h
298         sub             v10.8h,  v10.8h,  v8.8h
299         bif             v3.16b,  v18.16b, v7.16b
300         urshr           v4.8h,   v0.8h,   #4
301 
302         add             v0.8h,   v0.8h,   v10.8h
303         add             v8.8h,   v16.8h,  v21.8h
304         add             v10.8h,  v22.8h,  v29.8h
305         sub             v14.8h,  v14.8h,  v12.8h
306         bif             v4.16b,  v19.16b, v7.16b
307         urshr           v5.8h,   v0.8h,   #4
308 
309         add             v0.8h,   v0.8h,   v14.8h
310         add             v12.8h,  v16.8h,  v22.8h
311         add             v14.8h,  v23.8h,  v30.8h
312         sub             v10.8h,  v10.8h,  v8.8h
313         bif             v5.16b,  v20.16b, v7.16b
314         urshr           v6.8h,   v0.8h,   #4
315 
316         add             v0.8h,   v0.8h,   v10.8h
317         add             v10.8h,  v16.8h,  v23.8h
318         sub             v14.8h,  v14.8h,  v12.8h
319         add             v12.8h,  v24.8h,  v31.8h
320         bif             v6.16b,  v21.16b, v7.16b
321         urshr           v8.8h,   v0.8h,   #4
322 
323         add             v0.8h,   v0.8h,   v14.8h
324         sub             v10.8h,  v12.8h,  v10.8h
325         add             v12.8h,  v17.8h,  v24.8h
326         add             v14.8h,  v25.8h,  v31.8h
327         bif             v8.16b,  v22.16b, v7.16b
328         urshr           v9.8h,   v0.8h,   #4
329 
330         add             v0.8h,   v0.8h,   v10.8h
331         sub             v14.8h,  v14.8h,  v12.8h
332         add             v12.8h,  v26.8h,  v31.8h
333         bif             v9.16b,  v23.16b, v7.16b
334         urshr           v10.8h,  v0.8h,   #4
335 
336         add             v0.8h,   v0.8h,   v14.8h
337         add             v14.8h,  v18.8h,  v25.8h
338         add             v18.8h,  v19.8h,  v26.8h
339         sub             v12.8h,  v12.8h,  v14.8h
340         add             v14.8h,  v27.8h,  v31.8h
341         bif             v10.16b, v24.16b, v7.16b
342         urshr           v11.8h,  v0.8h,   #4
343 
344         add             v0.8h,   v0.8h,   v12.8h
345         add             v12.8h,  v20.8h,  v27.8h
346         sub             v14.8h,  v14.8h,  v18.8h
347         add             v18.8h,  v28.8h,  v31.8h
348         bif             v11.16b, v25.16b, v7.16b
349         sub             v18.8h,  v18.8h,  v12.8h
350         urshr           v12.8h,  v0.8h,   #4
351 
352         add             v0.8h,   v0.8h,   v14.8h
353         add             v14.8h,  v21.8h,  v28.8h
354         add             v20.8h,  v29.8h,  v31.8h
355         bif             v12.16b, v26.16b, v7.16b
356         urshr           v13.8h,  v0.8h,   #4
357 
358         add             v0.8h,   v0.8h,   v18.8h
359         sub             v20.8h,  v20.8h,  v14.8h
360         add             v18.8h,  v22.8h,  v29.8h
361         add             v22.8h,  v30.8h,  v31.8h
362         bif             v13.16b, v27.16b, v7.16b
363         urshr           v14.8h,  v0.8h,   #4
364 
365         add             v0.8h,   v0.8h,   v20.8h
366         sub             v22.8h,  v22.8h,  v18.8h
367         bif             v14.16b, v28.16b, v7.16b
368         urshr           v15.8h,  v0.8h,   #4
369 
370         add             v0.8h,   v0.8h,   v22.8h
371         bif             v15.16b, v29.16b, v7.16b
372         urshr           v17.8h,  v0.8h,   #4
373         bif             v17.16b, v30.16b, v7.16b
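        // Scalar sketch of the flat8out outputs above: a 15-tap sliding
        // average with rounding, starting from
        //   out p6 = (7*p7 + 2*p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4
        // where, for each following output, the running sum in v0 drops the
        // oldest p term and adds the next q term (p7 and q7 being repeated at
        // the ends), down to
        //   out q6 = (p0 + q0 + q1 + q2 + q3 + q4 + q5 + 2*q6 + 7*q7 + 8) >> 4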
374 .endif
375 .endm
376 
// For wd <= 8, we use v16-v19 and v28-v31 as temp registers, while for
// wd=16 we need those registers for inputs/outputs and use v8-v15 as
// temp registers instead.
380 function vp9_loop_filter_4
381         loop_filter     4,  v16, v17, v18, v19, v28, v29, v30, v31
382         ret
383 endfunc
384 
385 function vp9_loop_filter_8
386         loop_filter     8,  v16, v17, v18, v19, v28, v29, v30, v31
387         ret
388 endfunc
389 
390 function vp9_loop_filter_16
391         loop_filter     16, v8,  v9,  v10, v11, v12, v13, v14, v15
392         ret
393 endfunc
394 
395 .macro loop_filter_4
396         bl              vp9_loop_filter_4
397 .endm
398 
399 .macro loop_filter_8
400         // calculate alternative 'return' targets
401         adr             x13, 6f
402         bl              vp9_loop_filter_8
403 .endm
404 
405 .macro loop_filter_16
406         // calculate alternative 'return' targets
407         adr             x14, 7f
408         adr             x15, 8f
409         bl              vp9_loop_filter_16
410 .endm
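
// Note on the return convention: inside the loop_filter macro, x10 holds the
// return address of the outer vp9_loop_filter_*_16_neon function (so
// "ret x10" exits it directly when no pixels need filtering), while x13
// (wd=8) and x14/x15 (wd=16) point at the reduced writeback paths labelled
// 6:, 7: and 8: in the functions further down.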
411 
412 
// The public functions in this file have the following signature:
// void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);
// where mb_lim, lim and hev_thr correspond to the E, I and H thresholds
// used in the comments above.
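// The 10 and 12 bpp frontends below scale these thresholds to the current
// bit depth and pass a few extra parameters to the shared cores, roughly
// (a sketch of the setup done below, not literal C):
//   E <<= bpp - 8;  I <<= bpp - 8;  H <<= bpp - 8;  // w2, w3, w4
//   x5 = 1 << (bpp - 8);   // "flat" threshold
//   x6 = 16 - bpp;         // left shift amount used for saturation
//   x7 = (1 << bpp) - 1;   // max pixel value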
415 
416 .macro bpp_frontend func, bpp, push
417 function ff_\func\()_\bpp\()_neon, export=1
418 .if \push
419         mov             x16, x30
420         stp             d14, d15, [sp, #-0x10]!
421         stp             d12, d13, [sp, #-0x10]!
422         stp             d10, d11, [sp, #-0x10]!
423         stp             d8,  d9,  [sp, #-0x10]!
424 .endif
425         lsl             w2,  w2,  #\bpp - 8
426         lsl             w3,  w3,  #\bpp - 8
427         lsl             w4,  w4,  #\bpp - 8
428         mov             x5,  #1 << (\bpp - 8)
429         mov             x6,  #16 - \bpp
430         mov             x7,  #((1 << \bpp) - 1)
431 .if \push
432         bl              \func\()_16_neon
433         ldp             d8,  d9,  [sp], 0x10
434         ldp             d10, d11, [sp], 0x10
435         ldp             d12, d13, [sp], 0x10
436         ldp             d14, d15, [sp], 0x10
437         ret             x16
438 .else
439         b               \func\()_16_neon
440 .endif
441 endfunc
442 .endm
443 
444 .macro bpp_frontends func, push=0
445         bpp_frontend    \func, 10, \push
446         bpp_frontend    \func, 12, \push
447 .endm
448 
449 .macro bpp_frontend_rep func, suffix, int_suffix, dir, bpp, push
450 function ff_\func\()_\suffix\()_\bpp\()_neon, export=1
451         mov             x16, x30
452 .if \push
453         stp             d14, d15, [sp, #-0x10]!
454         stp             d12, d13, [sp, #-0x10]!
455         stp             d10, d11, [sp, #-0x10]!
456         stp             d8,  d9,  [sp, #-0x10]!
457 .endif
458         lsl             w2,  w2,  #\bpp - 8
459         lsl             w3,  w3,  #\bpp - 8
460         lsl             w4,  w4,  #\bpp - 8
461         mov             x5,  #1 << (\bpp - 8)
462         mov             x6,  #16 - \bpp
463         mov             x7,  #((1 << \bpp) - 1)
464         bl              \func\()_\int_suffix\()_16_neon
465 .ifc \dir,h
466         add             x0,  x0,  x1, lsl #3
467 .else
468         add             x0,  x0,  #16
469 .endif
470         bl              \func\()_\int_suffix\()_16_neon
471 .if \push
472         ldp             d8,  d9,  [sp], 0x10
473         ldp             d10, d11, [sp], 0x10
474         ldp             d12, d13, [sp], 0x10
475         ldp             d14, d15, [sp], 0x10
476 .endif
477         ret             x16
478 endfunc
479 .endm
480 
481 .macro bpp_frontends_rep func, suffix, int_suffix, dir, push=0
482         bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 10, \push
483         bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 12, \push
484 .endm
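
// The mix2 frontends filter two adjacent 8-pixel edges with (possibly)
// different filter widths in a single call; the E, I and H values for the
// two halves come packed into the low byte and the second byte of w2, w3
// and w4, and are unpacked below before calling the two single-width cores.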
485 
486 .macro bpp_frontend_mix2 wd1, wd2, dir, bpp
487 function ff_vp9_loop_filter_\dir\()_\wd1\()\wd2\()_16_\bpp\()_neon, export=1
488         mov             x16, x30
489         lsr             w8,  w2,  #8
490         lsr             w14, w3,  #8
491         lsr             w15, w4,  #8
492         and             w2,  w2,  #0xff
493         and             w3,  w3,  #0xff
494         and             w4,  w4,  #0xff
495         lsl             w2,  w2,  #\bpp - 8
496         lsl             w3,  w3,  #\bpp - 8
497         lsl             w4,  w4,  #\bpp - 8
498         mov             x5,  #1 << (\bpp - 8)
499         mov             x6,  #16 - \bpp
500         mov             x7,  #((1 << \bpp) - 1)
501         bl              vp9_loop_filter_\dir\()_\wd1\()_8_16_neon
502 .ifc \dir,h
503         add             x0,  x0,  x1, lsl #3
504 .else
505         add             x0,  x0,  #16
506 .endif
507         lsl             w2,  w8,  #\bpp - 8
508         lsl             w3,  w14, #\bpp - 8
509         lsl             w4,  w15, #\bpp - 8
510         bl              vp9_loop_filter_\dir\()_\wd2\()_8_16_neon
511         ret             x16
512 endfunc
513 .endm
514 
515 .macro bpp_frontends_mix2 wd1, wd2
516         bpp_frontend_mix2 \wd1, \wd2, v, 10
517         bpp_frontend_mix2 \wd1, \wd2, v, 12
518         bpp_frontend_mix2 \wd1, \wd2, h, 10
519         bpp_frontend_mix2 \wd1, \wd2, h, 12
520 .endm
521 
522 function vp9_loop_filter_v_4_8_16_neon
523         mov             x10, x30
524         sub             x9,  x0,  x1, lsl #2
525         ld1             {v20.8h}, [x9], x1 // p3
526         ld1             {v24.8h}, [x0], x1 // q0
527         ld1             {v21.8h}, [x9], x1 // p2
528         ld1             {v25.8h}, [x0], x1 // q1
529         ld1             {v22.8h}, [x9], x1 // p1
530         ld1             {v26.8h}, [x0], x1 // q2
531         ld1             {v23.8h}, [x9], x1 // p0
532         ld1             {v27.8h}, [x0], x1 // q3
533         sub             x0,  x0,  x1, lsl #2
534         sub             x9,  x9,  x1, lsl #1
535 
536         loop_filter_4
537 
538         st1             {v22.8h}, [x9], x1
539         st1             {v24.8h}, [x0], x1
540         st1             {v23.8h}, [x9], x1
541         st1             {v25.8h}, [x0], x1
542         sub             x0,  x0,  x1, lsl #1
543 
544         ret             x10
545 endfunc
546 
547 bpp_frontends vp9_loop_filter_v_4_8
548 
549 function vp9_loop_filter_h_4_8_16_neon
550         mov             x10, x30
551         sub             x9,  x0,  #8
552         add             x0,  x9,  x1, lsl #2
553         ld1             {v20.8h}, [x9], x1
554         ld1             {v24.8h}, [x0], x1
555         ld1             {v21.8h}, [x9], x1
556         ld1             {v25.8h}, [x0], x1
557         ld1             {v22.8h}, [x9], x1
558         ld1             {v26.8h}, [x0], x1
559         ld1             {v23.8h}, [x9], x1
560         ld1             {v27.8h}, [x0], x1
561 
562         sub             x9,  x9,  x1, lsl #2
563         sub             x0,  x0,  x1, lsl #3
564         add             x0,  x0,  #8
565 
566         transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
567 
568         loop_filter_4
569 
570         // Move x9 forward by 2 pixels; we don't need to rewrite the
571         // outermost 2 pixels since they aren't changed.
572         add             x9,  x9,  #4
573         add             x0,  x9,  x1, lsl #2
574 
        // We will only write the middle 4 pixels back; after the loop filter,
        // these are in v22, v23, v24, v25, ordered as rows (8x4 pixels).
        // We need to transpose them to columns, done with a 4x8 transpose
        // (in practice two 4x4 transposes of the two 4x4 halves of the
        // 8x4 block, giving 4x8 pixels).
580         transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
581         st1             {v22.d}[0], [x9], x1
582         st1             {v22.d}[1], [x0], x1
583         st1             {v23.d}[0], [x9], x1
584         st1             {v23.d}[1], [x0], x1
585         st1             {v24.d}[0], [x9], x1
586         st1             {v24.d}[1], [x0], x1
587         st1             {v25.d}[0], [x9], x1
588         st1             {v25.d}[1], [x0], x1
589         sub             x0,  x0,  x1, lsl #3
590         add             x0,  x0,  #4
591 
592         ret             x10
593 endfunc
594 
595 bpp_frontends vp9_loop_filter_h_4_8
596 
597 function vp9_loop_filter_v_8_8_16_neon
598         mov             x10, x30
599         sub             x9,  x0,  x1, lsl #2
600         ld1             {v20.8h}, [x9], x1 // p3
601         ld1             {v24.8h}, [x0], x1 // q0
602         ld1             {v21.8h}, [x9], x1 // p2
603         ld1             {v25.8h}, [x0], x1 // q1
604         ld1             {v22.8h}, [x9], x1 // p1
605         ld1             {v26.8h}, [x0], x1 // q2
606         ld1             {v23.8h}, [x9], x1 // p0
607         ld1             {v27.8h}, [x0], x1 // q3
608         sub             x9,  x9,  x1, lsl #2
609         sub             x0,  x0,  x1, lsl #2
610         add             x9,  x9,  x1
611 
612         loop_filter_8
613 
614         st1             {v21.8h}, [x9], x1
615         st1             {v24.8h}, [x0], x1
616         st1             {v22.8h}, [x9], x1
617         st1             {v25.8h}, [x0], x1
618         st1             {v23.8h}, [x9], x1
619         st1             {v26.8h}, [x0], x1
620         sub             x0,  x0,  x1, lsl #1
621         sub             x0,  x0,  x1
622 
623         ret             x10
624 6:
625         sub             x9,  x0,  x1, lsl #1
626         st1             {v22.8h}, [x9], x1
627         st1             {v24.8h}, [x0], x1
628         st1             {v23.8h}, [x9], x1
629         st1             {v25.8h}, [x0], x1
630         sub             x0,  x0,  x1, lsl #1
631         ret             x10
632 endfunc
633 
634 bpp_frontends vp9_loop_filter_v_8_8
635 
636 function vp9_loop_filter_h_8_8_16_neon
637         mov             x10, x30
638         sub             x9,  x0,  #8
639         add             x0,  x9,  x1, lsl #2
640         ld1             {v20.8h}, [x9], x1
641         ld1             {v24.8h}, [x0], x1
642         ld1             {v21.8h}, [x9], x1
643         ld1             {v25.8h}, [x0], x1
644         ld1             {v22.8h}, [x9], x1
645         ld1             {v26.8h}, [x0], x1
646         ld1             {v23.8h}, [x9], x1
647         ld1             {v27.8h}, [x0], x1
648 
649         sub             x9,  x9,  x1, lsl #2
650         sub             x0,  x0,  x1, lsl #3
651         add             x0,  x0,  #8
652 
653         transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
654 
655         loop_filter_8
656 
657         add             x0,  x9,  x1, lsl #2
658 
        // Even though only 6 pixels per row have been changed, we write the
        // full 8-pixel registers.
661         transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
662 
663         st1             {v20.8h}, [x9], x1
664         st1             {v24.8h}, [x0], x1
665         st1             {v21.8h}, [x9], x1
666         st1             {v25.8h}, [x0], x1
667         st1             {v22.8h}, [x9], x1
668         st1             {v26.8h}, [x0], x1
669         st1             {v23.8h}, [x9], x1
670         st1             {v27.8h}, [x0], x1
671         sub             x0,  x0,  x1, lsl #3
672         add             x0,  x0,  #8
673 
674         ret             x10
675 6:
676         // If we didn't need to do the flat8in part, we use the same writeback
677         // as in loop_filter_h_4_8.
678         add             x9,  x9,  #4
679         add             x0,  x9,  x1, lsl #2
680         transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
681         st1             {v22.d}[0], [x9], x1
682         st1             {v22.d}[1], [x0], x1
683         st1             {v23.d}[0], [x9], x1
684         st1             {v23.d}[1], [x0], x1
685         st1             {v24.d}[0], [x9], x1
686         st1             {v24.d}[1], [x0], x1
687         st1             {v25.d}[0], [x9], x1
688         st1             {v25.d}[1], [x0], x1
689         sub             x0,  x0,  x1, lsl #3
690         add             x0,  x0,  #4
691         ret             x10
692 endfunc
693 
694 bpp_frontends vp9_loop_filter_h_8_8
695 
696 bpp_frontends_mix2 4, 4
697 bpp_frontends_mix2 4, 8
698 bpp_frontends_mix2 8, 4
699 bpp_frontends_mix2 8, 8
700 
701 function vp9_loop_filter_v_16_8_16_neon
702         mov             x10, x30
703         sub             x9,  x0,  x1, lsl #3
704         ld1             {v16.8h}, [x9], x1 // p7
705         ld1             {v24.8h}, [x0], x1 // q0
706         ld1             {v17.8h}, [x9], x1 // p6
707         ld1             {v25.8h}, [x0], x1 // q1
708         ld1             {v18.8h}, [x9], x1 // p5
709         ld1             {v26.8h}, [x0], x1 // q2
710         ld1             {v19.8h}, [x9], x1 // p4
711         ld1             {v27.8h}, [x0], x1 // q3
712         ld1             {v20.8h}, [x9], x1 // p3
713         ld1             {v28.8h}, [x0], x1 // q4
714         ld1             {v21.8h}, [x9], x1 // p2
715         ld1             {v29.8h}, [x0], x1 // q5
716         ld1             {v22.8h}, [x9], x1 // p1
717         ld1             {v30.8h}, [x0], x1 // q6
718         ld1             {v23.8h}, [x9], x1 // p0
719         ld1             {v31.8h}, [x0], x1 // q7
720         sub             x9,  x9,  x1, lsl #3
721         sub             x0,  x0,  x1, lsl #3
722         add             x9,  x9,  x1
723 
724         loop_filter_16
725 
726         // If we did the flat8out part, we get the output in
727         // v2-v17 (skipping v7 and v16). x9 points to x0 - 7 * stride,
728         // store v2-v9 there, and v10-v17 into x0.
729         st1             {v2.8h},  [x9], x1
730         st1             {v10.8h}, [x0], x1
731         st1             {v3.8h},  [x9], x1
732         st1             {v11.8h}, [x0], x1
733         st1             {v4.8h},  [x9], x1
734         st1             {v12.8h}, [x0], x1
735         st1             {v5.8h},  [x9], x1
736         st1             {v13.8h}, [x0], x1
737         st1             {v6.8h},  [x9], x1
738         st1             {v14.8h}, [x0], x1
739         st1             {v8.8h},  [x9], x1
740         st1             {v15.8h}, [x0], x1
741         st1             {v9.8h},  [x9], x1
742         st1             {v17.8h}, [x0], x1
743         sub             x0,  x0,  x1, lsl #3
744         add             x0,  x0,  x1
745 
746         ret             x10
747 8:
748         add             x9,  x9,  x1, lsl #2
749         // If we didn't do the flat8out part, the output is left in the
750         // input registers.
751         st1             {v21.8h}, [x9], x1
752         st1             {v24.8h}, [x0], x1
753         st1             {v22.8h}, [x9], x1
754         st1             {v25.8h}, [x0], x1
755         st1             {v23.8h}, [x9], x1
756         st1             {v26.8h}, [x0], x1
757         sub             x0,  x0,  x1, lsl #1
758         sub             x0,  x0,  x1
759         ret             x10
760 7:
761         sub             x9,  x0,  x1, lsl #1
762         st1             {v22.8h}, [x9], x1
763         st1             {v24.8h}, [x0], x1
764         st1             {v23.8h}, [x9], x1
765         st1             {v25.8h}, [x0], x1
766         sub             x0,  x0,  x1, lsl #1
767         ret             x10
768 endfunc
769 
770 bpp_frontends vp9_loop_filter_v_16_8, push=1
771 bpp_frontends_rep vp9_loop_filter_v_16, 16, 8, v, push=1
772 
773 function vp9_loop_filter_h_16_8_16_neon
774         mov             x10, x30
775         sub             x9,  x0,  #16
776         ld1             {v16.8h}, [x9], x1
777         ld1             {v24.8h}, [x0], x1
778         ld1             {v17.8h}, [x9], x1
779         ld1             {v25.8h}, [x0], x1
780         ld1             {v18.8h}, [x9], x1
781         ld1             {v26.8h}, [x0], x1
782         ld1             {v19.8h}, [x9], x1
783         ld1             {v27.8h}, [x0], x1
784         ld1             {v20.8h}, [x9], x1
785         ld1             {v28.8h}, [x0], x1
786         ld1             {v21.8h}, [x9], x1
787         ld1             {v29.8h}, [x0], x1
788         ld1             {v22.8h}, [x9], x1
789         ld1             {v30.8h}, [x0], x1
790         ld1             {v23.8h}, [x9], x1
791         ld1             {v31.8h}, [x0], x1
792         sub             x0,  x0,  x1, lsl #3
793         sub             x9,  x9,  x1, lsl #3
794 
        // The 16x8 pixels read above are in two 8x8 blocks; the left
        // half in v16-v23, and the right half in v24-v31. Do two 8x8 transposes
        // of this, to get one column per register.
798         transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
799         transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v0, v1
800 
801         loop_filter_16
802 
        // Transpose the filtered output back to columns before storing; the
        // outputs are in v2-v6, v8-v15 and v17, while v16 (p7) and v31 (q7)
        // are unchanged.
        transpose_8x8H  v16, v2,  v3,  v4,  v5,  v6,  v8,  v9,  v0, v1
        transpose_8x8H  v10, v11, v12, v13, v14, v15, v17, v31, v0, v1
805 
806         st1             {v16.8h}, [x9], x1
807         st1             {v10.8h}, [x0], x1
808         st1             {v2.8h},  [x9], x1
809         st1             {v11.8h}, [x0], x1
810         st1             {v3.8h},  [x9], x1
811         st1             {v12.8h}, [x0], x1
812         st1             {v4.8h},  [x9], x1
813         st1             {v13.8h}, [x0], x1
814         st1             {v5.8h},  [x9], x1
815         st1             {v14.8h}, [x0], x1
816         st1             {v6.8h},  [x9], x1
817         st1             {v15.8h}, [x0], x1
818         st1             {v8.8h},  [x9], x1
819         st1             {v17.8h}, [x0], x1
820         st1             {v9.8h},  [x9], x1
821         st1             {v31.8h}, [x0], x1
822         sub             x0,  x0,  x1, lsl #3
823 
824         ret             x10
825 8:
826         // The same writeback as in loop_filter_h_8_8
827         sub             x9,  x0,  #8
828         add             x0,  x9,  x1, lsl #2
829         transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
830 
831         st1             {v20.8h}, [x9], x1
832         st1             {v24.8h}, [x0], x1
833         st1             {v21.8h}, [x9], x1
834         st1             {v25.8h}, [x0], x1
835         st1             {v22.8h}, [x9], x1
836         st1             {v26.8h}, [x0], x1
837         st1             {v23.8h}, [x9], x1
838         st1             {v27.8h}, [x0], x1
839         sub             x0,  x0,  x1, lsl #3
840         add             x0,  x0,  #8
841         ret             x10
842 7:
843         // The same writeback as in loop_filter_h_4_8
844         sub             x9,  x0,  #4
845         add             x0,  x9,  x1, lsl #2
846         transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
847         st1             {v22.d}[0], [x9], x1
848         st1             {v22.d}[1], [x0], x1
849         st1             {v23.d}[0], [x9], x1
850         st1             {v23.d}[1], [x0], x1
851         st1             {v24.d}[0], [x9], x1
852         st1             {v24.d}[1], [x0], x1
853         st1             {v25.d}[0], [x9], x1
854         st1             {v25.d}[1], [x0], x1
855         sub             x0,  x0,  x1, lsl #3
856         add             x0,  x0,  #4
857         ret             x10
858 endfunc
859 
860 bpp_frontends vp9_loop_filter_h_16_8, push=1
861 bpp_frontends_rep vp9_loop_filter_h_16, 16, 8, h, push=1
862