1cabdff1aSopenharmony_ci/*
2cabdff1aSopenharmony_ci * Copyright (c) 2017 Google Inc.
3cabdff1aSopenharmony_ci *
4cabdff1aSopenharmony_ci * This file is part of FFmpeg.
5cabdff1aSopenharmony_ci *
6cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or
7cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public
8cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either
9cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version.
10cabdff1aSopenharmony_ci *
11cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful,
12cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of
13cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14cabdff1aSopenharmony_ci * Lesser General Public License for more details.
15cabdff1aSopenharmony_ci *
16cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public
17cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software
18cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19cabdff1aSopenharmony_ci */
20cabdff1aSopenharmony_ci
21cabdff1aSopenharmony_ci#include "libavutil/aarch64/asm.S"
22cabdff1aSopenharmony_ci#include "neon.S"
23cabdff1aSopenharmony_ci
24cabdff1aSopenharmony_ci
25cabdff1aSopenharmony_ci// The input to and output from this macro is in the registers v16-v31,
26cabdff1aSopenharmony_ci// and v0-v7 are used as scratch registers.
27cabdff1aSopenharmony_ci// p7 = v16 .. p3 = v20, p0 = v23, q0 = v24, q3 = v27, q7 = v31
28cabdff1aSopenharmony_ci// Depending on the width of the loop filter, we either use v16-v19
29cabdff1aSopenharmony_ci// and v28-v31 as temp registers, or v8-v15.
//
// Macro parameters:
//   wd          filter width; the instantiations in this file pass 4, 8 or 16.
//               wd >= 8 adds the flat8in path, wd == 16 also adds flat8out.
//   tmp1..tmp8  names of the vector registers to use as temporaries (see
//               above). tmp8 is accepted but not referenced in the body.
//
// General-purpose registers read by the macro:
//   w2 = E, w3 = I, w4 = H   thresholds, broadcast to all eight 16-bit lanes
//   w5   threshold compared against for flat8in/flat8out (wd >= 8 only)
//   w6   left-shift amount used by the saturating clip (sqshl, then sshl by -w6)
//   w7   maximum pixel value, used as the upper clamp bound
//   x10  return address taken when no lane needs filtering at all
//   x13  (wd == 8)  return address taken when no lane needs flat8in
//   x14  (wd == 16) return address taken when neither flat8in nor flat8out applies
//   x15  (wd == 16) return address taken when no lane needs flat8out
//
// Clobbers x11, x12 and the condition flags. If none of the early exits is
// taken, execution falls off the end of the macro into whatever follows the
// expansion.
30cabdff1aSopenharmony_ci.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
31cabdff1aSopenharmony_ci        dup             v0.8h,  w2                   // E
32cabdff1aSopenharmony_ci        dup             v2.8h,  w3                   // I
33cabdff1aSopenharmony_ci        dup             v3.8h,  w4                   // H
34cabdff1aSopenharmony_ci
35cabdff1aSopenharmony_ci        uabd            v4.8h,  v20.8h, v21.8h       // abs(p3 - p2)
36cabdff1aSopenharmony_ci        uabd            v5.8h,  v21.8h, v22.8h       // abs(p2 - p1)
37cabdff1aSopenharmony_ci        uabd            v6.8h,  v22.8h, v23.8h       // abs(p1 - p0)
38cabdff1aSopenharmony_ci        uabd            v7.8h,  v24.8h, v25.8h       // abs(q0 - q1)
39cabdff1aSopenharmony_ci        uabd            \tmp1\().8h,  v25.8h, v26.8h // abs(q1 - q2)
40cabdff1aSopenharmony_ci        uabd            \tmp2\().8h,  v26.8h, v27.8h // abs(q2 - q3)
41cabdff1aSopenharmony_ci        umax            v4.8h,  v4.8h,  v5.8h
42cabdff1aSopenharmony_ci        umax            v5.8h,  v6.8h,  v7.8h
43cabdff1aSopenharmony_ci        umax            \tmp1\().8h,  \tmp1\().8h, \tmp2\().8h
44cabdff1aSopenharmony_ci        uabd            v6.8h,  v23.8h, v24.8h       // abs(p0 - q0)
45cabdff1aSopenharmony_ci        umax            v4.8h,  v4.8h,  v5.8h
46cabdff1aSopenharmony_ci        add             v6.8h,  v6.8h,  v6.8h        // abs(p0 - q0) * 2
47cabdff1aSopenharmony_ci        uabd            v5.8h,  v22.8h, v25.8h       // abs(p1 - q1)
48cabdff1aSopenharmony_ci        umax            v4.8h,  v4.8h,  \tmp1\().8h  // max(abs(p3 - p2), ..., abs(q2 - q3))
49cabdff1aSopenharmony_ci        ushr            v5.8h,  v5.8h,  #1
50cabdff1aSopenharmony_ci        cmhs            v4.8h,  v2.8h,  v4.8h        // max(abs()) <= I
51cabdff1aSopenharmony_ci        add             v6.8h,  v6.8h,  v5.8h        // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
52cabdff1aSopenharmony_ci        cmhs            v6.8h,  v0.8h,  v6.8h
53cabdff1aSopenharmony_ci        and             v4.16b, v4.16b, v6.16b       // fm
54cabdff1aSopenharmony_ci
55cabdff1aSopenharmony_ci        // If no pixels need filtering, just exit as soon as possible
        // Every lane of v4 is either 0 or all-ones (it is the AND of two cmhs
        // results), so the sum of its two 64-bit halves is zero only when
        // every lane is zero.
56cabdff1aSopenharmony_ci        mov             x11, v4.d[0]
57cabdff1aSopenharmony_ci        mov             x12, v4.d[1]
58cabdff1aSopenharmony_ci        adds            x11, x11, x12
59cabdff1aSopenharmony_ci        b.ne            1f
60cabdff1aSopenharmony_ci        ret             x10
61cabdff1aSopenharmony_ci1:
62cabdff1aSopenharmony_ci
63cabdff1aSopenharmony_ci.if \wd >= 8
        // flat8in: max over abs(pN - p0) and abs(qN - q0), N = 1..3, compared
        // against the threshold broadcast from w5.
64cabdff1aSopenharmony_ci        dup             v0.8h,  w5
65cabdff1aSopenharmony_ci
66cabdff1aSopenharmony_ci        uabd            v6.8h,  v20.8h, v23.8h       // abs(p3 - p0)
67cabdff1aSopenharmony_ci        uabd            v2.8h,  v21.8h, v23.8h       // abs(p2 - p0)
68cabdff1aSopenharmony_ci        uabd            v1.8h,  v22.8h, v23.8h       // abs(p1 - p0)
69cabdff1aSopenharmony_ci        uabd            \tmp1\().8h,  v25.8h, v24.8h // abs(q1 - q0)
70cabdff1aSopenharmony_ci        uabd            \tmp2\().8h,  v26.8h, v24.8h // abs(q2 - q0)
71cabdff1aSopenharmony_ci        uabd            \tmp3\().8h,  v27.8h, v24.8h // abs(q3 - q0)
72cabdff1aSopenharmony_ci        umax            v6.8h,  v6.8h,  v2.8h
73cabdff1aSopenharmony_ci        umax            v1.8h,  v1.8h,  \tmp1\().8h
74cabdff1aSopenharmony_ci        umax            \tmp2\().8h,  \tmp2\().8h,  \tmp3\().8h
75cabdff1aSopenharmony_ci.if \wd == 16
        // wd == 16: finish flat8in here and start flat8out (the same test over
        // p7..p4 and q4..q7). v8-v12 are named directly in this branch.
76cabdff1aSopenharmony_ci        uabd            v7.8h,  v16.8h, v23.8h       // abs(p7 - p0)
77cabdff1aSopenharmony_ci        umax            v6.8h,  v6.8h,  v1.8h
78cabdff1aSopenharmony_ci        uabd            v2.8h,  v17.8h, v23.8h       // abs(p6 - p0)
79cabdff1aSopenharmony_ci        umax            v6.8h,  v6.8h,  \tmp2\().8h
80cabdff1aSopenharmony_ci        uabd            v1.8h,  v18.8h, v23.8h       // abs(p5 - p0)
81cabdff1aSopenharmony_ci        cmhs            v6.8h,  v0.8h,  v6.8h        // flat8in
82cabdff1aSopenharmony_ci        uabd            v8.8h,  v19.8h, v23.8h       // abs(p4 - p0)
83cabdff1aSopenharmony_ci        and             v6.16b, v6.16b, v4.16b       // flat8in && fm
84cabdff1aSopenharmony_ci        uabd            v9.8h,  v28.8h, v24.8h       // abs(q4 - q0)
85cabdff1aSopenharmony_ci        bic             v4.16b, v4.16b, v6.16b       // fm && !flat8in
86cabdff1aSopenharmony_ci        uabd            v10.8h, v29.8h, v24.8h       // abs(q5 - q0)
87cabdff1aSopenharmony_ci        uabd            v11.8h, v30.8h, v24.8h       // abs(q6 - q0)
88cabdff1aSopenharmony_ci        uabd            v12.8h, v31.8h, v24.8h       // abs(q7 - q0)
89cabdff1aSopenharmony_ci
90cabdff1aSopenharmony_ci        umax            v7.8h,  v7.8h,  v2.8h
91cabdff1aSopenharmony_ci        umax            v1.8h,  v1.8h,  v8.8h
92cabdff1aSopenharmony_ci        umax            v9.8h,  v9.8h,  v10.8h
93cabdff1aSopenharmony_ci        umax            v11.8h, v11.8h, v12.8h
94cabdff1aSopenharmony_ci        // The rest of the calculation of flat8out is interleaved below
95cabdff1aSopenharmony_ci.else
96cabdff1aSopenharmony_ci        // The rest of the calculation of flat8in is interleaved below
97cabdff1aSopenharmony_ci.endif
98cabdff1aSopenharmony_ci.endif
99cabdff1aSopenharmony_ci
100cabdff1aSopenharmony_ci        // Calculate the normal inner loop filter for 2 or 4 pixels
101cabdff1aSopenharmony_ci        uabd            v5.8h,  v22.8h, v23.8h                  // abs(p1 - p0)
102cabdff1aSopenharmony_ci.if \wd == 16
103cabdff1aSopenharmony_ci        umax            v7.8h,  v7.8h,  v1.8h
104cabdff1aSopenharmony_ci        umax            v9.8h,  v9.8h,  v11.8h
105cabdff1aSopenharmony_ci.elseif \wd == 8
106cabdff1aSopenharmony_ci        umax            v6.8h,  v6.8h,  v1.8h
107cabdff1aSopenharmony_ci.endif
108cabdff1aSopenharmony_ci        uabd            v1.8h,  v25.8h, v24.8h                  // abs(q1 - q0)
109cabdff1aSopenharmony_ci.if \wd == 16
110cabdff1aSopenharmony_ci        umax            v7.8h,  v7.8h,  v9.8h
111cabdff1aSopenharmony_ci.elseif \wd == 8
112cabdff1aSopenharmony_ci        umax            v6.8h,  v6.8h,  \tmp2\().8h
113cabdff1aSopenharmony_ci.endif
114cabdff1aSopenharmony_ci        dup             \tmp2\().8h,  w6                        // left shift for saturation
115cabdff1aSopenharmony_ci        sub             \tmp1\().8h,  v22.8h,  v25.8h           // p1 - q1
116cabdff1aSopenharmony_ci        neg             \tmp6\().8h,  \tmp2\().8h               // negative left shift after saturation
117cabdff1aSopenharmony_ci        umax            v5.8h,  v5.8h,  v1.8h                   // max(abs(p1 - p0), abs(q1 - q0))
118cabdff1aSopenharmony_ci        sub             \tmp3\().8h,  v24.8h,  v23.8h           // q0 - p0
119cabdff1aSopenharmony_ci        movi            \tmp5\().8h,  #3
120cabdff1aSopenharmony_ci.if \wd == 8
121cabdff1aSopenharmony_ci        cmhs            v6.8h,  v0.8h,  v6.8h                   // flat8in
122cabdff1aSopenharmony_ci.endif
123cabdff1aSopenharmony_ci        cmhs            v5.8h,  v3.8h,  v5.8h                   // !hev
124cabdff1aSopenharmony_ci.if \wd == 8
125cabdff1aSopenharmony_ci        and             v6.16b, v6.16b, v4.16b                  // flat8in && fm
126cabdff1aSopenharmony_ci.endif
        // Saturating left shift followed by an arithmetic shift back right
        // (sshl by the negated amount) clips the value to a narrower signed range.
127cabdff1aSopenharmony_ci        sqshl           \tmp1\().8h,  \tmp1\().8h,  \tmp2\().8h
128cabdff1aSopenharmony_ci.if \wd == 16
129cabdff1aSopenharmony_ci        cmhs            v7.8h,  v0.8h,  v7.8h                   // flat8out
130cabdff1aSopenharmony_ci.elseif \wd == 8
131cabdff1aSopenharmony_ci        bic             v4.16b, v4.16b, v6.16b                  // fm && !flat8in
132cabdff1aSopenharmony_ci.endif
133cabdff1aSopenharmony_ci        and             v5.16b,  v5.16b,  v4.16b                // !hev && fm && !flat8in
134cabdff1aSopenharmony_ci.if \wd == 16
135cabdff1aSopenharmony_ci        and             v7.16b, v7.16b, v6.16b                  // flat8out && flat8in && fm
136cabdff1aSopenharmony_ci.endif
137cabdff1aSopenharmony_ci        sshl            \tmp1\().8h,  \tmp1\().8h,  \tmp6\().8h // av_clip_int2p(p1 - q1, BIT_DEPTH - 1)
138cabdff1aSopenharmony_ci
139cabdff1aSopenharmony_ci        mul             \tmp3\().8h,  \tmp3\().8h,  \tmp5\().8h // 3 * (q0 - p0)
140cabdff1aSopenharmony_ci        bic             \tmp1\().16b, \tmp1\().16b, v5.16b      // if (!hev) av_clip_int2p(p1 - q1) = 0
141cabdff1aSopenharmony_ci        movi            v2.8h,  #4
142cabdff1aSopenharmony_ci        add             \tmp3\().8h,  \tmp3\().8h,  \tmp1\().8h // 3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)]
143cabdff1aSopenharmony_ci        movi            v3.8h,  #3
144cabdff1aSopenharmony_ci        sqshl           \tmp1\().8h,  \tmp3\().8h,  \tmp2\().8h
145cabdff1aSopenharmony_ci        movi            \tmp5\().8h,  #0
146cabdff1aSopenharmony_ci        sshl            \tmp1\().8h,  \tmp1\().8h,  \tmp6\().8h // av_clip_int2p(3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)], BIT_DEPTH - 1) = f
147cabdff1aSopenharmony_ci        dup             \tmp6\().8h,  w7                        // max pixel value
148cabdff1aSopenharmony_ci.if \wd == 16
149cabdff1aSopenharmony_ci        bic             v6.16b, v6.16b, v7.16b                  // fm && flat8in && !flat8out
150cabdff1aSopenharmony_ci.endif
151cabdff1aSopenharmony_ci
152cabdff1aSopenharmony_ci        ushr            \tmp2\().8h,  \tmp6\().8h,  #1          // (1 << (BIT_DEPTH - 1)) - 1
153cabdff1aSopenharmony_ci
154cabdff1aSopenharmony_ci        add             \tmp3\().8h,  \tmp1\().8h,  v2.8h       // f + 4
155cabdff1aSopenharmony_ci        add             \tmp4\().8h,  \tmp1\().8h,  v3.8h       // f + 3
156cabdff1aSopenharmony_ci        smin            \tmp3\().8h,  \tmp3\().8h,  \tmp2\().8h // FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1)
157cabdff1aSopenharmony_ci        smin            \tmp4\().8h,  \tmp4\().8h,  \tmp2\().8h // FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1)
158cabdff1aSopenharmony_ci        sshr            \tmp3\().8h,  \tmp3\().8h,  #3          // f1
159cabdff1aSopenharmony_ci        sshr            \tmp4\().8h,  \tmp4\().8h,  #3          // f2
160cabdff1aSopenharmony_ci
        // New p0/q0, clamped to [0, max pixel value]: tmp6 holds the maximum
        // (from w7) and tmp5 holds zero at this point.
161cabdff1aSopenharmony_ci        add             v0.8h,   v23.8h,  \tmp4\().8h           // p0 + f2
162cabdff1aSopenharmony_ci        sub             v2.8h,   v24.8h,  \tmp3\().8h           // q0 - f1
163cabdff1aSopenharmony_ci        smin            v0.8h,   v0.8h,   \tmp6\().8h
164cabdff1aSopenharmony_ci        smin            v2.8h,   v2.8h,   \tmp6\().8h
165cabdff1aSopenharmony_ci        srshr           \tmp3\().8h, \tmp3\().8h, #1            // f = (f1 + 1) >> 1
166cabdff1aSopenharmony_ci        smax            v0.8h,   v0.8h,   \tmp5\().8h           // out p0
167cabdff1aSopenharmony_ci        smax            v2.8h,   v2.8h,   \tmp5\().8h           // out q0
168cabdff1aSopenharmony_ci        bit             v23.16b, v0.16b,  v4.16b                // if (fm && !flat8in)
169cabdff1aSopenharmony_ci        bit             v24.16b, v2.16b,  v4.16b
170cabdff1aSopenharmony_ci
        // New p1/q1. For wd >= 8 the "any lane set in the flat8in mask (v6)?"
        // test is interleaved here; its flags are consumed by the branch below.
171cabdff1aSopenharmony_ci        add             v0.8h,  v22.8h,  \tmp3\().8h            // p1 + f
172cabdff1aSopenharmony_ci        sub             v2.8h,  v25.8h,  \tmp3\().8h            // q1 - f
173cabdff1aSopenharmony_ci.if \wd >= 8
174cabdff1aSopenharmony_ci        mov             x11, v6.d[0]
175cabdff1aSopenharmony_ci.endif
176cabdff1aSopenharmony_ci        smin            v0.8h,  v0.8h,  \tmp6\().8h
177cabdff1aSopenharmony_ci        smin            v2.8h,  v2.8h,  \tmp6\().8h
178cabdff1aSopenharmony_ci.if \wd >= 8
179cabdff1aSopenharmony_ci        mov             x12, v6.d[1]
180cabdff1aSopenharmony_ci.endif
181cabdff1aSopenharmony_ci        smax            v0.8h,  v0.8h,  \tmp5\().8h             // out p1
182cabdff1aSopenharmony_ci        smax            v2.8h,  v2.8h,  \tmp5\().8h             // out q1
183cabdff1aSopenharmony_ci.if \wd >= 8
184cabdff1aSopenharmony_ci        adds            x11, x11, x12
185cabdff1aSopenharmony_ci.endif
186cabdff1aSopenharmony_ci        bit             v22.16b, v0.16b,  v5.16b                // if (!hev && fm && !flat8in)
187cabdff1aSopenharmony_ci        bit             v25.16b, v2.16b,  v5.16b
188cabdff1aSopenharmony_ci
189cabdff1aSopenharmony_ci        // If no pixels need flat8in, jump to flat8out
190cabdff1aSopenharmony_ci        // (or to a writeout of the inner 4 pixels, for wd=8)
191cabdff1aSopenharmony_ci.if \wd >= 8
192cabdff1aSopenharmony_ci.if \wd == 16
193cabdff1aSopenharmony_ci        b.eq            6f
194cabdff1aSopenharmony_ci.else
195cabdff1aSopenharmony_ci        b.ne            1f
196cabdff1aSopenharmony_ci        ret             x13
197cabdff1aSopenharmony_ci1:
198cabdff1aSopenharmony_ci.endif
199cabdff1aSopenharmony_ci
200cabdff1aSopenharmony_ci        // flat8in
        // v0 carries a running sum. The first output is
        //   out p2 = (3*p3 + 2*p2 + p1 + p0 + q0 + 4) >> 3
        // and each later output adds a precomputed difference (two taps in,
        // two taps out) to v0 before the rounding shift.
201cabdff1aSopenharmony_ci        add             \tmp1\().8h, v20.8h, v21.8h
202cabdff1aSopenharmony_ci        add             \tmp3\().8h, v22.8h, v25.8h
203cabdff1aSopenharmony_ci        add             \tmp5\().8h, v20.8h, v22.8h
204cabdff1aSopenharmony_ci        add             \tmp7\().8h, v23.8h, v26.8h
205cabdff1aSopenharmony_ci        add             v0.8h,  \tmp1\().8h, \tmp1\().8h
206cabdff1aSopenharmony_ci        add             v0.8h,  v0.8h,  v23.8h
207cabdff1aSopenharmony_ci        add             v0.8h,  v0.8h,  v24.8h
208cabdff1aSopenharmony_ci        add             v0.8h,  v0.8h,  \tmp5\().8h
209cabdff1aSopenharmony_ci        sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
210cabdff1aSopenharmony_ci        sub             \tmp7\().8h, \tmp7\().8h, \tmp5\().8h
211cabdff1aSopenharmony_ci        urshr           v2.8h,  v0.8h,  #3                      // out p2
212cabdff1aSopenharmony_ci
213cabdff1aSopenharmony_ci        add             v0.8h,  v0.8h,  \tmp3\().8h
214cabdff1aSopenharmony_ci        add             \tmp1\().8h, v20.8h,  v23.8h
215cabdff1aSopenharmony_ci        add             \tmp3\().8h, v24.8h,  v27.8h
216cabdff1aSopenharmony_ci        urshr           v3.8h,  v0.8h,  #3                      // out p1
217cabdff1aSopenharmony_ci
218cabdff1aSopenharmony_ci        add             v0.8h,  v0.8h,  \tmp7\().8h
219cabdff1aSopenharmony_ci        sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
220cabdff1aSopenharmony_ci        add             \tmp5\().8h, v21.8h,  v24.8h
221cabdff1aSopenharmony_ci        add             \tmp7\().8h, v25.8h,  v27.8h
222cabdff1aSopenharmony_ci        urshr           v4.8h,  v0.8h,  #3                      // out p0
223cabdff1aSopenharmony_ci
224cabdff1aSopenharmony_ci        add             v0.8h,  v0.8h,  \tmp3\().8h
225cabdff1aSopenharmony_ci        sub             \tmp7\().8h, \tmp7\().8h, \tmp5\().8h
226cabdff1aSopenharmony_ci        add             \tmp1\().8h, v22.8h,  v25.8h
227cabdff1aSopenharmony_ci        add             \tmp3\().8h, v26.8h,  v27.8h
228cabdff1aSopenharmony_ci        urshr           v5.8h,  v0.8h,  #3                      // out q0
229cabdff1aSopenharmony_ci
230cabdff1aSopenharmony_ci        add             v0.8h,  v0.8h,  \tmp7\().8h
231cabdff1aSopenharmony_ci        sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
232cabdff1aSopenharmony_ci        urshr           \tmp5\().8h, v0.8h,  #3                 // out q1
233cabdff1aSopenharmony_ci
234cabdff1aSopenharmony_ci        add             v0.8h,  v0.8h,  \tmp3\().8h
235cabdff1aSopenharmony_ci        // The output here is written back into the input registers. This doesn't
236cabdff1aSopenharmony_ci        // matter for the flat8part below, since we only update those pixels
237cabdff1aSopenharmony_ci        // which won't be touched below.
238cabdff1aSopenharmony_ci        bit             v21.16b, v2.16b,  v6.16b
239cabdff1aSopenharmony_ci        bit             v22.16b, v3.16b,  v6.16b
240cabdff1aSopenharmony_ci        bit             v23.16b, v4.16b,  v6.16b
241cabdff1aSopenharmony_ci        urshr           \tmp6\().8h,  v0.8h,  #3                // out q2
242cabdff1aSopenharmony_ci        bit             v24.16b, v5.16b,  v6.16b
243cabdff1aSopenharmony_ci        bit             v25.16b, \tmp5\().16b,  v6.16b
244cabdff1aSopenharmony_ci        bit             v26.16b, \tmp6\().16b,  v6.16b
245cabdff1aSopenharmony_ci.endif
246cabdff1aSopenharmony_ci.if \wd == 16
247cabdff1aSopenharmony_ci6:
        // v6 = fm && flat8in && !flat8out, v7 = flat8out && flat8in && fm.
248cabdff1aSopenharmony_ci        orr             v2.16b,  v6.16b,  v7.16b
249cabdff1aSopenharmony_ci        mov             x11, v2.d[0]
250cabdff1aSopenharmony_ci        mov             x12, v2.d[1]
251cabdff1aSopenharmony_ci        adds            x11, x11, x12
252cabdff1aSopenharmony_ci        b.ne            1f
253cabdff1aSopenharmony_ci        // If no pixels needed flat8in nor flat8out, jump to a
254cabdff1aSopenharmony_ci        // writeout of the inner 4 pixels
255cabdff1aSopenharmony_ci        ret             x14
256cabdff1aSopenharmony_ci1:
257cabdff1aSopenharmony_ci
258cabdff1aSopenharmony_ci        mov             x11, v7.d[0]
259cabdff1aSopenharmony_ci        mov             x12, v7.d[1]
260cabdff1aSopenharmony_ci        adds            x11, x11, x12
261cabdff1aSopenharmony_ci        b.ne            1f
262cabdff1aSopenharmony_ci        // If no pixels need flat8out, jump to a writeout of the inner 6 pixels
263cabdff1aSopenharmony_ci        ret             x15
264cabdff1aSopenharmony_ci
265cabdff1aSopenharmony_ci1:
266cabdff1aSopenharmony_ci        // flat8out
267cabdff1aSopenharmony_ci        // This writes all outputs into v2-v17 (skipping v7 and v16).
268cabdff1aSopenharmony_ci        // If this part is skipped, the output is read from v21-v26 (which is the input
269cabdff1aSopenharmony_ci        // to this section).
        // v0 is again a running sum, starting at
        //   7*p7 + 2*p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0
        // and rounded with urshr #4 for each output. After every urshr, bif
        // puts the unfiltered pixel back in lanes where flat8out (v7) is not
        // set. v8, v10, v12 and v14 hold the per-step differences; v18, v20 and
        // v22 are reused as scratch once their input values are no longer read.
270cabdff1aSopenharmony_ci        shl             v0.8h,   v16.8h,  #3     // 8 * v16
271cabdff1aSopenharmony_ci        sub             v0.8h,   v0.8h,   v16.8h // 7 * v16
272cabdff1aSopenharmony_ci        add             v0.8h,   v0.8h,   v17.8h
273cabdff1aSopenharmony_ci        add             v8.8h,   v17.8h,  v18.8h
274cabdff1aSopenharmony_ci        add             v10.8h,  v19.8h,  v20.8h
275cabdff1aSopenharmony_ci        add             v0.8h,   v0.8h,   v8.8h
276cabdff1aSopenharmony_ci        add             v8.8h,   v16.8h,  v17.8h
277cabdff1aSopenharmony_ci        add             v12.8h,  v21.8h,  v22.8h
278cabdff1aSopenharmony_ci        add             v0.8h,   v0.8h,   v10.8h
279cabdff1aSopenharmony_ci        add             v10.8h,  v18.8h,  v25.8h
280cabdff1aSopenharmony_ci        add             v14.8h,  v23.8h,  v24.8h
281cabdff1aSopenharmony_ci        sub             v10.8h,  v10.8h,  v8.8h
282cabdff1aSopenharmony_ci        add             v0.8h,   v0.8h,   v12.8h
283cabdff1aSopenharmony_ci        add             v0.8h,   v0.8h,   v14.8h
284cabdff1aSopenharmony_ci        add             v12.8h,  v16.8h,  v18.8h
285cabdff1aSopenharmony_ci        add             v14.8h,  v19.8h,  v26.8h
286cabdff1aSopenharmony_ci        urshr           v2.8h,   v0.8h,   #4     // out p6
287cabdff1aSopenharmony_ci
288cabdff1aSopenharmony_ci        add             v0.8h,   v0.8h,   v10.8h
289cabdff1aSopenharmony_ci        add             v8.8h,   v16.8h,  v19.8h
290cabdff1aSopenharmony_ci        add             v10.8h,  v20.8h,  v27.8h
291cabdff1aSopenharmony_ci        sub             v14.8h,  v14.8h,  v12.8h
292cabdff1aSopenharmony_ci        bif             v2.16b,  v17.16b, v7.16b
293cabdff1aSopenharmony_ci        urshr           v3.8h ,  v0.8h,   #4     // out p5
294cabdff1aSopenharmony_ci
295cabdff1aSopenharmony_ci        add             v0.8h,   v0.8h,   v14.8h
296cabdff1aSopenharmony_ci        add             v12.8h,  v16.8h,  v20.8h
297cabdff1aSopenharmony_ci        add             v14.8h,  v21.8h,  v28.8h
298cabdff1aSopenharmony_ci        sub             v10.8h,  v10.8h,  v8.8h
299cabdff1aSopenharmony_ci        bif             v3.16b,  v18.16b, v7.16b
300cabdff1aSopenharmony_ci        urshr           v4.8h,   v0.8h,   #4     // out p4
301cabdff1aSopenharmony_ci
302cabdff1aSopenharmony_ci        add             v0.8h,   v0.8h,   v10.8h
303cabdff1aSopenharmony_ci        add             v8.8h,   v16.8h,  v21.8h
304cabdff1aSopenharmony_ci        add             v10.8h,  v22.8h,  v29.8h
305cabdff1aSopenharmony_ci        sub             v14.8h,  v14.8h,  v12.8h
306cabdff1aSopenharmony_ci        bif             v4.16b,  v19.16b, v7.16b
307cabdff1aSopenharmony_ci        urshr           v5.8h,   v0.8h,   #4     // out p3
308cabdff1aSopenharmony_ci
309cabdff1aSopenharmony_ci        add             v0.8h,   v0.8h,   v14.8h
310cabdff1aSopenharmony_ci        add             v12.8h,  v16.8h,  v22.8h
311cabdff1aSopenharmony_ci        add             v14.8h,  v23.8h,  v30.8h
312cabdff1aSopenharmony_ci        sub             v10.8h,  v10.8h,  v8.8h
313cabdff1aSopenharmony_ci        bif             v5.16b,  v20.16b, v7.16b
314cabdff1aSopenharmony_ci        urshr           v6.8h,   v0.8h,   #4     // out p2
315cabdff1aSopenharmony_ci
316cabdff1aSopenharmony_ci        add             v0.8h,   v0.8h,   v10.8h
317cabdff1aSopenharmony_ci        add             v10.8h,  v16.8h,  v23.8h
318cabdff1aSopenharmony_ci        sub             v14.8h,  v14.8h,  v12.8h
319cabdff1aSopenharmony_ci        add             v12.8h,  v24.8h,  v31.8h
320cabdff1aSopenharmony_ci        bif             v6.16b,  v21.16b, v7.16b
321cabdff1aSopenharmony_ci        urshr           v8.8h,   v0.8h,   #4     // out p1
322cabdff1aSopenharmony_ci
323cabdff1aSopenharmony_ci        add             v0.8h,   v0.8h,   v14.8h
324cabdff1aSopenharmony_ci        sub             v10.8h,  v12.8h,  v10.8h
325cabdff1aSopenharmony_ci        add             v12.8h,  v17.8h,  v24.8h
326cabdff1aSopenharmony_ci        add             v14.8h,  v25.8h,  v31.8h
327cabdff1aSopenharmony_ci        bif             v8.16b,  v22.16b, v7.16b
328cabdff1aSopenharmony_ci        urshr           v9.8h,   v0.8h,   #4     // out p0
329cabdff1aSopenharmony_ci
330cabdff1aSopenharmony_ci        add             v0.8h,   v0.8h,   v10.8h
331cabdff1aSopenharmony_ci        sub             v14.8h,  v14.8h,  v12.8h
332cabdff1aSopenharmony_ci        add             v12.8h,  v26.8h,  v31.8h
333cabdff1aSopenharmony_ci        bif             v9.16b,  v23.16b, v7.16b
334cabdff1aSopenharmony_ci        urshr           v10.8h,  v0.8h,   #4     // out q0
335cabdff1aSopenharmony_ci
336cabdff1aSopenharmony_ci        add             v0.8h,   v0.8h,   v14.8h
337cabdff1aSopenharmony_ci        add             v14.8h,  v18.8h,  v25.8h
338cabdff1aSopenharmony_ci        add             v18.8h,  v19.8h,  v26.8h
339cabdff1aSopenharmony_ci        sub             v12.8h,  v12.8h,  v14.8h
340cabdff1aSopenharmony_ci        add             v14.8h,  v27.8h,  v31.8h
341cabdff1aSopenharmony_ci        bif             v10.16b, v24.16b, v7.16b
342cabdff1aSopenharmony_ci        urshr           v11.8h,  v0.8h,   #4     // out q1
343cabdff1aSopenharmony_ci
344cabdff1aSopenharmony_ci        add             v0.8h,   v0.8h,   v12.8h
345cabdff1aSopenharmony_ci        add             v12.8h,  v20.8h,  v27.8h
346cabdff1aSopenharmony_ci        sub             v14.8h,  v14.8h,  v18.8h
347cabdff1aSopenharmony_ci        add             v18.8h,  v28.8h,  v31.8h
348cabdff1aSopenharmony_ci        bif             v11.16b, v25.16b, v7.16b
349cabdff1aSopenharmony_ci        sub             v18.8h,  v18.8h,  v12.8h
350cabdff1aSopenharmony_ci        urshr           v12.8h,  v0.8h,   #4     // out q2
351cabdff1aSopenharmony_ci
352cabdff1aSopenharmony_ci        add             v0.8h,   v0.8h,   v14.8h
353cabdff1aSopenharmony_ci        add             v14.8h,  v21.8h,  v28.8h
354cabdff1aSopenharmony_ci        add             v20.8h,  v29.8h,  v31.8h
355cabdff1aSopenharmony_ci        bif             v12.16b, v26.16b, v7.16b
356cabdff1aSopenharmony_ci        urshr           v13.8h,  v0.8h,   #4     // out q3
357cabdff1aSopenharmony_ci
358cabdff1aSopenharmony_ci        add             v0.8h,   v0.8h,   v18.8h
359cabdff1aSopenharmony_ci        sub             v20.8h,  v20.8h,  v14.8h
360cabdff1aSopenharmony_ci        add             v18.8h,  v22.8h,  v29.8h
361cabdff1aSopenharmony_ci        add             v22.8h,  v30.8h,  v31.8h
362cabdff1aSopenharmony_ci        bif             v13.16b, v27.16b, v7.16b
363cabdff1aSopenharmony_ci        urshr           v14.8h,  v0.8h,   #4     // out q4
364cabdff1aSopenharmony_ci
365cabdff1aSopenharmony_ci        add             v0.8h,   v0.8h,   v20.8h
366cabdff1aSopenharmony_ci        sub             v22.8h,  v22.8h,  v18.8h
367cabdff1aSopenharmony_ci        bif             v14.16b, v28.16b, v7.16b
368cabdff1aSopenharmony_ci        urshr           v15.8h,  v0.8h,   #4     // out q5
369cabdff1aSopenharmony_ci
370cabdff1aSopenharmony_ci        add             v0.8h,   v0.8h,   v22.8h
371cabdff1aSopenharmony_ci        bif             v15.16b, v29.16b, v7.16b
372cabdff1aSopenharmony_ci        urshr           v17.8h,  v0.8h,   #4     // out q6
373cabdff1aSopenharmony_ci        bif             v17.16b, v30.16b, v7.16b
374cabdff1aSopenharmony_ci.endif
375cabdff1aSopenharmony_ci.endm
376cabdff1aSopenharmony_ci
377cabdff1aSopenharmony_ci// For wd <= 8, we use v16-v19 and v28-v31 for temp registers,
378cabdff1aSopenharmony_ci// while we need those for inputs/outputs in wd=16 and use v8-v15
379cabdff1aSopenharmony_ci// for temp registers there instead.
// Out-of-line instance of the loop_filter macro for wd=4, with v16-v19 and
// v28-v31 as temporaries. Reached with `bl` from the loop_filter_4 macro.
// The trailing `ret` returns through x30 once the macro body has run to the
// end; the macro's "nothing to filter" early exit returns through x10 instead.
380cabdff1aSopenharmony_cifunction vp9_loop_filter_4
381cabdff1aSopenharmony_ci        loop_filter     4,  v16, v17, v18, v19, v28, v29, v30, v31
382cabdff1aSopenharmony_ci        ret
383cabdff1aSopenharmony_ciendfunc
384cabdff1aSopenharmony_ci
// Out-of-line instance of the loop_filter macro for wd=8, with v16-v19 and
// v28-v31 as temporaries. Reached with `bl` from the loop_filter_8 macro.
// Possible exits: `ret x10` (nothing to filter), `ret x13` (no lane needs
// flat8in), or the trailing `ret` through x30 after the flat8in path.
385cabdff1aSopenharmony_cifunction vp9_loop_filter_8
386cabdff1aSopenharmony_ci        loop_filter     8,  v16, v17, v18, v19, v28, v29, v30, v31
387cabdff1aSopenharmony_ci        ret
388cabdff1aSopenharmony_ciendfunc
389cabdff1aSopenharmony_ci
// Out-of-line instance of the loop_filter macro for wd=16. Here v16-v31 all
// carry pixels, so v8-v15 are passed as temporaries. Reached with `bl` from
// the loop_filter_16 macro. Possible exits: `ret x10` (nothing to filter),
// `ret x14` (neither flat8in nor flat8out), `ret x15` (no flat8out), or the
// trailing `ret` through x30 after the flat8out path.
390cabdff1aSopenharmony_cifunction vp9_loop_filter_16
391cabdff1aSopenharmony_ci        loop_filter     16, v8,  v9,  v10, v11, v12, v13, v14, v15
392cabdff1aSopenharmony_ci        ret
393cabdff1aSopenharmony_ciendfunc
394cabdff1aSopenharmony_ci
// Call-site helper: runs the wd=4 filter via vp9_loop_filter_4.
// `bl` overwrites x30. vp9_loop_filter_4 may also leave through `ret x10`;
// x10 is not set here, so the invoking code must have loaded it beforehand.
395cabdff1aSopenharmony_ci.macro loop_filter_4
396cabdff1aSopenharmony_ci        bl              vp9_loop_filter_4
397cabdff1aSopenharmony_ci.endm
398cabdff1aSopenharmony_ci
// Call-site helper: runs the wd=8 filter via vp9_loop_filter_8.
// The invoking code must define a local label `6:` after this macro; its
// address is passed in x13 and is where vp9_loop_filter_8 returns when no
// lane needs the flat8in filter. `bl` overwrites x30. x10 (the "nothing to
// filter" return address) is not set here.
399cabdff1aSopenharmony_ci.macro loop_filter_8
400cabdff1aSopenharmony_ci        // calculate alternative 'return' targets
401cabdff1aSopenharmony_ci        adr             x13, 6f
402cabdff1aSopenharmony_ci        bl              vp9_loop_filter_8
403cabdff1aSopenharmony_ci.endm
404cabdff1aSopenharmony_ci
// Call-site helper: runs the wd=16 filter via vp9_loop_filter_16.
// The invoking code must define local labels `7:` and `8:` after this macro:
//   x14 = 7f  return target when neither flat8in nor flat8out applies
//   x15 = 8f  return target when no lane needs flat8out
// `bl` overwrites x30. x10 (the "nothing to filter" return address) is not
// set here.
405cabdff1aSopenharmony_ci.macro loop_filter_16
406cabdff1aSopenharmony_ci        // calculate alternative 'return' targets
407cabdff1aSopenharmony_ci        adr             x14, 7f
408cabdff1aSopenharmony_ci        adr             x15, 8f
409cabdff1aSopenharmony_ci        bl              vp9_loop_filter_16
410cabdff1aSopenharmony_ci.endm
411cabdff1aSopenharmony_ci
412cabdff1aSopenharmony_ci
413cabdff1aSopenharmony_ci// The public functions in this file have got the following signature:
414cabdff1aSopenharmony_ci// void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);
415cabdff1aSopenharmony_ci
// Emits the exported entry point ff_<func>_<bpp>_neon, which prepares the
// bit-depth dependent constants and then runs the shared <func>_16_neon.
//
// Macro parameters:
//   func  base name; the target called is \func\()_16_neon
//   bpp   bit depth (bpp_frontends instantiates 10 and 12)
//   push  non-zero: save/restore d8-d15 around the call and return via x16;
//         zero: tail-call the target with a plain branch
//
// Registers set up for the callee:
//   w2, w3, w4  incoming thresholds shifted left by (bpp - 8)
//   x5 = 1 << (bpp - 8)
//   x6 = 16 - bpp
//   x7 = (1 << bpp) - 1
416cabdff1aSopenharmony_ci.macro bpp_frontend func, bpp, push
417cabdff1aSopenharmony_cifunction ff_\func\()_\bpp\()_neon, export=1
418cabdff1aSopenharmony_ci.if \push
        // x30 is about to be overwritten by `bl`; keep the return address in
        // x16. NOTE(review): relies on the callee leaving x16 intact - confirm.
419cabdff1aSopenharmony_ci        mov             x16, x30
        // d8-d15 pushed as four 16-byte pairs.
420cabdff1aSopenharmony_ci        stp             d14, d15, [sp, #-0x10]!
421cabdff1aSopenharmony_ci        stp             d12, d13, [sp, #-0x10]!
422cabdff1aSopenharmony_ci        stp             d10, d11, [sp, #-0x10]!
423cabdff1aSopenharmony_ci        stp             d8,  d9,  [sp, #-0x10]!
424cabdff1aSopenharmony_ci.endif
425cabdff1aSopenharmony_ci        lsl             w2,  w2,  #\bpp - 8
426cabdff1aSopenharmony_ci        lsl             w3,  w3,  #\bpp - 8
427cabdff1aSopenharmony_ci        lsl             w4,  w4,  #\bpp - 8
428cabdff1aSopenharmony_ci        mov             x5,  #1 << (\bpp - 8)
429cabdff1aSopenharmony_ci        mov             x6,  #16 - \bpp
430cabdff1aSopenharmony_ci        mov             x7,  #((1 << \bpp) - 1)
431cabdff1aSopenharmony_ci.if \push
432cabdff1aSopenharmony_ci        bl              \func\()_16_neon
        // Pop in the reverse order of the pushes above.
433cabdff1aSopenharmony_ci        ldp             d8,  d9,  [sp], 0x10
434cabdff1aSopenharmony_ci        ldp             d10, d11, [sp], 0x10
435cabdff1aSopenharmony_ci        ldp             d12, d13, [sp], 0x10
436cabdff1aSopenharmony_ci        ldp             d14, d15, [sp], 0x10
437cabdff1aSopenharmony_ci        ret             x16
438cabdff1aSopenharmony_ci.else
        // Nothing to restore afterwards: branch without linking, so the
        // callee's own return goes straight back to our caller.
439cabdff1aSopenharmony_ci        b               \func\()_16_neon
440cabdff1aSopenharmony_ci.endif
441cabdff1aSopenharmony_ciendfunc
442cabdff1aSopenharmony_ci.endm
443cabdff1aSopenharmony_ci
// Instantiates bpp_frontend for both supported bit depths, producing
// ff_<func>_10_neon and ff_<func>_12_neon. `push` defaults to 0 and is
// forwarded unchanged.
444cabdff1aSopenharmony_ci.macro bpp_frontends func, push=0
445cabdff1aSopenharmony_ci        bpp_frontend    \func, 10, \push
446cabdff1aSopenharmony_ci        bpp_frontend    \func, 12, \push
447cabdff1aSopenharmony_ci.endm
448cabdff1aSopenharmony_ci
// Emits the exported entry point ff_<func>_<suffix>_<bpp>_neon, which runs
// <func>_<int_suffix>_16_neon twice: once at x0 and once at an advanced x0.
//
// Macro parameters:
//   func        base name of the exported and of the internal function
//   suffix      suffix used in the exported symbol name
//   int_suffix  suffix used in the internal function name
//   dir         `h`: second call at x0 + 8 * x1; anything else: x0 + 16
//   bpp         bit depth (bpp_frontends_rep instantiates 10 and 12)
//   push        non-zero: save/restore d8-d15 around the two calls
//
// The constants in w2-w4 and x5-x7 are computed once, before the first call.
// NOTE(review): the second call therefore relies on the internal function
// preserving w2-w7, x1 and x16 - confirm against its definition.
449cabdff1aSopenharmony_ci.macro bpp_frontend_rep func, suffix, int_suffix, dir, bpp, push
450cabdff1aSopenharmony_cifunction ff_\func\()_\suffix\()_\bpp\()_neon, export=1
        // Both `bl`s below overwrite x30; keep the return address in x16.
451cabdff1aSopenharmony_ci        mov             x16, x30
452cabdff1aSopenharmony_ci.if \push
453cabdff1aSopenharmony_ci        stp             d14, d15, [sp, #-0x10]!
454cabdff1aSopenharmony_ci        stp             d12, d13, [sp, #-0x10]!
455cabdff1aSopenharmony_ci        stp             d10, d11, [sp, #-0x10]!
456cabdff1aSopenharmony_ci        stp             d8,  d9,  [sp, #-0x10]!
457cabdff1aSopenharmony_ci.endif
458cabdff1aSopenharmony_ci        lsl             w2,  w2,  #\bpp - 8
459cabdff1aSopenharmony_ci        lsl             w3,  w3,  #\bpp - 8
460cabdff1aSopenharmony_ci        lsl             w4,  w4,  #\bpp - 8
461cabdff1aSopenharmony_ci        mov             x5,  #1 << (\bpp - 8)
462cabdff1aSopenharmony_ci        mov             x6,  #16 - \bpp
463cabdff1aSopenharmony_ci        mov             x7,  #((1 << \bpp) - 1)
464cabdff1aSopenharmony_ci        bl              \func\()_\int_suffix\()_16_neon
        // Advance x0 to where the second call should operate.
465cabdff1aSopenharmony_ci.ifc \dir,h
466cabdff1aSopenharmony_ci        add             x0,  x0,  x1, lsl #3
467cabdff1aSopenharmony_ci.else
468cabdff1aSopenharmony_ci        add             x0,  x0,  #16
469cabdff1aSopenharmony_ci.endif
470cabdff1aSopenharmony_ci        bl              \func\()_\int_suffix\()_16_neon
471cabdff1aSopenharmony_ci.if \push
        // Pop in the reverse order of the pushes above.
472cabdff1aSopenharmony_ci        ldp             d8,  d9,  [sp], 0x10
473cabdff1aSopenharmony_ci        ldp             d10, d11, [sp], 0x10
474cabdff1aSopenharmony_ci        ldp             d12, d13, [sp], 0x10
475cabdff1aSopenharmony_ci        ldp             d14, d15, [sp], 0x10
476cabdff1aSopenharmony_ci.endif
477cabdff1aSopenharmony_ci        ret             x16
478cabdff1aSopenharmony_ciendfunc
479cabdff1aSopenharmony_ci.endm
480cabdff1aSopenharmony_ci
// Instantiate bpp_frontend_rep for 10 and 12 bits per pixel.
// func, suffix, int_suffix and dir are forwarded unchanged; push defaults
// to 0, which generates functions without the d8-d15 save/restore.
.macro bpp_frontends_rep func, suffix, int_suffix, dir, push=0
        bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 10, \push
        bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 12, \push
.endm
485cabdff1aSopenharmony_ci
// Emit ff_vp9_loop_filter_DIR_WD1WD2_16_BPP_neon: two 8 pixel wide filters,
// of width wd1 and wd2 respectively, applied to adjacent blocks, each with
// its own set of thresholds.
//
// Macro arguments:
//   wd1, wd2 - filter widths of the first and second block; they select the
//              callees vp9_loop_filter_DIR_WD1_8_16_neon and
//              vp9_loop_filter_DIR_WD2_8_16_neon
//   dir      - h: second block is 8 rows below the first;
//              anything else: second block is 16 bytes to the right
//   bpp      - bits per pixel (instantiated with 10 and 12)
//
// Registers of the generated function:
//   x0 = pixel pointer, x1 = stride in bytes,
//   w2, w3, w4 = packed thresholds: bits 0-7 belong to the first block,
//                the bits from bit 8 upwards to the second block.
//
// w8, w14, w15, x5-x7 and x16 stay live across the first call, so the
// callees must not modify them, and must hand back x0 at its entry value.
// No d8-d15 save/restore is emitted here.
function ff_vp9_loop_filter_\dir\()_\wd1\()\wd2\()_16_\bpp\()_neon, export=1
        // x30 is overwritten by the bl instructions below.
        mov             x16, x30
        // Park the thresholds of the second block.
        lsr             w8,  w2,  #8
        lsr             w14, w3,  #8
        lsr             w15, w4,  #8
        // Isolate the thresholds of the first block ...
        and             w2,  w2,  #0xff
        and             w3,  w3,  #0xff
        and             w4,  w4,  #0xff
        // ... and scale them up to the bpp range.
        lsl             w2,  w2,  #\bpp - 8
        lsl             w3,  w3,  #\bpp - 8
        lsl             w4,  w4,  #\bpp - 8
        // Bit depth dependent constants, shared by both calls.
        mov             x5,  #1 << (\bpp - 8)
        mov             x6,  #16 - \bpp
        mov             x7,  #((1 << \bpp) - 1)
        bl              vp9_loop_filter_\dir\()_\wd1\()_8_16_neon
.ifc \dir,h
        add             x0,  x0,  x1, lsl #3
.else
        add             x0,  x0,  #16
.endif
        // Thresholds of the second block, scaled the same way.
        lsl             w2,  w8,  #\bpp - 8
        lsl             w3,  w14, #\bpp - 8
        lsl             w4,  w15, #\bpp - 8
        bl              vp9_loop_filter_\dir\()_\wd2\()_8_16_neon
        ret             x16
endfunc
.endm
514cabdff1aSopenharmony_ci
// Instantiate bpp_frontend_mix2 for the width pair wd1/wd2 in both
// directions (v, h) and both bit depths (10, 12), i.e. four exported
// functions per invocation.
.macro bpp_frontends_mix2 wd1, wd2
        bpp_frontend_mix2 \wd1, \wd2, v, 10
        bpp_frontend_mix2 \wd1, \wd2, v, 12
        bpp_frontend_mix2 \wd1, \wd2, h, 10
        bpp_frontend_mix2 \wd1, \wd2, h, 12
.endm
521cabdff1aSopenharmony_ci
// Internal worker (declared without export=1): filter width 4, applied to
// 8 columns of 16 bit pixels across the edge between the row above x0 and
// the row at x0.
//
// In:  x0 = address of the q0 row (first row below the edge),
//      x1 = stride in bytes.
//      The thresholds and bit depth constants prepared by the frontends are
//      consumed inside loop_filter_4 (defined outside this excerpt).
// Out: rows p1, p0, q0, q1 are rewritten; x0 is back at its entry value.
// Uses x9 as the pointer into the rows above the edge and v20-v27 for
// p3..q3, in addition to whatever loop_filter_4 touches.
function vp9_loop_filter_v_4_8_16_neon
        // Return through x10 rather than x30; presumably this lets the
        // filter macro return early on its own - confirm in loop_filter.
        mov             x10, x30
        // x9 = x0 - 4 * stride, the p3 row. Loads from the two halves are
        // interleaved.
        sub             x9,  x0,  x1, lsl #2
        ld1             {v20.8h}, [x9], x1 // p3
        ld1             {v24.8h}, [x0], x1 // q0
        ld1             {v21.8h}, [x9], x1 // p2
        ld1             {v25.8h}, [x0], x1 // q1
        ld1             {v22.8h}, [x9], x1 // p1
        ld1             {v26.8h}, [x0], x1 // q2
        ld1             {v23.8h}, [x9], x1 // p0
        ld1             {v27.8h}, [x0], x1 // q3
        // Rewind: x0 back to the q0 row, x9 to the p1 row (2 rows above).
        sub             x0,  x0,  x1, lsl #2
        sub             x9,  x9,  x1, lsl #1

        loop_filter_4

        // Only p1, p0, q0 and q1 are written back.
        st1             {v22.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        // Restore x0 for the caller.
        sub             x0,  x0,  x1, lsl #1

        ret             x10
endfunc
546cabdff1aSopenharmony_ci
// 10 and 12 bpp frontends for vp9_loop_filter_v_4_8_16_neon; push keeps its
// default of 0.
bpp_frontends vp9_loop_filter_v_4_8
548cabdff1aSopenharmony_ci
// Internal worker (declared without export=1): filter width 4, applied to
// 8 rows of 16 bit pixels across the edge between the column left of x0 and
// the column at x0.
//
// In:  x0 = address of the first pixel right of the edge in the first row,
//      x1 = stride in bytes.
//      The thresholds and bit depth constants prepared by the frontends are
//      consumed inside loop_filter_4 (defined outside this excerpt).
// Out: the 4 pixels around the edge are rewritten in each of the 8 rows;
//      x0 is back at its entry value.
// Uses x9 for rows 0-3 and x0 for rows 4-7, v20-v27 for the pixel data and
// v28/v29 as transpose temporaries.
function vp9_loop_filter_h_4_8_16_neon
        // Return through x10 rather than x30; presumably this lets the
        // filter macro return early on its own - confirm in loop_filter.
        mov             x10, x30
        // Start 8 bytes (4 pixels) left of the edge; x9 walks rows 0-3,
        // x0 walks rows 4-7.
        sub             x9,  x0,  #8
        add             x0,  x9,  x1, lsl #2
        ld1             {v20.8h}, [x9], x1
        ld1             {v24.8h}, [x0], x1
        ld1             {v21.8h}, [x9], x1
        ld1             {v25.8h}, [x0], x1
        ld1             {v22.8h}, [x9], x1
        ld1             {v26.8h}, [x0], x1
        ld1             {v23.8h}, [x9], x1
        ld1             {v27.8h}, [x0], x1

        // Rewind: x9 to row 0 (still 8 bytes left of the edge), x0 to its
        // entry value.
        sub             x9,  x9,  x1, lsl #2
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #8

        // Rows -> columns, so that v20-v27 hold p3..q3 as the filter macro
        // expects.
        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        loop_filter_4

        // Move x9 forward by 2 pixels; we don't need to rewrite the
        // outermost 2 pixels since they aren't changed.
        add             x9,  x9,  #4
        add             x0,  x9,  x1, lsl #2

        // We only will write the mid 4 pixels back; after the loop filter,
        // these are in v22, v23, v24, v25, ordered as rows (8x4 pixels).
        // We need to transpose them to columns, done with a 4x8 transpose
        // (which in practice is two 4x4 transposes of the two 4x4 halves
        // of the 8x4 pixels; into 4x8 pixels).
        transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
        // Low halves go to rows 0-3 (x9), high halves to rows 4-7 (x0).
        st1             {v22.d}[0], [x9], x1
        st1             {v22.d}[1], [x0], x1
        st1             {v23.d}[0], [x9], x1
        st1             {v23.d}[1], [x0], x1
        st1             {v24.d}[0], [x9], x1
        st1             {v24.d}[1], [x0], x1
        st1             {v25.d}[0], [x9], x1
        st1             {v25.d}[1], [x0], x1
        // Restore x0 for the caller: back up 8 rows, forward 4 bytes.
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #4

        ret             x10
endfunc
594cabdff1aSopenharmony_ci
// 10 and 12 bpp frontends for vp9_loop_filter_h_4_8_16_neon; push keeps its
// default of 0.
bpp_frontends vp9_loop_filter_h_4_8
596cabdff1aSopenharmony_ci
// Internal worker (declared without export=1): filter width 8, applied to
// 8 columns of 16 bit pixels across the edge between the row above x0 and
// the row at x0.
//
// In:  x0 = address of the q0 row (first row below the edge),
//      x1 = stride in bytes.
//      The thresholds and bit depth constants prepared by the frontends are
//      consumed inside loop_filter_8 (defined outside this excerpt).
// Out: rows p2..q2 are rewritten on the main path, rows p1..q1 on the
//      path through label 6; x0 is back at its entry value either way.
// Uses x9 as the pointer into the rows above the edge and v20-v27 for
// p3..q3, in addition to whatever loop_filter_8 touches.
function vp9_loop_filter_v_8_8_16_neon
        // Return through x10 rather than x30; presumably this lets the
        // filter macro return early on its own - confirm in loop_filter.
        mov             x10, x30
        // x9 = x0 - 4 * stride, the p3 row.
        sub             x9,  x0,  x1, lsl #2
        ld1             {v20.8h}, [x9], x1 // p3
        ld1             {v24.8h}, [x0], x1 // q0
        ld1             {v21.8h}, [x9], x1 // p2
        ld1             {v25.8h}, [x0], x1 // q1
        ld1             {v22.8h}, [x9], x1 // p1
        ld1             {v26.8h}, [x0], x1 // q2
        ld1             {v23.8h}, [x9], x1 // p0
        ld1             {v27.8h}, [x0], x1 // q3
        // Rewind both pointers, then step x9 down to the p2 row
        // (x0 - 3 * stride), the first row that gets stored.
        sub             x9,  x9,  x1, lsl #2
        sub             x0,  x0,  x1, lsl #2
        add             x9,  x9,  x1

        loop_filter_8

        // Full writeback: p2, p1, p0 through x9 and q0, q1, q2 through x0.
        st1             {v21.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v22.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v26.8h}, [x0], x1
        // Restore x0 for the caller (3 rows back).
        sub             x0,  x0,  x1, lsl #1
        sub             x0,  x0,  x1

        ret             x10
6:
        // Reduced writeback: only p1, p0, q0, q1 are stored, as in the
        // width 4 filter. Presumably reached from inside loop_filter_8 when
        // the flat8in part was not needed (the h variant documents its
        // label 6 that way) - confirm in the macro.
        sub             x9,  x0,  x1, lsl #1
        st1             {v22.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        sub             x0,  x0,  x1, lsl #1
        ret             x10
endfunc
633cabdff1aSopenharmony_ci
// 10 and 12 bpp frontends for vp9_loop_filter_v_8_8_16_neon; push keeps its
// default of 0.
bpp_frontends vp9_loop_filter_v_8_8
635cabdff1aSopenharmony_ci
// Internal worker (declared without export=1): filter width 8, applied to
// 8 rows of 16 bit pixels across the edge between the column left of x0 and
// the column at x0.
//
// In:  x0 = address of the first pixel right of the edge in the first row,
//      x1 = stride in bytes.
//      The thresholds and bit depth constants prepared by the frontends are
//      consumed inside loop_filter_8 (defined outside this excerpt).
// Out: all 8 loaded pixels per row are stored back on the main path, the
//      4 pixels around the edge on the path through label 6; x0 is back at
//      its entry value either way.
// Uses x9 for rows 0-3 and x0 for rows 4-7, v20-v27 for the pixel data and
// v28/v29 as transpose temporaries.
function vp9_loop_filter_h_8_8_16_neon
        // Return through x10 rather than x30; presumably this lets the
        // filter macro return early on its own - confirm in loop_filter.
        mov             x10, x30
        // Start 8 bytes (4 pixels) left of the edge; x9 walks rows 0-3,
        // x0 walks rows 4-7.
        sub             x9,  x0,  #8
        add             x0,  x9,  x1, lsl #2
        ld1             {v20.8h}, [x9], x1
        ld1             {v24.8h}, [x0], x1
        ld1             {v21.8h}, [x9], x1
        ld1             {v25.8h}, [x0], x1
        ld1             {v22.8h}, [x9], x1
        ld1             {v26.8h}, [x0], x1
        ld1             {v23.8h}, [x9], x1
        ld1             {v27.8h}, [x0], x1

        // Rewind: x9 to row 0 (still 8 bytes left of the edge), x0 to its
        // entry value.
        sub             x9,  x9,  x1, lsl #2
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #8

        // Rows -> columns, so that v20-v27 hold p3..q3 as the filter macro
        // expects.
        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        loop_filter_8

        // x0 = row 4 at the same column as x9.
        add             x0,  x9,  x1, lsl #2

        // Even though only 6 pixels per row have been changed, we write the
        // full 8 pixel registers.
        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        st1             {v20.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v21.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        st1             {v22.8h}, [x9], x1
        st1             {v26.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v27.8h}, [x0], x1
        // Restore x0 for the caller: back up 8 rows, forward 8 bytes.
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #8

        ret             x10
6:
        // If we didn't need to do the flat8in part, we use the same writeback
        // as in loop_filter_h_4_8.
        // x9 still points 8 bytes left of the edge here; skip the 2
        // leftmost pixels.
        add             x9,  x9,  #4
        add             x0,  x9,  x1, lsl #2
        transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
        // Low halves go to rows 0-3 (x9), high halves to rows 4-7 (x0).
        st1             {v22.d}[0], [x9], x1
        st1             {v22.d}[1], [x0], x1
        st1             {v23.d}[0], [x9], x1
        st1             {v23.d}[1], [x0], x1
        st1             {v24.d}[0], [x9], x1
        st1             {v24.d}[1], [x0], x1
        st1             {v25.d}[0], [x9], x1
        st1             {v25.d}[1], [x0], x1
        // Restore x0 for the caller: back up 8 rows, forward 4 bytes.
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #4
        ret             x10
endfunc
693cabdff1aSopenharmony_ci
// 10 and 12 bpp frontends for vp9_loop_filter_h_8_8_16_neon; push keeps its
// default of 0.
bpp_frontends vp9_loop_filter_h_8_8

// Mixed width frontends: every combination of widths 4 and 8 for the first
// and second 8 pixel block, in both directions and both bit depths. They
// call the vp9_loop_filter_{v,h}_{4,8}_8_16_neon workers defined above.
bpp_frontends_mix2 4, 4
bpp_frontends_mix2 4, 8
bpp_frontends_mix2 8, 4
bpp_frontends_mix2 8, 8
700cabdff1aSopenharmony_ci
// Internal worker (declared without export=1): filter width 16, applied to
// 8 columns of 16 bit pixels across the edge between the row above x0 and
// the row at x0.
//
// In:  x0 = address of the q0 row (first row below the edge),
//      x1 = stride in bytes.
//      The thresholds and bit depth constants prepared by the frontends are
//      consumed inside loop_filter_16 (defined outside this excerpt).
// Out: three writeback variants, all leaving x0 at its entry value:
//        main path - 7 rows above and 7 rows below the edge (p6..q6),
//        label 8   - p2..q2,
//        label 7   - p1..q1.
// Uses x9 as the pointer into the rows above the edge and v16-v31 for
// p7..q7. The main path stores from v8-v15, whose low halves d8-d15 are
// callee-saved under AAPCS64 and are not saved here; the frontends are
// instantiated with push=1 for that reason.
function vp9_loop_filter_v_16_8_16_neon
        // Return through x10 rather than x30; presumably this lets the
        // filter macro return early on its own - confirm in loop_filter.
        mov             x10, x30
        // x9 = x0 - 8 * stride, the p7 row.
        sub             x9,  x0,  x1, lsl #3
        ld1             {v16.8h}, [x9], x1 // p7
        ld1             {v24.8h}, [x0], x1 // q0
        ld1             {v17.8h}, [x9], x1 // p6
        ld1             {v25.8h}, [x0], x1 // q1
        ld1             {v18.8h}, [x9], x1 // p5
        ld1             {v26.8h}, [x0], x1 // q2
        ld1             {v19.8h}, [x9], x1 // p4
        ld1             {v27.8h}, [x0], x1 // q3
        ld1             {v20.8h}, [x9], x1 // p3
        ld1             {v28.8h}, [x0], x1 // q4
        ld1             {v21.8h}, [x9], x1 // p2
        ld1             {v29.8h}, [x0], x1 // q5
        ld1             {v22.8h}, [x9], x1 // p1
        ld1             {v30.8h}, [x0], x1 // q6
        ld1             {v23.8h}, [x9], x1 // p0
        ld1             {v31.8h}, [x0], x1 // q7
        // Rewind both pointers, then step x9 down to the p6 row
        // (x0 - 7 * stride), the first row that gets stored.
        sub             x9,  x9,  x1, lsl #3
        sub             x0,  x0,  x1, lsl #3
        add             x9,  x9,  x1

        loop_filter_16

        // If we did the flat8out part, we get the output in
        // v2-v17 (skipping v7 and v16). x9 points to x0 - 7 * stride,
        // store v2-v9 there, and v10-v17 into x0.
        st1             {v2.8h},  [x9], x1
        st1             {v10.8h}, [x0], x1
        st1             {v3.8h},  [x9], x1
        st1             {v11.8h}, [x0], x1
        st1             {v4.8h},  [x9], x1
        st1             {v12.8h}, [x0], x1
        st1             {v5.8h},  [x9], x1
        st1             {v13.8h}, [x0], x1
        st1             {v6.8h},  [x9], x1
        st1             {v14.8h}, [x0], x1
        st1             {v8.8h},  [x9], x1
        st1             {v15.8h}, [x0], x1
        st1             {v9.8h},  [x9], x1
        st1             {v17.8h}, [x0], x1
        // Restore x0 for the caller: 7 rows back, written as -8 + 1.
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  x1

        ret             x10
8:
        // x9 is still x0 - 7 * stride; move it down to the p2 row.
        add             x9,  x9,  x1, lsl #2
        // If we didn't do the flat8out part, the output is left in the
        // input registers.
        st1             {v21.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v22.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v26.8h}, [x0], x1
        // Restore x0 for the caller (3 rows back).
        sub             x0,  x0,  x1, lsl #1
        sub             x0,  x0,  x1
        ret             x10
7:
        // Smallest writeback: only p1, p0, q0, q1, as in the width 4
        // filter. Presumably reached from inside loop_filter_16 when the
        // flat8in part was not needed either - confirm in the macro.
        sub             x9,  x0,  x1, lsl #1
        st1             {v22.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        sub             x0,  x0,  x1, lsl #1
        ret             x10
endfunc
769cabdff1aSopenharmony_ci
// Frontends for vp9_loop_filter_v_16_8_16_neon, for 10 and 12 bpp:
//  - bpp_frontends: single call, one 8 pixel wide block;
//  - bpp_frontends_rep: ff_vp9_loop_filter_v_16_16_{10,12}_neon, calling the
//    worker twice with x0 advanced by 16 bytes in between.
// push=1 asks for d8-d15 to be saved and restored around the calls, since
// the worker stores its results from v8-v15.
bpp_frontends vp9_loop_filter_v_16_8, push=1
bpp_frontends_rep vp9_loop_filter_v_16, 16, 8, v, push=1
772cabdff1aSopenharmony_ci
// Internal worker (declared without export=1): filter width 16, applied to
// 8 rows of 16 bit pixels across the edge between the column left of x0 and
// the column at x0.
//
// In:  x0 = address of the first pixel right of the edge in the first row,
//      x1 = stride in bytes.
//      The thresholds and bit depth constants prepared by the frontends are
//      consumed inside loop_filter_16 (defined outside this excerpt).
// Out: three writeback variants, all leaving x0 at its entry value:
//        main path - all 16 loaded pixels of each row are stored back,
//        label 8   - the 8 pixels around the edge,
//        label 7   - the 4 pixels around the edge.
// Uses x9 for the left 8 pixels and x0 for the right 8 pixels of each row,
// v16-v31 for the pixel data and v0/v1 or v28/v29 as transpose temporaries.
// The main path uses v8-v15, whose low halves d8-d15 are callee-saved under
// AAPCS64 and are not saved here; the frontends are instantiated with
// push=1 for that reason.
function vp9_loop_filter_h_16_8_16_neon
        // Return through x10 rather than x30; presumably this lets the
        // filter macro return early on its own - confirm in loop_filter.
        mov             x10, x30
        // x9 starts 16 bytes (8 pixels) left of the edge.
        sub             x9,  x0,  #16
        ld1             {v16.8h}, [x9], x1
        ld1             {v24.8h}, [x0], x1
        ld1             {v17.8h}, [x9], x1
        ld1             {v25.8h}, [x0], x1
        ld1             {v18.8h}, [x9], x1
        ld1             {v26.8h}, [x0], x1
        ld1             {v19.8h}, [x9], x1
        ld1             {v27.8h}, [x0], x1
        ld1             {v20.8h}, [x9], x1
        ld1             {v28.8h}, [x0], x1
        ld1             {v21.8h}, [x9], x1
        ld1             {v29.8h}, [x0], x1
        ld1             {v22.8h}, [x9], x1
        ld1             {v30.8h}, [x0], x1
        ld1             {v23.8h}, [x9], x1
        ld1             {v31.8h}, [x0], x1
        // Rewind both pointers to the first row.
        sub             x0,  x0,  x1, lsl #3
        sub             x9,  x9,  x1, lsl #3

        // The 16x8 pixels read above is in two 8x8 blocks; the left
        // half in v16-v23, and the right half in v24-v31. Do two 8x8 transposes
        // of this, to get one column per register.
        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v0, v1

        loop_filter_16

        // Main path: the filtered columns are in v2-v6, v8, v9 (left of the
        // edge) and v10-v15, v17 (right of it), matching the store order of
        // the v variant; v16 and v31 are the untouched outermost columns.
        // Transpose back to rows.
        transpose_8x8H  v16, v2,  v3,  v4,  v5,  v6,  v8,  v9,  v0, v1
        transpose_8x8H  v10, v11, v12, v13, v14, v15, v17, v31, v0, v1

        // Left halves of the rows go through x9, right halves through x0.
        st1             {v16.8h}, [x9], x1
        st1             {v10.8h}, [x0], x1
        st1             {v2.8h},  [x9], x1
        st1             {v11.8h}, [x0], x1
        st1             {v3.8h},  [x9], x1
        st1             {v12.8h}, [x0], x1
        st1             {v4.8h},  [x9], x1
        st1             {v13.8h}, [x0], x1
        st1             {v5.8h},  [x9], x1
        st1             {v14.8h}, [x0], x1
        st1             {v6.8h},  [x9], x1
        st1             {v15.8h}, [x0], x1
        st1             {v8.8h},  [x9], x1
        st1             {v17.8h}, [x0], x1
        st1             {v9.8h},  [x9], x1
        st1             {v31.8h}, [x0], x1
        // Restore x0 for the caller.
        sub             x0,  x0,  x1, lsl #3

        ret             x10
8:
        // The same writeback as in loop_filter_h_8_8
        // x9 = 4 pixels left of the edge in rows 0-3, x0 = same column in
        // rows 4-7.
        sub             x9,  x0,  #8
        add             x0,  x9,  x1, lsl #2
        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        st1             {v20.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v21.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        st1             {v22.8h}, [x9], x1
        st1             {v26.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v27.8h}, [x0], x1
        // Restore x0 for the caller: back up 8 rows, forward 8 bytes.
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #8
        ret             x10
7:
        // The same writeback as in loop_filter_h_4_8
        // x9 = 2 pixels left of the edge in rows 0-3, x0 = same column in
        // rows 4-7.
        sub             x9,  x0,  #4
        add             x0,  x9,  x1, lsl #2
        transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
        st1             {v22.d}[0], [x9], x1
        st1             {v22.d}[1], [x0], x1
        st1             {v23.d}[0], [x9], x1
        st1             {v23.d}[1], [x0], x1
        st1             {v24.d}[0], [x9], x1
        st1             {v24.d}[1], [x0], x1
        st1             {v25.d}[0], [x9], x1
        st1             {v25.d}[1], [x0], x1
        // Restore x0 for the caller: back up 8 rows, forward 4 bytes.
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #4
        ret             x10
endfunc
859cabdff1aSopenharmony_ci
// Frontends for vp9_loop_filter_h_16_8_16_neon, for 10 and 12 bpp:
//  - bpp_frontends: single call, one block of 8 rows;
//  - bpp_frontends_rep: ff_vp9_loop_filter_h_16_16_{10,12}_neon, calling the
//    worker twice with x0 advanced by 8 rows in between.
// push=1 asks for d8-d15 to be saved and restored around the calls, since
// the worker uses v8-v15 for its results.
bpp_frontends vp9_loop_filter_h_16_8, push=1
bpp_frontends_rep vp9_loop_filter_h_16, 16, 8, h, push=1
862