/*
 * Copyright (c) 2016 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
20cabdff1aSopenharmony_ci
21cabdff1aSopenharmony_ci#include "libavutil/aarch64/asm.S"
22cabdff1aSopenharmony_ci#include "neon.S"
23cabdff1aSopenharmony_ci
24cabdff1aSopenharmony_ci
// The main loop filter macro is templated and can produce filters for
// vectors of 8 or 16 bytes. The register mapping throughout the filter
// is close to identical to the arm version (please try to maintain this,
// if either is changed!). When the arm version uses e.g. d20 for the
// input variable p3, the aarch64 version uses v20.8b or v20.16b, depending
// on vector length.
//
// The number of elements in the vector is passed in via the macro parameter
// \sz, which is either .8b or .16b. For simple instructions that don't
// lengthen or narrow things, this can easily be templated like this:
//      uabd            v4\sz,  v20\sz, v21\sz
//
// For instructions that lengthen or narrow content, the arm version would
// have used q registers. For these instructions, we have macros that expand
// into either a single e.g. uaddl instruction, or into a uaddl + uaddl2
// pair, depending on the \sz parameter. Wherever the arm version would have
// used a q register, these macros instead take two v registers, i.e. q3
// is mapped to v6+v7. For the case with 8 byte input vectors, such a
// lengthening operation is only stored in v6.8h (what was in q3 in the arm
// case), while the 16 byte input vectors will use v6.8h + v7.8h.
// Such a macro invocation would look like this:
//      uaddl_sz        v8.8h,  v9.8h,  v17, v18, \sz
//
// That is, in the 8 byte input vector case, the second register in these
// register pairs will be unused.
// Unfortunately, this makes the code quite hard to read. For readability,
// see the arm version instead.
52cabdff1aSopenharmony_ci
53cabdff1aSopenharmony_ci
// add_sz: element-wise vector add over one or two destination registers.
//   dst1, dst2  destination operands, passed with their arrangement
//               (the loop filter passes .8h registers)
//   in1,  in2   first addends for dst1 / dst2
//   in3,  in4   second addends for dst1 / dst2
//   sz          byte-vector width of the enclosing filter, .8b or .16b
// Always emits dst1 = in1 + in3. Only when sz is the literal .16b does it
// also emit dst2 = in2 + in4; otherwise dst2/in2/in4 are never referenced.
.macro add_sz dst1, dst2, in1, in2, in3, in4, sz
        add             \dst1,  \in1,  \in3
.ifc \sz, .16b
        add             \dst2,  \in2,  \in4
.endif
.endm
60cabdff1aSopenharmony_ci
// sub_sz: element-wise vector subtract over one or two destination registers.
//   dst1, dst2  destination operands, passed with their arrangement
//               (the loop filter passes .8h registers)
//   in1,  in2   minuends for dst1 / dst2
//   in3,  in4   subtrahends for dst1 / dst2
//   sz          byte-vector width of the enclosing filter, .8b or .16b
// Always emits dst1 = in1 - in3. Only when sz is the literal .16b does it
// also emit dst2 = in2 - in4; otherwise dst2/in2/in4 are never referenced.
.macro sub_sz dst1, dst2, in1, in2, in3, in4, sz
        sub             \dst1,  \in1,  \in3
.ifc \sz, .16b
        sub             \dst2,  \in2,  \in4
.endif
.endm
67cabdff1aSopenharmony_ci
// uaddw_sz: unsigned widening add of a byte vector onto wide accumulators.
//   dst1, dst2  destination operands with arrangement (e.g. v0.8h, v1.8h)
//   in1,  in2   wide operands added into dst1 / dst2
//   in3         bare register name (e.g. v23); the macro appends .8b / .16b
//   sz          byte-vector width of the enclosing filter, .8b or .16b
// Always emits uaddw, which consumes the low 8 bytes of in3. Only when sz is
// the literal .16b does it also emit uaddw2, which consumes the high 8 bytes
// of in3 and writes dst2.
.macro uaddw_sz dst1, dst2, in1, in2, in3, sz
        uaddw           \dst1,  \in1, \in3\().8b
.ifc \sz, .16b
        uaddw2          \dst2,  \in2, \in3\().16b
.endif
.endm
74cabdff1aSopenharmony_ci
// usubw_sz: unsigned widening subtract of a byte vector from wide operands.
//   dst1, dst2  destination operands with arrangement (e.g. v0.8h, v1.8h)
//   in1,  in2   wide minuends for dst1 / dst2
//   in3         bare register name (e.g. v16); the macro appends .8b / .16b
//   sz          byte-vector width of the enclosing filter, .8b or .16b
// Always emits usubw, which consumes the low 8 bytes of in3. Only when sz is
// the literal .16b does it also emit usubw2, which consumes the high 8 bytes
// of in3 and writes dst2.
.macro usubw_sz dst1, dst2, in1, in2, in3, sz
        usubw           \dst1,  \in1, \in3\().8b
.ifc \sz, .16b
        usubw2          \dst2,  \in2, \in3\().16b
.endif
.endm
81cabdff1aSopenharmony_ci
// usubl_sz: unsigned lengthening subtract of two byte vectors.
//   dst1, dst2  destination operands with arrangement (e.g. v8.8h, v9.8h)
//   in1,  in2   bare register names; the macro appends .8b / .16b
//   sz          byte-vector width of the enclosing filter, .8b or .16b
// Always emits usubl on the low 8 bytes of in1 and in2 (dst1 = in1 - in2).
// Only when sz is the literal .16b does it also emit usubl2 on the high
// 8 bytes, writing dst2.
.macro usubl_sz dst1, dst2, in1, in2, sz
        usubl           \dst1,  \in1\().8b,  \in2\().8b
.ifc \sz, .16b
        usubl2          \dst2,  \in1\().16b, \in2\().16b
.endif
.endm
88cabdff1aSopenharmony_ci
// sqxtn_sz: signed saturating narrow of one or two wide vectors into bytes.
//   dst         bare destination register name; the macro appends .8b / .16b
//   in1, in2    wide source operands with arrangement (e.g. v8.8h, v9.8h)
//   sz          byte-vector width of the enclosing filter, .8b or .16b
// Always emits sqxtn, narrowing in1 into the low 8 bytes of dst. Only when sz
// is the literal .16b does it also emit sqxtn2, narrowing in2 into the high
// 8 bytes of dst. The order matters: sqxtn must run first because it writes
// the low half of dst, and sqxtn2 then fills in the high half.
.macro sqxtn_sz dst, in1, in2, sz
        sqxtn           \dst\().8b,  \in1
.ifc \sz, .16b
        sqxtn2          \dst\().16b, \in2
.endif
.endm
95cabdff1aSopenharmony_ci
// sqxtun_sz: signed-to-unsigned saturating narrow of wide vectors into bytes.
//   dst         bare destination register name; the macro appends .8b / .16b
//   in1, in2    wide source operands with arrangement (e.g. v0.8h, v1.8h)
//   sz          byte-vector width of the enclosing filter, .8b or .16b
// Always emits sqxtun, narrowing in1 into the low 8 bytes of dst. Only when
// sz is the literal .16b does it also emit sqxtun2, narrowing in2 into the
// high 8 bytes of dst. The non-"2" form has to come first since it produces
// the low half that sqxtun2 then completes.
.macro sqxtun_sz dst, in1, in2, sz
        sqxtun          \dst\().8b,  \in1
.ifc \sz, .16b
        sqxtun2         \dst\().16b, \in2
.endif
.endm
102cabdff1aSopenharmony_ci
// mul_sz: element-wise vector multiply over one or two destination registers.
//   dst1, dst2  destination operands, passed with their arrangement
//               (the loop filter passes .8h registers)
//   in1,  in2   first factors for dst1 / dst2
//   in3,  in4   second factors for dst1 / dst2 (the loop filter passes the
//               same constant register for both)
//   sz          byte-vector width of the enclosing filter, .8b or .16b
// Always emits dst1 = in1 * in3. Only when sz is the literal .16b does it
// also emit dst2 = in2 * in4; otherwise dst2/in2/in4 are never referenced.
.macro mul_sz dst1, dst2, in1, in2, in3, in4, sz
        mul             \dst1,  \in1,  \in3
.ifc \sz, .16b
        mul             \dst2,  \in2,  \in4
.endif
.endm
109cabdff1aSopenharmony_ci
// saddw_sz: signed widening add of a byte vector onto wide accumulators.
//   dst1, dst2  destination operands with arrangement (e.g. v0.8h, v1.8h)
//   in1,  in2   wide operands added into dst1 / dst2
//   in3         bare register name; the macro appends .8b / .16b
//   sz          byte-vector width of the enclosing filter, .8b or .16b
// Always emits saddw, which consumes the low 8 bytes of in3. Only when sz is
// the literal .16b does it also emit saddw2, which consumes the high 8 bytes
// of in3 and writes dst2.
.macro saddw_sz dst1, dst2, in1, in2, in3, sz
        saddw           \dst1,  \in1, \in3\().8b
.ifc \sz, .16b
        saddw2          \dst2,  \in2, \in3\().16b
.endif
.endm
116cabdff1aSopenharmony_ci
// ssubw_sz: signed widening subtract of a byte vector from wide operands.
//   dst1, dst2  destination operands with arrangement (e.g. v2.8h, v3.8h)
//   in1,  in2   wide minuends for dst1 / dst2
//   in3         bare register name; the macro appends .8b / .16b
//   sz          byte-vector width of the enclosing filter, .8b or .16b
// Always emits ssubw, which consumes the low 8 bytes of in3. Only when sz is
// the literal .16b does it also emit ssubw2, which consumes the high 8 bytes
// of in3 and writes dst2.
.macro ssubw_sz dst1, dst2, in1, in2, in3, sz
        ssubw           \dst1,  \in1, \in3\().8b
.ifc \sz, .16b
        ssubw2          \dst2,  \in2, \in3\().16b
.endif
.endm
123cabdff1aSopenharmony_ci
// uxtl_sz: unsigned extend of a byte vector into one or two wide vectors.
//   dst1, dst2  destination operands with arrangement (e.g. v0.8h, v1.8h)
//   in          bare source register name; the macro appends .8b / .16b
//   sz          byte-vector width of the enclosing filter, .8b or .16b
// Always emits uxtl, widening the low 8 bytes of in into dst1. Only when sz
// is the literal .16b does it also emit uxtl2, widening the high 8 bytes of
// in into dst2.
.macro uxtl_sz dst1, dst2, in, sz
        uxtl            \dst1,  \in\().8b
.ifc \sz, .16b
        uxtl2           \dst2,  \in\().16b
.endif
.endm
130cabdff1aSopenharmony_ci
// uaddl_sz: unsigned lengthening add of two byte vectors.
//   dst1, dst2  destination operands with arrangement (e.g. v8.8h, v9.8h)
//   in1,  in2   bare register names; the macro appends .8b / .16b
//   sz          byte-vector width of the enclosing filter, .8b or .16b
// Always emits uaddl on the low 8 bytes of in1 and in2, writing dst1. Only
// when sz is the literal .16b does it also emit uaddl2 on the high 8 bytes,
// writing dst2.
.macro uaddl_sz dst1, dst2, in1, in2, sz
        uaddl           \dst1,  \in1\().8b,  \in2\().8b
.ifc \sz, .16b
        uaddl2          \dst2,  \in1\().16b, \in2\().16b
.endif
.endm
137cabdff1aSopenharmony_ci
// rshrn_sz: rounding shift right and narrow of wide vectors into bytes.
//   dst         bare destination register name; the macro appends .8b / .16b
//   in1, in2    wide source operands with arrangement (e.g. v0.8h, v1.8h)
//   shift       immediate shift amount, including the '#' (e.g. #3)
//   sz          byte-vector width of the enclosing filter, .8b or .16b
// Always emits rshrn, narrowing in1 into the low 8 bytes of dst. Only when sz
// is the literal .16b does it also emit rshrn2, narrowing in2 into the high
// 8 bytes of dst. rshrn has to precede rshrn2 since it produces the low half
// that rshrn2 then completes.
.macro rshrn_sz dst, in1, in2, shift, sz
        rshrn           \dst\().8b,  \in1, \shift
.ifc \sz, .16b
        rshrn2          \dst\().16b, \in2, \shift
.endif
.endm
144cabdff1aSopenharmony_ci
// ushll_sz: unsigned lengthening shift left of a byte vector.
//   dst1, dst2  destination operands with arrangement (e.g. v0.8h, v1.8h)
//   in          bare source register name; the macro appends .8b / .16b
//   shift       immediate shift amount, including the '#' (e.g. #3)
//   sz          byte-vector width of the enclosing filter, .8b or .16b
// Always emits ushll on the low 8 bytes of in, writing dst1. Only when sz is
// the literal .16b does it also emit ushll2 on the high 8 bytes, writing dst2.
.macro ushll_sz dst1, dst2, in, shift, sz
        ushll           \dst1,  \in\().8b,  \shift
.ifc \sz, .16b
        ushll2          \dst2,  \in\().16b, \shift
.endif
.endm
151cabdff1aSopenharmony_ci
// The input to and output from this macro is in the registers v16-v31,
// and v0-v7 are used as scratch registers.
// p7 = v16 .. p3 = v20, p0 = v23, q0 = v24, q3 = v27, q7 = v31
// Depending on the width of the loop filter, we either use v16-v19
// and v28-v31 as temp registers, or v8-v15.
// When comparing to the arm version, tmpq1 == tmp1 + tmp2,
// tmpq2 == tmp3 + tmp4, etc.
159cabdff1aSopenharmony_ci.macro loop_filter wd, sz, mix, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
160cabdff1aSopenharmony_ci.if \mix == 0
161cabdff1aSopenharmony_ci        dup             v0\sz,  w2        // E
162cabdff1aSopenharmony_ci        dup             v2\sz,  w3        // I
163cabdff1aSopenharmony_ci        dup             v3\sz,  w4        // H
164cabdff1aSopenharmony_ci.else
165cabdff1aSopenharmony_ci        dup             v0.8h,  w2        // E
166cabdff1aSopenharmony_ci        dup             v2.8h,  w3        // I
167cabdff1aSopenharmony_ci        dup             v3.8h,  w4        // H
168cabdff1aSopenharmony_ci        rev16           v1.16b, v0.16b    // E
169cabdff1aSopenharmony_ci        rev16           v4.16b, v2.16b    // I
170cabdff1aSopenharmony_ci        rev16           v5.16b, v3.16b    // H
171cabdff1aSopenharmony_ci        uzp1            v0.16b, v0.16b, v1.16b
172cabdff1aSopenharmony_ci        uzp1            v2.16b, v2.16b, v4.16b
173cabdff1aSopenharmony_ci        uzp1            v3.16b, v3.16b, v5.16b
174cabdff1aSopenharmony_ci.endif
175cabdff1aSopenharmony_ci
176cabdff1aSopenharmony_ci        uabd            v4\sz,  v20\sz, v21\sz        // abs(p3 - p2)
177cabdff1aSopenharmony_ci        uabd            v5\sz,  v21\sz, v22\sz        // abs(p2 - p1)
178cabdff1aSopenharmony_ci        uabd            v6\sz,  v22\sz, v23\sz        // abs(p1 - p0)
179cabdff1aSopenharmony_ci        uabd            v7\sz,  v24\sz, v25\sz        // abs(q0 - q1)
180cabdff1aSopenharmony_ci        uabd            \tmp1\sz,  v25\sz, v26\sz     // abs(q1 - q2)
181cabdff1aSopenharmony_ci        uabd            \tmp2\sz,  v26\sz, v27\sz     // abs(q2 - q3)
182cabdff1aSopenharmony_ci        umax            v4\sz,  v4\sz,  v5\sz
183cabdff1aSopenharmony_ci        umax            v5\sz,  v6\sz,  v7\sz
184cabdff1aSopenharmony_ci        umax            \tmp1\sz, \tmp1\sz, \tmp2\sz
185cabdff1aSopenharmony_ci        uabd            v6\sz,  v23\sz, v24\sz        // abs(p0 - q0)
186cabdff1aSopenharmony_ci        umax            v4\sz,  v4\sz,  v5\sz
187cabdff1aSopenharmony_ci        uqadd           v6\sz,  v6\sz,  v6\sz         // abs(p0 - q0) * 2
188cabdff1aSopenharmony_ci        uabd            v5\sz,  v22\sz, v25\sz        // abs(p1 - q1)
189cabdff1aSopenharmony_ci        umax            v4\sz,  v4\sz,  \tmp1\sz      // max(abs(p3 - p2), ..., abs(q2 - q3))
190cabdff1aSopenharmony_ci        ushr            v5\sz,  v5\sz,  #1
191cabdff1aSopenharmony_ci        cmhs            v4\sz,  v2\sz,  v4\sz         // max(abs()) <= I
192cabdff1aSopenharmony_ci        uqadd           v6\sz,  v6\sz,  v5\sz         // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
193cabdff1aSopenharmony_ci        cmhs            v5\sz,  v0\sz,  v6\sz
194cabdff1aSopenharmony_ci        and             v4\sz,  v4\sz,  v5\sz         // fm
195cabdff1aSopenharmony_ci
196cabdff1aSopenharmony_ci        // If no pixels need filtering, just exit as soon as possible
197cabdff1aSopenharmony_ci        mov             x5,  v4.d[0]
198cabdff1aSopenharmony_ci.ifc \sz, .16b
199cabdff1aSopenharmony_ci        mov             x6,  v4.d[1]
200cabdff1aSopenharmony_ci        adds            x5,  x5,  x6
201cabdff1aSopenharmony_ci        b.eq            9f
202cabdff1aSopenharmony_ci.else
203cabdff1aSopenharmony_ci        cbz             x5,  9f
204cabdff1aSopenharmony_ci.endif
205cabdff1aSopenharmony_ci
206cabdff1aSopenharmony_ci.if \wd >= 8
207cabdff1aSopenharmony_ci        movi            v0\sz,  #1
208cabdff1aSopenharmony_ci
209cabdff1aSopenharmony_ci        uabd            v6\sz,  v20\sz, v23\sz    // abs(p3 - p0)
210cabdff1aSopenharmony_ci        uabd            v2\sz,  v21\sz, v23\sz    // abs(p2 - p0)
211cabdff1aSopenharmony_ci        uabd            v1\sz,  v22\sz, v23\sz    // abs(p1 - p0)
212cabdff1aSopenharmony_ci        uabd            \tmp1\sz,  v25\sz, v24\sz // abs(q1 - q0)
213cabdff1aSopenharmony_ci        uabd            \tmp2\sz,  v26\sz, v24\sz // abs(q2 - q0)
214cabdff1aSopenharmony_ci        uabd            \tmp3\sz,  v27\sz, v24\sz // abs(q3 - q0)
215cabdff1aSopenharmony_ci        umax            v6\sz,  v6\sz,  v2\sz
216cabdff1aSopenharmony_ci        umax            v1\sz,  v1\sz,  \tmp1\sz
217cabdff1aSopenharmony_ci        umax            \tmp2\sz,  \tmp2\sz,  \tmp3\sz
218cabdff1aSopenharmony_ci.if \wd == 16
219cabdff1aSopenharmony_ci        uabd            v7\sz,  v16\sz, v23\sz    // abs(p7 - p0)
220cabdff1aSopenharmony_ci        umax            v6\sz,  v6\sz,  v1\sz
221cabdff1aSopenharmony_ci        uabd            v2\sz,  v17\sz, v23\sz    // abs(p6 - p0)
222cabdff1aSopenharmony_ci        umax            v6\sz,  v6\sz,  \tmp2\sz
223cabdff1aSopenharmony_ci        uabd            v1\sz,  v18\sz, v23\sz    // abs(p5 - p0)
224cabdff1aSopenharmony_ci        cmhs            v6\sz,  v0\sz,  v6\sz     // flat8in
225cabdff1aSopenharmony_ci        uabd            v8\sz,  v19\sz, v23\sz    // abs(p4 - p0)
226cabdff1aSopenharmony_ci        and             v6\sz,  v6\sz,  v4\sz     // flat8in && fm
227cabdff1aSopenharmony_ci        uabd            v9\sz,  v28\sz, v24\sz    // abs(q4 - q0)
228cabdff1aSopenharmony_ci        bic             v4\sz,  v4\sz,  v6\sz     // fm && !flat8in
229cabdff1aSopenharmony_ci        uabd            v10\sz, v29\sz, v24\sz    // abs(q5 - q0)
230cabdff1aSopenharmony_ci        uabd            v11\sz, v30\sz, v24\sz    // abs(q6 - q0)
231cabdff1aSopenharmony_ci        uabd            v12\sz, v31\sz, v24\sz    // abs(q7 - q0)
232cabdff1aSopenharmony_ci
233cabdff1aSopenharmony_ci        umax            v7\sz,  v7\sz,  v2\sz
234cabdff1aSopenharmony_ci        umax            v1\sz,  v1\sz,  v8\sz
235cabdff1aSopenharmony_ci        umax            v9\sz,  v9\sz,  v10\sz
236cabdff1aSopenharmony_ci        umax            v11\sz, v11\sz, v12\sz
237cabdff1aSopenharmony_ci        // The rest of the calculation of flat8out is interleaved below
238cabdff1aSopenharmony_ci.else
239cabdff1aSopenharmony_ci        // The rest of the calculation of flat8in is interleaved below
240cabdff1aSopenharmony_ci.endif
241cabdff1aSopenharmony_ci.endif
242cabdff1aSopenharmony_ci
243cabdff1aSopenharmony_ci        // Calculate the normal inner loop filter for 2 or 4 pixels
244cabdff1aSopenharmony_ci        uabd            v5\sz,  v22\sz, v23\sz // abs(p1 - p0)
245cabdff1aSopenharmony_ci.if \wd == 16
246cabdff1aSopenharmony_ci        umax            v7\sz,  v7\sz,  v1\sz
247cabdff1aSopenharmony_ci        umax            v9\sz,  v9\sz,  v11\sz
248cabdff1aSopenharmony_ci.elseif \wd == 8
249cabdff1aSopenharmony_ci        umax            v6\sz,  v6\sz,  v1\sz
250cabdff1aSopenharmony_ci.endif
251cabdff1aSopenharmony_ci        uabd            v1\sz,  v25\sz, v24\sz // abs(q1 - q0)
252cabdff1aSopenharmony_ci.if \wd == 16
253cabdff1aSopenharmony_ci        umax            v7\sz,  v7\sz,  v9\sz
254cabdff1aSopenharmony_ci.elseif \wd == 8
255cabdff1aSopenharmony_ci        umax            v6\sz,  v6\sz,  \tmp2\sz
256cabdff1aSopenharmony_ci.endif
257cabdff1aSopenharmony_ci        usubl_sz        \tmp1\().8h,  \tmp2\().8h,  v22,  v25, \sz // p1 - q1
258cabdff1aSopenharmony_ci        umax            v5\sz,  v5\sz,  v1\sz  // max(abs(p1 - p0), abs(q1 - q0))
259cabdff1aSopenharmony_ci.if \mix != 0
260cabdff1aSopenharmony_ci        mov             v1.d[0], x11
261cabdff1aSopenharmony_ci.endif
262cabdff1aSopenharmony_ci        usubl_sz        \tmp3\().8h,  \tmp4\().8h,  v24,  v23, \sz // q0 - p0
263cabdff1aSopenharmony_ci        movi            \tmp5\().8h,  #3
264cabdff1aSopenharmony_ci.if \wd == 8
265cabdff1aSopenharmony_ci        cmhs            v6\sz,  v0\sz,  v6\sz  // flat8in
266cabdff1aSopenharmony_ci.endif
267cabdff1aSopenharmony_ci.if \mix != 0
268cabdff1aSopenharmony_ci        sxtl            v1.8h,  v1.8b
269cabdff1aSopenharmony_ci.endif
270cabdff1aSopenharmony_ci        cmhs            v5\sz,  v3\sz,  v5\sz  // !hev
271cabdff1aSopenharmony_ci.if \wd == 8
272cabdff1aSopenharmony_ci        // If a 4/8 or 8/4 mix is used, clear the relevant half of v6
273cabdff1aSopenharmony_ci.if \mix != 0
274cabdff1aSopenharmony_ci        and             v6\sz,  v6\sz,  v1.16b
275cabdff1aSopenharmony_ci.endif
276cabdff1aSopenharmony_ci        and             v6\sz,  v6\sz,  v4\sz  // flat8in && fm
277cabdff1aSopenharmony_ci.endif
278cabdff1aSopenharmony_ci        sqxtn_sz        \tmp1,        \tmp1\().8h,  \tmp2\().8h, \sz // av_clip_int8(p1 - q1)
279cabdff1aSopenharmony_ci.if \wd == 16
280cabdff1aSopenharmony_ci        cmhs            v7\sz,  v0\sz,  v7\sz  // flat8out
281cabdff1aSopenharmony_ci.elseif \wd == 8
282cabdff1aSopenharmony_ci        bic             v4\sz,  v4\sz,  v6\sz  // fm && !flat8in
283cabdff1aSopenharmony_ci.endif
284cabdff1aSopenharmony_ci        and             v5\sz,  v5\sz,  v4\sz  // !hev && fm && !flat8in
285cabdff1aSopenharmony_ci.if \wd == 16
286cabdff1aSopenharmony_ci        and             v7\sz,  v7\sz,  v6\sz  // flat8out && flat8in && fm
287cabdff1aSopenharmony_ci.endif
288cabdff1aSopenharmony_ci
289cabdff1aSopenharmony_ci        mul_sz          \tmp3\().8h,  \tmp4\().8h,  \tmp3\().8h, \tmp4\().8h,  \tmp5\().8h,  \tmp5\().8h, \sz // 3 * (q0 - p0)
290cabdff1aSopenharmony_ci        bic             \tmp1\sz,  \tmp1\sz,  v5\sz    // if (!hev) av_clip_int8 = 0
291cabdff1aSopenharmony_ci        movi            v2\sz,  #4
292cabdff1aSopenharmony_ci        saddw_sz        \tmp3\().8h,  \tmp4\().8h,  \tmp3\().8h, \tmp4\().8h,  \tmp1, \sz // 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
293cabdff1aSopenharmony_ci        movi            v3\sz,  #3
294cabdff1aSopenharmony_ci        sqxtn_sz        \tmp1,        \tmp3\().8h,  \tmp4\().8h, \sz       // f
295cabdff1aSopenharmony_ci.if \wd == 16
296cabdff1aSopenharmony_ci        bic             v6\sz,  v6\sz,  v7\sz  // fm && flat8in && !flat8out
297cabdff1aSopenharmony_ci.endif
298cabdff1aSopenharmony_ci
299cabdff1aSopenharmony_ci        sqadd           \tmp3\sz,  \tmp1\sz,  v2\sz // FFMIN(f + 4, 127)
300cabdff1aSopenharmony_ci        sqadd           \tmp4\sz,  \tmp1\sz,  v3\sz // FFMIN(f + 3, 127)
301cabdff1aSopenharmony_ci        uxtl_sz         v0.8h,  v1.8h,  v23, \sz    // p0
302cabdff1aSopenharmony_ci        sshr            \tmp3\sz,  \tmp3\sz,  #3    // f1
303cabdff1aSopenharmony_ci        sshr            \tmp4\sz,  \tmp4\sz,  #3    // f2
304cabdff1aSopenharmony_ci
305cabdff1aSopenharmony_ci        uxtl_sz         v2.8h,  v3.8h,  v24, \sz    // q0
306cabdff1aSopenharmony_ci        saddw_sz        v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp4, \sz // p0 + f2
307cabdff1aSopenharmony_ci        ssubw_sz        v2.8h,  v3.8h,  v2.8h,  v3.8h,  \tmp3, \sz // q0 - f1
308cabdff1aSopenharmony_ci        sqxtun_sz       v0,  v0.8h,  v1.8h,  \sz    // out p0
309cabdff1aSopenharmony_ci        sqxtun_sz       v1,  v2.8h,  v3.8h,  \sz    // out q0
310cabdff1aSopenharmony_ci        srshr           \tmp3\sz, \tmp3\sz, #1      // f = (f1 + 1) >> 1
311cabdff1aSopenharmony_ci        bit             v23\sz, v0\sz,  v4\sz       // if (fm && !flat8in)
312cabdff1aSopenharmony_ci        bit             v24\sz, v1\sz,  v4\sz
313cabdff1aSopenharmony_ci
314cabdff1aSopenharmony_ci        uxtl_sz         v0.8h,  v1.8h,  v22, \sz    // p1
315cabdff1aSopenharmony_ci        uxtl_sz         v2.8h,  v3.8h,  v25, \sz    // q1
316cabdff1aSopenharmony_ci.if \wd >= 8
317cabdff1aSopenharmony_ci        mov             x5,  v6.d[0]
318cabdff1aSopenharmony_ci.ifc \sz, .16b
319cabdff1aSopenharmony_ci        mov             x6,  v6.d[1]
320cabdff1aSopenharmony_ci.endif
321cabdff1aSopenharmony_ci.endif
322cabdff1aSopenharmony_ci        saddw_sz        v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp3, \sz // p1 + f
323cabdff1aSopenharmony_ci        ssubw_sz        v2.8h,  v3.8h,  v2.8h,  v3.8h,  \tmp3, \sz // q1 - f
324cabdff1aSopenharmony_ci        sqxtun_sz       v0,  v0.8h,  v1.8h, \sz     // out p1
325cabdff1aSopenharmony_ci        sqxtun_sz       v2,  v2.8h,  v3.8h, \sz     // out q1
326cabdff1aSopenharmony_ci.if \wd >= 8
327cabdff1aSopenharmony_ci.ifc \sz, .16b
328cabdff1aSopenharmony_ci        adds            x5,  x5,  x6
329cabdff1aSopenharmony_ci.endif
330cabdff1aSopenharmony_ci.endif
331cabdff1aSopenharmony_ci        bit             v22\sz, v0\sz,  v5\sz       // if (!hev && fm && !flat8in)
332cabdff1aSopenharmony_ci        bit             v25\sz, v2\sz,  v5\sz
333cabdff1aSopenharmony_ci
334cabdff1aSopenharmony_ci        // If no pixels need flat8in, jump to flat8out
335cabdff1aSopenharmony_ci        // (or to a writeout of the inner 4 pixels, for wd=8)
336cabdff1aSopenharmony_ci.if \wd >= 8
337cabdff1aSopenharmony_ci.ifc \sz, .16b
338cabdff1aSopenharmony_ci        b.eq            6f
339cabdff1aSopenharmony_ci.else
340cabdff1aSopenharmony_ci        cbz             x5,  6f
341cabdff1aSopenharmony_ci.endif
342cabdff1aSopenharmony_ci
343cabdff1aSopenharmony_ci        // flat8in
344cabdff1aSopenharmony_ci        uaddl_sz        \tmp1\().8h, \tmp2\().8h,  v20, v21, \sz
345cabdff1aSopenharmony_ci        uaddl_sz        \tmp3\().8h, \tmp4\().8h,  v22, v25, \sz
346cabdff1aSopenharmony_ci        uaddl_sz        \tmp5\().8h, \tmp6\().8h,  v20, v22, \sz
347cabdff1aSopenharmony_ci        uaddl_sz        \tmp7\().8h, \tmp8\().8h,  v23, v26, \sz
348cabdff1aSopenharmony_ci        add_sz          v0.8h,  v1.8h,  \tmp1\().8h, \tmp2\().8h, \tmp1\().8h, \tmp2\().8h, \sz
349cabdff1aSopenharmony_ci        uaddw_sz        v0.8h,  v1.8h,  v0.8h,  v1.8h,  v23, \sz
350cabdff1aSopenharmony_ci        uaddw_sz        v0.8h,  v1.8h,  v0.8h,  v1.8h,  v24, \sz
351cabdff1aSopenharmony_ci        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp5\().8h, \tmp6\().8h, \sz
352cabdff1aSopenharmony_ci        sub_sz          \tmp3\().8h, \tmp4\().8h,  \tmp3\().8h, \tmp4\().8h,  \tmp1\().8h, \tmp2\().8h, \sz
353cabdff1aSopenharmony_ci        sub_sz          \tmp7\().8h, \tmp8\().8h,  \tmp7\().8h, \tmp8\().8h,  \tmp5\().8h, \tmp6\().8h, \sz
354cabdff1aSopenharmony_ci        rshrn_sz        v2,  v0.8h,  v1.8h,  #3,  \sz // out p2
355cabdff1aSopenharmony_ci
356cabdff1aSopenharmony_ci        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp3\().8h, \tmp4\().8h, \sz
357cabdff1aSopenharmony_ci        uaddl_sz        \tmp1\().8h, \tmp2\().8h,  v20,  v23, \sz
358cabdff1aSopenharmony_ci        uaddl_sz        \tmp3\().8h, \tmp4\().8h,  v24,  v27, \sz
359cabdff1aSopenharmony_ci        rshrn_sz        v3,  v0.8h,  v1.8h,  #3,  \sz // out p1
360cabdff1aSopenharmony_ci
361cabdff1aSopenharmony_ci        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp7\().8h, \tmp8\().8h, \sz
362cabdff1aSopenharmony_ci        sub_sz          \tmp3\().8h, \tmp4\().8h,  \tmp3\().8h, \tmp4\().8h,  \tmp1\().8h, \tmp2\().8h, \sz
363cabdff1aSopenharmony_ci        uaddl_sz        \tmp5\().8h, \tmp6\().8h,  v21,  v24, \sz
364cabdff1aSopenharmony_ci        uaddl_sz        \tmp7\().8h, \tmp8\().8h,  v25,  v27, \sz
365cabdff1aSopenharmony_ci        rshrn_sz        v4,  v0.8h,  v1.8h,  #3,  \sz // out p0
366cabdff1aSopenharmony_ci
367cabdff1aSopenharmony_ci        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp3\().8h, \tmp4\().8h, \sz
368cabdff1aSopenharmony_ci        sub_sz          \tmp7\().8h, \tmp8\().8h,  \tmp7\().8h, \tmp8\().8h,  \tmp5\().8h, \tmp6\().8h, \sz
369cabdff1aSopenharmony_ci        uaddl_sz        \tmp1\().8h, \tmp2\().8h,  v22,  v25, \sz
370cabdff1aSopenharmony_ci        uaddl_sz        \tmp3\().8h, \tmp4\().8h,  v26,  v27, \sz
371cabdff1aSopenharmony_ci        rshrn_sz        v5,  v0.8h,  v1.8h,  #3,  \sz // out q0
372cabdff1aSopenharmony_ci
373cabdff1aSopenharmony_ci        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp7\().8h, \tmp8\().8h, \sz
374cabdff1aSopenharmony_ci        sub_sz          \tmp3\().8h, \tmp4\().8h,  \tmp3\().8h, \tmp4\().8h,  \tmp1\().8h, \tmp2\().8h, \sz
375cabdff1aSopenharmony_ci        rshrn_sz        \tmp5,  v0.8h,  v1.8h,  #3,  \sz // out q1
376cabdff1aSopenharmony_ci
377cabdff1aSopenharmony_ci        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp3\().8h, \tmp4\().8h, \sz
378cabdff1aSopenharmony_ci        // The output here is written back into the input registers. This doesn't
379cabdff1aSopenharmony_ci        // matter for the flat8part below, since we only update those pixels
380cabdff1aSopenharmony_ci        // which won't be touched below.
381cabdff1aSopenharmony_ci        bit             v21\sz, v2\sz,  v6\sz
382cabdff1aSopenharmony_ci        bit             v22\sz, v3\sz,  v6\sz
383cabdff1aSopenharmony_ci        bit             v23\sz, v4\sz,  v6\sz
384cabdff1aSopenharmony_ci        rshrn_sz        \tmp6,  v0.8h,  v1.8h,  #3,  \sz // out q2
385cabdff1aSopenharmony_ci        bit             v24\sz, v5\sz,  v6\sz
386cabdff1aSopenharmony_ci        bit             v25\sz, \tmp5\sz,  v6\sz
387cabdff1aSopenharmony_ci        bit             v26\sz, \tmp6\sz,  v6\sz
388cabdff1aSopenharmony_ci.endif
389cabdff1aSopenharmony_ci.if \wd == 16
390cabdff1aSopenharmony_ci6:
391cabdff1aSopenharmony_ci        orr             v2\sz,  v6\sz,  v7\sz
392cabdff1aSopenharmony_ci        mov             x5,  v2.d[0]
393cabdff1aSopenharmony_ci.ifc \sz, .16b
394cabdff1aSopenharmony_ci        mov             x6,  v2.d[1]
395cabdff1aSopenharmony_ci        adds            x5,  x5,  x6
396cabdff1aSopenharmony_ci        b.ne            1f
397cabdff1aSopenharmony_ci.else
398cabdff1aSopenharmony_ci        cbnz            x5,  1f
399cabdff1aSopenharmony_ci.endif
400cabdff1aSopenharmony_ci        // If no pixels needed flat8in nor flat8out, jump to a
401cabdff1aSopenharmony_ci        // writeout of the inner 4 pixels
402cabdff1aSopenharmony_ci        ret             x14
403cabdff1aSopenharmony_ci1:
404cabdff1aSopenharmony_ci
405cabdff1aSopenharmony_ci        mov             x5,  v7.d[0]
406cabdff1aSopenharmony_ci.ifc \sz, .16b
407cabdff1aSopenharmony_ci        mov             x6,  v7.d[1]
408cabdff1aSopenharmony_ci        adds            x5,  x5,  x6
409cabdff1aSopenharmony_ci        b.ne            1f
410cabdff1aSopenharmony_ci.else
411cabdff1aSopenharmony_ci        cbnz            x5,  1f
412cabdff1aSopenharmony_ci.endif
413cabdff1aSopenharmony_ci        // If no pixels need flat8out, jump to a writeout of the inner 6 pixels
414cabdff1aSopenharmony_ci        ret             x15
415cabdff1aSopenharmony_ci
416cabdff1aSopenharmony_ci1:
417cabdff1aSopenharmony_ci        // flat8out
418cabdff1aSopenharmony_ci        // This writes all outputs into v2-v17 (skipping v6 and v16).
419cabdff1aSopenharmony_ci        // If this part is skipped, the output is read from v21-v26 (which is the input
420cabdff1aSopenharmony_ci        // to this section).
421cabdff1aSopenharmony_ci        ushll_sz        v0.8h,  v1.8h,  v16,  #3,  \sz           // 8 * v16
422cabdff1aSopenharmony_ci        usubw_sz        v0.8h,  v1.8h,  v0.8h,  v1.8h,  v16, \sz // 7 * v16
423cabdff1aSopenharmony_ci        uaddw_sz        v0.8h,  v1.8h,  v0.8h,  v1.8h,  v17, \sz
424cabdff1aSopenharmony_ci        uaddl_sz        v8.8h,  v9.8h,  v17, v18, \sz
425cabdff1aSopenharmony_ci        uaddl_sz        v10.8h, v11.8h, v19, v20, \sz
426cabdff1aSopenharmony_ci        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v8.8h,  v9.8h,  \sz
427cabdff1aSopenharmony_ci        uaddl_sz        v8.8h,  v9.8h,  v16, v17, \sz
428cabdff1aSopenharmony_ci        uaddl_sz        v12.8h, v13.8h, v21, v22, \sz
429cabdff1aSopenharmony_ci        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v10.8h, v11.8h, \sz
430cabdff1aSopenharmony_ci        uaddl_sz        v10.8h, v11.8h, v18, v25, \sz
431cabdff1aSopenharmony_ci        uaddl_sz        v14.8h, v15.8h, v23, v24, \sz
432cabdff1aSopenharmony_ci        sub_sz          v10.8h, v11.8h, v10.8h, v11.8h, v8.8h,  v9.8h,  \sz
433cabdff1aSopenharmony_ci        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v12.8h, v13.8h, \sz
434cabdff1aSopenharmony_ci        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v14.8h, v15.8h, \sz
435cabdff1aSopenharmony_ci        uaddl_sz        v12.8h, v13.8h, v16, v18, \sz
436cabdff1aSopenharmony_ci        uaddl_sz        v14.8h, v15.8h, v19, v26, \sz
437cabdff1aSopenharmony_ci        rshrn_sz        v2,  v0.8h,  v1.8h,  #4,  \sz
438cabdff1aSopenharmony_ci
439cabdff1aSopenharmony_ci        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v10.8h, v11.8h, \sz
440cabdff1aSopenharmony_ci        uaddl_sz        v8.8h,  v9.8h,  v16, v19, \sz
441cabdff1aSopenharmony_ci        uaddl_sz        v10.8h, v11.8h, v20, v27, \sz
442cabdff1aSopenharmony_ci        sub_sz          v14.8h, v15.8h, v14.8h, v15.8h, v12.8h, v13.8h, \sz
443cabdff1aSopenharmony_ci        bif             v2\sz,  v17\sz, v7\sz
444cabdff1aSopenharmony_ci        rshrn_sz        v3,  v0.8h,  v1.8h,  #4,  \sz
445cabdff1aSopenharmony_ci
446cabdff1aSopenharmony_ci        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v14.8h, v15.8h, \sz
447cabdff1aSopenharmony_ci        uaddl_sz        v12.8h, v13.8h, v16, v20, \sz
448cabdff1aSopenharmony_ci        uaddl_sz        v14.8h, v15.8h, v21, v28, \sz
449cabdff1aSopenharmony_ci        sub_sz          v10.8h, v11.8h, v10.8h, v11.8h, v8.8h,  v9.8h,  \sz
450cabdff1aSopenharmony_ci        bif             v3\sz,  v18\sz, v7\sz
451cabdff1aSopenharmony_ci        rshrn_sz        v4,  v0.8h,  v1.8h,  #4,  \sz
452cabdff1aSopenharmony_ci
453cabdff1aSopenharmony_ci        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v10.8h, v11.8h, \sz
454cabdff1aSopenharmony_ci        uaddl_sz        v8.8h,  v9.8h,  v16, v21, \sz
455cabdff1aSopenharmony_ci        uaddl_sz        v10.8h, v11.8h, v22, v29, \sz
456cabdff1aSopenharmony_ci        sub_sz          v14.8h, v15.8h, v14.8h, v15.8h, v12.8h, v13.8h, \sz
457cabdff1aSopenharmony_ci        bif             v4\sz,  v19\sz, v7\sz
458cabdff1aSopenharmony_ci        rshrn_sz        v5,  v0.8h,  v1.8h,  #4,  \sz
459cabdff1aSopenharmony_ci
460cabdff1aSopenharmony_ci        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v14.8h, v15.8h, \sz
461cabdff1aSopenharmony_ci        uaddl_sz        v12.8h, v13.8h, v16, v22, \sz
462cabdff1aSopenharmony_ci        uaddl_sz        v14.8h, v15.8h, v23, v30, \sz
463cabdff1aSopenharmony_ci        sub_sz          v10.8h, v11.8h, v10.8h, v11.8h, v8.8h,  v9.8h,  \sz
464cabdff1aSopenharmony_ci        bif             v5\sz,  v20\sz, v7\sz
465cabdff1aSopenharmony_ci        rshrn_sz        v6,  v0.8h,  v1.8h,  #4,  \sz
466cabdff1aSopenharmony_ci
467cabdff1aSopenharmony_ci        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v10.8h, v11.8h, \sz
468cabdff1aSopenharmony_ci        uaddl_sz        v10.8h, v11.8h, v16, v23, \sz
469cabdff1aSopenharmony_ci        sub_sz          v14.8h, v15.8h, v14.8h, v15.8h, v12.8h, v13.8h, \sz
470cabdff1aSopenharmony_ci        uaddl_sz        v12.8h, v13.8h, v24, v31, \sz
471cabdff1aSopenharmony_ci        bif             v6\sz,  v21\sz, v7\sz
472cabdff1aSopenharmony_ci        rshrn_sz        v8,  v0.8h,  v1.8h,  #4,  \sz
473cabdff1aSopenharmony_ci
474cabdff1aSopenharmony_ci        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v14.8h, v15.8h, \sz
475cabdff1aSopenharmony_ci        sub_sz          v10.8h, v11.8h, v12.8h, v13.8h, v10.8h, v11.8h, \sz
476cabdff1aSopenharmony_ci        uaddl_sz        v12.8h, v13.8h, v17, v24, \sz
477cabdff1aSopenharmony_ci        uaddl_sz        v14.8h, v15.8h, v25, v31, \sz
478cabdff1aSopenharmony_ci        bif             v8\sz,  v22\sz, v7\sz
479cabdff1aSopenharmony_ci        rshrn_sz        v9,  v0.8h,  v1.8h,  #4,  \sz
480cabdff1aSopenharmony_ci
481cabdff1aSopenharmony_ci        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v10.8h, v11.8h, \sz
482cabdff1aSopenharmony_ci        sub_sz          v14.8h, v15.8h, v14.8h, v15.8h, v12.8h, v13.8h, \sz
483cabdff1aSopenharmony_ci        uaddl_sz        v12.8h, v13.8h, v26, v31, \sz
484cabdff1aSopenharmony_ci        bif             v9\sz,  v23\sz, v7\sz
485cabdff1aSopenharmony_ci        rshrn_sz        v10, v0.8h,  v1.8h,  #4,  \sz
486cabdff1aSopenharmony_ci
487cabdff1aSopenharmony_ci        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v14.8h, v15.8h, \sz
488cabdff1aSopenharmony_ci        uaddl_sz        v14.8h, v15.8h, v18, v25, \sz
489cabdff1aSopenharmony_ci        uaddl_sz        v18.8h, v19.8h, v19, v26, \sz
490cabdff1aSopenharmony_ci        sub_sz          v12.8h, v13.8h, v12.8h, v13.8h, v14.8h, v15.8h, \sz
491cabdff1aSopenharmony_ci        uaddl_sz        v14.8h, v15.8h, v27, v31, \sz
492cabdff1aSopenharmony_ci        bif             v10\sz, v24\sz, v7\sz
493cabdff1aSopenharmony_ci        rshrn_sz        v11, v0.8h,  v1.8h,  #4,  \sz
494cabdff1aSopenharmony_ci
495cabdff1aSopenharmony_ci        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v12.8h, v13.8h, \sz
496cabdff1aSopenharmony_ci        uaddl_sz        v12.8h, v13.8h, v20, v27, \sz
497cabdff1aSopenharmony_ci        sub_sz          v14.8h, v15.8h, v14.8h, v15.8h, v18.8h, v19.8h, \sz
498cabdff1aSopenharmony_ci        uaddl_sz        v18.8h, v19.8h, v28, v31, \sz
499cabdff1aSopenharmony_ci        bif             v11\sz, v25\sz, v7\sz
500cabdff1aSopenharmony_ci        sub_sz          v18.8h, v19.8h, v18.8h, v19.8h, v12.8h, v13.8h, \sz
501cabdff1aSopenharmony_ci        rshrn_sz        v12, v0.8h,  v1.8h,  #4,  \sz
502cabdff1aSopenharmony_ci
503cabdff1aSopenharmony_ci        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v14.8h, v15.8h, \sz
504cabdff1aSopenharmony_ci        uaddl_sz        v14.8h, v15.8h, v21, v28, \sz
505cabdff1aSopenharmony_ci        uaddl_sz        v20.8h, v21.8h, v29, v31, \sz
506cabdff1aSopenharmony_ci        bif             v12\sz, v26\sz, v7\sz
507cabdff1aSopenharmony_ci        rshrn_sz        v13, v0.8h,  v1.8h,  #4,  \sz
508cabdff1aSopenharmony_ci
509cabdff1aSopenharmony_ci        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v18.8h, v19.8h, \sz
510cabdff1aSopenharmony_ci        sub_sz          v20.8h, v21.8h, v20.8h, v21.8h, v14.8h, v15.8h, \sz
511cabdff1aSopenharmony_ci        uaddl_sz        v18.8h, v19.8h, v22, v29, \sz
512cabdff1aSopenharmony_ci        uaddl_sz        v22.8h, v23.8h, v30, v31, \sz
513cabdff1aSopenharmony_ci        bif             v13\sz, v27\sz, v7\sz
514cabdff1aSopenharmony_ci        rshrn_sz        v14, v0.8h,  v1.8h,  #4,  \sz
515cabdff1aSopenharmony_ci
516cabdff1aSopenharmony_ci        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v20.8h, v21.8h, \sz
517cabdff1aSopenharmony_ci        sub_sz          v22.8h, v23.8h, v22.8h, v23.8h, v18.8h, v19.8h, \sz
518cabdff1aSopenharmony_ci        bif             v14\sz, v28\sz, v7\sz
519cabdff1aSopenharmony_ci        rshrn_sz        v15, v0.8h,  v1.8h,  #4,  \sz
520cabdff1aSopenharmony_ci
521cabdff1aSopenharmony_ci        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v22.8h, v23.8h, \sz
522cabdff1aSopenharmony_ci        bif             v15\sz, v29\sz, v7\sz
523cabdff1aSopenharmony_ci        rshrn_sz        v17, v0.8h,  v1.8h,  #4,  \sz
524cabdff1aSopenharmony_ci        bif             v17\sz, v30\sz, v7\sz
525cabdff1aSopenharmony_ci.endif
526cabdff1aSopenharmony_ci.endm
527cabdff1aSopenharmony_ci
528cabdff1aSopenharmony_ci// For wd <= 8, we use v16-v19 and v28-v31 for temp registers,
529cabdff1aSopenharmony_ci// while we need those for inputs/outputs in wd=16 and use v8-v15
530cabdff1aSopenharmony_ci// for temp registers there instead.
// Internal helper (not exported): the wd=4 filter on 8-byte vectors, i.e. the
// loop_filter macro instantiated with sz=.8b and mix=0. Reached through `bl`
// from the loop_filter_4 macro.
//   Pixel rows are passed in v20-v27; the public entry points store
//   v22-v25 back afterwards.
//   v16-v19 and v28-v31 are handed to the macro as temp registers.
//   x10 must hold the public function's own return address (used at 9:).
531cabdff1aSopenharmony_cifunction vp9_loop_filter_4
532cabdff1aSopenharmony_ci        loop_filter     4,  .8b,  0,    v16, v17, v18, v19, v28, v29, v30, v31
        // Normal exit: back to the `bl` site, where the caller writes the result.
533cabdff1aSopenharmony_ci        ret
534cabdff1aSopenharmony_ci9:
        // Early exit: return directly to the public function's caller, so no
        // write-back happens. The branch to 9 lives in the macro body, outside
        // this excerpt; presumably it is taken when no pixel needs filtering.
535cabdff1aSopenharmony_ci        ret             x10
536cabdff1aSopenharmony_ciendfunc
537cabdff1aSopenharmony_ci
// Internal helper (not exported): the wd=4 filter on 16-byte vectors, with the
// macro's mix argument set to 44. Reached through `bl` from the
// loop_filter_4_16b_mix macro when it is invoked with mix=44.
//   Pixel rows are passed in v20-v27 (16 bytes each); callers store v22-v25.
//   v16-v19 and v28-v31 are handed to the macro as temp registers.
//   x10 must hold the public function's own return address (used at 9:).
538cabdff1aSopenharmony_cifunction vp9_loop_filter_4_16b_mix_44
539cabdff1aSopenharmony_ci        loop_filter     4,  .16b, 44,   v16, v17, v18, v19, v28, v29, v30, v31
        // Normal exit: back to the `bl` site for the write-back.
540cabdff1aSopenharmony_ci        ret
541cabdff1aSopenharmony_ci9:
        // Early exit that skips the caller's write-back entirely (the branch
        // to 9 is inside the macro body, which is not part of this excerpt).
542cabdff1aSopenharmony_ci        ret             x10
543cabdff1aSopenharmony_ciendfunc
544cabdff1aSopenharmony_ci
// Internal helper (not exported): the wd=8 filter on 8-byte vectors (sz=.8b,
// mix=0). Reached through `bl` from the loop_filter_8 macro.
//   Pixel rows are passed in v20-v27; after the normal exit the callers
//   store v21-v26 (vertical) or all of v20-v27 (horizontal).
//   v16-v19 and v28-v31 are handed to the macro as temp registers.
//   x13 must hold the caller's alternative return target (its local label 6),
//   x10 the public function's own return address.
545cabdff1aSopenharmony_cifunction vp9_loop_filter_8
546cabdff1aSopenharmony_ci        loop_filter     8,  .8b,  0,    v16, v17, v18, v19, v28, v29, v30, v31
        // Normal exit: back to the `bl` site for the full write-back.
547cabdff1aSopenharmony_ci        ret
548cabdff1aSopenharmony_ci6:
        // Alternative exit: continue at the caller's label 6, which only writes
        // the inner four pixels (v22-v25). Per the callers' comments this is the
        // path for when the flat8in part was not needed.
549cabdff1aSopenharmony_ci        ret             x13
550cabdff1aSopenharmony_ci9:
        // Early exit: return to the public function's caller without any
        // write-back (branch site is in the macro body, not shown here).
551cabdff1aSopenharmony_ci        ret             x10
552cabdff1aSopenharmony_ciendfunc
553cabdff1aSopenharmony_ci
// Internal helper (not exported): the wd=8 filter on 16-byte vectors. It is
// always instantiated with mix=88; the 48/84/88 variants all call this one
// function through the loop_filter_8_16b_mix macro and differ only in the
// constant that macro places in x11 before the call.
//   Pixel rows are passed in v20-v27 (16 bytes each).
//   v16-v19 and v28-v31 are handed to the macro as temp registers.
//   x11: per-variant 64-bit constant (how it is consumed is inside the macro
//        body, outside this excerpt).
//   x13: caller's alternative return target (its local label 6).
//   x10: the public function's own return address.
554cabdff1aSopenharmony_cifunction vp9_loop_filter_8_16b_mix
555cabdff1aSopenharmony_ci        loop_filter     8,  .16b, 88,   v16, v17, v18, v19, v28, v29, v30, v31
        // Normal exit: back to the `bl` site for the full write-back.
556cabdff1aSopenharmony_ci        ret
557cabdff1aSopenharmony_ci6:
        // Alternative exit: continue at the caller's label 6 (inner four
        // pixels only, v22-v25).
558cabdff1aSopenharmony_ci        ret             x13
559cabdff1aSopenharmony_ci9:
        // Early exit: return to the public function's caller, no write-back.
560cabdff1aSopenharmony_ci        ret             x10
561cabdff1aSopenharmony_ciendfunc
562cabdff1aSopenharmony_ci
// Internal helper (not exported): the wd=16 filter on 8-byte vectors (sz=.8b,
// mix=0). Reached through `bl` from the loop_filter_16 macro.
//   v16-v31 carry the input rows here, so v8-v15 are used as temp registers
//   instead. Their low halves (d8-d15) are callee-saved, and
//   ff_vp9_loop_filter_v_16_8_neon pushes them in four 16-byte pairs before
//   calling.
//   x10 must hold the public function's own return address.
563cabdff1aSopenharmony_cifunction vp9_loop_filter_16
564cabdff1aSopenharmony_ci        loop_filter     16, .8b,  0,    v8,  v9,  v10, v11, v12, v13, v14, v15
        // Normal exit: back to the `bl` site; d8-d15 are still on the stack
        // and are left for the caller to restore.
565cabdff1aSopenharmony_ci        ret
566cabdff1aSopenharmony_ci9:
        // Early exit that bypasses the caller completely, so its epilogue has
        // to be replicated here: pop d8-d15 in the reverse of the push order
        // (d8/d9 were pushed last), then return to the public function's caller.
567cabdff1aSopenharmony_ci        ldp             d8,  d9,  [sp], 0x10
568cabdff1aSopenharmony_ci        ldp             d10, d11, [sp], 0x10
569cabdff1aSopenharmony_ci        ldp             d12, d13, [sp], 0x10
570cabdff1aSopenharmony_ci        ldp             d14, d15, [sp], 0x10
571cabdff1aSopenharmony_ci        ret             x10
572cabdff1aSopenharmony_ciendfunc
573cabdff1aSopenharmony_ci
// Internal helper (not exported): the wd=16 filter on 16-byte vectors
// (sz=.16b, mix=0). Reached through `bl` from the loop_filter_16_16b macro.
//   v8-v15 are used as temp registers because v16-v31 hold the input rows.
//   The early exit below pops four d-register pairs, so callers are assumed
//   to have pushed d14/d15, d12/d13, d10/d11, d8/d9 (in that order) before
//   the call; those callers are outside this excerpt, so verify there.
//   x10 must hold the public function's own return address.
574cabdff1aSopenharmony_cifunction vp9_loop_filter_16_16b
575cabdff1aSopenharmony_ci        loop_filter     16, .16b, 0,    v8,  v9,  v10, v11, v12, v13, v14, v15
        // Normal exit: back to the `bl` site; the stack is left untouched.
576cabdff1aSopenharmony_ci        ret
577cabdff1aSopenharmony_ci9:
        // Early exit: restore the callee-saved d8-d15 and return straight to
        // the public function's caller, skipping any write-back.
578cabdff1aSopenharmony_ci        ldp             d8,  d9,  [sp], 0x10
579cabdff1aSopenharmony_ci        ldp             d10, d11, [sp], 0x10
580cabdff1aSopenharmony_ci        ldp             d12, d13, [sp], 0x10
581cabdff1aSopenharmony_ci        ldp             d14, d15, [sp], 0x10
582cabdff1aSopenharmony_ci        ret             x10
583cabdff1aSopenharmony_ciendfunc
584cabdff1aSopenharmony_ci
// Call the 8-byte wd=4 filter helper. `bl` overwrites x30, so the enclosing
// function must have saved its return address first (the entry points in this
// file keep it in x10, which the helper's early exit also relies on).
585cabdff1aSopenharmony_ci.macro loop_filter_4
586cabdff1aSopenharmony_ci        bl              vp9_loop_filter_4
587cabdff1aSopenharmony_ci.endm
588cabdff1aSopenharmony_ci
// Call the 16-byte wd=4 filter helper whose name ends in the given mix value.
// Only vp9_loop_filter_4_16b_mix_44 is defined in this part of the file, and
// both users of this macro pass 44. Clobbers x30 via `bl`; x10 must already
// hold the enclosing function's return address.
589cabdff1aSopenharmony_ci.macro loop_filter_4_16b_mix mix
590cabdff1aSopenharmony_ci        bl              vp9_loop_filter_4_16b_mix_\mix
591cabdff1aSopenharmony_ci.endm
592cabdff1aSopenharmony_ci
// Call the 8-byte wd=8 filter helper.
// x13 receives the address of the next local label 6 following the expansion
// site (i.e. in the function using this macro); the helper returns there via
// `ret x13` instead of to the `bl` site when it takes its alternative exit.
// Clobbers x13 and x30; x10 must already hold the function's return address.
593cabdff1aSopenharmony_ci.macro loop_filter_8
594cabdff1aSopenharmony_ci        // calculate alternative 'return' targets
595cabdff1aSopenharmony_ci        adr             x13, 6f
596cabdff1aSopenharmony_ci        bl              vp9_loop_filter_8
597cabdff1aSopenharmony_ci.endm
598cabdff1aSopenharmony_ci
// Call the 16-byte wd=8 filter helper for one of the mixed variants.
//   mix: 48, 84, or any other value (88 is the third value used in this file).
//   x13 <- address of the next local label 6 after the expansion site; the
//          helper's alternative exit returns there.
//   x11 <- 64-bit constant chosen by mix: upper 32 bits set for 48, lower
//          32 bits set for 84, all 64 bits set otherwise. It is consumed by the
//          filter body, which is outside this excerpt. Presumably it masks
//          which 8-pixel half receives the wd=8 filtering; confirm against the
//          loop_filter macro.
// Clobbers x11, x13 and x30; x10 must already hold the function's return address.
599cabdff1aSopenharmony_ci.macro loop_filter_8_16b_mix mix
600cabdff1aSopenharmony_ci        // calculate alternative 'return' targets
601cabdff1aSopenharmony_ci        adr             x13, 6f
602cabdff1aSopenharmony_ci.if \mix == 48
603cabdff1aSopenharmony_ci        mov             x11, #0xffffffff00000000
604cabdff1aSopenharmony_ci.elseif \mix == 84
605cabdff1aSopenharmony_ci        mov             x11, #0x00000000ffffffff
606cabdff1aSopenharmony_ci.else
607cabdff1aSopenharmony_ci        mov             x11, #0xffffffffffffffff
608cabdff1aSopenharmony_ci.endif
609cabdff1aSopenharmony_ci        bl              vp9_loop_filter_8_16b_mix
610cabdff1aSopenharmony_ci.endm
611cabdff1aSopenharmony_ci
// Call the 8-byte wd=16 filter helper.
// x14 and x15 receive the addresses of the next local labels 7 and 8 after
// the expansion site. Presumably these are reduced write-back paths that the
// filter body returns to, analogous to label 6 for wd=8; both the `ret` sites
// and the labels are outside this excerpt, so confirm there.
// Clobbers x14, x15 and x30; x10 must already hold the function's return
// address, and d8-d15 must have been pushed (the helper's early exit pops them).
612cabdff1aSopenharmony_ci.macro loop_filter_16
613cabdff1aSopenharmony_ci        // calculate alternative 'return' targets
614cabdff1aSopenharmony_ci        adr             x14, 7f
615cabdff1aSopenharmony_ci        adr             x15, 8f
616cabdff1aSopenharmony_ci        bl              vp9_loop_filter_16
617cabdff1aSopenharmony_ci.endm
618cabdff1aSopenharmony_ci
// Call the 16-byte wd=16 filter helper.
// x14 and x15 receive the addresses of the next local labels 7 and 8 after
// the expansion site (alternative return targets; their users are outside
// this excerpt).
// Clobbers x14, x15 and x30; x10 must already hold the function's return
// address, and d8-d15 must have been pushed (the helper's early exit pops them).
619cabdff1aSopenharmony_ci.macro loop_filter_16_16b
620cabdff1aSopenharmony_ci        // calculate alternative 'return' targets
621cabdff1aSopenharmony_ci        adr             x14, 7f
622cabdff1aSopenharmony_ci        adr             x15, 8f
623cabdff1aSopenharmony_ci        bl              vp9_loop_filter_16_16b
624cabdff1aSopenharmony_ci.endm
625cabdff1aSopenharmony_ci
626cabdff1aSopenharmony_ci
627cabdff1aSopenharmony_ci// The public functions in this file have got the following signature:
628cabdff1aSopenharmony_ci// void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);
629cabdff1aSopenharmony_ci
// void ff_vp9_loop_filter_v_4_8_neon(uint8_t *dst, ptrdiff_t stride,
//                                    int mb_lim, int lim, int hev_thr)
// wd=4 filter over 8 pixel columns, with the taps running vertically: the
// four rows above dst are p3..p0 and the four rows starting at dst are q0..q3.
//   x0 = dst, x1 = stride. The three thresholds (w2-w4 under AAPCS64) are not
//   touched here; presumably they are read inside the filter helper.
//   Scratch: x9 (row pointer for the p side), x10 (saved return address).
//   Only p1, p0, q0, q1 are written back.
630cabdff1aSopenharmony_cifunction ff_vp9_loop_filter_v_4_8_neon, export=1
        // `bl` in loop_filter_4 overwrites x30; the helper's early exit also
        // returns through x10.
631cabdff1aSopenharmony_ci        mov             x10, x30
        // x9 = dst - 4 * stride (row of p3). Loads alternate between the two
        // pointers.
632cabdff1aSopenharmony_ci        sub             x9,  x0,  x1, lsl #2
633cabdff1aSopenharmony_ci        ld1             {v20.8b}, [x9], x1 // p3
634cabdff1aSopenharmony_ci        ld1             {v24.8b}, [x0], x1 // q0
635cabdff1aSopenharmony_ci        ld1             {v21.8b}, [x9], x1 // p2
636cabdff1aSopenharmony_ci        ld1             {v25.8b}, [x0], x1 // q1
637cabdff1aSopenharmony_ci        ld1             {v22.8b}, [x9], x1 // p1
638cabdff1aSopenharmony_ci        ld1             {v26.8b}, [x0], x1 // q2
639cabdff1aSopenharmony_ci        ld1             {v23.8b}, [x9], x1 // p0
640cabdff1aSopenharmony_ci        ld1             {v27.8b}, [x0], x1 // q3
        // Rewind for the stores: x0 back to the q0 row (dst). x9 has advanced
        // to dst, so stepping back two rows puts it on the p1 row.
641cabdff1aSopenharmony_ci        sub             x0,  x0,  x1, lsl #2
642cabdff1aSopenharmony_ci        sub             x9,  x9,  x1, lsl #1
643cabdff1aSopenharmony_ci
644cabdff1aSopenharmony_ci        loop_filter_4
645cabdff1aSopenharmony_ci
        // Write back p1, p0 through x9 and q0, q1 through x0.
646cabdff1aSopenharmony_ci        st1             {v22.8b}, [x9], x1
647cabdff1aSopenharmony_ci        st1             {v24.8b}, [x0], x1
648cabdff1aSopenharmony_ci        st1             {v23.8b}, [x9], x1
649cabdff1aSopenharmony_ci        st1             {v25.8b}, [x0], x1
650cabdff1aSopenharmony_ci
651cabdff1aSopenharmony_ci        ret             x10
652cabdff1aSopenharmony_ciendfunc
653cabdff1aSopenharmony_ci
// void ff_vp9_loop_filter_v_44_16_neon(uint8_t *dst, ptrdiff_t stride,
//                                      int mb_lim, int lim, int hev_thr)
// 16-pixel-wide counterpart of the vertical wd=4 filter: the same row layout
// (p3..p0 above dst, q0..q3 from dst on), but 16 bytes per row and using the
// mix=44 helper.
//   x0 = dst, x1 = stride. The thresholds in w2-w4 are not touched here;
//   presumably the helper reads them.
//   Scratch: x9 (row pointer for the p side), x10 (saved return address).
654cabdff1aSopenharmony_cifunction ff_vp9_loop_filter_v_44_16_neon, export=1
        // Keep the return address: `bl` clobbers x30 and the helper's early
        // exit returns through x10.
655cabdff1aSopenharmony_ci        mov             x10, x30
        // x9 = dst - 4 * stride (row of p3).
656cabdff1aSopenharmony_ci        sub             x9,  x0,  x1, lsl #2
657cabdff1aSopenharmony_ci        ld1             {v20.16b}, [x9], x1 // p3
658cabdff1aSopenharmony_ci        ld1             {v24.16b}, [x0], x1 // q0
659cabdff1aSopenharmony_ci        ld1             {v21.16b}, [x9], x1 // p2
660cabdff1aSopenharmony_ci        ld1             {v25.16b}, [x0], x1 // q1
661cabdff1aSopenharmony_ci        ld1             {v22.16b}, [x9], x1 // p1
662cabdff1aSopenharmony_ci        ld1             {v26.16b}, [x0], x1 // q2
663cabdff1aSopenharmony_ci        ld1             {v23.16b}, [x9], x1 // p0
664cabdff1aSopenharmony_ci        ld1             {v27.16b}, [x0], x1 // q3
        // x0 back to the q0 row (dst), x9 back to the p1 row (dst - 2 * stride).
665cabdff1aSopenharmony_ci        sub             x0,  x0,  x1, lsl #2
666cabdff1aSopenharmony_ci        sub             x9,  x9,  x1, lsl #1
667cabdff1aSopenharmony_ci
668cabdff1aSopenharmony_ci        loop_filter_4_16b_mix 44
669cabdff1aSopenharmony_ci
        // Write back p1, p0 through x9 and q0, q1 through x0.
670cabdff1aSopenharmony_ci        st1             {v22.16b}, [x9], x1
671cabdff1aSopenharmony_ci        st1             {v24.16b}, [x0], x1
672cabdff1aSopenharmony_ci        st1             {v23.16b}, [x9], x1
673cabdff1aSopenharmony_ci        st1             {v25.16b}, [x0], x1
674cabdff1aSopenharmony_ci
675cabdff1aSopenharmony_ci        ret             x10
676cabdff1aSopenharmony_ciendfunc
677cabdff1aSopenharmony_ci
// void ff_vp9_loop_filter_h_4_8_neon(uint8_t *dst, ptrdiff_t stride,
//                                    int mb_lim, int lim, int hev_thr)
// wd=4 filter over 8 pixel rows, with the taps running horizontally: each row
// contributes the 8 bytes dst-4 .. dst+3. The 8x8 block is transposed so the
// shared filter code can treat it like the vertical case, then the middle
// four pixels of every row are transposed back and stored.
//   x0 = dst, x1 = stride. The thresholds in w2-w4 are not touched here;
//   presumably the helper reads them.
//   Scratch: x9 (rows 0-3), x0 is reused as the pointer for rows 4-7,
//   x10 (saved return address).
678cabdff1aSopenharmony_cifunction ff_vp9_loop_filter_h_4_8_neon, export=1
        // `bl` clobbers x30; the helper's early exit returns through x10.
679cabdff1aSopenharmony_ci        mov             x10, x30
        // x9 -> row 0, four pixels left of dst; x0 -> row 4 at the same column.
680cabdff1aSopenharmony_ci        sub             x9,  x0,  #4
681cabdff1aSopenharmony_ci        add             x0,  x9,  x1, lsl #2
        // v20-v23 = rows 0-3, v24-v27 = rows 4-7 (8 bytes each).
682cabdff1aSopenharmony_ci        ld1             {v20.8b}, [x9], x1
683cabdff1aSopenharmony_ci        ld1             {v24.8b}, [x0], x1
684cabdff1aSopenharmony_ci        ld1             {v21.8b}, [x9], x1
685cabdff1aSopenharmony_ci        ld1             {v25.8b}, [x0], x1
686cabdff1aSopenharmony_ci        ld1             {v22.8b}, [x9], x1
687cabdff1aSopenharmony_ci        ld1             {v26.8b}, [x0], x1
688cabdff1aSopenharmony_ci        ld1             {v23.8b}, [x9], x1
689cabdff1aSopenharmony_ci        ld1             {v27.8b}, [x0], x1
690cabdff1aSopenharmony_ci
        // Rewind both pointers by the four rows just read.
691cabdff1aSopenharmony_ci        sub             x9,  x9,  x1, lsl #2
692cabdff1aSopenharmony_ci        sub             x0,  x0,  x1, lsl #2
693cabdff1aSopenharmony_ci        // Move x0/x9 forward by 2 pixels; we don't need to rewrite the
694cabdff1aSopenharmony_ci        // outermost 2 pixels since they aren't changed.
695cabdff1aSopenharmony_ci        add             x9,  x9,  #2
696cabdff1aSopenharmony_ci        add             x0,  x0,  #2
697cabdff1aSopenharmony_ci
        // Rows -> columns, so v20..v27 hold p3..q3 for the filter. v28/v29
        // are the macro's two extra operands (presumably temporaries; the
        // macro is defined outside this excerpt).
698cabdff1aSopenharmony_ci        transpose_8x8B  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
699cabdff1aSopenharmony_ci
700cabdff1aSopenharmony_ci        loop_filter_4
701cabdff1aSopenharmony_ci
702cabdff1aSopenharmony_ci        // We only will write the mid 4 pixels back; after the loop filter,
703cabdff1aSopenharmony_ci        // these are in v22, v23, v24, v25, ordered as rows (8x4 pixels).
704cabdff1aSopenharmony_ci        // We need to transpose them to columns, done with a 4x8 transpose
705cabdff1aSopenharmony_ci        // (which in practice is two 4x4 transposes of the two 4x4 halves
706cabdff1aSopenharmony_ci        // of the 8x4 pixels; into 4x8 pixels).
707cabdff1aSopenharmony_ci        transpose_4x8B  v22, v23, v24, v25, v26, v27, v28, v29
        // Lane s[0] of each register goes to rows 0-3 (x9), lane s[1] to
        // rows 4-7 (x0); 4 bytes per row.
708cabdff1aSopenharmony_ci        st1             {v22.s}[0], [x9], x1
709cabdff1aSopenharmony_ci        st1             {v22.s}[1], [x0], x1
710cabdff1aSopenharmony_ci        st1             {v23.s}[0], [x9], x1
711cabdff1aSopenharmony_ci        st1             {v23.s}[1], [x0], x1
712cabdff1aSopenharmony_ci        st1             {v24.s}[0], [x9], x1
713cabdff1aSopenharmony_ci        st1             {v24.s}[1], [x0], x1
714cabdff1aSopenharmony_ci        st1             {v25.s}[0], [x9], x1
715cabdff1aSopenharmony_ci        st1             {v25.s}[1], [x0], x1
716cabdff1aSopenharmony_ci
717cabdff1aSopenharmony_ci        ret             x10
718cabdff1aSopenharmony_ciendfunc
719cabdff1aSopenharmony_ci
// void ff_vp9_loop_filter_h_44_16_neon(uint8_t *dst, ptrdiff_t stride,
//                                      int mb_lim, int lim, int hev_thr)
// 16-row counterpart of the horizontal wd=4 filter. Each row contributes the
// 8 bytes dst-4 .. dst+3; rows 0-7 fill the low halves of v20-v27 and rows
// 8-15 the high halves. After filtering with the mix=44 helper, the middle
// four pixels of each of the 16 rows are stored.
//   x0 = dst, x1 = stride. The thresholds in w2-w4 are not touched here;
//   presumably the helper reads them.
//   Scratch: x9 (rows 0-7), x0 is reused as the pointer for rows 8-15,
//   x10 (saved return address).
720cabdff1aSopenharmony_cifunction ff_vp9_loop_filter_h_44_16_neon, export=1
        // `bl` clobbers x30; the helper's early exit returns through x10.
721cabdff1aSopenharmony_ci        mov             x10, x30
        // x9 -> row 0, four pixels left of dst; x0 -> row 8 at the same column.
722cabdff1aSopenharmony_ci        sub             x9,  x0,  #4
723cabdff1aSopenharmony_ci        add             x0,  x9,  x1, lsl #3
        // Register v(20+i) gets row i in its low 8 bytes and row i+8 in d[1].
724cabdff1aSopenharmony_ci        ld1             {v20.8b},   [x9], x1
725cabdff1aSopenharmony_ci        ld1             {v20.d}[1], [x0], x1
726cabdff1aSopenharmony_ci        ld1             {v21.8b},   [x9], x1
727cabdff1aSopenharmony_ci        ld1             {v21.d}[1], [x0], x1
728cabdff1aSopenharmony_ci        ld1             {v22.8b},   [x9], x1
729cabdff1aSopenharmony_ci        ld1             {v22.d}[1], [x0], x1
730cabdff1aSopenharmony_ci        ld1             {v23.8b},   [x9], x1
731cabdff1aSopenharmony_ci        ld1             {v23.d}[1], [x0], x1
732cabdff1aSopenharmony_ci        ld1             {v24.8b},   [x9], x1
733cabdff1aSopenharmony_ci        ld1             {v24.d}[1], [x0], x1
734cabdff1aSopenharmony_ci        ld1             {v25.8b},   [x9], x1
735cabdff1aSopenharmony_ci        ld1             {v25.d}[1], [x0], x1
736cabdff1aSopenharmony_ci        ld1             {v26.8b},   [x9], x1
737cabdff1aSopenharmony_ci        ld1             {v26.d}[1], [x0], x1
738cabdff1aSopenharmony_ci        ld1             {v27.8b},   [x9], x1
739cabdff1aSopenharmony_ci        ld1             {v27.d}[1], [x0], x1
740cabdff1aSopenharmony_ci
        // Rewind both pointers by the eight rows just read, then skip the two
        // leftmost pixels: only the middle four pixels per row are rewritten.
741cabdff1aSopenharmony_ci        sub             x9,  x9,  x1, lsl #3
742cabdff1aSopenharmony_ci        sub             x0,  x0,  x1, lsl #3
743cabdff1aSopenharmony_ci        add             x9,  x9,  #2
744cabdff1aSopenharmony_ci        add             x0,  x0,  #2
745cabdff1aSopenharmony_ci
        // Rows -> columns so v20..v27 hold p3..q3; v28/v29 are the macro's two
        // extra operands (presumably temporaries; defined outside this excerpt).
746cabdff1aSopenharmony_ci        transpose_8x16B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
747cabdff1aSopenharmony_ci
748cabdff1aSopenharmony_ci        loop_filter_4_16b_mix 44
749cabdff1aSopenharmony_ci
        // Transpose the four filtered columns (v22-v25) back into row order.
750cabdff1aSopenharmony_ci        transpose_4x16B v22, v23, v24, v25, v26, v27, v28, v29
751cabdff1aSopenharmony_ci
        // x9 walks rows 0-7 and x0 rows 8-15. Lanes s[0]/s[2] supply rows
        // 0-3 / 8-11, lanes s[1]/s[3] supply rows 4-7 / 12-15.
752cabdff1aSopenharmony_ci        st1             {v22.s}[0], [x9], x1
753cabdff1aSopenharmony_ci        st1             {v22.s}[2], [x0], x1
754cabdff1aSopenharmony_ci        st1             {v23.s}[0], [x9], x1
755cabdff1aSopenharmony_ci        st1             {v23.s}[2], [x0], x1
756cabdff1aSopenharmony_ci        st1             {v24.s}[0], [x9], x1
757cabdff1aSopenharmony_ci        st1             {v24.s}[2], [x0], x1
758cabdff1aSopenharmony_ci        st1             {v25.s}[0], [x9], x1
759cabdff1aSopenharmony_ci        st1             {v25.s}[2], [x0], x1
760cabdff1aSopenharmony_ci        st1             {v22.s}[1], [x9], x1
761cabdff1aSopenharmony_ci        st1             {v22.s}[3], [x0], x1
762cabdff1aSopenharmony_ci        st1             {v23.s}[1], [x9], x1
763cabdff1aSopenharmony_ci        st1             {v23.s}[3], [x0], x1
764cabdff1aSopenharmony_ci        st1             {v24.s}[1], [x9], x1
765cabdff1aSopenharmony_ci        st1             {v24.s}[3], [x0], x1
766cabdff1aSopenharmony_ci        st1             {v25.s}[1], [x9], x1
767cabdff1aSopenharmony_ci        st1             {v25.s}[3], [x0], x1
768cabdff1aSopenharmony_ci
769cabdff1aSopenharmony_ci        ret             x10
770cabdff1aSopenharmony_ciendfunc
771cabdff1aSopenharmony_ci
// void ff_vp9_loop_filter_v_8_8_neon(uint8_t *dst, ptrdiff_t stride,
//                                    int mb_lim, int lim, int hev_thr)
// wd=8 filter over 8 pixel columns, taps running vertically (p3..p0 above
// dst, q0..q3 from dst on). The normal path writes six rows (p2..q2); the
// path at label 6 writes only the inner four (p1..q1).
//   x0 = dst, x1 = stride. The thresholds in w2-w4 are not touched here;
//   presumably the helper reads them.
//   Scratch: x9 (row pointer for the p side), x10 (saved return address),
//   x13 (set by loop_filter_8 to the address of label 6 below).
772cabdff1aSopenharmony_cifunction ff_vp9_loop_filter_v_8_8_neon, export=1
        // `bl` clobbers x30; the helper's early exit returns through x10.
773cabdff1aSopenharmony_ci        mov             x10, x30
        // x9 = dst - 4 * stride (row of p3).
774cabdff1aSopenharmony_ci        sub             x9,  x0,  x1, lsl #2
775cabdff1aSopenharmony_ci        ld1             {v20.8b}, [x9], x1 // p3
776cabdff1aSopenharmony_ci        ld1             {v24.8b}, [x0], x1 // q0
777cabdff1aSopenharmony_ci        ld1             {v21.8b}, [x9], x1 // p2
778cabdff1aSopenharmony_ci        ld1             {v25.8b}, [x0], x1 // q1
779cabdff1aSopenharmony_ci        ld1             {v22.8b}, [x9], x1 // p1
780cabdff1aSopenharmony_ci        ld1             {v26.8b}, [x0], x1 // q2
781cabdff1aSopenharmony_ci        ld1             {v23.8b}, [x9], x1 // p0
782cabdff1aSopenharmony_ci        ld1             {v27.8b}, [x0], x1 // q3
        // Rewind both pointers by four rows, then move x9 down one row:
        // x9 -> p2 row (dst - 3 * stride), x0 -> q0 row (dst).
783cabdff1aSopenharmony_ci        sub             x9,  x9,  x1, lsl #2
784cabdff1aSopenharmony_ci        sub             x0,  x0,  x1, lsl #2
785cabdff1aSopenharmony_ci        add             x9,  x9,  x1
786cabdff1aSopenharmony_ci
787cabdff1aSopenharmony_ci        loop_filter_8
788cabdff1aSopenharmony_ci
        // Full write-back: p2, p1, p0 through x9 and q0, q1, q2 through x0.
789cabdff1aSopenharmony_ci        st1             {v21.8b}, [x9], x1
790cabdff1aSopenharmony_ci        st1             {v24.8b}, [x0], x1
791cabdff1aSopenharmony_ci        st1             {v22.8b}, [x9], x1
792cabdff1aSopenharmony_ci        st1             {v25.8b}, [x0], x1
793cabdff1aSopenharmony_ci        st1             {v23.8b}, [x9], x1
794cabdff1aSopenharmony_ci        st1             {v26.8b}, [x0], x1
795cabdff1aSopenharmony_ci
796cabdff1aSopenharmony_ci        ret             x10
797cabdff1aSopenharmony_ci6:
        // Reached through `ret x13` from vp9_loop_filter_8. Only the inner
        // four rows are stored: x9 is recomputed as the p1 row
        // (dst - 2 * stride), x0 still points at the q0 row.
798cabdff1aSopenharmony_ci        sub             x9,  x0,  x1, lsl #1
799cabdff1aSopenharmony_ci        st1             {v22.8b}, [x9], x1
800cabdff1aSopenharmony_ci        st1             {v24.8b}, [x0], x1
801cabdff1aSopenharmony_ci        st1             {v23.8b}, [x9], x1
802cabdff1aSopenharmony_ci        st1             {v25.8b}, [x0], x1
803cabdff1aSopenharmony_ci        ret             x10
804cabdff1aSopenharmony_ciendfunc
805cabdff1aSopenharmony_ci
// Template for the exported 16-pixel-wide vertical wd=8 "mix" filters. Each
// expansion defines one function with the public signature
//   void ff_vp9_loop_filter_v_<mix>_16_neon(uint8_t *dst, ptrdiff_t stride,
//                                           int mb_lim, int lim, int hev_thr)
// The body mirrors the 8-pixel vertical wd=8 function with 16-byte rows; the
// mix value only changes the constant that loop_filter_8_16b_mix loads into
// x11 before calling the shared helper.
//   x0 = dst, x1 = stride; scratch x9, x10, x11, x13.
806cabdff1aSopenharmony_ci.macro mix_v_16 mix
807cabdff1aSopenharmony_cifunction ff_vp9_loop_filter_v_\mix\()_16_neon, export=1
        // `bl` clobbers x30; the helper's early exit returns through x10.
808cabdff1aSopenharmony_ci        mov             x10, x30
        // x9 = dst - 4 * stride (row of p3).
809cabdff1aSopenharmony_ci        sub             x9,  x0,  x1, lsl #2
810cabdff1aSopenharmony_ci        ld1             {v20.16b}, [x9], x1 // p3
811cabdff1aSopenharmony_ci        ld1             {v24.16b}, [x0], x1 // q0
812cabdff1aSopenharmony_ci        ld1             {v21.16b}, [x9], x1 // p2
813cabdff1aSopenharmony_ci        ld1             {v25.16b}, [x0], x1 // q1
814cabdff1aSopenharmony_ci        ld1             {v22.16b}, [x9], x1 // p1
815cabdff1aSopenharmony_ci        ld1             {v26.16b}, [x0], x1 // q2
816cabdff1aSopenharmony_ci        ld1             {v23.16b}, [x9], x1 // p0
817cabdff1aSopenharmony_ci        ld1             {v27.16b}, [x0], x1 // q3
        // x9 -> p2 row (dst - 3 * stride), x0 -> q0 row (dst).
818cabdff1aSopenharmony_ci        sub             x9,  x9,  x1, lsl #2
819cabdff1aSopenharmony_ci        sub             x0,  x0,  x1, lsl #2
820cabdff1aSopenharmony_ci        add             x9,  x9,  x1
821cabdff1aSopenharmony_ci
822cabdff1aSopenharmony_ci        loop_filter_8_16b_mix \mix
823cabdff1aSopenharmony_ci
        // Full write-back: p2, p1, p0 through x9 and q0, q1, q2 through x0.
824cabdff1aSopenharmony_ci        st1             {v21.16b}, [x9], x1
825cabdff1aSopenharmony_ci        st1             {v24.16b}, [x0], x1
826cabdff1aSopenharmony_ci        st1             {v22.16b}, [x9], x1
827cabdff1aSopenharmony_ci        st1             {v25.16b}, [x0], x1
828cabdff1aSopenharmony_ci        st1             {v23.16b}, [x9], x1
829cabdff1aSopenharmony_ci        st1             {v26.16b}, [x0], x1
830cabdff1aSopenharmony_ci
831cabdff1aSopenharmony_ci        ret             x10
832cabdff1aSopenharmony_ci6:
        // Reached through `ret x13` from the shared helper: store only the
        // inner four rows, with x9 recomputed as the p1 row (dst - 2 * stride).
833cabdff1aSopenharmony_ci        sub             x9,  x0,  x1, lsl #1
834cabdff1aSopenharmony_ci        st1             {v22.16b}, [x9], x1
835cabdff1aSopenharmony_ci        st1             {v24.16b}, [x0], x1
836cabdff1aSopenharmony_ci        st1             {v23.16b}, [x9], x1
837cabdff1aSopenharmony_ci        st1             {v25.16b}, [x0], x1
838cabdff1aSopenharmony_ci        ret             x10
839cabdff1aSopenharmony_ciendfunc
840cabdff1aSopenharmony_ci.endm
841cabdff1aSopenharmony_ci
// Instantiate the three vertical mix variants.
842cabdff1aSopenharmony_cimix_v_16 48
843cabdff1aSopenharmony_cimix_v_16 84
844cabdff1aSopenharmony_cimix_v_16 88
845cabdff1aSopenharmony_ci
// void ff_vp9_loop_filter_h_8_8_neon(uint8_t *dst, ptrdiff_t stride,
//                                    int mb_lim, int lim, int hev_thr)
// wd=8 filter over 8 pixel rows, taps running horizontally: each row
// contributes the 8 bytes dst-4 .. dst+3. The block is transposed, filtered,
// and transposed back. The normal path rewrites all 8 bytes of every row;
// the path at label 6 rewrites only the middle four.
//   x0 = dst, x1 = stride. The thresholds in w2-w4 are not touched here;
//   presumably the helper reads them.
//   Scratch: x9 (rows 0-3), x0 is reused as the pointer for rows 4-7,
//   x10 (saved return address), x13 (set by loop_filter_8 to label 6 below).
846cabdff1aSopenharmony_cifunction ff_vp9_loop_filter_h_8_8_neon, export=1
        // `bl` clobbers x30; the helper's early exit returns through x10.
847cabdff1aSopenharmony_ci        mov             x10, x30
        // x9 -> row 0, four pixels left of dst; x0 -> row 4 at the same column.
848cabdff1aSopenharmony_ci        sub             x9,  x0,  #4
849cabdff1aSopenharmony_ci        add             x0,  x9,  x1, lsl #2
        // v20-v23 = rows 0-3, v24-v27 = rows 4-7 (8 bytes each).
850cabdff1aSopenharmony_ci        ld1             {v20.8b}, [x9], x1
851cabdff1aSopenharmony_ci        ld1             {v24.8b}, [x0], x1
852cabdff1aSopenharmony_ci        ld1             {v21.8b}, [x9], x1
853cabdff1aSopenharmony_ci        ld1             {v25.8b}, [x0], x1
854cabdff1aSopenharmony_ci        ld1             {v22.8b}, [x9], x1
855cabdff1aSopenharmony_ci        ld1             {v26.8b}, [x0], x1
856cabdff1aSopenharmony_ci        ld1             {v23.8b}, [x9], x1
857cabdff1aSopenharmony_ci        ld1             {v27.8b}, [x0], x1
858cabdff1aSopenharmony_ci
        // Rewind both pointers to the first row of their half. No +2 column
        // offset here, because the normal path stores whole 8-byte rows.
859cabdff1aSopenharmony_ci        sub             x9,  x9,  x1, lsl #2
860cabdff1aSopenharmony_ci        sub             x0,  x0,  x1, lsl #2
861cabdff1aSopenharmony_ci
        // Rows -> columns so v20..v27 hold p3..q3; v28/v29 are the macro's two
        // extra operands (presumably temporaries; defined outside this excerpt).
862cabdff1aSopenharmony_ci        transpose_8x8B  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
863cabdff1aSopenharmony_ci
864cabdff1aSopenharmony_ci        loop_filter_8
865cabdff1aSopenharmony_ci
866cabdff1aSopenharmony_ci        // Even though only 6 pixels per row have been changed, we write the
867cabdff1aSopenharmony_ci        // full 8 pixel registers.
868cabdff1aSopenharmony_ci        transpose_8x8B  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
869cabdff1aSopenharmony_ci
        // Back in row order: v20-v23 -> rows 0-3 (x9), v24-v27 -> rows 4-7 (x0).
870cabdff1aSopenharmony_ci        st1             {v20.8b}, [x9], x1
871cabdff1aSopenharmony_ci        st1             {v24.8b}, [x0], x1
872cabdff1aSopenharmony_ci        st1             {v21.8b}, [x9], x1
873cabdff1aSopenharmony_ci        st1             {v25.8b}, [x0], x1
874cabdff1aSopenharmony_ci        st1             {v22.8b}, [x9], x1
875cabdff1aSopenharmony_ci        st1             {v26.8b}, [x0], x1
876cabdff1aSopenharmony_ci        st1             {v23.8b}, [x9], x1
877cabdff1aSopenharmony_ci        st1             {v27.8b}, [x0], x1
878cabdff1aSopenharmony_ci
879cabdff1aSopenharmony_ci        ret             x10
880cabdff1aSopenharmony_ci6:
881cabdff1aSopenharmony_ci        // If we didn't need to do the flat8in part, we use the same writeback
882cabdff1aSopenharmony_ci        // as in loop_filter_h_4_8.
        // Reached through `ret x13` from vp9_loop_filter_8. Skip the two
        // leftmost pixels, then store 4 bytes per row: lane s[0] for rows 0-3
        // (x9), lane s[1] for rows 4-7 (x0).
883cabdff1aSopenharmony_ci        add             x9,  x9,  #2
884cabdff1aSopenharmony_ci        add             x0,  x0,  #2
885cabdff1aSopenharmony_ci        transpose_4x8B  v22, v23, v24, v25, v26, v27, v28, v29
886cabdff1aSopenharmony_ci        st1             {v22.s}[0], [x9], x1
887cabdff1aSopenharmony_ci        st1             {v22.s}[1], [x0], x1
888cabdff1aSopenharmony_ci        st1             {v23.s}[0], [x9], x1
889cabdff1aSopenharmony_ci        st1             {v23.s}[1], [x0], x1
890cabdff1aSopenharmony_ci        st1             {v24.s}[0], [x9], x1
891cabdff1aSopenharmony_ci        st1             {v24.s}[1], [x0], x1
892cabdff1aSopenharmony_ci        st1             {v25.s}[0], [x9], x1
893cabdff1aSopenharmony_ci        st1             {v25.s}[1], [x0], x1
894cabdff1aSopenharmony_ci        ret             x10
895cabdff1aSopenharmony_ciendfunc
896cabdff1aSopenharmony_ci
// Template for the exported 16-row horizontal wd=8 "mix" filters. Each
// expansion defines one function with the public signature
//   void ff_vp9_loop_filter_h_<mix>_16_neon(uint8_t *dst, ptrdiff_t stride,
//                                           int mb_lim, int lim, int hev_thr)
// Each of the 16 rows contributes the 8 bytes dst-4 .. dst+3. Rows 0-7 fill
// the low halves of v20-v27 and rows 8-15 the high halves. The mix value only
// changes the constant that loop_filter_8_16b_mix loads into x11.
//   x0 = dst, x1 = stride; scratch x9 (rows 0-7), x0 (reused for rows 8-15),
//   x10, x11, x13.
897cabdff1aSopenharmony_ci.macro mix_h_16 mix
898cabdff1aSopenharmony_cifunction ff_vp9_loop_filter_h_\mix\()_16_neon, export=1
        // `bl` clobbers x30; the helper's early exit returns through x10.
899cabdff1aSopenharmony_ci        mov             x10, x30
        // x9 -> row 0, four pixels left of dst; x0 -> row 8 at the same column.
900cabdff1aSopenharmony_ci        sub             x9,  x0,  #4
901cabdff1aSopenharmony_ci        add             x0,  x9,  x1, lsl #3
        // Register v(20+i) gets row i in its low 8 bytes and row i+8 in d[1].
902cabdff1aSopenharmony_ci        ld1             {v20.8b},   [x9], x1
903cabdff1aSopenharmony_ci        ld1             {v20.d}[1], [x0], x1
904cabdff1aSopenharmony_ci        ld1             {v21.8b},   [x9], x1
905cabdff1aSopenharmony_ci        ld1             {v21.d}[1], [x0], x1
906cabdff1aSopenharmony_ci        ld1             {v22.8b},   [x9], x1
907cabdff1aSopenharmony_ci        ld1             {v22.d}[1], [x0], x1
908cabdff1aSopenharmony_ci        ld1             {v23.8b},   [x9], x1
909cabdff1aSopenharmony_ci        ld1             {v23.d}[1], [x0], x1
910cabdff1aSopenharmony_ci        ld1             {v24.8b},   [x9], x1
911cabdff1aSopenharmony_ci        ld1             {v24.d}[1], [x0], x1
912cabdff1aSopenharmony_ci        ld1             {v25.8b},   [x9], x1
913cabdff1aSopenharmony_ci        ld1             {v25.d}[1], [x0], x1
914cabdff1aSopenharmony_ci        ld1             {v26.8b},   [x9], x1
915cabdff1aSopenharmony_ci        ld1             {v26.d}[1], [x0], x1
916cabdff1aSopenharmony_ci        ld1             {v27.8b},   [x9], x1
917cabdff1aSopenharmony_ci        ld1             {v27.d}[1], [x0], x1
918cabdff1aSopenharmony_ci
        // Rewind both pointers by the eight rows just read. No +2 column
        // offset here, because the normal path stores whole 8-byte rows.
919cabdff1aSopenharmony_ci        sub             x9,  x9,  x1, lsl #3
920cabdff1aSopenharmony_ci        sub             x0,  x0,  x1, lsl #3
921cabdff1aSopenharmony_ci
        // Rows -> columns so v20..v27 hold p3..q3 for all 16 rows.
922cabdff1aSopenharmony_ci        transpose_8x16B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
923cabdff1aSopenharmony_ci
924cabdff1aSopenharmony_ci        loop_filter_8_16b_mix \mix
925cabdff1aSopenharmony_ci
        // Back to row order, then store whole rows: the low half of each
        // register to rows 0-7 (x9), the high half (d[1]) to rows 8-15 (x0).
926cabdff1aSopenharmony_ci        transpose_8x16B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
927cabdff1aSopenharmony_ci
928cabdff1aSopenharmony_ci        st1             {v20.8b},   [x9], x1
929cabdff1aSopenharmony_ci        st1             {v20.d}[1], [x0], x1
930cabdff1aSopenharmony_ci        st1             {v21.8b},   [x9], x1
931cabdff1aSopenharmony_ci        st1             {v21.d}[1], [x0], x1
932cabdff1aSopenharmony_ci        st1             {v22.8b},   [x9], x1
933cabdff1aSopenharmony_ci        st1             {v22.d}[1], [x0], x1
934cabdff1aSopenharmony_ci        st1             {v23.8b},   [x9], x1
935cabdff1aSopenharmony_ci        st1             {v23.d}[1], [x0], x1
936cabdff1aSopenharmony_ci        st1             {v24.8b},   [x9], x1
937cabdff1aSopenharmony_ci        st1             {v24.d}[1], [x0], x1
938cabdff1aSopenharmony_ci        st1             {v25.8b},   [x9], x1
939cabdff1aSopenharmony_ci        st1             {v25.d}[1], [x0], x1
940cabdff1aSopenharmony_ci        st1             {v26.8b},   [x9], x1
941cabdff1aSopenharmony_ci        st1             {v26.d}[1], [x0], x1
942cabdff1aSopenharmony_ci        st1             {v27.8b},   [x9], x1
943cabdff1aSopenharmony_ci        st1             {v27.d}[1], [x0], x1
944cabdff1aSopenharmony_ci
945cabdff1aSopenharmony_ci        ret             x10
946cabdff1aSopenharmony_ci6:
        // Reached through `ret x13` from the shared helper: only the middle
        // four pixels per row are rewritten. Skip the two leftmost pixels,
        // transpose v22-v25 back and store 4 bytes per row. Lanes s[0]/s[2]
        // supply rows 0-3 / 8-11, lanes s[1]/s[3] supply rows 4-7 / 12-15.
947cabdff1aSopenharmony_ci        add             x9,  x9,  #2
948cabdff1aSopenharmony_ci        add             x0,  x0,  #2
949cabdff1aSopenharmony_ci        transpose_4x16B v22, v23, v24, v25, v26, v27, v28, v29
950cabdff1aSopenharmony_ci        st1             {v22.s}[0], [x9], x1
951cabdff1aSopenharmony_ci        st1             {v22.s}[2], [x0], x1
952cabdff1aSopenharmony_ci        st1             {v23.s}[0], [x9], x1
953cabdff1aSopenharmony_ci        st1             {v23.s}[2], [x0], x1
954cabdff1aSopenharmony_ci        st1             {v24.s}[0], [x9], x1
955cabdff1aSopenharmony_ci        st1             {v24.s}[2], [x0], x1
956cabdff1aSopenharmony_ci        st1             {v25.s}[0], [x9], x1
957cabdff1aSopenharmony_ci        st1             {v25.s}[2], [x0], x1
958cabdff1aSopenharmony_ci        st1             {v22.s}[1], [x9], x1
959cabdff1aSopenharmony_ci        st1             {v22.s}[3], [x0], x1
960cabdff1aSopenharmony_ci        st1             {v23.s}[1], [x9], x1
961cabdff1aSopenharmony_ci        st1             {v23.s}[3], [x0], x1
962cabdff1aSopenharmony_ci        st1             {v24.s}[1], [x9], x1
963cabdff1aSopenharmony_ci        st1             {v24.s}[3], [x0], x1
964cabdff1aSopenharmony_ci        st1             {v25.s}[1], [x9], x1
965cabdff1aSopenharmony_ci        st1             {v25.s}[3], [x0], x1
966cabdff1aSopenharmony_ci        ret             x10
967cabdff1aSopenharmony_ciendfunc
968cabdff1aSopenharmony_ci.endm
969cabdff1aSopenharmony_ci
// Instantiate the three horizontal mix variants.
970cabdff1aSopenharmony_cimix_h_16 48
971cabdff1aSopenharmony_cimix_h_16 84
972cabdff1aSopenharmony_cimix_h_16 88
973cabdff1aSopenharmony_ci
// ff_vp9_loop_filter_v_16_8_neon (exported)
//
// Runs the loop_filter_16 macro on 8-byte-wide rows around x0.
// Eight rows ending just before x0 are loaded as p7..p0 and eight rows
// starting at x0 as q0..q7, one row every x1 bytes. The result is stored
// back through one of three writeback paths (fall-through, label 8, label 7).
//
// In:  x0 = address of the q0 row, x1 = row stride in bytes.
//      Anything else the filter needs is consumed inside loop_filter_16,
//      which is defined outside this view.
// Uses: x9 (second row pointer), x10 (saved return address), v16-v31 as
//      inputs; the writeback also reads v2-v6, v8-v15 and v17.
// Stack: 64 bytes, used to preserve d8-d15.
974cabdff1aSopenharmony_cifunction ff_vp9_loop_filter_v_16_8_neon, export=1
        // Keep the return address in x10; every exit below is "ret x10".
975cabdff1aSopenharmony_ci        mov             x10, x30
        // Spill d8-d15 (callee-saved under AAPCS64) with four 16-byte pushes;
        // v8-v15 are among the registers stored as filter output below.
976cabdff1aSopenharmony_ci        stp             d14, d15, [sp, #-0x10]!
977cabdff1aSopenharmony_ci        stp             d12, d13, [sp, #-0x10]!
978cabdff1aSopenharmony_ci        stp             d10, d11, [sp, #-0x10]!
979cabdff1aSopenharmony_ci        stp             d8,  d9,  [sp, #-0x10]!
        // x9 = x0 - 8 * stride, the first of the eight rows before x0.
980cabdff1aSopenharmony_ci        sub             x9,  x0,  x1, lsl #3
        // Loads alternate between the x9 rows and the x0 rows; each load
        // post-increments its pointer by one stride.
981cabdff1aSopenharmony_ci        ld1             {v16.8b}, [x9], x1 // p7
982cabdff1aSopenharmony_ci        ld1             {v24.8b}, [x0], x1 // q0
983cabdff1aSopenharmony_ci        ld1             {v17.8b}, [x9], x1 // p6
984cabdff1aSopenharmony_ci        ld1             {v25.8b}, [x0], x1 // q1
985cabdff1aSopenharmony_ci        ld1             {v18.8b}, [x9], x1 // p5
986cabdff1aSopenharmony_ci        ld1             {v26.8b}, [x0], x1 // q2
987cabdff1aSopenharmony_ci        ld1             {v19.8b}, [x9], x1 // p4
988cabdff1aSopenharmony_ci        ld1             {v27.8b}, [x0], x1 // q3
989cabdff1aSopenharmony_ci        ld1             {v20.8b}, [x9], x1 // p3
990cabdff1aSopenharmony_ci        ld1             {v28.8b}, [x0], x1 // q4
991cabdff1aSopenharmony_ci        ld1             {v21.8b}, [x9], x1 // p2
992cabdff1aSopenharmony_ci        ld1             {v29.8b}, [x0], x1 // q5
993cabdff1aSopenharmony_ci        ld1             {v22.8b}, [x9], x1 // p1
994cabdff1aSopenharmony_ci        ld1             {v30.8b}, [x0], x1 // q6
995cabdff1aSopenharmony_ci        ld1             {v23.8b}, [x9], x1 // p0
996cabdff1aSopenharmony_ci        ld1             {v31.8b}, [x0], x1 // q7
        // Both pointers advanced by 8 * stride; rewind them, then move x9
        // one row forward so that x9 = x0 - 7 * stride (the p6 row).
997cabdff1aSopenharmony_ci        sub             x9,  x9,  x1, lsl #3
998cabdff1aSopenharmony_ci        sub             x0,  x0,  x1, lsl #3
999cabdff1aSopenharmony_ci        add             x9,  x9,  x1
1000cabdff1aSopenharmony_ci
1001cabdff1aSopenharmony_ci        loop_filter_16
1002cabdff1aSopenharmony_ci
1003cabdff1aSopenharmony_ci        // If we did the flat8out part, we get the output in
1004cabdff1aSopenharmony_ci        // v2-v17 (skipping v7 and v16). x9 points to x0 - 7 * stride,
1005cabdff1aSopenharmony_ci        // store v2-v9 there, and v10-v17 into x0.
        // Seven rows are written through each pointer (14 rows in total).
1006cabdff1aSopenharmony_ci        st1             {v2.8b},  [x9], x1
1007cabdff1aSopenharmony_ci        st1             {v10.8b}, [x0], x1
1008cabdff1aSopenharmony_ci        st1             {v3.8b},  [x9], x1
1009cabdff1aSopenharmony_ci        st1             {v11.8b}, [x0], x1
1010cabdff1aSopenharmony_ci        st1             {v4.8b},  [x9], x1
1011cabdff1aSopenharmony_ci        st1             {v12.8b}, [x0], x1
1012cabdff1aSopenharmony_ci        st1             {v5.8b},  [x9], x1
1013cabdff1aSopenharmony_ci        st1             {v13.8b}, [x0], x1
1014cabdff1aSopenharmony_ci        st1             {v6.8b},  [x9], x1
1015cabdff1aSopenharmony_ci        st1             {v14.8b}, [x0], x1
1016cabdff1aSopenharmony_ci        st1             {v8.8b},  [x9], x1
1017cabdff1aSopenharmony_ci        st1             {v15.8b}, [x0], x1
1018cabdff1aSopenharmony_ci        st1             {v9.8b},  [x9], x1
1019cabdff1aSopenharmony_ci        st1             {v17.8b}, [x0], x1
        // Common epilogue: pop d8-d15 in the reverse order of the pushes and
        // return through the address saved in x10.
1020cabdff1aSopenharmony_ci9:
1021cabdff1aSopenharmony_ci        ldp             d8,  d9,  [sp], 0x10
1022cabdff1aSopenharmony_ci        ldp             d10, d11, [sp], 0x10
1023cabdff1aSopenharmony_ci        ldp             d12, d13, [sp], 0x10
1024cabdff1aSopenharmony_ci        ldp             d14, d15, [sp], 0x10
1025cabdff1aSopenharmony_ci        ret             x10
        // Writeback of three rows on each side of the edge (v21-v23 through
        // x9, v24-v26 through x0).
        // NOTE(review): no branch to this label is visible in this function;
        // presumably it is entered from code expanded by loop_filter_16.
1026cabdff1aSopenharmony_ci8:
        // x9 += 4 * stride. If x9 still holds x0 - 7 * stride this yields
        // x0 - 3 * stride - TODO confirm the macro leaves x9 and x0 untouched.
1027cabdff1aSopenharmony_ci        add             x9,  x9,  x1, lsl #2
1028cabdff1aSopenharmony_ci        // If we didn't do the flat8out part, the output is left in the
1029cabdff1aSopenharmony_ci        // input registers.
1030cabdff1aSopenharmony_ci        st1             {v21.8b}, [x9], x1
1031cabdff1aSopenharmony_ci        st1             {v24.8b}, [x0], x1
1032cabdff1aSopenharmony_ci        st1             {v22.8b}, [x9], x1
1033cabdff1aSopenharmony_ci        st1             {v25.8b}, [x0], x1
1034cabdff1aSopenharmony_ci        st1             {v23.8b}, [x9], x1
1035cabdff1aSopenharmony_ci        st1             {v26.8b}, [x0], x1
1036cabdff1aSopenharmony_ci        b               9b
        // Writeback of two rows on each side of the edge (v22, v23 through
        // x9 and v24, v25 through x0). x9 is recomputed here as
        // x0 - 2 * stride, so only x0 has to be intact on entry.
        // NOTE(review): like label 8, presumably entered from loop_filter_16.
1037cabdff1aSopenharmony_ci7:
1038cabdff1aSopenharmony_ci        sub             x9,  x0,  x1, lsl #1
1039cabdff1aSopenharmony_ci        st1             {v22.8b}, [x9], x1
1040cabdff1aSopenharmony_ci        st1             {v24.8b}, [x0], x1
1041cabdff1aSopenharmony_ci        st1             {v23.8b}, [x9], x1
1042cabdff1aSopenharmony_ci        st1             {v25.8b}, [x0], x1
1043cabdff1aSopenharmony_ci        b               9b
1044cabdff1aSopenharmony_ciendfunc
1045cabdff1aSopenharmony_ci
// ff_vp9_loop_filter_v_16_16_neon (exported)
//
// Runs the loop_filter_16_16b macro on 16-byte-wide rows around x0.
// Eight rows ending just before x0 are loaded as p7..p0 and eight rows
// starting at x0 as q0..q7, one row every x1 bytes. The result is stored
// back through one of three writeback paths (fall-through, label 8, label 7).
//
// In:  x0 = address of the q0 row, x1 = row stride in bytes.
//      Anything else the filter needs is consumed inside loop_filter_16_16b,
//      which is defined outside this view.
// Uses: x9 (second row pointer), x10 (saved return address), v16-v31 as
//      inputs; the writeback also reads v2-v6, v8-v15 and v17.
// Stack: 64 bytes, used to preserve d8-d15.
1046cabdff1aSopenharmony_cifunction ff_vp9_loop_filter_v_16_16_neon, export=1
        // Keep the return address in x10; every exit below is "ret x10".
1047cabdff1aSopenharmony_ci        mov             x10, x30
        // Spill d8-d15 (callee-saved under AAPCS64) with four 16-byte pushes.
1048cabdff1aSopenharmony_ci        stp             d14, d15, [sp, #-0x10]!
1049cabdff1aSopenharmony_ci        stp             d12, d13, [sp, #-0x10]!
1050cabdff1aSopenharmony_ci        stp             d10, d11, [sp, #-0x10]!
1051cabdff1aSopenharmony_ci        stp             d8,  d9,  [sp, #-0x10]!
        // x9 = x0 - 8 * stride, the first of the eight rows before x0.
1052cabdff1aSopenharmony_ci        sub             x9,  x0,  x1, lsl #3
        // Loads alternate between the x9 rows and the x0 rows; each load
        // reads 16 bytes and post-increments its pointer by one stride.
1053cabdff1aSopenharmony_ci        ld1             {v16.16b}, [x9], x1 // p7
1054cabdff1aSopenharmony_ci        ld1             {v24.16b}, [x0], x1 // q0
1055cabdff1aSopenharmony_ci        ld1             {v17.16b}, [x9], x1 // p6
1056cabdff1aSopenharmony_ci        ld1             {v25.16b}, [x0], x1 // q1
1057cabdff1aSopenharmony_ci        ld1             {v18.16b}, [x9], x1 // p5
1058cabdff1aSopenharmony_ci        ld1             {v26.16b}, [x0], x1 // q2
1059cabdff1aSopenharmony_ci        ld1             {v19.16b}, [x9], x1 // p4
1060cabdff1aSopenharmony_ci        ld1             {v27.16b}, [x0], x1 // q3
1061cabdff1aSopenharmony_ci        ld1             {v20.16b}, [x9], x1 // p3
1062cabdff1aSopenharmony_ci        ld1             {v28.16b}, [x0], x1 // q4
1063cabdff1aSopenharmony_ci        ld1             {v21.16b}, [x9], x1 // p2
1064cabdff1aSopenharmony_ci        ld1             {v29.16b}, [x0], x1 // q5
1065cabdff1aSopenharmony_ci        ld1             {v22.16b}, [x9], x1 // p1
1066cabdff1aSopenharmony_ci        ld1             {v30.16b}, [x0], x1 // q6
1067cabdff1aSopenharmony_ci        ld1             {v23.16b}, [x9], x1 // p0
1068cabdff1aSopenharmony_ci        ld1             {v31.16b}, [x0], x1 // q7
        // Both pointers advanced by 8 * stride; rewind them, then move x9
        // one row forward so that x9 = x0 - 7 * stride (the p6 row).
1069cabdff1aSopenharmony_ci        sub             x9,  x9,  x1, lsl #3
1070cabdff1aSopenharmony_ci        sub             x0,  x0,  x1, lsl #3
1071cabdff1aSopenharmony_ci        add             x9,  x9,  x1
1072cabdff1aSopenharmony_ci
1073cabdff1aSopenharmony_ci        loop_filter_16_16b
1074cabdff1aSopenharmony_ci
        // Fall-through writeback: seven 16-byte rows through x9 (v2-v6, v8,
        // v9) and seven through x0 (v10-v15, v17), the same register layout
        // as the 8-byte variant of this function.
1075cabdff1aSopenharmony_ci        st1             {v2.16b},  [x9], x1
1076cabdff1aSopenharmony_ci        st1             {v10.16b}, [x0], x1
1077cabdff1aSopenharmony_ci        st1             {v3.16b},  [x9], x1
1078cabdff1aSopenharmony_ci        st1             {v11.16b}, [x0], x1
1079cabdff1aSopenharmony_ci        st1             {v4.16b},  [x9], x1
1080cabdff1aSopenharmony_ci        st1             {v12.16b}, [x0], x1
1081cabdff1aSopenharmony_ci        st1             {v5.16b},  [x9], x1
1082cabdff1aSopenharmony_ci        st1             {v13.16b}, [x0], x1
1083cabdff1aSopenharmony_ci        st1             {v6.16b},  [x9], x1
1084cabdff1aSopenharmony_ci        st1             {v14.16b}, [x0], x1
1085cabdff1aSopenharmony_ci        st1             {v8.16b},  [x9], x1
1086cabdff1aSopenharmony_ci        st1             {v15.16b}, [x0], x1
1087cabdff1aSopenharmony_ci        st1             {v9.16b},  [x9], x1
1088cabdff1aSopenharmony_ci        st1             {v17.16b}, [x0], x1
        // Common epilogue: pop d8-d15 in the reverse order of the pushes and
        // return through the address saved in x10.
1089cabdff1aSopenharmony_ci9:
1090cabdff1aSopenharmony_ci        ldp             d8,  d9,  [sp], 0x10
1091cabdff1aSopenharmony_ci        ldp             d10, d11, [sp], 0x10
1092cabdff1aSopenharmony_ci        ldp             d12, d13, [sp], 0x10
1093cabdff1aSopenharmony_ci        ldp             d14, d15, [sp], 0x10
1094cabdff1aSopenharmony_ci        ret             x10
        // Writeback of three rows on each side of the edge, taken from the
        // input registers v21-v23 (through x9) and v24-v26 (through x0).
        // NOTE(review): no branch to this label is visible in this function;
        // presumably it is entered from code expanded by loop_filter_16_16b.
1095cabdff1aSopenharmony_ci8:
        // x9 += 4 * stride. If x9 still holds x0 - 7 * stride this yields
        // x0 - 3 * stride - TODO confirm the macro leaves x9 and x0 untouched.
1096cabdff1aSopenharmony_ci        add             x9,  x9,  x1, lsl #2
1097cabdff1aSopenharmony_ci        st1             {v21.16b}, [x9], x1
1098cabdff1aSopenharmony_ci        st1             {v24.16b}, [x0], x1
1099cabdff1aSopenharmony_ci        st1             {v22.16b}, [x9], x1
1100cabdff1aSopenharmony_ci        st1             {v25.16b}, [x0], x1
1101cabdff1aSopenharmony_ci        st1             {v23.16b}, [x9], x1
1102cabdff1aSopenharmony_ci        st1             {v26.16b}, [x0], x1
1103cabdff1aSopenharmony_ci        b               9b
        // Writeback of two rows on each side of the edge (v22, v23 through
        // x9 and v24, v25 through x0). x9 is recomputed here as
        // x0 - 2 * stride, so only x0 has to be intact on entry.
        // NOTE(review): like label 8, presumably entered from the macro.
1104cabdff1aSopenharmony_ci7:
1105cabdff1aSopenharmony_ci        sub             x9,  x0,  x1, lsl #1
1106cabdff1aSopenharmony_ci        st1             {v22.16b}, [x9], x1
1107cabdff1aSopenharmony_ci        st1             {v24.16b}, [x0], x1
1108cabdff1aSopenharmony_ci        st1             {v23.16b}, [x9], x1
1109cabdff1aSopenharmony_ci        st1             {v25.16b}, [x0], x1
1110cabdff1aSopenharmony_ci        b               9b
1111cabdff1aSopenharmony_ciendfunc
1112cabdff1aSopenharmony_ci
// ff_vp9_loop_filter_h_16_8_neon (exported)
//
// Runs the loop_filter_16 macro across 8 rows of 16 bytes each.
// Every row is read as 8 bytes starting at x0 - 8 and 8 bytes starting at
// x0. The two 8x8 blocks are transposed so that each register holds one
// column, filtered, and written back through one of three paths
// (fall-through, label 8, label 7).
//
// In:  x0 = address of the ninth byte of the first 16-byte row,
//      x1 = row stride in bytes.
//      Anything else the filter needs is consumed inside loop_filter_16,
//      which is defined outside this view.
// Uses: x9 (left-half row pointer), x10 (saved return address), v0, v1 and
//      v28, v29 as the trailing transpose operands, v16-v31 as inputs; the
//      writeback also reads v2-v6, v8-v15 and v17.
// Stack: 64 bytes, used to preserve d8-d15.
1113cabdff1aSopenharmony_cifunction ff_vp9_loop_filter_h_16_8_neon, export=1
        // Keep the return address in x10; every exit below is "ret x10".
1114cabdff1aSopenharmony_ci        mov             x10, x30
        // Spill d8-d15 (callee-saved under AAPCS64) with four 16-byte pushes.
1115cabdff1aSopenharmony_ci        stp             d14, d15, [sp, #-0x10]!
1116cabdff1aSopenharmony_ci        stp             d12, d13, [sp, #-0x10]!
1117cabdff1aSopenharmony_ci        stp             d10, d11, [sp, #-0x10]!
1118cabdff1aSopenharmony_ci        stp             d8,  d9,  [sp, #-0x10]!
        // x9 walks the left 8 bytes of each row, x0 the right 8 bytes.
1119cabdff1aSopenharmony_ci        sub             x9,  x0,  #8
1120cabdff1aSopenharmony_ci        ld1             {v16.8b}, [x9], x1
1121cabdff1aSopenharmony_ci        ld1             {v24.8b}, [x0], x1
1122cabdff1aSopenharmony_ci        ld1             {v17.8b}, [x9], x1
1123cabdff1aSopenharmony_ci        ld1             {v25.8b}, [x0], x1
1124cabdff1aSopenharmony_ci        ld1             {v18.8b}, [x9], x1
1125cabdff1aSopenharmony_ci        ld1             {v26.8b}, [x0], x1
1126cabdff1aSopenharmony_ci        ld1             {v19.8b}, [x9], x1
1127cabdff1aSopenharmony_ci        ld1             {v27.8b}, [x0], x1
1128cabdff1aSopenharmony_ci        ld1             {v20.8b}, [x9], x1
1129cabdff1aSopenharmony_ci        ld1             {v28.8b}, [x0], x1
1130cabdff1aSopenharmony_ci        ld1             {v21.8b}, [x9], x1
1131cabdff1aSopenharmony_ci        ld1             {v29.8b}, [x0], x1
1132cabdff1aSopenharmony_ci        ld1             {v22.8b}, [x9], x1
1133cabdff1aSopenharmony_ci        ld1             {v30.8b}, [x0], x1
1134cabdff1aSopenharmony_ci        ld1             {v23.8b}, [x9], x1
1135cabdff1aSopenharmony_ci        ld1             {v31.8b}, [x0], x1
        // Rewind both pointers by the 8 rows just read.
1136cabdff1aSopenharmony_ci        sub             x0,  x0,  x1, lsl #3
1137cabdff1aSopenharmony_ci        sub             x9,  x9,  x1, lsl #3
1138cabdff1aSopenharmony_ci
1139cabdff1aSopenharmony_ci        // The 16x8 pixels read above are in two 8x8 blocks; the left
1140cabdff1aSopenharmony_ci        // half in v16-v23, and the right half in v24-v31. Do two 8x8 transposes
1141cabdff1aSopenharmony_ci        // of this, to get one column per register.
1142cabdff1aSopenharmony_ci        transpose_8x8B  v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
1143cabdff1aSopenharmony_ci        transpose_8x8B  v24, v25, v26, v27, v28, v29, v30, v31, v0, v1
1144cabdff1aSopenharmony_ci
1145cabdff1aSopenharmony_ci        loop_filter_16
1146cabdff1aSopenharmony_ci
        // Fall-through writeback: transpose the two groups of eight column
        // registers back into rows. The left group is v16, v2-v6, v8, v9 and
        // the right group is v10-v15, v17, v31; all 8 rows are stored in full.
1147cabdff1aSopenharmony_ci        transpose_8x8B  v16, v2,  v3,  v4,  v5,  v6,  v8,  v9,  v0, v1
1148cabdff1aSopenharmony_ci        transpose_8x8B  v10, v11, v12, v13, v14, v15, v17, v31, v0, v1
1149cabdff1aSopenharmony_ci
1150cabdff1aSopenharmony_ci        st1             {v16.8b}, [x9], x1
1151cabdff1aSopenharmony_ci        st1             {v10.8b}, [x0], x1
1152cabdff1aSopenharmony_ci        st1             {v2.8b},  [x9], x1
1153cabdff1aSopenharmony_ci        st1             {v11.8b}, [x0], x1
1154cabdff1aSopenharmony_ci        st1             {v3.8b},  [x9], x1
1155cabdff1aSopenharmony_ci        st1             {v12.8b}, [x0], x1
1156cabdff1aSopenharmony_ci        st1             {v4.8b},  [x9], x1
1157cabdff1aSopenharmony_ci        st1             {v13.8b}, [x0], x1
1158cabdff1aSopenharmony_ci        st1             {v5.8b},  [x9], x1
1159cabdff1aSopenharmony_ci        st1             {v14.8b}, [x0], x1
1160cabdff1aSopenharmony_ci        st1             {v6.8b},  [x9], x1
1161cabdff1aSopenharmony_ci        st1             {v15.8b}, [x0], x1
1162cabdff1aSopenharmony_ci        st1             {v8.8b},  [x9], x1
1163cabdff1aSopenharmony_ci        st1             {v17.8b}, [x0], x1
1164cabdff1aSopenharmony_ci        st1             {v9.8b},  [x9], x1
1165cabdff1aSopenharmony_ci        st1             {v31.8b}, [x0], x1
        // Common epilogue: pop d8-d15 in the reverse order of the pushes and
        // return through the address saved in x10.
1166cabdff1aSopenharmony_ci9:
1167cabdff1aSopenharmony_ci        ldp             d8,  d9,  [sp], 0x10
1168cabdff1aSopenharmony_ci        ldp             d10, d11, [sp], 0x10
1169cabdff1aSopenharmony_ci        ldp             d12, d13, [sp], 0x10
1170cabdff1aSopenharmony_ci        ldp             d14, d15, [sp], 0x10
1171cabdff1aSopenharmony_ci        ret             x10
        // NOTE(review): no branch to this label is visible in this function;
        // presumably it is entered from code expanded by loop_filter_16.
1172cabdff1aSopenharmony_ci8:
1173cabdff1aSopenharmony_ci        // The same writeback as in loop_filter_h_8_8
        // x9 = x0 - 4 addresses rows 0-3 and x0 = x9 + 4 * stride addresses
        // rows 4-7; each store writes the 8 bytes centred on the edge.
1174cabdff1aSopenharmony_ci        sub             x9,  x0,  #4
1175cabdff1aSopenharmony_ci        add             x0,  x9,  x1, lsl #2
1176cabdff1aSopenharmony_ci        transpose_8x8B  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
1177cabdff1aSopenharmony_ci
1178cabdff1aSopenharmony_ci        st1             {v20.8b}, [x9], x1
1179cabdff1aSopenharmony_ci        st1             {v24.8b}, [x0], x1
1180cabdff1aSopenharmony_ci        st1             {v21.8b}, [x9], x1
1181cabdff1aSopenharmony_ci        st1             {v25.8b}, [x0], x1
1182cabdff1aSopenharmony_ci        st1             {v22.8b}, [x9], x1
1183cabdff1aSopenharmony_ci        st1             {v26.8b}, [x0], x1
1184cabdff1aSopenharmony_ci        st1             {v23.8b}, [x9], x1
1185cabdff1aSopenharmony_ci        st1             {v27.8b}, [x0], x1
1186cabdff1aSopenharmony_ci        b               9b
        // NOTE(review): like label 8, presumably entered from loop_filter_16.
1187cabdff1aSopenharmony_ci7:
1188cabdff1aSopenharmony_ci        // The same writeback as in loop_filter_h_4_8
        // x9 = x0 - 2 addresses rows 0-3 and x0 = x9 + 4 * stride addresses
        // rows 4-7; each store writes one 32-bit lane (4 bytes). Lane 0 of
        // v22-v25 goes to rows 0-3 and lane 1 to rows 4-7.
1189cabdff1aSopenharmony_ci        sub             x9,  x0,  #2
1190cabdff1aSopenharmony_ci        add             x0,  x9,  x1, lsl #2
1191cabdff1aSopenharmony_ci        transpose_4x8B  v22, v23, v24, v25, v26, v27, v28, v29
1192cabdff1aSopenharmony_ci        st1             {v22.s}[0], [x9], x1
1193cabdff1aSopenharmony_ci        st1             {v22.s}[1], [x0], x1
1194cabdff1aSopenharmony_ci        st1             {v23.s}[0], [x9], x1
1195cabdff1aSopenharmony_ci        st1             {v23.s}[1], [x0], x1
1196cabdff1aSopenharmony_ci        st1             {v24.s}[0], [x9], x1
1197cabdff1aSopenharmony_ci        st1             {v24.s}[1], [x0], x1
1198cabdff1aSopenharmony_ci        st1             {v25.s}[0], [x9], x1
1199cabdff1aSopenharmony_ci        st1             {v25.s}[1], [x0], x1
1200cabdff1aSopenharmony_ci        b               9b
1201cabdff1aSopenharmony_ciendfunc
1202cabdff1aSopenharmony_ci
// ff_vp9_loop_filter_h_16_16_neon (exported)
//
// Runs the loop_filter_16_16b macro across 16 rows of 16 bytes each.
// Every row is read as 8 bytes starting at x0 - 8 and 8 bytes starting at
// x0. Rows 0-7 fill the low 64 bits of v16-v31 and rows 8-15 the high
// 64 bits. The data is transposed, filtered, and written back through one
// of three paths (fall-through, label 8, label 7).
//
// In:  x0 = address of the ninth byte of the first 16-byte row,
//      x1 = row stride in bytes.
//      Anything else the filter needs is consumed inside loop_filter_16_16b,
//      which is defined outside this view.
// Uses: x9 (left-half row pointer), x10 (saved return address), v0, v1 and
//      v28, v29 as the trailing transpose operands, v16-v31 as inputs; the
//      writeback also reads v2-v6, v8-v15 and v17.
// Stack: 64 bytes, used to preserve d8-d15.
1203cabdff1aSopenharmony_cifunction ff_vp9_loop_filter_h_16_16_neon, export=1
        // Keep the return address in x10; every exit below is "ret x10".
1204cabdff1aSopenharmony_ci        mov             x10, x30
        // Spill d8-d15 (callee-saved under AAPCS64) with four 16-byte pushes.
1205cabdff1aSopenharmony_ci        stp             d14, d15, [sp, #-0x10]!
1206cabdff1aSopenharmony_ci        stp             d12, d13, [sp, #-0x10]!
1207cabdff1aSopenharmony_ci        stp             d10, d11, [sp, #-0x10]!
1208cabdff1aSopenharmony_ci        stp             d8,  d9,  [sp, #-0x10]!
        // x9 walks the left 8 bytes of each row, x0 the right 8 bytes.
1209cabdff1aSopenharmony_ci        sub             x9,  x0,  #8
        // Rows 0-7 go into the low half of each register.
1210cabdff1aSopenharmony_ci        ld1             {v16.8b},   [x9], x1
1211cabdff1aSopenharmony_ci        ld1             {v24.8b},   [x0], x1
1212cabdff1aSopenharmony_ci        ld1             {v17.8b},   [x9], x1
1213cabdff1aSopenharmony_ci        ld1             {v25.8b},   [x0], x1
1214cabdff1aSopenharmony_ci        ld1             {v18.8b},   [x9], x1
1215cabdff1aSopenharmony_ci        ld1             {v26.8b},   [x0], x1
1216cabdff1aSopenharmony_ci        ld1             {v19.8b},   [x9], x1
1217cabdff1aSopenharmony_ci        ld1             {v27.8b},   [x0], x1
1218cabdff1aSopenharmony_ci        ld1             {v20.8b},   [x9], x1
1219cabdff1aSopenharmony_ci        ld1             {v28.8b},   [x0], x1
1220cabdff1aSopenharmony_ci        ld1             {v21.8b},   [x9], x1
1221cabdff1aSopenharmony_ci        ld1             {v29.8b},   [x0], x1
1222cabdff1aSopenharmony_ci        ld1             {v22.8b},   [x9], x1
1223cabdff1aSopenharmony_ci        ld1             {v30.8b},   [x0], x1
1224cabdff1aSopenharmony_ci        ld1             {v23.8b},   [x9], x1
1225cabdff1aSopenharmony_ci        ld1             {v31.8b},   [x0], x1
        // Rows 8-15 go into the high half (lane d[1]) of the same registers.
1226cabdff1aSopenharmony_ci        ld1             {v16.d}[1], [x9], x1
1227cabdff1aSopenharmony_ci        ld1             {v24.d}[1], [x0], x1
1228cabdff1aSopenharmony_ci        ld1             {v17.d}[1], [x9], x1
1229cabdff1aSopenharmony_ci        ld1             {v25.d}[1], [x0], x1
1230cabdff1aSopenharmony_ci        ld1             {v18.d}[1], [x9], x1
1231cabdff1aSopenharmony_ci        ld1             {v26.d}[1], [x0], x1
1232cabdff1aSopenharmony_ci        ld1             {v19.d}[1], [x9], x1
1233cabdff1aSopenharmony_ci        ld1             {v27.d}[1], [x0], x1
1234cabdff1aSopenharmony_ci        ld1             {v20.d}[1], [x9], x1
1235cabdff1aSopenharmony_ci        ld1             {v28.d}[1], [x0], x1
1236cabdff1aSopenharmony_ci        ld1             {v21.d}[1], [x9], x1
1237cabdff1aSopenharmony_ci        ld1             {v29.d}[1], [x0], x1
1238cabdff1aSopenharmony_ci        ld1             {v22.d}[1], [x9], x1
1239cabdff1aSopenharmony_ci        ld1             {v30.d}[1], [x0], x1
1240cabdff1aSopenharmony_ci        ld1             {v23.d}[1], [x9], x1
1241cabdff1aSopenharmony_ci        ld1             {v31.d}[1], [x0], x1
        // Rewind both pointers by the 16 rows just read.
1242cabdff1aSopenharmony_ci        sub             x0,  x0,  x1, lsl #4
1243cabdff1aSopenharmony_ci        sub             x9,  x9,  x1, lsl #4
1244cabdff1aSopenharmony_ci
        // Transpose the left block (v16-v23) and the right block (v24-v31);
        // v0 and v1 are passed as the two trailing operands of each call.
1245cabdff1aSopenharmony_ci        transpose_8x16B v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
1246cabdff1aSopenharmony_ci        transpose_8x16B v24, v25, v26, v27, v28, v29, v30, v31, v0, v1
1247cabdff1aSopenharmony_ci
1248cabdff1aSopenharmony_ci        loop_filter_16_16b
1249cabdff1aSopenharmony_ci
        // Fall-through writeback: transpose the two groups of eight column
        // registers back into rows. The left group is v16, v2-v6, v8, v9 and
        // the right group is v10-v15, v17, v31.
1250cabdff1aSopenharmony_ci        transpose_8x16B v16, v2,  v3,  v4,  v5,  v6,  v8,  v9,  v0, v1
1251cabdff1aSopenharmony_ci        transpose_8x16B v10, v11, v12, v13, v14, v15, v17, v31, v0, v1
1252cabdff1aSopenharmony_ci
        // Rows 0-7 come from the low halves ...
1253cabdff1aSopenharmony_ci        st1             {v16.8b},   [x9], x1
1254cabdff1aSopenharmony_ci        st1             {v10.8b},   [x0], x1
1255cabdff1aSopenharmony_ci        st1             {v2.8b},    [x9], x1
1256cabdff1aSopenharmony_ci        st1             {v11.8b},   [x0], x1
1257cabdff1aSopenharmony_ci        st1             {v3.8b},    [x9], x1
1258cabdff1aSopenharmony_ci        st1             {v12.8b},   [x0], x1
1259cabdff1aSopenharmony_ci        st1             {v4.8b},    [x9], x1
1260cabdff1aSopenharmony_ci        st1             {v13.8b},   [x0], x1
1261cabdff1aSopenharmony_ci        st1             {v5.8b},    [x9], x1
1262cabdff1aSopenharmony_ci        st1             {v14.8b},   [x0], x1
1263cabdff1aSopenharmony_ci        st1             {v6.8b},    [x9], x1
1264cabdff1aSopenharmony_ci        st1             {v15.8b},   [x0], x1
1265cabdff1aSopenharmony_ci        st1             {v8.8b},    [x9], x1
1266cabdff1aSopenharmony_ci        st1             {v17.8b},   [x0], x1
1267cabdff1aSopenharmony_ci        st1             {v9.8b},    [x9], x1
1268cabdff1aSopenharmony_ci        st1             {v31.8b},   [x0], x1
        // ... and rows 8-15 from the high halves of the same registers.
1269cabdff1aSopenharmony_ci        st1             {v16.d}[1], [x9], x1
1270cabdff1aSopenharmony_ci        st1             {v10.d}[1], [x0], x1
1271cabdff1aSopenharmony_ci        st1             {v2.d}[1],  [x9], x1
1272cabdff1aSopenharmony_ci        st1             {v11.d}[1], [x0], x1
1273cabdff1aSopenharmony_ci        st1             {v3.d}[1],  [x9], x1
1274cabdff1aSopenharmony_ci        st1             {v12.d}[1], [x0], x1
1275cabdff1aSopenharmony_ci        st1             {v4.d}[1],  [x9], x1
1276cabdff1aSopenharmony_ci        st1             {v13.d}[1], [x0], x1
1277cabdff1aSopenharmony_ci        st1             {v5.d}[1],  [x9], x1
1278cabdff1aSopenharmony_ci        st1             {v14.d}[1], [x0], x1
1279cabdff1aSopenharmony_ci        st1             {v6.d}[1],  [x9], x1
1280cabdff1aSopenharmony_ci        st1             {v15.d}[1], [x0], x1
1281cabdff1aSopenharmony_ci        st1             {v8.d}[1],  [x9], x1
1282cabdff1aSopenharmony_ci        st1             {v17.d}[1], [x0], x1
1283cabdff1aSopenharmony_ci        st1             {v9.d}[1],  [x9], x1
1284cabdff1aSopenharmony_ci        st1             {v31.d}[1], [x0], x1
        // Common epilogue: pop d8-d15 in the reverse order of the pushes and
        // return through the address saved in x10.
1285cabdff1aSopenharmony_ci9:
1286cabdff1aSopenharmony_ci        ldp             d8,  d9,  [sp], 0x10
1287cabdff1aSopenharmony_ci        ldp             d10, d11, [sp], 0x10
1288cabdff1aSopenharmony_ci        ldp             d12, d13, [sp], 0x10
1289cabdff1aSopenharmony_ci        ldp             d14, d15, [sp], 0x10
1290cabdff1aSopenharmony_ci        ret             x10
        // Writeback of the 8 bytes centred on the edge for all 16 rows.
        // x9 = x0 - 4 addresses rows 0-7 (low halves of v20-v27) and
        // x0 = x9 + 8 * stride addresses rows 8-15 (high halves).
        // NOTE(review): no branch to this label is visible in this function;
        // presumably it is entered from code expanded by loop_filter_16_16b.
1291cabdff1aSopenharmony_ci8:
1292cabdff1aSopenharmony_ci        sub             x9,  x0,  #4
1293cabdff1aSopenharmony_ci        add             x0,  x9,  x1, lsl #3
1294cabdff1aSopenharmony_ci        transpose_8x16B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
1295cabdff1aSopenharmony_ci
1296cabdff1aSopenharmony_ci        st1             {v20.8b},   [x9], x1
1297cabdff1aSopenharmony_ci        st1             {v20.d}[1], [x0], x1
1298cabdff1aSopenharmony_ci        st1             {v21.8b},   [x9], x1
1299cabdff1aSopenharmony_ci        st1             {v21.d}[1], [x0], x1
1300cabdff1aSopenharmony_ci        st1             {v22.8b},   [x9], x1
1301cabdff1aSopenharmony_ci        st1             {v22.d}[1], [x0], x1
1302cabdff1aSopenharmony_ci        st1             {v23.8b},   [x9], x1
1303cabdff1aSopenharmony_ci        st1             {v23.d}[1], [x0], x1
1304cabdff1aSopenharmony_ci        st1             {v24.8b},   [x9], x1
1305cabdff1aSopenharmony_ci        st1             {v24.d}[1], [x0], x1
1306cabdff1aSopenharmony_ci        st1             {v25.8b},   [x9], x1
1307cabdff1aSopenharmony_ci        st1             {v25.d}[1], [x0], x1
1308cabdff1aSopenharmony_ci        st1             {v26.8b},   [x9], x1
1309cabdff1aSopenharmony_ci        st1             {v26.d}[1], [x0], x1
1310cabdff1aSopenharmony_ci        st1             {v27.8b},   [x9], x1
1311cabdff1aSopenharmony_ci        st1             {v27.d}[1], [x0], x1
1312cabdff1aSopenharmony_ci        b               9b
        // Writeback of the 4 bytes centred on the edge for all 16 rows.
        // x9 = x0 - 2 addresses rows 0-7 and x0 = x9 + 8 * stride addresses
        // rows 8-15. Each store writes one 32-bit lane of v22-v25: lanes 0
        // and 1 go through x9 (rows 0-3, then 4-7), lanes 2 and 3 through x0
        // (rows 8-11, then 12-15).
        // NOTE(review): like label 8, presumably entered from the macro.
1313cabdff1aSopenharmony_ci7:
1314cabdff1aSopenharmony_ci        sub             x9,  x0,  #2
1315cabdff1aSopenharmony_ci        add             x0,  x9,  x1, lsl #3
1316cabdff1aSopenharmony_ci        transpose_4x16B v22, v23, v24, v25, v26, v27, v28, v29
1317cabdff1aSopenharmony_ci        st1             {v22.s}[0], [x9], x1
1318cabdff1aSopenharmony_ci        st1             {v22.s}[2], [x0], x1
1319cabdff1aSopenharmony_ci        st1             {v23.s}[0], [x9], x1
1320cabdff1aSopenharmony_ci        st1             {v23.s}[2], [x0], x1
1321cabdff1aSopenharmony_ci        st1             {v24.s}[0], [x9], x1
1322cabdff1aSopenharmony_ci        st1             {v24.s}[2], [x0], x1
1323cabdff1aSopenharmony_ci        st1             {v25.s}[0], [x9], x1
1324cabdff1aSopenharmony_ci        st1             {v25.s}[2], [x0], x1
1325cabdff1aSopenharmony_ci        st1             {v22.s}[1], [x9], x1
1326cabdff1aSopenharmony_ci        st1             {v22.s}[3], [x0], x1
1327cabdff1aSopenharmony_ci        st1             {v23.s}[1], [x9], x1
1328cabdff1aSopenharmony_ci        st1             {v23.s}[3], [x0], x1
1329cabdff1aSopenharmony_ci        st1             {v24.s}[1], [x9], x1
1330cabdff1aSopenharmony_ci        st1             {v24.s}[3], [x0], x1
1331cabdff1aSopenharmony_ci        st1             {v25.s}[1], [x9], x1
1332cabdff1aSopenharmony_ci        st1             {v25.s}[3], [x0], x1
1333cabdff1aSopenharmony_ci        b               9b
1334cabdff1aSopenharmony_ciendfunc
1335