1cabdff1aSopenharmony_ci/*
2cabdff1aSopenharmony_ci * Copyright (c) 2017 Google Inc.
3cabdff1aSopenharmony_ci *
4cabdff1aSopenharmony_ci * This file is part of FFmpeg.
5cabdff1aSopenharmony_ci *
6cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or
7cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public
8cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either
9cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version.
10cabdff1aSopenharmony_ci *
11cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful,
12cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of
13cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14cabdff1aSopenharmony_ci * Lesser General Public License for more details.
15cabdff1aSopenharmony_ci *
16cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public
17cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software
18cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19cabdff1aSopenharmony_ci */
20cabdff1aSopenharmony_ci
21cabdff1aSopenharmony_ci#include "libavutil/arm/asm.S"
22cabdff1aSopenharmony_ci
@ Transpose an 8x8 matrix of 16 bit elements held in eight q registers.
@
@ rq0-rq7 name the eight rows as q registers. r0-r15 are expected to name
@ the d register halves of those same rows, in order (r0,r1 == rq0,
@ r2,r3 == rq1, ... r14,r15 == rq7); the caller is responsible for passing
@ matching names. Only the eight d registers that take part in the 64 bit
@ exchange are referenced by the body; the remaining r parameters are kept so
@ that the parameter list stays positional.
@
@ The transpose is done in three stages of growing granularity (64, 32 and
@ 16 bit). The instructions inside one stage touch disjoint registers, so
@ their relative order within the stage is free.
.macro transpose16_q_8x8 rq0, rq1, rq2, rq3, rq4, rq5, rq6, rq7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15
        @ Stage 1: exchange the 64 bit quadrants between row n and row n + 4.
        vswp            \r7,  \r14              @ vtrn.64 \rq3, \rq7
        vswp            \r5,  \r12              @ vtrn.64 \rq2, \rq6
        vswp            \r3,  \r10              @ vtrn.64 \rq1, \rq5
        vswp            \r1,  \r8               @ vtrn.64 \rq0, \rq4

        @ Stage 2: exchange 32 bit pairs between row n and row n + 2.
        vtrn.32         \rq4, \rq6
        vtrn.32         \rq5, \rq7
        vtrn.32         \rq0, \rq2
        vtrn.32         \rq1, \rq3

        @ Stage 3: exchange single 16 bit elements between adjacent rows.
        vtrn.16         \rq4, \rq5
        vtrn.16         \rq6, \rq7
        vtrn.16         \rq0, \rq1
        vtrn.16         \rq2, \rq3
.endm
37cabdff1aSopenharmony_ci
@ Transpose a 4x4 matrix of 16 bit elements, one row per register.
@ r0-r3 are the four rows on input and the four columns on output.
@ NOTE(review): the 4x4 result assumes the caller passes d registers
@ (four 16 bit lanes each) - confirm against the call sites.
@
@ First the 32 bit pairs are exchanged between rows two apart, then the
@ single 16 bit elements between neighbouring rows. The two instructions of
@ each step work on disjoint registers.
.macro transpose16_4x4 r0, r1, r2, r3
        @ 32 bit step: rows 1/3, then rows 0/2
        vtrn.32         \r1,  \r3
        vtrn.32         \r0,  \r2
        @ 16 bit step: rows 2/3, then rows 0/1
        vtrn.16         \r2,  \r3
        vtrn.16         \r0,  \r1
.endm
44cabdff1aSopenharmony_ci
@ 4x4 transpose of 16 bit elements where the four rows live in two
@ q registers. The 32 bit subtranspose does not need to address the
@ individual d registers, so it is done once on the q registers; only the
@ final 16 bit step works on the d register halves.
@ The caller must pass matching names: r0,r1 == rq0 and r2,r3 == rq1.
.macro transpose16_q_4x4 rq0, rq1, r0, r1, r2, r3
        @ 32 bit step for all four rows in a single instruction
        vtrn.32         \rq0, \rq1
        @ 16 bit step on each half; the two halves are independent
        vtrn.16         \r2,  \r3
        vtrn.16         \r0,  \r1
.endm
53cabdff1aSopenharmony_ci
54cabdff1aSopenharmony_ci@ The input to and output from this macro is in the registers q8-q15,
55cabdff1aSopenharmony_ci@ and q0-q7 are used as scratch registers.
56cabdff1aSopenharmony_ci@ p3 = q8, p0 = q11, q0 = q12, q3 = q15
@
@ Core register inputs, as read by the vdup instructions in the body:
@   r2 = E, r3 = I, r4 = H,
@   r5 = threshold that the flat8in differences are compared against
@        (only read when wd >= 8),
@   r6 = left shift used for the saturating clip,
@   r7 = max pixel value.
@ r8, r9 and the condition flags are clobbered by the mask tests.
@
@ The macro branches forward to numeric local labels that it does not
@ define itself, so the code following an expansion has to provide them:
@   9f - taken when no lane needs filtering at all
@   6f - (wd >= 8 only) taken when no lane needs the flat8in filter
@
@ The flat8in mask calculation is interleaved with the inner filter
@ arithmetic; the order of the instructions is deliberate.
57cabdff1aSopenharmony_ci.macro loop_filter_q wd
58cabdff1aSopenharmony_ci        vdup.u16        q0,  r2          @ E
59cabdff1aSopenharmony_ci        vdup.u16        q1,  r3          @ I
60cabdff1aSopenharmony_ci
61cabdff1aSopenharmony_ci        vabd.u16        q2,  q8,  q9     @ abs(p3 - p2)
62cabdff1aSopenharmony_ci        vabd.u16        q3,  q9,  q10    @ abs(p2 - p1)
63cabdff1aSopenharmony_ci        vabd.u16        q4,  q10, q11    @ abs(p1 - p0)
64cabdff1aSopenharmony_ci        vabd.u16        q5,  q12, q13    @ abs(q0 - q1)
65cabdff1aSopenharmony_ci        vabd.u16        q6,  q13, q14    @ abs(q1 - q2)
66cabdff1aSopenharmony_ci        vabd.u16        q7,  q14, q15    @ abs(q2 - q3)
67cabdff1aSopenharmony_ci        vmax.u16        q2,  q2,  q3     @ max(abs(p3 - p2), abs(p2 - p1))
68cabdff1aSopenharmony_ci        vmax.u16        q3,  q4,  q5     @ max(abs(p1 - p0), abs(q0 - q1))
69cabdff1aSopenharmony_ci        vmax.u16        q4,  q6,  q7     @ max(abs(q1 - q2), abs(q2 - q3))
70cabdff1aSopenharmony_ci        vabd.u16        q5,  q11, q12    @ abs(p0 - q0)
71cabdff1aSopenharmony_ci        vmax.u16        q2,  q2,  q3     @ max of the first four differences
72cabdff1aSopenharmony_ci        vadd.u16        q5,  q5,  q5     @ abs(p0 - q0) * 2
73cabdff1aSopenharmony_ci        vabd.u16        q6,  q10, q13    @ abs(p1 - q1)
74cabdff1aSopenharmony_ci        vmax.u16        q2,  q2,  q4     @ max(abs(p3 - p2), ..., abs(q2 - q3))
75cabdff1aSopenharmony_ci        vshr.u16        q6,  q6,  #1     @ abs(p1 - q1) >> 1
76cabdff1aSopenharmony_ci        vcle.u16        q2,  q2,  q1     @ max(abs()) <= I
77cabdff1aSopenharmony_ci        vadd.u16        q5,  q5,  q6     @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
78cabdff1aSopenharmony_ci        vcle.u16        q5,  q5,  q0     @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E
79cabdff1aSopenharmony_ci        vand            q2,  q2,  q5     @ fm
80cabdff1aSopenharmony_ci
        @ Narrow the fm mask to 64 bits so that it fits into r8/r9 and can be
        @ tested for all-zero. d10 is the low half of q5, which is free again
        @ now that its value has been merged into q2.
81cabdff1aSopenharmony_ci        vmovn.u16       d10, q2
82cabdff1aSopenharmony_ci        vmov            r8,  r9,  d10
83cabdff1aSopenharmony_ci        orrs            r8,  r8,  r9
84cabdff1aSopenharmony_ci        @ If no pixels need filtering, just exit as soon as possible
85cabdff1aSopenharmony_ci        beq             9f
86cabdff1aSopenharmony_ci
87cabdff1aSopenharmony_ci.if \wd >= 8
88cabdff1aSopenharmony_ci        vdup.u16        q0,  r5          @ threshold for flat8in (E is no longer needed)
89cabdff1aSopenharmony_ci
90cabdff1aSopenharmony_ci        vabd.u16        q1,  q8,  q11    @ abs(p3 - p0)
91cabdff1aSopenharmony_ci        vabd.u16        q3,  q9,  q11    @ abs(p2 - p0)
92cabdff1aSopenharmony_ci        vabd.u16        q4,  q10, q11    @ abs(p1 - p0)
93cabdff1aSopenharmony_ci        vabd.u16        q5,  q13, q12    @ abs(q1 - q0)
94cabdff1aSopenharmony_ci        vabd.u16        q6,  q14, q12    @ abs(q2 - q0)
95cabdff1aSopenharmony_ci        vabd.u16        q7,  q15, q12    @ abs(q3 - q0)
96cabdff1aSopenharmony_ci        vmax.u16        q1,  q1,  q3     @ max(abs(p3 - p0), abs(p2 - p0))
97cabdff1aSopenharmony_ci        vmax.u16        q4,  q4,  q5     @ max(abs(p1 - p0), abs(q1 - q0))
98cabdff1aSopenharmony_ci        vmax.u16        q6,  q6,  q7     @ max(abs(q2 - q0), abs(q3 - q0))
99cabdff1aSopenharmony_ci        @ The rest of the calculation of flat8in is interleaved below
100cabdff1aSopenharmony_ci.endif
101cabdff1aSopenharmony_ci
102cabdff1aSopenharmony_ci        @ Calculate the normal inner loop filter for 2 or 4 pixels
103cabdff1aSopenharmony_ci        vabd.u16        q3,  q10, q11    @ abs(p1 - p0)
104cabdff1aSopenharmony_ci.if \wd == 8
105cabdff1aSopenharmony_ci        vmax.u16        q1,  q1,  q4     @ flat8in: fold in the second pair
106cabdff1aSopenharmony_ci.endif
107cabdff1aSopenharmony_ci        vabd.u16        q4,  q13, q12    @ abs(q1 - q0)
108cabdff1aSopenharmony_ci.if \wd == 8
109cabdff1aSopenharmony_ci        vmax.u16        q1,  q1,  q6     @ flat8in: max of all six differences
110cabdff1aSopenharmony_ci.endif
111cabdff1aSopenharmony_ci
112cabdff1aSopenharmony_ci        vsub.u16        q5,  q10, q13    @ p1 - q1
113cabdff1aSopenharmony_ci        vmax.u16        q3,  q3,  q4     @ max(abs(p1 - p0), abs(q1 - q0))
114cabdff1aSopenharmony_ci        vdup.u16        q4,  r4          @ H
115cabdff1aSopenharmony_ci        vsub.u16        q6,  q12, q11    @ q0 - p0
116cabdff1aSopenharmony_ci.if \wd == 8
117cabdff1aSopenharmony_ci        vcle.u16        q1,  q1,  q0     @ flat8in
118cabdff1aSopenharmony_ci.endif
119cabdff1aSopenharmony_ci        vdup.u16        q0,  r6          @ left shift for saturation
120cabdff1aSopenharmony_ci        vcle.u16        q3,  q3,  q4     @ !hev
121cabdff1aSopenharmony_ci.if \wd == 8
122cabdff1aSopenharmony_ci        vand            q1,  q1,  q2     @ flat8in && fm
123cabdff1aSopenharmony_ci.endif
124cabdff1aSopenharmony_ci        vneg.s16        q4,  q0          @ negative left shift after saturation
125cabdff1aSopenharmony_ci        vqshl.s16       q5,  q5,  q0     @ saturating left shift of p1 - q1
126cabdff1aSopenharmony_ci.if \wd == 8
127cabdff1aSopenharmony_ci        vbic            q2,  q2,  q1     @ fm && !flat8in
128cabdff1aSopenharmony_ci.endif
129cabdff1aSopenharmony_ci        vmov.s16        q7,  #3          @ multiplier for 3 * (q0 - p0)
130cabdff1aSopenharmony_ci        vand            q3,  q3,  q2     @ !hev && fm && !flat8in
131cabdff1aSopenharmony_ci        vshl.s16        q5,  q5,  q4     @ av_clip_int2p(p1 - q1, BIT_DEPTH - 1)
132cabdff1aSopenharmony_ci
133cabdff1aSopenharmony_ci        vmul.s16        q6,  q6,  q7     @ 3 * (q0 - p0)
134cabdff1aSopenharmony_ci        vbic            q5,  q5,  q3     @ if (!hev) av_clip_int2p = 0
135cabdff1aSopenharmony_ci        vadd.s16        q6,  q6,  q5     @ 3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)]
136cabdff1aSopenharmony_ci        vmov.s16        q5,  #4          @ rounding term for f1
137cabdff1aSopenharmony_ci        vqshl.s16       q6,  q6,  q0     @ saturating left shift, undone by the vshl below
138cabdff1aSopenharmony_ci        vmov.s16        q0,  #3          @ rounding term for f2 (shift amount no longer needed)
139cabdff1aSopenharmony_ci        vshl.s16        q6,  q6,  q4     @ av_clip_int2p(3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)], BIT_DEPTH - 1) = f
140cabdff1aSopenharmony_ci        vdup.u16        q4,  r7          @ max pixel value
141cabdff1aSopenharmony_ci
142cabdff1aSopenharmony_ci        vshr.u16        q4,  q4,  #1     @ (1 << (BIT_DEPTH - 1)) - 1
143cabdff1aSopenharmony_ci
144cabdff1aSopenharmony_ci        vadd.s16        q5,  q6,  q5     @ f + 4
145cabdff1aSopenharmony_ci        vadd.s16        q0,  q6,  q0     @ f + 3
146cabdff1aSopenharmony_ci        vmov.s16        q6,  #0          @ lower clamp bound for the output pixels
147cabdff1aSopenharmony_ci        vmin.s16        q5,  q5,  q4     @ FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1)
148cabdff1aSopenharmony_ci        vmin.s16        q0,  q0,  q4     @ FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1)
149cabdff1aSopenharmony_ci        vdup.u16        q4,  r7          @ max pixel value
150cabdff1aSopenharmony_ci        vshr.s16        q5,  q5,  #3     @ f1
151cabdff1aSopenharmony_ci        vshr.s16        q0,  q0,  #3     @ f2
152cabdff1aSopenharmony_ci
153cabdff1aSopenharmony_ci        vadd.s16        q0,  q11, q0     @ p0 + f2
154cabdff1aSopenharmony_ci        vsub.s16        q7,  q12, q5     @ q0 - f1
155cabdff1aSopenharmony_ci        vmin.s16        q0,  q0,  q4     @ clamp to max pixel value
156cabdff1aSopenharmony_ci        vmin.s16        q7,  q7,  q4     @ clamp to max pixel value
157cabdff1aSopenharmony_ci        vrshr.s16       q5,  q5,  #1     @ f = (f1 + 1) >> 1
158cabdff1aSopenharmony_ci        vmax.s16        q0,  q0,  q6     @ out p0
159cabdff1aSopenharmony_ci        vmax.s16        q7,  q7,  q6     @ out q0
160cabdff1aSopenharmony_ci        vbit            q11, q0,  q2     @ if (fm && !flat8in)
161cabdff1aSopenharmony_ci        vbit            q12, q7,  q2
162cabdff1aSopenharmony_ci.if \wd >= 8
        @ Narrow the flat8in && fm mask for the zero test below. d4 is the low
        @ half of q2, whose mask was last needed by the two vbit above.
163cabdff1aSopenharmony_ci        vmovn.u16       d4,  q1
164cabdff1aSopenharmony_ci.endif
165cabdff1aSopenharmony_ci
166cabdff1aSopenharmony_ci        vadd.s16        q0,  q10, q5     @ p1 + f
167cabdff1aSopenharmony_ci        vsub.s16        q7,  q13, q5     @ q1 - f
168cabdff1aSopenharmony_ci.if \wd >= 8
169cabdff1aSopenharmony_ci        vmov            r8,  r9,  d4
170cabdff1aSopenharmony_ci.endif
171cabdff1aSopenharmony_ci        vmin.s16        q0,  q0,  q4     @ clamp to max pixel value
172cabdff1aSopenharmony_ci        vmin.s16        q7,  q7,  q4     @ clamp to max pixel value
173cabdff1aSopenharmony_ci.if \wd >= 8
        @ The flags set here are consumed by the beq 6f further down; only
        @ NEON instructions are placed in between.
174cabdff1aSopenharmony_ci        orrs            r8,  r8,  r9
175cabdff1aSopenharmony_ci.endif
176cabdff1aSopenharmony_ci        vmax.s16        q0,  q0,  q6     @ out p1
177cabdff1aSopenharmony_ci        vmax.s16        q7,  q7,  q6     @ out q1
178cabdff1aSopenharmony_ci        vbit            q10, q0,  q3     @ if (!hev && fm && !flat8in)
179cabdff1aSopenharmony_ci        vbit            q13, q7,  q3
180cabdff1aSopenharmony_ci
181cabdff1aSopenharmony_ci.if \wd >= 8
182cabdff1aSopenharmony_ci        @ If no pixels need flat8in, jump to a writeout of the inner 4 pixels
183cabdff1aSopenharmony_ci        beq             6f
184cabdff1aSopenharmony_ci
185cabdff1aSopenharmony_ci        @ flat8in
        @ q0 holds a running sum of eight taps. Each following output is
        @ produced by adding one pair of pixels and subtracting another pair
        @ (the differences are prepared in q3/q5), followed by a rounding
        @ shift by 3. Results are merged with vbit under the mask in q1, and
        @ a pixel register is only overwritten after its last use as input.
186cabdff1aSopenharmony_ci        vadd.u16        q2,  q8,  q9     @ p3 + p2
187cabdff1aSopenharmony_ci        vadd.u16        q3,  q10, q13    @ p1 + q1
188cabdff1aSopenharmony_ci        vadd.u16        q4,  q8,  q10    @ p3 + p1
189cabdff1aSopenharmony_ci        vadd.u16        q5,  q11, q14    @ p0 + q2
190cabdff1aSopenharmony_ci        vadd.u16        q0,  q2,  q2     @ 2 * (p3 + p2)
191cabdff1aSopenharmony_ci        vadd.u16        q0,  q0,  q11    @ + p0
192cabdff1aSopenharmony_ci        vadd.u16        q0,  q0,  q12    @ + q0
193cabdff1aSopenharmony_ci        vadd.u16        q0,  q0,  q4     @ 3 * p3 + 2 * p2 + p1 + p0 + q0
194cabdff1aSopenharmony_ci        vsub.s16        q3,  q3,  q2     @ (p1 + q1) - (p3 + p2)
195cabdff1aSopenharmony_ci        vsub.s16        q5,  q5,  q4     @ (p0 + q2) - (p3 + p1)
196cabdff1aSopenharmony_ci        vrshr.u16       q6,  q0,  #3     @ out p2
197cabdff1aSopenharmony_ci
198cabdff1aSopenharmony_ci        vadd.u16        q0,  q0,  q3     @ sum for p1
199cabdff1aSopenharmony_ci        vadd.u16        q2,  q8,  q11    @ p3 + p0
200cabdff1aSopenharmony_ci        vadd.u16        q3,  q12, q15    @ q0 + q3
201cabdff1aSopenharmony_ci        vrshr.u16       q7,  q0,  #3     @ out p1
202cabdff1aSopenharmony_ci
203cabdff1aSopenharmony_ci        vadd.u16        q0,  q0,  q5     @ sum for p0
204cabdff1aSopenharmony_ci        vsub.s16        q3,  q3,  q2     @ (q0 + q3) - (p3 + p0)
205cabdff1aSopenharmony_ci        vadd.u16        q4,  q9,  q12    @ p2 + q0 (last read of the input p2)
206cabdff1aSopenharmony_ci        vbit            q9,  q6,  q1     @ write out p2 if (flat8in && fm)
207cabdff1aSopenharmony_ci        vadd.u16        q5,  q13, q15    @ q1 + q3
208cabdff1aSopenharmony_ci        vrshr.u16       q6,  q0,  #3     @ out p0
209cabdff1aSopenharmony_ci
210cabdff1aSopenharmony_ci        vadd.u16        q0,  q0,  q3     @ sum for q0
211cabdff1aSopenharmony_ci        vsub.s16        q5,  q5,  q4     @ (q1 + q3) - (p2 + q0)
212cabdff1aSopenharmony_ci        vadd.u16        q2,  q10, q13    @ p1 + q1 (last read of the input p1)
213cabdff1aSopenharmony_ci        vbit            q10, q7,  q1     @ write out p1 if (flat8in && fm)
214cabdff1aSopenharmony_ci        vadd.u16        q3,  q14, q15    @ q2 + q3
215cabdff1aSopenharmony_ci        vrshr.u16       q7,  q0,  #3     @ out q0
216cabdff1aSopenharmony_ci
217cabdff1aSopenharmony_ci        vadd.u16        q0,  q0,  q5     @ sum for q1
218cabdff1aSopenharmony_ci        vsub.s16        q3,  q3,  q2     @ (q2 + q3) - (p1 + q1)
219cabdff1aSopenharmony_ci        vbit            q11, q6,  q1     @ write out p0 if (flat8in && fm)
220cabdff1aSopenharmony_ci        vrshr.u16       q6,  q0,  #3     @ out q1
221cabdff1aSopenharmony_ci
222cabdff1aSopenharmony_ci        vadd.u16        q0,  q0,  q3     @ sum for q2
223cabdff1aSopenharmony_ci        vbit            q12, q7,  q1     @ write out q0 if (flat8in && fm)
224cabdff1aSopenharmony_ci        vrshr.u16       q7,  q0,  #3     @ out q2
225cabdff1aSopenharmony_ci        vbit            q13, q6,  q1     @ write out q1 if (flat8in && fm)
226cabdff1aSopenharmony_ci        vbit            q14, q7,  q1     @ write out q2 if (flat8in && fm)
227cabdff1aSopenharmony_ci.endif
228cabdff1aSopenharmony_ci.endm
229cabdff1aSopenharmony_ci
230cabdff1aSopenharmony_ci@ The input to and output from this macro is in the registers d16-d31,
231cabdff1aSopenharmony_ci@ and d0-d7 are used as scratch registers.
232cabdff1aSopenharmony_ci@ p7 = d16 .. p3 = d20, p0 = d23, q0 = d24, q3 = d27, q7 = d31
233cabdff1aSopenharmony_ci@ Depending on the width of the loop filter, we either use d16-d19
234cabdff1aSopenharmony_ci@ and d28-d31 as temp registers, or d8-d15.
235cabdff1aSopenharmony_ci@ In practice, this is only ever instantiated once, so the macro parameters
236cabdff1aSopenharmony_ci@ could be hardcoded, but keeping them as is, to keep similarities to the
237cabdff1aSopenharmony_ci@ 8 bpp and aarch64 versions.
238cabdff1aSopenharmony_ci.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
239cabdff1aSopenharmony_ci        vdup.u16        d0,  r2          @ E
240cabdff1aSopenharmony_ci        vdup.u16        d2,  r3          @ I
241cabdff1aSopenharmony_ci
242cabdff1aSopenharmony_ci        vabd.u16        d4,  d20, d21    @ abs(p3 - p2)
243cabdff1aSopenharmony_ci        vabd.u16        d5,  d21, d22    @ abs(p2 - p1)
244cabdff1aSopenharmony_ci        vabd.u16        d6,  d22, d23    @ abs(p1 - p0)
245cabdff1aSopenharmony_ci        vabd.u16        d7,  d24, d25    @ abs(q0 - q1)
246cabdff1aSopenharmony_ci        vabd.u16        \tmp1,  d25, d26 @ abs(q1 - q2)
247cabdff1aSopenharmony_ci        vabd.u16        \tmp2,  d26, d27 @ abs(q2 - q3)
248cabdff1aSopenharmony_ci        vmax.u16        d4,  d4,  d5
249cabdff1aSopenharmony_ci        vmax.u16        d5,  d6,  d7
250cabdff1aSopenharmony_ci        vmax.u16        \tmp1,  \tmp1,  \tmp2
251cabdff1aSopenharmony_ci        vabd.u16        d6,  d23, d24    @ abs(p0 - q0)
252cabdff1aSopenharmony_ci        vmax.u16        d4,  d4,  d5
253cabdff1aSopenharmony_ci        vadd.u16        d6,  d6,  d6     @ abs(p0 - q0) * 2
254cabdff1aSopenharmony_ci        vabd.u16        d5,  d22, d25    @ abs(p1 - q1)
255cabdff1aSopenharmony_ci        vmax.u16        d4,  d4,  \tmp1  @ max(abs(p3 - p2), ..., abs(q2 - q3))
256cabdff1aSopenharmony_ci        vshr.u16        d5,  d5,  #1
257cabdff1aSopenharmony_ci        vcle.u16        d4,  d4,  d2     @ max(abs()) <= I
258cabdff1aSopenharmony_ci        vadd.u16        d6,  d6,  d5     @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
259cabdff1aSopenharmony_ci        vcle.u16        d6,  d6,  d0
260cabdff1aSopenharmony_ci        vand            d4,  d4,  d6     @ fm
261cabdff1aSopenharmony_ci
262cabdff1aSopenharmony_ci        vdup.u16        d3,  r4          @ H
263cabdff1aSopenharmony_ci        vmov            r8,  r9,  d4
264cabdff1aSopenharmony_ci        orrs            r8,  r8,  r9
265cabdff1aSopenharmony_ci        @ If no pixels need filtering, just exit as soon as possible
266cabdff1aSopenharmony_ci        beq             9f
267cabdff1aSopenharmony_ci
268cabdff1aSopenharmony_ci.if \wd >= 8
269cabdff1aSopenharmony_ci        vdup.u16        d0,  r5
270cabdff1aSopenharmony_ci
271cabdff1aSopenharmony_ci        vabd.u16        d6,  d20, d23    @ abs(p3 - p0)
272cabdff1aSopenharmony_ci        vabd.u16        d2,  d21, d23    @ abs(p2 - p0)
273cabdff1aSopenharmony_ci        vabd.u16        d1,  d22, d23    @ abs(p1 - p0)
274cabdff1aSopenharmony_ci        vabd.u16        \tmp1,  d25, d24 @ abs(q1 - q0)
275cabdff1aSopenharmony_ci        vabd.u16        \tmp2,  d26, d24 @ abs(q2 - q0)
276cabdff1aSopenharmony_ci        vabd.u16        \tmp3,  d27, d24 @ abs(q3 - q0)
277cabdff1aSopenharmony_ci        vmax.u16        d6,  d6,  d2
278cabdff1aSopenharmony_ci        vmax.u16        d1,  d1,  \tmp1
279cabdff1aSopenharmony_ci        vmax.u16        \tmp2,  \tmp2,  \tmp3
280cabdff1aSopenharmony_ci.if \wd == 16
281cabdff1aSopenharmony_ci        vabd.u16        d7,  d16, d23    @ abs(p7 - p0)
282cabdff1aSopenharmony_ci        vmax.u16        d6,  d6,  d1
283cabdff1aSopenharmony_ci        vabd.u16        d2,  d17, d23    @ abs(p6 - p0)
284cabdff1aSopenharmony_ci        vmax.u16        d6,  d6,  \tmp2
285cabdff1aSopenharmony_ci        vabd.u16        d1,  d18, d23    @ abs(p5 - p0)
286cabdff1aSopenharmony_ci        vcle.u16        d6,  d6,  d0     @ flat8in
287cabdff1aSopenharmony_ci        vabd.u16        d8,  d19, d23    @ abs(p4 - p0)
288cabdff1aSopenharmony_ci        vand            d6,  d6,  d4     @ flat8in && fm
289cabdff1aSopenharmony_ci        vabd.u16        d9,  d28, d24    @ abs(q4 - q0)
290cabdff1aSopenharmony_ci        vbic            d4,  d4,  d6     @ fm && !flat8in
291cabdff1aSopenharmony_ci        vabd.u16        d10, d29, d24    @ abs(q5 - q0)
292cabdff1aSopenharmony_ci        vabd.u16        d11, d30, d24    @ abs(q6 - q0)
293cabdff1aSopenharmony_ci        vabd.u16        d12, d31, d24    @ abs(q7 - q0)
294cabdff1aSopenharmony_ci
295cabdff1aSopenharmony_ci        vmax.u16        d7,  d7,  d2
296cabdff1aSopenharmony_ci        vmax.u16        d1,  d1,  d8
297cabdff1aSopenharmony_ci        vmax.u16        d9,  d9,  d10
298cabdff1aSopenharmony_ci        vmax.u16        d11, d11, d12
299cabdff1aSopenharmony_ci        @ The rest of the calculation of flat8out is interleaved below
300cabdff1aSopenharmony_ci.else
301cabdff1aSopenharmony_ci        @ The rest of the calculation of flat8in is interleaved below
302cabdff1aSopenharmony_ci.endif
303cabdff1aSopenharmony_ci.endif
304cabdff1aSopenharmony_ci
305cabdff1aSopenharmony_ci        @ Calculate the normal inner loop filter for 2 or 4 pixels
306cabdff1aSopenharmony_ci        vabd.u16        d5,  d22, d23           @ abs(p1 - p0)
307cabdff1aSopenharmony_ci.if \wd == 16
308cabdff1aSopenharmony_ci        vmax.u16        d7,  d7,  d1
309cabdff1aSopenharmony_ci        vmax.u16        d9,  d9,  d11
310cabdff1aSopenharmony_ci.elseif \wd == 8
311cabdff1aSopenharmony_ci        vmax.u16        d6,  d6,  d1
312cabdff1aSopenharmony_ci.endif
313cabdff1aSopenharmony_ci        vabd.u16        d1,  d25, d24           @ abs(q1 - q0)
314cabdff1aSopenharmony_ci.if \wd == 16
315cabdff1aSopenharmony_ci        vmax.u16        d7,  d7,  d9
316cabdff1aSopenharmony_ci.elseif \wd == 8
317cabdff1aSopenharmony_ci        vmax.u16        d6,  d6,  \tmp2
318cabdff1aSopenharmony_ci.endif
319cabdff1aSopenharmony_ci        vdup.u16        \tmp2,  r6              @ left shift for saturation
320cabdff1aSopenharmony_ci        vsub.u16        \tmp1,  d22, d25        @ p1 - q1
321cabdff1aSopenharmony_ci        vneg.s16        \tmp6,  \tmp2           @ negative left shift after saturation
322cabdff1aSopenharmony_ci        vmax.u16        d5,  d5,  d1            @ max(abs(p1 - p0), abs(q1 - q0))
323cabdff1aSopenharmony_ci        vsub.u16        \tmp3,   d24, d23       @ q0 - p0
324cabdff1aSopenharmony_ci        vmov.s16        \tmp5,  #3
325cabdff1aSopenharmony_ci.if \wd == 8
326cabdff1aSopenharmony_ci        vcle.u16        d6,  d6,  d0            @ flat8in
327cabdff1aSopenharmony_ci.endif
328cabdff1aSopenharmony_ci        vcle.u16        d5,  d5,  d3            @ !hev
329cabdff1aSopenharmony_ci.if \wd == 8
330cabdff1aSopenharmony_ci        vand            d6,  d6,  d4            @ flat8in && fm
331cabdff1aSopenharmony_ci.endif
332cabdff1aSopenharmony_ci        vqshl.s16       \tmp1,  \tmp1,  \tmp2
333cabdff1aSopenharmony_ci.if \wd == 16
334cabdff1aSopenharmony_ci        vcle.u16        d7,  d7,  d0            @ flat8out
335cabdff1aSopenharmony_ci.elseif \wd == 8
336cabdff1aSopenharmony_ci        vbic            d4,  d4,  d6            @ fm && !flat8in
337cabdff1aSopenharmony_ci.endif
338cabdff1aSopenharmony_ci        vand            d5,  d5,  d4            @ !hev && fm && !flat8in
339cabdff1aSopenharmony_ci.if \wd == 16
340cabdff1aSopenharmony_ci        vand            d7,  d7,  d6            @ flat8out && flat8in && fm
341cabdff1aSopenharmony_ci.endif
342cabdff1aSopenharmony_ci        vshl.s16        \tmp1,  \tmp1,  \tmp6   @ av_clip_int2p(p1 - q1, BIT_DEPTH - 1)
343cabdff1aSopenharmony_ci
344cabdff1aSopenharmony_ci        vmul.s16        \tmp3,  \tmp3,  \tmp5   @ 3 * (q0 - p0)
345cabdff1aSopenharmony_ci        vbic            \tmp1,  \tmp1,   d5     @ if (!hev) av_clip_int2p = 0
346cabdff1aSopenharmony_ci        vmov.s16        d2,  #4
347cabdff1aSopenharmony_ci        vadd.s16        \tmp3,  \tmp3,  \tmp1   @ 3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)]
348cabdff1aSopenharmony_ci        vmov.s16        d3,  #3
349cabdff1aSopenharmony_ci        vqshl.s16       \tmp1,  \tmp3,  \tmp2
350cabdff1aSopenharmony_ci        vmov.s16        \tmp5,  #0
351cabdff1aSopenharmony_ci        vshl.s16        \tmp1,  \tmp1,  \tmp6   @ av_clip_int2p(3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)], BIT_DEPTH - 1) = f
352cabdff1aSopenharmony_ci        vdup.u16        \tmp6,  r7              @ max pixel value
353cabdff1aSopenharmony_ci.if \wd == 16
354cabdff1aSopenharmony_ci        vbic            d6,  d6,  d7            @ fm && flat8in && !flat8out
355cabdff1aSopenharmony_ci.endif
356cabdff1aSopenharmony_ci
357cabdff1aSopenharmony_ci        vshr.u16        \tmp2,  \tmp6,  #1      @ (1 << (BIT_DEPTH - 1)) - 1
358cabdff1aSopenharmony_ci
359cabdff1aSopenharmony_ci        vadd.s16        \tmp3,  \tmp1,  d2      @ f + 4
360cabdff1aSopenharmony_ci        vadd.s16        \tmp4,  \tmp1,  d3      @ f + 3
361cabdff1aSopenharmony_ci        vmin.s16        \tmp3,  \tmp3,  \tmp2   @ FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1)
362cabdff1aSopenharmony_ci        vmin.s16        \tmp4,  \tmp4,  \tmp2   @ FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1)
363cabdff1aSopenharmony_ci        vshr.s16        \tmp3,  \tmp3,  #3      @ f1
364cabdff1aSopenharmony_ci        vshr.s16        \tmp4,  \tmp4,  #3      @ f2
365cabdff1aSopenharmony_ci
366cabdff1aSopenharmony_ci        vadd.s16        d0,  d23, \tmp4         @ p0 + f2
367cabdff1aSopenharmony_ci        vsub.s16        d2,  d24, \tmp3         @ q0 - f1
368cabdff1aSopenharmony_ci        vmin.s16        d0,  d0,  \tmp6
369cabdff1aSopenharmony_ci        vmin.s16        d2,  d2,  \tmp6
370cabdff1aSopenharmony_ci        vrshr.s16       \tmp3,  \tmp3,  #1      @ f = (f1 + 1) >> 1
371cabdff1aSopenharmony_ci        vmax.s16        d0,  d0,  \tmp5         @ out p0
372cabdff1aSopenharmony_ci        vmax.s16        d2,  d2,  \tmp5         @ out q0
373cabdff1aSopenharmony_ci        vbit            d23, d0,  d4            @ if (fm && !flat8in)
374cabdff1aSopenharmony_ci        vbit            d24, d2,  d4
375cabdff1aSopenharmony_ci
376cabdff1aSopenharmony_ci        vadd.s16        d0,  d22, \tmp3         @ p1 + f
377cabdff1aSopenharmony_ci        vsub.s16        d2,  d25, \tmp3         @ q1 - f
378cabdff1aSopenharmony_ci.if \wd >= 8
379cabdff1aSopenharmony_ci        vmov            r8,  r9,  d6
380cabdff1aSopenharmony_ci.endif
381cabdff1aSopenharmony_ci        vmin.s16        d0,  d0,  \tmp6
382cabdff1aSopenharmony_ci        vmin.s16        d2,  d2,  \tmp6
383cabdff1aSopenharmony_ci.if \wd >= 8
384cabdff1aSopenharmony_ci        orrs            r8,  r8,  r9
385cabdff1aSopenharmony_ci.endif
386cabdff1aSopenharmony_ci        vmax.s16        d0,  d0,  \tmp5         @ out p1
387cabdff1aSopenharmony_ci        vmax.s16        d2,  d2,  \tmp5         @ out q1
388cabdff1aSopenharmony_ci        vbit            d22, d0,  d5            @ if (!hev && fm && !flat8in)
389cabdff1aSopenharmony_ci        vbit            d25, d2,  d5
390cabdff1aSopenharmony_ci
391cabdff1aSopenharmony_ci.if \wd >= 8
392cabdff1aSopenharmony_ci        @ If no pixels need flat8in, jump to flat8out
393cabdff1aSopenharmony_ci        @ (or to a writeout of the inner 4 pixels, for wd=8)
394cabdff1aSopenharmony_ci        beq             6f
395cabdff1aSopenharmony_ci
396cabdff1aSopenharmony_ci        @ flat8in
397cabdff1aSopenharmony_ci        vadd.u16        \tmp1,  d20, d21
398cabdff1aSopenharmony_ci        vadd.u16        \tmp3,  d22, d25
399cabdff1aSopenharmony_ci        vadd.u16        \tmp5,  d20, d22
400cabdff1aSopenharmony_ci        vadd.u16        \tmp7,  d23, d26
401cabdff1aSopenharmony_ci        vadd.u16        d0,  \tmp1,  \tmp1
402cabdff1aSopenharmony_ci        vadd.u16        d0,  d0,  d23
403cabdff1aSopenharmony_ci        vadd.u16        d0,  d0,  d24
404cabdff1aSopenharmony_ci        vadd.u16        d0,  d0,  \tmp5
405cabdff1aSopenharmony_ci        vsub.s16        \tmp3,  \tmp3,  \tmp1
406cabdff1aSopenharmony_ci        vsub.s16        \tmp7,  \tmp7,  \tmp5
407cabdff1aSopenharmony_ci        vrshr.u16       d2,  d0,  #3            @ out p2
408cabdff1aSopenharmony_ci
409cabdff1aSopenharmony_ci        vadd.u16        d0,  d0,  \tmp3
410cabdff1aSopenharmony_ci        vadd.u16        \tmp1,  d20, d23
411cabdff1aSopenharmony_ci        vadd.u16        \tmp3,  d24, d27
412cabdff1aSopenharmony_ci        vrshr.u16       d3,  d0,  #3            @ out p1
413cabdff1aSopenharmony_ci
414cabdff1aSopenharmony_ci        vadd.u16        d0,  d0,  \tmp7
415cabdff1aSopenharmony_ci        vsub.s16        \tmp3,  \tmp3,  \tmp1
416cabdff1aSopenharmony_ci        vadd.u16        \tmp5,  d21, d24
417cabdff1aSopenharmony_ci        vadd.u16        \tmp7,  d25, d27
418cabdff1aSopenharmony_ci        vrshr.u16       d4,  d0,  #3            @ out p0
419cabdff1aSopenharmony_ci
420cabdff1aSopenharmony_ci        vadd.u16        d0,  d0,  \tmp3
421cabdff1aSopenharmony_ci        vsub.s16        \tmp7,  \tmp7,  \tmp5
422cabdff1aSopenharmony_ci        vadd.u16        \tmp1,  d22, d25
423cabdff1aSopenharmony_ci        vadd.u16        \tmp3,  d26, d27
424cabdff1aSopenharmony_ci        vrshr.u16       d5,  d0,  #3            @ out d0
425cabdff1aSopenharmony_ci
426cabdff1aSopenharmony_ci        vadd.u16        d0,  d0,  \tmp7
427cabdff1aSopenharmony_ci        vsub.s16        \tmp3,  \tmp3,  \tmp1
428cabdff1aSopenharmony_ci        vrshr.u16       \tmp5,  d0,  #3         @ out q1
429cabdff1aSopenharmony_ci
430cabdff1aSopenharmony_ci        vadd.u16        d0,  d0,  \tmp3
431cabdff1aSopenharmony_ci        @ The output here is written back into the input registers. This doesn't
432cabdff1aSopenharmony_ci        @ matter for the flat8out part below, since we only update those pixels
433cabdff1aSopenharmony_ci        @ which won't be touched below.
434cabdff1aSopenharmony_ci        vbit            d21, d2,  d6
435cabdff1aSopenharmony_ci        vbit            d22, d3,  d6
436cabdff1aSopenharmony_ci        vbit            d23, d4,  d6
437cabdff1aSopenharmony_ci        vrshr.u16       \tmp6,  d0,  #3         @ out q2
438cabdff1aSopenharmony_ci        vbit            d24, d5,  d6
439cabdff1aSopenharmony_ci        vbit            d25, \tmp5,  d6
440cabdff1aSopenharmony_ci        vbit            d26, \tmp6,  d6
441cabdff1aSopenharmony_ci.endif
442cabdff1aSopenharmony_ci.if \wd == 16
443cabdff1aSopenharmony_ci6:
444cabdff1aSopenharmony_ci        vorr            d2,  d6,  d7
445cabdff1aSopenharmony_ci        vmov            r8,  r9,  d2
446cabdff1aSopenharmony_ci        orrs            r8,  r8,  r9
447cabdff1aSopenharmony_ci        @ If no pixels needed flat8in nor flat8out, jump to a
448cabdff1aSopenharmony_ci        @ writeout of the inner 4 pixels
449cabdff1aSopenharmony_ci        beq             7f
450cabdff1aSopenharmony_ci        vmov            r8,  r9,  d7
451cabdff1aSopenharmony_ci        orrs            r8,  r8,  r9
452cabdff1aSopenharmony_ci        @ If no pixels need flat8out, jump to a writeout of the inner 6 pixels
453cabdff1aSopenharmony_ci        beq             8f
454cabdff1aSopenharmony_ci
455cabdff1aSopenharmony_ci        @ flat8out
456cabdff1aSopenharmony_ci        @ This writes all outputs into d2-d17 (skipping d6 and d16).
457cabdff1aSopenharmony_ci        @ If this part is skipped, the output is read from d21-d26 (which is the input
458cabdff1aSopenharmony_ci        @ to this section).
459cabdff1aSopenharmony_ci        vshl.u16        d0,  d16, #3  @ 8 * d16
460cabdff1aSopenharmony_ci        vsub.u16        d0,  d0,  d16 @ 7 * d16
461cabdff1aSopenharmony_ci        vadd.u16        d0,  d0,  d17
462cabdff1aSopenharmony_ci        vadd.u16        d8,  d17, d18
463cabdff1aSopenharmony_ci        vadd.u16        d10, d19, d20
464cabdff1aSopenharmony_ci        vadd.s16        d0,  d0,  d8
465cabdff1aSopenharmony_ci        vadd.u16        d8,  d16, d17
466cabdff1aSopenharmony_ci        vadd.u16        d12, d21, d22
467cabdff1aSopenharmony_ci        vadd.s16        d0,  d0,  d10
468cabdff1aSopenharmony_ci        vadd.u16        d10, d18, d25
469cabdff1aSopenharmony_ci        vadd.u16        d14, d23, d24
470cabdff1aSopenharmony_ci        vsub.s16        d10, d10, d8
471cabdff1aSopenharmony_ci        vadd.s16        d0,  d0,  d12
472cabdff1aSopenharmony_ci        vadd.s16        d0,  d0,  d14
473cabdff1aSopenharmony_ci        vadd.u16        d12, d16, d18
474cabdff1aSopenharmony_ci        vadd.u16        d14, d19, d26
475cabdff1aSopenharmony_ci        vrshr.u16       d2,  d0,  #4
476cabdff1aSopenharmony_ci
477cabdff1aSopenharmony_ci        vadd.s16        d0,  d0,  d10
478cabdff1aSopenharmony_ci        vadd.u16        d8,  d16, d19
479cabdff1aSopenharmony_ci        vadd.u16        d10, d20, d27
480cabdff1aSopenharmony_ci        vsub.s16        d14, d14, d12
481cabdff1aSopenharmony_ci        vbif            d2,  d17, d7
482cabdff1aSopenharmony_ci        vrshr.u16       d3,  d0,  #4
483cabdff1aSopenharmony_ci
484cabdff1aSopenharmony_ci        vadd.s16        d0,  d0,  d14
485cabdff1aSopenharmony_ci        vadd.u16        d12, d16, d20
486cabdff1aSopenharmony_ci        vadd.u16        d14, d21, d28
487cabdff1aSopenharmony_ci        vsub.s16        d10, d10, d8
488cabdff1aSopenharmony_ci        vbif            d3,  d18, d7
489cabdff1aSopenharmony_ci        vrshr.u16       d4,  d0,  #4
490cabdff1aSopenharmony_ci
491cabdff1aSopenharmony_ci        vadd.s16        d0,  d0,  d10
492cabdff1aSopenharmony_ci        vadd.u16        d8,  d16, d21
493cabdff1aSopenharmony_ci        vadd.u16        d10, d22, d29
494cabdff1aSopenharmony_ci        vsub.s16        d14, d14, d12
495cabdff1aSopenharmony_ci        vbif            d4,  d19, d7
496cabdff1aSopenharmony_ci        vrshr.u16       d5,  d0,  #4
497cabdff1aSopenharmony_ci
498cabdff1aSopenharmony_ci        vadd.s16        d0,  d0,  d14
499cabdff1aSopenharmony_ci        vadd.u16        d12, d16, d22
500cabdff1aSopenharmony_ci        vadd.u16        d14, d23, d30
501cabdff1aSopenharmony_ci        vsub.s16        d10, d10, d8
502cabdff1aSopenharmony_ci        vbif            d5,  d20, d7
503cabdff1aSopenharmony_ci        vrshr.u16       d6,  d0,  #4
504cabdff1aSopenharmony_ci
505cabdff1aSopenharmony_ci        vadd.s16        d0,  d0,  d10
506cabdff1aSopenharmony_ci        vadd.u16        d10, d16, d23
507cabdff1aSopenharmony_ci        vsub.s16        d14, d14, d12
508cabdff1aSopenharmony_ci        vadd.u16        d12, d24, d31
509cabdff1aSopenharmony_ci        vbif            d6,  d21, d7
510cabdff1aSopenharmony_ci        vrshr.u16       d8,  d0,  #4
511cabdff1aSopenharmony_ci
512cabdff1aSopenharmony_ci        vadd.s16        d0,  d0,  d14
513cabdff1aSopenharmony_ci        vsub.s16        d10, d12, d10
514cabdff1aSopenharmony_ci        vadd.u16        d12, d17, d24
515cabdff1aSopenharmony_ci        vadd.u16        d14, d25, d31
516cabdff1aSopenharmony_ci        vbif            d8,  d22, d7
517cabdff1aSopenharmony_ci        vrshr.u16       d9,  d0,  #4
518cabdff1aSopenharmony_ci
519cabdff1aSopenharmony_ci        vadd.s16        d0,  d0,  d10
520cabdff1aSopenharmony_ci        vsub.s16        d14, d14, d12
521cabdff1aSopenharmony_ci        vadd.u16        d12, d26, d31
522cabdff1aSopenharmony_ci        vbif            d9,  d23, d7
523cabdff1aSopenharmony_ci        vrshr.u16       d10, d0,  #4
524cabdff1aSopenharmony_ci
525cabdff1aSopenharmony_ci        vadd.s16        d0,  d0,  d14
526cabdff1aSopenharmony_ci        vadd.u16        d14, d18, d25
527cabdff1aSopenharmony_ci        vadd.u16        d18, d19, d26
528cabdff1aSopenharmony_ci        vsub.s16        d12, d12, d14
529cabdff1aSopenharmony_ci        vadd.u16        d14, d27, d31
530cabdff1aSopenharmony_ci        vbif            d10, d24, d7
531cabdff1aSopenharmony_ci        vrshr.u16       d11, d0,  #4
532cabdff1aSopenharmony_ci
533cabdff1aSopenharmony_ci        vadd.s16        d0,  d0,  d12
534cabdff1aSopenharmony_ci        vadd.u16        d12, d20, d27
535cabdff1aSopenharmony_ci        vsub.s16        d14, d14, d18
536cabdff1aSopenharmony_ci        vadd.u16        d18, d28, d31
537cabdff1aSopenharmony_ci        vbif            d11, d25, d7
538cabdff1aSopenharmony_ci        vsub.s16        d18, d18, d12
539cabdff1aSopenharmony_ci        vrshr.u16       d12, d0,  #4
540cabdff1aSopenharmony_ci
541cabdff1aSopenharmony_ci        vadd.s16        d0,  d0,  d14
542cabdff1aSopenharmony_ci        vadd.u16        d14, d21, d28
543cabdff1aSopenharmony_ci        vadd.u16        d20, d29, d31
544cabdff1aSopenharmony_ci        vbif            d12, d26, d7
545cabdff1aSopenharmony_ci        vrshr.u16       d13, d0,  #4
546cabdff1aSopenharmony_ci
547cabdff1aSopenharmony_ci        vadd.s16        d0,  d0,  d18
548cabdff1aSopenharmony_ci        vsub.s16        d20, d20, d14
549cabdff1aSopenharmony_ci        vadd.u16        d18, d22, d29
550cabdff1aSopenharmony_ci        vadd.u16        d22, d30, d31
551cabdff1aSopenharmony_ci        vbif            d13, d27, d7
552cabdff1aSopenharmony_ci        vrshr.u16       d14, d0,  #4
553cabdff1aSopenharmony_ci
554cabdff1aSopenharmony_ci        vadd.s16        d0,  d0,  d20
555cabdff1aSopenharmony_ci        vsub.s16        d22, d22, d18
556cabdff1aSopenharmony_ci        vbif            d14, d28, d7
557cabdff1aSopenharmony_ci        vrshr.u16       d15, d0,  #4
558cabdff1aSopenharmony_ci
559cabdff1aSopenharmony_ci        vadd.s16        d0,  d0,  d22
560cabdff1aSopenharmony_ci        vbif            d15, d29, d7
561cabdff1aSopenharmony_ci        vrshr.u16       d17, d0,  #4
562cabdff1aSopenharmony_ci        vbif            d17, d30, d7
563cabdff1aSopenharmony_ci.endif
564cabdff1aSopenharmony_ci.endm
565cabdff1aSopenharmony_ci
@ Expand the loop_filter_q macro (defined earlier in this file) with its
@ argument set to 4. Used by the vp9_loop_filter_{v,h}_4_8_16_neon routines.
.macro loop_filter_q_4
        loop_filter_q   4
.endm
569cabdff1aSopenharmony_ci
@ Expand the loop_filter_q macro (defined earlier in this file) with its
@ argument set to 8. Used by the vp9_loop_filter_{v,h}_8_8_16_neon routines.
.macro loop_filter_q_8
        loop_filter_q   8
.endm
573cabdff1aSopenharmony_ci
@ Expand the loop_filter macro with 16 as its first argument and d8-d15 as
@ its remaining register arguments. d8-d15 are callee-saved under the AAPCS,
@ which is why the exported frontends wrap their calls in vpush/vpop {q4-q7}.
.macro loop_filter_16
        loop_filter     16, d8,  d9,  d10, d11, d12, d13, d14, d15
.endm
577cabdff1aSopenharmony_ci
578cabdff1aSopenharmony_ci
@ The public functions in this file have the following signature:
@ void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);
581cabdff1aSopenharmony_ci
@ bpp_frontend func, bpp
@
@ Emits the exported function ff_<func>_<bpp>_neon: a bit-depth specific
@ wrapper that prepares registers and calls the internal <func>_16_neon
@ routine once.
@
@ In (public signature, AAPCS argument passing):
@   r0 = dst, r1 = stride, r2 = mb_lim, r3 = lim, first stack slot = hev_thr
@ Passed on to the internal routine:
@   r2-r4 = mb_lim, lim, hev_thr, each shifted left by (bpp - 8)
@   r5    = 1 << (bpp - 8)
@   r6    = 16 - bpp
@   r7    = (1 << bpp) - 1, the largest value representable in bpp bits
@ NOTE(review): how r5-r7 are consumed is decided by the loop_filter macros
@ defined earlier in the file; confirm there before changing them.
.macro bpp_frontend func, bpp
function ff_\func\()_\bpp\()_neon, export=1
        push            {r4-r9,lr}
        ldr             r4,  [sp, #28]          @ 7 registers pushed = 28 bytes; this is hev_thr
        vpush           {q4-q7}                 @ d8-d15 are callee-saved
        lsl             r2,  r2,  #\bpp - 8
        lsl             r3,  r3,  #\bpp - 8
        lsl             r4,  r4,  #\bpp - 8
        mov             r5,  #1 << (\bpp - 8)
        mov             r6,  #16 - \bpp
        movw            r7,  #((1 << \bpp) - 1)
        bl              \func\()_16_neon
        vpop            {q4-q7}
        pop             {r4-r9,pc}              @ restore and return in one step
endfunc
.endm
598cabdff1aSopenharmony_ci
@ bpp_frontends func
@
@ Emits the 10 bpp and the 12 bpp exported wrappers for one internal
@ routine (see bpp_frontend).
.macro bpp_frontends func
        bpp_frontend    \func, 10
        bpp_frontend    \func, 12
.endm
603cabdff1aSopenharmony_ci
@ bpp_frontend_rep func, suffix, int_suffix, rep, dir, bpp
@
@ Emits the exported function ff_<func>_<suffix>_<bpp>_neon, which calls the
@ internal routine <func>_<int_suffix>_16_neon repeatedly on adjacent
@ positions: four times when rep >= 4, otherwise twice.
@
@ Between calls r0 is advanced by
@   dir == h : 4 * stride  (r1, lsl #2)
@   otherwise: 8 bytes
@ This relies on the internal routine returning with r0 equal to the value
@ it was called with.
@
@ Register set-up before the first call is the same as in bpp_frontend:
@   r2-r4 = mb_lim, lim, hev_thr shifted left by (bpp - 8)
@   r5 = 1 << (bpp - 8), r6 = 16 - bpp, r7 = (1 << bpp) - 1
.macro bpp_frontend_rep func, suffix, int_suffix, rep, dir, bpp
function ff_\func\()_\suffix\()_\bpp\()_neon, export=1
        push            {r4-r9,lr}
        ldr             r4,  [sp, #28]          @ 7 registers pushed = 28 bytes; this is hev_thr
        vpush           {q4-q7}                 @ d8-d15 are callee-saved
        lsl             r2,  r2,  #\bpp - 8
        lsl             r3,  r3,  #\bpp - 8
        lsl             r4,  r4,  #\bpp - 8
        mov             r5,  #1 << (\bpp - 8)
        mov             r6,  #16 - \bpp
        movw            r7,  #((1 << \bpp) - 1)
        bl              \func\()_\int_suffix\()_16_neon
.ifc \dir,h
        add             r0,  r0,  r1, lsl #2
.else
        add             r0,  r0,  #8
.endif
        bl              \func\()_\int_suffix\()_16_neon
.if \rep >= 4
        @ Two further calls, stepping exactly as above.
.ifc \dir,h
        add             r0,  r0,  r1, lsl #2
        bl              \func\()_\int_suffix\()_16_neon
        add             r0,  r0,  r1, lsl #2
        bl              \func\()_\int_suffix\()_16_neon
.else
        add             r0,  r0,  #8
        bl              \func\()_\int_suffix\()_16_neon
        add             r0,  r0,  #8
        bl              \func\()_\int_suffix\()_16_neon
.endif
.endif
        vpop            {q4-q7}
        pop             {r4-r9,pc}
endfunc
.endm
639cabdff1aSopenharmony_ci
@ bpp_frontends_rep func, suffix, int_suffix, rep, dir
@
@ Emits the 10 bpp and the 12 bpp variants of a repeating wrapper
@ (see bpp_frontend_rep).
.macro bpp_frontends_rep func, suffix, int_suffix, rep, dir
        bpp_frontend_rep \func, \suffix, \int_suffix, \rep, \dir, 10
        bpp_frontend_rep \func, \suffix, \int_suffix, \rep, \dir, 12
.endm
644cabdff1aSopenharmony_ci
@ bpp_frontend_mix2 wd1, wd2, dir, bpp
@
@ Emits the exported function ff_vp9_loop_filter_<dir>_<wd1><wd2>_16_<bpp>_neon.
@ It filters two adjacent 8 pixel segments, the first with
@ vp9_loop_filter_<dir>_<wd1>_8_16_neon and the second with
@ vp9_loop_filter_<dir>_<wd2>_8_16_neon.
@
@ Each of mb_lim, lim and hev_thr carries two values: bits 0-7 apply to the
@ first segment, and everything from bit 8 upwards to the second.
@ NOTE(review): the second value is extracted with a plain lsr #8 and no
@ mask, so bits above 15 are assumed to be zero; confirm against callers.
@
@ Between the calls r0 is advanced by
@   dir == h : 8 * stride  (r1, lsl #3)
@   otherwise: 16 bytes
@ r5-r7 are set once and reused by the second call:
@   r5 = 1 << (bpp - 8), r6 = 16 - bpp, r7 = (1 << bpp) - 1
.macro bpp_frontend_mix2 wd1, wd2, dir, bpp
function ff_vp9_loop_filter_\dir\()_\wd1\()\wd2\()_16_\bpp\()_neon, export=1
        push            {r4-r9,lr}
        ldr             r4,  [sp, #28]          @ 7 registers pushed = 28 bytes; this is hev_thr
        vpush           {q4-q7}                 @ d8-d15 are callee-saved
        push            {r2, r3, r4}            @ keep the packed thresholds for the second call
        and             r2,  r2,  #0xff
        and             r3,  r3,  #0xff
        and             r4,  r4,  #0xff
        lsl             r2,  r2,  #\bpp - 8
        lsl             r3,  r3,  #\bpp - 8
        lsl             r4,  r4,  #\bpp - 8
        mov             r5,  #1 << (\bpp - 8)
        mov             r6,  #16 - \bpp
        movw            r7,  #((1 << \bpp) - 1)
        bl              vp9_loop_filter_\dir\()_\wd1\()_8_16_neon
.ifc \dir,h
        add             r0,  r0,  r1, lsl #3
.else
        add             r0,  r0,  #16
.endif
        pop             {r2, r3, r4}
        lsr             r2,  r2,  #8
        lsr             r3,  r3,  #8
        lsr             r4,  r4,  #8
        lsl             r2,  r2,  #\bpp - 8
        lsl             r3,  r3,  #\bpp - 8
        lsl             r4,  r4,  #\bpp - 8
        bl              vp9_loop_filter_\dir\()_\wd2\()_8_16_neon
        vpop            {q4-q7}
        pop             {r4-r9,pc}
endfunc
.endm
678cabdff1aSopenharmony_ci
@ bpp_frontends_mix2 wd1, wd2
@
@ Emits all four mixed-width wrappers for one (wd1, wd2) pair: directions
@ v and h, each at 10 and 12 bpp (see bpp_frontend_mix2).
.macro bpp_frontends_mix2 wd1, wd2
        bpp_frontend_mix2 \wd1, \wd2, v, 10
        bpp_frontend_mix2 \wd1, \wd2, v, 12
        bpp_frontend_mix2 \wd1, \wd2, h, 10
        bpp_frontend_mix2 \wd1, \wd2, h, 12
.endm
685cabdff1aSopenharmony_ci
@ vp9_loop_filter_v_4_8_16_neon
@
@ Internal routine, called with bl from the bpp frontends.
@ Loads the four rows above dst (p3..p0) and the four rows starting at dst
@ (q0..q3), eight 16-bit pixels per row, runs loop_filter_q_4 and writes back
@ the four middle rows (p1, p0, q0, q1).
@
@ In:  r0 = dst, r1 = stride in bytes, r2-r7 as prepared by the frontend.
@      Rows are accessed with a :128 alignment hint, so every row address
@      must be 16-byte aligned.
@ Out: r0 = dst (same value as on entry).
@ Clobbers: r12, q8-q15, plus whatever loop_filter_q_4 uses.
function vp9_loop_filter_v_4_8_16_neon
        sub             r12, r0,  r1, lsl #2
        vld1.16         {q8},  [r12,:128], r1 @ p3
        vld1.16         {q12}, [r0, :128], r1 @ q0
        vld1.16         {q9},  [r12,:128], r1 @ p2
        vld1.16         {q13}, [r0, :128], r1 @ q1
        vld1.16         {q10}, [r12,:128], r1 @ p1
        vld1.16         {q14}, [r0, :128], r1 @ q2
        vld1.16         {q11}, [r12,:128], r1 @ p0
        vld1.16         {q15}, [r0, :128], r1 @ q3
        sub             r0,  r0,  r1, lsl #2  @ r0  = dst again
        sub             r12, r12, r1, lsl #1  @ r12 = dst - 2 * stride (row p1)

        loop_filter_q_4

        vst1.16         {q10}, [r12,:128], r1
        vst1.16         {q12}, [r0, :128], r1
        vst1.16         {q11}, [r12,:128], r1
        vst1.16         {q13}, [r0, :128], r1
        sub             r0,  r0,  r1, lsl #1  @ r0  = dst again
        @ NOTE(review): label 9 is not referenced in this body; presumably the
        @ loop_filter macro branches here when nothing has to be written back.
9:
        bx              lr
endfunc

@ Exported 10 and 12 bpp entry points for the routine above.
bpp_frontends vp9_loop_filter_v_4_8
711cabdff1aSopenharmony_ci
712cabdff1aSopenharmony_ci
@ vp9_loop_filter_h_4_8_16_neon
@
@ Internal routine, called with bl from the bpp frontends.
@ Loads 8 rows of eight 16-bit pixels starting 8 bytes (4 pixels) to the left
@ of dst, transposes them so that q8..q15 hold one pixel column each, runs
@ loop_filter_q_4, transposes the four middle columns back and stores four
@ pixels per row.
@
@ In:  r0 = dst, r1 = stride in bytes, r2-r7 as prepared by the frontend.
@      Loads use a :64 alignment hint, so dst - 8 must be 8-byte aligned in
@      every row.
@ Out: r0 = dst (same value as on entry).
@ Clobbers: r12, q8-q15, plus whatever the macros use.
function vp9_loop_filter_h_4_8_16_neon
        sub             r12, r0,  #8
        add             r0,  r12, r1, lsl #2
        @ r12 walks rows 0-3, r0 walks rows 4-7.
        vld1.16         {q8},  [r12,:64], r1
        vld1.16         {q12}, [r0, :64], r1
        vld1.16         {q9},  [r12,:64], r1
        vld1.16         {q13}, [r0, :64], r1
        vld1.16         {q10}, [r12,:64], r1
        vld1.16         {q14}, [r0, :64], r1
        vld1.16         {q11}, [r12,:64], r1
        vld1.16         {q15}, [r0, :64], r1

        sub             r12, r12, r1, lsl #2
        sub             r0,  r0,  r1, lsl #2
        @ Move r0/r12 forward by 2 pixels; we don't need to rewrite the
        @ outermost 2 pixels since they aren't changed.
        add             r12, r12, #4
        add             r0,  r0,  #4

        transpose16_q_8x8 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31

        loop_filter_q_4

        @ Only the middle 4 pixels are written back; after the loop filter
        @ they are in q10, q11, q12, q13, ordered as rows (8x4 pixels).
        @ They have to be transposed back to columns, which is done with a
        @ 4x4 transpose (in practice two 4x4 transposes of the two 4x4 halves
        @ of the 8x4 pixels, giving 4x8 pixels).
        transpose16_4x4 q10, q11, q12, q13

        vst1.16         {d20}, [r12], r1
        vst1.16         {d21}, [r0],  r1
        vst1.16         {d22}, [r12], r1
        vst1.16         {d23}, [r0],  r1
        vst1.16         {d24}, [r12], r1
        vst1.16         {d25}, [r0],  r1
        vst1.16         {d26}, [r12], r1
        vst1.16         {d27}, [r0],  r1
        sub             r12, r12, r1, lsl #2  @ r12 = dst - 4 bytes, row 0
        @ NOTE(review): label 9 is not referenced in this body; presumably the
        @ loop_filter macro branches here when nothing has to be written back.
        @ r12 is dst - 4 on both paths, so the add below restores r0 = dst.
9:
        add             r0,  r12, #4
        bx              lr
endfunc

@ Exported 10 and 12 bpp entry points for the routine above.
bpp_frontends vp9_loop_filter_h_4_8
758cabdff1aSopenharmony_ci
759cabdff1aSopenharmony_ci
@ vp9_loop_filter_v_8_8_16_neon
@
@ Internal routine, called with bl from the bpp frontends.
@ Loads the four rows above dst (p3..p0) and the four rows starting at dst
@ (q0..q3), eight 16-bit pixels per row, and runs loop_filter_q_8.
@ The main path writes back six rows (p2..q2); the path at label 6 writes
@ back only the four inner rows (p1..q1).
@
@ In:  r0 = dst, r1 = stride in bytes, r2-r7 as prepared by the frontend.
@      Rows are accessed with a :128 alignment hint, so every row address
@      must be 16-byte aligned.
@ Out: r0 = dst (same value as on entry) on every exit path.
@ Clobbers: r12, q8-q15, plus whatever loop_filter_q_8 uses.
function vp9_loop_filter_v_8_8_16_neon
        sub             r12, r0,  r1, lsl #2
        vld1.16         {q8},  [r12,:128], r1 @ p3
        vld1.16         {q12}, [r0, :128], r1 @ q0
        vld1.16         {q9},  [r12,:128], r1 @ p2
        vld1.16         {q13}, [r0, :128], r1 @ q1
        vld1.16         {q10}, [r12,:128], r1 @ p1
        vld1.16         {q14}, [r0, :128], r1 @ q2
        vld1.16         {q11}, [r12,:128], r1 @ p0
        vld1.16         {q15}, [r0, :128], r1 @ q3
        sub             r12, r12, r1, lsl #2
        sub             r0,  r0,  r1, lsl #2  @ r0  = dst again
        add             r12, r12, r1          @ r12 = dst - 3 * stride (row p2)

        loop_filter_q_8

        vst1.16         {q9},  [r12,:128], r1
        vst1.16         {q12}, [r0, :128], r1
        vst1.16         {q10}, [r12,:128], r1
        vst1.16         {q13}, [r0, :128], r1
        vst1.16         {q11}, [r12,:128], r1
        vst1.16         {q14}, [r0, :128], r1
        sub             r0,  r0,  r1, lsl #1
        sub             r0,  r0,  r1          @ r0 = dst again (3 rows back)
        @ NOTE(review): labels 9 and 6 are not referenced in this body;
        @ presumably the loop_filter macro branches to 9 when nothing has to
        @ be written back, and to 6 when only the inner 4 rows were filtered.
9:
        bx              lr
6:
        sub             r12, r0,  r1, lsl #1  @ r12 = dst - 2 * stride (row p1)
        vst1.16         {q10}, [r12,:128], r1
        vst1.16         {q12}, [r0, :128], r1
        vst1.16         {q11}, [r12,:128], r1
        vst1.16         {q13}, [r0, :128], r1
        sub             r0,  r0,  r1, lsl #1  @ r0 = dst again
        bx              lr
endfunc

@ Exported 10 and 12 bpp entry points for the routine above.
bpp_frontends vp9_loop_filter_v_8_8
797cabdff1aSopenharmony_ci
798cabdff1aSopenharmony_ci
@ vp9_loop_filter_h_8_8_16_neon
@
@ Internal routine, called with bl from the bpp frontends.
@ Loads 8 rows of eight 16-bit pixels starting 8 bytes (4 pixels) to the left
@ of dst, transposes them so that q8..q15 hold one pixel column each, and
@ runs loop_filter_q_8. The main path transposes everything back and rewrites
@ all 8 pixels of each row; the path at label 6 rewrites only the middle 4.
@
@ In:  r0 = dst, r1 = stride in bytes, r2-r7 as prepared by the frontend.
@      Main-path accesses use a :64 alignment hint, so dst - 8 must be 8-byte
@      aligned in every row.
@ Out: r0 = dst (same value as on entry) on every exit path.
@ Clobbers: r12, q8-q15, plus whatever the macros use.
function vp9_loop_filter_h_8_8_16_neon
        sub             r12, r0,  #8
        add             r0,  r12, r1, lsl #2
        @ r12 walks rows 0-3, r0 walks rows 4-7.
        vld1.16         {q8},  [r12,:64], r1
        vld1.16         {q12}, [r0, :64], r1
        vld1.16         {q9},  [r12,:64], r1
        vld1.16         {q13}, [r0, :64], r1
        vld1.16         {q10}, [r12,:64], r1
        vld1.16         {q14}, [r0, :64], r1
        vld1.16         {q11}, [r12,:64], r1
        vld1.16         {q15}, [r0, :64], r1

        sub             r12, r12, r1, lsl #2
        sub             r0,  r0,  r1, lsl #2

        transpose16_q_8x8 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31

        loop_filter_q_8

        @ Even though only 6 pixels per row have been changed, we write the
        @ full 8 pixel registers.
        transpose16_q_8x8 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31

        vst1.16         {q8},  [r12,:64], r1
        vst1.16         {q12}, [r0, :64], r1
        vst1.16         {q9},  [r12,:64], r1
        vst1.16         {q13}, [r0, :64], r1
        vst1.16         {q10}, [r12,:64], r1
        vst1.16         {q14}, [r0, :64], r1
        vst1.16         {q11}, [r12,:64], r1
        vst1.16         {q15}, [r0, :64], r1
        sub             r12, r12, r1, lsl #2  @ r12 = dst - 8 bytes, row 0
        @ NOTE(review): label 9 is not referenced in this body; presumably the
        @ loop_filter macro branches here when nothing has to be written back.
        @ r12 is dst - 8 on both paths, so the add below restores r0 = dst.
9:
        add             r0,  r12, #8
        bx              lr
6:
        @ If we didn't need to do the flat8in part, we use the same writeback
        @ as in loop_filter_h_4_8.
        add             r12, r12, #4
        add             r0,  r0,  #4
        transpose16_4x4 q10, q11, q12, q13

        vst1.16         {d20}, [r12], r1
        vst1.16         {d21}, [r0],  r1
        vst1.16         {d22}, [r12], r1
        vst1.16         {d23}, [r0],  r1
        vst1.16         {d24}, [r12], r1
        vst1.16         {d25}, [r0],  r1
        vst1.16         {d26}, [r12], r1
        vst1.16         {d27}, [r0],  r1
        sub             r12, r12, r1, lsl #2  @ r12 = dst - 4 bytes, row 0
        add             r0,  r12, #4          @ r0  = dst again
        bx              lr
endfunc

@ Exported 10 and 12 bpp entry points for the routine above.
bpp_frontends vp9_loop_filter_h_8_8
855cabdff1aSopenharmony_ci
@ Exported mixed-width entry points: for every (wd1, wd2) pair below this
@ emits ff_vp9_loop_filter_{v,h}_<wd1><wd2>_16_{10,12}_neon.
bpp_frontends_mix2 4, 4
bpp_frontends_mix2 4, 8
bpp_frontends_mix2 8, 4
bpp_frontends_mix2 8, 8
860cabdff1aSopenharmony_ci
@ vp9_loop_filter_v_16_4_16_neon
@
@ Internal routine, called with bl from the repeating bpp frontends.
@ Handles a 4 pixel wide strip (one d register of 16-bit pixels per row):
@ loads the eight rows above dst (p7..p0 into d16-d23) and the eight rows
@ starting at dst (q0..q7 into d24-d31), then runs loop_filter_16.
@
@ Write-back paths:
@   fall through: 14 rows (p6..q6) from d2-d6, d8-d15 and d17
@   label 8     : 6 rows  (p2..q2) from d21-d26
@   label 7     : 4 rows  (p1..q1) from d22-d25
@   label 9     : nothing is written
@
@ In:  r0 = dst, r1 = stride in bytes, r2-r7 as prepared by the frontend.
@      Rows are accessed with a :64 alignment hint, so every row address
@      must be 8-byte aligned.
@ Out: r0 = dst (same value as on entry) on every exit path.
@ Clobbers: r12, d16-d31, plus whatever loop_filter_16 uses (it is handed
@           d8-d15, which the frontends save and restore).
function vp9_loop_filter_v_16_4_16_neon
        sub             r12, r0,  r1, lsl #3
        @ Read p7-p0 using r12 and q0-q7 using r0
        vld1.16         {d16}, [r12,:64], r1 @ p7
        vld1.16         {d24}, [r0, :64], r1 @ q0
        vld1.16         {d17}, [r12,:64], r1 @ p6
        vld1.16         {d25}, [r0, :64], r1 @ q1
        vld1.16         {d18}, [r12,:64], r1 @ p5
        vld1.16         {d26}, [r0, :64], r1 @ q2
        vld1.16         {d19}, [r12,:64], r1 @ p4
        vld1.16         {d27}, [r0, :64], r1 @ q3
        vld1.16         {d20}, [r12,:64], r1 @ p3
        vld1.16         {d28}, [r0, :64], r1 @ q4
        vld1.16         {d21}, [r12,:64], r1 @ p2
        vld1.16         {d29}, [r0, :64], r1 @ q5
        vld1.16         {d22}, [r12,:64], r1 @ p1
        vld1.16         {d30}, [r0, :64], r1 @ q6
        vld1.16         {d23}, [r12,:64], r1 @ p0
        vld1.16         {d31}, [r0, :64], r1 @ q7
        sub             r12, r12, r1, lsl #3
        sub             r0,  r0,  r1, lsl #3 @ r0  = dst again
        add             r12, r12, r1         @ r12 = dst - 7 * stride (row p6)

        loop_filter_16

        @ If we did the flat8out part, we get the output in
        @ d2-d17 (skipping d7 and d16). r12 points to r0 - 7 * stride,
        @ store d2-d9 there, and d10-d17 into r0.
        vst1.16         {d2},  [r12,:64], r1
        vst1.16         {d10}, [r0, :64], r1
        vst1.16         {d3},  [r12,:64], r1
        vst1.16         {d11}, [r0, :64], r1
        vst1.16         {d4},  [r12,:64], r1
        vst1.16         {d12}, [r0, :64], r1
        vst1.16         {d5},  [r12,:64], r1
        vst1.16         {d13}, [r0, :64], r1
        vst1.16         {d6},  [r12,:64], r1
        vst1.16         {d14}, [r0, :64], r1
        vst1.16         {d8},  [r12,:64], r1
        vst1.16         {d15}, [r0, :64], r1
        vst1.16         {d9},  [r12,:64], r1
        vst1.16         {d17}, [r0, :64], r1
        sub             r0,  r0,  r1, lsl #3
        add             r0,  r0,  r1         @ net -7 rows: r0 = dst again

        @ NOTE(review): labels 9, 8 and 7 are not referenced in this body;
        @ presumably the loop_filter macro branches to them.
9:
        bx              lr

8:
        add             r12, r12, r1, lsl #2 @ r12 = dst - 3 * stride (row p2)
        @ If we didn't do the flat8out part, the output is left in the
        @ input registers.
        vst1.16         {d21}, [r12,:64], r1
        vst1.16         {d24}, [r0, :64], r1
        vst1.16         {d22}, [r12,:64], r1
        vst1.16         {d25}, [r0, :64], r1
        vst1.16         {d23}, [r12,:64], r1
        vst1.16         {d26}, [r0, :64], r1
        sub             r0,  r0,  r1, lsl #1
        sub             r0,  r0,  r1         @ r0 = dst again (3 rows back)
        bx              lr
7:
        sub             r12, r0,  r1, lsl #1 @ r12 = dst - 2 * stride (row p1)
        vst1.16         {d22}, [r12,:64], r1
        vst1.16         {d24}, [r0, :64], r1
        vst1.16         {d23}, [r12,:64], r1
        vst1.16         {d25}, [r0, :64], r1
        sub             r0,  r0,  r1, lsl #1 @ r0 = dst again
        bx              lr
endfunc

@ Exported entry points built on the 4 pixel routine above:
@   ff_vp9_loop_filter_v_16_8_{10,12}_neon  calls it 2 times (8 pixels)
@   ff_vp9_loop_filter_v_16_16_{10,12}_neon calls it 4 times (16 pixels)
bpp_frontends_rep vp9_loop_filter_v_16, 8,  4, 2, v
bpp_frontends_rep vp9_loop_filter_v_16, 16, 4, 4, v
934cabdff1aSopenharmony_ci
935cabdff1aSopenharmony_cifunction vp9_loop_filter_h_16_4_16_neon
936cabdff1aSopenharmony_ci        sub             r12, r0,  #16
937cabdff1aSopenharmony_ci        sub             r0,  r0,  #8
938cabdff1aSopenharmony_ci        vld1.16         {d16}, [r12,:64], r1
939cabdff1aSopenharmony_ci        vld1.16         {d20}, [r0, :64], r1
940cabdff1aSopenharmony_ci        vld1.16         {d17}, [r12,:64], r1
941cabdff1aSopenharmony_ci        vld1.16         {d21}, [r0, :64], r1
942cabdff1aSopenharmony_ci        vld1.16         {d18}, [r12,:64], r1
943cabdff1aSopenharmony_ci        vld1.16         {d22}, [r0, :64], r1
944cabdff1aSopenharmony_ci        vld1.16         {d19}, [r12,:64], r1
945cabdff1aSopenharmony_ci        vld1.16         {d23}, [r0, :64], r1
946cabdff1aSopenharmony_ci        sub             r12, r12, r1, lsl #2
947cabdff1aSopenharmony_ci        sub             r0,  r0,  r1, lsl #2
948cabdff1aSopenharmony_ci        add             r12, r12, #16
949cabdff1aSopenharmony_ci        add             r0,  r0,  #16
950cabdff1aSopenharmony_ci        vld1.16         {d24}, [r12,:64], r1
951cabdff1aSopenharmony_ci        vld1.16         {d28}, [r0, :64], r1
952cabdff1aSopenharmony_ci        vld1.16         {d25}, [r12,:64], r1
953cabdff1aSopenharmony_ci        vld1.16         {d29}, [r0, :64], r1
954cabdff1aSopenharmony_ci        vld1.16         {d26}, [r12,:64], r1
955cabdff1aSopenharmony_ci        vld1.16         {d30}, [r0, :64], r1
956cabdff1aSopenharmony_ci        vld1.16         {d27}, [r12,:64], r1
957cabdff1aSopenharmony_ci        vld1.16         {d31}, [r0, :64], r1
958cabdff1aSopenharmony_ci        sub             r0,  r0,  r1, lsl #2
959cabdff1aSopenharmony_ci        sub             r12, r12, r1, lsl #2
960cabdff1aSopenharmony_ci        sub             r12, r12, #16
961cabdff1aSopenharmony_ci        sub             r0,  r0,  #16
962cabdff1aSopenharmony_ci
963cabdff1aSopenharmony_ci        @ The 16x4 pixels read above is in four 4x4 blocks
964cabdff1aSopenharmony_ci        transpose16_q_4x4 q8,  q9,  d16, d17, d18, d19
965cabdff1aSopenharmony_ci        transpose16_q_4x4 q10, q11, d20, d21, d22, d23
966cabdff1aSopenharmony_ci        transpose16_q_4x4 q12, q13, d24, d25, d26, d27
967cabdff1aSopenharmony_ci        transpose16_q_4x4 q14, q15, d28, d29, d30, d31
968cabdff1aSopenharmony_ci
969cabdff1aSopenharmony_ci        loop_filter_16
970cabdff1aSopenharmony_ci
971cabdff1aSopenharmony_ci        @ Transpose back; this is the same transpose as above, but
972cabdff1aSopenharmony_ci        @ we can't take advantage of q registers for the transpose, since
973cabdff1aSopenharmony_ci        @ all d registers in the transpose aren't consecutive.
974cabdff1aSopenharmony_ci        transpose16_4x4 d16, d2,  d3,  d4
975cabdff1aSopenharmony_ci        transpose16_4x4 d5,  d6,  d8,  d9
976cabdff1aSopenharmony_ci        transpose16_4x4 d10, d11, d12, d13
977cabdff1aSopenharmony_ci        transpose16_4x4 d14, d15, d17, d31
978cabdff1aSopenharmony_ci
979cabdff1aSopenharmony_ci        vst1.16         {d16}, [r12,:64], r1
980cabdff1aSopenharmony_ci        vst1.16         {d5},  [r0, :64], r1
981cabdff1aSopenharmony_ci
982cabdff1aSopenharmony_ci        vst1.16         {d2},  [r12,:64], r1
983cabdff1aSopenharmony_ci        vst1.16         {d6},  [r0, :64], r1
984cabdff1aSopenharmony_ci
985cabdff1aSopenharmony_ci        vst1.16         {d3},  [r12,:64], r1
986cabdff1aSopenharmony_ci        vst1.16         {d8},  [r0, :64], r1
987cabdff1aSopenharmony_ci
988cabdff1aSopenharmony_ci        vst1.16         {d4},  [r12,:64], r1
989cabdff1aSopenharmony_ci        vst1.16         {d9},  [r0, :64], r1
990cabdff1aSopenharmony_ci
991cabdff1aSopenharmony_ci        sub             r12, r12, r1, lsl #2
992cabdff1aSopenharmony_ci        sub             r0,  r0,  r1, lsl #2
993cabdff1aSopenharmony_ci        add             r12, r12, #16
994cabdff1aSopenharmony_ci        add             r0,  r0,  #16
995cabdff1aSopenharmony_ci
996cabdff1aSopenharmony_ci        vst1.16         {d10}, [r12,:64], r1
997cabdff1aSopenharmony_ci        vst1.16         {d14}, [r0, :64], r1
998cabdff1aSopenharmony_ci
999cabdff1aSopenharmony_ci        vst1.16         {d11}, [r12,:64], r1
1000cabdff1aSopenharmony_ci        vst1.16         {d15}, [r0, :64], r1
1001cabdff1aSopenharmony_ci
1002cabdff1aSopenharmony_ci        vst1.16         {d12}, [r12,:64], r1
1003cabdff1aSopenharmony_ci        vst1.16         {d17}, [r0, :64], r1
1004cabdff1aSopenharmony_ci
1005cabdff1aSopenharmony_ci        vst1.16         {d13}, [r12,:64], r1
1006cabdff1aSopenharmony_ci        vst1.16         {d31}, [r0, :64], r1
1007cabdff1aSopenharmony_ci        sub             r0,  r0,  r1, lsl #2
1008cabdff1aSopenharmony_ci        sub             r0,  r0,  #8
1009cabdff1aSopenharmony_ci        bx              lr
1010cabdff1aSopenharmony_ci9:
1011cabdff1aSopenharmony_ci        add             r0,  r0,  #8
1012cabdff1aSopenharmony_ci        bx              lr
1013cabdff1aSopenharmony_ci8:
1014cabdff1aSopenharmony_ci        add             r12, r12, #8
1015cabdff1aSopenharmony_ci        add             r0,  r0,  #8
1016cabdff1aSopenharmony_ci        transpose16_q_4x4 q10, q11, d20, d21, d22, d23
1017cabdff1aSopenharmony_ci        transpose16_q_4x4 q12, q13, d24, d25, d26, d27
1018cabdff1aSopenharmony_ci
1019cabdff1aSopenharmony_ci        vst1.16         {d20}, [r12,:64], r1
1020cabdff1aSopenharmony_ci        vst1.16         {d24}, [r0, :64], r1
1021cabdff1aSopenharmony_ci        vst1.16         {d21}, [r12,:64], r1
1022cabdff1aSopenharmony_ci        vst1.16         {d25}, [r0, :64], r1
1023cabdff1aSopenharmony_ci        vst1.16         {d22}, [r12,:64], r1
1024cabdff1aSopenharmony_ci        vst1.16         {d26}, [r0, :64], r1
1025cabdff1aSopenharmony_ci        vst1.16         {d23}, [r12,:64], r1
1026cabdff1aSopenharmony_ci        vst1.16         {d27}, [r0, :64], r1
1027cabdff1aSopenharmony_ci        sub             r0,  r0,  r1, lsl #2
1028cabdff1aSopenharmony_ci        bx              lr
1029cabdff1aSopenharmony_ci7:
1030cabdff1aSopenharmony_ci        add             r12, r12, #12
1031cabdff1aSopenharmony_ci        add             r0,  r12, r1, lsl #1
1032cabdff1aSopenharmony_ci        transpose16_q_4x4 q11, q12, d22, d23, d24, d25
1033cabdff1aSopenharmony_ci
1034cabdff1aSopenharmony_ci        vst1.16         {d22}, [r12], r1
1035cabdff1aSopenharmony_ci        vst1.16         {d24}, [r0],  r1
1036cabdff1aSopenharmony_ci        vst1.16         {d23}, [r12], r1
1037cabdff1aSopenharmony_ci        vst1.16         {d25}, [r0],  r1
1038cabdff1aSopenharmony_ci        sub             r0,  r0,  r1, lsl #2
1039cabdff1aSopenharmony_ci        add             r0,  r0,  #4
1040cabdff1aSopenharmony_ci        bx              lr
1041cabdff1aSopenharmony_ciendfunc
1042cabdff1aSopenharmony_ci
/*
 * Instantiate the bpp_frontends_rep macro twice for vp9_loop_filter_h_16.
 * The two invocations differ only in their second argument (8 vs. 16) and
 * fourth argument (2 vs. 4); the third argument (4) and the last one (h)
 * are the same in both.
 *
 * NOTE(review): bpp_frontends_rep is defined outside this part of the file.
 * Presumably it emits the exported per-bit-depth entry points and has each
 * of them run the internal routine that ends at the endfunc above the given
 * number of times (2 or 4) -- confirm against the macro definition.
 */
1043cabdff1aSopenharmony_cibpp_frontends_rep vp9_loop_filter_h_16, 8,  4, 2, h
1044cabdff1aSopenharmony_cibpp_frontends_rep vp9_loop_filter_h_16, 16, 4, 4, h
1045