1cabdff1aSopenharmony_ci/*
2cabdff1aSopenharmony_ci * Copyright (c) 2016 Google Inc.
3cabdff1aSopenharmony_ci *
4cabdff1aSopenharmony_ci * This file is part of FFmpeg.
5cabdff1aSopenharmony_ci *
6cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or
7cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public
8cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either
9cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version.
10cabdff1aSopenharmony_ci *
11cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful,
12cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of
13cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14cabdff1aSopenharmony_ci * Lesser General Public License for more details.
15cabdff1aSopenharmony_ci *
16cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public
17cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software
18cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19cabdff1aSopenharmony_ci */
20cabdff1aSopenharmony_ci
21cabdff1aSopenharmony_ci#include "libavutil/arm/asm.S"
22cabdff1aSopenharmony_ci#include "neon.S"
23cabdff1aSopenharmony_ci
24cabdff1aSopenharmony_ci@ Do an 8x8 transpose, using q registers for the subtransposes that don't
25cabdff1aSopenharmony_ci@ need to address the individual d registers.
26cabdff1aSopenharmony_ci@ r0,r1 == rq0, r2,r3 == rq1, etc
@ Parameters:
@   rq0-rq3  the four q registers holding the block
@   r0-r7    the d registers that make up rq0-rq3, in order
@ The transpose is done in place; no registers other than the arguments are
@ touched.
27cabdff1aSopenharmony_ci.macro transpose_q_8x8 rq0, rq1, rq2, rq3, r0, r1, r2, r3, r4, r5, r6, r7
28cabdff1aSopenharmony_ci        vtrn.32         \rq0, \rq2       @ 32-bit stage, done on whole q registers
29cabdff1aSopenharmony_ci        vtrn.32         \rq1, \rq3
30cabdff1aSopenharmony_ci        vtrn.16         \rq0, \rq1       @ 16-bit stage, done on whole q registers
31cabdff1aSopenharmony_ci        vtrn.16         \rq2, \rq3
32cabdff1aSopenharmony_ci        vtrn.8          \r0,  \r1        @ 8-bit stage, between the two halves of each q register
33cabdff1aSopenharmony_ci        vtrn.8          \r2,  \r3
34cabdff1aSopenharmony_ci        vtrn.8          \r4,  \r5
35cabdff1aSopenharmony_ci        vtrn.8          \r6,  \r7
36cabdff1aSopenharmony_ci.endm
37cabdff1aSopenharmony_ci
38cabdff1aSopenharmony_ci@ Do a 4x4 transpose, using q registers for the subtransposes that don't
39cabdff1aSopenharmony_ci@ need to address the individual d registers.
40cabdff1aSopenharmony_ci@ r0,r1 == rq0, r2,r3 == rq1
@ Parameters:
@   rq0, rq1  the two q registers holding the block
@   r0-r3     the d registers that make up rq0 and rq1, in order
@ The transpose is done in place; no registers other than the arguments are
@ touched.
41cabdff1aSopenharmony_ci.macro transpose_q_4x4 rq0, rq1, r0, r1, r2, r3
42cabdff1aSopenharmony_ci        vtrn.16         \rq0, \rq1       @ 16-bit stage, done on whole q registers
43cabdff1aSopenharmony_ci        vtrn.8          \r0,  \r1        @ 8-bit stage, between the two halves of each q register
44cabdff1aSopenharmony_ci        vtrn.8          \r2,  \r3
45cabdff1aSopenharmony_ci.endm
46cabdff1aSopenharmony_ci
47cabdff1aSopenharmony_ci@ The input to and output from this macro is in the registers q8-q15,
48cabdff1aSopenharmony_ci@ and q0-q7 are used as scratch registers.
49cabdff1aSopenharmony_ci@ p3 = q8, p0 = q11, q0 = q12, q3 = q15
@
@ 16-byte-wide variant of the narrow (p1..q1) filter.  Only p1, p0, q0 and q1
@ (q10-q13) are modified; p3, p2, q2 and q3 are only read.
@ Thresholds: r2 = E and r3 = I each carry two packed 8-bit values; bits 0-7
@ are applied to the low d half of every q register and bits 8-15 to the high
@ d half.  H is loaded from [sp, #64] with the same packing.
@ NOTE(review): the #64 offset presumably accounts for registers pushed by the
@ expanding function before hev_thr on the stack - confirm against the callers.
@ Clobbers q0-q7, r2, r3 and the condition flags.
@ Branches to local label 9 (forward), which the expanding code must provide,
@ when no pixel passes the filter mask.
50cabdff1aSopenharmony_ci.macro loop_filter_q
51cabdff1aSopenharmony_ci        vdup.u8         d0,  r2          @ E
52cabdff1aSopenharmony_ci        lsr             r2,  r2,  #8
53cabdff1aSopenharmony_ci        vdup.u8         d2,  r3          @ I
54cabdff1aSopenharmony_ci        lsr             r3,  r3,  #8
55cabdff1aSopenharmony_ci        vdup.u8         d1,  r2          @ E
56cabdff1aSopenharmony_ci        vdup.u8         d3,  r3          @ I
57cabdff1aSopenharmony_ci
58cabdff1aSopenharmony_ci        vabd.u8         q2,  q8,  q9     @ abs(p3 - p2)
59cabdff1aSopenharmony_ci        vabd.u8         q3,  q9,  q10    @ abs(p2 - p1)
60cabdff1aSopenharmony_ci        vabd.u8         q4,  q10, q11    @ abs(p1 - p0)
61cabdff1aSopenharmony_ci        vabd.u8         q5,  q12, q13    @ abs(q0 - q1)
62cabdff1aSopenharmony_ci        vabd.u8         q6,  q13, q14    @ abs(q1 - q2)
63cabdff1aSopenharmony_ci        vabd.u8         q7,  q14, q15    @ abs(q2 - q3)
64cabdff1aSopenharmony_ci        vmax.u8         q2,  q2,  q3
65cabdff1aSopenharmony_ci        vmax.u8         q3,  q4,  q5
66cabdff1aSopenharmony_ci        vmax.u8         q4,  q6,  q7
67cabdff1aSopenharmony_ci        vabd.u8         q5,  q11, q12    @ abs(p0 - q0)
68cabdff1aSopenharmony_ci        vmax.u8         q2,  q2,  q3
69cabdff1aSopenharmony_ci        vqadd.u8        q5,  q5,  q5     @ abs(p0 - q0) * 2
70cabdff1aSopenharmony_ci        vabd.u8         q7,  q10, q13    @ abs(p1 - q1)
71cabdff1aSopenharmony_ci        vmax.u8         q2,  q2,  q4     @ max(abs(p3 - p2), ..., abs(q2 - q3))
72cabdff1aSopenharmony_ci        vshr.u8         q7,  q7,  #1
73cabdff1aSopenharmony_ci        vcle.u8         q2,  q2,  q1     @ max(abs()) <= I
74cabdff1aSopenharmony_ci        vqadd.u8        q5,  q5,  q7     @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
75cabdff1aSopenharmony_ci        vcle.u8         q5,  q5,  q0     @ ... <= E
76cabdff1aSopenharmony_ci        vand            q2,  q2,  q5     @ fm
77cabdff1aSopenharmony_ci
        @ Narrow the 128-bit mask to 64 bits (keeping 4 bits of every mask byte)
        @ so that it fits in two core registers for the "anything set?" test.
78cabdff1aSopenharmony_ci        vshrn.u16       d10, q2,  #4
79cabdff1aSopenharmony_ci        vmov            r2,  r3,  d10
80cabdff1aSopenharmony_ci        orrs            r2,  r2,  r3
81cabdff1aSopenharmony_ci        @ If no pixels need filtering, just exit as soon as possible
82cabdff1aSopenharmony_ci        beq             9f
83cabdff1aSopenharmony_ci
84cabdff1aSopenharmony_ci        @ Calculate the normal inner loop filter for 2 or 4 pixels
85cabdff1aSopenharmony_ci        ldr             r3,  [sp, #64]   @ packed H thresholds
86cabdff1aSopenharmony_ci        vabd.u8         q3,  q10, q11    @ abs(p1 - p0)
87cabdff1aSopenharmony_ci        vabd.u8         q4,  q13, q12    @ abs(q1 - q0)
88cabdff1aSopenharmony_ci
89cabdff1aSopenharmony_ci        vsubl.u8        q5,  d20, d26    @ p1 - q1
90cabdff1aSopenharmony_ci        vsubl.u8        q6,  d21, d27    @ p1 - q1
91cabdff1aSopenharmony_ci        vmax.u8         q3,  q3,  q4     @ max(abs(p1 - p0), abs(q1 - q0))
92cabdff1aSopenharmony_ci        vqmovn.s16      d10, q5          @ av_clip_int8(p1 - q1)
93cabdff1aSopenharmony_ci        vqmovn.s16      d11, q6          @ av_clip_int8(p1 - q1)
94cabdff1aSopenharmony_ci        vdup.u8         d8,  r3          @ H
95cabdff1aSopenharmony_ci        lsr             r3,  r3,  #8
96cabdff1aSopenharmony_ci        vdup.u8         d9,  r3          @ H
97cabdff1aSopenharmony_ci        vsubl.u8        q6,  d24, d22    @ q0 - p0
98cabdff1aSopenharmony_ci        vsubl.u8        q7,  d25, d23    @ q0 - p0
99cabdff1aSopenharmony_ci        vcle.u8         q3,  q3,  q4     @ !hev
100cabdff1aSopenharmony_ci        vmov.s16        q0,  #3
101cabdff1aSopenharmony_ci        vand            q3,  q3,  q2     @ !hev && fm
102cabdff1aSopenharmony_ci
103cabdff1aSopenharmony_ci        vmul.s16        q6,  q6,  q0     @ 3 * (q0 - p0)
104cabdff1aSopenharmony_ci        vmul.s16        q7,  q7,  q0     @ 3 * (q0 - p0)
105cabdff1aSopenharmony_ci        vbic            q5,  q5,  q3     @ if (!hev) av_clip_int8 = 0
106cabdff1aSopenharmony_ci        vaddw.s8        q6,  q6,  d10    @ 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
107cabdff1aSopenharmony_ci        vaddw.s8        q7,  q7,  d11    @ 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
108cabdff1aSopenharmony_ci        vmov.s8         q5,  #4
109cabdff1aSopenharmony_ci        vqmovn.s16      d12, q6
110cabdff1aSopenharmony_ci        vqmovn.s16      d13, q7          @ av_clip_int8(3 * (q0 - p0) [+ av_clip_int8(p1 - q1)], BIT_DEPTH - 1) = f
111cabdff1aSopenharmony_ci        vmov.s8         q0,  #3
112cabdff1aSopenharmony_ci
113cabdff1aSopenharmony_ci        vqadd.s8        q5,  q6,  q5     @ FFMIN(f + 4, 127)
114cabdff1aSopenharmony_ci        vqadd.s8        q0,  q6,  q0     @ FFMIN(f + 3, 127)
115cabdff1aSopenharmony_ci        vmovl.u8        q6,  d22         @ p0
116cabdff1aSopenharmony_ci        vmovl.u8        q7,  d23         @ p0
117cabdff1aSopenharmony_ci        vshr.s8         q5,  q5,  #3     @ f1
118cabdff1aSopenharmony_ci        vshr.s8         q0,  q0,  #3     @ f2
119cabdff1aSopenharmony_ci
120cabdff1aSopenharmony_ci        vaddw.s8        q6,  q6,  d0     @ p0 + f2
121cabdff1aSopenharmony_ci        vaddw.s8        q7,  q7,  d1     @ p0 + f2
122cabdff1aSopenharmony_ci        vqmovun.s16     d0,  q6          @ out p0
123cabdff1aSopenharmony_ci        vmovl.u8        q6,  d24         @ q0
124cabdff1aSopenharmony_ci        vqmovun.s16     d1,  q7          @ out p0
125cabdff1aSopenharmony_ci        vmovl.u8        q7,  d25         @ q0
126cabdff1aSopenharmony_ci        vsubw.s8        q6,  q6,  d10    @ q0 - f1
127cabdff1aSopenharmony_ci        vsubw.s8        q7,  q7,  d11    @ q0 - f1
128cabdff1aSopenharmony_ci        vqmovun.s16     d12, q6          @ out q0
129cabdff1aSopenharmony_ci        vqmovun.s16     d13, q7          @ out q0
130cabdff1aSopenharmony_ci        vrshr.s8        q5,  q5,  #1     @ f = (f1 + 1) >> 1
131cabdff1aSopenharmony_ci        vbit            q11, q0,  q2     @ if (fm)
132cabdff1aSopenharmony_ci        vbit            q12, q6,  q2
133cabdff1aSopenharmony_ci
        @ q2 (fm) is no longer needed after the two vbit above and is reused
        @ as a temporary from here on.
134cabdff1aSopenharmony_ci        vmovl.u8        q0,  d20         @ p1
135cabdff1aSopenharmony_ci        vmovl.u8        q2,  d21         @ p1
136cabdff1aSopenharmony_ci        vmovl.u8        q6,  d26         @ q1
137cabdff1aSopenharmony_ci        vmovl.u8        q7,  d27         @ q1
138cabdff1aSopenharmony_ci        vaddw.s8        q0,  q0,  d10    @ p1 + f
139cabdff1aSopenharmony_ci        vaddw.s8        q2,  q2,  d11    @ p1 + f
140cabdff1aSopenharmony_ci        vsubw.s8        q6,  q6,  d10    @ q1 - f
141cabdff1aSopenharmony_ci        vsubw.s8        q7,  q7,  d11    @ q1 - f
142cabdff1aSopenharmony_ci        vqmovun.s16     d0,  q0          @ out p1
143cabdff1aSopenharmony_ci        vqmovun.s16     d1,  q2          @ out p1
144cabdff1aSopenharmony_ci        vqmovun.s16     d12, q6          @ out q1
145cabdff1aSopenharmony_ci        vqmovun.s16     d13, q7          @ out q1
146cabdff1aSopenharmony_ci        vbit            q10, q0,  q3     @ if (!hev && fm)
147cabdff1aSopenharmony_ci        vbit            q13, q6,  q3
148cabdff1aSopenharmony_ci.endm
149cabdff1aSopenharmony_ci
150cabdff1aSopenharmony_ci@ The input to and output from this macro is in the registers d16-d31,
151cabdff1aSopenharmony_ci@ and d0-d7 are used as scratch registers.
152cabdff1aSopenharmony_ci@ p7 = d16 .. p3 = d20, p0 = d23, q0 = d24, q3 = d27, q7 = d31
153cabdff1aSopenharmony_ci@ Depending on the width of the loop filter, we either use d16-d19
154cabdff1aSopenharmony_ci@ and d28-d31 as temp registers, or d8-d15.
155cabdff1aSopenharmony_ci@ tmp1,tmp2 = tmpq1, tmp3,tmp4 = tmpq2, tmp5,tmp6 = tmpq3, tmp7,tmp8 = tmpq4
@
@ wd selects the filter width; the wrapper macros further down instantiate it
@ with 4, 8 and 16.
@   - The narrow filter (all widths) updates p1..q1 (d22-d25) in place.
@   - For wd >= 8 the flat8in part updates p2..q2 (d21-d26) in place.
@   - For wd == 16 the flat8out part leaves its results in d2-d17 instead of
@     the input registers, and also uses d8-d15 and q9-q11 directly.
@ On entry r2 = E and r3 = I; H is loaded from [sp].  r2, r3 and the condition
@ flags are clobbered.
@ The macro branches forward to local labels that the expanding code has to
@ provide: 9 (no pixel needs filtering) for every width, 6 for wd == 8, and
@ 7 and 8 for wd == 16 (label 6 is defined inside the macro in that case).
156cabdff1aSopenharmony_ci.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmpq1, tmpq2, tmpq3, tmpq4
157cabdff1aSopenharmony_ci        vdup.u8         d0,  r2 @ E
158cabdff1aSopenharmony_ci        vdup.u8         d2,  r3 @ I
159cabdff1aSopenharmony_ci        ldr             r3,  [sp]
160cabdff1aSopenharmony_ci
161cabdff1aSopenharmony_ci        vabd.u8         d4,  d20, d21    @ abs(p3 - p2)
162cabdff1aSopenharmony_ci        vabd.u8         d5,  d21, d22    @ abs(p2 - p1)
163cabdff1aSopenharmony_ci        vabd.u8         d6,  d22, d23    @ abs(p1 - p0)
164cabdff1aSopenharmony_ci        vabd.u8         d7,  d24, d25    @ abs(q0 - q1)
165cabdff1aSopenharmony_ci        vabd.u8         \tmp1,  d25, d26 @ abs(q1 - q2)
166cabdff1aSopenharmony_ci        vabd.u8         \tmp2,  d26, d27 @ abs(q2 - q3)
167cabdff1aSopenharmony_ci        vmax.u8         d4,  d4,  d5
168cabdff1aSopenharmony_ci        vmax.u8         d5,  d6,  d7
169cabdff1aSopenharmony_ci        vmax.u8         \tmp1,  \tmp1,  \tmp2
170cabdff1aSopenharmony_ci        vabd.u8         d6,  d23, d24    @ abs(p0 - q0)
171cabdff1aSopenharmony_ci        vmax.u8         d4,  d4,  d5
172cabdff1aSopenharmony_ci        vqadd.u8        d6,  d6,  d6     @ abs(p0 - q0) * 2
173cabdff1aSopenharmony_ci        vabd.u8         d5,  d22, d25    @ abs(p1 - q1)
174cabdff1aSopenharmony_ci        vmax.u8         d4,  d4,  \tmp1  @ max(abs(p3 - p2), ..., abs(q2 - q3))
175cabdff1aSopenharmony_ci        vshr.u8         d5,  d5,  #1
176cabdff1aSopenharmony_ci        vcle.u8         d4,  d4,  d2     @ max(abs()) <= I
177cabdff1aSopenharmony_ci        vqadd.u8        d6,  d6,  d5     @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
178cabdff1aSopenharmony_ci        vcle.u8         d5,  d6,  d0     @ ... <= E
179cabdff1aSopenharmony_ci        vand            d4,  d4,  d5     @ fm
180cabdff1aSopenharmony_ci
181cabdff1aSopenharmony_ci        vdup.u8         d3,  r3          @ H
182cabdff1aSopenharmony_ci        vmov            r2,  r3,  d4
183cabdff1aSopenharmony_ci        orrs            r2,  r2,  r3
184cabdff1aSopenharmony_ci        @ If no pixels need filtering, just exit as soon as possible
185cabdff1aSopenharmony_ci        beq             9f
186cabdff1aSopenharmony_ci
187cabdff1aSopenharmony_ci.if \wd >= 8
188cabdff1aSopenharmony_ci        vmov.u8         d0,  #1          @ flatness threshold (E is no longer needed)
189cabdff1aSopenharmony_ci
190cabdff1aSopenharmony_ci        vabd.u8         d6,  d20, d23    @ abs(p3 - p0)
191cabdff1aSopenharmony_ci        vabd.u8         d2,  d21, d23    @ abs(p2 - p0)
192cabdff1aSopenharmony_ci        vabd.u8         d1,  d22, d23    @ abs(p1 - p0)
193cabdff1aSopenharmony_ci        vabd.u8         \tmp1,  d25, d24 @ abs(q1 - q0)
194cabdff1aSopenharmony_ci        vabd.u8         \tmp2,  d26, d24 @ abs(q2 - q0)
195cabdff1aSopenharmony_ci        vabd.u8         \tmp3,  d27, d24 @ abs(q3 - q0)
196cabdff1aSopenharmony_ci        vmax.u8         d6,  d6,  d2
197cabdff1aSopenharmony_ci        vmax.u8         d1,  d1,  \tmp1
198cabdff1aSopenharmony_ci        vmax.u8         \tmp2,  \tmp2,  \tmp3
199cabdff1aSopenharmony_ci.if \wd == 16
200cabdff1aSopenharmony_ci        vabd.u8         d7,  d16, d23    @ abs(p7 - p0)
201cabdff1aSopenharmony_ci        vmax.u8         d6,  d6,  d1
202cabdff1aSopenharmony_ci        vabd.u8         d2,  d17, d23    @ abs(p6 - p0)
203cabdff1aSopenharmony_ci        vmax.u8         d6,  d6,  \tmp2
204cabdff1aSopenharmony_ci        vabd.u8         d1,  d18, d23    @ abs(p5 - p0)
205cabdff1aSopenharmony_ci        vcle.u8         d6,  d6,  d0     @ flat8in
206cabdff1aSopenharmony_ci        vabd.u8         d8,  d19, d23    @ abs(p4 - p0)
207cabdff1aSopenharmony_ci        vand            d6,  d6,  d4     @ flat8in && fm
208cabdff1aSopenharmony_ci        vabd.u8         d9,  d28, d24    @ abs(q4 - q0)
209cabdff1aSopenharmony_ci        vbic            d4,  d4,  d6     @ fm && !flat8in
210cabdff1aSopenharmony_ci        vabd.u8         d10, d29, d24    @ abs(q5 - q0)
211cabdff1aSopenharmony_ci        vabd.u8         d11, d30, d24    @ abs(q6 - q0)
212cabdff1aSopenharmony_ci        vabd.u8         d12, d31, d24    @ abs(q7 - q0)
213cabdff1aSopenharmony_ci
214cabdff1aSopenharmony_ci        vmax.u8         d7,  d7,  d2
215cabdff1aSopenharmony_ci        vmax.u8         d1,  d1,  d8
216cabdff1aSopenharmony_ci        vmax.u8         d9,  d9,  d10
217cabdff1aSopenharmony_ci        vmax.u8         d11, d11, d12
218cabdff1aSopenharmony_ci        @ The rest of the calculation of flat8out is interleaved below
219cabdff1aSopenharmony_ci.else
220cabdff1aSopenharmony_ci        @ The rest of the calculation of flat8in is interleaved below
221cabdff1aSopenharmony_ci.endif
222cabdff1aSopenharmony_ci.endif
223cabdff1aSopenharmony_ci
224cabdff1aSopenharmony_ci        @ Calculate the normal inner loop filter for 2 or 4 pixels
225cabdff1aSopenharmony_ci        vabd.u8         d5,  d22, d23           @ abs(p1 - p0)
226cabdff1aSopenharmony_ci.if \wd == 16
227cabdff1aSopenharmony_ci        vmax.u8         d7,  d7,  d1
228cabdff1aSopenharmony_ci        vmax.u8         d9,  d9,  d11
229cabdff1aSopenharmony_ci.elseif \wd == 8
230cabdff1aSopenharmony_ci        vmax.u8         d6,  d6,  d1
231cabdff1aSopenharmony_ci.endif
232cabdff1aSopenharmony_ci        vabd.u8         d1,  d25, d24           @ abs(q1 - q0)
233cabdff1aSopenharmony_ci.if \wd == 16
234cabdff1aSopenharmony_ci        vmax.u8         d7,  d7,  d9
235cabdff1aSopenharmony_ci.elseif \wd == 8
236cabdff1aSopenharmony_ci        vmax.u8         d6,  d6,  \tmp2
237cabdff1aSopenharmony_ci.endif
238cabdff1aSopenharmony_ci        vsubl.u8        \tmpq1,  d22, d25       @ p1 - q1
239cabdff1aSopenharmony_ci        vmax.u8         d5,  d5,  d1            @ max(abs(p1 - p0), abs(q1 - q0))
240cabdff1aSopenharmony_ci        vsubl.u8        \tmpq2,  d24, d23       @ q0 - p0
241cabdff1aSopenharmony_ci        vmov.s16        \tmpq3,  #3
242cabdff1aSopenharmony_ci.if \wd == 8
243cabdff1aSopenharmony_ci        vcle.u8         d6,  d6,  d0            @ flat8in
244cabdff1aSopenharmony_ci.endif
245cabdff1aSopenharmony_ci        vcle.u8         d5,  d5,  d3            @ !hev
246cabdff1aSopenharmony_ci.if \wd == 8
247cabdff1aSopenharmony_ci        vand            d6,  d6,  d4            @ flat8in && fm
248cabdff1aSopenharmony_ci.endif
249cabdff1aSopenharmony_ci        vqmovn.s16      \tmp1,   \tmpq1         @ av_clip_int8(p1 - q1)
250cabdff1aSopenharmony_ci.if \wd == 16
251cabdff1aSopenharmony_ci        vcle.u8         d7,  d7,  d0            @ flat8out
252cabdff1aSopenharmony_ci.elseif \wd == 8
253cabdff1aSopenharmony_ci        vbic            d4,  d4,  d6            @ fm && !flat8in
254cabdff1aSopenharmony_ci.endif
255cabdff1aSopenharmony_ci        vand            d5,  d5,  d4            @ !hev && fm && !flat8in
256cabdff1aSopenharmony_ci.if \wd == 16
257cabdff1aSopenharmony_ci        vand            d7,  d7,  d6            @ flat8out && flat8in && fm
258cabdff1aSopenharmony_ci.endif
259cabdff1aSopenharmony_ci
260cabdff1aSopenharmony_ci        vmul.s16        \tmpq2,  \tmpq2, \tmpq3 @ 3 * (q0 - p0)
261cabdff1aSopenharmony_ci        vbic            \tmp1,   \tmp1,   d5    @ if (!hev) av_clip_int8 = 0
262cabdff1aSopenharmony_ci        vmov.s8         d2,  #4
263cabdff1aSopenharmony_ci        vaddw.s8        \tmpq2,  \tmpq2,  \tmp1 @ 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
264cabdff1aSopenharmony_ci        vmov.s8         d3,  #3
265cabdff1aSopenharmony_ci        vqmovn.s16      \tmp1,   \tmpq2         @ f
266cabdff1aSopenharmony_ci.if \wd == 16
267cabdff1aSopenharmony_ci        vbic            d6,  d6,  d7            @ fm && flat8in && !flat8out
268cabdff1aSopenharmony_ci.endif
269cabdff1aSopenharmony_ci
270cabdff1aSopenharmony_ci        vqadd.s8        \tmp3, \tmp1,  d2       @ FFMIN(f + 4, 127)
271cabdff1aSopenharmony_ci        vqadd.s8        \tmp4, \tmp1,  d3       @ FFMIN(f + 3, 127)
272cabdff1aSopenharmony_ci        vmovl.u8        q0,  d23                @ p0
273cabdff1aSopenharmony_ci        vshr.s8         \tmp3, \tmp3,  #3       @ f1
274cabdff1aSopenharmony_ci        vshr.s8         \tmp4, \tmp4,  #3       @ f2
275cabdff1aSopenharmony_ci
276cabdff1aSopenharmony_ci        vmovl.u8        q1,  d24                @ q0
277cabdff1aSopenharmony_ci        vaddw.s8        q0,  q0,  \tmp4         @ p0 + f2
278cabdff1aSopenharmony_ci        vsubw.s8        q1,  q1,  \tmp3         @ q0 - f1
279cabdff1aSopenharmony_ci        vqmovun.s16     d0,  q0                 @ out p0
280cabdff1aSopenharmony_ci        vqmovun.s16     d1,  q1                 @ out q0
281cabdff1aSopenharmony_ci        vrshr.s8        \tmp3, \tmp3, #1        @ f = (f1 + 1) >> 1
282cabdff1aSopenharmony_ci        vbit            d23, d0,  d4            @ if (fm && !flat8in)
283cabdff1aSopenharmony_ci        vbit            d24, d1,  d4
284cabdff1aSopenharmony_ci
285cabdff1aSopenharmony_ci        vmovl.u8        q0,  d22                @ p1
286cabdff1aSopenharmony_ci        vmovl.u8        q1,  d25                @ q1
287cabdff1aSopenharmony_ci.if \wd >= 8
        @ Start the "any flat8in pixel?" test early; the flags set by the orrs
        @ below are consumed by the beq 6f after the NEON instructions.
288cabdff1aSopenharmony_ci        vmov            r2,  r3,  d6
289cabdff1aSopenharmony_ci.endif
290cabdff1aSopenharmony_ci        vaddw.s8        q0,  q0,  \tmp3         @ p1 + f
291cabdff1aSopenharmony_ci        vsubw.s8        q1,  q1,  \tmp3         @ q1 - f
292cabdff1aSopenharmony_ci.if \wd >= 8
293cabdff1aSopenharmony_ci        orrs            r2,  r2,  r3
294cabdff1aSopenharmony_ci.endif
295cabdff1aSopenharmony_ci        vqmovun.s16     d0,  q0                 @ out p1
296cabdff1aSopenharmony_ci        vqmovun.s16     d2,  q1                 @ out q1
297cabdff1aSopenharmony_ci        vbit            d22, d0,  d5            @ if (!hev && fm && !flat8in)
298cabdff1aSopenharmony_ci        vbit            d25, d2,  d5
299cabdff1aSopenharmony_ci
300cabdff1aSopenharmony_ci.if \wd >= 8
301cabdff1aSopenharmony_ci        @ If no pixels need flat8in, jump to flat8out
302cabdff1aSopenharmony_ci        @ (or to a writeout of the inner 4 pixels, for wd=8)
303cabdff1aSopenharmony_ci        beq             6f
304cabdff1aSopenharmony_ci
305cabdff1aSopenharmony_ci        @ flat8in
        @ q0 holds a running 16-bit sum; each following output is obtained by
        @ adding a precomputed "taps entering minus taps leaving" difference.
306cabdff1aSopenharmony_ci        vaddl.u8        \tmpq1, d20, d21
307cabdff1aSopenharmony_ci        vaddl.u8        \tmpq2, d22, d25
308cabdff1aSopenharmony_ci        vaddl.u8        \tmpq3, d20, d22
309cabdff1aSopenharmony_ci        vaddl.u8        \tmpq4, d23, d26
310cabdff1aSopenharmony_ci        vadd.u16        q0,  \tmpq1, \tmpq1
311cabdff1aSopenharmony_ci        vaddw.u8        q0,  q0,  d23
312cabdff1aSopenharmony_ci        vaddw.u8        q0,  q0,  d24
313cabdff1aSopenharmony_ci        vadd.u16        q0,  q0,  \tmpq3
314cabdff1aSopenharmony_ci        vsub.s16        \tmpq2, \tmpq2, \tmpq1
315cabdff1aSopenharmony_ci        vsub.s16        \tmpq4, \tmpq4, \tmpq3
316cabdff1aSopenharmony_ci        vrshrn.u16      d2,  q0,  #3            @ out p2
317cabdff1aSopenharmony_ci
318cabdff1aSopenharmony_ci        vadd.u16        q0,  q0,  \tmpq2
319cabdff1aSopenharmony_ci        vaddl.u8        \tmpq1, d20, d23
320cabdff1aSopenharmony_ci        vaddl.u8        \tmpq2, d24, d27
321cabdff1aSopenharmony_ci        vrshrn.u16      d3,  q0,  #3            @ out p1
322cabdff1aSopenharmony_ci
323cabdff1aSopenharmony_ci        vadd.u16        q0,  q0,  \tmpq4
324cabdff1aSopenharmony_ci        vsub.s16        \tmpq2, \tmpq2, \tmpq1
325cabdff1aSopenharmony_ci        vaddl.u8        \tmpq3, d21, d24
326cabdff1aSopenharmony_ci        vaddl.u8        \tmpq4, d25, d27
327cabdff1aSopenharmony_ci        vrshrn.u16      d4,  q0,  #3            @ out p0
328cabdff1aSopenharmony_ci
329cabdff1aSopenharmony_ci        vadd.u16        q0,  q0,  \tmpq2
330cabdff1aSopenharmony_ci        vsub.s16        \tmpq4, \tmpq4, \tmpq3
331cabdff1aSopenharmony_ci        vaddl.u8        \tmpq1, d22, d25
332cabdff1aSopenharmony_ci        vaddl.u8        \tmpq2, d26, d27
333cabdff1aSopenharmony_ci        vrshrn.u16      d5,  q0,  #3            @ out q0
334cabdff1aSopenharmony_ci
335cabdff1aSopenharmony_ci        vadd.u16        q0,  q0,  \tmpq4
336cabdff1aSopenharmony_ci        vsub.s16        \tmpq2, \tmpq2, \tmpq1
337cabdff1aSopenharmony_ci        vrshrn.u16      \tmp5,  q0,  #3         @ out q1
338cabdff1aSopenharmony_ci
339cabdff1aSopenharmony_ci        vadd.u16        q0,  q0,  \tmpq2
340cabdff1aSopenharmony_ci        @ The output here is written back into the input registers. This doesn't
341cabdff1aSopenharmony_ci        @ matter for the flat8out part below, since we only update those pixels
342cabdff1aSopenharmony_ci        @ which won't be touched below.
343cabdff1aSopenharmony_ci        vbit            d21, d2,  d6
344cabdff1aSopenharmony_ci        vbit            d22, d3,  d6
345cabdff1aSopenharmony_ci        vbit            d23, d4,  d6
346cabdff1aSopenharmony_ci        vrshrn.u16      \tmp6,  q0,  #3         @ out q2
347cabdff1aSopenharmony_ci        vbit            d24, d5,  d6
348cabdff1aSopenharmony_ci        vbit            d25, \tmp5,  d6
349cabdff1aSopenharmony_ci        vbit            d26, \tmp6,  d6
350cabdff1aSopenharmony_ci.endif
351cabdff1aSopenharmony_ci.if \wd == 16
352cabdff1aSopenharmony_ci6:
353cabdff1aSopenharmony_ci        vorr            d2,  d6,  d7
354cabdff1aSopenharmony_ci        vmov            r2,  r3,  d2
355cabdff1aSopenharmony_ci        orrs            r2,  r2,  r3
356cabdff1aSopenharmony_ci        @ If no pixels needed flat8in nor flat8out, jump to a
357cabdff1aSopenharmony_ci        @ writeout of the inner 4 pixels
358cabdff1aSopenharmony_ci        beq             7f
359cabdff1aSopenharmony_ci        vmov            r2,  r3,  d7
360cabdff1aSopenharmony_ci        orrs            r2,  r2,  r3
361cabdff1aSopenharmony_ci        @ If no pixels need flat8out, jump to a writeout of the inner 6 pixels
362cabdff1aSopenharmony_ci        beq             8f
363cabdff1aSopenharmony_ci
364cabdff1aSopenharmony_ci        @ flat8out
365cabdff1aSopenharmony_ci        @ This writes all outputs into d2-d17 (skipping d7 and d16).
366cabdff1aSopenharmony_ci        @ If this part is skipped, the output is read from d21-d26 (which is the input
367cabdff1aSopenharmony_ci        @ to this section).
        @ d7 keeps the flat8out mask throughout: every vbif below puts the
        @ existing pixel back into the lanes where flat8out is not set.
368cabdff1aSopenharmony_ci        vshll.u8        q0,  d16, #3  @ 8 * d16
369cabdff1aSopenharmony_ci        vsubw.u8        q0,  q0,  d16 @ 7 * d16
370cabdff1aSopenharmony_ci        vaddw.u8        q0,  q0,  d17
371cabdff1aSopenharmony_ci        vaddl.u8        q4,  d17, d18
372cabdff1aSopenharmony_ci        vaddl.u8        q5,  d19, d20
373cabdff1aSopenharmony_ci        vadd.s16        q0,  q0,  q4
374cabdff1aSopenharmony_ci        vaddl.u8        q4,  d16, d17
375cabdff1aSopenharmony_ci        vaddl.u8        q6,  d21, d22
376cabdff1aSopenharmony_ci        vadd.s16        q0,  q0,  q5
377cabdff1aSopenharmony_ci        vaddl.u8        q5,  d18, d25
378cabdff1aSopenharmony_ci        vaddl.u8        q7,  d23, d24
379cabdff1aSopenharmony_ci        vsub.s16        q5,  q5,  q4
380cabdff1aSopenharmony_ci        vadd.s16        q0,  q0,  q6
381cabdff1aSopenharmony_ci        vadd.s16        q0,  q0,  q7
382cabdff1aSopenharmony_ci        vaddl.u8        q6,  d16, d18
383cabdff1aSopenharmony_ci        vaddl.u8        q7,  d19, d26
384cabdff1aSopenharmony_ci        vrshrn.u16      d2,  q0,  #4
385cabdff1aSopenharmony_ci
386cabdff1aSopenharmony_ci        vadd.s16        q0,  q0,  q5
387cabdff1aSopenharmony_ci        vaddl.u8        q4,  d16, d19
388cabdff1aSopenharmony_ci        vaddl.u8        q5,  d20, d27
389cabdff1aSopenharmony_ci        vsub.s16        q7,  q7,  q6
390cabdff1aSopenharmony_ci        vbif            d2,  d17, d7
391cabdff1aSopenharmony_ci        vrshrn.u16      d3,  q0,  #4
392cabdff1aSopenharmony_ci
393cabdff1aSopenharmony_ci        vadd.s16        q0,  q0,  q7
394cabdff1aSopenharmony_ci        vaddl.u8        q6,  d16, d20
395cabdff1aSopenharmony_ci        vaddl.u8        q7,  d21, d28
396cabdff1aSopenharmony_ci        vsub.s16        q5,  q5,  q4
397cabdff1aSopenharmony_ci        vbif            d3,  d18, d7
398cabdff1aSopenharmony_ci        vrshrn.u16      d4,  q0,  #4
399cabdff1aSopenharmony_ci
400cabdff1aSopenharmony_ci        vadd.s16        q0,  q0,  q5
401cabdff1aSopenharmony_ci        vaddl.u8        q4,  d16, d21
402cabdff1aSopenharmony_ci        vaddl.u8        q5,  d22, d29
403cabdff1aSopenharmony_ci        vsub.s16        q7,  q7,  q6
404cabdff1aSopenharmony_ci        vbif            d4,  d19, d7
405cabdff1aSopenharmony_ci        vrshrn.u16      d5,  q0,  #4
406cabdff1aSopenharmony_ci
407cabdff1aSopenharmony_ci        vadd.s16        q0,  q0,  q7
408cabdff1aSopenharmony_ci        vaddl.u8        q6,  d16, d22
409cabdff1aSopenharmony_ci        vaddl.u8        q7,  d23, d30
410cabdff1aSopenharmony_ci        vsub.s16        q5,  q5,  q4
411cabdff1aSopenharmony_ci        vbif            d5,  d20, d7
        @ d6 (the flat8in mask) is overwritten with an output here.
412cabdff1aSopenharmony_ci        vrshrn.u16      d6,  q0,  #4
413cabdff1aSopenharmony_ci
414cabdff1aSopenharmony_ci        vadd.s16        q0,  q0,  q5
415cabdff1aSopenharmony_ci        vaddl.u8        q5,  d16, d23
416cabdff1aSopenharmony_ci        vsub.s16        q7,  q7,  q6
417cabdff1aSopenharmony_ci        vaddl.u8        q6,  d24, d31
418cabdff1aSopenharmony_ci        vbif            d6,  d21, d7
419cabdff1aSopenharmony_ci        vrshrn.u16      d8,  q0,  #4
420cabdff1aSopenharmony_ci
421cabdff1aSopenharmony_ci        vadd.s16        q0,  q0,  q7
422cabdff1aSopenharmony_ci        vsub.s16        q5,  q6,  q5
423cabdff1aSopenharmony_ci        vaddl.u8        q6,  d17, d24
424cabdff1aSopenharmony_ci        vaddl.u8        q7,  d25, d31
425cabdff1aSopenharmony_ci        vbif            d8,  d22, d7
426cabdff1aSopenharmony_ci        vrshrn.u16      d9,  q0,  #4
427cabdff1aSopenharmony_ci
428cabdff1aSopenharmony_ci        vadd.s16        q0,  q0,  q5
429cabdff1aSopenharmony_ci        vsub.s16        q7,  q7,  q6
430cabdff1aSopenharmony_ci        vaddl.u8        q6,  d26, d31
431cabdff1aSopenharmony_ci        vbif            d9,  d23, d7
432cabdff1aSopenharmony_ci        vrshrn.u16      d10, q0,  #4
433cabdff1aSopenharmony_ci
434cabdff1aSopenharmony_ci        vadd.s16        q0,  q0,  q7
435cabdff1aSopenharmony_ci        vaddl.u8        q7,  d18, d25
        @ From here on q9, and further down q10 and q11, are reused as
        @ temporaries; the input rows they held are not read again afterwards.
436cabdff1aSopenharmony_ci        vaddl.u8        q9,  d19, d26
437cabdff1aSopenharmony_ci        vsub.s16        q6,  q6,  q7
438cabdff1aSopenharmony_ci        vaddl.u8        q7,  d27, d31
439cabdff1aSopenharmony_ci        vbif            d10, d24, d7
440cabdff1aSopenharmony_ci        vrshrn.u16      d11, q0,  #4
441cabdff1aSopenharmony_ci
442cabdff1aSopenharmony_ci        vadd.s16        q0,  q0,  q6
443cabdff1aSopenharmony_ci        vaddl.u8        q6,  d20, d27
444cabdff1aSopenharmony_ci        vsub.s16        q7,  q7,  q9
445cabdff1aSopenharmony_ci        vaddl.u8        q9,  d28, d31
446cabdff1aSopenharmony_ci        vbif            d11, d25, d7
447cabdff1aSopenharmony_ci        vsub.s16        q9,  q9,  q6
448cabdff1aSopenharmony_ci        vrshrn.u16      d12, q0,  #4
449cabdff1aSopenharmony_ci
450cabdff1aSopenharmony_ci        vadd.s16        q0,  q0,  q7
451cabdff1aSopenharmony_ci        vaddl.u8        q7,  d21, d28
452cabdff1aSopenharmony_ci        vaddl.u8        q10, d29, d31
453cabdff1aSopenharmony_ci        vbif            d12, d26, d7
454cabdff1aSopenharmony_ci        vrshrn.u16      d13, q0,  #4
455cabdff1aSopenharmony_ci
456cabdff1aSopenharmony_ci        vadd.s16        q0,  q0,  q9
457cabdff1aSopenharmony_ci        vsub.s16        q10, q10, q7
458cabdff1aSopenharmony_ci        vaddl.u8        q9,  d22, d29
459cabdff1aSopenharmony_ci        vaddl.u8        q11, d30, d31
460cabdff1aSopenharmony_ci        vbif            d13, d27, d7
461cabdff1aSopenharmony_ci        vrshrn.u16      d14, q0,  #4
462cabdff1aSopenharmony_ci
463cabdff1aSopenharmony_ci        vadd.s16        q0,  q0,  q10
464cabdff1aSopenharmony_ci        vsub.s16        q11, q11, q9
465cabdff1aSopenharmony_ci        vbif            d14, d28, d7
466cabdff1aSopenharmony_ci        vrshrn.u16      d15, q0,  #4
467cabdff1aSopenharmony_ci
468cabdff1aSopenharmony_ci        vadd.s16        q0,  q0,  q11
469cabdff1aSopenharmony_ci        vbif            d15, d29, d7
470cabdff1aSopenharmony_ci        vrshrn.u16      d17, q0,  #4
471cabdff1aSopenharmony_ci        vbif            d17, d30, d7
472cabdff1aSopenharmony_ci.endif
473cabdff1aSopenharmony_ci.endm
474cabdff1aSopenharmony_ci
475cabdff1aSopenharmony_ci@ For wd <= 8, we use d16-d19 and d28-d31 for temp registers,
476cabdff1aSopenharmony_ci@ while we need those for inputs/outputs in wd=16 and use d8-d15
477cabdff1aSopenharmony_ci@ for temp registers there instead.
@
@ loop_filter_4: expands loop_filter with wd=4, using d16-d19/d28-d31
@ (q8, q9, q14, q15) as the temp registers.
478cabdff1aSopenharmony_ci.macro loop_filter_4
479cabdff1aSopenharmony_ci        loop_filter     4,  d16, d17, d18, d19, d28, d29, d30, d31, q8,  q9,  q14, q15
480cabdff1aSopenharmony_ci.endm
481cabdff1aSopenharmony_ci
@ loop_filter_8: expands loop_filter with wd=8, using d16-d19/d28-d31
@ (q8, q9, q14, q15) as the temp registers.  For this width the expanding
@ code has to provide local label 6 in addition to 9.
482cabdff1aSopenharmony_ci.macro loop_filter_8
483cabdff1aSopenharmony_ci        loop_filter     8,  d16, d17, d18, d19, d28, d29, d30, d31, q8,  q9,  q14, q15
484cabdff1aSopenharmony_ci.endm
485cabdff1aSopenharmony_ci
@ loop_filter_16: expands loop_filter with wd=16.  d16-d31 all hold pixel
@ rows here, so d8-d15 (q4-q7) are passed as the temp registers.
@ NOTE(review): d8-d15 are callee-saved under the ARM procedure call
@ standard, so the expanding function presumably saves/restores them -
@ confirm at the call sites.
486cabdff1aSopenharmony_ci.macro loop_filter_16
487cabdff1aSopenharmony_ci        loop_filter     16, d8,  d9,  d10, d11, d12, d13, d14, d15, q4,  q5,  q6,  q7
488cabdff1aSopenharmony_ci.endm
489cabdff1aSopenharmony_ci
490cabdff1aSopenharmony_ci
491cabdff1aSopenharmony_ci@ The public functions in this file have got the following signature:
492cabdff1aSopenharmony_ci@ void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);
493cabdff1aSopenharmony_ci
@ Loop filter across 8 rows of 8 pixels, using the loop_filter_4 expansion.
@ Follows the common signature given above: with the AAPCS argument order
@ r0 = dst (the q0 row) and r1 = stride. The four rows above dst are
@ p3..p0 and the four rows starting at dst are q0..q3.
@ Only p1, p0, q0 and q1 are written back.
494cabdff1aSopenharmony_cifunction ff_vp9_loop_filter_v_4_8_neon, export=1
        @ r12 = dst - 4 * stride, i.e. the p3 row.
495cabdff1aSopenharmony_ci        sub             r12, r0,  r1, lsl #2
        @ Interleave loads from the p side (r12) and the q side (r0); each
        @ load post-increments its pointer by one stride. The :64 qualifier
        @ asserts that every row address is 8-byte aligned.
496cabdff1aSopenharmony_ci        vld1.8          {d20}, [r12,:64], r1 @ p3
497cabdff1aSopenharmony_ci        vld1.8          {d24}, [r0, :64], r1 @ q0
498cabdff1aSopenharmony_ci        vld1.8          {d21}, [r12,:64], r1 @ p2
499cabdff1aSopenharmony_ci        vld1.8          {d25}, [r0, :64], r1 @ q1
500cabdff1aSopenharmony_ci        vld1.8          {d22}, [r12,:64], r1 @ p1
501cabdff1aSopenharmony_ci        vld1.8          {d26}, [r0, :64], r1 @ q2
502cabdff1aSopenharmony_ci        vld1.8          {d23}, [r12,:64], r1 @ p0
503cabdff1aSopenharmony_ci        vld1.8          {d27}, [r0, :64], r1 @ q3
        @ After the loads r0 = dst + 4 * stride and r12 = dst. Rewind r0 to
        @ the q0 row and move r12 back two rows to the p1 row for the stores.
504cabdff1aSopenharmony_ci        sub             r0,  r0,  r1, lsl #2
505cabdff1aSopenharmony_ci        sub             r12, r12, r1, lsl #1
506cabdff1aSopenharmony_ci
507cabdff1aSopenharmony_ci        loop_filter_4
508cabdff1aSopenharmony_ci
        @ Store order: p1, q0, p0, q1.
509cabdff1aSopenharmony_ci        vst1.8          {d22}, [r12,:64], r1
510cabdff1aSopenharmony_ci        vst1.8          {d24}, [r0, :64], r1
511cabdff1aSopenharmony_ci        vst1.8          {d23}, [r12,:64], r1
512cabdff1aSopenharmony_ci        vst1.8          {d25}, [r0, :64], r1
        @ Label 9 is not referenced by any instruction visible in this
        @ function; presumably the loop_filter macro branches here to return
        @ without storing -- TODO confirm against the macro body.
513cabdff1aSopenharmony_ci9:
514cabdff1aSopenharmony_ci        bx              lr
515cabdff1aSopenharmony_ciendfunc
516cabdff1aSopenharmony_ci
@ Loop filter across 8 columns for 8 consecutive rows, using the
@ loop_filter_4 expansion. Follows the common signature given above: with
@ the AAPCS argument order r0 = dst and r1 = stride.
@ Reads 8 bytes per row starting 4 bytes before dst, transposes so that each
@ d register holds one column, filters, and writes back only the middle
@ 4 bytes of every row.
517cabdff1aSopenharmony_cifunction ff_vp9_loop_filter_h_4_8_neon, export=1
        @ r12 = dst - 4 walks rows 0-3, r0 = r12 + 4 * stride walks rows 4-7.
        @ No alignment qualifier is used on these loads.
518cabdff1aSopenharmony_ci        sub             r12, r0,  #4
519cabdff1aSopenharmony_ci        add             r0,  r12, r1, lsl #2
520cabdff1aSopenharmony_ci        vld1.8          {d20}, [r12], r1
521cabdff1aSopenharmony_ci        vld1.8          {d24}, [r0],  r1
522cabdff1aSopenharmony_ci        vld1.8          {d21}, [r12], r1
523cabdff1aSopenharmony_ci        vld1.8          {d25}, [r0],  r1
524cabdff1aSopenharmony_ci        vld1.8          {d22}, [r12], r1
525cabdff1aSopenharmony_ci        vld1.8          {d26}, [r0],  r1
526cabdff1aSopenharmony_ci        vld1.8          {d23}, [r12], r1
527cabdff1aSopenharmony_ci        vld1.8          {d27}, [r0],  r1
528cabdff1aSopenharmony_ci
        @ Rewind both pointers by the four rows each of them advanced.
529cabdff1aSopenharmony_ci        sub             r12, r12, r1, lsl #2
530cabdff1aSopenharmony_ci        sub             r0,  r0,  r1, lsl #2
531cabdff1aSopenharmony_ci        @ Move r0/r12 forward by 2 pixels; we don't need to rewrite the
532cabdff1aSopenharmony_ci        @ outermost 2 pixels since they aren't changed.
533cabdff1aSopenharmony_ci        add             r12, r12, #2
534cabdff1aSopenharmony_ci        add             r0,  r0,  #2
535cabdff1aSopenharmony_ci
536cabdff1aSopenharmony_ci        @ Transpose the 8x8 pixels, taking advantage of q registers, to get
537cabdff1aSopenharmony_ci        @ one register per column.
538cabdff1aSopenharmony_ci        transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
539cabdff1aSopenharmony_ci
540cabdff1aSopenharmony_ci        loop_filter_4
541cabdff1aSopenharmony_ci
542cabdff1aSopenharmony_ci        @ We only will write the mid 4 pixels back; after the loop filter,
543cabdff1aSopenharmony_ci        @ these are in d22, d23, d24, d25 (q11, q12), ordered as rows
544cabdff1aSopenharmony_ci        @ (8x4 pixels). We need to transpose them to columns, done with a
545cabdff1aSopenharmony_ci        @ 4x4 transpose (which in practice is two 4x4 transposes of the two
546cabdff1aSopenharmony_ci        @ 4x4 halves of the 8x4 pixels; into 4x8 pixels).
547cabdff1aSopenharmony_ci        transpose_q_4x4 q11, q12, d22, d23, d24, d25
548cabdff1aSopenharmony_ci
        @ Lane [0] of each register goes to rows 0-3 (via r12) and lane [1]
        @ to rows 4-7 (via r0); each store writes 4 bytes.
549cabdff1aSopenharmony_ci        vst1.32         {d22[0]}, [r12], r1
550cabdff1aSopenharmony_ci        vst1.32         {d22[1]}, [r0],  r1
551cabdff1aSopenharmony_ci        vst1.32         {d23[0]}, [r12], r1
552cabdff1aSopenharmony_ci        vst1.32         {d23[1]}, [r0],  r1
553cabdff1aSopenharmony_ci        vst1.32         {d24[0]}, [r12], r1
554cabdff1aSopenharmony_ci        vst1.32         {d24[1]}, [r0],  r1
555cabdff1aSopenharmony_ci        vst1.32         {d25[0]}, [r12], r1
556cabdff1aSopenharmony_ci        vst1.32         {d25[1]}, [r0],  r1
        @ Label 9 is not referenced by any instruction visible in this
        @ function; presumably the loop_filter macro branches here to return
        @ without storing -- TODO confirm against the macro body.
557cabdff1aSopenharmony_ci9:
558cabdff1aSopenharmony_ci        bx              lr
559cabdff1aSopenharmony_ciendfunc
560cabdff1aSopenharmony_ci
@ Loop filter across 8 rows of 16 pixels, using the loop_filter_q macro
@ (defined elsewhere in this file). Follows the common signature given
@ above: with the AAPCS argument order r0 = dst (the q0 row) and
@ r1 = stride. Rows are held in full q registers: q8-q11 = p3..p0,
@ q12-q15 = q0..q3. Only p1, p0, q0 and q1 are written back.
561cabdff1aSopenharmony_cifunction ff_vp9_loop_filter_v_44_16_neon, export=1
        @ q4-q7 (d8-d15) are callee-saved under the AAPCS; they are saved here
        @ presumably because loop_filter_q uses them as scratch -- verify
        @ against the macro. Restored at label 9 below.
562cabdff1aSopenharmony_ci        vpush           {q4-q7}
        @ r12 = dst - 4 * stride, i.e. the p3 row.
563cabdff1aSopenharmony_ci        sub             r12, r0,  r1, lsl #2
        @ The :128 qualifier asserts 16-byte alignment of every row address.
564cabdff1aSopenharmony_ci        vld1.8          {q8},  [r12,:128], r1 @ p3
565cabdff1aSopenharmony_ci        vld1.8          {q12}, [r0, :128], r1 @ q0
566cabdff1aSopenharmony_ci        vld1.8          {q9},  [r12,:128], r1 @ p2
567cabdff1aSopenharmony_ci        vld1.8          {q13}, [r0, :128], r1 @ q1
568cabdff1aSopenharmony_ci        vld1.8          {q10}, [r12,:128], r1 @ p1
569cabdff1aSopenharmony_ci        vld1.8          {q14}, [r0, :128], r1 @ q2
570cabdff1aSopenharmony_ci        vld1.8          {q11}, [r12,:128], r1 @ p0
571cabdff1aSopenharmony_ci        vld1.8          {q15}, [r0, :128], r1 @ q3
        @ Rewind r0 to the q0 row and move r12 back two rows to the p1 row.
572cabdff1aSopenharmony_ci        sub             r0,  r0,  r1, lsl #2
573cabdff1aSopenharmony_ci        sub             r12, r12, r1, lsl #1
574cabdff1aSopenharmony_ci
575cabdff1aSopenharmony_ci        loop_filter_q
576cabdff1aSopenharmony_ci
        @ Store order: p1, q0, p0, q1.
577cabdff1aSopenharmony_ci        vst1.8          {q10}, [r12,:128], r1
578cabdff1aSopenharmony_ci        vst1.8          {q12}, [r0, :128], r1
579cabdff1aSopenharmony_ci        vst1.8          {q11}, [r12,:128], r1
580cabdff1aSopenharmony_ci        vst1.8          {q13}, [r0, :128], r1
        @ Common exit: label 9 sits before the vpop so that any path arriving
        @ here (presumably also an early exit from loop_filter_q -- TODO
        @ confirm) restores q4-q7 before returning.
581cabdff1aSopenharmony_ci9:
582cabdff1aSopenharmony_ci        vpop            {q4-q7}
583cabdff1aSopenharmony_ci        bx              lr
584cabdff1aSopenharmony_ciendfunc
585cabdff1aSopenharmony_ci
@ Loop filter across 8 columns for 16 consecutive rows, using the
@ loop_filter_q macro (defined elsewhere in this file). Follows the common
@ signature given above: with the AAPCS argument order r0 = dst and
@ r1 = stride.
@ Reads 8 bytes per row starting 4 bytes before dst: rows 0-7 land in the
@ low d halves of q8-q15 and rows 8-15 in the high d halves. After the
@ transpose and filter, only the middle 4 bytes of every row are written back.
586cabdff1aSopenharmony_cifunction ff_vp9_loop_filter_h_44_16_neon, export=1
        @ q4-q7 (d8-d15) are callee-saved under the AAPCS; restored at label 9.
587cabdff1aSopenharmony_ci        vpush           {q4-q7}
        @ r12 = dst - 4 walks rows 0-3, r0 = r12 + 4 * stride walks rows 4-7.
588cabdff1aSopenharmony_ci        sub             r12, r0,  #4
589cabdff1aSopenharmony_ci        add             r0,  r12, r1, lsl #2
590cabdff1aSopenharmony_ci        vld1.8          {d16}, [r12], r1
591cabdff1aSopenharmony_ci        vld1.8          {d24}, [r0],  r1
592cabdff1aSopenharmony_ci        vld1.8          {d18}, [r12], r1
593cabdff1aSopenharmony_ci        vld1.8          {d26}, [r0],  r1
594cabdff1aSopenharmony_ci        vld1.8          {d20}, [r12], r1
595cabdff1aSopenharmony_ci        vld1.8          {d28}, [r0],  r1
596cabdff1aSopenharmony_ci        vld1.8          {d22}, [r12], r1
597cabdff1aSopenharmony_ci        vld1.8          {d30}, [r0],  r1
        @ r0 now points at row 8. Continue with r12 on rows 8-11 and r0 on
        @ rows 12-15, filling the high d halves of the same q registers.
598cabdff1aSopenharmony_ci        mov             r12, r0
599cabdff1aSopenharmony_ci        add             r0,  r0,  r1, lsl #2
600cabdff1aSopenharmony_ci        vld1.8          {d17}, [r12], r1
601cabdff1aSopenharmony_ci        vld1.8          {d25}, [r0],  r1
602cabdff1aSopenharmony_ci        vld1.8          {d19}, [r12], r1
603cabdff1aSopenharmony_ci        vld1.8          {d27}, [r0],  r1
604cabdff1aSopenharmony_ci        vld1.8          {d21}, [r12], r1
605cabdff1aSopenharmony_ci        vld1.8          {d29}, [r0],  r1
606cabdff1aSopenharmony_ci        vld1.8          {d23}, [r12], r1
607cabdff1aSopenharmony_ci        vld1.8          {d31}, [r0],  r1
608cabdff1aSopenharmony_ci
609cabdff1aSopenharmony_ci        @ Transpose the 16x8 pixels, as two 8x8 parts
610cabdff1aSopenharmony_ci        transpose_8x8   q8,  q9,  q10, q11, q12, q13, q14, q15
611cabdff1aSopenharmony_ci
612cabdff1aSopenharmony_ci        loop_filter_q
613cabdff1aSopenharmony_ci
        @ r0 ended the loads 16 rows past the first row read; r12 = row 0 and
        @ r0 = row 8 of the block (both still at column dst - 4).
614cabdff1aSopenharmony_ci        sub             r12, r0,  r1, lsl #4
615cabdff1aSopenharmony_ci        add             r0,  r12, r1, lsl #3
616cabdff1aSopenharmony_ci        @ Move r0/r12 forward by 2 pixels; we don't need to rewrite the
617cabdff1aSopenharmony_ci        @ outermost 2 pixels since they aren't changed.
618cabdff1aSopenharmony_ci        add             r12, r12, #2
619cabdff1aSopenharmony_ci        add             r0,  r0,  #2
620cabdff1aSopenharmony_ci
621cabdff1aSopenharmony_ci        @ We only will write the mid 4 pixels back; after the loop filter,
622cabdff1aSopenharmony_ci        @ these are in q10, q11, q12, q13, ordered as rows (16x4 pixels).
623cabdff1aSopenharmony_ci        @ We need to transpose them to columns, done with a 4x4 transpose
624cabdff1aSopenharmony_ci        @ (which in practice is four 4x4 transposes of the 4x4 blocks of
625cabdff1aSopenharmony_ci        @ the 16x4 pixels; into 4x16 pixels).
626cabdff1aSopenharmony_ci        transpose_4x4   q10, q11, q12, q13
627cabdff1aSopenharmony_ci
        @ r12 writes rows 0-7 from the even d registers (lane 0 for rows 0-3,
        @ lane 1 for rows 4-7); r0 writes rows 8-15 from the odd d registers
        @ in the same lane order. Each store writes 4 bytes.
628cabdff1aSopenharmony_ci        vst1.32         {d20[0]}, [r12], r1
629cabdff1aSopenharmony_ci        vst1.32         {d21[0]}, [r0],  r1
630cabdff1aSopenharmony_ci        vst1.32         {d22[0]}, [r12], r1
631cabdff1aSopenharmony_ci        vst1.32         {d23[0]}, [r0],  r1
632cabdff1aSopenharmony_ci        vst1.32         {d24[0]}, [r12], r1
633cabdff1aSopenharmony_ci        vst1.32         {d25[0]}, [r0],  r1
634cabdff1aSopenharmony_ci        vst1.32         {d26[0]}, [r12], r1
635cabdff1aSopenharmony_ci        vst1.32         {d27[0]}, [r0],  r1
636cabdff1aSopenharmony_ci        vst1.32         {d20[1]}, [r12], r1
637cabdff1aSopenharmony_ci        vst1.32         {d21[1]}, [r0],  r1
638cabdff1aSopenharmony_ci        vst1.32         {d22[1]}, [r12], r1
639cabdff1aSopenharmony_ci        vst1.32         {d23[1]}, [r0],  r1
640cabdff1aSopenharmony_ci        vst1.32         {d24[1]}, [r12], r1
641cabdff1aSopenharmony_ci        vst1.32         {d25[1]}, [r0],  r1
642cabdff1aSopenharmony_ci        vst1.32         {d26[1]}, [r12], r1
643cabdff1aSopenharmony_ci        vst1.32         {d27[1]}, [r0],  r1
        @ Common exit: restores q4-q7 before returning. Presumably also the
        @ early-exit target of loop_filter_q -- TODO confirm.
644cabdff1aSopenharmony_ci9:
645cabdff1aSopenharmony_ci        vpop            {q4-q7}
646cabdff1aSopenharmony_ci        bx              lr
647cabdff1aSopenharmony_ciendfunc
648cabdff1aSopenharmony_ci
@ Loop filter across 8 rows of 8 pixels, using the loop_filter_8 expansion.
@ Follows the common signature given above: with the AAPCS argument order
@ r0 = dst (the q0 row) and r1 = stride.
@ The fall-through path writes six rows (p2..p0, q0..q2); the path at
@ label 6 writes only four (p1, p0, q0, q1).
649cabdff1aSopenharmony_cifunction ff_vp9_loop_filter_v_8_8_neon, export=1
        @ r12 = dst - 4 * stride, i.e. the p3 row.
650cabdff1aSopenharmony_ci        sub             r12, r0,  r1, lsl #2
        @ The :64 qualifier asserts 8-byte alignment of every row address.
651cabdff1aSopenharmony_ci        vld1.8          {d20}, [r12,:64], r1 @ p3
652cabdff1aSopenharmony_ci        vld1.8          {d24}, [r0, :64], r1 @ q0
653cabdff1aSopenharmony_ci        vld1.8          {d21}, [r12,:64], r1 @ p2
654cabdff1aSopenharmony_ci        vld1.8          {d25}, [r0, :64], r1 @ q1
655cabdff1aSopenharmony_ci        vld1.8          {d22}, [r12,:64], r1 @ p1
656cabdff1aSopenharmony_ci        vld1.8          {d26}, [r0, :64], r1 @ q2
657cabdff1aSopenharmony_ci        vld1.8          {d23}, [r12,:64], r1 @ p0
658cabdff1aSopenharmony_ci        vld1.8          {d27}, [r0, :64], r1 @ q3
        @ Rewind both pointers by four rows (r12 -> p3 row, r0 -> q0 row),
        @ then step r12 forward one row so that it points at p2.
659cabdff1aSopenharmony_ci        sub             r12, r12, r1, lsl #2
660cabdff1aSopenharmony_ci        sub             r0,  r0,  r1, lsl #2
661cabdff1aSopenharmony_ci        add             r12, r12, r1
662cabdff1aSopenharmony_ci
663cabdff1aSopenharmony_ci        loop_filter_8
664cabdff1aSopenharmony_ci
        @ Store order: p2, q0, p1, q1, p0, q2.
665cabdff1aSopenharmony_ci        vst1.8          {d21}, [r12,:64], r1
666cabdff1aSopenharmony_ci        vst1.8          {d24}, [r0, :64], r1
667cabdff1aSopenharmony_ci        vst1.8          {d22}, [r12,:64], r1
668cabdff1aSopenharmony_ci        vst1.8          {d25}, [r0, :64], r1
669cabdff1aSopenharmony_ci        vst1.8          {d23}, [r12,:64], r1
670cabdff1aSopenharmony_ci        vst1.8          {d26}, [r0, :64], r1
        @ Label 9 is not referenced by any instruction visible in this
        @ function; presumably the loop_filter macro branches here to return
        @ without storing -- TODO confirm against the macro body.
671cabdff1aSopenharmony_ci9:
672cabdff1aSopenharmony_ci        bx              lr
        @ Label 6 is likewise only reachable from outside the visible code;
        @ presumably the macro branches here when the flat8in part was not
        @ needed (compare the note at label 6 in the h_8_8 variant).
        @ r12 is recomputed from r0 as the p1 row, and only p1, q0, p0 and q1
        @ are stored.
673cabdff1aSopenharmony_ci6:
674cabdff1aSopenharmony_ci        sub             r12, r0,  r1, lsl #1
675cabdff1aSopenharmony_ci        vst1.8          {d22}, [r12,:64], r1
676cabdff1aSopenharmony_ci        vst1.8          {d24}, [r0, :64], r1
677cabdff1aSopenharmony_ci        vst1.8          {d23}, [r12,:64], r1
678cabdff1aSopenharmony_ci        vst1.8          {d25}, [r0, :64], r1
679cabdff1aSopenharmony_ci        bx              lr
680cabdff1aSopenharmony_ciendfunc
681cabdff1aSopenharmony_ci
@ Loop filter across 8 columns for 8 consecutive rows, using the
@ loop_filter_8 expansion. Follows the common signature given above: with
@ the AAPCS argument order r0 = dst and r1 = stride.
@ Reads 8 bytes per row starting 4 bytes before dst, transposes to one column
@ per d register, filters, transposes back and rewrites all 8 bytes per row.
@ The path at label 6 rewrites only the middle 4 bytes per row.
682cabdff1aSopenharmony_cifunction ff_vp9_loop_filter_h_8_8_neon, export=1
        @ r12 = dst - 4 walks rows 0-3, r0 = r12 + 4 * stride walks rows 4-7.
683cabdff1aSopenharmony_ci        sub             r12, r0,  #4
684cabdff1aSopenharmony_ci        add             r0,  r12, r1, lsl #2
685cabdff1aSopenharmony_ci        vld1.8          {d20}, [r12], r1
686cabdff1aSopenharmony_ci        vld1.8          {d24}, [r0],  r1
687cabdff1aSopenharmony_ci        vld1.8          {d21}, [r12], r1
688cabdff1aSopenharmony_ci        vld1.8          {d25}, [r0],  r1
689cabdff1aSopenharmony_ci        vld1.8          {d22}, [r12], r1
690cabdff1aSopenharmony_ci        vld1.8          {d26}, [r0],  r1
691cabdff1aSopenharmony_ci        vld1.8          {d23}, [r12], r1
692cabdff1aSopenharmony_ci        vld1.8          {d27}, [r0],  r1
693cabdff1aSopenharmony_ci
        @ Rewind both pointers to the first row each of them read, so that the
        @ write-back can reuse them.
694cabdff1aSopenharmony_ci        sub             r12, r12, r1, lsl #2
695cabdff1aSopenharmony_ci        sub             r0,  r0,  r1, lsl #2
696cabdff1aSopenharmony_ci
        @ Rows -> columns: one d register per pixel column.
697cabdff1aSopenharmony_ci        transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
698cabdff1aSopenharmony_ci
699cabdff1aSopenharmony_ci        loop_filter_8
700cabdff1aSopenharmony_ci
701cabdff1aSopenharmony_ci        @ Even though only 6 pixels per row have been changed, we write the
702cabdff1aSopenharmony_ci        @ full 8 pixel registers.
703cabdff1aSopenharmony_ci        transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
704cabdff1aSopenharmony_ci
        @ d20-d23 go to rows 0-3 via r12, d24-d27 to rows 4-7 via r0.
705cabdff1aSopenharmony_ci        vst1.8          {d20}, [r12], r1
706cabdff1aSopenharmony_ci        vst1.8          {d24}, [r0],  r1
707cabdff1aSopenharmony_ci        vst1.8          {d21}, [r12], r1
708cabdff1aSopenharmony_ci        vst1.8          {d25}, [r0],  r1
709cabdff1aSopenharmony_ci        vst1.8          {d22}, [r12], r1
710cabdff1aSopenharmony_ci        vst1.8          {d26}, [r0],  r1
711cabdff1aSopenharmony_ci        vst1.8          {d23}, [r12], r1
712cabdff1aSopenharmony_ci        vst1.8          {d27}, [r0],  r1
        @ Label 9 is not referenced by any instruction visible in this
        @ function; presumably the loop_filter macro branches here to return
        @ without storing -- TODO confirm against the macro body.
713cabdff1aSopenharmony_ci9:
714cabdff1aSopenharmony_ci        bx              lr
715cabdff1aSopenharmony_ci6:
716cabdff1aSopenharmony_ci        @ If we didn't need to do the flat8in part, we use the same writeback
717cabdff1aSopenharmony_ci        @ as in loop_filter_h_4_8.
        @ r12/r0 still point at column dst - 4 of rows 0 and 4; skip the two
        @ outermost pixels before storing 4 bytes per row.
718cabdff1aSopenharmony_ci        add             r12, r12, #2
719cabdff1aSopenharmony_ci        add             r0,  r0,  #2
720cabdff1aSopenharmony_ci        transpose_q_4x4 q11, q12, d22, d23, d24, d25
721cabdff1aSopenharmony_ci        vst1.32         {d22[0]}, [r12], r1
722cabdff1aSopenharmony_ci        vst1.32         {d22[1]}, [r0],  r1
723cabdff1aSopenharmony_ci        vst1.32         {d23[0]}, [r12], r1
724cabdff1aSopenharmony_ci        vst1.32         {d23[1]}, [r0],  r1
725cabdff1aSopenharmony_ci        vst1.32         {d24[0]}, [r12], r1
726cabdff1aSopenharmony_ci        vst1.32         {d24[1]}, [r0],  r1
727cabdff1aSopenharmony_ci        vst1.32         {d25[0]}, [r12], r1
728cabdff1aSopenharmony_ci        vst1.32         {d25[1]}, [r0],  r1
729cabdff1aSopenharmony_ci        bx              lr
730cabdff1aSopenharmony_ciendfunc
731cabdff1aSopenharmony_ci
@ Internal (non-exported) worker behind ff_vp9_loop_filter_v_16_8_neon and
@ ff_vp9_loop_filter_v_16_16_neon: filters 16 rows (p7..p0 above r0,
@ q0..q7 from r0 down) of 8 pixels using the loop_filter_16 expansion.
@ In:  r0 = q0 row, r1 = stride. The callers also push one extra word just
@      before the bl, so presumably the macro reads hev_thr from [sp] --
@      TODO confirm against the macro body.
@ Out: every exit path visible here that stores rewinds r0 to its entry
@      value, which the 16-wide caller relies on when it advances r0 by 8
@      for the second half.
@ Uses d8-d15 via loop_filter_16; the callers save and restore q4-q7.
732cabdff1aSopenharmony_cifunction vp9_loop_filter_v_16_neon
        @ r12 = r0 - 8 * stride, i.e. the p7 row.
733cabdff1aSopenharmony_ci        sub             r12, r0,  r1, lsl #3
734cabdff1aSopenharmony_ci        @ Read p7-p0 using r12 and q0-q7 using r0
735cabdff1aSopenharmony_ci        vld1.8          {d16}, [r12,:64], r1 @ p7
736cabdff1aSopenharmony_ci        vld1.8          {d24}, [r0, :64], r1 @ q0
737cabdff1aSopenharmony_ci        vld1.8          {d17}, [r12,:64], r1 @ p6
738cabdff1aSopenharmony_ci        vld1.8          {d25}, [r0, :64], r1 @ q1
739cabdff1aSopenharmony_ci        vld1.8          {d18}, [r12,:64], r1 @ p5
740cabdff1aSopenharmony_ci        vld1.8          {d26}, [r0, :64], r1 @ q2
741cabdff1aSopenharmony_ci        vld1.8          {d19}, [r12,:64], r1 @ p4
742cabdff1aSopenharmony_ci        vld1.8          {d27}, [r0, :64], r1 @ q3
743cabdff1aSopenharmony_ci        vld1.8          {d20}, [r12,:64], r1 @ p3
744cabdff1aSopenharmony_ci        vld1.8          {d28}, [r0, :64], r1 @ q4
745cabdff1aSopenharmony_ci        vld1.8          {d21}, [r12,:64], r1 @ p2
746cabdff1aSopenharmony_ci        vld1.8          {d29}, [r0, :64], r1 @ q5
747cabdff1aSopenharmony_ci        vld1.8          {d22}, [r12,:64], r1 @ p1
748cabdff1aSopenharmony_ci        vld1.8          {d30}, [r0, :64], r1 @ q6
749cabdff1aSopenharmony_ci        vld1.8          {d23}, [r12,:64], r1 @ p0
750cabdff1aSopenharmony_ci        vld1.8          {d31}, [r0, :64], r1 @ q7
        @ Rewind both pointers by eight rows (r12 -> p7 row, r0 -> q0 row),
        @ then step r12 to the p6 row (r0 - 7 * stride) for the write-back.
751cabdff1aSopenharmony_ci        sub             r12, r12, r1, lsl #3
752cabdff1aSopenharmony_ci        sub             r0,  r0,  r1, lsl #3
753cabdff1aSopenharmony_ci        add             r12, r12, r1
754cabdff1aSopenharmony_ci
755cabdff1aSopenharmony_ci        loop_filter_16
756cabdff1aSopenharmony_ci
757cabdff1aSopenharmony_ci        @ If we did the flat8out part, we get the output in
758cabdff1aSopenharmony_ci        @ d2-d17 (skipping d7 and d16). r12 points to r0 - 7 * stride,
759cabdff1aSopenharmony_ci        @ store d2-d9 there, and d10-d17 into r0.
760cabdff1aSopenharmony_ci        vst1.8          {d2},  [r12,:64], r1
761cabdff1aSopenharmony_ci        vst1.8          {d10}, [r0, :64], r1
762cabdff1aSopenharmony_ci        vst1.8          {d3},  [r12,:64], r1
763cabdff1aSopenharmony_ci        vst1.8          {d11}, [r0, :64], r1
764cabdff1aSopenharmony_ci        vst1.8          {d4},  [r12,:64], r1
765cabdff1aSopenharmony_ci        vst1.8          {d12}, [r0, :64], r1
766cabdff1aSopenharmony_ci        vst1.8          {d5},  [r12,:64], r1
767cabdff1aSopenharmony_ci        vst1.8          {d13}, [r0, :64], r1
768cabdff1aSopenharmony_ci        vst1.8          {d6},  [r12,:64], r1
769cabdff1aSopenharmony_ci        vst1.8          {d14}, [r0, :64], r1
770cabdff1aSopenharmony_ci        vst1.8          {d8},  [r12,:64], r1
771cabdff1aSopenharmony_ci        vst1.8          {d15}, [r0, :64], r1
772cabdff1aSopenharmony_ci        vst1.8          {d9},  [r12,:64], r1
773cabdff1aSopenharmony_ci        vst1.8          {d17}, [r0, :64], r1
        @ Seven stores advanced r0 by 7 * stride; -8 * stride + stride undoes
        @ that and leaves r0 at its entry value.
774cabdff1aSopenharmony_ci        sub             r0,  r0,  r1, lsl #3
775cabdff1aSopenharmony_ci        add             r0,  r0,  r1
776cabdff1aSopenharmony_ci
        @ Label 9 is not referenced by any instruction visible in this
        @ function; presumably the loop_filter macro branches here to return
        @ without storing -- TODO confirm against the macro body.
777cabdff1aSopenharmony_ci9:
778cabdff1aSopenharmony_ci        bx              lr
779cabdff1aSopenharmony_ci
        @ Label 8 is only reachable from outside the visible code (presumably
        @ from the macro). r12 moves from the p6 row to the p2 row
        @ (r0 - 3 * stride) and six rows p2..p0 / q0..q2 are stored.
780cabdff1aSopenharmony_ci8:
781cabdff1aSopenharmony_ci        add             r12, r12, r1, lsl #2
782cabdff1aSopenharmony_ci        @ If we didn't do the flat8out part, the output is left in the
783cabdff1aSopenharmony_ci        @ input registers.
784cabdff1aSopenharmony_ci        vst1.8          {d21}, [r12,:64], r1
785cabdff1aSopenharmony_ci        vst1.8          {d24}, [r0, :64], r1
786cabdff1aSopenharmony_ci        vst1.8          {d22}, [r12,:64], r1
787cabdff1aSopenharmony_ci        vst1.8          {d25}, [r0, :64], r1
788cabdff1aSopenharmony_ci        vst1.8          {d23}, [r12,:64], r1
789cabdff1aSopenharmony_ci        vst1.8          {d26}, [r0, :64], r1
        @ Undo the three post-increments (2 * stride + stride) on r0.
790cabdff1aSopenharmony_ci        sub             r0,  r0,  r1, lsl #1
791cabdff1aSopenharmony_ci        sub             r0,  r0,  r1
792cabdff1aSopenharmony_ci        bx              lr
        @ Label 7 is likewise only reachable from outside the visible code.
        @ Only p1, p0, q0 and q1 are stored; r12 is recomputed from r0 as
        @ the p1 row.
793cabdff1aSopenharmony_ci7:
794cabdff1aSopenharmony_ci        sub             r12, r0,  r1, lsl #1
795cabdff1aSopenharmony_ci        vst1.8          {d22}, [r12,:64], r1
796cabdff1aSopenharmony_ci        vst1.8          {d24}, [r0, :64], r1
797cabdff1aSopenharmony_ci        vst1.8          {d23}, [r12,:64], r1
798cabdff1aSopenharmony_ci        vst1.8          {d25}, [r0, :64], r1
        @ Undo the two post-increments on r0.
799cabdff1aSopenharmony_ci        sub             r0,  r0,  r1, lsl #1
800cabdff1aSopenharmony_ci        bx              lr
801cabdff1aSopenharmony_ciendfunc
802cabdff1aSopenharmony_ci
@ Exported 8-pixel-wide entry point for the wd=16 vertical filter: saves the
@ return address and the callee-saved q4-q7, then runs
@ vp9_loop_filter_v_16_neon once. Follows the common signature given above
@ (r0 = dst, r1 = stride; the fifth argument arrives on the stack).
803cabdff1aSopenharmony_cifunction ff_vp9_loop_filter_v_16_8_neon, export=1
        @ Fetch the first stack argument (hev_thr per the common signature)
        @ before the pushes below move sp.
804cabdff1aSopenharmony_ci        ldr             r12, [sp]
805cabdff1aSopenharmony_ci        push            {lr}
806cabdff1aSopenharmony_ci        vpush           {q4-q7}
        @ Re-push it so the worker finds it at [sp] again.
807cabdff1aSopenharmony_ci        push            {r12}
808cabdff1aSopenharmony_ci        bl              vp9_loop_filter_v_16_neon
        @ Drop the re-pushed word, restore q4-q7, and return by popping the
        @ saved lr straight into pc.
809cabdff1aSopenharmony_ci        add             sp,  sp,  #4
810cabdff1aSopenharmony_ci        vpop            {q4-q7}
811cabdff1aSopenharmony_ci        pop             {pc}
812cabdff1aSopenharmony_ciendfunc
813cabdff1aSopenharmony_ci
@ Exported 16-pixel-wide entry point for the wd=16 vertical filter: runs
@ vp9_loop_filter_v_16_neon twice, on dst and on dst + 8. Follows the common
@ signature given above (r0 = dst, r1 = stride, r2/r3 = mb_lim/lim; the
@ fifth argument arrives on the stack).
814cabdff1aSopenharmony_cifunction ff_vp9_loop_filter_v_16_16_neon, export=1
        @ Fetch the first stack argument (hev_thr per the common signature)
        @ before the pushes below move sp.
815cabdff1aSopenharmony_ci        ldr             r12, [sp]
816cabdff1aSopenharmony_ci        // The filter clobbers r2 and r3, but we need to keep them for the second round
817cabdff1aSopenharmony_ci        push            {r2, r3, lr}
818cabdff1aSopenharmony_ci        vpush           {q4-q7}
        @ Re-push it so the worker finds it at [sp] again.
819cabdff1aSopenharmony_ci        push            {r12}
        @ Stack layout from here on: [sp] = re-pushed word, [sp+4..sp+67] =
        @ q4-q7 (64 bytes), [sp+68] = r2, [sp+72] = r3, [sp+76] = lr.
820cabdff1aSopenharmony_ci        bl              vp9_loop_filter_v_16_neon
        @ Every storing exit of the worker rewinds r0 to its entry value, so
        @ +8 selects the right-hand 8 pixels.
821cabdff1aSopenharmony_ci        add             r0,  #8
        @ Reload the saved r2/r3 for the second pass.
822cabdff1aSopenharmony_ci        ldr             r2,  [sp, #68]
823cabdff1aSopenharmony_ci        ldr             r3,  [sp, #72]
824cabdff1aSopenharmony_ci        bl              vp9_loop_filter_v_16_neon
        @ Drop the re-pushed word, restore q4-q7, and return via pc.
825cabdff1aSopenharmony_ci        add             sp,  sp,  #4
826cabdff1aSopenharmony_ci        vpop            {q4-q7}
827cabdff1aSopenharmony_ci        pop             {r2, r3, pc}
828cabdff1aSopenharmony_ciendfunc
829cabdff1aSopenharmony_ci
@ Internal (non-exported) worker behind ff_vp9_loop_filter_h_16_8_neon and
@ ff_vp9_loop_filter_h_16_16_neon: filters 16 columns (8 on either side of
@ r0) for 8 consecutive rows using the loop_filter_16 expansion.
@ In:  r0 = dst, r1 = stride. The callers also push one extra word just
@      before the bl, so presumably the macro reads hev_thr from [sp] --
@      TODO confirm against the macro body.
@ Out: every exit path visible here that stores rewinds r0 to its entry
@      value, which the 16-row caller relies on when it advances r0 by
@      8 * stride for the second half.
@ Uses d8-d15 via loop_filter_16; the callers save and restore q4-q7.
830cabdff1aSopenharmony_cifunction vp9_loop_filter_h_16_neon
        @ r12 = dst - 8 reads the left 8 bytes of each row into d16-d23;
        @ r0 reads the right 8 bytes into d24-d31. The :64 qualifier asserts
        @ 8-byte alignment of both addresses on every row.
831cabdff1aSopenharmony_ci        sub             r12, r0,  #8
832cabdff1aSopenharmony_ci        vld1.8          {d16}, [r12,:64], r1
833cabdff1aSopenharmony_ci        vld1.8          {d24}, [r0, :64], r1
834cabdff1aSopenharmony_ci        vld1.8          {d17}, [r12,:64], r1
835cabdff1aSopenharmony_ci        vld1.8          {d25}, [r0, :64], r1
836cabdff1aSopenharmony_ci        vld1.8          {d18}, [r12,:64], r1
837cabdff1aSopenharmony_ci        vld1.8          {d26}, [r0, :64], r1
838cabdff1aSopenharmony_ci        vld1.8          {d19}, [r12,:64], r1
839cabdff1aSopenharmony_ci        vld1.8          {d27}, [r0, :64], r1
840cabdff1aSopenharmony_ci        vld1.8          {d20}, [r12,:64], r1
841cabdff1aSopenharmony_ci        vld1.8          {d28}, [r0, :64], r1
842cabdff1aSopenharmony_ci        vld1.8          {d21}, [r12,:64], r1
843cabdff1aSopenharmony_ci        vld1.8          {d29}, [r0, :64], r1
844cabdff1aSopenharmony_ci        vld1.8          {d22}, [r12,:64], r1
845cabdff1aSopenharmony_ci        vld1.8          {d30}, [r0, :64], r1
846cabdff1aSopenharmony_ci        vld1.8          {d23}, [r12,:64], r1
847cabdff1aSopenharmony_ci        vld1.8          {d31}, [r0, :64], r1
        @ Rewind both pointers by the eight rows just read.
848cabdff1aSopenharmony_ci        sub             r0,  r0,  r1, lsl #3
849cabdff1aSopenharmony_ci        sub             r12, r12, r1, lsl #3
850cabdff1aSopenharmony_ci
851cabdff1aSopenharmony_ci        @ The 16x8 pixels read above is in two 8x8 blocks; the left
852cabdff1aSopenharmony_ci        @ half in d16-d23, and the right half in d24-d31. Do two 8x8 transposes
853cabdff1aSopenharmony_ci        @ of this, to get one column per register. This could be done with two
854cabdff1aSopenharmony_ci        @ transpose_8x8 as below, but this takes advantage of the q registers.
855cabdff1aSopenharmony_ci        transpose16_4x4 q8,  q9,  q10, q11, q12, q13, q14, q15
856cabdff1aSopenharmony_ci        vtrn.8          d16, d17
857cabdff1aSopenharmony_ci        vtrn.8          d18, d19
858cabdff1aSopenharmony_ci        vtrn.8          d20, d21
859cabdff1aSopenharmony_ci        vtrn.8          d22, d23
860cabdff1aSopenharmony_ci        vtrn.8          d24, d25
861cabdff1aSopenharmony_ci        vtrn.8          d26, d27
862cabdff1aSopenharmony_ci        vtrn.8          d28, d29
863cabdff1aSopenharmony_ci        vtrn.8          d30, d31
864cabdff1aSopenharmony_ci
865cabdff1aSopenharmony_ci        loop_filter_16
866cabdff1aSopenharmony_ci
867cabdff1aSopenharmony_ci        @ Transpose back; this is the same transpose as above, but
868cabdff1aSopenharmony_ci        @ we can't take advantage of q registers for the transpose, since
869cabdff1aSopenharmony_ci        @ all d registers in the transpose aren't consecutive.
        @ The two lists pair the input registers d16 and d31 with the output
        @ registers d2-d6, d8-d15 and d17.
870cabdff1aSopenharmony_ci        transpose_8x8   d16, d2,  d3,  d4,  d5,  d6,  d8,  d9
871cabdff1aSopenharmony_ci        transpose_8x8   d10, d11, d12, d13, d14, d15, d17, d31
872cabdff1aSopenharmony_ci
        @ Write all 8 rows back: left 8 bytes via r12, right 8 bytes via r0.
873cabdff1aSopenharmony_ci        vst1.8          {d16}, [r12,:64], r1
874cabdff1aSopenharmony_ci        vst1.8          {d10}, [r0, :64], r1
875cabdff1aSopenharmony_ci
876cabdff1aSopenharmony_ci        vst1.8          {d2},  [r12,:64], r1
877cabdff1aSopenharmony_ci        vst1.8          {d11}, [r0, :64], r1
878cabdff1aSopenharmony_ci
879cabdff1aSopenharmony_ci        vst1.8          {d3},  [r12,:64], r1
880cabdff1aSopenharmony_ci        vst1.8          {d12}, [r0, :64], r1
881cabdff1aSopenharmony_ci
882cabdff1aSopenharmony_ci        vst1.8          {d4},  [r12,:64], r1
883cabdff1aSopenharmony_ci        vst1.8          {d13}, [r0, :64], r1
884cabdff1aSopenharmony_ci
885cabdff1aSopenharmony_ci        vst1.8          {d5},  [r12,:64], r1
886cabdff1aSopenharmony_ci        vst1.8          {d14}, [r0, :64], r1
887cabdff1aSopenharmony_ci
888cabdff1aSopenharmony_ci        vst1.8          {d6},  [r12,:64], r1
889cabdff1aSopenharmony_ci        vst1.8          {d15}, [r0, :64], r1
890cabdff1aSopenharmony_ci
891cabdff1aSopenharmony_ci        vst1.8          {d8},  [r12,:64], r1
892cabdff1aSopenharmony_ci        vst1.8          {d17}, [r0, :64], r1
893cabdff1aSopenharmony_ci
894cabdff1aSopenharmony_ci        vst1.8          {d9},  [r12,:64], r1
895cabdff1aSopenharmony_ci        vst1.8          {d31}, [r0, :64], r1
        @ Undo the eight post-increments so that r0 equals its entry value.
896cabdff1aSopenharmony_ci        sub             r0,  r0,  r1, lsl #3
        @ Label 9 is not referenced by any instruction visible in this
        @ function; presumably the loop_filter macro branches here to return
        @ without storing -- TODO confirm against the macro body.
897cabdff1aSopenharmony_ci9:
898cabdff1aSopenharmony_ci        bx              lr
        @ Label 8 is only reachable from outside the visible code (presumably
        @ from the macro). r12 = dst - 4 walks rows 0-3 and
        @ r0 = r12 + 4 * stride walks rows 4-7.
899cabdff1aSopenharmony_ci8:
900cabdff1aSopenharmony_ci        @ The same writeback as in loop_filter_h_8_8
901cabdff1aSopenharmony_ci        sub             r12, r0,  #4
902cabdff1aSopenharmony_ci        add             r0,  r12, r1, lsl #2
903cabdff1aSopenharmony_ci        transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
904cabdff1aSopenharmony_ci
905cabdff1aSopenharmony_ci        vst1.8          {d20}, [r12], r1
906cabdff1aSopenharmony_ci        vst1.8          {d24}, [r0],  r1
907cabdff1aSopenharmony_ci        vst1.8          {d21}, [r12], r1
908cabdff1aSopenharmony_ci        vst1.8          {d25}, [r0],  r1
909cabdff1aSopenharmony_ci        vst1.8          {d22}, [r12], r1
910cabdff1aSopenharmony_ci        vst1.8          {d26}, [r0],  r1
911cabdff1aSopenharmony_ci        vst1.8          {d23}, [r12], r1
912cabdff1aSopenharmony_ci        vst1.8          {d27}, [r0],  r1
        @ r0 is now entry - 4 + 8 * stride; restore the entry value.
913cabdff1aSopenharmony_ci        sub             r0,  r0,  r1, lsl #3
914cabdff1aSopenharmony_ci        add             r0,  r0,  #4
915cabdff1aSopenharmony_ci        bx              lr
        @ Label 7 is likewise only reachable from outside the visible code.
        @ r12 = dst - 2 walks rows 0-3, r0 = r12 + 4 * stride walks rows 4-7;
        @ each store writes 4 bytes.
916cabdff1aSopenharmony_ci7:
917cabdff1aSopenharmony_ci        @ The same writeback as in loop_filter_h_4_8
918cabdff1aSopenharmony_ci        sub             r12, r0,  #2
919cabdff1aSopenharmony_ci        add             r0,  r12, r1, lsl #2
920cabdff1aSopenharmony_ci        transpose_q_4x4 q11, q12, d22, d23, d24, d25
921cabdff1aSopenharmony_ci        vst1.32         {d22[0]}, [r12], r1
922cabdff1aSopenharmony_ci        vst1.32         {d22[1]}, [r0],  r1
923cabdff1aSopenharmony_ci        vst1.32         {d23[0]}, [r12], r1
924cabdff1aSopenharmony_ci        vst1.32         {d23[1]}, [r0],  r1
925cabdff1aSopenharmony_ci        vst1.32         {d24[0]}, [r12], r1
926cabdff1aSopenharmony_ci        vst1.32         {d24[1]}, [r0],  r1
927cabdff1aSopenharmony_ci        vst1.32         {d25[0]}, [r12], r1
928cabdff1aSopenharmony_ci        vst1.32         {d25[1]}, [r0],  r1
        @ r0 is now entry - 2 + 8 * stride; restore the entry value.
929cabdff1aSopenharmony_ci        sub             r0,  r0,  r1, lsl #3
930cabdff1aSopenharmony_ci        add             r0,  r0,  #2
931cabdff1aSopenharmony_ci        bx              lr
932cabdff1aSopenharmony_ciendfunc
933cabdff1aSopenharmony_ci
@ Exported 8-row entry point for the wd=16 horizontal filter: saves the
@ return address and the callee-saved q4-q7, then runs
@ vp9_loop_filter_h_16_neon once. Follows the common signature given above
@ (r0 = dst, r1 = stride; the fifth argument arrives on the stack).
934cabdff1aSopenharmony_cifunction ff_vp9_loop_filter_h_16_8_neon, export=1
        @ Fetch the first stack argument (hev_thr per the common signature)
        @ before the pushes below move sp.
935cabdff1aSopenharmony_ci        ldr             r12, [sp]
936cabdff1aSopenharmony_ci        push            {lr}
937cabdff1aSopenharmony_ci        vpush           {q4-q7}
        @ Re-push it so the worker finds it at [sp] again.
938cabdff1aSopenharmony_ci        push            {r12}
939cabdff1aSopenharmony_ci        bl              vp9_loop_filter_h_16_neon
        @ Drop the re-pushed word, restore q4-q7, and return by popping the
        @ saved lr straight into pc.
940cabdff1aSopenharmony_ci        add             sp,  sp,  #4
941cabdff1aSopenharmony_ci        vpop            {q4-q7}
942cabdff1aSopenharmony_ci        pop             {pc}
943cabdff1aSopenharmony_ciendfunc
944cabdff1aSopenharmony_ci
@ Exported 16-row entry point for the wd=16 horizontal filter: runs
@ vp9_loop_filter_h_16_neon twice, on dst and on dst + 8 * stride. Follows
@ the common signature given above (r0 = dst, r1 = stride,
@ r2/r3 = mb_lim/lim; the fifth argument arrives on the stack).
945cabdff1aSopenharmony_cifunction ff_vp9_loop_filter_h_16_16_neon, export=1
        @ Fetch the first stack argument (hev_thr per the common signature)
        @ before the pushes below move sp.
946cabdff1aSopenharmony_ci        ldr             r12, [sp]
947cabdff1aSopenharmony_ci        // The filter clobbers r2 and r3, but we need to keep them for the second round
948cabdff1aSopenharmony_ci        push            {r2, r3, lr}
949cabdff1aSopenharmony_ci        vpush           {q4-q7}
        @ Re-push it so the worker finds it at [sp] again.
950cabdff1aSopenharmony_ci        push            {r12}
        @ Stack layout from here on: [sp] = re-pushed word, [sp+4..sp+67] =
        @ q4-q7 (64 bytes), [sp+68] = r2, [sp+72] = r3, [sp+76] = lr.
951cabdff1aSopenharmony_ci        bl              vp9_loop_filter_h_16_neon
        @ Every storing exit of the worker rewinds r0 to its entry value, so
        @ + 8 * stride selects the lower 8 rows.
952cabdff1aSopenharmony_ci        add             r0,  r0,  r1, lsl #3
        @ Reload the saved r2/r3 for the second pass.
953cabdff1aSopenharmony_ci        ldr             r2,  [sp, #68]
954cabdff1aSopenharmony_ci        ldr             r3,  [sp, #72]
955cabdff1aSopenharmony_ci        bl              vp9_loop_filter_h_16_neon
        @ Drop the re-pushed word, restore q4-q7, and return via pc.
956cabdff1aSopenharmony_ci        add             sp,  sp,  #4
957cabdff1aSopenharmony_ci        vpop            {q4-q7}
958cabdff1aSopenharmony_ci        pop             {r2, r3, pc}
959cabdff1aSopenharmony_ciendfunc
960