/*
 * Copyright (c) 2016 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
20cabdff1aSopenharmony_ci
21cabdff1aSopenharmony_ci#include "libavutil/arm/asm.S"
22cabdff1aSopenharmony_ci
@ All public functions in this file have the following signature:
@ typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
@                            const uint8_t *ref, ptrdiff_t ref_stride,
@                            int h, int mx, int my);
@ The code below uses r0 = dst, r1 = dst_stride, r2 = ref and
@ r3 = ref_stride, and reads h (and mx in the horizontal filters)
@ from the stack.
27cabdff1aSopenharmony_ci
@ Copy a 64 pixel wide block from ref to dst, one row per iteration.
@ r0 = dst (stores carry a :128 alignment hint), r1 = dst_stride,
@ r2 = ref, r3 = ref_stride, [sp] = h.
@ The loop is bottom-tested, so h is expected to be at least 1.
function ff_vp9_copy64_neon, export=1
        @ Each row is moved as two 32 byte halves. The first half advances
        @ both pointers by 32 through writeback, so the strides applied
        @ after the second half are reduced by the same amount.
        sub             r3,  r3,  #32
        sub             r1,  r1,  #32
        ldr             r12, [sp]                   @ r12 = rows left
1:
        vld1.8          {q0,  q1},  [r2]!
        vst1.8          {q0,  q1},  [r0, :128]!
        vld1.8          {q2,  q3},  [r2], r3
        vst1.8          {q2,  q3},  [r0, :128], r1
        subs            r12, r12, #1
        bne             1b
        bx              lr
endfunc
41cabdff1aSopenharmony_ci
@ Average a 64 pixel wide block from ref into dst with rounding
@ (dst = (dst + ref + 1) >> 1 per byte, via vrhadd.u8), one row per iteration.
@ r0 = dst, r1 = dst_stride, r2 = ref, r3 = ref_stride, [sp] = h.
function ff_vp9_avg64_neon, export=1
        ldr             r12, [sp]                   @ h, fetched before sp moves
        push            {lr}
        @ r0 runs ahead reading the old dst contents, lr trails behind it
        @ as the write pointer for the averaged result.
        mov             lr,  r0
        @ The first 32 byte half of each row advances the pointers through
        @ writeback, so compensate in the strides.
        sub             r1,  r1,  #32
        sub             r3,  r3,  #32
1:
        @ Fetch the row: q8-q11 = ref, q0-q3 = current dst
        vld1.8          {q8,  q9},  [r2]!
        vld1.8          {q0,  q1},  [r0, :128]!
        vld1.8          {q10, q11}, [r2], r3
        vld1.8          {q2,  q3},  [r0, :128], r1
        @ Rounding average of the two
        vrhadd.u8       q0,  q0,  q8
        vrhadd.u8       q1,  q1,  q9
        vrhadd.u8       q2,  q2,  q10
        vrhadd.u8       q3,  q3,  q11
        subs            r12, r12, #1
        vst1.8          {q0,  q1},  [lr, :128]!
        vst1.8          {q2,  q3},  [lr, :128], r1
        bne             1b
        pop             {pc}
endfunc
63cabdff1aSopenharmony_ci
@ Copy a 32 pixel wide block from ref to dst, one row per iteration.
@ r0 = dst, r1 = dst_stride, r2 = ref, r3 = ref_stride, [sp] = h.
function ff_vp9_copy32_neon, export=1
        ldr             r12, [sp]                   @ r12 = rows left
1:
        vld1.8          {q0,  q1},  [r2], r3
        vst1.8          {q0,  q1},  [r0, :128], r1
        subs            r12, r12, #1
        bne             1b
        bx              lr
endfunc
73cabdff1aSopenharmony_ci
@ Average a 32 pixel wide block from ref into dst with rounding,
@ one row per iteration.
@ r0 = dst, r1 = dst_stride, r2 = ref, r3 = ref_stride, [sp] = h.
function ff_vp9_avg32_neon, export=1
        ldr             r12, [sp]                   @ r12 = rows left
1:
        vld1.8          {q2,  q3},  [r2], r3        @ ref row
        @ dst is read without writeback; the store below advances r0
        vld1.8          {q0,  q1},  [r0, :128]
        subs            r12, r12, #1
        vrhadd.u8       q1,  q1,  q3
        vrhadd.u8       q0,  q0,  q2
        vst1.8          {q0,  q1},  [r0, :128], r1
        bne             1b
        bx              lr
endfunc
86cabdff1aSopenharmony_ci
@ Copy a 16 pixel wide block from ref to dst, two rows per iteration.
@ r0 = dst, r1 = dst_stride, r2 = ref, r3 = ref_stride, [sp] = h.
@ h is consumed in steps of 2, so it is expected to be even and nonzero.
function ff_vp9_copy16_neon, export=1
        ldr             r12, [sp]                   @ h, fetched before sp moves
        push            {r4,lr}
        @ Separate pointers for the second row of each pair
        add             lr,  r2,  r3                @ lr = ref + ref_stride
        add             r4,  r0,  r1                @ r4 = dst + dst_stride
        @ Every pointer then steps two rows at a time
        lsl             r3,  r3,  #1
        lsl             r1,  r1,  #1
1:
        vld1.8          {q0},  [r2], r3
        vld1.8          {q1},  [lr], r3
        vst1.8          {q0},  [r0, :128], r1
        vst1.8          {q1},  [r4, :128], r1
        subs            r12, r12, #2
        bne             1b
        pop             {r4,pc}
endfunc
103cabdff1aSopenharmony_ci
@ Average a 16 pixel wide block from ref into dst with rounding,
@ two rows per iteration.
@ r0 = dst, r1 = dst_stride, r2 = ref, r3 = ref_stride, [sp] = h.
function ff_vp9_avg16_neon, export=1
        ldr             r12, [sp]                   @ h, fetched before sp moves
        push            {lr}
        mov             lr,  r0                     @ lr = write pointer, r0 = read pointer
1:
        @ q2/q3 = two ref rows, q0/q1 = the matching dst rows
        vld1.8          {q2},  [r2], r3
        vld1.8          {q0},  [r0, :128], r1
        vld1.8          {q3},  [r2], r3
        vld1.8          {q1},  [r0, :128], r1
        vrhadd.u8       q0,  q0,  q2
        vrhadd.u8       q1,  q1,  q3
        subs            r12, r12, #2
        vst1.8          {q0},  [lr, :128], r1
        vst1.8          {q1},  [lr, :128], r1
        bne             1b
        pop             {pc}
endfunc
121cabdff1aSopenharmony_ci
@ Copy an 8 pixel wide block from ref to dst, two rows per iteration.
@ r0 = dst, r1 = dst_stride, r2 = ref, r3 = ref_stride, [sp] = h.
function ff_vp9_copy8_neon, export=1
        ldr             r12, [sp]                   @ r12 = rows left
1:
        vld1.8          {d0},  [r2], r3
        vld1.8          {d1},  [r2], r3
        vst1.8          {d0},  [r0, :64], r1
        vst1.8          {d1},  [r0, :64], r1
        subs            r12, r12, #2
        bne             1b
        bx              lr
endfunc
133cabdff1aSopenharmony_ci
@ Average an 8 pixel wide block from ref into dst with rounding,
@ two rows per iteration.
@ r0 = dst, r1 = dst_stride, r2 = ref, r3 = ref_stride, [sp] = h.
@ No extra write pointer is used here: r0 is stepped forward for the
@ reads and rewound by one row before the stores.
function ff_vp9_avg8_neon, export=1
        ldr             r12, [sp]                   @ r12 = rows left
1:
        @ d2/d3 (= q1) hold two ref rows, d0/d1 (= q0) the two dst rows
        vld1.8          {d2},  [r2], r3
        vld1.8          {d0},  [r0, :64], r1
        vld1.8          {d3},  [r2], r3
        vld1.8          {d1},  [r0, :64]
        sub             r0,  r0,  r1                @ back to the first row of the pair
        vrhadd.u8       q0,  q0,  q1                @ both rows averaged at once
        subs            r12, r12, #2
        vst1.8          {d0},  [r0, :64], r1
        vst1.8          {d1},  [r0, :64], r1
        bne             1b
        bx              lr
endfunc
150cabdff1aSopenharmony_ci
@ Copy a 4 pixel wide block from ref to dst, four rows per iteration.
@ r0 = dst, r1 = dst_stride, r2 = ref, r3 = ref_stride, [sp] = h.
@ Each row is one 32 bit word; it is loaded replicated into all lanes
@ and lane 0 is written back.
function ff_vp9_copy4_neon, export=1
        ldr             r12, [sp]                   @ r12 = rows left
1:
        vld1.32         {d0[]},   [r2], r3
        vld1.32         {d1[]},   [r2], r3
        vst1.32         {d0[0]},  [r0, :32], r1
        vld1.32         {d2[]},   [r2], r3
        vst1.32         {d1[0]},  [r0, :32], r1
        vld1.32         {d3[]},   [r2], r3
        vst1.32         {d2[0]},  [r0, :32], r1
        vst1.32         {d3[0]},  [r0, :32], r1
        subs            r12, r12, #4
        bne             1b
        bx              lr
endfunc
166cabdff1aSopenharmony_ci
@ Average a 4 pixel wide block from ref into dst with rounding,
@ four rows per iteration.
@ r0 = dst, r1 = dst_stride, r2 = ref, r3 = ref_stride, [sp] = h.
function ff_vp9_avg4_neon, export=1
        ldr             r12, [sp]                   @ h, fetched before sp moves
        push            {lr}
        mov             lr,  r0                     @ lr = write pointer, r0 = read pointer
1:
        @ d4-d7 (= q2, q3) hold four ref rows, d0-d3 (= q0, q1) the four
        @ dst rows, each replicated across its register.
        vld1.32         {d4[]},   [r2], r3
        vld1.32         {d0[]},   [r0, :32], r1
        vld1.32         {d5[]},   [r2], r3
        vld1.32         {d1[]},   [r0, :32], r1
        vld1.32         {d6[]},   [r2], r3
        vld1.32         {d2[]},   [r0, :32], r1
        vld1.32         {d7[]},   [r2], r3
        vld1.32         {d3[]},   [r0, :32], r1
        vrhadd.u8       q0,  q0,  q2                @ rows 0 and 1
        vrhadd.u8       q1,  q1,  q3                @ rows 2 and 3
        subs            r12, r12, #4
        vst1.32         {d0[0]},  [lr, :32], r1
        vst1.32         {d1[0]},  [lr, :32], r1
        vst1.32         {d2[0]},  [lr, :32], r1
        vst1.32         {d3[0]},  [lr, :32], r1
        bne             1b
        pop             {pc}
endfunc
192cabdff1aSopenharmony_ci
@ Helper macros for vmul/vmla with a constant from either d0 or d1 depending on index
@ vmul_lane: dst = src * tap idx, where taps 0-3 are the 16 bit lanes
@ of d0 and taps 4-7 the lanes of d1.
.macro vmul_lane dst, src, idx
.if \idx >= 4
        vmul.s16        \dst, \src, d1[\idx - 4]
.else
        vmul.s16        \dst, \src, d0[\idx]
.endif
.endm
@ vmla_lane: dst += src * tap idx, where taps 0-3 are the 16 bit lanes
@ of d0 and taps 4-7 the lanes of d1.
.macro vmla_lane dst, src, idx
.if \idx >= 4
        vmla.s16        \dst, \src, d1[\idx - 4]
.else
        vmla.s16        \dst, \src, d0[\idx]
.endif
.endm
208cabdff1aSopenharmony_ci
@ Multiply-accumulate one filter tap for two rows at once.
@ The input window starting offset elements into src1:src2 (first row)
@ and src4:src5 (second row) is extracted (elements are 16 bit, hence the
@ byte shift of 2*offset) and accumulated, scaled by tap number offset,
@ into dst1 and dst3. For size >= 16 the next eight elements, taken from
@ src2:src3 and src5:src6, are accumulated into dst2 and dst4 as well.
@ For sizes other than 8 and >= 16 only the low halves are processed,
@ through the d register aliases dst1d and dst3d.
@ Clobbers q14 and q15, plus q5 and q6 for size >= 16.
.macro extmla dst1, dst2, dst3, dst4, dst1d, dst3d, src1, src2, src3, src4, src5, src6, offset, size
        vext.8          q14, \src1, \src2, #(2*\offset)
        vext.8          q15, \src4, \src5, #(2*\offset)
.if \size < 16
.if \size == 8
        vmla_lane       \dst1,  q14, \offset
        vmla_lane       \dst3,  q15, \offset
.else
        vmla_lane       \dst1d, d28, \offset
        vmla_lane       \dst3d, d30, \offset
.endif
.else
        vmla_lane       \dst1,  q14, \offset
        vext.8          q5,  \src2, \src3, #(2*\offset)
        vmla_lane       \dst3,  q15, \offset
        vext.8          q6,  \src5, \src6, #(2*\offset)
        vmla_lane       \dst2,  q5,  \offset
        vmla_lane       \dst4,  q6,  \offset
.endif
.endm
@ Same operands and window extraction as extmla, but the product is
@ formed in a temporary (q14/q15, and q5/q6 for size >= 16) and then
@ added to the destination with a saturating add instead of a plain
@ multiply-accumulate.
.macro extmulqadd dst1, dst2, dst3, dst4, dst1d, dst3d, src1, src2, src3, src4, src5, src6, offset, size
        vext.8          q14, \src1, \src2, #(2*\offset)
        vext.8          q15, \src4, \src5, #(2*\offset)
        @ Products
.if \size < 16
.if \size == 8
        vmul_lane       q14, q14, \offset
        vmul_lane       q15, q15, \offset
.else
        vmul_lane       d28, d28, \offset
        vmul_lane       d30, d30, \offset
.endif
.else
        vmul_lane       q14, q14, \offset
        vext.8          q5,  \src2, \src3, #(2*\offset)
        vmul_lane       q15, q15, \offset
        vext.8          q6,  \src5, \src6, #(2*\offset)
        vmul_lane       q5,  q5,  \offset
        vmul_lane       q6,  q6,  \offset
.endif
        @ Saturating accumulation
.if \size == 4
        vqadd.s16       \dst1d, \dst1d, d28
        vqadd.s16       \dst3d, \dst3d, d30
.else
        vqadd.s16       \dst1,  \dst1,  q14
        vqadd.s16       \dst3,  \dst3,  q15
.if \size >= 16
        vqadd.s16       \dst2,  \dst2,  q5
        vqadd.s16       \dst4,  \dst4,  q6
.endif
.endif
.endm
261cabdff1aSopenharmony_ci
262cabdff1aSopenharmony_ci
@ Horizontal 8-tap filter body, instantiated per block size.
@ Sizes 4 and 8 filter a single 4 or 8 pixel wide column; size 16
@ filters 16 pixels at a time and repeats horizontally until the width
@ in r5 is consumed.
@ On entry: r0 = dst, r1 = dst_stride, r2 = ref, r3 = ref_stride,
@ r4 = height, r5 = width, r12 = pointer to the eight 16 bit taps.
@ idx2 is the index of the largest filter coefficient (3 or 4) and
@ idx1 is the other one of them.
@ Two rows are produced per iteration, so the height is consumed in
@ steps of 2. The function is reached by a branch from the exported
@ wrappers and undoes their register saves (r4-r7, plus q4-q6 for
@ size >= 16) before returning.
.macro do_8tap_h type, size, idx1, idx2
function \type\()_8tap_\size\()h_\idx1\idx2
        @ The eight taps are applied to src[x-3] .. src[x+4]
        sub             r2,  r2,  #3
        @ r0/r2 walk the first row of each pair, r6/r7 the second one,
        @ and both strides are doubled accordingly.
        add             r6,  r0,  r1
        add             r7,  r2,  r3
        lsl             r1,  r1,  #1
        lsl             r3,  r3,  #1
.if \size >= 16
        @ Only this variant advances its pointers while walking along a
        @ row: dst moves by the width, src by the width plus the 8 extra
        @ bytes fetched by the initial 24 byte load. Sizes 4 and 8 load
        @ a single qword without writeback and need no correction.
        sub             r1,  r1,  r5
        sub             r3,  r3,  r5
        sub             r3,  r3,  #8
.endif
        @ q0 = filter taps, d0 = taps 0-3, d1 = taps 4-7
        vld1.16         {q0},  [r12,:128]
1:
        @ Start of a row pair: fetch the source bytes
.if \size >= 16
        mov             r12, r5                     @ r12 = pixels left in this row
        vld1.8          {d18, d19, d20}, [r2]!
        vld1.8          {d24, d25, d26}, [r7]!
.else
        vld1.8          {q9},  [r2]
        vld1.8          {q12}, [r7]
.endif
        @ Widen to 16 bit: first row in q8-q9(-q10), second in q11-q12(-q13).
        @ q8 must be written before q9, and q11 before q12, because the
        @ destination of the second widen overlaps the source bytes.
        vmovl.u8        q8,  d18
        vmovl.u8        q9,  d19
        vmovl.u8        q11, d24
        vmovl.u8        q12, d25
.if \size >= 16
        vmovl.u8        q10, d20
        vmovl.u8        q13, d26
.endif
2:

        @ Accumulate all taps except idx2 with plain multiply-accumulate,
        @ then add idx2 last with a separate saturating add. The positive
        @ filter coefficients for all indices except idx2 must add up to
        @ less than 127 for this not to overflow.
        vmul.s16        q1,  q8,  d0[0]
        vmul.s16        q3,  q11, d0[0]
.if \size >= 16
        vmul.s16        q2,  q9,  d0[0]
        vmul.s16        q4,  q12, d0[0]
.endif
        extmla          q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, q13, 1,     \size
        extmla          q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, q13, 2,     \size
        extmla          q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, q13, \idx1, \size
        extmla          q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, q13, 5,     \size
        extmla          q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, q13, 6,     \size
        extmla          q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, q13, 7,     \size
        extmulqadd      q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, q13, \idx2, \size

        @ Round, shift right by 7 and saturate to unsigned 8 bit.
        @ First row ends up in d2(-d3), second row in d6(-d7).
        vqrshrun.s16    d2,  q1,  #7
        vqrshrun.s16    d6,  q3,  #7
.if \size >= 16
        vqrshrun.s16    d3,  q2,  #7
        vqrshrun.s16    d7,  q4,  #7
.endif
        @ For the avg variant, blend with what is already in dst
.ifc \type,avg
.if \size >= 16
        vld1.8          {q14}, [r0,:128]
        vld1.8          {q15}, [r6,:128]
        vrhadd.u8       q1,  q1,  q14
        vrhadd.u8       q3,  q3,  q15
.else
.if \size == 8
        vld1.8          {d28}, [r0,:64]
        vld1.8          {d30}, [r6,:64]
.else
        @ We only need d28[0], but [] is faster on some cores
        vld1.32         {d28[]}, [r0,:32]
        vld1.32         {d30[]}, [r6,:32]
.endif
        vrhadd.u8       d2,  d2,  d28
        vrhadd.u8       d6,  d6,  d30
.endif
.endif
        @ Store, and for size >= 16 continue along the row
.if \size >= 16
        vst1.8          {q1}, [r0,:128]!
        vst1.8          {q3}, [r6,:128]!
        subs            r12, r12, #16
        beq             3f
        @ Slide the window by 16 pixels: the last widened vector becomes
        @ the first one, and 16 fresh bytes per row are fetched and widened.
        vmov            q8,  q10
        vld1.8          {q10}, [r2]!
        vmovl.u8        q9,  d20
        vmovl.u8        q10, d21
        vmov            q11, q13
        vld1.8          {q13}, [r7]!
        vmovl.u8        q12, d26
        vmovl.u8        q13, d27
        b               2b
.else
.if \size == 8
        vst1.8          {d2}, [r0,:64]
        vst1.8          {d6}, [r6,:64]
.else
        vst1.32         {d2[0]}, [r0,:32]
        vst1.32         {d6[0]}, [r6,:32]
.endif
.endif
3:
        @ Step to the next pair of rows
        add             r2,  r2,  r3
        add             r7,  r7,  r3
        add             r0,  r0,  r1
        add             r6,  r6,  r1
        subs            r4,  r4,  #2
        bne             1b
        @ Restore what the exported wrapper saved before branching here
.if \size >= 16
        vpop            {q4-q6}
.endif
        pop             {r4-r7}
        bx              lr
endfunc
.endm
394cabdff1aSopenharmony_ci
@ Emit the put and avg horizontal filter bodies for one block size and
@ one ordering of the two centre taps.
.macro do_8tap_h_pair size, idx1, idx2
do_8tap_h put, \size, \idx1, \idx2
do_8tap_h avg, \size, \idx1, \idx2
.endm

@ Emit all four horizontal filter bodies for one block size:
@ put/avg with tap 4 as the largest one (_34), then put/avg with
@ tap 3 as the largest one (_43).
.macro do_8tap_h_size size
do_8tap_h_pair \size, 3, 4
do_8tap_h_pair \size, 4, 3
.endm

@ Bodies exist for 4, 8 and 16 pixels; wider blocks reuse the 16 pixel one.
do_8tap_h_size 4
do_8tap_h_size 8
do_8tap_h_size 16
405cabdff1aSopenharmony_ci
@ Exported entry point for one horizontal filter type and block size.
@ It saves the callee-saved registers the filter body uses, loads h into
@ r4 and mx into r5 from the stack, points r12 at the selected filter in
@ ff_vp9_subpel_filters (256 bytes per filter type, 16 bytes per mx
@ value), replaces r5 with the block width and branches to the body.
@ mx >= 8 selects the _34 body, smaller values the _43 one. All widths
@ of 16 and above share the 16 pixel body.
@ The body restores the saved registers and returns to the caller.
.macro do_8tap_h_func type, filter, offset, size
function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1
        push            {r4-r7}
        @ Stack arguments sit above the 16 bytes just pushed, and above
        @ another 48 bytes when q4-q6 are saved as well.
.if \size < 16
        ldr             r4,  [sp, #16]
        ldr             r5,  [sp, #20]
.else
        vpush           {q4-q6}
        ldr             r4,  [sp, #(16 + 48)]
        ldr             r5,  [sp, #(16 + 48 + 4)]
.endif
        movrelx         r12, X(ff_vp9_subpel_filters), r6
        add             r12, r12, r5, lsl #4
        add             r12, r12, 256*\offset
        @ The flags set here survive the mov and are consumed by the bge
        cmp             r5,  #8
        mov             r5,  #\size
.if \size < 16
        bge             \type\()_8tap_\size\()h_34
        b               \type\()_8tap_\size\()h_43
.else
        bge             \type\()_8tap_16h_34
        b               \type\()_8tap_16h_43
.endif
endfunc
.endm
431cabdff1aSopenharmony_ci
@ Emit the exported put and avg entry points for one filter type.
@ offset is the index of that filter type within ff_vp9_subpel_filters.
.macro do_8tap_h_func_pair filter, offset, size
do_8tap_h_func put, \filter, \offset, \size
do_8tap_h_func avg, \filter, \offset, \size
.endm

@ Emit the exported horizontal entry points of every filter type
@ (regular, sharp, smooth) for one block size.
.macro do_8tap_h_filters size
do_8tap_h_func_pair regular, 1, \size
do_8tap_h_func_pair sharp,   2, \size
do_8tap_h_func_pair smooth,  0, \size
.endm

do_8tap_h_filters 64
do_8tap_h_filters 32
do_8tap_h_filters 16
do_8tap_h_filters 8
do_8tap_h_filters 4

@ Emit any pending literal pool here, after the horizontal functions
.ltorg
448cabdff1aSopenharmony_ci
449cabdff1aSopenharmony_ci@ Vertical filters
450cabdff1aSopenharmony_ci
@ Round, shift right by 7 and saturate qreg1 and qreg2 to 8 bit in dreg1
@ and dreg2, then store them as four rows of 4 pixels at r0 (stride r1):
@ dreg1[0], dreg2[0], dreg1[1], dreg2[1] in that order.
@ For type avg the four existing dst rows are first loaded into tmp1 and
@ tmp2 in the same layout and blended in with a rounding average.
@ r0 ends up four rows further down.
.macro do_store4 qreg1, dreg1, qreg2, dreg2, tmp1, tmp2, type
        vqrshrun.s16    \dreg1,  \qreg1, #7
        vqrshrun.s16    \dreg2,  \qreg2, #7
.ifc \type,avg
        vld1.32         {\tmp1[]},   [r0,:32], r1
        vld1.32         {\tmp2[]},   [r0,:32], r1
        vld1.32         {\tmp1[1]},  [r0,:32], r1
        vld1.32         {\tmp2[1]},  [r0,:32], r1
        @ Rewind r0 over the four rows just read
        sub             r0,  r0,  r1, lsl #2
        vrhadd.u8       \dreg1,  \dreg1,  \tmp1
        vrhadd.u8       \dreg2,  \dreg2,  \tmp2
.endif
        vst1.32         {\dreg1[0]}, [r0,:32], r1
        vst1.32         {\dreg2[0]}, [r0,:32], r1
        vst1.32         {\dreg1[1]}, [r0,:32], r1
        vst1.32         {\dreg2[1]}, [r0,:32], r1
.endm
469cabdff1aSopenharmony_ci
@ Round, shift right by 7 and saturate qreg1-qreg4 to 8 bit in
@ dreg1-dreg4, then store them as four rows of 8 pixels at r0 (stride r1).
@ For type avg the four existing dst rows are first loaded into
@ tmp1-tmp4 and blended in with a rounding average.
@ r0 ends up four rows further down.
.macro do_store qreg1, dreg1, qreg2, dreg2, qreg3, dreg3, qreg4, dreg4, tmp1, tmp2, tmp3, tmp4, type
        vqrshrun.s16    \dreg1,  \qreg1, #7
        vqrshrun.s16    \dreg2,  \qreg2, #7
        vqrshrun.s16    \dreg3,  \qreg3, #7
        vqrshrun.s16    \dreg4,  \qreg4, #7
.ifc \type,avg
        vld1.8          {\tmp1},  [r0,:64], r1
        vld1.8          {\tmp2},  [r0,:64], r1
        vld1.8          {\tmp3},  [r0,:64], r1
        vld1.8          {\tmp4},  [r0,:64], r1
        @ Rewind r0 over the four rows just read
        sub             r0,  r0,  r1, lsl #2
        vrhadd.u8       \dreg1,  \dreg1,  \tmp1
        vrhadd.u8       \dreg2,  \dreg2,  \tmp2
        vrhadd.u8       \dreg3,  \dreg3,  \tmp3
        vrhadd.u8       \dreg4,  \dreg4,  \tmp4
.endif
        vst1.8          {\dreg1}, [r0,:64], r1
        vst1.8          {\dreg2}, [r0,:64], r1
        vst1.8          {\dreg3}, [r0,:64], r1
        vst1.8          {\dreg4}, [r0,:64], r1
.endm
492cabdff1aSopenharmony_ci
@ Evaluate the 8-tap filter for two consecutive output rows at once:
@ dst1 is computed from src1-src8 and dst2 from src2-src9, with the taps
@ taken from d0/d1 through vmul_lane/vmla_lane (tap n applies to the
@ n-th input of each row).
@ Taps 1, 2, idx1, 5 and 6 are accumulated directly into dst1/dst2.
@ Taps 0 and 7 always have negative or zero coefficients, so they are
@ accumulated into tmp1/tmp2 together with the largest tap idx2, and the
@ two partial sums are joined with a saturating add at the end.
@ The instruction order is kept as is on purpose: callers pass an
@ already consumed source register as a temporary (tmp2 may be the same
@ register as src1), so src1 has to be read before tmp2 is written.
.macro convolve dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, idx1, idx2, tmp1, tmp2
        vmul_lane       \dst1, \src2, 1
        vmul_lane       \dst2, \src3, 1
        vmul_lane       \tmp1, \src1, 0
        vmul_lane       \tmp2, \src2, 0
        vmla_lane       \dst1, \src3, 2
        vmla_lane       \dst2, \src4, 2
        @ The smaller of the two centre taps goes into the main sum
.if \idx1 == 3
        vmla_lane       \dst1, \src4, 3
        vmla_lane       \dst2, \src5, 3
.else
        vmla_lane       \dst1, \src5, 4
        vmla_lane       \dst2, \src6, 4
.endif
        vmla_lane       \dst1, \src6, 5
        vmla_lane       \dst2, \src7, 5
        vmla_lane       \tmp1, \src8, 7
        vmla_lane       \tmp2, \src9, 7
        vmla_lane       \dst1, \src7, 6
        vmla_lane       \dst2, \src8, 6
        @ The largest tap joins the non-positive outer taps
.if \idx2 == 3
        vmla_lane       \tmp1, \src4, 3
        vmla_lane       \tmp2, \src5, 3
.else
        vmla_lane       \tmp1, \src5, 4
        vmla_lane       \tmp2, \src6, 4
.endif
        vqadd.s16       \dst1, \dst1, \tmp1
        vqadd.s16       \dst2, \dst2, \tmp2
.endm
528cabdff1aSopenharmony_ci
@ Load three rows of 8 pixels from r2 (advancing by r3 after each row)
@ and widen them to 16 bit in dst1-dst3. When dst4 is given, a fourth
@ row is loaded and widened as well. Uses d2-d4 (and d5) as staging
@ registers for the raw bytes.
.macro loadl dst1, dst2, dst3, dst4
.ifb \dst4
        vld1.8          {d2}, [r2], r3
        vld1.8          {d3}, [r2], r3
        vld1.8          {d4}, [r2], r3
        vmovl.u8        \dst1, d2
        vmovl.u8        \dst2, d3
        vmovl.u8        \dst3, d4
.else
        vld1.8          {d2}, [r2], r3
        vld1.8          {d3}, [r2], r3
        vld1.8          {d4}, [r2], r3
        vld1.8          {d5}, [r2], r3
        vmovl.u8        \dst1, d2
        vmovl.u8        \dst2, d3
        vmovl.u8        \dst3, d4
        vmovl.u8        \dst4, d5
.endif
.endm
544cabdff1aSopenharmony_ci
@ Instantiate a vertical filter function for filtering 8 pixels at a time.
@ The height is passed in r4, the width in r5 and the filter coefficients
@ in r12. idx2 is the index of the largest filter coefficient (3 or 4)
@ and idx1 is the other one of them.
@
@ r0/r1 are the destination pointer and stride, r2/r3 the source pointer
@ and stride. The function is only reached by a branch from the
@ ff_vp9_*_v_neon entry points, which have already pushed r4-r5 and q4-q7;
@ the epilogue here pops them again, so it must not be called directly.
@
@ The row counter is decremented by 4 per group of output rows and only
@ compared against zero, so the height has to be a multiple of 4.
.macro do_8tap_8v type, idx1, idx2
function \type\()_8tap_8v_\idx1\idx2
        @ Step back three rows: the 8 tap window starts 3 rows above the
        @ first output row.
        sub             r2,  r2,  r3, lsl #1
        sub             r2,  r2,  r3
        @ All eight 16 bit coefficients go into q0 (d0[0]-d1[3]); r12 is
        @ free for reuse afterwards.
        vld1.16         {q0},  [r12, :128]
1:
        @ Start of one 8 pixel wide column: r12 = rows left in this column.
        mov             r12, r4

        @ Prime the window with the 7 rows preceding the first group.
        loadl           q5,  q6,  q7
        loadl           q8,  q9,  q10, q11
2:
        @ Each group loads 4 new rows and produces 4 output rows. The three
        @ groups below rotate which of q4-q15 hold the live rows; after the
        @ third group the 7 most recent rows sit in q5-q11 again, which is
        @ the layout expected here.
        loadl           q12, q13, q14, q15
        convolve        q1,  q2,  q5,  q6,  q7,  q8,  q9,  q10, q11, q12, q13, \idx1, \idx2, q4,  q5
        convolve        q3,  q4,  q7,  q8,  q9,  q10, q11, q12, q13, q14, q15, \idx1, \idx2, q5,  q6
        @ do_store is defined outside this section; presumably it writes the
        @ four result rows according to type (put or avg) - confirm there.
        do_store        q1,  d2,  q2,  d4,  q3,  d6,  q4,  d8,  d3,  d5,  d7,  d9,  \type

        subs            r12, r12, #4
        beq             8f

        loadl           q4,  q5,  q6,  q7
        convolve        q1,  q2,  q9,  q10, q11, q12, q13, q14, q15, q4,  q5,  \idx1, \idx2, q8,  q9
        convolve        q3,  q8,  q11, q12, q13, q14, q15, q4,  q5,  q6,  q7,  \idx1, \idx2, q9,  q10
        do_store        q1,  d2,  q2,  d4,  q3,  d6,  q8,  d16, d3,  d5,  d7,  d17, \type

        subs            r12, r12, #4
        beq             8f

        loadl           q8,  q9,  q10, q11
        convolve        q1,  q2,  q13, q14, q15, q4,  q5,  q6,  q7,  q8,  q9,  \idx1, \idx2, q12, q13
        convolve        q3,  q12, q15, q4,  q5,  q6,  q7,  q8,  q9,  q10, q11, \idx1, \idx2, q13, q14
        do_store        q1,  d2,  q2,  d4,  q3,  d6,  q12, d24, d3,  d5,  d7,  d25, \type

        subs            r12, r12, #4
        bne             2b

8:
        @ Column finished; r5 = pixels of width still to do.
        subs            r5,  r5,  #8
        beq             9f
        @ r0 -= h * dst_stride
        mls             r0,  r1,  r4, r0
        @ The loads consumed h + 7 source rows (3 + 4 up front, then 4 per
        @ group), so rewind by h rows and then by 8 - 1 = 7 more.
        @ r2 -= h * src_stride
        mls             r2,  r3,  r4, r2
        @ r2 -= 8 * src_stride
        sub             r2,  r2,  r3, lsl #3
        @ r2 += 1 * src_stride
        add             r2,  r2,  r3
        @ Move both pointers 8 pixels to the right for the next column.
        add             r2,  r2,  #8
        add             r0,  r0,  #8
        b               1b
9:
        @ Undo the pushes done by the entry point that branched here.
        vpop            {q4-q7}
        pop             {r4-r5}
        bx              lr
endfunc
.endm
604cabdff1aSopenharmony_ci
@ Emit the 8 pixel wide vertical workers: put and avg, each in a variant
@ for the larger centre coefficient being at index 4 (34) or index 3 (43).
do_8tap_8v put, 3, 4
do_8tap_8v put, 4, 3
do_8tap_8v avg, 3, 4
do_8tap_8v avg, 4, 3
609cabdff1aSopenharmony_ci
@ Instantiate a vertical filter function for filtering a 4 pixels wide
@ slice. The first half of the registers contain one row, while the second
@ half of a register contains the second-next row (also stored in the first
@ half of the register two steps ahead). The convolution does two outputs
@ at a time; the output of q5-q12 into one, and q6-q13 into another one.
@ The first half of first output is the first output row, the first half
@ of the other output is the second output row. The second halves of the
@ registers are rows 3 and 4.
@ This is only designed to work for 4 or 8 output lines.
@
@ The height is passed in r4 and the filter coefficients in r12; r0/r1 and
@ r2/r3 are used as in the 8 pixel version. The function is only reached
@ by a branch from the ff_vp9_*_v_neon entry points, which have already
@ pushed r4-r5 and q4-q7; the epilogue here pops them again.
.macro do_8tap_4v type, idx1, idx2
function \type\()_8tap_4v_\idx1\idx2
        @ Step back three rows so the 8 tap window starts above the first
        @ output row.
        sub             r2,  r2,  r3, lsl #1
        sub             r2,  r2,  r3
        vld1.16         {q0},  [r12, :128]

        @ Load 11 source rows (4 outputs + 7 extra taps). Each load puts the
        @ same 4 pixels in both halves of the d register; vext with #4 then
        @ keeps row n in the low half and pulls row n+2 into the high half.
        @ Loads, vext and widening are interleaved.
        vld1.32         {d2[]},   [r2], r3
        vld1.32         {d3[]},   [r2], r3
        vld1.32         {d4[]},   [r2], r3
        vld1.32         {d5[]},   [r2], r3
        vld1.32         {d6[]},   [r2], r3
        vld1.32         {d7[]},   [r2], r3
        vext.8          d2,  d2,  d4,  #4
        vld1.32         {d8[]},   [r2], r3
        vext.8          d3,  d3,  d5,  #4
        vld1.32         {d9[]},   [r2], r3
        vmovl.u8        q5,  d2
        vext.8          d4,  d4,  d6,  #4
        vld1.32         {d28[]},  [r2], r3
        vmovl.u8        q6,  d3
        vext.8          d5,  d5,  d7,  #4
        vld1.32         {d29[]},  [r2], r3
        vmovl.u8        q7,  d4
        vext.8          d6,  d6,  d8,  #4
        vld1.32         {d30[]},  [r2], r3
        vmovl.u8        q8,  d5
        vext.8          d7,  d7,  d9,  #4
        vmovl.u8        q9,  d6
        vext.8          d8,  d8,  d28, #4
        vmovl.u8        q10, d7
        vext.8          d9,  d9,  d29, #4
        vmovl.u8        q11, d8
        vext.8          d28, d28, d30, #4
        vmovl.u8        q12, d9
        vmovl.u8        q13, d28

        @ q5-q13 now hold row pairs 0|2, 1|3, ... 8|10. q4 and q3 are free
        @ to serve as temporaries: their contents (d8/d9, d6/d7) have
        @ already been widened into q11/q12 and q9/q10.
        convolve        q1,  q2,  q5,  q6,  q7,  q8,  q9,  q10, q11, q12, q13, \idx1, \idx2, q4, q3
        do_store4       q1,  d2,  q2,  d4,  d3,  d5,  \type
        subs            r4,  r4,  #4
        beq             9f

        @ Second group of 4 output rows: four more source rows are needed.
        @ d29/d30 still hold rows 9 and 10 replicated, so pairing them with
        @ the first two new rows gives 9|11 and 10|12; the last two rows are
        @ loaded straight into the upper lane to form 11|13 and 12|14.
        vld1.32         {d2[]},   [r2], r3
        vld1.32         {d3[]},   [r2], r3
        vext.8          d29, d29, d2,  #4
        vext.8          d30, d30, d3,  #4
        vld1.32         {d2[1]},  [r2], r3
        vmovl.u8        q14, d29
        vld1.32         {d3[1]},  [r2], r3
        vmovl.u8        q15, d30
        vmovl.u8        q5,  d2
        vmovl.u8        q6,  d3

        convolve        q1,  q2,  q9,  q10, q11, q12, q13, q14, q15, q5,  q6,  \idx1, \idx2, q4, q3
        do_store4       q1,  d2,  q2,  d4,  d3,  d5,  \type

9:
        @ Undo the pushes done by the entry point that branched here.
        vpop            {q4-q7}
        pop             {r4-r5}
        bx              lr
endfunc
.endm
680cabdff1aSopenharmony_ci
@ Emit the 4 pixel wide vertical workers: put and avg, each in a variant
@ for the larger centre coefficient being at index 4 (34) or index 3 (43).
do_8tap_4v put, 3, 4
do_8tap_4v put, 4, 3
do_8tap_4v avg, 3, 4
do_8tap_4v avg, 4, 3
685cabdff1aSopenharmony_ci
@ Instantiate one exported vertical filter entry point,
@ ff_vp9_<type>_<filter><size>_v_neon, with the vp9_mc_func signature
@ described at the top of the file.
@   type   - put or avg, selects which worker family is used
@   filter - name used in the exported symbol
@   offset - index of the 256 byte coefficient block to use inside
@            ff_vp9_subpel_filters
@   size   - block width in pixels; 8 and up use the 8 pixel worker,
@            smaller sizes the 4 pixel worker
@ The entry point saves r4-r5 and q4-q7 and then branches (without link)
@ to the worker, which restores them and returns to the original caller.
.macro do_8tap_v_func type, filter, offset, size
function ff_vp9_\type\()_\filter\()\size\()_v_neon, export=1
        push            {r4-r5}
        vpush           {q4-q7}
        @ 8 bytes (r4-r5) + 64 bytes (q4-q7) are now on the stack, so the
        @ stacked arguments h, mx, my sit at sp + 72, + 76 and + 80.
        ldr             r4,  [sp, #72]
        @ r5 is handed to movrelx as its third operand before being
        @ overwritten below; presumably a scratch register - see asm.S.
        movrelx         r12, X(ff_vp9_subpel_filters), r5
        ldr             r5,  [sp, #80]
        @ r12 = table + 256 * offset + 16 * my: one row of eight 16 bit
        @ coefficients, as loaded into q0 by the workers.
        add             r12, r12, 256*\offset
        add             r12, r12, r5, lsl #4
        @ Pick the worker variant from my; the flags set here survive the
        @ mov below, which does not update them.
        cmp             r5,  #8
        mov             r5,  #\size
.if \size >= 8
        bge             \type\()_8tap_8v_34
        b               \type\()_8tap_8v_43
.else
        bge             \type\()_8tap_4v_34
        b               \type\()_8tap_4v_43
.endif
endfunc
.endm
706cabdff1aSopenharmony_ci
@ Instantiate the put and avg vertical entry points of every filter type
@ for one block width. The third argument is the coefficient block index
@ handed to do_8tap_v_func: smooth is 0, regular is 1 and sharp is 2.
.macro do_8tap_v_filters size
do_8tap_v_func put, regular, 1, \size
do_8tap_v_func avg, regular, 1, \size
do_8tap_v_func put, sharp,   2, \size
do_8tap_v_func avg, sharp,   2, \size
do_8tap_v_func put, smooth,  0, \size
do_8tap_v_func avg, smooth,  0, \size
.endm
715cabdff1aSopenharmony_ci
@ Emit the vertical entry points for every supported block width.
do_8tap_v_filters 64
do_8tap_v_filters 32
do_8tap_v_filters 16
do_8tap_v_filters 8
do_8tap_v_filters 4
721