1cabdff1aSopenharmony_ci/*
2cabdff1aSopenharmony_ci * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3cabdff1aSopenharmony_ci *
4cabdff1aSopenharmony_ci * This file is part of FFmpeg.
5cabdff1aSopenharmony_ci *
6cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or
7cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public
8cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either
9cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version.
10cabdff1aSopenharmony_ci *
11cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful,
12cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of
13cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14cabdff1aSopenharmony_ci * Lesser General Public License for more details.
15cabdff1aSopenharmony_ci *
16cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public
17cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software
18cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19cabdff1aSopenharmony_ci */
20cabdff1aSopenharmony_ci
21cabdff1aSopenharmony_ci#include "libavutil/arm/asm.S"
22cabdff1aSopenharmony_ci#include "neon.S"
23cabdff1aSopenharmony_ci
24cabdff1aSopenharmony_ci        /* H.264 loop filter */
25cabdff1aSopenharmony_ci
@ h264_loop_filter_start
@ Common prologue of the loop filter entry points.
@ In:   r2   = alpha, r3 = beta
@       [sp] = pointer to the four tc0 bytes (read as one 32-bit word)
@ Out:  d24[0] = the four tc0 bytes
@ Clobbers: r12, flags
@ Returns straight to the caller (bx lr) when alpha == 0, when beta == 0, or
@ when all four tc0 bytes are negative. The macro reads [sp] and returns
@ through lr, so it has to be expanded before the function pushes anything
@ or modifies lr.
26cabdff1aSopenharmony_ci.macro  h264_loop_filter_start
27cabdff1aSopenharmony_ci        ldr             r12, [sp]               @ r12 = tc0 pointer
28cabdff1aSopenharmony_ci        tst             r2,  r2                 @ Z = (alpha == 0)
29cabdff1aSopenharmony_ci        ldr             r12, [r12]              @ r12 = the four tc0 bytes
30cabdff1aSopenharmony_ci        it              ne
31cabdff1aSopenharmony_ci        tstne           r3,  r3                 @ Z = (alpha == 0 or beta == 0)
32cabdff1aSopenharmony_ci        vmov.32         d24[0], r12
33cabdff1aSopenharmony_ci        and             r12, r12, r12, lsl #16  @ no S suffix: flags are kept
34cabdff1aSopenharmony_ci        it              eq
35cabdff1aSopenharmony_ci        bxeq            lr
36cabdff1aSopenharmony_ci        ands            r12, r12, r12, lsl #8   @ bit 31 = AND of the four sign bits
        @ ANDS and TST update N, Z and C only. V still holds whatever the
        @ caller left in it, so the test must look at N alone (MI) instead
        @ of LT, which is N != V.
37cabdff1aSopenharmony_ci        it              mi
38cabdff1aSopenharmony_ci        bxmi            lr
39cabdff1aSopenharmony_ci.endm
40cabdff1aSopenharmony_ci
@ h264_loop_filter_luma
@ Filters 16 pixels across one luma edge.
@ In:   q10 = p2, q9 = p1, q8 = p0, q0 = q0, q1 = q1, q2 = q2 (16 bytes each)
@       r2 = alpha, r3 = beta
@       d24[0] = four tc0 bytes; each one is applied to 4 consecutive lanes
@ Out:  q4 = new p1, q8 = new p0, q0 = new q0, q5 = new q1
@ Clobbers: q2, q4-q7, q10-q12, q14, q15
@ q4-q7 are d8-d15, so the expanding function has to save those registers
@ (both users in this file do so with vpush/vpop).
@ Lanes that fail the alpha/beta tests or have tc0 < 0 are left unchanged.
@ The independent computations below are interleaved instruction by instruction.
41cabdff1aSopenharmony_ci.macro  h264_loop_filter_luma
42cabdff1aSopenharmony_ci        vdup.8          q11, r2         @ alpha
43cabdff1aSopenharmony_ci        vmovl.u8        q12, d24        @ tc0 bytes -> 16-bit lanes
44cabdff1aSopenharmony_ci        vabd.u8         q6,  q8,  q0    @ abs(p0 - q0)
45cabdff1aSopenharmony_ci        vmovl.u16       q12, d24        @ tc0 -> 32-bit lanes
46cabdff1aSopenharmony_ci        vabd.u8         q14, q9,  q8    @ abs(p1 - p0)
47cabdff1aSopenharmony_ci        vsli.16         q12, q12, #8    @ copy each tc0 byte into the byte above it
48cabdff1aSopenharmony_ci        vabd.u8         q15, q1,  q0    @ abs(q1 - q0)
49cabdff1aSopenharmony_ci        vsli.32         q12, q12, #16   @ q12 = every tc0 byte repeated 4 times
50cabdff1aSopenharmony_ci        vclt.u8         q6,  q6,  q11   @ < alpha
51cabdff1aSopenharmony_ci        vdup.8          q11, r3         @ beta
52cabdff1aSopenharmony_ci        vclt.s8         q7,  q12, #0    @ q7 = lanes with tc0 < 0
53cabdff1aSopenharmony_ci        vclt.u8         q14, q14, q11   @ < beta
54cabdff1aSopenharmony_ci        vclt.u8         q15, q15, q11   @ < beta
55cabdff1aSopenharmony_ci        vbic            q6,  q6,  q7    @ exclude lanes with tc0 < 0
56cabdff1aSopenharmony_ci        vabd.u8         q4,  q10, q8    @ abs(p2 - p0)
57cabdff1aSopenharmony_ci        vand            q6,  q6,  q14
58cabdff1aSopenharmony_ci        vabd.u8         q5,  q2,  q0    @ abs(q2 - q0)
59cabdff1aSopenharmony_ci        vclt.u8         q4,  q4,  q11   @ < beta
60cabdff1aSopenharmony_ci        vand            q6,  q6,  q15   @ q6 = mask of lanes to filter
61cabdff1aSopenharmony_ci        vclt.u8         q5,  q5,  q11   @ < beta
62cabdff1aSopenharmony_ci        vand            q4,  q4,  q6    @ q4 = mask for updating p1
63cabdff1aSopenharmony_ci        vand            q5,  q5,  q6    @ q5 = mask for updating q1
64cabdff1aSopenharmony_ci        vand            q12, q12, q6    @ tc0 in filtered lanes, 0 elsewhere
65cabdff1aSopenharmony_ci        vrhadd.u8       q14, q8,  q0    @ (p0 + q0 + 1) >> 1
66cabdff1aSopenharmony_ci        vsub.i8         q6,  q12, q4    @ mask lanes are 0 or -1: adds 1 where q4 is set
67cabdff1aSopenharmony_ci        vqadd.u8        q7,  q9,  q12   @ p1 + tc0 (saturated)
68cabdff1aSopenharmony_ci        vhadd.u8        q10, q10, q14   @ (p2 + q14) >> 1
69cabdff1aSopenharmony_ci        vsub.i8         q6,  q6,  q5    @ q6 = tc = tc0 + 1 per set side mask
70cabdff1aSopenharmony_ci        vhadd.u8        q14, q2,  q14   @ (q2 + q14) >> 1
71cabdff1aSopenharmony_ci        vmin.u8         q7,  q7,  q10
72cabdff1aSopenharmony_ci        vqsub.u8        q11, q9,  q12   @ p1 - tc0 (saturated)
73cabdff1aSopenharmony_ci        vqadd.u8        q2,  q1,  q12   @ q1 + tc0 (saturated); input q2 is dead here
74cabdff1aSopenharmony_ci        vmax.u8         q7,  q7,  q11   @ q7 = p1 candidate clamped to p1 +/- tc0
75cabdff1aSopenharmony_ci        vqsub.u8        q11, q1,  q12   @ q1 - tc0 (saturated)
76cabdff1aSopenharmony_ci        vmin.u8         q14, q2,  q14
77cabdff1aSopenharmony_ci        vmovl.u8        q2,  d0         @ q2/q10 = q0 widened to 16 bit
78cabdff1aSopenharmony_ci        vmax.u8         q14, q14, q11   @ q14 = q1 candidate clamped to q1 +/- tc0
79cabdff1aSopenharmony_ci        vmovl.u8        q10, d1
80cabdff1aSopenharmony_ci        vsubw.u8        q2,  q2,  d16   @ q0 - p0
81cabdff1aSopenharmony_ci        vsubw.u8        q10, q10, d17
82cabdff1aSopenharmony_ci        vshl.i16        q2,  q2,  #2    @ (q0 - p0) * 4
83cabdff1aSopenharmony_ci        vshl.i16        q10, q10, #2
84cabdff1aSopenharmony_ci        vaddw.u8        q2,  q2,  d18   @ + p1
85cabdff1aSopenharmony_ci        vaddw.u8        q10, q10, d19
86cabdff1aSopenharmony_ci        vsubw.u8        q2,  q2,  d2    @ - q1
87cabdff1aSopenharmony_ci        vsubw.u8        q10, q10, d3
88cabdff1aSopenharmony_ci        vrshrn.i16      d4,  q2,  #3    @ q2 = delta = (sum + 4) >> 3, narrowed to bytes
89cabdff1aSopenharmony_ci        vrshrn.i16      d5,  q10, #3
90cabdff1aSopenharmony_ci        vbsl            q4,  q7,  q9    @ q4 = new p1 where its mask is set, else p1
91cabdff1aSopenharmony_ci        vbsl            q5,  q14, q1    @ q5 = new q1 where its mask is set, else q1
92cabdff1aSopenharmony_ci        vneg.s8         q7,  q6         @ q7 = -tc
93cabdff1aSopenharmony_ci        vmovl.u8        q14, d16        @ q14/q6 = p0 widened to 16 bit
94cabdff1aSopenharmony_ci        vmin.s8         q2,  q2,  q6    @ delta = min(delta, tc)
95cabdff1aSopenharmony_ci        vmovl.u8        q6,  d17        @ tc in q6 is no longer needed
96cabdff1aSopenharmony_ci        vmax.s8         q2,  q2,  q7    @ delta = max(delta, -tc)
97cabdff1aSopenharmony_ci        vmovl.u8        q11, d0         @ q11/q12 = q0 widened to 16 bit
98cabdff1aSopenharmony_ci        vmovl.u8        q12, d1
99cabdff1aSopenharmony_ci        vaddw.s8        q14, q14, d4    @ p0 + delta
100cabdff1aSopenharmony_ci        vaddw.s8        q6,  q6,  d5
101cabdff1aSopenharmony_ci        vsubw.s8        q11, q11, d4    @ q0 - delta
102cabdff1aSopenharmony_ci        vsubw.s8        q12, q12, d5
103cabdff1aSopenharmony_ci        vqmovun.s16     d16, q14        @ q8 = new p0, saturated to 0..255
104cabdff1aSopenharmony_ci        vqmovun.s16     d17, q6
105cabdff1aSopenharmony_ci        vqmovun.s16     d0,  q11        @ q0 = new q0, saturated to 0..255
106cabdff1aSopenharmony_ci        vqmovun.s16     d1,  q12
107cabdff1aSopenharmony_ci.endm
108cabdff1aSopenharmony_ci
@ ff_h264_v_loop_filter_luma_neon
@ Luma loop filter across an edge that lies between two rows; 16 pixels wide.
@ In:   r0 = address of the q0 row (first row after the edge); every access
@            carries a 128-bit alignment hint
@       r1 = distance between rows in bytes
@       r2 = alpha, r3 = beta, [sp] = tc0 pointer (see h264_loop_filter_start,
@            which may return early without filtering)
@ Reads rows p2, p1, p0 at r0 - 3*r1 .. r0 - r1 and q0, q1, q2 at
@ r0 .. r0 + 2*r1. Writes p1, p0, q0, q1.
109cabdff1aSopenharmony_cifunction ff_h264_v_loop_filter_luma_neon, export=1
110cabdff1aSopenharmony_ci        h264_loop_filter_start
111cabdff1aSopenharmony_ci
112cabdff1aSopenharmony_ci        vld1.8          {d0, d1},  [r0,:128], r1        @ q0 row
113cabdff1aSopenharmony_ci        vld1.8          {d2, d3},  [r0,:128], r1        @ q1 row
114cabdff1aSopenharmony_ci        vld1.8          {d4, d5},  [r0,:128], r1        @ q2 row
115cabdff1aSopenharmony_ci        sub             r0,  r0,  r1, lsl #2            @ back 4 + 2 = 6 rows:
116cabdff1aSopenharmony_ci        sub             r0,  r0,  r1, lsl #1            @ r0 = p2 row
117cabdff1aSopenharmony_ci        vld1.8          {d20,d21}, [r0,:128], r1        @ p2 row
118cabdff1aSopenharmony_ci        vld1.8          {d18,d19}, [r0,:128], r1        @ p1 row
119cabdff1aSopenharmony_ci        vld1.8          {d16,d17}, [r0,:128], r1        @ p0 row; r0 is back at the q0 row
120cabdff1aSopenharmony_ci
        @ The filter macro uses q4-q7 (d8-d15), so they are saved here.
121cabdff1aSopenharmony_ci        vpush           {d8-d15}
122cabdff1aSopenharmony_ci
123cabdff1aSopenharmony_ci        h264_loop_filter_luma
124cabdff1aSopenharmony_ci
125cabdff1aSopenharmony_ci        sub             r0,  r0,  r1, lsl #1            @ r0 = p1 row
126cabdff1aSopenharmony_ci        vst1.8          {d8, d9},  [r0,:128], r1        @ new p1
127cabdff1aSopenharmony_ci        vst1.8          {d16,d17}, [r0,:128], r1        @ new p0
128cabdff1aSopenharmony_ci        vst1.8          {d0, d1},  [r0,:128], r1        @ new q0
129cabdff1aSopenharmony_ci        vst1.8          {d10,d11}, [r0,:128]            @ new q1
130cabdff1aSopenharmony_ci
131cabdff1aSopenharmony_ci        vpop            {d8-d15}
132cabdff1aSopenharmony_ci        bx              lr
133cabdff1aSopenharmony_ciendfunc
134cabdff1aSopenharmony_ci
@ ff_h264_h_loop_filter_luma_neon
@ Luma loop filter across an edge that lies between two columns; 16 rows tall.
@ In:   r0 = address of the first pixel right of the edge in the first row
@       r1 = distance between rows in bytes
@       r2 = alpha, r3 = beta, [sp] = tc0 pointer (see h264_loop_filter_start,
@            which may return early without filtering)
@ Reads 8 bytes (r0 - 4 .. r0 + 3) from each of 16 rows, transposes them so
@ that every q register holds one pixel column, runs the filter and writes
@ 4 bytes (r0 - 2 .. r0 + 1) back to each row.
@ transpose_8x8 and transpose_4x4 are defined outside this view (presumably
@ in neon.S); the register roles after them are inferred from the operand
@ order the filter macro expects - confirm against their definitions.
135cabdff1aSopenharmony_cifunction ff_h264_h_loop_filter_luma_neon, export=1
136cabdff1aSopenharmony_ci        h264_loop_filter_start
137cabdff1aSopenharmony_ci
138cabdff1aSopenharmony_ci        sub             r0,  r0,  #4
        @ Rows 0-7 go to the low halves of q3, q10, q9, q8, q0, q1, q2, q13.
139cabdff1aSopenharmony_ci        vld1.8          {d6},  [r0], r1
140cabdff1aSopenharmony_ci        vld1.8          {d20}, [r0], r1
141cabdff1aSopenharmony_ci        vld1.8          {d18}, [r0], r1
142cabdff1aSopenharmony_ci        vld1.8          {d16}, [r0], r1
143cabdff1aSopenharmony_ci        vld1.8          {d0},  [r0], r1
144cabdff1aSopenharmony_ci        vld1.8          {d2},  [r0], r1
145cabdff1aSopenharmony_ci        vld1.8          {d4},  [r0], r1
146cabdff1aSopenharmony_ci        vld1.8          {d26}, [r0], r1
        @ Rows 8-15 go to the high halves of the same registers.
147cabdff1aSopenharmony_ci        vld1.8          {d7},  [r0], r1
148cabdff1aSopenharmony_ci        vld1.8          {d21}, [r0], r1
149cabdff1aSopenharmony_ci        vld1.8          {d19}, [r0], r1
150cabdff1aSopenharmony_ci        vld1.8          {d17}, [r0], r1
151cabdff1aSopenharmony_ci        vld1.8          {d1},  [r0], r1
152cabdff1aSopenharmony_ci        vld1.8          {d3},  [r0], r1
153cabdff1aSopenharmony_ci        vld1.8          {d5},  [r0], r1
154cabdff1aSopenharmony_ci        vld1.8          {d27}, [r0], r1
155cabdff1aSopenharmony_ci
        @ Presumably leaves q3 = p3, q10 = p2, q9 = p1, q8 = p0, q0 = q0,
        @ q1 = q1, q2 = q2, q13 = q3 (one column each, 16 rows).
156cabdff1aSopenharmony_ci        transpose_8x8   q3, q10, q9, q8, q0, q1, q2, q13
157cabdff1aSopenharmony_ci
        @ The filter macro uses q4-q7 (d8-d15), so they are saved here.
158cabdff1aSopenharmony_ci        vpush           {d8-d15}
159cabdff1aSopenharmony_ci
160cabdff1aSopenharmony_ci        h264_loop_filter_luma
161cabdff1aSopenharmony_ci
        @ Filter outputs: q4 = p1, q8 = p0, q0 = q0, q5 = q1. Transposed back
        @ so that each 32-bit lane holds the 4 output bytes of one row.
162cabdff1aSopenharmony_ci        transpose_4x4   q4, q8, q0, q5
163cabdff1aSopenharmony_ci
164cabdff1aSopenharmony_ci        sub             r0,  r0,  r1, lsl #4    @ back 16 rows to the first row
165cabdff1aSopenharmony_ci        add             r0,  r0,  #2            @ r0 = p1 of the first row
166cabdff1aSopenharmony_ci        vst1.32         {d8[0]},  [r0], r1
167cabdff1aSopenharmony_ci        vst1.32         {d16[0]}, [r0], r1
168cabdff1aSopenharmony_ci        vst1.32         {d0[0]},  [r0], r1
169cabdff1aSopenharmony_ci        vst1.32         {d10[0]}, [r0], r1
170cabdff1aSopenharmony_ci        vst1.32         {d8[1]},  [r0], r1
171cabdff1aSopenharmony_ci        vst1.32         {d16[1]}, [r0], r1
172cabdff1aSopenharmony_ci        vst1.32         {d0[1]},  [r0], r1
173cabdff1aSopenharmony_ci        vst1.32         {d10[1]}, [r0], r1
174cabdff1aSopenharmony_ci        vst1.32         {d9[0]},  [r0], r1
175cabdff1aSopenharmony_ci        vst1.32         {d17[0]}, [r0], r1
176cabdff1aSopenharmony_ci        vst1.32         {d1[0]},  [r0], r1
177cabdff1aSopenharmony_ci        vst1.32         {d11[0]}, [r0], r1
178cabdff1aSopenharmony_ci        vst1.32         {d9[1]},  [r0], r1
179cabdff1aSopenharmony_ci        vst1.32         {d17[1]}, [r0], r1
180cabdff1aSopenharmony_ci        vst1.32         {d1[1]},  [r0], r1
181cabdff1aSopenharmony_ci        vst1.32         {d11[1]}, [r0], r1
182cabdff1aSopenharmony_ci
183cabdff1aSopenharmony_ci        vpop            {d8-d15}
184cabdff1aSopenharmony_ci        bx              lr
185cabdff1aSopenharmony_ciendfunc
186cabdff1aSopenharmony_ci
@ h264_loop_filter_chroma
@ Filters 8 pixels across one chroma edge; only p0 and q0 are modified.
@ In:   d18 = p1, d16 = p0, d0 = q0, d2 = q1 (8 bytes each)
@       r2 = alpha, r3 = beta
@       d24[0] = four tc bytes; each one is applied to 2 consecutive lanes
@ Out:  d16 = new p0, d0 = new q0
@ Clobbers: q2, q11, q12, d26, q14, d30
@ NOTE(review): unlike the luma macro there is no test for negative tc
@ lanes here; presumably callers only pass tc >= 0 - confirm.
@ The delta computation and the mask computation are interleaved.
187cabdff1aSopenharmony_ci.macro  h264_loop_filter_chroma
188cabdff1aSopenharmony_ci        vdup.8          d22, r2         @ alpha
189cabdff1aSopenharmony_ci        vmovl.u8        q12, d24        @ tc bytes -> 16-bit lanes
190cabdff1aSopenharmony_ci        vabd.u8         d26, d16, d0    @ abs(p0 - q0)
191cabdff1aSopenharmony_ci        vmovl.u8        q2,  d0         @ q2 = q0 widened to 16 bit
192cabdff1aSopenharmony_ci        vabd.u8         d28, d18, d16   @ abs(p1 - p0)
193cabdff1aSopenharmony_ci        vsubw.u8        q2,  q2,  d16   @ q0 - p0
194cabdff1aSopenharmony_ci        vsli.16         d24, d24, #8    @ d24 = every tc byte repeated twice
195cabdff1aSopenharmony_ci        vshl.i16        q2,  q2,  #2    @ (q0 - p0) * 4
196cabdff1aSopenharmony_ci        vabd.u8         d30, d2,  d0    @ abs(q1 - q0)
197cabdff1aSopenharmony_ci        vaddw.u8        q2,  q2,  d18   @ + p1
198cabdff1aSopenharmony_ci        vclt.u8         d26, d26, d22   @ < alpha
199cabdff1aSopenharmony_ci        vsubw.u8        q2,  q2,  d2    @ - q1
200cabdff1aSopenharmony_ci        vdup.8          d22, r3         @ beta
201cabdff1aSopenharmony_ci        vrshrn.i16      d4,  q2,  #3    @ d4 = delta = (sum + 4) >> 3, narrowed to bytes
202cabdff1aSopenharmony_ci        vclt.u8         d28, d28, d22   @ < beta
203cabdff1aSopenharmony_ci        vclt.u8         d30, d30, d22   @ < beta
204cabdff1aSopenharmony_ci        vmin.s8         d4,  d4,  d24   @ delta = min(delta, tc)
205cabdff1aSopenharmony_ci        vneg.s8         d25, d24        @ d25 = -tc
206cabdff1aSopenharmony_ci        vand            d26, d26, d28
207cabdff1aSopenharmony_ci        vmax.s8         d4,  d4,  d25   @ delta = max(delta, -tc)
208cabdff1aSopenharmony_ci        vand            d26, d26, d30   @ d26 = mask of lanes to filter
209cabdff1aSopenharmony_ci        vmovl.u8        q11, d0         @ q11 = q0 widened to 16 bit
210cabdff1aSopenharmony_ci        vand            d4,  d4,  d26   @ delta = 0 in unfiltered lanes
211cabdff1aSopenharmony_ci        vmovl.u8        q14, d16        @ q14 = p0 widened to 16 bit
212cabdff1aSopenharmony_ci        vaddw.s8        q14, q14, d4    @ p0 + delta
213cabdff1aSopenharmony_ci        vsubw.s8        q11, q11, d4    @ q0 - delta
214cabdff1aSopenharmony_ci        vqmovun.s16     d16, q14        @ new p0, saturated to 0..255
215cabdff1aSopenharmony_ci        vqmovun.s16     d0,  q11        @ new q0, saturated to 0..255
216cabdff1aSopenharmony_ci.endm
217cabdff1aSopenharmony_ci
@ ff_h264_v_loop_filter_chroma_neon
@ Chroma loop filter across an edge that lies between two rows; 8 pixels wide.
@ In:   r0 = address of the q0 row (first row after the edge); every access
@            carries a 64-bit alignment hint
@       r1 = distance between rows in bytes
@       r2 = alpha, r3 = beta, [sp] = tc pointer (see h264_loop_filter_start,
@            which may return early without filtering)
@ Reads rows p1, p0, q0, q1 and writes rows p0 and q0.
@ Uses no callee-saved NEON registers, so nothing is pushed.
218cabdff1aSopenharmony_cifunction ff_h264_v_loop_filter_chroma_neon, export=1
219cabdff1aSopenharmony_ci        h264_loop_filter_start
220cabdff1aSopenharmony_ci
221cabdff1aSopenharmony_ci        sub             r0,  r0,  r1, lsl #1    @ r0 = p1 row
222cabdff1aSopenharmony_ci        vld1.8          {d18}, [r0,:64], r1     @ p1
223cabdff1aSopenharmony_ci        vld1.8          {d16}, [r0,:64], r1     @ p0
224cabdff1aSopenharmony_ci        vld1.8          {d0},  [r0,:64], r1     @ q0
225cabdff1aSopenharmony_ci        vld1.8          {d2},  [r0,:64]         @ q1; r0 stays on the q1 row
226cabdff1aSopenharmony_ci
227cabdff1aSopenharmony_ci        h264_loop_filter_chroma
228cabdff1aSopenharmony_ci
229cabdff1aSopenharmony_ci        sub             r0,  r0,  r1, lsl #1    @ r0 = p0 row
230cabdff1aSopenharmony_ci        vst1.8          {d16}, [r0,:64], r1     @ new p0
231cabdff1aSopenharmony_ci        vst1.8          {d0},  [r0,:64], r1     @ new q0
232cabdff1aSopenharmony_ci
233cabdff1aSopenharmony_ci        bx              lr
234cabdff1aSopenharmony_ciendfunc
235cabdff1aSopenharmony_ci
@ ff_h264_h_loop_filter_chroma_neon
@ Chroma loop filter across an edge that lies between two columns; 8 rows tall.
@ In:   r0 = address of the first pixel right of the edge in the first row
@       r1 = distance between rows in bytes
@       r2 = alpha, r3 = beta, [sp] = tc pointer (see h264_loop_filter_start,
@            which may return early without filtering)
@ Reads and rewrites 4 bytes (r0 - 2 .. r0 + 1) of each of the 8 rows.
@
@ The label h_loop_filter_chroma420 is a second entry point that
@ ff_h264_h_loop_filter_chroma422_neon reaches with bl. At that entry r0 must
@ already point 2 bytes left of the edge, r1 is the row distance to use,
@ d24[0] holds the tc bytes and r2/r3 hold alpha/beta. It returns with bx lr.
236cabdff1aSopenharmony_cifunction ff_h264_h_loop_filter_chroma_neon, export=1
237cabdff1aSopenharmony_ci        h264_loop_filter_start
238cabdff1aSopenharmony_ci
239cabdff1aSopenharmony_ci        sub             r0,  r0,  #2
240cabdff1aSopenharmony_cih_loop_filter_chroma420:
        @ Rows 0-3 fill lane 0 and rows 4-7 fill lane 1 of d18, d16, d0, d2.
241cabdff1aSopenharmony_ci        vld1.32         {d18[0]}, [r0], r1
242cabdff1aSopenharmony_ci        vld1.32         {d16[0]}, [r0], r1
243cabdff1aSopenharmony_ci        vld1.32         {d0[0]},  [r0], r1
244cabdff1aSopenharmony_ci        vld1.32         {d2[0]},  [r0], r1
245cabdff1aSopenharmony_ci        vld1.32         {d18[1]}, [r0], r1
246cabdff1aSopenharmony_ci        vld1.32         {d16[1]}, [r0], r1
247cabdff1aSopenharmony_ci        vld1.32         {d0[1]},  [r0], r1
248cabdff1aSopenharmony_ci        vld1.32         {d2[1]},  [r0], r1
249cabdff1aSopenharmony_ci
        @ Transpose: afterwards d18 = p1, d16 = p0, d0 = q0, d2 = q1, each
        @ holding one pixel column with rows 0-7 in lane order.
250cabdff1aSopenharmony_ci        vtrn.16         d18, d0
251cabdff1aSopenharmony_ci        vtrn.16         d16, d2
252cabdff1aSopenharmony_ci        vtrn.8          d18, d16
253cabdff1aSopenharmony_ci        vtrn.8          d0,  d2
254cabdff1aSopenharmony_ci
255cabdff1aSopenharmony_ci        h264_loop_filter_chroma
256cabdff1aSopenharmony_ci
        @ The same four vtrn steps undo the transposition.
257cabdff1aSopenharmony_ci        vtrn.16         d18, d0
258cabdff1aSopenharmony_ci        vtrn.16         d16, d2
259cabdff1aSopenharmony_ci        vtrn.8          d18, d16
260cabdff1aSopenharmony_ci        vtrn.8          d0,  d2
261cabdff1aSopenharmony_ci
262cabdff1aSopenharmony_ci        sub             r0,  r0,  r1, lsl #3    @ back 8 rows to the first row
263cabdff1aSopenharmony_ci        vst1.32         {d18[0]}, [r0], r1
264cabdff1aSopenharmony_ci        vst1.32         {d16[0]}, [r0], r1
265cabdff1aSopenharmony_ci        vst1.32         {d0[0]},  [r0], r1
266cabdff1aSopenharmony_ci        vst1.32         {d2[0]},  [r0], r1
267cabdff1aSopenharmony_ci        vst1.32         {d18[1]}, [r0], r1
268cabdff1aSopenharmony_ci        vst1.32         {d16[1]}, [r0], r1
269cabdff1aSopenharmony_ci        vst1.32         {d0[1]},  [r0], r1
270cabdff1aSopenharmony_ci        vst1.32         {d2[1]},  [r0], r1
271cabdff1aSopenharmony_ci
272cabdff1aSopenharmony_ci        bx              lr
273cabdff1aSopenharmony_ciendfunc
274cabdff1aSopenharmony_ci
@ ff_h264_h_loop_filter_chroma422_neon
@ Chroma loop filter across an edge between two columns; 16 rows tall.
@ In:   r0 = address of the first pixel right of the edge in the first row
@       r1 = distance between rows in bytes
@       r2 = alpha, r3 = beta, [sp] = tc pointer (see h264_loop_filter_start,
@            which may return early without filtering)
@ Implemented as two passes through h_loop_filter_chroma420 with the row
@ distance doubled: the first pass handles rows 0, 2, ..., 14 and the second
@ rows 1, 3, ..., 15. Because that routine repeats every tc byte for two of
@ its rows, each tc byte ends up covering 4 consecutive rows here.
275cabdff1aSopenharmony_cifunction ff_h264_h_loop_filter_chroma422_neon, export=1
276cabdff1aSopenharmony_ci        h264_loop_filter_start
277cabdff1aSopenharmony_ci        push            {r4, lr}
278cabdff1aSopenharmony_ci        add             r4,  r0,  r1            @ r4 = second row, kept for pass 2
279cabdff1aSopenharmony_ci        add             r1,  r1,  r1            @ step over every other row
280cabdff1aSopenharmony_ci        sub             r0,  r0,  #2
281cabdff1aSopenharmony_ci
282cabdff1aSopenharmony_ci        bl              h_loop_filter_chroma420
283cabdff1aSopenharmony_ci
        @ The filter macro overwrote d24, so the tc bytes are loaded again.
        @ The stack argument now sits 8 bytes higher because of the push.
284cabdff1aSopenharmony_ci        ldr             r12, [sp, #8]
285cabdff1aSopenharmony_ci        ldr             r12, [r12]
286cabdff1aSopenharmony_ci        vmov.32         d24[0], r12
287cabdff1aSopenharmony_ci        sub             r0,  r4,  #2
288cabdff1aSopenharmony_ci
289cabdff1aSopenharmony_ci        bl              h_loop_filter_chroma420
290cabdff1aSopenharmony_ci        pop             {r4, pc}
291cabdff1aSopenharmony_ciendfunc
292cabdff1aSopenharmony_ci
293cabdff1aSopenharmony_ci@ Biweighted prediction
294cabdff1aSopenharmony_ci
@ biweight_16 macs, macd
@ Loop body of the 16 pixel wide bi-weighted prediction; expanded by biweight_func.
@ Args: macd = widening multiply-accumulate used for the r0 rows times d0
@       macs = widening multiply-accumulate used for the r1 rows times d1
@ In:   r0 = first source, r1 = second source, r6 = destination
@       r2 = distance between rows in bytes, r3 = number of rows
@       r4, r5 = weight magnitudes (only the low byte is broadcast)
@       q8 = start value of every 16-bit accumulator lane
@       q9 = per-lane vshl counts (negative counts shift right)
@ Two rows are processed per iteration and the loop only ends when r3 hits
@ exactly 0, so r3 has to be a positive even number.
@ All accesses carry 128-bit alignment hints.
@ Ends with pop {r4-r6, pc}, i.e. it returns from the enclosing function.
295cabdff1aSopenharmony_ci.macro  biweight_16     macs, macd
296cabdff1aSopenharmony_ci        vdup.8          d0,  r4
297cabdff1aSopenharmony_ci        vdup.8          d1,  r5
298cabdff1aSopenharmony_ci        vmov            q2,  q8         @ q2/q3 = accumulators of the first row
299cabdff1aSopenharmony_ci        vmov            q3,  q8
300cabdff1aSopenharmony_ci1:      subs            r3,  r3,  #2    @ flags are consumed by bne at the loop end
301cabdff1aSopenharmony_ci        vld1.8          {d20-d21},[r0,:128], r2
302cabdff1aSopenharmony_ci        \macd           q2,  d0,  d20
303cabdff1aSopenharmony_ci        pld             [r0]
304cabdff1aSopenharmony_ci        \macd           q3,  d0,  d21
305cabdff1aSopenharmony_ci        vld1.8          {d22-d23},[r1,:128], r2
306cabdff1aSopenharmony_ci        \macs           q2,  d1,  d22
307cabdff1aSopenharmony_ci        pld             [r1]
308cabdff1aSopenharmony_ci        \macs           q3,  d1,  d23
309cabdff1aSopenharmony_ci        vmov            q12, q8         @ q12/q13 = accumulators of the second row
310cabdff1aSopenharmony_ci        vld1.8          {d28-d29},[r0,:128], r2
311cabdff1aSopenharmony_ci        vmov            q13, q8
312cabdff1aSopenharmony_ci        \macd           q12, d0,  d28
313cabdff1aSopenharmony_ci        pld             [r0]
314cabdff1aSopenharmony_ci        \macd           q13, d0,  d29
315cabdff1aSopenharmony_ci        vld1.8          {d30-d31},[r1,:128], r2
316cabdff1aSopenharmony_ci        \macs           q12, d1,  d30
317cabdff1aSopenharmony_ci        pld             [r1]
318cabdff1aSopenharmony_ci        \macs           q13, d1,  d31
319cabdff1aSopenharmony_ci        vshl.s16        q2,  q2,  q9    @ scale down by the counts in q9
320cabdff1aSopenharmony_ci        vshl.s16        q3,  q3,  q9
321cabdff1aSopenharmony_ci        vqmovun.s16     d4,  q2         @ saturate to 0..255
322cabdff1aSopenharmony_ci        vqmovun.s16     d5,  q3
323cabdff1aSopenharmony_ci        vshl.s16        q12, q12, q9
324cabdff1aSopenharmony_ci        vshl.s16        q13, q13, q9
325cabdff1aSopenharmony_ci        vqmovun.s16     d24, q12
326cabdff1aSopenharmony_ci        vqmovun.s16     d25, q13
327cabdff1aSopenharmony_ci        vmov            q3,  q8         @ reset accumulators for the next iteration
328cabdff1aSopenharmony_ci        vst1.8          {d4- d5}, [r6,:128], r2
329cabdff1aSopenharmony_ci        vmov            q2,  q8
330cabdff1aSopenharmony_ci        vst1.8          {d24-d25},[r6,:128], r2
331cabdff1aSopenharmony_ci        bne             1b
332cabdff1aSopenharmony_ci        pop             {r4-r6, pc}
333cabdff1aSopenharmony_ci.endm
334cabdff1aSopenharmony_ci
@ biweight_8 macs, macd
@ Loop body of the 8 pixel wide bi-weighted prediction; expanded by biweight_func.
@ Args: macd = widening multiply-accumulate used for the r0 rows times d0
@       macs = widening multiply-accumulate used for the r1 rows times d1
@ In:   r0 = first source, r1 = second source, r6 = destination
@       r2 = distance between rows in bytes, r3 = number of rows
@       r4, r5 = weight magnitudes (only the low byte is broadcast)
@       q8 = start value of every 16-bit accumulator lane
@       q9 = per-lane vshl counts (negative counts shift right)
@ Two rows are processed per iteration and the loop only ends when r3 hits
@ exactly 0, so r3 has to be a positive even number.
@ All accesses carry 64-bit alignment hints.
@ Ends with pop {r4-r6, pc}, i.e. it returns from the enclosing function.
335cabdff1aSopenharmony_ci.macro  biweight_8      macs, macd
336cabdff1aSopenharmony_ci        vdup.8          d0,  r4
337cabdff1aSopenharmony_ci        vdup.8          d1,  r5
338cabdff1aSopenharmony_ci        vmov            q1,  q8         @ q1 = accumulator of the first row
339cabdff1aSopenharmony_ci        vmov            q10, q8         @ q10 = accumulator of the second row
340cabdff1aSopenharmony_ci1:      subs            r3,  r3,  #2    @ flags are consumed by bne at the loop end
341cabdff1aSopenharmony_ci        vld1.8          {d4},[r0,:64], r2
342cabdff1aSopenharmony_ci        \macd           q1,  d0,  d4
343cabdff1aSopenharmony_ci        pld             [r0]
344cabdff1aSopenharmony_ci        vld1.8          {d5},[r1,:64], r2
345cabdff1aSopenharmony_ci        \macs           q1,  d1,  d5
346cabdff1aSopenharmony_ci        pld             [r1]
347cabdff1aSopenharmony_ci        vld1.8          {d6},[r0,:64], r2
348cabdff1aSopenharmony_ci        \macd           q10, d0,  d6
349cabdff1aSopenharmony_ci        pld             [r0]
350cabdff1aSopenharmony_ci        vld1.8          {d7},[r1,:64], r2
351cabdff1aSopenharmony_ci        \macs           q10, d1,  d7
352cabdff1aSopenharmony_ci        pld             [r1]
353cabdff1aSopenharmony_ci        vshl.s16        q1,  q1,  q9    @ scale down by the counts in q9
354cabdff1aSopenharmony_ci        vqmovun.s16     d2,  q1         @ saturate to 0..255
355cabdff1aSopenharmony_ci        vshl.s16        q10, q10, q9
356cabdff1aSopenharmony_ci        vqmovun.s16     d4,  q10
357cabdff1aSopenharmony_ci        vmov            q10, q8         @ reset accumulators for the next iteration
358cabdff1aSopenharmony_ci        vst1.8          {d2},[r6,:64], r2
359cabdff1aSopenharmony_ci        vmov            q1,  q8
360cabdff1aSopenharmony_ci        vst1.8          {d4},[r6,:64], r2
361cabdff1aSopenharmony_ci        bne             1b
362cabdff1aSopenharmony_ci        pop             {r4-r6, pc}
363cabdff1aSopenharmony_ci.endm
364cabdff1aSopenharmony_ci
@ biweight_4 macs, macd
@ Loop body of the 4 pixel wide bi-weighted prediction; expanded by biweight_func.
@ Args: macd = widening multiply-accumulate used for the r0 rows times d0
@       macs = widening multiply-accumulate used for the r1 rows times d1
@ In:   r0 = first source, r1 = second source, r6 = destination
@       r2 = distance between rows in bytes, r3 = number of rows
@       r4, r5 = weight magnitudes (only the low byte is broadcast)
@       q8 = start value of every 16-bit accumulator lane
@       q9 = per-lane vshl counts (negative counts shift right)
@ Two 4-byte rows are packed into each d register and four rows are handled
@ per iteration. When only two rows remain (r3 becomes negative after the
@ subtraction) the tail at label 2 finishes them, so r3 has to be a positive
@ even number. All accesses carry 32-bit alignment hints.
@ Both exits use pop {r4-r6, pc}, i.e. they return from the enclosing function.
365cabdff1aSopenharmony_ci.macro  biweight_4      macs, macd
366cabdff1aSopenharmony_ci        vdup.8          d0,  r4
367cabdff1aSopenharmony_ci        vdup.8          d1,  r5
368cabdff1aSopenharmony_ci        vmov            q1,  q8         @ q1 = accumulator of rows 0-1
369cabdff1aSopenharmony_ci        vmov            q10, q8         @ q10 = accumulator of rows 2-3
370cabdff1aSopenharmony_ci1:      subs            r3,  r3,  #4    @ flags are consumed by blt and bne below
371cabdff1aSopenharmony_ci        vld1.32         {d4[0]},[r0,:32], r2
372cabdff1aSopenharmony_ci        vld1.32         {d4[1]},[r0,:32], r2
373cabdff1aSopenharmony_ci        \macd           q1,  d0,  d4
374cabdff1aSopenharmony_ci        pld             [r0]
375cabdff1aSopenharmony_ci        vld1.32         {d5[0]},[r1,:32], r2
376cabdff1aSopenharmony_ci        vld1.32         {d5[1]},[r1,:32], r2
377cabdff1aSopenharmony_ci        \macs           q1,  d1,  d5
378cabdff1aSopenharmony_ci        pld             [r1]
379cabdff1aSopenharmony_ci        blt             2f              @ fewer than 4 rows were left
380cabdff1aSopenharmony_ci        vld1.32         {d6[0]},[r0,:32], r2
381cabdff1aSopenharmony_ci        vld1.32         {d6[1]},[r0,:32], r2
382cabdff1aSopenharmony_ci        \macd           q10, d0,  d6
383cabdff1aSopenharmony_ci        pld             [r0]
384cabdff1aSopenharmony_ci        vld1.32         {d7[0]},[r1,:32], r2
385cabdff1aSopenharmony_ci        vld1.32         {d7[1]},[r1,:32], r2
386cabdff1aSopenharmony_ci        \macs           q10, d1,  d7
387cabdff1aSopenharmony_ci        pld             [r1]
388cabdff1aSopenharmony_ci        vshl.s16        q1,  q1,  q9    @ scale down by the counts in q9
389cabdff1aSopenharmony_ci        vqmovun.s16     d2,  q1         @ saturate to 0..255
390cabdff1aSopenharmony_ci        vshl.s16        q10, q10, q9
391cabdff1aSopenharmony_ci        vqmovun.s16     d4,  q10
392cabdff1aSopenharmony_ci        vmov            q10, q8         @ reset accumulators for the next iteration
393cabdff1aSopenharmony_ci        vst1.32         {d2[0]},[r6,:32], r2
394cabdff1aSopenharmony_ci        vst1.32         {d2[1]},[r6,:32], r2
395cabdff1aSopenharmony_ci        vmov            q1,  q8
396cabdff1aSopenharmony_ci        vst1.32         {d4[0]},[r6,:32], r2
397cabdff1aSopenharmony_ci        vst1.32         {d4[1]},[r6,:32], r2
398cabdff1aSopenharmony_ci        bne             1b
399cabdff1aSopenharmony_ci        pop             {r4-r6, pc}
        @ Tail: only the two rows accumulated in q1 are finished and stored.
400cabdff1aSopenharmony_ci2:      vshl.s16        q1,  q1,  q9
401cabdff1aSopenharmony_ci        vqmovun.s16     d2,  q1
402cabdff1aSopenharmony_ci        vst1.32         {d2[0]},[r6,:32], r2
403cabdff1aSopenharmony_ci        vst1.32         {d2[1]},[r6,:32], r2
404cabdff1aSopenharmony_ci        pop             {r4-r6, pc}
405cabdff1aSopenharmony_ci.endm
406cabdff1aSopenharmony_ci
@ biweight_func w
@ Defines ff_biweight_h264_pixels_<w>_neon for block width w (16, 8 or 4).
@ In:   r0 = destination, also read as the first source
@       r1 = second source
@       r2 = distance between rows in bytes
@       r3 = number of rows
@       four stack words, loaded into r12, r4, r5, r6 in that order
@       (presumably log2_denom, the weight for r0, the weight for r1 and the
@       offset of the C prototype - confirm against the caller)
@ Set-up: q8 = ((r6 + 1) | 1) << r12 in every 16-bit lane, used as the start
@ value of the accumulators; q9 = ~r12 = -(r12 + 1) in every lane, so the
@ final vshl shifts right by r12 + 1.
@ The NEON multiplies are unsigned, so the function branches on the signs of
@ the two weights, negates the negative ones and selects vmlal or vmlsl for
@ each source accordingly. Each biweight_<w> expansion ends with its own
@ return, so execution never falls through from one variant to the next.
407cabdff1aSopenharmony_ci.macro  biweight_func   w
408cabdff1aSopenharmony_cifunction ff_biweight_h264_pixels_\w\()_neon, export=1
409cabdff1aSopenharmony_ci        push            {r4-r6, lr}
410cabdff1aSopenharmony_ci        ldr             r12, [sp, #16]          @ first stack argument
411cabdff1aSopenharmony_ci        add             r4,  sp,  #20
412cabdff1aSopenharmony_ci        ldm             r4,  {r4-r6}            @ next three stack arguments
413cabdff1aSopenharmony_ci        lsr             lr,  r4,  #31           @ lr = 1 if r4 is negative
414cabdff1aSopenharmony_ci        add             r6,  r6,  #1
        @ r5 lsr 30 is 0 for small non-negative r5 and 3 for small negative
        @ r5 (bits 30 and 31 equal). lr becomes 0: both weights >= 0,
        @ 1: only r4 < 0, 2: both < 0, 3: only r5 < 0. The Z flag set here
        @ is consumed by the first beq below; nothing in between alters flags.
415cabdff1aSopenharmony_ci        eors            lr,  lr,  r5,  lsr #30
416cabdff1aSopenharmony_ci        orr             r6,  r6,  #1
417cabdff1aSopenharmony_ci        vdup.16         q9,  r12
418cabdff1aSopenharmony_ci        lsl             r6,  r6,  r12
419cabdff1aSopenharmony_ci        vmvn            q9,  q9                 @ q9 = -(r12 + 1) per lane
420cabdff1aSopenharmony_ci        vdup.16         q8,  r6
421cabdff1aSopenharmony_ci        mov             r6,  r0                 @ r6 = destination pointer
422cabdff1aSopenharmony_ci        beq             10f
423cabdff1aSopenharmony_ci        subs            lr,  lr,  #1
424cabdff1aSopenharmony_ci        beq             20f
425cabdff1aSopenharmony_ci        subs            lr,  lr,  #1
426cabdff1aSopenharmony_ci        beq             30f
427cabdff1aSopenharmony_ci        b               40f
        @ 10: both weights non-negative, accumulate both products.
428cabdff1aSopenharmony_ci10:     biweight_\w     vmlal.u8, vmlal.u8
        @ 20: only r4 negative, subtract the r0 product.
429cabdff1aSopenharmony_ci20:     rsb             r4,  r4,  #0
430cabdff1aSopenharmony_ci        biweight_\w     vmlal.u8, vmlsl.u8
        @ 30: both weights negative, subtract both products.
431cabdff1aSopenharmony_ci30:     rsb             r4,  r4,  #0
432cabdff1aSopenharmony_ci        rsb             r5,  r5,  #0
433cabdff1aSopenharmony_ci        biweight_\w     vmlsl.u8, vmlsl.u8
        @ 40: only r5 negative, subtract the r1 product.
434cabdff1aSopenharmony_ci40:     rsb             r5,  r5,  #0
435cabdff1aSopenharmony_ci        biweight_\w     vmlsl.u8, vmlal.u8
436cabdff1aSopenharmony_ciendfunc
437cabdff1aSopenharmony_ci.endm
438cabdff1aSopenharmony_ci
@ Instantiate ff_biweight_h264_pixels_16_neon, ff_biweight_h264_pixels_8_neon
@ and ff_biweight_h264_pixels_4_neon.
439cabdff1aSopenharmony_ci        biweight_func   16
440cabdff1aSopenharmony_ci        biweight_func   8
441cabdff1aSopenharmony_ci        biweight_func   4
442cabdff1aSopenharmony_ci
443cabdff1aSopenharmony_ci@ Weighted prediction
444cabdff1aSopenharmony_ci
@ weight_16 add
@ Loop body of the 16 pixel wide weighted prediction; expanded by weight_func.
@ Args: add = instruction that combines q8 with the product, used as
@             add dst, q8, product
@ In:   r0 = source, r4 = destination
@       r1 = distance between rows in bytes, r2 = number of rows
@       r12 = weight magnitude (only the low byte is broadcast)
@       q8 = offset term in every 16-bit lane
@       q9 = per-lane vrshl counts (negative counts are rounding right shifts)
@ Two rows are processed per iteration and the loop only ends when r2 hits
@ exactly 0, so r2 has to be a positive even number.
@ All accesses carry 128-bit alignment hints.
@ Ends with pop {r4, pc}, i.e. it returns from the enclosing function.
445cabdff1aSopenharmony_ci.macro  weight_16       add
446cabdff1aSopenharmony_ci        vdup.8          d0,  r12
447cabdff1aSopenharmony_ci1:      subs            r2,  r2,  #2    @ flags are consumed by bne at the loop end
448cabdff1aSopenharmony_ci        vld1.8          {d20-d21},[r0,:128], r1
449cabdff1aSopenharmony_ci        vmull.u8        q2,  d0,  d20   @ q2/q3 = weight * first row
450cabdff1aSopenharmony_ci        pld             [r0]
451cabdff1aSopenharmony_ci        vmull.u8        q3,  d0,  d21
452cabdff1aSopenharmony_ci        vld1.8          {d28-d29},[r0,:128], r1
453cabdff1aSopenharmony_ci        vmull.u8        q12, d0,  d28   @ q12/q13 = weight * second row
454cabdff1aSopenharmony_ci        pld             [r0]
455cabdff1aSopenharmony_ci        vmull.u8        q13, d0,  d29
456cabdff1aSopenharmony_ci        \add            q2,  q8,  q2
457cabdff1aSopenharmony_ci        vrshl.s16       q2,  q2,  q9
458cabdff1aSopenharmony_ci        \add            q3,  q8,  q3
459cabdff1aSopenharmony_ci        vrshl.s16       q3,  q3,  q9
460cabdff1aSopenharmony_ci        vqmovun.s16     d4,  q2         @ saturate to 0..255
461cabdff1aSopenharmony_ci        vqmovun.s16     d5,  q3
462cabdff1aSopenharmony_ci        \add            q12, q8,  q12
463cabdff1aSopenharmony_ci        vrshl.s16       q12, q12, q9
464cabdff1aSopenharmony_ci        \add            q13, q8,  q13
465cabdff1aSopenharmony_ci        vrshl.s16       q13, q13, q9
466cabdff1aSopenharmony_ci        vqmovun.s16     d24, q12
467cabdff1aSopenharmony_ci        vqmovun.s16     d25, q13
468cabdff1aSopenharmony_ci        vst1.8          {d4- d5}, [r4,:128], r1
469cabdff1aSopenharmony_ci        vst1.8          {d24-d25},[r4,:128], r1
470cabdff1aSopenharmony_ci        bne             1b
471cabdff1aSopenharmony_ci        pop             {r4, pc}
472cabdff1aSopenharmony_ci.endm
473cabdff1aSopenharmony_ci
@ weight_8 add
@ Loop body of the 8 pixel wide weighted prediction; expanded by weight_func.
@ Args: add = instruction that combines q8 with the product, used as
@             add dst, q8, product
@ In:   r0 = source, r4 = destination
@       r1 = distance between rows in bytes, r2 = number of rows
@       r12 = weight magnitude (only the low byte is broadcast)
@       q8 = offset term in every 16-bit lane
@       q9 = per-lane vrshl counts (negative counts are rounding right shifts)
@ Two rows are processed per iteration and the loop only ends when r2 hits
@ exactly 0, so r2 has to be a positive even number.
@ All accesses carry 64-bit alignment hints.
@ Ends with pop {r4, pc}, i.e. it returns from the enclosing function.
474cabdff1aSopenharmony_ci.macro  weight_8        add
475cabdff1aSopenharmony_ci        vdup.8          d0,  r12
476cabdff1aSopenharmony_ci1:      subs            r2,  r2,  #2    @ flags are consumed by bne at the loop end
477cabdff1aSopenharmony_ci        vld1.8          {d4},[r0,:64], r1
478cabdff1aSopenharmony_ci        vmull.u8        q1,  d0,  d4    @ q1 = weight * first row
479cabdff1aSopenharmony_ci        pld             [r0]
480cabdff1aSopenharmony_ci        vld1.8          {d6},[r0,:64], r1
481cabdff1aSopenharmony_ci        vmull.u8        q10, d0,  d6    @ q10 = weight * second row
482cabdff1aSopenharmony_ci        \add            q1,  q8,  q1
483cabdff1aSopenharmony_ci        pld             [r0]
484cabdff1aSopenharmony_ci        vrshl.s16       q1,  q1,  q9
485cabdff1aSopenharmony_ci        vqmovun.s16     d2,  q1         @ saturate to 0..255
486cabdff1aSopenharmony_ci        \add            q10, q8,  q10
487cabdff1aSopenharmony_ci        vrshl.s16       q10, q10, q9
488cabdff1aSopenharmony_ci        vqmovun.s16     d4,  q10
489cabdff1aSopenharmony_ci        vst1.8          {d2},[r4,:64], r1
490cabdff1aSopenharmony_ci        vst1.8          {d4},[r4,:64], r1
491cabdff1aSopenharmony_ci        bne             1b
492cabdff1aSopenharmony_ci        pop             {r4, pc}
493cabdff1aSopenharmony_ci.endm
494cabdff1aSopenharmony_ci
@ weight_4 add
@ Loop body of the 4 pixel wide weighted prediction; expanded by weight_func.
@ Args: add = instruction that combines q8 with the product, used as
@             add dst, q8, product
@ In:   r0 = source, r4 = destination
@       r1 = distance between rows in bytes, r2 = number of rows
@       r12 = weight magnitude (only the low byte is broadcast)
@       q8 = offset term in every 16-bit lane
@       q9 = per-lane vrshl counts (negative counts are rounding right shifts)
@ Two 4-byte rows are packed into each d register and four rows are handled
@ per iteration. When only two rows remain (r2 becomes negative after the
@ subtraction) the tail at label 2 finishes them, so r2 has to be a positive
@ even number. All accesses carry 32-bit alignment hints.
@ Both exits use pop {r4, pc}, i.e. they return from the enclosing function.
@ q1 and q10 need no initialisation: vmull.u8 writes the whole destination
@ and q8 is applied afterwards through the add instruction.
495cabdff1aSopenharmony_ci.macro  weight_4        add
496cabdff1aSopenharmony_ci        vdup.8          d0,  r12
499cabdff1aSopenharmony_ci1:      subs            r2,  r2,  #4    @ flags are consumed by blt and bne below
500cabdff1aSopenharmony_ci        vld1.32         {d4[0]},[r0,:32], r1
501cabdff1aSopenharmony_ci        vld1.32         {d4[1]},[r0,:32], r1
502cabdff1aSopenharmony_ci        vmull.u8        q1,  d0,  d4    @ q1 = weight * rows 0-1
503cabdff1aSopenharmony_ci        pld             [r0]
504cabdff1aSopenharmony_ci        blt             2f              @ fewer than 4 rows were left
505cabdff1aSopenharmony_ci        vld1.32         {d6[0]},[r0,:32], r1
506cabdff1aSopenharmony_ci        vld1.32         {d6[1]},[r0,:32], r1
507cabdff1aSopenharmony_ci        vmull.u8        q10, d0,  d6    @ q10 = weight * rows 2-3
508cabdff1aSopenharmony_ci        pld             [r0]
509cabdff1aSopenharmony_ci        \add            q1,  q8,  q1
510cabdff1aSopenharmony_ci        vrshl.s16       q1,  q1,  q9
511cabdff1aSopenharmony_ci        vqmovun.s16     d2,  q1         @ saturate to 0..255
512cabdff1aSopenharmony_ci        \add            q10, q8,  q10
513cabdff1aSopenharmony_ci        vrshl.s16       q10, q10, q9
514cabdff1aSopenharmony_ci        vqmovun.s16     d4,  q10
516cabdff1aSopenharmony_ci        vst1.32         {d2[0]},[r4,:32], r1
517cabdff1aSopenharmony_ci        vst1.32         {d2[1]},[r4,:32], r1
519cabdff1aSopenharmony_ci        vst1.32         {d4[0]},[r4,:32], r1
520cabdff1aSopenharmony_ci        vst1.32         {d4[1]},[r4,:32], r1
521cabdff1aSopenharmony_ci        bne             1b
522cabdff1aSopenharmony_ci        pop             {r4, pc}
        @ Tail: only the two rows held in q1 are finished and stored.
523cabdff1aSopenharmony_ci2:      \add            q1,  q8,  q1
524cabdff1aSopenharmony_ci        vrshl.s16       q1,  q1,  q9
525cabdff1aSopenharmony_ci        vqmovun.s16     d2,  q1
526cabdff1aSopenharmony_ci        vst1.32         {d2[0]},[r4,:32], r1
527cabdff1aSopenharmony_ci        vst1.32         {d2[1]},[r4,:32], r1
528cabdff1aSopenharmony_ci        pop             {r4, pc}
529cabdff1aSopenharmony_ci.endm
530cabdff1aSopenharmony_ci
@ weight_func w
@ Defines ff_weight_h264_pixels_<w>_neon for block width w (16, 8 or 4).
@ In:   r0 = block, read and written in place
@       r1 = distance between rows in bytes
@       r2 = number of rows
@       r3 = shift amount (presumably log2_denom of the C prototype - confirm)
@       two stack words, loaded into r12 and r4 in that order
@       (presumably weight and offset - confirm against the caller)
@ Set-up: q8 = r4 << r3 in every 16-bit lane; r4 is then reused as the
@ destination pointer.
@ For r3 > 1 a halving add/sub is combined with a rounding shift by r3 - 1;
@ otherwise a plain add/sub is combined with a rounding shift by r3.
@ The NEON multiply is unsigned, so a negative weight is negated and the
@ product is subtracted from q8 instead of added. Each weight_<w> expansion
@ ends with its own return, so execution never falls through to the next
@ variant. Label 10 is defined twice; each blt 10f binds to the next one.
531cabdff1aSopenharmony_ci.macro  weight_func     w
532cabdff1aSopenharmony_cifunction ff_weight_h264_pixels_\w\()_neon, export=1
533cabdff1aSopenharmony_ci        push            {r4, lr}
534cabdff1aSopenharmony_ci        ldr             r12, [sp, #8]           @ first stack argument
535cabdff1aSopenharmony_ci        ldr             r4,  [sp, #12]          @ second stack argument
536cabdff1aSopenharmony_ci        cmp             r3,  #1                 @ flags are consumed by ble below
537cabdff1aSopenharmony_ci        lsl             r4,  r4,  r3
538cabdff1aSopenharmony_ci        vdup.16         q8,  r4
539cabdff1aSopenharmony_ci        mov             r4,  r0                 @ r4 = destination pointer
540cabdff1aSopenharmony_ci        ble             20f
541cabdff1aSopenharmony_ci        rsb             lr,  r3,  #1
542cabdff1aSopenharmony_ci        vdup.16         q9,  lr                 @ q9 = 1 - r3: shift right by r3 - 1
543cabdff1aSopenharmony_ci        cmp             r12, #0
544cabdff1aSopenharmony_ci        blt             10f
545cabdff1aSopenharmony_ci        weight_\w       vhadd.s16
546cabdff1aSopenharmony_ci10:     rsb             r12, r12, #0
547cabdff1aSopenharmony_ci        weight_\w       vhsub.s16
548cabdff1aSopenharmony_ci20:     rsb             lr,  r3,  #0
549cabdff1aSopenharmony_ci        vdup.16         q9,  lr                 @ q9 = -r3: shift right by r3
550cabdff1aSopenharmony_ci        cmp             r12, #0
551cabdff1aSopenharmony_ci        blt             10f
552cabdff1aSopenharmony_ci        weight_\w       vadd.s16
553cabdff1aSopenharmony_ci10:     rsb             r12, r12, #0
554cabdff1aSopenharmony_ci        weight_\w       vsub.s16
555cabdff1aSopenharmony_ciendfunc
556cabdff1aSopenharmony_ci.endm
557cabdff1aSopenharmony_ci
@ Instantiate ff_weight_h264_pixels_16_neon, ff_weight_h264_pixels_8_neon
@ and ff_weight_h264_pixels_4_neon.
558cabdff1aSopenharmony_ci        weight_func     16
559cabdff1aSopenharmony_ci        weight_func     8
560cabdff1aSopenharmony_ci        weight_func     4
561