/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"

// Common entry check for the normal (non-intra) loop filters.
//
// In:       w2 = alpha, w3 = beta
//           x4 = pointer to four signed clipping bytes
//                (presumably the tc0[4] argument of the C prototype -- confirm)
// Out:      w6 = the four clipping bytes as one 32-bit word
//           v24.s[0] = same word (other lanes of v24 are left untouched)
// Clobbers: w8, flags
//
// Executes `ret` for the enclosing function when there is nothing to do:
//   - alpha == 0 or beta == 0, or
//   - all four clipping bytes are negative.
// Otherwise falls through at label 2.
.macro  h264_loop_filter_start
        cmp             w2,  #0
        ldr             w6,  [x4]
        // alpha != 0: Z = (beta == 0).  alpha == 0: flags := #4 (Z set), so
        // the b.eq below also returns.  The nzcv immediate must have the Z
        // bit set for that; with #0 the alpha == 0 early-out never fired.
        ccmp            w3,  #0, #4, ne
        mov             v24.s[0], w6
        and             w8,  w6,  w6,  lsl #16          // does not touch flags
        b.eq            1f
        ands            w8,  w8,  w8,  lsl #8           // bit 31 = AND of the four sign bits
        b.ge            2f                              // N clear: at least one byte >= 0
1:
        ret
2:
.endm

// Normal luma deblocking of 16 pixel positions across one edge.
//
// In:       v20 = p2, v18 = p1, v16 = p0, v0 = q0, v2 = q1, v4 = q2
//           w2  = alpha, w3 = beta
//           v24.s[0] = four signed clipping bytes; byte i applies to
//                      lanes 4*i .. 4*i+3
// Out:      v17 = p1', v16 = p0', v0 = q0', v19 = q1'
// Clobbers: v4, v20-v24, v28, v30, x7
//
// Branches to label 9 of the invoking function when no lane passes the
// filter condition; v16 and v0 are still unmodified at that point.
.macro  h264_loop_filter_luma
        dup             v22.16b, w2                     // alpha
        uxtl            v24.8h,  v24.8b
        uabd            v21.16b, v16.16b, v0.16b        // abs(p0 - q0)
        uxtl            v24.4s,  v24.4h                 // one clipping byte per 32-bit lane
        uabd            v28.16b, v18.16b, v16.16b       // abs(p1 - p0)
        sli             v24.8h,  v24.8h,  #8
        uabd            v30.16b, v2.16b,  v0.16b        // abs(q1 - q0)
        sli             v24.4s,  v24.4s,  #16           // byte replicated into all 4 lanes
        cmhi            v21.16b, v22.16b, v21.16b       // < alpha
        dup             v22.16b, w3                     // beta
        cmlt            v23.16b, v24.16b, #0            // lanes with a negative clipping value
        cmhi            v28.16b, v22.16b, v28.16b       // < beta
        cmhi            v30.16b, v22.16b, v30.16b       // < beta
        bic             v21.16b, v21.16b, v23.16b       // drop negative-clip lanes
        uabd            v17.16b, v20.16b, v16.16b       // abs(p2 - p0)
        and             v21.16b, v21.16b, v28.16b
        uabd            v19.16b, v4.16b,  v0.16b        // abs(q2 - q0)
        and             v21.16b, v21.16b, v30.16b       // v21 = per-lane filter mask
        shrn            v30.8b,  v21.8h,  #4            // squeeze the 128-bit mask into 64 bits
        mov             x7,  v30.d[0]
        cmhi            v17.16b, v22.16b, v17.16b       // < beta
        cmhi            v19.16b, v22.16b, v19.16b       // < beta
        cbz             x7,  9f                         // no lane needs filtering
        and             v17.16b, v17.16b, v21.16b       // mask: lanes where p1 is modified
        and             v19.16b, v19.16b, v21.16b       // mask: lanes where q1 is modified
        and             v24.16b, v24.16b, v21.16b       // clipping value, 0 in unfiltered lanes
        urhadd          v28.16b, v16.16b, v0.16b        // (p0 + q0 + 1) >> 1
        sub             v21.16b, v24.16b, v17.16b       // tc = clip + 1 where p1 mask is set (mask = -1)
        uqadd           v23.16b, v18.16b, v24.16b       // p1 + clip
        uhadd           v20.16b, v20.16b, v28.16b       // (p2 + avg(p0, q0)) >> 1
        sub             v21.16b, v21.16b, v19.16b       // tc += 1 where q1 mask is set
        uhadd           v28.16b, v4.16b,  v28.16b       // (q2 + avg(p0, q0)) >> 1
        umin            v23.16b, v23.16b, v20.16b
        uqsub           v22.16b, v18.16b, v24.16b       // p1 - clip
        uqadd           v4.16b,  v2.16b,  v24.16b       // q1 + clip
        umax            v23.16b, v23.16b, v22.16b       // p1 candidate, clamped to p1 +/- clip
        uqsub           v22.16b, v2.16b,  v24.16b       // q1 - clip
        umin            v28.16b, v4.16b,  v28.16b
        uxtl            v4.8h,   v0.8b
        umax            v28.16b, v28.16b, v22.16b       // q1 candidate, clamped to q1 +/- clip
        uxtl2           v20.8h,  v0.16b
        usubw           v4.8h,   v4.8h,   v16.8b        // q0 - p0
        usubw2          v20.8h,  v20.8h,  v16.16b
        shl             v4.8h,   v4.8h,   #2            // 4 * (q0 - p0)
        shl             v20.8h,  v20.8h,  #2
        uaddw           v4.8h,   v4.8h,   v18.8b        // + p1
        uaddw2          v20.8h,  v20.8h,  v18.16b
        usubw           v4.8h,   v4.8h,   v2.8b         // - q1
        usubw2          v20.8h,  v20.8h,  v2.16b
        rshrn           v4.8b,   v4.8h,   #3            // delta: rounded >> 3, narrowed to 8 bits
        rshrn2          v4.16b,  v20.8h,  #3
        bsl             v17.16b, v23.16b, v18.16b       // p1' = mask ? candidate : p1
        bsl             v19.16b, v28.16b, v2.16b        // q1' = mask ? candidate : q1
        neg             v23.16b, v21.16b                // -tc
        uxtl            v28.8h,  v16.8b
        smin            v4.16b,  v4.16b,  v21.16b
        uxtl2           v21.8h,  v16.16b
        smax            v4.16b,  v4.16b,  v23.16b       // delta clamped to [-tc, tc]
        uxtl            v22.8h,  v0.8b
        uxtl2           v24.8h,  v0.16b
        saddw           v28.8h,  v28.8h,  v4.8b         // p0 + delta
        saddw2          v21.8h,  v21.8h,  v4.16b
        ssubw           v22.8h,  v22.8h,  v4.8b         // q0 - delta
        ssubw2          v24.8h,  v24.8h,  v4.16b
        sqxtun          v16.8b,  v28.8h                 // saturate back to unsigned 8 bit
        sqxtun2         v16.16b, v21.8h
        sqxtun          v0.8b,   v22.8h
        sqxtun2         v0.16b,  v24.8h
.endm

// Normal luma deblocking across a horizontal edge, 16 pixels wide.
//
// In:  x0 = address of the first row below the edge (the q0 row)
//      x1 = row stride in bytes
//      w2 = alpha, w3 = beta
//      x4 = pointer to the four clipping bytes (see h264_loop_filter_start)
// Rewrites the p1, p0, q0 and q1 rows in place; p2 and q2 are only read.
function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        ld1             {v0.16b},  [x0], x1             // q0
        ld1             {v2.16b},  [x0], x1             // q1
        ld1             {v4.16b},  [x0], x1             // q2
        sub             x0,  x0,  x1, lsl #2
        sub             x0,  x0,  x1, lsl #1            // 6 rows back: the p2 row
        ld1             {v20.16b}, [x0], x1             // p2
        ld1             {v18.16b}, [x0], x1             // p1
        ld1             {v16.16b}, [x0], x1             // p0; x0 is on the q0 row again

        h264_loop_filter_luma

        sub             x0,  x0,  x1, lsl #1            // p1 row
        st1             {v17.16b}, [x0], x1             // p1'
        st1             {v16.16b}, [x0], x1             // p0'
        st1             {v0.16b},  [x0], x1             // q0'
        st1             {v19.16b}, [x0]                 // q1'
9:
        ret
endfunc

// Normal luma deblocking across a vertical edge, 16 rows high.
//
// In:  x0 = address of the first pixel right of the edge in the first row
//      x1 = row stride in bytes
//      w2 = alpha, w3 = beta
//      x4 = pointer to the four clipping bytes (see h264_loop_filter_start)
// Reads 8 bytes (4 on each side of the edge) from each of 16 rows and
// rewrites the 4 bytes p1, p0, q0, q1 of every row in place.
function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        sub             x0,  x0,  #4                    // 4 pixels left of the edge
        // Rows 0-7 go to the low halves, rows 8-15 to the high halves.
        ld1             {v6.8b},    [x0], x1
        ld1             {v20.8b},   [x0], x1
        ld1             {v18.8b},   [x0], x1
        ld1             {v16.8b},   [x0], x1
        ld1             {v0.8b},    [x0], x1
        ld1             {v2.8b},    [x0], x1
        ld1             {v4.8b},    [x0], x1
        ld1             {v26.8b},   [x0], x1
        ld1             {v6.d}[1],  [x0], x1
        ld1             {v20.d}[1], [x0], x1
        ld1             {v18.d}[1], [x0], x1
        ld1             {v16.d}[1], [x0], x1
        ld1             {v0.d}[1],  [x0], x1
        ld1             {v2.d}[1],  [x0], x1
        ld1             {v4.d}[1],  [x0], x1
        ld1             {v26.d}[1], [x0], x1

        // Transpose (macro from neon.S) so that each register holds one pixel
        // column; the filter macro then consumes v20/v18/v16 as p2/p1/p0 and
        // v0/v2/v4 as q0/q1/q2.  v6 and v26 (outermost columns) are not used
        // by the filter.
        transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23

        h264_loop_filter_luma

        // Turn the four result columns p1', p0', q0', q1' back into rows.
        transpose_4x16B v17, v16, v0, v19, v21, v23, v25, v27

        sub             x0,  x0,  x1, lsl #4            // back to the first row
        add             x0,  x0,  #2                    // column of p1
        st1             {v17.s}[0], [x0], x1
        st1             {v16.s}[0], [x0], x1
        st1             {v0.s}[0],  [x0], x1
        st1             {v19.s}[0], [x0], x1
        st1             {v17.s}[1], [x0], x1
        st1             {v16.s}[1], [x0], x1
        st1             {v0.s}[1],  [x0], x1
        st1             {v19.s}[1], [x0], x1
        st1             {v17.s}[2], [x0], x1
        st1             {v16.s}[2], [x0], x1
        st1             {v0.s}[2],  [x0], x1
        st1             {v19.s}[2], [x0], x1
        st1             {v17.s}[3], [x0], x1
        st1             {v16.s}[3], [x0], x1
        st1             {v0.s}[3],  [x0], x1
        st1             {v19.s}[3], [x0], x1
9:
        ret
endfunc


// Common entry for the intra loop filters.
//
// In:       w2 = alpha, w3 = beta
// Out:      v30 = alpha broadcast to 16 bytes, v31 = beta broadcast to 16 bytes
// Clobbers: w4
//
// Executes `ret` for the enclosing function only when alpha and beta are
// both zero (alpha | beta == 0).
.macro h264_loop_filter_start_intra
        orr             w4,  w2,  w3
        cbnz            w4,  1f
        ret
1:
        dup             v30.16b, w2                // alpha
        dup             v31.16b, w3                // beta
.endm

// Intra luma deblocking of 16 pixel positions across one edge.
//
// In:       v4 = p3, v5 = p2, v6 = p1, v7 = p0,
//           v0 = q0, v1 = q1, v2 = q2, v3 = q3
//           v30 = alpha, v31 = beta (each broadcast to all 16 bytes)
// Out:      v5 = p2', v6 = p1', v7 = p0', v0 = q0', v1 = q1', v2 = q2'
// Clobbers: v16-v31, x4   (v30/v31 no longer hold alpha/beta afterwards)
//
// Per-lane conditions, as named in the comments below:
//   if_1 = |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta
//   if_2 = |p0-q0| < (alpha >> 2) + 2
//   if_3 = |p2-p0| < beta
//   if_4 = |q2-q0| < beta
// Results with suffix _1 are the weak filter (only p0/q0 change), results
// with suffix _2 are the strong filter (three pixels per side change).
//
// Branches to label 9 of the invoking function when if_1 is false in every
// lane; no pixel register has been modified at that point.
.macro h264_loop_filter_luma_intra
        uabd            v16.16b, v7.16b,  v0.16b        // abs(p0 - q0)
        uabd            v17.16b, v6.16b,  v7.16b        // abs(p1 - p0)
        uabd            v18.16b, v1.16b,  v0.16b        // abs(q1 - q0)
        cmhi            v19.16b, v30.16b, v16.16b       // < alpha
        cmhi            v17.16b, v31.16b, v17.16b       // < beta
        cmhi            v18.16b, v31.16b, v18.16b       // < beta

        movi            v29.16b, #2
        ushr            v30.16b, v30.16b, #2            // alpha >> 2
        add             v30.16b, v30.16b, v29.16b       // (alpha >> 2) + 2
        cmhi            v16.16b, v30.16b, v16.16b       // < (alpha >> 2) + 2

        and             v19.16b, v19.16b, v17.16b
        and             v19.16b, v19.16b, v18.16b       // v19 = if_1
        shrn            v20.8b,  v19.8h,  #4            // squeeze the 128-bit mask into 64 bits
        mov             x4, v20.d[0]
        cbz             x4, 9f                          // no lane needs filtering

        // Weak filter:  v20/v21 = 2*p1 + p0 + q1,  v22/v23 = 2*q1 + q0 + p1
        ushll           v20.8h,  v6.8b,   #1
        ushll           v22.8h,  v1.8b,   #1
        ushll2          v21.8h,  v6.16b,  #1
        ushll2          v23.8h,  v1.16b,  #1
        uaddw           v20.8h,  v20.8h,  v7.8b
        uaddw           v22.8h,  v22.8h,  v0.8b
        uaddw2          v21.8h,  v21.8h,  v7.16b
        uaddw2          v23.8h,  v23.8h,  v0.16b
        uaddw           v20.8h,  v20.8h,  v1.8b
        uaddw           v22.8h,  v22.8h,  v6.8b
        uaddw2          v21.8h,  v21.8h,  v1.16b
        uaddw2          v23.8h,  v23.8h,  v6.16b

        rshrn           v24.8b,  v20.8h,  #2 // p0'_1
        rshrn           v25.8b,  v22.8h,  #2 // q0'_1
        rshrn2          v24.16b, v21.8h,  #2 // p0'_1
        rshrn2          v25.16b, v23.8h,  #2 // q0'_1

        uabd            v17.16b, v5.16b,  v7.16b        // abs(p2 - p0)
        uabd            v18.16b, v2.16b,  v0.16b        // abs(q2 - q0)
        cmhi            v17.16b, v31.16b, v17.16b       // < beta
        cmhi            v18.16b, v31.16b, v18.16b       // < beta

        and             v17.16b, v16.16b, v17.16b  // if_2 && if_3
        and             v18.16b, v16.16b, v18.16b  // if_2 && if_4

        not             v30.16b, v17.16b
        not             v31.16b, v18.16b

        and             v30.16b, v30.16b, v19.16b  // if_1 && !(if_2 && if_3)
        and             v31.16b, v31.16b, v19.16b  // if_1 && !(if_2 && if_4)

        and             v17.16b, v19.16b, v17.16b  // if_1 && if_2 && if_3
        and             v18.16b, v19.16b, v18.16b  // if_1 && if_2 && if_4

        // Strong filter, p side.  v26/v27 = p2 + p0 + q0; the weak-filter sum
        // in v20/v21 is reused to build p2 + 2*p1 + 2*p0 + 2*q0 + q1.
        uaddl           v26.8h,  v5.8b,   v7.8b
        uaddl2          v27.8h,  v5.16b,  v7.16b
        uaddw           v26.8h,  v26.8h,  v0.8b
        uaddw2          v27.8h,  v27.8h,  v0.16b
        add             v20.8h,  v20.8h,  v26.8h
        add             v21.8h,  v21.8h,  v27.8h
        uaddw           v20.8h,  v20.8h,  v0.8b
        uaddw2          v21.8h,  v21.8h,  v0.16b
        rshrn           v20.8b,  v20.8h,  #3 // p0'_2
        rshrn2          v20.16b, v21.8h,  #3 // p0'_2
        uaddw           v26.8h,  v26.8h,  v6.8b         // p2 + p1 + p0 + q0
        uaddw2          v27.8h,  v27.8h,  v6.16b
        rshrn           v21.8b,  v26.8h,  #2 // p1'_2
        rshrn2          v21.16b, v27.8h,  #2 // p1'_2
        uaddl           v28.8h,  v4.8b,   v5.8b
        uaddl2          v29.8h,  v4.16b,  v5.16b
        shl             v28.8h,  v28.8h,  #1            // 2 * (p3 + p2)
        shl             v29.8h,  v29.8h,  #1
        add             v28.8h,  v28.8h,  v26.8h        // 2*p3 + 3*p2 + p1 + p0 + q0
        add             v29.8h,  v29.8h,  v27.8h
        rshrn           v19.8b,  v28.8h,  #3 // p2'_2
        rshrn2          v19.16b, v29.8h,  #3 // p2'_2

        // Strong filter, q side: mirror image of the p side.
        uaddl           v26.8h,  v2.8b,   v0.8b
        uaddl2          v27.8h,  v2.16b,  v0.16b
        uaddw           v26.8h,  v26.8h,  v7.8b
        uaddw2          v27.8h,  v27.8h,  v7.16b
        add             v22.8h,  v22.8h,  v26.8h
        add             v23.8h,  v23.8h,  v27.8h
        uaddw           v22.8h,  v22.8h,  v7.8b
        uaddw2          v23.8h,  v23.8h,  v7.16b
        rshrn           v22.8b,  v22.8h,  #3 // q0'_2
        rshrn2          v22.16b, v23.8h,  #3 // q0'_2
        uaddw           v26.8h,  v26.8h,  v1.8b         // q2 + q1 + q0 + p0
        uaddw2          v27.8h,  v27.8h,  v1.16b
        rshrn           v23.8b,  v26.8h,  #2 // q1'_2
        rshrn2          v23.16b, v27.8h,  #2 // q1'_2
        uaddl           v28.8h,  v2.8b,   v3.8b
        uaddl2          v29.8h,  v2.16b,  v3.16b
        shl             v28.8h,  v28.8h,  #1            // 2 * (q3 + q2)
        shl             v29.8h,  v29.8h,  #1
        add             v28.8h,  v28.8h,  v26.8h        // 2*q3 + 3*q2 + q1 + q0 + p0
        add             v29.8h,  v29.8h,  v27.8h
        rshrn           v26.8b,  v28.8h,  #3 // q2'_2
        rshrn2          v26.16b, v29.8h,  #3 // q2'_2

        // Select per lane: weak result where only if_1 holds, strong result
        // where if_1 && if_2 && if_3 (p side) / if_4 (q side) hold.
        bit             v7.16b,  v24.16b, v30.16b  // p0'_1
        bit             v0.16b,  v25.16b, v31.16b  // q0'_1
        bit             v7.16b,  v20.16b, v17.16b  // p0'_2
        bit             v6.16b,  v21.16b, v17.16b  // p1'_2
        bit             v5.16b,  v19.16b, v17.16b  // p2'_2
        bit             v0.16b,  v22.16b, v18.16b  // q0'_2
        bit             v1.16b,  v23.16b, v18.16b  // q1'_2
        bit             v2.16b,  v26.16b, v18.16b  // q2'_2
.endm

// Intra luma deblocking across a horizontal edge, 16 pixels wide.
//
// In:  x0 = address of the first row below the edge (the q0 row)
//      x1 = row stride in bytes
//      w2 = alpha, w3 = beta
// Reads rows p3..q3 and rewrites rows p2..q2 in place.
function ff_h264_v_loop_filter_luma_intra_neon, export=1
        h264_loop_filter_start_intra

        ld1             {v0.16b},  [x0], x1 // q0
        ld1             {v1.16b},  [x0], x1 // q1
        ld1             {v2.16b},  [x0], x1 // q2
        ld1             {v3.16b},  [x0], x1 // q3
        sub             x0,  x0,  x1, lsl #3 // 8 rows back: the p3 row
        ld1             {v4.16b},  [x0], x1 // p3
        ld1             {v5.16b},  [x0], x1 // p2
        ld1             {v6.16b},  [x0], x1 // p1
        ld1             {v7.16b},  [x0]     // p0 (no post-increment: x0 stays on the p0 row)

        h264_loop_filter_luma_intra

        sub             x0,  x0,  x1, lsl #1 // p0 row - 2 rows = p2 row
        st1             {v5.16b}, [x0], x1  // p2
        st1             {v6.16b}, [x0], x1  // p1
        st1             {v7.16b}, [x0], x1  // p0
        st1             {v0.16b}, [x0], x1  // q0
        st1             {v1.16b}, [x0], x1  // q1
        st1             {v2.16b}, [x0]      // q2
9:
        ret
endfunc

// Intra luma deblocking across a vertical edge, 16 rows high.
//
// In:  x0 = address of the first pixel right of the edge in the first row
//      x1 = row stride in bytes
//      w2 = alpha, w3 = beta
// Reads 8 bytes (4 on each side of the edge) from each of 16 rows and
// writes all 8 bytes of every row back.
function ff_h264_h_loop_filter_luma_intra_neon, export=1
        h264_loop_filter_start_intra

        sub             x0,  x0,  #4        // 4 pixels left of the edge
        // Rows 0-7 go to the low halves, rows 8-15 to the high halves.
        ld1             {v4.8b},  [x0], x1
        ld1             {v5.8b},  [x0], x1
        ld1             {v6.8b},  [x0], x1
        ld1             {v7.8b},  [x0], x1
        ld1             {v0.8b},  [x0], x1
        ld1             {v1.8b},  [x0], x1
        ld1             {v2.8b},  [x0], x1
        ld1             {v3.8b},  [x0], x1
        ld1             {v4.d}[1],  [x0], x1
        ld1             {v5.d}[1],  [x0], x1
        ld1             {v6.d}[1],  [x0], x1
        ld1             {v7.d}[1],  [x0], x1
        ld1             {v0.d}[1],  [x0], x1
        ld1             {v1.d}[1],  [x0], x1
        ld1             {v2.d}[1],  [x0], x1
        ld1             {v3.d}[1],  [x0], x1

        // Transpose (macro from neon.S) so that v4..v7 hold the p3..p0
        // columns and v0..v3 the q0..q3 columns expected by the filter macro.
        transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23

        h264_loop_filter_luma_intra

        // Same transpose again to get back to the row layout used for loading.
        transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23

        sub             x0,  x0,  x1, lsl #4 // back to the first row
        st1             {v4.8b},  [x0], x1
        st1             {v5.8b},  [x0], x1
        st1             {v6.8b},  [x0], x1
        st1             {v7.8b},  [x0], x1
        st1             {v0.8b},  [x0], x1
        st1             {v1.8b},  [x0], x1
        st1             {v2.8b},  [x0], x1
        st1             {v3.8b},  [x0], x1
        st1             {v4.d}[1],  [x0], x1
        st1             {v5.d}[1],  [x0], x1
        st1             {v6.d}[1],  [x0], x1
        st1             {v7.d}[1],  [x0], x1
        st1             {v0.d}[1],  [x0], x1
        st1             {v1.d}[1],  [x0], x1
        st1             {v2.d}[1],  [x0], x1
        st1             {v3.d}[1],  [x0], x1
9:
        ret
endfunc

// Normal chroma deblocking of 8 pixel positions across one edge.
//
// In:       v18 = p1, v16 = p0, v0 = q0, v2 = q1   (low 8 bytes used)
//           w2  = alpha, w3 = beta
//           v24.s[0] = four signed clipping bytes; byte i applies to
//                      lanes 2*i and 2*i+1
// Out:      v16 = p0', v0 = q0'
// Clobbers: v4, v22-v26, v28, v30, x8
//
// Branches to label 9 of the invoking function when no lane passes the
// filter condition; v16 and v0 are still unmodified at that point.
//
// NOTE(review): unlike h264_loop_filter_luma, lanes with a negative clipping
// value are not masked out here; for such a lane the smin/smax pair below
// yields delta = -clip.  Confirm that callers never mix negative and
// non-negative clipping bytes for chroma.
.macro  h264_loop_filter_chroma
        dup             v22.8b, w2              // alpha
        dup             v23.8b, w3              // beta
        uxtl            v24.8h, v24.8b          // one clipping byte per 16-bit lane
        uabd            v26.8b, v16.8b, v0.8b   // abs(p0 - q0)
        uabd            v28.8b, v18.8b, v16.8b  // abs(p1 - p0)
        uabd            v30.8b, v2.8b,  v0.8b   // abs(q1 - q0)
        cmhi            v26.8b, v22.8b, v26.8b  // < alpha
        cmhi            v28.8b, v23.8b, v28.8b  // < beta
        cmhi            v30.8b, v23.8b, v30.8b  // < beta
        uxtl            v4.8h,  v0.8b
        and             v26.8b, v26.8b, v28.8b
        usubw           v4.8h,  v4.8h,  v16.8b  // q0 - p0
        and             v26.8b, v26.8b, v30.8b  // v26 = per-lane filter mask
        shl             v4.8h,  v4.8h,  #2      // 4 * (q0 - p0)
        mov             x8,  v26.d[0]
        sli             v24.8h, v24.8h, #8      // clipping byte replicated into both lanes
        uaddw           v4.8h,  v4.8h,  v18.8b  // + p1
        cbz             x8,  9f                 // no lane needs filtering
        usubw           v4.8h,  v4.8h,  v2.8b   // - q1
        rshrn           v4.8b,  v4.8h,  #3      // delta: rounded >> 3, narrowed to 8 bits
        smin            v4.8b,  v4.8b,  v24.8b
        neg             v25.8b, v24.8b
        smax            v4.8b,  v4.8b,  v25.8b  // delta clamped with +clip / -clip
        uxtl            v22.8h, v0.8b
        and             v4.8b,  v4.8b,  v26.8b  // delta = 0 in unfiltered lanes
        uxtl            v28.8h, v16.8b
        saddw           v28.8h, v28.8h, v4.8b   // p0 + delta
        ssubw           v22.8h, v22.8h, v4.8b   // q0 - delta
        sqxtun          v16.8b, v28.8h          // saturate back to unsigned 8 bit
        sqxtun          v0.8b,  v22.8h
.endm

// Normal chroma deblocking across a horizontal edge, 8 pixels wide.
//
// In:  x0 = address of the first row below the edge (the q0 row)
//      x1 = row stride in bytes
//      w2 = alpha, w3 = beta
//      x4 = pointer to the four clipping bytes (see h264_loop_filter_start)
// Reads rows p1..q1 and rewrites rows p0 and q0 in place.
function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub             x0,  x0,  x1, lsl #1    // p1 row
        ld1             {v18.8b}, [x0], x1      // p1
        ld1             {v16.8b}, [x0], x1      // p0
        ld1             {v0.8b},  [x0], x1      // q0
        ld1             {v2.8b},  [x0]          // q1 (no post-increment: x0 stays on the q1 row)

        h264_loop_filter_chroma

        sub             x0,  x0,  x1, lsl #1    // q1 row - 2 rows = p0 row
        st1             {v16.8b}, [x0], x1      // p0'
        st1             {v0.8b},  [x0], x1      // q0'
9:
        ret
endfunc

// Normal chroma deblocking across a vertical edge, 8 rows high.
//
// In:  x0 = address of the first pixel right of the edge in the first row
//      x1 = row stride in bytes
//      w2 = alpha, w3 = beta
//      x4 = pointer to the four clipping bytes (see h264_loop_filter_start)
// Reads and rewrites 4 bytes (p1 p0 q0 q1) in each of 8 rows.
//
// The label h_loop_filter_chroma420 is a second entry point used by
// ff_h264_h_loop_filter_chroma422_neon.  It expects x0 already moved 2 bytes
// left of the edge, x1 = stride between the rows to process, w2/w3 =
// alpha/beta and v24.s[0] = the four clipping bytes.
function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub             x0,  x0,  #2            // 2 pixels left of the edge
h_loop_filter_chroma420:
        ld1             {v18.s}[0], [x0], x1
        ld1             {v16.s}[0], [x0], x1
        ld1             {v0.s}[0],  [x0], x1
        ld1             {v2.s}[0],  [x0], x1
        ld1             {v18.s}[1], [x0], x1
        ld1             {v16.s}[1], [x0], x1
        ld1             {v0.s}[1],  [x0], x1
        ld1             {v2.s}[1],  [x0], x1

        // Transpose (macro from neon.S) so that v18/v16/v0/v2 hold the
        // p1/p0/q0/q1 columns expected by the filter macro.
        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31

        h264_loop_filter_chroma

        // Same transpose again to get back to the row layout used for loading.
        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31

        sub             x0,  x0,  x1, lsl #3    // back to the first row
        st1             {v18.s}[0], [x0], x1
        st1             {v16.s}[0], [x0], x1
        st1             {v0.s}[0],  [x0], x1
        st1             {v2.s}[0],  [x0], x1
        st1             {v18.s}[1], [x0], x1
        st1             {v16.s}[1], [x0], x1
        st1             {v0.s}[1],  [x0], x1
        st1             {v2.s}[1],  [x0], x1
9:
        ret
endfunc

// Normal chroma deblocking across a vertical edge, 16 rows high.
//
// In:  x0 = address of the first pixel right of the edge in the first row
//      x1 = row stride in bytes
//      w2 = alpha, w3 = beta
//      x4 = pointer to the four clipping bytes (see h264_loop_filter_start)
//
// Runs the 8-row body at h_loop_filter_chroma420 twice with the stride
// doubled: first on rows 0, 2, ..., 14, then on rows 1, 3, ..., 15.
// Clobbers x5 and x7 in addition to what the 8-row body clobbers.
function ff_h264_h_loop_filter_chroma422_neon, export=1
        h264_loop_filter_start
        add             x5,  x0,  x1            // x5 = second row, kept for pass 2
        sub             x0,  x0,  #2            // 2 pixels left of the edge
        add             x1,  x1,  x1            // process every other row
        mov             x7,  x30                // save the return address across the bl
        bl              h_loop_filter_chroma420 // pass 1: even rows
        mov             x30, x7
        sub             x0,  x5,  #2            // second row, 2 pixels left of the edge
        mov             v24.s[0], w6            // filter macro overwrote v24: reload clipping bytes
        b               h_loop_filter_chroma420 // pass 2: odd rows; its ret returns to our caller
endfunc

// Intra chroma deblocking of 8 pixel positions across one edge.
//
// In:       v18 = p1, v16 = p0, v17 = q0, v19 = q1   (low 8 bytes used)
//           v30 = alpha, v31 = beta (broadcast to every byte)
// Out:      v16 = p0' = (2*p1 + p0 + q1 + 2) >> 2
//           v17 = q0' = (2*q1 + q0 + p1 + 2) >> 2
//           in lanes where |p0-q0| < alpha, |p1-p0| < beta and
//           |q1-q0| < beta; other lanes are left unchanged.
// Clobbers: v4, v6, v20, v22, v24-v28, x2
//
// Branches to label 9 of the invoking function when no lane passes the
// filter condition; v16 and v17 are still unmodified at that point.
.macro h264_loop_filter_chroma_intra
        uabd            v26.8b,  v16.8b,  v17.8b  // abs(p0 - q0)
        uabd            v27.8b,  v18.8b,  v16.8b  // abs(p1 - p0)
        uabd            v28.8b,  v19.8b,  v17.8b  // abs(q1 - q0)
        cmhi            v26.8b,  v30.8b,  v26.8b  // < alpha
        cmhi            v27.8b,  v31.8b,  v27.8b  // < beta
        cmhi            v28.8b,  v31.8b,  v28.8b  // < beta
        and             v26.8b,  v26.8b,  v27.8b
        and             v26.8b,  v26.8b,  v28.8b  // v26 = per-lane filter mask
        mov             x2, v26.d[0]

        ushll           v4.8h,   v18.8b,  #1      // 2 * p1
        ushll           v6.8h,   v19.8b,  #1      // 2 * q1
        cbz             x2, 9f                    // no lane needs filtering
        uaddl           v20.8h,  v16.8b,  v19.8b  // p0 + q1
        uaddl           v22.8h,  v17.8b,  v18.8b  // q0 + p1
        add             v20.8h,  v20.8h,  v4.8h   // 2*p1 + p0 + q1
        add             v22.8h,  v22.8h,  v6.8h   // 2*q1 + q0 + p1
        uqrshrn         v24.8b,  v20.8h,  #2      // rounded >> 2
        uqrshrn         v25.8b,  v22.8h,  #2
        bit             v16.8b,  v24.8b,  v26.8b  // p0' where the mask is set
        bit             v17.8b,  v25.8b,  v26.8b  // q0' where the mask is set
.endm

// Intra deblocking of a horizontal chroma edge, 8-bit samples, 8 columns.
// In:  x0 = first row below the edge (q0), x1 = row stride,
//      w2 / w3 are consumed by h264_loop_filter_start_intra (defined
//      elsewhere in this file; expected to provide alpha in v30 and beta in
//      v31, which is what h264_loop_filter_chroma_intra reads).
// Rewrites the p0 and q0 rows in place.
function ff_h264_v_loop_filter_chroma_intra_neon, export=1
        h264_loop_filter_start_intra

        sub             x0,  x0,  x1, lsl #1      // two rows above the edge
        ld1             {v18.8b}, [x0], x1        // p1
        ld1             {v16.8b}, [x0], x1        // p0
        ld1             {v17.8b}, [x0], x1        // q0
        ld1             {v19.8b}, [x0]            // q1, x0 stays on this row

        h264_loop_filter_chroma_intra

        sub             x0,  x0,  x1, lsl #1      // from the q1 row back to p0
        st1             {v16.8b}, [x0], x1
        st1             {v17.8b}, [x0], x1

9:
        ret
endfunc
518cabdff1aSopenharmony_ci
// Intra deblocking of a vertical chroma edge, 8-bit samples, 4 rows
// (the MBAFF variant covers half the rows of the regular routine below).
// In:  x0 = first sample right of the edge, x1 = row stride,
//      w2 / w3 are consumed by h264_loop_filter_start_intra.
// Each row is read starting two samples left of the edge. Eight bytes are
// loaded per row; after the transpose (macro from neon.S) v18, v16, v17, v19
// hold p1, p0, q0, q1 as consumed by the filter macro.
// Only the p0/q0 byte pair of every row is written back.
function ff_h264_h_loop_filter_chroma_mbaff_intra_neon, export=1
        h264_loop_filter_start_intra

        sub             x4,  x0,  #2              // read pointer: p1 column
        sub             x0,  x0,  #1              // write pointer: p0 column
        ld1             {v18.8b}, [x4], x1
        ld1             {v16.8b}, [x4], x1
        ld1             {v17.8b}, [x4], x1
        ld1             {v19.8b}, [x4], x1

        transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29

        h264_loop_filter_chroma_intra

        st2             {v16.b,v17.b}[0], [x0], x1
        st2             {v16.b,v17.b}[1], [x0], x1
        st2             {v16.b,v17.b}[2], [x0], x1
        st2             {v16.b,v17.b}[3], [x0], x1

9:
        ret
endfunc
541cabdff1aSopenharmony_ci
// Intra deblocking of a vertical chroma edge, 8-bit samples, 8 rows.
// In:  x0 = first sample right of the edge, x1 = row stride,
//      w2 / w3 are consumed by h264_loop_filter_start_intra.
//
// h_loop_filter_chroma420_intra is a second entry point used by the 4:2:2
// wrapper. Contract at that label:
//      x4 = read pointer (two samples left of the edge, first row),
//      x0 = write pointer (one sample left of the edge, first row),
//      x1 = row stride, v30 / v31 = alpha / beta.
// On return x4 has advanced by eight rows. x0 has advanced by eight rows
// only when the stores ran, i.e. not when the filter macro took its early
// exit to label 9.
function ff_h264_h_loop_filter_chroma_intra_neon, export=1
        h264_loop_filter_start_intra

        sub             x4,  x0,  #2
        sub             x0,  x0,  #1
h_loop_filter_chroma420_intra:
        // Rows 0-3 fill the whole 8-byte register; rows 4-7 then replace the
        // upper four bytes, so each register ends up holding two rows.
        ld1             {v18.8b}, [x4], x1
        ld1             {v16.8b}, [x4], x1
        ld1             {v17.8b}, [x4], x1
        ld1             {v19.8b}, [x4], x1
        ld1             {v18.s}[1], [x4], x1
        ld1             {v16.s}[1], [x4], x1
        ld1             {v17.s}[1], [x4], x1
        ld1             {v19.s}[1], [x4], x1

        transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29

        h264_loop_filter_chroma_intra

        // One p0/q0 byte pair per row.
        st2             {v16.b,v17.b}[0], [x0], x1
        st2             {v16.b,v17.b}[1], [x0], x1
        st2             {v16.b,v17.b}[2], [x0], x1
        st2             {v16.b,v17.b}[3], [x0], x1
        st2             {v16.b,v17.b}[4], [x0], x1
        st2             {v16.b,v17.b}[5], [x0], x1
        st2             {v16.b,v17.b}[6], [x0], x1
        st2             {v16.b,v17.b}[7], [x0], x1

9:
        ret
endfunc
573cabdff1aSopenharmony_ci
// Intra deblocking of a vertical chroma edge for 4:2:2 content, 8-bit
// samples: two back-to-back runs of the 8-row routine (rows 0-7, then 8-15).
// In:  x0 = first sample right of the edge, x1 = row stride,
//      w2 / w3 are consumed by h264_loop_filter_start_intra.
// The read pointer x4 is left eight rows further down by the first run, so
// only the write pointer x0 has to be set up again (from x5) for the second.
// x7 holds the return address across the inner call; the callee does not
// touch x5 or x7.
function ff_h264_h_loop_filter_chroma422_intra_neon, export=1
        h264_loop_filter_start_intra
        sub             x4,  x0,  #2              // read pointer, p1 column
        add             x5,  x0,  x1, lsl #3      // x5 = row 8 at the edge
        sub             x0,  x0,  #1              // write pointer, p0 column
        mov             x7,  x30                  // bl overwrites the link register
        bl              h_loop_filter_chroma420_intra
        sub             x0,  x5,  #1              // write pointer for rows 8-15
        mov             x30, x7
        b               h_loop_filter_chroma420_intra // tail call
endfunc
585cabdff1aSopenharmony_ci
// Bi-weighted prediction loop for 16-pixel wide rows, two rows per iteration.
// Macro arguments: macs = multiply-accumulate mnemonic used for the x1 rows
//                  macd = multiply-accumulate mnemonic used for the x0 rows
//                  (umlal or umlsl, chosen by biweight_func from the signs).
// In:  x0 = first input, x1 = second input, x7 = output pointer,
//      x2 = stride (shared by all three pointers), w3 = rows left,
//      w5 / w6 = weight magnitudes for the x0 / x1 rows,
//      v16 = rounding/offset constant in every 16-bit lane,
//      v18 = per-lane shift count for sshl.
// The flags set by subs are still valid at b.ne because nothing in between
// writes the condition flags. The macro ends with ret.
.macro  biweight_16     macs, macd
        dup             v0.16B,  w5
        dup             v1.16B,  w6
        mov             v4.16B,  v16.16B
        mov             v6.16B,  v16.16B
1:      subs            w3,  w3,  #2
        ld1             {v20.16B}, [x0], x2
        \macd           v4.8H,   v0.8B,  v20.8B
        \macd\()2       v6.8H,   v0.16B, v20.16B
        ld1             {v22.16B}, [x1], x2
        \macs           v4.8H,   v1.8B,  v22.8B
        \macs\()2       v6.8H,   v1.16B, v22.16B
        mov             v24.16B, v16.16B
        ld1             {v28.16B}, [x0], x2
        mov             v26.16B, v16.16B
        \macd           v24.8H,  v0.8B,  v28.8B
        \macd\()2       v26.8H,  v0.16B, v28.16B
        ld1             {v30.16B}, [x1], x2
        \macs           v24.8H,  v1.8B,  v30.8B
        \macs\()2       v26.8H,  v1.16B, v30.16B
        sshl            v4.8H,   v4.8H,  v18.8H
        sshl            v6.8H,   v6.8H,  v18.8H
        sqxtun          v4.8B,   v4.8H
        sqxtun2         v4.16B,  v6.8H
        sshl            v24.8H,  v24.8H, v18.8H
        sshl            v26.8H,  v26.8H, v18.8H
        sqxtun          v24.8B,  v24.8H
        sqxtun2         v24.16B, v26.8H
        mov             v6.16B,  v16.16B          // re-arm accumulators for the next pair
        st1             {v4.16B},  [x7], x2
        mov             v4.16B,  v16.16B
        st1             {v24.16B}, [x7], x2
        b.ne            1b
        ret
.endm
621cabdff1aSopenharmony_ci
// Bi-weighted prediction loop for 8-pixel wide rows, two rows per iteration.
// Same register contract as biweight_16: x0 / x1 inputs, x7 output,
// x2 stride, w3 rows left, w5 / w6 weight magnitudes, v16 rounding/offset
// constant, v18 shift count; macs / macd are the accumulate mnemonics for
// the x1 / x0 rows. The macro ends with ret.
.macro  biweight_8      macs, macd
        dup             v0.8B,  w5
        dup             v1.8B,  w6
        mov             v2.16B,  v16.16B
        mov             v20.16B, v16.16B
1:      subs            w3,  w3,  #2
        ld1             {v4.8B}, [x0], x2
        \macd           v2.8H,  v0.8B,  v4.8B
        ld1             {v5.8B}, [x1], x2
        \macs           v2.8H,  v1.8B,  v5.8B
        ld1             {v6.8B}, [x0], x2
        \macd           v20.8H, v0.8B,  v6.8B
        ld1             {v7.8B}, [x1], x2
        \macs           v20.8H, v1.8B,  v7.8B
        sshl            v2.8H,  v2.8H,  v18.8H
        sqxtun          v2.8B,  v2.8H
        sshl            v20.8H, v20.8H, v18.8H
        sqxtun          v4.8B,  v20.8H
        mov             v20.16B, v16.16B          // re-arm accumulators for the next pair
        st1             {v2.8B}, [x7], x2
        mov             v2.16B,  v16.16B
        st1             {v4.8B}, [x7], x2
        b.ne            1b
        ret
.endm
647cabdff1aSopenharmony_ci
// Bi-weighted prediction loop for 4-pixel wide rows. Two rows are packed
// into one 8-byte register, so a full iteration handles four rows.
// Same register contract as biweight_16: x0 / x1 inputs, x7 output,
// x2 stride, w3 rows left, w5 / w6 weight magnitudes, v16 rounding/offset
// constant, v18 shift count; macs / macd are the accumulate mnemonics for
// the x1 / x0 rows.
// When the row counter goes negative after subtracting four, only two rows
// were left: label 2 finishes that pair and returns.
.macro  biweight_4      macs, macd
        dup             v0.8B,  w5
        dup             v1.8B,  w6
        mov             v2.16B, v16.16B
        mov             v20.16B,v16.16B
1:      subs            w3,  w3,  #4
        ld1             {v4.S}[0], [x0], x2
        ld1             {v4.S}[1], [x0], x2
        \macd           v2.8H,  v0.8B,  v4.8B
        ld1             {v5.S}[0], [x1], x2
        ld1             {v5.S}[1], [x1], x2
        \macs           v2.8H,  v1.8B,  v5.8B
        b.lt            2f                        // fewer than four rows were left
        ld1             {v6.S}[0], [x0], x2
        ld1             {v6.S}[1], [x0], x2
        \macd           v20.8H, v0.8B,  v6.8B
        ld1             {v7.S}[0], [x1], x2
        ld1             {v7.S}[1], [x1], x2
        \macs           v20.8H, v1.8B,  v7.8B
        sshl            v2.8H,  v2.8H,  v18.8H
        sqxtun          v2.8B,  v2.8H
        sshl            v20.8H, v20.8H, v18.8H
        sqxtun          v4.8B,  v20.8H
        mov             v20.16B, v16.16B          // re-arm accumulators
        st1             {v2.S}[0], [x7], x2
        st1             {v2.S}[1], [x7], x2
        mov             v2.16B,  v16.16B
        st1             {v4.S}[0], [x7], x2
        st1             {v4.S}[1], [x7], x2
        b.ne            1b
        ret
2:      sshl            v2.8H,  v2.8H,  v18.8H
        sqxtun          v2.8B,  v2.8H
        st1             {v2.S}[0], [x7], x2
        st1             {v2.S}[1], [x7], x2
        ret
.endm
685cabdff1aSopenharmony_ci
// Emits ff_biweight_h264_pixels_<w>_neon.
// Registers on entry, as used below:
//      x0 = first pixel block (read, and also the destination: copied to x7)
//      x1 = second pixel block (read only)
//      x2 = stride, w3 = number of rows
//      w4 = shift (presumably log2 of the weight denominator)
//      w5 = weight for the x0 rows, w6 = weight for the x1 rows
//      w7 = offset
// Setup:
//      v16 = ((w7 + 1) | 1) << w4 in every 16-bit lane (accumulator seed)
//      v18 = bitwise NOT of w4, i.e. -(w4 + 1): sshl by v18 is an arithmetic
//            right shift by w4 + 1
// Dispatch: w8 = (w5 >> 31) ^ (w6 >> 30) using logical shifts. Provided bit
// 30 of w6 equals its sign bit (small-magnitude weights), this yields
//      0: both weights non-negative   -> label 10
//      1: only w5 negative            -> label 20
//      2: both negative               -> label 30
//      3: only w6 negative            -> label 40
// Negative weights are negated and applied with umlsl instead of umlal.
// Every biweight_<w> expansion ends in ret, so the labelled variants never
// fall through into each other.
.macro  biweight_func   w
function ff_biweight_h264_pixels_\w\()_neon, export=1
        lsr             w8,  w5,  #31
        add             w7,  w7,  #1
        eor             w8,  w8,  w6,  lsr #30
        orr             w7,  w7,  #1
        dup             v18.8H,   w4
        lsl             w7,  w7,  w4
        not             v18.16B,  v18.16B
        dup             v16.8H,   w7
        mov             x7,  x0
        cbz             w8,  10f
        subs            w8,  w8,  #1
        b.eq            20f
        subs            w8,  w8,  #1
        b.eq            30f
        b               40f
10:     biweight_\w     umlal, umlal
20:     neg             w5, w5
        biweight_\w     umlal, umlsl
30:     neg             w5, w5
        neg             w6, w6
        biweight_\w     umlsl, umlsl
40:     neg             w6, w6
        biweight_\w     umlsl, umlal
endfunc
.endm
713cabdff1aSopenharmony_ci
714cabdff1aSopenharmony_ci        biweight_func   16
715cabdff1aSopenharmony_ci        biweight_func   8
716cabdff1aSopenharmony_ci        biweight_func   4
717cabdff1aSopenharmony_ci
// Weighted prediction loop for 16-pixel wide rows, two rows per iteration.
// Macro argument: add = mnemonic that combines the offset in v16 with the
//                 product (add / sub, or shadd / shsub for the halving
//                 variants); the operand order is v16 op product.
// In:  x0 = read pointer, x5 = write pointer, x1 = stride, w2 = rows left,
//      w4 = weight magnitude (broadcast as bytes),
//      v16 = offset term in every 16-bit lane,
//      v18 = shift count for srshl (negative values shift right, rounding).
// The flags set by subs are still valid at b.ne because nothing in between
// writes the condition flags. The macro ends with ret.
.macro  weight_16       add
        dup             v0.16B,  w4
1:      subs            w2,  w2,  #2
        ld1             {v20.16B}, [x0], x1
        umull           v4.8H,   v0.8B,  v20.8B
        umull2          v6.8H,   v0.16B, v20.16B
        ld1             {v28.16B}, [x0], x1
        umull           v24.8H,  v0.8B,  v28.8B
        umull2          v26.8H,  v0.16B, v28.16B
        \add            v4.8H,   v16.8H, v4.8H
        srshl           v4.8H,   v4.8H,  v18.8H
        \add            v6.8H,   v16.8H, v6.8H
        srshl           v6.8H,   v6.8H,  v18.8H
        sqxtun          v4.8B,   v4.8H
        sqxtun2         v4.16B,  v6.8H
        \add            v24.8H,  v16.8H, v24.8H
        srshl           v24.8H,  v24.8H, v18.8H
        \add            v26.8H,  v16.8H, v26.8H
        srshl           v26.8H,  v26.8H, v18.8H
        sqxtun          v24.8B,  v24.8H
        sqxtun2         v24.16B, v26.8H
        st1             {v4.16B},  [x5], x1
        st1             {v24.16B}, [x5], x1
        b.ne            1b
        ret
.endm
744cabdff1aSopenharmony_ci
// Weighted prediction loop for 8-pixel wide rows, two rows per iteration.
// Same register contract as weight_16: x0 read pointer, x5 write pointer,
// x1 stride, w2 rows left, w4 weight magnitude, v16 offset term,
// v18 shift count; add is the mnemonic combining v16 with the product.
// The macro ends with ret.
.macro  weight_8        add
        dup             v0.8B,  w4
1:      subs            w2,  w2,  #2
        ld1             {v4.8B}, [x0], x1
        umull           v2.8H,  v0.8B,  v4.8B
        ld1             {v6.8B}, [x0], x1
        umull           v20.8H, v0.8B,  v6.8B
        \add            v2.8H,  v16.8H,  v2.8H
        srshl           v2.8H,  v2.8H,  v18.8H
        sqxtun          v2.8B,  v2.8H
        \add            v20.8H, v16.8H,  v20.8H
        srshl           v20.8H, v20.8H, v18.8H
        sqxtun          v4.8B,  v20.8H
        st1             {v2.8B}, [x5], x1
        st1             {v4.8B}, [x5], x1
        b.ne            1b
        ret
.endm
763cabdff1aSopenharmony_ci
// Weighted prediction loop for 4-pixel wide rows. Two rows are packed into
// one 8-byte register, so a full iteration handles four rows.
// Same register contract as weight_16: x0 read pointer, x5 write pointer,
// x1 stride, w2 rows left, w4 weight magnitude, v16 offset term,
// v18 shift count; add is the mnemonic combining v16 with the product.
// When the row counter goes negative after subtracting four, only two rows
// were left: label 2 finishes that pair and returns.
.macro  weight_4        add
        dup             v0.8B,  w4
1:      subs            w2,  w2,  #4
        ld1             {v4.S}[0], [x0], x1
        ld1             {v4.S}[1], [x0], x1
        umull           v2.8H,  v0.8B,  v4.8B
        b.lt            2f                        // fewer than four rows were left
        ld1             {v6.S}[0], [x0], x1
        ld1             {v6.S}[1], [x0], x1
        umull           v20.8H, v0.8B,  v6.8B
        \add            v2.8H,  v16.8H,  v2.8H
        srshl           v2.8H,  v2.8H,  v18.8H
        sqxtun          v2.8B,  v2.8H
        \add            v20.8H, v16.8H,  v20.8H
        srshl           v20.8H, v20.8H, v18.8H
        sqxtun          v4.8B,  v20.8H
        st1             {v2.S}[0], [x5], x1
        st1             {v2.S}[1], [x5], x1
        st1             {v4.S}[0], [x5], x1
        st1             {v4.S}[1], [x5], x1
        b.ne            1b
        ret
2:      \add            v2.8H,  v16.8H,  v2.8H
        srshl           v2.8H,  v2.8H,  v18.8H
        sqxtun          v2.8B,  v2.8H
        st1             {v2.S}[0], [x5], x1
        st1             {v2.S}[1], [x5], x1
        ret
.endm
793cabdff1aSopenharmony_ci
// Emits ff_weight_h264_pixels_<w>_neon.
// Registers on entry, as used below:
//      x0 = pixel block, filtered in place (copied to x5 as write pointer)
//      x1 = stride, w2 = number of rows
//      w3 = shift (presumably log2 of the weight denominator)
//      w4 = weight, w5 = offset
// Setup: v16 = w5 << w3 in every 16-bit lane.
// Two regimes, selected by the cmp against 1 at the top (lsl, dup and mov
// leave the flags alone, so b.le still sees that comparison):
//   w3 > 1 : v18 = 1 - w3 and the halving forms shadd / shsub are used; the
//            halving supplies one bit of the shift and srshl does the rest.
//   w3 <= 1: v18 = -w3 with plain add / sub.
// A negative weight is negated and handled with the subtracting form.
// Every weight_<w> expansion ends in ret, so the variants never fall through
// into each other; each numeric label 10 is reached only via the b.lt
// directly above it.
.macro  weight_func     w
function ff_weight_h264_pixels_\w\()_neon, export=1
        cmp             w3,  #1
        mov             w6,  #1
        lsl             w5,  w5,  w3
        dup             v16.8H,  w5
        mov             x5,  x0
        b.le            20f
        sub             w6,  w6,  w3
        dup             v18.8H,  w6
        cmp             w4, #0
        b.lt            10f
        weight_\w       shadd
10:     neg             w4,  w4
        weight_\w       shsub
20:     neg             w6,  w3
        dup             v18.8H,  w6
        cmp             w4,  #0
        b.lt            10f
        weight_\w       add
10:     neg             w4,  w4
        weight_\w       sub
endfunc
.endm
818cabdff1aSopenharmony_ci
819cabdff1aSopenharmony_ci        weight_func     16
820cabdff1aSopenharmony_ci        weight_func     8
821cabdff1aSopenharmony_ci        weight_func     4
822cabdff1aSopenharmony_ci
// Common prologue of the 10-bit (non-intra) loop filters.
// In:  w2 = alpha, w3 = beta, x4 = pointer to four tc0 bytes.
// Out: w2, w3 = alpha, beta scaled by 4 (8-bit thresholds to 10-bit range),
//      w6 and v24.s[0] = the four tc0 bytes.
// Clobbers: w8, condition flags.
// Returns from the enclosing function straight away when alpha == 0, when
// beta == 0, or when all four tc0 bytes have their sign bit set.
//
// The ccmp fallback value is 4 (Z set): with alpha == 0 the condition ne is
// false, NZCV is loaded from the immediate, and b.eq must then be taken.
// With an immediate of 0 the alpha == 0 case fell through and ran the whole
// filter for nothing (the filter mask is empty when alpha is 0, so the
// result is unchanged either way).
// None of lsl, mov (to a vector lane) and and modify the flags, so b.eq
// still sees the result of the cmp/ccmp pair.
.macro  h264_loop_filter_start_10
        cmp             w2,  #0
        ldr             w6,  [x4]
        ccmp            w3,  #0,  #4,  ne
        lsl             w2,  w2,  #2
        mov             v24.S[0], w6
        lsl             w3,  w3,  #2
        and             w8,  w6,  w6,  lsl #16
        b.eq            1f
        ands            w8,  w8,  w8,  lsl #8     // N = AND of the four tc0 sign bits
        b.ge            2f
1:
        ret
2:
.endm
838cabdff1aSopenharmony_ci
// Common prologue of the 10-bit intra loop filters.
// In:  w2 = alpha, w3 = beta.
// Out: v30 = alpha * 4 and v31 = beta * 4 in every 16-bit lane; w2 and w3
//      hold the scaled values as well.
// Clobbers: w4.
// Returns from the enclosing function only when alpha and beta are both
// zero (the test is on their bitwise OR).
.macro h264_loop_filter_start_intra_10
        orr             w4,  w2,  w3
        cbnz            w4,  1f
        ret
1:
        lsl             w2,  w2,  #2
        lsl             w3,  w3,  #2
        dup             v30.8h,   w2              // alpha
        dup             v31.8h,   w3              // beta
.endm
849cabdff1aSopenharmony_ci
// Core of the 10-bit (non-intra) chroma deblocking filter, eight samples at
// a time, one sample per 16-bit lane.
// In:  v18 = p1, v16 = p0, v0 = q0, v2 = q1,
//      w2 = alpha, w3 = beta (already scaled by the start macro),
//      v24 = four tc0 bytes in its lowest 32 bits.
// Out: v16 = filtered p0, v0 = filtered q0, both clamped to 0..1023.
// Branches to local label 9 (forward) when no lane passes the alpha/beta
// tests, so the expanding function has to provide a 9: label afterwards.
// Clobbers: x8, x9, flags, v4, v5, v22-v26, v28, v30, v31.
//
// tc0 expansion: the first uxtl widens the bytes to 16 bit, sli copies each
// value into the high byte of its own lane (duplicating it bytewise), and
// the second uxtl widens the low eight bytes again, so every tc0 value ends
// up in two neighbouring lanes.
.macro  h264_loop_filter_chroma_10
        dup             v22.8h,  w2               // alpha
        dup             v23.8h,  w3               // beta
        uxtl            v24.8h,  v24.8b           // tc0

        uabd            v26.8h,  v16.8h,  v0.8h   // abs(p0 - q0)
        uabd            v28.8h,  v18.8h,  v16.8h  // abs(p1 - p0)
        uabd            v30.8h,  v2.8h,   v0.8h   // abs(q1 - q0)
        cmhi            v26.8h,  v22.8h,  v26.8h  // < alpha
        cmhi            v28.8h,  v23.8h,  v28.8h  // < beta
        cmhi            v30.8h,  v23.8h,  v30.8h  // < beta

        and             v26.16b, v26.16b, v28.16b
        mov             v4.16b,  v0.16b
        sub             v4.8h,   v4.8h,   v16.8h  // q0 - p0
        and             v26.16b, v26.16b, v30.16b // v26 = per-lane filter mask
        shl             v4.8h,   v4.8h,   #2      // 4 * (q0 - p0)
        mov             x8, v26.d[0]
        mov             x9, v26.d[1]
        sli             v24.8h,  v24.8h,  #8
        uxtl            v24.8h,  v24.8b
        add             v4.8h,   v4.8h,   v18.8h  // ... + p1
        adds            x8,  x8,  x9              // zero only if both mask halves are zero
        shl             v24.8h,  v24.8h,  #2

        b.eq            9f

        movi            v31.8h, #3                // (tc0 - 1) << (BIT_DEPTH - 8)) + 1
        uqsub           v24.8h,  v24.8h,  v31.8h
        sub             v4.8h,   v4.8h,   v2.8h   // ... - q1
        srshr           v4.8h,   v4.8h,   #3      // rounded >> 3 gives the delta
        smin            v4.8h,   v4.8h,   v24.8h  // clamp delta to [-tc, tc]
        neg             v25.8h,  v24.8h
        smax            v4.8h,   v4.8h,   v25.8h
        and             v4.16b,  v4.16b,  v26.16b // zero the delta where the mask is clear
        add             v16.8h,  v16.8h,  v4.8h
        sub             v0.8h,   v0.8h,   v4.8h

        mvni            v4.8h,   #0xFC, lsl #8    // 1023 for clipping
        movi            v5.8h,   #0
        smin            v0.8h,   v0.8h,   v4.8h
        smin            v16.8h,  v16.8h,  v4.8h
        smax            v0.8h,   v0.8h,   v5.8h
        smax            v16.8h,  v16.8h,  v5.8h
.endm
895cabdff1aSopenharmony_ci
// Deblock a horizontal chroma edge, 10-bit samples, 8 columns.
// In:  x0 = first row below the edge (q0), x1 = row stride in bytes,
//      w2 = alpha, w3 = beta, x4 = tc0 pointer (see
//      h264_loop_filter_start_10).
// Two pointers walk the rows above (x0) and below (x10) the edge.
// Rewrites the p0 and q0 rows in place.
function ff_h264_v_loop_filter_chroma_neon_10, export=1
        h264_loop_filter_start_10

        mov             x10,  x0
        sub             x0,  x0,  x1, lsl #1      // two rows above the edge
        ld1             {v18.8h}, [x0 ], x1       // p1
        ld1             {v0.8h},  [x10], x1       // q0
        ld1             {v16.8h}, [x0 ], x1       // p0
        ld1             {v2.8h},  [x10]           // q1, x10 stays on this row

        h264_loop_filter_chroma_10

        sub             x0,  x10,  x1, lsl #1     // from the q1 row back to p0
        st1             {v16.8h}, [x0], x1
        st1             {v0.8h},  [x0], x1
9:
        ret
endfunc
914cabdff1aSopenharmony_ci
// Deblock a vertical chroma edge, 10-bit samples, 8 rows.
// In:  x0 = first sample right of the edge, x1 = row stride in bytes,
//      w2 = alpha, w3 = beta, x4 = tc0 pointer (see
//      h264_loop_filter_start_10).
//
// h_loop_filter_chroma420_10 is a second entry point used by the 4:2:2
// wrapper. Contract at that label:
//      x0 = two samples left of the edge in the first row,
//      x1 = row stride, w2 / w3 = scaled alpha / beta, v24.s[0] = tc0 bytes.
// Rows 0-3 are loaded into the low halves and rows 4-7 into the high halves
// of v18, v16, v0, v2. The transpose macro (from neon.S) turns them into
// the p1, p0, q0, q1 vectors expected by the filter macro, and the second
// transpose restores row order for the stores.
function ff_h264_h_loop_filter_chroma_neon_10, export=1
        h264_loop_filter_start_10

        sub             x0,  x0,  #4 // access the 2nd left pixel
h_loop_filter_chroma420_10:
        add             x10,  x0,  x1,  lsl #2    // x10 = row 4
        ld1             {v18.d}[0], [x0 ], x1
        ld1             {v18.d}[1], [x10], x1
        ld1             {v16.d}[0], [x0 ], x1
        ld1             {v16.d}[1], [x10], x1
        ld1             {v0.d}[0],  [x0 ], x1
        ld1             {v0.d}[1],  [x10], x1
        ld1             {v2.d}[0],  [x0 ], x1
        ld1             {v2.d}[1],  [x10], x1

        transpose_4x8H  v18, v16, v0, v2, v28, v29, v30, v31

        h264_loop_filter_chroma_10

        transpose_4x8H  v18, v16, v0, v2, v28, v29, v30, v31

        sub             x0,  x10,  x1, lsl #3     // x10 is now at row 8, go back to row 0
        st1             {v18.d}[0], [x0], x1
        st1             {v16.d}[0], [x0], x1
        st1             {v0.d}[0],  [x0], x1
        st1             {v2.d}[0],  [x0], x1
        st1             {v18.d}[1], [x0], x1
        st1             {v16.d}[1], [x0], x1
        st1             {v0.d}[1],  [x0], x1
        st1             {v2.d}[1],  [x0], x1
9:
        ret
endfunc
948cabdff1aSopenharmony_ci
// Deblock a vertical chroma edge for 4:2:2 content, 10-bit samples.
// In:  x0 = first sample right of the edge, x1 = row stride in bytes,
//      w2 = alpha, w3 = beta, x4 = tc0 pointer (see
//      h264_loop_filter_start_10).
// The rows are processed as two passes of the 8-row routine using a doubled
// stride: first the rows starting at x0, then the rows starting one
// original stride further down (x5).
// The callee does not touch x5, x7 or w6. v24 is consumed by the filter
// macro, so its tc0 word is reloaded from w6 before the second pass.
function ff_h264_h_loop_filter_chroma422_neon_10, export=1
        h264_loop_filter_start_10
        add             x5,  x0,  x1              // x5 = second row, start of pass two
        sub             x0,  x0,  #4              // two 16-bit samples left of the edge
        add             x1,  x1,  x1              // every other row
        mov             x7,  x30                  // bl overwrites the link register
        bl              h_loop_filter_chroma420_10
        mov             x30, x7
        sub             x0,  x5,  #4
        mov             v24.s[0], w6              // re-seed the tc0 word for pass two
        b               h_loop_filter_chroma420_10 // tail call
endfunc
961cabdff1aSopenharmony_ci
// Core of the 10-bit intra chroma deblocking filter, eight samples at a
// time, one sample per 16-bit lane.
// In:  v18 = p1, v16 = p0, v17 = q0, v19 = q1,
//      v30 = alpha, v31 = beta (broadcast, already scaled).
// Out: v16 = filtered p0, v17 = filtered q0; lanes that fail the
//      alpha/beta tests keep their input value.
// Branches to local label 9 (forward) when no lane needs filtering, so the
// expanding function has to provide a 9: label after the macro.
// Clobbers: x2, x3, flags, v4, v6, v20, v22, v24-v28.
.macro h264_loop_filter_chroma_intra_10
        uabd            v26.8h,  v16.8h,  v17.8h  // abs(p0 - q0)
        uabd            v27.8h,  v18.8h,  v16.8h  // abs(p1 - p0)
        uabd            v28.8h,  v19.8h,  v17.8h  // abs(q1 - q0)
        cmhi            v26.8h,  v30.8h,  v26.8h  // < alpha
        cmhi            v27.8h,  v31.8h,  v27.8h  // < beta
        cmhi            v28.8h,  v31.8h,  v28.8h  // < beta
        and             v26.16b, v26.16b, v27.16b
        and             v26.16b, v26.16b, v28.16b // v26 = per-lane filter mask
        mov             x2, v26.d[0]
        mov             x3, v26.d[1]

        shl             v4.8h,  v18.8h,  #1       // 2 * p1
        shl             v6.8h,  v19.8h,  #1       // 2 * q1

        adds            x2,  x2,  x3              // zero only if both mask halves are zero
        b.eq            9f

        add             v20.8h,  v16.8h,  v19.8h  // p0 + q1
        add             v22.8h,  v17.8h,  v18.8h  // q0 + p1
        add             v20.8h,  v20.8h,  v4.8h   // 2*p1 + p0 + q1
        add             v22.8h,  v22.8h,  v6.8h   // 2*q1 + q0 + p1
        urshr           v24.8h,  v20.8h,  #2      // (sum + 2) >> 2
        urshr           v25.8h,  v22.8h,  #2
        bit             v16.16b, v24.16b, v26.16b // insert only where mask is set
        bit             v17.16b, v25.16b, v26.16b
.endm
989cabdff1aSopenharmony_ci
// Intra deblocking of a horizontal chroma edge, 10-bit samples, 8 columns.
// In:  x0 = first row below the edge (q0), x1 = row stride in bytes,
//      w2 = alpha, w3 = beta (see h264_loop_filter_start_intra_10).
// Two pointers walk the rows above (x0) and below (x9) the edge.
// Rewrites the p0 and q0 rows in place.
function ff_h264_v_loop_filter_chroma_intra_neon_10, export=1
        h264_loop_filter_start_intra_10
        mov             x9,  x0
        sub             x0,  x0,  x1, lsl #1      // two rows above the edge
        ld1             {v18.8h}, [x0], x1        // p1
        ld1             {v17.8h}, [x9], x1        // q0
        ld1             {v16.8h}, [x0], x1        // p0
        ld1             {v19.8h}, [x9]            // q1, x9 stays on this row

        h264_loop_filter_chroma_intra_10

        sub             x0,  x9,  x1, lsl #1      // from the q1 row back to p0
        st1             {v16.8h}, [x0], x1
        st1             {v17.8h}, [x0], x1

9:
        ret
endfunc
1008cabdff1aSopenharmony_ci
// ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10
// Filters across a vertical edge over 4 rows. Each row is read starting two
// 16-bit samples to the left of x0, the rows are transposed, and the two
// samples straddling the edge are written back per row.
//   x0     = address of the first sample right of the edge in row 0
//   x1     = byte offset between consecutive rows
//   x2, x3 = consumed by h264_loop_filter_start_intra_10 (defined outside this
//            chunk; presumably the alpha/beta thresholds -- confirm there) and
//            afterwards overwritten by the filter macro
// Overwrites x0, x2, x3, x4, x9 plus the vector registers used by the macros.
1009cabdff1aSopenharmony_cifunction ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10, export=1
1010cabdff1aSopenharmony_ci        h264_loop_filter_start_intra_10
1011cabdff1aSopenharmony_ci
1012cabdff1aSopenharmony_ci        sub             x4,  x0,  #4            // load pointer: 2 samples (4 bytes) left of the edge
1013cabdff1aSopenharmony_ci        sub             x0,  x0,  #2            // store pointer: 1 sample (2 bytes) left of the edge
1014cabdff1aSopenharmony_ci        add             x9,  x4,  x1, lsl #1    // second load pointer, starting at row 2
// NOTE(review): every load fetches 8 halfwords (16 bytes) per row although
// only lanes 0-3 of the results are stored below -- confirm that reading past
// the 4 samples around the edge is acceptable for all callers.
1015cabdff1aSopenharmony_ci        ld1             {v18.8h}, [x4], x1      // v18 = row 0
1016cabdff1aSopenharmony_ci        ld1             {v17.8h}, [x9], x1      // v17 = row 2
1017cabdff1aSopenharmony_ci        ld1             {v16.8h}, [x4], x1      // v16 = row 1
1018cabdff1aSopenharmony_ci        ld1             {v19.8h}, [x9], x1      // v19 = row 3
1019cabdff1aSopenharmony_ci
// Arguments are passed in row order 0,1,2,3; v26-v29 are scratch. The macro
// lives in neon.S (not visible here); presumably it leaves one column per
// register in the order v18, v16, v17, v19 -- confirm against neon.S.
1020cabdff1aSopenharmony_ci        transpose_4x8H v18, v16, v17, v19, v26, v27, v28, v29
1021cabdff1aSopenharmony_ci
// Updates v16 and v17 in the lanes selected by the mask in v26, or branches
// to 9: without storing when x2 + x3 is zero (see the macro's adds/b.eq).
1022cabdff1aSopenharmony_ci        h264_loop_filter_chroma_intra_10
1023cabdff1aSopenharmony_ci
// Lane n of v16/v17 is stored as two adjacent halfwords of row n.
1024cabdff1aSopenharmony_ci        st2             {v16.h,v17.h}[0], [x0], x1
1025cabdff1aSopenharmony_ci        st2             {v16.h,v17.h}[1], [x0], x1
1026cabdff1aSopenharmony_ci        st2             {v16.h,v17.h}[2], [x0], x1
1027cabdff1aSopenharmony_ci        st2             {v16.h,v17.h}[3], [x0], x1
1028cabdff1aSopenharmony_ci
// Early-out target of the b.eq inside h264_loop_filter_chroma_intra_10.
1029cabdff1aSopenharmony_ci9:
1030cabdff1aSopenharmony_ci        ret
1031cabdff1aSopenharmony_ciendfunc
1032cabdff1aSopenharmony_ci
// ff_h264_h_loop_filter_chroma_intra_neon_10
// Filters across a vertical edge over 8 rows. Four 16-bit samples per row are
// read starting two samples to the left of x0, the rows are transposed, and
// the two samples straddling the edge are written back per row.
//   x0     = address of the first sample right of the edge in row 0
//   x1     = byte offset between consecutive rows
//   x2, x3 = consumed by h264_loop_filter_start_intra_10 (defined outside this
//            chunk; presumably the alpha/beta thresholds -- confirm there) and
//            afterwards overwritten by the filter macro
// Overwrites x0, x2, x3, x4, x9 plus the vector registers used by the macros.
1033cabdff1aSopenharmony_cifunction ff_h264_h_loop_filter_chroma_intra_neon_10, export=1
1034cabdff1aSopenharmony_ci        h264_loop_filter_start_intra_10
1035cabdff1aSopenharmony_ci        sub             x4,  x0,  #4            // load pointer: 2 samples (4 bytes) left of the edge
1036cabdff1aSopenharmony_ci        sub             x0,  x0,  #2            // store pointer: 1 sample (2 bytes) left of the edge
// Shared 8-row body, also entered from
// ff_h264_h_loop_filter_chroma422_intra_neon_10 via bl / b.
//   in : x4 = load pointer for row 0, x0 = store pointer for row 0, x1 = row
//        stride; whatever the start macro set up must still be live
//   out: x9 = entry x4 + 8 * x1 (always, the loads precede the early-out);
//        x0 is advanced by 8 * x1 only when the stores are executed
1037cabdff1aSopenharmony_cih_loop_filter_chroma420_intra_10:
1038cabdff1aSopenharmony_ci        add             x9,  x4,  x1, lsl #2    // x9 = load pointer for row 4
// Rows n and n+4 share one register: row n in the low 64 bits, row n+4 in
// the high 64 bits.
1039cabdff1aSopenharmony_ci        ld1             {v18.4h},   [x4], x1    // v18 low  = row 0
1040cabdff1aSopenharmony_ci        ld1             {v18.d}[1], [x9], x1    // v18 high = row 4
1041cabdff1aSopenharmony_ci        ld1             {v16.4h},   [x4], x1    // v16 low  = row 1
1042cabdff1aSopenharmony_ci        ld1             {v16.d}[1], [x9], x1    // v16 high = row 5
1043cabdff1aSopenharmony_ci        ld1             {v17.4h},   [x4], x1    // v17 low  = row 2
1044cabdff1aSopenharmony_ci        ld1             {v17.d}[1], [x9], x1    // v17 high = row 6
1045cabdff1aSopenharmony_ci        ld1             {v19.4h},   [x4], x1    // v19 low  = row 3
1046cabdff1aSopenharmony_ci        ld1             {v19.d}[1], [x9], x1    // v19 high = row 7
1047cabdff1aSopenharmony_ci
// v26-v29 are scratch. The macro lives in neon.S (not visible here);
// presumably it leaves one column per register in the order
// v18, v16, v17, v19 -- confirm against neon.S.
1048cabdff1aSopenharmony_ci        transpose_4x8H v18, v16, v17, v19, v26, v27, v28, v29
1049cabdff1aSopenharmony_ci
// Updates v16 and v17 in the lanes selected by the mask in v26, or branches
// to 9: without storing when x2 + x3 is zero (see the macro's adds/b.eq).
1050cabdff1aSopenharmony_ci        h264_loop_filter_chroma_intra_10
1051cabdff1aSopenharmony_ci
// Lane n of v16/v17 is stored as two adjacent halfwords of row n.
1052cabdff1aSopenharmony_ci        st2             {v16.h,v17.h}[0], [x0], x1
1053cabdff1aSopenharmony_ci        st2             {v16.h,v17.h}[1], [x0], x1
1054cabdff1aSopenharmony_ci        st2             {v16.h,v17.h}[2], [x0], x1
1055cabdff1aSopenharmony_ci        st2             {v16.h,v17.h}[3], [x0], x1
1056cabdff1aSopenharmony_ci        st2             {v16.h,v17.h}[4], [x0], x1
1057cabdff1aSopenharmony_ci        st2             {v16.h,v17.h}[5], [x0], x1
1058cabdff1aSopenharmony_ci        st2             {v16.h,v17.h}[6], [x0], x1
1059cabdff1aSopenharmony_ci        st2             {v16.h,v17.h}[7], [x0], x1
1060cabdff1aSopenharmony_ci
// Early-out target of the b.eq inside h264_loop_filter_chroma_intra_10.
// Returns through x30, so it also serves callers that entered at the label.
1061cabdff1aSopenharmony_ci9:
1062cabdff1aSopenharmony_ci        ret
1063cabdff1aSopenharmony_ciendfunc
1064cabdff1aSopenharmony_ci
// ff_h264_h_loop_filter_chroma422_intra_neon_10
// Filters across a vertical edge over 16 rows by running the shared 8-row
// body h_loop_filter_chroma420_intra_10 twice: once for rows 0-7 and once for
// rows 8-15.
//   x0     = address of the first sample right of the edge in row 0
//   x1     = byte offset between consecutive rows
//   x2, x3 = consumed by h264_loop_filter_start_intra_10 (defined outside this
//            chunk; presumably the alpha/beta thresholds -- confirm there)
// Relies on the shared body and the macros it expands leaving x5 and x7
// untouched.
1065cabdff1aSopenharmony_cifunction ff_h264_h_loop_filter_chroma422_intra_neon_10, export=1
1066cabdff1aSopenharmony_ci        h264_loop_filter_start_intra_10
1067cabdff1aSopenharmony_ci        sub             x4,  x0,  #4            // load pointer for row 0 (2 samples left of the edge)
1068cabdff1aSopenharmony_ci        add             x5,  x0,  x1, lsl #3    // x5 = edge address in row 8, kept for the second pass
1069cabdff1aSopenharmony_ci        sub             x0,  x0,  #2            // store pointer for row 0 (1 sample left of the edge)
1070cabdff1aSopenharmony_ci        mov             x7,  x30                // bl below overwrites the link register
1071cabdff1aSopenharmony_ci        bl              h_loop_filter_chroma420_intra_10    // rows 0-7
1072cabdff1aSopenharmony_ci        mov             x4,  x9                 // body leaves x9 = first-pass x4 + 8 * x1
// x0 is rebuilt from x5 instead of being reused: if the first pass took the
// early-out at 9:, its stores never ran and x0 was not advanced.
1073cabdff1aSopenharmony_ci        sub             x0,  x5,  #2
1074cabdff1aSopenharmony_ci        mov             x30, x7                 // restore our caller's return address
// Tail branch: the ret in the shared body returns straight to our caller.
1075cabdff1aSopenharmony_ci        b               h_loop_filter_chroma420_intra_10    // rows 8-15
1076cabdff1aSopenharmony_ciendfunc
1077