/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"

// ff_h264_idct_add_neon
//
// Inverse-transforms one 4x4 block of 16-bit coefficients and adds the
// result to a 4x4 area of 8-bit pixels.
//
// In:    x0 = dst    (4 rows of 4 bytes, consecutive rows are x2 bytes apart)
//        x1 = block  (16 x int16 = 32 bytes)
//        w2 = stride (signed 32-bit, sign-extended to x2 below)
// Out:   dst rows rewritten with the sum, saturated to 0..255.
//        The 32-byte coefficient block is overwritten with zeros.
//        x1 is restored to its entry value before returning.
// Clobb: x0 (left at dst + 4*stride), x2, v0-v7, v16-v19, v30.
//        v4-v7 are also handed to transpose_4x4H as extra operands.
//
// The .L_ label is a file-local alias of the entry point; the block
// dispatchers in this file load it with movrel and reach it through blr.
function ff_h264_idct_add_neon, export=1
.L_ff_h264_idct_add_neon:
        AARCH64_VALID_CALL_TARGET
        ld1             {v0.4H, v1.4H, v2.4H, v3.4H},  [x1]    // four groups of 4 coefficients
        sxtw            x2,     w2
        movi            v30.8H, #0

        // First 1-D pass; the two zero stores clear the block in memory.
        add             v4.4H,  v0.4H,  v2.4H           // c0 + c2
        sshr            v16.4H, v1.4H,  #1
        st1             {v30.8H},    [x1], #16
        sshr            v17.4H, v3.4H,  #1
        st1             {v30.8H},    [x1], #16
        sub             v5.4H,  v0.4H,  v2.4H           // c0 - c2
        sub             v6.4H,  v16.4H, v3.4H           // (c1 >> 1) - c3
        add             v7.4H,  v1.4H,  v17.4H          // c1 + (c3 >> 1)
        add             v0.4H,  v4.4H,  v7.4H
        add             v1.4H,  v5.4H,  v6.4H
        sub             v2.4H,  v5.4H,  v6.4H
        sub             v3.4H,  v4.4H,  v7.4H

        transpose_4x4H  v0, v1, v2, v3, v4, v5, v6, v7

        // Second 1-D pass, interleaved with the loads of the four pixel rows.
        // v18 = row0 | row1, v19 = row3 | row2 (lane order matches v1 below).
        add             v4.4H,  v0.4H,  v2.4H
        ld1             {v18.S}[0], [x0], x2
        sshr            v16.4H,  v3.4H,  #1
        sshr            v17.4H,  v1.4H,  #1
        ld1             {v18.S}[1], [x0], x2
        sub             v5.4H,  v0.4H,  v2.4H
        ld1             {v19.S}[1], [x0], x2
        add             v6.4H,  v16.4H, v1.4H
        ins             v4.D[1],  v5.D[0]               // v4 = (t0 + t2) | (t0 - t2)
        sub             v7.4H,  v17.4H, v3.4H
        ld1             {v19.S}[0], [x0], x2
        ins             v6.D[1],  v7.D[0]               // v6 = (t1 + (t3 >> 1)) | ((t1 >> 1) - t3)
        sub             x0,  x0,  x2, lsl #2            // rewind dst by the 4 rows just read
        add             v0.8H,  v4.8H,  v6.8H           // output rows 0 | 1
        sub             v1.8H,  v4.8H,  v6.8H           // output rows 3 | 2

        srshr           v0.8H,  v0.8H,  #6              // rounding shift: (x + 32) >> 6
        srshr           v1.8H,  v1.8H,  #6

        uaddw           v0.8H,  v0.8H,  v18.8B          // add the zero-extended pixels
        uaddw           v1.8H,  v1.8H,  v19.8B

        sqxtun          v0.8B, v0.8H                    // saturate to unsigned 8 bit
        sqxtun          v1.8B, v1.8H

        st1             {v0.S}[0],  [x0], x2
        st1             {v0.S}[1],  [x0], x2
        st1             {v1.S}[1],  [x0], x2            // row 2 sits in the upper half of v1
        st1             {v1.S}[0],  [x0], x2

        sub             x1,  x1,  #32                   // undo the two post-incremented zero stores
        ret
endfunc

// ff_h264_idct_dc_add_neon
//
// DC-only variant for a 4x4 block: only block[0] is read. It is rounded
// ((x + 32) >> 6) and the same value is added to all 16 pixels.
//
// In:    x0 = dst    (4 rows of 4 bytes, consecutive rows are x2 bytes apart)
//        x1 = block  (only the first int16 is read and then zeroed)
//        w2 = stride (signed 32-bit, sign-extended to x2 below)
// Out:   dst rows rewritten with the sum, saturated to 0..255; block[0] = 0.
// Clobb: x0 (left at dst + 4*stride), x2, w3, v0-v4. x1 is not modified.
function ff_h264_idct_dc_add_neon, export=1
.L_ff_h264_idct_dc_add_neon:
        AARCH64_VALID_CALL_TARGET
        sxtw            x2,  w2
        mov             w3,       #0
        ld1r            {v2.8H},  [x1]                  // broadcast block[0] to all 8 lanes
        strh            w3,       [x1]                  // block[0] = 0
        srshr           v2.8H,  v2.8H,  #6
        ld1             {v0.S}[0],  [x0], x2            // rows 0 and 1
        ld1             {v0.S}[1],  [x0], x2
        uaddw           v3.8H,  v2.8H,  v0.8B
        ld1             {v1.S}[0],  [x0], x2            // rows 2 and 3
        ld1             {v1.S}[1],  [x0], x2
        uaddw           v4.8H,  v2.8H,  v1.8B
        sqxtun          v0.8B,  v3.8H
        sqxtun          v1.8B,  v4.8H
        sub             x0,  x0,  x2, lsl #2            // rewind dst by the 4 rows just read
        st1             {v0.S}[0],  [x0], x2
        st1             {v0.S}[1],  [x0], x2
        st1             {v1.S}[0],  [x0], x2
        st1             {v1.S}[1],  [x0], x2
        ret
endfunc

// ff_h264_idct_add16_neon
//
// Walks 16 consecutive 4x4 blocks and, per block, calls one of the two
// 4x4 routines above depending on its non-zero count and DC value.
//
// In:    x0 = dest
//        x1 = block_offset (int32 per block, added to dest)
//        x2 = block        (32 bytes of int16 coefficients per 4x4 block)
//        w3 = stride
//        x4 = per-block non-zero-count bytes, indexed through scan8
//
// Per block i (nnz = x4[scan8[i]]):
//        nnz == 0                   -> nothing
//        nnz == 1 && block[0] != 0  -> ff_h264_idct_dc_add_neon
//        otherwise                  -> ff_h264_idct_add_neon
//
// Live across the indirect calls: x4-x7, x9, x10, x12-x14. The two 4x4
// routines only write x0-x3 and vector registers and leave x1 at its entry
// value, which is why the loop can step x1 itself.
function ff_h264_idct_add16_neon, export=1
        mov             x12, x30        // keep the return address; blr overwrites x30
        mov             x6,  x0         // dest
        mov             x5,  x1         // block_offset
        mov             x1,  x2         // block
        mov             w9,  w3         // stride
        movrel          x7,  scan8
        mov             x10, #16        // blocks left
        movrel          x13, .L_ff_h264_idct_dc_add_neon
        movrel          x14, .L_ff_h264_idct_add_neon
1:      mov             w2,  w9         // stride argument for the callee
        ldrb            w3,  [x7], #1   // scan8[i]
        ldrsw           x0,  [x5], #4   // block_offset[i]
        ldrb            w3,  [x4,  w3,  uxtw]   // nnz
        subs            w3,  w3,  #1
        b.lt            2f              // nnz == 0: skip the block
        ldrsh           w3,  [x1]       // block[0]
        add             x0,  x0,  x6    // dest + block_offset[i]
        ccmp            w3,  #0,  #4,  eq       // nnz == 1: test block[0]; else force Z
        csel            x15, x13, x14, ne       // ne only for nnz == 1 with non-zero DC
        blr             x15
2:      subs            x10, x10, #1
        add             x1,  x1,  #32   // next 4x4 coefficient block
        b.ne            1b
        ret             x12
endfunc

// ff_h264_idct_add16intra_neon
//
// Same walk over 16 consecutive 4x4 blocks as ff_h264_idct_add16_neon, but
// with a different selection rule.
//
// In:    x0 = dest
//        x1 = block_offset (int32 per block, added to dest)
//        x2 = block        (32 bytes of int16 coefficients per 4x4 block)
//        w3 = stride
//        x4 = per-block non-zero-count bytes, indexed through scan8
//
// Per block i (nnz = x4[scan8[i]]):
//        nnz != 0                   -> ff_h264_idct_add_neon
//        nnz == 0 && block[0] != 0  -> ff_h264_idct_dc_add_neon
//        nnz == 0 && block[0] == 0  -> nothing
//
// Live across the indirect calls: x4-x7, x9, x10, x12-x14. The two 4x4
// routines only write x0-x3 and vector registers and leave x1 at its entry
// value.
function ff_h264_idct_add16intra_neon, export=1
        mov             x12, x30        // keep the return address; blr overwrites x30
        mov             x6,  x0         // dest
        mov             x5,  x1         // block_offset
        mov             x1,  x2         // block
        mov             w9,  w3         // stride
        movrel          x7,  scan8
        mov             x10, #16        // blocks left
        movrel          x13, .L_ff_h264_idct_dc_add_neon
        movrel          x14, .L_ff_h264_idct_add_neon
1:      mov             w2,  w9         // stride argument for the callee
        ldrb            w3,  [x7], #1   // scan8[i]
        ldrsw           x0,  [x5], #4   // block_offset[i]
        ldrb            w3,  [x4,  w3,  uxtw]   // nnz
        add             x0,  x0,  x6    // dest + block_offset[i]
        cmp             w3,  #0
        ldrsh           w3,  [x1]       // block[0]; loads leave the flags alone
        csel            x15, x13, x14, eq       // nnz == 0: DC routine, else full IDCT
        ccmp            w3,  #0,  #0,  eq       // nnz == 0: test block[0]; else force ne
        b.eq            2f              // nnz == 0 and block[0] == 0: skip
        blr             x15
2:      subs            x10, x10, #1
        add             x1,  x1,  #32   // next 4x4 coefficient block
        b.ne            1b
        ret             x12
endfunc

// ff_h264_idct_add8_neon
//
// Processes two groups of four 4x4 blocks, one group per destination
// pointer, using the same selection rule as ff_h264_idct_add16intra_neon.
//
// In:    x0 = pointer to two destination pointers (dest[0], dest[1])
//        x1 = block_offset (int32 per block)
//        x2 = block        (32 bytes of int16 coefficients per 4x4 block)
//        w3 = stride
//        x4 = per-block non-zero-count bytes, indexed through scan8
//
// All three tables are entered at element 16, so the loop index x10 is
// relative to that: it runs 0..3 with dest[0], then jumps to 16..19 with
// dest[1] (absolute block numbers 16..19 and 32..35).
//
// x19/x20 are callee-saved and are spilled to the stack; 0x40 bytes are
// reserved although only 16 are used. Live across the indirect calls:
// x4-x7, x9-x15, x19. The 4x4 routines only write x0-x3 and vector
// registers.
function ff_h264_idct_add8_neon, export=1
        sub             sp,  sp, #0x40
        stp             x19, x20, [sp]
        mov             x12, x30                // keep the return address; blr overwrites x30
        ldp             x6,  x15, [x0]          // dest[0], dest[1]
        add             x5,  x1,  #16*4         // block_offset
        add             x9,  x2,  #16*32        // block
        mov             w19, w3                 // stride
        movrel          x13, .L_ff_h264_idct_dc_add_neon
        movrel          x14, .L_ff_h264_idct_add_neon
        movrel          x7,  scan8, 16
        mov             x10, #0
        mov             x11, #16
1:      mov             w2,  w19
        ldrb            w3,  [x7, x10]          // scan8[i]
        ldrsw           x0,  [x5, x10, lsl #2]  // block_offset[i]
        ldrb            w3,  [x4, w3,  uxtw]    // nnzc[ scan8[i] ]
        add             x0,  x0,  x6            // block_offset[i] + dst[j-1]
        add             x1,  x9,  x10, lsl #5   // block + i * 16
        cmp             w3,  #0
        ldrsh           w3,  [x1]               // block[i*16]
        csel            x20, x13, x14, eq       // nnz == 0: DC routine, else full IDCT
        ccmp            w3,  #0,  #0,  eq       // nnz == 0: test DC; else force ne
        b.eq            2f                      // nnz == 0 and DC == 0: skip
        blr             x20
2:      add             x10, x10, #1
        cmp             x10, #4
        csel            x10, x11, x10, eq     // mov x10, #16
        csel            x6,  x15, x6,  eq     // and switch to dest[1]
        cmp             x10, #20
        b.lt            1b
        ldp             x19, x20, [sp]
        add             sp,  sp,  #0x40
        ret             x12
endfunc

// idct8x8_cols pass
//
// One 1-D pass of the 8-point transform over eight vectors of 8 x int16.
// Scratch registers in both passes: v16, v17, v19, plus the va/vb aliases.
//
// pass == 0
//   In:  v24-v29 = vectors 0-5 (already loaded by the caller)
//        x1      = address of vectors 6 and 7; they are loaded into v30/v31
//                  here and the 32 bytes are overwritten with v19
//        v19     = value stored over those 32 bytes (the caller passes zero)
//   Out: v24, v25, v26, v27, v28, v29, v18, v31 (output 6 is in v18);
//        x1 advanced by 32
//
// pass != 0
//   In:  v24-v29 = vectors 0-5, v18 = vector 6, v31 = vector 7
//   Out: v24-v31 in natural order
.macro  idct8x8_cols    pass
  .if \pass == 0
        va      .req    v18
        vb      .req    v30
        sshr            v18.8H, v26.8H, #1
        add             v16.8H, v24.8H, v28.8H
        ld1             {v30.8H, v31.8H}, [x1]
        st1             {v19.8H}, [x1],  #16
        st1             {v19.8H}, [x1],  #16
        sub             v17.8H,  v24.8H, v28.8H
        sshr            v19.8H,  v30.8H, #1
        sub             v18.8H,  v18.8H,  v30.8H
        add             v19.8H,  v19.8H,  v26.8H
  .else
        va      .req    v30
        vb      .req    v18
        sshr            v30.8H, v26.8H, #1
        sshr            v19.8H, v18.8H, #1
        add             v16.8H, v24.8H, v28.8H
        sub             v17.8H, v24.8H, v28.8H
        sub             v30.8H, v30.8H, v18.8H
        add             v19.8H, v19.8H, v26.8H
  .endif
        // Even half: combine inputs 0, 2, 4, 6.
        add             v26.8H, v17.8H, va.8H
        sub             v28.8H, v17.8H, va.8H
        add             v24.8H, v16.8H, v19.8H
        sub             vb.8H,  v16.8H, v19.8H
        // Odd half: combine inputs 1, 3, 5, 7.
        sub             v16.8H, v29.8H, v27.8H
        add             v17.8H, v31.8H, v25.8H
        sub             va.8H,  v31.8H, v25.8H
        add             v19.8H, v29.8H, v27.8H
        sub             v16.8H, v16.8H, v31.8H
        sub             v17.8H, v17.8H, v27.8H
        add             va.8H,  va.8H,  v29.8H
        add             v19.8H, v19.8H, v25.8H
        sshr            v25.8H, v25.8H, #1
        sshr            v27.8H, v27.8H, #1
        sshr            v29.8H, v29.8H, #1
        sshr            v31.8H, v31.8H, #1
        sub             v16.8H, v16.8H, v31.8H
        sub             v17.8H, v17.8H, v27.8H
        add             va.8H,  va.8H,  v29.8H
        add             v19.8H, v19.8H, v25.8H
        sshr            v25.8H, v16.8H, #2
        sshr            v27.8H, v17.8H, #2
        sshr            v29.8H, va.8H,  #2
        sshr            v31.8H, v19.8H, #2
        sub             v19.8H, v19.8H, v25.8H
        sub             va.8H,  v27.8H, va.8H
        add             v17.8H, v17.8H, v29.8H
        add             v16.8H, v16.8H, v31.8H
        // Final butterflies; the register holding output 6 differs per pass.
  .if \pass == 0
        sub             v31.8H, v24.8H, v19.8H
        add             v24.8H, v24.8H, v19.8H
        add             v25.8H, v26.8H, v18.8H
        sub             v18.8H, v26.8H, v18.8H
        add             v26.8H, v28.8H, v17.8H
        add             v27.8H, v30.8H, v16.8H
        sub             v29.8H, v28.8H, v17.8H
        sub             v28.8H, v30.8H, v16.8H
  .else
        sub             v31.8H, v24.8H, v19.8H
        add             v24.8H, v24.8H, v19.8H
        add             v25.8H, v26.8H, v30.8H
        sub             v30.8H, v26.8H, v30.8H
        add             v26.8H, v28.8H, v17.8H
        sub             v29.8H, v28.8H, v17.8H
        add             v27.8H, v18.8H, v16.8H
        sub             v28.8H, v18.8H, v16.8H
  .endif
        .unreq          va
        .unreq          vb
.endm

// ff_h264_idct8_add_neon
//
// Inverse-transforms one 8x8 block of 16-bit coefficients and adds the
// result to an 8x8 area of 8-bit pixels.
//
// In:    x0 = dst    (8 rows of 8 bytes, consecutive rows are x2 bytes apart)
//        x1 = block  (64 x int16 = 128 bytes)
//        w2 = stride (signed 32-bit, sign-extended to x2 below)
// Out:   dst rows rewritten with the sum, saturated to 0..255.
//        The 128-byte coefficient block is overwritten with zeros.
//        x1 is restored to its entry value before returning.
// Clobb: x0 (left at dst + 8*stride), x2, x3, v0-v7, v16-v19, v24-v31.
//        v6/v7 are also handed to transpose_8x8H as extra operands.
//
// The .L_ label is a file-local alias of the entry point, reached through
// blr from ff_h264_idct8_add4_neon.
function ff_h264_idct8_add_neon, export=1
.L_ff_h264_idct8_add_neon:
        AARCH64_VALID_CALL_TARGET
        movi            v19.8H,   #0
        sxtw            x2,       w2
        // Load vectors 0-5 and zero them in memory as we go; the last two
        // are loaded and zeroed inside "idct8x8_cols 0".
        ld1             {v24.8H, v25.8H}, [x1]
        st1             {v19.8H},  [x1],   #16
        st1             {v19.8H},  [x1],   #16
        ld1             {v26.8H, v27.8H}, [x1]
        st1             {v19.8H},  [x1],   #16
        st1             {v19.8H},  [x1],   #16
        ld1             {v28.8H, v29.8H}, [x1]
        st1             {v19.8H},  [x1],   #16
        st1             {v19.8H},  [x1],   #16

        idct8x8_cols    0
        // Pass 0 leaves output 6 in v18, hence v18 in the seventh slot.
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v18, v31, v6, v7
        idct8x8_cols    1

        mov             x3,  x0                         // x3 = write pointer, x0 = read pointer
        srshr           v24.8H, v24.8H, #6              // rounding shift: (x + 32) >> 6
        ld1             {v0.8B},     [x0], x2
        srshr           v25.8H, v25.8H, #6
        ld1             {v1.8B},     [x0], x2
        srshr           v26.8H, v26.8H, #6
        ld1             {v2.8B},     [x0], x2
        srshr           v27.8H, v27.8H, #6
        ld1             {v3.8B},     [x0], x2
        srshr           v28.8H, v28.8H, #6
        ld1             {v4.8B},     [x0], x2
        srshr           v29.8H, v29.8H, #6
        ld1             {v5.8B},     [x0], x2
        srshr           v30.8H, v30.8H, #6
        ld1             {v6.8B},     [x0], x2
        srshr           v31.8H, v31.8H, #6
        ld1             {v7.8B},     [x0], x2
        uaddw           v24.8H, v24.8H, v0.8B           // add the zero-extended pixels
        uaddw           v25.8H, v25.8H, v1.8B
        uaddw           v26.8H, v26.8H, v2.8B
        sqxtun          v0.8B,  v24.8H                  // saturate to unsigned 8 bit
        uaddw           v27.8H, v27.8H, v3.8B
        sqxtun          v1.8B,  v25.8H
        uaddw           v28.8H, v28.8H, v4.8B
        sqxtun          v2.8B,  v26.8H
        st1             {v0.8B},     [x3], x2
        uaddw           v29.8H, v29.8H, v5.8B
        sqxtun          v3.8B,  v27.8H
        st1             {v1.8B},     [x3], x2
        uaddw           v30.8H, v30.8H, v6.8B
        sqxtun          v4.8B,  v28.8H
        st1             {v2.8B},     [x3], x2
        uaddw           v31.8H, v31.8H, v7.8B
        sqxtun          v5.8B,  v29.8H
        st1             {v3.8B},     [x3], x2
        sqxtun          v6.8B,  v30.8H
        sqxtun          v7.8B,  v31.8H
        st1             {v4.8B},     [x3], x2
        st1             {v5.8B},     [x3], x2
        st1             {v6.8B},     [x3], x2
        st1             {v7.8B},     [x3], x2

        sub             x1,  x1,  #128                  // undo the eight post-incremented zero stores
        ret
endfunc

// ff_h264_idct8_dc_add_neon
//
// DC-only variant for an 8x8 block: only block[0] is read. It is rounded
// ((x + 32) >> 6) and the same value is added to all 64 pixels.
//
// In:    x0 = dst    (8 rows of 8 bytes, consecutive rows are x2 bytes apart)
//        x1 = block  (only the first int16 is read and then zeroed)
//        w2 = stride (signed 32-bit, sign-extended to x2 below)
// Out:   dst rows rewritten with the sum, saturated to 0..255; block[0] = 0.
// Clobb: x0 (left at dst + 8*stride), x2, w3, v0-v7, v24-v31.
//        x1 is not modified.
function ff_h264_idct8_dc_add_neon, export=1
.L_ff_h264_idct8_dc_add_neon:
        AARCH64_VALID_CALL_TARGET
        mov             w3,       #0
        sxtw            x2,       w2
        ld1r            {v31.8H}, [x1]                  // broadcast block[0] to all 8 lanes
        strh            w3,       [x1]                  // block[0] = 0
        ld1             {v0.8B},  [x0], x2
        srshr           v31.8H, v31.8H, #6
        ld1             {v1.8B},     [x0], x2
        ld1             {v2.8B},     [x0], x2
        uaddw           v24.8H, v31.8H, v0.8B
        ld1             {v3.8B},     [x0], x2
        uaddw           v25.8H, v31.8H, v1.8B
        ld1             {v4.8B},     [x0], x2
        uaddw           v26.8H, v31.8H, v2.8B
        ld1             {v5.8B},     [x0], x2
        uaddw           v27.8H, v31.8H, v3.8B
        ld1             {v6.8B},     [x0], x2
        uaddw           v28.8H, v31.8H, v4.8B
        ld1             {v7.8B},     [x0], x2
        uaddw           v29.8H, v31.8H, v5.8B
        uaddw           v30.8H, v31.8H, v6.8B
        uaddw           v31.8H, v31.8H, v7.8B           // last use of the DC value, so v31 can be reused
        sqxtun          v0.8B,  v24.8H
        sqxtun          v1.8B,  v25.8H
        sqxtun          v2.8B,  v26.8H
        sqxtun          v3.8B,  v27.8H
        sub             x0,  x0,  x2, lsl #3            // rewind dst by the 8 rows just read
        st1             {v0.8B},     [x0], x2
        sqxtun          v4.8B,  v28.8H
        st1             {v1.8B},     [x0], x2
        sqxtun          v5.8B,  v29.8H
        st1             {v2.8B},     [x0], x2
        sqxtun          v6.8B,  v30.8H
        st1             {v3.8B},     [x0], x2
        sqxtun          v7.8B,  v31.8H
        st1             {v4.8B},     [x0], x2
        st1             {v5.8B},     [x0], x2
        st1             {v6.8B},     [x0], x2
        st1             {v7.8B},     [x0], x2
        ret
endfunc

// ff_h264_idct8_add4_neon
//
// Walks four 8x8 blocks and, per block, calls one of the two 8x8 routines
// above. Each step advances scan8 by 4 entries, block_offset by 4 entries
// (16 bytes) and the coefficient pointer by 128 bytes.
//
// In:    x0 = dest
//        x1 = block_offset (int32 entries; every 4th one is used)
//        x2 = block        (128 bytes of int16 coefficients per 8x8 block)
//        w3 = stride
//        x4 = per-block non-zero-count bytes, indexed through scan8
//
// Per block (nnz = x4[scan8[4*k]]):
//        nnz == 0                   -> nothing
//        nnz == 1 && block[0] != 0  -> ff_h264_idct8_dc_add_neon
//        otherwise                  -> ff_h264_idct8_add_neon
//
// Live across the indirect calls: x2, x4-x7, x10, x12-x14. The stride is
// placed in w2 once: both callees only sign-extend it in place, and they
// leave x1 at its entry value.
function ff_h264_idct8_add4_neon, export=1
        mov             x12, x30        // keep the return address; blr overwrites x30
        mov             x6,  x0         // dest
        mov             x5,  x1         // block_offset
        mov             x1,  x2         // block
        mov             w2,  w3         // stride
        movrel          x7,  scan8
        mov             w10, #16        // counts down in steps of 4
        movrel          x13, .L_ff_h264_idct8_dc_add_neon
        movrel          x14, .L_ff_h264_idct8_add_neon
1:      ldrb            w9,  [x7], #4   // scan8[i], i = 0, 4, 8, 12
        ldrsw           x0,  [x5], #16  // block_offset[i]
        ldrb            w9,  [x4, w9, UXTW]     // nnz
        subs            w9,  w9,  #1
        b.lt            2f              // nnz == 0: skip the block
        ldrsh           w11,  [x1]      // block[0]
        add             x0,  x6,  x0    // dest + block_offset[i]
        ccmp            w11, #0,  #4,  eq       // nnz == 1: test block[0]; else force Z
        csel            x15, x13, x14, ne       // ne only for nnz == 1 with non-zero DC
        blr             x15
2:      subs            w10, w10, #4
        add             x1,  x1,  #128  // next 8x8 coefficient block
        b.ne            1b
        ret             x12
endfunc

// scan8: 48 byte-sized indices, one per 4x4 block, used by the dispatchers
// in this file to index the non-zero-count array passed in x4.
// Each entry is written as column + row*8.
//   entries  0-15: read in order by the add16 / add16intra loops; the
//                  idct8_add4 loop reads every 4th one
//   entries 16-47: the add8 loop reads 16-19 and 32-35
const   scan8
        .byte           4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
        .byte           6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
        .byte           4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
        .byte           6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
        .byte           4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
        .byte           6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
        .byte           4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
        .byte           6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
        .byte           4+11*8, 5+11*8, 4+12*8, 5+12*8
        .byte           6+11*8, 7+11*8, 6+12*8, 7+12*8
        .byte           4+13*8, 5+13*8, 4+14*8, 5+14*8
        .byte           6+13*8, 7+13*8, 6+14*8, 7+14*8
endconst