/*
 * VC1 AArch64 NEON optimisations
 *
 * Copyright (c) 2022 Ben Avison <bavison@riscosopen.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
22cabdff1aSopenharmony_ci
23cabdff1aSopenharmony_ci#include "libavutil/aarch64/asm.S"
24cabdff1aSopenharmony_ci
25cabdff1aSopenharmony_ci// VC-1 8x8 inverse transform
26cabdff1aSopenharmony_ci// On entry:
27cabdff1aSopenharmony_ci//   x0 -> array of 16-bit inverse transform coefficients, in column-major order
28cabdff1aSopenharmony_ci// On exit:
29cabdff1aSopenharmony_ci//   array at x0 updated to hold transformed block; also now held in row-major order
30cabdff1aSopenharmony_cifunction ff_vc1_inv_trans_8x8_neon, export=1
31cabdff1aSopenharmony_ci        ld1             {v1.16b, v2.16b}, [x0], #32
32cabdff1aSopenharmony_ci        ld1             {v3.16b, v4.16b}, [x0], #32
33cabdff1aSopenharmony_ci        ld1             {v5.16b, v6.16b}, [x0], #32
34cabdff1aSopenharmony_ci        shl             v1.8h, v1.8h, #2        //         8/2 * src[0]
35cabdff1aSopenharmony_ci        sub             x1, x0, #3*32
36cabdff1aSopenharmony_ci        ld1             {v16.16b, v17.16b}, [x0]
37cabdff1aSopenharmony_ci        shl             v7.8h, v2.8h, #4        //          16 * src[8]
38cabdff1aSopenharmony_ci        shl             v18.8h, v2.8h, #2       //           4 * src[8]
39cabdff1aSopenharmony_ci        shl             v19.8h, v4.8h, #4       //                        16 * src[24]
40cabdff1aSopenharmony_ci        ldr             d0, .Lcoeffs_it8
41cabdff1aSopenharmony_ci        shl             v5.8h, v5.8h, #2        //                                      8/2 * src[32]
42cabdff1aSopenharmony_ci        shl             v20.8h, v6.8h, #4       //                                       16 * src[40]
43cabdff1aSopenharmony_ci        shl             v21.8h, v6.8h, #2       //                                        4 * src[40]
44cabdff1aSopenharmony_ci        shl             v22.8h, v17.8h, #4      //                                                      16 * src[56]
45cabdff1aSopenharmony_ci        ssra            v20.8h, v19.8h, #2      //                         4 * src[24] + 16 * src[40]
46cabdff1aSopenharmony_ci        mul             v23.8h, v3.8h, v0.h[0]  //                       6/2 * src[16]
47cabdff1aSopenharmony_ci        sub             v19.8h, v19.8h, v21.8h  //                        16 * src[24] -  4 * src[40]
48cabdff1aSopenharmony_ci        ssra            v7.8h, v22.8h, #2       //          16 * src[8]                               +  4 * src[56]
49cabdff1aSopenharmony_ci        sub             v18.8h, v22.8h, v18.8h  //        -  4 * src[8]                               + 16 * src[56]
50cabdff1aSopenharmony_ci        shl             v3.8h, v3.8h, #3        //                      16/2 * src[16]
51cabdff1aSopenharmony_ci        mls             v20.8h, v2.8h, v0.h[2]  //        - 15 * src[8] +  4 * src[24] + 16 * src[40]
52cabdff1aSopenharmony_ci        ssra            v1.8h, v1.8h, #1        //        12/2 * src[0]
53cabdff1aSopenharmony_ci        ssra            v5.8h, v5.8h, #1        //                                     12/2 * src[32]
54cabdff1aSopenharmony_ci        mla             v7.8h, v4.8h, v0.h[2]   //          16 * src[8] + 15 * src[24]                +  4 * src[56]
55cabdff1aSopenharmony_ci        shl             v21.8h, v16.8h, #3      //                                                    16/2 * src[48]
56cabdff1aSopenharmony_ci        mls             v19.8h, v2.8h, v0.h[1]  //        -  9 * src[8] + 16 * src[24] -  4 * src[40]
57cabdff1aSopenharmony_ci        sub             v2.8h, v23.8h, v21.8h   // t4/2 =                6/2 * src[16]              - 16/2 * src[48]
58cabdff1aSopenharmony_ci        mla             v18.8h, v4.8h, v0.h[1]  //        -  4 * src[8] +  9 * src[24]                + 16 * src[56]
59cabdff1aSopenharmony_ci        add             v4.8h, v1.8h, v5.8h     // t1/2 = 12/2 * src[0]              + 12/2 * src[32]
60cabdff1aSopenharmony_ci        sub             v1.8h, v1.8h, v5.8h     // t2/2 = 12/2 * src[0]              - 12/2 * src[32]
61cabdff1aSopenharmony_ci        mla             v3.8h, v16.8h, v0.h[0]  // t3/2 =               16/2 * src[16]              +  6/2 * src[48]
62cabdff1aSopenharmony_ci        mla             v7.8h, v6.8h, v0.h[1]   //  t1  =   16 * src[8] + 15 * src[24] +  9 * src[40] +  4 * src[56]
63cabdff1aSopenharmony_ci        add             v5.8h, v1.8h, v2.8h     // t6/2 = t2/2 + t4/2
64cabdff1aSopenharmony_ci        sub             v16.8h, v1.8h, v2.8h    // t7/2 = t2/2 - t4/2
65cabdff1aSopenharmony_ci        mla             v20.8h, v17.8h, v0.h[1] // -t2  = - 15 * src[8] +  4 * src[24] + 16 * src[40] +  9 * src[56]
66cabdff1aSopenharmony_ci        add             v21.8h, v1.8h, v2.8h    // t6/2 = t2/2 + t4/2
67cabdff1aSopenharmony_ci        add             v22.8h, v4.8h, v3.8h    // t5/2 = t1/2 + t3/2
68cabdff1aSopenharmony_ci        mls             v19.8h, v17.8h, v0.h[2] // -t3  = -  9 * src[8] + 16 * src[24] -  4 * src[40] - 15 * src[56]
69cabdff1aSopenharmony_ci        sub             v17.8h, v4.8h, v3.8h    // t8/2 = t1/2 - t3/2
70cabdff1aSopenharmony_ci        add             v23.8h, v4.8h, v3.8h    // t5/2 = t1/2 + t3/2
71cabdff1aSopenharmony_ci        mls             v18.8h, v6.8h, v0.h[2]  // -t4  = -  4 * src[8] +  9 * src[24] - 15 * src[40] + 16 * src[56]
72cabdff1aSopenharmony_ci        sub             v1.8h, v1.8h, v2.8h     // t7/2 = t2/2 - t4/2
73cabdff1aSopenharmony_ci        sub             v2.8h, v4.8h, v3.8h     // t8/2 = t1/2 - t3/2
74cabdff1aSopenharmony_ci        neg             v3.8h, v7.8h            // -t1
75cabdff1aSopenharmony_ci        neg             v4.8h, v20.8h           // +t2
76cabdff1aSopenharmony_ci        neg             v6.8h, v19.8h           // +t3
77cabdff1aSopenharmony_ci        ssra            v22.8h, v7.8h, #1       // (t5 + t1) >> 1
78cabdff1aSopenharmony_ci        ssra            v1.8h, v19.8h, #1       // (t7 - t3) >> 1
79cabdff1aSopenharmony_ci        neg             v7.8h, v18.8h           // +t4
80cabdff1aSopenharmony_ci        ssra            v5.8h, v4.8h, #1        // (t6 + t2) >> 1
81cabdff1aSopenharmony_ci        ssra            v16.8h, v6.8h, #1       // (t7 + t3) >> 1
82cabdff1aSopenharmony_ci        ssra            v2.8h, v18.8h, #1       // (t8 - t4) >> 1
83cabdff1aSopenharmony_ci        ssra            v17.8h, v7.8h, #1       // (t8 + t4) >> 1
84cabdff1aSopenharmony_ci        ssra            v21.8h, v20.8h, #1      // (t6 - t2) >> 1
85cabdff1aSopenharmony_ci        ssra            v23.8h, v3.8h, #1       // (t5 - t1) >> 1
86cabdff1aSopenharmony_ci        srshr           v3.8h, v22.8h, #2       // (t5 + t1 + 4) >> 3
87cabdff1aSopenharmony_ci        srshr           v4.8h, v5.8h, #2        // (t6 + t2 + 4) >> 3
88cabdff1aSopenharmony_ci        srshr           v5.8h, v16.8h, #2       // (t7 + t3 + 4) >> 3
89cabdff1aSopenharmony_ci        srshr           v6.8h, v17.8h, #2       // (t8 + t4 + 4) >> 3
90cabdff1aSopenharmony_ci        srshr           v2.8h, v2.8h, #2        // (t8 - t4 + 4) >> 3
91cabdff1aSopenharmony_ci        srshr           v1.8h, v1.8h, #2        // (t7 - t3 + 4) >> 3
92cabdff1aSopenharmony_ci        srshr           v7.8h, v21.8h, #2       // (t6 - t2 + 4) >> 3
93cabdff1aSopenharmony_ci        srshr           v16.8h, v23.8h, #2      // (t5 - t1 + 4) >> 3
94cabdff1aSopenharmony_ci        trn2            v17.8h, v3.8h, v4.8h
95cabdff1aSopenharmony_ci        trn2            v18.8h, v5.8h, v6.8h
96cabdff1aSopenharmony_ci        trn2            v19.8h, v2.8h, v1.8h
97cabdff1aSopenharmony_ci        trn2            v20.8h, v7.8h, v16.8h
98cabdff1aSopenharmony_ci        trn1            v21.4s, v17.4s, v18.4s
99cabdff1aSopenharmony_ci        trn2            v17.4s, v17.4s, v18.4s
100cabdff1aSopenharmony_ci        trn1            v18.4s, v19.4s, v20.4s
101cabdff1aSopenharmony_ci        trn2            v19.4s, v19.4s, v20.4s
102cabdff1aSopenharmony_ci        trn1            v3.8h, v3.8h, v4.8h
103cabdff1aSopenharmony_ci        trn2            v4.2d, v21.2d, v18.2d
104cabdff1aSopenharmony_ci        trn1            v20.2d, v17.2d, v19.2d
105cabdff1aSopenharmony_ci        trn1            v5.8h, v5.8h, v6.8h
106cabdff1aSopenharmony_ci        trn1            v1.8h, v2.8h, v1.8h
107cabdff1aSopenharmony_ci        trn1            v2.8h, v7.8h, v16.8h
108cabdff1aSopenharmony_ci        trn1            v6.2d, v21.2d, v18.2d
109cabdff1aSopenharmony_ci        trn2            v7.2d, v17.2d, v19.2d
110cabdff1aSopenharmony_ci        shl             v16.8h, v20.8h, #4      //                        16 * src[24]
111cabdff1aSopenharmony_ci        shl             v17.8h, v4.8h, #4       //                                       16 * src[40]
112cabdff1aSopenharmony_ci        trn1            v18.4s, v3.4s, v5.4s
113cabdff1aSopenharmony_ci        trn1            v19.4s, v1.4s, v2.4s
114cabdff1aSopenharmony_ci        shl             v21.8h, v7.8h, #4       //                                                      16 * src[56]
115cabdff1aSopenharmony_ci        shl             v22.8h, v6.8h, #2       //           4 * src[8]
116cabdff1aSopenharmony_ci        shl             v23.8h, v4.8h, #2       //                                        4 * src[40]
117cabdff1aSopenharmony_ci        trn2            v3.4s, v3.4s, v5.4s
118cabdff1aSopenharmony_ci        trn2            v1.4s, v1.4s, v2.4s
119cabdff1aSopenharmony_ci        shl             v2.8h, v6.8h, #4        //          16 * src[8]
120cabdff1aSopenharmony_ci        sub             v5.8h, v16.8h, v23.8h   //                        16 * src[24] -  4 * src[40]
121cabdff1aSopenharmony_ci        ssra            v17.8h, v16.8h, #2      //                         4 * src[24] + 16 * src[40]
122cabdff1aSopenharmony_ci        sub             v16.8h, v21.8h, v22.8h  //        -  4 * src[8]                               + 16 * src[56]
123cabdff1aSopenharmony_ci        trn1            v22.2d, v18.2d, v19.2d
124cabdff1aSopenharmony_ci        trn2            v18.2d, v18.2d, v19.2d
125cabdff1aSopenharmony_ci        trn1            v19.2d, v3.2d, v1.2d
126cabdff1aSopenharmony_ci        ssra            v2.8h, v21.8h, #2       //          16 * src[8]                               +  4 * src[56]
127cabdff1aSopenharmony_ci        mls             v17.8h, v6.8h, v0.h[2]  //        - 15 * src[8] +  4 * src[24] + 16 * src[40]
128cabdff1aSopenharmony_ci        shl             v21.8h, v22.8h, #2      //         8/2 * src[0]
129cabdff1aSopenharmony_ci        shl             v18.8h, v18.8h, #2      //                                      8/2 * src[32]
130cabdff1aSopenharmony_ci        mls             v5.8h, v6.8h, v0.h[1]   //        -  9 * src[8] + 16 * src[24] -  4 * src[40]
131cabdff1aSopenharmony_ci        shl             v6.8h, v19.8h, #3       //                      16/2 * src[16]
132cabdff1aSopenharmony_ci        trn2            v1.2d, v3.2d, v1.2d
133cabdff1aSopenharmony_ci        mla             v16.8h, v20.8h, v0.h[1] //        -  4 * src[8] +  9 * src[24]                + 16 * src[56]
134cabdff1aSopenharmony_ci        ssra            v21.8h, v21.8h, #1      //        12/2 * src[0]
135cabdff1aSopenharmony_ci        ssra            v18.8h, v18.8h, #1      //                                     12/2 * src[32]
136cabdff1aSopenharmony_ci        mul             v3.8h, v19.8h, v0.h[0]  //                       6/2 * src[16]
137cabdff1aSopenharmony_ci        shl             v19.8h, v1.8h, #3       //                                                    16/2 * src[48]
138cabdff1aSopenharmony_ci        mla             v2.8h, v20.8h, v0.h[2]  //          16 * src[8] + 15 * src[24]                +  4 * src[56]
139cabdff1aSopenharmony_ci        add             v20.8h, v21.8h, v18.8h  // t1/2 = 12/2 * src[0]              + 12/2 * src[32]
140cabdff1aSopenharmony_ci        mla             v6.8h, v1.8h, v0.h[0]   // t3/2 =               16/2 * src[16]              +  6/2 * src[48]
141cabdff1aSopenharmony_ci        sub             v1.8h, v21.8h, v18.8h   // t2/2 = 12/2 * src[0]              - 12/2 * src[32]
142cabdff1aSopenharmony_ci        sub             v3.8h, v3.8h, v19.8h    // t4/2 =                6/2 * src[16]              - 16/2 * src[48]
143cabdff1aSopenharmony_ci        mla             v17.8h, v7.8h, v0.h[1]  // -t2  = - 15 * src[8] +  4 * src[24] + 16 * src[40] +  9 * src[56]
144cabdff1aSopenharmony_ci        mls             v5.8h, v7.8h, v0.h[2]   // -t3  = -  9 * src[8] + 16 * src[24] -  4 * src[40] - 15 * src[56]
145cabdff1aSopenharmony_ci        add             v7.8h, v1.8h, v3.8h     // t6/2 = t2/2 + t4/2
146cabdff1aSopenharmony_ci        add             v18.8h, v20.8h, v6.8h   // t5/2 = t1/2 + t3/2
147cabdff1aSopenharmony_ci        mls             v16.8h, v4.8h, v0.h[2]  // -t4  = -  4 * src[8] +  9 * src[24] - 15 * src[40] + 16 * src[56]
148cabdff1aSopenharmony_ci        sub             v19.8h, v1.8h, v3.8h    // t7/2 = t2/2 - t4/2
149cabdff1aSopenharmony_ci        neg             v21.8h, v17.8h          // +t2
150cabdff1aSopenharmony_ci        mla             v2.8h, v4.8h, v0.h[1]   //  t1  =   16 * src[8] + 15 * src[24] +  9 * src[40] +  4 * src[56]
151cabdff1aSopenharmony_ci        sub             v0.8h, v20.8h, v6.8h    // t8/2 = t1/2 - t3/2
152cabdff1aSopenharmony_ci        neg             v4.8h, v5.8h            // +t3
153cabdff1aSopenharmony_ci        sub             v22.8h, v1.8h, v3.8h    // t7/2 = t2/2 - t4/2
154cabdff1aSopenharmony_ci        sub             v23.8h, v20.8h, v6.8h   // t8/2 = t1/2 - t3/2
155cabdff1aSopenharmony_ci        neg             v24.8h, v16.8h          // +t4
156cabdff1aSopenharmony_ci        add             v6.8h, v20.8h, v6.8h    // t5/2 = t1/2 + t3/2
157cabdff1aSopenharmony_ci        add             v1.8h, v1.8h, v3.8h     // t6/2 = t2/2 + t4/2
158cabdff1aSopenharmony_ci        ssra            v7.8h, v21.8h, #1       // (t6 + t2) >> 1
159cabdff1aSopenharmony_ci        neg             v3.8h, v2.8h            // -t1
160cabdff1aSopenharmony_ci        ssra            v18.8h, v2.8h, #1       // (t5 + t1) >> 1
161cabdff1aSopenharmony_ci        ssra            v19.8h, v4.8h, #1       // (t7 + t3) >> 1
162cabdff1aSopenharmony_ci        ssra            v0.8h, v24.8h, #1       // (t8 + t4) >> 1
163cabdff1aSopenharmony_ci        srsra           v23.8h, v16.8h, #1      // (t8 - t4 + 1) >> 1
164cabdff1aSopenharmony_ci        srsra           v22.8h, v5.8h, #1       // (t7 - t3 + 1) >> 1
165cabdff1aSopenharmony_ci        srsra           v1.8h, v17.8h, #1       // (t6 - t2 + 1) >> 1
166cabdff1aSopenharmony_ci        srsra           v6.8h, v3.8h, #1        // (t5 - t1 + 1) >> 1
167cabdff1aSopenharmony_ci        srshr           v2.8h, v18.8h, #6       // (t5 + t1 + 64) >> 7
168cabdff1aSopenharmony_ci        srshr           v3.8h, v7.8h, #6        // (t6 + t2 + 64) >> 7
169cabdff1aSopenharmony_ci        srshr           v4.8h, v19.8h, #6       // (t7 + t3 + 64) >> 7
170cabdff1aSopenharmony_ci        srshr           v5.8h, v0.8h, #6        // (t8 + t4 + 64) >> 7
171cabdff1aSopenharmony_ci        srshr           v16.8h, v23.8h, #6      // (t8 - t4 + 65) >> 7
172cabdff1aSopenharmony_ci        srshr           v17.8h, v22.8h, #6      // (t7 - t3 + 65) >> 7
173cabdff1aSopenharmony_ci        st1             {v2.16b, v3.16b}, [x1], #32
174cabdff1aSopenharmony_ci        srshr           v0.8h, v1.8h, #6        // (t6 - t2 + 65) >> 7
175cabdff1aSopenharmony_ci        srshr           v1.8h, v6.8h, #6        // (t5 - t1 + 65) >> 7
176cabdff1aSopenharmony_ci        st1             {v4.16b, v5.16b}, [x1], #32
177cabdff1aSopenharmony_ci        st1             {v16.16b, v17.16b}, [x1], #32
178cabdff1aSopenharmony_ci        st1             {v0.16b, v1.16b}, [x1]
179cabdff1aSopenharmony_ci        ret
180cabdff1aSopenharmony_ciendfunc
181cabdff1aSopenharmony_ci
182cabdff1aSopenharmony_ci// VC-1 8x4 inverse transform
183cabdff1aSopenharmony_ci// On entry:
184cabdff1aSopenharmony_ci//   x0 -> array of 8-bit samples, in row-major order
185cabdff1aSopenharmony_ci//   x1 = row stride for 8-bit sample array
186cabdff1aSopenharmony_ci//   x2 -> array of 16-bit inverse transform coefficients, in row-major order
187cabdff1aSopenharmony_ci// On exit:
188cabdff1aSopenharmony_ci//   array at x0 updated by saturated addition of (narrowed) transformed block
189cabdff1aSopenharmony_cifunction ff_vc1_inv_trans_8x4_neon, export=1
190cabdff1aSopenharmony_ci        ld1             {v1.8b, v2.8b, v3.8b, v4.8b}, [x2], #32
191cabdff1aSopenharmony_ci        mov             x3, x0
192cabdff1aSopenharmony_ci        ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [x2]
193cabdff1aSopenharmony_ci        ldr             q0, .Lcoeffs_it8        // includes 4-point coefficients in upper half of vector
194cabdff1aSopenharmony_ci        ld1             {v5.8b}, [x0], x1
195cabdff1aSopenharmony_ci        trn2            v6.4h, v1.4h, v3.4h
196cabdff1aSopenharmony_ci        trn2            v7.4h, v2.4h, v4.4h
197cabdff1aSopenharmony_ci        trn1            v1.4h, v1.4h, v3.4h
198cabdff1aSopenharmony_ci        trn1            v2.4h, v2.4h, v4.4h
199cabdff1aSopenharmony_ci        trn2            v3.4h, v16.4h, v18.4h
200cabdff1aSopenharmony_ci        trn2            v4.4h, v17.4h, v19.4h
201cabdff1aSopenharmony_ci        trn1            v16.4h, v16.4h, v18.4h
202cabdff1aSopenharmony_ci        trn1            v17.4h, v17.4h, v19.4h
203cabdff1aSopenharmony_ci        ld1             {v18.8b}, [x0], x1
204cabdff1aSopenharmony_ci        trn1            v19.2s, v6.2s, v3.2s
205cabdff1aSopenharmony_ci        trn2            v3.2s, v6.2s, v3.2s
206cabdff1aSopenharmony_ci        trn1            v6.2s, v7.2s, v4.2s
207cabdff1aSopenharmony_ci        trn2            v4.2s, v7.2s, v4.2s
208cabdff1aSopenharmony_ci        trn1            v7.2s, v1.2s, v16.2s
209cabdff1aSopenharmony_ci        trn1            v20.2s, v2.2s, v17.2s
210cabdff1aSopenharmony_ci        shl             v21.4h, v19.4h, #4      //          16 * src[1]
211cabdff1aSopenharmony_ci        trn2            v1.2s, v1.2s, v16.2s
212cabdff1aSopenharmony_ci        shl             v16.4h, v3.4h, #4       //                        16 * src[3]
213cabdff1aSopenharmony_ci        trn2            v2.2s, v2.2s, v17.2s
214cabdff1aSopenharmony_ci        shl             v17.4h, v6.4h, #4       //                                      16 * src[5]
215cabdff1aSopenharmony_ci        ld1             {v22.8b}, [x0], x1
216cabdff1aSopenharmony_ci        shl             v23.4h, v4.4h, #4       //                                                    16 * src[7]
217cabdff1aSopenharmony_ci        mul             v24.4h, v1.4h, v0.h[0]  //                       6/2 * src[2]
218cabdff1aSopenharmony_ci        ld1             {v25.8b}, [x0]
219cabdff1aSopenharmony_ci        shl             v26.4h, v19.4h, #2      //           4 * src[1]
220cabdff1aSopenharmony_ci        shl             v27.4h, v6.4h, #2       //                                       4 * src[5]
221cabdff1aSopenharmony_ci        ssra            v21.4h, v23.4h, #2      //          16 * src[1]                             +  4 * src[7]
222cabdff1aSopenharmony_ci        ssra            v17.4h, v16.4h, #2      //                         4 * src[3] + 16 * src[5]
223cabdff1aSopenharmony_ci        sub             v23.4h, v23.4h, v26.4h  //        -  4 * src[1]                             + 16 * src[7]
224cabdff1aSopenharmony_ci        sub             v16.4h, v16.4h, v27.4h  //                        16 * src[3] -  4 * src[5]
225cabdff1aSopenharmony_ci        shl             v7.4h, v7.4h, #2        //         8/2 * src[0]
226cabdff1aSopenharmony_ci        shl             v20.4h, v20.4h, #2      //                                     8/2 * src[4]
227cabdff1aSopenharmony_ci        mla             v21.4h, v3.4h, v0.h[2]  //          16 * src[1] + 15 * src[3]               +  4 * src[7]
228cabdff1aSopenharmony_ci        shl             v1.4h, v1.4h, #3        //                      16/2 * src[2]
229cabdff1aSopenharmony_ci        mls             v17.4h, v19.4h, v0.h[2] //        - 15 * src[1] +  4 * src[3] + 16 * src[5]
230cabdff1aSopenharmony_ci        ssra            v7.4h, v7.4h, #1        //        12/2 * src[0]
231cabdff1aSopenharmony_ci        mls             v16.4h, v19.4h, v0.h[1] //        -  9 * src[1] + 16 * src[3] -  4 * src[5]
232cabdff1aSopenharmony_ci        ssra            v20.4h, v20.4h, #1      //                                    12/2 * src[4]
233cabdff1aSopenharmony_ci        mla             v23.4h, v3.4h, v0.h[1]  //        -  4 * src[1] +  9 * src[3]               + 16 * src[7]
234cabdff1aSopenharmony_ci        shl             v3.4h, v2.4h, #3        //                                                  16/2 * src[6]
235cabdff1aSopenharmony_ci        mla             v1.4h, v2.4h, v0.h[0]   // t3/2 =               16/2 * src[2]             +  6/2 * src[6]
236cabdff1aSopenharmony_ci        mla             v21.4h, v6.4h, v0.h[1]  //  t1  =   16 * src[1] + 15 * src[3] +  9 * src[5] +  4 * src[7]
237cabdff1aSopenharmony_ci        mla             v17.4h, v4.4h, v0.h[1]  // -t2  = - 15 * src[1] +  4 * src[3] + 16 * src[5] +  9 * src[7]
238cabdff1aSopenharmony_ci        sub             v2.4h, v24.4h, v3.4h    // t4/2 =                6/2 * src[2]             - 16/2 * src[6]
239cabdff1aSopenharmony_ci        mls             v16.4h, v4.4h, v0.h[2]  // -t3  = -  9 * src[1] + 16 * src[3] -  4 * src[5] - 15 * src[7]
240cabdff1aSopenharmony_ci        add             v3.4h, v7.4h, v20.4h    // t1/2 = 12/2 * src[0]             + 12/2 * src[4]
241cabdff1aSopenharmony_ci        mls             v23.4h, v6.4h, v0.h[2]  // -t4  = -  4 * src[1] +  9 * src[3] - 15 * src[5] + 16 * src[7]
242cabdff1aSopenharmony_ci        sub             v4.4h, v7.4h, v20.4h    // t2/2 = 12/2 * src[0]             - 12/2 * src[4]
243cabdff1aSopenharmony_ci        neg             v6.4h, v21.4h           // -t1
244cabdff1aSopenharmony_ci        add             v7.4h, v3.4h, v1.4h     // t5/2 = t1/2 + t3/2
245cabdff1aSopenharmony_ci        sub             v19.4h, v3.4h, v1.4h    // t8/2 = t1/2 - t3/2
246cabdff1aSopenharmony_ci        add             v20.4h, v4.4h, v2.4h    // t6/2 = t2/2 + t4/2
247cabdff1aSopenharmony_ci        sub             v24.4h, v4.4h, v2.4h    // t7/2 = t2/2 - t4/2
248cabdff1aSopenharmony_ci        add             v26.4h, v3.4h, v1.4h    // t5/2 = t1/2 + t3/2
249cabdff1aSopenharmony_ci        add             v27.4h, v4.4h, v2.4h    // t6/2 = t2/2 + t4/2
250cabdff1aSopenharmony_ci        sub             v2.4h, v4.4h, v2.4h     // t7/2 = t2/2 - t4/2
251cabdff1aSopenharmony_ci        sub             v1.4h, v3.4h, v1.4h     // t8/2 = t1/2 - t3/2
252cabdff1aSopenharmony_ci        neg             v3.4h, v17.4h           // +t2
253cabdff1aSopenharmony_ci        neg             v4.4h, v16.4h           // +t3
254cabdff1aSopenharmony_ci        neg             v28.4h, v23.4h          // +t4
255cabdff1aSopenharmony_ci        ssra            v7.4h, v21.4h, #1       // (t5 + t1) >> 1
256cabdff1aSopenharmony_ci        ssra            v1.4h, v23.4h, #1       // (t8 - t4) >> 1
257cabdff1aSopenharmony_ci        ssra            v20.4h, v3.4h, #1       // (t6 + t2) >> 1
258cabdff1aSopenharmony_ci        ssra            v24.4h, v4.4h, #1       // (t7 + t3) >> 1
259cabdff1aSopenharmony_ci        ssra            v19.4h, v28.4h, #1      // (t8 + t4) >> 1
260cabdff1aSopenharmony_ci        ssra            v2.4h, v16.4h, #1       // (t7 - t3) >> 1
261cabdff1aSopenharmony_ci        ssra            v27.4h, v17.4h, #1      // (t6 - t2) >> 1
262cabdff1aSopenharmony_ci        ssra            v26.4h, v6.4h, #1       // (t5 - t1) >> 1
263cabdff1aSopenharmony_ci        trn1            v1.2d, v7.2d, v1.2d
264cabdff1aSopenharmony_ci        trn1            v2.2d, v20.2d, v2.2d
265cabdff1aSopenharmony_ci        trn1            v3.2d, v24.2d, v27.2d
266cabdff1aSopenharmony_ci        trn1            v4.2d, v19.2d, v26.2d
267cabdff1aSopenharmony_ci        srshr           v1.8h, v1.8h, #2        // (t5 + t1 + 4) >> 3, (t8 - t4 + 4) >> 3
268cabdff1aSopenharmony_ci        srshr           v2.8h, v2.8h, #2        // (t6 + t2 + 4) >> 3, (t7 - t3 + 4) >> 3
269cabdff1aSopenharmony_ci        srshr           v3.8h, v3.8h, #2        // (t7 + t3 + 4) >> 3, (t6 - t2 + 4) >> 3
270cabdff1aSopenharmony_ci        srshr           v4.8h, v4.8h, #2        // (t8 + t4 + 4) >> 3, (t5 - t1 + 4) >> 3
271cabdff1aSopenharmony_ci        trn2            v6.8h, v1.8h, v2.8h
272cabdff1aSopenharmony_ci        trn1            v1.8h, v1.8h, v2.8h
273cabdff1aSopenharmony_ci        trn2            v2.8h, v3.8h, v4.8h
274cabdff1aSopenharmony_ci        trn1            v3.8h, v3.8h, v4.8h
275cabdff1aSopenharmony_ci        trn2            v4.4s, v6.4s, v2.4s
276cabdff1aSopenharmony_ci        trn1            v7.4s, v1.4s, v3.4s
277cabdff1aSopenharmony_ci        trn2            v1.4s, v1.4s, v3.4s
278cabdff1aSopenharmony_ci        mul             v3.8h, v4.8h, v0.h[5]   //                                                           22/2 * src[24]
279cabdff1aSopenharmony_ci        trn1            v2.4s, v6.4s, v2.4s
280cabdff1aSopenharmony_ci        mul             v4.8h, v4.8h, v0.h[4]   //                                                           10/2 * src[24]
281cabdff1aSopenharmony_ci        mul             v6.8h, v7.8h, v0.h[6]   //            17 * src[0]
282cabdff1aSopenharmony_ci        mul             v1.8h, v1.8h, v0.h[6]   //                                            17 * src[16]
283cabdff1aSopenharmony_ci        mls             v3.8h, v2.8h, v0.h[4]   //  t4/2 =                - 10/2 * src[8]                  + 22/2 * src[24]
284cabdff1aSopenharmony_ci        mla             v4.8h, v2.8h, v0.h[5]   //  t3/2 =                  22/2 * src[8]                  + 10/2 * src[24]
285cabdff1aSopenharmony_ci        add             v0.8h, v6.8h, v1.8h     //   t1  =    17 * src[0]                 +   17 * src[16]
286cabdff1aSopenharmony_ci        sub             v1.8h, v6.8h, v1.8h     //   t2  =    17 * src[0]                 -   17 * src[16]
287cabdff1aSopenharmony_ci        neg             v2.8h, v3.8h            // -t4/2
288cabdff1aSopenharmony_ci        neg             v6.8h, v4.8h            // -t3/2
289cabdff1aSopenharmony_ci        ssra            v4.8h, v0.8h, #1        // (t1 + t3) >> 1
290cabdff1aSopenharmony_ci        ssra            v2.8h, v1.8h, #1        // (t2 - t4) >> 1
291cabdff1aSopenharmony_ci        ssra            v3.8h, v1.8h, #1        // (t2 + t4) >> 1
292cabdff1aSopenharmony_ci        ssra            v6.8h, v0.8h, #1        // (t1 - t3) >> 1
293cabdff1aSopenharmony_ci        srshr           v0.8h, v4.8h, #6        // (t1 + t3 + 64) >> 7
294cabdff1aSopenharmony_ci        srshr           v1.8h, v2.8h, #6        // (t2 - t4 + 64) >> 7
295cabdff1aSopenharmony_ci        srshr           v2.8h, v3.8h, #6        // (t2 + t4 + 64) >> 7
296cabdff1aSopenharmony_ci        srshr           v3.8h, v6.8h, #6        // (t1 - t3 + 64) >> 7
297cabdff1aSopenharmony_ci        uaddw           v0.8h, v0.8h, v5.8b
298cabdff1aSopenharmony_ci        uaddw           v1.8h, v1.8h, v18.8b
299cabdff1aSopenharmony_ci        uaddw           v2.8h, v2.8h, v22.8b
300cabdff1aSopenharmony_ci        uaddw           v3.8h, v3.8h, v25.8b
301cabdff1aSopenharmony_ci        sqxtun          v0.8b, v0.8h
302cabdff1aSopenharmony_ci        sqxtun          v1.8b, v1.8h
303cabdff1aSopenharmony_ci        sqxtun          v2.8b, v2.8h
304cabdff1aSopenharmony_ci        sqxtun          v3.8b, v3.8h
305cabdff1aSopenharmony_ci        st1             {v0.8b}, [x3], x1
306cabdff1aSopenharmony_ci        st1             {v1.8b}, [x3], x1
307cabdff1aSopenharmony_ci        st1             {v2.8b}, [x3], x1
308cabdff1aSopenharmony_ci        st1             {v3.8b}, [x3]
309cabdff1aSopenharmony_ci        ret
310cabdff1aSopenharmony_ciendfunc
311cabdff1aSopenharmony_ci
312cabdff1aSopenharmony_ci// VC-1 4x8 inverse transform
313cabdff1aSopenharmony_ci// On entry:
314cabdff1aSopenharmony_ci//   x0 -> array of 8-bit samples, in row-major order
315cabdff1aSopenharmony_ci//   x1 = row stride for 8-bit sample array
316cabdff1aSopenharmony_ci//   x2 -> array of 16-bit inverse transform coefficients, in row-major order (row stride is 8 coefficients)
317cabdff1aSopenharmony_ci// On exit:
318cabdff1aSopenharmony_ci//   array at x0 updated by saturated addition of (narrowed) transformed block
319cabdff1aSopenharmony_cifunction ff_vc1_inv_trans_4x8_neon, export=1
320cabdff1aSopenharmony_ci        mov             x3, #16
321cabdff1aSopenharmony_ci        ldr             q0, .Lcoeffs_it8        // includes 4-point coefficients in upper half of vector
322cabdff1aSopenharmony_ci        mov             x4, x0
323cabdff1aSopenharmony_ci        ld1             {v1.d}[0], [x2], x3     // 00 01 02 03
324cabdff1aSopenharmony_ci        ld1             {v2.d}[0], [x2], x3     // 10 11 12 13
325cabdff1aSopenharmony_ci        ld1             {v3.d}[0], [x2], x3     // 20 21 22 23
326cabdff1aSopenharmony_ci        ld1             {v4.d}[0], [x2], x3     // 30 31 32 33
327cabdff1aSopenharmony_ci        ld1             {v1.d}[1], [x2], x3     // 40 41 42 43
328cabdff1aSopenharmony_ci        ld1             {v2.d}[1], [x2], x3     // 50 51 52 53
329cabdff1aSopenharmony_ci        ld1             {v3.d}[1], [x2], x3     // 60 61 62 63
330cabdff1aSopenharmony_ci        ld1             {v4.d}[1], [x2]         // 70 71 72 73
331cabdff1aSopenharmony_ci        ld1             {v5.s}[0], [x0], x1
332cabdff1aSopenharmony_ci        ld1             {v6.s}[0], [x0], x1
333cabdff1aSopenharmony_ci        ld1             {v7.s}[0], [x0], x1
334cabdff1aSopenharmony_ci        trn2            v16.8h, v1.8h, v2.8h    // 01 11 03 13 41 51 43 53
335cabdff1aSopenharmony_ci        trn1            v1.8h, v1.8h, v2.8h     // 00 10 02 12 40 50 42 52
336cabdff1aSopenharmony_ci        trn2            v2.8h, v3.8h, v4.8h     // 21 31 23 33 61 71 63 73
337cabdff1aSopenharmony_ci        trn1            v3.8h, v3.8h, v4.8h     // 20 30 22 32 60 70 62 72
338cabdff1aSopenharmony_ci        ld1             {v4.s}[0], [x0], x1
339cabdff1aSopenharmony_ci        trn2            v17.4s, v16.4s, v2.4s   // 03 13 23 33 43 53 63 73
340cabdff1aSopenharmony_ci        trn1            v18.4s, v1.4s, v3.4s    // 00 10 20 30 40 50 60 70
341cabdff1aSopenharmony_ci        trn1            v2.4s, v16.4s, v2.4s    // 01 11 21 31 41 51 61 71
342cabdff1aSopenharmony_ci        mul             v16.8h, v17.8h, v0.h[4] //                                                          10/2 * src[3]
343cabdff1aSopenharmony_ci        ld1             {v5.s}[1], [x0], x1
344cabdff1aSopenharmony_ci        mul             v17.8h, v17.8h, v0.h[5] //                                                          22/2 * src[3]
345cabdff1aSopenharmony_ci        ld1             {v6.s}[1], [x0], x1
346cabdff1aSopenharmony_ci        trn2            v1.4s, v1.4s, v3.4s     // 02 12 22 32 42 52 62 72
347cabdff1aSopenharmony_ci        mul             v3.8h, v18.8h, v0.h[6]  //            17 * src[0]
348cabdff1aSopenharmony_ci        ld1             {v7.s}[1], [x0], x1
349cabdff1aSopenharmony_ci        mul             v1.8h, v1.8h, v0.h[6]   //                                            17 * src[2]
350cabdff1aSopenharmony_ci        ld1             {v4.s}[1], [x0]
351cabdff1aSopenharmony_ci        mla             v16.8h, v2.8h, v0.h[5]  //  t3/2 =                  22/2 * src[1]                 + 10/2 * src[3]
352cabdff1aSopenharmony_ci        mls             v17.8h, v2.8h, v0.h[4]  //  t4/2 =                - 10/2 * src[1]                 + 22/2 * src[3]
353cabdff1aSopenharmony_ci        add             v2.8h, v3.8h, v1.8h     //   t1  =    17 * src[0]                 +   17 * src[2]
354cabdff1aSopenharmony_ci        sub             v1.8h, v3.8h, v1.8h     //   t2  =    17 * src[0]                 -   17 * src[2]
355cabdff1aSopenharmony_ci        neg             v3.8h, v16.8h           // -t3/2
356cabdff1aSopenharmony_ci        ssra            v16.8h, v2.8h, #1       // (t1 + t3) >> 1
357cabdff1aSopenharmony_ci        neg             v18.8h, v17.8h          // -t4/2
358cabdff1aSopenharmony_ci        ssra            v17.8h, v1.8h, #1       // (t2 + t4) >> 1
359cabdff1aSopenharmony_ci        ssra            v3.8h, v2.8h, #1        // (t1 - t3) >> 1
360cabdff1aSopenharmony_ci        ssra            v18.8h, v1.8h, #1       // (t2 - t4) >> 1
361cabdff1aSopenharmony_ci        srshr           v1.8h, v16.8h, #2       // (t1 + t3 + 64) >> 3
362cabdff1aSopenharmony_ci        srshr           v2.8h, v17.8h, #2       // (t2 + t4 + 64) >> 3
363cabdff1aSopenharmony_ci        srshr           v3.8h, v3.8h, #2        // (t1 - t3 + 64) >> 3
364cabdff1aSopenharmony_ci        srshr           v16.8h, v18.8h, #2      // (t2 - t4 + 64) >> 3
365cabdff1aSopenharmony_ci        trn2            v17.8h, v2.8h, v3.8h    // 12 13 32 33 52 53 72 73
366cabdff1aSopenharmony_ci        trn2            v18.8h, v1.8h, v16.8h   // 10 11 30 31 50 51 70 71
367cabdff1aSopenharmony_ci        trn1            v1.8h, v1.8h, v16.8h    // 00 01 20 21 40 41 60 61
368cabdff1aSopenharmony_ci        trn1            v2.8h, v2.8h, v3.8h     // 02 03 22 23 42 43 62 63
369cabdff1aSopenharmony_ci        trn1            v3.4s, v18.4s, v17.4s   // 10 11 12 13 50 51 52 53
370cabdff1aSopenharmony_ci        trn2            v16.4s, v18.4s, v17.4s  // 30 31 32 33 70 71 72 73
371cabdff1aSopenharmony_ci        trn1            v17.4s, v1.4s, v2.4s    // 00 01 02 03 40 41 42 43
372cabdff1aSopenharmony_ci        mov             d18, v3.d[1]            // 50 51 52 53
373cabdff1aSopenharmony_ci        shl             v19.4h, v3.4h, #4       //          16 * src[8]
374cabdff1aSopenharmony_ci        mov             d20, v16.d[1]           // 70 71 72 73
375cabdff1aSopenharmony_ci        shl             v21.4h, v16.4h, #4      //                        16 * src[24]
376cabdff1aSopenharmony_ci        mov             d22, v17.d[1]           // 40 41 42 43
377cabdff1aSopenharmony_ci        shl             v23.4h, v3.4h, #2       //           4 * src[8]
378cabdff1aSopenharmony_ci        shl             v24.4h, v18.4h, #4      //                                       16 * src[40]
379cabdff1aSopenharmony_ci        shl             v25.4h, v20.4h, #4      //                                                      16 * src[56]
380cabdff1aSopenharmony_ci        shl             v26.4h, v18.4h, #2      //                                        4 * src[40]
381cabdff1aSopenharmony_ci        trn2            v1.4s, v1.4s, v2.4s     // 20 21 22 23 60 61 62 63
382cabdff1aSopenharmony_ci        ssra            v24.4h, v21.4h, #2      //                         4 * src[24] + 16 * src[40]
383cabdff1aSopenharmony_ci        sub             v2.4h, v25.4h, v23.4h   //        -  4 * src[8]                               + 16 * src[56]
384cabdff1aSopenharmony_ci        shl             v17.4h, v17.4h, #2      //         8/2 * src[0]
385cabdff1aSopenharmony_ci        sub             v21.4h, v21.4h, v26.4h  //                        16 * src[24] -  4 * src[40]
386cabdff1aSopenharmony_ci        shl             v22.4h, v22.4h, #2      //                                      8/2 * src[32]
387cabdff1aSopenharmony_ci        mov             d23, v1.d[1]            // 60 61 62 63
388cabdff1aSopenharmony_ci        ssra            v19.4h, v25.4h, #2      //          16 * src[8]                               +  4 * src[56]
389cabdff1aSopenharmony_ci        mul             v25.4h, v1.4h, v0.h[0]  //                       6/2 * src[16]
390cabdff1aSopenharmony_ci        shl             v1.4h, v1.4h, #3        //                      16/2 * src[16]
391cabdff1aSopenharmony_ci        mls             v24.4h, v3.4h, v0.h[2]  //        - 15 * src[8] +  4 * src[24] + 16 * src[40]
392cabdff1aSopenharmony_ci        ssra            v17.4h, v17.4h, #1      //        12/2 * src[0]
393cabdff1aSopenharmony_ci        mls             v21.4h, v3.4h, v0.h[1]  //        -  9 * src[8] + 16 * src[24] -  4 * src[40]
394cabdff1aSopenharmony_ci        ssra            v22.4h, v22.4h, #1      //                                     12/2 * src[32]
395cabdff1aSopenharmony_ci        mla             v2.4h, v16.4h, v0.h[1]  //        -  4 * src[8] +  9 * src[24]                + 16 * src[56]
396cabdff1aSopenharmony_ci        shl             v3.4h, v23.4h, #3       //                                                    16/2 * src[48]
397cabdff1aSopenharmony_ci        mla             v19.4h, v16.4h, v0.h[2] //          16 * src[8] + 15 * src[24]                +  4 * src[56]
398cabdff1aSopenharmony_ci        mla             v1.4h, v23.4h, v0.h[0]  // t3/2 =               16/2 * src[16]              +  6/2 * src[48]
399cabdff1aSopenharmony_ci        mla             v24.4h, v20.4h, v0.h[1] // -t2  = - 15 * src[8] +  4 * src[24] + 16 * src[40] +  9 * src[56]
400cabdff1aSopenharmony_ci        add             v16.4h, v17.4h, v22.4h  // t1/2 = 12/2 * src[0]              + 12/2 * src[32]
401cabdff1aSopenharmony_ci        sub             v3.4h, v25.4h, v3.4h    // t4/2 =                6/2 * src[16]              - 16/2 * src[48]
402cabdff1aSopenharmony_ci        sub             v17.4h, v17.4h, v22.4h  // t2/2 = 12/2 * src[0]              - 12/2 * src[32]
403cabdff1aSopenharmony_ci        mls             v21.4h, v20.4h, v0.h[2] // -t3  = -  9 * src[8] + 16 * src[24] -  4 * src[40] - 15 * src[56]
404cabdff1aSopenharmony_ci        mla             v19.4h, v18.4h, v0.h[1] //  t1  =   16 * src[8] + 15 * src[24] +  9 * src[40] +  4 * src[56]
405cabdff1aSopenharmony_ci        add             v20.4h, v16.4h, v1.4h   // t5/2 = t1/2 + t3/2
406cabdff1aSopenharmony_ci        mls             v2.4h, v18.4h, v0.h[2]  // -t4  = -  4 * src[8] +  9 * src[24] - 15 * src[40] + 16 * src[56]
407cabdff1aSopenharmony_ci        sub             v0.4h, v16.4h, v1.4h    // t8/2 = t1/2 - t3/2
408cabdff1aSopenharmony_ci        add             v18.4h, v17.4h, v3.4h   // t6/2 = t2/2 + t4/2
409cabdff1aSopenharmony_ci        sub             v22.4h, v17.4h, v3.4h   // t7/2 = t2/2 - t4/2
410cabdff1aSopenharmony_ci        neg             v23.4h, v24.4h          // +t2
411cabdff1aSopenharmony_ci        sub             v25.4h, v17.4h, v3.4h   // t7/2 = t2/2 - t4/2
412cabdff1aSopenharmony_ci        add             v3.4h, v17.4h, v3.4h    // t6/2 = t2/2 + t4/2
413cabdff1aSopenharmony_ci        neg             v17.4h, v21.4h          // +t3
414cabdff1aSopenharmony_ci        sub             v26.4h, v16.4h, v1.4h   // t8/2 = t1/2 - t3/2
415cabdff1aSopenharmony_ci        add             v1.4h, v16.4h, v1.4h    // t5/2 = t1/2 + t3/2
416cabdff1aSopenharmony_ci        neg             v16.4h, v19.4h          // -t1
417cabdff1aSopenharmony_ci        neg             v27.4h, v2.4h           // +t4
418cabdff1aSopenharmony_ci        ssra            v20.4h, v19.4h, #1      // (t5 + t1) >> 1
419cabdff1aSopenharmony_ci        srsra           v0.4h, v2.4h, #1        // (t8 - t4 + 1) >> 1
420cabdff1aSopenharmony_ci        ssra            v18.4h, v23.4h, #1      // (t6 + t2) >> 1
421cabdff1aSopenharmony_ci        srsra           v22.4h, v21.4h, #1      // (t7 - t3 + 1) >> 1
422cabdff1aSopenharmony_ci        ssra            v25.4h, v17.4h, #1      // (t7 + t3) >> 1
423cabdff1aSopenharmony_ci        srsra           v3.4h, v24.4h, #1       // (t6 - t2 + 1) >> 1
424cabdff1aSopenharmony_ci        ssra            v26.4h, v27.4h, #1      // (t8 + t4) >> 1
425cabdff1aSopenharmony_ci        srsra           v1.4h, v16.4h, #1       // (t5 - t1 + 1) >> 1
426cabdff1aSopenharmony_ci        trn1            v0.2d, v20.2d, v0.2d
427cabdff1aSopenharmony_ci        trn1            v2.2d, v18.2d, v22.2d
428cabdff1aSopenharmony_ci        trn1            v3.2d, v25.2d, v3.2d
429cabdff1aSopenharmony_ci        trn1            v1.2d, v26.2d, v1.2d
430cabdff1aSopenharmony_ci        srshr           v0.8h, v0.8h, #6        // (t5 + t1 + 64) >> 7, (t8 - t4 + 65) >> 7
431cabdff1aSopenharmony_ci        srshr           v2.8h, v2.8h, #6        // (t6 + t2 + 64) >> 7, (t7 - t3 + 65) >> 7
432cabdff1aSopenharmony_ci        srshr           v3.8h, v3.8h, #6        // (t7 + t3 + 64) >> 7, (t6 - t2 + 65) >> 7
433cabdff1aSopenharmony_ci        srshr           v1.8h, v1.8h, #6        // (t8 + t4 + 64) >> 7, (t5 - t1 + 65) >> 7
434cabdff1aSopenharmony_ci        uaddw           v0.8h, v0.8h, v5.8b
435cabdff1aSopenharmony_ci        uaddw           v2.8h, v2.8h, v6.8b
436cabdff1aSopenharmony_ci        uaddw           v3.8h, v3.8h, v7.8b
437cabdff1aSopenharmony_ci        uaddw           v1.8h, v1.8h, v4.8b
438cabdff1aSopenharmony_ci        sqxtun          v0.8b, v0.8h
439cabdff1aSopenharmony_ci        sqxtun          v2.8b, v2.8h
440cabdff1aSopenharmony_ci        sqxtun          v3.8b, v3.8h
441cabdff1aSopenharmony_ci        sqxtun          v1.8b, v1.8h
442cabdff1aSopenharmony_ci        st1             {v0.s}[0], [x4], x1
443cabdff1aSopenharmony_ci        st1             {v2.s}[0], [x4], x1
444cabdff1aSopenharmony_ci        st1             {v3.s}[0], [x4], x1
445cabdff1aSopenharmony_ci        st1             {v1.s}[0], [x4], x1
446cabdff1aSopenharmony_ci        st1             {v0.s}[1], [x4], x1
447cabdff1aSopenharmony_ci        st1             {v2.s}[1], [x4], x1
448cabdff1aSopenharmony_ci        st1             {v3.s}[1], [x4], x1
449cabdff1aSopenharmony_ci        st1             {v1.s}[1], [x4]
450cabdff1aSopenharmony_ci        ret
451cabdff1aSopenharmony_ciendfunc
452cabdff1aSopenharmony_ci
453cabdff1aSopenharmony_ci// VC-1 4x4 inverse transform
454cabdff1aSopenharmony_ci// On entry:
455cabdff1aSopenharmony_ci//   x0 -> array of 8-bit samples, in row-major order
456cabdff1aSopenharmony_ci//   x1 = row stride for 8-bit sample array
457cabdff1aSopenharmony_ci//   x2 -> array of 16-bit inverse transform coefficients, in row-major order (row stride is 8 coefficients)
458cabdff1aSopenharmony_ci// On exit:
459cabdff1aSopenharmony_ci//   array at x0 updated by saturated addition of (narrowed) transformed block
// Structure:
//   1. load the 4x4 coefficients and transpose them so each lane holds one row
//   2. first 1-D pass, rounded as (x + 4) >> 3
//   3. transpose back, second 1-D pass, rounded as (x + 64) >> 7
//   4. add to the destination pixels and narrow with unsigned saturation
// Registers written: x0, x2, x3, x4, v0-v7, v16
460cabdff1aSopenharmony_cifunction ff_vc1_inv_trans_4x4_neon, export=1
461cabdff1aSopenharmony_ci        mov             x3, #16                 // coefficient row stride in bytes (8 x 16-bit)
462cabdff1aSopenharmony_ci        ldr             d0, .Lcoeffs_it4        // v0.h[0..2] = 5, 11, 17
463cabdff1aSopenharmony_ci        mov             x4, x0                  // keep start of dest for the stores; x0 is advanced by the loads
464cabdff1aSopenharmony_ci        ld1             {v1.d}[0], [x2], x3     // 00 01 02 03
465cabdff1aSopenharmony_ci        ld1             {v2.d}[0], [x2], x3     // 10 11 12 13
466cabdff1aSopenharmony_ci        ld1             {v3.d}[0], [x2], x3     // 20 21 22 23
467cabdff1aSopenharmony_ci        ld1             {v4.d}[0], [x2]         // 30 31 32 33
468cabdff1aSopenharmony_ci        ld1             {v5.s}[0], [x0], x1     // dest row 0
469cabdff1aSopenharmony_ci        ld1             {v5.s}[1], [x0], x1     // dest row 1
470cabdff1aSopenharmony_ci        ld1             {v6.s}[0], [x0], x1     // dest row 2
471cabdff1aSopenharmony_ci        trn2            v7.4h, v1.4h, v2.4h     // 01 11 03 13
472cabdff1aSopenharmony_ci        trn1            v1.4h, v1.4h, v2.4h     // 00 10 02 12
473cabdff1aSopenharmony_ci        ld1             {v6.s}[1], [x0]         // dest row 3
474cabdff1aSopenharmony_ci        trn2            v2.4h, v3.4h, v4.4h     // 21 31 23 33
475cabdff1aSopenharmony_ci        trn1            v3.4h, v3.4h, v4.4h     // 20 30 22 32
476cabdff1aSopenharmony_ci        trn2            v4.2s, v7.2s, v2.2s     // 03 13 23 33
477cabdff1aSopenharmony_ci        trn1            v16.2s, v1.2s, v3.2s    // 00 10 20 30
478cabdff1aSopenharmony_ci        trn1            v2.2s, v7.2s, v2.2s     // 01 11 21 31
479cabdff1aSopenharmony_ci        trn2            v1.2s, v1.2s, v3.2s     // 02 12 22 32
480cabdff1aSopenharmony_ci        mul             v3.4h, v4.4h, v0.h[0]   //                                                          10/2 * src[3]
481cabdff1aSopenharmony_ci        mul             v4.4h, v4.4h, v0.h[1]   //                                                          22/2 * src[3]
482cabdff1aSopenharmony_ci        mul             v7.4h, v16.4h, v0.h[2]  //            17 * src[0]
483cabdff1aSopenharmony_ci        mul             v1.4h, v1.4h, v0.h[2]   //                                            17 * src[2]
484cabdff1aSopenharmony_ci        mla             v3.4h, v2.4h, v0.h[1]   //  t3/2 =                  22/2 * src[1]                 + 10/2 * src[3]
485cabdff1aSopenharmony_ci        mls             v4.4h, v2.4h, v0.h[0]   //  t4/2 =                - 10/2 * src[1]                 + 22/2 * src[3]
486cabdff1aSopenharmony_ci        add             v2.4h, v7.4h, v1.4h     //   t1  =    17 * src[0]                 +   17 * src[2]
487cabdff1aSopenharmony_ci        sub             v1.4h, v7.4h, v1.4h     //   t2  =    17 * src[0]                 -   17 * src[2]
488cabdff1aSopenharmony_ci        neg             v7.4h, v3.4h            // -t3/2
489cabdff1aSopenharmony_ci        neg             v16.4h, v4.4h           // -t4/2
490cabdff1aSopenharmony_ci        ssra            v3.4h, v2.4h, #1        // (t1 + t3) >> 1
491cabdff1aSopenharmony_ci        ssra            v4.4h, v1.4h, #1        // (t2 + t4) >> 1
492cabdff1aSopenharmony_ci        ssra            v16.4h, v1.4h, #1       // (t2 - t4) >> 1
493cabdff1aSopenharmony_ci        ssra            v7.4h, v2.4h, #1        // (t1 - t3) >> 1
494cabdff1aSopenharmony_ci        srshr           v1.4h, v3.4h, #2        // (t1 + t3 + 4) >> 3  (rounding shift by 2 on the halved sum)
495cabdff1aSopenharmony_ci        srshr           v2.4h, v4.4h, #2        // (t2 + t4 + 4) >> 3
496cabdff1aSopenharmony_ci        srshr           v3.4h, v16.4h, #2       // (t2 - t4 + 4) >> 3
497cabdff1aSopenharmony_ci        srshr           v4.4h, v7.4h, #2        // (t1 - t3 + 4) >> 3
498cabdff1aSopenharmony_ci        trn2            v7.4h, v1.4h, v3.4h     // 10 11 30 31
499cabdff1aSopenharmony_ci        trn1            v1.4h, v1.4h, v3.4h     // 00 01 20 21
500cabdff1aSopenharmony_ci        trn2            v3.4h, v2.4h, v4.4h     // 12 13 32 33
501cabdff1aSopenharmony_ci        trn1            v2.4h, v2.4h, v4.4h     // 02 03 22 23
502cabdff1aSopenharmony_ci        trn2            v4.2s, v7.2s, v3.2s     // 30 31 32 33
503cabdff1aSopenharmony_ci        trn1            v16.2s, v1.2s, v2.2s    // 00 01 02 03
504cabdff1aSopenharmony_ci        trn1            v3.2s, v7.2s, v3.2s     // 10 11 12 13
505cabdff1aSopenharmony_ci        trn2            v1.2s, v1.2s, v2.2s     // 20 21 22 23
506cabdff1aSopenharmony_ci        mul             v2.4h, v4.4h, v0.h[1]   //                                                           22/2 * src[24]
507cabdff1aSopenharmony_ci        mul             v4.4h, v4.4h, v0.h[0]   //                                                           10/2 * src[24]
508cabdff1aSopenharmony_ci        mul             v7.4h, v16.4h, v0.h[2]  //            17 * src[0]
509cabdff1aSopenharmony_ci        mul             v1.4h, v1.4h, v0.h[2]   //                                            17 * src[16]
510cabdff1aSopenharmony_ci        mls             v2.4h, v3.4h, v0.h[0]   //  t4/2 =                - 10/2 * src[8]                  + 22/2 * src[24]
511cabdff1aSopenharmony_ci        mla             v4.4h, v3.4h, v0.h[1]   //  t3/2 =                  22/2 * src[8]                  + 10/2 * src[24]
512cabdff1aSopenharmony_ci        add             v0.4h, v7.4h, v1.4h     //   t1  =    17 * src[0]                 +   17 * src[16]
513cabdff1aSopenharmony_ci        sub             v1.4h, v7.4h, v1.4h     //   t2  =    17 * src[0]                 -   17 * src[16]
514cabdff1aSopenharmony_ci        neg             v3.4h, v2.4h            // -t4/2
515cabdff1aSopenharmony_ci        neg             v7.4h, v4.4h            // -t3/2
516cabdff1aSopenharmony_ci        ssra            v4.4h, v0.4h, #1        // (t1 + t3) >> 1
517cabdff1aSopenharmony_ci        ssra            v3.4h, v1.4h, #1        // (t2 - t4) >> 1
518cabdff1aSopenharmony_ci        ssra            v2.4h, v1.4h, #1        // (t2 + t4) >> 1
519cabdff1aSopenharmony_ci        ssra            v7.4h, v0.4h, #1        // (t1 - t3) >> 1
520cabdff1aSopenharmony_ci        trn1            v0.2d, v4.2d, v3.2d     // pack output rows 0 and 1 into one 128-bit register
521cabdff1aSopenharmony_ci        trn1            v1.2d, v2.2d, v7.2d     // pack output rows 2 and 3 into one 128-bit register
522cabdff1aSopenharmony_ci        srshr           v0.8h, v0.8h, #6        // (t1 + t3 + 64) >> 7, (t2 - t4 + 64) >> 7
523cabdff1aSopenharmony_ci        srshr           v1.8h, v1.8h, #6        // (t2 + t4 + 64) >> 7, (t1 - t3 + 64) >> 7
524cabdff1aSopenharmony_ci        uaddw           v0.8h, v0.8h, v5.8b     // add zero-extended dest rows 0 and 1
525cabdff1aSopenharmony_ci        uaddw           v1.8h, v1.8h, v6.8b     // add zero-extended dest rows 2 and 3
526cabdff1aSopenharmony_ci        sqxtun          v0.8b, v0.8h            // narrow to 8 bits, saturating to 0..255
527cabdff1aSopenharmony_ci        sqxtun          v1.8b, v1.8h
528cabdff1aSopenharmony_ci        st1             {v0.s}[0], [x4], x1     // store row 0
529cabdff1aSopenharmony_ci        st1             {v0.s}[1], [x4], x1     // store row 1
530cabdff1aSopenharmony_ci        st1             {v1.s}[0], [x4], x1     // store row 2
531cabdff1aSopenharmony_ci        st1             {v1.s}[1], [x4]         // store row 3
532cabdff1aSopenharmony_ci        ret
533cabdff1aSopenharmony_ciendfunc
534cabdff1aSopenharmony_ci
535cabdff1aSopenharmony_ci// VC-1 8x8 inverse transform, DC case
536cabdff1aSopenharmony_ci// On entry:
537cabdff1aSopenharmony_ci//   x0 -> array of 8-bit samples, in row-major order
538cabdff1aSopenharmony_ci//   x1 = row stride for 8-bit sample array
539cabdff1aSopenharmony_ci//   x2 -> 16-bit inverse transform DC coefficient
540cabdff1aSopenharmony_ci// On exit:
541cabdff1aSopenharmony_ci//   array at x0 updated by saturated addition of (narrowed) transformed block
// The DC coefficient is scaled in two steps, dc = (3 * dc + 1) >> 1 followed by
// dc = (3 * dc + 16) >> 5, and the result is added to all 8x8 pixels.
// The scalar arithmetic is interleaved with the row loads.
// Registers written: x0, x2, x3, v0-v7, v16
542cabdff1aSopenharmony_cifunction ff_vc1_inv_trans_8x8_dc_neon, export=1
543cabdff1aSopenharmony_ci        ldrsh           w2, [x2]                // dc, sign-extended to 32 bits
544cabdff1aSopenharmony_ci        mov             x3, x0                  // keep start of dest for the stores; x0 is advanced by the loads
545cabdff1aSopenharmony_ci        ld1             {v0.8b}, [x0], x1       // row 0
546cabdff1aSopenharmony_ci        ld1             {v1.8b}, [x0], x1       // row 1
547cabdff1aSopenharmony_ci        ld1             {v2.8b}, [x0], x1       // row 2
548cabdff1aSopenharmony_ci        add             w2, w2, w2, lsl #1      // 3 * dc
549cabdff1aSopenharmony_ci        ld1             {v3.8b}, [x0], x1       // row 3
550cabdff1aSopenharmony_ci        ld1             {v4.8b}, [x0], x1       // row 4
551cabdff1aSopenharmony_ci        add             w2, w2, #1              // 3 * dc + 1
552cabdff1aSopenharmony_ci        ld1             {v5.8b}, [x0], x1       // row 5
553cabdff1aSopenharmony_ci        asr             w2, w2, #1              // dc' = (3 * dc + 1) >> 1
554cabdff1aSopenharmony_ci        ld1             {v6.8b}, [x0], x1       // row 6
555cabdff1aSopenharmony_ci        add             w2, w2, w2, lsl #1      // 3 * dc'
556cabdff1aSopenharmony_ci        ld1             {v7.8b}, [x0]           // row 7
557cabdff1aSopenharmony_ci        add             w0, w2, #16             // 3 * dc' + 16 (x0 is no longer needed as a pointer)
558cabdff1aSopenharmony_ci        asr             w0, w0, #5              // (3 * dc' + 16) >> 5
559cabdff1aSopenharmony_ci        dup             v16.8h, w0              // broadcast the scaled DC to 8 lanes
560cabdff1aSopenharmony_ci        uaddw           v0.8h, v16.8h, v0.8b    // dc + zero-extended pixels, row 0
561cabdff1aSopenharmony_ci        uaddw           v1.8h, v16.8h, v1.8b
562cabdff1aSopenharmony_ci        uaddw           v2.8h, v16.8h, v2.8b
563cabdff1aSopenharmony_ci        uaddw           v3.8h, v16.8h, v3.8b
564cabdff1aSopenharmony_ci        uaddw           v4.8h, v16.8h, v4.8b
565cabdff1aSopenharmony_ci        uaddw           v5.8h, v16.8h, v5.8b
566cabdff1aSopenharmony_ci        sqxtun          v0.8b, v0.8h            // narrow to 8 bits, saturating to 0..255
567cabdff1aSopenharmony_ci        uaddw           v6.8h, v16.8h, v6.8b
568cabdff1aSopenharmony_ci        sqxtun          v1.8b, v1.8h
569cabdff1aSopenharmony_ci        uaddw           v7.8h, v16.8h, v7.8b
570cabdff1aSopenharmony_ci        sqxtun          v2.8b, v2.8h
571cabdff1aSopenharmony_ci        sqxtun          v3.8b, v3.8h
572cabdff1aSopenharmony_ci        sqxtun          v4.8b, v4.8h
573cabdff1aSopenharmony_ci        st1             {v0.8b}, [x3], x1       // store row 0
574cabdff1aSopenharmony_ci        sqxtun          v0.8b, v5.8h            // row 5 result; v0 is free again after the store above
575cabdff1aSopenharmony_ci        st1             {v1.8b}, [x3], x1       // store row 1
576cabdff1aSopenharmony_ci        sqxtun          v1.8b, v6.8h            // row 6 result
577cabdff1aSopenharmony_ci        st1             {v2.8b}, [x3], x1       // store row 2
578cabdff1aSopenharmony_ci        sqxtun          v2.8b, v7.8h            // row 7 result
579cabdff1aSopenharmony_ci        st1             {v3.8b}, [x3], x1       // store row 3
580cabdff1aSopenharmony_ci        st1             {v4.8b}, [x3], x1       // store row 4
581cabdff1aSopenharmony_ci        st1             {v0.8b}, [x3], x1       // store row 5
582cabdff1aSopenharmony_ci        st1             {v1.8b}, [x3], x1       // store row 6
583cabdff1aSopenharmony_ci        st1             {v2.8b}, [x3]           // store row 7
584cabdff1aSopenharmony_ci        ret
585cabdff1aSopenharmony_ciendfunc
586cabdff1aSopenharmony_ci
587cabdff1aSopenharmony_ci// VC-1 8x4 inverse transform, DC case
588cabdff1aSopenharmony_ci// On entry:
589cabdff1aSopenharmony_ci//   x0 -> array of 8-bit samples, in row-major order
590cabdff1aSopenharmony_ci//   x1 = row stride for 8-bit sample array
591cabdff1aSopenharmony_ci//   x2 -> 16-bit inverse transform DC coefficient
592cabdff1aSopenharmony_ci// On exit:
593cabdff1aSopenharmony_ci//   array at x0 updated by saturated addition of (narrowed) transformed block
// The DC coefficient is scaled in two steps, dc = (3 * dc + 1) >> 1 followed by
// dc = (17 * dc + 64) >> 7, and the result is added to 4 rows of 8 pixels.
// Registers written: x0, x2, x3, v0-v4
594cabdff1aSopenharmony_cifunction ff_vc1_inv_trans_8x4_dc_neon, export=1
595cabdff1aSopenharmony_ci        ldrsh           w2, [x2]                // dc, sign-extended to 32 bits
596cabdff1aSopenharmony_ci        mov             x3, x0                  // keep start of dest for the stores; x0 is advanced by the loads
597cabdff1aSopenharmony_ci        ld1             {v0.8b}, [x0], x1       // row 0
598cabdff1aSopenharmony_ci        ld1             {v1.8b}, [x0], x1       // row 1
599cabdff1aSopenharmony_ci        ld1             {v2.8b}, [x0], x1       // row 2
600cabdff1aSopenharmony_ci        add             w2, w2, w2, lsl #1      // 3 * dc
601cabdff1aSopenharmony_ci        ld1             {v3.8b}, [x0]           // row 3
602cabdff1aSopenharmony_ci        add             w0, w2, #1              // 3 * dc + 1 (x0 is no longer needed as a pointer)
603cabdff1aSopenharmony_ci        asr             w0, w0, #1              // dc' = (3 * dc + 1) >> 1
604cabdff1aSopenharmony_ci        add             w0, w0, w0, lsl #4      // 17 * dc'
605cabdff1aSopenharmony_ci        add             w0, w0, #64             // 17 * dc' + 64
606cabdff1aSopenharmony_ci        asr             w0, w0, #7              // (17 * dc' + 64) >> 7
607cabdff1aSopenharmony_ci        dup             v4.8h, w0               // broadcast the scaled DC to 8 lanes
608cabdff1aSopenharmony_ci        uaddw           v0.8h, v4.8h, v0.8b     // dc + zero-extended pixels, row 0
609cabdff1aSopenharmony_ci        uaddw           v1.8h, v4.8h, v1.8b
610cabdff1aSopenharmony_ci        uaddw           v2.8h, v4.8h, v2.8b
611cabdff1aSopenharmony_ci        uaddw           v3.8h, v4.8h, v3.8b
612cabdff1aSopenharmony_ci        sqxtun          v0.8b, v0.8h            // narrow to 8 bits, saturating to 0..255
613cabdff1aSopenharmony_ci        sqxtun          v1.8b, v1.8h
614cabdff1aSopenharmony_ci        sqxtun          v2.8b, v2.8h
615cabdff1aSopenharmony_ci        sqxtun          v3.8b, v3.8h
616cabdff1aSopenharmony_ci        st1             {v0.8b}, [x3], x1       // store row 0
617cabdff1aSopenharmony_ci        st1             {v1.8b}, [x3], x1       // store row 1
618cabdff1aSopenharmony_ci        st1             {v2.8b}, [x3], x1       // store row 2
619cabdff1aSopenharmony_ci        st1             {v3.8b}, [x3]           // store row 3
620cabdff1aSopenharmony_ci        ret
621cabdff1aSopenharmony_ciendfunc
622cabdff1aSopenharmony_ci
623cabdff1aSopenharmony_ci// VC-1 4x8 inverse transform, DC case
624cabdff1aSopenharmony_ci// On entry:
625cabdff1aSopenharmony_ci//   x0 -> array of 8-bit samples, in row-major order
626cabdff1aSopenharmony_ci//   x1 = row stride for 8-bit sample array
627cabdff1aSopenharmony_ci//   x2 -> 16-bit inverse transform DC coefficient
628cabdff1aSopenharmony_ci// On exit:
629cabdff1aSopenharmony_ci//   array at x0 updated by saturated addition of (narrowed) transformed block
// The DC coefficient is scaled in two steps, dc = (17 * dc + 4) >> 3 followed by
// dc = (3 * dc + 16) >> 5, and the result is added to 8 rows of 4 pixels.
// Two 4-pixel rows share each vector: row n in lane s[0], row n + 4 in lane s[1].
// Registers written: x0, x2, x3, v0-v4
630cabdff1aSopenharmony_cifunction ff_vc1_inv_trans_4x8_dc_neon, export=1
631cabdff1aSopenharmony_ci        ldrsh           w2, [x2]                // dc, sign-extended to 32 bits
632cabdff1aSopenharmony_ci        mov             x3, x0                  // keep start of dest for the stores; x0 is advanced by the loads
633cabdff1aSopenharmony_ci        ld1             {v0.s}[0], [x0], x1     // row 0
634cabdff1aSopenharmony_ci        ld1             {v1.s}[0], [x0], x1     // row 1
635cabdff1aSopenharmony_ci        ld1             {v2.s}[0], [x0], x1     // row 2
636cabdff1aSopenharmony_ci        add             w2, w2, w2, lsl #4      // 17 * dc
637cabdff1aSopenharmony_ci        ld1             {v3.s}[0], [x0], x1     // row 3
638cabdff1aSopenharmony_ci        add             w2, w2, #4              // 17 * dc + 4
639cabdff1aSopenharmony_ci        asr             w2, w2, #3              // dc' = (17 * dc + 4) >> 3
640cabdff1aSopenharmony_ci        add             w2, w2, w2, lsl #1      // 3 * dc'
641cabdff1aSopenharmony_ci        ld1             {v0.s}[1], [x0], x1     // row 4
642cabdff1aSopenharmony_ci        add             w2, w2, #16             // 3 * dc' + 16
643cabdff1aSopenharmony_ci        asr             w2, w2, #5              // (3 * dc' + 16) >> 5
644cabdff1aSopenharmony_ci        dup             v4.8h, w2               // broadcast the scaled DC to 8 lanes
645cabdff1aSopenharmony_ci        ld1             {v1.s}[1], [x0], x1     // row 5
646cabdff1aSopenharmony_ci        ld1             {v2.s}[1], [x0], x1     // row 6
647cabdff1aSopenharmony_ci        ld1             {v3.s}[1], [x0]         // row 7
648cabdff1aSopenharmony_ci        uaddw           v0.8h, v4.8h, v0.8b     // dc + zero-extended pixels, rows 0 and 4
649cabdff1aSopenharmony_ci        uaddw           v1.8h, v4.8h, v1.8b     // rows 1 and 5
650cabdff1aSopenharmony_ci        uaddw           v2.8h, v4.8h, v2.8b     // rows 2 and 6
651cabdff1aSopenharmony_ci        uaddw           v3.8h, v4.8h, v3.8b     // rows 3 and 7
652cabdff1aSopenharmony_ci        sqxtun          v0.8b, v0.8h            // narrow to 8 bits, saturating to 0..255
653cabdff1aSopenharmony_ci        sqxtun          v1.8b, v1.8h
654cabdff1aSopenharmony_ci        sqxtun          v2.8b, v2.8h
655cabdff1aSopenharmony_ci        sqxtun          v3.8b, v3.8h
656cabdff1aSopenharmony_ci        st1             {v0.s}[0], [x3], x1     // store row 0
657cabdff1aSopenharmony_ci        st1             {v1.s}[0], [x3], x1     // store row 1
658cabdff1aSopenharmony_ci        st1             {v2.s}[0], [x3], x1     // store row 2
659cabdff1aSopenharmony_ci        st1             {v3.s}[0], [x3], x1     // store row 3
660cabdff1aSopenharmony_ci        st1             {v0.s}[1], [x3], x1     // store row 4
661cabdff1aSopenharmony_ci        st1             {v1.s}[1], [x3], x1     // store row 5
662cabdff1aSopenharmony_ci        st1             {v2.s}[1], [x3], x1     // store row 6
663cabdff1aSopenharmony_ci        st1             {v3.s}[1], [x3]         // store row 7
664cabdff1aSopenharmony_ci        ret
665cabdff1aSopenharmony_ciendfunc
666cabdff1aSopenharmony_ci
667cabdff1aSopenharmony_ci// VC-1 4x4 inverse transform, DC case
668cabdff1aSopenharmony_ci// On entry:
669cabdff1aSopenharmony_ci//   x0 -> array of 8-bit samples, in row-major order
670cabdff1aSopenharmony_ci//   x1 = row stride for 8-bit sample array
671cabdff1aSopenharmony_ci//   x2 -> 16-bit inverse transform DC coefficient
672cabdff1aSopenharmony_ci// On exit:
673cabdff1aSopenharmony_ci//   array at x0 updated by saturated addition of (narrowed) transformed block
// The DC coefficient is scaled in two steps, dc = (17 * dc + 4) >> 3 followed by
// dc = (17 * dc + 64) >> 7, and the result is added to 4 rows of 4 pixels.
// Two 4-pixel rows share each vector: v0 holds rows 0 and 2, v1 holds rows 1 and 3.
// Registers written: x0, x2, x3, v0-v2
674cabdff1aSopenharmony_cifunction ff_vc1_inv_trans_4x4_dc_neon, export=1
675cabdff1aSopenharmony_ci        ldrsh           w2, [x2]                // dc, sign-extended to 32 bits
676cabdff1aSopenharmony_ci        mov             x3, x0                  // keep start of dest for the stores; x0 is advanced by the loads
677cabdff1aSopenharmony_ci        ld1             {v0.s}[0], [x0], x1     // row 0
678cabdff1aSopenharmony_ci        ld1             {v1.s}[0], [x0], x1     // row 1
679cabdff1aSopenharmony_ci        ld1             {v0.s}[1], [x0], x1     // row 2
680cabdff1aSopenharmony_ci        add             w2, w2, w2, lsl #4      // 17 * dc
681cabdff1aSopenharmony_ci        ld1             {v1.s}[1], [x0]         // row 3
682cabdff1aSopenharmony_ci        add             w0, w2, #4              // 17 * dc + 4 (x0 is no longer needed as a pointer)
683cabdff1aSopenharmony_ci        asr             w0, w0, #3              // dc' = (17 * dc + 4) >> 3
684cabdff1aSopenharmony_ci        add             w0, w0, w0, lsl #4      // 17 * dc'
685cabdff1aSopenharmony_ci        add             w0, w0, #64             // 17 * dc' + 64
686cabdff1aSopenharmony_ci        asr             w0, w0, #7              // (17 * dc' + 64) >> 7
687cabdff1aSopenharmony_ci        dup             v2.8h, w0               // broadcast the scaled DC to 8 lanes
688cabdff1aSopenharmony_ci        uaddw           v0.8h, v2.8h, v0.8b     // dc + zero-extended pixels, rows 0 and 2
689cabdff1aSopenharmony_ci        uaddw           v1.8h, v2.8h, v1.8b     // rows 1 and 3
690cabdff1aSopenharmony_ci        sqxtun          v0.8b, v0.8h            // narrow to 8 bits, saturating to 0..255
691cabdff1aSopenharmony_ci        sqxtun          v1.8b, v1.8h
692cabdff1aSopenharmony_ci        st1             {v0.s}[0], [x3], x1     // store row 0
693cabdff1aSopenharmony_ci        st1             {v1.s}[0], [x3], x1     // store row 1
694cabdff1aSopenharmony_ci        st1             {v0.s}[1], [x3], x1     // store row 2
695cabdff1aSopenharmony_ci        st1             {v1.s}[1], [x3]         // store row 3
696cabdff1aSopenharmony_ci        ret
697cabdff1aSopenharmony_ciendfunc
698cabdff1aSopenharmony_ci
// Constant tables, each a 64-bit value holding 16-bit multipliers.
// Halfword lane h[0] is the least significant 16 bits of each .quad.
// NOTE(review): .align 5 should mean 2^5 = 32-byte alignment with GNU as on AArch64 - confirm for other assemblers.
699cabdff1aSopenharmony_ci.align  5
700cabdff1aSopenharmony_ci.Lcoeffs_it8:
701cabdff1aSopenharmony_ci.quad   0x000F00090003                  // h[0..3] = 3, 9, 15, 0
// .Lcoeffs_it4 directly follows .Lcoeffs_it8; presumably a 128-bit load from
// .Lcoeffs_it8 picks up both tables (it4 values land in h[4..6]) - verify against the loading code.
702cabdff1aSopenharmony_ci.Lcoeffs_it4:
703cabdff1aSopenharmony_ci.quad   0x0011000B0005                  // h[0..3] = 5, 11, 17, 0
704cabdff1aSopenharmony_ci.Lcoeffs:
705cabdff1aSopenharmony_ci.quad   0x00050002                      // h[0..3] = 2, 5, 0, 0 (used by the loop filter via ldr d0)
706cabdff1aSopenharmony_ci
707cabdff1aSopenharmony_ci// VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of vertically-neighbouring blocks
708cabdff1aSopenharmony_ci// On entry:
709cabdff1aSopenharmony_ci//   x0 -> top-left pel of lower block
710cabdff1aSopenharmony_ci//   x1 = row stride, bytes
711cabdff1aSopenharmony_ci//   w2 = PQUANT bitstream parameter
712cabdff1aSopenharmony_cifunction ff_vc1_v_loop_filter4_neon, export=1
713cabdff1aSopenharmony_ci        sub             x3, x0, w1, sxtw #2
714cabdff1aSopenharmony_ci        ldr             d0, .Lcoeffs
715cabdff1aSopenharmony_ci        ld1             {v1.s}[0], [x0], x1     // P5
716cabdff1aSopenharmony_ci        ld1             {v2.s}[0], [x3], x1     // P1
717cabdff1aSopenharmony_ci        ld1             {v3.s}[0], [x3], x1     // P2
718cabdff1aSopenharmony_ci        ld1             {v4.s}[0], [x0], x1     // P6
719cabdff1aSopenharmony_ci        ld1             {v5.s}[0], [x3], x1     // P3
720cabdff1aSopenharmony_ci        ld1             {v6.s}[0], [x0], x1     // P7
721cabdff1aSopenharmony_ci        ld1             {v7.s}[0], [x3]         // P4
722cabdff1aSopenharmony_ci        ld1             {v16.s}[0], [x0]        // P8
723cabdff1aSopenharmony_ci        ushll           v17.8h, v1.8b, #1       // 2*P5
724cabdff1aSopenharmony_ci        dup             v18.8h, w2              // pq
725cabdff1aSopenharmony_ci        ushll           v2.8h, v2.8b, #1        // 2*P1
726cabdff1aSopenharmony_ci        uxtl            v3.8h, v3.8b            // P2
727cabdff1aSopenharmony_ci        uxtl            v4.8h, v4.8b            // P6
728cabdff1aSopenharmony_ci        uxtl            v19.8h, v5.8b           // P3
729cabdff1aSopenharmony_ci        mls             v2.4h, v3.4h, v0.h[1]   // 2*P1-5*P2
730cabdff1aSopenharmony_ci        uxtl            v3.8h, v6.8b            // P7
731cabdff1aSopenharmony_ci        mls             v17.4h, v4.4h, v0.h[1]  // 2*P5-5*P6
732cabdff1aSopenharmony_ci        ushll           v5.8h, v5.8b, #1        // 2*P3
733cabdff1aSopenharmony_ci        uxtl            v6.8h, v7.8b            // P4
734cabdff1aSopenharmony_ci        mla             v17.4h, v3.4h, v0.h[1]  // 2*P5-5*P6+5*P7
735cabdff1aSopenharmony_ci        uxtl            v3.8h, v16.8b           // P8
736cabdff1aSopenharmony_ci        mla             v2.4h, v19.4h, v0.h[1]  // 2*P1-5*P2+5*P3
737cabdff1aSopenharmony_ci        uxtl            v1.8h, v1.8b            // P5
738cabdff1aSopenharmony_ci        mls             v5.4h, v6.4h, v0.h[1]   // 2*P3-5*P4
739cabdff1aSopenharmony_ci        mls             v17.4h, v3.4h, v0.h[0]  // 2*P5-5*P6+5*P7-2*P8
740cabdff1aSopenharmony_ci        sub             v3.4h, v6.4h, v1.4h     // P4-P5
741cabdff1aSopenharmony_ci        mls             v2.4h, v6.4h, v0.h[0]   // 2*P1-5*P2+5*P3-2*P4
742cabdff1aSopenharmony_ci        mla             v5.4h, v1.4h, v0.h[1]   // 2*P3-5*P4+5*P5
743cabdff1aSopenharmony_ci        mls             v5.4h, v4.4h, v0.h[0]   // 2*P3-5*P4+5*P5-2*P6
744cabdff1aSopenharmony_ci        abs             v4.4h, v3.4h
745cabdff1aSopenharmony_ci        srshr           v7.4h, v17.4h, #3
746cabdff1aSopenharmony_ci        srshr           v2.4h, v2.4h, #3
747cabdff1aSopenharmony_ci        sshr            v4.4h, v4.4h, #1        // clip
748cabdff1aSopenharmony_ci        srshr           v5.4h, v5.4h, #3
749cabdff1aSopenharmony_ci        abs             v7.4h, v7.4h            // a2
750cabdff1aSopenharmony_ci        sshr            v3.4h, v3.4h, #8        // clip_sign
751cabdff1aSopenharmony_ci        abs             v2.4h, v2.4h            // a1
752cabdff1aSopenharmony_ci        cmeq            v16.4h, v4.4h, #0       // test clip == 0
753cabdff1aSopenharmony_ci        abs             v17.4h, v5.4h           // a0
754cabdff1aSopenharmony_ci        sshr            v5.4h, v5.4h, #8        // a0_sign
755cabdff1aSopenharmony_ci        cmhs            v19.4h, v2.4h, v7.4h    // test a1 >= a2
756cabdff1aSopenharmony_ci        cmhs            v18.4h, v17.4h, v18.4h  // test a0 >= pq
757cabdff1aSopenharmony_ci        sub             v3.4h, v3.4h, v5.4h     // clip_sign - a0_sign
758cabdff1aSopenharmony_ci        bsl             v19.8b, v7.8b, v2.8b    // a3
759cabdff1aSopenharmony_ci        orr             v2.8b, v16.8b, v18.8b   // test clip == 0 || a0 >= pq
760cabdff1aSopenharmony_ci        uqsub           v5.4h, v17.4h, v19.4h   // a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
761cabdff1aSopenharmony_ci        cmhs            v7.4h, v19.4h, v17.4h   // test a3 >= a0
762cabdff1aSopenharmony_ci        mul             v0.4h, v5.4h, v0.h[1]   // a0 >= a3 ? 5*(a0-a3) : 0
763cabdff1aSopenharmony_ci        orr             v5.8b, v2.8b, v7.8b     // test clip == 0 || a0 >= pq || a3 >= a0
764cabdff1aSopenharmony_ci        mov             w0, v5.s[1]             // move to gp reg
765cabdff1aSopenharmony_ci        ushr            v0.4h, v0.4h, #3        // a0 >= a3 ? (5*(a0-a3))>>3 : 0
766cabdff1aSopenharmony_ci        cmhs            v5.4h, v0.4h, v4.4h
767cabdff1aSopenharmony_ci        tbnz            w0, #0, 1f              // none of the 4 pixel pairs should be updated if this one is not filtered
768cabdff1aSopenharmony_ci        bsl             v5.8b, v4.8b, v0.8b     // FFMIN(d, clip)
769cabdff1aSopenharmony_ci        bic             v0.8b, v5.8b, v2.8b     // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
770cabdff1aSopenharmony_ci        mls             v6.4h, v0.4h, v3.4h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
771cabdff1aSopenharmony_ci        mla             v1.4h, v0.4h, v3.4h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
772cabdff1aSopenharmony_ci        sqxtun          v0.8b, v6.8h
773cabdff1aSopenharmony_ci        sqxtun          v1.8b, v1.8h
774cabdff1aSopenharmony_ci        st1             {v0.s}[0], [x3], x1
775cabdff1aSopenharmony_ci        st1             {v1.s}[0], [x3]
776cabdff1aSopenharmony_ci1:      ret
777cabdff1aSopenharmony_ciendfunc
778cabdff1aSopenharmony_ci
// VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of horizontally-neighbouring blocks
//
// Each of 4 consecutive rows contributes the 8 pels P1..P8 that straddle the
// vertical block edge (P1..P4 are read from the 4 bytes before x0, P5..P8 from
// x0 onwards). Only P4 and P5 are ever written back.
//
// On entry:
//   x0 -> top-left pel of right block
//   x1 = row stride, bytes
//   w2 = PQUANT bitstream parameter
// On exit:
//   P4/P5 of all 4 rows have been stored (rows whose correction d ends up as
//   zero are rewritten with their original values), unless the row at index 2
//   fails the filter test, in which case nothing is written at all
// Clobbers:
//   x0, x2, x3, v0-v7, v16-v19
// Notes:
//   .Lcoeffs is defined elsewhere in this file; going by the arithmetic
//   annotated on each line, lanes v0.h[0] and v0.h[1] act as the multipliers
//   2 and 5 respectively
function ff_vc1_h_loop_filter4_neon, export=1
        // Load 8 pels from each of the 4 rows, starting 4 pels left of the edge
        sub             x3, x0, #4              // where to start reading
        ldr             d0, .Lcoeffs
        ld1             {v1.8b}, [x3], x1
        sub             x0, x0, #1              // where to start writing
        ld1             {v2.8b}, [x3], x1
        ld1             {v3.8b}, [x3], x1
        ld1             {v4.8b}, [x3]
        dup             v5.8h, w2               // pq
        // Transpose so that each 32-bit half of a register holds one pel
        // position (Pn) for all 4 rows
        trn1            v6.8b, v1.8b, v2.8b
        trn2            v1.8b, v1.8b, v2.8b
        trn1            v2.8b, v3.8b, v4.8b
        trn2            v3.8b, v3.8b, v4.8b
        trn1            v4.4h, v6.4h, v2.4h     // P1, P5
        trn1            v7.4h, v1.4h, v3.4h     // P2, P6
        trn2            v2.4h, v6.4h, v2.4h     // P3, P7
        trn2            v1.4h, v1.4h, v3.4h     // P4, P8
        // Widen to 16 bits and build the three weighted sums; the a1 and a2
        // sums share one 8-lane register (low half / high half)
        ushll           v3.8h, v4.8b, #1        // 2*P1, 2*P5
        uxtl            v6.8h, v7.8b            // P2, P6
        uxtl            v7.8h, v2.8b            // P3, P7
        uxtl            v1.8h, v1.8b            // P4, P8
        mls             v3.8h, v6.8h, v0.h[1]   // 2*P1-5*P2, 2*P5-5*P6
        ushll           v2.8h, v2.8b, #1        // 2*P3, 2*P7
        uxtl            v4.8h, v4.8b            // P1, P5
        mla             v3.8h, v7.8h, v0.h[1]   // 2*P1-5*P2+5*P3, 2*P5-5*P6+5*P7
        mov             d6, v6.d[1]             // P6
        mls             v3.8h, v1.8h, v0.h[0]   // 2*P1-5*P2+5*P3-2*P4, 2*P5-5*P6+5*P7-2*P8
        mov             d4, v4.d[1]             // P5
        mls             v2.4h, v1.4h, v0.h[1]   // 2*P3-5*P4
        mla             v2.4h, v4.4h, v0.h[1]   // 2*P3-5*P4+5*P5
        sub             v7.4h, v1.4h, v4.4h     // P4-P5
        mls             v2.4h, v6.4h, v0.h[0]   // 2*P3-5*P4+5*P5-2*P6
        srshr           v3.8h, v3.8h, #3
        abs             v6.4h, v7.4h
        sshr            v7.4h, v7.4h, #8        // clip_sign
        srshr           v2.4h, v2.4h, #3
        abs             v3.8h, v3.8h            // a1, a2
        sshr            v6.4h, v6.4h, #1        // clip
        mov             d16, v3.d[1]            // a2
        abs             v17.4h, v2.4h           // a0
        // Per-row filter decision masks (all-ones lane = do not filter)
        cmeq            v18.4h, v6.4h, #0       // test clip == 0
        sshr            v2.4h, v2.4h, #8        // a0_sign
        cmhs            v19.4h, v3.4h, v16.4h   // test a1 >= a2
        cmhs            v5.4h, v17.4h, v5.4h    // test a0 >= pq
        sub             v2.4h, v7.4h, v2.4h     // clip_sign - a0_sign
        bsl             v19.8b, v16.8b, v3.8b   // a3
        orr             v3.8b, v18.8b, v5.8b    // test clip == 0 || a0 >= pq
        uqsub           v5.4h, v17.4h, v19.4h   // a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
        cmhs            v7.4h, v19.4h, v17.4h   // test a3 >= a0
        mul             v0.4h, v5.4h, v0.h[1]   // a0 >= a3 ? 5*(a0-a3) : 0
        orr             v5.8b, v3.8b, v7.8b     // test clip == 0 || a0 >= pq || a3 >= a0
        mov             w2, v5.s[1]             // move to gp reg (bit 0 = mask bit of the row at index 2)
        ushr            v0.4h, v0.4h, #3        // a0 >= a3 ? (5*(a0-a3))>>3 : 0
        cmhs            v5.4h, v0.4h, v6.4h     // test d >= clip
        tbnz            w2, #0, 1f              // none of the 4 pixel pairs should be updated if this one is not filtered
        bsl             v5.8b, v6.8b, v0.8b     // FFMIN(d, clip)
        bic             v0.8b, v5.8b, v3.8b     // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
        mla             v4.4h, v0.4h, v2.4h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
        mls             v1.4h, v0.4h, v2.4h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
        sqxtun          v3.8b, v4.8h            // saturate new P5 back to bytes
        sqxtun          v2.8b, v1.8h            // saturate new P4 back to bytes
        // Write the (P4, P5) byte pair of each row back, starting one pel
        // left of the edge
        st2             {v2.b, v3.b}[0], [x0], x1
        st2             {v2.b, v3.b}[1], [x0], x1
        st2             {v2.b, v3.b}[2], [x0], x1
        st2             {v2.b, v3.b}[3], [x0]
1:      ret
endfunc
851cabdff1aSopenharmony_ci
// VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of vertically-neighbouring blocks
//
// Rows P1..P4 are the 4 rows above x0 and rows P5..P8 are the 4 rows starting
// at x0; 8 columns are processed at once as two groups of 4. Only rows P4 and
// P5 are ever written back.
//
// On entry:
//   x0 -> top-left pel of lower block
//   x1 = row stride, bytes (used as a full 64-bit value throughout)
//   w2 = PQUANT bitstream parameter
// On exit:
//   8 pels of row P4 and 8 pels of row P5 have been stored (columns whose
//   correction d ends up as zero are rewritten with their original values),
//   unless the column at index 2 of BOTH groups of 4 fails the filter test,
//   in which case nothing is written at all
// Clobbers:
//   x0, x2, x3, v0-v7, v16-v20
// Notes:
//   .Lcoeffs is defined elsewhere in this file; going by the arithmetic
//   annotated on each line, lanes v0.h[0] and v0.h[1] act as the multipliers
//   2 and 5 respectively
function ff_vc1_v_loop_filter8_neon, export=1
        sub             x3, x0, x1, lsl #2      // x3 -> row P1, 4 rows above the edge (same 64-bit stride as the post-indexed accesses below)
        ldr             d0, .Lcoeffs
        ld1             {v1.8b}, [x0], x1       // P5
        movi            v2.2d, #0x0000ffff00000000 // selects halfword lane 2 of each group of 4
        ld1             {v3.8b}, [x3], x1       // P1
        ld1             {v4.8b}, [x3], x1       // P2
        ld1             {v5.8b}, [x0], x1       // P6
        ld1             {v6.8b}, [x3], x1       // P3
        ld1             {v7.8b}, [x0], x1       // P7
        ushll           v16.8h, v1.8b, #1       // 2*P5
        ushll           v3.8h, v3.8b, #1        // 2*P1
        ld1             {v17.8b}, [x3]          // P4 (x3 is left pointing at row P4 for the stores)
        uxtl            v4.8h, v4.8b            // P2
        ld1             {v18.8b}, [x0]          // P8
        uxtl            v5.8h, v5.8b            // P6
        dup             v19.8h, w2              // pq
        uxtl            v20.8h, v6.8b           // P3
        mls             v3.8h, v4.8h, v0.h[1]   // 2*P1-5*P2
        uxtl            v4.8h, v7.8b            // P7
        ushll           v6.8h, v6.8b, #1        // 2*P3
        mls             v16.8h, v5.8h, v0.h[1]  // 2*P5-5*P6
        uxtl            v7.8h, v17.8b           // P4
        uxtl            v17.8h, v18.8b          // P8
        mla             v16.8h, v4.8h, v0.h[1]  // 2*P5-5*P6+5*P7
        uxtl            v1.8h, v1.8b            // P5
        mla             v3.8h, v20.8h, v0.h[1]  // 2*P1-5*P2+5*P3
        sub             v4.8h, v7.8h, v1.8h     // P4-P5
        mls             v6.8h, v7.8h, v0.h[1]   // 2*P3-5*P4
        mls             v16.8h, v17.8h, v0.h[0] // 2*P5-5*P6+5*P7-2*P8
        abs             v17.8h, v4.8h
        sshr            v4.8h, v4.8h, #8        // clip_sign
        mls             v3.8h, v7.8h, v0.h[0]   // 2*P1-5*P2+5*P3-2*P4
        sshr            v17.8h, v17.8h, #1      // clip
        mla             v6.8h, v1.8h, v0.h[1]   // 2*P3-5*P4+5*P5
        srshr           v16.8h, v16.8h, #3
        mls             v6.8h, v5.8h, v0.h[0]   // 2*P3-5*P4+5*P5-2*P6
        cmeq            v5.8h, v17.8h, #0       // test clip == 0
        srshr           v3.8h, v3.8h, #3
        abs             v16.8h, v16.8h          // a2
        abs             v3.8h, v3.8h            // a1
        srshr           v6.8h, v6.8h, #3
        cmhs            v18.8h, v3.8h, v16.8h   // test a1 >= a2
        abs             v20.8h, v6.8h           // a0
        sshr            v6.8h, v6.8h, #8        // a0_sign
        bsl             v18.16b, v16.16b, v3.16b // a3
        cmhs            v3.8h, v20.8h, v19.8h   // test a0 >= pq
        sub             v4.8h, v4.8h, v6.8h     // clip_sign - a0_sign
        uqsub           v6.8h, v20.8h, v18.8h   // a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
        cmhs            v16.8h, v18.8h, v20.8h  // test a3 >= a0
        orr             v3.16b, v5.16b, v3.16b  // test clip == 0 || a0 >= pq
        mul             v0.8h, v6.8h, v0.h[1]   // a0 >= a3 ? 5*(a0-a3) : 0
        orr             v5.16b, v3.16b, v16.16b // test clip == 0 || a0 >= pq || a3 >= a0
        cmtst           v2.2d, v5.2d, v2.2d     // if the column at index 2 of a group of 4 is not filtered, then none of the others in that group should be either
        mov             w0, v5.s[1]             // move to gp reg (bit 0 = mask bit of column 2, first group)
        ushr            v0.8h, v0.8h, #3        // a0 >= a3 ? (5*(a0-a3))>>3 : 0
        mov             w2, v5.s[3]             // bit 0 = mask bit of column 2, second group
        orr             v2.16b, v3.16b, v2.16b  // per-column "do not filter" mask, including the whole-group veto
        cmhs            v3.8h, v0.8h, v17.8h    // test d >= clip
        and             w0, w0, w2              // bit 0 set only if both groups are vetoed
        bsl             v3.16b, v17.16b, v0.16b // FFMIN(d, clip)
        tbnz            w0, #0, 1f              // none of the 8 pixel pairs should be updated in this case
        bic             v0.16b, v3.16b, v2.16b  // set each d to zero if it should not be filtered
        mls             v7.8h, v0.8h, v4.8h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
        mla             v1.8h, v0.8h, v4.8h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
        sqxtun          v0.8b, v7.8h            // saturate new P4 back to bytes
        sqxtun          v1.8b, v1.8h            // saturate new P5 back to bytes
        st1             {v0.8b}, [x3], x1       // store row P4
        st1             {v1.8b}, [x3]           // store row P5
1:      ret
endfunc
928cabdff1aSopenharmony_ci
// VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of horizontally-neighbouring blocks
//
// Each of 8 consecutive rows contributes the 8 pels P1..P8 that straddle the
// vertical block edge (P1..P4 are read from the 4 bytes before x0, P5..P8 from
// x0 onwards). The rows are handled as two groups of 4 (rows 0-3 and rows
// 4-7). Only P4 and P5 are ever written back.
//
// On entry:
//   x0 -> top-left pel of right block
//   x1 = row stride, bytes
//   w2 = PQUANT bitstream parameter
// On exit:
//   For each group of 4 rows, P4/P5 of the whole group have been stored
//   unless the row at index 2 of that group fails the filter test, in which
//   case that group is left untouched in memory
// Clobbers:
//   x0, x2, x3, x4, x5, v0-v7, v16-v20
// Notes:
//   .Lcoeffs is defined elsewhere in this file; going by the arithmetic
//   annotated on each line, lanes v0.h[0] and v0.h[1] act as the multipliers
//   2 and 5 respectively
function ff_vc1_h_loop_filter8_neon, export=1
        // Load 8 pels from each of the 8 rows, starting 4 pels left of the
        // edge, interleaved with the first steps of the 8x8 byte transpose
        sub             x3, x0, #4              // where to start reading
        ldr             d0, .Lcoeffs
        ld1             {v1.8b}, [x3], x1       // P1[0], P2[0]...
        sub             x0, x0, #1              // where to start writing
        ld1             {v2.8b}, [x3], x1
        add             x4, x0, x1, lsl #2      // where to start writing rows 4-7
        ld1             {v3.8b}, [x3], x1
        ld1             {v4.8b}, [x3], x1
        ld1             {v5.8b}, [x3], x1
        ld1             {v6.8b}, [x3], x1
        ld1             {v7.8b}, [x3], x1
        trn1            v16.8b, v1.8b, v2.8b    // P1[0], P1[1], P3[0]...
        ld1             {v17.8b}, [x3]
        trn2            v1.8b, v1.8b, v2.8b     // P2[0], P2[1], P4[0]...
        trn1            v2.8b, v3.8b, v4.8b     // P1[2], P1[3], P3[2]...
        trn2            v3.8b, v3.8b, v4.8b     // P2[2], P2[3], P4[2]...
        dup             v4.8h, w2               // pq
        trn1            v18.8b, v5.8b, v6.8b    // P1[4], P1[5], P3[4]...
        trn2            v5.8b, v5.8b, v6.8b     // P2[4], P2[5], P4[4]...
        trn1            v6.4h, v16.4h, v2.4h    // P1[0], P1[1], P1[2], P1[3], P5[0]...
        trn1            v19.4h, v1.4h, v3.4h    // P2[0], P2[1], P2[2], P2[3], P6[0]...
        trn1            v20.8b, v7.8b, v17.8b   // P1[6], P1[7], P3[6]...
        trn2            v7.8b, v7.8b, v17.8b    // P2[6], P2[7], P4[6]...
        trn2            v2.4h, v16.4h, v2.4h    // P3[0], P3[1], P3[2], P3[3], P7[0]...
        trn2            v1.4h, v1.4h, v3.4h     // P4[0], P4[1], P4[2], P4[3], P8[0]...
        trn1            v3.4h, v18.4h, v20.4h   // P1[4], P1[5], P1[6], P1[7], P5[4]...
        trn1            v16.4h, v5.4h, v7.4h    // P2[4], P2[5], P2[6], P2[7], P6[4]...
        trn2            v17.4h, v18.4h, v20.4h  // P3[4], P3[5], P3[6], P3[7], P7[4]...
        trn2            v5.4h, v5.4h, v7.4h     // P4[4], P4[5], P4[6], P4[7], P8[4]...
        trn1            v7.2s, v6.2s, v3.2s     // P1
        trn1            v18.2s, v19.2s, v16.2s  // P2
        trn2            v3.2s, v6.2s, v3.2s     // P5
        trn2            v6.2s, v19.2s, v16.2s   // P6
        trn1            v16.2s, v2.2s, v17.2s   // P3
        trn2            v2.2s, v2.2s, v17.2s    // P7
        // Widen to 16 bits and build the three weighted sums for all 8 rows
        ushll           v7.8h, v7.8b, #1        // 2*P1
        trn1            v17.2s, v1.2s, v5.2s    // P4
        ushll           v19.8h, v3.8b, #1       // 2*P5
        trn2            v1.2s, v1.2s, v5.2s     // P8
        uxtl            v5.8h, v18.8b           // P2
        uxtl            v6.8h, v6.8b            // P6
        uxtl            v18.8h, v16.8b          // P3
        mls             v7.8h, v5.8h, v0.h[1]   // 2*P1-5*P2
        uxtl            v2.8h, v2.8b            // P7
        ushll           v5.8h, v16.8b, #1       // 2*P3
        mls             v19.8h, v6.8h, v0.h[1]  // 2*P5-5*P6
        uxtl            v16.8h, v17.8b          // P4
        uxtl            v1.8h, v1.8b            // P8
        mla             v19.8h, v2.8h, v0.h[1]  // 2*P5-5*P6+5*P7
        uxtl            v2.8h, v3.8b            // P5
        mla             v7.8h, v18.8h, v0.h[1]  // 2*P1-5*P2+5*P3
        sub             v3.8h, v16.8h, v2.8h    // P4-P5
        mls             v5.8h, v16.8h, v0.h[1]  // 2*P3-5*P4
        mls             v19.8h, v1.8h, v0.h[0]  // 2*P5-5*P6+5*P7-2*P8
        abs             v1.8h, v3.8h
        sshr            v3.8h, v3.8h, #8        // clip_sign
        mls             v7.8h, v16.8h, v0.h[0]  // 2*P1-5*P2+5*P3-2*P4
        sshr            v1.8h, v1.8h, #1        // clip
        mla             v5.8h, v2.8h, v0.h[1]   // 2*P3-5*P4+5*P5
        srshr           v17.8h, v19.8h, #3
        mls             v5.8h, v6.8h, v0.h[0]   // 2*P3-5*P4+5*P5-2*P6
        cmeq            v6.8h, v1.8h, #0        // test clip == 0
        srshr           v7.8h, v7.8h, #3
        abs             v17.8h, v17.8h          // a2
        abs             v7.8h, v7.8h            // a1
        srshr           v5.8h, v5.8h, #3
        cmhs            v18.8h, v7.8h, v17.8h   // test a1 >= a2
        abs             v19.8h, v5.8h           // a0
        sshr            v5.8h, v5.8h, #8        // a0_sign
        bsl             v18.16b, v17.16b, v7.16b // a3
        cmhs            v4.8h, v19.8h, v4.8h    // test a0 >= pq
        sub             v3.8h, v3.8h, v5.8h     // clip_sign - a0_sign
        uqsub           v5.8h, v19.8h, v18.8h   // a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
        cmhs            v7.8h, v18.8h, v19.8h   // test a3 >= a0
        orr             v4.16b, v6.16b, v4.16b  // test clip == 0 || a0 >= pq
        mul             v0.8h, v5.8h, v0.h[1]   // a0 >= a3 ? 5*(a0-a3) : 0
        orr             v5.16b, v4.16b, v7.16b  // test clip == 0 || a0 >= pq || a3 >= a0
        mov             w2, v5.s[1]             // move to gp reg (bit 0 = mask bit of row 2, i.e. index 2 of the first group)
        ushr            v0.8h, v0.8h, #3        // a0 >= a3 ? (5*(a0-a3))>>3 : 0
        mov             w3, v5.s[3]             // bit 0 = mask bit of row 6, i.e. index 2 of the second group
        cmhs            v5.8h, v0.8h, v1.8h     // test d >= clip
        and             w5, w2, w3              // bit 0 set only if both groups are to be skipped
        bsl             v5.16b, v1.16b, v0.16b  // FFMIN(d, clip)
        tbnz            w5, #0, 2f              // none of the 8 pixel pairs should be updated in this case
        bic             v0.16b, v5.16b, v4.16b  // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
        mla             v2.8h, v0.8h, v3.8h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
        mls             v16.8h, v0.8h, v3.8h    // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
        sqxtun          v1.8b, v2.8h            // saturate new P5 back to bytes
        sqxtun          v0.8b, v16.8h           // saturate new P4 back to bytes
        // Write the (P4, P5) byte pair of each row back, one group of 4 rows
        // at a time, skipping any group whose index-2 row is not filtered
        tbnz            w2, #0, 1f              // none of the first 4 pixel pairs should be updated if so
        st2             {v0.b, v1.b}[0], [x0], x1
        st2             {v0.b, v1.b}[1], [x0], x1
        st2             {v0.b, v1.b}[2], [x0], x1
        st2             {v0.b, v1.b}[3], [x0]
1:      tbnz            w3, #0, 2f              // none of the second 4 pixel pairs should be updated if so
        st2             {v0.b, v1.b}[4], [x4], x1
        st2             {v0.b, v1.b}[5], [x4], x1
        st2             {v0.b, v1.b}[6], [x4], x1
        st2             {v0.b, v1.b}[7], [x4]
2:      ret
endfunc
1036cabdff1aSopenharmony_ci
1037cabdff1aSopenharmony_ci// VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of vertically-neighbouring blocks
1038cabdff1aSopenharmony_ci// On entry:
1039cabdff1aSopenharmony_ci//   x0 -> top-left pel of lower block
1040cabdff1aSopenharmony_ci//   x1 = row stride, bytes
1041cabdff1aSopenharmony_ci//   w2 = PQUANT bitstream parameter
1042cabdff1aSopenharmony_cifunction ff_vc1_v_loop_filter16_neon, export=1
1043cabdff1aSopenharmony_ci        sub             x3, x0, w1, sxtw #2
1044cabdff1aSopenharmony_ci        ldr             d0, .Lcoeffs
1045cabdff1aSopenharmony_ci        ld1             {v1.16b}, [x0], x1      // P5
1046cabdff1aSopenharmony_ci        movi            v2.2d, #0x0000ffff00000000
1047cabdff1aSopenharmony_ci        ld1             {v3.16b}, [x3], x1      // P1
1048cabdff1aSopenharmony_ci        ld1             {v4.16b}, [x3], x1      // P2
1049cabdff1aSopenharmony_ci        ld1             {v5.16b}, [x0], x1      // P6
1050cabdff1aSopenharmony_ci        ld1             {v6.16b}, [x3], x1      // P3
1051cabdff1aSopenharmony_ci        ld1             {v7.16b}, [x0], x1      // P7
1052cabdff1aSopenharmony_ci        ushll           v16.8h, v1.8b, #1       // 2*P5[0..7]
1053cabdff1aSopenharmony_ci        ushll           v17.8h, v3.8b, #1       // 2*P1[0..7]
1054cabdff1aSopenharmony_ci        ld1             {v18.16b}, [x3]         // P4
1055cabdff1aSopenharmony_ci        uxtl            v19.8h, v4.8b           // P2[0..7]
1056cabdff1aSopenharmony_ci        ld1             {v20.16b}, [x0]         // P8
1057cabdff1aSopenharmony_ci        uxtl            v21.8h, v5.8b           // P6[0..7]
1058cabdff1aSopenharmony_ci        dup             v22.8h, w2              // pq
1059cabdff1aSopenharmony_ci        ushll2          v3.8h, v3.16b, #1       // 2*P1[8..15]
1060cabdff1aSopenharmony_ci        mls             v17.8h, v19.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7]
1061cabdff1aSopenharmony_ci        ushll2          v19.8h, v1.16b, #1      // 2*P5[8..15]
1062cabdff1aSopenharmony_ci        uxtl2           v4.8h, v4.16b           // P2[8..15]
1063cabdff1aSopenharmony_ci        mls             v16.8h, v21.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]
1064cabdff1aSopenharmony_ci        uxtl2           v5.8h, v5.16b           // P6[8..15]
1065cabdff1aSopenharmony_ci        uxtl            v23.8h, v6.8b           // P3[0..7]
1066cabdff1aSopenharmony_ci        uxtl            v24.8h, v7.8b           // P7[0..7]
1067cabdff1aSopenharmony_ci        mls             v3.8h, v4.8h, v0.h[1]   // 2*P1[8..15]-5*P2[8..15]
1068cabdff1aSopenharmony_ci        ushll           v4.8h, v6.8b, #1        // 2*P3[0..7]
1069cabdff1aSopenharmony_ci        uxtl            v25.8h, v18.8b          // P4[0..7]
1070cabdff1aSopenharmony_ci        mls             v19.8h, v5.8h, v0.h[1]  // 2*P5[8..15]-5*P6[8..15]
1071cabdff1aSopenharmony_ci        uxtl2           v26.8h, v6.16b          // P3[8..15]
1072cabdff1aSopenharmony_ci        mla             v17.8h, v23.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
1073cabdff1aSopenharmony_ci        uxtl2           v7.8h, v7.16b           // P7[8..15]
1074cabdff1aSopenharmony_ci        ushll2          v6.8h, v6.16b, #1       // 2*P3[8..15]
1075cabdff1aSopenharmony_ci        mla             v16.8h, v24.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
1076cabdff1aSopenharmony_ci        uxtl2           v18.8h, v18.16b         // P4[8..15]
1077cabdff1aSopenharmony_ci        uxtl            v23.8h, v20.8b          // P8[0..7]
1078cabdff1aSopenharmony_ci        mls             v4.8h, v25.8h, v0.h[1]  // 2*P3[0..7]-5*P4[0..7]
1079cabdff1aSopenharmony_ci        uxtl            v24.8h, v1.8b           // P5[0..7]
1080cabdff1aSopenharmony_ci        uxtl2           v20.8h, v20.16b         // P8[8..15]
1081cabdff1aSopenharmony_ci        mla             v3.8h, v26.8h, v0.h[1]  // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
1082cabdff1aSopenharmony_ci        uxtl2           v1.8h, v1.16b           // P5[8..15]
1083cabdff1aSopenharmony_ci        sub             v26.8h, v25.8h, v24.8h  // P4[0..7]-P5[0..7]
1084cabdff1aSopenharmony_ci        mla             v19.8h, v7.8h, v0.h[1]  // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
1085cabdff1aSopenharmony_ci        sub             v7.8h, v18.8h, v1.8h    // P4[8..15]-P5[8..15]
1086cabdff1aSopenharmony_ci        mls             v6.8h, v18.8h, v0.h[1]  // 2*P3[8..15]-5*P4[8..15]
1087cabdff1aSopenharmony_ci        abs             v27.8h, v26.8h
1088cabdff1aSopenharmony_ci        sshr            v26.8h, v26.8h, #8      // clip_sign[0..7]
1089cabdff1aSopenharmony_ci        mls             v17.8h, v25.8h, v0.h[0] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
1090cabdff1aSopenharmony_ci        abs             v28.8h, v7.8h
1091cabdff1aSopenharmony_ci        sshr            v27.8h, v27.8h, #1      // clip[0..7]
1092cabdff1aSopenharmony_ci        mls             v16.8h, v23.8h, v0.h[0] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
1093cabdff1aSopenharmony_ci        sshr            v7.8h, v7.8h, #8        // clip_sign[8..15]
1094cabdff1aSopenharmony_ci        sshr            v23.8h, v28.8h, #1      // clip[8..15]
1095cabdff1aSopenharmony_ci        mla             v4.8h, v24.8h, v0.h[1]  // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
1096cabdff1aSopenharmony_ci        cmeq            v28.8h, v27.8h, #0      // test clip[0..7] == 0
1097cabdff1aSopenharmony_ci        srshr           v17.8h, v17.8h, #3
1098cabdff1aSopenharmony_ci        mls             v3.8h, v18.8h, v0.h[0]  // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
1099cabdff1aSopenharmony_ci        cmeq            v29.8h, v23.8h, #0      // test clip[8..15] == 0
1100cabdff1aSopenharmony_ci        srshr           v16.8h, v16.8h, #3
1101cabdff1aSopenharmony_ci        mls             v19.8h, v20.8h, v0.h[0] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
1102cabdff1aSopenharmony_ci        abs             v17.8h, v17.8h          // a1[0..7]
1103cabdff1aSopenharmony_ci        mla             v6.8h, v1.8h, v0.h[1]   // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
1104cabdff1aSopenharmony_ci        srshr           v3.8h, v3.8h, #3
1105cabdff1aSopenharmony_ci        mls             v4.8h, v21.8h, v0.h[0]  // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
1106cabdff1aSopenharmony_ci        abs             v16.8h, v16.8h          // a2[0..7]
1107cabdff1aSopenharmony_ci        srshr           v19.8h, v19.8h, #3
1108cabdff1aSopenharmony_ci        mls             v6.8h, v5.8h, v0.h[0]   // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
1109cabdff1aSopenharmony_ci        cmhs            v5.8h, v17.8h, v16.8h   // test a1[0..7] >= a2[0..7]
1110cabdff1aSopenharmony_ci        abs             v3.8h, v3.8h            // a1[8..15]
1111cabdff1aSopenharmony_ci        srshr           v4.8h, v4.8h, #3
1112cabdff1aSopenharmony_ci        abs             v19.8h, v19.8h          // a2[8..15]
1113cabdff1aSopenharmony_ci        bsl             v5.16b, v16.16b, v17.16b // a3[0..7]
1114cabdff1aSopenharmony_ci        srshr           v6.8h, v6.8h, #3
1115cabdff1aSopenharmony_ci        cmhs            v16.8h, v3.8h, v19.8h   // test a1[8..15] >= a2[8.15]
1116cabdff1aSopenharmony_ci        abs             v17.8h, v4.8h           // a0[0..7]
1117cabdff1aSopenharmony_ci        sshr            v4.8h, v4.8h, #8        // a0_sign[0..7]
1118cabdff1aSopenharmony_ci        bsl             v16.16b, v19.16b, v3.16b // a3[8..15]
1119cabdff1aSopenharmony_ci        uqsub           v3.8h, v17.8h, v5.8h    // a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
1120cabdff1aSopenharmony_ci        abs             v19.8h, v6.8h           // a0[8..15]
1121cabdff1aSopenharmony_ci        cmhs            v20.8h, v17.8h, v22.8h  // test a0[0..7] >= pq
1122cabdff1aSopenharmony_ci        cmhs            v5.8h, v5.8h, v17.8h    // test a3[0..7] >= a0[0..7]
1123cabdff1aSopenharmony_ci        sub             v4.8h, v26.8h, v4.8h    // clip_sign[0..7] - a0_sign[0..7]
1124cabdff1aSopenharmony_ci        sshr            v6.8h, v6.8h, #8        // a0_sign[8..15]
1125cabdff1aSopenharmony_ci        mul             v3.8h, v3.8h, v0.h[1]   // a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
1126cabdff1aSopenharmony_ci        uqsub           v17.8h, v19.8h, v16.8h  // a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
1127cabdff1aSopenharmony_ci        orr             v20.16b, v28.16b, v20.16b // test clip[0..7] == 0 || a0[0..7] >= pq
1128cabdff1aSopenharmony_ci        cmhs            v21.8h, v19.8h, v22.8h  // test a0[8..15] >= pq
1129cabdff1aSopenharmony_ci        cmhs            v16.8h, v16.8h, v19.8h  // test a3[8..15] >= a0[8..15]
1130cabdff1aSopenharmony_ci        mul             v0.8h, v17.8h, v0.h[1]  // a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
1131cabdff1aSopenharmony_ci        sub             v6.8h, v7.8h, v6.8h     // clip_sign[8..15] - a0_sign[8..15]
1132cabdff1aSopenharmony_ci        orr             v5.16b, v20.16b, v5.16b // test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
1133cabdff1aSopenharmony_ci        ushr            v3.8h, v3.8h, #3        // a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
1134cabdff1aSopenharmony_ci        orr             v7.16b, v29.16b, v21.16b // test clip[8..15] == 0 || a0[8..15] >= pq
1135cabdff1aSopenharmony_ci        cmtst           v17.2d, v5.2d, v2.2d    // if 2nd of each group of is not filtered, then none of the others in the group should be either
1136cabdff1aSopenharmony_ci        mov             w0, v5.s[1]             // move to gp reg
1137cabdff1aSopenharmony_ci        cmhs            v19.8h, v3.8h, v27.8h
1138cabdff1aSopenharmony_ci        ushr            v0.8h, v0.8h, #3        // a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
1139cabdff1aSopenharmony_ci        mov             w2, v5.s[3]
1140cabdff1aSopenharmony_ci        orr             v5.16b, v7.16b, v16.16b // test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
1141cabdff1aSopenharmony_ci        orr             v16.16b, v20.16b, v17.16b
1142cabdff1aSopenharmony_ci        bsl             v19.16b, v27.16b, v3.16b // FFMIN(d[0..7], clip[0..7])
1143cabdff1aSopenharmony_ci        cmtst           v2.2d, v5.2d, v2.2d
1144cabdff1aSopenharmony_ci        cmhs            v3.8h, v0.8h, v23.8h
1145cabdff1aSopenharmony_ci        mov             w4, v5.s[1]
1146cabdff1aSopenharmony_ci        mov             w5, v5.s[3]
1147cabdff1aSopenharmony_ci        and             w0, w0, w2
1148cabdff1aSopenharmony_ci        bic             v5.16b, v19.16b, v16.16b // set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
1149cabdff1aSopenharmony_ci        orr             v2.16b, v7.16b, v2.16b
1150cabdff1aSopenharmony_ci        bsl             v3.16b, v23.16b, v0.16b // FFMIN(d[8..15], clip[8..15])
1151cabdff1aSopenharmony_ci        mls             v25.8h, v5.8h, v4.8h    // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4[0..7]
1152cabdff1aSopenharmony_ci        and             w2, w4, w5
1153cabdff1aSopenharmony_ci        bic             v0.16b, v3.16b, v2.16b  // set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
1154cabdff1aSopenharmony_ci        mla             v24.8h, v5.8h, v4.8h    // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5[0..7]
1155cabdff1aSopenharmony_ci        and             w0, w0, w2
1156cabdff1aSopenharmony_ci        mls             v18.8h, v0.8h, v6.8h    // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4[8..15]
1157cabdff1aSopenharmony_ci        sqxtun          v2.8b, v25.8h
1158cabdff1aSopenharmony_ci        tbnz            w0, #0, 1f              // none of the 16 pixel pairs should be updated in this case
1159cabdff1aSopenharmony_ci        mla             v1.8h, v0.8h, v6.8h     // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5[8..15]
1160cabdff1aSopenharmony_ci        sqxtun          v0.8b, v24.8h
1161cabdff1aSopenharmony_ci        sqxtun2         v2.16b, v18.8h
1162cabdff1aSopenharmony_ci        sqxtun2         v0.16b, v1.8h
1163cabdff1aSopenharmony_ci        st1             {v2.16b}, [x3], x1
1164cabdff1aSopenharmony_ci        st1             {v0.16b}, [x3]
1165cabdff1aSopenharmony_ci1:      ret
1166cabdff1aSopenharmony_ciendfunc
1167cabdff1aSopenharmony_ci
1168cabdff1aSopenharmony_ci// VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of horizontally-neighbouring blocks
1169cabdff1aSopenharmony_ci// On entry:
1170cabdff1aSopenharmony_ci//   x0 -> top-left pel of right block
1171cabdff1aSopenharmony_ci//   x1 = row stride, bytes
1172cabdff1aSopenharmony_ci//   w2 = PQUANT bitstream parameter
1173cabdff1aSopenharmony_cifunction ff_vc1_h_loop_filter16_neon, export=1
        // Outline:
        //   1. Load 8 bytes (P1..P8) from each of 16 rows, starting 4 bytes to the
        //      left of x0, interleaved with a byte/halfword/word transpose so that
        //      each Pn ends up as two 8-element vectors ([0..7] and [8..15]).
        //   2. Widen to 16 bits and evaluate a0, a1, a2, clip and the per-row
        //      "do not filter" masks (a mask lane is all-ones when the row is skipped).
        //   3. Rows are handled in groups of four: the mask lanes of rows 2, 6, 10
        //      and 14 are copied to w2, w3, w7 and w8 and gate the write-back of
        //      their whole group.
        //   4. Store the updated P4/P5 byte pair of every row in each enabled group.
        // Vector registers used: v0-v7 and v16-v31; gp registers: x0-x10.
        // The stack is not touched.
1174cabdff1aSopenharmony_ci        sub             x3, x0, #4              // where to start reading (P1 is 4 bytes left of x0)
1175cabdff1aSopenharmony_ci        ldr             d0, .Lcoeffs            // multipliers: the comments below use v0.h[0] as 2 and v0.h[1] as 5 (table defined outside this function)
1176cabdff1aSopenharmony_ci        ld1             {v1.8b}, [x3], x1       // row 0: P1[0], P2[0]...
1177cabdff1aSopenharmony_ci        sub             x0, x0, #1              // where to start writing (P4 of row 0)
1178cabdff1aSopenharmony_ci        ld1             {v2.8b}, [x3], x1       // row 1
1179cabdff1aSopenharmony_ci        add             x4, x0, x1, lsl #3      // x4 -> P4 of row 8
1180cabdff1aSopenharmony_ci        ld1             {v3.8b}, [x3], x1       // row 2
1181cabdff1aSopenharmony_ci        add             x5, x0, x1, lsl #2      // x5 -> P4 of row 4
1182cabdff1aSopenharmony_ci        ld1             {v4.8b}, [x3], x1       // row 3
1183cabdff1aSopenharmony_ci        add             x6, x4, x1, lsl #2      // x6 -> P4 of row 12
1184cabdff1aSopenharmony_ci        ld1             {v5.8b}, [x3], x1       // row 4
1185cabdff1aSopenharmony_ci        ld1             {v6.8b}, [x3], x1       // row 5
1186cabdff1aSopenharmony_ci        ld1             {v7.8b}, [x3], x1       // row 6
1187cabdff1aSopenharmony_ci        trn1            v16.8b, v1.8b, v2.8b    // P1[0], P1[1], P3[0]...
1188cabdff1aSopenharmony_ci        ld1             {v17.8b}, [x3], x1      // row 7
1189cabdff1aSopenharmony_ci        trn2            v1.8b, v1.8b, v2.8b     // P2[0], P2[1], P4[0]...
1190cabdff1aSopenharmony_ci        ld1             {v2.8b}, [x3], x1       // row 8 (v2 is free again after the trn2 above)
1191cabdff1aSopenharmony_ci        trn1            v18.8b, v3.8b, v4.8b    // P1[2], P1[3], P3[2]...
1192cabdff1aSopenharmony_ci        ld1             {v19.8b}, [x3], x1      // row 9
1193cabdff1aSopenharmony_ci        trn2            v3.8b, v3.8b, v4.8b     // P2[2], P2[3], P4[2]...
1194cabdff1aSopenharmony_ci        ld1             {v4.8b}, [x3], x1       // row 10
1195cabdff1aSopenharmony_ci        trn1            v20.8b, v5.8b, v6.8b    // P1[4], P1[5], P3[4]...
1196cabdff1aSopenharmony_ci        ld1             {v21.8b}, [x3], x1      // row 11
1197cabdff1aSopenharmony_ci        trn2            v5.8b, v5.8b, v6.8b     // P2[4], P2[5], P4[4]...
1198cabdff1aSopenharmony_ci        ld1             {v6.8b}, [x3], x1       // row 12
1199cabdff1aSopenharmony_ci        trn1            v22.8b, v7.8b, v17.8b   // P1[6], P1[7], P3[6]...
1200cabdff1aSopenharmony_ci        ld1             {v23.8b}, [x3], x1      // row 13
1201cabdff1aSopenharmony_ci        trn2            v7.8b, v7.8b, v17.8b    // P2[6], P2[7], P4[6]...
1202cabdff1aSopenharmony_ci        ld1             {v17.8b}, [x3], x1      // row 14
1203cabdff1aSopenharmony_ci        trn1            v24.8b, v2.8b, v19.8b   // P1[8], P1[9], P3[8]...
1204cabdff1aSopenharmony_ci        ld1             {v25.8b}, [x3]          // row 15 (last row, no post-increment)
1205cabdff1aSopenharmony_ci        trn2            v2.8b, v2.8b, v19.8b    // P2[8], P2[9], P4[8]...
1206cabdff1aSopenharmony_ci        trn1            v19.4h, v16.4h, v18.4h  // P1[0], P1[1], P1[2], P1[3], P5[0]...
1207cabdff1aSopenharmony_ci        trn1            v26.8b, v4.8b, v21.8b   // P1[10], P1[11], P3[10]...
1208cabdff1aSopenharmony_ci        trn2            v4.8b, v4.8b, v21.8b    // P2[10], P2[11], P4[10]...
1209cabdff1aSopenharmony_ci        trn1            v21.4h, v1.4h, v3.4h    // P2[0], P2[1], P2[2], P2[3], P6[0]...
1210cabdff1aSopenharmony_ci        trn1            v27.4h, v20.4h, v22.4h  // P1[4], P1[5], P1[6], P1[7], P5[4]...
1211cabdff1aSopenharmony_ci        trn1            v28.8b, v6.8b, v23.8b   // P1[12], P1[13], P3[12]...
1212cabdff1aSopenharmony_ci        trn2            v6.8b, v6.8b, v23.8b    // P2[12], P2[13], P4[12]...
1213cabdff1aSopenharmony_ci        trn1            v23.4h, v5.4h, v7.4h    // P2[4], P2[5], P2[6], P2[7], P6[4]...
1214cabdff1aSopenharmony_ci        trn1            v29.4h, v24.4h, v26.4h  // P1[8], P1[9], P1[10], P1[11], P5[8]...
1215cabdff1aSopenharmony_ci        trn1            v30.8b, v17.8b, v25.8b  // P1[14], P1[15], P3[14]...
1216cabdff1aSopenharmony_ci        trn2            v17.8b, v17.8b, v25.8b  // P2[14], P2[15], P4[14]...
1217cabdff1aSopenharmony_ci        trn1            v25.4h, v2.4h, v4.4h    // P2[8], P2[9], P2[10], P2[11], P6[8]...
1218cabdff1aSopenharmony_ci        trn1            v31.2s, v19.2s, v27.2s  // P1[0..7]
1219cabdff1aSopenharmony_ci        trn2            v19.2s, v19.2s, v27.2s  // P5[0..7]
1220cabdff1aSopenharmony_ci        trn1            v27.2s, v21.2s, v23.2s  // P2[0..7]
1221cabdff1aSopenharmony_ci        trn2            v21.2s, v21.2s, v23.2s  // P6[0..7]
1222cabdff1aSopenharmony_ci        trn1            v23.4h, v28.4h, v30.4h  // P1[12], P1[13], P1[14], P1[15], P5[12]...
1223cabdff1aSopenharmony_ci        trn2            v16.4h, v16.4h, v18.4h  // P3[0], P3[1], P3[2], P3[3], P7[0]...
1224cabdff1aSopenharmony_ci        trn1            v18.4h, v6.4h, v17.4h   // P2[12], P2[13], P2[14], P2[15], P6[12]...
1225cabdff1aSopenharmony_ci        trn2            v20.4h, v20.4h, v22.4h  // P3[4], P3[5], P3[6], P3[7], P7[4]...
1226cabdff1aSopenharmony_ci        trn2            v22.4h, v24.4h, v26.4h  // P3[8], P3[9], P3[10], P3[11], P7[8]...
1227cabdff1aSopenharmony_ci        trn1            v24.2s, v29.2s, v23.2s  // P1[8..15]
1228cabdff1aSopenharmony_ci        trn2            v23.2s, v29.2s, v23.2s  // P5[8..15]
1229cabdff1aSopenharmony_ci        trn1            v26.2s, v25.2s, v18.2s  // P2[8..15]
1230cabdff1aSopenharmony_ci        trn2            v18.2s, v25.2s, v18.2s  // P6[8..15]
1231cabdff1aSopenharmony_ci        trn2            v25.4h, v28.4h, v30.4h  // P3[12], P3[13], P3[14], P3[15], P7[12]...
1232cabdff1aSopenharmony_ci        trn2            v1.4h, v1.4h, v3.4h     // P4[0], P4[1], P4[2], P4[3], P8[0]...
1233cabdff1aSopenharmony_ci        trn2            v3.4h, v5.4h, v7.4h     // P4[4], P4[5], P4[6], P4[7], P8[4]...
1234cabdff1aSopenharmony_ci        trn2            v2.4h, v2.4h, v4.4h     // P4[8], P4[9], P4[10], P4[11], P8[8]...
1235cabdff1aSopenharmony_ci        trn2            v4.4h, v6.4h, v17.4h    // P4[12], P4[13], P4[14], P4[15], P8[12]...
        // From here on the pixels are widened to 16 bits; the remaining
        // 32-bit transposes are interleaved with the start of the arithmetic.
1236cabdff1aSopenharmony_ci        ushll           v5.8h, v31.8b, #1       // 2*P1[0..7]
1237cabdff1aSopenharmony_ci        ushll           v6.8h, v19.8b, #1       // 2*P5[0..7]
1238cabdff1aSopenharmony_ci        trn1            v7.2s, v16.2s, v20.2s   // P3[0..7]
1239cabdff1aSopenharmony_ci        uxtl            v17.8h, v27.8b          // P2[0..7]
1240cabdff1aSopenharmony_ci        trn2            v16.2s, v16.2s, v20.2s  // P7[0..7]
1241cabdff1aSopenharmony_ci        uxtl            v20.8h, v21.8b          // P6[0..7]
1242cabdff1aSopenharmony_ci        trn1            v21.2s, v22.2s, v25.2s  // P3[8..15]
1243cabdff1aSopenharmony_ci        ushll           v24.8h, v24.8b, #1      // 2*P1[8..15]
1244cabdff1aSopenharmony_ci        trn2            v22.2s, v22.2s, v25.2s  // P7[8..15]
1245cabdff1aSopenharmony_ci        ushll           v25.8h, v23.8b, #1      // 2*P5[8..15]
1246cabdff1aSopenharmony_ci        trn1            v27.2s, v1.2s, v3.2s    // P4[0..7]
1247cabdff1aSopenharmony_ci        uxtl            v26.8h, v26.8b          // P2[8..15]
1248cabdff1aSopenharmony_ci        mls             v5.8h, v17.8h, v0.h[1]  // 2*P1[0..7]-5*P2[0..7]
1249cabdff1aSopenharmony_ci        uxtl            v17.8h, v18.8b          // P6[8..15]
1250cabdff1aSopenharmony_ci        mls             v6.8h, v20.8h, v0.h[1]  // 2*P5[0..7]-5*P6[0..7]
1251cabdff1aSopenharmony_ci        trn1            v18.2s, v2.2s, v4.2s    // P4[8..15]
1252cabdff1aSopenharmony_ci        uxtl            v28.8h, v7.8b           // P3[0..7]
1253cabdff1aSopenharmony_ci        mls             v24.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15]
1254cabdff1aSopenharmony_ci        uxtl            v16.8h, v16.8b          // P7[0..7]
1255cabdff1aSopenharmony_ci        uxtl            v26.8h, v21.8b          // P3[8..15]
1256cabdff1aSopenharmony_ci        mls             v25.8h, v17.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15]
1257cabdff1aSopenharmony_ci        uxtl            v22.8h, v22.8b          // P7[8..15]
1258cabdff1aSopenharmony_ci        ushll           v7.8h, v7.8b, #1        // 2*P3[0..7]
1259cabdff1aSopenharmony_ci        uxtl            v27.8h, v27.8b          // P4[0..7]
1260cabdff1aSopenharmony_ci        trn2            v1.2s, v1.2s, v3.2s     // P8[0..7]
1261cabdff1aSopenharmony_ci        ushll           v3.8h, v21.8b, #1       // 2*P3[8..15]
1262cabdff1aSopenharmony_ci        trn2            v2.2s, v2.2s, v4.2s     // P8[8..15]
1263cabdff1aSopenharmony_ci        uxtl            v4.8h, v18.8b           // P4[8..15]
1264cabdff1aSopenharmony_ci        mla             v5.8h, v28.8h, v0.h[1]  // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
1265cabdff1aSopenharmony_ci        uxtl            v1.8h, v1.8b            // P8[0..7]
1266cabdff1aSopenharmony_ci        mla             v6.8h, v16.8h, v0.h[1]  // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
1267cabdff1aSopenharmony_ci        uxtl            v2.8h, v2.8b            // P8[8..15]
1268cabdff1aSopenharmony_ci        uxtl            v16.8h, v19.8b          // P5[0..7] (v16 no longer needed as P7[0..7])
1269cabdff1aSopenharmony_ci        mla             v24.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
1270cabdff1aSopenharmony_ci        uxtl            v18.8h, v23.8b          // P5[8..15]
1271cabdff1aSopenharmony_ci        dup             v19.8h, w2              // pq
1272cabdff1aSopenharmony_ci        mla             v25.8h, v22.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
1273cabdff1aSopenharmony_ci        sub             v21.8h, v27.8h, v16.8h  // P4[0..7]-P5[0..7]
1274cabdff1aSopenharmony_ci        sub             v22.8h, v4.8h, v18.8h   // P4[8..15]-P5[8..15]
1275cabdff1aSopenharmony_ci        mls             v7.8h, v27.8h, v0.h[1]  // 2*P3[0..7]-5*P4[0..7]
1276cabdff1aSopenharmony_ci        abs             v23.8h, v21.8h          // |P4[0..7]-P5[0..7]|
1277cabdff1aSopenharmony_ci        mls             v3.8h, v4.8h, v0.h[1]   // 2*P3[8..15]-5*P4[8..15]
1278cabdff1aSopenharmony_ci        abs             v26.8h, v22.8h          // |P4[8..15]-P5[8..15]|
1279cabdff1aSopenharmony_ci        sshr            v21.8h, v21.8h, #8      // clip_sign[0..7]
1280cabdff1aSopenharmony_ci        mls             v5.8h, v27.8h, v0.h[0]  // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
1281cabdff1aSopenharmony_ci        sshr            v23.8h, v23.8h, #1      // clip[0..7]
1282cabdff1aSopenharmony_ci        sshr            v26.8h, v26.8h, #1      // clip[8..15]
1283cabdff1aSopenharmony_ci        mls             v6.8h, v1.8h, v0.h[0]   // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
1284cabdff1aSopenharmony_ci        sshr            v1.8h, v22.8h, #8       // clip_sign[8..15]
1285cabdff1aSopenharmony_ci        cmeq            v22.8h, v23.8h, #0      // test clip[0..7] == 0
1286cabdff1aSopenharmony_ci        mls             v24.8h, v4.8h, v0.h[0]  // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
1287cabdff1aSopenharmony_ci        cmeq            v28.8h, v26.8h, #0      // test clip[8..15] == 0
1288cabdff1aSopenharmony_ci        srshr           v5.8h, v5.8h, #3        // rounding >>3 of the a1[0..7] sum
1289cabdff1aSopenharmony_ci        mls             v25.8h, v2.8h, v0.h[0]  // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
1290cabdff1aSopenharmony_ci        srshr           v2.8h, v6.8h, #3        // rounding >>3 of the a2[0..7] sum
1291cabdff1aSopenharmony_ci        mla             v7.8h, v16.8h, v0.h[1]  // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
1292cabdff1aSopenharmony_ci        srshr           v6.8h, v24.8h, #3       // rounding >>3 of the a1[8..15] sum
1293cabdff1aSopenharmony_ci        mla             v3.8h, v18.8h, v0.h[1]  // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
1294cabdff1aSopenharmony_ci        abs             v5.8h, v5.8h            // a1[0..7]
1295cabdff1aSopenharmony_ci        srshr           v24.8h, v25.8h, #3      // rounding >>3 of the a2[8..15] sum
1296cabdff1aSopenharmony_ci        mls             v3.8h, v17.8h, v0.h[0]  // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
1297cabdff1aSopenharmony_ci        abs             v2.8h, v2.8h            // a2[0..7]
1298cabdff1aSopenharmony_ci        abs             v6.8h, v6.8h            // a1[8..15]
1299cabdff1aSopenharmony_ci        mls             v7.8h, v20.8h, v0.h[0]  // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
1300cabdff1aSopenharmony_ci        abs             v17.8h, v24.8h          // a2[8..15]
1301cabdff1aSopenharmony_ci        cmhs            v20.8h, v5.8h, v2.8h    // test a1[0..7] >= a2[0..7]
1302cabdff1aSopenharmony_ci        srshr           v3.8h, v3.8h, #3        // signed a0[8..15] (rounding >>3)
1303cabdff1aSopenharmony_ci        cmhs            v24.8h, v6.8h, v17.8h   // test a1[8..15] >= a2[8..15]
1304cabdff1aSopenharmony_ci        srshr           v7.8h, v7.8h, #3        // signed a0[0..7] (rounding >>3)
1305cabdff1aSopenharmony_ci        bsl             v20.16b, v2.16b, v5.16b // a3[0..7] = min(a1[0..7], a2[0..7])
1306cabdff1aSopenharmony_ci        abs             v2.8h, v3.8h            // a0[8..15]
1307cabdff1aSopenharmony_ci        sshr            v3.8h, v3.8h, #8        // a0_sign[8..15]
1308cabdff1aSopenharmony_ci        bsl             v24.16b, v17.16b, v6.16b // a3[8..15] = min(a1[8..15], a2[8..15])
1309cabdff1aSopenharmony_ci        abs             v5.8h, v7.8h            // a0[0..7]
1310cabdff1aSopenharmony_ci        sshr            v6.8h, v7.8h, #8        // a0_sign[0..7]
1311cabdff1aSopenharmony_ci        cmhs            v7.8h, v2.8h, v19.8h    // test a0[8..15] >= pq
1312cabdff1aSopenharmony_ci        sub             v1.8h, v1.8h, v3.8h     // clip_sign[8..15] - a0_sign[8..15]
1313cabdff1aSopenharmony_ci        uqsub           v3.8h, v2.8h, v24.8h    // a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
1314cabdff1aSopenharmony_ci        cmhs            v2.8h, v24.8h, v2.8h    // test a3[8..15] >= a0[8..15]
1315cabdff1aSopenharmony_ci        uqsub           v17.8h, v5.8h, v20.8h   // a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
1316cabdff1aSopenharmony_ci        cmhs            v19.8h, v5.8h, v19.8h   // test a0[0..7] >= pq
1317cabdff1aSopenharmony_ci        orr             v7.16b, v28.16b, v7.16b // test clip[8..15] == 0 || a0[8..15] >= pq
1318cabdff1aSopenharmony_ci        sub             v6.8h, v21.8h, v6.8h    // clip_sign[0..7] - a0_sign[0..7]
1319cabdff1aSopenharmony_ci        mul             v3.8h, v3.8h, v0.h[1]   // a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
1320cabdff1aSopenharmony_ci        cmhs            v5.8h, v20.8h, v5.8h    // test a3[0..7] >= a0[0..7]
1321cabdff1aSopenharmony_ci        orr             v19.16b, v22.16b, v19.16b // test clip[0..7] == 0 || a0[0..7] >= pq
1322cabdff1aSopenharmony_ci        mul             v0.8h, v17.8h, v0.h[1]  // a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0  (last use of the constants; v0 is recycled)
1323cabdff1aSopenharmony_ci        orr             v2.16b, v7.16b, v2.16b  // test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
1324cabdff1aSopenharmony_ci        orr             v5.16b, v19.16b, v5.16b // test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
1325cabdff1aSopenharmony_ci        ushr            v3.8h, v3.8h, #3        // a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
1326cabdff1aSopenharmony_ci        mov             w7, v2.s[1]             // 16-bit lanes 2,3 of the [8..15] mask: bit 0 = "don't filter" flag of row 10
1327cabdff1aSopenharmony_ci        mov             w8, v2.s[3]             // 16-bit lanes 6,7 of the [8..15] mask: bit 0 = "don't filter" flag of row 14
1328cabdff1aSopenharmony_ci        ushr            v0.8h, v0.8h, #3        // a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
1329cabdff1aSopenharmony_ci        mov             w2, v5.s[1]             // move to gp reg: bit 0 = "don't filter" flag of row 2
1330cabdff1aSopenharmony_ci        cmhs            v2.8h, v3.8h, v26.8h    // test d[8..15] >= clip[8..15]
1331cabdff1aSopenharmony_ci        mov             w3, v5.s[3]             // bit 0 = "don't filter" flag of row 6
1332cabdff1aSopenharmony_ci        cmhs            v5.8h, v0.8h, v23.8h    // test d[0..7] >= clip[0..7]
1333cabdff1aSopenharmony_ci        bsl             v2.16b, v26.16b, v3.16b // FFMIN(d[8..15], clip[8..15])
1334cabdff1aSopenharmony_ci        and             w9, w7, w8              // bit 0 set only if rows 10 and 14 are both unfiltered
1335cabdff1aSopenharmony_ci        bsl             v5.16b, v23.16b, v0.16b // FFMIN(d[0..7], clip[0..7])
1336cabdff1aSopenharmony_ci        and             w10, w2, w3             // bit 0 set only if rows 2 and 6 are both unfiltered
1337cabdff1aSopenharmony_ci        bic             v0.16b, v2.16b, v7.16b  // set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
1338cabdff1aSopenharmony_ci        and             w9, w10, w9             // bit 0 set only if all four deciding rows are unfiltered
1339cabdff1aSopenharmony_ci        bic             v2.16b, v5.16b, v19.16b // set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
1340cabdff1aSopenharmony_ci        mls             v4.8h, v0.8h, v1.8h     // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4[8..15]
1341cabdff1aSopenharmony_ci        tbnz            w9, #0, 4f              // none of the 16 pixel pairs should be updated in this case
1342cabdff1aSopenharmony_ci        mls             v27.8h, v2.8h, v6.8h    // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4[0..7]
1343cabdff1aSopenharmony_ci        mla             v16.8h, v2.8h, v6.8h    // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5[0..7]
1344cabdff1aSopenharmony_ci        sqxtun          v2.8b, v4.8h            // new P4[8..15], saturated to unsigned bytes
1345cabdff1aSopenharmony_ci        mla             v18.8h, v0.8h, v1.8h    // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5[8..15]
1346cabdff1aSopenharmony_ci        sqxtun          v0.8b, v27.8h           // new P4[0..7], saturated to unsigned bytes
1347cabdff1aSopenharmony_ci        sqxtun          v1.8b, v16.8h           // new P5[0..7], saturated to unsigned bytes
1348cabdff1aSopenharmony_ci        sqxtun          v3.8b, v18.8h           // new P5[8..15], saturated to unsigned bytes
        // Write-back: each st2 stores one {P4, P5} byte pair for a single row.
1349cabdff1aSopenharmony_ci        tbnz            w2, #0, 1f              // skip rows 0..3 if row 2 is flagged "don't filter"
1350cabdff1aSopenharmony_ci        st2             {v0.b, v1.b}[0], [x0], x1 // row 0
1351cabdff1aSopenharmony_ci        st2             {v0.b, v1.b}[1], [x0], x1 // row 1
1352cabdff1aSopenharmony_ci        st2             {v0.b, v1.b}[2], [x0], x1 // row 2
1353cabdff1aSopenharmony_ci        st2             {v0.b, v1.b}[3], [x0]   // row 3
1354cabdff1aSopenharmony_ci1:      tbnz            w3, #0, 2f              // skip rows 4..7 if row 6 is flagged "don't filter"
1355cabdff1aSopenharmony_ci        st2             {v0.b, v1.b}[4], [x5], x1 // row 4
1356cabdff1aSopenharmony_ci        st2             {v0.b, v1.b}[5], [x5], x1 // row 5
1357cabdff1aSopenharmony_ci        st2             {v0.b, v1.b}[6], [x5], x1 // row 6
1358cabdff1aSopenharmony_ci        st2             {v0.b, v1.b}[7], [x5]   // row 7
1359cabdff1aSopenharmony_ci2:      tbnz            w7, #0, 3f              // skip rows 8..11 if row 10 is flagged "don't filter"
1360cabdff1aSopenharmony_ci        st2             {v2.b, v3.b}[0], [x4], x1 // row 8
1361cabdff1aSopenharmony_ci        st2             {v2.b, v3.b}[1], [x4], x1 // row 9
1362cabdff1aSopenharmony_ci        st2             {v2.b, v3.b}[2], [x4], x1 // row 10
1363cabdff1aSopenharmony_ci        st2             {v2.b, v3.b}[3], [x4]   // row 11
1364cabdff1aSopenharmony_ci3:      tbnz            w8, #0, 4f              // skip rows 12..15 if row 14 is flagged "don't filter"
1365cabdff1aSopenharmony_ci        st2             {v2.b, v3.b}[4], [x6], x1 // row 12
1366cabdff1aSopenharmony_ci        st2             {v2.b, v3.b}[5], [x6], x1 // row 13
1367cabdff1aSopenharmony_ci        st2             {v2.b, v3.b}[6], [x6], x1 // row 14
1368cabdff1aSopenharmony_ci        st2             {v2.b, v3.b}[7], [x6]   // row 15
1369cabdff1aSopenharmony_ci4:      ret
1370cabdff1aSopenharmony_ciendfunc
1371cabdff1aSopenharmony_ci
1372cabdff1aSopenharmony_ci// Copy at most the specified number of bytes from source to destination buffer,
1373cabdff1aSopenharmony_ci// stopping at a multiple of 32 bytes, none of which are the start of an escape sequence
1374cabdff1aSopenharmony_ci// On entry:
1375cabdff1aSopenharmony_ci//   x0 -> source buffer
1376cabdff1aSopenharmony_ci//   w1 = max number of bytes to copy
1377cabdff1aSopenharmony_ci//   x2 -> destination buffer, optimally 8-byte aligned
1378cabdff1aSopenharmony_ci// On exit:
1379cabdff1aSopenharmony_ci//   w0 = number of bytes not copied
1380cabdff1aSopenharmony_cifunction ff_vc1_unescape_buffer_helper_neon, export=1
1381cabdff1aSopenharmony_ci        // Offset by 80 to screen out cases that are too short for us to handle,
1382cabdff1aSopenharmony_ci        // and also make it easy to test for loop termination, or to determine
1383cabdff1aSopenharmony_ci        // whether we need an odd number of half-iterations of the loop.
1384cabdff1aSopenharmony_ci        subs            w1, w1, #80
1385cabdff1aSopenharmony_ci        b.mi            90f
1386cabdff1aSopenharmony_ci
1387cabdff1aSopenharmony_ci        // Set up useful constants
1388cabdff1aSopenharmony_ci        movi            v20.4s, #3, lsl #24
1389cabdff1aSopenharmony_ci        movi            v21.4s, #3, lsl #16
1390cabdff1aSopenharmony_ci
1391cabdff1aSopenharmony_ci        tst             w1, #32
1392cabdff1aSopenharmony_ci        b.ne            1f
1393cabdff1aSopenharmony_ci
1394cabdff1aSopenharmony_ci          ld1             {v0.16b, v1.16b, v2.16b}, [x0], #48
1395cabdff1aSopenharmony_ci          ext             v25.16b, v0.16b, v1.16b, #1
1396cabdff1aSopenharmony_ci          ext             v26.16b, v0.16b, v1.16b, #2
1397cabdff1aSopenharmony_ci          ext             v27.16b, v0.16b, v1.16b, #3
1398cabdff1aSopenharmony_ci          ext             v29.16b, v1.16b, v2.16b, #1
1399cabdff1aSopenharmony_ci          ext             v30.16b, v1.16b, v2.16b, #2
1400cabdff1aSopenharmony_ci          ext             v31.16b, v1.16b, v2.16b, #3
1401cabdff1aSopenharmony_ci          bic             v24.16b, v0.16b, v20.16b
1402cabdff1aSopenharmony_ci          bic             v25.16b, v25.16b, v20.16b
1403cabdff1aSopenharmony_ci          bic             v26.16b, v26.16b, v20.16b
1404cabdff1aSopenharmony_ci          bic             v27.16b, v27.16b, v20.16b
1405cabdff1aSopenharmony_ci          bic             v28.16b, v1.16b, v20.16b
1406cabdff1aSopenharmony_ci          bic             v29.16b, v29.16b, v20.16b
1407cabdff1aSopenharmony_ci          bic             v30.16b, v30.16b, v20.16b
1408cabdff1aSopenharmony_ci          bic             v31.16b, v31.16b, v20.16b
1409cabdff1aSopenharmony_ci          eor             v24.16b, v24.16b, v21.16b
1410cabdff1aSopenharmony_ci          eor             v25.16b, v25.16b, v21.16b
1411cabdff1aSopenharmony_ci          eor             v26.16b, v26.16b, v21.16b
1412cabdff1aSopenharmony_ci          eor             v27.16b, v27.16b, v21.16b
1413cabdff1aSopenharmony_ci          eor             v28.16b, v28.16b, v21.16b
1414cabdff1aSopenharmony_ci          eor             v29.16b, v29.16b, v21.16b
1415cabdff1aSopenharmony_ci          eor             v30.16b, v30.16b, v21.16b
1416cabdff1aSopenharmony_ci          eor             v31.16b, v31.16b, v21.16b
1417cabdff1aSopenharmony_ci          cmeq            v24.4s, v24.4s, #0
1418cabdff1aSopenharmony_ci          cmeq            v25.4s, v25.4s, #0
1419cabdff1aSopenharmony_ci          cmeq            v26.4s, v26.4s, #0
1420cabdff1aSopenharmony_ci          cmeq            v27.4s, v27.4s, #0
1421cabdff1aSopenharmony_ci          add             w1, w1, #32
1422cabdff1aSopenharmony_ci          b               3f
1423cabdff1aSopenharmony_ci
1424cabdff1aSopenharmony_ci1:      ld1             {v3.16b, v4.16b, v5.16b}, [x0], #48
1425cabdff1aSopenharmony_ci        ext             v25.16b, v3.16b, v4.16b, #1
1426cabdff1aSopenharmony_ci        ext             v26.16b, v3.16b, v4.16b, #2
1427cabdff1aSopenharmony_ci        ext             v27.16b, v3.16b, v4.16b, #3
1428cabdff1aSopenharmony_ci        ext             v29.16b, v4.16b, v5.16b, #1
1429cabdff1aSopenharmony_ci        ext             v30.16b, v4.16b, v5.16b, #2
1430cabdff1aSopenharmony_ci        ext             v31.16b, v4.16b, v5.16b, #3
1431cabdff1aSopenharmony_ci        bic             v24.16b, v3.16b, v20.16b
1432cabdff1aSopenharmony_ci        bic             v25.16b, v25.16b, v20.16b
1433cabdff1aSopenharmony_ci        bic             v26.16b, v26.16b, v20.16b
1434cabdff1aSopenharmony_ci        bic             v27.16b, v27.16b, v20.16b
1435cabdff1aSopenharmony_ci        bic             v28.16b, v4.16b, v20.16b
1436cabdff1aSopenharmony_ci        bic             v29.16b, v29.16b, v20.16b
1437cabdff1aSopenharmony_ci        bic             v30.16b, v30.16b, v20.16b
1438cabdff1aSopenharmony_ci        bic             v31.16b, v31.16b, v20.16b
1439cabdff1aSopenharmony_ci        eor             v24.16b, v24.16b, v21.16b
1440cabdff1aSopenharmony_ci        eor             v25.16b, v25.16b, v21.16b
1441cabdff1aSopenharmony_ci        eor             v26.16b, v26.16b, v21.16b
1442cabdff1aSopenharmony_ci        eor             v27.16b, v27.16b, v21.16b
1443cabdff1aSopenharmony_ci        eor             v28.16b, v28.16b, v21.16b
1444cabdff1aSopenharmony_ci        eor             v29.16b, v29.16b, v21.16b
1445cabdff1aSopenharmony_ci        eor             v30.16b, v30.16b, v21.16b
1446cabdff1aSopenharmony_ci        eor             v31.16b, v31.16b, v21.16b
1447cabdff1aSopenharmony_ci        cmeq            v24.4s, v24.4s, #0
1448cabdff1aSopenharmony_ci        cmeq            v25.4s, v25.4s, #0
1449cabdff1aSopenharmony_ci        cmeq            v26.4s, v26.4s, #0
1450cabdff1aSopenharmony_ci        cmeq            v27.4s, v27.4s, #0
1451cabdff1aSopenharmony_ci        // Drop through...
1452cabdff1aSopenharmony_ci2:        mov             v0.16b, v5.16b
1453cabdff1aSopenharmony_ci          ld1             {v1.16b, v2.16b}, [x0], #32
1454cabdff1aSopenharmony_ci        cmeq            v28.4s, v28.4s, #0
1455cabdff1aSopenharmony_ci        cmeq            v29.4s, v29.4s, #0
1456cabdff1aSopenharmony_ci        cmeq            v30.4s, v30.4s, #0
1457cabdff1aSopenharmony_ci        cmeq            v31.4s, v31.4s, #0
1458cabdff1aSopenharmony_ci        orr             v24.16b, v24.16b, v25.16b
1459cabdff1aSopenharmony_ci        orr             v26.16b, v26.16b, v27.16b
1460cabdff1aSopenharmony_ci        orr             v28.16b, v28.16b, v29.16b
1461cabdff1aSopenharmony_ci        orr             v30.16b, v30.16b, v31.16b
1462cabdff1aSopenharmony_ci          ext             v25.16b, v0.16b, v1.16b, #1
1463cabdff1aSopenharmony_ci        orr             v22.16b, v24.16b, v26.16b
1464cabdff1aSopenharmony_ci          ext             v26.16b, v0.16b, v1.16b, #2
1465cabdff1aSopenharmony_ci          ext             v27.16b, v0.16b, v1.16b, #3
1466cabdff1aSopenharmony_ci          ext             v29.16b, v1.16b, v2.16b, #1
1467cabdff1aSopenharmony_ci        orr             v23.16b, v28.16b, v30.16b
1468cabdff1aSopenharmony_ci          ext             v30.16b, v1.16b, v2.16b, #2
1469cabdff1aSopenharmony_ci          ext             v31.16b, v1.16b, v2.16b, #3
1470cabdff1aSopenharmony_ci          bic             v24.16b, v0.16b, v20.16b
1471cabdff1aSopenharmony_ci          bic             v25.16b, v25.16b, v20.16b
1472cabdff1aSopenharmony_ci          bic             v26.16b, v26.16b, v20.16b
1473cabdff1aSopenharmony_ci        orr             v22.16b, v22.16b, v23.16b
1474cabdff1aSopenharmony_ci          bic             v27.16b, v27.16b, v20.16b
1475cabdff1aSopenharmony_ci          bic             v28.16b, v1.16b, v20.16b
1476cabdff1aSopenharmony_ci          bic             v29.16b, v29.16b, v20.16b
1477cabdff1aSopenharmony_ci          bic             v30.16b, v30.16b, v20.16b
1478cabdff1aSopenharmony_ci          bic             v31.16b, v31.16b, v20.16b
1479cabdff1aSopenharmony_ci        addv            s22, v22.4s
1480cabdff1aSopenharmony_ci          eor             v24.16b, v24.16b, v21.16b
1481cabdff1aSopenharmony_ci          eor             v25.16b, v25.16b, v21.16b
1482cabdff1aSopenharmony_ci          eor             v26.16b, v26.16b, v21.16b
1483cabdff1aSopenharmony_ci          eor             v27.16b, v27.16b, v21.16b
1484cabdff1aSopenharmony_ci          eor             v28.16b, v28.16b, v21.16b
1485cabdff1aSopenharmony_ci        mov             w3, v22.s[0]
1486cabdff1aSopenharmony_ci          eor             v29.16b, v29.16b, v21.16b
1487cabdff1aSopenharmony_ci          eor             v30.16b, v30.16b, v21.16b
1488cabdff1aSopenharmony_ci          eor             v31.16b, v31.16b, v21.16b
1489cabdff1aSopenharmony_ci          cmeq            v24.4s, v24.4s, #0
1490cabdff1aSopenharmony_ci          cmeq            v25.4s, v25.4s, #0
1491cabdff1aSopenharmony_ci          cmeq            v26.4s, v26.4s, #0
1492cabdff1aSopenharmony_ci          cmeq            v27.4s, v27.4s, #0
1493cabdff1aSopenharmony_ci        cbnz            w3, 90f
1494cabdff1aSopenharmony_ci        st1             {v3.16b, v4.16b}, [x2], #32
1495cabdff1aSopenharmony_ci3:          mov             v3.16b, v2.16b
1496cabdff1aSopenharmony_ci            ld1             {v4.16b, v5.16b}, [x0], #32
1497cabdff1aSopenharmony_ci          cmeq            v28.4s, v28.4s, #0
1498cabdff1aSopenharmony_ci          cmeq            v29.4s, v29.4s, #0
1499cabdff1aSopenharmony_ci          cmeq            v30.4s, v30.4s, #0
1500cabdff1aSopenharmony_ci          cmeq            v31.4s, v31.4s, #0
1501cabdff1aSopenharmony_ci          orr             v24.16b, v24.16b, v25.16b
1502cabdff1aSopenharmony_ci          orr             v26.16b, v26.16b, v27.16b
1503cabdff1aSopenharmony_ci          orr             v28.16b, v28.16b, v29.16b
1504cabdff1aSopenharmony_ci          orr             v30.16b, v30.16b, v31.16b
1505cabdff1aSopenharmony_ci            ext             v25.16b, v3.16b, v4.16b, #1
1506cabdff1aSopenharmony_ci          orr             v22.16b, v24.16b, v26.16b
1507cabdff1aSopenharmony_ci            ext             v26.16b, v3.16b, v4.16b, #2
1508cabdff1aSopenharmony_ci            ext             v27.16b, v3.16b, v4.16b, #3
1509cabdff1aSopenharmony_ci            ext             v29.16b, v4.16b, v5.16b, #1
1510cabdff1aSopenharmony_ci          orr             v23.16b, v28.16b, v30.16b
1511cabdff1aSopenharmony_ci            ext             v30.16b, v4.16b, v5.16b, #2
1512cabdff1aSopenharmony_ci            ext             v31.16b, v4.16b, v5.16b, #3
1513cabdff1aSopenharmony_ci            bic             v24.16b, v3.16b, v20.16b
1514cabdff1aSopenharmony_ci            bic             v25.16b, v25.16b, v20.16b
1515cabdff1aSopenharmony_ci            bic             v26.16b, v26.16b, v20.16b
1516cabdff1aSopenharmony_ci          orr             v22.16b, v22.16b, v23.16b
1517cabdff1aSopenharmony_ci            bic             v27.16b, v27.16b, v20.16b
1518cabdff1aSopenharmony_ci            bic             v28.16b, v4.16b, v20.16b
1519cabdff1aSopenharmony_ci            bic             v29.16b, v29.16b, v20.16b
1520cabdff1aSopenharmony_ci            bic             v30.16b, v30.16b, v20.16b
1521cabdff1aSopenharmony_ci            bic             v31.16b, v31.16b, v20.16b
1522cabdff1aSopenharmony_ci          addv            s22, v22.4s
1523cabdff1aSopenharmony_ci            eor             v24.16b, v24.16b, v21.16b
1524cabdff1aSopenharmony_ci            eor             v25.16b, v25.16b, v21.16b
1525cabdff1aSopenharmony_ci            eor             v26.16b, v26.16b, v21.16b
1526cabdff1aSopenharmony_ci            eor             v27.16b, v27.16b, v21.16b
1527cabdff1aSopenharmony_ci            eor             v28.16b, v28.16b, v21.16b
1528cabdff1aSopenharmony_ci          mov             w3, v22.s[0]
1529cabdff1aSopenharmony_ci            eor             v29.16b, v29.16b, v21.16b
1530cabdff1aSopenharmony_ci            eor             v30.16b, v30.16b, v21.16b
1531cabdff1aSopenharmony_ci            eor             v31.16b, v31.16b, v21.16b
1532cabdff1aSopenharmony_ci            cmeq            v24.4s, v24.4s, #0
1533cabdff1aSopenharmony_ci            cmeq            v25.4s, v25.4s, #0
1534cabdff1aSopenharmony_ci            cmeq            v26.4s, v26.4s, #0
1535cabdff1aSopenharmony_ci            cmeq            v27.4s, v27.4s, #0
1536cabdff1aSopenharmony_ci          cbnz            w3, 91f
1537cabdff1aSopenharmony_ci          st1             {v0.16b, v1.16b}, [x2], #32
1538cabdff1aSopenharmony_ci        subs            w1, w1, #64
1539cabdff1aSopenharmony_ci        b.pl            2b
1540cabdff1aSopenharmony_ci
1541cabdff1aSopenharmony_ci90:     add             w0, w1, #80
1542cabdff1aSopenharmony_ci        ret
1543cabdff1aSopenharmony_ci
1544cabdff1aSopenharmony_ci91:     sub             w1, w1, #32
1545cabdff1aSopenharmony_ci        b               90b
1546cabdff1aSopenharmony_ciendfunc
1547