/*
 * VC1 AArch64 NEON optimisations
 *
 * Copyright (c) 2022 Ben Avison <bavison@riscosopen.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

// VC-1 8x8 inverse transform
// On entry:
//   x0 -> array of 16-bit inverse transform coefficients, in column-major order
// On exit:
//   array at x0 updated to hold transformed block; also now held in row-major order
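//
// A rough scalar sketch (C-style, following the inline comments below) of
// one 8-point pass over inputs s0..s7; the first pass uses r = r2 = 4 and
// shift 3, the second pass r = 64, r2 = 65 and shift 7 (the code keeps
// most intermediates halved, as t1/2 etc., to stay within 16 bits):
//
//   t1 = 12 * (s0 + s4);          t2 = 12 * (s0 - s4);
//   t3 = 16 * s2 +  6 * s6;       t4 =  6 * s2 - 16 * s6;
//   t5 = t1 + t3;  t6 = t2 + t4;  t7 = t2 - t4;  t8 = t1 - t3;
//   t1 = 16 * s1 + 15 * s3 +  9 * s5 +  4 * s7;
//   t2 = 15 * s1 -  4 * s3 - 16 * s5 -  9 * s7;
//   t3 =  9 * s1 - 16 * s3 +  4 * s5 + 15 * s7;
//   t4 =  4 * s1 -  9 * s3 + 15 * s5 - 16 * s7;
//   d0 = (t5 + t1 + r) >> shift;  d7 = (t5 - t1 + r2) >> shift;
//   d1 = (t6 + t2 + r) >> shift;  d6 = (t6 - t2 + r2) >> shift;
//   d2 = (t7 + t3 + r) >> shift;  d5 = (t7 - t3 + r2) >> shift;
//   d3 = (t8 + t4 + r) >> shift;  d4 = (t8 - t4 + r2) >> shift;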
function ff_vc1_inv_trans_8x8_neon, export=1
        ld1             {v1.16b, v2.16b}, [x0], #32
        ld1             {v3.16b, v4.16b}, [x0], #32
        ld1             {v5.16b, v6.16b}, [x0], #32
        shl             v1.8h, v1.8h, #2        //         8/2 * src[0]
        sub             x1, x0, #3*32
        ld1             {v16.16b, v17.16b}, [x0]
        shl             v7.8h, v2.8h, #4        //          16 * src[8]
        shl             v18.8h, v2.8h, #2       //           4 * src[8]
        shl             v19.8h, v4.8h, #4       //                        16 * src[24]
        ldr             d0, .Lcoeffs_it8
        shl             v5.8h, v5.8h, #2        //                                      8/2 * src[32]
        shl             v20.8h, v6.8h, #4       //                                       16 * src[40]
        shl             v21.8h, v6.8h, #2       //                                        4 * src[40]
        shl             v22.8h, v17.8h, #4      //                                                      16 * src[56]
        ssra            v20.8h, v19.8h, #2      //                         4 * src[24] + 16 * src[40]
        mul             v23.8h, v3.8h, v0.h[0]  //                       6/2 * src[16]
        sub             v19.8h, v19.8h, v21.8h  //                        16 * src[24] -  4 * src[40]
        ssra            v7.8h, v22.8h, #2       //          16 * src[8]                               +  4 * src[56]
        sub             v18.8h, v22.8h, v18.8h  //        -  4 * src[8]                               + 16 * src[56]
        shl             v3.8h, v3.8h, #3        //                      16/2 * src[16]
        mls             v20.8h, v2.8h, v0.h[2]  //        - 15 * src[8] +  4 * src[24] + 16 * src[40]
        ssra            v1.8h, v1.8h, #1        //        12/2 * src[0]
        ssra            v5.8h, v5.8h, #1        //                                     12/2 * src[32]
        mla             v7.8h, v4.8h, v0.h[2]   //          16 * src[8] + 15 * src[24]                +  4 * src[56]
        shl             v21.8h, v16.8h, #3      //                                                    16/2 * src[48]
        mls             v19.8h, v2.8h, v0.h[1]  //        -  9 * src[8] + 16 * src[24] -  4 * src[40]
        sub             v2.8h, v23.8h, v21.8h   // t4/2 =                6/2 * src[16]              - 16/2 * src[48]
        mla             v18.8h, v4.8h, v0.h[1]  //        -  4 * src[8] +  9 * src[24]                + 16 * src[56]
        add             v4.8h, v1.8h, v5.8h     // t1/2 = 12/2 * src[0]              + 12/2 * src[32]
        sub             v1.8h, v1.8h, v5.8h     // t2/2 = 12/2 * src[0]              - 12/2 * src[32]
        mla             v3.8h, v16.8h, v0.h[0]  // t3/2 =               16/2 * src[16]              +  6/2 * src[48]
        mla             v7.8h, v6.8h, v0.h[1]   //  t1  =   16 * src[8] + 15 * src[24] +  9 * src[40] +  4 * src[56]
        add             v5.8h, v1.8h, v2.8h     // t6/2 = t2/2 + t4/2
        sub             v16.8h, v1.8h, v2.8h    // t7/2 = t2/2 - t4/2
        mla             v20.8h, v17.8h, v0.h[1] // -t2  = - 15 * src[8] +  4 * src[24] + 16 * src[40] +  9 * src[56]
        add             v21.8h, v1.8h, v2.8h    // t6/2 = t2/2 + t4/2
        add             v22.8h, v4.8h, v3.8h    // t5/2 = t1/2 + t3/2
        mls             v19.8h, v17.8h, v0.h[2] // -t3  = -  9 * src[8] + 16 * src[24] -  4 * src[40] - 15 * src[56]
        sub             v17.8h, v4.8h, v3.8h    // t8/2 = t1/2 - t3/2
        add             v23.8h, v4.8h, v3.8h    // t5/2 = t1/2 + t3/2
        mls             v18.8h, v6.8h, v0.h[2]  // -t4  = -  4 * src[8] +  9 * src[24] - 15 * src[40] + 16 * src[56]
        sub             v1.8h, v1.8h, v2.8h     // t7/2 = t2/2 - t4/2
        sub             v2.8h, v4.8h, v3.8h     // t8/2 = t1/2 - t3/2
        neg             v3.8h, v7.8h            // -t1
        neg             v4.8h, v20.8h           // +t2
        neg             v6.8h, v19.8h           // +t3
        ssra            v22.8h, v7.8h, #1       // (t5 + t1) >> 1
        ssra            v1.8h, v19.8h, #1       // (t7 - t3) >> 1
        neg             v7.8h, v18.8h           // +t4
        ssra            v5.8h, v4.8h, #1        // (t6 + t2) >> 1
        ssra            v16.8h, v6.8h, #1       // (t7 + t3) >> 1
        ssra            v2.8h, v18.8h, #1       // (t8 - t4) >> 1
        ssra            v17.8h, v7.8h, #1       // (t8 + t4) >> 1
        ssra            v21.8h, v20.8h, #1      // (t6 - t2) >> 1
        ssra            v23.8h, v3.8h, #1       // (t5 - t1) >> 1
        srshr           v3.8h, v22.8h, #2       // (t5 + t1 + 4) >> 3
        srshr           v4.8h, v5.8h, #2        // (t6 + t2 + 4) >> 3
        srshr           v5.8h, v16.8h, #2       // (t7 + t3 + 4) >> 3
        srshr           v6.8h, v17.8h, #2       // (t8 + t4 + 4) >> 3
        srshr           v2.8h, v2.8h, #2        // (t8 - t4 + 4) >> 3
        srshr           v1.8h, v1.8h, #2        // (t7 - t3 + 4) >> 3
        srshr           v7.8h, v21.8h, #2       // (t6 - t2 + 4) >> 3
        srshr           v16.8h, v23.8h, #2      // (t5 - t1 + 4) >> 3
        trn2            v17.8h, v3.8h, v4.8h
        trn2            v18.8h, v5.8h, v6.8h
        trn2            v19.8h, v2.8h, v1.8h
        trn2            v20.8h, v7.8h, v16.8h
        trn1            v21.4s, v17.4s, v18.4s
        trn2            v17.4s, v17.4s, v18.4s
        trn1            v18.4s, v19.4s, v20.4s
        trn2            v19.4s, v19.4s, v20.4s
        trn1            v3.8h, v3.8h, v4.8h
        trn2            v4.2d, v21.2d, v18.2d
        trn1            v20.2d, v17.2d, v19.2d
        trn1            v5.8h, v5.8h, v6.8h
        trn1            v1.8h, v2.8h, v1.8h
        trn1            v2.8h, v7.8h, v16.8h
        trn1            v6.2d, v21.2d, v18.2d
        trn2            v7.2d, v17.2d, v19.2d
        shl             v16.8h, v20.8h, #4      //                        16 * src[24]
        shl             v17.8h, v4.8h, #4       //                                       16 * src[40]
        trn1            v18.4s, v3.4s, v5.4s
        trn1            v19.4s, v1.4s, v2.4s
        shl             v21.8h, v7.8h, #4       //                                                      16 * src[56]
        shl             v22.8h, v6.8h, #2       //           4 * src[8]
        shl             v23.8h, v4.8h, #2       //                                        4 * src[40]
        trn2            v3.4s, v3.4s, v5.4s
        trn2            v1.4s, v1.4s, v2.4s
        shl             v2.8h, v6.8h, #4        //          16 * src[8]
        sub             v5.8h, v16.8h, v23.8h   //                        16 * src[24] -  4 * src[40]
        ssra            v17.8h, v16.8h, #2      //                         4 * src[24] + 16 * src[40]
        sub             v16.8h, v21.8h, v22.8h  //        -  4 * src[8]                               + 16 * src[56]
        trn1            v22.2d, v18.2d, v19.2d
        trn2            v18.2d, v18.2d, v19.2d
        trn1            v19.2d, v3.2d, v1.2d
        ssra            v2.8h, v21.8h, #2       //          16 * src[8]                               +  4 * src[56]
        mls             v17.8h, v6.8h, v0.h[2]  //        - 15 * src[8] +  4 * src[24] + 16 * src[40]
        shl             v21.8h, v22.8h, #2      //         8/2 * src[0]
        shl             v18.8h, v18.8h, #2      //                                      8/2 * src[32]
        mls             v5.8h, v6.8h, v0.h[1]   //        -  9 * src[8] + 16 * src[24] -  4 * src[40]
        shl             v6.8h, v19.8h, #3       //                      16/2 * src[16]
        trn2            v1.2d, v3.2d, v1.2d
        mla             v16.8h, v20.8h, v0.h[1] //        -  4 * src[8] +  9 * src[24]                + 16 * src[56]
        ssra            v21.8h, v21.8h, #1      //        12/2 * src[0]
        ssra            v18.8h, v18.8h, #1      //                                     12/2 * src[32]
        mul             v3.8h, v19.8h, v0.h[0]  //                       6/2 * src[16]
        shl             v19.8h, v1.8h, #3       //                                                    16/2 * src[48]
        mla             v2.8h, v20.8h, v0.h[2]  //          16 * src[8] + 15 * src[24]                +  4 * src[56]
        add             v20.8h, v21.8h, v18.8h  // t1/2 = 12/2 * src[0]              + 12/2 * src[32]
        mla             v6.8h, v1.8h, v0.h[0]   // t3/2 =               16/2 * src[16]              +  6/2 * src[48]
        sub             v1.8h, v21.8h, v18.8h   // t2/2 = 12/2 * src[0]              - 12/2 * src[32]
        sub             v3.8h, v3.8h, v19.8h    // t4/2 =                6/2 * src[16]              - 16/2 * src[48]
        mla             v17.8h, v7.8h, v0.h[1]  // -t2  = - 15 * src[8] +  4 * src[24] + 16 * src[40] +  9 * src[56]
        mls             v5.8h, v7.8h, v0.h[2]   // -t3  = -  9 * src[8] + 16 * src[24] -  4 * src[40] - 15 * src[56]
        add             v7.8h, v1.8h, v3.8h     // t6/2 = t2/2 + t4/2
        add             v18.8h, v20.8h, v6.8h   // t5/2 = t1/2 + t3/2
        mls             v16.8h, v4.8h, v0.h[2]  // -t4  = -  4 * src[8] +  9 * src[24] - 15 * src[40] + 16 * src[56]
        sub             v19.8h, v1.8h, v3.8h    // t7/2 = t2/2 - t4/2
        neg             v21.8h, v17.8h          // +t2
        mla             v2.8h, v4.8h, v0.h[1]   //  t1  =   16 * src[8] + 15 * src[24] +  9 * src[40] +  4 * src[56]
        sub             v0.8h, v20.8h, v6.8h    // t8/2 = t1/2 - t3/2
        neg             v4.8h, v5.8h            // +t3
        sub             v22.8h, v1.8h, v3.8h    // t7/2 = t2/2 - t4/2
        sub             v23.8h, v20.8h, v6.8h   // t8/2 = t1/2 - t3/2
        neg             v24.8h, v16.8h          // +t4
        add             v6.8h, v20.8h, v6.8h    // t5/2 = t1/2 + t3/2
        add             v1.8h, v1.8h, v3.8h     // t6/2 = t2/2 + t4/2
        ssra            v7.8h, v21.8h, #1       // (t6 + t2) >> 1
        neg             v3.8h, v2.8h            // -t1
        ssra            v18.8h, v2.8h, #1       // (t5 + t1) >> 1
        ssra            v19.8h, v4.8h, #1       // (t7 + t3) >> 1
        ssra            v0.8h, v24.8h, #1       // (t8 + t4) >> 1
        srsra           v23.8h, v16.8h, #1      // (t8 - t4 + 1) >> 1
        srsra           v22.8h, v5.8h, #1       // (t7 - t3 + 1) >> 1
        srsra           v1.8h, v17.8h, #1       // (t6 - t2 + 1) >> 1
        srsra           v6.8h, v3.8h, #1        // (t5 - t1 + 1) >> 1
        srshr           v2.8h, v18.8h, #6       // (t5 + t1 + 64) >> 7
        srshr           v3.8h, v7.8h, #6        // (t6 + t2 + 64) >> 7
        srshr           v4.8h, v19.8h, #6       // (t7 + t3 + 64) >> 7
        srshr           v5.8h, v0.8h, #6        // (t8 + t4 + 64) >> 7
        srshr           v16.8h, v23.8h, #6      // (t8 - t4 + 65) >> 7
        srshr           v17.8h, v22.8h, #6      // (t7 - t3 + 65) >> 7
        st1             {v2.16b, v3.16b}, [x1], #32
        srshr           v0.8h, v1.8h, #6        // (t6 - t2 + 65) >> 7
        srshr           v1.8h, v6.8h, #6        // (t5 - t1 + 65) >> 7
        st1             {v4.16b, v5.16b}, [x1], #32
        st1             {v16.16b, v17.16b}, [x1], #32
        st1             {v0.16b, v1.16b}, [x1]
        ret
endfunc

// VC-1 8x4 inverse transform
// On entry:
//   x0 -> array of 8-bit samples, in row-major order
//   x1 = row stride for 8-bit sample array
//   x2 -> array of 16-bit inverse transform coefficients, in row-major order
// On exit:
//   array at x0 updated by saturated addition of (narrowed) transformed block
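//
// First pass: the 8-point transform on rows, rounding (+4) >> 3 (see the
// sketch before ff_vc1_inv_trans_8x8_neon). Second pass: a 4-point
// transform on columns, which per the inline comments amounts to (s0..s3
// one column; the halved constants 5 = 10/2, 11 = 22/2 and 17 sit in the
// upper half of .Lcoeffs_it8):
//
//   t1 = 17 * (s0 + s2);        t2 = 17 * (s0 - s2);
//   t3 = 22 * s1 + 10 * s3;     t4 = 22 * s3 - 10 * s1;
//   d0 = (t1 + t3 + 64) >> 7;   d1 = (t2 - t4 + 64) >> 7;
//   d2 = (t2 + t4 + 64) >> 7;   d3 = (t1 - t3 + 64) >> 7;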
function ff_vc1_inv_trans_8x4_neon, export=1
        ld1             {v1.8b, v2.8b, v3.8b, v4.8b}, [x2], #32
        mov             x3, x0
        ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [x2]
        ldr             q0, .Lcoeffs_it8        // includes 4-point coefficients in upper half of vector
        ld1             {v5.8b}, [x0], x1
        trn2            v6.4h, v1.4h, v3.4h
        trn2            v7.4h, v2.4h, v4.4h
        trn1            v1.4h, v1.4h, v3.4h
        trn1            v2.4h, v2.4h, v4.4h
        trn2            v3.4h, v16.4h, v18.4h
        trn2            v4.4h, v17.4h, v19.4h
        trn1            v16.4h, v16.4h, v18.4h
        trn1            v17.4h, v17.4h, v19.4h
        ld1             {v18.8b}, [x0], x1
        trn1            v19.2s, v6.2s, v3.2s
        trn2            v3.2s, v6.2s, v3.2s
        trn1            v6.2s, v7.2s, v4.2s
        trn2            v4.2s, v7.2s, v4.2s
        trn1            v7.2s, v1.2s, v16.2s
        trn1            v20.2s, v2.2s, v17.2s
        shl             v21.4h, v19.4h, #4      //          16 * src[1]
        trn2            v1.2s, v1.2s, v16.2s
        shl             v16.4h, v3.4h, #4       //                        16 * src[3]
        trn2            v2.2s, v2.2s, v17.2s
        shl             v17.4h, v6.4h, #4       //                                      16 * src[5]
        ld1             {v22.8b}, [x0], x1
        shl             v23.4h, v4.4h, #4       //                                                    16 * src[7]
        mul             v24.4h, v1.4h, v0.h[0]  //                       6/2 * src[2]
        ld1             {v25.8b}, [x0]
        shl             v26.4h, v19.4h, #2      //           4 * src[1]
        shl             v27.4h, v6.4h, #2       //                                       4 * src[5]
        ssra            v21.4h, v23.4h, #2      //          16 * src[1]                             +  4 * src[7]
        ssra            v17.4h, v16.4h, #2      //                         4 * src[3] + 16 * src[5]
        sub             v23.4h, v23.4h, v26.4h  //        -  4 * src[1]                             + 16 * src[7]
        sub             v16.4h, v16.4h, v27.4h  //                        16 * src[3] -  4 * src[5]
        shl             v7.4h, v7.4h, #2        //         8/2 * src[0]
        shl             v20.4h, v20.4h, #2      //                                     8/2 * src[4]
        mla             v21.4h, v3.4h, v0.h[2]  //          16 * src[1] + 15 * src[3]               +  4 * src[7]
        shl             v1.4h, v1.4h, #3        //                      16/2 * src[2]
        mls             v17.4h, v19.4h, v0.h[2] //        - 15 * src[1] +  4 * src[3] + 16 * src[5]
        ssra            v7.4h, v7.4h, #1        //        12/2 * src[0]
        mls             v16.4h, v19.4h, v0.h[1] //        -  9 * src[1] + 16 * src[3] -  4 * src[5]
        ssra            v20.4h, v20.4h, #1      //                                    12/2 * src[4]
        mla             v23.4h, v3.4h, v0.h[1]  //        -  4 * src[1] +  9 * src[3]               + 16 * src[7]
        shl             v3.4h, v2.4h, #3        //                                                  16/2 * src[6]
        mla             v1.4h, v2.4h, v0.h[0]   // t3/2 =               16/2 * src[2]             +  6/2 * src[6]
        mla             v21.4h, v6.4h, v0.h[1]  //  t1  =   16 * src[1] + 15 * src[3] +  9 * src[5] +  4 * src[7]
        mla             v17.4h, v4.4h, v0.h[1]  // -t2  = - 15 * src[1] +  4 * src[3] + 16 * src[5] +  9 * src[7]
        sub             v2.4h, v24.4h, v3.4h    // t4/2 =                6/2 * src[2]             - 16/2 * src[6]
        mls             v16.4h, v4.4h, v0.h[2]  // -t3  = -  9 * src[1] + 16 * src[3] -  4 * src[5] - 15 * src[7]
        add             v3.4h, v7.4h, v20.4h    // t1/2 = 12/2 * src[0]             + 12/2 * src[4]
        mls             v23.4h, v6.4h, v0.h[2]  // -t4  = -  4 * src[1] +  9 * src[3] - 15 * src[5] + 16 * src[7]
        sub             v4.4h, v7.4h, v20.4h    // t2/2 = 12/2 * src[0]             - 12/2 * src[4]
        neg             v6.4h, v21.4h           // -t1
        add             v7.4h, v3.4h, v1.4h     // t5/2 = t1/2 + t3/2
        sub             v19.4h, v3.4h, v1.4h    // t8/2 = t1/2 - t3/2
        add             v20.4h, v4.4h, v2.4h    // t6/2 = t2/2 + t4/2
        sub             v24.4h, v4.4h, v2.4h    // t7/2 = t2/2 - t4/2
        add             v26.4h, v3.4h, v1.4h    // t5/2 = t1/2 + t3/2
        add             v27.4h, v4.4h, v2.4h    // t6/2 = t2/2 + t4/2
        sub             v2.4h, v4.4h, v2.4h     // t7/2 = t2/2 - t4/2
        sub             v1.4h, v3.4h, v1.4h     // t8/2 = t1/2 - t3/2
        neg             v3.4h, v17.4h           // +t2
        neg             v4.4h, v16.4h           // +t3
        neg             v28.4h, v23.4h          // +t4
        ssra            v7.4h, v21.4h, #1       // (t5 + t1) >> 1
        ssra            v1.4h, v23.4h, #1       // (t8 - t4) >> 1
        ssra            v20.4h, v3.4h, #1       // (t6 + t2) >> 1
        ssra            v24.4h, v4.4h, #1       // (t7 + t3) >> 1
        ssra            v19.4h, v28.4h, #1      // (t8 + t4) >> 1
        ssra            v2.4h, v16.4h, #1       // (t7 - t3) >> 1
        ssra            v27.4h, v17.4h, #1      // (t6 - t2) >> 1
        ssra            v26.4h, v6.4h, #1       // (t5 - t1) >> 1
        trn1            v1.2d, v7.2d, v1.2d
        trn1            v2.2d, v20.2d, v2.2d
        trn1            v3.2d, v24.2d, v27.2d
        trn1            v4.2d, v19.2d, v26.2d
        srshr           v1.8h, v1.8h, #2        // (t5 + t1 + 4) >> 3, (t8 - t4 + 4) >> 3
        srshr           v2.8h, v2.8h, #2        // (t6 + t2 + 4) >> 3, (t7 - t3 + 4) >> 3
        srshr           v3.8h, v3.8h, #2        // (t7 + t3 + 4) >> 3, (t6 - t2 + 4) >> 3
        srshr           v4.8h, v4.8h, #2        // (t8 + t4 + 4) >> 3, (t5 - t1 + 4) >> 3
        trn2            v6.8h, v1.8h, v2.8h
        trn1            v1.8h, v1.8h, v2.8h
        trn2            v2.8h, v3.8h, v4.8h
        trn1            v3.8h, v3.8h, v4.8h
        trn2            v4.4s, v6.4s, v2.4s
        trn1            v7.4s, v1.4s, v3.4s
        trn2            v1.4s, v1.4s, v3.4s
        mul             v3.8h, v4.8h, v0.h[5]   //                                                           22/2 * src[24]
        trn1            v2.4s, v6.4s, v2.4s
        mul             v4.8h, v4.8h, v0.h[4]   //                                                           10/2 * src[24]
        mul             v6.8h, v7.8h, v0.h[6]   //            17 * src[0]
        mul             v1.8h, v1.8h, v0.h[6]   //                                            17 * src[16]
        mls             v3.8h, v2.8h, v0.h[4]   //  t4/2 =                - 10/2 * src[8]                  + 22/2 * src[24]
        mla             v4.8h, v2.8h, v0.h[5]   //  t3/2 =                  22/2 * src[8]                  + 10/2 * src[24]
        add             v0.8h, v6.8h, v1.8h     //   t1  =    17 * src[0]                 +   17 * src[16]
        sub             v1.8h, v6.8h, v1.8h     //   t2  =    17 * src[0]                 -   17 * src[16]
        neg             v2.8h, v3.8h            // -t4/2
        neg             v6.8h, v4.8h            // -t3/2
        ssra            v4.8h, v0.8h, #1        // (t1 + t3) >> 1
        ssra            v2.8h, v1.8h, #1        // (t2 - t4) >> 1
        ssra            v3.8h, v1.8h, #1        // (t2 + t4) >> 1
        ssra            v6.8h, v0.8h, #1        // (t1 - t3) >> 1
        srshr           v0.8h, v4.8h, #6        // (t1 + t3 + 64) >> 7
        srshr           v1.8h, v2.8h, #6        // (t2 - t4 + 64) >> 7
        srshr           v2.8h, v3.8h, #6        // (t2 + t4 + 64) >> 7
        srshr           v3.8h, v6.8h, #6        // (t1 - t3 + 64) >> 7
        uaddw           v0.8h, v0.8h, v5.8b
        uaddw           v1.8h, v1.8h, v18.8b
        uaddw           v2.8h, v2.8h, v22.8b
        uaddw           v3.8h, v3.8h, v25.8b
        sqxtun          v0.8b, v0.8h
        sqxtun          v1.8b, v1.8h
        sqxtun          v2.8b, v2.8h
        sqxtun          v3.8b, v3.8h
        st1             {v0.8b}, [x3], x1
        st1             {v1.8b}, [x3], x1
        st1             {v2.8b}, [x3], x1
        st1             {v3.8b}, [x3]
        ret
endfunc

// VC-1 4x8 inverse transform
// On entry:
//   x0 -> array of 8-bit samples, in row-major order
//   x1 = row stride for 8-bit sample array
//   x2 -> array of 16-bit inverse transform coefficients, in row-major order (row stride is 8 coefficients)
// On exit:
//   array at x0 updated by saturated addition of (narrowed) transformed block
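//
// Pass order is the mirror of ff_vc1_inv_trans_8x4_neon: the 4-point
// transform (sketched above) runs on rows with (+4) >> 3 rounding, then
// the 8-point transform runs on columns with (+64) >> 7 / (+65) >> 7
// rounding, as in ff_vc1_inv_trans_8x8_neon.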
function ff_vc1_inv_trans_4x8_neon, export=1
        mov             x3, #16
        ldr             q0, .Lcoeffs_it8        // includes 4-point coefficients in upper half of vector
        mov             x4, x0
        ld1             {v1.d}[0], [x2], x3     // 00 01 02 03
        ld1             {v2.d}[0], [x2], x3     // 10 11 12 13
        ld1             {v3.d}[0], [x2], x3     // 20 21 22 23
        ld1             {v4.d}[0], [x2], x3     // 30 31 32 33
        ld1             {v1.d}[1], [x2], x3     // 40 41 42 43
        ld1             {v2.d}[1], [x2], x3     // 50 51 52 53
        ld1             {v3.d}[1], [x2], x3     // 60 61 62 63
        ld1             {v4.d}[1], [x2]         // 70 71 72 73
        ld1             {v5.s}[0], [x0], x1
        ld1             {v6.s}[0], [x0], x1
        ld1             {v7.s}[0], [x0], x1
        trn2            v16.8h, v1.8h, v2.8h    // 01 11 03 13 41 51 43 53
        trn1            v1.8h, v1.8h, v2.8h     // 00 10 02 12 40 50 42 52
        trn2            v2.8h, v3.8h, v4.8h     // 21 31 23 33 61 71 63 73
        trn1            v3.8h, v3.8h, v4.8h     // 20 30 22 32 60 70 62 72
        ld1             {v4.s}[0], [x0], x1
        trn2            v17.4s, v16.4s, v2.4s   // 03 13 23 33 43 53 63 73
        trn1            v18.4s, v1.4s, v3.4s    // 00 10 20 30 40 50 60 70
        trn1            v2.4s, v16.4s, v2.4s    // 01 11 21 31 41 51 61 71
        mul             v16.8h, v17.8h, v0.h[4] //                                                          10/2 * src[3]
        ld1             {v5.s}[1], [x0], x1
        mul             v17.8h, v17.8h, v0.h[5] //                                                          22/2 * src[3]
        ld1             {v6.s}[1], [x0], x1
        trn2            v1.4s, v1.4s, v3.4s     // 02 12 22 32 42 52 62 72
        mul             v3.8h, v18.8h, v0.h[6]  //            17 * src[0]
        ld1             {v7.s}[1], [x0], x1
        mul             v1.8h, v1.8h, v0.h[6]   //                                            17 * src[2]
        ld1             {v4.s}[1], [x0]
        mla             v16.8h, v2.8h, v0.h[5]  //  t3/2 =                  22/2 * src[1]                 + 10/2 * src[3]
        mls             v17.8h, v2.8h, v0.h[4]  //  t4/2 =                - 10/2 * src[1]                 + 22/2 * src[3]
        add             v2.8h, v3.8h, v1.8h     //   t1  =    17 * src[0]                 +   17 * src[2]
        sub             v1.8h, v3.8h, v1.8h     //   t2  =    17 * src[0]                 -   17 * src[2]
        neg             v3.8h, v16.8h           // -t3/2
        ssra            v16.8h, v2.8h, #1       // (t1 + t3) >> 1
        neg             v18.8h, v17.8h          // -t4/2
        ssra            v17.8h, v1.8h, #1       // (t2 + t4) >> 1
        ssra            v3.8h, v2.8h, #1        // (t1 - t3) >> 1
        ssra            v18.8h, v1.8h, #1       // (t2 - t4) >> 1
        srshr           v1.8h, v16.8h, #2       // (t1 + t3 + 4) >> 3
        srshr           v2.8h, v17.8h, #2       // (t2 + t4 + 4) >> 3
        srshr           v3.8h, v3.8h, #2        // (t1 - t3 + 4) >> 3
        srshr           v16.8h, v18.8h, #2      // (t2 - t4 + 4) >> 3
        trn2            v17.8h, v2.8h, v3.8h    // 12 13 32 33 52 53 72 73
        trn2            v18.8h, v1.8h, v16.8h   // 10 11 30 31 50 51 70 71
        trn1            v1.8h, v1.8h, v16.8h    // 00 01 20 21 40 41 60 61
        trn1            v2.8h, v2.8h, v3.8h     // 02 03 22 23 42 43 62 63
        trn1            v3.4s, v18.4s, v17.4s   // 10 11 12 13 50 51 52 53
        trn2            v16.4s, v18.4s, v17.4s  // 30 31 32 33 70 71 72 73
        trn1            v17.4s, v1.4s, v2.4s    // 00 01 02 03 40 41 42 43
        mov             d18, v3.d[1]            // 50 51 52 53
        shl             v19.4h, v3.4h, #4       //          16 * src[8]
        mov             d20, v16.d[1]           // 70 71 72 73
        shl             v21.4h, v16.4h, #4      //                        16 * src[24]
        mov             d22, v17.d[1]           // 40 41 42 43
        shl             v23.4h, v3.4h, #2       //           4 * src[8]
        shl             v24.4h, v18.4h, #4      //                                       16 * src[40]
        shl             v25.4h, v20.4h, #4      //                                                      16 * src[56]
        shl             v26.4h, v18.4h, #2      //                                        4 * src[40]
        trn2            v1.4s, v1.4s, v2.4s     // 20 21 22 23 60 61 62 63
        ssra            v24.4h, v21.4h, #2      //                         4 * src[24] + 16 * src[40]
        sub             v2.4h, v25.4h, v23.4h   //        -  4 * src[8]                               + 16 * src[56]
        shl             v17.4h, v17.4h, #2      //         8/2 * src[0]
        sub             v21.4h, v21.4h, v26.4h  //                        16 * src[24] -  4 * src[40]
        shl             v22.4h, v22.4h, #2      //                                      8/2 * src[32]
        mov             d23, v1.d[1]            // 60 61 62 63
        ssra            v19.4h, v25.4h, #2      //          16 * src[8]                               +  4 * src[56]
        mul             v25.4h, v1.4h, v0.h[0]  //                       6/2 * src[16]
        shl             v1.4h, v1.4h, #3        //                      16/2 * src[16]
        mls             v24.4h, v3.4h, v0.h[2]  //        - 15 * src[8] +  4 * src[24] + 16 * src[40]
        ssra            v17.4h, v17.4h, #1      //        12/2 * src[0]
        mls             v21.4h, v3.4h, v0.h[1]  //        -  9 * src[8] + 16 * src[24] -  4 * src[40]
        ssra            v22.4h, v22.4h, #1      //                                     12/2 * src[32]
        mla             v2.4h, v16.4h, v0.h[1]  //        -  4 * src[8] +  9 * src[24]                + 16 * src[56]
        shl             v3.4h, v23.4h, #3       //                                                    16/2 * src[48]
        mla             v19.4h, v16.4h, v0.h[2] //          16 * src[8] + 15 * src[24]                +  4 * src[56]
        mla             v1.4h, v23.4h, v0.h[0]  // t3/2 =               16/2 * src[16]              +  6/2 * src[48]
        mla             v24.4h, v20.4h, v0.h[1] // -t2  = - 15 * src[8] +  4 * src[24] + 16 * src[40] +  9 * src[56]
        add             v16.4h, v17.4h, v22.4h  // t1/2 = 12/2 * src[0]              + 12/2 * src[32]
        sub             v3.4h, v25.4h, v3.4h    // t4/2 =                6/2 * src[16]              - 16/2 * src[48]
        sub             v17.4h, v17.4h, v22.4h  // t2/2 = 12/2 * src[0]              - 12/2 * src[32]
        mls             v21.4h, v20.4h, v0.h[2] // -t3  = -  9 * src[8] + 16 * src[24] -  4 * src[40] - 15 * src[56]
        mla             v19.4h, v18.4h, v0.h[1] //  t1  =   16 * src[8] + 15 * src[24] +  9 * src[40] +  4 * src[56]
        add             v20.4h, v16.4h, v1.4h   // t5/2 = t1/2 + t3/2
        mls             v2.4h, v18.4h, v0.h[2]  // -t4  = -  4 * src[8] +  9 * src[24] - 15 * src[40] + 16 * src[56]
        sub             v0.4h, v16.4h, v1.4h    // t8/2 = t1/2 - t3/2
        add             v18.4h, v17.4h, v3.4h   // t6/2 = t2/2 + t4/2
        sub             v22.4h, v17.4h, v3.4h   // t7/2 = t2/2 - t4/2
        neg             v23.4h, v24.4h          // +t2
        sub             v25.4h, v17.4h, v3.4h   // t7/2 = t2/2 - t4/2
        add             v3.4h, v17.4h, v3.4h    // t6/2 = t2/2 + t4/2
        neg             v17.4h, v21.4h          // +t3
        sub             v26.4h, v16.4h, v1.4h   // t8/2 = t1/2 - t3/2
        add             v1.4h, v16.4h, v1.4h    // t5/2 = t1/2 + t3/2
        neg             v16.4h, v19.4h          // -t1
        neg             v27.4h, v2.4h           // +t4
        ssra            v20.4h, v19.4h, #1      // (t5 + t1) >> 1
        srsra           v0.4h, v2.4h, #1        // (t8 - t4 + 1) >> 1
        ssra            v18.4h, v23.4h, #1      // (t6 + t2) >> 1
        srsra           v22.4h, v21.4h, #1      // (t7 - t3 + 1) >> 1
        ssra            v25.4h, v17.4h, #1      // (t7 + t3) >> 1
        srsra           v3.4h, v24.4h, #1       // (t6 - t2 + 1) >> 1
        ssra            v26.4h, v27.4h, #1      // (t8 + t4) >> 1
        srsra           v1.4h, v16.4h, #1       // (t5 - t1 + 1) >> 1
        trn1            v0.2d, v20.2d, v0.2d
        trn1            v2.2d, v18.2d, v22.2d
        trn1            v3.2d, v25.2d, v3.2d
        trn1            v1.2d, v26.2d, v1.2d
        srshr           v0.8h, v0.8h, #6        // (t5 + t1 + 64) >> 7, (t8 - t4 + 65) >> 7
        srshr           v2.8h, v2.8h, #6        // (t6 + t2 + 64) >> 7, (t7 - t3 + 65) >> 7
        srshr           v3.8h, v3.8h, #6        // (t7 + t3 + 64) >> 7, (t6 - t2 + 65) >> 7
        srshr           v1.8h, v1.8h, #6        // (t8 + t4 + 64) >> 7, (t5 - t1 + 65) >> 7
        uaddw           v0.8h, v0.8h, v5.8b
        uaddw           v2.8h, v2.8h, v6.8b
        uaddw           v3.8h, v3.8h, v7.8b
        uaddw           v1.8h, v1.8h, v4.8b
        sqxtun          v0.8b, v0.8h
        sqxtun          v2.8b, v2.8h
        sqxtun          v3.8b, v3.8h
        sqxtun          v1.8b, v1.8h
        st1             {v0.s}[0], [x4], x1
        st1             {v2.s}[0], [x4], x1
        st1             {v3.s}[0], [x4], x1
        st1             {v1.s}[0], [x4], x1
        st1             {v0.s}[1], [x4], x1
        st1             {v2.s}[1], [x4], x1
        st1             {v3.s}[1], [x4], x1
        st1             {v1.s}[1], [x4]
        ret
endfunc

// VC-1 4x4 inverse transform
// On entry:
//   x0 -> array of 8-bit samples, in row-major order
//   x1 = row stride for 8-bit sample array
//   x2 -> array of 16-bit inverse transform coefficients, in row-major order (row stride is 8 coefficients)
// On exit:
//   array at x0 updated by saturated addition of (narrowed) transformed block
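//
// Both passes use the 4-point transform sketched before
// ff_vc1_inv_trans_8x4_neon: rows with (+4) >> 3 rounding, then columns
// with (+64) >> 7.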
function ff_vc1_inv_trans_4x4_neon, export=1
        mov             x3, #16
        ldr             d0, .Lcoeffs_it4
        mov             x4, x0
        ld1             {v1.d}[0], [x2], x3     // 00 01 02 03
        ld1             {v2.d}[0], [x2], x3     // 10 11 12 13
        ld1             {v3.d}[0], [x2], x3     // 20 21 22 23
        ld1             {v4.d}[0], [x2]         // 30 31 32 33
        ld1             {v5.s}[0], [x0], x1
        ld1             {v5.s}[1], [x0], x1
        ld1             {v6.s}[0], [x0], x1
        trn2            v7.4h, v1.4h, v2.4h     // 01 11 03 13
        trn1            v1.4h, v1.4h, v2.4h     // 00 10 02 12
        ld1             {v6.s}[1], [x0]
        trn2            v2.4h, v3.4h, v4.4h     // 21 31 23 33
        trn1            v3.4h, v3.4h, v4.4h     // 20 30 22 32
        trn2            v4.2s, v7.2s, v2.2s     // 03 13 23 33
        trn1            v16.2s, v1.2s, v3.2s    // 00 10 20 30
        trn1            v2.2s, v7.2s, v2.2s     // 01 11 21 31
        trn2            v1.2s, v1.2s, v3.2s     // 02 12 22 32
        mul             v3.4h, v4.4h, v0.h[0]   //                                                          10/2 * src[3]
        mul             v4.4h, v4.4h, v0.h[1]   //                                                          22/2 * src[3]
        mul             v7.4h, v16.4h, v0.h[2]  //            17 * src[0]
        mul             v1.4h, v1.4h, v0.h[2]   //                                            17 * src[2]
        mla             v3.4h, v2.4h, v0.h[1]   //  t3/2 =                  22/2 * src[1]                 + 10/2 * src[3]
        mls             v4.4h, v2.4h, v0.h[0]   //  t4/2 =                - 10/2 * src[1]                 + 22/2 * src[3]
        add             v2.4h, v7.4h, v1.4h     //   t1  =    17 * src[0]                 +   17 * src[2]
        sub             v1.4h, v7.4h, v1.4h     //   t2  =    17 * src[0]                 -   17 * src[2]
        neg             v7.4h, v3.4h            // -t3/2
        neg             v16.4h, v4.4h           // -t4/2
        ssra            v3.4h, v2.4h, #1        // (t1 + t3) >> 1
        ssra            v4.4h, v1.4h, #1        // (t2 + t4) >> 1
        ssra            v16.4h, v1.4h, #1       // (t2 - t4) >> 1
        ssra            v7.4h, v2.4h, #1        // (t1 - t3) >> 1
        srshr           v1.4h, v3.4h, #2        // (t1 + t3 + 4) >> 3
        srshr           v2.4h, v4.4h, #2        // (t2 + t4 + 4) >> 3
        srshr           v3.4h, v16.4h, #2       // (t2 - t4 + 4) >> 3
        srshr           v4.4h, v7.4h, #2        // (t1 - t3 + 4) >> 3
        trn2            v7.4h, v1.4h, v3.4h     // 10 11 30 31
        trn1            v1.4h, v1.4h, v3.4h     // 00 01 20 21
        trn2            v3.4h, v2.4h, v4.4h     // 12 13 32 33
        trn1            v2.4h, v2.4h, v4.4h     // 02 03 22 23
        trn2            v4.2s, v7.2s, v3.2s     // 30 31 32 33
        trn1            v16.2s, v1.2s, v2.2s    // 00 01 02 03
        trn1            v3.2s, v7.2s, v3.2s     // 10 11 12 13
        trn2            v1.2s, v1.2s, v2.2s     // 20 21 22 23
        mul             v2.4h, v4.4h, v0.h[1]   //                                                           22/2 * src[24]
        mul             v4.4h, v4.4h, v0.h[0]   //                                                           10/2 * src[24]
        mul             v7.4h, v16.4h, v0.h[2]  //            17 * src[0]
        mul             v1.4h, v1.4h, v0.h[2]   //                                            17 * src[16]
        mls             v2.4h, v3.4h, v0.h[0]   //  t4/2 =                - 10/2 * src[8]                  + 22/2 * src[24]
        mla             v4.4h, v3.4h, v0.h[1]   //  t3/2 =                  22/2 * src[8]                  + 10/2 * src[24]
        add             v0.4h, v7.4h, v1.4h     //   t1  =    17 * src[0]                 +   17 * src[16]
        sub             v1.4h, v7.4h, v1.4h     //   t2  =    17 * src[0]                 -   17 * src[16]
        neg             v3.4h, v2.4h            // -t4/2
        neg             v7.4h, v4.4h            // -t3/2
        ssra            v4.4h, v0.4h, #1        // (t1 + t3) >> 1
        ssra            v3.4h, v1.4h, #1        // (t2 - t4) >> 1
        ssra            v2.4h, v1.4h, #1        // (t2 + t4) >> 1
        ssra            v7.4h, v0.4h, #1        // (t1 - t3) >> 1
        trn1            v0.2d, v4.2d, v3.2d
        trn1            v1.2d, v2.2d, v7.2d
        srshr           v0.8h, v0.8h, #6        // (t1 + t3 + 64) >> 7, (t2 - t4 + 64) >> 7
        srshr           v1.8h, v1.8h, #6        // (t2 + t4 + 64) >> 7, (t1 - t3 + 64) >> 7
        uaddw           v0.8h, v0.8h, v5.8b
        uaddw           v1.8h, v1.8h, v6.8b
        sqxtun          v0.8b, v0.8h
        sqxtun          v1.8b, v1.8h
        st1             {v0.s}[0], [x4], x1
        st1             {v0.s}[1], [x4], x1
        st1             {v1.s}[0], [x4], x1
        st1             {v1.s}[1], [x4]
        ret
endfunc

// VC-1 8x8 inverse transform, DC case
// On entry:
//   x0 -> array of 8-bit samples, in row-major order
//   x1 = row stride for 8-bit sample array
//   x2 -> 16-bit inverse transform DC coefficient
// On exit:
//   array at x0 updated by saturated addition of (narrowed) transformed block
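//
// The replicated DC value is computed by the scalar equivalent of
//   dc = (3 * dc + 1) >> 1;  dc = (3 * dc + 16) >> 5;
// using the adds and shifts below.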
function ff_vc1_inv_trans_8x8_dc_neon, export=1
        ldrsh           w2, [x2]
        mov             x3, x0
        ld1             {v0.8b}, [x0], x1
        ld1             {v1.8b}, [x0], x1
        ld1             {v2.8b}, [x0], x1
        add             w2, w2, w2, lsl #1
        ld1             {v3.8b}, [x0], x1
        ld1             {v4.8b}, [x0], x1
        add             w2, w2, #1
        ld1             {v5.8b}, [x0], x1
        asr             w2, w2, #1
        ld1             {v6.8b}, [x0], x1
        add             w2, w2, w2, lsl #1
        ld1             {v7.8b}, [x0]
        add             w0, w2, #16
        asr             w0, w0, #5
        dup             v16.8h, w0
        uaddw           v0.8h, v16.8h, v0.8b
        uaddw           v1.8h, v16.8h, v1.8b
        uaddw           v2.8h, v16.8h, v2.8b
        uaddw           v3.8h, v16.8h, v3.8b
        uaddw           v4.8h, v16.8h, v4.8b
        uaddw           v5.8h, v16.8h, v5.8b
        sqxtun          v0.8b, v0.8h
        uaddw           v6.8h, v16.8h, v6.8b
        sqxtun          v1.8b, v1.8h
        uaddw           v7.8h, v16.8h, v7.8b
        sqxtun          v2.8b, v2.8h
        sqxtun          v3.8b, v3.8h
        sqxtun          v4.8b, v4.8h
        st1             {v0.8b}, [x3], x1
        sqxtun          v0.8b, v5.8h
        st1             {v1.8b}, [x3], x1
        sqxtun          v1.8b, v6.8h
        st1             {v2.8b}, [x3], x1
        sqxtun          v2.8b, v7.8h
        st1             {v3.8b}, [x3], x1
        st1             {v4.8b}, [x3], x1
        st1             {v0.8b}, [x3], x1
        st1             {v1.8b}, [x3], x1
        st1             {v2.8b}, [x3]
        ret
endfunc

// VC-1 8x4 inverse transform, DC case
// On entry:
//   x0 -> array of 8-bit samples, in row-major order
//   x1 = row stride for 8-bit sample array
//   x2 -> 16-bit inverse transform DC coefficient
// On exit:
//   array at x0 updated by saturated addition of (narrowed) transformed block
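//
// The replicated DC value is computed by the scalar equivalent of
//   dc = (3 * dc + 1) >> 1;  dc = (17 * dc + 64) >> 7;
// using the adds and shifts below.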
function ff_vc1_inv_trans_8x4_dc_neon, export=1
        ldrsh           w2, [x2]
        mov             x3, x0
        ld1             {v0.8b}, [x0], x1
        ld1             {v1.8b}, [x0], x1
        ld1             {v2.8b}, [x0], x1
        add             w2, w2, w2, lsl #1
        ld1             {v3.8b}, [x0]
        add             w0, w2, #1
        asr             w0, w0, #1
        add             w0, w0, w0, lsl #4
        add             w0, w0, #64
        asr             w0, w0, #7
        dup             v4.8h, w0
        uaddw           v0.8h, v4.8h, v0.8b
        uaddw           v1.8h, v4.8h, v1.8b
        uaddw           v2.8h, v4.8h, v2.8b
        uaddw           v3.8h, v4.8h, v3.8b
        sqxtun          v0.8b, v0.8h
        sqxtun          v1.8b, v1.8h
        sqxtun          v2.8b, v2.8h
        sqxtun          v3.8b, v3.8h
        st1             {v0.8b}, [x3], x1
        st1             {v1.8b}, [x3], x1
        st1             {v2.8b}, [x3], x1
        st1             {v3.8b}, [x3]
        ret
endfunc

// VC-1 4x8 inverse transform, DC case
// On entry:
//   x0 -> array of 8-bit samples, in row-major order
//   x1 = row stride for 8-bit sample array
//   x2 -> 16-bit inverse transform DC coefficient
// On exit:
//   array at x0 updated by saturated addition of (narrowed) transformed block
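//
// The replicated DC value is computed by the scalar equivalent of
//   dc = (17 * dc + 4) >> 3;  dc = (3 * dc + 16) >> 5;
// using the adds and shifts below.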
function ff_vc1_inv_trans_4x8_dc_neon, export=1
        ldrsh           w2, [x2]
        mov             x3, x0
        ld1             {v0.s}[0], [x0], x1
        ld1             {v1.s}[0], [x0], x1
        ld1             {v2.s}[0], [x0], x1
        add             w2, w2, w2, lsl #4
        ld1             {v3.s}[0], [x0], x1
        add             w2, w2, #4
        asr             w2, w2, #3
        add             w2, w2, w2, lsl #1
        ld1             {v0.s}[1], [x0], x1
        add             w2, w2, #16
        asr             w2, w2, #5
        dup             v4.8h, w2
        ld1             {v1.s}[1], [x0], x1
        ld1             {v2.s}[1], [x0], x1
        ld1             {v3.s}[1], [x0]
        uaddw           v0.8h, v4.8h, v0.8b
        uaddw           v1.8h, v4.8h, v1.8b
        uaddw           v2.8h, v4.8h, v2.8b
        uaddw           v3.8h, v4.8h, v3.8b
        sqxtun          v0.8b, v0.8h
        sqxtun          v1.8b, v1.8h
        sqxtun          v2.8b, v2.8h
        sqxtun          v3.8b, v3.8h
        st1             {v0.s}[0], [x3], x1
        st1             {v1.s}[0], [x3], x1
        st1             {v2.s}[0], [x3], x1
        st1             {v3.s}[0], [x3], x1
        st1             {v0.s}[1], [x3], x1
        st1             {v1.s}[1], [x3], x1
        st1             {v2.s}[1], [x3], x1
        st1             {v3.s}[1], [x3]
        ret
endfunc

// VC-1 4x4 inverse transform, DC case
// On entry:
//   x0 -> array of 8-bit samples, in row-major order
//   x1 = row stride for 8-bit sample array
//   x2 -> 16-bit inverse transform DC coefficient
// On exit:
//   array at x0 updated by saturated addition of (narrowed) transformed block
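//
// The replicated DC value is computed by the scalar equivalent of
//   dc = (17 * dc + 4) >> 3;  dc = (17 * dc + 64) >> 7;
// using the adds and shifts below.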
function ff_vc1_inv_trans_4x4_dc_neon, export=1
        ldrsh           w2, [x2]
        mov             x3, x0
        ld1             {v0.s}[0], [x0], x1
        ld1             {v1.s}[0], [x0], x1
        ld1             {v0.s}[1], [x0], x1
        add             w2, w2, w2, lsl #4
        ld1             {v1.s}[1], [x0]
        add             w0, w2, #4
        asr             w0, w0, #3
        add             w0, w0, w0, lsl #4
        add             w0, w0, #64
        asr             w0, w0, #7
        dup             v2.8h, w0
        uaddw           v0.8h, v2.8h, v0.8b
        uaddw           v1.8h, v2.8h, v1.8b
        sqxtun          v0.8b, v0.8h
        sqxtun          v1.8b, v1.8h
        st1             {v0.s}[0], [x3], x1
        st1             {v1.s}[0], [x3], x1
        st1             {v0.s}[1], [x3], x1
        st1             {v1.s}[1], [x3]
        ret
endfunc

.align  5
// 8-point transform coefficients, 16-bit lanes 0-2: 3 (= 6/2), 9, 15;
// loading a full q register from here also picks up the 4-point
// constants that follow, in lanes 4-6
.Lcoeffs_it8:
.quad   0x000F00090003
// 4-point transform coefficients, 16-bit lanes 0-2: 5 (= 10/2), 11 (= 22/2), 17
.Lcoeffs_it4:
.quad   0x0011000B0005
// loop filter multipliers, 16-bit lanes 0-1: 2, 5
.Lcoeffs:
.quad   0x00050002

// VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of vertically-neighbouring blocks
// On entry:
//   x0 -> top-left pel of lower block
//   x1 = row stride, bytes
//   w2 = PQUANT bitstream parameter
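//
// Per pixel pair, with P1..P8 running across the edge (boundary between
// P4 and P5), a rough scalar sketch of the filter implemented below,
// following the inline comments:
//
//   a0 = abs((2*P3 - 5*P4 + 5*P5 - 2*P6 + 4) >> 3);
//   a1 = abs((2*P1 - 5*P2 + 5*P3 - 2*P4 + 4) >> 3);
//   a2 = abs((2*P5 - 5*P6 + 5*P7 - 2*P8 + 4) >> 3);
//   a3 = FFMIN(a1, a2);  clip = abs(P4 - P5) >> 1;
//   if (clip && a0 < pq && a3 < a0) {
//       d = FFMIN((5 * (a0 - a3)) >> 3, clip);  // applied with the sign derived
//       P4 -= d;  P5 += d;                      // from clip_sign/a0_sign; zero
//   }                                           // when the two signs match
//
// The decision computed for the 3rd pixel pair gates all 4 pairs (see the
// tbnz below).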
function ff_vc1_v_loop_filter4_neon, export=1
        sub             x3, x0, w1, sxtw #2
        ldr             d0, .Lcoeffs
        ld1             {v1.s}[0], [x0], x1     // P5
        ld1             {v2.s}[0], [x3], x1     // P1
        ld1             {v3.s}[0], [x3], x1     // P2
        ld1             {v4.s}[0], [x0], x1     // P6
        ld1             {v5.s}[0], [x3], x1     // P3
        ld1             {v6.s}[0], [x0], x1     // P7
        ld1             {v7.s}[0], [x3]         // P4
        ld1             {v16.s}[0], [x0]        // P8
        ushll           v17.8h, v1.8b, #1       // 2*P5
        dup             v18.8h, w2              // pq
        ushll           v2.8h, v2.8b, #1        // 2*P1
        uxtl            v3.8h, v3.8b            // P2
        uxtl            v4.8h, v4.8b            // P6
        uxtl            v19.8h, v5.8b           // P3
        mls             v2.4h, v3.4h, v0.h[1]   // 2*P1-5*P2
        uxtl            v3.8h, v6.8b            // P7
        mls             v17.4h, v4.4h, v0.h[1]  // 2*P5-5*P6
        ushll           v5.8h, v5.8b, #1        // 2*P3
        uxtl            v6.8h, v7.8b            // P4
        mla             v17.4h, v3.4h, v0.h[1]  // 2*P5-5*P6+5*P7
        uxtl            v3.8h, v16.8b           // P8
        mla             v2.4h, v19.4h, v0.h[1]  // 2*P1-5*P2+5*P3
        uxtl            v1.8h, v1.8b            // P5
        mls             v5.4h, v6.4h, v0.h[1]   // 2*P3-5*P4
        mls             v17.4h, v3.4h, v0.h[0]  // 2*P5-5*P6+5*P7-2*P8
        sub             v3.4h, v6.4h, v1.4h     // P4-P5
        mls             v2.4h, v6.4h, v0.h[0]   // 2*P1-5*P2+5*P3-2*P4
        mla             v5.4h, v1.4h, v0.h[1]   // 2*P3-5*P4+5*P5
        mls             v5.4h, v4.4h, v0.h[0]   // 2*P3-5*P4+5*P5-2*P6
        abs             v4.4h, v3.4h
        srshr           v7.4h, v17.4h, #3
        srshr           v2.4h, v2.4h, #3
        sshr            v4.4h, v4.4h, #1        // clip
        srshr           v5.4h, v5.4h, #3
        abs             v7.4h, v7.4h            // a2
        sshr            v3.4h, v3.4h, #8        // clip_sign
        abs             v2.4h, v2.4h            // a1
        cmeq            v16.4h, v4.4h, #0       // test clip == 0
        abs             v17.4h, v5.4h           // a0
        sshr            v5.4h, v5.4h, #8        // a0_sign
        cmhs            v19.4h, v2.4h, v7.4h    // test a1 >= a2
        cmhs            v18.4h, v17.4h, v18.4h  // test a0 >= pq
        sub             v3.4h, v3.4h, v5.4h     // clip_sign - a0_sign
        bsl             v19.8b, v7.8b, v2.8b    // a3
        orr             v2.8b, v16.8b, v18.8b   // test clip == 0 || a0 >= pq
        uqsub           v5.4h, v17.4h, v19.4h   // a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so it makes more sense to subtract this way round than the other way and then take the abs)
        cmhs            v7.4h, v19.4h, v17.4h   // test a3 >= a0
        mul             v0.4h, v5.4h, v0.h[1]   // a0 >= a3 ? 5*(a0-a3) : 0
        orr             v5.8b, v2.8b, v7.8b     // test clip == 0 || a0 >= pq || a3 >= a0
        mov             w0, v5.s[1]             // move to gp reg
        ushr            v0.4h, v0.4h, #3        // a0 >= a3 ? (5*(a0-a3))>>3 : 0
        cmhs            v5.4h, v0.4h, v4.4h
        tbnz            w0, #0, 1f              // none of the 4 pixel pairs is updated if the deciding (3rd) pair is not filtered
        bsl             v5.8b, v4.8b, v0.8b     // FFMIN(d, clip)
        bic             v0.8b, v5.8b, v2.8b     // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 >= a0 case already zeroed by saturating sub)
        mls             v6.4h, v0.4h, v3.4h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
        mla             v1.4h, v0.4h, v3.4h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
        sqxtun          v0.8b, v6.8h
        sqxtun          v1.8b, v1.8h
        st1             {v0.s}[0], [x3], x1
        st1             {v1.s}[0], [x3]
1:      ret
endfunc

// VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of horizontally-neighbouring blocks
// On entry:
//   x0 -> top-left pel of right block
//   x1 = row stride, bytes
//   w2 = PQUANT bitstream parameter
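//
// Same filter arithmetic as ff_vc1_v_loop_filter4_neon above, applied
// across a vertical edge: the pel patch is transposed with trn1/trn2 so
// that P1..P8 land in vector lanes, and the filtered P4/P5 columns are
// stored back with interleaving st2.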
function ff_vc1_h_loop_filter4_neon, export=1
        sub             x3, x0, #4              // where to start reading
        ldr             d0, .Lcoeffs
        ld1             {v1.8b}, [x3], x1
        sub             x0, x0, #1              // where to start writing
        ld1             {v2.8b}, [x3], x1
        ld1             {v3.8b}, [x3], x1
        ld1             {v4.8b}, [x3]
        dup             v5.8h, w2               // pq
        trn1            v6.8b, v1.8b, v2.8b
        trn2            v1.8b, v1.8b, v2.8b
        trn1            v2.8b, v3.8b, v4.8b
        trn2            v3.8b, v3.8b, v4.8b
        trn1            v4.4h, v6.4h, v2.4h     // P1, P5
        trn1            v7.4h, v1.4h, v3.4h     // P2, P6
        trn2            v2.4h, v6.4h, v2.4h     // P3, P7
        trn2            v1.4h, v1.4h, v3.4h     // P4, P8
        ushll           v3.8h, v4.8b, #1        // 2*P1, 2*P5
        uxtl            v6.8h, v7.8b            // P2, P6
        uxtl            v7.8h, v2.8b            // P3, P7
        uxtl            v1.8h, v1.8b            // P4, P8
        mls             v3.8h, v6.8h, v0.h[1]   // 2*P1-5*P2, 2*P5-5*P6
        ushll           v2.8h, v2.8b, #1        // 2*P3, 2*P7
        uxtl            v4.8h, v4.8b            // P1, P5
        mla             v3.8h, v7.8h, v0.h[1]   // 2*P1-5*P2+5*P3, 2*P5-5*P6+5*P7
        mov             d6, v6.d[1]             // P6
        mls             v3.8h, v1.8h, v0.h[0]   // 2*P1-5*P2+5*P3-2*P4, 2*P5-5*P6+5*P7-2*P8
        mov             d4, v4.d[1]             // P5
        mls             v2.4h, v1.4h, v0.h[1]   // 2*P3-5*P4
        mla             v2.4h, v4.4h, v0.h[1]   // 2*P3-5*P4+5*P5
        sub             v7.4h, v1.4h, v4.4h     // P4-P5
        mls             v2.4h, v6.4h, v0.h[0]   // 2*P3-5*P4+5*P5-2*P6
        srshr           v3.8h, v3.8h, #3
        abs             v6.4h, v7.4h
        sshr            v7.4h, v7.4h, #8        // clip_sign
        srshr           v2.4h, v2.4h, #3
        abs             v3.8h, v3.8h            // a1, a2
        sshr            v6.4h, v6.4h, #1        // clip
        mov             d16, v3.d[1]            // a2
        abs             v17.4h, v2.4h           // a0
        cmeq            v18.4h, v6.4h, #0       // test clip == 0
        sshr            v2.4h, v2.4h, #8        // a0_sign
        cmhs            v19.4h, v3.4h, v16.4h   // test a1 >= a2
        cmhs            v5.4h, v17.4h, v5.4h    // test a0 >= pq
        sub             v2.4h, v7.4h, v2.4h     // clip_sign - a0_sign
        bsl             v19.8b, v16.8b, v3.8b   // a3
        orr             v3.8b, v18.8b, v5.8b    // test clip == 0 || a0 >= pq
        uqsub           v5.4h, v17.4h, v19.4h   // a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so it makes more sense to subtract this way round than the other way and then take the abs)
        cmhs            v7.4h, v19.4h, v17.4h   // test a3 >= a0
        mul             v0.4h, v5.4h, v0.h[1]   // a0 >= a3 ? 5*(a0-a3) : 0
        orr             v5.8b, v3.8b, v7.8b     // test clip == 0 || a0 >= pq || a3 >= a0
        mov             w2, v5.s[1]             // move to gp reg
        ushr            v0.4h, v0.4h, #3        // a0 >= a3 ? (5*(a0-a3))>>3 : 0
        cmhs            v5.4h, v0.4h, v6.4h
        tbnz            w2, #0, 1f              // none of the 4 pixel pairs is updated if the deciding (3rd) pair is not filtered
        bsl             v5.8b, v6.8b, v0.8b     // FFMIN(d, clip)
        bic             v0.8b, v5.8b, v3.8b     // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 >= a0 case already zeroed by saturating sub)
        mla             v4.4h, v0.4h, v2.4h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
        mls             v1.4h, v0.4h, v2.4h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
        sqxtun          v3.8b, v4.8h
        sqxtun          v2.8b, v1.8h
        st2             {v2.b, v3.b}[0], [x0], x1
        st2             {v2.b, v3.b}[1], [x0], x1
        st2             {v2.b, v3.b}[2], [x0], x1
        st2             {v2.b, v3.b}[3], [x0]
1:      ret
endfunc

// VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of vertically-neighbouring blocks
// On entry:
//   x0 -> top-left pel of lower block
//   x1 = row stride, bytes
//   w2 = PQUANT bitstream parameter
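//
// Same filter as ff_vc1_v_loop_filter4_neon, but processing two groups of
// 4 pixel pairs in one pass; each group is gated independently on its
// deciding pair (see the cmtst masking below).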
function ff_vc1_v_loop_filter8_neon, export=1
        sub             x3, x0, w1, sxtw #2
        ldr             d0, .Lcoeffs
        ld1             {v1.8b}, [x0], x1       // P5
        movi            v2.2d, #0x0000ffff00000000
        ld1             {v3.8b}, [x3], x1       // P1
        ld1             {v4.8b}, [x3], x1       // P2
        ld1             {v5.8b}, [x0], x1       // P6
        ld1             {v6.8b}, [x3], x1       // P3
        ld1             {v7.8b}, [x0], x1       // P7
        ushll           v16.8h, v1.8b, #1       // 2*P5
        ushll           v3.8h, v3.8b, #1        // 2*P1
        ld1             {v17.8b}, [x3]          // P4
        uxtl            v4.8h, v4.8b            // P2
        ld1             {v18.8b}, [x0]          // P8
        uxtl            v5.8h, v5.8b            // P6
        dup             v19.8h, w2              // pq
        uxtl            v20.8h, v6.8b           // P3
        mls             v3.8h, v4.8h, v0.h[1]   // 2*P1-5*P2
        uxtl            v4.8h, v7.8b            // P7
        ushll           v6.8h, v6.8b, #1        // 2*P3
        mls             v16.8h, v5.8h, v0.h[1]  // 2*P5-5*P6
        uxtl            v7.8h, v17.8b           // P4
        uxtl            v17.8h, v18.8b          // P8
        mla             v16.8h, v4.8h, v0.h[1]  // 2*P5-5*P6+5*P7
        uxtl            v1.8h, v1.8b            // P5
        mla             v3.8h, v20.8h, v0.h[1]  // 2*P1-5*P2+5*P3
        sub             v4.8h, v7.8h, v1.8h     // P4-P5
        mls             v6.8h, v7.8h, v0.h[1]   // 2*P3-5*P4
        mls             v16.8h, v17.8h, v0.h[0] // 2*P5-5*P6+5*P7-2*P8
        abs             v17.8h, v4.8h
        sshr            v4.8h, v4.8h, #8        // clip_sign
        mls             v3.8h, v7.8h, v0.h[0]   // 2*P1-5*P2+5*P3-2*P4
        sshr            v17.8h, v17.8h, #1      // clip
        mla             v6.8h, v1.8h, v0.h[1]   // 2*P3-5*P4+5*P5
        srshr           v16.8h, v16.8h, #3
        mls             v6.8h, v5.8h, v0.h[0]   // 2*P3-5*P4+5*P5-2*P6
        cmeq            v5.8h, v17.8h, #0       // test clip == 0
        srshr           v3.8h, v3.8h, #3
        abs             v16.8h, v16.8h          // a2
        abs             v3.8h, v3.8h            // a1
        srshr           v6.8h, v6.8h, #3
        cmhs            v18.8h, v3.8h, v16.8h   // test a1 >= a2
        abs             v20.8h, v6.8h           // a0
        sshr            v6.8h, v6.8h, #8        // a0_sign
        bsl             v18.16b, v16.16b, v3.16b // a3
        cmhs            v3.8h, v20.8h, v19.8h   // test a0 >= pq
        sub             v4.8h, v4.8h, v6.8h     // clip_sign - a0_sign
        uqsub           v6.8h, v20.8h, v18.8h   // a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so it makes more sense to subtract this way round than the other way and then take the abs)
        cmhs            v16.8h, v18.8h, v20.8h  // test a3 >= a0
        orr             v3.16b, v5.16b, v3.16b  // test clip == 0 || a0 >= pq
        mul             v0.8h, v6.8h, v0.h[1]   // a0 >= a3 ? 5*(a0-a3) : 0
        orr             v5.16b, v3.16b, v16.16b // test clip == 0 || a0 >= pq || a3 >= a0
        cmtst           v2.2d, v5.2d, v2.2d     // if the 2nd of each group of 4 is not filtered, then none of the others in the group should be either
        mov             w0, v5.s[1]             // move to gp reg
        ushr            v0.8h, v0.8h, #3        // a0 >= a3 ? (5*(a0-a3))>>3 : 0
        mov             w2, v5.s[3]
        orr             v2.16b, v3.16b, v2.16b
        cmhs            v3.8h, v0.8h, v17.8h
        and             w0, w0, w2
        bsl             v3.16b, v17.16b, v0.16b // FFMIN(d, clip)
        tbnz            w0, #0, 1f              // none of the 8 pixel pairs should be updated in this case
        bic             v0.16b, v3.16b, v2.16b  // set each d to zero if it should not be filtered
        mls             v7.8h, v0.8h, v4.8h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
        mla             v1.8h, v0.8h, v4.8h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
        sqxtun          v0.8b, v7.8h
        sqxtun          v1.8b, v1.8h
        st1             {v0.8b}, [x3], x1
        st1             {v1.8b}, [x3]
1:      ret
endfunc

// VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of horizontally-neighbouring blocks
// On entry:
//   x0 -> top-left pel of right block
//   x1 = row stride, bytes
//   w2 = PQUANT bitstream parameter
function ff_vc1_h_loop_filter8_neon, export=1
        sub             x3, x0, #4              // where to start reading
        ldr             d0, .Lcoeffs
        ld1             {v1.8b}, [x3], x1       // P1[0], P2[0]...
        sub             x0, x0, #1              // where to start writing
        ld1             {v2.8b}, [x3], x1
        add             x4, x0, x1, lsl #2
        ld1             {v3.8b}, [x3], x1
        ld1             {v4.8b}, [x3], x1
        ld1             {v5.8b}, [x3], x1
        ld1             {v6.8b}, [x3], x1
        ld1             {v7.8b}, [x3], x1
        trn1            v16.8b, v1.8b, v2.8b    // P1[0], P1[1], P3[0]...
        ld1             {v17.8b}, [x3]
        trn2            v1.8b, v1.8b, v2.8b     // P2[0], P2[1], P4[0]...
        trn1            v2.8b, v3.8b, v4.8b     // P1[2], P1[3], P3[2]...
        trn2            v3.8b, v3.8b, v4.8b     // P2[2], P2[3], P4[2]...
        dup             v4.8h, w2               // pq
        trn1            v18.8b, v5.8b, v6.8b    // P1[4], P1[5], P3[4]...
        trn2            v5.8b, v5.8b, v6.8b     // P2[4], P2[5], P4[4]...
        trn1            v6.4h, v16.4h, v2.4h    // P1[0], P1[1], P1[2], P1[3], P5[0]...
        trn1            v19.4h, v1.4h, v3.4h    // P2[0], P2[1], P2[2], P2[3], P6[0]...
        trn1            v20.8b, v7.8b, v17.8b   // P1[6], P1[7], P3[6]...
        trn2            v7.8b, v7.8b, v17.8b    // P2[6], P2[7], P4[6]...
        trn2            v2.4h, v16.4h, v2.4h    // P3[0], P3[1], P3[2], P3[3], P7[0]...
        trn2            v1.4h, v1.4h, v3.4h     // P4[0], P4[1], P4[2], P4[3], P8[0]...
        trn1            v3.4h, v18.4h, v20.4h   // P1[4], P1[5], P1[6], P1[7], P5[4]...
        trn1            v16.4h, v5.4h, v7.4h    // P2[4], P2[5], P2[6], P2[7], P6[4]...
        trn2            v17.4h, v18.4h, v20.4h  // P3[4], P3[5], P3[6], P3[7], P7[4]...
        trn2            v5.4h, v5.4h, v7.4h     // P4[4], P4[5], P4[6], P4[7], P8[4]...
        trn1            v7.2s, v6.2s, v3.2s     // P1
        trn1            v18.2s, v19.2s, v16.2s  // P2
        trn2            v3.2s, v6.2s, v3.2s     // P5
        trn2            v6.2s, v19.2s, v16.2s   // P6
        trn1            v16.2s, v2.2s, v17.2s   // P3
        trn2            v2.2s, v2.2s, v17.2s    // P7
        ushll           v7.8h, v7.8b, #1        // 2*P1
        trn1            v17.2s, v1.2s, v5.2s    // P4
        ushll           v19.8h, v3.8b, #1       // 2*P5
        trn2            v1.2s, v1.2s, v5.2s     // P8
        uxtl            v5.8h, v18.8b           // P2
        uxtl            v6.8h, v6.8b            // P6
        uxtl            v18.8h, v16.8b          // P3
        mls             v7.8h, v5.8h, v0.h[1]   // 2*P1-5*P2
        uxtl            v2.8h, v2.8b            // P7
        ushll           v5.8h, v16.8b, #1       // 2*P3
        mls             v19.8h, v6.8h, v0.h[1]  // 2*P5-5*P6
        uxtl            v16.8h, v17.8b          // P4
        uxtl            v1.8h, v1.8b            // P8
        mla             v19.8h, v2.8h, v0.h[1]  // 2*P5-5*P6+5*P7
        uxtl            v2.8h, v3.8b            // P5
        mla             v7.8h, v18.8h, v0.h[1]  // 2*P1-5*P2+5*P3
        sub             v3.8h, v16.8h, v2.8h    // P4-P5
        mls             v5.8h, v16.8h, v0.h[1]  // 2*P3-5*P4
        mls             v19.8h, v1.8h, v0.h[0]  // 2*P5-5*P6+5*P7-2*P8
        abs             v1.8h, v3.8h
        sshr            v3.8h, v3.8h, #8        // clip_sign
        mls             v7.8h, v16.8h, v0.h[0]  // 2*P1-5*P2+5*P3-2*P4
        sshr            v1.8h, v1.8h, #1        // clip
        mla             v5.8h, v2.8h, v0.h[1]   // 2*P3-5*P4+5*P5
        srshr           v17.8h, v19.8h, #3
        mls             v5.8h, v6.8h, v0.h[0]   // 2*P3-5*P4+5*P5-2*P6
        cmeq            v6.8h, v1.8h, #0        // test clip == 0
        srshr           v7.8h, v7.8h, #3
        abs             v17.8h, v17.8h          // a2
        abs             v7.8h, v7.8h            // a1
        srshr           v5.8h, v5.8h, #3
        cmhs            v18.8h, v7.8h, v17.8h   // test a1 >= a2
        abs             v19.8h, v5.8h           // a0
        sshr            v5.8h, v5.8h, #8        // a0_sign
        bsl             v18.16b, v17.16b, v7.16b // a3
        cmhs            v4.8h, v19.8h, v4.8h    // test a0 >= pq
        sub             v3.8h, v3.8h, v5.8h     // clip_sign - a0_sign
        uqsub           v5.8h, v19.8h, v18.8h   // a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so it makes more sense to subtract this way round than the other way and then take the abs)
        cmhs            v7.8h, v18.8h, v19.8h   // test a3 >= a0
        orr             v4.16b, v6.16b, v4.16b  // test clip == 0 || a0 >= pq
        mul             v0.8h, v5.8h, v0.h[1]   // a0 >= a3 ? 5*(a0-a3) : 0
        orr             v5.16b, v4.16b, v7.16b  // test clip == 0 || a0 >= pq || a3 >= a0
        mov             w2, v5.s[1]             // move to gp reg
        ushr            v0.8h, v0.8h, #3        // a0 >= a3 ? (5*(a0-a3))>>3 : 0
        mov             w3, v5.s[3]
        cmhs            v5.8h, v0.8h, v1.8h
        and             w5, w2, w3
        bsl             v5.16b, v1.16b, v0.16b  // FFMIN(d, clip)
        tbnz            w5, #0, 2f              // none of the 8 pixel pairs should be updated in this case
        bic             v0.16b, v5.16b, v4.16b  // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
        mla             v2.8h, v0.8h, v3.8h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
        mls             v16.8h, v0.8h, v3.8h    // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
        sqxtun          v1.8b, v2.8h
        sqxtun          v0.8b, v16.8h
        tbnz            w2, #0, 1f              // none of the first 4 pixel pairs should be updated if so
        st2             {v0.b, v1.b}[0], [x0], x1
        st2             {v0.b, v1.b}[1], [x0], x1
        st2             {v0.b, v1.b}[2], [x0], x1
        st2             {v0.b, v1.b}[3], [x0]
1:      tbnz            w3, #0, 2f              // none of the second 4 pixel pairs should be updated if so
        st2             {v0.b, v1.b}[4], [x4], x1
        st2             {v0.b, v1.b}[5], [x4], x1
        st2             {v0.b, v1.b}[6], [x4], x1
        st2             {v0.b, v1.b}[7], [x4]
2:      ret
endfunc

// VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of vertically-neighbouring blocks
// On entry:
//   x0 -> top-left pel of lower block
//   x1 = row stride, bytes
//   w2 = PQUANT bitstream parameter
function ff_vc1_v_loop_filter16_neon, export=1
        sub             x3, x0, w1, sxtw #2
        ldr             d0, .Lcoeffs
        ld1             {v1.16b}, [x0], x1      // P5
        movi            v2.2d, #0x0000ffff00000000
        ld1             {v3.16b}, [x3], x1      // P1
        ld1             {v4.16b}, [x3], x1      // P2
        ld1             {v5.16b}, [x0], x1      // P6
        ld1             {v6.16b}, [x3], x1      // P3
        ld1             {v7.16b}, [x0], x1      // P7
        ushll           v16.8h, v1.8b, #1       // 2*P5[0..7]
        ushll           v17.8h, v3.8b, #1       // 2*P1[0..7]
        ld1             {v18.16b}, [x3]         // P4
        uxtl            v19.8h, v4.8b           // P2[0..7]
        ld1             {v20.16b}, [x0]         // P8
        uxtl            v21.8h, v5.8b           // P6[0..7]
        dup             v22.8h, w2              // pq
        ushll2          v3.8h, v3.16b, #1       // 2*P1[8..15]
        mls             v17.8h, v19.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7]
        ushll2          v19.8h, v1.16b, #1      // 2*P5[8..15]
        uxtl2           v4.8h, v4.16b           // P2[8..15]
        mls             v16.8h, v21.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]
        uxtl2           v5.8h, v5.16b           // P6[8..15]
        uxtl            v23.8h, v6.8b           // P3[0..7]
        uxtl            v24.8h, v7.8b           // P7[0..7]
        mls             v3.8h, v4.8h, v0.h[1]   // 2*P1[8..15]-5*P2[8..15]
        ushll           v4.8h, v6.8b, #1        // 2*P3[0..7]
        uxtl            v25.8h, v18.8b          // P4[0..7]
        mls             v19.8h, v5.8h, v0.h[1]  // 2*P5[8..15]-5*P6[8..15]
        uxtl2           v26.8h, v6.16b          // P3[8..15]
        mla             v17.8h, v23.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
        uxtl2           v7.8h, v7.16b           // P7[8..15]
        ushll2          v6.8h, v6.16b, #1       // 2*P3[8..15]
        mla             v16.8h, v24.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
        uxtl2           v18.8h, v18.16b         // P4[8..15]
        uxtl            v23.8h, v20.8b          // P8[0..7]
        mls             v4.8h, v25.8h, v0.h[1]  // 2*P3[0..7]-5*P4[0..7]
        uxtl            v24.8h, v1.8b           // P5[0..7]
        uxtl2           v20.8h, v20.16b         // P8[8..15]
        mla             v3.8h, v26.8h, v0.h[1]  // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
        uxtl2           v1.8h, v1.16b           // P5[8..15]
        sub             v26.8h, v25.8h, v24.8h  // P4[0..7]-P5[0..7]
        mla             v19.8h, v7.8h, v0.h[1]  // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
        sub             v7.8h, v18.8h, v1.8h    // P4[8..15]-P5[8..15]
        mls             v6.8h, v18.8h, v0.h[1]  // 2*P3[8..15]-5*P4[8..15]
        abs             v27.8h, v26.8h
        sshr            v26.8h, v26.8h, #8      // clip_sign[0..7]
        mls             v17.8h, v25.8h, v0.h[0] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
        abs             v28.8h, v7.8h
        sshr            v27.8h, v27.8h, #1      // clip[0..7]
        mls             v16.8h, v23.8h, v0.h[0] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
        sshr            v7.8h, v7.8h, #8        // clip_sign[8..15]
        sshr            v23.8h, v28.8h, #1      // clip[8..15]
        mla             v4.8h, v24.8h, v0.h[1]  // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
        cmeq            v28.8h, v27.8h, #0      // test clip[0..7] == 0
        srshr           v17.8h, v17.8h, #3
        mls             v3.8h, v18.8h, v0.h[0]  // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
        cmeq            v29.8h, v23.8h, #0      // test clip[8..15] == 0
        srshr           v16.8h, v16.8h, #3
        mls             v19.8h, v20.8h, v0.h[0] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
        abs             v17.8h, v17.8h          // a1[0..7]
        mla             v6.8h, v1.8h, v0.h[1]   // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
        srshr           v3.8h, v3.8h, #3
        mls             v4.8h, v21.8h, v0.h[0]  // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
        abs             v16.8h, v16.8h          // a2[0..7]
        srshr           v19.8h, v19.8h, #3
        mls             v6.8h, v5.8h, v0.h[0]   // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
        cmhs            v5.8h, v17.8h, v16.8h   // test a1[0..7] >= a2[0..7]
        abs             v3.8h, v3.8h            // a1[8..15]
        srshr           v4.8h, v4.8h, #3
        abs             v19.8h, v19.8h          // a2[8..15]
        bsl             v5.16b, v16.16b, v17.16b // a3[0..7]
        srshr           v6.8h, v6.8h, #3
        cmhs            v16.8h, v3.8h, v19.8h   // test a1[8..15] >= a2[8..15]
        abs             v17.8h, v4.8h           // a0[0..7]
        sshr            v4.8h, v4.8h, #8        // a0_sign[0..7]
        bsl             v16.16b, v19.16b, v3.16b // a3[8..15]
        uqsub           v3.8h, v17.8h, v5.8h    // a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0  (a0 > a3 in all cases where filtering is enabled, so it makes more sense to subtract this way round than the other way and then take the abs)
        abs             v19.8h, v6.8h           // a0[8..15]
        cmhs            v20.8h, v17.8h, v22.8h  // test a0[0..7] >= pq
        cmhs            v5.8h, v5.8h, v17.8h    // test a3[0..7] >= a0[0..7]
        sub             v4.8h, v26.8h, v4.8h    // clip_sign[0..7] - a0_sign[0..7]
        sshr            v6.8h, v6.8h, #8        // a0_sign[8..15]
        mul             v3.8h, v3.8h, v0.h[1]   // a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
        uqsub           v17.8h, v19.8h, v16.8h  // a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0  (a0 > a3 in all cases where filtering is enabled, so it makes more sense to subtract this way round than the other way and then take the abs)
        orr             v20.16b, v28.16b, v20.16b // test clip[0..7] == 0 || a0[0..7] >= pq
        cmhs            v21.8h, v19.8h, v22.8h  // test a0[8..15] >= pq
        cmhs            v16.8h, v16.8h, v19.8h  // test a3[8..15] >= a0[8..15]
        mul             v0.8h, v17.8h, v0.h[1]  // a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
        sub             v6.8h, v7.8h, v6.8h     // clip_sign[8..15] - a0_sign[8..15]
        orr             v5.16b, v20.16b, v5.16b // test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
        ushr            v3.8h, v3.8h, #3        // a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
        orr             v7.16b, v29.16b, v21.16b // test clip[8..15] == 0 || a0[8..15] >= pq
        cmtst           v17.2d, v5.2d, v2.2d    // if the 2nd of each group of 4 is not filtered, then none of the others in the group should be either
        mov             w0, v5.s[1]             // move to gp reg
        cmhs            v19.8h, v3.8h, v27.8h
        ushr            v0.8h, v0.8h, #3        // a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
        mov             w2, v5.s[3]
        orr             v5.16b, v7.16b, v16.16b // test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
        orr             v16.16b, v20.16b, v17.16b
        bsl             v19.16b, v27.16b, v3.16b // FFMIN(d[0..7], clip[0..7])
        cmtst           v2.2d, v5.2d, v2.2d
        cmhs            v3.8h, v0.8h, v23.8h
        mov             w4, v5.s[1]
        mov             w5, v5.s[3]
        and             w0, w0, w2
        bic             v5.16b, v19.16b, v16.16b // set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
        orr             v2.16b, v7.16b, v2.16b
        bsl             v3.16b, v23.16b, v0.16b // FFMIN(d[8..15], clip[8..15])
        mls             v25.8h, v5.8h, v4.8h    // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4[0..7]
        and             w2, w4, w5
        bic             v0.16b, v3.16b, v2.16b  // set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
        mla             v24.8h, v5.8h, v4.8h    // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5[0..7]
        and             w0, w0, w2
        mls             v18.8h, v0.8h, v6.8h    // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4[8..15]
        sqxtun          v2.8b, v25.8h
        tbnz            w0, #0, 1f              // none of the 16 pixel pairs should be updated in this case
        mla             v1.8h, v0.8h, v6.8h     // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5[8..15]
        sqxtun          v0.8b, v24.8h
        sqxtun2         v2.16b, v18.8h
        sqxtun2         v0.16b, v1.8h
        st1             {v2.16b}, [x3], x1
        st1             {v0.16b}, [x3]
1:      ret
endfunc

// VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of horizontally-neighbouring blocks
// On entry:
//   x0 -> top-left pel of right block
//   x1 = row stride, bytes
//   w2 = PQUANT bitstream parameter
function ff_vc1_h_loop_filter16_neon, export=1
        sub             x3, x0, #4              // where to start reading
        ldr             d0, .Lcoeffs
        ld1             {v1.8b}, [x3], x1       // P1[0], P2[0]...
        sub             x0, x0, #1              // where to start writing
        ld1             {v2.8b}, [x3], x1
        add             x4, x0, x1, lsl #3
        ld1             {v3.8b}, [x3], x1
        add             x5, x0, x1, lsl #2
        ld1             {v4.8b}, [x3], x1
        add             x6, x4, x1, lsl #2
        ld1             {v5.8b}, [x3], x1
        ld1             {v6.8b}, [x3], x1
        ld1             {v7.8b}, [x3], x1
        trn1            v16.8b, v1.8b, v2.8b    // P1[0], P1[1], P3[0]...
        ld1             {v17.8b}, [x3], x1
        trn2            v1.8b, v1.8b, v2.8b     // P2[0], P2[1], P4[0]...
        ld1             {v2.8b}, [x3], x1
        trn1            v18.8b, v3.8b, v4.8b    // P1[2], P1[3], P3[2]...
        ld1             {v19.8b}, [x3], x1
        trn2            v3.8b, v3.8b, v4.8b     // P2[2], P2[3], P4[2]...
        ld1             {v4.8b}, [x3], x1
        trn1            v20.8b, v5.8b, v6.8b    // P1[4], P1[5], P3[4]...
        ld1             {v21.8b}, [x3], x1
        trn2            v5.8b, v5.8b, v6.8b     // P2[4], P2[5], P4[4]...
        ld1             {v6.8b}, [x3], x1
        trn1            v22.8b, v7.8b, v17.8b   // P1[6], P1[7], P3[6]...
        ld1             {v23.8b}, [x3], x1
        trn2            v7.8b, v7.8b, v17.8b    // P2[6], P2[7], P4[6]...
        ld1             {v17.8b}, [x3], x1
        trn1            v24.8b, v2.8b, v19.8b   // P1[8], P1[9], P3[8]...
        ld1             {v25.8b}, [x3]
        trn2            v2.8b, v2.8b, v19.8b    // P2[8], P2[9], P4[8]...
        trn1            v19.4h, v16.4h, v18.4h  // P1[0], P1[1], P1[2], P1[3], P5[0]...
        trn1            v26.8b, v4.8b, v21.8b   // P1[10], P1[11], P3[10]...
        trn2            v4.8b, v4.8b, v21.8b    // P2[10], P2[11], P4[10]...
        trn1            v21.4h, v1.4h, v3.4h    // P2[0], P2[1], P2[2], P2[3], P6[0]...
        trn1            v27.4h, v20.4h, v22.4h  // P1[4], P1[5], P1[6], P1[7], P5[4]...
        trn1            v28.8b, v6.8b, v23.8b   // P1[12], P1[13], P3[12]...
        trn2            v6.8b, v6.8b, v23.8b    // P2[12], P2[13], P4[12]...
        trn1            v23.4h, v5.4h, v7.4h    // P2[4], P2[5], P2[6], P2[7], P6[4]...
        trn1            v29.4h, v24.4h, v26.4h  // P1[8], P1[9], P1[10], P1[11], P5[8]...
        trn1            v30.8b, v17.8b, v25.8b  // P1[14], P1[15], P3[14]...
        trn2            v17.8b, v17.8b, v25.8b  // P2[14], P2[15], P4[14]...
        trn1            v25.4h, v2.4h, v4.4h    // P2[8], P2[9], P2[10], P2[11], P6[8]...
        trn1            v31.2s, v19.2s, v27.2s  // P1[0..7]
        trn2            v19.2s, v19.2s, v27.2s  // P5[0..7]
        trn1            v27.2s, v21.2s, v23.2s  // P2[0..7]
        trn2            v21.2s, v21.2s, v23.2s  // P6[0..7]
        trn1            v23.4h, v28.4h, v30.4h  // P1[12], P1[13], P1[14], P1[15], P5[12]...
        trn2            v16.4h, v16.4h, v18.4h  // P3[0], P3[1], P3[2], P3[3], P7[0]...
        trn1            v18.4h, v6.4h, v17.4h   // P2[12], P2[13], P2[14], P2[15], P6[12]...
        trn2            v20.4h, v20.4h, v22.4h  // P3[4], P3[5], P3[6], P3[7], P7[4]...
        trn2            v22.4h, v24.4h, v26.4h  // P3[8], P3[9], P3[10], P3[11], P7[8]...
        trn1            v24.2s, v29.2s, v23.2s  // P1[8..15]
        trn2            v23.2s, v29.2s, v23.2s  // P5[8..15]
        trn1            v26.2s, v25.2s, v18.2s  // P2[8..15]
        trn2            v18.2s, v25.2s, v18.2s  // P6[8..15]
        trn2            v25.4h, v28.4h, v30.4h  // P3[12], P3[13], P3[14], P3[15], P7[12]...
        trn2            v1.4h, v1.4h, v3.4h     // P4[0], P4[1], P4[2], P4[3], P8[0]...
        trn2            v3.4h, v5.4h, v7.4h     // P4[4], P4[5], P4[6], P4[7], P8[4]...
        trn2            v2.4h, v2.4h, v4.4h     // P4[8], P4[9], P4[10], P4[11], P8[8]...
        trn2            v4.4h, v6.4h, v17.4h    // P4[12], P4[13], P4[14], P4[15], P8[12]...
        ushll           v5.8h, v31.8b, #1       // 2*P1[0..7]
        ushll           v6.8h, v19.8b, #1       // 2*P5[0..7]
        trn1            v7.2s, v16.2s, v20.2s   // P3[0..7]
        uxtl            v17.8h, v27.8b          // P2[0..7]
        trn2            v16.2s, v16.2s, v20.2s  // P7[0..7]
        uxtl            v20.8h, v21.8b          // P6[0..7]
        trn1            v21.2s, v22.2s, v25.2s  // P3[8..15]
        ushll           v24.8h, v24.8b, #1      // 2*P1[8..15]
        trn2            v22.2s, v22.2s, v25.2s  // P7[8..15]
        ushll           v25.8h, v23.8b, #1      // 2*P5[8..15]
        trn1            v27.2s, v1.2s, v3.2s    // P4[0..7]
        uxtl            v26.8h, v26.8b          // P2[8..15]
        mls             v5.8h, v17.8h, v0.h[1]  // 2*P1[0..7]-5*P2[0..7]
        uxtl            v17.8h, v18.8b          // P6[8..15]
        mls             v6.8h, v20.8h, v0.h[1]  // 2*P5[0..7]-5*P6[0..7]
        trn1            v18.2s, v2.2s, v4.2s    // P4[8..15]
        uxtl            v28.8h, v7.8b           // P3[0..7]
        mls             v24.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15]
        uxtl            v16.8h, v16.8b          // P7[0..7]
        uxtl            v26.8h, v21.8b          // P3[8..15]
        mls             v25.8h, v17.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15]
        uxtl            v22.8h, v22.8b          // P7[8..15]
        ushll           v7.8h, v7.8b, #1        // 2*P3[0..7]
        uxtl            v27.8h, v27.8b          // P4[0..7]
        trn2            v1.2s, v1.2s, v3.2s     // P8[0..7]
        ushll           v3.8h, v21.8b, #1       // 2*P3[8..15]
        trn2            v2.2s, v2.2s, v4.2s     // P8[8..15]
        uxtl            v4.8h, v18.8b           // P4[8..15]
        mla             v5.8h, v28.8h, v0.h[1]  // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
        uxtl            v1.8h, v1.8b            // P8[0..7]
        mla             v6.8h, v16.8h, v0.h[1]  // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
        uxtl            v2.8h, v2.8b            // P8[8..15]
        uxtl            v16.8h, v19.8b          // P5[0..7]
        mla             v24.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
        uxtl            v18.8h, v23.8b          // P5[8..15]
        dup             v19.8h, w2              // pq
        mla             v25.8h, v22.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
        sub             v21.8h, v27.8h, v16.8h  // P4[0..7]-P5[0..7]
        sub             v22.8h, v4.8h, v18.8h   // P4[8..15]-P5[8..15]
        mls             v7.8h, v27.8h, v0.h[1]  // 2*P3[0..7]-5*P4[0..7]
        abs             v23.8h, v21.8h
        mls             v3.8h, v4.8h, v0.h[1]   // 2*P3[8..15]-5*P4[8..15]
        abs             v26.8h, v22.8h
        sshr            v21.8h, v21.8h, #8      // clip_sign[0..7]
        mls             v5.8h, v27.8h, v0.h[0]  // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
        sshr            v23.8h, v23.8h, #1      // clip[0..7]
        sshr            v26.8h, v26.8h, #1      // clip[8..15]
        mls             v6.8h, v1.8h, v0.h[0]   // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
        sshr            v1.8h, v22.8h, #8       // clip_sign[8..15]
        cmeq            v22.8h, v23.8h, #0      // test clip[0..7] == 0
        mls             v24.8h, v4.8h, v0.h[0]  // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
        cmeq            v28.8h, v26.8h, #0      // test clip[8..15] == 0
        srshr           v5.8h, v5.8h, #3
        mls             v25.8h, v2.8h, v0.h[0]  // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
        srshr           v2.8h, v6.8h, #3
        mla             v7.8h, v16.8h, v0.h[1]  // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
        srshr           v6.8h, v24.8h, #3
        mla             v3.8h, v18.8h, v0.h[1]  // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
        abs             v5.8h, v5.8h            // a1[0..7]
        srshr           v24.8h, v25.8h, #3
        mls             v3.8h, v17.8h, v0.h[0]  // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
        abs             v2.8h, v2.8h            // a2[0..7]
        abs             v6.8h, v6.8h            // a1[8..15]
        mls             v7.8h, v20.8h, v0.h[0]  // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
        abs             v17.8h, v24.8h          // a2[8..15]
        cmhs            v20.8h, v5.8h, v2.8h    // test a1[0..7] >= a2[0..7]
        srshr           v3.8h, v3.8h, #3
        cmhs            v24.8h, v6.8h, v17.8h   // test a1[8..15] >= a2[8..15]
        srshr           v7.8h, v7.8h, #3
        bsl             v20.16b, v2.16b, v5.16b // a3[0..7]
        abs             v2.8h, v3.8h            // a0[8..15]
        sshr            v3.8h, v3.8h, #8        // a0_sign[8..15]
        bsl             v24.16b, v17.16b, v6.16b // a3[8..15]
        abs             v5.8h, v7.8h            // a0[0..7]
        sshr            v6.8h, v7.8h, #8        // a0_sign[0..7]
        cmhs            v7.8h, v2.8h, v19.8h    // test a0[8..15] >= pq
        sub             v1.8h, v1.8h, v3.8h     // clip_sign[8..15] - a0_sign[8..15]
        uqsub           v3.8h, v2.8h, v24.8h    // a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0  (a0 > a3 in all cases where filtering is enabled, so it makes more sense to subtract this way round than the other way and then take the abs)
        cmhs            v2.8h, v24.8h, v2.8h    // test a3[8..15] >= a0[8..15]
        uqsub           v17.8h, v5.8h, v20.8h   // a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0  (a0 > a3 in all cases where filtering is enabled, so it makes more sense to subtract this way round than the other way and then take the abs)
        cmhs            v19.8h, v5.8h, v19.8h   // test a0[0..7] >= pq
        orr             v7.16b, v28.16b, v7.16b // test clip[8..15] == 0 || a0[8..15] >= pq
        sub             v6.8h, v21.8h, v6.8h    // clip_sign[0..7] - a0_sign[0..7]
        mul             v3.8h, v3.8h, v0.h[1]   // a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
        cmhs            v5.8h, v20.8h, v5.8h    // test a3[0..7] >= a0[0..7]
        orr             v19.16b, v22.16b, v19.16b // test clip[0..7] == 0 || a0[0..7] >= pq
        mul             v0.8h, v17.8h, v0.h[1]  // a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
        orr             v2.16b, v7.16b, v2.16b  // test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
        orr             v5.16b, v19.16b, v5.16b // test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
        ushr            v3.8h, v3.8h, #3        // a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
        mov             w7, v2.s[1]
        mov             w8, v2.s[3]
        ushr            v0.8h, v0.8h, #3        // a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
        mov             w2, v5.s[1]             // move to gp reg
        cmhs            v2.8h, v3.8h, v26.8h
        mov             w3, v5.s[3]
        cmhs            v5.8h, v0.8h, v23.8h
        bsl             v2.16b, v26.16b, v3.16b // FFMIN(d[8..15], clip[8..15])
        and             w9, w7, w8
        bsl             v5.16b, v23.16b, v0.16b // FFMIN(d[0..7], clip[0..7])
        and             w10, w2, w3
        bic             v0.16b, v2.16b, v7.16b  // set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
        and             w9, w10, w9
        bic             v2.16b, v5.16b, v19.16b // set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
        mls             v4.8h, v0.8h, v1.8h     // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4
        tbnz            w9, #0, 4f              // none of the 16 pixel pairs should be updated in this case
        mls             v27.8h, v2.8h, v6.8h    // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4
        mla             v16.8h, v2.8h, v6.8h    // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5
        sqxtun          v2.8b, v4.8h
        mla             v18.8h, v0.8h, v1.8h    // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5
        sqxtun          v0.8b, v27.8h
        sqxtun          v1.8b, v16.8h
        sqxtun          v3.8b, v18.8h
        tbnz            w2, #0, 1f
        st2             {v0.b, v1.b}[0], [x0], x1
        st2             {v0.b, v1.b}[1], [x0], x1
        st2             {v0.b, v1.b}[2], [x0], x1
        st2             {v0.b, v1.b}[3], [x0]
1:      tbnz            w3, #0, 2f
        st2             {v0.b, v1.b}[4], [x5], x1
        st2             {v0.b, v1.b}[5], [x5], x1
        st2             {v0.b, v1.b}[6], [x5], x1
        st2             {v0.b, v1.b}[7], [x5]
2:      tbnz            w7, #0, 3f
        st2             {v2.b, v3.b}[0], [x4], x1
        st2             {v2.b, v3.b}[1], [x4], x1
        st2             {v2.b, v3.b}[2], [x4], x1
        st2             {v2.b, v3.b}[3], [x4]
3:      tbnz            w8, #0, 4f
        st2             {v2.b, v3.b}[4], [x6], x1
        st2             {v2.b, v3.b}[5], [x6], x1
        st2             {v2.b, v3.b}[6], [x6], x1
        st2             {v2.b, v3.b}[7], [x6]
4:      ret
endfunc

// Copy at most the specified number of bytes from the source to the destination
// buffer, stopping at a multiple of 32 bytes such that none of the bytes copied
// is the start of an escape sequence
// On entry:
//   x0 -> source buffer
//   w1 = max number of bytes to copy
//   x2 -> destination buffer, optimally 8-byte aligned
// On exit:
//   w0 = number of bytes not copied
function ff_vc1_unescape_buffer_helper_neon, export=1
        // Offset by 80 to screen out cases that are too short for us to handle,
        // and also make it easy to test for loop termination, or to determine
        // whether we need an odd number of half-iterations of the loop.
        subs            w1, w1, #80
        b.mi            90f

        // Set up useful constants
        movi            v20.4s, #3, lsl #24
        movi            v21.4s, #3, lsl #16

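        // From here the code is software-pipelined: each extra level of
        // indentation below marks instructions that operate on the next
        // 32-byte chunk, overlapped with the checks on the current one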
        tst             w1, #32
        b.ne            1f

          ld1             {v0.16b, v1.16b, v2.16b}, [x0], #48
          ext             v25.16b, v0.16b, v1.16b, #1
          ext             v26.16b, v0.16b, v1.16b, #2
          ext             v27.16b, v0.16b, v1.16b, #3
          ext             v29.16b, v1.16b, v2.16b, #1
          ext             v30.16b, v1.16b, v2.16b, #2
          ext             v31.16b, v1.16b, v2.16b, #3
          bic             v24.16b, v0.16b, v20.16b
          bic             v25.16b, v25.16b, v20.16b
          bic             v26.16b, v26.16b, v20.16b
          bic             v27.16b, v27.16b, v20.16b
          bic             v28.16b, v1.16b, v20.16b
          bic             v29.16b, v29.16b, v20.16b
          bic             v30.16b, v30.16b, v20.16b
          bic             v31.16b, v31.16b, v20.16b
          eor             v24.16b, v24.16b, v21.16b
          eor             v25.16b, v25.16b, v21.16b
          eor             v26.16b, v26.16b, v21.16b
          eor             v27.16b, v27.16b, v21.16b
          eor             v28.16b, v28.16b, v21.16b
          eor             v29.16b, v29.16b, v21.16b
          eor             v30.16b, v30.16b, v21.16b
          eor             v31.16b, v31.16b, v21.16b
          cmeq            v24.4s, v24.4s, #0
          cmeq            v25.4s, v25.4s, #0
          cmeq            v26.4s, v26.4s, #0
          cmeq            v27.4s, v27.4s, #0
          add             w1, w1, #32
          b               3f

1:      ld1             {v3.16b, v4.16b, v5.16b}, [x0], #48
        ext             v25.16b, v3.16b, v4.16b, #1
        ext             v26.16b, v3.16b, v4.16b, #2
        ext             v27.16b, v3.16b, v4.16b, #3
        ext             v29.16b, v4.16b, v5.16b, #1
        ext             v30.16b, v4.16b, v5.16b, #2
        ext             v31.16b, v4.16b, v5.16b, #3
        bic             v24.16b, v3.16b, v20.16b
        bic             v25.16b, v25.16b, v20.16b
        bic             v26.16b, v26.16b, v20.16b
        bic             v27.16b, v27.16b, v20.16b
        bic             v28.16b, v4.16b, v20.16b
        bic             v29.16b, v29.16b, v20.16b
        bic             v30.16b, v30.16b, v20.16b
        bic             v31.16b, v31.16b, v20.16b
        eor             v24.16b, v24.16b, v21.16b
        eor             v25.16b, v25.16b, v21.16b
        eor             v26.16b, v26.16b, v21.16b
        eor             v27.16b, v27.16b, v21.16b
        eor             v28.16b, v28.16b, v21.16b
        eor             v29.16b, v29.16b, v21.16b
        eor             v30.16b, v30.16b, v21.16b
        eor             v31.16b, v31.16b, v21.16b
        cmeq            v24.4s, v24.4s, #0
        cmeq            v25.4s, v25.4s, #0
        cmeq            v26.4s, v26.4s, #0
        cmeq            v27.4s, v27.4s, #0
        // Drop through...
2:        mov             v0.16b, v5.16b
          ld1             {v1.16b, v2.16b}, [x0], #32
        cmeq            v28.4s, v28.4s, #0
        cmeq            v29.4s, v29.4s, #0
        cmeq            v30.4s, v30.4s, #0
        cmeq            v31.4s, v31.4s, #0
        orr             v24.16b, v24.16b, v25.16b
        orr             v26.16b, v26.16b, v27.16b
        orr             v28.16b, v28.16b, v29.16b
        orr             v30.16b, v30.16b, v31.16b
          ext             v25.16b, v0.16b, v1.16b, #1
        orr             v22.16b, v24.16b, v26.16b
          ext             v26.16b, v0.16b, v1.16b, #2
          ext             v27.16b, v0.16b, v1.16b, #3
          ext             v29.16b, v1.16b, v2.16b, #1
        orr             v23.16b, v28.16b, v30.16b
          ext             v30.16b, v1.16b, v2.16b, #2
          ext             v31.16b, v1.16b, v2.16b, #3
          bic             v24.16b, v0.16b, v20.16b
          bic             v25.16b, v25.16b, v20.16b
          bic             v26.16b, v26.16b, v20.16b
        orr             v22.16b, v22.16b, v23.16b
          bic             v27.16b, v27.16b, v20.16b
          bic             v28.16b, v1.16b, v20.16b
          bic             v29.16b, v29.16b, v20.16b
          bic             v30.16b, v30.16b, v20.16b
          bic             v31.16b, v31.16b, v20.16b
        addv            s22, v22.4s
          eor             v24.16b, v24.16b, v21.16b
          eor             v25.16b, v25.16b, v21.16b
          eor             v26.16b, v26.16b, v21.16b
          eor             v27.16b, v27.16b, v21.16b
          eor             v28.16b, v28.16b, v21.16b
        mov             w3, v22.s[0]
          eor             v29.16b, v29.16b, v21.16b
          eor             v30.16b, v30.16b, v21.16b
          eor             v31.16b, v31.16b, v21.16b
          cmeq            v24.4s, v24.4s, #0
          cmeq            v25.4s, v25.4s, #0
          cmeq            v26.4s, v26.4s, #0
          cmeq            v27.4s, v27.4s, #0
        cbnz            w3, 90f
        st1             {v3.16b, v4.16b}, [x2], #32
3:          mov             v3.16b, v2.16b
            ld1             {v4.16b, v5.16b}, [x0], #32
          cmeq            v28.4s, v28.4s, #0
          cmeq            v29.4s, v29.4s, #0
          cmeq            v30.4s, v30.4s, #0
          cmeq            v31.4s, v31.4s, #0
          orr             v24.16b, v24.16b, v25.16b
          orr             v26.16b, v26.16b, v27.16b
          orr             v28.16b, v28.16b, v29.16b
          orr             v30.16b, v30.16b, v31.16b
            ext             v25.16b, v3.16b, v4.16b, #1
          orr             v22.16b, v24.16b, v26.16b
            ext             v26.16b, v3.16b, v4.16b, #2
            ext             v27.16b, v3.16b, v4.16b, #3
            ext             v29.16b, v4.16b, v5.16b, #1
          orr             v23.16b, v28.16b, v30.16b
            ext             v30.16b, v4.16b, v5.16b, #2
            ext             v31.16b, v4.16b, v5.16b, #3
            bic             v24.16b, v3.16b, v20.16b
            bic             v25.16b, v25.16b, v20.16b
            bic             v26.16b, v26.16b, v20.16b
          orr             v22.16b, v22.16b, v23.16b
            bic             v27.16b, v27.16b, v20.16b
            bic             v28.16b, v4.16b, v20.16b
            bic             v29.16b, v29.16b, v20.16b
            bic             v30.16b, v30.16b, v20.16b
            bic             v31.16b, v31.16b, v20.16b
          addv            s22, v22.4s
            eor             v24.16b, v24.16b, v21.16b
            eor             v25.16b, v25.16b, v21.16b
            eor             v26.16b, v26.16b, v21.16b
            eor             v27.16b, v27.16b, v21.16b
            eor             v28.16b, v28.16b, v21.16b
          mov             w3, v22.s[0]
            eor             v29.16b, v29.16b, v21.16b
            eor             v30.16b, v30.16b, v21.16b
            eor             v31.16b, v31.16b, v21.16b
            cmeq            v24.4s, v24.4s, #0
            cmeq            v25.4s, v25.4s, #0
            cmeq            v26.4s, v26.4s, #0
            cmeq            v27.4s, v27.4s, #0
          cbnz            w3, 91f
          st1             {v0.16b, v1.16b}, [x2], #32
        subs            w1, w1, #64
        b.pl            2b

90:     add             w0, w1, #80
        ret

91:     sub             w1, w1, #32
        b               90b
endfunc