/*
 * VC1 AArch64 NEON optimisations
 *
 * Copyright (c) 2022 Ben Avison <bavison@riscosopen.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

// VC-1 8x8 inverse transform
// On entry:
//   x0 -> array of 16-bit inverse transform coefficients, in column-major order
// On exit:
//   array at x0 updated in place to hold the transformed block, now in row-major order
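//
// A scalar sketch of the 8-point kernel this routine vectorises, reconstructed
// from the annotations below rather than copied from any C reference, so treat
// it as illustrative only.  Equivalent C prototype (a sketch matching the
// vc1dsp function-pointer type):
//   void ff_vc1_inv_trans_8x8_neon(int16_t *block);
// Even part:
//   t1 = 12 * (src[ 0] + src[32]);    t2 = 12 * (src[ 0] - src[32]);
//   t3 = 16 * src[16] +  6 * src[48]; t4 =  6 * src[16] - 16 * src[48];
//   t5 = t1 + t3; t6 = t2 + t4; t7 = t2 - t4; t8 = t1 - t3;
// Odd part (the annotations below reuse the names t1..t4 for these):
//   u1 = 16 * src[ 8] + 15 * src[24] +  9 * src[40] +  4 * src[56];
//   u2 = 15 * src[ 8] -  4 * src[24] - 16 * src[40] -  9 * src[56];
//   u3 =  9 * src[ 8] - 16 * src[24] +  4 * src[40] + 15 * src[56];
//   u4 =  4 * src[ 8] -  9 * src[24] + 15 * src[40] - 16 * src[56];
// Row outputs, each rounded as (x + 4) >> 3 in this first pass, are
//   t5+u1, t6+u2, t7+u3, t8+u4, t8-u4, t7-u3, t6-u2, t5-u1
// The second pass rounds as (x + 64) >> 7 instead, with an extra +1 on the
// four difference outputs.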
function ff_vc1_inv_trans_8x8_neon, export=1
        ld1             {v1.16b, v2.16b}, [x0], #32
        ld1             {v3.16b, v4.16b}, [x0], #32
        ld1             {v5.16b, v6.16b}, [x0], #32
        shl             v1.8h, v1.8h, #2        //         8/2 * src[0]
        sub             x1, x0, #3*32           // x1 -> start of block, for the final stores
        ld1             {v16.16b, v17.16b}, [x0]
        shl             v7.8h, v2.8h, #4        //          16 * src[8]
        shl             v18.8h, v2.8h, #2       //           4 * src[8]
        shl             v19.8h, v4.8h, #4       //                        16 * src[24]
        ldr             d0, .Lcoeffs_it8
        shl             v5.8h, v5.8h, #2        //                                      8/2 * src[32]
        shl             v20.8h, v6.8h, #4       //                                       16 * src[40]
        shl             v21.8h, v6.8h, #2       //                                        4 * src[40]
        shl             v22.8h, v17.8h, #4      //                                                      16 * src[56]
        ssra            v20.8h, v19.8h, #2      //                         4 * src[24] + 16 * src[40]
        mul             v23.8h, v3.8h, v0.h[0]  //                       6/2 * src[16]
        sub             v19.8h, v19.8h, v21.8h  //                        16 * src[24] -  4 * src[40]
        ssra            v7.8h, v22.8h, #2       //          16 * src[8]                               +  4 * src[56]
        sub             v18.8h, v22.8h, v18.8h  //        -  4 * src[8]                               + 16 * src[56]
        shl             v3.8h, v3.8h, #3        //                      16/2 * src[16]
        mls             v20.8h, v2.8h, v0.h[2]  //        - 15 * src[8] +  4 * src[24] + 16 * src[40]
        ssra            v1.8h, v1.8h, #1        //        12/2 * src[0]
        ssra            v5.8h, v5.8h, #1        //                                     12/2 * src[32]
        mla             v7.8h, v4.8h, v0.h[2]   //          16 * src[8] + 15 * src[24]                +  4 * src[56]
        shl             v21.8h, v16.8h, #3      //                                                    16/2 * src[48]
        mls             v19.8h, v2.8h, v0.h[1]  //        -  9 * src[8] + 16 * src[24] -  4 * src[40]
        sub             v2.8h, v23.8h, v21.8h   // t4/2 =                6/2 * src[16]              - 16/2 * src[48]
        mla             v18.8h, v4.8h, v0.h[1]  //        -  4 * src[8] +  9 * src[24]                + 16 * src[56]
        add             v4.8h, v1.8h, v5.8h     // t1/2 = 12/2 * src[0]              + 12/2 * src[32]
        sub             v1.8h, v1.8h, v5.8h     // t2/2 = 12/2 * src[0]              - 12/2 * src[32]
        mla             v3.8h, v16.8h, v0.h[0]  // t3/2 =               16/2 * src[16]              +  6/2 * src[48]
        mla             v7.8h, v6.8h, v0.h[1]   //  t1  =   16 * src[8] + 15 * src[24] +  9 * src[40] +  4 * src[56]
        add             v5.8h, v1.8h, v2.8h     // t6/2 = t2/2 + t4/2
        sub             v16.8h, v1.8h, v2.8h    // t7/2 = t2/2 - t4/2
        mla             v20.8h, v17.8h, v0.h[1] // -t2  = - 15 * src[8] +  4 * src[24] + 16 * src[40] +  9 * src[56]
        add             v21.8h, v1.8h, v2.8h    // t6/2 = t2/2 + t4/2
        add             v22.8h, v4.8h, v3.8h    // t5/2 = t1/2 + t3/2
        mls             v19.8h, v17.8h, v0.h[2] // -t3  = -  9 * src[8] + 16 * src[24] -  4 * src[40] - 15 * src[56]
        sub             v17.8h, v4.8h, v3.8h    // t8/2 = t1/2 - t3/2
        add             v23.8h, v4.8h, v3.8h    // t5/2 = t1/2 + t3/2
        mls             v18.8h, v6.8h, v0.h[2]  // -t4  = -  4 * src[8] +  9 * src[24] - 15 * src[40] + 16 * src[56]
        sub             v1.8h, v1.8h, v2.8h     // t7/2 = t2/2 - t4/2
        sub             v2.8h, v4.8h, v3.8h     // t8/2 = t1/2 - t3/2
        neg             v3.8h, v7.8h            // -t1
        neg             v4.8h, v20.8h           // +t2
        neg             v6.8h, v19.8h           // +t3
        ssra            v22.8h, v7.8h, #1       // (t5 + t1) >> 1
        ssra            v1.8h, v19.8h, #1       // (t7 - t3) >> 1
        neg             v7.8h, v18.8h           // +t4
        ssra            v5.8h, v4.8h, #1        // (t6 + t2) >> 1
        ssra            v16.8h, v6.8h, #1       // (t7 + t3) >> 1
        ssra            v2.8h, v18.8h, #1       // (t8 - t4) >> 1
        ssra            v17.8h, v7.8h, #1       // (t8 + t4) >> 1
        ssra            v21.8h, v20.8h, #1      // (t6 - t2) >> 1
        ssra            v23.8h, v3.8h, #1       // (t5 - t1) >> 1
        srshr           v3.8h, v22.8h, #2       // (t5 + t1 + 4) >> 3
        srshr           v4.8h, v5.8h, #2        // (t6 + t2 + 4) >> 3
        srshr           v5.8h, v16.8h, #2       // (t7 + t3 + 4) >> 3
        srshr           v6.8h, v17.8h, #2       // (t8 + t4 + 4) >> 3
        srshr           v2.8h, v2.8h, #2        // (t8 - t4 + 4) >> 3
        srshr           v1.8h, v1.8h, #2        // (t7 - t3 + 4) >> 3
        srshr           v7.8h, v21.8h, #2       // (t6 - t2 + 4) >> 3
        srshr           v16.8h, v23.8h, #2      // (t5 - t1 + 4) >> 3
        trn2            v17.8h, v3.8h, v4.8h
        trn2            v18.8h, v5.8h, v6.8h
        trn2            v19.8h, v2.8h, v1.8h
        trn2            v20.8h, v7.8h, v16.8h
        trn1            v21.4s, v17.4s, v18.4s
        trn2            v17.4s, v17.4s, v18.4s
        trn1            v18.4s, v19.4s, v20.4s
        trn2            v19.4s, v19.4s, v20.4s
        trn1            v3.8h, v3.8h, v4.8h
        trn2            v4.2d, v21.2d, v18.2d
        trn1            v20.2d, v17.2d, v19.2d
        trn1            v5.8h, v5.8h, v6.8h
        trn1            v1.8h, v2.8h, v1.8h
        trn1            v2.8h, v7.8h, v16.8h
        trn1            v6.2d, v21.2d, v18.2d
        trn2            v7.2d, v17.2d, v19.2d
        shl             v16.8h, v20.8h, #4      //                        16 * src[24]
        shl             v17.8h, v4.8h, #4       //                                       16 * src[40]
        trn1            v18.4s, v3.4s, v5.4s
        trn1            v19.4s, v1.4s, v2.4s
        shl             v21.8h, v7.8h, #4       //                                                      16 * src[56]
        shl             v22.8h, v6.8h, #2       //           4 * src[8]
        shl             v23.8h, v4.8h, #2       //                                        4 * src[40]
        trn2            v3.4s, v3.4s, v5.4s
        trn2            v1.4s, v1.4s, v2.4s
        shl             v2.8h, v6.8h, #4        //          16 * src[8]
        sub             v5.8h, v16.8h, v23.8h   //                        16 * src[24] -  4 * src[40]
        ssra            v17.8h, v16.8h, #2      //                         4 * src[24] + 16 * src[40]
        sub             v16.8h, v21.8h, v22.8h  //        -  4 * src[8]                               + 16 * src[56]
        trn1            v22.2d, v18.2d, v19.2d
        trn2            v18.2d, v18.2d, v19.2d
        trn1            v19.2d, v3.2d, v1.2d
        ssra            v2.8h, v21.8h, #2       //          16 * src[8]                               +  4 * src[56]
        mls             v17.8h, v6.8h, v0.h[2]  //        - 15 * src[8] +  4 * src[24] + 16 * src[40]
        shl             v21.8h, v22.8h, #2      //         8/2 * src[0]
        shl             v18.8h, v18.8h, #2      //                                      8/2 * src[32]
        mls             v5.8h, v6.8h, v0.h[1]   //        -  9 * src[8] + 16 * src[24] -  4 * src[40]
        shl             v6.8h, v19.8h, #3       //                      16/2 * src[16]
        trn2            v1.2d, v3.2d, v1.2d
        mla             v16.8h, v20.8h, v0.h[1] //        -  4 * src[8] +  9 * src[24]                + 16 * src[56]
        ssra            v21.8h, v21.8h, #1      //        12/2 * src[0]
        ssra            v18.8h, v18.8h, #1      //                                     12/2 * src[32]
        mul             v3.8h, v19.8h, v0.h[0]  //                       6/2 * src[16]
        shl             v19.8h, v1.8h, #3       //                                                    16/2 * src[48]
        mla             v2.8h, v20.8h, v0.h[2]  //          16 * src[8] + 15 * src[24]                +  4 * src[56]
        add             v20.8h, v21.8h, v18.8h  // t1/2 = 12/2 * src[0]              + 12/2 * src[32]
        mla             v6.8h, v1.8h, v0.h[0]   // t3/2 =               16/2 * src[16]              +  6/2 * src[48]
        sub             v1.8h, v21.8h, v18.8h   // t2/2 = 12/2 * src[0]              - 12/2 * src[32]
        sub             v3.8h, v3.8h, v19.8h    // t4/2 =                6/2 * src[16]              - 16/2 * src[48]
        mla             v17.8h, v7.8h, v0.h[1]  // -t2  = - 15 * src[8] +  4 * src[24] + 16 * src[40] +  9 * src[56]
        mls             v5.8h, v7.8h, v0.h[2]   // -t3  = -  9 * src[8] + 16 * src[24] -  4 * src[40] - 15 * src[56]
        add             v7.8h, v1.8h, v3.8h     // t6/2 = t2/2 + t4/2
        add             v18.8h, v20.8h, v6.8h   // t5/2 = t1/2 + t3/2
        mls             v16.8h, v4.8h, v0.h[2]  // -t4  = -  4 * src[8] +  9 * src[24] - 15 * src[40] + 16 * src[56]
        sub             v19.8h, v1.8h, v3.8h    // t7/2 = t2/2 - t4/2
        neg             v21.8h, v17.8h          // +t2
        mla             v2.8h, v4.8h, v0.h[1]   //  t1  =   16 * src[8] + 15 * src[24] +  9 * src[40] +  4 * src[56]
        sub             v0.8h, v20.8h, v6.8h    // t8/2 = t1/2 - t3/2
        neg             v4.8h, v5.8h            // +t3
        sub             v22.8h, v1.8h, v3.8h    // t7/2 = t2/2 - t4/2
        sub             v23.8h, v20.8h, v6.8h   // t8/2 = t1/2 - t3/2
        neg             v24.8h, v16.8h          // +t4
        add             v6.8h, v20.8h, v6.8h    // t5/2 = t1/2 + t3/2
        add             v1.8h, v1.8h, v3.8h     // t6/2 = t2/2 + t4/2
        ssra            v7.8h, v21.8h, #1       // (t6 + t2) >> 1
        neg             v3.8h, v2.8h            // -t1
        ssra            v18.8h, v2.8h, #1       // (t5 + t1) >> 1
        ssra            v19.8h, v4.8h, #1       // (t7 + t3) >> 1
        ssra            v0.8h, v24.8h, #1       // (t8 + t4) >> 1
        srsra           v23.8h, v16.8h, #1      // (t8 - t4 + 1) >> 1
        srsra           v22.8h, v5.8h, #1       // (t7 - t3 + 1) >> 1
        srsra           v1.8h, v17.8h, #1       // (t6 - t2 + 1) >> 1
        srsra           v6.8h, v3.8h, #1        // (t5 - t1 + 1) >> 1
        srshr           v2.8h, v18.8h, #6       // (t5 + t1 + 64) >> 7
        srshr           v3.8h, v7.8h, #6        // (t6 + t2 + 64) >> 7
        srshr           v4.8h, v19.8h, #6       // (t7 + t3 + 64) >> 7
        srshr           v5.8h, v0.8h, #6        // (t8 + t4 + 64) >> 7
        srshr           v16.8h, v23.8h, #6      // (t8 - t4 + 65) >> 7
        srshr           v17.8h, v22.8h, #6      // (t7 - t3 + 65) >> 7
        st1             {v2.16b, v3.16b}, [x1], #32
        srshr           v0.8h, v1.8h, #6        // (t6 - t2 + 65) >> 7
        srshr           v1.8h, v6.8h, #6        // (t5 - t1 + 65) >> 7
        st1             {v4.16b, v5.16b}, [x1], #32
        st1             {v16.16b, v17.16b}, [x1], #32
        st1             {v0.16b, v1.16b}, [x1]
        ret
endfunc

// VC-1 8x4 inverse transform
// On entry:
//   x0 -> array of 8-bit samples, in row-major order
//   x1 = row stride for 8-bit sample array
//   x2 -> array of 16-bit inverse transform coefficients, in row-major order
// On exit:
//   array at x0 updated by saturated addition of (narrowed) transformed block
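//
// Equivalent C prototype (a sketch matching the vc1dsp function-pointer type,
// assuming the ptrdiff_t stride used by current FFmpeg):
//   void ff_vc1_inv_trans_8x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
// The rows get the 8-point kernel described above the 8x8 function, rounded as
// (x + 4) >> 3; the columns then get the 4-point kernel, reconstructed here
// from the annotations below (stride-8 index naming as in those annotations):
//   t1 = 17 * (src[ 0] + src[16]);  t2 = 17 * (src[ 0] - src[16]);
//   t3 = 22 * src[ 8] + 10 * src[24];
//   t4 = 22 * src[24] - 10 * src[ 8];
//   dst[ 0] = (t1 + t3 + 64) >> 7;  dst[ 8] = (t2 - t4 + 64) >> 7;
//   dst[16] = (t2 + t4 + 64) >> 7;  dst[24] = (t1 - t3 + 64) >> 7;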
function ff_vc1_inv_trans_8x4_neon, export=1
        ld1             {v1.8b, v2.8b, v3.8b, v4.8b}, [x2], #32
        mov             x3, x0
        ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [x2]
        ldr             q0, .Lcoeffs_it8        // includes 4-point coefficients in upper half of vector
        ld1             {v5.8b}, [x0], x1
        trn2            v6.4h, v1.4h, v3.4h
        trn2            v7.4h, v2.4h, v4.4h
        trn1            v1.4h, v1.4h, v3.4h
        trn1            v2.4h, v2.4h, v4.4h
        trn2            v3.4h, v16.4h, v18.4h
        trn2            v4.4h, v17.4h, v19.4h
        trn1            v16.4h, v16.4h, v18.4h
        trn1            v17.4h, v17.4h, v19.4h
        ld1             {v18.8b}, [x0], x1
        trn1            v19.2s, v6.2s, v3.2s
        trn2            v3.2s, v6.2s, v3.2s
        trn1            v6.2s, v7.2s, v4.2s
        trn2            v4.2s, v7.2s, v4.2s
        trn1            v7.2s, v1.2s, v16.2s
        trn1            v20.2s, v2.2s, v17.2s
        shl             v21.4h, v19.4h, #4      //          16 * src[1]
        trn2            v1.2s, v1.2s, v16.2s
        shl             v16.4h, v3.4h, #4       //                        16 * src[3]
        trn2            v2.2s, v2.2s, v17.2s
        shl             v17.4h, v6.4h, #4       //                                      16 * src[5]
        ld1             {v22.8b}, [x0], x1
        shl             v23.4h, v4.4h, #4       //                                                    16 * src[7]
        mul             v24.4h, v1.4h, v0.h[0]  //                       6/2 * src[2]
        ld1             {v25.8b}, [x0]
        shl             v26.4h, v19.4h, #2      //           4 * src[1]
        shl             v27.4h, v6.4h, #2       //                                       4 * src[5]
        ssra            v21.4h, v23.4h, #2      //          16 * src[1]                             +  4 * src[7]
        ssra            v17.4h, v16.4h, #2      //                         4 * src[3] + 16 * src[5]
        sub             v23.4h, v23.4h, v26.4h  //        -  4 * src[1]                             + 16 * src[7]
        sub             v16.4h, v16.4h, v27.4h  //                        16 * src[3] -  4 * src[5]
        shl             v7.4h, v7.4h, #2        //         8/2 * src[0]
        shl             v20.4h, v20.4h, #2      //                                     8/2 * src[4]
        mla             v21.4h, v3.4h, v0.h[2]  //          16 * src[1] + 15 * src[3]               +  4 * src[7]
        shl             v1.4h, v1.4h, #3        //                      16/2 * src[2]
        mls             v17.4h, v19.4h, v0.h[2] //        - 15 * src[1] +  4 * src[3] + 16 * src[5]
        ssra            v7.4h, v7.4h, #1        //        12/2 * src[0]
        mls             v16.4h, v19.4h, v0.h[1] //        -  9 * src[1] + 16 * src[3] -  4 * src[5]
        ssra            v20.4h, v20.4h, #1      //                                    12/2 * src[4]
        mla             v23.4h, v3.4h, v0.h[1]  //        -  4 * src[1] +  9 * src[3]               + 16 * src[7]
        shl             v3.4h, v2.4h, #3        //                                                  16/2 * src[6]
        mla             v1.4h, v2.4h, v0.h[0]   // t3/2 =               16/2 * src[2]             +  6/2 * src[6]
        mla             v21.4h, v6.4h, v0.h[1]  //  t1  =   16 * src[1] + 15 * src[3] +  9 * src[5] +  4 * src[7]
        mla             v17.4h, v4.4h, v0.h[1]  // -t2  = - 15 * src[1] +  4 * src[3] + 16 * src[5] +  9 * src[7]
        sub             v2.4h, v24.4h, v3.4h    // t4/2 =                6/2 * src[2]             - 16/2 * src[6]
        mls             v16.4h, v4.4h, v0.h[2]  // -t3  = -  9 * src[1] + 16 * src[3] -  4 * src[5] - 15 * src[7]
        add             v3.4h, v7.4h, v20.4h    // t1/2 = 12/2 * src[0]             + 12/2 * src[4]
        mls             v23.4h, v6.4h, v0.h[2]  // -t4  = -  4 * src[1] +  9 * src[3] - 15 * src[5] + 16 * src[7]
        sub             v4.4h, v7.4h, v20.4h    // t2/2 = 12/2 * src[0]             - 12/2 * src[4]
        neg             v6.4h, v21.4h           // -t1
        add             v7.4h, v3.4h, v1.4h     // t5/2 = t1/2 + t3/2
        sub             v19.4h, v3.4h, v1.4h    // t8/2 = t1/2 - t3/2
        add             v20.4h, v4.4h, v2.4h    // t6/2 = t2/2 + t4/2
        sub             v24.4h, v4.4h, v2.4h    // t7/2 = t2/2 - t4/2
        add             v26.4h, v3.4h, v1.4h    // t5/2 = t1/2 + t3/2
        add             v27.4h, v4.4h, v2.4h    // t6/2 = t2/2 + t4/2
        sub             v2.4h, v4.4h, v2.4h     // t7/2 = t2/2 - t4/2
        sub             v1.4h, v3.4h, v1.4h     // t8/2 = t1/2 - t3/2
        neg             v3.4h, v17.4h           // +t2
        neg             v4.4h, v16.4h           // +t3
        neg             v28.4h, v23.4h          // +t4
        ssra            v7.4h, v21.4h, #1       // (t5 + t1) >> 1
        ssra            v1.4h, v23.4h, #1       // (t8 - t4) >> 1
        ssra            v20.4h, v3.4h, #1       // (t6 + t2) >> 1
        ssra            v24.4h, v4.4h, #1       // (t7 + t3) >> 1
        ssra            v19.4h, v28.4h, #1      // (t8 + t4) >> 1
        ssra            v2.4h, v16.4h, #1       // (t7 - t3) >> 1
        ssra            v27.4h, v17.4h, #1      // (t6 - t2) >> 1
        ssra            v26.4h, v6.4h, #1       // (t5 - t1) >> 1
        trn1            v1.2d, v7.2d, v1.2d
        trn1            v2.2d, v20.2d, v2.2d
        trn1            v3.2d, v24.2d, v27.2d
        trn1            v4.2d, v19.2d, v26.2d
        srshr           v1.8h, v1.8h, #2        // (t5 + t1 + 4) >> 3, (t8 - t4 + 4) >> 3
        srshr           v2.8h, v2.8h, #2        // (t6 + t2 + 4) >> 3, (t7 - t3 + 4) >> 3
        srshr           v3.8h, v3.8h, #2        // (t7 + t3 + 4) >> 3, (t6 - t2 + 4) >> 3
        srshr           v4.8h, v4.8h, #2        // (t8 + t4 + 4) >> 3, (t5 - t1 + 4) >> 3
        trn2            v6.8h, v1.8h, v2.8h
        trn1            v1.8h, v1.8h, v2.8h
        trn2            v2.8h, v3.8h, v4.8h
        trn1            v3.8h, v3.8h, v4.8h
        trn2            v4.4s, v6.4s, v2.4s
        trn1            v7.4s, v1.4s, v3.4s
        trn2            v1.4s, v1.4s, v3.4s
        mul             v3.8h, v4.8h, v0.h[5]   //                                                           22/2 * src[24]
        trn1            v2.4s, v6.4s, v2.4s
        mul             v4.8h, v4.8h, v0.h[4]   //                                                           10/2 * src[24]
        mul             v6.8h, v7.8h, v0.h[6]   //            17 * src[0]
        mul             v1.8h, v1.8h, v0.h[6]   //                                            17 * src[16]
        mls             v3.8h, v2.8h, v0.h[4]   //  t4/2 =                - 10/2 * src[8]                  + 22/2 * src[24]
        mla             v4.8h, v2.8h, v0.h[5]   //  t3/2 =                  22/2 * src[8]                  + 10/2 * src[24]
        add             v0.8h, v6.8h, v1.8h     //   t1  =    17 * src[0]                 +   17 * src[16]
        sub             v1.8h, v6.8h, v1.8h     //   t2  =    17 * src[0]                 -   17 * src[16]
        neg             v2.8h, v3.8h            // -t4/2
        neg             v6.8h, v4.8h            // -t3/2
        ssra            v4.8h, v0.8h, #1        // (t1 + t3) >> 1
        ssra            v2.8h, v1.8h, #1        // (t2 - t4) >> 1
        ssra            v3.8h, v1.8h, #1        // (t2 + t4) >> 1
        ssra            v6.8h, v0.8h, #1        // (t1 - t3) >> 1
        srshr           v0.8h, v4.8h, #6        // (t1 + t3 + 64) >> 7
        srshr           v1.8h, v2.8h, #6        // (t2 - t4 + 64) >> 7
        srshr           v2.8h, v3.8h, #6        // (t2 + t4 + 64) >> 7
        srshr           v3.8h, v6.8h, #6        // (t1 - t3 + 64) >> 7
        uaddw           v0.8h, v0.8h, v5.8b
        uaddw           v1.8h, v1.8h, v18.8b
        uaddw           v2.8h, v2.8h, v22.8b
        uaddw           v3.8h, v3.8h, v25.8b
        sqxtun          v0.8b, v0.8h
        sqxtun          v1.8b, v1.8h
        sqxtun          v2.8b, v2.8h
        sqxtun          v3.8b, v3.8h
        st1             {v0.8b}, [x3], x1
        st1             {v1.8b}, [x3], x1
        st1             {v2.8b}, [x3], x1
        st1             {v3.8b}, [x3]
        ret
endfunc

// VC-1 4x8 inverse transform
// On entry:
//   x0 -> array of 8-bit samples, in row-major order
//   x1 = row stride for 8-bit sample array
//   x2 -> array of 16-bit inverse transform coefficients, in row-major order (row stride is 8 coefficients)
// On exit:
//   array at x0 updated by saturated addition of (narrowed) transformed block
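//
// Equivalent C prototype (a sketch matching the vc1dsp function-pointer type):
//   void ff_vc1_inv_trans_4x8_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
// This is the transpose of the 8x4 case: the 4-point kernel runs on the rows,
// rounded as (x + 4) >> 3, then the 8-point kernel runs on the columns,
// rounded as (x + 64) >> 7 with the extra +1 on the difference outputs.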
function ff_vc1_inv_trans_4x8_neon, export=1
        mov             x3, #16
        ldr             q0, .Lcoeffs_it8        // includes 4-point coefficients in upper half of vector
        mov             x4, x0
        ld1             {v1.d}[0], [x2], x3     // 00 01 02 03
        ld1             {v2.d}[0], [x2], x3     // 10 11 12 13
        ld1             {v3.d}[0], [x2], x3     // 20 21 22 23
        ld1             {v4.d}[0], [x2], x3     // 30 31 32 33
        ld1             {v1.d}[1], [x2], x3     // 40 41 42 43
        ld1             {v2.d}[1], [x2], x3     // 50 51 52 53
        ld1             {v3.d}[1], [x2], x3     // 60 61 62 63
        ld1             {v4.d}[1], [x2]         // 70 71 72 73
        ld1             {v5.s}[0], [x0], x1
        ld1             {v6.s}[0], [x0], x1
        ld1             {v7.s}[0], [x0], x1
        trn2            v16.8h, v1.8h, v2.8h    // 01 11 03 13 41 51 43 53
        trn1            v1.8h, v1.8h, v2.8h     // 00 10 02 12 40 50 42 52
        trn2            v2.8h, v3.8h, v4.8h     // 21 31 23 33 61 71 63 73
        trn1            v3.8h, v3.8h, v4.8h     // 20 30 22 32 60 70 62 72
        ld1             {v4.s}[0], [x0], x1
        trn2            v17.4s, v16.4s, v2.4s   // 03 13 23 33 43 53 63 73
        trn1            v18.4s, v1.4s, v3.4s    // 00 10 20 30 40 50 60 70
        trn1            v2.4s, v16.4s, v2.4s    // 01 11 21 31 41 51 61 71
        mul             v16.8h, v17.8h, v0.h[4] //                                                          10/2 * src[3]
        ld1             {v5.s}[1], [x0], x1
        mul             v17.8h, v17.8h, v0.h[5] //                                                          22/2 * src[3]
        ld1             {v6.s}[1], [x0], x1
        trn2            v1.4s, v1.4s, v3.4s     // 02 12 22 32 42 52 62 72
        mul             v3.8h, v18.8h, v0.h[6]  //            17 * src[0]
        ld1             {v7.s}[1], [x0], x1
        mul             v1.8h, v1.8h, v0.h[6]   //                                            17 * src[2]
        ld1             {v4.s}[1], [x0]
        mla             v16.8h, v2.8h, v0.h[5]  //  t3/2 =                  22/2 * src[1]                 + 10/2 * src[3]
        mls             v17.8h, v2.8h, v0.h[4]  //  t4/2 =                - 10/2 * src[1]                 + 22/2 * src[3]
        add             v2.8h, v3.8h, v1.8h     //   t1  =    17 * src[0]                 +   17 * src[2]
        sub             v1.8h, v3.8h, v1.8h     //   t2  =    17 * src[0]                 -   17 * src[2]
        neg             v3.8h, v16.8h           // -t3/2
        ssra            v16.8h, v2.8h, #1       // (t1 + t3) >> 1
        neg             v18.8h, v17.8h          // -t4/2
        ssra            v17.8h, v1.8h, #1       // (t2 + t4) >> 1
        ssra            v3.8h, v2.8h, #1        // (t1 - t3) >> 1
        ssra            v18.8h, v1.8h, #1       // (t2 - t4) >> 1
        srshr           v1.8h, v16.8h, #2       // (t1 + t3 + 4) >> 3
        srshr           v2.8h, v17.8h, #2       // (t2 + t4 + 4) >> 3
        srshr           v3.8h, v3.8h, #2        // (t1 - t3 + 4) >> 3
        srshr           v16.8h, v18.8h, #2      // (t2 - t4 + 4) >> 3
        trn2            v17.8h, v2.8h, v3.8h    // 12 13 32 33 52 53 72 73
        trn2            v18.8h, v1.8h, v16.8h   // 10 11 30 31 50 51 70 71
        trn1            v1.8h, v1.8h, v16.8h    // 00 01 20 21 40 41 60 61
        trn1            v2.8h, v2.8h, v3.8h     // 02 03 22 23 42 43 62 63
        trn1            v3.4s, v18.4s, v17.4s   // 10 11 12 13 50 51 52 53
        trn2            v16.4s, v18.4s, v17.4s  // 30 31 32 33 70 71 72 73
        trn1            v17.4s, v1.4s, v2.4s    // 00 01 02 03 40 41 42 43
        mov             d18, v3.d[1]            // 50 51 52 53
        shl             v19.4h, v3.4h, #4       //          16 * src[8]
        mov             d20, v16.d[1]           // 70 71 72 73
        shl             v21.4h, v16.4h, #4      //                        16 * src[24]
        mov             d22, v17.d[1]           // 40 41 42 43
        shl             v23.4h, v3.4h, #2       //           4 * src[8]
        shl             v24.4h, v18.4h, #4      //                                       16 * src[40]
        shl             v25.4h, v20.4h, #4      //                                                      16 * src[56]
        shl             v26.4h, v18.4h, #2      //                                        4 * src[40]
        trn2            v1.4s, v1.4s, v2.4s     // 20 21 22 23 60 61 62 63
        ssra            v24.4h, v21.4h, #2      //                         4 * src[24] + 16 * src[40]
        sub             v2.4h, v25.4h, v23.4h   //        -  4 * src[8]                               + 16 * src[56]
        shl             v17.4h, v17.4h, #2      //         8/2 * src[0]
        sub             v21.4h, v21.4h, v26.4h  //                        16 * src[24] -  4 * src[40]
        shl             v22.4h, v22.4h, #2      //                                      8/2 * src[32]
        mov             d23, v1.d[1]            // 60 61 62 63
        ssra            v19.4h, v25.4h, #2      //          16 * src[8]                               +  4 * src[56]
        mul             v25.4h, v1.4h, v0.h[0]  //                       6/2 * src[16]
        shl             v1.4h, v1.4h, #3        //                      16/2 * src[16]
        mls             v24.4h, v3.4h, v0.h[2]  //        - 15 * src[8] +  4 * src[24] + 16 * src[40]
        ssra            v17.4h, v17.4h, #1      //        12/2 * src[0]
        mls             v21.4h, v3.4h, v0.h[1]  //        -  9 * src[8] + 16 * src[24] -  4 * src[40]
        ssra            v22.4h, v22.4h, #1      //                                     12/2 * src[32]
        mla             v2.4h, v16.4h, v0.h[1]  //        -  4 * src[8] +  9 * src[24]                + 16 * src[56]
        shl             v3.4h, v23.4h, #3       //                                                    16/2 * src[48]
        mla             v19.4h, v16.4h, v0.h[2] //          16 * src[8] + 15 * src[24]                +  4 * src[56]
        mla             v1.4h, v23.4h, v0.h[0]  // t3/2 =               16/2 * src[16]              +  6/2 * src[48]
        mla             v24.4h, v20.4h, v0.h[1] // -t2  = - 15 * src[8] +  4 * src[24] + 16 * src[40] +  9 * src[56]
        add             v16.4h, v17.4h, v22.4h  // t1/2 = 12/2 * src[0]              + 12/2 * src[32]
        sub             v3.4h, v25.4h, v3.4h    // t4/2 =                6/2 * src[16]              - 16/2 * src[48]
        sub             v17.4h, v17.4h, v22.4h  // t2/2 = 12/2 * src[0]              - 12/2 * src[32]
        mls             v21.4h, v20.4h, v0.h[2] // -t3  = -  9 * src[8] + 16 * src[24] -  4 * src[40] - 15 * src[56]
        mla             v19.4h, v18.4h, v0.h[1] //  t1  =   16 * src[8] + 15 * src[24] +  9 * src[40] +  4 * src[56]
        add             v20.4h, v16.4h, v1.4h   // t5/2 = t1/2 + t3/2
        mls             v2.4h, v18.4h, v0.h[2]  // -t4  = -  4 * src[8] +  9 * src[24] - 15 * src[40] + 16 * src[56]
        sub             v0.4h, v16.4h, v1.4h    // t8/2 = t1/2 - t3/2
        add             v18.4h, v17.4h, v3.4h   // t6/2 = t2/2 + t4/2
        sub             v22.4h, v17.4h, v3.4h   // t7/2 = t2/2 - t4/2
        neg             v23.4h, v24.4h          // +t2
        sub             v25.4h, v17.4h, v3.4h   // t7/2 = t2/2 - t4/2
        add             v3.4h, v17.4h, v3.4h    // t6/2 = t2/2 + t4/2
        neg             v17.4h, v21.4h          // +t3
        sub             v26.4h, v16.4h, v1.4h   // t8/2 = t1/2 - t3/2
        add             v1.4h, v16.4h, v1.4h    // t5/2 = t1/2 + t3/2
        neg             v16.4h, v19.4h          // -t1
        neg             v27.4h, v2.4h           // +t4
        ssra            v20.4h, v19.4h, #1      // (t5 + t1) >> 1
        srsra           v0.4h, v2.4h, #1        // (t8 - t4 + 1) >> 1
        ssra            v18.4h, v23.4h, #1      // (t6 + t2) >> 1
        srsra           v22.4h, v21.4h, #1      // (t7 - t3 + 1) >> 1
        ssra            v25.4h, v17.4h, #1      // (t7 + t3) >> 1
        srsra           v3.4h, v24.4h, #1       // (t6 - t2 + 1) >> 1
        ssra            v26.4h, v27.4h, #1      // (t8 + t4) >> 1
        srsra           v1.4h, v16.4h, #1       // (t5 - t1 + 1) >> 1
        trn1            v0.2d, v20.2d, v0.2d
        trn1            v2.2d, v18.2d, v22.2d
        trn1            v3.2d, v25.2d, v3.2d
        trn1            v1.2d, v26.2d, v1.2d
        srshr           v0.8h, v0.8h, #6        // (t5 + t1 + 64) >> 7, (t8 - t4 + 65) >> 7
        srshr           v2.8h, v2.8h, #6        // (t6 + t2 + 64) >> 7, (t7 - t3 + 65) >> 7
        srshr           v3.8h, v3.8h, #6        // (t7 + t3 + 64) >> 7, (t6 - t2 + 65) >> 7
        srshr           v1.8h, v1.8h, #6        // (t8 + t4 + 64) >> 7, (t5 - t1 + 65) >> 7
        uaddw           v0.8h, v0.8h, v5.8b
        uaddw           v2.8h, v2.8h, v6.8b
        uaddw           v3.8h, v3.8h, v7.8b
        uaddw           v1.8h, v1.8h, v4.8b
        sqxtun          v0.8b, v0.8h
        sqxtun          v2.8b, v2.8h
        sqxtun          v3.8b, v3.8h
        sqxtun          v1.8b, v1.8h
        st1             {v0.s}[0], [x4], x1
        st1             {v2.s}[0], [x4], x1
        st1             {v3.s}[0], [x4], x1
        st1             {v1.s}[0], [x4], x1
        st1             {v0.s}[1], [x4], x1
        st1             {v2.s}[1], [x4], x1
        st1             {v3.s}[1], [x4], x1
        st1             {v1.s}[1], [x4]
        ret
endfunc

// VC-1 4x4 inverse transform
// On entry:
//   x0 -> array of 8-bit samples, in row-major order
//   x1 = row stride for 8-bit sample array
//   x2 -> array of 16-bit inverse transform coefficients, in row-major order (row stride is 8 coefficients)
// On exit:
//   array at x0 updated by saturated addition of (narrowed) transformed block
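//
// Equivalent C prototype (a sketch matching the vc1dsp function-pointer type):
//   void ff_vc1_inv_trans_4x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
// Both passes use the 4-point kernel shown above the 8x4 function: rows are
// rounded as (x + 4) >> 3, columns as (x + 64) >> 7.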
function ff_vc1_inv_trans_4x4_neon, export=1
        mov             x3, #16
        ldr             d0, .Lcoeffs_it4
        mov             x4, x0
        ld1             {v1.d}[0], [x2], x3     // 00 01 02 03
        ld1             {v2.d}[0], [x2], x3     // 10 11 12 13
        ld1             {v3.d}[0], [x2], x3     // 20 21 22 23
        ld1             {v4.d}[0], [x2]         // 30 31 32 33
        ld1             {v5.s}[0], [x0], x1
        ld1             {v5.s}[1], [x0], x1
        ld1             {v6.s}[0], [x0], x1
        trn2            v7.4h, v1.4h, v2.4h     // 01 11 03 13
        trn1            v1.4h, v1.4h, v2.4h     // 00 10 02 12
        ld1             {v6.s}[1], [x0]
        trn2            v2.4h, v3.4h, v4.4h     // 21 31 23 33
        trn1            v3.4h, v3.4h, v4.4h     // 20 30 22 32
        trn2            v4.2s, v7.2s, v2.2s     // 03 13 23 33
        trn1            v16.2s, v1.2s, v3.2s    // 00 10 20 30
        trn1            v2.2s, v7.2s, v2.2s     // 01 11 21 31
        trn2            v1.2s, v1.2s, v3.2s     // 02 12 22 32
        mul             v3.4h, v4.4h, v0.h[0]   //                                                          10/2 * src[3]
        mul             v4.4h, v4.4h, v0.h[1]   //                                                          22/2 * src[3]
        mul             v7.4h, v16.4h, v0.h[2]  //            17 * src[0]
        mul             v1.4h, v1.4h, v0.h[2]   //                                            17 * src[2]
        mla             v3.4h, v2.4h, v0.h[1]   //  t3/2 =                  22/2 * src[1]                 + 10/2 * src[3]
        mls             v4.4h, v2.4h, v0.h[0]   //  t4/2 =                - 10/2 * src[1]                 + 22/2 * src[3]
        add             v2.4h, v7.4h, v1.4h     //   t1  =    17 * src[0]                 +   17 * src[2]
        sub             v1.4h, v7.4h, v1.4h     //   t2  =    17 * src[0]                 -   17 * src[2]
        neg             v7.4h, v3.4h            // -t3/2
        neg             v16.4h, v4.4h           // -t4/2
        ssra            v3.4h, v2.4h, #1        // (t1 + t3) >> 1
        ssra            v4.4h, v1.4h, #1        // (t2 + t4) >> 1
        ssra            v16.4h, v1.4h, #1       // (t2 - t4) >> 1
        ssra            v7.4h, v2.4h, #1        // (t1 - t3) >> 1
        srshr           v1.4h, v3.4h, #2        // (t1 + t3 + 4) >> 3
        srshr           v2.4h, v4.4h, #2        // (t2 + t4 + 4) >> 3
        srshr           v3.4h, v16.4h, #2       // (t2 - t4 + 4) >> 3
        srshr           v4.4h, v7.4h, #2        // (t1 - t3 + 4) >> 3
        trn2            v7.4h, v1.4h, v3.4h     // 10 11 30 31
        trn1            v1.4h, v1.4h, v3.4h     // 00 01 20 21
        trn2            v3.4h, v2.4h, v4.4h     // 12 13 32 33
        trn1            v2.4h, v2.4h, v4.4h     // 02 03 22 23
        trn2            v4.2s, v7.2s, v3.2s     // 30 31 32 33
        trn1            v16.2s, v1.2s, v2.2s    // 00 01 02 03
        trn1            v3.2s, v7.2s, v3.2s     // 10 11 12 13
        trn2            v1.2s, v1.2s, v2.2s     // 20 21 22 23
        mul             v2.4h, v4.4h, v0.h[1]   //                                                           22/2 * src[24]
        mul             v4.4h, v4.4h, v0.h[0]   //                                                           10/2 * src[24]
        mul             v7.4h, v16.4h, v0.h[2]  //            17 * src[0]
        mul             v1.4h, v1.4h, v0.h[2]   //                                            17 * src[16]
        mls             v2.4h, v3.4h, v0.h[0]   //  t4/2 =                - 10/2 * src[8]                  + 22/2 * src[24]
        mla             v4.4h, v3.4h, v0.h[1]   //  t3/2 =                  22/2 * src[8]                  + 10/2 * src[24]
        add             v0.4h, v7.4h, v1.4h     //   t1  =    17 * src[0]                 +   17 * src[16]
        sub             v1.4h, v7.4h, v1.4h     //   t2  =    17 * src[0]                 -   17 * src[16]
        neg             v3.4h, v2.4h            // -t4/2
        neg             v7.4h, v4.4h            // -t3/2
        ssra            v4.4h, v0.4h, #1        // (t1 + t3) >> 1
        ssra            v3.4h, v1.4h, #1        // (t2 - t4) >> 1
        ssra            v2.4h, v1.4h, #1        // (t2 + t4) >> 1
        ssra            v7.4h, v0.4h, #1        // (t1 - t3) >> 1
        trn1            v0.2d, v4.2d, v3.2d
        trn1            v1.2d, v2.2d, v7.2d
        srshr           v0.8h, v0.8h, #6        // (t1 + t3 + 64) >> 7, (t2 - t4 + 64) >> 7
        srshr           v1.8h, v1.8h, #6        // (t2 + t4 + 64) >> 7, (t1 - t3 + 64) >> 7
        uaddw           v0.8h, v0.8h, v5.8b
        uaddw           v1.8h, v1.8h, v6.8b
        sqxtun          v0.8b, v0.8h
        sqxtun          v1.8b, v1.8h
        st1             {v0.s}[0], [x4], x1
        st1             {v0.s}[1], [x4], x1
        st1             {v1.s}[0], [x4], x1
        st1             {v1.s}[1], [x4]
        ret
endfunc

// VC-1 8x8 inverse transform, DC case
// On entry:
//   x0 -> array of 8-bit samples, in row-major order
//   x1 = row stride for 8-bit sample array
//   x2 -> 16-bit inverse transform DC coefficient
// On exit:
//   array at x0 updated by saturated addition of (narrowed) transformed block
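//
// Equivalent C prototype (a sketch matching the vc1dsp function-pointer type;
// the same shape applies to the other _dc variants below):
//   void ff_vc1_inv_trans_8x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
// Scalar equivalent of the DC scaling performed below, inferred from the
// arithmetic on w2/w0 (it mirrors the two-pass rounding of the full transform):
//   dc = (3 * dc +  1) >> 1;
//   dc = (3 * dc + 16) >> 5;
// after which dc is added to all 64 samples with saturation.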
function ff_vc1_inv_trans_8x8_dc_neon, export=1
        ldrsh           w2, [x2]
        mov             x3, x0
        ld1             {v0.8b}, [x0], x1
        ld1             {v1.8b}, [x0], x1
        ld1             {v2.8b}, [x0], x1
        add             w2, w2, w2, lsl #1
        ld1             {v3.8b}, [x0], x1
        ld1             {v4.8b}, [x0], x1
        add             w2, w2, #1
        ld1             {v5.8b}, [x0], x1
        asr             w2, w2, #1
        ld1             {v6.8b}, [x0], x1
        add             w2, w2, w2, lsl #1
        ld1             {v7.8b}, [x0]
        add             w0, w2, #16
        asr             w0, w0, #5
        dup             v16.8h, w0
        uaddw           v0.8h, v16.8h, v0.8b
        uaddw           v1.8h, v16.8h, v1.8b
        uaddw           v2.8h, v16.8h, v2.8b
        uaddw           v3.8h, v16.8h, v3.8b
        uaddw           v4.8h, v16.8h, v4.8b
        uaddw           v5.8h, v16.8h, v5.8b
        sqxtun          v0.8b, v0.8h
        uaddw           v6.8h, v16.8h, v6.8b
        sqxtun          v1.8b, v1.8h
        uaddw           v7.8h, v16.8h, v7.8b
        sqxtun          v2.8b, v2.8h
        sqxtun          v3.8b, v3.8h
        sqxtun          v4.8b, v4.8h
        st1             {v0.8b}, [x3], x1
        sqxtun          v0.8b, v5.8h
        st1             {v1.8b}, [x3], x1
        sqxtun          v1.8b, v6.8h
        st1             {v2.8b}, [x3], x1
        sqxtun          v2.8b, v7.8h
        st1             {v3.8b}, [x3], x1
        st1             {v4.8b}, [x3], x1
        st1             {v0.8b}, [x3], x1
        st1             {v1.8b}, [x3], x1
        st1             {v2.8b}, [x3]
        ret
endfunc

// VC-1 8x4 inverse transform, DC case
// On entry:
//   x0 -> array of 8-bit samples, in row-major order
//   x1 = row stride for 8-bit sample array
//   x2 -> 16-bit inverse transform DC coefficient
// On exit:
//   array at x0 updated by saturated addition of (narrowed) transformed block
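//
// Scalar equivalent of the DC scaling performed below (a sketch inferred from
// the code):
//   dc = ( 3 * dc +  1) >> 1;
//   dc = (17 * dc + 64) >> 7;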
function ff_vc1_inv_trans_8x4_dc_neon, export=1
        ldrsh           w2, [x2]
        mov             x3, x0
        ld1             {v0.8b}, [x0], x1
        ld1             {v1.8b}, [x0], x1
        ld1             {v2.8b}, [x0], x1
        add             w2, w2, w2, lsl #1
        ld1             {v3.8b}, [x0]
        add             w0, w2, #1
        asr             w0, w0, #1
        add             w0, w0, w0, lsl #4
        add             w0, w0, #64
        asr             w0, w0, #7
        dup             v4.8h, w0
        uaddw           v0.8h, v4.8h, v0.8b
        uaddw           v1.8h, v4.8h, v1.8b
        uaddw           v2.8h, v4.8h, v2.8b
        uaddw           v3.8h, v4.8h, v3.8b
        sqxtun          v0.8b, v0.8h
        sqxtun          v1.8b, v1.8h
        sqxtun          v2.8b, v2.8h
        sqxtun          v3.8b, v3.8h
        st1             {v0.8b}, [x3], x1
        st1             {v1.8b}, [x3], x1
        st1             {v2.8b}, [x3], x1
        st1             {v3.8b}, [x3]
        ret
endfunc

// VC-1 4x8 inverse transform, DC case
// On entry:
//   x0 -> array of 8-bit samples, in row-major order
//   x1 = row stride for 8-bit sample array
//   x2 -> 16-bit inverse transform DC coefficient
// On exit:
//   array at x0 updated by saturated addition of (narrowed) transformed block
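//
// Scalar equivalent of the DC scaling performed below (a sketch inferred from
// the code; the second line is the same as (12 * dc + 64) >> 7):
//   dc = (17 * dc +  4) >> 3;
//   dc = ( 3 * dc + 16) >> 5;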
function ff_vc1_inv_trans_4x8_dc_neon, export=1
        ldrsh           w2, [x2]
        mov             x3, x0
        ld1             {v0.s}[0], [x0], x1
        ld1             {v1.s}[0], [x0], x1
        ld1             {v2.s}[0], [x0], x1
        add             w2, w2, w2, lsl #4
        ld1             {v3.s}[0], [x0], x1
        add             w2, w2, #4
        asr             w2, w2, #3
        add             w2, w2, w2, lsl #1
        ld1             {v0.s}[1], [x0], x1
        add             w2, w2, #16
        asr             w2, w2, #5
        dup             v4.8h, w2
        ld1             {v1.s}[1], [x0], x1
        ld1             {v2.s}[1], [x0], x1
        ld1             {v3.s}[1], [x0]
        uaddw           v0.8h, v4.8h, v0.8b
        uaddw           v1.8h, v4.8h, v1.8b
        uaddw           v2.8h, v4.8h, v2.8b
        uaddw           v3.8h, v4.8h, v3.8b
        sqxtun          v0.8b, v0.8h
        sqxtun          v1.8b, v1.8h
        sqxtun          v2.8b, v2.8h
        sqxtun          v3.8b, v3.8h
        st1             {v0.s}[0], [x3], x1
        st1             {v1.s}[0], [x3], x1
        st1             {v2.s}[0], [x3], x1
        st1             {v3.s}[0], [x3], x1
        st1             {v0.s}[1], [x3], x1
        st1             {v1.s}[1], [x3], x1
        st1             {v2.s}[1], [x3], x1
        st1             {v3.s}[1], [x3]
        ret
endfunc

// VC-1 4x4 inverse transform, DC case
// On entry:
//   x0 -> array of 8-bit samples, in row-major order
//   x1 = row stride for 8-bit sample array
//   x2 -> 16-bit inverse transform DC coefficient
// On exit:
//   array at x0 updated by saturated addition of (narrowed) transformed block
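//
// Scalar equivalent of the DC scaling performed below (a sketch inferred from
// the code):
//   dc = (17 * dc +  4) >> 3;
//   dc = (17 * dc + 64) >> 7;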
function ff_vc1_inv_trans_4x4_dc_neon, export=1
        ldrsh           w2, [x2]
        mov             x3, x0
        ld1             {v0.s}[0], [x0], x1
        ld1             {v1.s}[0], [x0], x1
        ld1             {v0.s}[1], [x0], x1
        add             w2, w2, w2, lsl #4
        ld1             {v1.s}[1], [x0]
        add             w0, w2, #4
        asr             w0, w0, #3
        add             w0, w0, w0, lsl #4
        add             w0, w0, #64
        asr             w0, w0, #7
        dup             v2.8h, w0
        uaddw           v0.8h, v2.8h, v0.8b
        uaddw           v1.8h, v2.8h, v1.8b
        sqxtun          v0.8b, v0.8h
        sqxtun          v1.8b, v1.8h
        st1             {v0.s}[0], [x3], x1
        st1             {v1.s}[0], [x3], x1
        st1             {v0.s}[1], [x3], x1
        st1             {v1.s}[1], [x3]
        ret
endfunc

.align  5
.Lcoeffs_it8:
.quad   0x000F00090003          // h[0..2] = 3 (6/2), 9, 15
.Lcoeffs_it4:
.quad   0x0011000B0005          // h[0..2] = 5 (10/2), 11 (22/2), 17
.Lcoeffs:
.quad   0x00050002              // h[0..1] = 2, 5

// VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of vertically-neighbouring blocks
// On entry:
//   x0 -> top-left pel of lower block
//   x1 = row stride, bytes
//   w2 = PQUANT bitstream parameter
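//
// Equivalent C prototype (a sketch matching the vc1dsp function-pointer type):
//   void ff_vc1_v_loop_filter4_neon(uint8_t *src, ptrdiff_t stride, int pq);
// Per pixel pair (P4 above the edge, P5 below it), the filter reconstructed
// from the annotations below is approximately:
//   a0 = (2*P3 - 5*P4 + 5*P5 - 2*P6 + 4) >> 3
//   a1 = (2*P1 - 5*P2 + 5*P3 - 2*P4 + 4) >> 3
//   a2 = (2*P5 - 5*P6 + 5*P7 - 2*P8 + 4) >> 3
//   a3 = min(|a1|, |a2|);  clip = |P4 - P5| >> 1
//   if |a0| < pq, a3 < |a0| and clip != 0:
//     d = min((5 * (|a0| - a3)) >> 3, clip), with its sign derived from a0
//     and P4-P5, then P4 -= d and P5 += d (saturated to 8 bits on store)
// with the whole group of 4 pairs skipped when the decision pair is skipped.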
function ff_vc1_v_loop_filter4_neon, export=1
        sub             x3, x0, w1, sxtw #2
        ldr             d0, .Lcoeffs
        ld1             {v1.s}[0], [x0], x1     // P5
        ld1             {v2.s}[0], [x3], x1     // P1
        ld1             {v3.s}[0], [x3], x1     // P2
        ld1             {v4.s}[0], [x0], x1     // P6
        ld1             {v5.s}[0], [x3], x1     // P3
        ld1             {v6.s}[0], [x0], x1     // P7
        ld1             {v7.s}[0], [x3]         // P4
        ld1             {v16.s}[0], [x0]        // P8
        ushll           v17.8h, v1.8b, #1       // 2*P5
        dup             v18.8h, w2              // pq
        ushll           v2.8h, v2.8b, #1        // 2*P1
        uxtl            v3.8h, v3.8b            // P2
        uxtl            v4.8h, v4.8b            // P6
        uxtl            v19.8h, v5.8b           // P3
        mls             v2.4h, v3.4h, v0.h[1]   // 2*P1-5*P2
        uxtl            v3.8h, v6.8b            // P7
        mls             v17.4h, v4.4h, v0.h[1]  // 2*P5-5*P6
        ushll           v5.8h, v5.8b, #1        // 2*P3
        uxtl            v6.8h, v7.8b            // P4
        mla             v17.4h, v3.4h, v0.h[1]  // 2*P5-5*P6+5*P7
        uxtl            v3.8h, v16.8b           // P8
        mla             v2.4h, v19.4h, v0.h[1]  // 2*P1-5*P2+5*P3
        uxtl            v1.8h, v1.8b            // P5
        mls             v5.4h, v6.4h, v0.h[1]   // 2*P3-5*P4
        mls             v17.4h, v3.4h, v0.h[0]  // 2*P5-5*P6+5*P7-2*P8
        sub             v3.4h, v6.4h, v1.4h     // P4-P5
        mls             v2.4h, v6.4h, v0.h[0]   // 2*P1-5*P2+5*P3-2*P4
        mla             v5.4h, v1.4h, v0.h[1]   // 2*P3-5*P4+5*P5
        mls             v5.4h, v4.4h, v0.h[0]   // 2*P3-5*P4+5*P5-2*P6
        abs             v4.4h, v3.4h
        srshr           v7.4h, v17.4h, #3
        srshr           v2.4h, v2.4h, #3
        sshr            v4.4h, v4.4h, #1        // clip
        srshr           v5.4h, v5.4h, #3
        abs             v7.4h, v7.4h            // a2
        sshr            v3.4h, v3.4h, #8        // clip_sign
        abs             v2.4h, v2.4h            // a1
        cmeq            v16.4h, v4.4h, #0       // test clip == 0
        abs             v17.4h, v5.4h           // a0
        sshr            v5.4h, v5.4h, #8        // a0_sign
        cmhs            v19.4h, v2.4h, v7.4h    // test a1 >= a2
        cmhs            v18.4h, v17.4h, v18.4h  // test a0 >= pq
        sub             v3.4h, v3.4h, v5.4h     // clip_sign - a0_sign
        bsl             v19.8b, v7.8b, v2.8b    // a3
        orr             v2.8b, v16.8b, v18.8b   // test clip == 0 || a0 >= pq
        uqsub           v5.4h, v17.4h, v19.4h   // a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so it makes more sense to subtract this way round than the other way and then take the absolute value)
        cmhs            v7.4h, v19.4h, v17.4h   // test a3 >= a0
        mul             v0.4h, v5.4h, v0.h[1]   // a0 >= a3 ? 5*(a0-a3) : 0
        orr             v5.8b, v2.8b, v7.8b     // test clip == 0 || a0 >= pq || a3 >= a0
        mov             w0, v5.s[1]             // move to gp reg
        ushr            v0.4h, v0.4h, #3        // a0 >= a3 ? (5*(a0-a3))>>3 : 0
        cmhs            v5.4h, v0.4h, v4.4h
        tbnz            w0, #0, 1f              // none of the 4 pixel pairs should be updated if this one is not filtered
        bsl             v5.8b, v4.8b, v0.8b     // FFMIN(d, clip)
        bic             v0.8b, v5.8b, v2.8b     // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
        mls             v6.4h, v0.4h, v3.4h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
        mla             v1.4h, v0.4h, v3.4h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
        sqxtun          v0.8b, v6.8h
        sqxtun          v1.8b, v1.8h
        st1             {v0.s}[0], [x3], x1
        st1             {v1.s}[0], [x3]
1:      ret
endfunc

// VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of horizontally-neighbouring blocks
// On entry:
//   x0 -> top-left pel of right block
//   x1 = row stride, bytes
//   w2 = PQUANT bitstream parameter
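//
// Equivalent C prototype (a sketch matching the vc1dsp function-pointer type):
//   void ff_vc1_h_loop_filter4_neon(uint8_t *src, ptrdiff_t stride, int pq);
// Same filter as the vertical case above; P1..P8 run horizontally here, so the
// 8x4 patch is transposed on load with trn1/trn2 and the two filtered columns
// are written back with interleaved st2 stores.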
function ff_vc1_h_loop_filter4_neon, export=1
        sub             x3, x0, #4              // where to start reading
        ldr             d0, .Lcoeffs
        ld1             {v1.8b}, [x3], x1
        sub             x0, x0, #1              // where to start writing
        ld1             {v2.8b}, [x3], x1
        ld1             {v3.8b}, [x3], x1
        ld1             {v4.8b}, [x3]
        dup             v5.8h, w2               // pq
        trn1            v6.8b, v1.8b, v2.8b
        trn2            v1.8b, v1.8b, v2.8b
        trn1            v2.8b, v3.8b, v4.8b
        trn2            v3.8b, v3.8b, v4.8b
        trn1            v4.4h, v6.4h, v2.4h     // P1, P5
        trn1            v7.4h, v1.4h, v3.4h     // P2, P6
        trn2            v2.4h, v6.4h, v2.4h     // P3, P7
        trn2            v1.4h, v1.4h, v3.4h     // P4, P8
        ushll           v3.8h, v4.8b, #1        // 2*P1, 2*P5
        uxtl            v6.8h, v7.8b            // P2, P6
        uxtl            v7.8h, v2.8b            // P3, P7
        uxtl            v1.8h, v1.8b            // P4, P8
        mls             v3.8h, v6.8h, v0.h[1]   // 2*P1-5*P2, 2*P5-5*P6
        ushll           v2.8h, v2.8b, #1        // 2*P3, 2*P7
        uxtl            v4.8h, v4.8b            // P1, P5
        mla             v3.8h, v7.8h, v0.h[1]   // 2*P1-5*P2+5*P3, 2*P5-5*P6+5*P7
        mov             d6, v6.d[1]             // P6
        mls             v3.8h, v1.8h, v0.h[0]   // 2*P1-5*P2+5*P3-2*P4, 2*P5-5*P6+5*P7-2*P8
        mov             d4, v4.d[1]             // P5
        mls             v2.4h, v1.4h, v0.h[1]   // 2*P3-5*P4
        mla             v2.4h, v4.4h, v0.h[1]   // 2*P3-5*P4+5*P5
        sub             v7.4h, v1.4h, v4.4h     // P4-P5
        mls             v2.4h, v6.4h, v0.h[0]   // 2*P3-5*P4+5*P5-2*P6
        srshr           v3.8h, v3.8h, #3
        abs             v6.4h, v7.4h
        sshr            v7.4h, v7.4h, #8        // clip_sign
        srshr           v2.4h, v2.4h, #3
        abs             v3.8h, v3.8h            // a1, a2
        sshr            v6.4h, v6.4h, #1        // clip
        mov             d16, v3.d[1]            // a2
        abs             v17.4h, v2.4h           // a0
        cmeq            v18.4h, v6.4h, #0       // test clip == 0
        sshr            v2.4h, v2.4h, #8        // a0_sign
        cmhs            v19.4h, v3.4h, v16.4h   // test a1 >= a2
        cmhs            v5.4h, v17.4h, v5.4h    // test a0 >= pq
        sub             v2.4h, v7.4h, v2.4h     // clip_sign - a0_sign
        bsl             v19.8b, v16.8b, v3.8b   // a3
        orr             v3.8b, v18.8b, v5.8b    // test clip == 0 || a0 >= pq
        uqsub           v5.4h, v17.4h, v19.4h   // a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so it makes more sense to subtract this way round than the other way and then take the absolute value)
        cmhs            v7.4h, v19.4h, v17.4h   // test a3 >= a0
        mul             v0.4h, v5.4h, v0.h[1]   // a0 >= a3 ? 5*(a0-a3) : 0
        orr             v5.8b, v3.8b, v7.8b     // test clip == 0 || a0 >= pq || a3 >= a0
        mov             w2, v5.s[1]             // move to gp reg
        ushr            v0.4h, v0.4h, #3        // a0 >= a3 ? (5*(a0-a3))>>3 : 0
        cmhs            v5.4h, v0.4h, v6.4h
        tbnz            w2, #0, 1f              // none of the 4 pixel pairs should be updated if this one is not filtered
        bsl             v5.8b, v6.8b, v0.8b     // FFMIN(d, clip)
        bic             v0.8b, v5.8b, v3.8b     // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
        mla             v4.4h, v0.4h, v2.4h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
        mls             v1.4h, v0.4h, v2.4h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
        sqxtun          v3.8b, v4.8h
        sqxtun          v2.8b, v1.8h
        st2             {v2.b, v3.b}[0], [x0], x1
        st2             {v2.b, v3.b}[1], [x0], x1
        st2             {v2.b, v3.b}[2], [x0], x1
        st2             {v2.b, v3.b}[3], [x0]
1:      ret
endfunc

// VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of vertically-neighbouring blocks
// On entry:
//   x0 -> top-left pel of lower block
//   x1 = row stride, bytes
//   w2 = PQUANT bitstream parameter
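//
// Equivalent C prototype (a sketch matching the vc1dsp function-pointer type):
//   void ff_vc1_v_loop_filter8_neon(uint8_t *src, ptrdiff_t stride, int pq);
// 8-wide version of ff_vc1_v_loop_filter4_neon: two groups of 4 pixel pairs
// are filtered at once, each group gated by its own decision pair (see the
// cmtst against the 0x0000ffff00000000 mask below).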
function ff_vc1_v_loop_filter8_neon, export=1
        sub             x3, x0, w1, sxtw #2
        ldr             d0, .Lcoeffs
        ld1             {v1.8b}, [x0], x1       // P5
        movi            v2.2d, #0x0000ffff00000000
        ld1             {v3.8b}, [x3], x1       // P1
        ld1             {v4.8b}, [x3], x1       // P2
        ld1             {v5.8b}, [x0], x1       // P6
        ld1             {v6.8b}, [x3], x1       // P3
        ld1             {v7.8b}, [x0], x1       // P7
        ushll           v16.8h, v1.8b, #1       // 2*P5
        ushll           v3.8h, v3.8b, #1        // 2*P1
        ld1             {v17.8b}, [x3]          // P4
        uxtl            v4.8h, v4.8b            // P2
        ld1             {v18.8b}, [x0]          // P8
        uxtl            v5.8h, v5.8b            // P6
        dup             v19.8h, w2              // pq
        uxtl            v20.8h, v6.8b           // P3
        mls             v3.8h, v4.8h, v0.h[1]   // 2*P1-5*P2
        uxtl            v4.8h, v7.8b            // P7
        ushll           v6.8h, v6.8b, #1        // 2*P3
        mls             v16.8h, v5.8h, v0.h[1]  // 2*P5-5*P6
        uxtl            v7.8h, v17.8b           // P4
        uxtl            v17.8h, v18.8b          // P8
        mla             v16.8h, v4.8h, v0.h[1]  // 2*P5-5*P6+5*P7
        uxtl            v1.8h, v1.8b            // P5
        mla             v3.8h, v20.8h, v0.h[1]  // 2*P1-5*P2+5*P3
        sub             v4.8h, v7.8h, v1.8h     // P4-P5
        mls             v6.8h, v7.8h, v0.h[1]   // 2*P3-5*P4
        mls             v16.8h, v17.8h, v0.h[0] // 2*P5-5*P6+5*P7-2*P8
        abs             v17.8h, v4.8h
        sshr            v4.8h, v4.8h, #8        // clip_sign
        mls             v3.8h, v7.8h, v0.h[0]   // 2*P1-5*P2+5*P3-2*P4
        sshr            v17.8h, v17.8h, #1      // clip
        mla             v6.8h, v1.8h, v0.h[1]   // 2*P3-5*P4+5*P5
        srshr           v16.8h, v16.8h, #3
        mls             v6.8h, v5.8h, v0.h[0]   // 2*P3-5*P4+5*P5-2*P6
        cmeq            v5.8h, v17.8h, #0       // test clip == 0
        srshr           v3.8h, v3.8h, #3
        abs             v16.8h, v16.8h          // a2
        abs             v3.8h, v3.8h            // a1
        srshr           v6.8h, v6.8h, #3
        cmhs            v18.8h, v3.8h, v16.8h   // test a1 >= a2
        abs             v20.8h, v6.8h           // a0
        sshr            v6.8h, v6.8h, #8        // a0_sign
        bsl             v18.16b, v16.16b, v3.16b // a3
        cmhs            v3.8h, v20.8h, v19.8h   // test a0 >= pq
        sub             v4.8h, v4.8h, v6.8h     // clip_sign - a0_sign
905        uqsub           v6.8h, v20.8h, v18.8h   // a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
906        cmhs            v16.8h, v18.8h, v20.8h  // test a3 >= a0
907        orr             v3.16b, v5.16b, v3.16b  // test clip == 0 || a0 >= pq
908        mul             v0.8h, v6.8h, v0.h[1]   // a0 >= a3 ? 5*(a0-a3) : 0
909        orr             v5.16b, v3.16b, v16.16b // test clip == 0 || a0 >= pq || a3 >= a0
910        cmtst           v2.2d, v5.2d, v2.2d     // if 2nd of each group of is not filtered, then none of the others in the group should be either
911        mov             w0, v5.s[1]             // move to gp reg
912        ushr            v0.8h, v0.8h, #3        // a0 >= a3 ? (5*(a0-a3))>>3 : 0
913        mov             w2, v5.s[3]
914        orr             v2.16b, v3.16b, v2.16b
915        cmhs            v3.8h, v0.8h, v17.8h
916        and             w0, w0, w2
917        bsl             v3.16b, v17.16b, v0.16b // FFMIN(d, clip)
918        tbnz            w0, #0, 1f              // none of the 8 pixel pairs should be updated in this case
919        bic             v0.16b, v3.16b, v2.16b  // set each d to zero if it should not be filtered
920        mls             v7.8h, v0.8h, v4.8h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
921        mla             v1.8h, v0.8h, v4.8h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
922        sqxtun          v0.8b, v7.8h
923        sqxtun          v1.8b, v1.8h
924        st1             {v0.8b}, [x3], x1
925        st1             {v1.8b}, [x3]
9261:      ret
927endfunc
928
929// VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of horizontally-neighbouring blocks
930// On entry:
931//   x0 -> top-left pel of right block
932//   x1 = row stride, bytes
933//   w2 = PQUANT bitstream parameter
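//
// The filter taps run horizontally here, so the 8 rows are first transposed
// with trn1/trn2 at byte, halfword and word granularity until each of
// P1..P8 occupies its own register; the arithmetic is then identical to the
// vertical-boundary case above. Stores go back via st2, which re-interleaves
// the two updated columns (P4 and P5) one row at a time.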
function ff_vc1_h_loop_filter8_neon, export=1
        sub             x3, x0, #4              // where to start reading
        ldr             d0, .Lcoeffs
        ld1             {v1.8b}, [x3], x1       // P1[0], P2[0]...
        sub             x0, x0, #1              // where to start writing
        ld1             {v2.8b}, [x3], x1
        add             x4, x0, x1, lsl #2
        ld1             {v3.8b}, [x3], x1
        ld1             {v4.8b}, [x3], x1
        ld1             {v5.8b}, [x3], x1
        ld1             {v6.8b}, [x3], x1
        ld1             {v7.8b}, [x3], x1
        trn1            v16.8b, v1.8b, v2.8b    // P1[0], P1[1], P3[0]...
        ld1             {v17.8b}, [x3]
        trn2            v1.8b, v1.8b, v2.8b     // P2[0], P2[1], P4[0]...
        trn1            v2.8b, v3.8b, v4.8b     // P1[2], P1[3], P3[2]...
        trn2            v3.8b, v3.8b, v4.8b     // P2[2], P2[3], P4[2]...
        dup             v4.8h, w2               // pq
        trn1            v18.8b, v5.8b, v6.8b    // P1[4], P1[5], P3[4]...
        trn2            v5.8b, v5.8b, v6.8b     // P2[4], P2[5], P4[4]...
        trn1            v6.4h, v16.4h, v2.4h    // P1[0], P1[1], P1[2], P1[3], P5[0]...
        trn1            v19.4h, v1.4h, v3.4h    // P2[0], P2[1], P2[2], P2[3], P6[0]...
        trn1            v20.8b, v7.8b, v17.8b   // P1[6], P1[7], P3[6]...
        trn2            v7.8b, v7.8b, v17.8b    // P2[6], P2[7], P4[6]...
        trn2            v2.4h, v16.4h, v2.4h    // P3[0], P3[1], P3[2], P3[3], P7[0]...
        trn2            v1.4h, v1.4h, v3.4h     // P4[0], P4[1], P4[2], P4[3], P8[0]...
        trn1            v3.4h, v18.4h, v20.4h   // P1[4], P1[5], P1[6], P1[7], P5[4]...
        trn1            v16.4h, v5.4h, v7.4h    // P2[4], P2[5], P2[6], P2[7], P6[4]...
        trn2            v17.4h, v18.4h, v20.4h  // P3[4], P3[5], P3[6], P3[7], P7[4]...
        trn2            v5.4h, v5.4h, v7.4h     // P4[4], P4[5], P4[6], P4[7], P8[4]...
        trn1            v7.2s, v6.2s, v3.2s     // P1
        trn1            v18.2s, v19.2s, v16.2s  // P2
        trn2            v3.2s, v6.2s, v3.2s     // P5
        trn2            v6.2s, v19.2s, v16.2s   // P6
        trn1            v16.2s, v2.2s, v17.2s   // P3
        trn2            v2.2s, v2.2s, v17.2s    // P7
        ushll           v7.8h, v7.8b, #1        // 2*P1
        trn1            v17.2s, v1.2s, v5.2s    // P4
        ushll           v19.8h, v3.8b, #1       // 2*P5
        trn2            v1.2s, v1.2s, v5.2s     // P8
        uxtl            v5.8h, v18.8b           // P2
        uxtl            v6.8h, v6.8b            // P6
        uxtl            v18.8h, v16.8b          // P3
        mls             v7.8h, v5.8h, v0.h[1]   // 2*P1-5*P2
        uxtl            v2.8h, v2.8b            // P7
        ushll           v5.8h, v16.8b, #1       // 2*P3
        mls             v19.8h, v6.8h, v0.h[1]  // 2*P5-5*P6
        uxtl            v16.8h, v17.8b          // P4
        uxtl            v1.8h, v1.8b            // P8
        mla             v19.8h, v2.8h, v0.h[1]  // 2*P5-5*P6+5*P7
        uxtl            v2.8h, v3.8b            // P5
        mla             v7.8h, v18.8h, v0.h[1]  // 2*P1-5*P2+5*P3
        sub             v3.8h, v16.8h, v2.8h    // P4-P5
        mls             v5.8h, v16.8h, v0.h[1]  // 2*P3-5*P4
        mls             v19.8h, v1.8h, v0.h[0]  // 2*P5-5*P6+5*P7-2*P8
        abs             v1.8h, v3.8h
        sshr            v3.8h, v3.8h, #8        // clip_sign
        mls             v7.8h, v16.8h, v0.h[0]  // 2*P1-5*P2+5*P3-2*P4
        sshr            v1.8h, v1.8h, #1        // clip
        mla             v5.8h, v2.8h, v0.h[1]   // 2*P3-5*P4+5*P5
        srshr           v17.8h, v19.8h, #3
        mls             v5.8h, v6.8h, v0.h[0]   // 2*P3-5*P4+5*P5-2*P6
        cmeq            v6.8h, v1.8h, #0        // test clip == 0
        srshr           v7.8h, v7.8h, #3
        abs             v17.8h, v17.8h          // a2
        abs             v7.8h, v7.8h            // a1
        srshr           v5.8h, v5.8h, #3
        cmhs            v18.8h, v7.8h, v17.8h   // test a1 >= a2
        abs             v19.8h, v5.8h           // a0
        sshr            v5.8h, v5.8h, #8        // a0_sign
        bsl             v18.16b, v17.16b, v7.16b // a3
        cmhs            v4.8h, v19.8h, v4.8h    // test a0 >= pq
        sub             v3.8h, v3.8h, v5.8h     // clip_sign - a0_sign
        uqsub           v5.8h, v19.8h, v18.8h   // a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so it makes more sense to subtract this way round than the other way and then take the abs)
        cmhs            v7.8h, v18.8h, v19.8h   // test a3 >= a0
        orr             v4.16b, v6.16b, v4.16b  // test clip == 0 || a0 >= pq
        mul             v0.8h, v5.8h, v0.h[1]   // a0 >= a3 ? 5*(a0-a3) : 0
        orr             v5.16b, v4.16b, v7.16b  // test clip == 0 || a0 >= pq || a3 >= a0
        mov             w2, v5.s[1]             // move to gp reg
        ushr            v0.8h, v0.8h, #3        // a0 >= a3 ? (5*(a0-a3))>>3 : 0
        mov             w3, v5.s[3]
        cmhs            v5.8h, v0.8h, v1.8h     // test d >= clip
        and             w5, w2, w3
        bsl             v5.16b, v1.16b, v0.16b  // FFMIN(d, clip)
        tbnz            w5, #0, 2f              // none of the 8 pixel pairs should be updated in this case
        bic             v0.16b, v5.16b, v4.16b  // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
        mla             v2.8h, v0.8h, v3.8h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
        mls             v16.8h, v0.8h, v3.8h    // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
        sqxtun          v1.8b, v2.8h
        sqxtun          v0.8b, v16.8h
        tbnz            w2, #0, 1f              // none of the first 4 pixel pairs should be updated if so
        st2             {v0.b, v1.b}[0], [x0], x1
        st2             {v0.b, v1.b}[1], [x0], x1
        st2             {v0.b, v1.b}[2], [x0], x1
        st2             {v0.b, v1.b}[3], [x0]
1:      tbnz            w3, #0, 2f              // none of the second 4 pixel pairs should be updated if so
        st2             {v0.b, v1.b}[4], [x4], x1
        st2             {v0.b, v1.b}[5], [x4], x1
        st2             {v0.b, v1.b}[6], [x4], x1
        st2             {v0.b, v1.b}[7], [x4]
2:      ret
endfunc

// VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of vertically-neighbouring blocks
// On entry:
//   x0 -> top-left pel of lower block
//   x1 = row stride, bytes
//   w2 = PQUANT bitstream parameter
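//
// Same algorithm as ff_vc1_v_loop_filter8_neon, but 16 columns wide: the
// low and high halves ([0..7] and [8..15]) are carried through the whole
// calculation as two interleaved 8-lane streams, each with its own clip,
// a0/a3 and early-out masks.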
function ff_vc1_v_loop_filter16_neon, export=1
        sub             x3, x0, w1, sxtw #2
        ldr             d0, .Lcoeffs
        ld1             {v1.16b}, [x0], x1      // P5
        movi            v2.2d, #0x0000ffff00000000
        ld1             {v3.16b}, [x3], x1      // P1
        ld1             {v4.16b}, [x3], x1      // P2
        ld1             {v5.16b}, [x0], x1      // P6
        ld1             {v6.16b}, [x3], x1      // P3
        ld1             {v7.16b}, [x0], x1      // P7
        ushll           v16.8h, v1.8b, #1       // 2*P5[0..7]
        ushll           v17.8h, v3.8b, #1       // 2*P1[0..7]
        ld1             {v18.16b}, [x3]         // P4
        uxtl            v19.8h, v4.8b           // P2[0..7]
        ld1             {v20.16b}, [x0]         // P8
        uxtl            v21.8h, v5.8b           // P6[0..7]
        dup             v22.8h, w2              // pq
        ushll2          v3.8h, v3.16b, #1       // 2*P1[8..15]
        mls             v17.8h, v19.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7]
        ushll2          v19.8h, v1.16b, #1      // 2*P5[8..15]
        uxtl2           v4.8h, v4.16b           // P2[8..15]
        mls             v16.8h, v21.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]
        uxtl2           v5.8h, v5.16b           // P6[8..15]
        uxtl            v23.8h, v6.8b           // P3[0..7]
        uxtl            v24.8h, v7.8b           // P7[0..7]
        mls             v3.8h, v4.8h, v0.h[1]   // 2*P1[8..15]-5*P2[8..15]
        ushll           v4.8h, v6.8b, #1        // 2*P3[0..7]
        uxtl            v25.8h, v18.8b          // P4[0..7]
        mls             v19.8h, v5.8h, v0.h[1]  // 2*P5[8..15]-5*P6[8..15]
        uxtl2           v26.8h, v6.16b          // P3[8..15]
        mla             v17.8h, v23.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
        uxtl2           v7.8h, v7.16b           // P7[8..15]
        ushll2          v6.8h, v6.16b, #1       // 2*P3[8..15]
        mla             v16.8h, v24.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
        uxtl2           v18.8h, v18.16b         // P4[8..15]
        uxtl            v23.8h, v20.8b          // P8[0..7]
        mls             v4.8h, v25.8h, v0.h[1]  // 2*P3[0..7]-5*P4[0..7]
        uxtl            v24.8h, v1.8b           // P5[0..7]
        uxtl2           v20.8h, v20.16b         // P8[8..15]
        mla             v3.8h, v26.8h, v0.h[1]  // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
        uxtl2           v1.8h, v1.16b           // P5[8..15]
        sub             v26.8h, v25.8h, v24.8h  // P4[0..7]-P5[0..7]
        mla             v19.8h, v7.8h, v0.h[1]  // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
        sub             v7.8h, v18.8h, v1.8h    // P4[8..15]-P5[8..15]
        mls             v6.8h, v18.8h, v0.h[1]  // 2*P3[8..15]-5*P4[8..15]
        abs             v27.8h, v26.8h
        sshr            v26.8h, v26.8h, #8      // clip_sign[0..7]
        mls             v17.8h, v25.8h, v0.h[0] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
        abs             v28.8h, v7.8h
        sshr            v27.8h, v27.8h, #1      // clip[0..7]
        mls             v16.8h, v23.8h, v0.h[0] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
        sshr            v7.8h, v7.8h, #8        // clip_sign[8..15]
        sshr            v23.8h, v28.8h, #1      // clip[8..15]
        mla             v4.8h, v24.8h, v0.h[1]  // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
        cmeq            v28.8h, v27.8h, #0      // test clip[0..7] == 0
        srshr           v17.8h, v17.8h, #3
        mls             v3.8h, v18.8h, v0.h[0]  // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
        cmeq            v29.8h, v23.8h, #0      // test clip[8..15] == 0
        srshr           v16.8h, v16.8h, #3
        mls             v19.8h, v20.8h, v0.h[0] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
        abs             v17.8h, v17.8h          // a1[0..7]
        mla             v6.8h, v1.8h, v0.h[1]   // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
        srshr           v3.8h, v3.8h, #3
        mls             v4.8h, v21.8h, v0.h[0]  // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
        abs             v16.8h, v16.8h          // a2[0..7]
        srshr           v19.8h, v19.8h, #3
        mls             v6.8h, v5.8h, v0.h[0]   // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
        cmhs            v5.8h, v17.8h, v16.8h   // test a1[0..7] >= a2[0..7]
        abs             v3.8h, v3.8h            // a1[8..15]
        srshr           v4.8h, v4.8h, #3
        abs             v19.8h, v19.8h          // a2[8..15]
        bsl             v5.16b, v16.16b, v17.16b // a3[0..7]
        srshr           v6.8h, v6.8h, #3
        cmhs            v16.8h, v3.8h, v19.8h   // test a1[8..15] >= a2[8..15]
        abs             v17.8h, v4.8h           // a0[0..7]
        sshr            v4.8h, v4.8h, #8        // a0_sign[0..7]
        bsl             v16.16b, v19.16b, v3.16b // a3[8..15]
        uqsub           v3.8h, v17.8h, v5.8h    // a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0  (a0 > a3 in all cases where filtering is enabled, so it makes more sense to subtract this way round than the other way and then take the abs)
        abs             v19.8h, v6.8h           // a0[8..15]
        cmhs            v20.8h, v17.8h, v22.8h  // test a0[0..7] >= pq
        cmhs            v5.8h, v5.8h, v17.8h    // test a3[0..7] >= a0[0..7]
        sub             v4.8h, v26.8h, v4.8h    // clip_sign[0..7] - a0_sign[0..7]
        sshr            v6.8h, v6.8h, #8        // a0_sign[8..15]
        mul             v3.8h, v3.8h, v0.h[1]   // a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
        uqsub           v17.8h, v19.8h, v16.8h  // a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0  (a0 > a3 in all cases where filtering is enabled, so it makes more sense to subtract this way round than the other way and then take the abs)
        orr             v20.16b, v28.16b, v20.16b // test clip[0..7] == 0 || a0[0..7] >= pq
        cmhs            v21.8h, v19.8h, v22.8h  // test a0[8..15] >= pq
        cmhs            v16.8h, v16.8h, v19.8h  // test a3[8..15] >= a0[8..15]
        mul             v0.8h, v17.8h, v0.h[1]  // a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
        sub             v6.8h, v7.8h, v6.8h     // clip_sign[8..15] - a0_sign[8..15]
        orr             v5.16b, v20.16b, v5.16b // test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
        ushr            v3.8h, v3.8h, #3        // a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
        orr             v7.16b, v29.16b, v21.16b // test clip[8..15] == 0 || a0[8..15] >= pq
        cmtst           v17.2d, v5.2d, v2.2d    // if the pair at index 2 of each group of 4 is not filtered, then none of the others in the group should be either
        mov             w0, v5.s[1]             // move to gp reg
        cmhs            v19.8h, v3.8h, v27.8h   // test d[0..7] >= clip[0..7]
        ushr            v0.8h, v0.8h, #3        // a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
        mov             w2, v5.s[3]
        orr             v5.16b, v7.16b, v16.16b // test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
        orr             v16.16b, v20.16b, v17.16b
        bsl             v19.16b, v27.16b, v3.16b // FFMIN(d[0..7], clip[0..7])
        cmtst           v2.2d, v5.2d, v2.2d     // same per-group-of-4 gating for lanes 8..15
        cmhs            v3.8h, v0.8h, v23.8h    // test d[8..15] >= clip[8..15]
        mov             w4, v5.s[1]
        mov             w5, v5.s[3]
        and             w0, w0, w2
        bic             v5.16b, v19.16b, v16.16b // set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
        orr             v2.16b, v7.16b, v2.16b
        bsl             v3.16b, v23.16b, v0.16b // FFMIN(d[8..15], clip[8..15])
        mls             v25.8h, v5.8h, v4.8h    // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4[0..7]
        and             w2, w4, w5
        bic             v0.16b, v3.16b, v2.16b  // set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
        mla             v24.8h, v5.8h, v4.8h    // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5[0..7]
        and             w0, w0, w2
        mls             v18.8h, v0.8h, v6.8h    // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4[8..15]
        sqxtun          v2.8b, v25.8h
        tbnz            w0, #0, 1f              // none of the 16 pixel pairs should be updated in this case
        mla             v1.8h, v0.8h, v6.8h     // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5[8..15]
        sqxtun          v0.8b, v24.8h
        sqxtun2         v2.16b, v18.8h
        sqxtun2         v0.16b, v1.8h
        st1             {v2.16b}, [x3], x1
        st1             {v0.16b}, [x3]
1:      ret
endfunc

// VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of horizontally-neighbouring blocks
// On entry:
//   x0 -> top-left pel of right block
//   x1 = row stride, bytes
//   w2 = PQUANT bitstream parameter
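//
// Combines the two layouts above: a 16-row load is transposed as in
// ff_vc1_h_loop_filter8_neon, then filtered as two 8-lane halves, with the
// per-group-of-4 store gating done through w2/w3 (rows 0-7) and w7/w8
// (rows 8-15).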
function ff_vc1_h_loop_filter16_neon, export=1
        sub             x3, x0, #4              // where to start reading
        ldr             d0, .Lcoeffs
        ld1             {v1.8b}, [x3], x1       // P1[0], P2[0]...
        sub             x0, x0, #1              // where to start writing
        ld1             {v2.8b}, [x3], x1
        add             x4, x0, x1, lsl #3
        ld1             {v3.8b}, [x3], x1
        add             x5, x0, x1, lsl #2
        ld1             {v4.8b}, [x3], x1
        add             x6, x4, x1, lsl #2
        ld1             {v5.8b}, [x3], x1
        ld1             {v6.8b}, [x3], x1
        ld1             {v7.8b}, [x3], x1
        trn1            v16.8b, v1.8b, v2.8b    // P1[0], P1[1], P3[0]...
        ld1             {v17.8b}, [x3], x1
        trn2            v1.8b, v1.8b, v2.8b     // P2[0], P2[1], P4[0]...
        ld1             {v2.8b}, [x3], x1
        trn1            v18.8b, v3.8b, v4.8b    // P1[2], P1[3], P3[2]...
        ld1             {v19.8b}, [x3], x1
        trn2            v3.8b, v3.8b, v4.8b     // P2[2], P2[3], P4[2]...
        ld1             {v4.8b}, [x3], x1
        trn1            v20.8b, v5.8b, v6.8b    // P1[4], P1[5], P3[4]...
        ld1             {v21.8b}, [x3], x1
        trn2            v5.8b, v5.8b, v6.8b     // P2[4], P2[5], P4[4]...
        ld1             {v6.8b}, [x3], x1
        trn1            v22.8b, v7.8b, v17.8b   // P1[6], P1[7], P3[6]...
        ld1             {v23.8b}, [x3], x1
        trn2            v7.8b, v7.8b, v17.8b    // P2[6], P2[7], P4[6]...
        ld1             {v17.8b}, [x3], x1
        trn1            v24.8b, v2.8b, v19.8b   // P1[8], P1[9], P3[8]...
        ld1             {v25.8b}, [x3]
        trn2            v2.8b, v2.8b, v19.8b    // P2[8], P2[9], P4[8]...
        trn1            v19.4h, v16.4h, v18.4h  // P1[0], P1[1], P1[2], P1[3], P5[0]...
        trn1            v26.8b, v4.8b, v21.8b   // P1[10], P1[11], P3[10]...
        trn2            v4.8b, v4.8b, v21.8b    // P2[10], P2[11], P4[10]...
        trn1            v21.4h, v1.4h, v3.4h    // P2[0], P2[1], P2[2], P2[3], P6[0]...
        trn1            v27.4h, v20.4h, v22.4h  // P1[4], P1[5], P1[6], P1[7], P5[4]...
        trn1            v28.8b, v6.8b, v23.8b   // P1[12], P1[13], P3[12]...
        trn2            v6.8b, v6.8b, v23.8b    // P2[12], P2[13], P4[12]...
        trn1            v23.4h, v5.4h, v7.4h    // P2[4], P2[5], P2[6], P2[7], P6[4]...
        trn1            v29.4h, v24.4h, v26.4h  // P1[8], P1[9], P1[10], P1[11], P5[8]...
        trn1            v30.8b, v17.8b, v25.8b  // P1[14], P1[15], P3[14]...
        trn2            v17.8b, v17.8b, v25.8b  // P2[14], P2[15], P4[14]...
        trn1            v25.4h, v2.4h, v4.4h    // P2[8], P2[9], P2[10], P2[11], P6[8]...
        trn1            v31.2s, v19.2s, v27.2s  // P1[0..7]
        trn2            v19.2s, v19.2s, v27.2s  // P5[0..7]
        trn1            v27.2s, v21.2s, v23.2s  // P2[0..7]
        trn2            v21.2s, v21.2s, v23.2s  // P6[0..7]
        trn1            v23.4h, v28.4h, v30.4h  // P1[12], P1[13], P1[14], P1[15], P5[12]...
        trn2            v16.4h, v16.4h, v18.4h  // P3[0], P3[1], P3[2], P3[3], P7[0]...
        trn1            v18.4h, v6.4h, v17.4h   // P2[12], P2[13], P2[14], P2[15], P6[12]...
        trn2            v20.4h, v20.4h, v22.4h  // P3[4], P3[5], P3[6], P3[7], P7[4]...
        trn2            v22.4h, v24.4h, v26.4h  // P3[8], P3[9], P3[10], P3[11], P7[8]...
        trn1            v24.2s, v29.2s, v23.2s  // P1[8..15]
        trn2            v23.2s, v29.2s, v23.2s  // P5[8..15]
        trn1            v26.2s, v25.2s, v18.2s  // P2[8..15]
        trn2            v18.2s, v25.2s, v18.2s  // P6[8..15]
        trn2            v25.4h, v28.4h, v30.4h  // P3[12], P3[13], P3[14], P3[15], P7[12]...
        trn2            v1.4h, v1.4h, v3.4h     // P4[0], P4[1], P4[2], P4[3], P8[0]...
        trn2            v3.4h, v5.4h, v7.4h     // P4[4], P4[5], P4[6], P4[7], P8[4]...
        trn2            v2.4h, v2.4h, v4.4h     // P4[8], P4[9], P4[10], P4[11], P8[8]...
        trn2            v4.4h, v6.4h, v17.4h    // P4[12], P4[13], P4[14], P4[15], P8[12]...
        ushll           v5.8h, v31.8b, #1       // 2*P1[0..7]
        ushll           v6.8h, v19.8b, #1       // 2*P5[0..7]
        trn1            v7.2s, v16.2s, v20.2s   // P3[0..7]
        uxtl            v17.8h, v27.8b          // P2[0..7]
        trn2            v16.2s, v16.2s, v20.2s  // P7[0..7]
        uxtl            v20.8h, v21.8b          // P6[0..7]
        trn1            v21.2s, v22.2s, v25.2s  // P3[8..15]
        ushll           v24.8h, v24.8b, #1      // 2*P1[8..15]
        trn2            v22.2s, v22.2s, v25.2s  // P7[8..15]
        ushll           v25.8h, v23.8b, #1      // 2*P5[8..15]
        trn1            v27.2s, v1.2s, v3.2s    // P4[0..7]
        uxtl            v26.8h, v26.8b          // P2[8..15]
        mls             v5.8h, v17.8h, v0.h[1]  // 2*P1[0..7]-5*P2[0..7]
        uxtl            v17.8h, v18.8b          // P6[8..15]
        mls             v6.8h, v20.8h, v0.h[1]  // 2*P5[0..7]-5*P6[0..7]
        trn1            v18.2s, v2.2s, v4.2s    // P4[8..15]
        uxtl            v28.8h, v7.8b           // P3[0..7]
        mls             v24.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15]
        uxtl            v16.8h, v16.8b          // P7[0..7]
        uxtl            v26.8h, v21.8b          // P3[8..15]
        mls             v25.8h, v17.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15]
        uxtl            v22.8h, v22.8b          // P7[8..15]
        ushll           v7.8h, v7.8b, #1        // 2*P3[0..7]
        uxtl            v27.8h, v27.8b          // P4[0..7]
        trn2            v1.2s, v1.2s, v3.2s     // P8[0..7]
        ushll           v3.8h, v21.8b, #1       // 2*P3[8..15]
        trn2            v2.2s, v2.2s, v4.2s     // P8[8..15]
        uxtl            v4.8h, v18.8b           // P4[8..15]
        mla             v5.8h, v28.8h, v0.h[1]  // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
        uxtl            v1.8h, v1.8b            // P8[0..7]
        mla             v6.8h, v16.8h, v0.h[1]  // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
        uxtl            v2.8h, v2.8b            // P8[8..15]
        uxtl            v16.8h, v19.8b          // P5[0..7]
        mla             v24.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
        uxtl            v18.8h, v23.8b          // P5[8..15]
        dup             v19.8h, w2              // pq
        mla             v25.8h, v22.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
        sub             v21.8h, v27.8h, v16.8h  // P4[0..7]-P5[0..7]
        sub             v22.8h, v4.8h, v18.8h   // P4[8..15]-P5[8..15]
        mls             v7.8h, v27.8h, v0.h[1]  // 2*P3[0..7]-5*P4[0..7]
        abs             v23.8h, v21.8h
        mls             v3.8h, v4.8h, v0.h[1]   // 2*P3[8..15]-5*P4[8..15]
        abs             v26.8h, v22.8h
        sshr            v21.8h, v21.8h, #8      // clip_sign[0..7]
        mls             v5.8h, v27.8h, v0.h[0]  // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
        sshr            v23.8h, v23.8h, #1      // clip[0..7]
        sshr            v26.8h, v26.8h, #1      // clip[8..15]
        mls             v6.8h, v1.8h, v0.h[0]   // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
        sshr            v1.8h, v22.8h, #8       // clip_sign[8..15]
        cmeq            v22.8h, v23.8h, #0      // test clip[0..7] == 0
        mls             v24.8h, v4.8h, v0.h[0]  // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
        cmeq            v28.8h, v26.8h, #0      // test clip[8..15] == 0
        srshr           v5.8h, v5.8h, #3
        mls             v25.8h, v2.8h, v0.h[0]  // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
        srshr           v2.8h, v6.8h, #3
        mla             v7.8h, v16.8h, v0.h[1]  // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
        srshr           v6.8h, v24.8h, #3
        mla             v3.8h, v18.8h, v0.h[1]  // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
        abs             v5.8h, v5.8h            // a1[0..7]
        srshr           v24.8h, v25.8h, #3
        mls             v3.8h, v17.8h, v0.h[0]  // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
        abs             v2.8h, v2.8h            // a2[0..7]
        abs             v6.8h, v6.8h            // a1[8..15]
        mls             v7.8h, v20.8h, v0.h[0]  // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
        abs             v17.8h, v24.8h          // a2[8..15]
        cmhs            v20.8h, v5.8h, v2.8h    // test a1[0..7] >= a2[0..7]
        srshr           v3.8h, v3.8h, #3
        cmhs            v24.8h, v6.8h, v17.8h   // test a1[8..15] >= a2[8..15]
        srshr           v7.8h, v7.8h, #3
        bsl             v20.16b, v2.16b, v5.16b // a3[0..7]
        abs             v2.8h, v3.8h            // a0[8..15]
        sshr            v3.8h, v3.8h, #8        // a0_sign[8..15]
        bsl             v24.16b, v17.16b, v6.16b // a3[8..15]
        abs             v5.8h, v7.8h            // a0[0..7]
        sshr            v6.8h, v7.8h, #8        // a0_sign[0..7]
        cmhs            v7.8h, v2.8h, v19.8h    // test a0[8..15] >= pq
        sub             v1.8h, v1.8h, v3.8h     // clip_sign[8..15] - a0_sign[8..15]
        uqsub           v3.8h, v2.8h, v24.8h    // a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0  (a0 > a3 in all cases where filtering is enabled, so it makes more sense to subtract this way round than the other way and then take the abs)
        cmhs            v2.8h, v24.8h, v2.8h    // test a3[8..15] >= a0[8..15]
        uqsub           v17.8h, v5.8h, v20.8h   // a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0  (a0 > a3 in all cases where filtering is enabled, so it makes more sense to subtract this way round than the other way and then take the abs)
        cmhs            v19.8h, v5.8h, v19.8h   // test a0[0..7] >= pq
        orr             v7.16b, v28.16b, v7.16b // test clip[8..15] == 0 || a0[8..15] >= pq
        sub             v6.8h, v21.8h, v6.8h    // clip_sign[0..7] - a0_sign[0..7]
        mul             v3.8h, v3.8h, v0.h[1]   // a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
        cmhs            v5.8h, v20.8h, v5.8h    // test a3[0..7] >= a0[0..7]
        orr             v19.16b, v22.16b, v19.16b // test clip[0..7] == 0 || a0[0..7] >= pq
        mul             v0.8h, v17.8h, v0.h[1]  // a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
        orr             v2.16b, v7.16b, v2.16b  // test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
        orr             v5.16b, v19.16b, v5.16b // test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
        ushr            v3.8h, v3.8h, #3        // a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
        mov             w7, v2.s[1]
        mov             w8, v2.s[3]
        ushr            v0.8h, v0.8h, #3        // a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
        mov             w2, v5.s[1]             // move to gp reg
        cmhs            v2.8h, v3.8h, v26.8h    // test d[8..15] >= clip[8..15]
        mov             w3, v5.s[3]
        cmhs            v5.8h, v0.8h, v23.8h    // test d[0..7] >= clip[0..7]
        bsl             v2.16b, v26.16b, v3.16b // FFMIN(d[8..15], clip[8..15])
        and             w9, w7, w8
        bsl             v5.16b, v23.16b, v0.16b // FFMIN(d[0..7], clip[0..7])
        and             w10, w2, w3
        bic             v0.16b, v2.16b, v7.16b  // set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
        and             w9, w10, w9
        bic             v2.16b, v5.16b, v19.16b // set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
        mls             v4.8h, v0.8h, v1.8h     // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4
        tbnz            w9, #0, 4f              // none of the 16 pixel pairs should be updated in this case
        mls             v27.8h, v2.8h, v6.8h    // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4
        mla             v16.8h, v2.8h, v6.8h    // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5
        sqxtun          v2.8b, v4.8h
        mla             v18.8h, v0.8h, v1.8h    // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5
        sqxtun          v0.8b, v27.8h
        sqxtun          v1.8b, v16.8h
        sqxtun          v3.8b, v18.8h
        tbnz            w2, #0, 1f              // none of the first 4 pixel pairs should be updated if so
        st2             {v0.b, v1.b}[0], [x0], x1
        st2             {v0.b, v1.b}[1], [x0], x1
        st2             {v0.b, v1.b}[2], [x0], x1
        st2             {v0.b, v1.b}[3], [x0]
1:      tbnz            w3, #0, 2f              // none of the second 4 pixel pairs should be updated if so
        st2             {v0.b, v1.b}[4], [x5], x1
        st2             {v0.b, v1.b}[5], [x5], x1
        st2             {v0.b, v1.b}[6], [x5], x1
        st2             {v0.b, v1.b}[7], [x5]
2:      tbnz            w7, #0, 3f              // none of the third 4 pixel pairs should be updated if so
        st2             {v2.b, v3.b}[0], [x4], x1
        st2             {v2.b, v3.b}[1], [x4], x1
        st2             {v2.b, v3.b}[2], [x4], x1
        st2             {v2.b, v3.b}[3], [x4]
3:      tbnz            w8, #0, 4f              // none of the fourth 4 pixel pairs should be updated if so
        st2             {v2.b, v3.b}[4], [x6], x1
        st2             {v2.b, v3.b}[5], [x6], x1
        st2             {v2.b, v3.b}[6], [x6], x1
        st2             {v2.b, v3.b}[7], [x6]
4:      ret
endfunc

// Copy at most the specified number of bytes from source to destination buffer,
// stopping at a multiple of 32 bytes, none of which are the start of an escape sequence
// On entry:
//   x0 -> source buffer
//   w1 = max number of bytes to copy
//   x2 -> destination buffer, optimally 8-byte aligned
// On exit:
//   w0 = number of bytes not copied
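//
// A VC-1 escape sequence is the byte triple 0x00 0x00 0x03 followed by a
// byte in the range 0x00..0x03. The vector code below tests for it at every
// byte offset: for each little-endian 32-bit window w (windows at byte
// offsets 1..3 being generated by the ext instructions), it effectively
// evaluates
//
//   is_escape(w) = ((w & ~0x03000000) ^ 0x00030000) == 0
//
// i.e. bytes 0 and 1 are 0x00, byte 2 is 0x03 and byte 3 is <= 0x03. This
// is what the bic (v20 = 0x03000000 per lane), eor (v21 = 0x00030000 per
// lane) and cmeq chains compute; the orr/addv reduction then collapses the
// per-window results into a single "does any escape start in this 32-byte
// chunk" flag in w3.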
function ff_vc1_unescape_buffer_helper_neon, export=1
        // Offset by 80 to screen out cases that are too short for us to handle,
        // and also make it easy to test for loop termination, or to determine
        // whether we need an odd number of half-iterations of the loop.
        subs            w1, w1, #80
        b.mi            90f
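        // Worked example (added for orientation): with w1 = 100 on entry,
        // w1 is now 20, so we continue; since 20 & 32 == 0, the branch below
        // is not taken and we use the prologue that re-adds 32 to w1 and
        // enters the main loop at its second half (label 3).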

        // Set up useful constants
        movi            v20.4s, #3, lsl #24     // 0x03000000 in each 32-bit lane
        movi            v21.4s, #3, lsl #16     // 0x00030000 in each 32-bit lane

        tst             w1, #32
        b.ne            1f
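
        // Note on layout: in the loop below, the more deeply indented
        // instructions work on a newer 32-byte chunk, overlapped with the
        // escape checks still in flight for the previous chunk (a
        // software-pipelining aid only; the indentation has no semantic
        // effect).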

          ld1             {v0.16b, v1.16b, v2.16b}, [x0], #48
          ext             v25.16b, v0.16b, v1.16b, #1
          ext             v26.16b, v0.16b, v1.16b, #2
          ext             v27.16b, v0.16b, v1.16b, #3
          ext             v29.16b, v1.16b, v2.16b, #1
          ext             v30.16b, v1.16b, v2.16b, #2
          ext             v31.16b, v1.16b, v2.16b, #3
          bic             v24.16b, v0.16b, v20.16b
          bic             v25.16b, v25.16b, v20.16b
          bic             v26.16b, v26.16b, v20.16b
          bic             v27.16b, v27.16b, v20.16b
          bic             v28.16b, v1.16b, v20.16b
          bic             v29.16b, v29.16b, v20.16b
          bic             v30.16b, v30.16b, v20.16b
          bic             v31.16b, v31.16b, v20.16b
          eor             v24.16b, v24.16b, v21.16b
          eor             v25.16b, v25.16b, v21.16b
          eor             v26.16b, v26.16b, v21.16b
          eor             v27.16b, v27.16b, v21.16b
          eor             v28.16b, v28.16b, v21.16b
          eor             v29.16b, v29.16b, v21.16b
          eor             v30.16b, v30.16b, v21.16b
          eor             v31.16b, v31.16b, v21.16b
          cmeq            v24.4s, v24.4s, #0
          cmeq            v25.4s, v25.4s, #0
          cmeq            v26.4s, v26.4s, #0
          cmeq            v27.4s, v27.4s, #0
          add             w1, w1, #32
          b               3f

1:      ld1             {v3.16b, v4.16b, v5.16b}, [x0], #48
        ext             v25.16b, v3.16b, v4.16b, #1
        ext             v26.16b, v3.16b, v4.16b, #2
        ext             v27.16b, v3.16b, v4.16b, #3
        ext             v29.16b, v4.16b, v5.16b, #1
        ext             v30.16b, v4.16b, v5.16b, #2
        ext             v31.16b, v4.16b, v5.16b, #3
        bic             v24.16b, v3.16b, v20.16b
        bic             v25.16b, v25.16b, v20.16b
        bic             v26.16b, v26.16b, v20.16b
        bic             v27.16b, v27.16b, v20.16b
        bic             v28.16b, v4.16b, v20.16b
        bic             v29.16b, v29.16b, v20.16b
        bic             v30.16b, v30.16b, v20.16b
        bic             v31.16b, v31.16b, v20.16b
        eor             v24.16b, v24.16b, v21.16b
        eor             v25.16b, v25.16b, v21.16b
        eor             v26.16b, v26.16b, v21.16b
        eor             v27.16b, v27.16b, v21.16b
        eor             v28.16b, v28.16b, v21.16b
        eor             v29.16b, v29.16b, v21.16b
        eor             v30.16b, v30.16b, v21.16b
        eor             v31.16b, v31.16b, v21.16b
        cmeq            v24.4s, v24.4s, #0
        cmeq            v25.4s, v25.4s, #0
        cmeq            v26.4s, v26.4s, #0
        cmeq            v27.4s, v27.4s, #0
        // Drop through...
2:        mov             v0.16b, v5.16b
          ld1             {v1.16b, v2.16b}, [x0], #32
        cmeq            v28.4s, v28.4s, #0
        cmeq            v29.4s, v29.4s, #0
        cmeq            v30.4s, v30.4s, #0
        cmeq            v31.4s, v31.4s, #0
        orr             v24.16b, v24.16b, v25.16b
        orr             v26.16b, v26.16b, v27.16b
        orr             v28.16b, v28.16b, v29.16b
        orr             v30.16b, v30.16b, v31.16b
          ext             v25.16b, v0.16b, v1.16b, #1
        orr             v22.16b, v24.16b, v26.16b
          ext             v26.16b, v0.16b, v1.16b, #2
          ext             v27.16b, v0.16b, v1.16b, #3
          ext             v29.16b, v1.16b, v2.16b, #1
        orr             v23.16b, v28.16b, v30.16b
          ext             v30.16b, v1.16b, v2.16b, #2
          ext             v31.16b, v1.16b, v2.16b, #3
          bic             v24.16b, v0.16b, v20.16b
          bic             v25.16b, v25.16b, v20.16b
          bic             v26.16b, v26.16b, v20.16b
        orr             v22.16b, v22.16b, v23.16b
          bic             v27.16b, v27.16b, v20.16b
          bic             v28.16b, v1.16b, v20.16b
          bic             v29.16b, v29.16b, v20.16b
          bic             v30.16b, v30.16b, v20.16b
          bic             v31.16b, v31.16b, v20.16b
        addv            s22, v22.4s
          eor             v24.16b, v24.16b, v21.16b
          eor             v25.16b, v25.16b, v21.16b
          eor             v26.16b, v26.16b, v21.16b
          eor             v27.16b, v27.16b, v21.16b
          eor             v28.16b, v28.16b, v21.16b
        mov             w3, v22.s[0]
          eor             v29.16b, v29.16b, v21.16b
          eor             v30.16b, v30.16b, v21.16b
          eor             v31.16b, v31.16b, v21.16b
          cmeq            v24.4s, v24.4s, #0
          cmeq            v25.4s, v25.4s, #0
          cmeq            v26.4s, v26.4s, #0
          cmeq            v27.4s, v27.4s, #0
        cbnz            w3, 90f
        st1             {v3.16b, v4.16b}, [x2], #32
3:          mov             v3.16b, v2.16b
            ld1             {v4.16b, v5.16b}, [x0], #32
          cmeq            v28.4s, v28.4s, #0
          cmeq            v29.4s, v29.4s, #0
          cmeq            v30.4s, v30.4s, #0
          cmeq            v31.4s, v31.4s, #0
          orr             v24.16b, v24.16b, v25.16b
          orr             v26.16b, v26.16b, v27.16b
          orr             v28.16b, v28.16b, v29.16b
          orr             v30.16b, v30.16b, v31.16b
            ext             v25.16b, v3.16b, v4.16b, #1
          orr             v22.16b, v24.16b, v26.16b
            ext             v26.16b, v3.16b, v4.16b, #2
            ext             v27.16b, v3.16b, v4.16b, #3
            ext             v29.16b, v4.16b, v5.16b, #1
          orr             v23.16b, v28.16b, v30.16b
            ext             v30.16b, v4.16b, v5.16b, #2
            ext             v31.16b, v4.16b, v5.16b, #3
            bic             v24.16b, v3.16b, v20.16b
            bic             v25.16b, v25.16b, v20.16b
            bic             v26.16b, v26.16b, v20.16b
          orr             v22.16b, v22.16b, v23.16b
            bic             v27.16b, v27.16b, v20.16b
            bic             v28.16b, v4.16b, v20.16b
            bic             v29.16b, v29.16b, v20.16b
            bic             v30.16b, v30.16b, v20.16b
            bic             v31.16b, v31.16b, v20.16b
          addv            s22, v22.4s
            eor             v24.16b, v24.16b, v21.16b
            eor             v25.16b, v25.16b, v21.16b
            eor             v26.16b, v26.16b, v21.16b
            eor             v27.16b, v27.16b, v21.16b
            eor             v28.16b, v28.16b, v21.16b
          mov             w3, v22.s[0]
            eor             v29.16b, v29.16b, v21.16b
            eor             v30.16b, v30.16b, v21.16b
            eor             v31.16b, v31.16b, v21.16b
            cmeq            v24.4s, v24.4s, #0
            cmeq            v25.4s, v25.4s, #0
            cmeq            v26.4s, v26.4s, #0
            cmeq            v27.4s, v27.4s, #0
          cbnz            w3, 91f
          st1             {v0.16b, v1.16b}, [x2], #32
        subs            w1, w1, #64
        b.pl            2b

90:     add             w0, w1, #80             // undo the offset applied on entry
        ret

91:     sub             w1, w1, #32             // account for the 32 bytes stored by the first half-iteration
        b               90b
endfunc