/*
 * VC1 NEON optimisations
 *
 * Copyright (c) 2010 Rob Clark <rob@ti.com>
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"
#include "neon.S"

#include "config.h"

@ Transpose rows into columns of a matrix of 16-bit elements. For 4x4, pass
@ double-word registers, for 8x4, pass quad-word registers.
.macro transpose16 r0, r1, r2, r3
        @ At this point:
        @   row[0]  r0
        @   row[1]  r1
        @   row[2]  r2
        @   row[3]  r3

        vtrn.16         \r0,  \r1         @ first and second row
        vtrn.16         \r2,  \r3         @ third and fourth row
        vtrn.32         \r0,  \r2         @ first and third row
        vtrn.32         \r1,  \r3         @ second and fourth row

        @ At this point, if registers are quad-word:
        @   column[0]   d0
        @   column[1]   d2
        @   column[2]   d4
        @   column[3]   d6
        @   column[4]   d1
        @   column[5]   d3
        @   column[6]   d5
        @   column[7]   d7

        @ At this point, if registers are double-word:
        @   column[0]   d0
        @   column[1]   d1
        @   column[2]   d2
        @   column[3]   d3
.endm

@ ff_vc1_inv_trans_{4,8}x{4,8}_neon and overflow: The input values in the file
@ are supposed to be in a specific range so as to allow for 16-bit math without
@ causing overflows, but sometimes the input values are just big enough to
@ barely cause overflow in vadd instructions like:
@
@   vadd.i16  q0, q8, q10
@   vshr.s16  q0, q0, #\rshift
@
@ To prevent these borderline cases from overflowing, we just need one more
@ bit of precision, which is accomplished by replacing the sequence above with:
@
@   vhadd.s16 q0, q8, q10
@   vshr.s16  q0, q0, #(\rshift -1)
@
@ This works because vhadd is a single instruction that adds, then shifts to
@ the right once, all before writing the result to the destination register.
@
@ Even with this workaround, there were still some files that caused overflows
@ in ff_vc1_inv_trans_8x8_neon. See the comments in ff_vc1_inv_trans_8x8_neon
@ for the additional workaround.
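@
@ A rough scalar illustration of the same transformation (C, not part of the
@ build; the helper name is made up for this comment):
@
@   /* 16-bit-safe replacement for dst = (int16_t)(a + b) >> rshift */
@   static int16_t hadd_then_shift(int16_t a, int16_t b, int rshift)
@   {
@       int16_t halved = (int16_t)(((int32_t)a + b) >> 1);  /* what vhadd.s16 computes */
@       return halved >> (rshift - 1);                       /* the remaining shift    */
@   }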

@ Takes 4 columns of 8 values each and operates on them. Modeled after the first
@ for loop in vc1_inv_trans_4x8_c.
@ Input columns: q0 q1 q2 q3
@ Output columns: q0 q1 q2 q3
@ Trashes: r12 q8 q9 q10 q11 q12 q13
.macro vc1_inv_trans_4x8_helper add rshift
        @ Compute temp1, temp2 and setup scalar #17, #22, #10
        vadd.i16        q12,   q0,  q2              @ temp1 = src[0] + src[2]
        movw            r12,   #17
        vsub.i16        q13,   q0,  q2              @ temp2 = src[0] - src[2]
        movt            r12,   #22
        vmov.32         d0[0], r12
        movw            r12,   #10
        vmov.16         d1[0], r12

        vmov.i16        q8,  #\add                  @ t1 will accumulate here
        vmov.i16        q9,  #\add                  @ t2 will accumulate here

        vmul.i16        q10, q1,  d0[1]             @ t3 = 22 * (src[1])
        vmul.i16        q11, q3,  d0[1]             @ t4 = 22 * (src[3])

        vmla.i16        q8,  q12, d0[0]             @ t1 = 17 * (temp1) + 4
        vmla.i16        q9,  q13, d0[0]             @ t2 = 17 * (temp2) + 4

        vmla.i16        q10, q3,  d1[0]             @ t3 += 10 * src[3]
        vmls.i16        q11, q1,  d1[0]             @ t4 -= 10 * src[1]

        vhadd.s16       q0,  q8,  q10               @ dst[0] = (t1 + t3) >> 1
        vhsub.s16       q3,  q8,  q10               @ dst[3] = (t1 - t3) >> 1
        vhsub.s16       q1,  q9,  q11               @ dst[1] = (t2 - t4) >> 1
        vhadd.s16       q2,  q9,  q11               @ dst[2] = (t2 + t4) >> 1

        @ Halving add/sub above already did one shift
        vshr.s16        q0,  q0,  #(\rshift - 1)    @ dst[0] >>= (rshift - 1)
        vshr.s16        q3,  q3,  #(\rshift - 1)    @ dst[3] >>= (rshift - 1)
        vshr.s16        q1,  q1,  #(\rshift - 1)    @ dst[1] >>= (rshift - 1)
        vshr.s16        q2,  q2,  #(\rshift - 1)    @ dst[2] >>= (rshift - 1)
.endm
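
@ For reference, one column of the helper above corresponds roughly to this
@ scalar C, mirroring the comments in the macro (illustration only, `add` and
@ `rshift` being the macro arguments):
@
@   t1 = 17 * (src[0] + src[2]) + add;
@   t2 = 17 * (src[0] - src[2]) + add;
@   t3 = 22 * src[1] + 10 * src[3];
@   t4 = 22 * src[3] - 10 * src[1];
@   dst[0] = (t1 + t3) >> rshift;
@   dst[1] = (t2 - t4) >> rshift;
@   dst[2] = (t2 + t4) >> rshift;
@   dst[3] = (t1 - t3) >> rshift;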

@ Takes 8 columns of 4 values each and operates on them. Modeled after the second
@ for loop in vc1_inv_trans_4x8_c.
@ Input columns: d0 d2 d4 d6 d1 d3 d5 d7
@ Output columns: d16 d17 d18 d19 d21 d20 d23 d22
@ Trashes all NEON registers (and r12) except for: q4 q5 q6 q7
.macro vc1_inv_trans_8x4_helper add add1beforeshift rshift
        @ At this point:
        @   src[0]      d0 overwritten later
        @   src[8]      d2
        @   src[16]     d4 overwritten later
        @   src[24]     d6
        @   src[32]     d1 overwritten later
        @   src[40]     d3
        @   src[48]     d5 overwritten later
        @   src[56]     d7

        movw            r12,   #12
        vmov.i16        q14,   #\add            @ t1|t2 will accumulate here
        movt            r12,   #6

        vadd.i16        d20,   d0,  d1          @ temp1 = src[0] + src[32]
        vsub.i16        d21,   d0,  d1          @ temp2 = src[0] - src[32]
        vmov.i32        d0[0], r12              @ 16-bit: d0[0] = #12, d0[1] = #6

        vshl.i16        q15,   q2,  #4          @ t3|t4 = 16 * (src[16]|src[48])
        vswp            d4,    d5               @ q2 = src[48]|src[16]
        vmla.i16        q14,   q10, d0[0]       @ t1|t2 = 12 * (temp1|temp2) + 64
        movw            r12,   #15
        movt            r12,   #9
        vmov.i32        d0[1], r12              @ 16-bit: d0[2] = #15, d0[3] = #9
        vneg.s16        d31,   d31              @ t4 = -t4
        vmla.i16        q15,   q2,  d0[1]       @ t3|t4 += 6 * (src[48]|src[16])

        @ At this point:
        @   d0[2]   #15
        @   d0[3]   #9
        @   q1      src[8]|src[40]
        @   q3      src[24]|src[56]
        @   q14     old t1|t2
        @   q15     old t3|t4

        vshl.i16        q8,  q1,  #4            @ t1|t2 = 16 * (src[8]|src[40])
        vswp            d2,  d3                 @ q1 = src[40]|src[8]
        vshl.i16        q12, q3,  #4            @ temp3a|temp4a = 16 * src[24]|src[56]
        vswp            d6,  d7                 @ q3 = src[56]|src[24]
        vshl.i16        q13, q1,  #2            @ temp3b|temp4b = 4 * (src[40]|src[8])
        vshl.i16        q2,  q3,  #2            @ temp1|temp2 = 4 * (src[56]|src[24])
        vswp            d3,  d6                 @ q1 = src[40]|src[56], q3 = src[8]|src[24]
        vsub.i16        q9,  q13, q12           @ t3|t4 = - (temp3a|temp4a) + (temp3b|temp4b)
        vadd.i16        q8,  q8,  q2            @ t1|t2 += temp1|temp2
        vmul.i16        q12, q3,  d0[3]         @ temp3|temp4 = 9 * src[8]|src[24]
        vmla.i16        q8,  q1,  d0[3]         @ t1|t2 += 9 * (src[40]|src[56])
        vswp            d6,  d7                 @ q3 = src[24]|src[8]
        vswp            d2,  d3                 @ q1 = src[56]|src[40]

        vsub.i16        q11, q14, q15           @ t8|t7 = old t1|t2 - old t3|t4
        vadd.i16        q10, q14, q15           @ t5|t6 = old t1|t2 + old t3|t4
  .if \add1beforeshift
        vmov.i16        q15, #1
  .endif

        vadd.i16        d18, d18, d24           @ t3 += temp3
        vsub.i16        d19, d19, d25           @ t4 -= temp4

        vswp            d22, d23                @ q11 = t7|t8

        vneg.s16        d17, d17                @ t2 = -t2
        vmla.i16        q9,  q1,  d0[2]         @ t3|t4 += 15 * src[56]|src[40]
        vmla.i16        q8,  q3,  d0[2]         @ t1|t2 += 15 * src[24]|src[8]

        @ At this point:
        @   t1  d16
        @   t2  d17
        @   t3  d18
        @   t4  d19
        @   t5  d20
        @   t6  d21
        @   t7  d22
        @   t8  d23
        @   #1  q15

  .if \add1beforeshift
        vadd.i16        q3,  q15, q10           @ line[7,6] = t5|t6 + 1
        vadd.i16        q2,  q15, q11           @ line[5,4] = t7|t8 + 1
  .endif

        @ Sometimes this overflows, so to get one additional bit of precision, use
        @ a single instruction that both adds and shifts right (halving).
        vhadd.s16       q1,  q9,  q11           @ line[2,3] = (t3|t4 + t7|t8) >> 1
        vhadd.s16       q0,  q8,  q10           @ line[0,1] = (t1|t2 + t5|t6) >> 1
  .if \add1beforeshift
        vhsub.s16       q2,  q2,  q9            @ line[5,4] = (t7|t8 - t3|t4 + 1) >> 1
        vhsub.s16       q3,  q3,  q8            @ line[7,6] = (t5|t6 - t1|t2 + 1) >> 1
  .else
        vhsub.s16       q2,  q11, q9            @ line[5,4] = (t7|t8 - t3|t4) >> 1
        vhsub.s16       q3,  q10, q8            @ line[7,6] = (t5|t6 - t1|t2) >> 1
  .endif

        vshr.s16        q9,  q1,  #(\rshift - 1)    @ one shift is already done by vhadd/vhsub above
        vshr.s16        q8,  q0,  #(\rshift - 1)
        vshr.s16        q10, q2,  #(\rshift - 1)
        vshr.s16        q11, q3,  #(\rshift - 1)

        @ At this point:
        @   dst[0]   d16
        @   dst[1]   d17
        @   dst[2]   d18
        @   dst[3]   d19
        @   dst[4]   d21
        @   dst[5]   d20
        @   dst[6]   d23
        @   dst[7]   d22
.endm

@ This is modeled after the first and second for loop in vc1_inv_trans_8x8_c.
@ Input columns:  q8, q9, q10, q11, q12, q13, q14, q15
@ Output columns: q8, q9, q10, q11, q12, q13, q14, q15
@ Trashes all NEON registers (and r12) except for: q4 q5 q6 q7
.macro vc1_inv_trans_8x8_helper add add1beforeshift rshift
        @ This actually computes half of t1, t2, t3, t4, as explained below
        @ near `tNhalf`.
        vmov.i16        q0,    #(6 / 2)         @ q0 = #6/2
        vshl.i16        q1,    q10, #3          @ t3 = 16/2 * src[16]
        vshl.i16        q3,    q14, #3          @ temp4 = 16/2 * src[48]
        vmul.i16        q2,    q10, q0          @ t4 = 6/2 * src[16]
        vmla.i16        q1,    q14, q0          @ t3 += 6/2 * src[48]
        @ unused: q0, q10, q14
        vmov.i16        q0,    #(12 / 2)        @ q0 = #12/2
        vadd.i16        q10,   q8,  q12         @ temp1 = src[0] + src[32]
        vsub.i16        q14,   q8,  q12         @ temp2 = src[0] - src[32]
        @ unused: q8, q12
        vmov.i16        q8,    #(\add / 2)      @ t1 will accumulate here
        vmov.i16        q12,   #(\add / 2)      @ t2 will accumulate here
        movw            r12,   #15
        vsub.i16        q2,    q2,  q3          @ t4 = 6/2 * src[16] - 16/2 * src[48]
        movt            r12,   #9
        @ unused: q3
        vmla.i16        q8,    q10, q0          @ t1 = 12/2 * temp1 + add
        vmla.i16        q12,   q14, q0          @ t2 = 12/2 * temp2 + add
        vmov.i32        d0[0], r12
        @ unused: q3, q10, q14

        @ At this point:
        @   q0          d0=#15|#9
        @   q1  old t3
        @   q2  old t4
        @   q3
        @   q8  old t1
        @   q9          src[8]
        @   q10
        @   q11         src[24]
        @   q12 old t2
        @   q13         src[40]
        @   q14
        @   q15         src[56]

        @ unused: q3, q10, q14
        movw            r12,   #16
        vshl.i16        q3,    q9,  #4          @ t1 = 16 * src[8]
        movt            r12,   #4
        vshl.i16        q10,   q9,  #2          @ t4 = 4 * src[8]
        vmov.i32        d1[0], r12
        vmul.i16        q14,   q9,  d0[0]       @ t2 = 15 * src[8]
        vmul.i16        q9,    q9,  d0[1]       @ t3 = 9 * src[8]
        @ unused: none
        vmla.i16        q3,    q11, d0[0]       @ t1 += 15 * src[24]
        vmls.i16        q10,   q11, d0[1]       @ t4 -= 9 * src[24]
        vmls.i16        q14,   q11, d1[1]       @ t2 -= 4 * src[24]
        vmls.i16        q9,    q11, d1[0]       @ t3 -= 16 * src[24]
        @ unused: q11
        vmla.i16        q3,    q13, d0[1]       @ t1 += 9 * src[40]
        vmla.i16        q10,   q13, d0[0]       @ t4 += 15 * src[40]
        vmls.i16        q14,   q13, d1[0]       @ t2 -= 16 * src[40]
        vmla.i16        q9,    q13, d1[1]       @ t3 += 4 * src[40]
        @ unused: q11, q13

        @ Compute t5, t6, t7, t8 from old t1, t2, t3, t4. Actually, it computes
        @ half of t5, t6, t7, t8 since t1, t2, t3, t4 are halved.
        vadd.i16        q11,   q8,  q1          @ t5 = t1 + t3
        vsub.i16        q1,    q8,  q1          @ t8 = t1 - t3
        vadd.i16        q13,   q12, q2          @ t6 = t2 + t4
        vsub.i16        q2,    q12, q2          @ t7 = t2 - t4
        @ unused: q8, q12

  .if \add1beforeshift
        vmov.i16        q12,   #1
  .endif

        @ unused: q8
        vmla.i16        q3,    q15, d1[1]       @ t1 += 4 * src[56]
        vmls.i16        q14,   q15, d0[1]       @ t2 -= 9 * src[56]
        vmla.i16        q9,    q15, d0[0]       @ t3 += 15 * src[56]
        vmls.i16        q10,   q15, d1[0]       @ t4 -= 16 * src[56]
        @ unused: q0, q8, q15

        @ At this point:
        @   t1      q3
        @   t2      q14
        @   t3      q9
        @   t4      q10
        @   t5half  q11
        @   t6half  q13
        @   t7half  q2
        @   t8half  q1
        @   #1      q12
        @
        @ tNhalf is half of the value of tN (as described in vc1_inv_trans_8x8_c).
        @ This is done because some files have input that causes tN + tM to
        @ overflow. To avoid this overflow, we compute tNhalf, then compute
        @ tNhalf + tM (which doesn't overflow), and then use vhadd to compute
        @ (tNhalf + (tNhalf + tM)) >> 1, which doesn't overflow because vhadd
        @ performs the add and the halving in a single instruction, without a
        @ 16-bit intermediate result.

        @ For each pair of tN and tM, do:
        @   lineA = t5half + t1
        @   if add1beforeshift:  t1 -= 1
        @   lineA = (t5half + lineA) >> 1
        @   lineB = t5half - t1
        @   lineB = (t5half + lineB) >> 1
        @   lineA >>= rshift - 1
        @   lineB >>= rshift - 1
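        @
        @ In rough scalar C, one such pair looks like this (add1beforeshift
        @ case, illustration only):
        @
        @   lineA = (((int32_t)t5half + (t5half + t1))     >> 1) >> (rshift - 1);
        @   lineB = (((int32_t)t5half + (t5half - t1 + 1)) >> 1) >> (rshift - 1);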

        vadd.i16        q8,  q11, q3                @ q8 = t5half + t1
  .if \add1beforeshift
        vsub.i16        q3,  q3,  q12               @ q3 = t1 - 1
  .endif

        vadd.i16        q0,  q13, q14               @ q0  = t6half + t2
  .if \add1beforeshift
        vsub.i16        q14, q14, q12               @ q14 = t2 - 1
  .endif

        vadd.i16        q15, q2,  q9                @ q15 = t7half + t3
  .if \add1beforeshift
        vsub.i16        q9,  q9,  q12               @ q9  = t3 - 1
  .endif
        @ unused: none

        vhadd.s16       q8,  q11, q8                @ q8  = (t5half + t5half + t1) >> 1
        vsub.i16        q3,  q11, q3                @ q3  = t5half - t1 + 1

        vhadd.s16       q0,  q13, q0                @ q0  = (t6half + t6half + t2) >> 1
        vsub.i16        q14, q13, q14               @ q14 = t6half - t2 + 1

        vhadd.s16       q15, q2,  q15               @ q15 = (t7half + t7half + t3) >> 1
        vsub.i16        q9,  q2,  q9                @ q9  = t7half - t3 + 1

        vhadd.s16       q3,  q11, q3                @ q3  = (t5half + t5half - t1 + 1) >> 1
        @ unused: q11

        vadd.i16        q11, q1,  q10               @ q11 = t8half + t4
  .if \add1beforeshift
        vsub.i16        q10, q10, q12               @ q10 = t4 - 1
  .endif
        @ unused: q12

        vhadd.s16       q14, q13, q14               @ q14 = (t6half + t6half - t2 + 1) >> 1
        @ unused: q12, q13
        vhadd.s16       q13, q2,  q9                @ q13 = (t7half + t7half - t3 + 1) >> 1
        @ unused: q12, q2, q9

        vsub.i16        q10, q1,  q10               @ q10 = t8half - t4 + 1
        vhadd.s16       q11, q1,  q11               @ q11 = (t8half + t8half + t4) >> 1

        vshr.s16        q8,  q8,  #(\rshift - 1)    @ q8  = line[0]
        vhadd.s16       q12, q1,  q10               @ q12 = (t8half + t8half - t4 + 1) >> 1
        vshr.s16        q9,  q0,  #(\rshift - 1)    @ q9  = line[1]
        vshr.s16        q10, q15, #(\rshift - 1)    @ q10 = line[2]
        vshr.s16        q11, q11, #(\rshift - 1)    @ q11 = line[3]
        vshr.s16        q12, q12, #(\rshift - 1)    @ q12 = line[4]
        vshr.s16        q13, q13, #(\rshift - 1)    @ q13 = line[5]
        vshr.s16        q14, q14, #(\rshift - 1)    @ q14 = line[6]
        vshr.s16        q15, q3,  #(\rshift - 1)    @ q15 = line[7]
.endm

@ (int16_t *block [r0])
function ff_vc1_inv_trans_8x8_neon, export=1
        vld1.64         {q8-q9},   [r0,:128]!
        vld1.64         {q10-q11}, [r0,:128]!
        vld1.64         {q12-q13}, [r0,:128]!
        vld1.64         {q14-q15}, [r0,:128]
        sub             r0, r0, #(16 * 2 * 3)   @ restore r0

        @ At this point:
        @   src[0]  q8
        @   src[8]  q9
        @   src[16] q10
        @   src[24] q11
        @   src[32] q12
        @   src[40] q13
        @   src[48] q14
        @   src[56] q15

        vc1_inv_trans_8x8_helper add=4, add1beforeshift=0, rshift=3

        @ Transpose result matrix of 8x8
        swap4           d17, d19, d21, d23, d24, d26, d28, d30
        transpose16_4x4 q8,  q9,  q10, q11, q12, q13, q14, q15

        vc1_inv_trans_8x8_helper add=64, add1beforeshift=1, rshift=7

        vst1.64         {q8-q9},   [r0,:128]!
        vst1.64         {q10-q11}, [r0,:128]!
        vst1.64         {q12-q13}, [r0,:128]!
        vst1.64         {q14-q15}, [r0,:128]

        bx              lr
endfunc

@ (uint8_t *dest [r0], ptrdiff_t stride [r1], int16_t *block [r2])
function ff_vc1_inv_trans_8x4_neon, export=1
        vld1.64         {q0-q1}, [r2,:128]!     @ load 8 * 4 * 2 = 64 bytes / 16 bytes per quad = 4 quad registers
        vld1.64         {q2-q3}, [r2,:128]

        transpose16     q0, q1, q2, q3          @ transpose rows to columns

        @ At this point:
        @   src[0]   d0
        @   src[1]   d2
        @   src[2]   d4
        @   src[3]   d6
        @   src[4]   d1
        @   src[5]   d3
        @   src[6]   d5
        @   src[7]   d7

        vc1_inv_trans_8x4_helper    add=4, add1beforeshift=0, rshift=3

        @ Move output to more standardized registers
        vmov        d0, d16
        vmov        d2, d17
        vmov        d4, d18
        vmov        d6, d19
        vmov        d1, d21
        vmov        d3, d20
        vmov        d5, d23
        vmov        d7, d22

        @ At this point:
        @   dst[0]   d0
        @   dst[1]   d2
        @   dst[2]   d4
        @   dst[3]   d6
        @   dst[4]   d1
        @   dst[5]   d3
        @   dst[6]   d5
        @   dst[7]   d7

        transpose16     q0, q1, q2, q3   @ turn columns into rows

        @ At this point:
        @   row[0] q0
        @   row[1] q1
        @   row[2] q2
        @   row[3] q3

        vc1_inv_trans_4x8_helper    add=64, rshift=7

        @ At this point:
        @   line[0].l   d0
        @   line[0].h   d1
        @   line[1].l   d2
        @   line[1].h   d3
        @   line[2].l   d4
        @   line[2].h   d5
        @   line[3].l   d6
        @   line[3].h   d7

        @ unused registers: q12, q13, q14, q15

        vld1.64         {d28}, [r0,:64], r1     @ read dest
        vld1.64         {d29}, [r0,:64], r1
        vld1.64         {d30}, [r0,:64], r1
        vld1.64         {d31}, [r0,:64], r1
        sub             r0,  r0,  r1, lsl #2    @ restore original r0 value

        vaddw.u8        q0,  q0,  d28           @ line[0] += dest[0]
        vaddw.u8        q1,  q1,  d29           @ line[1] += dest[1]
        vaddw.u8        q2,  q2,  d30           @ line[2] += dest[2]
        vaddw.u8        q3,  q3,  d31           @ line[3] += dest[3]

        vqmovun.s16     d0,  q0                 @ line[0]
        vqmovun.s16     d1,  q1                 @ line[1]
        vqmovun.s16     d2,  q2                 @ line[2]
        vqmovun.s16     d3,  q3                 @ line[3]

        vst1.64         {d0},  [r0,:64], r1     @ write dest
        vst1.64         {d1},  [r0,:64], r1
        vst1.64         {d2},  [r0,:64], r1
        vst1.64         {d3},  [r0,:64]

        bx              lr
endfunc

@ (uint8_t *dest [r0], ptrdiff_t stride [r1], int16_t *block [r2])
function ff_vc1_inv_trans_4x8_neon, export=1
        mov             r12, #(8 * 2)  @ 8 elements per line, each element 2 bytes
        vld4.16         {d0[],  d2[],  d4[],  d6[]},  [r2,:64], r12     @ read each column into a q register
        vld4.16         {d0[1], d2[1], d4[1], d6[1]}, [r2,:64], r12
        vld4.16         {d0[2], d2[2], d4[2], d6[2]}, [r2,:64], r12
        vld4.16         {d0[3], d2[3], d4[3], d6[3]}, [r2,:64], r12
        vld4.16         {d1[],  d3[],  d5[],  d7[]},  [r2,:64], r12
        vld4.16         {d1[1], d3[1], d5[1], d7[1]}, [r2,:64], r12
        vld4.16         {d1[2], d3[2], d5[2], d7[2]}, [r2,:64], r12
        vld4.16         {d1[3], d3[3], d5[3], d7[3]}, [r2,:64]

        vc1_inv_trans_4x8_helper    add=4, rshift=3

        @ At this point:
        @   dst[0] = q0
        @   dst[1] = q1
        @   dst[2] = q2
        @   dst[3] = q3

        transpose16     q0, q1, q2, q3  @ Transpose rows (registers) into columns

        vc1_inv_trans_8x4_helper    add=64, add1beforeshift=1, rshift=7

        vld1.32         {d28[]},  [r0,:32], r1  @ read dest
        vld1.32         {d28[1]}, [r0,:32], r1
        vld1.32         {d29[]},  [r0,:32], r1
        vld1.32         {d29[1]}, [r0,:32], r1

        vld1.32         {d30[]},  [r0,:32], r1
        vld1.32         {d30[0]}, [r0,:32], r1
        vld1.32         {d31[]},  [r0,:32], r1
        vld1.32         {d31[0]}, [r0,:32], r1
        sub             r0,  r0,  r1, lsl #3    @ restore original r0 value

        vaddw.u8        q8,  q8,  d28           @ line[0,1] += dest[0,1]
        vaddw.u8        q9,  q9,  d29           @ line[2,3] += dest[2,3]
        vaddw.u8        q10, q10, d30           @ line[5,4] += dest[5,4]
        vaddw.u8        q11, q11, d31           @ line[7,6] += dest[7,6]

        vqmovun.s16     d16, q8                 @ clip(line[0,1])
        vqmovun.s16     d18, q9                 @ clip(line[2,3])
        vqmovun.s16     d20, q10                @ clip(line[5,4])
        vqmovun.s16     d22, q11                @ clip(line[7,6])

        vst1.32         {d16[0]}, [r0,:32], r1  @ write dest
        vst1.32         {d16[1]}, [r0,:32], r1
        vst1.32         {d18[0]}, [r0,:32], r1
        vst1.32         {d18[1]}, [r0,:32], r1

        vst1.32         {d20[1]}, [r0,:32], r1
        vst1.32         {d20[0]}, [r0,:32], r1
        vst1.32         {d22[1]}, [r0,:32], r1
        vst1.32         {d22[0]}, [r0,:32]

        bx              lr
endfunc

@ Setup constants in registers which are used by vc1_inv_trans_4x4_helper
.macro vc1_inv_trans_4x4_helper_setup
        vmov.i16        q13, #17
        vmov.i16        q14, #22
        vmov.i16        d30, #10                @ only need double-word, not quad-word
.endm

@ This is modeled after the first for loop in vc1_inv_trans_4x4_c.
.macro vc1_inv_trans_4x4_helper add rshift
        vmov.i16        q2,  #\add              @ t1|t2 will accumulate here

        vadd.i16        d16, d0,  d1            @ temp1 = src[0] + src[2]
        vsub.i16        d17, d0,  d1            @ temp2 = src[0] - src[2]
        vmul.i16        q3,  q14, q1            @ t3|t4 = 22 * (src[1]|src[3])
        vmla.i16        q2,  q13, q8            @ t1|t2 = 17 * (temp1|temp2) + add
        vmla.i16        d6,  d30, d3            @ t3 += 10 * src[3]
        vmls.i16        d7,  d30, d2            @ t4 -= 10 * src[1]

        vadd.i16        q0,  q2,  q3            @ dst[0,2] = (t1|t2 + t3|t4)
        vsub.i16        q1,  q2,  q3            @ dst[3,1] = (t1|t2 - t3|t4)
        vshr.s16        q0,  q0,  #\rshift      @ dst[0,2] >>= rshift
        vshr.s16        q1,  q1,  #\rshift      @ dst[3,1] >>= rshift
.endm

@ (uint8_t *dest [r0], ptrdiff_t stride [r1], int16_t *block [r2])
function ff_vc1_inv_trans_4x4_neon, export=1
        mov             r12, #(8 * 2)  @ 8 elements per line, each element 2 bytes
        vld4.16         {d0[],  d1[],  d2[],  d3[]},  [r2,:64], r12     @ read each column into a register
        vld4.16         {d0[1], d1[1], d2[1], d3[1]}, [r2,:64], r12
        vld4.16         {d0[2], d1[2], d2[2], d3[2]}, [r2,:64], r12
        vld4.16         {d0[3], d1[3], d2[3], d3[3]}, [r2,:64]

        vswp            d1,  d2         @ so that we can later access column 1 and column 3 as a single q1 register

        vc1_inv_trans_4x4_helper_setup

        @ At this point:
        @   src[0] = d0
        @   src[1] = d2
        @   src[2] = d1
        @   src[3] = d3

        vc1_inv_trans_4x4_helper add=4, rshift=3     @ compute t1, t2, t3, t4 and combine them into dst[0-3]

        @ At this point:
        @   dst[0] = d0
        @   dst[1] = d3
        @   dst[2] = d1
        @   dst[3] = d2

        transpose16     d0, d3, d1, d2  @ Transpose rows (registers) into columns

        @ At this point:
        @   src[0]  = d0
        @   src[8]  = d3
        @   src[16] = d1
        @   src[24] = d2

        vswp            d2,  d3         @ so that we can later access column 1 and column 3 in order as a single q1 register

        @ At this point:
        @   src[0]  = d0
        @   src[8]  = d2
        @   src[16] = d1
        @   src[24] = d3

        vc1_inv_trans_4x4_helper add=64, rshift=7             @ compute t1, t2, t3, t4 and combine them into dst[0-3]

        @ At this point:
        @   line[0] = d0
        @   line[1] = d3
        @   line[2] = d1
        @   line[3] = d2

        vld1.32         {d18[]},  [r0,:32], r1  @ read dest
        vld1.32         {d19[]},  [r0,:32], r1
        vld1.32         {d18[1]}, [r0,:32], r1
        vld1.32         {d19[0]}, [r0,:32], r1
        sub             r0,  r0,  r1, lsl #2    @ restore original r0 value

        vaddw.u8        q0,  q0,  d18           @ line[0,2] += dest[0,2]
        vaddw.u8        q1,  q1,  d19           @ line[3,1] += dest[3,1]

        vqmovun.s16     d0,  q0                 @ clip(line[0,2])
        vqmovun.s16     d1,  q1                 @ clip(line[3,1])

        vst1.32         {d0[0]},  [r0,:32], r1  @ write dest
        vst1.32         {d1[1]},  [r0,:32], r1
        vst1.32         {d0[1]},  [r0,:32], r1
        vst1.32         {d1[0]},  [r0,:32]

        bx              lr
endfunc

@ The absolute value of multiplication constants from vc1_mspel_filter and vc1_mspel_{ver,hor}_filter_16bits.
@ The sign is embedded in the code below that carries out the multiplication (mspel_filter{,.16}).
#define MSPEL_MODE_1_MUL_CONSTANTS  4, 53, 18, 3
#define MSPEL_MODE_2_MUL_CONSTANTS  1, 9,  9,  1
#define MSPEL_MODE_3_MUL_CONSTANTS  3, 18, 53, 4
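
@ With those signs applied, the effective 4-tap kernels are:
@   mode 1: -4, 53, 18, -3
@   mode 2: -1,  9,  9, -1
@   mode 3: -3, 18, 53, -4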

@ These constants are from reading the source code of vc1_mspel_mc and determining the value that
@ is added to `rnd` to result in the variable `r`, and the value of the variable `shift`.
#define MSPEL_MODES_11_ADDSHIFT_CONSTANTS   15, 5
#define MSPEL_MODES_12_ADDSHIFT_CONSTANTS   3,  3
#define MSPEL_MODES_13_ADDSHIFT_CONSTANTS   15, 5
#define MSPEL_MODES_21_ADDSHIFT_CONSTANTS   MSPEL_MODES_12_ADDSHIFT_CONSTANTS
#define MSPEL_MODES_22_ADDSHIFT_CONSTANTS   0,  1
#define MSPEL_MODES_23_ADDSHIFT_CONSTANTS   3,  3
#define MSPEL_MODES_31_ADDSHIFT_CONSTANTS   MSPEL_MODES_13_ADDSHIFT_CONSTANTS
#define MSPEL_MODES_32_ADDSHIFT_CONSTANTS   MSPEL_MODES_23_ADDSHIFT_CONSTANTS
#define MSPEL_MODES_33_ADDSHIFT_CONSTANTS   15, 5

@ The addition and shift constants from vc1_mspel_filter.
#define MSPEL_MODE_1_ADDSHIFT_CONSTANTS     32, 6
#define MSPEL_MODE_2_ADDSHIFT_CONSTANTS     8,  4
#define MSPEL_MODE_3_ADDSHIFT_CONSTANTS     32, 6

@ Setup constants in registers for a subsequent use of mspel_filter{,.16}.
.macro mspel_constants typesize reg_a reg_b reg_c reg_d filter_a filter_b filter_c filter_d reg_add filter_add_register
  @ Typesize should be i8 or i16.

  @ Only set a register when its value is not 1 (reg_a, reg_d) or not a duplicate of reg_b (reg_c)
  .if \filter_a != 1
        vmov.\typesize  \reg_a,  #\filter_a          @ reg_a = filter_a
  .endif
        vmov.\typesize  \reg_b,  #\filter_b          @ reg_b = filter_b
  .if \filter_b != \filter_c
        vmov.\typesize  \reg_c,  #\filter_c          @ reg_c = filter_c
  .endif
  .if \filter_d != 1
        vmov.\typesize  \reg_d,  #\filter_d          @ reg_d = filter_d
  .endif
  @ vdup to double the size of typesize
  .ifc \typesize,i8
        vdup.16         \reg_add,  \filter_add_register     @ reg_add = filter_add_register
  .else
        vdup.32         \reg_add,  \filter_add_register     @ reg_add = filter_add_register
  .endif
.endm

@ After mspel_constants has been used, do the filtering.
.macro mspel_filter acc dest src0 src1 src2 src3 filter_a filter_b filter_c filter_d reg_a reg_b reg_c reg_d reg_add filter_shift narrow=1
  .if \filter_a != 1
        @ If filter_a != 1, then we need a move and subtract instruction
        vmov            \acc,  \reg_add                     @ acc = reg_add
        vmlsl.u8        \acc,  \reg_a,  \src0               @ acc -= filter_a * src[-stride]
  .else
        @ If filter_a is 1, then just subtract without an extra move
        vsubw.u8        \acc,  \reg_add,  \src0             @ acc = reg_add - src[-stride]      @ since filter_a == 1
  .endif
        vmlal.u8        \acc,  \reg_b,  \src1               @ acc += filter_b * src[0]
  .if \filter_b != \filter_c
        vmlal.u8        \acc,  \reg_c,  \src2               @ acc += filter_c * src[stride]
  .else
        @ If filter_b is the same as filter_c, use the same reg_b register
        vmlal.u8        \acc,  \reg_b,  \src2               @ acc += filter_c * src[stride]     @ where filter_c == filter_b
  .endif
  .if \filter_d != 1
        @ If filter_d != 1, then do a multiply accumulate
        vmlsl.u8        \acc,  \reg_d,  \src3               @ acc -= filter_d * src[stride * 2]
  .else
        @ If filter_d is 1, then just do a subtract
        vsubw.u8        \acc,  \acc,    \src3               @ acc -= src[stride * 2]            @ since filter_d == 1
  .endif
  .if \narrow
        vqshrun.s16     \dest, \acc,    #\filter_shift      @ dest = clip_uint8(acc >> filter_shift)
  .else
        vshr.s16        \dest, \acc,    #\filter_shift      @ dest = acc >> filter_shift
  .endif
.endm
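
@ For reference, one output pixel of the filter above corresponds roughly to
@ this scalar C (the function name is made up for this comment; the clip is
@ only applied on the narrow=1 path, via vqshrun):
@
@   static uint8_t mspel_tap(const uint8_t *src, int stride,
@                            int a, int b, int c, int d, int add, int shift)
@   {
@       int acc = add - a * src[-stride] + b * src[0]
@                     + c * src[stride]  - d * src[stride * 2];
@       return av_clip_uint8(acc >> shift);
@   }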

@ This is similar to mspel_filter, but the input is 16-bit instead of 8-bit and narrow=0 is not supported.
.macro mspel_filter.16 acc0 acc1 acc0_0 acc0_1 dest src0 src1 src2 src3 src4 src5 src6 src7 filter_a filter_b filter_c filter_d reg_a reg_b reg_c reg_d reg_add filter_shift
  .if \filter_a != 1
        vmov            \acc0,  \reg_add
        vmov            \acc1,  \reg_add
        vmlsl.s16       \acc0,  \reg_a,  \src0
        vmlsl.s16       \acc1,  \reg_a,  \src1
  .else
        vsubw.s16       \acc0,  \reg_add,  \src0
        vsubw.s16       \acc1,  \reg_add,  \src1
  .endif
        vmlal.s16       \acc0,  \reg_b,  \src2
        vmlal.s16       \acc1,  \reg_b,  \src3
  .if \filter_b != \filter_c
        vmlal.s16       \acc0,  \reg_c,  \src4
        vmlal.s16       \acc1,  \reg_c,  \src5
  .else
        vmlal.s16       \acc0,  \reg_b,  \src4
        vmlal.s16       \acc1,  \reg_b,  \src5
  .endif
  .if \filter_d != 1
        vmlsl.s16       \acc0,  \reg_d,  \src6
        vmlsl.s16       \acc1,  \reg_d,  \src7
  .else
        vsubw.s16       \acc0,  \acc0,   \src6
        vsubw.s16       \acc1,  \acc1,   \src7
  .endif
        @ Use acc0_0 and acc0_1 as temp space
        vqshrun.s32     \acc0_0, \acc0,  #\filter_shift     @ Shift and narrow with saturation from s32 to u16
        vqshrun.s32     \acc0_1, \acc1,  #\filter_shift
        vqmovn.u16      \dest,  \acc0                       @ Narrow with saturation from u16 to u8
.endm

@ Register usage for put_vc1_mspel_mc functions. Registers marked 'hv' are only used in put_vc1_mspel_mc_hv.
@
@   r0        adjusted dst
@   r1        adjusted src
@   r2        stride
@   r3        adjusted rnd
@   r4 [hv]   tmp
@   r11 [hv]  sp saved
@   r12       loop counter
@   d0        src[-stride]
@   d1        src[0]
@   d2        src[stride]
@   d3        src[stride * 2]
@   q0 [hv]   src[-stride]
@   q1 [hv]   src[0]
@   q2 [hv]   src[stride]
@   q3 [hv]   src[stride * 2]
@   d21       often result from mspel_filter
@   q11       accumulator 0
@   q12 [hv]  accumulator 1
@   q13       accumulator initial value
@   d28       filter_a
@   d29       filter_b
@   d30       filter_c
@   d31       filter_d

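@ Conceptually, each hv function below is a separable two-pass filter; a rough
@ outline in C (helper names are made up for this comment):
@
@   int16_t tmp[8 * 16];                 /* 8 rows of 16 columns, 11 columns used  */
@   vertical_filter(tmp, src - 1 - stride, stride);  /* 16-bit results (narrow=0)  */
@   horizontal_filter(dst, stride, tmp);             /* saturating narrow to uint8 */
@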
@ (uint8_t *dst [r0], const uint8_t *src [r1], ptrdiff_t stride [r2], int rnd [r3])
.macro put_vc1_mspel_mc_hv hmode vmode filter_h_a filter_h_b filter_h_c filter_h_d filter_v_a filter_v_b filter_v_c filter_v_d filter_add filter_shift
function ff_put_vc1_mspel_mc\hmode\()\vmode\()_neon, export=1
        push            {r4, r11, lr}
        mov             r11, sp                 @ r11 = stack pointer before realignment
A       bic             sp,  sp,  #15           @ sp = round down to multiple of 16 bytes
T       bic             r4,  r11, #15
T       mov             sp,  r4
        sub             sp,  sp,  #(8*2*16)     @ make space for 8 rows * 2 bytes per element * 16 elements per row (to fit 11 actual elements per row)
        mov             r4,  sp                 @ r4 = int16_t tmp[8 * 16]

        sub             r1,  r1,  #1            @ src -= 1
  .if \filter_add != 0
        add             r3,  r3,  #\filter_add  @ r3 = filter_add + rnd
  .endif
        mov             r12, #8                 @ loop counter
        sub             r1,  r1,  r2            @ r1 = &src[-stride]      @ slide back

        @ Do vertical filtering from src into tmp
        mspel_constants i8, d28, d29, d30, d31, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, q13, r3

        vld1.64         {d0,d1}, [r1], r2
        vld1.64         {d2,d3}, [r1], r2
        vld1.64         {d4,d5}, [r1], r2

1:
        subs            r12,  r12,  #4

        vld1.64         {d6,d7}, [r1], r2
        mspel_filter    q11, q11, d0, d2, d4, d6, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
        mspel_filter    q12, q12, d1, d3, d5, d7, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
        vst1.64         {q11,q12}, [r4,:128]!   @ store and increment

        vld1.64         {d0,d1}, [r1], r2
        mspel_filter    q11, q11, d2, d4, d6, d0, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
        mspel_filter    q12, q12, d3, d5, d7, d1, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
        vst1.64         {q11,q12}, [r4,:128]!   @ store and increment

        vld1.64         {d2,d3}, [r1], r2
        mspel_filter    q11, q11, d4, d6, d0, d2, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
        mspel_filter    q12, q12, d5, d7, d1, d3, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
        vst1.64         {q11,q12}, [r4,:128]!   @ store and increment

        vld1.64         {d4,d5}, [r1], r2
        mspel_filter    q11, q11, d6, d0, d2, d4, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
        mspel_filter    q12, q12, d7, d1, d3, d5, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
        vst1.64         {q11,q12}, [r4,:128]!   @ store and increment

        bne             1b

        rsb             r3,   r3,  #(64 + \filter_add)      @ r3 = (64 + filter_add) - r3
        mov             r12,  #8                @ loop counter
        mov             r4,   sp                @ r4 = tmp

        @ Do horizontal filtering from temp to dst
        mspel_constants i16, d28, d29, d30, d31, \filter_h_a, \filter_h_b, \filter_h_c, \filter_h_d, q13, r3

2:
        subs            r12,  r12,  #1

        vld1.64         {q0,q1}, [r4,:128]!     @ read one line of tmp
        vext.16         q2,   q0,   q1,  #2
        vext.16         q3,   q0,   q1,  #3
        vext.16         q1,   q0,   q1,  #1     @ do last because it writes to q1 which is read by the other vext instructions

        mspel_filter.16 q11, q12, d22, d23, d21, d0, d1, d2, d3, d4, d5, d6, d7, \filter_h_a, \filter_h_b, \filter_h_c, \filter_h_d, d28, d29, d30, d31, q13, 7

        vst1.64         {d21}, [r0,:64], r2     @ store and increment dst

        bne             2b

        mov             sp,  r11
        pop             {r4, r11, pc}
endfunc
.endm

@ Use C preprocessor and assembler macros to expand to functions for horizontal and vertical filtering.
#define PUT_VC1_MSPEL_MC_HV(hmode, vmode)   \
    put_vc1_mspel_mc_hv hmode, vmode, \
        MSPEL_MODE_ ## hmode ## _MUL_CONSTANTS, \
        MSPEL_MODE_ ## vmode ## _MUL_CONSTANTS, \
        MSPEL_MODES_ ## hmode ## vmode ## _ADDSHIFT_CONSTANTS
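
@ For example, PUT_VC1_MSPEL_MC_HV(1, 2) expands to
@     put_vc1_mspel_mc_hv 1, 2,  4, 53, 18, 3,  1, 9, 9, 1,  3, 3
@ which defines ff_put_vc1_mspel_mc12_neon.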

PUT_VC1_MSPEL_MC_HV(1, 1)
PUT_VC1_MSPEL_MC_HV(1, 2)
PUT_VC1_MSPEL_MC_HV(1, 3)
PUT_VC1_MSPEL_MC_HV(2, 1)
PUT_VC1_MSPEL_MC_HV(2, 2)
PUT_VC1_MSPEL_MC_HV(2, 3)
PUT_VC1_MSPEL_MC_HV(3, 1)
PUT_VC1_MSPEL_MC_HV(3, 2)
PUT_VC1_MSPEL_MC_HV(3, 3)

#undef PUT_VC1_MSPEL_MC_HV

.macro  put_vc1_mspel_mc_h_only hmode filter_a filter_b filter_c filter_d filter_add filter_shift
function ff_put_vc1_mspel_mc\hmode\()0_neon, export=1
        rsb             r3,   r3,   #\filter_add        @ r3 = filter_add - r = filter_add - rnd
        mov             r12,  #8                        @ loop counter
        sub             r1,   r1,   #1                  @ slide back, using immediate

        mspel_constants i8, d28, d29, d30, d31, \filter_a, \filter_b, \filter_c, \filter_d, q13, r3

1:
        subs            r12,  r12,  #1

        vld1.64         {d0,d1}, [r1], r2               @ read 16 bytes even though we only need 11, also src += stride
        vext.8          d2,   d0,   d1,  #2
        vext.8          d3,   d0,   d1,  #3
        vext.8          d1,   d0,   d1,  #1             @ do last because it writes to d1 which is read by the other vext instructions

        mspel_filter    q11, d21, d0, d1, d2, d3, \filter_a, \filter_b, \filter_c, \filter_d, d28, d29, d30, d31, q13, \filter_shift

        vst1.64         {d21}, [r0,:64], r2             @ store and increment dst

        bne             1b

        bx              lr
endfunc
.endm

@ Use C preprocessor and assembler macros to expand to functions for horizontal only filtering.
#define PUT_VC1_MSPEL_MC_H_ONLY(hmode) \
        put_vc1_mspel_mc_h_only hmode, MSPEL_MODE_ ## hmode ## _MUL_CONSTANTS, MSPEL_MODE_ ## hmode ## _ADDSHIFT_CONSTANTS

PUT_VC1_MSPEL_MC_H_ONLY(1)
PUT_VC1_MSPEL_MC_H_ONLY(2)
PUT_VC1_MSPEL_MC_H_ONLY(3)

#undef PUT_VC1_MSPEL_MC_H_ONLY

@ (uint8_t *dst [r0], const uint8_t *src [r1], ptrdiff_t stride [r2], int rnd [r3])
.macro put_vc1_mspel_mc_v_only vmode filter_a filter_b filter_c filter_d filter_add filter_shift
function ff_put_vc1_mspel_mc0\vmode\()_neon, export=1
        add             r3,   r3,   #\filter_add - 1    @ r3 = filter_add - r = filter_add - (1 - rnd) = filter_add - 1 + rnd
        mov             r12,  #8                        @ loop counter
        sub             r1,   r1,   r2                  @ r1 = &src[-stride]      @ slide back

        mspel_constants i8, d28, d29, d30, d31, \filter_a, \filter_b, \filter_c, \filter_d, q13, r3

        vld1.64         {d0},  [r1], r2                 @ d0 = src[-stride]
        vld1.64         {d1},  [r1], r2                 @ d1 = src[0]
        vld1.64         {d2},  [r1], r2                 @ d2 = src[stride]

1:
        subs            r12,  r12,  #4

        vld1.64         {d3},  [r1], r2                 @ d3 = src[stride * 2]
        mspel_filter    q11, d21, d0, d1, d2, d3, \filter_a, \filter_b, \filter_c, \filter_d, d28, d29, d30, d31, q13, \filter_shift
        vst1.64         {d21}, [r0,:64], r2             @ store and increment dst

        vld1.64         {d0},  [r1], r2                 @ d0 = next line
        mspel_filter    q11, d21, d1, d2, d3, d0, \filter_a, \filter_b, \filter_c, \filter_d, d28, d29, d30, d31, q13, \filter_shift
        vst1.64         {d21}, [r0,:64], r2             @ store and increment dst

        vld1.64         {d1},  [r1], r2                 @ d1 = next line
        mspel_filter    q11, d21, d2, d3, d0, d1, \filter_a, \filter_b, \filter_c, \filter_d, d28, d29, d30, d31, q13, \filter_shift
        vst1.64         {d21}, [r0,:64], r2             @ store and increment dst

        vld1.64         {d2},  [r1], r2                 @ d2 = next line
        mspel_filter    q11, d21, d3, d0, d1, d2, \filter_a, \filter_b, \filter_c, \filter_d, d28, d29, d30, d31, q13, \filter_shift
        vst1.64         {d21}, [r0,:64], r2             @ store and increment dst

        bne             1b

        bx              lr
endfunc
.endm

@ Use C preprocessor and assembler macros to expand to functions for vertical only filtering.
#define PUT_VC1_MSPEL_MC_V_ONLY(vmode) \
        put_vc1_mspel_mc_v_only vmode, MSPEL_MODE_ ## vmode ## _MUL_CONSTANTS, MSPEL_MODE_ ## vmode ## _ADDSHIFT_CONSTANTS

PUT_VC1_MSPEL_MC_V_ONLY(1)
PUT_VC1_MSPEL_MC_V_ONLY(2)
PUT_VC1_MSPEL_MC_V_ONLY(3)

#undef PUT_VC1_MSPEL_MC_V_ONLY

function ff_put_pixels8x8_neon, export=1
        vld1.64         {d0}, [r1], r2
        vld1.64         {d1}, [r1], r2
        vld1.64         {d2}, [r1], r2
        vld1.64         {d3}, [r1], r2
        vld1.64         {d4}, [r1], r2
        vld1.64         {d5}, [r1], r2
        vld1.64         {d6}, [r1], r2
        vld1.64         {d7}, [r1]
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        vst1.64         {d2}, [r0,:64], r2
        vst1.64         {d3}, [r0,:64], r2
        vst1.64         {d4}, [r0,:64], r2
        vst1.64         {d5}, [r0,:64], r2
        vst1.64         {d6}, [r0,:64], r2
        vst1.64         {d7}, [r0,:64]
        bx              lr
endfunc

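@ (uint8_t *dest [r0], ptrdiff_t stride [r1], int16_t *block [r2])
@ DC-only inverse transform: scale the DC coefficient, then add it to every
@ pixel of the 8x8 block with unsigned 8-bit saturation. Rough scalar outline
@ (mirroring the comments below, illustration only):
@
@   int dc = block[0];
@   dc = (3 * dc +  1) >> 1;
@   dc = (3 * dc + 16) >> 5;
@   for (i = 0; i < 8; i++)
@       for (j = 0; j < 8; j++)
@           dest[i * stride + j] = av_clip_uint8(dest[i * stride + j] + dc);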
function ff_vc1_inv_trans_8x8_dc_neon, export=1
        ldrsh           r2, [r2]              @ int dc = block[0];

        vld1.64         {d0},  [r0,:64], r1
        vld1.64         {d1},  [r0,:64], r1
        vld1.64         {d4},  [r0,:64], r1
        vld1.64         {d5},  [r0,:64], r1

        add             r2, r2, r2, lsl #1    @ dc = (3 * dc +  1) >> 1;
        vld1.64         {d6},  [r0,:64], r1
        add             r2, r2, #1
        vld1.64         {d7},  [r0,:64], r1
        vld1.64         {d16}, [r0,:64], r1
        vld1.64         {d17}, [r0,:64], r1
        asr             r2, r2, #1

        sub             r0,  r0,  r1, lsl #3  @ restore r0 to original value

        add             r2, r2, r2, lsl #1    @ dc = (3 * dc + 16) >> 5;
        add             r2, r2, #16
        asr             r2, r2, #5

        vdup.16         q1,  r2               @ dc

        vaddw.u8        q9,   q1,  d0
        vaddw.u8        q10,  q1,  d1
        vaddw.u8        q11,  q1,  d4
        vaddw.u8        q12,  q1,  d5
        vqmovun.s16     d0,  q9
        vqmovun.s16     d1,  q10
        vqmovun.s16     d4,  q11
        vst1.64         {d0},  [r0,:64], r1
        vqmovun.s16     d5,  q12
        vst1.64         {d1},  [r0,:64], r1
        vaddw.u8        q13,  q1,  d6
        vst1.64         {d4},  [r0,:64], r1
        vaddw.u8        q14,  q1,  d7
        vst1.64         {d5},  [r0,:64], r1
        vaddw.u8        q15,  q1,  d16
        vaddw.u8        q1,   q1,  d17        @ this destroys q1
        vqmovun.s16     d6,  q13
        vqmovun.s16     d7,  q14
        vqmovun.s16     d16, q15
        vqmovun.s16     d17, q1
        vst1.64         {d6},  [r0,:64], r1
        vst1.64         {d7},  [r0,:64], r1
        vst1.64         {d16}, [r0,:64], r1
        vst1.64         {d17}, [r0,:64]
        bx              lr
endfunc

function ff_vc1_inv_trans_8x4_dc_neon, export=1
        ldrsh           r2, [r2]              @ int dc = block[0];

        vld1.64         {d0},  [r0,:64], r1
        vld1.64         {d1},  [r0,:64], r1
        vld1.64         {d4},  [r0,:64], r1
        vld1.64         {d5},  [r0,:64], r1

        add             r2, r2, r2, lsl #1    @ dc = ( 3 * dc +  1) >> 1;

        sub             r0,  r0,  r1, lsl #2  @ restore r0 to original value

        add             r2, r2, #1
        asr             r2, r2, #1

        add             r2, r2, r2, lsl #4    @ dc = (17 * dc + 64) >> 7;
        add             r2, r2, #64
        asr             r2, r2, #7

        vdup.16         q1,  r2               @ dc

        vaddw.u8        q3,  q1,  d0
        vaddw.u8        q8,  q1,  d1
        vaddw.u8        q9,  q1,  d4
        vaddw.u8        q10, q1,  d5
        vqmovun.s16     d0,  q3
        vqmovun.s16     d1,  q8
        vqmovun.s16     d4,  q9
        vst1.64         {d0},  [r0,:64], r1
        vqmovun.s16     d5,  q10
        vst1.64         {d1},  [r0,:64], r1
        vst1.64         {d4},  [r0,:64], r1
        vst1.64         {d5},  [r0,:64]
        bx              lr
endfunc

function ff_vc1_inv_trans_4x8_dc_neon, export=1
        ldrsh           r2, [r2]              @ int dc = block[0];

        vld1.32         {d0[]},   [r0,:32], r1
        vld1.32         {d1[]},   [r0,:32], r1
        vld1.32         {d0[1]},  [r0,:32], r1
        vld1.32         {d1[1]},  [r0,:32], r1

        add             r2, r2, r2, lsl #4    @ dc = (17 * dc +  4) >> 3;
        vld1.32         {d4[]},   [r0,:32], r1
        add             r2, r2, #4
        vld1.32         {d5[]},   [r0,:32], r1
        vld1.32         {d4[1]},  [r0,:32], r1
        asr             r2, r2, #3
        vld1.32         {d5[1]},  [r0,:32], r1

        add             r2, r2, r2, lsl #1    @ dc = (12 * dc + 64) >> 7;

        sub             r0,  r0,  r1, lsl #3  @ restore r0 to original value

        lsl             r2, r2, #2
        add             r2, r2, #64
        asr             r2, r2, #7

        vdup.16         q1,  r2               @ dc

        vaddw.u8        q3,  q1,  d0
        vaddw.u8        q8,  q1,  d1
        vaddw.u8        q9,  q1,  d4
        vaddw.u8        q10, q1,  d5
        vqmovun.s16     d0,  q3
        vst1.32         {d0[0]},  [r0,:32], r1
        vqmovun.s16     d1,  q8
        vst1.32         {d1[0]},  [r0,:32], r1
        vqmovun.s16     d4,  q9
        vst1.32         {d0[1]},  [r0,:32], r1
        vqmovun.s16     d5,  q10
        vst1.32         {d1[1]},  [r0,:32], r1
        vst1.32         {d4[0]},  [r0,:32], r1
        vst1.32         {d5[0]},  [r0,:32], r1
        vst1.32         {d4[1]},  [r0,:32], r1
        vst1.32         {d5[1]},  [r0,:32]
        bx              lr
endfunc

function ff_vc1_inv_trans_4x4_dc_neon, export=1
        ldrsh           r2, [r2]              @ int dc = block[0];

        vld1.32         {d0[]},   [r0,:32], r1
        vld1.32         {d1[]},   [r0,:32], r1
        vld1.32         {d0[1]},  [r0,:32], r1
        vld1.32         {d1[1]},  [r0,:32], r1

        add             r2, r2, r2, lsl #4    @ dc = (17 * dc +  4) >> 3;

        sub             r0,  r0,  r1, lsl #2  @ restore r0 to original value

        add             r2, r2, #4
        asr             r2, r2, #3

        add             r2, r2, r2, lsl #4    @ dc = (17 * dc + 64) >> 7;
        add             r2, r2, #64
        asr             r2, r2, #7

        vdup.16         q1,  r2               @ dc

        vaddw.u8        q2,  q1,  d0
        vaddw.u8        q3,  q1,  d1
        vqmovun.s16     d0,  q2
        vst1.32         {d0[0]},  [r0,:32], r1
        vqmovun.s16     d1,  q3
        vst1.32         {d1[0]},  [r0,:32], r1
        vst1.32         {d0[1]},  [r0,:32], r1
        vst1.32         {d1[1]},  [r0,:32]
        bx              lr
endfunc

@ VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of vertically-neighbouring blocks
@ On entry:
@   r0 -> top-left pel of lower block
@   r1 = row stride, bytes
@   r2 = PQUANT bitstream parameter
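@ Note: .Lcoeffs (defined elsewhere in the file) appears to supply the filter
@ constants used below via d0[0] = 2 and d0[1] = 5, matching the 2* and 5*
@ factors in the comments.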
function ff_vc1_v_loop_filter4_neon, export=1
        sub             r3, r0, r1, lsl #2
        vldr            d0, .Lcoeffs
        vld1.32         {d1[0]}, [r0], r1       @ P5
        vld1.32         {d2[0]}, [r3], r1       @ P1
        vld1.32         {d3[0]}, [r3], r1       @ P2
        vld1.32         {d4[0]}, [r0], r1       @ P6
        vld1.32         {d5[0]}, [r3], r1       @ P3
        vld1.32         {d6[0]}, [r0], r1       @ P7
        vld1.32         {d7[0]}, [r3]           @ P4
        vld1.32         {d16[0]}, [r0]          @ P8
        vshll.u8        q9, d1, #1              @ 2*P5
        vdup.16         d17, r2                 @ pq
        vshll.u8        q10, d2, #1             @ 2*P1
        vmovl.u8        q11, d3                 @ P2
        vmovl.u8        q1, d4                  @ P6
        vmovl.u8        q12, d5                 @ P3
        vmls.i16        d20, d22, d0[1]         @ 2*P1-5*P2
        vmovl.u8        q11, d6                 @ P7
        vmls.i16        d18, d2, d0[1]          @ 2*P5-5*P6
        vshll.u8        q2, d5, #1              @ 2*P3
        vmovl.u8        q3, d7                  @ P4
        vmla.i16        d18, d22, d0[1]         @ 2*P5-5*P6+5*P7
        vmovl.u8        q11, d16                @ P8
        vmla.u16        d20, d24, d0[1]         @ 2*P1-5*P2+5*P3
        vmovl.u8        q12, d1                 @ P5
        vmls.u16        d4, d6, d0[1]           @ 2*P3-5*P4
        vmls.u16        d18, d22, d0[0]         @ 2*P5-5*P6+5*P7-2*P8
        vsub.i16        d1, d6, d24             @ P4-P5
        vmls.i16        d20, d6, d0[0]          @ 2*P1-5*P2+5*P3-2*P4
        vmla.i16        d4, d24, d0[1]          @ 2*P3-5*P4+5*P5
        vmls.i16        d4, d2, d0[0]           @ 2*P3-5*P4+5*P5-2*P6
        vabs.s16        d2, d1
        vrshr.s16       d3, d18, #3
        vrshr.s16       d5, d20, #3
        vshr.s16        d2, d2, #1              @ clip
        vrshr.s16       d4, d4, #3
        vabs.s16        d3, d3                  @ a2
        vshr.s16        d1, d1, #8              @ clip_sign
        vabs.s16        d5, d5                  @ a1
        vceq.i16        d7, d2, #0              @ test clip == 0
        vabs.s16        d16, d4                 @ a0
        vshr.s16        d4, d4, #8              @ a0_sign
        vcge.s16        d18, d5, d3             @ test a1 >= a2
        vcge.s16        d17, d16, d17           @ test a0 >= pq
        vbsl            d18, d3, d5             @ a3
        vsub.i16        d1, d1, d4              @ clip_sign - a0_sign
        vorr            d3, d7, d17             @ test clip == 0 || a0 >= pq
        vqsub.u16       d4, d16, d18            @ a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so it makes more sense to subtract this way round than the opposite way and then take the abs)
        vcge.s16        d5, d18, d16            @ test a3 >= a0
        vmul.i16        d0, d4, d0[1]           @ a0 >= a3 ? 5*(a0-a3) : 0
        vorr            d4, d3, d5              @ test clip == 0 || a0 >= pq || a3 >= a0
        vmov.32         r0, d4[1]               @ move to gp reg
        vshr.u16        d0, d0, #3              @ a0 >= a3 ? (5*(a0-a3))>>3 : 0
        vcge.s16        d4, d0, d2
        tst             r0, #1
        bne             1f                      @ none of the 4 pixel pairs should be updated if this one is not filtered
        vbsl            d4, d2, d0              @ FFMIN(d, clip)
        vbic            d0, d4, d3              @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
        vmls.i16        d6, d0, d1              @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
        vmla.i16        d24, d0, d1             @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
        vqmovun.s16     d0, q3
        vqmovun.s16     d1, q12
        vst1.32         {d0[0]}, [r3], r1
        vst1.32         {d1[0]}, [r3]
1:      bx              lr
endfunc

@ VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of horizontally-neighbouring blocks
@ On entry:
@   r0 -> top-left pel of right block
@   r1 = row stride, bytes
@   r2 = PQUANT bitstream parameter
function ff_vc1_h_loop_filter4_neon, export=1
        sub             r3, r0, #4              @ where to start reading
        vldr            d0, .Lcoeffs
        vld1.32         {d2}, [r3], r1
        sub             r0, r0, #1              @ where to start writing
        vld1.32         {d4}, [r3], r1
        vld1.32         {d3}, [r3], r1
        vld1.32         {d5}, [r3]
        vdup.16         d1, r2                  @ pq
        vtrn.8          q1, q2
        vtrn.16         d2, d3                  @ P1, P5, P3, P7
        vtrn.16         d4, d5                  @ P2, P6, P4, P8
        vshll.u8        q3, d2, #1              @ 2*P1, 2*P5
        vmovl.u8        q8, d4                  @ P2, P6
        vmovl.u8        q9, d3                  @ P3, P7
        vmovl.u8        q2, d5                  @ P4, P8
        vmls.i16        q3, q8, d0[1]           @ 2*P1-5*P2, 2*P5-5*P6
        vshll.u8        q10, d3, #1             @ 2*P3, 2*P7
        vmovl.u8        q1, d2                  @ P1, P5
        vmla.i16        q3, q9, d0[1]           @ 2*P1-5*P2+5*P3, 2*P5-5*P6+5*P7
        vmls.i16        q3, q2, d0[0]           @ 2*P1-5*P2+5*P3-2*P4, 2*P5-5*P6+5*P7-2*P8
        vmov            d2, d3                  @ needs to be in an even-numbered vector for when we come to narrow it later
        vmls.i16        d20, d4, d0[1]          @ 2*P3-5*P4
        vmla.i16        d20, d3, d0[1]          @ 2*P3-5*P4+5*P5
        vsub.i16        d3, d4, d2              @ P4-P5
        vmls.i16        d20, d17, d0[0]         @ 2*P3-5*P4+5*P5-2*P6
        vrshr.s16       q3, q3, #3
        vabs.s16        d5, d3
        vshr.s16        d3, d3, #8              @ clip_sign
        vrshr.s16       d16, d20, #3
        vabs.s16        q3, q3                  @ a1, a2
        vshr.s16        d5, d5, #1              @ clip
        vabs.s16        d17, d16                @ a0
        vceq.i16        d18, d5, #0             @ test clip == 0
        vshr.s16        d16, d16, #8            @ a0_sign
        vcge.s16        d19, d6, d7             @ test a1 >= a2
        vcge.s16        d1, d17, d1             @ test a0 >= pq
        vsub.i16        d16, d3, d16            @ clip_sign - a0_sign
        vbsl            d19, d7, d6             @ a3
        vorr            d1, d18, d1             @ test clip == 0 || a0 >= pq
1283        vqsub.u16       d3, d17, d19            @ a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
1284        vcge.s16        d6, d19, d17            @ test a3 >= a0    @
1285        vmul.i16        d0, d3, d0[1]           @ a0 >= a3 ? 5*(a0-a3) : 0
1286        vorr            d3, d1, d6              @ test clip == 0 || a0 >= pq || a3 >= a0
1287        vmov.32         r2, d3[1]               @ move to gp reg
1288        vshr.u16        d0, d0, #3              @ a0 >= a3 ? (5*(a0-a3))>>3 : 0
1289        vcge.s16        d3, d0, d5
1290        tst             r2, #1
1291        bne             1f                      @ none of the 4 pixel pairs should be updated if this one is not filtered
1292        vbsl            d3, d5, d0              @ FFMIN(d, clip)
1293        vbic            d0, d3, d1              @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
1294        vmla.i16        d2, d0, d16             @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
1295        vmls.i16        d4, d0, d16             @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
1296        vqmovun.s16     d1, q1
1297        vqmovun.s16     d0, q2
1298        vst2.8          {d0[0], d1[0]}, [r0], r1
1299        vst2.8          {d0[1], d1[1]}, [r0], r1
1300        vst2.8          {d0[2], d1[2]}, [r0], r1
1301        vst2.8          {d0[3], d1[3]}, [r0]
13021:      bx              lr
1303endfunc
1304
1305@ VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of vertically-neighbouring blocks
1306@ On entry:
1307@   r0 -> top-left pel of lower block
1308@   r1 = row stride, bytes
1309@   r2 = PQUANT bitstream parameter
1310function ff_vc1_v_loop_filter8_neon, export=1
1311        sub             r3, r0, r1, lsl #2
1312        vldr            d0, .Lcoeffs
1313        vld1.32         {d1}, [r0 :64], r1      @ P5
1314        vld1.32         {d2}, [r3 :64], r1      @ P1
1315        vld1.32         {d3}, [r3 :64], r1      @ P2
1316        vld1.32         {d4}, [r0 :64], r1      @ P6
1317        vld1.32         {d5}, [r3 :64], r1      @ P3
1318        vld1.32         {d6}, [r0 :64], r1      @ P7
1319        vshll.u8        q8, d1, #1              @ 2*P5
1320        vshll.u8        q9, d2, #1              @ 2*P1
1321        vld1.32         {d7}, [r3 :64]          @ P4
1322        vmovl.u8        q1, d3                  @ P2
1323        vld1.32         {d20}, [r0 :64]         @ P8
1324        vmovl.u8        q11, d4                 @ P6
1325        vdup.16         q12, r2                 @ pq
1326        vmovl.u8        q13, d5                 @ P3
1327        vmls.i16        q9, q1, d0[1]           @ 2*P1-5*P2
1328        vmovl.u8        q1, d6                  @ P7
1329        vshll.u8        q2, d5, #1              @ 2*P3
1330        vmls.i16        q8, q11, d0[1]          @ 2*P5-5*P6
1331        vmovl.u8        q3, d7                  @ P4
1332        vmovl.u8        q10, d20                @ P8
1333        vmla.i16        q8, q1, d0[1]           @ 2*P5-5*P6+5*P7
1334        vmovl.u8        q1, d1                  @ P5
1335        vmla.i16        q9, q13, d0[1]          @ 2*P1-5*P2+5*P3
1336        vsub.i16        q13, q3, q1             @ P4-P5
1337        vmls.i16        q2, q3, d0[1]           @ 2*P3-5*P4
1338        vmls.i16        q8, q10, d0[0]          @ 2*P5-5*P6+5*P7-2*P8
1339        vabs.s16        q10, q13
1340        vshr.s16        q13, q13, #8            @ clip_sign
1341        vmls.i16        q9, q3, d0[0]           @ 2*P1-5*P2+5*P3-2*P4
1342        vshr.s16        q10, q10, #1            @ clip
1343        vmla.i16        q2, q1, d0[1]           @ 2*P3-5*P4+5*P5
1344        vrshr.s16       q8, q8, #3
1345        vmls.i16        q2, q11, d0[0]          @ 2*P3-5*P4+5*P5-2*P6
1346        vceq.i16        q11, q10, #0            @ test clip == 0
1347        vrshr.s16       q9, q9, #3
1348        vabs.s16        q8, q8                  @ a2
1349        vabs.s16        q9, q9                  @ a1
1350        vrshr.s16       q2, q2, #3
1351        vcge.s16        q14, q9, q8             @ test a1 >= a2
1352        vabs.s16        q15, q2                 @ a0
1353        vshr.s16        q2, q2, #8              @ a0_sign
1354        vbsl            q14, q8, q9             @ a3
1355        vcge.s16        q8, q15, q12            @ test a0 >= pq
1356        vsub.i16        q2, q13, q2             @ clip_sign - a0_sign
1357        vqsub.u16       q9, q15, q14            @ a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
1358        vcge.s16        q12, q14, q15           @ test a3 >= a0
1359        vorr            q8, q11, q8             @ test clip == 0 || a0 >= pq
1360        vmul.i16        q0, q9, d0[1]           @ a0 >= a3 ? 5*(a0-a3) : 0
1361        vorr            q9, q8, q12             @ test clip == 0 || a0 >= pq || a3 >= a0
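        @ Within each group of 4 pixel pairs, the result for the third pair
        @ (16-bit lane 2 of each 64-bit half) decides whether the whole group
        @ gets filtered.  Shifting each 64-bit lane left by 16 and then
        @ arithmetically right by 48 broadcasts that lane's all-ones/all-zero
        @ value across its group, and the vorr below folds it into the
        @ "don't filter" mask.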
        vshl.i64        q11, q9, #16
        vmov.32         r0, d18[1]              @ move to gp reg
        vshr.u16        q0, q0, #3              @ a0 >= a3 ? (5*(a0-a3))>>3 : 0
        vmov.32         r2, d19[1]
        vshr.s64        q9, q11, #48
        vcge.s16        q11, q0, q10
        vorr            q8, q8, q9
        and             r0, r0, r2
        vbsl            q11, q10, q0            @ FFMIN(d, clip)
        tst             r0, #1
        bne             1f                      @ none of the 8 pixel pairs should be updated in this case
        vbic            q0, q11, q8             @ set each d to zero if it should not be filtered
        vmls.i16        q3, q0, q2              @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
        vmla.i16        q1, q0, q2              @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
        vqmovun.s16     d0, q3
        vqmovun.s16     d1, q1
        vst1.32         {d0}, [r3 :64], r1
        vst1.32         {d1}, [r3 :64]
1:      bx              lr
endfunc

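@ Filter coefficients, shared by the functions above and below: loaded with
@ vldr into d0 so that (on a little-endian build) the 16-bit scalar lanes
@ come out as d0[0] = 2 and d0[1] = 5, matching the 2*Pn and 5*Pn terms used
@ in the multiply-accumulates.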
.align  5
.Lcoeffs:
.quad   0x00050002

@ VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of horizontally-neighbouring blocks
@ On entry:
@   r0 -> top-left pel of right block
@   r1 = row stride, bytes
@   r2 = PQUANT bitstream parameter
function ff_vc1_h_loop_filter8_neon, export=1
        push            {lr}
        sub             r3, r0, #4              @ where to start reading
        vldr            d0, .Lcoeffs
        vld1.32         {d2}, [r3], r1          @ P1[0], P2[0]...
        sub             r0, r0, #1              @ where to start writing
        vld1.32         {d4}, [r3], r1
        add             r12, r0, r1, lsl #2
        vld1.32         {d3}, [r3], r1
        vld1.32         {d5}, [r3], r1
        vld1.32         {d6}, [r3], r1
        vld1.32         {d16}, [r3], r1
        vld1.32         {d7}, [r3], r1
        vld1.32         {d17}, [r3]
        vtrn.8          q1, q2                  @ P1[0], P1[1], P3[0]... P1[2], P1[3], P3[2]... P2[0], P2[1], P4[0]... P2[2], P2[3], P4[2]...
        vdup.16         q9, r2                  @ pq
        vtrn.16         d2, d3                  @ P1[0], P1[1], P1[2], P1[3], P5[0]... P3[0], P3[1], P3[2], P3[3], P7[0]...
        vtrn.16         d4, d5                  @ P2[0], P2[1], P2[2], P2[3], P6[0]... P4[0], P4[1], P4[2], P4[3], P8[0]...
        vtrn.8          q3, q8                  @ P1[4], P1[5], P3[4]... P1[6], P1[7], P3[6]... P2[4], P2[5], P4[4]... P2[6], P2[7], P4[6]...
        vtrn.16         d6, d7                  @ P1[4], P1[5], P1[6], P1[7], P5[4]... P3[4], P3[5], P3[6], P3[7], P7[4]...
        vtrn.16         d16, d17                @ P2[4], P2[5], P2[6], P2[7], P6[4]... P4[4], P4[5], P4[6], P4[7], P8[4]...
        vtrn.32         d2, d6                  @ P1, P5
        vtrn.32         d4, d16                 @ P2, P6
        vtrn.32         d3, d7                  @ P3, P7
        vtrn.32         d5, d17                 @ P4, P8
        vshll.u8        q10, d2, #1             @ 2*P1
        vshll.u8        q11, d6, #1             @ 2*P5
        vmovl.u8        q12, d4                 @ P2
        vmovl.u8        q13, d16                @ P6
        vmovl.u8        q14, d3                 @ P3
        vmls.i16        q10, q12, d0[1]         @ 2*P1-5*P2
        vmovl.u8        q12, d7                 @ P7
        vshll.u8        q1, d3, #1              @ 2*P3
        vmls.i16        q11, q13, d0[1]         @ 2*P5-5*P6
        vmovl.u8        q2, d5                  @ P4
        vmovl.u8        q8, d17                 @ P8
        vmla.i16        q11, q12, d0[1]         @ 2*P5-5*P6+5*P7
        vmovl.u8        q3, d6                  @ P5
        vmla.i16        q10, q14, d0[1]         @ 2*P1-5*P2+5*P3
        vsub.i16        q12, q2, q3             @ P4-P5
        vmls.i16        q1, q2, d0[1]           @ 2*P3-5*P4
        vmls.i16        q11, q8, d0[0]          @ 2*P5-5*P6+5*P7-2*P8
        vabs.s16        q8, q12
        vshr.s16        q12, q12, #8            @ clip_sign
        vmls.i16        q10, q2, d0[0]          @ 2*P1-5*P2+5*P3-2*P4
        vshr.s16        q8, q8, #1              @ clip
        vmla.i16        q1, q3, d0[1]           @ 2*P3-5*P4+5*P5
        vrshr.s16       q11, q11, #3
        vmls.i16        q1, q13, d0[0]          @ 2*P3-5*P4+5*P5-2*P6
        vceq.i16        q13, q8, #0             @ test clip == 0
        vrshr.s16       q10, q10, #3
        vabs.s16        q11, q11                @ a2
        vabs.s16        q10, q10                @ a1
        vrshr.s16       q1, q1, #3
        vcge.s16        q14, q10, q11           @ test a1 >= a2
        vabs.s16        q15, q1                 @ a0
        vshr.s16        q1, q1, #8              @ a0_sign
        vbsl            q14, q11, q10           @ a3
        vcge.s16        q9, q15, q9             @ test a0 >= pq
        vsub.i16        q1, q12, q1             @ clip_sign - a0_sign
        vqsub.u16       q10, q15, q14           @ a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so it makes more sense to subtract this way round than the opposite way and then take the abs)
        vcge.s16        q11, q14, q15           @ test a3 >= a0
        vorr            q9, q13, q9             @ test clip == 0 || a0 >= pq
        vmul.i16        q0, q10, d0[1]          @ a0 >= a3 ? 5*(a0-a3) : 0
        vorr            q10, q9, q11            @ test clip == 0 || a0 >= pq || a3 >= a0
        vmov.32         r2, d20[1]              @ move to gp reg
        vshr.u16        q0, q0, #3              @ a0 >= a3 ? (5*(a0-a3))>>3 : 0
        vmov.32         r3, d21[1]
        vcge.s16        q10, q0, q8
        and             r14, r2, r3
        vbsl            q10, q8, q0             @ FFMIN(d, clip)
        tst             r14, #1
        bne             2f                      @ none of the 8 pixel pairs should be updated in this case
        vbic            q0, q10, q9             @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
        vmla.i16        q3, q0, q1              @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
        vmls.i16        q2, q0, q1              @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
        vqmovun.s16     d1, q3
        vqmovun.s16     d0, q2
        tst             r2, #1
        bne             1f                      @ none of the first 4 pixel pairs should be updated if so
        vst2.8          {d0[0], d1[0]}, [r0], r1
        vst2.8          {d0[1], d1[1]}, [r0], r1
        vst2.8          {d0[2], d1[2]}, [r0], r1
        vst2.8          {d0[3], d1[3]}, [r0]
1:      tst             r3, #1
        bne             2f                      @ none of the second 4 pixel pairs should be updated if so
        vst2.8          {d0[4], d1[4]}, [r12], r1
        vst2.8          {d0[5], d1[5]}, [r12], r1
        vst2.8          {d0[6], d1[6]}, [r12], r1
        vst2.8          {d0[7], d1[7]}, [r12]
2:      pop             {pc}
endfunc

@ VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of vertically-neighbouring blocks
@ On entry:
@   r0 -> top-left pel of lower block
@   r1 = row stride, bytes
@   r2 = PQUANT bitstream parameter
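@ This is essentially the 8-pair vertical filter applied to two 8-column
@ halves, [0..7] and [8..15], which are kept interleaved throughout so that
@ both halves share the same instruction stream.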
function ff_vc1_v_loop_filter16_neon, export=1
        vpush           {d8-d15}
        sub             r3, r0, r1, lsl #2
        vldr            d0, .Lcoeffs
        vld1.64         {q1}, [r0 :128], r1     @ P5
        vld1.64         {q2}, [r3 :128], r1     @ P1
        vld1.64         {q3}, [r3 :128], r1     @ P2
        vld1.64         {q4}, [r0 :128], r1     @ P6
        vld1.64         {q5}, [r3 :128], r1     @ P3
        vld1.64         {q6}, [r0 :128], r1     @ P7
        vshll.u8        q7, d2, #1              @ 2*P5[0..7]
        vshll.u8        q8, d4, #1              @ 2*P1[0..7]
        vld1.64         {q9}, [r3 :128]         @ P4
        vmovl.u8        q10, d6                 @ P2[0..7]
        vld1.64         {q11}, [r0 :128]        @ P8
        vmovl.u8        q12, d8                 @ P6[0..7]
        vdup.16         q13, r2                 @ pq
        vshll.u8        q2, d5, #1              @ 2*P1[8..15]
        vmls.i16        q8, q10, d0[1]          @ 2*P1[0..7]-5*P2[0..7]
        vshll.u8        q10, d3, #1             @ 2*P5[8..15]
        vmovl.u8        q3, d7                  @ P2[8..15]
        vmls.i16        q7, q12, d0[1]          @ 2*P5[0..7]-5*P6[0..7]
        vmovl.u8        q4, d9                  @ P6[8..15]
        vmovl.u8        q14, d10                @ P3[0..7]
        vmovl.u8        q15, d12                @ P7[0..7]
        vmls.i16        q2, q3, d0[1]           @ 2*P1[8..15]-5*P2[8..15]
        vshll.u8        q3, d10, #1             @ 2*P3[0..7]
        vmls.i16        q10, q4, d0[1]          @ 2*P5[8..15]-5*P6[8..15]
        vmovl.u8        q6, d13                 @ P7[8..15]
        vmla.i16        q8, q14, d0[1]          @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
        vmovl.u8        q14, d18                @ P4[0..7]
        vmovl.u8        q9, d19                 @ P4[8..15]
        vmla.i16        q7, q15, d0[1]          @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
        vmovl.u8        q15, d11                @ P3[8..15]
        vshll.u8        q5, d11, #1             @ 2*P3[8..15]
        vmls.i16        q3, q14, d0[1]          @ 2*P3[0..7]-5*P4[0..7]
        vmla.i16        q2, q15, d0[1]          @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
        vmovl.u8        q15, d22                @ P8[0..7]
        vmovl.u8        q11, d23                @ P8[8..15]
        vmla.i16        q10, q6, d0[1]          @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
        vmovl.u8        q6, d2                  @ P5[0..7]
        vmovl.u8        q1, d3                  @ P5[8..15]
        vmls.i16        q5, q9, d0[1]           @ 2*P3[8..15]-5*P4[8..15]
        vmls.i16        q8, q14, d0[0]          @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
        vmls.i16        q7, q15, d0[0]          @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
        vsub.i16        q15, q14, q6            @ P4[0..7]-P5[0..7]
        vmla.i16        q3, q6, d0[1]           @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
        vrshr.s16       q8, q8, #3
        vmls.i16        q2, q9, d0[0]           @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
        vrshr.s16       q7, q7, #3
        vmls.i16        q10, q11, d0[0]         @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
        vabs.s16        q11, q15
        vabs.s16        q8, q8                  @ a1[0..7]
        vmla.i16        q5, q1, d0[1]           @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
        vshr.s16        q15, q15, #8            @ clip_sign[0..7]
        vrshr.s16       q2, q2, #3
        vmls.i16        q3, q12, d0[0]          @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
        vabs.s16        q7, q7                  @ a2[0..7]
        vrshr.s16       q10, q10, #3
        vsub.i16        q12, q9, q1             @ P4[8..15]-P5[8..15]
        vshr.s16        q11, q11, #1            @ clip[0..7]
        vmls.i16        q5, q4, d0[0]           @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
        vcge.s16        q4, q8, q7              @ test a1[0..7] >= a2[0..7]
        vabs.s16        q2, q2                  @ a1[8..15]
        vrshr.s16       q3, q3, #3
        vabs.s16        q10, q10                @ a2[8..15]
        vbsl            q4, q7, q8              @ a3[0..7]
        vabs.s16        q7, q12
        vshr.s16        q8, q12, #8             @ clip_sign[8..15]
        vrshr.s16       q5, q5, #3
        vcge.s16        q12, q2, q10            @ test a1[8..15] >= a2[8..15]
        vshr.s16        q7, q7, #1              @ clip[8..15]
        vbsl            q12, q10, q2            @ a3[8..15]
        vabs.s16        q2, q3                  @ a0[0..7]
        vceq.i16        q10, q11, #0            @ test clip[0..7] == 0
        vshr.s16        q3, q3, #8              @ a0_sign[0..7]
        vsub.i16        q3, q15, q3             @ clip_sign[0..7] - a0_sign[0..7]
        vcge.s16        q15, q2, q13            @ test a0[0..7] >= pq
        vorr            q10, q10, q15           @ test clip[0..7] == 0 || a0[0..7] >= pq
        vqsub.u16       q15, q2, q4             @ a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0  (a0 > a3 in all cases where filtering is enabled, so it makes more sense to subtract this way round than the opposite way and then take the abs)
        vcge.s16        q2, q4, q2              @ test a3[0..7] >= a0[0..7]
        vabs.s16        q4, q5                  @ a0[8..15]
        vshr.s16        q5, q5, #8              @ a0_sign[8..15]
        vmul.i16        q15, q15, d0[1]         @ a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
        vcge.s16        q13, q4, q13            @ test a0[8..15] >= pq
        vorr            q2, q10, q2             @ test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
        vsub.i16        q5, q8, q5              @ clip_sign[8..15] - a0_sign[8..15]
        vceq.i16        q8, q7, #0              @ test clip[8..15] == 0
        vshr.u16        q15, q15, #3            @ a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
        vmov.32         r0, d4[1]               @ move to gp reg
        vorr            q8, q8, q13             @ test clip[8..15] == 0 || a0[8..15] >= pq
        vqsub.u16       q13, q4, q12            @ a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0  (a0 > a3 in all cases where filtering is enabled, so it makes more sense to subtract this way round than the opposite way and then take the abs)
        vmov.32         r2, d5[1]
        vcge.s16        q4, q12, q4             @ test a3[8..15] >= a0[8..15]
        vshl.i64        q2, q2, #16
        vcge.s16        q12, q15, q11
        vmul.i16        q0, q13, d0[1]          @ a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
        vorr            q4, q8, q4              @ test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
        vshr.s64        q2, q2, #48
        and             r0, r0, r2
        vbsl            q12, q11, q15           @ FFMIN(d[0..7], clip[0..7])
        vshl.i64        q11, q4, #16
        vmov.32         r2, d8[1]
        vshr.u16        q0, q0, #3              @ a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
        vorr            q2, q10, q2
        vmov.32         r12, d9[1]
        vshr.s64        q4, q11, #48
        vcge.s16        q10, q0, q7
        vbic            q2, q12, q2             @ set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
        vorr            q4, q8, q4
        and             r2, r2, r12
        vbsl            q10, q7, q0             @ FFMIN(d[8..15], clip[8..15])
        vmls.i16        q14, q2, q3             @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4[0..7]
        and             r0, r0, r2
        vbic            q0, q10, q4             @ set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
        tst             r0, #1
        bne             1f                      @ none of the 16 pixel pairs should be updated in this case
        vmla.i16        q6, q2, q3              @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5[0..7]
        vmls.i16        q9, q0, q5              @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4[8..15]
        vqmovun.s16     d4, q14
        vmla.i16        q1, q0, q5              @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5[8..15]
        vqmovun.s16     d0, q6
        vqmovun.s16     d5, q9
        vqmovun.s16     d1, q1
        vst1.64         {q2}, [r3 :128], r1
        vst1.64         {q0}, [r3 :128]
1:      vpop            {d8-d15}
        bx              lr
endfunc

@ VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of horizontally-neighbouring blocks
@ On entry:
@   r0 -> top-left pel of right block
@   r1 = row stride, bytes
@   r2 = PQUANT bitstream parameter
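@ As with the shorter horizontal filters, the 16 rows are first loaded as
@ 8-byte strips and transposed with vtrn.8/.16/.32 so that P1..P8 become
@ column vectors; the arithmetic then matches the vertical-edge case, and
@ only the filtered P4/P5 columns are written back, one row at a time, with
@ vst2.8.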
function ff_vc1_h_loop_filter16_neon, export=1
        push            {r4-r6,lr}
        vpush           {d8-d15}
        sub             r3, r0, #4              @ where to start reading
        vldr            d0, .Lcoeffs
        vld1.32         {d2}, [r3], r1          @ P1[0], P2[0]...
        sub             r0, r0, #1              @ where to start writing
        vld1.32         {d3}, [r3], r1
        add             r4, r0, r1, lsl #2
        vld1.32         {d10}, [r3], r1
        vld1.32         {d11}, [r3], r1
        vld1.32         {d16}, [r3], r1
        vld1.32         {d4}, [r3], r1
        vld1.32         {d8}, [r3], r1
        vtrn.8          d2, d3                  @ P1[0], P1[1], P3[0]... P2[0], P2[1], P4[0]...
        vld1.32         {d14}, [r3], r1
        vld1.32         {d5}, [r3], r1
        vtrn.8          d10, d11                @ P1[2], P1[3], P3[2]... P2[2], P2[3], P4[2]...
        vld1.32         {d6}, [r3], r1
        vld1.32         {d12}, [r3], r1
        vtrn.8          d16, d4                 @ P1[4], P1[5], P3[4]... P2[4], P2[5], P4[4]...
        vld1.32         {d13}, [r3], r1
        vtrn.16         d2, d10                 @ P1[0], P1[1], P1[2], P1[3], P5[0]... P3[0], P3[1], P3[2], P3[3], P7[0]...
        vld1.32         {d1}, [r3], r1
        vtrn.8          d8, d14                 @ P1[6], P1[7], P3[6]... P2[6], P2[7], P4[6]...
        vld1.32         {d7}, [r3], r1
        vtrn.16         d3, d11                 @ P2[0], P2[1], P2[2], P2[3], P6[0]... P4[0], P4[1], P4[2], P4[3], P8[0]...
        vld1.32         {d9}, [r3], r1
        vtrn.8          d5, d6                  @ P1[8], P1[9], P3[8]... P2[8], P2[9], P4[8]...
        vld1.32         {d15}, [r3]
        vtrn.16         d16, d8                 @ P1[4], P1[5], P1[6], P1[7], P5[4]... P3[4], P3[5], P3[6], P3[7], P7[4]...
        vtrn.16         d4, d14                 @ P2[4], P2[5], P2[6], P2[7], P6[4]... P4[4], P4[5], P4[6], P4[7], P8[4]...
        vtrn.8          d12, d13                @ P1[10], P1[11], P3[10]... P2[10], P2[11], P4[10]...
        vdup.16         q9, r2                  @ pq
        vtrn.8          d1, d7                  @ P1[12], P1[13], P3[12]... P2[12], P2[13], P4[12]...
        vtrn.32         d2, d16                 @ P1[0..7], P5[0..7]
        vtrn.16         d5, d12                 @ P1[8], P1[9], P1[10], P1[11], P5[8]... P3[8], P3[9], P3[10], P3[11], P7[8]...
        vtrn.16         d6, d13                 @ P2[8], P2[9], P2[10], P2[11], P6[8]... P4[8], P4[9], P4[10], P4[11], P8[8]...
        vtrn.8          d9, d15                 @ P1[14], P1[15], P3[14]... P2[14], P2[15], P4[14]...
        vtrn.32         d3, d4                  @ P2[0..7], P6[0..7]
        vshll.u8        q10, d2, #1             @ 2*P1[0..7]
        vtrn.32         d10, d8                 @ P3[0..7], P7[0..7]
        vshll.u8        q11, d16, #1            @ 2*P5[0..7]
        vtrn.32         d11, d14                @ P4[0..7], P8[0..7]
        vtrn.16         d1, d9                  @ P1[12], P1[13], P1[14], P1[15], P5[12]... P3[12], P3[13], P3[14], P3[15], P7[12]...
        vtrn.16         d7, d15                 @ P2[12], P2[13], P2[14], P2[15], P6[12]... P4[12], P4[13], P4[14], P4[15], P8[12]...
        vmovl.u8        q1, d3                  @ P2[0..7]
        vmovl.u8        q12, d4                 @ P6[0..7]
        vtrn.32         d5, d1                  @ P1[8..15], P5[8..15]
        vtrn.32         d6, d7                  @ P2[8..15], P6[8..15]
        vtrn.32         d12, d9                 @ P3[8..15], P7[8..15]
        vtrn.32         d13, d15                @ P4[8..15], P8[8..15]
        vmls.i16        q10, q1, d0[1]          @ 2*P1[0..7]-5*P2[0..7]
        vmovl.u8        q1, d10                 @ P3[0..7]
        vshll.u8        q2, d5, #1              @ 2*P1[8..15]
        vshll.u8        q13, d1, #1             @ 2*P5[8..15]
        vmls.i16        q11, q12, d0[1]         @ 2*P5[0..7]-5*P6[0..7]
        vmovl.u8        q14, d6                 @ P2[8..15]
        vmovl.u8        q3, d7                  @ P6[8..15]
        vmovl.u8        q15, d8                 @ P7[0..7]
        vmla.i16        q10, q1, d0[1]          @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
        vmovl.u8        q1, d12                 @ P3[8..15]
        vmls.i16        q2, q14, d0[1]          @ 2*P1[8..15]-5*P2[8..15]
        vmovl.u8        q4, d9                  @ P7[8..15]
        vshll.u8        q14, d10, #1            @ 2*P3[0..7]
        vmls.i16        q13, q3, d0[1]          @ 2*P5[8..15]-5*P6[8..15]
        vmovl.u8        q5, d11                 @ P4[0..7]
        vmla.i16        q11, q15, d0[1]         @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
        vshll.u8        q15, d12, #1            @ 2*P3[8..15]
        vmovl.u8        q6, d13                 @ P4[8..15]
        vmla.i16        q2, q1, d0[1]           @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
        vmovl.u8        q1, d14                 @ P8[0..7]
        vmovl.u8        q7, d15                 @ P8[8..15]
        vmla.i16        q13, q4, d0[1]          @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
        vmovl.u8        q4, d16                 @ P5[0..7]
        vmovl.u8        q8, d1                  @ P5[8..15]
        vmls.i16        q14, q5, d0[1]          @ 2*P3[0..7]-5*P4[0..7]
        vmls.i16        q15, q6, d0[1]          @ 2*P3[8..15]-5*P4[8..15]
        vmls.i16        q10, q5, d0[0]          @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
        vmls.i16        q11, q1, d0[0]          @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
        vsub.i16        q1, q5, q4              @ P4[0..7]-P5[0..7]
        vmls.i16        q2, q6, d0[0]           @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
        vrshr.s16       q10, q10, #3
        vmls.i16        q13, q7, d0[0]          @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
        vsub.i16        q7, q6, q8              @ P4[8..15]-P5[8..15]
        vrshr.s16       q11, q11, #3
        vmla.i16        q14, q4, d0[1]          @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
        vrshr.s16       q2, q2, #3
        vmla.i16        q15, q8, d0[1]          @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
        vabs.s16        q10, q10                @ a1[0..7]
        vrshr.s16       q13, q13, #3
        vmls.i16        q15, q3, d0[0]          @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
        vabs.s16        q3, q11                 @ a2[0..7]
        vabs.s16        q2, q2                  @ a1[8..15]
        vmls.i16        q14, q12, d0[0]         @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
        vabs.s16        q11, q1
        vabs.s16        q12, q13                @ a2[8..15]
        vcge.s16        q13, q10, q3            @ test a1[0..7] >= a2[0..7]
        vshr.s16        q1, q1, #8              @ clip_sign[0..7]
        vrshr.s16       q15, q15, #3
        vshr.s16        q11, q11, #1            @ clip[0..7]
        vrshr.s16       q14, q14, #3
        vbsl            q13, q3, q10            @ a3[0..7]
        vcge.s16        q3, q2, q12             @ test a1[8..15] >= a2[8..15]
        vabs.s16        q10, q15                @ a0[8..15]
        vshr.s16        q15, q15, #8            @ a0_sign[8..15]
        vbsl            q3, q12, q2             @ a3[8..15]
        vabs.s16        q2, q14                 @ a0[0..7]
        vabs.s16        q12, q7
        vshr.s16        q7, q7, #8              @ clip_sign[8..15]
        vshr.s16        q14, q14, #8            @ a0_sign[0..7]
        vshr.s16        q12, q12, #1            @ clip[8..15]
        vsub.i16        q7, q7, q15             @ clip_sign[8..15] - a0_sign[8..15]
        vqsub.u16       q15, q10, q3            @ a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0  (a0 > a3 in all cases where filtering is enabled, so it makes more sense to subtract this way round than the opposite way and then take the abs)
        vcge.s16        q3, q3, q10             @ test a3[8..15] >= a0[8..15]
        vcge.s16        q10, q10, q9            @ test a0[8..15] >= pq
        vcge.s16        q9, q2, q9              @ test a0[0..7] >= pq
        vsub.i16        q1, q1, q14             @ clip_sign[0..7] - a0_sign[0..7]
        vqsub.u16       q14, q2, q13            @ a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0  (a0 > a3 in all cases where filtering is enabled, so it makes more sense to subtract this way round than the opposite way and then take the abs)
        vcge.s16        q2, q13, q2             @ test a3[0..7] >= a0[0..7]
        vmul.i16        q13, q15, d0[1]         @ a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
        vceq.i16        q15, q11, #0            @ test clip[0..7] == 0
        vmul.i16        q0, q14, d0[1]          @ a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
        vorr            q9, q15, q9             @ test clip[0..7] == 0 || a0[0..7] >= pq
        vceq.i16        q14, q12, #0            @ test clip[8..15] == 0
        vshr.u16        q13, q13, #3            @ a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
        vorr            q2, q9, q2              @ test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
        vshr.u16        q0, q0, #3              @ a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
        vorr            q10, q14, q10           @ test clip[8..15] == 0 || a0[8..15] >= pq
        vcge.s16        q14, q13, q12
        vmov.32         r2, d4[1]               @ move to gp reg
        vorr            q3, q10, q3             @ test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
        vmov.32         r3, d5[1]
        vcge.s16        q2, q0, q11
        vbsl            q14, q12, q13           @ FFMIN(d[8..15], clip[8..15])
        vbsl            q2, q11, q0             @ FFMIN(d[0..7], clip[0..7])
        vmov.32         r5, d6[1]
        vbic            q0, q14, q10            @ set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
        vmov.32         r6, d7[1]
        and             r12, r2, r3
        vbic            q2, q2, q9              @ set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
        vmls.i16        q6, q0, q7              @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4[8..15]
        vmls.i16        q5, q2, q1              @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4[0..7]
        and             r14, r5, r6
        vmla.i16        q4, q2, q1              @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5[0..7]
        and             r12, r12, r14
        vqmovun.s16     d4, q6
        vmla.i16        q8, q0, q7              @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5[8..15]
        tst             r12, #1
        bne             4f                      @ none of the 16 pixel pairs should be updated in this case
        vqmovun.s16     d2, q5
        vqmovun.s16     d3, q4
        vqmovun.s16     d5, q8
        tst             r2, #1
        bne             1f
        vst2.8          {d2[0], d3[0]}, [r0], r1
        vst2.8          {d2[1], d3[1]}, [r0], r1
        vst2.8          {d2[2], d3[2]}, [r0], r1
        vst2.8          {d2[3], d3[3]}, [r0]
1:      add             r0, r4, r1, lsl #2
        tst             r3, #1
        bne             2f
        vst2.8          {d2[4], d3[4]}, [r4], r1
        vst2.8          {d2[5], d3[5]}, [r4], r1
        vst2.8          {d2[6], d3[6]}, [r4], r1
        vst2.8          {d2[7], d3[7]}, [r4]
2:      add             r4, r0, r1, lsl #2
        tst             r5, #1
        bne             3f
        vst2.8          {d4[0], d5[0]}, [r0], r1
        vst2.8          {d4[1], d5[1]}, [r0], r1
        vst2.8          {d4[2], d5[2]}, [r0], r1
        vst2.8          {d4[3], d5[3]}, [r0]
3:      tst             r6, #1
        bne             4f
        vst2.8          {d4[4], d5[4]}, [r4], r1
        vst2.8          {d4[5], d5[5]}, [r4], r1
        vst2.8          {d4[6], d5[6]}, [r4], r1
        vst2.8          {d4[7], d5[7]}, [r4]
4:      vpop            {d8-d15}
        pop             {r4-r6,pc}
endfunc

@ Copy at most the specified number of bytes from the source buffer to the
@ destination buffer, stopping at a multiple of 16 bytes, none of which contain
@ the start of an escape sequence
@ On entry:
@   r0 -> source buffer
@   r1 = max number of bytes to copy
@   r2 -> destination buffer, optimally 8-byte aligned
@ On exit:
@   r0 = number of bytes not copied
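@ In outline (as read from the code below, not quoted from the spec): an
@ escape sequence is the byte pattern 00 00 03 followed by a byte less than 4.
@ The constants 0x03000000 and 0x00030000, combined with vbic/veor/vceq, test
@ each 32-bit lane for exactly that pattern, and the vext.8 #1/#2/#3 rotations
@ repeat the test at every byte alignment.  A 16-byte block is only written to
@ the destination after the start of the following block has been examined as
@ well, so a pattern straddling a block boundary is still caught; when one is
@ found the function stops early and leaves the remaining bytes (including the
@ escape sequence) to the caller.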
function ff_vc1_unescape_buffer_helper_neon, export=1
        @ Offset by 48 to screen out cases that are too short for us to handle,
        @ and also make it easy to test for loop termination and to determine
        @ whether we need an odd number of half-iterations of the loop.
        subs    r1, r1, #48
        bmi     90f

        @ Set up useful constants
        vmov.i32        q0, #0x3000000
        vmov.i32        q1, #0x30000

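        @ The main loop below is software-pipelined: the two half-iterations
        @ at labels 2: and 3: work on alternating register sets (q8/q9 and
        @ q10/q11 respectively), and the extra levels of indentation mark
        @ which of the interleaved iterations each instruction belongs to.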
        tst             r1, #16
        bne             1f

          vld1.8          {q8, q9}, [r0]!
          vbic            q12, q8, q0
          vext.8          q13, q8, q9, #1
          vext.8          q14, q8, q9, #2
          vext.8          q15, q8, q9, #3
          veor            q12, q12, q1
          vbic            q13, q13, q0
          vbic            q14, q14, q0
          vbic            q15, q15, q0
          vceq.i32        q12, q12, #0
          veor            q13, q13, q1
          veor            q14, q14, q1
          veor            q15, q15, q1
          vceq.i32        q13, q13, #0
          vceq.i32        q14, q14, #0
          vceq.i32        q15, q15, #0
          add             r1, r1, #16
          b               3f

1:      vld1.8          {q10, q11}, [r0]!
        vbic            q12, q10, q0
        vext.8          q13, q10, q11, #1
        vext.8          q14, q10, q11, #2
        vext.8          q15, q10, q11, #3
        veor            q12, q12, q1
        vbic            q13, q13, q0
        vbic            q14, q14, q0
        vbic            q15, q15, q0
        vceq.i32        q12, q12, #0
        veor            q13, q13, q1
        veor            q14, q14, q1
        veor            q15, q15, q1
        vceq.i32        q13, q13, #0
        vceq.i32        q14, q14, #0
        vceq.i32        q15, q15, #0
        @ Drop through...
2:        vmov            q8, q11
          vld1.8          {q9}, [r0]!
        vorr            q13, q12, q13
        vorr            q15, q14, q15
          vbic            q12, q8, q0
        vorr            q3, q13, q15
          vext.8          q13, q8, q9, #1
          vext.8          q14, q8, q9, #2
          vext.8          q15, q8, q9, #3
          veor            q12, q12, q1
        vorr            d6, d6, d7
          vbic            q13, q13, q0
          vbic            q14, q14, q0
          vbic            q15, q15, q0
          vceq.i32        q12, q12, #0
        vmov            r3, r12, d6
          veor            q13, q13, q1
          veor            q14, q14, q1
          veor            q15, q15, q1
          vceq.i32        q13, q13, #0
          vceq.i32        q14, q14, #0
          vceq.i32        q15, q15, #0
        orrs            r3, r3, r12
        bne             90f
        vst1.64         {q10}, [r2]!
3:          vmov            q10, q9
            vld1.8          {q11}, [r0]!
          vorr            q13, q12, q13
          vorr            q15, q14, q15
            vbic            q12, q10, q0
          vorr            q3, q13, q15
            vext.8          q13, q10, q11, #1
            vext.8          q14, q10, q11, #2
            vext.8          q15, q10, q11, #3
            veor            q12, q12, q1
          vorr            d6, d6, d7
            vbic            q13, q13, q0
            vbic            q14, q14, q0
            vbic            q15, q15, q0
            vceq.i32        q12, q12, #0
          vmov            r3, r12, d6
            veor            q13, q13, q1
            veor            q14, q14, q1
            veor            q15, q15, q1
            vceq.i32        q13, q13, #0
            vceq.i32        q14, q14, #0
            vceq.i32        q15, q15, #0
          orrs            r3, r3, r12
          bne             91f
          vst1.64         {q8}, [r2]!
        subs            r1, r1, #32
        bpl             2b

90:     add             r0, r1, #48
        bx              lr

91:     sub             r1, r1, #16
        b               90b
endfunc
