1 /*
2  * VC1 NEON optimisations
3  *
4  * Copyright (c) 2010 Rob Clark <rob@ti.com>
5  * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
6  *
7  * This file is part of FFmpeg.
8  *
9  * FFmpeg is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2.1 of the License, or (at your option) any later version.
13  *
14  * FFmpeg is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with FFmpeg; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22  */
23 
24 #include "libavutil/arm/asm.S"
25 #include "neon.S"
26 
27 #include "config.h"
28 
@ Transpose rows into columns of a matrix of 16-bit elements. For 4x4, pass
@ double-word registers; for 8x4, pass quad-word registers.
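@
@ For example, in the double-word case, writing the 16-bit elements of the
@ four input rows as
@   r0 = {a0 a1 a2 a3}   r1 = {b0 b1 b2 b3}
@   r2 = {c0 c1 c2 c3}   r3 = {d0 d1 d2 d3}
@ the two vtrn.16 steps below leave
@   r0 = {a0 b0 a2 b2}   r1 = {a1 b1 a3 b3}
@   r2 = {c0 d0 c2 d2}   r3 = {c1 d1 c3 d3}
@ and the two vtrn.32 steps then put one column in each register:
@   r0 = {a0 b0 c0 d0}   r1 = {a1 b1 c1 d1}
@   r2 = {a2 b2 c2 d2}   r3 = {a3 b3 c3 d3}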
31 .macro transpose16 r0, r1, r2, r3
32         @ At this point:
33         @   row[0]  r0
34         @   row[1]  r1
35         @   row[2]  r2
36         @   row[3]  r3
37 
38         vtrn.16         \r0,  \r1         @ first and second row
39         vtrn.16         \r2,  \r3         @ third and fourth row
40         vtrn.32         \r0,  \r2         @ first and third row
41         vtrn.32         \r1,  \r3         @ second and fourth row
42 
43         @ At this point, if registers are quad-word:
44         @   column[0]   d0
45         @   column[1]   d2
46         @   column[2]   d4
47         @   column[3]   d6
48         @   column[4]   d1
49         @   column[5]   d3
50         @   column[6]   d5
51         @   column[7]   d7
52 
53         @ At this point, if registers are double-word:
54         @   column[0]   d0
55         @   column[1]   d1
56         @   column[2]   d2
57         @   column[3]   d3
58 .endm
59 
@ ff_vc1_inv_trans_{4,8}x{4,8}_neon and overflow: The input values in the file
@ are supposed to fall within a range that allows 16-bit math without
@ overflow, but some files contain values just large enough to overflow in
@ vadd instructions like:
64 @
65 @   vadd.i16  q0, q8, q10
66 @   vshr.s16  q0, q0, #\rshift
67 @
68 @ To prevent these borderline cases from overflowing, we just need one more
69 @ bit of precision, which is accomplished by replacing the sequence above with:
70 @
71 @   vhadd.s16 q0, q8, q10
72 @   vshr.s16  q0, q0, #(\rshift -1)
73 @
@ This works because vhadd adds and then shifts right by one within a single
@ instruction, so the intermediate sum never has to fit in a 16-bit register.
76 @
77 @ Even with this workaround, there were still some files that caused overflows
78 @ in ff_vc1_inv_trans_8x8_neon. See the comments in ff_vc1_inv_trans_8x8_neon
79 @ for the additional workaround.
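@
@ As a scalar illustration of the equivalence (a rough sketch; a, b and
@ rshift stand for the vector lanes and shift amount above, not code taken
@ from vc1dsp.c):
@
@   /* vadd.i16 + vshr.s16: the sum a + b must already fit in int16_t */
@   dst_a = (int16_t)(a + b) >> rshift;
@   /* vhadd.s16 + vshr.s16 #(rshift - 1): the sum is halved before being
@      written back, so it only ever needs 17 bits inside the instruction */
@   dst_b = (int16_t)((a + b) >> 1) >> (rshift - 1);
@
@ For in-range inputs both forms give the same result; for the borderline
@ inputs described above only the second avoids the 16-bit wrap-around.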
80 
@ Takes 4 columns of 8 values each and operates on them. Modeled after the first
82 @ for loop in vc1_inv_trans_4x8_c.
83 @ Input columns: q0 q1 q2 q3
84 @ Output columns: q0 q1 q2 q3
85 @ Trashes: r12 q8 q9 q10 q11 q12 q13
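@
@ As a scalar reference, each column goes through the 4-point inverse
@ transform below (an illustrative sketch assembled from the comments in
@ this macro, not the exact code of vc1_inv_trans_4x8_c):
@
@   t1 = 17 * (src[0] + src[2]) + add;
@   t2 = 17 * (src[0] - src[2]) + add;
@   t3 = 22 * src[1] + 10 * src[3];
@   t4 = 22 * src[3] - 10 * src[1];
@   dst[0] = (t1 + t3) >> rshift;
@   dst[1] = (t2 - t4) >> rshift;
@   dst[2] = (t2 + t4) >> rshift;
@   dst[3] = (t1 - t3) >> rshift;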
86 .macro vc1_inv_trans_4x8_helper add rshift
        @ Compute temp1, temp2 and set up the scalars #17, #22 and #10
88         vadd.i16        q12,   q0,  q2              @ temp1 = src[0] + src[2]
89         movw            r12,   #17
90         vsub.i16        q13,   q0,  q2              @ temp2 = src[0] - src[2]
91         movt            r12,   #22
92         vmov.32         d0[0], r12
93         movw            r12,   #10
94         vmov.16         d1[0], r12
95 
96         vmov.i16        q8,  #\add                  @ t1 will accumulate here
97         vmov.i16        q9,  #\add                  @ t2 will accumulate here
98 
99         vmul.i16        q10, q1,  d0[1]             @ t3 = 22 * (src[1])
100         vmul.i16        q11, q3,  d0[1]             @ t4 = 22 * (src[3])
101 
102         vmla.i16        q8,  q12, d0[0]             @ t1 = 17 * (temp1) + 4
103         vmla.i16        q9,  q13, d0[0]             @ t2 = 17 * (temp2) + 4
104 
105         vmla.i16        q10, q3,  d1[0]             @ t3 += 10 * src[3]
106         vmls.i16        q11, q1,  d1[0]             @ t4 -= 10 * src[1]
107 
108         vhadd.s16       q0,  q8,  q10               @ dst[0] = (t1 + t3) >> 1
109         vhsub.s16       q3,  q8,  q10               @ dst[3] = (t1 - t3) >> 1
110         vhsub.s16       q1,  q9,  q11               @ dst[1] = (t2 - t4) >> 1
111         vhadd.s16       q2,  q9,  q11               @ dst[2] = (t2 + t4) >> 1
112 
113         @ Halving add/sub above already did one shift
114         vshr.s16        q0,  q0,  #(\rshift - 1)    @ dst[0] >>= (rshift - 1)
115         vshr.s16        q3,  q3,  #(\rshift - 1)    @ dst[3] >>= (rshift - 1)
116         vshr.s16        q1,  q1,  #(\rshift - 1)    @ dst[1] >>= (rshift - 1)
117         vshr.s16        q2,  q2,  #(\rshift - 1)    @ dst[2] >>= (rshift - 1)
118 .endm
119 
@ Takes 8 columns of 4 values each and operates on them. Modeled after the second
121 @ for loop in vc1_inv_trans_4x8_c.
122 @ Input columns: d0 d2 d4 d6 d1 d3 d5 d7
123 @ Output columns: d16 d17 d18 d19 d21 d20 d23 d22
124 @ Trashes all NEON registers (and r12) except for: q4 q5 q6 q7
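@
@ As a scalar reference, the 8 inputs src[0], src[8], ..., src[56] go through
@ the 8-point inverse transform below (an illustrative sketch assembled from
@ the comments in this macro, not the exact code of vc1_inv_trans_4x8_c;
@ add1 is 1 when add1beforeshift is set, otherwise 0):
@
@   t1 = 12 * (src[0] + src[32]) + add;   t3 = 16 * src[16] +  6 * src[48];
@   t2 = 12 * (src[0] - src[32]) + add;   t4 =  6 * src[16] - 16 * src[48];
@   t5 = t1 + t3;   t6 = t2 + t4;   t7 = t2 - t4;   t8 = t1 - t3;
@   t1 = 16 * src[8] + 15 * src[24] +  9 * src[40] +  4 * src[56];
@   t2 = 15 * src[8] -  4 * src[24] - 16 * src[40] -  9 * src[56];
@   t3 =  9 * src[8] - 16 * src[24] +  4 * src[40] + 15 * src[56];
@   t4 =  4 * src[8] -  9 * src[24] + 15 * src[40] - 16 * src[56];
@   dst[0] = (t5 + t1) >> rshift;   dst[7] = (t5 - t1 + add1) >> rshift;
@   dst[1] = (t6 + t2) >> rshift;   dst[6] = (t6 - t2 + add1) >> rshift;
@   dst[2] = (t7 + t3) >> rshift;   dst[5] = (t7 - t3 + add1) >> rshift;
@   dst[3] = (t8 + t4) >> rshift;   dst[4] = (t8 - t4 + add1) >> rshift;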
125 .macro vc1_inv_trans_8x4_helper add add1beforeshift rshift
126         @ At this point:
127         @   src[0]      d0 overwritten later
128         @   src[8]      d2
129         @   src[16]     d4 overwritten later
130         @   src[24]     d6
131         @   src[32]     d1 overwritten later
132         @   src[40]     d3
133         @   src[48]     d5 overwritten later
134         @   src[56]     d7
135 
136         movw            r12,   #12
137         vmov.i16        q14,   #\add            @ t1|t2 will accumulate here
138         movt            r12,   #6
139 
140         vadd.i16        d20,   d0,  d1          @ temp1 = src[0] + src[32]
141         vsub.i16        d21,   d0,  d1          @ temp2 = src[0] - src[32]
142         vmov.i32        d0[0], r12              @ 16-bit: d0[0] = #12, d0[1] = #6
143 
144         vshl.i16        q15,   q2,  #4          @ t3|t4 = 16 * (src[16]|src[48])
145         vswp            d4,    d5               @ q2 = src[48]|src[16]
146         vmla.i16        q14,   q10, d0[0]       @ t1|t2 = 12 * (temp1|temp2) + 64
147         movw            r12,   #15
148         movt            r12,   #9
149         vmov.i32        d0[1], r12              @ 16-bit: d0[2] = #15, d0[3] = #9
150         vneg.s16        d31,   d31              @ t4 = -t4
151         vmla.i16        q15,   q2,  d0[1]       @ t3|t4 += 6 * (src[48]|src[16])
152 
153         @ At this point:
154         @   d0[2]   #15
155         @   d0[3]   #9
156         @   q1      src[8]|src[40]
157         @   q3      src[24]|src[56]
158         @   q14     old t1|t2
159         @   q15     old t3|t4
160 
161         vshl.i16        q8,  q1,  #4            @ t1|t2 = 16 * (src[8]|src[40])
162         vswp            d2,  d3                 @ q1 = src[40]|src[8]
        vshl.i16        q12, q3,  #4            @ temp3a|temp4a = 16 * (src[24]|src[56])
164         vswp            d6,  d7                 @ q3 = src[56]|src[24]
165         vshl.i16        q13, q1,  #2            @ temp3b|temp4b = 4 * (src[40]|src[8])
166         vshl.i16        q2,  q3,  #2            @ temp1|temp2 = 4 * (src[56]|src[24])
167         vswp            d3,  d6                 @ q1 = src[40]|src[56], q3 = src[8]|src[24]
168         vsub.i16        q9,  q13, q12           @ t3|t4 = - (temp3a|temp4a) + (temp3b|temp4b)
169         vadd.i16        q8,  q8,  q2            @ t1|t2 += temp1|temp2
170         vmul.i16        q12, q3,  d0[3]         @ temp3|temp4 = 9 * src[8]|src[24]
171         vmla.i16        q8,  q1,  d0[3]         @ t1|t2 += 9 * (src[40]|src[56])
172         vswp            d6,  d7                 @ q3 = src[24]|src[8]
173         vswp            d2,  d3                 @ q1 = src[56]|src[40]
174 
175         vsub.i16        q11, q14, q15           @ t8|t7 = old t1|t2 - old t3|t4
176         vadd.i16        q10, q14, q15           @ t5|t6 = old t1|t2 + old t3|t4
177   .if \add1beforeshift
178         vmov.i16        q15, #1
179   .endif
180 
181         vadd.i16        d18, d18, d24           @ t3 += temp3
182         vsub.i16        d19, d19, d25           @ t4 -= temp4
183 
184         vswp            d22, d23                @ q11 = t7|t8
185 
186         vneg.s16        d17, d17                @ t2 = -t2
187         vmla.i16        q9,  q1,  d0[2]         @ t3|t4 += 15 * src[56]|src[40]
188         vmla.i16        q8,  q3,  d0[2]         @ t1|t2 += 15 * src[24]|src[8]
189 
190         @ At this point:
191         @   t1  d16
192         @   t2  d17
193         @   t3  d18
194         @   t4  d19
195         @   t5  d20
196         @   t6  d21
197         @   t7  d22
198         @   t8  d23
199         @   #1  q15
200 
201   .if \add1beforeshift
202         vadd.i16        q3,  q15, q10           @ line[7,6] = t5|t6 + 1
203         vadd.i16        q2,  q15, q11           @ line[5,4] = t7|t8 + 1
204   .endif
205 
206         @ Sometimes this overflows, so to get one additional bit of precision, use
207         @ a single instruction that both adds and shifts right (halving).
208         vhadd.s16       q1,  q9,  q11           @ line[2,3] = (t3|t4 + t7|t8) >> 1
209         vhadd.s16       q0,  q8,  q10           @ line[0,1] = (t1|t2 + t5|t6) >> 1
210   .if \add1beforeshift
211         vhsub.s16       q2,  q2,  q9            @ line[5,4] = (t7|t8 - t3|t4 + 1) >> 1
212         vhsub.s16       q3,  q3,  q8            @ line[7,6] = (t5|t6 - t1|t2 + 1) >> 1
213   .else
214         vhsub.s16       q2,  q11, q9            @ line[5,4] = (t7|t8 - t3|t4) >> 1
215         vhsub.s16       q3,  q10, q8            @ line[7,6] = (t5|t6 - t1|t2) >> 1
216   .endif
217 
218         vshr.s16        q9,  q1,  #(\rshift - 1)    @ one shift is already done by vhadd/vhsub above
219         vshr.s16        q8,  q0,  #(\rshift - 1)
220         vshr.s16        q10, q2,  #(\rshift - 1)
221         vshr.s16        q11, q3,  #(\rshift - 1)
222 
223         @ At this point:
224         @   dst[0]   d16
225         @   dst[1]   d17
226         @   dst[2]   d18
227         @   dst[3]   d19
228         @   dst[4]   d21
229         @   dst[5]   d20
230         @   dst[6]   d23
231         @   dst[7]   d22
232 .endm
233 
234 @ This is modeled after the first and second for loop in vc1_inv_trans_8x8_c.
235 @ Input columns:  q8, q9, q10, q11, q12, q13, q14, q15
236 @ Output columns: q8, q9, q10, q11, q12, q13, q14, q15
237 @ Trashes all NEON registers (and r12) except for: q4 q5 q6 q7
238 .macro vc1_inv_trans_8x8_helper add add1beforeshift rshift
239         @ This actually computes half of t1, t2, t3, t4, as explained below
240         @ near `tNhalf`.
241         vmov.i16        q0,    #(6 / 2)         @ q0 = #6/2
242         vshl.i16        q1,    q10, #3          @ t3 = 16/2 * src[16]
243         vshl.i16        q3,    q14, #3          @ temp4 = 16/2 * src[48]
244         vmul.i16        q2,    q10, q0          @ t4 = 6/2 * src[16]
245         vmla.i16        q1,    q14, q0          @ t3 += 6/2 * src[48]
246         @ unused: q0, q10, q14
247         vmov.i16        q0,    #(12 / 2)        @ q0 = #12/2
248         vadd.i16        q10,   q8,  q12         @ temp1 = src[0] + src[32]
249         vsub.i16        q14,   q8,  q12         @ temp2 = src[0] - src[32]
250         @ unused: q8, q12
251         vmov.i16        q8,    #(\add / 2)      @ t1 will accumulate here
252         vmov.i16        q12,   #(\add / 2)      @ t2 will accumulate here
253         movw            r12,   #15
254         vsub.i16        q2,    q2,  q3          @ t4 = 6/2 * src[16] - 16/2 * src[48]
255         movt            r12,   #9
256         @ unused: q3
257         vmla.i16        q8,    q10, q0          @ t1 = 12/2 * temp1 + add
258         vmla.i16        q12,   q14, q0          @ t2 = 12/2 * temp2 + add
259         vmov.i32        d0[0], r12
260         @ unused: q3, q10, q14
261 
262         @ At this point:
263         @   q0          d0=#15|#9
264         @   q1  old t3
265         @   q2  old t4
266         @   q3
267         @   q8  old t1
268         @   q9          src[8]
269         @   q10
270         @   q11         src[24]
271         @   q12 old t2
272         @   q13         src[40]
273         @   q14
274         @   q15         src[56]
275 
276         @ unused: q3, q10, q14
277         movw            r12,   #16
278         vshl.i16        q3,    q9,  #4          @ t1 = 16 * src[8]
279         movt            r12,   #4
280         vshl.i16        q10,   q9,  #2          @ t4 = 4 * src[8]
281         vmov.i32        d1[0], r12
282         vmul.i16        q14,   q9,  d0[0]       @ t2 = 15 * src[8]
283         vmul.i16        q9,    q9,  d0[1]       @ t3 = 9 * src[8]
284         @ unused: none
285         vmla.i16        q3,    q11, d0[0]       @ t1 += 15 * src[24]
286         vmls.i16        q10,   q11, d0[1]       @ t4 -= 9 * src[24]
287         vmls.i16        q14,   q11, d1[1]       @ t2 -= 4 * src[24]
288         vmls.i16        q9,    q11, d1[0]       @ t3 -= 16 * src[24]
289         @ unused: q11
290         vmla.i16        q3,    q13, d0[1]       @ t1 += 9 * src[40]
291         vmla.i16        q10,   q13, d0[0]       @ t4 += 15 * src[40]
292         vmls.i16        q14,   q13, d1[0]       @ t2 -= 16 * src[40]
293         vmla.i16        q9,    q13, d1[1]       @ t3 += 4 * src[40]
294         @ unused: q11, q13
295 
296         @ Compute t5, t6, t7, t8 from old t1, t2, t3, t4. Actually, it computes
297         @ half of t5, t6, t7, t8 since t1, t2, t3, t4 are halved.
298         vadd.i16        q11,   q8,  q1          @ t5 = t1 + t3
299         vsub.i16        q1,    q8,  q1          @ t8 = t1 - t3
300         vadd.i16        q13,   q12, q2          @ t6 = t2 + t4
301         vsub.i16        q2,    q12, q2          @ t7 = t2 - t4
302         @ unused: q8, q12
303 
304   .if \add1beforeshift
305         vmov.i16        q12,   #1
306   .endif
307 
308         @ unused: q8
309         vmla.i16        q3,    q15, d1[1]       @ t1 += 4 * src[56]
310         vmls.i16        q14,   q15, d0[1]       @ t2 -= 9 * src[56]
311         vmla.i16        q9,    q15, d0[0]       @ t3 += 15 * src[56]
312         vmls.i16        q10,   q15, d1[0]       @ t4 -= 16 * src[56]
313         @ unused: q0, q8, q15
314 
315         @ At this point:
316         @   t1      q3
317         @   t2      q14
318         @   t3      q9
319         @   t4      q10
320         @   t5half  q11
321         @   t6half  q13
322         @   t7half  q2
323         @   t8half  q1
324         @   #1      q12
325         @
326         @ tNhalf is half of the value of tN (as described in vc1_inv_trans_8x8_c).
327         @ This is done because sometimes files have input that causes tN + tM to
328         @ overflow. To avoid this overflow, we compute tNhalf, then compute
        @ tNhalf + tM (which doesn't overflow), and then use vhadd to compute
        @ (tNhalf + (tNhalf + tM)) >> 1, which cannot overflow because vhadd
        @ keeps the intermediate sum at full precision before halving it.
332 
333         @ For each pair of tN and tM, do:
334         @   lineA = t5half + t1
335         @   if add1beforeshift:  t1 -= 1
336         @   lineA = (t5half + lineA) >> 1
337         @   lineB = t5half - t1
338         @   lineB = (t5half + lineB) >> 1
339         @   lineA >>= rshift - 1
340         @   lineB >>= rshift - 1
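        @
        @ In full precision this relies on the identity (valid because tN is
        @ exactly 2 * tNhalf and the shifts are arithmetic):
        @
        @   (tN + tM) >> rshift == ((tNhalf + (tNhalf + tM)) >> 1) >> (rshift - 1)
        @
        @ where the inner ">> 1" is the halving that vhadd performs without
        @ any intermediate 16-bit overflow.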
341 
342         vadd.i16        q8,  q11, q3                @ q8 = t5half + t1
343   .if \add1beforeshift
344         vsub.i16        q3,  q3,  q12               @ q3 = t1 - 1
345   .endif
346 
347         vadd.i16        q0,  q13, q14               @ q0  = t6half + t2
348   .if \add1beforeshift
349         vsub.i16        q14, q14, q12               @ q14 = t2 - 1
350   .endif
351 
352         vadd.i16        q15, q2,  q9                @ q15 = t7half + t3
353   .if \add1beforeshift
354         vsub.i16        q9,  q9,  q12               @ q9  = t3 - 1
355   .endif
356         @ unused: none
357 
358         vhadd.s16       q8,  q11, q8                @ q8  = (t5half + t5half + t1) >> 1
359         vsub.i16        q3,  q11, q3                @ q3  = t5half - t1 + 1
360 
361         vhadd.s16       q0,  q13, q0                @ q0  = (t6half + t6half + t2) >> 1
362         vsub.i16        q14, q13, q14               @ q14 = t6half - t2 + 1
363 
364         vhadd.s16       q15, q2,  q15               @ q15 = (t7half + t7half + t3) >> 1
365         vsub.i16        q9,  q2,  q9                @ q9  = t7half - t3 + 1
366 
367         vhadd.s16       q3,  q11, q3                @ q3  = (t5half + t5half - t1 + 1) >> 1
368         @ unused: q11
369 
370         vadd.i16        q11, q1,  q10               @ q11 = t8half + t4
371   .if \add1beforeshift
372         vsub.i16        q10, q10, q12               @ q10 = t4 - 1
373   .endif
374         @ unused: q12
375 
376         vhadd.s16       q14, q13, q14               @ q14 = (t6half + t6half - t2 + 1) >> 1
377         @ unused: q12, q13
378         vhadd.s16       q13, q2,  q9                @ q9  = (t7half + t7half - t3 + 1) >> 1
379         @ unused: q12, q2, q9
380 
381         vsub.i16        q10, q1,  q10               @ q10 = t8half - t4 + 1
382         vhadd.s16       q11, q1,  q11               @ q11 = (t8half + t8half + t4) >> 1
383 
384         vshr.s16        q8,  q8,  #(\rshift - 1)    @ q8  = line[0]
385         vhadd.s16       q12, q1,  q10               @ q12 = (t8half + t8half - t4 + 1) >> 1
386         vshr.s16        q9,  q0,  #(\rshift - 1)    @ q9  = line[1]
387         vshr.s16        q10, q15, #(\rshift - 1)    @ q10 = line[2]
388         vshr.s16        q11, q11, #(\rshift - 1)    @ q11 = line[3]
389         vshr.s16        q12, q12, #(\rshift - 1)    @ q12 = line[4]
390         vshr.s16        q13, q13, #(\rshift - 1)    @ q13 = line[5]
391         vshr.s16        q14, q14, #(\rshift - 1)    @ q14 = line[6]
392         vshr.s16        q15, q3,  #(\rshift - 1)    @ q15 = line[7]
393 .endm
394 
395 @ (int16_t *block [r0])
396 function ff_vc1_inv_trans_8x8_neon, export=1
397         vld1.64         {q8-q9},   [r0,:128]!
398         vld1.64         {q10-q11}, [r0,:128]!
399         vld1.64         {q12-q13}, [r0,:128]!
400         vld1.64         {q14-q15}, [r0,:128]
401         sub             r0, r0, #(16 * 2 * 3)   @ restore r0
402 
403         @ At this point:
404         @   src[0]  q8
405         @   src[8]  q9
406         @   src[16] q10
407         @   src[24] q11
408         @   src[32] q12
409         @   src[40] q13
410         @   src[48] q14
411         @   src[56] q15
412 
413         vc1_inv_trans_8x8_helper add=4, add1beforeshift=0, rshift=3
414 
415         @ Transpose result matrix of 8x8
416         swap4           d17, d19, d21, d23, d24, d26, d28, d30
417         transpose16_4x4 q8,  q9,  q10, q11, q12, q13, q14, q15
418 
419         vc1_inv_trans_8x8_helper add=64, add1beforeshift=1, rshift=7
420 
421         vst1.64         {q8-q9},   [r0,:128]!
422         vst1.64         {q10-q11}, [r0,:128]!
423         vst1.64         {q12-q13}, [r0,:128]!
424         vst1.64         {q14-q15}, [r0,:128]
425 
426         bx              lr
427 endfunc
428 
429 @ (uint8_t *dest [r0], ptrdiff_t stride [r1], int16_t *block [r2])
430 function ff_vc1_inv_trans_8x4_neon, export=1
431         vld1.64         {q0-q1}, [r2,:128]!     @ load 8 * 4 * 2 = 64 bytes / 16 bytes per quad = 4 quad registers
432         vld1.64         {q2-q3}, [r2,:128]
433 
434         transpose16     q0, q1, q2, q3          @ transpose rows to columns
435 
436         @ At this point:
437         @   src[0]   d0
438         @   src[1]   d2
439         @   src[2]   d4
440         @   src[3]   d6
441         @   src[4]   d1
442         @   src[5]   d3
443         @   src[6]   d5
444         @   src[7]   d7
445 
446         vc1_inv_trans_8x4_helper    add=4, add1beforeshift=0, rshift=3
447 
448         @ Move output to more standardized registers
449         vmov        d0, d16
450         vmov        d2, d17
451         vmov        d4, d18
452         vmov        d6, d19
453         vmov        d1, d21
454         vmov        d3, d20
455         vmov        d5, d23
456         vmov        d7, d22
457 
458         @ At this point:
459         @   dst[0]   d0
460         @   dst[1]   d2
461         @   dst[2]   d4
462         @   dst[3]   d6
463         @   dst[4]   d1
464         @   dst[5]   d3
465         @   dst[6]   d5
466         @   dst[7]   d7
467 
468         transpose16     q0, q1, q2, q3   @ turn columns into rows
469 
470         @ At this point:
471         @   row[0] q0
472         @   row[1] q1
473         @   row[2] q2
474         @   row[3] q3
475 
476         vc1_inv_trans_4x8_helper    add=64, rshift=7
477 
478         @ At this point:
479         @   line[0].l   d0
480         @   line[0].h   d1
481         @   line[1].l   d2
482         @   line[1].h   d3
483         @   line[2].l   d4
484         @   line[2].h   d5
485         @   line[3].l   d6
486         @   line[3].h   d7
487 
488         @ unused registers: q12, q13, q14, q15
489 
490         vld1.64         {d28}, [r0,:64], r1     @ read dest
491         vld1.64         {d29}, [r0,:64], r1
492         vld1.64         {d30}, [r0,:64], r1
493         vld1.64         {d31}, [r0,:64], r1
494         sub             r0,  r0,  r1, lsl #2    @ restore original r0 value
495 
496         vaddw.u8        q0,  q0,  d28           @ line[0] += dest[0]
497         vaddw.u8        q1,  q1,  d29           @ line[1] += dest[1]
498         vaddw.u8        q2,  q2,  d30           @ line[2] += dest[2]
499         vaddw.u8        q3,  q3,  d31           @ line[3] += dest[3]
500 
501         vqmovun.s16     d0,  q0                 @ line[0]
502         vqmovun.s16     d1,  q1                 @ line[1]
503         vqmovun.s16     d2,  q2                 @ line[2]
504         vqmovun.s16     d3,  q3                 @ line[3]
505 
506         vst1.64         {d0},  [r0,:64], r1     @ write dest
507         vst1.64         {d1},  [r0,:64], r1
508         vst1.64         {d2},  [r0,:64], r1
509         vst1.64         {d3},  [r0,:64]
510 
511         bx              lr
512 endfunc
513 
514 @ (uint8_t *dest [r0], ptrdiff_t stride [r1], int16_t *block [r2])
515 function ff_vc1_inv_trans_4x8_neon, export=1
516         mov             r12, #(8 * 2)  @ 8 elements per line, each element 2 bytes
517         vld4.16         {d0[],  d2[],  d4[],  d6[]},  [r2,:64], r12     @ read each column into a q register
518         vld4.16         {d0[1], d2[1], d4[1], d6[1]}, [r2,:64], r12
519         vld4.16         {d0[2], d2[2], d4[2], d6[2]}, [r2,:64], r12
520         vld4.16         {d0[3], d2[3], d4[3], d6[3]}, [r2,:64], r12
521         vld4.16         {d1[],  d3[],  d5[],  d7[]},  [r2,:64], r12
522         vld4.16         {d1[1], d3[1], d5[1], d7[1]}, [r2,:64], r12
523         vld4.16         {d1[2], d3[2], d5[2], d7[2]}, [r2,:64], r12
524         vld4.16         {d1[3], d3[3], d5[3], d7[3]}, [r2,:64]
525 
526         vc1_inv_trans_4x8_helper    add=4, rshift=3
527 
528         @ At this point:
529         @   dst[0] = q0
530         @   dst[1] = q1
531         @   dst[2] = q2
532         @   dst[3] = q3
533 
534         transpose16     q0, q1, q2, q3  @ Transpose rows (registers) into columns
535 
536         vc1_inv_trans_8x4_helper    add=64, add1beforeshift=1, rshift=7
537 
538         vld1.32         {d28[]},  [r0,:32], r1  @ read dest
539         vld1.32         {d28[1]}, [r0,:32], r1
540         vld1.32         {d29[]},  [r0,:32], r1
541         vld1.32         {d29[1]}, [r0,:32], r1
542 
543         vld1.32         {d30[]},  [r0,:32], r1
544         vld1.32         {d30[0]}, [r0,:32], r1
545         vld1.32         {d31[]},  [r0,:32], r1
546         vld1.32         {d31[0]}, [r0,:32], r1
547         sub             r0,  r0,  r1, lsl #3    @ restore original r0 value
548 
549         vaddw.u8        q8,  q8,  d28           @ line[0,1] += dest[0,1]
550         vaddw.u8        q9,  q9,  d29           @ line[2,3] += dest[2,3]
551         vaddw.u8        q10, q10, d30           @ line[5,4] += dest[5,4]
552         vaddw.u8        q11, q11, d31           @ line[7,6] += dest[7,6]
553 
554         vqmovun.s16     d16, q8                 @ clip(line[0,1])
555         vqmovun.s16     d18, q9                 @ clip(line[2,3])
556         vqmovun.s16     d20, q10                @ clip(line[5,4])
557         vqmovun.s16     d22, q11                @ clip(line[7,6])
558 
559         vst1.32         {d16[0]}, [r0,:32], r1  @ write dest
560         vst1.32         {d16[1]}, [r0,:32], r1
561         vst1.32         {d18[0]}, [r0,:32], r1
562         vst1.32         {d18[1]}, [r0,:32], r1
563 
564         vst1.32         {d20[1]}, [r0,:32], r1
565         vst1.32         {d20[0]}, [r0,:32], r1
566         vst1.32         {d22[1]}, [r0,:32], r1
567         vst1.32         {d22[0]}, [r0,:32]
568 
569         bx              lr
570 endfunc
571 
@ Set up constants in registers used by vc1_inv_trans_4x4_helper
573 .macro vc1_inv_trans_4x4_helper_setup
574         vmov.i16        q13, #17
575         vmov.i16        q14, #22
576         vmov.i16        d30, #10                @ only need double-word, not quad-word
577 .endm
578 
579 @ This is modeled after the first for loop in vc1_inv_trans_4x4_c.
580 .macro vc1_inv_trans_4x4_helper add rshift
581         vmov.i16        q2,  #\add              @ t1|t2 will accumulate here
582 
583         vadd.i16        d16, d0,  d1            @ temp1 = src[0] + src[2]
584         vsub.i16        d17, d0,  d1            @ temp2 = src[0] - src[2]
585         vmul.i16        q3,  q14, q1            @ t3|t4 = 22 * (src[1]|src[3])
586         vmla.i16        q2,  q13, q8            @ t1|t2 = 17 * (temp1|temp2) + add
587         vmla.i16        d6,  d30, d3            @ t3 += 10 * src[3]
588         vmls.i16        d7,  d30, d2            @ t4 -= 10 * src[1]
589 
590         vadd.i16        q0,  q2,  q3            @ dst[0,2] = (t1|t2 + t3|t4)
591         vsub.i16        q1,  q2,  q3            @ dst[3,1] = (t1|t2 - t3|t4)
592         vshr.s16        q0,  q0,  #\rshift      @ dst[0,2] >>= rshift
593         vshr.s16        q1,  q1,  #\rshift      @ dst[3,1] >>= rshift
594 .endm
595 
596 @ (uint8_t *dest [r0], ptrdiff_t stride [r1], int16_t *block [r2])
597 function ff_vc1_inv_trans_4x4_neon, export=1
598         mov             r12, #(8 * 2)  @ 8 elements per line, each element 2 bytes
599         vld4.16         {d0[],  d1[],  d2[],  d3[]},  [r2,:64], r12     @ read each column into a register
600         vld4.16         {d0[1], d1[1], d2[1], d3[1]}, [r2,:64], r12
601         vld4.16         {d0[2], d1[2], d2[2], d3[2]}, [r2,:64], r12
602         vld4.16         {d0[3], d1[3], d2[3], d3[3]}, [r2,:64]
603 
604         vswp            d1,  d2         @ so that we can later access column 1 and column 3 as a single q1 register
605 
606         vc1_inv_trans_4x4_helper_setup
607 
608         @ At this point:
609         @   src[0] = d0
610         @   src[1] = d2
611         @   src[2] = d1
612         @   src[3] = d3
613 
614         vc1_inv_trans_4x4_helper add=4, rshift=3     @ compute t1, t2, t3, t4 and combine them into dst[0-3]
615 
616         @ At this point:
617         @   dst[0] = d0
618         @   dst[1] = d3
619         @   dst[2] = d1
620         @   dst[3] = d2
621 
622         transpose16     d0, d3, d1, d2  @ Transpose rows (registers) into columns
623 
624         @ At this point:
625         @   src[0]  = d0
626         @   src[8]  = d3
627         @   src[16] = d1
628         @   src[24] = d2
629 
630         vswp            d2,  d3         @ so that we can later access column 1 and column 3 in order as a single q1 register
631 
632         @ At this point:
633         @   src[0]  = d0
634         @   src[8]  = d2
635         @   src[16] = d1
636         @   src[24] = d3
637 
638         vc1_inv_trans_4x4_helper add=64, rshift=7             @ compute t1, t2, t3, t4 and combine them into dst[0-3]
639 
640         @ At this point:
641         @   line[0] = d0
642         @   line[1] = d3
643         @   line[2] = d1
644         @   line[3] = d2
645 
646         vld1.32         {d18[]},  [r0,:32], r1  @ read dest
647         vld1.32         {d19[]},  [r0,:32], r1
648         vld1.32         {d18[1]}, [r0,:32], r1
649         vld1.32         {d19[0]}, [r0,:32], r1
650         sub             r0,  r0,  r1, lsl #2    @ restore original r0 value
651 
652         vaddw.u8        q0,  q0,  d18           @ line[0,2] += dest[0,2]
653         vaddw.u8        q1,  q1,  d19           @ line[3,1] += dest[3,1]
654 
655         vqmovun.s16     d0,  q0                 @ clip(line[0,2])
656         vqmovun.s16     d1,  q1                 @ clip(line[3,1])
657 
658         vst1.32         {d0[0]},  [r0,:32], r1  @ write dest
659         vst1.32         {d1[1]},  [r0,:32], r1
660         vst1.32         {d0[1]},  [r0,:32], r1
661         vst1.32         {d1[0]},  [r0,:32]
662 
663         bx              lr
664 endfunc
665 
@ The absolute values of the multiplication constants from vc1_mspel_filter and vc1_mspel_{ver,hor}_filter_16bits.
667 @ The sign is embedded in the code below that carries out the multiplication (mspel_filter{,.16}).
668 #define MSPEL_MODE_1_MUL_CONSTANTS  4, 53, 18, 3
669 #define MSPEL_MODE_2_MUL_CONSTANTS  1, 9,  9,  1
670 #define MSPEL_MODE_3_MUL_CONSTANTS  3, 18, 53, 4
671 
@ These constants come from reading the source of vc1_mspel_mc and determining the value that
@ is added to `rnd` to form the variable `r`, as well as the value of the variable `shift`.
674 #define MSPEL_MODES_11_ADDSHIFT_CONSTANTS   15, 5
675 #define MSPEL_MODES_12_ADDSHIFT_CONSTANTS   3,  3
676 #define MSPEL_MODES_13_ADDSHIFT_CONSTANTS   15, 5
677 #define MSPEL_MODES_21_ADDSHIFT_CONSTANTS   MSPEL_MODES_12_ADDSHIFT_CONSTANTS
678 #define MSPEL_MODES_22_ADDSHIFT_CONSTANTS   0,  1
679 #define MSPEL_MODES_23_ADDSHIFT_CONSTANTS   3,  3
680 #define MSPEL_MODES_31_ADDSHIFT_CONSTANTS   MSPEL_MODES_13_ADDSHIFT_CONSTANTS
681 #define MSPEL_MODES_32_ADDSHIFT_CONSTANTS   MSPEL_MODES_23_ADDSHIFT_CONSTANTS
682 #define MSPEL_MODES_33_ADDSHIFT_CONSTANTS   15, 5
683 
684 @ The addition and shift constants from vc1_mspel_filter.
685 #define MSPEL_MODE_1_ADDSHIFT_CONSTANTS     32, 6
686 #define MSPEL_MODE_2_ADDSHIFT_CONSTANTS     8,  4
687 #define MSPEL_MODE_3_ADDSHIFT_CONSTANTS     32, 6
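@
@ For reference, each output pixel of the 4-tap filter is computed roughly as
@ (an illustrative sketch based on the mspel_filter macro below; A, B, C, D
@ are the per-mode multiplication constants above, r/shift the add/shift
@ constants, and p0..p3 four consecutive samples along the filtered
@ direction):
@
@   dst = av_clip_uint8((-A * p0 + B * p1 + C * p2 - D * p3 + r) >> shift);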
688 
@ Set up constants in registers for subsequent use by mspel_filter{,.16}.
690 .macro mspel_constants typesize reg_a reg_b reg_c reg_d filter_a filter_b filter_c filter_d reg_add filter_add_register
691   @ Typesize should be i8 or i16.
692 
  @ Only set a register if its constant is not 1 and not a duplicate of another
694   .if \filter_a != 1
695         vmov.\typesize  \reg_a,  #\filter_a          @ reg_a = filter_a
696   .endif
697         vmov.\typesize  \reg_b,  #\filter_b          @ reg_b = filter_b
698   .if \filter_b != \filter_c
699         vmov.\typesize  \reg_c,  #\filter_c          @ reg_c = filter_c
700   .endif
701   .if \filter_d != 1
702         vmov.\typesize  \reg_d,  #\filter_d          @ reg_d = filter_d
703   .endif
  @ Duplicate the add value into elements twice the width of typesize
705   .ifc \typesize,i8
706         vdup.16         \reg_add,  \filter_add_register     @ reg_add = filter_add_register
707   .else
708         vdup.32         \reg_add,  \filter_add_register     @ reg_add = filter_add_register
709   .endif
710 .endm
711 
712 @ After mspel_constants has been used, do the filtering.
713 .macro mspel_filter acc dest src0 src1 src2 src3 filter_a filter_b filter_c filter_d reg_a reg_b reg_c reg_d reg_add filter_shift narrow=1
714   .if \filter_a != 1
715         @ If filter_a != 1, then we need a move and subtract instruction
716         vmov            \acc,  \reg_add                     @ acc = reg_add
717         vmlsl.u8        \acc,  \reg_a,  \src0               @ acc -= filter_a * src[-stride]
718   .else
719         @ If filter_a is 1, then just subtract without an extra move
720         vsubw.u8        \acc,  \reg_add,  \src0             @ acc = reg_add - src[-stride]      @ since filter_a == 1
721   .endif
722         vmlal.u8        \acc,  \reg_b,  \src1               @ acc += filter_b * src[0]
723   .if \filter_b != \filter_c
724         vmlal.u8        \acc,  \reg_c,  \src2               @ acc += filter_c * src[stride]
725   .else
726         @ If filter_b is the same as filter_c, use the same reg_b register
727         vmlal.u8        \acc,  \reg_b,  \src2               @ acc += filter_c * src[stride]     @ where filter_c == filter_b
728   .endif
729   .if \filter_d != 1
        @ If filter_d != 1, then use a widening multiply-subtract
731         vmlsl.u8        \acc,  \reg_d,  \src3               @ acc -= filter_d * src[stride * 2]
732   .else
733         @ If filter_d is 1, then just do a subtract
734         vsubw.u8        \acc,  \acc,    \src3               @ acc -= src[stride * 2]            @ since filter_d == 1
735   .endif
736   .if \narrow
737         vqshrun.s16     \dest, \acc,    #\filter_shift      @ dest = clip_uint8(acc >> filter_shift)
738   .else
739         vshr.s16        \dest, \acc,    #\filter_shift      @ dest = acc >> filter_shift
740   .endif
741 .endm
742 
743 @ This is similar to mspel_filter, but the input is 16-bit instead of 8-bit and narrow=0 is not supported.
744 .macro mspel_filter.16 acc0 acc1 acc0_0 acc0_1 dest src0 src1 src2 src3 src4 src5 src6 src7 filter_a filter_b filter_c filter_d reg_a reg_b reg_c reg_d reg_add filter_shift
745   .if \filter_a != 1
746         vmov            \acc0,  \reg_add
747         vmov            \acc1,  \reg_add
748         vmlsl.s16       \acc0,  \reg_a,  \src0
749         vmlsl.s16       \acc1,  \reg_a,  \src1
750   .else
751         vsubw.s16       \acc0,  \reg_add,  \src0
752         vsubw.s16       \acc1,  \reg_add,  \src1
753   .endif
754         vmlal.s16       \acc0,  \reg_b,  \src2
755         vmlal.s16       \acc1,  \reg_b,  \src3
756   .if \filter_b != \filter_c
757         vmlal.s16       \acc0,  \reg_c,  \src4
758         vmlal.s16       \acc1,  \reg_c,  \src5
759   .else
760         vmlal.s16       \acc0,  \reg_b,  \src4
761         vmlal.s16       \acc1,  \reg_b,  \src5
762   .endif
763   .if \filter_d != 1
764         vmlsl.s16       \acc0,  \reg_d,  \src6
765         vmlsl.s16       \acc1,  \reg_d,  \src7
766   .else
767         vsubw.s16       \acc0,  \acc0,   \src6
768         vsubw.s16       \acc1,  \acc1,   \src7
769   .endif
770         @ Use acc0_0 and acc0_1 as temp space
771         vqshrun.s32     \acc0_0, \acc0,  #\filter_shift     @ Shift and narrow with saturation from s32 to u16
772         vqshrun.s32     \acc0_1, \acc1,  #\filter_shift
773         vqmovn.u16      \dest,  \acc0                       @ Narrow with saturation from u16 to u8
774 .endm
775 
776 @ Register usage for put_vc1_mspel_mc functions. Registers marked 'hv' are only used in put_vc1_mspel_mc_hv.
777 @
778 @   r0        adjusted dst
779 @   r1        adjusted src
780 @   r2        stride
781 @   r3        adjusted rnd
782 @   r4 [hv]   tmp
783 @   r11 [hv]  sp saved
784 @   r12       loop counter
785 @   d0        src[-stride]
786 @   d1        src[0]
787 @   d2        src[stride]
788 @   d3        src[stride * 2]
789 @   q0 [hv]   src[-stride]
790 @   q1 [hv]   src[0]
791 @   q2 [hv]   src[stride]
792 @   q3 [hv]   src[stride * 2]
@   d21       often holds the result of mspel_filter
794 @   q11       accumulator 0
795 @   q12 [hv]  accumulator 1
796 @   q13       accumulator initial value
797 @   d28       filter_a
798 @   d29       filter_b
799 @   d30       filter_c
800 @   d31       filter_d
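@
@ The _hv functions below filter in two passes (an illustrative outline;
@ S(x, y) stands for src[x + y * stride], a..d and A..D for the vertical and
@ horizontal mode constants, rnd1/rnd2 for the rounding values set up in r3
@ for each pass):
@
@   /* pass 1: vertical 4-tap filter into a 16-bit tmp[8][16] on the stack
@      (only 11 columns per row are needed); results stay wide (narrow=0) */
@   tmp[y][x] = (-a * S(x, y - 1) + b * S(x, y) + c * S(x, y + 1)
@                - d * S(x, y + 2) + rnd1) >> shift;
@
@   /* pass 2: horizontal 4-tap filter over each tmp row, clipped to 8 bits;
@      tmp column 0 is dst column -1 because of the `src -= 1` below */
@   dst[y][x] = av_clip_uint8((-A * tmp[y][x] + B * tmp[y][x + 1]
@                + C * tmp[y][x + 2] - D * tmp[y][x + 3] + rnd2) >> 7);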
801 
802 @ (uint8_t *dst [r0], const uint8_t *src [r1], ptrdiff_t stride [r2], int rnd [r3])
803 .macro put_vc1_mspel_mc_hv hmode vmode filter_h_a filter_h_b filter_h_c filter_h_d filter_v_a filter_v_b filter_v_c filter_v_d filter_add filter_shift
804 function ff_put_vc1_mspel_mc\hmode\()\vmode\()_neon, export=1
805         push            {r4, r11, lr}
        mov             r11, sp                 @ r11 = stack pointer before realignment
807 A       bic             sp,  sp,  #15           @ sp = round down to multiple of 16 bytes
808 T       bic             r4,  r11, #15
809 T       mov             sp,  r4
        sub             sp,  sp,  #(8*2*16)     @ make space for 8 rows * 2 bytes per element * 16 elements per row (only 11 elements per row are actually needed)
811         mov             r4,  sp                 @ r4 = int16_t tmp[8 * 16]
812 
813         sub             r1,  r1,  #1            @ src -= 1
814   .if \filter_add != 0
815         add             r3,  r3,  #\filter_add  @ r3 = filter_add + rnd
816   .endif
817         mov             r12, #8                 @ loop counter
818         sub             r1,  r1,  r2            @ r1 = &src[-stride]      @ slide back
819 
820         @ Do vertical filtering from src into tmp
821         mspel_constants i8, d28, d29, d30, d31, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, q13, r3
822 
823         vld1.64         {d0,d1}, [r1], r2
824         vld1.64         {d2,d3}, [r1], r2
825         vld1.64         {d4,d5}, [r1], r2
826 
827 1:
828         subs            r12,  r12,  #4
829 
830         vld1.64         {d6,d7}, [r1], r2
831         mspel_filter    q11, q11, d0, d2, d4, d6, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
832         mspel_filter    q12, q12, d1, d3, d5, d7, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
833         vst1.64         {q11,q12}, [r4,:128]!   @ store and increment
834 
835         vld1.64         {d0,d1}, [r1], r2
836         mspel_filter    q11, q11, d2, d4, d6, d0, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
837         mspel_filter    q12, q12, d3, d5, d7, d1, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
838         vst1.64         {q11,q12}, [r4,:128]!   @ store and increment
839 
840         vld1.64         {d2,d3}, [r1], r2
841         mspel_filter    q11, q11, d4, d6, d0, d2, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
842         mspel_filter    q12, q12, d5, d7, d1, d3, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
843         vst1.64         {q11,q12}, [r4,:128]!   @ store and increment
844 
845         vld1.64         {d4,d5}, [r1], r2
846         mspel_filter    q11, q11, d6, d0, d2, d4, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
847         mspel_filter    q12, q12, d7, d1, d3, d5, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
848         vst1.64         {q11,q12}, [r4,:128]!   @ store and increment
849 
850         bne             1b
851 
852         rsb             r3,   r3,  #(64 + \filter_add)      @ r3 = (64 + filter_add) - r3
853         mov             r12,  #8                @ loop counter
854         mov             r4,   sp                @ r4 = tmp
855 
        @ Do horizontal filtering from tmp to dst
857         mspel_constants i16, d28, d29, d30, d31, \filter_h_a, \filter_h_b, \filter_h_c, \filter_h_d, q13, r3
858 
859 2:
860         subs            r12,  r12,  #1
861 
862         vld1.64         {q0,q1}, [r4,:128]!     @ read one line of tmp
863         vext.16         q2,   q0,   q1,  #2
864         vext.16         q3,   q0,   q1,  #3
865         vext.16         q1,   q0,   q1,  #1     @ do last because it writes to q1 which is read by the other vext instructions
866 
867         mspel_filter.16 q11, q12, d22, d23, d21, d0, d1, d2, d3, d4, d5, d6, d7, \filter_h_a, \filter_h_b, \filter_h_c, \filter_h_d, d28, d29, d30, d31, q13, 7
868 
869         vst1.64         {d21}, [r0,:64], r2     @ store and increment dst
870 
871         bne             2b
872 
873         mov             sp,  r11
874         pop             {r4, r11, pc}
875 endfunc
876 .endm
877 
878 @ Use C preprocessor and assembler macros to expand to functions for horizontal and vertical filtering.
879 #define PUT_VC1_MSPEL_MC_HV(hmode, vmode)   \
880     put_vc1_mspel_mc_hv hmode, vmode, \
881         MSPEL_MODE_ ## hmode ## _MUL_CONSTANTS, \
882         MSPEL_MODE_ ## vmode ## _MUL_CONSTANTS, \
883         MSPEL_MODES_ ## hmode ## vmode ## _ADDSHIFT_CONSTANTS
884 
885 PUT_VC1_MSPEL_MC_HV(1, 1)
886 PUT_VC1_MSPEL_MC_HV(1, 2)
887 PUT_VC1_MSPEL_MC_HV(1, 3)
888 PUT_VC1_MSPEL_MC_HV(2, 1)
889 PUT_VC1_MSPEL_MC_HV(2, 2)
890 PUT_VC1_MSPEL_MC_HV(2, 3)
891 PUT_VC1_MSPEL_MC_HV(3, 1)
892 PUT_VC1_MSPEL_MC_HV(3, 2)
893 PUT_VC1_MSPEL_MC_HV(3, 3)
894 
895 #undef PUT_VC1_MSPEL_MC_HV
896 
897 .macro  put_vc1_mspel_mc_h_only hmode filter_a filter_b filter_c filter_d filter_add filter_shift
898 function ff_put_vc1_mspel_mc\hmode\()0_neon, export=1
899         rsb             r3,   r3,   #\filter_add        @ r3 = filter_add - r = filter_add - rnd
900         mov             r12,  #8                        @ loop counter
901         sub             r1,   r1,   #1                  @ slide back, using immediate
902 
903         mspel_constants i8, d28, d29, d30, d31, \filter_a, \filter_b, \filter_c, \filter_d, q13, r3
904 
905 1:
906         subs            r12,  r12,  #1
907 
908         vld1.64         {d0,d1}, [r1], r2               @ read 16 bytes even though we only need 11, also src += stride
909         vext.8          d2,   d0,   d1,  #2
910         vext.8          d3,   d0,   d1,  #3
911         vext.8          d1,   d0,   d1,  #1             @ do last because it writes to d1 which is read by the other vext instructions
912 
913         mspel_filter    q11, d21, d0, d1, d2, d3, \filter_a, \filter_b, \filter_c, \filter_d, d28, d29, d30, d31, q13, \filter_shift
914 
915         vst1.64         {d21}, [r0,:64], r2             @ store and increment dst
916 
917         bne             1b
918 
919         bx              lr
920 endfunc
921 .endm
922 
923 @ Use C preprocessor and assembler macros to expand to functions for horizontal only filtering.
924 #define PUT_VC1_MSPEL_MC_H_ONLY(hmode) \
925         put_vc1_mspel_mc_h_only hmode, MSPEL_MODE_ ## hmode ## _MUL_CONSTANTS, MSPEL_MODE_ ## hmode ## _ADDSHIFT_CONSTANTS
926 
927 PUT_VC1_MSPEL_MC_H_ONLY(1)
928 PUT_VC1_MSPEL_MC_H_ONLY(2)
929 PUT_VC1_MSPEL_MC_H_ONLY(3)
930 
931 #undef PUT_VC1_MSPEL_MC_H_ONLY
932 
933 @ (uint8_t *dst [r0], const uint8_t *src [r1], ptrdiff_t stride [r2], int rnd [r3])
934 .macro put_vc1_mspel_mc_v_only vmode filter_a filter_b filter_c filter_d filter_add filter_shift
935 function ff_put_vc1_mspel_mc0\vmode\()_neon, export=1
936         add             r3,   r3,   #\filter_add - 1    @ r3 = filter_add - r = filter_add - (1 - rnd) = filter_add - 1 + rnd
937         mov             r12,  #8                        @ loop counter
938         sub             r1,   r1,   r2                  @ r1 = &src[-stride]      @ slide back
939 
940         mspel_constants i8, d28, d29, d30, d31, \filter_a, \filter_b, \filter_c, \filter_d, q13, r3
941 
942         vld1.64         {d0},  [r1], r2                 @ d0 = src[-stride]
943         vld1.64         {d1},  [r1], r2                 @ d1 = src[0]
944         vld1.64         {d2},  [r1], r2                 @ d2 = src[stride]
945 
946 1:
947         subs            r12,  r12,  #4
948 
949         vld1.64         {d3},  [r1], r2                 @ d3 = src[stride * 2]
950         mspel_filter    q11, d21, d0, d1, d2, d3, \filter_a, \filter_b, \filter_c, \filter_d, d28, d29, d30, d31, q13, \filter_shift
951         vst1.64         {d21}, [r0,:64], r2             @ store and increment dst
952 
953         vld1.64         {d0},  [r1], r2                 @ d0 = next line
954         mspel_filter    q11, d21, d1, d2, d3, d0, \filter_a, \filter_b, \filter_c, \filter_d, d28, d29, d30, d31, q13, \filter_shift
955         vst1.64         {d21}, [r0,:64], r2             @ store and increment dst
956 
957         vld1.64         {d1},  [r1], r2                 @ d1 = next line
958         mspel_filter    q11, d21, d2, d3, d0, d1, \filter_a, \filter_b, \filter_c, \filter_d, d28, d29, d30, d31, q13, \filter_shift
959         vst1.64         {d21}, [r0,:64], r2             @ store and increment dst
960 
961         vld1.64         {d2},  [r1], r2                 @ d2 = next line
962         mspel_filter    q11, d21, d3, d0, d1, d2, \filter_a, \filter_b, \filter_c, \filter_d, d28, d29, d30, d31, q13, \filter_shift
963         vst1.64         {d21}, [r0,:64], r2             @ store and increment dst
964 
965         bne             1b
966 
967         bx              lr
968 endfunc
969 .endm
970 
971 @ Use C preprocessor and assembler macros to expand to functions for vertical only filtering.
972 #define PUT_VC1_MSPEL_MC_V_ONLY(vmode) \
973         put_vc1_mspel_mc_v_only vmode, MSPEL_MODE_ ## vmode ## _MUL_CONSTANTS, MSPEL_MODE_ ## vmode ## _ADDSHIFT_CONSTANTS
974 
975 PUT_VC1_MSPEL_MC_V_ONLY(1)
976 PUT_VC1_MSPEL_MC_V_ONLY(2)
977 PUT_VC1_MSPEL_MC_V_ONLY(3)
978 
979 #undef PUT_VC1_MSPEL_MC_V_ONLY
980 
981 function ff_put_pixels8x8_neon, export=1
982         vld1.64         {d0}, [r1], r2
983         vld1.64         {d1}, [r1], r2
984         vld1.64         {d2}, [r1], r2
985         vld1.64         {d3}, [r1], r2
986         vld1.64         {d4}, [r1], r2
987         vld1.64         {d5}, [r1], r2
988         vld1.64         {d6}, [r1], r2
989         vld1.64         {d7}, [r1]
990         vst1.64         {d0}, [r0,:64], r2
991         vst1.64         {d1}, [r0,:64], r2
992         vst1.64         {d2}, [r0,:64], r2
993         vst1.64         {d3}, [r0,:64], r2
994         vst1.64         {d4}, [r0,:64], r2
995         vst1.64         {d5}, [r0,:64], r2
996         vst1.64         {d6}, [r0,:64], r2
997         vst1.64         {d7}, [r0,:64]
998         bx              lr
999 endfunc
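
@ The *_dc functions below handle blocks where only the DC coefficient
@ block[0] is non-zero, so the whole inverse transform reduces to adding a
@ single scaled value to every pixel. For the 8x8 case this is roughly
@ (an illustrative sketch following the comments in the code):
@
@   int dc = block[0];
@   dc = (3 * dc +  1) >> 1;
@   dc = (3 * dc + 16) >> 5;
@   for (i = 0; i < 8; i++)
@       for (j = 0; j < 8; j++)
@           dest[i * stride + j] = av_clip_uint8(dest[i * stride + j] + dc);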
1000 
1001 function ff_vc1_inv_trans_8x8_dc_neon, export=1
1002         ldrsh           r2, [r2]              @ int dc = block[0];
1003 
1004         vld1.64         {d0},  [r0,:64], r1
1005         vld1.64         {d1},  [r0,:64], r1
1006         vld1.64         {d4},  [r0,:64], r1
1007         vld1.64         {d5},  [r0,:64], r1
1008 
1009         add             r2, r2, r2, lsl #1    @ dc = (3 * dc +  1) >> 1;
1010         vld1.64         {d6},  [r0,:64], r1
1011         add             r2, r2, #1
1012         vld1.64         {d7},  [r0,:64], r1
1013         vld1.64         {d16}, [r0,:64], r1
1014         vld1.64         {d17}, [r0,:64], r1
1015         asr             r2, r2, #1
1016 
1017         sub             r0,  r0,  r1, lsl #3  @ restore r0 to original value
1018 
1019         add             r2, r2, r2, lsl #1    @ dc = (3 * dc + 16) >> 5;
1020         add             r2, r2, #16
1021         asr             r2, r2, #5
1022 
1023         vdup.16         q1,  r2               @ dc
1024 
1025         vaddw.u8        q9,   q1,  d0
1026         vaddw.u8        q10,  q1,  d1
1027         vaddw.u8        q11,  q1,  d4
1028         vaddw.u8        q12,  q1,  d5
1029         vqmovun.s16     d0,  q9
1030         vqmovun.s16     d1,  q10
1031         vqmovun.s16     d4,  q11
1032         vst1.64         {d0},  [r0,:64], r1
1033         vqmovun.s16     d5,  q12
1034         vst1.64         {d1},  [r0,:64], r1
1035         vaddw.u8        q13,  q1,  d6
1036         vst1.64         {d4},  [r0,:64], r1
1037         vaddw.u8        q14,  q1,  d7
1038         vst1.64         {d5},  [r0,:64], r1
1039         vaddw.u8        q15,  q1,  d16
1040         vaddw.u8        q1,   q1,  d17        @ this destroys q1
1041         vqmovun.s16     d6,  q13
1042         vqmovun.s16     d7,  q14
1043         vqmovun.s16     d16, q15
1044         vqmovun.s16     d17, q1
1045         vst1.64         {d6},  [r0,:64], r1
1046         vst1.64         {d7},  [r0,:64], r1
1047         vst1.64         {d16}, [r0,:64], r1
1048         vst1.64         {d17}, [r0,:64]
1049         bx              lr
1050 endfunc
1051 
1052 function ff_vc1_inv_trans_8x4_dc_neon, export=1
1053         ldrsh           r2, [r2]              @ int dc = block[0];
1054 
1055         vld1.64         {d0},  [r0,:64], r1
1056         vld1.64         {d1},  [r0,:64], r1
1057         vld1.64         {d4},  [r0,:64], r1
1058         vld1.64         {d5},  [r0,:64], r1
1059 
1060         add             r2, r2, r2, lsl #1    @ dc = ( 3 * dc +  1) >> 1;
1061 
1062         sub             r0,  r0,  r1, lsl #2  @ restore r0 to original value
1063 
1064         add             r2, r2, #1
1065         asr             r2, r2, #1
1066 
1067         add             r2, r2, r2, lsl #4    @ dc = (17 * dc + 64) >> 7;
1068         add             r2, r2, #64
1069         asr             r2, r2, #7
1070 
1071         vdup.16         q1,  r2               @ dc
1072 
1073         vaddw.u8        q3,  q1,  d0
1074         vaddw.u8        q8,  q1,  d1
1075         vaddw.u8        q9,  q1,  d4
1076         vaddw.u8        q10, q1,  d5
1077         vqmovun.s16     d0,  q3
1078         vqmovun.s16     d1,  q8
1079         vqmovun.s16     d4,  q9
1080         vst1.64         {d0},  [r0,:64], r1
1081         vqmovun.s16     d5,  q10
1082         vst1.64         {d1},  [r0,:64], r1
1083         vst1.64         {d4},  [r0,:64], r1
1084         vst1.64         {d5},  [r0,:64]
1085         bx              lr
1086 endfunc
1087 
1088 function ff_vc1_inv_trans_4x8_dc_neon, export=1
1089         ldrsh           r2, [r2]              @ int dc = block[0];
1090 
1091         vld1.32         {d0[]},   [r0,:32], r1
1092         vld1.32         {d1[]},   [r0,:32], r1
1093         vld1.32         {d0[1]},  [r0,:32], r1
1094         vld1.32         {d1[1]},  [r0,:32], r1
1095 
1096         add             r2, r2, r2, lsl #4    @ dc = (17 * dc +  4) >> 3;
1097         vld1.32         {d4[]},   [r0,:32], r1
1098         add             r2, r2, #4
1099         vld1.32         {d5[]},   [r0,:32], r1
1100         vld1.32         {d4[1]},  [r0,:32], r1
1101         asr             r2, r2, #3
1102         vld1.32         {d5[1]},  [r0,:32], r1
1103 
1104         add             r2, r2, r2, lsl #1    @ dc = (12 * dc + 64) >> 7;
1105 
1106         sub             r0,  r0,  r1, lsl #3  @ restore r0 to original value
1107 
1108         lsl             r2, r2, #2
1109         add             r2, r2, #64
1110         asr             r2, r2, #7
1111 
1112         vdup.16         q1,  r2               @ dc
1113 
1114         vaddw.u8        q3,  q1,  d0
1115         vaddw.u8        q8,  q1,  d1
1116         vaddw.u8        q9,  q1,  d4
1117         vaddw.u8        q10, q1,  d5
1118         vqmovun.s16     d0,  q3
1119         vst1.32         {d0[0]},  [r0,:32], r1
1120         vqmovun.s16     d1,  q8
1121         vst1.32         {d1[0]},  [r0,:32], r1
1122         vqmovun.s16     d4,  q9
1123         vst1.32         {d0[1]},  [r0,:32], r1
1124         vqmovun.s16     d5,  q10
1125         vst1.32         {d1[1]},  [r0,:32], r1
1126         vst1.32         {d4[0]},  [r0,:32], r1
1127         vst1.32         {d5[0]},  [r0,:32], r1
1128         vst1.32         {d4[1]},  [r0,:32], r1
1129         vst1.32         {d5[1]},  [r0,:32]
1130         bx              lr
1131 endfunc
1132 
1133 function ff_vc1_inv_trans_4x4_dc_neon, export=1
1134         ldrsh           r2, [r2]              @ int dc = block[0];
1135 
1136         vld1.32         {d0[]},   [r0,:32], r1
1137         vld1.32         {d1[]},   [r0,:32], r1
1138         vld1.32         {d0[1]},  [r0,:32], r1
1139         vld1.32         {d1[1]},  [r0,:32], r1
1140 
1141         add             r2, r2, r2, lsl #4    @ dc = (17 * dc +  4) >> 3;
1142 
1143         sub             r0,  r0,  r1, lsl #2  @ restore r0 to original value
1144 
1145         add             r2, r2, #4
1146         asr             r2, r2, #3
1147 
1148         add             r2, r2, r2, lsl #4    @ dc = (17 * dc + 64) >> 7;
1149         add             r2, r2, #64
1150         asr             r2, r2, #7
1151 
1152         vdup.16         q1,  r2               @ dc
1153 
1154         vaddw.u8        q2,  q1,  d0
1155         vaddw.u8        q3,  q1,  d1
1156         vqmovun.s16     d0,  q2
1157         vst1.32         {d0[0]},  [r0,:32], r1
1158         vqmovun.s16     d1,  q3
1159         vst1.32         {d1[0]},  [r0,:32], r1
1160         vst1.32         {d0[1]},  [r0,:32], r1
1161         vst1.32         {d1[1]},  [r0,:32]
1162         bx              lr
1163 endfunc
1164 
1165 @ VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of vertically-neighbouring blocks
1166 @ On entry:
1167 @   r0 -> top-left pel of lower block
1168 @   r1 = row stride, bytes
1169 @   r2 = PQUANT bitstream parameter
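@
@ The filter decision and update are roughly (an illustrative sketch
@ following the comments in the code below; P1..P8 are the eight pels
@ across the edge, with P4/P5 the pair that straddles it):
@
@   a0   = (2*P3 - 5*P4 + 5*P5 - 2*P6 + 4) >> 3;
@   a1   = abs((2*P1 - 5*P2 + 5*P3 - 2*P4 + 4) >> 3);
@   a2   = abs((2*P5 - 5*P6 + 5*P7 - 2*P8 + 4) >> 3);
@   a3   = min(a1, a2);
@   clip = abs(P4 - P5) >> 1;
@   if (clip != 0 && abs(a0) < pq && a3 < abs(a0)) {
@       d = min((5 * (abs(a0) - a3)) >> 3, clip);
@       /* d is applied with a sign derived from sign(a0) and sign(P4 - P5),
@          moving P4 and P5 towards each other, or dropped if the signs
@          match */
@       P4 -= d;  P5 += d;
@   }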
1170 function ff_vc1_v_loop_filter4_neon, export=1
1171         sub             r3, r0, r1, lsl #2
1172         vldr            d0, .Lcoeffs
1173         vld1.32         {d1[0]}, [r0], r1       @ P5
1174         vld1.32         {d2[0]}, [r3], r1       @ P1
1175         vld1.32         {d3[0]}, [r3], r1       @ P2
1176         vld1.32         {d4[0]}, [r0], r1       @ P6
1177         vld1.32         {d5[0]}, [r3], r1       @ P3
1178         vld1.32         {d6[0]}, [r0], r1       @ P7
1179         vld1.32         {d7[0]}, [r3]           @ P4
1180         vld1.32         {d16[0]}, [r0]          @ P8
1181         vshll.u8        q9, d1, #1              @ 2*P5
1182         vdup.16         d17, r2                 @ pq
1183         vshll.u8        q10, d2, #1             @ 2*P1
1184         vmovl.u8        q11, d3                 @ P2
1185         vmovl.u8        q1, d4                  @ P6
1186         vmovl.u8        q12, d5                 @ P3
1187         vmls.i16        d20, d22, d0[1]         @ 2*P1-5*P2
1188         vmovl.u8        q11, d6                 @ P7
1189         vmls.i16        d18, d2, d0[1]          @ 2*P5-5*P6
1190         vshll.u8        q2, d5, #1              @ 2*P3
1191         vmovl.u8        q3, d7                  @ P4
1192         vmla.i16        d18, d22, d0[1]         @ 2*P5-5*P6+5*P7
1193         vmovl.u8        q11, d16                @ P8
1194         vmla.u16        d20, d24, d0[1]         @ 2*P1-5*P2+5*P3
1195         vmovl.u8        q12, d1                 @ P5
1196         vmls.u16        d4, d6, d0[1]           @ 2*P3-5*P4
1197         vmls.u16        d18, d22, d0[0]         @ 2*P5-5*P6+5*P7-2*P8
1198         vsub.i16        d1, d6, d24             @ P4-P5
1199         vmls.i16        d20, d6, d0[0]          @ 2*P1-5*P2+5*P3-2*P4
1200         vmla.i16        d4, d24, d0[1]          @ 2*P3-5*P4+5*P5
1201         vmls.i16        d4, d2, d0[0]           @ 2*P3-5*P4+5*P5-2*P6
1202         vabs.s16        d2, d1
1203         vrshr.s16       d3, d18, #3
1204         vrshr.s16       d5, d20, #3
1205         vshr.s16        d2, d2, #1              @ clip
1206         vrshr.s16       d4, d4, #3
1207         vabs.s16        d3, d3                  @ a2
1208         vshr.s16        d1, d1, #8              @ clip_sign
1209         vabs.s16        d5, d5                  @ a1
1210         vceq.i16        d7, d2, #0              @ test clip == 0
1211         vabs.s16        d16, d4                 @ a0
1212         vshr.s16        d4, d4, #8              @ a0_sign
1213         vcge.s16        d18, d5, d3             @ test a1 >= a2
1214         vcge.s16        d17, d16, d17           @ test a0 >= pq
1215         vbsl            d18, d3, d5             @ a3
1216         vsub.i16        d1, d1, d4              @ clip_sign - a0_sign
1217         vorr            d3, d7, d17             @ test clip == 0 || a0 >= pq
        vqsub.u16       d4, d16, d18            @ a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so it makes more sense to subtract this way round than the other way and take the absolute value)
1219         vcge.s16        d5, d18, d16            @ test a3 >= a0
1220         vmul.i16        d0, d4, d0[1]           @ a0 >= a3 ? 5*(a0-a3) : 0
1221         vorr            d4, d3, d5              @ test clip == 0 || a0 >= pq || a3 >= a0
1222         vmov.32         r0, d4[1]               @ move to gp reg
1223         vshr.u16        d0, d0, #3              @ a0 >= a3 ? (5*(a0-a3))>>3 : 0
1224         vcge.s16        d4, d0, d2
1225         tst             r0, #1
1226         bne             1f                      @ none of the 4 pixel pairs should be updated if this one is not filtered
1227         vbsl            d4, d2, d0              @ FFMIN(d, clip)
1228         vbic            d0, d4, d3              @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
1229         vmls.i16        d6, d0, d1              @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
1230         vmla.i16        d24, d0, d1             @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
1231         vqmovun.s16     d0, q3
1232         vqmovun.s16     d1, q12
1233         vst1.32         {d0[0]}, [r3], r1
1234         vst1.32         {d1[0]}, [r3]
1235 1:      bx              lr
1236 endfunc
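
@ All of the loop-filter functions in this file make the per-pair filtering
@ decision branchlessly: a3 = FFMIN(a1, a2) and FFMIN(d, clip) are each built
@ from a compare followed by a bitwise select, as in the code above. A minimal
@ sketch of the idiom (register numbers here are only illustrative, not the
@ ones used by any particular function):
@
@   vcge.s16  q2, q0, q1      @ q2 = (a1 >= a2) ? 0xffff : 0, per lane
@   vbsl      q2, q1, q0      @ q2 = a1 >= a2 ? a2 : a1, i.e. FFMIN(a1, a2)
@
@ The only conditional branches are on the per-edge "filter at all?" flags,
@ which are copied to general-purpose registers and tested with tst/bne.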
1237 
1238 @ VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of horizontally-neighbouring blocks
1239 @ On entry:
1240 @   r0 -> top-left pel of right block
1241 @   r1 = row stride, bytes
1242 @   r2 = PQUANT bitstream parameter
1243 function ff_vc1_h_loop_filter4_neon, export=1
1244         sub             r3, r0, #4              @ where to start reading
1245         vldr            d0, .Lcoeffs
1246         vld1.32         {d2}, [r3], r1
1247         sub             r0, r0, #1              @ where to start writing
1248         vld1.32         {d4}, [r3], r1
1249         vld1.32         {d3}, [r3], r1
1250         vld1.32         {d5}, [r3]
1251         vdup.16         d1, r2                  @ pq
1252         vtrn.8          q1, q2
1253         vtrn.16         d2, d3                  @ P1, P5, P3, P7
1254         vtrn.16         d4, d5                  @ P2, P6, P4, P8
1255         vshll.u8        q3, d2, #1              @ 2*P1, 2*P5
1256         vmovl.u8        q8, d4                  @ P2, P6
1257         vmovl.u8        q9, d3                  @ P3, P7
1258         vmovl.u8        q2, d5                  @ P4, P8
1259         vmls.i16        q3, q8, d0[1]           @ 2*P1-5*P2, 2*P5-5*P6
1260         vshll.u8        q10, d3, #1             @ 2*P3, 2*P7
1261         vmovl.u8        q1, d2                  @ P1, P5
1262         vmla.i16        q3, q9, d0[1]           @ 2*P1-5*P2+5*P3, 2*P5-5*P6+5*P7
1263         vmls.i16        q3, q2, d0[0]           @ 2*P1-5*P2+5*P3-2*P4, 2*P5-5*P6+5*P7-2*P8
1264         vmov            d2, d3                  @ needs to be in an even-numbered vector for when we come to narrow it later
1265         vmls.i16        d20, d4, d0[1]          @ 2*P3-5*P4
1266         vmla.i16        d20, d3, d0[1]          @ 2*P3-5*P4+5*P5
1267         vsub.i16        d3, d4, d2              @ P4-P5
1268         vmls.i16        d20, d17, d0[0]         @ 2*P3-5*P4+5*P5-2*P6
1269         vrshr.s16       q3, q3, #3
1270         vabs.s16        d5, d3
1271         vshr.s16        d3, d3, #8              @ clip_sign
1272         vrshr.s16       d16, d20, #3
1273         vabs.s16        q3, q3                  @ a1, a2
1274         vshr.s16        d5, d5, #1              @ clip
1275         vabs.s16        d17, d16                @ a0
1276         vceq.i16        d18, d5, #0             @ test clip == 0
1277         vshr.s16        d16, d16, #8            @ a0_sign
1278         vcge.s16        d19, d6, d7             @ test a1 >= a2
1279         vcge.s16        d1, d17, d1             @ test a0 >= pq
1280         vsub.i16        d16, d3, d16            @ clip_sign - a0_sign
1281         vbsl            d19, d7, d6             @ a3
1282         vorr            d1, d18, d1             @ test clip == 0 || a0 >= pq
1283         vqsub.u16       d3, d17, d19            @ a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
        vcge.s16        d6, d19, d17            @ test a3 >= a0
1285         vmul.i16        d0, d3, d0[1]           @ a0 >= a3 ? 5*(a0-a3) : 0
1286         vorr            d3, d1, d6              @ test clip == 0 || a0 >= pq || a3 >= a0
1287         vmov.32         r2, d3[1]               @ move to gp reg
1288         vshr.u16        d0, d0, #3              @ a0 >= a3 ? (5*(a0-a3))>>3 : 0
1289         vcge.s16        d3, d0, d5
1290         tst             r2, #1
1291         bne             1f                      @ none of the 4 pixel pairs should be updated if this one is not filtered
1292         vbsl            d3, d5, d0              @ FFMIN(d, clip)
1293         vbic            d0, d3, d1              @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
1294         vmla.i16        d2, d0, d16             @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
1295         vmls.i16        d4, d0, d16             @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
1296         vqmovun.s16     d1, q1
1297         vqmovun.s16     d0, q2
1298         vst2.8          {d0[0], d1[0]}, [r0], r1
1299         vst2.8          {d0[1], d1[1]}, [r0], r1
1300         vst2.8          {d0[2], d1[2]}, [r0], r1
1301         vst2.8          {d0[3], d1[3]}, [r0]
1302 1:      bx              lr
1303 endfunc
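
@ The horizontal filters operate across a vertical block edge, so the eight
@ pixels of one filter operation lie along a row. The rows loaded above are
@ therefore transposed with a cascade of vtrn instructions until each vector
@ holds one of P1..P8 for every row, after which the arithmetic is the same as
@ in the vertical case. As a minimal sketch, a 4x4 transpose of bytes held in
@ the low halves of d0-d3 (illustrative registers) is:
@
@   vtrn.8   d0, d1
@   vtrn.8   d2, d3
@   vtrn.16  d0, d2
@   vtrn.16  d1, d3           @ d0-d3 now hold columns 0-3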
1304 
1305 @ VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of vertically-neighbouring blocks
1306 @ On entry:
1307 @   r0 -> top-left pel of lower block
1308 @   r1 = row stride, bytes
1309 @   r2 = PQUANT bitstream parameter
1310 function ff_vc1_v_loop_filter8_neon, export=1
1311         sub             r3, r0, r1, lsl #2
1312         vldr            d0, .Lcoeffs
1313         vld1.32         {d1}, [r0 :64], r1      @ P5
1314         vld1.32         {d2}, [r3 :64], r1      @ P1
1315         vld1.32         {d3}, [r3 :64], r1      @ P2
1316         vld1.32         {d4}, [r0 :64], r1      @ P6
1317         vld1.32         {d5}, [r3 :64], r1      @ P3
1318         vld1.32         {d6}, [r0 :64], r1      @ P7
1319         vshll.u8        q8, d1, #1              @ 2*P5
1320         vshll.u8        q9, d2, #1              @ 2*P1
1321         vld1.32         {d7}, [r3 :64]          @ P4
1322         vmovl.u8        q1, d3                  @ P2
1323         vld1.32         {d20}, [r0 :64]         @ P8
1324         vmovl.u8        q11, d4                 @ P6
1325         vdup.16         q12, r2                 @ pq
1326         vmovl.u8        q13, d5                 @ P3
1327         vmls.i16        q9, q1, d0[1]           @ 2*P1-5*P2
1328         vmovl.u8        q1, d6                  @ P7
1329         vshll.u8        q2, d5, #1              @ 2*P3
1330         vmls.i16        q8, q11, d0[1]          @ 2*P5-5*P6
1331         vmovl.u8        q3, d7                  @ P4
1332         vmovl.u8        q10, d20                @ P8
1333         vmla.i16        q8, q1, d0[1]           @ 2*P5-5*P6+5*P7
1334         vmovl.u8        q1, d1                  @ P5
1335         vmla.i16        q9, q13, d0[1]          @ 2*P1-5*P2+5*P3
1336         vsub.i16        q13, q3, q1             @ P4-P5
1337         vmls.i16        q2, q3, d0[1]           @ 2*P3-5*P4
1338         vmls.i16        q8, q10, d0[0]          @ 2*P5-5*P6+5*P7-2*P8
1339         vabs.s16        q10, q13
1340         vshr.s16        q13, q13, #8            @ clip_sign
1341         vmls.i16        q9, q3, d0[0]           @ 2*P1-5*P2+5*P3-2*P4
1342         vshr.s16        q10, q10, #1            @ clip
1343         vmla.i16        q2, q1, d0[1]           @ 2*P3-5*P4+5*P5
1344         vrshr.s16       q8, q8, #3
1345         vmls.i16        q2, q11, d0[0]          @ 2*P3-5*P4+5*P5-2*P6
1346         vceq.i16        q11, q10, #0            @ test clip == 0
1347         vrshr.s16       q9, q9, #3
1348         vabs.s16        q8, q8                  @ a2
1349         vabs.s16        q9, q9                  @ a1
1350         vrshr.s16       q2, q2, #3
1351         vcge.s16        q14, q9, q8             @ test a1 >= a2
1352         vabs.s16        q15, q2                 @ a0
1353         vshr.s16        q2, q2, #8              @ a0_sign
1354         vbsl            q14, q8, q9             @ a3
1355         vcge.s16        q8, q15, q12            @ test a0 >= pq
1356         vsub.i16        q2, q13, q2             @ clip_sign - a0_sign
1357         vqsub.u16       q9, q15, q14            @ a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
1358         vcge.s16        q12, q14, q15           @ test a3 >= a0
1359         vorr            q8, q11, q8             @ test clip == 0 || a0 >= pq
1360         vmul.i16        q0, q9, d0[1]           @ a0 >= a3 ? 5*(a0-a3) : 0
1361         vorr            q9, q8, q12             @ test clip == 0 || a0 >= pq || a3 >= a0
1362         vshl.i64        q11, q9, #16
1363         vmov.32         r0, d18[1]              @ move to gp reg
1364         vshr.u16        q0, q0, #3              @ a0 >= a3 ? (5*(a0-a3))>>3 : 0
1365         vmov.32         r2, d19[1]
1366         vshr.s64        q9, q11, #48
1367         vcge.s16        q11, q0, q10
1368         vorr            q8, q8, q9
1369         and             r0, r0, r2
1370         vbsl            q11, q10, q0            @ FFMIN(d, clip)
1371         tst             r0, #1
1372         bne             1f                      @ none of the 8 pixel pairs should be updated in this case
1373         vbic            q0, q11, q8             @ set each d to zero if it should not be filtered
1374         vmls.i16        q3, q0, q2              @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
1375         vmla.i16        q1, q0, q2              @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
1376         vqmovun.s16     d0, q3
1377         vqmovun.s16     d1, q1
1378         vst1.32         {d0}, [r3 :64], r1
1379         vst1.32         {d1}, [r3 :64]
1380 1:      bx              lr
1381 endfunc
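
@ In the 8- and 16-wide vertical filters the result is stored full width, so
@ the per-group decision (taken from the third pixel pair of each group of 4)
@ is folded back into the per-pair masks as well as being tested for the early
@ exit. The broadcast is done with 64-bit shifts, roughly (in-place here for
@ illustration):
@
@   vshl.i64  q9, q9, #16     @ move lane 2 of each half into bits 48-63
@   vshr.s64  q9, q9, #48     @ sign-extend it back across all four lanes
@
@ Each lane is either 0x0000 or 0xffff, so after the arithmetic shift every
@ lane of a 64-bit half equals the deciding lane's mask.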
1382 
1383 .align  5
1384 .Lcoeffs:
1385 .quad   0x00050002
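
@ .Lcoeffs is loaded into d0 by each of the filter functions above and below,
@ giving d0[0] = 2 and d0[1] = 5 (the upper two lanes are zero and unused).
@ These are the weights of the VC-1 edge filter, applied with the by-scalar
@ forms of vmla/vmls, for example (as in ff_vc1_v_loop_filter8_neon above):
@
@   vmls.i16  q8, q11, d0[1]  @ acc -= 5*P6
@   vmls.i16  q8, q10, d0[0]  @ acc -= 2*P8
@
@ The literal sits here, between the functions, presumably so that every vldr
@ stays within its limited pc-relative offset range.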
1386 
1387 @ VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of horizontally-neighbouring blocks
1388 @ On entry:
1389 @   r0 -> top-left pel of right block
1390 @   r1 = row stride, bytes
1391 @   r2 = PQUANT bitstream parameter
1392 function ff_vc1_h_loop_filter8_neon, export=1
1393         push            {lr}
1394         sub             r3, r0, #4              @ where to start reading
1395         vldr            d0, .Lcoeffs
1396         vld1.32         {d2}, [r3], r1          @ P1[0], P2[0]...
1397         sub             r0, r0, #1              @ where to start writing
1398         vld1.32         {d4}, [r3], r1
1399         add             r12, r0, r1, lsl #2
1400         vld1.32         {d3}, [r3], r1
1401         vld1.32         {d5}, [r3], r1
1402         vld1.32         {d6}, [r3], r1
1403         vld1.32         {d16}, [r3], r1
1404         vld1.32         {d7}, [r3], r1
1405         vld1.32         {d17}, [r3]
1406         vtrn.8          q1, q2                  @ P1[0], P1[1], P3[0]... P1[2], P1[3], P3[2]... P2[0], P2[1], P4[0]... P2[2], P2[3], P4[2]...
1407         vdup.16         q9, r2                  @ pq
1408         vtrn.16         d2, d3                  @ P1[0], P1[1], P1[2], P1[3], P5[0]... P3[0], P3[1], P3[2], P3[3], P7[0]...
1409         vtrn.16         d4, d5                  @ P2[0], P2[1], P2[2], P2[3], P6[0]... P4[0], P4[1], P4[2], P4[3], P8[0]...
1410         vtrn.8          q3, q8                  @ P1[4], P1[5], P3[4]... P1[6], P1[7], P3[6]... P2[4], P2[5], P4[4]... P2[6], P2[7], P4[6]...
        vtrn.16         d6, d7                  @ P1[4], P1[5], P1[6], P1[7], P5[4]... P3[4], P3[5], P3[6], P3[7], P7[4]...
1412         vtrn.16         d16, d17                @ P2[4], P2[5], P2[6], P2[7], P6[4]... P4[4], P4[5], P4[6], P4[7], P8[4]...
1413         vtrn.32         d2, d6                  @ P1, P5
1414         vtrn.32         d4, d16                 @ P2, P6
1415         vtrn.32         d3, d7                  @ P3, P7
1416         vtrn.32         d5, d17                 @ P4, P8
1417         vshll.u8        q10, d2, #1             @ 2*P1
1418         vshll.u8        q11, d6, #1             @ 2*P5
1419         vmovl.u8        q12, d4                 @ P2
1420         vmovl.u8        q13, d16                @ P6
1421         vmovl.u8        q14, d3                 @ P3
1422         vmls.i16        q10, q12, d0[1]         @ 2*P1-5*P2
1423         vmovl.u8        q12, d7                 @ P7
1424         vshll.u8        q1, d3, #1              @ 2*P3
1425         vmls.i16        q11, q13, d0[1]         @ 2*P5-5*P6
1426         vmovl.u8        q2, d5                  @ P4
1427         vmovl.u8        q8, d17                 @ P8
1428         vmla.i16        q11, q12, d0[1]         @ 2*P5-5*P6+5*P7
1429         vmovl.u8        q3, d6                  @ P5
1430         vmla.i16        q10, q14, d0[1]         @ 2*P1-5*P2+5*P3
1431         vsub.i16        q12, q2, q3             @ P4-P5
1432         vmls.i16        q1, q2, d0[1]           @ 2*P3-5*P4
1433         vmls.i16        q11, q8, d0[0]          @ 2*P5-5*P6+5*P7-2*P8
1434         vabs.s16        q8, q12
1435         vshr.s16        q12, q12, #8            @ clip_sign
1436         vmls.i16        q10, q2, d0[0]          @ 2*P1-5*P2+5*P3-2*P4
1437         vshr.s16        q8, q8, #1              @ clip
1438         vmla.i16        q1, q3, d0[1]           @ 2*P3-5*P4+5*P5
1439         vrshr.s16       q11, q11, #3
1440         vmls.i16        q1, q13, d0[0]          @ 2*P3-5*P4+5*P5-2*P6
1441         vceq.i16        q13, q8, #0             @ test clip == 0
1442         vrshr.s16       q10, q10, #3
1443         vabs.s16        q11, q11                @ a2
1444         vabs.s16        q10, q10                @ a1
1445         vrshr.s16       q1, q1, #3
1446         vcge.s16        q14, q10, q11           @ test a1 >= a2
1447         vabs.s16        q15, q1                 @ a0
1448         vshr.s16        q1, q1, #8              @ a0_sign
1449         vbsl            q14, q11, q10           @ a3
1450         vcge.s16        q9, q15, q9             @ test a0 >= pq
1451         vsub.i16        q1, q12, q1             @ clip_sign - a0_sign
1452         vqsub.u16       q10, q15, q14           @ a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
1453         vcge.s16        q11, q14, q15           @ test a3 >= a0
1454         vorr            q9, q13, q9             @ test clip == 0 || a0 >= pq
1455         vmul.i16        q0, q10, d0[1]          @ a0 >= a3 ? 5*(a0-a3) : 0
1456         vorr            q10, q9, q11            @ test clip == 0 || a0 >= pq || a3 >= a0
1457         vmov.32         r2, d20[1]              @ move to gp reg
1458         vshr.u16        q0, q0, #3              @ a0 >= a3 ? (5*(a0-a3))>>3 : 0
1459         vmov.32         r3, d21[1]
1460         vcge.s16        q10, q0, q8
1461         and             r14, r2, r3
1462         vbsl            q10, q8, q0             @ FFMIN(d, clip)
1463         tst             r14, #1
1464         bne             2f                      @ none of the 8 pixel pairs should be updated in this case
1465         vbic            q0, q10, q9             @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
1466         vmla.i16        q3, q0, q1              @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
1467         vmls.i16        q2, q0, q1              @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
1468         vqmovun.s16     d1, q3
1469         vqmovun.s16     d0, q2
1470         tst             r2, #1
1471         bne             1f                      @ none of the first 4 pixel pairs should be updated if so
1472         vst2.8          {d0[0], d1[0]}, [r0], r1
1473         vst2.8          {d0[1], d1[1]}, [r0], r1
1474         vst2.8          {d0[2], d1[2]}, [r0], r1
1475         vst2.8          {d0[3], d1[3]}, [r0]
1476 1:      tst             r3, #1
1477         bne             2f                      @ none of the second 4 pixel pairs should be updated if so
1478         vst2.8          {d0[4], d1[4]}, [r12], r1
1479         vst2.8          {d0[5], d1[5]}, [r12], r1
1480         vst2.8          {d0[6], d1[6]}, [r12], r1
1481         vst2.8          {d0[7], d1[7]}, [r12]
1482 2:      pop             {pc}
1483 endfunc
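
@ The horizontal filters write their results with per-row vst2.8 stores, so
@ rather than folding the per-group decision back into the vector masks (as
@ the vertical filters do) they simply branch over the four stores of any
@ group whose deciding pair is not filtered, roughly:
@
@   vmov.32   r2, d20[1]      @ decision for rows 0-3 (third pair's mask)
@   vmov.32   r3, d21[1]      @ decision for rows 4-7
@   tst       r2, #1
@   bne       1f              @ leave rows 0-3 untouched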
1484 
1485 @ VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of vertically-neighbouring blocks
1486 @ On entry:
1487 @   r0 -> top-left pel of lower block
1488 @   r1 = row stride, bytes
1489 @   r2 = PQUANT bitstream parameter
1490 function ff_vc1_v_loop_filter16_neon, export=1
1491         vpush           {d8-d15}
1492         sub             r3, r0, r1, lsl #2
1493         vldr            d0, .Lcoeffs
1494         vld1.64         {q1}, [r0 :128], r1     @ P5
1495         vld1.64         {q2}, [r3 :128], r1     @ P1
1496         vld1.64         {q3}, [r3 :128], r1     @ P2
1497         vld1.64         {q4}, [r0 :128], r1     @ P6
1498         vld1.64         {q5}, [r3 :128], r1     @ P3
1499         vld1.64         {q6}, [r0 :128], r1     @ P7
1500         vshll.u8        q7, d2, #1              @ 2*P5[0..7]
1501         vshll.u8        q8, d4, #1              @ 2*P1[0..7]
1502         vld1.64         {q9}, [r3 :128]         @ P4
1503         vmovl.u8        q10, d6                 @ P2[0..7]
1504         vld1.64         {q11}, [r0 :128]        @ P8
1505         vmovl.u8        q12, d8                 @ P6[0..7]
1506         vdup.16         q13, r2                 @ pq
1507         vshll.u8        q2, d5, #1              @ 2*P1[8..15]
1508         vmls.i16        q8, q10, d0[1]          @ 2*P1[0..7]-5*P2[0..7]
1509         vshll.u8        q10, d3, #1             @ 2*P5[8..15]
1510         vmovl.u8        q3, d7                  @ P2[8..15]
1511         vmls.i16        q7, q12, d0[1]          @ 2*P5[0..7]-5*P6[0..7]
1512         vmovl.u8        q4, d9                  @ P6[8..15]
1513         vmovl.u8        q14, d10                @ P3[0..7]
1514         vmovl.u8        q15, d12                @ P7[0..7]
1515         vmls.i16        q2, q3, d0[1]           @ 2*P1[8..15]-5*P2[8..15]
1516         vshll.u8        q3, d10, #1             @ 2*P3[0..7]
1517         vmls.i16        q10, q4, d0[1]          @ 2*P5[8..15]-5*P6[8..15]
1518         vmovl.u8        q6, d13                 @ P7[8..15]
1519         vmla.i16        q8, q14, d0[1]          @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
1520         vmovl.u8        q14, d18                @ P4[0..7]
1521         vmovl.u8        q9, d19                 @ P4[8..15]
1522         vmla.i16        q7, q15, d0[1]          @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
1523         vmovl.u8        q15, d11                @ P3[8..15]
1524         vshll.u8        q5, d11, #1             @ 2*P3[8..15]
1525         vmls.i16        q3, q14, d0[1]          @ 2*P3[0..7]-5*P4[0..7]
1526         vmla.i16        q2, q15, d0[1]          @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
1527         vmovl.u8        q15, d22                @ P8[0..7]
1528         vmovl.u8        q11, d23                @ P8[8..15]
1529         vmla.i16        q10, q6, d0[1]          @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
1530         vmovl.u8        q6, d2                  @ P5[0..7]
1531         vmovl.u8        q1, d3                  @ P5[8..15]
1532         vmls.i16        q5, q9, d0[1]           @ 2*P3[8..15]-5*P4[8..15]
1533         vmls.i16        q8, q14, d0[0]          @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
1534         vmls.i16        q7, q15, d0[0]          @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
1535         vsub.i16        q15, q14, q6            @ P4[0..7]-P5[0..7]
1536         vmla.i16        q3, q6, d0[1]           @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
1537         vrshr.s16       q8, q8, #3
1538         vmls.i16        q2, q9, d0[0]           @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
1539         vrshr.s16       q7, q7, #3
1540         vmls.i16        q10, q11, d0[0]         @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
1541         vabs.s16        q11, q15
1542         vabs.s16        q8, q8                  @ a1[0..7]
1543         vmla.i16        q5, q1, d0[1]           @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
1544         vshr.s16        q15, q15, #8            @ clip_sign[0..7]
1545         vrshr.s16       q2, q2, #3
1546         vmls.i16        q3, q12, d0[0]          @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
1547         vabs.s16        q7, q7                  @ a2[0..7]
1548         vrshr.s16       q10, q10, #3
1549         vsub.i16        q12, q9, q1             @ P4[8..15]-P5[8..15]
1550         vshr.s16        q11, q11, #1            @ clip[0..7]
1551         vmls.i16        q5, q4, d0[0]           @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
1552         vcge.s16        q4, q8, q7              @ test a1[0..7] >= a2[0..7]
1553         vabs.s16        q2, q2                  @ a1[8..15]
1554         vrshr.s16       q3, q3, #3
1555         vabs.s16        q10, q10                @ a2[8..15]
1556         vbsl            q4, q7, q8              @ a3[0..7]
1557         vabs.s16        q7, q12
1558         vshr.s16        q8, q12, #8             @ clip_sign[8..15]
1559         vrshr.s16       q5, q5, #3
        vcge.s16        q12, q2, q10            @ test a1[8..15] >= a2[8..15]
1561         vshr.s16        q7, q7, #1              @ clip[8..15]
1562         vbsl            q12, q10, q2            @ a3[8..15]
1563         vabs.s16        q2, q3                  @ a0[0..7]
1564         vceq.i16        q10, q11, #0            @ test clip[0..7] == 0
1565         vshr.s16        q3, q3, #8              @ a0_sign[0..7]
1566         vsub.i16        q3, q15, q3             @ clip_sign[0..7] - a0_sign[0..7]
1567         vcge.s16        q15, q2, q13            @ test a0[0..7] >= pq
1568         vorr            q10, q10, q15           @ test clip[0..7] == 0 || a0[0..7] >= pq
1569         vqsub.u16       q15, q2, q4             @ a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
1570         vcge.s16        q2, q4, q2              @ test a3[0..7] >= a0[0..7]
1571         vabs.s16        q4, q5                  @ a0[8..15]
1572         vshr.s16        q5, q5, #8              @ a0_sign[8..15]
1573         vmul.i16        q15, q15, d0[1]         @ a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
1574         vcge.s16        q13, q4, q13            @ test a0[8..15] >= pq
1575         vorr            q2, q10, q2             @ test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
1576         vsub.i16        q5, q8, q5              @ clip_sign[8..15] - a0_sign[8..15]
1577         vceq.i16        q8, q7, #0              @ test clip[8..15] == 0
1578         vshr.u16        q15, q15, #3            @ a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
1579         vmov.32         r0, d4[1]               @ move to gp reg
1580         vorr            q8, q8, q13             @ test clip[8..15] == 0 || a0[8..15] >= pq
1581         vqsub.u16       q13, q4, q12            @ a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
1582         vmov.32         r2, d5[1]
1583         vcge.s16        q4, q12, q4             @ test a3[8..15] >= a0[8..15]
1584         vshl.i64        q2, q2, #16
1585         vcge.s16        q12, q15, q11
1586         vmul.i16        q0, q13, d0[1]          @ a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
1587         vorr            q4, q8, q4              @ test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
1588         vshr.s64        q2, q2, #48
1589         and             r0, r0, r2
1590         vbsl            q12, q11, q15           @ FFMIN(d[0..7], clip[0..7])
1591         vshl.i64        q11, q4, #16
1592         vmov.32         r2, d8[1]
1593         vshr.u16        q0, q0, #3              @ a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
1594         vorr            q2, q10, q2
1595         vmov.32         r12, d9[1]
1596         vshr.s64        q4, q11, #48
1597         vcge.s16        q10, q0, q7
1598         vbic            q2, q12, q2             @ set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
1599         vorr            q4, q8, q4
1600         and             r2, r2, r12
1601         vbsl            q10, q7, q0             @ FFMIN(d[8..15], clip[8..15])
1602         vmls.i16        q14, q2, q3             @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4[0..7]
1603         and             r0, r0, r2
1604         vbic            q0, q10, q4             @ set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
1605         tst             r0, #1
1606         bne             1f                      @ none of the 16 pixel pairs should be updated in this case
1607         vmla.i16        q6, q2, q3              @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5[0..7]
1608         vmls.i16        q9, q0, q5              @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4[8..15]
1609         vqmovun.s16     d4, q14
1610         vmla.i16        q1, q0, q5              @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5[8..15]
1611         vqmovun.s16     d0, q6
1612         vqmovun.s16     d5, q9
1613         vqmovun.s16     d1, q1
1614         vst1.64         {q2}, [r3 :128], r1
1615         vst1.64         {q0}, [r3 :128]
1616 1:      vpop            {d8-d15}
1617         bx              lr
1618 endfunc
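
@ The 16-wide vertical filter is essentially two copies of the 8-wide data
@ flow run side by side, which is why it spills into the callee-saved d8-d15
@ (hence the vpush/vpop). Its early exit requires all four group decisions to
@ agree that nothing is filtered, so the flags are ANDed together in
@ general-purpose registers before the test, roughly:
@
@   and       r0, r0, r2      @ AND the per-group flags together...
@   tst       r0, #1
@   bne       1f              @ ...and skip the stores only if every group skips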
1619 
1620 @ VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of horizontally-neighbouring blocks
1621 @ On entry:
1622 @   r0 -> top-left pel of right block
1623 @   r1 = row stride, bytes
1624 @   r2 = PQUANT bitstream parameter
1625 function ff_vc1_h_loop_filter16_neon, export=1
1626         push            {r4-r6,lr}
1627         vpush           {d8-d15}
1628         sub             r3, r0, #4              @ where to start reading
1629         vldr            d0, .Lcoeffs
1630         vld1.32         {d2}, [r3], r1          @ P1[0], P2[0]...
1631         sub             r0, r0, #1              @ where to start writing
1632         vld1.32         {d3}, [r3], r1
1633         add             r4, r0, r1, lsl #2
1634         vld1.32         {d10}, [r3], r1
1635         vld1.32         {d11}, [r3], r1
1636         vld1.32         {d16}, [r3], r1
1637         vld1.32         {d4}, [r3], r1
1638         vld1.32         {d8}, [r3], r1
1639         vtrn.8          d2, d3                  @ P1[0], P1[1], P3[0]... P2[0], P2[1], P4[0]...
1640         vld1.32         {d14}, [r3], r1
1641         vld1.32         {d5}, [r3], r1
1642         vtrn.8          d10, d11                @ P1[2], P1[3], P3[2]... P2[2], P2[3], P4[2]...
1643         vld1.32         {d6}, [r3], r1
1644         vld1.32         {d12}, [r3], r1
1645         vtrn.8          d16, d4                 @ P1[4], P1[5], P3[4]... P2[4], P2[5], P4[4]...
1646         vld1.32         {d13}, [r3], r1
1647         vtrn.16         d2, d10                 @ P1[0], P1[1], P1[2], P1[3], P5[0]... P3[0], P3[1], P3[2], P3[3], P7[0]...
1648         vld1.32         {d1}, [r3], r1
1649         vtrn.8          d8, d14                 @ P1[6], P1[7], P3[6]... P2[6], P2[7], P4[6]...
1650         vld1.32         {d7}, [r3], r1
1651         vtrn.16         d3, d11                 @ P2[0], P2[1], P2[2], P2[3], P6[0]... P4[0], P4[1], P4[2], P4[3], P8[0]...
1652         vld1.32         {d9}, [r3], r1
1653         vtrn.8          d5, d6                  @ P1[8], P1[9], P3[8]... P2[8], P2[9], P4[8]...
1654         vld1.32         {d15}, [r3]
1655         vtrn.16         d16, d8                 @ P1[4], P1[5], P1[6], P1[7], P5[4]... P3[4], P3[5], P3[6], P3[7], P7[4]...
1656         vtrn.16         d4, d14                 @ P2[4], P2[5], P2[6], P2[7], P6[4]... P4[4], P4[5], P4[6], P4[7], P8[4]...
1657         vtrn.8          d12, d13                @ P1[10], P1[11], P3[10]... P2[10], P2[11], P4[10]...
1658         vdup.16         q9, r2                  @ pq
1659         vtrn.8          d1, d7                  @ P1[12], P1[13], P3[12]... P2[12], P2[13], P4[12]...
1660         vtrn.32         d2, d16                 @ P1[0..7], P5[0..7]
        vtrn.16         d5, d12                 @ P1[8], P1[9], P1[10], P1[11], P5[8]... P3[8], P3[9], P3[10], P3[11], P7[8]...
        vtrn.16         d6, d13                 @ P2[8], P2[9], P2[10], P2[11], P6[8]... P4[8], P4[9], P4[10], P4[11], P8[8]...
1663         vtrn.8          d9, d15                 @ P1[14], P1[15], P3[14]... P2[14], P2[15], P4[14]...
1664         vtrn.32         d3, d4                  @ P2[0..7], P6[0..7]
1665         vshll.u8        q10, d2, #1             @ 2*P1[0..7]
1666         vtrn.32         d10, d8                 @ P3[0..7], P7[0..7]
1667         vshll.u8        q11, d16, #1            @ 2*P5[0..7]
1668         vtrn.32         d11, d14                @ P4[0..7], P8[0..7]
1669         vtrn.16         d1, d9                  @ P1[12], P1[13], P1[14], P1[15], P5[12]... P3[12], P3[13], P3[14], P3[15], P7[12]...
1670         vtrn.16         d7, d15                 @ P2[12], P2[13], P2[14], P2[15], P6[12]... P4[12], P4[13], P4[14], P4[15], P8[12]...
1671         vmovl.u8        q1, d3                  @ P2[0..7]
1672         vmovl.u8        q12, d4                 @ P6[0..7]
1673         vtrn.32         d5, d1                  @ P1[8..15], P5[8..15]
1674         vtrn.32         d6, d7                  @ P2[8..15], P6[8..15]
1675         vtrn.32         d12, d9                 @ P3[8..15], P7[8..15]
1676         vtrn.32         d13, d15                @ P4[8..15], P8[8..15]
1677         vmls.i16        q10, q1, d0[1]          @ 2*P1[0..7]-5*P2[0..7]
1678         vmovl.u8        q1, d10                 @ P3[0..7]
1679         vshll.u8        q2, d5, #1              @ 2*P1[8..15]
1680         vshll.u8        q13, d1, #1             @ 2*P5[8..15]
1681         vmls.i16        q11, q12, d0[1]         @ 2*P5[0..7]-5*P6[0..7]
1682         vmovl.u8        q14, d6                 @ P2[8..15]
1683         vmovl.u8        q3, d7                  @ P6[8..15]
1684         vmovl.u8        q15, d8                 @ P7[0..7]
1685         vmla.i16        q10, q1, d0[1]          @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
1686         vmovl.u8        q1, d12                 @ P3[8..15]
1687         vmls.i16        q2, q14, d0[1]          @ 2*P1[8..15]-5*P2[8..15]
1688         vmovl.u8        q4, d9                  @ P7[8..15]
1689         vshll.u8        q14, d10, #1            @ 2*P3[0..7]
1690         vmls.i16        q13, q3, d0[1]          @ 2*P5[8..15]-5*P6[8..15]
1691         vmovl.u8        q5, d11                 @ P4[0..7]
1692         vmla.i16        q11, q15, d0[1]         @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
1693         vshll.u8        q15, d12, #1            @ 2*P3[8..15]
1694         vmovl.u8        q6, d13                 @ P4[8..15]
1695         vmla.i16        q2, q1, d0[1]           @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
1696         vmovl.u8        q1, d14                 @ P8[0..7]
1697         vmovl.u8        q7, d15                 @ P8[8..15]
1698         vmla.i16        q13, q4, d0[1]          @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
1699         vmovl.u8        q4, d16                 @ P5[0..7]
1700         vmovl.u8        q8, d1                  @ P5[8..15]
1701         vmls.i16        q14, q5, d0[1]          @ 2*P3[0..7]-5*P4[0..7]
1702         vmls.i16        q15, q6, d0[1]          @ 2*P3[8..15]-5*P4[8..15]
1703         vmls.i16        q10, q5, d0[0]          @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
1704         vmls.i16        q11, q1, d0[0]          @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
1705         vsub.i16        q1, q5, q4              @ P4[0..7]-P5[0..7]
1706         vmls.i16        q2, q6, d0[0]           @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
1707         vrshr.s16       q10, q10, #3
1708         vmls.i16        q13, q7, d0[0]          @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
1709         vsub.i16        q7, q6, q8              @ P4[8..15]-P5[8..15]
1710         vrshr.s16       q11, q11, #3
        vmla.i16        q14, q4, d0[1]          @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
1712         vrshr.s16       q2, q2, #3
1713         vmla.i16        q15, q8, d0[1]          @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
1714         vabs.s16        q10, q10                @ a1[0..7]
1715         vrshr.s16       q13, q13, #3
1716         vmls.i16        q15, q3, d0[0]          @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
1717         vabs.s16        q3, q11                 @ a2[0..7]
1718         vabs.s16        q2, q2                  @ a1[8..15]
1719         vmls.i16        q14, q12, d0[0]         @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
1720         vabs.s16        q11, q1
1721         vabs.s16        q12, q13                @ a2[8..15]
1722         vcge.s16        q13, q10, q3            @ test a1[0..7] >= a2[0..7]
1723         vshr.s16        q1, q1, #8              @ clip_sign[0..7]
1724         vrshr.s16       q15, q15, #3
1725         vshr.s16        q11, q11, #1            @ clip[0..7]
1726         vrshr.s16       q14, q14, #3
1727         vbsl            q13, q3, q10            @ a3[0..7]
        vcge.s16        q3, q2, q12             @ test a1[8..15] >= a2[8..15]
1729         vabs.s16        q10, q15                @ a0[8..15]
1730         vshr.s16        q15, q15, #8            @ a0_sign[8..15]
1731         vbsl            q3, q12, q2             @ a3[8..15]
1732         vabs.s16        q2, q14                 @ a0[0..7]
1733         vabs.s16        q12, q7
1734         vshr.s16        q7, q7, #8              @ clip_sign[8..15]
1735         vshr.s16        q14, q14, #8            @ a0_sign[0..7]
1736         vshr.s16        q12, q12, #1            @ clip[8..15]
1737         vsub.i16        q7, q7, q15             @ clip_sign[8..15] - a0_sign[8..15]
1738         vqsub.u16       q15, q10, q3            @ a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
1739         vcge.s16        q3, q3, q10             @ test a3[8..15] >= a0[8..15]
1740         vcge.s16        q10, q10, q9            @ test a0[8..15] >= pq
1741         vcge.s16        q9, q2, q9              @ test a0[0..7] >= pq
1742         vsub.i16        q1, q1, q14             @ clip_sign[0..7] - a0_sign[0..7]
1743         vqsub.u16       q14, q2, q13            @ a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
1744         vcge.s16        q2, q13, q2             @ test a3[0..7] >= a0[0..7]
1745         vmul.i16        q13, q15, d0[1]         @ a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
1746         vceq.i16        q15, q11, #0            @ test clip[0..7] == 0
1747         vmul.i16        q0, q14, d0[1]          @ a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
1748         vorr            q9, q15, q9             @ test clip[0..7] == 0 || a0[0..7] >= pq
1749         vceq.i16        q14, q12, #0            @ test clip[8..15] == 0
1750         vshr.u16        q13, q13, #3            @ a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
1751         vorr            q2, q9, q2              @ test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
1752         vshr.u16        q0, q0, #3              @ a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
1753         vorr            q10, q14, q10           @ test clip[8..15] == 0 || a0[8..15] >= pq
1754         vcge.s16        q14, q13, q12
1755         vmov.32         r2, d4[1]               @ move to gp reg
1756         vorr            q3, q10, q3             @ test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
1757         vmov.32         r3, d5[1]
1758         vcge.s16        q2, q0, q11
1759         vbsl            q14, q12, q13           @ FFMIN(d[8..15], clip[8..15])
1760         vbsl            q2, q11, q0             @ FFMIN(d[0..7], clip[0..7])
1761         vmov.32         r5, d6[1]
1762         vbic            q0, q14, q10            @ set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
1763         vmov.32         r6, d7[1]
1764         and             r12, r2, r3
1765         vbic            q2, q2, q9              @ set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
1766         vmls.i16        q6, q0, q7              @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4
1767         vmls.i16        q5, q2, q1              @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4
1768         and             r14, r5, r6
1769         vmla.i16        q4, q2, q1              @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5
1770         and             r12, r12, r14
1771         vqmovun.s16     d4, q6
1772         vmla.i16        q8, q0, q7              @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5
1773         tst             r12, #1
1774         bne             4f                      @ none of the 16 pixel pairs should be updated in this case
1775         vqmovun.s16     d2, q5
1776         vqmovun.s16     d3, q4
1777         vqmovun.s16     d5, q8
1778         tst             r2, #1
1779         bne             1f
1780         vst2.8          {d2[0], d3[0]}, [r0], r1
1781         vst2.8          {d2[1], d3[1]}, [r0], r1
1782         vst2.8          {d2[2], d3[2]}, [r0], r1
1783         vst2.8          {d2[3], d3[3]}, [r0]
1784 1:      add             r0, r4, r1, lsl #2
1785         tst             r3, #1
1786         bne             2f
1787         vst2.8          {d2[4], d3[4]}, [r4], r1
1788         vst2.8          {d2[5], d3[5]}, [r4], r1
1789         vst2.8          {d2[6], d3[6]}, [r4], r1
1790         vst2.8          {d2[7], d3[7]}, [r4]
1791 2:      add             r4, r0, r1, lsl #2
1792         tst             r5, #1
1793         bne             3f
1794         vst2.8          {d4[0], d5[0]}, [r0], r1
1795         vst2.8          {d4[1], d5[1]}, [r0], r1
1796         vst2.8          {d4[2], d5[2]}, [r0], r1
1797         vst2.8          {d4[3], d5[3]}, [r0]
1798 3:      tst             r6, #1
1799         bne             4f
1800         vst2.8          {d4[4], d5[4]}, [r4], r1
1801         vst2.8          {d4[5], d5[5]}, [r4], r1
1802         vst2.8          {d4[6], d5[6]}, [r4], r1
1803         vst2.8          {d4[7], d5[7]}, [r4]
1804 4:      vpop            {d8-d15}
1805         pop             {r4-r6,pc}
1806 endfunc
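
@ Write-back in all of these filters narrows the 16-bit intermediates with
@ vqmovun.s16, which also clamps the filtered values back to the 0-255 pixel
@ range. The horizontal variants then re-interleave the P4/P5 columns with
@ single-lane vst2.8 stores, one pixel pair per row, e.g. (illustrative
@ registers):
@
@   vqmovun.s16  d0, q2                    @ P4 column, saturated to u8
@   vqmovun.s16  d1, q3                    @ P5 column, saturated to u8
@   vst2.8       {d0[0], d1[0]}, [r0], r1  @ one P4/P5 pair, then step a row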
1807 
1808 @ Copy at most the specified number of bytes from source to destination buffer,
1809 @ stopping at a multiple of 16 bytes, none of which are the start of an escape sequence
1810 @ On entry:
1811 @   r0 -> source buffer
1812 @   r1 = max number of bytes to copy
1813 @   r2 -> destination buffer, optimally 8-byte aligned
1814 @ On exit:
1815 @   r0 = number of bytes not copied
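@
@ The scan works on 16-byte blocks, two per loop iteration (the interleaved
@ indentation below marks the two software-pipelined halves). Within a block,
@ every 32-bit window at byte offsets 0-3 (formed with vext) is tested for the
@ start of an escape sequence, i.e. the bytes 00 00 03 followed by a byte <= 3.
@ In little-endian terms that is (w & ~0x03000000) == 0x00030000, which the
@ vector code below evaluates as:
@
@   vbic      q12, q8, q0     @ drop the two "don't care" bits (q0 = 0x03000000)
@   veor      q12, q12, q1    @ compare with 00 00 03 xx      (q1 = 0x00030000)
@   vceq.i32  q12, q12, #0    @ all-ones lane => escape sequence starts here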
1816 function ff_vc1_unescape_buffer_helper_neon, export=1
1817         @ Offset by 48 to screen out cases that are too short for us to handle,
1818         @ and also make it easy to test for loop termination, or to determine
1819         @ whether we need an odd number of half-iterations of the loop.
1820         subs    r1, r1, #48
1821         bmi     90f
1822 
1823         @ Set up useful constants
1824         vmov.i32        q0, #0x3000000
1825         vmov.i32        q1, #0x30000
1826 
1827         tst             r1, #16
1828         bne             1f
1829 
1830           vld1.8          {q8, q9}, [r0]!
1831           vbic            q12, q8, q0
1832           vext.8          q13, q8, q9, #1
1833           vext.8          q14, q8, q9, #2
1834           vext.8          q15, q8, q9, #3
1835           veor            q12, q12, q1
1836           vbic            q13, q13, q0
1837           vbic            q14, q14, q0
1838           vbic            q15, q15, q0
1839           vceq.i32        q12, q12, #0
1840           veor            q13, q13, q1
1841           veor            q14, q14, q1
1842           veor            q15, q15, q1
1843           vceq.i32        q13, q13, #0
1844           vceq.i32        q14, q14, #0
1845           vceq.i32        q15, q15, #0
1846           add             r1, r1, #16
1847           b               3f
1848 
1849 1:      vld1.8          {q10, q11}, [r0]!
1850         vbic            q12, q10, q0
1851         vext.8          q13, q10, q11, #1
1852         vext.8          q14, q10, q11, #2
1853         vext.8          q15, q10, q11, #3
1854         veor            q12, q12, q1
1855         vbic            q13, q13, q0
1856         vbic            q14, q14, q0
1857         vbic            q15, q15, q0
1858         vceq.i32        q12, q12, #0
1859         veor            q13, q13, q1
1860         veor            q14, q14, q1
1861         veor            q15, q15, q1
1862         vceq.i32        q13, q13, #0
1863         vceq.i32        q14, q14, #0
1864         vceq.i32        q15, q15, #0
1865         @ Drop through...
1866 2:        vmov            q8, q11
1867           vld1.8          {q9}, [r0]!
1868         vorr            q13, q12, q13
1869         vorr            q15, q14, q15
1870           vbic            q12, q8, q0
1871         vorr            q3, q13, q15
1872           vext.8          q13, q8, q9, #1
1873           vext.8          q14, q8, q9, #2
1874           vext.8          q15, q8, q9, #3
1875           veor            q12, q12, q1
1876         vorr            d6, d6, d7
1877           vbic            q13, q13, q0
1878           vbic            q14, q14, q0
1879           vbic            q15, q15, q0
1880           vceq.i32        q12, q12, #0
1881         vmov            r3, r12, d6
1882           veor            q13, q13, q1
1883           veor            q14, q14, q1
1884           veor            q15, q15, q1
1885           vceq.i32        q13, q13, #0
1886           vceq.i32        q14, q14, #0
1887           vceq.i32        q15, q15, #0
1888         orrs            r3, r3, r12
1889         bne             90f
1890         vst1.64         {q10}, [r2]!
1891 3:          vmov            q10, q9
1892             vld1.8          {q11}, [r0]!
1893           vorr            q13, q12, q13
1894           vorr            q15, q14, q15
1895             vbic            q12, q10, q0
1896           vorr            q3, q13, q15
1897             vext.8          q13, q10, q11, #1
1898             vext.8          q14, q10, q11, #2
1899             vext.8          q15, q10, q11, #3
1900             veor            q12, q12, q1
1901           vorr            d6, d6, d7
1902             vbic            q13, q13, q0
1903             vbic            q14, q14, q0
1904             vbic            q15, q15, q0
1905             vceq.i32        q12, q12, #0
1906           vmov            r3, r12, d6
1907             veor            q13, q13, q1
1908             veor            q14, q14, q1
1909             veor            q15, q15, q1
1910             vceq.i32        q13, q13, #0
1911             vceq.i32        q14, q14, #0
1912             vceq.i32        q15, q15, #0
1913           orrs            r3, r3, r12
1914           bne             91f
1915           vst1.64         {q8}, [r2]!
1916         subs            r1, r1, #32
1917         bpl             2b
1918 
1919 90:     add             r0, r1, #48
1920         bx              lr
1921 
1922 91:     sub             r1, r1, #16
1923         b               90b
1924 endfunc
1925