/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config_components.h"

#include "libavutil/aarch64/asm.S"

/* chroma_mc8(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
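/* Scalar reference for the 2-D case (a sketch of the standard bilinear
 * chroma interpolation these functions implement, not FFmpeg's C code):
 *
 *     const int A = (8 - x) * (8 - y);
 *     const int B =      x  * (8 - y);
 *     const int C = (8 - x) *      y;
 *     const int D =      x  *      y;
 *     for (int i = 0; i < h; i++) {
 *         for (int j = 0; j < 8; j++)
 *             dst[j] = (A * src[j]          + B * src[j + 1] +
 *                       C * src[j + stride] + D * src[j + stride + 1] +
 *                       32) >> 6;
 *         dst += stride;
 *         src += stride;
 *     }
 *
 * RV40 and VC-1 use the same weights but replace the +32 rounding term
 * with a codec-specific bias (kept in v22 below).
 */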
.macro  h264_chroma_mc8 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
  .ifc \type,avg
        mov             x8,  x0
  .endif
        prfm            pldl1strm, [x1]
        prfm            pldl1strm, [x1, x2]
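        // RV40 looks its bias up in rv40bias, one halfword per
        // (x >> 1, y >> 1) pair; VC-1 uses the constant 28.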
  .ifc \codec,rv40
        movrel          x6,  rv40bias
        lsr             w9,  w5,  #1
        lsr             w10, w4,  #1
        lsl             w9,  w9,  #3
        lsl             w10, w10, #1
        add             w9,  w9,  w10
        add             x6,  x6,  w9, UXTW
        ld1r            {v22.8H}, [x6]
  .endif
  .ifc \codec,vc1
        movi            v22.8H,   #28
  .endif
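        // Bilinear weights from x (w4) and y (w5):
        //   w7  = D = x * y            w12 = B = x * (8 - y)
        //   w6  = C = (8 - x) * y      w4  = A = (8 - x) * (8 - y)
        // If x * y == 0, B or C (or both) vanish; branch to the 1-D paths.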
        mul             w7,  w4,  w5
        lsl             w14, w5,  #3
        lsl             w13, w4,  #3
        cmp             w7,  #0
        sub             w6,  w14, w7
        sub             w12, w13, w7
        sub             w4,  w7,  w13
        sub             w4,  w4,  w14
        add             w4,  w4,  #64
        b.eq            2f

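        // Full 2-D filter (x > 0 && y > 0): v0-v3 = A, B, C, D.  Two output
        // rows per iteration; the bottom input row of one pair is reused as
        // the top row of the next.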
        dup             v0.8B,  w4
        dup             v1.8B,  w12
        ld1             {v4.8B, v5.8B}, [x1], x2
        dup             v2.8B,  w6
        dup             v3.8B,  w7
        ext             v5.8B,  v4.8B,  v5.8B,  #1
1:      ld1             {v6.8B, v7.8B}, [x1], x2
        umull           v16.8H, v4.8B,  v0.8B
        umlal           v16.8H, v5.8B,  v1.8B
        ext             v7.8B,  v6.8B,  v7.8B,  #1
        ld1             {v4.8B, v5.8B}, [x1], x2
        umlal           v16.8H, v6.8B,  v2.8B
        prfm            pldl1strm, [x1]
        ext             v5.8B,  v4.8B,  v5.8B,  #1
        umlal           v16.8H, v7.8B,  v3.8B
        umull           v17.8H, v6.8B,  v0.8B
        subs            w3,  w3,  #2
        umlal           v17.8H, v7.8B, v1.8B
        umlal           v17.8H, v4.8B, v2.8B
        umlal           v17.8H, v5.8B, v3.8B
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,h264
        rshrn           v16.8B, v16.8H, #6
        rshrn           v17.8B, v17.8H, #6
  .else
        add             v16.8H, v16.8H, v22.8H
        add             v17.8H, v17.8H, v22.8H
        shrn            v16.8B, v16.8H, #6
        shrn            v17.8B, v17.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.8B}, [x8], x2
        ld1             {v21.8B}, [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
        urhadd          v17.8B, v17.8B, v21.8B
  .endif
        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        b.gt            1b
        ret

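        // x * y == 0: after the add, w12 holds the only nonzero weight
        // (8*x or 8*y).  Both zero -> 5 (copy); w6 == 0, i.e. y == 0 ->
        // 4 (horizontal only); otherwise fall through to 3 (vertical only).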
2:      adds            w12, w12, w6
        dup             v0.8B, w4
        b.eq            5f
        tst             w6,  w6
        dup             v1.8B, w12
        b.eq            4f

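        // 3: vertical-only filter (x == 0): A * row[n] + C * row[n + 1].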
        ld1             {v4.8B}, [x1], x2
3:      ld1             {v6.8B}, [x1], x2
        umull           v16.8H, v4.8B,  v0.8B
        umlal           v16.8H, v6.8B,  v1.8B
        ld1             {v4.8B}, [x1], x2
        umull           v17.8H, v6.8B,  v0.8B
        umlal           v17.8H, v4.8B,  v1.8B
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
        rshrn           v16.8B, v16.8H, #6
        rshrn           v17.8B, v17.8H, #6
  .else
        add             v16.8H, v16.8H, v22.8H
        add             v17.8H, v17.8H, v22.8H
        shrn            v16.8B, v16.8H, #6
        shrn            v17.8B, v17.8H, #6
  .endif
        prfm            pldl1strm, [x1, x2]
  .ifc \type,avg
        ld1             {v20.8B}, [x8], x2
        ld1             {v21.8B}, [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
        urhadd          v17.8B, v17.8B, v21.8B
  .endif
        subs            w3,  w3,  #2
        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        b.gt            3b
        ret

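        // 4: horizontal-only filter (y == 0): A * src[j] + B * src[j + 1].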
4:      ld1             {v4.8B, v5.8B}, [x1], x2
        ld1             {v6.8B, v7.8B}, [x1], x2
        ext             v5.8B,  v4.8B,  v5.8B,  #1
        ext             v7.8B,  v6.8B,  v7.8B,  #1
        prfm            pldl1strm, [x1]
        subs            w3,  w3,  #2
        umull           v16.8H, v4.8B, v0.8B
        umlal           v16.8H, v5.8B, v1.8B
        umull           v17.8H, v6.8B, v0.8B
        umlal           v17.8H, v7.8B, v1.8B
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,h264
        rshrn           v16.8B, v16.8H, #6
        rshrn           v17.8B, v17.8H, #6
  .else
        add             v16.8H, v16.8H, v22.8H
        add             v17.8H, v17.8H, v22.8H
        shrn            v16.8B, v16.8H, #6
        shrn            v17.8B, v17.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.8B}, [x8], x2
        ld1             {v21.8B}, [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
        urhadd          v17.8B, v17.8B, v21.8B
  .endif
        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        b.gt            4b
        ret

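        // 5: x == 0 && y == 0: A == 64, a rounded (and, for RV40/VC-1,
        //    biased) copy.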
5:      ld1             {v4.8B}, [x1], x2
        ld1             {v5.8B}, [x1], x2
        prfm            pldl1strm, [x1]
        subs            w3,  w3,  #2
        umull           v16.8H, v4.8B, v0.8B
        umull           v17.8H, v5.8B, v0.8B
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,h264
        rshrn           v16.8B, v16.8H, #6
        rshrn           v17.8B, v17.8H, #6
  .else
        add             v16.8H, v16.8H, v22.8H
        add             v17.8H, v17.8H, v22.8H
        shrn            v16.8B, v16.8H, #6
        shrn            v17.8B, v17.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.8B}, [x8], x2
        ld1             {v21.8B}, [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
        urhadd          v17.8B, v17.8B, v21.8B
  .endif
        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        b.gt            5b
        ret
endfunc
.endm

/* chroma_mc4(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
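/* Same filter as mc8 on 4-pixel rows.  src[j] and src[j + 1] are packed
 * into the two .S halves of one register (with the weights interleaved to
 * match, e.g. v0 = AAAABBBB), so a single umull/umlal pair accumulates
 * A*a + C*c in the low four halfwords and B*b + D*d in the high four;
 * trn1/trn2 on .2D plus an add then merge the halves of two output rows
 * at once. */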
.macro  h264_chroma_mc4 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
  .ifc \type,avg
        mov             x8,  x0
  .endif
        prfm            pldl1strm, [x1]
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,rv40
        movrel          x6,  rv40bias
        lsr             w9,  w5,  #1
        lsr             w10, w4,  #1
        lsl             w9,  w9,  #3
        lsl             w10, w10, #1
        add             w9,  w9,  w10
        add             x6,  x6,  w9, UXTW
        ld1r            {v22.8H}, [x6]
  .endif
  .ifc \codec,vc1
        movi            v22.8H,   #28
  .endif
        mul             w7,  w4,  w5
        lsl             w14, w5,  #3
        lsl             w13, w4,  #3
        cmp             w7,  #0
        sub             w6,  w14, w7
        sub             w12, w13, w7
        sub             w4,  w7,  w13
        sub             w4,  w4,  w14
        add             w4,  w4,  #64
        b.eq            2f

        dup             v24.8B,  w4
        dup             v25.8B,  w12
        ld1             {v4.8B}, [x1], x2
        dup             v26.8B,  w6
        dup             v27.8B,  w7
        ext             v5.8B,  v4.8B,  v5.8B, #1
        trn1            v0.2S,  v24.2S, v25.2S
        trn1            v2.2S,  v26.2S, v27.2S
        trn1            v4.2S,  v4.2S,  v5.2S
1:      ld1             {v6.8B}, [x1], x2
        ext             v7.8B,  v6.8B,  v7.8B, #1
        trn1            v6.2S,  v6.2S,  v7.2S
        umull           v18.8H, v4.8B,  v0.8B
        umlal           v18.8H, v6.8B,  v2.8B
        ld1             {v4.8B}, [x1], x2
        ext             v5.8B,  v4.8B,  v5.8B, #1
        trn1            v4.2S,  v4.2S,  v5.2S
        prfm            pldl1strm, [x1]
        umull           v19.8H, v6.8B,  v0.8B
        umlal           v19.8H, v4.8B,  v2.8B
        trn1            v30.2D, v18.2D, v19.2D
        trn2            v31.2D, v18.2D, v19.2D
        add             v18.8H, v30.8H, v31.8H
  .ifc \codec,h264
        rshrn           v16.8B, v18.8H, #6
  .else
        add             v18.8H, v18.8H, v22.8H
        shrn            v16.8B, v18.8H, #6
  .endif
        subs            w3,  w3,  #2
        prfm            pldl1strm, [x1, x2]
  .ifc \type,avg
        ld1             {v20.S}[0], [x8], x2
        ld1             {v20.S}[1], [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
  .endif
        st1             {v16.S}[0], [x0], x2
        st1             {v16.S}[1], [x0], x2
        b.gt            1b
        ret

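        // x * y == 0 dispatch, as in mc8: 5 = copy, 4 = horizontal only,
        // fall through = vertical only.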
2:      adds            w12, w12, w6
        dup             v30.8B, w4
        b.eq            5f
        tst             w6,  w6
        dup             v31.8B, w12
        trn1            v0.2S,  v30.2S, v31.2S
        trn2            v1.2S,  v30.2S, v31.2S
        b.eq            4f

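        // 3: vertical only.  v0 = AAAABBBB; after the ext, v1 = BBBBAAAA,
        // so (row n | row n+1) * v0 yields output row n while
        // (row n+2 | row n+1) * v1 yields output row n+1.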
        ext             v1.8B,  v0.8B,  v1.8B, #4
        ld1             {v4.S}[0], [x1], x2
3:      ld1             {v4.S}[1], [x1], x2
        umull           v18.8H, v4.8B,  v0.8B
        ld1             {v4.S}[0], [x1], x2
        umull           v19.8H, v4.8B,  v1.8B
        trn1            v30.2D, v18.2D, v19.2D
        trn2            v31.2D, v18.2D, v19.2D
        add             v18.8H, v30.8H, v31.8H
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
        rshrn           v16.8B, v18.8H, #6
  .else
        add             v18.8H, v18.8H, v22.8H
        shrn            v16.8B, v18.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.S}[0], [x8], x2
        ld1             {v20.S}[1], [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
  .endif
        subs            w3,  w3,  #2
        prfm            pldl1strm, [x1, x2]
        st1             {v16.S}[0], [x0], x2
        st1             {v16.S}[1], [x0], x2
        b.gt            3b
        ret

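        // 4: horizontal only, two packed rows per iteration.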
4:      ld1             {v4.8B}, [x1], x2
        ld1             {v6.8B}, [x1], x2
        ext             v5.8B,  v4.8B,  v5.8B, #1
        ext             v7.8B,  v6.8B,  v7.8B, #1
        trn1            v4.2S,  v4.2S,  v5.2S
        trn1            v6.2S,  v6.2S,  v7.2S
        umull           v18.8H, v4.8B,  v0.8B
        umull           v19.8H, v6.8B,  v0.8B
        subs            w3,  w3,  #2
        trn1            v30.2D, v18.2D, v19.2D
        trn2            v31.2D, v18.2D, v19.2D
        add             v18.8H, v30.8H, v31.8H
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
        rshrn           v16.8B, v18.8H, #6
  .else
        add             v18.8H, v18.8H, v22.8H
        shrn            v16.8B, v18.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.S}[0], [x8], x2
        ld1             {v20.S}[1], [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
  .endif
        prfm            pldl1strm, [x1]
        st1             {v16.S}[0], [x0], x2
        st1             {v16.S}[1], [x0], x2
        b.gt            4b
        ret

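        // 5: plain copy (A == 64), with the RV40/VC-1 bias where needed.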
5:      ld1             {v4.S}[0], [x1], x2
        ld1             {v4.S}[1], [x1], x2
        umull           v18.8H, v4.8B,  v30.8B
        subs            w3,  w3,  #2
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
        rshrn           v16.8B, v18.8H, #6
  .else
        add             v18.8H, v18.8H, v22.8H
        shrn            v16.8B, v18.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.S}[0], [x8], x2
        ld1             {v20.S}[1], [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
  .endif
        prfm            pldl1strm, [x1]
        st1             {v16.S}[0], [x0], x2
        st1             {v16.S}[1], [x0], x2
        b.gt            5b
        ret
endfunc
.endm

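/* chroma_mc2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y)
 *
 * 2-pixel-wide variant (H.264 only).  The weights are interleaved as
 * halfword pairs (v0 = AABBAABB, v1 = CCDDCCDD) and two rows plus their
 * 1-pixel shifts are packed into v4/v5, so one umull/umlal covers the
 * whole 2x2 stencil for two output rows; rev64 + add folds the partial
 * sums. */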
.macro  h264_chroma_mc2 type
function ff_\type\()_h264_chroma_mc2_neon, export=1
        prfm            pldl1strm, [x1]
        prfm            pldl1strm, [x1, x2]
        orr             w7,  w4,  w5
        cbz             w7,  2f

        mul             w7,  w4,  w5
        lsl             w14, w5,  #3
        lsl             w13, w4,  #3
        sub             w6,  w14, w7
        sub             w12, w13, w7
        sub             w4,  w7,  w13
        sub             w4,  w4,  w14
        add             w4,  w4,  #64
        dup             v0.8B,  w4
        dup             v2.8B,  w12
        dup             v1.8B,  w6
        dup             v3.8B,  w7
        trn1            v0.4H,  v0.4H,  v2.4H
        trn1            v1.4H,  v1.4H,  v3.4H
1:
        ld1             {v4.S}[0],  [x1], x2
        ld1             {v4.S}[1],  [x1], x2
        rev64           v5.2S,  v4.2S
        ld1             {v5.S}[1],  [x1]
        ext             v6.8B,  v4.8B,  v5.8B,  #1
        ext             v7.8B,  v5.8B,  v4.8B,  #1
        trn1            v4.4H,  v4.4H,  v6.4H
        trn1            v5.4H,  v5.4H,  v7.4H
        umull           v16.8H, v4.8B,  v0.8B
        umlal           v16.8H, v5.8B,  v1.8B
  .ifc \type,avg
        ld1             {v18.H}[0], [x0], x2
        ld1             {v18.H}[2], [x0]
        sub             x0,  x0,  x2
  .endif
        rev64           v17.4S, v16.4S
        add             v16.8H, v16.8H, v17.8H
        rshrn           v16.8B, v16.8H, #6
  .ifc \type,avg
        urhadd          v16.8B, v16.8B, v18.8B
  .endif
        st1             {v16.H}[0], [x0], x2
        st1             {v16.H}[2], [x0], x2
        subs            w3,  w3,  #2
        b.gt            1b
        ret

2:
        ld1             {v16.H}[0], [x1], x2
        ld1             {v16.H}[1], [x1], x2
  .ifc \type,avg
        ld1             {v18.H}[0], [x0], x2
        ld1             {v18.H}[1], [x0]
        sub             x0,  x0,  x2
        urhadd          v16.8B, v16.8B, v18.8B
  .endif
        st1             {v16.H}[0], [x0], x2
        st1             {v16.H}[1], [x0], x2
        subs            w3,  w3,  #2
        b.gt            2b
        ret
endfunc
.endm

        h264_chroma_mc8 put
        h264_chroma_mc8 avg
        h264_chroma_mc4 put
        h264_chroma_mc4 avg
        h264_chroma_mc2 put
        h264_chroma_mc2 avg

#if CONFIG_RV40_DECODER
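// Per-position rounding bias for RV40 chroma, indexed by (y >> 1, x >> 1),
// one halfword per entry, four entries per row.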
const   rv40bias
        .short           0, 16, 32, 16
        .short          32, 28, 32, 28
        .short           0, 32, 16, 32
        .short          32, 28, 32, 28
endconst

        h264_chroma_mc8 put, rv40
        h264_chroma_mc8 avg, rv40
        h264_chroma_mc4 put, rv40
        h264_chroma_mc4 avg, rv40
#endif

#if CONFIG_VC1DSP
        h264_chroma_mc8 put, vc1
        h264_chroma_mc8 avg, vc1
        h264_chroma_mc4 put, vc1
        h264_chroma_mc4 avg, vc1
#endif