1/*
2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
4 *
5 * This file is part of FFmpeg.
6 *
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22#include "config_components.h"
23
24#include "libavutil/aarch64/asm.S"
25
26/* chroma_mc8(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
/*
 * Emit ff_\type\()_\codec\()_chroma_mc8_neon:
 *   void mc8(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y)
 * 8-pixel-wide bilinear chroma interpolation over h rows; every loop below
 * retires two rows per iteration, so h is assumed even.
 *
 * Weights are derived from the subpel position (x, y):
 *   A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y   (A+B+C+D == 64)
 * \codec == h264 rounds to nearest before the >>6 normalization; rv40/vc1
 * instead add a bias (kept splatted in v22) and truncate.
 * \type == avg additionally averages the result with the existing dst.
 */
.macro  h264_chroma_mc8 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
  .ifc \type,avg
        mov             x8,  x0                 // x8: read pointer into dst for averaging
  .endif
        prfm            pldl1strm, [x1]
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,rv40
        // rv40: position-dependent bias from the 4x4 table of .short
        // values; byte offset = (y>>1)*8 + (x>>1)*2.
        movrel          x6,  rv40bias
        lsr             w9,  w5,  #1
        lsr             w10, w4,  #1
        lsl             w9,  w9,  #3
        lsl             w10, w10, #1
        add             w9,  w9,  w10
        add             x6,  x6,  w9, UXTW
        ld1r            {v22.8H}, [x6]          // splat bias into all 8 lanes
  .endif
  .ifc \codec,vc1
        movi            v22.8H,   #28           // vc1: constant bias
  .endif
        // Compute the four bilinear weights:
        //   w7  = D = x*y                w12 = B = 8*x - x*y = x*(8-y)
        //   w6  = C = 8*y - x*y          w4  = A = x*y - 8*x - 8*y + 64
        mul             w7,  w4,  w5
        lsl             w14, w5,  #3
        lsl             w13, w4,  #3
        cmp             w7,  #0
        sub             w6,  w14, w7
        sub             w12, w13, w7
        sub             w4,  w7,  w13
        sub             w4,  w4,  w14
        add             w4,  w4,  #64
        b.eq            2f                      // D == 0 (x == 0 or y == 0): cheaper 1-D paths

        // Full 2-D filter: out = (A*p00 + B*p01 + C*p10 + D*p11 [+bias]) >> 6.
        // The bottom row of one row-pair (v4/v5) is reused as the top row of
        // the next iteration.
        dup             v0.8B,  w4
        dup             v1.8B,  w12
        ld1             {v4.8B, v5.8B}, [x1], x2
        dup             v2.8B,  w6
        dup             v3.8B,  w7
        ext             v5.8B,  v4.8B,  v5.8B,  #1      // v5 = row shifted left one pixel
1:      ld1             {v6.8B, v7.8B}, [x1], x2
        umull           v16.8H, v4.8B,  v0.8B
        umlal           v16.8H, v5.8B,  v1.8B
        ext             v7.8B,  v6.8B,  v7.8B,  #1
        ld1             {v4.8B, v5.8B}, [x1], x2
        umlal           v16.8H, v6.8B,  v2.8B
        prfm            pldl1strm, [x1]
        ext             v5.8B,  v4.8B,  v5.8B,  #1
        umlal           v16.8H, v7.8B,  v3.8B
        umull           v17.8H, v6.8B,  v0.8B
        subs            w3,  w3,  #2            // two output rows per iteration
        umlal           v17.8H, v7.8B, v1.8B
        umlal           v17.8H, v4.8B, v2.8B
        umlal           v17.8H, v5.8B, v3.8B
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,h264
        rshrn           v16.8B, v16.8H, #6      // round-to-nearest, narrow to bytes
        rshrn           v17.8B, v17.8H, #6
  .else
        add             v16.8H, v16.8H, v22.8H  // add bias, then truncating narrow
        add             v17.8H, v17.8H, v22.8H
        shrn            v16.8B, v16.8H, #6
        shrn            v17.8B, v17.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.8B}, [x8], x2
        ld1             {v21.8B}, [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B  // rounding average with old dst
        urhadd          v17.8B, v17.8B, v21.8B
  .endif
        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        b.gt            1b
        ret

        // D == 0: at most one of B (horizontal) / C (vertical) is nonzero.
2:      adds            w12, w12, w6            // w12 = B + C
        dup             v0.8B, w4
        b.eq            5f                      // B == C == 0: no filtering at all
        tst             w6,  w6
        dup             v1.8B, w12
        b.eq            4f                      // C == 0: horizontal-only filter

        // Vertical-only filter (x == 0): out = (A*row[i] + C*row[i+1]) >> 6
        ld1             {v4.8B}, [x1], x2
3:      ld1             {v6.8B}, [x1], x2
        umull           v16.8H, v4.8B,  v0.8B
        umlal           v16.8H, v6.8B,  v1.8B
        ld1             {v4.8B}, [x1], x2
        umull           v17.8H, v6.8B,  v0.8B
        umlal           v17.8H, v4.8B,  v1.8B
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
        rshrn           v16.8B, v16.8H, #6
        rshrn           v17.8B, v17.8H, #6
  .else
        add             v16.8H, v16.8H, v22.8H
        add             v17.8H, v17.8H, v22.8H
        shrn            v16.8B, v16.8H, #6
        shrn            v17.8B, v17.8H, #6
  .endif
        prfm            pldl1strm, [x1, x2]
  .ifc \type,avg
        ld1             {v20.8B}, [x8], x2
        ld1             {v21.8B}, [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
        urhadd          v17.8B, v17.8B, v21.8B
  .endif
        subs            w3,  w3,  #2
        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        b.gt            3b
        ret

        // Horizontal-only filter (y == 0): out = (A*p[i] + B*p[i+1]) >> 6
4:      ld1             {v4.8B, v5.8B}, [x1], x2
        ld1             {v6.8B, v7.8B}, [x1], x2
        ext             v5.8B,  v4.8B,  v5.8B,  #1
        ext             v7.8B,  v6.8B,  v7.8B,  #1
        prfm            pldl1strm, [x1]
        subs            w3,  w3,  #2
        umull           v16.8H, v4.8B, v0.8B
        umlal           v16.8H, v5.8B, v1.8B
        umull           v17.8H, v6.8B, v0.8B
        umlal           v17.8H, v7.8B, v1.8B
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,h264
        rshrn           v16.8B, v16.8H, #6
        rshrn           v17.8B, v17.8H, #6
  .else
        add             v16.8H, v16.8H, v22.8H
        add             v17.8H, v17.8H, v22.8H
        shrn            v16.8B, v16.8H, #6
        shrn            v17.8B, v17.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.8B}, [x8], x2
        ld1             {v21.8B}, [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
        urhadd          v17.8B, v17.8B, v21.8B
  .endif
        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        b.gt            4b
        ret

        // No filtering (x == y == 0): out = (64*p [+bias]) >> 6, i.e. a copy
        // (plus the rv40/vc1 bias treatment).
5:      ld1             {v4.8B}, [x1], x2
        ld1             {v5.8B}, [x1], x2
        prfm            pldl1strm, [x1]
        subs            w3,  w3,  #2
        umull           v16.8H, v4.8B, v0.8B
        umull           v17.8H, v5.8B, v0.8B
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,h264
        rshrn           v16.8B, v16.8H, #6
        rshrn           v17.8B, v17.8H, #6
  .else
        add             v16.8H, v16.8H, v22.8H
        add             v17.8H, v17.8H, v22.8H
        shrn            v16.8B, v16.8H, #6
        shrn            v17.8B, v17.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.8B}, [x8], x2
        ld1             {v21.8B}, [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
        urhadd          v17.8B, v17.8B, v21.8B
  .endif
        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        b.gt            5b
        ret
endfunc
.endm
195
196/* chroma_mc4(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
/*
 * Emit ff_\type\()_\codec\()_chroma_mc4_neon:
 *   void mc4(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y)
 * 4-pixel-wide bilinear chroma interpolation; h assumed even (two rows per
 * loop iteration).  Same weight derivation and rounding/bias scheme as the
 * mc8 macro, but two 4-pixel rows are packed into the halves of one 8-byte
 * vector, so a single umull/umlal pair covers a whole row-pair; the
 * trn1/trn2 on .2D plus the add recombine the two partial products.
 */
.macro  h264_chroma_mc4 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
  .ifc \type,avg
        mov             x8,  x0                 // x8: read pointer into dst for averaging
  .endif
        prfm            pldl1strm, [x1]
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,rv40
        // rv40: bias from the 4x4 .short table,
        // byte offset = (y>>1)*8 + (x>>1)*2.
        movrel          x6,  rv40bias
        lsr             w9,  w5,  #1
        lsr             w10, w4,  #1
        lsl             w9,  w9,  #3
        lsl             w10, w10, #1
        add             w9,  w9,  w10
        add             x6,  x6,  w9, UXTW
        ld1r            {v22.8H}, [x6]
  .endif
  .ifc \codec,vc1
        movi            v22.8H,   #28           // vc1: constant bias
  .endif
        // w7 = D = x*y, w12 = B = x*(8-y), w6 = C = (8-x)*y,
        // w4 = A = (8-x)*(8-y)   (see mc8 above)
        mul             w7,  w4,  w5
        lsl             w14, w5,  #3
        lsl             w13, w4,  #3
        cmp             w7,  #0
        sub             w6,  w14, w7
        sub             w12, w13, w7
        sub             w4,  w7,  w13
        sub             w4,  w4,  w14
        add             w4,  w4,  #64
        b.eq            2f                      // D == 0: 1-D paths

        // 2-D case.  Pack weights A|B into v0 and C|D into v2 (one per
        // 32-bit half); each source vector holds row and row<<1px in its
        // two halves (trn1 .2S), so v4*v0 + v6*v2 yields both taps.
        dup             v24.8B,  w4
        dup             v25.8B,  w12
        ld1             {v4.8B}, [x1], x2
        dup             v26.8B,  w6
        dup             v27.8B,  w7
        ext             v5.8B,  v4.8B,  v5.8B, #1
        trn1            v0.2S,  v24.2S, v25.2S
        trn1            v2.2S,  v26.2S, v27.2S
        trn1            v4.2S,  v4.2S,  v5.2S
1:      ld1             {v6.8B}, [x1], x2
        ext             v7.8B,  v6.8B,  v7.8B, #1
        trn1            v6.2S,  v6.2S,  v7.2S
        umull           v18.8H, v4.8B,  v0.8B
        umlal           v18.8H, v6.8B,  v2.8B
        ld1             {v4.8B}, [x1], x2
        ext             v5.8B,  v4.8B,  v5.8B, #1
        trn1            v4.2S,  v4.2S,  v5.2S
        prfm            pldl1strm, [x1]
        umull           v19.8H, v6.8B,  v0.8B
        umlal           v19.8H, v4.8B,  v2.8B
        trn1            v30.2D, v18.2D, v19.2D  // gather the A/B halves of both rows
        trn2            v31.2D, v18.2D, v19.2D  // gather the C/D halves of both rows
        add             v18.8H, v30.8H, v31.8H  // combined 4-tap sums, row0|row1
  .ifc \codec,h264
        rshrn           v16.8B, v18.8H, #6
  .else
        add             v18.8H, v18.8H, v22.8H
        shrn            v16.8B, v18.8H, #6
  .endif
        subs            w3,  w3,  #2            // two output rows per iteration
        prfm            pldl1strm, [x1, x2]
  .ifc \type,avg
        ld1             {v20.S}[0], [x8], x2
        ld1             {v20.S}[1], [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B  // rounding average with old dst
  .endif
        st1             {v16.S}[0], [x0], x2
        st1             {v16.S}[1], [x0], x2
        b.gt            1b
        ret

        // D == 0: at most one of B (horizontal) / C (vertical) is nonzero.
2:      adds            w12, w12, w6            // w12 = B + C
        dup             v30.8B, w4
        b.eq            5f                      // B == C == 0: no filtering
        tst             w6,  w6
        dup             v31.8B, w12
        trn1            v0.2S,  v30.2S, v31.2S
        trn2            v1.2S,  v30.2S, v31.2S
        b.eq            4f                      // C == 0: horizontal-only filter

        // Vertical-only filter (x == 0); three consecutive 4-pixel rows are
        // packed across v4's two halves so each umull handles a row pair.
        ext             v1.8B,  v0.8B,  v1.8B, #4
        ld1             {v4.S}[0], [x1], x2
3:      ld1             {v4.S}[1], [x1], x2
        umull           v18.8H, v4.8B,  v0.8B
        ld1             {v4.S}[0], [x1], x2
        umull           v19.8H, v4.8B,  v1.8B
        trn1            v30.2D, v18.2D, v19.2D
        trn2            v31.2D, v18.2D, v19.2D
        add             v18.8H, v30.8H, v31.8H
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
        rshrn           v16.8B, v18.8H, #6
  .else
        add             v18.8H, v18.8H, v22.8H
        shrn            v16.8B, v18.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.S}[0], [x8], x2
        ld1             {v20.S}[1], [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
  .endif
        subs            w3,  w3,  #2
        prfm            pldl1strm, [x1, x2]
        st1             {v16.S}[0], [x0], x2
        st1             {v16.S}[1], [x0], x2
        b.gt            3b
        ret

        // Horizontal-only filter (y == 0): out = (A*p[i] + B*p[i+1]) >> 6
4:      ld1             {v4.8B}, [x1], x2
        ld1             {v6.8B}, [x1], x2
        ext             v5.8B,  v4.8B,  v5.8B, #1
        ext             v7.8B,  v6.8B,  v7.8B, #1
        trn1            v4.2S,  v4.2S,  v5.2S
        trn1            v6.2S,  v6.2S,  v7.2S
        umull           v18.8H, v4.8B,  v0.8B
        umull           v19.8H, v6.8B,  v0.8B
        subs            w3,  w3,  #2
        trn1            v30.2D, v18.2D, v19.2D
        trn2            v31.2D, v18.2D, v19.2D
        add             v18.8H, v30.8H, v31.8H
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
        rshrn           v16.8B, v18.8H, #6
  .else
        add             v18.8H, v18.8H, v22.8H
        shrn            v16.8B, v18.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.S}[0], [x8], x2
        ld1             {v20.S}[1], [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
  .endif
        prfm            pldl1strm, [x1]
        st1             {v16.S}[0], [x0], x2
        st1             {v16.S}[1], [x0], x2
        b.gt            4b
        ret

        // No filtering (x == y == 0): out = (64*p [+bias]) >> 6
5:      ld1             {v4.S}[0], [x1], x2
        ld1             {v4.S}[1], [x1], x2
        umull           v18.8H, v4.8B,  v30.8B
        subs            w3,  w3,  #2
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
        rshrn           v16.8B, v18.8H, #6
  .else
        add             v18.8H, v18.8H, v22.8H
        shrn            v16.8B, v18.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.S}[0], [x8], x2
        ld1             {v20.S}[1], [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
  .endif
        prfm            pldl1strm, [x1]
        st1             {v16.S}[0], [x0], x2
        st1             {v16.S}[1], [x0], x2
        b.gt            5b
        ret
endfunc
.endm
359
/*
 * Emit ff_\type\()_h264_chroma_mc2_neon (H.264 only — no codec parameter):
 *   void mc2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y)
 * 2-pixel-wide bilinear interpolation, two rows per iteration (h even).
 * Both output rows are computed in one umull/umlal pair: the weights are
 * interleaved per 16-bit lane (trn1 .4H) and the second row is mirrored in
 * via rev64, so the rev64/add on the accumulator folds the cross terms.
 */
.macro  h264_chroma_mc2 type
function ff_\type\()_h264_chroma_mc2_neon, export=1
        prfm            pldl1strm, [x1]
        prfm            pldl1strm, [x1, x2]
        orr             w7,  w4,  w5
        cbz             w7,  2f                 // x == y == 0: plain 2-byte copy

        // w7 = D, w12 = B, w6 = C, w4 = A (same derivation as mc8/mc4)
        mul             w7,  w4,  w5
        lsl             w14, w5,  #3
        lsl             w13, w4,  #3
        sub             w6,  w14, w7
        sub             w12, w13, w7
        sub             w4,  w7,  w13
        sub             w4,  w4,  w14
        add             w4,  w4,  #64
        dup             v0.8B,  w4
        dup             v2.8B,  w12
        dup             v1.8B,  w6
        dup             v3.8B,  w7
        trn1            v0.4H,  v0.4H,  v2.4H   // v0 = A|B interleaved per halfword
        trn1            v1.4H,  v1.4H,  v3.4H   // v1 = C|D interleaved per halfword
1:
        ld1             {v4.S}[0],  [x1], x2    // rows i and i+1 in v4's halves
        ld1             {v4.S}[1],  [x1], x2
        rev64           v5.2S,  v4.2S           // swap halves: row i+1 | row i
        ld1             {v5.S}[1],  [x1]        // overwrite with row i+2 (not consumed)
        ext             v6.8B,  v4.8B,  v5.8B,  #1
        ext             v7.8B,  v5.8B,  v4.8B,  #1
        trn1            v4.4H,  v4.4H,  v6.4H   // pixel pairs aligned with A|B lanes
        trn1            v5.4H,  v5.4H,  v7.4H   // pixel pairs aligned with C|D lanes
        umull           v16.8H, v4.8B,  v0.8B
        umlal           v16.8H, v5.8B,  v1.8B
  .ifc \type,avg
        ld1             {v18.H}[0], [x0], x2    // old dst pixels for averaging
        ld1             {v18.H}[2], [x0]
        sub             x0,  x0,  x2            // rewind dst to row i
  .endif
        rev64           v17.4S, v16.4S
        add             v16.8H, v16.8H, v17.8H  // fold partial sums of each 2x2 tap set
        rshrn           v16.8B, v16.8H, #6      // round-to-nearest, >>6
  .ifc \type,avg
        urhadd          v16.8B, v16.8B, v18.8B
  .endif
        st1             {v16.H}[0], [x0], x2
        st1             {v16.H}[2], [x0], x2
        subs            w3,  w3,  #2
        b.gt            1b
        ret

        // x == y == 0: weight is 64, so just copy 2 bytes per row
        // (averaging with dst for \type == avg).
2:
        ld1             {v16.H}[0], [x1], x2
        ld1             {v16.H}[1], [x1], x2
  .ifc \type,avg
        ld1             {v18.H}[0], [x0], x2
        ld1             {v18.H}[1], [x0]
        sub             x0,  x0,  x2
        urhadd          v16.8B, v16.8B, v18.8B
  .endif
        st1             {v16.H}[0], [x0], x2
        st1             {v16.H}[1], [x0], x2
        subs            w3,  w3,  #2
        b.gt            2b
        ret
endfunc
.endm
425
        // Baseline H.264 variants (round-to-nearest >>6, no bias).
        h264_chroma_mc8 put
        h264_chroma_mc8 avg
        h264_chroma_mc4 put
        h264_chroma_mc4 avg
        h264_chroma_mc2 put
        h264_chroma_mc2 avg
432
#if CONFIG_RV40_DECODER
        // 4x4 table of 16-bit bias values, indexed by the macro code above
        // as byte offset (y>>1)*8 + (x>>1)*2, i.e. row = y>>1, col = x>>1.
const   rv40bias
        .short           0, 16, 32, 16
        .short          32, 28, 32, 28
        .short           0, 32, 16, 32
        .short          32, 28, 32, 28
endconst

        // RV40 variants: add table bias, then truncating >>6 (no mc2 form).
        h264_chroma_mc8 put, rv40
        h264_chroma_mc8 avg, rv40
        h264_chroma_mc4 put, rv40
        h264_chroma_mc4 avg, rv40
#endif
446
447#if CONFIG_VC1DSP
448        h264_chroma_mc8 put, vc1
449        h264_chroma_mc8 avg, vc1
450        h264_chroma_mc4 put, vc1
451        h264_chroma_mc4 avg, vc1
452#endif
453