1 /*
2  * IDCT AArch64 NEON optimisations
3  *
4  * Copyright (c) 2022 Ben Avison <bavison@riscosopen.org>
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 
23 #include "libavutil/aarch64/asm.S"
24 
// Store an 8x8 block of signed 16-bit coefficients as unsigned 8-bit
// values, saturating each one to the range 0..255.
// On entry:
//   x0 -> array of 64x 16-bit coefficients (8 rows of 8)
//   x1 -> 8-bit results
//   x2 = row stride for results, bytes
// Modifies: x0, x1, v0-v7, v16-v23
function ff_put_pixels_clamped_neon, export=1
        // Fetch all 64 coefficients: rows 0-3, then rows 4-7.
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], #64
        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x0]
        // Row k is narrowed from v(16+k) into vk. The narrowing of
        // rows 5-7 is interleaved with the stores of rows 0-2.
        sqxtun          v0.8b, v16.8h
        sqxtun          v1.8b, v17.8h
        sqxtun          v2.8b, v18.8h
        sqxtun          v3.8b, v19.8h
        sqxtun          v4.8b, v20.8h
        st1             {v0.8b}, [x1], x2       // row 0
        sqxtun          v5.8b, v21.8h
        st1             {v1.8b}, [x1], x2       // row 1
        sqxtun          v6.8b, v22.8h
        st1             {v2.8b}, [x1], x2       // row 2
        sqxtun          v7.8b, v23.8h
        st1             {v3.8b}, [x1], x2       // row 3
        st1             {v4.8b}, [x1], x2       // row 4
        st1             {v5.8b}, [x1], x2       // row 5
        st1             {v6.8b}, [x1], x2       // row 6
        st1             {v7.8b}, [x1]           // row 7, no post-increment
        ret
endfunc
51 
// Store an 8x8 block of signed 16-bit coefficients as 8-bit values:
// each coefficient is saturated to -128..127 and then offset by 128,
// so the stored bytes lie in 0..255.
// On entry:
//   x0 -> array of 64x 16-bit coefficients (8 rows of 8)
//   x1 -> 8-bit results
//   x2 = row stride for results, bytes
// Modifies: x0, x1, v0-v7, v16-v23, v31
function ff_put_signed_pixels_clamped_neon, export=1
        // Rows 0-3 into v16-v19, rows 4-7 into v20-v23.
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], #64
        movi            v31.8b, #128            // per-byte bias
        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x0]
        // Row k is narrowed (signed saturation) from v(16+k) into vk and
        // then biased in place; the 8-bit add wraps modulo 256.
        sqxtn           v0.8b, v16.8h
        sqxtn           v1.8b, v17.8h
        sqxtn           v2.8b, v18.8h
        sqxtn           v3.8b, v19.8h
        sqxtn           v4.8b, v20.8h
        add             v0.8b, v0.8b, v31.8b
        sqxtn           v5.8b, v21.8h
        add             v1.8b, v1.8b, v31.8b
        sqxtn           v6.8b, v22.8h
        add             v2.8b, v2.8b, v31.8b
        sqxtn           v7.8b, v23.8h
        add             v3.8b, v3.8b, v31.8b
        st1             {v0.8b}, [x1], x2       // row 0
        add             v4.8b, v4.8b, v31.8b
        st1             {v1.8b}, [x1], x2       // row 1
        add             v5.8b, v5.8b, v31.8b
        st1             {v2.8b}, [x1], x2       // row 2
        add             v6.8b, v6.8b, v31.8b
        st1             {v3.8b}, [x1], x2       // row 3
        add             v7.8b, v7.8b, v31.8b
        st1             {v4.8b}, [x1], x2       // row 4
        st1             {v5.8b}, [x1], x2       // row 5
        st1             {v6.8b}, [x1], x2       // row 6
        st1             {v7.8b}, [x1]           // row 7, no post-increment
        ret
endfunc
87 
// Add an 8x8 block of signed 16-bit coefficients to an 8x8 block of
// unsigned 8-bit pixels in place, saturating each result to 0..255.
// On entry:
//   x0 -> array of 64x 16-bit coefficients (8 rows of 8)
//   x1 -> 8-bit input and results
//   x2 = row stride for 8-bit input and results, bytes
// Modifies: x0, x1, x9, v0-v7, v16-v23
//
// Every pixel row is read before the first row is written back.
function ff_add_pixels_clamped_neon, export=1
        // Coefficient rows 0-3 into v16-v19.
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], #64
        mov             x9, x1                  // x9 = read cursor, x1 = write cursor
        ld1             {v0.8b}, [x9], x2       // pixel row 0
        ld1             {v1.8b}, [x9], x2       // pixel row 1
        ld1             {v2.8b}, [x9], x2       // pixel row 2
        ld1             {v3.8b}, [x9], x2       // pixel row 3
        // Coefficient rows 4-7 into v20-v23.
        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x0]
        // uaddw zero-extends each pixel byte and adds it to the matching
        // 16-bit lane; sqxtun then narrows the sum, treated as signed,
        // to an unsigned byte with saturation.
        uaddw           v16.8h, v16.8h, v0.8b
        uaddw           v17.8h, v17.8h, v1.8b
        uaddw           v18.8h, v18.8h, v2.8b
        ld1             {v4.8b}, [x9], x2       // pixel row 4
        uaddw           v19.8h, v19.8h, v3.8b
        ld1             {v5.8b}, [x9], x2       // pixel row 5
        sqxtun          v0.8b, v16.8h
        ld1             {v6.8b}, [x9], x2       // pixel row 6
        sqxtun          v1.8b, v17.8h
        ld1             {v7.8b}, [x9]           // pixel row 7, no post-increment
        sqxtun          v2.8b, v18.8h
        sqxtun          v3.8b, v19.8h
        uaddw           v20.8h, v20.8h, v4.8b
        st1             {v0.8b}, [x1], x2       // row 0
        uaddw           v21.8h, v21.8h, v5.8b
        st1             {v1.8b}, [x1], x2       // row 1
        uaddw           v22.8h, v22.8h, v6.8b
        st1             {v2.8b}, [x1], x2       // row 2
        uaddw           v23.8h, v23.8h, v7.8b
        sqxtun          v4.8b, v20.8h
        sqxtun          v5.8b, v21.8h
        st1             {v3.8b}, [x1], x2       // row 3
        sqxtun          v6.8b, v22.8h
        sqxtun          v7.8b, v23.8h
        st1             {v4.8b}, [x1], x2       // row 4
        st1             {v5.8b}, [x1], x2       // row 5
        st1             {v6.8b}, [x1], x2       // row 6
        st1             {v7.8b}, [x1]           // row 7, no post-increment
        ret
endfunc
131