1/*
2 * IDCT AArch64 NEON optimisations
3 *
4 * Copyright (c) 2022 Ben Avison <bavison@riscosopen.org>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 */
22
23#include "libavutil/aarch64/asm.S"
24
// Store an 8x8 block of signed 16-bit coefficients as unsigned 8-bit pixels,
// saturating each value to the range 0..255.
// Arguments:
//   x0 -> 64 contiguous 16-bit coefficients (8 rows of 8)
//   x1 -> first output row; 8 bytes are written per row
//   x2 =  distance between output rows, in bytes
// Modifies: x0, x1, v0-v7
function ff_put_pixels_clamped_neon, export=1
        // Read the whole block before writing anything:
        // rows 0-3 land in v4-v7, rows 4-7 in v0-v3.
        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], #64
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0]
        // Narrow each row in place (signed 16-bit -> unsigned 8-bit with
        // saturation), keeping the narrowing one row ahead of the stores.
        sqxtun          v4.8b, v4.8h
        sqxtun          v5.8b, v5.8h
        st1             {v4.8b}, [x1], x2           // row 0
        sqxtun          v6.8b, v6.8h
        st1             {v5.8b}, [x1], x2           // row 1
        sqxtun          v7.8b, v7.8h
        st1             {v6.8b}, [x1], x2           // row 2
        sqxtun          v0.8b, v0.8h
        st1             {v7.8b}, [x1], x2           // row 3
        sqxtun          v1.8b, v1.8h
        st1             {v0.8b}, [x1], x2           // row 4
        sqxtun          v2.8b, v2.8h
        st1             {v1.8b}, [x1], x2           // row 5
        sqxtun          v3.8b, v3.8h
        st1             {v2.8b}, [x1], x2           // row 6
        st1             {v3.8b}, [x1]               // row 7, no pointer update
        ret
endfunc
51
// Store an 8x8 block of signed 16-bit coefficients as 8-bit pixels:
// each value is saturated to the signed 8-bit range and then has 128
// added to it (modulo 256) before being written out.
// Arguments:
//   x0 -> 64 contiguous 16-bit coefficients (8 rows of 8)
//   x1 -> first output row; 8 bytes are written per row
//   x2 =  distance between output rows, in bytes
// Modifies: x0, x1, v0-v7, v16
function ff_put_signed_pixels_clamped_neon, export=1
        // Read the whole block before writing anything:
        // rows 0-3 land in v0-v3, rows 4-7 in v4-v7.
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x0]
        movi            v16.8b, #128                // per-byte bias
        // Every row is narrowed in place with signed saturation, then
        // biased, then stored; the three steps of neighbouring rows are
        // interleaved.
        sqxtn           v0.8b, v0.8h
        sqxtn           v1.8b, v1.8h
        sqxtn           v2.8b, v2.8h
        sqxtn           v3.8b, v3.8h
        add             v0.8b, v0.8b, v16.8b
        sqxtn           v4.8b, v4.8h
        add             v1.8b, v1.8b, v16.8b
        sqxtn           v5.8b, v5.8h
        st1             {v0.8b}, [x1], x2           // row 0
        add             v2.8b, v2.8b, v16.8b
        sqxtn           v6.8b, v6.8h
        st1             {v1.8b}, [x1], x2           // row 1
        add             v3.8b, v3.8b, v16.8b
        sqxtn           v7.8b, v7.8h
        st1             {v2.8b}, [x1], x2           // row 2
        add             v4.8b, v4.8b, v16.8b
        st1             {v3.8b}, [x1], x2           // row 3
        add             v5.8b, v5.8b, v16.8b
        st1             {v4.8b}, [x1], x2           // row 4
        add             v6.8b, v6.8b, v16.8b
        st1             {v5.8b}, [x1], x2           // row 5
        add             v7.8b, v7.8b, v16.8b
        st1             {v6.8b}, [x1], x2           // row 6
        st1             {v7.8b}, [x1]               // row 7, no pointer update
        ret
endfunc
87
// Add an 8x8 block of signed 16-bit coefficients to an 8x8 block of
// unsigned 8-bit pixels in place, saturating each sum to 0..255.
// Arguments:
//   x0 -> 64 contiguous 16-bit coefficients (8 rows of 8)
//   x1 -> first pixel row; 8 bytes per row are read and then overwritten
//   x2 =  distance between pixel rows, in bytes
// Modifies: x0, x1, x3, v0-v7, v16-v19
function ff_add_pixels_clamped_neon, export=1
        mov             x3, x1                      // x3 walks the reads, x1 the writes
        // Coefficient rows 0-3 -> v16-v19, existing pixel rows 0-3 -> v0-v3.
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], #64
        ld1             {v0.8b}, [x3], x2
        ld1             {v1.8b}, [x3], x2
        ld1             {v2.8b}, [x3], x2
        ld1             {v3.8b}, [x3], x2
        // Coefficient rows 4-7 -> v4-v7.
        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x0]
        // Widen the pixels (unsigned) and accumulate into the coefficients;
        // v0-v3 are refilled with pixel rows 4-7 as soon as they are free.
        uaddw           v16.8h, v16.8h, v0.8b
        uaddw           v17.8h, v17.8h, v1.8b
        uaddw           v18.8h, v18.8h, v2.8b
        ld1             {v0.8b}, [x3], x2           // pixel row 4
        uaddw           v19.8h, v19.8h, v3.8b
        ld1             {v1.8b}, [x3], x2           // pixel row 5
        sqxtun          v16.8b, v16.8h
        ld1             {v2.8b}, [x3], x2           // pixel row 6
        sqxtun          v17.8b, v17.8h
        ld1             {v3.8b}, [x3]               // pixel row 7, no pointer update
        sqxtun          v18.8b, v18.8h
        sqxtun          v19.8b, v19.8h
        // All eight pixel rows have been read by this point; the stores
        // start here and are interleaved with the work on rows 4-7.
        uaddw           v4.8h, v4.8h, v0.8b
        st1             {v16.8b}, [x1], x2          // row 0
        uaddw           v5.8h, v5.8h, v1.8b
        st1             {v17.8b}, [x1], x2          // row 1
        uaddw           v6.8h, v6.8h, v2.8b
        st1             {v18.8b}, [x1], x2          // row 2
        uaddw           v7.8h, v7.8h, v3.8b
        sqxtun          v4.8b, v4.8h
        sqxtun          v5.8b, v5.8h
        st1             {v19.8b}, [x1], x2          // row 3
        sqxtun          v6.8b, v6.8h
        sqxtun          v7.8b, v7.8h
        st1             {v4.8b}, [x1], x2           // row 4
        st1             {v5.8b}, [x1], x2           // row 5
        st1             {v6.8b}, [x1], x2           // row 6
        st1             {v7.8b}, [x1]               // row 7, no pointer update
        ret
endfunc
131