/*
 * IDCT AArch64 NEON optimisations
 *
 * Copyright (c) 2022 Ben Avison <bavison@riscosopen.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
22cabdff1aSopenharmony_ci
23cabdff1aSopenharmony_ci#include "libavutil/aarch64/asm.S"
24cabdff1aSopenharmony_ci
// Store an 8x8 block of signed 16-bit coefficients as unsigned 8-bit values,
// saturating each coefficient to the range 0..255.
// On entry:
//   x0 -> array of 64x 16-bit coefficients
//   x1 -> 8-bit results
//   x2 = row stride for results, bytes
// On exit:
//   x0 has been advanced by 64 bytes, x1 by 7 rows
//   v0-v7 and v16-v23 are overwritten
function ff_put_pixels_clamped_neon, export=1
        // Fetch all 128 bytes of coefficients: one row of 8 per register.
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], #64
        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x0]
        // Signed 16-bit -> unsigned 8-bit with saturation, rows 0-3.
        sqxtun          v0.8b, v16.8h
        sqxtun          v1.8b, v17.8h
        sqxtun          v2.8b, v18.8h
        sqxtun          v3.8b, v19.8h
        // Write rows 0-3 while rows 4-7 are still being narrowed.
        st1             {v0.8b}, [x1], x2
        sqxtun          v4.8b, v20.8h
        st1             {v1.8b}, [x1], x2
        sqxtun          v5.8b, v21.8h
        st1             {v2.8b}, [x1], x2
        sqxtun          v6.8b, v22.8h
        st1             {v3.8b}, [x1], x2
        sqxtun          v7.8b, v23.8h
        // Rows 4-7; the final store leaves x1 pointing at the last row.
        st1             {v4.8b}, [x1], x2
        st1             {v5.8b}, [x1], x2
        st1             {v6.8b}, [x1], x2
        st1             {v7.8b}, [x1]
        ret
endfunc
51cabdff1aSopenharmony_ci
// Store an 8x8 block of signed 16-bit coefficients as 8-bit values:
// each coefficient is saturated to -128..127 and then biased by +128,
// so the bytes written lie in 0..255.
// On entry:
//   x0 -> array of 64x 16-bit coefficients
//   x1 -> 8-bit results
//   x2 = row stride for results, bytes
// On exit:
//   x0 has been advanced by 64 bytes, x1 by 7 rows
//   v0-v7 and v16-v24 are overwritten
function ff_put_signed_pixels_clamped_neon, export=1
        movi            v24.8b, #128                    // per-byte bias
        // Fetch all 128 bytes of coefficients: one row of 8 per register.
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], #64
        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x0]
        // Saturating narrow to signed 8-bit, rows 0-3.
        sqxtn           v0.8b, v16.8h
        sqxtn           v1.8b, v17.8h
        sqxtn           v2.8b, v18.8h
        sqxtn           v3.8b, v19.8h
        // Narrow rows 4-7 while applying the bias to rows 0-3.  The add is
        // modular in 8 bits, mapping -128..127 onto 0..255.
        sqxtn           v4.8b, v20.8h
        add             v0.8b, v0.8b, v24.8b
        sqxtn           v5.8b, v21.8h
        add             v1.8b, v1.8b, v24.8b
        sqxtn           v6.8b, v22.8h
        add             v2.8b, v2.8b, v24.8b
        sqxtn           v7.8b, v23.8h
        add             v3.8b, v3.8b, v24.8b
        // Write rows 0-3 while applying the bias to rows 4-7.
        st1             {v0.8b}, [x1], x2
        add             v4.8b, v4.8b, v24.8b
        st1             {v1.8b}, [x1], x2
        add             v5.8b, v5.8b, v24.8b
        st1             {v2.8b}, [x1], x2
        add             v6.8b, v6.8b, v24.8b
        st1             {v3.8b}, [x1], x2
        add             v7.8b, v7.8b, v24.8b
        // Rows 4-7; the final store leaves x1 pointing at the last row.
        st1             {v4.8b}, [x1], x2
        st1             {v5.8b}, [x1], x2
        st1             {v6.8b}, [x1], x2
        st1             {v7.8b}, [x1]
        ret
endfunc
87cabdff1aSopenharmony_ci
// Add an 8x8 block of signed 16-bit coefficients to an 8x8 block of unsigned
// 8-bit pixels in place, clamping each sum to the range 0..255.
// On entry:
//   x0 -> array of 64x 16-bit coefficients
//   x1 -> 8-bit input and results
//   x2 = row stride for 8-bit input and results, bytes
// On exit:
//   x0 has been advanced by 64 bytes, x1 and x3 by 7 rows
//   v0-v7 and v16-v23 are overwritten
//
// The pixels are widened to 16 bits and added with a *saturating* signed add.
// A plain widening add (uaddw) wraps modulo 2^16, so a coefficient close to
// +32767 plus a non-zero pixel would turn negative and then be clamped to 0
// rather than 255.  With sqadd the intermediate sum sticks at +32767 and the
// final sqxtun yields 255.  Since pixels are never negative, the sum cannot
// go below -32768, so the lower bound needs no special handling.
function ff_add_pixels_clamped_neon, export=1
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64    // coefficient rows 0-3
        mov             x3, x1                          // x1 walks the loads, x3 walks the stores
        ld1             {v4.8b}, [x1], x2               // pixel rows 0-3
        ld1             {v5.8b}, [x1], x2
        ld1             {v6.8b}, [x1], x2
        ld1             {v7.8b}, [x1], x2
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0]     // coefficient rows 4-7
        // Zero-extend pixel rows 0-3 to 16 bits while loading pixel rows 4-7.
        // Every load is issued before the first store below.
        uxtl            v4.8h, v4.8b
        ld1             {v20.8b}, [x1], x2
        uxtl            v5.8h, v5.8b
        ld1             {v21.8b}, [x1], x2
        uxtl            v6.8h, v6.8b
        ld1             {v22.8b}, [x1], x2
        uxtl            v7.8h, v7.8b
        ld1             {v23.8b}, [x1]
        // Rows 0-3: saturating add in 16 bits.
        sqadd           v0.8h, v0.8h, v4.8h
        sqadd           v1.8h, v1.8h, v5.8h
        sqadd           v2.8h, v2.8h, v6.8h
        sqadd           v3.8h, v3.8h, v7.8h
        // Zero-extend pixel rows 4-7.
        uxtl            v20.8h, v20.8b
        uxtl            v21.8h, v21.8b
        uxtl            v22.8h, v22.8b
        uxtl            v23.8h, v23.8b
        // Rows 0-3: clamp signed 16-bit sums to unsigned 8-bit.
        sqxtun          v0.8b, v0.8h
        sqxtun          v1.8b, v1.8h
        sqxtun          v2.8b, v2.8h
        sqxtun          v3.8b, v3.8h
        // Write rows 0-3 while summing rows 4-7.
        sqadd           v16.8h, v16.8h, v20.8h
        st1             {v0.8b}, [x3], x2
        sqadd           v17.8h, v17.8h, v21.8h
        st1             {v1.8b}, [x3], x2
        sqadd           v18.8h, v18.8h, v22.8h
        st1             {v2.8b}, [x3], x2
        sqadd           v19.8h, v19.8h, v23.8h
        st1             {v3.8b}, [x3], x2
        // Rows 4-7: clamp and write.
        sqxtun          v16.8b, v16.8h
        sqxtun          v17.8b, v17.8h
        sqxtun          v18.8b, v18.8h
        sqxtun          v19.8b, v19.8h
        st1             {v16.8b}, [x3], x2
        st1             {v17.8b}, [x3], x2
        st1             {v18.8b}, [x3], x2
        st1             {v19.8b}, [x3]
        ret
endfunc
131