/* -*-arm64-*-
 * vim: syntax=arm64asm
 *
 * AArch64 NEON optimised SAO functions for HEVC decoding
 *
 * Copyright (c) 2022 J. Dekker <jdek@itanimul.li>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

// NOTE(review): MAX_PB_SIZE and AV_INPUT_BUFFER_PADDING_SIZE are presumably
// copies of the C-side constants of the same name - confirm they stay in sync.
#define MAX_PB_SIZE 64
#define AV_INPUT_BUFFER_PADDING_SIZE 64
// Row pitch, in bytes, that the edge filters in this file assume for their
// source buffer (2*64 + 64 = 192). It is used both to step src from row to
// row and to build the vertical/diagonal neighbour offsets.
#define SAO_STRIDE (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE)

// void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, uint8_t *_src,
//                      ptrdiff_t stride_dst, ptrdiff_t stride_src,
//                      int16_t *sao_offset_val, int sao_left_class,
//                      int width, int height)
//
// SAO band filter for 8-bit pixels:
//     dst[x] = clip_u8(src[x] + offset_table[src[x] >> 3])
// offset_table holds 32 int16 entries, all zero except
//     offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1], k = 0..3
//
// In:    x0 = dst, x1 = src, x2 = stride_dst, x3 = stride_src,
//        x4 = sao_offset_val, w5 = sao_left_class, w6 = width, w7 = height
// Notes: width is rounded up to a multiple of 8, so up to 7 bytes beyond
//        `width` are read from src and written to dst on every row.
//        Both loops test their counter after the body: width and height
//        must be non-zero.
// Stack: 64 bytes of scratch for the table, released before the pixel loop.
// Clobb: x0-x3, x6-x10, v0-v4, v16-v20, flags
function ff_hevc_sao_band_filter_8x8_8_neon, export=1
        sub             sp,  sp, #64               // room for 32 x int16 offset_table
        stp             xzr, xzr, [sp]             // offset_table[0..31] = 0
        stp             xzr, xzr, [sp, #16]
        stp             xzr, xzr, [sp, #32]
        stp             xzr, xzr, [sp, #48]
        mov             w8,  #4                    // w8 = k + 1, runs 4..1
0:      ldrsh           x9, [x4,  x8, lsl #1]      // sao_offset_val[k+1]
        subs            w8,  w8,  #1               // w8 = k; flags are consumed by b.ne below
        add             w10, w8,  w5               // k + sao_left_class
        and             w10, w10, #0x1F            // wrap around the 32 bands
        strh            w9, [sp, x10, lsl #1]      // offset_table[(k + sao_left_class) & 31]
        b.ne            0b
        add             w6,  w6,  #7
        bic             w6,  w6,  #7               // width rounded up to a multiple of 8
        ld1             {v16.16b-v19.16b}, [sp], #64 // table -> v16-v19 (64 byte LUT), release stack
        sub             x2,  x2,  x6               // stride_dst - width: end of row -> next row
        sub             x3,  x3,  x6               // stride_src - width
        movi            v20.8h,   #1
1:      mov             w8,  w6                    // beginning of line
2:      // Simple layout for accessing 16bit values
        // with 8bit LUT.
        //
        //   00  01  02  03  04  05  06  07
        // +----------------------------------->
        // |xDE#xAD|xCA#xFE|xBE#xEF|xFE#xED|....
        // +----------------------------------->
        //    i-0     i-1     i-2     i-3
        //
        // Each int16 table entry i lives in LUT bytes 2*i and 2*i + 1, so the
        // index vector needs the byte pair (2*i, 2*i + 1) in every 16 bit lane.
        // dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
        ld1             {v2.8b}, [x1], #8          // load src[x], 8 pixels
        subs            w8, w8,  #8                // flags are consumed by b.ne below
        uxtl            v0.8h,  v2.8b              // widen src[x] to 16 bit
        ushr            v2.8h,  v0.8h, #3          // >> BIT_DEPTH - 3
        shl             v1.8h,  v2.8h, #1          // low (x2, accessing short)
        add             v3.8h,  v1.8h, v20.8h      // +1 access upper short
        sli             v1.8h,  v3.8h, #8          // shift insert index to upper byte
        // every index is <= 63, i.e. inside the 64 byte table, so tbx never
        // keeps the previous contents of v2
        tbx             v2.16b, {v16.16b-v19.16b}, v1.16b // table
        add             v1.8h,  v0.8h, v2.8h       // src[x] + table
        sqxtun          v4.8b,  v1.8h              // clip + narrow
        st1             {v4.8b}, [x0], #8          // store
        // done 8 pixels
        b.ne            2b
        subs            w7, w7,  #1                // finished line, prep. new
        add             x0, x0,  x2                // dst += stride_dst
        add             x1, x1,  x3                // src += stride_src
        b.ne            1b
        ret
endfunc

// Byte offset from the current pixel to its "next" neighbour for each edge
// class; the "previous" neighbour sits at the negated offset. The edge
// filters index this table with their `eo` argument (4 entries: 0..3).
// It is addressed with `adr`, so it has to stay in the same section as,
// and close to, the functions that use it.
.Lsao_edge_pos:
.word 1 // horizontal
.word SAO_STRIDE // vertical
.word SAO_STRIDE + 1 // 45 degree
.word SAO_STRIDE - 1 // 135 degree

// ff_hevc_sao_edge_filter_16x16_8_neon(char *dst, char *src, ptrdiff stride_dst,
//                                      int16 *sao_offset_val, int eo, int width, int height)
//
// SAO edge filter for 8-bit pixels, 16 pixels per step. For every pixel:
//     diff   = CMP(cur, prev) + CMP(cur, next)              (each CMP is -1, 0 or 1)
//     dst[x] = clip_u8(src[x] + table[diff + 2])
// where table = sao_offset_val reordered by the index list [1,2,0,3,4] and
// prev/next are the pixels at src[x -/+ sao_edge_pos[eo]].
//
// In:    x0 = dst, x1 = src, x2 = stride_dst, x3 = sao_offset_val,
//        w4 = eo (0..3, index into .Lsao_edge_pos), w5 = width, w6 = height
// Notes: src rows are SAO_STRIDE bytes apart (there is no stride_src argument).
//        width is rounded up to a multiple of 16, so up to 15 bytes beyond
//        `width` are read and written on every row.
//        16 bytes are loaded from sao_offset_val although only entries 0..4
//        are used.
//        Both loops test their counter after the body: width and height
//        must be non-zero.
// Clobb: x0, x4-x7, x11-x16, v0-v5, v16-v21, flags
function ff_hevc_sao_edge_filter_16x16_8_neon, export=1
        adr             x7, .Lsao_edge_pos
        ld1             {v3.8h}, [x3]              // load sao_offset_val
        add             w5,  w5,  #0xF
        bic             w5,  w5,  #0xF             // width rounded up to a multiple of 16
        ldr             w4, [x7, w4, uxtw #2]      // sao_edge_pos[eo]: offset to the neighbours
        mov             v3.h[7], v3.h[0]           // reorder to [1,2,0,3,4] (lane 7 is scratch)
        mov             v3.h[0], v3.h[1]
        mov             v3.h[1], v3.h[2]
        mov             v3.h[2], v3.h[7]
        // split 16bit values into two tables
        uzp2            v1.16b, v3.16b, v3.16b     // sao_offset_val -> upper
        uzp1            v0.16b, v3.16b, v3.16b     // sao_offset_val -> lower
        movi            v2.16b, #2
        mov             x15, #SAO_STRIDE
        // strides between end of line and next src/dst
        sub             x15, x15, x5               // stride_src - width
        sub             x16, x2, x5                // stride_dst - width
        mov             x11, x1                    // copy base src
1:      // new line
        mov             x14, x5                    // copy width
        sub             x12, x11, x4               // src_a (prev) = src - sao_edge_pos
        add             x13, x11, x4               // src_b (next) = src + sao_edge_pos
2:      // process 16 bytes
        ld1             {v3.16b}, [x11], #16       // load src
        ld1             {v4.16b}, [x12], #16       // load src_a (prev)
        ld1             {v5.16b}, [x13], #16       // load src_b (next)
        subs            x14, x14, #16              // flags are consumed by b.ne 2b below
        // cmhi yields 0xFF (-1) for true, so "mask(a > b) - mask(b > a)" is CMP(b, a)
        cmhi            v16.16b, v4.16b, v3.16b    // (prev > cur)
        cmhi            v17.16b, v3.16b, v4.16b    // (cur > prev)
        cmhi            v18.16b, v5.16b, v3.16b    // (next > cur)
        cmhi            v19.16b, v3.16b, v5.16b    // (cur > next)
        sub             v20.16b, v16.16b, v17.16b  // diff0 = CMP(cur, prev) = (cur > prev) - (cur < prev)
        sub             v21.16b, v18.16b, v19.16b  // diff1 = CMP(cur, next) = (cur > next) - (cur < next)
        add             v20.16b, v20.16b, v21.16b  // diff = diff0 + diff1
        add             v20.16b, v20.16b, v2.16b   // offset_val = diff + 2
        tbl             v16.16b, {v0.16b}, v20.16b // low byte of the offset, per pixel
        tbl             v17.16b, {v1.16b}, v20.16b // high byte of the offset, per pixel
        uxtl            v20.8h, v3.8b              // src[0:7]
        uxtl2           v21.8h, v3.16b             // src[8:15]
        // re-interleave low/high bytes into int16 offsets
        zip1            v18.16b, v16.16b, v17.16b  // offsets for src[0:7]
        zip2            v19.16b, v16.16b, v17.16b  // offsets for src[8:15]
        sqadd           v20.8h, v18.8h, v20.8h     // + sao_offset_val
        sqadd           v21.8h, v19.8h, v21.8h
        sqxtun          v3.8b, v20.8h              // clip to 0..255 + narrow
        sqxtun2         v3.16b, v21.8h
        st1             {v3.16b}, [x0], #16
        // filtered 16 bytes
        b.ne            2b                         // do we have width to filter?
        // no width to filter, setup next line
        subs            w6, w6, #1                 // filtered line
        add             x11, x11, x15              // stride src to next line
        add             x0, x0, x16                // stride dst to next line
        b.ne            1b                         // do we have lines to process?
        // no lines to filter
        ret
endfunc

// ff_hevc_sao_edge_filter_8x8_8_neon(char *dst, char *src, ptrdiff stride_dst,
//                                    int16 *sao_offset_val, int eo, int width, int height)
//
// SAO edge filter for 8-bit pixels, 8 pixels wide, two rows per iteration.
// For every pixel:
//     diff   = CMP(cur, prev) + CMP(cur, next)              (each CMP is -1, 0 or 1)
//     dst[x] = clip_u8(src[x] + table[diff + 2])
// where table = sao_offset_val reordered by the index list [1,2,0,3,4] and
// prev/next are the pixels at src[x -/+ sao_edge_pos[eo]].
//
// In:    x0 = dst, x1 = src, x2 = stride_dst, x3 = sao_offset_val,
//        w4 = eo (0..3, index into .Lsao_edge_pos), w5 = width (not read:
//        exactly 8 pixels are processed per row), w6 = height
// Notes: src rows are SAO_STRIDE bytes apart (there is no stride_src argument).
//        16 bytes are loaded from sao_offset_val although only entries 0..4
//        are used.
//        height is decremented by 2 and the loop only exits on exactly zero:
//        height must be even and non-zero.
// Clobb: x0, x2, x4, x6-x10, x15, x16, v0-v7, v16-v21, flags
function ff_hevc_sao_edge_filter_8x8_8_neon, export=1
        adr             x7, .Lsao_edge_pos
        ldr             w4, [x7, w4, uxtw #2]      // sao_edge_pos[eo]: offset to the neighbours
        ld1             {v3.8h}, [x3]              // load sao_offset_val
        mov             v3.h[7], v3.h[0]           // reorder to [1,2,0,3,4] (lane 7 is scratch)
        mov             v3.h[0], v3.h[1]
        mov             v3.h[1], v3.h[2]
        mov             v3.h[2], v3.h[7]
        // split 16bit values into two tables
        uzp2            v1.16b, v3.16b, v3.16b     // sao_offset_val -> upper
        uzp1            v0.16b, v3.16b, v3.16b     // sao_offset_val -> lower
        movi            v2.16b, #2
        add             x16, x0, x2                // dst pointer for the odd rows
        lsl             x2,  x2, #1                // each dst pointer advances two rows per store
        mov             x15, #SAO_STRIDE           // src row pitch
        mov             x8,  x1                    // src
        sub             x9,  x1, x4                // src_a (prev) = src - sao_edge_pos
        add             x10, x1, x4                // src_b (next) = src + sao_edge_pos
1:      // two rows per pass: row y in the low 8 bytes, row y + 1 in the high 8 bytes
        ld1             {v3.d}[0], [ x8], x15      // src,  row y
        ld1             {v4.d}[0], [ x9], x15      // prev, row y
        ld1             {v5.d}[0], [x10], x15      // next, row y
        ld1             {v3.d}[1], [ x8], x15      // src,  row y + 1
        ld1             {v4.d}[1], [ x9], x15      // prev, row y + 1
        ld1             {v5.d}[1], [x10], x15      // next, row y + 1
        subs            w6, w6, #2                 // two rows done; flags are consumed by b.ne below
        // cmhi yields 0xFF (-1) for true, so "mask(a > b) - mask(b > a)" is CMP(b, a)
        cmhi            v16.16b, v4.16b, v3.16b    // (prev > cur)
        cmhi            v17.16b, v3.16b, v4.16b    // (cur > prev)
        cmhi            v18.16b, v5.16b, v3.16b    // (next > cur)
        cmhi            v19.16b, v3.16b, v5.16b    // (cur > next)
        sub             v20.16b, v16.16b, v17.16b  // diff0 = CMP(cur, prev)
        sub             v21.16b, v18.16b, v19.16b  // diff1 = CMP(cur, next)
        add             v20.16b, v20.16b, v21.16b  // diff = diff0 + diff1
        add             v20.16b, v20.16b, v2.16b   // offset_val = diff + 2
        tbl             v16.16b, {v0.16b}, v20.16b // low byte of the offset, per pixel
        tbl             v17.16b, {v1.16b}, v20.16b // high byte of the offset, per pixel
        uxtl            v20.8h, v3.8b              // src, row y
        uxtl2           v21.8h, v3.16b             // src, row y + 1
        // re-interleave low/high bytes into int16 offsets
        zip1            v18.16b, v16.16b, v17.16b  // offsets, row y
        zip2            v19.16b, v16.16b, v17.16b  // offsets, row y + 1
        sqadd           v20.8h, v18.8h, v20.8h     // + sao_offset_val
        sqadd           v21.8h, v19.8h, v21.8h
        sqxtun          v6.8b, v20.8h              // clip to 0..255 + narrow
        sqxtun          v7.8b, v21.8h
        st1             {v6.8b}, [ x0], x2         // store row y
        st1             {v7.8b}, [x16], x2         // store row y + 1
        b.ne            1b
        ret
endfunc