1/* -*-arm64-*-
2 * vim: syntax=arm64asm
3 *
4 * AArch64 NEON optimised SAO functions for HEVC decoding
5 *
6 * Copyright (c) 2022 J. Dekker <jdek@itanimul.li>
7 *
8 * This file is part of FFmpeg.
9 *
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
14 *
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18 * Lesser General Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 */
24
25#include "libavutil/aarch64/asm.S"
26
27#define MAX_PB_SIZE 64
28#define AV_INPUT_BUFFER_PADDING_SIZE 64
29#define SAO_STRIDE (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE)
30
// void sao_band_filter(uint8_t *_dst, uint8_t *_src,
//                      ptrdiff_t stride_dst, ptrdiff_t stride_src,
//                      int16_t *sao_offset_val, int sao_left_class,
//                      int width, int height)
//
// Band-offset SAO: each pixel's band is src[x] >> 3 (top five bits at
// 8-bit depth); a 32-entry int16 table holds sao_offset_val[k+1] at
// position (sao_left_class + k) & 31 for k = 0..3 and zero elsewhere.
function ff_hevc_sao_band_filter_8x8_8_neon, export=1
        // Zero a 64-byte (32 x int16) band table on the stack.
        sub             sp,  sp, #64
        stp             xzr, xzr, [sp]
        stp             xzr, xzr, [sp, #16]
        stp             xzr, xzr, [sp, #32]
        stp             xzr, xzr, [sp, #48]
        mov             w8,  #4                    // loop k = 3 down to 0
0:      ldrsh           x9, [x4,  x8, lsl #1]      // sao_offset_val[k+1]
        subs            w8,  w8,  #1               // w8 = k after the decrement
        add             w10, w8,  w5               // k + sao_left_class
        and             w10, w10, #0x1F            // band index wraps mod 32
        strh            w9, [sp, x10, lsl #1]
        bne             0b
        add             w6,  w6,  #7               // round width up to a
        bic             w6,  w6,  #7               // multiple of 8
        ld1             {v16.16b-v19.16b}, [sp], #64 // table -> v16..v19; pops the stack frame
        sub             x2,  x2,  x6               // stride_dst - width
        sub             x3,  x3,  x6               // stride_src - width
        movi            v20.8h,   #1
1:      mov             w8,  w6                    // beginning of line
2:      // Simple layout for accessing 16bit values
        // with 8bit LUT.
        //
        //   00  01  02  03  04  05  06  07
        // +----------------------------------->
        // |xDE#xAD|xCA#xFE|xBE#xEF|xFE#xED|....
        // +----------------------------------->
        //    i-0     i-1     i-2     i-3
        ld1             {v2.8b}, [x1], #8          // dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
        subs            w8, w8,  #8
        uxtl            v0.8h,  v2.8b              // load src[x], widen to 16 bit
        ushr            v2.8h,  v0.8h, #3          // >> BIT_DEPTH - 3, band index 0..31
        shl             v1.8h,  v2.8h, #1          // low (x2, accessing short)
        add             v3.8h,  v1.8h, v20.8h      // +1 access upper short
        sli             v1.8h,  v3.8h, #8          // shift insert index to upper byte
        tbx             v2.16b, {v16.16b-v19.16b}, v1.16b // table; indices are always < 64, so tbx behaves like tbl
        add             v1.8h,  v0.8h, v2.8h       // src[x] + table
        sqxtun          v4.8b,  v1.8h              // clip to [0, 255] + narrow
        st1             {v4.8b}, [x0], #8          // store
        // done 8 pixels
        bne             2b
        subs            w7, w7,  #1                // finished line, prep. new
        add             x0, x0,  x2                // dst += stride_dst
        add             x1, x1,  x3                // src += stride_src
        bne             1b
        ret
endfunc
82
// Byte offset from the current sample to one neighbour for each
// edge-offset class (the opposite neighbour is at the negated offset).
// Indexed by the eo argument of the edge filters below; the source sits
// in the decoder's temporary buffer with a fixed SAO_STRIDE line stride.
.Lsao_edge_pos:
.word 1 // horizontal
.word SAO_STRIDE // vertical
.word SAO_STRIDE + 1 // 45 degree
.word SAO_STRIDE - 1 // 135 degree
88
// ff_hevc_sao_edge_filter_16x16_8_neon(char *dst, char *src, ptrdiff stride_dst,
//                                      int16 *sao_offset_val, int eo, int width, int height)
//
// Edge-offset SAO: each pixel is compared with its two neighbours along
// the eo direction, and sign(cur - prev) + sign(cur - next) + 2 selects
// one of five offsets to add.
function ff_hevc_sao_edge_filter_16x16_8_neon, export=1
        adr             x7, .Lsao_edge_pos
        ld1             {v3.8h}, [x3]              // load sao_offset_val
        add             w5,  w5,  #0xF             // round width up to a
        bic             w5,  w5,  #0xF             // multiple of 16
        ldr             w4, [x7, w4, uxtw #2]      // neighbour offset for this eo class
        mov             v3.h[7], v3.h[0]           // reorder to [1,2,0,3,4]
        mov             v3.h[0], v3.h[1]           // so the table is indexed by diff + 2
        mov             v3.h[1], v3.h[2]
        mov             v3.h[2], v3.h[7]
        // split 16bit values into two tables
        uzp2            v1.16b, v3.16b, v3.16b     // sao_offset_val -> upper
        uzp1            v0.16b, v3.16b, v3.16b     // sao_offset_val -> lower
        movi            v2.16b, #2
        mov             x15, #SAO_STRIDE           // fixed stride of the src buffer
        // strides between end of line and next src/dst
        sub             x15, x15, x5               // stride_src - width
        sub             x16, x2, x5                // stride_dst - width
        mov             x11, x1                    // copy base src
1:      // new line
        mov             x14, x5                    // copy width
        sub             x12, x11, x4               // src_a (prev) = src - sao_edge_pos
        add             x13, x11, x4               // src_b (next) = src + sao_edge_pos
2:      // process 16 bytes
        ld1             {v3.16b}, [x11], #16       // load src
        ld1             {v4.16b}, [x12], #16       // load src_a (prev)
        ld1             {v5.16b}, [x13], #16       // load src_b (next)
        subs            x14, x14, #16
        cmhi            v16.16b, v4.16b, v3.16b    // (prev > cur), lanes are -1 or 0
        cmhi            v17.16b, v3.16b, v4.16b    // (cur > prev)
        cmhi            v18.16b, v5.16b, v3.16b    // (next > cur)
        cmhi            v19.16b, v3.16b, v5.16b    // (cur > next)
        sub             v20.16b, v16.16b, v17.16b  // diff0 = CMP(cur, prev) = (cur > prev) - (cur < prev)
        sub             v21.16b, v18.16b, v19.16b  // diff1 = CMP(cur, next) = (cur > next) - (cur < next)
        add             v20.16b, v20.16b, v21.16b  // diff = diff0 + diff1, in [-2, 2]
        add             v20.16b, v20.16b, v2.16b   // offset_val = diff + 2, table index 0..4
        tbl             v16.16b, {v0.16b}, v20.16b // low bytes of the offsets
        tbl             v17.16b, {v1.16b}, v20.16b // high bytes of the offsets
        uxtl            v20.8h, v3.8b              // src[0:7]
        uxtl2           v21.8h, v3.16b             // src[8:15]
        zip1            v18.16b, v16.16b, v17.16b  // sao_offset_val lower ->
        zip2            v19.16b, v16.16b, v17.16b  // sao_offset_val upper ->
        sqadd           v20.8h, v18.8h, v20.8h     // + sao_offset_val
        sqadd           v21.8h, v19.8h, v21.8h
        sqxtun          v3.8b, v20.8h              // clip to [0, 255] + narrow
        sqxtun2         v3.16b, v21.8h
        st1             {v3.16b}, [x0], #16
        // filtered 16 bytes
        b.ne            2b                         // do we have width to filter?
        // no width to filter, setup next line
        subs            w6, w6, #1                 // filtered line
        add             x11, x11, x15              // stride src to next line
        add             x0, x0, x16                // stride dst to next line
        b.ne            1b                         // do we have lines to process?
        // no lines to filter
        ret
endfunc
148
// ff_hevc_sao_edge_filter_8x8_8_neon(char *dst, char *src, ptrdiff stride_dst,
//                                    int16 *sao_offset_val, int eo, int width, int height)
//
// Same algorithm as the 16x16 variant above, but filters a fixed 8-pixel
// width and two rows per iteration (one row in each 64-bit half of the
// vectors); height is consumed two lines at a time.
function ff_hevc_sao_edge_filter_8x8_8_neon, export=1
        adr             x7, .Lsao_edge_pos
        ldr             w4, [x7, w4, uxtw #2]      // neighbour offset for this eo class
        ld1             {v3.8h}, [x3]              // load sao_offset_val
        mov             v3.h[7], v3.h[0]           // reorder to [1,2,0,3,4]
        mov             v3.h[0], v3.h[1]           // so the table is indexed by diff + 2
        mov             v3.h[1], v3.h[2]
        mov             v3.h[2], v3.h[7]
        uzp2            v1.16b, v3.16b, v3.16b     // high bytes of the offsets
        uzp1            v0.16b, v3.16b, v3.16b     // low bytes of the offsets
        movi            v2.16b, #2
        add             x16, x0, x2                // second dst pointer (odd rows)
        lsl             x2,  x2, #1                // both dst pointers step 2 rows
        mov             x15, #SAO_STRIDE           // fixed stride of the src buffer
        mov             x8,  x1                    // cur
        sub             x9,  x1, x4                // prev = src - sao_edge_pos
        add             x10, x1, x4                // next = src + sao_edge_pos
1:      ld1             {v3.d}[0], [ x8], x15      // load two rows each of cur,
        ld1             {v4.d}[0], [ x9], x15      // prev and next into the low/
        ld1             {v5.d}[0], [x10], x15      // high halves of v3/v4/v5
        ld1             {v3.d}[1], [ x8], x15
        ld1             {v4.d}[1], [ x9], x15
        ld1             {v5.d}[1], [x10], x15
        subs            w6, w6, #2                 // two lines per iteration
        cmhi            v16.16b, v4.16b, v3.16b    // (prev > cur), lanes are -1 or 0
        cmhi            v17.16b, v3.16b, v4.16b    // (cur > prev)
        cmhi            v18.16b, v5.16b, v3.16b    // (next > cur)
        cmhi            v19.16b, v3.16b, v5.16b    // (cur > next)
        sub             v20.16b, v16.16b, v17.16b  // diff0 = sign(cur - prev)
        sub             v21.16b, v18.16b, v19.16b  // diff1 = sign(cur - next)
        add             v20.16b, v20.16b, v21.16b  // diff = diff0 + diff1, in [-2, 2]
        add             v20.16b, v20.16b, v2.16b   // offset_val = diff + 2, table index 0..4
        tbl             v16.16b, {v0.16b}, v20.16b // low bytes of the offsets
        tbl             v17.16b, {v1.16b}, v20.16b // high bytes of the offsets
        uxtl            v20.8h, v3.8b              // first row of src, widened
        uxtl2           v21.8h, v3.16b             // second row of src, widened
        zip1            v18.16b, v16.16b, v17.16b  // 16-bit offsets, first row
        zip2            v19.16b, v16.16b, v17.16b  // 16-bit offsets, second row
        sqadd           v20.8h, v18.8h, v20.8h     // src + sao_offset_val
        sqadd           v21.8h, v19.8h, v21.8h
        sqxtun          v6.8b, v20.8h              // clip to [0, 255] + narrow
        sqxtun          v7.8b, v21.8h
        st1             {v6.8b}, [ x0], x2         // store first row
        st1             {v7.8b}, [x16], x2         // store second row
        b.ne            1b                         // more lines to process?
        ret
endfunc
198