1 /* -*-arm64-*-
2  * vim: syntax=arm64asm
3  *
4  * AArch64 NEON optimised SAO functions for HEVC decoding
5  *
6  * Copyright (c) 2022 J. Dekker <jdek@itanimul.li>
7  *
8  * This file is part of FFmpeg.
9  *
10  * FFmpeg is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public
12  * License as published by the Free Software Foundation; either
13  * version 2.1 of the License, or (at your option) any later version.
14  *
15  * FFmpeg is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with FFmpeg; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23  */
24 
25 #include "libavutil/aarch64/asm.S"
26 
27 #define MAX_PB_SIZE 64
28 #define AV_INPUT_BUFFER_PADDING_SIZE 64
29 #define SAO_STRIDE (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE)
30 
// void sao_band_filter(uint8_t *_dst, uint8_t *_src,
//                      ptrdiff_t stride_dst, ptrdiff_t stride_src,
//                      int16_t *sao_offset_val, int sao_left_class,
//                      int width, int height)
//
// SAO band filter, 8-bit pixels. Builds a 32-entry table of 16-bit offsets
// on the stack (zero everywhere except the 4 bands starting at
// sao_left_class), then computes per pixel:
//     dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift])
// Registers (AAPCS64): x0=dst, x1=src, x2=stride_dst, x3=stride_src,
//                      x4=sao_offset_val, w5=sao_left_class, w6=width,
//                      w7=height
function ff_hevc_sao_band_filter_8x8_8_neon, export=1
        sub             sp,  sp, #64               // 32 x int16 offset table on the stack
        stp             xzr, xzr, [sp]             // zero-init all 32 entries
        stp             xzr, xzr, [sp, #16]
        stp             xzr, xzr, [sp, #32]
        stp             xzr, xzr, [sp, #48]
        mov             w8,  #4                    // only 4 bands carry a non-zero offset
0:      ldrsh           x9, [x4,  x8, lsl #1]      // sao_offset_val[k+1] (entry 0 is the unused "no offset" slot)
        subs            w8,  w8,  #1               // k = 3..0 after the decrement
        add             w10, w8,  w5               // k + sao_left_class
        and             w10, w10, #0x1F            // band index wraps modulo 32
        strh            w9, [sp, x10, lsl #1]      // offset_table[(k + sao_left_class) & 31] = sao_offset_val[k+1]
        bne             0b
        add             w6,  w6,  #7               // round width up to a multiple of 8
        bic             w6,  w6,  #7
        ld1             {v16.16b-v19.16b}, [sp], #64 // whole table into v16-v19; post-increment pops the 64 bytes
        sub             x2,  x2,  x6               // strides minus the width processed per line,
        sub             x3,  x3,  x6               // so the inner loop can post-increment freely
        movi            v20.8h,   #1
1:      mov             w8,  w6                    // beginning of line
2:      // Simple layout for accessing 16bit values
        // with 8bit LUT.
        //
        //   00  01  02  03  04  05  06  07
        // +----------------------------------->
        // |xDE#xAD|xCA#xFE|xBE#xEF|xFE#xED|....
        // +----------------------------------->
        //    i-0     i-1     i-2     i-3
        ld1             {v2.8b}, [x1], #8          // dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
        subs            w8, w8,  #8
        uxtl            v0.8h,  v2.8b              // load src[x], widened to 16-bit
        ushr            v2.8h,  v0.8h, #3          // band index: >> (BIT_DEPTH - 5), i.e. >> 3 for 8-bit
        shl             v1.8h,  v2.8h, #1          // low (x2, accessing short)
        add             v3.8h,  v1.8h, v20.8h      // +1 access upper short
        sli             v1.8h,  v3.8h, #8          // shift insert index to upper byte
        tbx             v2.16b, {v16.16b-v19.16b}, v1.16b // 16-bit table lookup via two 8-bit indices
        add             v1.8h,  v0.8h, v2.8h       // src[x] + table
        sqxtun          v4.8b,  v1.8h              // clip to 0..255 + narrow
        st1             {v4.8b}, [x0], #8          // store
        // done 8 pixels
        bne             2b
        subs            w7, w7,  #1                // finished line, prep. new
        add             x0, x0,  x2                // dst += stride_dst
        add             x1, x1,  x3                // src += stride_src
        bne             1b
        ret
endfunc
82 
// Byte offset from a pixel to its "next" neighbour for each edge-offset
// direction, within the fixed-pitch (SAO_STRIDE) temporary source buffer;
// the "previous" neighbour sits at the negated offset. Indexed by eo (0..3).
.Lsao_edge_pos:
.word 1 // horizontal
.word SAO_STRIDE // vertical
.word SAO_STRIDE + 1 // 45 degree
.word SAO_STRIDE - 1 // 135 degree
88 
// ff_hevc_sao_edge_filter_16x16_8_neon(char *dst, char *src, ptrdiff stride_dst,
//                                      int16 *sao_offset_val, int eo, int width, int height)
//
// SAO edge filter, 8-bit pixels, 16 pixels at a time (width is rounded up to
// a multiple of 16). The source sits in a fixed-pitch (SAO_STRIDE) temporary
// buffer, which is why no stride_src parameter is passed; the neighbour
// offset for the chosen direction (eo) comes from .Lsao_edge_pos.
// Per pixel: diff = sign(cur - prev) + sign(cur - next) in -2..2, and
// dst = av_clip_pixel(cur + sao_offset_val[reordered[diff + 2]]).
// Registers (AAPCS64): x0=dst, x1=src, x2=stride_dst, x3=sao_offset_val,
//                      w4=eo, w5=width, w6=height
function ff_hevc_sao_edge_filter_16x16_8_neon, export=1
        adr             x7, .Lsao_edge_pos
        ld1             {v3.8h}, [x3]              // load sao_offset_val
        add             w5,  w5,  #0xF             // round width up to a multiple of 16
        bic             w5,  w5,  #0xF
        ldr             w4, [x7, w4, uxtw #2]      // neighbour offset for this eo direction
        mov             v3.h[7], v3.h[0]           // reorder to [1,2,0,3,4]: table index is
        mov             v3.h[0], v3.h[1]           // diff+2, so -2/-1 map to offsets [1]/[2],
        mov             v3.h[1], v3.h[2]           // 0 maps to [0] (no change),
        mov             v3.h[2], v3.h[7]           // +1/+2 map to [3]/[4]
        // split 16bit values into two tables
        uzp2            v1.16b, v3.16b, v3.16b     // sao_offset_val -> upper bytes
        uzp1            v0.16b, v3.16b, v3.16b     // sao_offset_val -> lower bytes
        movi            v2.16b, #2                 // bias diff (-2..2) into table range 0..4
        mov             x15, #SAO_STRIDE           // fixed pitch of the temporary src buffer
        // strides between end of line and next src/dst
        sub             x15, x15, x5               // stride_src - width
        sub             x16, x2, x5                // stride_dst - width
        mov             x11, x1                    // copy base src
1:      // new line
        mov             x14, x5                    // copy width
        sub             x12, x11, x4               // src_a (prev) = src - sao_edge_pos
        add             x13, x11, x4               // src_b (next) = src + sao_edge_pos
2:      // process 16 bytes
        ld1             {v3.16b}, [x11], #16       // load src
        ld1             {v4.16b}, [x12], #16       // load src_a (prev)
        ld1             {v5.16b}, [x13], #16       // load src_b (next)
        subs            x14, x14, #16
        cmhi            v16.16b, v4.16b, v3.16b    // (prev > cur)
        cmhi            v17.16b, v3.16b, v4.16b    // (cur > prev)
        cmhi            v18.16b, v5.16b, v3.16b    // (next > cur)
        cmhi            v19.16b, v3.16b, v5.16b    // (cur > next)
        sub             v20.16b, v16.16b, v17.16b  // diff0 = CMP(cur, prev) = (cur > prev) - (cur < prev)
        sub             v21.16b, v18.16b, v19.16b  // diff1 = CMP(cur, next) = (cur > next) - (cur < next)
        add             v20.16b, v20.16b, v21.16b  // diff = diff0 + diff1
        add             v20.16b, v20.16b, v2.16b   // offset_val = diff + 2
        tbl             v16.16b, {v0.16b}, v20.16b // look up offset low bytes
        tbl             v17.16b, {v1.16b}, v20.16b // look up offset high bytes
        uxtl            v20.8h, v3.8b              // src[0:7] widened to 16-bit
        uxtl2           v21.8h, v3.16b             // src[8:15] widened to 16-bit
        zip1            v18.16b, v16.16b, v17.16b  // recombine into 16-bit offsets, lower half
        zip2            v19.16b, v16.16b, v17.16b  // recombine into 16-bit offsets, upper half
        sqadd           v20.8h, v18.8h, v20.8h     // + sao_offset_val, saturating
        sqadd           v21.8h, v19.8h, v21.8h
        sqxtun          v3.8b, v20.8h              // clip to 0..255 + narrow
        sqxtun2         v3.16b, v21.8h
        st1             {v3.16b}, [x0], #16
        // filtered 16 bytes
        b.ne            2b                         // do we have width to filter?
        // no width to filter, setup next line
        subs            w6, w6, #1                 // filtered line
        add             x11, x11, x15              // stride src to next line
        add             x0, x0, x16                // stride dst to next line
        b.ne            1b                         // do we have lines to process?
        // no lines to filter
        ret
endfunc
148 
// ff_hevc_sao_edge_filter_8x8_8_neon(char *dst, char *src, ptrdiff stride_dst,
//                                    int16 *sao_offset_val, int eo, int width, int height)
//
// SAO edge filter, 8-bit pixels, 8-wide blocks: two 8-pixel rows are packed
// into one 16-byte vector per iteration (even row in lane 0, odd row in
// lane 1), so height is consumed 2 lines at a time.
// NOTE(review): width is unused here (block is fixed 8 wide) and height is
// assumed even — confirm against callers.
// The source sits in a fixed-pitch (SAO_STRIDE) temporary buffer; the
// neighbour offset for direction eo comes from .Lsao_edge_pos.
// Registers (AAPCS64): x0=dst, x1=src, x2=stride_dst, x3=sao_offset_val,
//                      w4=eo, w5=width, w6=height
function ff_hevc_sao_edge_filter_8x8_8_neon, export=1
        adr             x7, .Lsao_edge_pos
        ldr             w4, [x7, w4, uxtw #2]      // neighbour offset for this eo direction
        ld1             {v3.8h}, [x3]              // load sao_offset_val
        mov             v3.h[7], v3.h[0]           // reorder to [1,2,0,3,4] so the table
        mov             v3.h[0], v3.h[1]           // can be indexed directly by diff+2
        mov             v3.h[1], v3.h[2]
        mov             v3.h[2], v3.h[7]
        uzp2            v1.16b, v3.16b, v3.16b     // sao_offset_val -> upper bytes
        uzp1            v0.16b, v3.16b, v3.16b     // sao_offset_val -> lower bytes
        movi            v2.16b, #2                 // bias diff (-2..2) into table range 0..4
        add             x16, x0, x2                // dst pointer for the odd row
        lsl             x2,  x2, #1                // dst pointers advance two rows per iteration
        mov             x15, #SAO_STRIDE           // fixed pitch of the temporary src buffer
        mov             x8,  x1                    // cur
        sub             x9,  x1, x4                // src_a (prev) = src - sao_edge_pos
        add             x10, x1, x4                // src_b (next) = src + sao_edge_pos
1:      ld1             {v3.d}[0], [ x8], x15      // even row: cur
        ld1             {v4.d}[0], [ x9], x15      // even row: prev
        ld1             {v5.d}[0], [x10], x15      // even row: next
        ld1             {v3.d}[1], [ x8], x15      // odd row: cur
        ld1             {v4.d}[1], [ x9], x15      // odd row: prev
        ld1             {v5.d}[1], [x10], x15      // odd row: next
        subs            w6, w6, #2                 // two lines filtered per iteration
        cmhi            v16.16b, v4.16b, v3.16b    // (prev > cur)
        cmhi            v17.16b, v3.16b, v4.16b    // (cur > prev)
        cmhi            v18.16b, v5.16b, v3.16b    // (next > cur)
        cmhi            v19.16b, v3.16b, v5.16b    // (cur > next)
        sub             v20.16b, v16.16b, v17.16b  // diff0 = CMP(cur, prev)
        sub             v21.16b, v18.16b, v19.16b  // diff1 = CMP(cur, next)
        add             v20.16b, v20.16b, v21.16b  // diff = diff0 + diff1, in -2..2
        add             v20.16b, v20.16b, v2.16b   // table index = diff + 2
        tbl             v16.16b, {v0.16b}, v20.16b // look up offset low bytes
        tbl             v17.16b, {v1.16b}, v20.16b // look up offset high bytes
        uxtl            v20.8h, v3.8b              // even row widened to 16-bit
        uxtl2           v21.8h, v3.16b             // odd row widened to 16-bit
        zip1            v18.16b, v16.16b, v17.16b  // recombine into 16-bit offsets (even row)
        zip2            v19.16b, v16.16b, v17.16b  // recombine into 16-bit offsets (odd row)
        sqadd           v20.8h, v18.8h, v20.8h     // cur + sao_offset_val, saturating
        sqadd           v21.8h, v19.8h, v21.8h
        sqxtun          v6.8b, v20.8h              // clip to 0..255 + narrow
        sqxtun          v7.8b, v21.8h
        st1             {v6.8b}, [ x0], x2         // store even row
        st1             {v7.8b}, [x16], x2         // store odd row
        b.ne            1b
        ret
endfunc
198