/* -*-arm64-*-
 * vim: syntax=arm64asm
 *
 * AArch64 NEON optimised SAO functions for HEVC decoding
 *
 * Copyright (c) 2022 J. Dekker <jdek@itanimul.li>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

#define MAX_PB_SIZE 64
#define AV_INPUT_BUFFER_PADDING_SIZE 64
#define SAO_STRIDE (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE)

// void sao_band_filter(uint8_t *_dst, uint8_t *_src,
//                      ptrdiff_t stride_dst, ptrdiff_t stride_src,
//                      int16_t *sao_offset_val, int sao_left_class,
//                      int width, int height)
function ff_hevc_sao_band_filter_8x8_8_neon, export=1
        sub             sp, sp, #64
        stp             xzr, xzr, [sp]                  // zero 32 x int16 offset_table on the stack
        stp             xzr, xzr, [sp, #16]
        stp             xzr, xzr, [sp, #32]
        stp             xzr, xzr, [sp, #48]
        mov             w8, #4
0:      ldrsh           x9, [x4, x8, lsl #1]            // sao_offset_val[k+1]
        subs            w8, w8, #1
        add             w10, w8, w5                     // k + sao_left_class
        and             w10, w10, #0x1F
        strh            w9, [sp, x10, lsl #1]           // offset_table[(k + sao_left_class) & 31]
        bne             0b
        add             w6, w6, #7                      // round width up to a multiple of 8
        bic             w6, w6, #7
        ld1             {v16.16b-v19.16b}, [sp], #64    // load offset_table, restore sp
        sub             x2, x2, x6                      // stride_dst - width
        sub             x3, x3, x6                      // stride_src - width
        movi            v20.8h, #1
1:      mov             w8, w6                          // beginning of line
2:      // Simple layout for accessing 16bit values
        // with 8bit LUT.
        //
        //  00 01 02 03 04 05 06 07
        // +----------------------------------->
        // |xDE#xAD|xCA#xFE|xBE#xEF|xFE#xED|....
        // +----------------------------------->
        //   i-0     i-1     i-2     i-3
        ld1             {v2.8b}, [x1], #8               // dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
        subs            w8, w8, #8
        uxtl            v0.8h, v2.8b                    // load src[x]
        ushr            v2.8h, v0.8h, #3                // >> (BIT_DEPTH - 5)
        shl             v1.8h, v2.8h, #1                // low byte index (x2, accessing short)
        add             v3.8h, v1.8h, v20.8h            // +1 to access upper byte of the short
        sli             v1.8h, v3.8h, #8                // shift insert upper index into upper byte
        tbx             v2.16b, {v16.16b-v19.16b}, v1.16b // table lookup
        add             v1.8h, v0.8h, v2.8h             // src[x] + table
        sqxtun          v4.8b, v1.8h                    // clip + narrow
        st1             {v4.8b}, [x0], #8               // store
        // done 8 pixels
        bne             2b
        subs            w7, w7, #1                      // finished line, prep. new line
        add             x0, x0, x2                      // dst += stride_dst
        add             x1, x1, x3                      // src += stride_src
        bne             1b
        ret
endfunc

.Lsao_edge_pos:
.word 1              // horizontal
.word SAO_STRIDE     // vertical
.word SAO_STRIDE + 1 // 45 degree
.word SAO_STRIDE - 1 // 135 degree

// void ff_hevc_sao_edge_filter_16x16_8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride_dst,
//                                           int16_t *sao_offset_val, int eo, int width, int height)
function ff_hevc_sao_edge_filter_16x16_8_neon, export=1
        adr             x7, .Lsao_edge_pos
        ld1             {v3.8h}, [x3]                   // load sao_offset_val
        add             w5, w5, #0xF                    // round width up to a multiple of 16
        bic             w5, w5, #0xF
        ldr             w4, [x7, w4, uxtw #2]           // sao_edge_pos[eo]
        mov             v3.h[7], v3.h[0]                // reorder to [1,2,0,3,4]
        mov             v3.h[0], v3.h[1]
        mov             v3.h[1], v3.h[2]
        mov             v3.h[2], v3.h[7]
        // split 16bit values into two tables
        uzp2            v1.16b, v3.16b, v3.16b          // sao_offset_val -> upper bytes
        uzp1            v0.16b, v3.16b, v3.16b          // sao_offset_val -> lower bytes
        movi            v2.16b, #2
        mov             x15, #SAO_STRIDE
        // strides between end of line and next src/dst
        sub             x15, x15, x5                    // stride_src - width
        sub             x16, x2, x5                     // stride_dst - width
        mov             x11, x1                         // copy base src
1:      // new line
        mov             x14, x5                         // copy width
        sub             x12, x11, x4                    // src_a (prev) = src - sao_edge_pos
        add             x13, x11, x4                    // src_b (next) = src + sao_edge_pos
2:      // process 16 bytes
        ld1             {v3.16b}, [x11], #16            // load src
        ld1             {v4.16b}, [x12], #16            // load src_a (prev)
        ld1             {v5.16b}, [x13], #16            // load src_b (next)
        subs            x14, x14, #16
        cmhi            v16.16b, v4.16b, v3.16b         // (prev > cur)
        cmhi            v17.16b, v3.16b, v4.16b         // (cur > prev)
        cmhi            v18.16b, v5.16b, v3.16b         // (next > cur)
        cmhi            v19.16b, v3.16b, v5.16b         // (cur > next)
        sub             v20.16b, v16.16b, v17.16b       // diff0 = CMP(cur, prev) = (cur > prev) - (cur < prev)
        sub             v21.16b, v18.16b, v19.16b       // diff1 = CMP(cur, next) = (cur > next) - (cur < next)
        add             v20.16b, v20.16b, v21.16b       // diff = diff0 + diff1
        add             v20.16b, v20.16b, v2.16b        // offset index = diff + 2
        tbl             v16.16b, {v0.16b}, v20.16b      // lower offset bytes
        tbl             v17.16b, {v1.16b}, v20.16b      // upper offset bytes
        uxtl            v20.8h, v3.8b                   // src[0:7]
        uxtl2           v21.8h, v3.16b                  // src[8:15]
        zip1            v18.16b, v16.16b, v17.16b       // 16bit sao_offset_val for pixels 0-7
        zip2            v19.16b, v16.16b, v17.16b       // 16bit sao_offset_val for pixels 8-15
        sqadd           v20.8h, v18.8h, v20.8h          // + sao_offset_val
        sqadd           v21.8h, v19.8h, v21.8h
        sqxtun          v3.8b, v20.8h
        sqxtun2         v3.16b, v21.8h
        st1             {v3.16b}, [x0], #16
        // filtered 16 bytes
        b.ne            2b                              // do we have width to filter?
        // no width to filter, setup next line
        subs            w6, w6, #1                      // filtered line
        add             x11, x11, x15                   // stride src to next line
        add             x0, x0, x16                     // stride dst to next line
        b.ne            1b                              // do we have lines to process?
        // no lines to filter
        ret
endfunc

// void ff_hevc_sao_edge_filter_8x8_8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride_dst,
//                                         int16_t *sao_offset_val, int eo, int width, int height)
function ff_hevc_sao_edge_filter_8x8_8_neon, export=1
        adr             x7, .Lsao_edge_pos
        ldr             w4, [x7, w4, uxtw #2]           // sao_edge_pos[eo]
        ld1             {v3.8h}, [x3]                   // load sao_offset_val
        mov             v3.h[7], v3.h[0]                // reorder to [1,2,0,3,4]
        mov             v3.h[0], v3.h[1]
        mov             v3.h[1], v3.h[2]
        mov             v3.h[2], v3.h[7]
        uzp2            v1.16b, v3.16b, v3.16b          // sao_offset_val -> upper bytes
        uzp1            v0.16b, v3.16b, v3.16b          // sao_offset_val -> lower bytes
        movi            v2.16b, #2
        add             x16, x0, x2                     // second dst row
        lsl             x2, x2, #1                      // 2 * stride_dst, two rows per iteration
        mov             x15, #SAO_STRIDE
        mov             x8,  x1                         // src
        sub             x9,  x1, x4                     // src_a (prev) = src - sao_edge_pos
        add             x10, x1, x4                     // src_b (next) = src + sao_edge_pos
1:      ld1             {v3.d}[0], [ x8], x15           // load two 8-byte rows of src,
        ld1             {v4.d}[0], [ x9], x15           // of src_a (prev)
        ld1             {v5.d}[0], [x10], x15           // and of src_b (next)
        ld1             {v3.d}[1], [ x8], x15
        ld1             {v4.d}[1], [ x9], x15
        ld1             {v5.d}[1], [x10], x15
        subs            w6, w6, #2                      // two lines per iteration
        cmhi            v16.16b, v4.16b, v3.16b         // (prev > cur)
        cmhi            v17.16b, v3.16b, v4.16b         // (cur > prev)
        cmhi            v18.16b, v5.16b, v3.16b         // (next > cur)
        cmhi            v19.16b, v3.16b, v5.16b         // (cur > next)
        sub             v20.16b, v16.16b, v17.16b       // CMP(cur, prev)
        sub             v21.16b, v18.16b, v19.16b       // CMP(cur, next)
        add             v20.16b, v20.16b, v21.16b
        add             v20.16b, v20.16b, v2.16b        // offset index = diff + 2
        tbl             v16.16b, {v0.16b}, v20.16b      // lower offset bytes
        tbl             v17.16b, {v1.16b}, v20.16b      // upper offset bytes
        uxtl            v20.8h, v3.8b                   // src row 0
        uxtl2           v21.8h, v3.16b                  // src row 1
        zip1            v18.16b, v16.16b, v17.16b       // 16bit sao_offset_val, row 0
        zip2            v19.16b, v16.16b, v17.16b       // 16bit sao_offset_val, row 1
        sqadd           v20.8h, v18.8h, v20.8h          // + sao_offset_val
        sqadd           v21.8h, v19.8h, v21.8h
        sqxtun          v6.8b, v20.8h                   // clip + narrow
        sqxtun          v7.8b, v21.8h
        st1             {v6.8b}, [ x0], x2              // store row 0
        st1             {v7.8b}, [x16], x2              // store row 1
        b.ne            1b                              // do we have lines to process?
        ret
endfunc
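
/* For reference, a scalar C sketch of the filters implemented above
 * (illustration only, not part of the build). clip_u8(), offset_remap[]
 * and pos are hypothetical names: the real decoder uses av_clip_pixel(),
 * the sao_offset_val[] table reordered to [1,2,0,3,4] as done above, and
 * the per-class sample offset from .Lsao_edge_pos; BIT_DEPTH is assumed
 * to be 8.
 *
 *     // band filter
 *     for (k = 0; k < 4; k++)
 *         offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1];
 *     for (y = 0; y < height; y++, dst += stride_dst, src += stride_src)
 *         for (x = 0; x < width; x++)
 *             dst[x] = clip_u8(src[x] + offset_table[src[x] >> 3]);
 *
 *     // edge filter, CMP(a, b) = (a > b) - (a < b), pos = sao_edge_pos[eo]
 *     for (y = 0; y < height; y++, dst += stride_dst, src += SAO_STRIDE)
 *         for (x = 0; x < width; x++) {
 *             int idx = CMP(src[x], src[x - pos]) + CMP(src[x], src[x + pos]) + 2;
 *             dst[x] = clip_u8(src[x] + offset_remap[idx]);
 *         }
 */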