;******************************************************************************
;* SIMD optimized SAO functions for HEVC 8bit decoding
;*
;* Copyright (c) 2013 Pierre-Edouard LEPERE
;* Copyright (c) 2014 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

; pshufb control applied to the packed SAO offsets: picks bytes 1, 2, 0, 3, 4
; into positions 0..4; the -1 entries (high bit set) produce zero bytes.
; Stored twice so the same pattern covers both 128-bit lanes of a ymm register.
pb_edge_shuffle: times 2 db 1, 2, 0, 3, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
; Neighbour positions, 4 bytes per eo value: a_x, a_y, b_x, b_y
; (see HEVC_SAO_EDGE_FILTER_INIT for how they are turned into byte offsets).
pb_eo:                   db -1, 0, 1, 0, 0, -1, 0, 1, -1, -1, 1, 1, 1, -1, -1, 1
cextern pb_1
cextern pb_2

SECTION .text

;******************************************************************************
;SAO Band Filter
;******************************************************************************

;------------------------------------------------------------------------------
; HEVC_SAO_BAND_FILTER_INIT
; Prologue shared by all band filter functions.
; In:   leftq   = sao_left_class, offsetq = sao_offset_val
; Out:  m0..m3  = (left + k) & 31 for k = 0..3, one value replicated per word
;       m4..m7  = the 16-bit offsets at [offsetq + 2 .. offsetq + 8], replicated
;       m14     = zero (used as the unpack/pack partner)
;       heightd = height (8th argument, r7m); the register previously named
;                 "left" is renamed to "height" by DEFINE_ARGS
; x86-32: only 8 vector registers are available, so m0..m6 are spilled to the
;         stack area reserved by cglobal and m14/m13/m9/m8 are aliased to
;         m0/m1/m2/m3; m7 (fourth offset) stays in a register.
; SPLATW comes from x86util.asm and is expected to replicate one word across
; the destination register.
;------------------------------------------------------------------------------
%macro HEVC_SAO_BAND_FILTER_INIT 0
    and            leftq, 31
    movd             xm0, leftd
    add            leftq, 1
    and            leftq, 31
    movd             xm1, leftd
    add            leftq, 1
    and            leftq, 31
    movd             xm2, leftd
    add            leftq, 1
    and            leftq, 31
    movd             xm3, leftd

    SPLATW            m0, xm0
    SPLATW            m1, xm1
    SPLATW            m2, xm2
    SPLATW            m3, xm3
%if mmsize > 16
    SPLATW            m4, [offsetq + 2]
    SPLATW            m5, [offsetq + 4]
    SPLATW            m6, [offsetq + 6]
    SPLATW            m7, [offsetq + 8]
%else
    movq              m7, [offsetq + 2]          ; four offsets in one load
    SPLATW            m4, m7, 0
    SPLATW            m5, m7, 1
    SPLATW            m6, m7, 2
    SPLATW            m7, m7, 3
%endif

%if ARCH_X86_64
    pxor             m14, m14

%else ; ARCH_X86_32
    mova  [rsp+mmsize*0], m0
    mova  [rsp+mmsize*1], m1
    mova  [rsp+mmsize*2], m2
    mova  [rsp+mmsize*3], m3
    mova  [rsp+mmsize*4], m4
    mova  [rsp+mmsize*5], m5
    mova  [rsp+mmsize*6], m6
    pxor              m0, m0
    ; MMSIZE freezes the spill slot size: the 48-wide function switches to
    ; INIT_XMM inside its loop, which would otherwise change mmsize.
    %assign MMSIZE mmsize
    %define m14 m0
    %define m13 m1
    %define  m9 m2
    %define  m8 m3
%endif ; ARCH
DEFINE_ARGS dst, src, dststride, srcstride, offset, height
    mov          heightd, r7m
%endmacro

;------------------------------------------------------------------------------
; HEVC_SAO_BAND_FILTER_COMPUTE tmp, px
; %1 = scratch register, %2 = pixels already widened to 16-bit words (in/out)
; Computes band = px >> 3, compares it against the four band indices prepared
; by HEVC_SAO_BAND_FILTER_INIT, selects the matching offset (zero if no band
; matches) and adds it to px. No clamping is done here; callers clamp with
; packuswb.
; Clobbers: %1, and m10..m12 (x86-64) or m4..m6 (x86-32).
;------------------------------------------------------------------------------
%macro HEVC_SAO_BAND_FILTER_COMPUTE 2
    psraw             %1, %2, 3
%if ARCH_X86_64
    pcmpeqw          m10, %1, m0
    pcmpeqw          m11, %1, m1
    pcmpeqw          m12, %1, m2
    pcmpeqw           %1, m3
    pand             m10, m4
    pand             m11, m5
    pand             m12, m6
    pand              %1, m7
    por              m10, m11
    por              m12, %1
    por              m10, m12
    paddw             %2, m10
%else ; ARCH_X86_32
    pcmpeqw           m4, %1, [rsp+MMSIZE*0]
    pcmpeqw           m5, %1, [rsp+MMSIZE*1]
    pcmpeqw           m6, %1, [rsp+MMSIZE*2]
    pcmpeqw           %1, [rsp+MMSIZE*3]
    pand              m4, [rsp+MMSIZE*4]
    pand              m5, [rsp+MMSIZE*5]
    pand              m6, [rsp+MMSIZE*6]
    pand              %1, m7
    por               m4, m5
    por               m6, %1
    por               m4, m6
    paddw             %2, m4
%endif ; ARCH
%endmacro

;void ff_hevc_sao_band_filter_<width>_8_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
;                                             int16_t *sao_offset_val, int sao_left_class, int width, int height);
;------------------------------------------------------------------------------
; HEVC_SAO_BAND_FILTER width, chunks
; %1 = block width in pixels (selects the function name and the 8/48 special
;      cases), %2 = number of full mmsize-byte chunks processed per row.
; Width 8 uses a single 8-byte movq path; width 48 processes %2 full chunks
; and then one extra 16-byte chunk with XMM registers.
; Loads and stores of full chunks use mova.
;------------------------------------------------------------------------------
%macro HEVC_SAO_BAND_FILTER 2
cglobal hevc_sao_band_filter_%1_8, 6, 6, 15, 7*mmsize*ARCH_X86_32, dst, src, dststride, srcstride, offset, left
    HEVC_SAO_BAND_FILTER_INIT

align 16
.loop:
%if %1 == 8
    movq              m8, [srcq]
    punpcklbw         m8, m14                    ; bytes -> words
    HEVC_SAO_BAND_FILTER_COMPUTE m9, m8
    packuswb          m8, m14                    ; words -> bytes, clamped to 0..255
    movq          [dstq], m8
%endif ; %1 == 8

%assign i 0
%rep %2
    mova             m13, [srcq + i]
    punpcklbw         m8, m13, m14
    HEVC_SAO_BAND_FILTER_COMPUTE m9,  m8
    punpckhbw        m13, m14
    HEVC_SAO_BAND_FILTER_COMPUTE m9, m13
    packuswb          m8, m13
    mova      [dstq + i], m8
%assign i i+mmsize
%endrep

%if %1 == 48
; remaining 16 bytes of the row, always done with 128-bit registers
INIT_XMM cpuname

    mova             m13, [srcq + i]
    punpcklbw         m8, m13, m14
    HEVC_SAO_BAND_FILTER_COMPUTE m9,  m8
    punpckhbw        m13, m14
    HEVC_SAO_BAND_FILTER_COMPUTE m9, m13
    packuswb          m8, m13
    mova      [dstq + i], m8
%if cpuflag(avx2)
INIT_YMM cpuname
%endif
%endif ; %1 == 48

    add             dstq, dststrideq             ; dst += dststride
    add             srcq, srcstrideq             ; src += srcstride
    dec          heightd                         ; cmp height
    jnz               .loop                      ; height loop
    REP_RET
%endmacro


; Instantiates the band filter for every supported width with 16-byte registers.
%macro HEVC_SAO_BAND_FILTER_FUNCS 0
HEVC_SAO_BAND_FILTER  8, 0
HEVC_SAO_BAND_FILTER 16, 1
HEVC_SAO_BAND_FILTER 32, 2
HEVC_SAO_BAND_FILTER 48, 2
HEVC_SAO_BAND_FILTER 64, 4
%endmacro

INIT_XMM sse2
HEVC_SAO_BAND_FILTER_FUNCS
INIT_XMM avx
HEVC_SAO_BAND_FILTER_FUNCS

%if HAVE_AVX2_EXTERNAL
INIT_XMM avx2
HEVC_SAO_BAND_FILTER  8, 0
HEVC_SAO_BAND_FILTER 16, 1
INIT_YMM avx2
HEVC_SAO_BAND_FILTER 32, 1
HEVC_SAO_BAND_FILTER 48, 1
HEVC_SAO_BAND_FILTER 64, 2
%endif

;******************************************************************************
;SAO Edge Filter
;******************************************************************************

%define MAX_PB_SIZE  64
%define PADDING_SIZE 64 ; AV_INPUT_BUFFER_PADDING_SIZE
; Fixed row pitch of the edge filter source buffer. Parenthesized so the value
; stays intact if the macro is ever used inside a larger expression.
%define EDGE_SRCSTRIDE (2 * MAX_PB_SIZE + PADDING_SIZE)

;------------------------------------------------------------------------------
; HEVC_SAO_EDGE_FILTER_INIT
; Converts the eo argument into the byte offsets of the two neighbour pixels:
;   a_stride = pb_eo[eo*4 + 1] * EDGE_SRCSTRIDE + pb_eo[eo*4 + 0]
;   b_stride = pb_eo[eo*4 + 3] * EDGE_SRCSTRIDE + pb_eo[eo*4 + 2]
; Expects eoq, tmpq and tmp2q to be defined by the caller; all three are
; clobbered. pb_eo is addressed through a register (tmp2q) because it is
; indexed by eoq.
;------------------------------------------------------------------------------
%macro HEVC_SAO_EDGE_FILTER_INIT 0
%if WIN64
    movsxd           eoq, dword eom
%elif ARCH_X86_64
    movsxd           eoq, eod
%else
    mov              eoq, r4m
%endif
    lea            tmp2q, [pb_eo]
    movsx      a_strideq, byte [tmp2q+eoq*4+1]
    movsx      b_strideq, byte [tmp2q+eoq*4+3]
    imul       a_strideq, EDGE_SRCSTRIDE
    imul       b_strideq, EDGE_SRCSTRIDE
    movsx           tmpq, byte [tmp2q+eoq*4]
    add        a_strideq, tmpq
    movsx           tmpq, byte [tmp2q+eoq*4+2]
    add        b_strideq, tmpq
%endmacro

;------------------------------------------------------------------------------
; HEVC_SAO_EDGE_FILTER_COMPUTE width
; %1 = block width; widths > 8 process both halves of the register.
; In:   m1 = centre pixels c, m2 = neighbour a, m3 = neighbour b (bytes)
;       m0 = packed, shuffled offsets, m6 = pb_2, m7 = pb_1
; Out:  m3 = clamp(c + offset[sign(c - a) + sign(c - b) + 2]) as bytes
; Clobbers: m2, m4, m5.
; sign() is built from unsigned min + equality compares:
;   (min(c,n) == c) - (min(c,n) == n)  ->  -1 if c < n, +1 if c > n, 0 if equal
;------------------------------------------------------------------------------
%macro HEVC_SAO_EDGE_FILTER_COMPUTE 1
    pminub            m4, m1, m2
    pminub            m5, m1, m3
    pcmpeqb           m2, m4
    pcmpeqb           m3, m5
    pcmpeqb           m4, m1
    pcmpeqb           m5, m1
    psubb             m4, m2
    psubb             m5, m3
    paddb             m4, m6
    paddb             m4, m5

    pshufb            m2, m0, m4                 ; m2 = offset selected per pixel
    ; Interleave (1, c) with (offset, 1) so that pmaddubsw yields
    ; 1*offset + c*1 as a signed word; packuswb then clamps to 0..255.
%if %1 > 8
    punpckhbw         m5, m7, m1
    punpckhbw         m4, m2, m7
    punpcklbw         m3, m7, m1
    punpcklbw         m2, m7
    pmaddubsw         m5, m4
    pmaddubsw         m3, m2
    packuswb          m3, m5
%else
    punpcklbw         m3, m7, m1
    punpcklbw         m2, m7
    pmaddubsw         m3, m2
    packuswb          m3, m3
%endif
%endmacro

;void ff_hevc_sao_edge_filter_<width>_8_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
;                                             int eo, int width, int height);
;------------------------------------------------------------------------------
; HEVC_SAO_EDGE_FILTER width, chunks [, store]
; %1 = block width in pixels, %2 = number of full mmsize-byte chunks per row,
; %3 = suffix of the store instruction for full chunks ("a" -> mova,
;      "u" -> movu); may be omitted when %2 is 0.
; The source is walked with the fixed pitch EDGE_SRCSTRIDE; the destination
; uses dststride. Centre pixels are loaded with mova, neighbours with movu.
; x86-32: only six GPRs are requested, so eo/tmp/tmp2/offset are aliases of
;         src/height/dststride/height and the real values are reloaded from
;         the stack arguments once HEVC_SAO_EDGE_FILTER_INIT is done.
;------------------------------------------------------------------------------
%macro HEVC_SAO_EDGE_FILTER 2-3
%if ARCH_X86_64
cglobal hevc_sao_edge_filter_%1_8, 4, 9, 8, dst, src, dststride, offset, eo, a_stride, b_stride, height, tmp
%define tmp2q heightq
    HEVC_SAO_EDGE_FILTER_INIT
    mov          heightd, r6m

%else ; ARCH_X86_32
cglobal hevc_sao_edge_filter_%1_8, 1, 6, 8, dst, src, dststride, a_stride, b_stride, height
%define eoq   srcq
%define tmpq  heightq
%define tmp2q dststrideq
%define offsetq heightq
    HEVC_SAO_EDGE_FILTER_INIT
    mov             srcq, srcm
    mov          offsetq, r3m
    mov       dststrideq, dststridem
%endif ; ARCH

%if mmsize > 16
    vbroadcasti128    m0, [offsetq]
%else
    movu              m0, [offsetq]
%endif
    mova              m1, [pb_edge_shuffle]
    packsswb          m0, m0                     ; 16-bit offsets -> signed bytes
    mova              m7, [pb_1]
    pshufb            m0, m1                     ; reorder offsets for the index lookup
    mova              m6, [pb_2]
%if ARCH_X86_32
    mov          heightd, r6m                    ; heightq was aliased to offsetq until here
%endif

align 16
.loop:

%if %1 == 8
    movq              m1, [srcq]
    movq              m2, [srcq + a_strideq]
    movq              m3, [srcq + b_strideq]
    HEVC_SAO_EDGE_FILTER_COMPUTE %1
    movq          [dstq], m3
%endif

%assign i 0
%rep %2
    mova              m1, [srcq + i]
    movu              m2, [srcq + a_strideq + i]
    movu              m3, [srcq + b_strideq + i]
    HEVC_SAO_EDGE_FILTER_COMPUTE %1
    mov%3     [dstq + i], m3
%assign i i+mmsize
%endrep

%if %1 == 48
; remaining 16 bytes of the row, always done with 128-bit registers
INIT_XMM cpuname

    mova              m1, [srcq + i]
    movu              m2, [srcq + a_strideq + i]
    movu              m3, [srcq + b_strideq + i]
    HEVC_SAO_EDGE_FILTER_COMPUTE %1
    mova      [dstq + i], m3
%if cpuflag(avx2)
INIT_YMM cpuname
%endif
%endif

    add             dstq, dststrideq
    add             srcq, EDGE_SRCSTRIDE
    dec          heightd
    jg .loop
    RET
%endmacro

INIT_XMM ssse3
HEVC_SAO_EDGE_FILTER  8, 0
HEVC_SAO_EDGE_FILTER 16, 1, a
HEVC_SAO_EDGE_FILTER 32, 2, a
HEVC_SAO_EDGE_FILTER 48, 2, a
HEVC_SAO_EDGE_FILTER 64, 4, a

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
HEVC_SAO_EDGE_FILTER 32, 1, a
HEVC_SAO_EDGE_FILTER 48, 1, u
HEVC_SAO_EDGE_FILTER 64, 2, a
%endif