1cabdff1aSopenharmony_ci;******************************************************************************
2cabdff1aSopenharmony_ci;* SIMD optimized SAO functions for HEVC 10/12bit decoding
3cabdff1aSopenharmony_ci;*
4cabdff1aSopenharmony_ci;* Copyright (c) 2013 Pierre-Edouard LEPERE
5cabdff1aSopenharmony_ci;* Copyright (c) 2014 James Almer
6cabdff1aSopenharmony_ci;*
7cabdff1aSopenharmony_ci;* This file is part of FFmpeg.
8cabdff1aSopenharmony_ci;*
9cabdff1aSopenharmony_ci;* FFmpeg is free software; you can redistribute it and/or
10cabdff1aSopenharmony_ci;* modify it under the terms of the GNU Lesser General Public
11cabdff1aSopenharmony_ci;* License as published by the Free Software Foundation; either
12cabdff1aSopenharmony_ci;* version 2.1 of the License, or (at your option) any later version.
13cabdff1aSopenharmony_ci;*
14cabdff1aSopenharmony_ci;* FFmpeg is distributed in the hope that it will be useful,
15cabdff1aSopenharmony_ci;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16cabdff1aSopenharmony_ci;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17cabdff1aSopenharmony_ci;* Lesser General Public License for more details.
18cabdff1aSopenharmony_ci;*
19cabdff1aSopenharmony_ci;* You should have received a copy of the GNU Lesser General Public
20cabdff1aSopenharmony_ci;* License along with FFmpeg; if not, write to the Free Software
21cabdff1aSopenharmony_ci;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22cabdff1aSopenharmony_ci;******************************************************************************
23cabdff1aSopenharmony_ci
24cabdff1aSopenharmony_ci%include "libavutil/x86/x86util.asm"
25cabdff1aSopenharmony_ci
SECTION_RODATA 32

; Word constants, 16 lanes each (32 bytes, enough for one YMM load).
pw_m2:     times 16 dw -2
pw_mask10: times 16 dw (1 << 10) - 1        ; largest 10-bit sample value
pw_mask12: times 16 dw (1 << 12) - 1        ; largest 12-bit sample value

; Neighbour steps for the edge filter, four signed bytes per eo class:
;   a.x, a.y, b.x, b.y
; HEVC_SAO_EDGE_FILTER_INIT indexes this table with eo*4; the .y bytes are
; scaled by the row pitch and the .x bytes are added as-is.
pb_eo:     db -1,  0,  1,  0                ; eo = 0
           db  0, -1,  0,  1                ; eo = 1
           db -1, -1,  1,  1                ; eo = 2
           db  1, -1, -1,  1                ; eo = 3

; Shared word constants defined outside this file.
cextern pw_1
cextern pw_2
cextern pw_m1
35cabdff1aSopenharmony_ci
36cabdff1aSopenharmony_ciSECTION .text
37cabdff1aSopenharmony_ci
38cabdff1aSopenharmony_ci;******************************************************************************
39cabdff1aSopenharmony_ci;SAO Band Filter
40cabdff1aSopenharmony_ci;******************************************************************************
41cabdff1aSopenharmony_ci
; HEVC_SAO_BAND_FILTER_INIT <bitdepth>
;
; Prologue shared by every band-filter function. It expects the argument
; names `offset` and `left` to be bound by the enclosing cglobal.
;
; After it runs:
;   m0-m3  the four band indices left, left+1, left+2, left+3 (each taken
;          modulo 32), replicated with SPLATW
;   m4-m7  the 16-bit words at offsetq+2, +4, +6 and +8, replicated with SPLATW
;   x86-64: m13 = pw_mask<bitdepth>, m14 = 0
;   x86-32: m0-m6 are spilled to [rsp+mmsize*0 .. mmsize*6] (m7 stays in its
;           register), then m0 = 0 and m1 = pw_mask<bitdepth>; the names
;           m14/m13/m9/m8 are aliased to m0/m1/m2/m3 for the loop body
;   the sixth argument name is rebound from `left` to `height`, and heightd
;   is loaded from r7m
;
; SPLATW comes from x86util.asm; it is used here as a word broadcast.
%macro HEVC_SAO_BAND_FILTER_INIT 1
    ; Derive the four consecutive band indices, wrapping at 32.
    and            leftq, 31
    movd             xm0, leftd
    add            leftq, 1
    and            leftq, 31
    movd             xm1, leftd
    add            leftq, 1
    and            leftq, 31
    movd             xm2, leftd
    add            leftq, 1
    and            leftq, 31
    movd             xm3, leftd

    SPLATW            m0, xm0
    SPLATW            m1, xm1
    SPLATW            m2, xm2
    SPLATW            m3, xm3
%if mmsize > 16
    ; Wide vectors: replicate each offset word straight from memory.
    SPLATW            m4, [offsetq + 2]
    SPLATW            m5, [offsetq + 4]
    SPLATW            m6, [offsetq + 6]
    SPLATW            m7, [offsetq + 8]
%else
    ; 128-bit vectors: one 8-byte load, then replicate words 0..3 of it.
    ; m7 is both the source and the last destination, so it is written last.
    movq              m7, [offsetq + 2]
    SPLATW            m4, m7, 0
    SPLATW            m5, m7, 1
    SPLATW            m6, m7, 2
    SPLATW            m7, m7, 3
%endif

%if ARCH_X86_64
    ; Clip bounds for the loop: m14 = lower (0), m13 = upper (sample mask).
    mova             m13, [pw_mask %+ %1]
    pxor             m14, m14

%else ; ARCH_X86_32
    ; Only m0-m7 are used on this path: park the band indices and the first
    ; three offsets on the stack so m0-m6 can be reused.
    mova  [rsp+mmsize*0], m0
    mova  [rsp+mmsize*1], m1
    mova  [rsp+mmsize*2], m2
    mova  [rsp+mmsize*3], m3
    mova  [rsp+mmsize*4], m4
    mova  [rsp+mmsize*5], m5
    mova  [rsp+mmsize*6], m6
    mova              m1, [pw_mask %+ %1]
    pxor              m0, m0
    ; Map the register names the loop body uses onto m0-m3.
    %define m14 m0
    %define m13 m1
    %define  m9 m2
    %define  m8 m3
%endif ; ARCH
; `left` is no longer needed: reuse its argument slot for the row counter.
DEFINE_ARGS dst, src, dststride, srcstride, offset, height
    mov          heightd, r7m
%endmacro
94cabdff1aSopenharmony_ci
;void ff_hevc_sao_band_filter_<width>_<depth>_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
;                                                   int16_t *sao_offset_val, int sao_left_class, int width, int height);
;
; HEVC_SAO_BAND_FILTER <bitdepth>, <width>, <vectors per row>
;
; Emits one band-filter function. <width> only appears in the symbol name;
; the amount of data handled per row is <vectors per row> * mmsize bytes, and
; the C `width` argument is never read.
;
; For each vector of 16-bit samples:
;   band   = sample >> (bitdepth - 5)
;   sample += the offset whose band index (m0-m3) equals `band`, if any
;   sample  = CLIPW(sample, 0, pw_mask<bitdepth>)
; Loads and stores use mova, so src and dst rows have to be suitably aligned.
;
; On x86-32 the function reserves 7*mmsize bytes of stack for the values
; spilled by HEVC_SAO_BAND_FILTER_INIT; on x86-64 nothing is reserved.
; CLIPW comes from x86util.asm.
%macro HEVC_SAO_BAND_FILTER 3
cglobal hevc_sao_band_filter_%2_%1, 6, 6, 15, 7*mmsize*ARCH_X86_32, dst, src, dststride, srcstride, offset, left
    HEVC_SAO_BAND_FILTER_INIT %1

align 16
.loop:

; i = byte offset into the row, j = vector counter.
; k/l swap between 8 and 9 on every vector so that consecutive vectors use
; different registers for the sample (m<k>) and its band index (m<l>).
%assign i 0
%assign j 0
%rep %3
%assign k 8+(j&1)
%assign l 9-(j&1)
    mova          m %+ k, [srcq + i]
    psraw         m %+ l, m %+ k, %1-5
%if ARCH_X86_64
    ; One all-ones/all-zeros mask per candidate band ...
    pcmpeqw          m10, m %+ l, m0
    pcmpeqw          m11, m %+ l, m1
    pcmpeqw          m12, m %+ l, m2
    pcmpeqw       m %+ l, m3
    ; ... turned into that band's offset (or 0) ...
    pand             m10, m4
    pand             m11, m5
    pand             m12, m6
    pand          m %+ l, m7
    ; ... and merged; the four band indices differ, so at most one matches.
    por              m10, m11
    por              m12, m %+ l
    por              m10, m12
    paddw         m %+ k, m10
%else ; ARCH_X86_32
    ; Same computation with the band indices and offsets 0-2 read back from
    ; the stack; m4-m6 are scratch, m7 still holds the fourth offset.
    pcmpeqw           m4, m %+ l, [rsp+mmsize*0]
    pcmpeqw           m5, m %+ l, [rsp+mmsize*1]
    pcmpeqw           m6, m %+ l, [rsp+mmsize*2]
    pcmpeqw       m %+ l, [rsp+mmsize*3]
    pand              m4, [rsp+mmsize*4]
    pand              m5, [rsp+mmsize*5]
    pand              m6, [rsp+mmsize*6]
    pand          m %+ l, m7
    por               m4, m5
    por               m6, m %+ l
    por               m4, m6
    paddw         m %+ k, m4
%endif ; ARCH
    ; m14 = 0 and m13 = sample mask, set up by the INIT macro.
    CLIPW             m %+ k, m14, m13
    mova      [dstq + i], m %+ k
%assign i i+mmsize
%assign j j+1
%endrep

    ; Next row; loop while the remaining row count is still positive.
    add             dstq, dststrideq
    add             srcq, srcstrideq
    dec          heightd
    jg .loop
    REP_RET
%endmacro
150cabdff1aSopenharmony_ci
; HEVC_SAO_BAND_FILTER_DEPTH <bitdepth>
; Emits the five band-filter widths for one bit depth using 16-byte vectors:
; each width needs width*2/16 vectors per row.
%macro HEVC_SAO_BAND_FILTER_DEPTH 1
HEVC_SAO_BAND_FILTER %1,  8, 1
HEVC_SAO_BAND_FILTER %1, 16, 2
HEVC_SAO_BAND_FILTER %1, 32, 4
HEVC_SAO_BAND_FILTER %1, 48, 6
HEVC_SAO_BAND_FILTER %1, 64, 8
%endmacro

; HEVC_SAO_BAND_FILTER_FUNCS
; Emits the complete set of 128-bit band-filter functions (10-bit first,
; then 12-bit) for whichever instruction set is currently selected.
%macro HEVC_SAO_BAND_FILTER_FUNCS 0
HEVC_SAO_BAND_FILTER_DEPTH 10

HEVC_SAO_BAND_FILTER_DEPTH 12
%endmacro
164cabdff1aSopenharmony_ci
; Band filter instantiations.
; Macro arguments are <bitdepth>, <width>, <vectors per row>; the vector
; count is width * 2 bytes divided by the vector size in use.

; 128-bit versions, all widths and both bit depths.
INIT_XMM sse2
HEVC_SAO_BAND_FILTER_FUNCS
INIT_XMM avx
HEVC_SAO_BAND_FILTER_FUNCS

%if HAVE_AVX2_EXTERNAL
; AVX2 versions. A row of 8 samples is only 16 bytes, so the 8-wide function
; stays on XMM registers; the wider ones use YMM and therefore need half as
; many vectors per row as the 128-bit versions.
INIT_XMM avx2
HEVC_SAO_BAND_FILTER 10,  8, 1
INIT_YMM avx2
HEVC_SAO_BAND_FILTER 10, 16, 1
HEVC_SAO_BAND_FILTER 10, 32, 2
HEVC_SAO_BAND_FILTER 10, 48, 3
HEVC_SAO_BAND_FILTER 10, 64, 4

INIT_XMM avx2
HEVC_SAO_BAND_FILTER 12,  8, 1
INIT_YMM avx2
HEVC_SAO_BAND_FILTER 12, 16, 1
HEVC_SAO_BAND_FILTER 12, 32, 2
HEVC_SAO_BAND_FILTER 12, 48, 3
HEVC_SAO_BAND_FILTER 12, 64, 4
%endif
187cabdff1aSopenharmony_ci
188cabdff1aSopenharmony_ci;******************************************************************************
189cabdff1aSopenharmony_ci;SAO Edge Filter
190cabdff1aSopenharmony_ci;******************************************************************************
191cabdff1aSopenharmony_ci
; Constants describing the source rows read by the edge filter.
; NOTE(review): the factor 2 below presumably is the size in bytes of one
; 16-bit sample - confirm against the C caller.
%define MAX_PB_SIZE  64
%define PADDING_SIZE 64 ; AV_INPUT_BUFFER_PADDING_SIZE
; Row pitch in bytes: the edge filter advances srcq by this amount per row.
; Kept in parentheses so the sum expands as a single operand. Without them,
; "EDGE_SRCSTRIDE >> 1" only gives the intended result because NASM shifts
; bind looser than "+", and e.g. "EDGE_SRCSTRIDE * 2" would silently be wrong.
%define EDGE_SRCSTRIDE (2 * MAX_PB_SIZE + PADDING_SIZE)
195cabdff1aSopenharmony_ci
; PMINUW dst, a, b, tmp
; dst = per-word unsigned minimum of a and b.
; pminuw is only emitted when the sse4 cpuflag is set; otherwise the minimum
; is formed as a - saturating_sub(a, b), which clobbers tmp. With sse4, tmp
; is left untouched.
%macro PMINUW 4
%if notcpuflag(sse4)
    psubusw           %4, %2, %3            ; tmp = max(a - b, 0)
    psubw             %1, %2, %4            ; dst = a - tmp = min(a, b)
%else
    pminuw            %1, %2, %3
%endif
%endmacro
204cabdff1aSopenharmony_ci
; HEVC_SAO_EDGE_FILTER_INIT
;
; Loads the eo argument and turns the matching pb_eo row into two signed
; neighbour offsets, counted in samples:
;   a_strideq = a.y * (EDGE_SRCSTRIDE >> 1) + a.x
;   b_strideq = b.y * (EDGE_SRCSTRIDE >> 1) + b.x
; The caller has to provide the names eoq, a_strideq, b_strideq, tmpq and
; tmp2q; tmpq and tmp2q are clobbered.
%macro HEVC_SAO_EDGE_FILTER_INIT 0
    ; eo is the fifth C argument; where it lives depends on the ABI.
%if ARCH_X86_32
    mov              eoq, r4m
%elif WIN64
    movsxd           eoq, dword eom
%else
    movsxd           eoq, eod
%endif
    lea            tmp2q, [pb_eo]

    ; Neighbour a: vertical step scaled to rows, plus the horizontal step.
    movsx      a_strideq, byte [tmp2q+eoq*4+1]
    imul       a_strideq, EDGE_SRCSTRIDE >> 1
    movsx           tmpq, byte [tmp2q+eoq*4]
    add        a_strideq, tmpq

    ; Neighbour b, same recipe using the second byte pair of the row.
    movsx      b_strideq, byte [tmp2q+eoq*4+3]
    imul       b_strideq, EDGE_SRCSTRIDE >> 1
    movsx           tmpq, byte [tmp2q+eoq*4+2]
    add        b_strideq, tmpq
%endmacro
223cabdff1aSopenharmony_ci
;void ff_hevc_sao_edge_filter_<width>_<depth>_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
;                                                   int eo, int width, int height);
;
; HEVC_SAO_EDGE_FILTER <bitdepth>, <width>, <vectors per row>
;
; Emits one edge-filter function. <width> only appears in the symbol name;
; each row processes <vectors per row> * mmsize bytes and the C `width`
; argument is never read. The source advances by the fixed EDGE_SRCSTRIDE
; per row; only the destination has a run-time stride.
;
; For every 16-bit sample c with neighbours a and b (picked by eo):
;   s    = sign(c - a) + sign(c - b)          ; unsigned compares, s in -2..2
;   c   += offset word selected by s:
;          -2 -> [offset+2], -1 -> [offset+4], 0 -> [offset+0],
;          +1 -> [offset+6], +2 -> [offset+8]
;   c    = CLIPW(c, 0, pw_mask<bitdepth>)
; The centre row is loaded with mova (aligned), the neighbours with movu.
; SPLATW and CLIPW come from x86util.asm.
%macro HEVC_SAO_EDGE_FILTER 3
%if ARCH_X86_64
cglobal hevc_sao_edge_filter_%2_%1, 4, 9, 16, dst, src, dststride, offset, eo, a_stride, b_stride, height, tmp
; The INIT macro needs a second scratch register; heightq is free until the
; row count is loaded right after it.
%define tmp2q heightq
    HEVC_SAO_EDGE_FILTER_INIT
    mov          heightd, r6m
    ; Convert the neighbour offsets from samples to bytes (2 bytes per sample).
    add        a_strideq, a_strideq
    add        b_strideq, b_strideq

%else ; ARCH_X86_32
cglobal hevc_sao_edge_filter_%2_%1, 1, 6, 8, 5*mmsize, dst, src, dststride, a_stride, b_stride, height
; Only six GPRs are requested, so several names share a register:
;   eoq lives in srcq and tmp2q in dststrideq until INIT is done, after which
;   src and dststride are loaded; tmpq and offsetq both live in heightq, and
;   height itself is loaded last, once the offsets have been read.
%define eoq   srcq
%define tmpq  heightq
%define tmp2q dststrideq
%define offsetq heightq
; Only m0-m7 are requested: the replicated offsets are built in m1-m5 and
; then spilled, because the loop reuses those registers.
%define m8 m1
%define m9 m2
%define m10 m3
%define m11 m4
%define m12 m5
    HEVC_SAO_EDGE_FILTER_INIT
    mov             srcq, srcm
    mov          offsetq, r3m
    mov       dststrideq, dststridem
    ; Convert the neighbour offsets from samples to bytes (2 bytes per sample).
    add        a_strideq, a_strideq
    add        b_strideq, b_strideq

%endif ; ARCH

; Replicate the five offset words: m8 <- [offset+2], m9 <- [offset+4],
; m10 <- [offset+0], m11 <- [offset+6], m12 <- [offset+8].
%if mmsize > 16
    SPLATW            m8, [offsetq+2]
    SPLATW            m9, [offsetq+4]
    SPLATW           m10, [offsetq+0]
    SPLATW           m11, [offsetq+6]
    SPLATW           m12, [offsetq+8]
%else
    ; Two narrow loads (words 0-3 and words 3-4), then replicate by index.
    ; m10 and m12 are overwritten last because they also hold the raw loads.
    movq             m10, [offsetq+0]
    movd             m12, [offsetq+6]
    SPLATW            m8, xm10, 1
    SPLATW            m9, xm10, 2
    SPLATW           m10, xm10, 0
    SPLATW           m11, xm12, 0
    SPLATW           m12, xm12, 1
%endif
    ; m0 = 0: compared against s == 0 and used as the lower clip bound.
    pxor              m0, m0
%if ARCH_X86_64
    ; Keep the comparison constants -1, 1 and 2 in registers.
    mova             m13, [pw_m1]
    mova             m14, [pw_1]
    mova             m15, [pw_2]
%else
    ; offsetq shares its register with heightq, so load height only now.
    mov          heightd, r6m
    mova  [rsp+mmsize*0], m8
    mova  [rsp+mmsize*1], m9
    mova  [rsp+mmsize*2], m10
    mova  [rsp+mmsize*3], m11
    mova  [rsp+mmsize*4], m12
%endif

align 16
.loop:

; i = byte offset of the current vector within the row.
%assign i 0
%rep %3
    mova              m1, [srcq + i]
    movu              m2, [srcq+a_strideq + i]
    movu              m3, [srcq+b_strideq + i]
    ; m4 = min(c, a), m5 = min(c, b) (unsigned); m6/m7 are scratch.
    PMINUW            m4, m1, m2, m6
    PMINUW            m5, m1, m3, m7
    ; m2 = (a <= c), m3 = (b <= c), m4 = (c <= a), m5 = (c <= b) as 0/-1 masks.
    pcmpeqw           m2, m4
    pcmpeqw           m3, m5
    pcmpeqw           m4, m1
    pcmpeqw           m5, m1
    ; Mask differences give sign(c - a) in m4 and sign(c - b) in m5.
    psubw             m4, m2
    psubw             m5, m3

    ; m4 = s; build one mask per possible value of s.
    paddw             m4, m5
    pcmpeqw           m2, m4, [pw_m2]
%if ARCH_X86_64
    pcmpeqw           m3, m4, m13
    pcmpeqw           m5, m4, m0
    pcmpeqw           m6, m4, m14
    pcmpeqw           m7, m4, m15
    ; Keep the offset belonging to the value of s that matched.
    pand              m2, m8
    pand              m3, m9
    pand              m5, m10
    pand              m6, m11
    pand              m7, m12
%else
    ; Same selection with the constants and offsets read from memory.
    pcmpeqw           m3, m4, [pw_m1]
    pcmpeqw           m5, m4, m0
    pcmpeqw           m6, m4, [pw_1]
    pcmpeqw           m7, m4, [pw_2]
    pand              m2, [rsp+mmsize*0]
    pand              m3, [rsp+mmsize*1]
    pand              m5, [rsp+mmsize*2]
    pand              m6, [rsp+mmsize*3]
    pand              m7, [rsp+mmsize*4]
%endif
    ; Exactly one of the five masked terms is non-zero per lane: add it to c.
    paddw             m2, m3
    paddw             m5, m6
    paddw             m2, m7
    paddw             m2, m1
    paddw             m2, m5
    CLIPW             m2, m0, [pw_mask %+ %1]
    mova      [dstq + i], m2
%assign i i+mmsize
%endrep

    ; Next row; loop while the remaining row count is still positive.
    add             dstq, dststrideq
    add             srcq, EDGE_SRCSTRIDE
    dec          heightd
    jg .loop
    RET
%endmacro
340cabdff1aSopenharmony_ci
; Edge filter instantiations.
; Macro arguments are <bitdepth>, <width>, <vectors per row>; the vector
; count is width * 2 bytes divided by the vector size in use.

; 128-bit SSE2 versions.
INIT_XMM sse2
HEVC_SAO_EDGE_FILTER 10,  8, 1
HEVC_SAO_EDGE_FILTER 10, 16, 2
HEVC_SAO_EDGE_FILTER 10, 32, 4
HEVC_SAO_EDGE_FILTER 10, 48, 6
HEVC_SAO_EDGE_FILTER 10, 64, 8

HEVC_SAO_EDGE_FILTER 12,  8, 1
HEVC_SAO_EDGE_FILTER 12, 16, 2
HEVC_SAO_EDGE_FILTER 12, 32, 4
HEVC_SAO_EDGE_FILTER 12, 48, 6
HEVC_SAO_EDGE_FILTER 12, 64, 8

%if HAVE_AVX2_EXTERNAL
; AVX2 versions. A row of 8 samples is only 16 bytes, so the 8-wide function
; stays on XMM registers; the wider ones use YMM and therefore need half as
; many vectors per row as the SSE2 versions.
INIT_XMM avx2
HEVC_SAO_EDGE_FILTER 10,  8, 1
INIT_YMM avx2
HEVC_SAO_EDGE_FILTER 10, 16, 1
HEVC_SAO_EDGE_FILTER 10, 32, 2
HEVC_SAO_EDGE_FILTER 10, 48, 3
HEVC_SAO_EDGE_FILTER 10, 64, 4

INIT_XMM avx2
HEVC_SAO_EDGE_FILTER 12,  8, 1
INIT_YMM avx2
HEVC_SAO_EDGE_FILTER 12, 16, 1
HEVC_SAO_EDGE_FILTER 12, 32, 2
HEVC_SAO_EDGE_FILTER 12, 48, 3
HEVC_SAO_EDGE_FILTER 12, 64, 4
%endif
371