1cabdff1aSopenharmony_ci;*****************************************************************************
2cabdff1aSopenharmony_ci;* MMX/SSE2/AVX-optimized 10-bit H.264 deblocking code
3cabdff1aSopenharmony_ci;*****************************************************************************
4cabdff1aSopenharmony_ci;* Copyright (C) 2005-2011 x264 project
5cabdff1aSopenharmony_ci;*
6cabdff1aSopenharmony_ci;* Authors: Oskar Arvidsson <oskar@irock.se>
7cabdff1aSopenharmony_ci;*          Loren Merritt <lorenm@u.washington.edu>
8cabdff1aSopenharmony_ci;*          Fiona Glaser <fiona@x264.com>
9cabdff1aSopenharmony_ci;*
10cabdff1aSopenharmony_ci;* This file is part of FFmpeg.
11cabdff1aSopenharmony_ci;*
12cabdff1aSopenharmony_ci;* FFmpeg is free software; you can redistribute it and/or
13cabdff1aSopenharmony_ci;* modify it under the terms of the GNU Lesser General Public
14cabdff1aSopenharmony_ci;* License as published by the Free Software Foundation; either
15cabdff1aSopenharmony_ci;* version 2.1 of the License, or (at your option) any later version.
16cabdff1aSopenharmony_ci;*
17cabdff1aSopenharmony_ci;* FFmpeg is distributed in the hope that it will be useful,
18cabdff1aSopenharmony_ci;* but WITHOUT ANY WARRANTY; without even the implied warranty of
19cabdff1aSopenharmony_ci;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20cabdff1aSopenharmony_ci;* Lesser General Public License for more details.
21cabdff1aSopenharmony_ci;*
22cabdff1aSopenharmony_ci;* You should have received a copy of the GNU Lesser General Public
23cabdff1aSopenharmony_ci;* License along with FFmpeg; if not, write to the Free Software
24cabdff1aSopenharmony_ci;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25cabdff1aSopenharmony_ci;******************************************************************************
26cabdff1aSopenharmony_ci
27cabdff1aSopenharmony_ci%include "libavutil/x86/x86util.asm"
28cabdff1aSopenharmony_ci
29cabdff1aSopenharmony_ciSECTION .text
30cabdff1aSopenharmony_ci
31cabdff1aSopenharmony_cicextern pw_2
32cabdff1aSopenharmony_cicextern pw_3
33cabdff1aSopenharmony_cicextern pw_4
34cabdff1aSopenharmony_cicextern pw_1023
35cabdff1aSopenharmony_ci%define pw_pixel_max pw_1023
36cabdff1aSopenharmony_ci
37cabdff1aSopenharmony_ci; out: %4 = |%1-%2|-%3 (per 16-bit lane)
38cabdff1aSopenharmony_ci; clobbers: %5
; %1 and %2 are treated as unsigned words. As a signed word the result is
; negative when |%1-%2| < %3, provided the difference fits in signed 16 bits.
; %5 must not alias %1 or %2: it is written first and both inputs are read
; again by the second subtraction.
39cabdff1aSopenharmony_ci%macro ABS_SUB 5
40cabdff1aSopenharmony_ci    psubusw %5, %2, %1 ; %5 = max(%2-%1, 0) (unsigned saturating subtract)
41cabdff1aSopenharmony_ci    psubusw %4, %1, %2 ; %4 = max(%1-%2, 0)
42cabdff1aSopenharmony_ci    por     %4, %5 ; at most one of the two is non-zero, so OR yields |%1-%2|
43cabdff1aSopenharmony_ci    psubw   %4, %3 ; wrapping subtract of the threshold
44cabdff1aSopenharmony_ci%endmacro
45cabdff1aSopenharmony_ci
46cabdff1aSopenharmony_ci; out: %4 = |%1-%2|<%3
; in:  %1, %2 = unsigned word vectors, %3 = threshold
; clobbers: %5
; %4 is an all-ones / all-zeros mask per 16-bit lane. %4 doubles as scratch
; before %1 and %2 are read a second time, so it must not alias either input.
47cabdff1aSopenharmony_ci%macro DIFF_LT   5
48cabdff1aSopenharmony_ci    psubusw %4, %2, %1 ; max(%2-%1, 0)
49cabdff1aSopenharmony_ci    psubusw %5, %1, %2 ; max(%1-%2, 0)
50cabdff1aSopenharmony_ci    por     %5, %4 ; |%1-%2|
51cabdff1aSopenharmony_ci    pxor    %4, %4 ; zero, left-hand side of the signed compare below
52cabdff1aSopenharmony_ci    psubw   %5, %3 ; |%1-%2|-%3
53cabdff1aSopenharmony_ci    pcmpgtw %4, %5 ; 0 > |%1-%2|-%3
54cabdff1aSopenharmony_ci%endmacro
55cabdff1aSopenharmony_ci
; LOAD_AB: load two scalar thresholds and broadcast each across a vector.
; in:  %3, %4 = scalar sources for movd (the callers in this file pass r2d/r3d,
;      i.e. alpha and beta after their left shift by 2)
; out: %1 = splatted %3, %2 = splatted %4
; SPLATW comes from x86util.asm; presumably it replicates the low word of its
; source into every word lane -- confirm there.
56cabdff1aSopenharmony_ci%macro LOAD_AB 4
57cabdff1aSopenharmony_ci    movd       %1, %3
58cabdff1aSopenharmony_ci    movd       %2, %4
59cabdff1aSopenharmony_ci    SPLATW     %1, %1
60cabdff1aSopenharmony_ci    SPLATW     %2, %2
61cabdff1aSopenharmony_ci%endmacro
62cabdff1aSopenharmony_ci
63cabdff1aSopenharmony_ci; in:  %2=register holding a pointer to the tc0 bytes (read as [%2])
64cabdff1aSopenharmony_ci; out: %1=splatted tc
; movd always reads 4 bytes from [%2]. With mmsize == 8 only byte 0 survives
; the shuffle (copied to all 4 words); otherwise byte 0 fills words 0-3 and
; byte 1 fills words 4-7.
; punpcklbw puts each byte in both halves of a word, so the arithmetic shift
; by 6 (instead of 8) leaves 4*tc0 for byte values 0..63 and keeps negative
; bytes negative.
65cabdff1aSopenharmony_ci%macro LOAD_TC 2
66cabdff1aSopenharmony_ci    movd        %1, [%2]
67cabdff1aSopenharmony_ci    punpcklbw   %1, %1 ; word i = byte i duplicated into low and high half
68cabdff1aSopenharmony_ci%if mmsize == 8
69cabdff1aSopenharmony_ci    pshufw      %1, %1, 0 ; word 0 -> all four words
70cabdff1aSopenharmony_ci%else
71cabdff1aSopenharmony_ci    pshuflw     %1, %1, 01010000b ; low words = w0 w0 w1 w1
72cabdff1aSopenharmony_ci    pshufd      %1, %1, 01010000b ; dwords = d0 d0 d1 d1 -> w0 x4, w1 x4
73cabdff1aSopenharmony_ci%endif
74cabdff1aSopenharmony_ci    psraw       %1, 6 ; sign-preserving scale, see header
75cabdff1aSopenharmony_ci%endmacro
76cabdff1aSopenharmony_ci
77cabdff1aSopenharmony_ci; in: %1=p1, %2=p0, %3=q0, %4=q1
78cabdff1aSopenharmony_ci;     %5=alpha, %6=beta, %7-%9=tmp
79cabdff1aSopenharmony_ci; out: %7=mask
; The mask is all-ones in each 16-bit lane where
;   |p0-q0| < alpha  and  |p1-p0| < beta  and  |q1-q0| < beta
; and zero elsewhere. The three "difference minus threshold" words are ANDed
; together: the sign bit of the AND is set only if all three are negative.
; %1-%4 are only read; %8 and %9 are clobbered; %7 is the ABS_SUB scratch
; until it is zeroed and receives the final mask.
80cabdff1aSopenharmony_ci%macro LOAD_MASK 9
81cabdff1aSopenharmony_ci    ABS_SUB     %2, %3, %5, %8, %7 ; |p0-q0| - alpha
82cabdff1aSopenharmony_ci    ABS_SUB     %1, %2, %6, %9, %7 ; |p1-p0| - beta
83cabdff1aSopenharmony_ci    pand        %8, %9
84cabdff1aSopenharmony_ci    ABS_SUB     %3, %4, %6, %9, %7 ; |q1-q0| - beta
85cabdff1aSopenharmony_ci    pxor        %7, %7
86cabdff1aSopenharmony_ci    pand        %8, %9 ; sign bit set only if all three terms are negative
87cabdff1aSopenharmony_ci    pcmpgtw     %7, %8 ; mask = 0 > combined
88cabdff1aSopenharmony_ci%endmacro
89cabdff1aSopenharmony_ci
90cabdff1aSopenharmony_ci; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
91cabdff1aSopenharmony_ci; out: %1=p0', %2=q0'
; %5 is used as the symmetric clip bound for the delta: the callers in this
; file pass the adjusted tc already ANDed with the filter mask, so lanes that
; are masked off get a bound of 0.
;   delta = clip(((q0-p0)*4 + (p1-q1) + 4) >> 3, -%5, %5)
;   p0'   = clip(p0 + delta, 0, pw_pixel_max)
;   q0'   = clip(q0 - delta, 0, pw_pixel_max)
; clobbers: %3 (ends up holding delta), %6, %7. %4 and %5 are only read.
; CLIPW x, lo, hi comes from x86util.asm and is taken to clamp x to [lo, hi].
92cabdff1aSopenharmony_ci%macro DEBLOCK_P0_Q0 7
93cabdff1aSopenharmony_ci    psubw   %3, %4 ; p1-q1
94cabdff1aSopenharmony_ci    pxor    %7, %7
95cabdff1aSopenharmony_ci    paddw   %3, [pw_4] ; rounding term for the >>3 below
96cabdff1aSopenharmony_ci    psubw   %7, %5 ; %7 = -%5, lower clip bound
97cabdff1aSopenharmony_ci    psubw   %6, %2, %1 ; q0-p0
98cabdff1aSopenharmony_ci    psllw   %6, 2 ; (q0-p0)*4
99cabdff1aSopenharmony_ci    paddw   %3, %6
100cabdff1aSopenharmony_ci    psraw   %3, 3 ; arithmetic shift: delta is signed
101cabdff1aSopenharmony_ci    mova    %6, [pw_pixel_max]
102cabdff1aSopenharmony_ci    CLIPW   %3, %7, %5 ; delta limited to [-%5, %5]
103cabdff1aSopenharmony_ci    pxor    %7, %7 ; %7 = 0, lower bound for the pixel clamp
104cabdff1aSopenharmony_ci    paddw   %1, %3
105cabdff1aSopenharmony_ci    psubw   %2, %3
106cabdff1aSopenharmony_ci    CLIPW   %1, %7, %6
107cabdff1aSopenharmony_ci    CLIPW   %2, %7, %6
108cabdff1aSopenharmony_ci%endmacro
109cabdff1aSopenharmony_ci
110cabdff1aSopenharmony_ci; in: %1=x2, %2=x1, %3=p0, %4=q0 %5=mask&tc, %6=tmp
; out: %1 = x1 + clip(((x2 + ((p0+q0+1)>>1)) >> 1) - x1, -%5, %5)
; The callers in this file pass p2/p1 or q2/q1 as x2/x1, so the filtered
; second sample from the edge replaces x2 in %1. %2-%5 are only read, %6 is
; clobbered. CLIPW comes from x86util.asm (taken to clamp to [lo, hi]).
111cabdff1aSopenharmony_ci%macro LUMA_Q1 6
112cabdff1aSopenharmony_ci    pavgw       %6, %3, %4      ; (p0+q0+1)>>1
113cabdff1aSopenharmony_ci    paddw       %1, %6
114cabdff1aSopenharmony_ci    pxor        %6, %6
115cabdff1aSopenharmony_ci    psraw       %1, 1
116cabdff1aSopenharmony_ci    psubw       %6, %5 ; %6 = -%5, lower clip bound
117cabdff1aSopenharmony_ci    psubw       %1, %2 ; difference to x1
118cabdff1aSopenharmony_ci    CLIPW       %1, %6, %5
119cabdff1aSopenharmony_ci    paddw       %1, %2 ; x1 + clipped difference
120cabdff1aSopenharmony_ci%endmacro
121cabdff1aSopenharmony_ci
; LUMA_DEBLOCK_ONE: filter the second sample on one side of the edge.
; in:  %1 = x0 (p0 or q0), %2 = x1 (p1 or q1), %3 = slot that receives the
;      |x2-x0| < beta mask
;      implicit: m5 = x2, m1 = p0, m2 = q0, m7 = filter mask, and the
;      caller-defined operands bm (beta) and tcm (tc)
; out: m5 = x1', %3 = the |x2-x0| < beta mask
; clobbers: m4, m6
; The clip bound handed to LUMA_Q1 is tc in lanes where the beta test holds,
; tc is not negative and the filter mask is set, and 0 in all other lanes.
122cabdff1aSopenharmony_ci%macro LUMA_DEBLOCK_ONE 3
123cabdff1aSopenharmony_ci    DIFF_LT     m5, %1, bm, m4, m6 ; m4 = |x2-x0| < beta
124cabdff1aSopenharmony_ci    pxor        m6, m6
125cabdff1aSopenharmony_ci    mova        %3, m4 ; keep the mask for the caller
126cabdff1aSopenharmony_ci    pcmpgtw     m6, tcm ; m6 = 0 > tc
127cabdff1aSopenharmony_ci    pand        m4, tcm
128cabdff1aSopenharmony_ci    pandn       m6, m7 ; m6 = filter mask where tc >= 0
129cabdff1aSopenharmony_ci    pand        m4, m6
130cabdff1aSopenharmony_ci    LUMA_Q1 m5, %2, m1, m2, m4, m6
131cabdff1aSopenharmony_ci%endmacro
132cabdff1aSopenharmony_ci
; LUMA_H_STORE: write the transposed m0-m3 back as 8-byte row segments that
; start 4 bytes before r0 on each row.
; in:  m0-m3 = transposed data, r0 = row pointer, r1 = row stride
;      %1 = pointer used for rows 3..7 (callers here pass r0 + 3*r1)
;      %2 = offset used as 3*r1 (callers here pass 3*stride)
; mmsize == 8:  4 rows, one register per row; %1 is not referenced.
; otherwise:    8 rows, the low half (movq) and high half (movhps) of each
;               register go to consecutive rows.
133cabdff1aSopenharmony_ci%macro LUMA_H_STORE 2
134cabdff1aSopenharmony_ci%if mmsize == 8
135cabdff1aSopenharmony_ci    movq        [r0-4], m0
136cabdff1aSopenharmony_ci    movq        [r0+r1-4], m1
137cabdff1aSopenharmony_ci    movq        [r0+r1*2-4], m2
138cabdff1aSopenharmony_ci    movq        [r0+%2-4], m3
139cabdff1aSopenharmony_ci%else
140cabdff1aSopenharmony_ci    movq        [r0-4], m0 ; row 0
141cabdff1aSopenharmony_ci    movhps      [r0+r1-4], m0 ; row 1
142cabdff1aSopenharmony_ci    movq        [r0+r1*2-4], m1 ; row 2
143cabdff1aSopenharmony_ci    movhps      [%1-4], m1 ; row 3
144cabdff1aSopenharmony_ci    movq        [%1+r1-4], m2 ; row 4
145cabdff1aSopenharmony_ci    movhps      [%1+r1*2-4], m2 ; row 5
146cabdff1aSopenharmony_ci    movq        [%1+%2-4], m3 ; row 6
147cabdff1aSopenharmony_ci    movhps      [%1+r1*4-4], m3 ; row 7
148cabdff1aSopenharmony_ci%endif
149cabdff1aSopenharmony_ci%endmacro
150cabdff1aSopenharmony_ci
; DEBLOCK_LUMA: emits deblock_v_luma_10 and deblock_h_luma_10 for the current
; mmsize (both the mmsize == 8 and the wider path are handled below).
; Only m0-m7 are used; tc, the two beta masks and alpha/beta live in stack
; slots addressed through the tcm/ms1/ms2/am/bm (and p1m/p2m) defines.
; Register roles on entry (cglobal argument order): r0 = pix, r1 = stride
; (used directly as a byte offset between rows), r2 = alpha, r3 = beta,
; r4 = tc0 pointer.
151cabdff1aSopenharmony_ci%macro DEBLOCK_LUMA 0
152cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
153cabdff1aSopenharmony_ci; void ff_deblock_v_luma_10(uint16_t *pix, int stride, int alpha, int beta,
154cabdff1aSopenharmony_ci;                           int8_t *tc0)
155cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
156cabdff1aSopenharmony_cicglobal deblock_v_luma_10, 5,5,8*(mmsize/16)
157cabdff1aSopenharmony_ci    %assign pad 5*mmsize+12-(stack_offset&15) ; 5 vector slots plus alignment slack
158cabdff1aSopenharmony_ci    %define tcm [rsp]
159cabdff1aSopenharmony_ci    %define ms1 [rsp+mmsize]
160cabdff1aSopenharmony_ci    %define ms2 [rsp+mmsize*2]
161cabdff1aSopenharmony_ci    %define am  [rsp+mmsize*3]
162cabdff1aSopenharmony_ci    %define bm  [rsp+mmsize*4]
163cabdff1aSopenharmony_ci    SUB        rsp, pad
164cabdff1aSopenharmony_ci    shl        r2d, 2 ; alpha *= 4 (presumably rescales the 8-bit threshold to 10-bit samples)
165cabdff1aSopenharmony_ci    shl        r3d, 2 ; beta *= 4
166cabdff1aSopenharmony_ci    LOAD_AB     m4, m5, r2d, r3d
167cabdff1aSopenharmony_ci    mov         r3, 32/mmsize ; loop count: 32 bytes of each row, mmsize bytes per pass
168cabdff1aSopenharmony_ci    mov         r2, r0 ; r2 = pix (q0 row)
169cabdff1aSopenharmony_ci    sub         r0, r1
170cabdff1aSopenharmony_ci    mova        am, m4
171cabdff1aSopenharmony_ci    sub         r0, r1
172cabdff1aSopenharmony_ci    mova        bm, m5
173cabdff1aSopenharmony_ci    sub         r0, r1 ; r0 = pix - 3*stride (p2 row)
174cabdff1aSopenharmony_ci.loop:
175cabdff1aSopenharmony_ci    mova        m0, [r0+r1] ; p1
176cabdff1aSopenharmony_ci    mova        m1, [r0+r1*2] ; p0
177cabdff1aSopenharmony_ci    mova        m2, [r2] ; q0
178cabdff1aSopenharmony_ci    mova        m3, [r2+r1] ; q1
179cabdff1aSopenharmony_ci
180cabdff1aSopenharmony_ci    LOAD_MASK   m0, m1, m2, m3, am, bm, m7, m4, m6 ; m7 = filter mask
181cabdff1aSopenharmony_ci    LOAD_TC     m6, r4
182cabdff1aSopenharmony_ci    mova       tcm, m6
183cabdff1aSopenharmony_ci
184cabdff1aSopenharmony_ci    mova        m5, [r0] ; p2
185cabdff1aSopenharmony_ci    LUMA_DEBLOCK_ONE m1, m0, ms1 ; m5 = p1', ms1 = |p2-p0| < beta
186cabdff1aSopenharmony_ci    mova   [r0+r1], m5
187cabdff1aSopenharmony_ci
188cabdff1aSopenharmony_ci    mova        m5, [r2+r1*2] ; q2
189cabdff1aSopenharmony_ci    LUMA_DEBLOCK_ONE m2, m3, ms2 ; m5 = q1', ms2 = |q2-q0| < beta
190cabdff1aSopenharmony_ci    mova   [r2+r1], m5
191cabdff1aSopenharmony_ci
192cabdff1aSopenharmony_ci    pxor        m5, m5
193cabdff1aSopenharmony_ci    mova        m6, tcm
194cabdff1aSopenharmony_ci    pcmpgtw     m5, tcm ; m5 = 0 > tc
195cabdff1aSopenharmony_ci    psubw       m6, ms1 ; masks are -1 per lane, so this adds 1 where the p-side test held
196cabdff1aSopenharmony_ci    pandn       m5, m7 ; filter mask where tc >= 0
197cabdff1aSopenharmony_ci    psubw       m6, ms2 ; likewise +1 where the q-side test held
198cabdff1aSopenharmony_ci    pand        m5, m6 ; clip bound for the p0/q0 delta, 0 in disabled lanes
199cabdff1aSopenharmony_ci    DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
200cabdff1aSopenharmony_ci    mova [r0+r1*2], m1
201cabdff1aSopenharmony_ci    mova      [r2], m2
202cabdff1aSopenharmony_ci
203cabdff1aSopenharmony_ci    add         r0, mmsize
204cabdff1aSopenharmony_ci    add         r2, mmsize
205cabdff1aSopenharmony_ci    add         r4, mmsize/8 ; one tc0 byte per 8 bytes of row data
206cabdff1aSopenharmony_ci    dec         r3
207cabdff1aSopenharmony_ci    jg .loop
208cabdff1aSopenharmony_ci    ADD         rsp, pad
209cabdff1aSopenharmony_ci    RET
210cabdff1aSopenharmony_ci
; deblock_h_luma_10 takes the same five arguments as deblock_v_luma_10 above.
; Each pass loads mmsize/2 rows around r0, transposes them so that the
; LOAD_MASK inputs end up in m0-m3 with the outer samples in m5 and m6,
; filters, transposes back and stores through LUMA_H_STORE.
211cabdff1aSopenharmony_cicglobal deblock_h_luma_10, 5,6,8*(mmsize/16)
212cabdff1aSopenharmony_ci    %assign pad 7*mmsize+12-(stack_offset&15) ; 7 vector slots plus alignment slack
213cabdff1aSopenharmony_ci    %define tcm [rsp]
214cabdff1aSopenharmony_ci    %define ms1 [rsp+mmsize]
215cabdff1aSopenharmony_ci    %define ms2 [rsp+mmsize*2]
216cabdff1aSopenharmony_ci    %define p1m [rsp+mmsize*3]
217cabdff1aSopenharmony_ci    %define p2m [rsp+mmsize*4]
218cabdff1aSopenharmony_ci    %define am  [rsp+mmsize*5]
219cabdff1aSopenharmony_ci    %define bm  [rsp+mmsize*6]
220cabdff1aSopenharmony_ci    SUB        rsp, pad
221cabdff1aSopenharmony_ci    shl        r2d, 2
222cabdff1aSopenharmony_ci    shl        r3d, 2
223cabdff1aSopenharmony_ci    LOAD_AB     m4, m5, r2d, r3d
224cabdff1aSopenharmony_ci    mov         r3, r1
225cabdff1aSopenharmony_ci    mova        am, m4
226cabdff1aSopenharmony_ci    add         r3, r1
227cabdff1aSopenharmony_ci    mov         r5, 32/mmsize ; loop count
228cabdff1aSopenharmony_ci    mova        bm, m5
229cabdff1aSopenharmony_ci    add         r3, r1 ; r3 = 3*stride
230cabdff1aSopenharmony_ci%if mmsize == 16
231cabdff1aSopenharmony_ci    mov         r2, r0
232cabdff1aSopenharmony_ci    add         r2, r3 ; r2 = pix + 3*stride, base for rows 3..7
233cabdff1aSopenharmony_ci%endif
234cabdff1aSopenharmony_ci.loop:
235cabdff1aSopenharmony_ci%if mmsize == 8
236cabdff1aSopenharmony_ci    movq        m2, [r0-8]     ; y q2 q1 q0
237cabdff1aSopenharmony_ci    movq        m7, [r0+0]
238cabdff1aSopenharmony_ci    movq        m5, [r0+r1-8]
239cabdff1aSopenharmony_ci    movq        m3, [r0+r1+0]
240cabdff1aSopenharmony_ci    movq        m0, [r0+r1*2-8]
241cabdff1aSopenharmony_ci    movq        m6, [r0+r1*2+0]
242cabdff1aSopenharmony_ci    movq        m1, [r0+r3-8]
243cabdff1aSopenharmony_ci    TRANSPOSE4x4W 2, 5, 0, 1, 4
244cabdff1aSopenharmony_ci    SWAP         2, 7
245cabdff1aSopenharmony_ci    movq        m7, [r0+r3]
246cabdff1aSopenharmony_ci    TRANSPOSE4x4W 2, 3, 6, 7, 4
247cabdff1aSopenharmony_ci%else
248cabdff1aSopenharmony_ci    movu        m5, [r0-8]     ; y q2 q1 q0 p0 p1 p2 x
249cabdff1aSopenharmony_ci    movu        m0, [r0+r1-8]
250cabdff1aSopenharmony_ci    movu        m2, [r0+r1*2-8]
251cabdff1aSopenharmony_ci    movu        m3, [r2-8]
252cabdff1aSopenharmony_ci    TRANSPOSE4x4W 5, 0, 2, 3, 6
253cabdff1aSopenharmony_ci    mova       tcm, m3 ; tcm slot used as a spill here; the real tc is stored later
254cabdff1aSopenharmony_ci
255cabdff1aSopenharmony_ci    movu        m4, [r2+r1-8]
256cabdff1aSopenharmony_ci    movu        m1, [r2+r1*2-8]
257cabdff1aSopenharmony_ci    movu        m3, [r2+r3-8]
258cabdff1aSopenharmony_ci    movu        m7, [r2+r1*4-8]
259cabdff1aSopenharmony_ci    TRANSPOSE4x4W 4, 1, 3, 7, 6
260cabdff1aSopenharmony_ci
261cabdff1aSopenharmony_ci    mova        m6, tcm
262cabdff1aSopenharmony_ci    punpcklqdq  m6, m7
263cabdff1aSopenharmony_ci    punpckhqdq  m5, m4
264cabdff1aSopenharmony_ci    SBUTTERFLY qdq, 0, 1, 7
265cabdff1aSopenharmony_ci    SBUTTERFLY qdq, 2, 3, 7
266cabdff1aSopenharmony_ci%endif
267cabdff1aSopenharmony_ci
268cabdff1aSopenharmony_ci    mova       p2m, m6 ; park m6 (outer sample for the m2/m3 side) until the second LUMA_DEBLOCK_ONE
269cabdff1aSopenharmony_ci    LOAD_MASK   m0, m1, m2, m3, am, bm, m7, m4, m6 ; m7 = filter mask
270cabdff1aSopenharmony_ci    LOAD_TC     m6, r4
271cabdff1aSopenharmony_ci    mova       tcm, m6
272cabdff1aSopenharmony_ci
273cabdff1aSopenharmony_ci    LUMA_DEBLOCK_ONE m1, m0, ms1 ; m5 = filtered neighbour of m1
274cabdff1aSopenharmony_ci    mova       p1m, m5
275cabdff1aSopenharmony_ci
276cabdff1aSopenharmony_ci    mova        m5, p2m
277cabdff1aSopenharmony_ci    LUMA_DEBLOCK_ONE m2, m3, ms2 ; m5 = filtered neighbour of m2
278cabdff1aSopenharmony_ci    mova       p2m, m5
279cabdff1aSopenharmony_ci
280cabdff1aSopenharmony_ci    pxor        m5, m5
281cabdff1aSopenharmony_ci    mova        m6, tcm
282cabdff1aSopenharmony_ci    pcmpgtw     m5, tcm ; m5 = 0 > tc
283cabdff1aSopenharmony_ci    psubw       m6, ms1 ; +1 per lane where the first beta test held (mask is -1)
284cabdff1aSopenharmony_ci    pandn       m5, m7
285cabdff1aSopenharmony_ci    psubw       m6, ms2 ; +1 per lane where the second beta test held
286cabdff1aSopenharmony_ci    pand        m5, m6
287cabdff1aSopenharmony_ci    DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
288cabdff1aSopenharmony_ci    mova        m0, p1m ; reload the two filtered outer columns
289cabdff1aSopenharmony_ci    mova        m3, p2m
290cabdff1aSopenharmony_ci    TRANSPOSE4x4W 0, 1, 2, 3, 4
291cabdff1aSopenharmony_ci    LUMA_H_STORE r2, r3
292cabdff1aSopenharmony_ci
293cabdff1aSopenharmony_ci    add         r4, mmsize/8
294cabdff1aSopenharmony_ci    lea         r0, [r0+r1*(mmsize/2)] ; advance by the mmsize/2 rows just processed
295cabdff1aSopenharmony_ci    lea         r2, [r2+r1*(mmsize/2)]
296cabdff1aSopenharmony_ci    dec         r5
297cabdff1aSopenharmony_ci    jg .loop
298cabdff1aSopenharmony_ci    ADD        rsp, pad
299cabdff1aSopenharmony_ci    RET
300cabdff1aSopenharmony_ci%endmacro
301cabdff1aSopenharmony_ci
302cabdff1aSopenharmony_ci%if ARCH_X86_64
303cabdff1aSopenharmony_ci; in:  m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2
304cabdff1aSopenharmony_ci;      m12=alpha, m13=beta
;      r4 = pointer to the tc0 bytes (read by LOAD_TC)
305cabdff1aSopenharmony_ci; out: m0=p1', m3=q1', m1=p0', m2=q0'
306cabdff1aSopenharmony_ci; clobbers: m4, m5, m6, m7, m10, m11, m14
; Register-only variant of the luma filter: the masks stay in m7 (filter
; mask), m10 (|p2-p0| < beta) and m11 (|q2-q0| < beta) instead of stack
; slots. The final SWAPs rename registers so the filtered p1/q1, computed in
; m8/m9, are reported as m0/m3; m8 and m9 do not hold p2/q2 afterwards.
307cabdff1aSopenharmony_ci%macro DEBLOCK_LUMA_INTER_SSE2 0
308cabdff1aSopenharmony_ci    LOAD_MASK   m0, m1, m2, m3, m12, m13, m7, m4, m6
309cabdff1aSopenharmony_ci    LOAD_TC     m6, r4
310cabdff1aSopenharmony_ci    DIFF_LT     m8, m1, m13, m10, m4
311cabdff1aSopenharmony_ci    DIFF_LT     m9, m2, m13, m11, m4
312cabdff1aSopenharmony_ci    pand        m6, m7 ; tc in lanes enabled by the filter mask
313cabdff1aSopenharmony_ci
314cabdff1aSopenharmony_ci    mova       m14, m6 ; keep masked tc for the sign test below
315cabdff1aSopenharmony_ci    pxor        m4, m4
316cabdff1aSopenharmony_ci    pcmpgtw     m6, m4
317cabdff1aSopenharmony_ci    pand        m6, m14 ; masked tc where it is > 0, else 0
318cabdff1aSopenharmony_ci
319cabdff1aSopenharmony_ci    mova        m5, m10
320cabdff1aSopenharmony_ci    pand        m5, m6 ; clip bound for the p1 update
321cabdff1aSopenharmony_ci    LUMA_Q1 m8, m0, m1, m2, m5, m4 ; m8 = p1'
322cabdff1aSopenharmony_ci
323cabdff1aSopenharmony_ci    mova        m5, m11
324cabdff1aSopenharmony_ci    pand        m5, m6 ; clip bound for the q1 update
325cabdff1aSopenharmony_ci    LUMA_Q1 m9, m3, m1, m2, m5, m4 ; m9 = q1'
326cabdff1aSopenharmony_ci
327cabdff1aSopenharmony_ci    pxor        m4, m4
328cabdff1aSopenharmony_ci    psubw       m6, m10 ; masks are -1 per lane: +1 where |p2-p0| < beta
329cabdff1aSopenharmony_ci    pcmpgtw     m4, m14 ; 0 > masked tc
330cabdff1aSopenharmony_ci    pandn       m4, m7 ; filter mask where masked tc >= 0
331cabdff1aSopenharmony_ci    psubw       m6, m11 ; +1 where |q2-q0| < beta
332cabdff1aSopenharmony_ci    pand        m4, m6 ; clip bound for the p0/q0 delta
333cabdff1aSopenharmony_ci    DEBLOCK_P0_Q0 m1, m2, m0, m3, m4, m5, m6
334cabdff1aSopenharmony_ci
335cabdff1aSopenharmony_ci    SWAP         0, 8
336cabdff1aSopenharmony_ci    SWAP         3, 9
337cabdff1aSopenharmony_ci%endmacro
338cabdff1aSopenharmony_ci
; DEBLOCK_LUMA_64: x86-64 versions of deblock_v_luma_10 / deblock_h_luma_10
; that keep all state in vector registers (15 requested from cglobal) and
; share the filter core DEBLOCK_LUMA_INTER_SSE2. Arguments, in cglobal order:
; r0 = pix, r1 = stride (used as a byte offset), r2 = alpha, r3 = beta,
; r4 = tc0 pointer. Both loops run twice and advance tc0 by 2 bytes per pass.
339cabdff1aSopenharmony_ci%macro DEBLOCK_LUMA_64 0
340cabdff1aSopenharmony_cicglobal deblock_v_luma_10, 5,5,15
341cabdff1aSopenharmony_ci    %define p2 m8
342cabdff1aSopenharmony_ci    %define p1 m0
343cabdff1aSopenharmony_ci    %define p0 m1
344cabdff1aSopenharmony_ci    %define q0 m2
345cabdff1aSopenharmony_ci    %define q1 m3
346cabdff1aSopenharmony_ci    %define q2 m9
347cabdff1aSopenharmony_ci    %define mask0 m7
348cabdff1aSopenharmony_ci    %define mask1 m10
349cabdff1aSopenharmony_ci    %define mask2 m11
350cabdff1aSopenharmony_ci    shl        r2d, 2 ; alpha *= 4
351cabdff1aSopenharmony_ci    shl        r3d, 2 ; beta *= 4
352cabdff1aSopenharmony_ci    LOAD_AB    m12, m13, r2d, r3d
353cabdff1aSopenharmony_ci    mov         r2, r0 ; r2 = pix (q0 row)
354cabdff1aSopenharmony_ci    sub         r0, r1
355cabdff1aSopenharmony_ci    sub         r0, r1
356cabdff1aSopenharmony_ci    sub         r0, r1 ; r0 = pix - 3*stride (p2 row)
357cabdff1aSopenharmony_ci    mov         r3, 2 ; two passes of mmsize bytes per row
358cabdff1aSopenharmony_ci.loop:
359cabdff1aSopenharmony_ci    mova        p2, [r0]
360cabdff1aSopenharmony_ci    mova        p1, [r0+r1]
361cabdff1aSopenharmony_ci    mova        p0, [r0+r1*2]
362cabdff1aSopenharmony_ci    mova        q0, [r2]
363cabdff1aSopenharmony_ci    mova        q1, [r2+r1]
364cabdff1aSopenharmony_ci    mova        q2, [r2+r1*2]
365cabdff1aSopenharmony_ci    DEBLOCK_LUMA_INTER_SSE2
366cabdff1aSopenharmony_ci    mova   [r0+r1], p1 ; only p1, p0, q0, q1 are written back
367cabdff1aSopenharmony_ci    mova [r0+r1*2], p0
368cabdff1aSopenharmony_ci    mova      [r2], q0
369cabdff1aSopenharmony_ci    mova   [r2+r1], q1
370cabdff1aSopenharmony_ci    add         r0, mmsize
371cabdff1aSopenharmony_ci    add         r2, mmsize
372cabdff1aSopenharmony_ci    add         r4, 2
373cabdff1aSopenharmony_ci    dec         r3
374cabdff1aSopenharmony_ci    jg .loop
375cabdff1aSopenharmony_ci    REP_RET
376cabdff1aSopenharmony_ci
; Horizontal-direction variant: each pass loads 8 rows of 8 words starting
; 8 bytes before r0, transposes them into the register layout expected by
; DEBLOCK_LUMA_INTER_SSE2, filters, transposes m0-m3 back and stores them
; through LUMA_H_STORE.
377cabdff1aSopenharmony_cicglobal deblock_h_luma_10, 5,7,15
378cabdff1aSopenharmony_ci    shl        r2d, 2
379cabdff1aSopenharmony_ci    shl        r3d, 2
380cabdff1aSopenharmony_ci    LOAD_AB    m12, m13, r2d, r3d
381cabdff1aSopenharmony_ci    mov         r2, r1
382cabdff1aSopenharmony_ci    add         r2, r1
383cabdff1aSopenharmony_ci    add         r2, r1 ; r2 = 3*stride
384cabdff1aSopenharmony_ci    mov         r5, r0
385cabdff1aSopenharmony_ci    add         r5, r2 ; r5 = pix + 3*stride, base for rows 3..7
386cabdff1aSopenharmony_ci    mov         r6, 2 ; two passes of 8 rows
387cabdff1aSopenharmony_ci.loop:
388cabdff1aSopenharmony_ci    movu        m8, [r0-8]     ; y q2 q1 q0 p0 p1 p2 x
389cabdff1aSopenharmony_ci    movu        m0, [r0+r1-8]
390cabdff1aSopenharmony_ci    movu        m2, [r0+r1*2-8]
391cabdff1aSopenharmony_ci    movu        m9, [r5-8]
392cabdff1aSopenharmony_ci    movu        m5, [r5+r1-8]
393cabdff1aSopenharmony_ci    movu        m1, [r5+r1*2-8]
394cabdff1aSopenharmony_ci    movu        m3, [r5+r2-8]
395cabdff1aSopenharmony_ci    movu        m7, [r5+r1*4-8]
396cabdff1aSopenharmony_ci
397cabdff1aSopenharmony_ci    TRANSPOSE4x4W 8, 0, 2, 9, 10
398cabdff1aSopenharmony_ci    TRANSPOSE4x4W 5, 1, 3, 7, 10
399cabdff1aSopenharmony_ci
400cabdff1aSopenharmony_ci    punpckhqdq  m8, m5
401cabdff1aSopenharmony_ci    SBUTTERFLY qdq, 0, 1, 10
402cabdff1aSopenharmony_ci    SBUTTERFLY qdq, 2, 3, 10
403cabdff1aSopenharmony_ci    punpcklqdq  m9, m7
404cabdff1aSopenharmony_ci
405cabdff1aSopenharmony_ci    DEBLOCK_LUMA_INTER_SSE2
406cabdff1aSopenharmony_ci
407cabdff1aSopenharmony_ci    TRANSPOSE4x4W 0, 1, 2, 3, 4
408cabdff1aSopenharmony_ci    LUMA_H_STORE r5, r2
409cabdff1aSopenharmony_ci    add         r4, 2
410cabdff1aSopenharmony_ci    lea         r0, [r0+r1*8] ; next 8 rows
411cabdff1aSopenharmony_ci    lea         r5, [r5+r1*8]
412cabdff1aSopenharmony_ci    dec         r6
413cabdff1aSopenharmony_ci    jg .loop
414cabdff1aSopenharmony_ci    REP_RET
415cabdff1aSopenharmony_ci%endmacro
416cabdff1aSopenharmony_ci
; Emit the x86-64 luma functions once for SSE2 and, when HAVE_AVX_EXTERNAL
; is set, a second time for AVX.
417cabdff1aSopenharmony_ciINIT_XMM sse2
418cabdff1aSopenharmony_ciDEBLOCK_LUMA_64
419cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL
420cabdff1aSopenharmony_ciINIT_XMM avx
421cabdff1aSopenharmony_ciDEBLOCK_LUMA_64
422cabdff1aSopenharmony_ci%endif
423cabdff1aSopenharmony_ci%endif
424cabdff1aSopenharmony_ci
; SWAPMOVA dst, src: deliver src into dst.
; If dst is a plain identifier (a register name), SWAP is used; SWAP is the
; x86inc register-renaming macro, so presumably no instruction is emitted.
; Otherwise (e.g. a memory operand) the value is written with mova.
425cabdff1aSopenharmony_ci%macro SWAPMOVA 2
426cabdff1aSopenharmony_ci%ifid %1
427cabdff1aSopenharmony_ci    SWAP %1, %2
428cabdff1aSopenharmony_ci%else
429cabdff1aSopenharmony_ci    mova %1, %2
430cabdff1aSopenharmony_ci%endif
431cabdff1aSopenharmony_ci%endmacro
432cabdff1aSopenharmony_ci
433cabdff1aSopenharmony_ci; in: t0-t2: tmp registers
434cabdff1aSopenharmony_ci;     %1=p0 %2=p1 %3=p2 %4=p3 %5=q0 %6=q1 %7=mask0
435cabdff1aSopenharmony_ci;     %8=mask1p %9=2 %10=p0' %11=p1' %12=p2'
; %9 must hold the constant 2 in every word lane; %10-%12 are destinations
; written through SWAPMOVA (register or memory).
; Results per lane:
;   p1' = mask1p ? (p2 + p1 + p0 + q0 + 2) >> 2              : p1
;   p2' = mask1p ? (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3     : p2
;   p0' = mask1p ? (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
;       : mask0  ? (2*p1 + p0 + q1 + 2) >> 2                 : p0
; The p0' selection assumes mask1p is only set where mask0 is set, which is
; how the callers visible in this file build the masks.
436cabdff1aSopenharmony_ci%macro LUMA_INTRA_P012 12 ; p0..p3 in memory
437cabdff1aSopenharmony_ci%if ARCH_X86_64
438cabdff1aSopenharmony_ci    paddw     t0, %3, %2
439cabdff1aSopenharmony_ci    mova      t2, %4
440cabdff1aSopenharmony_ci    paddw     t2, %3
441cabdff1aSopenharmony_ci%else
442cabdff1aSopenharmony_ci    mova      t0, %3
443cabdff1aSopenharmony_ci    mova      t2, %4
444cabdff1aSopenharmony_ci    paddw     t0, %2
445cabdff1aSopenharmony_ci    paddw     t2, %3
446cabdff1aSopenharmony_ci%endif
447cabdff1aSopenharmony_ci    paddw     t0, %1
448cabdff1aSopenharmony_ci    paddw     t2, t2    ; 2*(p3 + p2)
449cabdff1aSopenharmony_ci    paddw     t0, %5
450cabdff1aSopenharmony_ci    paddw     t2, %9
451cabdff1aSopenharmony_ci    paddw     t0, %9    ; (p2 + p1 + p0 + q0 + 2)
452cabdff1aSopenharmony_ci    paddw     t2, t0    ; (2*p3 + 3*p2 + p1 + p0 + q0 + 4)
453cabdff1aSopenharmony_ci
454cabdff1aSopenharmony_ci    psrlw     t2, 3
455cabdff1aSopenharmony_ci    psrlw     t1, t0, 2
456cabdff1aSopenharmony_ci    psubw     t2, %3    ; blend as old + ((new - old) & mask)
457cabdff1aSopenharmony_ci    psubw     t1, %2
458cabdff1aSopenharmony_ci    pand      t2, %8
459cabdff1aSopenharmony_ci    pand      t1, %8
460cabdff1aSopenharmony_ci    paddw     t2, %3    ; p2'
461cabdff1aSopenharmony_ci    paddw     t1, %2    ; p1'
462cabdff1aSopenharmony_ci    SWAPMOVA %11, t1
463cabdff1aSopenharmony_ci
464cabdff1aSopenharmony_ci    psubw     t1, t0, %3
465cabdff1aSopenharmony_ci    paddw     t0, t0
466cabdff1aSopenharmony_ci    psubw     t1, %5
467cabdff1aSopenharmony_ci    psubw     t0, %3
468cabdff1aSopenharmony_ci    paddw     t1, %6
469cabdff1aSopenharmony_ci    paddw     t1, %2
470cabdff1aSopenharmony_ci    paddw     t0, %6
471cabdff1aSopenharmony_ci    psrlw     t1, 2     ; (2*p1 + p0 + q1 + 2)/4
472cabdff1aSopenharmony_ci    psrlw     t0, 3     ; (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3
473cabdff1aSopenharmony_ci
474cabdff1aSopenharmony_ci    pxor      t0, t1    ; XOR-based three-way select between strong, weak and p0
475cabdff1aSopenharmony_ci    pxor      t1, %1
476cabdff1aSopenharmony_ci    pand      t0, %8
477cabdff1aSopenharmony_ci    pand      t1, %7
478cabdff1aSopenharmony_ci    pxor      t0, t1
479cabdff1aSopenharmony_ci    pxor      t0, %1    ; p0'
480cabdff1aSopenharmony_ci    SWAPMOVA %10, t0
481cabdff1aSopenharmony_ci    SWAPMOVA %12, t2
482cabdff1aSopenharmony_ci%endmacro
483cabdff1aSopenharmony_ci
; LUMA_INTRA_INIT n: set up temporaries for the intra luma filters.
; in:  %1 = number of vector-sized stack temporaries to reserve
; Defines t0-t3 as registers m4-m7 and t4..t(3+%1) as stack slots
; [rsp+mmsize*0], [rsp+mmsize*1], ..., then lowers rsp by `pad`
; (%1 slots plus alignment slack derived from gprsize and stack_offset).
; The matching stack release is not part of this macro.
484cabdff1aSopenharmony_ci%macro LUMA_INTRA_INIT 1
485cabdff1aSopenharmony_ci    %xdefine pad %1*mmsize+((gprsize*3) % mmsize)-(stack_offset&15)
486cabdff1aSopenharmony_ci    %define t0 m4
487cabdff1aSopenharmony_ci    %define t1 m5
488cabdff1aSopenharmony_ci    %define t2 m6
489cabdff1aSopenharmony_ci    %define t3 m7
490cabdff1aSopenharmony_ci    %assign i 4
491cabdff1aSopenharmony_ci%rep %1
492cabdff1aSopenharmony_ci    CAT_XDEFINE t, i, [rsp+mmsize*(i-4)]
493cabdff1aSopenharmony_ci    %assign i i+1
494cabdff1aSopenharmony_ci%endrep
495cabdff1aSopenharmony_ci    SUB    rsp, pad
496cabdff1aSopenharmony_ci%endmacro
497cabdff1aSopenharmony_ci
498cabdff1aSopenharmony_ci; in: %1-%3=tmp, %4=p2, %5=q2
;     implicit: m0=p1, m1=p0, m2=q0, m3=q1 (the LOAD_MASK argument order),
;     r2d=alpha, r3d=beta as passed to LOAD_AB
; out: %1=mask1p, %2=mask0, %3=mask1q
;     mask0  = filter mask from LOAD_MASK
;     mask1p = mask0 & (|p0-q0| < alpha/4+2) & (|p2-p0| < beta)
;     mask1q = mask0 & (|p0-q0| < alpha/4+2) & (|q2-q0| < beta)
; clobbers: t0-t3. %1 is reused in turn for alpha, mask1 and finally mask1p.
499cabdff1aSopenharmony_ci%macro LUMA_INTRA_INTER 5
500cabdff1aSopenharmony_ci    LOAD_AB t0, t1, r2d, r3d
501cabdff1aSopenharmony_ci    mova    %1, t0        ; save alpha: t0 is overwritten by the mask below
502cabdff1aSopenharmony_ci    LOAD_MASK m0, m1, m2, m3, %1, t1, t0, t2, t3
503cabdff1aSopenharmony_ci%if ARCH_X86_64
504cabdff1aSopenharmony_ci    mova    %2, t0        ; mask0
505cabdff1aSopenharmony_ci    psrlw   t3, %1, 2
506cabdff1aSopenharmony_ci%else
507cabdff1aSopenharmony_ci    mova    t3, %1
508cabdff1aSopenharmony_ci    mova    %2, t0        ; mask0
509cabdff1aSopenharmony_ci    psrlw   t3, 2
510cabdff1aSopenharmony_ci%endif
511cabdff1aSopenharmony_ci    paddw   t3, [pw_2]    ; alpha/4+2
512cabdff1aSopenharmony_ci    DIFF_LT m1, m2, t3, t2, t0 ; t2 = |p0-q0| < alpha/4+2
513cabdff1aSopenharmony_ci    pand    t2, %2
514cabdff1aSopenharmony_ci    mova    t3, %5        ; q2
515cabdff1aSopenharmony_ci    mova    %1, t2        ; mask1
516cabdff1aSopenharmony_ci    DIFF_LT t3, m2, t1, t2, t0 ; t2 = |q2-q0| < beta
517cabdff1aSopenharmony_ci    pand    t2, %1
518cabdff1aSopenharmony_ci    mova    t3, %4        ; p2
519cabdff1aSopenharmony_ci    mova    %3, t2        ; mask1q
520cabdff1aSopenharmony_ci    DIFF_LT t3, m1, t1, t2, t0 ; t2 = |p2-p0| < beta
521cabdff1aSopenharmony_ci    pand    t2, %1
522cabdff1aSopenharmony_ci    mova    %1, t2        ; mask1p
523cabdff1aSopenharmony_ci%endmacro
524cabdff1aSopenharmony_ci
; LUMA_H_INTRA_LOAD: load the rows around r0 and transpose them so that each
; register or stack slot holds one column.
; out: t4, t5, t6, t7 = first, second, seventh and eighth column (labelled
;      p3, p2, q2, q3 below); the middle four columns stay in m0-m3.
; mmsize == 8:  4 rows, two 4x4 transposes; r4 is used as the offset of the
;               fourth row (presumably 3*stride -- set up by the caller).
; otherwise:    8 rows, one 8x8 transpose; r5 is the fourth-row offset and
;               r4 the base pointer of rows 4..7 (presumably r0 + 4*stride --
;               set up by the caller).
; The TRANSPOSE macros take register numbers, so t0/t1/t2/t3 appear there as
; 4/5/6/7, matching the LUMA_INTRA_INIT definitions.
525cabdff1aSopenharmony_ci%macro LUMA_H_INTRA_LOAD 0
526cabdff1aSopenharmony_ci%if mmsize == 8
527cabdff1aSopenharmony_ci    movu    t0, [r0-8]
528cabdff1aSopenharmony_ci    movu    t1, [r0+r1-8]
529cabdff1aSopenharmony_ci    movu    m0, [r0+r1*2-8]
530cabdff1aSopenharmony_ci    movu    m1, [r0+r4-8]
531cabdff1aSopenharmony_ci    TRANSPOSE4x4W 4, 5, 0, 1, 2
532cabdff1aSopenharmony_ci    mova    t4, t0        ; p3
533cabdff1aSopenharmony_ci    mova    t5, t1        ; p2
534cabdff1aSopenharmony_ci
535cabdff1aSopenharmony_ci    movu    m2, [r0]
536cabdff1aSopenharmony_ci    movu    m3, [r0+r1]
537cabdff1aSopenharmony_ci    movu    t0, [r0+r1*2]
538cabdff1aSopenharmony_ci    movu    t1, [r0+r4]
539cabdff1aSopenharmony_ci    TRANSPOSE4x4W 2, 3, 4, 5, 6
540cabdff1aSopenharmony_ci    mova    t6, t0        ; q2
541cabdff1aSopenharmony_ci    mova    t7, t1        ; q3
542cabdff1aSopenharmony_ci%else
543cabdff1aSopenharmony_ci    movu    t0, [r0-8]
544cabdff1aSopenharmony_ci    movu    t1, [r0+r1-8]
545cabdff1aSopenharmony_ci    movu    m0, [r0+r1*2-8]
546cabdff1aSopenharmony_ci    movu    m1, [r0+r5-8]
547cabdff1aSopenharmony_ci    movu    m2, [r4-8]
548cabdff1aSopenharmony_ci    movu    m3, [r4+r1-8]
549cabdff1aSopenharmony_ci    movu    t2, [r4+r1*2-8]
550cabdff1aSopenharmony_ci    movu    t3, [r4+r5-8]
551cabdff1aSopenharmony_ci    TRANSPOSE8x8W 4, 5, 0, 1, 2, 3, 6, 7, t4, t5 ; t4/t5 are passed as memory operands here (presumably scratch) and overwritten below
552cabdff1aSopenharmony_ci    mova    t4, t0        ; p3
553cabdff1aSopenharmony_ci    mova    t5, t1        ; p2
554cabdff1aSopenharmony_ci    mova    t6, t2        ; q2
555cabdff1aSopenharmony_ci    mova    t7, t3        ; q3
556cabdff1aSopenharmony_ci%endif
557cabdff1aSopenharmony_ci%endmacro
558cabdff1aSopenharmony_ci
559cabdff1aSopenharmony_ci; in: %1=q3 %2=q2' %3=q1' %4=q0' %5=p0' %6=p1' %7=p2' %8=p3 %9=tmp
; %1-%7 and %9 are register NUMBERS (used as m%1 and passed to the TRANSPOSE
; macros). %8 is a full operand: in the mmsize == 8 path it is loaded with
; movq; in the wider path it may be a register number (renamed with SWAP) or
; any mova source.
; The first group (%1-%4) is transposed and stored 8 bytes before each row
; pointer, the second group (%5-%7 plus %8, moved into m%1) at offset 0.
; Row addressing: mmsize == 8 uses r0 with r4 as the fourth-row offset; the
; wider path uses r0 and r4 as two row-base pointers with r5 as the
; fourth-row offset, storing low halves via r0 and high halves via r4.
; m%1 is reused for %8 after the first group is stored.
560cabdff1aSopenharmony_ci%macro LUMA_H_INTRA_STORE 9
561cabdff1aSopenharmony_ci%if mmsize == 8
562cabdff1aSopenharmony_ci    TRANSPOSE4x4W %1, %2, %3, %4, %9
563cabdff1aSopenharmony_ci    movq       [r0-8], m%1
564cabdff1aSopenharmony_ci    movq       [r0+r1-8], m%2
565cabdff1aSopenharmony_ci    movq       [r0+r1*2-8], m%3
566cabdff1aSopenharmony_ci    movq       [r0+r4-8], m%4
567cabdff1aSopenharmony_ci    movq       m%1, %8
568cabdff1aSopenharmony_ci    TRANSPOSE4x4W %5, %6, %7, %1, %9
569cabdff1aSopenharmony_ci    movq       [r0], m%5
570cabdff1aSopenharmony_ci    movq       [r0+r1], m%6
571cabdff1aSopenharmony_ci    movq       [r0+r1*2], m%7
572cabdff1aSopenharmony_ci    movq       [r0+r4], m%1
573cabdff1aSopenharmony_ci%else
574cabdff1aSopenharmony_ci    TRANSPOSE2x4x4W %1, %2, %3, %4, %9
575cabdff1aSopenharmony_ci    movq       [r0-8], m%1
576cabdff1aSopenharmony_ci    movq       [r0+r1-8], m%2
577cabdff1aSopenharmony_ci    movq       [r0+r1*2-8], m%3
578cabdff1aSopenharmony_ci    movq       [r0+r5-8], m%4
579cabdff1aSopenharmony_ci    movhps     [r4-8], m%1
580cabdff1aSopenharmony_ci    movhps     [r4+r1-8], m%2
581cabdff1aSopenharmony_ci    movhps     [r4+r1*2-8], m%3
582cabdff1aSopenharmony_ci    movhps     [r4+r5-8], m%4
583cabdff1aSopenharmony_ci%ifnum %8
584cabdff1aSopenharmony_ci    SWAP       %1, %8
585cabdff1aSopenharmony_ci%else
586cabdff1aSopenharmony_ci    mova       m%1, %8
587cabdff1aSopenharmony_ci%endif
588cabdff1aSopenharmony_ci    TRANSPOSE2x4x4W %5, %6, %7, %1, %9
589cabdff1aSopenharmony_ci    movq       [r0], m%5
590cabdff1aSopenharmony_ci    movq       [r0+r1], m%6
591cabdff1aSopenharmony_ci    movq       [r0+r1*2], m%7
592cabdff1aSopenharmony_ci    movq       [r0+r5], m%1
593cabdff1aSopenharmony_ci    movhps     [r4], m%5
594cabdff1aSopenharmony_ci    movhps     [r4+r1], m%6
595cabdff1aSopenharmony_ci    movhps     [r4+r1*2], m%7
596cabdff1aSopenharmony_ci    movhps     [r4+r5], m%1
597cabdff1aSopenharmony_ci%endif
598cabdff1aSopenharmony_ci%endmacro
599cabdff1aSopenharmony_ci
600cabdff1aSopenharmony_ci%if ARCH_X86_64
601cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
602cabdff1aSopenharmony_ci; void ff_deblock_v_luma_intra_10(uint16_t *pix, int stride, int alpha,
603cabdff1aSopenharmony_ci;                                 int beta)
604cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
; DEBLOCK_LUMA_INTRA_64 emits deblock_v_luma_intra_10 and
; deblock_h_luma_intra_10 for whichever instruction set the preceding INIT_*
; line selected. Both bodies name registers m8-m15 and pass 16 as the vector
; register count to cglobal. The helper macros used here (LOAD_AB, LOAD_MASK,
; DIFF_LT, LUMA_INTRA_P012, LUMA_H_INTRA_STORE, TRANSPOSE8x8W) are defined
; outside this macro, so remarks about their effect are marked as assumptions.
605cabdff1aSopenharmony_ci%macro DEBLOCK_LUMA_INTRA_64 0
606cabdff1aSopenharmony_cicglobal deblock_v_luma_intra_10, 4,7,16
    ; Register aliases: t0-t2 are scratch, p2..q2 hold the six rows loaded
    ; around the edge, aa/bb receive alpha/beta from LOAD_AB.
607cabdff1aSopenharmony_ci    %define t0 m1
608cabdff1aSopenharmony_ci    %define t1 m2
609cabdff1aSopenharmony_ci    %define t2 m4
610cabdff1aSopenharmony_ci    %define p2 m8
611cabdff1aSopenharmony_ci    %define p1 m9
612cabdff1aSopenharmony_ci    %define p0 m10
613cabdff1aSopenharmony_ci    %define q0 m11
614cabdff1aSopenharmony_ci    %define q1 m12
615cabdff1aSopenharmony_ci    %define q2 m13
616cabdff1aSopenharmony_ci    %define aa m5
617cabdff1aSopenharmony_ci    %define bb m14
618cabdff1aSopenharmony_ci    lea     r4, [r1*4]
619cabdff1aSopenharmony_ci    lea     r5, [r1*3] ; 3*stride
620cabdff1aSopenharmony_ci    neg     r4
621cabdff1aSopenharmony_ci    add     r4, r0     ; pix-4*stride
622cabdff1aSopenharmony_ci    mov     r6, 2 ; loop counter: two passes, each covering mmsize bytes of every row
623cabdff1aSopenharmony_ci    mova    m0, [pw_2] ; constant added below to form alpha/4+2
624cabdff1aSopenharmony_ci    shl    r2d, 2 ; alpha *= 4 (presumably rescales the threshold for 10-bit samples -- confirm)
625cabdff1aSopenharmony_ci    shl    r3d, 2 ; beta  *= 4 (same assumption)
626cabdff1aSopenharmony_ci    LOAD_AB aa, bb, r2d, r3d ; presumably broadcasts alpha/beta into aa/bb (macro not visible here)
627cabdff1aSopenharmony_ci.loop:
628cabdff1aSopenharmony_ci    mova    p2, [r4+r1] ; row at pix-3*stride
629cabdff1aSopenharmony_ci    mova    p1, [r4+2*r1] ; row at pix-2*stride
630cabdff1aSopenharmony_ci    mova    p0, [r4+r5] ; row at pix-1*stride
631cabdff1aSopenharmony_ci    mova    q0, [r0] ; row at pix
632cabdff1aSopenharmony_ci    mova    q1, [r0+r1] ; row at pix+1*stride
633cabdff1aSopenharmony_ci    mova    q2, [r0+2*r1] ; row at pix+2*stride
634cabdff1aSopenharmony_ci
635cabdff1aSopenharmony_ci    LOAD_MASK p1, p0, q0, q1, aa, bb, m3, t0, t1 ; m3 is ANDed into m6/m7 below; presumably the basic filter mask
636cabdff1aSopenharmony_ci    mova    t2, aa
637cabdff1aSopenharmony_ci    psrlw   t2, 2
638cabdff1aSopenharmony_ci    paddw   t2, m0 ; alpha/4+2
639cabdff1aSopenharmony_ci    DIFF_LT p0, q0, t2, m6, t0 ; m6 = |p0-q0| < alpha/4+2
640cabdff1aSopenharmony_ci    DIFF_LT p2, p0, bb, t1, t0 ; t1 = |p2-p0| < beta
641cabdff1aSopenharmony_ci    DIFF_LT q2, q0, bb, m7, t0 ; m7 = |q2-q0| < beta
642cabdff1aSopenharmony_ci    pand    m6, m3 ; restrict the alpha/4+2 test to lanes selected by m3
643cabdff1aSopenharmony_ci    pand    m7, m6 ; m7: q-side condition, handed to the q0/q1/q2 call below
644cabdff1aSopenharmony_ci    pand    m6, t1 ; m6: p-side condition, handed to the p0/p1/p2 call below
    ; [r4] is the row at pix-4*stride and [r0+r5] the row at pix+3*stride.
    ; The last three operands are the addresses the p0/p1/p2 (resp. q0/q1/q2)
    ; rows were loaded from -- presumably the store destinations.
645cabdff1aSopenharmony_ci    LUMA_INTRA_P012 p0, p1, p2, [r4], q0, q1, m3, m6, m0, [r4+r5], [r4+2*r1], [r4+r1]
646cabdff1aSopenharmony_ci    LUMA_INTRA_P012 q0, q1, q2, [r0+r5], p0, p1, m3, m7, m0, [r0], [r0+r1], [r0+2*r1]
647cabdff1aSopenharmony_ci    add     r0, mmsize ; step both row pointers to the next mmsize bytes
648cabdff1aSopenharmony_ci    add     r4, mmsize
649cabdff1aSopenharmony_ci    dec     r6
650cabdff1aSopenharmony_ci    jg .loop
651cabdff1aSopenharmony_ci    REP_RET
652cabdff1aSopenharmony_ci
653cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
654cabdff1aSopenharmony_ci; void ff_deblock_h_luma_intra_10(uint16_t *pix, int stride, int alpha,
655cabdff1aSopenharmony_ci;                                 int beta)
656cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
657cabdff1aSopenharmony_cicglobal deblock_h_luma_intra_10, 4,7,16
    ; Register aliases for this function. The q3..p3 names are bound to the
    ; same register numbers, in the same order, as the first eight
    ; TRANSPOSE8x8W operands below.
658cabdff1aSopenharmony_ci    %define t0 m15
659cabdff1aSopenharmony_ci    %define t1 m14
660cabdff1aSopenharmony_ci    %define t2 m2
661cabdff1aSopenharmony_ci    %define q3 m5
662cabdff1aSopenharmony_ci    %define q2 m8
663cabdff1aSopenharmony_ci    %define q1 m9
664cabdff1aSopenharmony_ci    %define q0 m10
665cabdff1aSopenharmony_ci    %define p0 m11
666cabdff1aSopenharmony_ci    %define p1 m12
667cabdff1aSopenharmony_ci    %define p2 m13
668cabdff1aSopenharmony_ci    %define p3 m4
    ; spill is a one-register scratch slot at [rsp]; pad bytes are reserved
    ; for it here and released before RET.
    ; NOTE(review): the 24-(stack_offset&15) form presumably keeps [rsp]
    ; 16-byte aligned for mova -- confirm against x86inc's stack_offset.
669cabdff1aSopenharmony_ci    %define spill [rsp]
670cabdff1aSopenharmony_ci    %assign pad 24-(stack_offset&15)
671cabdff1aSopenharmony_ci    SUB     rsp, pad
672cabdff1aSopenharmony_ci    lea     r4, [r1*4]
673cabdff1aSopenharmony_ci    lea     r5, [r1*3] ; 3*stride
674cabdff1aSopenharmony_ci    add     r4, r0     ; pix+4*stride
675cabdff1aSopenharmony_ci    mov     r6, 2 ; loop counter: two passes of 8 rows each
676cabdff1aSopenharmony_ci    mova    m0, [pw_2] ; constant added below to form alpha/4+2
677cabdff1aSopenharmony_ci    shl    r2d, 2 ; alpha *= 4
678cabdff1aSopenharmony_ci    shl    r3d, 2 ; beta  *= 4
679cabdff1aSopenharmony_ci.loop:
    ; Load 8 rows (r0 covers rows 0-3, r4 rows 4-7), each starting 8 bytes
    ; before the pix column, with unaligned loads.
680cabdff1aSopenharmony_ci    movu    q3, [r0-8]
681cabdff1aSopenharmony_ci    movu    q2, [r0+r1-8]
682cabdff1aSopenharmony_ci    movu    q1, [r0+r1*2-8]
683cabdff1aSopenharmony_ci    movu    q0, [r0+r5-8]
684cabdff1aSopenharmony_ci    movu    p0, [r4-8]
685cabdff1aSopenharmony_ci    movu    p1, [r4+r1-8]
686cabdff1aSopenharmony_ci    movu    p2, [r4+r1*2-8]
687cabdff1aSopenharmony_ci    movu    p3, [r4+r5-8]
688cabdff1aSopenharmony_ci    TRANSPOSE8x8W 5, 8, 9, 10, 11, 12, 13, 4, 1 ; operands are q3,q2,q1,q0,p0,p1,p2,p3; the trailing 1 is presumably a scratch register
689cabdff1aSopenharmony_ci
690cabdff1aSopenharmony_ci    LOAD_AB m1, m2, r2d, r3d ; done inside the loop: m1 is overwritten below (psrlw/paddw)
691cabdff1aSopenharmony_ci    LOAD_MASK q1, q0, p0, p1, m1, m2, m3, t0, t1
692cabdff1aSopenharmony_ci    psrlw   m1, 2
693cabdff1aSopenharmony_ci    paddw   m1, m0 ; alpha/4+2
694cabdff1aSopenharmony_ci    DIFF_LT p0, q0, m1, m6, t0 ; m6 = |p0-q0| < alpha/4+2
695cabdff1aSopenharmony_ci    DIFF_LT q2, q0, m2, t1, t0 ; t1 = |q2-q0| < beta
696cabdff1aSopenharmony_ci    DIFF_LT p0, p2, m2, m7, t0 ; m7 = |p2-p0| < beta
697cabdff1aSopenharmony_ci    pand    m6, m3 ; restrict the alpha/4+2 test to lanes selected by m3
698cabdff1aSopenharmony_ci    pand    m7, m6 ; m7: condition handed to the p0/p1/p2 call below
699cabdff1aSopenharmony_ci    pand    m6, t1 ; m6: condition handed to the q0/q1/q2 call below
700cabdff1aSopenharmony_ci
701cabdff1aSopenharmony_ci    mova spill, q3 ; save q3 (m5): m5 is also an operand of the first LUMA_INTRA_P012 below
702cabdff1aSopenharmony_ci    LUMA_INTRA_P012 q0, q1, q2, q3, p0, p1, m3, m6, m0, m5, m1, q2
703cabdff1aSopenharmony_ci    LUMA_INTRA_P012 p0, p1, p2, p3, q0, q1, m3, m7, m0, p0, m6, p2
704cabdff1aSopenharmony_ci    mova    m7, spill ; bring the saved q3 back in m7, the first register listed in the store below
705cabdff1aSopenharmony_ci
706cabdff1aSopenharmony_ci    LUMA_H_INTRA_STORE 7, 8, 1, 5, 11, 6, 13, 4, 14
707cabdff1aSopenharmony_ci
708cabdff1aSopenharmony_ci    lea     r0, [r0+r1*8] ; advance both row pointers by 8 rows
709cabdff1aSopenharmony_ci    lea     r4, [r4+r1*8]
710cabdff1aSopenharmony_ci    dec     r6
711cabdff1aSopenharmony_ci    jg .loop
712cabdff1aSopenharmony_ci    ADD    rsp, pad ; release the spill area reserved at entry
713cabdff1aSopenharmony_ci    RET
714cabdff1aSopenharmony_ci%endmacro
715cabdff1aSopenharmony_ci
; Instantiate the DEBLOCK_LUMA_INTRA_64 functions for SSE2, and additionally
; for AVX when HAVE_AVX_EXTERNAL is non-zero.
716cabdff1aSopenharmony_ciINIT_XMM sse2
717cabdff1aSopenharmony_ciDEBLOCK_LUMA_INTRA_64
718cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL
719cabdff1aSopenharmony_ciINIT_XMM avx
720cabdff1aSopenharmony_ciDEBLOCK_LUMA_INTRA_64
721cabdff1aSopenharmony_ci%endif
722cabdff1aSopenharmony_ci
723cabdff1aSopenharmony_ci%endif
724cabdff1aSopenharmony_ci
; DEBLOCK_LUMA_INTRA emits deblock_v_luma_intra_10 and deblock_h_luma_intra_10
; with loop counts and register counts written in terms of mmsize, so the same
; text serves the INIT_MMX and INIT_XMM instantiations. LUMA_INTRA_INIT,
; LUMA_INTRA_INTER, LUMA_INTRA_P012, LUMA_H_INTRA_LOAD and LUMA_H_INTRA_STORE
; are defined outside this macro. The tN names and `pad` are not defined here
; either -- presumably LUMA_INTRA_INIT sets them up (confirm).
725cabdff1aSopenharmony_ci%macro DEBLOCK_LUMA_INTRA 0
726cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
727cabdff1aSopenharmony_ci; void ff_deblock_v_luma_intra_10(uint16_t *pix, int stride, int alpha,
728cabdff1aSopenharmony_ci;                                 int beta)
729cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
730cabdff1aSopenharmony_cicglobal deblock_v_luma_intra_10, 4,7,8*(mmsize/16)
731cabdff1aSopenharmony_ci    LUMA_INTRA_INIT 3
732cabdff1aSopenharmony_ci    lea     r4, [r1*4]
733cabdff1aSopenharmony_ci    lea     r5, [r1*3] ; 3*stride
734cabdff1aSopenharmony_ci    neg     r4
735cabdff1aSopenharmony_ci    add     r4, r0 ; pix-4*stride
736cabdff1aSopenharmony_ci    mov     r6, 32/mmsize ; loop counter: mmsize bytes of each row per pass, 32 bytes in total
737cabdff1aSopenharmony_ci    shl    r2d, 2 ; alpha *= 4
738cabdff1aSopenharmony_ci    shl    r3d, 2 ; beta  *= 4
739cabdff1aSopenharmony_ci.loop:
740cabdff1aSopenharmony_ci    mova    m0, [r4+r1*2] ; p1
741cabdff1aSopenharmony_ci    mova    m1, [r4+r5]   ; p0
742cabdff1aSopenharmony_ci    mova    m2, [r0]      ; q0
743cabdff1aSopenharmony_ci    mova    m3, [r0+r1]   ; q1
744cabdff1aSopenharmony_ci    LUMA_INTRA_INTER t4, t5, t6, [r4+r1], [r0+r1*2] ; memory operands: rows at pix-3*stride (p2) and pix+2*stride (q2)
    ; [r4] is the row at pix-4*stride; the last three operands are the
    ; p0/p1/p2 row addresses (presumably the store destinations).
745cabdff1aSopenharmony_ci    LUMA_INTRA_P012 m1, m0, t3, [r4], m2, m3, t5, t4, [pw_2], [r4+r5], [r4+2*r1], [r4+r1]
746cabdff1aSopenharmony_ci    mova    t3, [r0+r1*2] ; q2
    ; Same call mirrored for the q side: [r0+r5] is the row at pix+3*stride.
747cabdff1aSopenharmony_ci    LUMA_INTRA_P012 m2, m3, t3, [r0+r5], m1, m0, t5, t6, [pw_2], [r0], [r0+r1], [r0+2*r1]
748cabdff1aSopenharmony_ci    add     r0, mmsize ; step both row pointers to the next mmsize bytes
749cabdff1aSopenharmony_ci    add     r4, mmsize
750cabdff1aSopenharmony_ci    dec     r6
751cabdff1aSopenharmony_ci    jg .loop
752cabdff1aSopenharmony_ci    ADD    rsp, pad ; pad is not defined in this macro -- presumably by LUMA_INTRA_INIT
753cabdff1aSopenharmony_ci    RET
754cabdff1aSopenharmony_ci
755cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
756cabdff1aSopenharmony_ci; void ff_deblock_h_luma_intra_10(uint16_t *pix, int stride, int alpha,
757cabdff1aSopenharmony_ci;                                 int beta)
758cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
759cabdff1aSopenharmony_cicglobal deblock_h_luma_intra_10, 4,7,8*(mmsize/16)
760cabdff1aSopenharmony_ci    LUMA_INTRA_INIT 8
    ; Register roles differ by vector size:
    ;   mmsize == 8: r4 = 3*stride,     r5 = loop counter
    ;   otherwise:   r4 = pix+4*stride, r5 = 3*stride, r6 = loop counter
761cabdff1aSopenharmony_ci%if mmsize == 8
762cabdff1aSopenharmony_ci    lea     r4, [r1*3]
763cabdff1aSopenharmony_ci    mov     r5, 32/mmsize
764cabdff1aSopenharmony_ci%else
765cabdff1aSopenharmony_ci    lea     r4, [r1*4]
766cabdff1aSopenharmony_ci    lea     r5, [r1*3] ; 3*stride
767cabdff1aSopenharmony_ci    add     r4, r0     ; pix+4*stride
768cabdff1aSopenharmony_ci    mov     r6, 32/mmsize
769cabdff1aSopenharmony_ci%endif
770cabdff1aSopenharmony_ci    shl    r2d, 2 ; alpha *= 4
771cabdff1aSopenharmony_ci    shl    r3d, 2 ; beta  *= 4
772cabdff1aSopenharmony_ci.loop:
773cabdff1aSopenharmony_ci    LUMA_H_INTRA_LOAD
774cabdff1aSopenharmony_ci    LUMA_INTRA_INTER t8, t9, t10, t5, t6
775cabdff1aSopenharmony_ci
776cabdff1aSopenharmony_ci    LUMA_INTRA_P012 m1, m0, t3, t4, m2, m3, t9, t8, [pw_2], t8, t5, t11
777cabdff1aSopenharmony_ci    mova    t3, t6     ; q2
778cabdff1aSopenharmony_ci    LUMA_INTRA_P012 m2, m3, t3, t7, m1, m0, t9, t10, [pw_2], m4, t6, m5
779cabdff1aSopenharmony_ci
    ; Copy values out of the tN slots into the registers that
    ; LUMA_H_INTRA_STORE names in its operand list (2, 0, 1, 3, ..., 6).
780cabdff1aSopenharmony_ci    mova    m2, t4
781cabdff1aSopenharmony_ci    mova    m0, t11
782cabdff1aSopenharmony_ci    mova    m1, t5
783cabdff1aSopenharmony_ci    mova    m3, t8
784cabdff1aSopenharmony_ci    mova    m6, t6
785cabdff1aSopenharmony_ci
786cabdff1aSopenharmony_ci    LUMA_H_INTRA_STORE 2, 0, 1, 3, 4, 6, 5, t7, 7
787cabdff1aSopenharmony_ci
788cabdff1aSopenharmony_ci    lea     r0, [r0+r1*(mmsize/2)] ; advance by mmsize/2 rows per pass
789cabdff1aSopenharmony_ci%if mmsize == 8
790cabdff1aSopenharmony_ci    dec     r5
791cabdff1aSopenharmony_ci%else
792cabdff1aSopenharmony_ci    lea     r4, [r4+r1*(mmsize/2)]
793cabdff1aSopenharmony_ci    dec     r6
794cabdff1aSopenharmony_ci%endif
795cabdff1aSopenharmony_ci    jg .loop
796cabdff1aSopenharmony_ci    ADD    rsp, pad ; pad is not defined in this macro -- presumably by LUMA_INTRA_INIT
797cabdff1aSopenharmony_ci    RET
798cabdff1aSopenharmony_ci%endmacro
799cabdff1aSopenharmony_ci
; Builds where ARCH_X86_64 is 0: instantiate DEBLOCK_LUMA and
; DEBLOCK_LUMA_INTRA. The MMXEXT versions are emitted only when
; HAVE_ALIGNED_STACK is 0; SSE2 is always emitted; AVX only when
; HAVE_AVX_EXTERNAL is non-zero.
800cabdff1aSopenharmony_ci%if ARCH_X86_64 == 0
801cabdff1aSopenharmony_ci%if HAVE_ALIGNED_STACK == 0
802cabdff1aSopenharmony_ciINIT_MMX mmxext
803cabdff1aSopenharmony_ciDEBLOCK_LUMA
804cabdff1aSopenharmony_ciDEBLOCK_LUMA_INTRA
805cabdff1aSopenharmony_ci%endif
806cabdff1aSopenharmony_ciINIT_XMM sse2
807cabdff1aSopenharmony_ciDEBLOCK_LUMA
808cabdff1aSopenharmony_ciDEBLOCK_LUMA_INTRA
809cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL
810cabdff1aSopenharmony_ciINIT_XMM avx
811cabdff1aSopenharmony_ciDEBLOCK_LUMA
812cabdff1aSopenharmony_ciDEBLOCK_LUMA_INTRA
813cabdff1aSopenharmony_ci%endif
814cabdff1aSopenharmony_ci%endif
815cabdff1aSopenharmony_ci
816cabdff1aSopenharmony_ci; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
817cabdff1aSopenharmony_ci; out: %1=p0', %2=q0'
; Per 16-bit lane:
;   p0' = p0 + ((((2*p1 + p0 + q1 + 2) >> 2) - p0) & mask)
;   q0' = q0 + ((((2*q1 + q0 + p1 + 2) >> 2) - q0) & mask)
; %3, %4 and %5 are only read; %6 and %7 are overwritten.
818cabdff1aSopenharmony_ci%macro CHROMA_DEBLOCK_P0_Q0_INTRA 7
819cabdff1aSopenharmony_ci    mova    %6, [pw_2] ; %6 = rounding constant (pw_2)
820cabdff1aSopenharmony_ci    paddw   %6, %3 ; %6 = 2 + p1
821cabdff1aSopenharmony_ci    paddw   %6, %4 ; %6 = 2 + p1 + q1
822cabdff1aSopenharmony_ci    paddw   %7, %6, %2 ; %7 = 2 + p1 + q1 + q0 (three-operand form)
823cabdff1aSopenharmony_ci    paddw   %6, %1 ; %6 = 2 + p1 + q1 + p0
824cabdff1aSopenharmony_ci    paddw   %6, %3 ; %6 = 2 + 2*p1 + q1 + p0
825cabdff1aSopenharmony_ci    paddw   %7, %4 ; %7 = 2 + p1 + 2*q1 + q0
826cabdff1aSopenharmony_ci    psraw   %6, 2 ; divide both sums by 4
827cabdff1aSopenharmony_ci    psraw   %7, 2
828cabdff1aSopenharmony_ci    psubw   %6, %1 ; %6 = filtered p0 - p0
829cabdff1aSopenharmony_ci    psubw   %7, %2 ; %7 = filtered q0 - q0
830cabdff1aSopenharmony_ci    pand    %6, %5 ; keep the deltas only where the mask is set
831cabdff1aSopenharmony_ci    pand    %7, %5
832cabdff1aSopenharmony_ci    paddw   %1, %6 ; p0'
833cabdff1aSopenharmony_ci    paddw   %2, %7 ; q0'
834cabdff1aSopenharmony_ci%endmacro
835cabdff1aSopenharmony_ci
; Load the four rows around a horizontal edge into m0..m3.
; in: r0 = address of the p1 row, r1 = stride, %1 = register holding the
;     address of the q0 row
; out: m0=p1, m1=p0, m2=q0, m3=q1 (aligned loads)
836cabdff1aSopenharmony_ci%macro CHROMA_V_LOAD 1
837cabdff1aSopenharmony_ci    mova        m0, [r0]    ; p1
838cabdff1aSopenharmony_ci    mova        m1, [r0+r1] ; p0
839cabdff1aSopenharmony_ci    mova        m2, [%1]    ; q0
840cabdff1aSopenharmony_ci    mova        m3, [%1+r1] ; q1
841cabdff1aSopenharmony_ci%endmacro
842cabdff1aSopenharmony_ci
; Write m1 and m2 back to the rows at r0+stride and r0+2*stride.
; CHROMA_V_LOAD reads p0 from [r0+r1]; the callers in DEBLOCK_CHROMA set
; r0 = pix-2*stride, so [r0+2*r1] is the q0 row at pix.
843cabdff1aSopenharmony_ci%macro CHROMA_V_STORE 0
844cabdff1aSopenharmony_ci    mova [r0+1*r1], m1 ; p0 row
845cabdff1aSopenharmony_ci    mova [r0+2*r1], m2 ; q0 row
846cabdff1aSopenharmony_ci%endmacro
847cabdff1aSopenharmony_ci
848cabdff1aSopenharmony_ci; in: 8 rows of 4 words in %1..%8
849cabdff1aSopenharmony_ci; out: 4 rows of 8 words in m0..m3
; Output register mN holds word N of each of the eight input rows, in input
; order. m4, m5 and m6 are used as temporaries and are clobbered.
850cabdff1aSopenharmony_ci%macro TRANSPOSE4x8W_LOAD 8
851cabdff1aSopenharmony_ci    movq             m0, %1 ; row a
852cabdff1aSopenharmony_ci    movq             m2, %2 ; row b
853cabdff1aSopenharmony_ci    movq             m1, %3 ; row c
854cabdff1aSopenharmony_ci    movq             m3, %4 ; row d
855cabdff1aSopenharmony_ci
856cabdff1aSopenharmony_ci    punpcklwd        m0, m2 ; a0 b0 a1 b1 a2 b2 a3 b3
857cabdff1aSopenharmony_ci    punpcklwd        m1, m3 ; c0 d0 c1 d1 c2 d2 c3 d3
858cabdff1aSopenharmony_ci    punpckhdq        m2, m0, m1 ; a2 b2 c2 d2 a3 b3 c3 d3
859cabdff1aSopenharmony_ci    punpckldq        m0, m1 ; a0 b0 c0 d0 a1 b1 c1 d1
860cabdff1aSopenharmony_ci
861cabdff1aSopenharmony_ci    movq             m4, %5 ; row e
862cabdff1aSopenharmony_ci    movq             m6, %6 ; row f
863cabdff1aSopenharmony_ci    movq             m5, %7 ; row g
864cabdff1aSopenharmony_ci    movq             m3, %8 ; row h
865cabdff1aSopenharmony_ci
866cabdff1aSopenharmony_ci    punpcklwd        m4, m6 ; e0 f0 e1 f1 e2 f2 e3 f3
867cabdff1aSopenharmony_ci    punpcklwd        m5, m3 ; g0 h0 g1 h1 g2 h2 g3 h3
868cabdff1aSopenharmony_ci    punpckhdq        m6, m4, m5 ; e2 f2 g2 h2 e3 f3 g3 h3
869cabdff1aSopenharmony_ci    punpckldq        m4, m5 ; e0 f0 g0 h0 e1 f1 g1 h1
870cabdff1aSopenharmony_ci
871cabdff1aSopenharmony_ci    punpckhqdq       m1, m0, m4 ; m1 = word 1 of rows a..h
872cabdff1aSopenharmony_ci    punpcklqdq       m0, m4 ; m0 = word 0 of rows a..h
873cabdff1aSopenharmony_ci    punpckhqdq       m3, m2, m6 ; m3 = word 3 of rows a..h
874cabdff1aSopenharmony_ci    punpcklqdq       m2, m6 ; m2 = word 2 of rows a..h
875cabdff1aSopenharmony_ci%endmacro
876cabdff1aSopenharmony_ci
877cabdff1aSopenharmony_ci; in: 4 rows of 8 words in m0..m3
878cabdff1aSopenharmony_ci; out: 8 rows of 4 words in %1..%8
; m0..m3 are transposed in place first (m4 is passed to TRANSPOSE4x4W as its
; fifth operand, presumably as scratch). Each register is then written out as
; two 4-word rows: low 64 bits via movq, high 64 bits via movhps.
879cabdff1aSopenharmony_ci%macro TRANSPOSE8x4W_STORE 8
880cabdff1aSopenharmony_ci    TRANSPOSE4x4W     0, 1, 2, 3, 4
881cabdff1aSopenharmony_ci    movq             %1, m0 ; low half of m0
882cabdff1aSopenharmony_ci    movhps           %2, m0 ; high half of m0
883cabdff1aSopenharmony_ci    movq             %3, m1
884cabdff1aSopenharmony_ci    movhps           %4, m1
885cabdff1aSopenharmony_ci    movq             %5, m2
886cabdff1aSopenharmony_ci    movhps           %6, m2
887cabdff1aSopenharmony_ci    movq             %7, m3
888cabdff1aSopenharmony_ci    movhps           %8, m3
889cabdff1aSopenharmony_ci%endmacro
890cabdff1aSopenharmony_ci
; Load the samples around a vertical edge, transposed so that m0..m3 each
; hold one column, then save m0 and m3 to %3 and %4.
; Uses pix_q and stride_q, so it must be expanded inside a function whose
; cglobal line declares those argument names.
891cabdff1aSopenharmony_ci; %1 = base + 3*stride
892cabdff1aSopenharmony_ci; %2 = 3*stride (unused on mmx)
893cabdff1aSopenharmony_ci; %3, %4 = place to store p1 and q1 values
894cabdff1aSopenharmony_ci%macro CHROMA_H_LOAD 4
895cabdff1aSopenharmony_ci    %if mmsize == 8
        ; 8-byte vectors: four rows of four words, each starting 4 bytes
        ; before pix.
896cabdff1aSopenharmony_ci        movq m0, [pix_q - 4]
897cabdff1aSopenharmony_ci        movq m1, [pix_q +   stride_q - 4]
898cabdff1aSopenharmony_ci        movq m2, [pix_q + 2*stride_q - 4]
899cabdff1aSopenharmony_ci        movq m3, [%1 - 4]
900cabdff1aSopenharmony_ci        TRANSPOSE4x4W 0, 1, 2, 3, 4
901cabdff1aSopenharmony_ci    %else
        ; Wider vectors: eight rows via TRANSPOSE4x8W_LOAD. PASS8ROWS is
        ; defined elsewhere; presumably it expands to the eight row operands.
902cabdff1aSopenharmony_ci        TRANSPOSE4x8W_LOAD PASS8ROWS(pix_q-4, %1-4, stride_q, %2)
903cabdff1aSopenharmony_ci    %endif
904cabdff1aSopenharmony_ci    mova %3, m0 ; save p1 (per the header above)
905cabdff1aSopenharmony_ci    mova %4, m3 ; save q1
906cabdff1aSopenharmony_ci%endmacro
907cabdff1aSopenharmony_ci
; Counterpart of CHROMA_H_LOAD: reload m0 and m3 from %3 and %4, transpose
; m0..m3 back to rows and write them to the same addresses CHROMA_H_LOAD
; read from. Uses pix_q and stride_q like CHROMA_H_LOAD.
908cabdff1aSopenharmony_ci; %1 = base + 3*stride
909cabdff1aSopenharmony_ci; %2 = 3*stride (unused on mmx)
910cabdff1aSopenharmony_ci; %3, %4 = place to load p1 and q1 values
911cabdff1aSopenharmony_ci%macro CHROMA_H_STORE 4
912cabdff1aSopenharmony_ci    mova m0, %3 ; reload p1 saved by CHROMA_H_LOAD
913cabdff1aSopenharmony_ci    mova m3, %4 ; reload q1
914cabdff1aSopenharmony_ci    %if mmsize == 8
915cabdff1aSopenharmony_ci        TRANSPOSE4x4W 0, 1, 2, 3, 4
916cabdff1aSopenharmony_ci        movq [pix_q - 4],              m0
917cabdff1aSopenharmony_ci        movq [pix_q +   stride_q - 4], m1
918cabdff1aSopenharmony_ci        movq [pix_q + 2*stride_q - 4], m2
919cabdff1aSopenharmony_ci        movq [%1 - 4],                 m3
920cabdff1aSopenharmony_ci    %else
921cabdff1aSopenharmony_ci        TRANSPOSE8x4W_STORE PASS8ROWS(pix_q-4, %1-4, stride_q, %2)
922cabdff1aSopenharmony_ci    %endif
923cabdff1aSopenharmony_ci%endmacro
924cabdff1aSopenharmony_ci
; Expand four tc0 bytes into 16-bit lanes.
; in:  %2 = register holding the address of the bytes
; out: %1 = one value per lane, each input byte covering two adjacent lanes
; A byte b becomes the word (b<<8 | b) shifted right arithmetically by 6:
; that is 4*b for 0 <= b < 64, and -1 for b = 0xFF.
925cabdff1aSopenharmony_ci%macro CHROMA_V_LOAD_TC 2
926cabdff1aSopenharmony_ci    movd        %1, [%2] ; load 4 bytes
927cabdff1aSopenharmony_ci    punpcklbw   %1, %1 ; duplicate each byte into both halves of a word
928cabdff1aSopenharmony_ci    punpcklwd   %1, %1 ; duplicate each word into two adjacent lanes
929cabdff1aSopenharmony_ci    psraw       %1, 6 ; arithmetic shift keeps the sign of the byte
930cabdff1aSopenharmony_ci%endmacro
931cabdff1aSopenharmony_ci
; DEBLOCK_CHROMA emits four chroma deblocking entry points for the current
; INIT_* instruction set: deblock_v_chroma_10, deblock_v_chroma_intra_10,
; deblock_h_chroma_10 and deblock_h_chroma422_10. LOAD_AB, LOAD_MASK and
; DEBLOCK_P0_Q0 are defined outside this macro; remarks about them are
; assumptions. In every function alpha and beta are multiplied by 4 first.
932cabdff1aSopenharmony_ci%macro DEBLOCK_CHROMA 0
933cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
934cabdff1aSopenharmony_ci; void ff_deblock_v_chroma_10(uint16_t *pix, int stride, int alpha, int beta,
935cabdff1aSopenharmony_ci;                             int8_t *tc0)
936cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
937cabdff1aSopenharmony_cicglobal deblock_v_chroma_10, 5,7-(mmsize/16),8*(mmsize/16)
938cabdff1aSopenharmony_ci    mov         r5, r0 ; r5 = pix (q0 row)
939cabdff1aSopenharmony_ci    sub         r0, r1
940cabdff1aSopenharmony_ci    sub         r0, r1 ; r0 = pix-2*stride (p1 row)
941cabdff1aSopenharmony_ci    shl        r2d, 2 ; alpha *= 4
942cabdff1aSopenharmony_ci    shl        r3d, 2 ; beta  *= 4
943cabdff1aSopenharmony_ci    CHROMA_V_LOAD r5 ; m0..m3 = p1, p0, q0, q1
944cabdff1aSopenharmony_ci    LOAD_AB     m4, m5, r2d, r3d
945cabdff1aSopenharmony_ci    LOAD_MASK   m0, m1, m2, m3, m4, m5, m7, m6, m4 ; m7 is used as the mask below
946cabdff1aSopenharmony_ci    pxor        m4, m4 ; m4 = 0
947cabdff1aSopenharmony_ci    CHROMA_V_LOAD_TC m6, r4 ; m6 = expanded tc0 values
948cabdff1aSopenharmony_ci    psubw       m6, [pw_3] ; subtract the pw_3 constant (presumably packed words of 3)
949cabdff1aSopenharmony_ci    pmaxsw      m6, m4 ; clamp negative lanes to 0
950cabdff1aSopenharmony_ci    pand        m7, m6 ; m7 = clamped tc in masked lanes, 0 elsewhere
951cabdff1aSopenharmony_ci    DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
952cabdff1aSopenharmony_ci    CHROMA_V_STORE ; write m1/m2 back to the p0/q0 rows
953cabdff1aSopenharmony_ci    RET
954cabdff1aSopenharmony_ci
955cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
956cabdff1aSopenharmony_ci; void ff_deblock_v_chroma_intra_10(uint16_t *pix, int stride, int alpha,
957cabdff1aSopenharmony_ci;                                   int beta)
958cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
959cabdff1aSopenharmony_cicglobal deblock_v_chroma_intra_10, 4,6-(mmsize/16),8*(mmsize/16)
960cabdff1aSopenharmony_ci    mov         r4, r0 ; r4 = pix (q0 row)
961cabdff1aSopenharmony_ci    sub         r0, r1
962cabdff1aSopenharmony_ci    sub         r0, r1 ; r0 = pix-2*stride (p1 row)
963cabdff1aSopenharmony_ci    shl        r2d, 2 ; alpha *= 4
964cabdff1aSopenharmony_ci    shl        r3d, 2 ; beta  *= 4
965cabdff1aSopenharmony_ci    CHROMA_V_LOAD r4 ; m0..m3 = p1, p0, q0, q1
966cabdff1aSopenharmony_ci    LOAD_AB     m4, m5, r2d, r3d
967cabdff1aSopenharmony_ci    LOAD_MASK   m0, m1, m2, m3, m4, m5, m7, m6, m4 ; m7 is passed as the mask below
968cabdff1aSopenharmony_ci    CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6 ; m5/m6 are its temporaries
969cabdff1aSopenharmony_ci    CHROMA_V_STORE ; write m1/m2 back to the p0/q0 rows
970cabdff1aSopenharmony_ci    RET
971cabdff1aSopenharmony_ci
972cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
973cabdff1aSopenharmony_ci; void ff_deblock_h_chroma_10(uint16_t *pix, int stride, int alpha, int beta,
974cabdff1aSopenharmony_ci;                             int8_t *tc0)
975cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
; Stack scratch: [rsp] and [rsp + mmsize] hold the p1/q1 values between
; CHROMA_H_LOAD and CHROMA_H_STORE.
; NOTE(review): the stack size is written as 0-2*mmsize; confirm the meaning
; of a negative size against x86inc's cglobal documentation.
976cabdff1aSopenharmony_cicglobal deblock_h_chroma_10, 5, 7, 8, 0-2*mmsize, pix_, stride_, alpha_, beta_, tc0_
977cabdff1aSopenharmony_ci    shl alpha_d,  2
978cabdff1aSopenharmony_ci    shl beta_d,   2
979cabdff1aSopenharmony_ci    mov r5,       pix_q
980cabdff1aSopenharmony_ci    lea r6,      [3*stride_q] ; r6 = 3*stride
981cabdff1aSopenharmony_ci    add r5,       r6 ; r5 = pix + 3*stride
982cabdff1aSopenharmony_ci
983cabdff1aSopenharmony_ci        CHROMA_H_LOAD r5, r6, [rsp], [rsp + mmsize]
984cabdff1aSopenharmony_ci        LOAD_AB          m4,  m5, alpha_d, beta_d
985cabdff1aSopenharmony_ci        LOAD_MASK        m0,  m1, m2, m3, m4, m5, m7, m6, m4
986cabdff1aSopenharmony_ci        pxor             m4,  m4 ; m4 = 0
987cabdff1aSopenharmony_ci        CHROMA_V_LOAD_TC m6,  tc0_q ; m6 = expanded tc0 values
988cabdff1aSopenharmony_ci        psubw            m6, [pw_3]
989cabdff1aSopenharmony_ci        pmaxsw           m6,  m4 ; clamp negative lanes to 0
990cabdff1aSopenharmony_ci        pand             m7,  m6 ; m7 = clamped tc in masked lanes, 0 elsewhere
991cabdff1aSopenharmony_ci        DEBLOCK_P0_Q0    m1,  m2, m0, m3, m7, m5, m6
992cabdff1aSopenharmony_ci        CHROMA_H_STORE r5, r6, [rsp], [rsp + mmsize]
993cabdff1aSopenharmony_ci
994cabdff1aSopenharmony_ciRET
995cabdff1aSopenharmony_ci
996cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
997cabdff1aSopenharmony_ci; void ff_deblock_h_chroma422_10(uint16_t *pix, int stride, int alpha, int beta,
998cabdff1aSopenharmony_ci;                                int8_t *tc0)
999cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
; Stack scratch: [rsp] holds the four expanded tc0 words; [rsp + 1*mmsize]
; and [rsp + 2*mmsize] hold the p1/q1 values between CHROMA_H_LOAD and
; CHROMA_H_STORE. The loop handles mmsize/2 rows and consumes two tc0 words
; (4 bytes of the scratch) per pass.
1000cabdff1aSopenharmony_cicglobal deblock_h_chroma422_10, 5, 7, 8, 0-3*mmsize, pix_, stride_, alpha_, beta_, tc0_
1001cabdff1aSopenharmony_ci    shl alpha_d,  2
1002cabdff1aSopenharmony_ci    shl beta_d,   2
1003cabdff1aSopenharmony_ci
1004cabdff1aSopenharmony_ci    movd m0, [tc0_q] ; load the four tc0 bytes
1005cabdff1aSopenharmony_ci    punpcklbw m0, m0 ; each byte duplicated into a word
1006cabdff1aSopenharmony_ci    psraw m0, 6 ; same byte->word expansion as CHROMA_V_LOAD_TC, minus the lane duplication
1007cabdff1aSopenharmony_ci    movq [rsp], m0 ; park the four words on the stack for the loop
1008cabdff1aSopenharmony_ci
1009cabdff1aSopenharmony_ci    mov r5,       pix_q
1010cabdff1aSopenharmony_ci    lea r6,      [3*stride_q] ; r6 = 3*stride
1011cabdff1aSopenharmony_ci    add r5,       r6 ; r5 = pix + 3*stride
1012cabdff1aSopenharmony_ci
1013cabdff1aSopenharmony_ci    mov r4, -8 ; negative byte offset into the tc0 words; the loop ends once it reaches 0
1014cabdff1aSopenharmony_ci    .loop:
1015cabdff1aSopenharmony_ci
1016cabdff1aSopenharmony_ci        CHROMA_H_LOAD r5, r6, [rsp + 1*mmsize], [rsp + 2*mmsize]
1017cabdff1aSopenharmony_ci        LOAD_AB          m4,  m5, alpha_d, beta_d
1018cabdff1aSopenharmony_ci        LOAD_MASK        m0,  m1, m2, m3, m4, m5, m7, m6, m4
1019cabdff1aSopenharmony_ci        pxor             m4,  m4 ; m4 = 0
1020cabdff1aSopenharmony_ci        movd             m6, [rsp + r4 + 8] ; two tc0 words for this pass
1021cabdff1aSopenharmony_ci        punpcklwd        m6,  m6
1022cabdff1aSopenharmony_ci        punpcklwd        m6,  m6 ; each word now fills four adjacent lanes
1023cabdff1aSopenharmony_ci        psubw            m6, [pw_3]
1024cabdff1aSopenharmony_ci        pmaxsw           m6,  m4 ; clamp negative lanes to 0
1025cabdff1aSopenharmony_ci        pand             m7,  m6 ; m7 = clamped tc in masked lanes, 0 elsewhere
1026cabdff1aSopenharmony_ci        DEBLOCK_P0_Q0    m1,  m2, m0, m3, m7, m5, m6
1027cabdff1aSopenharmony_ci        CHROMA_H_STORE r5, r6, [rsp + 1*mmsize], [rsp + 2*mmsize]
1028cabdff1aSopenharmony_ci
1029cabdff1aSopenharmony_ci        lea pix_q, [pix_q + (mmsize/2)*stride_q] ; advance mmsize/2 rows
1030cabdff1aSopenharmony_ci        lea r5,    [r5 +    (mmsize/2)*stride_q]
1031cabdff1aSopenharmony_ci        add r4, (mmsize/4) ; next pair of tc0 words when mmsize is 16; sets the flags for jl
1032cabdff1aSopenharmony_ci    jl .loop
1033cabdff1aSopenharmony_ciRET
1034cabdff1aSopenharmony_ci
1035cabdff1aSopenharmony_ci%endmacro
1036cabdff1aSopenharmony_ci
; Instantiate the DEBLOCK_CHROMA functions for SSE2, and additionally for AVX
; when HAVE_AVX_EXTERNAL is non-zero.
1037cabdff1aSopenharmony_ciINIT_XMM sse2
1038cabdff1aSopenharmony_ciDEBLOCK_CHROMA
1039cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL
1040cabdff1aSopenharmony_ciINIT_XMM avx
1041cabdff1aSopenharmony_ciDEBLOCK_CHROMA
1042cabdff1aSopenharmony_ci%endif
1043