;*****************************************************************************
;* MMX/SSE2/AVX-optimized H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Oskar Arvidsson <oskar@irock.se>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
26cabdff1aSopenharmony_ci
%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; 16 bytes of 0xA1. DEBLOCK_P0_Q0 subtracts this bias (in both directions,
; with unsigned saturation) to split the biased filter delta into its
; positive and negative parts.
pb_A1: times 16 db 0xA1
; Byte pair 3,1 repeated four times (8 bytes); not referenced by the macros
; in this part of the file.
pb_3_1: times 4 db 3, 1

SECTION .text

; Constants defined in another object. Judging by how pb_1 and pb_3 are used
; below they presumably hold the bytes 0, 1 and 3 replicated across a vector
; -- confirm against their definition.
cextern pb_0
cextern pb_1
cextern pb_3
39cabdff1aSopenharmony_ci
; Five-argument overload of PASS8ROWS: applies a byte offset to both base
; pointers and forwards to the four-argument PASS8ROWS, which is not defined
; in this file (it is expected to come from the include above).
%define PASS8ROWS(base, base3, stride, stride3, offset) \
    PASS8ROWS(base+offset, base3+offset, stride, stride3)
42cabdff1aSopenharmony_ci
; Load eight rows and transpose them into four registers.
; %1, %2, %3: punpck element-size suffixes for the three interleave stages
;             (TRANSPOSE4x8B_LOAD passes bw, wd, dq)
; in: 8 rows of 4 bytes in %4..%11
; out: 4 rows of 8 bytes in m0..m3
; clobbers: m4-m7
%macro TRANSPOSE4x8_LOAD 11
    ; first four input rows -> m0 (low interleave) / m2 (high interleave)
    movh       m0, %4
    movh       m2, %5
    movh       m1, %6
    movh       m3, %7
    punpckl%1  m0, m2
    punpckl%1  m1, m3
    mova       m2, m0
    punpckl%2  m0, m1
    punpckh%2  m2, m1

    ; last four input rows -> m4 (low interleave) / m6 (high interleave)
    movh       m4, %8
    movh       m6, %9
    movh       m5, %10
    movh       m7, %11
    punpckl%1  m4, m6
    punpckl%1  m5, m7
    mova       m6, m4
    punpckl%2  m4, m5
    punpckh%2  m6, m5

    ; merge the two halves; the high interleaves go to m1/m3 first so that
    ; m0/m2 are still intact when their low interleaves are formed
    punpckh%3  m1, m0, m4
    punpckh%3  m3, m2, m6
    punpckl%3  m0, m4
    punpckl%3  m2, m6
%endmacro
71cabdff1aSopenharmony_ci
; Transpose four 8-byte rows and store them as eight 4-byte rows.
; in: 4 rows of 8 bytes in m0..m3
; out: 8 rows of 4 bytes in %1..%8
; clobbers: m0-m6
%macro TRANSPOSE8x4B_STORE 8
    ; keep copies of the upper dwords of m0..m2 before the lower halves
    ; are interleaved in place
    punpckhdq  m4, m0, m0
    punpckhdq  m5, m1, m1
    punpckhdq  m6, m2, m2

    ; lower halves -> output rows %1..%4
    punpcklbw  m0, m1
    punpcklbw  m2, m3
    punpcklwd  m1, m0, m2
    punpckhwd  m0, m2
    movh       %1, m1
    punpckhdq  m1, m1
    movh       %2, m1
    movh       %3, m0
    punpckhdq  m0, m0
    movh       %4, m0

    ; upper halves (m4..m6 plus the upper dword of m3) -> output rows %5..%8
    punpckhdq  m3, m3
    punpcklbw  m4, m5
    punpcklbw  m6, m3
    punpcklwd  m5, m4, m6
    punpckhwd  m4, m6
    movh       %5, m5
    punpckhdq  m5, m5
    movh       %6, m5
    movh       %7, m4
    punpckhdq  m4, m4
    movh       %8, m4
%endmacro
102cabdff1aSopenharmony_ci
; Byte-element wrapper around TRANSPOSE4x8_LOAD: fixes the three interleave
; stages to bw, wd, dq and forwards the eight row operands %1..%8.
; out: m0..m3 (see TRANSPOSE4x8_LOAD); clobbers m4-m7
%macro TRANSPOSE4x8B_LOAD 8
    TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8
%endmacro
106cabdff1aSopenharmony_ci
; Interleave pair with explicit operands.
; %1: punpck element-size suffix (bw, wd, dq)
; %2: first source, overwritten with the low interleave of %2 and %3
; %3: second source, left unmodified (callers below also pass memory operands)
; %4: destination of the high interleave of %2 and %3
%macro SBUTTERFLY3 4
    punpckh%1  %4, %2, %3   ; must come first: the next line overwrites %2
    punpckl%1  %2, %3
%endmacro
111cabdff1aSopenharmony_ci
; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
; Uses m0-m7. [%9+0x10] doubles as a spill slot part-way through, so the
; output buffer must not alias any of the input rows.
; NOTE(review): the SBUTTERFLY calls (defined outside this file) take register
; numbers rather than names; the statement order below is assumed to depend on
; how that macro renames registers, so it is kept exactly as is.
%macro TRANSPOSE6x8_MEM 9
    RESET_MM_PERMUTATION
    movq  m0, %1
    movq  m1, %2
    movq  m2, %3
    movq  m3, %4
    movq  m4, %5
    movq  m5, %6
    movq  m6, %7
    SBUTTERFLY bw, 0, 1, 7
    SBUTTERFLY bw, 2, 3, 7
    SBUTTERFLY bw, 4, 5, 7
    movq  [%9+0x10], m3         ; spill: all eight registers are needed next
    SBUTTERFLY3 bw, m6, %8, m7  ; eighth row is consumed straight from memory
    SBUTTERFLY wd, 0, 2, 3
    SBUTTERFLY wd, 4, 6, 3
    punpckhdq m0, m4
    movq  [%9+0x00], m0         ; output row 0
    SBUTTERFLY3 wd, m1, [%9+0x10], m3 ; second source is the value spilled above
    SBUTTERFLY wd, 5, 7, 0
    SBUTTERFLY dq, 1, 5, 0
    SBUTTERFLY dq, 2, 6, 0
    punpckldq m3, m7
    movq  [%9+0x10], m2         ; output rows 1..5
    movq  [%9+0x20], m6
    movq  [%9+0x30], m1
    movq  [%9+0x40], m5
    movq  [%9+0x50], m3
    RESET_MM_PERMUTATION
%endmacro
144cabdff1aSopenharmony_ci
; in: 8 rows of 8 in %1..%8
; out: 8 rows of 8 in %9..%16
; Uses m0-m7. The output operands %9 and %11 are also used as spill slots
; before their final values are written, so they must not alias inputs that
; are still to be read.
; NOTE(review): as with TRANSPOSE6x8_MEM, the statement order is assumed to
; depend on the register renaming done by SBUTTERFLY (defined outside this
; file) and is therefore left untouched.
%macro TRANSPOSE8x8_MEM 16
    RESET_MM_PERMUTATION
    movq  m0, %1
    movq  m1, %2
    movq  m2, %3
    movq  m3, %4
    movq  m4, %5
    movq  m5, %6
    movq  m6, %7
    SBUTTERFLY bw, 0, 1, 7
    SBUTTERFLY bw, 2, 3, 7
    SBUTTERFLY bw, 4, 5, 7
    SBUTTERFLY3 bw, m6, %8, m7  ; eighth row is consumed straight from memory
    movq  %9,  m5               ; spill m5 so it can serve as the temporary
    SBUTTERFLY wd, 0, 2, 5
    SBUTTERFLY wd, 4, 6, 5
    SBUTTERFLY wd, 1, 3, 5
    movq  %11, m6               ; spill m6 ...
    movq  m6,  %9               ; ... and bring back the value spilled to %9
    SBUTTERFLY wd, 6, 7, 5
    SBUTTERFLY dq, 0, 4, 5
    SBUTTERFLY dq, 1, 6, 5
    movq  %9,  m0
    movq  %10, m4
    movq  %13, m1
    movq  %14, m6
    SBUTTERFLY3 dq, m2, %11, m0 ; second source is the value spilled to %11
    SBUTTERFLY dq, 3, 7, 4
    movq  %11, m2
    movq  %12, m0
    movq  %15, m3
    movq  %16, m7
    RESET_MM_PERMUTATION
%endmacro
181cabdff1aSopenharmony_ci
; Per-byte unsigned test |%1-%2| > %3.
; out: %4 = max(|%1-%2| - %3, 0), i.e. nonzero exactly where |%1-%2| > %3.
;      The result is NOT normalised to 0x00/0xFF; LOAD_MASK ORs several of
;      these together and compares against zero afterwards.
; clobbers: %5
%macro DIFF_GT 5
%if avx_enabled == 0
    mova    %5, %2
    mova    %4, %1
    psubusb %5, %1      ; max(%2-%1, 0)
    psubusb %4, %2      ; max(%1-%2, 0)
%else
    psubusb %5, %2, %1  ; max(%2-%1, 0)
    psubusb %4, %1, %2  ; max(%1-%2, 0)
%endif
    por     %4, %5      ; one of the two is always 0, so this is |%1-%2|
    psubusb %4, %3
%endmacro
197cabdff1aSopenharmony_ci
; Per-byte unsigned comparison of |%1-%2| against %3, producing a full mask.
; out: %4 = 0xFF where |%1-%2| is NOT greater than %3 (|%1-%2| <= %3),
;           0x00 where |%1-%2| > %3
;      Note the polarity: callers annotate the call with "|a-b| > thresh" and
;      then AND the result into the "filter this pixel" mask.
; clobbers: %5
%macro DIFF_GT2 5
%if ARCH_X86_64
    psubusb %5, %2, %1  ; max(%2-%1, 0)
    psubusb %4, %1, %2  ; max(%1-%2, 0)
%else
    mova    %5, %2
    mova    %4, %1
    psubusb %5, %1      ; max(%2-%1, 0)
    psubusb %4, %2      ; max(%1-%2, 0)
%endif
    psubusb %5, %3
    psubusb %4, %3
    ; One of the two one-sided differences is always 0 and stays 0 after the
    ; saturating subtract, so the values are equal only when both are 0,
    ; i.e. when the absolute difference does not exceed %3.
    pcmpeqb %4, %5
%endmacro
214cabdff1aSopenharmony_ci
; Build the per-pixel "edge is filtered" mask from the alpha/beta thresholds.
; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
; out: m5=beta-1, m7=mask, %3=alpha-1
;      mask is 0xFF where none of |p0-q0| > alpha-1, |p1-p0| > beta-1,
;      |q1-q0| > beta-1 holds, and 0x00 elsewhere
;      (%3 is optional; when given it receives the splatted alpha-1 vector)
; clobbers: m4,m6
%macro LOAD_MASK 2-3
    movd     m4, %1
    movd     m5, %2
    SPLATW   m4, m4
    SPLATW   m5, m5
    packuswb m4, m4  ; 16x alpha-1
    packuswb m5, m5  ; 16x beta-1
%if %0>2
    mova     %3, m4
%endif
    ; each DIFF_GT result is nonzero where its threshold is exceeded
    DIFF_GT  m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
    DIFF_GT  m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
    por      m7, m4
    DIFF_GT  m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
    por      m7, m4
    pxor     m6, m6
    pcmpeqb  m7, m6             ; 0xFF where no threshold was exceeded
%endmacro
236cabdff1aSopenharmony_ci
; Apply the clipped filter delta to the two pixels adjacent to the edge.
; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
; out: m1=p0' m2=q0'
; clobbers: m0,3-6
; (of these, the code below writes m3-m6; m0 is only read here)
%macro DEBLOCK_P0_Q0 0
    pcmpeqb m4, m4       ; all ones, used to complement q1 and p0
    pxor    m5, m1, m2   ; p0^q0
    pxor    m3, m4
    pand    m5, [pb_1]   ; (p0^q0)&1
    pavgb   m3, m0       ; (p1 - q1 + 256)>>1
    pxor    m4, m1
    pavgb   m3, [pb_3]   ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
    pavgb   m4, m2       ; (q0 - p0 + 256)>>1
    pavgb   m3, m5
    mova    m6, [pb_A1]
    paddusb m3, m4       ; d+128+33
    ; remove the 0xA1 (128+33) bias in both directions with saturation:
    psubusb m6, m3       ; m6 = max(-d, 0)
    psubusb m3, [pb_A1]  ; m3 = max( d, 0)
    pminub  m6, m7       ; limit both parts to tc (0 where masked out)
    pminub  m3, m7
    psubusb m1, m6       ; p0' = p0 - max(-d,0) + max(d,0), saturating
    psubusb m2, m3       ; q0' = q0 - max(d,0) + max(-d,0), saturating
    paddusb m1, m3
    paddusb m2, m6
%endmacro
261cabdff1aSopenharmony_ci
; Filter the second pixel away from the edge. The header is written for the
; q side; callers also use the macro for the p side by passing p1/p2 in place
; of q1/q2, which is why the inline comments mention p2.
; in: m1=p0 m2=q0
;     %1=q1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
; clobbers: q2, tmp, tc0
%macro LUMA_Q1 6
    pavgb   %6, m1, m2
    pavgb   %2, %6       ; avg(p2,avg(p0,q0))
    pxor    %6, %3
    pand    %6, [pb_1]   ; (p2^avg(p0,q0))&1
    psubusb %2, %6       ; (p2+((p0+q0+1)>>1))>>1
    psubusb %6, %1, %5   ; lower clip bound: %1 - tc0 (saturating)
    paddusb %5, %1       ; upper clip bound: %1 + tc0 (saturating)
    pmaxub  %2, %6
    pminub  %2, %5
    mova    %4, %2       ; write the filtered pixel back
%endmacro
278cabdff1aSopenharmony_ci
%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void ff_deblock_v_luma(uint8_t *pix, int stride, int alpha, int beta,
;                        int8_t *tc0)
;
; Filters a horizontal edge: pix points at the q0 row, the p rows lie above
; it. All rows are accessed with aligned 16-byte loads/stores.
; Register use: r4 initially holds tc0 and is then reused, under the name
; base3_, for pix-3*stride (the p2 row).
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA 0
cglobal deblock_v_luma_8, 5,5,10, pix_, stride_, alpha_, beta_, base3_
    movd    m8, [r4] ; tc0
    lea     r4, [stride_q*3]
    dec     alpha_d        ; alpha-1
    neg     r4
    dec     beta_d        ; beta-1
    add     base3_q, pix_q     ; pix-3*stride

    mova    m0, [base3_q + stride_q]   ; p1
    mova    m1, [base3_q + 2*stride_q] ; p0
    mova    m2, [pix_q]      ; q0
    mova    m3, [pix_q + stride_q]   ; q1
    LOAD_MASK r2d, r3d       ; m7 = filter mask, m5 = 16x beta-1

    punpcklbw m8, m8
    punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    pcmpeqb m9, m9
    pcmpeqb m9, m8   ; 0xFF where tc0 == -1
    pandn   m9, m7   ; m9 = mask restricted to pixels with tc0 != -1
    pand    m8, m9   ; m8 = tc & mask

    movdqa  m3, [base3_q] ; p2
    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand    m6, m9        ; m6 = 0xFF where p1 is to be filtered
    psubb   m7, m8, m6    ; tc for p0/q0: +1 where p1 is filtered
    pand    m6, m8        ; tc0 where p1 is filtered, else 0
    LUMA_Q1 m0, m3, [base3_q], [base3_q + stride_q], m6, m4

    movdqa  m4, [pix_q + 2*stride_q] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    pand    m6, m9        ; m6 = 0xFF where q1 is to be filtered
    pand    m8, m6        ; tc0 where q1 is filtered, else 0
    psubb   m7, m6        ; tc for p0/q0: +1 where q1 is filtered
    mova    m3, [pix_q + stride_q]   ; reload q1 (m3 was clobbered above)
    LUMA_Q1 m3, m4, [pix_q + 2*stride_q], [pix_q + stride_q], m8, m6

    DEBLOCK_P0_Q0
    mova    [base3_q + 2*stride_q], m1
    mova    [pix_q], m2
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_luma(uint8_t *pix, int stride, int alpha, int beta,
;                        int8_t *tc0)
;
; Filters a vertical edge by transposing the 6 pixels around the edge of 16
; rows into a stack buffer (6 rows of 16 bytes), running deblock_v_luma_8 on
; that buffer and transposing the 4 modified rows back.
;-----------------------------------------------------------------------------
INIT_MMX cpuname
cglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64
    movsxd r7,  r1d             ; r7 = stride (sign-extended)
    lea    r8,  [r7+r7*2]       ; r8 = 3*stride
    lea    r6,  [r0-4]          ; r6 = pix-4
    lea    r5,  [r0-4+r8]       ; r5 = pix-4 + 3*stride
%if WIN64
    %define pix_tmp rsp+0x30 ; shadow space + r4
%else
    %define pix_tmp rsp
%endif

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM  PASS8ROWS(r6, r5, r7, r8), pix_tmp
    lea    r6, [r6+r7*8]
    lea    r5, [r5+r7*8]
    TRANSPOSE6x8_MEM  PASS8ROWS(r6, r5, r7, r8), pix_tmp+8

    ; vertical filter
    ; alpha, beta, tc0 are still in r2d, r3d, r4
    ; don't backup r6, r5, r7, r8 because deblock_v_luma_8 only requests
    ; r0-r4 and therefore doesn't use them
    lea    r0, [pix_tmp+0x30]   ; q0 row of the transposed block
    mov    r1d, 0x10            ; row pitch of the temporary buffer
%if WIN64
    mov    [rsp+0x20], r4       ; fifth argument is passed on the stack
%endif
    call   deblock_v_luma_8

    ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
    ; r6/r5 still address the second group of 8 rows, so it is stored first
    add    r6, 2
    add    r5, 2
    movq   m0, [pix_tmp+0x18]
    movq   m1, [pix_tmp+0x28]
    movq   m2, [pix_tmp+0x38]
    movq   m3, [pix_tmp+0x48]
    TRANSPOSE8x4B_STORE  PASS8ROWS(r6, r5, r7, r8)

    ; step back 8 rows to the first group
    shl    r7,  3
    sub    r6,  r7
    sub    r5,  r7
    sar    r7,  3               ; arithmetic shift: r7 is a sign-extended
                                ; stride, a logical shr would corrupt a
                                ; negative value
    movq   m0, [pix_tmp+0x10]
    movq   m1, [pix_tmp+0x20]
    movq   m2, [pix_tmp+0x30]
    movq   m3, [pix_tmp+0x40]
    TRANSPOSE8x4B_STORE  PASS8ROWS(r6, r5, r7, r8)

    RET
%endmacro
379cabdff1aSopenharmony_ci
; Emits deblock_h_luma_mbaff_8(pix, stride, alpha, beta, tc0): filters a
; vertical luma edge over 8 rows. The 8 bytes starting at pix-4 of each row
; are loaded, transposed, spilled to a 8*16-byte stack area, filtered with
; the same sequence as deblock_v_luma_8, transposed back and stored.
%macro DEBLOCK_H_LUMA_MBAFF 0

cglobal deblock_h_luma_mbaff_8, 5, 9, 10, 8*16, pix_, stride_, alpha_, beta_, tc0_, base3_, stride3_
    movsxd stride_q,   stride_d
    dec    alpha_d              ; alpha-1
    dec    beta_d               ; beta-1
    mov    base3_q,    pix_q
    lea    stride3_q, [3*stride_q]
    add    base3_q,    stride3_q ; base3 = pix + 3*stride

    ; rows 0..7, 8 bytes each, starting 4 pixels left of the edge
    movq m0, [pix_q - 4]
    movq m1, [pix_q + stride_q - 4]
    movq m2, [pix_q + 2*stride_q - 4]
    movq m3, [base3_q - 4]
    movq m4, [base3_q + stride_q - 4]
    movq m5, [base3_q + 2*stride_q - 4]
    movq m6, [base3_q + stride3_q - 4]
    movq m7, [base3_q + 4*stride_q - 4]

    TRANSPOSE_8X8B 0,1,2,3,4,5,6,7

    ; spill all eight transposed rows, one per 16-byte stack slot
    %assign i 0
    %rep 8
        movq [rsp + 16*i], m %+ i
        %assign i i+1
    %endrep

    ; p2 = m1 [rsp + 16]
    ; p1 = m2 [rsp + 32]
    ; p0 = m3 [rsp + 48]
    ; q0 = m4 [rsp + 64]
    ; q1 = m5 [rsp + 80]
    ; q2 = m6 [rsp + 96]

    ; rename registers so that m0=p1 m1=p0 m2=q0 m3=q1, as LOAD_MASK expects
    SWAP 0, 2
    SWAP 1, 3
    SWAP 2, 4
    SWAP 3, 5

    LOAD_MASK alpha_d, beta_d
    movd m8, [tc0_q]
    punpcklbw m8, m8 ; each tc0 byte duplicated once: 2 rows per tc0 value
    pcmpeqb m9, m9
    pcmpeqb m9, m8   ; 0xFF where tc0 == -1
    pandn   m9, m7   ; m9 = mask restricted to pixels with tc0 != -1
    pand    m8, m9   ; m8 = tc & mask

    movdqa  m3, [rsp + 16] ; p2
    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand    m6, m9
    psubb   m7, m8, m6     ; tc for p0/q0: +1 where p1 is filtered
    pand    m6, m8
    LUMA_Q1 m0, m3, [rsp + 16], [rsp + 32], m6, m4

    movdqa  m4, [rsp + 96] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    pand    m6, m9
    pand    m8, m6
    psubb   m7, m6         ; tc for p0/q0: +1 where q1 is filtered
    mova    m3, [rsp + 80] ; reload q1 (m3 was clobbered above)
    LUMA_Q1 m3, m4, [rsp + 96], [rsp + 80], m8, m6

    DEBLOCK_P0_Q0
    ; move p0'/q0' into the row positions 3 and 4 and reload the remaining
    ; rows (p1'/q1' were written to the stack by LUMA_Q1)
    SWAP 1, 3
    SWAP 2, 4
    movq m0, [rsp]
    movq m1, [rsp + 16]
    movq m2, [rsp + 32]
    movq m5, [rsp + 80]
    movq m6, [rsp + 96]
    movq m7, [rsp + 112]

    TRANSPOSE_8X8B 0,1,2,3,4,5,6,7
    movq [pix_q - 4], m0
    movq [pix_q + stride_q - 4], m1
    movq [pix_q + 2*stride_q - 4], m2
    movq [base3_q - 4], m3
    movq [base3_q + stride_q - 4], m4
    movq [base3_q + 2*stride_q - 4], m5
    movq [base3_q + stride3_q - 4], m6
    movq [base3_q + 4*stride_q - 4], m7

RET

%endmacro
465cabdff1aSopenharmony_ci
; Emit the x86-64 luma filters for SSE2 and, if HAVE_AVX_EXTERNAL is set,
; again for AVX.
INIT_XMM sse2
DEBLOCK_H_LUMA_MBAFF
DEBLOCK_LUMA

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_H_LUMA_MBAFF
DEBLOCK_LUMA
%endif

%else ; !ARCH_X86_64
477cabdff1aSopenharmony_ci
; 32-bit variant of the luma edge filters.
; %1: infix of the vertical filter's name (deblock_%1_luma_8); when it is
;     "v8" the horizontal filter calls the vertical one twice
; %2: size in bytes of one vector stack slot; the vertical filter reserves
;     2*%2 bytes: [esp] holds the filter mask, [esp+%2] holds tc
%macro DEBLOCK_LUMA 2
;-----------------------------------------------------------------------------
; void ff_deblock_v8_luma(uint8_t *pix, int stride, int alpha, int beta,
;                         int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_%1_luma_8, 5,5,8,2*%2
    lea     r4, [r1*3]
    dec     r2     ; alpha-1
    neg     r4
    dec     r3     ; beta-1
    add     r4, r0 ; pix-3*stride

    mova    m0, [r4+r1]   ; p1
    mova    m1, [r4+2*r1] ; p0
    mova    m2, [r0]      ; q0
    mova    m3, [r0+r1]   ; q1
    LOAD_MASK r2, r3      ; m7 = filter mask, m5 = beta-1 vector

    mov     r3, r4mp      ; r4 was overwritten above: reload the tc0 argument
    pcmpeqb m3, m3        ; all ones (-1 in every byte)
    movd    m4, [r3] ; tc0
    punpcklbw m4, m4
    punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    mova   [esp+%2], m4 ; tc
    pcmpgtb m4, m3        ; 0xFF where tc0 > -1 (signed)
    mova    m3, [r4] ; p2
    pand    m4, m7
    mova   [esp], m4 ; mask

    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand    m6, m4        ; m6 = 0xFF where p1 is to be filtered
    pand    m4, [esp+%2] ; tc
    psubb   m7, m4, m6    ; tc for p0/q0: +1 where p1 is filtered
    pand    m6, m4
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    mova    m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    pand    m6, [esp] ; mask
    mova    m5, [esp+%2] ; tc
    psubb   m7, m6        ; tc for p0/q0: +1 where q1 is filtered
    pand    m5, m6
    mova    m3, [r0+r1]   ; reload q1 (m3 was clobbered above)
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6

    DEBLOCK_P0_Q0
    mova    [r4+2*r1], m1
    mova    [r0], m2
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_luma(uint8_t *pix, int stride, int alpha, int beta,
;                        int8_t *tc0)
;
; Transposes the 6 pixels around the edge of 16 rows into a stack buffer,
; calls the vertical filter on it and transposes the 4 modified rows back.
;-----------------------------------------------------------------------------
INIT_MMX cpuname
cglobal deblock_h_luma_8, 0,5,8,0x60+12
    mov    r0, r0mp
    mov    r3, r1m        ; r3 = stride
    lea    r4, [r3*3]     ; r4 = 3*stride
    sub    r0, 4          ; r0 = pix-4
    lea    r1, [r0+r4]    ; r1 = pix-4 + 3*stride
%define pix_tmp esp+12

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM  PASS8ROWS(r0, r1, r3, r4), pix_tmp
    lea    r0, [r0+r3*8]
    lea    r1, [r1+r3*8]
    TRANSPOSE6x8_MEM  PASS8ROWS(r0, r1, r3, r4), pix_tmp+8

    ; vertical filter
    ; arguments are pushed right to left: tc0, beta, alpha, stride=16, pix
    lea    r0, [pix_tmp+0x30]
    PUSH   dword r4m
    PUSH   dword r3m
    PUSH   dword r2m
    PUSH   dword 16
    PUSH   dword r0
    call   deblock_%1_luma_8
%ifidn %1, v8
    ; second call for the other 8 columns: patch the pushed arguments in place
    add    dword [esp   ], 8 ; pix_tmp+0x38
    add    dword [esp+16], 2 ; tc0+2
    call   deblock_%1_luma_8
%endif
    ADD    esp, 20        ; drop the five pushed arguments

    ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
    mov    r0, r0mp
    sub    r0, 2

    movq   m0, [pix_tmp+0x10]
    movq   m1, [pix_tmp+0x20]
    lea    r1, [r0+r4]
    movq   m2, [pix_tmp+0x30]
    movq   m3, [pix_tmp+0x40]
    TRANSPOSE8x4B_STORE  PASS8ROWS(r0, r1, r3, r4)

    lea    r0, [r0+r3*8]
    lea    r1, [r1+r3*8]
    movq   m0, [pix_tmp+0x18]
    movq   m1, [pix_tmp+0x28]
    movq   m2, [pix_tmp+0x38]
    movq   m3, [pix_tmp+0x48]
    TRANSPOSE8x4B_STORE  PASS8ROWS(r0, r1, r3, r4)

    RET
%endmacro ; DEBLOCK_LUMA
583cabdff1aSopenharmony_ci
; Emit the 32-bit luma filters (name infix "v", 16-byte stack slots) for SSE2
; and, if HAVE_AVX_EXTERNAL is set, again for AVX.
INIT_XMM sse2
DEBLOCK_LUMA v, 16
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA v, 16
%endif

%endif ; ARCH
592cabdff1aSopenharmony_ci
593cabdff1aSopenharmony_ci
594cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; LUMA_INTRA_P012 dst_p0, dst_p1, dst_p2, src_p3
;
; Strong (intra) luma filter for ONE side of the edge.
;   %1, %2, %3 : memory operands that receive the filtered p0, p1, p2
;   %4         : memory operand holding p3 (only read)
;
; Relies on symbols %defined by the expanding macro: p0, p1, p2, q0, q1
; (pixel rows), t0..t5 (temporaries, all overwritten), mask0 and mask1p
; (expected to be per-byte all-ones/all-zeros selectors), mpb_0 and mpb_1
; (zero and the pb_1 constant, presumably 1 in every byte).
;
; Per byte lane the stored values are
;   mask1p set      : p0'a, p1', p2'
;   only mask0 set  : p0'b, p1,  p2
;   neither set     : p0,   p1,  p2   (rows are rewritten unchanged)
; The xor/and/xor selection of p0 is only meaningful when mask1p is a subset
; of mask0.
;
; Rounding: each pavgb rounds up, so a chain of pavgb only estimates the
; exact quotient. The least significant bit of the exact result is rebuilt
; from the wrapping byte sum (paddb, psrlw, pavgb against zero; only bit 0 of
; each byte is used afterwards, and that bit is not disturbed by the
; word-wide shift), and 1 is subtracted where the estimate's LSB disagrees.
;-----------------------------------------------------------------------------
%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
%if ARCH_X86_64
    pavgb t0, p2, p1
    pavgb t1, p0, q0
%else
    mova  t0, p2
    mova  t1, p0
    pavgb t0, p1
    pavgb t1, q0
%endif
    pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
    mova  t5, t1 ; keep (p0+q0+1)/2 for the p0'a estimate below
%if ARCH_X86_64
    paddb t2, p2, p1
    paddb t3, p0, q0
%else
    mova  t2, p2
    mova  t3, p0
    paddb t2, p1
    paddb t3, q0
%endif
    paddb t2, t3 ; p2+p1+p0+q0 (wrapping byte sum)
    mova  t3, t2 ; copy used to build the p0'a sum
    mova  t4, t2 ; copy used to build the p2' sum
    psrlw t2, 1
    pavgb t2, mpb_0 ; bit 0 = LSB of the exact (sum+2)/4
    pxor  t2, t0
    pand  t2, mpb_1 ; 1 where the estimate's LSB is wrong
    psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;

%if ARCH_X86_64
    pavgb t1, p2, q1
    psubb t2, p2, q1
%else
    mova  t1, p2
    mova  t2, p2
    pavgb t1, q1
    psubb t2, q1
%endif
    paddb t3, t3
    psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
    pand  t2, mpb_1 ; LSB of p2-q1, i.e. whether (p2+q1) is odd
    psubb t1, t2 ; (p2+q1)/2 rounded down
    pavgb t1, p1
    pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
    psrlw t3, 2
    pavgb t3, mpb_0 ; bit 0 = LSB of the exact (sum+4)/8
    pxor  t3, t1
    pand  t3, mpb_1
    psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8

    pxor  t3, p0, q1
    pavgb t2, p0, q1
    pand  t3, mpb_1
    psubb t2, t3 ; (p0+q1)/2 rounded down
    pavgb t2, p1 ; p0'b = (2*p1+p0+q1+2)/4

    ; select p0'a (mask1p), p0'b (mask0 only) or the original p0
    pxor  t1, t2
    pxor  t2, p0
    pand  t1, mask1p
    pand  t2, mask0
    pxor  t1, t2
    pxor  t1, p0
    mova  %1, t1 ; store p0

    mova  t1, %4 ; p3
    paddb t2, t1, p2
    pavgb t1, p2
    pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
    paddb t2, t2
    paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
    psrlw t2, 2
    pavgb t2, mpb_0
    pxor  t2, t1
    pand  t2, mpb_1
    psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8

    ; keep p1'/p2' only where mask1p is set, otherwise the original rows
    pxor  t0, p1
    pxor  t1, p2
    pand  t0, mask1p
    pand  t1, mask1p
    pxor  t0, p1
    pxor  t1, p2
    mova  %2, t0 ; store p1
    mova  %3, t1 ; store p2
%endmacro
681cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; LUMA_INTRA_SWAP_PQ
;
; Mirrors the symbolic row names so that a following LUMA_INTRA_P012
; expansion filters the q side instead of the p side:
;   p1 <-> q1 and p0 <-> q0 (register aliases exchanged relative to the
;   defines made at the top of DEBLOCK_LUMA_INTRA), p2 becomes q2 and
;   mask1p becomes mask1q.
; q2 and mask1q are not redefined here, so p2/mask1p simply expand to
; whatever those names are defined as at the point of use.
; Emits no instructions.
;-----------------------------------------------------------------------------
%macro LUMA_INTRA_SWAP_PQ 0
    %define q1 m0
    %define q0 m1
    %define p0 m2
    %define p1 m3
    %define p2 q2
    %define mask1p mask1q
%endmacro
690cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; DEBLOCK_LUMA_INTRA name
;
; Emits two functions for the instruction set selected by the caller:
;   deblock_<name>_luma_intra_8 : filters the rows above and below pix (one
;                                 full SIMD register per row)
;   deblock_h_luma_intra_8      : filters across a vertical edge by
;                                 transposing 16 rows x 8 columns into a
;                                 stack buffer, running the function above
;                                 on that buffer, and transposing back
; <name> is pasted into the first function's name; the visible part of this
; file only instantiates it with "v".
;
; Stack use of deblock_<name>_luma_intra_8:
;   WIN64        : 0x10 bytes, holding mask1q at [rsp]
;   other x86-64 : the size expression evaluates to 0 and mask1q lives at
;                  [rsp-24]. NOTE(review): that is below the stack pointer,
;                  presumably relying on the SysV red zone - confirm.
;   x86-32       : the size expression evaluates to -0x50; spill(0)..spill(4)
;                  occupy 5*16 = 0x50 bytes.
;
; Side effect: the macro switches to INIT_MMX for the h function and leaves
; that selection active when it ends.
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA_INTRA 1
    ; symbolic names consumed by LUMA_INTRA_P012 / LUMA_INTRA_SWAP_PQ
    %define p1 m0
    %define p0 m1
    %define q0 m2
    %define q1 m3
    %define t0 m4
    %define t1 m5
    %define t2 m6
    %define t3 m7
%if ARCH_X86_64
    ; 16 vector registers: everything except mask1q stays in registers
    %define p2 m8
    %define q2 m9
    %define t4 m10
    %define t5 m11
    %define mask0 m12
    %define mask1p m13
%if WIN64
    %define mask1q [rsp]
%else
    %define mask1q [rsp-24]
%endif
    %define mpb_0 m14
    %define mpb_1 m15
%else
    ; 8 vector registers: p2/q2 are read from the picture, the remaining
    ; temporaries and masks are spilled to the stack
    %define spill(x) [esp+16*x]
    %define p2 [r4+r1]
    %define q2 [r0+2*r1]
    %define t4 spill(0)
    %define t5 spill(1)
    %define mask0 spill(2)
    %define mask1p spill(3)
    %define mask1q spill(4)
    %define mpb_0 [pb_0]
    %define mpb_1 [pb_1]
%endif

;-----------------------------------------------------------------------------
; void ff_deblock_v_luma_intra(uint8_t *pix, int stride, int alpha, int beta)
;-----------------------------------------------------------------------------
; r0 = pix, r1 = stride, r2d = alpha, r3d = beta
; r4 = pix-4*stride, r5 = 3*stride
; rows: p3=[r4] p2=[r4+r1] p1=[r4+2*r1] p0=[r4+r5]
;       q0=[r0] q1=[r0+r1] q2=[r0+2*r1] q3=[r0+r5]
%if WIN64
cglobal deblock_%1_luma_intra_8, 4,6,16,0x10
%else
cglobal deblock_%1_luma_intra_8, 4,6,16,ARCH_X86_64*0x50-0x50
%endif
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    dec     r2d        ; alpha-1
    jl .end            ; nothing to do when alpha-1 is negative
    neg     r4
    dec     r3d        ; beta-1
    jl .end            ; nothing to do when beta-1 is negative
    add     r4, r0     ; pix-4*stride
    mova    p1, [r4+2*r1]
    mova    p0, [r4+r5]
    mova    q0, [r0]
    mova    q1, [r0+r1]
%if ARCH_X86_64
    pxor    mpb_0, mpb_0
    mova    mpb_1, [pb_1]
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    SWAP    7, 12 ; m12=mask0
    pavgb   t5, mpb_0
    pavgb   t5, mpb_1 ; alpha/4+1
    movdqa  p2, [r4+r1]
    movdqa  q2, [r0+2*r1]
    DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1
    DIFF_GT2 p0, p2, m5, t2, t5 ; t2 = |p2-p0| > beta-1
    DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1
    pand    t0, mask0
    pand    t4, t0 ; both side masks are restricted to t0, which is
    pand    t2, t0 ; itself restricted to mask0
    mova    mask1q, t4
    mova    mask1p, t2
%else
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    mova    m4, t5
    mova    mask0, m7
    pavgb   m4, [pb_0]
    pavgb   m4, [pb_1] ; alpha/4+1
    DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
    pand    m6, mask0
    DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
    pand    m4, m6
    mova    mask1p, m4
    DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1
    pand    m4, m6
    mova    mask1q, m4
%endif
    ; p side first, then the same code with the p/q names mirrored
    LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
    LUMA_INTRA_SWAP_PQ
    LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
.end:
    RET

INIT_MMX cpuname
%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void ff_deblock_h_luma_intra(uint8_t *pix, int stride, int alpha, int beta)
;-----------------------------------------------------------------------------
; r7 = sign-extended stride, r8 = 3*stride, r6 = pix-4, r5 = r6+3*stride.
; r2d/r3d (alpha, beta) are passed through untouched to the inner call.
cglobal deblock_h_luma_intra_8, 4,9,0,0x80
    movsxd r7,  r1d
    lea    r8,  [r7*3]
    lea    r6,  [r0-4]
    lea    r5,  [r0-4+r8]
%if WIN64
    %define pix_tmp rsp+0x20 ; shadow space
%else
    %define pix_tmp rsp
%endif

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea    r6, [r6+r7*8]
    lea    r5, [r5+r7*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    ; filter the transposed block: pix = fifth 0x10-byte row, stride = 0x10
    lea    r0,  [pix_tmp+0x40]
    mov    r1,  0x10
    call   deblock_v_luma_intra_8

    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    lea    r5, [r6+r8] ; the callee overwrote r5; rebuild r6+3*stride
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
    shl    r7,  3      ; 8*stride
    sub    r6,  r7     ; step back to the first eight rows
    sub    r5,  r7
    sar    r7,  3      ; restore stride; arithmetic shift keeps a negative stride intact
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
    RET
%else
; x86-32: r0 = pix-4, r1 = stride, r3 = 3*stride, r2 = r0+3*stride
cglobal deblock_h_luma_intra_8, 2,4,8,0x80
    lea    r3,  [r1*3]
    sub    r0,  4
    lea    r2,  [r0+r3]
    %define pix_tmp rsp

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM  PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea    r0,  [r0+r1*8]
    lea    r2,  [r2+r1*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    ; build the stack arguments (pix_tmp+0x40, 16, alpha, beta) for the call
    lea    r0,  [pix_tmp+0x40]
    PUSH   dword r3m
    PUSH   dword r2m
    PUSH   dword 16
    PUSH   r0
    call   deblock_%1_luma_intra_8
    ; second call on the other 8 columns; only assembled for a "v8"
    ; instantiation, which does not occur in the visible part of this file
%ifidn %1, v8
    add    dword [rsp], 8 ; pix_tmp+8
    call   deblock_%1_luma_intra_8
%endif
    ADD    esp, 16

    ; reload the original arguments and rebuild the row pointers
    mov    r1,  r1m
    mov    r0,  r0mp
    lea    r3,  [r1*3]
    sub    r0,  4
    lea    r2,  [r0+r3]
    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    lea    r0,  [r0+r1*8]
    lea    r2,  [r2+r1*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    RET
%endif ; ARCH_X86_64
%endmacro ; DEBLOCK_LUMA_INTRA
858cabdff1aSopenharmony_ci
; Instantiate the intra luma filters: always for SSE2, and additionally for
; AVX when HAVE_AVX_EXTERNAL is set.
INIT_XMM sse2
DEBLOCK_LUMA_INTRA v
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA_INTRA v
%endif
865cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; LOAD_8_ROWS row0, ..., row7
; Loads 4 bytes (movd) from each of the eight operands into m0..m7, in order.
;-----------------------------------------------------------------------------
%macro LOAD_8_ROWS 8
    movd m0, %1
    movd m1, %2
    movd m2, %3
    movd m3, %4
    movd m4, %5
    movd m5, %6
    movd m6, %7
    movd m7, %8
%endmacro
876cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; STORE_8_ROWS row0, ..., row7
; Counterpart of LOAD_8_ROWS: stores the low 4 bytes (movd) of m0..m7 to the
; eight operands, in order.
;-----------------------------------------------------------------------------
%macro STORE_8_ROWS 8
    movd %1, m0
    movd %2, m1
    movd %3, m2
    movd %4, m3
    movd %5, m4
    movd %6, m5
    movd %7, m6
    movd %8, m7
%endmacro
887cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; TRANSPOSE_8x4B_XMM
; in : m0..m7 = eight rows, 4 bytes each in the low dword
; out: m0..m3 = the four columns, 8 bytes each in the low qword
; m4 and m6 are overwritten with intermediate data.
; MOVHL is defined elsewhere; it is used here to bring the upper qword of the
; source down into the destination's low qword.
;-----------------------------------------------------------------------------
%macro TRANSPOSE_8x4B_XMM 0
    punpcklbw m0, m1     ; interleave rows pairwise: 0/1, 2/3, 4/5, 6/7
    punpcklbw m2, m3
    punpcklbw m4, m5
    punpcklbw m6, m7
    punpcklwd m0, m2     ; one dword per column holding rows 0..3
    punpcklwd m4, m6     ; one dword per column holding rows 4..7
    punpckhdq m2, m0, m4 ; columns 2 and 3, rows 0..7
    punpckldq m0, m4     ; columns 0 and 1, rows 0..7
    MOVHL m1, m0         ; column 1
    MOVHL m3, m2         ; column 3
%endmacro
900cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; TRANSPOSE_4x8B_XMM
; Inverse of TRANSPOSE_8x4B_XMM.
; in : m0..m3 = four columns, 8 bytes each in the low qword
; out: m0..m7 = eight rows, 4 bytes each in the low dword
; MOVHL is defined elsewhere; it is used here to bring the upper qword of the
; source down into the destination's low qword.
;-----------------------------------------------------------------------------
%macro TRANSPOSE_4x8B_XMM 0
    punpcklbw m0, m1     ; columns 0/1 interleaved, rows 0..7
    punpcklbw m2, m3     ; columns 2/3 interleaved, rows 0..7
    punpckhwd m4, m0, m2 ; rows 4..7, one dword per row
    punpcklwd m0, m2     ; rows 0..3, one dword per row
    MOVHL m6, m4         ; rows 6 and 7
    MOVHL m2, m0         ; rows 2 and 3
    pshufd m1, m0, 1     ; second dword of each pair -> rows 1, 3, 5, 7
    pshufd m3, m2, 1
    pshufd m5, m4, 1
    pshufd m7, m6, 1
%endmacro
913cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; CHROMA_INTER_BODY_XMM n
; Shared core of the non-intra chroma filters.
;   n : number of byte-doubling steps applied to the tc0 values, so every
;       loaded tc0 byte ends up covering 2^n adjacent byte lanes.
; Reads 4 bytes from [tc0_q] (movd), expands them, ANDs them into m7 and then
; expands DEBLOCK_P0_Q0.
; LOAD_MASK and DEBLOCK_P0_Q0 are defined outside this part of the file;
; presumably m0..m3 hold p1, p0, q0, q1 and m7 is the filter mask produced by
; LOAD_MASK - confirm against their definitions.
;-----------------------------------------------------------------------------
%macro CHROMA_INTER_BODY_XMM 1
    LOAD_MASK alpha_d, beta_d
    movd m6, [tc0_q]
    %rep %1
        punpcklbw m6, m6
    %endrep
    pand m7, m6
    DEBLOCK_P0_Q0
%endmacro
923cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; CHROMA_INTRA_BODY_XMM
; Shared core of the intra chroma filters.
; in : m0 = p1, m1 = p0, m2 = q0, m3 = q1 (as loaded by the callers below)
; out: m1 = p0', m2 = q0' where the mask in m7 is set, otherwise unchanged
;        p0' = (2*p1 + p0 + q1 + 2) / 4
;        q0' = (2*q1 + q0 + p1 + 2) / 4
; m0 and m3 are only read by the instructions written here; m4, m5 and m6
; are used as scratch. LOAD_MASK is defined elsewhere and is relied on to
; leave the mask in m7.
;-----------------------------------------------------------------------------
%macro CHROMA_INTRA_BODY_XMM 0
    LOAD_MASK alpha_d, beta_d
    mova    m5,  m1      ; keep the original p0
    mova    m6,  m2      ; keep the original q0
    pxor    m4,  m1, m3
    pand    m4, [pb_1]   ; 1 where p0+q1 is odd
    pavgb   m1,  m3
    psubusb m1,  m4      ; (p0+q1)/2 rounded down
    pavgb   m1,  m0      ; p0' = (2*p1+p0+q1+2)/4
    pxor    m4,  m2, m0
    pand    m4, [pb_1]   ; 1 where q0+p1 is odd
    pavgb   m2,  m0
    psubusb m2,  m4      ; (q0+p1)/2 rounded down
    pavgb   m2,  m3      ; q0' = (2*q1+q0+p1+2)/4
    psubb   m1,  m5      ; masked select: original + ((new - original) & mask)
    psubb   m2,  m6
    pand    m1,  m7
    pand    m2,  m7
    paddb   m1,  m5
    paddb   m2,  m6
%endmacro
945cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; CHROMA_V_START_XMM reg
; Common entry code of the vertical-direction chroma filters:
;   - movsxdifnidn (defined elsewhere) is applied to the stride pair,
;     presumably sign-extending stride_d into stride_q where they differ
;   - alpha and beta are decremented in place (alpha-1, beta-1)
;   - reg is set to pix - 2*stride
;-----------------------------------------------------------------------------
%macro CHROMA_V_START_XMM 1
    movsxdifnidn stride_q, stride_d
    dec alpha_d
    dec beta_d
    mov %1, pix_q
    sub %1, stride_q
    sub %1, stride_q
%endmacro
954cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; CHROMA_H_START_XMM reg_a, reg_b
; Common entry code of the horizontal-direction chroma filters:
;   - movsxdifnidn (defined elsewhere) is applied to the stride pair,
;     presumably sign-extending stride_d into stride_q where they differ
;   - alpha and beta are decremented in place (alpha-1, beta-1)
;   - reg_b is set to 3*stride
;   - reg_a is set to pix + 3*stride
;-----------------------------------------------------------------------------
%macro CHROMA_H_START_XMM 2
    movsxdifnidn stride_q, stride_d
    dec alpha_d
    dec beta_d
    lea %2, [3*stride_q]
    mov %1,  pix_q
    add %1,  %2
%endmacro
963cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; DEBLOCK_CHROMA_XMM cpu
;
; Selects INIT_XMM <cpu> and emits six 8-bit chroma deblocking functions:
;   deblock_v_chroma_8          (pix, stride, alpha, beta, tc0)
;   deblock_h_chroma_8          (pix, stride, alpha, beta, tc0)
;   deblock_h_chroma422_8       (pix, stride, alpha, beta, tc0)
;   deblock_v_chroma_intra_8    (pix, stride, alpha, beta)
;   deblock_h_chroma_intra_8    (pix, stride, alpha, beta)
;   deblock_h_chroma422_intra_8 (pix, stride, alpha, beta)
;
; v variants: load the rows p1, p0, q0, q1 (8 bytes each) around pix and
;   store back only p0 and q0.
; h variants: load 4 bytes from each of 8 rows starting at pix-2, transpose
;   so that m0..m3 hold the columns p1, p0, q0, q1, filter, transpose back
;   and store all 4 bytes of each row.
; 422 variants: run the h sequence twice, the second time 8 rows further
;   down; the non-intra one also advances tc0 by 2 bytes in between.
;
; The non-intra h variants request 16 bytes of stack to park m0 and m3 across
; CHROMA_INTER_BODY_XMM (presumably because LOAD_MASK / DEBLOCK_P0_Q0, which
; are defined elsewhere, overwrite them - confirm).
;-----------------------------------------------------------------------------
%macro DEBLOCK_CHROMA_XMM 1

INIT_XMM %1

cglobal deblock_v_chroma_8, 5, 6, 8, pix_, stride_, alpha_, beta_, tc0_
    CHROMA_V_START_XMM r5
    movq m0, [r5]
    movq m1, [r5 + stride_q]
    movq m2, [pix_q]
    movq m3, [pix_q + stride_q]
    CHROMA_INTER_BODY_XMM 1
    movq [r5 + stride_q], m1
    movq [pix_q], m2
RET

cglobal deblock_h_chroma_8, 5, 7, 8, 0-16, pix_, stride_, alpha_, beta_, tc0_
    CHROMA_H_START_XMM r5, r6
    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
    TRANSPOSE_8x4B_XMM
    movq [rsp], m0
    movq [rsp + 8], m3
    CHROMA_INTER_BODY_XMM 1
    movq m0, [rsp]
    movq m3, [rsp + 8]
    TRANSPOSE_4x8B_XMM
    STORE_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
RET

cglobal deblock_h_chroma422_8, 5, 7, 8, 0-16, pix_, stride_, alpha_, beta_, tc0_
    CHROMA_H_START_XMM r5, r6
    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
    TRANSPOSE_8x4B_XMM
    movq [rsp], m0
    movq [rsp + 8], m3
    CHROMA_INTER_BODY_XMM 2
    movq m0, [rsp]
    movq m3, [rsp + 8]
    TRANSPOSE_4x8B_XMM
    STORE_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)

    ; second batch of 8 rows, using the next two tc0 bytes
    lea pix_q, [pix_q + 8*stride_q]
    lea r5,    [r5    + 8*stride_q]
    add tc0_q,  2

    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
    TRANSPOSE_8x4B_XMM
    movq [rsp], m0
    movq [rsp + 8], m3
    CHROMA_INTER_BODY_XMM 2
    movq m0, [rsp]
    movq m3, [rsp + 8]
    TRANSPOSE_4x8B_XMM
    STORE_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
RET

cglobal deblock_v_chroma_intra_8, 4, 5, 8, pix_, stride_, alpha_, beta_
    CHROMA_V_START_XMM r4
    movq m0, [r4]
    movq m1, [r4 + stride_q]
    movq m2, [pix_q]
    movq m3, [pix_q + stride_q]
    CHROMA_INTRA_BODY_XMM
    movq [r4 + stride_q], m1
    movq [pix_q], m2
RET

cglobal deblock_h_chroma_intra_8, 4, 6, 8, pix_, stride_, alpha_, beta_
    CHROMA_H_START_XMM r4, r5
    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
    TRANSPOSE_8x4B_XMM
    CHROMA_INTRA_BODY_XMM
    TRANSPOSE_4x8B_XMM
    STORE_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
RET

cglobal deblock_h_chroma422_intra_8, 4, 6, 8, pix_, stride_, alpha_, beta_
    CHROMA_H_START_XMM r4, r5
    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
    TRANSPOSE_8x4B_XMM
    CHROMA_INTRA_BODY_XMM
    TRANSPOSE_4x8B_XMM
    STORE_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)

    ; second batch of 8 rows
    lea pix_q, [pix_q + 8*stride_q]
    lea r4,    [r4    + 8*stride_q]

    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
    TRANSPOSE_8x4B_XMM
    CHROMA_INTRA_BODY_XMM
    TRANSPOSE_4x8B_XMM
    STORE_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
RET

%endmacro ; DEBLOCK_CHROMA_XMM
1058cabdff1aSopenharmony_ci
; Instantiate the chroma filters: always for SSE2, and for AVX only when
; HAVE_AVX_EXTERNAL is set, matching how the luma filters in this file are
; instantiated.
; NOTE(review): assumes the C-side users of the *_avx chroma symbols are
; compiled out when HAVE_AVX_EXTERNAL is 0 - confirm in the init code.
DEBLOCK_CHROMA_XMM sse2
%if HAVE_AVX_EXTERNAL
DEBLOCK_CHROMA_XMM avx
%endif
1061cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; void ff_h264_loop_filter_strength(int16_t bs[2][4][4], uint8_t nnz[40],
;                                   int8_t ref[2][40], int16_t mv[2][40][2],
;                                   int bidir,    int edges,    int step,
;                                   int mask_mv0, int mask_mv1, int field);
;
; bidir    is 0 or 1
; edges    is 1 or 4
; step     is 1 or 2
; mask_mv0 is 0 or 3
; mask_mv1 is 0 or 1
; field    is 0 or 1
;-----------------------------------------------------------------------------
; loop_filter_strength_iteration edges, step, mask_mv, dir, d_idx, mask_dir,
;                                bidir
;
; Emits one complete pass over the edges of one direction.
;   edges, step, mask_mv : loop bound, increment and skip mask for b_idx; all
;                          three are compared against b_idx directly, so they
;                          must use the same units (the caller pre-scales
;                          them by 8)
;   dir                  : selects the output half, bs + 32*dir
;   d_idx                : offset from block b to its neighbour bn (-8 and -1
;                          in the two expansions made by the caller)
;   mask_dir             : when 0, m0 is cleared at the top of every
;                          iteration; otherwise a skipped iteration reuses
;                          m0 as left by the previous one
;   bidir                : assemble-time choice between the one-list (0) and
;                          two-list (1) ref/mv comparison
; Uses bsq, nnzq, refq, mvq and the b_idxq/b_idxd alias, plus three constants
; set up by the caller: m7 (loaded from pb_1), m6 (the mv limit constant)
; and m5 (twice m6).
; Offsets 12 and 52 (= 12 + 40) address the same position in the first and
; second list of ref[2][40] / mv[2][40][2].
; Per block the value stored is 2 when nnz[b] | nnz[bn] is non-zero, else 1
; when the ref/mv comparison left a non-zero byte in m0, else 0, widened to
; 16 bits.
%macro loop_filter_strength_iteration 7 ; edges, step, mask_mv,
                                        ; dir, d_idx, mask_dir, bidir
%define edgesd    %1
%define stepd     %2
%define mask_mvd  %3
%define dir       %4
%define d_idx     %5
%define mask_dir  %6
%define bidir     %7
    xor          b_idxd, b_idxd ; for (b_idx = 0; b_idx < edges; b_idx += step)
%%.b_idx_loop:
%if mask_dir == 0
    pxor             m0, m0
%endif
    test         b_idxd, dword mask_mvd
    jnz %%.skip_loop_iter                       ; if (!(b_idx & mask_mv))
%if bidir == 1
    movd             m2, [refq+b_idxq+d_idx+12] ; { ref0[bn] }
    punpckldq        m2, [refq+b_idxq+d_idx+52] ; { ref0[bn], ref1[bn] }
    pshufw           m0, [refq+b_idxq+12], 0x44 ; { ref0[b],  ref0[b]  }
    pshufw           m1, [refq+b_idxq+52], 0x44 ; { ref1[b],  ref1[b]  }
    pshufw           m3, m2, 0x4E               ; { ref1[bn], ref0[bn] }
    psubb            m0, m2                     ; { ref0[b] != ref0[bn],
                                                ;   ref0[b] != ref1[bn] }
    psubb            m1, m3                     ; { ref1[b] != ref1[bn],
                                                ;   ref1[b] != ref0[bn] }

    ; neighbour list-0 vectors against both lists of the current block
    por              m0, m1
    mova             m1, [mvq+b_idxq*4+(d_idx+12)*4]
    mova             m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize]
    mova             m3, m1
    mova             m4, m2
    psubw            m1, [mvq+b_idxq*4+12*4]
    psubw            m2, [mvq+b_idxq*4+12*4+mmsize]
    psubw            m3, [mvq+b_idxq*4+52*4]
    psubw            m4, [mvq+b_idxq*4+52*4+mmsize]
    packsswb         m1, m2
    packsswb         m3, m4
    paddb            m1, m6
    paddb            m3, m6
    psubusb          m1, m5 ; abs(mv[b] - mv[bn]) >= limit
    psubusb          m3, m5
    packsswb         m1, m3

    ; neighbour list-1 vectors against both lists of the current block
    por              m0, m1
    mova             m1, [mvq+b_idxq*4+(d_idx+52)*4]
    mova             m2, [mvq+b_idxq*4+(d_idx+52)*4+mmsize]
    mova             m3, m1
    mova             m4, m2
    psubw            m1, [mvq+b_idxq*4+12*4]
    psubw            m2, [mvq+b_idxq*4+12*4+mmsize]
    psubw            m3, [mvq+b_idxq*4+52*4]
    psubw            m4, [mvq+b_idxq*4+52*4+mmsize]
    packsswb         m1, m2
    packsswb         m3, m4
    paddb            m1, m6
    paddb            m3, m6
    psubusb          m1, m5 ; abs(mv[b] - mv[bn]) >= limit
    psubusb          m3, m5
    packsswb         m1, m3

    ; swap halves so matching pairings line up, then keep the smaller of the
    ; straight and crossed results
    pshufw           m1, m1, 0x4E
    por              m0, m1
    pshufw           m1, m0, 0x4E
    pminub           m0, m1
%else ; bidir == 0
    movd             m0, [refq+b_idxq+12]
    psubb            m0, [refq+b_idxq+d_idx+12] ; ref[b] != ref[bn]

    mova             m1, [mvq+b_idxq*4+12*4]
    mova             m2, [mvq+b_idxq*4+12*4+mmsize]
    psubw            m1, [mvq+b_idxq*4+(d_idx+12)*4]
    psubw            m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize]
    packsswb         m1, m2
    paddb            m1, m6
    psubusb          m1, m5 ; abs(mv[b] - mv[bn]) >= limit
    packsswb         m1, m1
    por              m0, m1
%endif ; bidir == 1/0

%%.skip_loop_iter:
    movd             m1, [nnzq+b_idxq+12]
    por              m1, [nnzq+b_idxq+d_idx+12] ; nnz[b] || nnz[bn]

    pminub           m1, m7 ; clamp both flags to 0/1
    pminub           m0, m7
    psllw            m1, 1  ; coefficient flag becomes 0/2
    pxor             m2, m2
    pmaxub           m1, m0 ; final strength 0, 1 or 2 per byte
    punpcklbw        m1, m2 ; widen to int16_t
    movq [bsq+b_idxq+32*dir], m1

    add          b_idxd, dword stepd
    cmp          b_idxd, dword edgesd
    jl %%.b_idx_loop
%endmacro
1171cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; h264_loop_filter_strength (MMXEXT)
;
; Fills bs for both directions by expanding loop_filter_strength_iteration
; twice (first with dir = 1 and d_idx = -8 using the edges/step/mask_mv1
; arguments, then with dir = 0 and d_idx = -1 using fixed bounds 32 and 8 and
; mask_mv0), and finally transposes the first four 8-byte rows of bs (the
; dir = 0 half) as a 4x4 matrix of words.
;
; Setup:
;   m7 = [pb_1]
;   m6 = [pb_3], or [pb_3_1] when field is non-zero (the mv limit constant;
;        the contents of these tables are defined outside this part of the
;        file)
;   m5 = 2 * m6
;   step, edges, mask_mv0 and mask_mv1 are shifted left by 3 so that b_idx
;   directly serves as the byte offset of an 8-byte row of bs.
; b_idx shares the register of the bidir argument, so bidir is tested before
; the first expansion overwrites it.
; On x86-32 mask_mv0/mask_mv1 are addressed through their stack slots and
; are therefore scaled in place there.
;-----------------------------------------------------------------------------
INIT_MMX mmxext
cglobal h264_loop_filter_strength, 9, 9, 0, bs, nnz, ref, mv, bidir, edges, \
                                            step, mask_mv0, mask_mv1, field
%define b_idxq bidirq
%define b_idxd bidird
    cmp    dword fieldm, 0
    mova             m7, [pb_1]
    mova             m5, [pb_3]
    je .nofield
    mova             m5, [pb_3_1]
.nofield:
    mova             m6, m5
    paddb            m5, m5

    shl     dword stepd, 3
    shl    dword edgesd, 3
%if ARCH_X86_32
%define mask_mv0d mask_mv0m
%define mask_mv1d mask_mv1m
%endif
    shl dword mask_mv1d, 3
    shl dword mask_mv0d, 3

    cmp    dword bidird, 0
    jne .bidir
    ; single-list comparison
    loop_filter_strength_iteration edgesd, stepd, mask_mv1d, 1, -8,  0, 0
    loop_filter_strength_iteration     32,     8, mask_mv0d, 0, -1, -1, 0

    ; transpose the dir = 0 half of bs in place
    mova             m0, [bsq+mmsize*0]
    mova             m1, [bsq+mmsize*1]
    mova             m2, [bsq+mmsize*2]
    mova             m3, [bsq+mmsize*3]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    mova  [bsq+mmsize*0], m0
    mova  [bsq+mmsize*1], m1
    mova  [bsq+mmsize*2], m2
    mova  [bsq+mmsize*3], m3
    RET

.bidir:
    ; two-list comparison
    loop_filter_strength_iteration edgesd, stepd, mask_mv1d, 1, -8,  0, 1
    loop_filter_strength_iteration     32,     8, mask_mv0d, 0, -1, -1, 1

    ; transpose the dir = 0 half of bs in place
    mova             m0, [bsq+mmsize*0]
    mova             m1, [bsq+mmsize*1]
    mova             m2, [bsq+mmsize*2]
    mova             m3, [bsq+mmsize*3]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    mova  [bsq+mmsize*0], m0
    mova  [bsq+mmsize*1], m1
    mova  [bsq+mmsize*2], m2
    mova  [bsq+mmsize*3], m3
    RET
1225