1cabdff1aSopenharmony_ci;*****************************************************************************
2cabdff1aSopenharmony_ci;* SSE2-optimized HEVC deblocking code
3cabdff1aSopenharmony_ci;*****************************************************************************
4cabdff1aSopenharmony_ci;* Copyright (C) 2013 VTT
5cabdff1aSopenharmony_ci;*
6cabdff1aSopenharmony_ci;* Authors: Seppo Tomperi <seppo.tomperi@vtt.fi>
7cabdff1aSopenharmony_ci;*
8cabdff1aSopenharmony_ci;* This file is part of FFmpeg.
9cabdff1aSopenharmony_ci;*
10cabdff1aSopenharmony_ci;* FFmpeg is free software; you can redistribute it and/or
11cabdff1aSopenharmony_ci;* modify it under the terms of the GNU Lesser General Public
12cabdff1aSopenharmony_ci;* License as published by the Free Software Foundation; either
13cabdff1aSopenharmony_ci;* version 2.1 of the License, or (at your option) any later version.
14cabdff1aSopenharmony_ci;*
15cabdff1aSopenharmony_ci;* FFmpeg is distributed in the hope that it will be useful,
16cabdff1aSopenharmony_ci;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17cabdff1aSopenharmony_ci;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18cabdff1aSopenharmony_ci;* Lesser General Public License for more details.
19cabdff1aSopenharmony_ci;*
20cabdff1aSopenharmony_ci;* You should have received a copy of the GNU Lesser General Public
21cabdff1aSopenharmony_ci;* License along with FFmpeg; if not, write to the Free Software
22cabdff1aSopenharmony_ci;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23cabdff1aSopenharmony_ci;******************************************************************************
24cabdff1aSopenharmony_ci
25cabdff1aSopenharmony_ci%include "libavutil/x86/x86util.asm"
26cabdff1aSopenharmony_ci
27cabdff1aSopenharmony_ciSECTION_RODATA
28cabdff1aSopenharmony_ci
29cabdff1aSopenharmony_cicextern pw_1023
30cabdff1aSopenharmony_ci%define pw_pixel_max_10 pw_1023
31cabdff1aSopenharmony_cipw_pixel_max_12: times 8 dw ((1 << 12)-1)
32cabdff1aSopenharmony_cipw_m2:           times 8 dw -2
33cabdff1aSopenharmony_cipd_1 :           times 4 dd  1
34cabdff1aSopenharmony_ci
35cabdff1aSopenharmony_cicextern pw_4
36cabdff1aSopenharmony_cicextern pw_8
37cabdff1aSopenharmony_cicextern pw_m1
38cabdff1aSopenharmony_ci
39cabdff1aSopenharmony_ciSECTION .text
40cabdff1aSopenharmony_ciINIT_XMM sse2
41cabdff1aSopenharmony_ci
42cabdff1aSopenharmony_ci; in: 8 rows of 4 bytes in %1..%8
43cabdff1aSopenharmony_ci; out: 4 rows of 8 words in m0..m3
; Loads an 8-row x 4-byte block, transposes it and zero-extends each byte to a
; word: on exit m0..m3 hold input columns 0..3, one word per input row.
; clobbers m4, m5, m6
44cabdff1aSopenharmony_ci%macro TRANSPOSE4x8B_LOAD 8
45cabdff1aSopenharmony_ci    movd             m0, %1 ; row 0
46cabdff1aSopenharmony_ci    movd             m2, %2 ; row 1
47cabdff1aSopenharmony_ci    movd             m1, %3 ; row 2
48cabdff1aSopenharmony_ci    movd             m3, %4 ; row 3
49cabdff1aSopenharmony_ci
50cabdff1aSopenharmony_ci    punpcklbw        m0, m2 ; interleave bytes of rows 0 and 1
51cabdff1aSopenharmony_ci    punpcklbw        m1, m3 ; interleave bytes of rows 2 and 3
52cabdff1aSopenharmony_ci    punpcklwd        m0, m1 ; one dword per column, rows 0..3
53cabdff1aSopenharmony_ci
54cabdff1aSopenharmony_ci    movd             m4, %5 ; row 4
55cabdff1aSopenharmony_ci    movd             m6, %6 ; row 5
56cabdff1aSopenharmony_ci    movd             m5, %7 ; row 6
57cabdff1aSopenharmony_ci    movd             m3, %8 ; row 7 (m3 is free again after the unpack above)
58cabdff1aSopenharmony_ci
59cabdff1aSopenharmony_ci    punpcklbw        m4, m6 ; interleave bytes of rows 4 and 5
60cabdff1aSopenharmony_ci    punpcklbw        m5, m3 ; interleave bytes of rows 6 and 7
61cabdff1aSopenharmony_ci    punpcklwd        m4, m5 ; one dword per column, rows 4..7
62cabdff1aSopenharmony_ci
63cabdff1aSopenharmony_ci    punpckhdq        m2, m0, m4 ; columns 2 and 3, 8 bytes each
64cabdff1aSopenharmony_ci    punpckldq        m0, m4 ; columns 0 and 1, 8 bytes each
65cabdff1aSopenharmony_ci
66cabdff1aSopenharmony_ci    pxor             m5, m5 ; zero register used to widen bytes to words
67cabdff1aSopenharmony_ci    punpckhbw        m1, m0, m5 ; column 1 as words
68cabdff1aSopenharmony_ci    punpcklbw        m0, m5 ; column 0 as words
69cabdff1aSopenharmony_ci    punpckhbw        m3, m2, m5 ; column 3 as words
70cabdff1aSopenharmony_ci    punpcklbw        m2, m5 ; column 2 as words
71cabdff1aSopenharmony_ci%endmacro
72cabdff1aSopenharmony_ci
73cabdff1aSopenharmony_ci; in: 4 rows of 8 words in m0..m3
74cabdff1aSopenharmony_ci; out: 8 rows of 4 bytes in %1..%8
; Packs the words of m0..m3 to bytes with unsigned saturation, transposes them
; and writes one dword to each of the eight destinations.
; m0 and m1 are overwritten; m2 is handed to SBUTTERFLY as its last argument
; (presumably a scratch register - confirm against x86util.asm).
75cabdff1aSopenharmony_ci%macro TRANSPOSE8x4B_STORE 8
76cabdff1aSopenharmony_ci    packuswb         m0, m2 ; m0 = bytes of m0 (low half), bytes of m2 (high half)
77cabdff1aSopenharmony_ci    packuswb         m1, m3 ; m1 = bytes of m1 (low half), bytes of m3 (high half)
78cabdff1aSopenharmony_ci    SBUTTERFLY bw, 0, 1, 2
79cabdff1aSopenharmony_ci    SBUTTERFLY wd, 0, 1, 2
80cabdff1aSopenharmony_ci
81cabdff1aSopenharmony_ci    movd             %1, m0 ; store dword 0 of m0
82cabdff1aSopenharmony_ci    pshufd           m0, m0, 0x39 ; rotate dwords so the next one lands in the low lane
83cabdff1aSopenharmony_ci    movd             %2, m0
84cabdff1aSopenharmony_ci    pshufd           m0, m0, 0x39
85cabdff1aSopenharmony_ci    movd             %3, m0
86cabdff1aSopenharmony_ci    pshufd           m0, m0, 0x39
87cabdff1aSopenharmony_ci    movd             %4, m0
88cabdff1aSopenharmony_ci
89cabdff1aSopenharmony_ci    movd             %5, m1 ; same sequence for the four dwords of m1
90cabdff1aSopenharmony_ci    pshufd           m1, m1, 0x39
91cabdff1aSopenharmony_ci    movd             %6, m1
92cabdff1aSopenharmony_ci    pshufd           m1, m1, 0x39
93cabdff1aSopenharmony_ci    movd             %7, m1
94cabdff1aSopenharmony_ci    pshufd           m1, m1, 0x39
95cabdff1aSopenharmony_ci    movd             %8, m1
96cabdff1aSopenharmony_ci%endmacro
97cabdff1aSopenharmony_ci
98cabdff1aSopenharmony_ci; in: 8 rows of 4 words in %1..%8
99cabdff1aSopenharmony_ci; out: 4 rows of 8 words in m0..m3
; Loads an 8-row x 4-word block and transposes it: on exit m0..m3 hold input
; columns 0..3, one word per input row.
; clobbers m4, m5, m6
100cabdff1aSopenharmony_ci%macro TRANSPOSE4x8W_LOAD 8
101cabdff1aSopenharmony_ci    movq             m0, %1 ; row 0
102cabdff1aSopenharmony_ci    movq             m2, %2 ; row 1
103cabdff1aSopenharmony_ci    movq             m1, %3 ; row 2
104cabdff1aSopenharmony_ci    movq             m3, %4 ; row 3
105cabdff1aSopenharmony_ci
106cabdff1aSopenharmony_ci    punpcklwd        m0, m2 ; interleave words of rows 0 and 1
107cabdff1aSopenharmony_ci    punpcklwd        m1, m3 ; interleave words of rows 2 and 3
108cabdff1aSopenharmony_ci    punpckhdq        m2, m0, m1 ; columns 2 and 3, rows 0..3
109cabdff1aSopenharmony_ci    punpckldq        m0, m1 ; columns 0 and 1, rows 0..3
110cabdff1aSopenharmony_ci
111cabdff1aSopenharmony_ci    movq             m4, %5 ; row 4
112cabdff1aSopenharmony_ci    movq             m6, %6 ; row 5
113cabdff1aSopenharmony_ci    movq             m5, %7 ; row 6
114cabdff1aSopenharmony_ci    movq             m3, %8 ; row 7 (m3 is free again after the unpack above)
115cabdff1aSopenharmony_ci
116cabdff1aSopenharmony_ci    punpcklwd        m4, m6 ; interleave words of rows 4 and 5
117cabdff1aSopenharmony_ci    punpcklwd        m5, m3 ; interleave words of rows 6 and 7
118cabdff1aSopenharmony_ci    punpckhdq        m6, m4, m5 ; columns 2 and 3, rows 4..7
119cabdff1aSopenharmony_ci    punpckldq        m4, m5 ; columns 0 and 1, rows 4..7
120cabdff1aSopenharmony_ci
121cabdff1aSopenharmony_ci    punpckhqdq       m1, m0, m4 ; column 1, rows 0..7
122cabdff1aSopenharmony_ci    punpcklqdq       m0, m4 ; column 0, rows 0..7
123cabdff1aSopenharmony_ci    punpckhqdq       m3, m2, m6 ; column 3, rows 0..7
124cabdff1aSopenharmony_ci    punpcklqdq       m2, m6 ; column 2, rows 0..7
125cabdff1aSopenharmony_ci
126cabdff1aSopenharmony_ci%endmacro
127cabdff1aSopenharmony_ci
128cabdff1aSopenharmony_ci; in: 4 rows of 8 words in m0..m3
129cabdff1aSopenharmony_ci; out: 8 rows of 4 words in %1..%8
; %9: upper bound handed to CLIPW (callers presumably pass a pw_pixel_max_*
;     constant - confirm at the call sites)
; Transposes m0..m3, clips every word with CLIPW using m5 (zero) and %9 as
; bounds, then stores the low and high 8 bytes of each register separately.
; clobbers m5; m4 is passed to TRANSPOSE4x4W as its last argument (presumably
; scratch - confirm against x86util.asm)
130cabdff1aSopenharmony_ci%macro TRANSPOSE8x4W_STORE 9
131cabdff1aSopenharmony_ci    TRANSPOSE4x4W     0, 1, 2, 3, 4
132cabdff1aSopenharmony_ci
133cabdff1aSopenharmony_ci    pxor             m5, m5; m5 = 0, the lower clipping bound
134cabdff1aSopenharmony_ci    CLIPW            m0, m5, %9
135cabdff1aSopenharmony_ci    CLIPW            m1, m5, %9
136cabdff1aSopenharmony_ci    CLIPW            m2, m5, %9
137cabdff1aSopenharmony_ci    CLIPW            m3, m5, %9
138cabdff1aSopenharmony_ci
139cabdff1aSopenharmony_ci    movq             %1, m0 ; low 4 words of m0
140cabdff1aSopenharmony_ci    movhps           %2, m0 ; high 4 words of m0
141cabdff1aSopenharmony_ci    movq             %3, m1
142cabdff1aSopenharmony_ci    movhps           %4, m1
143cabdff1aSopenharmony_ci    movq             %5, m2
144cabdff1aSopenharmony_ci    movhps           %6, m2
145cabdff1aSopenharmony_ci    movq             %7, m3
146cabdff1aSopenharmony_ci    movhps           %8, m3
147cabdff1aSopenharmony_ci%endmacro
148cabdff1aSopenharmony_ci
149cabdff1aSopenharmony_ci; in: 8 rows of 8 bytes in %1..%8
150cabdff1aSopenharmony_ci; out: 8 rows of 8 words in m0..m7
; Loads an 8x8 byte block, transposes it and zero-extends each byte to a word:
; on exit m0..m7 hold input columns 0..7, one word per input row.
; clobbers m9, m13, m15
151cabdff1aSopenharmony_ci%macro TRANSPOSE8x8B_LOAD 8
152cabdff1aSopenharmony_ci    movq             m7, %1 ; row 0
153cabdff1aSopenharmony_ci    movq             m2, %2 ; row 1
154cabdff1aSopenharmony_ci    movq             m1, %3 ; row 2
155cabdff1aSopenharmony_ci    movq             m3, %4 ; row 3
156cabdff1aSopenharmony_ci
157cabdff1aSopenharmony_ci    punpcklbw        m7, m2 ; interleave bytes of rows 0 and 1
158cabdff1aSopenharmony_ci    punpcklbw        m1, m3 ; interleave bytes of rows 2 and 3
159cabdff1aSopenharmony_ci    punpcklwd        m3, m7, m1 ; columns 0..3, rows 0..3 (one dword per column)
160cabdff1aSopenharmony_ci    punpckhwd        m7, m1 ; columns 4..7, rows 0..3
161cabdff1aSopenharmony_ci
162cabdff1aSopenharmony_ci    movq             m4, %5 ; row 4
163cabdff1aSopenharmony_ci    movq             m6, %6 ; row 5
164cabdff1aSopenharmony_ci    movq             m5, %7 ; row 6
165cabdff1aSopenharmony_ci    movq            m15, %8 ; row 7
166cabdff1aSopenharmony_ci
167cabdff1aSopenharmony_ci    punpcklbw        m4, m6 ; interleave bytes of rows 4 and 5
168cabdff1aSopenharmony_ci    punpcklbw        m5, m15 ; interleave bytes of rows 6 and 7
169cabdff1aSopenharmony_ci    punpcklwd        m9, m4, m5 ; columns 0..3, rows 4..7
170cabdff1aSopenharmony_ci    punpckhwd        m4, m5 ; columns 4..7, rows 4..7
171cabdff1aSopenharmony_ci
172cabdff1aSopenharmony_ci    punpckldq        m1, m3, m9;  columns 0, 1 (8 bytes each)
173cabdff1aSopenharmony_ci    punpckhdq        m3, m9;  columns 2, 3
174cabdff1aSopenharmony_ci
175cabdff1aSopenharmony_ci    punpckldq        m5, m7, m4;  columns 4, 5
176cabdff1aSopenharmony_ci    punpckhdq        m7, m4;  columns 6, 7
177cabdff1aSopenharmony_ci
178cabdff1aSopenharmony_ci    pxor            m13, m13 ; zero register used to widen bytes to words
179cabdff1aSopenharmony_ci
180cabdff1aSopenharmony_ci    punpcklbw        m0, m1, m13; column 0 in 16 bit
181cabdff1aSopenharmony_ci    punpckhbw        m1, m13; column 1 in 16 bit
182cabdff1aSopenharmony_ci
183cabdff1aSopenharmony_ci    punpcklbw        m2, m3, m13; column 2
184cabdff1aSopenharmony_ci    punpckhbw        m3, m13; column 3
185cabdff1aSopenharmony_ci
186cabdff1aSopenharmony_ci    punpcklbw        m4, m5, m13; column 4
187cabdff1aSopenharmony_ci    punpckhbw        m5, m13; column 5
188cabdff1aSopenharmony_ci
189cabdff1aSopenharmony_ci    punpcklbw        m6, m7, m13; column 6
190cabdff1aSopenharmony_ci    punpckhbw        m7, m13; column 7
191cabdff1aSopenharmony_ci%endmacro
192cabdff1aSopenharmony_ci
193cabdff1aSopenharmony_ci
194cabdff1aSopenharmony_ci; in: 8 rows of 8 words in m0..m7
195cabdff1aSopenharmony_ci; out: 8 rows of 8 bytes in %1..%8
; Packs m0..m7 to bytes with unsigned saturation (two rows per register),
; transposes them with TRANSPOSE2x4x4B and stores 8 bytes per destination.
; m0..m3 are overwritten; m4 is also passed to TRANSPOSE2x4x4B as its last
; argument (presumably scratch - confirm against x86util.asm).
196cabdff1aSopenharmony_ci%macro TRANSPOSE8x8B_STORE 8
197cabdff1aSopenharmony_ci    packuswb         m0, m4 ; m0 = bytes of m0 (low half), bytes of m4 (high half)
198cabdff1aSopenharmony_ci    packuswb         m1, m5 ; m1 = bytes of m1, bytes of m5
199cabdff1aSopenharmony_ci    packuswb         m2, m6 ; m2 = bytes of m2, bytes of m6
200cabdff1aSopenharmony_ci    packuswb         m3, m7 ; m3 = bytes of m3, bytes of m7
201cabdff1aSopenharmony_ci    TRANSPOSE2x4x4B   0, 1, 2, 3, 4
202cabdff1aSopenharmony_ci
203cabdff1aSopenharmony_ci    movq             %1, m0 ; low 8 bytes of m0
204cabdff1aSopenharmony_ci    movhps           %2, m0 ; high 8 bytes of m0
205cabdff1aSopenharmony_ci    movq             %3, m1
206cabdff1aSopenharmony_ci    movhps           %4, m1
207cabdff1aSopenharmony_ci    movq             %5, m2
208cabdff1aSopenharmony_ci    movhps           %6, m2
209cabdff1aSopenharmony_ci    movq             %7, m3
210cabdff1aSopenharmony_ci    movhps           %8, m3
211cabdff1aSopenharmony_ci%endmacro
212cabdff1aSopenharmony_ci
213cabdff1aSopenharmony_ci; in: 8 rows of 8 words in %1..%8
214cabdff1aSopenharmony_ci; out: 8 rows of 8 words in m0..m7
; Loads eight 16-byte rows with unaligned moves and transposes them in place
; with TRANSPOSE8x8W. m8 is passed to that macro as its last argument
; (presumably scratch - confirm against x86util.asm).
215cabdff1aSopenharmony_ci%macro TRANSPOSE8x8W_LOAD 8
216cabdff1aSopenharmony_ci    movdqu           m0, %1 ; movdqu: no alignment is required of the sources
217cabdff1aSopenharmony_ci    movdqu           m1, %2
218cabdff1aSopenharmony_ci    movdqu           m2, %3
219cabdff1aSopenharmony_ci    movdqu           m3, %4
220cabdff1aSopenharmony_ci    movdqu           m4, %5
221cabdff1aSopenharmony_ci    movdqu           m5, %6
222cabdff1aSopenharmony_ci    movdqu           m6, %7
223cabdff1aSopenharmony_ci    movdqu           m7, %8
224cabdff1aSopenharmony_ci    TRANSPOSE8x8W     0, 1, 2, 3, 4, 5, 6, 7, 8
225cabdff1aSopenharmony_ci%endmacro
226cabdff1aSopenharmony_ci
227cabdff1aSopenharmony_ci; in: 8 rows of 8 words in m0..m7
228cabdff1aSopenharmony_ci; out: 8 rows of 8 words in %1..%8
; %9: upper bound handed to CLIPW (callers presumably pass a pw_pixel_max_*
;     constant - confirm at the call sites)
; Transposes m0..m7, clips every word with CLIPW using m8 (zero) and %9 as
; bounds, then writes each register to its destination with unaligned stores.
; clobbers m8
229cabdff1aSopenharmony_ci%macro TRANSPOSE8x8W_STORE 9
230cabdff1aSopenharmony_ci    TRANSPOSE8x8W     0, 1, 2, 3, 4, 5, 6, 7, 8
231cabdff1aSopenharmony_ci
232cabdff1aSopenharmony_ci    pxor             m8, m8 ; m8 = 0, the lower clipping bound
233cabdff1aSopenharmony_ci    CLIPW            m0, m8, %9
234cabdff1aSopenharmony_ci    CLIPW            m1, m8, %9
235cabdff1aSopenharmony_ci    CLIPW            m2, m8, %9
236cabdff1aSopenharmony_ci    CLIPW            m3, m8, %9
237cabdff1aSopenharmony_ci    CLIPW            m4, m8, %9
238cabdff1aSopenharmony_ci    CLIPW            m5, m8, %9
239cabdff1aSopenharmony_ci    CLIPW            m6, m8, %9
240cabdff1aSopenharmony_ci    CLIPW            m7, m8, %9
241cabdff1aSopenharmony_ci
242cabdff1aSopenharmony_ci    movdqu           %1, m0 ; movdqu: no alignment is required of the destinations
243cabdff1aSopenharmony_ci    movdqu           %2, m1
244cabdff1aSopenharmony_ci    movdqu           %3, m2
245cabdff1aSopenharmony_ci    movdqu           %4, m3
246cabdff1aSopenharmony_ci    movdqu           %5, m4
247cabdff1aSopenharmony_ci    movdqu           %6, m5
248cabdff1aSopenharmony_ci    movdqu           %7, m6
249cabdff1aSopenharmony_ci    movdqu           %8, m7
250cabdff1aSopenharmony_ci%endmacro
251cabdff1aSopenharmony_ci
252cabdff1aSopenharmony_ci
253cabdff1aSopenharmony_ci; in: %2 (new value), clobbered
254cabdff1aSopenharmony_ci; out: %1
255cabdff1aSopenharmony_ci; mask in m11 (left unchanged)
256cabdff1aSopenharmony_ci; clobbers m10
; Bitwise select: %1 = (%2 & m11) | (%1 & ~m11), i.e. bits of %1 are replaced
; by the corresponding bits of %2 wherever the mask is set.
257cabdff1aSopenharmony_ci%macro MASKED_COPY 2
258cabdff1aSopenharmony_ci    pand             %2, m11 ; new value where the mask is set
259cabdff1aSopenharmony_ci    pandn           m10, m11, %1; m10 = ~mask & old value
260cabdff1aSopenharmony_ci    por              %2, m10 ; merge both halves
261cabdff1aSopenharmony_ci    mova             %1, %2 ; write the merged result back to %1
262cabdff1aSopenharmony_ci%endmacro
263cabdff1aSopenharmony_ci
264cabdff1aSopenharmony_ci; in: %2 (new value), clobbered
265cabdff1aSopenharmony_ci; out: %1
266cabdff1aSopenharmony_ci; mask in %3, will be clobbered
; Bitwise select with a caller-supplied mask: %1 = (%2 & %3) | (%1 & ~%3).
; Unlike MASKED_COPY no fixed scratch register is used; %3 itself is reused.
267cabdff1aSopenharmony_ci%macro MASKED_COPY2 3
268cabdff1aSopenharmony_ci    pand             %2, %3 ; new value where the mask is set
269cabdff1aSopenharmony_ci    pandn            %3, %1; %3 = ~mask & old value
270cabdff1aSopenharmony_ci    por              %2, %3 ; merge both halves
271cabdff1aSopenharmony_ci    mova             %1, %2 ; write the merged result back to %1
272cabdff1aSopenharmony_ci%endmacro
273cabdff1aSopenharmony_ci
274cabdff1aSopenharmony_ciALIGN 16
275cabdff1aSopenharmony_ci; input in m0 ... m3 (p1, p0, q0, q1 as words) and tcs read from [tcq]. Output in m1 and m2
; %1: bit depth; for depths above 8 the tc limits are scaled by << (%1 - 8)
; Computes delta = clip((((q0 - p0) << 2) + p1 - q1 + 4) >> 3, -tc, tc) and
; applies it as p0 += delta, q0 -= delta.
; clobbers m4, m5, m6
276cabdff1aSopenharmony_ci%macro CHROMA_DEBLOCK_BODY 1
277cabdff1aSopenharmony_ci    psubw            m4, m2, m1; q0 - p0
278cabdff1aSopenharmony_ci    psubw            m5, m0, m3; p1 - q1
279cabdff1aSopenharmony_ci    psllw            m4, 2; << 2
280cabdff1aSopenharmony_ci    paddw            m5, m4; ((q0 - p0) << 2) + p1 - q1
281cabdff1aSopenharmony_ci
282cabdff1aSopenharmony_ci    ;tc calculations
283cabdff1aSopenharmony_ci    movq             m6, [tcq]; 8 bytes: two 32-bit tc entries (tc0, tc1)
284cabdff1aSopenharmony_ci    punpcklwd        m6, m6; duplicate every word
285cabdff1aSopenharmony_ci    pshufd           m6, m6, 0xA0; low word of tc0 in words 0..3, of tc1 in words 4..7
286cabdff1aSopenharmony_ci%if cpuflag(ssse3)
287cabdff1aSopenharmony_ci    psignw           m4, m6, [pw_m1]; -tc0, -tc1
288cabdff1aSopenharmony_ci%else
289cabdff1aSopenharmony_ci    pmullw           m4, m6, [pw_m1]; -tc0, -tc1
290cabdff1aSopenharmony_ci%endif
291cabdff1aSopenharmony_ci    ;end tc calculations
292cabdff1aSopenharmony_ci
293cabdff1aSopenharmony_ci    paddw            m5, [pw_4]; +4
294cabdff1aSopenharmony_ci    psraw            m5, 3; >> 3
295cabdff1aSopenharmony_ci
296cabdff1aSopenharmony_ci%if %1 > 8
297cabdff1aSopenharmony_ci    psllw            m4, %1-8; << (BIT_DEPTH - 8)
298cabdff1aSopenharmony_ci    psllw            m6, %1-8; << (BIT_DEPTH - 8)
299cabdff1aSopenharmony_ci%endif
300cabdff1aSopenharmony_ci    pmaxsw           m5, m4; delta = max(delta, -tc)
301cabdff1aSopenharmony_ci    pminsw           m5, m6; delta = min(delta,  tc)
302cabdff1aSopenharmony_ci    paddw            m1, m5; p0 + delta0
303cabdff1aSopenharmony_ci    psubw            m2, m5; q0 - delta0
304cabdff1aSopenharmony_ci%endmacro
305cabdff1aSopenharmony_ci
306cabdff1aSopenharmony_ci; input in m0 ... m7, beta in r2 tcs in r3. Output in m1...m6
307cabdff1aSopenharmony_ci%macro LUMA_DEBLOCK_BODY 2
308cabdff1aSopenharmony_ci    psllw            m9, m2, 1; *2
309cabdff1aSopenharmony_ci    psubw           m10, m1, m9
310cabdff1aSopenharmony_ci    paddw           m10, m3
311cabdff1aSopenharmony_ci    ABS1            m10, m11 ; 0dp0, 0dp3 , 1dp0, 1dp3
312cabdff1aSopenharmony_ci
313cabdff1aSopenharmony_ci    psllw            m9, m5, 1; *2
314cabdff1aSopenharmony_ci    psubw           m11, m6, m9
315cabdff1aSopenharmony_ci    paddw           m11, m4
316cabdff1aSopenharmony_ci    ABS1            m11, m13 ; 0dq0, 0dq3 , 1dq0, 1dq3
317cabdff1aSopenharmony_ci
318cabdff1aSopenharmony_ci    ;beta calculations
319cabdff1aSopenharmony_ci%if %1 > 8
320cabdff1aSopenharmony_ci    shl             betaq, %1 - 8
321cabdff1aSopenharmony_ci%endif
322cabdff1aSopenharmony_ci    movd            m13, betad
323cabdff1aSopenharmony_ci    SPLATW          m13, m13, 0
324cabdff1aSopenharmony_ci    ;end beta calculations
325cabdff1aSopenharmony_ci
326cabdff1aSopenharmony_ci    paddw            m9, m10, m11;   0d0, 0d3  ,  1d0, 1d3
327cabdff1aSopenharmony_ci
328cabdff1aSopenharmony_ci    pshufhw         m14, m9, 0x0f ;0b00001111;  0d3 0d3 0d0 0d0 in high
329cabdff1aSopenharmony_ci    pshuflw         m14, m14, 0x0f ;0b00001111;  1d3 1d3 1d0 1d0 in low
330cabdff1aSopenharmony_ci
331cabdff1aSopenharmony_ci    pshufhw          m9, m9, 0xf0 ;0b11110000; 0d0 0d0 0d3 0d3
332cabdff1aSopenharmony_ci    pshuflw          m9, m9, 0xf0 ;0b11110000; 1d0 1d0 1d3 1d3
333cabdff1aSopenharmony_ci
334cabdff1aSopenharmony_ci    paddw           m14, m9; 0d0+0d3, 1d0+1d3
335cabdff1aSopenharmony_ci
336cabdff1aSopenharmony_ci    ;compare
337cabdff1aSopenharmony_ci    pcmpgtw         m15, m13, m14
338cabdff1aSopenharmony_ci    movmskps        r13, m15 ;filtering mask 0d0 + 0d3 < beta0 (bit 2 or 3) , 1d0 + 1d3 < beta1 (bit 0 or 1)
339cabdff1aSopenharmony_ci    test            r13, r13
340cabdff1aSopenharmony_ci    je              .bypassluma
341cabdff1aSopenharmony_ci
342cabdff1aSopenharmony_ci    ;weak / strong decision compare to beta_2
343cabdff1aSopenharmony_ci    psraw           m15, m13, 2;   beta >> 2
344cabdff1aSopenharmony_ci    psllw            m8, m9, 1;
345cabdff1aSopenharmony_ci    pcmpgtw         m15, m8; (d0 << 1) < beta_2, (d3 << 1) < beta_2
346cabdff1aSopenharmony_ci    movmskps        r6, m15;
347cabdff1aSopenharmony_ci    ;end weak / strong decision
348cabdff1aSopenharmony_ci
349cabdff1aSopenharmony_ci    ; weak filter nd_p/q calculation
350cabdff1aSopenharmony_ci    pshufd           m8, m10, 0x31
351cabdff1aSopenharmony_ci    psrld            m8, 16
352cabdff1aSopenharmony_ci    paddw            m8, m10
353cabdff1aSopenharmony_ci    movd            r7d, m8
354cabdff1aSopenharmony_ci    pshufd           m8, m8, 0x4E
355cabdff1aSopenharmony_ci    movd            r8d, m8
356cabdff1aSopenharmony_ci
357cabdff1aSopenharmony_ci    pshufd           m8, m11, 0x31
358cabdff1aSopenharmony_ci    psrld            m8, 16
359cabdff1aSopenharmony_ci    paddw            m8, m11
360cabdff1aSopenharmony_ci    movd            r9d, m8
361cabdff1aSopenharmony_ci    pshufd           m8, m8, 0x4E
362cabdff1aSopenharmony_ci    movd           r10d, m8
363cabdff1aSopenharmony_ci    ; end calc for weak filter
364cabdff1aSopenharmony_ci
365cabdff1aSopenharmony_ci    ; filtering mask
366cabdff1aSopenharmony_ci    mov             r11, r13
367cabdff1aSopenharmony_ci    shr             r11, 3
368cabdff1aSopenharmony_ci    movd            m15, r11d
369cabdff1aSopenharmony_ci    and             r13, 1
370cabdff1aSopenharmony_ci    movd            m11, r13d
371cabdff1aSopenharmony_ci    shufps          m11, m15, 0
372cabdff1aSopenharmony_ci    shl             r11, 1
373cabdff1aSopenharmony_ci    or              r13, r11
374cabdff1aSopenharmony_ci
375cabdff1aSopenharmony_ci    pcmpeqd         m11, [pd_1]; filtering mask
376cabdff1aSopenharmony_ci
377cabdff1aSopenharmony_ci    ;decide between strong and weak filtering
378cabdff1aSopenharmony_ci    ;tc25 calculations
379cabdff1aSopenharmony_ci    mov            r11d, [tcq];
380cabdff1aSopenharmony_ci%if %1 > 8
381cabdff1aSopenharmony_ci    shl             r11, %1 - 8
382cabdff1aSopenharmony_ci%endif
383cabdff1aSopenharmony_ci    movd             m8, r11d; tc0
384cabdff1aSopenharmony_ci    mov             r3d, [tcq+4];
385cabdff1aSopenharmony_ci%if %1 > 8
386cabdff1aSopenharmony_ci    shl              r3, %1 - 8
387cabdff1aSopenharmony_ci%endif
388cabdff1aSopenharmony_ci    add            r11d, r3d; tc0 + tc1
389cabdff1aSopenharmony_ci    jz             .bypassluma
390cabdff1aSopenharmony_ci    movd             m9, r3d; tc1
391cabdff1aSopenharmony_ci    punpcklwd        m8, m8
392cabdff1aSopenharmony_ci    punpcklwd        m9, m9
393cabdff1aSopenharmony_ci    shufps           m8, m9, 0; tc0, tc1
394cabdff1aSopenharmony_ci    mova             m9, m8
395cabdff1aSopenharmony_ci    psllw            m8, 2; tc << 2
396cabdff1aSopenharmony_ci    pavgw            m8, m9; tc25 = ((tc * 5 + 1) >> 1)
397cabdff1aSopenharmony_ci    ;end tc25 calculations
398cabdff1aSopenharmony_ci
399cabdff1aSopenharmony_ci    ;----beta_3 comparison-----
400cabdff1aSopenharmony_ci    psubw           m12, m0, m3;      p3 - p0
401cabdff1aSopenharmony_ci    ABS1            m12, m14; abs(p3 - p0)
402cabdff1aSopenharmony_ci
403cabdff1aSopenharmony_ci    psubw           m15, m7, m4;      q3 - q0
404cabdff1aSopenharmony_ci    ABS1            m15, m14; abs(q3 - q0)
405cabdff1aSopenharmony_ci
406cabdff1aSopenharmony_ci    paddw           m12, m15; abs(p3 - p0) + abs(q3 - q0)
407cabdff1aSopenharmony_ci
408cabdff1aSopenharmony_ci    pshufhw         m12, m12, 0xf0 ;0b11110000;
409cabdff1aSopenharmony_ci    pshuflw         m12, m12, 0xf0 ;0b11110000;
410cabdff1aSopenharmony_ci
411cabdff1aSopenharmony_ci    psraw           m13, 3; beta >> 3
412cabdff1aSopenharmony_ci    pcmpgtw         m13, m12;
413cabdff1aSopenharmony_ci    movmskps        r11, m13;
414cabdff1aSopenharmony_ci    and             r6, r11; strong mask , beta_2 and beta_3 comparisons
415cabdff1aSopenharmony_ci    ;----beta_3 comparison end-----
416cabdff1aSopenharmony_ci    ;----tc25 comparison---
417cabdff1aSopenharmony_ci    psubw           m12, m3, m4;      p0 - q0
418cabdff1aSopenharmony_ci    ABS1            m12, m14; abs(p0 - q0)
419cabdff1aSopenharmony_ci
420cabdff1aSopenharmony_ci    pshufhw         m12, m12, 0xf0 ;0b11110000;
421cabdff1aSopenharmony_ci    pshuflw         m12, m12, 0xf0 ;0b11110000;
422cabdff1aSopenharmony_ci
423cabdff1aSopenharmony_ci    pcmpgtw          m8, m12; tc25 comparisons
424cabdff1aSopenharmony_ci    movmskps        r11, m8;
425cabdff1aSopenharmony_ci    and             r6, r11; strong mask, beta_2, beta_3 and tc25 comparisons
426cabdff1aSopenharmony_ci    ;----tc25 comparison end---
427cabdff1aSopenharmony_ci    mov             r11, r6;
428cabdff1aSopenharmony_ci    shr             r11, 1;
429cabdff1aSopenharmony_ci    and             r6, r11; strong mask, bits 2 and 0
430cabdff1aSopenharmony_ci
431cabdff1aSopenharmony_ci    pmullw          m14, m9, [pw_m2]; -tc * 2
432cabdff1aSopenharmony_ci    paddw            m9, m9
433cabdff1aSopenharmony_ci
434cabdff1aSopenharmony_ci    and             r6, 5; 0b101
435cabdff1aSopenharmony_ci    mov             r11, r6; strong mask
436cabdff1aSopenharmony_ci    shr             r6, 2;
437cabdff1aSopenharmony_ci    movd            m12, r6d; store to xmm for mask generation
438cabdff1aSopenharmony_ci    shl             r6, 1
439cabdff1aSopenharmony_ci    and             r11, 1
440cabdff1aSopenharmony_ci    movd            m10, r11d; store to xmm for mask generation
441cabdff1aSopenharmony_ci    or              r6, r11; final strong mask, bits 1 and 0
442cabdff1aSopenharmony_ci    jz      .weakfilter
443cabdff1aSopenharmony_ci
444cabdff1aSopenharmony_ci    shufps          m10, m12, 0
445cabdff1aSopenharmony_ci    pcmpeqd         m10, [pd_1]; strong mask
446cabdff1aSopenharmony_ci
447cabdff1aSopenharmony_ci    mova            m13, [pw_4]; 4 in every cell
448cabdff1aSopenharmony_ci    pand            m11, m10; combine filtering mask and strong mask
449cabdff1aSopenharmony_ci    paddw           m12, m2, m3;          p1 +   p0
450cabdff1aSopenharmony_ci    paddw           m12, m4;          p1 +   p0 +   q0
451cabdff1aSopenharmony_ci    mova            m10, m12; copy
452cabdff1aSopenharmony_ci    paddw           m12, m12;       2*p1 + 2*p0 + 2*q0
453cabdff1aSopenharmony_ci    paddw           m12, m1;   p2 + 2*p1 + 2*p0 + 2*q0
454cabdff1aSopenharmony_ci    paddw           m12, m5;   p2 + 2*p1 + 2*p0 + 2*q0 + q1
455cabdff1aSopenharmony_ci    paddw           m12, m13;  p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4
456cabdff1aSopenharmony_ci    psraw           m12, 3;  ((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3)
457cabdff1aSopenharmony_ci    psubw           m12, m3; ((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3) - p0
458cabdff1aSopenharmony_ci    pmaxsw          m12, m14
459cabdff1aSopenharmony_ci    pminsw          m12, m9; av_clip( , -2 * tc, 2 * tc)
460cabdff1aSopenharmony_ci    paddw           m12, m3; p0'
461cabdff1aSopenharmony_ci
462cabdff1aSopenharmony_ci    paddw           m15, m1, m10; p2 + p1 + p0 + q0
463cabdff1aSopenharmony_ci    psrlw           m13, 1; 2 in every cell
464cabdff1aSopenharmony_ci    paddw           m15, m13; p2 + p1 + p0 + q0 + 2
465cabdff1aSopenharmony_ci    psraw           m15, 2;  (p2 + p1 + p0 + q0 + 2) >> 2
466cabdff1aSopenharmony_ci    psubw           m15, m2;((p2 + p1 + p0 + q0 + 2) >> 2) - p1
467cabdff1aSopenharmony_ci    pmaxsw          m15, m14
468cabdff1aSopenharmony_ci    pminsw          m15, m9; av_clip( , -2 * tc, 2 * tc)
469cabdff1aSopenharmony_ci    paddw           m15, m2; p1'
470cabdff1aSopenharmony_ci
471cabdff1aSopenharmony_ci    paddw            m8, m1, m0;     p3 +   p2
472cabdff1aSopenharmony_ci    paddw            m8, m8;   2*p3 + 2*p2
473cabdff1aSopenharmony_ci    paddw            m8, m1;   2*p3 + 3*p2
474cabdff1aSopenharmony_ci    paddw            m8, m10;  2*p3 + 3*p2 + p1 + p0 + q0
475cabdff1aSopenharmony_ci    paddw           m13, m13
476cabdff1aSopenharmony_ci    paddw            m8, m13;  2*p3 + 3*p2 + p1 + p0 + q0 + 4
477cabdff1aSopenharmony_ci    psraw            m8, 3;   (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
478cabdff1aSopenharmony_ci    psubw            m8, m1; ((2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3) - p2
479cabdff1aSopenharmony_ci    pmaxsw           m8, m14
480cabdff1aSopenharmony_ci    pminsw           m8, m9; av_clip( , -2 * tc, 2 * tc)
481cabdff1aSopenharmony_ci    paddw            m8, m1; p2'
482cabdff1aSopenharmony_ci    MASKED_COPY      m1, m8
483cabdff1aSopenharmony_ci
484cabdff1aSopenharmony_ci    paddw            m8, m3, m4;         p0 +   q0
485cabdff1aSopenharmony_ci    paddw            m8, m5;         p0 +   q0 +   q1
486cabdff1aSopenharmony_ci    paddw            m8, m8;       2*p0 + 2*q0 + 2*q1
487cabdff1aSopenharmony_ci    paddw            m8, m2;  p1 + 2*p0 + 2*q0 + 2*q1
488cabdff1aSopenharmony_ci    paddw            m8, m6;  p1 + 2*p0 + 2*q0 + 2*q1 + q2
489cabdff1aSopenharmony_ci    paddw            m8, m13; p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4
490cabdff1aSopenharmony_ci    psraw            m8, 3;  (p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4) >>3
491cabdff1aSopenharmony_ci    psubw            m8, m4;
492cabdff1aSopenharmony_ci    pmaxsw           m8, m14
493cabdff1aSopenharmony_ci    pminsw           m8, m9; av_clip( , -2 * tc, 2 * tc)
494cabdff1aSopenharmony_ci    paddw            m8, m4; q0'
495cabdff1aSopenharmony_ci    MASKED_COPY      m2, m15
496cabdff1aSopenharmony_ci
497cabdff1aSopenharmony_ci    paddw           m15, m3, m4;   p0 + q0
498cabdff1aSopenharmony_ci    paddw           m15, m5;   p0 + q0 + q1
499cabdff1aSopenharmony_ci    mova            m10, m15;
500cabdff1aSopenharmony_ci    paddw           m15, m6;   p0 + q0 + q1 + q2
501cabdff1aSopenharmony_ci    psrlw           m13, 1; 2 in every cell
502cabdff1aSopenharmony_ci    paddw           m15, m13;  p0 + q0 + q1 + q2 + 2
503cabdff1aSopenharmony_ci    psraw           m15, 2;   (p0 + q0 + q1 + q2 + 2) >> 2
504cabdff1aSopenharmony_ci    psubw           m15, m5; ((p0 + q0 + q1 + q2 + 2) >> 2) - q1
505cabdff1aSopenharmony_ci    pmaxsw          m15, m14
506cabdff1aSopenharmony_ci    pminsw          m15, m9; av_clip( , -2 * tc, 2 * tc)
507cabdff1aSopenharmony_ci    paddw           m15, m5; q1'
508cabdff1aSopenharmony_ci
509cabdff1aSopenharmony_ci    paddw           m13, m7;      q3 + 2
510cabdff1aSopenharmony_ci    paddw           m13, m6;      q3 +  q2 + 2
511cabdff1aSopenharmony_ci    paddw           m13, m13;   2*q3 + 2*q2 + 4
512cabdff1aSopenharmony_ci    paddw           m13, m6;    2*q3 + 3*q2 + 4
513cabdff1aSopenharmony_ci    paddw           m13, m10;   2*q3 + 3*q2 + q1 + q0 + p0 + 4
514cabdff1aSopenharmony_ci    psraw           m13, 3;    (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3
515cabdff1aSopenharmony_ci    psubw           m13, m6;  ((2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3) - q2
516cabdff1aSopenharmony_ci    pmaxsw          m13, m14
517cabdff1aSopenharmony_ci    pminsw          m13, m9; av_clip( , -2 * tc, 2 * tc)
518cabdff1aSopenharmony_ci    paddw           m13, m6; q2'
519cabdff1aSopenharmony_ci
520cabdff1aSopenharmony_ci    MASKED_COPY      m6, m13
521cabdff1aSopenharmony_ci    MASKED_COPY      m5, m15
522cabdff1aSopenharmony_ci    MASKED_COPY      m4, m8
523cabdff1aSopenharmony_ci    MASKED_COPY      m3, m12
524cabdff1aSopenharmony_ci
525cabdff1aSopenharmony_ci.weakfilter:
526cabdff1aSopenharmony_ci    not             r6; strong mask -> weak mask
527cabdff1aSopenharmony_ci    and             r6, r13; final weak filtering mask, bits 0 and 1
528cabdff1aSopenharmony_ci    jz             .store
529cabdff1aSopenharmony_ci
530cabdff1aSopenharmony_ci    ; weak filtering mask
531cabdff1aSopenharmony_ci    mov             r11, r6
532cabdff1aSopenharmony_ci    shr             r11, 1
533cabdff1aSopenharmony_ci    movd            m12, r11d
534cabdff1aSopenharmony_ci    and             r6, 1
535cabdff1aSopenharmony_ci    movd            m11, r6d
536cabdff1aSopenharmony_ci    shufps          m11, m12, 0
537cabdff1aSopenharmony_ci    pcmpeqd         m11, [pd_1]; filtering mask
538cabdff1aSopenharmony_ci
539cabdff1aSopenharmony_ci    mov             r13, betaq
540cabdff1aSopenharmony_ci    shr             r13, 1;
541cabdff1aSopenharmony_ci    add             betaq, r13
542cabdff1aSopenharmony_ci    shr             betaq, 3; ((beta + (beta >> 1)) >> 3))
543cabdff1aSopenharmony_ci
544cabdff1aSopenharmony_ci    mova            m13, [pw_8]
545cabdff1aSopenharmony_ci    psubw           m12, m4, m3 ; q0 - p0
546cabdff1aSopenharmony_ci    psllw           m10, m12, 3; 8 * (q0 - p0)
547cabdff1aSopenharmony_ci    paddw           m12, m10 ; 9 * (q0 - p0)
548cabdff1aSopenharmony_ci
549cabdff1aSopenharmony_ci    psubw           m10, m5, m2 ; q1 - p1
550cabdff1aSopenharmony_ci    psllw            m8, m10, 1; 2 * ( q1 - p1 )
551cabdff1aSopenharmony_ci    paddw           m10, m8; 3 * ( q1 - p1 )
552cabdff1aSopenharmony_ci    psubw           m12, m10; 9 * (q0 - p0) - 3 * ( q1 - p1 )
553cabdff1aSopenharmony_ci    paddw           m12, m13; + 8
554cabdff1aSopenharmony_ci    psraw           m12, 4; >> 4 , delta0
555cabdff1aSopenharmony_ci    PABSW           m13, m12; abs(delta0)
556cabdff1aSopenharmony_ci
557cabdff1aSopenharmony_ci
558cabdff1aSopenharmony_ci    psllw           m10, m9, 2; 8 * tc
559cabdff1aSopenharmony_ci    paddw           m10, m9; 10 * tc
560cabdff1aSopenharmony_ci    pcmpgtw         m10, m13
561cabdff1aSopenharmony_ci    pand            m11, m10
562cabdff1aSopenharmony_ci
563cabdff1aSopenharmony_ci    psraw            m9, 1;   tc * 2 -> tc
564cabdff1aSopenharmony_ci    psraw           m14, 1; -tc * 2 -> -tc
565cabdff1aSopenharmony_ci
566cabdff1aSopenharmony_ci    pmaxsw          m12, m14
567cabdff1aSopenharmony_ci    pminsw          m12, m9;  av_clip(delta0, -tc, tc)
568cabdff1aSopenharmony_ci
569cabdff1aSopenharmony_ci    psraw            m9, 1;   tc -> tc / 2
570cabdff1aSopenharmony_ci%if cpuflag(ssse3)
571cabdff1aSopenharmony_ci    psignw          m14, m9, [pw_m1]; -tc / 2
572cabdff1aSopenharmony_ci%else
573cabdff1aSopenharmony_ci    pmullw          m14, m9, [pw_m1]; -tc / 2
574cabdff1aSopenharmony_ci%endif
575cabdff1aSopenharmony_ci
576cabdff1aSopenharmony_ci    pavgw           m15, m1, m3;   (p2 + p0 + 1) >> 1
577cabdff1aSopenharmony_ci    psubw           m15, m2;  ((p2 + p0 + 1) >> 1) - p1
578cabdff1aSopenharmony_ci    paddw           m15, m12; ((p2 + p0 + 1) >> 1) - p1 + delta0
579cabdff1aSopenharmony_ci    psraw           m15, 1;   (((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1
580cabdff1aSopenharmony_ci    pmaxsw          m15, m14
581cabdff1aSopenharmony_ci    pminsw          m15, m9; av_clip(deltap1, -tc/2, tc/2)
582cabdff1aSopenharmony_ci    paddw           m15, m2; p1'
583cabdff1aSopenharmony_ci
584cabdff1aSopenharmony_ci    ;beta calculations
585cabdff1aSopenharmony_ci    movd            m10, betad
586cabdff1aSopenharmony_ci    SPLATW          m10, m10, 0
587cabdff1aSopenharmony_ci
588cabdff1aSopenharmony_ci    movd            m13, r7d; 1dp0 + 1dp3
589cabdff1aSopenharmony_ci    movd             m8, r8d; 0dp0 + 0dp3
590cabdff1aSopenharmony_ci    punpcklwd        m8, m8
591cabdff1aSopenharmony_ci    punpcklwd       m13, m13
592cabdff1aSopenharmony_ci    shufps          m13, m8, 0;
593cabdff1aSopenharmony_ci    pcmpgtw          m8, m10, m13
594cabdff1aSopenharmony_ci    pand             m8, m11
595cabdff1aSopenharmony_ci    ;end beta calculations
596cabdff1aSopenharmony_ci    MASKED_COPY2     m2, m15, m8; write p1'
597cabdff1aSopenharmony_ci
598cabdff1aSopenharmony_ci    pavgw            m8, m6, m4;   (q2 + q0 + 1) >> 1
599cabdff1aSopenharmony_ci    psubw            m8, m5;  ((q2 + q0 + 1) >> 1) - q1
600cabdff1aSopenharmony_ci    psubw            m8, m12; ((q2 + q0 + 1) >> 1) - q1 - delta0)
601cabdff1aSopenharmony_ci    psraw            m8, 1;   ((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1
602cabdff1aSopenharmony_ci    pmaxsw           m8, m14
603cabdff1aSopenharmony_ci    pminsw           m8, m9; av_clip(deltaq1, -tc/2, tc/2)
604cabdff1aSopenharmony_ci    paddw            m8, m5; q1'
605cabdff1aSopenharmony_ci
606cabdff1aSopenharmony_ci    movd            m13, r9d;
607cabdff1aSopenharmony_ci    movd            m15, r10d;
608cabdff1aSopenharmony_ci    punpcklwd       m15, m15
609cabdff1aSopenharmony_ci    punpcklwd       m13, m13
610cabdff1aSopenharmony_ci    shufps          m13, m15, 0; dq0 + dq3
611cabdff1aSopenharmony_ci
612cabdff1aSopenharmony_ci    pcmpgtw         m10, m13; compare to ((beta+(beta>>1))>>3)
613cabdff1aSopenharmony_ci    pand            m10, m11
614cabdff1aSopenharmony_ci    MASKED_COPY2     m5, m8, m10; write q1'
615cabdff1aSopenharmony_ci
616cabdff1aSopenharmony_ci    paddw           m15, m3, m12 ; p0 + delta0
617cabdff1aSopenharmony_ci    MASKED_COPY      m3, m15
618cabdff1aSopenharmony_ci
619cabdff1aSopenharmony_ci    psubw            m8, m4, m12 ; q0 - delta0
620cabdff1aSopenharmony_ci    MASKED_COPY      m4, m8
621cabdff1aSopenharmony_ci%endmacro
622cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; void ff_hevc_v_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int32_t *tc,
;                                   uint8_t *_no_p, uint8_t *_no_q);
;
; LOOP_FILTER_CHROMA emits the vertical-edge (v) and horizontal-edge (h)
; chroma deblocking entry points for 8-, 10- and 12-bit samples, for whichever
; instruction set was selected by the preceding INIT_XMM.
;
; Every entry point declares only 3 loaded arguments (pix, stride, tc); the
; _no_p/_no_q pointers of the C prototype are never read by this code.
;
; Register convention around CHROMA_DEBLOCK_BODY, as established by the h
; variants below: m0 = p1, m1 = p0, m2 = q0, m3 = q1 (16-bit lanes) on entry;
; the filtered p0/q0 are taken from m1/m2 afterwards.
;-----------------------------------------------------------------------------
%macro LOOP_FILTER_CHROMA 0
; --- vertical edge, 8-bit ---------------------------------------------------
; Eight rows of four samples straddling the edge are loaded and transposed,
; filtered, then transposed back and stored through the same row addresses.
cglobal hevc_v_loop_filter_chroma_8, 3, 5, 7, pix, stride, tc, pix0, r3stride
    sub                pixq, 2                  ; 2 bytes left of the edge (one byte per sample)
    lea           r3strideq, [3*strideq]
    mov               pix0q, pixq               ; pix0q = row 0
    add                pixq, r3strideq          ; pixq  = row 3
    TRANSPOSE4x8B_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    CHROMA_DEBLOCK_BODY 8
    TRANSPOSE8x4B_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    RET

; --- vertical edge, 10-bit --------------------------------------------------
; Same flow as the 8-bit version with word-sized samples; the store macro is
; additionally handed the maximum sample value for this bit depth.
cglobal hevc_v_loop_filter_chroma_10, 3, 5, 7, pix, stride, tc, pix0, r3stride
    sub                pixq, 4                  ; 4 bytes left of the edge (two bytes per sample)
    lea           r3strideq, [3*strideq]
    mov               pix0q, pixq               ; pix0q = row 0
    add                pixq, r3strideq          ; pixq  = row 3
    TRANSPOSE4x8W_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    CHROMA_DEBLOCK_BODY 10
    TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq), [pw_pixel_max_10]
    RET

; --- vertical edge, 12-bit --------------------------------------------------
; Identical to the 10-bit version apart from the bit depth passed to the
; filter body and the pixel-max constant passed to the store macro.
cglobal hevc_v_loop_filter_chroma_12, 3, 5, 7, pix, stride, tc, pix0, r3stride
    sub                pixq, 4                  ; 4 bytes left of the edge (two bytes per sample)
    lea           r3strideq, [3*strideq]
    mov               pix0q, pixq               ; pix0q = row 0
    add                pixq, r3strideq          ; pixq  = row 3
    TRANSPOSE4x8W_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    CHROMA_DEBLOCK_BODY 12
    TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq), [pw_pixel_max_12]
    RET

;-----------------------------------------------------------------------------
; void ff_hevc_h_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int32_t *tc,
;                                   uint8_t *_no_p, uint8_t *_no_q);
;
; Horizontal edge: the four rows p1, p0, q0, q1 are contiguous in memory, so
; no transpose is needed. pix points at the q0 row; pix0 = pix - 2*stride is
; the p1 row. Only the p0 and q0 rows are written back.
;-----------------------------------------------------------------------------
; --- horizontal edge, 8-bit -------------------------------------------------
cglobal hevc_h_loop_filter_chroma_8, 3, 4, 7, pix, stride, tc, pix0
    mov               pix0q, pixq
    sub               pix0q, strideq
    sub               pix0q, strideq            ; pix0q = pix - 2*stride
    movq                 m0, [pix0q]            ; p1 (8 bytes)
    movq                 m1, [pix0q+strideq]    ; p0
    movq                 m2, [pixq]             ; q0
    movq                 m3, [pixq+strideq]     ; q1
    pxor                 m5, m5                 ; zero register for the unpacks
    punpcklbw            m0, m5                 ; widen bytes to words
    punpcklbw            m1, m5
    punpcklbw            m2, m5
    punpcklbw            m3, m5
    CHROMA_DEBLOCK_BODY  8
    packuswb             m1, m2                 ; low half = p0', high half = q0', saturated to bytes
    movh [pix0q+strideq], m1                    ; store p0'
    movhps       [pixq], m1                     ; store q0'
    RET

; --- horizontal edge, 10-bit ------------------------------------------------
cglobal hevc_h_loop_filter_chroma_10, 3, 4, 7, pix, stride, tc, pix0
    mov               pix0q, pixq
    sub               pix0q, strideq
    sub               pix0q, strideq            ; pix0q = pix - 2*stride
    movu                 m0, [pix0q]            ; p1 (one full register of words)
    movu                 m1, [pix0q+strideq]    ; p0
    movu                 m2, [pixq]             ; q0
    movu                 m3, [pixq+strideq]     ; q1
    CHROMA_DEBLOCK_BODY 10
    pxor                 m5, m5                 ; lower clip bound = 0
    CLIPW                m1, m5, [pw_pixel_max_10]
    CLIPW                m2, m5, [pw_pixel_max_10]
    movu [pix0q+strideq], m1                    ; store p0'
    movu         [pixq], m2                     ; store q0'
    RET

; --- horizontal edge, 12-bit ------------------------------------------------
cglobal hevc_h_loop_filter_chroma_12, 3, 4, 7, pix, stride, tc, pix0
    mov               pix0q, pixq
    sub               pix0q, strideq
    sub               pix0q, strideq            ; pix0q = pix - 2*stride
    movu                 m0, [pix0q]            ; p1 (one full register of words)
    movu                 m1, [pix0q+strideq]    ; p0
    movu                 m2, [pixq]             ; q0
    movu                 m3, [pixq+strideq]     ; q1
    CHROMA_DEBLOCK_BODY 12
    pxor                 m5, m5                 ; lower clip bound = 0
    CLIPW                m1, m5, [pw_pixel_max_12]
    CLIPW                m2, m5, [pw_pixel_max_12]
    movu [pix0q+strideq], m1                    ; store p0'
    movu         [pixq], m2                     ; store q0'
    RET
%endmacro

; Instantiate the chroma filters once per supported instruction set.
INIT_XMM sse2
LOOP_FILTER_CHROMA
INIT_XMM avx
LOOP_FILTER_CHROMA
718cabdff1aSopenharmony_ci
; The luma filters declare 14 general-purpose and 16 vector registers per
; function, which is only possible on x86-64, hence the guard.
%if ARCH_X86_64
%macro LOOP_FILTER_LUMA 0
;-----------------------------------------------------------------------------
; void ff_hevc_v_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
;                                 int32_t *tc, uint8_t *_no_p, uint8_t *_no_q);
;
; Vertical edge: eight rows of eight samples straddling the edge are loaded
; and transposed, filtered by LUMA_DEBLOCK_BODY, then transposed back and
; stored. Only 4 arguments (pix, stride, beta, tc) are loaded; _no_p/_no_q are
; never read by this code.
;
; NOTE: in the three v variants the two scratch names are swapped relative to
; what they hold: pix0q contains 3*stride and src3strideq contains the row-0
; pointer. They are passed to PASS8ROWS in (base, base+3*stride, stride,
; 3*stride) order, matching the chroma v filters.
;
; The .store and .bypassluma labels are not referenced anywhere in these
; functions themselves; presumably they are jump targets used from inside
; LUMA_DEBLOCK_BODY (TODO confirm against the macro definition).
;-----------------------------------------------------------------------------
; --- vertical edge, 8-bit ---------------------------------------------------
cglobal hevc_v_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    sub                pixq, 4                  ; 4 bytes left of the edge (one byte per sample)
    lea               pix0q, [3 * strideq]      ; pix0q       = 3*stride
    mov         src3strideq, pixq               ; src3strideq = row 0
    add                pixq, pix0q              ; pixq        = row 3
    TRANSPOSE8x8B_LOAD  PASS8ROWS(src3strideq, pixq, strideq, pix0q)
    LUMA_DEBLOCK_BODY 8, v
.store:
    TRANSPOSE8x8B_STORE PASS8ROWS(src3strideq, pixq, strideq, pix0q)
.bypassluma:
    RET

; --- vertical edge, 10-bit --------------------------------------------------
cglobal hevc_v_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    sub                pixq, 8                  ; 8 bytes left of the edge (two bytes per sample)
    lea               pix0q, [3 * strideq]      ; pix0q       = 3*stride
    mov         src3strideq, pixq               ; src3strideq = row 0
    add                pixq, pix0q              ; pixq        = row 3
    TRANSPOSE8x8W_LOAD  PASS8ROWS(src3strideq, pixq, strideq, pix0q)
    LUMA_DEBLOCK_BODY 10, v
.store:
    TRANSPOSE8x8W_STORE PASS8ROWS(src3strideq, pixq, strideq, pix0q), [pw_pixel_max_10]
.bypassluma:
    RET

; --- vertical edge, 12-bit --------------------------------------------------
cglobal hevc_v_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    sub                pixq, 8                  ; 8 bytes left of the edge (two bytes per sample)
    lea               pix0q, [3 * strideq]      ; pix0q       = 3*stride
    mov         src3strideq, pixq               ; src3strideq = row 0
    add                pixq, pix0q              ; pixq        = row 3
    TRANSPOSE8x8W_LOAD  PASS8ROWS(src3strideq, pixq, strideq, pix0q)
    LUMA_DEBLOCK_BODY 12, v
.store:
    TRANSPOSE8x8W_STORE PASS8ROWS(src3strideq, pixq, strideq, pix0q), [pw_pixel_max_12]
.bypassluma:
    RET

;-----------------------------------------------------------------------------
; void ff_hevc_h_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
;                                 int32_t *tc, uint8_t *_no_p, uint8_t *_no_q);
;
; Horizontal edge: the eight rows p3..q3 are loaded directly, no transpose.
; pix points at the q0 row; pix0 = pix - 4*stride is the p3 row. Registers
; going into LUMA_DEBLOCK_BODY: m0..m7 = p3, p2, p1, p0, q0, q1, q2, q3.
; Only p2..q2 (m1..m6) are written back; p3 and q3 are never stored.
;-----------------------------------------------------------------------------
; --- horizontal edge, 8-bit -------------------------------------------------
cglobal hevc_h_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    lea         src3strideq, [3 * strideq]
    mov               pix0q, pixq
    sub               pix0q, src3strideq
    sub               pix0q, strideq            ; pix0q = pix - 4*stride
    movq                 m0, [pix0q]                ; p3
    movq                 m1, [pix0q +     strideq]  ; p2
    movq                 m2, [pix0q + 2 * strideq]  ; p1
    movq                 m3, [pix0q + src3strideq]  ; p0
    movq                 m4, [pixq]                 ; q0
    movq                 m5, [pixq +     strideq]   ; q1
    movq                 m6, [pixq + 2 * strideq]   ; q2
    movq                 m7, [pixq + src3strideq]   ; q3
    pxor                 m8, m8                 ; zero register for the unpacks
    punpcklbw            m0, m8                 ; widen bytes to words
    punpcklbw            m1, m8
    punpcklbw            m2, m8
    punpcklbw            m3, m8
    punpcklbw            m4, m8
    punpcklbw            m5, m8
    punpcklbw            m6, m8
    punpcklbw            m7, m8
    LUMA_DEBLOCK_BODY 8, h
.store:
    packuswb             m1, m2                 ; low = p2, high = p1 (saturated to bytes)
    packuswb             m3, m4                 ; low = p0, high = q0
    packuswb             m5, m6                 ; low = q1, high = q2
    movh   [pix0q +     strideq], m1            ; p2
    movhps [pix0q + 2 * strideq], m1            ; p1
    movh   [pix0q + src3strideq], m3            ; p0
    movhps [pixq               ], m3            ; q0
    movh   [pixq  +     strideq], m5            ; q1
    movhps [pixq  + 2 * strideq], m5            ; q2
.bypassluma:
    RET

; --- horizontal edge, 10-bit ------------------------------------------------
cglobal hevc_h_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    lea         src3strideq, [3 * strideq]
    mov               pix0q, pixq
    sub               pix0q, src3strideq
    sub               pix0q, strideq            ; pix0q = pix - 4*stride
    movdqu               m0, [pix0q]                ; p3
    movdqu               m1, [pix0q +     strideq]  ; p2
    movdqu               m2, [pix0q + 2 * strideq]  ; p1
    movdqu               m3, [pix0q + src3strideq]  ; p0
    movdqu               m4, [pixq]                 ; q0
    movdqu               m5, [pixq  +     strideq]  ; q1
    movdqu               m6, [pixq  + 2 * strideq]  ; q2
    movdqu               m7, [pixq  + src3strideq]  ; q3
    LUMA_DEBLOCK_BODY    10, h
.store:
    pxor                 m8, m8                 ; lower clip bound = 0
    CLIPW                m1, m8, [pw_pixel_max_10]
    CLIPW                m2, m8, [pw_pixel_max_10]
    CLIPW                m3, m8, [pw_pixel_max_10]
    CLIPW                m4, m8, [pw_pixel_max_10]
    CLIPW                m5, m8, [pw_pixel_max_10]
    CLIPW                m6, m8, [pw_pixel_max_10]
    movdqu [pix0q +     strideq], m1            ; p2
    movdqu [pix0q + 2 * strideq], m2            ; p1
    movdqu [pix0q + src3strideq], m3            ; p0
    movdqu [pixq               ], m4            ; q0
    movdqu [pixq  +     strideq], m5            ; q1
    movdqu [pixq  + 2 * strideq], m6            ; q2
.bypassluma:
    RET

; --- horizontal edge, 12-bit ------------------------------------------------
cglobal hevc_h_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    lea         src3strideq, [3 * strideq]
    mov               pix0q, pixq
    sub               pix0q, src3strideq
    sub               pix0q, strideq            ; pix0q = pix - 4*stride
    movdqu               m0, [pix0q]                ; p3
    movdqu               m1, [pix0q +     strideq]  ; p2
    movdqu               m2, [pix0q + 2 * strideq]  ; p1
    movdqu               m3, [pix0q + src3strideq]  ; p0
    movdqu               m4, [pixq]                 ; q0
    movdqu               m5, [pixq  +     strideq]  ; q1
    movdqu               m6, [pixq  + 2 * strideq]  ; q2
    movdqu               m7, [pixq  + src3strideq]  ; q3
    LUMA_DEBLOCK_BODY    12, h
.store:
    pxor                 m8, m8                 ; lower clip bound = 0
    CLIPW                m1, m8, [pw_pixel_max_12]
    CLIPW                m2, m8, [pw_pixel_max_12]
    CLIPW                m3, m8, [pw_pixel_max_12]
    CLIPW                m4, m8, [pw_pixel_max_12]
    CLIPW                m5, m8, [pw_pixel_max_12]
    CLIPW                m6, m8, [pw_pixel_max_12]
    movdqu [pix0q +     strideq], m1            ; p2
    movdqu [pix0q + 2 * strideq], m2            ; p1
    movdqu [pix0q + src3strideq], m3            ; p0
    movdqu [pixq               ], m4            ; q0
    movdqu [pixq  +     strideq], m5            ; q1
    movdqu [pixq  + 2 * strideq], m6            ; q2
.bypassluma:
    RET

%endmacro

; Instantiate the luma filters once per supported instruction set.
INIT_XMM sse2
LOOP_FILTER_LUMA
INIT_XMM ssse3
LOOP_FILTER_LUMA
INIT_XMM avx
LOOP_FILTER_LUMA
%endif
872