1cabdff1aSopenharmony_ci;*****************************************************************************
2cabdff1aSopenharmony_ci;* MMX/SSE2/AVX-optimized 10-bit H.264 intra prediction code
3cabdff1aSopenharmony_ci;*****************************************************************************
4cabdff1aSopenharmony_ci;* Copyright (C) 2005-2011 x264 project
5cabdff1aSopenharmony_ci;*
6cabdff1aSopenharmony_ci;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
7cabdff1aSopenharmony_ci;*
8cabdff1aSopenharmony_ci;* This file is part of FFmpeg.
9cabdff1aSopenharmony_ci;*
10cabdff1aSopenharmony_ci;* FFmpeg is free software; you can redistribute it and/or
11cabdff1aSopenharmony_ci;* modify it under the terms of the GNU Lesser General Public
12cabdff1aSopenharmony_ci;* License as published by the Free Software Foundation; either
13cabdff1aSopenharmony_ci;* version 2.1 of the License, or (at your option) any later version.
14cabdff1aSopenharmony_ci;*
15cabdff1aSopenharmony_ci;* FFmpeg is distributed in the hope that it will be useful,
16cabdff1aSopenharmony_ci;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17cabdff1aSopenharmony_ci;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18cabdff1aSopenharmony_ci;* Lesser General Public License for more details.
19cabdff1aSopenharmony_ci;*
20cabdff1aSopenharmony_ci;* You should have received a copy of the GNU Lesser General Public
21cabdff1aSopenharmony_ci;* License along with FFmpeg; if not, write to the Free Software
22cabdff1aSopenharmony_ci;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23cabdff1aSopenharmony_ci;******************************************************************************
24cabdff1aSopenharmony_ci
25cabdff1aSopenharmony_ci%include "libavutil/x86/x86util.asm"
26cabdff1aSopenharmony_ci
27cabdff1aSopenharmony_ciSECTION_RODATA
28cabdff1aSopenharmony_ci
; Constants imported with cextern are defined outside this file. Judging by
; their names they are presumably vectors of packed words (pw_*) or packed
; dwords (pd_*) holding the named value -- confirm against their definitions.
29cabdff1aSopenharmony_cicextern pw_1023
; Alias: loaded by pred8x8_plane_10 and handed to CLIPW together with a
; zero vector.
30cabdff1aSopenharmony_ci%define pw_pixel_max pw_1023
31cabdff1aSopenharmony_cicextern pw_512
32cabdff1aSopenharmony_cicextern pw_16
33cabdff1aSopenharmony_cicextern pw_8
34cabdff1aSopenharmony_cicextern pw_4
35cabdff1aSopenharmony_cicextern pw_2
36cabdff1aSopenharmony_cicextern pw_1
37cabdff1aSopenharmony_cicextern pd_16
38cabdff1aSopenharmony_ci
; Constants local to this file.
39cabdff1aSopenharmony_cipw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4 ; weights (x - 3) for columns x = 0..7
40cabdff1aSopenharmony_cipw_m3:        times 8 dw -3              ; eight words of -3
41cabdff1aSopenharmony_cipd_17:        times 4 dd 17              ; four dwords of 17
42cabdff1aSopenharmony_ci
43cabdff1aSopenharmony_ciSECTION .text
44cabdff1aSopenharmony_ci
45cabdff1aSopenharmony_ci; dest, left, right, src
46cabdff1aSopenharmony_ci; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
; Computed per 16-bit lane as avg(src, (left + right) >> 1), where avg rounds
; up. The second argument (left) is overwritten and used as scratch.
; NOTE(review): the three-operand pavgw form relies on the instruction
; wrappers pulled in through x86util.asm, which are not visible in this file.
47cabdff1aSopenharmony_ci%macro PRED4x4_LOWPASS 4
48cabdff1aSopenharmony_ci    paddw       %2, %3          ; left += right
49cabdff1aSopenharmony_ci    psrlw       %2, 1           ; left = (left + right) >> 1
50cabdff1aSopenharmony_ci    pavgw       %1, %4, %2      ; dest = (src + left + 1) >> 1
51cabdff1aSopenharmony_ci%endmacro
52cabdff1aSopenharmony_ci
53cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
54cabdff1aSopenharmony_ci; void ff_pred4x4_down_right_10(pixel *src, const pixel *topright,
55cabdff1aSopenharmony_ci;                               ptrdiff_t stride)
56cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
57cabdff1aSopenharmony_ci%macro PRED4x4_DR 0
; Diagonal down-right 4x4 prediction. The left column (l0..l3), the top-left
; pixel (lt) and the top row (t0..t3) are packed into one vector, low-pass
; filtered, and each output row is a 4-word window of the filtered vector.
; Lane lists in the comments below are written low word first.
; NOTE(review): PALIGNR is a helper from x86util.asm; it is assumed to behave
; like palignr (concatenate, shift right by N bytes) with an optional scratch
; register as last argument -- confirm against its definition.
58cabdff1aSopenharmony_cicglobal pred4x4_down_right_10, 3, 3
59cabdff1aSopenharmony_ci    sub       r0, r2                  ; r0 = row above the block
60cabdff1aSopenharmony_ci    lea       r1, [r0+r2*2]           ; r1 = second row of the block
61cabdff1aSopenharmony_ci    movhps    m1, [r1-8]              ; top word = l1
62cabdff1aSopenharmony_ci    movhps    m2, [r0+r2*1-8]         ; top word = l0
63cabdff1aSopenharmony_ci    movhps    m4, [r0-8]              ; top word = lt
64cabdff1aSopenharmony_ci    punpckhwd m2, m4                  ; top two words = l0, lt
65cabdff1aSopenharmony_ci    movq      m3, [r0]                ; low half = t0 t1 t2 t3
66cabdff1aSopenharmony_ci    punpckhdq m1, m2                  ; words 5..7 = l1, l0, lt
67cabdff1aSopenharmony_ci    PALIGNR   m3, m1, 10, m1          ; m3 = l1 l0 lt t0 t1 t2 t3 ..
68cabdff1aSopenharmony_ci    movhps    m4, [r1+r2*1-8]         ; top word = l2
69cabdff1aSopenharmony_ci    PALIGNR   m0, m3, m4, 14, m4      ; m0 = l2 l1 l0 lt t0 t1 t2 t3
70cabdff1aSopenharmony_ci    movhps    m4, [r1+r2*2-8]         ; top word = l3
71cabdff1aSopenharmony_ci    PALIGNR   m2, m0, m4, 14, m4      ; m2 = l3 l2 l1 l0 lt t0 t1 t2
72cabdff1aSopenharmony_ci    PRED4x4_LOWPASS m0, m2, m3, m0    ; filter m0 with neighbours m2 / m3
73cabdff1aSopenharmony_ci    movq      [r1+r2*2], m0           ; row 3 = filtered words 0..3
74cabdff1aSopenharmony_ci    psrldq    m0, 2                   ; advance the window by one pixel
75cabdff1aSopenharmony_ci    movq      [r1+r2*1], m0           ; row 2
76cabdff1aSopenharmony_ci    psrldq    m0, 2
77cabdff1aSopenharmony_ci    movq      [r0+r2*2], m0           ; row 1
78cabdff1aSopenharmony_ci    psrldq    m0, 2
79cabdff1aSopenharmony_ci    movq      [r0+r2*1], m0           ; row 0
80cabdff1aSopenharmony_ci    RET
81cabdff1aSopenharmony_ci%endmacro
82cabdff1aSopenharmony_ci
; Instantiate the macro once per instruction-set setting; the AVX copy is
; only built when HAVE_AVX_EXTERNAL is set.
83cabdff1aSopenharmony_ciINIT_XMM sse2
84cabdff1aSopenharmony_ciPRED4x4_DR
85cabdff1aSopenharmony_ciINIT_XMM ssse3
86cabdff1aSopenharmony_ciPRED4x4_DR
87cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL
88cabdff1aSopenharmony_ciINIT_XMM avx
89cabdff1aSopenharmony_ciPRED4x4_DR
90cabdff1aSopenharmony_ci%endif
91cabdff1aSopenharmony_ci
92cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
93cabdff1aSopenharmony_ci; void ff_pred4x4_vertical_right_10(pixel *src, const pixel *topright,
94cabdff1aSopenharmony_ci;                                   ptrdiff_t stride)
95cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
96cabdff1aSopenharmony_ci%macro PRED4x4_VR 0
; Vertical-right 4x4 prediction. Row 0 is the rounded average of adjacent
; top pixels, row 1 is the 3-tap low-pass of the top edge; rows 2 and 3 are
; rows 0 and 1 shifted right by one pixel with a filtered left-column value
; inserted at x = 0. The existing lane diagrams are written high word first.
; NOTE(review): PALIGNR comes from x86util.asm; assumed to behave like
; palignr with an optional scratch register -- confirm against its definition.
97cabdff1aSopenharmony_cicglobal pred4x4_vertical_right_10, 3, 3, 6
98cabdff1aSopenharmony_ci    sub     r0, r2              ; r0 = row above the block
99cabdff1aSopenharmony_ci    lea     r1, [r0+r2*2]       ; r1 = second row of the block
100cabdff1aSopenharmony_ci    movq    m5, [r0]            ; ........t3t2t1t0
101cabdff1aSopenharmony_ci    movhps  m1, [r0-8]          ; top word = lt
102cabdff1aSopenharmony_ci    PALIGNR m0, m5, m1, 14, m1  ; ......t3t2t1t0lt
103cabdff1aSopenharmony_ci    pavgw   m5, m0              ; row 0: avg(lt,t0) avg(t0,t1) avg(t1,t2) avg(t2,t3)
104cabdff1aSopenharmony_ci    movhps  m1, [r0+r2*1-8]     ; top word = l0
105cabdff1aSopenharmony_ci    PALIGNR m0, m1, 14, m1      ; ....t3t2t1t0ltl0
106cabdff1aSopenharmony_ci    movhps  m2, [r0+r2*2-8]     ; top word = l1
107cabdff1aSopenharmony_ci    PALIGNR m1, m0, m2, 14, m2  ; ..t3t2t1t0ltl0l1
108cabdff1aSopenharmony_ci    movhps  m3, [r1+r2*1-8]     ; top word = l2
109cabdff1aSopenharmony_ci    PALIGNR m2, m1, m3, 14, m3  ; t3t2t1t0ltl0l1l2
110cabdff1aSopenharmony_ci    PRED4x4_LOWPASS m1, m0, m2, m1 ; filter m1 with neighbours m0 / m2
111cabdff1aSopenharmony_ci    pslldq  m0, m1, 12          ; keep the two lowest filtered words in words 6..7
112cabdff1aSopenharmony_ci    psrldq  m1, 4               ; filtered words 2..5 -> words 0..3 (row 1)
113cabdff1aSopenharmony_ci    movq    [r0+r2*1], m5       ; store row 0
114cabdff1aSopenharmony_ci    movq    [r0+r2*2], m1       ; store row 1
115cabdff1aSopenharmony_ci    PALIGNR m5, m0, 14, m2      ; row 2 = word 7 of m0 followed by row 0
116cabdff1aSopenharmony_ci    pslldq  m0, 2               ; bring the remaining filtered word to word 7
117cabdff1aSopenharmony_ci    movq    [r1+r2*1], m5       ; store row 2
118cabdff1aSopenharmony_ci    PALIGNR m1, m0, 14, m0      ; row 3 = word 7 of m0 followed by row 1
119cabdff1aSopenharmony_ci    movq    [r1+r2*2], m1       ; store row 3
120cabdff1aSopenharmony_ci    RET
121cabdff1aSopenharmony_ci%endmacro
122cabdff1aSopenharmony_ci
; Instantiate the macro once per instruction-set setting; the AVX copy is
; only built when HAVE_AVX_EXTERNAL is set.
123cabdff1aSopenharmony_ciINIT_XMM sse2
124cabdff1aSopenharmony_ciPRED4x4_VR
125cabdff1aSopenharmony_ciINIT_XMM ssse3
126cabdff1aSopenharmony_ciPRED4x4_VR
127cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL
128cabdff1aSopenharmony_ciINIT_XMM avx
129cabdff1aSopenharmony_ciPRED4x4_VR
130cabdff1aSopenharmony_ci%endif
131cabdff1aSopenharmony_ci
132cabdff1aSopenharmony_ci;-------------------------------------------------------------------------------
133cabdff1aSopenharmony_ci; void ff_pred4x4_horizontal_down_10(pixel *src, const pixel *topright,
134cabdff1aSopenharmony_ci;                                    ptrdiff_t stride)
135cabdff1aSopenharmony_ci;-------------------------------------------------------------------------------
136cabdff1aSopenharmony_ci%macro PRED4x4_HD 0
; Horizontal-down 4x4 prediction. The edge pixels are packed into one vector
; (l3 in the lowest word, t2 in the highest); pairwise averages and 3-tap
; low-pass values are interleaved and the rows are 4-word windows of the
; result. The existing lane diagrams are written high word first.
; NOTE(review): PALIGNR comes from x86util.asm; assumed to behave like
; palignr with an optional scratch register -- confirm against its definition.
137cabdff1aSopenharmony_cicglobal pred4x4_horizontal_down_10, 3, 3
138cabdff1aSopenharmony_ci    sub        r0, r2          ; r0 = row above the block
139cabdff1aSopenharmony_ci    lea        r1, [r0+r2*2]   ; r1 = second row of the block
140cabdff1aSopenharmony_ci    movq       m0, [r0-8]      ; lt ..
141cabdff1aSopenharmony_ci    movhps     m0, [r0]        ; high half = t0 t1 t2 t3
142cabdff1aSopenharmony_ci    pslldq     m0, 2           ; t2 t1 t0 lt .. .. .. ..
143cabdff1aSopenharmony_ci    movq       m1, [r1+r2*2-8] ; l3
144cabdff1aSopenharmony_ci    movq       m3, [r1+r2*1-8] ; l2 in word 3
145cabdff1aSopenharmony_ci    punpcklwd  m1, m3          ; l2 l3
146cabdff1aSopenharmony_ci    movq       m2, [r0+r2*2-8] ; l1
147cabdff1aSopenharmony_ci    movq       m3, [r0+r2*1-8] ; l0 in word 3
148cabdff1aSopenharmony_ci    punpcklwd  m2, m3          ; l0 l1
149cabdff1aSopenharmony_ci    punpckhdq  m1, m2          ; l0 l1 l2 l3
150cabdff1aSopenharmony_ci    punpckhqdq m1, m0          ; t2 t1 t0 lt l0 l1 l2 l3
151cabdff1aSopenharmony_ci    psrldq     m0, m1, 4       ; .. .. t2 t1 t0 lt l0 l1
152cabdff1aSopenharmony_ci    psrldq     m3, m1, 2       ; .. t2 t1 t0 lt l0 l1 l2
153cabdff1aSopenharmony_ci    pavgw      m5, m1, m3      ; averages of adjacent edge pixels
154cabdff1aSopenharmony_ci    PRED4x4_LOWPASS m3, m1, m0, m3 ; 3-tap filtered edge (clobbers m1)
155cabdff1aSopenharmony_ci    punpcklwd  m5, m3          ; interleave avg / filtered for the 4 lowest lanes
156cabdff1aSopenharmony_ci    psrldq     m3, 8           ; filtered words 4..5 -> words 0..1
157cabdff1aSopenharmony_ci    PALIGNR    m3, m5, 12, m4  ; row 0 = words 6..7 of m5, then filtered 4..5
158cabdff1aSopenharmony_ci    movq       [r1+r2*2], m5   ; row 3 = words 0..3 of m5
159cabdff1aSopenharmony_ci    movhps     [r0+r2*2], m5   ; row 1 = words 4..7 of m5
160cabdff1aSopenharmony_ci    psrldq     m5, 4
161cabdff1aSopenharmony_ci    movq       [r1+r2*1], m5   ; row 2 = original words 2..5 of m5
162cabdff1aSopenharmony_ci    movq       [r0+r2*1], m3   ; row 0
163cabdff1aSopenharmony_ci    RET
164cabdff1aSopenharmony_ci%endmacro
165cabdff1aSopenharmony_ci
; Instantiate the macro once per instruction-set setting; the AVX copy is
; only built when HAVE_AVX_EXTERNAL is set.
166cabdff1aSopenharmony_ciINIT_XMM sse2
167cabdff1aSopenharmony_ciPRED4x4_HD
168cabdff1aSopenharmony_ciINIT_XMM ssse3
169cabdff1aSopenharmony_ciPRED4x4_HD
170cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL
171cabdff1aSopenharmony_ciINIT_XMM avx
172cabdff1aSopenharmony_ciPRED4x4_HD
173cabdff1aSopenharmony_ci%endif
174cabdff1aSopenharmony_ci
175cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
176cabdff1aSopenharmony_ci; void ff_pred4x4_dc_10(pixel *src, const pixel *topright, ptrdiff_t stride)
177cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
178cabdff1aSopenharmony_ci
179cabdff1aSopenharmony_ciINIT_MMX mmxext
; DC 4x4 prediction: fills the block with
; (sum of 4 left pixels + sum of 4 top pixels + 4) >> 3.
; NOTE(review): HADDW and SPLATW are x86util.asm helpers; assumed to be a
; horizontal word sum (with scratch register) and a word broadcast -- confirm.
180cabdff1aSopenharmony_cicglobal pred4x4_dc_10, 3, 3
181cabdff1aSopenharmony_ci    sub    r0, r2               ; r0 = row above the block
182cabdff1aSopenharmony_ci    lea    r1, [r0+r2*2]        ; r1 = second row of the block
183cabdff1aSopenharmony_ci    movq   m2, [r0+r2*1-8]      ; 4 words left of row 0; top word is the pixel at x = -1
184cabdff1aSopenharmony_ci    paddw  m2, [r0+r2*2-8]      ; + row 1
185cabdff1aSopenharmony_ci    paddw  m2, [r1+r2*1-8]      ; + row 2
186cabdff1aSopenharmony_ci    paddw  m2, [r1+r2*2-8]      ; + row 3
187cabdff1aSopenharmony_ci    psrlq  m2, 48               ; keep only the top word: sum of the left column
188cabdff1aSopenharmony_ci    movq   m0, [r0]             ; top row t0..t3
189cabdff1aSopenharmony_ci    HADDW  m0, m1               ; sum of the top row (m1 is scratch)
190cabdff1aSopenharmony_ci    paddw  m0, [pw_4]           ; rounding term
191cabdff1aSopenharmony_ci    paddw  m0, m2               ; + left column sum
192cabdff1aSopenharmony_ci    psrlw  m0, 3                ; divide by 8
193cabdff1aSopenharmony_ci    SPLATW m0, m0, 0            ; broadcast word 0 to all lanes
194cabdff1aSopenharmony_ci    movq   [r0+r2*1], m0        ; row 0
195cabdff1aSopenharmony_ci    movq   [r0+r2*2], m0        ; row 1
196cabdff1aSopenharmony_ci    movq   [r1+r2*1], m0        ; row 2
197cabdff1aSopenharmony_ci    movq   [r1+r2*2], m0        ; row 3
198cabdff1aSopenharmony_ci    RET
199cabdff1aSopenharmony_ci
200cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
201cabdff1aSopenharmony_ci; void ff_pred4x4_down_left_10(pixel *src, const pixel *topright,
202cabdff1aSopenharmony_ci;                              ptrdiff_t stride)
203cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
204cabdff1aSopenharmony_ci%macro PRED4x4_DL 0
; Diagonal down-left 4x4 prediction. The top row (t0..t3) and the four
; pixels read through the topright pointer (t4..t7) form one 8-word vector
; that is low-pass filtered; each row is a 4-word window of the result.
; Lane lists in the comments below are written low word first.
205cabdff1aSopenharmony_cicglobal pred4x4_down_left_10, 3, 3
206cabdff1aSopenharmony_ci    sub        r0, r2               ; r0 = row above the block
207cabdff1aSopenharmony_ci    movq       m0, [r0]             ; low half = t0..t3
208cabdff1aSopenharmony_ci    movhps     m0, [r1]             ; high half = t4..t7 (r1 = topright argument)
209cabdff1aSopenharmony_ci    psrldq     m2, m0, 2            ; m2 = t1..t7, 0
210cabdff1aSopenharmony_ci    pslldq     m3, m0, 2            ; m3 = 0, t0..t6
211cabdff1aSopenharmony_ci    pshufhw    m2, m2, 10100100b    ; copy word 6 into word 7: m2 = t1..t7, t7
212cabdff1aSopenharmony_ci    PRED4x4_LOWPASS m0, m3, m2, m0  ; filter m0 with neighbours m3 / m2
213cabdff1aSopenharmony_ci    lea        r1, [r0+r2*2]        ; r1 = second row of the block (topright no longer needed)
214cabdff1aSopenharmony_ci    movhps     [r1+r2*2], m0        ; row 3 = filtered words 4..7
215cabdff1aSopenharmony_ci    psrldq     m0, 2                ; drop word 0 (it was filtered against the zero lane)
216cabdff1aSopenharmony_ci    movq       [r0+r2*1], m0        ; row 0 = filtered words 1..4
217cabdff1aSopenharmony_ci    psrldq     m0, 2
218cabdff1aSopenharmony_ci    movq       [r0+r2*2], m0        ; row 1 = filtered words 2..5
219cabdff1aSopenharmony_ci    psrldq     m0, 2
220cabdff1aSopenharmony_ci    movq       [r1+r2*1], m0        ; row 2 = filtered words 3..6
221cabdff1aSopenharmony_ci    RET
222cabdff1aSopenharmony_ci%endmacro
223cabdff1aSopenharmony_ci
; Instantiate the macro once per instruction-set setting; the AVX copy is
; only built when HAVE_AVX_EXTERNAL is set.
224cabdff1aSopenharmony_ciINIT_XMM sse2
225cabdff1aSopenharmony_ciPRED4x4_DL
226cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL
227cabdff1aSopenharmony_ciINIT_XMM avx
228cabdff1aSopenharmony_ciPRED4x4_DL
229cabdff1aSopenharmony_ci%endif
230cabdff1aSopenharmony_ci
231cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
232cabdff1aSopenharmony_ci; void ff_pred4x4_vertical_left_10(pixel *src, const pixel *topright,
233cabdff1aSopenharmony_ci;                                  ptrdiff_t stride)
234cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
235cabdff1aSopenharmony_ci%macro PRED4x4_VL 0
; Vertical-left 4x4 prediction. Works on the top row (t0..t3) followed by
; the four pixels read through the topright pointer (t4..t7). Rows 0 and 2
; use averages of adjacent pixels, rows 1 and 3 the 3-tap low-pass values;
; rows 2 and 3 are shifted one pixel further along the edge.
236cabdff1aSopenharmony_cicglobal pred4x4_vertical_left_10, 3, 3
237cabdff1aSopenharmony_ci    sub        r0, r2               ; r0 = row above the block
238cabdff1aSopenharmony_ci    movu       m1, [r0]             ; unaligned 16-byte load from the top row
239cabdff1aSopenharmony_ci    movhps     m1, [r1]             ; replace the high half with t4..t7 (r1 = topright argument)
240cabdff1aSopenharmony_ci    psrldq     m0, m1, 2            ; m0 = t1..t7, 0
241cabdff1aSopenharmony_ci    psrldq     m2, m1, 4            ; m2 = t2..t7, 0, 0
242cabdff1aSopenharmony_ci    pavgw      m4, m0, m1           ; m4[n] = avg(t[n], t[n+1])
243cabdff1aSopenharmony_ci    PRED4x4_LOWPASS m0, m1, m2, m0  ; m0[n] = low-pass centred on t[n+1] (clobbers m1)
244cabdff1aSopenharmony_ci    lea        r1, [r0+r2*2]        ; r1 = second row of the block (topright no longer needed)
245cabdff1aSopenharmony_ci    movq       [r0+r2*1], m4        ; row 0 = averages 0..3
246cabdff1aSopenharmony_ci    movq       [r0+r2*2], m0        ; row 1 = filtered 0..3
247cabdff1aSopenharmony_ci    psrldq     m4, 2                ; advance both vectors by one pixel
248cabdff1aSopenharmony_ci    psrldq     m0, 2
249cabdff1aSopenharmony_ci    movq       [r1+r2*1], m4        ; row 2 = averages 1..4
250cabdff1aSopenharmony_ci    movq       [r1+r2*2], m0        ; row 3 = filtered 1..4
251cabdff1aSopenharmony_ci    RET
252cabdff1aSopenharmony_ci%endmacro
253cabdff1aSopenharmony_ci
; Instantiate the macro once per instruction-set setting; the AVX copy is
; only built when HAVE_AVX_EXTERNAL is set.
254cabdff1aSopenharmony_ciINIT_XMM sse2
255cabdff1aSopenharmony_ciPRED4x4_VL
256cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL
257cabdff1aSopenharmony_ciINIT_XMM avx
258cabdff1aSopenharmony_ciPRED4x4_VL
259cabdff1aSopenharmony_ci%endif
260cabdff1aSopenharmony_ci
261cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
262cabdff1aSopenharmony_ci; void ff_pred4x4_horizontal_up_10(pixel *src, const pixel *topright,
263cabdff1aSopenharmony_ci;                                  ptrdiff_t stride)
264cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
265cabdff1aSopenharmony_ciINIT_MMX mmxext
; Horizontal-up 4x4 prediction from the left column l0..l3 only.
; Rows 0..2 interleave pairwise averages with 3-tap low-pass values of the
; column; the tail of row 2 and all of row 3 are filled with l3.
; Lane lists in the comments below are written low word first.
266cabdff1aSopenharmony_cicglobal pred4x4_horizontal_up_10, 3, 3
267cabdff1aSopenharmony_ci    sub       r0, r2                ; r0 = row above the block
268cabdff1aSopenharmony_ci    lea       r1, [r0+r2*2]         ; r1 = second row of the block
269cabdff1aSopenharmony_ci    movq      m0, [r0+r2*1-8]       ; 4 words left of row 0 (top word = l0)
270cabdff1aSopenharmony_ci    punpckhwd m0, [r0+r2*2-8]       ; words 2..3 = l0, l1
271cabdff1aSopenharmony_ci    movq      m1, [r1+r2*1-8]       ; 4 words left of row 2 (top word = l2)
272cabdff1aSopenharmony_ci    punpckhwd m1, [r1+r2*2-8]       ; words 2..3 = l2, l3
273cabdff1aSopenharmony_ci    punpckhdq m0, m1                ; m0 = l0 l1 l2 l3
274cabdff1aSopenharmony_ci    pshufw    m1, m1, 0xFF          ; m1 = l3 l3 l3 l3
275cabdff1aSopenharmony_ci    movq      [r1+r2*2], m1         ; row 3 = l3 everywhere
276cabdff1aSopenharmony_ci    movd      [r1+r2*1+4], m1       ; row 2, pixels 2..3 = l3
277cabdff1aSopenharmony_ci    pshufw    m2, m0, 11111001b     ; m2 = l1 l2 l3 l3
278cabdff1aSopenharmony_ci    movq      m1, m2                ; keep a copy as the low-pass centre tap
279cabdff1aSopenharmony_ci    pavgw     m2, m0                ; m2 = avg(l0,l1) avg(l1,l2) avg(l2,l3) l3
280cabdff1aSopenharmony_ci
281cabdff1aSopenharmony_ci    pshufw    m5, m0, 11111110b     ; m5 = l2 l3 l3 l3
282cabdff1aSopenharmony_ci    PRED4x4_LOWPASS m1, m0, m5, m1  ; filter m1 with neighbours m0 / m5 (clobbers m0)
283cabdff1aSopenharmony_ci    movq      m6, m2
284cabdff1aSopenharmony_ci    punpcklwd m6, m1                ; avg0 lp0 avg1 lp1
285cabdff1aSopenharmony_ci    movq      [r0+r2*1], m6         ; row 0
286cabdff1aSopenharmony_ci    psrlq     m2, 16                ; drop the first average
287cabdff1aSopenharmony_ci    psrlq     m1, 16                ; drop the first low-pass value
288cabdff1aSopenharmony_ci    punpcklwd m2, m1                ; avg1 lp1 avg2 lp2
289cabdff1aSopenharmony_ci    movq      [r0+r2*2], m2         ; row 1
290cabdff1aSopenharmony_ci    psrlq     m2, 32                ; avg2 lp2 in the low dword
291cabdff1aSopenharmony_ci    movd      [r1+r2*1], m2         ; row 2, pixels 0..1
292cabdff1aSopenharmony_ci    RET
293cabdff1aSopenharmony_ci
294cabdff1aSopenharmony_ci
295cabdff1aSopenharmony_ci
296cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
297cabdff1aSopenharmony_ci; void ff_pred8x8_vertical_10(pixel *src, ptrdiff_t stride)
298cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
299cabdff1aSopenharmony_ciINIT_XMM sse2
; Vertical 8x8 prediction: copies the 16-byte row above the block into all
; eight rows. Uses aligned loads/stores (mova) throughout.
300cabdff1aSopenharmony_cicglobal pred8x8_vertical_10, 2, 2
301cabdff1aSopenharmony_ci    sub  r0, r1                 ; r0 = row above the block
302cabdff1aSopenharmony_ci    mova m0, [r0]               ; m0 = the eight top pixels
; Three iterations store two rows each and step down two rows.
303cabdff1aSopenharmony_ci%rep 3
304cabdff1aSopenharmony_ci    mova [r0+r1*1], m0
305cabdff1aSopenharmony_ci    mova [r0+r1*2], m0
306cabdff1aSopenharmony_ci    lea  r0, [r0+r1*2]
307cabdff1aSopenharmony_ci%endrep
; Final two rows (7th and 8th) without the trailing pointer update.
308cabdff1aSopenharmony_ci    mova [r0+r1*1], m0
309cabdff1aSopenharmony_ci    mova [r0+r1*2], m0
310cabdff1aSopenharmony_ci    RET
311cabdff1aSopenharmony_ci
312cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
313cabdff1aSopenharmony_ci; void ff_pred8x8_horizontal_10(pixel *src, ptrdiff_t stride)
314cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
315cabdff1aSopenharmony_ciINIT_XMM sse2
; Horizontal 8x8 prediction: every row is filled with the pixel immediately
; to its left. The loop handles two rows per iteration, four iterations.
316cabdff1aSopenharmony_cicglobal pred8x8_horizontal_10, 2, 3
317cabdff1aSopenharmony_ci    mov         r2d, 4                  ; iteration counter (2 rows each)
318cabdff1aSopenharmony_ci.loop:
319cabdff1aSopenharmony_ci    movq         m0, [r0+r1*0-8]        ; 4 words left of this row; word 3 is the pixel at x = -1
320cabdff1aSopenharmony_ci    movq         m1, [r0+r1*1-8]        ; same for the next row
321cabdff1aSopenharmony_ci    pshuflw      m0, m0, 0xff           ; broadcast word 3 over the low four words
322cabdff1aSopenharmony_ci    pshuflw      m1, m1, 0xff
323cabdff1aSopenharmony_ci    punpcklqdq   m0, m0                 ; duplicate the low half into the high half
324cabdff1aSopenharmony_ci    punpcklqdq   m1, m1
325cabdff1aSopenharmony_ci    mova  [r0+r1*0], m0                 ; aligned store of the filled row
326cabdff1aSopenharmony_ci    mova  [r0+r1*1], m1
327cabdff1aSopenharmony_ci    lea          r0, [r0+r1*2]          ; step down two rows
328cabdff1aSopenharmony_ci    dec          r2d
329cabdff1aSopenharmony_ci    jg .loop                            ; repeat while the counter is still positive
330cabdff1aSopenharmony_ci    REP_RET
331cabdff1aSopenharmony_ci
332cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
333cabdff1aSopenharmony_ci; void ff_pred8x8_dc_10(pixel *src, ptrdiff_t stride)
334cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
; MOV8 addr, reg [, unused]
; Stores the full register to [addr] with an aligned 16-byte movdqa.
; An optional third argument is accepted, but the body never references it.
335cabdff1aSopenharmony_ci%macro MOV8 2-3
336cabdff1aSopenharmony_ci; sort of a hack, but it works
337cabdff1aSopenharmony_ci    movdqa    [%1], %2
338cabdff1aSopenharmony_ci%endmacro
339cabdff1aSopenharmony_ci
; PRED8x8_DC shuffle_op
; The single argument is the word-shuffle instruction used on the low four
; words (instantiated below with pshuflw).
; Computes four partial sums -- s0 = top pixels 0..3, s1 = top pixels 4..7,
; s2 = left pixels of rows 0..3, s3 = left pixels of rows 4..7 -- and fills
; each 4x4 quadrant of the 8x8 block with its own DC value:
;   top-left from s0+s2, top-right from s1, bottom-left from s3,
;   bottom-right from s1+s3.
; NOTE(review): SWAP is an x86inc-style register-renaming helper that is not
; visible here; after "SWAP 0,1" the name m1 is assumed to refer to the
; register previously called m0 -- confirm against its definition.
340cabdff1aSopenharmony_ci%macro PRED8x8_DC 1
341cabdff1aSopenharmony_cicglobal pred8x8_dc_10, 2, 6
342cabdff1aSopenharmony_ci    sub         r0, r1                ; r0 = row above the block
343cabdff1aSopenharmony_ci    pxor        m4, m4                ; zero vector for the rounding pavgw below
344cabdff1aSopenharmony_ci    movq        m0, [r0+0]            ; t0..t3
345cabdff1aSopenharmony_ci    movq        m1, [r0+8]            ; t4..t7
346cabdff1aSopenharmony_ci    punpcklwd   m0, m1                ; t0 t4 t1 t5 t2 t6 t3 t7
347cabdff1aSopenharmony_ci    movhlps     m1, m0                ; low half of m1 = t2 t6 t3 t7
348cabdff1aSopenharmony_ci    paddw       m0, m1                ; t0+t2, t4+t6, t1+t3, t5+t7
349cabdff1aSopenharmony_ci    %1          m2, m0, 00001110b     ; bring words 2..3 down to words 0..1
350cabdff1aSopenharmony_ci    paddw       m0, m2                ; word 0 = s0, word 1 = s1
351cabdff1aSopenharmony_ci
352cabdff1aSopenharmony_ci    lea         r5, [r1*3]            ; r5 = 3 * stride
353cabdff1aSopenharmony_ci    lea         r4, [r0+r1*4]         ; r4 = fourth row of the block
; Scalar sum of the left-neighbour pixel (x = -1) of rows 0..3.
354cabdff1aSopenharmony_ci    movzx      r2d, word [r0+r1*1-2]
355cabdff1aSopenharmony_ci    movzx      r3d, word [r0+r1*2-2]
356cabdff1aSopenharmony_ci    add        r2d, r3d
357cabdff1aSopenharmony_ci    movzx      r3d, word [r0+r5*1-2]
358cabdff1aSopenharmony_ci    add        r2d, r3d
359cabdff1aSopenharmony_ci    movzx      r3d, word [r4-2]
360cabdff1aSopenharmony_ci    add        r2d, r3d
361cabdff1aSopenharmony_ci    movd        m2, r2d            ; s2
362cabdff1aSopenharmony_ci
; Scalar sum of the left-neighbour pixel (x = -1) of rows 4..7.
363cabdff1aSopenharmony_ci    movzx      r2d, word [r4+r1*1-2]
364cabdff1aSopenharmony_ci    movzx      r3d, word [r4+r1*2-2]
365cabdff1aSopenharmony_ci    add        r2d, r3d
366cabdff1aSopenharmony_ci    movzx      r3d, word [r4+r5*1-2]
367cabdff1aSopenharmony_ci    add        r2d, r3d
368cabdff1aSopenharmony_ci    movzx      r3d, word [r4+r1*4-2]
369cabdff1aSopenharmony_ci    add        r2d, r3d
370cabdff1aSopenharmony_ci    movd        m3, r2d            ; s3
371cabdff1aSopenharmony_ci
372cabdff1aSopenharmony_ci    punpcklwd   m2, m3            ; words 0..1 = s2, s3
373cabdff1aSopenharmony_ci    punpckldq   m0, m2            ; s0, s1, s2, s3
374cabdff1aSopenharmony_ci    %1          m3, m0, 11110110b ; s2, s1, s3, s3
375cabdff1aSopenharmony_ci    %1          m0, m0, 01110100b ; s0, s1, s3, s1
376cabdff1aSopenharmony_ci    paddw       m0, m3            ; s0+s2, 2*s1, 2*s3, s1+s3
377cabdff1aSopenharmony_ci    psrlw       m0, 2
378cabdff1aSopenharmony_ci    pavgw       m0, m4            ; s0+s2, s1, s3, s1+s3
379cabdff1aSopenharmony_ci    punpcklwd   m0, m0            ; each DC value duplicated into a dword
380cabdff1aSopenharmony_ci    pshufd      m3, m0, 11111010b ; bottom rows: dword 2 twice, dword 3 twice
381cabdff1aSopenharmony_ci    punpckldq   m0, m0            ; top rows: dword 0 twice, dword 1 twice
382cabdff1aSopenharmony_ci    SWAP         0,1
; Rows 0..3 take the top-half pattern, rows 4..7 the bottom-half pattern.
383cabdff1aSopenharmony_ci    MOV8   r0+r1*1, m1, m2
384cabdff1aSopenharmony_ci    MOV8   r0+r1*2, m1, m2
385cabdff1aSopenharmony_ci    MOV8   r0+r5*1, m1, m2
386cabdff1aSopenharmony_ci    MOV8   r0+r1*4, m1, m2
387cabdff1aSopenharmony_ci    MOV8   r4+r1*1, m3, m4
388cabdff1aSopenharmony_ci    MOV8   r4+r1*2, m3, m4
389cabdff1aSopenharmony_ci    MOV8   r4+r5*1, m3, m4
390cabdff1aSopenharmony_ci    MOV8   r4+r1*4, m3, m4
391cabdff1aSopenharmony_ci    RET
392cabdff1aSopenharmony_ci%endmacro
393cabdff1aSopenharmony_ci
394cabdff1aSopenharmony_ciINIT_XMM sse2
395cabdff1aSopenharmony_ciPRED8x8_DC pshuflw
396cabdff1aSopenharmony_ci
397cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
398cabdff1aSopenharmony_ci; void ff_pred8x8_top_dc_10(pixel *src, ptrdiff_t stride)
399cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
400cabdff1aSopenharmony_ciINIT_XMM sse2
; Top-only DC 8x8 prediction: the left four columns are filled with
; (t0+t1+t2+t3+2)>>2 and the right four columns with (t4+t5+t6+t7+2)>>2,
; where t0..t7 is the row above the block.
401cabdff1aSopenharmony_cicglobal pred8x8_top_dc_10, 2, 4
402cabdff1aSopenharmony_ci    sub         r0, r1              ; r0 = row above the block
403cabdff1aSopenharmony_ci    mova        m0, [r0]            ; t0..t7
404cabdff1aSopenharmony_ci    pshuflw     m1, m0, 0x4e        ; swap word pairs within the low half
405cabdff1aSopenharmony_ci    pshufhw     m1, m1, 0x4e        ; ... and within the high half
406cabdff1aSopenharmony_ci    paddw       m0, m1              ; each word += the word two lanes away
407cabdff1aSopenharmony_ci    pshuflw     m1, m0, 0xb1        ; swap adjacent words within the low half
408cabdff1aSopenharmony_ci    pshufhw     m1, m1, 0xb1        ; ... and within the high half
409cabdff1aSopenharmony_ci    paddw       m0, m1              ; every word = sum of its 4-word half
410cabdff1aSopenharmony_ci    lea         r2, [r1*3]          ; r2 = 3 * stride
411cabdff1aSopenharmony_ci    lea         r3, [r0+r1*4]       ; r3 = fourth row of the block
412cabdff1aSopenharmony_ci    paddw       m0, [pw_2]          ; rounding term
413cabdff1aSopenharmony_ci    psrlw       m0, 2               ; divide by 4
; Store the same vector to all eight rows.
414cabdff1aSopenharmony_ci    mova [r0+r1*1], m0
415cabdff1aSopenharmony_ci    mova [r0+r1*2], m0
416cabdff1aSopenharmony_ci    mova [r0+r2*1], m0
417cabdff1aSopenharmony_ci    mova [r0+r1*4], m0
418cabdff1aSopenharmony_ci    mova [r3+r1*1], m0
419cabdff1aSopenharmony_ci    mova [r3+r1*2], m0
420cabdff1aSopenharmony_ci    mova [r3+r2*1], m0
421cabdff1aSopenharmony_ci    mova [r3+r1*4], m0
422cabdff1aSopenharmony_ci    RET
423cabdff1aSopenharmony_ci
424cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
425cabdff1aSopenharmony_ci; void ff_pred8x8_plane_10(pixel *src, ptrdiff_t stride)
426cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
427cabdff1aSopenharmony_ciINIT_XMM sse2
; Plane 8x8 prediction. Derives a horizontal gradient H from the top row and
; a vertical gradient V from the left column, scales them to b and c with
; (17 * g + 16) >> 5, and then writes, for each row y and column x,
;   clip((a + b*(x-3) + c*(y-3) + 16) >> 5)
; where a = 16 * (src[7*stride-1] + src[-stride+7]).
; NOTE(review): HADDD, SPLATW and CLIPW are x86util.asm helpers that are not
; visible here; assumed to be a horizontal dword sum, a word broadcast and a
; clamp to [min, max] respectively -- confirm against their definitions.
428cabdff1aSopenharmony_cicglobal pred8x8_plane_10, 2, 7, 7
429cabdff1aSopenharmony_ci    sub       r0, r1               ; r0 = row above the block
430cabdff1aSopenharmony_ci    lea       r2, [r1*3]           ; r2 = 3 * stride
431cabdff1aSopenharmony_ci    lea       r3, [r0+r1*4]        ; r3 = fourth row of the block
432cabdff1aSopenharmony_ci    mova      m2, [r0]             ; top row t0..t7
433cabdff1aSopenharmony_ci    pmaddwd   m2, [pw_m32101234]   ; weight the columns by -3..4, pairwise sums
434cabdff1aSopenharmony_ci    HADDD     m2, m1               ; total of the weighted top row (m1 is scratch)
435cabdff1aSopenharmony_ci    movd      m0, [r0-4]           ; words at x = -2 and x = -1 of the top row
436cabdff1aSopenharmony_ci    psrld     m0, 14               ; shift the dword right by 14
437cabdff1aSopenharmony_ci    psubw     m2, m0               ; H
438cabdff1aSopenharmony_ci    movd      m0, [r3+r1*4-4]      ; row 7: words at x = -2 and x = -1
439cabdff1aSopenharmony_ci    movd      m1, [r0+12]          ; top row: words at x = 6 and x = 7
440cabdff1aSopenharmony_ci    paddw     m0, m1               ; word 1 holds the sum of the two corner pixels
441cabdff1aSopenharmony_ci    psllw     m0, 4                ; 16*(src[7*stride-1] + src[-stride+7])
; V = 1*(row4 - row2) + 2*(row5 - row1) + 3*(row6 - row0) + 4*(row7 - row-1),
; using the left-neighbour pixel (x = -1) of each row, accumulated in r4d.
442cabdff1aSopenharmony_ci    movzx    r4d, word [r3+r1*1-2] ; src[4*stride-1]
443cabdff1aSopenharmony_ci    movzx    r5d, word [r0+r2*1-2] ; src[2*stride-1]
444cabdff1aSopenharmony_ci    sub      r4d, r5d
445cabdff1aSopenharmony_ci    movzx    r6d, word [r3+r1*2-2] ; src[5*stride-1]
446cabdff1aSopenharmony_ci    movzx    r5d, word [r0+r1*2-2] ; src[1*stride-1]
447cabdff1aSopenharmony_ci    sub      r6d, r5d
448cabdff1aSopenharmony_ci    lea      r4d, [r4+r6*2]        ; add the weight-2 term
449cabdff1aSopenharmony_ci    movzx    r5d, word [r3+r2*1-2] ; src[6*stride-1]
450cabdff1aSopenharmony_ci    movzx    r6d, word [r0+r1*1-2] ; src[0*stride-1]
451cabdff1aSopenharmony_ci    sub      r5d, r6d
452cabdff1aSopenharmony_ci    lea      r5d, [r5*3]           ; weight-3 term
453cabdff1aSopenharmony_ci    add      r4d, r5d
454cabdff1aSopenharmony_ci    movzx    r6d, word [r3+r1*4-2] ; src[7*stride-1]
455cabdff1aSopenharmony_ci    movzx    r5d, word [r0+r1*0-2] ; src[ -stride-1]
456cabdff1aSopenharmony_ci    sub      r6d, r5d
457cabdff1aSopenharmony_ci    lea      r4d, [r4+r6*4]        ; add the weight-4 term
458cabdff1aSopenharmony_ci    movd      m3, r4d              ; V
459cabdff1aSopenharmony_ci    punpckldq m2, m3               ; dword 0 = H, dword 1 = V
460cabdff1aSopenharmony_ci    pmaddwd   m2, [pd_17]          ; multiply the low word of each dword by 17
461cabdff1aSopenharmony_ci    paddd     m2, [pd_16]          ; rounding term
462cabdff1aSopenharmony_ci    psrad     m2, 5                ; b, c
463cabdff1aSopenharmony_ci
464cabdff1aSopenharmony_ci    mova      m3, [pw_pixel_max]   ; upper clip bound
465cabdff1aSopenharmony_ci    pxor      m1, m1               ; lower clip bound (zero)
466cabdff1aSopenharmony_ci    SPLATW    m0, m0, 1            ; broadcast a (word 1 of m0)
467cabdff1aSopenharmony_ci    SPLATW    m4, m2, 2            ; broadcast c (low word of dword 1)
468cabdff1aSopenharmony_ci    SPLATW    m2, m2, 0            ; broadcast b (low word of dword 0)
469cabdff1aSopenharmony_ci    pmullw    m2, [pw_m32101234]   ; b
470cabdff1aSopenharmony_ci    pmullw    m5, m4, [pw_m3]      ; c
471cabdff1aSopenharmony_ci    paddw     m5, [pw_16]          ; fold the rounding term into the row offset
472cabdff1aSopenharmony_ci    mov      r2d, 8                ; row counter (r2 reused; 3*stride is no longer needed)
473cabdff1aSopenharmony_ci    add       r0, r1               ; r0 = first row of the block
474cabdff1aSopenharmony_ci.loop:
475cabdff1aSopenharmony_ci    paddsw    m6, m2, m5           ; column term + row term (signed saturating)
476cabdff1aSopenharmony_ci    paddsw    m6, m0               ; + a
477cabdff1aSopenharmony_ci    psraw     m6, 5                ; arithmetic shift right by 5
478cabdff1aSopenharmony_ci    CLIPW     m6, m1, m3           ; clamp using the zero and pixel-max vectors
479cabdff1aSopenharmony_ci    mova    [r0], m6               ; store the row
480cabdff1aSopenharmony_ci    paddw     m5, m4               ; row term += c
481cabdff1aSopenharmony_ci    add       r0, r1               ; next row
482cabdff1aSopenharmony_ci    dec r2d
483cabdff1aSopenharmony_ci    jg .loop
484cabdff1aSopenharmony_ci    REP_RET
485cabdff1aSopenharmony_ci
486cabdff1aSopenharmony_ci
487cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
488cabdff1aSopenharmony_ci; void ff_pred8x8l_128_dc_10(pixel *src, int has_topleft, int has_topright,
489cabdff1aSopenharmony_ci;                            ptrdiff_t stride)
490cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
491cabdff1aSopenharmony_ciINIT_XMM sse2
; Fills all eight rows of the 8x8 block with the pw_512 constant.
; The second and third arguments (r1, r2) are never read; both registers are
; overwritten and reused as address temporaries.
492cabdff1aSopenharmony_cicglobal pred8x8l_128_dc_10, 4, 4
493cabdff1aSopenharmony_ci    mova      m0, [pw_512] ; (1<<(BIT_DEPTH-1))
494cabdff1aSopenharmony_ci    lea       r1, [r3*3]            ; r1 = 3 * stride
495cabdff1aSopenharmony_ci    lea       r2, [r0+r3*4]         ; r2 = fifth row of the block
496cabdff1aSopenharmony_ci    MOV8 r0+r3*0, m0, m0
497cabdff1aSopenharmony_ci    MOV8 r0+r3*1, m0, m0
498cabdff1aSopenharmony_ci    MOV8 r0+r3*2, m0, m0
499cabdff1aSopenharmony_ci    MOV8 r0+r1*1, m0, m0
500cabdff1aSopenharmony_ci    MOV8 r2+r3*0, m0, m0
501cabdff1aSopenharmony_ci    MOV8 r2+r3*1, m0, m0
502cabdff1aSopenharmony_ci    MOV8 r2+r3*2, m0, m0
503cabdff1aSopenharmony_ci    MOV8 r2+r1*1, m0, m0
504cabdff1aSopenharmony_ci    RET
505cabdff1aSopenharmony_ci
506cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
507cabdff1aSopenharmony_ci; void ff_pred8x8l_top_dc_10(pixel *src, int has_topleft, int has_topright,
508cabdff1aSopenharmony_ci;                            ptrdiff_t stride)
509cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
510cabdff1aSopenharmony_ci%macro PRED8x8L_TOP_DC 0
; Top-only DC for the filtered 8x8 (8x8l) mode: low-pass filters the row
; above the block, then fills the block with (sum of filtered row + 4) >> 3.
; NOTE(review): the shifts by 14 and 13 turn has_topleft / has_topright into
; byte offsets. This presumably relies on the caller passing 0 or a single
; high bit (so the results are 0 or 2) -- verify against the caller.
; NOTE(review): HADDW and SPLATW are x86util.asm helpers; assumed to be a
; horizontal word sum (with scratch register) and a word broadcast -- confirm.
511cabdff1aSopenharmony_cicglobal pred8x8l_top_dc_10, 4, 4, 6
512cabdff1aSopenharmony_ci    sub         r0, r3              ; r0 = row above the block
513cabdff1aSopenharmony_ci    mova        m0, [r0]            ; top row t0..t7
514cabdff1aSopenharmony_ci    shr        r1d, 14              ; has_topleft  -> byte offset
515cabdff1aSopenharmony_ci    shr        r2d, 13              ; has_topright -> byte offset
516cabdff1aSopenharmony_ci    neg         r1                  ; top-left offset points backwards
517cabdff1aSopenharmony_ci    pslldq      m1, m0, 2           ; left neighbours: word 0 still to be filled
518cabdff1aSopenharmony_ci    psrldq      m2, m0, 2           ; right neighbours: word 7 still to be filled
519cabdff1aSopenharmony_ci    pinsrw      m1, [r0+r1], 0      ; word 0 = pixel at r0 + offset (t0 when the offset is 0)
520cabdff1aSopenharmony_ci    pinsrw      m2, [r0+r2+14], 7   ; word 7 = pixel at r0 + 14 + offset (t7 when the offset is 0)
521cabdff1aSopenharmony_ci    lea         r1, [r3*3]          ; r1 = 3 * stride (flag no longer needed)
522cabdff1aSopenharmony_ci    lea         r2, [r0+r3*4]       ; r2 = fourth row of the block
523cabdff1aSopenharmony_ci    PRED4x4_LOWPASS m0, m2, m1, m0  ; filtered top row
524cabdff1aSopenharmony_ci    HADDW       m0, m1              ; sum of the filtered row (m1 is scratch)
525cabdff1aSopenharmony_ci    paddw       m0, [pw_4]          ; rounding term
526cabdff1aSopenharmony_ci    psrlw       m0, 3               ; divide by 8
527cabdff1aSopenharmony_ci    SPLATW      m0, m0, 0           ; broadcast the DC value
; Store the DC vector to all eight rows.
528cabdff1aSopenharmony_ci    mova [r0+r3*1], m0
529cabdff1aSopenharmony_ci    mova [r0+r3*2], m0
530cabdff1aSopenharmony_ci    mova [r0+r1*1], m0
531cabdff1aSopenharmony_ci    mova [r0+r3*4], m0
532cabdff1aSopenharmony_ci    mova [r2+r3*1], m0
533cabdff1aSopenharmony_ci    mova [r2+r3*2], m0
534cabdff1aSopenharmony_ci    mova [r2+r1*1], m0
535cabdff1aSopenharmony_ci    mova [r2+r3*4], m0
536cabdff1aSopenharmony_ci    RET
537cabdff1aSopenharmony_ci%endmacro
538cabdff1aSopenharmony_ci
; Instantiate the macro once per instruction-set setting; the AVX copy is
; only built when HAVE_AVX_EXTERNAL is set.
539cabdff1aSopenharmony_ciINIT_XMM sse2
540cabdff1aSopenharmony_ciPRED8x8L_TOP_DC
541cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL
542cabdff1aSopenharmony_ciINIT_XMM avx
543cabdff1aSopenharmony_ciPRED8x8L_TOP_DC
544cabdff1aSopenharmony_ci%endif
545cabdff1aSopenharmony_ci
546cabdff1aSopenharmony_ci;-------------------------------------------------------------------------------
547cabdff1aSopenharmony_ci; void ff_pred8x8l_dc_10(pixel *src, int has_topleft, int has_topright,
548cabdff1aSopenharmony_ci;                        ptrdiff_t stride)
549cabdff1aSopenharmony_ci;-------------------------------------------------------------------------------
550cabdff1aSopenharmony_ci;TODO: see if scalar is faster
551cabdff1aSopenharmony_ci%macro PRED8x8L_DC 0
; DC for the filtered 8x8 (8x8l) mode: gathers the left column and the top
; row, low-pass filters both edges, and fills the block with
; (sum of filtered left + sum of filtered top + 8) >> 4.
; Lane lists in the comments below are written low word first.
; NOTE(review): the shifts by 14 and 13 turn has_topleft / has_topright into
; byte offsets. This presumably relies on the caller passing 0 or a single
; high bit (so the results are 0 or 2) -- verify against the caller.
; NOTE(review): HADDW and SPLATW are x86util.asm helpers; assumed to be a
; horizontal word sum (with scratch register) and a word broadcast (lane 0
; when no index is given) -- confirm against their definitions.
552cabdff1aSopenharmony_cicglobal pred8x8l_dc_10, 4, 6, 6
553cabdff1aSopenharmony_ci    sub         r0, r3              ; r0 = row above the block
554cabdff1aSopenharmony_ci    lea         r4, [r0+r3*4]       ; r4 = fourth row of the block
555cabdff1aSopenharmony_ci    lea         r5, [r3*3]          ; r5 = 3 * stride
; Gather the left column: each 16-byte load ends just before a row's first
; pixel, so its top word is that row's left neighbour.
556cabdff1aSopenharmony_ci    mova        m0, [r0+r3*2-16]    ; row 1 (top word = l1)
557cabdff1aSopenharmony_ci    punpckhwd   m0, [r0+r3*1-16]    ; words 6..7 = l1, l0
558cabdff1aSopenharmony_ci    mova        m1, [r4+r3*0-16]    ; row 3 (top word = l3)
559cabdff1aSopenharmony_ci    punpckhwd   m1, [r0+r5*1-16]    ; words 6..7 = l3, l2
560cabdff1aSopenharmony_ci    punpckhdq   m1, m0              ; high half = l3 l2 l1 l0
561cabdff1aSopenharmony_ci    mova        m2, [r4+r3*2-16]    ; row 5 (top word = l5)
562cabdff1aSopenharmony_ci    punpckhwd   m2, [r4+r3*1-16]    ; words 6..7 = l5, l4
563cabdff1aSopenharmony_ci    mova        m3, [r4+r3*4-16]    ; row 7 (top word = l7)
564cabdff1aSopenharmony_ci    punpckhwd   m3, [r4+r5*1-16]    ; words 6..7 = l7, l6
565cabdff1aSopenharmony_ci    punpckhdq   m3, m2              ; high half = l7 l6 l5 l4
566cabdff1aSopenharmony_ci    punpckhqdq  m3, m1              ; m3 = l7 l6 l5 l4 l3 l2 l1 l0
567cabdff1aSopenharmony_ci    mova        m0, [r0]            ; top row t0..t7
568cabdff1aSopenharmony_ci    shr        r1d, 14              ; has_topleft  -> byte offset
569cabdff1aSopenharmony_ci    shr        r2d, 13              ; has_topright -> byte offset
570cabdff1aSopenharmony_ci    neg         r1                  ; top-left offset points backwards
571cabdff1aSopenharmony_ci    pslldq      m1, m0, 2           ; top-row left neighbours: word 0 still to be filled
572cabdff1aSopenharmony_ci    psrldq      m2, m0, 2           ; top-row right neighbours: word 7 still to be filled
573cabdff1aSopenharmony_ci    pinsrw      m1, [r0+r1], 0      ; word 0 = pixel at r0 + offset (t0 when the offset is 0)
574cabdff1aSopenharmony_ci    pinsrw      m2, [r0+r2+14], 7   ; word 7 = pixel at r0 + 14 + offset (t7 when the offset is 0)
; r1 = ~r1 & stride: selects which pixel follows l0 in the left-edge filter.
; When r1 was 0 this yields stride, so the load below reads l0 itself;
; otherwise it reads relative to the top row (presumably the top-left
; pixel, given an even stride -- confirm).
575cabdff1aSopenharmony_ci    not         r1
576cabdff1aSopenharmony_ci    and         r1, r3
577cabdff1aSopenharmony_ci    pslldq      m4, m3, 2           ; shift the left column up one lane
578cabdff1aSopenharmony_ci    psrldq      m5, m3, 2           ; shift the left column down one lane
579cabdff1aSopenharmony_ci    pshuflw     m4, m4, 11100101b   ; duplicate word 1 into word 0 (l7 repeated)
580cabdff1aSopenharmony_ci    pinsrw      m5, [r0+r1-2], 7    ; fill word 7 with the pixel selected above
581cabdff1aSopenharmony_ci    PRED4x4_LOWPASS m3, m4, m5, m3  ; filtered left column
582cabdff1aSopenharmony_ci    PRED4x4_LOWPASS m0, m2, m1, m0  ; filtered top row
583cabdff1aSopenharmony_ci    paddw       m0, m3              ; lane-wise sum of both filtered edges
584cabdff1aSopenharmony_ci    HADDW       m0, m1              ; total of all 16 filtered pixels (m1 is scratch)
585cabdff1aSopenharmony_ci    paddw       m0, [pw_8]          ; rounding term
586cabdff1aSopenharmony_ci    psrlw       m0, 4               ; divide by 16
587cabdff1aSopenharmony_ci    SPLATW      m0, m0              ; broadcast the DC value
; Store the DC vector to all eight rows.
588cabdff1aSopenharmony_ci    mova [r0+r3*1], m0
589cabdff1aSopenharmony_ci    mova [r0+r3*2], m0
590cabdff1aSopenharmony_ci    mova [r0+r5*1], m0
591cabdff1aSopenharmony_ci    mova [r0+r3*4], m0
592cabdff1aSopenharmony_ci    mova [r4+r3*1], m0
593cabdff1aSopenharmony_ci    mova [r4+r3*2], m0
594cabdff1aSopenharmony_ci    mova [r4+r5*1], m0
595cabdff1aSopenharmony_ci    mova [r4+r3*4], m0
596cabdff1aSopenharmony_ci    RET
597cabdff1aSopenharmony_ci%endmacro
598cabdff1aSopenharmony_ci
; Instantiate the macro once per instruction-set setting; the AVX copy is
; only built when HAVE_AVX_EXTERNAL is set.
599cabdff1aSopenharmony_ciINIT_XMM sse2
600cabdff1aSopenharmony_ciPRED8x8L_DC
601cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL
602cabdff1aSopenharmony_ciINIT_XMM avx
603cabdff1aSopenharmony_ciPRED8x8L_DC
604cabdff1aSopenharmony_ci%endif
605cabdff1aSopenharmony_ci
606cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
607cabdff1aSopenharmony_ci; void ff_pred8x8l_vertical_10(pixel *src, int has_topleft, int has_topright,
608cabdff1aSopenharmony_ci;                              ptrdiff_t stride)
; r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride (byte offset).
; The row above the block is smoothed once and copied into all 8 rows.
; PRED4x4_LOWPASS is defined outside this chunk; presumably the 3-tap
; (left + 2*cur + right + 2) >> 2 filter - confirm.
609cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
610cabdff1aSopenharmony_ci%macro PRED8x8L_VERTICAL 0
611cabdff1aSopenharmony_cicglobal pred8x8l_vertical_10, 4, 4, 6
612cabdff1aSopenharmony_ci    sub         r0, r3                  ; r0 = src - stride (row above the block)
613cabdff1aSopenharmony_ci    mova        m0, [r0]                ; m0 = the 8 words of that row
614cabdff1aSopenharmony_ci    shr        r1d, 14
615cabdff1aSopenharmony_ci    shr        r2d, 13                  ; r2 = has_topright >> 13
616cabdff1aSopenharmony_ci    neg         r1                      ; r1 = -(has_topleft >> 14)
617cabdff1aSopenharmony_ci    pslldq      m1, m0, 2               ; m1 word i = m0 word i-1
618cabdff1aSopenharmony_ci    psrldq      m2, m0, 2               ; m2 word i = m0 word i+1
619cabdff1aSopenharmony_ci    pinsrw      m1, [r0+r1], 0          ; vacated word 0 <- [r0+r1]
620cabdff1aSopenharmony_ci    pinsrw      m2, [r0+r2+14], 7       ; vacated word 7 <- [r0+r2+14]
621cabdff1aSopenharmony_ci    lea         r1, [r3*3]              ; r1 reused: 3*stride
622cabdff1aSopenharmony_ci    lea         r2, [r0+r3*4]           ; r2 reused: r0 + 4*stride
623cabdff1aSopenharmony_ci    PRED4x4_LOWPASS m0, m2, m1, m0
    ; Same register goes to the rows at r0 + 1*stride .. r0 + 8*stride.
624cabdff1aSopenharmony_ci    mova [r0+r3*1], m0
625cabdff1aSopenharmony_ci    mova [r0+r3*2], m0
626cabdff1aSopenharmony_ci    mova [r0+r1*1], m0
627cabdff1aSopenharmony_ci    mova [r0+r3*4], m0
628cabdff1aSopenharmony_ci    mova [r2+r3*1], m0
629cabdff1aSopenharmony_ci    mova [r2+r3*2], m0
630cabdff1aSopenharmony_ci    mova [r2+r1*1], m0
631cabdff1aSopenharmony_ci    mova [r2+r3*4], m0
632cabdff1aSopenharmony_ci    RET
633cabdff1aSopenharmony_ci%endmacro
634cabdff1aSopenharmony_ci
635cabdff1aSopenharmony_ciINIT_XMM sse2
636cabdff1aSopenharmony_ciPRED8x8L_VERTICAL
637cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL
638cabdff1aSopenharmony_ciINIT_XMM avx
639cabdff1aSopenharmony_ciPRED8x8L_VERTICAL
640cabdff1aSopenharmony_ci%endif
641cabdff1aSopenharmony_ci
642cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
643cabdff1aSopenharmony_ci; void ff_pred8x8l_horizontal_10(pixel *src, int has_topleft,
644cabdff1aSopenharmony_ci;                                int has_topright, ptrdiff_t stride)
; r0 = src, r1 = has_topleft, r3 = stride (byte offset); r2 is only used as
; scratch here (the has_topright value is overwritten without being read).
; The word left of each row is gathered, smoothed, and broadcast across its row.
; PALIGNR and PRED4x4_LOWPASS are macros defined outside this chunk.
645cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
646cabdff1aSopenharmony_ci%macro PRED8x8L_HORIZONTAL 0
647cabdff1aSopenharmony_cicglobal pred8x8l_horizontal_10, 4, 4, 5
648cabdff1aSopenharmony_ci    mova        m0, [r0-16]             ; 16 bytes ending just left of row 0
649cabdff1aSopenharmony_ci    shr        r1d, 14
650cabdff1aSopenharmony_ci    dec         r1
651cabdff1aSopenharmony_ci    and         r1, r3
652cabdff1aSopenharmony_ci    sub         r1, r3                  ; r1 = (((has_topleft >> 14) - 1) & stride) - stride
653cabdff1aSopenharmony_ci    punpckhwd   m0, [r0+r1-16]          ; pair row 0's left word with the one at [r0+r1-2]
654cabdff1aSopenharmony_ci    mova        m1, [r0+r3*2-16]
655cabdff1aSopenharmony_ci    punpckhwd   m1, [r0+r3*1-16]
656cabdff1aSopenharmony_ci    lea         r2, [r0+r3*4]           ; r2 = r0 + 4*stride
657cabdff1aSopenharmony_ci    lea         r1, [r3*3]              ; r1 reused: 3*stride
658cabdff1aSopenharmony_ci    punpckhdq   m1, m0
659cabdff1aSopenharmony_ci    mova        m2, [r2+r3*0-16]
660cabdff1aSopenharmony_ci    punpckhwd   m2, [r0+r1-16]
661cabdff1aSopenharmony_ci    mova        m3, [r2+r3*2-16]
662cabdff1aSopenharmony_ci    punpckhwd   m3, [r2+r3*1-16]
663cabdff1aSopenharmony_ci    punpckhdq   m3, m2
664cabdff1aSopenharmony_ci    punpckhqdq  m3, m1                  ; m3 words 0..6 = left words of rows 6..0, word 7 = [r0+r1-2] from L653
665cabdff1aSopenharmony_ci    PALIGNR     m4, m3, [r2+r1-16], 14, m0 ; presumably m4 = row 7's left word followed by m3 words 0..6 - confirm
666cabdff1aSopenharmony_ci    pslldq      m0, m4, 2
667cabdff1aSopenharmony_ci    pshuflw     m0, m0, 11100101b       ; copy word 1 into the vacated word 0
668cabdff1aSopenharmony_ci    PRED4x4_LOWPASS m4, m3, m0, m4
669cabdff1aSopenharmony_ci    punpckhwd   m3, m4, m4              ; duplicate words 4..7 into dwords
670cabdff1aSopenharmony_ci    punpcklwd   m4, m4                  ; duplicate words 0..3 into dwords
    ; Each pshufd broadcasts one duplicated dword, i.e. one word, across a row.
671cabdff1aSopenharmony_ci    pshufd      m0, m3, 0xff
672cabdff1aSopenharmony_ci    pshufd      m1, m3, 0xaa
673cabdff1aSopenharmony_ci    pshufd      m2, m3, 0x55
674cabdff1aSopenharmony_ci    pshufd      m3, m3, 0x00
675cabdff1aSopenharmony_ci    mova [r0+r3*0], m0
676cabdff1aSopenharmony_ci    mova [r0+r3*1], m1
677cabdff1aSopenharmony_ci    mova [r0+r3*2], m2
678cabdff1aSopenharmony_ci    mova [r0+r1*1], m3
679cabdff1aSopenharmony_ci    pshufd      m0, m4, 0xff
680cabdff1aSopenharmony_ci    pshufd      m1, m4, 0xaa
681cabdff1aSopenharmony_ci    pshufd      m2, m4, 0x55
682cabdff1aSopenharmony_ci    pshufd      m3, m4, 0x00
683cabdff1aSopenharmony_ci    mova [r2+r3*0], m0
684cabdff1aSopenharmony_ci    mova [r2+r3*1], m1
685cabdff1aSopenharmony_ci    mova [r2+r3*2], m2
686cabdff1aSopenharmony_ci    mova [r2+r1*1], m3
687cabdff1aSopenharmony_ci    RET
688cabdff1aSopenharmony_ci%endmacro
689cabdff1aSopenharmony_ci
690cabdff1aSopenharmony_ciINIT_XMM sse2
691cabdff1aSopenharmony_ciPRED8x8L_HORIZONTAL
692cabdff1aSopenharmony_ciINIT_XMM ssse3
693cabdff1aSopenharmony_ciPRED8x8L_HORIZONTAL
694cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL
695cabdff1aSopenharmony_ciINIT_XMM avx
696cabdff1aSopenharmony_ciPRED8x8L_HORIZONTAL
697cabdff1aSopenharmony_ci%endif
698cabdff1aSopenharmony_ci
699cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
700cabdff1aSopenharmony_ci; void ff_pred8x8l_down_left_10(pixel *src, int has_topleft, int has_topright,
701cabdff1aSopenharmony_ci;                               ptrdiff_t stride)
; r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride (byte offset).
; Uses the 8 words above the block and, when has_topright >> 13 is non-zero,
; the 8 words that follow them; otherwise .fix_tr substitutes a broadcast of
; the last word of the row above.
; PALIGNR and PRED4x4_LOWPASS are macros defined outside this chunk.
702cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
703cabdff1aSopenharmony_ci%macro PRED8x8L_DOWN_LEFT 0
704cabdff1aSopenharmony_cicglobal pred8x8l_down_left_10, 4, 4, 7
705cabdff1aSopenharmony_ci    sub         r0, r3                  ; r0 = src - stride (row above the block)
706cabdff1aSopenharmony_ci    mova        m3, [r0]                ; m3 = the 8 words of that row
707cabdff1aSopenharmony_ci    shr        r1d, 14
708cabdff1aSopenharmony_ci    neg         r1                      ; r1 = -(has_topleft >> 14)
709cabdff1aSopenharmony_ci    shr        r2d, 13                  ; r2 = has_topright >> 13; ZF is consumed by the jz below
710cabdff1aSopenharmony_ci    pslldq      m1, m3, 2               ; m1 word i = m3 word i-1
711cabdff1aSopenharmony_ci    psrldq      m2, m3, 2               ; m2 word i = m3 word i+1
712cabdff1aSopenharmony_ci    pinsrw      m1, [r0+r1], 0          ; vacated word 0 <- [r0+r1]
713cabdff1aSopenharmony_ci    pinsrw      m2, [r0+r2+14], 7       ; vacated word 7 <- [r0+r2+14]
714cabdff1aSopenharmony_ci    PRED4x4_LOWPASS m6, m2, m1, m3      ; m6 = smoothed row above
715cabdff1aSopenharmony_ci    jz .fix_tr ; flags from shr r2d: taken when has_topright >> 13 == 0
716cabdff1aSopenharmony_ci    mova        m1, [r0+16]             ; the next 8 words of the row above
717cabdff1aSopenharmony_ci    psrldq      m5, m1, 2
718cabdff1aSopenharmony_ci    PALIGNR     m2, m1, m3, 14, m3
719cabdff1aSopenharmony_ci    pshufhw     m5, m5, 10100100b       ; copy word 6 into the vacated word 7
720cabdff1aSopenharmony_ci    PRED4x4_LOWPASS m1, m2, m5, m1      ; m1 = smoothed top-right words
721cabdff1aSopenharmony_ci.do_topright:
722cabdff1aSopenharmony_ci    lea         r1, [r3*3]              ; r1 reused: 3*stride
723cabdff1aSopenharmony_ci    psrldq      m5, m1, 14              ; m5 word 0 = m1 word 7
724cabdff1aSopenharmony_ci    lea         r2, [r0+r3*4]           ; r2 reused: r0 + 4*stride
725cabdff1aSopenharmony_ci    PALIGNR     m2, m1, m6,  2, m0
726cabdff1aSopenharmony_ci    PALIGNR     m3, m1, m6, 14, m0
727cabdff1aSopenharmony_ci    PALIGNR     m5, m1,  2, m0
728cabdff1aSopenharmony_ci    pslldq      m4, m6, 2
729cabdff1aSopenharmony_ci    PRED4x4_LOWPASS m6, m4, m2, m6
730cabdff1aSopenharmony_ci    PRED4x4_LOWPASS m1, m3, m5, m1
    ; Rows are written from r0 + 8*stride up to r0 + 1*stride. Between stores,
    ; PALIGNR presumably moves the top word of m6 into the bottom of m1 and
    ; pslldq advances m6 by one word - confirm against the PALIGNR definition.
731cabdff1aSopenharmony_ci    mova [r2+r3*4], m1
732cabdff1aSopenharmony_ci    PALIGNR     m1, m6, 14, m2
733cabdff1aSopenharmony_ci    pslldq      m6, 2
734cabdff1aSopenharmony_ci    mova [r2+r1*1], m1
735cabdff1aSopenharmony_ci    PALIGNR     m1, m6, 14, m2
736cabdff1aSopenharmony_ci    pslldq      m6, 2
737cabdff1aSopenharmony_ci    mova [r2+r3*2], m1
738cabdff1aSopenharmony_ci    PALIGNR     m1, m6, 14, m2
739cabdff1aSopenharmony_ci    pslldq      m6, 2
740cabdff1aSopenharmony_ci    mova [r2+r3*1], m1
741cabdff1aSopenharmony_ci    PALIGNR     m1, m6, 14, m2
742cabdff1aSopenharmony_ci    pslldq      m6, 2
743cabdff1aSopenharmony_ci    mova [r0+r3*4], m1
744cabdff1aSopenharmony_ci    PALIGNR     m1, m6, 14, m2
745cabdff1aSopenharmony_ci    pslldq      m6, 2
746cabdff1aSopenharmony_ci    mova [r0+r1*1], m1
747cabdff1aSopenharmony_ci    PALIGNR     m1, m6, 14, m2
748cabdff1aSopenharmony_ci    pslldq      m6, 2
749cabdff1aSopenharmony_ci    mova [r0+r3*2], m1
750cabdff1aSopenharmony_ci    PALIGNR     m1, m6, 14, m6
751cabdff1aSopenharmony_ci    mova [r0+r3*1], m1
752cabdff1aSopenharmony_ci    RET
; No top-right words: broadcast word 7 of the unsmoothed row above (m3) into m1.
753cabdff1aSopenharmony_ci.fix_tr:
754cabdff1aSopenharmony_ci    punpckhwd   m3, m3
755cabdff1aSopenharmony_ci    pshufd      m1, m3, 0xFF
756cabdff1aSopenharmony_ci    jmp .do_topright
757cabdff1aSopenharmony_ci%endmacro
758cabdff1aSopenharmony_ci
759cabdff1aSopenharmony_ciINIT_XMM sse2
760cabdff1aSopenharmony_ciPRED8x8L_DOWN_LEFT
761cabdff1aSopenharmony_ciINIT_XMM ssse3
762cabdff1aSopenharmony_ciPRED8x8L_DOWN_LEFT
763cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL
764cabdff1aSopenharmony_ciINIT_XMM avx
765cabdff1aSopenharmony_ciPRED8x8L_DOWN_LEFT
766cabdff1aSopenharmony_ci%endif
767cabdff1aSopenharmony_ci
768cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
769cabdff1aSopenharmony_ci; void ff_pred8x8l_down_right_10(pixel *src, int has_topleft,
770cabdff1aSopenharmony_ci;                                int has_topright, ptrdiff_t stride)
; r0 = src, r2 = has_topright, r3 = stride (byte offset); r1 (has_topleft) is
; overwritten without being read, see the note inside the macro.
; PALIGNR and PRED4x4_LOWPASS are macros defined outside this chunk.
771cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
772cabdff1aSopenharmony_ci%macro PRED8x8L_DOWN_RIGHT 0
773cabdff1aSopenharmony_ci; standard forbids this when has_topleft is false
774cabdff1aSopenharmony_ci; no need to check
775cabdff1aSopenharmony_cicglobal pred8x8l_down_right_10, 4, 5, 8
776cabdff1aSopenharmony_ci    sub         r0, r3                  ; r0 = src - stride (row above the block)
777cabdff1aSopenharmony_ci    lea         r4, [r0+r3*4]           ; r4 = r0 + 4*stride
778cabdff1aSopenharmony_ci    lea         r1, [r3*3]              ; r1 = 3*stride
    ; Gather the word at [r0+k*stride-2] for k = 0..7 into m3: word 7 = k 0
    ; (left of the row above) ... word 0 = k 7.
779cabdff1aSopenharmony_ci    mova        m0, [r0+r3*1-16]
780cabdff1aSopenharmony_ci    punpckhwd   m0, [r0+r3*0-16]
781cabdff1aSopenharmony_ci    mova        m1, [r0+r1*1-16]
782cabdff1aSopenharmony_ci    punpckhwd   m1, [r0+r3*2-16]
783cabdff1aSopenharmony_ci    punpckhdq   m1, m0
784cabdff1aSopenharmony_ci    mova        m2, [r4+r3*1-16]
785cabdff1aSopenharmony_ci    punpckhwd   m2, [r4+r3*0-16]
786cabdff1aSopenharmony_ci    mova        m3, [r4+r1*1-16]
787cabdff1aSopenharmony_ci    punpckhwd   m3, [r4+r3*2-16]
788cabdff1aSopenharmony_ci    punpckhdq   m3, m2
789cabdff1aSopenharmony_ci    punpckhqdq  m3, m1
790cabdff1aSopenharmony_ci    mova        m0, [r4+r3*4-16]        ; 16 bytes ending left of the last block row (k = 8)
791cabdff1aSopenharmony_ci    mova        m1, [r0]                ; the 8 words of the row above
792cabdff1aSopenharmony_ci    PALIGNR     m4, m3, m0, 14, m0
793cabdff1aSopenharmony_ci    PALIGNR     m1, m3,  2, m2
794cabdff1aSopenharmony_ci    pslldq      m0, m4, 2
795cabdff1aSopenharmony_ci    pshuflw     m0, m0, 11100101b       ; copy word 1 into the vacated word 0
796cabdff1aSopenharmony_ci    PRED4x4_LOWPASS m6, m1, m4, m3
797cabdff1aSopenharmony_ci    PRED4x4_LOWPASS m4, m3, m0, m4
798cabdff1aSopenharmony_ci    mova        m3, [r0]                ; reload the row above
799cabdff1aSopenharmony_ci    shr        r2d, 13                  ; r2 = has_topright >> 13
800cabdff1aSopenharmony_ci    pslldq      m1, m3, 2               ; m1 word i = m3 word i-1
801cabdff1aSopenharmony_ci    psrldq      m2, m3, 2               ; m2 word i = m3 word i+1
802cabdff1aSopenharmony_ci    pinsrw      m1, [r0-2], 0           ; vacated word 0 <- word left of the row above (read unconditionally)
803cabdff1aSopenharmony_ci    pinsrw      m2, [r0+r2+14], 7       ; vacated word 7 <- [r0+r2+14]
804cabdff1aSopenharmony_ci    PRED4x4_LOWPASS m3, m2, m1, m3
805cabdff1aSopenharmony_ci    PALIGNR     m2, m3, m6,  2, m0
806cabdff1aSopenharmony_ci    PALIGNR     m5, m3, m6, 14, m0
807cabdff1aSopenharmony_ci    psrldq      m7, m3, 2
808cabdff1aSopenharmony_ci    PRED4x4_LOWPASS m6, m4, m2, m6
809cabdff1aSopenharmony_ci    PRED4x4_LOWPASS m3, m5, m7, m3
    ; m6 is stored as the row at r0 + 8*stride; the rows at r0 + 1*stride ..
    ; r0 + 7*stride follow in that order. Between stores, PALIGNR presumably
    ; moves the top word of m6 into the bottom of m3 while pslldq advances m6
    ; by one word - confirm against the PALIGNR definition.
810cabdff1aSopenharmony_ci    mova [r4+r3*4], m6
811cabdff1aSopenharmony_ci    PALIGNR     m3, m6, 14, m2
812cabdff1aSopenharmony_ci    pslldq      m6, 2
813cabdff1aSopenharmony_ci    mova [r0+r3*1], m3
814cabdff1aSopenharmony_ci    PALIGNR     m3, m6, 14, m2
815cabdff1aSopenharmony_ci    pslldq      m6, 2
816cabdff1aSopenharmony_ci    mova [r0+r3*2], m3
817cabdff1aSopenharmony_ci    PALIGNR     m3, m6, 14, m2
818cabdff1aSopenharmony_ci    pslldq      m6, 2
819cabdff1aSopenharmony_ci    mova [r0+r1*1], m3
820cabdff1aSopenharmony_ci    PALIGNR     m3, m6, 14, m2
821cabdff1aSopenharmony_ci    pslldq      m6, 2
822cabdff1aSopenharmony_ci    mova [r0+r3*4], m3
823cabdff1aSopenharmony_ci    PALIGNR     m3, m6, 14, m2
824cabdff1aSopenharmony_ci    pslldq      m6, 2
825cabdff1aSopenharmony_ci    mova [r4+r3*1], m3
826cabdff1aSopenharmony_ci    PALIGNR     m3, m6, 14, m2
827cabdff1aSopenharmony_ci    pslldq      m6, 2
828cabdff1aSopenharmony_ci    mova [r4+r3*2], m3
829cabdff1aSopenharmony_ci    PALIGNR     m3, m6, 14, m6
830cabdff1aSopenharmony_ci    mova [r4+r1*1], m3
831cabdff1aSopenharmony_ci    RET
832cabdff1aSopenharmony_ci%endmacro
833cabdff1aSopenharmony_ci
834cabdff1aSopenharmony_ciINIT_XMM sse2
835cabdff1aSopenharmony_ciPRED8x8L_DOWN_RIGHT
836cabdff1aSopenharmony_ciINIT_XMM ssse3
837cabdff1aSopenharmony_ciPRED8x8L_DOWN_RIGHT
838cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL
839cabdff1aSopenharmony_ciINIT_XMM avx
840cabdff1aSopenharmony_ciPRED8x8L_DOWN_RIGHT
841cabdff1aSopenharmony_ci%endif
842cabdff1aSopenharmony_ci
843cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
844cabdff1aSopenharmony_ci; void ff_pred8x8l_vertical_right_10(pixel *src, int has_topleft,
845cabdff1aSopenharmony_ci;                                    int has_topright, ptrdiff_t stride)
; r0 = src, r2 = has_topright, r3 = stride (byte offset); r1 (has_topleft) is
; overwritten without being read.
; PALIGNR and PRED4x4_LOWPASS are macros defined outside this chunk.
846cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
847cabdff1aSopenharmony_ci%macro PRED8x8L_VERTICAL_RIGHT 0
848cabdff1aSopenharmony_ci; likewise with 8x8l_down_right
849cabdff1aSopenharmony_cicglobal pred8x8l_vertical_right_10, 4, 5, 7
850cabdff1aSopenharmony_ci    sub         r0, r3                  ; r0 = src - stride (row above the block)
851cabdff1aSopenharmony_ci    lea         r4, [r0+r3*4]           ; r4 = r0 + 4*stride
852cabdff1aSopenharmony_ci    lea         r1, [r3*3]              ; r1 = 3*stride
    ; Gather the word at [r0+k*stride-2] for k = 0..7 into m3: word 7 = k 0
    ; (left of the row above) ... word 0 = k 7.
853cabdff1aSopenharmony_ci    mova        m0, [r0+r3*1-16]
854cabdff1aSopenharmony_ci    punpckhwd   m0, [r0+r3*0-16]
855cabdff1aSopenharmony_ci    mova        m1, [r0+r1*1-16]
856cabdff1aSopenharmony_ci    punpckhwd   m1, [r0+r3*2-16]
857cabdff1aSopenharmony_ci    punpckhdq   m1, m0
858cabdff1aSopenharmony_ci    mova        m2, [r4+r3*1-16]
859cabdff1aSopenharmony_ci    punpckhwd   m2, [r4+r3*0-16]
860cabdff1aSopenharmony_ci    mova        m3, [r4+r1*1-16]
861cabdff1aSopenharmony_ci    punpckhwd   m3, [r4+r3*2-16]
862cabdff1aSopenharmony_ci    punpckhdq   m3, m2
863cabdff1aSopenharmony_ci    punpckhqdq  m3, m1
864cabdff1aSopenharmony_ci    mova        m0, [r4+r3*4-16]        ; 16 bytes ending left of the last block row (k = 8)
865cabdff1aSopenharmony_ci    mova        m1, [r0]                ; the 8 words of the row above
866cabdff1aSopenharmony_ci    PALIGNR     m4, m3, m0, 14, m0
867cabdff1aSopenharmony_ci    PALIGNR     m1, m3,  2, m2
868cabdff1aSopenharmony_ci    PRED4x4_LOWPASS m3, m1, m4, m3      ; m3 = smoothed left column (incl. corner word)
869cabdff1aSopenharmony_ci    mova        m2, [r0]                ; reload the row above
870cabdff1aSopenharmony_ci    shr        r2d, 13                  ; r2 = has_topright >> 13
871cabdff1aSopenharmony_ci    pslldq      m1, m2, 2               ; m1 word i = m2 word i-1
872cabdff1aSopenharmony_ci    psrldq      m5, m2, 2               ; m5 word i = m2 word i+1
873cabdff1aSopenharmony_ci    pinsrw      m1, [r0-2], 0           ; vacated word 0 <- word left of the row above (read unconditionally)
874cabdff1aSopenharmony_ci    pinsrw      m5, [r0+r2+14], 7       ; vacated word 7 <- [r0+r2+14]
875cabdff1aSopenharmony_ci    PRED4x4_LOWPASS m2, m5, m1, m2      ; m2 = smoothed row above
876cabdff1aSopenharmony_ci    PALIGNR     m6, m2, m3, 12, m1
877cabdff1aSopenharmony_ci    PALIGNR     m5, m2, m3, 14, m0
878cabdff1aSopenharmony_ci    PRED4x4_LOWPASS m0, m6, m2, m5
879cabdff1aSopenharmony_ci    pavgw       m2, m5                  ; rounded average of adjacent smoothed words
880cabdff1aSopenharmony_ci    mova [r0+r3*2], m0
881cabdff1aSopenharmony_ci    mova [r0+r3*1], m2
882cabdff1aSopenharmony_ci    pslldq      m6, m3, 4
883cabdff1aSopenharmony_ci    pslldq      m1, m3, 2
884cabdff1aSopenharmony_ci    PRED4x4_LOWPASS m1, m3, m6, m1
    ; Remaining rows (r0 + 3*stride .. r0 + 8*stride) alternate between m2 and
    ; m0; before each store PALIGNR presumably shifts in the current top word
    ; of m1, and pslldq then advances m1 by one word - confirm.
885cabdff1aSopenharmony_ci    PALIGNR     m2, m1, 14, m4
886cabdff1aSopenharmony_ci    mova [r0+r1*1], m2
887cabdff1aSopenharmony_ci    pslldq      m1, 2
888cabdff1aSopenharmony_ci    PALIGNR     m0, m1, 14, m3
889cabdff1aSopenharmony_ci    mova [r0+r3*4], m0
890cabdff1aSopenharmony_ci    pslldq      m1, 2
891cabdff1aSopenharmony_ci    PALIGNR     m2, m1, 14, m4
892cabdff1aSopenharmony_ci    mova [r4+r3*1], m2
893cabdff1aSopenharmony_ci    pslldq      m1, 2
894cabdff1aSopenharmony_ci    PALIGNR     m0, m1, 14, m3
895cabdff1aSopenharmony_ci    mova [r4+r3*2], m0
896cabdff1aSopenharmony_ci    pslldq      m1, 2
897cabdff1aSopenharmony_ci    PALIGNR     m2, m1, 14, m4
898cabdff1aSopenharmony_ci    mova [r4+r1*1], m2
899cabdff1aSopenharmony_ci    pslldq      m1, 2
900cabdff1aSopenharmony_ci    PALIGNR     m0, m1, 14, m1
901cabdff1aSopenharmony_ci    mova [r4+r3*4], m0
902cabdff1aSopenharmony_ci    RET
903cabdff1aSopenharmony_ci%endmacro
904cabdff1aSopenharmony_ci
905cabdff1aSopenharmony_ciINIT_XMM sse2
906cabdff1aSopenharmony_ciPRED8x8L_VERTICAL_RIGHT
907cabdff1aSopenharmony_ciINIT_XMM ssse3
908cabdff1aSopenharmony_ciPRED8x8L_VERTICAL_RIGHT
909cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL
910cabdff1aSopenharmony_ciINIT_XMM avx
911cabdff1aSopenharmony_ciPRED8x8L_VERTICAL_RIGHT
912cabdff1aSopenharmony_ci%endif
913cabdff1aSopenharmony_ci
914cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
915cabdff1aSopenharmony_ci; void ff_pred8x8l_horizontal_up_10(pixel *src, int has_topleft,
916cabdff1aSopenharmony_ci;                                   int has_topright, ptrdiff_t stride)
; r0 = src, r1 = has_topleft, r3 = stride (byte offset); r2 is only used as
; scratch here (the has_topright value is overwritten without being read).
; Only the words left of the 8 block rows (plus one word selected through
; has_topleft) are read.
; PALIGNR and PRED4x4_LOWPASS are macros defined outside this chunk.
917cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
918cabdff1aSopenharmony_ci%macro PRED8x8L_HORIZONTAL_UP 0
919cabdff1aSopenharmony_cicglobal pred8x8l_horizontal_up_10, 4, 4, 6
920cabdff1aSopenharmony_ci    mova        m0, [r0+r3*0-16]
921cabdff1aSopenharmony_ci    punpckhwd   m0, [r0+r3*1-16]
922cabdff1aSopenharmony_ci    shr        r1d, 14
923cabdff1aSopenharmony_ci    dec         r1
924cabdff1aSopenharmony_ci    and         r1, r3
925cabdff1aSopenharmony_ci    sub         r1, r3                  ; r1 = (((has_topleft >> 14) - 1) & stride) - stride
926cabdff1aSopenharmony_ci    mova        m4, [r0+r1*1-16]        ; word 7 = [r0+r1-2]
927cabdff1aSopenharmony_ci    lea         r1, [r3*3]              ; r1 reused: 3*stride
928cabdff1aSopenharmony_ci    lea         r2, [r0+r3*4]           ; r2 = r0 + 4*stride
929cabdff1aSopenharmony_ci    mova        m1, [r0+r3*2-16]
930cabdff1aSopenharmony_ci    punpckhwd   m1, [r0+r1*1-16]
931cabdff1aSopenharmony_ci    punpckhdq   m0, m1
932cabdff1aSopenharmony_ci    mova        m2, [r2+r3*0-16]
933cabdff1aSopenharmony_ci    punpckhwd   m2, [r2+r3*1-16]
934cabdff1aSopenharmony_ci    mova        m3, [r2+r3*2-16]
935cabdff1aSopenharmony_ci    punpckhwd   m3, [r2+r1*1-16]
936cabdff1aSopenharmony_ci    punpckhdq   m2, m3
937cabdff1aSopenharmony_ci    punpckhqdq  m0, m2                  ; m0 word k = word at [r0+k*stride-2], k = 0..7
938cabdff1aSopenharmony_ci    PALIGNR     m1, m0, m4, 14, m4
939cabdff1aSopenharmony_ci    psrldq      m2, m0, 2
940cabdff1aSopenharmony_ci    pshufhw     m2, m2, 10100100b       ; copy word 6 into the vacated word 7
941cabdff1aSopenharmony_ci    PRED4x4_LOWPASS m0, m1, m2, m0      ; m0 = smoothed left column
942cabdff1aSopenharmony_ci    psrldq      m1, m0, 2
943cabdff1aSopenharmony_ci    psrldq      m2, m0, 4
944cabdff1aSopenharmony_ci    pshufhw     m1, m1, 10100100b       ; copy word 6 into the vacated word 7
945cabdff1aSopenharmony_ci    pshufhw     m2, m2, 01010100b       ; copy word 5 into the vacated words 6 and 7
946cabdff1aSopenharmony_ci    pavgw       m4, m0, m1              ; rounded average of vertically adjacent words
947cabdff1aSopenharmony_ci    PRED4x4_LOWPASS m1, m2, m0, m1
948cabdff1aSopenharmony_ci    punpckhwd   m5, m4, m1              ; interleave m4/m1 words 4..7
949cabdff1aSopenharmony_ci    punpcklwd   m4, m1                  ; interleave m4/m1 words 0..3
950cabdff1aSopenharmony_ci    mova [r2+r3*0], m5
951cabdff1aSopenharmony_ci    mova [r0+r3*0], m4
    ; Rows at r2 + 1..3*stride: m5 shifted down by 1, 2, 3 dwords with its top
    ; dword replicated into the vacated positions.
952cabdff1aSopenharmony_ci    pshufd      m0, m5, 11111001b
953cabdff1aSopenharmony_ci    pshufd      m1, m5, 11111110b
954cabdff1aSopenharmony_ci    pshufd      m2, m5, 11111111b
955cabdff1aSopenharmony_ci    mova [r2+r3*1], m0
956cabdff1aSopenharmony_ci    mova [r2+r3*2], m1
957cabdff1aSopenharmony_ci    mova [r2+r1*1], m2
    ; Rows at r0 + 1..3*stride: presumably the m5:m4 pair shifted down by 4, 8
    ; and 12 bytes - confirm against the PALIGNR definition.
958cabdff1aSopenharmony_ci    PALIGNR     m2, m5, m4, 4, m0
959cabdff1aSopenharmony_ci    PALIGNR     m3, m5, m4, 8, m1
960cabdff1aSopenharmony_ci    PALIGNR     m5, m5, m4, 12, m4
961cabdff1aSopenharmony_ci    mova [r0+r3*1], m2
962cabdff1aSopenharmony_ci    mova [r0+r3*2], m3
963cabdff1aSopenharmony_ci    mova [r0+r1*1], m5
964cabdff1aSopenharmony_ci    RET
965cabdff1aSopenharmony_ci%endmacro
966cabdff1aSopenharmony_ci
967cabdff1aSopenharmony_ciINIT_XMM sse2
968cabdff1aSopenharmony_ciPRED8x8L_HORIZONTAL_UP
969cabdff1aSopenharmony_ciINIT_XMM ssse3
970cabdff1aSopenharmony_ciPRED8x8L_HORIZONTAL_UP
971cabdff1aSopenharmony_ci%if HAVE_AVX_EXTERNAL
972cabdff1aSopenharmony_ciINIT_XMM avx
973cabdff1aSopenharmony_ciPRED8x8L_HORIZONTAL_UP
974cabdff1aSopenharmony_ci%endif
975cabdff1aSopenharmony_ci
976cabdff1aSopenharmony_ci
977cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
978cabdff1aSopenharmony_ci; void ff_pred16x16_vertical_10(pixel *src, ptrdiff_t stride)
979cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
; MOV16 dst, lo, hi [, extra, extra]
; Stores register %2 at [dst] and register %3 at [dst+mmsize], i.e. two
; consecutive mmsize-wide chunks of one row. Up to two further arguments are
; accepted and ignored.
%macro MOV16 3-5
    mova [%1], %2
    mova [%1+mmsize], %3
%endmacro
984cabdff1aSopenharmony_ci
; r0 = src, r1 = stride (byte offset between rows), r2d = loop counter.
; The two registers loaded from the row above src are copied into every row
; of the block: 8 iterations, two rows per iteration.
INIT_XMM sse2
cglobal pred16x16_vertical_10, 2, 3
    sub         r0, r1                  ; r0 = row above the block
    mova        m0, [r0]                ; first mmsize bytes of that row
    mova        m1, [r0+mmsize]         ; second mmsize bytes
    mov        r2d, 8
.next_rows:
    MOV16       r0+r1*1, m0, m1
    MOV16       r0+r1*2, m0, m1
    lea         r0, [r0+r1*2]           ; advance two rows
    dec        r2d
    jg .next_rows
    REP_RET
998cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; void ff_pred16x16_horizontal_10(pixel *src, ptrdiff_t stride)
;
; r0 = src, r1 = stride (byte offset between rows), r2d = loop counter.
; Each row is filled from the dword ending just left of it; 8 iterations,
; two rows per iteration.
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal pred16x16_horizontal_10, 2, 3
    mov        r2d, 8
.row_pair:
    movd        m0, [r0-4]              ; upper word of this dword is [r0-2]
    movd        m1, [r0+r1-4]           ; same for the following row
    SPLATW      m0, m0, 1               ; x86util macro; presumably broadcasts word 1
    MOV16       r0, m0, m0
    SPLATW      m1, m1, 1
    MOV16       r0+r1, m1, m1
    lea         r0, [r0+r1*2]           ; advance two rows
    dec        r2d
    jg .row_pair
    REP_RET
1016cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; void ff_pred16x16_dc_10(pixel *src, ptrdiff_t stride)
;
; r0 = src (later the left-column cursor), r1 = stride (byte offset),
; r5 = saved src for the store loop, r2d-r4d = scalar scratch/accumulators.
; Adds the 16 words above the block to the 16 words left of it, computes
; (sum + 16) >> 5 and fills all 16 rows with that word.
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal pred16x16_dc_10, 2, 6
    mov         r5, r0                  ; keep src for the store loop
    sub         r0, r1                  ; r0 = row above the block
    mova        m0, [r0]
    paddw       m0, [r0+mmsize]
    HADDW       m0, m2                  ; x86util macro; presumably the horizontal word sum

    ; Left column: even rows accumulate in r3d, odd rows in r4d.
    lea         r0, [r5-2]              ; word left of row 0
    movzx      r3d, word [r0]
    movzx      r4d, word [r0+r1]
%rep 7
    movzx      r2d, word [r0+r1*2]
    add        r3d, r2d
    lea         r0, [r0+r1*2]
    movzx      r2d, word [r0+r1]
    add        r4d, r2d
%endrep
    lea        r3d, [r3+r4+16]          ; left sum plus rounding term

    movd        m1, r3d
    paddw       m0, m1
    psrlw       m0, 5
    SPLATW      m0, m0
    mov        r3d, 8
.fill:
    MOV16       r5, m0, m0
    MOV16       r5+r1, m0, m0
    lea         r5, [r5+r1*2]           ; advance two rows
    dec        r3d
    jg .fill
    REP_RET
1052cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; void ff_pred16x16_top_dc_10(pixel *src, ptrdiff_t stride)
;
; r0 = src, r1 = stride (byte offset between rows), r2d = loop counter.
; Sums the 16 words of the row above the block, computes (sum + 8) >> 4 and
; fills all 16 rows with that word.
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal pred16x16_top_dc_10, 2, 3
    sub         r0, r1                  ; r0 = row above the block
    mov        r2d, 8
    mova        m0, [r0]
    paddw       m0, [r0+mmsize]
    HADDW       m0, m2                  ; x86util macro; presumably the horizontal word sum

    SPLATW      m0, m0
    paddw       m0, [pw_8]
    psrlw       m0, 4
.fill:
    MOV16       r0+r1*1, m0, m0
    MOV16       r0+r1*2, m0, m0
    lea         r0, [r0+r1*2]           ; advance two rows
    dec        r2d
    jg .fill
    REP_RET
1074cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; void ff_pred16x16_left_dc_10(pixel *src, ptrdiff_t stride)
;
; r0 = src (then the left-column cursor), r1 = stride (byte offset),
; r5 = saved src for the store loop, r2d-r4d = scalar scratch/accumulators.
; Sums the 16 words left of the block, computes (sum + 8) >> 4 and fills all
; 16 rows with that word.
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal pred16x16_left_dc_10, 2, 6
    mov         r5, r0                  ; keep src for the store loop

    ; Even rows accumulate in r3d, odd rows in r4d.
    sub         r0, 2                   ; word left of row 0
    movzx      r3d, word [r0]
    movzx      r4d, word [r0+r1]
%rep 7
    movzx      r2d, word [r0+r1*2]
    add        r3d, r2d
    lea         r0, [r0+r1*2]
    movzx      r2d, word [r0+r1]
    add        r4d, r2d
%endrep
    lea        r3d, [r3+r4+8]           ; total plus rounding term
    shr        r3d, 4

    movd        m0, r3d
    SPLATW      m0, m0
    mov        r3d, 8
.fill:
    MOV16       r5, m0, m0
    MOV16       r5+r1, m0, m0
    lea         r5, [r5+r1*2]           ; advance two rows
    dec        r3d
    jg .fill
    REP_RET
1105cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; void ff_pred16x16_128_dc_10(pixel *src, ptrdiff_t stride)
;
; r0 = src, r1 = stride (byte offset between rows), r2d = loop counter.
; Fills all 16 rows with the pw_512 constant; no neighbouring pixels are read.
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal pred16x16_128_dc_10, 2, 3
    mov        r2d, 8
    mova        m0, [pw_512]
.fill:
    MOV16       r0, m0, m0
    MOV16       r0+r1, m0, m0
    lea         r0, [r0+r1*2]           ; advance two rows
    dec        r2d
    jg .fill
    REP_RET
1120