1cabdff1aSopenharmony_ci;******************************************************************************
2cabdff1aSopenharmony_ci;* H.264 intra prediction asm optimizations
3cabdff1aSopenharmony_ci;* Copyright (c) 2010 Fiona Glaser
4cabdff1aSopenharmony_ci;* Copyright (c) 2010 Holger Lubitz
5cabdff1aSopenharmony_ci;* Copyright (c) 2010 Loren Merritt
6cabdff1aSopenharmony_ci;* Copyright (c) 2010 Ronald S. Bultje
7cabdff1aSopenharmony_ci;*
8cabdff1aSopenharmony_ci;* This file is part of FFmpeg.
9cabdff1aSopenharmony_ci;*
10cabdff1aSopenharmony_ci;* FFmpeg is free software; you can redistribute it and/or
11cabdff1aSopenharmony_ci;* modify it under the terms of the GNU Lesser General Public
12cabdff1aSopenharmony_ci;* License as published by the Free Software Foundation; either
13cabdff1aSopenharmony_ci;* version 2.1 of the License, or (at your option) any later version.
14cabdff1aSopenharmony_ci;*
15cabdff1aSopenharmony_ci;* FFmpeg is distributed in the hope that it will be useful,
16cabdff1aSopenharmony_ci;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17cabdff1aSopenharmony_ci;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18cabdff1aSopenharmony_ci;* Lesser General Public License for more details.
19cabdff1aSopenharmony_ci;*
20cabdff1aSopenharmony_ci;* You should have received a copy of the GNU Lesser General Public
21cabdff1aSopenharmony_ci;* License along with FFmpeg; if not, write to the Free Software
22cabdff1aSopenharmony_ci;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23cabdff1aSopenharmony_ci;******************************************************************************
24cabdff1aSopenharmony_ci
25cabdff1aSopenharmony_ci%include "libavutil/x86/x86util.asm"
26cabdff1aSopenharmony_ci
SECTION_RODATA

; Byte pairs (0x03, 0x80) repeated 8 times. As a pshufb control this selects
; source byte 3 for every even byte and zero (bit 7 set) for every odd byte.
tm_shuf:     times 8 db 0x03, 0x80

; Eight words, each 0xff00.
pw_ff00:     times 8 dw 0xff00

; Signed byte weights for pmaddubsw in the SSSE3 16x16 plane predictor:
; low 8 bytes weight top[-1..6], high 8 bytes weight top[8..15].
plane_shuf:  db -8, -7, -6, -5, -4, -3, -2, -1
             db  1,  2,  3,  4,  5,  6,  7,  8

; Same idea for the SSSE3 8x8 plane predictor. The zero weights cancel the
; lanes that hold no useful pixel (zeroed by movd / over-read by movhps).
plane8_shuf: db -4, -3, -2, -1,  0,  0,  0,  0
             db  1,  2,  3,  4,  0,  0,  0,  0

; Word ramps. pw_0to7 scales the per-column gradient in both plane
; predictors; pw_1to8 / pw_m8tom1 are the SSE2 16x16 top-row weights and
; pw_m4to4 the SSE2 8x8 top-row weights.
pw_0to7:     dw  0,  1,  2,  3,  4,  5,  6,  7
pw_1to8:     dw  1,  2,  3,  4,  5,  6,  7,  8
pw_m8tom1:   dw -8, -7, -6, -5, -4, -3, -2, -1
pw_m4to4:    dw -4, -3, -2, -1,  1,  2,  3,  4
39cabdff1aSopenharmony_ci
40cabdff1aSopenharmony_ciSECTION .text
41cabdff1aSopenharmony_ci
42cabdff1aSopenharmony_cicextern pb_1
43cabdff1aSopenharmony_cicextern pb_3
44cabdff1aSopenharmony_cicextern pw_4
45cabdff1aSopenharmony_cicextern pw_8
46cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; void ff_pred16x16_vertical_8(uint8_t *src, ptrdiff_t stride)
;
; Replicates the 16-byte row directly above src into all 16 rows of the
; block. All accesses are movaps, so src - stride and every output row must
; be 16-byte aligned.
;   r0 = src (walked downwards), r1 = stride, r2 = loop counter
;-----------------------------------------------------------------------------

INIT_XMM sse
cglobal pred16x16_vertical_8, 2,3
    sub     r0, r1                      ; r0 -> row above the block
    movaps  xmm0, [r0]                  ; the row to replicate
    mov     r2, 4                       ; 4 iterations x 4 rows = 16 rows
.four_rows:
%rep 2
    movaps  [r0+r1*1], xmm0
    movaps  [r0+r1*2], xmm0
    lea     r0, [r0+r1*2]               ; advance two rows
%endrep
    dec     r2
    jg      .four_rows
    REP_RET
66cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; void ff_pred16x16_horizontal_8(uint8_t *src, ptrdiff_t stride)
;
; Fills each of the 16 rows with the pixel immediately to its left
; (src[y*stride - 1]). Two rows are produced per iteration.
;   r0 = src (walked downwards), r1 = stride, r2 = loop counter
;-----------------------------------------------------------------------------

%macro PRED16x16_H 0
cglobal pred16x16_horizontal_8, 2,3
%if cpuflag(ssse3)
    mova      m2, [pb_3]            ; pshufb control (external constant;
                                    ; presumably every byte == 3)
%endif
    mov       r2, 8                 ; 8 iterations x 2 rows
.row_pair:
    ; Load the 4 bytes ending at the left neighbour, so that the neighbour
    ; ends up in byte 3 of the register.
    movd      m0, [r0+r1*0-4]
    movd      m1, [r0+r1*1-4]

%if cpuflag(ssse3)
    pshufb    m0, m2
    pshufb    m1, m2
%else
    ; No pshufb: double byte 3 into word 3, then replicate that word.
    punpcklbw m0, m0
    punpcklbw m1, m1
    SPLATW    m0, m0, 3
    SPLATW    m1, m1, 3
    ; This branch is built with 8-byte MMX registers (INIT_MMX below), so the
    ; right half of each row needs its own store.
    mova [r0+r1*0+8], m0
    mova [r0+r1*1+8], m1
%endif

    mova [r0+r1*0], m0
    mova [r0+r1*1], m1
    lea       r0, [r0+r1*2]
    dec       r2
    jg .row_pair
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_H
INIT_XMM ssse3
PRED16x16_H
105cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; void ff_pred16x16_dc_8(uint8_t *src, ptrdiff_t stride)
;
; Fills the 16x16 block with
;     (sum of the 16 pixels above + sum of the 16 pixels to the left + 16) >> 5
;   r0 = walking pointer over the neighbours, r1 = stride,
;   r4 = saved src (store pointer), r5d/r6d = partial sums, r2d = DC value
;
; NOTE(review): the top-row sum uses MMX registers mm0/mm1 although the
; function is built with INIT_XMM, and no emms is issued in this block --
; presumably the MMX/x87 state is handled by the callers; confirm.
;-----------------------------------------------------------------------------

%macro PRED16x16_DC 0
cglobal pred16x16_dc_8, 2,7
    mov       r4, r0                ; keep src for the fill loop
    sub       r0, r1                ; r0 -> row above the block

    ; psadbw against zero sums 8 bytes each: top[0..7] and top[8..15].
    pxor      mm0, mm0
    pxor      mm1, mm1
    psadbw    mm0, [r0+0]
    psadbw    mm1, [r0+8]

    dec        r0                   ; r0 -> top-left neighbour
    movzx     r5d, byte [r0+r1*1]   ; left[0]
    paddw     mm0, mm1
    movd      r6d, mm0              ; r6d = sum of the 16 top pixels
    lea        r0, [r0+r1*2]        ; r0 -> left[1]

    ; left[1..14], two per step, accumulated alternately into r5d / r6d
%rep 7
    movzx     r2d, byte [r0+r1*0]
    movzx     r3d, byte [r0+r1*1]
    add       r5d, r2d
    add       r6d, r3d
    lea        r0, [r0+r1*2]
%endrep
    movzx     r2d, byte [r0+r1*0]   ; left[15]
    add       r5d, r6d
    lea       r2d, [r2+r5+16]       ; total + rounding term
    shr       r2d, 5                ; / 32

%if cpuflag(ssse3)
    pxor       m1, m1               ; scratch operand for SPLATB_REG
%endif
    SPLATB_REG m0, r2, m1           ; x86util macro; presumably broadcasts
                                    ; the low byte of r2 to every byte of m0

    mov       r3d, 4                ; 4 iterations x 4 rows
.fill:
%rep 2
    mova [r4+r1*0], m0
    mova [r4+r1*1], m0
    lea   r4, [r4+r1*2]
%endrep
    dec   r3d
    jg .fill
    REP_RET
%endmacro

INIT_XMM sse2
PRED16x16_DC
INIT_XMM ssse3
PRED16x16_DC
156cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; void ff_pred16x16_tm_vp8_8(uint8_t *src, ptrdiff_t stride)
;
; For every pixel: clip_uint8(top[x] + left[y] - topleft).
;   xmm0 / xmm1 = top[0..7] / top[8..15] widened to words (loop invariant)
;   r4d         = top-left pixel
;   r0          = walks down from the row above the block, r1 = stride
;   r5d         = loop counter (8 iterations x 2 rows)
;-----------------------------------------------------------------------------

INIT_XMM sse2
cglobal pred16x16_tm_vp8_8, 2,6,6
    sub          r0, r1                 ; r0 -> row above the block
    pxor       xmm2, xmm2
    movdqa     xmm0, [r0]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm2               ; top[0..7]  as words
    punpckhbw  xmm1, xmm2               ; top[8..15] as words
    movzx       r4d, byte [r0-1]        ; top-left
    mov         r5d, 8
.two_rows:
    ; per-row delta = left - topleft, for the next two rows
    movzx       r2d, byte [r0+r1*1-1]
    movzx       r3d, byte [r0+r1*2-1]
    sub         r2d, r4d
    sub         r3d, r4d

    ; broadcast each delta to all 8 words
    movd       xmm2, r2d
    movd       xmm4, r3d
    pshuflw    xmm2, xmm2, 0
    pshuflw    xmm4, xmm4, 0
    punpcklqdq xmm2, xmm2
    punpcklqdq xmm4, xmm4

    ; top + delta for both halves, then pack with unsigned saturation
    movdqa     xmm3, xmm2
    movdqa     xmm5, xmm4
    paddw      xmm2, xmm0
    paddw      xmm3, xmm1
    paddw      xmm4, xmm0
    paddw      xmm5, xmm1
    packuswb   xmm2, xmm3
    packuswb   xmm4, xmm5

    movdqa [r0+r1*1], xmm2
    movdqa [r0+r1*2], xmm4
    lea          r0, [r0+r1*2]
    dec         r5d
    jg .two_rows
    REP_RET
196cabdff1aSopenharmony_ci
%if HAVE_AVX2_EXTERNAL
; AVX2 version of pred16x16_tm_vp8_8: four rows per iteration.
;   m0 = (top[0..15] - topleft) as 16 words (loop invariant)
;   dstq walks down from the row above the block; stride3q = 3 * stride
INIT_YMM avx2
cglobal pred16x16_tm_vp8_8, 2, 4, 5, dst, stride, stride3, iteration
    sub                       dstq, strideq
    pmovzxbw                    m0, [dstq]              ; top row -> words
    vpbroadcastb               xm1, [dstq-1]            ; top-left
    pmovzxbw                    m1, xm1
    psubw                       m0, m1
    mov                 iterationd, 4                   ; 4 x 4 rows
    lea                   stride3q, [strideq*3]
.four_rows:
    ; left neighbour of each of the next four rows, broadcast as words
    vpbroadcastb               xm1, [dstq+strideq*1-1]
    vpbroadcastb               xm2, [dstq+strideq*2-1]
    vpbroadcastb               xm3, [dstq+stride3q-1]
    vpbroadcastb               xm4, [dstq+strideq*4-1]
    pmovzxbw                    m1, xm1
    pmovzxbw                    m2, xm2
    pmovzxbw                    m3, xm3
    pmovzxbw                    m4, xm4
    paddw                       m1, m0
    paddw                       m2, m0
    paddw                       m3, m0
    paddw                       m4, m0
    ; The pack works per 128-bit lane, so the two rows come out interleaved
    ; by quadword; vpermq q3120 puts one whole row in each 128-bit half.
    vpackuswb                   m1, m1, m2
    vpackuswb                   m3, m3, m4
    vpermq                      m1, m1, q3120
    vpermq                      m3, m3, q3120
    movdqa        [dstq+strideq*1], xm1
    vextracti128  [dstq+strideq*2], m1, 1
    movdqa       [dstq+stride3q*1], xm3
    vextracti128  [dstq+strideq*4], m3, 1
    lea                       dstq, [dstq+strideq*4]
    dec                 iterationd
    jg .four_rows
    REP_RET
%endif
233cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; void ff_pred16x16_plane_*_8(uint8_t *src, ptrdiff_t stride)
;
; 16x16 plane prediction. %1 selects how the raw gradients are scaled:
; h264, rv40 or svq3. Notation used below:
;   top[x]  = src[x - stride]        (top[-1] is the top-left corner)
;   left[y] = src[y*stride - 1]      (left[-1] is the top-left corner)
;   H = sum over k=1..8 of k * (top[7+k]  - top[7-k])
;   V = sum over k=1..8 of k * (left[7+k] - left[7-k])
;   a = 16 * (left[15] + top[15] + 1) - 7 * (H' + V')   (H', V' = scaled)
;   pixel(x, y) = clip_uint8((a + x*H' + y*V') >> 5)
; Register roles: r0 = src, r1 = -stride, r2 = +stride, r5 = V accumulator,
; e_reg = extra scratch (r8 on x86-64; r0 on x86-32, reloaded afterwards).
;-----------------------------------------------------------------------------

%macro H264_PRED16x16_PLANE 1
cglobal pred16x16_plane_%1_8, 2,9,7
    mov          r2, r1           ; r2 = +stride
    neg          r1               ; r1 = -stride

    ; ---- H: weighted sum over the row above the block ----
    movh         m0, [r0+r1  -1]  ; top[-1..6]
%if cpuflag(ssse3)
    movhps       m0, [r0+r1  +8]  ; top[8..15]
    pmaddubsw    m0, [plane_shuf] ; pixel * weight, adjacent pairs summed
%else ; sse2
    pxor         m2, m2
    movh         m1, [r0+r1  +8]  ; top[8..15]
    punpcklbw    m0, m2
    punpcklbw    m1, m2
    pmullw       m0, [pw_m8tom1]
    pmullw       m1, [pw_1to8]
    paddw        m0, m1
%endif
    ; horizontal reduction: word 0 of m0 ends up holding H
    movhlps      m1, m0
    paddw        m0, m1
    PSHUFLW      m1, m0, 0xE
    paddw        m0, m1
    PSHUFLW      m1, m0, 0x1
    paddw        m0, m1           ; word 0 = H

    ; ---- V: weighted sum over the column left of the block ----
    lea          r4, [r0+r2*8-1]
    lea          r3, [r0+r2*4-1]  ; r3 -> left[4]
    add          r4, r2           ; r4 -> left[9]

%if ARCH_X86_64
%define e_reg r8
%else
%define e_reg r0
%endif

    movzx     e_reg, byte [r3+r2*2   ]  ; left[6]
    movzx        r5, byte [r4+r1     ]  ; left[8]
    sub          r5, e_reg              ; weight 1

    movzx     e_reg, byte [r3+r2     ]  ; left[5]
    movzx        r6, byte [r4        ]  ; left[9]
    sub          r6, e_reg
    lea          r5, [r5+r6*2]          ; weight 2

    movzx     e_reg, byte [r3+r1     ]  ; left[3]
    movzx        r6, byte [r4+r2*2   ]  ; left[11]
    sub          r6, e_reg
    lea          r5, [r5+r6*4]          ; weight 4

    movzx     e_reg, byte [r3        ]  ; left[4]
%if ARCH_X86_64
    movzx        r7, byte [r4+r2     ]  ; left[10]
    sub          r7, e_reg              ; weight 3, folded in together with
                                        ; the weight-6 term below
%else
    movzx        r6, byte [r4+r2     ]  ; left[10]
    sub          r6, e_reg
    lea          r5, [r5+r6*4]
    sub          r5, r6                 ; weight 3 (4 - 1)
%endif

    lea       e_reg, [r3+r1*4]          ; e_reg -> left[0]
    lea          r3, [r4+r2*4]          ; r3    -> left[13]

    movzx        r4, byte [e_reg+r2  ]  ; left[1]
    movzx        r6, byte [r3        ]  ; left[13]
    sub          r6, r4
%if ARCH_X86_64
    lea          r6, [r7+r6*2]          ; d3 + 2*d6
    lea          r5, [r5+r6*2]
    add          r5, r6                 ; += 3*d3 + 6*d6
%else
    lea          r5, [r5+r6*4]
    lea          r5, [r5+r6*2]          ; weight 6 (4 + 2)
%endif

    movzx        r4, byte [e_reg     ]  ; left[0]
%if ARCH_X86_64
    movzx        r7, byte [r3   +r2  ]  ; left[14]
    sub          r7, r4
    sub          r5, r7                 ; -1*d7 now, +8*d7 with the next term
%else
    movzx        r6, byte [r3   +r2  ]  ; left[14]
    sub          r6, r4
    lea          r5, [r5+r6*8]
    sub          r5, r6                 ; weight 7 (8 - 1)
%endif

    movzx        r4, byte [e_reg+r1  ]  ; left[-1] (top-left corner)
    movzx        r6, byte [r3   +r2*2]  ; left[15]
    sub          r6, r4
%if ARCH_X86_64
    add          r6, r7
%endif
    lea          r5, [r5+r6*8]          ; weight 8

    movzx        r4, byte [e_reg+r2*2]  ; left[2]
    movzx        r6, byte [r3   +r1  ]  ; left[12]
    sub          r6, r4
    lea          r5, [r5+r6*4]
    add          r5, r6           ; weight 5 (4 + 1); r5 = V

%if ARCH_X86_64 == 0
    mov          r0, r0m          ; e_reg aliased r0 on x86-32: reload src
%endif

    ; ---- scale V according to the variant ----
%ifidn %1, h264
    lea          r5, [r5*5+32]
    sar          r5, 6            ; (5*V + 32) >> 6
%elifidn %1, rv40
    lea          r5, [r5*5]
    sar          r5, 6            ; (5*V) >> 6
%elifidn %1, svq3
    ; add divisor-1 to negative values before the arithmetic shift so that
    ; both divisions round towards zero
    test         r5, r5
    lea          r6, [r5+3]
    cmovs        r5, r6
    sar          r5, 2            ; V/4
    lea          r5, [r5*5]       ; 5*(V/4)
    test         r5, r5
    lea          r6, [r5+15]
    cmovs        r5, r6
    sar          r5, 4            ; (5*(V/4))/16
%endif

    ; ---- r3 = 16 * (left[15] + top[15] + 1) ----
    movzx        r4, byte [r0+r1  +15]  ; top[15]
    movzx        r3, byte [r3+r2*2   ]  ; left[15]
    lea          r3, [r3+r4+1]
    shl          r3, 4

    ; ---- scale H (sign-extended from word 0 of m0) ----
    movd        r1d, m0
    movsx       r1d, r1w
%ifnidn %1, svq3
%ifidn %1, h264
    lea         r1d, [r1d*5+32]
%else ; rv40
    lea         r1d, [r1d*5]
%endif
    sar         r1d, 6
%else ; svq3
    test        r1d, r1d
    lea         r4d, [r1d+3]
    cmovs       r1d, r4d
    sar         r1d, 2           ; H/4
    lea         r1d, [r1d*5]     ; 5*(H/4)
    test        r1d, r1d
    lea         r4d, [r1d+15]
    cmovs       r1d, r4d
    sar         r1d, 4           ; (5*(H/4))/16
%endif
    movd         m0, r1d

    ; ---- a = r3 + (H+V) - 8*(H+V) = r3 - 7*(H+V) ----
    add         r1d, r5d
    add         r3d, r1d
    shl         r1d, 3
    sub         r3d, r1d          ; a

    movd         m1, r5d
    movd         m3, r3d
    SPLATW       m0, m0, 0        ; H in every word
    SPLATW       m1, m1, 0        ; V in every word
    SPLATW       m3, m3, 0        ; a in every word
%ifidn %1, svq3
    SWAP          0, 1            ; svq3 uses H and V with swapped roles
%endif
    mova         m2, m0
    pmullw       m0, [pw_0to7]    ; 0*H, 1*H, ..., 7*H  (words)
    psllw        m2, 3            ; 8*H
    paddw        m0, m3           ; a + {0,1,2,3,4,5,6,7}*H
    paddw        m2, m0           ; a + {8,9,10,11,12,13,14,15}*H

    ; ---- emit rows: shift, saturate, store, then step both halves by V ----
    mov          r4, 8            ; 8 iterations x 2 rows
.row_pair:
    mova         m3, m0           ; columns 0..7
    mova         m4, m2           ; columns 8..15
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova       [r0], m3
    paddw        m0, m1
    paddw        m2, m1

    mova         m3, m0           ; columns 0..7
    mova         m4, m2           ; columns 8..15
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova    [r0+r2], m3
    paddw        m0, m1
    paddw        m2, m1

    lea          r0, [r0+r2*2]
    dec          r4
    jg .row_pair
    REP_RET
%endmacro

INIT_XMM sse2
H264_PRED16x16_PLANE h264
H264_PRED16x16_PLANE rv40
H264_PRED16x16_PLANE svq3
INIT_XMM ssse3
H264_PRED16x16_PLANE h264
H264_PRED16x16_PLANE rv40
H264_PRED16x16_PLANE svq3
441cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; void ff_pred8x8_plane_8(uint8_t *src, ptrdiff_t stride)
;
; 8x8 plane prediction. Notation used below:
;   top[x]  = src[x - stride]        (top[-1] is the top-left corner)
;   left[y] = src[y*stride - 1]      (left[-1] is the top-left corner)
;   H = sum over k=1..4 of k * (top[3+k]  - top[3-k]),  H' = (17*H + 16) >> 5
;   V = sum over k=1..4 of k * (left[3+k] - left[3-k]), V' = (17*V + 16) >> 5
;   a = 16 * (left[7] + top[7] + 1) - 3 * (H' + V')
;   pixel(x, y) = clip_uint8((a + x*H' + y*V') >> 5)
; Register roles: r0 = src, r1 = -stride, r2 = +stride, r5 = V accumulator,
; e_reg = extra scratch (r8 on x86-64; r0 on x86-32, reloaded afterwards).
;-----------------------------------------------------------------------------

%macro H264_PRED8x8_PLANE 0
cglobal pred8x8_plane_8, 2,9,7
    mov          r2, r1           ; r2 = +stride
    neg          r1               ; r1 = -stride

    ; ---- H: weighted sum over the row above the block ----
    movd         m0, [r0+r1  -1]   ; top[-1..2]
%if cpuflag(ssse3)
    movhps       m0, [r0+r1  +4]   ; this reads 4 bytes more than necessary
    pmaddubsw    m0, [plane8_shuf] ; zero weights discard the unused lanes
%else ; sse2
    pxor         m2, m2
    movd         m1, [r0+r1  +4]   ; top[4..7]
    punpckldq    m0, m1
    punpcklbw    m0, m2
    pmullw       m0, [pw_m4to4]
%endif
    movhlps      m1, m0
    paddw        m0, m1

%if notcpuflag(ssse3)
    ; the SSE2 path still has four partial sums at this point
    PSHUFLW      m1, m0, 0xE
    paddw        m0, m1
%endif ; !ssse3

    PSHUFLW      m1, m0, 0x1
    paddw        m0, m1           ; word 0 = H

    ; ---- V: weighted sum over the column left of the block ----
    lea          r4, [r0+r2*4-1]
    lea          r3, [r0     -1]  ; r3 -> left[0]
    add          r4, r2           ; r4 -> left[5]

%if ARCH_X86_64
%define e_reg r8
%else
%define e_reg r0
%endif

    movzx     e_reg, byte [r3+r2*2   ]  ; left[2]
    movzx        r5, byte [r4+r1     ]  ; left[4]
    sub          r5, e_reg              ; weight 1

    movzx     e_reg, byte [r3        ]  ; left[0]
%if ARCH_X86_64
    movzx        r7, byte [r4+r2     ]  ; left[6]
    sub          r7, e_reg
    sub          r5, r7                 ; -1*d3 now, +4*d3 with the next term
%else
    movzx        r6, byte [r4+r2     ]  ; left[6]
    sub          r6, e_reg
    lea          r5, [r5+r6*4]
    sub          r5, r6                 ; weight 3 (4 - 1)
%endif

    movzx     e_reg, byte [r3+r1     ]  ; left[-1] (top-left corner)
    movzx        r6, byte [r4+r2*2   ]  ; left[7]
    sub          r6, e_reg
%if ARCH_X86_64
    add          r6, r7
%endif
    lea          r5, [r5+r6*4]          ; weight 4

    movzx     e_reg, byte [r3+r2     ]  ; left[1]
    movzx        r6, byte [r4        ]  ; left[5]
    sub          r6, e_reg
    lea          r6, [r5+r6*2]          ; weight 2; r6 = V

    lea          r5, [r6*9+16]
    lea          r5, [r5+r6*8]          ; 17*V + 16
    sar          r5, 5                  ; r5 = V'

%if ARCH_X86_64 == 0
    mov          r0, r0m          ; e_reg aliased r0 on x86-32: reload src
%endif

    ; ---- r3 = 16 * (left[7] + top[7] + 1) ----
    movzx        r3, byte [r4+r2*2  ]   ; left[7]
    movzx        r4, byte [r0+r1  +7]   ; top[7]
    lea          r3, [r3+r4+1]
    shl          r3, 4

    ; ---- H' = (17*H + 16) >> 5, H sign-extended from word 0 of m0 ----
    movd        r1d, m0
    movsx       r1d, r1w
    imul        r1d, 17
    add         r1d, 16
    sar         r1d, 5
    movd         m0, r1d

    ; ---- a = r3 - (H'+V') - 2*(H'+V') ----
    add         r1d, r5d
    sub         r3d, r1d
    add         r1d, r1d
    sub         r3d, r1d          ; a

    movd         m1, r5d
    movd         m3, r3d
    SPLATW       m0, m0, 0        ; H' in every word
    SPLATW       m1, m1, 0        ; V' in every word
    SPLATW       m3, m3, 0        ; a in every word
    pmullw       m0, [pw_0to7]    ; 0*H, 1*H, ..., 7*H  (words)
    paddw        m0, m3           ; a + {0,1,2,3,4,5,6,7}*H

    ; ---- emit rows: two 8-pixel rows share one packed register ----
    mov          r4, 4            ; 4 iterations x 2 rows
ALIGN 16
.row_pair:
    mova         m3, m0           ; current row
    paddw        m0, m1
    psraw        m3, 5
    mova         m4, m0           ; next row (current + V')
    paddw        m0, m1
    psraw        m4, 5
    packuswb     m3, m4
    movh       [r0], m3
    movhps  [r0+r2], m3

    lea          r0, [r0+r2*2]
    dec          r4
    jg .row_pair
    REP_RET
%endmacro

INIT_XMM sse2
H264_PRED8x8_PLANE
INIT_XMM ssse3
H264_PRED8x8_PLANE
566cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; void ff_pred8x8_vertical_8(uint8_t *src, ptrdiff_t stride)
;
; Replicates the 8-byte row directly above src into all 8 rows of the block.
; Fully unrolled: three "store two rows and advance" steps plus a final pair.
;   r0 = src (walked downwards), r1 = stride
;-----------------------------------------------------------------------------

INIT_MMX mmx
cglobal pred8x8_vertical_8, 2,2
    sub     r0, r1                      ; r0 -> row above the block
    movq    mm0, [r0]                   ; the row to replicate
%rep 3
    movq    [r0+r1*1], mm0
    movq    [r0+r1*2], mm0
    lea     r0, [r0+r1*2]               ; advance two rows
%endrep
    movq    [r0+r1*1], mm0              ; rows 6 and 7: no advance needed
    movq    [r0+r1*2], mm0
    RET
583cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; void ff_pred8x8_horizontal_8(uint8_t *src, ptrdiff_t stride)
;
; Fills each of the 8 rows with the pixel immediately to its left
; (src[y*stride - 1]). Two rows are produced per iteration.
;   r0 = src (walked downwards), r1 = stride, r2 = loop counter
;-----------------------------------------------------------------------------

%macro PRED8x8_H 0
cglobal pred8x8_horizontal_8, 2,3
%if cpuflag(ssse3)
    mova      m2, [pb_3]            ; shuffle control handed to SPLATB_LOAD
%endif
    mov       r2, 4                 ; 4 iterations x 2 rows
.row_pair:
    ; SPLATB_LOAD is an x86util macro; presumably it loads the byte at the
    ; given address and replicates it across the register -- confirm there.
    SPLATB_LOAD m0, r0+r1*0-1, m2
    SPLATB_LOAD m1, r0+r1*1-1, m2
    mova [r0+r1*0], m0
    mova [r0+r1*1], m1
    lea       r0, [r0+r1*2]
    dec       r2
    jg .row_pair
    REP_RET
%endmacro

INIT_MMX mmxext
PRED8x8_H
INIT_MMX ssse3
PRED8x8_H
609cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; void ff_pred8x8_top_dc_8_mmxext(uint8_t *src, ptrdiff_t stride)
;
; DC prediction from the top neighbours only. The left 4 columns of all 8
; rows are filled with dc0 = (top[0]+..+top[3] + 2) >> 2 and the right 4
; columns with dc1 = (top[4]+..+top[7] + 2) >> 2.
;   r0 = src - stride, r1 = stride, r2 / r3 = r0 + 2 / 4 rows
; Only four GPRs are needed: the pointer for the last row pair is formed in
; r0 once its first two stores are done.
;-----------------------------------------------------------------------------
INIT_MMX mmxext
cglobal pred8x8_top_dc_8, 2,4
    sub         r0, r1             ; r0 -> row above the block
    movq       mm0, [r0]
    pxor       mm1, mm1
    pxor       mm2, mm2
    lea         r2, [r0+r1*2]
    punpckhbw  mm1, mm0            ; top[4..7] in the high byte of each word
    punpcklbw  mm0, mm2            ; top[0..3] as words
    psadbw     mm1, mm2        ; s1
    lea         r3, [r2+r1*2]
    psadbw     mm0, mm2        ; s0
    ; (s >> 1) followed by pavgw with zero gives ((s >> 1) + 1) >> 1,
    ; which equals (s + 2) >> 2
    psrlw      mm1, 1
    psrlw      mm0, 1
    pavgw      mm1, mm2
    pavgw      mm0, mm2
    pshufw     mm1, mm1, 0     ; dc1 (w)
    pshufw     mm0, mm0, 0     ; dc0 (w)
    packuswb   mm0, mm1        ; dc0,dc1 (b)
    movq [r0+r1*1], mm0        ; rows 0, 1
    movq [r0+r1*2], mm0
    lea         r0, [r3+r1*2]
    movq [r2+r1*1], mm0        ; rows 2, 3
    movq [r2+r1*2], mm0
    movq [r3+r1*1], mm0        ; rows 4, 5
    movq [r3+r1*2], mm0
    movq [r0+r1*1], mm0        ; rows 6, 7
    movq [r0+r1*2], mm0
    RET
643cabdff1aSopenharmony_ci
644cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
645cabdff1aSopenharmony_ci; void ff_pred8x8_dc_8_mmxext(uint8_t *src, ptrdiff_t stride)
646cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
647cabdff1aSopenharmony_ci
648cabdff1aSopenharmony_ciINIT_MMX mmxext
; 8x8 DC prediction using both the top row and the left column.
; Arguments: r0 = src, r1 = stride.
; Four partial sums are formed: s0 = top[0..3], s1 = top[4..7],
; s2 = left[0..3], s3 = left[4..7]. Each 4x4 quadrant gets its own DC value:
;   top-left     = (s0 + s2 + 4) >> 3      top-right    = (s1 + 2) >> 2
;   bottom-left  = (s3 + 2) >> 2           bottom-right = (s1 + s3 + 4) >> 3
649cabdff1aSopenharmony_cicglobal pred8x8_dc_8, 2,5
650cabdff1aSopenharmony_ci    sub       r0, r1                 ; r0 -> row above the block
651cabdff1aSopenharmony_ci    pxor      m7, m7                 ; m7 = 0, kept for psadbw / pavgw
652cabdff1aSopenharmony_ci    movd      m0, [r0+0]             ; top[0..3]
653cabdff1aSopenharmony_ci    movd      m1, [r0+4]             ; top[4..7]
654cabdff1aSopenharmony_ci    psadbw    m0, m7            ; s0
655cabdff1aSopenharmony_ci    mov       r4, r0                 ; remember the top-row pointer; r0 walks down the rows next
656cabdff1aSopenharmony_ci    psadbw    m1, m7            ; s1
657cabdff1aSopenharmony_ci
658cabdff1aSopenharmony_ci    movzx    r2d, byte [r0+r1*1-1]   ; left pixel of row 0
659cabdff1aSopenharmony_ci    movzx    r3d, byte [r0+r1*2-1]   ; left pixel of row 1
660cabdff1aSopenharmony_ci    lea       r0, [r0+r1*2]
661cabdff1aSopenharmony_ci    add      r2d, r3d
662cabdff1aSopenharmony_ci    movzx    r3d, byte [r0+r1*1-1]   ; left pixel of row 2
663cabdff1aSopenharmony_ci    add      r2d, r3d
664cabdff1aSopenharmony_ci    movzx    r3d, byte [r0+r1*2-1]   ; left pixel of row 3
665cabdff1aSopenharmony_ci    add      r2d, r3d
666cabdff1aSopenharmony_ci    lea       r0, [r0+r1*2]
667cabdff1aSopenharmony_ci    movd      m2, r2d            ; s2
668cabdff1aSopenharmony_ci    movzx    r2d, byte [r0+r1*1-1]   ; left pixel of row 4
669cabdff1aSopenharmony_ci    movzx    r3d, byte [r0+r1*2-1]   ; left pixel of row 5
670cabdff1aSopenharmony_ci    lea       r0, [r0+r1*2]
671cabdff1aSopenharmony_ci    add      r2d, r3d
672cabdff1aSopenharmony_ci    movzx    r3d, byte [r0+r1*1-1]   ; left pixel of row 6
673cabdff1aSopenharmony_ci    add      r2d, r3d
674cabdff1aSopenharmony_ci    movzx    r3d, byte [r0+r1*2-1]   ; left pixel of row 7
675cabdff1aSopenharmony_ci    add      r2d, r3d
676cabdff1aSopenharmony_ci    movd      m3, r2d            ; s3
677cabdff1aSopenharmony_ci
678cabdff1aSopenharmony_ci    punpcklwd m0, m1                 ; words: s0, s1
679cabdff1aSopenharmony_ci    mov       r0, r4                 ; restore top-row pointer for the stores
680cabdff1aSopenharmony_ci    punpcklwd m2, m3                 ; words: s2, s3
681cabdff1aSopenharmony_ci    punpckldq m0, m2            ; s0, s1, s2, s3
682cabdff1aSopenharmony_ci    pshufw    m3, m0, 11110110b ; s2, s1, s3, s3
683cabdff1aSopenharmony_ci    lea       r2, [r0+r1*2]          ; base for rows 2-3
684cabdff1aSopenharmony_ci    pshufw    m0, m0, 01110100b ; s0, s1, s3, s1
685cabdff1aSopenharmony_ci    paddw     m0, m3                 ; s0+s2, 2*s1, 2*s3, s1+s3
686cabdff1aSopenharmony_ci    lea       r3, [r2+r1*2]          ; base for rows 4-5
687cabdff1aSopenharmony_ci    psrlw     m0, 2
688cabdff1aSopenharmony_ci    pavgw     m0, m7            ; rounded DCs for (s0+s2, s1, s3, s1+s3): ((x >> 2) + 1) >> 1
689cabdff1aSopenharmony_ci    lea       r4, [r3+r1*2]          ; base for rows 6-7
690cabdff1aSopenharmony_ci    packuswb  m0, m0                 ; four DC bytes d0..d3 (repeated in both halves)
691cabdff1aSopenharmony_ci    punpcklbw m0, m0                 ; d0 d0 d1 d1 d2 d2 d3 d3
692cabdff1aSopenharmony_ci    movq      m1, m0
693cabdff1aSopenharmony_ci    punpcklbw m0, m0                 ; d0 x4, d1 x4  -> upper four rows
694cabdff1aSopenharmony_ci    punpckhbw m1, m1                 ; d2 x4, d3 x4  -> lower four rows
695cabdff1aSopenharmony_ci    movq [r0+r1*1], m0               ; row 0
696cabdff1aSopenharmony_ci    movq [r0+r1*2], m0               ; row 1
697cabdff1aSopenharmony_ci    movq [r2+r1*1], m0               ; row 2
698cabdff1aSopenharmony_ci    movq [r2+r1*2], m0               ; row 3
699cabdff1aSopenharmony_ci    movq [r3+r1*1], m1               ; row 4
700cabdff1aSopenharmony_ci    movq [r3+r1*2], m1               ; row 5
701cabdff1aSopenharmony_ci    movq [r4+r1*1], m1               ; row 6
702cabdff1aSopenharmony_ci    movq [r4+r1*2], m1               ; row 7
703cabdff1aSopenharmony_ci    RET
704cabdff1aSopenharmony_ci
705cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
706cabdff1aSopenharmony_ci; void ff_pred8x8_dc_rv40_8(uint8_t *src, ptrdiff_t stride)
707cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
708cabdff1aSopenharmony_ci
709cabdff1aSopenharmony_ciINIT_MMX mmxext
; 8x8 DC prediction with a single DC value for the whole block:
;   dc = (sum(top[0..7]) + sum(left[0..7]) + 8) >> 4
; Arguments: r0 = src, r1 = stride.
; r4 keeps the original src for the store loop; r5/r6 are two running sums
; and r2/r3 are scratch.
710cabdff1aSopenharmony_cicglobal pred8x8_dc_rv40_8, 2,7
711cabdff1aSopenharmony_ci    mov       r4, r0                 ; r4 = src, used later as the store pointer
712cabdff1aSopenharmony_ci    sub       r0, r1                 ; r0 -> row above the block
713cabdff1aSopenharmony_ci    pxor      mm0, mm0
714cabdff1aSopenharmony_ci    psadbw    mm0, [r0]              ; mm0 = sum of the 8 top pixels
715cabdff1aSopenharmony_ci    dec        r0                    ; r0 -> column -1 (left neighbour column)
716cabdff1aSopenharmony_ci    movzx     r5d, byte [r0+r1*1]    ; r5 = left pixel of row 0
717cabdff1aSopenharmony_ci    movd      r6d, mm0               ; r6 = top-row sum
718cabdff1aSopenharmony_ci    lea        r0, [r0+r1*2]         ; r0 -> left pixel of row 1
719cabdff1aSopenharmony_ci%rep 3
720cabdff1aSopenharmony_ci    movzx     r2d, byte [r0+r1*0]    ; left pixel of rows 1, 3, 5
721cabdff1aSopenharmony_ci    movzx     r3d, byte [r0+r1*1]    ; left pixel of rows 2, 4, 6
722cabdff1aSopenharmony_ci    add       r5d, r2d
723cabdff1aSopenharmony_ci    add       r6d, r3d
724cabdff1aSopenharmony_ci    lea        r0, [r0+r1*2]
725cabdff1aSopenharmony_ci%endrep
726cabdff1aSopenharmony_ci    movzx     r2d, byte [r0+r1*0]    ; left pixel of row 7
727cabdff1aSopenharmony_ci    add       r5d, r6d               ; merge the two running sums
728cabdff1aSopenharmony_ci    lea       r2d, [r2+r5+8]         ; total of all 16 neighbours + rounding term
729cabdff1aSopenharmony_ci    shr       r2d, 4                 ; dc
730cabdff1aSopenharmony_ci    movd      mm0, r2d
731cabdff1aSopenharmony_ci    punpcklbw mm0, mm0               ; duplicate dc byte into a word
732cabdff1aSopenharmony_ci    pshufw    mm0, mm0, 0            ; dc in all 8 bytes
733cabdff1aSopenharmony_ci    mov       r3d, 4                 ; 4 passes, two rows per pass
734cabdff1aSopenharmony_ci.loop:
735cabdff1aSopenharmony_ci    movq [r4+r1*0], mm0
736cabdff1aSopenharmony_ci    movq [r4+r1*1], mm0
737cabdff1aSopenharmony_ci    lea   r4, [r4+r1*2]
738cabdff1aSopenharmony_ci    dec   r3d
739cabdff1aSopenharmony_ci    jg .loop
740cabdff1aSopenharmony_ci    REP_RET
741cabdff1aSopenharmony_ci
742cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
743cabdff1aSopenharmony_ci; void ff_pred8x8_tm_vp8_8(uint8_t *src, ptrdiff_t stride)
744cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
745cabdff1aSopenharmony_ci
746cabdff1aSopenharmony_ciINIT_XMM sse2
; SSE2 build of the 8x8 "TM" predictor:
;   pred[y][x] = clamp(top[x] + left[y] - topleft, 0, 255)
; Arguments: r0 = src, r1 = stride.
; xmm0 holds the top row as 8 words for the whole function; r4d holds the
; top-left pixel; r5d counts row pairs.
747cabdff1aSopenharmony_cicglobal pred8x8_tm_vp8_8, 2,6,4
748cabdff1aSopenharmony_ci    sub          r0, r1              ; r0 -> row above the block
749cabdff1aSopenharmony_ci    pxor       xmm1, xmm1
750cabdff1aSopenharmony_ci    movq       xmm0, [r0]            ; top[0..7]
751cabdff1aSopenharmony_ci    punpcklbw  xmm0, xmm1            ; widen top row to 8 words
752cabdff1aSopenharmony_ci    movzx       r4d, byte [r0-1]     ; top-left pixel
753cabdff1aSopenharmony_ci    mov         r5d, 4               ; 4 passes, two rows per pass
754cabdff1aSopenharmony_ci.loop:
755cabdff1aSopenharmony_ci    movzx       r2d, byte [r0+r1*1-1] ; left pixel of the first row of the pair
756cabdff1aSopenharmony_ci    movzx       r3d, byte [r0+r1*2-1] ; left pixel of the second row of the pair
757cabdff1aSopenharmony_ci    sub         r2d, r4d             ; left - topleft (may be negative)
758cabdff1aSopenharmony_ci    sub         r3d, r4d
759cabdff1aSopenharmony_ci    movd       xmm2, r2d
760cabdff1aSopenharmony_ci    movd       xmm3, r3d
761cabdff1aSopenharmony_ci    pshuflw    xmm2, xmm2, 0         ; replicate low word across the low 4 words
762cabdff1aSopenharmony_ci    pshuflw    xmm3, xmm3, 0
763cabdff1aSopenharmony_ci    punpcklqdq xmm2, xmm2            ; ...and across all 8 words
764cabdff1aSopenharmony_ci    punpcklqdq xmm3, xmm3
765cabdff1aSopenharmony_ci    paddw      xmm2, xmm0            ; + top row
766cabdff1aSopenharmony_ci    paddw      xmm3, xmm0
767cabdff1aSopenharmony_ci    packuswb   xmm2, xmm3            ; saturate to bytes: low half = first row, high half = second row
768cabdff1aSopenharmony_ci    movq   [r0+r1*1], xmm2
769cabdff1aSopenharmony_ci    movhps [r0+r1*2], xmm2
770cabdff1aSopenharmony_ci    lea          r0, [r0+r1*2]       ; advance two rows
771cabdff1aSopenharmony_ci    dec         r5d
772cabdff1aSopenharmony_ci    jg .loop
773cabdff1aSopenharmony_ci    REP_RET
774cabdff1aSopenharmony_ci
775cabdff1aSopenharmony_ciINIT_XMM ssse3
; SSSE3 build of the 8x8 "TM" predictor:
;   pred[y][x] = clamp(top[x] + left[y] - topleft, 0, 255)
; Arguments: r0 = src, r1 = stride.
; Instead of scalar loads, 4 bytes ending at the wanted pixel are loaded and
; pshufb with tm_shuf (bytes 0x03,0x80 repeated) turns source byte 3 into
; 8 zero-extended words.
776cabdff1aSopenharmony_cicglobal pred8x8_tm_vp8_8, 2,3,6
777cabdff1aSopenharmony_ci    sub          r0, r1              ; r0 -> row above the block
778cabdff1aSopenharmony_ci    movdqa     xmm4, [tm_shuf]       ; broadcast-byte-3-as-words shuffle mask
779cabdff1aSopenharmony_ci    pxor       xmm1, xmm1
780cabdff1aSopenharmony_ci    movq       xmm0, [r0]            ; top[0..7]
781cabdff1aSopenharmony_ci    punpcklbw  xmm0, xmm1            ; widen top row to 8 words
782cabdff1aSopenharmony_ci    movd       xmm5, [r0-4]          ; byte 3 of this load is the top-left pixel
783cabdff1aSopenharmony_ci    pshufb     xmm5, xmm4            ; xmm5 = topleft in all 8 words
784cabdff1aSopenharmony_ci    mov         r2d, 4               ; 4 passes, two rows per pass
785cabdff1aSopenharmony_ci.loop:
786cabdff1aSopenharmony_ci    movd       xmm2, [r0+r1*1-4]     ; byte 3 = left pixel of the first row of the pair
787cabdff1aSopenharmony_ci    movd       xmm3, [r0+r1*2-4]     ; byte 3 = left pixel of the second row of the pair
788cabdff1aSopenharmony_ci    pshufb     xmm2, xmm4            ; left pixel in all 8 words
789cabdff1aSopenharmony_ci    pshufb     xmm3, xmm4
790cabdff1aSopenharmony_ci    psubw      xmm2, xmm5            ; left - topleft
791cabdff1aSopenharmony_ci    psubw      xmm3, xmm5
792cabdff1aSopenharmony_ci    paddw      xmm2, xmm0            ; + top row
793cabdff1aSopenharmony_ci    paddw      xmm3, xmm0
794cabdff1aSopenharmony_ci    packuswb   xmm2, xmm3            ; saturate to bytes: low half = first row, high half = second row
795cabdff1aSopenharmony_ci    movq   [r0+r1*1], xmm2
796cabdff1aSopenharmony_ci    movhps [r0+r1*2], xmm2
797cabdff1aSopenharmony_ci    lea          r0, [r0+r1*2]       ; advance two rows
798cabdff1aSopenharmony_ci    dec         r2d
799cabdff1aSopenharmony_ci    jg .loop
800cabdff1aSopenharmony_ci    REP_RET
801cabdff1aSopenharmony_ci
802cabdff1aSopenharmony_ci; dest, left, right, src, tmp
803cabdff1aSopenharmony_ci; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
; Per-byte 3-tap smoothing filter, where %2 = t[n-1], %3 = t[n+1], %4 = t[n].
; Clobbers %2, %3 and %5; %4 is only read.
; Built from two pavgb steps; the first average is corrected to round down
; so that the combined result matches the formula above exactly.
; NOTE(review): pb_1 is defined outside this chunk; presumably a vector of
; byte value 1 - confirm.
804cabdff1aSopenharmony_ci%macro PRED4x4_LOWPASS 5
805cabdff1aSopenharmony_ci    mova    %5, %2                   ; keep a copy of left
806cabdff1aSopenharmony_ci    pavgb   %2, %3                   ; (left + right + 1) >> 1
807cabdff1aSopenharmony_ci    pxor    %3, %5                   ; left ^ right
808cabdff1aSopenharmony_ci    mova    %1, %4                   ; dest = src
809cabdff1aSopenharmony_ci    pand    %3, [pb_1]               ; isolate the bit that made pavgb round up
810cabdff1aSopenharmony_ci    psubusb %2, %3                   ; -> (left + right) >> 1, rounded down
811cabdff1aSopenharmony_ci    pavgb   %1, %2                   ; (src + ((left + right) >> 1) + 1) >> 1
812cabdff1aSopenharmony_ci%endmacro
813cabdff1aSopenharmony_ci
814cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
815cabdff1aSopenharmony_ci; void ff_pred8x8l_top_dc_8(uint8_t *src, int has_topleft, int has_topright,
816cabdff1aSopenharmony_ci;                           ptrdiff_t stride)
817cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
818cabdff1aSopenharmony_ci%macro PRED8x8L_TOP_DC 0
; 8x8 luma DC prediction from the (low-pass filtered) top row only.
; Arguments: r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride.
; Steps: build the left-shifted and right-shifted neighbours of the top row,
; patch the edge byte when top-left / top-right are unavailable, filter with
; PRED4x4_LOWPASS, then dc = (sum of 8 filtered bytes + pw_4) >> 3.
; NOTE(review): comments below assume PALIGNR d, s, n, tmp behaves like the
; palignr instruction (d = (d:s) >> n bytes) with tmp as scratch, and that
; pw_4 is a vector of word value 4 - both are defined outside this chunk.
819cabdff1aSopenharmony_cicglobal pred8x8l_top_dc_8, 4,4
820cabdff1aSopenharmony_ci    sub          r0, r3              ; r0 -> row above the block
821cabdff1aSopenharmony_ci    pxor        mm7, mm7             ; accumulator for psadbw
822cabdff1aSopenharmony_ci    movq        mm0, [r0-8]          ; 8 bytes ending at the top-left pixel
823cabdff1aSopenharmony_ci    movq        mm3, [r0]            ; top[0..7]
824cabdff1aSopenharmony_ci    movq        mm1, [r0+8]          ; 8 bytes starting at the top-right pixel
825cabdff1aSopenharmony_ci    movq        mm2, mm3
826cabdff1aSopenharmony_ci    movq        mm4, mm3
827cabdff1aSopenharmony_ci    PALIGNR     mm2, mm0, 7, mm0     ; mm2 = topleft, top[0..6]  (left neighbours)
828cabdff1aSopenharmony_ci    PALIGNR     mm1, mm4, 1, mm4     ; mm1 = top[1..7], topright[0] (right neighbours)
829cabdff1aSopenharmony_ci    test        r1d, r1d ; top_left
830cabdff1aSopenharmony_ci    jz .fix_lt_2
831cabdff1aSopenharmony_ci    test        r2d, r2d ; top_right
832cabdff1aSopenharmony_ci    jz .fix_tr_1
833cabdff1aSopenharmony_ci    jmp .body
834cabdff1aSopenharmony_ci.fix_lt_2:                           ; no top-left: use top[0] as its own left neighbour
835cabdff1aSopenharmony_ci    movq        mm5, mm3
836cabdff1aSopenharmony_ci    pxor        mm5, mm2
837cabdff1aSopenharmony_ci    psllq       mm5, 56
838cabdff1aSopenharmony_ci    psrlq       mm5, 56              ; keep only byte 0 of (top ^ left-neighbours)
839cabdff1aSopenharmony_ci    pxor        mm2, mm5             ; byte 0 of mm2 becomes top[0]
840cabdff1aSopenharmony_ci    test        r2d, r2d ; top_right
841cabdff1aSopenharmony_ci    jnz .body
842cabdff1aSopenharmony_ci.fix_tr_1:                           ; no top-right: use top[7] as its own right neighbour
843cabdff1aSopenharmony_ci    movq        mm5, mm3
844cabdff1aSopenharmony_ci    pxor        mm5, mm1
845cabdff1aSopenharmony_ci    psrlq       mm5, 56
846cabdff1aSopenharmony_ci    psllq       mm5, 56              ; keep only byte 7 of (top ^ right-neighbours)
847cabdff1aSopenharmony_ci    pxor        mm1, mm5             ; byte 7 of mm1 becomes top[7]
848cabdff1aSopenharmony_ci.body:
849cabdff1aSopenharmony_ci    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5 ; mm0 = filtered top row
850cabdff1aSopenharmony_ci    psadbw   mm7, mm0                ; sum of the 8 filtered bytes
851cabdff1aSopenharmony_ci    paddw    mm7, [pw_4]             ; rounding term
852cabdff1aSopenharmony_ci    psrlw    mm7, 3                  ; dc
853cabdff1aSopenharmony_ci    pshufw   mm7, mm7, 0             ; dc in all four words
854cabdff1aSopenharmony_ci    packuswb mm7, mm7                ; dc in all eight bytes
855cabdff1aSopenharmony_ci%rep 3
856cabdff1aSopenharmony_ci    movq [r0+r3*1], mm7              ; rows 0-5, two per repetition
857cabdff1aSopenharmony_ci    movq [r0+r3*2], mm7
858cabdff1aSopenharmony_ci    lea    r0, [r0+r3*2]
859cabdff1aSopenharmony_ci%endrep
860cabdff1aSopenharmony_ci    movq [r0+r3*1], mm7              ; row 6
861cabdff1aSopenharmony_ci    movq [r0+r3*2], mm7              ; row 7
862cabdff1aSopenharmony_ci    RET
863cabdff1aSopenharmony_ci%endmacro
864cabdff1aSopenharmony_ci
; Emit one copy of the function per instruction-set flavour.
865cabdff1aSopenharmony_ciINIT_MMX mmxext
866cabdff1aSopenharmony_ciPRED8x8L_TOP_DC
867cabdff1aSopenharmony_ciINIT_MMX ssse3
868cabdff1aSopenharmony_ciPRED8x8L_TOP_DC
869cabdff1aSopenharmony_ci
870cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
871cabdff1aSopenharmony_ci; void ff_pred8x8l_dc_8(uint8_t *src, int has_topleft, int has_topright,
872cabdff1aSopenharmony_ci;                       ptrdiff_t stride)
873cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
874cabdff1aSopenharmony_ci
875cabdff1aSopenharmony_ci%macro PRED8x8L_DC 0
; 8x8 luma DC prediction from the filtered left column and filtered top row.
; Arguments: r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride.
; Phase 1 gathers the left column (byte 7 of an 8-byte load ending at column
; -1 of each row) into one register and low-pass filters it into mm7.
; Phase 2 loads and filters the top row into mm6.
; Result: dc = (sum(mm7) + sum(mm6) + pw_8) >> 4, written to all 64 pixels.
; r1 and r2 are reused as row pointers once the flags have been tested.
; NOTE(review): comments assume PALIGNR d, s, n, tmp behaves like palignr
; (d = (d:s) >> n bytes) and pw_8 is a vector of word value 8 - both are
; defined outside this chunk.
876cabdff1aSopenharmony_cicglobal pred8x8l_dc_8, 4,5
877cabdff1aSopenharmony_ci    sub          r0, r3              ; r0 -> row above the block
878cabdff1aSopenharmony_ci    lea          r4, [r0+r3*2]
879cabdff1aSopenharmony_ci    movq        mm0, [r0+r3*1-8]     ; row 0: byte 7 = left[0]
880cabdff1aSopenharmony_ci    punpckhbw   mm0, [r0+r3*0-8]     ; pair with row -1: bytes 6,7 = left[0], topleft
881cabdff1aSopenharmony_ci    movq        mm1, [r4+r3*1-8]     ; row 2: byte 7 = left[2]
882cabdff1aSopenharmony_ci    punpckhbw   mm1, [r0+r3*2-8]     ; pair with row 1: bytes 6,7 = left[2], left[1]
883cabdff1aSopenharmony_ci    mov          r4, r0              ; save the top-row pointer
884cabdff1aSopenharmony_ci    punpckhwd   mm1, mm0             ; bytes 4..7 = left[2], left[1], left[0], topleft
885cabdff1aSopenharmony_ci    lea          r0, [r0+r3*4]
886cabdff1aSopenharmony_ci    movq        mm2, [r0+r3*1-8]     ; row 4
887cabdff1aSopenharmony_ci    punpckhbw   mm2, [r0+r3*0-8]     ; pair with row 3
888cabdff1aSopenharmony_ci    lea          r0, [r0+r3*2]
889cabdff1aSopenharmony_ci    movq        mm3, [r0+r3*1-8]     ; row 6
890cabdff1aSopenharmony_ci    punpckhbw   mm3, [r0+r3*0-8]     ; pair with row 5
891cabdff1aSopenharmony_ci    punpckhwd   mm3, mm2             ; bytes 4..7 = left[6], left[5], left[4], left[3]
892cabdff1aSopenharmony_ci    punpckhdq   mm3, mm1             ; mm3 = left[6..0], topleft (byte 0 .. byte 7)
893cabdff1aSopenharmony_ci    lea          r0, [r0+r3*2]
894cabdff1aSopenharmony_ci    movq        mm0, [r0+r3*0-8]     ; row 7: byte 7 = left[7]
895cabdff1aSopenharmony_ci    movq        mm1, [r4]            ; top[0..7]
896cabdff1aSopenharmony_ci    mov          r0, r4              ; r0 back to the top row
897cabdff1aSopenharmony_ci    movq        mm4, mm3
898cabdff1aSopenharmony_ci    movq        mm2, mm3
899cabdff1aSopenharmony_ci    PALIGNR     mm4, mm0, 7, mm0     ; mm4 = left[7..0]
900cabdff1aSopenharmony_ci    PALIGNR     mm1, mm2, 1, mm2     ; mm1 = left[5..0], topleft, top[0]
901cabdff1aSopenharmony_ci    test        r1d, r1d
902cabdff1aSopenharmony_ci    jnz .do_left
903cabdff1aSopenharmony_ci.fix_lt_1:                           ; no top-left: substitute left[0] for it in mm1
904cabdff1aSopenharmony_ci    movq        mm5, mm3
905cabdff1aSopenharmony_ci    pxor        mm5, mm4
906cabdff1aSopenharmony_ci    psrlq       mm5, 56              ; byte 7 of the xor (topleft ^ left[0]) -> byte 0
907cabdff1aSopenharmony_ci    psllq       mm5, 48              ; -> byte 6, where mm1 holds topleft
908cabdff1aSopenharmony_ci    pxor        mm1, mm5             ; byte 6 of mm1 becomes left[0]
909cabdff1aSopenharmony_ci    jmp .do_left
910cabdff1aSopenharmony_ci.fix_lt_2:                           ; reached from the top-row setup below when has_topleft == 0
911cabdff1aSopenharmony_ci    movq        mm5, mm3
912cabdff1aSopenharmony_ci    pxor        mm5, mm2
913cabdff1aSopenharmony_ci    psllq       mm5, 56
914cabdff1aSopenharmony_ci    psrlq       mm5, 56              ; keep only byte 0
915cabdff1aSopenharmony_ci    pxor        mm2, mm5             ; byte 0 of mm2 becomes top[0]
916cabdff1aSopenharmony_ci    test        r2d, r2d
917cabdff1aSopenharmony_ci    jnz .body
918cabdff1aSopenharmony_ci.fix_tr_1:                           ; no top-right: use top[7] as its own right neighbour
919cabdff1aSopenharmony_ci    movq        mm5, mm3
920cabdff1aSopenharmony_ci    pxor        mm5, mm1
921cabdff1aSopenharmony_ci    psrlq       mm5, 56
922cabdff1aSopenharmony_ci    psllq       mm5, 56              ; keep only byte 7
923cabdff1aSopenharmony_ci    pxor        mm1, mm5             ; byte 7 of mm1 becomes top[7]
924cabdff1aSopenharmony_ci    jmp .body
925cabdff1aSopenharmony_ci.do_left:
926cabdff1aSopenharmony_ci    movq        mm0, mm4             ; LOWPASS clobbers its 2nd/3rd operands; keep a copy of left[7..0]
927cabdff1aSopenharmony_ci    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 ; mm2 = filtered left[6..0], topleft
928cabdff1aSopenharmony_ci    movq        mm4, mm0
929cabdff1aSopenharmony_ci    movq        mm7, mm2
930cabdff1aSopenharmony_ci    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 ; byte 0 of mm1 = filtered left[7]
931cabdff1aSopenharmony_ci    psllq       mm1, 56              ; move that byte to byte 7 for the PALIGNR below
932cabdff1aSopenharmony_ci    PALIGNR     mm7, mm1, 7, mm3     ; mm7 = filtered left[7..0]
933cabdff1aSopenharmony_ci    movq        mm0, [r0-8]          ; 8 bytes ending at the top-left pixel
934cabdff1aSopenharmony_ci    movq        mm3, [r0]            ; top[0..7]
935cabdff1aSopenharmony_ci    movq        mm1, [r0+8]          ; 8 bytes starting at the top-right pixel
936cabdff1aSopenharmony_ci    movq        mm2, mm3
937cabdff1aSopenharmony_ci    movq        mm4, mm3
938cabdff1aSopenharmony_ci    PALIGNR     mm2, mm0, 7, mm0     ; mm2 = topleft, top[0..6]
939cabdff1aSopenharmony_ci    PALIGNR     mm1, mm4, 1, mm4     ; mm1 = top[1..7], topright[0]
940cabdff1aSopenharmony_ci    test        r1d, r1d
941cabdff1aSopenharmony_ci    jz .fix_lt_2
942cabdff1aSopenharmony_ci    test        r2d, r2d
943cabdff1aSopenharmony_ci    jz .fix_tr_1
944cabdff1aSopenharmony_ci.body:
945cabdff1aSopenharmony_ci    lea          r1, [r0+r3*2]       ; flags no longer needed: r1 = base for rows 2-3
946cabdff1aSopenharmony_ci    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5 ; mm6 = filtered top row
947cabdff1aSopenharmony_ci    pxor        mm0, mm0
948cabdff1aSopenharmony_ci    pxor        mm1, mm1
949cabdff1aSopenharmony_ci    lea          r2, [r1+r3*2]       ; base for rows 4-5
950cabdff1aSopenharmony_ci    psadbw      mm0, mm7             ; sum of filtered left column
951cabdff1aSopenharmony_ci    psadbw      mm1, mm6             ; sum of filtered top row
952cabdff1aSopenharmony_ci    paddw       mm0, [pw_8]          ; rounding term
953cabdff1aSopenharmony_ci    paddw       mm0, mm1
954cabdff1aSopenharmony_ci    lea          r4, [r2+r3*2]       ; base for rows 6-7
955cabdff1aSopenharmony_ci    psrlw       mm0, 4               ; dc
956cabdff1aSopenharmony_ci    pshufw      mm0, mm0, 0          ; dc in all four words
957cabdff1aSopenharmony_ci    packuswb    mm0, mm0             ; dc in all eight bytes
958cabdff1aSopenharmony_ci    movq [r0+r3*1], mm0              ; row 0
959cabdff1aSopenharmony_ci    movq [r0+r3*2], mm0              ; row 1
960cabdff1aSopenharmony_ci    movq [r1+r3*1], mm0              ; row 2
961cabdff1aSopenharmony_ci    movq [r1+r3*2], mm0              ; row 3
962cabdff1aSopenharmony_ci    movq [r2+r3*1], mm0              ; row 4
963cabdff1aSopenharmony_ci    movq [r2+r3*2], mm0              ; row 5
964cabdff1aSopenharmony_ci    movq [r4+r3*1], mm0              ; row 6
965cabdff1aSopenharmony_ci    movq [r4+r3*2], mm0              ; row 7
966cabdff1aSopenharmony_ci    RET
967cabdff1aSopenharmony_ci%endmacro
968cabdff1aSopenharmony_ci
; Emit one copy of the function per instruction-set flavour.
969cabdff1aSopenharmony_ciINIT_MMX mmxext
970cabdff1aSopenharmony_ciPRED8x8L_DC
971cabdff1aSopenharmony_ciINIT_MMX ssse3
972cabdff1aSopenharmony_ciPRED8x8L_DC
973cabdff1aSopenharmony_ci
974cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
975cabdff1aSopenharmony_ci; void ff_pred8x8l_horizontal_8(uint8_t *src, int has_topleft,
976cabdff1aSopenharmony_ci;                               int has_topright, ptrdiff_t stride)
977cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
978cabdff1aSopenharmony_ci
979cabdff1aSopenharmony_ci%macro PRED8x8L_HORIZONTAL 0
; 8x8 luma horizontal prediction: each row is filled with the low-pass
; filtered left-column pixel of that row.
; Arguments: r0 = src, r1 = has_topleft, r2 = has_topright (never read
; here), r3 = stride.
; r1 is turned into a pointer to the row used as "top-left" source: the row
; above the block when has_topleft is set, otherwise row 0 itself, so that
; left[0] stands in for the missing top-left pixel.
; NOTE(review): comments assume PALIGNR d, s, n, tmp behaves like palignr
; (d = (d:s) >> n bytes); the macro is defined outside this chunk.
980cabdff1aSopenharmony_cicglobal pred8x8l_horizontal_8, 4,4
981cabdff1aSopenharmony_ci    sub          r0, r3              ; r0 -> row above the block
982cabdff1aSopenharmony_ci    lea          r2, [r0+r3*2]
983cabdff1aSopenharmony_ci    movq        mm0, [r0+r3*1-8]     ; row 0: byte 7 = left[0]
984cabdff1aSopenharmony_ci    test        r1d, r1d             ; has_topleft?
985cabdff1aSopenharmony_ci    lea          r1, [r0+r3]         ; default: row 0 (lea leaves the flags from test intact)
986cabdff1aSopenharmony_ci    cmovnz       r1, r0              ; has_topleft != 0: use the row above instead
987cabdff1aSopenharmony_ci    punpckhbw   mm0, [r1+r3*0-8]     ; bytes 6,7 = left[0], topleft (or left[0] again)
988cabdff1aSopenharmony_ci    movq        mm1, [r2+r3*1-8]     ; row 2
989cabdff1aSopenharmony_ci    punpckhbw   mm1, [r0+r3*2-8]     ; pair with row 1
990cabdff1aSopenharmony_ci    mov          r2, r0              ; save the top-row pointer
991cabdff1aSopenharmony_ci    punpckhwd   mm1, mm0             ; bytes 4..7 = left[2], left[1], left[0], topleft
992cabdff1aSopenharmony_ci    lea          r0, [r0+r3*4]
993cabdff1aSopenharmony_ci    movq        mm2, [r0+r3*1-8]     ; row 4
994cabdff1aSopenharmony_ci    punpckhbw   mm2, [r0+r3*0-8]     ; pair with row 3
995cabdff1aSopenharmony_ci    lea          r0, [r0+r3*2]
996cabdff1aSopenharmony_ci    movq        mm3, [r0+r3*1-8]     ; row 6
997cabdff1aSopenharmony_ci    punpckhbw   mm3, [r0+r3*0-8]     ; pair with row 5
998cabdff1aSopenharmony_ci    punpckhwd   mm3, mm2             ; bytes 4..7 = left[6], left[5], left[4], left[3]
999cabdff1aSopenharmony_ci    punpckhdq   mm3, mm1             ; mm3 = left[6..0], topleft (byte 0 .. byte 7)
1000cabdff1aSopenharmony_ci    lea          r0, [r0+r3*2]
1001cabdff1aSopenharmony_ci    movq        mm0, [r0+r3*0-8]     ; row 7: byte 7 = left[7]
1002cabdff1aSopenharmony_ci    movq        mm1, [r1+r3*0-8]     ; 8 bytes ending at the chosen top-left position
1003cabdff1aSopenharmony_ci    mov          r0, r2              ; r0 back to the top row
1004cabdff1aSopenharmony_ci    movq        mm4, mm3
1005cabdff1aSopenharmony_ci    movq        mm2, mm3
1006cabdff1aSopenharmony_ci    PALIGNR     mm4, mm0, 7, mm0     ; mm4 = left[7..0]
1007cabdff1aSopenharmony_ci    PALIGNR     mm1, mm2, 1, mm2     ; mm1 = left[5..0], topleft, then byte 0 of the row loaded at L1002
1008cabdff1aSopenharmony_ci    movq        mm0, mm4             ; LOWPASS clobbers its 2nd/3rd operands; keep a copy
1009cabdff1aSopenharmony_ci    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 ; mm2 = filtered left[6..0], topleft
1010cabdff1aSopenharmony_ci    movq        mm4, mm0
1011cabdff1aSopenharmony_ci    movq        mm7, mm2
1012cabdff1aSopenharmony_ci    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 ; byte 0 of mm1 = filtered left[7]
1013cabdff1aSopenharmony_ci    psllq       mm1, 56              ; move that byte to byte 7 for the PALIGNR below
1014cabdff1aSopenharmony_ci    PALIGNR     mm7, mm1, 7, mm3     ; mm7 = filtered left[7..0] (byte 7 belongs to row 0)
1015cabdff1aSopenharmony_ci    movq        mm3, mm7
1016cabdff1aSopenharmony_ci    lea         r1, [r0+r3*2]        ; base for rows 2-3
1017cabdff1aSopenharmony_ci    movq       mm7, mm3              ; mm7 already equals mm3 here; this copy changes nothing
1018cabdff1aSopenharmony_ci    punpckhbw  mm3, mm3              ; bytes 4..7 doubled into words (rows 3..0)
1019cabdff1aSopenharmony_ci    punpcklbw  mm7, mm7              ; bytes 0..3 doubled into words (rows 7..4)
1020cabdff1aSopenharmony_ci    pshufw     mm0, mm3, 0xff        ; row 0 pattern
1021cabdff1aSopenharmony_ci    pshufw     mm1, mm3, 0xaa        ; row 1 pattern
1022cabdff1aSopenharmony_ci    lea         r2, [r1+r3*2]        ; base for rows 4-5
1023cabdff1aSopenharmony_ci    pshufw     mm2, mm3, 0x55        ; row 2 pattern
1024cabdff1aSopenharmony_ci    pshufw     mm3, mm3, 0x00        ; row 3 pattern
1025cabdff1aSopenharmony_ci    pshufw     mm4, mm7, 0xff        ; row 4 pattern
1026cabdff1aSopenharmony_ci    pshufw     mm5, mm7, 0xaa        ; row 5 pattern
1027cabdff1aSopenharmony_ci    pshufw     mm6, mm7, 0x55        ; row 6 pattern
1028cabdff1aSopenharmony_ci    pshufw     mm7, mm7, 0x00        ; row 7 pattern
1029cabdff1aSopenharmony_ci    movq [r0+r3*1], mm0              ; row 0
1030cabdff1aSopenharmony_ci    movq [r0+r3*2], mm1              ; row 1
1031cabdff1aSopenharmony_ci    movq [r1+r3*1], mm2              ; row 2
1032cabdff1aSopenharmony_ci    movq [r1+r3*2], mm3              ; row 3
1033cabdff1aSopenharmony_ci    movq [r2+r3*1], mm4              ; row 4
1034cabdff1aSopenharmony_ci    movq [r2+r3*2], mm5              ; row 5
1035cabdff1aSopenharmony_ci    lea         r0, [r2+r3*2]        ; base for rows 6-7
1036cabdff1aSopenharmony_ci    movq [r0+r3*1], mm6              ; row 6
1037cabdff1aSopenharmony_ci    movq [r0+r3*2], mm7              ; row 7
1038cabdff1aSopenharmony_ci    RET
1039cabdff1aSopenharmony_ci%endmacro
1040cabdff1aSopenharmony_ci
; Emit one copy of the function per instruction-set flavour.
1041cabdff1aSopenharmony_ciINIT_MMX mmxext
1042cabdff1aSopenharmony_ciPRED8x8L_HORIZONTAL
1043cabdff1aSopenharmony_ciINIT_MMX ssse3
1044cabdff1aSopenharmony_ciPRED8x8L_HORIZONTAL
1045cabdff1aSopenharmony_ci
1046cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
1047cabdff1aSopenharmony_ci; void ff_pred8x8l_vertical_8(uint8_t *src, int has_topleft, int has_topright,
1048cabdff1aSopenharmony_ci;                             ptrdiff_t stride)
1049cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
1050cabdff1aSopenharmony_ci
1051cabdff1aSopenharmony_ci%macro PRED8x8L_VERTICAL 0
; 8x8 luma vertical prediction: the low-pass filtered top row is copied into
; all 8 rows of the block.
; Arguments: r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride.
; NOTE(review): comments assume PALIGNR d, s, n, tmp behaves like palignr
; (d = (d:s) >> n bytes); the macro is defined outside this chunk.
1052cabdff1aSopenharmony_cicglobal pred8x8l_vertical_8, 4,4
1053cabdff1aSopenharmony_ci    sub          r0, r3              ; r0 -> row above the block
1054cabdff1aSopenharmony_ci    movq        mm0, [r0-8]          ; 8 bytes ending at the top-left pixel
1055cabdff1aSopenharmony_ci    movq        mm3, [r0]            ; top[0..7]
1056cabdff1aSopenharmony_ci    movq        mm1, [r0+8]          ; 8 bytes starting at the top-right pixel
1057cabdff1aSopenharmony_ci    movq        mm2, mm3
1058cabdff1aSopenharmony_ci    movq        mm4, mm3
1059cabdff1aSopenharmony_ci    PALIGNR     mm2, mm0, 7, mm0     ; mm2 = topleft, top[0..6]  (left neighbours)
1060cabdff1aSopenharmony_ci    PALIGNR     mm1, mm4, 1, mm4     ; mm1 = top[1..7], topright[0] (right neighbours)
1061cabdff1aSopenharmony_ci    test        r1d, r1d ; top_left
1062cabdff1aSopenharmony_ci    jz .fix_lt_2
1063cabdff1aSopenharmony_ci    test        r2d, r2d ; top_right
1064cabdff1aSopenharmony_ci    jz .fix_tr_1
1065cabdff1aSopenharmony_ci    jmp .body
1066cabdff1aSopenharmony_ci.fix_lt_2:                           ; no top-left: use top[0] as its own left neighbour
1067cabdff1aSopenharmony_ci    movq        mm5, mm3
1068cabdff1aSopenharmony_ci    pxor        mm5, mm2
1069cabdff1aSopenharmony_ci    psllq       mm5, 56
1070cabdff1aSopenharmony_ci    psrlq       mm5, 56              ; keep only byte 0
1071cabdff1aSopenharmony_ci    pxor        mm2, mm5             ; byte 0 of mm2 becomes top[0]
1072cabdff1aSopenharmony_ci    test        r2d, r2d ; top_right
1073cabdff1aSopenharmony_ci    jnz .body
1074cabdff1aSopenharmony_ci.fix_tr_1:                           ; no top-right: use top[7] as its own right neighbour
1075cabdff1aSopenharmony_ci    movq        mm5, mm3
1076cabdff1aSopenharmony_ci    pxor        mm5, mm1
1077cabdff1aSopenharmony_ci    psrlq       mm5, 56
1078cabdff1aSopenharmony_ci    psllq       mm5, 56              ; keep only byte 7
1079cabdff1aSopenharmony_ci    pxor        mm1, mm5             ; byte 7 of mm1 becomes top[7]
1080cabdff1aSopenharmony_ci.body:
1081cabdff1aSopenharmony_ci    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5 ; mm0 = filtered top row
1082cabdff1aSopenharmony_ci%rep 3
1083cabdff1aSopenharmony_ci    movq [r0+r3*1], mm0              ; rows 0-5, two per repetition
1084cabdff1aSopenharmony_ci    movq [r0+r3*2], mm0
1085cabdff1aSopenharmony_ci    lea    r0, [r0+r3*2]
1086cabdff1aSopenharmony_ci%endrep
1087cabdff1aSopenharmony_ci    movq [r0+r3*1], mm0              ; row 6
1088cabdff1aSopenharmony_ci    movq [r0+r3*2], mm0              ; row 7
1089cabdff1aSopenharmony_ci    RET
1090cabdff1aSopenharmony_ci%endmacro
1091cabdff1aSopenharmony_ci
; Emit one copy of the function per instruction-set flavour.
1092cabdff1aSopenharmony_ciINIT_MMX mmxext
1093cabdff1aSopenharmony_ciPRED8x8L_VERTICAL
1094cabdff1aSopenharmony_ciINIT_MMX ssse3
1095cabdff1aSopenharmony_ciPRED8x8L_VERTICAL
1096cabdff1aSopenharmony_ci
1097cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
1098cabdff1aSopenharmony_ci; void ff_pred8x8l_down_left_8(uint8_t *src, int has_topleft,
1099cabdff1aSopenharmony_ci;                              int has_topright, ptrdiff_t stride)
1100cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
1101cabdff1aSopenharmony_ci
1102cabdff1aSopenharmony_ci%macro PRED8x8L_DOWN_LEFT 0
; 8x8 luma diagonal down-left prediction.
; Arguments: r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride.
; Steps:
;   1. Filter the top row (MMX) -> low 8 bytes of xmm3.
;   2. Filter the top-right 8 pixels, or replicate top[7] when top-right is
;      unavailable -> high 8 bytes of xmm3.
;   3. Run a second low-pass over those 16 bytes (XMM) and write row y as
;      the 8 bytes starting at offset y+1 of the result.
; r1 and r2 are reused as row pointers after the flags have been consumed.
; NOTE(review): comments assume PALIGNR d, s, n, tmp behaves like palignr
; (d = (d:s) >> n bytes); the macro is defined outside this chunk.
1103cabdff1aSopenharmony_cicglobal pred8x8l_down_left_8, 4,4
1104cabdff1aSopenharmony_ci    sub          r0, r3              ; r0 -> row above the block
1105cabdff1aSopenharmony_ci    movq        mm0, [r0-8]          ; 8 bytes ending at the top-left pixel
1106cabdff1aSopenharmony_ci    movq        mm3, [r0]            ; top[0..7]
1107cabdff1aSopenharmony_ci    movq        mm1, [r0+8]          ; topright[0..7]
1108cabdff1aSopenharmony_ci    movq        mm2, mm3
1109cabdff1aSopenharmony_ci    movq        mm4, mm3
1110cabdff1aSopenharmony_ci    PALIGNR     mm2, mm0, 7, mm0     ; mm2 = topleft, top[0..6]  (left neighbours)
1111cabdff1aSopenharmony_ci    PALIGNR     mm1, mm4, 1, mm4     ; mm1 = top[1..7], topright[0] (right neighbours)
1112cabdff1aSopenharmony_ci    test        r1d, r1d ; top_left
1113cabdff1aSopenharmony_ci    jz .fix_lt_2
1114cabdff1aSopenharmony_ci    test        r2d, r2d ; top_right
1115cabdff1aSopenharmony_ci    jz .fix_tr_1
1116cabdff1aSopenharmony_ci    jmp .do_top
1117cabdff1aSopenharmony_ci.fix_lt_2:                           ; no top-left: use top[0] as its own left neighbour
1118cabdff1aSopenharmony_ci    movq        mm5, mm3
1119cabdff1aSopenharmony_ci    pxor        mm5, mm2
1120cabdff1aSopenharmony_ci    psllq       mm5, 56
1121cabdff1aSopenharmony_ci    psrlq       mm5, 56              ; keep only byte 0
1122cabdff1aSopenharmony_ci    pxor        mm2, mm5             ; byte 0 of mm2 becomes top[0]
1123cabdff1aSopenharmony_ci    test        r2d, r2d ; top_right
1124cabdff1aSopenharmony_ci    jnz .do_top
1125cabdff1aSopenharmony_ci.fix_tr_1:                           ; no top-right: use top[7] as its own right neighbour
1126cabdff1aSopenharmony_ci    movq        mm5, mm3
1127cabdff1aSopenharmony_ci    pxor        mm5, mm1
1128cabdff1aSopenharmony_ci    psrlq       mm5, 56
1129cabdff1aSopenharmony_ci    psllq       mm5, 56              ; keep only byte 7
1130cabdff1aSopenharmony_ci    pxor        mm1, mm5             ; byte 7 of mm1 becomes top[7]
1131cabdff1aSopenharmony_ci    jmp .do_top
1132cabdff1aSopenharmony_ci.fix_tr_2:                           ; no top-right: synthesize it from top[7]
1133cabdff1aSopenharmony_ci    punpckhbw   mm3, mm3             ; double bytes 4..7 of the unfiltered top row
1134cabdff1aSopenharmony_ci    pshufw      mm1, mm3, 0xFF       ; mm1 = top[7] in all 8 bytes
1135cabdff1aSopenharmony_ci    jmp .do_topright
1136cabdff1aSopenharmony_ci.do_top:
1137cabdff1aSopenharmony_ci    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 ; mm4 = filtered top row
1138cabdff1aSopenharmony_ci    movq2dq    xmm3, mm4             ; -> low half of xmm3
1139cabdff1aSopenharmony_ci    test        r2d, r2d ; top_right
1140cabdff1aSopenharmony_ci    jz .fix_tr_2
1141cabdff1aSopenharmony_ci    movq        mm0, [r0+8]          ; topright[0..7]
1142cabdff1aSopenharmony_ci    movq        mm5, mm0
1143cabdff1aSopenharmony_ci    movq        mm2, mm0
1144cabdff1aSopenharmony_ci    movq        mm4, mm0
1145cabdff1aSopenharmony_ci    psrlq       mm5, 56              ; topright[7] in byte 0
1146cabdff1aSopenharmony_ci    PALIGNR     mm2, mm3, 7, mm3     ; mm2 = top[7], topright[0..6] (left neighbours)
1147cabdff1aSopenharmony_ci    PALIGNR     mm5, mm4, 1, mm4     ; mm5 = topright[1..7], topright[7] (right neighbours)
1148cabdff1aSopenharmony_ci    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4 ; mm1 = filtered top-right
1149cabdff1aSopenharmony_ci.do_topright:
1150cabdff1aSopenharmony_ci    movq2dq    xmm4, mm1             ; top-right 8 bytes
1151cabdff1aSopenharmony_ci    psrlq       mm1, 56              ; its last byte, alone in byte 0
1152cabdff1aSopenharmony_ci    movq2dq    xmm5, mm1
1153cabdff1aSopenharmony_ci    lea         r1, [r0+r3*2]        ; base for rows 2-3
1154cabdff1aSopenharmony_ci    pslldq    xmm4, 8                ; move top-right to the high half
1155cabdff1aSopenharmony_ci    por       xmm3, xmm4             ; xmm3 = 16-byte edge: top | top-right
1156cabdff1aSopenharmony_ci    movdqa    xmm2, xmm3
1157cabdff1aSopenharmony_ci    psrldq    xmm2, 1                ; right neighbours...
1158cabdff1aSopenharmony_ci    pslldq    xmm5, 15
1159cabdff1aSopenharmony_ci    por       xmm2, xmm5             ; ...with the last byte repeated at position 15
1160cabdff1aSopenharmony_ci    lea         r2, [r1+r3*2]        ; base for rows 4-5
1161cabdff1aSopenharmony_ci    movdqa    xmm1, xmm3
1162cabdff1aSopenharmony_ci    pslldq    xmm1, 1                ; left neighbours (byte 0 becomes 0; that output byte is discarded below)
1163cabdff1aSopenharmony_ciINIT_XMM cpuname                     ; NOTE(review): presumably switches x86inc to XMM mode so mova in the macro below handles xmm operands - confirm
1164cabdff1aSopenharmony_ci    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4 ; second low-pass over all 16 edge bytes
1165cabdff1aSopenharmony_ci    psrldq    xmm0, 1                ; drop byte 0; each further shift moves one step along the diagonal
1166cabdff1aSopenharmony_ci    movq [r0+r3*1], xmm0             ; row 0
1167cabdff1aSopenharmony_ci    psrldq    xmm0, 1
1168cabdff1aSopenharmony_ci    movq [r0+r3*2], xmm0             ; row 1
1169cabdff1aSopenharmony_ci    psrldq    xmm0, 1
1170cabdff1aSopenharmony_ci    lea         r0, [r2+r3*2]        ; base for rows 6-7
1171cabdff1aSopenharmony_ci    movq [r1+r3*1], xmm0             ; row 2
1172cabdff1aSopenharmony_ci    psrldq    xmm0, 1
1173cabdff1aSopenharmony_ci    movq [r1+r3*2], xmm0             ; row 3
1174cabdff1aSopenharmony_ci    psrldq    xmm0, 1
1175cabdff1aSopenharmony_ci    movq [r2+r3*1], xmm0             ; row 4
1176cabdff1aSopenharmony_ci    psrldq    xmm0, 1
1177cabdff1aSopenharmony_ci    movq [r2+r3*2], xmm0             ; row 5
1178cabdff1aSopenharmony_ci    psrldq    xmm0, 1
1179cabdff1aSopenharmony_ci    movq [r0+r3*1], xmm0             ; row 6
1180cabdff1aSopenharmony_ci    psrldq    xmm0, 1
1181cabdff1aSopenharmony_ci    movq [r0+r3*2], xmm0             ; row 7
1182cabdff1aSopenharmony_ci    RET
1183cabdff1aSopenharmony_ci%endmacro
1184cabdff1aSopenharmony_ci
; Emit one copy per flavour. The macro starts in MMX mode and switches itself
; to XMM mode part-way through, so each instantiation re-issues INIT_MMX.
1185cabdff1aSopenharmony_ciINIT_MMX sse2
1186cabdff1aSopenharmony_ciPRED8x8L_DOWN_LEFT
1187cabdff1aSopenharmony_ciINIT_MMX ssse3
1188cabdff1aSopenharmony_ciPRED8x8L_DOWN_LEFT
1189cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; void ff_pred8x8l_down_right_8(uint8_t *src, int has_topleft,
;                               int has_topright, ptrdiff_t stride)
;
; In:  r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride
; Tmp: r4 (holds src - stride once the left column has been gathered);
;      r1 and r2 are reused as row pointers after their flags were tested.
;
; Notation used in the comments: register contents are listed
; most-significant byte first.  lt = top-left sample, lN = left sample of
; row N, tN = top sample of column N.  Upper case (LT, LN, TN) marks a value
; that went through PRED4x4_LOWPASS.
;
; NOTE(review): PRED4x4_LOWPASS (defined elsewhere in this file) and PALIGNR
; (x86util.asm) are not visible from here.  The comments assume
;   PRED4x4_LOWPASS dst, a, b, c, tmp : dst = (a + 2*c + b + 2) >> 2 per byte
;   PALIGNR dst, src, n, tmp          : dst = (dst:src) >> n bytes
; -- confirm against the macro definitions.
;-----------------------------------------------------------------------------

%macro PRED8x8L_DOWN_RIGHT 0
cglobal pred8x8l_down_right_8, 4,5
    ; Gather lt and l0..l6 into mm3: each movq reads the 8 bytes that end
    ; just left of a row, and the punpckh* chain keeps only their last byte.
    sub          r0, r3                 ; r0 = src - stride (row above block)
    lea          r4, [r0+r3*2]
    movq        mm0, [r0+r3-8]
    punpckhbw   mm0, [r0-8]
    movq        mm1, [r4+r3-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0                 ; r4 = src - stride from here on
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3-8]
    punpckhbw   mm2, [r0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3-8]
    punpckhbw   mm3, [r0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1                ; mm3 = lt l0 l1 l2 l3 l4 l5 l6
    lea          r0, [r0+r3*2]
    movq        mm0, [r0-8]             ; top byte = l7
    movq        mm1, [r4]               ; t7 .. t0
    mov          r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0        ; mm4 = l0 l1 l2 l3 l4 l5 l6 l7
    PALIGNR     mm1, mm2, 1, mm2        ; mm1 = t0 lt l0 l1 l2 l3 l4 l5
    test        r1d, r1d
    jnz .filter_left                    ; top-left present: nothing to patch
.left_no_topleft:
    ; No top-left sample: replace the lt slot of mm1 (byte 6) with l0.
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56                 ; lt ^ l0 in byte 0
    psllq       mm5, 48                 ; ... moved up to byte 6
    pxor        mm1, mm5
    jmp .filter_left
.top_no_topleft:
    ; No top-left sample: replace the lt slot of mm2 (byte 0) with t0.
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56                 ; keep byte 0 only (t0 ^ lt)
    pxor        mm2, mm5
    test        r2d, r2d
    jnz .filter_top
.top_no_topright:
    ; No top-right block: replace the t8 slot of mm1 (byte 7) with t7.
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56                 ; keep byte 7 only (t7 ^ t8)
    pxor        mm1, mm5
    jmp .filter_top
.filter_left:
    movq        mm0, mm4                ; mm4 is restored from this copy below
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 ; mm2 = LT L0 .. L6
    movq        mm4, mm0
    movq        mm7, mm2
    movq2dq    xmm3, mm2                ; xmm3 low half = LT L0 .. L6
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 ; byte 0 = L7 (l7 counted twice)
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3        ; mm7 = L0 L1 .. L7
    movq2dq    xmm1, mm7
    ; Load the top row together with its left and right neighbours.
    movq        mm0, [r0-8]
    movq        mm3, [r0]               ; t7 .. t0
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0        ; mm2 = t6 .. t0 lt
    PALIGNR     mm1, mm4, 1, mm4        ; mm1 = t8 t7 .. t1
    test        r1d, r1d
    jz .top_no_topleft
    test        r2d, r2d
    jz .top_no_topright
.filter_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 ; mm4 = T7 .. T0
    movq2dq   xmm4, mm4
    lea         r1, [r0+r3*2]
    movdqa    xmm0, xmm3
    pslldq    xmm4, 8
    por       xmm3, xmm4                ; xmm3 = T7..T0 LT L0..L6
    lea         r2, [r1+r3*2]
    pslldq    xmm4, 1
    por       xmm1, xmm4                ; T6..T0 above L0..L7, byte 8 still 0
    psrldq    xmm0, 7
    pslldq    xmm0, 15
    psrldq    xmm0, 7                   ; LT isolated in byte 8
    por       xmm1, xmm0                ; xmm1 = T6..T0 LT L0..L7
    lea         r0, [r2+r3*2]
    movdqa    xmm2, xmm3
    psrldq    xmm2, 1                   ; xmm2 = 0 T7..T0 LT L0..L5
INIT_XMM cpuname
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
    movdqa    xmm1, xmm0
    psrldq    xmm1, 1
    ; Rows are stored bottom-up.  xmm0 feeds the odd rows and xmm1 (one byte
    ; further along the edge) the even rows; both advance two bytes per pair.
    movq [r0+r3*2], xmm0                ; row 7
    movq [r0+r3], xmm1                  ; row 6
    psrldq    xmm0, 2
    psrldq    xmm1, 2
    movq [r2+r3*2], xmm0                ; row 5
    movq [r2+r3], xmm1                  ; row 4
    psrldq    xmm0, 2
    psrldq    xmm1, 2
    movq [r1+r3*2], xmm0                ; row 3
    movq [r1+r3], xmm1                  ; row 2
    psrldq    xmm0, 2
    psrldq    xmm1, 2
    movq [r4+r3*2], xmm0                ; row 1
    movq [r4+r3], xmm1                  ; row 0
    RET
%endmacro

INIT_MMX sse2
PRED8x8L_DOWN_RIGHT
INIT_MMX ssse3
PRED8x8L_DOWN_RIGHT
1309cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; void ff_pred8x8l_vertical_right_8(uint8_t *src, int has_topleft,
;                                   int has_topright, ptrdiff_t stride)
;
; In:  r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride
; Tmp: r4 (src - stride); r1 and r2 become row pointers after the flag tests.
;
; Notation used in the comments: register contents are listed
; most-significant byte first.  lt = top-left sample, lN = left sample of
; row N, tN = top sample of column N; upper case marks a value that went
; through PRED4x4_LOWPASS.
;
; NOTE(review): PRED4x4_LOWPASS and PALIGNR are defined outside this view.
; The comments assume
;   PRED4x4_LOWPASS dst, a, b, c, tmp : dst = (a + 2*c + b + 2) >> 2 per byte
;   PALIGNR dst, src, n, tmp          : dst = (dst:src) >> n bytes
; -- confirm against the macro definitions.
;-----------------------------------------------------------------------------

%macro PRED8x8L_VERTICAL_RIGHT 0
cglobal pred8x8l_vertical_right_8, 4,5,7
    ; The body starts in INIT_MMX mode, so the XMM registers it touches
    ; later on are spilled by hand for Win64.
    WIN64_SPILL_XMM 7
    ; Gather lt and l0..l6 into mm3 from the bytes just left of each row.
    sub          r0, r3                 ; r0 = src - stride (row above block)
    lea          r4, [r0+r3*2]
    movq        mm0, [r0+r3-8]
    punpckhbw   mm0, [r0-8]
    movq        mm1, [r4+r3-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0                 ; r4 = src - stride from here on
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3-8]
    punpckhbw   mm2, [r0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3-8]
    punpckhbw   mm3, [r0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1                ; mm3 = lt l0 l1 l2 l3 l4 l5 l6
    lea          r0, [r0+r3*2]
    movq        mm0, [r0-8]             ; top byte = l7
    movq        mm1, [r4]               ; t7 .. t0
    mov          r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0        ; mm4 = l0 l1 l2 l3 l4 l5 l6 l7
    PALIGNR     mm1, mm2, 1, mm2        ; mm1 = t0 lt l0 l1 l2 l3 l4 l5
    test        r1d, r1d
    jnz .filter_left
.left_no_topleft:
    ; No top-left sample: replace the lt slot of mm1 (byte 6) with l0.
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56                 ; lt ^ l0 in byte 0
    psllq       mm5, 48                 ; ... moved up to byte 6
    pxor        mm1, mm5
    jmp .filter_left
.top_no_topleft:
    ; No top-left sample: replace the lt slot of mm2 (byte 0) with t0.
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56                 ; keep byte 0 only (t0 ^ lt)
    pxor        mm2, mm5
    test        r2d, r2d
    jnz .filter_top
.top_no_topright:
    ; No top-right block: replace the t8 slot of mm1 (byte 7) with t7.
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56                 ; keep byte 7 only (t7 ^ t8)
    pxor        mm1, mm5
    jmp .filter_top
.filter_left:
    ; NOTE(review): mm0 is overwritten by the load from [r0-8] below before
    ; anything visible here reads it, so this copy looks unused -- confirm
    ; that PRED4x4_LOWPASS does not reference mm0 implicitly.
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 ; mm2 = LT L0 .. L6
    movq2dq    xmm0, mm2
    ; Load the top row together with its left and right neighbours.
    movq        mm0, [r0-8]
    movq        mm3, [r0]               ; t7 .. t0
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0        ; mm2 = t6 .. t0 lt
    PALIGNR     mm1, mm4, 1, mm4        ; mm1 = t8 t7 .. t1
    test        r1d, r1d
    jz .top_no_topleft
    test        r2d, r2d
    jz .top_no_topright
.filter_top:
    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5 ; mm6 = T7 .. T0
    lea           r1, [r0+r3*2]
    movq2dq     xmm4, mm6
    pslldq      xmm4, 8
    por         xmm0, xmm4              ; xmm0 = T7..T0 LT L0..L6 (the edge)
    movdqa      xmm6, [pw_ff00]
    movdqa      xmm1, xmm0
    lea           r2, [r1+r3*2]
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm0
    pslldq      xmm0, 1                 ; edge moved up by one byte
    pslldq      xmm1, 2                 ; edge moved up by two bytes
    pavgb       xmm2, xmm0              ; average of neighbouring edge bytes
INIT_XMM cpuname
    PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5
    ; Split the low-pass result into its even and odd bytes.
    pandn       xmm6, xmm4              ; low byte of every word
    movdqa      xmm5, xmm4
    psrlw       xmm4, 8                 ; high byte of every word
    packuswb    xmm6, xmm4              ; low half: even bytes, high: odd
    movhlps     xmm4, xmm6              ; odd bytes into the low half of xmm4
    movhps [r0+r3*2], xmm5              ; row 1 = upper half of the low-pass
    movhps [r0+r3], xmm2                ; row 0 = upper half of the averages
    ; Replace the low dword of each vector with de-interleaved low-pass
    ; bytes, then shift one byte per row pair while storing bottom-up.
    psrldq      xmm5, 4
    movss       xmm5, xmm6
    psrldq      xmm2, 4
    movss       xmm2, xmm4
    lea           r0, [r2+r3*2]
    psrldq      xmm5, 1
    psrldq      xmm2, 1
    movq        [r0+r3*2], xmm5         ; row 7
    movq        [r0+r3], xmm2           ; row 6
    psrldq      xmm5, 1
    psrldq      xmm2, 1
    movq        [r2+r3*2], xmm5         ; row 5
    movq        [r2+r3], xmm2           ; row 4
    psrldq      xmm5, 1
    psrldq      xmm2, 1
    movq        [r1+r3*2], xmm5         ; row 3
    movq        [r1+r3], xmm2           ; row 2
    RET
%endmacro

INIT_MMX sse2
PRED8x8L_VERTICAL_RIGHT
INIT_MMX ssse3
PRED8x8L_VERTICAL_RIGHT
1430cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; void ff_pred8x8l_vertical_left_8(uint8_t *src, int has_topleft,
;                                  int has_topright, ptrdiff_t stride)
;
; In:  r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride
;      r1 and r2 become row pointers once their flags have been tested.
;
; Notation used in the comments: register contents are listed
; most-significant byte first.  lt = top-left sample, tN = sample N of the
; row above the block (t8..t15 belong to the top-right block); upper case
; marks a value that went through PRED4x4_LOWPASS.
;
; NOTE(review): PRED4x4_LOWPASS and PALIGNR are defined outside this view.
; The comments assume
;   PRED4x4_LOWPASS dst, a, b, c, tmp : dst = (a + 2*c + b + 2) >> 2 per byte
;   PALIGNR dst, src, n, tmp          : dst = (dst:src) >> n bytes
; -- confirm against the macro definitions.
;-----------------------------------------------------------------------------

%macro PRED8x8L_VERTICAL_LEFT 0
cglobal pred8x8l_vertical_left_8, 4,4
    sub          r0, r3                 ; r0 = src - stride (row above block)
    movq        mm0, [r0-8]
    movq        mm3, [r0]               ; t7 .. t0
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0        ; mm2 = t6 .. t0 lt
    PALIGNR     mm1, mm4, 1, mm4        ; mm1 = t8 t7 .. t1
    test        r1d, r1d
    jz .top_no_topleft
    test        r2d, r2d
    jz .top_no_topright
    jmp .filter_top
.top_no_topleft:
    ; No top-left sample: replace the lt slot of mm2 (byte 0) with t0.
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56                 ; keep byte 0 only (t0 ^ lt)
    pxor        mm2, mm5
    test        r2d, r2d
    jnz .filter_top
.top_no_topright:
    ; No top-right block: replace the t8 slot of mm1 (byte 7) with t7.
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56                 ; keep byte 7 only (t7 ^ t8)
    pxor        mm1, mm5
    jmp .filter_top
.fake_topright:
    ; No top-right block: fill all eight top-right slots with byte 7 of mm3
    ; (the last top sample as loaded from memory).
    punpckhbw   mm3, mm3
    pshufw      mm1, mm3, 0xFF
    jmp .merge_top
.filter_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 ; mm4 = T7 .. T0
    movq2dq    xmm4, mm4
    test        r2d, r2d
    jz .fake_topright
    ; Filter t8..t15; the final sample t15 stands in for its missing
    ; right-hand neighbour.
    movq        mm0, [r0+8]             ; t15 .. t8
    movq        mm5, mm0
    movq        mm2, mm0
    movq        mm4, mm0
    psrlq       mm5, 56                 ; t15 alone in byte 0
    PALIGNR     mm2, mm3, 7, mm3        ; mm2 = t14 .. t8 t7
    PALIGNR     mm5, mm4, 1, mm4        ; mm5 = t15 t15 t14 .. t9
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.merge_top:
    movq2dq   xmm3, mm1
    lea         r1, [r0+r3*2]
    pslldq    xmm3, 8
    por       xmm4, xmm3                ; xmm4 = top-right half : T7 .. T0
    movdqa    xmm2, xmm4
    movdqa    xmm1, xmm4
    movdqa    xmm3, xmm4
    psrldq    xmm2, 1                   ; edge moved down by one byte
    pslldq    xmm1, 1                   ; edge moved up by one byte
    pavgb     xmm3, xmm2                ; average of neighbouring edge bytes
    lea         r2, [r1+r3*2]
INIT_XMM cpuname
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5
    psrldq    xmm0, 1
    ; Even rows come from the averages (xmm3), odd rows from the low-pass
    ; (xmm0); both advance one byte along the edge per row pair.
    movq [r0+r3], xmm3                  ; row 0
    movq [r0+r3*2], xmm0                ; row 1
    lea         r0, [r2+r3*2]
    psrldq    xmm3, 1
    psrldq    xmm0, 1
    movq [r1+r3], xmm3                  ; row 2
    movq [r1+r3*2], xmm0                ; row 3
    psrldq    xmm3, 1
    psrldq    xmm0, 1
    movq [r2+r3], xmm3                  ; row 4
    movq [r2+r3*2], xmm0                ; row 5
    psrldq    xmm3, 1
    psrldq    xmm0, 1
    movq [r0+r3], xmm3                  ; row 6
    movq [r0+r3*2], xmm0                ; row 7
    RET
%endmacro

INIT_MMX sse2
PRED8x8L_VERTICAL_LEFT
INIT_MMX ssse3
PRED8x8L_VERTICAL_LEFT
1520cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; void ff_pred8x8l_horizontal_up_8(uint8_t *src, int has_topleft,
;                                  int has_topright, ptrdiff_t stride)
;
; In:  r0 = src, r1 = has_topleft, r2 = has_topright (never read; r2 is
;      reused as a pointer straight away), r3 = stride
;
; Only the left column is used.  Register contents in the comments are
; listed most-significant byte first; lN = left sample of row N.
;
; NOTE(review): PRED4x4_LOWPASS and PALIGNR are defined outside this view.
; The comments assume
;   PRED4x4_LOWPASS dst, a, b, c, tmp : dst = (a + 2*c + b + 2) >> 2 per byte
;   PALIGNR dst, src, n, tmp          : dst = (dst:src) >> n bytes
; -- confirm against the macro definitions.
;-----------------------------------------------------------------------------

%macro PRED8x8L_HORIZONTAL_UP 0
cglobal pred8x8l_horizontal_up_8, 4,4
    sub          r0, r3                 ; r0 = src - stride (row above block)
    lea          r2, [r0+r3*2]
    movq        mm0, [r0+r3-8]
    ; r1 = row above the block when has_topleft is set, else row 0, so the
    ; neighbour paired with l0 is never read from a missing top-left.
    ; lea leaves the flags from test untouched for the cmov.
    test        r1d, r1d
    lea          r1, [r0+r3]
    cmovnz       r1, r0
    punpckhbw   mm0, [r1-8]
    movq        mm1, [r2+r3-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r2, r0                 ; r2 = src - stride for later
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3-8]
    punpckhbw   mm2, [r0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3-8]
    punpckhbw   mm3, [r0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1                ; left column gathered, l6 in byte 0
    lea          r0, [r0+r3*2]
    movq        mm0, [r0-8]             ; top byte = l7
    movq        mm1, [r1-8]
    mov          r0, r2
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0        ; column moved one sample towards l7
    PALIGNR     mm1, mm2, 1, mm2        ; column moved one sample towards l0
    movq       mm0, mm4                 ; mm4 is restored from this copy below
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq       mm4, mm0
    movq       mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 ; byte 0 = filtered l7
    psllq      mm1, 56
    PALIGNR    mm7, mm1, 7, mm3         ; filtered l0 .. l7 (l7 in byte 0)
    lea         r1, [r0+r3*2]
    ; From here on lN names the filtered samples.
    pshufw     mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
    psllq      mm7, 56             ; l7 .. .. .. .. .. .. ..
    movq       mm2, mm0
    psllw      mm0, 8
    psrlw      mm2, 8
    por        mm2, mm0            ; l7 l6 l5 l4 l3 l2 l1 l0
    movq       mm3, mm2
    movq       mm4, mm2
    movq       mm5, mm2
    psrlq      mm2, 8
    psrlq      mm3, 16
    lea         r2, [r1+r3*2]
    por        mm2, mm7            ; l7 l7 l6 l5 l4 l3 l2 l1
    punpckhbw  mm7, mm7
    por        mm3, mm7            ; l7 l7 l7 l6 l5 l4 l3 l2
    pavgb      mm4, mm2
    PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6
    ; Interleave the averages with the low-pass values.
    movq       mm5, mm4
    punpcklbw  mm4, mm1            ; p4 p3 p2 p1
    punpckhbw  mm5, mm1            ; p8 p7 p6 p5
    movq       mm6, mm5
    movq       mm7, mm5
    movq       mm0, mm5
    ; Each row starts one 16-bit pair later than the previous one; rows 5-7
    ; are padded by repeating the top word of mm5.
    PALIGNR    mm5, mm4, 2, mm1
    pshufw     mm1, mm6, 11111001b
    PALIGNR    mm6, mm4, 4, mm2
    pshufw     mm2, mm7, 11111110b
    PALIGNR    mm7, mm4, 6, mm3
    pshufw     mm3, mm0, 11111111b
    movq [r0+r3], mm4                   ; row 0
    movq [r0+r3*2], mm5                 ; row 1
    lea         r0, [r2+r3*2]
    movq [r1+r3], mm6                   ; row 2
    movq [r1+r3*2], mm7                 ; row 3
    movq [r2+r3], mm0                   ; row 4
    movq [r2+r3*2], mm1                 ; row 5
    movq [r0+r3], mm2                   ; row 6
    movq [r0+r3*2], mm3                 ; row 7
    RET
%endmacro

INIT_MMX mmxext
PRED8x8L_HORIZONTAL_UP
INIT_MMX ssse3
PRED8x8L_HORIZONTAL_UP
1608cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; void ff_pred8x8l_horizontal_down_8(uint8_t *src, int has_topleft,
;                                    int has_topright, ptrdiff_t stride)
;
; In:  r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride
; Tmp: r4 (src - stride); r1 and r2 become row pointers after the flag tests.
;
; Notation used in the comments: register contents are listed
; most-significant byte first.  lt = top-left sample, lN = left sample of
; row N, tN = top sample of column N; upper case marks a value that went
; through PRED4x4_LOWPASS.
;
; NOTE(review): PRED4x4_LOWPASS and PALIGNR are defined outside this view.
; The comments assume
;   PRED4x4_LOWPASS dst, a, b, c, tmp : dst = (a + 2*c + b + 2) >> 2 per byte
;   PALIGNR dst, src, n, tmp          : dst = (dst:src) >> n bytes
; -- confirm against the macro definitions.
;-----------------------------------------------------------------------------

%macro PRED8x8L_HORIZONTAL_DOWN 0
cglobal pred8x8l_horizontal_down_8, 4,5
    ; Gather lt and l0..l6 into mm3 from the bytes just left of each row.
    sub          r0, r3                 ; r0 = src - stride (row above block)
    lea          r4, [r0+r3*2]
    movq        mm0, [r0+r3-8]
    punpckhbw   mm0, [r0-8]
    movq        mm1, [r4+r3-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0                 ; r4 = src - stride from here on
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3-8]
    punpckhbw   mm2, [r0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3-8]
    punpckhbw   mm3, [r0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1                ; mm3 = lt l0 l1 l2 l3 l4 l5 l6
    lea          r0, [r0+r3*2]
    movq        mm0, [r0-8]             ; top byte = l7
    movq        mm1, [r4]               ; t7 .. t0
    mov          r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0        ; mm4 = l0 l1 l2 l3 l4 l5 l6 l7
    PALIGNR     mm1, mm2, 1, mm2        ; mm1 = t0 lt l0 l1 l2 l3 l4 l5
    test        r1d, r1d
    jnz .filter_left
.left_no_topleft:
    ; No top-left sample: replace the lt slot of mm1 (byte 6) with l0.
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56                 ; lt ^ l0 in byte 0
    psllq       mm5, 48                 ; ... moved up to byte 6
    pxor        mm1, mm5
    jmp .filter_left
.top_no_topleft:
    ; No top-left sample: replace the lt slot of mm2 (byte 0) with t0.
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56                 ; keep byte 0 only (t0 ^ lt)
    pxor        mm2, mm5
    test        r2d, r2d
    jnz .filter_top
.top_no_topright:
    ; No top-right block: replace the t8 slot of mm1 (byte 7) with t7.
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56                 ; keep byte 7 only (t7 ^ t8)
    pxor        mm1, mm5
    jmp .filter_top
.fake_topright:
    ; No top-right block: fill all eight top-right slots with byte 7 of mm3
    ; (the last top sample as loaded from memory).
    punpckhbw   mm3, mm3
    pshufw      mm1, mm3, 0xFF
    jmp .merge_top
.filter_left:
    movq        mm0, mm4                ; mm4 is restored from this copy below
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 ; mm2 = LT L0 .. L6
    movq2dq    xmm0, mm2
    pslldq     xmm0, 8                  ; LT L0..L6 into the upper half
    movq        mm4, mm0
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 ; byte 0 = L7 (l7 counted twice)
    movq2dq    xmm2, mm1
    pslldq     xmm2, 15
    psrldq     xmm2, 8                  ; L7 alone in byte 7
    por        xmm0, xmm2               ; xmm0 = LT L0..L7, low 7 bytes zero
    ; Load the top row together with its left and right neighbours.
    movq        mm0, [r0-8]
    movq        mm3, [r0]               ; t7 .. t0
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0        ; mm2 = t6 .. t0 lt
    PALIGNR     mm1, mm4, 1, mm4        ; mm1 = t8 t7 .. t1
    test        r1d, r1d
    jz .top_no_topleft
    test        r2d, r2d
    jz .top_no_topright
.filter_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 ; mm4 = T7 .. T0
    movq2dq    xmm1, mm4
    test        r2d, r2d
    jz .fake_topright
    ; Filter t8..t15; the final sample t15 stands in for its missing
    ; right-hand neighbour.
    movq        mm0, [r0+8]             ; t15 .. t8
    movq        mm5, mm0
    movq        mm2, mm0
    movq        mm4, mm0
    psrlq       mm5, 56                 ; t15 alone in byte 0
    PALIGNR     mm2, mm3, 7, mm3        ; mm2 = t14 .. t8 t7
    PALIGNR     mm5, mm4, 1, mm4        ; mm5 = t15 t15 t14 .. t9
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.merge_top:
    movq2dq    xmm5, mm1
    pslldq     xmm5, 8
    por        xmm1, xmm5               ; xmm1 = top-right half : T7 .. T0
INIT_XMM cpuname
    lea         r2, [r4+r3*2]
    movdqa    xmm2, xmm1
    movdqa    xmm3, xmm1
    ; Three windows onto the edge, one byte apart.
    PALIGNR   xmm1, xmm0, 7, xmm4       ; T6..T0 LT L0..L7
    PALIGNR   xmm2, xmm0, 9, xmm5       ; T8 T7..T0 LT L0..L5
    lea         r1, [r2+r3*2]
    PALIGNR   xmm3, xmm0, 8, xmm0       ; T7..T0 LT L0..L6
    movdqa    xmm4, xmm1
    pavgb     xmm4, xmm3                ; average of neighbouring edge bytes
    lea         r0, [r1+r3*2]
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5
    punpcklbw xmm4, xmm0                ; interleave low averages / low-pass
    movhlps   xmm0, xmm4                ; upper 8 interleaved bytes
    ; xmm4 feeds rows 7..4 and xmm0 rows 3..0, two bytes further per row.
    movq   [r0+r3*2], xmm4              ; row 7
    movq   [r2+r3*2], xmm0              ; row 3
    psrldq xmm4, 2
    psrldq xmm0, 2
    movq   [r0+r3], xmm4                ; row 6
    movq   [r2+r3], xmm0                ; row 2
    psrldq xmm4, 2
    psrldq xmm0, 2
    movq   [r1+r3*2], xmm4              ; row 5
    movq   [r4+r3*2], xmm0              ; row 1
    psrldq xmm4, 2
    psrldq xmm0, 2
    movq   [r1+r3], xmm4                ; row 4
    movq   [r4+r3], xmm0                ; row 0
    RET
%endmacro

INIT_MMX sse2
PRED8x8L_HORIZONTAL_DOWN
INIT_MMX ssse3
PRED8x8L_HORIZONTAL_DOWN
1742cabdff1aSopenharmony_ci
;-------------------------------------------------------------------------------
; void ff_pred4x4_dc_8_mmxext(uint8_t *src, const uint8_t *topright,
;                             ptrdiff_t stride)
;
; Fills the 4x4 block with (sum of 4 top bytes + sum of 4 left bytes + 4) >> 3.
;
; In:  r0 = src, r1 = topright (not read; r1 is reused as scratch), r2 = stride
; Tmp: r3 (running sum, then the result splatted to four bytes),
;      r4 (copy of src for the first output row)
;-------------------------------------------------------------------------------

INIT_MMX mmxext
cglobal pred4x4_dc_8, 3,5
    mov     r4, r0                      ; keep row 0 for the final stores
    sub     r0, r2                      ; r0 = row above the block
    pxor   mm7, mm7
    movd   mm0, [r0]                    ; four top bytes, upper half zero
    psadbw mm0, mm7                     ; SAD against zero = sum of top bytes
    movd   r3d, mm0
    movzx  r1d, byte [r0+r2-1]          ; left sample of row 0
    add    r3d, r1d
    movzx  r1d, byte [r0+r2*2-1]        ; left sample of row 1
    lea     r0, [r0+r2*2]               ; r0 = row 1
    add    r3d, r1d
    movzx  r1d, byte [r0+r2-1]          ; left sample of row 2
    add    r3d, r1d
    movzx  r1d, byte [r0+r2*2-1]        ; left sample of row 3
    add    r3d, r1d
    add    r3d, 4                       ; rounding term
    shr    r3d, 3
    imul   r3d, 0x01010101              ; replicate the byte four times
    mov   [r4], r3d                     ; row 0
    mov   [r0], r3d                     ; row 1
    mov   [r0+r2], r3d                  ; row 2
    mov   [r0+r2*2], r3d                ; row 3
    RET
1773cabdff1aSopenharmony_ci
;-----------------------------------------------------------------------------
; void ff_pred4x4_tm_vp8_8_mmxext(uint8_t *src, const uint8_t *topright,
;                                 ptrdiff_t stride)
;
; Each output byte is top[x] + left[y] - topleft, saturated to 0..255 by
; packuswb.
;
; In:  r0 = src, r1 = topright (not read; r1 is reused as scratch), r2 = stride
; Tmp: r3 (scratch), r4 (top-left sample), r5 (loop counter)
;-----------------------------------------------------------------------------

INIT_MMX mmxext
cglobal pred4x4_tm_vp8_8, 3,6
    pxor      mm7, mm7
    sub        r0, r2                   ; r0 = row above the block
    movd      mm0, [r0]
    punpcklbw mm0, mm7                  ; top row widened to four words
    movzx     r4d, byte [r0-1]          ; top-left sample
    mov       r5d, 2                    ; two passes, two rows per pass
.row_pair:
    movzx     r1d, byte [r0+r2-1]       ; left sample, first row of the pair
    movzx     r3d, byte [r0+r2*2-1]     ; left sample, second row of the pair
    sub       r1d, r4d                  ; left - topleft
    sub       r3d, r4d
    movd      mm2, r1d
    movd      mm4, r3d
    pshufw    mm2, mm2, 0               ; broadcast the difference to 4 words
    pshufw    mm4, mm4, 0
    paddw     mm2, mm0                  ; + top
    paddw     mm4, mm0
    packuswb  mm2, mm2                  ; saturate back to bytes
    packuswb  mm4, mm4
    movd [r0+r2], mm2
    movd [r0+r2*2], mm4
    lea        r0, [r0+r2*2]            ; advance two rows
    dec       r5d
    jg .row_pair
    REP_RET
1806cabdff1aSopenharmony_ci
; SSSE3 variant of pred4x4_tm_vp8_8: all four rows are computed at once.
; Each output byte is top[x] + left[y] - topleft, saturated by packuswb.
;
; In:  r0 = src, r1 = topright (not read; r1 is reused as a row pointer),
;      r2 = stride
;
; tm_shuf repeats the byte pair (0x03, 0x80), so pshufb with it turns byte 3
; of a dword load into four zero-extended words.  Every edge sample is
; therefore loaded from 4 bytes before its row start, which places the byte
; at offset -1 in byte 3.
;
; NOTE(review): declared under INIT_XMM although only mm registers are named
; explicitly -- confirm this is intentional.
INIT_XMM ssse3
cglobal pred4x4_tm_vp8_8, 3,3
    sub         r0, r2                  ; r0 = row above the block
    movq       mm6, [tm_shuf]
    pxor       mm1, mm1
    movd       mm0, [r0]
    punpcklbw  mm0, mm1                 ; top row widened to four words
    movd       mm7, [r0-4]
    pshufb     mm7, mm6                 ; top-left sample in every word
    lea         r1, [r0+r2*2]           ; r1 = row 1
    movd       mm2, [r0+r2-4]           ; byte 3 = left sample of row 0
    movd       mm3, [r0+r2*2-4]         ; ... row 1
    movd       mm4, [r1+r2-4]           ; ... row 2
    movd       mm5, [r1+r2*2-4]         ; ... row 3
    pshufb     mm2, mm6
    pshufb     mm3, mm6
    pshufb     mm4, mm6
    pshufb     mm5, mm6
    psubw      mm0, mm7                 ; top - topleft, shared by all rows
    paddw      mm2, mm0
    paddw      mm3, mm0
    paddw      mm4, mm0
    paddw      mm5, mm0
    packuswb   mm2, mm2                 ; saturate back to bytes
    packuswb   mm3, mm3
    packuswb   mm4, mm4
    packuswb   mm5, mm5
    movd [r0+r2], mm2                   ; row 0
    movd [r0+r2*2], mm3                 ; row 1
    movd [r1+r2], mm4                   ; row 2
    movd [r1+r2*2], mm5                 ; row 3
    RET
1839cabdff1aSopenharmony_ci
1840cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
1841cabdff1aSopenharmony_ci; void ff_pred4x4_vertical_vp8_8_mmxext(uint8_t *src, const uint8_t *topright,
1842cabdff1aSopenharmony_ci;                                       ptrdiff_t stride)
1843cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
1844cabdff1aSopenharmony_ci
1845cabdff1aSopenharmony_ciINIT_MMX mmxext
    ; Builds one filtered copy of the top row and stores it to all four rows.
    ; Inputs to the filter are the top row and the same row shifted by one pixel
    ; in each direction (top-left on one side, first topright byte on the other).
    ; PRED4x4_LOWPASS is defined outside this chunk; presumably it computes
    ; (a + 2*c + b + 2) >> 2 with args (dst, a, b, c, tmp) - confirm.
    ; r0/r1/r2 are assumed to hold src/topright/stride (x86inc naming - confirm).
    ; Byte diagrams below list the lowest byte first.
1846cabdff1aSopenharmony_cicglobal pred4x4_vertical_vp8_8, 3,3
1847cabdff1aSopenharmony_ci    sub       r0, r2   ; r0 = src - stride: the row above the block
1848cabdff1aSopenharmony_ci    movd      m1, [r0-1] ; lt t0 t1 t2
1849cabdff1aSopenharmony_ci    movd      m0, [r0]
1850cabdff1aSopenharmony_ci    mova      m2, m0   ;t0 t1 t2 t3
1851cabdff1aSopenharmony_ci    punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7
1852cabdff1aSopenharmony_ci    lea       r1, [r0+r2*2] ; topright is consumed; r1 now = r0 + 2*stride
1853cabdff1aSopenharmony_ci    psrlq     m0, 8    ;t1 t2 t3 t4
1854cabdff1aSopenharmony_ci    PRED4x4_LOWPASS m3, m1, m0, m2, m4 ; m3 = filtered top row (m2 passed as the centre operand)
1855cabdff1aSopenharmony_ci    movd [r0+r2*1], m3 ; the same four bytes go to rows 0..3
1856cabdff1aSopenharmony_ci    movd [r0+r2*2], m3
1857cabdff1aSopenharmony_ci    movd [r1+r2*1], m3
1858cabdff1aSopenharmony_ci    movd [r1+r2*2], m3
1859cabdff1aSopenharmony_ci    RET
1860cabdff1aSopenharmony_ci
1861cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
1862cabdff1aSopenharmony_ci; void ff_pred4x4_down_left_8_mmxext(uint8_t *src, const uint8_t *topright,
1863cabdff1aSopenharmony_ci;                                    ptrdiff_t stride)
1864cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
1865cabdff1aSopenharmony_ciINIT_MMX mmxext
    ; Diagonal down-left prediction from the eight pixels t0..t7 (four above the
    ; block plus four read through r1). One filtered vector is computed and each
    ; row stores it shifted by one more byte than the previous row.
    ; PRED4x4_LOWPASS is defined outside this chunk; presumably it computes
    ; (a + 2*c + b + 2) >> 2 with args (dst, a, b, c, tmp) - confirm.
    ; r0/r1/r2 are assumed to hold src/topright/stride (x86inc naming - confirm).
    ; Byte diagrams below list the lowest byte first.
1866cabdff1aSopenharmony_cicglobal pred4x4_down_left_8, 3,3
1867cabdff1aSopenharmony_ci    sub       r0, r2        ; r0 = src - stride: the row above the block
1868cabdff1aSopenharmony_ci    movq      m1, [r0]
1869cabdff1aSopenharmony_ci    punpckldq m1, [r1]      ; m1 = t0 t1 t2 t3 t4 t5 t6 t7
1870cabdff1aSopenharmony_ci    movq      m2, m1
1871cabdff1aSopenharmony_ci    movq      m3, m1        ; m3 = unshifted copy, t0..t7
1872cabdff1aSopenharmony_ci    psllq     m1, 8         ; m1 = 0 t0 t1 t2 t3 t4 t5 t6
1873cabdff1aSopenharmony_ci    pxor      m2, m1        ; next three ops build t1..t7 t7, i.e. a one-byte
1874cabdff1aSopenharmony_ci    psrlq     m2, 8         ; shift down that repeats t7 in the top byte instead
1875cabdff1aSopenharmony_ci    pxor      m2, m3        ; of shifting in zero: m2 = t1 t2 t3 t4 t5 t6 t7 t7
1876cabdff1aSopenharmony_ci    PRED4x4_LOWPASS m0, m1, m2, m3, m4 ; m0 = filtered row; byte 0 used the shifted-in zero and is never stored
1877cabdff1aSopenharmony_ci    lea       r1, [r0+r2*2] ; r1 = r0 + 2*stride, used to reach rows 2 and 3
1878cabdff1aSopenharmony_ci    psrlq     m0, 8         ; drop byte 0
1879cabdff1aSopenharmony_ci    movd      [r0+r2*1], m0 ; row 0 = filtered bytes 1..4
1880cabdff1aSopenharmony_ci    psrlq     m0, 8
1881cabdff1aSopenharmony_ci    movd      [r0+r2*2], m0 ; row 1 = filtered bytes 2..5
1882cabdff1aSopenharmony_ci    psrlq     m0, 8
1883cabdff1aSopenharmony_ci    movd      [r1+r2*1], m0 ; row 2 = filtered bytes 3..6
1884cabdff1aSopenharmony_ci    psrlq     m0, 8
1885cabdff1aSopenharmony_ci    movd      [r1+r2*2], m0 ; row 3 = filtered bytes 4..7
1886cabdff1aSopenharmony_ci    RET
1887cabdff1aSopenharmony_ci
1888cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
1889cabdff1aSopenharmony_ci; void ff_pred4x4_vertical_left_8_mmxext(uint8_t *src, const uint8_t *topright,
1890cabdff1aSopenharmony_ci;                                        ptrdiff_t stride)
1891cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
1892cabdff1aSopenharmony_ci
1893cabdff1aSopenharmony_ciINIT_MMX mmxext
    ; Vertical-left prediction from t0..t7 (four pixels above the block plus four
    ; read through r1). Even rows use the rounded average of adjacent pixels, odd
    ; rows use the three-tap filtered pixels; rows 2 and 3 are rows 0 and 1
    ; shifted by one pixel.
    ; PRED4x4_LOWPASS is defined outside this chunk; presumably it computes
    ; (a + 2*c + b + 2) >> 2 with args (dst, a, b, c, tmp) - confirm.
    ; r0/r1/r2 are assumed to hold src/topright/stride (x86inc naming - confirm).
    ; Byte diagrams below list the lowest byte first.
1894cabdff1aSopenharmony_cicglobal pred4x4_vertical_left_8, 3,3
1895cabdff1aSopenharmony_ci    sub       r0, r2        ; r0 = src - stride: the row above the block
1896cabdff1aSopenharmony_ci    movq      m1, [r0]
1897cabdff1aSopenharmony_ci    punpckldq m1, [r1]      ; m1 = t0 t1 t2 t3 t4 t5 t6 t7
1898cabdff1aSopenharmony_ci    movq      m3, m1
1899cabdff1aSopenharmony_ci    movq      m2, m1
1900cabdff1aSopenharmony_ci    psrlq     m3, 8         ; m3 = t1 t2 t3 t4 t5 t6 t7 0
1901cabdff1aSopenharmony_ci    psrlq     m2, 16        ; m2 = t2 t3 t4 t5 t6 t7 0 0
1902cabdff1aSopenharmony_ci    movq      m4, m3
1903cabdff1aSopenharmony_ci    pavgb     m4, m1        ; m4[i] = rounded average of t[i] and t[i+1]
1904cabdff1aSopenharmony_ci    PRED4x4_LOWPASS m0, m1, m2, m3, m5 ; m0[i] = filter over t[i], t[i+1], t[i+2] (m3 is the centre operand)
1905cabdff1aSopenharmony_ci    lea       r1, [r0+r2*2] ; r1 = r0 + 2*stride, used to reach rows 2 and 3
1906cabdff1aSopenharmony_ci    movh      [r0+r2*1], m4 ; row 0 = averages, starting at pixel 0
1907cabdff1aSopenharmony_ci    movh      [r0+r2*2], m0 ; row 1 = filtered, starting at pixel 0
1908cabdff1aSopenharmony_ci    psrlq     m4, 8
1909cabdff1aSopenharmony_ci    psrlq     m0, 8
1910cabdff1aSopenharmony_ci    movh      [r1+r2*1], m4 ; row 2 = averages, starting at pixel 1
1911cabdff1aSopenharmony_ci    movh      [r1+r2*2], m0 ; row 3 = filtered, starting at pixel 1
1912cabdff1aSopenharmony_ci    RET
1913cabdff1aSopenharmony_ci
1914cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
1915cabdff1aSopenharmony_ci; void ff_pred4x4_horizontal_up_8_mmxext(uint8_t *src, const uint8_t *topright,
1916cabdff1aSopenharmony_ci;                                        ptrdiff_t stride)
1917cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
1918cabdff1aSopenharmony_ci
1919cabdff1aSopenharmony_ciINIT_MMX mmxext
    ; Horizontal-up prediction from the four left neighbours l0..l3 (l0 beside
    ; row 0). Output pixels alternate between the rounded average of adjacent
    ; left pixels and the three-tap filtered value; the last row is l3 repeated.
    ; PRED4x4_LOWPASS is defined outside this chunk; presumably it computes
    ; (a + 2*c + b + 2) >> 2 with args (dst, a, b, c, tmp) - confirm.
    ; r0/r2 are assumed to hold src/stride (x86inc naming - confirm); the
    ; topright argument is not read and r1 is reused as a row pointer.
    ; Byte diagrams below list the lowest byte first.
1920cabdff1aSopenharmony_cicglobal pred4x4_horizontal_up_8, 3,3
1921cabdff1aSopenharmony_ci    sub       r0, r2            ; r0 = src - stride: the row above the block
1922cabdff1aSopenharmony_ci    lea       r1, [r0+r2*2]     ; r1 = r0 + 2*stride, used to reach rows 2 and 3
1923cabdff1aSopenharmony_ci    movd      m0, [r0+r2*1-4]   ; byte 3 = l0
1924cabdff1aSopenharmony_ci    punpcklbw m0, [r0+r2*2-4]   ; bytes 6,7 = l0 l1
1925cabdff1aSopenharmony_ci    movd      m1, [r1+r2*1-4]   ; byte 3 = l2
1926cabdff1aSopenharmony_ci    punpcklbw m1, [r1+r2*2-4]   ; bytes 6,7 = l2 l3
1927cabdff1aSopenharmony_ci    punpckhwd m0, m1            ; bytes 4..7 = l0 l1 l2 l3
1928cabdff1aSopenharmony_ci    movq      m1, m0
1929cabdff1aSopenharmony_ci    punpckhbw m1, m1            ; m1 = l0 l0 l1 l1 l2 l2 l3 l3
1930cabdff1aSopenharmony_ci    pshufw    m1, m1, 0xFF      ; broadcast the top word: m1 = l3 in all eight bytes
1931cabdff1aSopenharmony_ci    punpckhdq m0, m1            ; m0 = l0 l1 l2 l3 l3 l3 l3 l3
1932cabdff1aSopenharmony_ci    movq      m2, m0
1933cabdff1aSopenharmony_ci    movq      m3, m0
1934cabdff1aSopenharmony_ci    movq      m7, m0
1935cabdff1aSopenharmony_ci    psrlq     m2, 16            ; m2 = m0 shifted down two pixels
1936cabdff1aSopenharmony_ci    psrlq     m3, 8             ; m3 = m0 shifted down one pixel
1937cabdff1aSopenharmony_ci    pavgb     m7, m3            ; m7[i] = rounded average of l[i] and l[i+1]
1938cabdff1aSopenharmony_ci    PRED4x4_LOWPASS m4, m0, m2, m3, m5 ; m4[i] = filter over l[i], l[i+1], l[i+2] (m3 is the centre operand)
1939cabdff1aSopenharmony_ci    punpcklbw m7, m4            ; interleave: avg0 filt0 avg1 filt1 avg2 filt2 avg3 filt3
1940cabdff1aSopenharmony_ci    movd    [r0+r2*1], m7       ; row 0 = avg0 filt0 avg1 filt1
1941cabdff1aSopenharmony_ci    psrlq    m7, 16
1942cabdff1aSopenharmony_ci    movd    [r0+r2*2], m7       ; row 1 = avg1 filt1 avg2 filt2
1943cabdff1aSopenharmony_ci    psrlq    m7, 16
1944cabdff1aSopenharmony_ci    movd    [r1+r2*1], m7       ; row 2 = avg2 filt2 avg3 filt3
1945cabdff1aSopenharmony_ci    movd    [r1+r2*2], m1       ; row 3 = l3 l3 l3 l3
1946cabdff1aSopenharmony_ci    RET
1947cabdff1aSopenharmony_ci
1948cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
1949cabdff1aSopenharmony_ci; void ff_pred4x4_horizontal_down_8_mmxext(uint8_t *src,
1950cabdff1aSopenharmony_ci;                                          const uint8_t *topright,
1951cabdff1aSopenharmony_ci;                                          ptrdiff_t stride)
1952cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
1953cabdff1aSopenharmony_ci
1954cabdff1aSopenharmony_ciINIT_MMX mmxext
    ; Horizontal-down prediction. The neighbours are packed into one register as
    ; the edge l3 l2 l1 l0 lt t0 t1 t2 (lowest byte first), then averaged and
    ; three-tap filtered; rows are stored bottom-up, each taken two bytes further
    ; along the interleaved average/filtered vector.
    ; The byte diagrams on the lines below list the HIGHEST byte first.
    ; PRED4x4_LOWPASS and PALIGNR are defined outside this chunk; presumably
    ; LOWPASS computes (a + 2*c + b + 2) >> 2 with args (dst, a, b, c, tmp) and
    ; PALIGNR dst, src, n, tmp yields bytes n..n+7 of dst:src - confirm.
    ; r0/r2 are assumed to hold src/stride (x86inc naming - confirm); the
    ; topright argument is not read and r1 is reused as a row pointer.
1955cabdff1aSopenharmony_cicglobal pred4x4_horizontal_down_8, 3,3
1956cabdff1aSopenharmony_ci    sub       r0, r2          ; r0 = src - stride: the row above the block
1957cabdff1aSopenharmony_ci    lea       r1, [r0+r2*2]   ; r1 = r0 + 2*stride, used to reach rows 2 and 3
1958cabdff1aSopenharmony_ci    movh      m0, [r0-4]      ; lt ..
1959cabdff1aSopenharmony_ci    punpckldq m0, [r0]        ; t3 t2 t1 t0 lt .. .. ..
1960cabdff1aSopenharmony_ci    psllq     m0, 8           ; t2 t1 t0 lt .. .. .. ..
1961cabdff1aSopenharmony_ci    movd      m1, [r1+r2*2-4] ; l3
1962cabdff1aSopenharmony_ci    punpcklbw m1, [r1+r2*1-4] ; l2 l3
1963cabdff1aSopenharmony_ci    movd      m2, [r0+r2*2-4] ; l1
1964cabdff1aSopenharmony_ci    punpcklbw m2, [r0+r2*1-4] ; l0 l1
1965cabdff1aSopenharmony_ci    punpckhwd m1, m2          ; l0 l1 l2 l3
1966cabdff1aSopenharmony_ci    punpckhdq m1, m0          ; t2 t1 t0 lt l0 l1 l2 l3
1967cabdff1aSopenharmony_ci    movq      m0, m1
1968cabdff1aSopenharmony_ci    movq      m2, m1
1969cabdff1aSopenharmony_ci    movq      m5, m1
1970cabdff1aSopenharmony_ci    psrlq     m0, 16          ; .. .. t2 t1 t0 lt l0 l1
1971cabdff1aSopenharmony_ci    psrlq     m2, 8           ; .. t2 t1 t0 lt l0 l1 l2
1972cabdff1aSopenharmony_ci    pavgb     m5, m2          ; m5[i] = rounded average of edge[i] and edge[i+1]
1973cabdff1aSopenharmony_ci    PRED4x4_LOWPASS m3, m1, m0, m2, m4 ; m3[i] = filter over edge[i], edge[i+1], edge[i+2] (m2 is the centre operand)
1974cabdff1aSopenharmony_ci    punpcklbw m5, m3          ; interleave low halves: byte 0 = avg0, byte 1 = filt0, ... byte 7 = filt3
1975cabdff1aSopenharmony_ci    psrlq     m3, 32          ; bring filt4, filt5 down to bytes 0, 1
1976cabdff1aSopenharmony_ci    PALIGNR   m3, m5, 6, m4   ; m3 low bytes = avg3 filt3 filt4 filt5 (lowest first)
1977cabdff1aSopenharmony_ci    movh      [r1+r2*2], m5   ; row 3 = avg0 filt0 avg1 filt1
1978cabdff1aSopenharmony_ci    psrlq     m5, 16
1979cabdff1aSopenharmony_ci    movh      [r1+r2*1], m5   ; row 2 = avg1 filt1 avg2 filt2
1980cabdff1aSopenharmony_ci    psrlq     m5, 16
1981cabdff1aSopenharmony_ci    movh      [r0+r2*2], m5   ; row 1 = avg2 filt2 avg3 filt3
1982cabdff1aSopenharmony_ci    movh      [r0+r2*1], m3   ; row 0 = avg3 filt3 filt4 filt5
1983cabdff1aSopenharmony_ci    RET
1984cabdff1aSopenharmony_ci
1985cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
1986cabdff1aSopenharmony_ci; void ff_pred4x4_vertical_right_8_mmxext(uint8_t *src,
1987cabdff1aSopenharmony_ci;                                         const uint8_t *topright,
1988cabdff1aSopenharmony_ci;                                         ptrdiff_t stride)
1989cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
1990cabdff1aSopenharmony_ci
1991cabdff1aSopenharmony_ciINIT_MMX mmxext
    ; Vertical-right prediction. Row 0 is the rounded average of the top row and
    ; the top row shifted by one pixel (lt entering on the left); row 1 is the
    ; three-tap filtered edge. Rows 2 and 3 are rows 0 and 1 shifted right by one
    ; pixel, with a filtered left-edge value entering at pixel 0.
    ; The byte diagrams on the lines below list the HIGHEST byte first.
    ; PRED4x4_LOWPASS and PALIGNR are defined outside this chunk; presumably
    ; LOWPASS computes (a + 2*c + b + 2) >> 2 with args (dst, a, b, c, tmp) and
    ; PALIGNR dst, src, 7, tmp shifts dst up one byte and inserts the top byte of
    ; src at the bottom - confirm.
    ; r0/r2 are assumed to hold src/stride (x86inc naming - confirm); the
    ; topright argument is not read and r1 is reused as a row pointer.
1992cabdff1aSopenharmony_cicglobal pred4x4_vertical_right_8, 3,3
1993cabdff1aSopenharmony_ci    sub     r0, r2                      ; r0 = src - stride: the row above the block
1994cabdff1aSopenharmony_ci    lea     r1, [r0+r2*2]               ; r1 = r0 + 2*stride
1995cabdff1aSopenharmony_ci    movh    m0, [r0]                    ; ........t3t2t1t0
1996cabdff1aSopenharmony_ci    movq    m5, m0
1997cabdff1aSopenharmony_ci    PALIGNR m0, [r0-8], 7, m1           ; ......t3t2t1t0lt
1998cabdff1aSopenharmony_ci    pavgb   m5, m0                      ; row 0: average of each top pixel and its left neighbour
1999cabdff1aSopenharmony_ci    PALIGNR m0, [r0+r2*1-8], 7, m1      ; ....t3t2t1t0ltl0
2000cabdff1aSopenharmony_ci    movq    m1, m0                      ; keep this stage as one filter neighbour
2001cabdff1aSopenharmony_ci    PALIGNR m0, [r0+r2*2-8], 7, m2      ; ..t3t2t1t0ltl0l1
2002cabdff1aSopenharmony_ci    movq    m2, m0                      ; keep this stage as the filter centre
2003cabdff1aSopenharmony_ci    PALIGNR m0, [r1+r2*1-8], 7, m3      ; t3t2t1t0ltl0l1l2
2004cabdff1aSopenharmony_ci    PRED4x4_LOWPASS m3, m1, m0, m2, m4  ; m3 = filtered edge
2005cabdff1aSopenharmony_ci    movq    m1, m3
2006cabdff1aSopenharmony_ci    psrlq   m3, 16                      ; row 1 starts at filtered byte 2
2007cabdff1aSopenharmony_ci    psllq   m1, 48                      ; filtered bytes 0,1 moved up to bytes 6,7
2008cabdff1aSopenharmony_ci    movh    [r0+r2*1], m5               ; store row 0
2009cabdff1aSopenharmony_ci    movh    [r0+r2*2], m3               ; store row 1
2010cabdff1aSopenharmony_ci    PALIGNR m5, m1, 7, m2               ; row 2 = row 0 shifted one pixel, filtered byte 1 enters at pixel 0
2011cabdff1aSopenharmony_ci    psllq   m1, 8                       ; filtered byte 0 now sits in the top byte
2012cabdff1aSopenharmony_ci    movh    [r1+r2*1], m5               ; store row 2
2013cabdff1aSopenharmony_ci    PALIGNR m3, m1, 7, m1               ; row 3 = row 1 shifted one pixel, filtered byte 0 enters at pixel 0
2014cabdff1aSopenharmony_ci    movh    [r1+r2*2], m3               ; store row 3
2015cabdff1aSopenharmony_ci    RET
2016cabdff1aSopenharmony_ci
2017cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
2018cabdff1aSopenharmony_ci; void ff_pred4x4_down_right_8_mmxext(uint8_t *src, const uint8_t *topright,
2019cabdff1aSopenharmony_ci;                                     ptrdiff_t stride)
2020cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
2021cabdff1aSopenharmony_ci
2022cabdff1aSopenharmony_ciINIT_MMX mmxext
    ; Diagonal down-right prediction. The neighbours are packed into one register
    ; as the edge l3 l2 l1 l0 lt t0 t1 t2 (lowest byte first, with t3 available
    ; in the stage before l3 is shifted in), three-tap filtered once, and stored
    ; bottom-up, each row taken one byte further along the filtered vector.
    ; PRED4x4_LOWPASS and PALIGNR are defined outside this chunk; presumably
    ; LOWPASS computes (a + 2*c + b + 2) >> 2 with args (dst, a, b, c, tmp) and
    ; PALIGNR dst, src, n, tmp yields bytes n..n+7 of dst:src - confirm.
    ; r0/r2 are assumed to hold src/stride (x86inc naming - confirm); the
    ; topright argument is not read and r1 is reused as a row pointer.
    ; Byte diagrams below list the lowest byte first.
2023cabdff1aSopenharmony_cicglobal pred4x4_down_right_8, 3,3
2024cabdff1aSopenharmony_ci    sub       r0, r2             ; r0 = src - stride: the row above the block
2025cabdff1aSopenharmony_ci    lea       r1, [r0+r2*2]      ; r1 = r0 + 2*stride (start of block row 1)
2026cabdff1aSopenharmony_ci    movq      m1, [r1-8]         ; byte 7 = l1
2027cabdff1aSopenharmony_ci    movq      m2, [r0+r2*1-8]    ; byte 7 = l0
2028cabdff1aSopenharmony_ci    punpckhbw m2, [r0-8]         ; bytes 6,7 = l0 lt
2029cabdff1aSopenharmony_ci    movh      m3, [r0]           ; t0 t1 t2 t3
2030cabdff1aSopenharmony_ci    punpckhwd m1, m2             ; bytes 5,6,7 = l1 l0 lt
2031cabdff1aSopenharmony_ci    PALIGNR   m3, m1, 5, m1      ; m3 = l1 l0 lt t0 t1 t2 t3 ..
2032cabdff1aSopenharmony_ci    movq      m1, m3             ; keep this stage as one filter neighbour
2033cabdff1aSopenharmony_ci    PALIGNR   m3, [r1+r2*1-8], 7, m4 ; m3 = l2 l1 l0 lt t0 t1 t2 t3
2034cabdff1aSopenharmony_ci    movq      m2, m3             ; keep this stage as the filter centre
2035cabdff1aSopenharmony_ci    PALIGNR   m3, [r1+r2*2-8], 7, m4 ; m3 = l3 l2 l1 l0 lt t0 t1 t2
2036cabdff1aSopenharmony_ci    PRED4x4_LOWPASS m0, m3, m1, m2, m4 ; m0 = filtered edge
2037cabdff1aSopenharmony_ci    movh      [r1+r2*2], m0      ; row 3 = filtered bytes 0..3
2038cabdff1aSopenharmony_ci    psrlq     m0, 8
2039cabdff1aSopenharmony_ci    movh      [r1+r2*1], m0      ; row 2 = filtered bytes 1..4
2040cabdff1aSopenharmony_ci    psrlq     m0, 8
2041cabdff1aSopenharmony_ci    movh      [r0+r2*2], m0      ; row 1 = filtered bytes 2..5
2042cabdff1aSopenharmony_ci    psrlq     m0, 8
2043cabdff1aSopenharmony_ci    movh      [r0+r2*1], m0      ; row 0 = filtered bytes 3..6
2044cabdff1aSopenharmony_ci    RET
2045