1cabdff1aSopenharmony_ci;******************************************************************************
2cabdff1aSopenharmony_ci;* VP9 Intra prediction SIMD optimizations
3cabdff1aSopenharmony_ci;*
4cabdff1aSopenharmony_ci;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
5cabdff1aSopenharmony_ci;*
6cabdff1aSopenharmony_ci;* Parts based on:
7cabdff1aSopenharmony_ci;* H.264 intra prediction asm optimizations
8cabdff1aSopenharmony_ci;* Copyright (c) 2010 Fiona Glaser
9cabdff1aSopenharmony_ci;* Copyright (c) 2010 Holger Lubitz
10cabdff1aSopenharmony_ci;* Copyright (c) 2010 Loren Merritt
11cabdff1aSopenharmony_ci;* Copyright (c) 2010 Ronald S. Bultje
12cabdff1aSopenharmony_ci;*
13cabdff1aSopenharmony_ci;* This file is part of FFmpeg.
14cabdff1aSopenharmony_ci;*
15cabdff1aSopenharmony_ci;* FFmpeg is free software; you can redistribute it and/or
16cabdff1aSopenharmony_ci;* modify it under the terms of the GNU Lesser General Public
17cabdff1aSopenharmony_ci;* License as published by the Free Software Foundation; either
18cabdff1aSopenharmony_ci;* version 2.1 of the License, or (at your option) any later version.
19cabdff1aSopenharmony_ci;*
20cabdff1aSopenharmony_ci;* FFmpeg is distributed in the hope that it will be useful,
21cabdff1aSopenharmony_ci;* but WITHOUT ANY WARRANTY; without even the implied warranty of
22cabdff1aSopenharmony_ci;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
23cabdff1aSopenharmony_ci;* Lesser General Public License for more details.
24cabdff1aSopenharmony_ci;*
25cabdff1aSopenharmony_ci;* You should have received a copy of the GNU Lesser General Public
26cabdff1aSopenharmony_ci;* License along with FFmpeg; if not, write to the Free Software
27cabdff1aSopenharmony_ci;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
28cabdff1aSopenharmony_ci;******************************************************************************
29cabdff1aSopenharmony_ci
30cabdff1aSopenharmony_ci%include "libavutil/x86/x86util.asm"
31cabdff1aSopenharmony_ci
32cabdff1aSopenharmony_ciSECTION_RODATA 32
33cabdff1aSopenharmony_ci
; ---------------------------------------------------------------------------
; Word (16-bit) constants.
; pw_4096 is the pmulhrsw multiplier used by the DC predictors in this file:
; pmulhrsw by 4096 yields (x + 4) >> 3.
; ---------------------------------------------------------------------------
pw_m256: times 16 dw -256
pw_m255: times 16 dw -255
pw_4096: times 8 dw 4096

; ---------------------------------------------------------------------------
; Byte tables. Each label name spells out the table contents (m1 means -1).
; pb_4x3_4x2_4x1_4x0, pb_8x1_8x0 and pb_8x3_8x2 are pshufb index vectors for
; the horizontal predictors further down; the remaining tables are presumably
; pshufb masks as well, but their users are not visible from this section.
; Label order and byte counts must stay as they are so that every table keeps
; its offset, and therefore its alignment, inside the section.
; ---------------------------------------------------------------------------
pb_4x3_4x2_4x1_4x0:
    db  3,  3,  3,  3,  2,  2,  2,  2
    db  1,  1,  1,  1,  0,  0,  0,  0
pb_8x1_8x0:
    db  1,  1,  1,  1,  1,  1,  1,  1
    db  0,  0,  0,  0,  0,  0,  0,  0
pb_8x3_8x2:
    db  3,  3,  3,  3,  3,  3,  3,  3
    db  2,  2,  2,  2,  2,  2,  2,  2
pb_0to5_2x7:
    db  0,  1,  2,  3,  4,  5,  7,  7
    db -1, -1, -1, -1, -1, -1, -1, -1
pb_0to6_9x7:
    db  0,  1,  2,  3,  4,  5,  6,  7
    db  7,  7,  7,  7,  7,  7,  7,  7
pb_1to6_10x7:
    db  1,  2,  3,  4,  5,  6,  7,  7
    db  7,  7,  7,  7,  7,  7,  7,  7
; two names for the same 16 bytes
pb_2to6_3x7:
pb_2to6_11x7:
    db  2,  3,  4,  5,  6,  7,  7,  7
    db  7,  7,  7,  7,  7,  7,  7,  7
pb_1toE_2xF:
    db  1,  2,  3,  4,  5,  6,  7,  8
    db  9, 10, 11, 12, 13, 14, 15, 15
pb_2toE_3xF:
    db  2,  3,  4,  5,  6,  7,  8,  9
    db 10, 11, 12, 13, 14, 15, 15, 15
pb_13456_3xm1:
    db  1,  3,  4,  5,  6, -1, -1, -1
pb_6012_4xm1:
    db  6,  0,  1,  2, -1, -1, -1, -1
pb_6xm1_246_8toE:
    db -1, -1, -1, -1, -1, -1,  2,  4
    db  6,  8,  9, 10, 11, 12, 13, 14
pb_6xm1_BDF_0to6:
    db -1, -1, -1, -1, -1, -1, 11, 13
    db 15,  0,  1,  2,  3,  4,  5,  6
pb_02468ACE_13579BDF:
    db  0,  2,  4,  6,  8, 10, 12, 14
    db  1,  3,  5,  7,  9, 11, 13, 15

pb_15x0_1xm1:
    db  0,  0,  0,  0,  0,  0,  0,  0
    db  0,  0,  0,  0,  0,  0,  0, -1
pb_0to2_5x3:
    db  0,  1,  2,  3,  3,  3,  3,  3
pb_6xm1_2x0:
    db -1, -1, -1, -1, -1, -1,  0,  0
pb_6x0_2xm1:
    db  0,  0,  0,  0,  0,  0, -1, -1
75cabdff1aSopenharmony_ci
76cabdff1aSopenharmony_cicextern pb_1
77cabdff1aSopenharmony_cicextern pb_2
78cabdff1aSopenharmony_cicextern pb_3
79cabdff1aSopenharmony_cicextern pb_15
80cabdff1aSopenharmony_cicextern pw_2
81cabdff1aSopenharmony_cicextern pw_4
82cabdff1aSopenharmony_cicextern pw_8
83cabdff1aSopenharmony_cicextern pw_16
84cabdff1aSopenharmony_cicextern pw_32
85cabdff1aSopenharmony_cicextern pw_255
86cabdff1aSopenharmony_cicextern pw_512
87cabdff1aSopenharmony_cicextern pw_1024
88cabdff1aSopenharmony_cicextern pw_2048
89cabdff1aSopenharmony_cicextern pw_8192
90cabdff1aSopenharmony_ci
91cabdff1aSopenharmony_ciSECTION .text
92cabdff1aSopenharmony_ci
93cabdff1aSopenharmony_ci; dc_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)
94cabdff1aSopenharmony_ci
; DC prediction from both edges for 4x4 and 8x8 blocks (MMX-sized registers).
; Each function sums every byte of the left (l) and above (a) edges, divides
; by the number of edge pixels with rounding, and writes that single byte to
; every pixel of the destination block.
; The pw_N operands other than pw_4096 are external symbols; the rounding
; notes below assume each one holds words of value N.
%macro DC_4to8_FUNCS 0
cglobal vp9_ipred_dc_4x4, 4, 4, 0, dst, stride, l, a
    pxor        m1, m1                      ; zero: psadbw reference, pshufb index
    movd        m0, [lq]                    ; 4 left pixels in the low dword
    punpckldq   m0, [aq]                    ; 4 above pixels in the high dword
    psadbw      m0, m1                      ; low word = sum of the 8 edge bytes
%if notcpuflag(ssse3)
    paddw       m0, [pw_4]
    psraw       m0, 3                       ; (sum + 4) >> 3
    punpcklbw   m0, m0                      ; dc byte doubled into word 0
    pshufw      m0, m0, q0000               ; word 0 replicated to all words
%else
    pmulhrsw    m0, [pw_4096]               ; rounding multiply = (sum + 4) >> 3
    pshufb      m0, m1                      ; zero index: byte 0 to every byte
%endif
    ; rows 0-1
    movd        [dstq+strideq*0], m0
    movd        [dstq+strideq*1], m0
    lea         dstq, [dstq+strideq*2]
    ; rows 2-3
    movd        [dstq+strideq*0], m0
    movd        [dstq+strideq*1], m0
    RET

cglobal vp9_ipred_dc_8x8, 4, 4, 0, dst, stride, l, a
    pxor        m2, m2                      ; zero: psadbw reference, pshufb index
    movq        m1, [aq]                    ; 8 above pixels
    movq        m0, [lq]                    ; 8 left pixels
    ; both edges are in registers, so the l register is recycled as stride*3
    DEFINE_ARGS dst, stride, stride3
    lea         stride3q, [strideq*3]
    psadbw      m1, m2                      ; sum of the above edge
    psadbw      m0, m2                      ; sum of the left edge
    paddw       m0, m1                      ; sum of all 16 edge bytes
%if notcpuflag(ssse3)
    paddw       m0, [pw_8]
    psraw       m0, 4                       ; (sum + 8) >> 4
    punpcklbw   m0, m0
    pshufw      m0, m0, q0000
%else
    pmulhrsw    m0, [pw_2048]               ; rounding multiply = (sum + 8) >> 4
    pshufb      m0, m2
%endif
    ; rows 0-3
    movq        [dstq+strideq*0], m0
    movq        [dstq+strideq*1], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride3q ], m0
    lea         dstq, [dstq+strideq*4]
    ; rows 4-7
    movq        [dstq+strideq*0], m0
    movq        [dstq+strideq*1], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride3q ], m0
    RET
%endmacro

; one instantiation per instruction set
INIT_MMX mmxext
DC_4to8_FUNCS
INIT_MMX ssse3
DC_4to8_FUNCS
151cabdff1aSopenharmony_ci
; DC prediction from both edges for 16x16 and 32x32 blocks (XMM registers).
; The edge bytes are summed with psadbw, the two 64-bit partial sums are
; folded together, the total is divided by the edge pixel count with rounding
; and the resulting byte is broadcast and stored to every row.
; The edges are read with aligned loads and the rows written with aligned
; stores. The pw_N operands are external symbols; the rounding notes below
; assume each one holds words of value N.
%macro DC_16to32_FUNCS 0
cglobal vp9_ipred_dc_16x16, 4, 4, 3, dst, stride, l, a
    pxor        m2, m2                      ; zero: psadbw reference, pshufb index
    mova        m1, [aq]                    ; 16 above pixels
    mova        m0, [lq]                    ; 16 left pixels
    ; edges are loaded: reuse the l and a registers for stride*3 and a counter
    DEFINE_ARGS dst, stride, stride3, iter
    lea         stride3q, [strideq*3]
    mov         iterd, 4                    ; 4 passes of 4 rows
    psadbw      m1, m2
    psadbw      m0, m2
    paddw       m0, m1                      ; per-half sums of both edges
    movhlps     m1, m0
    paddw       m0, m1                      ; low word = sum of 32 edge bytes
%if notcpuflag(ssse3)
    paddw       m0, [pw_16]
    psraw       m0, 5                       ; (sum + 16) >> 5
    punpcklbw   m0, m0
    pshuflw     m0, m0, q0000               ; fill the low 8 bytes
    punpcklqdq  m0, m0                      ; and mirror them into the high 8
%else
    pmulhrsw    m0, [pw_1024]               ; rounding multiply = (sum + 16) >> 5
    pshufb      m0, m2                      ; zero index: byte 0 to every byte
%endif
.fill_rows:
    mova        [dstq+strideq*0], m0
    mova        [dstq+strideq*1], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride3q ], m0
    lea         dstq, [dstq+strideq*4]
    dec         iterd
    jnz         .fill_rows
    RET

cglobal vp9_ipred_dc_32x32, 4, 4, 5, dst, stride, l, a
    pxor        m4, m4                      ; zero: psadbw reference, pshufb index
    mova        m0, [lq]                    ; left pixels 0-15
    mova        m1, [lq+16]                 ; left pixels 16-31
    mova        m2, [aq]                    ; above pixels 0-15
    mova        m3, [aq+16]                 ; above pixels 16-31
    ; edges are loaded: reuse the l and a registers for stride*3 and a counter
    DEFINE_ARGS dst, stride, stride3, iter
    lea         stride3q, [strideq*3]
    mov         iterd, 8                    ; 8 passes of 4 rows
    psadbw      m0, m4
    psadbw      m1, m4
    psadbw      m2, m4
    psadbw      m3, m4
    paddw       m2, m3                      ; above edge
    paddw       m0, m1                      ; left edge
    paddw       m0, m2                      ; per-half sums of both edges
    movhlps     m1, m0
    paddw       m0, m1                      ; low word = sum of 64 edge bytes
%if notcpuflag(ssse3)
    paddw       m0, [pw_32]
    psraw       m0, 6                       ; (sum + 32) >> 6
    punpcklbw   m0, m0
    pshuflw     m0, m0, q0000
    punpcklqdq  m0, m0
%else
    pmulhrsw    m0, [pw_512]                ; rounding multiply = (sum + 32) >> 6
    pshufb      m0, m4
%endif
.fill_rows:
    ; each 32-byte row is written as two 16-byte halves
    mova        [dstq+strideq*0+ 0], m0
    mova        [dstq+strideq*0+16], m0
    mova        [dstq+strideq*1+ 0], m0
    mova        [dstq+strideq*1+16], m0
    mova        [dstq+strideq*2+ 0], m0
    mova        [dstq+strideq*2+16], m0
    mova        [dstq+stride3q + 0], m0
    mova        [dstq+stride3q +16], m0
    lea         dstq, [dstq+strideq*4]
    dec         iterd
    jnz         .fill_rows
    RET
%endmacro

; one instantiation per instruction set
INIT_XMM sse2
DC_16to32_FUNCS
INIT_XMM ssse3
DC_16to32_FUNCS
232cabdff1aSopenharmony_ci
; AVX2 flavour of the 32x32 two-edge DC predictor: each 32-byte edge fits in
; one YMM register and each output row is written with a single store.
; Only assembled when the build enables external AVX2 assembly.
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
cglobal vp9_ipred_dc_32x32, 4, 4, 3, dst, stride, l, a
    pxor            m2, m2                  ; zero reference for psadbw
    mova            m0, [lq]                ; 32 left pixels
    mova            m1, [aq]                ; 32 above pixels
    ; edges are loaded: reuse the l and a registers for stride*3 and a counter
    DEFINE_ARGS dst, stride, stride3, iter
    lea             stride3q, [strideq*3]
    mov             iterd, 4                ; 4 passes of 8 rows
    psadbw          m0, m2
    psadbw          m1, m2
    paddw           m0, m1                  ; four 64-bit partial sums
    vextracti128    xm1, m0, 1
    paddw           xm0, xm1                ; fold upper lane onto lower lane
    movhlps         xm1, xm0
    paddw           xm0, xm1                ; low word = sum of 64 edge bytes
    ; pw_512 is external; assuming words of 512 this is (sum + 32) >> 6
    pmulhrsw        xm0, [pw_512]
    vpbroadcastb    m0, xm0                 ; dc byte to all 32 lanes
.fill_rows:
%rep 2
    mova            [dstq+strideq*0], m0
    mova            [dstq+strideq*1], m0
    mova            [dstq+strideq*2], m0
    mova            [dstq+stride3q ], m0
    lea             dstq, [dstq+strideq*4]
%endrep
    dec             iterd
    jnz             .fill_rows
    RET
%endif
266cabdff1aSopenharmony_ci
267cabdff1aSopenharmony_ci; dc_top/left_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)
268cabdff1aSopenharmony_ci
; Single-edge DC prediction for 4x4 and 8x8 blocks.
; Macro arguments: first = direction suffix for the function name (top or
; left), second = name of the edge pointer argument to average (a or l).
; Only that one edge is read; its rounded mean fills the whole block.
; The pw_N operands other than pw_4096 are external symbols; the rounding
; notes below assume each one holds words of value N.
%macro DC_1D_4to8_FUNCS 2
cglobal vp9_ipred_dc_%1_4x4, 4, 4, 0, dst, stride, l, a
    pxor        m1, m1                      ; zero: psadbw reference, pshufb index
    movd        m0, [%2q]                   ; 4 edge pixels
    psadbw      m0, m1                      ; low word = their sum
%if notcpuflag(ssse3)
    paddw       m0, [pw_2]
    psraw       m0, 2                       ; (sum + 2) >> 2
    punpcklbw   m0, m0                      ; dc byte doubled into word 0
    pshufw      m0, m0, q0000               ; word 0 replicated to all words
%else
    pmulhrsw    m0, [pw_8192]               ; rounding multiply = (sum + 2) >> 2
    pshufb      m0, m1                      ; zero index: byte 0 to every byte
%endif
    ; rows 0-1
    movd        [dstq+strideq*0], m0
    movd        [dstq+strideq*1], m0
    lea         dstq, [dstq+strideq*2]
    ; rows 2-3
    movd        [dstq+strideq*0], m0
    movd        [dstq+strideq*1], m0
    RET

cglobal vp9_ipred_dc_%1_8x8, 4, 4, 0, dst, stride, l, a
    pxor        m1, m1                      ; zero: psadbw reference, pshufb index
    movq        m0, [%2q]                   ; 8 edge pixels
    ; the edge is in a register, so the l register is recycled as stride*3
    DEFINE_ARGS dst, stride, stride3
    lea         stride3q, [strideq*3]
    psadbw      m0, m1                      ; low word = their sum
%if notcpuflag(ssse3)
    paddw       m0, [pw_4]
    psraw       m0, 3                       ; (sum + 4) >> 3
    punpcklbw   m0, m0
    pshufw      m0, m0, q0000
%else
    pmulhrsw    m0, [pw_4096]               ; rounding multiply = (sum + 4) >> 3
    pshufb      m0, m1
%endif
    ; rows 0-3
    movq        [dstq+strideq*0], m0
    movq        [dstq+strideq*1], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride3q ], m0
    lea         dstq, [dstq+strideq*4]
    ; rows 4-7
    movq        [dstq+strideq*0], m0
    movq        [dstq+strideq*1], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride3q ], m0
    RET
%endmacro

; top and left variants for each instruction set
INIT_MMX mmxext
DC_1D_4to8_FUNCS top,  a
DC_1D_4to8_FUNCS left, l
INIT_MMX ssse3
DC_1D_4to8_FUNCS top,  a
DC_1D_4to8_FUNCS left, l
323cabdff1aSopenharmony_ci
; Single-edge DC prediction for 16x16 and 32x32 blocks (XMM registers).
; Macro arguments: first = direction suffix for the function name (top or
; left), second = name of the edge pointer argument to average (a or l).
; Only that one edge is read (with aligned loads); its rounded mean fills the
; whole block. The pw_N operands are external symbols; the rounding notes
; below assume each one holds words of value N.
%macro DC_1D_16to32_FUNCS 2
cglobal vp9_ipred_dc_%1_16x16, 4, 4, 3, dst, stride, l, a
    pxor        m2, m2                      ; zero: psadbw reference, pshufb index
    mova        m0, [%2q]                   ; 16 edge pixels
    ; the edge is loaded: reuse the l and a registers for stride*3 and a counter
    DEFINE_ARGS dst, stride, stride3, iter
    lea         stride3q, [strideq*3]
    mov         iterd, 4                    ; 4 passes of 4 rows
    psadbw      m0, m2                      ; two 64-bit partial sums
    movhlps     m1, m0
    paddw       m0, m1                      ; low word = sum of 16 edge bytes
%if notcpuflag(ssse3)
    paddw       m0, [pw_8]
    psraw       m0, 4                       ; (sum + 8) >> 4
    punpcklbw   m0, m0
    pshuflw     m0, m0, q0000               ; fill the low 8 bytes
    punpcklqdq  m0, m0                      ; and mirror them into the high 8
%else
    pmulhrsw    m0, [pw_2048]               ; rounding multiply = (sum + 8) >> 4
    pshufb      m0, m2                      ; zero index: byte 0 to every byte
%endif
.fill_rows:
    mova        [dstq+strideq*0], m0
    mova        [dstq+strideq*1], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride3q ], m0
    lea         dstq, [dstq+strideq*4]
    dec         iterd
    jnz         .fill_rows
    RET

cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
    pxor        m2, m2                      ; zero: psadbw reference, pshufb index
    mova        m0, [%2q]                   ; edge pixels 0-15
    mova        m1, [%2q+16]                ; edge pixels 16-31
    ; the edge is loaded: reuse the l and a registers for stride*3 and a counter
    DEFINE_ARGS dst, stride, stride3, iter
    lea         stride3q, [strideq*3]
    mov         iterd, 8                    ; 8 passes of 4 rows
    psadbw      m1, m2
    psadbw      m0, m2
    paddw       m0, m1                      ; per-half sums of the edge
    movhlps     m1, m0
    paddw       m0, m1                      ; low word = sum of 32 edge bytes
%if notcpuflag(ssse3)
    paddw       m0, [pw_16]
    psraw       m0, 5                       ; (sum + 16) >> 5
    punpcklbw   m0, m0
    pshuflw     m0, m0, q0000
    punpcklqdq  m0, m0
%else
    pmulhrsw    m0, [pw_1024]               ; rounding multiply = (sum + 16) >> 5
    pshufb      m0, m2
%endif
.fill_rows:
    ; each 32-byte row is written as two 16-byte halves
    mova        [dstq+strideq*0+ 0], m0
    mova        [dstq+strideq*0+16], m0
    mova        [dstq+strideq*1+ 0], m0
    mova        [dstq+strideq*1+16], m0
    mova        [dstq+strideq*2+ 0], m0
    mova        [dstq+strideq*2+16], m0
    mova        [dstq+stride3q + 0], m0
    mova        [dstq+stride3q +16], m0
    lea         dstq, [dstq+strideq*4]
    dec         iterd
    jnz         .fill_rows
    RET
%endmacro

; top and left variants for each instruction set
INIT_XMM sse2
DC_1D_16to32_FUNCS top,  a
DC_1D_16to32_FUNCS left, l
INIT_XMM ssse3
DC_1D_16to32_FUNCS top,  a
DC_1D_16to32_FUNCS left, l
397cabdff1aSopenharmony_ci
; AVX2 flavour of the single-edge 32x32 DC predictor.
; Macro arguments: first = direction suffix for the function name (top or
; left), second = name of the edge pointer argument to average (a or l).
; The macro expands to nothing unless the build enables external AVX2 assembly.
%macro DC_1D_AVX2_FUNCS 2
%if HAVE_AVX2_EXTERNAL
cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
    pxor            m2, m2                  ; zero reference for psadbw
    mova            m0, [%2q]               ; 32 edge pixels
    ; the edge is loaded: reuse the l and a registers for stride*3 and a counter
    DEFINE_ARGS dst, stride, stride3, iter
    lea             stride3q, [strideq*3]
    mov             iterd, 4                ; 4 passes of 8 rows
    psadbw          m0, m2                  ; four 64-bit partial sums
    vextracti128    xm1, m0, 1
    paddw           xm0, xm1                ; fold upper lane onto lower lane
    movhlps         xm1, xm0
    paddw           xm0, xm1                ; low word = sum of 32 edge bytes
    ; pw_1024 is external; assuming words of 1024 this is (sum + 16) >> 5
    pmulhrsw        xm0, [pw_1024]
    vpbroadcastb    m0, xm0                 ; dc byte to all 32 lanes
.fill_rows:
%rep 2
    mova            [dstq+strideq*0], m0
    mova            [dstq+strideq*1], m0
    mova            [dstq+strideq*2], m0
    mova            [dstq+stride3q ], m0
    lea             dstq, [dstq+strideq*4]
%endrep
    dec             iterd
    jnz             .fill_rows
    RET
%endif
%endmacro

INIT_YMM avx2
DC_1D_AVX2_FUNCS top,  a
DC_1D_AVX2_FUNCS left, l
433cabdff1aSopenharmony_ci
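; v_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)
; vertical prediction: every output row is a copy of the above edge (a)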
435cabdff1aSopenharmony_ci
; 8x8: the 8 bytes at a are copied unchanged into each of the 8 rows.
; l is never read; its register is reused to hold stride*3.
INIT_MMX mmx
cglobal vp9_ipred_v_8x8, 4, 4, 0, dst, stride, l, a
    movq        m0, [aq]                    ; the row to replicate
    DEFINE_ARGS dst, stride, stride3
    lea         stride3q, [strideq*3]
    ; rows 0-3
    movq        [dstq+strideq*0], m0
    movq        [dstq+strideq*1], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride3q ], m0
    lea         dstq, [dstq+strideq*4]
    ; rows 4-7
    movq        [dstq+strideq*0], m0
    movq        [dstq+strideq*1], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride3q ], m0
    RET
451cabdff1aSopenharmony_ci
; 16x16: the 16 bytes at a are copied unchanged into each of the 16 rows.
; Uses an aligned load from a and aligned stores to dst.
INIT_XMM sse
cglobal vp9_ipred_v_16x16, 4, 4, 1, dst, stride, l, a
    mova        m0, [aq]                    ; the row to replicate
    ; a has been consumed: the l and a registers become stride*3 and a counter
    DEFINE_ARGS dst, stride, stride3, iter
    mov         iterd, 4                    ; 4 passes of 4 rows
    lea         stride3q, [strideq*3]
.copy_rows:
    mova        [dstq+strideq*0], m0
    mova        [dstq+strideq*1], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride3q ], m0
    lea         dstq, [dstq+strideq*4]
    dec         iterd
    jnz         .copy_rows
    RET
467cabdff1aSopenharmony_ci
; 32x32 (SSE): the 32 bytes at a are held in two XMM registers and copied
; unchanged into each of the 32 rows, two aligned 16-byte stores per row.
INIT_XMM sse
cglobal vp9_ipred_v_32x32, 4, 4, 2, dst, stride, l, a
    mova        m0, [aq]                    ; row bytes 0-15
    mova        m1, [aq+16]                 ; row bytes 16-31
    ; a has been consumed: the l and a registers become stride*3 and a counter
    DEFINE_ARGS dst, stride, stride3, iter
    mov         iterd, 8                    ; 8 passes of 4 rows
    lea         stride3q, [strideq*3]
.copy_rows:
    mova        [dstq+strideq*0+ 0], m0
    mova        [dstq+strideq*0+16], m1
    mova        [dstq+strideq*1+ 0], m0
    mova        [dstq+strideq*1+16], m1
    mova        [dstq+strideq*2+ 0], m0
    mova        [dstq+strideq*2+16], m1
    mova        [dstq+stride3q + 0], m0
    mova        [dstq+stride3q +16], m1
    lea         dstq, [dstq+strideq*4]
    dec         iterd
    jnz         .copy_rows
    RET
488cabdff1aSopenharmony_ci
; 32x32 (AVX): the 32 bytes at a fit in one YMM register, so every row is a
; single aligned 32-byte store. Eight rows are written per loop pass.
INIT_YMM avx
cglobal vp9_ipred_v_32x32, 4, 4, 1, dst, stride, l, a
    mova        m0, [aq]                    ; the row to replicate
    ; a has been consumed: the l and a registers become stride*3 and a counter
    DEFINE_ARGS dst, stride, stride3, iter
    mov         iterd, 4                    ; 4 passes of 8 rows
    lea         stride3q, [strideq*3]
.copy_rows:
%rep 2
    mova        [dstq+strideq*0], m0
    mova        [dstq+strideq*1], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride3q ], m0
    lea         dstq, [dstq+strideq*4]
%endrep
    dec         iterd
    jnz         .copy_rows
    RET
509cabdff1aSopenharmony_ci
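; h_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, ...)
; horizontal prediction: every output row is filled with one byte of the left
; edge (l); l is consumed from its highest index downwards, so row 0 takes the
; last byte of l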
511cabdff1aSopenharmony_ci
512cabdff1aSopenharmony_ci%macro H_XMM_FUNCS 2
513cabdff1aSopenharmony_ci%if notcpuflag(avx)
514cabdff1aSopenharmony_cicglobal vp9_ipred_h_4x4, 3, 4, 1, dst, stride, l, stride3
515cabdff1aSopenharmony_ci    movd                    m0, [lq]
516cabdff1aSopenharmony_ci%if cpuflag(ssse3)
517cabdff1aSopenharmony_ci    pshufb                  m0, [pb_4x3_4x2_4x1_4x0]
518cabdff1aSopenharmony_ci%else
519cabdff1aSopenharmony_ci    punpcklbw               m0, m0
520cabdff1aSopenharmony_ci    pshuflw                 m0, m0, q0123
521cabdff1aSopenharmony_ci    punpcklwd               m0, m0
522cabdff1aSopenharmony_ci%endif
523cabdff1aSopenharmony_ci    lea               stride3q, [strideq*3]
524cabdff1aSopenharmony_ci    movd      [dstq+strideq*0], m0
525cabdff1aSopenharmony_ci    psrldq                  m0, 4
526cabdff1aSopenharmony_ci    movd      [dstq+strideq*1], m0
527cabdff1aSopenharmony_ci    psrldq                  m0, 4
528cabdff1aSopenharmony_ci    movd      [dstq+strideq*2], m0
529cabdff1aSopenharmony_ci    psrldq                  m0, 4
530cabdff1aSopenharmony_ci    movd      [dstq+stride3q ], m0
531cabdff1aSopenharmony_ci    RET
532cabdff1aSopenharmony_ci%endif
533cabdff1aSopenharmony_ci
534cabdff1aSopenharmony_cicglobal vp9_ipred_h_8x8, 3, 5, %1, dst, stride, l, stride3, cnt
535cabdff1aSopenharmony_ci%if cpuflag(ssse3)
536cabdff1aSopenharmony_ci    mova                    m2, [pb_8x1_8x0]
537cabdff1aSopenharmony_ci    mova                    m3, [pb_8x3_8x2]
538cabdff1aSopenharmony_ci%endif
539cabdff1aSopenharmony_ci    lea               stride3q, [strideq*3]
540cabdff1aSopenharmony_ci    mov                   cntq, 1
541cabdff1aSopenharmony_ci.loop:
542cabdff1aSopenharmony_ci    movd                    m0, [lq+cntq*4]
543cabdff1aSopenharmony_ci%if cpuflag(ssse3)
544cabdff1aSopenharmony_ci    pshufb                  m1, m0, m3
545cabdff1aSopenharmony_ci    pshufb                  m0, m2
546cabdff1aSopenharmony_ci%else
547cabdff1aSopenharmony_ci    punpcklbw               m0, m0
548cabdff1aSopenharmony_ci    punpcklwd               m0, m0
549cabdff1aSopenharmony_ci    pshufd                  m1, m0, q2233
550cabdff1aSopenharmony_ci    pshufd                  m0, m0, q0011
551cabdff1aSopenharmony_ci%endif
552cabdff1aSopenharmony_ci    movq      [dstq+strideq*0], m1
553cabdff1aSopenharmony_ci    movhps    [dstq+strideq*1], m1
554cabdff1aSopenharmony_ci    movq      [dstq+strideq*2], m0
555cabdff1aSopenharmony_ci    movhps    [dstq+stride3q ], m0
556cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*4]
557cabdff1aSopenharmony_ci    dec                   cntq
558cabdff1aSopenharmony_ci    jge .loop
559cabdff1aSopenharmony_ci    RET
560cabdff1aSopenharmony_ci
; vp9_ipred_h_16x16 -- horizontal intra prediction for a 16x16 block.
; Every output row is one byte of l repeated 16 times. The loop runs four times
; (cnt = 3, 2, 1, 0); each pass loads 4 bytes of l and stores 4 rows, with the
; highest-indexed byte of the group going to the first row (row 0 <- l[15]).
; Macro parameter 2 is forwarded as cglobal's fourth argument.
; NOTE(review): rows are written with mova, so dst rows are presumably
; 16-byte aligned -- confirm against the caller.
561cabdff1aSopenharmony_cicglobal vp9_ipred_h_16x16, 3, 5, %2, dst, stride, l, stride3, cnt
562cabdff1aSopenharmony_ci%if cpuflag(ssse3)
; pshufb masks: pb_1/pb_2/pb_3 are defined outside this chunk (presumably the
; named value in every byte); m4 = 0 selects source byte 0 for every lane.
563cabdff1aSopenharmony_ci    mova                    m5, [pb_1]
564cabdff1aSopenharmony_ci    mova                    m6, [pb_2]
565cabdff1aSopenharmony_ci    mova                    m7, [pb_3]
566cabdff1aSopenharmony_ci    pxor                    m4, m4
567cabdff1aSopenharmony_ci%endif
568cabdff1aSopenharmony_ci    lea               stride3q, [strideq*3]
569cabdff1aSopenharmony_ci    mov                   cntq, 3
570cabdff1aSopenharmony_ci.loop:
571cabdff1aSopenharmony_ci    movd                    m3, [lq+cntq*4]         ; 4 edge bytes for this pass
572cabdff1aSopenharmony_ci%if cpuflag(ssse3)
573cabdff1aSopenharmony_ci    pshufb                  m0, m3, m7
574cabdff1aSopenharmony_ci    pshufb                  m1, m3, m6
575cabdff1aSopenharmony_ci%else
576cabdff1aSopenharmony_ci    punpcklbw               m3, m3                  ; bytes: 0 0 1 1 2 2 3 3
577cabdff1aSopenharmony_ci    punpcklwd               m3, m3                  ; each edge byte now fills one dword
578cabdff1aSopenharmony_ci    pshufd                  m0, m3, q3333           ; 16x byte3
579cabdff1aSopenharmony_ci    pshufd                  m1, m3, q2222           ; 16x byte2
580cabdff1aSopenharmony_ci%endif
581cabdff1aSopenharmony_ci    mova      [dstq+strideq*0], m0
582cabdff1aSopenharmony_ci    mova      [dstq+strideq*1], m1
583cabdff1aSopenharmony_ci%if cpuflag(ssse3)
584cabdff1aSopenharmony_ci    pshufb                  m2, m3, m5
585cabdff1aSopenharmony_ci    pshufb                  m3, m4                  ; zero mask: broadcast byte0
586cabdff1aSopenharmony_ci%else
587cabdff1aSopenharmony_ci    pshufd                  m2, m3, q1111           ; 16x byte1
588cabdff1aSopenharmony_ci    pshufd                  m3, m3, q0000           ; 16x byte0
589cabdff1aSopenharmony_ci%endif
590cabdff1aSopenharmony_ci    mova      [dstq+strideq*2], m2
591cabdff1aSopenharmony_ci    mova      [dstq+stride3q ], m3
592cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*4]    ; advance four rows
593cabdff1aSopenharmony_ci    dec                   cntq
594cabdff1aSopenharmony_ci    jge .loop                                       ; exit once cnt goes negative
595cabdff1aSopenharmony_ci    RET
596cabdff1aSopenharmony_ci
; vp9_ipred_h_32x32 -- horizontal intra prediction for a 32x32 block, using
; 16-byte vectors. Same scheme as the 16x16 version, but eight passes
; (cnt = 7..0) and two 16-byte stores per row (offsets +0 and +16).
; Within each group of 4 edge bytes the highest-indexed byte goes to the first
; row, so row 0 <- l[31].
; Macro parameter 2 is forwarded as cglobal's fourth argument.
597cabdff1aSopenharmony_cicglobal vp9_ipred_h_32x32, 3, 5, %2, dst, stride, l, stride3, cnt
598cabdff1aSopenharmony_ci%if cpuflag(ssse3)
; pshufb masks (pb_1/pb_2/pb_3 are defined outside this chunk); m4 = 0
; selects source byte 0 for every lane.
599cabdff1aSopenharmony_ci    mova                    m5, [pb_1]
600cabdff1aSopenharmony_ci    mova                    m6, [pb_2]
601cabdff1aSopenharmony_ci    mova                    m7, [pb_3]
602cabdff1aSopenharmony_ci    pxor                    m4, m4
603cabdff1aSopenharmony_ci%endif
604cabdff1aSopenharmony_ci    lea               stride3q, [strideq*3]
605cabdff1aSopenharmony_ci    mov                   cntq, 7
606cabdff1aSopenharmony_ci.loop:
607cabdff1aSopenharmony_ci    movd                    m3, [lq+cntq*4]         ; 4 edge bytes for this pass
608cabdff1aSopenharmony_ci%if cpuflag(ssse3)
609cabdff1aSopenharmony_ci    pshufb                  m0, m3, m7
610cabdff1aSopenharmony_ci    pshufb                  m1, m3, m6
611cabdff1aSopenharmony_ci%else
612cabdff1aSopenharmony_ci    punpcklbw               m3, m3                  ; bytes: 0 0 1 1 2 2 3 3
613cabdff1aSopenharmony_ci    punpcklwd               m3, m3                  ; each edge byte now fills one dword
614cabdff1aSopenharmony_ci    pshufd                  m0, m3, q3333           ; 16x byte3
615cabdff1aSopenharmony_ci    pshufd                  m1, m3, q2222           ; 16x byte2
616cabdff1aSopenharmony_ci%endif
617cabdff1aSopenharmony_ci    mova   [dstq+strideq*0+ 0], m0
618cabdff1aSopenharmony_ci    mova   [dstq+strideq*0+16], m0
619cabdff1aSopenharmony_ci    mova   [dstq+strideq*1+ 0], m1
620cabdff1aSopenharmony_ci    mova   [dstq+strideq*1+16], m1
621cabdff1aSopenharmony_ci%if cpuflag(ssse3)
622cabdff1aSopenharmony_ci    pshufb                  m2, m3, m5
623cabdff1aSopenharmony_ci    pshufb                  m3, m4                  ; zero mask: broadcast byte0
624cabdff1aSopenharmony_ci%else
625cabdff1aSopenharmony_ci    pshufd                  m2, m3, q1111           ; 16x byte1
626cabdff1aSopenharmony_ci    pshufd                  m3, m3, q0000           ; 16x byte0
627cabdff1aSopenharmony_ci%endif
628cabdff1aSopenharmony_ci    mova   [dstq+strideq*2+ 0], m2
629cabdff1aSopenharmony_ci    mova   [dstq+strideq*2+16], m2
630cabdff1aSopenharmony_ci    mova   [dstq+stride3q + 0], m3
631cabdff1aSopenharmony_ci    mova   [dstq+stride3q +16], m3
632cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*4]    ; advance four rows
633cabdff1aSopenharmony_ci    dec                   cntq
634cabdff1aSopenharmony_ci    jge .loop                                       ; exit once cnt goes negative
635cabdff1aSopenharmony_ci    RET
636cabdff1aSopenharmony_ci%endmacro
637cabdff1aSopenharmony_ci
; Expand H_XMM_FUNCS (presumably the macro closed just above; its header is
; outside this chunk) once per instruction-set selection. Inside that body the
; first argument is used by h_8x8 and the second by h_16x16 / h_32x32 as
; cglobal's fourth parameter; the non-SSSE3 paths touch fewer vector registers,
; hence the smaller numbers for sse2.
638cabdff1aSopenharmony_ciINIT_XMM sse2
639cabdff1aSopenharmony_ciH_XMM_FUNCS 2, 4
640cabdff1aSopenharmony_ciINIT_XMM ssse3
641cabdff1aSopenharmony_ciH_XMM_FUNCS 4, 8
642cabdff1aSopenharmony_ciINIT_XMM avx
643cabdff1aSopenharmony_ciH_XMM_FUNCS 4, 8
644cabdff1aSopenharmony_ci
; vp9_ipred_h_32x32 (AVX2) -- horizontal intra prediction for a 32x32 block
; with 32-byte vectors: one store per row instead of two. Only assembled when
; HAVE_AVX2_EXTERNAL is set. Eight passes (cnt = 7..0), four rows per pass;
; within each group of 4 edge bytes the order is byte3, byte2, byte1, byte0
; from the first row down (assuming pb_1/pb_2/pb_3, defined outside this
; chunk, hold the named index in every byte).
645cabdff1aSopenharmony_ci%if HAVE_AVX2_EXTERNAL
646cabdff1aSopenharmony_ciINIT_YMM avx2
647cabdff1aSopenharmony_cicglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt
648cabdff1aSopenharmony_ci    mova                    m5, [pb_1]
649cabdff1aSopenharmony_ci    mova                    m6, [pb_2]
650cabdff1aSopenharmony_ci    mova                    m7, [pb_3]
651cabdff1aSopenharmony_ci    pxor                    m4, m4                  ; zero mask: pshufb broadcasts byte0
652cabdff1aSopenharmony_ci    lea               stride3q, [strideq*3]
653cabdff1aSopenharmony_ci    mov                   cntq, 7
654cabdff1aSopenharmony_ci.loop:
655cabdff1aSopenharmony_ci    movd                   xm3, [lq+cntq*4]         ; 4 edge bytes for this pass
656cabdff1aSopenharmony_ci    vinserti128             m3, m3, xm3, 1          ; copy them to the high lane: pshufb is per 128-bit lane
657cabdff1aSopenharmony_ci    pshufb                  m0, m3, m7
658cabdff1aSopenharmony_ci    pshufb                  m1, m3, m6
659cabdff1aSopenharmony_ci    mova      [dstq+strideq*0], m0
660cabdff1aSopenharmony_ci    mova      [dstq+strideq*1], m1
661cabdff1aSopenharmony_ci    pshufb                  m2, m3, m5
662cabdff1aSopenharmony_ci    pshufb                  m3, m4
663cabdff1aSopenharmony_ci    mova      [dstq+strideq*2], m2
664cabdff1aSopenharmony_ci    mova      [dstq+stride3q ], m3
665cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*4]    ; advance four rows
666cabdff1aSopenharmony_ci    dec                   cntq
667cabdff1aSopenharmony_ci    jge .loop                                       ; exit once cnt goes negative
668cabdff1aSopenharmony_ci    RET
669cabdff1aSopenharmony_ci%endif
670cabdff1aSopenharmony_ci
671cabdff1aSopenharmony_ci; tm
672cabdff1aSopenharmony_ci
; TM_MMX_FUNCS -- emits vp9_ipred_tm_4x4 for the currently selected
; instruction set (8-byte vectors).
;
; vp9_ipred_tm_4x4(dst, stride, l, a):
;   out[y][x] = saturate_u8(L + a[x] - a[-1])
; where L is one byte of the left-edge array l per row. a[-1] is the byte just
; before the above-edge array. Two rows are produced per pass (cnt = 1, 0); of
; the two l bytes loaded at l + cnt*2, the higher-indexed one goes to the first
; row, so row 0 <- l[3] and row 3 <- l[0].
673cabdff1aSopenharmony_ci%macro TM_MMX_FUNCS 0
674cabdff1aSopenharmony_cicglobal vp9_ipred_tm_4x4, 4, 4, 0, dst, stride, l, a
675cabdff1aSopenharmony_ci    pxor                    m1, m1                  ; zero, used to widen bytes to words
676cabdff1aSopenharmony_ci    movd                    m0, [aq]                ; a[0..3]
677cabdff1aSopenharmony_ci    pinsrw                  m2, [aq-1], 0           ; word 0 = a[-1], a[0]; other words are don't-care
678cabdff1aSopenharmony_ci    punpcklbw               m0, m1                  ; a[0..3] as words
; The fourth argument register is used as the loop counter from here on; a is
; not read again.
679cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, l, cnt
680cabdff1aSopenharmony_ci%if cpuflag(ssse3)
; pw_m256 / pw_m255 are defined outside this chunk; used as pshufb masks they
; presumably expand source byte 0 / byte 1 into zero-extended words.
; m1 is reused here because the SSSE3 path no longer needs the zero.
681cabdff1aSopenharmony_ci    mova                    m3, [pw_m256]
682cabdff1aSopenharmony_ci    mova                    m1, [pw_m255]
683cabdff1aSopenharmony_ci    pshufb                  m2, m3
684cabdff1aSopenharmony_ci%else
685cabdff1aSopenharmony_ci    punpcklbw               m2, m1                  ; word 0 = a[-1]
686cabdff1aSopenharmony_ci    pshufw                  m2, m2, q0000           ; broadcast a[-1] to all words
687cabdff1aSopenharmony_ci%endif
688cabdff1aSopenharmony_ci    psubw                   m0, m2                  ; m0 = a[x] - a[-1], loop-invariant
689cabdff1aSopenharmony_ci    mov                   cntq, 1
690cabdff1aSopenharmony_ci.loop:
691cabdff1aSopenharmony_ci    pinsrw                  m2, [lq+cntq*2], 0      ; two edge bytes: l[2*cnt], l[2*cnt+1]
692cabdff1aSopenharmony_ci%if cpuflag(ssse3)
693cabdff1aSopenharmony_ci    pshufb                  m4, m2, m1
694cabdff1aSopenharmony_ci    pshufb                  m2, m3
695cabdff1aSopenharmony_ci%else
696cabdff1aSopenharmony_ci    punpcklbw               m2, m1                  ; widen both edge bytes to words
697cabdff1aSopenharmony_ci    pshufw                  m4, m2, q1111           ; broadcast l[2*cnt+1]
698cabdff1aSopenharmony_ci    pshufw                  m2, m2, q0000           ; broadcast l[2*cnt]
699cabdff1aSopenharmony_ci%endif
700cabdff1aSopenharmony_ci    paddw                   m4, m0
701cabdff1aSopenharmony_ci    paddw                   m2, m0
702cabdff1aSopenharmony_ci    packuswb                m4, m4                  ; saturate words to unsigned bytes
703cabdff1aSopenharmony_ci    packuswb                m2, m2
704cabdff1aSopenharmony_ci    movd      [dstq+strideq*0], m4
705cabdff1aSopenharmony_ci    movd      [dstq+strideq*1], m2
706cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*2]    ; advance two rows
707cabdff1aSopenharmony_ci    dec                   cntq
708cabdff1aSopenharmony_ci    jge .loop                                       ; exit once cnt goes negative
709cabdff1aSopenharmony_ci    RET
710cabdff1aSopenharmony_ci%endmacro
711cabdff1aSopenharmony_ci
; Expand TM_MMX_FUNCS twice: once for mmxext (takes the punpck/pshufw
; branches) and once for ssse3 (takes the pshufb branches).
712cabdff1aSopenharmony_ciINIT_MMX mmxext
713cabdff1aSopenharmony_ciTM_MMX_FUNCS
714cabdff1aSopenharmony_ciINIT_MMX ssse3
715cabdff1aSopenharmony_ciTM_MMX_FUNCS
716cabdff1aSopenharmony_ci
717cabdff1aSopenharmony_ci%macro TM_XMM_FUNCS 0
; vp9_ipred_tm_8x8(dst, stride, l, a):
;   out[y][x] = saturate_u8(L + a[x] - a[-1]),  x = 0..7
; where L is one byte of the left-edge array l per row. Two rows are produced
; per pass (cnt = 3..0); of the two l bytes loaded at l + cnt*2, the
; higher-indexed one goes to the first row, so row 0 <- l[7].
718cabdff1aSopenharmony_cicglobal vp9_ipred_tm_8x8, 4, 4, 5, dst, stride, l, a
719cabdff1aSopenharmony_ci    pxor                    m1, m1                  ; zero, used to widen bytes to words
720cabdff1aSopenharmony_ci    movh                    m0, [aq]                ; a[0..7]
721cabdff1aSopenharmony_ci    pinsrw                  m2, [aq-1], 0           ; word 0 = a[-1], a[0]; other words are don't-care
722cabdff1aSopenharmony_ci    punpcklbw               m0, m1                  ; a[0..7] as words
; The fourth argument register is used as the loop counter from here on.
723cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, l, cnt
724cabdff1aSopenharmony_ci%if cpuflag(ssse3)
; pw_m256 / pw_m255 are defined outside this chunk; used as pshufb masks they
; presumably expand source byte 0 / byte 1 into zero-extended words.
725cabdff1aSopenharmony_ci    mova                    m3, [pw_m256]
726cabdff1aSopenharmony_ci    mova                    m1, [pw_m255]           ; m1 reused: zero not needed on this path
727cabdff1aSopenharmony_ci    pshufb                  m2, m3
728cabdff1aSopenharmony_ci%else
729cabdff1aSopenharmony_ci    punpcklbw               m2, m1                  ; word 0 = a[-1]
730cabdff1aSopenharmony_ci    punpcklwd               m2, m2                  ; dword 0 = a[-1], a[-1]
731cabdff1aSopenharmony_ci    pshufd                  m2, m2, q0000           ; broadcast a[-1] to all words
732cabdff1aSopenharmony_ci%endif
733cabdff1aSopenharmony_ci    psubw                   m0, m2                  ; m0 = a[x] - a[-1], loop-invariant
734cabdff1aSopenharmony_ci    mov                   cntq, 3
735cabdff1aSopenharmony_ci.loop:
736cabdff1aSopenharmony_ci    pinsrw                  m2, [lq+cntq*2], 0      ; two edge bytes: l[2*cnt], l[2*cnt+1]
737cabdff1aSopenharmony_ci%if cpuflag(ssse3)
738cabdff1aSopenharmony_ci    pshufb                  m4, m2, m1
739cabdff1aSopenharmony_ci    pshufb                  m2, m3
740cabdff1aSopenharmony_ci%else
741cabdff1aSopenharmony_ci    punpcklbw               m2, m1                  ; widen both edge bytes to words
742cabdff1aSopenharmony_ci    punpcklwd               m2, m2                  ; dword 0 = 2x l[2*cnt], dword 1 = 2x l[2*cnt+1]
743cabdff1aSopenharmony_ci    pshufd                  m4, m2, q1111           ; broadcast l[2*cnt+1]
744cabdff1aSopenharmony_ci    pshufd                  m2, m2, q0000           ; broadcast l[2*cnt]
745cabdff1aSopenharmony_ci%endif
746cabdff1aSopenharmony_ci    paddw                   m4, m0
747cabdff1aSopenharmony_ci    paddw                   m2, m0
748cabdff1aSopenharmony_ci    packuswb                m4, m2                  ; low 8 bytes: first row, high 8 bytes: second row
749cabdff1aSopenharmony_ci    movh      [dstq+strideq*0], m4
750cabdff1aSopenharmony_ci    movhps    [dstq+strideq*1], m4
751cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*2]    ; advance two rows
752cabdff1aSopenharmony_ci    dec                   cntq
753cabdff1aSopenharmony_ci    jge .loop                                       ; exit once cnt goes negative
754cabdff1aSopenharmony_ci    RET
755cabdff1aSopenharmony_ci
; vp9_ipred_tm_16x16(dst, stride, l, a):
;   out[y][x] = saturate_u8(L + a[x] - a[-1]),  x = 0..15
; where L is one byte of the left-edge array l per row. The 16 differences
; a[x] - a[-1] are kept as words in m0 (x = 0..7) and m1 (x = 8..15). Two rows
; are produced per pass (cnt = 7..0); the higher-indexed l byte of each pair
; goes to the first row, so row 0 <- l[15].
; NOTE(review): a and dst are accessed with mova, so they are presumably
; 16-byte aligned -- confirm against the caller.
756cabdff1aSopenharmony_cicglobal vp9_ipred_tm_16x16, 4, 4, 8, dst, stride, l, a
757cabdff1aSopenharmony_ci    pxor                    m3, m3                  ; zero, used to widen bytes to words
758cabdff1aSopenharmony_ci    mova                    m0, [aq]                ; a[0..15]
759cabdff1aSopenharmony_ci    pinsrw                  m2, [aq-1], 0           ; word 0 = a[-1], a[0]; other words are don't-care
760cabdff1aSopenharmony_ci    punpckhbw               m1, m0, m3              ; a[8..15] as words
761cabdff1aSopenharmony_ci    punpcklbw               m0, m3                  ; a[0..7] as words
; The fourth argument register is used as the loop counter from here on.
762cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, l, cnt
763cabdff1aSopenharmony_ci%if cpuflag(ssse3)
; pw_m256 / pw_m255 are defined outside this chunk; used as pshufb masks they
; presumably expand source byte 0 / byte 1 into zero-extended words.
764cabdff1aSopenharmony_ci    mova                    m4, [pw_m256]
765cabdff1aSopenharmony_ci    mova                    m3, [pw_m255]           ; m3 reused: zero not needed on this path
766cabdff1aSopenharmony_ci    pshufb                  m2, m4
767cabdff1aSopenharmony_ci%else
768cabdff1aSopenharmony_ci    punpcklbw               m2, m3                  ; word 0 = a[-1]
769cabdff1aSopenharmony_ci    punpcklwd               m2, m2
770cabdff1aSopenharmony_ci    pshufd                  m2, m2, q0000           ; broadcast a[-1] to all words
771cabdff1aSopenharmony_ci%endif
772cabdff1aSopenharmony_ci    psubw                   m1, m2                  ; a[8..15] - a[-1]
773cabdff1aSopenharmony_ci    psubw                   m0, m2                  ; a[0..7]  - a[-1]
774cabdff1aSopenharmony_ci    mov                   cntq, 7
775cabdff1aSopenharmony_ci.loop:
776cabdff1aSopenharmony_ci    pinsrw                  m7, [lq+cntq*2], 0      ; two edge bytes: l[2*cnt], l[2*cnt+1]
777cabdff1aSopenharmony_ci%if cpuflag(ssse3)
778cabdff1aSopenharmony_ci    pshufb                  m5, m7, m3
779cabdff1aSopenharmony_ci    pshufb                  m7, m4
780cabdff1aSopenharmony_ci%else
781cabdff1aSopenharmony_ci    punpcklbw               m7, m3                  ; widen both edge bytes to words
782cabdff1aSopenharmony_ci    punpcklwd               m7, m7
783cabdff1aSopenharmony_ci    pshufd                  m5, m7, q1111           ; broadcast l[2*cnt+1]
784cabdff1aSopenharmony_ci    pshufd                  m7, m7, q0000           ; broadcast l[2*cnt]
785cabdff1aSopenharmony_ci%endif
786cabdff1aSopenharmony_ci    paddw                   m2, m5, m0              ; first row, columns 0..7
787cabdff1aSopenharmony_ci    paddw                   m5, m1                  ; first row, columns 8..15
788cabdff1aSopenharmony_ci    paddw                   m6, m7, m0              ; second row, columns 0..7
789cabdff1aSopenharmony_ci    paddw                   m7, m1                  ; second row, columns 8..15
790cabdff1aSopenharmony_ci    packuswb                m2, m5                  ; saturate to unsigned bytes
791cabdff1aSopenharmony_ci    packuswb                m6, m7
792cabdff1aSopenharmony_ci    mova      [dstq+strideq*0], m2
793cabdff1aSopenharmony_ci    mova      [dstq+strideq*1], m6
794cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*2]    ; advance two rows
795cabdff1aSopenharmony_ci    dec                   cntq
796cabdff1aSopenharmony_ci    jge .loop                                       ; exit once cnt goes negative
797cabdff1aSopenharmony_ci    RET
798cabdff1aSopenharmony_ci
; vp9_ipred_tm_32x32(dst, stride, l, a), 16-byte-vector version:
;   out[y][x] = saturate_u8(L + a[x] - a[-1]),  x = 0..31
; where L is one byte of the left-edge array l per row. The 32 differences
; a[x] - a[-1] occupy four word vectors that stay constant for the whole loop.
; On x86-64 they are kept in m8-m11; otherwise they are spilled to the 64 bytes
; at [rsp], which is what `mem` sizes (4 vectors x 16 bytes; passed to cglobal,
; presumably as its stack-size argument per x86inc convention).
; Two rows are produced per pass (cnt = 15..0); the higher-indexed l byte of
; each pair goes to the first row, so row 0 <- l[31].
799cabdff1aSopenharmony_ci%if ARCH_X86_64
800cabdff1aSopenharmony_ci%define mem 0
801cabdff1aSopenharmony_ci%else
802cabdff1aSopenharmony_ci%define mem 64
803cabdff1aSopenharmony_ci%endif
804cabdff1aSopenharmony_cicglobal vp9_ipred_tm_32x32, 4, 4, 14, mem, dst, stride, l, a
805cabdff1aSopenharmony_ci    pxor                    m5, m5                  ; zero, used to widen bytes to words
806cabdff1aSopenharmony_ci    pinsrw                  m4, [aq-1], 0           ; word 0 = a[-1], a[0]; other words are don't-care
807cabdff1aSopenharmony_ci    mova                    m0, [aq]                ; a[0..15]
808cabdff1aSopenharmony_ci    mova                    m2, [aq+16]             ; a[16..31]
; The fourth argument register is used as the loop counter from here on.
809cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, l, cnt
810cabdff1aSopenharmony_ci%if cpuflag(ssse3)
; pw_m256 / pw_m255 (defined outside this chunk) are used as pshufb masks.
; With 16 vector registers they are cached in m12/m13; otherwise they are
; used as memory operands.
811cabdff1aSopenharmony_ci%if ARCH_X86_64
812cabdff1aSopenharmony_ci    mova                   m12, [pw_m256]
813cabdff1aSopenharmony_ci    mova                   m13, [pw_m255]
814cabdff1aSopenharmony_ci%define pw_m256_reg m12
815cabdff1aSopenharmony_ci%define pw_m255_reg m13
816cabdff1aSopenharmony_ci%else
817cabdff1aSopenharmony_ci%define pw_m256_reg [pw_m256]
818cabdff1aSopenharmony_ci%define pw_m255_reg [pw_m255]
819cabdff1aSopenharmony_ci%endif
820cabdff1aSopenharmony_ci    pshufb                  m4, pw_m256_reg
821cabdff1aSopenharmony_ci%else
822cabdff1aSopenharmony_ci    punpcklbw               m4, m5                  ; word 0 = a[-1]
823cabdff1aSopenharmony_ci    punpcklwd               m4, m4
824cabdff1aSopenharmony_ci    pshufd                  m4, m4, q0000           ; broadcast a[-1] to all words
825cabdff1aSopenharmony_ci%endif
826cabdff1aSopenharmony_ci    punpckhbw               m1, m0,  m5             ; a[8..15]  as words
827cabdff1aSopenharmony_ci    punpckhbw               m3, m2,  m5             ; a[24..31] as words
828cabdff1aSopenharmony_ci    punpcklbw               m0, m5                  ; a[0..7]   as words
829cabdff1aSopenharmony_ci    punpcklbw               m2, m5                  ; a[16..23] as words
830cabdff1aSopenharmony_ci    psubw                   m1, m4
831cabdff1aSopenharmony_ci    psubw                   m0, m4
832cabdff1aSopenharmony_ci    psubw                   m3, m4
833cabdff1aSopenharmony_ci    psubw                   m2, m4
834cabdff1aSopenharmony_ci%if ARCH_X86_64
; After these the four difference vectors are referenced as m8-m11, leaving
; the names m0-m7 for loop temporaries.
835cabdff1aSopenharmony_ci    SWAP                     0, 8
836cabdff1aSopenharmony_ci    SWAP                     1, 9
837cabdff1aSopenharmony_ci    SWAP                     2, 10
838cabdff1aSopenharmony_ci    SWAP                     3, 11
839cabdff1aSopenharmony_ci%else
; Spill the four difference vectors; the loop reads them back as memory operands.
840cabdff1aSopenharmony_ci    mova            [rsp+0*16], m0
841cabdff1aSopenharmony_ci    mova            [rsp+1*16], m1
842cabdff1aSopenharmony_ci    mova            [rsp+2*16], m2
843cabdff1aSopenharmony_ci    mova            [rsp+3*16], m3
844cabdff1aSopenharmony_ci%endif
845cabdff1aSopenharmony_ci    mov                   cntq, 15
846cabdff1aSopenharmony_ci.loop:
847cabdff1aSopenharmony_ci    pinsrw                  m3, [lq+cntq*2], 0      ; two edge bytes: l[2*cnt], l[2*cnt+1]
848cabdff1aSopenharmony_ci%if cpuflag(ssse3)
849cabdff1aSopenharmony_ci    pshufb                  m7, m3, pw_m255_reg
850cabdff1aSopenharmony_ci    pshufb                  m3, pw_m256_reg
851cabdff1aSopenharmony_ci%else
852cabdff1aSopenharmony_ci    pxor                    m7, m7                  ; fresh zero: m5 is overwritten inside the loop
853cabdff1aSopenharmony_ci    punpcklbw               m3, m7                  ; widen both edge bytes to words
854cabdff1aSopenharmony_ci    punpcklwd               m3, m3
855cabdff1aSopenharmony_ci    pshufd                  m7, m3, q1111           ; broadcast l[2*cnt+1]
856cabdff1aSopenharmony_ci    pshufd                  m3, m3, q0000           ; broadcast l[2*cnt]
857cabdff1aSopenharmony_ci%endif
; m4-m7: first row (four groups of 8 columns); m0-m3: second row.
858cabdff1aSopenharmony_ci%if ARCH_X86_64
859cabdff1aSopenharmony_ci    paddw                   m4, m7, m8
860cabdff1aSopenharmony_ci    paddw                   m5, m7, m9
861cabdff1aSopenharmony_ci    paddw                   m6, m7, m10
862cabdff1aSopenharmony_ci    paddw                   m7, m11
863cabdff1aSopenharmony_ci    paddw                   m0, m3, m8
864cabdff1aSopenharmony_ci    paddw                   m1, m3, m9
865cabdff1aSopenharmony_ci    paddw                   m2, m3, m10
866cabdff1aSopenharmony_ci    paddw                   m3, m11
867cabdff1aSopenharmony_ci%else
868cabdff1aSopenharmony_ci    paddw                   m4, m7, [rsp+0*16]
869cabdff1aSopenharmony_ci    paddw                   m5, m7, [rsp+1*16]
870cabdff1aSopenharmony_ci    paddw                   m6, m7, [rsp+2*16]
871cabdff1aSopenharmony_ci    paddw                   m7, [rsp+3*16]
872cabdff1aSopenharmony_ci    paddw                   m0, m3, [rsp+0*16]
873cabdff1aSopenharmony_ci    paddw                   m1, m3, [rsp+1*16]
874cabdff1aSopenharmony_ci    paddw                   m2, m3, [rsp+2*16]
875cabdff1aSopenharmony_ci    paddw                   m3, [rsp+3*16]
876cabdff1aSopenharmony_ci%endif
877cabdff1aSopenharmony_ci    packuswb                m4, m5                  ; saturate to unsigned bytes
878cabdff1aSopenharmony_ci    packuswb                m6, m7
879cabdff1aSopenharmony_ci    packuswb                m0, m1
880cabdff1aSopenharmony_ci    packuswb                m2, m3
881cabdff1aSopenharmony_ci    mova   [dstq+strideq*0+ 0], m4
882cabdff1aSopenharmony_ci    mova   [dstq+strideq*0+16], m6
883cabdff1aSopenharmony_ci    mova   [dstq+strideq*1+ 0], m0
884cabdff1aSopenharmony_ci    mova   [dstq+strideq*1+16], m2
885cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*2]    ; advance two rows
886cabdff1aSopenharmony_ci    dec                   cntq
887cabdff1aSopenharmony_ci    jge .loop                                       ; exit once cnt goes negative
888cabdff1aSopenharmony_ci    RET
; Drop the helper defines so the next expansion of the macro starts clean.
889cabdff1aSopenharmony_ci%undef pw_m256_reg
890cabdff1aSopenharmony_ci%undef pw_m255_reg
891cabdff1aSopenharmony_ci%undef mem
892cabdff1aSopenharmony_ci%endmacro
893cabdff1aSopenharmony_ci
; Expand TM_XMM_FUNCS (tm_8x8, tm_16x16, tm_32x32) once each for sse2, ssse3
; and avx. Only the sse2 expansion takes the non-SSSE3 (punpck/pshufd)
; branches of the macro body.
894cabdff1aSopenharmony_ciINIT_XMM sse2
895cabdff1aSopenharmony_ciTM_XMM_FUNCS
896cabdff1aSopenharmony_ciINIT_XMM ssse3
897cabdff1aSopenharmony_ciTM_XMM_FUNCS
898cabdff1aSopenharmony_ciINIT_XMM avx
899cabdff1aSopenharmony_ciTM_XMM_FUNCS
900cabdff1aSopenharmony_ci
; vp9_ipred_tm_32x32 (AVX2), 32-byte vectors; only assembled when
; HAVE_AVX2_EXTERNAL is set.
;   out[y][x] = saturate_u8(L + a[x] - a[-1]),  x = 0..31
; where L is one byte of the left-edge array l per row. Two rows are produced
; per pass (cnt = 15..0); the higher-indexed l byte of each pair goes to the
; first row (assuming the pw_m255 / pw_m256 pshufb masks, defined outside this
; chunk, select source byte 1 / byte 0).
; Lane note: punpck and packuswb work inside each 128-bit lane, so m0 holds
; columns 0..7 | 16..23 and m1 holds columns 8..15 | 24..31; packuswb puts them
; back in natural order.
901cabdff1aSopenharmony_ci%if HAVE_AVX2_EXTERNAL
902cabdff1aSopenharmony_ciINIT_YMM avx2
903cabdff1aSopenharmony_cicglobal vp9_ipred_tm_32x32, 4, 4, 8, dst, stride, l, a
904cabdff1aSopenharmony_ci    pxor                    m3, m3                  ; zero, used to widen bytes to words
905cabdff1aSopenharmony_ci    pinsrw                 xm2, [aq-1], 0           ; word 0 = a[-1], a[0]
906cabdff1aSopenharmony_ci    vinserti128             m2, m2, xm2, 1          ; duplicate into the high lane for the per-lane pshufb
907cabdff1aSopenharmony_ci    mova                    m0, [aq]                ; a[0..31]
; The fourth argument register is used as the loop counter from here on.
908cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, l, cnt
909cabdff1aSopenharmony_ci    mova                    m4, [pw_m256]
910cabdff1aSopenharmony_ci    mova                    m5, [pw_m255]
911cabdff1aSopenharmony_ci    pshufb                  m2, m4                  ; a[-1] broadcast as words
912cabdff1aSopenharmony_ci    punpckhbw               m1, m0, m3
913cabdff1aSopenharmony_ci    punpcklbw               m0, m3
914cabdff1aSopenharmony_ci    psubw                   m1, m2                  ; a[x] - a[-1], loop-invariant
915cabdff1aSopenharmony_ci    psubw                   m0, m2
916cabdff1aSopenharmony_ci    mov                   cntq, 15
917cabdff1aSopenharmony_ci.loop:
918cabdff1aSopenharmony_ci    pinsrw                 xm7, [lq+cntq*2], 0      ; two edge bytes: l[2*cnt], l[2*cnt+1]
919cabdff1aSopenharmony_ci    vinserti128             m7, m7, xm7, 1          ; duplicate into the high lane
920cabdff1aSopenharmony_ci    pshufb                  m3, m7, m5
921cabdff1aSopenharmony_ci    pshufb                  m7, m4
922cabdff1aSopenharmony_ci    paddw                   m2, m3, m0
923cabdff1aSopenharmony_ci    paddw                   m3, m1
924cabdff1aSopenharmony_ci    paddw                   m6, m7, m0
925cabdff1aSopenharmony_ci    paddw                   m7, m1
926cabdff1aSopenharmony_ci    packuswb                m2, m3                  ; first row, saturated to unsigned bytes
927cabdff1aSopenharmony_ci    packuswb                m6, m7                  ; second row
928cabdff1aSopenharmony_ci    mova      [dstq+strideq*0], m2
929cabdff1aSopenharmony_ci    mova      [dstq+strideq*1], m6
930cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*2]    ; advance two rows
931cabdff1aSopenharmony_ci    dec                   cntq
932cabdff1aSopenharmony_ci    jge .loop                                       ; exit once cnt goes negative
933cabdff1aSopenharmony_ci    RET
934cabdff1aSopenharmony_ci%endif
935cabdff1aSopenharmony_ci
936cabdff1aSopenharmony_ci; dl
937cabdff1aSopenharmony_ci
; LOWPASS left, center, right, tmp   (arguments are vector register numbers)
; Per-byte 3-tap smoothing filter:
;   m<left> = (left + 2*center + right + 2) >> 2
; The result overwrites m<left>; m<tmp> is clobbered; center and right are
; only read. Exactness relies on pb_1 (defined outside this chunk) holding
; 0x01 in every byte.
938cabdff1aSopenharmony_ci%macro LOWPASS 4 ; left [dst], center, right, tmp
939cabdff1aSopenharmony_ci    pxor                   m%4, m%1, m%3            ; tmp = left ^ right
940cabdff1aSopenharmony_ci    pand                   m%4, [pb_1]              ; tmp = 1 where left + right is odd
941cabdff1aSopenharmony_ci    pavgb                  m%1, m%3                 ; (left + right + 1) >> 1
942cabdff1aSopenharmony_ci    psubusb                m%1, m%4                 ; undo the round-up: (left + right) >> 1
943cabdff1aSopenharmony_ci    pavgb                  m%1, m%2                 ; (((left + right) >> 1) + center + 1) >> 1
944cabdff1aSopenharmony_ci%endmacro
945cabdff1aSopenharmony_ci
; DL_MMX_FUNCS -- emits vp9_ipred_dl_4x4 for the currently selected
; instruction set (8-byte vectors).
;
; vp9_ipred_dl_4x4(dst, stride, l, a): reads 8 bytes of the above-edge array a
; (l is not read). Let f[i] = LOWPASS(a[i], a[i+1], a[i+2]), with a[7] standing
; in for entries past the end. Row y of the 4x4 output is f[y .. y+3], i.e.
; each row is the previous one shifted by one sample.
946cabdff1aSopenharmony_ci%macro DL_MMX_FUNCS 0
947cabdff1aSopenharmony_cicglobal vp9_ipred_dl_4x4, 4, 4, 0, dst, stride, l, a
948cabdff1aSopenharmony_ci    movq                    m1, [aq]
949cabdff1aSopenharmony_ci%if cpuflag(ssse3)
; Shuffle constants are defined outside this chunk; by name they presumably
; build the same 01234577 / 23456777 layouts as the branch below.
950cabdff1aSopenharmony_ci    pshufb                  m0, m1, [pb_0to5_2x7]
951cabdff1aSopenharmony_ci    pshufb                  m2, m1, [pb_2to6_3x7]
952cabdff1aSopenharmony_ci%else
953cabdff1aSopenharmony_ci    punpckhbw               m3, m1, m1              ; 44556677
954cabdff1aSopenharmony_ci    pand                    m0, m1, [pb_6xm1_2x0]   ; 012345__
955cabdff1aSopenharmony_ci    pand                    m3, [pb_6x0_2xm1]       ; ______77
956cabdff1aSopenharmony_ci    psrlq                   m2, m1, 16              ; 234567__
957cabdff1aSopenharmony_ci    por                     m0, m3                  ; 01234577
958cabdff1aSopenharmony_ci    por                     m2, m3                  ; 23456777
959cabdff1aSopenharmony_ci%endif
960cabdff1aSopenharmony_ci    psrlq                   m1, 8                   ; center tap: 1234567_
961cabdff1aSopenharmony_ci    LOWPASS                  0, 1, 2, 3
962cabdff1aSopenharmony_ci
; m0 = f[0..7]. m1 gets words 1,2,3,3 of m0, so its low dword is f[2..5].
; The last byte of f is never stored.
963cabdff1aSopenharmony_ci    pshufw                  m1, m0, q3321
964cabdff1aSopenharmony_ci    movd      [dstq+strideq*0], m0                  ; row 0 = f[0..3]
965cabdff1aSopenharmony_ci    movd      [dstq+strideq*2], m1                  ; row 2 = f[2..5]
966cabdff1aSopenharmony_ci    psrlq                   m0, 8
967cabdff1aSopenharmony_ci    psrlq                   m1, 8
968cabdff1aSopenharmony_ci    add                   dstq, strideq
969cabdff1aSopenharmony_ci    movd      [dstq+strideq*0], m0                  ; row 1 = f[1..4]
970cabdff1aSopenharmony_ci    movd      [dstq+strideq*2], m1                  ; row 3 = f[3..6]
971cabdff1aSopenharmony_ci    RET
972cabdff1aSopenharmony_ci%endmacro
973cabdff1aSopenharmony_ci
; Expand DL_MMX_FUNCS twice: once for mmxext (mask-and-or edge padding) and
; once for ssse3 (pshufb edge padding).
974cabdff1aSopenharmony_ciINIT_MMX mmxext
975cabdff1aSopenharmony_ciDL_MMX_FUNCS
976cabdff1aSopenharmony_ciINIT_MMX ssse3
977cabdff1aSopenharmony_ciDL_MMX_FUNCS
978cabdff1aSopenharmony_ci
979cabdff1aSopenharmony_ci%macro DL_XMM_FUNCS 0
; vp9_ipred_dl_8x8(dst, stride, <unused l>, a): reads 8 bytes of the above-edge
; array a. The third argument register is never read as l; it is named stride5
; and reused to hold 5*stride. Let f[i] = LOWPASS(a[i], a[i+1], a[i+2]) with
; a[7] standing in for entries past the end. Row y of the 8x8 output is
; f[y .. y+7].
980cabdff1aSopenharmony_cicglobal vp9_ipred_dl_8x8, 4, 4, 4, dst, stride, stride5, a
981cabdff1aSopenharmony_ci    movq                    m0, [aq]                ; a[0..7] in the low half
982cabdff1aSopenharmony_ci    lea               stride5q, [strideq*5]
983cabdff1aSopenharmony_ci%if cpuflag(ssse3)
; Constant defined outside this chunk; by name presumably a[1..6] followed by
; ten copies of a[7], which doubles as the center tap below.
984cabdff1aSopenharmony_ci    pshufb                  m1, m0, [pb_1to6_10x7]
985cabdff1aSopenharmony_ci%else
986cabdff1aSopenharmony_ci    punpcklbw               m1, m0, m0              ; 0011223344556677
987cabdff1aSopenharmony_ci    punpckhwd               m1, m1                  ; 4x4,4x5,4x6,4x7
988cabdff1aSopenharmony_ci%endif
989cabdff1aSopenharmony_ci    shufps                  m0, m1, q3310           ; left tap: a[0..7] then 8x a[7]
990cabdff1aSopenharmony_ci%if notcpuflag(ssse3)
991cabdff1aSopenharmony_ci    psrldq                  m1, m0, 1               ; a[1..7], 8x a[7], then a zero byte
992cabdff1aSopenharmony_ci    shufps                  m1, m0, q3210           ; high half taken from m0: removes the zero byte
993cabdff1aSopenharmony_ci%endif
994cabdff1aSopenharmony_ci    psrldq                  m2, m1, 1               ; right tap
995cabdff1aSopenharmony_ci    LOWPASS                  0, 1, 2, 3
996cabdff1aSopenharmony_ci
; m0 = f[0..15]. m1 gets dwords 1,2,3,3 of m0, so it starts at f[4] and
; serves rows 4..7 while m0 serves rows 0..3.
997cabdff1aSopenharmony_ci    pshufd                  m1, m0, q3321
998cabdff1aSopenharmony_ci    movq      [dstq+strideq*0], m0                  ; row 0
999cabdff1aSopenharmony_ci    movq      [dstq+strideq*4], m1                  ; row 4
1000cabdff1aSopenharmony_ci    psrldq                  m0, 1
1001cabdff1aSopenharmony_ci    psrldq                  m1, 1
1002cabdff1aSopenharmony_ci    movq      [dstq+strideq*1], m0                  ; row 1
1003cabdff1aSopenharmony_ci    movq      [dstq+stride5q ], m1                  ; row 5
1004cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*2]
1005cabdff1aSopenharmony_ci    psrldq                  m0, 1
1006cabdff1aSopenharmony_ci    psrldq                  m1, 1
1007cabdff1aSopenharmony_ci    movq      [dstq+strideq*0], m0                  ; row 2
1008cabdff1aSopenharmony_ci    movq      [dstq+strideq*4], m1                  ; row 6
1009cabdff1aSopenharmony_ci    psrldq                  m0, 1
1010cabdff1aSopenharmony_ci    psrldq                  m1, 1
1011cabdff1aSopenharmony_ci    movq      [dstq+strideq*1], m0                  ; row 3
1012cabdff1aSopenharmony_ci    movq      [dstq+stride5q ], m1                  ; row 7
1013cabdff1aSopenharmony_ci    RET
1014cabdff1aSopenharmony_ci
; vp9_ipred_dl_16x16(dst, stride, l, a): reads 16 bytes of the above-edge
; array a (l is not read). Let f[i] = LOWPASS(a[i], a[i+1], a[i+2]) with a[15]
; standing in for entries past the end. Row y of the output is f[y .. y+15],
; which becomes all a[15] once the window runs past the end.
; Register roles in the loop:
;   m0 : current row (rows 0..7); shifted down one byte per row, with the last
;        edge byte refilled at the top
;   m4 : row y+8 = high half of m0 followed by eight copies of the last byte
;   m5 : shift helper: a pshufb mask with SSSE3 (pb_1toE_2xF, defined outside
;        this chunk), otherwise the last edge byte isolated in byte 15
; NOTE(review): a and dst are accessed with mova, so they are presumably
; 16-byte aligned -- confirm against the caller.
1015cabdff1aSopenharmony_cicglobal vp9_ipred_dl_16x16, 4, 4, 6, dst, stride, l, a
1016cabdff1aSopenharmony_ci    mova                    m0, [aq]
1017cabdff1aSopenharmony_ci%if cpuflag(ssse3)
1018cabdff1aSopenharmony_ci    mova                    m5, [pb_1toE_2xF]
1019cabdff1aSopenharmony_ci    pshufb                  m1, m0, m5              ; center tap
1020cabdff1aSopenharmony_ci    pshufb                  m2, m1, m5              ; right tap
1021cabdff1aSopenharmony_ci    pshufb                  m4, m0, [pb_15]         ; presumably the last edge byte in every lane
1022cabdff1aSopenharmony_ci%else
1023cabdff1aSopenharmony_ci    pand                    m5, m0, [pb_15x0_1xm1]      ; _______________F
1024cabdff1aSopenharmony_ci    psrldq                  m1, m0, 1                   ; 123456789ABCDEF_
1025cabdff1aSopenharmony_ci    por                     m1, m5                      ; 123456789ABCDEFF
1026cabdff1aSopenharmony_ci    psrldq                  m2, m1, 1                   ; 23456789ABCDEFF_
1027cabdff1aSopenharmony_ci    por                     m2, m5                      ; 23456789ABCDEFFF
1028cabdff1aSopenharmony_ci    pshufhw                 m4, m1, q3333               ; xxxxxxxxFFFFFFFF
1029cabdff1aSopenharmony_ci%endif
1030cabdff1aSopenharmony_ci    LOWPASS                  0, 1, 2, 3
; a is not read again: the third and fourth argument registers are used as
; cnt and stride9 from here on.
1031cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, cnt, stride9
1032cabdff1aSopenharmony_ci    lea               stride9q, [strideq+strideq*8]
1033cabdff1aSopenharmony_ci    mov                   cntd, 4                   ; 4 passes x (2 upper + 2 lower rows) = 16 rows
1034cabdff1aSopenharmony_ci
1035cabdff1aSopenharmony_ci.loop:
1036cabdff1aSopenharmony_ci    movhlps                 m4, m0                  ; low half of m4 <- high half of m0; high half keeps the padding
1037cabdff1aSopenharmony_ci    mova      [dstq+strideq*0], m0                  ; row y
1038cabdff1aSopenharmony_ci%if cpuflag(ssse3)
1039cabdff1aSopenharmony_ci    pshufb                  m0, m5                  ; advance one sample
1040cabdff1aSopenharmony_ci%else
1041cabdff1aSopenharmony_ci    psrldq                  m0, 1                   ; advance one sample ...
1042cabdff1aSopenharmony_ci    por                     m0, m5                  ; ... and refill the top byte
1043cabdff1aSopenharmony_ci%endif
1044cabdff1aSopenharmony_ci    mova      [dstq+strideq*8], m4                  ; row y+8
1045cabdff1aSopenharmony_ci    movhlps                 m4, m0
1046cabdff1aSopenharmony_ci    mova      [dstq+strideq*1], m0                  ; row y+1
1047cabdff1aSopenharmony_ci%if cpuflag(ssse3)
1048cabdff1aSopenharmony_ci    pshufb                  m0, m5
1049cabdff1aSopenharmony_ci%else
1050cabdff1aSopenharmony_ci    psrldq                  m0, 1
1051cabdff1aSopenharmony_ci    por                     m0, m5
1052cabdff1aSopenharmony_ci%endif
1053cabdff1aSopenharmony_ci    mova      [dstq+stride9q ], m4                  ; row y+9
1054cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*2]
1055cabdff1aSopenharmony_ci    dec                   cntd
1056cabdff1aSopenharmony_ci    jg .loop
1057cabdff1aSopenharmony_ci    RET
1058cabdff1aSopenharmony_ci
; vp9_ipred_dl_32x32(dst, stride, <unused l>, a): reads 32 bytes of the
; above-edge array a. The third argument register is never read as l; it is
; used as the loop counter, and dst16 points 16 rows below dst.
; Let f[i] = LOWPASS(a[i], a[i+1], a[i+2]) with a[31] standing in for entries
; past the end. Row y of the output is f[y .. y+31].
; Register roles in the loop:
;   m0:m1 : f window for row y (low and high 16 bytes), advanced one sample
;           per pass
;   m7    : high half of m1 followed by eight copies of a[31]
;   m6    : a[31] in every byte
;   m5    : shift helper: a pshufb mask with SSSE3 (defined outside this
;           chunk), otherwise a[31] isolated in byte 15
; Each pass writes rows y, y+8, y+16 and y+24; 8 passes cover all 32 rows.
1059cabdff1aSopenharmony_cicglobal vp9_ipred_dl_32x32, 4, 5, 8, dst, stride, cnt, a, dst16
1060cabdff1aSopenharmony_ci    mova                    m0, [aq]                ; a[0..15]
1061cabdff1aSopenharmony_ci    mova                    m1, [aq+16]             ; a[16..31]
1062cabdff1aSopenharmony_ci    PALIGNR                 m2, m1, m0, 1, m4       ; a[1..16]
1063cabdff1aSopenharmony_ci    PALIGNR                 m3, m1, m0, 2, m4       ; a[2..17]
1064cabdff1aSopenharmony_ci    LOWPASS                  0, 2, 3, 4             ; m0 = f[0..15]
1065cabdff1aSopenharmony_ci%if cpuflag(ssse3)
1066cabdff1aSopenharmony_ci    mova                    m5, [pb_1toE_2xF]
1067cabdff1aSopenharmony_ci    pshufb                  m2, m1, m5              ; center tap for the high half
1068cabdff1aSopenharmony_ci    pshufb                  m3, m2, m5              ; right tap for the high half
1069cabdff1aSopenharmony_ci    pshufb                  m6, m1, [pb_15]         ; presumably a[31] in every lane
1070cabdff1aSopenharmony_ci    mova                    m7, m6
1071cabdff1aSopenharmony_ci%else
1072cabdff1aSopenharmony_ci    pand                    m5, m1, [pb_15x0_1xm1]      ; _______________F
1073cabdff1aSopenharmony_ci    psrldq                  m2, m1, 1                   ; 123456789ABCDEF_
1074cabdff1aSopenharmony_ci    por                     m2, m5                      ; 123456789ABCDEFF
1075cabdff1aSopenharmony_ci    psrldq                  m3, m2, 1                   ; 23456789ABCDEFF_
1076cabdff1aSopenharmony_ci    por                     m3, m5                      ; 23456789ABCDEFFF
1077cabdff1aSopenharmony_ci    pshufhw                 m7, m2, q3333               ; xxxxxxxxFFFFFFFF
1078cabdff1aSopenharmony_ci    pshufd                  m6, m7, q3333
1079cabdff1aSopenharmony_ci%endif
1080cabdff1aSopenharmony_ci    LOWPASS                  1, 2, 3, 4             ; m1 = f[16..31]
; dst16 = dst + 16*stride, built in two steps of 8*stride.
1081cabdff1aSopenharmony_ci    lea                 dst16q, [dstq  +strideq*8]
1082cabdff1aSopenharmony_ci    mov                   cntd, 8
1083cabdff1aSopenharmony_ci    lea                 dst16q, [dst16q+strideq*8]
1084cabdff1aSopenharmony_ci.loop:
1085cabdff1aSopenharmony_ci    movhlps                 m7, m1                  ; low half of m7 <- high half of m1; high half keeps the padding
1086cabdff1aSopenharmony_ci    mova [dstq  +strideq*0+ 0], m0                  ; row y:    m0 | m1
1087cabdff1aSopenharmony_ci    mova [dstq  +strideq*0+16], m1
1088cabdff1aSopenharmony_ci    movhps [dstq+strideq*8+ 0], m0                  ; row y+8:  hi(m0), lo(m1) | m7
1089cabdff1aSopenharmony_ci    movq [dstq  +strideq*8+ 8], m1
1090cabdff1aSopenharmony_ci    mova [dstq  +strideq*8+16], m7
1091cabdff1aSopenharmony_ci    mova [dst16q+strideq*0+ 0], m1                  ; row y+16: m1 | padding
1092cabdff1aSopenharmony_ci    mova [dst16q+strideq*0+16], m6
1093cabdff1aSopenharmony_ci    mova [dst16q+strideq*8+ 0], m7                  ; row y+24: m7 | padding
1094cabdff1aSopenharmony_ci    mova [dst16q+strideq*8+16], m6
; Advance the 32-byte window m0:m1 by one sample, padding the top with a[31].
1095cabdff1aSopenharmony_ci%if cpuflag(avx)
1096cabdff1aSopenharmony_ci    vpalignr                m0, m1, m0, 1
1097cabdff1aSopenharmony_ci    pshufb                  m1, m5
1098cabdff1aSopenharmony_ci%elif cpuflag(ssse3)
1099cabdff1aSopenharmony_ci    palignr                 m2, m1, m0, 1
1100cabdff1aSopenharmony_ci    pshufb                  m1, m5
1101cabdff1aSopenharmony_ci    mova                    m0, m2
1102cabdff1aSopenharmony_ci%else
1103cabdff1aSopenharmony_ci    mova                    m4, m1
1104cabdff1aSopenharmony_ci    psrldq                  m0, 1
1105cabdff1aSopenharmony_ci    pslldq                  m4, 15                  ; byte 0 of m1 moved to byte 15
1106cabdff1aSopenharmony_ci    psrldq                  m1, 1
1107cabdff1aSopenharmony_ci    por                     m0, m4                  ; carry that byte into the top of m0
1108cabdff1aSopenharmony_ci    por                     m1, m5                  ; refill the top of m1 with a[31]
1109cabdff1aSopenharmony_ci%endif
1110cabdff1aSopenharmony_ci    add                   dstq, strideq
1111cabdff1aSopenharmony_ci    add                 dst16q, strideq
1112cabdff1aSopenharmony_ci    dec                   cntd
1113cabdff1aSopenharmony_ci    jg .loop
1114cabdff1aSopenharmony_ci    RET
1115cabdff1aSopenharmony_ci%endmacro
1116cabdff1aSopenharmony_ci
; Expand DL_XMM_FUNCS (dl_8x8, dl_16x16, dl_32x32) once each for sse2, ssse3
; and avx. The avx expansion additionally takes the cpuflag(avx) branch in
; dl_32x32 (three-operand vpalignr).
1117cabdff1aSopenharmony_ciINIT_XMM sse2
1118cabdff1aSopenharmony_ciDL_XMM_FUNCS
1119cabdff1aSopenharmony_ciINIT_XMM ssse3
1120cabdff1aSopenharmony_ciDL_XMM_FUNCS
1121cabdff1aSopenharmony_ciINIT_XMM avx
1122cabdff1aSopenharmony_ciDL_XMM_FUNCS
1123cabdff1aSopenharmony_ci
1124cabdff1aSopenharmony_ci; dr
1125cabdff1aSopenharmony_ci
; DR_MMX_FUNCS -- emits vp9_ipred_dr_4x4 for the currently selected
; instruction set (8-byte vectors).
;
; vp9_ipred_dr_4x4(dst, stride, l, a): builds the edge sequence
;   s = l[0], l[1], l[2], l[3], a[-1], a[0], a[1], a[2], a[3]
; and filters it: f[i] = LOWPASS(s[i], s[i+1], s[i+2]).
; The bottom row (row 3) is f[0..3] and each row above starts one sample
; later, so row 0 is f[3..6].
1126cabdff1aSopenharmony_ci%macro DR_MMX_FUNCS 0
1127cabdff1aSopenharmony_cicglobal vp9_ipred_dr_4x4, 4, 4, 0, dst, stride, l, a
1128cabdff1aSopenharmony_ci    movd                    m0, [lq]                ; l[0..3]
1129cabdff1aSopenharmony_ci    punpckldq               m0, [aq-1]              ; m0 = l[0..3], a[-1..2]  (s[0..7])
1130cabdff1aSopenharmony_ci    movd                    m1, [aq+3]              ; a[3..6]; only a[3] ends up being used
; l and a are not read again: the third argument register is used as stride3.
1131cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, stride3
1132cabdff1aSopenharmony_ci    lea               stride3q, [strideq*3]
1133cabdff1aSopenharmony_ci    PALIGNR                 m1, m0, 1, m3           ; m1 = s[1..8] (center tap)
1134cabdff1aSopenharmony_ci    psrlq                   m2, m1, 8               ; m2 = s[2..8], 0 (right tap)
1135cabdff1aSopenharmony_ci    LOWPASS                  0, 1, 2, 3
1136cabdff1aSopenharmony_ci
1137cabdff1aSopenharmony_ci    movd      [dstq+stride3q ], m0                  ; row 3 = f[0..3]
1138cabdff1aSopenharmony_ci    psrlq                   m0, 8
1139cabdff1aSopenharmony_ci    movd      [dstq+strideq*2], m0                  ; row 2 = f[1..4]
1140cabdff1aSopenharmony_ci    psrlq                   m0, 8
1141cabdff1aSopenharmony_ci    movd      [dstq+strideq*1], m0                  ; row 1 = f[2..5]
1142cabdff1aSopenharmony_ci    psrlq                   m0, 8
1143cabdff1aSopenharmony_ci    movd      [dstq+strideq*0], m0                  ; row 0 = f[3..6]
1144cabdff1aSopenharmony_ci    RET
1145cabdff1aSopenharmony_ci%endmacro
1146cabdff1aSopenharmony_ci
1147cabdff1aSopenharmony_ciINIT_MMX mmxext
1148cabdff1aSopenharmony_ciDR_MMX_FUNCS
1149cabdff1aSopenharmony_ciINIT_MMX ssse3
1150cabdff1aSopenharmony_ciDR_MMX_FUNCS
1151cabdff1aSopenharmony_ci
1152cabdff1aSopenharmony_ci%macro DR_XMM_FUNCS 0
; "dr" intra predictors for 8x8, 16x16 and 32x32 blocks on XMM registers.
; All three take (dst, stride, l, a): dst/stride describe the output block,
; l and a are the edge pixel arrays. Each routine builds the smoothed edge
; once and then emits rows by shifting that data one byte per row.
; NOTE(review): PALIGNR and LOWPASS are macros defined outside this section.
; The comments assume PALIGNR concatenates its sources and byte-shifts like
; the palignr instruction (last operand being a scratch register), and that
; LOWPASS is the 3-tap (x + 2y + z + 2) >> 2 filter writing its first
; operand -- confirm against their definitions.

; vp9_ipred_dr_8x8: reads l[0..7] and a[-1..10].
1153cabdff1aSopenharmony_cicglobal vp9_ipred_dr_8x8, 4, 4, 4, dst, stride, l, a
1154cabdff1aSopenharmony_ci    movq                    m1, [lq]
1155cabdff1aSopenharmony_ci    movhps                  m1, [aq-1]              ; m1 = l[0..7] | a[-1..6]
1156cabdff1aSopenharmony_ci    movd                    m2, [aq+7]              ; m2 = a[7..10]
1157cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, stride3
1158cabdff1aSopenharmony_ci    lea               stride3q, [strideq*3]
1159cabdff1aSopenharmony_ci    pslldq                  m0, m1, 1               ; edge moved back by 1 byte
1160cabdff1aSopenharmony_ci    PALIGNR                 m2, m1, 1, m3           ; edge advanced by 1 byte
1161cabdff1aSopenharmony_ci    LOWPASS                  0, 1, 2, 3
1162cabdff1aSopenharmony_ci
    ; Row 0 is the high 8 bytes of m0; every following row shifts m0 left
    ; by one byte before storing its high half again.
1163cabdff1aSopenharmony_ci    movhps    [dstq+strideq*0], m0
1164cabdff1aSopenharmony_ci    pslldq                  m0, 1
1165cabdff1aSopenharmony_ci    movhps    [dstq+strideq*1], m0
1166cabdff1aSopenharmony_ci    pslldq                  m0, 1
1167cabdff1aSopenharmony_ci    movhps    [dstq+strideq*2], m0
1168cabdff1aSopenharmony_ci    pslldq                  m0, 1
1169cabdff1aSopenharmony_ci    movhps    [dstq+stride3q ], m0
1170cabdff1aSopenharmony_ci    pslldq                  m0, 1
1171cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*4]    ; continue with rows 4..7
1172cabdff1aSopenharmony_ci    movhps    [dstq+strideq*0], m0
1173cabdff1aSopenharmony_ci    pslldq                  m0, 1
1174cabdff1aSopenharmony_ci    movhps    [dstq+strideq*1], m0
1175cabdff1aSopenharmony_ci    pslldq                  m0, 1
1176cabdff1aSopenharmony_ci    movhps    [dstq+strideq*2], m0
1177cabdff1aSopenharmony_ci    pslldq                  m0, 1
1178cabdff1aSopenharmony_ci    movhps    [dstq+stride3q ], m0
1179cabdff1aSopenharmony_ci    RET
1180cabdff1aSopenharmony_ci
; vp9_ipred_dr_16x16: reads l[0..15] (aligned load) and a[-1..18].
; m3 ends up holding the filtered vector built around a[], m0 the filtered
; vector built from l[]. The loop runs 4 times and writes rows r, r+1, r+8
; and r+9 per pass.
1181cabdff1aSopenharmony_cicglobal vp9_ipred_dr_16x16, 4, 4, 6, dst, stride, l, a
1182cabdff1aSopenharmony_ci    mova                    m1, [lq]
1183cabdff1aSopenharmony_ci    movu                    m2, [aq-1]
1184cabdff1aSopenharmony_ci    movd                    m4, [aq+15]
1185cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, stride9, cnt
1186cabdff1aSopenharmony_ci    lea               stride9q, [strideq *3]
1187cabdff1aSopenharmony_ci    mov                   cntd, 4
1188cabdff1aSopenharmony_ci    lea               stride9q, [stride9q*3]        ; stride9 = 9 * stride
1189cabdff1aSopenharmony_ci    PALIGNR                 m4, m2, 1, m5
1190cabdff1aSopenharmony_ci    PALIGNR                 m3, m2, m1, 15, m5
1191cabdff1aSopenharmony_ci    LOWPASS                  3,  2, 4, 5
1192cabdff1aSopenharmony_ci    pslldq                  m0, m1, 1
1193cabdff1aSopenharmony_ci    PALIGNR                 m2, m1, 1, m4
1194cabdff1aSopenharmony_ci    LOWPASS                  0,  1, 2, 4
1195cabdff1aSopenharmony_ci
1196cabdff1aSopenharmony_ci.loop:
    ; Row r is m3. Row r+8 is the high half of m0 followed by the low half
    ; of m3. Afterwards the top byte of m0 is shifted into m3 and m0 moves
    ; up by one byte, ready for the next row.
1197cabdff1aSopenharmony_ci    mova    [dstq+strideq*0  ], m3
1198cabdff1aSopenharmony_ci    movhps  [dstq+strideq*8+0], m0
1199cabdff1aSopenharmony_ci    movq    [dstq+strideq*8+8], m3
1200cabdff1aSopenharmony_ci    PALIGNR                 m3, m0, 15, m1
1201cabdff1aSopenharmony_ci    pslldq                  m0, 1
1202cabdff1aSopenharmony_ci    mova    [dstq+strideq*1  ], m3
1203cabdff1aSopenharmony_ci    movhps  [dstq+stride9q +0], m0
1204cabdff1aSopenharmony_ci    movq    [dstq+stride9q +8], m3
1205cabdff1aSopenharmony_ci    PALIGNR                 m3, m0, 15, m1
1206cabdff1aSopenharmony_ci    pslldq                  m0, 1
1207cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*2]
1208cabdff1aSopenharmony_ci    dec                   cntd
1209cabdff1aSopenharmony_ci    jg .loop
1210cabdff1aSopenharmony_ci    RET
1211cabdff1aSopenharmony_ci
; vp9_ipred_dr_32x32: reads l[0..31] (aligned loads) and a[-1..34].
; The setup filters four consecutive 16-byte pieces of the combined edge
; into m2..m5. The loop runs 16 times; each pass writes row r and row r+16
; and then shifts the whole m2..m5 chain by one byte.
1212cabdff1aSopenharmony_cicglobal vp9_ipred_dr_32x32, 4, 4, 8, dst, stride, l, a
1213cabdff1aSopenharmony_ci    mova                    m1, [lq]
1214cabdff1aSopenharmony_ci    mova                    m2, [lq+16]
1215cabdff1aSopenharmony_ci    movu                    m3, [aq-1]
1216cabdff1aSopenharmony_ci    movu                    m4, [aq+15]
1217cabdff1aSopenharmony_ci    movd                    m5, [aq+31]
1218cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, stride8, cnt
1219cabdff1aSopenharmony_ci    lea               stride8q, [strideq*8]
    ; Each group below builds the two neighbour vectors of one 16-byte
    ; piece and filters it; the unfiltered piece is then free to be reused.
1220cabdff1aSopenharmony_ci    PALIGNR                 m5, m4, 1, m7
1221cabdff1aSopenharmony_ci    PALIGNR                 m6, m4, m3, 15, m7
1222cabdff1aSopenharmony_ci    LOWPASS                  5,  4,  6,  7
1223cabdff1aSopenharmony_ci    PALIGNR                 m4, m3, 1, m7
1224cabdff1aSopenharmony_ci    PALIGNR                 m6, m3, m2, 15, m7
1225cabdff1aSopenharmony_ci    LOWPASS                  4,  3,  6,  7
1226cabdff1aSopenharmony_ci    PALIGNR                 m3, m2, 1, m7
1227cabdff1aSopenharmony_ci    PALIGNR                 m6, m2, m1, 15, m7
1228cabdff1aSopenharmony_ci    LOWPASS                  3,  2,  6,  7
1229cabdff1aSopenharmony_ci    PALIGNR                 m2, m1, 1, m6
1230cabdff1aSopenharmony_ci    pslldq                  m0, m1, 1
1231cabdff1aSopenharmony_ci    LOWPASS                  2,  1,  0,  6
1232cabdff1aSopenharmony_ci    mov                   cntd, 16
1233cabdff1aSopenharmony_ci
1234cabdff1aSopenharmony_ci    ; out=m2/m3/m4/m5
1235cabdff1aSopenharmony_ci.loop:
1236cabdff1aSopenharmony_ci    mova  [dstq+stride8q*0+ 0], m4                  ; row r,    columns  0..15
1237cabdff1aSopenharmony_ci    mova  [dstq+stride8q*0+16], m5                  ; row r,    columns 16..31
1238cabdff1aSopenharmony_ci    mova  [dstq+stride8q*2+ 0], m3                  ; row r+16, columns  0..15
1239cabdff1aSopenharmony_ci    mova  [dstq+stride8q*2+16], m4                  ; row r+16, columns 16..31
1240cabdff1aSopenharmony_ci    PALIGNR                 m5, m4, 15, m6
1241cabdff1aSopenharmony_ci    PALIGNR                 m4, m3, 15, m6
1242cabdff1aSopenharmony_ci    PALIGNR                 m3, m2, 15, m6
1243cabdff1aSopenharmony_ci    pslldq                  m2, 1
1244cabdff1aSopenharmony_ci    add                   dstq, strideq
1245cabdff1aSopenharmony_ci    dec                   cntd
1246cabdff1aSopenharmony_ci    jg .loop
1247cabdff1aSopenharmony_ci    RET
1248cabdff1aSopenharmony_ci%endmacro
1249cabdff1aSopenharmony_ci
; Expand the three routines for sse2, ssse3 and avx. The macro body has no
; cpuflag() test of its own, so any per-ISA difference presumably comes
; from the PALIGNR/LOWPASS macros it invokes.
1250cabdff1aSopenharmony_ciINIT_XMM sse2
1251cabdff1aSopenharmony_ciDR_XMM_FUNCS
1252cabdff1aSopenharmony_ciINIT_XMM ssse3
1253cabdff1aSopenharmony_ciDR_XMM_FUNCS
1254cabdff1aSopenharmony_ciINIT_XMM avx
1255cabdff1aSopenharmony_ciDR_XMM_FUNCS
1256cabdff1aSopenharmony_ci
1257cabdff1aSopenharmony_ci; vl
1258cabdff1aSopenharmony_ci
1259cabdff1aSopenharmony_ciINIT_MMX mmxext
; vp9_ipred_vl_4x4(dst, stride, l, a): 4x4 "vl" intra predictor on MMX
; registers. Only the a[] edge is read (a[0..7]); l is unused here.
; Even rows come from the pairwise average of neighbouring a[] bytes, odd
; rows from the LOWPASS result; rows 2 and 3 are rows 0 and 1 advanced by
; one byte.
; NOTE(review): LOWPASS is defined outside this section; it is presumably
; the 3-tap (x + 2y + z + 2) >> 2 filter writing its first operand.
1260cabdff1aSopenharmony_cicglobal vp9_ipred_vl_4x4, 4, 4, 0, dst, stride, l, a
1261cabdff1aSopenharmony_ci    movq                    m0, [aq]                ; m0 = a[0..7]
1262cabdff1aSopenharmony_ci    psrlq                   m1, m0, 8               ; m1 = a[1..7], 0
1263cabdff1aSopenharmony_ci    psrlq                   m2, m1, 8               ; m2 = a[2..7], 0, 0
1264cabdff1aSopenharmony_ci    LOWPASS                  2,  1, 0, 3
1265cabdff1aSopenharmony_ci    pavgb                   m1, m0                  ; m1[i] = avg(a[i], a[i+1])
1266cabdff1aSopenharmony_ci    movd      [dstq+strideq*0], m1
1267cabdff1aSopenharmony_ci    movd      [dstq+strideq*1], m2
1268cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*2]
1269cabdff1aSopenharmony_ci    psrlq                   m1, 8
1270cabdff1aSopenharmony_ci    psrlq                   m2, 8
1271cabdff1aSopenharmony_ci    movd      [dstq+strideq*0], m1
1272cabdff1aSopenharmony_ci    movd      [dstq+strideq*1], m2
1273cabdff1aSopenharmony_ci    RET
1274cabdff1aSopenharmony_ci
1275cabdff1aSopenharmony_ci%macro VL_XMM_FUNCS 0
; "vl" intra predictors for 8x8, 16x16 and 32x32 blocks on XMM registers.
; All three take (dst, stride, l, a) but read only the a[] edge.
; In every size, even rows are the pairwise average of the edge with itself
; advanced by one byte, odd rows are the LOWPASS result, and each pair of
; rows is the previous pair advanced by one byte with the last edge byte
; repeated at the end.
; NOTE(review): PALIGNR and LOWPASS are macros defined outside this section,
; and the pb_* shuffle tables are not visible here either. The comments
; assume PALIGNR behaves like the palignr instruction and LOWPASS is the
; 3-tap (x + 2y + z + 2) >> 2 filter writing its first operand -- confirm.

; vp9_ipred_vl_8x8: reads a[0..7].
1276cabdff1aSopenharmony_cicglobal vp9_ipred_vl_8x8, 4, 4, 4, dst, stride, l, a
1277cabdff1aSopenharmony_ci    movq                    m0, [aq]
    ; Extend the 8 edge bytes to 16 by repeating a[7]. The non-SSSE3 path
    ; does this with unpack + shufps; the SSSE3 shuffle table is presumably
    ; equivalent, as its name suggests.
1278cabdff1aSopenharmony_ci%if cpuflag(ssse3)
1279cabdff1aSopenharmony_ci    pshufb                  m0, [pb_0to6_9x7]
1280cabdff1aSopenharmony_ci%else
1281cabdff1aSopenharmony_ci    punpcklbw               m1, m0, m0
1282cabdff1aSopenharmony_ci    punpckhwd               m1, m1
1283cabdff1aSopenharmony_ci    shufps                  m0, m1, q3310
1284cabdff1aSopenharmony_ci%endif
1285cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, stride3
1286cabdff1aSopenharmony_ci    lea               stride3q, [strideq*3]
1287cabdff1aSopenharmony_ci    psrldq                  m1, m0, 1               ; edge advanced by 1 byte
1288cabdff1aSopenharmony_ci    psrldq                  m2, m0, 2               ; edge advanced by 2 bytes
1289cabdff1aSopenharmony_ci    LOWPASS                  2,  1,  0,  3
1290cabdff1aSopenharmony_ci    pavgb                   m1, m0
1291cabdff1aSopenharmony_ci
    ; m1 feeds the even rows, m2 the odd rows; both advance one byte after
    ; every pair of rows.
1292cabdff1aSopenharmony_ci    movq      [dstq+strideq*0], m1
1293cabdff1aSopenharmony_ci    movq      [dstq+strideq*1], m2
1294cabdff1aSopenharmony_ci    psrldq                  m1, 1
1295cabdff1aSopenharmony_ci    psrldq                  m2, 1
1296cabdff1aSopenharmony_ci    movq      [dstq+strideq*2], m1
1297cabdff1aSopenharmony_ci    movq      [dstq+stride3q ], m2
1298cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*4]
1299cabdff1aSopenharmony_ci    psrldq                  m1, 1
1300cabdff1aSopenharmony_ci    psrldq                  m2, 1
1301cabdff1aSopenharmony_ci    movq      [dstq+strideq*0], m1
1302cabdff1aSopenharmony_ci    movq      [dstq+strideq*1], m2
1303cabdff1aSopenharmony_ci    psrldq                  m1, 1
1304cabdff1aSopenharmony_ci    psrldq                  m2, 1
1305cabdff1aSopenharmony_ci    movq      [dstq+strideq*2], m1
1306cabdff1aSopenharmony_ci    movq      [dstq+stride3q ], m2
1307cabdff1aSopenharmony_ci    RET
1308cabdff1aSopenharmony_ci
; vp9_ipred_vl_16x16: reads a[0..15] (aligned load). The loop runs 4 times
; and writes 4 rows per pass. m4 is the shuffle table on SSSE3 and the
; isolated last edge byte otherwise; either way it is what re-inserts the
; last byte after each one-byte advance.
1309cabdff1aSopenharmony_cicglobal vp9_ipred_vl_16x16, 4, 4, 5, dst, stride, l, a
1310cabdff1aSopenharmony_ci    mova                    m0, [aq]
1311cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, stride3, cnt
1312cabdff1aSopenharmony_ci    lea               stride3q, [strideq*3]
1313cabdff1aSopenharmony_ci%if cpuflag(ssse3)
1314cabdff1aSopenharmony_ci    mova                    m4, [pb_1toE_2xF]
1315cabdff1aSopenharmony_ci    pshufb                  m1, m0, m4
1316cabdff1aSopenharmony_ci    pshufb                  m2, m1, m4
1317cabdff1aSopenharmony_ci%else
1318cabdff1aSopenharmony_ci    pand                    m4, m0, [pb_15x0_1xm1]  ; _______________F
1319cabdff1aSopenharmony_ci    psrldq                  m1, m0, 1               ; 123456789ABCDEF_
1320cabdff1aSopenharmony_ci    por                     m1, m4                  ; 123456789ABCDEFF
1321cabdff1aSopenharmony_ci    psrldq                  m2, m1, 1               ; 23456789ABCDEFF_
1322cabdff1aSopenharmony_ci    por                     m2, m4                  ; 23456789ABCDEFFF
1323cabdff1aSopenharmony_ci%endif
1324cabdff1aSopenharmony_ci    LOWPASS                  2,  1,  0, 3
1325cabdff1aSopenharmony_ci    pavgb                   m1, m0
1326cabdff1aSopenharmony_ci    mov                   cntd, 4
1327cabdff1aSopenharmony_ci.loop:
1328cabdff1aSopenharmony_ci    mova      [dstq+strideq*0], m1
1329cabdff1aSopenharmony_ci    mova      [dstq+strideq*1], m2
    ; advance both row sources by one byte, keeping the last byte in place
1330cabdff1aSopenharmony_ci%if cpuflag(ssse3)
1331cabdff1aSopenharmony_ci    pshufb                  m1, m4
1332cabdff1aSopenharmony_ci    pshufb                  m2, m4
1333cabdff1aSopenharmony_ci%else
1334cabdff1aSopenharmony_ci    psrldq                  m1, 1
1335cabdff1aSopenharmony_ci    psrldq                  m2, 1
1336cabdff1aSopenharmony_ci    por                     m1, m4
1337cabdff1aSopenharmony_ci    por                     m2, m4
1338cabdff1aSopenharmony_ci%endif
1339cabdff1aSopenharmony_ci    mova      [dstq+strideq*2], m1
1340cabdff1aSopenharmony_ci    mova      [dstq+stride3q ], m2
1341cabdff1aSopenharmony_ci%if cpuflag(ssse3)
1342cabdff1aSopenharmony_ci    pshufb                  m1, m4
1343cabdff1aSopenharmony_ci    pshufb                  m2, m4
1344cabdff1aSopenharmony_ci%else
1345cabdff1aSopenharmony_ci    psrldq                  m1, 1
1346cabdff1aSopenharmony_ci    psrldq                  m2, 1
1347cabdff1aSopenharmony_ci    por                     m1, m4
1348cabdff1aSopenharmony_ci    por                     m2, m4
1349cabdff1aSopenharmony_ci%endif
1350cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*4]
1351cabdff1aSopenharmony_ci    dec                   cntd
1352cabdff1aSopenharmony_ci    jg .loop
1353cabdff1aSopenharmony_ci    RET
1354cabdff1aSopenharmony_ci
; vp9_ipred_vl_32x32: reads a[0..31] (two aligned loads).
; Register roles once the loop starts:
;   m2 / m0 - averaged edge, columns 0..15 / 16..31 (even rows)
;   m3 / m1 - filtered edge, columns 0..15 / 16..31 (odd rows)
;   m4      - shuffle table (SSSE3) or isolated last byte (otherwise)
;   m5      - fill value stored into the last 8 columns of the lower half
; dst16 points 16 rows below dst. The loop runs 8 times with 2 rows per
; pass, and each pass also writes the matching rows in the lower half.
1355cabdff1aSopenharmony_cicglobal vp9_ipred_vl_32x32, 4, 4, 7, dst, stride, l, a
1356cabdff1aSopenharmony_ci    mova                    m0, [aq]
1357cabdff1aSopenharmony_ci    mova                    m5, [aq+16]
1358cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, dst16, cnt
1359cabdff1aSopenharmony_ci    PALIGNR                 m2, m5, m0, 1, m4
1360cabdff1aSopenharmony_ci    PALIGNR                 m3, m5, m0, 2, m4
1361cabdff1aSopenharmony_ci    lea                 dst16q, [dstq  +strideq*8]
1362cabdff1aSopenharmony_ci    LOWPASS                  3,  2,  0, 6
1363cabdff1aSopenharmony_ci    pavgb                   m2, m0
1364cabdff1aSopenharmony_ci%if cpuflag(ssse3)
1365cabdff1aSopenharmony_ci    mova                    m4, [pb_1toE_2xF]
1366cabdff1aSopenharmony_ci    pshufb                  m0, m5, m4
1367cabdff1aSopenharmony_ci    pshufb                  m1, m0, m4
1368cabdff1aSopenharmony_ci%else
1369cabdff1aSopenharmony_ci    pand                    m4, m5, [pb_15x0_1xm1]  ; _______________F
1370cabdff1aSopenharmony_ci    psrldq                  m0, m5, 1               ; 123456789ABCDEF_
1371cabdff1aSopenharmony_ci    por                     m0, m4                  ; 123456789ABCDEFF
1372cabdff1aSopenharmony_ci    psrldq                  m1, m0, 1               ; 23456789ABCDEFF_
1373cabdff1aSopenharmony_ci    por                     m1, m4                  ; 23456789ABCDEFFF
1374cabdff1aSopenharmony_ci%endif
1375cabdff1aSopenharmony_ci    lea                 dst16q, [dst16q+strideq*8]  ; dst16 = dst + 16 * stride
1376cabdff1aSopenharmony_ci    LOWPASS                  1,  0,  5, 6
1377cabdff1aSopenharmony_ci    pavgb                   m0, m5
    ; Turn m5 into the fill value. The non-SSSE3 path spreads the isolated
    ; last byte held in m4 across the register; the SSSE3 shuffle table is
    ; presumably equivalent.
1378cabdff1aSopenharmony_ci%if cpuflag(ssse3)
1379cabdff1aSopenharmony_ci    pshufb                  m5, [pb_15]
1380cabdff1aSopenharmony_ci%else
1381cabdff1aSopenharmony_ci    punpckhbw               m5, m4, m4
1382cabdff1aSopenharmony_ci    pshufhw                 m5, m5, q3333
1383cabdff1aSopenharmony_ci    punpckhqdq              m5, m5
1384cabdff1aSopenharmony_ci%endif
1385cabdff1aSopenharmony_ci    mov                   cntd, 8
1386cabdff1aSopenharmony_ci
1387cabdff1aSopenharmony_ci.loop:
    ; Local helper: row offset suffix, left-half register, right-half
    ; register. Stores one 32-byte row at dst, then the row 16 lines lower
    ; at dst16 as the same data moved 8 columns left and padded with m5,
    ; and finally advances the register pair by one byte.
1388cabdff1aSopenharmony_ci%macro %%write 3
1389cabdff1aSopenharmony_ci    mova    [dstq+stride%1+ 0], %2
1390cabdff1aSopenharmony_ci    mova    [dstq+stride%1+16], %3
1391cabdff1aSopenharmony_ci    movhps  [dst16q+stride%1 ], %2
1392cabdff1aSopenharmony_ci    movu  [dst16q+stride%1+ 8], %3
1393cabdff1aSopenharmony_ci    movq  [dst16q+stride%1+24], m5
1394cabdff1aSopenharmony_ci%if cpuflag(avx)
1395cabdff1aSopenharmony_ci    palignr                 %2, %3, %2, 1
1396cabdff1aSopenharmony_ci    pshufb                  %3, m4
1397cabdff1aSopenharmony_ci%elif cpuflag(ssse3)
1398cabdff1aSopenharmony_ci    palignr                 m6, %3, %2, 1
1399cabdff1aSopenharmony_ci    pshufb                  %3, m4
1400cabdff1aSopenharmony_ci    mova                    %2, m6
1401cabdff1aSopenharmony_ci%else
1402cabdff1aSopenharmony_ci    pslldq                  m6, %3, 15
1403cabdff1aSopenharmony_ci    psrldq                  %3, 1
1404cabdff1aSopenharmony_ci    psrldq                  %2, 1
1405cabdff1aSopenharmony_ci    por                     %3, m4
1406cabdff1aSopenharmony_ci    por                     %2, m6
1407cabdff1aSopenharmony_ci%endif
1408cabdff1aSopenharmony_ci%endmacro
1409cabdff1aSopenharmony_ci
1410cabdff1aSopenharmony_ci    %%write                q*0, m2, m0
1411cabdff1aSopenharmony_ci    %%write                q*1, m3, m1
1412cabdff1aSopenharmony_ci    lea                   dstq, [dstq  +strideq*2]
1413cabdff1aSopenharmony_ci    lea                 dst16q, [dst16q+strideq*2]
1414cabdff1aSopenharmony_ci    dec                   cntd
1415cabdff1aSopenharmony_ci    jg .loop
1416cabdff1aSopenharmony_ci    RET
1417cabdff1aSopenharmony_ci%endmacro
1418cabdff1aSopenharmony_ci
; Expand the three routines for sse2, ssse3 and avx; the cpuflag() tests in
; the macro body select the matching code path for each expansion.
1419cabdff1aSopenharmony_ciINIT_XMM sse2
1420cabdff1aSopenharmony_ciVL_XMM_FUNCS
1421cabdff1aSopenharmony_ciINIT_XMM ssse3
1422cabdff1aSopenharmony_ciVL_XMM_FUNCS
1423cabdff1aSopenharmony_ciINIT_XMM avx
1424cabdff1aSopenharmony_ciVL_XMM_FUNCS
1425cabdff1aSopenharmony_ci
1426cabdff1aSopenharmony_ci; vr
1427cabdff1aSopenharmony_ci
1428cabdff1aSopenharmony_ci%macro VR_MMX_FUNCS 0
; vp9_ipred_vr_4x4(dst, stride, l, a): 4x4 "vr" intra predictor on MMX
; registers. Reads a[-1..6] and l[0..3].
; Row 0 is the pairwise average of the a[] edge, row 1 the LOWPASS result;
; rows 2 and 3 are rows 0 and 1 moved right by one pixel with a filtered
; left-edge byte inserted in column 0 (see the layout diagram below).
; NOTE(review): PALIGNR and LOWPASS are macros defined outside this section,
; as are the pb_* shuffle tables. The comments assume PALIGNR behaves like
; the palignr instruction and LOWPASS is the 3-tap (x + 2y + z + 2) >> 2
; filter writing its first operand -- confirm.
1429cabdff1aSopenharmony_cicglobal vp9_ipred_vr_4x4, 4, 4, 0, dst, stride, l, a
1430cabdff1aSopenharmony_ci    movq                    m1, [aq-1]
    ; Only the upper dword of m2 (the l[] bytes) is defined after this
    ; unpack. The PALIGNR by 5 below keeps just bytes 5..7 of m2, so the
    ; stale lower dword never reaches the output.
1431cabdff1aSopenharmony_ci    punpckldq               m2, [lq]
1432cabdff1aSopenharmony_ci    movd                    m0, [aq]
1433cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, stride3
1434cabdff1aSopenharmony_ci    lea               stride3q, [strideq*3]
1435cabdff1aSopenharmony_ci    pavgb                   m0, m1                  ; low 4 bytes: avg(a[i], a[i-1])
1436cabdff1aSopenharmony_ci    PALIGNR                 m1, m2, 5, m3
1437cabdff1aSopenharmony_ci    psrlq                   m2, m1, 8
1438cabdff1aSopenharmony_ci    psllq                   m3, m1, 8
1439cabdff1aSopenharmony_ci    LOWPASS                  2,  1, 3, 4
1440cabdff1aSopenharmony_ci
1441cabdff1aSopenharmony_ci    ; ABCD <- for the following predictor:
1442cabdff1aSopenharmony_ci    ; EFGH
1443cabdff1aSopenharmony_ci    ; IABC  | m0 contains ABCDxxxx
1444cabdff1aSopenharmony_ci    ; JEFG  | m2 contains xJIEFGHx
1445cabdff1aSopenharmony_ci
1446cabdff1aSopenharmony_ci%if cpuflag(ssse3)
    ; SSSE3: merge both vectors and pick the bytes of rows 2 and 3 with
    ; table-driven shuffles.
1447cabdff1aSopenharmony_ci    punpckldq               m0, m2
1448cabdff1aSopenharmony_ci    pshufb                  m2, [pb_13456_3xm1]
1449cabdff1aSopenharmony_ci    movd      [dstq+strideq*0], m0
1450cabdff1aSopenharmony_ci    pshufb                  m0, [pb_6012_4xm1]
1451cabdff1aSopenharmony_ci    movd      [dstq+stride3q ], m2
1452cabdff1aSopenharmony_ci    psrlq                   m2, 8
1453cabdff1aSopenharmony_ci    movd      [dstq+strideq*2], m0
1454cabdff1aSopenharmony_ci    movd      [dstq+strideq*1], m2
1455cabdff1aSopenharmony_ci%else
    ; Fallback: m1 carries the left-edge bytes (I, then J) in its top byte
    ; so that a PALIGNR by 7 inserts them in front of rows 0 and 1.
1456cabdff1aSopenharmony_ci    psllq                   m1, m2, 40
1457cabdff1aSopenharmony_ci    psrlq                   m2, 24
1458cabdff1aSopenharmony_ci    movd      [dstq+strideq*0], m0
1459cabdff1aSopenharmony_ci    movd      [dstq+strideq*1], m2
1460cabdff1aSopenharmony_ci    PALIGNR                 m0, m1, 7, m3
1461cabdff1aSopenharmony_ci    psllq                   m1, 8
1462cabdff1aSopenharmony_ci    PALIGNR                 m2, m1, 7, m3
1463cabdff1aSopenharmony_ci    movd      [dstq+strideq*2], m0
1464cabdff1aSopenharmony_ci    movd      [dstq+stride3q ], m2
1465cabdff1aSopenharmony_ci%endif
1466cabdff1aSopenharmony_ci    RET
1467cabdff1aSopenharmony_ci%endmacro
1468cabdff1aSopenharmony_ci
; Expand the 4x4 routine for mmxext and ssse3; the cpuflag(ssse3) test in
; the macro body selects the store sequence for each expansion.
1469cabdff1aSopenharmony_ciINIT_MMX mmxext
1470cabdff1aSopenharmony_ciVR_MMX_FUNCS
1471cabdff1aSopenharmony_ciINIT_MMX ssse3
1472cabdff1aSopenharmony_ciVR_MMX_FUNCS
1473cabdff1aSopenharmony_ci
1474cabdff1aSopenharmony_ci%macro VR_XMM_FUNCS 1 ; n_xmm_regs for 16x16
; "vr" intra predictors for 8x8, 16x16 and 32x32 blocks on XMM registers.
; All three take (dst, stride, l, a) and read both edge arrays.
; The single macro parameter is the XMM register count handed to cglobal
; for the 16x16 routine. The expansions below pass 7 for sse2 and 6 for
; ssse3/avx; m6 is named there only as the PALIGNR scratch operand.
; NOTE(review): PALIGNR, LOWPASS and SBUTTERFLY are macros defined outside
; this section, as are the pb_*/pw_* constants. The comments assume PALIGNR
; behaves like the palignr instruction (last operand being a scratch
; register) and LOWPASS is the 3-tap (x + 2y + z + 2) >> 2 filter writing
; its first operand -- confirm against their definitions.

; vp9_ipred_vr_8x8: reads a[-1..14] and l[0..7].
1475cabdff1aSopenharmony_cicglobal vp9_ipred_vr_8x8, 4, 4, 5, dst, stride, l, a
1476cabdff1aSopenharmony_ci    movu                    m1, [aq-1]
1477cabdff1aSopenharmony_ci    movhps                  m2, [lq]                ; only the high half of m2 is defined
1478cabdff1aSopenharmony_ci    movq                    m0, [aq]
1479cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, stride3
1480cabdff1aSopenharmony_ci    lea               stride3q, [strideq*3]
1481cabdff1aSopenharmony_ci    pavgb                   m0, m1                  ; low 8 bytes: avg(a[i], a[i-1])
1482cabdff1aSopenharmony_ci    PALIGNR                 m1, m2, 9, m3
1483cabdff1aSopenharmony_ci    pslldq                  m2, m1, 1
1484cabdff1aSopenharmony_ci    pslldq                  m3, m1, 2
1485cabdff1aSopenharmony_ci    LOWPASS                  1,  2, 3, 4
1486cabdff1aSopenharmony_ci
1487cabdff1aSopenharmony_ci    ; ABCDEFGH <- for the following predictor:
1488cabdff1aSopenharmony_ci    ; IJKLMNOP
1489cabdff1aSopenharmony_ci    ; QABCDEFG  | m0 contains ABCDEFGHxxxxxxxx
1490cabdff1aSopenharmony_ci    ; RIJKLMNO  | m1 contains xxVUTSRQIJKLMNOP
1491cabdff1aSopenharmony_ci    ; SQABCDEF
1492cabdff1aSopenharmony_ci    ; TRIJKLMN
1493cabdff1aSopenharmony_ci    ; USQABCDE
1494cabdff1aSopenharmony_ci    ; VTRIJKLM
1495cabdff1aSopenharmony_ci
1496cabdff1aSopenharmony_ci%if cpuflag(ssse3)
1497cabdff1aSopenharmony_ci    punpcklqdq              m0, m1 ; ABCDEFGHxxVUTSRQ
1498cabdff1aSopenharmony_ci%endif
1499cabdff1aSopenharmony_ci    movq      [dstq+strideq*0], m0
1500cabdff1aSopenharmony_ci    movhps    [dstq+strideq*1], m1
    ; Rearrange so that the high half of m0/m1 is row 2/3 and each further
    ; one-byte left shift yields the next even/odd row.
1501cabdff1aSopenharmony_ci%if cpuflag(ssse3)
1502cabdff1aSopenharmony_ci    pshufb                  m0, [pb_6xm1_BDF_0to6]  ; xxxxxxUSQABCDEFG
1503cabdff1aSopenharmony_ci    pshufb                  m1, [pb_6xm1_246_8toE]  ; xxxxxxVTRIJKLMNO
1504cabdff1aSopenharmony_ci%else
1505cabdff1aSopenharmony_ci    psrlw                   m2, m1, 8               ; x_U_S_Q_xxxxxxxx
1506cabdff1aSopenharmony_ci    pand                    m3, m1, [pw_255]        ; x_V_T_R_xxxxxxxx
1507cabdff1aSopenharmony_ci    packuswb                m3, m2                  ; xVTRxxxxxUSQxxxx
1508cabdff1aSopenharmony_ci    pslldq                  m3, 4                   ; xxxxxVTRxxxxxUSQ
1509cabdff1aSopenharmony_ci    PALIGNR                 m0, m3, 7, m4           ; xxxxxxUSQABCDEFG
1510cabdff1aSopenharmony_ci    psrldq                  m1, 8
1511cabdff1aSopenharmony_ci    pslldq                  m3, 8
1512cabdff1aSopenharmony_ci    PALIGNR                 m1, m3, 7, m4           ; xxxxxxVTRIJKLMNO
1513cabdff1aSopenharmony_ci%endif
1514cabdff1aSopenharmony_ci    movhps    [dstq+strideq*2], m0
1515cabdff1aSopenharmony_ci    movhps    [dstq+stride3q ], m1
1516cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*4]
1517cabdff1aSopenharmony_ci    pslldq                  m0, 1
1518cabdff1aSopenharmony_ci    pslldq                  m1, 1
1519cabdff1aSopenharmony_ci    movhps    [dstq+strideq*0], m0
1520cabdff1aSopenharmony_ci    movhps    [dstq+strideq*1], m1
1521cabdff1aSopenharmony_ci    pslldq                  m0, 1
1522cabdff1aSopenharmony_ci    pslldq                  m1, 1
1523cabdff1aSopenharmony_ci    movhps    [dstq+strideq*2], m0
1524cabdff1aSopenharmony_ci    movhps    [dstq+stride3q ], m1
1525cabdff1aSopenharmony_ci    RET
1526cabdff1aSopenharmony_ci
; vp9_ipred_vr_16x16: reads a[-1..15] and l[0..15] (l and a aligned).
; Before the loop: m0 = averaged top edge (even rows), m3 = filtered top
; edge (odd rows), m1 = filtered left edge split into even-indexed bytes
; (low half) and odd-indexed bytes (high half). The loop runs 4 times with
; 4 rows per pass.
1527cabdff1aSopenharmony_cicglobal vp9_ipred_vr_16x16, 4, 4, %1, dst, stride, l, a
1528cabdff1aSopenharmony_ci    mova                    m0, [aq]
1529cabdff1aSopenharmony_ci    movu                    m1, [aq-1]
1530cabdff1aSopenharmony_ci    mova                    m2, [lq]
1531cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, stride3, cnt
1532cabdff1aSopenharmony_ci    lea               stride3q, [strideq*3]
1533cabdff1aSopenharmony_ci    PALIGNR                 m3, m1, m2, 15, m6
1534cabdff1aSopenharmony_ci    LOWPASS                  3,  1,  0,  4
1535cabdff1aSopenharmony_ci    pavgb                   m0, m1
1536cabdff1aSopenharmony_ci    PALIGNR                 m1, m2,  1, m6
1537cabdff1aSopenharmony_ci    pslldq                  m4, m2,  1
1538cabdff1aSopenharmony_ci    LOWPASS                  1,  2,  4,  5
    ; De-interleave the filtered left edge. The non-SSSE3 path packs the
    ; low byte of every word into the low half and the high byte into the
    ; high half; the SSSE3 table is presumably equivalent.
1539cabdff1aSopenharmony_ci%if cpuflag(ssse3)
1540cabdff1aSopenharmony_ci    pshufb                  m1, [pb_02468ACE_13579BDF]
1541cabdff1aSopenharmony_ci%else
1542cabdff1aSopenharmony_ci    psrlw                   m5, m1, 8
1543cabdff1aSopenharmony_ci    pand                    m1, [pw_255]
1544cabdff1aSopenharmony_ci    packuswb                m1, m5
1545cabdff1aSopenharmony_ci%endif
1546cabdff1aSopenharmony_ci    mov                   cntd, 4
1547cabdff1aSopenharmony_ci
1548cabdff1aSopenharmony_ci.loop:
    ; m2 receives the low half of m1 in its high half, so the PALIGNRs
    ; below can pull left-edge bytes from the top of m1 and m2 alike.
1549cabdff1aSopenharmony_ci    movlhps                 m2, m1
1550cabdff1aSopenharmony_ci    mova      [dstq+strideq*0], m0
1551cabdff1aSopenharmony_ci    mova      [dstq+strideq*1], m3
1552cabdff1aSopenharmony_ci    PALIGNR                 m4, m0, m1, 15, m6      ; row +2: one left byte, then row +0
1553cabdff1aSopenharmony_ci    PALIGNR                 m5, m3, m2, 15, m6      ; row +3: one left byte, then row +1
1554cabdff1aSopenharmony_ci    mova      [dstq+strideq*2], m4
1555cabdff1aSopenharmony_ci    mova      [dstq+stride3q ], m5
1556cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*4]
1557cabdff1aSopenharmony_ci    PALIGNR                 m0, m1, 14, m6          ; next pass starts two bytes further
1558cabdff1aSopenharmony_ci    PALIGNR                 m3, m2, 14, m6
1559cabdff1aSopenharmony_ci    pslldq                  m1, 2
1560cabdff1aSopenharmony_ci    dec                   cntd
1561cabdff1aSopenharmony_ci    jg .loop
1562cabdff1aSopenharmony_ci    RET
1563cabdff1aSopenharmony_ci
; vp9_ipred_vr_32x32: reads a[-1..31] and l[0..31].
; Nine XMM registers are requested. While the left edge is being filtered
; m0 is needed as scratch, so its value (the averaged top edge, columns
; 0..15) is parked: on x86-64 via SWAP with m8, elsewhere by storing it to
; the first 16 bytes of dst and reloading it before the loop. That dst row
; is written with its real contents by the first loop pass.
; dst16 points 16 rows below dst; the loop runs 8 times with 2 rows per
; pass in each half.
1564cabdff1aSopenharmony_cicglobal vp9_ipred_vr_32x32, 4, 4, 9, dst, stride, l, a
1565cabdff1aSopenharmony_ci    mova                    m0, [aq]
1566cabdff1aSopenharmony_ci    mova                    m2, [aq+16]
1567cabdff1aSopenharmony_ci    movu                    m1, [aq-1]
1568cabdff1aSopenharmony_ci    PALIGNR                 m3, m2, m0, 15, m6
1569cabdff1aSopenharmony_ci    PALIGNR                 m4, m2, m0, 14, m6
1570cabdff1aSopenharmony_ci    LOWPASS                  4,  3,  2,  5
1571cabdff1aSopenharmony_ci    pavgb                   m3, m2
1572cabdff1aSopenharmony_ci    mova                    m2, [lq+16]
1573cabdff1aSopenharmony_ci    PALIGNR                 m5, m1, m2, 15, m6
1574cabdff1aSopenharmony_ci    LOWPASS                  5,  1,  0,  6
1575cabdff1aSopenharmony_ci    pavgb                   m0, m1
1576cabdff1aSopenharmony_ci    mova                    m6, [lq]
1577cabdff1aSopenharmony_ci%if ARCH_X86_64
1578cabdff1aSopenharmony_ci    SWAP                     0, 8
1579cabdff1aSopenharmony_ci%else
1580cabdff1aSopenharmony_ci    mova                [dstq], m0
1581cabdff1aSopenharmony_ci%endif
    ; Filter both 16-byte halves of the left edge; m0 is scratch here.
1582cabdff1aSopenharmony_ci    PALIGNR                 m1, m2,  1, m0
1583cabdff1aSopenharmony_ci    PALIGNR                 m7, m2, m6, 15, m0
1584cabdff1aSopenharmony_ci    LOWPASS                  1,  2,  7,  0
1585cabdff1aSopenharmony_ci    PALIGNR                 m2, m6,  1, m0
1586cabdff1aSopenharmony_ci    pslldq                  m7, m6,  1
1587cabdff1aSopenharmony_ci    LOWPASS                  2,  6,  7,  0
    ; Split each filtered half into even-indexed and odd-indexed bytes.
1588cabdff1aSopenharmony_ci%if cpuflag(ssse3)
1589cabdff1aSopenharmony_ci    pshufb                  m1, [pb_02468ACE_13579BDF]
1590cabdff1aSopenharmony_ci    pshufb                  m2, [pb_02468ACE_13579BDF]
1591cabdff1aSopenharmony_ci%else
1592cabdff1aSopenharmony_ci    psrlw                   m0, m1, 8
1593cabdff1aSopenharmony_ci    psrlw                   m6, m2, 8
1594cabdff1aSopenharmony_ci    pand                    m1, [pw_255]
1595cabdff1aSopenharmony_ci    pand                    m2, [pw_255]
1596cabdff1aSopenharmony_ci    packuswb                m1, m0
1597cabdff1aSopenharmony_ci    packuswb                m2, m6
1598cabdff1aSopenharmony_ci%endif
1599cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, dst16, cnt
1600cabdff1aSopenharmony_ci    lea                 dst16q, [dstq  +strideq*8]
1601cabdff1aSopenharmony_ci    lea                 dst16q, [dst16q+strideq*8]  ; dst16 = dst + 16 * stride
    ; NOTE(review): SBUTTERFLY presumably regroups the qword halves of m2
    ; and m1 so that one register holds all even-indexed and the other all
    ; odd-indexed left-edge bytes -- confirm against its definition.
1602cabdff1aSopenharmony_ci    SBUTTERFLY             qdq,  2,  1,  6
1603cabdff1aSopenharmony_ci%if ARCH_X86_64
1604cabdff1aSopenharmony_ci    SWAP                     0, 8
1605cabdff1aSopenharmony_ci%else
1606cabdff1aSopenharmony_ci    mova                    m0, [dstq]
1607cabdff1aSopenharmony_ci%endif
1608cabdff1aSopenharmony_ci    mov                   cntd, 8
1609cabdff1aSopenharmony_ci
1610cabdff1aSopenharmony_ci.loop:
1611cabdff1aSopenharmony_ci    ; even lines (0, 2, 4, ...): m1 | m0, m3
1612cabdff1aSopenharmony_ci    ;  odd lines (1, 3, 5, ...): m2 | m5, m4
    ; Local helper: row offset suffix, left-edge register, columns 0..15
    ; register, columns 16..31 register. Stores one 32-byte row at dst,
    ; then the row 16 lines lower at dst16 (high half of the left-edge
    ; register followed by the first 24 bytes of the row), and finally
    ; shifts one byte through the three-register chain.
1613cabdff1aSopenharmony_ci%macro %%write 4
1614cabdff1aSopenharmony_ci    mova    [dstq+stride%1+ 0], %3
1615cabdff1aSopenharmony_ci    mova    [dstq+stride%1+16], %4
1616cabdff1aSopenharmony_ci    movhps  [dst16q+stride%1 ], %2
1617cabdff1aSopenharmony_ci    movu  [dst16q+stride%1+ 8], %3
1618cabdff1aSopenharmony_ci    movq  [dst16q+stride%1+24], %4
1619cabdff1aSopenharmony_ci    PALIGNR                 %4, %3, 15, m6
1620cabdff1aSopenharmony_ci    PALIGNR                 %3, %2, 15, m6
1621cabdff1aSopenharmony_ci    pslldq                  %2,  1
1622cabdff1aSopenharmony_ci%endmacro
1623cabdff1aSopenharmony_ci
1624cabdff1aSopenharmony_ci    %%write                q*0, m1, m0, m3
1625cabdff1aSopenharmony_ci    %%write                q*1, m2, m5, m4
1626cabdff1aSopenharmony_ci    lea                   dstq, [dstq  +strideq*2]
1627cabdff1aSopenharmony_ci    lea                 dst16q, [dst16q+strideq*2]
1628cabdff1aSopenharmony_ci    dec                   cntd
1629cabdff1aSopenharmony_ci    jg .loop
1630cabdff1aSopenharmony_ci    RET
1631cabdff1aSopenharmony_ci%endmacro
1632cabdff1aSopenharmony_ci
; Expand the three routines for sse2, ssse3 and avx. The argument is the
; XMM register count for the 16x16 routine (7 for sse2, 6 otherwise).
1633cabdff1aSopenharmony_ciINIT_XMM sse2
1634cabdff1aSopenharmony_ciVR_XMM_FUNCS 7
1635cabdff1aSopenharmony_ciINIT_XMM ssse3
1636cabdff1aSopenharmony_ciVR_XMM_FUNCS 6
1637cabdff1aSopenharmony_ciINIT_XMM avx
1638cabdff1aSopenharmony_ciVR_XMM_FUNCS 6
1639cabdff1aSopenharmony_ci
1640cabdff1aSopenharmony_ci; hd
1641cabdff1aSopenharmony_ci
1642cabdff1aSopenharmony_ciINIT_MMX mmxext
; vp9_ipred_hd_4x4(dst, stride, l, a): 4x4 "hd" intra predictor on MMX
; registers. Reads l[0..3] and a[-1..2].
; The averaged and the filtered edge are interleaved byte by byte, and the
; rows are stored bottom-up, each one two bytes further along that
; interleaved sequence (see the layout diagram below).
; NOTE(review): LOWPASS is defined outside this section; it is presumably
; the 3-tap (x + 2y + z + 2) >> 2 filter writing its first operand.
1643cabdff1aSopenharmony_cicglobal vp9_ipred_hd_4x4, 4, 4, 0, dst, stride, l, a
1644cabdff1aSopenharmony_ci    movd                    m0, [lq]
1645cabdff1aSopenharmony_ci    punpckldq               m0, [aq-1]              ; m0 = l[0..3] | a[-1..2]
1646cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, stride3
1647cabdff1aSopenharmony_ci    lea               stride3q, [strideq*3]
1648cabdff1aSopenharmony_ci    psrlq                   m1, m0, 8               ; edge advanced by 1 byte
1649cabdff1aSopenharmony_ci    psrlq                   m2, m1, 8               ; edge advanced by 2 bytes
1650cabdff1aSopenharmony_ci    LOWPASS                  2,  1, 0,  3
1651cabdff1aSopenharmony_ci    pavgb                   m1, m0
1652cabdff1aSopenharmony_ci
1653cabdff1aSopenharmony_ci    ; DHIJ <- for the following predictor:
1654cabdff1aSopenharmony_ci    ; CGDH
1655cabdff1aSopenharmony_ci    ; BFCG  | m1 contains ABCDxxxx
1656cabdff1aSopenharmony_ci    ; AEBF  | m2 contains EFGHIJxx
1657cabdff1aSopenharmony_ci
1658cabdff1aSopenharmony_ci    punpcklbw               m1, m2
1659cabdff1aSopenharmony_ci    punpckhdq               m0, m1, m2
1660cabdff1aSopenharmony_ci
1661cabdff1aSopenharmony_ci    ; m1 contains AEBFCGDH
1662cabdff1aSopenharmony_ci    ; m0 contains CGDHIJxx
1663cabdff1aSopenharmony_ci
1664cabdff1aSopenharmony_ci    movd      [dstq+stride3q ], m1
1665cabdff1aSopenharmony_ci    movd      [dstq+strideq*1], m0
1666cabdff1aSopenharmony_ci    psrlq                   m1, 16
1667cabdff1aSopenharmony_ci    psrlq                   m0, 16
1668cabdff1aSopenharmony_ci    movd      [dstq+strideq*2], m1
1669cabdff1aSopenharmony_ci    movd      [dstq+strideq*0], m0
1670cabdff1aSopenharmony_ci    RET
1671cabdff1aSopenharmony_ci
1672cabdff1aSopenharmony_ci%macro HD_XMM_FUNCS 0
; vp9_ipred_hd_8x8(dst, stride, l, a): 8x8 "hd" intra predictor on XMM
; registers. Reads l[0..7] and a[-1..6].
; The averaged and the filtered edge are interleaved byte by byte into m1,
; and the remaining filtered bytes are kept in m2. Rows are stored
; bottom-up, each one two bytes further along that sequence. dst4 points
; 4 rows below dst, so every step writes one row in each half of the block.
; NOTE(review): PALIGNR and LOWPASS are macros defined outside this section.
; The comments assume PALIGNR behaves like the palignr instruction (last
; operand being a scratch register) and LOWPASS is the 3-tap
; (x + 2y + z + 2) >> 2 filter writing its first operand -- confirm.
1673cabdff1aSopenharmony_cicglobal vp9_ipred_hd_8x8, 4, 4, 5, dst, stride, l, a
1674cabdff1aSopenharmony_ci    movq                    m0, [lq]
1675cabdff1aSopenharmony_ci    movhps                  m0, [aq-1]              ; m0 = l[0..7] | a[-1..6]
1676cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, stride3, dst4
1677cabdff1aSopenharmony_ci    lea               stride3q, [strideq*3]
1678cabdff1aSopenharmony_ci    lea                  dst4q, [dstq+strideq*4]
1679cabdff1aSopenharmony_ci    psrldq                  m1, m0, 1               ; edge advanced by 1 byte
1680cabdff1aSopenharmony_ci    psrldq                  m2, m1, 1               ; edge advanced by 2 bytes
1681cabdff1aSopenharmony_ci    LOWPASS                  2,  1,  0,  3
1682cabdff1aSopenharmony_ci    pavgb                   m1, m0
1683cabdff1aSopenharmony_ci
1684cabdff1aSopenharmony_ci    ; HPQRSTUV <- for the following predictor
1685cabdff1aSopenharmony_ci    ; GOHPQRST
1686cabdff1aSopenharmony_ci    ; FNGOHPQR  | m1 contains ABCDEFGHxxxxxxxx
1687cabdff1aSopenharmony_ci    ; EMFNGOHP  | m2 contains IJKLMNOPQRSTUVxx
1688cabdff1aSopenharmony_ci    ; DLEMFNGO
1689cabdff1aSopenharmony_ci    ; CKDLEMFN
1690cabdff1aSopenharmony_ci    ; BJCKDLEM
1691cabdff1aSopenharmony_ci    ; AIBJCKDL
1692cabdff1aSopenharmony_ci
1693cabdff1aSopenharmony_ci    punpcklbw               m1, m2
1694cabdff1aSopenharmony_ci    movhlps                 m2, m2
1695cabdff1aSopenharmony_ci
1696cabdff1aSopenharmony_ci    ; m1 contains AIBJCKDLEMFNGOHP
1697cabdff1aSopenharmony_ci    ; m2 contains QRSTUVxxxxxxxxxx
1698cabdff1aSopenharmony_ci
    ; Each step stores the high half to rows 0..3 and the low half to rows
    ; 4..7, then takes the m2:m1 pair advanced by a further 2 bytes.
1699cabdff1aSopenharmony_ci    movhps   [dstq +stride3q ], m1
1700cabdff1aSopenharmony_ci    movq     [dst4q+stride3q ], m1
1701cabdff1aSopenharmony_ci    PALIGNR                 m3, m2, m1, 2, m4
1702cabdff1aSopenharmony_ci    movhps   [dstq +strideq*2], m3
1703cabdff1aSopenharmony_ci    movq     [dst4q+strideq*2], m3
1704cabdff1aSopenharmony_ci    PALIGNR                 m3, m2, m1, 4, m4
1705cabdff1aSopenharmony_ci    movhps   [dstq +strideq*1], m3
1706cabdff1aSopenharmony_ci    movq     [dst4q+strideq*1], m3
1707cabdff1aSopenharmony_ci    PALIGNR                 m2, m1, 6, m4
1708cabdff1aSopenharmony_ci    movhps   [dstq +strideq*0], m2
1709cabdff1aSopenharmony_ci    movq     [dst4q+strideq*0], m2
1710cabdff1aSopenharmony_ci    RET
1711cabdff1aSopenharmony_ci
1712cabdff1aSopenharmony_cicglobal vp9_ipred_hd_16x16, 4, 6, 7, dst, stride, l, a
1713cabdff1aSopenharmony_ci    mova                    m0, [lq]
1714cabdff1aSopenharmony_ci    movu                    m3, [aq-1]
1715cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, stride4, dst4, dst8, dst12
1716cabdff1aSopenharmony_ci    lea               stride4q, [strideq*4]
1717cabdff1aSopenharmony_ci    lea                  dst4q, [dstq +stride4q]
1718cabdff1aSopenharmony_ci    lea                  dst8q, [dst4q+stride4q]
1719cabdff1aSopenharmony_ci    lea                 dst12q, [dst8q+stride4q]
1720cabdff1aSopenharmony_ci    psrldq                  m4, m3,  1
1721cabdff1aSopenharmony_ci    psrldq                  m5, m3,  2
1722cabdff1aSopenharmony_ci    LOWPASS                  5,  4,  3,  6
1723cabdff1aSopenharmony_ci    PALIGNR                 m1, m3, m0,  1, m6
1724cabdff1aSopenharmony_ci    PALIGNR                 m2, m3, m0,  2, m6
1725cabdff1aSopenharmony_ci    LOWPASS                  2,  1,  0,  6
1726cabdff1aSopenharmony_ci    pavgb                   m1, m0
1727cabdff1aSopenharmony_ci    SBUTTERFLY              bw,  1,  2,  6
1728cabdff1aSopenharmony_ci
1729cabdff1aSopenharmony_ci    ; I PROBABLY INVERTED L0 ad L16 here
1730cabdff1aSopenharmony_ci    ; m1, m2, m5
1731cabdff1aSopenharmony_ci.loop:
1732cabdff1aSopenharmony_ci    sub               stride4q, strideq
1733cabdff1aSopenharmony_ci    movhps [dstq +stride4q +0], m2
1734cabdff1aSopenharmony_ci    movq   [dstq +stride4q +8], m5
1735cabdff1aSopenharmony_ci    mova   [dst4q+stride4q   ], m2
1736cabdff1aSopenharmony_ci    movhps [dst8q+stride4q +0], m1
1737cabdff1aSopenharmony_ci    movq   [dst8q+stride4q +8], m2
1738cabdff1aSopenharmony_ci    mova  [dst12q+stride4q   ], m1
1739cabdff1aSopenharmony_ci%if cpuflag(avx)
1740cabdff1aSopenharmony_ci    palignr                 m1, m2, m1, 2
1741cabdff1aSopenharmony_ci    palignr                 m2, m5, m2, 2
1742cabdff1aSopenharmony_ci%elif cpuflag(ssse3)
1743cabdff1aSopenharmony_ci    palignr                 m3, m2, m1, 2
1744cabdff1aSopenharmony_ci    palignr                 m0, m5, m2, 2
1745cabdff1aSopenharmony_ci    mova                    m1, m3
1746cabdff1aSopenharmony_ci    mova                    m2, m0
1747cabdff1aSopenharmony_ci%else
1748cabdff1aSopenharmony_ci    ; slightly modified version of PALIGNR
1749cabdff1aSopenharmony_ci    mova                    m6, m2
1750cabdff1aSopenharmony_ci    mova                    m4, m5
1751cabdff1aSopenharmony_ci    pslldq                  m6, 14
1752cabdff1aSopenharmony_ci    pslldq                  m4, 14
1753cabdff1aSopenharmony_ci    psrldq                  m1, 2
1754cabdff1aSopenharmony_ci    psrldq                  m2, 2
1755cabdff1aSopenharmony_ci    por                     m1, m6
1756cabdff1aSopenharmony_ci    por                     m2, m4
1757cabdff1aSopenharmony_ci%endif
1758cabdff1aSopenharmony_ci    psrldq                  m5, 2
1759cabdff1aSopenharmony_ci    jg .loop
1760cabdff1aSopenharmony_ci    RET
1761cabdff1aSopenharmony_ci
1762cabdff1aSopenharmony_cicglobal vp9_ipred_hd_32x32, 4, 6, 8, dst, stride, l, a
1763cabdff1aSopenharmony_ci    mova                    m0, [lq]
1764cabdff1aSopenharmony_ci    mova                    m1, [lq+16]
1765cabdff1aSopenharmony_ci    movu                    m2, [aq-1]
1766cabdff1aSopenharmony_ci    movu                    m3, [aq+15]
1767cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, stride8, dst8, dst16, dst24
1768cabdff1aSopenharmony_ci    lea               stride8q, [strideq*8]
1769cabdff1aSopenharmony_ci    lea                  dst8q, [dstq  +stride8q]
1770cabdff1aSopenharmony_ci    lea                 dst16q, [dst8q +stride8q]
1771cabdff1aSopenharmony_ci    lea                 dst24q, [dst16q+stride8q]
1772cabdff1aSopenharmony_ci    psrldq                  m4, m3,  1
1773cabdff1aSopenharmony_ci    psrldq                  m5, m3,  2
1774cabdff1aSopenharmony_ci    LOWPASS                  5,  4,  3,  6
1775cabdff1aSopenharmony_ci    PALIGNR                 m4, m3, m2,  2, m6
1776cabdff1aSopenharmony_ci    PALIGNR                 m3, m2,  1, m6
1777cabdff1aSopenharmony_ci    LOWPASS                  4,  3,  2,  6
1778cabdff1aSopenharmony_ci    PALIGNR                 m3, m2, m1,  2, m6
1779cabdff1aSopenharmony_ci    PALIGNR                 m2, m1,  1, m6
1780cabdff1aSopenharmony_ci    LOWPASS                  3,  2,  1,  6
1781cabdff1aSopenharmony_ci    pavgb                   m2, m1
1782cabdff1aSopenharmony_ci    PALIGNR                 m6, m1, m0,  1, m7
1783cabdff1aSopenharmony_ci    PALIGNR                 m1, m0,  2, m7
1784cabdff1aSopenharmony_ci    LOWPASS                  1,  6,  0,  7
1785cabdff1aSopenharmony_ci    pavgb                   m0, m6
1786cabdff1aSopenharmony_ci    SBUTTERFLY              bw,  2,  3,  6
1787cabdff1aSopenharmony_ci    SBUTTERFLY              bw,  0,  1,  6
1788cabdff1aSopenharmony_ci
1789cabdff1aSopenharmony_ci    ; m0, m1, m2, m3, m4, m5
1790cabdff1aSopenharmony_ci.loop:
1791cabdff1aSopenharmony_ci    sub               stride8q, strideq
1792cabdff1aSopenharmony_ci    mova  [dstq  +stride8q+ 0], m3
1793cabdff1aSopenharmony_ci    mova  [dstq  +stride8q+16], m4
1794cabdff1aSopenharmony_ci    mova  [dst8q +stride8q+ 0], m2
1795cabdff1aSopenharmony_ci    mova  [dst8q +stride8q+16], m3
1796cabdff1aSopenharmony_ci    mova  [dst16q+stride8q+ 0], m1
1797cabdff1aSopenharmony_ci    mova  [dst16q+stride8q+16], m2
1798cabdff1aSopenharmony_ci    mova  [dst24q+stride8q+ 0], m0
1799cabdff1aSopenharmony_ci    mova  [dst24q+stride8q+16], m1
1800cabdff1aSopenharmony_ci%if cpuflag(avx)
1801cabdff1aSopenharmony_ci    palignr                 m0, m1, m0, 2
1802cabdff1aSopenharmony_ci    palignr                 m1, m2, m1, 2
1803cabdff1aSopenharmony_ci    palignr                 m2, m3, m2, 2
1804cabdff1aSopenharmony_ci    palignr                 m3, m4, m3, 2
1805cabdff1aSopenharmony_ci    palignr                 m4, m5, m4, 2
1806cabdff1aSopenharmony_ci    psrldq                  m5, 2
1807cabdff1aSopenharmony_ci%elif cpuflag(ssse3)
1808cabdff1aSopenharmony_ci    psrldq                  m6, m5, 2
1809cabdff1aSopenharmony_ci    palignr                 m5, m4, 2
1810cabdff1aSopenharmony_ci    palignr                 m4, m3, 2
1811cabdff1aSopenharmony_ci    palignr                 m3, m2, 2
1812cabdff1aSopenharmony_ci    palignr                 m2, m1, 2
1813cabdff1aSopenharmony_ci    palignr                 m1, m0, 2
1814cabdff1aSopenharmony_ci    mova                    m0, m1
1815cabdff1aSopenharmony_ci    mova                    m1, m2
1816cabdff1aSopenharmony_ci    mova                    m2, m3
1817cabdff1aSopenharmony_ci    mova                    m3, m4
1818cabdff1aSopenharmony_ci    mova                    m4, m5
1819cabdff1aSopenharmony_ci    mova                    m5, m6
1820cabdff1aSopenharmony_ci%else
1821cabdff1aSopenharmony_ci    ; sort of a half-integrated version of PALIGNR
1822cabdff1aSopenharmony_ci    pslldq                  m7, m4, 14
1823cabdff1aSopenharmony_ci    pslldq                  m6, m5, 14
1824cabdff1aSopenharmony_ci    psrldq                  m4, 2
1825cabdff1aSopenharmony_ci    psrldq                  m5, 2
1826cabdff1aSopenharmony_ci    por                     m4, m6
1827cabdff1aSopenharmony_ci    pslldq                  m6, m3, 14
1828cabdff1aSopenharmony_ci    psrldq                  m3, 2
1829cabdff1aSopenharmony_ci    por                     m3, m7
1830cabdff1aSopenharmony_ci    pslldq                  m7, m2, 14
1831cabdff1aSopenharmony_ci    psrldq                  m2, 2
1832cabdff1aSopenharmony_ci    por                     m2, m6
1833cabdff1aSopenharmony_ci    pslldq                  m6, m1, 14
1834cabdff1aSopenharmony_ci    psrldq                  m1, 2
1835cabdff1aSopenharmony_ci    por                     m1, m7
1836cabdff1aSopenharmony_ci    psrldq                  m0, 2
1837cabdff1aSopenharmony_ci    por                     m0, m6
1838cabdff1aSopenharmony_ci%endif
1839cabdff1aSopenharmony_ci    jg .loop
1840cabdff1aSopenharmony_ci    RET
1841cabdff1aSopenharmony_ci%endmacro
1842cabdff1aSopenharmony_ci
1843cabdff1aSopenharmony_ciINIT_XMM sse2
1844cabdff1aSopenharmony_ciHD_XMM_FUNCS
1845cabdff1aSopenharmony_ciINIT_XMM ssse3
1846cabdff1aSopenharmony_ciHD_XMM_FUNCS
1847cabdff1aSopenharmony_ciINIT_XMM avx
1848cabdff1aSopenharmony_ciHD_XMM_FUNCS
1849cabdff1aSopenharmony_ci
;------------------------------------------------------------------------------
; HU_MMX_FUNCS: vp9_ipred_hu_4x4
;
; Expands to the 4x4 "hu" intra predictor on 64-bit MMX registers for the
; instruction set selected by the preceding INIT_MMX.  Args: dst, stride, l.
; Only the four left-edge bytes at [l] are read; the last one is replicated to
; pad the vector before filtering.
;
; NOTE(review): LOWPASS, PALIGNR and SBUTTERFLY are defined outside this
; excerpt; LOWPASS is assumed to leave its result in its first register
; argument -- confirm against the macro definition.
;------------------------------------------------------------------------------
%macro HU_MMX_FUNCS 0
cglobal vp9_ipred_hu_4x4, 3, 3, 0, dst, stride, l
    movd                    m0, [lq]
%if cpuflag(ssse3)
    ; NOTE(review): going by its name, pb_0to2_5x3 selects bytes 0,1,2 and
    ; then byte 3 five times, matching the fallback below -- the table itself
    ; is outside this excerpt
    pshufb                  m0, [pb_0to2_5x3]
%else
    punpcklbw               m1, m0, m0          ; 00112233
    pshufw                  m1, m1, q3333       ; 33333333
    punpckldq               m0, m1              ; 01233333
%endif
    psrlq                   m1, m0, 8           ; edge shifted by one byte
    psrlq                   m2, m1, 8           ; edge shifted by two bytes
    LOWPASS                  2,  1, 0, 3
    pavgb                   m1, m0
    ; l is consumed; its register is reused as stride3
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    SBUTTERFLY              bw,  1, 2, 0        ; byte-interleave m1 with m2
    PALIGNR                 m2, m1, 2, m0       ; m2 = m2:m1 shifted down 2 bytes
    movd      [dstq+strideq*0], m1              ; row 0: low dword of m1
    movd      [dstq+strideq*1], m2              ; row 1: low dword of m2
    punpckhdq               m1, m1              ; bring the high dwords down
    punpckhdq               m2, m2
    movd      [dstq+strideq*2], m1              ; row 2
    movd      [dstq+stride3q ], m2              ; row 3
    RET
%endmacro

; Instantiate the 4x4 hu predictor for each supported MMX-register variant.
INIT_MMX mmxext
HU_MMX_FUNCS
INIT_MMX ssse3
HU_MMX_FUNCS
1881cabdff1aSopenharmony_ci
;------------------------------------------------------------------------------
; HU_XMM_FUNCS n_xmm: vp9_ipred_hu_{8x8,16x16,32x32}
;
; Expands to the three XMM "hu" intra predictors for the instruction set
; selected by the preceding INIT_XMM.  Args of every function: dst, stride, l.
; Only the left edge [l] is read; positions past its end are padded with the
; last edge byte.
;
;   %1  number of XMM registers declared for hu_32x32 (the SSE2 fallback in
;       its loop additionally uses m7, hence 8 there and 7 otherwise)
;
; NOTE(review): LOWPASS, PALIGNR, SBUTTERFLY and the pb_* constants are
; defined outside this excerpt.  LOWPASS is assumed to leave its result in
; its first register argument; the pb_* contents are inferred from their
; names and from the SSE2 fallbacks next to them -- confirm.
; NOTE(review): the mova loads/stores presumably require l and the dst rows to
; be 16-byte aligned -- confirm with the callers.
;------------------------------------------------------------------------------
%macro HU_XMM_FUNCS 1 ; n_xmm_regs in hu_32x32
; ---- 8x8: uses m0-m3 ----
cglobal vp9_ipred_hu_8x8, 3, 4, 4, dst, stride, l
    movq                    m0, [lq]
%if cpuflag(ssse3)
    pshufb                  m0, [pb_0to6_9x7]
%else
    punpcklbw               m1, m0, m0          ; 0011223344556677
    punpckhwd               m1, m1              ; 4444555566667777
    shufps                  m0, m1, q3310       ; 0123456777777777
%endif
    psrldq                  m1, m0, 1           ; edge shifted by one byte
    psrldq                  m2, m1, 1           ; edge shifted by two bytes
    LOWPASS                  2,  1, 0, 3
    pavgb                   m1, m0
    ; l is consumed; its register is reused as stride3
    DEFINE_ARGS dst, stride, stride3, dst4
    lea               stride3q, [strideq*3]
    lea                  dst4q, [dstq+strideq*4]
    SBUTTERFLY              bw,  1, 2, 0        ; byte-interleave m1 with m2
    ; each 16-byte value yields two rows: low half -> rows 0-3 (via dstq),
    ; high half -> rows 4-7 (via dst4q); m2:m1 is shifted 2 bytes per row pair
    movq     [dstq +strideq*0], m1
    movhps   [dst4q+strideq*0], m1
    PALIGNR                 m0, m2, m1, 2, m3
    movq     [dstq +strideq*1], m0
    movhps   [dst4q+strideq*1], m0
    PALIGNR                 m0, m2, m1, 4, m3
    movq     [dstq +strideq*2], m0
    movhps   [dst4q+strideq*2], m0
    PALIGNR                 m2, m1, 6, m3
    movq     [dstq +stride3q ], m2
    movhps   [dst4q+stride3q ], m2
    RET

; ---- 16x16: uses m0-m4 ----
cglobal vp9_ipred_hu_16x16, 3, 4, 5, dst, stride, l
    mova                    m0, [lq]
    ; m1 / m2 = edge shifted by 1 / 2 bytes with the top padded.  m3 is kept
    ; for the loop: the pad shuffle mask (SSSE3) or the pad bytes to OR back
    ; in after a 2-byte shift (SSE2).
%if cpuflag(ssse3)
    mova                    m3, [pb_2toE_3xF]
    pshufb                  m1, m0, [pb_1toE_2xF]
    pshufb                  m2, m0, m3
%else
    pand                    m3, m0, [pb_15x0_1xm1]
    psrldq                  m1, m0, 1
    por                     m1, m3
    punpckhbw               m3, m3              ; duplicate into the top 2 bytes
    psrldq                  m2, m0, 2
    por                     m2, m3
%endif
    LOWPASS                  2,  1,  0,  4
    pavgb                   m1, m0
    ; l is consumed; its register is reused as stride9
    DEFINE_ARGS dst, stride, stride9, cnt
    lea               stride9q, [strideq*8+strideq]
    mov                   cntd,  4
    SBUTTERFLY              bw,  1,  2,  0      ; byte-interleave m1 with m2

    ; Four passes; each writes rows r, r+1 (top half) and r+8, r+9 (bottom
    ; half), then advances dst by two rows.
.loop:
    mova      [dstq+strideq*0], m1
    mova      [dstq+strideq*8], m2
    PALIGNR                 m0, m2, m1, 2, m4
    ; slide m2 down 2 bytes, re-padding its top
%if cpuflag(ssse3)
    pshufb                  m2, m3
%else
    psrldq                  m2, 2
    por                     m2, m3
%endif
    mova      [dstq+strideq*1], m0
    mova      [dstq+stride9q ], m2
    PALIGNR                 m1, m2, m0, 2, m4
%if cpuflag(ssse3)
    pshufb                  m2, m3
%else
    psrldq                  m2, 2
    por                     m2, m3
%endif
    lea                   dstq, [dstq+strideq*2]
    dec                   cntd
    jg .loop
    RET

; ---- 32x32: uses m0-m6, plus m7 in the SSE2 build ----
cglobal vp9_ipred_hu_32x32, 3, 7, %1, dst, stride, l
    mova                    m1, [lq]            ; l[0..15]
    mova                    m0, [lq+16]         ; l[16..31]
    ; lower half of the edge: filter -> m3, average -> m2
    PALIGNR                 m2, m0, m1,  1, m5
    PALIGNR                 m3, m0, m1,  2, m5
    LOWPASS                  3,  2,  1,  5
    pavgb                   m2, m1
    ; upper half: shifted copies with the top padded (m5: by 1, m1: by 2).
    ; m4 is kept for the loop: pad shuffle mask (SSSE3/AVX) or pad bytes (SSE2).
%if cpuflag(ssse3)
    mova                    m4, [pb_2toE_3xF]
    pshufb                  m5, m0, [pb_1toE_2xF]
    pshufb                  m1, m0, m4
%else
    pand                    m4, m0, [pb_15x0_1xm1]
    psrldq                  m5, m0, 1
    por                     m5, m4
    punpckhbw               m4, m4              ; duplicate into the top 2 bytes
    psrldq                  m1, m0, 2
    por                     m1, m4
%endif
    LOWPASS                  1,  5,  0,  6
    pavgb                   m0, m5
    ; l is consumed; its register is reused as the loop counter
    DEFINE_ARGS dst, stride, cnt, stride0, dst8, dst16, dst24
    mov                   cntd,  8
    xor               stride0q, stride0q
    lea                  dst8q, [dstq  +strideq*8]
    lea                 dst16q, [dst8q +strideq*8]
    lea                 dst24q, [dst16q+strideq*8]
    SBUTTERFLY              bw,  0,  1,  5
    SBUTTERFLY              bw,  2,  3,  5
    ; m6 = the byte at the top of the window replicated across all 16 lanes;
    ; it fills the right half of rows 24-31
%if cpuflag(ssse3)
    pshufb                  m6, m1, [pb_15]
%else
    pshufhw                 m6, m4, q3333
    punpckhqdq              m6, m6
%endif

    ; m2, m3, m0, m1 form a 64-byte window (m2 lowest, m1 highest).  Each of
    ; the 8 passes stores one 32-byte row in each 8-row band, then slides the
    ; window down by 2 bytes, re-padding the top of m1.
.loop:
    mova  [dstq  +stride0q+ 0], m2              ; rows  0-7
    mova  [dstq  +stride0q+16], m3
    mova  [dst8q +stride0q+ 0], m3              ; rows  8-15
    mova  [dst8q +stride0q+16], m0
    mova  [dst16q+stride0q+ 0], m0              ; rows 16-23
    mova  [dst16q+stride0q+16], m1
    mova  [dst24q+stride0q+ 0], m1              ; rows 24-31
    mova  [dst24q+stride0q+16], m6
%if cpuflag(avx)
    palignr                 m2, m3, m2, 2
    palignr                 m3, m0, m3, 2
    palignr                 m0, m1, m0, 2
    pshufb                  m1, m4
%elif cpuflag(ssse3)
    ; each palignr leaves the next register's new value in its first operand;
    ; the mova chain afterwards puts every result where it belongs
    pshufb                  m5, m1, m4
    palignr                 m1, m0, 2
    palignr                 m0, m3, 2
    palignr                 m3, m2, 2
    mova                    m2, m3
    mova                    m3, m0
    mova                    m0, m1
    mova                    m1, m5
%else
    ; half-integrated version of PALIGNR
    ; (m5/m7 carry the 2 bytes handed down to the register below)
    pslldq                  m5, m1, 14
    pslldq                  m7, m0, 14
    psrldq                  m1, 2
    psrldq                  m0, 2
    por                     m1, m4
    por                     m0, m5
    pslldq                  m5, m3, 14
    psrldq                  m3, 2
    por                     m3, m7
    psrldq                  m2, 2
    por                     m2, m5
%endif
    add               stride0q, strideq
    dec                   cntd
    jg .loop
    RET
%endmacro

; Instantiate the hu predictors; the argument is hu_32x32's XMM register count.
INIT_XMM sse2
HU_XMM_FUNCS 8
INIT_XMM ssse3
HU_XMM_FUNCS 7
INIT_XMM avx
HU_XMM_FUNCS 7
2043cabdff1aSopenharmony_ci
; FIXME 127, 128, 129 ?