1cabdff1aSopenharmony_ci;******************************************************************************
2cabdff1aSopenharmony_ci;* VP9 Intra prediction SIMD optimizations
3cabdff1aSopenharmony_ci;*
4cabdff1aSopenharmony_ci;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com>
5cabdff1aSopenharmony_ci;* Copyright (c) 2015 Henrik Gramner <henrik gramner com>
6cabdff1aSopenharmony_ci;*
7cabdff1aSopenharmony_ci;* This file is part of FFmpeg.
8cabdff1aSopenharmony_ci;*
9cabdff1aSopenharmony_ci;* FFmpeg is free software; you can redistribute it and/or
10cabdff1aSopenharmony_ci;* modify it under the terms of the GNU Lesser General Public
11cabdff1aSopenharmony_ci;* License as published by the Free Software Foundation; either
12cabdff1aSopenharmony_ci;* version 2.1 of the License, or (at your option) any later version.
13cabdff1aSopenharmony_ci;*
14cabdff1aSopenharmony_ci;* FFmpeg is distributed in the hope that it will be useful,
15cabdff1aSopenharmony_ci;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16cabdff1aSopenharmony_ci;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17cabdff1aSopenharmony_ci;* Lesser General Public License for more details.
18cabdff1aSopenharmony_ci;*
19cabdff1aSopenharmony_ci;* You should have received a copy of the GNU Lesser General Public
20cabdff1aSopenharmony_ci;* License along with FFmpeg; if not, write to the Free Software
21cabdff1aSopenharmony_ci;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22cabdff1aSopenharmony_ci;******************************************************************************
23cabdff1aSopenharmony_ci
24cabdff1aSopenharmony_ci%include "libavutil/x86/x86util.asm"
25cabdff1aSopenharmony_ci
26cabdff1aSopenharmony_ciSECTION_RODATA 32
27cabdff1aSopenharmony_ci
; Dword constants, each replicated over eight lanes (32 bytes per label).
; In this file they are added as the rounding term before the right shift
; in the DC predictors.
pd_2:             times 8 dd 2
pd_4:             times 8 dd 4
pd_8:             times 8 dd 8

; 16-byte index tables; each label spells out the byte pattern it holds.
; NOTE(review): presumably byte-shuffle masks -- their users are not in the
; part of the file reviewed here.
pb_2to15_14_15:   db  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
                  db 14, 15
pb_4_5_8to13_8x0: db  4,  5,  8,  9, 10, 11, 12, 13
                  times 8 db 0
pb_0to7_67x4:     db  0,  1,  2,  3,  4,  5,  6,  7
                  times 4 db 6, 7
35cabdff1aSopenharmony_ci
36cabdff1aSopenharmony_cicextern pw_1
37cabdff1aSopenharmony_cicextern pw_1023
38cabdff1aSopenharmony_cicextern pw_4095
39cabdff1aSopenharmony_cicextern pd_16
40cabdff1aSopenharmony_cicextern pd_32
41cabdff1aSopenharmony_cicextern pd_65535;
42cabdff1aSopenharmony_ci
43cabdff1aSopenharmony_ci; FIXME most top-only functions (ddl, vl, v, dc_top) can be modified to take
44cabdff1aSopenharmony_ci; only 3 registers on x86-32, which would make it one cycle faster, but that
45cabdff1aSopenharmony_ci; would make the code quite a bit uglier...
46cabdff1aSopenharmony_ci
47cabdff1aSopenharmony_ciSECTION .text
48cabdff1aSopenharmony_ci
; SCRATCH src, spare, mem [, name]
;   Parks vector register m<src> so that its number can be reused.
;     x86-64: renumbers it to m<spare> via SWAP (mem is ignored).
;     x86-32: stores it to [mem] (spare is ignored).
;   When the optional 4th argument is given, reg_<name> is defined as an
;   alias of wherever the value now lives (m<spare> or [mem]).
%macro SCRATCH 3-4
; step 1: move the value out of the way
%if ARCH_X86_64
    SWAP            %1, %2
%else
    mova            [%3], m%1
%endif
; step 2: optionally publish an alias for the parked value
%if %0 == 4
%if ARCH_X86_64
%define reg_%4 m%2
%else
%define reg_%4 [%3]
%endif
%endif
%endmacro
62cabdff1aSopenharmony_ci
; UNSCRATCH dst, spare, mem [, name]
;   Counterpart of SCRATCH: brings the parked value back as m<dst>.
;     x86-32: reloads it from [mem].
;     x86-64: renumbers m<spare> back via SWAP.
;   When the optional 4th argument is given, the reg_<name> alias is removed.
%macro UNSCRATCH 3-4
%if ARCH_X86_64 == 0
    mova            m%1, [%3]
%else
    SWAP            %1, %2
%endif
%if %0 == 4
%undef reg_%4
%endif
%endmacro
73cabdff1aSopenharmony_ci
; PRELOAD reg, mem [, name]
;   x86-64: loads [mem] into m<reg>.
;   x86-32: emits no instruction.
;   When the optional 3rd argument is given, reg_<name> is defined as
;   m<reg> on x86-64 and as the memory operand [mem] on x86-32.
%macro PRELOAD 2-3
%if ARCH_X86_64
    mova            m%1, [%2]
%endif
%if %0 == 3
%if ARCH_X86_64
%define reg_%3 m%1
%else
%define reg_%3 [%2]
%endif
%endif
%endmacro
84cabdff1aSopenharmony_ci
; vp9_ipred_v_4x4_16(dst, stride, l, a)
;   Reads one MMX register's worth of data from [a] and writes it unchanged
;   to four consecutive rows: dst + stride*{0,1,2,3}.
;   The third argument (l) is never read, so its register is named stride3
;   right away and holds stride*3.
INIT_MMX mmx
cglobal vp9_ipred_v_4x4_16, 2, 4, 1, dst, stride, stride3, a
    movifnidn       aq, amp                 ; only 2 args are auto-loaded; fetch a if needed
    lea             stride3q, [strideq*3]
    mova            m0, [aq]
    mova            [dstq], m0              ; row 0
    mova            [dstq+strideq*1], m0    ; row 1
    mova            [dstq+strideq*2], m0    ; row 2
    mova            [dstq+stride3q], m0     ; row 3
    RET
96cabdff1aSopenharmony_ci
; vp9_ipred_v_8x8_16(dst, stride, l, a)
;   Reads one XMM register's worth of data from [a] and writes it unchanged
;   to eight consecutive rows of dst, in two groups of four.
;   The third argument (l) is never read; its register holds stride*3.
INIT_XMM sse
cglobal vp9_ipred_v_8x8_16, 2, 4, 1, dst, stride, stride3, a
    movifnidn       aq, amp                 ; only 2 args are auto-loaded; fetch a if needed
    lea             stride3q, [strideq*3]
    mova            m0, [aq]

    ; rows 0-3
    mova            [dstq], m0
    mova            [dstq+strideq*1], m0
    mova            [dstq+strideq*2], m0
    mova            [dstq+stride3q], m0

    ; rows 4-7
    lea             dstq, [dstq+strideq*4]
    mova            [dstq], m0
    mova            [dstq+strideq*1], m0
    mova            [dstq+strideq*2], m0
    mova            [dstq+stride3q], m0
    RET
113cabdff1aSopenharmony_ci
; vp9_ipred_v_16x16_16(dst, stride, l, a)
;   Reads two XMM registers' worth of data from [a] and replicates that
;   32-byte row into 16 rows of dst (4 loop iterations x 4 rows).
;   The third argument (l) is never read; its register holds stride*3.
INIT_XMM sse
cglobal vp9_ipred_v_16x16_16, 2, 4, 2, dst, stride, stride3, a
    movifnidn       aq, amp                 ; only 2 args are auto-loaded; fetch a if needed
    lea             stride3q, [strideq*3]
    mova            m0, [aq+mmsize*0]
    mova            m1, [aq+mmsize*1]
    DEFINE_ARGS dst, stride, stride3, cnt   ; a is dead now: its register becomes cnt
    mov             cntd, 4
.four_rows:
    mova            [dstq+ 0], m0
    mova            [dstq+16], m1
    mova            [dstq+strideq*1+ 0], m0
    mova            [dstq+strideq*1+16], m1
    mova            [dstq+strideq*2+ 0], m0
    mova            [dstq+strideq*2+16], m1
    mova            [dstq+stride3q+ 0], m0
    mova            [dstq+stride3q+16], m1
    lea             dstq, [dstq+strideq*4]
    dec             cntd
    jg .four_rows
    RET
135cabdff1aSopenharmony_ci
; vp9_ipred_v_32x32_16(dst, stride, l, a)
;   Reads four XMM registers' worth of data from [a] and replicates that
;   64-byte row into 32 rows of dst (16 loop iterations x 2 rows).
;   The third argument (l) is never read; its register is the row counter.
INIT_XMM sse
cglobal vp9_ipred_v_32x32_16, 2, 4, 4, dst, stride, cnt, a
    movifnidn       aq, amp                 ; only 2 args are auto-loaded; fetch a if needed
    mov             cntd, 16
    mova            m0, [aq+mmsize*0]
    mova            m1, [aq+mmsize*1]
    mova            m2, [aq+mmsize*2]
    mova            m3, [aq+mmsize*3]
.two_rows:
    ; even row
    mova            [dstq+ 0], m0
    mova            [dstq+16], m1
    mova            [dstq+32], m2
    mova            [dstq+48], m3
    ; odd row
    mova            [dstq+strideq*1+ 0], m0
    mova            [dstq+strideq*1+16], m1
    mova            [dstq+strideq*1+32], m2
    mova            [dstq+strideq*1+48], m3
    lea             dstq, [dstq+strideq*2]
    dec             cntd
    jg .two_rows
    RET
158cabdff1aSopenharmony_ci
; vp9_ipred_h_4x4_16(dst, stride, l, a)
;   Loads four 16-bit words from [l] and fills each of four rows with one of
;   them broadcast across the row. Word 3 goes to row 0, word 0 to row 3.
;   a is not read.
INIT_MMX mmxext
cglobal vp9_ipred_h_4x4_16, 3, 3, 4, dst, stride, l, a
    mova            m3, [lq]
    DEFINE_ARGS dst, stride, stride3        ; l is consumed: its register becomes stride3
    lea             stride3q, [strideq*3]

    pshufw          m0, m3, q3333
    mova            [dstq], m0              ; row 0 <- word 3
    pshufw          m1, m3, q2222
    mova            [dstq+strideq*1], m1    ; row 1 <- word 2
    pshufw          m2, m3, q1111
    mova            [dstq+strideq*2], m2    ; row 2 <- word 1
    pshufw          m3, m3, q0000
    mova            [dstq+stride3q], m3     ; row 3 <- word 0
    RET
173cabdff1aSopenharmony_ci
; vp9_ipred_h_8x8_16(dst, stride, l, a)
;   Loads eight 16-bit words from [l] and fills each of eight rows with one
;   of them broadcast across the row. Word 7 goes to row 0, word 0 to row 7.
;   a is not read.
INIT_XMM sse2
cglobal vp9_ipred_h_8x8_16, 3, 3, 4, dst, stride, l, a
    mova            m2, [lq]
    DEFINE_ARGS dst, stride, stride3        ; l is consumed: its register becomes stride3
    lea             stride3q, [strideq*3]
    punpckhwd       m3, m2, m2              ; m3 = words 4..7, each duplicated into a dword
    punpcklwd       m2, m2                  ; m2 = words 0..3, each duplicated into a dword

    ; rows 0-3 come from the upper half (words 7,6,5,4)
    pshufd          m0, m3, q3333
    pshufd          m1, m3, q2222
    mova            [dstq], m0
    mova            [dstq+strideq*1], m1
    pshufd          m0, m3, q1111
    pshufd          m1, m3, q0000
    mova            [dstq+strideq*2], m0
    mova            [dstq+stride3q], m1

    ; rows 4-7 come from the lower half (words 3,2,1,0)
    lea             dstq, [dstq+strideq*4]
    pshufd          m0, m2, q3333
    pshufd          m1, m2, q2222
    mova            [dstq], m0
    mova            [dstq+strideq*1], m1
    pshufd          m0, m2, q1111
    pshufd          m1, m2, q0000
    mova            [dstq+strideq*2], m0
    mova            [dstq+stride3q], m1
    RET
199cabdff1aSopenharmony_ci
; vp9_ipred_h_16x16_16(dst, stride, l, a)
;   Fills 16 rows of 32 bytes; every row is a single 16-bit word from [l]
;   broadcast across the row. l is consumed four words at a time from its
;   last group (cnt = 3) down to its first (cnt = 0), and within each group
;   from word 3 down to word 0, while dst advances downwards.
;   a is not read; stride3 and cnt are plain scratch registers.
INIT_XMM sse2
cglobal vp9_ipred_h_16x16_16, 3, 5, 4, dst, stride, l, stride3, cnt
    lea             stride3q, [strideq*3]
    mov             cntd, 3
.four_rows:
    movh            m3, [lq+cntq*8]         ; four words of l (8 bytes per group)
    punpcklwd       m3, m3                  ; duplicate each word into a dword

    pshufd          m0, m3, q3333
    mova            [dstq+ 0], m0
    mova            [dstq+16], m0
    pshufd          m1, m3, q2222
    mova            [dstq+strideq*1+ 0], m1
    mova            [dstq+strideq*1+16], m1
    pshufd          m2, m3, q1111
    mova            [dstq+strideq*2+ 0], m2
    mova            [dstq+strideq*2+16], m2
    pshufd          m3, m3, q0000
    mova            [dstq+stride3q+ 0], m3
    mova            [dstq+stride3q+16], m3

    lea             dstq, [dstq+strideq*4]
    dec             cntd
    jge .four_rows                          ; cnt runs 3,2,1,0
    RET
223cabdff1aSopenharmony_ci
; vp9_ipred_h_32x32_16(dst, stride, l, a)
;   Fills 32 rows of 64 bytes; every row is a single 16-bit word from [l]
;   broadcast across the row. l is consumed four words at a time from its
;   last group (cnt = 7) down to its first (cnt = 0), and within each group
;   from word 3 down to word 0, while dst advances downwards.
;   a is not read; stride3 and cnt are plain scratch registers.
INIT_XMM sse2
cglobal vp9_ipred_h_32x32_16, 3, 5, 4, dst, stride, l, stride3, cnt
    lea             stride3q, [strideq*3]
    mov             cntd, 7
.four_rows:
    movh            m3, [lq+cntq*8]         ; four words of l (8 bytes per group)
    punpcklwd       m3, m3                  ; duplicate each word into a dword

    pshufd          m0, m3, q3333
    mova            [dstq+ 0], m0
    mova            [dstq+16], m0
    mova            [dstq+32], m0
    mova            [dstq+48], m0
    pshufd          m1, m3, q2222
    mova            [dstq+strideq*1+ 0], m1
    mova            [dstq+strideq*1+16], m1
    mova            [dstq+strideq*1+32], m1
    mova            [dstq+strideq*1+48], m1
    pshufd          m2, m3, q1111
    mova            [dstq+strideq*2+ 0], m2
    mova            [dstq+strideq*2+16], m2
    mova            [dstq+strideq*2+32], m2
    mova            [dstq+strideq*2+48], m2
    pshufd          m3, m3, q0000
    mova            [dstq+stride3q+ 0], m3
    mova            [dstq+stride3q+16], m3
    mova            [dstq+stride3q+32], m3
    mova            [dstq+stride3q+48], m3

    lea             dstq, [dstq+strideq*4]
    dec             cntd
    jge .four_rows                          ; cnt runs 7..0
    RET
255cabdff1aSopenharmony_ci
; vp9_ipred_dc_4x4_16(dst, stride, l, a)
;   Sums the four words at [l] and the four words at [a], computes
;   (sum + 4) >> 3, and fills four rows of dst with that word.
INIT_MMX mmxext
cglobal vp9_ipred_dc_4x4_16, 4, 4, 2, dst, stride, l, a
    mova            m0, [aq]
    paddw           m0, [lq]                ; per-lane a + l
    DEFINE_ARGS dst, stride, stride3        ; l/a are consumed; reuse a register for stride3
    lea             stride3q, [strideq*3]

    pmaddwd         m0, [pw_1]              ; add adjacent word pairs -> 2 dwords
    pshufw          m1, m0, q3232           ; bring the high dword down
    paddd           m0, m1                  ; low dword = full sum
    paddd           m0, [pd_4]              ; rounding term
    psrad           m0, 3
    pshufw          m0, m0, q0000           ; broadcast the result word

    mova            [dstq], m0
    mova            [dstq+strideq*1], m0
    mova            [dstq+strideq*2], m0
    mova            [dstq+stride3q], m0
    RET
273cabdff1aSopenharmony_ci
; vp9_ipred_dc_8x8_16(dst, stride, l, a)
;   Sums the eight words at [l] and the eight words at [a], computes
;   (sum + 8) >> 4, and fills eight rows of dst with that word.
INIT_XMM sse2
cglobal vp9_ipred_dc_8x8_16, 4, 4, 2, dst, stride, l, a
    mova            m0, [aq]
    paddw           m0, [lq]                ; per-lane a + l
    DEFINE_ARGS dst, stride, stride3        ; l/a are consumed; reuse a register for stride3
    lea             stride3q, [strideq*3]

    pmaddwd         m0, [pw_1]              ; add adjacent word pairs -> 4 dwords
    pshufd          m1, m0, q3232
    paddd           m0, m1                  ; fold upper half onto lower half
    pshufd          m1, m0, q1111
    paddd           m0, m1                  ; low dword = full sum
    paddd           m0, [pd_8]              ; rounding term
    psrad           m0, 4
    pshuflw         m0, m0, q0000           ; broadcast the result word ...
    punpcklqdq      m0, m0                  ; ... to all eight lanes

    ; rows 0-3
    mova            [dstq], m0
    mova            [dstq+strideq*1], m0
    mova            [dstq+strideq*2], m0
    mova            [dstq+stride3q], m0
    ; rows 4-7
    lea             dstq, [dstq+strideq*4]
    mova            [dstq], m0
    mova            [dstq+strideq*1], m0
    mova            [dstq+strideq*2], m0
    mova            [dstq+stride3q], m0
    RET
299cabdff1aSopenharmony_ci
; vp9_ipred_dc_16x16_16(dst, stride, l, a)
;   Sums the 16 words at [l] and the 16 words at [a], computes
;   (sum + rounding) >> 5, and fills 16 rows of 32 bytes with that word.
;   The rounding term is the external constant pd_16.
INIT_XMM sse2
cglobal vp9_ipred_dc_16x16_16, 4, 4, 2, dst, stride, l, a
    mova            m0, [aq]
    paddw           m0, [aq+mmsize]
    paddw           m0, [lq]
    paddw           m0, [lq+mmsize]         ; 8 word lanes, each the sum of 4 inputs
    DEFINE_ARGS dst, stride, stride3, cnt   ; l/a are consumed; reuse their registers
    mov             cntd, 4
    lea             stride3q, [strideq*3]

    pmaddwd         m0, [pw_1]              ; add adjacent word pairs -> 4 dwords
    pshufd          m1, m0, q3232
    paddd           m0, m1                  ; fold upper half onto lower half
    pshufd          m1, m0, q1111
    paddd           m0, m1                  ; low dword = full sum
    paddd           m0, [pd_16]             ; rounding term
    psrad           m0, 5
    pshuflw         m0, m0, q0000           ; broadcast the result word ...
    punpcklqdq      m0, m0                  ; ... to all eight lanes
.four_rows:
    mova            [dstq+ 0], m0
    mova            [dstq+16], m0
    mova            [dstq+strideq*1+ 0], m0
    mova            [dstq+strideq*1+16], m0
    mova            [dstq+strideq*2+ 0], m0
    mova            [dstq+strideq*2+16], m0
    mova            [dstq+stride3q+ 0], m0
    mova            [dstq+stride3q+16], m0
    lea             dstq, [dstq+strideq*4]
    dec             cntd
    jg .four_rows
    RET
331cabdff1aSopenharmony_ci
; vp9_ipred_dc_32x32_16(dst, stride, l, a)
;   Sums the 32 words at [l] and the 32 words at [a], computes
;   (sum + rounding) >> 6, and fills 32 rows of 64 bytes with that word.
;   The rounding term is the external constant pd_32.
;
;   The store loop only addresses rows 0 and 1 of each pair, so no stride*3
;   register is needed; the counter simply reuses a register freed once l
;   and a have been read.
;
;   NOTE(review): the eight input vectors are accumulated as 16-bit words
;   before pmaddwd widens them, so each lane sum must stay within the signed
;   16-bit range. That holds for samples of up to 12 bits (8 * 4095 = 32760);
;   confirm no deeper bit depth is routed here.
INIT_XMM sse2
cglobal vp9_ipred_dc_32x32_16, 4, 4, 2, dst, stride, l, a
    mova                    m0, [lq+mmsize*0]
    paddw                   m0, [lq+mmsize*1]
    paddw                   m0, [lq+mmsize*2]
    paddw                   m0, [lq+mmsize*3]
    paddw                   m0, [aq+mmsize*0]
    paddw                   m0, [aq+mmsize*1]
    paddw                   m0, [aq+mmsize*2]
    paddw                   m0, [aq+mmsize*3]
    DEFINE_ARGS dst, stride, cnt            ; l and a are dead from here on
    mov                   cntd, 16          ; 16 iterations x 2 rows
    pmaddwd                 m0, [pw_1]      ; add adjacent word pairs -> 4 dwords
    pshufd                  m1, m0, q3232
    paddd                   m0, m1          ; fold upper half onto lower half
    pshufd                  m1, m0, q1111
    paddd                   m0, [pd_32]     ; rounding term
    paddd                   m0, m1          ; low dword = full sum + rounding
    psrad                   m0, 6
    pshuflw                 m0, m0, q0000   ; broadcast the result word ...
    punpcklqdq              m0, m0          ; ... to all eight lanes
.loop:
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m0
    mova   [dstq+strideq*0+32], m0
    mova   [dstq+strideq*0+48], m0
    mova   [dstq+strideq*1+ 0], m0
    mova   [dstq+strideq*1+16], m0
    mova   [dstq+strideq*1+32], m0
    mova   [dstq+strideq*1+48], m0
    lea                   dstq, [dstq+strideq*2]
    dec                   cntd
    jg .loop
    RET
367cabdff1aSopenharmony_ci
; DC_1D_FNS name, src
;   Emits the one-sided DC predictors vp9_ipred_dc_<name>_{4x4,8x8,16x16,
;   32x32}_16. Each one sums the N words found at [src] (N = 4, 8, 16, 32),
;   computes (sum + N/2) >> log2(N) and fills the N x N block at dst with
;   that word. src is the pointer register to read (aq or lq); the other
;   edge pointer is not touched.
;   For 32x32 the rounding term is the external constant pd_16.
%macro DC_1D_FNS 2
INIT_MMX mmxext
cglobal vp9_ipred_dc_%1_4x4_16, 4, 4, 2, dst, stride, l, a
    mova            m0, [%2]
    DEFINE_ARGS dst, stride, stride3        ; edge pointers are dead from here on
    lea             stride3q, [strideq*3]

    pmaddwd         m0, [pw_1]              ; add adjacent word pairs -> 2 dwords
    pshufw          m1, m0, q3232
    paddd           m0, m1                  ; low dword = full sum
    paddd           m0, [pd_2]              ; rounding term
    psrad           m0, 2
    pshufw          m0, m0, q0000           ; broadcast the result word

    mova            [dstq], m0
    mova            [dstq+strideq*1], m0
    mova            [dstq+strideq*2], m0
    mova            [dstq+stride3q], m0
    RET

INIT_XMM sse2
cglobal vp9_ipred_dc_%1_8x8_16, 4, 4, 2, dst, stride, l, a
    mova            m0, [%2]
    DEFINE_ARGS dst, stride, stride3        ; edge pointers are dead from here on
    lea             stride3q, [strideq*3]

    pmaddwd         m0, [pw_1]              ; add adjacent word pairs -> 4 dwords
    pshufd          m1, m0, q3232
    paddd           m0, m1                  ; fold upper half onto lower half
    pshufd          m1, m0, q1111
    paddd           m0, m1                  ; low dword = full sum
    paddd           m0, [pd_4]              ; rounding term
    psrad           m0, 3
    pshuflw         m0, m0, q0000           ; broadcast the result word ...
    punpcklqdq      m0, m0                  ; ... to all eight lanes

    ; rows 0-3
    mova            [dstq], m0
    mova            [dstq+strideq*1], m0
    mova            [dstq+strideq*2], m0
    mova            [dstq+stride3q], m0
    ; rows 4-7
    lea             dstq, [dstq+strideq*4]
    mova            [dstq], m0
    mova            [dstq+strideq*1], m0
    mova            [dstq+strideq*2], m0
    mova            [dstq+stride3q], m0
    RET

INIT_XMM sse2
cglobal vp9_ipred_dc_%1_16x16_16, 4, 4, 2, dst, stride, l, a
    mova            m0, [%2+mmsize*0]
    paddw           m0, [%2+mmsize*1]
    DEFINE_ARGS dst, stride, stride3, cnt   ; edge pointers are dead from here on
    mov             cntd, 4
    lea             stride3q, [strideq*3]

    pmaddwd         m0, [pw_1]              ; add adjacent word pairs -> 4 dwords
    pshufd          m1, m0, q3232
    paddd           m0, m1                  ; fold upper half onto lower half
    pshufd          m1, m0, q1111
    paddd           m0, m1                  ; low dword = full sum
    paddd           m0, [pd_8]              ; rounding term
    psrad           m0, 4
    pshuflw         m0, m0, q0000           ; broadcast the result word ...
    punpcklqdq      m0, m0                  ; ... to all eight lanes
.four_rows:
    mova            [dstq+ 0], m0
    mova            [dstq+16], m0
    mova            [dstq+strideq*1+ 0], m0
    mova            [dstq+strideq*1+16], m0
    mova            [dstq+strideq*2+ 0], m0
    mova            [dstq+strideq*2+16], m0
    mova            [dstq+stride3q+ 0], m0
    mova            [dstq+stride3q+16], m0
    lea             dstq, [dstq+strideq*4]
    dec             cntd
    jg .four_rows
    RET

INIT_XMM sse2
cglobal vp9_ipred_dc_%1_32x32_16, 4, 4, 2, dst, stride, l, a
    mova            m0, [%2+mmsize*0]
    paddw           m0, [%2+mmsize*1]
    paddw           m0, [%2+mmsize*2]
    paddw           m0, [%2+mmsize*3]
    DEFINE_ARGS dst, stride, cnt            ; edge pointers are dead from here on
    mov             cntd, 16

    pmaddwd         m0, [pw_1]              ; add adjacent word pairs -> 4 dwords
    pshufd          m1, m0, q3232
    paddd           m0, m1                  ; fold upper half onto lower half
    pshufd          m1, m0, q1111
    paddd           m0, m1                  ; low dword = full sum
    paddd           m0, [pd_16]             ; rounding term
    psrad           m0, 5
    pshuflw         m0, m0, q0000           ; broadcast the result word ...
    punpcklqdq      m0, m0                  ; ... to all eight lanes
.two_rows:
    mova            [dstq+ 0], m0
    mova            [dstq+16], m0
    mova            [dstq+32], m0
    mova            [dstq+48], m0
    mova            [dstq+strideq*1+ 0], m0
    mova            [dstq+strideq*1+16], m0
    mova            [dstq+strideq*1+32], m0
    mova            [dstq+strideq*1+48], m0
    lea             dstq, [dstq+strideq*2]
    dec             cntd
    jg .two_rows
    RET
%endmacro

; Instantiate the top-only (reads a) and left-only (reads l) variants.
DC_1D_FNS top,  aq
DC_1D_FNS left, lq
475cabdff1aSopenharmony_ci
; vp9_ipred_tm_4x4_10(dst, stride, l, a)
;   For each of four rows, writes
;       clamp(l_word + a[0..3] - a[-1], 0, max)
;   where a[-1] is the word just before [a], and l_word is one word of [l]
;   (word 3 for row 0 down to word 0 for row 3).
;   The upper clamp bound is loaded into m5 before .body; this entry point
;   loads it from the external constant pw_1023.
;   .body is also entered from vp9_ipred_tm_4x4_12 with m5 already set.
INIT_MMX mmxext
cglobal vp9_ipred_tm_4x4_10, 4, 4, 6, dst, stride, l, a
    mova            m5, [pw_1023]
.body:
    mova            m3, [lq]
    mova            m4, [aq]
    movd            m0, [aq-4]              ; words a[-2], a[-1]
    pshufw          m0, m0, q1111           ; broadcast a[-1]
    psubw           m4, m0                  ; m4 = a[x] - a[-1]
    DEFINE_ARGS dst, stride, stride3        ; l/a are consumed; reuse a register for stride3
    lea             stride3q, [strideq*3]

    ; broadcast one word of l per row and add the horizontal delta
    pshufw          m0, m3, q3333
    paddw           m0, m4
    pshufw          m1, m3, q2222
    paddw           m1, m4
    pshufw          m2, m3, q1111
    paddw           m2, m4
    pshufw          m3, m3, q0000
    paddw           m3, m4

    ; m4 is no longer needed as the delta: reuse it as the lower clamp bound
    pxor            m4, m4
    pmaxsw          m0, m4
    pminsw          m0, m5
    mova            [dstq], m0
    pmaxsw          m1, m4
    pminsw          m1, m5
    mova            [dstq+strideq*1], m1
    pmaxsw          m2, m4
    pminsw          m2, m5
    mova            [dstq+strideq*2], m2
    pmaxsw          m3, m4
    pminsw          m3, m5
    mova            [dstq+stride3q], m3
    RET

; vp9_ipred_tm_4x4_12(dst, stride, l, a)
;   Same routine with the upper clamp bound taken from pw_4095: loads m5 and
;   continues at the shared .body of vp9_ipred_tm_4x4_10. Both entry points
;   are declared with the same cglobal register counts.
cglobal vp9_ipred_tm_4x4_12, 4, 4, 6, dst, stride, l, a
    mova            m5, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_4x4_10 %+ SUFFIX).body
513cabdff1aSopenharmony_ci
514cabdff1aSopenharmony_ciINIT_XMM sse2
; vp9_ipred_tm_8x8_10(dst, stride, l, a)
;
; Fills an 8x8 block of 16-bit pixels with
;     clip(l[k] + a[x] - a[-1], 0, max)
; where a[-1] is the word stored just before the above row and max is the
; word constant loaded from pw_1023 (defined outside this section).
; The left pixels are consumed from the highest index downwards: the first
; row written uses l[7] and the last row uses l[0].
;
; stride is applied as a byte offset to dst. Loads/stores use mova, so
; a and every dst row are presumably required to be 16-byte aligned.
;
; The .body label doubles as the entry point of the _12 variant, which
; jumps here with its own clamp constant already in m4.
515cabdff1aSopenharmony_cicglobal vp9_ipred_tm_8x8_10, 4, 5, 7, dst, stride, l, a
516cabdff1aSopenharmony_ci    mova                    m4, [pw_1023]           ; m4 = upper clamp bound
517cabdff1aSopenharmony_ci.body:
518cabdff1aSopenharmony_ci    pxor                    m6, m6                  ; m6 = 0 (lower clamp bound)
519cabdff1aSopenharmony_ci    mova                    m5, [aq]                ; a[0-7]
520cabdff1aSopenharmony_ci    movd                    m0, [aq-4]              ; a[-2], a[-1]
521cabdff1aSopenharmony_ci    pshuflw                 m0, m0, q1111           ; a[-1] in the low four words
522cabdff1aSopenharmony_ci    punpcklqdq              m0, m0                  ; a[-1] in all eight words
523cabdff1aSopenharmony_ci    psubw                   m5, m0                  ; m5 = a[x] - a[-1]
; a is fully consumed; its register is renamed to stride3 and a fifth
; register becomes the loop counter.
524cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, l, stride3, cnt
525cabdff1aSopenharmony_ci    lea               stride3q, [strideq*3]
526cabdff1aSopenharmony_ci    mov                   cntd, 1                   ; two passes (cnt = 1, 0), four rows each
527cabdff1aSopenharmony_ci.loop:
528cabdff1aSopenharmony_ci    movh                    m3, [lq+cntq*8]         ; four left pixels l[4*cnt .. 4*cnt+3]
529cabdff1aSopenharmony_ci    punpcklwd               m3, m3                  ; duplicate each word into a dword
530cabdff1aSopenharmony_ci    pshufd                  m0, m3, q3333           ; broadcast l[4*cnt+3]
531cabdff1aSopenharmony_ci    pshufd                  m1, m3, q2222           ; broadcast l[4*cnt+2]
532cabdff1aSopenharmony_ci    pshufd                  m2, m3, q1111           ; broadcast l[4*cnt+1]
533cabdff1aSopenharmony_ci    pshufd                  m3, m3, q0000           ; broadcast l[4*cnt+0]
534cabdff1aSopenharmony_ci    paddw                   m0, m5                  ; l + a[x] - a[-1]
535cabdff1aSopenharmony_ci    paddw                   m1, m5
536cabdff1aSopenharmony_ci    paddw                   m2, m5
537cabdff1aSopenharmony_ci    paddw                   m3, m5
538cabdff1aSopenharmony_ci    pmaxsw                  m0, m6                  ; clamp below at 0
539cabdff1aSopenharmony_ci    pmaxsw                  m1, m6
540cabdff1aSopenharmony_ci    pmaxsw                  m2, m6
541cabdff1aSopenharmony_ci    pmaxsw                  m3, m6
542cabdff1aSopenharmony_ci    pminsw                  m0, m4                  ; clamp above at the bound in m4
543cabdff1aSopenharmony_ci    pminsw                  m1, m4
544cabdff1aSopenharmony_ci    pminsw                  m2, m4
545cabdff1aSopenharmony_ci    pminsw                  m3, m4
546cabdff1aSopenharmony_ci    mova      [dstq+strideq*0], m0
547cabdff1aSopenharmony_ci    mova      [dstq+strideq*1], m1
548cabdff1aSopenharmony_ci    mova      [dstq+strideq*2], m2
549cabdff1aSopenharmony_ci    mova      [dstq+stride3q ], m3
550cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*4]
551cabdff1aSopenharmony_ci    dec                   cntd
552cabdff1aSopenharmony_ci    jge .loop                                       ; continue while cnt >= 0
553cabdff1aSopenharmony_ci    RET
554cabdff1aSopenharmony_ci
; vp9_ipred_tm_8x8_12(dst, stride, l, a)
;
; Same predictor as vp9_ipred_tm_8x8_10, but with pw_4095 as the upper clamp
; bound. It only loads that constant into m4 and then jumps into the .body
; label of the _10 function, so the whole computation and the epilogue are
; shared. The cglobal register counts here mirror the _10 declaration;
; presumably they must stay in sync because the _10 epilogue is reused.
555cabdff1aSopenharmony_cicglobal vp9_ipred_tm_8x8_12, 4, 5, 7, dst, stride, l, a
556cabdff1aSopenharmony_ci    mova                    m4, [pw_4095]           ; m4 = upper clamp bound
557cabdff1aSopenharmony_ci    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_8x8_10 %+ SUFFIX).body
558cabdff1aSopenharmony_ci
; vp9_ipred_tm_16x16_10(dst, stride, l, a)
;
; 16x16 version of the TM predictor: every output pixel is
;     clip(l[k] + a[x] - a[-1], 0, max)
; with max loaded from pw_1023 (defined outside this section). Each row is
; two XMM registers wide (16 words). Left pixels are consumed from the
; highest index downwards: the first row written uses l[15], the last l[0].
;
; The .body label is also entered from the _12 variant with a different
; clamp constant preloaded in m7.
559cabdff1aSopenharmony_ciINIT_XMM sse2
560cabdff1aSopenharmony_cicglobal vp9_ipred_tm_16x16_10, 4, 4, 8, dst, stride, l, a
561cabdff1aSopenharmony_ci    mova                    m7, [pw_1023]           ; m7 = upper clamp bound
562cabdff1aSopenharmony_ci.body:
563cabdff1aSopenharmony_ci    pxor                    m6, m6                  ; m6 = 0 (lower clamp bound)
564cabdff1aSopenharmony_ci    mova                    m4, [aq]                ; a[0-7]
565cabdff1aSopenharmony_ci    mova                    m5, [aq+mmsize]         ; a[8-15]
566cabdff1aSopenharmony_ci    movd                    m0, [aq-4]              ; a[-2], a[-1]
567cabdff1aSopenharmony_ci    pshuflw                 m0, m0, q1111           ; a[-1] in the low four words
568cabdff1aSopenharmony_ci    punpcklqdq              m0, m0                  ; a[-1] in all eight words
569cabdff1aSopenharmony_ci    psubw                   m4, m0                  ; m4 = a[0-7]  - a[-1]
570cabdff1aSopenharmony_ci    psubw                   m5, m0                  ; m5 = a[8-15] - a[-1]
; a is fully consumed; its register is reused as the loop counter.
571cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, l, cnt
572cabdff1aSopenharmony_ci    mov                   cntd, 7                   ; eight passes (cnt = 7..0), two rows each
573cabdff1aSopenharmony_ci.loop:
574cabdff1aSopenharmony_ci    movd                    m3, [lq+cntq*4]         ; l[2*cnt], l[2*cnt+1]
575cabdff1aSopenharmony_ci    punpcklwd               m3, m3                  ; duplicate each word into a dword
576cabdff1aSopenharmony_ci    pshufd                  m2, m3, q1111           ; broadcast l[2*cnt+1] (first row of the pair)
577cabdff1aSopenharmony_ci    pshufd                  m3, m3, q0000           ; broadcast l[2*cnt]   (second row of the pair)
578cabdff1aSopenharmony_ci    paddw                   m0, m2, m4              ; first row, columns 0-7
579cabdff1aSopenharmony_ci    paddw                   m2, m5                  ; first row, columns 8-15
580cabdff1aSopenharmony_ci    paddw                   m1, m3, m4              ; second row, columns 0-7
581cabdff1aSopenharmony_ci    paddw                   m3, m5                  ; second row, columns 8-15
582cabdff1aSopenharmony_ci    pmaxsw                  m0, m6                  ; clamp below at 0
583cabdff1aSopenharmony_ci    pmaxsw                  m2, m6
584cabdff1aSopenharmony_ci    pmaxsw                  m1, m6
585cabdff1aSopenharmony_ci    pmaxsw                  m3, m6
586cabdff1aSopenharmony_ci    pminsw                  m0, m7                  ; clamp above at the bound in m7
587cabdff1aSopenharmony_ci    pminsw                  m2, m7
588cabdff1aSopenharmony_ci    pminsw                  m1, m7
589cabdff1aSopenharmony_ci    pminsw                  m3, m7
590cabdff1aSopenharmony_ci    mova   [dstq+strideq*0+ 0], m0
591cabdff1aSopenharmony_ci    mova   [dstq+strideq*0+16], m2
592cabdff1aSopenharmony_ci    mova   [dstq+strideq*1+ 0], m1
593cabdff1aSopenharmony_ci    mova   [dstq+strideq*1+16], m3
594cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*2]
595cabdff1aSopenharmony_ci    dec                   cntd
596cabdff1aSopenharmony_ci    jge .loop                                       ; continue while cnt >= 0
597cabdff1aSopenharmony_ci    RET
598cabdff1aSopenharmony_ci
; vp9_ipred_tm_16x16_12(dst, stride, l, a)
;
; Same predictor as vp9_ipred_tm_16x16_10, but with pw_4095 as the upper
; clamp bound: loads the constant into m7 and jumps into the shared .body of
; the _10 function. The cglobal register counts mirror the _10 declaration;
; presumably they must stay in sync because the _10 epilogue is reused.
599cabdff1aSopenharmony_cicglobal vp9_ipred_tm_16x16_12, 4, 4, 8, dst, stride, l, a
600cabdff1aSopenharmony_ci    mova                    m7, [pw_4095]           ; m7 = upper clamp bound
601cabdff1aSopenharmony_ci    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_16x16_10 %+ SUFFIX).body
602cabdff1aSopenharmony_ci
; vp9_ipred_tm_32x32_10(dst, stride, l, a)
;
; 32x32 version of the TM predictor: every output pixel is
;     clip(l[k] + a[x] - a[-1], 0, max)
; with max loaded from pw_1023 (defined outside this section). One row
; (32 words, four XMM registers) is produced per loop iteration. Left
; pixels are consumed from l[31] down to l[0].
;
; m4-m7 hold the four (a - a[-1]) vectors and m0-m3 are needed as per-row
; temporaries, so the two clamp bounds live elsewhere:
;   - x86-64: in m8/m9 (SWAP is an x86inc register renaming; presumably no
;     instructions are emitted for it),
;   - x86-32: in 32 bytes of stack requested through the cglobal stack-size
;     argument (32 * -ARCH_X86_32 evaluates to 0 on x86-64).
;
; The .body label is also entered from the _12 variant with a different
; clamp constant preloaded in m0.
603cabdff1aSopenharmony_ciINIT_XMM sse2
604cabdff1aSopenharmony_cicglobal vp9_ipred_tm_32x32_10, 4, 4, 10, 32 * -ARCH_X86_32, dst, stride, l, a
605cabdff1aSopenharmony_ci    mova                    m0, [pw_1023]           ; upper clamp bound
606cabdff1aSopenharmony_ci.body:
607cabdff1aSopenharmony_ci    pxor                    m1, m1                  ; 0 (lower clamp bound)
608cabdff1aSopenharmony_ci%if ARCH_X86_64
; Move the bounds out of the way: after the swaps they are addressed as
; m8 (max) and m9 (zero), and the names m0/m1 are free for reuse.
609cabdff1aSopenharmony_ci    SWAP                     0, 8
610cabdff1aSopenharmony_ci    SWAP                     1, 9
611cabdff1aSopenharmony_ci%define reg_min m9
612cabdff1aSopenharmony_ci%define reg_max m8
613cabdff1aSopenharmony_ci%else
; No m8/m9 here: spill the bounds and use them as memory operands.
614cabdff1aSopenharmony_ci    mova              [rsp+ 0], m0
615cabdff1aSopenharmony_ci    mova              [rsp+16], m1
616cabdff1aSopenharmony_ci%define reg_min [rsp+16]
617cabdff1aSopenharmony_ci%define reg_max [rsp+ 0]
618cabdff1aSopenharmony_ci%endif
619cabdff1aSopenharmony_ci
620cabdff1aSopenharmony_ci    mova                    m4, [aq+mmsize*0]       ; a[0-7]
621cabdff1aSopenharmony_ci    mova                    m5, [aq+mmsize*1]       ; a[8-15]
622cabdff1aSopenharmony_ci    mova                    m6, [aq+mmsize*2]       ; a[16-23]
623cabdff1aSopenharmony_ci    mova                    m7, [aq+mmsize*3]       ; a[24-31]
624cabdff1aSopenharmony_ci    movd                    m0, [aq-4]              ; a[-2], a[-1]
625cabdff1aSopenharmony_ci    pshuflw                 m0, m0, q1111           ; a[-1] in the low four words
626cabdff1aSopenharmony_ci    punpcklqdq              m0, m0                  ; a[-1] in all eight words
627cabdff1aSopenharmony_ci    psubw                   m4, m0                  ; m4-m7 = a[x] - a[-1]
628cabdff1aSopenharmony_ci    psubw                   m5, m0
629cabdff1aSopenharmony_ci    psubw                   m6, m0
630cabdff1aSopenharmony_ci    psubw                   m7, m0
; a is fully consumed; its register is reused as the loop counter.
631cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, l, cnt
632cabdff1aSopenharmony_ci    mov                   cntd, 31                  ; 32 passes (cnt = 31..0), one row each
633cabdff1aSopenharmony_ci.loop:
634cabdff1aSopenharmony_ci    pinsrw                  m3, [lq+cntq*2], 0      ; l[cnt] into word 0 (other words are don't-care)
635cabdff1aSopenharmony_ci    punpcklwd               m3, m3                  ; l[cnt] fills dword 0
636cabdff1aSopenharmony_ci    pshufd                  m3, m3, q0000           ; broadcast l[cnt] to all eight words
637cabdff1aSopenharmony_ci    paddw                   m0, m3, m4              ; columns 0-7
638cabdff1aSopenharmony_ci    paddw                   m1, m3, m5              ; columns 8-15
639cabdff1aSopenharmony_ci    paddw                   m2, m3, m6              ; columns 16-23
640cabdff1aSopenharmony_ci    paddw                   m3, m7                  ; columns 24-31
641cabdff1aSopenharmony_ci    pmaxsw                  m0, reg_min             ; clamp below at 0
642cabdff1aSopenharmony_ci    pmaxsw                  m1, reg_min
643cabdff1aSopenharmony_ci    pmaxsw                  m2, reg_min
644cabdff1aSopenharmony_ci    pmaxsw                  m3, reg_min
645cabdff1aSopenharmony_ci    pminsw                  m0, reg_max             ; clamp above at the upper bound
646cabdff1aSopenharmony_ci    pminsw                  m1, reg_max
647cabdff1aSopenharmony_ci    pminsw                  m2, reg_max
648cabdff1aSopenharmony_ci    pminsw                  m3, reg_max
649cabdff1aSopenharmony_ci    mova   [dstq+strideq*0+ 0], m0
650cabdff1aSopenharmony_ci    mova   [dstq+strideq*0+16], m1
651cabdff1aSopenharmony_ci    mova   [dstq+strideq*0+32], m2
652cabdff1aSopenharmony_ci    mova   [dstq+strideq*0+48], m3
653cabdff1aSopenharmony_ci    add                   dstq, strideq
654cabdff1aSopenharmony_ci    dec                   cntd
655cabdff1aSopenharmony_ci    jge .loop                                       ; continue while cnt >= 0
656cabdff1aSopenharmony_ci    RET
657cabdff1aSopenharmony_ci
; vp9_ipred_tm_32x32_12(dst, stride, l, a)
;
; Same predictor as vp9_ipred_tm_32x32_10, but with pw_4095 as the upper
; clamp bound: loads the constant into m0 and jumps into the shared .body of
; the _10 function (the jump lands before that function's register swaps /
; stack spills, so m0 is the expected location). The cglobal register and
; stack arguments mirror the _10 declaration; presumably they must stay in
; sync because the _10 epilogue is reused.
658cabdff1aSopenharmony_cicglobal vp9_ipred_tm_32x32_12, 4, 4, 10, 32 * -ARCH_X86_32, dst, stride, l, a
659cabdff1aSopenharmony_ci    mova                    m0, [pw_4095]           ; upper clamp bound
660cabdff1aSopenharmony_ci    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_32x32_10 %+ SUFFIX).body
661cabdff1aSopenharmony_ci
662cabdff1aSopenharmony_ci; Directional intra prediction functions
663cabdff1aSopenharmony_ci;
664cabdff1aSopenharmony_ci; in the functions below, 'abcdefgh' refers to above data (sometimes simply
665cabdff1aSopenharmony_ci; abbreviated as a[N-M]). 'stuvwxyz' refers to left data (sometimes simply
666cabdff1aSopenharmony_ci; abbreviated as l[N-M]). * is top-left data. ABCDEFG or A[N-M] is filtered
667cabdff1aSopenharmony_ci; above data, STUVWXYZ or L[N-M] is filtered left data, and # is filtered
668cabdff1aSopenharmony_ci; top-left data.
669cabdff1aSopenharmony_ci
; LOWPASS left, center, right
;
; Three-tap smoothing filter on packed words. All three arguments are
; register NUMBERS (the macro prepends "m"); the result overwrites the
; register given as `left`, the other two are only read.
;
; Implemented as pavgw((left + right) >> 1, center), i.e.
;     (((left + right) >> 1) + center + 1) >> 1
; which yields the same value as the formula below as long as left + right
; stays non-negative and within signed 16-bit range (the shift is
; arithmetic).
670cabdff1aSopenharmony_ci; left=(left+2*center+right+2)>>2
671cabdff1aSopenharmony_ci%macro LOWPASS 3 ; left [dst], center, right
672cabdff1aSopenharmony_ci    paddw                  m%1, m%3
673cabdff1aSopenharmony_ci    psraw                  m%1, 1
674cabdff1aSopenharmony_ci    pavgw                  m%1, m%2
675cabdff1aSopenharmony_ci%endmacro
676cabdff1aSopenharmony_ci
; SHIFT_RIGHT dst, src [, shuffle_mask]
;
; Drops the first word of src and repeats the last word to fill the gap.
; Arguments are full register names (e.g. m1), unlike LOWPASS.
; The optional third argument is only used on SSSE3 and later: it is the
; pshufb control operand and defaults to the memory constant
; pb_2to15_14_15 (bytes 2..15 followed by 14,15 again). Callers that keep
; that constant in a register pass the register instead.
; The pre-SSSE3 path emulates the shuffle with a byte shift plus pshufhw.
677cabdff1aSopenharmony_ci; abcdefgh (src) -> bcdefghh (dst)
678cabdff1aSopenharmony_ci; dst/src can be the same register
679cabdff1aSopenharmony_ci%macro SHIFT_RIGHT 2-3 [pb_2to15_14_15] ; dst, src, [ssse3_shift_reg]
680cabdff1aSopenharmony_ci%if cpuflag(ssse3)
681cabdff1aSopenharmony_ci    pshufb                  %1, %2, %3              ; abcdefgh -> bcdefghh
682cabdff1aSopenharmony_ci%else
683cabdff1aSopenharmony_ci    psrldq                  %1, %2, 2               ; abcdefgh -> bcdefgh.
684cabdff1aSopenharmony_ci    pshufhw                 %1, %1, q2210           ; bcdefgh. -> bcdefghh
685cabdff1aSopenharmony_ci%endif
686cabdff1aSopenharmony_ci%endmacro
687cabdff1aSopenharmony_ci
; SHIFT_RIGHTx2 dst1, dst2, src [, shuffle_mask]
;
; Produces src shifted by one word (dst1) and by two words (dst2), each
; time repeating the last word to fill the vacated positions.
; Arguments are full register names. The optional fourth argument is the
; pshufb control operand for SSSE3 and later (default: pb_2to15_14_15).
;
; Register aliasing: on the pre-SSSE3 path src is read again after dst1 has
; been written, so dst1 must not be the same register as src there.
688cabdff1aSopenharmony_ci; abcdefgh (src) -> bcdefghh (dst1) and cdefghhh (dst2)
689cabdff1aSopenharmony_ci%macro SHIFT_RIGHTx2 3-4 [pb_2to15_14_15] ; dst1, dst2, src, [ssse3_shift_reg]
690cabdff1aSopenharmony_ci%if cpuflag(ssse3)
691cabdff1aSopenharmony_ci    pshufb                  %1, %3, %4              ; abcdefgh -> bcdefghh
692cabdff1aSopenharmony_ci    pshufb                  %2, %1, %4              ; bcdefghh -> cdefghhh
693cabdff1aSopenharmony_ci%else
694cabdff1aSopenharmony_ci    psrldq                  %1, %3, 2               ; abcdefgh -> bcdefgh.
695cabdff1aSopenharmony_ci    psrldq                  %2, %3, 4               ; abcdefgh -> cdefgh..
696cabdff1aSopenharmony_ci    pshufhw                 %1, %1, q2210           ; bcdefgh. -> bcdefghh
697cabdff1aSopenharmony_ci    pshufhw                 %2, %2, q1110           ; cdefgh.. -> cdefghhh
698cabdff1aSopenharmony_ci%endif
699cabdff1aSopenharmony_ci%endmacro
700cabdff1aSopenharmony_ci
701cabdff1aSopenharmony_ci%macro DL_FUNCS 0
; vp9_ipred_dl_4x4_16(dst, stride, l, a)
;
; 4x4 directional predictor built only from the above row. Eight above
; pixels are low-pass filtered once; each output row is that filtered
; vector advanced by one more pixel:
;     row 0 = BCDE, row 1 = CDEF, row 2 = DEFG, row 3 = EFGh
; (upper case = filtered, lower case h = the unfiltered last pixel).
;
; cglobal is asked to load only the first two arguments; a is fetched
; explicitly with movifnidn and l is never read. The above row is read with
; an unaligned load. Each row is written as 8 bytes (movh).
702cabdff1aSopenharmony_cicglobal vp9_ipred_dl_4x4_16, 2, 4, 3, dst, stride, l, a
703cabdff1aSopenharmony_ci    movifnidn               aq, amp
704cabdff1aSopenharmony_ci    movu                    m1, [aq]                ; abcdefgh
705cabdff1aSopenharmony_ci    pshufhw                 m0, m1, q3310           ; abcdefhh
706cabdff1aSopenharmony_ci    SHIFT_RIGHT             m1, m1                  ; bcdefghh
707cabdff1aSopenharmony_ci    psrldq                  m2, m1, 2               ; cdefghh.
708cabdff1aSopenharmony_ci    LOWPASS                  0,  1,  2              ; BCDEFGh.
709cabdff1aSopenharmony_ci    pshufd                  m1, m0, q3321           ; DEFGh...
; Rows 0 and 2 first, then step dst one row down and emit rows 1 and 3
; from the same registers shifted by one pixel.
710cabdff1aSopenharmony_ci    movh      [dstq+strideq*0], m0
711cabdff1aSopenharmony_ci    movh      [dstq+strideq*2], m1
712cabdff1aSopenharmony_ci    add                   dstq, strideq
713cabdff1aSopenharmony_ci    psrldq                  m0, 2                   ; CDEFGh..
714cabdff1aSopenharmony_ci    psrldq                  m1, 2                   ; EFGh....
715cabdff1aSopenharmony_ci    movh      [dstq+strideq*0], m0
716cabdff1aSopenharmony_ci    movh      [dstq+strideq*2], m1
717cabdff1aSopenharmony_ci    RET
718cabdff1aSopenharmony_ci
; vp9_ipred_dl_8x8_16(dst, stride, l, a)
;
; 8x8 directional predictor built only from the above row. The eight above
; pixels are low-pass filtered into BCDEFGHh; row k is that vector advanced
; by k pixels, with the last pixel h repeated to fill the tail (row 7 is
; all h).
;
; cglobal is asked to load only the first two arguments; a is fetched with
; movifnidn and l is never read. On SSSE3 and later m4 caches the shuffle
; mask used by the SHIFT_RIGHT macros; on SSE2 m4 is passed but unused.
719cabdff1aSopenharmony_cicglobal vp9_ipred_dl_8x8_16, 2, 4, 5, dst, stride, l, a
720cabdff1aSopenharmony_ci    movifnidn               aq, amp
721cabdff1aSopenharmony_ci    mova                    m0, [aq]                ; abcdefgh
722cabdff1aSopenharmony_ci%if cpuflag(ssse3)
723cabdff1aSopenharmony_ci    mova                    m4, [pb_2to15_14_15]
724cabdff1aSopenharmony_ci%endif
725cabdff1aSopenharmony_ci    SHIFT_RIGHTx2           m1, m2, m0, m4          ; bcdefghh/cdefghhh
726cabdff1aSopenharmony_ci    LOWPASS                  0,  1,  2              ; BCDEFGHh
727cabdff1aSopenharmony_ci    shufps                  m1, m0, m2, q3332       ; FGHhhhhh
728cabdff1aSopenharmony_ci    shufps                  m3, m0, m1, q2121       ; DEFGHhhh
; The third register is renamed to stride5 (5 * stride) so that rows k and
; k+4 can be addressed from the same dst.
729cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, stride5
730cabdff1aSopenharmony_ci    lea               stride5q, [strideq*5]
731cabdff1aSopenharmony_ci
; Rows 0/4 and 1/5 relative to the original dst ...
732cabdff1aSopenharmony_ci    mova      [dstq+strideq*0], m0
733cabdff1aSopenharmony_ci    mova      [dstq+strideq*4], m1
734cabdff1aSopenharmony_ci    SHIFT_RIGHT             m0, m0, m4              ; CDEFGHhh
735cabdff1aSopenharmony_ci    pshuflw                 m1, m1, q3321           ; GHhhhhhh
736cabdff1aSopenharmony_ci    pshufd                  m2, m0, q3321           ; EFGHhhhh
737cabdff1aSopenharmony_ci    mova      [dstq+strideq*1], m0
738cabdff1aSopenharmony_ci    mova      [dstq+stride5q ], m1
; ... then advance dst by two rows and emit rows 2/6 and 3/7.
739cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*2]
740cabdff1aSopenharmony_ci    pshuflw                 m1, m1, q3321           ; Hhhhhhhh
741cabdff1aSopenharmony_ci    mova      [dstq+strideq*0], m3
742cabdff1aSopenharmony_ci    mova      [dstq+strideq*4], m1
743cabdff1aSopenharmony_ci    pshuflw                 m1, m1, q3321           ; hhhhhhhh
744cabdff1aSopenharmony_ci    mova      [dstq+strideq*1], m2
745cabdff1aSopenharmony_ci    mova      [dstq+stride5q ], m1
746cabdff1aSopenharmony_ci    RET
747cabdff1aSopenharmony_ci
; vp9_ipred_dl_16x16_16(dst, stride, l, a)
;
; 16x16 directional predictor built only from the above row. The 16 above
; pixels are low-pass filtered into m0:m1 (BCDEFGHI / JKLMNOPp) and m2 is
; filled with the last pixel p. Row k is the filtered sequence advanced by
; k pixels, padded with p.
;
; The loop runs 8 times and writes two rows per pass (k and k+8): row k+8
; is row k advanced by exactly one register, so it reuses m1 followed by
; the constant m2.
;
; cglobal is asked to load only the first two arguments; a is fetched with
; movifnidn and l is never read. m4 serves as the scratch operand handed to
; PALIGNR and, on SSSE3 and later, as the cached SHIFT_RIGHT shuffle mask
; (presumably PALIGNR only needs its scratch register on SSE2 - confirm
; against the macro definition in x86util.asm).
748cabdff1aSopenharmony_cicglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
749cabdff1aSopenharmony_ci    movifnidn               aq, amp
750cabdff1aSopenharmony_ci    mova                    m0, [aq]                ; abcdefgh
751cabdff1aSopenharmony_ci    mova                    m3, [aq+mmsize]         ; ijklmnop
752cabdff1aSopenharmony_ci    PALIGNR                 m1, m3, m0, 2, m4       ; bcdefghi
753cabdff1aSopenharmony_ci    PALIGNR                 m2, m3, m0, 4, m4       ; cdefghij
754cabdff1aSopenharmony_ci    LOWPASS                  0,  1,  2              ; BCDEFGHI
755cabdff1aSopenharmony_ci%if cpuflag(ssse3)
756cabdff1aSopenharmony_ci    mova                    m4, [pb_2to15_14_15]
757cabdff1aSopenharmony_ci%endif
758cabdff1aSopenharmony_ci    SHIFT_RIGHTx2           m2, m1, m3, m4          ; jklmnopp/klmnoppp
759cabdff1aSopenharmony_ci    LOWPASS                  1,  2,  3              ; JKLMNOPp
760cabdff1aSopenharmony_ci    pshufd                  m2, m2, q3333           ; pppppppp
; The third register becomes the loop counter.
761cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, cnt
762cabdff1aSopenharmony_ci    mov                   cntd, 8
763cabdff1aSopenharmony_ci
764cabdff1aSopenharmony_ci.loop:
765cabdff1aSopenharmony_ci    mova   [dstq+strideq*0+ 0], m0                  ; row k,   columns 0-7
766cabdff1aSopenharmony_ci    mova   [dstq+strideq*0+16], m1                  ; row k,   columns 8-15
767cabdff1aSopenharmony_ci    mova   [dstq+strideq*8+ 0], m1                  ; row k+8, columns 0-7
768cabdff1aSopenharmony_ci    mova   [dstq+strideq*8+16], m2                  ; row k+8, columns 8-15 (all p)
769cabdff1aSopenharmony_ci    add                   dstq, strideq
; Advance the m0:m1 pair by one pixel for the next row.
770cabdff1aSopenharmony_ci%if cpuflag(avx)
771cabdff1aSopenharmony_ci    vpalignr                m0, m1, m0, 2
772cabdff1aSopenharmony_ci%else
773cabdff1aSopenharmony_ci    PALIGNR                 m3, m1, m0, 2, m4
774cabdff1aSopenharmony_ci    mova                    m0, m3
775cabdff1aSopenharmony_ci%endif
776cabdff1aSopenharmony_ci    SHIFT_RIGHT             m1, m1, m4              ; shift in another p at the end
777cabdff1aSopenharmony_ci    dec                   cntd
778cabdff1aSopenharmony_ci    jg .loop
779cabdff1aSopenharmony_ci    RET
780cabdff1aSopenharmony_ci
; vp9_ipred_dl_32x32_16(dst, stride, l, a)
;
; 32x32 directional predictor built only from the above row. The 32 above
; pixels are low-pass filtered into m0-m3 and m4 is filled with the last
; pixel ("5" in the letter scheme used in the comments). Row k is the
; filtered sequence advanced by k pixels, padded with that last pixel.
;
; The loop runs 8 times and writes four rows per pass (k, k+8, k+16, k+24):
; a row 8 further down is the same data advanced by one whole register, so
; each group of stores just starts one register later and pads with m4.
;
; cglobal is asked to load only the first two arguments; a is fetched with
; movifnidn and l is never read. m6 serves as the scratch operand handed to
; PALIGNR and, on SSSE3 and later, as the cached SHIFT_RIGHT shuffle mask
; (presumably PALIGNR only needs its scratch register on SSE2 - confirm
; against the macro definition in x86util.asm).
781cabdff1aSopenharmony_cicglobal vp9_ipred_dl_32x32_16, 2, 5, 7, dst, stride, l, a
782cabdff1aSopenharmony_ci    movifnidn               aq, amp
783cabdff1aSopenharmony_ci    mova                    m0, [aq+mmsize*0]       ; abcdefgh
784cabdff1aSopenharmony_ci    mova                    m1, [aq+mmsize*1]       ; ijklmnop
785cabdff1aSopenharmony_ci    mova                    m2, [aq+mmsize*2]       ; qrstuvwx
786cabdff1aSopenharmony_ci    mova                    m3, [aq+mmsize*3]       ; yz012345
787cabdff1aSopenharmony_ci    PALIGNR                 m4, m1, m0, 2, m6
788cabdff1aSopenharmony_ci    PALIGNR                 m5, m1, m0, 4, m6
789cabdff1aSopenharmony_ci    LOWPASS                  0,  4,  5              ; BCDEFGHI
790cabdff1aSopenharmony_ci    PALIGNR                 m4, m2, m1, 2, m6
791cabdff1aSopenharmony_ci    PALIGNR                 m5, m2, m1, 4, m6
792cabdff1aSopenharmony_ci    LOWPASS                  1,  4,  5              ; JKLMNOPQ
793cabdff1aSopenharmony_ci    PALIGNR                 m4, m3, m2, 2, m6
794cabdff1aSopenharmony_ci    PALIGNR                 m5, m3, m2, 4, m6
795cabdff1aSopenharmony_ci    LOWPASS                  2,  4,  5              ; RSTUVWXY
796cabdff1aSopenharmony_ci%if cpuflag(ssse3)
797cabdff1aSopenharmony_ci    mova                    m6, [pb_2to15_14_15]
798cabdff1aSopenharmony_ci%endif
799cabdff1aSopenharmony_ci    SHIFT_RIGHTx2           m4, m5, m3, m6
800cabdff1aSopenharmony_ci    LOWPASS                  3,  4,  5              ; Z0123455
801cabdff1aSopenharmony_ci    pshufd                  m4, m4, q3333           ; 55555555
; Registers beyond dst/stride are renamed: 8*stride, 24*stride and the
; loop counter.
802cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, stride8, stride24, cnt
803cabdff1aSopenharmony_ci    mov                   cntd, 8
804cabdff1aSopenharmony_ci    lea               stride8q, [strideq*8]
805cabdff1aSopenharmony_ci    lea              stride24q, [stride8q*3]
806cabdff1aSopenharmony_ci
807cabdff1aSopenharmony_ci.loop:
; row k
808cabdff1aSopenharmony_ci    mova  [dstq+stride8q*0+ 0], m0
809cabdff1aSopenharmony_ci    mova  [dstq+stride8q*0+16], m1
810cabdff1aSopenharmony_ci    mova  [dstq+stride8q*0+32], m2
811cabdff1aSopenharmony_ci    mova  [dstq+stride8q*0+48], m3
; row k+8
812cabdff1aSopenharmony_ci    mova  [dstq+stride8q*1+ 0], m1
813cabdff1aSopenharmony_ci    mova  [dstq+stride8q*1+16], m2
814cabdff1aSopenharmony_ci    mova  [dstq+stride8q*1+32], m3
815cabdff1aSopenharmony_ci    mova  [dstq+stride8q*1+48], m4
; row k+16
816cabdff1aSopenharmony_ci    mova  [dstq+stride8q*2+ 0], m2
817cabdff1aSopenharmony_ci    mova  [dstq+stride8q*2+16], m3
818cabdff1aSopenharmony_ci    mova  [dstq+stride8q*2+32], m4
819cabdff1aSopenharmony_ci    mova  [dstq+stride8q*2+48], m4
; row k+24
820cabdff1aSopenharmony_ci    mova  [dstq+stride24q + 0], m3
821cabdff1aSopenharmony_ci    mova  [dstq+stride24q +16], m4
822cabdff1aSopenharmony_ci    mova  [dstq+stride24q +32], m4
823cabdff1aSopenharmony_ci    mova  [dstq+stride24q +48], m4
824cabdff1aSopenharmony_ci    add                   dstq, strideq
; Advance the m0:m1:m2:m3 chain by one pixel for the next row.
825cabdff1aSopenharmony_ci%if cpuflag(avx)
826cabdff1aSopenharmony_ci    vpalignr                m0, m1, m0, 2
827cabdff1aSopenharmony_ci    vpalignr                m1, m2, m1, 2
828cabdff1aSopenharmony_ci    vpalignr                m2, m3, m2, 2
829cabdff1aSopenharmony_ci%else
830cabdff1aSopenharmony_ci    PALIGNR                 m5, m1, m0, 2, m6
831cabdff1aSopenharmony_ci    mova                    m0, m5
832cabdff1aSopenharmony_ci    PALIGNR                 m5, m2, m1, 2, m6
833cabdff1aSopenharmony_ci    mova                    m1, m5
834cabdff1aSopenharmony_ci    PALIGNR                 m5, m3, m2, 2, m6
835cabdff1aSopenharmony_ci    mova                    m2, m5
836cabdff1aSopenharmony_ci%endif
837cabdff1aSopenharmony_ci    SHIFT_RIGHT             m3, m3, m6              ; shift in another copy of the last pixel
838cabdff1aSopenharmony_ci    dec                   cntd
839cabdff1aSopenharmony_ci    jg .loop
840cabdff1aSopenharmony_ci    RET
841cabdff1aSopenharmony_ci%endmacro
842cabdff1aSopenharmony_ci
; Expand the DL_FUNCS predictor set three times, once each for SSE2, SSSE3
; and AVX. The cpuflag() tests inside the macro bodies pick the matching
; code paths; presumably INIT_XMM also selects the name suffix that cglobal
; appends, so the three expansions do not collide.
843cabdff1aSopenharmony_ciINIT_XMM sse2
844cabdff1aSopenharmony_ciDL_FUNCS
845cabdff1aSopenharmony_ciINIT_XMM ssse3
846cabdff1aSopenharmony_ciDL_FUNCS
847cabdff1aSopenharmony_ciINIT_XMM avx
848cabdff1aSopenharmony_ciDL_FUNCS
849cabdff1aSopenharmony_ci
850cabdff1aSopenharmony_ci%if HAVE_AVX2_EXTERNAL
; vp9_ipred_dl_16x16_16(dst, stride, l, a), AVX2 version
;
; Same output as the XMM dl_16x16 predictor, but one YMM register holds a
; complete 16-pixel row. m0 holds the filtered row and m2 the data that
; follows it (upper half of m0 followed by the replicated last pixel p), so
; vpalignr on the m2:m0 pair, which works per 128-bit lane, yields the row
; advanced by 1..7 pixels.
;
; The loop runs twice and writes eight rows per pass; between the passes m0
; is replaced by m2 and m2 becomes all p.
;
; cglobal is asked to load only the first two arguments; a is fetched with
; movifnidn and l is never read.
851cabdff1aSopenharmony_ciINIT_YMM avx2
852cabdff1aSopenharmony_cicglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
853cabdff1aSopenharmony_ci    movifnidn               aq, amp
854cabdff1aSopenharmony_ci    mova                    m0, [aq]                   ; abcdefghijklmnop
855cabdff1aSopenharmony_ci    vpbroadcastw           xm1, [aq+30]                ; pppppppp
856cabdff1aSopenharmony_ci    vperm2i128              m2, m0, m1, q0201          ; ijklmnoppppppppp
857cabdff1aSopenharmony_ci    vpalignr                m3, m2, m0, 2              ; bcdefghijklmnopp
858cabdff1aSopenharmony_ci    vpalignr                m4, m2, m0, 4              ; cdefghijklmnoppp
859cabdff1aSopenharmony_ci    LOWPASS                  0,  3,  4                 ; BCDEFGHIJKLMNOPp
860cabdff1aSopenharmony_ci    vperm2i128              m2, m0, m1, q0201          ; JKLMNOPppppppppp
; Registers beyond dst/stride are renamed: 3*stride and the loop counter.
861cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, stride3, cnt
862cabdff1aSopenharmony_ci    mov                   cntd, 2
863cabdff1aSopenharmony_ci    lea               stride3q, [strideq*3]
864cabdff1aSopenharmony_ci
865cabdff1aSopenharmony_ci.loop:
; Rows 0-3 of this pass: the row advanced by 0, 1, 2, 3 pixels.
866cabdff1aSopenharmony_ci    mova      [dstq+strideq*0], m0
867cabdff1aSopenharmony_ci    vpalignr                m3, m2, m0, 2
868cabdff1aSopenharmony_ci    vpalignr                m4, m2, m0, 4
869cabdff1aSopenharmony_ci    mova      [dstq+strideq*1], m3
870cabdff1aSopenharmony_ci    mova      [dstq+strideq*2], m4
871cabdff1aSopenharmony_ci    vpalignr                m3, m2, m0, 6
872cabdff1aSopenharmony_ci    vpalignr                m4, m2, m0, 8
873cabdff1aSopenharmony_ci    mova      [dstq+stride3q ], m3
874cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*4]
; Rows 4-7 of this pass: advanced by 4, 5, 6, 7 pixels.
875cabdff1aSopenharmony_ci    mova      [dstq+strideq*0], m4
876cabdff1aSopenharmony_ci    vpalignr                m3, m2, m0, 10
877cabdff1aSopenharmony_ci    vpalignr                m4, m2, m0, 12
878cabdff1aSopenharmony_ci    mova      [dstq+strideq*1], m3
879cabdff1aSopenharmony_ci    mova      [dstq+strideq*2], m4
880cabdff1aSopenharmony_ci    vpalignr                m3, m2, m0, 14
881cabdff1aSopenharmony_ci    mova      [dstq+stride3q ], m3
882cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*4]
; Next pass starts eight pixels further along.
883cabdff1aSopenharmony_ci    mova                    m0, m2
884cabdff1aSopenharmony_ci    vperm2i128              m2, m2, m2, q0101          ; pppppppppppppppp
885cabdff1aSopenharmony_ci    dec                   cntd
886cabdff1aSopenharmony_ci    jg .loop
887cabdff1aSopenharmony_ci    RET
888cabdff1aSopenharmony_ci
; vp9_ipred_dl_32x32_16(dst, stride, l, a), AVX2 version
;
; Same output as the XMM dl_32x32 predictor, with a 32-pixel row held in
; two YMM registers (m0 = columns 0-15, m1 = columns 16-31). m5 and m2 hold
; the data that follows m0 and m1 respectively, eight pixels further along,
; so the per-lane vpalignr on the m5:m0 and m2:m1 pairs yields the row
; advanced by 1..7 pixels.
;
; The loop runs four times and writes eight rows per pass; at the end of
; each pass all four registers are moved eight pixels along the sequence.
;
; cglobal is asked to load only the first two arguments; a is fetched with
; movifnidn and l is never read.
889cabdff1aSopenharmony_cicglobal vp9_ipred_dl_32x32_16, 2, 6, 7, dst, stride, l, a
890cabdff1aSopenharmony_ci    movifnidn               aq, amp
891cabdff1aSopenharmony_ci    mova                    m0, [aq+mmsize*0+ 0]       ; abcdefghijklmnop
892cabdff1aSopenharmony_ci    mova                    m1, [aq+mmsize*1+ 0]       ; qrstuvwxyz012345
893cabdff1aSopenharmony_ci    vpbroadcastw           xm4, [aq+mmsize*1+30]       ; 55555555
894cabdff1aSopenharmony_ci    vperm2i128              m5, m0, m1, q0201          ; ijklmnopqrstuvwx
895cabdff1aSopenharmony_ci    vpalignr                m2, m5, m0, 2              ; bcdefghijklmnopq
896cabdff1aSopenharmony_ci    vpalignr                m3, m5, m0, 4              ; cdefghijklmnopqr
897cabdff1aSopenharmony_ci    LOWPASS                  0,  2,  3                 ; BCDEFGHIJKLMNOPQ
898cabdff1aSopenharmony_ci    vperm2i128              m5, m1, m4, q0201          ; yz01234555555555
899cabdff1aSopenharmony_ci    vpalignr                m2, m5, m1, 2              ; rstuvwxyz0123455
900cabdff1aSopenharmony_ci    vpalignr                m3, m5, m1, 4              ; stuvwxyz01234555
901cabdff1aSopenharmony_ci    LOWPASS                  1,  2,  3                 ; RSTUVWXYZ......5
902cabdff1aSopenharmony_ci    vperm2i128              m2, m1, m4, q0201          ; Z......555555555
903cabdff1aSopenharmony_ci    vperm2i128              m5, m0, m1, q0201          ; JKLMNOPQRSTUVWXY
; Registers beyond dst/stride are renamed: 3*stride and the loop counter.
904cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, stride3, cnt
905cabdff1aSopenharmony_ci    lea               stride3q, [strideq*3]
906cabdff1aSopenharmony_ci    mov                   cntd, 4
907cabdff1aSopenharmony_ci
908cabdff1aSopenharmony_ci.loop:
; Rows 0-3 of this pass: the row advanced by 0, 1, 2, 3 pixels.
909cabdff1aSopenharmony_ci    mova   [dstq+strideq*0 + 0], m0
910cabdff1aSopenharmony_ci    mova   [dstq+strideq*0 +32], m1
911cabdff1aSopenharmony_ci    vpalignr                 m3, m5, m0, 2
912cabdff1aSopenharmony_ci    vpalignr                 m4, m2, m1, 2
913cabdff1aSopenharmony_ci    mova   [dstq+strideq*1 + 0], m3
914cabdff1aSopenharmony_ci    mova   [dstq+strideq*1 +32], m4
915cabdff1aSopenharmony_ci    vpalignr                 m3, m5, m0, 4
916cabdff1aSopenharmony_ci    vpalignr                 m4, m2, m1, 4
917cabdff1aSopenharmony_ci    mova   [dstq+strideq*2 + 0], m3
918cabdff1aSopenharmony_ci    mova   [dstq+strideq*2 +32], m4
919cabdff1aSopenharmony_ci    vpalignr                 m3, m5, m0, 6
920cabdff1aSopenharmony_ci    vpalignr                 m4, m2, m1, 6
921cabdff1aSopenharmony_ci    mova   [dstq+stride3q*1+ 0], m3
922cabdff1aSopenharmony_ci    mova   [dstq+stride3q*1+32], m4
923cabdff1aSopenharmony_ci    lea                    dstq, [dstq+strideq*4]
; Rows 4-7 of this pass: advanced by 4, 5, 6, 7 pixels.
924cabdff1aSopenharmony_ci    vpalignr                 m3, m5, m0, 8
925cabdff1aSopenharmony_ci    vpalignr                 m4, m2, m1, 8
926cabdff1aSopenharmony_ci    mova   [dstq+strideq*0 + 0], m3
927cabdff1aSopenharmony_ci    mova   [dstq+strideq*0 +32], m4
928cabdff1aSopenharmony_ci    vpalignr                 m3, m5, m0, 10
929cabdff1aSopenharmony_ci    vpalignr                 m4, m2, m1, 10
930cabdff1aSopenharmony_ci    mova   [dstq+strideq*1 + 0], m3
931cabdff1aSopenharmony_ci    mova   [dstq+strideq*1 +32], m4
932cabdff1aSopenharmony_ci    vpalignr                 m3, m5, m0, 12
933cabdff1aSopenharmony_ci    vpalignr                 m4, m2, m1, 12
934cabdff1aSopenharmony_ci    mova   [dstq+strideq*2+ 0], m3
935cabdff1aSopenharmony_ci    mova   [dstq+strideq*2+32], m4
936cabdff1aSopenharmony_ci    vpalignr                 m3, m5, m0, 14
937cabdff1aSopenharmony_ci    vpalignr                 m4, m2, m1, 14
938cabdff1aSopenharmony_ci    mova   [dstq+stride3q+  0], m3
939cabdff1aSopenharmony_ci    mova   [dstq+stride3q+ 32], m4
; Move everything eight pixels along for the next pass: a 16-byte vpalignr
; returns its first source unchanged, so m3/m4 become copies of m5/m2;
; the look-ahead registers m5/m2 are then rebuilt from them.
940cabdff1aSopenharmony_ci    vpalignr                 m3, m5, m0, 16
941cabdff1aSopenharmony_ci    vpalignr                 m4, m2, m1, 16
942cabdff1aSopenharmony_ci    vperm2i128               m5, m3, m4, q0201
943cabdff1aSopenharmony_ci    vperm2i128               m2, m4, m4, q0101
944cabdff1aSopenharmony_ci    mova                     m0, m3
945cabdff1aSopenharmony_ci    mova                     m1, m4
946cabdff1aSopenharmony_ci    lea                    dstq, [dstq+strideq*4]
947cabdff1aSopenharmony_ci    dec                    cntd
948cabdff1aSopenharmony_ci    jg .loop
949cabdff1aSopenharmony_ci    RET
950cabdff1aSopenharmony_ci%endif
951cabdff1aSopenharmony_ci
952cabdff1aSopenharmony_ci%macro DR_FUNCS 1 ; stack_mem_for_32x32_32bit_function
; vp9_ipred_dr_4x4_16(dst, stride, l, a)
;
; 4x4 directional predictor using left, top-left and above pixels. The
; sequence "left (4 words from lq), top-left, above" is assembled into one
; register and low-pass filtered once; the rows are successive one-pixel
; shifts of the result and are written bottom-up:
;     row 3 = XYZ#, row 2 = YZ#A, row 1 = Z#AB, row 0 = #ABC
; (upper case = filtered, # = filtered top-left).
; Each row is written as 8 bytes (movh).
953cabdff1aSopenharmony_cicglobal vp9_ipred_dr_4x4_16, 4, 4, 3, dst, stride, l, a
954cabdff1aSopenharmony_ci    movh                    m0, [lq]                ; wxyz....
955cabdff1aSopenharmony_ci    movhps                  m0, [aq-2]              ; wxyz*abc
956cabdff1aSopenharmony_ci    movd                    m1, [aq+6]              ; d.......
957cabdff1aSopenharmony_ci    PALIGNR                 m1, m0, 2, m2           ; xyz*abcd
958cabdff1aSopenharmony_ci    psrldq                  m2, m1, 2               ; yz*abcd.
959cabdff1aSopenharmony_ci    LOWPASS                  0, 1, 2                ; XYZ#ABC.
; l is fully consumed; its register is renamed to stride3.
960cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, stride3
961cabdff1aSopenharmony_ci    lea               stride3q, [strideq*3]
962cabdff1aSopenharmony_ci
963cabdff1aSopenharmony_ci    movh      [dstq+stride3q ], m0
964cabdff1aSopenharmony_ci    psrldq                  m0, 2                   ; YZ#ABC..
965cabdff1aSopenharmony_ci    movh      [dstq+strideq*2], m0
966cabdff1aSopenharmony_ci    psrldq                  m0, 2                   ; Z#ABC...
967cabdff1aSopenharmony_ci    movh      [dstq+strideq*1], m0
968cabdff1aSopenharmony_ci    psrldq                  m0, 2                   ; #ABC....
969cabdff1aSopenharmony_ci    movh      [dstq+strideq*0], m0
970cabdff1aSopenharmony_ci    RET
971cabdff1aSopenharmony_ci
; vp9_ipred_dr_8x8_16(dst, stride, l, a)
;
; 8x8 directional predictor using left, top-left and above pixels.
; After filtering, m2 = TUVWXYZ# (filtered left + top-left) and
; m3 = ABCDEFG. (filtered above). Conceptually the rows are 8-pixel windows
; sliding over the sequence m2 followed by m3; they are written bottom-up.
;
; Two rows are produced per step, four rows apart:
;   - the lower row (via dst4 = dst + 4*stride) is the current window,
;   - the upper row is the same window advanced by four pixels, i.e. the
;     high half of the window register followed by the low half of m3.
972cabdff1aSopenharmony_cicglobal vp9_ipred_dr_8x8_16, 4, 4, 5, dst, stride, l, a
973cabdff1aSopenharmony_ci    mova                    m0, [lq]                ; stuvwxyz
974cabdff1aSopenharmony_ci    movu                    m1, [aq-2]              ; *abcdefg
975cabdff1aSopenharmony_ci    mova                    m2, [aq]                ; abcdefgh
976cabdff1aSopenharmony_ci    psrldq                  m3, m2, 2               ; bcdefgh.
977cabdff1aSopenharmony_ci    LOWPASS                  3,  2, 1               ; ABCDEFG.
978cabdff1aSopenharmony_ci    PALIGNR                 m1, m0, 2, m4           ; tuvwxyz*
979cabdff1aSopenharmony_ci    PALIGNR                 m2, m1, 2, m4           ; uvwxyz*a
980cabdff1aSopenharmony_ci    LOWPASS                  2,  1, 0               ; TUVWXYZ#
; l and a are fully consumed; their registers are renamed to dst4 and
; stride3.
981cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, dst4, stride3
982cabdff1aSopenharmony_ci    lea               stride3q, [strideq*3]
983cabdff1aSopenharmony_ci    lea                  dst4q, [dstq+strideq*4]
984cabdff1aSopenharmony_ci
; rows 3 and 7
985cabdff1aSopenharmony_ci    movhps [dstq +stride3q +0], m2
986cabdff1aSopenharmony_ci    movh   [dstq+ stride3q +8], m3
987cabdff1aSopenharmony_ci    mova   [dst4q+stride3q +0], m2
; rows 2 and 6: advance the window by one pixel, drop one pixel from m3
988cabdff1aSopenharmony_ci    PALIGNR                 m1, m3, m2, 2, m0
989cabdff1aSopenharmony_ci    psrldq                  m3, 2
990cabdff1aSopenharmony_ci    movhps [dstq +strideq*2+0], m1
991cabdff1aSopenharmony_ci    movh   [dstq+ strideq*2+8], m3
992cabdff1aSopenharmony_ci    mova   [dst4q+strideq*2+0], m1
; rows 1 and 5
993cabdff1aSopenharmony_ci    PALIGNR                 m2, m3, m1, 2, m0
994cabdff1aSopenharmony_ci    psrldq                  m3, 2
995cabdff1aSopenharmony_ci    movhps [dstq +strideq*1+0], m2
996cabdff1aSopenharmony_ci    movh   [dstq+ strideq*1+8], m3
997cabdff1aSopenharmony_ci    mova   [dst4q+strideq*1+0], m2
; rows 0 and 4
998cabdff1aSopenharmony_ci    PALIGNR                 m1, m3, m2, 2, m0
999cabdff1aSopenharmony_ci    psrldq                  m3, 2
1000cabdff1aSopenharmony_ci    movhps [dstq +strideq*0+0], m1
1001cabdff1aSopenharmony_ci    movh   [dstq+ strideq*0+8], m3
1002cabdff1aSopenharmony_ci    mova   [dst4q+strideq*0+0], m1
1003cabdff1aSopenharmony_ci    RET
1004cabdff1aSopenharmony_ci
; vp9_ipred_dr_16x16_16(dst, stride, l, a)
;
; 16x16 directional predictor using left, top-left and above pixels.
; After filtering, the register chain m2, m4, m5, m6 holds
;     LMNOPQRS, TUVWXYZ#, ABCDEFGH, IJKLMNO.
; (filtered left, filtered top-left "#", filtered above). Rows are 16-pixel
; windows sliding over that chain and are written bottom-up.
;
; The loop runs 8 times. Each pass first steps dst8 one row up, then writes
; two rows eight apart: the upper one from m4:m5 and the one eight rows
; below it from m2:m4 (the same window eight pixels earlier). Afterwards
; the whole chain is advanced by one pixel.
1005cabdff1aSopenharmony_cicglobal vp9_ipred_dr_16x16_16, 4, 4, 7, dst, stride, l, a
1006cabdff1aSopenharmony_ci    mova                    m0, [lq]                ; klmnopqr
1007cabdff1aSopenharmony_ci    mova                    m1, [lq+mmsize]         ; stuvwxyz
1008cabdff1aSopenharmony_ci    movu                    m2, [aq-2]              ; *abcdefg
1009cabdff1aSopenharmony_ci    movu                    m3, [aq+mmsize-2]       ; hijklmno
1010cabdff1aSopenharmony_ci    mova                    m4, [aq]                ; abcdefgh
1011cabdff1aSopenharmony_ci    mova                    m5, [aq+mmsize]         ; ijklmnop
1012cabdff1aSopenharmony_ci    psrldq                  m6, m5, 2               ; jklmnop.
1013cabdff1aSopenharmony_ci    LOWPASS                  6,  5, 3               ; IJKLMNO.
1014cabdff1aSopenharmony_ci    PALIGNR                 m5, m4, 2, m3           ; bcdefghi
1015cabdff1aSopenharmony_ci    LOWPASS                  5,  4, 2               ; ABCDEFGH
1016cabdff1aSopenharmony_ci    PALIGNR                 m2, m1, 2, m3           ; tuvwxyz*
1017cabdff1aSopenharmony_ci    PALIGNR                 m4, m2, 2, m3           ; uvwxyz*a
1018cabdff1aSopenharmony_ci    LOWPASS                  4,  2, 1               ; TUVWXYZ#
1019cabdff1aSopenharmony_ci    PALIGNR                 m1, m0, 2, m3           ; lmnopqrs
1020cabdff1aSopenharmony_ci    PALIGNR                 m2, m1, 2, m3           ; mnopqrst
1021cabdff1aSopenharmony_ci    LOWPASS                  2, 1, 0                ; LMNOPQRS
; l and a are fully consumed; their registers are renamed to dst8 and the
; loop counter.
1022cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, dst8, cnt
1023cabdff1aSopenharmony_ci    lea                  dst8q, [dstq+strideq*8]
1024cabdff1aSopenharmony_ci    mov                   cntd, 8
1025cabdff1aSopenharmony_ci
1026cabdff1aSopenharmony_ci.loop:
1027cabdff1aSopenharmony_ci    sub                  dst8q, strideq             ; step one row up (first pass: row 7)
1028cabdff1aSopenharmony_ci    mova  [dst8q+strideq*0+ 0], m4
1029cabdff1aSopenharmony_ci    mova  [dst8q+strideq*0+16], m5
1030cabdff1aSopenharmony_ci    mova  [dst8q+strideq*8+ 0], m2
1031cabdff1aSopenharmony_ci    mova  [dst8q+strideq*8+16], m4
; Advance the m2:m4:m5:m6 chain by one pixel for the next (higher) row.
1032cabdff1aSopenharmony_ci%if cpuflag(avx)
1033cabdff1aSopenharmony_ci    vpalignr                m2, m4, m2, 2
1034cabdff1aSopenharmony_ci    vpalignr                m4, m5, m4, 2
1035cabdff1aSopenharmony_ci    vpalignr                m5, m6, m5, 2
1036cabdff1aSopenharmony_ci%else
1037cabdff1aSopenharmony_ci    PALIGNR                 m0, m4, m2, 2, m1
1038cabdff1aSopenharmony_ci    mova                    m2, m0
1039cabdff1aSopenharmony_ci    PALIGNR                 m0, m5, m4, 2, m1
1040cabdff1aSopenharmony_ci    mova                    m4, m0
1041cabdff1aSopenharmony_ci    PALIGNR                 m0, m6, m5, 2, m1
1042cabdff1aSopenharmony_ci    mova                    m5, m0
1043cabdff1aSopenharmony_ci%endif
1044cabdff1aSopenharmony_ci    psrldq                  m6, 2
1045cabdff1aSopenharmony_ci    dec                   cntd
1046cabdff1aSopenharmony_ci    jg .loop
1047cabdff1aSopenharmony_ci    RET
1048cabdff1aSopenharmony_ci
; vp9_ipred_dr_32x32_16(dst, stride, l, a)
;
; 32x32 "dr" intra predictor on 16-bit samples (every shift below moves by
; 2 bytes, i.e. one sample), emitted once per instruction set by the
; enclosing macro.
;
; The above row a[0-31] (plus the word just before it, at a-2) and the left
; column l[0-31] are smoothed with LOWPASS, whose result lands in its first
; operand. In the comments a lower-case name is a raw sample and an
; upper-case name its filtered value; '*' is the sample at a[-1] and '#'
; its filtered value.
;
; The loop writes four rows per pass, one in each band of eight rows, and
; then slides the whole chain of filtered vectors along by one sample.
;
; cglobal: 4 arguments, 5 GPRs, 10 vector registers (11 when SSSE3 is not
; available). Stack (%1 * mmsize bytes) is requested only when ARCH_X86_32
; is non-zero; the rsp+N*mmsize operands of SCRATCH/UNSCRATCH refer to it.
1049cabdff1aSopenharmony_cicglobal vp9_ipred_dr_32x32_16, 4, 5, 10 + notcpuflag(ssse3), \
1050cabdff1aSopenharmony_ci                               %1 * ARCH_X86_32 * -mmsize, dst, stride, l, a
; Filter the above row from its last vector down to its first. m2 keeps the
; tail A[24-30] until the end of the function.
1051cabdff1aSopenharmony_ci    mova                    m0, [aq+mmsize*3]       ; a[24-31]
1052cabdff1aSopenharmony_ci    movu                    m1, [aq+mmsize*3-2]     ; a[23-30]
1053cabdff1aSopenharmony_ci    psrldq                  m2, m0, 2               ; a[25-31].
1054cabdff1aSopenharmony_ci    LOWPASS                  2,  0, 1               ; A[24-30].
1055cabdff1aSopenharmony_ci    mova                    m1, [aq+mmsize*2]       ; a[16-23]
1056cabdff1aSopenharmony_ci    movu                    m3, [aq+mmsize*2-2]     ; a[15-22]
1057cabdff1aSopenharmony_ci    PALIGNR                 m0, m1, 2, m4           ; a[17-24]
1058cabdff1aSopenharmony_ci    LOWPASS                  0,  1, 3               ; A[16-23]
1059cabdff1aSopenharmony_ci    mova                    m3, [aq+mmsize*1]       ; a[8-15]
1060cabdff1aSopenharmony_ci    movu                    m4, [aq+mmsize*1-2]     ; a[7-14]
1061cabdff1aSopenharmony_ci    PALIGNR                 m1, m3, 2, m5           ; a[9-16]
1062cabdff1aSopenharmony_ci    LOWPASS                  1,  3, 4               ; A[8-15]
1063cabdff1aSopenharmony_ci    mova                    m4, [aq+mmsize*0]       ; a[0-7]
1064cabdff1aSopenharmony_ci    movu                    m5, [aq+mmsize*0-2]     ; *a[0-6]
1065cabdff1aSopenharmony_ci    PALIGNR                 m3, m4, 2, m6           ; a[1-8]
1066cabdff1aSopenharmony_ci    LOWPASS                  3,  4, 5               ; A[0-7]
; Park A[8-15] and A[0-7] (and A[16-23] in the SSE2 build) so that m1, m3
; and m0 can be reused while the left column is filtered.
1067cabdff1aSopenharmony_ci    SCRATCH                  1,  8, rsp+0*mmsize
1068cabdff1aSopenharmony_ci    SCRATCH                  3,  9, rsp+1*mmsize
1069cabdff1aSopenharmony_ci%if notcpuflag(ssse3)
1070cabdff1aSopenharmony_ci    SCRATCH                  0, 10, rsp+2*mmsize
1071cabdff1aSopenharmony_ci%endif
; Filter the left column, again last vector first. m5 still holds *a[0-6]
; and m4 holds a[0-7], so the first two shifts bring '*' and a[0] in behind
; l[31].
1072cabdff1aSopenharmony_ci    mova                    m6, [lq+mmsize*3]       ; l[24-31]
1073cabdff1aSopenharmony_ci    PALIGNR                 m5, m6, 2, m0           ; l[25-31]*
1074cabdff1aSopenharmony_ci    PALIGNR                 m4, m5, 2, m0           ; l[26-31]*a
1075cabdff1aSopenharmony_ci    LOWPASS                  4,  5, 6               ; L[25-31]#
1076cabdff1aSopenharmony_ci    mova                    m7, [lq+mmsize*2]       ; l[16-23]
1077cabdff1aSopenharmony_ci    PALIGNR                 m6, m7, 2, m0           ; l[17-24]
1078cabdff1aSopenharmony_ci    PALIGNR                 m5, m6, 2, m0           ; l[18-25]
1079cabdff1aSopenharmony_ci    LOWPASS                  5,  6, 7               ; L[17-24]
1080cabdff1aSopenharmony_ci    mova                    m1, [lq+mmsize*1]       ; l[8-15]
1081cabdff1aSopenharmony_ci    PALIGNR                 m7, m1, 2, m0           ; l[9-16]
1082cabdff1aSopenharmony_ci    PALIGNR                 m6, m7, 2, m0           ; l[10-17]
1083cabdff1aSopenharmony_ci    LOWPASS                  6,  7, 1               ; L[9-16]
1084cabdff1aSopenharmony_ci    mova                    m3, [lq+mmsize*0]       ; l[0-7]
1085cabdff1aSopenharmony_ci    PALIGNR                 m1, m3, 2, m0           ; l[1-8]
1086cabdff1aSopenharmony_ci    PALIGNR                 m7, m1, 2, m0           ; l[2-9]
1087cabdff1aSopenharmony_ci    LOWPASS                  7,  1, 3               ; L[1-8]
; Restore only the parked vectors that this build keeps in registers across
; the loop; the remaining ones are reloaded at the top of every pass.
1088cabdff1aSopenharmony_ci%if cpuflag(ssse3)
1089cabdff1aSopenharmony_ci%if cpuflag(avx)
1090cabdff1aSopenharmony_ci    UNSCRATCH                1,  8, rsp+0*mmsize
1091cabdff1aSopenharmony_ci%endif
1092cabdff1aSopenharmony_ci    UNSCRATCH                3,  9, rsp+1*mmsize
1093cabdff1aSopenharmony_ci%else
1094cabdff1aSopenharmony_ci    UNSCRATCH                0, 10, rsp+2*mmsize
1095cabdff1aSopenharmony_ci%endif
; l and a have been consumed: rename the argument registers to hold
; 8*stride, 24*stride and the loop counter. dst8 starts at row 8.
1096cabdff1aSopenharmony_ci    DEFINE_ARGS dst8, stride, stride8, stride24, cnt
1097cabdff1aSopenharmony_ci    lea               stride8q, [strideq*8]
1098cabdff1aSopenharmony_ci    lea              stride24q, [stride8q*3]
1099cabdff1aSopenharmony_ci    lea                  dst8q, [dst8q+strideq*8]
1100cabdff1aSopenharmony_ci    mov                   cntd, 8
1101cabdff1aSopenharmony_ci
; Each pass steps dst8 back one row and stores rows cnt-1, cnt+7, cnt+15 and
; cnt+23 (cnt as it is at the top of the pass), 4 x 16 bytes = 32 samples each.
; Every band reuses the vectors of the band above it shifted by one register.
1102cabdff1aSopenharmony_ci.loop:
1103cabdff1aSopenharmony_ci    sub                  dst8q, strideq
1104cabdff1aSopenharmony_ci%if notcpuflag(avx)
1105cabdff1aSopenharmony_ci    UNSCRATCH                1,  8, rsp+0*mmsize
1106cabdff1aSopenharmony_ci%if notcpuflag(ssse3)
1107cabdff1aSopenharmony_ci    UNSCRATCH                3,  9, rsp+1*mmsize
1108cabdff1aSopenharmony_ci%endif
1109cabdff1aSopenharmony_ci%endif
1110cabdff1aSopenharmony_ci    mova [dst8q+stride8q*0+ 0], m4
1111cabdff1aSopenharmony_ci    mova [dst8q+stride8q*0+16], m3
1112cabdff1aSopenharmony_ci    mova [dst8q+stride8q*0+32], m1
1113cabdff1aSopenharmony_ci    mova [dst8q+stride8q*0+48], m0
1114cabdff1aSopenharmony_ci    mova [dst8q+stride8q*1+ 0], m5
1115cabdff1aSopenharmony_ci    mova [dst8q+stride8q*1+16], m4
1116cabdff1aSopenharmony_ci    mova [dst8q+stride8q*1+32], m3
1117cabdff1aSopenharmony_ci    mova [dst8q+stride8q*1+48], m1
1118cabdff1aSopenharmony_ci    mova [dst8q+stride8q*2+ 0], m6
1119cabdff1aSopenharmony_ci    mova [dst8q+stride8q*2+16], m5
1120cabdff1aSopenharmony_ci    mova [dst8q+stride8q*2+32], m4
1121cabdff1aSopenharmony_ci    mova [dst8q+stride8q*2+48], m3
1122cabdff1aSopenharmony_ci    mova [dst8q+stride24q + 0], m7
1123cabdff1aSopenharmony_ci    mova [dst8q+stride24q +16], m6
1124cabdff1aSopenharmony_ci    mova [dst8q+stride24q +32], m5
1125cabdff1aSopenharmony_ci    mova [dst8q+stride24q +48], m4
; Advance the chain m7 <- m6 <- m5 <- m4 <- m3 <- m1 <- m0 <- m2 by one sample.
; The order matters: each register is updated before the one feeding it.
1126cabdff1aSopenharmony_ci%if cpuflag(avx)
1127cabdff1aSopenharmony_ci    vpalignr                m7, m6, m7, 2
1128cabdff1aSopenharmony_ci    vpalignr                m6, m5, m6, 2
1129cabdff1aSopenharmony_ci    vpalignr                m5, m4, m5, 2
1130cabdff1aSopenharmony_ci    vpalignr                m4, m3, m4, 2
1131cabdff1aSopenharmony_ci    vpalignr                m3, m1, m3, 2
1132cabdff1aSopenharmony_ci    vpalignr                m1, m0, m1, 2
1133cabdff1aSopenharmony_ci    vpalignr                m0, m2, m0, 2
1134cabdff1aSopenharmony_ci%else
; Without AVX the same chain is built through the PALIGNR macro: m2 is parked
; so it can receive each result, and m0 is passed as PALIGNR's last operand,
; which is why the SSE2 build saves m0 first and restores it before it is
; needed as a source.
1135cabdff1aSopenharmony_ci    SCRATCH                  2,  8, rsp+0*mmsize
1136cabdff1aSopenharmony_ci%if notcpuflag(ssse3)
1137cabdff1aSopenharmony_ci    SCRATCH                  0,  9, rsp+1*mmsize
1138cabdff1aSopenharmony_ci%endif
1139cabdff1aSopenharmony_ci    PALIGNR                 m2, m6, m7, 2, m0
1140cabdff1aSopenharmony_ci    mova                    m7, m2
1141cabdff1aSopenharmony_ci    PALIGNR                 m2, m5, m6, 2, m0
1142cabdff1aSopenharmony_ci    mova                    m6, m2
1143cabdff1aSopenharmony_ci    PALIGNR                 m2, m4, m5, 2, m0
1144cabdff1aSopenharmony_ci    mova                    m5, m2
1145cabdff1aSopenharmony_ci    PALIGNR                 m2, m3, m4, 2, m0
1146cabdff1aSopenharmony_ci    mova                    m4, m2
1147cabdff1aSopenharmony_ci    PALIGNR                 m2, m1, m3, 2, m0
1148cabdff1aSopenharmony_ci    mova                    m3, m2
1149cabdff1aSopenharmony_ci%if notcpuflag(ssse3)
1150cabdff1aSopenharmony_ci    UNSCRATCH                0,  9, rsp+1*mmsize
1151cabdff1aSopenharmony_ci    SCRATCH                  3,  9, rsp+1*mmsize
1152cabdff1aSopenharmony_ci%endif
1153cabdff1aSopenharmony_ci    PALIGNR                 m2, m0, m1, 2, m3
1154cabdff1aSopenharmony_ci    mova                    m1, m2
1155cabdff1aSopenharmony_ci    UNSCRATCH                2,  8, rsp+0*mmsize
1156cabdff1aSopenharmony_ci    SCRATCH                  1,  8, rsp+0*mmsize
1157cabdff1aSopenharmony_ci    PALIGNR                 m1, m2, m0, 2, m3
1158cabdff1aSopenharmony_ci    mova                    m0, m1
1159cabdff1aSopenharmony_ci%endif
; m2 is the end of the chain: nothing follows it, so it is simply shifted.
1160cabdff1aSopenharmony_ci    psrldq                  m2, 2
1161cabdff1aSopenharmony_ci    dec                   cntd
1162cabdff1aSopenharmony_ci    jg .loop
1163cabdff1aSopenharmony_ci    RET
1164cabdff1aSopenharmony_ci%endmacro
1165cabdff1aSopenharmony_ci
; Emit the DR_FUNCS predictors once per 128-bit instruction set.
; In vp9_ipred_dr_32x32_16 the macro argument is the multiplier of the stack
; request "%1 * ARCH_X86_32 * -mmsize": the SSE2 build names three spill
; slots (rsp+0..2*mmsize), the SSSE3 and AVX builds only two.
1166cabdff1aSopenharmony_ciINIT_XMM sse2
1167cabdff1aSopenharmony_ciDR_FUNCS 3
1168cabdff1aSopenharmony_ciINIT_XMM ssse3
1169cabdff1aSopenharmony_ciDR_FUNCS 2
1170cabdff1aSopenharmony_ciINIT_XMM avx
1171cabdff1aSopenharmony_ciDR_FUNCS 2
1172cabdff1aSopenharmony_ci
1173cabdff1aSopenharmony_ci%if HAVE_AVX2_EXTERNAL
1174cabdff1aSopenharmony_ciINIT_YMM avx2
; vp9_ipred_dr_16x16_16(dst, stride, l, a), AVX2: one 16-sample row per ymm.
;
; In the comments a lower-case letter is a raw sample, an upper-case letter
; its LOWPASS-filtered value, '*' the sample at a[-1] and '#' its filtered
; value.
;
; After the setup:
;   m0 = filtered left column, ending in '#'
;   m1 = filtered above row (its last sample is not valid and never stored)
;   m5 = upper half of m0 followed by lower half of m1
; vpalignr shifts each 128-bit half on its own, so the row that lies k samples
; further along is built as vpalignr(m5, m0, 2k) for rows 15-k and as
; vpalignr(m1, m5, 2k) for rows 7-k (k = 1..7). Rows 15 and 7 are m0 and m5.
1175cabdff1aSopenharmony_cicglobal vp9_ipred_dr_16x16_16, 4, 5, 6, dst, stride, l, a
1176cabdff1aSopenharmony_ci    mova                    m0, [lq]                   ; klmnopqrstuvwxyz
1177cabdff1aSopenharmony_ci    movu                    m1, [aq-2]                 ; *abcdefghijklmno
1178cabdff1aSopenharmony_ci    mova                    m2, [aq]                   ; abcdefghijklmnop
1179cabdff1aSopenharmony_ci    vperm2i128              m4, m2, m2, q2001          ; ijklmnop........
1180cabdff1aSopenharmony_ci    vpalignr                m5, m4, m2, 2              ; bcdefghijklmnop.
1181cabdff1aSopenharmony_ci    vperm2i128              m3, m0, m1, q0201          ; stuvwxyz*abcdefg
1182cabdff1aSopenharmony_ci    LOWPASS                  1,  2,  5                 ; ABCDEFGHIJKLMNO.
1183cabdff1aSopenharmony_ci    vpalignr                m4, m3, m0, 2              ; lmnopqrstuvwxyz*
1184cabdff1aSopenharmony_ci    vpalignr                m5, m3, m0, 4              ; mnopqrstuvwxyz*a
1185cabdff1aSopenharmony_ci    LOWPASS                  0,  4,  5                 ; LMNOPQRSTUVWXYZ#
1186cabdff1aSopenharmony_ci    vperm2i128              m5, m0, m1, q0201          ; TUVWXYZ#ABCDEFGH
; l and a are no longer needed; their registers become 3*stride, 5*stride
; and a second row pointer. dst3 starts at row 4 and is stepped back to row 3
; after the first pair of stores.
1187cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, stride3, stride5, dst3
1188cabdff1aSopenharmony_ci    lea                  dst3q, [dstq+strideq*4]
1189cabdff1aSopenharmony_ci    lea               stride3q, [strideq*3]
1190cabdff1aSopenharmony_ci    lea               stride5q, [stride3q+strideq*2]
1191cabdff1aSopenharmony_ci
; The number in each trailing comment is the row written by that store.
1192cabdff1aSopenharmony_ci    vpalignr                m3, m5, m0, 2
1193cabdff1aSopenharmony_ci    vpalignr                m4, m1, m5, 2
1194cabdff1aSopenharmony_ci    mova    [dst3q+stride5q*2], m3                     ; 14
1195cabdff1aSopenharmony_ci    mova    [ dstq+stride3q*2], m4                     ; 6
1196cabdff1aSopenharmony_ci    vpalignr                m3, m5, m0, 4
1197cabdff1aSopenharmony_ci    vpalignr                m4, m1, m5, 4
1198cabdff1aSopenharmony_ci    sub                  dst3q, strideq
1199cabdff1aSopenharmony_ci    mova    [dst3q+stride5q*2], m3                     ; 13
1200cabdff1aSopenharmony_ci    mova    [dst3q+strideq*2 ], m4                     ; 5
1201cabdff1aSopenharmony_ci    mova    [dst3q+stride3q*4], m0                     ; 15
1202cabdff1aSopenharmony_ci    vpalignr                m3, m5, m0, 6
1203cabdff1aSopenharmony_ci    vpalignr                m4, m1, m5, 6
1204cabdff1aSopenharmony_ci    mova     [dstq+stride3q*4], m3                     ; 12
1205cabdff1aSopenharmony_ci    mova     [dst3q+strideq*1], m4                     ; 4
1206cabdff1aSopenharmony_ci    vpalignr                m3, m5, m0, 8
1207cabdff1aSopenharmony_ci    vpalignr                m4, m1, m5, 8
1208cabdff1aSopenharmony_ci    mova     [dst3q+strideq*8], m3                     ; 11
1209cabdff1aSopenharmony_ci    mova     [dst3q+strideq*0], m4                     ; 3
1210cabdff1aSopenharmony_ci    vpalignr                m3, m5, m0, 10
1211cabdff1aSopenharmony_ci    vpalignr                m4, m1, m5, 10
1212cabdff1aSopenharmony_ci    mova     [dstq+stride5q*2], m3                     ; 10
1213cabdff1aSopenharmony_ci    mova     [dstq+strideq*2 ], m4                     ; 2
1214cabdff1aSopenharmony_ci    vpalignr                m3, m5, m0, 12
1215cabdff1aSopenharmony_ci    vpalignr                m4, m1, m5, 12
1216cabdff1aSopenharmony_ci    mova    [dst3q+stride3q*2], m3                     ; 9
1217cabdff1aSopenharmony_ci    mova     [dstq+strideq*1 ], m4                     ; 1
1218cabdff1aSopenharmony_ci    vpalignr                m3, m5, m0, 14
1219cabdff1aSopenharmony_ci    vpalignr                m4, m1, m5, 14
1220cabdff1aSopenharmony_ci    mova      [dstq+strideq*8], m3                     ; 8
1221cabdff1aSopenharmony_ci    mova      [dstq+strideq*0], m4                     ; 0
1222cabdff1aSopenharmony_ci    mova     [dst3q+strideq*4], m5                     ; 7
1223cabdff1aSopenharmony_ci    RET
1224cabdff1aSopenharmony_ci
; vp9_ipred_vl_16x16_16(dst, stride, l, a), AVX2: one 16-sample row per ymm.
; Only the above row is read (l is never referenced). Its last sample,
; p = a[15], is replicated to pad everything that is shifted in on the right.
;
; Even rows hold pavgw averages of adjacent samples, odd rows the LOWPASS of
; three adjacent samples. Rows 2k and 2k+1 are the row-0 and row-1 patterns
; moved k samples along. m4 and m6 carry the continuation of m3 and m5 so that
; the lane-wise vpalignr can assemble each shifted row.
1225cabdff1aSopenharmony_cicglobal vp9_ipred_vl_16x16_16, 4, 5, 7, dst, stride, l, a
1226cabdff1aSopenharmony_ci    movifnidn               aq, amp
1227cabdff1aSopenharmony_ci    mova                    m0, [aq]                   ; abcdefghijklmnop
1228cabdff1aSopenharmony_ci    vpbroadcastw           xm1, [aq+30]                ; pppppppp
1229cabdff1aSopenharmony_ci    vperm2i128              m2, m0, m1, q0201          ; ijklmnoppppppppp
1230cabdff1aSopenharmony_ci    vpalignr                m3, m2, m0, 2              ; bcdefghijklmnopp
1231cabdff1aSopenharmony_ci    vperm2i128              m4, m3, m1, q0201          ; jklmnopppppppppp
1232cabdff1aSopenharmony_ci    vpalignr                m5, m2, m0, 4              ; cdefghijklmnoppp
1233cabdff1aSopenharmony_ci    vperm2i128              m6, m5, m1, q0201          ; klmnoppppppppppp
1234cabdff1aSopenharmony_ci    LOWPASS                  5,  3,  0                 ; BCDEFGHIJKLMNOPP
1235cabdff1aSopenharmony_ci    LOWPASS                  6,  4,  2                 ; JKLMNOPPPPPPPPPP
; From here on the lower-case letters in the comments denote the two-sample
; averages held in m3/m4, not raw samples.
1236cabdff1aSopenharmony_ci    pavgw                   m3, m0                     ; abcdefghijklmnop
1237cabdff1aSopenharmony_ci    pavgw                   m4, m2                     ; ijklmnoppppppppp
; a is no longer needed; reuse the argument registers for 3*stride, 5*stride
; and a second row pointer that starts at row 4.
1238cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, stride3, stride5, dst4
1239cabdff1aSopenharmony_ci    lea                  dst4q, [dstq+strideq*4]
1240cabdff1aSopenharmony_ci    lea               stride3q, [strideq*3]
1241cabdff1aSopenharmony_ci    lea               stride5q, [stride3q+strideq*2]
1242cabdff1aSopenharmony_ci
1243cabdff1aSopenharmony_ci    mova      [dstq+strideq*0], m3                     ; 0  abcdefghijklmnop
1244cabdff1aSopenharmony_ci    mova      [dstq+strideq*1], m5                     ; 1  BCDEFGHIJKLMNOPP
1245cabdff1aSopenharmony_ci    vpalignr                m0, m4, m3, 2
1246cabdff1aSopenharmony_ci    vpalignr                m1, m6, m5, 2
1247cabdff1aSopenharmony_ci    mova     [dstq+strideq*2 ], m0                     ; 2  bcdefghijklmnopp
1248cabdff1aSopenharmony_ci    mova     [dstq+stride3q*1], m1                     ; 3  CDEFGHIJKLMNOPPP
1249cabdff1aSopenharmony_ci    vpalignr                m0, m4, m3, 4
1250cabdff1aSopenharmony_ci    vpalignr                m1, m6, m5, 4
1251cabdff1aSopenharmony_ci    mova     [dst4q+strideq*0], m0                     ; 4  cdefghijklmnoppp
1252cabdff1aSopenharmony_ci    mova     [dstq+stride5q*1], m1                     ; 5  DEFGHIJKLMNOPPPP
1253cabdff1aSopenharmony_ci    vpalignr                m0, m4, m3, 6
1254cabdff1aSopenharmony_ci    vpalignr                m1, m6, m5, 6
1255cabdff1aSopenharmony_ci    mova    [ dstq+stride3q*2], m0                     ; 6  defghijklmnopppp
1256cabdff1aSopenharmony_ci    mova    [dst4q+stride3q*1], m1                     ; 7  EFGHIJKLMNOPPPPP
1257cabdff1aSopenharmony_ci    vpalignr                m0, m4, m3, 8
1258cabdff1aSopenharmony_ci    vpalignr                m1, m6, m5, 8
1259cabdff1aSopenharmony_ci    mova    [  dstq+strideq*8], m0                     ; 8  efghijklmnoppppp
1260cabdff1aSopenharmony_ci    mova    [dst4q+stride5q*1], m1                     ; 9  FGHIJKLMNOPPPPPP
; The remaining even rows are written first, then dst4 is stepped back to
; row 3 so that the same addressing forms reach the remaining odd rows.
1261cabdff1aSopenharmony_ci    vpalignr                m0, m4, m3, 10
1262cabdff1aSopenharmony_ci    mova     [dstq+stride5q*2], m0                     ; 10 fghijklmnopppppp
1263cabdff1aSopenharmony_ci    vpalignr                m0, m4, m3, 12
1264cabdff1aSopenharmony_ci    mova     [dst4q+strideq*8], m0                     ; 12 ghijklmnoppppppp
1265cabdff1aSopenharmony_ci    vpalignr                m0, m4, m3, 14
1266cabdff1aSopenharmony_ci    mova    [dst4q+stride5q*2], m0                     ; 14 hijklmnopppppppp
1267cabdff1aSopenharmony_ci    sub                  dst4q, strideq
1268cabdff1aSopenharmony_ci    vpalignr                m1, m6, m5, 10
1269cabdff1aSopenharmony_ci    mova     [dst4q+strideq*8], m1                     ; 11 GHIJKLMNOPPPPPPP
1270cabdff1aSopenharmony_ci    vpalignr                m1, m6, m5, 12
1271cabdff1aSopenharmony_ci    mova    [dst4q+stride5q*2], m1                     ; 13 HIJKLMNOPPPPPPPP
1272cabdff1aSopenharmony_ci    vpalignr                m1, m6, m5, 14
1273cabdff1aSopenharmony_ci    mova    [dst4q+stride3q*4], m1                     ; 15 IJKLMNOPPPPPPPPP
1274cabdff1aSopenharmony_ci    RET
1275cabdff1aSopenharmony_ci
; vp9_ipred_hd_16x16_16(dst, stride, l, a), AVX2: one 16-sample row per ymm.
;
; The left column is turned into two vectors: pavgw averages of adjacent
; samples (lower-case letters from the pavgw line onwards) and LOWPASS values
; (upper-case letters, '#' being the filtered sample at a[-1]). They are
; interleaved as kLlMmN..., and the filtered above row (m2) continues the
; sequence after '#'. Each row further up the block is the row below it
; moved along by one average/filtered pair, i.e. 4 bytes.
;
; After the permutes m3, m1, m4, m0 and m2 are consecutive pieces of that
; sequence overlapping by one 128-bit lane, which is what the lane-wise
; vpalignr needs to assemble the shifted rows.
1276cabdff1aSopenharmony_cicglobal vp9_ipred_hd_16x16_16, 4, 5, 7, dst, stride, l, a
1277cabdff1aSopenharmony_ci    movu                    m0, [aq-2]                 ; *abcdefghijklmno
1278cabdff1aSopenharmony_ci    mova                    m1, [lq]                   ; klmnopqrstuvwxyz
1279cabdff1aSopenharmony_ci    vperm2i128              m2, m1, m0, q0201          ; stuvwxyz*abcdefg
1280cabdff1aSopenharmony_ci    vpalignr                m3, m2, m1, 2              ; lmnopqrstuvwxyz*
1281cabdff1aSopenharmony_ci    vpalignr                m4, m2, m1, 4              ; mnopqrstuvwxyz*a
1282cabdff1aSopenharmony_ci    LOWPASS                  4,  3,  1                 ; LMNOPQRSTUVWXYZ#
1283cabdff1aSopenharmony_ci    pavgw                   m3, m1                     ; klmnopqrstuvwxyz
1284cabdff1aSopenharmony_ci    mova                    m1, [aq]                   ; abcdefghijklmnop
1285cabdff1aSopenharmony_ci    movu                    m2, [aq+2]                 ; bcdefghijklmnop.
1286cabdff1aSopenharmony_ci    LOWPASS                  2,  1,  0                 ; ABCDEFGHIJKLMNO.
1287cabdff1aSopenharmony_ci    vpunpcklwd              m0, m3, m4                 ; kLlMmNnOsTtUuVvW
1288cabdff1aSopenharmony_ci    vpunpckhwd              m1, m3, m4                 ; oPpQqRrSwXxYyZz#
1289cabdff1aSopenharmony_ci    vperm2i128              m3, m1, m0, q0002          ; kLlMmNnOoPpQqRrS
1290cabdff1aSopenharmony_ci    vperm2i128              m4, m0, m1, q0301          ; sTtUuVvWwXxYyZz#
1291cabdff1aSopenharmony_ci    vperm2i128              m0, m4, m2, q0201          ; wXxYyZz#ABCDEFGH
1292cabdff1aSopenharmony_ci    vperm2i128              m1, m3, m4, q0201          ; oPpQqRrSsTtUuVvW
; l and a are no longer needed; reuse the argument registers for 3*stride,
; 5*stride and a second row pointer that starts at row 5.
1293cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, stride3, stride5, dst5
1294cabdff1aSopenharmony_ci    lea               stride3q, [strideq*3]
1295cabdff1aSopenharmony_ci    lea               stride5q, [stride3q+strideq*2]
1296cabdff1aSopenharmony_ci    lea                  dst5q, [dstq+stride5q]
1297cabdff1aSopenharmony_ci
; Each trailing comment gives the row number and the contents stored there.
1298cabdff1aSopenharmony_ci    mova    [dst5q+stride5q*2], m3                     ; 15 kLlMmNnOoPpQqRrS
1299cabdff1aSopenharmony_ci    mova    [dst5q+stride3q*2], m1                     ; 11 oPpQqRrSsTtUuVvW
1300cabdff1aSopenharmony_ci    mova     [dst5q+strideq*2], m4                     ; 7  sTtUuVvWwXxYyZz#
1301cabdff1aSopenharmony_ci    mova     [dstq+stride3q*1], m0                     ; 3  wXxYyZz#ABCDEFGH
1302cabdff1aSopenharmony_ci    vpalignr                m5, m4, m1, 4
1303cabdff1aSopenharmony_ci    mova     [dstq+stride5q*2], m5                     ; 10 pQqRrSsTtUuVvWwX
1304cabdff1aSopenharmony_ci    vpalignr                m5, m0, m4, 4
1305cabdff1aSopenharmony_ci    vpalignr                m6, m2, m0, 4
1306cabdff1aSopenharmony_ci    mova     [dstq+stride3q*2], m5                     ; 6  tUuVvWwXxYyZz#AB
1307cabdff1aSopenharmony_ci    mova      [dstq+strideq*2], m6                     ; 2  xYyZz#ABCDEFGHIJ
1308cabdff1aSopenharmony_ci    vpalignr                m5, m4, m1, 8
1309cabdff1aSopenharmony_ci    mova     [dst5q+strideq*4], m5                     ; 9  qRrSsTtUuVvWwXxY
1310cabdff1aSopenharmony_ci    vpalignr                m5, m0, m4, 8
1311cabdff1aSopenharmony_ci    vpalignr                m6, m2, m0, 8
1312cabdff1aSopenharmony_ci    mova     [dstq+stride5q*1], m5                     ; 5  uVvWwXxYyZz#ABCD
1313cabdff1aSopenharmony_ci    mova      [dstq+strideq*1], m6                     ; 1  yZz#ABCDEFGHIJKL
1314cabdff1aSopenharmony_ci    vpalignr                m5, m1, m3, 12
1315cabdff1aSopenharmony_ci    vpalignr                m6, m4, m1, 12
1316cabdff1aSopenharmony_ci    mova     [dstq+stride3q*4], m5                     ; 12 nOoPpQqRrSsTtUuV
1317cabdff1aSopenharmony_ci    mova      [dst5q+stride3q], m6                     ; 8  rSsTtUuVvWwXxYyZ
1318cabdff1aSopenharmony_ci    vpalignr                m5, m0, m4, 12
1319cabdff1aSopenharmony_ci    vpalignr                m6, m2, m0, 12
1320cabdff1aSopenharmony_ci    mova      [dstq+strideq*4], m5                     ; 4  vWwXxYyZz#ABCDEF
1321cabdff1aSopenharmony_ci    mova      [dstq+strideq*0], m6                     ; 0  z#ABCDEFGHIJKLMN
; Rows 14 and 13 are reached by stepping dst5 back one row at a time and
; reusing the +10*stride addressing form.
1322cabdff1aSopenharmony_ci    sub                  dst5q, strideq
1323cabdff1aSopenharmony_ci    vpalignr                m5, m1, m3, 4
1324cabdff1aSopenharmony_ci    mova    [dst5q+stride5q*2], m5                     ; 14 lMmNnOoPpQqRrSsT
1325cabdff1aSopenharmony_ci    sub                  dst5q, strideq
1326cabdff1aSopenharmony_ci    vpalignr                m5, m1, m3, 8
1327cabdff1aSopenharmony_ci    mova    [dst5q+stride5q*2], m5                    ; 13 mNnOoPpQqRrSsTtU
1328cabdff1aSopenharmony_ci    RET
1329cabdff1aSopenharmony_ci
1330cabdff1aSopenharmony_ci%if ARCH_X86_64
; vp9_ipred_dr_32x32_16(dst, stride, l, a), AVX2: every 32-sample row is two
; ymm stores (byte offsets +0 and +32). The function declares ten vector
; registers (m0-m9), more than 32-bit mode provides, hence the surrounding
; ARCH_X86_64 guard.
;
; After filtering, m0/m1 hold the filtered left column (m1 ending in the
; filtered top-left sample '#') and m2/m3 the filtered above row. m5, m4 and
; m8 are built from the upper half of one vector and the lower half of the
; next; the lane-wise vpalignr needs them to shift across a register boundary.
;
; The loop runs twice. The first pass writes rows 31-24 and 15-8. The vectors
; are then rotated down the chain by eight samples and both row pointers move
; up eight rows, so the second pass writes rows 23-16 and 7-0.
1331cabdff1aSopenharmony_cicglobal vp9_ipred_dr_32x32_16, 4, 7, 10, dst, stride, l, a
1332cabdff1aSopenharmony_ci    mova                    m0, [lq+mmsize*0+0]        ; l[0-15]
1333cabdff1aSopenharmony_ci    mova                    m1, [lq+mmsize*1+0]        ; l[16-31]
1334cabdff1aSopenharmony_ci    movu                    m2, [aq+mmsize*0-2]        ; *abcdefghijklmno
1335cabdff1aSopenharmony_ci    mova                    m3, [aq+mmsize*0+0]        ; abcdefghijklmnop
1336cabdff1aSopenharmony_ci    mova                    m4, [aq+mmsize*1+0]        ; qrstuvwxyz012345
1337cabdff1aSopenharmony_ci    vperm2i128              m5, m0, m1, q0201          ; lmnopqrstuvwxyz0
1338cabdff1aSopenharmony_ci    vpalignr                m6, m5, m0, 2              ; mnopqrstuvwxyz01
1339cabdff1aSopenharmony_ci    vpalignr                m7, m5, m0, 4              ; nopqrstuvwxyz012
1340cabdff1aSopenharmony_ci    LOWPASS                  0,  6,  7                 ; L[0-15]
1341cabdff1aSopenharmony_ci    vperm2i128              m7, m1, m2, q0201          ; stuvwxyz*abcdefg
1342cabdff1aSopenharmony_ci    vpalignr                m5, m7, m1, 2              ; lmnopqrstuvwxyz*
1343cabdff1aSopenharmony_ci    vpalignr                m6, m7, m1, 4              ; mnopqrstuvwxyz*a
1344cabdff1aSopenharmony_ci    LOWPASS                  1,  5,  6                 ; L[16-31]#
1345cabdff1aSopenharmony_ci    vperm2i128              m5, m3, m4, q0201          ; ijklmnopqrstuvwx
1346cabdff1aSopenharmony_ci    vpalignr                m6, m5, m3, 2              ; bcdefghijklmnopq
1347cabdff1aSopenharmony_ci    LOWPASS                  2,  3,  6                 ; A[0-15]
1348cabdff1aSopenharmony_ci    movu                    m3, [aq+mmsize*1-2]        ; pqrstuvwxyz01234
1349cabdff1aSopenharmony_ci    vperm2i128              m6, m4, m4, q2001          ; yz012345........
1350cabdff1aSopenharmony_ci    vpalignr                m7, m6, m4, 2              ; rstuvwxyz012345.
1351cabdff1aSopenharmony_ci    LOWPASS                  3,  4,  7                 ; A[16-31].
; Bridges across the register boundaries m0|m1, m1|m2 and m2|m3.
1352cabdff1aSopenharmony_ci    vperm2i128              m4, m1, m2, q0201          ; TUVWXYZ#ABCDEFGH
1353cabdff1aSopenharmony_ci    vperm2i128              m5, m0, m1, q0201          ; L[8-15]L[16-23]
1354cabdff1aSopenharmony_ci    vperm2i128              m8, m2, m3, q0201          ; IJKLMNOPQRSTUVWX
; l and a are no longer needed; reuse the argument registers for stride
; multiples, a second row pointer and the loop counter. dst24 starts at
; row 24 and dst8 at row 8.
1355cabdff1aSopenharmony_ci    DEFINE_ARGS dst8, stride, stride3, stride7, stride5, dst24, cnt
1356cabdff1aSopenharmony_ci    lea               stride3q, [strideq*3]
1357cabdff1aSopenharmony_ci    lea               stride5q, [stride3q+strideq*2]
1358cabdff1aSopenharmony_ci    lea               stride7q, [strideq*4+stride3q]
1359cabdff1aSopenharmony_ci    lea                 dst24q, [dst8q+stride3q*8]
1360cabdff1aSopenharmony_ci    lea                  dst8q, [dst8q+strideq*8]
1361cabdff1aSopenharmony_ci    mov                   cntd, 2
1362cabdff1aSopenharmony_ci
; The numbers in the trailing comments are the rows written by each group of
; four stores: the first two on the first pass (dst24, dst8), the last two
; on the second pass.
1363cabdff1aSopenharmony_ci.loop:
1364cabdff1aSopenharmony_ci    mova  [dst24q+stride7q+0 ], m0                     ; 31 23 15 7
1365cabdff1aSopenharmony_ci    mova  [dst24q+stride7q+32], m1
1366cabdff1aSopenharmony_ci    mova    [dst8q+stride7q+0], m1
1367cabdff1aSopenharmony_ci    mova   [dst8q+stride7q+32], m2
1368cabdff1aSopenharmony_ci    vpalignr                m6, m4, m1, 2
1369cabdff1aSopenharmony_ci    vpalignr                m7, m5, m0, 2
1370cabdff1aSopenharmony_ci    vpalignr                m9, m8, m2, 2
1371cabdff1aSopenharmony_ci    mova [dst24q+stride3q*2+0], m7                     ; 30 22 14 6
1372cabdff1aSopenharmony_ci    mova [dst24q+stride3q*2+32], m6
1373cabdff1aSopenharmony_ci    mova  [dst8q+stride3q*2+0], m6
1374cabdff1aSopenharmony_ci    mova [dst8q+stride3q*2+32], m9
1375cabdff1aSopenharmony_ci    vpalignr                m6, m4, m1, 4
1376cabdff1aSopenharmony_ci    vpalignr                m7, m5, m0, 4
1377cabdff1aSopenharmony_ci    vpalignr                m9, m8, m2, 4
1378cabdff1aSopenharmony_ci    mova   [dst24q+stride5q+0], m7                     ; 29 21 13 5
1379cabdff1aSopenharmony_ci    mova  [dst24q+stride5q+32], m6
1380cabdff1aSopenharmony_ci    mova    [dst8q+stride5q+0], m6
1381cabdff1aSopenharmony_ci    mova   [dst8q+stride5q+32], m9
1382cabdff1aSopenharmony_ci    vpalignr                m6, m4, m1, 6
1383cabdff1aSopenharmony_ci    vpalignr                m7, m5, m0, 6
1384cabdff1aSopenharmony_ci    vpalignr                m9, m8, m2, 6
1385cabdff1aSopenharmony_ci    mova [dst24q+strideq*4+0 ], m7                     ; 28 20 12 4
1386cabdff1aSopenharmony_ci    mova [dst24q+strideq*4+32], m6
1387cabdff1aSopenharmony_ci    mova   [dst8q+strideq*4+0], m6
1388cabdff1aSopenharmony_ci    mova  [dst8q+strideq*4+32], m9
1389cabdff1aSopenharmony_ci    vpalignr                m6, m4, m1, 8
1390cabdff1aSopenharmony_ci    vpalignr                m7, m5, m0, 8
1391cabdff1aSopenharmony_ci    vpalignr                m9, m8, m2, 8
1392cabdff1aSopenharmony_ci    mova  [dst24q+stride3q+0 ], m7                     ; 27 19 11 3
1393cabdff1aSopenharmony_ci    mova  [dst24q+stride3q+32], m6
1394cabdff1aSopenharmony_ci    mova    [dst8q+stride3q+0], m6
1395cabdff1aSopenharmony_ci    mova   [dst8q+stride3q+32], m9
1396cabdff1aSopenharmony_ci    vpalignr                m6, m4, m1, 10
1397cabdff1aSopenharmony_ci    vpalignr                m7, m5, m0, 10
1398cabdff1aSopenharmony_ci    vpalignr                m9, m8, m2, 10
1399cabdff1aSopenharmony_ci    mova [dst24q+strideq*2+0 ], m7                     ; 26 18 10 2
1400cabdff1aSopenharmony_ci    mova [dst24q+strideq*2+32], m6
1401cabdff1aSopenharmony_ci    mova   [dst8q+strideq*2+0], m6
1402cabdff1aSopenharmony_ci    mova  [dst8q+strideq*2+32], m9
1403cabdff1aSopenharmony_ci    vpalignr                m6, m4, m1, 12
1404cabdff1aSopenharmony_ci    vpalignr                m7, m5, m0, 12
1405cabdff1aSopenharmony_ci    vpalignr                m9, m8, m2, 12
1406cabdff1aSopenharmony_ci    mova   [dst24q+strideq+0 ], m7                     ; 25 17 9 1
1407cabdff1aSopenharmony_ci    mova   [dst24q+strideq+32], m6
1408cabdff1aSopenharmony_ci    mova     [dst8q+strideq+0], m6
1409cabdff1aSopenharmony_ci    mova    [dst8q+strideq+32], m9
1410cabdff1aSopenharmony_ci    vpalignr                m6, m4, m1, 14
1411cabdff1aSopenharmony_ci    vpalignr                m7, m5, m0, 14
1412cabdff1aSopenharmony_ci    vpalignr                m9, m8, m2, 14
1413cabdff1aSopenharmony_ci    mova [dst24q+strideq*0+0 ], m7                     ; 24 16 8 0
1414cabdff1aSopenharmony_ci    mova [dst24q+strideq*0+32], m6
1415cabdff1aSopenharmony_ci    mova   [dst8q+strideq*0+0], m6
1416cabdff1aSopenharmony_ci    mova  [dst8q+strideq*0+32], m9
; Rotate the chain by one half-register (eight samples): every vector takes
; the value of the bridge that follows it, and the last bridge takes m3.
1417cabdff1aSopenharmony_ci    mova                    m0, m5
1418cabdff1aSopenharmony_ci    mova                    m5, m1
1419cabdff1aSopenharmony_ci    mova                    m1, m4
1420cabdff1aSopenharmony_ci    mova                    m4, m2
1421cabdff1aSopenharmony_ci    mova                    m2, m8
1422cabdff1aSopenharmony_ci    mova                    m8, m3
; Move both row pointers up by eight rows (7*stride + stride).
1423cabdff1aSopenharmony_ci    sub                 dst24q, stride7q
1424cabdff1aSopenharmony_ci    sub                 dst24q, strideq
1425cabdff1aSopenharmony_ci    sub                  dst8q, stride7q
1426cabdff1aSopenharmony_ci    sub                  dst8q, strideq
1427cabdff1aSopenharmony_ci    dec                   cntd
1428cabdff1aSopenharmony_ci    jg .loop
1429cabdff1aSopenharmony_ci    RET
1430cabdff1aSopenharmony_ci%endif
1431cabdff1aSopenharmony_ci%endif
1432cabdff1aSopenharmony_ci
1433cabdff1aSopenharmony_ci%macro VL_FUNCS 1 ; stack_mem_for_32x32_32bit_function
; vp9_ipred_vl_4x4_16(dst, stride, l, a): 4x4 "vl" predictor on 16-bit
; samples. Eight above samples are read; l is never referenced.
;
; Rows 0 and 2 are pavgw averages of adjacent samples, rows 1 and 3 the
; LOWPASS of three adjacent samples; rows 2/3 are rows 0/1 moved one sample
; along. Each row is written with movh, so only the first four samples of the
; vector reach memory.
1434cabdff1aSopenharmony_cicglobal vp9_ipred_vl_4x4_16, 2, 4, 3, dst, stride, l, a
; cglobal is told to load only two arguments; NOTE(review): movifnidn
; presumably fetches 'a' only where it is not already in its register.
1435cabdff1aSopenharmony_ci    movifnidn               aq, amp
1436cabdff1aSopenharmony_ci    movu                    m0, [aq]                ; abcdefgh
1437cabdff1aSopenharmony_ci    psrldq                  m1, m0, 2               ; bcdefgh.
1438cabdff1aSopenharmony_ci    psrldq                  m2, m0, 4               ; cdefgh..
1439cabdff1aSopenharmony_ci    LOWPASS                  2,  1, 0               ; BCDEFGH.
1440cabdff1aSopenharmony_ci    pavgw                   m1, m0                  ; ABCDEFG.
; a is no longer needed; the third register becomes 3*stride.
1441cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, stride3
1442cabdff1aSopenharmony_ci    lea               stride3q, [strideq*3]
1443cabdff1aSopenharmony_ci
1444cabdff1aSopenharmony_ci    movh      [dstq+strideq*0], m1
1445cabdff1aSopenharmony_ci    movh      [dstq+strideq*1], m2
1446cabdff1aSopenharmony_ci    psrldq                  m1, 2
1447cabdff1aSopenharmony_ci    psrldq                  m2, 2
1448cabdff1aSopenharmony_ci    movh      [dstq+strideq*2], m1
1449cabdff1aSopenharmony_ci    movh      [dstq+stride3q ], m2
1450cabdff1aSopenharmony_ci    RET
1451cabdff1aSopenharmony_ci
; vp9_ipred_vl_8x8_16(dst, stride, l, a): 8x8 "vl" predictor on 16-bit
; samples, one 8-sample row per xmm. Only the above row is read.
;
; m1 holds the pavgw averages of adjacent samples (even rows) and m2 the
; LOWPASS of three adjacent samples (odd rows). After each pair of stores
; both are moved one sample along with SHIFT_RIGHT; going by the existing
; comments (bcdefghh / cdefghhh) the last sample is repeated rather than
; replaced by zero.
1452cabdff1aSopenharmony_cicglobal vp9_ipred_vl_8x8_16, 2, 4, 4, dst, stride, l, a
1453cabdff1aSopenharmony_ci    movifnidn               aq, amp
1454cabdff1aSopenharmony_ci    mova                    m0, [aq]                ; abcdefgh
; pb_2to15_14_15 is the byte sequence 2..15,14,15. NOTE(review): presumably
; the shuffle mask used by the SHIFT_RIGHT macros in SSSE3+ builds; without
; SSSE3 m3 is passed to them uninitialised, so it can only be scratch there.
1455cabdff1aSopenharmony_ci%if cpuflag(ssse3)
1456cabdff1aSopenharmony_ci    mova                    m3, [pb_2to15_14_15]
1457cabdff1aSopenharmony_ci%endif
1458cabdff1aSopenharmony_ci    SHIFT_RIGHTx2           m1, m2, m0, m3          ; bcdefghh/cdefghhh
1459cabdff1aSopenharmony_ci    LOWPASS                  2,  1, 0               ; BCDEFGHh
1460cabdff1aSopenharmony_ci    pavgw                   m1, m0                  ; ABCDEFGh
; a is no longer needed; the third register becomes 3*stride.
1461cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, stride3
1462cabdff1aSopenharmony_ci    lea               stride3q, [strideq*3]
1463cabdff1aSopenharmony_ci
; Rows 0-3.
1464cabdff1aSopenharmony_ci    mova      [dstq+strideq*0], m1
1465cabdff1aSopenharmony_ci    mova      [dstq+strideq*1], m2
1466cabdff1aSopenharmony_ci    SHIFT_RIGHT             m1, m1, m3
1467cabdff1aSopenharmony_ci    SHIFT_RIGHT             m2, m2, m3
1468cabdff1aSopenharmony_ci    mova      [dstq+strideq*2], m1
1469cabdff1aSopenharmony_ci    mova      [dstq+stride3q ], m2
; Rows 4-7: same pattern after advancing dst by four rows.
1470cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*4]
1471cabdff1aSopenharmony_ci    SHIFT_RIGHT             m1, m1, m3
1472cabdff1aSopenharmony_ci    SHIFT_RIGHT             m2, m2, m3
1473cabdff1aSopenharmony_ci    mova      [dstq+strideq*0], m1
1474cabdff1aSopenharmony_ci    mova      [dstq+strideq*1], m2
1475cabdff1aSopenharmony_ci    SHIFT_RIGHT             m1, m1, m3
1476cabdff1aSopenharmony_ci    SHIFT_RIGHT             m2, m2, m3
1477cabdff1aSopenharmony_ci    mova      [dstq+strideq*2], m1
1478cabdff1aSopenharmony_ci    mova      [dstq+stride3q ], m2
1479cabdff1aSopenharmony_ci    RET
1480cabdff1aSopenharmony_ci
;------------------------------------------------------------------------------
; vp9_ipred_vl_16x16_16(dst, stride, l, a)
; Writes 16 rows of two vector registers each, two rows per loop iteration
; (cnt = 8), using only the a edge; l is never read.
; Register pairs, low half first:
;   m2:m1  row stored at strideq*0 (pavgw outputs)
;   m3:m0  row stored at strideq*1 (LOWPASS outputs)
; After each iteration both pairs advance by one 16-bit word: the low half
; pulls in the bottom word of the high half via palignr, and the high half
; goes through SHIFT_RIGHT.
; NOTE(review): LOWPASS, SHIFT_RIGHT, SHIFT_RIGHTx2 and PALIGNR are defined
; outside this chunk; their exact semantics are assumed from how they are
; used here -- confirm against the macro definitions.
;------------------------------------------------------------------------------
1481cabdff1aSopenharmony_cicglobal vp9_ipred_vl_16x16_16, 2, 4, 6, dst, stride, l, a
1482cabdff1aSopenharmony_ci    movifnidn               aq, amp                 ; only 2 args are auto-loaded, so fetch a here
1483cabdff1aSopenharmony_ci    mova                    m0, [aq]
1484cabdff1aSopenharmony_ci    mova                    m1, [aq+mmsize]
    ; low half: edge shifted by one and by two words across the m1:m0 pair
1485cabdff1aSopenharmony_ci    PALIGNR                 m2, m1, m0, 2, m3
1486cabdff1aSopenharmony_ci    PALIGNR                 m3, m1, m0, 4, m4
1487cabdff1aSopenharmony_ci    LOWPASS                  3,  2,  0
1488cabdff1aSopenharmony_ci    pavgw                   m2, m0
1489cabdff1aSopenharmony_ci%if cpuflag(ssse3)
1490cabdff1aSopenharmony_ci    mova                    m4, [pb_2to15_14_15]
1491cabdff1aSopenharmony_ci%endif
    ; high half: no further input exists, so the shifted copies come from
    ; SHIFT_RIGHTx2 instead of palignr
1492cabdff1aSopenharmony_ci    SHIFT_RIGHTx2           m5, m0, m1, m4
1493cabdff1aSopenharmony_ci    LOWPASS                  0,  5,  1
1494cabdff1aSopenharmony_ci    pavgw                   m1, m5
1495cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, cnt
1496cabdff1aSopenharmony_ci    mov                   cntd, 8
1497cabdff1aSopenharmony_ci
1498cabdff1aSopenharmony_ci.loop:
1499cabdff1aSopenharmony_ci    mova   [dstq+strideq*0+ 0], m2
1500cabdff1aSopenharmony_ci    mova   [dstq+strideq*0+16], m1
1501cabdff1aSopenharmony_ci    mova   [dstq+strideq*1+ 0], m3
1502cabdff1aSopenharmony_ci    mova   [dstq+strideq*1+16], m0
1503cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*2]
1504cabdff1aSopenharmony_ci%if cpuflag(avx)
    ; three-operand form updates the low halves in place
1505cabdff1aSopenharmony_ci    vpalignr                m2, m1, m2, 2
1506cabdff1aSopenharmony_ci    vpalignr                m3, m0, m3, 2
1507cabdff1aSopenharmony_ci%else
    ; same shift, but built in m5 first and then copied back
1508cabdff1aSopenharmony_ci    PALIGNR                 m5, m1, m2, 2, m4
1509cabdff1aSopenharmony_ci    mova                    m2, m5
1510cabdff1aSopenharmony_ci    PALIGNR                 m5, m0, m3, 2, m4
1511cabdff1aSopenharmony_ci    mova                    m3, m5
1512cabdff1aSopenharmony_ci%endif
1513cabdff1aSopenharmony_ci    SHIFT_RIGHT             m1, m1, m4
1514cabdff1aSopenharmony_ci    SHIFT_RIGHT             m0, m0, m4
1515cabdff1aSopenharmony_ci    dec                   cntd
1516cabdff1aSopenharmony_ci    jg .loop                                        ; repeat while cnt > 0
1517cabdff1aSopenharmony_ci    RET
1518cabdff1aSopenharmony_ci
;------------------------------------------------------------------------------
; vp9_ipred_vl_32x32_16(dst, stride, l, a)
; Uses only the a edge; l is never read. The stack reservation is the first
; macro parameter times mmsize, and is non-zero only when ARCH_X86_32 is 1.
;
; Main loop (cnt = 8) stores four rows per iteration, 16 bytes per column:
;   strideq*0 : m6 m4 m2 m0          strideq*1 : m7 m5 m3 m1
;   stride16q : m4 m2 m0 [m9]        stride17q : m5 m3 m1 [m9]
; Rows +16 and +17 reuse the registers of rows +0 and +1 moved one column to
; the left. The m9 column is stored inside the loop on x86-64 only; x86-32
; builds fill that column after the loop from m0.
;
; NOTE(review): LOWPASS, PALIGNR, SHIFT_RIGHT(x2), SCRATCH, UNSCRATCH and
; PRELOAD are defined outside this chunk. SCRATCH and UNSCRATCH appear to park
; a register either in a high register or in the given stack slot depending
; on the architecture -- confirm against their definitions.
;------------------------------------------------------------------------------
1519cabdff1aSopenharmony_cicglobal vp9_ipred_vl_32x32_16, 2, 5, 11, %1 * mmsize * ARCH_X86_32, dst, stride, l, a
1520cabdff1aSopenharmony_ci    movifnidn               aq, amp                 ; only 2 args are auto-loaded, so fetch a here
1521cabdff1aSopenharmony_ci    mova                    m0, [aq+mmsize*0]
1522cabdff1aSopenharmony_ci    mova                    m1, [aq+mmsize*1]
1523cabdff1aSopenharmony_ci    mova                    m2, [aq+mmsize*2]
    ; column 0: m6 = pavgw row, m7 = LOWPASS row
1524cabdff1aSopenharmony_ci    PALIGNR                 m6, m1, m0, 2, m5
1525cabdff1aSopenharmony_ci    PALIGNR                 m7, m1, m0, 4, m5
1526cabdff1aSopenharmony_ci    LOWPASS                  7,  6,  0
1527cabdff1aSopenharmony_ci    pavgw                   m6, m0
1528cabdff1aSopenharmony_ci    SCRATCH                  6,  8, rsp+0*mmsize
    ; column 1: m4 = pavgw row, m5 = LOWPASS row
1529cabdff1aSopenharmony_ci    PALIGNR                 m4, m2, m1, 2, m0
1530cabdff1aSopenharmony_ci    PALIGNR                 m5, m2, m1, 4, m0
1531cabdff1aSopenharmony_ci    LOWPASS                  5,  4,  1
1532cabdff1aSopenharmony_ci    pavgw                   m4, m1
    ; column 2: m2 = pavgw row, m3 = LOWPASS row
1533cabdff1aSopenharmony_ci    mova                    m0, [aq+mmsize*3]
1534cabdff1aSopenharmony_ci    PALIGNR                 m1, m0, m2, 2, m6
1535cabdff1aSopenharmony_ci    PALIGNR                 m3, m0, m2, 4, m6
1536cabdff1aSopenharmony_ci    LOWPASS                  3,  1,  2
1537cabdff1aSopenharmony_ci    pavgw                   m2, m1
1538cabdff1aSopenharmony_ci%if cpuflag(ssse3)
1539cabdff1aSopenharmony_ci    PRELOAD                 10, pb_2to15_14_15, shuf
1540cabdff1aSopenharmony_ci%endif
    ; column 3: m0 = pavgw row, m1 = LOWPASS row; no further input exists, so
    ; the shifted copies come from SHIFT_RIGHTx2 instead of palignr
1541cabdff1aSopenharmony_ci    SHIFT_RIGHTx2           m6, m1, m0, reg_shuf
1542cabdff1aSopenharmony_ci    LOWPASS                  1,  6,  0
1543cabdff1aSopenharmony_ci    pavgw                   m0, m6
1544cabdff1aSopenharmony_ci%if ARCH_X86_64
1545cabdff1aSopenharmony_ci    pshufd                  m9, m6, q3333           ; every dword of m9 = top dword of m6
1546cabdff1aSopenharmony_ci%endif
1547cabdff1aSopenharmony_ci%if cpuflag(avx)
    ; AVX keeps m6 live across the whole loop, so restore it once here
1548cabdff1aSopenharmony_ci    UNSCRATCH                6,  8, rsp+0*mmsize
1549cabdff1aSopenharmony_ci%endif
1550cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, cnt, stride16, stride17
1551cabdff1aSopenharmony_ci    mov              stride16q, strideq
1552cabdff1aSopenharmony_ci    mov                   cntd, 8
1553cabdff1aSopenharmony_ci    shl              stride16q, 4                   ; stride16 = 16 * stride
1554cabdff1aSopenharmony_ci    lea              stride17q, [stride16q+strideq] ; stride17 = 17 * stride
1555cabdff1aSopenharmony_ci
1556cabdff1aSopenharmony_ci    ; FIXME m8 is unused for avx, so we could save one register here for win64
1557cabdff1aSopenharmony_ci.loop:
1558cabdff1aSopenharmony_ci%if notcpuflag(avx)
    ; non-AVX builds park m6 at the end of every iteration, so fetch it back
1559cabdff1aSopenharmony_ci    UNSCRATCH                6,  8, rsp+0*mmsize
1560cabdff1aSopenharmony_ci%endif
1561cabdff1aSopenharmony_ci    mova   [dstq+strideq*0+ 0], m6
1562cabdff1aSopenharmony_ci    mova   [dstq+strideq*0+16], m4
1563cabdff1aSopenharmony_ci    mova   [dstq+strideq*0+32], m2
1564cabdff1aSopenharmony_ci    mova   [dstq+strideq*0+48], m0
1565cabdff1aSopenharmony_ci    mova   [dstq+strideq*1+ 0], m7
1566cabdff1aSopenharmony_ci    mova   [dstq+strideq*1+16], m5
1567cabdff1aSopenharmony_ci    mova   [dstq+strideq*1+32], m3
1568cabdff1aSopenharmony_ci    mova   [dstq+strideq*1+48], m1
1569cabdff1aSopenharmony_ci    mova   [dstq+stride16q+ 0], m4
1570cabdff1aSopenharmony_ci    mova   [dstq+stride16q+16], m2
1571cabdff1aSopenharmony_ci    mova   [dstq+stride16q+32], m0
1572cabdff1aSopenharmony_ci%if ARCH_X86_64
1573cabdff1aSopenharmony_ci    mova   [dstq+stride16q+48], m9
1574cabdff1aSopenharmony_ci%endif
1575cabdff1aSopenharmony_ci    mova   [dstq+stride17q+ 0], m5
1576cabdff1aSopenharmony_ci    mova   [dstq+stride17q+16], m3
1577cabdff1aSopenharmony_ci    mova   [dstq+stride17q+32], m1
1578cabdff1aSopenharmony_ci%if ARCH_X86_64
1579cabdff1aSopenharmony_ci    mova   [dstq+stride17q+48], m9
1580cabdff1aSopenharmony_ci%endif
1581cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*2]
    ; Advance every column by one word: each register pulls in the bottom word
    ; of the column to its right.
1582cabdff1aSopenharmony_ci%if cpuflag(avx)
1583cabdff1aSopenharmony_ci    vpalignr                m6, m4, m6, 2
1584cabdff1aSopenharmony_ci    vpalignr                m4, m2, m4, 2
1585cabdff1aSopenharmony_ci    vpalignr                m2, m0, m2, 2
1586cabdff1aSopenharmony_ci    vpalignr                m7, m5, m7, 2
1587cabdff1aSopenharmony_ci    vpalignr                m5, m3, m5, 2
1588cabdff1aSopenharmony_ci    vpalignr                m3, m1, m3, 2
1589cabdff1aSopenharmony_ci%else
    ; No spare register here: m3 is parked so it can serve as the PALIGNR
    ; destination, and without SSSE3 m1 is parked as well so it can be passed
    ; as the extra PALIGNR operand.
1590cabdff1aSopenharmony_ci    SCRATCH                  3,  8, rsp+0*mmsize
1591cabdff1aSopenharmony_ci%if notcpuflag(ssse3)
1592cabdff1aSopenharmony_ci    SCRATCH                  1, 10, rsp+1*mmsize
1593cabdff1aSopenharmony_ci%endif
1594cabdff1aSopenharmony_ci    PALIGNR                 m3, m4, m6, 2, m1
1595cabdff1aSopenharmony_ci    mova                    m6, m3
1596cabdff1aSopenharmony_ci    PALIGNR                 m3, m2, m4, 2, m1
1597cabdff1aSopenharmony_ci    mova                    m4, m3
1598cabdff1aSopenharmony_ci    PALIGNR                 m3, m0, m2, 2, m1
1599cabdff1aSopenharmony_ci    mova                    m2, m3
1600cabdff1aSopenharmony_ci    PALIGNR                 m3, m5, m7, 2, m1
1601cabdff1aSopenharmony_ci    mova                    m7, m3
    ; Swap roles: bring m3 (and m1) back, park the updated m6 (and m7), and
    ; use m6 as the PALIGNR destination for the remaining two columns.
1602cabdff1aSopenharmony_ci    UNSCRATCH                3,  8, rsp+0*mmsize
1603cabdff1aSopenharmony_ci    SCRATCH                  6,  8, rsp+0*mmsize
1604cabdff1aSopenharmony_ci%if notcpuflag(ssse3)
1605cabdff1aSopenharmony_ci    UNSCRATCH                1, 10, rsp+1*mmsize
1606cabdff1aSopenharmony_ci    SCRATCH                  7, 10, rsp+1*mmsize
1607cabdff1aSopenharmony_ci%endif
1608cabdff1aSopenharmony_ci    PALIGNR                 m6, m3, m5, 2, m7
1609cabdff1aSopenharmony_ci    mova                    m5, m6
1610cabdff1aSopenharmony_ci    PALIGNR                 m6, m1, m3, 2, m7
1611cabdff1aSopenharmony_ci    mova                    m3, m6
1612cabdff1aSopenharmony_ci%if notcpuflag(ssse3)
1613cabdff1aSopenharmony_ci    UNSCRATCH                7, 10, rsp+1*mmsize
1614cabdff1aSopenharmony_ci%endif
1615cabdff1aSopenharmony_ci%endif
1616cabdff1aSopenharmony_ci    SHIFT_RIGHT             m1, m1, reg_shuf
1617cabdff1aSopenharmony_ci    SHIFT_RIGHT             m0, m0, reg_shuf
1618cabdff1aSopenharmony_ci    dec                   cntd
1619cabdff1aSopenharmony_ci    jg .loop                                        ; repeat while cnt > 0
1620cabdff1aSopenharmony_ci
1621cabdff1aSopenharmony_ci%if ARCH_X86_32
    ; dst has advanced 16 rows in the loop. Fill the last 16-byte column of the
    ; remaining 16 rows from m0, four rows per repetition.
    ; NOTE(review): this relies on m0 holding a single repeated value after the
    ; eight SHIFT_RIGHT steps -- confirm against SHIFT_RIGHT.
1622cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, stride3
1623cabdff1aSopenharmony_ci    lea               stride3q, [strideq*3]
1624cabdff1aSopenharmony_ci%assign %%n 0
1625cabdff1aSopenharmony_ci%rep 4
1626cabdff1aSopenharmony_ci    mova   [dstq+strideq*0+48], m0
1627cabdff1aSopenharmony_ci    mova   [dstq+strideq*1+48], m0
1628cabdff1aSopenharmony_ci    mova   [dstq+strideq*2+48], m0
1629cabdff1aSopenharmony_ci    mova   [dstq+stride3q +48], m0
1630cabdff1aSopenharmony_ci%if %%n < 3
    ; the final repetition does not need to advance dst
1631cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*4]
1632cabdff1aSopenharmony_ci%endif
1633cabdff1aSopenharmony_ci%assign %%n (%%n+1)
1634cabdff1aSopenharmony_ci%endrep
1635cabdff1aSopenharmony_ci%endif
1636cabdff1aSopenharmony_ci    RET
1637cabdff1aSopenharmony_ci%endmacro
1638cabdff1aSopenharmony_ci
; Emit the VL_FUNCS bodies once per instruction-set level. The macro argument
; is the factor applied to mmsize for the x86-32 stack reservation of
; vp9_ipred_vl_32x32_16: that function addresses a second stack slot only
; when SSSE3 is unavailable, hence 2 for SSE2 and 1 for SSSE3 and AVX.
1639cabdff1aSopenharmony_ciINIT_XMM sse2
1640cabdff1aSopenharmony_ciVL_FUNCS 2
1641cabdff1aSopenharmony_ciINIT_XMM ssse3
1642cabdff1aSopenharmony_ciVL_FUNCS 1
1643cabdff1aSopenharmony_ciINIT_XMM avx
1644cabdff1aSopenharmony_ciVL_FUNCS 1
1645cabdff1aSopenharmony_ci
1646cabdff1aSopenharmony_ci%macro VR_FUNCS 0
;------------------------------------------------------------------------------
; vp9_ipred_vr_4x4_16(dst, stride, l, a)
; All four arguments are loaded into registers by cglobal. The above edge is
; read starting one word before a (unaligned) and the left edge is loaded into
; the high half of m1; both are merged into one register whose layout is
; shown by the lettered comments. Each row is 8 bytes: rows 0 and 1 are the
; high halves of m1 and m2, rows 2 and 3 are rearranged from the same data and
; stored from the low halves of m0 and m2.
; NOTE(review): LOWPASS and PALIGNR are defined outside this chunk; capital
; letters in the comments presumably mark filtered values -- confirm.
;------------------------------------------------------------------------------
1647cabdff1aSopenharmony_cicglobal vp9_ipred_vr_4x4_16, 4, 4, 3, dst, stride, l, a
1648cabdff1aSopenharmony_ci    movu                    m0, [aq-2]
1649cabdff1aSopenharmony_ci    movhps                  m1, [lq]
1650cabdff1aSopenharmony_ci    PALIGNR                 m0, m1, 10, m2          ; xyz*abcd
1651cabdff1aSopenharmony_ci    pslldq                  m1, m0, 2               ; .xyz*abc
1652cabdff1aSopenharmony_ci    pslldq                  m2, m0, 4               ; ..xyz*ab
1653cabdff1aSopenharmony_ci    LOWPASS                  2,  1, 0               ; ..YZ#ABC
1654cabdff1aSopenharmony_ci    pavgw                   m1, m0                  ; ....#ABC
1655cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, stride3
1656cabdff1aSopenharmony_ci    lea               stride3q, [strideq*3]
1657cabdff1aSopenharmony_ci
1658cabdff1aSopenharmony_ci    movhps    [dstq+strideq*0], m1
1659cabdff1aSopenharmony_ci    movhps    [dstq+strideq*1], m2
    ; build rows 2 and 3; the SSSE3 path does the row 3 rearrangement with a
    ; single byte shuffle, the fallback with pshuflw followed by a byte shift
1660cabdff1aSopenharmony_ci    shufps                  m0, m2, m1, q3210
1661cabdff1aSopenharmony_ci%if cpuflag(ssse3)
1662cabdff1aSopenharmony_ci    pshufb                  m2, [pb_4_5_8to13_8x0]
1663cabdff1aSopenharmony_ci%else
1664cabdff1aSopenharmony_ci    pshuflw                 m2, m2, q2222
1665cabdff1aSopenharmony_ci    psrldq                  m2, 6
1666cabdff1aSopenharmony_ci%endif
1667cabdff1aSopenharmony_ci    psrldq                  m0, 6
1668cabdff1aSopenharmony_ci    movh      [dstq+strideq*2], m0
1669cabdff1aSopenharmony_ci    movh      [dstq+stride3q ], m2
1670cabdff1aSopenharmony_ci    RET
1671cabdff1aSopenharmony_ci
;------------------------------------------------------------------------------
; vp9_ipred_vr_8x8_16(dst, stride, l, a)
; Rows 0 and 1 are m0 (pavgw of the above edge with itself offset by one
; word) and m3 (LOWPASS output over the above edge). Every later row is the
; row two above it moved up by one word, with the vacated low word taken
; from the top word of m4 (LOWPASS output over the left edge). m0 and m3
; consume m4 alternately; after each use m4 is shifted up by one word so the
; next value is in the top position.
; NOTE(review): LOWPASS and PALIGNR are defined outside this chunk; the
; description assumes the four-operand PALIGNR form is dst, src, bytes, tmp
; with palignr semantics -- confirm against the macro.
;------------------------------------------------------------------------------
1672cabdff1aSopenharmony_cicglobal vp9_ipred_vr_8x8_16, 4, 4, 5, dst, stride, l, a
1673cabdff1aSopenharmony_ci    movu                    m1, [aq-2]              ; *abcdefg
1674cabdff1aSopenharmony_ci    movu                    m2, [lq]                ; stuvwxyz
1675cabdff1aSopenharmony_ci    mova                    m0, [aq]                ; abcdefgh
1676cabdff1aSopenharmony_ci    PALIGNR                 m3, m1, m2, 14, m4      ; z*abcdef
1677cabdff1aSopenharmony_ci    LOWPASS                  3,  1,  0
1678cabdff1aSopenharmony_ci    pavgw                   m0, m1
1679cabdff1aSopenharmony_ci    PALIGNR                 m1, m2,  2, m4          ; tuvwxyz*
1680cabdff1aSopenharmony_ci    pslldq                  m4, m2,  2              ; .stuvwxy
1681cabdff1aSopenharmony_ci    LOWPASS                  4,  2,  1
1682cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, stride3
1683cabdff1aSopenharmony_ci    lea               stride3q, [strideq*3]
1684cabdff1aSopenharmony_ci
    ; rows 0-3
1685cabdff1aSopenharmony_ci    mova      [dstq+strideq*0], m0
1686cabdff1aSopenharmony_ci    mova      [dstq+strideq*1], m3
1687cabdff1aSopenharmony_ci    PALIGNR                 m0, m4, 14, m1
1688cabdff1aSopenharmony_ci    pslldq                  m4, 2
1689cabdff1aSopenharmony_ci    PALIGNR                 m3, m4, 14, m1
1690cabdff1aSopenharmony_ci    pslldq                  m4, 2
1691cabdff1aSopenharmony_ci    mova      [dstq+strideq*2], m0
1692cabdff1aSopenharmony_ci    mova      [dstq+stride3q ], m3
1693cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*4]
    ; rows 4-7 (dst now points at row 4)
1694cabdff1aSopenharmony_ci    PALIGNR                 m0, m4, 14, m1
1695cabdff1aSopenharmony_ci    pslldq                  m4, 2
1696cabdff1aSopenharmony_ci    PALIGNR                 m3, m4, 14, m1
1697cabdff1aSopenharmony_ci    pslldq                  m4, 2
1698cabdff1aSopenharmony_ci    mova      [dstq+strideq*0], m0
1699cabdff1aSopenharmony_ci    mova      [dstq+strideq*1], m3
1700cabdff1aSopenharmony_ci    PALIGNR                 m0, m4, 14, m1
1701cabdff1aSopenharmony_ci    pslldq                  m4, 2
1702cabdff1aSopenharmony_ci    PALIGNR                 m3, m4, 14, m4          ; last use of m4, so it doubles as the tmp operand
1703cabdff1aSopenharmony_ci    mova      [dstq+strideq*2], m0
1704cabdff1aSopenharmony_ci    mova      [dstq+stride3q ], m3
1705cabdff1aSopenharmony_ci    RET
1706cabdff1aSopenharmony_ci
;------------------------------------------------------------------------------
; vp9_ipred_vr_16x16_16(dst, stride, l, a)
; Writes 16 rows of two vector registers each, two rows per loop iteration
; (cnt = 8). Register pairs, low half first:
;   m3:m2  row stored at strideq*0 (pavgw outputs over the above edge)
;   m0:m6  row stored at strideq*1 (LOWPASS outputs over the above edge)
; After each iteration both rows move up by one word: the high half pulls in
; the top word of the low half, and the low half pulls in the top word of m7
; (for m3) or m5 (for m0), which hold values derived from the left edge and
; are then shifted up by one word.
; NOTE(review): LOWPASS and PALIGNR are defined outside this chunk, as is
; pd_65535 (presumably a 0x0000ffff mask per dword) -- confirm.
;------------------------------------------------------------------------------
1707cabdff1aSopenharmony_cicglobal vp9_ipred_vr_16x16_16, 4, 4, 8, dst, stride, l, a
1708cabdff1aSopenharmony_ci    movu                    m1, [aq-2]              ; *abcdefg
1709cabdff1aSopenharmony_ci    movu                    m2, [aq+mmsize-2]       ; hijklmno
1710cabdff1aSopenharmony_ci    mova                    m3, [aq]                ; abcdefgh
1711cabdff1aSopenharmony_ci    mova                    m4, [aq+mmsize]         ; ijklmnop
1712cabdff1aSopenharmony_ci    mova                    m5, [lq+mmsize]         ; stuvwxyz
1713cabdff1aSopenharmony_ci    PALIGNR                 m0, m1, m5, 14, m6      ; z*abcdef
1714cabdff1aSopenharmony_ci    movu                    m6, [aq+mmsize-4]       ; ghijklmn
1715cabdff1aSopenharmony_ci    LOWPASS                  6,  2,  4
1716cabdff1aSopenharmony_ci    pavgw                   m2, m4
1717cabdff1aSopenharmony_ci    LOWPASS                  0,  1,  3
1718cabdff1aSopenharmony_ci    pavgw                   m3, m1
1719cabdff1aSopenharmony_ci    PALIGNR                 m1, m5,  2, m7          ; tuvwxyz*
1720cabdff1aSopenharmony_ci    movu                    m7, [lq+mmsize-2]       ; rstuvwxy
1721cabdff1aSopenharmony_ci    LOWPASS                  1,  5,  7
1722cabdff1aSopenharmony_ci    movu                    m5, [lq+2]              ; lmnopqrs
1723cabdff1aSopenharmony_ci    pslldq                  m4, m5,  2              ; .lmnopqr
1724cabdff1aSopenharmony_ci    pslldq                  m7, m5,  4              ; ..lmnopq
1725cabdff1aSopenharmony_ci    LOWPASS                  5,  4,  7
    ; Split the left-edge words in m5 and m1 by position: the 16-bit right
    ; shift keeps the odd-indexed word of each dword, the mask keeps the
    ; even-indexed one. packssdw then gathers the odd words into m7 and the
    ; even words into m5, so each row register can take one word per
    ; iteration from its own source.
    ; NOTE(review): packssdw saturates as signed; this presumably relies on
    ; the values fitting in 15 bits -- confirm.
1726cabdff1aSopenharmony_ci    psrld                   m4, m1, 16
1727cabdff1aSopenharmony_ci    psrld                   m7, m5, 16
1728cabdff1aSopenharmony_ci    pand                    m1, [pd_65535]
1729cabdff1aSopenharmony_ci    pand                    m5, [pd_65535]
1730cabdff1aSopenharmony_ci    packssdw                m7, m4
1731cabdff1aSopenharmony_ci    packssdw                m5, m1
1732cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, cnt
1733cabdff1aSopenharmony_ci    mov                   cntd, 8
1734cabdff1aSopenharmony_ci
1735cabdff1aSopenharmony_ci.loop:
1736cabdff1aSopenharmony_ci    mova   [dstq+strideq*0+ 0], m3
1737cabdff1aSopenharmony_ci    mova   [dstq+strideq*0+16], m2
1738cabdff1aSopenharmony_ci    mova   [dstq+strideq*1+ 0], m0
1739cabdff1aSopenharmony_ci    mova   [dstq+strideq*1+16], m6
1740cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*2]
    ; m4 is only the PALIGNR tmp operand from here on
1741cabdff1aSopenharmony_ci    PALIGNR                 m2, m3, 14, m4
1742cabdff1aSopenharmony_ci    PALIGNR                 m3, m7, 14, m4
1743cabdff1aSopenharmony_ci    pslldq                  m7, 2
1744cabdff1aSopenharmony_ci    PALIGNR                 m6, m0, 14, m4
1745cabdff1aSopenharmony_ci    PALIGNR                 m0, m5, 14, m4
1746cabdff1aSopenharmony_ci    pslldq                  m5, 2
1747cabdff1aSopenharmony_ci    dec                   cntd
1748cabdff1aSopenharmony_ci    jg .loop                                        ; repeat while cnt > 0
1749cabdff1aSopenharmony_ci    RET
1750cabdff1aSopenharmony_ci
;------------------------------------------------------------------------------
; vp9_ipred_vr_32x32_16(dst, stride, l, a)
; Declares 14 vector registers and, when ARCH_X86_32 is 1, six mmsize-sized
; stack slots.
;
; x86-64: one loop (cnt = 8) stores four rows per iteration, 16 bytes per
; column:
;   strideq*0 : m4  m1  m2  m3       strideq*1 : m11 m10 m9  m8
;   stride16q : m6  m4  m1  m2       stride17q : m12 m11 m10 m9
; then moves every register up by one word along the chains
;   m3 <- m2 <- m1 <- m4 <- m6 <- m7     and     m8 <- m9 <- ... <- m12 <- m0.
;
; x86-32: the first loop stores only the strideq*0 and stride16q rows. A
; second loop then reloads the parked values into m1-m5, restarts from
; dst + stride and stores the remaining rows with the chain
;   m1 <- m2 <- m3 <- m4 <- m5 <- m0.
;
; NOTE(review): LOWPASS, PALIGNR, SCRATCH and UNSCRATCH are defined outside
; this chunk, as is pd_65535 (presumably a 0x0000ffff mask per dword).
; SCRATCH and UNSCRATCH appear to park a register either in a high register
; or in the given stack slot depending on the architecture -- confirm.
;------------------------------------------------------------------------------
1751cabdff1aSopenharmony_cicglobal vp9_ipred_vr_32x32_16, 4, 5, 14, 6 * mmsize * ARCH_X86_32, dst, stride, l, a
1752cabdff1aSopenharmony_ci    movu                    m0, [aq+mmsize*0-2]     ; *a[0-6]
1753cabdff1aSopenharmony_ci    movu                    m1, [aq+mmsize*1-2]     ; a[7-14]
1754cabdff1aSopenharmony_ci    movu                    m2, [aq+mmsize*2-2]     ; a[15-22]
1755cabdff1aSopenharmony_ci    movu                    m3, [aq+mmsize*3-2]     ; a[23-30]
1756cabdff1aSopenharmony_ci    mova                    m4, [aq+mmsize*3+0]     ; a[24-31]
1757cabdff1aSopenharmony_ci    movu                    m5, [aq+mmsize*3-4]     ; a[22-29]
1758cabdff1aSopenharmony_ci    LOWPASS                  5,  3,  4              ; A[23-30]
1759cabdff1aSopenharmony_ci    SCRATCH                  5,  8, rsp+0*mmsize
1760cabdff1aSopenharmony_ci    pavgw                   m3, m4
1761cabdff1aSopenharmony_ci    mova                    m4, [aq+mmsize*2+0]     ; a[16-23]
1762cabdff1aSopenharmony_ci    movu                    m6, [aq+mmsize*2-4]     ; a[14-21]
1763cabdff1aSopenharmony_ci    LOWPASS                  6,  2,  4              ; A[15-22]
1764cabdff1aSopenharmony_ci    SCRATCH                  6,  9, rsp+1*mmsize
1765cabdff1aSopenharmony_ci    pavgw                   m2, m4
1766cabdff1aSopenharmony_ci    mova                    m4, [aq+mmsize*1+0]     ; a[8-15]
1767cabdff1aSopenharmony_ci    movu                    m7, [aq+mmsize*1-4]     ; a[6-13]
1768cabdff1aSopenharmony_ci    LOWPASS                  7,  1,  4              ; A[7-14]
1769cabdff1aSopenharmony_ci    SCRATCH                  7, 10, rsp+2*mmsize
1770cabdff1aSopenharmony_ci    pavgw                   m1, m4
1771cabdff1aSopenharmony_ci    mova                    m4, [aq+mmsize*0+0]     ; a[0-7]
1772cabdff1aSopenharmony_ci    mova                    m5, [lq+mmsize*3+0]     ; l[24-31]
1773cabdff1aSopenharmony_ci    PALIGNR                 m6, m0, m5, 14, m7      ; l[31]*a[0-5]
1774cabdff1aSopenharmony_ci    LOWPASS                  6,  0,  4              ; #A[0-6]
1775cabdff1aSopenharmony_ci    SCRATCH                  6, 11, rsp+3*mmsize
1776cabdff1aSopenharmony_ci    pavgw                   m4, m0
1777cabdff1aSopenharmony_ci    PALIGNR                 m0, m5,  2, m7          ; l[25-31]*
1778cabdff1aSopenharmony_ci    movu                    m7, [lq+mmsize*3-2]     ; l[23-30]
1779cabdff1aSopenharmony_ci    LOWPASS                  0,  5,  7              ; L[24-31]
1780cabdff1aSopenharmony_ci    movu                    m5, [lq+mmsize*2-2]     ; l[15-22]
1781cabdff1aSopenharmony_ci    mova                    m7, [lq+mmsize*2+0]     ; l[16-23]
1782cabdff1aSopenharmony_ci    movu                    m6, [lq+mmsize*2+2]     ; l[17-24]
1783cabdff1aSopenharmony_ci    LOWPASS                  5,  7,  6              ; L[16-23]
    ; Split L[16-31] by word position: the 16-bit right shift keeps the
    ; odd-indexed word of each dword, the mask keeps the even-indexed one;
    ; packssdw gathers the odd words into m6 and the even words into m5.
1784cabdff1aSopenharmony_ci    psrld                   m7, m0, 16
1785cabdff1aSopenharmony_ci    psrld                   m6, m5, 16
1786cabdff1aSopenharmony_ci    pand                    m0, [pd_65535]
1787cabdff1aSopenharmony_ci    pand                    m5, [pd_65535]
1788cabdff1aSopenharmony_ci    packssdw                m6, m7
1789cabdff1aSopenharmony_ci    packssdw                m5, m0
1790cabdff1aSopenharmony_ci    SCRATCH                  5, 12, rsp+4*mmsize
1791cabdff1aSopenharmony_ci    SCRATCH                  6, 13, rsp+5*mmsize
1792cabdff1aSopenharmony_ci    movu                    m6, [lq+mmsize*1-2]     ; l[7-14]
1793cabdff1aSopenharmony_ci    mova                    m0, [lq+mmsize*1+0]     ; l[8-15]
1794cabdff1aSopenharmony_ci    movu                    m5, [lq+mmsize*1+2]     ; l[9-16]
1795cabdff1aSopenharmony_ci    LOWPASS                  6,  0,  5              ; L[8-15]
1796cabdff1aSopenharmony_ci    movu                    m0, [lq+mmsize*0+2]     ; l[1-8]
1797cabdff1aSopenharmony_ci    pslldq                  m5, m0,  2              ; .l[1-7]
1798cabdff1aSopenharmony_ci    pslldq                  m7, m0,  4              ; ..l[1-6]
1799cabdff1aSopenharmony_ci    LOWPASS                  0,  5,  7
    ; Same odd/even split for the first half of the left edge: odd words end
    ; up in m7, even words in m0.
1800cabdff1aSopenharmony_ci    psrld                   m5, m6, 16
1801cabdff1aSopenharmony_ci    psrld                   m7, m0, 16
1802cabdff1aSopenharmony_ci    pand                    m6, [pd_65535]
1803cabdff1aSopenharmony_ci    pand                    m0, [pd_65535]
1804cabdff1aSopenharmony_ci    packssdw                m7, m5
1805cabdff1aSopenharmony_ci    packssdw                m0, m6
1806cabdff1aSopenharmony_ci    UNSCRATCH                6, 13, rsp+5*mmsize
1807cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, stride16, cnt, stride17
1808cabdff1aSopenharmony_ci    mov              stride16q, strideq
1809cabdff1aSopenharmony_ci    mov                   cntd, 8
1810cabdff1aSopenharmony_ci    shl              stride16q, 4                   ; stride16 = 16 * stride
1811cabdff1aSopenharmony_ci%if ARCH_X86_64
    ; stride17 is only referenced by the x86-64 stores below
1812cabdff1aSopenharmony_ci    lea              stride17q, [stride16q+strideq]
1813cabdff1aSopenharmony_ci%endif
1814cabdff1aSopenharmony_ci
1815cabdff1aSopenharmony_ci.loop:
1816cabdff1aSopenharmony_ci    mova   [dstq+strideq*0+ 0], m4
1817cabdff1aSopenharmony_ci    mova   [dstq+strideq*0+16], m1
1818cabdff1aSopenharmony_ci    mova   [dstq+strideq*0+32], m2
1819cabdff1aSopenharmony_ci    mova   [dstq+strideq*0+48], m3
1820cabdff1aSopenharmony_ci%if ARCH_X86_64
1821cabdff1aSopenharmony_ci    mova   [dstq+strideq*1+ 0], m11
1822cabdff1aSopenharmony_ci    mova   [dstq+strideq*1+16], m10
1823cabdff1aSopenharmony_ci    mova   [dstq+strideq*1+32], m9
1824cabdff1aSopenharmony_ci    mova   [dstq+strideq*1+48], m8
1825cabdff1aSopenharmony_ci%endif
1826cabdff1aSopenharmony_ci    mova   [dstq+stride16q+ 0], m6
1827cabdff1aSopenharmony_ci    mova   [dstq+stride16q+16], m4
1828cabdff1aSopenharmony_ci    mova   [dstq+stride16q+32], m1
1829cabdff1aSopenharmony_ci    mova   [dstq+stride16q+48], m2
1830cabdff1aSopenharmony_ci%if ARCH_X86_64
1831cabdff1aSopenharmony_ci    mova   [dstq+stride17q+ 0], m12
1832cabdff1aSopenharmony_ci    mova   [dstq+stride17q+16], m11
1833cabdff1aSopenharmony_ci    mova   [dstq+stride17q+32], m10
1834cabdff1aSopenharmony_ci    mova   [dstq+stride17q+48], m9
1835cabdff1aSopenharmony_ci%endif
1836cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*2]
    ; m5 is only the PALIGNR tmp operand inside this loop
1837cabdff1aSopenharmony_ci    PALIGNR                 m3, m2,  14, m5
1838cabdff1aSopenharmony_ci    PALIGNR                 m2, m1,  14, m5
1839cabdff1aSopenharmony_ci    PALIGNR                 m1, m4,  14, m5
1840cabdff1aSopenharmony_ci    PALIGNR                 m4, m6,  14, m5
1841cabdff1aSopenharmony_ci    PALIGNR                 m6, m7,  14, m5
1842cabdff1aSopenharmony_ci    pslldq                  m7, 2
1843cabdff1aSopenharmony_ci%if ARCH_X86_64
1844cabdff1aSopenharmony_ci    PALIGNR                 m8, m9,  14, m5
1845cabdff1aSopenharmony_ci    PALIGNR                 m9, m10, 14, m5
1846cabdff1aSopenharmony_ci    PALIGNR                m10, m11, 14, m5
1847cabdff1aSopenharmony_ci    PALIGNR                m11, m12, 14, m5
1848cabdff1aSopenharmony_ci    PALIGNR                m12, m0,  14, m5
1849cabdff1aSopenharmony_ci    pslldq                  m0, 2
1850cabdff1aSopenharmony_ci%endif
1851cabdff1aSopenharmony_ci    dec                   cntd
1852cabdff1aSopenharmony_ci    jg .loop                                        ; repeat while cnt > 0
1853cabdff1aSopenharmony_ci
1854cabdff1aSopenharmony_ci%if ARCH_X86_32
    ; Second pass for the rows skipped above. m0 was left untouched by the
    ; first loop on this architecture, so it still feeds the chain here.
1855cabdff1aSopenharmony_ci    UNSCRATCH                5, 12, rsp+4*mmsize
1856cabdff1aSopenharmony_ci    UNSCRATCH                4, 11, rsp+3*mmsize
1857cabdff1aSopenharmony_ci    UNSCRATCH                3, 10, rsp+2*mmsize
1858cabdff1aSopenharmony_ci    UNSCRATCH                2,  9, rsp+1*mmsize
1859cabdff1aSopenharmony_ci    UNSCRATCH                1,  8, rsp+0*mmsize
1860cabdff1aSopenharmony_ci    mov                   dstq, dstm                ; reload the original dst argument
1861cabdff1aSopenharmony_ci    mov                   cntd, 8
1862cabdff1aSopenharmony_ci    add                   dstq, strideq             ; start at row 1
1863cabdff1aSopenharmony_ci.loop2:
1864cabdff1aSopenharmony_ci    mova   [dstq+strideq*0+ 0], m4
1865cabdff1aSopenharmony_ci    mova   [dstq+strideq*0+16], m3
1866cabdff1aSopenharmony_ci    mova   [dstq+strideq*0+32], m2
1867cabdff1aSopenharmony_ci    mova   [dstq+strideq*0+48], m1
1868cabdff1aSopenharmony_ci    mova   [dstq+stride16q+ 0], m5
1869cabdff1aSopenharmony_ci    mova   [dstq+stride16q+16], m4
1870cabdff1aSopenharmony_ci    mova   [dstq+stride16q+32], m3
1871cabdff1aSopenharmony_ci    mova   [dstq+stride16q+48], m2
1872cabdff1aSopenharmony_ci    lea                   dstq, [dstq+strideq*2]
    ; m6 is only the PALIGNR tmp operand inside this loop
1873cabdff1aSopenharmony_ci    PALIGNR                 m1, m2,  14, m6
1874cabdff1aSopenharmony_ci    PALIGNR                 m2, m3,  14, m6
1875cabdff1aSopenharmony_ci    PALIGNR                 m3, m4,  14, m6
1876cabdff1aSopenharmony_ci    PALIGNR                 m4, m5,  14, m6
1877cabdff1aSopenharmony_ci    PALIGNR                 m5, m0,  14, m6
1878cabdff1aSopenharmony_ci    pslldq                  m0, 2
1879cabdff1aSopenharmony_ci    dec                   cntd
1880cabdff1aSopenharmony_ci    jg .loop2                                       ; repeat while cnt > 0
1881cabdff1aSopenharmony_ci%endif
1882cabdff1aSopenharmony_ci    RET
1883cabdff1aSopenharmony_ci%endmacro
1884cabdff1aSopenharmony_ci
; Emit the VR_FUNCS bodies once per instruction-set level. VR_FUNCS takes no
; macro arguments; the differences between the three builds come from the
; cpuflag() conditionals inside the function bodies.
1885cabdff1aSopenharmony_ciINIT_XMM sse2
1886cabdff1aSopenharmony_ciVR_FUNCS
1887cabdff1aSopenharmony_ciINIT_XMM ssse3
1888cabdff1aSopenharmony_ciVR_FUNCS
1889cabdff1aSopenharmony_ciINIT_XMM avx
1890cabdff1aSopenharmony_ciVR_FUNCS
1891cabdff1aSopenharmony_ci
1892cabdff1aSopenharmony_ci%macro HU_FUNCS 1 ; stack_mem_for_32x32_32bit_function
;------------------------------------------------------------------------------
; vp9_ipred_hu_4x4_16(dst, stride, l, a)
; Uses only the l edge; a is neither loaded nor read. Four 16-bit words are
; read from l and the register is padded by repeating the last one. The
; pavgw and LOWPASS outputs are interleaved word by word, and each 8-byte row
; starts two words further into that interleaved sequence than the row above
; it (see the lettered comments on the stores).
; NOTE(review): LOWPASS, SBUTTERFLY and PALIGNR are defined outside this
; chunk; capital letters in the comments presumably mark filtered values.
;------------------------------------------------------------------------------
1893cabdff1aSopenharmony_cicglobal vp9_ipred_hu_4x4_16, 3, 3, 3, dst, stride, l, a
1894cabdff1aSopenharmony_ci    movh                    m0, [lq]                ; abcd
1895cabdff1aSopenharmony_ci%if cpuflag(ssse3)
1896cabdff1aSopenharmony_ci    pshufb                  m0, [pb_0to7_67x4]      ; abcddddd
1897cabdff1aSopenharmony_ci%else
    ; same padding without a byte shuffle: duplicate the low half, then
    ; repeat its last word across the high half
1898cabdff1aSopenharmony_ci    punpcklqdq              m0, m0
1899cabdff1aSopenharmony_ci    pshufhw                 m0, m0, q3333           ; abcddddd
1900cabdff1aSopenharmony_ci%endif
1901cabdff1aSopenharmony_ci    psrldq                  m1, m0,  2              ; bcddddd.
1902cabdff1aSopenharmony_ci    psrldq                  m2, m0,  4              ; cddddd..
1903cabdff1aSopenharmony_ci    LOWPASS                  2,  1,  0              ; BCDddd..
1904cabdff1aSopenharmony_ci    pavgw                   m1, m0                  ; abcddddd
1905cabdff1aSopenharmony_ci    SBUTTERFLY          wd,  1,  2,  0              ; aBbCcDdd, dddddddd
1906cabdff1aSopenharmony_ci    PALIGNR                 m2, m1,  4, m0          ; bCcDdddd
1907cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, stride3
1908cabdff1aSopenharmony_ci    lea               stride3q, [strideq*3]
1909cabdff1aSopenharmony_ci
    ; rows 0 and 1 come from the low halves, rows 2 and 3 from the high halves
1910cabdff1aSopenharmony_ci    movh      [dstq+strideq*0], m1                  ; aBbC
1911cabdff1aSopenharmony_ci    movh      [dstq+strideq*1], m2                  ; bCcD
1912cabdff1aSopenharmony_ci    movhps    [dstq+strideq*2], m1                  ; cDdd
1913cabdff1aSopenharmony_ci    movhps    [dstq+stride3q ], m2                  ; dddd
1914cabdff1aSopenharmony_ci    RET
1915cabdff1aSopenharmony_ci
;------------------------------------------------------------------------------
; vp9_ipred_hu_8x8_16(dst, stride, l, a)
; Uses only the l edge; a is neither loaded nor read. The pavgw and LOWPASS
; outputs are interleaved word by word into m1 (first half) and m2 (second
; half). Rows are written in two passes of four stores each: rows 0, 2, 4, 6
; first, then, after advancing dst by one stride and moving the data on by
; 4 bytes, rows 1, 3, 5, 7.
; NOTE(review): LOWPASS, SHIFT_RIGHTx2, SBUTTERFLY and PALIGNR are defined
; outside this chunk; their semantics are assumed from usage -- confirm.
;------------------------------------------------------------------------------
1916cabdff1aSopenharmony_cicglobal vp9_ipred_hu_8x8_16, 3, 3, 4, dst, stride, l, a
1917cabdff1aSopenharmony_ci    mova                    m0, [lq]
1918cabdff1aSopenharmony_ci%if cpuflag(ssse3)
1919cabdff1aSopenharmony_ci    mova                    m3, [pb_2to15_14_15]
1920cabdff1aSopenharmony_ci%endif
    ; Without SSSE3, m3 is passed below without being loaded first; presumably
    ; SHIFT_RIGHTx2 then uses it as scratch -- TODO confirm.
1921cabdff1aSopenharmony_ci    SHIFT_RIGHTx2           m1, m2, m0, m3
1922cabdff1aSopenharmony_ci    LOWPASS                  2,  1,  0
1923cabdff1aSopenharmony_ci    pavgw                   m1, m0
1924cabdff1aSopenharmony_ci    SBUTTERFLY          wd,  1,  2,  0
1925cabdff1aSopenharmony_ci    shufps                  m0, m1, m2, q1032       ; high half of m1, then low half of m2
1926cabdff1aSopenharmony_ci    pshufd                  m3, m2, q3332           ; high half of m2, then its top dword repeated
1927cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, stride3
1928cabdff1aSopenharmony_ci    lea               stride3q, [strideq*3]
1929cabdff1aSopenharmony_ci
    ; pass 1: rows 0, 2, 4, 6
1930cabdff1aSopenharmony_ci    mova     [dstq+strideq *0], m1
1931cabdff1aSopenharmony_ci    mova     [dstq+strideq *2], m0
1932cabdff1aSopenharmony_ci    mova     [dstq+strideq *4], m2
1933cabdff1aSopenharmony_ci    mova     [dstq+stride3q*2], m3
1934cabdff1aSopenharmony_ci    add                   dstq, strideq
    ; move the m2:m1 sequence on by 4 bytes for the odd rows
1935cabdff1aSopenharmony_ci%if cpuflag(avx)
1936cabdff1aSopenharmony_ci    vpalignr                m1, m2, m1, 4
1937cabdff1aSopenharmony_ci%else
1938cabdff1aSopenharmony_ci    PALIGNR                 m0, m2, m1, 4, m3
1939cabdff1aSopenharmony_ci    mova                    m1, m0
1940cabdff1aSopenharmony_ci%endif
1941cabdff1aSopenharmony_ci    pshufd                  m2, m2, q3321           ; drop the lowest dword, repeat the top one
1942cabdff1aSopenharmony_ci    shufps                  m0, m1, m2, q1032
1943cabdff1aSopenharmony_ci    pshufd                  m3, m2, q3332
    ; pass 2: rows 1, 3, 5, 7
1944cabdff1aSopenharmony_ci    mova     [dstq+strideq *0], m1
1945cabdff1aSopenharmony_ci    mova     [dstq+strideq *2], m0
1946cabdff1aSopenharmony_ci    mova     [dstq+strideq *4], m2
1947cabdff1aSopenharmony_ci    mova     [dstq+stride3q*2], m3
1948cabdff1aSopenharmony_ci    RET
1949cabdff1aSopenharmony_ci
;------------------------------------------------------------------------------
; vp9_ipred_hu_16x16_16(dst, stride, l, a)
; Uses only the l edge; a is neither loaded nor read. One extra vector
; register is declared when SSSE3 is unavailable; m6 is the register passed
; as the extra PALIGNR operand in the loop.
; The pavgw and LOWPASS outputs are interleaved word by word into the
; sequence m1, m2, m3, m4, followed by m0 (top dword repeated across the
; register). Each loop iteration (cnt = 4) stores rows r, r+4, r+8 and r+12,
; every row being two consecutive registers of that sequence, then advances
; dst by one stride and moves the whole sequence on by 4 bytes.
; NOTE(review): LOWPASS, SHIFT_RIGHTx2, SBUTTERFLY and PALIGNR are defined
; outside this chunk; their semantics are assumed from usage -- confirm.
;------------------------------------------------------------------------------
1950cabdff1aSopenharmony_cicglobal vp9_ipred_hu_16x16_16, 3, 4, 6 + notcpuflag(ssse3), dst, stride, l, a
1951cabdff1aSopenharmony_ci    mova                    m0, [lq]
1952cabdff1aSopenharmony_ci    mova                    m3, [lq+mmsize]
1953cabdff1aSopenharmony_ci    movu                    m1, [lq+2]
1954cabdff1aSopenharmony_ci    movu                    m2, [lq+4]
    ; first half of the edge: the shifted copies are plain unaligned loads
1955cabdff1aSopenharmony_ci    LOWPASS                  2,  1,  0
1956cabdff1aSopenharmony_ci    pavgw                   m1, m0
1957cabdff1aSopenharmony_ci    SBUTTERFLY           wd, 1,  2,  0
1958cabdff1aSopenharmony_ci%if cpuflag(ssse3)
1959cabdff1aSopenharmony_ci    mova                    m5, [pb_2to15_14_15]
1960cabdff1aSopenharmony_ci%endif
    ; second half of the edge: shifted copies come from SHIFT_RIGHTx2, so
    ; nothing beyond lq+2*mmsize is read
1961cabdff1aSopenharmony_ci    SHIFT_RIGHTx2           m0, m4, m3, m5
1962cabdff1aSopenharmony_ci    LOWPASS                  4,  0,  3
1963cabdff1aSopenharmony_ci    pavgw                   m3, m0
1964cabdff1aSopenharmony_ci    SBUTTERFLY           wd, 3,  4,  5
1965cabdff1aSopenharmony_ci    pshufd                  m0, m0, q3333           ; every dword of m0 = its top dword
1966cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, stride3, cnt
1967cabdff1aSopenharmony_ci    lea               stride3q, [strideq*3]
1968cabdff1aSopenharmony_ci    mov                   cntd, 4
1969cabdff1aSopenharmony_ci
1970cabdff1aSopenharmony_ci.loop:
1971cabdff1aSopenharmony_ci    mova  [dstq+strideq *0+ 0], m1
1972cabdff1aSopenharmony_ci    mova  [dstq+strideq *0+16], m2
1973cabdff1aSopenharmony_ci    mova  [dstq+strideq *4+ 0], m2
1974cabdff1aSopenharmony_ci    mova  [dstq+strideq *4+16], m3
1975cabdff1aSopenharmony_ci    mova  [dstq+strideq *8+ 0], m3
1976cabdff1aSopenharmony_ci    mova  [dstq+strideq *8+16], m4
1977cabdff1aSopenharmony_ci    mova  [dstq+stride3q*4+ 0], m4                  ; row r+12
1978cabdff1aSopenharmony_ci    mova  [dstq+stride3q*4+16], m0
1979cabdff1aSopenharmony_ci    add                   dstq, strideq
    ; each register pulls in the low 4 bytes of its successor; m0 stays fixed
1980cabdff1aSopenharmony_ci%if cpuflag(avx)
1981cabdff1aSopenharmony_ci    vpalignr                m1, m2, m1, 4
1982cabdff1aSopenharmony_ci    vpalignr                m2, m3, m2, 4
1983cabdff1aSopenharmony_ci    vpalignr                m3, m4, m3, 4
1984cabdff1aSopenharmony_ci    vpalignr                m4, m0, m4, 4
1985cabdff1aSopenharmony_ci%else
    ; same update, built in m5 first and then copied back
1986cabdff1aSopenharmony_ci    PALIGNR                 m5, m2, m1, 4, m6
1987cabdff1aSopenharmony_ci    mova                    m1, m5
1988cabdff1aSopenharmony_ci    PALIGNR                 m5, m3, m2, 4, m6
1989cabdff1aSopenharmony_ci    mova                    m2, m5
1990cabdff1aSopenharmony_ci    PALIGNR                 m5, m4, m3, 4, m6
1991cabdff1aSopenharmony_ci    mova                    m3, m5
1992cabdff1aSopenharmony_ci    PALIGNR                 m5, m0, m4, 4, m6
1993cabdff1aSopenharmony_ci    mova                    m4, m5
1994cabdff1aSopenharmony_ci%endif
1995cabdff1aSopenharmony_ci    dec                   cntd
1996cabdff1aSopenharmony_ci    jg .loop                                        ; repeat while cnt > 0
1997cabdff1aSopenharmony_ci    RET
1998cabdff1aSopenharmony_ci
; vp9_ipred_hu_32x32_16(dst, stride, l, a)
; Writes 32 rows of 64 bytes (four XMM stores per row) built from the 32 words
; at l. Presumably the 32x32 counterpart of the horizontal-up predictor for
; 16-bit samples (inferred from the name; confirm against the C reference).
; cglobal asks for 3 loaded arguments and 7 GPRs, so a is named but never read.
; On x86-32 the stack reservation is the enclosing macro parameter times
; mmsize; this body addresses slots rsp+0*mmsize .. rsp+3*mmsize depending on
; the instruction set. On x86-64 the same values live in m8..m10 instead.
1999cabdff1aSopenharmony_cicglobal vp9_ipred_hu_32x32_16, 3, 7, 10 + notcpuflag(ssse3), \
2000cabdff1aSopenharmony_ci                               %1 * -mmsize * ARCH_X86_32, dst, stride, l, a
    ; words l[0..7]: LOWPASS (macro defined outside this excerpt, presumably a
    ; 3-tap smoothing filter) into m0, rounded average into m1, then interleave
2001cabdff1aSopenharmony_ci    mova                    m2, [lq+mmsize*0+0]
2002cabdff1aSopenharmony_ci    movu                    m1, [lq+mmsize*0+2]
2003cabdff1aSopenharmony_ci    movu                    m0, [lq+mmsize*0+4]
2004cabdff1aSopenharmony_ci    LOWPASS                  0,  1,  2
2005cabdff1aSopenharmony_ci    pavgw                   m1, m2
2006cabdff1aSopenharmony_ci    SBUTTERFLY           wd, 1,  0,  2
    ; park the head of the chain (m8 on x86-64, stack slot 0 on x86-32) so m1
    ; can later hold the constant tail register
2007cabdff1aSopenharmony_ci    SCRATCH                  1,  8, rsp+0*mmsize
    ; words l[8..15] -> m3, m2
2008cabdff1aSopenharmony_ci    mova                    m4, [lq+mmsize*1+0]
2009cabdff1aSopenharmony_ci    movu                    m3, [lq+mmsize*1+2]
2010cabdff1aSopenharmony_ci    movu                    m2, [lq+mmsize*1+4]
2011cabdff1aSopenharmony_ci    LOWPASS                  2,  3,  4
2012cabdff1aSopenharmony_ci    pavgw                   m3, m4
2013cabdff1aSopenharmony_ci    SBUTTERFLY           wd, 3,  2,  4
    ; words l[16..23] -> m5, m4
2014cabdff1aSopenharmony_ci    mova                    m6, [lq+mmsize*2+0]
2015cabdff1aSopenharmony_ci    movu                    m5, [lq+mmsize*2+2]
2016cabdff1aSopenharmony_ci    movu                    m4, [lq+mmsize*2+4]
2017cabdff1aSopenharmony_ci    LOWPASS                  4,  5,  6
2018cabdff1aSopenharmony_ci    pavgw                   m5, m6
2019cabdff1aSopenharmony_ci    SBUTTERFLY           wd, 5,  4,  6
    ; words l[24..31]: the shifted neighbours come from SHIFT_RIGHTx2 instead of
    ; unaligned loads (macro not visible here; presumably it repeats the last
    ; word -- TODO confirm). m0 is parked meanwhile because it is needed for
    ; the shuffle mask and as the interleave temporary.
2020cabdff1aSopenharmony_ci    mova                    m7, [lq+mmsize*3+0]
2021cabdff1aSopenharmony_ci    SCRATCH                  0,  9, rsp+1*mmsize
2022cabdff1aSopenharmony_ci%if cpuflag(ssse3)
2023cabdff1aSopenharmony_ci    mova                    m0, [pb_2to15_14_15]
2024cabdff1aSopenharmony_ci%endif
2025cabdff1aSopenharmony_ci    SHIFT_RIGHTx2           m1, m6, m7, m0
2026cabdff1aSopenharmony_ci    LOWPASS                  6,  1,  7
2027cabdff1aSopenharmony_ci    pavgw                   m7, m1
2028cabdff1aSopenharmony_ci    SBUTTERFLY           wd, 7,  6,  0
    ; m1 = its highest dword replicated into every dword: the constant tail
2029cabdff1aSopenharmony_ci    pshufd                  m1, m1, q3333
2030cabdff1aSopenharmony_ci    UNSCRATCH                0,  9, rsp+1*mmsize
    ; l is no longer needed: its register becomes cnt; four more GPRs hold
    ; 3, 4, 20 and 28 times the stride (28 = 4*8 - 4)
2031cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, cnt, stride3, stride4, stride20, stride28
2032cabdff1aSopenharmony_ci    lea               stride3q, [strideq*3]
2033cabdff1aSopenharmony_ci    lea               stride4q, [strideq*4]
2034cabdff1aSopenharmony_ci    lea              stride28q, [stride4q*8]
2035cabdff1aSopenharmony_ci    lea              stride20q, [stride4q*5]
2036cabdff1aSopenharmony_ci    sub              stride28q, stride4q
2037cabdff1aSopenharmony_ci    mov                   cntd, 4
2038cabdff1aSopenharmony_ci
    ; Register chain, in store order: head (parked), m0, m3, m2, m5, m4, m7, m6,
    ; tail. Each pass writes rows 0, 4, ..., 28 strides below dst, every such row
    ; starting one register further along the chain; 4 passes cover 32 rows.
2039cabdff1aSopenharmony_ci.loop:
    ; bring the head into m1; the tail goes to m8 or to stack slot 1
2040cabdff1aSopenharmony_ci%if ARCH_X86_64
2041cabdff1aSopenharmony_ci    SWAP                     1,  8
2042cabdff1aSopenharmony_ci%else
2043cabdff1aSopenharmony_ci    mova        [rsp+1*mmsize], m1
2044cabdff1aSopenharmony_ci    mova                    m1, [rsp+0*mmsize]
2045cabdff1aSopenharmony_ci%endif
2046cabdff1aSopenharmony_ci    mova  [dstq+strideq *0+ 0], m1
2047cabdff1aSopenharmony_ci    mova  [dstq+strideq *0+16], m0
2048cabdff1aSopenharmony_ci    mova  [dstq+strideq *0+32], m3
2049cabdff1aSopenharmony_ci    mova  [dstq+strideq *0+48], m2
2050cabdff1aSopenharmony_ci    mova  [dstq+stride4q*1+ 0], m0
2051cabdff1aSopenharmony_ci    mova  [dstq+stride4q*1+16], m3
2052cabdff1aSopenharmony_ci    mova  [dstq+stride4q*1+32], m2
2053cabdff1aSopenharmony_ci    mova  [dstq+stride4q*1+48], m5
2054cabdff1aSopenharmony_ci    mova  [dstq+stride4q*2+ 0], m3
2055cabdff1aSopenharmony_ci    mova  [dstq+stride4q*2+16], m2
2056cabdff1aSopenharmony_ci    mova  [dstq+stride4q*2+32], m5
2057cabdff1aSopenharmony_ci    mova  [dstq+stride4q*2+48], m4
    ; slide the first three chain registers (head, m0, m3) by 4 bytes
2058cabdff1aSopenharmony_ci%if cpuflag(avx)
2059cabdff1aSopenharmony_ci    vpalignr                m1, m0, m1, 4
2060cabdff1aSopenharmony_ci    vpalignr                m0, m3, m0, 4
2061cabdff1aSopenharmony_ci    vpalignr                m3, m2, m3, 4
2062cabdff1aSopenharmony_ci%else
    ; free m6 as the PALIGNR destination and, without SSSE3, m7 as its scratch
2063cabdff1aSopenharmony_ci    SCRATCH                  6,  9, rsp+2*mmsize
2064cabdff1aSopenharmony_ci%if notcpuflag(ssse3)
2065cabdff1aSopenharmony_ci    SCRATCH                  7, 10, rsp+3*mmsize
2066cabdff1aSopenharmony_ci%endif
2067cabdff1aSopenharmony_ci    PALIGNR                 m6, m0, m1, 4, m7
2068cabdff1aSopenharmony_ci    mova                    m1, m6
2069cabdff1aSopenharmony_ci    PALIGNR                 m6, m3, m0, 4, m7
2070cabdff1aSopenharmony_ci    mova                    m0, m6
2071cabdff1aSopenharmony_ci    PALIGNR                 m6, m2, m3, 4, m7
2072cabdff1aSopenharmony_ci    mova                    m3, m6
    ; restore m6 (and m7), then park the freshly shifted m0 (and m3) in the same
    ; slots so they can serve as destination and scratch for the second slide
2073cabdff1aSopenharmony_ci    UNSCRATCH                6,  9, rsp+2*mmsize
2074cabdff1aSopenharmony_ci    SCRATCH                  0,  9, rsp+2*mmsize
2075cabdff1aSopenharmony_ci%if notcpuflag(ssse3)
2076cabdff1aSopenharmony_ci    UNSCRATCH                7, 10, rsp+3*mmsize
2077cabdff1aSopenharmony_ci    SCRATCH                  3, 10, rsp+3*mmsize
2078cabdff1aSopenharmony_ci%endif
2079cabdff1aSopenharmony_ci%endif
    ; put the shifted head back (m8 or stack slot 0); m1 is the tail again
2080cabdff1aSopenharmony_ci%if ARCH_X86_64
2081cabdff1aSopenharmony_ci    SWAP                     1,  8
2082cabdff1aSopenharmony_ci%else
2083cabdff1aSopenharmony_ci    mova        [rsp+0*mmsize], m1
2084cabdff1aSopenharmony_ci    mova                    m1, [rsp+1*mmsize]
2085cabdff1aSopenharmony_ci%endif
    ; rows 12..28 strides below dst; the tail m1 pads the end of the last rows
2086cabdff1aSopenharmony_ci    mova  [dstq+stride3q*4+ 0], m2
2087cabdff1aSopenharmony_ci    mova  [dstq+stride3q*4+16], m5
2088cabdff1aSopenharmony_ci    mova  [dstq+stride3q*4+32], m4
2089cabdff1aSopenharmony_ci    mova  [dstq+stride3q*4+48], m7
2090cabdff1aSopenharmony_ci    mova  [dstq+stride4q*4+ 0], m5
2091cabdff1aSopenharmony_ci    mova  [dstq+stride4q*4+16], m4
2092cabdff1aSopenharmony_ci    mova  [dstq+stride4q*4+32], m7
2093cabdff1aSopenharmony_ci    mova  [dstq+stride4q*4+48], m6
2094cabdff1aSopenharmony_ci    mova  [dstq+stride20q + 0], m4
2095cabdff1aSopenharmony_ci    mova  [dstq+stride20q +16], m7
2096cabdff1aSopenharmony_ci    mova  [dstq+stride20q +32], m6
2097cabdff1aSopenharmony_ci    mova  [dstq+stride20q +48], m1
2098cabdff1aSopenharmony_ci    mova  [dstq+stride3q*8+ 0], m7
2099cabdff1aSopenharmony_ci    mova  [dstq+stride3q*8+16], m6
2100cabdff1aSopenharmony_ci    mova  [dstq+stride3q*8+32], m1
2101cabdff1aSopenharmony_ci    mova  [dstq+stride3q*8+48], m1
2102cabdff1aSopenharmony_ci    mova  [dstq+stride28q + 0], m6
2103cabdff1aSopenharmony_ci    mova  [dstq+stride28q +16], m1
2104cabdff1aSopenharmony_ci    mova  [dstq+stride28q +32], m1
2105cabdff1aSopenharmony_ci    mova  [dstq+stride28q +48], m1
    ; slide the remaining chain registers (m2, m5, m4, m7, m6) by 4 bytes
2106cabdff1aSopenharmony_ci%if cpuflag(avx)
2107cabdff1aSopenharmony_ci    vpalignr                m2, m5, m2, 4
2108cabdff1aSopenharmony_ci    vpalignr                m5, m4, m5, 4
2109cabdff1aSopenharmony_ci    vpalignr                m4, m7, m4, 4
2110cabdff1aSopenharmony_ci    vpalignr                m7, m6, m7, 4
2111cabdff1aSopenharmony_ci    vpalignr                m6, m1, m6, 4
2112cabdff1aSopenharmony_ci%else
    ; m0 is the PALIGNR destination and m3 its scratch; both were parked above
2113cabdff1aSopenharmony_ci    PALIGNR                 m0, m5, m2, 4, m3
2114cabdff1aSopenharmony_ci    mova                    m2, m0
2115cabdff1aSopenharmony_ci    PALIGNR                 m0, m4, m5, 4, m3
2116cabdff1aSopenharmony_ci    mova                    m5, m0
2117cabdff1aSopenharmony_ci    PALIGNR                 m0, m7, m4, 4, m3
2118cabdff1aSopenharmony_ci    mova                    m4, m0
2119cabdff1aSopenharmony_ci    PALIGNR                 m0, m6, m7, 4, m3
2120cabdff1aSopenharmony_ci    mova                    m7, m0
2121cabdff1aSopenharmony_ci    PALIGNR                 m0, m1, m6, 4, m3
2122cabdff1aSopenharmony_ci    mova                    m6, m0
2123cabdff1aSopenharmony_ci    UNSCRATCH                0,  9, rsp+2*mmsize
2124cabdff1aSopenharmony_ci%if notcpuflag(ssse3)
2125cabdff1aSopenharmony_ci    UNSCRATCH                3, 10, rsp+3*mmsize
2126cabdff1aSopenharmony_ci%endif
2127cabdff1aSopenharmony_ci%endif
2128cabdff1aSopenharmony_ci    add                   dstq, strideq
2129cabdff1aSopenharmony_ci    dec                   cntd
2130cabdff1aSopenharmony_ci    jg .loop
2131cabdff1aSopenharmony_ci    RET
2132cabdff1aSopenharmony_ci%endmacro
2133cabdff1aSopenharmony_ci
; Emit the HU_FUNCS functions once per instruction set (SSE2, SSSE3, AVX).
; The argument differs per build (4, 3, 2); presumably it is the parameter the
; 32x32 function multiplies by mmsize for its x86-32 stack reservation, which
; shrinks as fewer registers have to be spilled -- confirm against the macro
; header, which lies outside this excerpt.
2134cabdff1aSopenharmony_ciINIT_XMM sse2
2135cabdff1aSopenharmony_ciHU_FUNCS 4
2136cabdff1aSopenharmony_ciINIT_XMM ssse3
2137cabdff1aSopenharmony_ciHU_FUNCS 3
2138cabdff1aSopenharmony_ciINIT_XMM avx
2139cabdff1aSopenharmony_ciHU_FUNCS 2
2140cabdff1aSopenharmony_ci
2141cabdff1aSopenharmony_ci%macro HD_FUNCS 0
; vp9_ipred_hd_4x4_16(dst, stride, l, a)
; Writes 4 rows of 8 bytes each from four words at l and four words starting
; one word before a. Presumably the VP9 horizontal-down intra predictor for
; 16-bit samples (inferred from the name; confirm against the C reference).
; Uses 4 GPRs and 4 XMM registers, no loop.
2142cabdff1aSopenharmony_cicglobal vp9_ipred_hd_4x4_16, 4, 4, 4, dst, stride, l, a
    ; m0 = l[0..3] in the low half, a[-1..2] in the high half (word indices)
2143cabdff1aSopenharmony_ci    movh                    m0, [lq]
2144cabdff1aSopenharmony_ci    movhps                  m0, [aq-2]
    ; m1, m2 = m0 moved down by one and by two words (zeros enter at the top)
2145cabdff1aSopenharmony_ci    psrldq                  m1, m0, 2
2146cabdff1aSopenharmony_ci    psrldq                  m2, m0, 4
    ; m2 = LOWPASS over the three copies (macro defined outside this excerpt,
    ; presumably a 3-tap smoothing filter); m1 = rounded average of neighbours
2147cabdff1aSopenharmony_ci    LOWPASS                  2,  1,  0
2148cabdff1aSopenharmony_ci    pavgw                   m1, m0
    ; m1 = low four averaged words interleaved with low four filtered words
2149cabdff1aSopenharmony_ci    punpcklwd               m1, m2
    ; l is no longer needed: its register becomes stride3
2150cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, stride3
2151cabdff1aSopenharmony_ci    lea               stride3q, [strideq*3]
2152cabdff1aSopenharmony_ci
    ; row 3 = low half of m1, row 1 = high half of m1
2153cabdff1aSopenharmony_ci    movh      [dstq+stride3q ], m1
2154cabdff1aSopenharmony_ci    movhps    [dstq+strideq*1], m1
    ; move the upper filtered words into the low half of m2, then shift the
    ; m2:m1 pair down by 4 bytes so the rows written next are offset by two words
2155cabdff1aSopenharmony_ci    movhlps                 m2, m2
2156cabdff1aSopenharmony_ci    PALIGNR                 m2, m1, 4, m0
    ; row 2 = low half, row 0 = high half of the shifted register
2157cabdff1aSopenharmony_ci    movh      [dstq+strideq*2], m2
2158cabdff1aSopenharmony_ci    movhps    [dstq+strideq*0], m2
2159cabdff1aSopenharmony_ci    RET
2160cabdff1aSopenharmony_ci
; vp9_ipred_hd_8x8_16(dst, stride, l, a)
; Writes 8 rows of 16 bytes (one XMM store per row) from eight words at l and
; eight words starting one word before a. Presumably the 8x8 horizontal-down
; predictor for 16-bit samples (inferred from the name; confirm against the C
; reference). Rows are written bottom-up with a negated stride.
2161cabdff1aSopenharmony_cicglobal vp9_ipred_hd_8x8_16, 4, 4, 5, dst, stride, l, a
    ; m0 = l[0..7], m1 = a[-1..6] (word indices)
2162cabdff1aSopenharmony_ci    mova                    m0, [lq]
2163cabdff1aSopenharmony_ci    movu                    m1, [aq-2]
    ; m2, m3 = the m1:m0 pair shifted down by one and by two words, i.e. the
    ; left column continued into the above row
2164cabdff1aSopenharmony_ci    PALIGNR                 m2, m1, m0, 2, m3
2165cabdff1aSopenharmony_ci    PALIGNR                 m3, m1, m0, 4, m4
    ; m3 = LOWPASS result (macro defined outside this excerpt, presumably a
    ; 3-tap smoothing filter); m2 = rounded average; then interleave the two
2166cabdff1aSopenharmony_ci    LOWPASS                  3,  2,  0
2167cabdff1aSopenharmony_ci    pavgw                   m2, m0
2168cabdff1aSopenharmony_ci    SBUTTERFLY           wd, 2,  3,  0
    ; m1 = LOWPASS over the above row and its copies moved down by one and two
    ; words (zeros enter at the top)
2169cabdff1aSopenharmony_ci    psrldq                  m0, m1,  2
2170cabdff1aSopenharmony_ci    psrldq                  m4, m1,  4
2171cabdff1aSopenharmony_ci    LOWPASS                  1,  0,  4
    ; rename: dst8 will point 8 rows below dst, mstride becomes the negated
    ; stride, and the register that held l becomes cnt
2172cabdff1aSopenharmony_ci    DEFINE_ARGS dst8, mstride, cnt
    ; mstride still holds the positive stride on this line
2173cabdff1aSopenharmony_ci    lea                  dst8q, [dst8q+mstrideq*8]
2174cabdff1aSopenharmony_ci    neg               mstrideq
2175cabdff1aSopenharmony_ci    mov                   cntd, 4
2176cabdff1aSopenharmony_ci
    ; Each pass steps one row up, then stores that row from m2 and the row four
    ; above it from m3: rows 7 and 3 first, then 6 and 2, 5 and 1, 4 and 0.
2177cabdff1aSopenharmony_ci.loop:
2178cabdff1aSopenharmony_ci    add                  dst8q, mstrideq
2179cabdff1aSopenharmony_ci    mova    [dst8q+mstrideq*0], m2
2180cabdff1aSopenharmony_ci    mova    [dst8q+mstrideq*4], m3
    ; slide the chain m2, m3, m1 by 4 bytes (two words)
2181cabdff1aSopenharmony_ci%if cpuflag(avx)
2182cabdff1aSopenharmony_ci    vpalignr                m2, m3, m2, 4
2183cabdff1aSopenharmony_ci    vpalignr                m3, m1, m3, 4
2184cabdff1aSopenharmony_ci%else
    ; m0 receives the PALIGNR result, m4 is the macro scratch
2185cabdff1aSopenharmony_ci    PALIGNR                 m0, m3, m2, 4, m4
2186cabdff1aSopenharmony_ci    mova                    m2, m0
2187cabdff1aSopenharmony_ci    PALIGNR                 m0, m1, m3, 4, m4
2188cabdff1aSopenharmony_ci    mova                    m3, m0
2189cabdff1aSopenharmony_ci%endif
    ; m1 is the last chain register, so it is shifted with zero fill
2190cabdff1aSopenharmony_ci    psrldq                  m1, 4
2191cabdff1aSopenharmony_ci    dec                   cntd
2192cabdff1aSopenharmony_ci    jg .loop
2193cabdff1aSopenharmony_ci    RET
2194cabdff1aSopenharmony_ci
; vp9_ipred_hd_16x16_16(dst, stride, l, a)
; Writes 16 rows of 32 bytes (two XMM stores per row) from 16 words at l and
; the words from a[-1] up to a[14]. Presumably the 16x16 horizontal-down
; predictor for 16-bit samples (inferred from the name; confirm against the C
; reference). Uses all eight XMM registers m0..m7; rows are written bottom-up.
2195cabdff1aSopenharmony_cicglobal vp9_ipred_hd_16x16_16, 4, 4, 8, dst, stride, l, a
    ; words l[0..7] with their +1 and +2 neighbours: m0 = LOWPASS result (macro
    ; defined outside this excerpt, presumably a 3-tap smoothing filter),
    ; m1 = rounded average
2196cabdff1aSopenharmony_ci    mova                    m2, [lq]
2197cabdff1aSopenharmony_ci    movu                    m1, [lq+2]
2198cabdff1aSopenharmony_ci    movu                    m0, [lq+4]
2199cabdff1aSopenharmony_ci    LOWPASS                  0,  1,  2
2200cabdff1aSopenharmony_ci    pavgw                   m1, m2
    ; words l[8..15]; their neighbours continue into the above row, so they are
    ; taken from the m5:m4 pair (m5 = a[-1..6]) shifted by one and two words
2201cabdff1aSopenharmony_ci    mova                    m4, [lq+mmsize]
2202cabdff1aSopenharmony_ci    movu                    m5, [aq-2]
2203cabdff1aSopenharmony_ci    PALIGNR                 m3, m5, m4, 2, m6
2204cabdff1aSopenharmony_ci    PALIGNR                 m2, m5, m4, 4, m6
2205cabdff1aSopenharmony_ci    LOWPASS                  2,  3,  4
2206cabdff1aSopenharmony_ci    pavgw                   m3, m4
    ; interleave average and filtered words for both halves of the left column
2207cabdff1aSopenharmony_ci    SBUTTERFLY           wd, 1,  0,  4
2208cabdff1aSopenharmony_ci    SBUTTERFLY           wd, 3,  2,  4
    ; m4 = LOWPASS over a[1..8], a[0..7], a[-1..6]
2209cabdff1aSopenharmony_ci    mova                    m6, [aq]
2210cabdff1aSopenharmony_ci    movu                    m4, [aq+2]
2211cabdff1aSopenharmony_ci    LOWPASS                  4,  6,  5
    ; m5 = LOWPASS over a[7..14] and its copies moved down by one and two words.
    ; psrldq shifts zeros in at the top, so the two highest words of this result
    ; are not true filter outputs; the loop stores only the low 12 bytes of m5
    ; (three 4-byte slides reach a store).
2212cabdff1aSopenharmony_ci    movu                    m5, [aq+mmsize-2]
2213cabdff1aSopenharmony_ci    psrldq                  m6, m5,  2
2214cabdff1aSopenharmony_ci    psrldq                  m7, m5,  4
2215cabdff1aSopenharmony_ci    LOWPASS                  5,  6,  7
    ; rename: mstride becomes the negated stride, the registers that held l and
    ; a become mstride3 and cnt
2216cabdff1aSopenharmony_ci    DEFINE_ARGS dst, mstride, mstride3, cnt
    ; two steps of 8 strides: dst now points 16 rows below the block start
    ; (mstride still holds the positive stride on these two lines)
2217cabdff1aSopenharmony_ci    lea                   dstq, [dstq+mstrideq*8]
2218cabdff1aSopenharmony_ci    lea                   dstq, [dstq+mstrideq*8]
2219cabdff1aSopenharmony_ci    neg               mstrideq
2220cabdff1aSopenharmony_ci    lea              mstride3q, [mstrideq*3]
2221cabdff1aSopenharmony_ci    mov                   cntd, 4
2222cabdff1aSopenharmony_ci
    ; Register chain in store order: m1, m0, m3, m2, m4, m5. Each pass steps one
    ; row up and stores that row plus the rows 4, 8 and 12 above it (first pass:
    ; rows 15, 11, 7 and 3), each starting one register further along the chain.
2223cabdff1aSopenharmony_ci.loop:
2224cabdff1aSopenharmony_ci    add                  dstq, mstrideq
2225cabdff1aSopenharmony_ci    mova [dstq+mstride3q*4+ 0], m2
2226cabdff1aSopenharmony_ci    mova [dstq+mstride3q*4+16], m4
2227cabdff1aSopenharmony_ci    mova [dstq+mstrideq *8+ 0], m3
2228cabdff1aSopenharmony_ci    mova [dstq+mstrideq *8+16], m2
2229cabdff1aSopenharmony_ci    mova [dstq+mstrideq *4+ 0], m0
2230cabdff1aSopenharmony_ci    mova [dstq+mstrideq *4+16], m3
2231cabdff1aSopenharmony_ci    mova [dstq+mstrideq *0+ 0], m1
2232cabdff1aSopenharmony_ci    mova [dstq+mstrideq *0+16], m0
    ; slide every chain register by 4 bytes (two words) from its successor
2233cabdff1aSopenharmony_ci%if cpuflag(avx)
2234cabdff1aSopenharmony_ci    vpalignr                m1, m0, m1, 4
2235cabdff1aSopenharmony_ci    vpalignr                m0, m3, m0, 4
2236cabdff1aSopenharmony_ci    vpalignr                m3, m2, m3, 4
2237cabdff1aSopenharmony_ci    vpalignr                m2, m4, m2, 4
2238cabdff1aSopenharmony_ci    vpalignr                m4, m5, m4, 4
2239cabdff1aSopenharmony_ci%else
    ; m6 receives each PALIGNR result, m7 is the macro scratch
2240cabdff1aSopenharmony_ci    PALIGNR                 m6, m0, m1, 4, m7
2241cabdff1aSopenharmony_ci    mova                    m1, m6
2242cabdff1aSopenharmony_ci    PALIGNR                 m6, m3, m0, 4, m7
2243cabdff1aSopenharmony_ci    mova                    m0, m6
2244cabdff1aSopenharmony_ci    PALIGNR                 m6, m2, m3, 4, m7
2245cabdff1aSopenharmony_ci    mova                    m3, m6
2246cabdff1aSopenharmony_ci    PALIGNR                 m6, m4, m2, 4, m7
2247cabdff1aSopenharmony_ci    mova                    m2, m6
2248cabdff1aSopenharmony_ci    PALIGNR                 m6, m5, m4, 4, m7
2249cabdff1aSopenharmony_ci    mova                    m4, m6
2250cabdff1aSopenharmony_ci%endif
    ; m5 is the last chain register, so it is shifted with zero fill
2251cabdff1aSopenharmony_ci    psrldq                  m5, 4
2252cabdff1aSopenharmony_ci    dec                   cntd
2253cabdff1aSopenharmony_ci    jg .loop
2254cabdff1aSopenharmony_ci    RET
2255cabdff1aSopenharmony_ci
; vp9_ipred_hd_32x32_16(dst, stride, l, a)
; Writes 32 rows of 64 bytes (four XMM stores per row) from 32 words at l and
; the words from a[-1] up to a[30]. Presumably the 32x32 horizontal-down
; predictor for 16-bit samples (inferred from the name; confirm against the C
; reference).
; x86-64: 7 GPRs and 14 XMM registers, one loop writing 8 rows per pass.
; x86-32: 4 GPRs and ten mmsize stack slots; the work is split into two loops
; that each write 4 rows per pass, with the initial register values saved in
; slots 0..7 and slots 8..9 used as spill space inside the loops.
2256cabdff1aSopenharmony_cicglobal vp9_ipred_hd_32x32_16, 4, 4 + 3 * ARCH_X86_64, 14, \
2257cabdff1aSopenharmony_ci                               10 * -mmsize * ARCH_X86_32, dst, stride, l, a
    ; words l[0..7]: LOWPASS (macro defined outside this excerpt, presumably a
    ; 3-tap smoothing filter) into m0, rounded average into m1, then interleave
2258cabdff1aSopenharmony_ci    mova                    m2, [lq+mmsize*0+0]
2259cabdff1aSopenharmony_ci    movu                    m1, [lq+mmsize*0+2]
2260cabdff1aSopenharmony_ci    movu                    m0, [lq+mmsize*0+4]
2261cabdff1aSopenharmony_ci    LOWPASS                  0,  1,  2
2262cabdff1aSopenharmony_ci    pavgw                   m1, m2
2263cabdff1aSopenharmony_ci    SBUTTERFLY           wd, 1,  0,  2
    ; words l[8..15] -> m3, m2
2264cabdff1aSopenharmony_ci    mova                    m4, [lq+mmsize*1+0]
2265cabdff1aSopenharmony_ci    movu                    m3, [lq+mmsize*1+2]
2266cabdff1aSopenharmony_ci    movu                    m2, [lq+mmsize*1+4]
2267cabdff1aSopenharmony_ci    LOWPASS                  2,  3,  4
2268cabdff1aSopenharmony_ci    pavgw                   m3, m4
2269cabdff1aSopenharmony_ci    SBUTTERFLY           wd, 3,  2,  4
    ; park the first four chain registers: m8..m11 on x86-64, stack slots 0..3
    ; on x86-32 (reloaded before .loop_2)
2270cabdff1aSopenharmony_ci    SCRATCH                  0,  8, rsp+0*mmsize
2271cabdff1aSopenharmony_ci    SCRATCH                  1,  9, rsp+1*mmsize
2272cabdff1aSopenharmony_ci    SCRATCH                  2, 10, rsp+2*mmsize
2273cabdff1aSopenharmony_ci    SCRATCH                  3, 11, rsp+3*mmsize
    ; words l[16..23] -> m5, m4
2274cabdff1aSopenharmony_ci    mova                    m6, [lq+mmsize*2+0]
2275cabdff1aSopenharmony_ci    movu                    m5, [lq+mmsize*2+2]
2276cabdff1aSopenharmony_ci    movu                    m4, [lq+mmsize*2+4]
2277cabdff1aSopenharmony_ci    LOWPASS                  4,  5,  6
2278cabdff1aSopenharmony_ci    pavgw                   m5, m6
2279cabdff1aSopenharmony_ci    SBUTTERFLY           wd, 5,  4,  6
    ; words l[24..31]; their neighbours continue into the above row, so they
    ; come from the m1:m0 pair (m1 = a[-1..6]) shifted by one and two words
2280cabdff1aSopenharmony_ci    mova                    m0, [lq+mmsize*3+0]
2281cabdff1aSopenharmony_ci    movu                    m1, [aq+mmsize*0-2]
2282cabdff1aSopenharmony_ci    PALIGNR                 m7, m1, m0, 2, m2
2283cabdff1aSopenharmony_ci    PALIGNR                 m6, m1, m0, 4, m2
2284cabdff1aSopenharmony_ci    LOWPASS                  6,  7,  0
2285cabdff1aSopenharmony_ci    pavgw                   m7, m0
2286cabdff1aSopenharmony_ci    SBUTTERFLY           wd, 7,  6,  0
    ; above row, filtered only (no averaging): m0 from a[-1..8]
2287cabdff1aSopenharmony_ci    mova                    m2, [aq+mmsize*0+0]
2288cabdff1aSopenharmony_ci    movu                    m0, [aq+mmsize*0+2]
2289cabdff1aSopenharmony_ci    LOWPASS                  0,  2,  1
    ; m1 from a[7..16]
2290cabdff1aSopenharmony_ci    movu                    m1, [aq+mmsize*1-2]
2291cabdff1aSopenharmony_ci    mova                    m2, [aq+mmsize*1+0]
2292cabdff1aSopenharmony_ci    movu                    m3, [aq+mmsize*1+2]
2293cabdff1aSopenharmony_ci    LOWPASS                  1,  2,  3
    ; park m6/m7 (m12/m13 or stack slots 6/7) so they can be used as temporaries
2294cabdff1aSopenharmony_ci    SCRATCH                  6, 12, rsp+6*mmsize
2295cabdff1aSopenharmony_ci    SCRATCH                  7, 13, rsp+7*mmsize
    ; m2 from a[15..24]
2296cabdff1aSopenharmony_ci    movu                    m2, [aq+mmsize*2-2]
2297cabdff1aSopenharmony_ci    mova                    m3, [aq+mmsize*2+0]
2298cabdff1aSopenharmony_ci    movu                    m6, [aq+mmsize*2+2]
2299cabdff1aSopenharmony_ci    LOWPASS                  2,  3,  6
    ; m3 from a[23..30]; the neighbours are made with psrldq (zero fill) instead
    ; of further loads, so nothing past a[30] is read
2300cabdff1aSopenharmony_ci    movu                    m3, [aq+mmsize*3-2]
2301cabdff1aSopenharmony_ci    psrldq                  m6, m3,  2
2302cabdff1aSopenharmony_ci    psrldq                  m7, m3,  4
2303cabdff1aSopenharmony_ci    LOWPASS                  3,  6,  7
2304cabdff1aSopenharmony_ci    UNSCRATCH                6, 12, rsp+6*mmsize
2305cabdff1aSopenharmony_ci    UNSCRATCH                7, 13, rsp+7*mmsize
2306cabdff1aSopenharmony_ci%if ARCH_X86_32
    ; keep the initial m4/m5 for the second loop (stack slots 4 and 5)
2307cabdff1aSopenharmony_ci    mova        [rsp+4*mmsize], m4
2308cabdff1aSopenharmony_ci    mova        [rsp+5*mmsize], m5
2309cabdff1aSopenharmony_ci    ; we already backed up m6/m7 earlier on x86-32 in SCRATCH, so we don't need
2310cabdff1aSopenharmony_ci    ; to do it again here
2311cabdff1aSopenharmony_ci%endif
    ; l and a are no longer needed; only the first four names are backed by
    ; registers on x86-32, which is why stride4/20/28 are set up on x86-64 only
2312cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, cnt, stride3, stride4, stride20, stride28
2313cabdff1aSopenharmony_ci    mov                   cntd, 4
2314cabdff1aSopenharmony_ci    lea               stride3q, [strideq*3]
2315cabdff1aSopenharmony_ci%if ARCH_X86_64
2316cabdff1aSopenharmony_ci    lea               stride4q, [strideq*4]
2317cabdff1aSopenharmony_ci    lea              stride28q, [stride4q*8]
2318cabdff1aSopenharmony_ci    lea              stride20q, [stride4q*5]
2319cabdff1aSopenharmony_ci    sub              stride28q, stride4q
2320cabdff1aSopenharmony_ci%endif
    ; start at row 3; offsets of 0, 4, ..., 28 strides then address rows 3, 7,
    ; ..., 31 and dst steps back one row per pass
2321cabdff1aSopenharmony_ci    add                   dstq, stride3q
2322cabdff1aSopenharmony_ci
2323cabdff1aSopenharmony_ci    ; x86-32 doesn't have enough registers, so on that platform, we split
2324cabdff1aSopenharmony_ci    ; the loop in 2... Otherwise you spend most of the loop (un)scratching
    ; Register chain in store order: m9, m8, m11, m10, m5, m4, m7, m6, m0, m1,
    ; m2, m3 (the first four exist as registers on x86-64 only).
2325cabdff1aSopenharmony_ci.loop:
2326cabdff1aSopenharmony_ci%if ARCH_X86_64
    ; rows 28, 24, 20 and 16 strides below dst
2327cabdff1aSopenharmony_ci    mova  [dstq+stride28q + 0], m9
2328cabdff1aSopenharmony_ci    mova  [dstq+stride28q +16], m8
2329cabdff1aSopenharmony_ci    mova  [dstq+stride28q +32], m11
2330cabdff1aSopenharmony_ci    mova  [dstq+stride28q +48], m10
2331cabdff1aSopenharmony_ci    mova  [dstq+stride3q*8+ 0], m8
2332cabdff1aSopenharmony_ci    mova  [dstq+stride3q*8+16], m11
2333cabdff1aSopenharmony_ci    mova  [dstq+stride3q*8+32], m10
2334cabdff1aSopenharmony_ci    mova  [dstq+stride3q*8+48], m5
2335cabdff1aSopenharmony_ci    mova  [dstq+stride20q + 0], m11
2336cabdff1aSopenharmony_ci    mova  [dstq+stride20q +16], m10
2337cabdff1aSopenharmony_ci    mova  [dstq+stride20q +32], m5
2338cabdff1aSopenharmony_ci    mova  [dstq+stride20q +48], m4
2339cabdff1aSopenharmony_ci    mova  [dstq+stride4q*4+ 0], m10
2340cabdff1aSopenharmony_ci    mova  [dstq+stride4q*4+16], m5
2341cabdff1aSopenharmony_ci    mova  [dstq+stride4q*4+32], m4
2342cabdff1aSopenharmony_ci    mova  [dstq+stride4q*4+48], m7
2343cabdff1aSopenharmony_ci%endif
    ; rows 12, 8, 4 and 0 strides below dst (both architectures)
2344cabdff1aSopenharmony_ci    mova  [dstq+stride3q*4+ 0], m5
2345cabdff1aSopenharmony_ci    mova  [dstq+stride3q*4+16], m4
2346cabdff1aSopenharmony_ci    mova  [dstq+stride3q*4+32], m7
2347cabdff1aSopenharmony_ci    mova  [dstq+stride3q*4+48], m6
2348cabdff1aSopenharmony_ci    mova  [dstq+strideq* 8+ 0], m4
2349cabdff1aSopenharmony_ci    mova  [dstq+strideq* 8+16], m7
2350cabdff1aSopenharmony_ci    mova  [dstq+strideq* 8+32], m6
2351cabdff1aSopenharmony_ci    mova  [dstq+strideq* 8+48], m0
2352cabdff1aSopenharmony_ci    mova  [dstq+strideq* 4+ 0], m7
2353cabdff1aSopenharmony_ci    mova  [dstq+strideq* 4+16], m6
2354cabdff1aSopenharmony_ci    mova  [dstq+strideq* 4+32], m0
2355cabdff1aSopenharmony_ci    mova  [dstq+strideq* 4+48], m1
2356cabdff1aSopenharmony_ci    mova  [dstq+strideq* 0+ 0], m6
2357cabdff1aSopenharmony_ci    mova  [dstq+strideq* 0+16], m0
2358cabdff1aSopenharmony_ci    mova  [dstq+strideq* 0+32], m1
2359cabdff1aSopenharmony_ci    mova  [dstq+strideq* 0+48], m2
2360cabdff1aSopenharmony_ci    sub                   dstq, strideq
    ; slide every chain register by 4 bytes (two words) from its successor
2361cabdff1aSopenharmony_ci%if cpuflag(avx)
2362cabdff1aSopenharmony_ci%if ARCH_X86_64
2363cabdff1aSopenharmony_ci    vpalignr                m9, m8,  m9,  4
2364cabdff1aSopenharmony_ci    vpalignr                m8, m11, m8,  4
2365cabdff1aSopenharmony_ci    vpalignr               m11, m10, m11, 4
2366cabdff1aSopenharmony_ci    vpalignr               m10, m5,  m10, 4
2367cabdff1aSopenharmony_ci%endif
2368cabdff1aSopenharmony_ci    vpalignr                m5, m4,  m5,  4
2369cabdff1aSopenharmony_ci    vpalignr                m4, m7,  m4,  4
2370cabdff1aSopenharmony_ci    vpalignr                m7, m6,  m7,  4
2371cabdff1aSopenharmony_ci    vpalignr                m6, m0,  m6,  4
2372cabdff1aSopenharmony_ci    vpalignr                m0, m1,  m0,  4
2373cabdff1aSopenharmony_ci    vpalignr                m1, m2,  m1,  4
2374cabdff1aSopenharmony_ci    vpalignr                m2, m3,  m2,  4
2375cabdff1aSopenharmony_ci%else
2376cabdff1aSopenharmony_ci%if ARCH_X86_64
    ; upper four chain registers: m12 receives the result, m13 is the scratch
2377cabdff1aSopenharmony_ci    PALIGNR                m12, m8,  m9,  4, m13
2378cabdff1aSopenharmony_ci    mova                    m9, m12
2379cabdff1aSopenharmony_ci    PALIGNR                m12, m11, m8,  4, m13
2380cabdff1aSopenharmony_ci    mova                    m8, m12
2381cabdff1aSopenharmony_ci    PALIGNR                m12, m10, m11, 4, m13
2382cabdff1aSopenharmony_ci    mova                   m11, m12
2383cabdff1aSopenharmony_ci    PALIGNR                m12, m5,  m10, 4, m13
2384cabdff1aSopenharmony_ci    mova                   m10, m12
2385cabdff1aSopenharmony_ci%endif
    ; park the chain tail m3 so m3 can receive the PALIGNR results; the parked
    ; value stays reachable as reg_sh (an alias SCRATCH presumably defines from
    ; its 4th argument). Without SSSE3, m2 is parked too and used as scratch.
2386cabdff1aSopenharmony_ci    SCRATCH                  3, 12, rsp+8*mmsize, sh
2387cabdff1aSopenharmony_ci%if notcpuflag(ssse3)
2388cabdff1aSopenharmony_ci    SCRATCH                  2, 13, rsp+9*mmsize
2389cabdff1aSopenharmony_ci%endif
2390cabdff1aSopenharmony_ci    PALIGNR                 m3, m4,  m5,  4, m2
2391cabdff1aSopenharmony_ci    mova                    m5, m3
2392cabdff1aSopenharmony_ci    PALIGNR                 m3, m7,  m4,  4, m2
2393cabdff1aSopenharmony_ci    mova                    m4, m3
2394cabdff1aSopenharmony_ci    PALIGNR                 m3, m6,  m7,  4, m2
2395cabdff1aSopenharmony_ci    mova                    m7, m3
2396cabdff1aSopenharmony_ci    PALIGNR                 m3, m0,  m6,  4, m2
2397cabdff1aSopenharmony_ci    mova                    m6, m3
2398cabdff1aSopenharmony_ci    PALIGNR                 m3, m1,  m0,  4, m2
2399cabdff1aSopenharmony_ci    mova                    m0, m3
    ; m2 is needed as data next, so switch the scratch register to m0
2400cabdff1aSopenharmony_ci%if notcpuflag(ssse3)
2401cabdff1aSopenharmony_ci    UNSCRATCH                2, 13, rsp+9*mmsize
2402cabdff1aSopenharmony_ci    SCRATCH                  0, 13, rsp+9*mmsize
2403cabdff1aSopenharmony_ci%endif
2404cabdff1aSopenharmony_ci    PALIGNR                 m3, m2,  m1,  4, m0
2405cabdff1aSopenharmony_ci    mova                    m1, m3
2406cabdff1aSopenharmony_ci    PALIGNR                 m3, reg_sh,  m2,  4, m0
2407cabdff1aSopenharmony_ci    mova                    m2, m3
2408cabdff1aSopenharmony_ci%if notcpuflag(ssse3)
2409cabdff1aSopenharmony_ci    UNSCRATCH                0, 13, rsp+9*mmsize
2410cabdff1aSopenharmony_ci%endif
2411cabdff1aSopenharmony_ci    UNSCRATCH                3, 12, rsp+8*mmsize, sh
2412cabdff1aSopenharmony_ci%endif
    ; m3 is the last chain register, so it is shifted with zero fill
2413cabdff1aSopenharmony_ci    psrldq                  m3, 4
2414cabdff1aSopenharmony_ci    dec                   cntd
2415cabdff1aSopenharmony_ci    jg .loop
2416cabdff1aSopenharmony_ci
2417cabdff1aSopenharmony_ci%if ARCH_X86_32
    ; Second half for x86-32: reload the initial chain values saved in stack
    ; slots 0..7 and produce the rows the first loop skipped.
2418cabdff1aSopenharmony_ci    UNSCRATCH                0,  8, rsp+0*mmsize
2419cabdff1aSopenharmony_ci    UNSCRATCH                1,  9, rsp+1*mmsize
2420cabdff1aSopenharmony_ci    UNSCRATCH                2, 10, rsp+2*mmsize
2421cabdff1aSopenharmony_ci    UNSCRATCH                3, 11, rsp+3*mmsize
2422cabdff1aSopenharmony_ci    mova                    m4, [rsp+4*mmsize]
2423cabdff1aSopenharmony_ci    mova                    m5, [rsp+5*mmsize]
2424cabdff1aSopenharmony_ci    mova                    m6, [rsp+6*mmsize]
2425cabdff1aSopenharmony_ci    mova                    m7, [rsp+7*mmsize]
    ; the first loop left dst one row above the block; move it down 20 rows so
    ; offsets 0, 4, 8 and 12 strides address rows 19, 23, 27 and 31. stride5
    ; temporarily names the same register that is renamed to cnt right after.
2426cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, stride5, stride3
2427cabdff1aSopenharmony_ci    lea               stride5q, [strideq*5]
2428cabdff1aSopenharmony_ci    lea                   dstq, [dstq+stride5q*4]
2429cabdff1aSopenharmony_ci    DEFINE_ARGS dst, stride, cnt, stride3
2430cabdff1aSopenharmony_ci    mov                   cntd, 4
    ; Register chain in store order: m1, m0, m3, m2, m5, m4, m7, m6.
2431cabdff1aSopenharmony_ci.loop_2:
2432cabdff1aSopenharmony_ci    mova  [dstq+stride3q*4+ 0], m1
2433cabdff1aSopenharmony_ci    mova  [dstq+stride3q*4+16], m0
2434cabdff1aSopenharmony_ci    mova  [dstq+stride3q*4+32], m3
2435cabdff1aSopenharmony_ci    mova  [dstq+stride3q*4+48], m2
2436cabdff1aSopenharmony_ci    mova  [dstq+strideq* 8+ 0], m0
2437cabdff1aSopenharmony_ci    mova  [dstq+strideq* 8+16], m3
2438cabdff1aSopenharmony_ci    mova  [dstq+strideq* 8+32], m2
2439cabdff1aSopenharmony_ci    mova  [dstq+strideq* 8+48], m5
2440cabdff1aSopenharmony_ci    mova  [dstq+strideq* 4+ 0], m3
2441cabdff1aSopenharmony_ci    mova  [dstq+strideq* 4+16], m2
2442cabdff1aSopenharmony_ci    mova  [dstq+strideq* 4+32], m5
2443cabdff1aSopenharmony_ci    mova  [dstq+strideq* 4+48], m4
2444cabdff1aSopenharmony_ci    mova  [dstq+strideq* 0+ 0], m2
2445cabdff1aSopenharmony_ci    mova  [dstq+strideq* 0+16], m5
2446cabdff1aSopenharmony_ci    mova  [dstq+strideq* 0+32], m4
2447cabdff1aSopenharmony_ci    mova  [dstq+strideq* 0+48], m7
2448cabdff1aSopenharmony_ci    sub                   dstq, strideq
    ; slide every chain register by 4 bytes (two words) from its successor
2449cabdff1aSopenharmony_ci%if cpuflag(avx)
2450cabdff1aSopenharmony_ci    vpalignr                m1, m0,  m1,  4
2451cabdff1aSopenharmony_ci    vpalignr                m0, m3,  m0,  4
2452cabdff1aSopenharmony_ci    vpalignr                m3, m2,  m3,  4
2453cabdff1aSopenharmony_ci    vpalignr                m2, m5,  m2,  4
2454cabdff1aSopenharmony_ci    vpalignr                m5, m4,  m5,  4
2455cabdff1aSopenharmony_ci    vpalignr                m4, m7,  m4,  4
2456cabdff1aSopenharmony_ci    vpalignr                m7, m6,  m7,  4
2457cabdff1aSopenharmony_ci%else
    ; same parking scheme as the first loop: the tail m6 is parked and reached
    ; through reg_sh while m6 receives the PALIGNR results; without SSSE3 the
    ; scratch register is m7 first and m5 afterwards
2458cabdff1aSopenharmony_ci    SCRATCH                  6, 12, rsp+8*mmsize, sh
2459cabdff1aSopenharmony_ci%if notcpuflag(ssse3)
2460cabdff1aSopenharmony_ci    SCRATCH                  7, 13, rsp+9*mmsize
2461cabdff1aSopenharmony_ci%endif
2462cabdff1aSopenharmony_ci    PALIGNR                 m6, m0,  m1,  4, m7
2463cabdff1aSopenharmony_ci    mova                    m1, m6
2464cabdff1aSopenharmony_ci    PALIGNR                 m6, m3,  m0,  4, m7
2465cabdff1aSopenharmony_ci    mova                    m0, m6
2466cabdff1aSopenharmony_ci    PALIGNR                 m6, m2,  m3,  4, m7
2467cabdff1aSopenharmony_ci    mova                    m3, m6
2468cabdff1aSopenharmony_ci    PALIGNR                 m6, m5,  m2,  4, m7
2469cabdff1aSopenharmony_ci    mova                    m2, m6
2470cabdff1aSopenharmony_ci    PALIGNR                 m6, m4,  m5,  4, m7
2471cabdff1aSopenharmony_ci    mova                    m5, m6
2472cabdff1aSopenharmony_ci%if notcpuflag(ssse3)
2473cabdff1aSopenharmony_ci    UNSCRATCH                7, 13, rsp+9*mmsize
2474cabdff1aSopenharmony_ci    SCRATCH                  5, 13, rsp+9*mmsize
2475cabdff1aSopenharmony_ci%endif
2476cabdff1aSopenharmony_ci    PALIGNR                 m6, m7,  m4,  4, m5
2477cabdff1aSopenharmony_ci    mova                    m4, m6
2478cabdff1aSopenharmony_ci    PALIGNR                 m6, reg_sh,  m7,  4, m5
2479cabdff1aSopenharmony_ci    mova                    m7, m6
2480cabdff1aSopenharmony_ci%if notcpuflag(ssse3)
2481cabdff1aSopenharmony_ci    UNSCRATCH                5, 13, rsp+9*mmsize
2482cabdff1aSopenharmony_ci%endif
2483cabdff1aSopenharmony_ci    UNSCRATCH                6, 12, rsp+8*mmsize, sh
2484cabdff1aSopenharmony_ci%endif
    ; m6 is the last chain register here, so it is shifted with zero fill
2485cabdff1aSopenharmony_ci    psrldq                  m6, 4
2486cabdff1aSopenharmony_ci    dec                   cntd
2487cabdff1aSopenharmony_ci    jg .loop_2
2488cabdff1aSopenharmony_ci%endif
2489cabdff1aSopenharmony_ci    RET
2490cabdff1aSopenharmony_ci%endmacro
2491cabdff1aSopenharmony_ci
; Emit the four HD_FUNCS functions (4x4, 8x8, 16x16, 32x32) once per
; instruction set: SSE2, SSSE3 and AVX. The macro takes no arguments.
2492cabdff1aSopenharmony_ciINIT_XMM sse2
2493cabdff1aSopenharmony_ciHD_FUNCS
2494cabdff1aSopenharmony_ciINIT_XMM ssse3
2495cabdff1aSopenharmony_ciHD_FUNCS
2496cabdff1aSopenharmony_ciINIT_XMM avx
2497cabdff1aSopenharmony_ciHD_FUNCS
2498