;******************************************************************************
;* VP9 Intra prediction SIMD optimizations
;*
;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com>
;* Copyright (c) 2015 Henrik Gramner <henrik gramner com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

pd_2: times 8 dd 2
pd_4: times 8 dd 4
pd_8: times 8 dd 8

pb_2to15_14_15: db 2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15
pb_4_5_8to13_8x0: db 4,5,8,9,10,11,12,13,0,0,0,0,0,0,0,0
pb_0to7_67x4: db 0,1,2,3,4,5,6,7,6,7,6,7,6,7,6,7

cextern pw_1
cextern pw_1023
cextern pw_4095
cextern pd_16
cextern pd_32
cextern pd_65535

; FIXME most top-only functions (ddl, vl, v, dc_top) can be modified to take
; only 3 registers on x86-32, which would make it one cycle faster, but that
; would make the code quite a bit uglier...

SECTION .text

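; helpers for functions that need more vector registers than x86-32 can
; provide: on x86-64, SCRATCH/UNSCRATCH simply SWAP the value into/out of
; a high register, while on x86-32 they spill it to/from the given stack
; slot. PRELOAD loads a constant into a register on x86-64, but leaves it
; as a memory operand on x86-32. The optional last argument defines
; reg_<name>, so callers can use the value without knowing where it
; currently lives.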
%macro SCRATCH 3-4
%if ARCH_X86_64
    SWAP                %1, %2
%if %0 == 4
%define reg_%4 m%2
%endif
%else
    mova              [%3], m%1
%if %0 == 4
%define reg_%4 [%3]
%endif
%endif
%endmacro

%macro UNSCRATCH 3-4
%if ARCH_X86_64
    SWAP                %1, %2
%else
    mova               m%1, [%3]
%endif
%if %0 == 4
%undef reg_%4
%endif
%endmacro

%macro PRELOAD 2-3
%if ARCH_X86_64
    mova               m%1, [%2]
%if %0 == 3
%define reg_%3 m%1
%endif
%elif %0 == 3
%define reg_%3 [%2]
%endif
%endmacro

INIT_MMX mmx
cglobal vp9_ipred_v_4x4_16, 2, 4, 1, dst, stride, l, a
    movifnidn               aq, amp
    mova                    m0, [aq]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    RET

INIT_XMM sse
cglobal vp9_ipred_v_8x8_16, 2, 4, 1, dst, stride, l, a
    movifnidn               aq, amp
    mova                    m0, [aq]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    RET

INIT_XMM sse
cglobal vp9_ipred_v_16x16_16, 2, 4, 2, dst, stride, l, a
    movifnidn               aq, amp
    mova                    m0, [aq]
    mova                    m1, [aq+mmsize]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    mov                   cntd, 4
.loop:
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m1
    mova   [dstq+strideq*1+ 0], m0
    mova   [dstq+strideq*1+16], m1
    mova   [dstq+strideq*2+ 0], m0
    mova   [dstq+strideq*2+16], m1
    mova   [dstq+stride3q + 0], m0
    mova   [dstq+stride3q +16], m1
    lea                   dstq, [dstq+strideq*4]
    dec               cntd
    jg .loop
    RET

INIT_XMM sse
cglobal vp9_ipred_v_32x32_16, 2, 4, 4, dst, stride, l, a
    movifnidn               aq, amp
    mova                    m0, [aq+mmsize*0]
    mova                    m1, [aq+mmsize*1]
    mova                    m2, [aq+mmsize*2]
    mova                    m3, [aq+mmsize*3]
    DEFINE_ARGS dst, stride, cnt
    mov                   cntd, 16
.loop:
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m1
    mova   [dstq+strideq*0+32], m2
    mova   [dstq+strideq*0+48], m3
    mova   [dstq+strideq*1+ 0], m0
    mova   [dstq+strideq*1+16], m1
    mova   [dstq+strideq*1+32], m2
    mova   [dstq+strideq*1+48], m3
    lea                   dstq, [dstq+strideq*2]
    dec               cntd
    jg .loop
    RET

INIT_MMX mmxext
cglobal vp9_ipred_h_4x4_16, 3, 3, 4, dst, stride, l, a
    mova                    m3, [lq]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    pshufw                  m0, m3, q3333
    pshufw                  m1, m3, q2222
    pshufw                  m2, m3, q1111
    pshufw                  m3, m3, q0000
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m1
    mova      [dstq+strideq*2], m2
    mova      [dstq+stride3q ], m3
    RET

INIT_XMM sse2
cglobal vp9_ipred_h_8x8_16, 3, 3, 4, dst, stride, l, a
    mova                    m2, [lq]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    punpckhwd               m3, m2, m2
    pshufd                  m0, m3, q3333
    pshufd                  m1, m3, q2222
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m1
    pshufd                  m0, m3, q1111
    pshufd                  m1, m3, q0000
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m1
    lea                   dstq, [dstq+strideq*4]
    punpcklwd               m2, m2
    pshufd                  m0, m2, q3333
    pshufd                  m1, m2, q2222
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m1
    pshufd                  m0, m2, q1111
    pshufd                  m1, m2, q0000
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m1
    RET

INIT_XMM sse2
cglobal vp9_ipred_h_16x16_16, 3, 5, 4, dst, stride, l, stride3, cnt
    mov                   cntd, 3
    lea               stride3q, [strideq*3]
.loop:
    movh                    m3, [lq+cntq*8]
    punpcklwd               m3, m3
    pshufd                  m0, m3, q3333
    pshufd                  m1, m3, q2222
    pshufd                  m2, m3, q1111
    pshufd                  m3, m3, q0000
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m0
    mova   [dstq+strideq*1+ 0], m1
    mova   [dstq+strideq*1+16], m1
    mova   [dstq+strideq*2+ 0], m2
    mova   [dstq+strideq*2+16], m2
    mova   [dstq+stride3q + 0], m3
    mova   [dstq+stride3q +16], m3
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jge .loop
    RET

INIT_XMM sse2
cglobal vp9_ipred_h_32x32_16, 3, 5, 4, dst, stride, l, stride3, cnt
    mov                   cntd, 7
    lea               stride3q, [strideq*3]
.loop:
    movh                    m3, [lq+cntq*8]
    punpcklwd               m3, m3
    pshufd                  m0, m3, q3333
    pshufd                  m1, m3, q2222
    pshufd                  m2, m3, q1111
    pshufd                  m3, m3, q0000
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m0
    mova   [dstq+strideq*0+32], m0
    mova   [dstq+strideq*0+48], m0
    mova   [dstq+strideq*1+ 0], m1
    mova   [dstq+strideq*1+16], m1
    mova   [dstq+strideq*1+32], m1
    mova   [dstq+strideq*1+48], m1
    mova   [dstq+strideq*2+ 0], m2
    mova   [dstq+strideq*2+16], m2
    mova   [dstq+strideq*2+32], m2
    mova   [dstq+strideq*2+48], m2
    mova   [dstq+stride3q + 0], m3
    mova   [dstq+stride3q +16], m3
    mova   [dstq+stride3q +32], m3
    mova   [dstq+stride3q +48], m3
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jge .loop
    RET

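; dc: fill the block with the average of both edges, e.g. for 4x4:
; dc = (l[0]+l[1]+l[2]+l[3] + a[0]+a[1]+a[2]+a[3] + 4) >> 3. The
; horizontal sum is done with a pmaddwd against pw_1 (pairwise word add),
; followed by shuffle+add reduction, rounding shift and broadcast.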
INIT_MMX mmxext
cglobal vp9_ipred_dc_4x4_16, 4, 4, 2, dst, stride, l, a
    mova                    m0, [lq]
    paddw                   m0, [aq]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    pmaddwd                 m0, [pw_1]
    pshufw                  m1, m0, q3232
    paddd                   m0, [pd_4]
    paddd                   m0, m1
    psrad                   m0, 3
    pshufw                  m0, m0, q0000
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    RET

INIT_XMM sse2
cglobal vp9_ipred_dc_8x8_16, 4, 4, 2, dst, stride, l, a
    mova                    m0, [lq]
    paddw                   m0, [aq]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    pmaddwd                 m0, [pw_1]
    pshufd                  m1, m0, q3232
    paddd                   m0, m1
    pshufd                  m1, m0, q1111
    paddd                   m0, [pd_8]
    paddd                   m0, m1
    psrad                   m0, 4
    pshuflw                 m0, m0, q0000
    punpcklqdq              m0, m0
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    RET

INIT_XMM sse2
cglobal vp9_ipred_dc_16x16_16, 4, 4, 2, dst, stride, l, a
    mova                    m0, [lq]
    paddw                   m0, [lq+mmsize]
    paddw                   m0, [aq]
    paddw                   m0, [aq+mmsize]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    mov                   cntd, 4
    pmaddwd                 m0, [pw_1]
    pshufd                  m1, m0, q3232
    paddd                   m0, m1
    pshufd                  m1, m0, q1111
    paddd                   m0, [pd_16]
    paddd                   m0, m1
    psrad                   m0, 5
    pshuflw                 m0, m0, q0000
    punpcklqdq              m0, m0
.loop:
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m0
    mova   [dstq+strideq*1+ 0], m0
    mova   [dstq+strideq*1+16], m0
    mova   [dstq+strideq*2+ 0], m0
    mova   [dstq+strideq*2+16], m0
    mova   [dstq+stride3q + 0], m0
    mova   [dstq+stride3q +16], m0
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .loop
    RET

INIT_XMM sse2
cglobal vp9_ipred_dc_32x32_16, 4, 4, 2, dst, stride, l, a
    mova                    m0, [lq+mmsize*0]
    paddw                   m0, [lq+mmsize*1]
    paddw                   m0, [lq+mmsize*2]
    paddw                   m0, [lq+mmsize*3]
    paddw                   m0, [aq+mmsize*0]
    paddw                   m0, [aq+mmsize*1]
    paddw                   m0, [aq+mmsize*2]
    paddw                   m0, [aq+mmsize*3]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    mov                   cntd, 16
    pmaddwd                 m0, [pw_1]
    pshufd                  m1, m0, q3232
    paddd                   m0, m1
    pshufd                  m1, m0, q1111
    paddd                   m0, [pd_32]
    paddd                   m0, m1
    psrad                   m0, 6
    pshuflw                 m0, m0, q0000
    punpcklqdq              m0, m0
.loop:
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m0
    mova   [dstq+strideq*0+32], m0
    mova   [dstq+strideq*0+48], m0
    mova   [dstq+strideq*1+ 0], m0
    mova   [dstq+strideq*1+16], m0
    mova   [dstq+strideq*1+32], m0
    mova   [dstq+strideq*1+48], m0
    lea                   dstq, [dstq+strideq*2]
    dec                   cntd
    jg .loop
    RET

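; dc_top/dc_left: like dc, but averaging a single edge, e.g. for 4x4:
; dc = (e[0]+e[1]+e[2]+e[3] + 2) >> 2. Both variants are instantiated from
; the template below, with %2 selecting the edge pointer (aq or lq).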
%macro DC_1D_FNS 2
INIT_MMX mmxext
cglobal vp9_ipred_dc_%1_4x4_16, 4, 4, 2, dst, stride, l, a
    mova                    m0, [%2]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    pmaddwd                 m0, [pw_1]
    pshufw                  m1, m0, q3232
    paddd                   m0, [pd_2]
    paddd                   m0, m1
    psrad                   m0, 2
    pshufw                  m0, m0, q0000
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    RET

INIT_XMM sse2
cglobal vp9_ipred_dc_%1_8x8_16, 4, 4, 2, dst, stride, l, a
    mova                    m0, [%2]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    pmaddwd                 m0, [pw_1]
    pshufd                  m1, m0, q3232
    paddd                   m0, m1
    pshufd                  m1, m0, q1111
    paddd                   m0, [pd_4]
    paddd                   m0, m1
    psrad                   m0, 3
    pshuflw                 m0, m0, q0000
    punpcklqdq              m0, m0
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    RET

INIT_XMM sse2
cglobal vp9_ipred_dc_%1_16x16_16, 4, 4, 2, dst, stride, l, a
    mova                    m0, [%2]
    paddw                   m0, [%2+mmsize]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    mov                   cntd, 4
    pmaddwd                 m0, [pw_1]
    pshufd                  m1, m0, q3232
    paddd                   m0, m1
    pshufd                  m1, m0, q1111
    paddd                   m0, [pd_8]
    paddd                   m0, m1
    psrad                   m0, 4
    pshuflw                 m0, m0, q0000
    punpcklqdq              m0, m0
.loop:
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m0
    mova   [dstq+strideq*1+ 0], m0
    mova   [dstq+strideq*1+16], m0
    mova   [dstq+strideq*2+ 0], m0
    mova   [dstq+strideq*2+16], m0
    mova   [dstq+stride3q + 0], m0
    mova   [dstq+stride3q +16], m0
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .loop
    RET

INIT_XMM sse2
cglobal vp9_ipred_dc_%1_32x32_16, 4, 4, 2, dst, stride, l, a
    mova                    m0, [%2+mmsize*0]
    paddw                   m0, [%2+mmsize*1]
    paddw                   m0, [%2+mmsize*2]
    paddw                   m0, [%2+mmsize*3]
    DEFINE_ARGS dst, stride, cnt
    mov                   cntd, 16
    pmaddwd                 m0, [pw_1]
    pshufd                  m1, m0, q3232
    paddd                   m0, m1
    pshufd                  m1, m0, q1111
    paddd                   m0, [pd_16]
    paddd                   m0, m1
    psrad                   m0, 5
    pshuflw                 m0, m0, q0000
    punpcklqdq              m0, m0
.loop:
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m0
    mova   [dstq+strideq*0+32], m0
    mova   [dstq+strideq*0+48], m0
    mova   [dstq+strideq*1+ 0], m0
    mova   [dstq+strideq*1+16], m0
    mova   [dstq+strideq*1+32], m0
    mova   [dstq+strideq*1+48], m0
    lea                   dstq, [dstq+strideq*2]
    dec                   cntd
    jg .loop
    RET
%endmacro

DC_1D_FNS top,  aq
DC_1D_FNS left, lq

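; tm (TrueMotion): each output pixel is clip(l[y] + a[x] - tl, 0, max),
; with tl being the top-left pixel (the word at aq-2). above-minus-tl is
; precomputed once per block; every row then adds the broadcast left pixel
; and clamps via pmaxsw/pminsw. The _12 functions reuse the _10 body
; through the .body label, since only the clip maximum (pw_1023 vs
; pw_4095) differs.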
INIT_MMX mmxext
cglobal vp9_ipred_tm_4x4_10, 4, 4, 6, dst, stride, l, a
    mova                    m5, [pw_1023]
.body:
    mova                    m4, [aq]
    mova                    m3, [lq]
    movd                    m0, [aq-4]
    pshufw                  m0, m0, q1111
    psubw                   m4, m0
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    pshufw                  m0, m3, q3333
    pshufw                  m1, m3, q2222
    pshufw                  m2, m3, q1111
    pshufw                  m3, m3, q0000
    paddw                   m0, m4
    paddw                   m1, m4
    paddw                   m2, m4
    paddw                   m3, m4
    pxor                    m4, m4
    pmaxsw                  m0, m4
    pmaxsw                  m1, m4
    pmaxsw                  m2, m4
    pmaxsw                  m3, m4
    pminsw                  m0, m5
    pminsw                  m1, m5
    pminsw                  m2, m5
    pminsw                  m3, m5
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m1
    mova      [dstq+strideq*2], m2
    mova      [dstq+stride3q ], m3
    RET

cglobal vp9_ipred_tm_4x4_12, 4, 4, 6, dst, stride, l, a
    mova                    m5, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_4x4_10 %+ SUFFIX).body

INIT_XMM sse2
cglobal vp9_ipred_tm_8x8_10, 4, 5, 7, dst, stride, l, a
    mova                    m4, [pw_1023]
.body:
    pxor                    m6, m6
    mova                    m5, [aq]
    movd                    m0, [aq-4]
    pshuflw                 m0, m0, q1111
    punpcklqdq              m0, m0
    psubw                   m5, m0
    DEFINE_ARGS dst, stride, l, stride3, cnt
    lea               stride3q, [strideq*3]
    mov                   cntd, 1
.loop:
    movh                    m3, [lq+cntq*8]
    punpcklwd               m3, m3
    pshufd                  m0, m3, q3333
    pshufd                  m1, m3, q2222
    pshufd                  m2, m3, q1111
    pshufd                  m3, m3, q0000
    paddw                   m0, m5
    paddw                   m1, m5
    paddw                   m2, m5
    paddw                   m3, m5
    pmaxsw                  m0, m6
    pmaxsw                  m1, m6
    pmaxsw                  m2, m6
    pmaxsw                  m3, m6
    pminsw                  m0, m4
    pminsw                  m1, m4
    pminsw                  m2, m4
    pminsw                  m3, m4
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m1
    mova      [dstq+strideq*2], m2
    mova      [dstq+stride3q ], m3
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jge .loop
    RET

cglobal vp9_ipred_tm_8x8_12, 4, 5, 7, dst, stride, l, a
    mova                    m4, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_8x8_10 %+ SUFFIX).body

INIT_XMM sse2
cglobal vp9_ipred_tm_16x16_10, 4, 4, 8, dst, stride, l, a
    mova                    m7, [pw_1023]
.body:
    pxor                    m6, m6
    mova                    m4, [aq]
    mova                    m5, [aq+mmsize]
    movd                    m0, [aq-4]
    pshuflw                 m0, m0, q1111
    punpcklqdq              m0, m0
    psubw                   m4, m0
    psubw                   m5, m0
    DEFINE_ARGS dst, stride, l, cnt
    mov                   cntd, 7
.loop:
    movd                    m3, [lq+cntq*4]
    punpcklwd               m3, m3
    pshufd                  m2, m3, q1111
    pshufd                  m3, m3, q0000
    paddw                   m0, m2, m4
    paddw                   m2, m5
    paddw                   m1, m3, m4
    paddw                   m3, m5
    pmaxsw                  m0, m6
    pmaxsw                  m2, m6
    pmaxsw                  m1, m6
    pmaxsw                  m3, m6
    pminsw                  m0, m7
    pminsw                  m2, m7
    pminsw                  m1, m7
    pminsw                  m3, m7
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m2
    mova   [dstq+strideq*1+ 0], m1
    mova   [dstq+strideq*1+16], m3
    lea                   dstq, [dstq+strideq*2]
    dec                   cntd
    jge .loop
    RET

cglobal vp9_ipred_tm_16x16_12, 4, 4, 8, dst, stride, l, a
    mova                    m7, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_16x16_10 %+ SUFFIX).body

INIT_XMM sse2
cglobal vp9_ipred_tm_32x32_10, 4, 4, 10, 32 * -ARCH_X86_32, dst, stride, l, a
    mova                    m0, [pw_1023]
.body:
    pxor                    m1, m1
%if ARCH_X86_64
    SWAP                     0, 8
    SWAP                     1, 9
%define reg_min m9
%define reg_max m8
%else
    mova              [rsp+ 0], m0
    mova              [rsp+16], m1
%define reg_min [rsp+16]
%define reg_max [rsp+ 0]
%endif

    mova                    m4, [aq+mmsize*0]
    mova                    m5, [aq+mmsize*1]
    mova                    m6, [aq+mmsize*2]
    mova                    m7, [aq+mmsize*3]
    movd                    m0, [aq-4]
    pshuflw                 m0, m0, q1111
    punpcklqdq              m0, m0
    psubw                   m4, m0
    psubw                   m5, m0
    psubw                   m6, m0
    psubw                   m7, m0
    DEFINE_ARGS dst, stride, l, cnt
    mov                   cntd, 31
.loop:
    pinsrw                  m3, [lq+cntq*2], 0
    punpcklwd               m3, m3
    pshufd                  m3, m3, q0000
    paddw                   m0, m3, m4
    paddw                   m1, m3, m5
    paddw                   m2, m3, m6
    paddw                   m3, m7
    pmaxsw                  m0, reg_min
    pmaxsw                  m1, reg_min
    pmaxsw                  m2, reg_min
    pmaxsw                  m3, reg_min
    pminsw                  m0, reg_max
    pminsw                  m1, reg_max
    pminsw                  m2, reg_max
    pminsw                  m3, reg_max
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m1
    mova   [dstq+strideq*0+32], m2
    mova   [dstq+strideq*0+48], m3
    add                   dstq, strideq
    dec                   cntd
    jge .loop
    RET

cglobal vp9_ipred_tm_32x32_12, 4, 4, 10, 32 * -ARCH_X86_32, dst, stride, l, a
    mova                    m0, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_32x32_10 %+ SUFFIX).body

; Directional intra prediction functions
;
; in the functions below, 'abcdefgh' refers to above data (sometimes simply
; abbreviated as a[N-M]). 'stuvwxyz' refers to left data (sometimes simply
; abbreviated as l[N-M]). * is top-left data. ABCDEFG or A[N-M] is filtered
; above data, STUVWXYZ or L[N-M] is filtered left data, and # is filtered
; top-left data.

; left=(left+2*center+right+2)>>2
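; implemented as ((left+right)>>1 + center + 1) >> 1 (paddw+psraw, then a
; rounding pavgw); the bit dropped by the truncating first shift can never
; carry through the rounding average, so this matches the 4-tap expression
; above exactly for all valid pixel values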
%macro LOWPASS 3 ; left [dst], center, right
    paddw                  m%1, m%3
    psraw                  m%1, 1
    pavgw                  m%1, m%2
%endmacro

; abcdefgh (src) -> bcdefghh (dst)
; dst/src can be the same register
%macro SHIFT_RIGHT 2-3 [pb_2to15_14_15] ; dst, src, [ssse3_shift_reg]
%if cpuflag(ssse3)
    pshufb                  %1, %2, %3              ; abcdefgh -> bcdefghh
%else
    psrldq                  %1, %2, 2               ; abcdefgh -> bcdefgh.
    pshufhw                 %1, %1, q2210           ; bcdefgh. -> bcdefghh
%endif
%endmacro

; abcdefgh (src) -> bcdefghh (dst1) and cdefghhh (dst2)
%macro SHIFT_RIGHTx2 3-4 [pb_2to15_14_15] ; dst1, dst2, src, [ssse3_shift_reg]
%if cpuflag(ssse3)
    pshufb                  %1, %3, %4              ; abcdefgh -> bcdefghh
    pshufb                  %2, %1, %4              ; bcdefghh -> cdefghhh
%else
    psrldq                  %1, %3, 2               ; abcdefgh -> bcdefgh.
    psrldq                  %2, %3, 4               ; abcdefgh -> cdefgh..
    pshufhw                 %1, %1, q2210           ; bcdefgh. -> bcdefghh
    pshufhw                 %2, %2, q1110           ; cdefgh.. -> cdefghhh
%endif
%endmacro

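; dl (diagonal down-left): only the above edge is used; as can be read off
; the 4x4 version below, dst[y][x] = (a[x+y] + 2*a[x+y+1] + a[x+y+2] + 2) >> 2,
; with the last above pixel replicated past the edge, so every row is the
; previous row shifted left by one pixel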
%macro DL_FUNCS 0
cglobal vp9_ipred_dl_4x4_16, 2, 4, 3, dst, stride, l, a
    movifnidn               aq, amp
    movu                    m1, [aq]                ; abcdefgh
    pshufhw                 m0, m1, q3310           ; abcdefhh
    SHIFT_RIGHT             m1, m1                  ; bcdefghh
    psrldq                  m2, m1, 2               ; cdefghh.
    LOWPASS                  0,  1,  2              ; BCDEFGh.
    pshufd                  m1, m0, q3321           ; DEFGh...
    movh      [dstq+strideq*0], m0
    movh      [dstq+strideq*2], m1
    add                   dstq, strideq
    psrldq                  m0, 2                   ; CDEFGh..
    psrldq                  m1, 2                   ; EFGh....
    movh      [dstq+strideq*0], m0
    movh      [dstq+strideq*2], m1
    RET

cglobal vp9_ipred_dl_8x8_16, 2, 4, 5, dst, stride, l, a
    movifnidn               aq, amp
    mova                    m0, [aq]                ; abcdefgh
%if cpuflag(ssse3)
    mova                    m4, [pb_2to15_14_15]
%endif
    SHIFT_RIGHTx2           m1, m2, m0, m4          ; bcdefghh/cdefghhh
    LOWPASS                  0,  1,  2              ; BCDEFGHh
    shufps                  m1, m0, m2, q3332       ; FGHhhhhh
    shufps                  m3, m0, m1, q2121       ; DEFGHhhh
    DEFINE_ARGS dst, stride, stride5
    lea               stride5q, [strideq*5]

    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*4], m1
    SHIFT_RIGHT             m0, m0, m4              ; CDEFGHhh
    pshuflw                 m1, m1, q3321           ; GHhhhhhh
    pshufd                  m2, m0, q3321           ; EFGHhhhh
    mova      [dstq+strideq*1], m0
    mova      [dstq+stride5q ], m1
    lea                   dstq, [dstq+strideq*2]
    pshuflw                 m1, m1, q3321           ; Hhhhhhhh
    mova      [dstq+strideq*0], m3
    mova      [dstq+strideq*4], m1
    pshuflw                 m1, m1, q3321           ; hhhhhhhh
    mova      [dstq+strideq*1], m2
    mova      [dstq+stride5q ], m1
    RET

cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
    movifnidn               aq, amp
    mova                    m0, [aq]                ; abcdefgh
    mova                    m3, [aq+mmsize]         ; ijklmnop
    PALIGNR                 m1, m3, m0, 2, m4       ; bcdefghi
    PALIGNR                 m2, m3, m0, 4, m4       ; cdefghij
    LOWPASS                  0,  1,  2              ; BCDEFGHI
%if cpuflag(ssse3)
    mova                    m4, [pb_2to15_14_15]
%endif
    SHIFT_RIGHTx2           m2, m1, m3, m4          ; jklmnopp/klmnoppp
    LOWPASS                  1,  2,  3              ; JKLMNOPp
    pshufd                  m2, m2, q3333           ; pppppppp
    DEFINE_ARGS dst, stride, cnt
    mov                   cntd, 8

.loop:
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m1
    mova   [dstq+strideq*8+ 0], m1
    mova   [dstq+strideq*8+16], m2
    add                   dstq, strideq
%if cpuflag(avx)
    vpalignr                m0, m1, m0, 2
%else
    PALIGNR                 m3, m1, m0, 2, m4
    mova                    m0, m3
%endif
    SHIFT_RIGHT             m1, m1, m4
    dec                   cntd
    jg .loop
    RET

cglobal vp9_ipred_dl_32x32_16, 2, 5, 7, dst, stride, l, a
    movifnidn               aq, amp
    mova                    m0, [aq+mmsize*0]       ; abcdefgh
    mova                    m1, [aq+mmsize*1]       ; ijklmnop
    mova                    m2, [aq+mmsize*2]       ; qrstuvwx
    mova                    m3, [aq+mmsize*3]       ; yz012345
    PALIGNR                 m4, m1, m0, 2, m6
    PALIGNR                 m5, m1, m0, 4, m6
    LOWPASS                  0,  4,  5              ; BCDEFGHI
    PALIGNR                 m4, m2, m1, 2, m6
    PALIGNR                 m5, m2, m1, 4, m6
    LOWPASS                  1,  4,  5              ; JKLMNOPQ
    PALIGNR                 m4, m3, m2, 2, m6
    PALIGNR                 m5, m3, m2, 4, m6
    LOWPASS                  2,  4,  5              ; RSTUVWXY
%if cpuflag(ssse3)
    mova                    m6, [pb_2to15_14_15]
%endif
    SHIFT_RIGHTx2           m4, m5, m3, m6
    LOWPASS                  3,  4,  5              ; Z0123455
    pshufd                  m4, m4, q3333           ; 55555555
    DEFINE_ARGS dst, stride, stride8, stride24, cnt
    mov                   cntd, 8
    lea               stride8q, [strideq*8]
    lea              stride24q, [stride8q*3]

.loop:
    mova  [dstq+stride8q*0+ 0], m0
    mova  [dstq+stride8q*0+16], m1
    mova  [dstq+stride8q*0+32], m2
    mova  [dstq+stride8q*0+48], m3
    mova  [dstq+stride8q*1+ 0], m1
    mova  [dstq+stride8q*1+16], m2
    mova  [dstq+stride8q*1+32], m3
    mova  [dstq+stride8q*1+48], m4
    mova  [dstq+stride8q*2+ 0], m2
    mova  [dstq+stride8q*2+16], m3
    mova  [dstq+stride8q*2+32], m4
    mova  [dstq+stride8q*2+48], m4
    mova  [dstq+stride24q + 0], m3
    mova  [dstq+stride24q +16], m4
    mova  [dstq+stride24q +32], m4
    mova  [dstq+stride24q +48], m4
    add                   dstq, strideq
%if cpuflag(avx)
    vpalignr                m0, m1, m0, 2
    vpalignr                m1, m2, m1, 2
    vpalignr                m2, m3, m2, 2
%else
    PALIGNR                 m5, m1, m0, 2, m6
    mova                    m0, m5
    PALIGNR                 m5, m2, m1, 2, m6
    mova                    m1, m5
    PALIGNR                 m5, m3, m2, 2, m6
    mova                    m2, m5
%endif
    SHIFT_RIGHT             m3, m3, m6
    dec                   cntd
    jg .loop
    RET
%endmacro

INIT_XMM sse2
DL_FUNCS
INIT_XMM ssse3
DL_FUNCS
INIT_XMM avx
DL_FUNCS

%if HAVE_AVX2_EXTERNAL
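; note that on AVX2, vpalignr shifts within each 128-bit lane only, so the
; full 32-byte shifts below are built from a vperm2i128 (supplying the
; bytes that cross the lane boundary) followed by a vpalignr against the
; unshifted vector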
INIT_YMM avx2
cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
    movifnidn               aq, amp
    mova                    m0, [aq]                   ; abcdefghijklmnop
    vpbroadcastw           xm1, [aq+30]                ; pppppppp
    vperm2i128              m2, m0, m1, q0201          ; ijklmnoppppppppp
    vpalignr                m3, m2, m0, 2              ; bcdefghijklmnopp
    vpalignr                m4, m2, m0, 4              ; cdefghijklmnoppp
    LOWPASS                  0,  3,  4                 ; BCDEFGHIJKLMNOPp
    vperm2i128              m2, m0, m1, q0201          ; JKLMNOPppppppppp
    DEFINE_ARGS dst, stride, stride3, cnt
    mov                   cntd, 2
    lea               stride3q, [strideq*3]

.loop:
    mova      [dstq+strideq*0], m0
    vpalignr                m3, m2, m0, 2
    vpalignr                m4, m2, m0, 4
    mova      [dstq+strideq*1], m3
    mova      [dstq+strideq*2], m4
    vpalignr                m3, m2, m0, 6
    vpalignr                m4, m2, m0, 8
    mova      [dstq+stride3q ], m3
    lea                   dstq, [dstq+strideq*4]
    mova      [dstq+strideq*0], m4
    vpalignr                m3, m2, m0, 10
    vpalignr                m4, m2, m0, 12
    mova      [dstq+strideq*1], m3
    mova      [dstq+strideq*2], m4
    vpalignr                m3, m2, m0, 14
    mova      [dstq+stride3q ], m3
    lea                   dstq, [dstq+strideq*4]
    mova                    m0, m2
    vperm2i128              m2, m2, m2, q0101          ; pppppppppppppppp
    dec                   cntd
    jg .loop
    RET

cglobal vp9_ipred_dl_32x32_16, 2, 6, 7, dst, stride, l, a
    movifnidn               aq, amp
    mova                    m0, [aq+mmsize*0+ 0]       ; abcdefghijklmnop
    mova                    m1, [aq+mmsize*1+ 0]       ; qrstuvwxyz012345
    vpbroadcastw           xm4, [aq+mmsize*1+30]       ; 55555555
    vperm2i128              m5, m0, m1, q0201          ; ijklmnopqrstuvwx
    vpalignr                m2, m5, m0, 2              ; bcdefghijklmnopq
    vpalignr                m3, m5, m0, 4              ; cdefghijklmnopqr
    LOWPASS                  0,  2,  3                 ; BCDEFGHIJKLMNOPQ
    vperm2i128              m5, m1, m4, q0201          ; yz01234555555555
    vpalignr                m2, m5, m1, 2              ; rstuvwxyz0123455
    vpalignr                m3, m5, m1, 4              ; stuvwxyz01234555
    LOWPASS                  1,  2,  3                 ; RSTUVWXYZ......5
    vperm2i128              m2, m1, m4, q0201          ; Z......555555555
    vperm2i128              m5, m0, m1, q0201          ; JKLMNOPQRSTUVWXY
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    mov                   cntd, 4

.loop:
    mova   [dstq+strideq*0 + 0], m0
    mova   [dstq+strideq*0 +32], m1
    vpalignr                 m3, m5, m0, 2
    vpalignr                 m4, m2, m1, 2
    mova   [dstq+strideq*1 + 0], m3
    mova   [dstq+strideq*1 +32], m4
    vpalignr                 m3, m5, m0, 4
    vpalignr                 m4, m2, m1, 4
    mova   [dstq+strideq*2 + 0], m3
    mova   [dstq+strideq*2 +32], m4
    vpalignr                 m3, m5, m0, 6
    vpalignr                 m4, m2, m1, 6
    mova   [dstq+stride3q*1+ 0], m3
    mova   [dstq+stride3q*1+32], m4
    lea                    dstq, [dstq+strideq*4]
    vpalignr                 m3, m5, m0, 8
    vpalignr                 m4, m2, m1, 8
    mova   [dstq+strideq*0 + 0], m3
    mova   [dstq+strideq*0 +32], m4
    vpalignr                 m3, m5, m0, 10
    vpalignr                 m4, m2, m1, 10
    mova   [dstq+strideq*1 + 0], m3
    mova   [dstq+strideq*1 +32], m4
    vpalignr                 m3, m5, m0, 12
    vpalignr                 m4, m2, m1, 12
    mova   [dstq+strideq*2 + 0], m3
    mova   [dstq+strideq*2 +32], m4
    vpalignr                 m3, m5, m0, 14
    vpalignr                 m4, m2, m1, 14
    mova   [dstq+stride3q*1+ 0], m3
    mova   [dstq+stride3q*1+32], m4
    vpalignr                 m3, m5, m0, 16
    vpalignr                 m4, m2, m1, 16
    vperm2i128               m5, m3, m4, q0201
    vperm2i128               m2, m4, m4, q0101
    mova                     m0, m3
    mova                     m1, m4
    lea                    dstq, [dstq+strideq*4]
    dec                    cntd
    jg .loop
    RET
%endif

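; dr (diagonal down-right): both edges and the top-left pixel contribute;
; the filtered left, top-left and above pixels form a single diagonal
; sequence L[..]#A[..], and each output row reads consecutive pixels from
; it, starting one step further back per row. The macro argument is the
; number of mmsize-sized stack slots the 32x32 function spills on x86-32.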
%macro DR_FUNCS 1 ; stack_mem_for_32x32_32bit_function
cglobal vp9_ipred_dr_4x4_16, 4, 4, 3, dst, stride, l, a
    movh                    m0, [lq]                ; wxyz....
    movhps                  m0, [aq-2]              ; wxyz*abc
    movd                    m1, [aq+6]              ; d.......
    PALIGNR                 m1, m0, 2, m2           ; xyz*abcd
    psrldq                  m2, m1, 2               ; yz*abcd.
    LOWPASS                  0, 1, 2                ; XYZ#ABC.
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]

    movh      [dstq+stride3q ], m0
    psrldq                  m0, 2                   ; YZ#ABC..
    movh      [dstq+strideq*2], m0
    psrldq                  m0, 2                   ; Z#ABC...
    movh      [dstq+strideq*1], m0
    psrldq                  m0, 2                   ; #ABC....
    movh      [dstq+strideq*0], m0
    RET

cglobal vp9_ipred_dr_8x8_16, 4, 4, 5, dst, stride, l, a
    mova                    m0, [lq]                ; stuvwxyz
    movu                    m1, [aq-2]              ; *abcdefg
    mova                    m2, [aq]                ; abcdefgh
    psrldq                  m3, m2, 2               ; bcdefgh.
    LOWPASS                  3,  2, 1               ; ABCDEFG.
    PALIGNR                 m1, m0, 2, m4           ; tuvwxyz*
    PALIGNR                 m2, m1, 2, m4           ; uvwxyz*a
    LOWPASS                  2,  1, 0               ; TUVWXYZ#
    DEFINE_ARGS dst, stride, dst4, stride3
    lea               stride3q, [strideq*3]
    lea                  dst4q, [dstq+strideq*4]

    movhps [dstq +stride3q +0], m2
    movh   [dstq +stride3q +8], m3
    mova   [dst4q+stride3q +0], m2
    PALIGNR                 m1, m3, m2, 2, m0
    psrldq                  m3, 2
    movhps [dstq +strideq*2+0], m1
    movh   [dstq +strideq*2+8], m3
    mova   [dst4q+strideq*2+0], m1
    PALIGNR                 m2, m3, m1, 2, m0
    psrldq                  m3, 2
    movhps [dstq +strideq*1+0], m2
    movh   [dstq +strideq*1+8], m3
    mova   [dst4q+strideq*1+0], m2
    PALIGNR                 m1, m3, m2, 2, m0
    psrldq                  m3, 2
    movhps [dstq +strideq*0+0], m1
    movh   [dstq +strideq*0+8], m3
    mova   [dst4q+strideq*0+0], m1
    RET

cglobal vp9_ipred_dr_16x16_16, 4, 4, 7, dst, stride, l, a
    mova                    m0, [lq]                ; klmnopqr
    mova                    m1, [lq+mmsize]         ; stuvwxyz
    movu                    m2, [aq-2]              ; *abcdefg
    movu                    m3, [aq+mmsize-2]       ; hijklmno
    mova                    m4, [aq]                ; abcdefgh
    mova                    m5, [aq+mmsize]         ; ijklmnop
    psrldq                  m6, m5, 2               ; jklmnop.
    LOWPASS                  6,  5, 3               ; IJKLMNO.
    PALIGNR                 m5, m4, 2, m3           ; bcdefghi
    LOWPASS                  5,  4, 2               ; ABCDEFGH
    PALIGNR                 m2, m1, 2, m3           ; tuvwxyz*
    PALIGNR                 m4, m2, 2, m3           ; uvwxyz*a
    LOWPASS                  4,  2, 1               ; TUVWXYZ#
    PALIGNR                 m1, m0, 2, m3           ; lmnopqrs
    PALIGNR                 m2, m1, 2, m3           ; mnopqrst
    LOWPASS                  2, 1, 0                ; LMNOPQRS
    DEFINE_ARGS dst, stride, dst8, cnt
    lea                  dst8q, [dstq+strideq*8]
    mov                   cntd, 8

.loop:
    sub                  dst8q, strideq
    mova  [dst8q+strideq*0+ 0], m4
    mova  [dst8q+strideq*0+16], m5
    mova  [dst8q+strideq*8+ 0], m2
    mova  [dst8q+strideq*8+16], m4
%if cpuflag(avx)
    vpalignr                m2, m4, m2, 2
    vpalignr                m4, m5, m4, 2
    vpalignr                m5, m6, m5, 2
%else
    PALIGNR                 m0, m4, m2, 2, m1
    mova                    m2, m0
    PALIGNR                 m0, m5, m4, 2, m1
    mova                    m4, m0
    PALIGNR                 m0, m6, m5, 2, m1
    mova                    m5, m0
%endif
    psrldq                  m6, 2
    dec                   cntd
    jg .loop
    RET

cglobal vp9_ipred_dr_32x32_16, 4, 5, 10 + notcpuflag(ssse3), \
                               %1 * ARCH_X86_32 * -mmsize, dst, stride, l, a
    mova                    m0, [aq+mmsize*3]       ; a[24-31]
    movu                    m1, [aq+mmsize*3-2]     ; a[23-30]
    psrldq                  m2, m0, 2               ; a[25-31].
    LOWPASS                  2,  0, 1               ; A[24-30].
    mova                    m1, [aq+mmsize*2]       ; a[16-23]
    movu                    m3, [aq+mmsize*2-2]     ; a[15-22]
    PALIGNR                 m0, m1, 2, m4           ; a[17-24]
    LOWPASS                  0,  1, 3               ; A[16-23]
    mova                    m3, [aq+mmsize*1]       ; a[8-15]
    movu                    m4, [aq+mmsize*1-2]     ; a[7-14]
    PALIGNR                 m1, m3, 2, m5           ; a[9-16]
    LOWPASS                  1,  3, 4               ; A[8-15]
    mova                    m4, [aq+mmsize*0]       ; a[0-7]
    movu                    m5, [aq+mmsize*0-2]     ; *a[0-6]
    PALIGNR                 m3, m4, 2, m6           ; a[1-8]
    LOWPASS                  3,  4, 5               ; A[0-7]
    SCRATCH                  1,  8, rsp+0*mmsize
    SCRATCH                  3,  9, rsp+1*mmsize
%if notcpuflag(ssse3)
    SCRATCH                  0, 10, rsp+2*mmsize
%endif
    mova                    m6, [lq+mmsize*3]       ; l[24-31]
    PALIGNR                 m5, m6, 2, m0           ; l[25-31]*
    PALIGNR                 m4, m5, 2, m0           ; l[26-31]*a
    LOWPASS                  4,  5, 6               ; L[25-31]#
    mova                    m7, [lq+mmsize*2]       ; l[16-23]
    PALIGNR                 m6, m7, 2, m0           ; l[17-24]
    PALIGNR                 m5, m6, 2, m0           ; l[18-25]
    LOWPASS                  5,  6, 7               ; L[17-24]
    mova                    m1, [lq+mmsize*1]       ; l[8-15]
    PALIGNR                 m7, m1, 2, m0           ; l[9-16]
    PALIGNR                 m6, m7, 2, m0           ; l[10-17]
    LOWPASS                  6,  7, 1               ; L[9-16]
    mova                    m3, [lq+mmsize*0]       ; l[0-7]
    PALIGNR                 m1, m3, 2, m0           ; l[1-8]
    PALIGNR                 m7, m1, 2, m0           ; l[2-9]
    LOWPASS                  7,  1, 3               ; L[1-8]
%if cpuflag(ssse3)
%if cpuflag(avx)
    UNSCRATCH                1,  8, rsp+0*mmsize
%endif
    UNSCRATCH                3,  9, rsp+1*mmsize
%else
    UNSCRATCH                0, 10, rsp+2*mmsize
%endif
    DEFINE_ARGS dst8, stride, stride8, stride24, cnt
    lea               stride8q, [strideq*8]
    lea              stride24q, [stride8q*3]
    lea                  dst8q, [dst8q+strideq*8]
    mov                   cntd, 8

.loop:
    sub                  dst8q, strideq
%if notcpuflag(avx)
    UNSCRATCH                1,  8, rsp+0*mmsize
%if notcpuflag(ssse3)
    UNSCRATCH                3,  9, rsp+1*mmsize
%endif
%endif
    mova [dst8q+stride8q*0+ 0], m4
    mova [dst8q+stride8q*0+16], m3
    mova [dst8q+stride8q*0+32], m1
    mova [dst8q+stride8q*0+48], m0
    mova [dst8q+stride8q*1+ 0], m5
    mova [dst8q+stride8q*1+16], m4
    mova [dst8q+stride8q*1+32], m3
    mova [dst8q+stride8q*1+48], m1
    mova [dst8q+stride8q*2+ 0], m6
    mova [dst8q+stride8q*2+16], m5
    mova [dst8q+stride8q*2+32], m4
    mova [dst8q+stride8q*2+48], m3
    mova [dst8q+stride24q + 0], m7
    mova [dst8q+stride24q +16], m6
    mova [dst8q+stride24q +32], m5
    mova [dst8q+stride24q +48], m4
%if cpuflag(avx)
    vpalignr                m7, m6, m7, 2
    vpalignr                m6, m5, m6, 2
    vpalignr                m5, m4, m5, 2
    vpalignr                m4, m3, m4, 2
    vpalignr                m3, m1, m3, 2
    vpalignr                m1, m0, m1, 2
    vpalignr                m0, m2, m0, 2
%else
    SCRATCH                  2,  8, rsp+0*mmsize
%if notcpuflag(ssse3)
    SCRATCH                  0,  9, rsp+1*mmsize
%endif
    PALIGNR                 m2, m6, m7, 2, m0
    mova                    m7, m2
    PALIGNR                 m2, m5, m6, 2, m0
    mova                    m6, m2
    PALIGNR                 m2, m4, m5, 2, m0
    mova                    m5, m2
    PALIGNR                 m2, m3, m4, 2, m0
    mova                    m4, m2
    PALIGNR                 m2, m1, m3, 2, m0
    mova                    m3, m2
%if notcpuflag(ssse3)
    UNSCRATCH                0,  9, rsp+1*mmsize
    SCRATCH                  3,  9, rsp+1*mmsize
%endif
    PALIGNR                 m2, m0, m1, 2, m3
    mova                    m1, m2
    UNSCRATCH                2,  8, rsp+0*mmsize
    SCRATCH                  1,  8, rsp+0*mmsize
    PALIGNR                 m1, m2, m0, 2, m3
    mova                    m0, m1
%endif
    psrldq                  m2, 2
    dec                   cntd
    jg .loop
    RET
%endmacro

INIT_XMM sse2
DR_FUNCS 3
INIT_XMM ssse3
DR_FUNCS 2
INIT_XMM avx
DR_FUNCS 2

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
cglobal vp9_ipred_dr_16x16_16, 4, 5, 6, dst, stride, l, a
    mova                    m0, [lq]                   ; klmnopqrstuvwxyz
    movu                    m1, [aq-2]                 ; *abcdefghijklmno
    mova                    m2, [aq]                   ; abcdefghijklmnop
    vperm2i128              m4, m2, m2, q2001          ; ijklmnop........
    vpalignr                m5, m4, m2, 2              ; bcdefghijklmnop.
    vperm2i128              m3, m0, m1, q0201          ; stuvwxyz*abcdefg
    LOWPASS                  1,  2,  5                 ; ABCDEFGHIJKLMNO.
    vpalignr                m4, m3, m0, 2              ; lmnopqrstuvwxyz*
    vpalignr                m5, m3, m0, 4              ; mnopqrstuvwxyz*a
    LOWPASS                  0,  4,  5                 ; LMNOPQRSTUVWXYZ#
    vperm2i128              m5, m0, m1, q0201          ; TUVWXYZ#ABCDEFGH
    DEFINE_ARGS dst, stride, stride3, stride5, dst3
    lea                  dst3q, [dstq+strideq*4]
    lea               stride3q, [strideq*3]
    lea               stride5q, [stride3q+strideq*2]

    vpalignr                m3, m5, m0, 2
    vpalignr                m4, m1, m5, 2
    mova    [dst3q+stride5q*2], m3                     ; 14
    mova    [ dstq+stride3q*2], m4                     ; 6
    vpalignr                m3, m5, m0, 4
    vpalignr                m4, m1, m5, 4
    sub                  dst3q, strideq
    mova    [dst3q+stride5q*2], m3                     ; 13
    mova    [dst3q+strideq*2 ], m4                     ; 5
    mova    [dst3q+stride3q*4], m0                     ; 15
    vpalignr                m3, m5, m0, 6
    vpalignr                m4, m1, m5, 6
    mova     [dstq+stride3q*4], m3                     ; 12
    mova     [dst3q+strideq*1], m4                     ; 4
    vpalignr                m3, m5, m0, 8
    vpalignr                m4, m1, m5, 8
    mova     [dst3q+strideq*8], m3                     ; 11
    mova     [dst3q+strideq*0], m4                     ; 3
    vpalignr                m3, m5, m0, 10
    vpalignr                m4, m1, m5, 10
    mova     [dstq+stride5q*2], m3                     ; 10
    mova     [dstq+strideq*2 ], m4                     ; 2
    vpalignr                m3, m5, m0, 12
    vpalignr                m4, m1, m5, 12
    mova    [dst3q+stride3q*2], m3                     ; 9
    mova     [dstq+strideq*1 ], m4                     ; 1
    vpalignr                m3, m5, m0, 14
    vpalignr                m4, m1, m5, 14
    mova      [dstq+strideq*8], m3                     ; 8
    mova      [dstq+strideq*0], m4                     ; 0
    mova     [dst3q+strideq*4], m5                     ; 7
    RET

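; vl (vertical-left): even rows hold the 2-tap average of the above edge
; (pavgw), odd rows the 3-tap lowpass, with each row pair shifted left by
; one pixel relative to the previous one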
cglobal vp9_ipred_vl_16x16_16, 4, 5, 7, dst, stride, l, a
    movifnidn               aq, amp
    mova                    m0, [aq]                   ; abcdefghijklmnop
    vpbroadcastw           xm1, [aq+30]                ; pppppppp
    vperm2i128              m2, m0, m1, q0201          ; ijklmnoppppppppp
    vpalignr                m3, m2, m0, 2              ; bcdefghijklmnopp
    vperm2i128              m4, m3, m1, q0201          ; jklmnopppppppppp
    vpalignr                m5, m2, m0, 4              ; cdefghijklmnoppp
    vperm2i128              m6, m5, m1, q0201          ; klmnoppppppppppp
    LOWPASS                  5,  3,  0                 ; BCDEFGHIJKLMNOPP
    LOWPASS                  6,  4,  2                 ; JKLMNOPPPPPPPPPP
    pavgw                   m3, m0                     ; abcdefghijklmnop
    pavgw                   m4, m2                     ; ijklmnoppppppppp
    DEFINE_ARGS dst, stride, stride3, stride5, dst4
    lea                  dst4q, [dstq+strideq*4]
    lea               stride3q, [strideq*3]
    lea               stride5q, [stride3q+strideq*2]

    mova      [dstq+strideq*0], m3                     ; 0  abcdefghijklmnop
    mova      [dstq+strideq*1], m5                     ; 1  BCDEFGHIJKLMNOPP
    vpalignr                m0, m4, m3, 2
    vpalignr                m1, m6, m5, 2
    mova     [dstq+strideq*2 ], m0                     ; 2  bcdefghijklmnopp
    mova     [dstq+stride3q*1], m1                     ; 3  CDEFGHIJKLMNOPPP
    vpalignr                m0, m4, m3, 4
    vpalignr                m1, m6, m5, 4
    mova     [dst4q+strideq*0], m0                     ; 4  cdefghijklmnoppp
    mova     [dstq+stride5q*1], m1                     ; 5  DEFGHIJKLMNOPPPP
    vpalignr                m0, m4, m3, 6
    vpalignr                m1, m6, m5, 6
    mova    [ dstq+stride3q*2], m0                     ; 6  defghijklmnopppp
    mova    [dst4q+stride3q*1], m1                     ; 7  EFGHIJKLMNOPPPPP
    vpalignr                m0, m4, m3, 8
    vpalignr                m1, m6, m5, 8
    mova    [  dstq+strideq*8], m0                     ; 8  efghijklmnoppppp
    mova    [dst4q+stride5q*1], m1                     ; 9  FGHIJKLMNOPPPPPP
    vpalignr                m0, m4, m3, 10
    mova     [dstq+stride5q*2], m0                     ; 10 fghijklmnopppppp
    vpalignr                m0, m4, m3, 12
    mova     [dst4q+strideq*8], m0                     ; 12 ghijklmnoppppppp
    vpalignr                m0, m4, m3, 14
    mova    [dst4q+stride5q*2], m0                     ; 14 hijklmnopppppppp
    sub                  dst4q, strideq
    vpalignr                m1, m6, m5, 10
    mova     [dst4q+strideq*8], m1                     ; 11 GHIJKLMNOPPPPPPP
    vpalignr                m1, m6, m5, 12
    mova    [dst4q+stride5q*2], m1                     ; 13 HIJKLMNOPPPPPPPP
    vpalignr                m1, m6, m5, 14
    mova    [dst4q+stride3q*4], m1                     ; 15 IJKLMNOPPPPPPPPP
    RET

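; hd (horizontal-down): pairs of 2-tap/3-tap filtered left pixels (kL, lM,
; ...) are interleaved word-wise and concatenated with the filtered above
; row; each output row then starts one interleaved pair further back in
; that sequence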
cglobal vp9_ipred_hd_16x16_16, 4, 5, 7, dst, stride, l, a
    movu                    m0, [aq-2]                 ; *abcdefghijklmno
    mova                    m1, [lq]                   ; klmnopqrstuvwxyz
    vperm2i128              m2, m1, m0, q0201          ; stuvwxyz*abcdefg
    vpalignr                m3, m2, m1, 2              ; lmnopqrstuvwxyz*
    vpalignr                m4, m2, m1, 4              ; mnopqrstuvwxyz*a
    LOWPASS                  4,  3,  1                 ; LMNOPQRSTUVWXYZ#
    pavgw                   m3, m1                     ; klmnopqrstuvwxyz
    mova                    m1, [aq]                   ; abcdefghijklmnop
    movu                    m2, [aq+2]                 ; bcdefghijklmnop.
    LOWPASS                  2,  1,  0                 ; ABCDEFGHIJKLMNO.
    vpunpcklwd              m0, m3, m4                 ; kLlMmNnOsTtUuVvW
    vpunpckhwd              m1, m3, m4                 ; oPpQqRrSwXxYyZz#
    vperm2i128              m3, m1, m0, q0002          ; kLlMmNnOoPpQqRrS
    vperm2i128              m4, m0, m1, q0301          ; sTtUuVvWwXxYyZz#
    vperm2i128              m0, m4, m2, q0201          ; wXxYyZz#ABCDEFGH
    vperm2i128              m1, m3, m4, q0201          ; oPpQqRrSsTtUuVvW
    DEFINE_ARGS dst, stride, stride3, stride5, dst5
    lea               stride3q, [strideq*3]
    lea               stride5q, [stride3q+strideq*2]
    lea                  dst5q, [dstq+stride5q]

    mova    [dst5q+stride5q*2], m3                     ; 15 kLlMmNnOoPpQqRrS
    mova    [dst5q+stride3q*2], m1                     ; 11 oPpQqRrSsTtUuVvW
    mova     [dst5q+strideq*2], m4                     ; 7  sTtUuVvWwXxYyZz#
    mova     [dstq+stride3q*1], m0                     ; 3  wXxYyZz#ABCDEFGH
    vpalignr                m5, m4, m1, 4
    mova     [dstq+stride5q*2], m5                     ; 10 pQqRrSsTtUuVvWwX
    vpalignr                m5, m0, m4, 4
    vpalignr                m6, m2, m0, 4
    mova     [dstq+stride3q*2], m5                     ; 6  tUuVvWwXxYyZz#AB
    mova      [dstq+strideq*2], m6                     ; 2  xYyZz#ABCDEFGHIJ
    vpalignr                m5, m4, m1, 8
    mova     [dst5q+strideq*4], m5                     ; 9  qRrSsTtUuVvWwXxY
    vpalignr                m5, m0, m4, 8
    vpalignr                m6, m2, m0, 8
    mova     [dstq+stride5q*1], m5                     ; 5  uVvWwXxYyZz#ABCD
    mova      [dstq+strideq*1], m6                     ; 1  yZz#ABCDEFGHIJKL
    vpalignr                m5, m1, m3, 12
    vpalignr                m6, m4, m1, 12
    mova     [dstq+stride3q*4], m5                     ; 12 nOoPpQqRrSsTtUuV
    mova      [dst5q+stride3q], m6                     ; 8  rSsTtUuVvWwXxYyZ
    vpalignr                m5, m0, m4, 12
    vpalignr                m6, m2, m0, 12
    mova      [dstq+strideq*4], m5                     ; 4  vWwXxYyZz#ABCDEF
    mova      [dstq+strideq*0], m6                     ; 0  z#ABCDEFGHIJKLMN
    sub                  dst5q, strideq
    vpalignr                m5, m1, m3, 4
    mova    [dst5q+stride5q*2], m5                     ; 14 lMmNnOoPpQqRrSsT
    sub                  dst5q, strideq
    vpalignr                m5, m1, m3, 8
    mova    [dst5q+stride5q*2], m5                     ; 13 mNnOoPpQqRrSsTtU
    RET

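; the AVX2 32x32 version keeps the whole diagonal in registers (up to m9),
; so it is only available with the 16-register x86-64 register file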
%if ARCH_X86_64
cglobal vp9_ipred_dr_32x32_16, 4, 7, 10, dst, stride, l, a
    mova                    m0, [lq+mmsize*0+0]        ; l[0-15]
    mova                    m1, [lq+mmsize*1+0]        ; l[16-31]
    movu                    m2, [aq+mmsize*0-2]        ; *abcdefghijklmno
    mova                    m3, [aq+mmsize*0+0]        ; abcdefghijklmnop
    mova                    m4, [aq+mmsize*1+0]        ; qrstuvwxyz012345
    vperm2i128              m5, m0, m1, q0201          ; lmnopqrstuvwxyz0
    vpalignr                m6, m5, m0, 2              ; mnopqrstuvwxyz01
    vpalignr                m7, m5, m0, 4              ; nopqrstuvwxyz012
    LOWPASS                  0,  6,  7                 ; L[0-15]
    vperm2i128              m7, m1, m2, q0201          ; stuvwxyz*abcdefg
    vpalignr                m5, m7, m1, 2              ; lmnopqrstuvwxyz*
    vpalignr                m6, m7, m1, 4              ; mnopqrstuvwxyz*a
    LOWPASS                  1,  5,  6                 ; L[16-31]#
    vperm2i128              m5, m3, m4, q0201          ; ijklmnopqrstuvwx
    vpalignr                m6, m5, m3, 2              ; bcdefghijklmnopq
    LOWPASS                  2,  3,  6                 ; A[0-15]
    movu                    m3, [aq+mmsize*1-2]        ; pqrstuvwxyz01234
    vperm2i128              m6, m4, m4, q2001          ; yz012345........
    vpalignr                m7, m6, m4, 2              ; rstuvwxyz012345.
    LOWPASS                  3,  4,  7                 ; A[16-31].
    vperm2i128              m4, m1, m2, q0201          ; TUVWXYZ#ABCDEFGH
    vperm2i128              m5, m0, m1, q0201          ; L[8-15]L[16-23]
    vperm2i128              m8, m2, m3, q0201          ; IJKLMNOPQRSTUVWX
    DEFINE_ARGS dst8, stride, stride3, stride7, stride5, dst24, cnt
    lea               stride3q, [strideq*3]
    lea               stride5q, [stride3q+strideq*2]
    lea               stride7q, [strideq*4+stride3q]
    lea                 dst24q, [dst8q+stride3q*8]
    lea                  dst8q, [dst8q+strideq*8]
    mov                   cntd, 2

.loop:
    mova  [dst24q+stride7q+0 ], m0                     ; 31 23 15 7
    mova  [dst24q+stride7q+32], m1
    mova    [dst8q+stride7q+0], m1
    mova   [dst8q+stride7q+32], m2
    vpalignr                m6, m4, m1, 2
    vpalignr                m7, m5, m0, 2
    vpalignr                m9, m8, m2, 2
    mova [dst24q+stride3q*2+0], m7                     ; 30 22 14 6
    mova [dst24q+stride3q*2+32], m6
    mova  [dst8q+stride3q*2+0], m6
    mova [dst8q+stride3q*2+32], m9
    vpalignr                m6, m4, m1, 4
    vpalignr                m7, m5, m0, 4
    vpalignr                m9, m8, m2, 4
    mova   [dst24q+stride5q+0], m7                     ; 29 21 13 5
    mova  [dst24q+stride5q+32], m6
    mova    [dst8q+stride5q+0], m6
    mova   [dst8q+stride5q+32], m9
    vpalignr                m6, m4, m1, 6
    vpalignr                m7, m5, m0, 6
    vpalignr                m9, m8, m2, 6
    mova [dst24q+strideq*4+0 ], m7                     ; 28 20 12 4
    mova [dst24q+strideq*4+32], m6
    mova   [dst8q+strideq*4+0], m6
    mova  [dst8q+strideq*4+32], m9
    vpalignr                m6, m4, m1, 8
    vpalignr                m7, m5, m0, 8
    vpalignr                m9, m8, m2, 8
    mova  [dst24q+stride3q+0 ], m7                     ; 27 19 11 3
    mova  [dst24q+stride3q+32], m6
    mova    [dst8q+stride3q+0], m6
    mova   [dst8q+stride3q+32], m9
    vpalignr                m6, m4, m1, 10
    vpalignr                m7, m5, m0, 10
    vpalignr                m9, m8, m2, 10
    mova [dst24q+strideq*2+0 ], m7                     ; 26 18 10 2
    mova [dst24q+strideq*2+32], m6
    mova   [dst8q+strideq*2+0], m6
    mova  [dst8q+strideq*2+32], m9
    vpalignr                m6, m4, m1, 12
    vpalignr                m7, m5, m0, 12
    vpalignr                m9, m8, m2, 12
    mova   [dst24q+strideq+0 ], m7                     ; 25 17 9 1
    mova   [dst24q+strideq+32], m6
    mova     [dst8q+strideq+0], m6
    mova    [dst8q+strideq+32], m9
    vpalignr                m6, m4, m1, 14
    vpalignr                m7, m5, m0, 14
    vpalignr                m9, m8, m2, 14
    mova [dst24q+strideq*0+0 ], m7                     ; 24 16 8 0
    mova [dst24q+strideq*0+32], m6
    mova   [dst8q+strideq*0+0], m6
    mova  [dst8q+strideq*0+32], m9
    mova                    m0, m5
    mova                    m5, m1
    mova                    m1, m4
    mova                    m4, m2
    mova                    m2, m8
    mova                    m8, m3
    sub                 dst24q, stride7q
    sub                 dst24q, strideq
    sub                  dst8q, stride7q
    sub                  dst8q, strideq
    dec                   cntd
    jg .loop
    RET
%endif
%endif

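; Vertical-left (VL) prediction uses the top edge only: even output rows hold
; the 2-tap averages A[x] = (a[x] + a[x+1] + 1) >> 1 (pavgw), odd rows the
; 3-tap filtered values (a[x] + 2*a[x+1] + a[x+2] + 2) >> 2 (LOWPASS), and
; every following row pair is the previous one advanced by one pixel. The
; macro argument sizes the stack scratch area the 32x32 function needs on
; x86-32 (in mmsize units).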
%macro VL_FUNCS 1 ; stack_mem_for_32x32_32bit_function
cglobal vp9_ipred_vl_4x4_16, 2, 4, 3, dst, stride, l, a
    movifnidn               aq, amp
    movu                    m0, [aq]                ; abcdefgh
    psrldq                  m1, m0, 2               ; bcdefgh.
    psrldq                  m2, m0, 4               ; cdefgh..
    LOWPASS                  2,  1, 0               ; BCDEFGH.
    pavgw                   m1, m0                  ; ABCDEFG.
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]

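    ; rows 0/1 are the average and 3-tap vectors; rows 2/3 reuse them shifted
    ; along by one pixel (2 bytes)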
    movh      [dstq+strideq*0], m1
    movh      [dstq+strideq*1], m2
    psrldq                  m1, 2
    psrldq                  m2, 2
    movh      [dstq+strideq*2], m1
    movh      [dstq+stride3q ], m2
    RET

cglobal vp9_ipred_vl_8x8_16, 2, 4, 4, dst, stride, l, a
    movifnidn               aq, amp
    mova                    m0, [aq]                ; abcdefgh
%if cpuflag(ssse3)
    mova                    m3, [pb_2to15_14_15]
%endif
    SHIFT_RIGHTx2           m1, m2, m0, m3          ; bcdefghh/cdefghhh
    LOWPASS                  2,  1, 0               ; BCDEFGHh
    pavgw                   m1, m0                  ; ABCDEFGh
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]

    mova      [dstq+strideq*0], m1
    mova      [dstq+strideq*1], m2
    SHIFT_RIGHT             m1, m1, m3
    SHIFT_RIGHT             m2, m2, m3
    mova      [dstq+strideq*2], m1
    mova      [dstq+stride3q ], m2
    lea                   dstq, [dstq+strideq*4]
    SHIFT_RIGHT             m1, m1, m3
    SHIFT_RIGHT             m2, m2, m3
    mova      [dstq+strideq*0], m1
    mova      [dstq+strideq*1], m2
    SHIFT_RIGHT             m1, m1, m3
    SHIFT_RIGHT             m2, m2, m3
    mova      [dstq+strideq*2], m1
    mova      [dstq+stride3q ], m2
    RET

cglobal vp9_ipred_vl_16x16_16, 2, 4, 6, dst, stride, l, a
    movifnidn               aq, amp
    mova                    m0, [aq]
    mova                    m1, [aq+mmsize]
    PALIGNR                 m2, m1, m0, 2, m3
    PALIGNR                 m3, m1, m0, 4, m4
    LOWPASS                  3,  2,  0
    pavgw                   m2, m0
%if cpuflag(ssse3)
    mova                    m4, [pb_2to15_14_15]
%endif
    SHIFT_RIGHTx2           m5, m0, m1, m4
    LOWPASS                  0,  5,  1
    pavgw                   m1, m5
    DEFINE_ARGS dst, stride, cnt
    mov                   cntd, 8

.loop:
    mova   [dstq+strideq*0+ 0], m2
    mova   [dstq+strideq*0+16], m1
    mova   [dstq+strideq*1+ 0], m3
    mova   [dstq+strideq*1+16], m0
    lea                   dstq, [dstq+strideq*2]
%if cpuflag(avx)
    vpalignr                m2, m1, m2, 2
    vpalignr                m3, m0, m3, 2
%else
    PALIGNR                 m5, m1, m2, 2, m4
    mova                    m2, m5
    PALIGNR                 m5, m0, m3, 2, m4
    mova                    m3, m5
%endif
    SHIFT_RIGHT             m1, m1, m4
    SHIFT_RIGHT             m0, m0, m4
    dec                   cntd
    jg .loop
    RET

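; The 32x32 variant keeps more rows live than the eight xmm registers x86-32
; provides, so SCRATCH/UNSCRATCH spill the excess to the stack there; the
; per-instruction-set VL_FUNCS argument below sizes that scratch area. In the
; lower half of the block the rightmost columns degenerate to the clamped
; tail of the top edge, handled by the m9 stores on x86-64 and by the %rep
; tail loop on x86-32.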
cglobal vp9_ipred_vl_32x32_16, 2, 5, 11, %1 * mmsize * ARCH_X86_32, dst, stride, l, a
    movifnidn               aq, amp
    mova                    m0, [aq+mmsize*0]
    mova                    m1, [aq+mmsize*1]
    mova                    m2, [aq+mmsize*2]
    PALIGNR                 m6, m1, m0, 2, m5
    PALIGNR                 m7, m1, m0, 4, m5
    LOWPASS                  7,  6,  0
    pavgw                   m6, m0
    SCRATCH                  6,  8, rsp+0*mmsize
    PALIGNR                 m4, m2, m1, 2, m0
    PALIGNR                 m5, m2, m1, 4, m0
    LOWPASS                  5,  4,  1
    pavgw                   m4, m1
    mova                    m0, [aq+mmsize*3]
    PALIGNR                 m1, m0, m2, 2, m6
    PALIGNR                 m3, m0, m2, 4, m6
    LOWPASS                  3,  1,  2
    pavgw                   m2, m1
%if cpuflag(ssse3)
    PRELOAD                 10, pb_2to15_14_15, shuf
%endif
    SHIFT_RIGHTx2           m6, m1, m0, reg_shuf
    LOWPASS                  1,  6,  0
    pavgw                   m0, m6
%if ARCH_X86_64
    pshufd                  m9, m6, q3333
%endif
%if cpuflag(avx)
    UNSCRATCH                6,  8, rsp+0*mmsize
%endif
    DEFINE_ARGS dst, stride, cnt, stride16, stride17
    mov              stride16q, strideq
    mov                   cntd, 8
    shl              stride16q, 4
    lea              stride17q, [stride16q+strideq]

    ; FIXME m8 is unused for avx, so we could save one register here for win64
.loop:
%if notcpuflag(avx)
    UNSCRATCH                6,  8, rsp+0*mmsize
%endif
    mova   [dstq+strideq*0+ 0], m6
    mova   [dstq+strideq*0+16], m4
    mova   [dstq+strideq*0+32], m2
    mova   [dstq+strideq*0+48], m0
    mova   [dstq+strideq*1+ 0], m7
    mova   [dstq+strideq*1+16], m5
    mova   [dstq+strideq*1+32], m3
    mova   [dstq+strideq*1+48], m1
    mova   [dstq+stride16q+ 0], m4
    mova   [dstq+stride16q+16], m2
    mova   [dstq+stride16q+32], m0
%if ARCH_X86_64
    mova   [dstq+stride16q+48], m9
%endif
    mova   [dstq+stride17q+ 0], m5
    mova   [dstq+stride17q+16], m3
    mova   [dstq+stride17q+32], m1
%if ARCH_X86_64
    mova   [dstq+stride17q+48], m9
%endif
    lea                   dstq, [dstq+strideq*2]
%if cpuflag(avx)
    vpalignr                m6, m4, m6, 2
    vpalignr                m4, m2, m4, 2
    vpalignr                m2, m0, m2, 2
    vpalignr                m7, m5, m7, 2
    vpalignr                m5, m3, m5, 2
    vpalignr                m3, m1, m3, 2
%else
    SCRATCH                  3,  8, rsp+0*mmsize
%if notcpuflag(ssse3)
    SCRATCH                  1, 10, rsp+1*mmsize
%endif
    PALIGNR                 m3, m4, m6, 2, m1
    mova                    m6, m3
    PALIGNR                 m3, m2, m4, 2, m1
    mova                    m4, m3
    PALIGNR                 m3, m0, m2, 2, m1
    mova                    m2, m3
    PALIGNR                 m3, m5, m7, 2, m1
    mova                    m7, m3
    UNSCRATCH                3,  8, rsp+0*mmsize
    SCRATCH                  6,  8, rsp+0*mmsize
%if notcpuflag(ssse3)
    UNSCRATCH                1, 10, rsp+1*mmsize
    SCRATCH                  7, 10, rsp+1*mmsize
%endif
    PALIGNR                 m6, m3, m5, 2, m7
    mova                    m5, m6
    PALIGNR                 m6, m1, m3, 2, m7
    mova                    m3, m6
%if notcpuflag(ssse3)
    UNSCRATCH                7, 10, rsp+1*mmsize
%endif
%endif
    SHIFT_RIGHT             m1, m1, reg_shuf
    SHIFT_RIGHT             m0, m0, reg_shuf
    dec                   cntd
    jg .loop

%if ARCH_X86_32
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
%assign %%n 0
%rep 4
    mova   [dstq+strideq*0+48], m0
    mova   [dstq+strideq*1+48], m0
    mova   [dstq+strideq*2+48], m0
    mova   [dstq+stride3q +48], m0
%if %%n < 3
    lea                   dstq, [dstq+strideq*4]
%endif
%assign %%n (%%n+1)
%endrep
%endif
    RET
%endmacro

INIT_XMM sse2
VL_FUNCS 2
INIT_XMM ssse3
VL_FUNCS 1
INIT_XMM avx
VL_FUNCS 1

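; Vertical-right (VR) prediction moves the opposite way: the top edge shifts
; one pixel to the right every two rows (even rows 2-tap averages, odd rows
; 3-tap filtered), and 3-tap-filtered left-edge pixels are fed in from the
; left, one every second row - see the deinterleave comment in the 16x16
; version below.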
%macro VR_FUNCS 0
cglobal vp9_ipred_vr_4x4_16, 4, 4, 3, dst, stride, l, a
    movu                    m0, [aq-2]
    movhps                  m1, [lq]
    PALIGNR                 m0, m1, 10, m2          ; xyz*abcd
    pslldq                  m1, m0, 2               ; .xyz*abc
    pslldq                  m2, m0, 4               ; ..xyz*ab
    LOWPASS                  2,  1, 0               ; ..YZ#ABC
    pavgw                   m1, m0                  ; ....#ABC
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]

    movhps    [dstq+strideq*0], m1
    movhps    [dstq+strideq*1], m2
    shufps                  m0, m2, m1, q3210
%if cpuflag(ssse3)
    pshufb                  m2, [pb_4_5_8to13_8x0]
%else
    pshuflw                 m2, m2, q2222
    psrldq                  m2, 6
%endif
    psrldq                  m0, 6
    movh      [dstq+strideq*2], m0
    movh      [dstq+stride3q ], m2
    RET

cglobal vp9_ipred_vr_8x8_16, 4, 4, 5, dst, stride, l, a
    movu                    m1, [aq-2]              ; *abcdefg
    movu                    m2, [lq]                ; stuvwxyz
    mova                    m0, [aq]                ; abcdefgh
    PALIGNR                 m3, m1, m2, 14, m4      ; z*abcdef
    LOWPASS                  3,  1,  0
    pavgw                   m0, m1
    PALIGNR                 m1, m2,  2, m4          ; tuvwxyz*
    pslldq                  m4, m2,  2              ; .stuvwxy
    LOWPASS                  4,  2,  1
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]

    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m3
    PALIGNR                 m0, m4, 14, m1
    pslldq                  m4, 2
    PALIGNR                 m3, m4, 14, m1
    pslldq                  m4, 2
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m3
    lea                   dstq, [dstq+strideq*4]
    PALIGNR                 m0, m4, 14, m1
    pslldq                  m4, 2
    PALIGNR                 m3, m4, 14, m1
    pslldq                  m4, 2
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m3
    PALIGNR                 m0, m4, 14, m1
    pslldq                  m4, 2
    PALIGNR                 m3, m4, 14, m4
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m3
    RET

cglobal vp9_ipred_vr_16x16_16, 4, 4, 8, dst, stride, l, a
    movu                    m1, [aq-2]              ; *abcdefg
    movu                    m2, [aq+mmsize-2]       ; hijklmno
    mova                    m3, [aq]                ; abcdefgh
    mova                    m4, [aq+mmsize]         ; ijklmnop
    mova                    m5, [lq+mmsize]         ; stuvwxyz
    PALIGNR                 m0, m1, m5, 14, m6      ; z*abcdef
    movu                    m6, [aq+mmsize-4]       ; ghijklmn
    LOWPASS                  6,  2,  4
    pavgw                   m2, m4
    LOWPASS                  0,  1,  3
    pavgw                   m3, m1
    PALIGNR                 m1, m5,  2, m7          ; tuvwxyz*
    movu                    m7, [lq+mmsize-2]       ; rstuvwxy
    LOWPASS                  1,  5,  7
    movu                    m5, [lq+2]              ; lmnopqrs
    pslldq                  m4, m5,  2              ; .lmnopqr
    pslldq                  m7, m5,  4              ; ..lmnopq
    LOWPASS                  5,  4,  7
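    ; the left edge feeds the row pairs alternately, so deinterleave the
    ; filtered left pixels into even/odd words (psrld/pand + packssdw); each
    ; of the two row streams in the loop below then pulls in one new edge
    ; pixel per 2-row iteration with a single 2-byte shift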
    psrld                   m4, m1, 16
    psrld                   m7, m5, 16
    pand                    m1, [pd_65535]
    pand                    m5, [pd_65535]
    packssdw                m7, m4
    packssdw                m5, m1
    DEFINE_ARGS dst, stride, cnt
    mov                   cntd, 8

.loop:
    mova   [dstq+strideq*0+ 0], m3
    mova   [dstq+strideq*0+16], m2
    mova   [dstq+strideq*1+ 0], m0
    mova   [dstq+strideq*1+16], m6
    lea                   dstq, [dstq+strideq*2]
    PALIGNR                 m2, m3, 14, m4
    PALIGNR                 m3, m7, 14, m4
    pslldq                  m7, 2
    PALIGNR                 m6, m0, 14, m4
    PALIGNR                 m0, m5, 14, m4
    pslldq                  m5, 2
    dec                   cntd
    jg .loop
    RET

cglobal vp9_ipred_vr_32x32_16, 4, 5, 14, 6 * mmsize * ARCH_X86_32, dst, stride, l, a
    movu                    m0, [aq+mmsize*0-2]     ; *a[0-6]
    movu                    m1, [aq+mmsize*1-2]     ; a[7-14]
    movu                    m2, [aq+mmsize*2-2]     ; a[15-22]
    movu                    m3, [aq+mmsize*3-2]     ; a[23-30]
    mova                    m4, [aq+mmsize*3+0]     ; a[24-31]
    movu                    m5, [aq+mmsize*3-4]     ; a[22-29]
    LOWPASS                  5,  3,  4              ; A[23-30]
    SCRATCH                  5,  8, rsp+0*mmsize
    pavgw                   m3, m4
    mova                    m4, [aq+mmsize*2+0]     ; a[16-23]
    movu                    m6, [aq+mmsize*2-4]     ; a[14-21]
    LOWPASS                  6,  2,  4              ; A[15-22]
    SCRATCH                  6,  9, rsp+1*mmsize
    pavgw                   m2, m4
    mova                    m4, [aq+mmsize*1+0]     ; a[8-15]
    movu                    m7, [aq+mmsize*1-4]     ; a[6-13]
    LOWPASS                  7,  1,  4              ; A[7-14]
    SCRATCH                  7, 10, rsp+2*mmsize
    pavgw                   m1, m4
    mova                    m4, [aq+mmsize*0+0]     ; a[0-7]
    mova                    m5, [lq+mmsize*3+0]     ; l[24-31]
    PALIGNR                 m6, m0, m5, 14, m7      ; l[31]*a[0-5]
    LOWPASS                  6,  0,  4              ; #A[0-6]
    SCRATCH                  6, 11, rsp+3*mmsize
    pavgw                   m4, m0
    PALIGNR                 m0, m5,  2, m7          ; l[25-31]*
    movu                    m7, [lq+mmsize*3-2]     ; l[23-30]
    LOWPASS                  0,  5,  7              ; L[24-31]
    movu                    m5, [lq+mmsize*2-2]     ; l[15-22]
    mova                    m7, [lq+mmsize*2+0]     ; l[16-23]
    movu                    m6, [lq+mmsize*2+2]     ; l[17-24]
    LOWPASS                  5,  7,  6              ; L[16-23]
    psrld                   m7, m0, 16
    psrld                   m6, m5, 16
    pand                    m0, [pd_65535]
    pand                    m5, [pd_65535]
    packssdw                m6, m7
    packssdw                m5, m0
    SCRATCH                  5, 12, rsp+4*mmsize
    SCRATCH                  6, 13, rsp+5*mmsize
    movu                    m6, [lq+mmsize*1-2]     ; l[7-14]
    mova                    m0, [lq+mmsize*1+0]     ; l[8-15]
    movu                    m5, [lq+mmsize*1+2]     ; l[9-16]
    LOWPASS                  6,  0,  5              ; L[8-15]
    movu                    m0, [lq+mmsize*0+2]     ; l[1-8]
    pslldq                  m5, m0,  2              ; .l[1-7]
    pslldq                  m7, m0,  4              ; ..l[1-6]
    LOWPASS                  0,  5,  7
    psrld                   m5, m6, 16
    psrld                   m7, m0, 16
    pand                    m6, [pd_65535]
    pand                    m0, [pd_65535]
    packssdw                m7, m5
    packssdw                m0, m6
    UNSCRATCH                6, 13, rsp+5*mmsize
    DEFINE_ARGS dst, stride, stride16, cnt, stride17
    mov              stride16q, strideq
    mov                   cntd, 8
    shl              stride16q, 4
%if ARCH_X86_64
    lea              stride17q, [stride16q+strideq]
%endif

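    ; x86-64 keeps the odd rows in m8-m13 and stores them in this loop; on
    ; x86-32 those vectors were spilled to the stack, so the odd rows are
    ; written by the separate .loop2 pass below instead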
.loop:
    mova   [dstq+strideq*0+ 0], m4
    mova   [dstq+strideq*0+16], m1
    mova   [dstq+strideq*0+32], m2
    mova   [dstq+strideq*0+48], m3
%if ARCH_X86_64
    mova   [dstq+strideq*1+ 0], m11
    mova   [dstq+strideq*1+16], m10
    mova   [dstq+strideq*1+32], m9
    mova   [dstq+strideq*1+48], m8
%endif
    mova   [dstq+stride16q+ 0], m6
    mova   [dstq+stride16q+16], m4
    mova   [dstq+stride16q+32], m1
    mova   [dstq+stride16q+48], m2
%if ARCH_X86_64
    mova   [dstq+stride17q+ 0], m12
    mova   [dstq+stride17q+16], m11
    mova   [dstq+stride17q+32], m10
    mova   [dstq+stride17q+48], m9
%endif
    lea                   dstq, [dstq+strideq*2]
    PALIGNR                 m3, m2,  14, m5
    PALIGNR                 m2, m1,  14, m5
    PALIGNR                 m1, m4,  14, m5
    PALIGNR                 m4, m6,  14, m5
    PALIGNR                 m6, m7,  14, m5
    pslldq                  m7, 2
%if ARCH_X86_64
    PALIGNR                 m8, m9,  14, m5
    PALIGNR                 m9, m10, 14, m5
    PALIGNR                m10, m11, 14, m5
    PALIGNR                m11, m12, 14, m5
    PALIGNR                m12, m0,  14, m5
    pslldq                  m0, 2
%endif
    dec                   cntd
    jg .loop

%if ARCH_X86_32
    UNSCRATCH                5, 12, rsp+4*mmsize
    UNSCRATCH                4, 11, rsp+3*mmsize
    UNSCRATCH                3, 10, rsp+2*mmsize
    UNSCRATCH                2,  9, rsp+1*mmsize
    UNSCRATCH                1,  8, rsp+0*mmsize
    mov                   dstq, dstm
    mov                   cntd, 8
    add                   dstq, strideq
.loop2:
    mova   [dstq+strideq*0+ 0], m4
    mova   [dstq+strideq*0+16], m3
    mova   [dstq+strideq*0+32], m2
    mova   [dstq+strideq*0+48], m1
    mova   [dstq+stride16q+ 0], m5
    mova   [dstq+stride16q+16], m4
    mova   [dstq+stride16q+32], m3
    mova   [dstq+stride16q+48], m2
    lea                   dstq, [dstq+strideq*2]
    PALIGNR                 m1, m2,  14, m6
    PALIGNR                 m2, m3,  14, m6
    PALIGNR                 m3, m4,  14, m6
    PALIGNR                 m4, m5,  14, m6
    PALIGNR                 m5, m0,  14, m6
    pslldq                  m0, 2
    dec                   cntd
    jg .loop2
%endif
    RET
%endmacro

INIT_XMM sse2
VR_FUNCS
INIT_XMM ssse3
VR_FUNCS
INIT_XMM avx
VR_FUNCS

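; Horizontal-up (HU) prediction works along the left edge: 2-tap averages and
; 3-tap filtered values are interleaved with SBUTTERFLY so consecutive output
; pixels alternate between the two, and each row starts one interleaved pair
; (4 bytes) later, hence the palignr-by-4 row rotation in the loops below.
; Past the last left pixel the output clamps to that pixel. The macro
; argument again sizes the x86-32 stack scratch area.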
%macro HU_FUNCS 1 ; stack_mem_for_32x32_32bit_function
cglobal vp9_ipred_hu_4x4_16, 3, 3, 3, dst, stride, l, a
    movh                    m0, [lq]                ; abcd
%if cpuflag(ssse3)
    pshufb                  m0, [pb_0to7_67x4]      ; abcddddd
%else
    punpcklqdq              m0, m0
    pshufhw                 m0, m0, q3333           ; abcddddd
%endif
    psrldq                  m1, m0,  2              ; bcddddd.
    psrldq                  m2, m0,  4              ; cddddd..
    LOWPASS                  2,  1,  0              ; BCDddd..
    pavgw                   m1, m0                  ; abcddddd
    SBUTTERFLY          wd,  1,  2,  0              ; aBbCcDdd, dddddddd
    PALIGNR                 m2, m1,  4, m0          ; bCcDdddd
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]

    movh      [dstq+strideq*0], m1                  ; aBbC
    movh      [dstq+strideq*1], m2                  ; bCcD
    movhps    [dstq+strideq*2], m1                  ; cDdd
    movhps    [dstq+stride3q ], m2                  ; dddd
    RET

cglobal vp9_ipred_hu_8x8_16, 3, 3, 4, dst, stride, l, a
    mova                    m0, [lq]
%if cpuflag(ssse3)
    mova                    m3, [pb_2to15_14_15]
%endif
    SHIFT_RIGHTx2           m1, m2, m0, m3
    LOWPASS                  2,  1,  0
    pavgw                   m1, m0
    SBUTTERFLY          wd,  1,  2,  0
    shufps                  m0, m1, m2, q1032
    pshufd                  m3, m2, q3332
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]

    mova     [dstq+strideq *0], m1
    mova     [dstq+strideq *2], m0
    mova     [dstq+strideq *4], m2
    mova     [dstq+stride3q*2], m3
    add                   dstq, strideq
%if cpuflag(avx)
    vpalignr                m1, m2, m1, 4
%else
    PALIGNR                 m0, m2, m1, 4, m3
    mova                    m1, m0
%endif
    pshufd                  m2, m2, q3321
    shufps                  m0, m1, m2, q1032
    pshufd                  m3, m2, q3332
    mova     [dstq+strideq *0], m1
    mova     [dstq+strideq *2], m0
    mova     [dstq+strideq *4], m2
    mova     [dstq+stride3q*2], m3
    RET

cglobal vp9_ipred_hu_16x16_16, 3, 4, 6 + notcpuflag(ssse3), dst, stride, l, a
    mova                    m0, [lq]
    mova                    m3, [lq+mmsize]
    movu                    m1, [lq+2]
    movu                    m2, [lq+4]
    LOWPASS                  2,  1,  0
    pavgw                   m1, m0
    SBUTTERFLY           wd, 1,  2,  0
%if cpuflag(ssse3)
    mova                    m5, [pb_2to15_14_15]
%endif
    SHIFT_RIGHTx2           m0, m4, m3, m5
    LOWPASS                  4,  0,  3
    pavgw                   m3, m0
    SBUTTERFLY           wd, 3,  4,  5
    pshufd                  m0, m0, q3333
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    mov                   cntd, 4

.loop:
    mova  [dstq+strideq *0+ 0], m1
    mova  [dstq+strideq *0+16], m2
    mova  [dstq+strideq *4+ 0], m2
    mova  [dstq+strideq *4+16], m3
    mova  [dstq+strideq *8+ 0], m3
    mova  [dstq+strideq *8+16], m4
    mova  [dstq+stride3q*4+ 0], m4
    mova  [dstq+stride3q*4+16], m0
    add                   dstq, strideq
%if cpuflag(avx)
    vpalignr                m1, m2, m1, 4
    vpalignr                m2, m3, m2, 4
    vpalignr                m3, m4, m3, 4
    vpalignr                m4, m0, m4, 4
%else
    PALIGNR                 m5, m2, m1, 4, m6
    mova                    m1, m5
    PALIGNR                 m5, m3, m2, 4, m6
    mova                    m2, m5
    PALIGNR                 m5, m4, m3, 4, m6
    mova                    m3, m5
    PALIGNR                 m5, m0, m4, 4, m6
    mova                    m4, m5
%endif
    dec                   cntd
    jg .loop
    RET

cglobal vp9_ipred_hu_32x32_16, 3, 7, 10 + notcpuflag(ssse3), \
                               %1 * -mmsize * ARCH_X86_32, dst, stride, l, a
    mova                    m2, [lq+mmsize*0+0]
    movu                    m1, [lq+mmsize*0+2]
    movu                    m0, [lq+mmsize*0+4]
    LOWPASS                  0,  1,  2
    pavgw                   m1, m2
    SBUTTERFLY           wd, 1,  0,  2
    SCRATCH                  1,  8, rsp+0*mmsize
    mova                    m4, [lq+mmsize*1+0]
    movu                    m3, [lq+mmsize*1+2]
    movu                    m2, [lq+mmsize*1+4]
    LOWPASS                  2,  3,  4
    pavgw                   m3, m4
    SBUTTERFLY           wd, 3,  2,  4
    mova                    m6, [lq+mmsize*2+0]
    movu                    m5, [lq+mmsize*2+2]
    movu                    m4, [lq+mmsize*2+4]
    LOWPASS                  4,  5,  6
    pavgw                   m5, m6
    SBUTTERFLY           wd, 5,  4,  6
    mova                    m7, [lq+mmsize*3+0]
    SCRATCH                  0,  9, rsp+1*mmsize
%if cpuflag(ssse3)
    mova                    m0, [pb_2to15_14_15]
%endif
    SHIFT_RIGHTx2           m1, m6, m7, m0
    LOWPASS                  6,  1,  7
    pavgw                   m7, m1
    SBUTTERFLY           wd, 7,  6,  0
    pshufd                  m1, m1, q3333
    UNSCRATCH                0,  9, rsp+1*mmsize
    DEFINE_ARGS dst, stride, cnt, stride3, stride4, stride20, stride28
    lea               stride3q, [strideq*3]
    lea               stride4q, [strideq*4]
    lea              stride28q, [stride4q*8]
    lea              stride20q, [stride4q*5]
    sub              stride28q, stride4q
    mov                   cntd, 4

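    ; nine row vectors are live here but x86-32 only has eight xmm registers,
    ; so the loop rotates the ninth through stack slots (the mova pairs
    ; below); on x86-64 a register-renaming SWAP with m8 does the same job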
.loop:
%if ARCH_X86_64
    SWAP                     1,  8
%else
    mova        [rsp+1*mmsize], m1
    mova                    m1, [rsp+0*mmsize]
%endif
    mova  [dstq+strideq *0+ 0], m1
    mova  [dstq+strideq *0+16], m0
    mova  [dstq+strideq *0+32], m3
    mova  [dstq+strideq *0+48], m2
    mova  [dstq+stride4q*1+ 0], m0
    mova  [dstq+stride4q*1+16], m3
    mova  [dstq+stride4q*1+32], m2
    mova  [dstq+stride4q*1+48], m5
    mova  [dstq+stride4q*2+ 0], m3
    mova  [dstq+stride4q*2+16], m2
    mova  [dstq+stride4q*2+32], m5
    mova  [dstq+stride4q*2+48], m4
%if cpuflag(avx)
    vpalignr                m1, m0, m1, 4
    vpalignr                m0, m3, m0, 4
    vpalignr                m3, m2, m3, 4
%else
    SCRATCH                  6,  9, rsp+2*mmsize
%if notcpuflag(ssse3)
    SCRATCH                  7, 10, rsp+3*mmsize
%endif
    PALIGNR                 m6, m0, m1, 4, m7
    mova                    m1, m6
    PALIGNR                 m6, m3, m0, 4, m7
    mova                    m0, m6
    PALIGNR                 m6, m2, m3, 4, m7
    mova                    m3, m6
    UNSCRATCH                6,  9, rsp+2*mmsize
    SCRATCH                  0,  9, rsp+2*mmsize
%if notcpuflag(ssse3)
    UNSCRATCH                7, 10, rsp+3*mmsize
    SCRATCH                  3, 10, rsp+3*mmsize
%endif
%endif
%if ARCH_X86_64
    SWAP                     1,  8
%else
    mova        [rsp+0*mmsize], m1
    mova                    m1, [rsp+1*mmsize]
%endif
    mova  [dstq+stride3q*4+ 0], m2
    mova  [dstq+stride3q*4+16], m5
    mova  [dstq+stride3q*4+32], m4
    mova  [dstq+stride3q*4+48], m7
    mova  [dstq+stride4q*4+ 0], m5
    mova  [dstq+stride4q*4+16], m4
    mova  [dstq+stride4q*4+32], m7
    mova  [dstq+stride4q*4+48], m6
    mova  [dstq+stride20q + 0], m4
    mova  [dstq+stride20q +16], m7
    mova  [dstq+stride20q +32], m6
    mova  [dstq+stride20q +48], m1
    mova  [dstq+stride3q*8+ 0], m7
    mova  [dstq+stride3q*8+16], m6
    mova  [dstq+stride3q*8+32], m1
    mova  [dstq+stride3q*8+48], m1
    mova  [dstq+stride28q + 0], m6
    mova  [dstq+stride28q +16], m1
    mova  [dstq+stride28q +32], m1
    mova  [dstq+stride28q +48], m1
%if cpuflag(avx)
    vpalignr                m2, m5, m2, 4
    vpalignr                m5, m4, m5, 4
    vpalignr                m4, m7, m4, 4
    vpalignr                m7, m6, m7, 4
    vpalignr                m6, m1, m6, 4
%else
    PALIGNR                 m0, m5, m2, 4, m3
    mova                    m2, m0
    PALIGNR                 m0, m4, m5, 4, m3
    mova                    m5, m0
    PALIGNR                 m0, m7, m4, 4, m3
    mova                    m4, m0
    PALIGNR                 m0, m6, m7, 4, m3
    mova                    m7, m0
    PALIGNR                 m0, m1, m6, 4, m3
    mova                    m6, m0
    UNSCRATCH                0,  9, rsp+2*mmsize
%if notcpuflag(ssse3)
    UNSCRATCH                3, 10, rsp+3*mmsize
%endif
%endif
    add                   dstq, strideq
    dec                   cntd
    jg .loop
    RET
%endmacro

INIT_XMM sse2
HU_FUNCS 4
INIT_XMM ssse3
HU_FUNCS 3
INIT_XMM avx
HU_FUNCS 2

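; Horizontal-down (HD) prediction mixes both edges: the left edge supplies
; interleaved average/filter pairs (as in HU) while 3-tap-filtered top pixels
; shift in from the right. Each row sits one pair (4 bytes) ahead of the row
; below it, so the loops generate rows bottom-up, walking dst backwards.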
%macro HD_FUNCS 0
cglobal vp9_ipred_hd_4x4_16, 4, 4, 4, dst, stride, l, a
    movh                    m0, [lq]
    movhps                  m0, [aq-2]
    psrldq                  m1, m0, 2
    psrldq                  m2, m0, 4
    LOWPASS                  2,  1,  0
    pavgw                   m1, m0
    punpcklwd               m1, m2
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]

    movh      [dstq+stride3q ], m1
    movhps    [dstq+strideq*1], m1
    movhlps                 m2, m2
    PALIGNR                 m2, m1, 4, m0
    movh      [dstq+strideq*2], m2
    movhps    [dstq+strideq*0], m2
    RET

cglobal vp9_ipred_hd_8x8_16, 4, 4, 5, dst, stride, l, a
    mova                    m0, [lq]
    movu                    m1, [aq-2]
    PALIGNR                 m2, m1, m0, 2, m3
    PALIGNR                 m3, m1, m0, 4, m4
    LOWPASS                  3,  2,  0
    pavgw                   m2, m0
    SBUTTERFLY           wd, 2,  3,  0
    psrldq                  m0, m1,  2
    psrldq                  m4, m1,  4
    LOWPASS                  1,  0,  4
    DEFINE_ARGS dst8, mstride, cnt
    lea                  dst8q, [dst8q+mstrideq*8]
    neg               mstrideq
    mov                   cntd, 4

.loop:
    add                  dst8q, mstrideq
    mova    [dst8q+mstrideq*0], m2
    mova    [dst8q+mstrideq*4], m3
%if cpuflag(avx)
    vpalignr                m2, m3, m2, 4
    vpalignr                m3, m1, m3, 4
%else
    PALIGNR                 m0, m3, m2, 4, m4
    mova                    m2, m0
    PALIGNR                 m0, m1, m3, 4, m4
    mova                    m3, m0
%endif
    psrldq                  m1, 4
    dec                   cntd
    jg .loop
    RET

cglobal vp9_ipred_hd_16x16_16, 4, 4, 8, dst, stride, l, a
    mova                    m2, [lq]
    movu                    m1, [lq+2]
    movu                    m0, [lq+4]
    LOWPASS                  0,  1,  2
    pavgw                   m1, m2
    mova                    m4, [lq+mmsize]
    movu                    m5, [aq-2]
    PALIGNR                 m3, m5, m4, 2, m6
    PALIGNR                 m2, m5, m4, 4, m6
    LOWPASS                  2,  3,  4
    pavgw                   m3, m4
    SBUTTERFLY           wd, 1,  0,  4
    SBUTTERFLY           wd, 3,  2,  4
    mova                    m6, [aq]
    movu                    m4, [aq+2]
    LOWPASS                  4,  6,  5
    movu                    m5, [aq+mmsize-2]
    psrldq                  m6, m5,  2
    psrldq                  m7, m5,  4
    LOWPASS                  5,  6,  7
    DEFINE_ARGS dst, mstride, mstride3, cnt
    lea                   dstq, [dstq+mstrideq*8]
    lea                   dstq, [dstq+mstrideq*8]
    neg               mstrideq
    lea              mstride3q, [mstrideq*3]
    mov                   cntd, 4

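    ; dst was advanced 16 rows above and mstrideq is negative, so each
    ; iteration steps one row back up and fills four rows spaced 4 apart
    ; (15/11/7/3 first, then 14/10/6/2, ...), matching the 4-byte rotation
    ; of the row vectors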
.loop:
    add                   dstq, mstrideq
    mova [dstq+mstride3q*4+ 0], m2
    mova [dstq+mstride3q*4+16], m4
    mova [dstq+mstrideq *8+ 0], m3
    mova [dstq+mstrideq *8+16], m2
    mova [dstq+mstrideq *4+ 0], m0
    mova [dstq+mstrideq *4+16], m3
    mova [dstq+mstrideq *0+ 0], m1
    mova [dstq+mstrideq *0+16], m0
%if cpuflag(avx)
    vpalignr                m1, m0, m1, 4
    vpalignr                m0, m3, m0, 4
    vpalignr                m3, m2, m3, 4
    vpalignr                m2, m4, m2, 4
    vpalignr                m4, m5, m4, 4
%else
    PALIGNR                 m6, m0, m1, 4, m7
    mova                    m1, m6
    PALIGNR                 m6, m3, m0, 4, m7
    mova                    m0, m6
    PALIGNR                 m6, m2, m3, 4, m7
    mova                    m3, m6
    PALIGNR                 m6, m4, m2, 4, m7
    mova                    m2, m6
    PALIGNR                 m6, m5, m4, 4, m7
    mova                    m4, m6
%endif
    psrldq                  m5, 4
    dec                   cntd
    jg .loop
    RET

cglobal vp9_ipred_hd_32x32_16, 4, 4 + 3 * ARCH_X86_64, 14, \
                               10 * -mmsize * ARCH_X86_32, dst, stride, l, a
    mova                    m2, [lq+mmsize*0+0]
    movu                    m1, [lq+mmsize*0+2]
    movu                    m0, [lq+mmsize*0+4]
    LOWPASS                  0,  1,  2
    pavgw                   m1, m2
    SBUTTERFLY           wd, 1,  0,  2
    mova                    m4, [lq+mmsize*1+0]
    movu                    m3, [lq+mmsize*1+2]
    movu                    m2, [lq+mmsize*1+4]
    LOWPASS                  2,  3,  4
    pavgw                   m3, m4
    SBUTTERFLY           wd, 3,  2,  4
    SCRATCH                  0,  8, rsp+0*mmsize
    SCRATCH                  1,  9, rsp+1*mmsize
    SCRATCH                  2, 10, rsp+2*mmsize
    SCRATCH                  3, 11, rsp+3*mmsize
    mova                    m6, [lq+mmsize*2+0]
    movu                    m5, [lq+mmsize*2+2]
    movu                    m4, [lq+mmsize*2+4]
    LOWPASS                  4,  5,  6
    pavgw                   m5, m6
    SBUTTERFLY           wd, 5,  4,  6
    mova                    m0, [lq+mmsize*3+0]
    movu                    m1, [aq+mmsize*0-2]
    PALIGNR                 m7, m1, m0, 2, m2
    PALIGNR                 m6, m1, m0, 4, m2
    LOWPASS                  6,  7,  0
    pavgw                   m7, m0
    SBUTTERFLY           wd, 7,  6,  0
    mova                    m2, [aq+mmsize*0+0]
    movu                    m0, [aq+mmsize*0+2]
    LOWPASS                  0,  2,  1
    movu                    m1, [aq+mmsize*1-2]
    mova                    m2, [aq+mmsize*1+0]
    movu                    m3, [aq+mmsize*1+2]
    LOWPASS                  1,  2,  3
    SCRATCH                  6, 12, rsp+6*mmsize
    SCRATCH                  7, 13, rsp+7*mmsize
    movu                    m2, [aq+mmsize*2-2]
    mova                    m3, [aq+mmsize*2+0]
    movu                    m6, [aq+mmsize*2+2]
    LOWPASS                  2,  3,  6
    movu                    m3, [aq+mmsize*3-2]
    psrldq                  m6, m3,  2
    psrldq                  m7, m3,  4
    LOWPASS                  3,  6,  7
    UNSCRATCH                6, 12, rsp+6*mmsize
    UNSCRATCH                7, 13, rsp+7*mmsize
%if ARCH_X86_32
    mova        [rsp+4*mmsize], m4
    mova        [rsp+5*mmsize], m5
    ; we already backed up m6/m7 earlier on x86-32 in SCRATCH, so we don't need
    ; to do it again here
%endif
    DEFINE_ARGS dst, stride, cnt, stride3, stride4, stride20, stride28
    mov                   cntd, 4
    lea               stride3q, [strideq*3]
%if ARCH_X86_64
    lea               stride4q, [strideq*4]
    lea              stride28q, [stride4q*8]
    lea              stride20q, [stride4q*5]
    sub              stride28q, stride4q
%endif
    add                   dstq, stride3q

    ; x86-32 doesn't have enough registers, so on that platform we split the
    ; loop in two... otherwise we'd spend most of the loop (un)scratching
    ; registers instead of doing useful work
.loop:
%if ARCH_X86_64
    mova  [dstq+stride28q + 0], m9
    mova  [dstq+stride28q +16], m8
    mova  [dstq+stride28q +32], m11
    mova  [dstq+stride28q +48], m10
    mova  [dstq+stride3q*8+ 0], m8
    mova  [dstq+stride3q*8+16], m11
    mova  [dstq+stride3q*8+32], m10
    mova  [dstq+stride3q*8+48], m5
    mova  [dstq+stride20q + 0], m11
    mova  [dstq+stride20q +16], m10
    mova  [dstq+stride20q +32], m5
    mova  [dstq+stride20q +48], m4
    mova  [dstq+stride4q*4+ 0], m10
    mova  [dstq+stride4q*4+16], m5
    mova  [dstq+stride4q*4+32], m4
    mova  [dstq+stride4q*4+48], m7
%endif
    mova  [dstq+stride3q*4+ 0], m5
    mova  [dstq+stride3q*4+16], m4
    mova  [dstq+stride3q*4+32], m7
    mova  [dstq+stride3q*4+48], m6
    mova  [dstq+strideq* 8+ 0], m4
    mova  [dstq+strideq* 8+16], m7
    mova  [dstq+strideq* 8+32], m6
    mova  [dstq+strideq* 8+48], m0
    mova  [dstq+strideq* 4+ 0], m7
    mova  [dstq+strideq* 4+16], m6
    mova  [dstq+strideq* 4+32], m0
    mova  [dstq+strideq* 4+48], m1
    mova  [dstq+strideq* 0+ 0], m6
    mova  [dstq+strideq* 0+16], m0
    mova  [dstq+strideq* 0+32], m1
    mova  [dstq+strideq* 0+48], m2
    sub                   dstq, strideq
%if cpuflag(avx)
%if ARCH_X86_64
    vpalignr                m9, m8,  m9,  4
    vpalignr                m8, m11, m8,  4
    vpalignr               m11, m10, m11, 4
    vpalignr               m10, m5,  m10, 4
%endif
    vpalignr                m5, m4,  m5,  4
    vpalignr                m4, m7,  m4,  4
    vpalignr                m7, m6,  m7,  4
    vpalignr                m6, m0,  m6,  4
    vpalignr                m0, m1,  m0,  4
    vpalignr                m1, m2,  m1,  4
    vpalignr                m2, m3,  m2,  4
%else
%if ARCH_X86_64
    PALIGNR                m12, m8,  m9,  4, m13
    mova                    m9, m12
    PALIGNR                m12, m11, m8,  4, m13
    mova                    m8, m12
    PALIGNR                m12, m10, m11, 4, m13
    mova                   m11, m12
    PALIGNR                m12, m5,  m10, 4, m13
    mova                   m10, m12
%endif
    SCRATCH                  3, 12, rsp+8*mmsize, sh
%if notcpuflag(ssse3)
    SCRATCH                  2, 13, rsp+9*mmsize
%endif
    PALIGNR                 m3, m4,  m5,  4, m2
    mova                    m5, m3
    PALIGNR                 m3, m7,  m4,  4, m2
    mova                    m4, m3
    PALIGNR                 m3, m6,  m7,  4, m2
    mova                    m7, m3
    PALIGNR                 m3, m0,  m6,  4, m2
    mova                    m6, m3
    PALIGNR                 m3, m1,  m0,  4, m2
    mova                    m0, m3
%if notcpuflag(ssse3)
    UNSCRATCH                2, 13, rsp+9*mmsize
    SCRATCH                  0, 13, rsp+9*mmsize
%endif
    PALIGNR                 m3, m2,  m1,  4, m0
    mova                    m1, m3
    PALIGNR                 m3, reg_sh,  m2,  4, m0
    mova                    m2, m3
%if notcpuflag(ssse3)
    UNSCRATCH                0, 13, rsp+9*mmsize
%endif
    UNSCRATCH                3, 12, rsp+8*mmsize, sh
%endif
    psrldq                  m3, 4
    dec                   cntd
    jg .loop

%if ARCH_X86_32
    UNSCRATCH                0,  8, rsp+0*mmsize
    UNSCRATCH                1,  9, rsp+1*mmsize
    UNSCRATCH                2, 10, rsp+2*mmsize
    UNSCRATCH                3, 11, rsp+3*mmsize
    mova                    m4, [rsp+4*mmsize]
    mova                    m5, [rsp+5*mmsize]
    mova                    m6, [rsp+6*mmsize]
    mova                    m7, [rsp+7*mmsize]
    DEFINE_ARGS dst, stride, stride5, stride3
    lea               stride5q, [strideq*5]
    lea                   dstq, [dstq+stride5q*4]
    DEFINE_ARGS dst, stride, cnt, stride3
    mov                   cntd, 4
.loop_2:
    mova  [dstq+stride3q*4+ 0], m1
    mova  [dstq+stride3q*4+16], m0
    mova  [dstq+stride3q*4+32], m3
    mova  [dstq+stride3q*4+48], m2
    mova  [dstq+strideq* 8+ 0], m0
    mova  [dstq+strideq* 8+16], m3
    mova  [dstq+strideq* 8+32], m2
    mova  [dstq+strideq* 8+48], m5
    mova  [dstq+strideq* 4+ 0], m3
    mova  [dstq+strideq* 4+16], m2
    mova  [dstq+strideq* 4+32], m5
    mova  [dstq+strideq* 4+48], m4
    mova  [dstq+strideq* 0+ 0], m2
    mova  [dstq+strideq* 0+16], m5
    mova  [dstq+strideq* 0+32], m4
    mova  [dstq+strideq* 0+48], m7
    sub                   dstq, strideq
%if cpuflag(avx)
    vpalignr                m1, m0,  m1,  4
    vpalignr                m0, m3,  m0,  4
    vpalignr                m3, m2,  m3,  4
    vpalignr                m2, m5,  m2,  4
    vpalignr                m5, m4,  m5,  4
    vpalignr                m4, m7,  m4,  4
    vpalignr                m7, m6,  m7,  4
%else
    SCRATCH                  6, 12, rsp+8*mmsize, sh
%if notcpuflag(ssse3)
    SCRATCH                  7, 13, rsp+9*mmsize
%endif
    PALIGNR                 m6, m0,  m1,  4, m7
    mova                    m1, m6
    PALIGNR                 m6, m3,  m0,  4, m7
    mova                    m0, m6
    PALIGNR                 m6, m2,  m3,  4, m7
    mova                    m3, m6
    PALIGNR                 m6, m5,  m2,  4, m7
    mova                    m2, m6
    PALIGNR                 m6, m4,  m5,  4, m7
    mova                    m5, m6
%if notcpuflag(ssse3)
    UNSCRATCH                7, 13, rsp+9*mmsize
    SCRATCH                  5, 13, rsp+9*mmsize
%endif
    PALIGNR                 m6, m7,  m4,  4, m5
    mova                    m4, m6
    PALIGNR                 m6, reg_sh,  m7,  4, m5
    mova                    m7, m6
%if notcpuflag(ssse3)
    UNSCRATCH                5, 13, rsp+9*mmsize
%endif
    UNSCRATCH                6, 12, rsp+8*mmsize, sh
%endif
    psrldq                  m6, 4
    dec                   cntd
    jg .loop_2
%endif
    RET
%endmacro

INIT_XMM sse2
HD_FUNCS
INIT_XMM ssse3
HD_FUNCS
INIT_XMM avx
HD_FUNCS
