1;******************************************************************************
2;* H.264 intra prediction asm optimizations
3;* Copyright (c) 2010 Fiona Glaser
4;* Copyright (c) 2010 Holger Lubitz
5;* Copyright (c) 2010 Loren Merritt
6;* Copyright (c) 2010 Ronald S. Bultje
7;*
8;* This file is part of FFmpeg.
9;*
10;* FFmpeg is free software; you can redistribute it and/or
11;* modify it under the terms of the GNU Lesser General Public
12;* License as published by the Free Software Foundation; either
13;* version 2.1 of the License, or (at your option) any later version.
14;*
15;* FFmpeg is distributed in the hope that it will be useful,
16;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18;* Lesser General Public License for more details.
19;*
20;* You should have received a copy of the GNU Lesser General Public
21;* License along with FFmpeg; if not, write to the Free Software
22;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23;******************************************************************************
24
25%include "libavutil/x86/x86util.asm"
26
27SECTION_RODATA
28
29tm_shuf: times 8 db 0x03, 0x80
30pw_ff00: times 8 dw 0xff00
31plane_shuf:  db -8, -7, -6, -5, -4, -3, -2, -1
32             db  1,  2,  3,  4,  5,  6,  7,  8
33plane8_shuf: db -4, -3, -2, -1,  0,  0,  0,  0
34             db  1,  2,  3,  4,  0,  0,  0,  0
35pw_0to7:     dw  0,  1,  2,  3,  4,  5,  6,  7
36pw_1to8:     dw  1,  2,  3,  4,  5,  6,  7,  8
37pw_m8tom1:   dw -8, -7, -6, -5, -4, -3, -2, -1
38pw_m4to4:    dw -4, -3, -2, -1,  1,  2,  3,  4
39
40SECTION .text
41
42cextern pb_1
43cextern pb_3
44cextern pw_4
45cextern pw_8
46
47;-----------------------------------------------------------------------------
48; void ff_pred16x16_vertical_8(uint8_t *src, ptrdiff_t stride)
49;-----------------------------------------------------------------------------
50
51INIT_XMM sse
52cglobal pred16x16_vertical_8, 2,3
53    sub   r0, r1
54    mov   r2, 4
55    movaps xmm0, [r0]
56.loop:
57    movaps [r0+r1*1], xmm0
58    movaps [r0+r1*2], xmm0
59    lea   r0, [r0+r1*2]
60    movaps [r0+r1*1], xmm0
61    movaps [r0+r1*2], xmm0
62    lea   r0, [r0+r1*2]
63    dec   r2
64    jg .loop
65    REP_RET
66
67;-----------------------------------------------------------------------------
68; void ff_pred16x16_horizontal_8(uint8_t *src, ptrdiff_t stride)
69;-----------------------------------------------------------------------------
70
71%macro PRED16x16_H 0
72cglobal pred16x16_horizontal_8, 2,3
73    mov       r2, 8
74%if cpuflag(ssse3)
75    mova      m2, [pb_3]
76%endif
77.loop:
78    movd      m0, [r0+r1*0-4]
79    movd      m1, [r0+r1*1-4]
80
81%if cpuflag(ssse3)
82    pshufb    m0, m2
83    pshufb    m1, m2
84%else
85    punpcklbw m0, m0
86    punpcklbw m1, m1
87    SPLATW    m0, m0, 3
88    SPLATW    m1, m1, 3
89    mova [r0+r1*0+8], m0
90    mova [r0+r1*1+8], m1
91%endif
92
93    mova [r0+r1*0], m0
94    mova [r0+r1*1], m1
95    lea       r0, [r0+r1*2]
96    dec       r2
97    jg .loop
98    REP_RET
99%endmacro
100
101INIT_MMX mmxext
102PRED16x16_H
103INIT_XMM ssse3
104PRED16x16_H
105
106;-----------------------------------------------------------------------------
107; void ff_pred16x16_dc_8(uint8_t *src, ptrdiff_t stride)
108;-----------------------------------------------------------------------------
109
110%macro PRED16x16_DC 0
111cglobal pred16x16_dc_8, 2,7
112    mov       r4, r0
113    sub       r0, r1
114    pxor      mm0, mm0
115    pxor      mm1, mm1
116    psadbw    mm0, [r0+0]
117    psadbw    mm1, [r0+8]
118    dec        r0
119    movzx     r5d, byte [r0+r1*1]
120    paddw     mm0, mm1
121    movd      r6d, mm0
122    lea        r0, [r0+r1*2]
123%rep 7
124    movzx     r2d, byte [r0+r1*0]
125    movzx     r3d, byte [r0+r1*1]
126    add       r5d, r2d
127    add       r6d, r3d
128    lea        r0, [r0+r1*2]
129%endrep
130    movzx     r2d, byte [r0+r1*0]
131    add       r5d, r6d
132    lea       r2d, [r2+r5+16]
133    shr       r2d, 5
134%if cpuflag(ssse3)
135    pxor       m1, m1
136%endif
137    SPLATB_REG m0, r2, m1
138
139    mov       r3d, 4
140.loop:
141    mova [r4+r1*0], m0
142    mova [r4+r1*1], m0
143    lea   r4, [r4+r1*2]
144    mova [r4+r1*0], m0
145    mova [r4+r1*1], m0
146    lea   r4, [r4+r1*2]
147    dec   r3d
148    jg .loop
149    REP_RET
150%endmacro
151
152INIT_XMM sse2
153PRED16x16_DC
154INIT_XMM ssse3
155PRED16x16_DC
156
157;-----------------------------------------------------------------------------
158; void ff_pred16x16_tm_vp8_8(uint8_t *src, ptrdiff_t stride)
159;-----------------------------------------------------------------------------
160
161INIT_XMM sse2
162cglobal pred16x16_tm_vp8_8, 2,6,6
163    sub          r0, r1
164    pxor       xmm2, xmm2
165    movdqa     xmm0, [r0]
166    movdqa     xmm1, xmm0
167    punpcklbw  xmm0, xmm2
168    punpckhbw  xmm1, xmm2
169    movzx       r4d, byte [r0-1]
170    mov         r5d, 8
171.loop:
172    movzx       r2d, byte [r0+r1*1-1]
173    movzx       r3d, byte [r0+r1*2-1]
174    sub         r2d, r4d
175    sub         r3d, r4d
176    movd       xmm2, r2d
177    movd       xmm4, r3d
178    pshuflw    xmm2, xmm2, 0
179    pshuflw    xmm4, xmm4, 0
180    punpcklqdq xmm2, xmm2
181    punpcklqdq xmm4, xmm4
182    movdqa     xmm3, xmm2
183    movdqa     xmm5, xmm4
184    paddw      xmm2, xmm0
185    paddw      xmm3, xmm1
186    paddw      xmm4, xmm0
187    paddw      xmm5, xmm1
188    packuswb   xmm2, xmm3
189    packuswb   xmm4, xmm5
190    movdqa [r0+r1*1], xmm2
191    movdqa [r0+r1*2], xmm4
192    lea          r0, [r0+r1*2]
193    dec         r5d
194    jg .loop
195    REP_RET
196
197%if HAVE_AVX2_EXTERNAL
198INIT_YMM avx2
199cglobal pred16x16_tm_vp8_8, 2, 4, 5, dst, stride, stride3, iteration
200    sub                       dstq, strideq
201    pmovzxbw                    m0, [dstq]
202    vpbroadcastb               xm1, [r0-1]
203    pmovzxbw                    m1, xm1
204    psubw                       m0, m1
205    mov                 iterationd, 4
206    lea                   stride3q, [strideq*3]
207.loop:
208    vpbroadcastb               xm1, [dstq+strideq*1-1]
209    vpbroadcastb               xm2, [dstq+strideq*2-1]
210    vpbroadcastb               xm3, [dstq+stride3q-1]
211    vpbroadcastb               xm4, [dstq+strideq*4-1]
212    pmovzxbw                    m1, xm1
213    pmovzxbw                    m2, xm2
214    pmovzxbw                    m3, xm3
215    pmovzxbw                    m4, xm4
216    paddw                       m1, m0
217    paddw                       m2, m0
218    paddw                       m3, m0
219    paddw                       m4, m0
220    vpackuswb                   m1, m1, m2
221    vpackuswb                   m3, m3, m4
222    vpermq                      m1, m1, q3120
223    vpermq                      m3, m3, q3120
224    movdqa        [dstq+strideq*1], xm1
225    vextracti128  [dstq+strideq*2], m1, 1
226    movdqa       [dstq+stride3q*1], xm3
227    vextracti128  [dstq+strideq*4], m3, 1
228    lea                       dstq, [dstq+strideq*4]
229    dec                 iterationd
230    jg .loop
231    REP_RET
232%endif
233
234;-----------------------------------------------------------------------------
235; void ff_pred16x16_plane_*_8(uint8_t *src, ptrdiff_t stride)
236;-----------------------------------------------------------------------------
237
238%macro H264_PRED16x16_PLANE 1
239cglobal pred16x16_plane_%1_8, 2,9,7
240    mov          r2, r1           ; +stride
241    neg          r1               ; -stride
242
243    movh         m0, [r0+r1  -1]
244%if cpuflag(ssse3)
245    movhps       m0, [r0+r1  +8]
246    pmaddubsw    m0, [plane_shuf] ; H coefficients
247%else ; sse2
248    pxor         m2, m2
249    movh         m1, [r0+r1  +8]
250    punpcklbw    m0, m2
251    punpcklbw    m1, m2
252    pmullw       m0, [pw_m8tom1]
253    pmullw       m1, [pw_1to8]
254    paddw        m0, m1
255%endif
256    movhlps      m1, m0
257    paddw        m0, m1
258    PSHUFLW      m1, m0, 0xE
259    paddw        m0, m1
260    PSHUFLW      m1, m0, 0x1
261    paddw        m0, m1           ; sum of H coefficients
262
263    lea          r4, [r0+r2*8-1]
264    lea          r3, [r0+r2*4-1]
265    add          r4, r2
266
267%if ARCH_X86_64
268%define e_reg r8
269%else
270%define e_reg r0
271%endif
272
273    movzx     e_reg, byte [r3+r2*2   ]
274    movzx        r5, byte [r4+r1     ]
275    sub          r5, e_reg
276
277    movzx     e_reg, byte [r3+r2     ]
278    movzx        r6, byte [r4        ]
279    sub          r6, e_reg
280    lea          r5, [r5+r6*2]
281
282    movzx     e_reg, byte [r3+r1     ]
283    movzx        r6, byte [r4+r2*2   ]
284    sub          r6, e_reg
285    lea          r5, [r5+r6*4]
286
287    movzx     e_reg, byte [r3        ]
288%if ARCH_X86_64
289    movzx        r7, byte [r4+r2     ]
290    sub          r7, e_reg
291%else
292    movzx        r6, byte [r4+r2     ]
293    sub          r6, e_reg
294    lea          r5, [r5+r6*4]
295    sub          r5, r6
296%endif
297
298    lea       e_reg, [r3+r1*4]
299    lea          r3, [r4+r2*4]
300
301    movzx        r4, byte [e_reg+r2  ]
302    movzx        r6, byte [r3        ]
303    sub          r6, r4
304%if ARCH_X86_64
305    lea          r6, [r7+r6*2]
306    lea          r5, [r5+r6*2]
307    add          r5, r6
308%else
309    lea          r5, [r5+r6*4]
310    lea          r5, [r5+r6*2]
311%endif
312
313    movzx        r4, byte [e_reg     ]
314%if ARCH_X86_64
315    movzx        r7, byte [r3   +r2  ]
316    sub          r7, r4
317    sub          r5, r7
318%else
319    movzx        r6, byte [r3   +r2  ]
320    sub          r6, r4
321    lea          r5, [r5+r6*8]
322    sub          r5, r6
323%endif
324
325    movzx        r4, byte [e_reg+r1  ]
326    movzx        r6, byte [r3   +r2*2]
327    sub          r6, r4
328%if ARCH_X86_64
329    add          r6, r7
330%endif
331    lea          r5, [r5+r6*8]
332
333    movzx        r4, byte [e_reg+r2*2]
334    movzx        r6, byte [r3   +r1  ]
335    sub          r6, r4
336    lea          r5, [r5+r6*4]
337    add          r5, r6           ; sum of V coefficients
338
339%if ARCH_X86_64 == 0
340    mov          r0, r0m
341%endif
342
343%ifidn %1, h264
344    lea          r5, [r5*5+32]
345    sar          r5, 6
346%elifidn %1, rv40
347    lea          r5, [r5*5]
348    sar          r5, 6
349%elifidn %1, svq3
350    test         r5, r5
351    lea          r6, [r5+3]
352    cmovs        r5, r6
353    sar          r5, 2            ; V/4
354    lea          r5, [r5*5]       ; 5*(V/4)
355    test         r5, r5
356    lea          r6, [r5+15]
357    cmovs        r5, r6
358    sar          r5, 4            ; (5*(V/4))/16
359%endif
360
361    movzx        r4, byte [r0+r1  +15]
362    movzx        r3, byte [r3+r2*2   ]
363    lea          r3, [r3+r4+1]
364    shl          r3, 4
365
366    movd        r1d, m0
367    movsx       r1d, r1w
368%ifnidn %1, svq3
369%ifidn %1, h264
370    lea         r1d, [r1d*5+32]
371%else ; rv40
372    lea         r1d, [r1d*5]
373%endif
374    sar         r1d, 6
375%else ; svq3
376    test        r1d, r1d
377    lea         r4d, [r1d+3]
378    cmovs       r1d, r4d
379    sar         r1d, 2           ; H/4
380    lea         r1d, [r1d*5]     ; 5*(H/4)
381    test        r1d, r1d
382    lea         r4d, [r1d+15]
383    cmovs       r1d, r4d
384    sar         r1d, 4           ; (5*(H/4))/16
385%endif
386    movd         m0, r1d
387
388    add         r1d, r5d
389    add         r3d, r1d
390    shl         r1d, 3
391    sub         r3d, r1d          ; a
392
393    movd         m1, r5d
394    movd         m3, r3d
395    SPLATW       m0, m0, 0        ; H
396    SPLATW       m1, m1, 0        ; V
397    SPLATW       m3, m3, 0        ; a
398%ifidn %1, svq3
399    SWAP          0, 1
400%endif
401    mova         m2, m0
402    pmullw       m0, [pw_0to7]    ; 0*H, 1*H, ..., 7*H  (words)
403    psllw        m2, 3
404    paddw        m0, m3           ; a + {0,1,2,3,4,5,6,7}*H
405    paddw        m2, m0           ; a + {8,9,10,11,12,13,14,15}*H
406
407    mov          r4, 8
408.loop:
409    mova         m3, m0           ; b[0..7]
410    mova         m4, m2           ; b[8..15]
411    psraw        m3, 5
412    psraw        m4, 5
413    packuswb     m3, m4
414    mova       [r0], m3
415    paddw        m0, m1
416    paddw        m2, m1
417
418    mova         m3, m0           ; b[0..7]
419    mova         m4, m2           ; b[8..15]
420    psraw        m3, 5
421    psraw        m4, 5
422    packuswb     m3, m4
423    mova    [r0+r2], m3
424    paddw        m0, m1
425    paddw        m2, m1
426
427    lea          r0, [r0+r2*2]
428    dec          r4
429    jg .loop
430    REP_RET
431%endmacro
432
433INIT_XMM sse2
434H264_PRED16x16_PLANE h264
435H264_PRED16x16_PLANE rv40
436H264_PRED16x16_PLANE svq3
437INIT_XMM ssse3
438H264_PRED16x16_PLANE h264
439H264_PRED16x16_PLANE rv40
440H264_PRED16x16_PLANE svq3
441
442;-----------------------------------------------------------------------------
443; void ff_pred8x8_plane_8(uint8_t *src, ptrdiff_t stride)
444;-----------------------------------------------------------------------------
445
446%macro H264_PRED8x8_PLANE 0
447cglobal pred8x8_plane_8, 2,9,7
448    mov          r2, r1           ; +stride
449    neg          r1               ; -stride
450
451    movd         m0, [r0+r1  -1]
452%if cpuflag(ssse3)
453    movhps       m0, [r0+r1  +4]   ; this reads 4 bytes more than necessary
454    pmaddubsw    m0, [plane8_shuf] ; H coefficients
455%else ; sse2
456    pxor         m2, m2
457    movd         m1, [r0+r1  +4]
458    punpckldq    m0, m1
459    punpcklbw    m0, m2
460    pmullw       m0, [pw_m4to4]
461%endif
462    movhlps      m1, m0
463    paddw        m0, m1
464
465%if notcpuflag(ssse3)
466    PSHUFLW      m1, m0, 0xE
467    paddw        m0, m1
468%endif ; !ssse3
469
470    PSHUFLW      m1, m0, 0x1
471    paddw        m0, m1           ; sum of H coefficients
472
473    lea          r4, [r0+r2*4-1]
474    lea          r3, [r0     -1]
475    add          r4, r2
476
477%if ARCH_X86_64
478%define e_reg r8
479%else
480%define e_reg r0
481%endif
482
483    movzx     e_reg, byte [r3+r2*2   ]
484    movzx        r5, byte [r4+r1     ]
485    sub          r5, e_reg
486
487    movzx     e_reg, byte [r3        ]
488%if ARCH_X86_64
489    movzx        r7, byte [r4+r2     ]
490    sub          r7, e_reg
491    sub          r5, r7
492%else
493    movzx        r6, byte [r4+r2     ]
494    sub          r6, e_reg
495    lea          r5, [r5+r6*4]
496    sub          r5, r6
497%endif
498
499    movzx     e_reg, byte [r3+r1     ]
500    movzx        r6, byte [r4+r2*2   ]
501    sub          r6, e_reg
502%if ARCH_X86_64
503    add          r6, r7
504%endif
505    lea          r5, [r5+r6*4]
506
507    movzx     e_reg, byte [r3+r2     ]
508    movzx        r6, byte [r4        ]
509    sub          r6, e_reg
510    lea          r6, [r5+r6*2]
511
512    lea          r5, [r6*9+16]
513    lea          r5, [r5+r6*8]
514    sar          r5, 5
515
516%if ARCH_X86_64 == 0
517    mov          r0, r0m
518%endif
519
520    movzx        r3, byte [r4+r2*2  ]
521    movzx        r4, byte [r0+r1  +7]
522    lea          r3, [r3+r4+1]
523    shl          r3, 4
524    movd        r1d, m0
525    movsx       r1d, r1w
526    imul        r1d, 17
527    add         r1d, 16
528    sar         r1d, 5
529    movd         m0, r1d
530    add         r1d, r5d
531    sub         r3d, r1d
532    add         r1d, r1d
533    sub         r3d, r1d          ; a
534
535    movd         m1, r5d
536    movd         m3, r3d
537    SPLATW       m0, m0, 0        ; H
538    SPLATW       m1, m1, 0        ; V
539    SPLATW       m3, m3, 0        ; a
540    pmullw       m0, [pw_0to7]    ; 0*H, 1*H, ..., 7*H  (words)
541    paddw        m0, m3           ; a + {0,1,2,3,4,5,6,7}*H
542
543    mov          r4, 4
544ALIGN 16
545.loop:
546    mova         m3, m0           ; b[0..7]
547    paddw        m0, m1
548    psraw        m3, 5
549    mova         m4, m0           ; V+b[0..7]
550    paddw        m0, m1
551    psraw        m4, 5
552    packuswb     m3, m4
553    movh       [r0], m3
554    movhps  [r0+r2], m3
555
556    lea          r0, [r0+r2*2]
557    dec          r4
558    jg .loop
559    REP_RET
560%endmacro
561
562INIT_XMM sse2
563H264_PRED8x8_PLANE
564INIT_XMM ssse3
565H264_PRED8x8_PLANE
566
567;-----------------------------------------------------------------------------
568; void ff_pred8x8_vertical_8(uint8_t *src, ptrdiff_t stride)
569;-----------------------------------------------------------------------------
570
571INIT_MMX mmx
572cglobal pred8x8_vertical_8, 2,2
573    sub    r0, r1
574    movq  mm0, [r0]
575%rep 3
576    movq [r0+r1*1], mm0
577    movq [r0+r1*2], mm0
578    lea    r0, [r0+r1*2]
579%endrep
580    movq [r0+r1*1], mm0
581    movq [r0+r1*2], mm0
582    RET
583
584;-----------------------------------------------------------------------------
585; void ff_pred8x8_horizontal_8(uint8_t *src, ptrdiff_t stride)
586;-----------------------------------------------------------------------------
587
588%macro PRED8x8_H 0
589cglobal pred8x8_horizontal_8, 2,3
590    mov       r2, 4
591%if cpuflag(ssse3)
592    mova      m2, [pb_3]
593%endif
594.loop:
595    SPLATB_LOAD m0, r0+r1*0-1, m2
596    SPLATB_LOAD m1, r0+r1*1-1, m2
597    mova [r0+r1*0], m0
598    mova [r0+r1*1], m1
599    lea       r0, [r0+r1*2]
600    dec       r2
601    jg .loop
602    REP_RET
603%endmacro
604
605INIT_MMX mmxext
606PRED8x8_H
607INIT_MMX ssse3
608PRED8x8_H
609
610;-----------------------------------------------------------------------------
611; void ff_pred8x8_top_dc_8_mmxext(uint8_t *src, ptrdiff_t stride)
612;-----------------------------------------------------------------------------
613INIT_MMX mmxext
614cglobal pred8x8_top_dc_8, 2,5
615    sub         r0, r1
616    movq       mm0, [r0]
617    pxor       mm1, mm1
618    pxor       mm2, mm2
619    lea         r2, [r0+r1*2]
620    punpckhbw  mm1, mm0
621    punpcklbw  mm0, mm2
622    psadbw     mm1, mm2        ; s1
623    lea         r3, [r2+r1*2]
624    psadbw     mm0, mm2        ; s0
625    psrlw      mm1, 1
626    psrlw      mm0, 1
627    pavgw      mm1, mm2
628    lea         r4, [r3+r1*2]
629    pavgw      mm0, mm2
630    pshufw     mm1, mm1, 0
631    pshufw     mm0, mm0, 0     ; dc0 (w)
632    packuswb   mm0, mm1        ; dc0,dc1 (b)
633    movq [r0+r1*1], mm0
634    movq [r0+r1*2], mm0
635    lea         r0, [r3+r1*2]
636    movq [r2+r1*1], mm0
637    movq [r2+r1*2], mm0
638    movq [r3+r1*1], mm0
639    movq [r3+r1*2], mm0
640    movq [r0+r1*1], mm0
641    movq [r0+r1*2], mm0
642    RET
643
644;-----------------------------------------------------------------------------
645; void ff_pred8x8_dc_8_mmxext(uint8_t *src, ptrdiff_t stride)
646;-----------------------------------------------------------------------------
647
648INIT_MMX mmxext
649cglobal pred8x8_dc_8, 2,5
650    sub       r0, r1
651    pxor      m7, m7
652    movd      m0, [r0+0]
653    movd      m1, [r0+4]
654    psadbw    m0, m7            ; s0
655    mov       r4, r0
656    psadbw    m1, m7            ; s1
657
658    movzx    r2d, byte [r0+r1*1-1]
659    movzx    r3d, byte [r0+r1*2-1]
660    lea       r0, [r0+r1*2]
661    add      r2d, r3d
662    movzx    r3d, byte [r0+r1*1-1]
663    add      r2d, r3d
664    movzx    r3d, byte [r0+r1*2-1]
665    add      r2d, r3d
666    lea       r0, [r0+r1*2]
667    movd      m2, r2d            ; s2
668    movzx    r2d, byte [r0+r1*1-1]
669    movzx    r3d, byte [r0+r1*2-1]
670    lea       r0, [r0+r1*2]
671    add      r2d, r3d
672    movzx    r3d, byte [r0+r1*1-1]
673    add      r2d, r3d
674    movzx    r3d, byte [r0+r1*2-1]
675    add      r2d, r3d
676    movd      m3, r2d            ; s3
677
678    punpcklwd m0, m1
679    mov       r0, r4
680    punpcklwd m2, m3
681    punpckldq m0, m2            ; s0, s1, s2, s3
682    pshufw    m3, m0, 11110110b ; s2, s1, s3, s3
683    lea       r2, [r0+r1*2]
684    pshufw    m0, m0, 01110100b ; s0, s1, s3, s1
685    paddw     m0, m3
686    lea       r3, [r2+r1*2]
687    psrlw     m0, 2
688    pavgw     m0, m7            ; s0+s2, s1, s3, s1+s3
689    lea       r4, [r3+r1*2]
690    packuswb  m0, m0
691    punpcklbw m0, m0
692    movq      m1, m0
693    punpcklbw m0, m0
694    punpckhbw m1, m1
695    movq [r0+r1*1], m0
696    movq [r0+r1*2], m0
697    movq [r2+r1*1], m0
698    movq [r2+r1*2], m0
699    movq [r3+r1*1], m1
700    movq [r3+r1*2], m1
701    movq [r4+r1*1], m1
702    movq [r4+r1*2], m1
703    RET
704
705;-----------------------------------------------------------------------------
706; void ff_pred8x8_dc_rv40_8(uint8_t *src, ptrdiff_t stride)
707;-----------------------------------------------------------------------------
708
709INIT_MMX mmxext
710cglobal pred8x8_dc_rv40_8, 2,7
711    mov       r4, r0
712    sub       r0, r1
713    pxor      mm0, mm0
714    psadbw    mm0, [r0]
715    dec        r0
716    movzx     r5d, byte [r0+r1*1]
717    movd      r6d, mm0
718    lea        r0, [r0+r1*2]
719%rep 3
720    movzx     r2d, byte [r0+r1*0]
721    movzx     r3d, byte [r0+r1*1]
722    add       r5d, r2d
723    add       r6d, r3d
724    lea        r0, [r0+r1*2]
725%endrep
726    movzx     r2d, byte [r0+r1*0]
727    add       r5d, r6d
728    lea       r2d, [r2+r5+8]
729    shr       r2d, 4
730    movd      mm0, r2d
731    punpcklbw mm0, mm0
732    pshufw    mm0, mm0, 0
733    mov       r3d, 4
734.loop:
735    movq [r4+r1*0], mm0
736    movq [r4+r1*1], mm0
737    lea   r4, [r4+r1*2]
738    dec   r3d
739    jg .loop
740    REP_RET
741
742;-----------------------------------------------------------------------------
743; void ff_pred8x8_tm_vp8_8(uint8_t *src, ptrdiff_t stride)
744;-----------------------------------------------------------------------------
745
746INIT_XMM sse2
747cglobal pred8x8_tm_vp8_8, 2,6,4
748    sub          r0, r1
749    pxor       xmm1, xmm1
750    movq       xmm0, [r0]
751    punpcklbw  xmm0, xmm1
752    movzx       r4d, byte [r0-1]
753    mov         r5d, 4
754.loop:
755    movzx       r2d, byte [r0+r1*1-1]
756    movzx       r3d, byte [r0+r1*2-1]
757    sub         r2d, r4d
758    sub         r3d, r4d
759    movd       xmm2, r2d
760    movd       xmm3, r3d
761    pshuflw    xmm2, xmm2, 0
762    pshuflw    xmm3, xmm3, 0
763    punpcklqdq xmm2, xmm2
764    punpcklqdq xmm3, xmm3
765    paddw      xmm2, xmm0
766    paddw      xmm3, xmm0
767    packuswb   xmm2, xmm3
768    movq   [r0+r1*1], xmm2
769    movhps [r0+r1*2], xmm2
770    lea          r0, [r0+r1*2]
771    dec         r5d
772    jg .loop
773    REP_RET
774
775INIT_XMM ssse3
776cglobal pred8x8_tm_vp8_8, 2,3,6
777    sub          r0, r1
778    movdqa     xmm4, [tm_shuf]
779    pxor       xmm1, xmm1
780    movq       xmm0, [r0]
781    punpcklbw  xmm0, xmm1
782    movd       xmm5, [r0-4]
783    pshufb     xmm5, xmm4
784    mov         r2d, 4
785.loop:
786    movd       xmm2, [r0+r1*1-4]
787    movd       xmm3, [r0+r1*2-4]
788    pshufb     xmm2, xmm4
789    pshufb     xmm3, xmm4
790    psubw      xmm2, xmm5
791    psubw      xmm3, xmm5
792    paddw      xmm2, xmm0
793    paddw      xmm3, xmm0
794    packuswb   xmm2, xmm3
795    movq   [r0+r1*1], xmm2
796    movhps [r0+r1*2], xmm2
797    lea          r0, [r0+r1*2]
798    dec         r2d
799    jg .loop
800    REP_RET
801
802; dest, left, right, src, tmp
803; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
804%macro PRED4x4_LOWPASS 5
805    mova    %5, %2
806    pavgb   %2, %3
807    pxor    %3, %5
808    mova    %1, %4
809    pand    %3, [pb_1]
810    psubusb %2, %3
811    pavgb   %1, %2
812%endmacro
813
814;-----------------------------------------------------------------------------
815; void ff_pred8x8l_top_dc_8(uint8_t *src, int has_topleft, int has_topright,
816;                           ptrdiff_t stride)
817;-----------------------------------------------------------------------------
818%macro PRED8x8L_TOP_DC 0
819cglobal pred8x8l_top_dc_8, 4,4
820    sub          r0, r3
821    pxor        mm7, mm7
822    movq        mm0, [r0-8]
823    movq        mm3, [r0]
824    movq        mm1, [r0+8]
825    movq        mm2, mm3
826    movq        mm4, mm3
827    PALIGNR     mm2, mm0, 7, mm0
828    PALIGNR     mm1, mm4, 1, mm4
829    test        r1d, r1d ; top_left
830    jz .fix_lt_2
831    test        r2d, r2d ; top_right
832    jz .fix_tr_1
833    jmp .body
834.fix_lt_2:
835    movq        mm5, mm3
836    pxor        mm5, mm2
837    psllq       mm5, 56
838    psrlq       mm5, 56
839    pxor        mm2, mm5
840    test        r2d, r2d ; top_right
841    jnz .body
842.fix_tr_1:
843    movq        mm5, mm3
844    pxor        mm5, mm1
845    psrlq       mm5, 56
846    psllq       mm5, 56
847    pxor        mm1, mm5
848.body:
849    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
850    psadbw   mm7, mm0
851    paddw    mm7, [pw_4]
852    psrlw    mm7, 3
853    pshufw   mm7, mm7, 0
854    packuswb mm7, mm7
855%rep 3
856    movq [r0+r3*1], mm7
857    movq [r0+r3*2], mm7
858    lea    r0, [r0+r3*2]
859%endrep
860    movq [r0+r3*1], mm7
861    movq [r0+r3*2], mm7
862    RET
863%endmacro
864
865INIT_MMX mmxext
866PRED8x8L_TOP_DC
867INIT_MMX ssse3
868PRED8x8L_TOP_DC
869
870;-----------------------------------------------------------------------------
871; void ff_pred8x8l_dc_8(uint8_t *src, int has_topleft, int has_topright,
872;                       ptrdiff_t stride)
873;-----------------------------------------------------------------------------
874
875%macro PRED8x8L_DC 0
876cglobal pred8x8l_dc_8, 4,5
877    sub          r0, r3
878    lea          r4, [r0+r3*2]
879    movq        mm0, [r0+r3*1-8]
880    punpckhbw   mm0, [r0+r3*0-8]
881    movq        mm1, [r4+r3*1-8]
882    punpckhbw   mm1, [r0+r3*2-8]
883    mov          r4, r0
884    punpckhwd   mm1, mm0
885    lea          r0, [r0+r3*4]
886    movq        mm2, [r0+r3*1-8]
887    punpckhbw   mm2, [r0+r3*0-8]
888    lea          r0, [r0+r3*2]
889    movq        mm3, [r0+r3*1-8]
890    punpckhbw   mm3, [r0+r3*0-8]
891    punpckhwd   mm3, mm2
892    punpckhdq   mm3, mm1
893    lea          r0, [r0+r3*2]
894    movq        mm0, [r0+r3*0-8]
895    movq        mm1, [r4]
896    mov          r0, r4
897    movq        mm4, mm3
898    movq        mm2, mm3
899    PALIGNR     mm4, mm0, 7, mm0
900    PALIGNR     mm1, mm2, 1, mm2
901    test        r1d, r1d
902    jnz .do_left
903.fix_lt_1:
904    movq        mm5, mm3
905    pxor        mm5, mm4
906    psrlq       mm5, 56
907    psllq       mm5, 48
908    pxor        mm1, mm5
909    jmp .do_left
910.fix_lt_2:
911    movq        mm5, mm3
912    pxor        mm5, mm2
913    psllq       mm5, 56
914    psrlq       mm5, 56
915    pxor        mm2, mm5
916    test        r2d, r2d
917    jnz .body
918.fix_tr_1:
919    movq        mm5, mm3
920    pxor        mm5, mm1
921    psrlq       mm5, 56
922    psllq       mm5, 56
923    pxor        mm1, mm5
924    jmp .body
925.do_left:
926    movq        mm0, mm4
927    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
928    movq        mm4, mm0
929    movq        mm7, mm2
930    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
931    psllq       mm1, 56
932    PALIGNR     mm7, mm1, 7, mm3
933    movq        mm0, [r0-8]
934    movq        mm3, [r0]
935    movq        mm1, [r0+8]
936    movq        mm2, mm3
937    movq        mm4, mm3
938    PALIGNR     mm2, mm0, 7, mm0
939    PALIGNR     mm1, mm4, 1, mm4
940    test        r1d, r1d
941    jz .fix_lt_2
942    test        r2d, r2d
943    jz .fix_tr_1
944.body:
945    lea          r1, [r0+r3*2]
946    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
947    pxor        mm0, mm0
948    pxor        mm1, mm1
949    lea          r2, [r1+r3*2]
950    psadbw      mm0, mm7
951    psadbw      mm1, mm6
952    paddw       mm0, [pw_8]
953    paddw       mm0, mm1
954    lea          r4, [r2+r3*2]
955    psrlw       mm0, 4
956    pshufw      mm0, mm0, 0
957    packuswb    mm0, mm0
958    movq [r0+r3*1], mm0
959    movq [r0+r3*2], mm0
960    movq [r1+r3*1], mm0
961    movq [r1+r3*2], mm0
962    movq [r2+r3*1], mm0
963    movq [r2+r3*2], mm0
964    movq [r4+r3*1], mm0
965    movq [r4+r3*2], mm0
966    RET
967%endmacro
968
969INIT_MMX mmxext
970PRED8x8L_DC
971INIT_MMX ssse3
972PRED8x8L_DC
973
974;-----------------------------------------------------------------------------
975; void ff_pred8x8l_horizontal_8(uint8_t *src, int has_topleft,
976;                               int has_topright, ptrdiff_t stride)
977;-----------------------------------------------------------------------------
978
979%macro PRED8x8L_HORIZONTAL 0
980cglobal pred8x8l_horizontal_8, 4,4
981    sub          r0, r3
982    lea          r2, [r0+r3*2]
983    movq        mm0, [r0+r3*1-8]
984    test        r1d, r1d
985    lea          r1, [r0+r3]
986    cmovnz       r1, r0
987    punpckhbw   mm0, [r1+r3*0-8]
988    movq        mm1, [r2+r3*1-8]
989    punpckhbw   mm1, [r0+r3*2-8]
990    mov          r2, r0
991    punpckhwd   mm1, mm0
992    lea          r0, [r0+r3*4]
993    movq        mm2, [r0+r3*1-8]
994    punpckhbw   mm2, [r0+r3*0-8]
995    lea          r0, [r0+r3*2]
996    movq        mm3, [r0+r3*1-8]
997    punpckhbw   mm3, [r0+r3*0-8]
998    punpckhwd   mm3, mm2
999    punpckhdq   mm3, mm1
1000    lea          r0, [r0+r3*2]
1001    movq        mm0, [r0+r3*0-8]
1002    movq        mm1, [r1+r3*0-8]
1003    mov          r0, r2
1004    movq        mm4, mm3
1005    movq        mm2, mm3
1006    PALIGNR     mm4, mm0, 7, mm0
1007    PALIGNR     mm1, mm2, 1, mm2
1008    movq        mm0, mm4
1009    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1010    movq        mm4, mm0
1011    movq        mm7, mm2
1012    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1013    psllq       mm1, 56
1014    PALIGNR     mm7, mm1, 7, mm3
1015    movq        mm3, mm7
1016    lea         r1, [r0+r3*2]
1017    movq       mm7, mm3
1018    punpckhbw  mm3, mm3
1019    punpcklbw  mm7, mm7
1020    pshufw     mm0, mm3, 0xff
1021    pshufw     mm1, mm3, 0xaa
1022    lea         r2, [r1+r3*2]
1023    pshufw     mm2, mm3, 0x55
1024    pshufw     mm3, mm3, 0x00
1025    pshufw     mm4, mm7, 0xff
1026    pshufw     mm5, mm7, 0xaa
1027    pshufw     mm6, mm7, 0x55
1028    pshufw     mm7, mm7, 0x00
1029    movq [r0+r3*1], mm0
1030    movq [r0+r3*2], mm1
1031    movq [r1+r3*1], mm2
1032    movq [r1+r3*2], mm3
1033    movq [r2+r3*1], mm4
1034    movq [r2+r3*2], mm5
1035    lea         r0, [r2+r3*2]
1036    movq [r0+r3*1], mm6
1037    movq [r0+r3*2], mm7
1038    RET
1039%endmacro
1040
1041INIT_MMX mmxext
1042PRED8x8L_HORIZONTAL
1043INIT_MMX ssse3
1044PRED8x8L_HORIZONTAL
1045
1046;-----------------------------------------------------------------------------
1047; void ff_pred8x8l_vertical_8(uint8_t *src, int has_topleft, int has_topright,
1048;                             ptrdiff_t stride)
1049;-----------------------------------------------------------------------------
1050
; Filter the top edge with a 3-tap lowpass (PRED4x4_LOWPASS, defined elsewhere
; in this file) and replicate the filtered row into all 8 rows of the block.
; r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride.
%macro PRED8x8L_VERTICAL 0
cglobal pred8x8l_vertical_8, 4,4
    sub          r0, r3            ; r0 -> row above the block
    movq        mm0, [r0-8]        ; 8 bytes left of the top row (top-left in high byte)
    movq        mm3, [r0]          ; top row t0..t7
    movq        mm1, [r0+8]        ; 8 bytes right of the top row (top-right in low byte)
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0   ; mm2 = top shifted right:  tl t0..t6
    PALIGNR     mm1, mm4, 1, mm4   ; mm1 = top shifted left:   t1..t7 tr
    test        r1d, r1d ; top_left
    jz .fix_lt_2
    test        r2d, r2d ; top_right
    jz .fix_tr_1
    jmp .body
.fix_lt_2:
    ; top-left unavailable: replace low byte of mm2 with t0
    ; (xor-mask trick: isolate the differing byte, xor it back out)
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test        r2d, r2d ; top_right
    jnz .body
.fix_tr_1:
    ; top-right unavailable: replace high byte of mm1 with t7
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
.body:
    ; mm0 = lowpass(mm2, mm3, mm1) — the smoothed top row
    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
    ; store the same 8 filtered pixels to all 8 rows
%rep 3
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    lea    r0, [r0+r3*2]
%endrep
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    RET
%endmacro

INIT_MMX mmxext
PRED8x8L_VERTICAL
INIT_MMX ssse3
PRED8x8L_VERTICAL
1096
1097;-----------------------------------------------------------------------------
1098; void ff_pred8x8l_down_left_8(uint8_t *src, int has_topleft,
1099;                              int has_topright, ptrdiff_t stride)
1100;-----------------------------------------------------------------------------
1101
; Down-left 8x8 luma prediction: build a 16-byte vector of filtered
; top + top-right samples in xmm3, lowpass it once more against its
; 1-byte shifts, then emit successive 8-byte windows, one per row.
; r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride.
; NOTE: runs in MMX mode for edge setup, then switches to XMM (INIT_XMM
; cpuname) for the 16-byte diagonal stage.
%macro PRED8x8L_DOWN_LEFT 0
cglobal pred8x8l_down_left_8, 4,4
    sub          r0, r3
    movq        mm0, [r0-8]        ; bytes left of top row
    movq        mm3, [r0]          ; top row t0..t7
    movq        mm1, [r0+8]        ; top-right row tr0..tr7
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0   ; tl t0..t6
    PALIGNR     mm1, mm4, 1, mm4   ; t1..t7 tr0
    test        r1d, r1d ; top_left
    jz .fix_lt_2
    test        r2d, r2d ; top_right
    jz .fix_tr_1
    jmp .do_top
.fix_lt_2:
    ; no top-left: substitute t0 into low byte of mm2
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test        r2d, r2d ; top_right
    jnz .do_top
.fix_tr_1:
    ; no top-right: substitute t7 into high byte of mm1
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.fix_tr_2:
    ; no top-right at the second stage: replicate t7 across mm1
    punpckhbw   mm3, mm3
    pshufw      mm1, mm3, 0xFF
    jmp .do_topright
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq    xmm3, mm4           ; filtered top -> low half of xmm3
    test        r2d, r2d ; top_right
    jz .fix_tr_2
    ; filter the real top-right row the same way
    movq        mm0, [r0+8]
    movq        mm5, mm0
    movq        mm2, mm0
    movq        mm4, mm0
    psrlq       mm5, 56
    PALIGNR     mm2, mm3, 7, mm3
    PALIGNR     mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    movq2dq    xmm4, mm1           ; filtered top-right
    psrlq       mm1, 56
    movq2dq    xmm5, mm1           ; last top-right byte (edge replication)
    lea         r1, [r0+r3*2]
    pslldq    xmm4, 8
    por       xmm3, xmm4           ; xmm3 = 16 filtered top(+right) samples
    movdqa    xmm2, xmm3
    psrldq    xmm2, 1              ; shifted-left neighbor
    pslldq    xmm5, 15
    por       xmm2, xmm5           ; top byte padded with replicated edge
    lea         r2, [r1+r3*2]
    movdqa    xmm1, xmm3
    pslldq    xmm1, 1              ; shifted-right neighbor
INIT_XMM cpuname
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
    ; each row r takes bytes (r+1)..(r+8) of the diagonal vector
    psrldq    xmm0, 1
    movq [r0+r3*1], xmm0
    psrldq    xmm0, 1
    movq [r0+r3*2], xmm0
    psrldq    xmm0, 1
    lea         r0, [r2+r3*2]
    movq [r1+r3*1], xmm0
    psrldq    xmm0, 1
    movq [r1+r3*2], xmm0
    psrldq    xmm0, 1
    movq [r2+r3*1], xmm0
    psrldq    xmm0, 1
    movq [r2+r3*2], xmm0
    psrldq    xmm0, 1
    movq [r0+r3*1], xmm0
    psrldq    xmm0, 1
    movq [r0+r3*2], xmm0
    RET
%endmacro

INIT_MMX sse2
PRED8x8L_DOWN_LEFT
INIT_MMX ssse3
PRED8x8L_DOWN_LEFT
1189
1190;-----------------------------------------------------------------------------
1191; void ff_pred8x8l_down_right_8(uint8_t *src, int has_topleft,
1192;                               int has_topright, ptrdiff_t stride)
1193;-----------------------------------------------------------------------------
1194
; Down-right 8x8 luma prediction: gather the 8 left-edge pixels into an
; MMX register via punpck transposition, lowpass-filter left, top-left and
; top edges, assemble them into one 16-byte diagonal vector, filter once
; more, then write 8-byte windows bottom-up.
; r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride,
; r4 = saved pointer to row above the block.
%macro PRED8x8L_DOWN_RIGHT 0
cglobal pred8x8l_down_right_8, 4,5
    sub          r0, r3
    lea          r4, [r0+r3*2]
    ; interleave bytes at column -1 of successive rows so that mm3 ends up
    ; holding the 8 left-edge pixels (topmost in the high byte)
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0            ; r4 = top-row pointer, reused for stores
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1           ; mm3 = left column l7..l0
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]   ; row below (for the shifted-down neighbor)
    movq        mm1, [r4]          ; top row (supplies the top-left neighbor)
    mov          r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0   ; left column shifted down by one
    PALIGNR     mm1, mm2, 1, mm2   ; left column shifted up, top-left in high byte
    test        r1d, r1d
    jz .fix_lt_1
    jmp .do_left
.fix_lt_1:
    ; no top-left: patch the substituted byte out of mm1
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    ; (top stage) no top-left: substitute t0 into low byte of mm2
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test        r2d, r2d
    jnz .do_top
.fix_tr_1:
    ; (top stage) no top-right: substitute t7 into high byte of mm1
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.do_left:
    ; lowpass the left column (and its bottom byte) into xmm1 low half
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    movq2dq    xmm3, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3
    movq2dq    xmm1, mm7
    ; now load and edge-fix the top row exactly as in the vertical case
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test        r1d, r1d
    jz .fix_lt_2
    test        r2d, r2d
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq   xmm4, mm4
    lea         r1, [r0+r3*2]
    movdqa    xmm0, xmm3
    pslldq    xmm4, 8
    por       xmm3, xmm4           ; xmm3 = left | top combined vector
    lea         r2, [r1+r3*2]
    pslldq    xmm4, 1
    por       xmm1, xmm4
    ; splice the filtered top-left byte into xmm1
    psrldq    xmm0, 7
    pslldq    xmm0, 15
    psrldq    xmm0, 7
    por       xmm1, xmm0
    lea         r0, [r2+r3*2]
    movdqa    xmm2, xmm3
    psrldq    xmm2, 1
INIT_XMM cpuname
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
    ; two staggered copies (xmm0/xmm1, one byte apart) give the two rows
    ; written per iteration, stores go bottom row first
    movdqa    xmm1, xmm0
    psrldq    xmm1, 1
    movq [r0+r3*2], xmm0
    movq [r0+r3*1], xmm1
    psrldq    xmm0, 2
    psrldq    xmm1, 2
    movq [r2+r3*2], xmm0
    movq [r2+r3*1], xmm1
    psrldq    xmm0, 2
    psrldq    xmm1, 2
    movq [r1+r3*2], xmm0
    movq [r1+r3*1], xmm1
    psrldq    xmm0, 2
    psrldq    xmm1, 2
    movq [r4+r3*2], xmm0
    movq [r4+r3*1], xmm1
    RET
%endmacro

INIT_MMX sse2
PRED8x8L_DOWN_RIGHT
INIT_MMX ssse3
PRED8x8L_DOWN_RIGHT
1309
1310;-----------------------------------------------------------------------------
1311; void ff_pred8x8l_vertical_right_8(uint8_t *src, int has_topleft,
1312;                                   int has_topright, ptrdiff_t stride)
1313;-----------------------------------------------------------------------------
1314
; Vertical-right 8x8 luma prediction: even rows come from pavgb of the
; combined left|top vector with its 1-byte shift, odd rows from the
; 3-tap lowpass; rows are interleaved back via the pw_ff00 mask and
; packuswb, then stored with per-row byte shifts.
; r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride.
%macro PRED8x8L_VERTICAL_RIGHT 0
cglobal pred8x8l_vertical_right_8, 4,5,7
    ; manually spill XMM registers for Win64 because
    ; the code here is initialized with INIT_MMX
    WIN64_SPILL_XMM 7
    sub          r0, r3
    lea          r4, [r0+r3*2]
    ; gather the 8 left-edge pixels into mm3 (punpck transpose of column -1)
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]
    mov          r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0   ; left shifted down
    PALIGNR     mm1, mm2, 1, mm2   ; left shifted up (top-left in high byte)
    test        r1d, r1d
    jnz .do_left
.fix_lt_1:
    ; no top-left: patch the substituted byte out of mm1
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    ; (top stage) no top-left: substitute t0 into low byte of mm2
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test        r2d, r2d
    jnz .do_top
.fix_tr_1:
    ; (top stage) no top-right: substitute t7 into high byte of mm1
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.do_left:
    ; lowpass the left column, keep it in xmm0 low half
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq2dq    xmm0, mm2
    ; load and edge-fix the top row
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test        r1d, r1d
    jz .fix_lt_2
    test        r2d, r2d
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
    lea           r1, [r0+r3*2]
    movq2dq     xmm4, mm6
    pslldq      xmm4, 8
    por         xmm0, xmm4         ; xmm0 = left | filtered top
    movdqa      xmm6, [pw_ff00]    ; mask for extracting odd bytes later
    movdqa      xmm1, xmm0
    lea           r2, [r1+r3*2]
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm0
    pslldq      xmm0, 1
    pslldq      xmm1, 2
    pavgb       xmm2, xmm0         ; even rows: 2-tap average
INIT_XMM cpuname
    PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5  ; odd rows: 3-tap lowpass
    pandn       xmm6, xmm4
    movdqa      xmm5, xmm4
    psrlw       xmm4, 8
    packuswb    xmm6, xmm4         ; de-interleave lowpass bytes
    movhlps     xmm4, xmm6
    movhps [r0+r3*2], xmm5
    movhps [r0+r3*1], xmm2
    psrldq      xmm5, 4
    movss       xmm5, xmm6         ; splice left-column bytes into low dword
    psrldq      xmm2, 4
    movss       xmm2, xmm4
    lea           r0, [r2+r3*2]
    ; remaining rows: shift one byte per row pair, store bottom-up
    psrldq      xmm5, 1
    psrldq      xmm2, 1
    movq        [r0+r3*2], xmm5
    movq        [r0+r3*1], xmm2
    psrldq      xmm5, 1
    psrldq      xmm2, 1
    movq        [r2+r3*2], xmm5
    movq        [r2+r3*1], xmm2
    psrldq      xmm5, 1
    psrldq      xmm2, 1
    movq        [r1+r3*2], xmm5
    movq        [r1+r3*1], xmm2
    RET
%endmacro

INIT_MMX sse2
PRED8x8L_VERTICAL_RIGHT
INIT_MMX ssse3
PRED8x8L_VERTICAL_RIGHT
1430
1431;-----------------------------------------------------------------------------
1432; void ff_pred8x8l_vertical_left_8(uint8_t *src, int has_topleft,
1433;                                  int has_topright, ptrdiff_t stride)
1434;-----------------------------------------------------------------------------
1435
; Vertical-left 8x8 luma prediction: build a 16-byte vector of filtered
; top + top-right samples; even rows come from pavgb with the 1-byte
; shift, odd rows from the 3-tap lowpass, shifting one byte every two rows.
; r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride.
%macro PRED8x8L_VERTICAL_LEFT 0
cglobal pred8x8l_vertical_left_8, 4,4
    sub          r0, r3
    movq        mm0, [r0-8]        ; bytes left of top row
    movq        mm3, [r0]          ; top row t0..t7
    movq        mm1, [r0+8]        ; top-right row
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0   ; tl t0..t6
    PALIGNR     mm1, mm4, 1, mm4   ; t1..t7 tr0
    test        r1d, r1d
    jz .fix_lt_2
    test        r2d, r2d
    jz .fix_tr_1
    jmp .do_top
.fix_lt_2:
    ; no top-left: substitute t0 into low byte of mm2
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test        r2d, r2d
    jnz .do_top
.fix_tr_1:
    ; no top-right: substitute t7 into high byte of mm1
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.fix_tr_2:
    ; no top-right at the second stage: replicate t7 across mm1
    punpckhbw   mm3, mm3
    pshufw      mm1, mm3, 0xFF
    jmp .do_topright
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq    xmm4, mm4           ; filtered top -> low half of xmm4
    test        r2d, r2d
    jz .fix_tr_2
    ; filter the real top-right row the same way
    movq        mm0, [r0+8]
    movq        mm5, mm0
    movq        mm2, mm0
    movq        mm4, mm0
    psrlq       mm5, 56
    PALIGNR     mm2, mm3, 7, mm3
    PALIGNR     mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    movq2dq   xmm3, mm1
    lea         r1, [r0+r3*2]
    pslldq    xmm3, 8
    por       xmm4, xmm3           ; xmm4 = 16 filtered top(+right) samples
    movdqa    xmm2, xmm4
    movdqa    xmm1, xmm4
    movdqa    xmm3, xmm4
    psrldq    xmm2, 1              ; shifted neighbor
    pslldq    xmm1, 1              ; other neighbor
    pavgb     xmm3, xmm2           ; even rows
    lea         r2, [r1+r3*2]
INIT_XMM cpuname
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5  ; odd rows
    psrldq    xmm0, 1
    ; pairs of (pavgb, lowpass) rows, advancing one byte per pair
    movq [r0+r3*1], xmm3
    movq [r0+r3*2], xmm0
    lea         r0, [r2+r3*2]
    psrldq    xmm3, 1
    psrldq    xmm0, 1
    movq [r1+r3*1], xmm3
    movq [r1+r3*2], xmm0
    psrldq    xmm3, 1
    psrldq    xmm0, 1
    movq [r2+r3*1], xmm3
    movq [r2+r3*2], xmm0
    psrldq    xmm3, 1
    psrldq    xmm0, 1
    movq [r0+r3*1], xmm3
    movq [r0+r3*2], xmm0
    RET
%endmacro

INIT_MMX sse2
PRED8x8L_VERTICAL_LEFT
INIT_MMX ssse3
PRED8x8L_VERTICAL_LEFT
1520
1521;-----------------------------------------------------------------------------
1522; void ff_pred8x8l_horizontal_up_8(uint8_t *src, int has_topleft,
1523;                                  int has_topright, ptrdiff_t stride)
1524;-----------------------------------------------------------------------------
1525
; Horizontal-up 8x8 luma prediction: filter the left-edge column, reverse
; it to l7..l0 order, build 2-tap (pavgb) and 3-tap (lowpass) interpolants,
; interleave them, then emit 8 rows using sliding/shuffled windows with
; edge replication toward the bottom.
; r0 = src, r1 = has_topleft, r2 = has_topright (unused here as a flag
; after the left gather), r3 = stride.
%macro PRED8x8L_HORIZONTAL_UP 0
cglobal pred8x8l_horizontal_up_8, 4,4
    sub          r0, r3
    lea          r2, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    test        r1d, r1d
    ; lea does not modify flags, so the cmovnz below still tests r1d:
    ; r1 = top-left row if has_topleft, else first left row (replicated edge)
    lea          r1, [r0+r3]
    cmovnz       r1, r0
    punpckhbw   mm0, [r1+r3*0-8]
    ; punpck transpose of column -1 over 8 rows -> mm3 = left column
    movq        mm1, [r2+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r2, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r1+r3*0-8]
    mov          r0, r2
    ; lowpass-filter the left column (mm7 = filtered result)
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    movq       mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq       mm4, mm0
    movq       mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq      mm1, 56
    PALIGNR    mm7, mm1, 7, mm3
    lea         r1, [r0+r3*2]
    pshufw     mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
    psllq      mm7, 56             ; l7 .. .. .. .. .. .. ..
    movq       mm2, mm0
    psllw      mm0, 8
    psrlw      mm2, 8
    por        mm2, mm0            ; l7 l6 l5 l4 l3 l2 l1 l0
    movq       mm3, mm2
    movq       mm4, mm2
    movq       mm5, mm2
    psrlq      mm2, 8
    psrlq      mm3, 16
    lea         r2, [r1+r3*2]
    por        mm2, mm7            ; l7 l7 l6 l5 l4 l3 l2 l1
    punpckhbw  mm7, mm7
    por        mm3, mm7            ; l7 l7 l7 l6 l5 l4 l3 l2
    pavgb      mm4, mm2            ; 2-tap averages
    PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6   ; 3-tap interpolants
    movq       mm5, mm4
    punpcklbw  mm4, mm1            ; p4 p3 p2 p1
    punpckhbw  mm5, mm1            ; p8 p7 p6 p5
    ; build the remaining rows as shifted/shuffled windows of p1..p8
    movq       mm6, mm5
    movq       mm7, mm5
    movq       mm0, mm5
    PALIGNR    mm5, mm4, 2, mm1
    pshufw     mm1, mm6, 11111001b
    PALIGNR    mm6, mm4, 4, mm2
    pshufw     mm2, mm7, 11111110b
    PALIGNR    mm7, mm4, 6, mm3
    pshufw     mm3, mm0, 11111111b
    movq [r0+r3*1], mm4
    movq [r0+r3*2], mm5
    lea         r0, [r2+r3*2]
    movq [r1+r3*1], mm6
    movq [r1+r3*2], mm7
    movq [r2+r3*1], mm0
    movq [r2+r3*2], mm1
    movq [r0+r3*1], mm2
    movq [r0+r3*2], mm3
    RET
%endmacro

INIT_MMX mmxext
PRED8x8L_HORIZONTAL_UP
INIT_MMX ssse3
PRED8x8L_HORIZONTAL_UP
1608
1609;-----------------------------------------------------------------------------
1610; void ff_pred8x8l_horizontal_down_8(uint8_t *src, int has_topleft,
1611;                                    int has_topright, ptrdiff_t stride)
1612;-----------------------------------------------------------------------------
1613
; Horizontal-down 8x8 luma prediction: filter left, top-left and top edges,
; combine them into one 16-byte vector, form 2-tap (pavgb) and 3-tap
; (lowpass) interpolants, interleave with punpcklbw, and store rows
; bottom-up shifting two bytes per row.
; r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride,
; r4 = saved pointer to row above the block.
%macro PRED8x8L_HORIZONTAL_DOWN 0
cglobal pred8x8l_horizontal_down_8, 4,5
    sub          r0, r3
    lea          r4, [r0+r3*2]
    ; punpck transpose of column -1 over 8 rows -> mm3 = left column
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]
    mov          r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0   ; left shifted down
    PALIGNR     mm1, mm2, 1, mm2   ; left shifted up (top-left in high byte)
    test        r1d, r1d
    jnz .do_left
.fix_lt_1:
    ; no top-left: patch the substituted byte out of mm1
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    ; (top stage) no top-left: substitute t0 into low byte of mm2
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test        r2d, r2d
    jnz .do_top
.fix_tr_1:
    ; (top stage) no top-right: substitute t7 into high byte of mm1
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.fix_tr_2:
    ; no top-right at the second stage: replicate t7 across mm1
    punpckhbw   mm3, mm3
    pshufw      mm1, mm3, 0xFF
    jmp .do_topright
.do_left:
    ; lowpass the left column + top-left, accumulate into xmm0
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq2dq    xmm0, mm2
    pslldq     xmm0, 8
    movq        mm4, mm0
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    movq2dq    xmm2, mm1
    pslldq     xmm2, 15
    psrldq     xmm2, 8
    por        xmm0, xmm2
    ; load and edge-fix the top row
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test        r1d, r1d
    jz .fix_lt_2
    test        r2d, r2d
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq    xmm1, mm4
    test        r2d, r2d
    jz .fix_tr_2
    ; filter the real top-right row
    movq        mm0, [r0+8]
    movq        mm5, mm0
    movq        mm2, mm0
    movq        mm4, mm0
    psrlq       mm5, 56
    PALIGNR     mm2, mm3, 7, mm3
    PALIGNR     mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    movq2dq    xmm5, mm1
    pslldq     xmm5, 8
    por        xmm1, xmm5          ; xmm1 = filtered top | top-right
INIT_XMM cpuname
    lea         r2, [r4+r3*2]
    movdqa    xmm2, xmm1
    movdqa    xmm3, xmm1
    PALIGNR   xmm1, xmm0, 7, xmm4  ; three staggered views of left|top|tr
    PALIGNR   xmm2, xmm0, 9, xmm5
    lea         r1, [r2+r3*2]
    PALIGNR   xmm3, xmm0, 8, xmm0
    movdqa    xmm4, xmm1
    pavgb     xmm4, xmm3           ; 2-tap averages
    lea         r0, [r1+r3*2]
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5  ; 3-tap interpolants
    punpcklbw xmm4, xmm0           ; interleave avg/lowpass pairs
    movhlps   xmm0, xmm4
    ; store bottom-up, two bytes (one avg+lowpass pair) per row step
    movq   [r0+r3*2], xmm4
    movq   [r2+r3*2], xmm0
    psrldq xmm4, 2
    psrldq xmm0, 2
    movq   [r0+r3*1], xmm4
    movq   [r2+r3*1], xmm0
    psrldq xmm4, 2
    psrldq xmm0, 2
    movq   [r1+r3*2], xmm4
    movq   [r4+r3*2], xmm0
    psrldq xmm4, 2
    psrldq xmm0, 2
    movq   [r1+r3*1], xmm4
    movq   [r4+r3*1], xmm0
    RET
%endmacro

INIT_MMX sse2
PRED8x8L_HORIZONTAL_DOWN
INIT_MMX ssse3
PRED8x8L_HORIZONTAL_DOWN
1742
1743;-------------------------------------------------------------------------------
1744; void ff_pred4x4_dc_8_mmxext(uint8_t *src, const uint8_t *topright,
1745;                             ptrdiff_t stride)
1746;-------------------------------------------------------------------------------
1747
INIT_MMX mmxext
; 4x4 DC prediction: dc = (sum of 4 top + 4 left pixels + 4) >> 3,
; replicated to all 16 pixels. r0 = src, r1 = topright (unused as a
; pointer here; r1d is reused as a byte-load scratch), r2 = stride.
cglobal pred4x4_dc_8, 3,5
    pxor   mm7, mm7
    mov     r4, r0                 ; r4 = src (row 0), kept for the first store
    sub     r0, r2                 ; r0 -> row above the block
    movd   mm0, [r0]               ; 4 top bytes (movd zeroes the upper half)
    psadbw mm0, mm7                ; horizontal sum of the top pixels
    movzx  r1d, byte [r0+r2*1-1]   ; left pixel, row 0
    movd   r3d, mm0
    add    r3d, r1d
    movzx  r1d, byte [r0+r2*2-1]   ; left pixel, row 1
    lea     r0, [r0+r2*2]
    add    r3d, r1d
    movzx  r1d, byte [r0+r2*1-1]   ; left pixel, row 2
    add    r3d, r1d
    movzx  r1d, byte [r0+r2*2-1]   ; left pixel, row 3
    add    r3d, r1d
    add    r3d, 4                  ; rounding
    shr    r3d, 3
    imul   r3d, 0x01010101         ; replicate dc byte into all 4 bytes
    mov   [r4+r2*0], r3d           ; row 0
    mov   [r0+r2*0], r3d           ; row 1 (r0 = src + stride here)
    mov   [r0+r2*1], r3d           ; row 2
    mov   [r0+r2*2], r3d           ; row 3
    RET
1773
1774;-----------------------------------------------------------------------------
1775; void ff_pred4x4_tm_vp8_8_mmxext(uint8_t *src, const uint8_t *topright,
1776;                                 ptrdiff_t stride)
1777;-----------------------------------------------------------------------------
1778
INIT_MMX mmxext
; VP8 TrueMotion 4x4: pred[y][x] = clamp(top[x] + left[y] - topleft),
; computed as top-widened-to-words plus the per-row (left - topleft)
; delta, then packed back with unsigned saturation.
; r0 = src, r1 = topright (unused), r2 = stride.
cglobal pred4x4_tm_vp8_8, 3,6
    sub        r0, r2              ; r0 -> row above the block
    pxor      mm7, mm7
    movd      mm0, [r0]
    punpcklbw mm0, mm7             ; top row as 4 words
    movzx     r4d, byte [r0-1]     ; r4d = topleft
    mov       r5d, 2               ; two iterations x two rows
.loop:
    movzx     r1d, byte [r0+r2*1-1]
    movzx     r3d, byte [r0+r2*2-1]
    sub       r1d, r4d             ; left - topleft for each of the two rows
    sub       r3d, r4d
    movd      mm2, r1d
    movd      mm4, r3d
    pshufw    mm2, mm2, 0          ; broadcast delta to all 4 words
    pshufw    mm4, mm4, 0
    paddw     mm2, mm0
    paddw     mm4, mm0
    packuswb  mm2, mm2             ; saturate back to bytes
    packuswb  mm4, mm4
    movd [r0+r2*1], mm2
    movd [r0+r2*2], mm4
    lea        r0, [r0+r2*2]
    dec       r5d
    jg .loop
    REP_RET
1806
INIT_XMM ssse3
; SSSE3 VP8 TrueMotion 4x4: same result as the mmxext version, but the
; per-row left pixel (byte 3 of a [row-4] load) is broadcast to words
; via pshufb with tm_shuf (0x03,0x80 pairs: select byte 3, zero high byte).
; All four rows are computed branch-free.
; r0 = src, r1 = topright (unused as input; reused as row pointer), r2 = stride.
cglobal pred4x4_tm_vp8_8, 3,3
    sub         r0, r2             ; r0 -> row above the block
    movq       mm6, [tm_shuf]
    pxor       mm1, mm1
    movd       mm0, [r0]
    punpcklbw  mm0, mm1            ; top row as words
    movd       mm7, [r0-4]         ; byte 3 = topleft
    pshufb     mm7, mm6            ; broadcast topleft to all words
    lea         r1, [r0+r2*2]
    movd       mm2, [r0+r2*1-4]    ; byte 3 = left pixel of each row
    movd       mm3, [r0+r2*2-4]
    movd       mm4, [r1+r2*1-4]
    movd       mm5, [r1+r2*2-4]
    pshufb     mm2, mm6
    pshufb     mm3, mm6
    pshufb     mm4, mm6
    pshufb     mm5, mm6
    psubw      mm0, mm7            ; top - topleft (shared term)
    paddw      mm2, mm0            ; + left[y]
    paddw      mm3, mm0
    paddw      mm4, mm0
    paddw      mm5, mm0
    packuswb   mm2, mm2            ; saturate to bytes
    packuswb   mm3, mm3
    packuswb   mm4, mm4
    packuswb   mm5, mm5
    movd [r0+r2*1], mm2
    movd [r0+r2*2], mm3
    movd [r1+r2*1], mm4
    movd [r1+r2*2], mm5
    RET
1839
1840;-----------------------------------------------------------------------------
1841; void ff_pred4x4_vertical_vp8_8_mmxext(uint8_t *src, const uint8_t *topright,
1842;                                       ptrdiff_t stride)
1843;-----------------------------------------------------------------------------
1844
INIT_MMX mmxext
; VP8 vertical 4x4: lowpass-filter (topleft, t0..t7) and replicate the
; filtered 4 pixels to all 4 rows.
; r0 = src, r1 = topright pointer (consumed by punpckldq, then reused
; as a row pointer), r2 = stride.
cglobal pred4x4_vertical_vp8_8, 3,3
    sub       r0, r2
    movd      m1, [r0-1]           ; topleft in low byte
    movd      m0, [r0]
    mova      m2, m0   ;t0 t1 t2 t3
    punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7
    lea       r1, [r0+r2*2]
    psrlq     m0, 8    ;t1 t2 t3 t4
    PRED4x4_LOWPASS m3, m1, m0, m2, m4  ; m3 = lowpass(tl|prev, cur, next)
    movd [r0+r2*1], m3
    movd [r0+r2*2], m3
    movd [r1+r2*1], m3
    movd [r1+r2*2], m3
    RET
1860
1861;-----------------------------------------------------------------------------
1862; void ff_pred4x4_down_left_8_mmxext(uint8_t *src, const uint8_t *topright,
1863;                                    ptrdiff_t stride)
1864;-----------------------------------------------------------------------------
INIT_MMX mmxext
; Down-left 4x4: lowpass the 8 top+topright samples against their byte
; shifts (with last-byte edge replication via the xor trick), then emit
; one shifted 4-byte window per row.
; r0 = src, r1 = topright pointer (then reused as row pointer), r2 = stride.
cglobal pred4x4_down_left_8, 3,3
    sub       r0, r2
    movq      m1, [r0]             ; t0..t3
    punpckldq m1, [r1]             ; t0..t3 t4..t7
    movq      m2, m1
    movq      m3, m1
    psllq     m1, 8                ; shifted-right neighbor
    ; build shifted-left neighbor with the top byte replicated
    ; (xor/shift/xor keeps the last sample instead of shifting in zero)
    pxor      m2, m1
    psrlq     m2, 8
    pxor      m2, m3
    PRED4x4_LOWPASS m0, m1, m2, m3, m4
    lea       r1, [r0+r2*2]
    ; row r = bytes (r+1)..(r+4) of the filtered vector
    psrlq     m0, 8
    movd      [r0+r2*1], m0
    psrlq     m0, 8
    movd      [r0+r2*2], m0
    psrlq     m0, 8
    movd      [r1+r2*1], m0
    psrlq     m0, 8
    movd      [r1+r2*2], m0
    RET
1887
1888;------------------------------------------------------------------------------
1889; void ff_pred4x4_vertical_left_8_mmxext(uint8_t *src, const uint8_t *topright,
1890;                                        ptrdiff_t stride)
1891;------------------------------------------------------------------------------
1892
INIT_MMX mmxext
; Vertical-left 4x4: even rows = pavgb of the top vector with its 1-byte
; shift, odd rows = 3-tap lowpass; second pair of rows advances one byte.
; r0 = src, r1 = topright pointer (then reused as row pointer), r2 = stride.
cglobal pred4x4_vertical_left_8, 3,3
    sub       r0, r2
    movq      m1, [r0]             ; t0..t3
    punpckldq m1, [r1]             ; t0..t7
    movq      m3, m1
    movq      m2, m1
    psrlq     m3, 8                ; t1..t7
    psrlq     m2, 16               ; t2..t7
    movq      m4, m3
    pavgb     m4, m1               ; 2-tap rows
    PRED4x4_LOWPASS m0, m1, m2, m3, m5  ; 3-tap rows
    lea       r1, [r0+r2*2]
    movh      [r0+r2*1], m4
    movh      [r0+r2*2], m0
    psrlq     m4, 8                ; advance window for rows 2/3
    psrlq     m0, 8
    movh      [r1+r2*1], m4
    movh      [r1+r2*2], m0
    RET
1913
1914;------------------------------------------------------------------------------
1915; void ff_pred4x4_horizontal_up_8_mmxext(uint8_t *src, const uint8_t *topright,
1916;                                        ptrdiff_t stride)
1917;------------------------------------------------------------------------------
1918
INIT_MMX mmxext
; Horizontal-up 4x4: gather the 4 left pixels (transposed via punpck),
; pad with the replicated bottom pixel, form pavgb/lowpass interpolants,
; interleave, and store rows by shifting two bytes per row; the last row
; is all bottom-pixel (m1 still holds the replicated l3).
; r0 = src, r1 = topright (unused as input; reused as row pointer), r2 = stride.
cglobal pred4x4_horizontal_up_8, 3,3
    sub       r0, r2
    lea       r1, [r0+r2*2]
    movd      m0, [r0+r2*1-4]
    punpcklbw m0, [r0+r2*2-4]
    movd      m1, [r1+r2*1-4]
    punpcklbw m1, [r1+r2*2-4]
    punpckhwd m0, m1               ; left column l0..l3 in high bytes
    movq      m1, m0
    punpckhbw m1, m1
    pshufw    m1, m1, 0xFF         ; m1 = l3 replicated (edge padding)
    punpckhdq m0, m1               ; l0 l1 l2 l3 l3 l3 l3 l3
    movq      m2, m0
    movq      m3, m0
    movq      m7, m0
    psrlq     m2, 16
    psrlq     m3, 8
    pavgb     m7, m3               ; 2-tap interpolants
    PRED4x4_LOWPASS m4, m0, m2, m3, m5  ; 3-tap interpolants
    punpcklbw m7, m4               ; interleave avg/lowpass pairs
    movd    [r0+r2*1], m7
    psrlq    m7, 16
    movd    [r0+r2*2], m7
    psrlq    m7, 16
    movd    [r1+r2*1], m7
    movd    [r1+r2*2], m1          ; bottom row: replicated l3
    RET
1947
1948;------------------------------------------------------------------------------
1949; void ff_pred4x4_horizontal_down_8_mmxext(uint8_t *src,
1950;                                          const uint8_t *topright,
1951;                                          ptrdiff_t stride)
1952;------------------------------------------------------------------------------
1953
INIT_MMX mmxext
; Horizontal-down 4x4: pack (left column, topleft, top row) into one
; register, form pavgb/lowpass interpolants, interleave them, and emit
; rows bottom-up; the top row needs extra lowpass bytes spliced in via
; PALIGNR.
; r0 = src, r1 = topright (unused as input; reused as row pointer), r2 = stride.
cglobal pred4x4_horizontal_down_8, 3,3
    sub       r0, r2
    lea       r1, [r0+r2*2]
    movh      m0, [r0-4]      ; lt ..
    punpckldq m0, [r0]        ; t3 t2 t1 t0 lt .. .. ..
    psllq     m0, 8           ; t2 t1 t0 lt .. .. .. ..
    movd      m1, [r1+r2*2-4] ; l3
    punpcklbw m1, [r1+r2*1-4] ; l2 l3
    movd      m2, [r0+r2*2-4] ; l1
    punpcklbw m2, [r0+r2*1-4] ; l0 l1
    punpckhwd m1, m2          ; l0 l1 l2 l3
    punpckhdq m1, m0          ; t2 t1 t0 lt l0 l1 l2 l3
    movq      m0, m1
    movq      m2, m1
    movq      m5, m1
    psrlq     m0, 16          ; .. .. t2 t1 t0 lt l0 l1
    psrlq     m2, 8           ; .. t2 t1 t0 lt l0 l1 l2
    pavgb     m5, m2          ; 2-tap interpolants
    PRED4x4_LOWPASS m3, m1, m0, m2, m4  ; 3-tap interpolants
    punpcklbw m5, m3          ; interleave avg/lowpass pairs
    psrlq     m3, 32
    PALIGNR   m3, m5, 6, m4   ; top row: pairs plus pure-lowpass bytes
    movh      [r1+r2*2], m5   ; rows stored bottom-up, 2-byte shift per row
    psrlq     m5, 16
    movh      [r1+r2*1], m5
    psrlq     m5, 16
    movh      [r0+r2*2], m5
    movh      [r0+r2*1], m3
    RET
1984
1985;-----------------------------------------------------------------------------
1986; void ff_pred4x4_vertical_right_8_mmxext(uint8_t *src,
1987;                                         const uint8_t *topright,
1988;                                         ptrdiff_t stride)
1989;-----------------------------------------------------------------------------
1990
INIT_MMX mmxext
; Vertical-right 4x4: rows 0/1 come from pavgb/lowpass of the
; (top, topleft, left) sequence assembled one byte at a time with PALIGNR;
; rows 2/3 are rows 0/1 with a lowpass left-column byte rotated in.
; r0 = src, r1 = topright (unused as input; reused as row pointer), r2 = stride.
cglobal pred4x4_vertical_right_8, 3,3
    sub     r0, r2
    lea     r1, [r0+r2*2]
    movh    m0, [r0]                    ; ........t3t2t1t0
    movq    m5, m0
    PALIGNR m0, [r0-8], 7, m1           ; ......t3t2t1t0lt
    pavgb   m5, m0                      ; row 0: avg(top, top>>1byte)
    PALIGNR m0, [r0+r2*1-8], 7, m1      ; ....t3t2t1t0ltl0
    movq    m1, m0
    PALIGNR m0, [r0+r2*2-8], 7, m2      ; ..t3t2t1t0ltl0l1
    movq    m2, m0
    PALIGNR m0, [r1+r2*1-8], 7, m3      ; t3t2t1t0ltl0l1l2
    PRED4x4_LOWPASS m3, m1, m0, m2, m4  ; 3-tap over the full edge sequence
    movq    m1, m3
    psrlq   m3, 16                      ; row 1
    psllq   m1, 48                      ; lowpass left bytes for rotation
    movh    [r0+r2*1], m5
    movh    [r0+r2*2], m3
    PALIGNR m5, m1, 7, m2               ; row 2 = row 0 rotated w/ left byte
    psllq   m1, 8
    movh    [r1+r2*1], m5
    PALIGNR m3, m1, 7, m1               ; row 3 = row 1 rotated w/ left byte
    movh    [r1+r2*2], m3
    RET
2016
2017;-----------------------------------------------------------------------------
2018; void ff_pred4x4_down_right_8_mmxext(uint8_t *src, const uint8_t *topright,
2019;                                     ptrdiff_t stride)
2020;-----------------------------------------------------------------------------
2021
INIT_MMX mmxext
; Down-right 4x4: pack left column + topleft + top row into one register
; via punpck/PALIGNR, lowpass it against its 1-byte shifts, then store
; 4-byte windows bottom-up (one byte shift per row).
; r0 = src, r1 = topright (unused as input; reused as row pointer), r2 = stride.
cglobal pred4x4_down_right_8, 3,3
    sub       r0, r2
    lea       r1, [r0+r2*2]
    movq      m1, [r1-8]               ; row containing l1 in its high byte
    movq      m2, [r0+r2*1-8]
    punpckhbw m2, [r0-8]               ; lt l0 pair in high bytes
    movh      m3, [r0]                 ; top row t0..t3
    punpckhwd m1, m2                   ; lt l0 l1 .. in high bytes
    PALIGNR   m3, m1, 5, m1            ; t3 t2 t1 t0 lt l0 l1 ..
    movq      m1, m3
    PALIGNR   m3, [r1+r2*1-8], 7, m4   ; append l2
    movq      m2, m3
    PALIGNR   m3, [r1+r2*2-8], 7, m4   ; append l3
    PRED4x4_LOWPASS m0, m3, m1, m2, m4
    ; row r = bytes r..r+3 of the filtered diagonal, bottom row first
    movh      [r1+r2*2], m0
    psrlq     m0, 8
    movh      [r1+r2*1], m0
    psrlq     m0, 8
    movh      [r0+r2*2], m0
    psrlq     m0, 8
    movh      [r0+r2*1], m0
    RET
2045