1;*****************************************************************************
2;* MMX/SSE2/AVX-optimized 10-bit H.264 intra prediction code
3;*****************************************************************************
4;* Copyright (C) 2005-2011 x264 project
5;*
6;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
7;*
8;* This file is part of FFmpeg.
9;*
10;* FFmpeg is free software; you can redistribute it and/or
11;* modify it under the terms of the GNU Lesser General Public
12;* License as published by the Free Software Foundation; either
13;* version 2.1 of the License, or (at your option) any later version.
14;*
15;* FFmpeg is distributed in the hope that it will be useful,
16;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18;* Lesser General Public License for more details.
19;*
20;* You should have received a copy of the GNU Lesser General Public
21;* License along with FFmpeg; if not, write to the Free Software
22;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23;******************************************************************************
24
25%include "libavutil/x86/x86util.asm"
26
27SECTION_RODATA
28
29cextern pw_1023
30%define pw_pixel_max pw_1023
31cextern pw_512
32cextern pw_16
33cextern pw_8
34cextern pw_4
35cextern pw_2
36cextern pw_1
37cextern pd_16
38
39pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4
40pw_m3:        times 8 dw -3
41pd_17:        times 4 dd 17
42
43SECTION .text
44
45; dest, left, right, src
46; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
47%macro PRED4x4_LOWPASS 4
48    paddw       %2, %3
49    psrlw       %2, 1
50    pavgw       %1, %4, %2
51%endmacro
52
53;-----------------------------------------------------------------------------
54; void ff_pred4x4_down_right_10(pixel *src, const pixel *topright,
55;                               ptrdiff_t stride)
56;-----------------------------------------------------------------------------
57%macro PRED4x4_DR 0
58cglobal pred4x4_down_right_10, 3, 3
59    sub       r0, r2
60    lea       r1, [r0+r2*2]
61    movhps    m1, [r1-8]
62    movhps    m2, [r0+r2*1-8]
63    movhps    m4, [r0-8]
64    punpckhwd m2, m4
65    movq      m3, [r0]
66    punpckhdq m1, m2
67    PALIGNR   m3, m1, 10, m1
68    movhps    m4, [r1+r2*1-8]
69    PALIGNR   m0, m3, m4, 14, m4
70    movhps    m4, [r1+r2*2-8]
71    PALIGNR   m2, m0, m4, 14, m4
72    PRED4x4_LOWPASS m0, m2, m3, m0
73    movq      [r1+r2*2], m0
74    psrldq    m0, 2
75    movq      [r1+r2*1], m0
76    psrldq    m0, 2
77    movq      [r0+r2*2], m0
78    psrldq    m0, 2
79    movq      [r0+r2*1], m0
80    RET
81%endmacro
82
83INIT_XMM sse2
84PRED4x4_DR
85INIT_XMM ssse3
86PRED4x4_DR
87%if HAVE_AVX_EXTERNAL
88INIT_XMM avx
89PRED4x4_DR
90%endif
91
92;------------------------------------------------------------------------------
93; void ff_pred4x4_vertical_right_10(pixel *src, const pixel *topright,
94;                                   ptrdiff_t stride)
95;------------------------------------------------------------------------------
96%macro PRED4x4_VR 0
97cglobal pred4x4_vertical_right_10, 3, 3, 6
98    sub     r0, r2
99    lea     r1, [r0+r2*2]
100    movq    m5, [r0]            ; ........t3t2t1t0
101    movhps  m1, [r0-8]
102    PALIGNR m0, m5, m1, 14, m1  ; ......t3t2t1t0lt
103    pavgw   m5, m0
104    movhps  m1, [r0+r2*1-8]
105    PALIGNR m0, m1, 14, m1      ; ....t3t2t1t0ltl0
106    movhps  m2, [r0+r2*2-8]
107    PALIGNR m1, m0, m2, 14, m2  ; ..t3t2t1t0ltl0l1
108    movhps  m3, [r1+r2*1-8]
109    PALIGNR m2, m1, m3, 14, m3  ; t3t2t1t0ltl0l1l2
110    PRED4x4_LOWPASS m1, m0, m2, m1
111    pslldq  m0, m1, 12
112    psrldq  m1, 4
113    movq    [r0+r2*1], m5
114    movq    [r0+r2*2], m1
115    PALIGNR m5, m0, 14, m2
116    pslldq  m0, 2
117    movq    [r1+r2*1], m5
118    PALIGNR m1, m0, 14, m0
119    movq    [r1+r2*2], m1
120    RET
121%endmacro
122
123INIT_XMM sse2
124PRED4x4_VR
125INIT_XMM ssse3
126PRED4x4_VR
127%if HAVE_AVX_EXTERNAL
128INIT_XMM avx
129PRED4x4_VR
130%endif
131
132;-------------------------------------------------------------------------------
133; void ff_pred4x4_horizontal_down_10(pixel *src, const pixel *topright,
134;                                    ptrdiff_t stride)
135;-------------------------------------------------------------------------------
136%macro PRED4x4_HD 0
137cglobal pred4x4_horizontal_down_10, 3, 3
138    sub        r0, r2
139    lea        r1, [r0+r2*2]
140    movq       m0, [r0-8]      ; lt ..
141    movhps     m0, [r0]
142    pslldq     m0, 2           ; t2 t1 t0 lt .. .. .. ..
143    movq       m1, [r1+r2*2-8] ; l3
144    movq       m3, [r1+r2*1-8]
145    punpcklwd  m1, m3          ; l2 l3
146    movq       m2, [r0+r2*2-8] ; l1
147    movq       m3, [r0+r2*1-8]
148    punpcklwd  m2, m3          ; l0 l1
149    punpckhdq  m1, m2          ; l0 l1 l2 l3
150    punpckhqdq m1, m0          ; t2 t1 t0 lt l0 l1 l2 l3
151    psrldq     m0, m1, 4       ; .. .. t2 t1 t0 lt l0 l1
152    psrldq     m3, m1, 2       ; .. t2 t1 t0 lt l0 l1 l2
153    pavgw      m5, m1, m3
154    PRED4x4_LOWPASS m3, m1, m0, m3
155    punpcklwd  m5, m3
156    psrldq     m3, 8
157    PALIGNR    m3, m5, 12, m4
158    movq       [r1+r2*2], m5
159    movhps     [r0+r2*2], m5
160    psrldq     m5, 4
161    movq       [r1+r2*1], m5
162    movq       [r0+r2*1], m3
163    RET
164%endmacro
165
166INIT_XMM sse2
167PRED4x4_HD
168INIT_XMM ssse3
169PRED4x4_HD
170%if HAVE_AVX_EXTERNAL
171INIT_XMM avx
172PRED4x4_HD
173%endif
174
175;-----------------------------------------------------------------------------
176; void ff_pred4x4_dc_10(pixel *src, const pixel *topright, ptrdiff_t stride)
177;-----------------------------------------------------------------------------
178
179INIT_MMX mmxext
180cglobal pred4x4_dc_10, 3, 3
181    sub    r0, r2
182    lea    r1, [r0+r2*2]
183    movq   m2, [r0+r2*1-8]
184    paddw  m2, [r0+r2*2-8]
185    paddw  m2, [r1+r2*1-8]
186    paddw  m2, [r1+r2*2-8]
187    psrlq  m2, 48
188    movq   m0, [r0]
189    HADDW  m0, m1
190    paddw  m0, [pw_4]
191    paddw  m0, m2
192    psrlw  m0, 3
193    SPLATW m0, m0, 0
194    movq   [r0+r2*1], m0
195    movq   [r0+r2*2], m0
196    movq   [r1+r2*1], m0
197    movq   [r1+r2*2], m0
198    RET
199
200;-----------------------------------------------------------------------------
201; void ff_pred4x4_down_left_10(pixel *src, const pixel *topright,
202;                              ptrdiff_t stride)
203;-----------------------------------------------------------------------------
204%macro PRED4x4_DL 0
205cglobal pred4x4_down_left_10, 3, 3
206    sub        r0, r2
207    movq       m0, [r0]
208    movhps     m0, [r1]
209    psrldq     m2, m0, 2
210    pslldq     m3, m0, 2
211    pshufhw    m2, m2, 10100100b
212    PRED4x4_LOWPASS m0, m3, m2, m0
213    lea        r1, [r0+r2*2]
214    movhps     [r1+r2*2], m0
215    psrldq     m0, 2
216    movq       [r0+r2*1], m0
217    psrldq     m0, 2
218    movq       [r0+r2*2], m0
219    psrldq     m0, 2
220    movq       [r1+r2*1], m0
221    RET
222%endmacro
223
224INIT_XMM sse2
225PRED4x4_DL
226%if HAVE_AVX_EXTERNAL
227INIT_XMM avx
228PRED4x4_DL
229%endif
230
231;-----------------------------------------------------------------------------
232; void ff_pred4x4_vertical_left_10(pixel *src, const pixel *topright,
233;                                  ptrdiff_t stride)
234;-----------------------------------------------------------------------------
235%macro PRED4x4_VL 0
236cglobal pred4x4_vertical_left_10, 3, 3
237    sub        r0, r2
238    movu       m1, [r0]
239    movhps     m1, [r1]
240    psrldq     m0, m1, 2
241    psrldq     m2, m1, 4
242    pavgw      m4, m0, m1
243    PRED4x4_LOWPASS m0, m1, m2, m0
244    lea        r1, [r0+r2*2]
245    movq       [r0+r2*1], m4
246    movq       [r0+r2*2], m0
247    psrldq     m4, 2
248    psrldq     m0, 2
249    movq       [r1+r2*1], m4
250    movq       [r1+r2*2], m0
251    RET
252%endmacro
253
254INIT_XMM sse2
255PRED4x4_VL
256%if HAVE_AVX_EXTERNAL
257INIT_XMM avx
258PRED4x4_VL
259%endif
260
261;-----------------------------------------------------------------------------
262; void ff_pred4x4_horizontal_up_10(pixel *src, const pixel *topright,
263;                                  ptrdiff_t stride)
264;-----------------------------------------------------------------------------
265INIT_MMX mmxext
266cglobal pred4x4_horizontal_up_10, 3, 3
267    sub       r0, r2
268    lea       r1, [r0+r2*2]
269    movq      m0, [r0+r2*1-8]
270    punpckhwd m0, [r0+r2*2-8]
271    movq      m1, [r1+r2*1-8]
272    punpckhwd m1, [r1+r2*2-8]
273    punpckhdq m0, m1
274    pshufw    m1, m1, 0xFF
275    movq      [r1+r2*2], m1
276    movd      [r1+r2*1+4], m1
277    pshufw    m2, m0, 11111001b
278    movq      m1, m2
279    pavgw     m2, m0
280
281    pshufw    m5, m0, 11111110b
282    PRED4x4_LOWPASS m1, m0, m5, m1
283    movq      m6, m2
284    punpcklwd m6, m1
285    movq      [r0+r2*1], m6
286    psrlq     m2, 16
287    psrlq     m1, 16
288    punpcklwd m2, m1
289    movq      [r0+r2*2], m2
290    psrlq     m2, 32
291    movd      [r1+r2*1], m2
292    RET
293
294
295
296;-----------------------------------------------------------------------------
297; void ff_pred8x8_vertical_10(pixel *src, ptrdiff_t stride)
298;-----------------------------------------------------------------------------
299INIT_XMM sse2
300cglobal pred8x8_vertical_10, 2, 2
301    sub  r0, r1
302    mova m0, [r0]
303%rep 3
304    mova [r0+r1*1], m0
305    mova [r0+r1*2], m0
306    lea  r0, [r0+r1*2]
307%endrep
308    mova [r0+r1*1], m0
309    mova [r0+r1*2], m0
310    RET
311
312;-----------------------------------------------------------------------------
313; void ff_pred8x8_horizontal_10(pixel *src, ptrdiff_t stride)
314;-----------------------------------------------------------------------------
315INIT_XMM sse2
316cglobal pred8x8_horizontal_10, 2, 3
317    mov         r2d, 4
318.loop:
319    movq         m0, [r0+r1*0-8]
320    movq         m1, [r0+r1*1-8]
321    pshuflw      m0, m0, 0xff
322    pshuflw      m1, m1, 0xff
323    punpcklqdq   m0, m0
324    punpcklqdq   m1, m1
325    mova  [r0+r1*0], m0
326    mova  [r0+r1*1], m1
327    lea          r0, [r0+r1*2]
328    dec          r2d
329    jg .loop
330    REP_RET
331
332;-----------------------------------------------------------------------------
333; void ff_predict_8x8_dc_10(pixel *src, ptrdiff_t stride)
334;-----------------------------------------------------------------------------
335%macro MOV8 2-3
336; sort of a hack, but it works
337    movdqa    [%1], %2
338%endmacro
339
340%macro PRED8x8_DC 1
341cglobal pred8x8_dc_10, 2, 6
342    sub         r0, r1
343    pxor        m4, m4
344    movq        m0, [r0+0]
345    movq        m1, [r0+8]
346    punpcklwd   m0, m1
347    movhlps     m1, m0
348    paddw       m0, m1
349    %1          m2, m0, 00001110b
350    paddw       m0, m2
351
352    lea         r5, [r1*3]
353    lea         r4, [r0+r1*4]
354    movzx      r2d, word [r0+r1*1-2]
355    movzx      r3d, word [r0+r1*2-2]
356    add        r2d, r3d
357    movzx      r3d, word [r0+r5*1-2]
358    add        r2d, r3d
359    movzx      r3d, word [r4-2]
360    add        r2d, r3d
361    movd        m2, r2d            ; s2
362
363    movzx      r2d, word [r4+r1*1-2]
364    movzx      r3d, word [r4+r1*2-2]
365    add        r2d, r3d
366    movzx      r3d, word [r4+r5*1-2]
367    add        r2d, r3d
368    movzx      r3d, word [r4+r1*4-2]
369    add        r2d, r3d
370    movd        m3, r2d            ; s3
371
372    punpcklwd   m2, m3
373    punpckldq   m0, m2            ; s0, s1, s2, s3
374    %1          m3, m0, 11110110b ; s2, s1, s3, s3
375    %1          m0, m0, 01110100b ; s0, s1, s3, s1
376    paddw       m0, m3
377    psrlw       m0, 2
378    pavgw       m0, m4            ; s0+s2, s1, s3, s1+s3
379    punpcklwd   m0, m0
380    pshufd      m3, m0, 11111010b
381    punpckldq   m0, m0
382    SWAP         0,1
383    MOV8   r0+r1*1, m1, m2
384    MOV8   r0+r1*2, m1, m2
385    MOV8   r0+r5*1, m1, m2
386    MOV8   r0+r1*4, m1, m2
387    MOV8   r4+r1*1, m3, m4
388    MOV8   r4+r1*2, m3, m4
389    MOV8   r4+r5*1, m3, m4
390    MOV8   r4+r1*4, m3, m4
391    RET
392%endmacro
393
394INIT_XMM sse2
395PRED8x8_DC pshuflw
396
397;-----------------------------------------------------------------------------
398; void ff_pred8x8_top_dc_10(pixel *src, ptrdiff_t stride)
399;-----------------------------------------------------------------------------
400INIT_XMM sse2
401cglobal pred8x8_top_dc_10, 2, 4
402    sub         r0, r1
403    mova        m0, [r0]
404    pshuflw     m1, m0, 0x4e
405    pshufhw     m1, m1, 0x4e
406    paddw       m0, m1
407    pshuflw     m1, m0, 0xb1
408    pshufhw     m1, m1, 0xb1
409    paddw       m0, m1
410    lea         r2, [r1*3]
411    lea         r3, [r0+r1*4]
412    paddw       m0, [pw_2]
413    psrlw       m0, 2
414    mova [r0+r1*1], m0
415    mova [r0+r1*2], m0
416    mova [r0+r2*1], m0
417    mova [r0+r1*4], m0
418    mova [r3+r1*1], m0
419    mova [r3+r1*2], m0
420    mova [r3+r2*1], m0
421    mova [r3+r1*4], m0
422    RET
423
424;-----------------------------------------------------------------------------
425; void ff_pred8x8_plane_10(pixel *src, ptrdiff_t stride)
426;-----------------------------------------------------------------------------
427INIT_XMM sse2
428cglobal pred8x8_plane_10, 2, 7, 7
429    sub       r0, r1
430    lea       r2, [r1*3]
431    lea       r3, [r0+r1*4]
432    mova      m2, [r0]
433    pmaddwd   m2, [pw_m32101234]
434    HADDD     m2, m1
435    movd      m0, [r0-4]
436    psrld     m0, 14
437    psubw     m2, m0               ; H
438    movd      m0, [r3+r1*4-4]
439    movd      m1, [r0+12]
440    paddw     m0, m1
441    psllw     m0, 4                ; 16*(src[7*stride-1] + src[-stride+7])
442    movzx    r4d, word [r3+r1*1-2] ; src[4*stride-1]
443    movzx    r5d, word [r0+r2*1-2] ; src[2*stride-1]
444    sub      r4d, r5d
445    movzx    r6d, word [r3+r1*2-2] ; src[5*stride-1]
446    movzx    r5d, word [r0+r1*2-2] ; src[1*stride-1]
447    sub      r6d, r5d
448    lea      r4d, [r4+r6*2]
449    movzx    r5d, word [r3+r2*1-2] ; src[6*stride-1]
450    movzx    r6d, word [r0+r1*1-2] ; src[0*stride-1]
451    sub      r5d, r6d
452    lea      r5d, [r5*3]
453    add      r4d, r5d
454    movzx    r6d, word [r3+r1*4-2] ; src[7*stride-1]
455    movzx    r5d, word [r0+r1*0-2] ; src[ -stride-1]
456    sub      r6d, r5d
457    lea      r4d, [r4+r6*4]
458    movd      m3, r4d              ; V
459    punpckldq m2, m3
460    pmaddwd   m2, [pd_17]
461    paddd     m2, [pd_16]
462    psrad     m2, 5                ; b, c
463
464    mova      m3, [pw_pixel_max]
465    pxor      m1, m1
466    SPLATW    m0, m0, 1
467    SPLATW    m4, m2, 2
468    SPLATW    m2, m2, 0
469    pmullw    m2, [pw_m32101234]   ; b
470    pmullw    m5, m4, [pw_m3]      ; c
471    paddw     m5, [pw_16]
472    mov      r2d, 8
473    add       r0, r1
474.loop:
475    paddsw    m6, m2, m5
476    paddsw    m6, m0
477    psraw     m6, 5
478    CLIPW     m6, m1, m3
479    mova    [r0], m6
480    paddw     m5, m4
481    add       r0, r1
482    dec r2d
483    jg .loop
484    REP_RET
485
486
487;-----------------------------------------------------------------------------
488; void ff_pred8x8l_128_dc_10(pixel *src, int has_topleft, int has_topright,
489;                            ptrdiff_t stride)
490;-----------------------------------------------------------------------------
491INIT_XMM sse2
492cglobal pred8x8l_128_dc_10, 4, 4
493    mova      m0, [pw_512] ; (1<<(BIT_DEPTH-1))
494    lea       r1, [r3*3]
495    lea       r2, [r0+r3*4]
496    MOV8 r0+r3*0, m0, m0
497    MOV8 r0+r3*1, m0, m0
498    MOV8 r0+r3*2, m0, m0
499    MOV8 r0+r1*1, m0, m0
500    MOV8 r2+r3*0, m0, m0
501    MOV8 r2+r3*1, m0, m0
502    MOV8 r2+r3*2, m0, m0
503    MOV8 r2+r1*1, m0, m0
504    RET
505
506;-----------------------------------------------------------------------------
507; void ff_pred8x8l_top_dc_10(pixel *src, int has_topleft, int has_topright,
508;                            ptrdiff_t stride)
509;-----------------------------------------------------------------------------
510%macro PRED8x8L_TOP_DC 0
511cglobal pred8x8l_top_dc_10, 4, 4, 6
512    sub         r0, r3
513    mova        m0, [r0]
514    shr        r1d, 14
515    shr        r2d, 13
516    neg         r1
517    pslldq      m1, m0, 2
518    psrldq      m2, m0, 2
519    pinsrw      m1, [r0+r1], 0
520    pinsrw      m2, [r0+r2+14], 7
521    lea         r1, [r3*3]
522    lea         r2, [r0+r3*4]
523    PRED4x4_LOWPASS m0, m2, m1, m0
524    HADDW       m0, m1
525    paddw       m0, [pw_4]
526    psrlw       m0, 3
527    SPLATW      m0, m0, 0
528    mova [r0+r3*1], m0
529    mova [r0+r3*2], m0
530    mova [r0+r1*1], m0
531    mova [r0+r3*4], m0
532    mova [r2+r3*1], m0
533    mova [r2+r3*2], m0
534    mova [r2+r1*1], m0
535    mova [r2+r3*4], m0
536    RET
537%endmacro
538
539INIT_XMM sse2
540PRED8x8L_TOP_DC
541%if HAVE_AVX_EXTERNAL
542INIT_XMM avx
543PRED8x8L_TOP_DC
544%endif
545
546;-------------------------------------------------------------------------------
547; void ff_pred8x8l_dc_10(pixel *src, int has_topleft, int has_topright,
548;                        ptrdiff_t stride)
549;-------------------------------------------------------------------------------
550;TODO: see if scalar is faster
551%macro PRED8x8L_DC 0
552cglobal pred8x8l_dc_10, 4, 6, 6
553    sub         r0, r3
554    lea         r4, [r0+r3*4]
555    lea         r5, [r3*3]
556    mova        m0, [r0+r3*2-16]
557    punpckhwd   m0, [r0+r3*1-16]
558    mova        m1, [r4+r3*0-16]
559    punpckhwd   m1, [r0+r5*1-16]
560    punpckhdq   m1, m0
561    mova        m2, [r4+r3*2-16]
562    punpckhwd   m2, [r4+r3*1-16]
563    mova        m3, [r4+r3*4-16]
564    punpckhwd   m3, [r4+r5*1-16]
565    punpckhdq   m3, m2
566    punpckhqdq  m3, m1
567    mova        m0, [r0]
568    shr        r1d, 14
569    shr        r2d, 13
570    neg         r1
571    pslldq      m1, m0, 2
572    psrldq      m2, m0, 2
573    pinsrw      m1, [r0+r1], 0
574    pinsrw      m2, [r0+r2+14], 7
575    not         r1
576    and         r1, r3
577    pslldq      m4, m3, 2
578    psrldq      m5, m3, 2
579    pshuflw     m4, m4, 11100101b
580    pinsrw      m5, [r0+r1-2], 7
581    PRED4x4_LOWPASS m3, m4, m5, m3
582    PRED4x4_LOWPASS m0, m2, m1, m0
583    paddw       m0, m3
584    HADDW       m0, m1
585    paddw       m0, [pw_8]
586    psrlw       m0, 4
587    SPLATW      m0, m0
588    mova [r0+r3*1], m0
589    mova [r0+r3*2], m0
590    mova [r0+r5*1], m0
591    mova [r0+r3*4], m0
592    mova [r4+r3*1], m0
593    mova [r4+r3*2], m0
594    mova [r4+r5*1], m0
595    mova [r4+r3*4], m0
596    RET
597%endmacro
598
599INIT_XMM sse2
600PRED8x8L_DC
601%if HAVE_AVX_EXTERNAL
602INIT_XMM avx
603PRED8x8L_DC
604%endif
605
606;-----------------------------------------------------------------------------
607; void ff_pred8x8l_vertical_10(pixel *src, int has_topleft, int has_topright,
608;                              ptrdiff_t stride)
609;-----------------------------------------------------------------------------
610%macro PRED8x8L_VERTICAL 0
611cglobal pred8x8l_vertical_10, 4, 4, 6
612    sub         r0, r3
613    mova        m0, [r0]
614    shr        r1d, 14
615    shr        r2d, 13
616    neg         r1
617    pslldq      m1, m0, 2
618    psrldq      m2, m0, 2
619    pinsrw      m1, [r0+r1], 0
620    pinsrw      m2, [r0+r2+14], 7
621    lea         r1, [r3*3]
622    lea         r2, [r0+r3*4]
623    PRED4x4_LOWPASS m0, m2, m1, m0
624    mova [r0+r3*1], m0
625    mova [r0+r3*2], m0
626    mova [r0+r1*1], m0
627    mova [r0+r3*4], m0
628    mova [r2+r3*1], m0
629    mova [r2+r3*2], m0
630    mova [r2+r1*1], m0
631    mova [r2+r3*4], m0
632    RET
633%endmacro
634
635INIT_XMM sse2
636PRED8x8L_VERTICAL
637%if HAVE_AVX_EXTERNAL
638INIT_XMM avx
639PRED8x8L_VERTICAL
640%endif
641
642;-----------------------------------------------------------------------------
643; void ff_pred8x8l_horizontal_10(uint8_t *src, int has_topleft,
644;                                int has_topright, ptrdiff_t stride)
645;-----------------------------------------------------------------------------
646%macro PRED8x8L_HORIZONTAL 0
647cglobal pred8x8l_horizontal_10, 4, 4, 5
648    mova        m0, [r0-16]
649    shr        r1d, 14
650    dec         r1
651    and         r1, r3
652    sub         r1, r3
653    punpckhwd   m0, [r0+r1-16]
654    mova        m1, [r0+r3*2-16]
655    punpckhwd   m1, [r0+r3*1-16]
656    lea         r2, [r0+r3*4]
657    lea         r1, [r3*3]
658    punpckhdq   m1, m0
659    mova        m2, [r2+r3*0-16]
660    punpckhwd   m2, [r0+r1-16]
661    mova        m3, [r2+r3*2-16]
662    punpckhwd   m3, [r2+r3*1-16]
663    punpckhdq   m3, m2
664    punpckhqdq  m3, m1
665    PALIGNR     m4, m3, [r2+r1-16], 14, m0
666    pslldq      m0, m4, 2
667    pshuflw     m0, m0, 11100101b
668    PRED4x4_LOWPASS m4, m3, m0, m4
669    punpckhwd   m3, m4, m4
670    punpcklwd   m4, m4
671    pshufd      m0, m3, 0xff
672    pshufd      m1, m3, 0xaa
673    pshufd      m2, m3, 0x55
674    pshufd      m3, m3, 0x00
675    mova [r0+r3*0], m0
676    mova [r0+r3*1], m1
677    mova [r0+r3*2], m2
678    mova [r0+r1*1], m3
679    pshufd      m0, m4, 0xff
680    pshufd      m1, m4, 0xaa
681    pshufd      m2, m4, 0x55
682    pshufd      m3, m4, 0x00
683    mova [r2+r3*0], m0
684    mova [r2+r3*1], m1
685    mova [r2+r3*2], m2
686    mova [r2+r1*1], m3
687    RET
688%endmacro
689
690INIT_XMM sse2
691PRED8x8L_HORIZONTAL
692INIT_XMM ssse3
693PRED8x8L_HORIZONTAL
694%if HAVE_AVX_EXTERNAL
695INIT_XMM avx
696PRED8x8L_HORIZONTAL
697%endif
698
699;-----------------------------------------------------------------------------
700; void ff_pred8x8l_down_left_10(pixel *src, int has_topleft, int has_topright,
701;                               ptrdiff_t stride)
702;-----------------------------------------------------------------------------
703%macro PRED8x8L_DOWN_LEFT 0
704cglobal pred8x8l_down_left_10, 4, 4, 7
705    sub         r0, r3
706    mova        m3, [r0]
707    shr        r1d, 14
708    neg         r1
709    shr        r2d, 13
710    pslldq      m1, m3, 2
711    psrldq      m2, m3, 2
712    pinsrw      m1, [r0+r1], 0
713    pinsrw      m2, [r0+r2+14], 7
714    PRED4x4_LOWPASS m6, m2, m1, m3
715    jz .fix_tr ; flags from shr r2d
716    mova        m1, [r0+16]
717    psrldq      m5, m1, 2
718    PALIGNR     m2, m1, m3, 14, m3
719    pshufhw     m5, m5, 10100100b
720    PRED4x4_LOWPASS m1, m2, m5, m1
721.do_topright:
722    lea         r1, [r3*3]
723    psrldq      m5, m1, 14
724    lea         r2, [r0+r3*4]
725    PALIGNR     m2, m1, m6,  2, m0
726    PALIGNR     m3, m1, m6, 14, m0
727    PALIGNR     m5, m1,  2, m0
728    pslldq      m4, m6, 2
729    PRED4x4_LOWPASS m6, m4, m2, m6
730    PRED4x4_LOWPASS m1, m3, m5, m1
731    mova [r2+r3*4], m1
732    PALIGNR     m1, m6, 14, m2
733    pslldq      m6, 2
734    mova [r2+r1*1], m1
735    PALIGNR     m1, m6, 14, m2
736    pslldq      m6, 2
737    mova [r2+r3*2], m1
738    PALIGNR     m1, m6, 14, m2
739    pslldq      m6, 2
740    mova [r2+r3*1], m1
741    PALIGNR     m1, m6, 14, m2
742    pslldq      m6, 2
743    mova [r0+r3*4], m1
744    PALIGNR     m1, m6, 14, m2
745    pslldq      m6, 2
746    mova [r0+r1*1], m1
747    PALIGNR     m1, m6, 14, m2
748    pslldq      m6, 2
749    mova [r0+r3*2], m1
750    PALIGNR     m1, m6, 14, m6
751    mova [r0+r3*1], m1
752    RET
753.fix_tr:
754    punpckhwd   m3, m3
755    pshufd      m1, m3, 0xFF
756    jmp .do_topright
757%endmacro
758
759INIT_XMM sse2
760PRED8x8L_DOWN_LEFT
761INIT_XMM ssse3
762PRED8x8L_DOWN_LEFT
763%if HAVE_AVX_EXTERNAL
764INIT_XMM avx
765PRED8x8L_DOWN_LEFT
766%endif
767
768;-----------------------------------------------------------------------------
769; void ff_pred8x8l_down_right_10(pixel *src, int has_topleft,
770;                                int has_topright, ptrdiff_t stride)
771;-----------------------------------------------------------------------------
772%macro PRED8x8L_DOWN_RIGHT 0
773; standard forbids this when has_topleft is false
774; no need to check
775cglobal pred8x8l_down_right_10, 4, 5, 8
776    sub         r0, r3
777    lea         r4, [r0+r3*4]
778    lea         r1, [r3*3]
779    mova        m0, [r0+r3*1-16]
780    punpckhwd   m0, [r0+r3*0-16]
781    mova        m1, [r0+r1*1-16]
782    punpckhwd   m1, [r0+r3*2-16]
783    punpckhdq   m1, m0
784    mova        m2, [r4+r3*1-16]
785    punpckhwd   m2, [r4+r3*0-16]
786    mova        m3, [r4+r1*1-16]
787    punpckhwd   m3, [r4+r3*2-16]
788    punpckhdq   m3, m2
789    punpckhqdq  m3, m1
790    mova        m0, [r4+r3*4-16]
791    mova        m1, [r0]
792    PALIGNR     m4, m3, m0, 14, m0
793    PALIGNR     m1, m3,  2, m2
794    pslldq      m0, m4, 2
795    pshuflw     m0, m0, 11100101b
796    PRED4x4_LOWPASS m6, m1, m4, m3
797    PRED4x4_LOWPASS m4, m3, m0, m4
798    mova        m3, [r0]
799    shr        r2d, 13
800    pslldq      m1, m3, 2
801    psrldq      m2, m3, 2
802    pinsrw      m1, [r0-2], 0
803    pinsrw      m2, [r0+r2+14], 7
804    PRED4x4_LOWPASS m3, m2, m1, m3
805    PALIGNR     m2, m3, m6,  2, m0
806    PALIGNR     m5, m3, m6, 14, m0
807    psrldq      m7, m3, 2
808    PRED4x4_LOWPASS m6, m4, m2, m6
809    PRED4x4_LOWPASS m3, m5, m7, m3
810    mova [r4+r3*4], m6
811    PALIGNR     m3, m6, 14, m2
812    pslldq      m6, 2
813    mova [r0+r3*1], m3
814    PALIGNR     m3, m6, 14, m2
815    pslldq      m6, 2
816    mova [r0+r3*2], m3
817    PALIGNR     m3, m6, 14, m2
818    pslldq      m6, 2
819    mova [r0+r1*1], m3
820    PALIGNR     m3, m6, 14, m2
821    pslldq      m6, 2
822    mova [r0+r3*4], m3
823    PALIGNR     m3, m6, 14, m2
824    pslldq      m6, 2
825    mova [r4+r3*1], m3
826    PALIGNR     m3, m6, 14, m2
827    pslldq      m6, 2
828    mova [r4+r3*2], m3
829    PALIGNR     m3, m6, 14, m6
830    mova [r4+r1*1], m3
831    RET
832%endmacro
833
834INIT_XMM sse2
835PRED8x8L_DOWN_RIGHT
836INIT_XMM ssse3
837PRED8x8L_DOWN_RIGHT
838%if HAVE_AVX_EXTERNAL
839INIT_XMM avx
840PRED8x8L_DOWN_RIGHT
841%endif
842
843;-----------------------------------------------------------------------------
844; void ff_pred8x8l_vertical_right_10(pixel *src, int has_topleft,
845;                                    int has_topright, ptrdiff_t stride)
846;-----------------------------------------------------------------------------
847%macro PRED8x8L_VERTICAL_RIGHT 0
848; likewise with 8x8l_down_right
849cglobal pred8x8l_vertical_right_10, 4, 5, 7
850    sub         r0, r3
851    lea         r4, [r0+r3*4]
852    lea         r1, [r3*3]
853    mova        m0, [r0+r3*1-16]
854    punpckhwd   m0, [r0+r3*0-16]
855    mova        m1, [r0+r1*1-16]
856    punpckhwd   m1, [r0+r3*2-16]
857    punpckhdq   m1, m0
858    mova        m2, [r4+r3*1-16]
859    punpckhwd   m2, [r4+r3*0-16]
860    mova        m3, [r4+r1*1-16]
861    punpckhwd   m3, [r4+r3*2-16]
862    punpckhdq   m3, m2
863    punpckhqdq  m3, m1
864    mova        m0, [r4+r3*4-16]
865    mova        m1, [r0]
866    PALIGNR     m4, m3, m0, 14, m0
867    PALIGNR     m1, m3,  2, m2
868    PRED4x4_LOWPASS m3, m1, m4, m3
869    mova        m2, [r0]
870    shr        r2d, 13
871    pslldq      m1, m2, 2
872    psrldq      m5, m2, 2
873    pinsrw      m1, [r0-2], 0
874    pinsrw      m5, [r0+r2+14], 7
875    PRED4x4_LOWPASS m2, m5, m1, m2
876    PALIGNR     m6, m2, m3, 12, m1
877    PALIGNR     m5, m2, m3, 14, m0
878    PRED4x4_LOWPASS m0, m6, m2, m5
879    pavgw       m2, m5
880    mova [r0+r3*2], m0
881    mova [r0+r3*1], m2
882    pslldq      m6, m3, 4
883    pslldq      m1, m3, 2
884    PRED4x4_LOWPASS m1, m3, m6, m1
885    PALIGNR     m2, m1, 14, m4
886    mova [r0+r1*1], m2
887    pslldq      m1, 2
888    PALIGNR     m0, m1, 14, m3
889    mova [r0+r3*4], m0
890    pslldq      m1, 2
891    PALIGNR     m2, m1, 14, m4
892    mova [r4+r3*1], m2
893    pslldq      m1, 2
894    PALIGNR     m0, m1, 14, m3
895    mova [r4+r3*2], m0
896    pslldq      m1, 2
897    PALIGNR     m2, m1, 14, m4
898    mova [r4+r1*1], m2
899    pslldq      m1, 2
900    PALIGNR     m0, m1, 14, m1
901    mova [r4+r3*4], m0
902    RET
903%endmacro
904
905INIT_XMM sse2
906PRED8x8L_VERTICAL_RIGHT
907INIT_XMM ssse3
908PRED8x8L_VERTICAL_RIGHT
909%if HAVE_AVX_EXTERNAL
910INIT_XMM avx
911PRED8x8L_VERTICAL_RIGHT
912%endif
913
914;-----------------------------------------------------------------------------
915; void ff_pred8x8l_horizontal_up_10(pixel *src, int has_topleft,
916;                                   int has_topright, ptrdiff_t stride)
917;-----------------------------------------------------------------------------
918%macro PRED8x8L_HORIZONTAL_UP 0
919cglobal pred8x8l_horizontal_up_10, 4, 4, 6
920    mova        m0, [r0+r3*0-16]
921    punpckhwd   m0, [r0+r3*1-16]
922    shr        r1d, 14
923    dec         r1
924    and         r1, r3
925    sub         r1, r3
926    mova        m4, [r0+r1*1-16]
927    lea         r1, [r3*3]
928    lea         r2, [r0+r3*4]
929    mova        m1, [r0+r3*2-16]
930    punpckhwd   m1, [r0+r1*1-16]
931    punpckhdq   m0, m1
932    mova        m2, [r2+r3*0-16]
933    punpckhwd   m2, [r2+r3*1-16]
934    mova        m3, [r2+r3*2-16]
935    punpckhwd   m3, [r2+r1*1-16]
936    punpckhdq   m2, m3
937    punpckhqdq  m0, m2
938    PALIGNR     m1, m0, m4, 14, m4
939    psrldq      m2, m0, 2
940    pshufhw     m2, m2, 10100100b
941    PRED4x4_LOWPASS m0, m1, m2, m0
942    psrldq      m1, m0, 2
943    psrldq      m2, m0, 4
944    pshufhw     m1, m1, 10100100b
945    pshufhw     m2, m2, 01010100b
946    pavgw       m4, m0, m1
947    PRED4x4_LOWPASS m1, m2, m0, m1
948    punpckhwd   m5, m4, m1
949    punpcklwd   m4, m1
950    mova [r2+r3*0], m5
951    mova [r0+r3*0], m4
952    pshufd      m0, m5, 11111001b
953    pshufd      m1, m5, 11111110b
954    pshufd      m2, m5, 11111111b
955    mova [r2+r3*1], m0
956    mova [r2+r3*2], m1
957    mova [r2+r1*1], m2
958    PALIGNR     m2, m5, m4, 4, m0
959    PALIGNR     m3, m5, m4, 8, m1
960    PALIGNR     m5, m5, m4, 12, m4
961    mova [r0+r3*1], m2
962    mova [r0+r3*2], m3
963    mova [r0+r1*1], m5
964    RET
965%endmacro
966
967INIT_XMM sse2
968PRED8x8L_HORIZONTAL_UP
969INIT_XMM ssse3
970PRED8x8L_HORIZONTAL_UP
971%if HAVE_AVX_EXTERNAL
972INIT_XMM avx
973PRED8x8L_HORIZONTAL_UP
974%endif
975
976
977;-----------------------------------------------------------------------------
978; void ff_pred16x16_vertical_10(pixel *src, ptrdiff_t stride)
979;-----------------------------------------------------------------------------
980%macro MOV16 3-5
981    mova [%1+     0], %2
982    mova [%1+mmsize], %3
983%endmacro
984
985INIT_XMM sse2
986cglobal pred16x16_vertical_10, 2, 3
987    sub   r0, r1
988    mov  r2d, 8
989    mova  m0, [r0+ 0]
990    mova  m1, [r0+mmsize]
991.loop:
992    MOV16 r0+r1*1, m0, m1, m2, m3
993    MOV16 r0+r1*2, m0, m1, m2, m3
994    lea   r0, [r0+r1*2]
995    dec   r2d
996    jg .loop
997    REP_RET
998
999;-----------------------------------------------------------------------------
1000; void ff_pred16x16_horizontal_10(pixel *src, ptrdiff_t stride)
1001;-----------------------------------------------------------------------------
1002INIT_XMM sse2
1003cglobal pred16x16_horizontal_10, 2, 3
1004    mov   r2d, 8
1005.vloop:
1006    movd   m0, [r0+r1*0-4]
1007    movd   m1, [r0+r1*1-4]
1008    SPLATW m0, m0, 1
1009    SPLATW m1, m1, 1
1010    MOV16  r0+r1*0, m0, m0, m0, m0
1011    MOV16  r0+r1*1, m1, m1, m1, m1
1012    lea    r0, [r0+r1*2]
1013    dec    r2d
1014    jg .vloop
1015    REP_RET
1016
1017;-----------------------------------------------------------------------------
1018; void ff_pred16x16_dc_10(pixel *src, ptrdiff_t stride)
1019;-----------------------------------------------------------------------------
1020INIT_XMM sse2
1021cglobal pred16x16_dc_10, 2, 6
1022    mov        r5, r0
1023    sub        r0, r1
1024    mova       m0, [r0+0]
1025    paddw      m0, [r0+mmsize]
1026    HADDW      m0, m2
1027
1028    lea        r0, [r0+r1-2]
1029    movzx     r3d, word [r0]
1030    movzx     r4d, word [r0+r1]
1031%rep 7
1032    lea        r0, [r0+r1*2]
1033    movzx     r2d, word [r0]
1034    add       r3d, r2d
1035    movzx     r2d, word [r0+r1]
1036    add       r4d, r2d
1037%endrep
1038    lea       r3d, [r3+r4+16]
1039
1040    movd       m1, r3d
1041    paddw      m0, m1
1042    psrlw      m0, 5
1043    SPLATW     m0, m0
1044    mov       r3d, 8
1045.loop:
1046    MOV16 r5+r1*0, m0, m0, m0, m0
1047    MOV16 r5+r1*1, m0, m0, m0, m0
1048    lea        r5, [r5+r1*2]
1049    dec       r3d
1050    jg .loop
1051    REP_RET
1052
1053;-----------------------------------------------------------------------------
1054; void ff_pred16x16_top_dc_10(pixel *src, ptrdiff_t stride)
1055;-----------------------------------------------------------------------------
1056INIT_XMM sse2
1057cglobal pred16x16_top_dc_10, 2, 3
1058    sub        r0, r1
1059    mova       m0, [r0+0]
1060    paddw      m0, [r0+mmsize]
1061    HADDW      m0, m2
1062
1063    SPLATW     m0, m0
1064    paddw      m0, [pw_8]
1065    psrlw      m0, 4
1066    mov       r2d, 8
1067.loop:
1068    MOV16 r0+r1*1, m0, m0, m0, m0
1069    MOV16 r0+r1*2, m0, m0, m0, m0
1070    lea        r0, [r0+r1*2]
1071    dec       r2d
1072    jg .loop
1073    REP_RET
1074
1075;-----------------------------------------------------------------------------
1076; void ff_pred16x16_left_dc_10(pixel *src, ptrdiff_t stride)
1077;-----------------------------------------------------------------------------
1078INIT_XMM sse2
1079cglobal pred16x16_left_dc_10, 2, 6
1080    mov        r5, r0
1081
1082    sub        r0, 2
1083    movzx     r3d, word [r0]
1084    movzx     r4d, word [r0+r1]
1085%rep 7
1086    lea        r0, [r0+r1*2]
1087    movzx     r2d, word [r0]
1088    add       r3d, r2d
1089    movzx     r2d, word [r0+r1]
1090    add       r4d, r2d
1091%endrep
1092    lea       r3d, [r3+r4+8]
1093    shr       r3d, 4
1094
1095    movd       m0, r3d
1096    SPLATW     m0, m0
1097    mov       r3d, 8
1098.loop:
1099    MOV16 r5+r1*0, m0, m0, m0, m0
1100    MOV16 r5+r1*1, m0, m0, m0, m0
1101    lea        r5, [r5+r1*2]
1102    dec       r3d
1103    jg .loop
1104    REP_RET
1105
1106;-----------------------------------------------------------------------------
1107; void ff_pred16x16_128_dc_10(pixel *src, ptrdiff_t stride)
1108;-----------------------------------------------------------------------------
1109INIT_XMM sse2
1110cglobal pred16x16_128_dc_10, 2,3
1111    mova       m0, [pw_512]
1112    mov       r2d, 8
1113.loop:
1114    MOV16 r0+r1*0, m0, m0, m0, m0
1115    MOV16 r0+r1*1, m0, m0, m0, m0
1116    lea        r0, [r0+r1*2]
1117    dec       r2d
1118    jg .loop
1119    REP_RET
1120