;******************************************************************************
;* SIMD-optimized HuffYUV functions
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2014 Christophe Gisquet
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

%include "libavcodec/x86/huffyuvdsp_template.asm"

;------------------------------------------------------------------------------
; void (*add_int16)(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
;------------------------------------------------------------------------------
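; A rough C equivalent of the routine below (the reference lives in
; libavcodec/huffyuvdsp.c; this sketch only illustrates the intended behaviour:
; add src to dst element-wise and clip to the bit depth with mask):
;
;     void add_int16(uint16_t *dst, const uint16_t *src, unsigned mask, int w)
;     {
;         for (int i = 0; i < w; i++)
;             dst[i] = (dst[i] + src[i]) & mask;
;     }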

%macro ADD_INT16 0
cglobal add_int16, 4,4,5, dst, src, mask, w, tmp
    test srcq, mmsize-1
    jnz .unaligned
    test dstq, mmsize-1
    jnz .unaligned
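    ; both src and dst are mmsize-aligned: take the aligned load/store path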
    INT16_LOOP a, add
.unaligned:
    INT16_LOOP u, add
%endmacro

INIT_XMM sse2
ADD_INT16

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
ADD_INT16
%endif

; void add_hfyu_left_pred_bgr32(uint8_t *dst, const uint8_t *src,
;                               intptr_t w, uint8_t *left)
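; A rough C sketch of what this routine computes: every 32-bit BGRA pixel is
; the byte-wise sum of the corresponding source pixel and the previous output
; pixel, with *left carrying the running value into and out of the call:
;
;     void add_hfyu_left_pred_bgr32(uint8_t *dst, const uint8_t *src,
;                                   intptr_t w, uint8_t *left)
;     {
;         for (intptr_t i = 0; i < w; i++)
;             for (int c = 0; c < 4; c++)
;                 left[c] = dst[4 * i + c] = src[4 * i + c] + left[c];
;     }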
INIT_XMM sse2
cglobal add_hfyu_left_pred_bgr32, 4,4,3, dst, src, w, left
    shl           wq, 2
    movd          m0, [leftq]
    lea         dstq, [dstq + wq]
    lea         srcq, [srcq + wq]
    LSHIFT        m0, mmsize-4
    neg           wq
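    ; The loop computes an in-register prefix sum over the four BGRA pixels of
    ; each XMM load: shifting left by one pixel (4 bytes) and adding gives sums
    ; over pairs, shifting by two pixels (8 bytes) and adding extends them to
    ; all four pixels.  m0 holds the last output pixel of the previous
    ; iteration broadcast to every dword (pshufd q3333) and is added in as the
    ; carried "left" value.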
.loop:
    movu          m1, [srcq+wq]
    mova          m2, m1
    LSHIFT        m1, 4
    paddb         m1, m2
    pshufd        m0, m0, q3333
    mova          m2, m1
    LSHIFT        m1, 8
    paddb         m1, m2
    paddb         m0, m1
    movu   [dstq+wq], m0
    add           wq, mmsize
    jl         .loop
    movd          m0, [dstq-4]
    movd     [leftq], m0
    REP_RET

; void add_hfyu_median_pred_int16(uint16_t *dst, const uint16_t *top, const uint16_t *diff, unsigned mask, int w, int *left, int *left_top)
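; A rough C equivalent (the reference is add_hfyu_median_pred_int16_c in
; libavcodec/huffyuvdsp.c); mid_pred() returns the median of its three
; arguments:
;
;     l = *left; lt = *left_top;
;     for (i = 0; i < w; i++) {
;         l      = (mid_pred(l, top[i], (l + top[i] - lt) & mask) + diff[i]) & mask;
;         lt     = top[i];
;         dst[i] = l;
;     }
;     *left = l; *left_top = lt;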
INIT_MMX mmxext
cglobal add_hfyu_median_pred_int16, 7,7,0, dst, top, diff, mask, w, left, left_top
    add      wd, wd
    movd    mm6, maskd
    SPLATW  mm6, mm6
    movq    mm0, [topq]
    movq    mm2, mm0
    movd    mm4, [left_topq]
    psllq   mm2, 16
    movq    mm1, mm0
    por     mm4, mm2
    movd    mm3, [leftq]
    psubw   mm0, mm4 ; t-tl
    add    dstq, wq
    add    topq, wq
    add   diffq, wq
    neg      wq
    jmp .skip
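    ; first iteration: t and t-tl were already computed above, so skip the reload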
.loop:
    movq    mm4, [topq+wq]
    movq    mm0, mm4
    psllq   mm4, 16
    por     mm4, mm1
    movq    mm1, mm0 ; t
    psubw   mm0, mm4 ; t-tl
.skip:
    movq    mm2, [diffq+wq]
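    ; Each 64-bit load covers four uint16 pixels, but every output depends on
    ; the previous pixel (the new "left"), so the unrolled %rep below processes
    ; the four words serially and packs the results into mm7.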
%assign i 0
%rep 4
    movq    mm4, mm0
    paddw   mm4, mm3 ; t-tl+l
    pand    mm4, mm6
    movq    mm5, mm3
    pmaxsw  mm3, mm1
    pminsw  mm5, mm1
    pminsw  mm3, mm4
    pmaxsw  mm3, mm5 ; median
    paddw   mm3, mm2 ; +residual
    pand    mm3, mm6
%if i==0
    movq    mm7, mm3
    psllq   mm7, 48
%else
    movq    mm4, mm3
    psrlq   mm7, 16
    psllq   mm4, 48
    por     mm7, mm4
%endif
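    ; finished words accumulate in mm7 from the top word down; after the fourth
    ; iteration they sit in source order, ready for the 64-bit store below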
%if i<3
    psrlq   mm0, 16
    psrlq   mm1, 16
    psrlq   mm2, 16
%endif
%assign i i+1
%endrep
    movq [dstq+wq], mm7
    add      wq, 8
    jl .loop
    movzx   r2d, word [dstq-2]
    mov [leftq], r2d
    movzx   r2d, word [topq-2]
    mov [left_topq], r2d
    RET