;******************************************************************************
;* MMX/SSE2-optimized functions for the RV30 and RV40 decoders
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
pw_row_coeffs:  times 4 dw 13
                times 4 dw 17
                times 4 dw  7
pd_512: times 2 dd 0x200
pw_col_coeffs:  dw 13,  13,  13, -13
                dw 17,   7,   7, -17
                dw 13, -13,  13,  13
                dw -7,  17, -17,  -7
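
; The tables above hold the RV34 IDCT integer factors 13, 17 and 7.
; pw_row_coeffs broadcasts each factor across four 16-bit lanes for the
; pmullw-based row pass; pw_col_coeffs interleaves the factors in
; (even, odd) pairs so that a single pmaddwd in the column pass can compute
; two dot products (z0|z1 and z3|z2) at once. pd_512 is the 0x200 rounding
; bias added before the final >> 10.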

SECTION .text

%macro IDCT_DC_NOROUND 1
    imul   %1, 13*13*3
    sar    %1, 11
%endmacro

%macro IDCT_DC_ROUND 1
    imul   %1, 13*13
    add    %1, 0x200
    sar    %1, 10
%endmacro
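
; When a block consists of a lone DC coefficient, both IDCT passes collapse
; into a single scalar multiply. As a C sketch of the arithmetic above
; (variable names are illustrative only):
;     dc_noround = (13 * 13 * 3 * dc) >> 11;        // IDCT_DC_NOROUND
;     dc_round   = (13 * 13 * dc + 0x200) >> 10;    // IDCT_DC_ROUND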

INIT_MMX mmxext
cglobal rv34_idct_dc_noround, 1, 2, 0
    movsx   r1, word [r0]
    IDCT_DC_NOROUND r1
    movd    m0, r1d
    pshufw  m0, m0, 0
    movq    [r0+ 0], m0
    movq    [r0+ 8], m0
    movq    [r0+16], m0
    movq    [r0+24], m0
    REP_RET
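
; The function above reads the DC from block[0], scales it, broadcasts it to
; all four 16-bit lanes with pshufw, and stores it over the whole 4x4 block.
; Equivalent C sketch (the block is 16 contiguous int16_t):
;     int16_t v = (13 * 13 * 3 * block[0]) >> 11;
;     for (int i = 0; i < 16; i++)
;         block[i] = v;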

; Load coeffs and perform row transform
; Output: coeffs in mm0/mm4/mm6/mm7, rounder in mm5
%macro ROW_TRANSFORM  1
    pxor        mm7, mm7
    mova        mm0, [%1+ 0*8]
    mova        mm1, [%1+ 1*8]
    mova        mm2, [%1+ 2*8]
    mova        mm3, [%1+ 3*8]
    mova  [%1+ 0*8], mm7
    mova  [%1+ 1*8], mm7
    mova  [%1+ 2*8], mm7
    mova  [%1+ 3*8], mm7
    mova        mm4, mm0
    mova        mm6, [pw_row_coeffs+ 0]
    paddsw      mm0, mm2                ; b0 + b2
    psubsw      mm4, mm2                ; b0 - b2
    pmullw      mm0, mm6                ; *13 = z0
    pmullw      mm4, mm6                ; *13 = z1
    mova        mm5, mm1
    pmullw      mm1, [pw_row_coeffs+ 8] ; b1*17
    pmullw      mm5, [pw_row_coeffs+16] ; b1* 7
    mova        mm7, mm3
    pmullw      mm3, [pw_row_coeffs+ 8] ; b3*17
    pmullw      mm7, [pw_row_coeffs+16] ; b3* 7
    paddsw      mm1, mm7                ; z3 = b1*17 + b3* 7
    psubsw      mm5, mm3                ; z2 = b1* 7 - b3*17
    mova        mm7, mm0
    mova        mm6, mm4
    paddsw      mm0, mm1                ; z0 + z3
    psubsw      mm7, mm1                ; z0 - z3
    paddsw      mm4, mm5                ; z1 + z2
    psubsw      mm6, mm5                ; z1 - z2
    mova        mm5, [pd_512]           ; 0x200
%endmacro
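
; The macro loads the four 8-byte rows, zeroes the block in memory, and does
; the vertical butterflies four lanes at a time. Per lane, as a C sketch
; (b0..b3 are the four input rows):
;     z0 = 13 * (b0 + b2);        z1 = 13 * (b0 - b2);
;     z3 = 17 * b1 +  7 * b3;     z2 =  7 * b1 - 17 * b3;
;     mm0 = z0 + z3;  mm4 = z1 + z2;  mm6 = z1 - z2;  mm7 = z0 - z3;
; (with saturating 16-bit arithmetic, hence the padds/psubs forms)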

; ff_rv34_idct_add_mmxext(uint8_t *dst, ptrdiff_t stride, int16_t *block);
%macro COL_TRANSFORM  4
    pshufw      mm3, %2, 0xDD        ; col. 1,3,1,3
    pshufw       %2, %2, 0x88        ; col. 0,2,0,2
    pmaddwd      %2, %3              ; 13*c0+13*c2 | 13*c0-13*c2 = z0 | z1
    pmaddwd     mm3, %4              ; 17*c1+ 7*c3 |  7*c1-17*c3 = z3 | z2
    paddd        %2, mm5
    pshufw      mm1,  %2, 01001110b  ;    z1 | z0
    pshufw      mm2, mm3, 01001110b  ;    z2 | z3
    paddd        %2, mm3             ; z0+z3 | z1+z2
    psubd       mm1, mm2             ; z1-z2 | z0-z3
    movd        mm3, %1
    psrad        %2, 10
    pxor        mm2, mm2
    psrad       mm1, 10
    punpcklbw   mm3, mm2
    packssdw     %2, mm1
    paddw        %2, mm3
    packuswb     %2, %2
    movd         %1, %2
%endmacro
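
; Each COL_TRANSFORM invocation produces one 4-pixel destination row. Per
; row, a C sketch (c0..c3 come from the row pass; the final clip to 0..255
; is done by packuswb):
;     z0 = 13*c0 + 13*c2 + 0x200;    z1 = 13*c0 - 13*c2 + 0x200;
;     z3 = 17*c1 +  7*c3;            z2 =  7*c1 - 17*c3;
;     dst[0] += (z0 + z3) >> 10;     dst[1] += (z1 + z2) >> 10;
;     dst[2] += (z1 - z2) >> 10;     dst[3] += (z0 - z3) >> 10;
; The pshufw/pmaddwd pairing evaluates two dot products per instruction,
; and mm5 carries the rounder loaded at the end of ROW_TRANSFORM.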
INIT_MMX mmxext
cglobal rv34_idct_add, 3,3,0, d, s, b
    ROW_TRANSFORM       bq
    COL_TRANSFORM     [dq], mm0, [pw_col_coeffs+ 0], [pw_col_coeffs+ 8]
    mova               mm0, [pw_col_coeffs+ 0]
    COL_TRANSFORM  [dq+sq], mm4, mm0, [pw_col_coeffs+ 8]
    mova               mm4, [pw_col_coeffs+ 8]
    lea                 dq, [dq + 2*sq]
    COL_TRANSFORM     [dq], mm6, mm0, mm4
    COL_TRANSFORM  [dq+sq], mm7, mm0, mm4
    ret
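
; Note the register reuse above: the first two calls read the column
; coefficients from memory because mm0 and mm4 still hold row-pass results;
; once each result register has been consumed, it is reloaded with the
; constant so the remaining calls can use register operands instead.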

; ff_rv34_idct_dc_add_<opt>(uint8_t *dst, int stride, int dc);
; instantiated below for both sse2 and sse4
%macro RV34_IDCT_DC_ADD 0
cglobal rv34_idct_dc_add, 3, 3, 6
    ; load data
    IDCT_DC_ROUND r2
    pxor       m1, m1

    ; calculate DC
    movd       m0, r2d
    lea        r2, [r0+r1*2]
    movd       m2, [r0]
    movd       m3, [r0+r1]
    pshuflw    m0, m0, 0
    movd       m4, [r2]
    movd       m5, [r2+r1]
    punpcklqdq m0, m0
    punpckldq  m2, m3
    punpckldq  m4, m5
    punpcklbw  m2, m1
    punpcklbw  m4, m1
    paddw      m2, m0
    paddw      m4, m0
    packuswb   m2, m4
    movd      [r0], m2
%if cpuflag(sse4)
    pextrd [r0+r1], m2, 1
    pextrd    [r2], m2, 2
    pextrd [r2+r1], m2, 3
%else
    psrldq     m2, 4
    movd   [r0+r1], m2
    psrldq     m2, 4
    movd      [r2], m2
    psrldq     m2, 4
    movd   [r2+r1], m2
%endif
    RET
%endmacro
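
; The macro adds the rounded DC to all 16 pixels of a 4x4 block: the four
; rows are gathered into one XMM register, widened to 16 bits, offset by the
; broadcast DC and repacked with unsigned saturation. On SSE4 each row is
; stored back with pextrd; the SSE2 fallback shifts the packed result down
; with psrldq instead. Equivalent C sketch:
;     int v = (13 * 13 * dc + 0x200) >> 10;
;     for (int i = 0; i < 4; i++, dst += stride)
;         for (int j = 0; j < 4; j++)
;             dst[j] = av_clip_uint8(dst[j] + v);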

INIT_XMM sse2
RV34_IDCT_DC_ADD
INIT_XMM sse4
RV34_IDCT_DC_ADD