;******************************************************************************
;* MMX/SSE2-optimized functions for the RV30 and RV40 decoders
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
; RV30/40 IDCT multiplier constants (13, 17, 7), replicated per 16-bit lane
; for the row pass.
pw_row_coeffs:  times 4 dw 13
                times 4 dw 17
                times 4 dw  7
; Rounder added before the >>10 in the column pass.
pd_512:         times 2 dd 0x200
; Column-pass coefficient pairs, laid out for pmaddwd on interleaved
; even/odd columns (see COL_TRANSFORM).
pw_col_coeffs:  dw  13,  13,  13, -13
                dw  17,   7,   7, -17
                dw  13, -13,  13,  13
                dw  -7,  17, -17,  -7

SECTION .text

; Scale a DC value without rounding: %1 = (%1 * 13*13*3) >> 11.
; %1 must be a GPR; clobbers flags.
%macro IDCT_DC_NOROUND 1
    imul   %1, 13*13*3
    sar    %1, 11
%endmacro

; Scale a DC value with rounding: %1 = (%1 * 13*13 + 0x200) >> 10.
; %1 must be a GPR; clobbers flags.
%macro IDCT_DC_ROUND 1
    imul   %1, 13*13
    add    %1, 0x200
    sar    %1, 10
%endmacro

; Read the DC coefficient at block[0] (r0), scale it without rounding and
; broadcast the result to all 16 coefficients of the 4x4 int16_t block.
INIT_MMX mmxext
cglobal rv34_idct_dc_noround, 1, 2, 0
    movsx   r1, word [r0]
    IDCT_DC_NOROUND r1
    movd    m0, r1d
    pshufw  m0, m0, 0                   ; splat DC into all 4 words
    movq    [r0+ 0], m0
    movq    [r0+ 8], m0
    movq    [r0+16], m0
    movq    [r0+24], m0
    REP_RET

; Load coeffs and perform row transform
; Output: coeffs in mm[0467], rounder in mm5
; %1 = pointer to the 4x4 int16_t block; the block is zeroed as it is read.
; Clobbers mm0-mm7.
%macro ROW_TRANSFORM  1
    pxor        mm7, mm7
    mova        mm0, [%1+ 0*8]
    mova        mm1, [%1+ 1*8]
    mova        mm2, [%1+ 2*8]
    mova        mm3, [%1+ 3*8]
    mova  [%1+ 0*8], mm7                ; clear the block behind us
    mova  [%1+ 1*8], mm7
    mova  [%1+ 2*8], mm7
    mova  [%1+ 3*8], mm7
    mova        mm4, mm0
    mova        mm6, [pw_row_coeffs+ 0]
    paddsw      mm0, mm2                ; b0 + b2
    psubsw      mm4, mm2                ; b0 - b2
    pmullw      mm0, mm6                ; *13 = z0
    pmullw      mm4, mm6                ; *13 = z1
    mova        mm5, mm1
    pmullw      mm1, [pw_row_coeffs+ 8] ; b1*17
    pmullw      mm5, [pw_row_coeffs+16] ; b1* 7
    mova        mm7, mm3
    pmullw      mm3, [pw_row_coeffs+ 8] ; b3*17
    pmullw      mm7, [pw_row_coeffs+16] ; b3* 7
    paddsw      mm1, mm7                ; z3 = b1*17 + b3* 7
    psubsw      mm5, mm3                ; z2 = b1* 7 - b3*17
    mova        mm7, mm0
    mova        mm6, mm4
    paddsw      mm0, mm1                ; z0 + z3
    psubsw      mm7, mm1                ; z0 - z3
    paddsw      mm4, mm5                ; z1 + z2
    psubsw      mm6, mm5                ; z1 - z2
    mova        mm5, [pd_512]           ; 0x200
%endmacro

; ff_rv34_idct_add_mmxext(uint8_t *dst, ptrdiff_t stride, int16_t *block);
; Column transform for one row of 4 destination pixels:
;   %1 = dword destination memory operand
;   %2 = MMX reg holding one transformed row (from ROW_TRANSFORM)
;   %3, %4 = column coefficient operands (pw_col_coeffs pairs)
; Expects the rounder in mm5; clobbers mm1-mm3.
%macro COL_TRANSFORM  4
    pshufw      mm3, %2, 0xDD           ; col. 1,3,1,3
    pshufw       %2, %2, 0x88           ; col. 0,2,0,2
    pmaddwd      %2, %3                 ; 13*c0+13*c2 | 13*c0-13*c2 = z0 | z1
    pmaddwd     mm3, %4                 ; 17*c1+ 7*c3 |  7*c1-17*c3 = z3 | z2
    paddd        %2, mm5                ; add rounder (0x200)
    pshufw      mm1,  %2, 01001110b     ; z1 | z0
    pshufw      mm2, mm3, 01001110b     ; z2 | z3
    paddd        %2, mm3                ; z0+z3 | z1+z2
    psubd       mm1, mm2                ; z1-z2 | z0-z3
    movd        mm3, %1                 ; load 4 dst pixels
    psrad        %2, 10
    pxor        mm2, mm2
    psrad       mm1, 10
    punpcklbw   mm3, mm2                ; dst pixels -> words
    packssdw     %2, mm1
    paddw        %2, mm3                ; add residual to dst
    packuswb     %2, %2                 ; clip to u8
    movd         %1, %2
%endmacro
; Full 4x4 IDCT of block (bq), result added to dst (dq) with stride sq.
; The block is zeroed by ROW_TRANSFORM as a side effect.
INIT_MMX mmxext
cglobal rv34_idct_add, 3,3,0, d,s,b
    ROW_TRANSFORM       bq
    COL_TRANSFORM   [dq], mm0, [pw_col_coeffs+ 0], [pw_col_coeffs+ 8]
    mova               mm0, [pw_col_coeffs+ 0]    ; keep coeffs in regs for
    COL_TRANSFORM [dq+sq], mm4, mm0, [pw_col_coeffs+ 8]
    mova               mm4, [pw_col_coeffs+ 8]    ; the remaining two rows
    lea                 dq, [dq + 2*sq]
    COL_TRANSFORM   [dq], mm6, mm0, mm4
    COL_TRANSFORM [dq+sq], mm7, mm0, mm4
    ret

; ff_rv34_idct_dc_add_sse4(uint8_t *dst, int stride, int dc);
; Scale the DC (r2) with rounding, then add it to all 16 pixels of the
; 4x4 dst block (r0, stride r1). Instantiated for sse2 and sse4 below;
; sse4 stores rows 1-3 with pextrd, sse2 shifts them down with psrldq.
%macro RV34_IDCT_DC_ADD 0
cglobal rv34_idct_dc_add, 3, 3, 6
    ; load data
    IDCT_DC_ROUND   r2
    pxor       m1, m1

    ; calculate DC
    movd       m0, r2d
    lea        r2, [r0+r1*2]            ; r2 = dst + 2*stride
    movd       m2, [r0]
    movd       m3, [r0+r1]
    pshuflw    m0, m0, 0
    movd       m4, [r2]
    movd       m5, [r2+r1]
    punpcklqdq m0, m0                   ; broadcast DC to all 8 words
    punpckldq  m2, m3                   ; rows 0,1 in m2
    punpckldq  m4, m5                   ; rows 2,3 in m4
    punpcklbw  m2, m1                   ; pixels -> words
    punpcklbw  m4, m1
    paddw      m2, m0
    paddw      m4, m0
    packuswb   m2, m4                   ; clip; all 4 rows now in m2
    movd      [r0], m2
%if cpuflag(sse4)
    pextrd [r0+r1], m2, 1
    pextrd    [r2], m2, 2
    pextrd [r2+r1], m2, 3
%else
    psrldq     m2, 4
    movd   [r0+r1], m2
    psrldq     m2, 4
    movd      [r2], m2
    psrldq     m2, 4
    movd   [r2+r1], m2
%endif
    RET
%endmacro

INIT_XMM sse2
RV34_IDCT_DC_ADD
INIT_XMM sse4
RV34_IDCT_DC_ADD