;******************************************************************************
;* MMX/SSE2-optimized functions for the RV30 and RV40 decoders
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
; Row-pass multipliers, each replicated over four 16-bit lanes:
;   +0: 13, +8: 17, +16: 7 (byte offsets as used by ROW_TRANSFORM).
pw_row_coeffs:  times 4 dw 13
                times 4 dw 17
                times 4 dw  7
; Rounding constant 0x200 in both dword lanes; added before the ">> 10"
; in COL_TRANSFORM.
pd_512: times 2 dd 0x200
; Column-pass pmaddwd operands, four words per row.
;   +0: applied to columns 0,2,0,2 -> z0 | z1
;   +8: applied to columns 1,3,1,3 -> z3 | z2
; The rows at +16 and +24 are not referenced by the code in this file view.
pw_col_coeffs:  dw 13,  13,  13, -13
                dw 17,   7,   7, -17
                dw 13, -13,  13,  13
                dw -7,  17, -17,  -7

SECTION .text

; %1 = (%1 * 13*13*3) >> 11, arithmetic shift, no rounding term.
%macro IDCT_DC_NOROUND 1
    imul   %1, 13*13*3
    sar    %1, 11
%endmacro

; %1 = (%1 * 13*13 + 0x200) >> 10, arithmetic shift.
%macro IDCT_DC_ROUND 1
    imul   %1, 13*13
    add    %1, 0x200
    sar    %1, 10
%endmacro

; rv34_idct_dc_noround: r0 points at a block of 16-bit coefficients.
; Reads the signed word at [r0], scales it with IDCT_DC_NOROUND, and
; overwrites the 32 bytes at [r0] with that value replicated in every
; 16-bit lane.
INIT_MMX mmxext
cglobal rv34_idct_dc_noround, 1, 2, 0
    movsx   r1, word [r0]
    IDCT_DC_NOROUND r1
    movd    m0, r1d
    pshufw  m0, m0, 0               ; broadcast low word to all four lanes
    movq    [r0+ 0], m0
    movq    [r0+ 8], m0
    movq    [r0+16], m0
    movq    [r0+24], m0
    REP_RET

; Load coeffs and perform row transform
; %1: base address of the coefficient block; its four 8-byte rows are
;     zeroed in memory after being loaded.
; Output: coeffs in mm[0467], rounder in mm5
;     mm0 = z0 + z3, mm4 = z1 + z2, mm6 = z1 - z2, mm7 = z0 - z3
; Clobbers mm1, mm2, mm3.
%macro ROW_TRANSFORM  1
    pxor        mm7, mm7
    mova        mm0, [%1+ 0*8]
    mova        mm1, [%1+ 1*8]
    mova        mm2, [%1+ 2*8]
    mova        mm3, [%1+ 3*8]
    mova  [%1+ 0*8], mm7
    mova  [%1+ 1*8], mm7
    mova  [%1+ 2*8], mm7
    mova  [%1+ 3*8], mm7
    mova        mm4, mm0
    mova        mm6, [pw_row_coeffs+ 0]
    paddsw      mm0, mm2                ; b0 + b2
    psubsw      mm4, mm2                ; b0 - b2
    pmullw      mm0, mm6                ; *13 = z0
    pmullw      mm4, mm6                ; *13 = z1
    mova        mm5, mm1
    pmullw      mm1, [pw_row_coeffs+ 8] ; b1*17
    pmullw      mm5, [pw_row_coeffs+16] ; b1* 7
    mova        mm7, mm3
    pmullw      mm3, [pw_row_coeffs+ 8] ; b3*17
    pmullw      mm7, [pw_row_coeffs+16] ; b3* 7
    paddsw      mm1, mm7                ; z3 = b1*17 + b3* 7
    psubsw      mm5, mm3                ; z2 = b1* 7 - b3*17
    mova        mm7, mm0
    mova        mm6, mm4
    paddsw      mm0, mm1                ; z0 + z3
    psubsw      mm7, mm1                ; z0 - z3
    paddsw      mm4, mm5                ; z1 + z2
    psubsw      mm6, mm5                ; z1 - z2
    mova        mm5, [pd_512]           ; 0x200
%endmacro

; Column transform of one output row, added to four destination pixels.
; %1: memory operand for the 4 destination bytes (read, then written)
; %2: MMX register holding the row's four coefficients (destroyed)
; %3: pmaddwd operand for columns 0,2 (register or memory)
; %4: pmaddwd operand for columns 1,3 (register or memory)
; Expects the rounder in mm5; clobbers mm1, mm2, mm3.
%macro COL_TRANSFORM  4
    pshufw      mm3, %2, 0xDD        ; col. 1,3,1,3
    pshufw       %2, %2, 0x88        ; col. 0,2,0,2
    pmaddwd      %2, %3              ; 13*c0+13*c2 | 13*c0-13*c2 = z0 | z1
    pmaddwd     mm3, %4              ; 17*c1+ 7*c3 |  7*c1-17*c3 = z3 | z2
    paddd        %2, mm5
    pshufw      mm1,  %2, 01001110b  ;    z1 | z0
    pshufw      mm2, mm3, 01001110b  ;    z2 | z3
    paddd        %2, mm3             ; z0+z3 | z1+z2
    psubd       mm1, mm2             ; z1-z2 | z0-z3
    movd        mm3, %1
    psrad        %2, 10
    pxor        mm2, mm2
    psrad       mm1, 10
    punpcklbw   mm3, mm2             ; destination bytes -> words
    packssdw     %2, mm1
    paddw        %2, mm3
    packuswb     %2, %2              ; saturate back to unsigned bytes
    movd         %1, %2
%endmacro

; ff_rv34_idct_add_mmxext(uint8_t *dst, ptrdiff_t stride, int16_t *block);
INIT_MMX mmxext
cglobal rv34_idct_add, 3,3,0, d, s, b
    ROW_TRANSFORM       bq
    COL_TRANSFORM     [dq], mm0, [pw_col_coeffs+ 0], [pw_col_coeffs+ 8]
    ; mm0 has been consumed by the first row, so reuse it to keep the
    ; even-column coefficients in a register for the remaining rows.
    mova               mm0, [pw_col_coeffs+ 0]
    COL_TRANSFORM  [dq+sq], mm4, mm0, [pw_col_coeffs+ 8]
    ; Likewise mm4 is free now; keep the odd-column coefficients in it.
    mova               mm4, [pw_col_coeffs+ 8]
    lea                 dq, [dq + 2*sq]
    COL_TRANSFORM     [dq], mm6, mm0, mm4
    COL_TRANSFORM  [dq+sq], mm7, mm0, mm4
    ret

; ff_rv34_idct_dc_add_sse4(uint8_t *dst, int stride, int dc);
; The macro is instantiated below for both sse2 and sse4.
; NOTE(review): r1 (stride) is used at full register width in address
; computations without any sign extension, so the C prototype presumably
; declares it as ptrdiff_t rather than int -- confirm against the header.
%macro RV34_IDCT_DC_ADD 0
cglobal rv34_idct_dc_add, 3, 3, 6
    ; scale and round the DC term
    IDCT_DC_ROUND r2
    pxor       m1, m1

    ; broadcast the DC value and load four rows of 4 pixels
    movd       m0, r2d
    lea        r2, [r0+r1*2]            ; r2 = address of row 2
    movd       m2, [r0]
    movd       m3, [r0+r1]
    pshuflw    m0, m0, 0
    movd       m4, [r2]
    movd       m5, [r2+r1]
    punpcklqdq m0, m0                   ; DC in all eight word lanes
    punpckldq  m2, m3                   ; rows 0 and 1
    punpckldq  m4, m5                   ; rows 2 and 3
    punpcklbw  m2, m1                   ; bytes -> words
    punpcklbw  m4, m1
    paddw      m2, m0
    paddw      m4, m0
    packuswb   m2, m4                   ; rows 0..3 as saturated bytes
    movd      [r0], m2
%if cpuflag(sse4)
    pextrd   [r0+r1], m2, 1
    pextrd    [r2], m2, 2
    pextrd   [r2+r1], m2, 3
%else
    psrldq     m2, 4
    movd     [r0+r1], m2
    psrldq     m2, 4
    movd      [r2], m2
    psrldq     m2, 4
    movd     [r2+r1], m2
%endif
    RET
%endmacro

INIT_XMM sse2
RV34_IDCT_DC_ADD
INIT_XMM sse4
RV34_IDCT_DC_ADD