;******************************************************************************
;* SIMD-optimized IDCT-related routines
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; Syntax: NASM, written against the x86inc/x86util macro layer. Argument
; loading, register naming (blockq, pixelsq, ...) and the function
; prologue/epilogue are produced by cglobal/RET rather than written by hand.
; All functions below are built with INIT_XMM sse2, so mmsize is 16 bytes and
; one vector register holds eight int16_t values.
%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Byte constant defined outside this file.
; NOTE(review): presumably 0x80 replicated in every byte - confirm at its
; definition.
cextern pb_80

SECTION .text

;--------------------------------------------------------------------------
; void ff_put_signed_pixels_clamped(const int16_t *block, uint8_t *pixels,
;                                   ptrdiff_t line_size);
;
; Packs 64 int16_t values from block to signed bytes with saturation, adds
; pb_80 to every byte, and stores the result as eight rows of 8 bytes whose
; start addresses are line_size bytes apart.
;
; block is read with aligned 16-byte accesses (mova and pack instructions
; with a memory operand), so it has to be 16-byte aligned.
;--------------------------------------------------------------------------
; %1 = block offset (in bytes)
; In:    m0 = byte vector added to every packed byte, lsize3q = 3 * lsizeq
; Clobb: m1, m2
; Handles four rows: reads 4 * mmsize bytes of block, writes 4 * 8 bytes.
%macro PUT_SIGNED_PIXELS_CLAMPED_HALF 1
    mova         m1, [blockq+mmsize*0+%1]   ; row 0 (8 words)
    mova         m2, [blockq+mmsize*2+%1]   ; row 2 (8 words)
    packsswb     m1, [blockq+mmsize*1+%1]   ; m1 = rows 0|1 as saturated int8
    packsswb     m2, [blockq+mmsize*3+%1]   ; m2 = rows 2|3 as saturated int8
    paddb        m1, m0                     ; wrapping byte add of the bias
    paddb        m2, m0
    movq         [pixelsq+lsizeq*0], m1     ; low  8 bytes -> row 0
    movhps       [pixelsq+lsizeq*1], m1     ; high 8 bytes -> row 1
    movq         [pixelsq+lsizeq*2], m2     ; low  8 bytes -> row 2
    movhps       [pixelsq+lsize3q ], m2     ; high 8 bytes -> row 3
%endmacro

INIT_XMM sse2
; 3 arguments, 4 general-purpose registers (lsize3 is scratch), 3 XMM registers
cglobal put_signed_pixels_clamped, 3, 4, 3, block, pixels, lsize, lsize3
    mova         m0, [pb_80]
    lea          lsize3q, [lsizeq*3]            ; row 3 offset, computed once
    PUT_SIGNED_PIXELS_CLAMPED_HALF 0            ; rows 0-3
    lea          pixelsq, [pixelsq+lsizeq*4]    ; advance four rows
    PUT_SIGNED_PIXELS_CLAMPED_HALF 64           ; rows 4-7 (4 * mmsize bytes in)
    RET

;--------------------------------------------------------------------------
; void ff_put_pixels_clamped(const int16_t *block, uint8_t *pixels,
;                            ptrdiff_t line_size);
;
; Packs 64 int16_t values from block to unsigned bytes with saturation
; (clamping each value to 0..255) and stores them as eight rows of 8 bytes
; whose start addresses are line_size bytes apart.
;
; block is read with aligned 16-byte accesses, so it has to be 16-byte
; aligned.
;--------------------------------------------------------------------------
; %1 = block offset (in bytes)
; In:    lsize3q = 3 * lsizeq
; Clobb: m0, m1
; Handles four rows: reads 4 * mmsize bytes of block, writes 4 * 8 bytes.
%macro PUT_PIXELS_CLAMPED_HALF 1
    mova         m0, [blockq+mmsize*0+%1]   ; row 0 (8 words)
    mova         m1, [blockq+mmsize*2+%1]   ; row 2 (8 words)
    packuswb     m0, [blockq+mmsize*1+%1]   ; m0 = rows 0|1 clamped to uint8
    packuswb     m1, [blockq+mmsize*3+%1]   ; m1 = rows 2|3 clamped to uint8
    movq           [pixelsq], m0            ; low  8 bytes -> row 0
    movhps  [lsizeq+pixelsq], m0            ; high 8 bytes -> row 1
    movq  [2*lsizeq+pixelsq], m1            ; low  8 bytes -> row 2
    movhps [lsize3q+pixelsq], m1            ; high 8 bytes -> row 3
%endmacro

INIT_XMM sse2
; 3 arguments, 4 general-purpose registers (lsize3 is scratch), 2 XMM registers
cglobal put_pixels_clamped, 3, 4, 2, block, pixels, lsize, lsize3
    lea          lsize3q, [lsizeq*3]            ; row 3 offset, computed once
    PUT_PIXELS_CLAMPED_HALF 0                   ; rows 0-3
    lea          pixelsq, [pixelsq+lsizeq*4]    ; advance four rows
    PUT_PIXELS_CLAMPED_HALF 64                  ; rows 4-7 (4 * mmsize bytes in)
    RET

;--------------------------------------------------------------------------
; void ff_add_pixels_clamped(const int16_t *block, uint8_t *pixels,
;                            ptrdiff_t line_size);
;
; Adds 64 int16_t values from block to the eight rows of 8 bytes already
; stored at pixels (rows are line_size bytes apart), clamps each sum to
; 0..255 and writes the rows back in place.
;
; block is read with aligned 16-byte loads, so it has to be 16-byte aligned.
;--------------------------------------------------------------------------
; %1 = block offset (in bytes)
; In:    m4 = all zero
; Clobb: m0, m1, m2, m3
; Handles two rows: reads 2 * mmsize bytes of block, updates 2 * 8 bytes.
%macro ADD_PIXELS_CLAMPED 1
    mova         m0, [blockq+mmsize*0+%1]   ; row 0 coefficients (8 words)
    mova         m1, [blockq+mmsize*1+%1]   ; row 1 coefficients (8 words)
    movq         m2, [pixelsq]              ; row 0 pixels (8 bytes)
    movq         m3, [pixelsq+lsizeq]       ; row 1 pixels (8 bytes)
    punpcklbw    m2, m4                     ; zero-extend pixels to words
    punpcklbw    m3, m4
    paddsw       m0, m2                     ; signed saturating word add
    paddsw       m1, m3
    packuswb     m0, m1                     ; clamp to uint8, rows 0|1
    movq         [pixelsq], m0              ; low  8 bytes -> row 0
    movhps       [pixelsq+lsizeq], m0       ; high 8 bytes -> row 1
%endmacro

INIT_XMM sse2
; 3 arguments, 3 general-purpose registers, 5 XMM registers
cglobal add_pixels_clamped, 3, 3, 5, block, pixels, lsize
    pxor         m4, m4                         ; zero for byte->word unpacking
    ADD_PIXELS_CLAMPED 0                        ; rows 0-1
    lea          pixelsq, [pixelsq+lsizeq*2]    ; advance two rows
    ADD_PIXELS_CLAMPED 32                       ; rows 2-3
    lea          pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 64                       ; rows 4-5
    lea          pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 96                       ; rows 6-7
    RET