;*****************************************************************************
;* SSE2-optimized weighted prediction code
;*****************************************************************************
;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

;-----------------------------------------------------------------------------
; biweight pred:
;
; void ff_h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride,
;                               int height, int log2_denom, int weightd,
;                               int weights, int offset);
; and
; void ff_h264_weight_16_sse2(uint8_t *dst, int stride, int height,
;                             int log2_denom, int weight, int offset);
;
; Argument -> register mapping as used by the code below:
;   weight:   r0 = dst, r1 = stride, r2 = height, r3 = log2_denom,
;             r4 = weight, r5 = offset
;   biweight: r0 = dst, r1 = src, r2 = stride, r3 = height, r4 = log2_denom,
;             r5 = weightd, r6 = weights, r7m = offset
;
; NOTE(review): stride is declared int above but is used as a full-width
; register in address arithmetic (add r0, r1 / lea r3, [r1*2]). Confirm that
; the C prototype / callers provide a pointer-sized or sign-extended value.
;-----------------------------------------------------------------------------

; WEIGHT_SETUP: prepare the constants for unidirectional weighting.
; In:  r3 = log2_denom, r4 = weight, r5 = offset
; Out: m3 = weight in every 16-bit lane
;      m5 = low 16 bits of (((2*offset + 1) << log2_denom) >> 1) in every lane
;           (offset scaled by the denominator plus the rounding term)
;      m6 = log2_denom (shift count)
;      m7 = 0 (used to zero-extend bytes to words)
; Clobbers: r5
%macro WEIGHT_SETUP 0
    add        r5, r5               ; r5 = 2*offset
    inc        r5                   ; r5 = 2*offset + 1
    movd       m3, r4d
    movd       m5, r5d
    movd       m6, r3d
    pslld      m5, m6               ; (2*offset + 1) << log2_denom
    psrld      m5, 1                ; halve again; the "+1" becomes the rounding term
%if mmsize == 16
    pshuflw    m3, m3, 0            ; broadcast word 0 across the low quadword
    pshuflw    m5, m5, 0
    punpcklqdq m3, m3               ; ... and duplicate it into the high quadword
    punpcklqdq m5, m5
%else
    pshufw     m3, m3, 0            ; MMX: broadcast word 0 across the register
    pshufw     m5, m5, 0
%endif
    pxor       m7, m7
%endmacro

; WEIGHT_OP off_a, off_b: weight two half-register groups of pixels.
; Loads from [r0+off_a] and [r0+off_b], computes
;   (pixel * m3 + m5) >> m6
; per pixel with signed-saturating add and arithmetic shift, then packs both
; groups to bytes with unsigned saturation.
; Out: m0 = packed result (off_a group in the low half, off_b group in the high half)
; Clobbers: m1
%macro WEIGHT_OP 2
    movh        m0, [r0+%1]
    movh        m1, [r0+%2]
    punpcklbw   m0, m7              ; bytes -> words
    punpcklbw   m1, m7
    pmullw      m0, m3
    pmullw      m1, m3
    paddsw      m0, m5
    paddsw      m1, m5
    psraw       m0, m6
    psraw       m1, m6
    packuswb    m0, m1
%endmacro

; WEIGHT_FUNC_MM width, xmm_regs: emit h264_weight_<width> where one row
; fills an entire SIMD register (one full-register store per row).
%macro WEIGHT_FUNC_MM 2
cglobal h264_weight_%1, 6, 6, %2
    WEIGHT_SETUP
.nextrow:
    WEIGHT_OP 0, mmsize/2           ; left and right half of the current row
    mova     [r0], m0
    add        r0, r1               ; dst += stride
    dec        r2d                  ; one row done
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM sse2
WEIGHT_FUNC_MM 16, 8

; WEIGHT_FUNC_HALF_MM width, xmm_regs: emit h264_weight_<width> where one row
; fills only half a SIMD register, so two rows are processed per iteration.
%macro WEIGHT_FUNC_HALF_MM 2
cglobal h264_weight_%1, 6, 6, %2
    WEIGHT_SETUP
    sar       r2d, 1                ; loop counter = height / 2 (row pairs)
    lea        r3, [r1*2]           ; r3 = 2*stride (log2_denom already copied to m6)
.nextrow:
    WEIGHT_OP 0, r1                 ; current row and the row below it
    movh     [r0], m0               ; low half -> first row
%if mmsize == 16
    movhps   [r0+r1], m0            ; high half -> second row
%else
    psrlq      m0, 32               ; MMX: move the high half down before storing
    movh     [r0+r1], m0
%endif
    add        r0, r3               ; dst += 2*stride
    dec        r2d
    jnz .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
WEIGHT_FUNC_HALF_MM 4, 0
INIT_XMM sse2
WEIGHT_FUNC_HALF_MM 8, 8

; BIWEIGHT_SETUP: prepare the constants for bidirectional weighting.
; In:  r4 = log2_denom, r5 = weightd, r6 = weights, r7m = offset
; Out (ssse3):     m4 = (weightd, weights) byte pair in every 16-bit lane,
;                       laid out for pmaddubsw
; Out (otherwise): m3 = weightd in every 16-bit lane,
;                  m4 = weights in every 16-bit lane,
;                  m7 = 0
; Out (both):      m5 = low 16 bits of ((off << shift) >> 1) in every lane
;                  m6 = shift
;   where off = (offset + 1) | 1 and shift = log2_denom + 1, except when
;   either weight equals 128: then both weights and off are halved (arithmetic
;   shift) and shift = log2_denom.
;   (Presumably the halving keeps the weights inside the signed 8-bit range
;   that pmaddubsw needs -- TODO confirm.)
; Clobbers: r4, r5, r6 and off_regd (r7d on x86-64, r3d on x86-32; on x86-32
; the height in r3 is therefore destroyed, and callers re-load it with
; "movifnidn r3d, r3m" right after this macro).
%macro BIWEIGHT_SETUP 0
%if ARCH_X86_64
%define off_regd r7d
%else
%define off_regd r3d
%endif
    mov  off_regd, r7m
    add  off_regd, 1
    or   off_regd, 1                ; off = (offset + 1) | 1
    add       r4d, 1                ; shift = log2_denom + 1
    cmp       r6d, 128
    je .nonnormal
    cmp       r5d, 128
    jne .normal
.nonnormal:
    ; A weight of exactly 128: halve weights and offset, and undo the +1
    ; applied to the shift count so the overall scale is unchanged.
    sar       r5d, 1
    sar       r6d, 1
    sar  off_regd, 1
    sub       r4d, 1
.normal:
%if cpuflag(ssse3)
    movd       m4, r5d
    movd       m0, r6d
%else
    movd       m3, r5d
    movd       m4, r6d
%endif
    movd       m5, off_regd
    movd       m6, r4d
    pslld      m5, m6               ; off << shift
    psrld      m5, 1                ; ... >> 1
%if cpuflag(ssse3)
    punpcklbw  m4, m0               ; word 0 = weightd (low byte) | weights (high byte)
    pshuflw    m4, m4, 0
    pshuflw    m5, m5, 0
    punpcklqdq m4, m4
    punpcklqdq m5, m5

%else
%if mmsize == 16
    pshuflw    m3, m3, 0
    pshuflw    m4, m4, 0
    pshuflw    m5, m5, 0
    punpcklqdq m3, m3
    punpcklqdq m4, m4
    punpcklqdq m5, m5
%else
    pshufw     m3, m3, 0
    pshufw     m4, m4, 0
    pshufw     m5, m5, 0
%endif
    pxor       m7, m7
%endif
%endmacro

; BIWEIGHT_STEPA a, b, off: m<a> = dst[off..] * m3 + src[off..] * m4
; computed on 16-bit lanes with a signed-saturating add.
; Uses m<b> as scratch; needs m7 = 0.
%macro BIWEIGHT_STEPA 3
    movh       m%1, [r0+%3]         ; dst pixels
    movh       m%2, [r1+%3]         ; src pixels
    punpcklbw  m%1, m7              ; bytes -> words
    punpcklbw  m%2, m7
    pmullw     m%1, m3
    pmullw     m%2, m4
    paddsw     m%1, m%2
%endmacro

; BIWEIGHT_STEPB: add the offset/rounding term (m5) to m0 and m1, shift both
; right by m6 and pack them to unsigned-saturated bytes in m0.
%macro BIWEIGHT_STEPB 0
    paddsw     m0, m5
    paddsw     m1, m5
    psraw      m0, m6
    psraw      m1, m6
    packuswb   m0, m1
%endmacro

; BIWEIGHT_FUNC_MM width, xmm_regs: emit h264_biweight_<width> where one row
; fills an entire SIMD register.
%macro BIWEIGHT_FUNC_MM 2
cglobal h264_biweight_%1, 7, 8, %2
    BIWEIGHT_SETUP
    movifnidn r3d, r3m              ; (re)load height; r3 may have held the offset
.nextrow:
    BIWEIGHT_STEPA 0, 1, 0          ; left half  -> m0 (m1 scratch)
    BIWEIGHT_STEPA 1, 2, mmsize/2   ; right half -> m1 (m2 scratch)
    BIWEIGHT_STEPB
    mova       [r0], m0
    add        r0, r2               ; dst += stride
    add        r1, r2               ; src += stride
    dec        r3d
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM sse2
BIWEIGHT_FUNC_MM 16, 8

; BIWEIGHT_FUNC_HALF_MM width, xmm_regs: emit h264_biweight_<width> where one
; row fills only half a SIMD register, so two rows are processed per iteration.
%macro BIWEIGHT_FUNC_HALF_MM 2
cglobal h264_biweight_%1, 7, 8, %2
    BIWEIGHT_SETUP
    movifnidn r3d, r3m              ; (re)load height; r3 may have held the offset
    ; height is a 32-bit int: shift only r3d so that undefined upper bits of
    ; the 64-bit register cannot leak into the loop counter tested below.
    sar       r3d, 1                ; loop counter = height / 2 (row pairs)
    lea        r4, [r2*2]           ; r4 = 2*stride (shift count already in m6)
.nextrow:
    BIWEIGHT_STEPA 0, 1, 0          ; first row  -> m0 (m1 scratch)
    BIWEIGHT_STEPA 1, 2, r2         ; second row -> m1 (m2 scratch)
    BIWEIGHT_STEPB
    movh       [r0], m0             ; low half -> first row
%if mmsize == 16
    movhps     [r0+r2], m0          ; high half -> second row
%else
    psrlq      m0, 32               ; MMX: move the high half down before storing
    movh       [r0+r2], m0
%endif
    add        r0, r4               ; dst += 2*stride
    add        r1, r4               ; src += 2*stride
    dec        r3d
    jnz .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
BIWEIGHT_FUNC_HALF_MM 4, 0
INIT_XMM sse2
BIWEIGHT_FUNC_HALF_MM 8, 8

; BIWEIGHT_SSSE3_OP: m0 and m2 hold interleaved (dst, src) byte pairs.
; pmaddubsw multiplies each pair by the (weightd, weights) bytes in m4 and sums
; the two products into one word; then add m5, shift right by m6 and pack both
; registers to unsigned-saturated bytes in m0.
%macro BIWEIGHT_SSSE3_OP 0
    pmaddubsw  m0, m4
    pmaddubsw  m2, m4
    paddsw     m0, m5
    paddsw     m2, m5
    psraw      m0, m6
    psraw      m2, m6
    packuswb   m0, m2
%endmacro

; SSSE3 biweight, 16 pixels per row, one row per iteration.
INIT_XMM ssse3
cglobal h264_biweight_16, 7, 8, 8
    BIWEIGHT_SETUP
    movifnidn r3d, r3m              ; (re)load height; r3 may have held the offset

.nextrow:
    movh       m0, [r0]             ; dst[0..7]
    movh       m2, [r0+8]           ; dst[8..15]
    movh       m3, [r1+8]           ; src[8..15]
    punpcklbw  m0, [r1]             ; interleave dst[0..7] with src[0..7]
    punpcklbw  m2, m3               ; interleave dst[8..15] with src[8..15]
    BIWEIGHT_SSSE3_OP
    mova       [r0], m0
    add        r0, r2               ; dst += stride
    add        r1, r2               ; src += stride
    dec        r3d
    jnz .nextrow
    REP_RET

; SSSE3 biweight, 8 pixels per row, two rows per iteration.
INIT_XMM ssse3
cglobal h264_biweight_8, 7, 8, 8
    BIWEIGHT_SETUP
    movifnidn r3d, r3m              ; (re)load height; r3 may have held the offset
    ; height is a 32-bit int: shift only r3d so that undefined upper bits of
    ; the 64-bit register cannot leak into the loop counter tested below.
    sar       r3d, 1                ; loop counter = height / 2 (row pairs)
    lea        r4, [r2*2]           ; r4 = 2*stride (shift count already in m6)

.nextrow:
    movh       m0, [r0]             ; dst, first row
    movh       m1, [r1]             ; src, first row
    movh       m2, [r0+r2]          ; dst, second row
    movh       m3, [r1+r2]          ; src, second row
    punpcklbw  m0, m1               ; interleave dst/src bytes
    punpcklbw  m2, m3
    BIWEIGHT_SSSE3_OP
    movh       [r0], m0             ; low half -> first row
    movhps     [r0+r2], m0          ; high half -> second row
    add        r0, r4               ; dst += 2*stride
    add        r1, r4               ; src += 2*stride
    dec        r3d
    jnz .nextrow
    REP_RET