;******************************************************************************
;* Pixel utilities SIMD
;*
;* Copyright (C) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (C) 2014 Clément Bœsch <u pkh me>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

;-------------------------------------------------------------------------------
; int ff_pixelutils_sad_8x8_mmxext(const uint8_t *src1, ptrdiff_t stride1,
;                                  const uint8_t *src2, ptrdiff_t stride2);
;-------------------------------------------------------------------------------
INIT_MMX mmxext
cglobal pixelutils_sad_8x8, 4,4,0, src1, stride1, src2, stride2
    pxor        m2, m2
%rep 4
    mova        m0, [src1q]
    mova        m1, [src1q + stride1q]
    psadbw      m0, [src2q]
    psadbw      m1, [src2q + stride2q]
    paddw       m2, m0
    paddw       m2, m1
    lea         src1q, [src1q + 2*stride1q]
    lea         src2q, [src2q + 2*stride2q]
%endrep
    movd        eax, m2
    RET

;-------------------------------------------------------------------------------
; int ff_pixelutils_sad_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1,
;                                  const uint8_t *src2, ptrdiff_t stride2);
;-------------------------------------------------------------------------------
INIT_XMM sse2
cglobal pixelutils_sad_16x16, 4,4,5, src1, stride1, src2, stride2
    movu        m4, [src1q]
    movu        m2, [src2q]
    movu        m1, [src1q + stride1q]
    movu        m3, [src2q + stride2q]
    psadbw      m4, m2
    psadbw      m1, m3
    paddw       m4, m1
%rep 7
    lea         src1q, [src1q + 2*stride1q]
    lea         src2q, [src2q + 2*stride2q]
    movu        m0, [src1q]
    movu        m2, [src2q]
    movu        m1, [src1q + stride1q]
    movu        m3, [src2q + stride2q]
    psadbw      m0, m2
    psadbw      m1, m3
    paddw       m4, m0
    paddw       m4, m1
%endrep
    movhlps     m0, m4
    paddw       m4, m0
    movd        eax, m4
    RET

;-------------------------------------------------------------------------------
; int ff_pixelutils_sad_[au]_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1,
;                                       const uint8_t *src2, ptrdiff_t stride2);
;-------------------------------------------------------------------------------
%macro SAD_XMM_16x16 1
INIT_XMM sse2
cglobal pixelutils_sad_%1_16x16, 4,4,3, src1, stride1, src2, stride2
    mov%1       m2, [src2q]
    psadbw      m2, [src1q]
    mov%1       m1, [src2q + stride2q]
    psadbw      m1, [src1q + stride1q]
    paddw       m2, m1
%rep 7
    lea         src1q, [src1q + 2*stride1q]
    lea         src2q, [src2q + 2*stride2q]
    mov%1       m0, [src2q]
    psadbw      m0, [src1q]
    mov%1       m1, [src2q + stride2q]
    psadbw      m1, [src1q + stride1q]
    paddw       m2, m0
    paddw       m2, m1
%endrep
    movhlps     m0, m2
    paddw       m2, m0
    movd        eax, m2
    RET
%endmacro

SAD_XMM_16x16 a
SAD_XMM_16x16 u

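; Helper macros for the 32x32 SAD functions below.  Each invocation adds the
; SAD of four 32-pixel rows (loaded as two 16-byte halves per row) to the
; accumulator in m0 and advances both row pointers (r0/r1 = src1/stride1,
; r2/r3 = src2/stride2).  PROCESS_SAD_32x4_U uses unaligned loads for both
; sources; PROCESS_SAD_32x4 loads src2 with mov%1 ('a' or 'u') and passes
; src1 to psadbw as a memory operand, which the legacy SSE2 encoding requires
; to be 16-byte aligned.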
%macro PROCESS_SAD_32x4_U 0
    movu        m1, [r2]
    movu        m2, [r2 + 16]
    movu        m3, [r0]
    movu        m4, [r0 + 16]
    psadbw      m1, m3
    psadbw      m2, m4
    paddd       m1, m2
    paddd       m0, m1
    lea         r2, [r2 + r3]
    lea         r0, [r0 + r1]

    movu        m1, [r2]
    movu        m2, [r2 + 16]
    movu        m3, [r0]
    movu        m4, [r0 + 16]
    psadbw      m1, m3
    psadbw      m2, m4
    paddd       m1, m2
    paddd       m0, m1
    lea         r2, [r2 + r3]
    lea         r0, [r0 + r1]

    movu        m1, [r2]
    movu        m2, [r2 + 16]
    movu        m3, [r0]
    movu        m4, [r0 + 16]
    psadbw      m1, m3
    psadbw      m2, m4
    paddd       m1, m2
    paddd       m0, m1
    lea         r2, [r2 + r3]
    lea         r0, [r0 + r1]

    movu        m1, [r2]
    movu        m2, [r2 + 16]
    movu        m3, [r0]
    movu        m4, [r0 + 16]
    psadbw      m1, m3
    psadbw      m2, m4
    paddd       m1, m2
    paddd       m0, m1
    lea         r2, [r2 + r3]
    lea         r0, [r0 + r1]
%endmacro

%macro PROCESS_SAD_32x4 1
    mov%1       m1, [r2]
    mov%1       m2, [r2 + 16]
    psadbw      m1, [r0]
    psadbw      m2, [r0 + 16]
    paddd       m1, m2
    paddd       m0, m1
    lea         r2, [r2 + r3]
    lea         r0, [r0 + r1]

    mov%1       m1, [r2]
    mov%1       m2, [r2 + 16]
    psadbw      m1, [r0]
    psadbw      m2, [r0 + 16]
    paddd       m1, m2
    paddd       m0, m1
    lea         r2, [r2 + r3]
    lea         r0, [r0 + r1]

    mov%1       m1, [r2]
    mov%1       m2, [r2 + 16]
    psadbw      m1, [r0]
    psadbw      m2, [r0 + 16]
    paddd       m1, m2
    paddd       m0, m1
    lea         r2, [r2 + r3]
    lea         r0, [r0 + r1]

    mov%1       m1, [r2]
    mov%1       m2, [r2 + 16]
    psadbw      m1, [r0]
    psadbw      m2, [r0 + 16]
    paddd       m1, m2
    paddd       m0, m1
    lea         r2, [r2 + r3]
    lea         r0, [r0 + r1]
%endmacro

;-----------------------------------------------------------------------------
; int ff_pixelutils_sad_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1,
;                                  const uint8_t *src2, ptrdiff_t stride2);
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal pixelutils_sad_32x32, 4,5,5, src1, stride1, src2, stride2
    pxor        m0, m0
    mov         r4d, 4
.loop:
    PROCESS_SAD_32x4_U
    PROCESS_SAD_32x4_U
    dec         r4d
    jnz         .loop

    movhlps     m1, m0
    paddd       m0, m1
    movd        eax, m0
    RET

;-------------------------------------------------------------------------------
; int ff_pixelutils_sad_[au]_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1,
;                                       const uint8_t *src2, ptrdiff_t stride2);
;-------------------------------------------------------------------------------
%macro SAD_XMM_32x32 1
INIT_XMM sse2
cglobal pixelutils_sad_%1_32x32, 4,5,3, src1, stride1, src2, stride2
    pxor        m0, m0
    mov         r4d, 4
.loop:
    PROCESS_SAD_32x4 %1
    PROCESS_SAD_32x4 %1
    dec         r4d
    jnz         .loop

    movhlps     m1, m0
    paddd       m0, m1
    movd        eax, m0
    RET
%endmacro

SAD_XMM_32x32 a
SAD_XMM_32x32 u

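; The AVX2 versions below load a full 32-byte row into a single ymm register
; (four rows per loop iteration); they are assembled only when the assembler
; supports AVX2 (HAVE_AVX2_EXTERNAL).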
%if HAVE_AVX2_EXTERNAL
;-------------------------------------------------------------------------------
; int ff_pixelutils_sad_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1,
;                                  const uint8_t *src2, ptrdiff_t stride2);
;-------------------------------------------------------------------------------
INIT_YMM avx2
cglobal pixelutils_sad_32x32, 4,7,5, src1, stride1, src2, stride2
    pxor            m0, m0
    mov             r4d, 32/4
    lea             r5, [stride1q * 3]
    lea             r6, [stride2q * 3]

.loop:
    movu            m1, [src1q]                ; row 0 of pix0
    movu            m2, [src2q]                ; row 0 of pix1
    movu            m3, [src1q + stride1q]     ; row 1 of pix0
    movu            m4, [src2q + stride2q]     ; row 1 of pix1

    psadbw          m1, m2
    psadbw          m3, m4
    paddd           m0, m1
    paddd           m0, m3

    movu            m1, [src1q + 2 * stride1q] ; row 2 of pix0
    movu            m2, [src2q + 2 * stride2q] ; row 2 of pix1
    movu            m3, [src1q + r5]           ; row 3 of pix0
    movu            m4, [src2q + r6]           ; row 3 of pix1

    psadbw          m1, m2
    psadbw          m3, m4
    paddd           m0, m1
    paddd           m0, m3

    lea             src2q, [src2q + 4 * stride2q]
    lea             src1q, [src1q + 4 * stride1q]

    dec             r4d
    jnz             .loop

    vextracti128    xm1, m0, 1
    paddd           xm0, xm1
    pshufd          xm1, xm0, 2
    paddd           xm0, xm1
    movd            eax, xm0
    RET

;-------------------------------------------------------------------------------
; int ff_pixelutils_sad_[au]_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1,
;                                       const uint8_t *src2, ptrdiff_t stride2);
;-------------------------------------------------------------------------------
%macro SAD_AVX2_32x32 1
INIT_YMM avx2
cglobal pixelutils_sad_%1_32x32, 4,7,3, src1, stride1, src2, stride2
    pxor            m0, m0
    mov             r4d, 32/4
    lea             r5, [stride1q * 3]
    lea             r6, [stride2q * 3]

.loop:
    mov%1           m1, [src2q]                ; row 0 of pix1
    psadbw          m1, [src1q]
    mov%1           m2, [src2q + stride2q]     ; row 1 of pix1
    psadbw          m2, [src1q + stride1q]

    paddd           m0, m1
    paddd           m0, m2

    mov%1           m1, [src2q + 2 * stride2q] ; row 2 of pix1
    psadbw          m1, [src1q + 2 * stride1q]
    mov%1           m2, [src2q + r6]           ; row 3 of pix1
    psadbw          m2, [src1q + r5]

    paddd           m0, m1
    paddd           m0, m2

    lea             src2q, [src2q + 4 * stride2q]
    lea             src1q, [src1q + 4 * stride1q]

    dec             r4d
    jnz             .loop

    vextracti128    xm1, m0, 1
    paddd           xm0, xm1
    pshufd          xm1, xm0, 2
    paddd           xm0, xm1
    movd            eax, xm0
    RET
%endmacro

SAD_AVX2_32x32 a
SAD_AVX2_32x32 u
%endif