;******************************************************************************
;* Pixel utilities SIMD
;*
;* Copyright (C) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (C) 2014 Clément Bœsch <u pkh me>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

;-------------------------------------------------------------------------------
; int ff_pixelutils_sad_8x8_mmxext(const uint8_t *src1, ptrdiff_t stride1,
;                                  const uint8_t *src2, ptrdiff_t stride2);
;
; Sum of absolute differences over an 8x8 block of bytes.
; Each %rep iteration handles two rows; 4 iterations cover the 8 rows.
; NOTE(review): no emms is issued before returning; presumably the caller is
; expected to restore the x87 state -- confirm against the C side.
;-------------------------------------------------------------------------------
INIT_MMX mmxext
cglobal pixelutils_sad_8x8, 4,4,0, src1, stride1, src2, stride2
    pxor        m2, m2                      ; m2 = running sum
%rep 4
    mova        m0, [src1q]                 ; row n   of src1
    mova        m1, [src1q + stride1q]      ; row n+1 of src1
    psadbw      m0, [src2q]                 ; SAD against row n   of src2
    psadbw      m1, [src2q + stride2q]      ; SAD against row n+1 of src2
    ; 16-bit adds are enough: the total is at most 8*8*255 = 16320
    paddw       m2, m0
    paddw       m2, m1
    lea         src1q, [src1q + 2*stride1q] ; advance both pointers by two rows
    lea         src2q, [src2q + 2*stride2q]
%endrep
    movd        eax, m2                     ; return the accumulated sum
    RET

;-------------------------------------------------------------------------------
; int ff_pixelutils_sad_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1,
;                                  const uint8_t *src2, ptrdiff_t stride2);
;
; Sum of absolute differences over a 16x16 block of bytes.
; Both sources are loaded with movu before psadbw is applied register to
; register. The first two rows initialise the accumulator (m4); the remaining
; 14 rows are handled two at a time by the %rep 7 body.
;-------------------------------------------------------------------------------
INIT_XMM sse2
cglobal pixelutils_sad_16x16, 4,4,5, src1, stride1, src2, stride2
    movu        m4, [src1q]                 ; rows 0/1 of both sources
    movu        m2, [src2q]
    movu        m1, [src1q + stride1q]
    movu        m3, [src2q + stride2q]
    psadbw      m4, m2                      ; m4 doubles as the accumulator
    psadbw      m1, m3
    paddw       m4, m1
%rep 7
    lea         src1q, [src1q + 2*stride1q] ; step to the next pair of rows
    lea         src2q, [src2q + 2*stride2q]
    movu        m0, [src1q]
    movu        m2, [src2q]
    movu        m1, [src1q + stride1q]
    movu        m3, [src2q + stride2q]
    psadbw      m0, m2
    psadbw      m1, m3
    ; 16-bit adds are enough: the total is at most 16*16*255 = 65280
    paddw       m4, m0
    paddw       m4, m1
%endrep
    ; psadbw leaves one partial sum per 64-bit half: fold high into low
    movhlps     m0, m4
    paddw       m4, m0
    movd        eax, m4
    RET

;-------------------------------------------------------------------------------
; int ff_pixelutils_sad_[au]_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1,
;                                       const uint8_t *src2, ptrdiff_t stride2);
;
; %1 is "a" or "u" and selects mova or movu for the src2 loads; it is also
; spliced into the function name. src1 is never loaded explicitly: it is
; consumed as the memory operand of psadbw.
; NOTE(review): a non-VEX psadbw memory operand implies src1 rows are expected
; to be 16-byte aligned in both variants -- confirm against the dispatch code.
;-------------------------------------------------------------------------------
%macro SAD_XMM_16x16 1
INIT_XMM sse2
cglobal pixelutils_sad_%1_16x16, 4,4,3, src1, stride1, src2, stride2
    mov%1       m2, [src2q]                 ; rows 0/1 initialise the sum in m2
    psadbw      m2, [src1q]
    mov%1       m1, [src2q + stride2q]
    psadbw      m1, [src1q + stride1q]
    paddw       m2, m1
%rep 7
    lea         src1q, [src1q + 2*stride1q] ; step to the next pair of rows
    lea         src2q, [src2q + 2*stride2q]
    mov%1       m0, [src2q]
    psadbw      m0, [src1q]
    mov%1       m1, [src2q + stride2q]
    psadbw      m1, [src1q + stride1q]
    ; 16-bit adds are enough: the total is at most 16*16*255 = 65280
    paddw       m2, m0
    paddw       m2, m1
%endrep
    ; fold the high 64-bit partial sum into the low one
    movhlps     m0, m2
    paddw       m2, m0
    movd        eax, m2
    RET
%endmacro

SAD_XMM_16x16 a
SAD_XMM_16x16 u


;-------------------------------------------------------------------------------
; PROCESS_SAD_32x4_U
;
; Adds the SAD of four 32-byte rows to m0, using movu loads for both sources,
; and advances r0 by r1 and r2 by r3 after every row. Clobbers m1-m4.
; The positional names r0..r3 are used here; per the argument order of the
; cglobal declaration that invokes this macro they correspond to
; src1, stride1, src2, stride2.
;-------------------------------------------------------------------------------
%macro PROCESS_SAD_32x4_U 0
%rep 4
    movu        m1, [r2]                    ; src2, bytes  0..15
    movu        m2, [r2 + 16]               ; src2, bytes 16..31
    movu        m3, [r0]                    ; src1, bytes  0..15
    movu        m4, [r0 + 16]               ; src1, bytes 16..31
    psadbw      m1, m3
    psadbw      m2, m4
    paddd       m1, m2
    paddd       m0, m1                      ; accumulate this row
    lea         r2, [r2 + r3]               ; next row of src2
    lea         r0, [r0 + r1]               ; next row of src1
%endrep
%endmacro

;-------------------------------------------------------------------------------
; PROCESS_SAD_32x4 a|u
;
; Same accumulation as PROCESS_SAD_32x4_U, but %1 selects mova or movu for the
; r2 (src2) loads and the r0 (src1) rows are used directly as psadbw memory
; operands. Clobbers m1-m2 and advances r0/r2 by four rows.
;-------------------------------------------------------------------------------
%macro PROCESS_SAD_32x4 1
%rep 4
    mov%1       m1, [r2]                    ; src2, bytes  0..15
    mov%1       m2, [r2 + 16]               ; src2, bytes 16..31
    psadbw      m1, [r0]
    psadbw      m2, [r0 + 16]
    paddd       m1, m2
    paddd       m0, m1                      ; accumulate this row
    lea         r2, [r2 + r3]               ; next row of src2
    lea         r0, [r0 + r1]               ; next row of src1
%endrep
%endmacro

;-----------------------------------------------------------------------------
; int ff_pixelutils_sad_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1,
;                                  const uint8_t *src2, ptrdiff_t stride2);
;
; Sum of absolute differences over a 32x32 block of bytes.
; 4 loop iterations x 2 macro expansions x 4 rows = 32 rows.
; 32-bit adds (paddd) are used because the total can reach
; 32*32*255 = 261120, which does not fit in 16 bits.
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal pixelutils_sad_32x32, 4,5,5, src1, stride1, src2, stride2
    pxor        m0, m0                      ; m0 = running sum
    mov         r4d, 4                      ; loop counter
.loop:
    PROCESS_SAD_32x4_U
    PROCESS_SAD_32x4_U
    dec         r4d
    jnz         .loop

    ; fold the high 64-bit partial sum into the low one
    movhlps     m1, m0
    paddd       m0, m1
    movd        eax, m0
    RET

;-------------------------------------------------------------------------------
; int ff_pixelutils_sad_[au]_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1,
;                                       const uint8_t *src2, ptrdiff_t stride2);
;
; %1 is "a" or "u"; it is forwarded to PROCESS_SAD_32x4 (mova/movu for the
; src2 loads) and spliced into the function name.
; 4 loop iterations x 2 macro expansions x 4 rows = 32 rows.
;-------------------------------------------------------------------------------
%macro SAD_XMM_32x32 1
INIT_XMM sse2
cglobal pixelutils_sad_%1_32x32, 4,5,3, src1, stride1, src2, stride2
    pxor        m0, m0                      ; m0 = running sum
    mov         r4d, 4                      ; loop counter
.loop:
    PROCESS_SAD_32x4 %1
    PROCESS_SAD_32x4 %1
    dec         r4d
    jnz         .loop

    ; fold the high 64-bit partial sum into the low one
    movhlps     m1, m0
    paddd       m0, m1
    movd        eax, m0
    RET
%endmacro

SAD_XMM_32x32 a
SAD_XMM_32x32 u

%if HAVE_AVX2_EXTERNAL
;-------------------------------------------------------------------------------
; int ff_pixelutils_sad_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1,
;                                  const uint8_t *src2, ptrdiff_t stride2);
;
; Sum of absolute differences over a 32x32 block of bytes, one full 32-byte
; row per register. Four rows are processed per loop iteration (8 iterations);
; r5/r6 hold 3*stride so that row 3 can be addressed without extra pointer
; arithmetic.
;-------------------------------------------------------------------------------
INIT_YMM avx2
cglobal pixelutils_sad_32x32, 4,7,5, src1, stride1, src2, stride2
    pxor            m0, m0                  ; m0 = running sum
    mov             r4d, 32/4               ; loop counter: 4 rows per pass
    lea             r5, [stride1q * 3]
    lea             r6, [stride2q * 3]

.loop:
    movu           m1, [src1q]               ; row 0 of pix0
    movu           m2, [src2q]               ; row 0 of pix1
    movu           m3, [src1q + stride1q]    ; row 1 of pix0
    movu           m4, [src2q + stride2q]    ; row 1 of pix1

    psadbw         m1, m2
    psadbw         m3, m4
    paddd          m0, m1
    paddd          m0, m3

    movu           m1, [src1q + 2 * stride1q] ; row 2 of pix0
    movu           m2, [src2q + 2 * stride2q] ; row 2 of pix1
    movu           m3, [src1q + r5]           ; row 3 of pix0
    movu           m4, [src2q + r6]           ; row 3 of pix1

    psadbw         m1, m2
    psadbw         m3, m4
    paddd          m0, m1
    paddd          m0, m3

    lea            src2q,     [src2q + 4 * stride2q]
    lea            src1q,     [src1q + 4 * stride1q]

    dec            r4d
    jnz           .loop

    ; horizontal reduction: upper 128 bits onto lower, then dword 2 onto dword 0
    vextracti128   xm1, m0, 1
    paddd          xm0, xm1
    pshufd         xm1, xm0, 2
    paddd          xm0, xm1
    movd           eax, xm0
    RET

;-------------------------------------------------------------------------------
; int ff_pixelutils_sad_[au]_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1,
;                                       const uint8_t *src2, ptrdiff_t stride2);
;
; %1 is "a" or "u" and selects mova or movu for the src2 loads; it is also
; spliced into the function name. src1 rows are consumed as psadbw memory
; operands. Four rows per loop iteration, 8 iterations.
;-------------------------------------------------------------------------------
%macro SAD_AVX2_32x32 1
INIT_YMM avx2
cglobal pixelutils_sad_%1_32x32, 4,7,3, src1, stride1, src2, stride2
    pxor           m0, m0                   ; m0 = running sum
    mov            r4d, 32/4                ; loop counter: 4 rows per pass
    lea            r5, [stride1q * 3]
    lea            r6, [stride2q * 3]

.loop:
    mov%1          m1, [src2q]                ; row 0 of pix1
    psadbw         m1, [src1q]
    mov%1          m2, [src2q + stride2q]     ; row 1 of pix1
    psadbw         m2, [src1q + stride1q]

    paddd          m0, m1
    paddd          m0, m2

    mov%1          m1, [src2q + 2 * stride2q] ; row 2 of pix1
    psadbw         m1, [src1q + 2 * stride1q]
    mov%1          m2, [src2q + r6]           ; row 3 of pix1
    psadbw         m2, [src1q + r5]

    paddd          m0, m1
    paddd          m0, m2

    lea            src2q,     [src2q + 4 * stride2q]
    lea            src1q,     [src1q + 4 * stride1q]

    dec            r4d
    jnz           .loop

    ; horizontal reduction: upper 128 bits onto lower, then dword 2 onto dword 0
    vextracti128   xm1, m0, 1
    paddd          xm0, xm1
    pshufd         xm1, xm0, 2
    paddd          xm0, xm1
    movd           eax, xm0
    RET
%endmacro

SAD_AVX2_32x32 a
SAD_AVX2_32x32 u
%endif