;*****************************************************************************
;* SIMD-optimized motion compensation estimation
;*****************************************************************************
;* Copyright (c) 2000, 2001 Fabrice Bellard
;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pb_1
cextern pb_80

SECTION .text

; %1 = destination register (receives the word differences)
; %2 = scratch register (clobbered)
; %3 = memory operand of the minuend bytes, %4 = memory operand of the
;      subtrahend bytes
; Computes %1.w[i] = %3.b[i] - %4.b[i] without needing a zero register:
; both operands get the same high byte (%3.b[i]), which cancels in psubw.
%macro DIFF_PIXELS_1 4
    movh            %1, %3
    movh            %2, %4
    punpcklbw       %2, %1          ; %2.w[i] = (%3.b[i] << 8) | %4.b[i]
    punpcklbw       %1, %1          ; %1.w[i] = (%3.b[i] << 8) | %3.b[i]
    psubw           %1, %2          ; high bytes cancel -> %3.b[i] - %4.b[i]
%endmacro

; %1=uint8_t *pix1, %2=uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3
; %6=temporary storage location
; this macro requires $mmsize stack space (aligned) on %6 (except on SSE+x86-64)
;
; Loads 8 consecutive rows and leaves the per-row differences in m0..m7.
; %1 and %2 are advanced by %5 in the middle and restored before the macro
; ends. When m8 is not available, m0 is spilled to [%6] so it can serve as the
; scratch register for the last row.
%macro DIFF_PIXELS_8 6
    DIFF_PIXELS_1   m0, m7, [%1     +%3], [%2     +%3]
    DIFF_PIXELS_1   m1, m7, [%1+%4  +%3], [%2+%4  +%3]
    DIFF_PIXELS_1   m2, m7, [%1+%4*2+%3], [%2+%4*2+%3]
    add             %1, %5
    add             %2, %5
    DIFF_PIXELS_1   m3, m7, [%1     +%3], [%2     +%3]
    DIFF_PIXELS_1   m4, m7, [%1+%4  +%3], [%2+%4  +%3]
    DIFF_PIXELS_1   m5, m7, [%1+%4*2+%3], [%2+%4*2+%3]
    DIFF_PIXELS_1   m6, m7, [%1+%5  +%3], [%2+%5  +%3]
%ifdef m8
    DIFF_PIXELS_1   m7, m8, [%1+%4*4+%3], [%2+%4*4+%3]
%else
    mova          [%6], m0
    DIFF_PIXELS_1   m7, m0, [%1+%4*4+%3], [%2+%4*4+%3]
    mova            m0, [%6]
%endif
    sub             %1, %5
    sub             %2, %5
%endmacro

; Three SUMSUB_BADC passes over the words of m0..m7, pairing registers at
; distance 1, 2 and 4 (SUMSUB_BADC itself comes from x86util.asm).
%macro HADAMARD8 0
    SUMSUB_BADC       w, 0, 1, 2, 3
    SUMSUB_BADC       w, 4, 5, 6, 7
    SUMSUB_BADC       w, 0, 2, 1, 3
    SUMSUB_BADC       w, 4, 6, 5, 7
    SUMSUB_BADC       w, 0, 4, 1, 5
    SUMSUB_BADC       w, 2, 6, 3, 7
%endmacro

; %1 = value register (replaced by its absolute value), %2 = scratch for ABS1,
; %3 = accumulator (unsigned saturating add)
%macro ABS1_SUM 3
    ABS1            %1, %2
    paddusw         %3, %1
%endmacro

; Two-register form of ABS1_SUM: %1/%2 = values, %3/%4 = scratch for ABS2,
; %5/%6 = accumulators (unsigned saturating add)
%macro ABS2_SUM 6
    ABS2            %1, %2, %3, %4
    paddusw         %5, %1
    paddusw         %6, %2
%endmacro

; Sum of absolute values of the words in m0..m7, result in m0 (m1 clobbered).
; Names m8/m9 as ABS2 scratch; the macro argument is unused and only exists so
; that this macro is call-compatible with ABS_SUM_8x8_32.
%macro ABS_SUM_8x8_64 1
    ABS2            m0, m1, m8, m9
    ABS2_SUM        m2, m3, m8, m9, m0, m1
    ABS2_SUM        m4, m5, m8, m9, m0, m1
    ABS2_SUM        m6, m7, m8, m9, m0, m1
    paddusw         m0, m1
%endmacro

; Same result as ABS_SUM_8x8_64 using only m0..m7.
; %1 = address of mmsize bytes of scratch memory: m7 is parked there so the
;      register can be used as the ABS1 scratch, then reloaded into m2.
%macro ABS_SUM_8x8_32 1
    mova          [%1], m7
    ABS1            m0, m7
    ABS1            m1, m7
    ABS1_SUM        m2, m7, m0
    ABS1_SUM        m3, m7, m1
    ABS1_SUM        m4, m7, m0
    ABS1_SUM        m5, m7, m1
    ABS1_SUM        m6, m7, m0
    mova            m2, [%1]        ; the original m7
    ABS1_SUM        m2, m7, m1
    paddusw         m0, m1
%endmacro

; FIXME: HSUM saturates at 64k, while an 8x8 hadamard or dct block can get up to
; about 100k on extreme inputs. But that's very unlikely to occur in natural video,
; and it's even more unlikely to not have any alternative mvs/modes with lower cost.
;
; %1 = register holding the words to sum (clobbered), %2 = scratch register,
; %3 = 32-bit destination GPR. Only the low word of %3 is the sum; callers mask
; the result with 0xFFFF.
%macro HSUM 3
%if cpuflag(sse2)
    movhlps         %2, %1
    paddusw         %1, %2
    pshuflw         %2, %1, 0xE
    paddusw         %1, %2
    pshuflw         %2, %1, 0x1
    paddusw         %1, %2
    movd            %3, %1
%elif cpuflag(mmxext)
    pshufw          %2, %1, 0xE
    paddusw         %1, %2
    pshufw          %2, %1, 0x1
    paddusw         %1, %2
    movd            %3, %1
%elif cpuflag(mmx)
    mova            %2, %1
    psrlq           %1, 32
    paddusw         %1, %2
    mova            %2, %1
    psrlq           %1, 16
    paddusw         %1, %2
    movd            %3, %1
%endif
%endmacro

; Store registers %2..%5 to four consecutive mmsize slots starting at [%1]
%macro STORE4 5
    mova [%1+mmsize*0], %2
    mova [%1+mmsize*1], %3
    mova [%1+mmsize*2], %4
    mova [%1+mmsize*3], %5
%endmacro

; Load registers %2..%5 from four consecutive mmsize slots starting at [%1]
%macro LOAD4 5
    mova            %2, [%1+mmsize*0]
    mova            %3, [%1+mmsize*1]
    mova            %4, [%1+mmsize*2]
    mova            %5, [%1+mmsize*3]
%endmacro

; Emits the public entry points around the hadamard8x8_diff helper.
; %1 = xmm register count handed to cglobal
; %2 = number of mmsize-sized stack slots reserved for the helper when m8 is
;      not available (the helper addresses them through rsp+gprsize)
; NOTE(review): the "(4+stack_offset)&(mmsize-1)" term presumably aligns the
; reserved area for mova - confirm against x86inc's stack_offset semantics.
%macro hadamard8_16_wrapper 2
cglobal hadamard8_diff, 4, 4, %1
%ifndef m8
    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
    SUB            rsp, pad
%endif
    call hadamard8x8_diff %+ SUFFIX
%ifndef m8
    ADD            rsp, pad
%endif
    RET

; Sums the helper result over two 8x8 blocks side by side, plus the two
; blocks 8 rows below when h (r4d) == 16. r5d is the running total.
cglobal hadamard8_diff16, 5, 6, %1
%ifndef m8
    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
    SUB            rsp, pad
%endif

    call hadamard8x8_diff %+ SUFFIX
    mov            r5d, eax

    add             r1, 8
    add             r2, 8
    call hadamard8x8_diff %+ SUFFIX
    add            r5d, eax

    cmp            r4d, 16
    jne .done

    lea             r1, [r1+r3*8-8]
    lea             r2, [r2+r3*8-8]
    call hadamard8x8_diff %+ SUFFIX
    add            r5d, eax

    add             r1, 8
    add             r2, 8
    call hadamard8x8_diff %+ SUFFIX
    add            r5d, eax

.done:
    mov            eax, r5d
%ifndef m8
    ADD            rsp, pad
%endif
    RET
%endmacro

; Emits hadamard8x8_diff<SUFFIX> (internal helper, result in eax) followed by
; the public wrappers. The optional %1 is the xmm register count forwarded to
; hadamard8_16_wrapper on the SSE2+ path.
; In both paths r1 = src1, r2 = src2, r3 = stride; r0 is overwritten with
; stride*3.
%macro HADAMARD8_DIFF 0-1
%if cpuflag(sse2)
hadamard8x8_diff %+ SUFFIX:
    lea                          r0, [r3*3]
    DIFF_PIXELS_8                r1, r2,  0, r3, r0, rsp+gprsize
    HADAMARD8
%if ARCH_X86_64
    TRANSPOSE8x8W                 0,  1,  2,  3,  4,  5,  6,  7,  8
%else
    TRANSPOSE8x8W                 0,  1,  2,  3,  4,  5,  6,  7, [rsp+gprsize], [rsp+mmsize+gprsize]
%endif
    HADAMARD8
    ABS_SUM_8x8         rsp+gprsize
    HSUM                        m0, m1, eax
    and                         eax, 0xFFFF
    ret

hadamard8_16_wrapper %1, 3
%elif cpuflag(mmx)
ALIGN 16
; int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,
;                               uint8_t *src2, ptrdiff_t stride, int h)
; r0 = void *s = unused, int h = unused (always 8)
; note how r1, r2 and r3 are not clobbered in this function, so 16x16
; can simply call this 2x2x (and that's why we access rsp+gprsize
; everywhere, which is rsp of calling func)
hadamard8x8_diff %+ SUFFIX:
    lea                          r0, [r3*3]

    ; first 4x8 pixels
    DIFF_PIXELS_8                r1, r2,  0, r3, r0, rsp+gprsize+0x60
    HADAMARD8
    mova         [rsp+gprsize+0x60], m7
    TRANSPOSE4x4W                 0,  1,  2,  3,  7
    STORE4              rsp+gprsize, m0, m1, m2, m3
    mova                         m7, [rsp+gprsize+0x60]
    TRANSPOSE4x4W                 4,  5,  6,  7,  0
    STORE4         rsp+gprsize+0x40, m4, m5, m6, m7

    ; second 4x8 pixels
    DIFF_PIXELS_8                r1, r2,  4, r3, r0, rsp+gprsize+0x60
    HADAMARD8
    mova         [rsp+gprsize+0x60], m7
    TRANSPOSE4x4W                 0,  1,  2,  3,  7
    STORE4         rsp+gprsize+0x20, m0, m1, m2, m3
    mova                         m7, [rsp+gprsize+0x60]
    TRANSPOSE4x4W                 4,  5,  6,  7,  0

    LOAD4          rsp+gprsize+0x40, m0, m1, m2, m3
    HADAMARD8
    ABS_SUM_8x8_32 rsp+gprsize+0x60
    mova         [rsp+gprsize+0x60], m0

    LOAD4          rsp+gprsize     , m0, m1, m2, m3
    LOAD4          rsp+gprsize+0x20, m4, m5, m6, m7
    HADAMARD8
    ABS_SUM_8x8_32 rsp+gprsize
    paddusw                      m0, [rsp+gprsize+0x60]

    HSUM                         m0, m1, eax
    ; eax instead of rax: HSUM wrote eax via movd, so the 32-bit mask gives the
    ; same value and matches the SSE2 path above
    and                         eax, 0xFFFF
    ret

hadamard8_16_wrapper 0, 14
%endif
%endmacro

%if HAVE_ALIGNED_STACK == 0
INIT_MMX mmxext
HADAMARD8_DIFF
%endif

INIT_XMM sse2
%if ARCH_X86_64
%define ABS_SUM_8x8 ABS_SUM_8x8_64
%else
%define ABS_SUM_8x8 ABS_SUM_8x8_32
%endif
HADAMARD8_DIFF 10

INIT_XMM ssse3
; NOTE(review): the _64 variant is selected unconditionally here, presumably
; because ABS2 ignores its scratch operands (m8/m9) with SSSE3 - confirm in
; x86util.asm.
%define ABS_SUM_8x8 ABS_SUM_8x8_64
HADAMARD8_DIFF 9

; int ff_sse*_*(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
;               ptrdiff_t line_size, int h)
;
; %1 = block width in bytes (8 or 16).
; When %1 == mmsize each iteration handles two rows (h is halved up front);
; otherwise (width 16 with 8-byte registers) each iteration handles the two
; 8-byte halves of a single row.
; The loop is a do-while (dec/jnz), so the halved h must be non-zero on entry.

%macro SUM_SQUARED_ERRORS 1
cglobal sse%1, 5,5,8, v, pix1, pix2, lsize, h
%if %1 == mmsize
    shr       hd, 1
%endif
    pxor      m0, m0         ; mm0 = 0
    pxor      m7, m7         ; mm7 holds the sum

.next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned
    movu      m1, [pix1q]    ; m1 = pix1[0][0-15], [0-7] for mmx
    movu      m2, [pix2q]    ; m2 = pix2[0][0-15], [0-7] for mmx
%if %1 == mmsize
    movu      m3, [pix1q+lsizeq] ; m3 = pix1[1][0-15], [0-7] for mmx
    movu      m4, [pix2q+lsizeq] ; m4 = pix2[1][0-15], [0-7] for mmx
%else  ; %1 / 2 == mmsize; mmx only
    mova      m3, [pix1q+8]  ; m3 = pix1[0][8-15]
    mova      m4, [pix2q+8]  ; m4 = pix2[0][8-15]
%endif

    ; todo: mm1-mm2, mm3-mm4
    ; algo: subtract mm1 from mm2 with saturation and vice versa
    ;       OR the result to get the absolute difference
    mova      m5, m1
    mova      m6, m3
    psubusb   m1, m2
    psubusb   m3, m4
    psubusb   m2, m5
    psubusb   m4, m6

    por       m2, m1
    por       m4, m3

    ; now convert to 16-bit vectors so we can square them
    mova      m1, m2
    mova      m3, m4

    punpckhbw m2, m0
    punpckhbw m4, m0
    punpcklbw m1, m0         ; mm1 now spread over (mm1,mm2)
    punpcklbw m3, m0         ; mm4 now spread over (mm3,mm4)

    pmaddwd   m2, m2
    pmaddwd   m4, m4
    pmaddwd   m1, m1
    pmaddwd   m3, m3

    paddd     m1, m2
    paddd     m3, m4
    paddd     m7, m1
    paddd     m7, m3

%if %1 == mmsize
    lea    pix1q, [pix1q + 2*lsizeq]
    lea    pix2q, [pix2q + 2*lsizeq]
%else
    add    pix1q, lsizeq
    add    pix2q, lsizeq
%endif
    dec       hd
    jnz .next2lines

    HADDD     m7, m1
    movd     eax, m7         ; return value
    RET
%endmacro

INIT_MMX mmx
SUM_SQUARED_ERRORS 8

INIT_MMX mmx
SUM_SQUARED_ERRORS 16

INIT_XMM sse2
SUM_SQUARED_ERRORS 16

;-----------------------------------------------
;int ff_sum_abs_dctelem(int16_t *block)
;-----------------------------------------------
; %1 = number of xmm registers used
; %2 = number of inline loops
;
; Each inline loop consumes 4*mmsize bytes of block, so %2*4*mmsize bytes are
; read in total, with aligned loads. Absolute values are accumulated with
; unsigned saturation in m0/m1; the return value is limited to 16 bits.

%macro SUM_ABS_DCTELEM 2
cglobal sum_abs_dctelem, 1, 1, %1, block
    pxor    m0, m0
    pxor    m1, m1
%assign %%i 0
%rep %2
    mova      m2, [blockq+mmsize*(0+%%i)]
    mova      m3, [blockq+mmsize*(1+%%i)]
    mova      m4, [blockq+mmsize*(2+%%i)]
    mova      m5, [blockq+mmsize*(3+%%i)]
    ABS1_SUM  m2, m6, m0
    ABS1_SUM  m3, m6, m1
    ABS1_SUM  m4, m6, m0
    ABS1_SUM  m5, m6, m1
%assign %%i %%i+4
%endrep
    paddusw m0, m1
    HSUM    m0, m1, eax
    and     eax, 0xFFFF
    RET
%endmacro

INIT_XMM sse2
SUM_ABS_DCTELEM 7, 2
INIT_XMM ssse3
SUM_ABS_DCTELEM 6, 2

;------------------------------------------------------------------------------
; int ff_hf_noise*_mmx(uint8_t *pix1, ptrdiff_t lsize, int h)
;------------------------------------------------------------------------------
; %1 = 8/16. %2-5=m#
; Loads one row at [pix1q] and leaves the horizontal neighbour differences
; p[x] - p[x+1] as words in m%2 (low half) and m%4 (high half).
; m%3 and m%5 are scratch. Requires m7 == 0.
; For %1 == 8 the neighbour row is produced by shifting the loaded row, and the
; last byte of both operands is cleared so it contributes a zero difference.
%macro HF_NOISE_PART1 5
    mova      m%2, [pix1q]
%if %1 == 8
    mova      m%3, m%2
    psllq     m%2, 8
    psrlq     m%3, 8
    psrlq     m%2, 8
%else
    mova      m%3, [pix1q+1]
%endif
    mova      m%4, m%2
    mova      m%5, m%3
    punpcklbw m%2, m7
    punpcklbw m%3, m7
    punpckhbw m%4, m7
    punpckhbw m%5, m7
    psubw     m%2, m%3
    psubw     m%4, m%5
%endmacro

; %1-4 = m#
; m%1/m%2 = low/high difference words of one row (clobbered),
; m%3/m%4 = low/high difference words of the neighbouring row (preserved).
; Adds |m%1 - m%3| + |m%2 - m%4| to the word accumulator m6.
; Clobbers m1 and m3, which hold the sign masks for the absolute value.
%macro HF_NOISE_PART2 4
    psubw     m%1, m%3
    psubw     m%2, m%4
    pxor       m3, m3
    pxor       m1, m1
    pcmpgtw    m3, m%1          ; m3 = all-ones where m%1 < 0
    pcmpgtw    m1, m%2
    pxor      m%1, m3           ; (x ^ mask) - mask == |x|
    pxor      m%2, m1
    psubw     m%1, m3
    psubw     m%2, m1
    paddw     m%2, m%1
    paddw      m6, m%2
%endmacro

; %1 = 8/16
; The first row pair is handled before the loop and the loop only terminates
; when the counter reaches exactly zero (sub 2 / jne), so h must be even and
; at least 4.
%macro HF_NOISE 1
cglobal hf_noise%1, 3,3,0, pix1, lsize, h
    sub        hd, 2
    pxor       m7, m7           ; zero, for byte->word unpacking
    pxor       m6, m6           ; word accumulator
    HF_NOISE_PART1 %1, 0, 1, 2, 3
    add     pix1q, lsizeq
    HF_NOISE_PART1 %1, 4, 1, 5, 3
    HF_NOISE_PART2     0, 2, 4, 5
    add     pix1q, lsizeq
.loop:
    HF_NOISE_PART1 %1, 0, 1, 2, 3
    HF_NOISE_PART2     4, 5, 0, 2
    add     pix1q, lsizeq
    HF_NOISE_PART1 %1, 4, 1, 5, 3
    HF_NOISE_PART2     0, 2, 4, 5
    add     pix1q, lsizeq
    sub        hd, 2
        jne .loop

    ; widen the four word sums to dwords and add them together
    mova       m0, m6
    punpcklwd  m0, m7
    punpckhwd  m6, m7
    paddd      m6, m0
    mova       m0, m6
    psrlq      m6, 32
    paddd      m0, m6
    movd      eax, m0   ; eax = result of hf_noise8;
    REP_RET                 ; return eax;
%endmacro

INIT_MMX mmx
HF_NOISE 8
HF_NOISE 16

;---------------------------------------------------------------------------------------
;int ff_sad_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
;---------------------------------------------------------------------------------------
;%1 = 8/16
; Sum of absolute differences between pix1 and pix2, accumulated in m2.
; pix2 is read with unaligned loads; pix1 is used directly as a psadbw memory
; operand.
; Two rows are handled before the loop and the loop body runs at least once
; (sub 2 / jg), so at least four rows are always read: h is expected to be even
; and >= 4.
%macro SAD 1
cglobal sad%1, 5, 5, 3, v, pix1, pix2, stride, h
    movu      m2, [pix2q]
    movu      m1, [pix2q+strideq]
    psadbw    m2, [pix1q]
    psadbw    m1, [pix1q+strideq]
    paddw     m2, m1
%if %1 != mmsize
    ; width 16 with 8-byte registers: second half of both rows
    movu      m0, [pix2q+8]
    movu      m1, [pix2q+strideq+8]
    psadbw    m0, [pix1q+8]
    psadbw    m1, [pix1q+strideq+8]
    paddw     m2, m0
    paddw     m2, m1
%endif
    sub       hd, 2

align 16
.loop:
    lea    pix1q, [pix1q+strideq*2]
    lea    pix2q, [pix2q+strideq*2]
    movu      m0, [pix2q]
    movu      m1, [pix2q+strideq]
    psadbw    m0, [pix1q]
    psadbw    m1, [pix1q+strideq]
    paddw     m2, m0
    paddw     m2, m1
%if %1 != mmsize
    movu      m0, [pix2q+8]
    movu      m1, [pix2q+strideq+8]
    psadbw    m0, [pix1q+8]
    psadbw    m1, [pix1q+strideq+8]
    paddw     m2, m0
    paddw     m2, m1
%endif
    sub       hd, 2
    jg .loop
%if mmsize == 16
    ; psadbw leaves one partial sum per 64-bit half; fold the upper one in
    movhlps   m0, m2
    paddw     m2, m0
%endif
    movd     eax, m2
    RET
%endmacro

INIT_MMX mmxext
SAD 8
SAD 16
INIT_XMM sse2
SAD 16

;------------------------------------------------------------------------------------------
;int ff_sad_x2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
;------------------------------------------------------------------------------------------
;%1 = 8/16
; SAD between pix1 and the horizontal half-pel interpolation of pix2, i.e.
; pavgb(pix2[x], pix2[x+1]). The sum is accumulated in m0.
; With 16-byte registers the x+1 row is loaded with movu first, because it is
; not aligned; with 8-byte registers it is used as a pavgb memory operand.
; Two rows are handled before the loop and the loop body runs at least once
; (sub 2 / jg): h is expected to be even and >= 4.
%macro SAD_X2 1
cglobal sad%1_x2, 5, 5, 5, v, pix1, pix2, stride, h
    movu      m0, [pix2q]
    movu      m2, [pix2q+strideq]
%if mmsize == 16
    movu      m3, [pix2q+1]
    movu      m4, [pix2q+strideq+1]
    pavgb     m0, m3
    pavgb     m2, m4
%else
    pavgb     m0, [pix2q+1]
    pavgb     m2, [pix2q+strideq+1]
%endif
    psadbw    m0, [pix1q]
    psadbw    m2, [pix1q+strideq]
    paddw     m0, m2
%if %1 != mmsize
    ; width 16 with 8-byte registers: second half of both rows
    movu      m1, [pix2q+8]
    movu      m2, [pix2q+strideq+8]
    pavgb     m1, [pix2q+9]
    pavgb     m2, [pix2q+strideq+9]
    psadbw    m1, [pix1q+8]
    psadbw    m2, [pix1q+strideq+8]
    paddw     m0, m1
    paddw     m0, m2
%endif
    sub       hd, 2

align 16
.loop:
    lea    pix1q, [pix1q+2*strideq]
    lea    pix2q, [pix2q+2*strideq]
    movu      m1, [pix2q]
    movu      m2, [pix2q+strideq]
%if mmsize == 16
    movu      m3, [pix2q+1]
    movu      m4, [pix2q+strideq+1]
    pavgb     m1, m3
    pavgb     m2, m4
%else
    pavgb     m1, [pix2q+1]
    pavgb     m2, [pix2q+strideq+1]
%endif
    psadbw    m1, [pix1q]
    psadbw    m2, [pix1q+strideq]
    paddw     m0, m1
    paddw     m0, m2
%if %1 != mmsize
    movu      m1, [pix2q+8]
    movu      m2, [pix2q+strideq+8]
    pavgb     m1, [pix2q+9]
    pavgb     m2, [pix2q+strideq+9]
    psadbw    m1, [pix1q+8]
    psadbw    m2, [pix1q+strideq+8]
    paddw     m0, m1
    paddw     m0, m2
%endif
    sub       hd, 2
    jg .loop
%if mmsize == 16
    ; fold the partial sum of the upper 64-bit half into the lower one
    movhlps   m1, m0
    paddw     m0, m1
%endif
    movd     eax, m0
    RET
%endmacro

INIT_MMX mmxext
SAD_X2 8
SAD_X2 16
INIT_XMM sse2
SAD_X2 16

;------------------------------------------------------------------------------------------
;int ff_sad_y2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
;------------------------------------------------------------------------------------------
;%1 = 8/16
; SAD between pix1 and the vertical half-pel interpolation of pix2, i.e.
; pavgb(pix2 row y, pix2 row y+1). The sum is accumulated in m0.
; The last pix2 row loaded is carried into the next step in m1 (and in m4 for
; the second 8-byte half when %1 != mmsize), so each pix2 row is loaded once.
; pix2q is advanced by one extra stride before the loop, so inside the loop
; [pix2q] is the row below the pix1 row being compared.
; Rows 0..h of pix2 are read. The loop body runs at least once (sub 2 / jg):
; h is expected to be even and >= 4.
%macro SAD_Y2 1
cglobal sad%1_y2, 5, 5, 4, v, pix1, pix2, stride, h
    movu      m1, [pix2q]
    movu      m0, [pix2q+strideq]
    movu      m3, [pix2q+2*strideq]
    pavgb     m1, m0
    pavgb     m0, m3
    psadbw    m1, [pix1q]
    psadbw    m0, [pix1q+strideq]
    paddw     m0, m1
    mova      m1, m3            ; carry row 2 into the loop
%if %1 != mmsize
    movu      m4, [pix2q+8]
    movu      m5, [pix2q+strideq+8]
    movu      m6, [pix2q+2*strideq+8]
    pavgb     m4, m5
    pavgb     m5, m6
    psadbw    m4, [pix1q+8]
    psadbw    m5, [pix1q+strideq+8]
    paddw     m0, m4
    paddw     m0, m5
    mova      m4, m6            ; carry row 2 (second half) into the loop
%endif
    add    pix2q, strideq
    sub       hd, 2

align 16
.loop:
    lea    pix1q, [pix1q+2*strideq]
    lea    pix2q, [pix2q+2*strideq]
    movu      m2, [pix2q]
    movu      m3, [pix2q+strideq]
    pavgb     m1, m2
    pavgb     m2, m3
    psadbw    m1, [pix1q]
    psadbw    m2, [pix1q+strideq]
    paddw     m0, m1
    paddw     m0, m2
    mova      m1, m3
%if %1 != mmsize
    movu      m5, [pix2q+8]
    movu      m6, [pix2q+strideq+8]
    pavgb     m4, m5
    pavgb     m5, m6
    psadbw    m4, [pix1q+8]
    psadbw    m5, [pix1q+strideq+8]
    paddw     m0, m4
    paddw     m0, m5
    mova      m4, m6
%endif
    sub       hd, 2
    jg .loop
%if mmsize == 16
    ; fold the partial sum of the upper 64-bit half into the lower one
    movhlps   m1, m0
    paddw     m0, m1
%endif
    movd     eax, m0
    RET
%endmacro

INIT_MMX mmxext
SAD_Y2 8
SAD_Y2 16
INIT_XMM sse2
SAD_Y2 16

;-------------------------------------------------------------------------------------------
;int ff_sad_approx_xy2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
;-------------------------------------------------------------------------------------------
;%1 = 8/16
; SAD between pix1 and an approximate diagonal half-pel interpolation of pix2:
; every pix2 row is first averaged with itself shifted by one byte, then
; vertically adjacent averaged rows are averaged again. m4 = pb_1 is subtracted
; (with unsigned saturation) from every second averaged row before the vertical
; step.
; NOTE(review): the subtraction presumably offsets the upward rounding of the
; two chained pavgb steps - confirm against the C reference.
; The last averaged row is carried in m1 (and in m5 for the second 8-byte half
; when %1 != mmsize; that path only exists with 8-byte registers, where the
; first half does not touch m5).
; Rows 0..h of pix2 are read. The loop body runs at least once (sub 2 / jg):
; h is expected to be even and >= 4.
%macro SAD_APPROX_XY2 1
cglobal sad%1_approx_xy2, 5, 5, 7, v, pix1, pix2, stride, h
    mova      m4, [pb_1]
    movu      m1, [pix2q]
    movu      m0, [pix2q+strideq]
    movu      m3, [pix2q+2*strideq]
%if mmsize == 16
    movu      m5, [pix2q+1]
    movu      m6, [pix2q+strideq+1]
    movu      m2, [pix2q+2*strideq+1]
    pavgb     m1, m5
    pavgb     m0, m6
    pavgb     m3, m2
%else
    pavgb     m1, [pix2q+1]
    pavgb     m0, [pix2q+strideq+1]
    pavgb     m3, [pix2q+2*strideq+1]
%endif
    psubusb   m0, m4
    pavgb     m1, m0
    pavgb     m0, m3
    psadbw    m1, [pix1q]
    psadbw    m0, [pix1q+strideq]
    paddw     m0, m1
    mova      m1, m3            ; carry averaged row 2 into the loop
%if %1 != mmsize
    movu      m5, [pix2q+8]
    movu      m6, [pix2q+strideq+8]
    movu      m7, [pix2q+2*strideq+8]
    pavgb     m5, [pix2q+1+8]
    pavgb     m6, [pix2q+strideq+1+8]
    pavgb     m7, [pix2q+2*strideq+1+8]
    psubusb   m6, m4
    pavgb     m5, m6
    pavgb     m6, m7
    psadbw    m5, [pix1q+8]
    psadbw    m6, [pix1q+strideq+8]
    paddw     m0, m5
    paddw     m0, m6
    mova      m5, m7            ; carry averaged row 2 (second half)
%endif
    add    pix2q, strideq
    sub       hd, 2

align 16
.loop:
    lea    pix1q, [pix1q+2*strideq]
    lea    pix2q, [pix2q+2*strideq]
    movu      m2, [pix2q]
    movu      m3, [pix2q+strideq]
%if mmsize == 16
    movu      m5, [pix2q+1]
    movu      m6, [pix2q+strideq+1]
    pavgb     m2, m5
    pavgb     m3, m6
%else
    pavgb     m2, [pix2q+1]
    pavgb     m3, [pix2q+strideq+1]
%endif
    psubusb   m2, m4
    pavgb     m1, m2
    pavgb     m2, m3
    psadbw    m1, [pix1q]
    psadbw    m2, [pix1q+strideq]
    paddw     m0, m1
    paddw     m0, m2
    mova      m1, m3
%if %1 != mmsize
    movu      m6, [pix2q+8]
    movu      m7, [pix2q+strideq+8]
    pavgb     m6, [pix2q+8+1]
    pavgb     m7, [pix2q+strideq+8+1]
    psubusb   m6, m4
    pavgb     m5, m6
    pavgb     m6, m7
    psadbw    m5, [pix1q+8]
    psadbw    m6, [pix1q+strideq+8]
    paddw     m0, m5
    paddw     m0, m6
    mova      m5, m7
%endif
    sub       hd, 2
    jg .loop
%if mmsize == 16
    ; fold the partial sum of the upper 64-bit half into the lower one
    movhlps   m1, m0
    paddw     m0, m1
%endif
    movd     eax, m0
    RET
%endmacro

INIT_MMX mmxext
SAD_APPROX_XY2 8
SAD_APPROX_XY2 16
INIT_XMM sse2
SAD_APPROX_XY2 16

;--------------------------------------------------------------------
;int ff_vsad_intra(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
;                  ptrdiff_t line_size, int h);
;--------------------------------------------------------------------
; %1 = 8/16
%macro VSAD_INTRA 1
cglobal vsad_intra%1, 5, 5, 3, v, pix1, pix2, lsize, h
    mova      m0, [pix1q]
%if %1 == mmsize
    mova      m2, [pix1q+lsizeq]
    psadbw    m0, m2
%else
    mova      m2, [pix1q+lsizeq]
    mova      m3, [pix1q+8]
    mova      m4,
[pix1q+lsizeq+8] 786cabdff1aSopenharmony_ci psadbw m0, m2 787cabdff1aSopenharmony_ci psadbw m3, m4 788cabdff1aSopenharmony_ci paddw m0, m3 789cabdff1aSopenharmony_ci%endif 790cabdff1aSopenharmony_ci sub hd, 2 791cabdff1aSopenharmony_ci 792cabdff1aSopenharmony_ci.loop: 793cabdff1aSopenharmony_ci lea pix1q, [pix1q + 2*lsizeq] 794cabdff1aSopenharmony_ci%if %1 == mmsize 795cabdff1aSopenharmony_ci mova m1, [pix1q] 796cabdff1aSopenharmony_ci psadbw m2, m1 797cabdff1aSopenharmony_ci paddw m0, m2 798cabdff1aSopenharmony_ci mova m2, [pix1q+lsizeq] 799cabdff1aSopenharmony_ci psadbw m1, m2 800cabdff1aSopenharmony_ci paddw m0, m1 801cabdff1aSopenharmony_ci%else 802cabdff1aSopenharmony_ci mova m1, [pix1q] 803cabdff1aSopenharmony_ci mova m3, [pix1q+8] 804cabdff1aSopenharmony_ci psadbw m2, m1 805cabdff1aSopenharmony_ci psadbw m4, m3 806cabdff1aSopenharmony_ci paddw m0, m2 807cabdff1aSopenharmony_ci paddw m0, m4 808cabdff1aSopenharmony_ci mova m2, [pix1q+lsizeq] 809cabdff1aSopenharmony_ci mova m4, [pix1q+lsizeq+8] 810cabdff1aSopenharmony_ci psadbw m1, m2 811cabdff1aSopenharmony_ci psadbw m3, m4 812cabdff1aSopenharmony_ci paddw m0, m1 813cabdff1aSopenharmony_ci paddw m0, m3 814cabdff1aSopenharmony_ci%endif 815cabdff1aSopenharmony_ci sub hd, 2 816cabdff1aSopenharmony_ci jg .loop 817cabdff1aSopenharmony_ci 818cabdff1aSopenharmony_ci%if mmsize == 16 819cabdff1aSopenharmony_ci pshufd m1, m0, 0xe 820cabdff1aSopenharmony_ci paddd m0, m1 821cabdff1aSopenharmony_ci%endif 822cabdff1aSopenharmony_ci movd eax, m0 823cabdff1aSopenharmony_ci RET 824cabdff1aSopenharmony_ci%endmacro 825cabdff1aSopenharmony_ci 826cabdff1aSopenharmony_ciINIT_MMX mmxext 827cabdff1aSopenharmony_ciVSAD_INTRA 8 828cabdff1aSopenharmony_ciVSAD_INTRA 16 829cabdff1aSopenharmony_ciINIT_XMM sse2 830cabdff1aSopenharmony_ciVSAD_INTRA 16 831cabdff1aSopenharmony_ci 832cabdff1aSopenharmony_ci;--------------------------------------------------------------------- 833cabdff1aSopenharmony_ci;int ff_vsad_approx(MpegEncContext *v, 
;                   uint8_t *pix1, uint8_t *pix2,
;                   ptrdiff_t line_size, int h);
;---------------------------------------------------------------------
; %1 = block width in bytes (8/16)
;
; The symbol emitted by cglobal is vsad%1_approx (width-specific).
;
; For each row the byte-wise difference pix1 - pix2 is formed with psubb (which
; wraps modulo 256) and XORed with pb_80; the result is the sum of the SADs
; between these biased difference rows for every pair of vertically adjacent rows.
; NOTE(review): pb_80 is external to this file; presumably it holds 0x80 in every
; byte, so the XOR maps the signed byte difference onto an unsigned scale that
; psadbw can compare - confirm.  The wrap-around of psubb is presumably what
; makes this variant "approx".
;
; v is not referenced.  Two rows are consumed per iteration and the loop body
; runs once before h is tested - callers presumably pass an even h >= 4
; (TODO confirm).  pix1 is loaded with mova (16-byte aligned rows required in
; the XMM build); pix2 is loaded with movu there and may be unaligned.
;
; Register roles:
;   m0 = running accumulator
;   m1 = pb_80 (reused as scratch in the final reduction)
;   m4 = biased difference of the most recent row (left half)
;   m5 = same for bytes 8..15 (vsad16_mmxext only)
;   m2, m3 = scratch
%macro VSAD_APPROX 1
cglobal vsad%1_approx, 5, 5, 5, v, pix1, pix2, lsize, h
    mova      m1, [pb_80]
    mova      m0, [pix1q]
%if %1 == mmsize ; vsad8_mmxext, vsad16_sse2
    mova      m4, [pix1q+lsizeq]
%if mmsize == 16
    ; XMM psubb cannot take an unaligned memory operand, so load pix2 explicitly
    movu      m3, [pix2q]
    movu      m2, [pix2q+lsizeq]
    psubb     m0, m3
    psubb     m4, m2
%else
    psubb     m0, [pix2q]
    psubb     m4, [pix2q+lsizeq]
%endif
    pxor      m0, m1                    ; biased difference, row 0
    pxor      m4, m1                    ; biased difference, row 1 (kept for the loop)
    psadbw    m0, m4
%else ; vsad16_mmxext
    mova      m3, [pix1q+8]
    psubb     m0, [pix2q]
    psubb     m3, [pix2q+8]
    pxor      m0, m1
    pxor      m3, m1
    mova      m4, [pix1q+lsizeq]
    mova      m5, [pix1q+lsizeq+8]
    psubb     m4, [pix2q+lsizeq]
    psubb     m5, [pix2q+lsizeq+8]
    pxor      m4, m1
    pxor      m5, m1
    psadbw    m0, m4
    psadbw    m3, m5
    paddw     m0, m3
%endif
    sub       hd, 2

.loop:
    lea       pix1q, [pix1q + 2*lsizeq]
    lea       pix2q, [pix2q + 2*lsizeq]
    mova      m2, [pix1q]
%if %1 == mmsize ; vsad8_mmxext, vsad16_sse2
%if mmsize == 16
    movu      m3, [pix2q]
    psubb     m2, m3
%else
    psubb     m2, [pix2q]
%endif
    pxor      m2, m1
    psadbw    m4, m2                    ; previous row vs. first new row
    paddw     m0, m4
    mova      m4, [pix1q+lsizeq]
    movu      m3, [pix2q+lsizeq]
    psubb     m4, m3
    pxor      m4, m1
    psadbw    m2, m4                    ; first new row vs. second new row
    paddw     m0, m2
%else ; vsad16_mmxext
    mova      m3, [pix1q+8]
    psubb     m2, [pix2q]
    psubb     m3, [pix2q+8]
    pxor      m2, m1
    pxor      m3, m1
    psadbw    m4, m2
    psadbw    m5, m3
    paddw     m0, m4
    paddw     m0, m5
    mova      m4, [pix1q+lsizeq]
    mova      m5, [pix1q+lsizeq+8]
    psubb     m4, [pix2q+lsizeq]
    psubb     m5, [pix2q+lsizeq+8]
    pxor      m4, m1
    pxor      m5, m1
    psadbw    m2, m4
    psadbw    m3, m5
    paddw     m0, m2
    paddw     m0, m3
%endif
    sub       hd, 2
    jg .loop

%if mmsize == 16
    pshufd    m1, m0, 0xe               ; upper 64-bit partial sum -> low qword of m1
    paddd     m0, m1
%endif
    movd      eax, m0
    RET
%endmacro

INIT_MMX mmxext
VSAD_APPROX 8
VSAD_APPROX 16
INIT_XMM sse2
VSAD_APPROX 16