;******************************************************************************
;*
;* Copyright (c) 2000-2001 Fabrice Bellard <fabrice@bellard.org>
;* Copyright (c)      Nick Kurshev <nickols_k@mail.ru>
;* Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (c) 2002 Zdenek Kabelac <kabi@informatics.muni.cz>
;* Copyright (c) 2013 Daniel Kang
;*
;* SIMD-optimized halfpel functions
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; Conventions used throughout this file (following the prototypes quoted above
; each function and the usual x86inc argument mapping):
;   r0  = block      destination, written with aligned stores (mova)
;   r1  = pixels     source, read with unaligned loads where movu is used
;   r2  = line_size  stride in bytes, shared by source and destination
;   r3d = h          number of rows; every loop counts it down with
;                    "sub r3d, N / jne|jnz .loop", so h has to be a non-zero
;                    multiple of N (4 for the x2/y2/approx loops, 2 for xy2)
;   r4  = scratch    2*line_size, or a running row offset in the xy2 loops
; Suffixes: _x2 averages each pixel with its right-hand neighbour, _y2 with
; the pixel one row below, _xy2 averages the 2x2 neighbourhood.
; "put" stores the result, "avg" additionally averages it with the bytes
; already present in block.

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
cextern pb_1
cextern pw_2
; pshufb masks that re-interleave the "even results | odd results" layout
; produced by packuswb in SSSE3_PIXELS_XY2 back into pixel order.
; NOTE(review): pb_interleave16 is loaded with mova in the XMM variant, so it
; presumably relies on the alignment of the section start - keep it first.
pb_interleave16: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
pb_interleave8:  db 0, 4, 1, 5, 2, 6, 3, 7

cextern pw_8192

SECTION .text

; void ff_put_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
; Horizontal half-pel "put": block[x] = avg(pixels[x], pixels[x+1]).
; Instantiated for MMXEXT (8 pixels wide) and, further below, for SSE2, where
; the same body is emitted as put_pixels16_x2 (16 pixels wide).
; Handles 4 rows per loop iteration.
%macro PUT_PIXELS8_X2 0
%if cpuflag(sse2)
cglobal put_pixels16_x2, 4,5,4
%else
cglobal put_pixels8_x2, 4,5
%endif
    lea          r4, [r2*2]             ; r4 = 2 * line_size
.loop:
    ; rows 0 and 1
    movu         m0, [r1+1]
    movu         m1, [r1+r2+1]
%if cpuflag(sse2)
    ; the source is loaded with movu first, then averaged register-to-register
    movu         m2, [r1]
    movu         m3, [r1+r2]
    pavgb        m0, m2
    pavgb        m1, m3
%else
    PAVGB        m0, [r1]
    PAVGB        m1, [r1+r2]
%endif
    mova       [r0], m0
    mova    [r0+r2], m1
    add          r1, r4
    add          r0, r4
    ; rows 2 and 3
    movu         m0, [r1+1]
    movu         m1, [r1+r2+1]
%if cpuflag(sse2)
    movu         m2, [r1]
    movu         m3, [r1+r2]
    pavgb        m0, m2
    pavgb        m1, m3
%else
    PAVGB        m0, [r1]
    PAVGB        m1, [r1+r2]
%endif
    add          r1, r4
    mova       [r0], m0
    mova    [r0+r2], m1
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_PIXELS8_X2


; void ff_put_pixels16_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
; 16-pixel-wide horizontal half-pel "put" built from two 8-byte halves per row
; (offsets 0 and 8). Handles 4 rows per loop iteration.
%macro PUT_PIXELS_16 0
cglobal put_pixels16_x2, 4,5
    lea          r4, [r2*2]             ; r4 = 2 * line_size
.loop:
    ; rows 0 and 1: m0/m2 = left/right half of row 0, m1/m3 = same for row 1
    mova         m0, [r1]
    mova         m1, [r1+r2]
    mova         m2, [r1+8]
    mova         m3, [r1+r2+8]
    PAVGB        m0, [r1+1]
    PAVGB        m1, [r1+r2+1]
    PAVGB        m2, [r1+9]
    PAVGB        m3, [r1+r2+9]
    mova       [r0], m0
    mova    [r0+r2], m1
    mova     [r0+8], m2
    mova  [r0+r2+8], m3
    add          r1, r4
    add          r0, r4
    ; rows 2 and 3
    mova         m0, [r1]
    mova         m1, [r1+r2]
    mova         m2, [r1+8]
    mova         m3, [r1+r2+8]
    PAVGB        m0, [r1+1]
    PAVGB        m1, [r1+r2+1]
    PAVGB        m2, [r1+9]
    PAVGB        m3, [r1+r2+9]
    add          r1, r4
    mova       [r0], m0
    mova    [r0+r2], m1
    mova     [r0+8], m2
    mova  [r0+r2+8], m3
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_PIXELS_16
; The 8_X2 macro can easily be used here
INIT_XMM sse2
PUT_PIXELS8_X2


; void ff_put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
; Horizontal half-pel "put" without upward rounding.
; pb_1 (external; presumably a vector of byte 1s) is subtracted from one
; operand with unsigned saturation before PAVGB, which cancels PAVGB's +1
; rounding term. Because the subtraction saturates, a source byte of 0 stays 0
; and that lane is still rounded up.
; Handles 4 rows per loop iteration.
INIT_MMX mmxext
cglobal put_no_rnd_pixels8_x2, 4,5
    mova         m6, [pb_1]
    lea          r4, [r2*2]             ; r4 = 2 * line_size
.loop:
    ; rows 0 and 1
    mova         m0, [r1]
    mova         m2, [r1+r2]
    mova         m1, [r1+1]
    mova         m3, [r1+r2+1]
    add          r1, r4
    psubusb      m0, m6
    psubusb      m2, m6
    PAVGB        m0, m1
    PAVGB        m2, m3
    mova       [r0], m0
    mova    [r0+r2], m2
    ; rows 2 and 3
    mova         m0, [r1]
    mova         m1, [r1+1]
    mova         m2, [r1+r2]
    mova         m3, [r1+r2+1]
    add          r0, r4
    add          r1, r4
    psubusb      m0, m6
    psubusb      m2, m6
    PAVGB        m0, m1
    PAVGB        m2, m3
    mova       [r0], m0
    mova    [r0+r2], m2
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET


; void ff_put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
; Vertical half-pel "put": block row n = avg(pixels row n, pixels row n+1).
; Each source row is loaded once; the last row of an iteration stays in m0 and
; becomes the first row of the next one. r0 is biased by -line_size up front
; so the stores can use the same [..+r2] / [..+r4] addressing as the loads.
; Handles 4 rows per loop iteration (h+1 source rows are read in total).
%macro PUT_PIXELS8_Y2 0
%if cpuflag(sse2)
cglobal put_pixels16_y2, 4,5,3
%else
cglobal put_pixels8_y2, 4,5
%endif
    lea          r4, [r2*2]             ; r4 = 2 * line_size
    movu         m0, [r1]               ; m0 = source row 0
    sub          r0, r2
.loop:
    movu         m1, [r1+r2]
    movu         m2, [r1+r4]
    add          r1, r4
    PAVGB        m0, m1
    PAVGB        m1, m2
    mova    [r0+r2], m0
    mova    [r0+r4], m1
    movu         m1, [r1+r2]
    movu         m0, [r1+r4]            ; carried into the next iteration
    add          r0, r4
    add          r1, r4
    PAVGB        m2, m1
    PAVGB        m1, m0
    mova    [r0+r2], m2
    mova    [r0+r4], m1
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_PIXELS8_Y2
; actually, put_pixels16_y2_sse2
INIT_XMM sse2
PUT_PIXELS8_Y2


; void ff_put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
; Vertical half-pel "put" without upward rounding.
; Only every second source row (the one held in m1) has pb_1 subtracted: that
; row takes part in both averages it is adjacent to, so one saturating
; subtraction cancels the +1 rounding of two PAVGB operations.
; Handles 4 rows per loop iteration.
INIT_MMX mmxext
cglobal put_no_rnd_pixels8_y2, 4,5
    mova         m6, [pb_1]
    lea          r4, [r2+r2]            ; r4 = 2 * line_size
    mova         m0, [r1]               ; m0 = source row 0
    sub          r0, r2                 ; bias so stores use [r0+r2] / [r0+r4]
.loop:
    mova         m1, [r1+r2]
    mova         m2, [r1+r4]
    add          r1, r4
    psubusb      m1, m6
    PAVGB        m0, m1
    PAVGB        m1, m2
    mova    [r0+r2], m0
    mova    [r0+r4], m1
    mova         m1, [r1+r2]
    mova         m0, [r1+r4]            ; carried into the next iteration
    add          r0, r4
    add          r1, r4
    psubusb      m1, m6
    PAVGB        m2, m1
    PAVGB        m1, m0
    mova    [r0+r2], m2
    mova    [r0+r4], m1
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET


; void ff_avg_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
; Horizontal half-pel "avg": the x2 result is averaged with the bytes already
; stored in block before being written back.
; NOTE(review): the two trailing PAVGB operands (m3/m4, m5) are presumably the
; scratch and mask registers of PAVGB's non-pavgb fallback path; confirm
; against the PAVGB definition in x86util.asm.
; Handles 4 rows per loop iteration.
%macro AVG_PIXELS8_X2 0
%if cpuflag(sse2)
cglobal avg_pixels16_x2, 4,5,4
%else
cglobal avg_pixels8_x2, 4,5
%endif
    lea          r4, [r2*2]             ; r4 = 2 * line_size
.loop:
    ; rows 0 and 1
    movu         m0, [r1]
    movu         m2, [r1+r2]
%if cpuflag(sse2)
    movu         m1, [r1+1]
    movu         m3, [r1+r2+1]
    pavgb        m0, m1
    pavgb        m2, m3
%else
    PAVGB        m0, [r1+1], m3, m5
    PAVGB        m2, [r1+r2+1], m4, m5
%endif
    PAVGB        m0, [r0], m3, m5       ; blend with existing destination
    PAVGB        m2, [r0+r2], m4, m5
    add          r1, r4
    mova       [r0], m0
    mova    [r0+r2], m2
    ; rows 2 and 3
    movu         m0, [r1]
    movu         m2, [r1+r2]
%if cpuflag(sse2)
    movu         m1, [r1+1]
    movu         m3, [r1+r2+1]
    pavgb        m0, m1
    pavgb        m2, m3
%else
    PAVGB        m0, [r1+1], m3, m5
    PAVGB        m2, [r1+r2+1], m4, m5
%endif
    add          r0, r4
    add          r1, r4
    PAVGB        m0, [r0], m3, m5
    PAVGB        m2, [r0+r2], m4, m5
    mova       [r0], m0
    mova    [r0+r2], m2
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
AVG_PIXELS8_X2
; actually avg_pixels16_x2
INIT_XMM sse2
AVG_PIXELS8_X2


; void ff_avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
; Vertical half-pel "avg": same row pipeline as PUT_PIXELS8_Y2 (last loaded
; row carried in m0, r0 biased by -line_size), with an extra PAVGB against the
; current destination bytes before each store.
; Handles 4 rows per loop iteration.
%macro AVG_PIXELS8_Y2 0
%if cpuflag(sse2)
cglobal avg_pixels16_y2, 4,5,3
%else
cglobal avg_pixels8_y2, 4,5
%endif
    lea          r4, [r2*2]             ; r4 = 2 * line_size
    movu         m0, [r1]               ; m0 = source row 0
    sub          r0, r2
.loop:
    movu         m1, [r1+r2]
    movu         m2, [r1+r4]
    add          r1, r4
    PAVGB        m0, m1
    PAVGB        m1, m2
    PAVGB        m0, [r0+r2]            ; blend with existing destination
    PAVGB        m1, [r0+r4]
    mova    [r0+r2], m0
    mova    [r0+r4], m1
    movu         m1, [r1+r2]
    movu         m0, [r1+r4]            ; carried into the next iteration
    PAVGB        m2, m1
    PAVGB        m1, m0
    add          r0, r4
    add          r1, r4
    PAVGB        m2, [r0+r2]
    PAVGB        m1, [r0+r4]
    mova    [r0+r2], m2
    mova    [r0+r4], m1
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
AVG_PIXELS8_Y2
; actually avg_pixels16_y2
INIT_XMM sse2
AVG_PIXELS8_Y2


; void ff_avg_approx_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
; Note this is not correctly rounded, and is therefore used for
; not-bitexact output
; The 2x2 average is approximated by chaining byte averages: each row is first
; averaged with itself shifted by one pixel, then adjacent rows are averaged,
; and finally the result is averaged with the destination. pb_1 is subtracted
; from one of the rows to partially compensate the accumulated round-up.
; Handles 4 rows per loop iteration; the horizontally averaged last row is
; carried across iterations in m0.
INIT_MMX mmxext
cglobal avg_approx_pixels8_xy2, 4,5
    mova         m6, [pb_1]
    lea          r4, [r2*2]             ; r4 = 2 * line_size
    mova         m0, [r1]
    PAVGB        m0, [r1+1]             ; m0 = horizontal average of row 0
.loop:
    mova         m2, [r1+r4]
    mova         m1, [r1+r2]
    psubusb      m2, m6
    PAVGB        m1, [r1+r2+1]
    PAVGB        m2, [r1+r4+1]
    add          r1, r4
    PAVGB        m0, m1
    PAVGB        m1, m2
    PAVGB        m0, [r0]               ; blend with existing destination
    PAVGB        m1, [r0+r2]
    mova       [r0], m0
    mova    [r0+r2], m1
    mova         m1, [r1+r2]
    mova         m0, [r1+r4]
    PAVGB        m1, [r1+r2+1]
    PAVGB        m0, [r1+r4+1]          ; carried into the next iteration
    add          r0, r4
    add          r1, r4
    PAVGB        m2, m1
    PAVGB        m1, m0
    PAVGB        m2, [r0]
    PAVGB        m1, [r0+r2]
    mova       [r0], m2
    mova    [r0+r2], m1
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET


; void ff_{put,avg}_pixels{8,16}_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
; 2x2 half-pel interpolation computed in 16-bit precision.
;   %1 = put | avg   (avg additionally averages the result with block)
; The width follows the active register size: 16 pixels when built for SSE2,
; 8 pixels otherwise.
; Per row, bytes are widened to words against the zero register m7 and the
; horizontal pair sums p[x] + p[x+1] are formed (low half / high half kept in
; separate registers). Each output row is
;     (sum(row n) + sum(row n+1) + pw_2) >> 2
; with pw_2 external (presumably words of 2, i.e. the rounding term).
; r4 is a running byte offset shared by source and destination; r1 is advanced
; by one row up front so [r1+r4] addresses the row below [r0+r4].
; Handles 2 rows per loop iteration; the pair sums of the last row are carried
; over in m4/m5.
%macro SET_PIXELS_XY2 1
%if cpuflag(sse2)
cglobal %1_pixels16_xy2, 4,5,8
%else
cglobal %1_pixels8_xy2, 4,5
%endif
    pxor         m7, m7                 ; zero, used for byte -> word unpacking
    mova         m6, [pw_2]
    movu         m0, [r1]
    movu         m4, [r1+1]
    mova         m1, m0
    mova         m5, m4
    punpcklbw    m0, m7
    punpcklbw    m4, m7
    punpckhbw    m1, m7
    punpckhbw    m5, m7
    paddusw      m4, m0                 ; m4/m5 = pair sums of row 0 (lo/hi)
    paddusw      m5, m1
    xor          r4, r4                 ; r4 = running row offset
    add          r1, r2
.loop:
    ; first output row: previous sums (m4/m5) + sums of the next source row
    movu         m0, [r1+r4]
    movu         m2, [r1+r4+1]
    mova         m1, m0
    mova         m3, m2
    punpcklbw    m0, m7
    punpcklbw    m2, m7
    punpckhbw    m1, m7
    punpckhbw    m3, m7
    paddusw      m0, m2                 ; m0/m1 = pair sums of this row
    paddusw      m1, m3
    paddusw      m4, m6
    paddusw      m5, m6
    paddusw      m4, m0
    paddusw      m5, m1
    psrlw        m4, 2
    psrlw        m5, 2
%ifidn %1, avg
    mova         m3, [r0+r4]
    packuswb     m4, m5
    PAVGB        m4, m3
%else
    packuswb     m4, m5
%endif
    mova    [r0+r4], m4
    add          r4, r2

    ; second output row: sums just computed (m0/m1) + sums of the next row,
    ; which are left in m4/m5 for the following iteration
    movu         m2, [r1+r4]
    movu         m4, [r1+r4+1]
    mova         m3, m2
    mova         m5, m4
    punpcklbw    m2, m7
    punpcklbw    m4, m7
    punpckhbw    m3, m7
    punpckhbw    m5, m7
    paddusw      m4, m2
    paddusw      m5, m3
    paddusw      m0, m6
    paddusw      m1, m6
    paddusw      m0, m4
    paddusw      m1, m5
    psrlw        m0, 2
    psrlw        m1, 2
%ifidn %1, avg
    mova         m3, [r0+r4]
    packuswb     m0, m1
    PAVGB        m0, m3
%else
    packuswb     m0, m1
%endif
    mova    [r0+r4], m0
    add          r4, r2
    sub         r3d, 2
    jnz .loop
    REP_RET
%endmacro

INIT_MMX mmxext
SET_PIXELS_XY2 avg
INIT_XMM sse2
SET_PIXELS_XY2 put
SET_PIXELS_XY2 avg

; SSSE3 variant of the 2x2 half-pel interpolation.
;   %1 = put | avg
;   %2 = (optional) number of XMM registers to declare; when present the
;        16-pixel function is emitted, otherwise the 8-pixel one
; pmaddubsw against pb_1 (external; presumably bytes of 1) yields the word
; sums of adjacent byte pairs. Applied to [p] it gives the sums for the
; even output pixels, applied to [p+1] the sums for the odd output pixels.
; After adding the sums of the row below, pmulhrsw with pw_8192 (external;
; presumably words of 8192, making the operation equal to (x + 2) >> 2)
; produces the rounded average. packuswb then leaves "evens | odds" in the
; register and pshufb with the pb_interleave* mask restores pixel order.
; Handles 2 rows per loop iteration; the sums of the last row are carried over
; in m0/m1.
%macro SSSE3_PIXELS_XY2 1-2
%if %0 == 2 ; XMM variant, 16 pixels wide
cglobal %1_pixels16_xy2, 4,5,%2
    mova         m4, [pb_interleave16]
%else
cglobal %1_pixels8_xy2, 4,5
    mova         m4, [pb_interleave8]
%endif
    mova         m5, [pb_1]
    movu         m0, [r1]
    movu         m1, [r1+1]
    pmaddubsw    m0, m5                 ; m0 = pair sums for even pixels, row 0
    pmaddubsw    m1, m5                 ; m1 = pair sums for odd pixels, row 0
    xor          r4, r4                 ; r4 = running row offset
    add          r1, r2                 ; [r1+r4] is the row below [r0+r4]
.loop:
    ; first output row: carried sums (m0/m1) + sums of the next row (m2/m3)
    movu         m2, [r1+r4]
    movu         m3, [r1+r4+1]
    pmaddubsw    m2, m5
    pmaddubsw    m3, m5
    paddusw      m0, m2
    paddusw      m1, m3
    pmulhrsw     m0, [pw_8192]
    pmulhrsw     m1, [pw_8192]
%ifidn %1, avg
    mova         m6, [r0+r4]
    packuswb     m0, m1
    pshufb       m0, m4
    pavgb        m0, m6
%else
    packuswb     m0, m1
    pshufb       m0, m4
%endif
    mova    [r0+r4], m0
    add          r4, r2

    ; second output row: m2/m3 + sums of the next row, which stay in m0/m1
    movu         m0, [r1+r4]
    movu         m1, [r1+r4+1]
    pmaddubsw    m0, m5
    pmaddubsw    m1, m5
    paddusw      m2, m0
    paddusw      m3, m1
    pmulhrsw     m2, [pw_8192]
    pmulhrsw     m3, [pw_8192]
%ifidn %1, avg
    mova         m6, [r0+r4]
    packuswb     m2, m3
    pshufb       m2, m4
    pavgb        m2, m6
%else
    packuswb     m2, m3
    pshufb       m2, m4
%endif
    mova    [r0+r4], m2
    add          r4, r2
    sub         r3d, 2
    jnz .loop
    REP_RET
%endmacro

INIT_MMX ssse3
SSSE3_PIXELS_XY2 put
SSSE3_PIXELS_XY2 avg
INIT_XMM ssse3
SSSE3_PIXELS_XY2 put, 6
SSSE3_PIXELS_XY2 avg, 7