;******************************************************************************
;* x86-optimized yuv2yuvX
;* Copyright 2020 Google LLC
;* Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

;-----------------------------------------------------------------------------
; yuv2yuvX
;
; void ff_yuv2yuvX_<opt>(const int16_t *filter, int filterSize,
;                        int srcOffset, uint8_t *dest, int dstW,
;                        const uint8_t *dither, int offset);
;
; Vertical multi-tap filter producing 8-bit output.
;
; How the arguments are used by the code below:
;   filter     - walked as a list of 16-byte entries: a pointer-sized source
;                line address at +0 and the coefficient (read as 8 bytes) at
;                +8. The list ends at the first entry whose pointer is NULL.
;   filterSize - only its low 16 bits are used, to build the rounding bias.
;   srcOffset  - added to offset to form the starting output index.
;   dest       - output bytes are stored at dest + index.
;   dstW       - the outer loop runs while index < dstW (unsigned compare).
;   dither     - 8 bytes are read; rotated by 3 bytes when offset is non-zero.
;   offset     - see srcOffset / dither.
;
; Per output sample:
;   acc  = (dither + filterSize * 8) >> 4
;   acc += high16(coeff * src[index])      for every filter entry
;   out  = saturate_u8(acc >> 3)
;
; NOTE(review): the outer loop is a do/while that always stores at least one
; full vector group and only ever stores whole vectors, so callers presumably
; guarantee a suitable width and buffer padding - confirm against the C caller.
; NOTE(review): source rows are read through memory operands of pmulhw and
; via mova, which in the non-VEX builds presumably requires vector-aligned
; addresses - confirm against the C caller.
;-----------------------------------------------------------------------------

%macro YUV2YUVX_FUNC 0
; 7 arguments, 7 general-purpose registers, 8 vector registers.
cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset
; Pre-SSE3 builds handle two vectors of 16-bit sums (one output vector) per
; outer iteration and store with mova; SSE3 and later handle four vectors of
; sums (two output vectors) and store with movdqu.
%if notcpuflag(sse3)
%define movr mova
%define unroll 1
%else
%define movr movdqu
%define unroll 2
%endif
    ; Widen the 32-bit int arguments that are used in address arithmetic.
    movsxdifnidn         dstWq, dstWd
    movsxdifnidn         offsetq, offsetd
    movsxdifnidn         srcq, srcd
    ; Load the 8 dither bytes (replicated to every qword on AVX2).
%if cpuflag(avx2)
    vpbroadcastq         m3, [ditherq]
%else
    movq                 xm3, [ditherq]
%endif ; avx2
    cmp                  offsetd, 0
    jz                   .offset

    ; offset != 0 path.
    ; $18 = 24 and $28 = 40 bits: together they rotate each 64-bit dither
    ; group right by 3 bytes.
    psrlq                m5, m3, $18
    psllq                m3, m3, $28
    por                  m3, m3, m5

.offset:
    ; offsetq becomes the running output index (offset + srcOffset).
    add                  offsetq, srcq
    ; Broadcast the low word of filterSize to every 16-bit lane of m1.
    movd                 xm1, filterSized
    SPLATW               m1, xm1, 0
    pxor                 m0, m0, m0
    ; From here on filterSizeq is reused as the cursor into the filter list
    ; and srcq holds the source line pointer of the current entry.
    mov                  filterSizeq, filterq
    mov                  srcq, [filterSizeq]
    ; Zero-extend the dither bytes to words, then build the per-lane bias
    ; m7 = (dither + filterSize * 8) >> 4.
    punpcklbw            m3, m0
    psllw                m1, m1, 3
    paddw                m3, m3, m1
    psraw                m7, m3, 4
.outerloop:
    ; Seed every accumulator with the bias.
    mova                 m4, m7
    mova                 m3, m7
%if cpuflag(sse3)
    mova                 m6, m7
    mova                 m1, m7
%endif
.loop:
    ; Fetch the coefficient of the current filter entry.
%if cpuflag(avx2)
    vpbroadcastq         m0, [filterSizeq + 8]
%elif cpuflag(sse3)
    movddup              m0, [filterSizeq + 8]
%else
    mova                 m0, [filterSizeq + 8]
%endif
    ; Source samples are 16 bits wide, hence the index is scaled by 2.
    ; pmulhw keeps the high 16 bits of each signed 16x16 product.
    pmulhw               m2, m0, [srcq + offsetq * 2]
    pmulhw               m5, m0, [srcq + offsetq * 2 + mmsize]
    paddw                m3, m3, m2
    paddw                m4, m4, m5
%if cpuflag(sse3)
    pmulhw               m2, m0, [srcq + offsetq * 2 + 2 * mmsize]
    pmulhw               m5, m0, [srcq + offsetq * 2 + 3 * mmsize]
    paddw                m6, m6, m2
    paddw                m1, m1, m5
%endif
    ; Advance to the next 16-byte ($10) filter entry; a NULL source pointer
    ; terminates the list.
    add                  filterSizeq, $10
    mov                  srcq, [filterSizeq]
    test                 srcq, srcq
    jnz                  .loop
    ; Final scaling, then pack the words to unsigned bytes with saturation.
    psraw                m3, m3, 3
    psraw                m4, m4, 3
%if cpuflag(sse3)
    psraw                m6, m6, 3
    psraw                m1, m1, 3
%endif
    packuswb             m3, m3, m4
%if cpuflag(sse3)
    packuswb             m6, m6, m1
%endif
    ; Reload the first source line pointer for the next outer iteration.
    mov                  srcq, [filterq]
%if cpuflag(avx2)
    ; 256-bit packuswb packs within each 128-bit lane; selector 216 (0xD8)
    ; reorders the qwords as 0,2,1,3 to restore linear byte order.
    vpermq               m3, m3, 216
    vpermq               m6, m6, 216
%endif
    movr                 [destq + offsetq], m3
%if cpuflag(sse3)
    movr                 [destq + offsetq + mmsize], m6
%endif
    ; Step to the next group of output bytes and rewind the filter cursor.
    add                  offsetq, mmsize * unroll
    mov                  filterSizeq, filterq
    cmp                  offsetq, dstWq
    jb                   .outerloop
    REP_RET
%endmacro

; Instantiate one function per instruction set; the AVX2 version is built
; only when HAVE_AVX2_EXTERNAL is set.
INIT_MMX mmx
YUV2YUVX_FUNC
INIT_MMX mmxext
YUV2YUVX_FUNC
INIT_XMM sse3
YUV2YUVX_FUNC
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
YUV2YUVX_FUNC
%endif