;******************************************************************************
;* V210 SIMD unpack
;* Copyright (c) 2011 Loren Merritt <lorenm@u.washington.edu>
;* Copyright (c) 2011 Kieran Kunhya <kieran@kunhya.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

; ---- Tables used only by the AVX2 code path --------------------------------
; Dword indices for vpermd: gathers the three valid luma dwords of each
; 128-bit lane into the low 24 bytes.  Loaded with mova into a ymm register,
; so it must stay first in this 32-byte aligned section.
v210_luma_permute:     dd 0,1,2,4,5,6,7,7 ; 32-byte alignment required
; Second chroma pshufb: squeezes out the zero word between the two sample
; triplets of each lane (-1 selects a zero byte).
v210_chroma_shuf2:     db 0,1,2,3,4,5,8,9,10,11,12,13,-1,-1,-1,-1
; Per-lane luma pshufb: drops the zeroed word left by the blend.
v210_luma_shuf_avx2:   db 0,1,4,5,6,7,8,9,12,13,14,15,-1,-1,-1,-1
; Per-lane chroma pshufb: u triplet to the low qword, v triplet to the high.
v210_chroma_shuf_avx2: db 0,1,4,5,10,11,-1,-1,2,3,8,9,12,13,-1,-1

; ---- Tables shared by all code paths / used by the SSSE3 and AVX paths -----
; pmullw factors: (word * 64) >> 6 keeps the low 10 bits of the low word,
; (word * 4) >> 6 keeps bits 4..13 of the high word of every dword.
v210_mult:             dw 64,4,64,4,64,4,64,4
; Luma / chroma pshufb masks applied after the shufps interleave.
v210_luma_shuf:        db 8,9,0,1,2,3,12,13,4,5,6,7,-1,-1,-1,-1
v210_chroma_shuf:      db 0,1,8,9,6,7,-1,-1,2,3,4,5,12,13,-1,-1

SECTION .text

;------------------------------------------------------------------------------
; v210_planar_unpack <unaligned|aligned>
;
; Emits, for the currently selected instruction set:
;   v210_planar_unpack_%1(const uint32_t *src, uint16_t *y, uint16_t *u,
;                         uint16_t *v, int width)
;
; Every 32-bit source word is split into three 10-bit fields which are stored
; as 16-bit samples into the y, u and v planes.  One iteration consumes mmsize
; bytes of src and yields mmsize*3/8 luma samples plus half that many u and v
; samples.
;
; %1 only selects the source load: "unaligned" uses movu, anything else uses
; mova.  All stores are unaligned.
;
; The loop test sits at the bottom, so the body always runs at least once.
; NOTE(review): every store writes a full register (or register half) whose
; top bytes are zero, i.e. a few zero bytes are written past the samples
; produced by an iteration; the loop also only stops once the counter reaches
; zero or above.  Presumably the caller passes a width that is a multiple of
; the per-iteration sample count and provides padded buffers - confirm
; against the C caller.
;------------------------------------------------------------------------------
%macro v210_planar_unpack 1

; v210_planar_unpack(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width)
cglobal v210_planar_unpack_%1, 5, 5, 6 + 2 * cpuflag(avx2), src, y, u, v, w
    movsxdifnidn wq, wd
    ; Point y/u/v at the end of their output and index them with a negative
    ; counter that runs up to zero: y gets 2*w bytes, u and v get w bytes each.
    add          vq, wq
    add          uq, wq
    lea          yq, [yq+2*wq]
    neg          wq

    ; Loop-invariant shuffle masks and multipliers.
%if cpuflag(avx2)
    mova             m6, [v210_luma_permute]
    VBROADCASTI128   m4, [v210_luma_shuf_avx2]
    VBROADCASTI128   m5, [v210_chroma_shuf_avx2]
    VBROADCASTI128   m7, [v210_chroma_shuf2]
%else
    VBROADCASTI128   m4, [v210_luma_shuf]
    VBROADCASTI128   m5, [v210_chroma_shuf]
%endif
    VBROADCASTI128   m3, [v210_mult]

.unpack_loop:
    ; Source layout, most significant field first:
    ;   yB v5 yA  u5 y9 v4  y8 u4 y7  v3 y6 u3  y5 v2 y4  u2 y3 v1  y2 u1 y1  v0 y0 u0
%ifnidn %1, unaligned
    mova   m0, [srcq]
%else
    movu   m0, [srcq]
%endif

    ; m1 <- low and high 10-bit field of every dword, one per 16-bit word.
    pmullw m1, m0, m3
    psrlw  m1, 6                       ; yB yA u5 v4 y8 y7 v3 u3 y5 y4 u2 v1 y2 y1 v0 u0
    ; m0 <- middle 10-bit field of every dword, zero-extended to 32 bits.
    pslld  m0, 12
    psrld  m0, 22                      ; 00 v5 00 y9 00 u4 00 y6 00 v2 00 y3 00 u1 00 y0

%if cpuflag(avx2)
    ; Luma: take the even dwords from m0, the odd ones from m1, then compact.
    vpblendd m2, m1, m0, 0x55          ; yB yA 00 y9 y8 y7 00 y6 y5 y4 00 y3 y2 y1 00 y0
    pshufb m2, m4                      ; 00 00 yB yA y9 y8 y7 y6 00 00 y5 y4 y3 y2 y1 y0
    vpermd m2, m6, m2                  ; 00 00 00 00 yB yA y9 y8 y7 y6 y5 y4 y3 y2 y1 y0
    movu   [yq+2*wq], m2

    ; Chroma: the complementary dwords, split into a u lane and a v lane.
    vpblendd m1, m1, m0, 0xaa          ; 00 v5 u5 v4 00 u4 v3 u3 00 v2 u2 v1 00 u1 v0 u0
    pshufb m1, m5                      ; 00 v5 v4 v3 00 u5 u4 u3 00 v2 v1 v0 00 u2 u1 u0
    vpermq m1, m1, 0xd8                ; 00 v5 v4 v3 00 v2 v1 v0 00 u5 u4 u3 00 u2 u1 u0
    pshufb m1, m7                      ; 00 00 v5 v4 v3 v2 v1 v0 00 00 u5 u4 u3 u2 u1 u0

    movu   [uq+wq], xm1                ; low lane  -> u plane
    vextracti128 [vq+wq], m1, 1        ; high lane -> v plane
%else
    ; Luma: interleave dwords of m1 and m0, then put the samples in order.
    shufps m2, m1, m0, 0x8d            ; 00 y9 00 y6 yB yA y8 y7 00 y3 00 y0 y5 y4 y2 y1
    pshufb m2, m4                      ; 00 00 yB yA y9 y8 y7 y6 00 00 y5 y4 y3 y2 y1 y0
    movu   [yq+2*wq], m2

    ; Chroma: remaining dwords, u triplet to the low qword, v to the high.
    shufps m1, m0, 0xd8                ; 00 v5 00 u4 u5 v4 v3 u3 00 v2 00 u1 u2 v1 v0 u0
    pshufb m1, m5                      ; 00 v5 v4 v3 00 u5 u4 u3 00 v2 v1 v0 00 u2 u1 u0

    movq   [uq+wq], m1                 ; low qword  -> u plane
    movhps [vq+wq], m1                 ; high qword -> v plane
%endif

    add srcq, mmsize
    ; Must stay directly before the branch: jl tests the flags of this add.
    add wq, (mmsize*3)/8
    jl  .unpack_loop

    REP_RET
%endmacro

; Instantiates v210_planar_unpack_%1 for every supported instruction set, in
; the order ssse3, avx, avx2 (the latter two only when the assembler supports
; them).
%macro V210_UNPACK_ALL_ISAS 1
INIT_XMM ssse3
v210_planar_unpack %1

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
v210_planar_unpack %1
%endif

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
v210_planar_unpack %1
%endif
%endmacro

V210_UNPACK_ALL_ISAS unaligned
V210_UNPACK_ALL_ISAS aligned