;******************************************************************************
;* Copyright Nick Kurshev
;* Copyright Michael (michaelni@gmx.at)
;* Copyright 2018 Jokyo Images
;* Copyright Ivo van Poorten
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; 0x00FF in every 16-bit word. The mmxext shuffle_bytes_2103 loads this as its
; low-byte mask and derives the complementary high-byte mask from it with a
; left shift by 8.
pb_mask_shuffle2103_mmx times 8 dw 255

; pshufb control vectors, one per supported byte order. Entry i holds the
; source byte index that is written to destination byte i; the same 4-byte
; permutation is repeated for each of the four 32-bit groups in a 16-byte lane.
; SHUFFLE_BYTES selects a table by pasting its four arguments onto
; "pb_shuffle".
pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15
pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, 14, 13
pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
pb_shuffle3012: db 3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14
pb_shuffle3210: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

SECTION .text

; RSHIFT_COPY dst, src, shift
; Writes src shifted right by `shift` into dst and leaves src untouched.
; With AVX this is a single three-operand psrldq (a byte-granular shift);
; otherwise src is copied first and shifted in place through the RSHIFT helper
; (defined outside this file, presumably in x86util.asm -- verify there that it
; is byte-granular as well).
%macro RSHIFT_COPY 3
; %1 dst ; %2 src ; %3 shift
%if cpuflag(avx)
    psrldq  %1, %2, %3
%else
    mova    %1, %2
    RSHIFT  %1, %3
%endif
%endmacro

;------------------------------------------------------------------------------
; shuffle_bytes_2103_mmxext (const uint8_t *src, uint8_t *dst, int src_size)
;------------------------------------------------------------------------------
INIT_MMX mmxext
cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
    ; Swaps bytes 0 and 2 of every 4-byte group (e.g. RGBA <-> BGRA).
    ;   src      input buffer
    ;   dst      output buffer
    ;   src_size number of bytes to convert; the low two bits are ignored by
    ;            the loop counters, so callers presumably pass a multiple of 4
    ;
    ; m6 = 0x00FF per word (low bytes), m7 = 0xFF00 per word (high bytes)
    mova    m6, [pb_mask_shuffle2103_mmx]
    mova    m7, m6
    psllq   m7, 8

    movsxdifnidn wq, wd

    ; Nothing to convert. Without this early exit a size of 0 gives a scalar
    ; count of 0, the code branches straight into .loop_simd and one full
    ; vector pair is read and written out of bounds.
    test    wq, wq
    jle .end

    mov     xq, wq

    ; point src/dst at the end of the buffers and index with a negative offset
    add     srcq, wq
    add     dstq, wq
    neg     wq

;calc scalar loop
    ; bytes that do not fill a whole 2*mmsize SIMD iteration go first, 4 at a time
    and     xq, mmsize*2 -4
    je .loop_simd

.loop_scalar:
    mov     tmpb, [srcq + wq + 2]
    mov     [dstq+wq + 0], tmpb
    mov     tmpb, [srcq + wq + 1]
    mov     [dstq+wq + 1], tmpb
    mov     tmpb, [srcq + wq + 0]
    mov     [dstq+wq + 2], tmpb
    mov     tmpb, [srcq + wq + 3]
    mov     [dstq+wq + 3], tmpb
    add     wq, 4
    sub     xq, 4
    jg .loop_scalar

;check if src_size < mmsize * 2
    cmp     wq, 0
    jge .end

.loop_simd:
    movu    m0, [srcq+wq]
    movu    m1, [srcq+wq+8]

    ; 177 = 10_11_00_01b: swap the two words inside every dword
    pshufw  m3, m0, 177
    pshufw  m5, m1, 177

    ; keep bytes 1 and 3 from the original, bytes 0 and 2 from the swapped copy
    pand    m0, m7
    pand    m3, m6

    pand    m1, m7
    pand    m5, m6

    por     m0, m3
    por     m1, m5

    movu    [dstq+wq], m0
    movu    [dstq+wq + 8], m1

    add     wq, mmsize*2
    jl .loop_simd

.end:
    RET

;------------------------------------------------------------------------------
; shuffle_bytes_## (const uint8_t *src, uint8_t *dst, int src_size)
;------------------------------------------------------------------------------
; %1-4 index shuffle
;
; Emits shuffle_bytes_%1%2%3%4 for the current INIT_* instruction set.
; Destination byte i of every 4-byte group receives source byte %(i+1) of the
; same group. The SIMD path permutes one register per iteration with pshufb,
; using the matching pb_shuffle%1%2%3%4 table.
; src_size is a byte count; its low two bits are ignored by the loop counters,
; so callers presumably pass a multiple of 4.
%macro SHUFFLE_BYTES 4
cglobal shuffle_bytes_%1%2%3%4, 3, 5, 2, src, dst, w, tmp, x
    VBROADCASTI128 m0, [pb_shuffle%1%2%3%4]
    movsxdifnidn wq, wd

    ; Nothing to convert. Without this early exit a size of 0 gives a scalar
    ; count of 0, the code branches straight into .loop_simd and mmsize bytes
    ; are read and written out of bounds.
    test    wq, wq
    jle .end

    mov     xq, wq

    ; point src/dst at the end of the buffers and index with a negative offset
    add     srcq, wq
    add     dstq, wq
    neg     wq

;calc scalar loop
    ; bytes that do not fill a whole register go first, 4 at a time
    and     xq, mmsize-4
    je .loop_simd

.loop_scalar:
    mov     tmpb, [srcq + wq + %1]
    mov     [dstq+wq + 0], tmpb
    mov     tmpb, [srcq + wq + %2]
    mov     [dstq+wq + 1], tmpb
    mov     tmpb, [srcq + wq + %3]
    mov     [dstq+wq + 2], tmpb
    mov     tmpb, [srcq + wq + %4]
    mov     [dstq+wq + 3], tmpb
    add     wq, 4
    sub     xq, 4
    jg .loop_scalar

;check if src_size < mmsize
    cmp     wq, 0
    jge .end

.loop_simd:
    movu    m1, [srcq+wq]
    pshufb  m1, m0
    movu    [dstq+wq], m1
    add     wq, mmsize
    jl .loop_simd

.end:
    RET
%endmacro

INIT_XMM ssse3
SHUFFLE_BYTES 2, 1, 0, 3
SHUFFLE_BYTES 0, 3, 2, 1
SHUFFLE_BYTES 1, 2, 3, 0
SHUFFLE_BYTES 3, 0, 1, 2
SHUFFLE_BYTES 3, 2, 1, 0

%if ARCH_X86_64
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
SHUFFLE_BYTES 2, 1, 0, 3
SHUFFLE_BYTES 0, 3, 2, 1
SHUFFLE_BYTES 1, 2, 3, 0
SHUFFLE_BYTES 3, 0, 1, 2
SHUFFLE_BYTES 3, 2, 1, 0
%endif
%endif

;-----------------------------------------------------------------------------------------------
; uyvytoyuv422(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
;              const uint8_t *src, int width, int height,
;              int lumStride, int chromStride, int srcStride)
;-----------------------------------------------------------------------------------------------
; Splits packed UYVY into separate Y, U and V planes, one line at a time.
; Each line consumes width*2 source bytes and produces width luma bytes plus
; width/2 bytes for each chroma plane. Pixels are handled in pairs, so width is
; presumably even -- confirm against the callers.
%macro UYVY_TO_YUV422 0
cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
    ; Empty image: nothing to do. The line loop below tests its counter only
    ; after the first pass, and a width of 0 would fall straight into
    ; .loop_simd, so both cases must be rejected before any memory access.
    test    wd, wd
    jle .done
    test    hd, hd
    jle .done

    ; m1 = 0x00FF in every word: selects the low byte of each 16-bit pair
    pcmpeqw m1, m1
    psrlw   m1, 8

    movsxdifnidn wq,            wd
    movsxdifnidn lum_strideq,   lum_strided
    movsxdifnidn chrom_strideq, chrom_strided
    movsxdifnidn src_strideq,   src_strided

    mov     back_wq, wq
    mov     whalfq, wq
    shr     whalfq, 1     ; whalf = width / 2

    ; point every pointer at the end of its first line; the per-line loops
    ; index backwards from there with negative offsets
    lea     srcq, [srcq + wq * 2]
    add     ydstq, wq
    add     udstq, whalfq
    add     vdstq, whalfq

.loop_line:
    mov     xq, wq
    mov     wtwoq, wq
    add     wtwoq, wtwoq  ; wtwo = width * 2

    neg     wq
    neg     wtwoq
    neg     whalfq

    ;calc scalar loop count
    ; pixels that do not fill a whole 2*mmsize-pixel SIMD iteration go first
    and     xq, mmsize * 2 - 1
    je .loop_simd

    .loop_scalar:
        ; one U Y V Y quad = two pixels
        mov     tmpb, [srcq + wtwoq + 0]
        mov     [udstq + whalfq], tmpb

        mov     tmpb, [srcq + wtwoq + 1]
        mov     [ydstq + wq], tmpb

        mov     tmpb, [srcq + wtwoq + 2]
        mov     [vdstq + whalfq], tmpb

        mov     tmpb, [srcq + wtwoq + 3]
        mov     [ydstq + wq + 1], tmpb

        add     wq, 2
        add     wtwoq, 4
        add     whalfq, 1
        sub     xq, 2
        jg .loop_scalar

    ; check if simd loop is needed
    cmp     wq, 0
    jge .end_line

    .loop_simd:
        movu    m2, [srcq + wtwoq             ]
        movu    m3, [srcq + wtwoq + mmsize    ]
        movu    m4, [srcq + wtwoq + mmsize * 2]
        movu    m5, [srcq + wtwoq + mmsize * 3]

        ; extract y part 1
        RSHIFT_COPY m6, m2, 1 ; UYVY UYVY -> YVYU YVY...
        pand        m6, m1    ; YxYx YxYx...

        RSHIFT_COPY m7, m3, 1 ; UYVY UYVY -> YVYU YVY...
        pand        m7, m1    ; YxYx YxYx...

        packuswb    m6, m7    ; YYYY YYYY...
        movu        [ydstq + wq], m6

        ; extract y part 2
        RSHIFT_COPY m6, m4, 1 ; UYVY UYVY -> YVYU YVY...
        pand        m6, m1    ; YxYx YxYx...

        RSHIFT_COPY m7, m5, 1 ; UYVY UYVY -> YVYU YVY...
        pand        m7, m1    ; YxYx YxYx...

        packuswb    m6, m7    ; YYYY YYYY...
        movu        [ydstq + wq + mmsize], m6

        ; extract uv
        pand        m2, m1    ; UxVx...
        pand        m3, m1    ; UxVx...
        pand        m4, m1    ; UxVx...
        pand        m5, m1    ; UxVx...

        packuswb    m2, m3    ; UVUV...
        packuswb    m4, m5    ; UVUV...

        ; U
        pand        m6, m2, m1 ; UxUx...
        pand        m7, m4, m1 ; UxUx...

        packuswb    m6, m7     ; UUUU
        movu        [udstq + whalfq], m6

        ; V
        psrlw       m2, 8      ; VxVx...
        psrlw       m4, 8      ; VxVx...
        packuswb    m2, m4     ; VVVV
        movu        [vdstq + whalfq], m2

        add         whalfq, mmsize
        add         wtwoq,  mmsize * 4
        add         wq,     mmsize * 2
        jl .loop_simd

    .end_line:
        add     srcq,  src_strideq
        add     ydstq, lum_strideq
        add     udstq, chrom_strideq
        add     vdstq, chrom_strideq

        ;restore initial state of line variable
        ; (xq is reloaded from wq at the top of .loop_line)
        mov     wq, back_wq
        mov     whalfq, wq
        shr     whalfq, 1 ; whalf = width / 2
        sub     hd, 1
        jg .loop_line

.done:
    RET
%endmacro

%if ARCH_X86_64
INIT_XMM sse2
UYVY_TO_YUV422

INIT_XMM avx
UYVY_TO_YUV422
%endif