;******************************************************************************
;* optimized bswap buffer functions
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
; pshufb control mask: reverses the byte order inside each 4-byte group of a
; 16-byte vector (byte 3,2,1,0 -> position 0,1,2,3, and so on).
pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

cextern pb_80

SECTION .text

;------------------------------------------------------------------------------
; BSWAP_LOOPS: vectorised body of bswap32_buf.
;
; %1 = a (aligned) or u (unaligned); it is pasted onto "mov" to select
;      mova/movu and onto the local labels so both expansions can coexist
;      inside one function.
;
; Register roles as used below:
;   r0  = dst pointer (advanced as data is written)
;   r1  = src pointer (advanced as data is read)
;   r2d = w, the number of 32-bit words; used as the main-loop counter and
;         reloaded from r3d before the macro ends
;   r3d = saved copy of w, used for the remainder bit tests
;   m2  = byte-shuffle mask on the ssse3/avx2 paths (set up by the caller of
;         this macro); on the sse2 path m2/m3 are plain temporaries
;
; The macro jumps to ".left", which must be defined by the function that
; expands it, when bit 2 of w is clear. Otherwise it falls through off its
; end with r0/r1 advanced past one extra 16-byte group.
;
; NOTE(review): there is no guard for w < 0 here; "sar" of a negative count
; yields a non-zero loop counter. Callers are presumably expected to pass a
; non-negative w - confirm against call sites.
;------------------------------------------------------------------------------
%macro BSWAP_LOOPS  1
    mov      r3d, r2d               ; keep the original count for the tail tests
    sar      r2d, 3                 ; number of 8-dword groups
    jz       .left4_%1              ; fewer than 8 dwords: skip the main loop
%if cpuflag(avx2)
    sar      r2d, 1                 ; one iteration handles 2*mmsize bytes = 16 dwords
    jz       .left8_%1              ; 8..15 dwords: only the single-vector step applies
%endif
.loop8_%1:
    ; Main loop: two full vectors (2*mmsize bytes) per iteration.
    mov%1    m0, [r1 +  0]
    mov%1    m1, [r1 + mmsize]
%if cpuflag(ssse3)||cpuflag(avx2)
    pshufb   m0, m2                 ; byte-reverse every dword via the mask
    pshufb   m1, m2
    mov%1    [r0 +  0], m0
    mov%1    [r0 + mmsize], m1
%else
    ; SSE2 fallback without pshufb:
    ;   1) swap the two 16-bit halves of every dword (pshuflw/pshufhw, 0xB1)
    ;   2) swap the two bytes of every 16-bit word (shift left/right by 8, or)
    pshuflw  m0, m0, 10110001b
    pshuflw  m1, m1, 10110001b
    pshufhw  m0, m0, 10110001b
    pshufhw  m1, m1, 10110001b
    mova     m2, m0
    mova     m3, m1
    psllw    m0, 8
    psllw    m1, 8
    psrlw    m2, 8
    psrlw    m3, 8
    por      m2, m0
    por      m3, m1
    mov%1    [r0 +  0], m2
    mov%1    [r0 + mmsize], m3      ; same offset as the matching load above
%endif
    add      r0, mmsize*2
    add      r1, mmsize*2
    dec      r2d
    jnz      .loop8_%1
%if cpuflag(avx2)
.left8_%1:
    ; AVX2 only: one remaining full vector (8 dwords) if bit 3 of w is set.
    mov      r2d, r3d
    test     r3d, 8
    jz       .left4_%1
    mov%1    m0, [r1]
    pshufb   m0, m2
    mov%1    [r0 +  0], m0
    add      r1, mmsize
    add      r0, mmsize
%endif
.left4_%1:
    ; One remaining 16-byte group (4 dwords) if bit 2 of w is set.
    mov      r2d, r3d               ; restore w for the tail code after the macro
    test     r3d, 4
    jz       .left
    mov%1    xm0, [r1]
%if cpuflag(ssse3)
    pshufb   xm0, xm2
    mov%1    [r0], xm0
%else
    ; Same two-step SSE2 byte reversal as in the main loop, one vector only.
    pshuflw  m0, m0, 10110001b
    pshufhw  m0, m0, 10110001b
    mova     m2, m0
    psllw    m0, 8
    psrlw    m2, 8
    por      m2, m0
    mov%1    [r0], m2
%endif
    add      r1, 16
    add      r0, 16
%endmacro

;------------------------------------------------------------------------------
; void ff_bswap_buf(uint32_t *dst, const uint32_t *src, int w);
;
; Byte-swaps w 32-bit words from src (r1) into dst (r0).
;
; Structure:
;   1. If both pointers are multiples of mmsize, run the aligned expansion of
;      BSWAP_LOOPS, otherwise the unaligned one.
;   2. ".left" handles the last w & 3 words:
;        - ssse3/avx2: one 8-byte movq + pshufb if bit 1 is set, then one
;          scalar bswap if bit 0 is set;
;        - sse2: a scalar bswap loop over w & 3 words.
;------------------------------------------------------------------------------
%macro BSWAP32_BUF 0
%if cpuflag(ssse3)||cpuflag(avx2)
cglobal bswap32_buf, 3,4,3
    mov      r3, r1
    VBROADCASTI128  m2, [pb_bswap32]    ; shuffle mask lives in m2 for the whole function
%else
cglobal bswap32_buf, 3,4,5
    mov      r3, r1
%endif
    ; r3 = src | dst: any low bit set means at least one pointer is unaligned.
    or       r3, r0
    test     r3, mmsize - 1
    jz       .start_align
    BSWAP_LOOPS  u
    jmp      .left
.start_align:
    BSWAP_LOOPS  a
.left:
%if cpuflag(ssse3)
    test     r2d, 2                 ; two dwords left?
    jz       .left1
    movq     xm0, [r1]
    pshufb   xm0, xm2
    movq     [r0], xm0
    add      r1, 8
    add      r0, 8
.left1:
    test     r2d, 1                 ; one dword left?
    jz       .end
    mov      r2d, [r1]              ; r2d is free to reuse as scratch from here on
    bswap    r2d
    mov      [r0], r2d
%else
    and      r2d, 3                 ; 0..3 dwords left
    jz       .end
.loop2:
    mov      r3d, [r1]
    bswap    r3d
    mov      [r0], r3d
    add      r1, 4
    add      r0, 4
    dec      r2d
    jnz      .loop2
%endif
.end:
    RET
%endmacro

; Instantiate one function per instruction set.
INIT_XMM sse2
BSWAP32_BUF

INIT_XMM ssse3
BSWAP32_BUF

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
BSWAP32_BUF
%endif