;******************************************************************************
;* optimized bswap buffer functions
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
; pshufb control mask: byte i of the result is taken from the listed source
; byte, which reverses the four bytes inside each 32-bit group of a 16-byte
; vector.
pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

; NOTE(review): pb_80 is not referenced by the code visible in this file;
; confirm whether this import is still needed before removing it.
cextern pb_80

SECTION .text

; BSWAP_LOOPS: vector part of the 32-bit byte swap. Handles every group of
; four or more 32-bit words and leaves the last 0-3 words to the caller.
;
; %1 = a (aligned) or u (unaligned). It selects mova/movu for the vector
;      loads and stores and is appended to the local labels, so the macro
;      can be expanded twice inside one function.
;
; On entry: r0 = dst, r1 = src, r2d = number of 32-bit words,
;           m2 = byte shuffle mask (SSSE3 and AVX2 builds only).
; On exit:  r0 and r1 point past the words already written,
;           r2d = r3d = the word count that was passed in.
; Clobbers: r3d, flags, m0-m1 (SSSE3/AVX2) or m0-m3 (SSE2).
; Requires: a .left label in the enclosing function; it is the jump target
;           when bit 2 of the count is clear. Otherwise control falls out of
;           the end of the macro.
%macro BSWAP_LOOPS  1
    mov      r3d, r2d               ; keep the full count, r2d becomes the loop counter
    sar      r2d, 3                 ; r2d = count / 8
    jz       .left4_%1              ; fewer than 8 words: skip the vector loop
%if cpuflag(avx2)
    sar      r2d, 1                 ; with 32-byte vectors one iteration covers 16 words
    jz       .left8_%1
%endif
.loop8_%1:                          ; two vectors (2*mmsize bytes) per iteration
    mov%1    m0, [r1 +  0]
    mov%1    m1, [r1 + mmsize]
%if cpuflag(ssse3)||cpuflag(avx2)
    pshufb   m0, m2                 ; one shuffle reverses the bytes of every dword
    pshufb   m1, m2
    mov%1    [r0 +  0], m0
    mov%1    [r0 + mmsize], m1
%else
    ; No pshufb on SSE2. First swap the two 16-bit halves of every dword
    ; (word order 1,0,3,2 in the low and in the high 64 bits) ...
    pshuflw  m0, m0, 10110001b
    pshuflw  m1, m1, 10110001b
    pshufhw  m0, m0, 10110001b
    pshufhw  m1, m1, 10110001b
    ; ... then swap the two bytes of every 16-bit word: (x << 8) | (x >> 8).
    mova     m2, m0
    mova     m3, m1
    psllw    m0, 8
    psllw    m1, 8
    psrlw    m2, 8
    psrlw    m3, 8
    por      m2, m0
    por      m3, m1
    mov%1    [r0 +  0], m2
    mov%1    [r0 + mmsize], m3      ; same offset as the second load above
%endif
    add      r0, mmsize*2
    add      r1, mmsize*2
    dec      r2d
    jnz      .loop8_%1
%if cpuflag(avx2)
.left8_%1:
    ; One more full 32-byte vector when bit 3 of the count is set.
    ; r2d is not reloaded here: .left4 below rewrites it before any read.
    test     r3d, 8
    jz       .left4_%1
    mov%1    m0, [r1]
    pshufb   m0, m2
    mov%1    [r0 +  0], m0
    add      r1, mmsize
    add      r0, mmsize
%endif
.left4_%1:
    mov      r2d, r3d               ; hand the full count back to the caller
    test     r3d, 4
    jz       .left                  ; no group of four left: go to the caller tail code
    mov%1    xm0, [r1]              ; 16 bytes = 4 words, always a 128-bit access
%if cpuflag(ssse3)
    pshufb   xm0, xm2
    mov%1    [r0], xm0
%else
    pshuflw  m0, m0, 10110001b
    pshufhw  m0, m0, 10110001b
    mova     m2, m0
    psllw    m0, 8
    psrlw    m2, 8
    por      m2, m0
    mov%1    [r0], m2
%endif
    add      r1, 16
    add      r0, 16
%endmacro

; void ff_bswap32_buf_<cpu>(uint32_t *dst, const uint32_t *src, int w);
; (cglobal derives the exported symbol from "bswap32_buf" and the active
;  INIT_* CPU name.)
;
; Writes w byte-reversed 32-bit words from src to dst.
;   r0 = dst, r1 = src, r2d = w, r3 = scratch.
; The aligned variant of BSWAP_LOOPS is used only when both pointers are
; multiples of mmsize; otherwise the unaligned variant runs. Whatever the
; vector code leaves over (w & 3 words) is finished after .left.
; A count of zero or less returns immediately without touching memory.
%macro BSWAP32_BUF 0
%if cpuflag(ssse3)||cpuflag(avx2)
cglobal bswap32_buf, 3,4,3
%else
cglobal bswap32_buf, 3,4,5
%endif
    ; w is a signed int. Without this check a negative w survives the
    ; arithmetic shifts in BSWAP_LOOPS as a non-zero negative loop counter,
    ; and dec/jnz would then run until it wraps around.
    test     r2d, r2d
    jle      .end
%if cpuflag(ssse3)||cpuflag(avx2)
    VBROADCASTI128  m2, [pb_bswap32] ; shuffle mask in every 128-bit lane of m2
%endif
    mov      r3, r1
    or       r3, r0                 ; low bits set if either pointer is misaligned
    test     r3, mmsize - 1
    jz       .start_align
    BSWAP_LOOPS  u
    jmp      .left
.start_align:
    BSWAP_LOOPS  a
.left:                              ; r2d = w again, r0/r1 point at the remaining words
%if cpuflag(ssse3)
    test     r2d, 2
    jz       .left1
    movq     xm0, [r1]              ; two words through the low 8 bytes of the mask
    pshufb   xm0, xm2
    movq     [r0], xm0
    add      r1, 8
    add      r0, 8
.left1:
    test     r2d, 1
    jz       .end
    mov      r2d, [r1]              ; last single word with the scalar bswap
    bswap    r2d
    mov      [r0], r2d
%else
    and      r2d, 3                 ; 0-3 words left
    jz       .end
.loop2:                             ; scalar loop, one word per iteration
    mov      r3d, [r1]
    bswap    r3d
    mov      [r0], r3d
    add      r1, 4
    add      r0, 4
    dec      r2d
    jnz      .loop2
%endif
.end:
    RET
%endmacro

; Emit bswap32_buf once per instruction set. Each INIT_* line selects the
; vector width and CPU flags that the mmsize and cpuflag() tests inside
; BSWAP32_BUF and BSWAP_LOOPS evaluate. The AVX2 version is assembled only
; when HAVE_AVX2_EXTERNAL is non-zero.
INIT_XMM sse2
BSWAP32_BUF

INIT_XMM ssse3
BSWAP32_BUF

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
BSWAP32_BUF
%endif