1cabdff1aSopenharmony_ci;******************************************************************************
2cabdff1aSopenharmony_ci;* V210 SIMD pack
3cabdff1aSopenharmony_ci;* Copyright (c) 2014 Kieran Kunhya <kierank@obe.tv>
4cabdff1aSopenharmony_ci;*
5cabdff1aSopenharmony_ci;* This file is part of FFmpeg.
6cabdff1aSopenharmony_ci;*
7cabdff1aSopenharmony_ci;* FFmpeg is free software; you can redistribute it and/or
8cabdff1aSopenharmony_ci;* modify it under the terms of the GNU Lesser General Public
9cabdff1aSopenharmony_ci;* License as published by the Free Software Foundation; either
10cabdff1aSopenharmony_ci;* version 2.1 of the License, or (at your option) any later version.
11cabdff1aSopenharmony_ci;*
12cabdff1aSopenharmony_ci;* FFmpeg is distributed in the hope that it will be useful,
13cabdff1aSopenharmony_ci;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14cabdff1aSopenharmony_ci;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15cabdff1aSopenharmony_ci;* Lesser General Public License for more details.
16cabdff1aSopenharmony_ci;*
17cabdff1aSopenharmony_ci;* You should have received a copy of the GNU Lesser General Public
18cabdff1aSopenharmony_ci;* License along with FFmpeg; if not, write to the Free Software
19cabdff1aSopenharmony_ci;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20cabdff1aSopenharmony_ci;******************************************************************************
21cabdff1aSopenharmony_ci
22cabdff1aSopenharmony_ci%include "libavutil/x86/x86util.asm"
23cabdff1aSopenharmony_ci
; Read-only lookup tables for the v210 packers.
; Every table defined here is exactly 32 bytes ("times 2" of a 16-byte
; pattern, or 16 words), so with the section aligned to 32 each one can be
; used as an aligned xmm or ymm memory operand; the second copy serves the
; upper 128-bit lane of the AVX2 build.
SECTION_RODATA 32

; ---- 10-bit input --------------------------------------------------------
; Clamp range applied with CLIPW to every luma/chroma word.
; The lower bound is the external constant pw_4 (presumably packed words of
; 4 - defined outside this file); the upper bound is 0x3fb in every word.
cextern pw_4
%define v210_enc_min_10 pw_4
v210_enc_max_10: times 16 dw 0x3fb

; Luma: words y0..y5 are pre-shifted by multiplying with 4/1/16 (<<2, <<0,
; <<4); words 6 and 7 are multiplied by 0. The byte shuffle then drops each
; product at its byte offset inside the output dwords (-1 writes a zero byte):
;   y0 -> bytes 1-2   (bit 10 of dword 0)     y1 -> bytes 4-5   (bit 0  of dword 1)
;   y2 -> bytes 6-7   (bit 20 of dword 1)     y3 -> bytes 9-10  (bit 10 of dword 2)
;   y4 -> bytes 12-13 (bit 0  of dword 3)     y5 -> bytes 14-15 (bit 20 of dword 3)
; Note the shuffle also copies source bytes 10-11 (y5) unconditionally and
; relies on the multiplier, not the shuffle, to discard words 6 and 7.
v210_enc_luma_mult_10: times 2 dw 4,1,16,4,1,16,0,0
v210_enc_luma_shuf_10: times 2 db -1,0,1,-1,2,3,4,5,-1,6,7,-1,8,9,10,11

; Chroma: the input register holds u0..u3 in words 0-3 and v0..v3 in words
; 4-7. Word 3 of each half (u3 / v3) is multiplied by 0 and never shuffled in.
;   u0 -> bytes 0-1   (bit 0  of dword 0)     v0 -> bytes 2-3   (bit 20 of dword 0)
;   u1 -> bytes 5-6   (bit 10 of dword 1)     v1 -> bytes 8-9   (bit 0  of dword 2)
;   u2 -> bytes 10-11 (bit 20 of dword 2)     v2 -> bytes 13-14 (bit 10 of dword 3)
v210_enc_chroma_mult_10: times 2 dw 1,4,16,0,16,1,4,0
v210_enc_chroma_shuf_10: times 2 db 0,1,8,9,-1,2,3,-1,10,11,4,5,-1,12,13,-1

; ---- 8-bit input ---------------------------------------------------------
; Clamp range applied with CLIPUB to every luma/chroma byte. Both bounds are
; external constants (presumably packed bytes of 1 and 0xFE - defined outside
; this file).
cextern pb_1
%define v210_enc_min_8 pb_1
cextern pb_FE
%define v210_enc_max_8 pb_FE

; Luma: the shuffle zero-extends source bytes 6..11 into words 0..5 and clears
; words 6-7 (bytes 0..5 are widened separately with punpcklbw). The multiplier
; is the 10-bit luma multiplier times 4, i.e. it folds the 8 -> 10 bit scaling
; (<<2) into the field pre-shift, after which v210_enc_luma_shuf_10 is reused.
v210_enc_luma_shuf_8: times 2 db 6,-1,7,-1,8,-1,9,-1,10,-1,11,-1,-1,-1,-1,-1
v210_enc_luma_mult_8: times 2 dw 16,4,64,16,4,64,0,0

; Chroma: both shuffles zero-extend bytes to words, u in words 0-3 and v in
; words 4-7, giving the same register layout the 10-bit chroma path expects.
;   shuf1 selects u0..u3 / v0..v3 (first 16 output bytes)
;   shuf2 selects u3,u4,u5,u7 / v3,v4,v5,v7 (second 16 output bytes)
; In both cases word 3 of each half is subsequently multiplied by 0, so only
; samples 0..2 (shuf1) and 3..5 (shuf2) reach the output.
v210_enc_chroma_shuf1_8: times 2 db 0,-1,1,-1,2,-1,3,-1,8,-1,9,-1,10,-1,11,-1
v210_enc_chroma_shuf2_8: times 2 db 3,-1,4,-1,5,-1,7,-1,11,-1,12,-1,13,-1,15,-1

; 10-bit chroma multiplier times 4 (8 -> 10 bit scaling folded in).
v210_enc_chroma_mult_8: times 2 dw 4,16,64,0,64,4,16,0
48cabdff1aSopenharmony_ci
49cabdff1aSopenharmony_ciSECTION .text
50cabdff1aSopenharmony_ci
%macro v210_planar_pack_10 0

;------------------------------------------------------------------------------
; v210_planar_pack_10(const uint16_t *y, const uint16_t *u, const uint16_t *v,
;                     uint8_t *dst, ptrdiff_t width)
;
; Packs planar 16-bit samples into 32-bit groups of three 10-bit fields
; (bits 0-9, 10-19, 20-29). Per 128-bit lane, one iteration consumes 6 luma
; words plus 3 u and 3 v words and stores 16 bytes laid out as
;   dword 0: u0 | y0<<10 | v0<<20        dword 1: y1 | u1<<10 | y2<<20
;   dword 2: v1 | y3<<10 | u2<<20        dword 3: y4 | v2<<10 | y5<<20
; The ymm (AVX2) build processes two such groups per iteration.
;
; In:    yq/uq/vq = source planes, dstq = output,
;        widthq   = number of luma samples; each chroma plane supplies
;                   width/2 samples. The counter advances by (mmsize*3)/8
;                   per iteration, so width is expected to be a multiple of
;                   that step (6 for xmm, 12 for ymm).
; Regs:  m0 = luma, m1 = chroma, m2/m3 = clamp min/max, m4 = AVX2 scratch.
; Notes: every sample is clamped to [v210_enc_min_10, v210_enc_max_10] first.
;        Loads are full 16-byte (luma) and 8-byte (chroma) reads, so 4 bytes
;        of luma and 2 bytes of each chroma plane beyond the consumed data
;        are read on the last iteration.
;        width <= 0 returns immediately without reading or writing memory.
;------------------------------------------------------------------------------
cglobal v210_planar_pack_10, 5, 5, 4+cpuflag(avx2), y, u, v, dst, width
    ; Point each plane at its end and let widthq run from -width up to 0,
    ; so a single add both advances the index and sets the loop condition.
    lea     yq, [yq+2*widthq]           ; luma: 2 bytes per sample
    add     uq, widthq                  ; chroma: (width/2) samples * 2 bytes
    add     vq, widthq
    neg     widthq
    jge     .end                        ; flags from neg: skip when width <= 0

    mova    m2, [v210_enc_min_10]
    mova    m3, [v210_enc_max_10]

.loop:
    ; luma: 6 words per lane (the 16-byte load fetches 8, the last two are
    ; zeroed by the multiplier below)
    movu        xm0, [yq+2*widthq]
%if cpuflag(avx2)
    vinserti128 m0,   m0, [yq+widthq*2+12], 1   ; next 6 luma words -> upper lane
%endif
    CLIPW   m0, m2, m3

    ; chroma: u in the low qword, v in the high qword of each lane
    movq         xm1, [uq+widthq]
    movhps       xm1, [vq+widthq]
%if cpuflag(avx2)
    movq         xm4, [uq+widthq+6]             ; +6 bytes = next 3 chroma words
    movhps       xm4, [vq+widthq+6]
    vinserti128  m1,   m1, xm4, 1
%endif
    CLIPW   m1, m2, m3

    ; pre-shift each word inside its 16-bit slot, then scatter the bytes to
    ; their final position within the four output dwords
    pmullw  m0, [v210_enc_luma_mult_10]
    pshufb  m0, [v210_enc_luma_shuf_10]

    pmullw  m1, [v210_enc_chroma_mult_10]
    pshufb  m1, [v210_enc_chroma_shuf_10]

    ; luma and chroma occupy disjoint bits, so OR merges them
    por     m0, m1

    movu    [dstq], m0

    add     dstq, mmsize
    add     widthq, (mmsize*3)/8        ; luma samples consumed this iteration
    jl .loop

.end:
    RET
%endmacro

%if HAVE_SSSE3_EXTERNAL
INIT_XMM ssse3
v210_planar_pack_10
%endif

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
v210_planar_pack_10
%endif
105cabdff1aSopenharmony_ci
%macro v210_planar_pack_8 0

;------------------------------------------------------------------------------
; v210_planar_pack_8(const uint8_t *y, const uint8_t *u, const uint8_t *v,
;                    uint8_t *dst, ptrdiff_t width)
;
; 8-bit counterpart of the 10-bit packer: bytes are widened to words and
; multiplied by tables that are 4x the 10-bit ones, which folds the 8 -> 10
; bit scaling into the field pre-shift; the 10-bit byte shuffles are then
; reused to build the output dwords.
; Per 128-bit lane, one iteration consumes 12 luma bytes plus 6 u and 6 v
; bytes and stores 32 bytes; the ymm (AVX2) build doubles all of these.
;
; In:    yq/uq/vq = source planes, dstq = output,
;        widthq   = number of luma samples; each chroma plane supplies
;                   width/2 samples. The counter is kept in chroma samples
;                   and advances by (mmsize*3)/8 per iteration, so width/2 is
;                   expected to be a multiple of that step (6 xmm, 12 ymm).
; Regs:  m0/m1 = luma for the 1st/2nd 16 output bytes of each lane,
;        m2/m3 = matching chroma, m4/m5 = clamp min/max, m6 = zero.
; Notes: every sample is clamped to [v210_enc_min_8, v210_enc_max_8] first.
;        Loads are full 16-byte (luma) and 8-byte (chroma) reads, so 4 bytes
;        of luma and 2 bytes of each chroma plane beyond the consumed data
;        are read on the last iteration.
;        width/2 == 0 (or a negative count) returns immediately without
;        reading or writing memory.
;------------------------------------------------------------------------------
cglobal v210_planar_pack_8, 5, 5, 7, y, u, v, dst, width
    ; Point each plane at its end; widthq becomes -(width/2) and counts up to
    ; 0, with luma addressed as widthq*2.
    add     yq, widthq                  ; luma: 1 byte per sample
    shr     widthq, 1                   ; chroma samples per plane
    add     uq, widthq
    add     vq, widthq
    neg     widthq
    jge     .end                        ; flags from neg: skip when nothing to pack

    mova    m4, [v210_enc_min_8]
    mova    m5, [v210_enc_max_8]
    pxor    m6, m6                      ; zero source for byte -> word unpack

.loop:
    movu        xm1, [yq+widthq*2]
%if cpuflag(avx2)
    vinserti128 m1,   m1, [yq+widthq*2+12], 1   ; next 12 luma bytes -> upper lane
%endif
    CLIPUB  m1, m4, m5

    ; m0 = luma bytes 0..7 as words (words 6-7 are zeroed by the multiplier)
    punpcklbw m0, m1, m6
    ; can't unpack high bytes in the same way because we process
    ; only six bytes at a time
    pshufb  m1, [v210_enc_luma_shuf_8]          ; m1 = luma bytes 6..11 as words

    pmullw  m0, [v210_enc_luma_mult_8]
    pmullw  m1, [v210_enc_luma_mult_8]
    pshufb  m0, [v210_enc_luma_shuf_10]
    pshufb  m1, [v210_enc_luma_shuf_10]

    ; chroma: u in the low qword, v in the high qword of each lane
    movq         xm3, [uq+widthq]
    movhps       xm3, [vq+widthq]
%if cpuflag(avx2)
    movq         xm2, [uq+widthq+6]             ; +6 bytes = next 6 chroma samples
    movhps       xm2, [vq+widthq+6]
    vinserti128  m3,   m3, xm2, 1
%endif
    CLIPUB  m3, m4, m5

    ; shuffle and multiply to get the same packing as in 10-bit
    ; (m2 = chroma samples 0..2, m3 = chroma samples 3..5 of each lane)
    pshufb  m2, m3, [v210_enc_chroma_shuf1_8]
    pshufb  m3, [v210_enc_chroma_shuf2_8]

    pmullw  m2, [v210_enc_chroma_mult_8]
    pmullw  m3, [v210_enc_chroma_mult_8]
    pshufb  m2, [v210_enc_chroma_shuf_10]
    pshufb  m3, [v210_enc_chroma_shuf_10]

    ; luma and chroma occupy disjoint bits, so OR merges them
    por     m0, m2
    por     m1, m3

    ; Output order is lane0(m0), lane0(m1), lane1(m0), lane1(m1), hence the
    ; per-lane 128-bit stores rather than two full-width ones.
    movu         [dstq],    xm0
    movu         [dstq+16], xm1
%if cpuflag(avx2)
    vextracti128 [dstq+32], m0, 1
    vextracti128 [dstq+48], m1, 1
%endif

    add     dstq, 2*mmsize
    add     widthq, (mmsize*3)/8        ; chroma samples consumed this iteration
    jl .loop

.end:
    RET
%endmacro

%if HAVE_SSSE3_EXTERNAL
INIT_XMM ssse3
v210_planar_pack_8
%endif
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
v210_planar_pack_8
%endif

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
v210_planar_pack_8
%endif
185