1cabdff1aSopenharmony_ci;******************************************************************************
2cabdff1aSopenharmony_ci;* x86-optimized yuv2yuvX
3cabdff1aSopenharmony_ci;* Copyright 2020 Google LLC
4cabdff1aSopenharmony_ci;* Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
5cabdff1aSopenharmony_ci;*
6cabdff1aSopenharmony_ci;* This file is part of FFmpeg.
7cabdff1aSopenharmony_ci;*
8cabdff1aSopenharmony_ci;* FFmpeg is free software; you can redistribute it and/or
9cabdff1aSopenharmony_ci;* modify it under the terms of the GNU Lesser General Public
10cabdff1aSopenharmony_ci;* License as published by the Free Software Foundation; either
11cabdff1aSopenharmony_ci;* version 2.1 of the License, or (at your option) any later version.
12cabdff1aSopenharmony_ci;*
13cabdff1aSopenharmony_ci;* FFmpeg is distributed in the hope that it will be useful,
14cabdff1aSopenharmony_ci;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15cabdff1aSopenharmony_ci;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16cabdff1aSopenharmony_ci;* Lesser General Public License for more details.
17cabdff1aSopenharmony_ci;*
18cabdff1aSopenharmony_ci;* You should have received a copy of the GNU Lesser General Public
19cabdff1aSopenharmony_ci;* License along with FFmpeg; if not, write to the Free Software
20cabdff1aSopenharmony_ci;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21cabdff1aSopenharmony_ci;******************************************************************************
22cabdff1aSopenharmony_ci
23cabdff1aSopenharmony_ci%include "libavutil/x86/x86util.asm"
24cabdff1aSopenharmony_ci
25cabdff1aSopenharmony_ciSECTION .text
26cabdff1aSopenharmony_ci
27cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
28cabdff1aSopenharmony_ci; yuv2yuvX
29cabdff1aSopenharmony_ci;
30cabdff1aSopenharmony_ci; void ff_yuv2yuvX_<opt>(const int16_t *filter, int filterSize,
31cabdff1aSopenharmony_ci;                        int srcOffset, uint8_t *dest, int dstW,
32cabdff1aSopenharmony_ci;                        const uint8_t *dither, int offset);
33cabdff1aSopenharmony_ci;
34cabdff1aSopenharmony_ci;-----------------------------------------------------------------------------
35cabdff1aSopenharmony_ci
36cabdff1aSopenharmony_ci%macro YUV2YUVX_FUNC 0
; YUV2YUVX_FUNC: emits one yuv2yuvX function body for whichever instruction
; set is active when the macro is expanded (see the cpuflag()/mmsize tests).
;
; What the code below does, per output position:
;   acc  = (dither + (filterSize << 3)) >> 4          (16-bit words, in m7)
;   acc += pmulhw(coeff, src_words) for every filter entry
;   dest = packuswb(acc >> 3)                          (saturated to bytes)
;
; The filter argument is walked as a list of 16-byte ($10) entries:
;   +0 : register-width value used as the source line base address
;   +8 : 8 bytes of coefficient data
; The walk stops at the first entry whose +0 field is zero.
;
; Register reuse to be aware of:
;   offsetq     - becomes offset + src (the running output position)
;   filterSizeq - after its value is copied to m1, reused as the cursor
;                 into the filter list
;   srcq        - after being added into offsetq, reused to hold the source
;                 pointer loaded from the current filter entry
;
; NOTE(review): literals written as $18/$28/$10 are NASM hex constants
; (0x18 = 24, 0x28 = 40, 0x10 = 16).
37cabdff1aSopenharmony_cicglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset
; Pre-SSE3 builds store with mova and produce one mmsize block per outer
; iteration; SSE3 and later store with movdqu and produce two blocks.
38cabdff1aSopenharmony_ci%if notcpuflag(sse3)
39cabdff1aSopenharmony_ci%define movr mova
40cabdff1aSopenharmony_ci%define unroll 1
41cabdff1aSopenharmony_ci%else
42cabdff1aSopenharmony_ci%define movr movdqu
43cabdff1aSopenharmony_ci%define unroll 2
44cabdff1aSopenharmony_ci%endif
; Widen the int arguments that are later used in address arithmetic
; (movsxdifnidn comes from the included helper macros; presumably a
; sign-extend that is skipped when the d and q names are the same register).
45cabdff1aSopenharmony_ci    movsxdifnidn         dstWq, dstWd
46cabdff1aSopenharmony_ci    movsxdifnidn         offsetq, offsetd
47cabdff1aSopenharmony_ci    movsxdifnidn         srcq, srcd
; Load 8 dither bytes into m3: replicated into every 64-bit element on AVX2,
; into the low 64 bits only otherwise.
48cabdff1aSopenharmony_ci%if cpuflag(avx2)
49cabdff1aSopenharmony_ci    vpbroadcastq         m3, [ditherq]
50cabdff1aSopenharmony_ci%else
51cabdff1aSopenharmony_ci    movq                 xm3, [ditherq]
52cabdff1aSopenharmony_ci%endif ; avx2
; The test is on the original offset argument, before src is added to it.
53cabdff1aSopenharmony_ci    cmp                  offsetd, 0
54cabdff1aSopenharmony_ci    jz                   .offset
55cabdff1aSopenharmony_ci
56cabdff1aSopenharmony_ci    ; offset != 0 path.
; Rotate each 64-bit element of the dither right by 24 bits (24 + 40 = 64),
; i.e. by three bytes, so the pattern starts at original byte 3.
57cabdff1aSopenharmony_ci    psrlq                m5, m3, $18
58cabdff1aSopenharmony_ci    psllq                m3, m3, $28
59cabdff1aSopenharmony_ci    por                  m3, m3, m5
60cabdff1aSopenharmony_ci
61cabdff1aSopenharmony_ci.offset:
; offsetq = offset + src; from here on it indexes both source and dest.
62cabdff1aSopenharmony_ci    add offsetq, srcq
; m1 = filterSize replicated across 16-bit words (SPLATW is an x86util
; macro; presumably broadcasts word 0 of xm1 - confirm there).
63cabdff1aSopenharmony_ci    movd                 xm1, filterSized
64cabdff1aSopenharmony_ci    SPLATW               m1, xm1, 0
65cabdff1aSopenharmony_ci    pxor                 m0, m0, m0
; filterSizeq now points at the first filter entry; srcq = its +0 field.
66cabdff1aSopenharmony_ci    mov                  filterSizeq, filterq
67cabdff1aSopenharmony_ci    mov                  srcq, [filterSizeq]
; Zero-extend the dither bytes to words (m0 is zero), then build the
; accumulator start value m7 = (dither + filterSize * 8) >> 4.
68cabdff1aSopenharmony_ci    punpcklbw            m3, m0
69cabdff1aSopenharmony_ci    psllw                m1, m1, 3
70cabdff1aSopenharmony_ci    paddw                m3, m3, m1
71cabdff1aSopenharmony_ci    psraw                m7, m3, 4
; Outer loop: one pass per mmsize * unroll output bytes. Entry invariant:
; filterSizeq = filterq and srcq = [filterq].
72cabdff1aSopenharmony_ci.outerloop:
; Seed every accumulator with the dither/rounding term: m3/m4 always,
; plus m6/m1 for the second block on SSE3 and later.
73cabdff1aSopenharmony_ci    mova                 m4, m7
74cabdff1aSopenharmony_ci    mova                 m3, m7
75cabdff1aSopenharmony_ci%if cpuflag(sse3)
76cabdff1aSopenharmony_ci    mova                 m6, m7
77cabdff1aSopenharmony_ci    mova                 m1, m7
78cabdff1aSopenharmony_ci%endif
; Inner loop: one pass per filter entry.
79cabdff1aSopenharmony_ci.loop:
; m0 = the entry's 8 coefficient bytes, replicated to fill the register on
; SSE3/AVX2 (on MMX the register is exactly 8 bytes wide).
80cabdff1aSopenharmony_ci%if cpuflag(avx2)
81cabdff1aSopenharmony_ci    vpbroadcastq         m0, [filterSizeq + 8]
82cabdff1aSopenharmony_ci%elif cpuflag(sse3)
83cabdff1aSopenharmony_ci    movddup              m0, [filterSizeq + 8]
84cabdff1aSopenharmony_ci%else
85cabdff1aSopenharmony_ci    mova                 m0, [filterSizeq + 8]
86cabdff1aSopenharmony_ci%endif
; High 16 bits of the signed word products coeff * src; the source is
; indexed as 16-bit words, hence offsetq * 2.
; NOTE(review): in the non-VEX SSE build these are memory operands of
; pmulhw, which require 16-byte alignment - confirm the callers guarantee it.
87cabdff1aSopenharmony_ci    pmulhw               m2, m0, [srcq + offsetq * 2]
88cabdff1aSopenharmony_ci    pmulhw               m5, m0, [srcq + offsetq * 2 + mmsize]
89cabdff1aSopenharmony_ci    paddw                m3, m3, m2
90cabdff1aSopenharmony_ci    paddw                m4, m4, m5
91cabdff1aSopenharmony_ci%if cpuflag(sse3)
92cabdff1aSopenharmony_ci    pmulhw               m2, m0, [srcq + offsetq * 2 + 2 * mmsize]
93cabdff1aSopenharmony_ci    pmulhw               m5, m0, [srcq + offsetq * 2 + 3 * mmsize]
94cabdff1aSopenharmony_ci    paddw                m6, m6, m2
95cabdff1aSopenharmony_ci    paddw                m1, m1, m5
96cabdff1aSopenharmony_ci%endif
; Step to the next 16-byte entry and fetch its source pointer; a zero
; pointer ends the list.
97cabdff1aSopenharmony_ci    add                  filterSizeq, $10
98cabdff1aSopenharmony_ci    mov                  srcq, [filterSizeq]
99cabdff1aSopenharmony_ci    test                 srcq, srcq
100cabdff1aSopenharmony_ci    jnz                  .loop
; Final arithmetic shift of the sums, then pack pairs of word registers
; into unsigned bytes with saturation.
101cabdff1aSopenharmony_ci    psraw                m3, m3, 3
102cabdff1aSopenharmony_ci    psraw                m4, m4, 3
103cabdff1aSopenharmony_ci%if cpuflag(sse3)
104cabdff1aSopenharmony_ci    psraw                m6, m6, 3
105cabdff1aSopenharmony_ci    psraw                m1, m1, 3
106cabdff1aSopenharmony_ci%endif
107cabdff1aSopenharmony_ci    packuswb             m3, m3, m4
108cabdff1aSopenharmony_ci%if cpuflag(sse3)
109cabdff1aSopenharmony_ci    packuswb             m6, m6, m1
110cabdff1aSopenharmony_ci%endif
; srcq held the zero terminator; reload the first entry's source pointer
; for the next outer iteration.
111cabdff1aSopenharmony_ci    mov                  srcq, [filterq]
; 256-bit packuswb packs within each 128-bit half, leaving the quadwords
; interleaved; 216 (0xD8) selects quadwords 0,2,1,3 to restore linear order.
112cabdff1aSopenharmony_ci%if cpuflag(avx2)
113cabdff1aSopenharmony_ci    vpermq               m3, m3, 216
114cabdff1aSopenharmony_ci    vpermq               m6, m6, 216
115cabdff1aSopenharmony_ci%endif
; Store the packed bytes at dest + offset.
116cabdff1aSopenharmony_ci    movr                 [destq + offsetq], m3
117cabdff1aSopenharmony_ci%if cpuflag(sse3)
118cabdff1aSopenharmony_ci    movr                 [destq + offsetq + mmsize], m6
119cabdff1aSopenharmony_ci%endif
; Advance by the number of bytes just written, rewind the filter cursor,
; and repeat while offsetq < dstWq (unsigned compare). The body always runs
; at least once because the test is at the bottom.
120cabdff1aSopenharmony_ci    add                  offsetq, mmsize * unroll
121cabdff1aSopenharmony_ci    mov                  filterSizeq, filterq
122cabdff1aSopenharmony_ci    cmp                  offsetq, dstWq
123cabdff1aSopenharmony_ci    jb                  .outerloop
124cabdff1aSopenharmony_ci    REP_RET
125cabdff1aSopenharmony_ci%endmacro
126cabdff1aSopenharmony_ci
; Expand YUV2YUVX_FUNC once per instruction set. Each INIT_* line (macros
; from the included x86 helper headers) is expected to select the register
; width and cpu flags that the macro body tests via cpuflag()/mmsize -
; confirm the exact semantics there.
127cabdff1aSopenharmony_ciINIT_MMX mmx
128cabdff1aSopenharmony_ciYUV2YUVX_FUNC
129cabdff1aSopenharmony_ciINIT_MMX mmxext
130cabdff1aSopenharmony_ciYUV2YUVX_FUNC
131cabdff1aSopenharmony_ciINIT_XMM sse3
132cabdff1aSopenharmony_ciYUV2YUVX_FUNC
; The AVX2 variant is only assembled when HAVE_AVX2_EXTERNAL is non-zero.
133cabdff1aSopenharmony_ci%if HAVE_AVX2_EXTERNAL
134cabdff1aSopenharmony_ciINIT_YMM avx2
135cabdff1aSopenharmony_ciYUV2YUVX_FUNC
136cabdff1aSopenharmony_ci%endif
137