/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "libavutil/mem_internal.h"
#include "libavutil/ppc/util_altivec.h"

/* This code assumes that stride % 16 == 0. */

/*
 * Filter one 8-pixel row with the four-tap kernel and write it to dst.
 *
 * This macro expands inside the motion-compensation functions of this file
 * and reads/writes their local variables directly:
 *   - vsrc0ssH / vsrc1ssH: current row at offsets 0 and +1, already held as
 *     16-bit lanes;
 *   - vsrc2uc / vsrc3uc:   next row at offsets 0 and +1, still 8-bit;
 *   - vA, vB, vC, vD:      per-tap weights, v6us: shift count;
 *   - fperm:               mask choosing which half of the destination
 *                          vector receives the new pixels.
 *
 * BIAS1 is the initial addend of the multiply-add chain.
 * BIAS2 is a function-like macro applied to the sum before the shift
 * (noop or add28 in this file).
 *
 * Computes (A*s0 + B*s1 + C*s2 + D*s3 + BIAS1, then BIAS2) >> 6, packs the
 * result to bytes, merges it with the 16-byte vector loaded from dst,
 * applies OP_U8_ALTIVEC (expected from the includer) and stores it back.
 * Afterwards the next row becomes the current row and dst/src advance by
 * stride.
 *
 * Wrapped in do { } while (0) so it behaves as a single statement; call
 * sites must terminate it with ';'.
 */
#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
    do { \
        /* interleave with zero bytes and view as 16-bit lanes; presumably \
         * a zero-extension of the first 8 pixels (VEC_MERGEH is defined \
         * elsewhere) */ \
        vsrc2ssH = (vec_s16)VEC_MERGEH(zero_u8v, (vec_u8)vsrc2uc); \
        vsrc3ssH = (vec_s16)VEC_MERGEH(zero_u8v, (vec_u8)vsrc3uc); \
\
        psum = vec_mladd(vA, vsrc0ssH, BIAS1); \
        psum = vec_mladd(vB, vsrc1ssH, psum); \
        psum = vec_mladd(vC, vsrc2ssH, psum); \
        psum = vec_mladd(vD, vsrc3ssH, psum); \
        psum = BIAS2(psum); \
        psum = vec_sr(psum, v6us); \
\
        /* read-modify-write of the whole 16-byte destination vector */ \
        vdst  = vec_ld(0, dst); \
        ppsum = (vec_u8)vec_pack(psum, psum); \
        vfdst = vec_perm(vdst, ppsum, fperm); \
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst); \
\
        vec_st(fsum, 0, dst); \
\
        /* the row just used as "next" is the "current" row next time */ \
        vsrc0ssH = vsrc2ssH; \
        vsrc1ssH = vsrc3ssH; \
\
        dst += stride; \
        src += stride; \
    } while (0)

/*
 * Filter one 8-pixel row with a two-tap kernel and write it to dst.
 *
 * Used when the diagonal weight is zero, so only two of the four taps
 * contribute. Expands inside the motion-compensation function and uses its
 * locals directly:
 *   - vsrc0uc / vsrc1uc: the two 8-bit source vectors to blend;
 *   - vA, vE:            their weights (vE is declared by the caller);
 *   - v32ss, v6us:       rounding bias and shift count;
 *   - fperm:             mask choosing which half of the destination
 *                        vector receives the new pixels.
 *
 * Computes (A*s0 + E*s1 + 32) >> 6, packs to bytes, merges with the
 * 16-byte vector loaded from dst, applies OP_U8_ALTIVEC (expected from the
 * includer), stores, then advances dst and src by stride.
 *
 * Wrapped in do { } while (0) so it behaves as a single statement; call
 * sites must terminate it with ';'.
 */
#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
    do { \
        /* interleave with zero bytes and view as 16-bit lanes; presumably \
         * a zero-extension of the first 8 pixels (VEC_MERGEH is defined \
         * elsewhere) */ \
        vsrc0ssH = (vec_s16)VEC_MERGEH(zero_u8v, (vec_u8)vsrc0uc); \
        vsrc1ssH = (vec_s16)VEC_MERGEH(zero_u8v, (vec_u8)vsrc1uc); \
\
        psum = vec_mladd(vA, vsrc0ssH, v32ss); \
        psum = vec_mladd(vE, vsrc1ssH, psum); \
        psum = vec_sr(psum, v6us); \
\
        /* read-modify-write of the whole 16-byte destination vector */ \
        vdst  = vec_ld(0, dst); \
        ppsum = (vec_u8)vec_pack(psum, psum); \
        vfdst = vec_perm(vdst, ppsum, fperm); \
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst); \
\
        vec_st(fsum, 0, dst); \
\
        dst += stride; \
        src += stride; \
    } while (0)

/*
 * BIAS2 hooks for CHROMA_MC8_ALTIVEC_CORE.
 *   noop(a):  leave the sum unchanged.
 *   add28(a): add the caller's v28ss vector to the sum.
 * Arguments are parenthesized so an expression argument keeps its meaning.
 */
#define noop(a)  (a)
#define add28(a) vec_add(v28ss, (a))

/*
 * Source-row loaders.
 *
 *   GET_VSRC1(vs0, off, b, perm0, s)
 *       vs0 <- source bytes beginning at s + off.
 *   GET_VSRC(vs0, vs1, off, b, perm0, perm1, s)
 *       vs0 <- source bytes beginning at s + off,
 *       vs1 <- source bytes beginning at s + off + 1.
 *
 * Big-endian build: vec_ld only reads aligned 16-byte blocks, so the row is
 * assembled from one or two aligned loads with vec_perm.
 *   - b is the byte distance from off to the second aligned load.
 *   - perm0 / perm1 are the permute masks for offsets 0 and +1.
 *   - The caller's locals loadSecond and reallyBadAlign are read directly:
 *     when loadSecond is 0 the second load is skipped and the first block is
 *     reused, so only bytes that lie inside the first block are meaningful;
 *     when reallyBadAlign is set, vs1 is taken straight from the second
 *     block instead of being permuted.
 *
 * Little-endian build: vec_vsx_ld is used directly on s + off and
 * s + off + 1; b, perm0 and perm1 are ignored.
 *
 * Both forms are wrapped in do { } while (0) so they behave as a single
 * statement (safe under an unbraced if/else); call sites must terminate
 * them with ';'.
 */
#if HAVE_BIGENDIAN
#define GET_VSRC1(vs0, off, b, perm0, s)                    \
    do {                                                    \
        vec_u8 vsrcCuc, vsrcDuc;                            \
        vsrcCuc = vec_ld((off), (s));                       \
        if (loadSecond)                                     \
            vsrcDuc = vec_ld((off) + (b), (s));             \
        else                                                \
            vsrcDuc = vsrcCuc;                              \
                                                            \
        vs0 = vec_perm(vsrcCuc, vsrcDuc, (perm0));          \
    } while (0)
#define GET_VSRC(vs0, vs1, off, b, perm0, perm1, s)         \
    do {                                                    \
        vec_u8 vsrcCuc, vsrcDuc;                            \
        vsrcCuc = vec_ld((off), (s));                       \
        if (loadSecond)                                     \
            vsrcDuc = vec_ld((off) + (b), (s));             \
        else                                                \
            vsrcDuc = vsrcCuc;                              \
                                                            \
        vs0 = vec_perm(vsrcCuc, vsrcDuc, (perm0));          \
        if (reallyBadAlign)                                 \
            vs1 = vsrcDuc;                                  \
        else                                                \
            vs1 = vec_perm(vsrcCuc, vsrcDuc, (perm1));      \
    } while (0)

#else

#define GET_VSRC1(vs0, off, b, perm0, s)                    \
    do {                                                    \
        vs0 = vec_vsx_ld((off), (s));                       \
    } while (0)
#define GET_VSRC(vs0, vs1, off, b, perm0, perm1, s)         \
    do {                                                    \
        vs0 = vec_vsx_ld((off), (s));                       \
        vs1 = vec_vsx_ld((off) + 1, (s));                   \
    } while (0)
#endif /* HAVE_BIGENDIAN */

#ifdef PREFIX_h264_chroma_mc8_altivec
/**
 * 8-pixel-wide chroma motion compensation with bilinear weighting.
 *
 * For every output row the four neighbouring source pixels are blended with
 * the weights A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy and the result
 * is (sum + 32) >> 6. The blended row is merged into dst through
 * OP_U8_ALTIVEC, which the including file is expected to define.
 *
 * @param dst    destination; written with aligned 16-byte vector stores, so
 *               it is presumably 8-byte aligned (either half of a vector)
 * @param src    source; may be unaligned
 * @param stride distance in bytes between rows of both dst and src; the
 *               code assumes stride % 16 == 0
 * @param h      number of rows to produce
 * @param x      horizontal fraction in 1/8 units (presumably 0..7)
 * @param y      vertical fraction in 1/8 units (presumably 0..7)
 *
 * Many local names below are referenced by the CHROMA_MC8_ALTIVEC_CORE*,
 * GET_VSRC* macros and must not be renamed.
 */
static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                           ptrdiff_t stride, int h,
                                           int x, int y)
{
    /* Bilinear weights A, B, C, D; algebraically they sum to 64. */
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    vec_u8 fperm;
    LOAD_ZERO;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    /* 16-bit elements 1, 3, 5, 7 of the 32-bit weight vector, each
     * replicated across a vector (element selection is up to VEC_SPLAT16,
     * which is defined elsewhere). */
    const vec_s16 vA = VEC_SPLAT16(vABCD, 1);
    const vec_s16 vB = VEC_SPLAT16(vABCD, 3);
    const vec_s16 vC = VEC_SPLAT16(vABCD, 5);
    const vec_s16 vD = VEC_SPLAT16(vABCD, 7);
    const vec_s16 v32ss = vec_sl(vec_splat_s16(1), vec_splat_u16(5)); /* 1 << 5 */
    const vec_u16 v6us  = vec_splat_u16(6);

    vec_u8 vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;
#if HAVE_BIGENDIAN
    const unsigned src_misalign = (uintptr_t)src % 16;
    /* With a misalignment of at most 7, the bytes src[0..8] all sit in one
     * aligned 16-byte block, so the second aligned load can be skipped. */
    const int loadSecond     = src_misalign > 7;
    /* At misalignment 15 the +1 row starts exactly at the second block. */
    const int reallyBadAlign = src_misalign == 15;
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);
#endif

    /* vec_perm(vdst, ppsum, fperm): indices 0x00-0x0F pick from the old
     * destination vector, 0x10-0x1F from the packed result. */
    if ((uintptr_t)dst % 16 == 0) {
        /* new pixels go to the first 8 bytes, last 8 bytes are kept */
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        /* first 8 bytes are kept, new pixels go to the last 8 bytes */
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    /* Prime the pipeline with the first source row. */
    GET_VSRC(vsrc0uc, vsrc1uc, 0, 16, vsrcperm0, vsrcperm1, src);

    if (ABCD[3]) {
        /* D != 0: full four-tap filter. The core macro expects the current
         * row already in 16-bit form. */
        vsrc0ssH = (vec_s16)VEC_MERGEH(zero_u8v, (vec_u8)vsrc0uc);
        vsrc1ssH = (vec_s16)VEC_MERGEH(zero_u8v, (vec_u8)vsrc1uc);

        for (int i = 0; i < h; i++) {
            GET_VSRC(vsrc2uc, vsrc3uc, stride, 16, vsrcperm0, vsrcperm1, src);
            CHROMA_MC8_ALTIVEC_CORE(v32ss, noop);
        }
    } else {
        /* D == 0: at most one of B and C is non-zero, so their sum is the
         * single remaining second-tap weight. */
        const vec_s16 vE = vec_add(vB, vC);

        if (ABCD[2]) {
            /* x == 0, B == 0: vertical-only blend of this row and the next */
            for (int i = 0; i < h; i++) {
                GET_VSRC1(vsrc1uc, stride, 15, vsrcperm0, src);
                CHROMA_MC8_ALTIVEC_CORE_SIMPLE;
                vsrc0uc = vsrc1uc;
            }
        } else {
            /* y == 0, C == 0: horizontal-only blend within the row */
            for (int i = 0; i < h; i++) {
                GET_VSRC(vsrc0uc, vsrc1uc, 0, 15, vsrcperm0, vsrcperm1, src);
                CHROMA_MC8_ALTIVEC_CORE_SIMPLE;
            }
        }
    }
}
#endif

/* This code assumes that stride % 16 == 0. */
#ifdef PREFIX_no_rnd_vc1_chroma_mc8_altivec
/**
 * 8-pixel-wide chroma motion compensation, "no rounding" variant.
 *
 * Same four-tap bilinear blend as the H.264 routine of this file (weights
 * A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy), but the bias added
 * before the shift is 28 instead of 32: result = (sum + 28) >> 6. The
 * four-tap path is always taken, whatever the weights are. The blended row
 * is merged into dst through OP_U8_ALTIVEC, which the including file is
 * expected to define.
 *
 * @param dst    destination; written with aligned 16-byte vector stores, so
 *               it is presumably 8-byte aligned (either half of a vector)
 * @param src    source; may be unaligned
 * @param stride distance in bytes between rows of both dst and src; the
 *               code assumes stride % 16 == 0
 * @param h      number of rows to produce
 * @param x      horizontal fraction in 1/8 units (presumably 0..7)
 * @param y      vertical fraction in 1/8 units (presumably 0..7)
 *
 * Many local names below are referenced by the CHROMA_MC8_ALTIVEC_CORE,
 * GET_VSRC and add28 macros and must not be renamed.
 */
static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t *dst, uint8_t *src,
                                                 ptrdiff_t stride, int h,
                                                 int x, int y)
{
    /* Bilinear weights A, B, C, D; algebraically they sum to 64. */
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    vec_u8 fperm;
    LOAD_ZERO;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    /* 16-bit elements 1, 3, 5, 7 of the 32-bit weight vector, each
     * replicated across a vector (element selection is up to VEC_SPLAT16,
     * which is defined elsewhere). */
    const vec_s16 vA = VEC_SPLAT16(vABCD, 1);
    const vec_s16 vB = VEC_SPLAT16(vABCD, 3);
    const vec_s16 vC = VEC_SPLAT16(vABCD, 5);
    const vec_s16 vD = VEC_SPLAT16(vABCD, 7);
    /* (1 << 5) - 4 == 28 */
    const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1), vec_splat_u16(5)),
                                  vec_splat_s16(4));
    const vec_u16 v6us  = vec_splat_u16(6);

    vec_u8 vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;
#if HAVE_BIGENDIAN
    const unsigned src_misalign = (uintptr_t)src % 16;
    /* With a misalignment of at most 7, the bytes src[0..8] all sit in one
     * aligned 16-byte block, so the second aligned load can be skipped. */
    const int loadSecond     = src_misalign > 7;
    /* At misalignment 15 the +1 row starts exactly at the second block. */
    const int reallyBadAlign = src_misalign == 15;
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);
#endif

    /* vec_perm(vdst, ppsum, fperm): indices 0x00-0x0F pick from the old
     * destination vector, 0x10-0x1F from the packed result. */
    if ((uintptr_t)dst % 16 == 0) {
        /* new pixels go to the first 8 bytes, last 8 bytes are kept */
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        /* first 8 bytes are kept, new pixels go to the last 8 bytes */
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    /* Prime the pipeline with the first source row, in 16-bit form. */
    GET_VSRC(vsrc0uc, vsrc1uc, 0, 16, vsrcperm0, vsrcperm1, src);

    vsrc0ssH = (vec_s16)VEC_MERGEH(zero_u8v, (vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)VEC_MERGEH(zero_u8v, (vec_u8)vsrc1uc);

    for (int i = 0; i < h; i++) {
        GET_VSRC(vsrc2uc, vsrc3uc, stride, 16, vsrcperm0, vsrcperm1, src);
        /* zero initial bias; add28 supplies the 28 just before the shift */
        CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28);
    }
}
#endif

/*
 * Drop every helper macro defined by this template so that nothing leaks
 * into the including file and the template can be included again
 * (presumably with a different PREFIX_* / OP_U8_ALTIVEC setup).
 */
#undef noop
#undef add28
#undef CHROMA_MC8_ALTIVEC_CORE
#undef CHROMA_MC8_ALTIVEC_CORE_SIMPLE
#undef GET_VSRC1
#undef GET_VSRC