/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mem_internal.h"
#include "libavutil/ppc/util_altivec.h"

/* this code assumes that stride % 16 == 0 */

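/*
 * CHROMA_MC8_ALTIVEC_CORE processes one row of eight pixels of the 2D
 * bilinear chroma interpolation.  With vA..vD holding the splatted weights
 * and vsrc0/1 (current row, offsets 0 and 1) and vsrc2/3 (next row) already
 * loaded, it is roughly the vector form of the scalar
 *
 *     dst[i] = (A * src[i]          + B * src[i + 1] +
 *               C * src[i + stride] + D * src[i + stride + 1] + bias) >> 6;
 *
 * BIAS1 seeds the multiply-accumulate chain and BIAS2 may add a second bias
 * afterwards (noop here, add28 for the no-rounding VC-1 variant below).
 * The 16-bit sums are packed back to bytes and merged with the destination
 * vector through fperm, so only the addressed 8 bytes of dst are modified.
 */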
#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
        vsrc2ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc2uc);\
        vsrc3ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc3uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
        psum = vec_mladd(vB, vsrc1ssH, psum);\
        psum = vec_mladd(vC, vsrc2ssH, psum);\
        psum = vec_mladd(vD, vsrc3ssH, psum);\
        psum = BIAS2(psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        vsrc0ssH = vsrc2ssH;\
        vsrc1ssH = vsrc3ssH;\
\
        dst += stride;\
        src += stride;

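/*
 * CHROMA_MC8_ALTIVEC_CORE_SIMPLE is the reduced form used when one of the
 * fractional offsets is zero and the interpolation degenerates to two taps:
 *
 *     dst[i] = (A * src[i] + E * src[i + step] + 32) >> 6;
 *
 * where vE = vB + vC and step is either 1 (y == 0) or stride (x == 0),
 * depending on which source vectors the caller loads into vsrc0uc/vsrc1uc.
 */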
#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
\
        vsrc0ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc0uc);\
        vsrc1ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc1uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, v32ss);\
        psum = vec_mladd(vE, vsrc1ssH, psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        dst += stride;\
        src += stride;

#define noop(a) a
#define add28(a) vec_add(v28ss, a)

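/*
 * GET_VSRC1 loads one unaligned 16-byte source vector; GET_VSRC also loads
 * its one-byte-shifted neighbour for the horizontal tap.  On big-endian
 * AltiVec this uses aligned vec_ld plus vec_perm with the permutation
 * vectors from vec_lvsl; the second aligned load is skipped when the nine
 * bytes needed fit in a single 16-byte block (loadSecond == 0), and
 * reallyBadAlign covers src % 16 == 15, where vec_lvsl(1, src) wraps to
 * offset 0 and the permute would pick the wrong input, so vs1 is taken
 * from the second load directly.  The little-endian build simply uses the
 * unaligned VSX loads vec_vsx_ld.
 */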
#if HAVE_BIGENDIAN
#define GET_VSRC1(vs0, off, b, perm0, s){    \
    vec_u8 vsrcCuc, vsrcDuc;                 \
    vsrcCuc = vec_ld(off, s);                \
    if (loadSecond){                         \
        vsrcDuc = vec_ld(off + b, s);        \
    } else                                   \
        vsrcDuc = vsrcCuc;                   \
                                             \
    vs0 = vec_perm(vsrcCuc, vsrcDuc, perm0); \
}
#define GET_VSRC(vs0, vs1, off, b, perm0, perm1, s){ \
    vec_u8 vsrcCuc, vsrcDuc;                         \
    vsrcCuc = vec_ld(off, s);                        \
    if (loadSecond){                                 \
        vsrcDuc = vec_ld(off + b, s);                \
    } else                                           \
        vsrcDuc = vsrcCuc;                           \
                                                     \
    vs0 = vec_perm(vsrcCuc, vsrcDuc, perm0);         \
    if (reallyBadAlign){                             \
        vs1 = vsrcDuc;                               \
    } else                                           \
        vs1 = vec_perm(vsrcCuc, vsrcDuc, perm1);     \
 }

#else

#define GET_VSRC1(vs0, off, b, perm0, s){            \
    vs0 = vec_vsx_ld(off, s);                        \
 }
#define GET_VSRC(vs0, vs1, off, b, perm0, perm1, s){ \
    vs0 = vec_vsx_ld(off, s);                        \
    vs1 = vec_vsx_ld(off + 1, s);                    \
 }
#endif /* HAVE_BIGENDIAN */

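/*
 * H.264 8xH chroma motion compensation.  x and y are the eighth-pel
 * fractional offsets (0..7), giving the per-tap weights
 *     A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y
 * The code picks the full 2D core, or the two-tap simplified core when
 * x == 0 or y == 0.  OP_U8_ALTIVEC is supplied by the including file and
 * selects how the result is written to dst (e.g. plain store vs. average).
 */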
#ifdef PREFIX_h264_chroma_mc8_altivec
static void PREFIX_h264_chroma_mc8_altivec(uint8_t *dst, uint8_t *src,
                                           ptrdiff_t stride, int h,
                                           int x, int y)
{
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    LOAD_ZERO;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = VEC_SPLAT16(vABCD, 1);
    const vec_s16 vB = VEC_SPLAT16(vABCD, 3);
    const vec_s16 vC = VEC_SPLAT16(vABCD, 5);
    const vec_s16 vD = VEC_SPLAT16(vABCD, 7);
    const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
    const vec_u16 v6us = vec_splat_u16(6);

    vec_u8 vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;
#if HAVE_BIGENDIAN
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);
#endif

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    GET_VSRC(vsrc0uc, vsrc1uc, 0, 16, vsrcperm0, vsrcperm1, src);

    vsrc0ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc1uc);

    if (ABCD[3]) {
        for (i = 0 ; i < h ; i++) {
            GET_VSRC(vsrc2uc, vsrc3uc, stride, 16, vsrcperm0, vsrcperm1, src);
            CHROMA_MC8_ALTIVEC_CORE(v32ss, noop);
        }
    } else {
        const vec_s16 vE = vec_add(vB, vC);
        if (ABCD[2]) { // x == 0 B == 0
            for (i = 0 ; i < h ; i++) {
                GET_VSRC1(vsrc1uc, stride, 15, vsrcperm0, src);
                CHROMA_MC8_ALTIVEC_CORE_SIMPLE;
                vsrc0uc = vsrc1uc;
            }
        } else { // y == 0 C == 0
            for (i = 0 ; i < h ; i++) {
                GET_VSRC(vsrc0uc, vsrc1uc, 0, 15, vsrcperm0, vsrcperm1, src);
                CHROMA_MC8_ALTIVEC_CORE_SIMPLE;
            }
        }
    }
}
#endif

/* this code assumes that stride % 16 == 0 */
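/*
 * VC-1 no-rounding variant of the same 8xH bilinear chroma interpolation.
 * The per-tap weights are identical to the H.264 case above; only the
 * rounding bias differs: BIAS1 is zero and add28 adds v28ss = 32 - 4 = 28
 * after the multiply-accumulate chain, instead of the bias of 32 used above.
 */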
#ifdef PREFIX_no_rnd_vc1_chroma_mc8_altivec
static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t *dst, uint8_t *src,
                                                 ptrdiff_t stride, int h,
                                                 int x, int y)
{
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    LOAD_ZERO;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = VEC_SPLAT16(vABCD, 1);
    const vec_s16 vB = VEC_SPLAT16(vABCD, 3);
    const vec_s16 vC = VEC_SPLAT16(vABCD, 5);
    const vec_s16 vD = VEC_SPLAT16(vABCD, 7);
    const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
    const vec_u16 v6us  = vec_splat_u16(6);

    vec_u8 vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;
#if HAVE_BIGENDIAN
    register int loadSecond     = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);
#endif

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    GET_VSRC(vsrc0uc, vsrc1uc, 0, 16, vsrcperm0, vsrcperm1, src);

    vsrc0ssH = (vec_s16)VEC_MERGEH(zero_u8v, (vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)VEC_MERGEH(zero_u8v, (vec_u8)vsrc1uc);

    for (i = 0 ; i < h ; i++) {
        GET_VSRC(vsrc2uc, vsrc3uc, stride, 16, vsrcperm0, vsrcperm1, src);
        CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28);
    }
}
#endif

#undef noop
#undef add28
#undef CHROMA_MC8_ALTIVEC_CORE