/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
#include "libavutil/ppc/util_altivec.h"

#include "libavcodec/avcodec.h"
#include "libavcodec/pixblockdsp.h"

#if HAVE_ALTIVEC

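/* For orientation, both get_pixels variants below are vector forms of
 * this scalar copy (a reference sketch only, not part of the build):
 *
 *     for (int i = 0; i < 8; i++)
 *         for (int j = 0; j < 8; j++)
 *             block[i * 8 + j] = pixels[i * stride + j];
 */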
#if HAVE_VSX
static void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels,
                               ptrdiff_t stride)
{
    int i;
    vector unsigned char perm =
        (vector unsigned char) { 0x00, 0x10, 0x01, 0x11, 0x02, 0x12, 0x03, 0x13,
                                 0x04, 0x14, 0x05, 0x15, 0x06, 0x16, 0x07, 0x17 };
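    /* The permute interleaves each of the low eight pixel bytes with a
     * zero byte from the zero vector, widening each u8 pixel into an
     * s16 lane. */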
    const vector unsigned char zero =
        (const vector unsigned char) vec_splat_u8(0);

    for (i = 0; i < 8; i++) {
        /* Read potentially unaligned pixels.
         * We're reading 16 pixels, and actually only want 8,
         * but we simply ignore the extras. */
        vector unsigned char bytes = vec_vsx_ld(0, pixels);

        // Convert the bytes into shorts.
        vector signed short shorts = (vector signed short) vec_perm(bytes, zero, perm);

        // Save the data to the block; we assume the block is 16-byte aligned.
        vec_vsx_st(shorts, i * 16, (vector signed short *) block);

        pixels += stride;
    }
}
#else
static void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels,
                               ptrdiff_t stride)
{
    int i;
    const vec_u8 zero = (const vec_u8)vec_splat_u8(0);

    for (i = 0; i < 8; i++) {
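        /* vec_lvsl() yields the shuffle mask that realigns the pair of
         * aligned vec_ld() loads below onto the unaligned address. */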
        vec_u8 perm = vec_lvsl(0, pixels);
        /* Read potentially unaligned pixels.
         * We're reading 16 pixels, and actually only want 8,
         * but we simply ignore the extras. */
        vec_u8 pixl = vec_ld(0, pixels);
        vec_u8 pixr = vec_ld(7, pixels);
        vec_u8 bytes = vec_perm(pixl, pixr, perm);

        // Convert the bytes into shorts.
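        /* vec_mergeh() interleaves the zero vector ahead of the pixel
         * bytes; read back as big-endian 16-bit lanes, that is each u8
         * zero-extended to s16. */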
        vec_s16 shorts = (vec_s16)vec_mergeh(zero, bytes);

        // Save the data to the block; we assume the block is 16-byte aligned.
        vec_st(shorts, i * 16, (vec_s16 *)block);

        pixels += stride;
    }
}

#endif /* HAVE_VSX */

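/* Both diff_pixels variants below vectorize this scalar difference
 * (a reference sketch only, not part of the build):
 *
 *     for (int i = 0; i < 8; i++)
 *         for (int j = 0; j < 8; j++)
 *             block[i * 8 + j] = s1[i * stride + j] - s2[i * stride + j];
 */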
#if HAVE_VSX
static void diff_pixels_altivec(int16_t *restrict block, const uint8_t *s1,
                                const uint8_t *s2, ptrdiff_t stride)
{
    int i;
    const vector unsigned char zero =
        (const vector unsigned char) vec_splat_u8(0);
    vector signed short shorts1, shorts2;

    for (i = 0; i < 4; i++) {
        /* Read potentially unaligned pixels.
         * We're reading 16 pixels, and actually only want 8,
         * but we simply ignore the extras. */
        vector unsigned char bytes = vec_vsx_ld(0, s1);

        // Convert the bytes into shorts.
        shorts1 = (vector signed short) vec_mergeh(bytes, zero);

        // Do the same for the second block of pixels.
        bytes = vec_vsx_ld(0, s2);

        // Convert the bytes into shorts.
        shorts2 = (vector signed short) vec_mergeh(bytes, zero);

        // Do the subtraction.
        shorts1 = vec_sub(shorts1, shorts2);

        // Save the data to the block; we assume the block is 16-byte aligned.
        vec_vsx_st(shorts1, 0, (vector signed short *) block);

        s1    += stride;
        s2    += stride;
        block += 8;

        /* The code below is a copy of the code above...
         * This is a manual unroll. */

        /* Read potentially unaligned pixels.
         * We're reading 16 pixels, and actually only want 8,
         * but we simply ignore the extras. */
        bytes = vec_vsx_ld(0, s1);

        // Convert the bytes into shorts.
        shorts1 = (vector signed short) vec_mergeh(bytes, zero);

        // Do the same for the second block of pixels.
        bytes = vec_vsx_ld(0, s2);

        // Convert the bytes into shorts.
        shorts2 = (vector signed short) vec_mergeh(bytes, zero);

        // Do the subtraction.
        shorts1 = vec_sub(shorts1, shorts2);

        // Save the data to the block; we assume the block is 16-byte aligned.
        vec_vsx_st(shorts1, 0, (vector signed short *) block);

        s1    += stride;
        s2    += stride;
        block += 8;
    }
}
#else
static void diff_pixels_altivec(int16_t *restrict block, const uint8_t *s1,
                                const uint8_t *s2, ptrdiff_t stride)
{
    int i;
    vec_u8 perm;
    const vec_u8 zero = (const vec_u8)vec_splat_u8(0);
    vec_s16 shorts1, shorts2;

    for (i = 0; i < 4; i++) {
        /* Read potentially unaligned pixels.
         * We're reading 16 pixels, and actually only want 8,
         * but we simply ignore the extras. */
        perm = vec_lvsl(0, s1);
        vec_u8 pixl  = vec_ld(0,  s1);
        vec_u8 pixr  = vec_ld(15, s1);
        vec_u8 bytes = vec_perm(pixl, pixr, perm);

        // Convert the bytes into shorts.
        shorts1 = (vec_s16)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels.
        perm = vec_lvsl(0, s2);
        pixl  = vec_ld(0,  s2);
        pixr  = vec_ld(15, s2);
        bytes = vec_perm(pixl, pixr, perm);

        // Convert the bytes into shorts.
        shorts2 = (vec_s16)vec_mergeh(zero, bytes);

        // Do the subtraction.
        shorts1 = vec_sub(shorts1, shorts2);

        // Save the data to the block; we assume the block is 16-byte aligned.
        vec_st(shorts1, 0, (vec_s16 *)block);

        s1    += stride;
        s2    += stride;
        block += 8;

        /* The code below is a copy of the code above...
         * This is a manual unroll. */

        /* Read potentially unaligned pixels.
         * We're reading 16 pixels, and actually only want 8,
         * but we simply ignore the extras. */
        perm = vec_lvsl(0, s1);
        pixl  = vec_ld(0,  s1);
        pixr  = vec_ld(15, s1);
        bytes = vec_perm(pixl, pixr, perm);

        // Convert the bytes into shorts.
        shorts1 = (vec_s16)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels.
        perm = vec_lvsl(0, s2);
        pixl  = vec_ld(0,  s2);
        pixr  = vec_ld(15, s2);
        bytes = vec_perm(pixl, pixr, perm);

        // Convert the bytes into shorts.
        shorts2 = (vec_s16)vec_mergeh(zero, bytes);

        // Do the subtraction.
        shorts1 = vec_sub(shorts1, shorts2);

        // Save the data to the block; we assume the block is 16-byte aligned.
        vec_st(shorts1, 0, (vec_s16 *)block);

        s1    += stride;
        s2    += stride;
        block += 8;
    }
}

#endif /* HAVE_VSX */

#endif /* HAVE_ALTIVEC */

#if HAVE_VSX
static void get_pixels_vsx(int16_t *restrict block, const uint8_t *pixels,
                           ptrdiff_t stride)
{
    int i;
    for (i = 0; i < 8; i++) {
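        /* vsx_ld_u8_s16() (from util_altivec.h) loads eight unaligned
         * bytes and widens them to signed 16-bit lanes in one step. */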
        vec_s16 shorts = vsx_ld_u8_s16(0, pixels);

        vec_vsx_st(shorts, i * 16, block);

        pixels += stride;
    }
}

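/* Because vsx_ld_u8_s16() already widens the unaligned loads, one
 * subtract per row suffices and no manual unroll is needed here. */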
static void diff_pixels_vsx(int16_t *restrict block, const uint8_t *s1,
                            const uint8_t *s2, ptrdiff_t stride)
{
    int i;
    vec_s16 shorts1, shorts2;
    for (i = 0; i < 8; i++) {
        shorts1 = vsx_ld_u8_s16(0, s1);
        shorts2 = vsx_ld_u8_s16(0, s2);

        shorts1 = vec_sub(shorts1, shorts2);

        vec_vsx_st(shorts1, 0, block);

        s1    += stride;
        s2    += stride;
        block += 8;
    }
}
#endif /* HAVE_VSX */

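/* Runtime dispatch, reached from the generic pixblockdsp initializer on
 * PowerPC builds: the AltiVec versions are installed first, and the VSX
 * versions then override them when the CPU also reports VSX support. */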
av_cold void ff_pixblockdsp_init_ppc(PixblockDSPContext *c,
                                     AVCodecContext *avctx,
                                     unsigned high_bit_depth)
{
#if HAVE_ALTIVEC
    if (!PPC_ALTIVEC(av_get_cpu_flags()))
        return;

    c->diff_pixels = diff_pixels_altivec;

    if (!high_bit_depth) {
        c->get_pixels = get_pixels_altivec;
    }
#endif /* HAVE_ALTIVEC */

#if HAVE_VSX
    if (!PPC_VSX(av_get_cpu_flags()))
        return;

    c->diff_pixels = diff_pixels_vsx;

    if (!high_bit_depth)
        c->get_pixels = get_pixels_vsx;
#endif /* HAVE_VSX */
}