1/*
2 * Copyright (c) 2002 Brian Foley
3 * Copyright (c) 2002 Dieter Shirley
4 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 */
22
23#include "config.h"
24
25#include "libavutil/attributes.h"
26#include "libavutil/cpu.h"
27#include "libavutil/ppc/cpu.h"
28#include "libavutil/ppc/util_altivec.h"
29
30#include "libavcodec/hpeldsp.h"
31
32#include "hpeldsp_altivec.h"
33
34#if HAVE_ALTIVEC
35/* next one assumes that ((line_size % 16) == 0) */
36void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
37{
38    register vector unsigned char pixelsv1;
39    register vector unsigned char pixelsv1B;
40    register vector unsigned char pixelsv1C;
41    register vector unsigned char pixelsv1D;
42
43    int i;
44    register ptrdiff_t line_size_2 = line_size << 1;
45    register ptrdiff_t line_size_3 = line_size + line_size_2;
46    register ptrdiff_t line_size_4 = line_size << 2;
47
48// hand-unrolling the loop by 4 gains about 15%
49// mininum execution time goes from 74 to 60 cycles
50// it's faster than -funroll-loops, but using
51// -funroll-loops w/ this is bad - 74 cycles again.
52// all this is on a 7450, tuning for the 7450
53    for (i = 0; i < h; i += 4) {
54        pixelsv1  = unaligned_load( 0, pixels);
55        pixelsv1B = unaligned_load(line_size, pixels);
56        pixelsv1C = unaligned_load(line_size_2, pixels);
57        pixelsv1D = unaligned_load(line_size_3, pixels);
58        VEC_ST(pixelsv1, 0, (unsigned char*)block);
59        VEC_ST(pixelsv1B, line_size, (unsigned char*)block);
60        VEC_ST(pixelsv1C, line_size_2, (unsigned char*)block);
61        VEC_ST(pixelsv1D, line_size_3, (unsigned char*)block);
62        pixels+=line_size_4;
63        block +=line_size_4;
64    }
65}
66
67/* next one assumes that ((line_size % 16) == 0) */
68#define op_avg(a,b)  a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
69void ff_avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
70{
71    register vector unsigned char pixelsv, blockv;
72
73    int i;
74    for (i = 0; i < h; i++) {
75        blockv = vec_ld(0, block);
76        pixelsv = VEC_LD( 0, pixels);
77        blockv = vec_avg(blockv,pixelsv);
78        vec_st(blockv, 0, (unsigned char*)block);
79        pixels+=line_size;
80        block +=line_size;
81    }
82}
83
84/* next one assumes that ((line_size % 8) == 0) */
85static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
86{
87    register vector unsigned char pixelsv, blockv;
88    int i;
89
90   for (i = 0; i < h; i++) {
91       /* block is 8 bytes-aligned, so we're either in the
92          left block (16 bytes-aligned) or in the right block (not) */
93       int rightside = ((unsigned long)block & 0x0000000F);
94
95       blockv = vec_ld(0, block);
96       pixelsv = VEC_LD( 0, pixels);
97
98       if (rightside) {
99           pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
100       } else {
101           pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
102       }
103
104       blockv = vec_avg(blockv, pixelsv);
105
106       vec_st(blockv, 0, block);
107
108       pixels += line_size;
109       block += line_size;
110   }
111}
112
113/* next one assumes that ((line_size % 8) == 0) */
static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    /* Half-pel interpolation in both x and y:
     *   dst[i] = (p[i] + p[i+1] + p[i+ls] + p[i+ls+1] + 2) >> 2
     * Pixels are widened to 16 bits (merge with zero) so the four-way sum
     * cannot overflow.  pixelssum1 carries the current row's horizontal
     * pair-sum (plus the +2 rounding constant) into the next iteration so
     * each source row is loaded only once. */
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    /* Prime the pipeline with the horizontal pair-sum of the first row. */
    pixelsv1 = VEC_LD(0, pixels);
    pixelsv2 = VEC_LD(1, pixels);
    /* Widen low 8 bytes to unsigned short by interleaving with zero. */
    pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
    pixelsv2 = VEC_MERGEH(vczero, pixelsv2);

    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo); /* fold in the +2 rounding */

    for (i = 0; i < h ; i++) {
        /* block is 8-byte aligned: left or right half of a 16-byte slot. */
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        /* Pair-sum of the next source row (offsets ls and ls+1). */
        pixelsv1 = unaligned_load(line_size, pixels);
        pixelsv2 = unaligned_load(line_size+1, pixels);
        pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
        pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2); /* 4-pixel sum + 2 */
        temp3 = vec_sra(temp3, vctwo);           /* >> 2 */
        pixelssum1 = vec_add(pixelssum2, vctwo); /* carry row sum forward */
        /* Narrow back to bytes; upper half of the result is don't-care. */
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        /* Merge the 8 result bytes into the correct half of blockv. */
        if (rightside) {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}
159
160/* next one assumes that ((line_size % 8) == 0) */
static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    /* No-rounding variant of put_pixels8_xy2:
     *   dst[i] = (p[i] + p[i+1] + p[i+ls] + p[i+ls+1] + 1) >> 2
     * Identical structure to put_pixels8_xy2_altivec except the rounding
     * constant folded into the loop-carried sum is 1 (vcone), not 2. */
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    /* Prime the pipeline with the horizontal pair-sum of the first row,
     * widened to 16 bits so the four-way sum cannot overflow. */
    pixelsv1 = VEC_LD(0, pixels);
    pixelsv2 = VEC_LD(1, pixels);
    pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
    pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone); /* +1: truncating rounding */

    for (i = 0; i < h ; i++) {
        /* block is 8-byte aligned: left or right half of a 16-byte slot. */
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        /* Pair-sum of the next source row (offsets ls and ls+1). */
        pixelsv1 = unaligned_load(line_size, pixels);
        pixelsv2 = unaligned_load(line_size+1, pixels);
        pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
        pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2); /* 4-pixel sum + 1 */
        temp3 = vec_sra(temp3, vctwo);           /* >> 2 */
        pixelssum1 = vec_add(pixelssum2, vcone); /* carry row sum forward */
        /* Narrow back to bytes; upper half of the result is don't-care. */
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        /* Merge the 8 result bytes into the correct half of blockv. */
        if (rightside) {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}
206
207/* next one assumes that ((line_size % 16) == 0) */
static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
{
    /* 16-wide half-pel interpolation in both x and y:
     *   dst[i] = (p[i] + p[i+1] + p[i+ls] + p[i+ls+1] + 2) >> 2
     * The 16 source bytes are widened to two vectors of unsigned shorts
     * (MERGEH = low 8 bytes, MERGEL = high 8 bytes) so sums cannot
     * overflow.  pixelssum1/pixelssum3 carry the current row's pair-sums
     * (plus the +2 rounding constant) into the next iteration so each
     * source row is loaded only once. */
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char blockv;
    register vector unsigned short temp3, temp4,
        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    /* Prime the pipeline: pair-sums of the first row, both halves. */
    pixelsv1 = VEC_LD(0, pixels);
    pixelsv2 = VEC_LD(1, pixels);
    pixelsv3 = VEC_MERGEL(vczero, pixelsv1); /* high 8 bytes -> u16 */
    pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
    pixelsv1 = VEC_MERGEH(vczero, pixelsv1); /* low 8 bytes -> u16 */
    pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vctwo); /* fold in the +2 rounding */
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h ; i++) {
        blockv = vec_ld(0, block);

        /* Pair-sums of the next source row (offsets ls and ls+1). */
        pixelsv1 = unaligned_load(line_size, pixels);
        pixelsv2 = unaligned_load(line_size+1, pixels);

        pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
        pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
        pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
        pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4); /* high half: sum + 2 */
        temp4 = vec_sra(temp4, vctwo);           /* >> 2 */
        temp3 = vec_add(pixelssum1, pixelssum2); /* low half: sum + 2 */
        temp3 = vec_sra(temp3, vctwo);           /* >> 2 */

        /* Carry the new row's sums (re-biased) into the next iteration. */
        pixelssum3 = vec_add(pixelssum4, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);

        /* Pack the two u16 halves back into 16 result bytes. */
        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}
261
262/* next one assumes that ((line_size % 16) == 0) */
static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
{
    /* No-rounding variant of put_pixels16_xy2:
     *   dst[i] = (p[i] + p[i+1] + p[i+ls] + p[i+ls+1] + 1) >> 2
     * Identical structure to put_pixels16_xy2_altivec except the rounding
     * constant folded into the loop-carried sums is 1 (vcone), not 2. */
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char blockv;
    register vector unsigned short temp3, temp4,
        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    /* Prime the pipeline: pair-sums of the first row, both halves,
     * widened to u16 (MERGEH = low 8 bytes, MERGEL = high 8 bytes). */
    pixelsv1 = VEC_LD(0, pixels);
    pixelsv2 = VEC_LD(1, pixels);
    pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
    pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
    pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
    pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vcone); /* +1: truncating rounding */
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    for (i = 0; i < h ; i++) {
        /* Pair-sums of the next source row (offsets ls and ls+1). */
        pixelsv1 = unaligned_load(line_size, pixels);
        pixelsv2 = unaligned_load(line_size+1, pixels);

        pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
        pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
        pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
        pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4); /* high half: sum + 1 */
        temp4 = vec_sra(temp4, vctwo);           /* >> 2 */
        temp3 = vec_add(pixelssum1, pixelssum2); /* low half: sum + 1 */
        temp3 = vec_sra(temp3, vctwo);           /* >> 2 */

        /* Carry the new row's sums (re-biased) into the next iteration. */
        pixelssum3 = vec_add(pixelssum4, vcone);
        pixelssum1 = vec_add(pixelssum2, vcone);

        /* Pack the two u16 halves back into 16 result bytes. */
        blockv = vec_packsu(temp3, temp4);

        VEC_ST(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}
315
316/* next one assumes that ((line_size % 8) == 0) */
static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    /* Averaging half-pel interpolation in both x and y:
     * first compute t = (p[i] + p[i+1] + p[i+ls] + p[i+ls+1] + 2) >> 2,
     * then dst[i] = avg(dst[i], t).  Same pipeline as
     * put_pixels8_xy2_altivec, with a final vec_avg against the existing
     * destination bytes. */
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, blocktemp;
    register vector unsigned short pixelssum1, pixelssum2, temp3;

    register const vector unsigned char vczero = (const vector unsigned char)
                                        vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)
                                        vec_splat_u16(2);

    /* Prime the pipeline: horizontal pair-sum of the first row, widened
     * to u16 so the four-way sum cannot overflow. */
    pixelsv1 = VEC_LD(0, pixels);
    pixelsv2 = VEC_LD(1, pixels);
    pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
    pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo); /* fold in the +2 rounding */

    for (i = 0; i < h ; i++) {
        /* block is 8-byte aligned: left or right half of a 16-byte slot. */
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        /* Pair-sum of the next source row (offsets ls and ls+1). */
        pixelsv1 = unaligned_load(line_size, pixels);
        pixelsv2 = unaligned_load(line_size+1, pixels);

        pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
        pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2); /* 4-pixel sum + 2 */
        temp3 = vec_sra(temp3, vctwo);           /* >> 2 */
        pixelssum1 = vec_add(pixelssum2, vctwo); /* carry row sum forward */
        /* Narrow back to bytes; upper half of the result is don't-care. */
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        /* Splice the 8 interpolated bytes into the correct half; the
         * other half keeps blockv's own bytes so avg leaves it intact. */
        if (rightside) {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        blockv = vec_avg(blocktemp, blockv);
        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}
366#endif /* HAVE_ALTIVEC */
367
av_cold void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags)
{
#if HAVE_ALTIVEC
    /* Install the AltiVec implementations only when the runtime CPU
     * actually supports AltiVec. */
    if (!PPC_ALTIVEC(av_get_cpu_flags()))
        return;

    /* First index: 0 = 16-pixel-wide, 1 = 8-pixel-wide blocks.
     * Second index: 0 = full-pel copy/avg, 3 = half-pel in both x and y. */
    c->avg_pixels_tab[0][0]        = ff_avg_pixels16_altivec;
    c->avg_pixels_tab[1][0]        = avg_pixels8_altivec;
    c->avg_pixels_tab[1][3]        = avg_pixels8_xy2_altivec;

    c->put_pixels_tab[0][0]        = ff_put_pixels16_altivec;
    c->put_pixels_tab[1][3]        = put_pixels8_xy2_altivec;
    c->put_pixels_tab[0][3]        = put_pixels16_xy2_altivec;

    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_altivec;
    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
#endif /* HAVE_ALTIVEC */
}
387