/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
#include "libavutil/ppc/util_altivec.h"

#include "libavcodec/avcodec.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/me_cmp.h"

#if HAVE_ALTIVEC

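/* GET_PERM() and LOAD_PIX() hide the endian-specific way of loading a pixel
 * vector together with the same data shifted by one byte (pix and pix + 1).
 * On big-endian machines two aligned vec_ld() loads are combined with
 * vec_perm() using permute vectors built by vec_lvsl(); on little-endian
 * machines vec_vsx_ld() already handles unaligned addresses, so no permute
 * vectors are needed and GET_PERM() expands to nothing. */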
#if HAVE_BIGENDIAN
#define GET_PERM(per1, per2, pix) {\
    per1 = vec_lvsl(0, pix);\
    per2 = vec_add(per1, vec_splat_u8(1));\
}
#define LOAD_PIX(v, iv, pix, per1, per2) {\
    vector unsigned char pix2l  = vec_ld(0,  pix);\
    vector unsigned char pix2r  = vec_ld(16, pix);\
    v  = vec_perm(pix2l, pix2r, per1);\
    iv = vec_perm(pix2l, pix2r, per2);\
}
#else
#define GET_PERM(per1, per2, pix) {}
#define LOAD_PIX(v, iv, pix, per1, per2) {\
    v  = vec_vsx_ld(0,  pix);\
    iv = vec_vsx_ld(1,  pix);\
}
#endif
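
/* SAD of a 16 x h block against a reference interpolated to the horizontal
 * half-pel position. Roughly the scalar equivalent (illustrative sketch only,
 * not part of the original code):
 *
 *     for (i = 0; i < h; i++) {
 *         for (j = 0; j < 16; j++)
 *             s += FFABS(pix1[j] - ((pix2[j] + pix2[j + 1] + 1) >> 1));
 *         pix1 += stride;
 *         pix2 += stride;
 *     }
 *
 * vec_avg() rounds up, which matches the (a + b + 1) >> 1 average. */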
static int sad16_x2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                            ptrdiff_t stride, int h)
{
    int i;
    int __attribute__((aligned(16))) s = 0;
    const vector unsigned char zero =
        (const vector unsigned char) vec_splat_u8(0);
    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumdiffs;
    vector unsigned char perm1, perm2, pix2v, pix2iv;

    GET_PERM(perm1, perm2, pix2);
    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
         * pix1v: pix1[0] - pix1[15]
         * pix2v: pix2[0] - pix2[15]      pix2iv: pix2[1] - pix2[16] */
        vector unsigned char pix1v  = vec_ld(0,  pix1);
        LOAD_PIX(pix2v, pix2iv, pix2, perm1, perm2);

        /* Calculate the average vector. */
        vector unsigned char avgv = vec_avg(pix2v, pix2iv);

        /* Calculate a sum of abs differences vector. */
        vector unsigned char t5 = vec_sub(vec_max(pix1v, avgv),
                                          vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t5, sad);

        pix1 += stride;
        pix2 += stride;
    }
    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

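/* SAD of a 16 x h block against a reference interpolated to the vertical
 * half-pel position, i.e. roughly (illustrative sketch only):
 *
 *     s += FFABS(pix1[j] - ((pix2[j] + pix2[j + stride] + 1) >> 1));
 *
 * for every pixel of the block. */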
static int sad16_y2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                            ptrdiff_t stride, int h)
{
    int i;
    int __attribute__((aligned(16))) s = 0;
    const vector unsigned char zero =
        (const vector unsigned char) vec_splat_u8(0);
    vector unsigned char pix1v, pix3v, avgv, t5;
    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumdiffs;

    uint8_t *pix3 = pix2 + stride;

    /* Because pix3 = pix2 + stride, the pix3 of one iteration becomes pix2
     * in the next iteration. We can use this to avoid a potentially
     * expensive unaligned read each time around the loop.
     * Read unaligned pixels into our vectors. The vectors are as follows:
     * pix2v: pix2[0] - pix2[15] */
    vector unsigned char pix2v = VEC_LD(0, pix2);

    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
         * pix1v: pix1[0] - pix1[15]
         * pix3v: pix3[0] - pix3[15] */
        pix1v = vec_ld(0,  pix1);
        pix3v = VEC_LD(0,  pix3);

        /* Calculate the average vector. */
        avgv = vec_avg(pix2v, pix3v);

        /* Calculate a sum of abs differences vector. */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t5, sad);

        pix1 += stride;
        pix2v = pix3v;
        pix3 += stride;
    }

    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
    return s;
}

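/* SAD of a 16 x h block against a reference interpolated to the diagonal
 * half-pel position. Roughly the scalar equivalent (illustrative sketch only):
 *
 *     s += FFABS(pix1[j] - ((pix2[j] + pix2[j + 1] +
 *                            pix2[j + stride] + pix2[j + stride + 1] + 2) >> 2));
 *
 * Two passes of vec_avg() would round differently, so the averaging is done
 * on 16-bit intermediates instead (see the comments inside the loop). */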
static int sad16_xy2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                             ptrdiff_t stride, int h)
{
    int i;
    int __attribute__((aligned(16))) s = 0;
    uint8_t *pix3 = pix2 + stride;
    const vector unsigned char zero =
        (const vector unsigned char) vec_splat_u8(0);
    const vector unsigned short two =
        (const vector unsigned short) vec_splat_u16(2);
    vector unsigned char avgv, t5;
    vector unsigned char pix1v, pix3v, pix3iv;
    vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
    vector unsigned short avghv, avglv;
    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumdiffs;
    vector unsigned char perm1, perm2, pix2v, pix2iv;
    GET_PERM(perm1, perm2, pix2);

    /* Because pix3 = pix2 + stride, the pix3 of one iteration becomes pix2
     * in the next iteration. We can use this to avoid a potentially
     * expensive unaligned read, as well as some splitting and vector
     * addition, each time around the loop.
     * Read unaligned pixels into our vectors. The vectors are as follows:
     * pix2v: pix2[0] - pix2[15]  pix2iv: pix2[1] - pix2[16]
     * Split the pixel vectors into shorts. */
    LOAD_PIX(pix2v, pix2iv, pix2, perm1, perm2);
    vector unsigned short pix2hv  =
        (vector unsigned short) VEC_MERGEH(zero, pix2v);
    vector unsigned short pix2lv  =
        (vector unsigned short) VEC_MERGEL(zero, pix2v);
    vector unsigned short pix2ihv =
        (vector unsigned short) VEC_MERGEH(zero, pix2iv);
    vector unsigned short pix2ilv =
        (vector unsigned short) VEC_MERGEL(zero, pix2iv);

    vector unsigned short t1 = vec_add(pix2hv, pix2ihv);
    vector unsigned short t2 = vec_add(pix2lv, pix2ilv);
    vector unsigned short t3, t4;

    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
         * pix1v: pix1[0] - pix1[15]
         * pix3v: pix3[0] - pix3[15]      pix3iv: pix3[1] - pix3[16] */
        pix1v  = vec_ld(0, pix1);
        LOAD_PIX(pix3v, pix3iv, pix3, perm1, perm2);

        /* Note that AltiVec does have vec_avg, but this works on vector pairs
         * and rounds up. We could do avg(avg(a, b), avg(c, d)), but the
         * rounding would mean that, for example, avg(3, 0, 0, 1) = 2, when
         * it should be 1. Instead, we have to split the pixel vectors into
         * vectors of shorts and do the averaging by hand. */

        /* Split the pixel vectors into shorts. */
        pix3hv  = (vector unsigned short) VEC_MERGEH(zero, pix3v);
        pix3lv  = (vector unsigned short) VEC_MERGEL(zero, pix3v);
        pix3ihv = (vector unsigned short) VEC_MERGEH(zero, pix3iv);
        pix3ilv = (vector unsigned short) VEC_MERGEL(zero, pix3iv);

        /* Do the averaging on them. */
        t3 = vec_add(pix3hv, pix3ihv);
        t4 = vec_add(pix3lv, pix3ilv);

        avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
        avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);

        /* Pack the shorts back into a result. */
        avgv = vec_pack(avghv, avglv);

        /* Calculate a sum of abs differences vector. */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t5, sad);

        pix1 += stride;
        pix3 += stride;
        /* Transfer the calculated values for pix3 into pix2. */
        t1 = t3;
        t2 = t4;
    }
    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

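/* Plain SAD of two 16 x h blocks: s = sum over the block of
 * FFABS(pix1[j] - pix2[j]). */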
static int sad16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         ptrdiff_t stride, int h)
{
    int i;
    int __attribute__((aligned(16))) s;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumdiffs;

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2. */
        vector unsigned char t1 = vec_ld(0, pix1);
        vector unsigned char t2 = VEC_LD(0, pix2);

        /* Calculate a sum of abs differences vector. */
        vector unsigned char t3 = vec_max(t1, t2);
        vector unsigned char t4 = vec_min(t1, t2);
        vector unsigned char t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t5, sad);

        pix1 += stride;
        pix2 += stride;
    }

    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

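/* Plain SAD of two 8 x h blocks. A full 16-byte vector is loaded and the
 * upper eight bytes are masked to zero so they do not contribute to the sum. */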
static int sad8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                        ptrdiff_t stride, int h)
{
    int i;
    int __attribute__((aligned(16))) s;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    const vector unsigned char permclear =
        (vector unsigned char)
        { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 };
    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumdiffs;

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2.
         * Since we're reading 16 pixels, and actually only want 8,
         * mask out the last 8 pixels. The 0s don't change the sum. */
        vector unsigned char pix1l = VEC_LD(0, pix1);
        vector unsigned char pix2l = VEC_LD(0, pix2);
        vector unsigned char t1 = vec_and(pix1l, permclear);
        vector unsigned char t2 = vec_and(pix2l, permclear);

        /* Calculate a sum of abs differences vector. */
        vector unsigned char t3 = vec_max(t1, t2);
        vector unsigned char t4 = vec_min(t1, t2);
        vector unsigned char t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t5, sad);

        pix1 += stride;
        pix2 += stride;
    }

    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

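/* In the two SSE functions below, vec_msum(t5, t5, sum) squares each byte of
 * the absolute-difference vector and accumulates every group of four products
 * into one 32-bit lane of sum, so no explicit widening is needed. */
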
/* Sum of Squared Errors for an 8x8 block, AltiVec-enhanced.
 * It's the sad8_altivec code above w/ squaring added. */
static int sse8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                        ptrdiff_t stride, int h)
{
    int i;
    int __attribute__((aligned(16))) s;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    const vector unsigned char permclear =
        (vector unsigned char)
        { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 };
    vector unsigned int sum = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumsqr;

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2.
         * Since we're reading 16 pixels, and actually only want 8,
         * mask out the last 8 pixels. The 0s don't change the sum. */
        vector unsigned char t1 = vec_and(VEC_LD(0, pix1), permclear);
        vector unsigned char t2 = vec_and(VEC_LD(0, pix2), permclear);

        /* Since we want to use unsigned chars, we can take advantage
         * of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */

        /* Calculate abs differences vector. */
        vector unsigned char t3 = vec_max(t1, t2);
        vector unsigned char t4 = vec_min(t1, t2);
        vector unsigned char t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum. */
        sum = vec_msum(t5, t5, sum);

        pix1 += stride;
        pix2 += stride;
    }

    /* Sum up the four partial sums, and put the result into s. */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}

/* Sum of Squared Errors for a 16x16 block, AltiVec-enhanced.
 * It's the sad16_altivec code above w/ squaring added. */
static int sse16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         ptrdiff_t stride, int h)
{
    int i;
    int __attribute__((aligned(16))) s;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    vector unsigned int sum = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumsqr;

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2. */
        vector unsigned char t1 = vec_ld(0, pix1);
        vector unsigned char t2 = VEC_LD(0, pix2);

        /* Since we want to use unsigned chars, we can take advantage
         * of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */

        /* Calculate abs differences vector. */
        vector unsigned char t3 = vec_max(t1, t2);
        vector unsigned char t4 = vec_min(t1, t2);
        vector unsigned char t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum. */
        sum = vec_msum(t5, t5, sum);

        pix1 += stride;
        pix2 += stride;
    }

    /* Sum up the four partial sums, and put the result into s. */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);

    vec_ste(sumsqr, 0, &s);
    return s;
}

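/* Hadamard-transformed difference of an 8x8 block.
 * Each ONEITERBUTTERFLY() invocation loads one row of src and dst, widens
 * the first eight bytes of each to signed shorts, subtracts them and applies
 * the horizontal 8-point butterfly using vec_perm()/vec_mladd() with the
 * +-1 patterns in vprod1/vprod2/vprod3. The add/sub ladder that follows
 * performs the vertical 8-point butterfly across temp0..temp7, and the sum
 * of absolute values of the transformed coefficients is returned. */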
static int hadamard8_diff8x8_altivec(MpegEncContext *s, uint8_t *dst,
                                     uint8_t *src, ptrdiff_t stride, int h)
{
    int __attribute__((aligned(16))) sum;
    register const vector unsigned char vzero =
        (const vector unsigned char) vec_splat_u8(0);
    register vector signed short temp0, temp1, temp2, temp3, temp4,
                                 temp5, temp6, temp7;
    {
        register const vector signed short vprod1 =
            (const vector signed short) { 1, -1, 1, -1, 1, -1, 1, -1 };
        register const vector signed short vprod2 =
            (const vector signed short) { 1, 1, -1, -1, 1, 1, -1, -1 };
        register const vector signed short vprod3 =
            (const vector signed short) { 1, 1, 1, 1, -1, -1, -1, -1 };
        register const vector unsigned char perm1 =
            (const vector unsigned char)
            { 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
              0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D };
        register const vector unsigned char perm2 =
            (const vector unsigned char)
            { 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
              0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B };
        register const vector unsigned char perm3 =
            (const vector unsigned char)
            { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
              0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };

#define ONEITERBUTTERFLY(i, res)                                            \
    {                                                                       \
        register vector unsigned char srcO = unaligned_load(stride * i, src); \
        register vector unsigned char dstO = unaligned_load(stride * i, dst); \
                                                                            \
        /* Promote the unsigned chars to signed shorts. */                  \
        /* We're in the 8x8 function, so we only care about the first 8. */ \
        register vector signed short srcV =                                 \
            (vector signed short) VEC_MERGEH((vector signed char) vzero,    \
                                             (vector signed char) srcO);    \
        register vector signed short dstV =                                 \
            (vector signed short) VEC_MERGEH((vector signed char) vzero,    \
                                             (vector signed char) dstO);    \
                                                                            \
        /* subtractions inside the first butterfly */                       \
        register vector signed short but0 = vec_sub(srcV, dstV);            \
        register vector signed short op1  = vec_perm(but0, but0, perm1);    \
        register vector signed short but1 = vec_mladd(but0, vprod1, op1);   \
        register vector signed short op2  = vec_perm(but1, but1, perm2);    \
        register vector signed short but2 = vec_mladd(but1, vprod2, op2);   \
        register vector signed short op3  = vec_perm(but2, but2, perm3);    \
        res  = vec_mladd(but2, vprod3, op3);                                \
    }

        ONEITERBUTTERFLY(0, temp0);
        ONEITERBUTTERFLY(1, temp1);
        ONEITERBUTTERFLY(2, temp2);
        ONEITERBUTTERFLY(3, temp3);
        ONEITERBUTTERFLY(4, temp4);
        ONEITERBUTTERFLY(5, temp5);
        ONEITERBUTTERFLY(6, temp6);
        ONEITERBUTTERFLY(7, temp7);
    }
#undef ONEITERBUTTERFLY
    {
        register vector signed int vsum;
        register vector signed short line0  = vec_add(temp0, temp1);
        register vector signed short line1  = vec_sub(temp0, temp1);
        register vector signed short line2  = vec_add(temp2, temp3);
        register vector signed short line3  = vec_sub(temp2, temp3);
        register vector signed short line4  = vec_add(temp4, temp5);
        register vector signed short line5  = vec_sub(temp4, temp5);
        register vector signed short line6  = vec_add(temp6, temp7);
        register vector signed short line7  = vec_sub(temp6, temp7);

        register vector signed short line0B = vec_add(line0, line2);
        register vector signed short line2B = vec_sub(line0, line2);
        register vector signed short line1B = vec_add(line1, line3);
        register vector signed short line3B = vec_sub(line1, line3);
        register vector signed short line4B = vec_add(line4, line6);
        register vector signed short line6B = vec_sub(line4, line6);
        register vector signed short line5B = vec_add(line5, line7);
        register vector signed short line7B = vec_sub(line5, line7);

        register vector signed short line0C = vec_add(line0B, line4B);
        register vector signed short line4C = vec_sub(line0B, line4B);
        register vector signed short line1C = vec_add(line1B, line5B);
        register vector signed short line5C = vec_sub(line1B, line5B);
        register vector signed short line2C = vec_add(line2B, line6B);
        register vector signed short line6C = vec_sub(line2B, line6B);
        register vector signed short line3C = vec_add(line3B, line7B);
        register vector signed short line7C = vec_sub(line3B, line7B);

        vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
        vsum = vec_sum4s(vec_abs(line1C), vsum);
        vsum = vec_sum4s(vec_abs(line2C), vsum);
        vsum = vec_sum4s(vec_abs(line3C), vsum);
        vsum = vec_sum4s(vec_abs(line4C), vsum);
        vsum = vec_sum4s(vec_abs(line5C), vsum);
        vsum = vec_sum4s(vec_abs(line6C), vsum);
        vsum = vec_sum4s(vec_abs(line7C), vsum);
        vsum = vec_sums(vsum, (vector signed int) vzero);
        vsum = vec_splat(vsum, 3);

        vec_ste(vsum, 0, &sum);
    }
    return sum;
}

/*
 * 16x8 works with 16 elements; it can avoid replicating loads, and
 * gives the compiler more room for scheduling. It's only used from
 * inside hadamard8_diff16_altivec.
 *
 * Unfortunately, gcc-3.3 seems to be a bit dumb here: the compiled code has
 * a LOT of spill code, since gcc (unlike xlc) cannot keep everything in
 * registers by itself. The following code therefore includes hand-made
 * register allocation. It's not clean, but on a 7450 the resulting code is
 * much faster (best case falls from 700+ cycles to 550).
 *
 * xlc doesn't add spill code, but it doesn't know how to schedule for the
 * 7450, and its code isn't much faster than gcc-3.3 on the 7450 (but uses
 * 25% fewer instructions...)
 *
 * On the 970, the hand-made RA is still a win (around 690 vs. around 780),
 * but xlc goes to around 660 on the regular C code...
 */
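
/* Same transform as hadamard8_diff8x8_altivec above, but for a block 16
 * pixels wide: the left half (bytes 0-7, via VEC_MERGEH) and the right half
 * (bytes 8-15, via VEC_MERGEL) of each row are processed in parallel, which
 * is what the *S twins of every variable are for. */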
static int hadamard8_diff16x8_altivec(MpegEncContext *s, uint8_t *dst,
                                      uint8_t *src, ptrdiff_t stride, int h)
{
    int __attribute__((aligned(16))) sum;
    register vector signed short
        temp0 __asm__ ("v0"),
        temp1 __asm__ ("v1"),
        temp2 __asm__ ("v2"),
        temp3 __asm__ ("v3"),
        temp4 __asm__ ("v4"),
        temp5 __asm__ ("v5"),
        temp6 __asm__ ("v6"),
        temp7 __asm__ ("v7");
    register vector signed short
        temp0S __asm__ ("v8"),
        temp1S __asm__ ("v9"),
        temp2S __asm__ ("v10"),
        temp3S __asm__ ("v11"),
        temp4S __asm__ ("v12"),
        temp5S __asm__ ("v13"),
        temp6S __asm__ ("v14"),
        temp7S __asm__ ("v15");
    register const vector unsigned char vzero __asm__ ("v31") =
        (const vector unsigned char) vec_splat_u8(0);
    {
        register const vector signed short vprod1 __asm__ ("v16") =
            (const vector signed short) { 1, -1, 1, -1, 1, -1, 1, -1 };

        register const vector signed short vprod2 __asm__ ("v17") =
            (const vector signed short) { 1, 1, -1, -1, 1, 1, -1, -1 };

        register const vector signed short vprod3 __asm__ ("v18") =
            (const vector signed short) { 1, 1, 1, 1, -1, -1, -1, -1 };

        register const vector unsigned char perm1 __asm__ ("v19") =
            (const vector unsigned char)
            { 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
              0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D };

        register const vector unsigned char perm2 __asm__ ("v20") =
            (const vector unsigned char)
            { 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
              0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B };

        register const vector unsigned char perm3 __asm__ ("v21") =
            (const vector unsigned char)
            { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
              0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };

#define ONEITERBUTTERFLY(i, res1, res2)                                     \
    {                                                                       \
        register vector unsigned char srcO __asm__ ("v22") =                \
            unaligned_load(stride * i, src);                                \
        register vector unsigned char dstO __asm__ ("v23") =                \
            unaligned_load(stride * i, dst);                                \
                                                                            \
        /* Promote the unsigned chars to signed shorts. */                  \
        register vector signed short srcV __asm__ ("v24") =                 \
            (vector signed short) VEC_MERGEH((vector signed char) vzero,    \
                                             (vector signed char) srcO);    \
        register vector signed short dstV __asm__ ("v25") =                 \
            (vector signed short) VEC_MERGEH((vector signed char) vzero,    \
                                             (vector signed char) dstO);    \
        register vector signed short srcW __asm__ ("v26") =                 \
            (vector signed short) VEC_MERGEL((vector signed char) vzero,    \
                                             (vector signed char) srcO);    \
        register vector signed short dstW __asm__ ("v27") =                 \
            (vector signed short) VEC_MERGEL((vector signed char) vzero,    \
                                             (vector signed char) dstO);    \
                                                                            \
        /* subtractions inside the first butterfly */                       \
        register vector signed short but0  __asm__ ("v28") =                \
            vec_sub(srcV, dstV);                                            \
        register vector signed short but0S __asm__ ("v29") =                \
            vec_sub(srcW, dstW);                                            \
        register vector signed short op1   __asm__ ("v30") =                \
            vec_perm(but0, but0, perm1);                                    \
        register vector signed short but1  __asm__ ("v22") =                \
            vec_mladd(but0, vprod1, op1);                                   \
        register vector signed short op1S  __asm__ ("v23") =                \
            vec_perm(but0S, but0S, perm1);                                  \
        register vector signed short but1S __asm__ ("v24") =                \
            vec_mladd(but0S, vprod1, op1S);                                 \
        register vector signed short op2   __asm__ ("v25") =                \
            vec_perm(but1, but1, perm2);                                    \
        register vector signed short but2  __asm__ ("v26") =                \
            vec_mladd(but1, vprod2, op2);                                   \
        register vector signed short op2S  __asm__ ("v27") =                \
            vec_perm(but1S, but1S, perm2);                                  \
        register vector signed short but2S __asm__ ("v28") =                \
            vec_mladd(but1S, vprod2, op2S);                                 \
        register vector signed short op3   __asm__ ("v29") =                \
            vec_perm(but2, but2, perm3);                                    \
        register vector signed short op3S  __asm__ ("v30") =                \
            vec_perm(but2S, but2S, perm3);                                  \
        res1 = vec_mladd(but2, vprod3, op3);                                \
        res2 = vec_mladd(but2S, vprod3, op3S);                              \
    }

        ONEITERBUTTERFLY(0, temp0, temp0S);
        ONEITERBUTTERFLY(1, temp1, temp1S);
        ONEITERBUTTERFLY(2, temp2, temp2S);
        ONEITERBUTTERFLY(3, temp3, temp3S);
        ONEITERBUTTERFLY(4, temp4, temp4S);
        ONEITERBUTTERFLY(5, temp5, temp5S);
        ONEITERBUTTERFLY(6, temp6, temp6S);
        ONEITERBUTTERFLY(7, temp7, temp7S);
    }
#undef ONEITERBUTTERFLY
    {
        register vector signed int vsum;

        register vector signed short line0  = vec_add(temp0, temp1);
        register vector signed short line1  = vec_sub(temp0, temp1);
        register vector signed short line2  = vec_add(temp2, temp3);
        register vector signed short line3  = vec_sub(temp2, temp3);
        register vector signed short line4  = vec_add(temp4, temp5);
        register vector signed short line5  = vec_sub(temp4, temp5);
        register vector signed short line6  = vec_add(temp6, temp7);
        register vector signed short line7  = vec_sub(temp6, temp7);

        register vector signed short line0B = vec_add(line0, line2);
        register vector signed short line2B = vec_sub(line0, line2);
        register vector signed short line1B = vec_add(line1, line3);
        register vector signed short line3B = vec_sub(line1, line3);
        register vector signed short line4B = vec_add(line4, line6);
        register vector signed short line6B = vec_sub(line4, line6);
        register vector signed short line5B = vec_add(line5, line7);
        register vector signed short line7B = vec_sub(line5, line7);

        register vector signed short line0C = vec_add(line0B, line4B);
        register vector signed short line4C = vec_sub(line0B, line4B);
        register vector signed short line1C = vec_add(line1B, line5B);
        register vector signed short line5C = vec_sub(line1B, line5B);
        register vector signed short line2C = vec_add(line2B, line6B);
        register vector signed short line6C = vec_sub(line2B, line6B);
        register vector signed short line3C = vec_add(line3B, line7B);
        register vector signed short line7C = vec_sub(line3B, line7B);

        register vector signed short line0S = vec_add(temp0S, temp1S);
        register vector signed short line1S = vec_sub(temp0S, temp1S);
        register vector signed short line2S = vec_add(temp2S, temp3S);
        register vector signed short line3S = vec_sub(temp2S, temp3S);
        register vector signed short line4S = vec_add(temp4S, temp5S);
        register vector signed short line5S = vec_sub(temp4S, temp5S);
        register vector signed short line6S = vec_add(temp6S, temp7S);
        register vector signed short line7S = vec_sub(temp6S, temp7S);

        register vector signed short line0BS = vec_add(line0S, line2S);
        register vector signed short line2BS = vec_sub(line0S, line2S);
        register vector signed short line1BS = vec_add(line1S, line3S);
        register vector signed short line3BS = vec_sub(line1S, line3S);
        register vector signed short line4BS = vec_add(line4S, line6S);
        register vector signed short line6BS = vec_sub(line4S, line6S);
        register vector signed short line5BS = vec_add(line5S, line7S);
        register vector signed short line7BS = vec_sub(line5S, line7S);

        register vector signed short line0CS = vec_add(line0BS, line4BS);
        register vector signed short line4CS = vec_sub(line0BS, line4BS);
        register vector signed short line1CS = vec_add(line1BS, line5BS);
        register vector signed short line5CS = vec_sub(line1BS, line5BS);
        register vector signed short line2CS = vec_add(line2BS, line6BS);
        register vector signed short line6CS = vec_sub(line2BS, line6BS);
        register vector signed short line3CS = vec_add(line3BS, line7BS);
        register vector signed short line7CS = vec_sub(line3BS, line7BS);

        vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
        vsum = vec_sum4s(vec_abs(line1C), vsum);
        vsum = vec_sum4s(vec_abs(line2C), vsum);
        vsum = vec_sum4s(vec_abs(line3C), vsum);
        vsum = vec_sum4s(vec_abs(line4C), vsum);
        vsum = vec_sum4s(vec_abs(line5C), vsum);
        vsum = vec_sum4s(vec_abs(line6C), vsum);
        vsum = vec_sum4s(vec_abs(line7C), vsum);

        vsum = vec_sum4s(vec_abs(line0CS), vsum);
        vsum = vec_sum4s(vec_abs(line1CS), vsum);
        vsum = vec_sum4s(vec_abs(line2CS), vsum);
        vsum = vec_sum4s(vec_abs(line3CS), vsum);
        vsum = vec_sum4s(vec_abs(line4CS), vsum);
        vsum = vec_sum4s(vec_abs(line5CS), vsum);
        vsum = vec_sum4s(vec_abs(line6CS), vsum);
        vsum = vec_sum4s(vec_abs(line7CS), vsum);
        vsum = vec_sums(vsum, (vector signed int) vzero);
        vsum = vec_splat(vsum, 3);

        vec_ste(vsum, 0, &sum);
    }
    return sum;
}

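/* 16-wide wrapper: run the 16x8 kernel on the top half of the block and,
 * when h == 16, once more on the bottom half. */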
static int hadamard8_diff16_altivec(MpegEncContext *s, uint8_t *dst,
                                    uint8_t *src, ptrdiff_t stride, int h)
{
    int score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);

    if (h == 16) {
        dst   += 8 * stride;
        src   += 8 * stride;
        score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
    }
    return score;
}
#endif /* HAVE_ALTIVEC */

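/* Install the AltiVec implementations into the MECmpContext, but only when
 * run-time CPU detection reports AltiVec support. */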
av_cold void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx)
{
#if HAVE_ALTIVEC
    if (!PPC_ALTIVEC(av_get_cpu_flags()))
        return;

    c->pix_abs[0][1] = sad16_x2_altivec;
    c->pix_abs[0][2] = sad16_y2_altivec;
    c->pix_abs[0][3] = sad16_xy2_altivec;
    c->pix_abs[0][0] = sad16_altivec;
    c->pix_abs[1][0] = sad8_altivec;

    c->sad[0] = sad16_altivec;
    c->sad[1] = sad8_altivec;
    c->sse[0] = sse16_altivec;
    c->sse[1] = sse8_altivec;

    c->hadamard8_diff[0] = hadamard8_diff16_altivec;
    c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;
#endif /* HAVE_ALTIVEC */
}