1cabdff1aSopenharmony_ci/*
2cabdff1aSopenharmony_ci * Copyright (c) 2002 Brian Foley
3cabdff1aSopenharmony_ci * Copyright (c) 2002 Dieter Shirley
4cabdff1aSopenharmony_ci * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
5cabdff1aSopenharmony_ci *
6cabdff1aSopenharmony_ci * This file is part of FFmpeg.
7cabdff1aSopenharmony_ci *
8cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or
9cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public
10cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either
11cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version.
12cabdff1aSopenharmony_ci *
13cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful,
14cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of
15cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16cabdff1aSopenharmony_ci * Lesser General Public License for more details.
17cabdff1aSopenharmony_ci *
18cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public
19cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software
20cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21cabdff1aSopenharmony_ci */
22cabdff1aSopenharmony_ci
23cabdff1aSopenharmony_ci#include "config.h"
24cabdff1aSopenharmony_ci
25cabdff1aSopenharmony_ci#include "libavutil/attributes.h"
26cabdff1aSopenharmony_ci#include "libavutil/cpu.h"
27cabdff1aSopenharmony_ci#include "libavutil/ppc/cpu.h"
28cabdff1aSopenharmony_ci#include "libavutil/ppc/util_altivec.h"
29cabdff1aSopenharmony_ci
30cabdff1aSopenharmony_ci#include "libavcodec/hpeldsp.h"
31cabdff1aSopenharmony_ci
32cabdff1aSopenharmony_ci#include "hpeldsp_altivec.h"
33cabdff1aSopenharmony_ci
34cabdff1aSopenharmony_ci#if HAVE_ALTIVEC
35cabdff1aSopenharmony_ci/* next one assumes that ((line_size % 16) == 0) */
36cabdff1aSopenharmony_civoid ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
37cabdff1aSopenharmony_ci{
38cabdff1aSopenharmony_ci    register vector unsigned char pixelsv1;
39cabdff1aSopenharmony_ci    register vector unsigned char pixelsv1B;
40cabdff1aSopenharmony_ci    register vector unsigned char pixelsv1C;
41cabdff1aSopenharmony_ci    register vector unsigned char pixelsv1D;
42cabdff1aSopenharmony_ci
43cabdff1aSopenharmony_ci    int i;
44cabdff1aSopenharmony_ci    register ptrdiff_t line_size_2 = line_size << 1;
45cabdff1aSopenharmony_ci    register ptrdiff_t line_size_3 = line_size + line_size_2;
46cabdff1aSopenharmony_ci    register ptrdiff_t line_size_4 = line_size << 2;
47cabdff1aSopenharmony_ci
48cabdff1aSopenharmony_ci// hand-unrolling the loop by 4 gains about 15%
49cabdff1aSopenharmony_ci// mininum execution time goes from 74 to 60 cycles
50cabdff1aSopenharmony_ci// it's faster than -funroll-loops, but using
51cabdff1aSopenharmony_ci// -funroll-loops w/ this is bad - 74 cycles again.
52cabdff1aSopenharmony_ci// all this is on a 7450, tuning for the 7450
53cabdff1aSopenharmony_ci    for (i = 0; i < h; i += 4) {
54cabdff1aSopenharmony_ci        pixelsv1  = unaligned_load( 0, pixels);
55cabdff1aSopenharmony_ci        pixelsv1B = unaligned_load(line_size, pixels);
56cabdff1aSopenharmony_ci        pixelsv1C = unaligned_load(line_size_2, pixels);
57cabdff1aSopenharmony_ci        pixelsv1D = unaligned_load(line_size_3, pixels);
58cabdff1aSopenharmony_ci        VEC_ST(pixelsv1, 0, (unsigned char*)block);
59cabdff1aSopenharmony_ci        VEC_ST(pixelsv1B, line_size, (unsigned char*)block);
60cabdff1aSopenharmony_ci        VEC_ST(pixelsv1C, line_size_2, (unsigned char*)block);
61cabdff1aSopenharmony_ci        VEC_ST(pixelsv1D, line_size_3, (unsigned char*)block);
62cabdff1aSopenharmony_ci        pixels+=line_size_4;
63cabdff1aSopenharmony_ci        block +=line_size_4;
64cabdff1aSopenharmony_ci    }
65cabdff1aSopenharmony_ci}
66cabdff1aSopenharmony_ci
67cabdff1aSopenharmony_ci/* next one assumes that ((line_size % 16) == 0) */
68cabdff1aSopenharmony_ci#define op_avg(a,b)  a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
69cabdff1aSopenharmony_civoid ff_avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
70cabdff1aSopenharmony_ci{
71cabdff1aSopenharmony_ci    register vector unsigned char pixelsv, blockv;
72cabdff1aSopenharmony_ci
73cabdff1aSopenharmony_ci    int i;
74cabdff1aSopenharmony_ci    for (i = 0; i < h; i++) {
75cabdff1aSopenharmony_ci        blockv = vec_ld(0, block);
76cabdff1aSopenharmony_ci        pixelsv = VEC_LD( 0, pixels);
77cabdff1aSopenharmony_ci        blockv = vec_avg(blockv,pixelsv);
78cabdff1aSopenharmony_ci        vec_st(blockv, 0, (unsigned char*)block);
79cabdff1aSopenharmony_ci        pixels+=line_size;
80cabdff1aSopenharmony_ci        block +=line_size;
81cabdff1aSopenharmony_ci    }
82cabdff1aSopenharmony_ci}
83cabdff1aSopenharmony_ci
84cabdff1aSopenharmony_ci/* next one assumes that ((line_size % 8) == 0) */
85cabdff1aSopenharmony_cistatic void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
86cabdff1aSopenharmony_ci{
87cabdff1aSopenharmony_ci    register vector unsigned char pixelsv, blockv;
88cabdff1aSopenharmony_ci    int i;
89cabdff1aSopenharmony_ci
90cabdff1aSopenharmony_ci   for (i = 0; i < h; i++) {
91cabdff1aSopenharmony_ci       /* block is 8 bytes-aligned, so we're either in the
92cabdff1aSopenharmony_ci          left block (16 bytes-aligned) or in the right block (not) */
93cabdff1aSopenharmony_ci       int rightside = ((unsigned long)block & 0x0000000F);
94cabdff1aSopenharmony_ci
95cabdff1aSopenharmony_ci       blockv = vec_ld(0, block);
96cabdff1aSopenharmony_ci       pixelsv = VEC_LD( 0, pixels);
97cabdff1aSopenharmony_ci
98cabdff1aSopenharmony_ci       if (rightside) {
99cabdff1aSopenharmony_ci           pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
100cabdff1aSopenharmony_ci       } else {
101cabdff1aSopenharmony_ci           pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
102cabdff1aSopenharmony_ci       }
103cabdff1aSopenharmony_ci
104cabdff1aSopenharmony_ci       blockv = vec_avg(blockv, pixelsv);
105cabdff1aSopenharmony_ci
106cabdff1aSopenharmony_ci       vec_st(blockv, 0, block);
107cabdff1aSopenharmony_ci
108cabdff1aSopenharmony_ci       pixels += line_size;
109cabdff1aSopenharmony_ci       block += line_size;
110cabdff1aSopenharmony_ci   }
111cabdff1aSopenharmony_ci}
112cabdff1aSopenharmony_ci
113cabdff1aSopenharmony_ci/* next one assumes that ((line_size % 8) == 0) */
114cabdff1aSopenharmony_cistatic void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
115cabdff1aSopenharmony_ci{
116cabdff1aSopenharmony_ci    register int i;
117cabdff1aSopenharmony_ci    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
118cabdff1aSopenharmony_ci    register vector unsigned char blockv;
119cabdff1aSopenharmony_ci    register vector unsigned short pixelssum1, pixelssum2, temp3;
120cabdff1aSopenharmony_ci    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
121cabdff1aSopenharmony_ci    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
122cabdff1aSopenharmony_ci
123cabdff1aSopenharmony_ci    pixelsv1 = VEC_LD(0, pixels);
124cabdff1aSopenharmony_ci    pixelsv2 = VEC_LD(1, pixels);
125cabdff1aSopenharmony_ci    pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
126cabdff1aSopenharmony_ci    pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
127cabdff1aSopenharmony_ci
128cabdff1aSopenharmony_ci    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
129cabdff1aSopenharmony_ci                         (vector unsigned short)pixelsv2);
130cabdff1aSopenharmony_ci    pixelssum1 = vec_add(pixelssum1, vctwo);
131cabdff1aSopenharmony_ci
132cabdff1aSopenharmony_ci    for (i = 0; i < h ; i++) {
133cabdff1aSopenharmony_ci        int rightside = ((unsigned long)block & 0x0000000F);
134cabdff1aSopenharmony_ci        blockv = vec_ld(0, block);
135cabdff1aSopenharmony_ci
136cabdff1aSopenharmony_ci        pixelsv1 = unaligned_load(line_size, pixels);
137cabdff1aSopenharmony_ci        pixelsv2 = unaligned_load(line_size+1, pixels);
138cabdff1aSopenharmony_ci        pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
139cabdff1aSopenharmony_ci        pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
140cabdff1aSopenharmony_ci        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
141cabdff1aSopenharmony_ci                             (vector unsigned short)pixelsv2);
142cabdff1aSopenharmony_ci        temp3 = vec_add(pixelssum1, pixelssum2);
143cabdff1aSopenharmony_ci        temp3 = vec_sra(temp3, vctwo);
144cabdff1aSopenharmony_ci        pixelssum1 = vec_add(pixelssum2, vctwo);
145cabdff1aSopenharmony_ci        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
146cabdff1aSopenharmony_ci
147cabdff1aSopenharmony_ci        if (rightside) {
148cabdff1aSopenharmony_ci            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
149cabdff1aSopenharmony_ci        } else {
150cabdff1aSopenharmony_ci            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
151cabdff1aSopenharmony_ci        }
152cabdff1aSopenharmony_ci
153cabdff1aSopenharmony_ci        vec_st(blockv, 0, block);
154cabdff1aSopenharmony_ci
155cabdff1aSopenharmony_ci        block += line_size;
156cabdff1aSopenharmony_ci        pixels += line_size;
157cabdff1aSopenharmony_ci    }
158cabdff1aSopenharmony_ci}
159cabdff1aSopenharmony_ci
160cabdff1aSopenharmony_ci/* next one assumes that ((line_size % 8) == 0) */
161cabdff1aSopenharmony_cistatic void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
162cabdff1aSopenharmony_ci{
163cabdff1aSopenharmony_ci    register int i;
164cabdff1aSopenharmony_ci    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
165cabdff1aSopenharmony_ci    register vector unsigned char blockv;
166cabdff1aSopenharmony_ci    register vector unsigned short pixelssum1, pixelssum2, temp3;
167cabdff1aSopenharmony_ci    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
168cabdff1aSopenharmony_ci    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
169cabdff1aSopenharmony_ci    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
170cabdff1aSopenharmony_ci
171cabdff1aSopenharmony_ci    pixelsv1 = VEC_LD(0, pixels);
172cabdff1aSopenharmony_ci    pixelsv2 = VEC_LD(1, pixels);
173cabdff1aSopenharmony_ci    pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
174cabdff1aSopenharmony_ci    pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
175cabdff1aSopenharmony_ci    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
176cabdff1aSopenharmony_ci                         (vector unsigned short)pixelsv2);
177cabdff1aSopenharmony_ci    pixelssum1 = vec_add(pixelssum1, vcone);
178cabdff1aSopenharmony_ci
179cabdff1aSopenharmony_ci    for (i = 0; i < h ; i++) {
180cabdff1aSopenharmony_ci        int rightside = ((unsigned long)block & 0x0000000F);
181cabdff1aSopenharmony_ci        blockv = vec_ld(0, block);
182cabdff1aSopenharmony_ci
183cabdff1aSopenharmony_ci        pixelsv1 = unaligned_load(line_size, pixels);
184cabdff1aSopenharmony_ci        pixelsv2 = unaligned_load(line_size+1, pixels);
185cabdff1aSopenharmony_ci        pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
186cabdff1aSopenharmony_ci        pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
187cabdff1aSopenharmony_ci        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
188cabdff1aSopenharmony_ci                             (vector unsigned short)pixelsv2);
189cabdff1aSopenharmony_ci        temp3 = vec_add(pixelssum1, pixelssum2);
190cabdff1aSopenharmony_ci        temp3 = vec_sra(temp3, vctwo);
191cabdff1aSopenharmony_ci        pixelssum1 = vec_add(pixelssum2, vcone);
192cabdff1aSopenharmony_ci        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
193cabdff1aSopenharmony_ci
194cabdff1aSopenharmony_ci        if (rightside) {
195cabdff1aSopenharmony_ci            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
196cabdff1aSopenharmony_ci        } else {
197cabdff1aSopenharmony_ci            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
198cabdff1aSopenharmony_ci        }
199cabdff1aSopenharmony_ci
200cabdff1aSopenharmony_ci        vec_st(blockv, 0, block);
201cabdff1aSopenharmony_ci
202cabdff1aSopenharmony_ci        block += line_size;
203cabdff1aSopenharmony_ci        pixels += line_size;
204cabdff1aSopenharmony_ci    }
205cabdff1aSopenharmony_ci}
206cabdff1aSopenharmony_ci
207cabdff1aSopenharmony_ci/* next one assumes that ((line_size % 16) == 0) */
208cabdff1aSopenharmony_cistatic void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
209cabdff1aSopenharmony_ci{
210cabdff1aSopenharmony_ci    register int i;
211cabdff1aSopenharmony_ci    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
212cabdff1aSopenharmony_ci    register vector unsigned char blockv;
213cabdff1aSopenharmony_ci    register vector unsigned short temp3, temp4,
214cabdff1aSopenharmony_ci        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
215cabdff1aSopenharmony_ci    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
216cabdff1aSopenharmony_ci    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
217cabdff1aSopenharmony_ci
218cabdff1aSopenharmony_ci    pixelsv1 = VEC_LD(0, pixels);
219cabdff1aSopenharmony_ci    pixelsv2 = VEC_LD(1, pixels);
220cabdff1aSopenharmony_ci    pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
221cabdff1aSopenharmony_ci    pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
222cabdff1aSopenharmony_ci    pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
223cabdff1aSopenharmony_ci    pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
224cabdff1aSopenharmony_ci    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
225cabdff1aSopenharmony_ci                         (vector unsigned short)pixelsv4);
226cabdff1aSopenharmony_ci    pixelssum3 = vec_add(pixelssum3, vctwo);
227cabdff1aSopenharmony_ci    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
228cabdff1aSopenharmony_ci                         (vector unsigned short)pixelsv2);
229cabdff1aSopenharmony_ci    pixelssum1 = vec_add(pixelssum1, vctwo);
230cabdff1aSopenharmony_ci
231cabdff1aSopenharmony_ci    for (i = 0; i < h ; i++) {
232cabdff1aSopenharmony_ci        blockv = vec_ld(0, block);
233cabdff1aSopenharmony_ci
234cabdff1aSopenharmony_ci        pixelsv1 = unaligned_load(line_size, pixels);
235cabdff1aSopenharmony_ci        pixelsv2 = unaligned_load(line_size+1, pixels);
236cabdff1aSopenharmony_ci
237cabdff1aSopenharmony_ci        pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
238cabdff1aSopenharmony_ci        pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
239cabdff1aSopenharmony_ci        pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
240cabdff1aSopenharmony_ci        pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
241cabdff1aSopenharmony_ci        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
242cabdff1aSopenharmony_ci                             (vector unsigned short)pixelsv4);
243cabdff1aSopenharmony_ci        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
244cabdff1aSopenharmony_ci                             (vector unsigned short)pixelsv2);
245cabdff1aSopenharmony_ci        temp4 = vec_add(pixelssum3, pixelssum4);
246cabdff1aSopenharmony_ci        temp4 = vec_sra(temp4, vctwo);
247cabdff1aSopenharmony_ci        temp3 = vec_add(pixelssum1, pixelssum2);
248cabdff1aSopenharmony_ci        temp3 = vec_sra(temp3, vctwo);
249cabdff1aSopenharmony_ci
250cabdff1aSopenharmony_ci        pixelssum3 = vec_add(pixelssum4, vctwo);
251cabdff1aSopenharmony_ci        pixelssum1 = vec_add(pixelssum2, vctwo);
252cabdff1aSopenharmony_ci
253cabdff1aSopenharmony_ci        blockv = vec_packsu(temp3, temp4);
254cabdff1aSopenharmony_ci
255cabdff1aSopenharmony_ci        vec_st(blockv, 0, block);
256cabdff1aSopenharmony_ci
257cabdff1aSopenharmony_ci        block += line_size;
258cabdff1aSopenharmony_ci        pixels += line_size;
259cabdff1aSopenharmony_ci    }
260cabdff1aSopenharmony_ci}
261cabdff1aSopenharmony_ci
262cabdff1aSopenharmony_ci/* next one assumes that ((line_size % 16) == 0) */
263cabdff1aSopenharmony_cistatic void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
264cabdff1aSopenharmony_ci{
265cabdff1aSopenharmony_ci    register int i;
266cabdff1aSopenharmony_ci    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
267cabdff1aSopenharmony_ci    register vector unsigned char blockv;
268cabdff1aSopenharmony_ci    register vector unsigned short temp3, temp4,
269cabdff1aSopenharmony_ci        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
270cabdff1aSopenharmony_ci    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
271cabdff1aSopenharmony_ci    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
272cabdff1aSopenharmony_ci    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
273cabdff1aSopenharmony_ci
274cabdff1aSopenharmony_ci    pixelsv1 = VEC_LD(0, pixels);
275cabdff1aSopenharmony_ci    pixelsv2 = VEC_LD(1, pixels);
276cabdff1aSopenharmony_ci    pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
277cabdff1aSopenharmony_ci    pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
278cabdff1aSopenharmony_ci    pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
279cabdff1aSopenharmony_ci    pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
280cabdff1aSopenharmony_ci    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
281cabdff1aSopenharmony_ci                         (vector unsigned short)pixelsv4);
282cabdff1aSopenharmony_ci    pixelssum3 = vec_add(pixelssum3, vcone);
283cabdff1aSopenharmony_ci    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
284cabdff1aSopenharmony_ci                         (vector unsigned short)pixelsv2);
285cabdff1aSopenharmony_ci    pixelssum1 = vec_add(pixelssum1, vcone);
286cabdff1aSopenharmony_ci
287cabdff1aSopenharmony_ci    for (i = 0; i < h ; i++) {
288cabdff1aSopenharmony_ci        pixelsv1 = unaligned_load(line_size, pixels);
289cabdff1aSopenharmony_ci        pixelsv2 = unaligned_load(line_size+1, pixels);
290cabdff1aSopenharmony_ci
291cabdff1aSopenharmony_ci        pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
292cabdff1aSopenharmony_ci        pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
293cabdff1aSopenharmony_ci        pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
294cabdff1aSopenharmony_ci        pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
295cabdff1aSopenharmony_ci        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
296cabdff1aSopenharmony_ci                             (vector unsigned short)pixelsv4);
297cabdff1aSopenharmony_ci        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
298cabdff1aSopenharmony_ci                             (vector unsigned short)pixelsv2);
299cabdff1aSopenharmony_ci        temp4 = vec_add(pixelssum3, pixelssum4);
300cabdff1aSopenharmony_ci        temp4 = vec_sra(temp4, vctwo);
301cabdff1aSopenharmony_ci        temp3 = vec_add(pixelssum1, pixelssum2);
302cabdff1aSopenharmony_ci        temp3 = vec_sra(temp3, vctwo);
303cabdff1aSopenharmony_ci
304cabdff1aSopenharmony_ci        pixelssum3 = vec_add(pixelssum4, vcone);
305cabdff1aSopenharmony_ci        pixelssum1 = vec_add(pixelssum2, vcone);
306cabdff1aSopenharmony_ci
307cabdff1aSopenharmony_ci        blockv = vec_packsu(temp3, temp4);
308cabdff1aSopenharmony_ci
309cabdff1aSopenharmony_ci        VEC_ST(blockv, 0, block);
310cabdff1aSopenharmony_ci
311cabdff1aSopenharmony_ci        block += line_size;
312cabdff1aSopenharmony_ci        pixels += line_size;
313cabdff1aSopenharmony_ci    }
314cabdff1aSopenharmony_ci}
315cabdff1aSopenharmony_ci
316cabdff1aSopenharmony_ci/* next one assumes that ((line_size % 8) == 0) */
317cabdff1aSopenharmony_cistatic void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
318cabdff1aSopenharmony_ci{
319cabdff1aSopenharmony_ci    register int i;
320cabdff1aSopenharmony_ci    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
321cabdff1aSopenharmony_ci    register vector unsigned char blockv, blocktemp;
322cabdff1aSopenharmony_ci    register vector unsigned short pixelssum1, pixelssum2, temp3;
323cabdff1aSopenharmony_ci
324cabdff1aSopenharmony_ci    register const vector unsigned char vczero = (const vector unsigned char)
325cabdff1aSopenharmony_ci                                        vec_splat_u8(0);
326cabdff1aSopenharmony_ci    register const vector unsigned short vctwo = (const vector unsigned short)
327cabdff1aSopenharmony_ci                                        vec_splat_u16(2);
328cabdff1aSopenharmony_ci
329cabdff1aSopenharmony_ci    pixelsv1 = VEC_LD(0, pixels);
330cabdff1aSopenharmony_ci    pixelsv2 = VEC_LD(1, pixels);
331cabdff1aSopenharmony_ci    pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
332cabdff1aSopenharmony_ci    pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
333cabdff1aSopenharmony_ci    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
334cabdff1aSopenharmony_ci                         (vector unsigned short)pixelsv2);
335cabdff1aSopenharmony_ci    pixelssum1 = vec_add(pixelssum1, vctwo);
336cabdff1aSopenharmony_ci
337cabdff1aSopenharmony_ci    for (i = 0; i < h ; i++) {
338cabdff1aSopenharmony_ci        int rightside = ((unsigned long)block & 0x0000000F);
339cabdff1aSopenharmony_ci        blockv = vec_ld(0, block);
340cabdff1aSopenharmony_ci
341cabdff1aSopenharmony_ci        pixelsv1 = unaligned_load(line_size, pixels);
342cabdff1aSopenharmony_ci        pixelsv2 = unaligned_load(line_size+1, pixels);
343cabdff1aSopenharmony_ci
344cabdff1aSopenharmony_ci        pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
345cabdff1aSopenharmony_ci        pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
346cabdff1aSopenharmony_ci        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
347cabdff1aSopenharmony_ci                             (vector unsigned short)pixelsv2);
348cabdff1aSopenharmony_ci        temp3 = vec_add(pixelssum1, pixelssum2);
349cabdff1aSopenharmony_ci        temp3 = vec_sra(temp3, vctwo);
350cabdff1aSopenharmony_ci        pixelssum1 = vec_add(pixelssum2, vctwo);
351cabdff1aSopenharmony_ci        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
352cabdff1aSopenharmony_ci
353cabdff1aSopenharmony_ci        if (rightside) {
354cabdff1aSopenharmony_ci            blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
355cabdff1aSopenharmony_ci        } else {
356cabdff1aSopenharmony_ci            blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
357cabdff1aSopenharmony_ci        }
358cabdff1aSopenharmony_ci
359cabdff1aSopenharmony_ci        blockv = vec_avg(blocktemp, blockv);
360cabdff1aSopenharmony_ci        vec_st(blockv, 0, block);
361cabdff1aSopenharmony_ci
362cabdff1aSopenharmony_ci        block += line_size;
363cabdff1aSopenharmony_ci        pixels += line_size;
364cabdff1aSopenharmony_ci    }
365cabdff1aSopenharmony_ci}
366cabdff1aSopenharmony_ci#endif /* HAVE_ALTIVEC */
367cabdff1aSopenharmony_ci
368cabdff1aSopenharmony_ciav_cold void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags)
369cabdff1aSopenharmony_ci{
370cabdff1aSopenharmony_ci#if HAVE_ALTIVEC
371cabdff1aSopenharmony_ci    if (!PPC_ALTIVEC(av_get_cpu_flags()))
372cabdff1aSopenharmony_ci        return;
373cabdff1aSopenharmony_ci
374cabdff1aSopenharmony_ci    c->avg_pixels_tab[0][0]        = ff_avg_pixels16_altivec;
375cabdff1aSopenharmony_ci    c->avg_pixels_tab[1][0]        = avg_pixels8_altivec;
376cabdff1aSopenharmony_ci    c->avg_pixels_tab[1][3]        = avg_pixels8_xy2_altivec;
377cabdff1aSopenharmony_ci
378cabdff1aSopenharmony_ci    c->put_pixels_tab[0][0]        = ff_put_pixels16_altivec;
379cabdff1aSopenharmony_ci    c->put_pixels_tab[1][3]        = put_pixels8_xy2_altivec;
380cabdff1aSopenharmony_ci    c->put_pixels_tab[0][3]        = put_pixels16_xy2_altivec;
381cabdff1aSopenharmony_ci
382cabdff1aSopenharmony_ci    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_altivec;
383cabdff1aSopenharmony_ci    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
384cabdff1aSopenharmony_ci    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
385cabdff1aSopenharmony_ci#endif /* HAVE_ALTIVEC */
386cabdff1aSopenharmony_ci}
387