1cabdff1aSopenharmony_ci/*
2cabdff1aSopenharmony_ci * Copyright (c) 2002 Brian Foley
3cabdff1aSopenharmony_ci * Copyright (c) 2002 Dieter Shirley
4cabdff1aSopenharmony_ci * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
5cabdff1aSopenharmony_ci *
6cabdff1aSopenharmony_ci * This file is part of FFmpeg.
7cabdff1aSopenharmony_ci *
8cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or
9cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public
10cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either
11cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version.
12cabdff1aSopenharmony_ci *
13cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful,
14cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of
15cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16cabdff1aSopenharmony_ci * Lesser General Public License for more details.
17cabdff1aSopenharmony_ci *
18cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public
19cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software
20cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21cabdff1aSopenharmony_ci */
22cabdff1aSopenharmony_ci
23cabdff1aSopenharmony_ci#include "config.h"
24cabdff1aSopenharmony_ci
25cabdff1aSopenharmony_ci#include "libavutil/attributes.h"
26cabdff1aSopenharmony_ci#include "libavutil/cpu.h"
27cabdff1aSopenharmony_ci#include "libavutil/ppc/cpu.h"
28cabdff1aSopenharmony_ci#include "libavutil/ppc/util_altivec.h"
29cabdff1aSopenharmony_ci
30cabdff1aSopenharmony_ci#include "libavcodec/avcodec.h"
31cabdff1aSopenharmony_ci#include "libavcodec/pixblockdsp.h"
32cabdff1aSopenharmony_ci
33cabdff1aSopenharmony_ci#if HAVE_ALTIVEC
34cabdff1aSopenharmony_ci
35cabdff1aSopenharmony_ci#if HAVE_VSX
36cabdff1aSopenharmony_cistatic void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels,
37cabdff1aSopenharmony_ci                               ptrdiff_t stride)
38cabdff1aSopenharmony_ci{
39cabdff1aSopenharmony_ci    int i;
40cabdff1aSopenharmony_ci    vector unsigned char perm =
41cabdff1aSopenharmony_ci        (vector unsigned char) {0x00,0x10, 0x01,0x11,0x02,0x12,0x03,0x13,\
42cabdff1aSopenharmony_ci            0x04,0x14,0x05,0x15,0x06,0x16,0x07,0x17};
43cabdff1aSopenharmony_ci    const vector unsigned char zero =
44cabdff1aSopenharmony_ci        (const vector unsigned char) vec_splat_u8(0);
45cabdff1aSopenharmony_ci
46cabdff1aSopenharmony_ci    for (i = 0; i < 8; i++) {
47cabdff1aSopenharmony_ci        /* Read potentially unaligned pixels.
48cabdff1aSopenharmony_ci         * We're reading 16 pixels, and actually only want 8,
49cabdff1aSopenharmony_ci         * but we simply ignore the extras. */
50cabdff1aSopenharmony_ci        vector unsigned char bytes = vec_vsx_ld(0, pixels);
51cabdff1aSopenharmony_ci
52cabdff1aSopenharmony_ci        // Convert the bytes into shorts.
53cabdff1aSopenharmony_ci        //vector signed short shorts = (vector signed short) vec_perm(zero, bytes, perm);
54cabdff1aSopenharmony_ci        vector signed short shorts = (vector signed short) vec_perm(bytes, zero, perm);
55cabdff1aSopenharmony_ci
56cabdff1aSopenharmony_ci        // Save the data to the block, we assume the block is 16-byte aligned.
57cabdff1aSopenharmony_ci        vec_vsx_st(shorts, i * 16, (vector signed short *) block);
58cabdff1aSopenharmony_ci
59cabdff1aSopenharmony_ci        pixels += stride;
60cabdff1aSopenharmony_ci    }
61cabdff1aSopenharmony_ci}
62cabdff1aSopenharmony_ci#else
63cabdff1aSopenharmony_cistatic void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels,
64cabdff1aSopenharmony_ci                               ptrdiff_t stride)
65cabdff1aSopenharmony_ci{
66cabdff1aSopenharmony_ci    int i;
67cabdff1aSopenharmony_ci    const vec_u8 zero = (const vec_u8)vec_splat_u8(0);
68cabdff1aSopenharmony_ci
69cabdff1aSopenharmony_ci    for (i = 0; i < 8; i++) {
70cabdff1aSopenharmony_ci        vec_u8 perm = vec_lvsl(0, pixels);
71cabdff1aSopenharmony_ci        /* Read potentially unaligned pixels.
72cabdff1aSopenharmony_ci         * We're reading 16 pixels, and actually only want 8,
73cabdff1aSopenharmony_ci         * but we simply ignore the extras. */
74cabdff1aSopenharmony_ci        vec_u8 pixl = vec_ld(0, pixels);
75cabdff1aSopenharmony_ci        vec_u8 pixr = vec_ld(7, pixels);
76cabdff1aSopenharmony_ci        vec_u8 bytes = vec_perm(pixl, pixr, perm);
77cabdff1aSopenharmony_ci
78cabdff1aSopenharmony_ci        // Convert the bytes into shorts.
79cabdff1aSopenharmony_ci        vec_s16 shorts = (vec_s16)vec_mergeh(zero, bytes);
80cabdff1aSopenharmony_ci
81cabdff1aSopenharmony_ci        // Save the data to the block, we assume the block is 16-byte aligned.
82cabdff1aSopenharmony_ci        vec_st(shorts, i * 16, (vec_s16 *)block);
83cabdff1aSopenharmony_ci
84cabdff1aSopenharmony_ci        pixels += stride;
85cabdff1aSopenharmony_ci    }
86cabdff1aSopenharmony_ci}
87cabdff1aSopenharmony_ci
88cabdff1aSopenharmony_ci#endif /* HAVE_VSX */
89cabdff1aSopenharmony_ci
90cabdff1aSopenharmony_ci#if HAVE_VSX
91cabdff1aSopenharmony_cistatic void diff_pixels_altivec(int16_t *restrict block, const uint8_t *s1,
92cabdff1aSopenharmony_ci                                const uint8_t *s2, ptrdiff_t stride)
93cabdff1aSopenharmony_ci{
94cabdff1aSopenharmony_ci  int i;
95cabdff1aSopenharmony_ci  const vector unsigned char zero =
96cabdff1aSopenharmony_ci    (const vector unsigned char) vec_splat_u8(0);
97cabdff1aSopenharmony_ci  vector signed short shorts1, shorts2;
98cabdff1aSopenharmony_ci
99cabdff1aSopenharmony_ci  for (i = 0; i < 4; i++) {
100cabdff1aSopenharmony_ci    /* Read potentially unaligned pixels.
101cabdff1aSopenharmony_ci     * We're reading 16 pixels, and actually only want 8,
102cabdff1aSopenharmony_ci     * but we simply ignore the extras. */
103cabdff1aSopenharmony_ci    vector unsigned char bytes = vec_vsx_ld(0,  s1);
104cabdff1aSopenharmony_ci
105cabdff1aSopenharmony_ci    // Convert the bytes into shorts.
106cabdff1aSopenharmony_ci    shorts1 = (vector signed short) vec_mergeh(bytes, zero);
107cabdff1aSopenharmony_ci
108cabdff1aSopenharmony_ci    // Do the same for the second block of pixels.
109cabdff1aSopenharmony_ci    bytes =vec_vsx_ld(0,  s2);
110cabdff1aSopenharmony_ci
111cabdff1aSopenharmony_ci    // Convert the bytes into shorts.
112cabdff1aSopenharmony_ci    shorts2 = (vector signed short) vec_mergeh(bytes, zero);
113cabdff1aSopenharmony_ci
114cabdff1aSopenharmony_ci    // Do the subtraction.
115cabdff1aSopenharmony_ci    shorts1 = vec_sub(shorts1, shorts2);
116cabdff1aSopenharmony_ci
117cabdff1aSopenharmony_ci    // Save the data to the block, we assume the block is 16-byte aligned.
118cabdff1aSopenharmony_ci    vec_vsx_st(shorts1, 0, (vector signed short *) block);
119cabdff1aSopenharmony_ci
120cabdff1aSopenharmony_ci    s1    += stride;
121cabdff1aSopenharmony_ci    s2    += stride;
122cabdff1aSopenharmony_ci    block += 8;
123cabdff1aSopenharmony_ci
124cabdff1aSopenharmony_ci    /* The code below is a copy of the code above...
125cabdff1aSopenharmony_ci     * This is a manual unroll. */
126cabdff1aSopenharmony_ci
127cabdff1aSopenharmony_ci    /* Read potentially unaligned pixels.
128cabdff1aSopenharmony_ci     * We're reading 16 pixels, and actually only want 8,
129cabdff1aSopenharmony_ci     * but we simply ignore the extras. */
130cabdff1aSopenharmony_ci    bytes = vec_vsx_ld(0,  s1);
131cabdff1aSopenharmony_ci
132cabdff1aSopenharmony_ci    // Convert the bytes into shorts.
133cabdff1aSopenharmony_ci    shorts1 = (vector signed short) vec_mergeh(bytes, zero);
134cabdff1aSopenharmony_ci
135cabdff1aSopenharmony_ci    // Do the same for the second block of pixels.
136cabdff1aSopenharmony_ci    bytes = vec_vsx_ld(0,  s2);
137cabdff1aSopenharmony_ci
138cabdff1aSopenharmony_ci    // Convert the bytes into shorts.
139cabdff1aSopenharmony_ci    shorts2 = (vector signed short) vec_mergeh(bytes, zero);
140cabdff1aSopenharmony_ci
141cabdff1aSopenharmony_ci    // Do the subtraction.
142cabdff1aSopenharmony_ci    shorts1 = vec_sub(shorts1, shorts2);
143cabdff1aSopenharmony_ci
144cabdff1aSopenharmony_ci    // Save the data to the block, we assume the block is 16-byte aligned.
145cabdff1aSopenharmony_ci    vec_vsx_st(shorts1, 0, (vector signed short *) block);
146cabdff1aSopenharmony_ci
147cabdff1aSopenharmony_ci    s1    += stride;
148cabdff1aSopenharmony_ci    s2    += stride;
149cabdff1aSopenharmony_ci    block += 8;
150cabdff1aSopenharmony_ci  }
151cabdff1aSopenharmony_ci}
152cabdff1aSopenharmony_ci#else
153cabdff1aSopenharmony_cistatic void diff_pixels_altivec(int16_t *restrict block, const uint8_t *s1,
154cabdff1aSopenharmony_ci                                const uint8_t *s2, ptrdiff_t stride)
155cabdff1aSopenharmony_ci{
156cabdff1aSopenharmony_ci    int i;
157cabdff1aSopenharmony_ci    vec_u8 perm;
158cabdff1aSopenharmony_ci    const vec_u8 zero = (const vec_u8)vec_splat_u8(0);
159cabdff1aSopenharmony_ci    vec_s16 shorts1, shorts2;
160cabdff1aSopenharmony_ci
161cabdff1aSopenharmony_ci    for (i = 0; i < 4; i++) {
162cabdff1aSopenharmony_ci        /* Read potentially unaligned pixels.
163cabdff1aSopenharmony_ci         * We're reading 16 pixels, and actually only want 8,
164cabdff1aSopenharmony_ci         * but we simply ignore the extras. */
165cabdff1aSopenharmony_ci        perm = vec_lvsl(0, s1);
166cabdff1aSopenharmony_ci        vec_u8 pixl  = vec_ld(0,  s1);
167cabdff1aSopenharmony_ci        vec_u8 pixr  = vec_ld(15, s1);
168cabdff1aSopenharmony_ci        vec_u8 bytes = vec_perm(pixl, pixr, perm);
169cabdff1aSopenharmony_ci
170cabdff1aSopenharmony_ci        // Convert the bytes into shorts.
171cabdff1aSopenharmony_ci        shorts1 = (vec_s16)vec_mergeh(zero, bytes);
172cabdff1aSopenharmony_ci
173cabdff1aSopenharmony_ci        // Do the same for the second block of pixels.
174cabdff1aSopenharmony_ci        perm = vec_lvsl(0, s2);
175cabdff1aSopenharmony_ci        pixl  = vec_ld(0,  s2);
176cabdff1aSopenharmony_ci        pixr  = vec_ld(15, s2);
177cabdff1aSopenharmony_ci        bytes = vec_perm(pixl, pixr, perm);
178cabdff1aSopenharmony_ci
179cabdff1aSopenharmony_ci        // Convert the bytes into shorts.
180cabdff1aSopenharmony_ci        shorts2 = (vec_s16)vec_mergeh(zero, bytes);
181cabdff1aSopenharmony_ci
182cabdff1aSopenharmony_ci        // Do the subtraction.
183cabdff1aSopenharmony_ci        shorts1 = vec_sub(shorts1, shorts2);
184cabdff1aSopenharmony_ci
185cabdff1aSopenharmony_ci        // Save the data to the block, we assume the block is 16-byte aligned.
186cabdff1aSopenharmony_ci        vec_st(shorts1, 0, (vec_s16 *)block);
187cabdff1aSopenharmony_ci
188cabdff1aSopenharmony_ci        s1    += stride;
189cabdff1aSopenharmony_ci        s2    += stride;
190cabdff1aSopenharmony_ci        block += 8;
191cabdff1aSopenharmony_ci
192cabdff1aSopenharmony_ci        /* The code below is a copy of the code above...
193cabdff1aSopenharmony_ci         * This is a manual unroll. */
194cabdff1aSopenharmony_ci
195cabdff1aSopenharmony_ci        /* Read potentially unaligned pixels.
196cabdff1aSopenharmony_ci         * We're reading 16 pixels, and actually only want 8,
197cabdff1aSopenharmony_ci         * but we simply ignore the extras. */
198cabdff1aSopenharmony_ci        perm = vec_lvsl(0, s1);
199cabdff1aSopenharmony_ci        pixl  = vec_ld(0,  s1);
200cabdff1aSopenharmony_ci        pixr  = vec_ld(15, s1);
201cabdff1aSopenharmony_ci        bytes = vec_perm(pixl, pixr, perm);
202cabdff1aSopenharmony_ci
203cabdff1aSopenharmony_ci        // Convert the bytes into shorts.
204cabdff1aSopenharmony_ci        shorts1 = (vec_s16)vec_mergeh(zero, bytes);
205cabdff1aSopenharmony_ci
206cabdff1aSopenharmony_ci        // Do the same for the second block of pixels.
207cabdff1aSopenharmony_ci        perm = vec_lvsl(0, s2);
208cabdff1aSopenharmony_ci        pixl  = vec_ld(0,  s2);
209cabdff1aSopenharmony_ci        pixr  = vec_ld(15, s2);
210cabdff1aSopenharmony_ci        bytes = vec_perm(pixl, pixr, perm);
211cabdff1aSopenharmony_ci
212cabdff1aSopenharmony_ci        // Convert the bytes into shorts.
213cabdff1aSopenharmony_ci        shorts2 = (vec_s16)vec_mergeh(zero, bytes);
214cabdff1aSopenharmony_ci
215cabdff1aSopenharmony_ci        // Do the subtraction.
216cabdff1aSopenharmony_ci        shorts1 = vec_sub(shorts1, shorts2);
217cabdff1aSopenharmony_ci
218cabdff1aSopenharmony_ci        // Save the data to the block, we assume the block is 16-byte aligned.
219cabdff1aSopenharmony_ci        vec_st(shorts1, 0, (vec_s16 *)block);
220cabdff1aSopenharmony_ci
221cabdff1aSopenharmony_ci        s1    += stride;
222cabdff1aSopenharmony_ci        s2    += stride;
223cabdff1aSopenharmony_ci        block += 8;
224cabdff1aSopenharmony_ci    }
225cabdff1aSopenharmony_ci}
226cabdff1aSopenharmony_ci
227cabdff1aSopenharmony_ci#endif /* HAVE_VSX */
228cabdff1aSopenharmony_ci
229cabdff1aSopenharmony_ci#endif /* HAVE_ALTIVEC */
230cabdff1aSopenharmony_ci
231cabdff1aSopenharmony_ci#if HAVE_VSX
232cabdff1aSopenharmony_cistatic void get_pixels_vsx(int16_t *restrict block, const uint8_t *pixels,
233cabdff1aSopenharmony_ci                           ptrdiff_t stride)
234cabdff1aSopenharmony_ci{
235cabdff1aSopenharmony_ci    int i;
236cabdff1aSopenharmony_ci    for (i = 0; i < 8; i++) {
237cabdff1aSopenharmony_ci        vec_s16 shorts = vsx_ld_u8_s16(0, pixels);
238cabdff1aSopenharmony_ci
239cabdff1aSopenharmony_ci        vec_vsx_st(shorts, i * 16, block);
240cabdff1aSopenharmony_ci
241cabdff1aSopenharmony_ci        pixels += stride;
242cabdff1aSopenharmony_ci    }
243cabdff1aSopenharmony_ci}
244cabdff1aSopenharmony_ci
245cabdff1aSopenharmony_cistatic void diff_pixels_vsx(int16_t *restrict block, const uint8_t *s1,
246cabdff1aSopenharmony_ci                            const uint8_t *s2, ptrdiff_t stride)
247cabdff1aSopenharmony_ci{
248cabdff1aSopenharmony_ci    int i;
249cabdff1aSopenharmony_ci    vec_s16 shorts1, shorts2;
250cabdff1aSopenharmony_ci    for (i = 0; i < 8; i++) {
251cabdff1aSopenharmony_ci        shorts1 = vsx_ld_u8_s16(0, s1);
252cabdff1aSopenharmony_ci        shorts2 = vsx_ld_u8_s16(0, s2);
253cabdff1aSopenharmony_ci
254cabdff1aSopenharmony_ci        shorts1 = vec_sub(shorts1, shorts2);
255cabdff1aSopenharmony_ci
256cabdff1aSopenharmony_ci        vec_vsx_st(shorts1, 0, block);
257cabdff1aSopenharmony_ci
258cabdff1aSopenharmony_ci        s1    += stride;
259cabdff1aSopenharmony_ci        s2    += stride;
260cabdff1aSopenharmony_ci        block += 8;
261cabdff1aSopenharmony_ci    }
262cabdff1aSopenharmony_ci}
263cabdff1aSopenharmony_ci#endif /* HAVE_VSX */
264cabdff1aSopenharmony_ci
265cabdff1aSopenharmony_ciav_cold void ff_pixblockdsp_init_ppc(PixblockDSPContext *c,
266cabdff1aSopenharmony_ci                                     AVCodecContext *avctx,
267cabdff1aSopenharmony_ci                                     unsigned high_bit_depth)
268cabdff1aSopenharmony_ci{
269cabdff1aSopenharmony_ci#if HAVE_ALTIVEC
270cabdff1aSopenharmony_ci    if (!PPC_ALTIVEC(av_get_cpu_flags()))
271cabdff1aSopenharmony_ci        return;
272cabdff1aSopenharmony_ci
273cabdff1aSopenharmony_ci    c->diff_pixels = diff_pixels_altivec;
274cabdff1aSopenharmony_ci
275cabdff1aSopenharmony_ci    if (!high_bit_depth) {
276cabdff1aSopenharmony_ci        c->get_pixels = get_pixels_altivec;
277cabdff1aSopenharmony_ci    }
278cabdff1aSopenharmony_ci#endif /* HAVE_ALTIVEC */
279cabdff1aSopenharmony_ci
280cabdff1aSopenharmony_ci#if HAVE_VSX
281cabdff1aSopenharmony_ci    if (!PPC_VSX(av_get_cpu_flags()))
282cabdff1aSopenharmony_ci        return;
283cabdff1aSopenharmony_ci
284cabdff1aSopenharmony_ci    c->diff_pixels = diff_pixels_vsx;
285cabdff1aSopenharmony_ci
286cabdff1aSopenharmony_ci    if (!high_bit_depth)
287cabdff1aSopenharmony_ci        c->get_pixels = get_pixels_vsx;
288cabdff1aSopenharmony_ci#endif /* HAVE_VSX */
289cabdff1aSopenharmony_ci}
290