/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
#include "libavutil/ppc/util_altivec.h"

#include "libavcodec/avcodec.h"
#include "libavcodec/pixblockdsp.h"

#if HAVE_ALTIVEC

#if HAVE_VSX
static void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels,
                               ptrdiff_t stride)
{
    int i;
    /* Interleaving pattern: byte i of the first operand followed by byte i
     * of the second (zero) operand, which zero-extends each pixel into a
     * little-endian int16. */
    const vector unsigned char perm =
        (vector unsigned char) {0x00, 0x10, 0x01, 0x11, 0x02, 0x12, 0x03, 0x13,
                                0x04, 0x14, 0x05, 0x15, 0x06, 0x16, 0x07, 0x17};
    const vector unsigned char zero =
        (const vector unsigned char) vec_splat_u8(0);

    for (i = 0; i < 8; i++) {
        /* Read potentially unaligned pixels.
         * We're reading 16 pixels but only want 8;
         * we simply ignore the extras. */
        vector unsigned char bytes = vec_vsx_ld(0, pixels);

        // Convert the bytes into shorts.
        vector signed short shorts =
            (vector signed short) vec_perm(bytes, zero, perm);

        // Save the data to the block; we assume the block is 16-byte aligned.
        vec_vsx_st(shorts, i * 16, (vector signed short *) block);

        pixels += stride;
    }
}
#else
static void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels,
                               ptrdiff_t stride)
{
    int i;
    const vec_u8 zero = (const vec_u8) vec_splat_u8(0);

    for (i = 0; i < 8; i++) {
        vec_u8 perm = vec_lvsl(0, pixels);
        /* Read potentially unaligned pixels.
         * We're reading 16 pixels but only want 8;
         * we simply ignore the extras. */
        vec_u8 pixl  = vec_ld(0, pixels);
        vec_u8 pixr  = vec_ld(7, pixels);
        vec_u8 bytes = vec_perm(pixl, pixr, perm);

        // Convert the bytes into shorts by merging them with zero.
        vec_s16 shorts = (vec_s16) vec_mergeh(zero, bytes);

        // Save the data to the block; we assume the block is 16-byte aligned.
        vec_st(shorts, i * 16, (vec_s16 *) block);

        pixels += stride;
    }
}
#endif /* HAVE_VSX */
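/* For reference, a scalar model of the 8x8 widening copy that both
 * get_pixels_altivec variants above implement. The helper below is purely
 * illustrative (its name and the av_unused guard are ours, not FFmpeg API);
 * the vector versions compute the same result a full row at a time. */
static av_unused void get_pixels_scalar_ref(int16_t *restrict block,
                                            const uint8_t *pixels,
                                            ptrdiff_t stride)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++)
            block[i * 8 + j] = pixels[j]; /* zero-extend u8 to s16 */
        pixels += stride;
    }
}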
#if HAVE_VSX
static void diff_pixels_altivec(int16_t *restrict block, const uint8_t *s1,
                                const uint8_t *s2, ptrdiff_t stride)
{
    int i;
    const vector unsigned char zero =
        (const vector unsigned char) vec_splat_u8(0);
    vector signed short shorts1, shorts2;

    for (i = 0; i < 4; i++) {
        /* Read potentially unaligned pixels.
         * We're reading 16 pixels but only want 8;
         * we simply ignore the extras. */
        vector unsigned char bytes = vec_vsx_ld(0, s1);

        /* Convert the bytes into shorts. The operand order is the mirror of
         * the plain AltiVec path below, because vec_mergeh follows the
         * little-endian element order here. */
        shorts1 = (vector signed short) vec_mergeh(bytes, zero);

        // Do the same for the second block of pixels.
        bytes = vec_vsx_ld(0, s2);

        // Convert the bytes into shorts.
        shorts2 = (vector signed short) vec_mergeh(bytes, zero);

        // Do the subtraction.
        shorts1 = vec_sub(shorts1, shorts2);

        // Save the data to the block; we assume the block is 16-byte aligned.
        vec_vsx_st(shorts1, 0, (vector signed short *) block);

        s1    += stride;
        s2    += stride;
        block += 8;

        /* The code below is a copy of the code above...
         * This is a manual unroll. */

        // Read the next row of potentially unaligned pixels.
        bytes = vec_vsx_ld(0, s1);

        // Convert the bytes into shorts.
        shorts1 = (vector signed short) vec_mergeh(bytes, zero);

        // Do the same for the second block of pixels.
        bytes = vec_vsx_ld(0, s2);

        // Convert the bytes into shorts.
        shorts2 = (vector signed short) vec_mergeh(bytes, zero);

        // Do the subtraction.
        shorts1 = vec_sub(shorts1, shorts2);

        // Save the data to the block; we assume the block is 16-byte aligned.
        vec_vsx_st(shorts1, 0, (vector signed short *) block);

        s1    += stride;
        s2    += stride;
        block += 8;
    }
}
#else
static void diff_pixels_altivec(int16_t *restrict block, const uint8_t *s1,
                                const uint8_t *s2, ptrdiff_t stride)
{
    int i;
    vec_u8 perm;
    const vec_u8 zero = (const vec_u8) vec_splat_u8(0);
    vec_s16 shorts1, shorts2;

    for (i = 0; i < 4; i++) {
        /* Read potentially unaligned pixels.
         * We're reading 16 pixels but only want 8;
         * we simply ignore the extras. */
        perm = vec_lvsl(0, s1);
        vec_u8 pixl  = vec_ld(0, s1);
        vec_u8 pixr  = vec_ld(15, s1);
        vec_u8 bytes = vec_perm(pixl, pixr, perm);

        // Convert the bytes into shorts.
        shorts1 = (vec_s16) vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels.
        perm  = vec_lvsl(0, s2);
        pixl  = vec_ld(0, s2);
        pixr  = vec_ld(15, s2);
        bytes = vec_perm(pixl, pixr, perm);

        // Convert the bytes into shorts.
        shorts2 = (vec_s16) vec_mergeh(zero, bytes);

        // Do the subtraction.
        shorts1 = vec_sub(shorts1, shorts2);

        // Save the data to the block; we assume the block is 16-byte aligned.
        vec_st(shorts1, 0, (vec_s16 *) block);

        s1    += stride;
        s2    += stride;
        block += 8;

        /* The code below is a copy of the code above...
         * This is a manual unroll. */

        // Read the next row of potentially unaligned pixels.
        perm  = vec_lvsl(0, s1);
        pixl  = vec_ld(0, s1);
        pixr  = vec_ld(15, s1);
        bytes = vec_perm(pixl, pixr, perm);

        // Convert the bytes into shorts.
        shorts1 = (vec_s16) vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels.
        perm  = vec_lvsl(0, s2);
        pixl  = vec_ld(0, s2);
        pixr  = vec_ld(15, s2);
        bytes = vec_perm(pixl, pixr, perm);

        // Convert the bytes into shorts.
        shorts2 = (vec_s16) vec_mergeh(zero, bytes);

        // Do the subtraction.
        shorts1 = vec_sub(shorts1, shorts2);

        // Save the data to the block; we assume the block is 16-byte aligned.
        vec_st(shorts1, 0, (vec_s16 *) block);

        s1    += stride;
        s2    += stride;
        block += 8;
    }
}
#endif /* HAVE_VSX */

#endif /* HAVE_ALTIVEC */
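/* Likewise, a scalar model of what both diff_pixels_altivec variants compute:
 * an 8x8 byte-wise difference widened to 16 bits. Illustrative only; the
 * helper name and av_unused guard are ours, not part of FFmpeg. */
static av_unused void diff_pixels_scalar_ref(int16_t *restrict block,
                                             const uint8_t *s1,
                                             const uint8_t *s2,
                                             ptrdiff_t stride)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++)
            block[i * 8 + j] = s1[j] - s2[j]; /* range [-255, 255] fits s16 */
        s1 += stride;
        s2 += stride;
    }
}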
#if HAVE_VSX
static void get_pixels_vsx(int16_t *restrict block, const uint8_t *pixels,
                           ptrdiff_t stride)
{
    int i;
    for (i = 0; i < 8; i++) {
        vec_s16 shorts = vsx_ld_u8_s16(0, pixels);

        vec_vsx_st(shorts, i * 16, block);

        pixels += stride;
    }
}

static void diff_pixels_vsx(int16_t *restrict block, const uint8_t *s1,
                            const uint8_t *s2, ptrdiff_t stride)
{
    int i;
    vec_s16 shorts1, shorts2;
    for (i = 0; i < 8; i++) {
        shorts1 = vsx_ld_u8_s16(0, s1);
        shorts2 = vsx_ld_u8_s16(0, s2);

        shorts1 = vec_sub(shorts1, shorts2);

        vec_vsx_st(shorts1, 0, block);

        s1    += stride;
        s2    += stride;
        block += 8;
    }
}
#endif /* HAVE_VSX */

av_cold void ff_pixblockdsp_init_ppc(PixblockDSPContext *c,
                                     AVCodecContext *avctx,
                                     unsigned high_bit_depth)
{
#if HAVE_ALTIVEC
    if (!PPC_ALTIVEC(av_get_cpu_flags()))
        return;

    c->diff_pixels = diff_pixels_altivec;

    if (!high_bit_depth) {
        c->get_pixels = get_pixels_altivec;
    }
#endif /* HAVE_ALTIVEC */

#if HAVE_VSX
    if (!PPC_VSX(av_get_cpu_flags()))
        return;

    c->diff_pixels = diff_pixels_vsx;

    if (!high_bit_depth)
        c->get_pixels = get_pixels_vsx;
#endif /* HAVE_VSX */
}
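/* Illustrative only: a minimal sketch of how the dispatch above could be
 * exercised. In the real tree, ff_pixblockdsp_init_ppc() is called from the
 * generic pixblockdsp init code after the C defaults are installed, and VSX
 * implies AltiVec, so on a VSX CPU the VSX pointers override the AltiVec
 * ones assigned first. The helper below is ours, not FFmpeg API, and assumes
 * *c was zero-initialized by the caller. */
#if HAVE_ALTIVEC
static av_unused void check_get_pixels(PixblockDSPContext *c,
                                       AVCodecContext *avctx)
{
    /* The vector member forces the 16-byte alignment the kernels assume
     * for the destination block. */
    union { vec_s16 v[8]; int16_t s[64]; } blk;
    uint8_t src[64];
    int i;

    for (i = 0; i < 64; i++)
        src[i] = (uint8_t) i;

    ff_pixblockdsp_init_ppc(c, avctx, 0);
    if (c->get_pixels)
        c->get_pixels(blk.s, src, 8); /* one 8x8 block with a stride of 8 */
}
#endif /* HAVE_ALTIVEC */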