/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
#include "libavutil/ppc/util_altivec.h"

#include "libavcodec/hpeldsp.h"

#include "hpeldsp_altivec.h"

#if HAVE_ALTIVEC
/* next one assumes that ((line_size % 16) == 0) */
void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register vector unsigned char pixelsv1;
    register vector unsigned char pixelsv1B;
    register vector unsigned char pixelsv1C;
    register vector unsigned char pixelsv1D;

    int i;
    register ptrdiff_t line_size_2 = line_size << 1;
    register ptrdiff_t line_size_3 = line_size + line_size_2;
    register ptrdiff_t line_size_4 = line_size << 2;

// hand-unrolling the loop by 4 gains about 15%
// minimum execution time goes from 74 to 60 cycles
// it's faster than -funroll-loops, but using
// -funroll-loops w/ this is bad - 74 cycles again.
// all this is on a 7450, tuning for the 7450
    for (i = 0; i < h; i += 4) {
        pixelsv1  = unaligned_load(0, pixels);
        pixelsv1B = unaligned_load(line_size, pixels);
        pixelsv1C = unaligned_load(line_size_2, pixels);
        pixelsv1D = unaligned_load(line_size_3, pixels);
        VEC_ST(pixelsv1,  0,           (unsigned char*)block);
        VEC_ST(pixelsv1B, line_size,   (unsigned char*)block);
        VEC_ST(pixelsv1C, line_size_2, (unsigned char*)block);
        VEC_ST(pixelsv1D, line_size_3, (unsigned char*)block);
        pixels += line_size_4;
        block  += line_size_4;
    }
}
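
/* For reference, an illustrative scalar equivalent of the copy above
 * (a sketch only, assuming the 16-byte rows that the alignment comment
 * requires; it is not used anywhere in this file):
 *
 *     for (i = 0; i < h; i++)
 *         memcpy(block + i * line_size, pixels + i * line_size, 16);
 *
 * The vector version instead does four unaligned 16-byte loads and four
 * 16-byte stores per iteration, which is where the unrolling win noted
 * above comes from.
 */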

/* next one assumes that ((line_size % 16) == 0) */
/* per-byte rounding-up average over a 32-bit word;
   the scalar counterpart of vec_avg used below */
#define op_avg(a,b)  a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
void ff_avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register vector unsigned char pixelsv, blockv;

    int i;
    for (i = 0; i < h; i++) {
        blockv  = vec_ld(0, block);
        pixelsv = VEC_LD(0, pixels);
        blockv  = vec_avg(blockv, pixelsv);
        vec_st(blockv, 0, (unsigned char*)block);
        pixels += line_size;
        block  += line_size;
    }
}

/* next one assumes that ((line_size % 8) == 0) */
static void avg_pixels8_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register vector unsigned char pixelsv, blockv;
    int i;

    for (i = 0; i < h; i++) {
        /* block is 8-byte aligned, so we're either in the
           left block (16-byte aligned) or in the right block (not) */
        int rightside = ((unsigned long)block & 0x0000000F);

        blockv  = vec_ld(0, block);
        pixelsv = VEC_LD(0, pixels);

        if (rightside) {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(0, 1, s0, s1));
        } else {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(s0, s1, 2, 3));
        }

        blockv = vec_avg(blockv, pixelsv);

        vec_st(blockv, 0, block);

        pixels += line_size;
        block  += line_size;
    }
}

/* next one assumes that ((line_size % 8) == 0) */
static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    /* prime the loop: pixelssum1 holds the horizontal pair sums of the
       previous line (widened to 16 bits), biased by the rounding constant */
    pixelsv1 = VEC_LD(0, pixels);
    pixelsv2 = VEC_LD(1, pixels);
    pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
    pixelsv2 = VEC_MERGEH(vczero, pixelsv2);

    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        pixelsv1 = unaligned_load(line_size, pixels);
        pixelsv2 = unaligned_load(line_size+1, pixels);
        pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
        pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside) {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block  += line_size;
        pixels += line_size;
    }
}
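
/* The "no_rnd" variants below differ from the rounding ones only in the
 * bias added before the final shift: the rounding forms compute
 * (a + b + c + d + 2) >> 2 per pixel, while the no-round forms compute
 * (a + b + c + d + 1) >> 2, hence vcone in place of vctwo. */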

/* next one assumes that ((line_size % 8) == 0) */
static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    pixelsv1 = VEC_LD(0, pixels);
    pixelsv2 = VEC_LD(1, pixels);
    pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
    pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    for (i = 0; i < h; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        pixelsv1 = unaligned_load(line_size, pixels);
        pixelsv2 = unaligned_load(line_size+1, pixels);
        pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
        pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vcone);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside) {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block  += line_size;
        pixels += line_size;
    }
}

/* next one assumes that ((line_size % 16) == 0) */
static void put_pixels16_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char blockv;
    register vector unsigned short temp3, temp4,
        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    pixelsv1 = VEC_LD(0, pixels);
    pixelsv2 = VEC_LD(1, pixels);
    pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
    pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
    pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
    pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vctwo);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h; i++) {
        pixelsv1 = unaligned_load(line_size, pixels);
        pixelsv2 = unaligned_load(line_size+1, pixels);

        pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
        pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
        pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
        pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        pixelssum3 = vec_add(pixelssum4, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);

        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block  += line_size;
        pixels += line_size;
    }
}
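
/* Worked example for one output pixel of the xy2 kernels, with the four
 * neighbours a = 1, b = 2, c = 3, d = 4 (sum = 10):
 *
 *     rounding (vctwo):  (10 + 2) >> 2 = 3
 *     no-round (vcone):  (10 + 1) >> 2 = 2
 *
 * Carrying pixelssum1/pixelssum3 across iterations reuses each line's
 * horizontal pair sums as the "previous line" of the next iteration, so
 * only one new pair of loads is needed per line. */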

/* next one assumes that ((line_size % 16) == 0) */
static void put_no_rnd_pixels16_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char blockv;
    register vector unsigned short temp3, temp4,
        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    pixelsv1 = VEC_LD(0, pixels);
    pixelsv2 = VEC_LD(1, pixels);
    pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
    pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
    pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
    pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vcone);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    for (i = 0; i < h; i++) {
        pixelsv1 = unaligned_load(line_size, pixels);
        pixelsv2 = unaligned_load(line_size+1, pixels);

        pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
        pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
        pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
        pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        pixelssum3 = vec_add(pixelssum4, vcone);
        pixelssum1 = vec_add(pixelssum2, vcone);

        blockv = vec_packsu(temp3, temp4);

        VEC_ST(blockv, 0, block);

        block  += line_size;
        pixels += line_size;
    }
}
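
/* The averaging variant below computes the same xy2 interpolation and
 * then merges the result into the destination with vec_avg, i.e. roughly
 *
 *     dst[x] = (dst[x] + interp[x] + 1) >> 1;
 *
 * per byte (vec_avg rounds upward). */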

/* next one assumes that ((line_size % 8) == 0) */
static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, blocktemp;
    register vector unsigned short pixelssum1, pixelssum2, temp3;

    register const vector unsigned char vczero = (const vector unsigned char)
                                        vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)
                                        vec_splat_u16(2);

    pixelsv1 = VEC_LD(0, pixels);
    pixelsv2 = VEC_LD(1, pixels);
    pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
    pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        pixelsv1 = unaligned_load(line_size, pixels);
        pixelsv2 = unaligned_load(line_size+1, pixels);

        pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
        pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside) {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        blockv = vec_avg(blocktemp, blockv);
        vec_st(blockv, 0, block);

        block  += line_size;
        pixels += line_size;
    }
}
#endif /* HAVE_ALTIVEC */

av_cold void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags)
{
#if HAVE_ALTIVEC
    if (!PPC_ALTIVEC(av_get_cpu_flags()))
        return;

    c->avg_pixels_tab[0][0]        = ff_avg_pixels16_altivec;
    c->avg_pixels_tab[1][0]        = avg_pixels8_altivec;
    c->avg_pixels_tab[1][3]        = avg_pixels8_xy2_altivec;

    c->put_pixels_tab[0][0]        = ff_put_pixels16_altivec;
    c->put_pixels_tab[1][3]        = put_pixels8_xy2_altivec;
    c->put_pixels_tab[0][3]        = put_pixels16_xy2_altivec;

    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_altivec;
    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
#endif /* HAVE_ALTIVEC */
}