/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
#include "libavutil/ppc/util_altivec.h"

#include "libavcodec/avcodec.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/me_cmp.h"

#if HAVE_ALTIVEC

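/* On big-endian CPUs, an unaligned 16-byte vector is assembled from two
 * aligned loads combined through a vec_lvsl-generated permute: per1 picks
 * bytes pix[0..15], and per2, one byte further on, picks pix[1..16]. On
 * little-endian targets, VSX provides unaligned loads directly, so no
 * permute vectors are needed. */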
#if HAVE_BIGENDIAN
#define GET_PERM(per1, per2, pix) {\
    per1 = vec_lvsl(0, pix);\
    per2 = vec_add(per1, vec_splat_u8(1));\
}
#define LOAD_PIX(v, iv, pix, per1, per2) {\
    vector unsigned char pix2l = vec_ld(0,  pix);\
    vector unsigned char pix2r = vec_ld(16, pix);\
    v  = vec_perm(pix2l, pix2r, per1);\
    iv = vec_perm(pix2l, pix2r, per2);\
}
#else
#define GET_PERM(per1, per2, pix) {}
#define LOAD_PIX(v, iv, pix, per1, per2) {\
    v  = vec_vsx_ld(0, pix);\
    iv = vec_vsx_ld(1, pix);\
}
#endif
static int sad16_x2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                            ptrdiff_t stride, int h)
{
    int i;
    int __attribute__((aligned(16))) s = 0;
    const vector unsigned char zero =
        (const vector unsigned char) vec_splat_u8(0);
    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumdiffs;
    vector unsigned char perm1, perm2, pix2v, pix2iv;

    GET_PERM(perm1, perm2, pix2);
    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
         * pix1v: pix1[0] - pix1[15]
         * pix2v: pix2[0] - pix2[15]  pix2iv: pix2[1] - pix2[16] */
        vector unsigned char pix1v = vec_ld(0, pix1);
        LOAD_PIX(pix2v, pix2iv, pix2, perm1, perm2);

        /* Calculate the average vector. */
        vector unsigned char avgv = vec_avg(pix2v, pix2iv);

        /* Calculate a sum of abs differences vector. */
        vector unsigned char t5 = vec_sub(vec_max(pix1v, avgv),
                                          vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t5, sad);

        pix1 += stride;
        pix2 += stride;
    }
    /* Sum up the four partial sums, and put the result into s. */
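    /* vec_sums places the 32-bit total in element 3 of its result;
     * vec_splat broadcasts it to all four elements so that vec_ste,
     * which stores the element selected by the target address, writes
     * the total to the 16-byte-aligned s. */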
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

static int sad16_y2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                            ptrdiff_t stride, int h)
{
    int i;
    int __attribute__((aligned(16))) s = 0;
    const vector unsigned char zero =
        (const vector unsigned char) vec_splat_u8(0);
    vector unsigned char pix1v, pix3v, avgv, t5;
    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumdiffs;

    uint8_t *pix3 = pix2 + stride;

    /* Because pix3 = pix2 + stride, the pix3 of one iteration becomes the
     * pix2 of the next. This lets us avoid a potentially expensive
     * unaligned read each time around the loop.
     * Read unaligned pixels into our vector:
     * pix2v: pix2[0] - pix2[15] */
    vector unsigned char pix2v = VEC_LD(0, pix2);

    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
         * pix1v: pix1[0] - pix1[15]
         * pix3v: pix3[0] - pix3[15] */
        pix1v = vec_ld(0, pix1);
        pix3v = VEC_LD(0, pix3);

        /* Calculate the average vector. */
        avgv = vec_avg(pix2v, pix3v);

        /* Calculate a sum of abs differences vector. */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t5, sad);

        pix1 += stride;
        pix2v = pix3v;
        pix3 += stride;
    }

    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

static int sad16_xy2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                             ptrdiff_t stride, int h)
{
    int i;
    int __attribute__((aligned(16))) s = 0;
    uint8_t *pix3 = pix2 + stride;
    const vector unsigned char zero =
        (const vector unsigned char) vec_splat_u8(0);
    const vector unsigned short two =
        (const vector unsigned short) vec_splat_u16(2);
    vector unsigned char avgv, t5;
    vector unsigned char pix1v, pix3v, pix3iv;
    vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
    vector unsigned short avghv, avglv;
    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumdiffs;
    vector unsigned char perm1, perm2, pix2v, pix2iv;
    GET_PERM(perm1, perm2, pix2);

    /* Because pix3 = pix2 + stride, the pix3 of one iteration becomes the
     * pix2 of the next. This lets us avoid a potentially expensive
     * unaligned read, as well as some splitting and vector addition, each
     * time around the loop.
     * Read unaligned pixels into our vectors:
     * pix2v: pix2[0] - pix2[15]  pix2iv: pix2[1] - pix2[16]
     * Split the pixel vectors into shorts. */
    LOAD_PIX(pix2v, pix2iv, pix2, perm1, perm2);
    vector unsigned short pix2hv =
        (vector unsigned short) VEC_MERGEH(zero, pix2v);
    vector unsigned short pix2lv =
        (vector unsigned short) VEC_MERGEL(zero, pix2v);
    vector unsigned short pix2ihv =
        (vector unsigned short) VEC_MERGEH(zero, pix2iv);
    vector unsigned short pix2ilv =
        (vector unsigned short) VEC_MERGEL(zero, pix2iv);

    vector unsigned short t1 = vec_add(pix2hv, pix2ihv);
    vector unsigned short t2 = vec_add(pix2lv, pix2ilv);
    vector unsigned short t3, t4;

    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
         * pix1v: pix1[0] - pix1[15]
         * pix3v: pix3[0] - pix3[15]  pix3iv: pix3[1] - pix3[16] */
        pix1v = vec_ld(0, pix1);
        LOAD_PIX(pix3v, pix3iv, pix3, perm1, perm2);

        /* Note that AltiVec does have vec_avg, but this works on vector
         * pairs and rounds up. We could do avg(avg(a, b), avg(c, d)), but
         * the rounding would mean that, for example, avg(3, 0, 0, 1) = 2,
         * when it should be 1. Instead, we have to split the pixel vectors
         * into vectors of shorts and do the averaging by hand. */
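        /* Worked example: for pixels a = 3, b = 0, c = 0, d = 1 the exact
         * average is (3 + 0 + 0 + 1 + 2) >> 2 = 1, but since vec_avg
         * computes (x + y + 1) >> 1, chaining it would give
         * avg(avg(3, 0), avg(0, 1)) = avg(2, 1) = 2. The short-based code
         * below computes (a + b + c + d + 2) >> 2 exactly. */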

        /* Split the pixel vectors into shorts. */
        pix3hv  = (vector unsigned short) VEC_MERGEH(zero, pix3v);
        pix3lv  = (vector unsigned short) VEC_MERGEL(zero, pix3v);
        pix3ihv = (vector unsigned short) VEC_MERGEH(zero, pix3iv);
        pix3ilv = (vector unsigned short) VEC_MERGEL(zero, pix3iv);

        /* Do the averaging on them. */
        t3 = vec_add(pix3hv, pix3ihv);
        t4 = vec_add(pix3lv, pix3ilv);

        avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
        avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);

        /* Pack the shorts back into a result. */
        avgv = vec_pack(avghv, avglv);

        /* Calculate a sum of abs differences vector. */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t5, sad);

        pix1 += stride;
        pix3 += stride;
        /* Transfer the calculated values for pix3 into pix2. */
        t1 = t3;
        t2 = t4;
    }
    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

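/* For reference, the vector loop below computes the plain SAD; the scalar
 * equivalent is roughly (illustrative, not compiled):
 *
 *     for (i = 0; i < h; i++, pix1 += stride, pix2 += stride)
 *         for (j = 0; j < 16; j++)
 *             s += FFABS(pix1[j] - pix2[j]);
 */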
static int sad16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         ptrdiff_t stride, int h)
{
    int i;
    int __attribute__((aligned(16))) s;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumdiffs;

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2. */
        vector unsigned char t1 = vec_ld(0, pix1);
        vector unsigned char t2 = VEC_LD(0, pix2);

        /* Calculate a sum of abs differences vector. */
        vector unsigned char t3 = vec_max(t1, t2);
        vector unsigned char t4 = vec_min(t1, t2);
        vector unsigned char t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t5, sad);

        pix1 += stride;
        pix2 += stride;
    }

    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

static int sad8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                        ptrdiff_t stride, int h)
{
    int i;
    int __attribute__((aligned(16))) s;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    const vector unsigned char permclear =
        (vector unsigned char)
        { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 };
    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumdiffs;

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2.
         * Since we're reading 16 pixels, and actually only want 8,
         * mask out the last 8 pixels. The 0s don't change the sum. */
        vector unsigned char pix1l = VEC_LD(0, pix1);
        vector unsigned char pix2l = VEC_LD(0, pix2);
        vector unsigned char t1 = vec_and(pix1l, permclear);
        vector unsigned char t2 = vec_and(pix2l, permclear);

        /* Calculate a sum of abs differences vector. */
        vector unsigned char t3 = vec_max(t1, t2);
        vector unsigned char t4 = vec_min(t1, t2);
        vector unsigned char t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t5, sad);

        pix1 += stride;
        pix2 += stride;
    }

    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

/* Sum of Squared Errors for an 8x8 block, AltiVec-enhanced.
 * It's the sad8_altivec code above w/ squaring added. */
static int sse8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                        ptrdiff_t stride, int h)
{
    int i;
    int __attribute__((aligned(16))) s;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    const vector unsigned char permclear =
        (vector unsigned char)
        { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 };
    vector unsigned int sum = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumsqr;

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2.
         * Since we're reading 16 pixels, and actually only want 8,
         * mask out the last 8 pixels. The 0s don't change the sum. */
        vector unsigned char t1 = vec_and(VEC_LD(0, pix1), permclear);
        vector unsigned char t2 = vec_and(VEC_LD(0, pix2), permclear);

        /* Since we want to use unsigned chars, we can take advantage
         * of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */
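        /* vec_msum multiplies corresponding unsigned chars into 16-bit
         * products and accumulates each group of four into a 32-bit element
         * of sum, so even the largest term, 255 * 255 = 65025, is summed
         * without overflow. */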

        /* Calculate abs differences vector. */
        vector unsigned char t3 = vec_max(t1, t2);
        vector unsigned char t4 = vec_min(t1, t2);
        vector unsigned char t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum. */
        sum = vec_msum(t5, t5, sum);

        pix1 += stride;
        pix2 += stride;
    }

    /* Sum up the four partial sums, and put the result into s. */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}

/* Sum of Squared Errors for a 16x16 block, AltiVec-enhanced.
 * It's the sad16_altivec code above w/ squaring added. */
static int sse16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         ptrdiff_t stride, int h)
{
    int i;
    int __attribute__((aligned(16))) s;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    vector unsigned int sum = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumsqr;

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2. */
        vector unsigned char t1 = vec_ld(0, pix1);
        vector unsigned char t2 = VEC_LD(0, pix2);

        /* Since we want to use unsigned chars, we can take advantage
         * of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */

        /* Calculate abs differences vector. */
        vector unsigned char t3 = vec_max(t1, t2);
        vector unsigned char t4 = vec_min(t1, t2);
        vector unsigned char t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum. */
        sum = vec_msum(t5, t5, sum);

        pix1 += stride;
        pix2 += stride;
    }

    /* Sum up the four partial sums, and put the result into s. */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}

static int hadamard8_diff8x8_altivec(MpegEncContext *s, uint8_t *dst,
                                     uint8_t *src, ptrdiff_t stride, int h)
{
    int __attribute__((aligned(16))) sum;
    register const vector unsigned char vzero =
        (const vector unsigned char) vec_splat_u8(0);
    register vector signed short temp0, temp1, temp2, temp3, temp4,
                                 temp5, temp6, temp7;
    {
        register const vector signed short vprod1 =
            (const vector signed short) { 1, -1, 1, -1, 1, -1, 1, -1 };
        register const vector signed short vprod2 =
            (const vector signed short) { 1, 1, -1, -1, 1, 1, -1, -1 };
        register const vector signed short vprod3 =
            (const vector signed short) { 1, 1, 1, 1, -1, -1, -1, -1 };
        register const vector unsigned char perm1 =
            (const vector unsigned char)
            { 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
              0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D };
        register const vector unsigned char perm2 =
            (const vector unsigned char)
            { 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
              0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B };
        register const vector unsigned char perm3 =
            (const vector unsigned char)
            { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
              0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };

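        /* Each vec_mladd(but, vprod, op) below computes
         * res[i] = but[i] * vprod[i] + op[i]. With vprod alternating +1/-1
         * and op a perm-swapped copy of but, a single step yields the sums
         * and differences of one butterfly stage on all eight 16-bit
         * elements at once. */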
#define ONEITERBUTTERFLY(i, res) \
    { \
        register vector unsigned char srcO = unaligned_load(stride * i, src); \
        register vector unsigned char dstO = unaligned_load(stride * i, dst); \
        \
        /* Promote the unsigned chars to signed shorts. */ \
        /* We're in the 8x8 function, we only care for the first 8. */ \
        register vector signed short srcV = \
            (vector signed short) VEC_MERGEH((vector signed char) vzero, \
                                             (vector signed char) srcO); \
        register vector signed short dstV = \
            (vector signed short) VEC_MERGEH((vector signed char) vzero, \
                                             (vector signed char) dstO); \
        \
        /* subtractions inside the first butterfly */ \
        register vector signed short but0 = vec_sub(srcV, dstV); \
        register vector signed short op1  = vec_perm(but0, but0, perm1); \
        register vector signed short but1 = vec_mladd(but0, vprod1, op1); \
        register vector signed short op2  = vec_perm(but1, but1, perm2); \
        register vector signed short but2 = vec_mladd(but1, vprod2, op2); \
        register vector signed short op3  = vec_perm(but2, but2, perm3); \
        res = vec_mladd(but2, vprod3, op3); \
    }

        ONEITERBUTTERFLY(0, temp0);
        ONEITERBUTTERFLY(1, temp1);
        ONEITERBUTTERFLY(2, temp2);
        ONEITERBUTTERFLY(3, temp3);
        ONEITERBUTTERFLY(4, temp4);
        ONEITERBUTTERFLY(5, temp5);
        ONEITERBUTTERFLY(6, temp6);
        ONEITERBUTTERFLY(7, temp7);
    }
#undef ONEITERBUTTERFLY
    {
        register vector signed int vsum;
        register vector signed short line0 = vec_add(temp0, temp1);
        register vector signed short line1 = vec_sub(temp0, temp1);
        register vector signed short line2 = vec_add(temp2, temp3);
        register vector signed short line3 = vec_sub(temp2, temp3);
        register vector signed short line4 = vec_add(temp4, temp5);
        register vector signed short line5 = vec_sub(temp4, temp5);
        register vector signed short line6 = vec_add(temp6, temp7);
        register vector signed short line7 = vec_sub(temp6, temp7);

        register vector signed short line0B = vec_add(line0, line2);
        register vector signed short line2B = vec_sub(line0, line2);
        register vector signed short line1B = vec_add(line1, line3);
        register vector signed short line3B = vec_sub(line1, line3);
        register vector signed short line4B = vec_add(line4, line6);
        register vector signed short line6B = vec_sub(line4, line6);
        register vector signed short line5B = vec_add(line5, line7);
        register vector signed short line7B = vec_sub(line5, line7);

        register vector signed short line0C = vec_add(line0B, line4B);
        register vector signed short line4C = vec_sub(line0B, line4B);
        register vector signed short line1C = vec_add(line1B, line5B);
        register vector signed short line5C = vec_sub(line1B, line5B);
        register vector signed short line2C = vec_add(line2B, line6B);
        register vector signed short line6C = vec_sub(line2B, line6B);
        register vector signed short line3C = vec_add(line3B, line7B);
        register vector signed short line7C = vec_sub(line3B, line7B);

        vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
        vsum = vec_sum4s(vec_abs(line1C), vsum);
        vsum = vec_sum4s(vec_abs(line2C), vsum);
        vsum = vec_sum4s(vec_abs(line3C), vsum);
        vsum = vec_sum4s(vec_abs(line4C), vsum);
        vsum = vec_sum4s(vec_abs(line5C), vsum);
        vsum = vec_sum4s(vec_abs(line6C), vsum);
        vsum = vec_sum4s(vec_abs(line7C), vsum);
        vsum = vec_sums(vsum, (vector signed int) vzero);
        vsum = vec_splat(vsum, 3);

        vec_ste(vsum, 0, &sum);
    }
    return sum;
}

/*
 * 16x8 works with 16 elements; it can avoid replicating loads, and
 * gives the compiler more room for scheduling. It's only used from
 * inside hadamard8_diff16_altivec.
 *
 * Unfortunately, it seems gcc-3.3 is a bit dumb: the compiled code has
 * a LOT of spill code; unlike xlc, gcc cannot keep everything in
 * registers by itself. The following code includes hand-made register
 * allocation. It's not clean, but on a 7450 the resulting code is much
 * faster (the best case falls from 700+ cycles to 550).
 *
 * xlc doesn't add spill code, but it doesn't know how to schedule for the
 * 7450, and its code isn't much faster than gcc-3.3's on the 7450 (though
 * it uses 25% fewer instructions...)
 *
 * On the 970, the hand-made RA is still a win (around 690 vs. around 780),
 * but xlc gets to around 660 on the regular C code...
 */
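/* The __asm__ ("vN") annotations below pin each variable to a fixed AltiVec
 * register; this is the hand-made register allocation described above. */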
static int hadamard8_diff16x8_altivec(MpegEncContext *s, uint8_t *dst,
                                      uint8_t *src, ptrdiff_t stride, int h)
{
    int __attribute__((aligned(16))) sum;
    register vector signed short
        temp0 __asm__ ("v0"),
        temp1 __asm__ ("v1"),
        temp2 __asm__ ("v2"),
        temp3 __asm__ ("v3"),
        temp4 __asm__ ("v4"),
        temp5 __asm__ ("v5"),
        temp6 __asm__ ("v6"),
        temp7 __asm__ ("v7");
    register vector signed short
        temp0S __asm__ ("v8"),
        temp1S __asm__ ("v9"),
        temp2S __asm__ ("v10"),
        temp3S __asm__ ("v11"),
        temp4S __asm__ ("v12"),
        temp5S __asm__ ("v13"),
        temp6S __asm__ ("v14"),
        temp7S __asm__ ("v15");
    register const vector unsigned char vzero __asm__ ("v31") =
        (const vector unsigned char) vec_splat_u8(0);
    {
        register const vector signed short vprod1 __asm__ ("v16") =
            (const vector signed short) { 1, -1, 1, -1, 1, -1, 1, -1 };

        register const vector signed short vprod2 __asm__ ("v17") =
            (const vector signed short) { 1, 1, -1, -1, 1, 1, -1, -1 };

        register const vector signed short vprod3 __asm__ ("v18") =
            (const vector signed short) { 1, 1, 1, 1, -1, -1, -1, -1 };

        register const vector unsigned char perm1 __asm__ ("v19") =
            (const vector unsigned char)
            { 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
              0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D };

        register const vector unsigned char perm2 __asm__ ("v20") =
            (const vector unsigned char)
            { 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
              0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B };

        register const vector unsigned char perm3 __asm__ ("v21") =
            (const vector unsigned char)
            { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
              0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };

#define ONEITERBUTTERFLY(i, res1, res2) \
    { \
        register vector unsigned char srcO __asm__ ("v22") = \
            unaligned_load(stride * i, src); \
        register vector unsigned char dstO __asm__ ("v23") = \
            unaligned_load(stride * i, dst); \
        \
        /* Promote the unsigned chars to signed shorts. */ \
        register vector signed short srcV __asm__ ("v24") = \
            (vector signed short) VEC_MERGEH((vector signed char) vzero, \
                                             (vector signed char) srcO); \
        register vector signed short dstV __asm__ ("v25") = \
            (vector signed short) VEC_MERGEH((vector signed char) vzero, \
                                             (vector signed char) dstO); \
        register vector signed short srcW __asm__ ("v26") = \
            (vector signed short) VEC_MERGEL((vector signed char) vzero, \
                                             (vector signed char) srcO); \
        register vector signed short dstW __asm__ ("v27") = \
            (vector signed short) VEC_MERGEL((vector signed char) vzero, \
                                             (vector signed char) dstO); \
        \
        /* subtractions inside the first butterfly */ \
        register vector signed short but0 __asm__ ("v28") = \
            vec_sub(srcV, dstV); \
        register vector signed short but0S __asm__ ("v29") = \
            vec_sub(srcW, dstW); \
        register vector signed short op1 __asm__ ("v30") = \
            vec_perm(but0, but0, perm1); \
        register vector signed short but1 __asm__ ("v22") = \
            vec_mladd(but0, vprod1, op1); \
        register vector signed short op1S __asm__ ("v23") = \
            vec_perm(but0S, but0S, perm1); \
        register vector signed short but1S __asm__ ("v24") = \
            vec_mladd(but0S, vprod1, op1S); \
        register vector signed short op2 __asm__ ("v25") = \
            vec_perm(but1, but1, perm2); \
        register vector signed short but2 __asm__ ("v26") = \
            vec_mladd(but1, vprod2, op2); \
        register vector signed short op2S __asm__ ("v27") = \
            vec_perm(but1S, but1S, perm2); \
        register vector signed short but2S __asm__ ("v28") = \
            vec_mladd(but1S, vprod2, op2S); \
        register vector signed short op3 __asm__ ("v29") = \
            vec_perm(but2, but2, perm3); \
        register vector signed short op3S __asm__ ("v30") = \
            vec_perm(but2S, but2S, perm3); \
        res1 = vec_mladd(but2, vprod3, op3); \
        res2 = vec_mladd(but2S, vprod3, op3S); \
    }

        ONEITERBUTTERFLY(0, temp0, temp0S);
        ONEITERBUTTERFLY(1, temp1, temp1S);
        ONEITERBUTTERFLY(2, temp2, temp2S);
        ONEITERBUTTERFLY(3, temp3, temp3S);
        ONEITERBUTTERFLY(4, temp4, temp4S);
        ONEITERBUTTERFLY(5, temp5, temp5S);
        ONEITERBUTTERFLY(6, temp6, temp6S);
        ONEITERBUTTERFLY(7, temp7, temp7S);
    }
#undef ONEITERBUTTERFLY
    {
        register vector signed int vsum;

        register vector signed short line0 = vec_add(temp0, temp1);
        register vector signed short line1 = vec_sub(temp0, temp1);
        register vector signed short line2 = vec_add(temp2, temp3);
        register vector signed short line3 = vec_sub(temp2, temp3);
        register vector signed short line4 = vec_add(temp4, temp5);
        register vector signed short line5 = vec_sub(temp4, temp5);
        register vector signed short line6 = vec_add(temp6, temp7);
        register vector signed short line7 = vec_sub(temp6, temp7);

        register vector signed short line0B = vec_add(line0, line2);
        register vector signed short line2B = vec_sub(line0, line2);
        register vector signed short line1B = vec_add(line1, line3);
        register vector signed short line3B = vec_sub(line1, line3);
        register vector signed short line4B = vec_add(line4, line6);
        register vector signed short line6B = vec_sub(line4, line6);
        register vector signed short line5B = vec_add(line5, line7);
        register vector signed short line7B = vec_sub(line5, line7);

        register vector signed short line0C = vec_add(line0B, line4B);
        register vector signed short line4C = vec_sub(line0B, line4B);
        register vector signed short line1C = vec_add(line1B, line5B);
        register vector signed short line5C = vec_sub(line1B, line5B);
        register vector signed short line2C = vec_add(line2B, line6B);
        register vector signed short line6C = vec_sub(line2B, line6B);
        register vector signed short line3C = vec_add(line3B, line7B);
        register vector signed short line7C = vec_sub(line3B, line7B);

        register vector signed short line0S = vec_add(temp0S, temp1S);
        register vector signed short line1S = vec_sub(temp0S, temp1S);
        register vector signed short line2S = vec_add(temp2S, temp3S);
        register vector signed short line3S = vec_sub(temp2S, temp3S);
        register vector signed short line4S = vec_add(temp4S, temp5S);
        register vector signed short line5S = vec_sub(temp4S, temp5S);
        register vector signed short line6S = vec_add(temp6S, temp7S);
        register vector signed short line7S = vec_sub(temp6S, temp7S);

        register vector signed short line0BS = vec_add(line0S, line2S);
        register vector signed short line2BS = vec_sub(line0S, line2S);
        register vector signed short line1BS = vec_add(line1S, line3S);
        register vector signed short line3BS = vec_sub(line1S, line3S);
        register vector signed short line4BS = vec_add(line4S, line6S);
        register vector signed short line6BS = vec_sub(line4S, line6S);
        register vector signed short line5BS = vec_add(line5S, line7S);
        register vector signed short line7BS = vec_sub(line5S, line7S);

        register vector signed short line0CS = vec_add(line0BS, line4BS);
        register vector signed short line4CS = vec_sub(line0BS, line4BS);
        register vector signed short line1CS = vec_add(line1BS, line5BS);
        register vector signed short line5CS = vec_sub(line1BS, line5BS);
        register vector signed short line2CS = vec_add(line2BS, line6BS);
        register vector signed short line6CS = vec_sub(line2BS, line6BS);
        register vector signed short line3CS = vec_add(line3BS, line7BS);
        register vector signed short line7CS = vec_sub(line3BS, line7BS);

        vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
        vsum = vec_sum4s(vec_abs(line1C), vsum);
        vsum = vec_sum4s(vec_abs(line2C), vsum);
        vsum = vec_sum4s(vec_abs(line3C), vsum);
        vsum = vec_sum4s(vec_abs(line4C), vsum);
        vsum = vec_sum4s(vec_abs(line5C), vsum);
        vsum = vec_sum4s(vec_abs(line6C), vsum);
        vsum = vec_sum4s(vec_abs(line7C), vsum);

        vsum = vec_sum4s(vec_abs(line0CS), vsum);
        vsum = vec_sum4s(vec_abs(line1CS), vsum);
        vsum = vec_sum4s(vec_abs(line2CS), vsum);
        vsum = vec_sum4s(vec_abs(line3CS), vsum);
        vsum = vec_sum4s(vec_abs(line4CS), vsum);
        vsum = vec_sum4s(vec_abs(line5CS), vsum);
        vsum = vec_sum4s(vec_abs(line6CS), vsum);
        vsum = vec_sum4s(vec_abs(line7CS), vsum);
        vsum = vec_sums(vsum, (vector signed int) vzero);
        vsum = vec_splat(vsum, 3);

        vec_ste(vsum, 0, &sum);
    }
    return sum;
}

static int hadamard8_diff16_altivec(MpegEncContext *s, uint8_t *dst,
                                    uint8_t *src, ptrdiff_t stride, int h)
{
    int score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);

    if (h == 16) {
        dst   += 8 * stride;
        src   += 8 * stride;
        score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
    }
    return score;
}
#endif /* HAVE_ALTIVEC */

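/* FFmpeg indexes pix_abs by block width and interpolation: the first index
 * selects 16-wide (0) or 8-wide (1) comparisons, the second selects fullpel
 * (0), horizontal half-pel (1), vertical half-pel (2) or both (3). Only the
 * 16-wide half-pel variants have AltiVec versions here. */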
av_cold void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx)
{
#if HAVE_ALTIVEC
    if (!PPC_ALTIVEC(av_get_cpu_flags()))
        return;

    c->pix_abs[0][1] = sad16_x2_altivec;
    c->pix_abs[0][2] = sad16_y2_altivec;
    c->pix_abs[0][3] = sad16_xy2_altivec;
    c->pix_abs[0][0] = sad16_altivec;
    c->pix_abs[1][0] = sad8_altivec;

    c->sad[0] = sad16_altivec;
    c->sad[1] = sad8_altivec;
    c->sse[0] = sse16_altivec;
    c->sse[1] = sse8_altivec;

    c->hadamard8_diff[0] = hadamard8_diff16_altivec;
    c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;
#endif /* HAVE_ALTIVEC */
}