/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#if HAVE_UNISTD_H
#include <unistd.h>
#endif

#include "libavutil/avassert.h"
#include "libavutil/mem.h"
#include "libavutil/ppc/util_altivec.h"

#define ASSERT_ALIGNED(ptr) av_assert2(!((uintptr_t)(ptr) & 0x0000000F))

#if HAVE_BIGENDIAN
#define load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3){\
    vec_u8 srcR1 = vec_ld(-2, s);\
    vec_u8 srcR2 = vec_ld(14, s);\
    switch (ali) {\
    default: {\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = vec_perm(srcR1, srcR2, pp1);\
        srcP2 = vec_perm(srcR1, srcR2, pp2);\
        srcP3 = vec_perm(srcR1, srcR2, pp3);\
    } break;\
    case 11: {\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = vec_perm(srcR1, srcR2, pp1);\
        srcP2 = vec_perm(srcR1, srcR2, pp2);\
        srcP3 = srcR2;\
    } break;\
    case 12: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = vec_perm(srcR1, srcR2, pp1);\
        srcP2 = srcR2;\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    case 13: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = srcR2;\
        srcP2 = vec_perm(srcR2, srcR3, pp2);\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    case 14: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = srcR2;\
        srcP1 = vec_perm(srcR2, srcR3, pp1);\
        srcP2 = vec_perm(srcR2, srcR3, pp2);\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    case 15: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = srcR2;\
        srcP0 = vec_perm(srcR2, srcR3, pp0);\
        srcP1 = vec_perm(srcR2, srcR3, pp1);\
        srcP2 = vec_perm(srcR2, srcR3, pp2);\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    }\
}
#else
#define load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3){\
    srcM2 = vec_vsx_ld(-2, s);\
    srcM1 = vec_vsx_ld(-1, s);\
    srcP0 = vec_vsx_ld( 0, s);\
    srcP1 = vec_vsx_ld( 1, s);\
    srcP2 = vec_vsx_ld( 2, s);\
    srcP3 = vec_vsx_ld( 3, s);\
}
#endif /* HAVE_BIGENDIAN */
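
/*
 * All three lowpass functions below implement the six-tap H.264
 * half-pel interpolation filter with taps (1, -5, 20, 20, -5, 1).
 * In scalar terms, the single-pass output for a pixel x[0] is
 *
 *     av_clip_uint8((x[-2] - 5*x[-1] + 20*x[0] + 20*x[1] - 5*x[2] + x[3] + 16) >> 5)
 *
 * computed here as 20*(P0+P1) - 5*(M1+P2) + (M2+P3) on 16 pixels at a
 * time, with each vector widened into two eight-element s16 halves
 * (the A/B suffixes).  load_alignment() fetches the six byte-shifted
 * source vectors srcM2..srcP3: the big-endian AltiVec path combines
 * aligned vec_ld()s with vec_perm(); for alignments 11-15 the later
 * vectors spill past the second aligned load, so srcR2 is used
 * directly where it happens to line up exactly and a third load
 * (srcR3) covers the rest.  The little-endian path uses VSX
 * vec_vsx_ld(), which loads from unaligned addresses directly and
 * leaves the permute arguments unused.
 */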
/* this code assumes stride % 16 == 0 */
#ifdef PREFIX_h264_qpel16_h_lowpass_altivec
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t *dst,
                                                 const uint8_t *src,
                                                 int dstStride, int srcStride)
{
    register int i;

    LOAD_ZERO;
    vec_u8 permM2, permM1, permP0, permP1, permP2, permP3;
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5), vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1), vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((uintptr_t)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;

    vec_u8 sum, fsum;

#if HAVE_BIGENDIAN
    permM2 = vec_lvsl(-2, src);
    permM1 = vec_lvsl(-1, src);
    permP0 = vec_lvsl(+0, src);
    permP1 = vec_lvsl(+1, src);
    permP2 = vec_lvsl(+2, src);
    permP3 = vec_lvsl(+3, src);
#endif /* HAVE_BIGENDIAN */

    for (i = 0; i < 16; i++) {
        load_alignment(src, align, permM2, permM1, permP0, permP1, permP2, permP3);

        srcP0A = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
        srcP0B = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
        srcP1A = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
        srcP1B = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);

        srcP2A = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
        srcP2B = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);
        srcP3A = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
        srcP3B = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);

        srcM1A = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
        srcM1B = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
        srcM2A = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
        srcM2B = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
}
#endif /* PREFIX_h264_qpel16_h_lowpass_altivec */
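
/*
 * These templates are compiled by the including file, which defines
 * PREFIX_* to the desired function name and OP_U8_ALTIVEC to either a
 * plain store or an averaging store before each #include.  A minimal
 * sketch of the expected instantiation (the macro bodies shown here
 * are illustrative, not necessarily the exact ones in h264qpel.c):
 *
 *     #define PUT_OP_U8_ALTIVEC(d, s, dst) d = s
 *     #define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(s, dst)
 *
 *     #define OP_U8_ALTIVEC                        PUT_OP_U8_ALTIVEC
 *     #define PREFIX_h264_qpel16_h_lowpass_altivec put_h264_qpel16_h_lowpass_altivec
 *     #include "h264qpel_template.c"
 *     #undef OP_U8_ALTIVEC
 *     #undef PREFIX_h264_qpel16_h_lowpass_altivec
 */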
/* this code assumes stride % 16 == 0 */
#ifdef PREFIX_h264_qpel16_v_lowpass_altivec
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t *dst,
                                                 const uint8_t *src,
                                                 int dstStride, int srcStride)
{
    register int i;

    LOAD_ZERO;
    vec_u8 perm;
#if HAVE_BIGENDIAN
    perm = vec_lvsl(0, src);
#endif
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5), vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1), vec_splat_u16(4));

    const uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcM1 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcP0 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcP1 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcP2 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;

    vec_s16 srcM2ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, fsum, srcP3;

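    /*
     * The five rows loaded above seed a sliding six-row window: each
     * iteration below loads only the one new row (srcP3), applies the
     * vertical filter, then shifts the widened row registers down one
     * position, so every source row is loaded exactly once.
     */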
    for (i = 0; i < 16; i++) {
        srcP3 = load_with_perm_vec(0, srcbis, perm);
        srcbis += srcStride;

        srcP3ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif /* PREFIX_h264_qpel16_v_lowpass_altivec */

/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
#ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t *dst, int16_t *tmp,
                                                  const uint8_t *src,
                                                  int dstStride, int tmpStride,
                                                  int srcStride)
{
    register int i;
    LOAD_ZERO;
    vec_u8 permM2, permM1, permP0, permP1, permP2, permP3;
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5), vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1), vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1), vec_splat_u32(4));

    register int align = ((((uintptr_t)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum;
    vec_s16 ssume, ssumo;

#if HAVE_BIGENDIAN
    permM2 = vec_lvsl(-2, src);
    permM1 = vec_lvsl(-1, src);
    permP0 = vec_lvsl(+0, src);
    permP1 = vec_lvsl(+1, src);
    permP2 = vec_lvsl(+2, src);
    permP3 = vec_lvsl(+3, src);
#endif /* HAVE_BIGENDIAN */

    src -= (2 * srcStride);
    for (i = 0; i < 21; i++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

        load_alignment(src, align, permM2, permM1, permP0, permP1, permP2, permP3);

        srcP0A = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
        srcP0B = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
        srcP1A = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
        srcP1B = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);

        srcP2A = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
        srcP2B = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);
        srcP3A = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
        srcP3B = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);

        srcM1A = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
        srcM1B = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
        srcM2A = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
        srcM2B = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }
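
    /*
     * Second pass: filter the 21 unrounded 16-bit rows in tmp
     * vertically.  The products no longer fit in 16 bits here, so each
     * row is split into even/odd 32-bit products with vec_mule()/
     * vec_mulo(), scaled by (sum + 512) >> 10 (the two passes divide
     * by 32 each), and re-interleaved by mperm at the end.
     */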
    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

    for (i = 0; i < 16; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Bo = vec_mulo(sum3B, v1ss);
#if !HAVE_BIGENDIAN
        /* swap the halfwords within each word so the shift below picks
         * the even elements on little-endian too */
        sum3A = (vec_s16)vec_perm(sum3A, sum3A, vcswapi2s(0, 1, 2, 3));
        sum3B = (vec_s16)vec_perm(sum3B, sum3B, vcswapi2s(0, 1, 2, 3));
#endif
        /* sign-extend the even 16-bit elements into 32 bits (1x tap) */
        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif /* PREFIX_h264_qpel16_hv_lowpass_altivec */