1cabdff1aSopenharmony_ci/* 2cabdff1aSopenharmony_ci * Copyright (c) 2002 Brian Foley 3cabdff1aSopenharmony_ci * Copyright (c) 2002 Dieter Shirley 4cabdff1aSopenharmony_ci * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org> 5cabdff1aSopenharmony_ci * 6cabdff1aSopenharmony_ci * This file is part of FFmpeg. 7cabdff1aSopenharmony_ci * 8cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 9cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 10cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 11cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 12cabdff1aSopenharmony_ci * 13cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 14cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 15cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16cabdff1aSopenharmony_ci * Lesser General Public License for more details. 17cabdff1aSopenharmony_ci * 18cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 19cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 20cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 21cabdff1aSopenharmony_ci */ 22cabdff1aSopenharmony_ci 23cabdff1aSopenharmony_ci#include "config.h" 24cabdff1aSopenharmony_ci 25cabdff1aSopenharmony_ci#include "libavutil/attributes.h" 26cabdff1aSopenharmony_ci#include "libavutil/cpu.h" 27cabdff1aSopenharmony_ci#include "libavutil/ppc/cpu.h" 28cabdff1aSopenharmony_ci#include "libavutil/ppc/util_altivec.h" 29cabdff1aSopenharmony_ci 30cabdff1aSopenharmony_ci#include "libavcodec/hpeldsp.h" 31cabdff1aSopenharmony_ci 32cabdff1aSopenharmony_ci#include "hpeldsp_altivec.h" 33cabdff1aSopenharmony_ci 34cabdff1aSopenharmony_ci#if HAVE_ALTIVEC 35cabdff1aSopenharmony_ci/* next one assumes that ((line_size % 16) == 0) */ 36cabdff1aSopenharmony_civoid ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) 37cabdff1aSopenharmony_ci{ 38cabdff1aSopenharmony_ci register vector unsigned char pixelsv1; 39cabdff1aSopenharmony_ci register vector unsigned char pixelsv1B; 40cabdff1aSopenharmony_ci register vector unsigned char pixelsv1C; 41cabdff1aSopenharmony_ci register vector unsigned char pixelsv1D; 42cabdff1aSopenharmony_ci 43cabdff1aSopenharmony_ci int i; 44cabdff1aSopenharmony_ci register ptrdiff_t line_size_2 = line_size << 1; 45cabdff1aSopenharmony_ci register ptrdiff_t line_size_3 = line_size + line_size_2; 46cabdff1aSopenharmony_ci register ptrdiff_t line_size_4 = line_size << 2; 47cabdff1aSopenharmony_ci 48cabdff1aSopenharmony_ci// hand-unrolling the loop by 4 gains about 15% 49cabdff1aSopenharmony_ci// mininum execution time goes from 74 to 60 cycles 50cabdff1aSopenharmony_ci// it's faster than -funroll-loops, but using 51cabdff1aSopenharmony_ci// -funroll-loops w/ this is bad - 74 cycles again. 52cabdff1aSopenharmony_ci// all this is on a 7450, tuning for the 7450 53cabdff1aSopenharmony_ci for (i = 0; i < h; i += 4) { 54cabdff1aSopenharmony_ci pixelsv1 = unaligned_load( 0, pixels); 55cabdff1aSopenharmony_ci pixelsv1B = unaligned_load(line_size, pixels); 56cabdff1aSopenharmony_ci pixelsv1C = unaligned_load(line_size_2, pixels); 57cabdff1aSopenharmony_ci pixelsv1D = unaligned_load(line_size_3, pixels); 58cabdff1aSopenharmony_ci VEC_ST(pixelsv1, 0, (unsigned char*)block); 59cabdff1aSopenharmony_ci VEC_ST(pixelsv1B, line_size, (unsigned char*)block); 60cabdff1aSopenharmony_ci VEC_ST(pixelsv1C, line_size_2, (unsigned char*)block); 61cabdff1aSopenharmony_ci VEC_ST(pixelsv1D, line_size_3, (unsigned char*)block); 62cabdff1aSopenharmony_ci pixels+=line_size_4; 63cabdff1aSopenharmony_ci block +=line_size_4; 64cabdff1aSopenharmony_ci } 65cabdff1aSopenharmony_ci} 66cabdff1aSopenharmony_ci 67cabdff1aSopenharmony_ci/* next one assumes that ((line_size % 16) == 0) */ 68cabdff1aSopenharmony_ci#define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) ) 69cabdff1aSopenharmony_civoid ff_avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) 70cabdff1aSopenharmony_ci{ 71cabdff1aSopenharmony_ci register vector unsigned char pixelsv, blockv; 72cabdff1aSopenharmony_ci 73cabdff1aSopenharmony_ci int i; 74cabdff1aSopenharmony_ci for (i = 0; i < h; i++) { 75cabdff1aSopenharmony_ci blockv = vec_ld(0, block); 76cabdff1aSopenharmony_ci pixelsv = VEC_LD( 0, pixels); 77cabdff1aSopenharmony_ci blockv = vec_avg(blockv,pixelsv); 78cabdff1aSopenharmony_ci vec_st(blockv, 0, (unsigned char*)block); 79cabdff1aSopenharmony_ci pixels+=line_size; 80cabdff1aSopenharmony_ci block +=line_size; 81cabdff1aSopenharmony_ci } 82cabdff1aSopenharmony_ci} 83cabdff1aSopenharmony_ci 84cabdff1aSopenharmony_ci/* next one assumes that ((line_size % 8) == 0) */ 85cabdff1aSopenharmony_cistatic void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h) 86cabdff1aSopenharmony_ci{ 87cabdff1aSopenharmony_ci register vector unsigned char pixelsv, blockv; 88cabdff1aSopenharmony_ci int i; 89cabdff1aSopenharmony_ci 90cabdff1aSopenharmony_ci for (i = 0; i < h; i++) { 91cabdff1aSopenharmony_ci /* block is 8 bytes-aligned, so we're either in the 92cabdff1aSopenharmony_ci left block (16 bytes-aligned) or in the right block (not) */ 93cabdff1aSopenharmony_ci int rightside = ((unsigned long)block & 0x0000000F); 94cabdff1aSopenharmony_ci 95cabdff1aSopenharmony_ci blockv = vec_ld(0, block); 96cabdff1aSopenharmony_ci pixelsv = VEC_LD( 0, pixels); 97cabdff1aSopenharmony_ci 98cabdff1aSopenharmony_ci if (rightside) { 99cabdff1aSopenharmony_ci pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1)); 100cabdff1aSopenharmony_ci } else { 101cabdff1aSopenharmony_ci pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3)); 102cabdff1aSopenharmony_ci } 103cabdff1aSopenharmony_ci 104cabdff1aSopenharmony_ci blockv = vec_avg(blockv, pixelsv); 105cabdff1aSopenharmony_ci 106cabdff1aSopenharmony_ci vec_st(blockv, 0, block); 107cabdff1aSopenharmony_ci 108cabdff1aSopenharmony_ci pixels += line_size; 109cabdff1aSopenharmony_ci block += line_size; 110cabdff1aSopenharmony_ci } 111cabdff1aSopenharmony_ci} 112cabdff1aSopenharmony_ci 113cabdff1aSopenharmony_ci/* next one assumes that ((line_size % 8) == 0) */ 114cabdff1aSopenharmony_cistatic void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) 115cabdff1aSopenharmony_ci{ 116cabdff1aSopenharmony_ci register int i; 117cabdff1aSopenharmony_ci register vector unsigned char pixelsv1, pixelsv2, pixelsavg; 118cabdff1aSopenharmony_ci register vector unsigned char blockv; 119cabdff1aSopenharmony_ci register vector unsigned short pixelssum1, pixelssum2, temp3; 120cabdff1aSopenharmony_ci register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); 121cabdff1aSopenharmony_ci register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); 122cabdff1aSopenharmony_ci 123cabdff1aSopenharmony_ci pixelsv1 = VEC_LD(0, pixels); 124cabdff1aSopenharmony_ci pixelsv2 = VEC_LD(1, pixels); 125cabdff1aSopenharmony_ci pixelsv1 = VEC_MERGEH(vczero, pixelsv1); 126cabdff1aSopenharmony_ci pixelsv2 = VEC_MERGEH(vczero, pixelsv2); 127cabdff1aSopenharmony_ci 128cabdff1aSopenharmony_ci pixelssum1 = vec_add((vector unsigned short)pixelsv1, 129cabdff1aSopenharmony_ci (vector unsigned short)pixelsv2); 130cabdff1aSopenharmony_ci pixelssum1 = vec_add(pixelssum1, vctwo); 131cabdff1aSopenharmony_ci 132cabdff1aSopenharmony_ci for (i = 0; i < h ; i++) { 133cabdff1aSopenharmony_ci int rightside = ((unsigned long)block & 0x0000000F); 134cabdff1aSopenharmony_ci blockv = vec_ld(0, block); 135cabdff1aSopenharmony_ci 136cabdff1aSopenharmony_ci pixelsv1 = unaligned_load(line_size, pixels); 137cabdff1aSopenharmony_ci pixelsv2 = unaligned_load(line_size+1, pixels); 138cabdff1aSopenharmony_ci pixelsv1 = VEC_MERGEH(vczero, pixelsv1); 139cabdff1aSopenharmony_ci pixelsv2 = VEC_MERGEH(vczero, pixelsv2); 140cabdff1aSopenharmony_ci pixelssum2 = vec_add((vector unsigned short)pixelsv1, 141cabdff1aSopenharmony_ci (vector unsigned short)pixelsv2); 142cabdff1aSopenharmony_ci temp3 = vec_add(pixelssum1, pixelssum2); 143cabdff1aSopenharmony_ci temp3 = vec_sra(temp3, vctwo); 144cabdff1aSopenharmony_ci pixelssum1 = vec_add(pixelssum2, vctwo); 145cabdff1aSopenharmony_ci pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); 146cabdff1aSopenharmony_ci 147cabdff1aSopenharmony_ci if (rightside) { 148cabdff1aSopenharmony_ci blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); 149cabdff1aSopenharmony_ci } else { 150cabdff1aSopenharmony_ci blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); 151cabdff1aSopenharmony_ci } 152cabdff1aSopenharmony_ci 153cabdff1aSopenharmony_ci vec_st(blockv, 0, block); 154cabdff1aSopenharmony_ci 155cabdff1aSopenharmony_ci block += line_size; 156cabdff1aSopenharmony_ci pixels += line_size; 157cabdff1aSopenharmony_ci } 158cabdff1aSopenharmony_ci} 159cabdff1aSopenharmony_ci 160cabdff1aSopenharmony_ci/* next one assumes that ((line_size % 8) == 0) */ 161cabdff1aSopenharmony_cistatic void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) 162cabdff1aSopenharmony_ci{ 163cabdff1aSopenharmony_ci register int i; 164cabdff1aSopenharmony_ci register vector unsigned char pixelsv1, pixelsv2, pixelsavg; 165cabdff1aSopenharmony_ci register vector unsigned char blockv; 166cabdff1aSopenharmony_ci register vector unsigned short pixelssum1, pixelssum2, temp3; 167cabdff1aSopenharmony_ci register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); 168cabdff1aSopenharmony_ci register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); 169cabdff1aSopenharmony_ci register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); 170cabdff1aSopenharmony_ci 171cabdff1aSopenharmony_ci pixelsv1 = VEC_LD(0, pixels); 172cabdff1aSopenharmony_ci pixelsv2 = VEC_LD(1, pixels); 173cabdff1aSopenharmony_ci pixelsv1 = VEC_MERGEH(vczero, pixelsv1); 174cabdff1aSopenharmony_ci pixelsv2 = VEC_MERGEH(vczero, pixelsv2); 175cabdff1aSopenharmony_ci pixelssum1 = vec_add((vector unsigned short)pixelsv1, 176cabdff1aSopenharmony_ci (vector unsigned short)pixelsv2); 177cabdff1aSopenharmony_ci pixelssum1 = vec_add(pixelssum1, vcone); 178cabdff1aSopenharmony_ci 179cabdff1aSopenharmony_ci for (i = 0; i < h ; i++) { 180cabdff1aSopenharmony_ci int rightside = ((unsigned long)block & 0x0000000F); 181cabdff1aSopenharmony_ci blockv = vec_ld(0, block); 182cabdff1aSopenharmony_ci 183cabdff1aSopenharmony_ci pixelsv1 = unaligned_load(line_size, pixels); 184cabdff1aSopenharmony_ci pixelsv2 = unaligned_load(line_size+1, pixels); 185cabdff1aSopenharmony_ci pixelsv1 = VEC_MERGEH(vczero, pixelsv1); 186cabdff1aSopenharmony_ci pixelsv2 = VEC_MERGEH(vczero, pixelsv2); 187cabdff1aSopenharmony_ci pixelssum2 = vec_add((vector unsigned short)pixelsv1, 188cabdff1aSopenharmony_ci (vector unsigned short)pixelsv2); 189cabdff1aSopenharmony_ci temp3 = vec_add(pixelssum1, pixelssum2); 190cabdff1aSopenharmony_ci temp3 = vec_sra(temp3, vctwo); 191cabdff1aSopenharmony_ci pixelssum1 = vec_add(pixelssum2, vcone); 192cabdff1aSopenharmony_ci pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); 193cabdff1aSopenharmony_ci 194cabdff1aSopenharmony_ci if (rightside) { 195cabdff1aSopenharmony_ci blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); 196cabdff1aSopenharmony_ci } else { 197cabdff1aSopenharmony_ci blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); 198cabdff1aSopenharmony_ci } 199cabdff1aSopenharmony_ci 200cabdff1aSopenharmony_ci vec_st(blockv, 0, block); 201cabdff1aSopenharmony_ci 202cabdff1aSopenharmony_ci block += line_size; 203cabdff1aSopenharmony_ci pixels += line_size; 204cabdff1aSopenharmony_ci } 205cabdff1aSopenharmony_ci} 206cabdff1aSopenharmony_ci 207cabdff1aSopenharmony_ci/* next one assumes that ((line_size % 16) == 0) */ 208cabdff1aSopenharmony_cistatic void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h) 209cabdff1aSopenharmony_ci{ 210cabdff1aSopenharmony_ci register int i; 211cabdff1aSopenharmony_ci register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4; 212cabdff1aSopenharmony_ci register vector unsigned char blockv; 213cabdff1aSopenharmony_ci register vector unsigned short temp3, temp4, 214cabdff1aSopenharmony_ci pixelssum1, pixelssum2, pixelssum3, pixelssum4; 215cabdff1aSopenharmony_ci register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); 216cabdff1aSopenharmony_ci register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); 217cabdff1aSopenharmony_ci 218cabdff1aSopenharmony_ci pixelsv1 = VEC_LD(0, pixels); 219cabdff1aSopenharmony_ci pixelsv2 = VEC_LD(1, pixels); 220cabdff1aSopenharmony_ci pixelsv3 = VEC_MERGEL(vczero, pixelsv1); 221cabdff1aSopenharmony_ci pixelsv4 = VEC_MERGEL(vczero, pixelsv2); 222cabdff1aSopenharmony_ci pixelsv1 = VEC_MERGEH(vczero, pixelsv1); 223cabdff1aSopenharmony_ci pixelsv2 = VEC_MERGEH(vczero, pixelsv2); 224cabdff1aSopenharmony_ci pixelssum3 = vec_add((vector unsigned short)pixelsv3, 225cabdff1aSopenharmony_ci (vector unsigned short)pixelsv4); 226cabdff1aSopenharmony_ci pixelssum3 = vec_add(pixelssum3, vctwo); 227cabdff1aSopenharmony_ci pixelssum1 = vec_add((vector unsigned short)pixelsv1, 228cabdff1aSopenharmony_ci (vector unsigned short)pixelsv2); 229cabdff1aSopenharmony_ci pixelssum1 = vec_add(pixelssum1, vctwo); 230cabdff1aSopenharmony_ci 231cabdff1aSopenharmony_ci for (i = 0; i < h ; i++) { 232cabdff1aSopenharmony_ci blockv = vec_ld(0, block); 233cabdff1aSopenharmony_ci 234cabdff1aSopenharmony_ci pixelsv1 = unaligned_load(line_size, pixels); 235cabdff1aSopenharmony_ci pixelsv2 = unaligned_load(line_size+1, pixels); 236cabdff1aSopenharmony_ci 237cabdff1aSopenharmony_ci pixelsv3 = VEC_MERGEL(vczero, pixelsv1); 238cabdff1aSopenharmony_ci pixelsv4 = VEC_MERGEL(vczero, pixelsv2); 239cabdff1aSopenharmony_ci pixelsv1 = VEC_MERGEH(vczero, pixelsv1); 240cabdff1aSopenharmony_ci pixelsv2 = VEC_MERGEH(vczero, pixelsv2); 241cabdff1aSopenharmony_ci pixelssum4 = vec_add((vector unsigned short)pixelsv3, 242cabdff1aSopenharmony_ci (vector unsigned short)pixelsv4); 243cabdff1aSopenharmony_ci pixelssum2 = vec_add((vector unsigned short)pixelsv1, 244cabdff1aSopenharmony_ci (vector unsigned short)pixelsv2); 245cabdff1aSopenharmony_ci temp4 = vec_add(pixelssum3, pixelssum4); 246cabdff1aSopenharmony_ci temp4 = vec_sra(temp4, vctwo); 247cabdff1aSopenharmony_ci temp3 = vec_add(pixelssum1, pixelssum2); 248cabdff1aSopenharmony_ci temp3 = vec_sra(temp3, vctwo); 249cabdff1aSopenharmony_ci 250cabdff1aSopenharmony_ci pixelssum3 = vec_add(pixelssum4, vctwo); 251cabdff1aSopenharmony_ci pixelssum1 = vec_add(pixelssum2, vctwo); 252cabdff1aSopenharmony_ci 253cabdff1aSopenharmony_ci blockv = vec_packsu(temp3, temp4); 254cabdff1aSopenharmony_ci 255cabdff1aSopenharmony_ci vec_st(blockv, 0, block); 256cabdff1aSopenharmony_ci 257cabdff1aSopenharmony_ci block += line_size; 258cabdff1aSopenharmony_ci pixels += line_size; 259cabdff1aSopenharmony_ci } 260cabdff1aSopenharmony_ci} 261cabdff1aSopenharmony_ci 262cabdff1aSopenharmony_ci/* next one assumes that ((line_size % 16) == 0) */ 263cabdff1aSopenharmony_cistatic void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h) 264cabdff1aSopenharmony_ci{ 265cabdff1aSopenharmony_ci register int i; 266cabdff1aSopenharmony_ci register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4; 267cabdff1aSopenharmony_ci register vector unsigned char blockv; 268cabdff1aSopenharmony_ci register vector unsigned short temp3, temp4, 269cabdff1aSopenharmony_ci pixelssum1, pixelssum2, pixelssum3, pixelssum4; 270cabdff1aSopenharmony_ci register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); 271cabdff1aSopenharmony_ci register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); 272cabdff1aSopenharmony_ci register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); 273cabdff1aSopenharmony_ci 274cabdff1aSopenharmony_ci pixelsv1 = VEC_LD(0, pixels); 275cabdff1aSopenharmony_ci pixelsv2 = VEC_LD(1, pixels); 276cabdff1aSopenharmony_ci pixelsv3 = VEC_MERGEL(vczero, pixelsv1); 277cabdff1aSopenharmony_ci pixelsv4 = VEC_MERGEL(vczero, pixelsv2); 278cabdff1aSopenharmony_ci pixelsv1 = VEC_MERGEH(vczero, pixelsv1); 279cabdff1aSopenharmony_ci pixelsv2 = VEC_MERGEH(vczero, pixelsv2); 280cabdff1aSopenharmony_ci pixelssum3 = vec_add((vector unsigned short)pixelsv3, 281cabdff1aSopenharmony_ci (vector unsigned short)pixelsv4); 282cabdff1aSopenharmony_ci pixelssum3 = vec_add(pixelssum3, vcone); 283cabdff1aSopenharmony_ci pixelssum1 = vec_add((vector unsigned short)pixelsv1, 284cabdff1aSopenharmony_ci (vector unsigned short)pixelsv2); 285cabdff1aSopenharmony_ci pixelssum1 = vec_add(pixelssum1, vcone); 286cabdff1aSopenharmony_ci 287cabdff1aSopenharmony_ci for (i = 0; i < h ; i++) { 288cabdff1aSopenharmony_ci pixelsv1 = unaligned_load(line_size, pixels); 289cabdff1aSopenharmony_ci pixelsv2 = unaligned_load(line_size+1, pixels); 290cabdff1aSopenharmony_ci 291cabdff1aSopenharmony_ci pixelsv3 = VEC_MERGEL(vczero, pixelsv1); 292cabdff1aSopenharmony_ci pixelsv4 = VEC_MERGEL(vczero, pixelsv2); 293cabdff1aSopenharmony_ci pixelsv1 = VEC_MERGEH(vczero, pixelsv1); 294cabdff1aSopenharmony_ci pixelsv2 = VEC_MERGEH(vczero, pixelsv2); 295cabdff1aSopenharmony_ci pixelssum4 = vec_add((vector unsigned short)pixelsv3, 296cabdff1aSopenharmony_ci (vector unsigned short)pixelsv4); 297cabdff1aSopenharmony_ci pixelssum2 = vec_add((vector unsigned short)pixelsv1, 298cabdff1aSopenharmony_ci (vector unsigned short)pixelsv2); 299cabdff1aSopenharmony_ci temp4 = vec_add(pixelssum3, pixelssum4); 300cabdff1aSopenharmony_ci temp4 = vec_sra(temp4, vctwo); 301cabdff1aSopenharmony_ci temp3 = vec_add(pixelssum1, pixelssum2); 302cabdff1aSopenharmony_ci temp3 = vec_sra(temp3, vctwo); 303cabdff1aSopenharmony_ci 304cabdff1aSopenharmony_ci pixelssum3 = vec_add(pixelssum4, vcone); 305cabdff1aSopenharmony_ci pixelssum1 = vec_add(pixelssum2, vcone); 306cabdff1aSopenharmony_ci 307cabdff1aSopenharmony_ci blockv = vec_packsu(temp3, temp4); 308cabdff1aSopenharmony_ci 309cabdff1aSopenharmony_ci VEC_ST(blockv, 0, block); 310cabdff1aSopenharmony_ci 311cabdff1aSopenharmony_ci block += line_size; 312cabdff1aSopenharmony_ci pixels += line_size; 313cabdff1aSopenharmony_ci } 314cabdff1aSopenharmony_ci} 315cabdff1aSopenharmony_ci 316cabdff1aSopenharmony_ci/* next one assumes that ((line_size % 8) == 0) */ 317cabdff1aSopenharmony_cistatic void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) 318cabdff1aSopenharmony_ci{ 319cabdff1aSopenharmony_ci register int i; 320cabdff1aSopenharmony_ci register vector unsigned char pixelsv1, pixelsv2, pixelsavg; 321cabdff1aSopenharmony_ci register vector unsigned char blockv, blocktemp; 322cabdff1aSopenharmony_ci register vector unsigned short pixelssum1, pixelssum2, temp3; 323cabdff1aSopenharmony_ci 324cabdff1aSopenharmony_ci register const vector unsigned char vczero = (const vector unsigned char) 325cabdff1aSopenharmony_ci vec_splat_u8(0); 326cabdff1aSopenharmony_ci register const vector unsigned short vctwo = (const vector unsigned short) 327cabdff1aSopenharmony_ci vec_splat_u16(2); 328cabdff1aSopenharmony_ci 329cabdff1aSopenharmony_ci pixelsv1 = VEC_LD(0, pixels); 330cabdff1aSopenharmony_ci pixelsv2 = VEC_LD(1, pixels); 331cabdff1aSopenharmony_ci pixelsv1 = VEC_MERGEH(vczero, pixelsv1); 332cabdff1aSopenharmony_ci pixelsv2 = VEC_MERGEH(vczero, pixelsv2); 333cabdff1aSopenharmony_ci pixelssum1 = vec_add((vector unsigned short)pixelsv1, 334cabdff1aSopenharmony_ci (vector unsigned short)pixelsv2); 335cabdff1aSopenharmony_ci pixelssum1 = vec_add(pixelssum1, vctwo); 336cabdff1aSopenharmony_ci 337cabdff1aSopenharmony_ci for (i = 0; i < h ; i++) { 338cabdff1aSopenharmony_ci int rightside = ((unsigned long)block & 0x0000000F); 339cabdff1aSopenharmony_ci blockv = vec_ld(0, block); 340cabdff1aSopenharmony_ci 341cabdff1aSopenharmony_ci pixelsv1 = unaligned_load(line_size, pixels); 342cabdff1aSopenharmony_ci pixelsv2 = unaligned_load(line_size+1, pixels); 343cabdff1aSopenharmony_ci 344cabdff1aSopenharmony_ci pixelsv1 = VEC_MERGEH(vczero, pixelsv1); 345cabdff1aSopenharmony_ci pixelsv2 = VEC_MERGEH(vczero, pixelsv2); 346cabdff1aSopenharmony_ci pixelssum2 = vec_add((vector unsigned short)pixelsv1, 347cabdff1aSopenharmony_ci (vector unsigned short)pixelsv2); 348cabdff1aSopenharmony_ci temp3 = vec_add(pixelssum1, pixelssum2); 349cabdff1aSopenharmony_ci temp3 = vec_sra(temp3, vctwo); 350cabdff1aSopenharmony_ci pixelssum1 = vec_add(pixelssum2, vctwo); 351cabdff1aSopenharmony_ci pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); 352cabdff1aSopenharmony_ci 353cabdff1aSopenharmony_ci if (rightside) { 354cabdff1aSopenharmony_ci blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); 355cabdff1aSopenharmony_ci } else { 356cabdff1aSopenharmony_ci blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); 357cabdff1aSopenharmony_ci } 358cabdff1aSopenharmony_ci 359cabdff1aSopenharmony_ci blockv = vec_avg(blocktemp, blockv); 360cabdff1aSopenharmony_ci vec_st(blockv, 0, block); 361cabdff1aSopenharmony_ci 362cabdff1aSopenharmony_ci block += line_size; 363cabdff1aSopenharmony_ci pixels += line_size; 364cabdff1aSopenharmony_ci } 365cabdff1aSopenharmony_ci} 366cabdff1aSopenharmony_ci#endif /* HAVE_ALTIVEC */ 367cabdff1aSopenharmony_ci 368cabdff1aSopenharmony_ciav_cold void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags) 369cabdff1aSopenharmony_ci{ 370cabdff1aSopenharmony_ci#if HAVE_ALTIVEC 371cabdff1aSopenharmony_ci if (!PPC_ALTIVEC(av_get_cpu_flags())) 372cabdff1aSopenharmony_ci return; 373cabdff1aSopenharmony_ci 374cabdff1aSopenharmony_ci c->avg_pixels_tab[0][0] = ff_avg_pixels16_altivec; 375cabdff1aSopenharmony_ci c->avg_pixels_tab[1][0] = avg_pixels8_altivec; 376cabdff1aSopenharmony_ci c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec; 377cabdff1aSopenharmony_ci 378cabdff1aSopenharmony_ci c->put_pixels_tab[0][0] = ff_put_pixels16_altivec; 379cabdff1aSopenharmony_ci c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec; 380cabdff1aSopenharmony_ci c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec; 381cabdff1aSopenharmony_ci 382cabdff1aSopenharmony_ci c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_altivec; 383cabdff1aSopenharmony_ci c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec; 384cabdff1aSopenharmony_ci c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec; 385cabdff1aSopenharmony_ci#endif /* HAVE_ALTIVEC */ 386cabdff1aSopenharmony_ci} 387