/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
#include "libavutil/ppc/util_altivec.h"

#include "libavcodec/avcodec.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/me_cmp.h"

#if HAVE_ALTIVEC

#if HAVE_BIGENDIAN
/* Big-endian AltiVec has no single-instruction unaligned load, so build two
 * permute vectors: per1 selects pix[0..15] out of a pair of aligned loads,
 * per2 (per1 + 1) selects pix[1..16]. */
#define GET_PERM(per1, per2, pix) {\
    per1 = vec_lvsl(0, pix);\
    per2 = vec_add(per1, vec_splat_u8(1));\
}
/* Load v = pix[0..15] and iv = pix[1..16] from a potentially unaligned
 * address, via two aligned 16-byte loads and the permutes from GET_PERM. */
#define LOAD_PIX(v, iv, pix, per1, per2) {\
    vector unsigned char pix2l = vec_ld(0, pix);\
    vector unsigned char pix2r = vec_ld(16, pix);\
    v = vec_perm(pix2l, pix2r, per1);\
    iv = vec_perm(pix2l, pix2r, per2);\
}
#else
/* Little-endian targets use VSX unaligned loads directly; no permutes
 * are needed, so GET_PERM is a no-op. */
#define GET_PERM(per1, per2, pix) {}
#define LOAD_PIX(v, iv, pix, per1, per2) {\
    v = vec_vsx_ld(0, pix);\
    iv = vec_vsx_ld(1, pix);\
}
#endif

/* SAD of a 16xh block where the reference is pix2 interpolated half a pixel
 * horizontally: each reference byte is avg(pix2[i], pix2[i + 1]) (vec_avg
 * rounds up).  `v` (the MpegEncContext) is unused here. */
static int sad16_x2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                            ptrdiff_t stride, int h)
{
    int i;
    /* 16-byte alignment so vec_ste can store a vector element into s. */
    int __attribute__((aligned(16))) s = 0;
    const vector unsigned char zero =
        (const vector unsigned char) vec_splat_u8(0);
    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumdiffs;
    vector unsigned char perm1, perm2, pix2v, pix2iv;

    GET_PERM(perm1, perm2, pix2);
    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
         * pix1v: pix1[0] - pix1[15]
         * pix2v: pix2[0] - pix2[15]      pix2iv: pix2[1] - pix2[16] */
        vector unsigned char pix1v = vec_ld(0, pix1);
        LOAD_PIX(pix2v, pix2iv, pix2, perm1, perm2);

        /* Calculate the average vector. */
        vector unsigned char avgv = vec_avg(pix2v, pix2iv);

        /* Calculate a sum of abs differences vector.
         * max - min == |a - b| for unsigned operands. */
        vector unsigned char t5 = vec_sub(vec_max(pix1v, avgv),
                                          vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t5, sad);

        pix1 += stride;
        pix2 += stride;
    }
    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

/* SAD of a 16xh block where the reference is pix2 interpolated half a pixel
 * vertically: each reference byte is avg(pix2[i], pix2[i + stride]). */
static int sad16_y2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                            ptrdiff_t stride, int h)
{
    int i;
    int __attribute__((aligned(16))) s = 0;
    const vector unsigned char zero =
        (const vector unsigned char) vec_splat_u8(0);
    vector unsigned char pix1v, pix3v, avgv, t5;
    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumdiffs;

    uint8_t *pix3 = pix2 + stride;

    /* Due to the fact that pix3 = pix2 + stride, the pix3 of one
     * iteration becomes pix2 in the next iteration. We can use this
     * fact to avoid a potentially expensive unaligned read, each
     * time around the loop.
     * Read unaligned pixels into our vectors. The vectors are as follows:
     * pix2v: pix2[0] - pix2[15] */
    vector unsigned char pix2v = VEC_LD(0, pix2);

    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
         * pix1v: pix1[0] - pix1[15]
         * pix3v: pix3[0] - pix3[15] */
        pix1v = vec_ld(0, pix1);
        pix3v = VEC_LD(0, pix3);

        /* Calculate the average vector. */
        avgv = vec_avg(pix2v, pix3v);

        /* Calculate a sum of abs differences vector. */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t5, sad);

        pix1 += stride;
        /* Reuse this row's lower load as next row's upper row. */
        pix2v = pix3v;
        pix3 += stride;
    }

    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
    return s;
}

/* SAD of a 16xh block where the reference is pix2 interpolated half a pixel
 * both horizontally and vertically: each reference byte is
 * (pix2[i] + pix2[i+1] + pix2[i+stride] + pix2[i+stride+1] + 2) >> 2,
 * computed in 16-bit arithmetic to get the correct rounding. */
static int sad16_xy2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                             ptrdiff_t stride, int h)
{
    int i;
    int __attribute__((aligned(16))) s = 0;
    uint8_t *pix3 = pix2 + stride;
    const vector unsigned char zero =
        (const vector unsigned char) vec_splat_u8(0);
    const vector unsigned short two =
        (const vector unsigned short) vec_splat_u16(2);
    vector unsigned char avgv, t5;
    vector unsigned char pix1v, pix3v, pix3iv;
    vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
    vector unsigned short avghv, avglv;
    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumdiffs;
    vector unsigned char perm1, perm2, pix2v, pix2iv;
    GET_PERM(perm1, perm2, pix2);

    /* Due to the fact that pix3 = pix2 + stride, the pix3 of one
     * iteration becomes pix2 in the next iteration. We can use this
     * fact to avoid a potentially expensive unaligned read, as well
     * as some splitting, and vector addition each time around the loop.
     * Read unaligned pixels into our vectors. The vectors are as follows:
     * pix2v: pix2[0] - pix2[15]  pix2iv: pix2[1] - pix2[16]
     * Split the pixel vectors into shorts. */
    LOAD_PIX(pix2v, pix2iv, pix2, perm1, perm2);
    vector unsigned short pix2hv =
        (vector unsigned short) VEC_MERGEH(zero, pix2v);
    vector unsigned short pix2lv =
        (vector unsigned short) VEC_MERGEL(zero, pix2v);
    vector unsigned short pix2ihv =
        (vector unsigned short) VEC_MERGEH(zero, pix2iv);
    vector unsigned short pix2ilv =
        (vector unsigned short) VEC_MERGEL(zero, pix2iv);

    /* t1/t2 hold the horizontal pair-sums of the upper row (high/low half). */
    vector unsigned short t1 = vec_add(pix2hv, pix2ihv);
    vector unsigned short t2 = vec_add(pix2lv, pix2ilv);
    vector unsigned short t3, t4;

    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
         * pix1v: pix1[0] - pix1[15]
         * pix3v: pix3[0] - pix3[15]      pix3iv: pix3[1] - pix3[16] */
        pix1v = vec_ld(0, pix1);
        LOAD_PIX(pix3v, pix3iv, pix3, perm1, perm2);

        /* Note that AltiVec does have vec_avg, but this works on vector pairs
         * and rounds up. We could do avg(avg(a, b), avg(c, d)), but the
         * rounding would mean that, for example, avg(3, 0, 0, 1) = 2, when
         * it should be 1. Instead, we have to split the pixel vectors into
         * vectors of shorts and do the averaging by hand. */

        /* Split the pixel vectors into shorts. */
        pix3hv = (vector unsigned short) VEC_MERGEH(zero, pix3v);
        pix3lv = (vector unsigned short) VEC_MERGEL(zero, pix3v);
        pix3ihv = (vector unsigned short) VEC_MERGEH(zero, pix3iv);
        pix3ilv = (vector unsigned short) VEC_MERGEL(zero, pix3iv);

        /* Do the averaging on them. */
        t3 = vec_add(pix3hv, pix3ihv);
        t4 = vec_add(pix3lv, pix3ilv);

        /* (upper pair-sum + lower pair-sum + 2) >> 2 — rounded 4-tap mean. */
        avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
        avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);

        /* Pack the shorts back into a result. */
        avgv = vec_pack(avghv, avglv);

        /* Calculate a sum of abs differences vector. */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t5, sad);

        pix1 += stride;
        pix3 += stride;
        /* Transfer the calculated values for pix3 into pix2. */
        t1 = t3;
        t2 = t4;
    }
    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

/* Plain SAD of a 16xh block: sum of |pix1[i] - pix2[i]|. */
static int sad16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         ptrdiff_t stride, int h)
{
    int i;
    int __attribute__((aligned(16))) s;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumdiffs;

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2. */
        vector unsigned char t1 = vec_ld(0, pix1);
        vector unsigned char t2 = VEC_LD(0, pix2);

        /* Calculate a sum of abs differences vector. */
        vector unsigned char t3 = vec_max(t1, t2);
        vector unsigned char t4 = vec_min(t1, t2);
        vector unsigned char t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t5, sad);

        pix1 += stride;
        pix2 += stride;
    }

    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

/* Plain SAD of an 8xh block.  Loads 16 bytes per row and masks off the
 * upper 8 so the vector machinery can be reused unchanged. */
static int sad8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                        ptrdiff_t stride, int h)
{
    int i;
    int __attribute__((aligned(16))) s;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    /* Keep the first 8 bytes, clear the last 8. */
    const vector unsigned char permclear =
        (vector unsigned char)
        { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 };
    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumdiffs;

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2.
         * Since we're reading 16 pixels, and actually only want 8,
         * mask out the last 8 pixels. The 0s don't change the sum. */
        vector unsigned char pix1l = VEC_LD(0, pix1);
        vector unsigned char pix2l = VEC_LD(0, pix2);
        vector unsigned char t1 = vec_and(pix1l, permclear);
        vector unsigned char t2 = vec_and(pix2l, permclear);

        /* Calculate a sum of abs differences vector. */
        vector unsigned char t3 = vec_max(t1, t2);
        vector unsigned char t4 = vec_min(t1, t2);
        vector unsigned char t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t5, sad);

        pix1 += stride;
        pix2 += stride;
    }

    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

/* Sum of Squared Errors for an 8x8 block, AltiVec-enhanced.
 * It's the sad8_altivec code above w/ squaring added. */
static int sse8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                        ptrdiff_t stride, int h)
{
    int i;
    int __attribute__((aligned(16))) s;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    const vector unsigned char permclear =
        (vector unsigned char)
        { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 };
    vector unsigned int sum = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumsqr;

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2.
         * Since we're reading 16 pixels, and actually only want 8,
         * mask out the last 8 pixels. The 0s don't change the sum. */
        vector unsigned char t1 = vec_and(VEC_LD(0, pix1), permclear);
        vector unsigned char t2 = vec_and(VEC_LD(0, pix2), permclear);

        /* Since we want to use unsigned chars, we can take advantage
         * of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */

        /* Calculate abs differences vector. */
        vector unsigned char t3 = vec_max(t1, t2);
        vector unsigned char t4 = vec_min(t1, t2);
        vector unsigned char t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum.
         * vec_msum multiplies pairs and accumulates into 32-bit lanes. */
        sum = vec_msum(t5, t5, sum);

        pix1 += stride;
        pix2 += stride;
    }

    /* Sum up the four partial sums, and put the result into s. */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}

/* Sum of Squared Errors for a 16x16 block, AltiVec-enhanced.
 * It's the sad16_altivec code above w/ squaring added. */
static int sse16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         ptrdiff_t stride, int h)
{
    int i;
    int __attribute__((aligned(16))) s;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    vector unsigned int sum = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumsqr;

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2. */
        vector unsigned char t1 = vec_ld(0, pix1);
        vector unsigned char t2 = VEC_LD(0, pix2);

        /* Since we want to use unsigned chars, we can take advantage
         * of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */

        /* Calculate abs differences vector. */
        vector unsigned char t3 = vec_max(t1, t2);
        vector unsigned char t4 = vec_min(t1, t2);
        vector unsigned char t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum. */
        sum = vec_msum(t5, t5, sum);

        pix1 += stride;
        pix2 += stride;
    }

    /* Sum up the four partial sums, and put the result into s. */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);

    vec_ste(sumsqr, 0, &s);
    return s;
}

/* Sum of absolute Hadamard-transformed differences for an 8x8 block:
 * per row, butterfly the (src - dst) difference with three
 * permute+multiply-add stages, then combine rows with add/sub stages
 * and accumulate absolute values.  `h` is unused (fixed 8x8). */
static int hadamard8_diff8x8_altivec(MpegEncContext *s, uint8_t *dst,
                                     uint8_t *src, ptrdiff_t stride, int h)
{
    int __attribute__((aligned(16))) sum;
    register const vector unsigned char vzero =
        (const vector unsigned char) vec_splat_u8(0);
    register vector signed short temp0, temp1, temp2, temp3, temp4,
                                 temp5, temp6, temp7;
    {
        /* Sign patterns for the three horizontal butterfly stages. */
        register const vector signed short vprod1 =
            (const vector signed short) { 1, -1, 1, -1, 1, -1, 1, -1 };
        register const vector signed short vprod2 =
            (const vector signed short) { 1, 1, -1, -1, 1, 1, -1, -1 };
        register const vector signed short vprod3 =
            (const vector signed short) { 1, 1, 1, 1, -1, -1, -1, -1 };
        /* Element swaps pairing each lane with its butterfly partner
         * (distance 1, 2 and 4 respectively). */
        register const vector unsigned char perm1 =
            (const vector unsigned char)
            { 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
              0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D };
        register const vector unsigned char perm2 =
            (const vector unsigned char)
            { 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
              0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B };
        register const vector unsigned char perm3 =
            (const vector unsigned char)
            { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
              0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };


#define ONEITERBUTTERFLY(i, res) \
    { \
        register vector unsigned char srcO = unaligned_load(stride * i, src); \
        register vector unsigned char dstO = unaligned_load(stride * i, dst);\
        \
        /* Promote the unsigned chars to signed shorts. */ \
        /* We're in the 8x8 function, we only care for the first 8. */ \
        register vector signed short srcV = \
            (vector signed short) VEC_MERGEH((vector signed char) vzero, \
                                             (vector signed char) srcO); \
        register vector signed short dstV = \
            (vector signed short) VEC_MERGEH((vector signed char) vzero, \
                                             (vector signed char) dstO); \
        \
        /* subtractions inside the first butterfly */ \
        register vector signed short but0 = vec_sub(srcV, dstV); \
        register vector signed short op1 = vec_perm(but0, but0, perm1); \
        register vector signed short but1 = vec_mladd(but0, vprod1, op1); \
        register vector signed short op2 = vec_perm(but1, but1, perm2); \
        register vector signed short but2 = vec_mladd(but1, vprod2, op2); \
        register vector signed short op3 = vec_perm(but2, but2, perm3); \
        res = vec_mladd(but2, vprod3, op3); \
    }

        ONEITERBUTTERFLY(0, temp0);
        ONEITERBUTTERFLY(1, temp1);
        ONEITERBUTTERFLY(2, temp2);
        ONEITERBUTTERFLY(3, temp3);
        ONEITERBUTTERFLY(4, temp4);
        ONEITERBUTTERFLY(5, temp5);
        ONEITERBUTTERFLY(6, temp6);
        ONEITERBUTTERFLY(7, temp7);
    }
#undef ONEITERBUTTERFLY
    {
        register vector signed int vsum;

        /* Vertical butterfly: three add/sub stages across the 8 rows. */
        register vector signed short line0 = vec_add(temp0, temp1);
        register vector signed short line1 = vec_sub(temp0, temp1);
        register vector signed short line2 = vec_add(temp2, temp3);
        register vector signed short line3 = vec_sub(temp2, temp3);
        register vector signed short line4 = vec_add(temp4, temp5);
        register vector signed short line5 = vec_sub(temp4, temp5);
        register vector signed short line6 = vec_add(temp6, temp7);
        register vector signed short line7 = vec_sub(temp6, temp7);

        register vector signed short line0B = vec_add(line0, line2);
        register vector signed short line2B = vec_sub(line0, line2);
        register vector signed short line1B = vec_add(line1, line3);
        register vector signed short line3B = vec_sub(line1, line3);
        register vector signed short line4B = vec_add(line4, line6);
        register vector signed short line6B = vec_sub(line4, line6);
        register vector signed short line5B = vec_add(line5, line7);
        register vector signed short line7B = vec_sub(line5, line7);

        register vector signed short line0C = vec_add(line0B, line4B);
        register vector signed short line4C = vec_sub(line0B, line4B);
        register vector signed short line1C = vec_add(line1B, line5B);
        register vector signed short line5C = vec_sub(line1B, line5B);
        register vector signed short line2C = vec_add(line2B, line6B);
        register vector signed short line6C = vec_sub(line2B, line6B);
        register vector signed short line3C = vec_add(line3B, line7B);
        register vector signed short line7C = vec_sub(line3B, line7B);

        /* Accumulate absolute values of all transformed coefficients. */
        vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
        vsum = vec_sum4s(vec_abs(line1C), vsum);
        vsum = vec_sum4s(vec_abs(line2C), vsum);
        vsum = vec_sum4s(vec_abs(line3C), vsum);
        vsum = vec_sum4s(vec_abs(line4C), vsum);
        vsum = vec_sum4s(vec_abs(line5C), vsum);
        vsum = vec_sum4s(vec_abs(line6C), vsum);
        vsum = vec_sum4s(vec_abs(line7C), vsum);
        vsum = vec_sums(vsum, (vector signed int) vzero);
        vsum = vec_splat(vsum, 3);

        vec_ste(vsum, 0, &sum);
    }
    return sum;
}

/*
 * 16x8 works with 16 elements; it can avoid replicating loads, and
 * gives the compiler more room for scheduling. It's only used from
 * inside hadamard8_diff16_altivec.
 *
 * Unfortunately, it seems gcc-3.3 is a bit dumb, and the compiled code has
 * a LOT of spill code, it seems gcc (unlike xlc) cannot keep everything in
 * registers by itself. The following code includes hand-made register
 * allocation. It's not clean, but on a 7450 the resulting code is much faster
 * (best case falls from 700+ cycles to 550).
 *
 * xlc doesn't add spill code, but it doesn't know how to schedule for the
 * 7450, and its code isn't much faster than gcc-3.3 on the 7450 (but uses
 * 25% fewer instructions...)
517cabdff1aSopenharmony_ci * 518cabdff1aSopenharmony_ci * On the 970, the hand-made RA is still a win (around 690 vs. around 780), 519cabdff1aSopenharmony_ci * but xlc goes to around 660 on the regular C code... 520cabdff1aSopenharmony_ci */ 521cabdff1aSopenharmony_cistatic int hadamard8_diff16x8_altivec(MpegEncContext *s, uint8_t *dst, 522cabdff1aSopenharmony_ci uint8_t *src, ptrdiff_t stride, int h) 523cabdff1aSopenharmony_ci{ 524cabdff1aSopenharmony_ci int __attribute__((aligned(16))) sum; 525cabdff1aSopenharmony_ci register vector signed short 526cabdff1aSopenharmony_ci temp0 __asm__ ("v0"), 527cabdff1aSopenharmony_ci temp1 __asm__ ("v1"), 528cabdff1aSopenharmony_ci temp2 __asm__ ("v2"), 529cabdff1aSopenharmony_ci temp3 __asm__ ("v3"), 530cabdff1aSopenharmony_ci temp4 __asm__ ("v4"), 531cabdff1aSopenharmony_ci temp5 __asm__ ("v5"), 532cabdff1aSopenharmony_ci temp6 __asm__ ("v6"), 533cabdff1aSopenharmony_ci temp7 __asm__ ("v7"); 534cabdff1aSopenharmony_ci register vector signed short 535cabdff1aSopenharmony_ci temp0S __asm__ ("v8"), 536cabdff1aSopenharmony_ci temp1S __asm__ ("v9"), 537cabdff1aSopenharmony_ci temp2S __asm__ ("v10"), 538cabdff1aSopenharmony_ci temp3S __asm__ ("v11"), 539cabdff1aSopenharmony_ci temp4S __asm__ ("v12"), 540cabdff1aSopenharmony_ci temp5S __asm__ ("v13"), 541cabdff1aSopenharmony_ci temp6S __asm__ ("v14"), 542cabdff1aSopenharmony_ci temp7S __asm__ ("v15"); 543cabdff1aSopenharmony_ci register const vector unsigned char vzero __asm__ ("v31") = 544cabdff1aSopenharmony_ci (const vector unsigned char) vec_splat_u8(0); 545cabdff1aSopenharmony_ci { 546cabdff1aSopenharmony_ci register const vector signed short vprod1 __asm__ ("v16") = 547cabdff1aSopenharmony_ci (const vector signed short) { 1, -1, 1, -1, 1, -1, 1, -1 }; 548cabdff1aSopenharmony_ci 549cabdff1aSopenharmony_ci register const vector signed short vprod2 __asm__ ("v17") = 550cabdff1aSopenharmony_ci (const vector signed short) { 1, 1, -1, -1, 1, 1, -1, -1 }; 551cabdff1aSopenharmony_ci 
552cabdff1aSopenharmony_ci register const vector signed short vprod3 __asm__ ("v18") = 553cabdff1aSopenharmony_ci (const vector signed short) { 1, 1, 1, 1, -1, -1, -1, -1 }; 554cabdff1aSopenharmony_ci 555cabdff1aSopenharmony_ci register const vector unsigned char perm1 __asm__ ("v19") = 556cabdff1aSopenharmony_ci (const vector unsigned char) 557cabdff1aSopenharmony_ci { 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05, 558cabdff1aSopenharmony_ci 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D }; 559cabdff1aSopenharmony_ci 560cabdff1aSopenharmony_ci register const vector unsigned char perm2 __asm__ ("v20") = 561cabdff1aSopenharmony_ci (const vector unsigned char) 562cabdff1aSopenharmony_ci { 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 563cabdff1aSopenharmony_ci 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B }; 564cabdff1aSopenharmony_ci 565cabdff1aSopenharmony_ci register const vector unsigned char perm3 __asm__ ("v21") = 566cabdff1aSopenharmony_ci (const vector unsigned char) 567cabdff1aSopenharmony_ci { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 568cabdff1aSopenharmony_ci 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 }; 569cabdff1aSopenharmony_ci 570cabdff1aSopenharmony_ci#define ONEITERBUTTERFLY(i, res1, res2) \ 571cabdff1aSopenharmony_ci { \ 572cabdff1aSopenharmony_ci register vector unsigned char srcO __asm__ ("v22") = \ 573cabdff1aSopenharmony_ci unaligned_load(stride * i, src); \ 574cabdff1aSopenharmony_ci register vector unsigned char dstO __asm__ ("v23") = \ 575cabdff1aSopenharmony_ci unaligned_load(stride * i, dst);\ 576cabdff1aSopenharmony_ci \ 577cabdff1aSopenharmony_ci /* Promote the unsigned chars to signed shorts. 
*/ \ 578cabdff1aSopenharmony_ci register vector signed short srcV __asm__ ("v24") = \ 579cabdff1aSopenharmony_ci (vector signed short) VEC_MERGEH((vector signed char) vzero, \ 580cabdff1aSopenharmony_ci (vector signed char) srcO); \ 581cabdff1aSopenharmony_ci register vector signed short dstV __asm__ ("v25") = \ 582cabdff1aSopenharmony_ci (vector signed short) VEC_MERGEH((vector signed char) vzero, \ 583cabdff1aSopenharmony_ci (vector signed char) dstO); \ 584cabdff1aSopenharmony_ci register vector signed short srcW __asm__ ("v26") = \ 585cabdff1aSopenharmony_ci (vector signed short) VEC_MERGEL((vector signed char) vzero, \ 586cabdff1aSopenharmony_ci (vector signed char) srcO); \ 587cabdff1aSopenharmony_ci register vector signed short dstW __asm__ ("v27") = \ 588cabdff1aSopenharmony_ci (vector signed short) VEC_MERGEL((vector signed char) vzero, \ 589cabdff1aSopenharmony_ci (vector signed char) dstO); \ 590cabdff1aSopenharmony_ci \ 591cabdff1aSopenharmony_ci /* subtractions inside the first butterfly */ \ 592cabdff1aSopenharmony_ci register vector signed short but0 __asm__ ("v28") = \ 593cabdff1aSopenharmony_ci vec_sub(srcV, dstV); \ 594cabdff1aSopenharmony_ci register vector signed short but0S __asm__ ("v29") = \ 595cabdff1aSopenharmony_ci vec_sub(srcW, dstW); \ 596cabdff1aSopenharmony_ci register vector signed short op1 __asm__ ("v30") = \ 597cabdff1aSopenharmony_ci vec_perm(but0, but0, perm1); \ 598cabdff1aSopenharmony_ci register vector signed short but1 __asm__ ("v22") = \ 599cabdff1aSopenharmony_ci vec_mladd(but0, vprod1, op1); \ 600cabdff1aSopenharmony_ci register vector signed short op1S __asm__ ("v23") = \ 601cabdff1aSopenharmony_ci vec_perm(but0S, but0S, perm1); \ 602cabdff1aSopenharmony_ci register vector signed short but1S __asm__ ("v24") = \ 603cabdff1aSopenharmony_ci vec_mladd(but0S, vprod1, op1S); \ 604cabdff1aSopenharmony_ci register vector signed short op2 __asm__ ("v25") = \ 605cabdff1aSopenharmony_ci vec_perm(but1, but1, perm2); \ 
606cabdff1aSopenharmony_ci register vector signed short but2 __asm__ ("v26") = \ 607cabdff1aSopenharmony_ci vec_mladd(but1, vprod2, op2); \ 608cabdff1aSopenharmony_ci register vector signed short op2S __asm__ ("v27") = \ 609cabdff1aSopenharmony_ci vec_perm(but1S, but1S, perm2); \ 610cabdff1aSopenharmony_ci register vector signed short but2S __asm__ ("v28") = \ 611cabdff1aSopenharmony_ci vec_mladd(but1S, vprod2, op2S); \ 612cabdff1aSopenharmony_ci register vector signed short op3 __asm__ ("v29") = \ 613cabdff1aSopenharmony_ci vec_perm(but2, but2, perm3); \ 614cabdff1aSopenharmony_ci register vector signed short op3S __asm__ ("v30") = \ 615cabdff1aSopenharmony_ci vec_perm(but2S, but2S, perm3); \ 616cabdff1aSopenharmony_ci res1 = vec_mladd(but2, vprod3, op3); \ 617cabdff1aSopenharmony_ci res2 = vec_mladd(but2S, vprod3, op3S); \ 618cabdff1aSopenharmony_ci } 619cabdff1aSopenharmony_ci 620cabdff1aSopenharmony_ci ONEITERBUTTERFLY(0, temp0, temp0S); 621cabdff1aSopenharmony_ci ONEITERBUTTERFLY(1, temp1, temp1S); 622cabdff1aSopenharmony_ci ONEITERBUTTERFLY(2, temp2, temp2S); 623cabdff1aSopenharmony_ci ONEITERBUTTERFLY(3, temp3, temp3S); 624cabdff1aSopenharmony_ci ONEITERBUTTERFLY(4, temp4, temp4S); 625cabdff1aSopenharmony_ci ONEITERBUTTERFLY(5, temp5, temp5S); 626cabdff1aSopenharmony_ci ONEITERBUTTERFLY(6, temp6, temp6S); 627cabdff1aSopenharmony_ci ONEITERBUTTERFLY(7, temp7, temp7S); 628cabdff1aSopenharmony_ci } 629cabdff1aSopenharmony_ci#undef ONEITERBUTTERFLY 630cabdff1aSopenharmony_ci { 631cabdff1aSopenharmony_ci register vector signed int vsum; 632cabdff1aSopenharmony_ci 633cabdff1aSopenharmony_ci register vector signed short line0 = vec_add(temp0, temp1); 634cabdff1aSopenharmony_ci register vector signed short line1 = vec_sub(temp0, temp1); 635cabdff1aSopenharmony_ci register vector signed short line2 = vec_add(temp2, temp3); 636cabdff1aSopenharmony_ci register vector signed short line3 = vec_sub(temp2, temp3); 637cabdff1aSopenharmony_ci register vector signed short 
line4 = vec_add(temp4, temp5); 638cabdff1aSopenharmony_ci register vector signed short line5 = vec_sub(temp4, temp5); 639cabdff1aSopenharmony_ci register vector signed short line6 = vec_add(temp6, temp7); 640cabdff1aSopenharmony_ci register vector signed short line7 = vec_sub(temp6, temp7); 641cabdff1aSopenharmony_ci 642cabdff1aSopenharmony_ci register vector signed short line0B = vec_add(line0, line2); 643cabdff1aSopenharmony_ci register vector signed short line2B = vec_sub(line0, line2); 644cabdff1aSopenharmony_ci register vector signed short line1B = vec_add(line1, line3); 645cabdff1aSopenharmony_ci register vector signed short line3B = vec_sub(line1, line3); 646cabdff1aSopenharmony_ci register vector signed short line4B = vec_add(line4, line6); 647cabdff1aSopenharmony_ci register vector signed short line6B = vec_sub(line4, line6); 648cabdff1aSopenharmony_ci register vector signed short line5B = vec_add(line5, line7); 649cabdff1aSopenharmony_ci register vector signed short line7B = vec_sub(line5, line7); 650cabdff1aSopenharmony_ci 651cabdff1aSopenharmony_ci register vector signed short line0C = vec_add(line0B, line4B); 652cabdff1aSopenharmony_ci register vector signed short line4C = vec_sub(line0B, line4B); 653cabdff1aSopenharmony_ci register vector signed short line1C = vec_add(line1B, line5B); 654cabdff1aSopenharmony_ci register vector signed short line5C = vec_sub(line1B, line5B); 655cabdff1aSopenharmony_ci register vector signed short line2C = vec_add(line2B, line6B); 656cabdff1aSopenharmony_ci register vector signed short line6C = vec_sub(line2B, line6B); 657cabdff1aSopenharmony_ci register vector signed short line3C = vec_add(line3B, line7B); 658cabdff1aSopenharmony_ci register vector signed short line7C = vec_sub(line3B, line7B); 659cabdff1aSopenharmony_ci 660cabdff1aSopenharmony_ci register vector signed short line0S = vec_add(temp0S, temp1S); 661cabdff1aSopenharmony_ci register vector signed short line1S = vec_sub(temp0S, temp1S); 
662cabdff1aSopenharmony_ci register vector signed short line2S = vec_add(temp2S, temp3S); 663cabdff1aSopenharmony_ci register vector signed short line3S = vec_sub(temp2S, temp3S); 664cabdff1aSopenharmony_ci register vector signed short line4S = vec_add(temp4S, temp5S); 665cabdff1aSopenharmony_ci register vector signed short line5S = vec_sub(temp4S, temp5S); 666cabdff1aSopenharmony_ci register vector signed short line6S = vec_add(temp6S, temp7S); 667cabdff1aSopenharmony_ci register vector signed short line7S = vec_sub(temp6S, temp7S); 668cabdff1aSopenharmony_ci 669cabdff1aSopenharmony_ci register vector signed short line0BS = vec_add(line0S, line2S); 670cabdff1aSopenharmony_ci register vector signed short line2BS = vec_sub(line0S, line2S); 671cabdff1aSopenharmony_ci register vector signed short line1BS = vec_add(line1S, line3S); 672cabdff1aSopenharmony_ci register vector signed short line3BS = vec_sub(line1S, line3S); 673cabdff1aSopenharmony_ci register vector signed short line4BS = vec_add(line4S, line6S); 674cabdff1aSopenharmony_ci register vector signed short line6BS = vec_sub(line4S, line6S); 675cabdff1aSopenharmony_ci register vector signed short line5BS = vec_add(line5S, line7S); 676cabdff1aSopenharmony_ci register vector signed short line7BS = vec_sub(line5S, line7S); 677cabdff1aSopenharmony_ci 678cabdff1aSopenharmony_ci register vector signed short line0CS = vec_add(line0BS, line4BS); 679cabdff1aSopenharmony_ci register vector signed short line4CS = vec_sub(line0BS, line4BS); 680cabdff1aSopenharmony_ci register vector signed short line1CS = vec_add(line1BS, line5BS); 681cabdff1aSopenharmony_ci register vector signed short line5CS = vec_sub(line1BS, line5BS); 682cabdff1aSopenharmony_ci register vector signed short line2CS = vec_add(line2BS, line6BS); 683cabdff1aSopenharmony_ci register vector signed short line6CS = vec_sub(line2BS, line6BS); 684cabdff1aSopenharmony_ci register vector signed short line3CS = vec_add(line3BS, line7BS); 685cabdff1aSopenharmony_ci 
register vector signed short line7CS = vec_sub(line3BS, line7BS); 686cabdff1aSopenharmony_ci 687cabdff1aSopenharmony_ci vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0)); 688cabdff1aSopenharmony_ci vsum = vec_sum4s(vec_abs(line1C), vsum); 689cabdff1aSopenharmony_ci vsum = vec_sum4s(vec_abs(line2C), vsum); 690cabdff1aSopenharmony_ci vsum = vec_sum4s(vec_abs(line3C), vsum); 691cabdff1aSopenharmony_ci vsum = vec_sum4s(vec_abs(line4C), vsum); 692cabdff1aSopenharmony_ci vsum = vec_sum4s(vec_abs(line5C), vsum); 693cabdff1aSopenharmony_ci vsum = vec_sum4s(vec_abs(line6C), vsum); 694cabdff1aSopenharmony_ci vsum = vec_sum4s(vec_abs(line7C), vsum); 695cabdff1aSopenharmony_ci 696cabdff1aSopenharmony_ci vsum = vec_sum4s(vec_abs(line0CS), vsum); 697cabdff1aSopenharmony_ci vsum = vec_sum4s(vec_abs(line1CS), vsum); 698cabdff1aSopenharmony_ci vsum = vec_sum4s(vec_abs(line2CS), vsum); 699cabdff1aSopenharmony_ci vsum = vec_sum4s(vec_abs(line3CS), vsum); 700cabdff1aSopenharmony_ci vsum = vec_sum4s(vec_abs(line4CS), vsum); 701cabdff1aSopenharmony_ci vsum = vec_sum4s(vec_abs(line5CS), vsum); 702cabdff1aSopenharmony_ci vsum = vec_sum4s(vec_abs(line6CS), vsum); 703cabdff1aSopenharmony_ci vsum = vec_sum4s(vec_abs(line7CS), vsum); 704cabdff1aSopenharmony_ci vsum = vec_sums(vsum, (vector signed int) vzero); 705cabdff1aSopenharmony_ci vsum = vec_splat(vsum, 3); 706cabdff1aSopenharmony_ci 707cabdff1aSopenharmony_ci vec_ste(vsum, 0, &sum); 708cabdff1aSopenharmony_ci } 709cabdff1aSopenharmony_ci return sum; 710cabdff1aSopenharmony_ci} 711cabdff1aSopenharmony_ci 712cabdff1aSopenharmony_cistatic int hadamard8_diff16_altivec(MpegEncContext *s, uint8_t *dst, 713cabdff1aSopenharmony_ci uint8_t *src, ptrdiff_t stride, int h) 714cabdff1aSopenharmony_ci{ 715cabdff1aSopenharmony_ci int score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8); 716cabdff1aSopenharmony_ci 717cabdff1aSopenharmony_ci if (h == 16) { 718cabdff1aSopenharmony_ci dst += 8 * stride; 719cabdff1aSopenharmony_ci src += 8 * 
stride; 720cabdff1aSopenharmony_ci score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8); 721cabdff1aSopenharmony_ci } 722cabdff1aSopenharmony_ci return score; 723cabdff1aSopenharmony_ci} 724cabdff1aSopenharmony_ci#endif /* HAVE_ALTIVEC */ 725cabdff1aSopenharmony_ci 726cabdff1aSopenharmony_ciav_cold void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx) 727cabdff1aSopenharmony_ci{ 728cabdff1aSopenharmony_ci#if HAVE_ALTIVEC 729cabdff1aSopenharmony_ci if (!PPC_ALTIVEC(av_get_cpu_flags())) 730cabdff1aSopenharmony_ci return; 731cabdff1aSopenharmony_ci 732cabdff1aSopenharmony_ci c->pix_abs[0][1] = sad16_x2_altivec; 733cabdff1aSopenharmony_ci c->pix_abs[0][2] = sad16_y2_altivec; 734cabdff1aSopenharmony_ci c->pix_abs[0][3] = sad16_xy2_altivec; 735cabdff1aSopenharmony_ci c->pix_abs[0][0] = sad16_altivec; 736cabdff1aSopenharmony_ci c->pix_abs[1][0] = sad8_altivec; 737cabdff1aSopenharmony_ci 738cabdff1aSopenharmony_ci c->sad[0] = sad16_altivec; 739cabdff1aSopenharmony_ci c->sad[1] = sad8_altivec; 740cabdff1aSopenharmony_ci c->sse[0] = sse16_altivec; 741cabdff1aSopenharmony_ci c->sse[1] = sse8_altivec; 742cabdff1aSopenharmony_ci 743cabdff1aSopenharmony_ci c->hadamard8_diff[0] = hadamard8_diff16_altivec; 744cabdff1aSopenharmony_ci c->hadamard8_diff[1] = hadamard8_diff8x8_altivec; 745cabdff1aSopenharmony_ci#endif /* HAVE_ALTIVEC */ 746cabdff1aSopenharmony_ci} 747