1cabdff1aSopenharmony_ci/* 2cabdff1aSopenharmony_ci * AltiVec-enhanced yuv2yuvX 3cabdff1aSopenharmony_ci * 4cabdff1aSopenharmony_ci * Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org> 5cabdff1aSopenharmony_ci * based on the equivalent C code in swscale.c 6cabdff1aSopenharmony_ci * 7cabdff1aSopenharmony_ci * This file is part of FFmpeg. 8cabdff1aSopenharmony_ci * 9cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 10cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 11cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 12cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 13cabdff1aSopenharmony_ci * 14cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 15cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 16cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17cabdff1aSopenharmony_ci * Lesser General Public License for more details. 18cabdff1aSopenharmony_ci * 19cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 20cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 21cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 22cabdff1aSopenharmony_ci */ 23cabdff1aSopenharmony_ci 24cabdff1aSopenharmony_ci#include <inttypes.h> 25cabdff1aSopenharmony_ci 26cabdff1aSopenharmony_ci#include "config.h" 27cabdff1aSopenharmony_ci#include "libswscale/swscale.h" 28cabdff1aSopenharmony_ci#include "libswscale/swscale_internal.h" 29cabdff1aSopenharmony_ci#include "libavutil/attributes.h" 30cabdff1aSopenharmony_ci#include "libavutil/cpu.h" 31cabdff1aSopenharmony_ci#include "yuv2rgb_altivec.h" 32cabdff1aSopenharmony_ci#include "libavutil/ppc/util_altivec.h" 33cabdff1aSopenharmony_ci 34cabdff1aSopenharmony_ci#if HAVE_ALTIVEC 35cabdff1aSopenharmony_ci#if HAVE_BIGENDIAN 36cabdff1aSopenharmony_ci#define vzero vec_splat_s32(0) 37cabdff1aSopenharmony_ci 38cabdff1aSopenharmony_ci#define GET_LS(a,b,c,s) {\ 39cabdff1aSopenharmony_ci vector signed short l2 = vec_ld(((b) << 1) + 16, s);\ 40cabdff1aSopenharmony_ci ls = vec_perm(a, l2, c);\ 41cabdff1aSopenharmony_ci a = l2;\ 42cabdff1aSopenharmony_ci } 43cabdff1aSopenharmony_ci 44cabdff1aSopenharmony_ci#define yuv2planeX_8(d1, d2, l1, src, x, perm, filter) do {\ 45cabdff1aSopenharmony_ci vector signed short ls;\ 46cabdff1aSopenharmony_ci vector signed int vf1, vf2, i1, i2;\ 47cabdff1aSopenharmony_ci GET_LS(l1, x, perm, src);\ 48cabdff1aSopenharmony_ci i1 = vec_mule(filter, ls);\ 49cabdff1aSopenharmony_ci i2 = vec_mulo(filter, ls);\ 50cabdff1aSopenharmony_ci vf1 = vec_mergeh(i1, i2);\ 51cabdff1aSopenharmony_ci vf2 = vec_mergel(i1, i2);\ 52cabdff1aSopenharmony_ci d1 = vec_add(d1, vf1);\ 53cabdff1aSopenharmony_ci d2 = vec_add(d2, vf2);\ 54cabdff1aSopenharmony_ci } while (0) 55cabdff1aSopenharmony_ci 56cabdff1aSopenharmony_ci#define LOAD_FILTER(vf,f) {\ 57cabdff1aSopenharmony_ci vector unsigned char perm0 = vec_lvsl(joffset, f);\ 58cabdff1aSopenharmony_ci vf = vec_ld(joffset, f);\ 59cabdff1aSopenharmony_ci vf = vec_perm(vf, vf, perm0);\ 60cabdff1aSopenharmony_ci} 61cabdff1aSopenharmony_ci#define LOAD_L1(ll1,s,p){\ 62cabdff1aSopenharmony_ci p = vec_lvsl(xoffset, s);\ 63cabdff1aSopenharmony_ci ll1 = vec_ld(xoffset, s);\ 64cabdff1aSopenharmony_ci} 65cabdff1aSopenharmony_ci 66cabdff1aSopenharmony_ci// The 3 above is 2 (filterSize == 4) + 1 (sizeof(short) == 2). 67cabdff1aSopenharmony_ci 68cabdff1aSopenharmony_ci// The neat trick: We only care for half the elements, 69cabdff1aSopenharmony_ci// high or low depending on (i<<3)%16 (it's 0 or 8 here), 70cabdff1aSopenharmony_ci// and we're going to use vec_mule, so we choose 71cabdff1aSopenharmony_ci// carefully how to "unpack" the elements into the even slots. 72cabdff1aSopenharmony_ci#define GET_VF4(a, vf, f) {\ 73cabdff1aSopenharmony_ci vf = vec_ld(a<< 3, f);\ 74cabdff1aSopenharmony_ci if ((a << 3) % 16)\ 75cabdff1aSopenharmony_ci vf = vec_mergel(vf, (vector signed short)vzero);\ 76cabdff1aSopenharmony_ci else\ 77cabdff1aSopenharmony_ci vf = vec_mergeh(vf, (vector signed short)vzero);\ 78cabdff1aSopenharmony_ci} 79cabdff1aSopenharmony_ci#define FIRST_LOAD(sv, pos, s, per) {\ 80cabdff1aSopenharmony_ci sv = vec_ld(pos, s);\ 81cabdff1aSopenharmony_ci per = vec_lvsl(pos, s);\ 82cabdff1aSopenharmony_ci} 83cabdff1aSopenharmony_ci#define UPDATE_PTR(s0, d0, s1, d1) {\ 84cabdff1aSopenharmony_ci d0 = s0;\ 85cabdff1aSopenharmony_ci d1 = s1;\ 86cabdff1aSopenharmony_ci} 87cabdff1aSopenharmony_ci#define LOAD_SRCV(pos, a, s, per, v0, v1, vf) {\ 88cabdff1aSopenharmony_ci v1 = vec_ld(pos + a + 16, s);\ 89cabdff1aSopenharmony_ci vf = vec_perm(v0, v1, per);\ 90cabdff1aSopenharmony_ci} 91cabdff1aSopenharmony_ci#define LOAD_SRCV8(pos, a, s, per, v0, v1, vf) {\ 92cabdff1aSopenharmony_ci if ((((uintptr_t)s + pos) % 16) > 8) {\ 93cabdff1aSopenharmony_ci v1 = vec_ld(pos + a + 16, s);\ 94cabdff1aSopenharmony_ci }\ 95cabdff1aSopenharmony_ci vf = vec_perm(v0, src_v1, per);\ 96cabdff1aSopenharmony_ci} 97cabdff1aSopenharmony_ci#define GET_VFD(a, b, f, vf0, vf1, per, vf, off) {\ 98cabdff1aSopenharmony_ci vf1 = vec_ld((a * 2 * filterSize) + (b * 2) + 16 + off, f);\ 99cabdff1aSopenharmony_ci vf = vec_perm(vf0, vf1, per);\ 100cabdff1aSopenharmony_ci} 101cabdff1aSopenharmony_ci 102cabdff1aSopenharmony_ci#define FUNC(name) name ## _altivec 103cabdff1aSopenharmony_ci#include "swscale_ppc_template.c" 104cabdff1aSopenharmony_ci#undef FUNC 105cabdff1aSopenharmony_ci 106cabdff1aSopenharmony_ci#undef vzero 107cabdff1aSopenharmony_ci 108cabdff1aSopenharmony_ci#endif /* HAVE_BIGENDIAN */ 109cabdff1aSopenharmony_ci 110cabdff1aSopenharmony_ci#define output_pixel(pos, val, bias, signedness) \ 111cabdff1aSopenharmony_ci if (big_endian) { \ 112cabdff1aSopenharmony_ci AV_WB16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \ 113cabdff1aSopenharmony_ci } else { \ 114cabdff1aSopenharmony_ci AV_WL16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \ 115cabdff1aSopenharmony_ci } 116cabdff1aSopenharmony_ci 117cabdff1aSopenharmony_cistatic void 118cabdff1aSopenharmony_ciyuv2plane1_float_u(const int32_t *src, float *dest, int dstW, int start) 119cabdff1aSopenharmony_ci{ 120cabdff1aSopenharmony_ci static const int big_endian = HAVE_BIGENDIAN; 121cabdff1aSopenharmony_ci static const int shift = 3; 122cabdff1aSopenharmony_ci static const float float_mult = 1.0f / 65535.0f; 123cabdff1aSopenharmony_ci int i, val; 124cabdff1aSopenharmony_ci uint16_t val_uint; 125cabdff1aSopenharmony_ci 126cabdff1aSopenharmony_ci for (i = start; i < dstW; ++i){ 127cabdff1aSopenharmony_ci val = src[i] + (1 << (shift - 1)); 128cabdff1aSopenharmony_ci output_pixel(&val_uint, val, 0, uint); 129cabdff1aSopenharmony_ci dest[i] = float_mult * (float)val_uint; 130cabdff1aSopenharmony_ci } 131cabdff1aSopenharmony_ci} 132cabdff1aSopenharmony_ci 133cabdff1aSopenharmony_cistatic void 134cabdff1aSopenharmony_ciyuv2plane1_float_bswap_u(const int32_t *src, uint32_t *dest, int dstW, int start) 135cabdff1aSopenharmony_ci{ 136cabdff1aSopenharmony_ci static const int big_endian = HAVE_BIGENDIAN; 137cabdff1aSopenharmony_ci static const int shift = 3; 138cabdff1aSopenharmony_ci static const float float_mult = 1.0f / 65535.0f; 139cabdff1aSopenharmony_ci int i, val; 140cabdff1aSopenharmony_ci uint16_t val_uint; 141cabdff1aSopenharmony_ci 142cabdff1aSopenharmony_ci for (i = start; i < dstW; ++i){ 143cabdff1aSopenharmony_ci val = src[i] + (1 << (shift - 1)); 144cabdff1aSopenharmony_ci output_pixel(&val_uint, val, 0, uint); 145cabdff1aSopenharmony_ci dest[i] = av_bswap32(av_float2int(float_mult * (float)val_uint)); 146cabdff1aSopenharmony_ci } 147cabdff1aSopenharmony_ci} 148cabdff1aSopenharmony_ci 149cabdff1aSopenharmony_cistatic void yuv2plane1_float_altivec(const int32_t *src, float *dest, int dstW) 150cabdff1aSopenharmony_ci{ 151cabdff1aSopenharmony_ci const int dst_u = -(uintptr_t)dest & 3; 152cabdff1aSopenharmony_ci const int shift = 3; 153cabdff1aSopenharmony_ci const int add = (1 << (shift - 1)); 154cabdff1aSopenharmony_ci const int clip = (1 << 16) - 1; 155cabdff1aSopenharmony_ci const float fmult = 1.0f / 65535.0f; 156cabdff1aSopenharmony_ci const vec_u32 vadd = (vec_u32) {add, add, add, add}; 157cabdff1aSopenharmony_ci const vec_u32 vshift = (vec_u32) vec_splat_u32(shift); 158cabdff1aSopenharmony_ci const vec_u32 vlargest = (vec_u32) {clip, clip, clip, clip}; 159cabdff1aSopenharmony_ci const vec_f vmul = (vec_f) {fmult, fmult, fmult, fmult}; 160cabdff1aSopenharmony_ci const vec_f vzero = (vec_f) {0, 0, 0, 0}; 161cabdff1aSopenharmony_ci vec_u32 v; 162cabdff1aSopenharmony_ci vec_f vd; 163cabdff1aSopenharmony_ci int i; 164cabdff1aSopenharmony_ci 165cabdff1aSopenharmony_ci yuv2plane1_float_u(src, dest, dst_u, 0); 166cabdff1aSopenharmony_ci 167cabdff1aSopenharmony_ci for (i = dst_u; i < dstW - 3; i += 4) { 168cabdff1aSopenharmony_ci v = vec_ld(0, (const uint32_t *) &src[i]); 169cabdff1aSopenharmony_ci v = vec_add(v, vadd); 170cabdff1aSopenharmony_ci v = vec_sr(v, vshift); 171cabdff1aSopenharmony_ci v = vec_min(v, vlargest); 172cabdff1aSopenharmony_ci 173cabdff1aSopenharmony_ci vd = vec_ctf(v, 0); 174cabdff1aSopenharmony_ci vd = vec_madd(vd, vmul, vzero); 175cabdff1aSopenharmony_ci 176cabdff1aSopenharmony_ci vec_st(vd, 0, &dest[i]); 177cabdff1aSopenharmony_ci } 178cabdff1aSopenharmony_ci 179cabdff1aSopenharmony_ci yuv2plane1_float_u(src, dest, dstW, i); 180cabdff1aSopenharmony_ci} 181cabdff1aSopenharmony_ci 182cabdff1aSopenharmony_cistatic void yuv2plane1_float_bswap_altivec(const int32_t *src, uint32_t *dest, int dstW) 183cabdff1aSopenharmony_ci{ 184cabdff1aSopenharmony_ci const int dst_u = -(uintptr_t)dest & 3; 185cabdff1aSopenharmony_ci const int shift = 3; 186cabdff1aSopenharmony_ci const int add = (1 << (shift - 1)); 187cabdff1aSopenharmony_ci const int clip = (1 << 16) - 1; 188cabdff1aSopenharmony_ci const float fmult = 1.0f / 65535.0f; 189cabdff1aSopenharmony_ci const vec_u32 vadd = (vec_u32) {add, add, add, add}; 190cabdff1aSopenharmony_ci const vec_u32 vshift = (vec_u32) vec_splat_u32(shift); 191cabdff1aSopenharmony_ci const vec_u32 vlargest = (vec_u32) {clip, clip, clip, clip}; 192cabdff1aSopenharmony_ci const vec_f vmul = (vec_f) {fmult, fmult, fmult, fmult}; 193cabdff1aSopenharmony_ci const vec_f vzero = (vec_f) {0, 0, 0, 0}; 194cabdff1aSopenharmony_ci const vec_u32 vswapbig = (vec_u32) {16, 16, 16, 16}; 195cabdff1aSopenharmony_ci const vec_u16 vswapsmall = vec_splat_u16(8); 196cabdff1aSopenharmony_ci vec_u32 v; 197cabdff1aSopenharmony_ci vec_f vd; 198cabdff1aSopenharmony_ci int i; 199cabdff1aSopenharmony_ci 200cabdff1aSopenharmony_ci yuv2plane1_float_bswap_u(src, dest, dst_u, 0); 201cabdff1aSopenharmony_ci 202cabdff1aSopenharmony_ci for (i = dst_u; i < dstW - 3; i += 4) { 203cabdff1aSopenharmony_ci v = vec_ld(0, (const uint32_t *) &src[i]); 204cabdff1aSopenharmony_ci v = vec_add(v, vadd); 205cabdff1aSopenharmony_ci v = vec_sr(v, vshift); 206cabdff1aSopenharmony_ci v = vec_min(v, vlargest); 207cabdff1aSopenharmony_ci 208cabdff1aSopenharmony_ci vd = vec_ctf(v, 0); 209cabdff1aSopenharmony_ci vd = vec_madd(vd, vmul, vzero); 210cabdff1aSopenharmony_ci 211cabdff1aSopenharmony_ci vd = (vec_f) vec_rl((vec_u32) vd, vswapbig); 212cabdff1aSopenharmony_ci vd = (vec_f) vec_rl((vec_u16) vd, vswapsmall); 213cabdff1aSopenharmony_ci 214cabdff1aSopenharmony_ci vec_st(vd, 0, (float *) &dest[i]); 215cabdff1aSopenharmony_ci } 216cabdff1aSopenharmony_ci 217cabdff1aSopenharmony_ci yuv2plane1_float_bswap_u(src, dest, dstW, i); 218cabdff1aSopenharmony_ci} 219cabdff1aSopenharmony_ci 220cabdff1aSopenharmony_ci#define yuv2plane1_float(template, dest_type, BE_LE) \ 221cabdff1aSopenharmony_cistatic void yuv2plane1_float ## BE_LE ## _altivec(const int16_t *src, uint8_t *dest, \ 222cabdff1aSopenharmony_ci int dstW, \ 223cabdff1aSopenharmony_ci const uint8_t *dither, int offset) \ 224cabdff1aSopenharmony_ci{ \ 225cabdff1aSopenharmony_ci template((const int32_t *)src, (dest_type *)dest, dstW); \ 226cabdff1aSopenharmony_ci} 227cabdff1aSopenharmony_ci 228cabdff1aSopenharmony_ci#if HAVE_BIGENDIAN 229cabdff1aSopenharmony_ciyuv2plane1_float(yuv2plane1_float_altivec, float, BE) 230cabdff1aSopenharmony_ciyuv2plane1_float(yuv2plane1_float_bswap_altivec, uint32_t, LE) 231cabdff1aSopenharmony_ci#else 232cabdff1aSopenharmony_ciyuv2plane1_float(yuv2plane1_float_altivec, float, LE) 233cabdff1aSopenharmony_ciyuv2plane1_float(yuv2plane1_float_bswap_altivec, uint32_t, BE) 234cabdff1aSopenharmony_ci#endif 235cabdff1aSopenharmony_ci 236cabdff1aSopenharmony_ci#endif /* HAVE_ALTIVEC */ 237cabdff1aSopenharmony_ci 238cabdff1aSopenharmony_ciav_cold void ff_sws_init_swscale_ppc(SwsContext *c) 239cabdff1aSopenharmony_ci{ 240cabdff1aSopenharmony_ci#if HAVE_ALTIVEC 241cabdff1aSopenharmony_ci enum AVPixelFormat dstFormat = c->dstFormat; 242cabdff1aSopenharmony_ci 243cabdff1aSopenharmony_ci if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC)) 244cabdff1aSopenharmony_ci return; 245cabdff1aSopenharmony_ci 246cabdff1aSopenharmony_ci#if HAVE_BIGENDIAN 247cabdff1aSopenharmony_ci if (c->srcBpc == 8 && c->dstBpc <= 14) { 248cabdff1aSopenharmony_ci c->hyScale = c->hcScale = hScale_real_altivec; 249cabdff1aSopenharmony_ci } 250cabdff1aSopenharmony_ci if (!is16BPS(dstFormat) && !isNBPS(dstFormat) && !isSemiPlanarYUV(dstFormat) && 251cabdff1aSopenharmony_ci dstFormat != AV_PIX_FMT_GRAYF32BE && dstFormat != AV_PIX_FMT_GRAYF32LE && 252cabdff1aSopenharmony_ci !c->needAlpha) { 253cabdff1aSopenharmony_ci c->yuv2planeX = yuv2planeX_altivec; 254cabdff1aSopenharmony_ci } 255cabdff1aSopenharmony_ci#endif 256cabdff1aSopenharmony_ci 257cabdff1aSopenharmony_ci if (dstFormat == AV_PIX_FMT_GRAYF32BE) { 258cabdff1aSopenharmony_ci c->yuv2plane1 = yuv2plane1_floatBE_altivec; 259cabdff1aSopenharmony_ci } else if (dstFormat == AV_PIX_FMT_GRAYF32LE) { 260cabdff1aSopenharmony_ci c->yuv2plane1 = yuv2plane1_floatLE_altivec; 261cabdff1aSopenharmony_ci } 262cabdff1aSopenharmony_ci 263cabdff1aSopenharmony_ci /* The following list of supported dstFormat values should 264cabdff1aSopenharmony_ci * match what's found in the body of ff_yuv2packedX_altivec() */ 265cabdff1aSopenharmony_ci if (!(c->flags & (SWS_BITEXACT | SWS_FULL_CHR_H_INT)) && !c->needAlpha) { 266cabdff1aSopenharmony_ci switch (c->dstFormat) { 267cabdff1aSopenharmony_ci case AV_PIX_FMT_ABGR: 268cabdff1aSopenharmony_ci c->yuv2packedX = ff_yuv2abgr_X_altivec; 269cabdff1aSopenharmony_ci break; 270cabdff1aSopenharmony_ci case AV_PIX_FMT_BGRA: 271cabdff1aSopenharmony_ci c->yuv2packedX = ff_yuv2bgra_X_altivec; 272cabdff1aSopenharmony_ci break; 273cabdff1aSopenharmony_ci case AV_PIX_FMT_ARGB: 274cabdff1aSopenharmony_ci c->yuv2packedX = ff_yuv2argb_X_altivec; 275cabdff1aSopenharmony_ci break; 276cabdff1aSopenharmony_ci case AV_PIX_FMT_RGBA: 277cabdff1aSopenharmony_ci c->yuv2packedX = ff_yuv2rgba_X_altivec; 278cabdff1aSopenharmony_ci break; 279cabdff1aSopenharmony_ci case AV_PIX_FMT_BGR24: 280cabdff1aSopenharmony_ci c->yuv2packedX = ff_yuv2bgr24_X_altivec; 281cabdff1aSopenharmony_ci break; 282cabdff1aSopenharmony_ci case AV_PIX_FMT_RGB24: 283cabdff1aSopenharmony_ci c->yuv2packedX = ff_yuv2rgb24_X_altivec; 284cabdff1aSopenharmony_ci break; 285cabdff1aSopenharmony_ci } 286cabdff1aSopenharmony_ci } 287cabdff1aSopenharmony_ci#endif /* HAVE_ALTIVEC */ 288cabdff1aSopenharmony_ci 289cabdff1aSopenharmony_ci ff_sws_init_swscale_vsx(c); 290cabdff1aSopenharmony_ci} 291