/*
 * AltiVec acceleration for colorspace conversion
 *
 * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
 * Convert I420 YV12 to RGB in various formats,
 * it rejects images that are not in 420 formats,
 * it rejects images that don't have widths of multiples of 16,
 * it rejects images that don't have heights of multiples of 2.
 * Reject defers to C simulation code.
 *
 * Lots of optimizations to be done here.
 *
 * 1. Need to fix saturation code. I just couldn't get it to fly with packs
 *    and adds, so we currently use max/min to clip.
 *
 * 2. The inefficient use of chroma loading needs a bit of brushing up.
 *
 * 3. Analysis of pipeline stalls needs to be done. Use shark to identify
 *    pipeline stalls.
 *
 *
 * MODIFIED to calculate coeffs from currently selected color space.
 * MODIFIED core to be a macro where you specify the output format.
 * ADDED UYVY conversion which is never called due to something in swscale.
 * CORRECTED algorithm selection to be strict on input formats.
 * ADDED runtime detection of AltiVec.
 *
 * ADDED altivec_yuv2packedX vertical scl + RGB converter
 *
 * March 27,2004
 * PERFORMANCE ANALYSIS
 *
 * The C version uses 25% of the processor or ~250Mips for D1 video rawvideo
 * used as test.
 * The AltiVec version uses 10% of the processor or ~100Mips for D1 video
 * same sequence.
 *
 * 720 * 480 * 30  ~10MPS
 *
 * so we have roughly 10 clocks per pixel. This is too high, something has
 * to be wrong.
 *
 * OPTIMIZED clip codes to utilize vec_max and vec_packs removing the
 * need for vec_min.
 *
 * OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to
 * have the input video frame, it was just decompressed so it probably resides
 * in L1 caches. However, we are creating the output video stream. This needs
 * to use the DSTST instruction to optimize for the cache. We couple this with
 * the fact that we are not going to be visiting the input buffer again so we
 * mark it Least Recently Used. This shaves 25% of the processor cycles off.
 *
 * Now memcpy is the largest mips consumer in the system, probably due
 * to the inefficient X11 stuff.
 *
 * GL libraries seem to be very slow on this machine 1.33Ghz PB running
 * Jaguar, this is not the case for my 1Ghz PB. I thought it might be
 * a versioning issue, however I have libGL.1.2.dylib for both
 * machines. (We need to figure this out now.)
 *
 * GL2 libraries work now with patch for RGB32.
 *
 * NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor.
 *
 * Integrated luma prescaling adjustment for saturation/contrast/brightness
 * adjustment.
86cabdff1aSopenharmony_ci */ 87cabdff1aSopenharmony_ci 88cabdff1aSopenharmony_ci#include <stdio.h> 89cabdff1aSopenharmony_ci#include <stdlib.h> 90cabdff1aSopenharmony_ci#include <string.h> 91cabdff1aSopenharmony_ci#include <inttypes.h> 92cabdff1aSopenharmony_ci 93cabdff1aSopenharmony_ci#include "config.h" 94cabdff1aSopenharmony_ci#include "libswscale/rgb2rgb.h" 95cabdff1aSopenharmony_ci#include "libswscale/swscale.h" 96cabdff1aSopenharmony_ci#include "libswscale/swscale_internal.h" 97cabdff1aSopenharmony_ci#include "libavutil/attributes.h" 98cabdff1aSopenharmony_ci#include "libavutil/cpu.h" 99cabdff1aSopenharmony_ci#include "libavutil/mem_internal.h" 100cabdff1aSopenharmony_ci#include "libavutil/pixdesc.h" 101cabdff1aSopenharmony_ci#include "yuv2rgb_altivec.h" 102cabdff1aSopenharmony_ci 103cabdff1aSopenharmony_ci#if HAVE_ALTIVEC 104cabdff1aSopenharmony_ci 105cabdff1aSopenharmony_ci#undef PROFILE_THE_BEAST 106cabdff1aSopenharmony_ci#undef INC_SCALING 107cabdff1aSopenharmony_ci 108cabdff1aSopenharmony_citypedef unsigned char ubyte; 109cabdff1aSopenharmony_citypedef signed char sbyte; 110cabdff1aSopenharmony_ci 111cabdff1aSopenharmony_ci/* RGB interleaver, 16 planar pels 8-bit samples per channel in 112cabdff1aSopenharmony_ci * homogeneous vector registers x0,x1,x2 are interleaved with the 113cabdff1aSopenharmony_ci * following technique: 114cabdff1aSopenharmony_ci * 115cabdff1aSopenharmony_ci * o0 = vec_mergeh(x0, x1); 116cabdff1aSopenharmony_ci * o1 = vec_perm(o0, x2, perm_rgb_0); 117cabdff1aSopenharmony_ci * o2 = vec_perm(o0, x2, perm_rgb_1); 118cabdff1aSopenharmony_ci * o3 = vec_mergel(x0, x1); 119cabdff1aSopenharmony_ci * o4 = vec_perm(o3, o2, perm_rgb_2); 120cabdff1aSopenharmony_ci * o5 = vec_perm(o3, o2, perm_rgb_3); 121cabdff1aSopenharmony_ci * 122cabdff1aSopenharmony_ci * perm_rgb_0: o0(RG).h v1(B) --> o1* 123cabdff1aSopenharmony_ci * 0 1 2 3 4 124cabdff1aSopenharmony_ci * rgbr|gbrg|brgb|rgbr 125cabdff1aSopenharmony_ci * 0010 0100 1001 0010 
126cabdff1aSopenharmony_ci * 0102 3145 2673 894A 127cabdff1aSopenharmony_ci * 128cabdff1aSopenharmony_ci * perm_rgb_1: o0(RG).h v1(B) --> o2 129cabdff1aSopenharmony_ci * 0 1 2 3 4 130cabdff1aSopenharmony_ci * gbrg|brgb|bbbb|bbbb 131cabdff1aSopenharmony_ci * 0100 1001 1111 1111 132cabdff1aSopenharmony_ci * B5CD 6EF7 89AB CDEF 133cabdff1aSopenharmony_ci * 134cabdff1aSopenharmony_ci * perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4* 135cabdff1aSopenharmony_ci * 0 1 2 3 4 136cabdff1aSopenharmony_ci * gbrg|brgb|rgbr|gbrg 137cabdff1aSopenharmony_ci * 1111 1111 0010 0100 138cabdff1aSopenharmony_ci * 89AB CDEF 0182 3945 139cabdff1aSopenharmony_ci * 140cabdff1aSopenharmony_ci * perm_rgb_2: o3(RG).l o2(rgbB.l) ---> o5* 141cabdff1aSopenharmony_ci * 0 1 2 3 4 142cabdff1aSopenharmony_ci * brgb|rgbr|gbrg|brgb 143cabdff1aSopenharmony_ci * 1001 0010 0100 1001 144cabdff1aSopenharmony_ci * a67b 89cA BdCD eEFf 145cabdff1aSopenharmony_ci */ 146cabdff1aSopenharmony_cistatic const vector unsigned char 147cabdff1aSopenharmony_ci perm_rgb_0 = { 0x00, 0x01, 0x10, 0x02, 0x03, 0x11, 0x04, 0x05, 148cabdff1aSopenharmony_ci 0x12, 0x06, 0x07, 0x13, 0x08, 0x09, 0x14, 0x0a }, 149cabdff1aSopenharmony_ci perm_rgb_1 = { 0x0b, 0x15, 0x0c, 0x0d, 0x16, 0x0e, 0x0f, 0x17, 150cabdff1aSopenharmony_ci 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f }, 151cabdff1aSopenharmony_ci perm_rgb_2 = { 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 152cabdff1aSopenharmony_ci 0x00, 0x01, 0x18, 0x02, 0x03, 0x19, 0x04, 0x05 }, 153cabdff1aSopenharmony_ci perm_rgb_3 = { 0x1a, 0x06, 0x07, 0x1b, 0x08, 0x09, 0x1c, 0x0a, 154cabdff1aSopenharmony_ci 0x0b, 0x1d, 0x0c, 0x0d, 0x1e, 0x0e, 0x0f, 0x1f }; 155cabdff1aSopenharmony_ci 156cabdff1aSopenharmony_ci#define vec_merge3(x2, x1, x0, y0, y1, y2) \ 157cabdff1aSopenharmony_ci do { \ 158cabdff1aSopenharmony_ci __typeof__(x0) o0, o2, o3; \ 159cabdff1aSopenharmony_ci o0 = vec_mergeh(x0, x1); \ 160cabdff1aSopenharmony_ci y0 = vec_perm(o0, x2, perm_rgb_0); \ 161cabdff1aSopenharmony_ci o2 = 
vec_perm(o0, x2, perm_rgb_1); \ 162cabdff1aSopenharmony_ci o3 = vec_mergel(x0, x1); \ 163cabdff1aSopenharmony_ci y1 = vec_perm(o3, o2, perm_rgb_2); \ 164cabdff1aSopenharmony_ci y2 = vec_perm(o3, o2, perm_rgb_3); \ 165cabdff1aSopenharmony_ci } while (0) 166cabdff1aSopenharmony_ci 167cabdff1aSopenharmony_ci#define vec_mstbgr24(x0, x1, x2, ptr) \ 168cabdff1aSopenharmony_ci do { \ 169cabdff1aSopenharmony_ci __typeof__(x0) _0, _1, _2; \ 170cabdff1aSopenharmony_ci vec_merge3(x0, x1, x2, _0, _1, _2); \ 171cabdff1aSopenharmony_ci vec_st(_0, 0, ptr++); \ 172cabdff1aSopenharmony_ci vec_st(_1, 0, ptr++); \ 173cabdff1aSopenharmony_ci vec_st(_2, 0, ptr++); \ 174cabdff1aSopenharmony_ci } while (0) 175cabdff1aSopenharmony_ci 176cabdff1aSopenharmony_ci#define vec_mstrgb24(x0, x1, x2, ptr) \ 177cabdff1aSopenharmony_ci do { \ 178cabdff1aSopenharmony_ci __typeof__(x0) _0, _1, _2; \ 179cabdff1aSopenharmony_ci vec_merge3(x2, x1, x0, _0, _1, _2); \ 180cabdff1aSopenharmony_ci vec_st(_0, 0, ptr++); \ 181cabdff1aSopenharmony_ci vec_st(_1, 0, ptr++); \ 182cabdff1aSopenharmony_ci vec_st(_2, 0, ptr++); \ 183cabdff1aSopenharmony_ci } while (0) 184cabdff1aSopenharmony_ci 185cabdff1aSopenharmony_ci/* pack the pixels in rgb0 format 186cabdff1aSopenharmony_ci * msb R 187cabdff1aSopenharmony_ci * lsb 0 188cabdff1aSopenharmony_ci */ 189cabdff1aSopenharmony_ci#define vec_mstrgb32(T, x0, x1, x2, x3, ptr) \ 190cabdff1aSopenharmony_ci do { \ 191cabdff1aSopenharmony_ci T _0, _1, _2, _3; \ 192cabdff1aSopenharmony_ci _0 = vec_mergeh(x0, x1); \ 193cabdff1aSopenharmony_ci _1 = vec_mergeh(x2, x3); \ 194cabdff1aSopenharmony_ci _2 = (T) vec_mergeh((vector unsigned short) _0, \ 195cabdff1aSopenharmony_ci (vector unsigned short) _1); \ 196cabdff1aSopenharmony_ci _3 = (T) vec_mergel((vector unsigned short) _0, \ 197cabdff1aSopenharmony_ci (vector unsigned short) _1); \ 198cabdff1aSopenharmony_ci vec_st(_2, 0 * 16, (T *) ptr); \ 199cabdff1aSopenharmony_ci vec_st(_3, 1 * 16, (T *) ptr); \ 200cabdff1aSopenharmony_ci 
_0 = vec_mergel(x0, x1); \ 201cabdff1aSopenharmony_ci _1 = vec_mergel(x2, x3); \ 202cabdff1aSopenharmony_ci _2 = (T) vec_mergeh((vector unsigned short) _0, \ 203cabdff1aSopenharmony_ci (vector unsigned short) _1); \ 204cabdff1aSopenharmony_ci _3 = (T) vec_mergel((vector unsigned short) _0, \ 205cabdff1aSopenharmony_ci (vector unsigned short) _1); \ 206cabdff1aSopenharmony_ci vec_st(_2, 2 * 16, (T *) ptr); \ 207cabdff1aSopenharmony_ci vec_st(_3, 3 * 16, (T *) ptr); \ 208cabdff1aSopenharmony_ci ptr += 4; \ 209cabdff1aSopenharmony_ci } while (0) 210cabdff1aSopenharmony_ci 211cabdff1aSopenharmony_ci/* 212cabdff1aSopenharmony_ci * 1 0 1.4021 | | Y | 213cabdff1aSopenharmony_ci * 1 -0.3441 -0.7142 |x| Cb| 214cabdff1aSopenharmony_ci * 1 1.7718 0 | | Cr| 215cabdff1aSopenharmony_ci * 216cabdff1aSopenharmony_ci * 217cabdff1aSopenharmony_ci * Y: [-128 127] 218cabdff1aSopenharmony_ci * Cb/Cr : [-128 127] 219cabdff1aSopenharmony_ci * 220cabdff1aSopenharmony_ci * typical YUV conversion works on Y: 0-255 this version has been 221cabdff1aSopenharmony_ci * optimized for JPEG decoding. 
222cabdff1aSopenharmony_ci */ 223cabdff1aSopenharmony_ci 224cabdff1aSopenharmony_ci#if HAVE_BIGENDIAN 225cabdff1aSopenharmony_ci#define vec_unh(x) \ 226cabdff1aSopenharmony_ci (vector signed short) \ 227cabdff1aSopenharmony_ci vec_perm(x, (__typeof__(x)) { 0 }, \ 228cabdff1aSopenharmony_ci ((vector unsigned char) { \ 229cabdff1aSopenharmony_ci 0x10, 0x00, 0x10, 0x01, 0x10, 0x02, 0x10, 0x03, \ 230cabdff1aSopenharmony_ci 0x10, 0x04, 0x10, 0x05, 0x10, 0x06, 0x10, 0x07 })) 231cabdff1aSopenharmony_ci 232cabdff1aSopenharmony_ci#define vec_unl(x) \ 233cabdff1aSopenharmony_ci (vector signed short) \ 234cabdff1aSopenharmony_ci vec_perm(x, (__typeof__(x)) { 0 }, \ 235cabdff1aSopenharmony_ci ((vector unsigned char) { \ 236cabdff1aSopenharmony_ci 0x10, 0x08, 0x10, 0x09, 0x10, 0x0A, 0x10, 0x0B, \ 237cabdff1aSopenharmony_ci 0x10, 0x0C, 0x10, 0x0D, 0x10, 0x0E, 0x10, 0x0F })) 238cabdff1aSopenharmony_ci#else 239cabdff1aSopenharmony_ci#define vec_unh(x)(vector signed short) vec_mergeh(x,(__typeof__(x)) { 0 }) 240cabdff1aSopenharmony_ci#define vec_unl(x)(vector signed short) vec_mergel(x,(__typeof__(x)) { 0 }) 241cabdff1aSopenharmony_ci#endif 242cabdff1aSopenharmony_ci 243cabdff1aSopenharmony_ci#define vec_clip_s16(x) \ 244cabdff1aSopenharmony_ci vec_max(vec_min(x, ((vector signed short) { \ 245cabdff1aSopenharmony_ci 235, 235, 235, 235, 235, 235, 235, 235 })), \ 246cabdff1aSopenharmony_ci ((vector signed short) { 16, 16, 16, 16, 16, 16, 16, 16 })) 247cabdff1aSopenharmony_ci 248cabdff1aSopenharmony_ci#define vec_packclp(x, y) \ 249cabdff1aSopenharmony_ci (vector unsigned char) \ 250cabdff1aSopenharmony_ci vec_packs((vector unsigned short) \ 251cabdff1aSopenharmony_ci vec_max(x, ((vector signed short) { 0 })), \ 252cabdff1aSopenharmony_ci (vector unsigned short) \ 253cabdff1aSopenharmony_ci vec_max(y, ((vector signed short) { 0 }))) 254cabdff1aSopenharmony_ci 255cabdff1aSopenharmony_cistatic inline void cvtyuvtoRGB(SwsContext *c, vector signed short Y, 256cabdff1aSopenharmony_ci 
vector signed short U, vector signed short V, 257cabdff1aSopenharmony_ci vector signed short *R, vector signed short *G, 258cabdff1aSopenharmony_ci vector signed short *B) 259cabdff1aSopenharmony_ci{ 260cabdff1aSopenharmony_ci vector signed short vx, ux, uvx; 261cabdff1aSopenharmony_ci 262cabdff1aSopenharmony_ci Y = vec_mradds(Y, c->CY, c->OY); 263cabdff1aSopenharmony_ci U = vec_sub(U, (vector signed short) 264cabdff1aSopenharmony_ci vec_splat((vector signed short) { 128 }, 0)); 265cabdff1aSopenharmony_ci V = vec_sub(V, (vector signed short) 266cabdff1aSopenharmony_ci vec_splat((vector signed short) { 128 }, 0)); 267cabdff1aSopenharmony_ci 268cabdff1aSopenharmony_ci // ux = (CBU * (u << c->CSHIFT) + 0x4000) >> 15; 269cabdff1aSopenharmony_ci ux = vec_sl(U, c->CSHIFT); 270cabdff1aSopenharmony_ci *B = vec_mradds(ux, c->CBU, Y); 271cabdff1aSopenharmony_ci 272cabdff1aSopenharmony_ci // vx = (CRV * (v << c->CSHIFT) + 0x4000) >> 15; 273cabdff1aSopenharmony_ci vx = vec_sl(V, c->CSHIFT); 274cabdff1aSopenharmony_ci *R = vec_mradds(vx, c->CRV, Y); 275cabdff1aSopenharmony_ci 276cabdff1aSopenharmony_ci // uvx = ((CGU * u) + (CGV * v)) >> 15; 277cabdff1aSopenharmony_ci uvx = vec_mradds(U, c->CGU, Y); 278cabdff1aSopenharmony_ci *G = vec_mradds(V, c->CGV, uvx); 279cabdff1aSopenharmony_ci} 280cabdff1aSopenharmony_ci 281cabdff1aSopenharmony_ci/* 282cabdff1aSopenharmony_ci * ------------------------------------------------------------------------------ 283cabdff1aSopenharmony_ci * CS converters 284cabdff1aSopenharmony_ci * ------------------------------------------------------------------------------ 285cabdff1aSopenharmony_ci */ 286cabdff1aSopenharmony_ci 287cabdff1aSopenharmony_ci#if !HAVE_VSX 288cabdff1aSopenharmony_cistatic inline vector unsigned char vec_xl(signed long long offset, const ubyte *addr) 289cabdff1aSopenharmony_ci{ 290cabdff1aSopenharmony_ci const vector unsigned char *v_addr = (const vector unsigned char *) (addr + offset); 291cabdff1aSopenharmony_ci vector 
unsigned char align_perm = vec_lvsl(offset, addr); 292cabdff1aSopenharmony_ci 293cabdff1aSopenharmony_ci return (vector unsigned char) vec_perm(v_addr[0], v_addr[1], align_perm); 294cabdff1aSopenharmony_ci} 295cabdff1aSopenharmony_ci#endif /* !HAVE_VSX */ 296cabdff1aSopenharmony_ci 297cabdff1aSopenharmony_ci#define DEFCSP420_CVT(name, out_pixels) \ 298cabdff1aSopenharmony_cistatic int altivec_ ## name(SwsContext *c, const unsigned char **in, \ 299cabdff1aSopenharmony_ci int *instrides, int srcSliceY, int srcSliceH, \ 300cabdff1aSopenharmony_ci unsigned char **oplanes, int *outstrides) \ 301cabdff1aSopenharmony_ci{ \ 302cabdff1aSopenharmony_ci int w = c->srcW; \ 303cabdff1aSopenharmony_ci int h = srcSliceH; \ 304cabdff1aSopenharmony_ci int i, j; \ 305cabdff1aSopenharmony_ci int instrides_scl[3]; \ 306cabdff1aSopenharmony_ci vector unsigned char y0, y1; \ 307cabdff1aSopenharmony_ci \ 308cabdff1aSopenharmony_ci vector signed char u, v; \ 309cabdff1aSopenharmony_ci \ 310cabdff1aSopenharmony_ci vector signed short Y0, Y1, Y2, Y3; \ 311cabdff1aSopenharmony_ci vector signed short U, V; \ 312cabdff1aSopenharmony_ci vector signed short vx, ux, uvx; \ 313cabdff1aSopenharmony_ci vector signed short vx0, ux0, uvx0; \ 314cabdff1aSopenharmony_ci vector signed short vx1, ux1, uvx1; \ 315cabdff1aSopenharmony_ci vector signed short R0, G0, B0; \ 316cabdff1aSopenharmony_ci vector signed short R1, G1, B1; \ 317cabdff1aSopenharmony_ci vector unsigned char R, G, B; \ 318cabdff1aSopenharmony_ci \ 319cabdff1aSopenharmony_ci vector signed short lCY = c->CY; \ 320cabdff1aSopenharmony_ci vector signed short lOY = c->OY; \ 321cabdff1aSopenharmony_ci vector signed short lCRV = c->CRV; \ 322cabdff1aSopenharmony_ci vector signed short lCBU = c->CBU; \ 323cabdff1aSopenharmony_ci vector signed short lCGU = c->CGU; \ 324cabdff1aSopenharmony_ci vector signed short lCGV = c->CGV; \ 325cabdff1aSopenharmony_ci vector unsigned short lCSHIFT = c->CSHIFT; \ 326cabdff1aSopenharmony_ci \ 
327cabdff1aSopenharmony_ci const ubyte *y1i = in[0]; \ 328cabdff1aSopenharmony_ci const ubyte *y2i = in[0] + instrides[0]; \ 329cabdff1aSopenharmony_ci const ubyte *ui = in[1]; \ 330cabdff1aSopenharmony_ci const ubyte *vi = in[2]; \ 331cabdff1aSopenharmony_ci \ 332cabdff1aSopenharmony_ci vector unsigned char *oute, *outo; \ 333cabdff1aSopenharmony_ci \ 334cabdff1aSopenharmony_ci /* loop moves y{1, 2}i by w */ \ 335cabdff1aSopenharmony_ci instrides_scl[0] = instrides[0] * 2 - w; \ 336cabdff1aSopenharmony_ci /* loop moves ui by w / 2 */ \ 337cabdff1aSopenharmony_ci instrides_scl[1] = instrides[1] - w / 2; \ 338cabdff1aSopenharmony_ci /* loop moves vi by w / 2 */ \ 339cabdff1aSopenharmony_ci instrides_scl[2] = instrides[2] - w / 2; \ 340cabdff1aSopenharmony_ci \ 341cabdff1aSopenharmony_ci for (i = 0; i < h / 2; i++) { \ 342cabdff1aSopenharmony_ci oute = (vector unsigned char *)(oplanes[0] + outstrides[0] * \ 343cabdff1aSopenharmony_ci (srcSliceY + i * 2)); \ 344cabdff1aSopenharmony_ci outo = oute + (outstrides[0] >> 4); \ 345cabdff1aSopenharmony_ci vec_dstst(outo, (0x02000002 | (((w * 3 + 32) / 32) << 16)), 0); \ 346cabdff1aSopenharmony_ci vec_dstst(oute, (0x02000002 | (((w * 3 + 32) / 32) << 16)), 1); \ 347cabdff1aSopenharmony_ci \ 348cabdff1aSopenharmony_ci for (j = 0; j < w / 16; j++) { \ 349cabdff1aSopenharmony_ci y0 = vec_xl(0, y1i); \ 350cabdff1aSopenharmony_ci \ 351cabdff1aSopenharmony_ci y1 = vec_xl(0, y2i); \ 352cabdff1aSopenharmony_ci \ 353cabdff1aSopenharmony_ci u = (vector signed char) vec_xl(0, ui); \ 354cabdff1aSopenharmony_ci \ 355cabdff1aSopenharmony_ci v = (vector signed char) vec_xl(0, vi); \ 356cabdff1aSopenharmony_ci \ 357cabdff1aSopenharmony_ci u = (vector signed char) \ 358cabdff1aSopenharmony_ci vec_sub(u, \ 359cabdff1aSopenharmony_ci (vector signed char) \ 360cabdff1aSopenharmony_ci vec_splat((vector signed char) { 128 }, 0)); \ 361cabdff1aSopenharmony_ci v = (vector signed char) \ 362cabdff1aSopenharmony_ci vec_sub(v, \ 
363cabdff1aSopenharmony_ci (vector signed char) \ 364cabdff1aSopenharmony_ci vec_splat((vector signed char) { 128 }, 0)); \ 365cabdff1aSopenharmony_ci \ 366cabdff1aSopenharmony_ci U = vec_unpackh(u); \ 367cabdff1aSopenharmony_ci V = vec_unpackh(v); \ 368cabdff1aSopenharmony_ci \ 369cabdff1aSopenharmony_ci Y0 = vec_unh(y0); \ 370cabdff1aSopenharmony_ci Y1 = vec_unl(y0); \ 371cabdff1aSopenharmony_ci Y2 = vec_unh(y1); \ 372cabdff1aSopenharmony_ci Y3 = vec_unl(y1); \ 373cabdff1aSopenharmony_ci \ 374cabdff1aSopenharmony_ci Y0 = vec_mradds(Y0, lCY, lOY); \ 375cabdff1aSopenharmony_ci Y1 = vec_mradds(Y1, lCY, lOY); \ 376cabdff1aSopenharmony_ci Y2 = vec_mradds(Y2, lCY, lOY); \ 377cabdff1aSopenharmony_ci Y3 = vec_mradds(Y3, lCY, lOY); \ 378cabdff1aSopenharmony_ci \ 379cabdff1aSopenharmony_ci /* ux = (CBU * (u << CSHIFT) + 0x4000) >> 15 */ \ 380cabdff1aSopenharmony_ci ux = vec_sl(U, lCSHIFT); \ 381cabdff1aSopenharmony_ci ux = vec_mradds(ux, lCBU, (vector signed short) { 0 }); \ 382cabdff1aSopenharmony_ci ux0 = vec_mergeh(ux, ux); \ 383cabdff1aSopenharmony_ci ux1 = vec_mergel(ux, ux); \ 384cabdff1aSopenharmony_ci \ 385cabdff1aSopenharmony_ci /* vx = (CRV * (v << CSHIFT) + 0x4000) >> 15; */ \ 386cabdff1aSopenharmony_ci vx = vec_sl(V, lCSHIFT); \ 387cabdff1aSopenharmony_ci vx = vec_mradds(vx, lCRV, (vector signed short) { 0 }); \ 388cabdff1aSopenharmony_ci vx0 = vec_mergeh(vx, vx); \ 389cabdff1aSopenharmony_ci vx1 = vec_mergel(vx, vx); \ 390cabdff1aSopenharmony_ci \ 391cabdff1aSopenharmony_ci /* uvx = ((CGU * u) + (CGV * v)) >> 15 */ \ 392cabdff1aSopenharmony_ci uvx = vec_mradds(U, lCGU, (vector signed short) { 0 }); \ 393cabdff1aSopenharmony_ci uvx = vec_mradds(V, lCGV, uvx); \ 394cabdff1aSopenharmony_ci uvx0 = vec_mergeh(uvx, uvx); \ 395cabdff1aSopenharmony_ci uvx1 = vec_mergel(uvx, uvx); \ 396cabdff1aSopenharmony_ci \ 397cabdff1aSopenharmony_ci R0 = vec_add(Y0, vx0); \ 398cabdff1aSopenharmony_ci G0 = vec_add(Y0, uvx0); \ 399cabdff1aSopenharmony_ci B0 = vec_add(Y0, ux0); \ 
400cabdff1aSopenharmony_ci R1 = vec_add(Y1, vx1); \ 401cabdff1aSopenharmony_ci G1 = vec_add(Y1, uvx1); \ 402cabdff1aSopenharmony_ci B1 = vec_add(Y1, ux1); \ 403cabdff1aSopenharmony_ci \ 404cabdff1aSopenharmony_ci R = vec_packclp(R0, R1); \ 405cabdff1aSopenharmony_ci G = vec_packclp(G0, G1); \ 406cabdff1aSopenharmony_ci B = vec_packclp(B0, B1); \ 407cabdff1aSopenharmony_ci \ 408cabdff1aSopenharmony_ci out_pixels(R, G, B, oute); \ 409cabdff1aSopenharmony_ci \ 410cabdff1aSopenharmony_ci R0 = vec_add(Y2, vx0); \ 411cabdff1aSopenharmony_ci G0 = vec_add(Y2, uvx0); \ 412cabdff1aSopenharmony_ci B0 = vec_add(Y2, ux0); \ 413cabdff1aSopenharmony_ci R1 = vec_add(Y3, vx1); \ 414cabdff1aSopenharmony_ci G1 = vec_add(Y3, uvx1); \ 415cabdff1aSopenharmony_ci B1 = vec_add(Y3, ux1); \ 416cabdff1aSopenharmony_ci R = vec_packclp(R0, R1); \ 417cabdff1aSopenharmony_ci G = vec_packclp(G0, G1); \ 418cabdff1aSopenharmony_ci B = vec_packclp(B0, B1); \ 419cabdff1aSopenharmony_ci \ 420cabdff1aSopenharmony_ci \ 421cabdff1aSopenharmony_ci out_pixels(R, G, B, outo); \ 422cabdff1aSopenharmony_ci \ 423cabdff1aSopenharmony_ci y1i += 16; \ 424cabdff1aSopenharmony_ci y2i += 16; \ 425cabdff1aSopenharmony_ci ui += 8; \ 426cabdff1aSopenharmony_ci vi += 8; \ 427cabdff1aSopenharmony_ci } \ 428cabdff1aSopenharmony_ci \ 429cabdff1aSopenharmony_ci ui += instrides_scl[1]; \ 430cabdff1aSopenharmony_ci vi += instrides_scl[2]; \ 431cabdff1aSopenharmony_ci y1i += instrides_scl[0]; \ 432cabdff1aSopenharmony_ci y2i += instrides_scl[0]; \ 433cabdff1aSopenharmony_ci } \ 434cabdff1aSopenharmony_ci return srcSliceH; \ 435cabdff1aSopenharmony_ci} 436cabdff1aSopenharmony_ci 437cabdff1aSopenharmony_ci#define out_abgr(a, b, c, ptr) \ 438cabdff1aSopenharmony_ci vec_mstrgb32(__typeof__(a), ((__typeof__(a)) vec_splat((__typeof__(a)){ 255 }, 0)), c, b, a, ptr) 439cabdff1aSopenharmony_ci#define out_bgra(a, b, c, ptr) \ 440cabdff1aSopenharmony_ci vec_mstrgb32(__typeof__(a), c, b, a, ((__typeof__(a)) vec_splat((__typeof__(a)){ 255 
}, 0)), ptr) 441cabdff1aSopenharmony_ci#define out_rgba(a, b, c, ptr) \ 442cabdff1aSopenharmony_ci vec_mstrgb32(__typeof__(a), a, b, c, ((__typeof__(a)) vec_splat((__typeof__(a)){ 255 }, 0)), ptr) 443cabdff1aSopenharmony_ci#define out_argb(a, b, c, ptr) \ 444cabdff1aSopenharmony_ci vec_mstrgb32(__typeof__(a), ((__typeof__(a)) vec_splat((__typeof__(a)){ 255 }, 0)), a, b, c, ptr) 445cabdff1aSopenharmony_ci#define out_rgb24(a, b, c, ptr) vec_mstrgb24(a, b, c, ptr) 446cabdff1aSopenharmony_ci#define out_bgr24(a, b, c, ptr) vec_mstbgr24(a, b, c, ptr) 447cabdff1aSopenharmony_ci 448cabdff1aSopenharmony_ciDEFCSP420_CVT(yuv2_abgr, out_abgr) 449cabdff1aSopenharmony_ciDEFCSP420_CVT(yuv2_bgra, out_bgra) 450cabdff1aSopenharmony_ciDEFCSP420_CVT(yuv2_rgba, out_rgba) 451cabdff1aSopenharmony_ciDEFCSP420_CVT(yuv2_argb, out_argb) 452cabdff1aSopenharmony_ciDEFCSP420_CVT(yuv2_rgb24, out_rgb24) 453cabdff1aSopenharmony_ciDEFCSP420_CVT(yuv2_bgr24, out_bgr24) 454cabdff1aSopenharmony_ci 455cabdff1aSopenharmony_ci// uyvy|uyvy|uyvy|uyvy 456cabdff1aSopenharmony_ci// 0123 4567 89ab cdef 457cabdff1aSopenharmony_cistatic const vector unsigned char 458cabdff1aSopenharmony_ci demux_u = { 0x10, 0x00, 0x10, 0x00, 459cabdff1aSopenharmony_ci 0x10, 0x04, 0x10, 0x04, 460cabdff1aSopenharmony_ci 0x10, 0x08, 0x10, 0x08, 461cabdff1aSopenharmony_ci 0x10, 0x0c, 0x10, 0x0c }, 462cabdff1aSopenharmony_ci demux_v = { 0x10, 0x02, 0x10, 0x02, 463cabdff1aSopenharmony_ci 0x10, 0x06, 0x10, 0x06, 464cabdff1aSopenharmony_ci 0x10, 0x0A, 0x10, 0x0A, 465cabdff1aSopenharmony_ci 0x10, 0x0E, 0x10, 0x0E }, 466cabdff1aSopenharmony_ci demux_y = { 0x10, 0x01, 0x10, 0x03, 467cabdff1aSopenharmony_ci 0x10, 0x05, 0x10, 0x07, 468cabdff1aSopenharmony_ci 0x10, 0x09, 0x10, 0x0B, 469cabdff1aSopenharmony_ci 0x10, 0x0D, 0x10, 0x0F }; 470cabdff1aSopenharmony_ci 471cabdff1aSopenharmony_ci/* 472cabdff1aSopenharmony_ci * this is so I can play live CCIR raw video 473cabdff1aSopenharmony_ci */ 474cabdff1aSopenharmony_cistatic int 
altivec_uyvy_rgb32(SwsContext *c, const unsigned char **in, 475cabdff1aSopenharmony_ci int *instrides, int srcSliceY, int srcSliceH, 476cabdff1aSopenharmony_ci unsigned char **oplanes, int *outstrides) 477cabdff1aSopenharmony_ci{ 478cabdff1aSopenharmony_ci int w = c->srcW; 479cabdff1aSopenharmony_ci int h = srcSliceH; 480cabdff1aSopenharmony_ci int i, j; 481cabdff1aSopenharmony_ci vector unsigned char uyvy; 482cabdff1aSopenharmony_ci vector signed short Y, U, V; 483cabdff1aSopenharmony_ci vector signed short R0, G0, B0, R1, G1, B1; 484cabdff1aSopenharmony_ci vector unsigned char R, G, B; 485cabdff1aSopenharmony_ci vector unsigned char *out; 486cabdff1aSopenharmony_ci const ubyte *img; 487cabdff1aSopenharmony_ci 488cabdff1aSopenharmony_ci img = in[0]; 489cabdff1aSopenharmony_ci out = (vector unsigned char *) (oplanes[0] + srcSliceY * outstrides[0]); 490cabdff1aSopenharmony_ci 491cabdff1aSopenharmony_ci for (i = 0; i < h; i++) 492cabdff1aSopenharmony_ci for (j = 0; j < w / 16; j++) { 493cabdff1aSopenharmony_ci uyvy = vec_ld(0, img); 494cabdff1aSopenharmony_ci 495cabdff1aSopenharmony_ci U = (vector signed short) 496cabdff1aSopenharmony_ci vec_perm(uyvy, (vector unsigned char) { 0 }, demux_u); 497cabdff1aSopenharmony_ci V = (vector signed short) 498cabdff1aSopenharmony_ci vec_perm(uyvy, (vector unsigned char) { 0 }, demux_v); 499cabdff1aSopenharmony_ci Y = (vector signed short) 500cabdff1aSopenharmony_ci vec_perm(uyvy, (vector unsigned char) { 0 }, demux_y); 501cabdff1aSopenharmony_ci 502cabdff1aSopenharmony_ci cvtyuvtoRGB(c, Y, U, V, &R0, &G0, &B0); 503cabdff1aSopenharmony_ci 504cabdff1aSopenharmony_ci uyvy = vec_ld(16, img); 505cabdff1aSopenharmony_ci 506cabdff1aSopenharmony_ci U = (vector signed short) 507cabdff1aSopenharmony_ci vec_perm(uyvy, (vector unsigned char) { 0 }, demux_u); 508cabdff1aSopenharmony_ci V = (vector signed short) 509cabdff1aSopenharmony_ci vec_perm(uyvy, (vector unsigned char) { 0 }, demux_v); 510cabdff1aSopenharmony_ci Y = (vector signed 
short) 511cabdff1aSopenharmony_ci vec_perm(uyvy, (vector unsigned char) { 0 }, demux_y); 512cabdff1aSopenharmony_ci 513cabdff1aSopenharmony_ci cvtyuvtoRGB(c, Y, U, V, &R1, &G1, &B1); 514cabdff1aSopenharmony_ci 515cabdff1aSopenharmony_ci R = vec_packclp(R0, R1); 516cabdff1aSopenharmony_ci G = vec_packclp(G0, G1); 517cabdff1aSopenharmony_ci B = vec_packclp(B0, B1); 518cabdff1aSopenharmony_ci 519cabdff1aSopenharmony_ci // vec_mstbgr24 (R,G,B, out); 520cabdff1aSopenharmony_ci out_rgba(R, G, B, out); 521cabdff1aSopenharmony_ci 522cabdff1aSopenharmony_ci img += 32; 523cabdff1aSopenharmony_ci } 524cabdff1aSopenharmony_ci return srcSliceH; 525cabdff1aSopenharmony_ci} 526cabdff1aSopenharmony_ci 527cabdff1aSopenharmony_ci#endif /* HAVE_ALTIVEC */ 528cabdff1aSopenharmony_ci 529cabdff1aSopenharmony_ci/* Ok currently the acceleration routine only supports 530cabdff1aSopenharmony_ci * inputs of widths a multiple of 16 531cabdff1aSopenharmony_ci * and heights a multiple 2 532cabdff1aSopenharmony_ci * 533cabdff1aSopenharmony_ci * So we just fall back to the C codes for this. 534cabdff1aSopenharmony_ci */ 535cabdff1aSopenharmony_ciav_cold SwsFunc ff_yuv2rgb_init_ppc(SwsContext *c) 536cabdff1aSopenharmony_ci{ 537cabdff1aSopenharmony_ci#if HAVE_ALTIVEC 538cabdff1aSopenharmony_ci if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC)) 539cabdff1aSopenharmony_ci return NULL; 540cabdff1aSopenharmony_ci 541cabdff1aSopenharmony_ci /* 542cabdff1aSopenharmony_ci * and this seems not to matter too much I tried a bunch of 543cabdff1aSopenharmony_ci * videos with abnormal widths and MPlayer crashes elsewhere. 544cabdff1aSopenharmony_ci * mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv 545cabdff1aSopenharmony_ci * boom with X11 bad match. 
546cabdff1aSopenharmony_ci * 547cabdff1aSopenharmony_ci */ 548cabdff1aSopenharmony_ci if ((c->srcW & 0xf) != 0) 549cabdff1aSopenharmony_ci return NULL; 550cabdff1aSopenharmony_ci 551cabdff1aSopenharmony_ci switch (c->srcFormat) { 552cabdff1aSopenharmony_ci case AV_PIX_FMT_YUV410P: 553cabdff1aSopenharmony_ci case AV_PIX_FMT_YUV420P: 554cabdff1aSopenharmony_ci /*case IMGFMT_CLPL: ??? */ 555cabdff1aSopenharmony_ci case AV_PIX_FMT_GRAY8: 556cabdff1aSopenharmony_ci case AV_PIX_FMT_NV12: 557cabdff1aSopenharmony_ci case AV_PIX_FMT_NV21: 558cabdff1aSopenharmony_ci if ((c->srcH & 0x1) != 0) 559cabdff1aSopenharmony_ci return NULL; 560cabdff1aSopenharmony_ci 561cabdff1aSopenharmony_ci switch (c->dstFormat) { 562cabdff1aSopenharmony_ci case AV_PIX_FMT_RGB24: 563cabdff1aSopenharmony_ci av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n"); 564cabdff1aSopenharmony_ci return altivec_yuv2_rgb24; 565cabdff1aSopenharmony_ci case AV_PIX_FMT_BGR24: 566cabdff1aSopenharmony_ci av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n"); 567cabdff1aSopenharmony_ci return altivec_yuv2_bgr24; 568cabdff1aSopenharmony_ci case AV_PIX_FMT_ARGB: 569cabdff1aSopenharmony_ci av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n"); 570cabdff1aSopenharmony_ci return altivec_yuv2_argb; 571cabdff1aSopenharmony_ci case AV_PIX_FMT_ABGR: 572cabdff1aSopenharmony_ci av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n"); 573cabdff1aSopenharmony_ci return altivec_yuv2_abgr; 574cabdff1aSopenharmony_ci case AV_PIX_FMT_RGBA: 575cabdff1aSopenharmony_ci av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n"); 576cabdff1aSopenharmony_ci return altivec_yuv2_rgba; 577cabdff1aSopenharmony_ci case AV_PIX_FMT_BGRA: 578cabdff1aSopenharmony_ci av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n"); 579cabdff1aSopenharmony_ci return altivec_yuv2_bgra; 580cabdff1aSopenharmony_ci default: return NULL; 581cabdff1aSopenharmony_ci } 582cabdff1aSopenharmony_ci break; 583cabdff1aSopenharmony_ci 
584cabdff1aSopenharmony_ci case AV_PIX_FMT_UYVY422: 585cabdff1aSopenharmony_ci switch (c->dstFormat) { 586cabdff1aSopenharmony_ci case AV_PIX_FMT_BGR32: 587cabdff1aSopenharmony_ci av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n"); 588cabdff1aSopenharmony_ci return altivec_uyvy_rgb32; 589cabdff1aSopenharmony_ci default: return NULL; 590cabdff1aSopenharmony_ci } 591cabdff1aSopenharmony_ci break; 592cabdff1aSopenharmony_ci } 593cabdff1aSopenharmony_ci#endif /* HAVE_ALTIVEC */ 594cabdff1aSopenharmony_ci 595cabdff1aSopenharmony_ci return NULL; 596cabdff1aSopenharmony_ci} 597cabdff1aSopenharmony_ci 598cabdff1aSopenharmony_ciav_cold void ff_yuv2rgb_init_tables_ppc(SwsContext *c, 599cabdff1aSopenharmony_ci const int inv_table[4], 600cabdff1aSopenharmony_ci int brightness, 601cabdff1aSopenharmony_ci int contrast, 602cabdff1aSopenharmony_ci int saturation) 603cabdff1aSopenharmony_ci{ 604cabdff1aSopenharmony_ci#if HAVE_ALTIVEC 605cabdff1aSopenharmony_ci union { 606cabdff1aSopenharmony_ci DECLARE_ALIGNED(16, signed short, tmp)[8]; 607cabdff1aSopenharmony_ci vector signed short vec; 608cabdff1aSopenharmony_ci } buf; 609cabdff1aSopenharmony_ci 610cabdff1aSopenharmony_ci if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC)) 611cabdff1aSopenharmony_ci return; 612cabdff1aSopenharmony_ci 613cabdff1aSopenharmony_ci buf.tmp[0] = ((0xffffLL) * contrast >> 8) >> 9; // cy 614cabdff1aSopenharmony_ci buf.tmp[1] = -256 * brightness; // oy 615cabdff1aSopenharmony_ci buf.tmp[2] = (inv_table[0] >> 3) * (contrast >> 16) * (saturation >> 16); // crv 616cabdff1aSopenharmony_ci buf.tmp[3] = (inv_table[1] >> 3) * (contrast >> 16) * (saturation >> 16); // cbu 617cabdff1aSopenharmony_ci buf.tmp[4] = -((inv_table[2] >> 1) * (contrast >> 16) * (saturation >> 16)); // cgu 618cabdff1aSopenharmony_ci buf.tmp[5] = -((inv_table[3] >> 1) * (contrast >> 16) * (saturation >> 16)); // cgv 619cabdff1aSopenharmony_ci 620cabdff1aSopenharmony_ci c->CSHIFT = (vector unsigned short) vec_splat_u16(2); 
621cabdff1aSopenharmony_ci c->CY = vec_splat((vector signed short) buf.vec, 0); 622cabdff1aSopenharmony_ci c->OY = vec_splat((vector signed short) buf.vec, 1); 623cabdff1aSopenharmony_ci c->CRV = vec_splat((vector signed short) buf.vec, 2); 624cabdff1aSopenharmony_ci c->CBU = vec_splat((vector signed short) buf.vec, 3); 625cabdff1aSopenharmony_ci c->CGU = vec_splat((vector signed short) buf.vec, 4); 626cabdff1aSopenharmony_ci c->CGV = vec_splat((vector signed short) buf.vec, 5); 627cabdff1aSopenharmony_ci return; 628cabdff1aSopenharmony_ci#endif /* HAVE_ALTIVEC */ 629cabdff1aSopenharmony_ci} 630cabdff1aSopenharmony_ci 631cabdff1aSopenharmony_ci#if HAVE_ALTIVEC 632cabdff1aSopenharmony_ci 633cabdff1aSopenharmony_cistatic av_always_inline void yuv2packedX_altivec(SwsContext *c, 634cabdff1aSopenharmony_ci const int16_t *lumFilter, 635cabdff1aSopenharmony_ci const int16_t **lumSrc, 636cabdff1aSopenharmony_ci int lumFilterSize, 637cabdff1aSopenharmony_ci const int16_t *chrFilter, 638cabdff1aSopenharmony_ci const int16_t **chrUSrc, 639cabdff1aSopenharmony_ci const int16_t **chrVSrc, 640cabdff1aSopenharmony_ci int chrFilterSize, 641cabdff1aSopenharmony_ci const int16_t **alpSrc, 642cabdff1aSopenharmony_ci uint8_t *dest, 643cabdff1aSopenharmony_ci int dstW, int dstY, 644cabdff1aSopenharmony_ci enum AVPixelFormat target) 645cabdff1aSopenharmony_ci{ 646cabdff1aSopenharmony_ci int i, j; 647cabdff1aSopenharmony_ci vector signed short X, X0, X1, Y0, U0, V0, Y1, U1, V1, U, V; 648cabdff1aSopenharmony_ci vector signed short R0, G0, B0, R1, G1, B1; 649cabdff1aSopenharmony_ci 650cabdff1aSopenharmony_ci vector unsigned char R, G, B; 651cabdff1aSopenharmony_ci vector unsigned char *out, *nout; 652cabdff1aSopenharmony_ci 653cabdff1aSopenharmony_ci vector signed short RND = vec_splat_s16(1 << 3); 654cabdff1aSopenharmony_ci vector unsigned short SCL = vec_splat_u16(4); 655cabdff1aSopenharmony_ci DECLARE_ALIGNED(16, unsigned int, scratch)[16]; 656cabdff1aSopenharmony_ci 
657cabdff1aSopenharmony_ci vector signed short *YCoeffs, *CCoeffs; 658cabdff1aSopenharmony_ci 659cabdff1aSopenharmony_ci YCoeffs = c->vYCoeffsBank + dstY * lumFilterSize; 660cabdff1aSopenharmony_ci CCoeffs = c->vCCoeffsBank + dstY * chrFilterSize; 661cabdff1aSopenharmony_ci 662cabdff1aSopenharmony_ci out = (vector unsigned char *) dest; 663cabdff1aSopenharmony_ci 664cabdff1aSopenharmony_ci for (i = 0; i < dstW; i += 16) { 665cabdff1aSopenharmony_ci Y0 = RND; 666cabdff1aSopenharmony_ci Y1 = RND; 667cabdff1aSopenharmony_ci /* extract 16 coeffs from lumSrc */ 668cabdff1aSopenharmony_ci for (j = 0; j < lumFilterSize; j++) { 669cabdff1aSopenharmony_ci X0 = vec_ld(0, &lumSrc[j][i]); 670cabdff1aSopenharmony_ci X1 = vec_ld(16, &lumSrc[j][i]); 671cabdff1aSopenharmony_ci Y0 = vec_mradds(X0, YCoeffs[j], Y0); 672cabdff1aSopenharmony_ci Y1 = vec_mradds(X1, YCoeffs[j], Y1); 673cabdff1aSopenharmony_ci } 674cabdff1aSopenharmony_ci 675cabdff1aSopenharmony_ci U = RND; 676cabdff1aSopenharmony_ci V = RND; 677cabdff1aSopenharmony_ci /* extract 8 coeffs from U,V */ 678cabdff1aSopenharmony_ci for (j = 0; j < chrFilterSize; j++) { 679cabdff1aSopenharmony_ci X = vec_ld(0, &chrUSrc[j][i / 2]); 680cabdff1aSopenharmony_ci U = vec_mradds(X, CCoeffs[j], U); 681cabdff1aSopenharmony_ci X = vec_ld(0, &chrVSrc[j][i / 2]); 682cabdff1aSopenharmony_ci V = vec_mradds(X, CCoeffs[j], V); 683cabdff1aSopenharmony_ci } 684cabdff1aSopenharmony_ci 685cabdff1aSopenharmony_ci /* scale and clip signals */ 686cabdff1aSopenharmony_ci Y0 = vec_sra(Y0, SCL); 687cabdff1aSopenharmony_ci Y1 = vec_sra(Y1, SCL); 688cabdff1aSopenharmony_ci U = vec_sra(U, SCL); 689cabdff1aSopenharmony_ci V = vec_sra(V, SCL); 690cabdff1aSopenharmony_ci 691cabdff1aSopenharmony_ci Y0 = vec_clip_s16(Y0); 692cabdff1aSopenharmony_ci Y1 = vec_clip_s16(Y1); 693cabdff1aSopenharmony_ci U = vec_clip_s16(U); 694cabdff1aSopenharmony_ci V = vec_clip_s16(V); 695cabdff1aSopenharmony_ci 696cabdff1aSopenharmony_ci /* now we have 697cabdff1aSopenharmony_ci * 
Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15 698cabdff1aSopenharmony_ci * U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7 699cabdff1aSopenharmony_ci * 700cabdff1aSopenharmony_ci * Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15 701cabdff1aSopenharmony_ci * U0 = u0 u0 u1 u1 u2 u2 u3 u3 U1 = u4 u4 u5 u5 u6 u6 u7 u7 702cabdff1aSopenharmony_ci * V0 = v0 v0 v1 v1 v2 v2 v3 v3 V1 = v4 v4 v5 v5 v6 v6 v7 v7 703cabdff1aSopenharmony_ci */ 704cabdff1aSopenharmony_ci 705cabdff1aSopenharmony_ci U0 = vec_mergeh(U, U); 706cabdff1aSopenharmony_ci V0 = vec_mergeh(V, V); 707cabdff1aSopenharmony_ci 708cabdff1aSopenharmony_ci U1 = vec_mergel(U, U); 709cabdff1aSopenharmony_ci V1 = vec_mergel(V, V); 710cabdff1aSopenharmony_ci 711cabdff1aSopenharmony_ci cvtyuvtoRGB(c, Y0, U0, V0, &R0, &G0, &B0); 712cabdff1aSopenharmony_ci cvtyuvtoRGB(c, Y1, U1, V1, &R1, &G1, &B1); 713cabdff1aSopenharmony_ci 714cabdff1aSopenharmony_ci R = vec_packclp(R0, R1); 715cabdff1aSopenharmony_ci G = vec_packclp(G0, G1); 716cabdff1aSopenharmony_ci B = vec_packclp(B0, B1); 717cabdff1aSopenharmony_ci 718cabdff1aSopenharmony_ci switch (target) { 719cabdff1aSopenharmony_ci case AV_PIX_FMT_ABGR: 720cabdff1aSopenharmony_ci out_abgr(R, G, B, out); 721cabdff1aSopenharmony_ci break; 722cabdff1aSopenharmony_ci case AV_PIX_FMT_BGRA: 723cabdff1aSopenharmony_ci out_bgra(R, G, B, out); 724cabdff1aSopenharmony_ci break; 725cabdff1aSopenharmony_ci case AV_PIX_FMT_RGBA: 726cabdff1aSopenharmony_ci out_rgba(R, G, B, out); 727cabdff1aSopenharmony_ci break; 728cabdff1aSopenharmony_ci case AV_PIX_FMT_ARGB: 729cabdff1aSopenharmony_ci out_argb(R, G, B, out); 730cabdff1aSopenharmony_ci break; 731cabdff1aSopenharmony_ci case AV_PIX_FMT_RGB24: 732cabdff1aSopenharmony_ci out_rgb24(R, G, B, out); 733cabdff1aSopenharmony_ci break; 734cabdff1aSopenharmony_ci case AV_PIX_FMT_BGR24: 735cabdff1aSopenharmony_ci out_bgr24(R, G, B, out); 736cabdff1aSopenharmony_ci break; 737cabdff1aSopenharmony_ci default: 
738cabdff1aSopenharmony_ci { 739cabdff1aSopenharmony_ci /* If this is reached, the caller should have called yuv2packedXinC 740cabdff1aSopenharmony_ci * instead. */ 741cabdff1aSopenharmony_ci static int printed_error_message; 742cabdff1aSopenharmony_ci if (!printed_error_message) { 743cabdff1aSopenharmony_ci av_log(c, AV_LOG_ERROR, 744cabdff1aSopenharmony_ci "altivec_yuv2packedX doesn't support %s output\n", 745cabdff1aSopenharmony_ci av_get_pix_fmt_name(c->dstFormat)); 746cabdff1aSopenharmony_ci printed_error_message = 1; 747cabdff1aSopenharmony_ci } 748cabdff1aSopenharmony_ci return; 749cabdff1aSopenharmony_ci } 750cabdff1aSopenharmony_ci } 751cabdff1aSopenharmony_ci } 752cabdff1aSopenharmony_ci 753cabdff1aSopenharmony_ci if (i < dstW) { 754cabdff1aSopenharmony_ci i -= 16; 755cabdff1aSopenharmony_ci 756cabdff1aSopenharmony_ci Y0 = RND; 757cabdff1aSopenharmony_ci Y1 = RND; 758cabdff1aSopenharmony_ci /* extract 16 coeffs from lumSrc */ 759cabdff1aSopenharmony_ci for (j = 0; j < lumFilterSize; j++) { 760cabdff1aSopenharmony_ci X0 = vec_ld(0, &lumSrc[j][i]); 761cabdff1aSopenharmony_ci X1 = vec_ld(16, &lumSrc[j][i]); 762cabdff1aSopenharmony_ci Y0 = vec_mradds(X0, YCoeffs[j], Y0); 763cabdff1aSopenharmony_ci Y1 = vec_mradds(X1, YCoeffs[j], Y1); 764cabdff1aSopenharmony_ci } 765cabdff1aSopenharmony_ci 766cabdff1aSopenharmony_ci U = RND; 767cabdff1aSopenharmony_ci V = RND; 768cabdff1aSopenharmony_ci /* extract 8 coeffs from U,V */ 769cabdff1aSopenharmony_ci for (j = 0; j < chrFilterSize; j++) { 770cabdff1aSopenharmony_ci X = vec_ld(0, &chrUSrc[j][i / 2]); 771cabdff1aSopenharmony_ci U = vec_mradds(X, CCoeffs[j], U); 772cabdff1aSopenharmony_ci X = vec_ld(0, &chrVSrc[j][i / 2]); 773cabdff1aSopenharmony_ci V = vec_mradds(X, CCoeffs[j], V); 774cabdff1aSopenharmony_ci } 775cabdff1aSopenharmony_ci 776cabdff1aSopenharmony_ci /* scale and clip signals */ 777cabdff1aSopenharmony_ci Y0 = vec_sra(Y0, SCL); 778cabdff1aSopenharmony_ci Y1 = vec_sra(Y1, SCL); 779cabdff1aSopenharmony_ci U 
= vec_sra(U, SCL); 780cabdff1aSopenharmony_ci V = vec_sra(V, SCL); 781cabdff1aSopenharmony_ci 782cabdff1aSopenharmony_ci Y0 = vec_clip_s16(Y0); 783cabdff1aSopenharmony_ci Y1 = vec_clip_s16(Y1); 784cabdff1aSopenharmony_ci U = vec_clip_s16(U); 785cabdff1aSopenharmony_ci V = vec_clip_s16(V); 786cabdff1aSopenharmony_ci 787cabdff1aSopenharmony_ci /* now we have 788cabdff1aSopenharmony_ci * Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15 789cabdff1aSopenharmony_ci * U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7 790cabdff1aSopenharmony_ci * 791cabdff1aSopenharmony_ci * Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15 792cabdff1aSopenharmony_ci * U0 = u0 u0 u1 u1 u2 u2 u3 u3 U1 = u4 u4 u5 u5 u6 u6 u7 u7 793cabdff1aSopenharmony_ci * V0 = v0 v0 v1 v1 v2 v2 v3 v3 V1 = v4 v4 v5 v5 v6 v6 v7 v7 794cabdff1aSopenharmony_ci */ 795cabdff1aSopenharmony_ci 796cabdff1aSopenharmony_ci U0 = vec_mergeh(U, U); 797cabdff1aSopenharmony_ci V0 = vec_mergeh(V, V); 798cabdff1aSopenharmony_ci 799cabdff1aSopenharmony_ci U1 = vec_mergel(U, U); 800cabdff1aSopenharmony_ci V1 = vec_mergel(V, V); 801cabdff1aSopenharmony_ci 802cabdff1aSopenharmony_ci cvtyuvtoRGB(c, Y0, U0, V0, &R0, &G0, &B0); 803cabdff1aSopenharmony_ci cvtyuvtoRGB(c, Y1, U1, V1, &R1, &G1, &B1); 804cabdff1aSopenharmony_ci 805cabdff1aSopenharmony_ci R = vec_packclp(R0, R1); 806cabdff1aSopenharmony_ci G = vec_packclp(G0, G1); 807cabdff1aSopenharmony_ci B = vec_packclp(B0, B1); 808cabdff1aSopenharmony_ci 809cabdff1aSopenharmony_ci nout = (vector unsigned char *) scratch; 810cabdff1aSopenharmony_ci switch (target) { 811cabdff1aSopenharmony_ci case AV_PIX_FMT_ABGR: 812cabdff1aSopenharmony_ci out_abgr(R, G, B, nout); 813cabdff1aSopenharmony_ci break; 814cabdff1aSopenharmony_ci case AV_PIX_FMT_BGRA: 815cabdff1aSopenharmony_ci out_bgra(R, G, B, nout); 816cabdff1aSopenharmony_ci break; 817cabdff1aSopenharmony_ci case AV_PIX_FMT_RGBA: 818cabdff1aSopenharmony_ci out_rgba(R, G, B, nout); 
819cabdff1aSopenharmony_ci break; 820cabdff1aSopenharmony_ci case AV_PIX_FMT_ARGB: 821cabdff1aSopenharmony_ci out_argb(R, G, B, nout); 822cabdff1aSopenharmony_ci break; 823cabdff1aSopenharmony_ci case AV_PIX_FMT_RGB24: 824cabdff1aSopenharmony_ci out_rgb24(R, G, B, nout); 825cabdff1aSopenharmony_ci break; 826cabdff1aSopenharmony_ci case AV_PIX_FMT_BGR24: 827cabdff1aSopenharmony_ci out_bgr24(R, G, B, nout); 828cabdff1aSopenharmony_ci break; 829cabdff1aSopenharmony_ci default: 830cabdff1aSopenharmony_ci /* Unreachable, I think. */ 831cabdff1aSopenharmony_ci av_log(c, AV_LOG_ERROR, 832cabdff1aSopenharmony_ci "altivec_yuv2packedX doesn't support %s output\n", 833cabdff1aSopenharmony_ci av_get_pix_fmt_name(c->dstFormat)); 834cabdff1aSopenharmony_ci return; 835cabdff1aSopenharmony_ci } 836cabdff1aSopenharmony_ci 837cabdff1aSopenharmony_ci memcpy(&((uint32_t *) dest)[i], scratch, (dstW - i) / 4); 838cabdff1aSopenharmony_ci } 839cabdff1aSopenharmony_ci} 840cabdff1aSopenharmony_ci 841cabdff1aSopenharmony_ci#define YUV2PACKEDX_WRAPPER(suffix, pixfmt) \ 842cabdff1aSopenharmony_civoid ff_yuv2 ## suffix ## _X_altivec(SwsContext *c, \ 843cabdff1aSopenharmony_ci const int16_t *lumFilter, \ 844cabdff1aSopenharmony_ci const int16_t **lumSrc, \ 845cabdff1aSopenharmony_ci int lumFilterSize, \ 846cabdff1aSopenharmony_ci const int16_t *chrFilter, \ 847cabdff1aSopenharmony_ci const int16_t **chrUSrc, \ 848cabdff1aSopenharmony_ci const int16_t **chrVSrc, \ 849cabdff1aSopenharmony_ci int chrFilterSize, \ 850cabdff1aSopenharmony_ci const int16_t **alpSrc, \ 851cabdff1aSopenharmony_ci uint8_t *dest, int dstW, int dstY) \ 852cabdff1aSopenharmony_ci{ \ 853cabdff1aSopenharmony_ci yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize, \ 854cabdff1aSopenharmony_ci chrFilter, chrUSrc, chrVSrc, \ 855cabdff1aSopenharmony_ci chrFilterSize, alpSrc, \ 856cabdff1aSopenharmony_ci dest, dstW, dstY, pixfmt); \ 857cabdff1aSopenharmony_ci} 858cabdff1aSopenharmony_ci 
859cabdff1aSopenharmony_ciYUV2PACKEDX_WRAPPER(abgr, AV_PIX_FMT_ABGR); 860cabdff1aSopenharmony_ciYUV2PACKEDX_WRAPPER(bgra, AV_PIX_FMT_BGRA); 861cabdff1aSopenharmony_ciYUV2PACKEDX_WRAPPER(argb, AV_PIX_FMT_ARGB); 862cabdff1aSopenharmony_ciYUV2PACKEDX_WRAPPER(rgba, AV_PIX_FMT_RGBA); 863cabdff1aSopenharmony_ciYUV2PACKEDX_WRAPPER(rgb24, AV_PIX_FMT_RGB24); 864cabdff1aSopenharmony_ciYUV2PACKEDX_WRAPPER(bgr24, AV_PIX_FMT_BGR24); 865cabdff1aSopenharmony_ci 866cabdff1aSopenharmony_ci#endif /* HAVE_ALTIVEC */ 867