1cabdff1aSopenharmony_ci/* 2cabdff1aSopenharmony_ci * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org> 3cabdff1aSopenharmony_ci * 4cabdff1aSopenharmony_ci * This file is part of FFmpeg. 5cabdff1aSopenharmony_ci * 6cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 7cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 8cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 9cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 10cabdff1aSopenharmony_ci * 11cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 12cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 13cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14cabdff1aSopenharmony_ci * Lesser General Public License for more details. 15cabdff1aSopenharmony_ci * 16cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 17cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 18cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 19cabdff1aSopenharmony_ci */ 20cabdff1aSopenharmony_ci 21cabdff1aSopenharmony_ci#include "libavutil/mem_internal.h" 22cabdff1aSopenharmony_ci#include "libavutil/ppc/util_altivec.h" 23cabdff1aSopenharmony_ci 24cabdff1aSopenharmony_ci/* this code assume that stride % 16 == 0 */ 25cabdff1aSopenharmony_ci 26cabdff1aSopenharmony_ci#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \ 27cabdff1aSopenharmony_ci vsrc2ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc2uc);\ 28cabdff1aSopenharmony_ci vsrc3ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc3uc);\ 29cabdff1aSopenharmony_ci\ 30cabdff1aSopenharmony_ci psum = vec_mladd(vA, vsrc0ssH, BIAS1);\ 31cabdff1aSopenharmony_ci psum = vec_mladd(vB, vsrc1ssH, psum);\ 32cabdff1aSopenharmony_ci psum = vec_mladd(vC, vsrc2ssH, psum);\ 33cabdff1aSopenharmony_ci psum = vec_mladd(vD, vsrc3ssH, psum);\ 34cabdff1aSopenharmony_ci psum = BIAS2(psum);\ 35cabdff1aSopenharmony_ci psum = vec_sr(psum, v6us);\ 36cabdff1aSopenharmony_ci\ 37cabdff1aSopenharmony_ci vdst = vec_ld(0, dst);\ 38cabdff1aSopenharmony_ci ppsum = (vec_u8)vec_pack(psum, psum);\ 39cabdff1aSopenharmony_ci vfdst = vec_perm(vdst, ppsum, fperm);\ 40cabdff1aSopenharmony_ci\ 41cabdff1aSopenharmony_ci OP_U8_ALTIVEC(fsum, vfdst, vdst);\ 42cabdff1aSopenharmony_ci\ 43cabdff1aSopenharmony_ci vec_st(fsum, 0, dst);\ 44cabdff1aSopenharmony_ci\ 45cabdff1aSopenharmony_ci vsrc0ssH = vsrc2ssH;\ 46cabdff1aSopenharmony_ci vsrc1ssH = vsrc3ssH;\ 47cabdff1aSopenharmony_ci\ 48cabdff1aSopenharmony_ci dst += stride;\ 49cabdff1aSopenharmony_ci src += stride; 50cabdff1aSopenharmony_ci 51cabdff1aSopenharmony_ci#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \ 52cabdff1aSopenharmony_ci\ 53cabdff1aSopenharmony_ci vsrc0ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc0uc);\ 54cabdff1aSopenharmony_ci vsrc1ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc1uc);\ 55cabdff1aSopenharmony_ci\ 56cabdff1aSopenharmony_ci psum = vec_mladd(vA, vsrc0ssH, v32ss);\ 57cabdff1aSopenharmony_ci psum = vec_mladd(vE, vsrc1ssH, psum);\ 58cabdff1aSopenharmony_ci psum = vec_sr(psum, v6us);\ 59cabdff1aSopenharmony_ci\ 60cabdff1aSopenharmony_ci vdst = vec_ld(0, dst);\ 61cabdff1aSopenharmony_ci ppsum = (vec_u8)vec_pack(psum, psum);\ 62cabdff1aSopenharmony_ci vfdst = vec_perm(vdst, ppsum, fperm);\ 63cabdff1aSopenharmony_ci\ 64cabdff1aSopenharmony_ci OP_U8_ALTIVEC(fsum, vfdst, vdst);\ 65cabdff1aSopenharmony_ci\ 66cabdff1aSopenharmony_ci vec_st(fsum, 0, dst);\ 67cabdff1aSopenharmony_ci\ 68cabdff1aSopenharmony_ci dst += stride;\ 69cabdff1aSopenharmony_ci src += stride; 70cabdff1aSopenharmony_ci 71cabdff1aSopenharmony_ci#define noop(a) a 72cabdff1aSopenharmony_ci#define add28(a) vec_add(v28ss, a) 73cabdff1aSopenharmony_ci 74cabdff1aSopenharmony_ci#if HAVE_BIGENDIAN 75cabdff1aSopenharmony_ci#define GET_VSRC1(vs0, off, b, perm0, s){ \ 76cabdff1aSopenharmony_ci vec_u8 vsrcCuc, vsrcDuc; \ 77cabdff1aSopenharmony_ci vsrcCuc = vec_ld(off, s); \ 78cabdff1aSopenharmony_ci if (loadSecond){ \ 79cabdff1aSopenharmony_ci vsrcDuc = vec_ld(off + b, s); \ 80cabdff1aSopenharmony_ci } else \ 81cabdff1aSopenharmony_ci vsrcDuc = vsrcCuc; \ 82cabdff1aSopenharmony_ci \ 83cabdff1aSopenharmony_ci vs0 = vec_perm(vsrcCuc, vsrcDuc, perm0); \ 84cabdff1aSopenharmony_ci} 85cabdff1aSopenharmony_ci#define GET_VSRC(vs0, vs1, off, b, perm0, perm1, s){ \ 86cabdff1aSopenharmony_ci vec_u8 vsrcCuc, vsrcDuc; \ 87cabdff1aSopenharmony_ci vsrcCuc = vec_ld(off, s); \ 88cabdff1aSopenharmony_ci if (loadSecond){ \ 89cabdff1aSopenharmony_ci vsrcDuc = vec_ld(off + b, s); \ 90cabdff1aSopenharmony_ci } else \ 91cabdff1aSopenharmony_ci vsrcDuc = vsrcCuc; \ 92cabdff1aSopenharmony_ci \ 93cabdff1aSopenharmony_ci vs0 = vec_perm(vsrcCuc, vsrcDuc, perm0); \ 94cabdff1aSopenharmony_ci if (reallyBadAlign){ \ 95cabdff1aSopenharmony_ci vs1 = vsrcDuc; \ 96cabdff1aSopenharmony_ci } else \ 97cabdff1aSopenharmony_ci vs1 = vec_perm(vsrcCuc, vsrcDuc, perm1); \ 98cabdff1aSopenharmony_ci } 99cabdff1aSopenharmony_ci 100cabdff1aSopenharmony_ci#else 101cabdff1aSopenharmony_ci 102cabdff1aSopenharmony_ci#define GET_VSRC1(vs0, off, b, perm0, s){ \ 103cabdff1aSopenharmony_ci vs0 = vec_vsx_ld(off, s); \ 104cabdff1aSopenharmony_ci } 105cabdff1aSopenharmony_ci#define GET_VSRC(vs0, vs1, off, b, perm0, perm1, s){ \ 106cabdff1aSopenharmony_ci vs0 = vec_vsx_ld(off, s); \ 107cabdff1aSopenharmony_ci vs1 = vec_vsx_ld(off + 1, s); \ 108cabdff1aSopenharmony_ci } 109cabdff1aSopenharmony_ci#endif /* HAVE_BIGENDIAN */ 110cabdff1aSopenharmony_ci 111cabdff1aSopenharmony_ci#ifdef PREFIX_h264_chroma_mc8_altivec 112cabdff1aSopenharmony_cistatic void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, 113cabdff1aSopenharmony_ci ptrdiff_t stride, int h, 114cabdff1aSopenharmony_ci int x, int y) 115cabdff1aSopenharmony_ci{ 116cabdff1aSopenharmony_ci DECLARE_ALIGNED(16, signed int, ABCD)[4] = 117cabdff1aSopenharmony_ci {((8 - x) * (8 - y)), 118cabdff1aSopenharmony_ci (( x) * (8 - y)), 119cabdff1aSopenharmony_ci ((8 - x) * ( y)), 120cabdff1aSopenharmony_ci (( x) * ( y))}; 121cabdff1aSopenharmony_ci register int i; 122cabdff1aSopenharmony_ci vec_u8 fperm; 123cabdff1aSopenharmony_ci LOAD_ZERO; 124cabdff1aSopenharmony_ci const vec_s32 vABCD = vec_ld(0, ABCD); 125cabdff1aSopenharmony_ci const vec_s16 vA = VEC_SPLAT16(vABCD, 1); 126cabdff1aSopenharmony_ci const vec_s16 vB = VEC_SPLAT16(vABCD, 3); 127cabdff1aSopenharmony_ci const vec_s16 vC = VEC_SPLAT16(vABCD, 5); 128cabdff1aSopenharmony_ci const vec_s16 vD = VEC_SPLAT16(vABCD, 7); 129cabdff1aSopenharmony_ci const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5)); 130cabdff1aSopenharmony_ci const vec_u16 v6us = vec_splat_u16(6); 131cabdff1aSopenharmony_ci 132cabdff1aSopenharmony_ci vec_u8 vsrcperm0, vsrcperm1; 133cabdff1aSopenharmony_ci vec_u8 vsrc0uc, vsrc1uc; 134cabdff1aSopenharmony_ci vec_s16 vsrc0ssH, vsrc1ssH; 135cabdff1aSopenharmony_ci vec_u8 vsrc2uc, vsrc3uc; 136cabdff1aSopenharmony_ci vec_s16 vsrc2ssH, vsrc3ssH, psum; 137cabdff1aSopenharmony_ci vec_u8 vdst, ppsum, vfdst, fsum; 138cabdff1aSopenharmony_ci#if HAVE_BIGENDIAN 139cabdff1aSopenharmony_ci register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; 140cabdff1aSopenharmony_ci register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; 141cabdff1aSopenharmony_ci vsrcperm0 = vec_lvsl(0, src); 142cabdff1aSopenharmony_ci vsrcperm1 = vec_lvsl(1, src); 143cabdff1aSopenharmony_ci#endif 144cabdff1aSopenharmony_ci 145cabdff1aSopenharmony_ci if (((unsigned long)dst) % 16 == 0) { 146cabdff1aSopenharmony_ci fperm = (vec_u8){0x10, 0x11, 0x12, 0x13, 147cabdff1aSopenharmony_ci 0x14, 0x15, 0x16, 0x17, 148cabdff1aSopenharmony_ci 0x08, 0x09, 0x0A, 0x0B, 149cabdff1aSopenharmony_ci 0x0C, 0x0D, 0x0E, 0x0F}; 150cabdff1aSopenharmony_ci } else { 151cabdff1aSopenharmony_ci fperm = (vec_u8){0x00, 0x01, 0x02, 0x03, 152cabdff1aSopenharmony_ci 0x04, 0x05, 0x06, 0x07, 153cabdff1aSopenharmony_ci 0x18, 0x19, 0x1A, 0x1B, 154cabdff1aSopenharmony_ci 0x1C, 0x1D, 0x1E, 0x1F}; 155cabdff1aSopenharmony_ci } 156cabdff1aSopenharmony_ci 157cabdff1aSopenharmony_ci GET_VSRC(vsrc0uc, vsrc1uc, 0, 16, vsrcperm0, vsrcperm1, src); 158cabdff1aSopenharmony_ci 159cabdff1aSopenharmony_ci vsrc0ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc0uc); 160cabdff1aSopenharmony_ci vsrc1ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc1uc); 161cabdff1aSopenharmony_ci 162cabdff1aSopenharmony_ci if (ABCD[3]) { 163cabdff1aSopenharmony_ci for (i = 0 ; i < h ; i++) { 164cabdff1aSopenharmony_ci GET_VSRC(vsrc2uc, vsrc3uc, stride, 16, vsrcperm0, vsrcperm1, src); 165cabdff1aSopenharmony_ci CHROMA_MC8_ALTIVEC_CORE(v32ss, noop); 166cabdff1aSopenharmony_ci } 167cabdff1aSopenharmony_ci } else { 168cabdff1aSopenharmony_ci const vec_s16 vE = vec_add(vB, vC); 169cabdff1aSopenharmony_ci if (ABCD[2]) { // x == 0 B == 0 170cabdff1aSopenharmony_ci for (i = 0 ; i < h ; i++) { 171cabdff1aSopenharmony_ci GET_VSRC1(vsrc1uc, stride, 15, vsrcperm0, src); 172cabdff1aSopenharmony_ci CHROMA_MC8_ALTIVEC_CORE_SIMPLE; 173cabdff1aSopenharmony_ci vsrc0uc = vsrc1uc; 174cabdff1aSopenharmony_ci } 175cabdff1aSopenharmony_ci } else { // y == 0 C == 0 176cabdff1aSopenharmony_ci for (i = 0 ; i < h ; i++) { 177cabdff1aSopenharmony_ci GET_VSRC(vsrc0uc, vsrc1uc, 0, 15, vsrcperm0, vsrcperm1, src); 178cabdff1aSopenharmony_ci CHROMA_MC8_ALTIVEC_CORE_SIMPLE; 179cabdff1aSopenharmony_ci } 180cabdff1aSopenharmony_ci } 181cabdff1aSopenharmony_ci } 182cabdff1aSopenharmony_ci} 183cabdff1aSopenharmony_ci#endif 184cabdff1aSopenharmony_ci 185cabdff1aSopenharmony_ci/* this code assume that stride % 16 == 0 */ 186cabdff1aSopenharmony_ci#ifdef PREFIX_no_rnd_vc1_chroma_mc8_altivec 187cabdff1aSopenharmony_cistatic void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t *dst, uint8_t *src, 188cabdff1aSopenharmony_ci ptrdiff_t stride, int h, 189cabdff1aSopenharmony_ci int x, int y) 190cabdff1aSopenharmony_ci{ 191cabdff1aSopenharmony_ci DECLARE_ALIGNED(16, signed int, ABCD)[4] = 192cabdff1aSopenharmony_ci {((8 - x) * (8 - y)), 193cabdff1aSopenharmony_ci (( x) * (8 - y)), 194cabdff1aSopenharmony_ci ((8 - x) * ( y)), 195cabdff1aSopenharmony_ci (( x) * ( y))}; 196cabdff1aSopenharmony_ci register int i; 197cabdff1aSopenharmony_ci vec_u8 fperm; 198cabdff1aSopenharmony_ci LOAD_ZERO; 199cabdff1aSopenharmony_ci const vec_s32 vABCD = vec_ld(0, ABCD); 200cabdff1aSopenharmony_ci const vec_s16 vA = VEC_SPLAT16(vABCD, 1); 201cabdff1aSopenharmony_ci const vec_s16 vB = VEC_SPLAT16(vABCD, 3); 202cabdff1aSopenharmony_ci const vec_s16 vC = VEC_SPLAT16(vABCD, 5); 203cabdff1aSopenharmony_ci const vec_s16 vD = VEC_SPLAT16(vABCD, 7); 204cabdff1aSopenharmony_ci const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4)); 205cabdff1aSopenharmony_ci const vec_u16 v6us = vec_splat_u16(6); 206cabdff1aSopenharmony_ci 207cabdff1aSopenharmony_ci vec_u8 vsrcperm0, vsrcperm1; 208cabdff1aSopenharmony_ci vec_u8 vsrc0uc, vsrc1uc; 209cabdff1aSopenharmony_ci vec_s16 vsrc0ssH, vsrc1ssH; 210cabdff1aSopenharmony_ci vec_u8 vsrc2uc, vsrc3uc; 211cabdff1aSopenharmony_ci vec_s16 vsrc2ssH, vsrc3ssH, psum; 212cabdff1aSopenharmony_ci vec_u8 vdst, ppsum, vfdst, fsum; 213cabdff1aSopenharmony_ci#if HAVE_BIGENDIAN 214cabdff1aSopenharmony_ci register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; 215cabdff1aSopenharmony_ci register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; 216cabdff1aSopenharmony_ci vsrcperm0 = vec_lvsl(0, src); 217cabdff1aSopenharmony_ci vsrcperm1 = vec_lvsl(1, src); 218cabdff1aSopenharmony_ci#endif 219cabdff1aSopenharmony_ci 220cabdff1aSopenharmony_ci if (((unsigned long)dst) % 16 == 0) { 221cabdff1aSopenharmony_ci fperm = (vec_u8){0x10, 0x11, 0x12, 0x13, 222cabdff1aSopenharmony_ci 0x14, 0x15, 0x16, 0x17, 223cabdff1aSopenharmony_ci 0x08, 0x09, 0x0A, 0x0B, 224cabdff1aSopenharmony_ci 0x0C, 0x0D, 0x0E, 0x0F}; 225cabdff1aSopenharmony_ci } else { 226cabdff1aSopenharmony_ci fperm = (vec_u8){0x00, 0x01, 0x02, 0x03, 227cabdff1aSopenharmony_ci 0x04, 0x05, 0x06, 0x07, 228cabdff1aSopenharmony_ci 0x18, 0x19, 0x1A, 0x1B, 229cabdff1aSopenharmony_ci 0x1C, 0x1D, 0x1E, 0x1F}; 230cabdff1aSopenharmony_ci } 231cabdff1aSopenharmony_ci 232cabdff1aSopenharmony_ci GET_VSRC(vsrc0uc, vsrc1uc, 0, 16, vsrcperm0, vsrcperm1, src); 233cabdff1aSopenharmony_ci 234cabdff1aSopenharmony_ci vsrc0ssH = (vec_s16)VEC_MERGEH(zero_u8v, (vec_u8)vsrc0uc); 235cabdff1aSopenharmony_ci vsrc1ssH = (vec_s16)VEC_MERGEH(zero_u8v, (vec_u8)vsrc1uc); 236cabdff1aSopenharmony_ci 237cabdff1aSopenharmony_ci for (i = 0 ; i < h ; i++) { 238cabdff1aSopenharmony_ci GET_VSRC(vsrc2uc, vsrc3uc, stride, 16, vsrcperm0, vsrcperm1, src); 239cabdff1aSopenharmony_ci CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28); 240cabdff1aSopenharmony_ci } 241cabdff1aSopenharmony_ci} 242cabdff1aSopenharmony_ci#endif 243cabdff1aSopenharmony_ci 244cabdff1aSopenharmony_ci#undef noop 245cabdff1aSopenharmony_ci#undef add28 246cabdff1aSopenharmony_ci#undef CHROMA_MC8_ALTIVEC_CORE 247