/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mem_internal.h"
#include "libavutil/ppc/util_altivec.h"

/* this code assumes that stride % 16 == 0 */

/* One output row of the 8-wide bilinear chroma filter:
 * psum = A*s(0,0) + B*s(1,0) + C*s(0,1) + D*s(1,1) (+ bias), shifted right
 * by 6, packed back to bytes, merged with the destination through fperm
 * and stored. */
#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
        vsrc2ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc2uc);\
        vsrc3ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc3uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
        psum = vec_mladd(vB, vsrc1ssH, psum);\
        psum = vec_mladd(vC, vsrc2ssH, psum);\
        psum = vec_mladd(vD, vsrc3ssH, psum);\
        psum = BIAS2(psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        vsrc0ssH = vsrc2ssH;\
        vsrc1ssH = vsrc3ssH;\
\
        dst += stride;\
        src += stride;

/* Two-tap variant, used when either the horizontal or the vertical
 * fraction is zero and only two of the four weights are non-zero. */
#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
\
        vsrc0ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc0uc);\
        vsrc1ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc1uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, v32ss);\
        psum = vec_mladd(vE, vsrc1ssH, psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        dst += stride;\
        src += stride;

/* rounding helpers: the H.264 path adds its bias of 32 up front (noop here),
 * the VC-1 no-rounding path adds 28 after the multiply-adds */
#define noop(a) a
#define add28(a) vec_add(v28ss, a)

#if HAVE_BIGENDIAN
#define GET_VSRC1(vs0, off, b, perm0, s){    \
    vec_u8 vsrcCuc, vsrcDuc;                 \
    vsrcCuc = vec_ld(off, s);                \
    if (loadSecond){                         \
        vsrcDuc = vec_ld(off + b, s);        \
    } else                                   \
        vsrcDuc = vsrcCuc;                   \
                                             \
    vs0 = vec_perm(vsrcCuc, vsrcDuc, perm0); \
}
#define GET_VSRC(vs0, vs1, off, b, perm0, perm1, s){ \
    vec_u8 vsrcCuc, vsrcDuc;                     \
    vsrcCuc = vec_ld(off, s);                    \
    if (loadSecond){                             \
        vsrcDuc = vec_ld(off + b, s);            \
    } else                                       \
        vsrcDuc = vsrcCuc;                       \
                                                 \
    vs0 = vec_perm(vsrcCuc, vsrcDuc, perm0);     \
    if (reallyBadAlign){                         \
        vs1 = vsrcDuc;                           \
    } else                                       \
        vs1 = vec_perm(vsrcCuc, vsrcDuc, perm1); \
}

#else

#define GET_VSRC1(vs0, off, b, perm0, s){ \
    vs0 = vec_vsx_ld(off, s);             \
}
#define GET_VSRC(vs0, vs1, off, b, perm0, perm1, s){ \
    vs0 = vec_vsx_ld(off, s);                        \
    vs1 = vec_vsx_ld(off + 1, s);                    \
}
#endif /* HAVE_BIGENDIAN */

#ifdef PREFIX_h264_chroma_mc8_altivec
static void PREFIX_h264_chroma_mc8_altivec(uint8_t *dst, uint8_t *src,
                                           ptrdiff_t stride, int h,
                                           int x, int y)
{
    /* bilinear weights: A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy */
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    LOAD_ZERO;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = VEC_SPLAT16(vABCD, 1);
    const vec_s16 vB = VEC_SPLAT16(vABCD, 3);
    const vec_s16 vC = VEC_SPLAT16(vABCD, 5);
    const vec_s16 vD = VEC_SPLAT16(vABCD, 7);
    const vec_s16 v32ss = vec_sl(vec_splat_s16(1), vec_splat_u16(5));
    const vec_u16 v6us = vec_splat_u16(6);

    vec_u8 vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;
#if HAVE_BIGENDIAN
    register int loadSecond     = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);
#endif

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    GET_VSRC(vsrc0uc, vsrc1uc, 0, 16, vsrcperm0, vsrcperm1, src);

    vsrc0ssH = (vec_s16)VEC_MERGEH(zero_u8v, (vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)VEC_MERGEH(zero_u8v, (vec_u8)vsrc1uc);

    if (ABCD[3]) {
        for (i = 0 ; i < h ; i++) {
            GET_VSRC(vsrc2uc, vsrc3uc, stride, 16, vsrcperm0, vsrcperm1, src);
            CHROMA_MC8_ALTIVEC_CORE(v32ss, noop);
        }
    } else {
        const vec_s16 vE = vec_add(vB, vC);
        if (ABCD[2]) { // x == 0 B == 0
            for (i = 0 ; i < h ; i++) {
                GET_VSRC1(vsrc1uc, stride, 15, vsrcperm0, src);
                CHROMA_MC8_ALTIVEC_CORE_SIMPLE;
                vsrc0uc = vsrc1uc;
            }
        } else { // y == 0 C == 0
            for (i = 0 ; i < h ; i++) {
                GET_VSRC(vsrc0uc, vsrc1uc, 0, 15, vsrcperm0, vsrcperm1, src);
                CHROMA_MC8_ALTIVEC_CORE_SIMPLE;
            }
        }
    }
}
#endif

/* this code assumes that stride % 16 == 0 */
#ifdef PREFIX_no_rnd_vc1_chroma_mc8_altivec
static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t *dst, uint8_t *src,
                                                 ptrdiff_t stride, int h,
                                                 int x, int y)
{
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    LOAD_ZERO;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = VEC_SPLAT16(vABCD, 1);
    const vec_s16 vB = VEC_SPLAT16(vABCD, 3);
    const vec_s16 vC = VEC_SPLAT16(vABCD, 5);
    const vec_s16 vD = VEC_SPLAT16(vABCD, 7);
    const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1), vec_splat_u16(5)), vec_splat_s16(4));
    const vec_u16 v6us = vec_splat_u16(6);

    vec_u8 vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;
#if HAVE_BIGENDIAN
    register int loadSecond     = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);
#endif

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    GET_VSRC(vsrc0uc, vsrc1uc, 0, 16, vsrcperm0, vsrcperm1, src);

    vsrc0ssH = (vec_s16)VEC_MERGEH(zero_u8v, (vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)VEC_MERGEH(zero_u8v, (vec_u8)vsrc1uc);

    for (i = 0 ; i < h ; i++) {
        GET_VSRC(vsrc2uc, vsrc3uc, stride, 16, vsrcperm0, vsrcperm1, src);
        CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28);
    }
}
#endif

#undef noop
#undef add28
#undef CHROMA_MC8_ALTIVEC_CORE
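
/*
 * Usage sketch (illustrative, not taken from this file): the template above
 * expects the including translation unit to define OP_U8_ALTIVEC (the store
 * operation applied before vec_st) and the PREFIX_* function names before
 * including it, once per desired variant. The macro PUT_OP_U8_ALTIVEC, the
 * function name put_h264_chroma_mc8_altivec and the include path below are
 * assumptions for illustration only.
 *
 *     #define PUT_OP_U8_ALTIVEC(d, s, dst)   d = s   // "put": overwrite dst
 *     #define OP_U8_ALTIVEC                  PUT_OP_U8_ALTIVEC
 *     #define PREFIX_h264_chroma_mc8_altivec put_h264_chroma_mc8_altivec
 *     #include "h264chroma_template.c"   // hypothetical path to this file
 *     #undef OP_U8_ALTIVEC
 *     #undef PREFIX_h264_chroma_mc8_altivec
 */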