/*
 * VP8 compatible video decoder
 *
 * Copyright (C) 2010 David Conrad
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/mem_internal.h"
#include "libavutil/ppc/cpu.h"
#include "libavutil/ppc/util_altivec.h"

#include "libavcodec/vp8dsp.h"

#include "hpeldsp_altivec.h"

#if HAVE_ALTIVEC
#define REPT4(...) { __VA_ARGS__, __VA_ARGS__, __VA_ARGS__, __VA_ARGS__ }

// h subpel filter uses msum to multiply+add 4 pixel taps at once
static const vec_s8 h_subpel_filters_inner[7] =
{
    REPT4( -6, 123,  12,  -1),
    REPT4(-11, 108,  36,  -8),
    REPT4( -9,  93,  50,  -6),
    REPT4(-16,  77,  77, -16),
    REPT4( -6,  50,  93,  -9),
    REPT4( -8,  36, 108, -11),
    REPT4( -1,  12, 123,  -6),
};

// for 6tap filters, these are the outer two taps
// The zeros mask off pixels 4-7 when filtering 0-3
// and vice-versa
static const vec_s8 h_subpel_filters_outer[3] =
{
    REPT4(0, 0, 2, 1),
    REPT4(0, 0, 3, 3),
    REPT4(0, 0, 1, 2),
};

#define LOAD_H_SUBPEL_FILTER(i) \
    vec_s8 filter_inner  = h_subpel_filters_inner[i]; \
    vec_s8 filter_outerh = h_subpel_filters_outer[(i)>>1]; \
    vec_s8 filter_outerl = vec_sld(filter_outerh, filter_outerh, 2)

#if HAVE_BIGENDIAN
#define GET_PIXHL(offset)                   \
    a = vec_ld((offset)-is6tap-1, src);     \
    b = vec_ld((offset)-is6tap-1+15, src);  \
    pixh = vec_perm(a, b, permh##offset);   \
    pixl = vec_perm(a, b, perml##offset)

#define GET_OUTER(offset) outer = vec_perm(a, b, perm_6tap##offset)
#else
#define GET_PIXHL(offset)                   \
    a = vec_vsx_ld((offset)-is6tap-1, src); \
    pixh = vec_perm(a, a, perm_inner);      \
    pixl = vec_perm(a, a, vec_add(perm_inner, vec_splat_u8(4)))

#define GET_OUTER(offset) outer = vec_perm(a, a, perm_outer)
#endif

#define FILTER_H(dstv, off) \
    GET_PIXHL(off);                            \
    filth = vec_msum(filter_inner, pixh, c64); \
    filtl = vec_msum(filter_inner, pixl, c64); \
\
    if (is6tap) { \
        GET_OUTER(off);                                \
        filth = vec_msum(filter_outerh, outer, filth); \
        filtl = vec_msum(filter_outerl, outer, filtl); \
    } \
    if (w == 4) \
        filtl = filth; /* discard pixels 4-7 */ \
    dstv = vec_packs(filth, filtl); \
    dstv = vec_sra(dstv, c7)

static av_always_inline
void put_vp8_epel_h_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
                                 uint8_t *src, ptrdiff_t src_stride,
                                 int h, int mx, int w, int is6tap)
{
    LOAD_H_SUBPEL_FILTER(mx-1);
#if HAVE_BIGENDIAN
    vec_u8 align_vec0, align_vec8, permh0, permh8;
    vec_u8 perm_6tap0, perm_6tap8, perml0, perml8;
    vec_u8 b;
#endif
    vec_u8 filt, a, pixh, pixl, outer;
    vec_s16 f16h, f16l;
    vec_s32 filth, filtl;
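
    // Added note: the inner permutes gather 4 consecutive source pixels per
    // output pixel so vec_msum can apply all 4 inner taps at once; the 6-tap
    // variant reads one byte further into the load because the load starts
    // one pixel earlier to also cover the outer taps.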
    vec_u8 perm_inner6 = { 1,2,3,4, 2,3,4,5, 3,4,5,6, 4,5,6,7 };
    vec_u8 perm_inner4 = { 0,1,2,3, 1,2,3,4, 2,3,4,5, 3,4,5,6 };
    vec_u8 perm_inner  = is6tap ? perm_inner6 : perm_inner4;
    vec_u8 perm_outer  = { 4,9, 0,5, 5,10, 1,6, 6,11, 2,7, 7,12, 3,8 };
    vec_s32 c64 = vec_sl(vec_splat_s32(1), vec_splat_u32(6));
    vec_u16 c7  = vec_splat_u16(7);

#if HAVE_BIGENDIAN
    align_vec0 = vec_lvsl( -is6tap-1, src);
    align_vec8 = vec_lvsl(8-is6tap-1, src);

    permh0     = vec_perm(align_vec0, align_vec0, perm_inner);
    permh8     = vec_perm(align_vec8, align_vec8, perm_inner);
    perm_inner = vec_add(perm_inner, vec_splat_u8(4));
    perml0     = vec_perm(align_vec0, align_vec0, perm_inner);
    perml8     = vec_perm(align_vec8, align_vec8, perm_inner);
    perm_6tap0 = vec_perm(align_vec0, align_vec0, perm_outer);
    perm_6tap8 = vec_perm(align_vec8, align_vec8, perm_outer);
#endif

    while (h --> 0) {
        FILTER_H(f16h, 0);

        if (w == 16) {
            FILTER_H(f16l, 8);
            filt = vec_packsu(f16h, f16l);
            vec_st(filt, 0, dst);
        } else {
            filt = vec_packsu(f16h, f16h);
            vec_ste((vec_u32)filt, 0, (uint32_t*)dst);
            if (w == 8)
                vec_ste((vec_u32)filt, 4, (uint32_t*)dst);
        }
        src += src_stride;
        dst += dst_stride;
    }
}

// v subpel filter does a simple vertical multiply + add
static const vec_u8 v_subpel_filters[7] =
{
    { 0,   6, 123,  12,   1,   0 },
    { 2,  11, 108,  36,   8,   1 },
    { 0,   9,  93,  50,   6,   0 },
    { 3,  16,  77,  77,  16,   3 },
    { 0,   6,  50,  93,   9,   0 },
    { 1,   8,  36, 108,  11,   2 },
    { 0,   1,  12, 123,   6,   0 },
};

#define LOAD_V_SUBPEL_FILTER(i) \
    vec_u8 subpel_filter = v_subpel_filters[i]; \
    vec_u8 f0 = vec_splat(subpel_filter, 0); \
    vec_u8 f1 = vec_splat(subpel_filter, 1); \
    vec_u8 f2 = vec_splat(subpel_filter, 2); \
    vec_u8 f3 = vec_splat(subpel_filter, 3); \
    vec_u8 f4 = vec_splat(subpel_filter, 4); \
    vec_u8 f5 = vec_splat(subpel_filter, 5)

#define FILTER_V(dstv, vec_mul) \
    s1f = (vec_s16)vec_mul(s1, f1); \
    s2f = (vec_s16)vec_mul(s2, f2); \
    s3f = (vec_s16)vec_mul(s3, f3); \
    s4f = (vec_s16)vec_mul(s4, f4); \
    s2f = vec_subs(s2f, s1f); \
    s3f = vec_subs(s3f, s4f); \
    if (is6tap) { \
        s0f = (vec_s16)vec_mul(s0, f0); \
        s5f = (vec_s16)vec_mul(s5, f5); \
        s2f = vec_adds(s2f, s0f); \
        s3f = vec_adds(s3f, s5f); \
    } \
    dstv = vec_adds(s2f, s3f); \
    dstv = vec_adds(dstv, c64); \
    dstv = vec_sra(dstv, c7)

#if HAVE_BIGENDIAN
#define LOAD_HL(off, s, perm) load_with_perm_vec(off, s, perm)
#else
#define LOAD_HL(off, s, perm) vec_mergeh(vec_vsx_ld(off,s), vec_vsx_ld(off+8,s))
#endif

static av_always_inline
void put_vp8_epel_v_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
                                 uint8_t *src, ptrdiff_t src_stride,
                                 int h, int my, int w, int is6tap)
{
    LOAD_V_SUBPEL_FILTER(my-1);
    vec_u8 s0, s1, s2, s3, s4, s5, filt, align_vech, perm_vec, align_vecl;
    vec_s16 s0f, s1f, s2f, s3f, s4f, s5f, f16h, f16l;
    vec_s16 c64 = vec_sl(vec_splat_s16(1), vec_splat_u16(6));
    vec_u16 c7  = vec_splat_u16(7);

#if HAVE_BIGENDIAN
    // we want pixels 0-7 to be in the even positions and 8-15 in the odd,
    // so combine this permute with the alignment permute vector
    align_vech = vec_lvsl(0, src);
    align_vecl = vec_sld(align_vech, align_vech, 8);
    if (w == 16)
        perm_vec = vec_mergeh(align_vech, align_vecl);
    else
        perm_vec = vec_mergeh(align_vech, align_vech);
#endif
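
    // Added note: prime the sliding window of source rows; each pass of the
    // loop below loads one new row (into s5 for 6-tap, s4 for 4-tap) and
    // shifts the older rows along.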
    if (is6tap)
        s0 = LOAD_HL(-2*src_stride, src, perm_vec);
    s1 = LOAD_HL(-1*src_stride, src, perm_vec);
    s2 = LOAD_HL( 0*src_stride, src, perm_vec);
    s3 = LOAD_HL( 1*src_stride, src, perm_vec);
    if (is6tap)
        s4 = LOAD_HL( 2*src_stride, src, perm_vec);

    src += (2+is6tap)*src_stride;

    while (h --> 0) {
        if (is6tap)
            s5 = LOAD_HL(0, src, perm_vec);
        else
            s4 = LOAD_HL(0, src, perm_vec);

        FILTER_V(f16h, vec_mule);

        if (w == 16) {
            FILTER_V(f16l, vec_mulo);
            filt = vec_packsu(f16h, f16l);
            vec_st(filt, 0, dst);
        } else {
            filt = vec_packsu(f16h, f16h);
            if (w == 4)
                filt = (vec_u8)vec_splat((vec_u32)filt, 0);
            else
                vec_ste((vec_u32)filt, 4, (uint32_t*)dst);
            vec_ste((vec_u32)filt, 0, (uint32_t*)dst);
        }

        if (is6tap)
            s0 = s1;
        s1 = s2;
        s2 = s3;
        s3 = s4;
        if (is6tap)
            s4 = s5;

        dst += dst_stride;
        src += src_stride;
    }
}

#define EPEL_FUNCS(WIDTH, TAPS) \
static av_noinline \
void put_vp8_epel ## WIDTH ## _h ## TAPS ## _altivec(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int h, int mx, int my) \
{ \
    put_vp8_epel_h_altivec_core(dst, dst_stride, src, src_stride, h, mx, WIDTH, TAPS == 6); \
} \
\
static av_noinline \
void put_vp8_epel ## WIDTH ## _v ## TAPS ## _altivec(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int h, int mx, int my) \
{ \
    put_vp8_epel_v_altivec_core(dst, dst_stride, src, src_stride, h, my, WIDTH, TAPS == 6); \
}

#define EPEL_HV(WIDTH, HTAPS, VTAPS) \
static void put_vp8_epel ## WIDTH ## _h ## HTAPS ## v ## VTAPS ## _altivec(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my) \
{ \
    DECLARE_ALIGNED(16, uint8_t, tmp)[(2*WIDTH+5)*16]; \
    if (VTAPS == 6) { \
        put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16, src-2*sstride, sstride, h+5, mx, my); \
        put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, dstride, tmp+2*16, 16, h, mx, my); \
    } else { \
        put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16, src-sstride, sstride, h+4, mx, my); \
        put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, dstride, tmp+16, 16, h, mx, my); \
    } \
}

EPEL_FUNCS(16,6)
EPEL_FUNCS(8, 6)
EPEL_FUNCS(8, 4)
EPEL_FUNCS(4, 6)
EPEL_FUNCS(4, 4)

EPEL_HV(16, 6,6)
EPEL_HV(8, 6,6)
EPEL_HV(8, 4,6)
EPEL_HV(8, 6,4)
EPEL_HV(8, 4,4)
EPEL_HV(4, 6,6)
EPEL_HV(4, 4,6)
EPEL_HV(4, 6,4)
EPEL_HV(4, 4,4)
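
// Added note: full-pel copy of a 16-pixel-wide block; rows are fetched with
// load_with_perm_vec (using the lvsl alignment permute set up below on
// big-endian targets, so unaligned sources are handled) and stored one whole
// vector at a time.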
static void put_vp8_pixels16_altivec(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my)
{
    register vector unsigned char perm;
    int i;
    register ptrdiff_t dstride2 = dstride << 1, sstride2 = sstride << 1;
    register ptrdiff_t dstride3 = dstride2 + dstride, sstride3 = sstride + sstride2;
    register ptrdiff_t dstride4 = dstride << 2, sstride4 = sstride << 2;

#if HAVE_BIGENDIAN
    perm = vec_lvsl(0, src);
#endif
// hand-unrolling the loop by 4 gains about 15%
// minimum execution time goes from 74 to 60 cycles
// it's faster than -funroll-loops, but using
// -funroll-loops w/ this is bad - 74 cycles again.
// all this is on a 7450, tuning for the 7450
    for (i = 0; i < h; i += 4) {
        vec_st(load_with_perm_vec(0, src, perm), 0, dst);
        vec_st(load_with_perm_vec(sstride, src, perm), dstride, dst);
        vec_st(load_with_perm_vec(sstride2, src, perm), dstride2, dst);
        vec_st(load_with_perm_vec(sstride3, src, perm), dstride3, dst);
        src += sstride4;
        dst += dstride4;
    }
}

#endif /* HAVE_ALTIVEC */


av_cold void ff_vp78dsp_init_ppc(VP8DSPContext *c)
{
#if HAVE_ALTIVEC
    if (!PPC_ALTIVEC(av_get_cpu_flags()))
        return;

    c->put_vp8_epel_pixels_tab[0][0][0] = put_vp8_pixels16_altivec;
    c->put_vp8_epel_pixels_tab[0][0][2] = put_vp8_epel16_h6_altivec;
    c->put_vp8_epel_pixels_tab[0][2][0] = put_vp8_epel16_v6_altivec;
    c->put_vp8_epel_pixels_tab[0][2][2] = put_vp8_epel16_h6v6_altivec;

    c->put_vp8_epel_pixels_tab[1][0][2] = put_vp8_epel8_h6_altivec;
    c->put_vp8_epel_pixels_tab[1][2][0] = put_vp8_epel8_v6_altivec;
    c->put_vp8_epel_pixels_tab[1][0][1] = put_vp8_epel8_h4_altivec;
    c->put_vp8_epel_pixels_tab[1][1][0] = put_vp8_epel8_v4_altivec;

    c->put_vp8_epel_pixels_tab[1][2][2] = put_vp8_epel8_h6v6_altivec;
    c->put_vp8_epel_pixels_tab[1][1][1] = put_vp8_epel8_h4v4_altivec;
    c->put_vp8_epel_pixels_tab[1][1][2] = put_vp8_epel8_h6v4_altivec;
    c->put_vp8_epel_pixels_tab[1][2][1] = put_vp8_epel8_h4v6_altivec;

    c->put_vp8_epel_pixels_tab[2][0][2] = put_vp8_epel4_h6_altivec;
    c->put_vp8_epel_pixels_tab[2][2][0] = put_vp8_epel4_v6_altivec;
    c->put_vp8_epel_pixels_tab[2][0][1] = put_vp8_epel4_h4_altivec;
    c->put_vp8_epel_pixels_tab[2][1][0] = put_vp8_epel4_v4_altivec;

    c->put_vp8_epel_pixels_tab[2][2][2] = put_vp8_epel4_h6v6_altivec;
    c->put_vp8_epel_pixels_tab[2][1][1] = put_vp8_epel4_h4v4_altivec;
    c->put_vp8_epel_pixels_tab[2][1][2] = put_vp8_epel4_h6v4_altivec;
    c->put_vp8_epel_pixels_tab[2][2][1] = put_vp8_epel4_h4v6_altivec;
#endif /* HAVE_ALTIVEC */
}