1cabdff1aSopenharmony_ci/* 2cabdff1aSopenharmony_ci * VP8 compatible video decoder 3cabdff1aSopenharmony_ci * 4cabdff1aSopenharmony_ci * Copyright (C) 2010 David Conrad 5cabdff1aSopenharmony_ci * 6cabdff1aSopenharmony_ci * This file is part of FFmpeg. 7cabdff1aSopenharmony_ci * 8cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 9cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 10cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 11cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 12cabdff1aSopenharmony_ci * 13cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 14cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 15cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16cabdff1aSopenharmony_ci * Lesser General Public License for more details. 
17cabdff1aSopenharmony_ci * 18cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 19cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 20cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 21cabdff1aSopenharmony_ci */ 22cabdff1aSopenharmony_ci 23cabdff1aSopenharmony_ci#include "config.h" 24cabdff1aSopenharmony_ci 25cabdff1aSopenharmony_ci#include "libavutil/attributes.h" 26cabdff1aSopenharmony_ci#include "libavutil/cpu.h" 27cabdff1aSopenharmony_ci#include "libavutil/mem_internal.h" 28cabdff1aSopenharmony_ci#include "libavutil/ppc/cpu.h" 29cabdff1aSopenharmony_ci#include "libavutil/ppc/util_altivec.h" 30cabdff1aSopenharmony_ci 31cabdff1aSopenharmony_ci#include "libavcodec/vp8dsp.h" 32cabdff1aSopenharmony_ci 33cabdff1aSopenharmony_ci#include "hpeldsp_altivec.h" 34cabdff1aSopenharmony_ci 35cabdff1aSopenharmony_ci#if HAVE_ALTIVEC 36cabdff1aSopenharmony_ci#define REPT4(...) 
{ __VA_ARGS__, __VA_ARGS__, __VA_ARGS__, __VA_ARGS__ } 37cabdff1aSopenharmony_ci 38cabdff1aSopenharmony_ci// h subpel filter uses msum to multiply+add 4 pixel taps at once 39cabdff1aSopenharmony_cistatic const vec_s8 h_subpel_filters_inner[7] = 40cabdff1aSopenharmony_ci{ 41cabdff1aSopenharmony_ci REPT4( -6, 123, 12, -1), 42cabdff1aSopenharmony_ci REPT4(-11, 108, 36, -8), 43cabdff1aSopenharmony_ci REPT4( -9, 93, 50, -6), 44cabdff1aSopenharmony_ci REPT4(-16, 77, 77, -16), 45cabdff1aSopenharmony_ci REPT4( -6, 50, 93, -9), 46cabdff1aSopenharmony_ci REPT4( -8, 36, 108, -11), 47cabdff1aSopenharmony_ci REPT4( -1, 12, 123, -6), 48cabdff1aSopenharmony_ci}; 49cabdff1aSopenharmony_ci 50cabdff1aSopenharmony_ci// for 6tap filters, these are the outer two taps 51cabdff1aSopenharmony_ci// The zeros mask off pixels 4-7 when filtering 0-3 52cabdff1aSopenharmony_ci// and vice-versa 53cabdff1aSopenharmony_cistatic const vec_s8 h_subpel_filters_outer[3] = 54cabdff1aSopenharmony_ci{ 55cabdff1aSopenharmony_ci REPT4(0, 0, 2, 1), 56cabdff1aSopenharmony_ci REPT4(0, 0, 3, 3), 57cabdff1aSopenharmony_ci REPT4(0, 0, 1, 2), 58cabdff1aSopenharmony_ci}; 59cabdff1aSopenharmony_ci 60cabdff1aSopenharmony_ci#define LOAD_H_SUBPEL_FILTER(i) \ 61cabdff1aSopenharmony_ci vec_s8 filter_inner = h_subpel_filters_inner[i]; \ 62cabdff1aSopenharmony_ci vec_s8 filter_outerh = h_subpel_filters_outer[(i)>>1]; \ 63cabdff1aSopenharmony_ci vec_s8 filter_outerl = vec_sld(filter_outerh, filter_outerh, 2) 64cabdff1aSopenharmony_ci 65cabdff1aSopenharmony_ci#if HAVE_BIGENDIAN 66cabdff1aSopenharmony_ci#define GET_PIXHL(offset) \ 67cabdff1aSopenharmony_ci a = vec_ld((offset)-is6tap-1, src); \ 68cabdff1aSopenharmony_ci b = vec_ld((offset)-is6tap-1+15, src); \ 69cabdff1aSopenharmony_ci pixh = vec_perm(a, b, permh##offset); \ 70cabdff1aSopenharmony_ci pixl = vec_perm(a, b, perml##offset) 71cabdff1aSopenharmony_ci 72cabdff1aSopenharmony_ci#define GET_OUTER(offset) outer = vec_perm(a, b, perm_6tap##offset) 
73cabdff1aSopenharmony_ci#else 74cabdff1aSopenharmony_ci#define GET_PIXHL(offset) \ 75cabdff1aSopenharmony_ci a = vec_vsx_ld((offset)-is6tap-1, src); \ 76cabdff1aSopenharmony_ci pixh = vec_perm(a, a, perm_inner); \ 77cabdff1aSopenharmony_ci pixl = vec_perm(a, a, vec_add(perm_inner, vec_splat_u8(4))) 78cabdff1aSopenharmony_ci 79cabdff1aSopenharmony_ci#define GET_OUTER(offset) outer = vec_perm(a, a, perm_outer) 80cabdff1aSopenharmony_ci#endif 81cabdff1aSopenharmony_ci 82cabdff1aSopenharmony_ci#define FILTER_H(dstv, off) \ 83cabdff1aSopenharmony_ci GET_PIXHL(off); \ 84cabdff1aSopenharmony_ci filth = vec_msum(filter_inner, pixh, c64); \ 85cabdff1aSopenharmony_ci filtl = vec_msum(filter_inner, pixl, c64); \ 86cabdff1aSopenharmony_ci\ 87cabdff1aSopenharmony_ci if (is6tap) { \ 88cabdff1aSopenharmony_ci GET_OUTER(off); \ 89cabdff1aSopenharmony_ci filth = vec_msum(filter_outerh, outer, filth); \ 90cabdff1aSopenharmony_ci filtl = vec_msum(filter_outerl, outer, filtl); \ 91cabdff1aSopenharmony_ci } \ 92cabdff1aSopenharmony_ci if (w == 4) \ 93cabdff1aSopenharmony_ci filtl = filth; /* discard pixels 4-7 */ \ 94cabdff1aSopenharmony_ci dstv = vec_packs(filth, filtl); \ 95cabdff1aSopenharmony_ci dstv = vec_sra(dstv, c7) 96cabdff1aSopenharmony_ci 97cabdff1aSopenharmony_cistatic av_always_inline 98cabdff1aSopenharmony_civoid put_vp8_epel_h_altivec_core(uint8_t *dst, ptrdiff_t dst_stride, 99cabdff1aSopenharmony_ci uint8_t *src, ptrdiff_t src_stride, 100cabdff1aSopenharmony_ci int h, int mx, int w, int is6tap) 101cabdff1aSopenharmony_ci{ 102cabdff1aSopenharmony_ci LOAD_H_SUBPEL_FILTER(mx-1); 103cabdff1aSopenharmony_ci#if HAVE_BIGENDIAN 104cabdff1aSopenharmony_ci vec_u8 align_vec0, align_vec8, permh0, permh8; 105cabdff1aSopenharmony_ci vec_u8 perm_6tap0, perm_6tap8, perml0, perml8; 106cabdff1aSopenharmony_ci vec_u8 b; 107cabdff1aSopenharmony_ci#endif 108cabdff1aSopenharmony_ci vec_u8 filt, a, pixh, pixl, outer; 109cabdff1aSopenharmony_ci vec_s16 f16h, f16l; 110cabdff1aSopenharmony_ci 
vec_s32 filth, filtl; 111cabdff1aSopenharmony_ci 112cabdff1aSopenharmony_ci vec_u8 perm_inner6 = { 1,2,3,4, 2,3,4,5, 3,4,5,6, 4,5,6,7 }; 113cabdff1aSopenharmony_ci vec_u8 perm_inner4 = { 0,1,2,3, 1,2,3,4, 2,3,4,5, 3,4,5,6 }; 114cabdff1aSopenharmony_ci vec_u8 perm_inner = is6tap ? perm_inner6 : perm_inner4; 115cabdff1aSopenharmony_ci vec_u8 perm_outer = { 4,9, 0,5, 5,10, 1,6, 6,11, 2,7, 7,12, 3,8 }; 116cabdff1aSopenharmony_ci vec_s32 c64 = vec_sl(vec_splat_s32(1), vec_splat_u32(6)); 117cabdff1aSopenharmony_ci vec_u16 c7 = vec_splat_u16(7); 118cabdff1aSopenharmony_ci 119cabdff1aSopenharmony_ci#if HAVE_BIGENDIAN 120cabdff1aSopenharmony_ci align_vec0 = vec_lvsl( -is6tap-1, src); 121cabdff1aSopenharmony_ci align_vec8 = vec_lvsl(8-is6tap-1, src); 122cabdff1aSopenharmony_ci 123cabdff1aSopenharmony_ci permh0 = vec_perm(align_vec0, align_vec0, perm_inner); 124cabdff1aSopenharmony_ci permh8 = vec_perm(align_vec8, align_vec8, perm_inner); 125cabdff1aSopenharmony_ci perm_inner = vec_add(perm_inner, vec_splat_u8(4)); 126cabdff1aSopenharmony_ci perml0 = vec_perm(align_vec0, align_vec0, perm_inner); 127cabdff1aSopenharmony_ci perml8 = vec_perm(align_vec8, align_vec8, perm_inner); 128cabdff1aSopenharmony_ci perm_6tap0 = vec_perm(align_vec0, align_vec0, perm_outer); 129cabdff1aSopenharmony_ci perm_6tap8 = vec_perm(align_vec8, align_vec8, perm_outer); 130cabdff1aSopenharmony_ci#endif 131cabdff1aSopenharmony_ci 132cabdff1aSopenharmony_ci while (h --> 0) { 133cabdff1aSopenharmony_ci FILTER_H(f16h, 0); 134cabdff1aSopenharmony_ci 135cabdff1aSopenharmony_ci if (w == 16) { 136cabdff1aSopenharmony_ci FILTER_H(f16l, 8); 137cabdff1aSopenharmony_ci filt = vec_packsu(f16h, f16l); 138cabdff1aSopenharmony_ci vec_st(filt, 0, dst); 139cabdff1aSopenharmony_ci } else { 140cabdff1aSopenharmony_ci filt = vec_packsu(f16h, f16h); 141cabdff1aSopenharmony_ci vec_ste((vec_u32)filt, 0, (uint32_t*)dst); 142cabdff1aSopenharmony_ci if (w == 8) 143cabdff1aSopenharmony_ci vec_ste((vec_u32)filt, 4, 
(uint32_t*)dst); 144cabdff1aSopenharmony_ci } 145cabdff1aSopenharmony_ci src += src_stride; 146cabdff1aSopenharmony_ci dst += dst_stride; 147cabdff1aSopenharmony_ci } 148cabdff1aSopenharmony_ci} 149cabdff1aSopenharmony_ci 150cabdff1aSopenharmony_ci// v subpel filter does a simple vertical multiply + add 151cabdff1aSopenharmony_cistatic const vec_u8 v_subpel_filters[7] = 152cabdff1aSopenharmony_ci{ 153cabdff1aSopenharmony_ci { 0, 6, 123, 12, 1, 0 }, 154cabdff1aSopenharmony_ci { 2, 11, 108, 36, 8, 1 }, 155cabdff1aSopenharmony_ci { 0, 9, 93, 50, 6, 0 }, 156cabdff1aSopenharmony_ci { 3, 16, 77, 77, 16, 3 }, 157cabdff1aSopenharmony_ci { 0, 6, 50, 93, 9, 0 }, 158cabdff1aSopenharmony_ci { 1, 8, 36, 108, 11, 2 }, 159cabdff1aSopenharmony_ci { 0, 1, 12, 123, 6, 0 }, 160cabdff1aSopenharmony_ci}; 161cabdff1aSopenharmony_ci 162cabdff1aSopenharmony_ci#define LOAD_V_SUBPEL_FILTER(i) \ 163cabdff1aSopenharmony_ci vec_u8 subpel_filter = v_subpel_filters[i]; \ 164cabdff1aSopenharmony_ci vec_u8 f0 = vec_splat(subpel_filter, 0); \ 165cabdff1aSopenharmony_ci vec_u8 f1 = vec_splat(subpel_filter, 1); \ 166cabdff1aSopenharmony_ci vec_u8 f2 = vec_splat(subpel_filter, 2); \ 167cabdff1aSopenharmony_ci vec_u8 f3 = vec_splat(subpel_filter, 3); \ 168cabdff1aSopenharmony_ci vec_u8 f4 = vec_splat(subpel_filter, 4); \ 169cabdff1aSopenharmony_ci vec_u8 f5 = vec_splat(subpel_filter, 5) 170cabdff1aSopenharmony_ci 171cabdff1aSopenharmony_ci#define FILTER_V(dstv, vec_mul) \ 172cabdff1aSopenharmony_ci s1f = (vec_s16)vec_mul(s1, f1); \ 173cabdff1aSopenharmony_ci s2f = (vec_s16)vec_mul(s2, f2); \ 174cabdff1aSopenharmony_ci s3f = (vec_s16)vec_mul(s3, f3); \ 175cabdff1aSopenharmony_ci s4f = (vec_s16)vec_mul(s4, f4); \ 176cabdff1aSopenharmony_ci s2f = vec_subs(s2f, s1f); \ 177cabdff1aSopenharmony_ci s3f = vec_subs(s3f, s4f); \ 178cabdff1aSopenharmony_ci if (is6tap) { \ 179cabdff1aSopenharmony_ci s0f = (vec_s16)vec_mul(s0, f0); \ 180cabdff1aSopenharmony_ci s5f = (vec_s16)vec_mul(s5, f5); \ 
181cabdff1aSopenharmony_ci s2f = vec_adds(s2f, s0f); \ 182cabdff1aSopenharmony_ci s3f = vec_adds(s3f, s5f); \ 183cabdff1aSopenharmony_ci } \ 184cabdff1aSopenharmony_ci dstv = vec_adds(s2f, s3f); \ 185cabdff1aSopenharmony_ci dstv = vec_adds(dstv, c64); \ 186cabdff1aSopenharmony_ci dstv = vec_sra(dstv, c7) 187cabdff1aSopenharmony_ci 188cabdff1aSopenharmony_ci#if HAVE_BIGENDIAN 189cabdff1aSopenharmony_ci#define LOAD_HL(off, s, perm) load_with_perm_vec(off, s, perm) 190cabdff1aSopenharmony_ci#else 191cabdff1aSopenharmony_ci#define LOAD_HL(off, s, perm) vec_mergeh(vec_vsx_ld(off,s), vec_vsx_ld(off+8,s)) 192cabdff1aSopenharmony_ci#endif 193cabdff1aSopenharmony_ci 194cabdff1aSopenharmony_cistatic av_always_inline 195cabdff1aSopenharmony_civoid put_vp8_epel_v_altivec_core(uint8_t *dst, ptrdiff_t dst_stride, 196cabdff1aSopenharmony_ci uint8_t *src, ptrdiff_t src_stride, 197cabdff1aSopenharmony_ci int h, int my, int w, int is6tap) 198cabdff1aSopenharmony_ci{ 199cabdff1aSopenharmony_ci LOAD_V_SUBPEL_FILTER(my-1); 200cabdff1aSopenharmony_ci vec_u8 s0, s1, s2, s3, s4, s5, filt, align_vech, perm_vec, align_vecl; 201cabdff1aSopenharmony_ci vec_s16 s0f, s1f, s2f, s3f, s4f, s5f, f16h, f16l; 202cabdff1aSopenharmony_ci vec_s16 c64 = vec_sl(vec_splat_s16(1), vec_splat_u16(6)); 203cabdff1aSopenharmony_ci vec_u16 c7 = vec_splat_u16(7); 204cabdff1aSopenharmony_ci 205cabdff1aSopenharmony_ci#if HAVE_BIGENDIAN 206cabdff1aSopenharmony_ci // we want pixels 0-7 to be in the even positions and 8-15 in the odd, 207cabdff1aSopenharmony_ci // so combine this permute with the alignment permute vector 208cabdff1aSopenharmony_ci align_vech = vec_lvsl(0, src); 209cabdff1aSopenharmony_ci align_vecl = vec_sld(align_vech, align_vech, 8); 210cabdff1aSopenharmony_ci if (w ==16) 211cabdff1aSopenharmony_ci perm_vec = vec_mergeh(align_vech, align_vecl); 212cabdff1aSopenharmony_ci else 213cabdff1aSopenharmony_ci perm_vec = vec_mergeh(align_vech, align_vech); 214cabdff1aSopenharmony_ci#endif 
215cabdff1aSopenharmony_ci 216cabdff1aSopenharmony_ci if (is6tap) 217cabdff1aSopenharmony_ci s0 = LOAD_HL(-2*src_stride, src, perm_vec); 218cabdff1aSopenharmony_ci s1 = LOAD_HL(-1*src_stride, src, perm_vec); 219cabdff1aSopenharmony_ci s2 = LOAD_HL( 0*src_stride, src, perm_vec); 220cabdff1aSopenharmony_ci s3 = LOAD_HL( 1*src_stride, src, perm_vec); 221cabdff1aSopenharmony_ci if (is6tap) 222cabdff1aSopenharmony_ci s4 = LOAD_HL( 2*src_stride, src, perm_vec); 223cabdff1aSopenharmony_ci 224cabdff1aSopenharmony_ci src += (2+is6tap)*src_stride; 225cabdff1aSopenharmony_ci 226cabdff1aSopenharmony_ci while (h --> 0) { 227cabdff1aSopenharmony_ci if (is6tap) 228cabdff1aSopenharmony_ci s5 = LOAD_HL(0, src, perm_vec); 229cabdff1aSopenharmony_ci else 230cabdff1aSopenharmony_ci s4 = LOAD_HL(0, src, perm_vec); 231cabdff1aSopenharmony_ci 232cabdff1aSopenharmony_ci FILTER_V(f16h, vec_mule); 233cabdff1aSopenharmony_ci 234cabdff1aSopenharmony_ci if (w == 16) { 235cabdff1aSopenharmony_ci FILTER_V(f16l, vec_mulo); 236cabdff1aSopenharmony_ci filt = vec_packsu(f16h, f16l); 237cabdff1aSopenharmony_ci vec_st(filt, 0, dst); 238cabdff1aSopenharmony_ci } else { 239cabdff1aSopenharmony_ci filt = vec_packsu(f16h, f16h); 240cabdff1aSopenharmony_ci if (w == 4) 241cabdff1aSopenharmony_ci filt = (vec_u8)vec_splat((vec_u32)filt, 0); 242cabdff1aSopenharmony_ci else 243cabdff1aSopenharmony_ci vec_ste((vec_u32)filt, 4, (uint32_t*)dst); 244cabdff1aSopenharmony_ci vec_ste((vec_u32)filt, 0, (uint32_t*)dst); 245cabdff1aSopenharmony_ci } 246cabdff1aSopenharmony_ci 247cabdff1aSopenharmony_ci if (is6tap) 248cabdff1aSopenharmony_ci s0 = s1; 249cabdff1aSopenharmony_ci s1 = s2; 250cabdff1aSopenharmony_ci s2 = s3; 251cabdff1aSopenharmony_ci s3 = s4; 252cabdff1aSopenharmony_ci if (is6tap) 253cabdff1aSopenharmony_ci s4 = s5; 254cabdff1aSopenharmony_ci 255cabdff1aSopenharmony_ci dst += dst_stride; 256cabdff1aSopenharmony_ci src += src_stride; 257cabdff1aSopenharmony_ci } 258cabdff1aSopenharmony_ci} 

// Defines put_vp8_epel<WIDTH>_h<TAPS>_altivec and
// put_vp8_epel<WIDTH>_v<TAPS>_altivec as non-inlined wrappers around the
// always-inline cores, with WIDTH and the tap count passed as constants.
// The horizontal wrapper ignores my, the vertical wrapper ignores mx.
#define EPEL_FUNCS(WIDTH, TAPS) \
static av_noinline \
void put_vp8_epel ## WIDTH ## _h ## TAPS ## _altivec(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int h, int mx, int my) \
{ \
    put_vp8_epel_h_altivec_core(dst, dst_stride, src, src_stride, h, mx, WIDTH, TAPS == 6); \
} \
\
static av_noinline \
void put_vp8_epel ## WIDTH ## _v ## TAPS ## _altivec(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int h, int mx, int my) \
{ \
    put_vp8_epel_v_altivec_core(dst, dst_stride, src, src_stride, h, my, WIDTH, TAPS == 6); \
}

// Defines put_vp8_epel<WIDTH>_h<HTAPS>v<VTAPS>_altivec, a two-pass 2-D filter:
// the horizontal pass writes into a 16-byte-aligned temporary with a row
// stride of 16, starting 2 rows (6-tap vertical) or 1 row (4-tap vertical)
// above the block; the vertical pass then reads the temporary from the row
// that corresponds to src and writes dst.
// The temporary holds 2*WIDTH+5 rows.
// NOTE(review): the 4-tap vertical branch filters h+4 rows horizontally while
// the vertical 4-tap core only appears to consume h+3 of them - confirm
// whether the extra row is intentional.
#define EPEL_HV(WIDTH, HTAPS, VTAPS) \
static void put_vp8_epel ## WIDTH ## _h ## HTAPS ## v ## VTAPS ## _altivec(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my) \
{ \
    DECLARE_ALIGNED(16, uint8_t, tmp)[(2*WIDTH+5)*16]; \
    if (VTAPS == 6) { \
        put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16,      src-2*sstride, sstride, h+5, mx, my); \
        put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, dstride, tmp+2*16,      16,      h,   mx, my); \
    } else { \
        put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16,      src-sstride, sstride, h+4, mx, my); \
        put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, dstride, tmp+16,      16,      h,   mx, my); \
    } \
}

286cabdff1aSopenharmony_ciEPEL_FUNCS(16,6) 287cabdff1aSopenharmony_ciEPEL_FUNCS(8, 6) 288cabdff1aSopenharmony_ciEPEL_FUNCS(8, 4) 289cabdff1aSopenharmony_ciEPEL_FUNCS(4, 6) 290cabdff1aSopenharmony_ciEPEL_FUNCS(4, 4) 291cabdff1aSopenharmony_ci 292cabdff1aSopenharmony_ciEPEL_HV(16, 6,6) 293cabdff1aSopenharmony_ciEPEL_HV(8, 6,6) 294cabdff1aSopenharmony_ciEPEL_HV(8, 4,6) 295cabdff1aSopenharmony_ciEPEL_HV(8, 6,4) 296cabdff1aSopenharmony_ciEPEL_HV(8, 4,4) 297cabdff1aSopenharmony_ciEPEL_HV(4, 6,6) 298cabdff1aSopenharmony_ciEPEL_HV(4, 4,6) 299cabdff1aSopenharmony_ciEPEL_HV(4, 6,4) 300cabdff1aSopenharmony_ciEPEL_HV(4, 4,4) 301cabdff1aSopenharmony_ci 302cabdff1aSopenharmony_cistatic void put_vp8_pixels16_altivec(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my) 303cabdff1aSopenharmony_ci{ 304cabdff1aSopenharmony_ci register vector unsigned char perm; 305cabdff1aSopenharmony_ci int i; 306cabdff1aSopenharmony_ci register ptrdiff_t dstride2 = dstride << 1, sstride2 = sstride << 1; 307cabdff1aSopenharmony_ci register ptrdiff_t dstride3 = dstride2 + dstride, sstride3 = sstride + sstride2; 308cabdff1aSopenharmony_ci register ptrdiff_t dstride4 = dstride << 2, sstride4 = sstride << 2; 309cabdff1aSopenharmony_ci 310cabdff1aSopenharmony_ci#if HAVE_BIGENDIAN 311cabdff1aSopenharmony_ci perm = vec_lvsl(0, src); 312cabdff1aSopenharmony_ci#endif 313cabdff1aSopenharmony_ci// hand-unrolling the loop by 4 gains about 15% 314cabdff1aSopenharmony_ci// mininum execution time goes from 74 to 60 cycles 315cabdff1aSopenharmony_ci// it's faster than -funroll-loops, but using 316cabdff1aSopenharmony_ci// -funroll-loops w/ this is bad - 74 cycles again. 
317cabdff1aSopenharmony_ci// all this is on a 7450, tuning for the 7450 318cabdff1aSopenharmony_ci for (i = 0; i < h; i += 4) { 319cabdff1aSopenharmony_ci vec_st(load_with_perm_vec(0, src, perm), 0, dst); 320cabdff1aSopenharmony_ci vec_st(load_with_perm_vec(sstride, src, perm), dstride, dst); 321cabdff1aSopenharmony_ci vec_st(load_with_perm_vec(sstride2, src, perm), dstride2, dst); 322cabdff1aSopenharmony_ci vec_st(load_with_perm_vec(sstride3, src, perm), dstride3, dst); 323cabdff1aSopenharmony_ci src += sstride4; 324cabdff1aSopenharmony_ci dst += dstride4; 325cabdff1aSopenharmony_ci } 326cabdff1aSopenharmony_ci} 327cabdff1aSopenharmony_ci 328cabdff1aSopenharmony_ci#endif /* HAVE_ALTIVEC */ 329cabdff1aSopenharmony_ci 330cabdff1aSopenharmony_ci 331cabdff1aSopenharmony_ciav_cold void ff_vp78dsp_init_ppc(VP8DSPContext *c) 332cabdff1aSopenharmony_ci{ 333cabdff1aSopenharmony_ci#if HAVE_ALTIVEC 334cabdff1aSopenharmony_ci if (!PPC_ALTIVEC(av_get_cpu_flags())) 335cabdff1aSopenharmony_ci return; 336cabdff1aSopenharmony_ci 337cabdff1aSopenharmony_ci c->put_vp8_epel_pixels_tab[0][0][0] = put_vp8_pixels16_altivec; 338cabdff1aSopenharmony_ci c->put_vp8_epel_pixels_tab[0][0][2] = put_vp8_epel16_h6_altivec; 339cabdff1aSopenharmony_ci c->put_vp8_epel_pixels_tab[0][2][0] = put_vp8_epel16_v6_altivec; 340cabdff1aSopenharmony_ci c->put_vp8_epel_pixels_tab[0][2][2] = put_vp8_epel16_h6v6_altivec; 341cabdff1aSopenharmony_ci 342cabdff1aSopenharmony_ci c->put_vp8_epel_pixels_tab[1][0][2] = put_vp8_epel8_h6_altivec; 343cabdff1aSopenharmony_ci c->put_vp8_epel_pixels_tab[1][2][0] = put_vp8_epel8_v6_altivec; 344cabdff1aSopenharmony_ci c->put_vp8_epel_pixels_tab[1][0][1] = put_vp8_epel8_h4_altivec; 345cabdff1aSopenharmony_ci c->put_vp8_epel_pixels_tab[1][1][0] = put_vp8_epel8_v4_altivec; 346cabdff1aSopenharmony_ci 347cabdff1aSopenharmony_ci c->put_vp8_epel_pixels_tab[1][2][2] = put_vp8_epel8_h6v6_altivec; 348cabdff1aSopenharmony_ci c->put_vp8_epel_pixels_tab[1][1][1] = 
put_vp8_epel8_h4v4_altivec; 349cabdff1aSopenharmony_ci c->put_vp8_epel_pixels_tab[1][1][2] = put_vp8_epel8_h6v4_altivec; 350cabdff1aSopenharmony_ci c->put_vp8_epel_pixels_tab[1][2][1] = put_vp8_epel8_h4v6_altivec; 351cabdff1aSopenharmony_ci 352cabdff1aSopenharmony_ci c->put_vp8_epel_pixels_tab[2][0][2] = put_vp8_epel4_h6_altivec; 353cabdff1aSopenharmony_ci c->put_vp8_epel_pixels_tab[2][2][0] = put_vp8_epel4_v6_altivec; 354cabdff1aSopenharmony_ci c->put_vp8_epel_pixels_tab[2][0][1] = put_vp8_epel4_h4_altivec; 355cabdff1aSopenharmony_ci c->put_vp8_epel_pixels_tab[2][1][0] = put_vp8_epel4_v4_altivec; 356cabdff1aSopenharmony_ci 357cabdff1aSopenharmony_ci c->put_vp8_epel_pixels_tab[2][2][2] = put_vp8_epel4_h6v6_altivec; 358cabdff1aSopenharmony_ci c->put_vp8_epel_pixels_tab[2][1][1] = put_vp8_epel4_h4v4_altivec; 359cabdff1aSopenharmony_ci c->put_vp8_epel_pixels_tab[2][1][2] = put_vp8_epel4_h6v4_altivec; 360cabdff1aSopenharmony_ci c->put_vp8_epel_pixels_tab[2][2][1] = put_vp8_epel4_h4v6_altivec; 361cabdff1aSopenharmony_ci#endif /* HAVE_ALTIVEC */ 362cabdff1aSopenharmony_ci} 363