1cabdff1aSopenharmony_ci/* 2cabdff1aSopenharmony_ci * Copyright (c) 2002 Brian Foley 3cabdff1aSopenharmony_ci * Copyright (c) 2002 Dieter Shirley 4cabdff1aSopenharmony_ci * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org> 5cabdff1aSopenharmony_ci * 6cabdff1aSopenharmony_ci * This file is part of FFmpeg. 7cabdff1aSopenharmony_ci * 8cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 9cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 10cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 11cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 12cabdff1aSopenharmony_ci * 13cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 14cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 15cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16cabdff1aSopenharmony_ci * Lesser General Public License for more details. 17cabdff1aSopenharmony_ci * 18cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 19cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 20cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 21cabdff1aSopenharmony_ci */ 22cabdff1aSopenharmony_ci 23cabdff1aSopenharmony_ci#include "config.h" 24cabdff1aSopenharmony_ci 25cabdff1aSopenharmony_ci#include "libavutil/attributes.h" 26cabdff1aSopenharmony_ci#include "libavutil/cpu.h" 27cabdff1aSopenharmony_ci#include "libavutil/ppc/cpu.h" 28cabdff1aSopenharmony_ci#include "libavutil/ppc/util_altivec.h" 29cabdff1aSopenharmony_ci 30cabdff1aSopenharmony_ci#include "libavcodec/avcodec.h" 31cabdff1aSopenharmony_ci#include "libavcodec/pixblockdsp.h" 32cabdff1aSopenharmony_ci 33cabdff1aSopenharmony_ci#if HAVE_ALTIVEC 34cabdff1aSopenharmony_ci 35cabdff1aSopenharmony_ci#if HAVE_VSX 36cabdff1aSopenharmony_cistatic void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels, 37cabdff1aSopenharmony_ci ptrdiff_t stride) 38cabdff1aSopenharmony_ci{ 39cabdff1aSopenharmony_ci int i; 40cabdff1aSopenharmony_ci vector unsigned char perm = 41cabdff1aSopenharmony_ci (vector unsigned char) {0x00,0x10, 0x01,0x11,0x02,0x12,0x03,0x13,\ 42cabdff1aSopenharmony_ci 0x04,0x14,0x05,0x15,0x06,0x16,0x07,0x17}; 43cabdff1aSopenharmony_ci const vector unsigned char zero = 44cabdff1aSopenharmony_ci (const vector unsigned char) vec_splat_u8(0); 45cabdff1aSopenharmony_ci 46cabdff1aSopenharmony_ci for (i = 0; i < 8; i++) { 47cabdff1aSopenharmony_ci /* Read potentially unaligned pixels. 48cabdff1aSopenharmony_ci * We're reading 16 pixels, and actually only want 8, 49cabdff1aSopenharmony_ci * but we simply ignore the extras. */ 50cabdff1aSopenharmony_ci vector unsigned char bytes = vec_vsx_ld(0, pixels); 51cabdff1aSopenharmony_ci 52cabdff1aSopenharmony_ci // Convert the bytes into shorts. 53cabdff1aSopenharmony_ci //vector signed short shorts = (vector signed short) vec_perm(zero, bytes, perm); 54cabdff1aSopenharmony_ci vector signed short shorts = (vector signed short) vec_perm(bytes, zero, perm); 55cabdff1aSopenharmony_ci 56cabdff1aSopenharmony_ci // Save the data to the block, we assume the block is 16-byte aligned. 57cabdff1aSopenharmony_ci vec_vsx_st(shorts, i * 16, (vector signed short *) block); 58cabdff1aSopenharmony_ci 59cabdff1aSopenharmony_ci pixels += stride; 60cabdff1aSopenharmony_ci } 61cabdff1aSopenharmony_ci} 62cabdff1aSopenharmony_ci#else 63cabdff1aSopenharmony_cistatic void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels, 64cabdff1aSopenharmony_ci ptrdiff_t stride) 65cabdff1aSopenharmony_ci{ 66cabdff1aSopenharmony_ci int i; 67cabdff1aSopenharmony_ci const vec_u8 zero = (const vec_u8)vec_splat_u8(0); 68cabdff1aSopenharmony_ci 69cabdff1aSopenharmony_ci for (i = 0; i < 8; i++) { 70cabdff1aSopenharmony_ci vec_u8 perm = vec_lvsl(0, pixels); 71cabdff1aSopenharmony_ci /* Read potentially unaligned pixels. 72cabdff1aSopenharmony_ci * We're reading 16 pixels, and actually only want 8, 73cabdff1aSopenharmony_ci * but we simply ignore the extras. */ 74cabdff1aSopenharmony_ci vec_u8 pixl = vec_ld(0, pixels); 75cabdff1aSopenharmony_ci vec_u8 pixr = vec_ld(7, pixels); 76cabdff1aSopenharmony_ci vec_u8 bytes = vec_perm(pixl, pixr, perm); 77cabdff1aSopenharmony_ci 78cabdff1aSopenharmony_ci // Convert the bytes into shorts. 79cabdff1aSopenharmony_ci vec_s16 shorts = (vec_s16)vec_mergeh(zero, bytes); 80cabdff1aSopenharmony_ci 81cabdff1aSopenharmony_ci // Save the data to the block, we assume the block is 16-byte aligned. 82cabdff1aSopenharmony_ci vec_st(shorts, i * 16, (vec_s16 *)block); 83cabdff1aSopenharmony_ci 84cabdff1aSopenharmony_ci pixels += stride; 85cabdff1aSopenharmony_ci } 86cabdff1aSopenharmony_ci} 87cabdff1aSopenharmony_ci 88cabdff1aSopenharmony_ci#endif /* HAVE_VSX */ 89cabdff1aSopenharmony_ci 90cabdff1aSopenharmony_ci#if HAVE_VSX 91cabdff1aSopenharmony_cistatic void diff_pixels_altivec(int16_t *restrict block, const uint8_t *s1, 92cabdff1aSopenharmony_ci const uint8_t *s2, ptrdiff_t stride) 93cabdff1aSopenharmony_ci{ 94cabdff1aSopenharmony_ci int i; 95cabdff1aSopenharmony_ci const vector unsigned char zero = 96cabdff1aSopenharmony_ci (const vector unsigned char) vec_splat_u8(0); 97cabdff1aSopenharmony_ci vector signed short shorts1, shorts2; 98cabdff1aSopenharmony_ci 99cabdff1aSopenharmony_ci for (i = 0; i < 4; i++) { 100cabdff1aSopenharmony_ci /* Read potentially unaligned pixels. 101cabdff1aSopenharmony_ci * We're reading 16 pixels, and actually only want 8, 102cabdff1aSopenharmony_ci * but we simply ignore the extras. */ 103cabdff1aSopenharmony_ci vector unsigned char bytes = vec_vsx_ld(0, s1); 104cabdff1aSopenharmony_ci 105cabdff1aSopenharmony_ci // Convert the bytes into shorts. 106cabdff1aSopenharmony_ci shorts1 = (vector signed short) vec_mergeh(bytes, zero); 107cabdff1aSopenharmony_ci 108cabdff1aSopenharmony_ci // Do the same for the second block of pixels. 109cabdff1aSopenharmony_ci bytes =vec_vsx_ld(0, s2); 110cabdff1aSopenharmony_ci 111cabdff1aSopenharmony_ci // Convert the bytes into shorts. 112cabdff1aSopenharmony_ci shorts2 = (vector signed short) vec_mergeh(bytes, zero); 113cabdff1aSopenharmony_ci 114cabdff1aSopenharmony_ci // Do the subtraction. 115cabdff1aSopenharmony_ci shorts1 = vec_sub(shorts1, shorts2); 116cabdff1aSopenharmony_ci 117cabdff1aSopenharmony_ci // Save the data to the block, we assume the block is 16-byte aligned. 118cabdff1aSopenharmony_ci vec_vsx_st(shorts1, 0, (vector signed short *) block); 119cabdff1aSopenharmony_ci 120cabdff1aSopenharmony_ci s1 += stride; 121cabdff1aSopenharmony_ci s2 += stride; 122cabdff1aSopenharmony_ci block += 8; 123cabdff1aSopenharmony_ci 124cabdff1aSopenharmony_ci /* The code below is a copy of the code above... 125cabdff1aSopenharmony_ci * This is a manual unroll. */ 126cabdff1aSopenharmony_ci 127cabdff1aSopenharmony_ci /* Read potentially unaligned pixels. 128cabdff1aSopenharmony_ci * We're reading 16 pixels, and actually only want 8, 129cabdff1aSopenharmony_ci * but we simply ignore the extras. */ 130cabdff1aSopenharmony_ci bytes = vec_vsx_ld(0, s1); 131cabdff1aSopenharmony_ci 132cabdff1aSopenharmony_ci // Convert the bytes into shorts. 133cabdff1aSopenharmony_ci shorts1 = (vector signed short) vec_mergeh(bytes, zero); 134cabdff1aSopenharmony_ci 135cabdff1aSopenharmony_ci // Do the same for the second block of pixels. 136cabdff1aSopenharmony_ci bytes = vec_vsx_ld(0, s2); 137cabdff1aSopenharmony_ci 138cabdff1aSopenharmony_ci // Convert the bytes into shorts. 139cabdff1aSopenharmony_ci shorts2 = (vector signed short) vec_mergeh(bytes, zero); 140cabdff1aSopenharmony_ci 141cabdff1aSopenharmony_ci // Do the subtraction. 142cabdff1aSopenharmony_ci shorts1 = vec_sub(shorts1, shorts2); 143cabdff1aSopenharmony_ci 144cabdff1aSopenharmony_ci // Save the data to the block, we assume the block is 16-byte aligned. 145cabdff1aSopenharmony_ci vec_vsx_st(shorts1, 0, (vector signed short *) block); 146cabdff1aSopenharmony_ci 147cabdff1aSopenharmony_ci s1 += stride; 148cabdff1aSopenharmony_ci s2 += stride; 149cabdff1aSopenharmony_ci block += 8; 150cabdff1aSopenharmony_ci } 151cabdff1aSopenharmony_ci} 152cabdff1aSopenharmony_ci#else 153cabdff1aSopenharmony_cistatic void diff_pixels_altivec(int16_t *restrict block, const uint8_t *s1, 154cabdff1aSopenharmony_ci const uint8_t *s2, ptrdiff_t stride) 155cabdff1aSopenharmony_ci{ 156cabdff1aSopenharmony_ci int i; 157cabdff1aSopenharmony_ci vec_u8 perm; 158cabdff1aSopenharmony_ci const vec_u8 zero = (const vec_u8)vec_splat_u8(0); 159cabdff1aSopenharmony_ci vec_s16 shorts1, shorts2; 160cabdff1aSopenharmony_ci 161cabdff1aSopenharmony_ci for (i = 0; i < 4; i++) { 162cabdff1aSopenharmony_ci /* Read potentially unaligned pixels. 163cabdff1aSopenharmony_ci * We're reading 16 pixels, and actually only want 8, 164cabdff1aSopenharmony_ci * but we simply ignore the extras. */ 165cabdff1aSopenharmony_ci perm = vec_lvsl(0, s1); 166cabdff1aSopenharmony_ci vec_u8 pixl = vec_ld(0, s1); 167cabdff1aSopenharmony_ci vec_u8 pixr = vec_ld(15, s1); 168cabdff1aSopenharmony_ci vec_u8 bytes = vec_perm(pixl, pixr, perm); 169cabdff1aSopenharmony_ci 170cabdff1aSopenharmony_ci // Convert the bytes into shorts. 171cabdff1aSopenharmony_ci shorts1 = (vec_s16)vec_mergeh(zero, bytes); 172cabdff1aSopenharmony_ci 173cabdff1aSopenharmony_ci // Do the same for the second block of pixels. 174cabdff1aSopenharmony_ci perm = vec_lvsl(0, s2); 175cabdff1aSopenharmony_ci pixl = vec_ld(0, s2); 176cabdff1aSopenharmony_ci pixr = vec_ld(15, s2); 177cabdff1aSopenharmony_ci bytes = vec_perm(pixl, pixr, perm); 178cabdff1aSopenharmony_ci 179cabdff1aSopenharmony_ci // Convert the bytes into shorts. 180cabdff1aSopenharmony_ci shorts2 = (vec_s16)vec_mergeh(zero, bytes); 181cabdff1aSopenharmony_ci 182cabdff1aSopenharmony_ci // Do the subtraction. 183cabdff1aSopenharmony_ci shorts1 = vec_sub(shorts1, shorts2); 184cabdff1aSopenharmony_ci 185cabdff1aSopenharmony_ci // Save the data to the block, we assume the block is 16-byte aligned. 186cabdff1aSopenharmony_ci vec_st(shorts1, 0, (vec_s16 *)block); 187cabdff1aSopenharmony_ci 188cabdff1aSopenharmony_ci s1 += stride; 189cabdff1aSopenharmony_ci s2 += stride; 190cabdff1aSopenharmony_ci block += 8; 191cabdff1aSopenharmony_ci 192cabdff1aSopenharmony_ci /* The code below is a copy of the code above... 193cabdff1aSopenharmony_ci * This is a manual unroll. */ 194cabdff1aSopenharmony_ci 195cabdff1aSopenharmony_ci /* Read potentially unaligned pixels. 196cabdff1aSopenharmony_ci * We're reading 16 pixels, and actually only want 8, 197cabdff1aSopenharmony_ci * but we simply ignore the extras. */ 198cabdff1aSopenharmony_ci perm = vec_lvsl(0, s1); 199cabdff1aSopenharmony_ci pixl = vec_ld(0, s1); 200cabdff1aSopenharmony_ci pixr = vec_ld(15, s1); 201cabdff1aSopenharmony_ci bytes = vec_perm(pixl, pixr, perm); 202cabdff1aSopenharmony_ci 203cabdff1aSopenharmony_ci // Convert the bytes into shorts. 204cabdff1aSopenharmony_ci shorts1 = (vec_s16)vec_mergeh(zero, bytes); 205cabdff1aSopenharmony_ci 206cabdff1aSopenharmony_ci // Do the same for the second block of pixels. 207cabdff1aSopenharmony_ci perm = vec_lvsl(0, s2); 208cabdff1aSopenharmony_ci pixl = vec_ld(0, s2); 209cabdff1aSopenharmony_ci pixr = vec_ld(15, s2); 210cabdff1aSopenharmony_ci bytes = vec_perm(pixl, pixr, perm); 211cabdff1aSopenharmony_ci 212cabdff1aSopenharmony_ci // Convert the bytes into shorts. 213cabdff1aSopenharmony_ci shorts2 = (vec_s16)vec_mergeh(zero, bytes); 214cabdff1aSopenharmony_ci 215cabdff1aSopenharmony_ci // Do the subtraction. 216cabdff1aSopenharmony_ci shorts1 = vec_sub(shorts1, shorts2); 217cabdff1aSopenharmony_ci 218cabdff1aSopenharmony_ci // Save the data to the block, we assume the block is 16-byte aligned. 219cabdff1aSopenharmony_ci vec_st(shorts1, 0, (vec_s16 *)block); 220cabdff1aSopenharmony_ci 221cabdff1aSopenharmony_ci s1 += stride; 222cabdff1aSopenharmony_ci s2 += stride; 223cabdff1aSopenharmony_ci block += 8; 224cabdff1aSopenharmony_ci } 225cabdff1aSopenharmony_ci} 226cabdff1aSopenharmony_ci 227cabdff1aSopenharmony_ci#endif /* HAVE_VSX */ 228cabdff1aSopenharmony_ci 229cabdff1aSopenharmony_ci#endif /* HAVE_ALTIVEC */ 230cabdff1aSopenharmony_ci 231cabdff1aSopenharmony_ci#if HAVE_VSX 232cabdff1aSopenharmony_cistatic void get_pixels_vsx(int16_t *restrict block, const uint8_t *pixels, 233cabdff1aSopenharmony_ci ptrdiff_t stride) 234cabdff1aSopenharmony_ci{ 235cabdff1aSopenharmony_ci int i; 236cabdff1aSopenharmony_ci for (i = 0; i < 8; i++) { 237cabdff1aSopenharmony_ci vec_s16 shorts = vsx_ld_u8_s16(0, pixels); 238cabdff1aSopenharmony_ci 239cabdff1aSopenharmony_ci vec_vsx_st(shorts, i * 16, block); 240cabdff1aSopenharmony_ci 241cabdff1aSopenharmony_ci pixels += stride; 242cabdff1aSopenharmony_ci } 243cabdff1aSopenharmony_ci} 244cabdff1aSopenharmony_ci 245cabdff1aSopenharmony_cistatic void diff_pixels_vsx(int16_t *restrict block, const uint8_t *s1, 246cabdff1aSopenharmony_ci const uint8_t *s2, ptrdiff_t stride) 247cabdff1aSopenharmony_ci{ 248cabdff1aSopenharmony_ci int i; 249cabdff1aSopenharmony_ci vec_s16 shorts1, shorts2; 250cabdff1aSopenharmony_ci for (i = 0; i < 8; i++) { 251cabdff1aSopenharmony_ci shorts1 = vsx_ld_u8_s16(0, s1); 252cabdff1aSopenharmony_ci shorts2 = vsx_ld_u8_s16(0, s2); 253cabdff1aSopenharmony_ci 254cabdff1aSopenharmony_ci shorts1 = vec_sub(shorts1, shorts2); 255cabdff1aSopenharmony_ci 256cabdff1aSopenharmony_ci vec_vsx_st(shorts1, 0, block); 257cabdff1aSopenharmony_ci 258cabdff1aSopenharmony_ci s1 += stride; 259cabdff1aSopenharmony_ci s2 += stride; 260cabdff1aSopenharmony_ci block += 8; 261cabdff1aSopenharmony_ci } 262cabdff1aSopenharmony_ci} 263cabdff1aSopenharmony_ci#endif /* HAVE_VSX */ 264cabdff1aSopenharmony_ci 265cabdff1aSopenharmony_ciav_cold void ff_pixblockdsp_init_ppc(PixblockDSPContext *c, 266cabdff1aSopenharmony_ci AVCodecContext *avctx, 267cabdff1aSopenharmony_ci unsigned high_bit_depth) 268cabdff1aSopenharmony_ci{ 269cabdff1aSopenharmony_ci#if HAVE_ALTIVEC 270cabdff1aSopenharmony_ci if (!PPC_ALTIVEC(av_get_cpu_flags())) 271cabdff1aSopenharmony_ci return; 272cabdff1aSopenharmony_ci 273cabdff1aSopenharmony_ci c->diff_pixels = diff_pixels_altivec; 274cabdff1aSopenharmony_ci 275cabdff1aSopenharmony_ci if (!high_bit_depth) { 276cabdff1aSopenharmony_ci c->get_pixels = get_pixels_altivec; 277cabdff1aSopenharmony_ci } 278cabdff1aSopenharmony_ci#endif /* HAVE_ALTIVEC */ 279cabdff1aSopenharmony_ci 280cabdff1aSopenharmony_ci#if HAVE_VSX 281cabdff1aSopenharmony_ci if (!PPC_VSX(av_get_cpu_flags())) 282cabdff1aSopenharmony_ci return; 283cabdff1aSopenharmony_ci 284cabdff1aSopenharmony_ci c->diff_pixels = diff_pixels_vsx; 285cabdff1aSopenharmony_ci 286cabdff1aSopenharmony_ci if (!high_bit_depth) 287cabdff1aSopenharmony_ci c->get_pixels = get_pixels_vsx; 288cabdff1aSopenharmony_ci#endif /* HAVE_VSX */ 289cabdff1aSopenharmony_ci} 290