1cabdff1aSopenharmony_ci/* 2cabdff1aSopenharmony_ci * Alpha optimized DSP utils 3cabdff1aSopenharmony_ci * Copyright (c) 2002 Falk Hueffner <falk@debian.org> 4cabdff1aSopenharmony_ci * 5cabdff1aSopenharmony_ci * This file is part of FFmpeg. 6cabdff1aSopenharmony_ci * 7cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 8cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 9cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 10cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 11cabdff1aSopenharmony_ci * 12cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 13cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 14cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15cabdff1aSopenharmony_ci * Lesser General Public License for more details. 16cabdff1aSopenharmony_ci * 17cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 18cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 19cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20cabdff1aSopenharmony_ci */ 21cabdff1aSopenharmony_ci 22cabdff1aSopenharmony_ci#include "libavutil/attributes.h" 23cabdff1aSopenharmony_ci#include "libavcodec/hpeldsp.h" 24cabdff1aSopenharmony_ci#include "hpeldsp_alpha.h" 25cabdff1aSopenharmony_ci#include "asm.h" 26cabdff1aSopenharmony_ci 27cabdff1aSopenharmony_cistatic inline uint64_t avg2_no_rnd(uint64_t a, uint64_t b) 28cabdff1aSopenharmony_ci{ 29cabdff1aSopenharmony_ci return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1); 30cabdff1aSopenharmony_ci} 31cabdff1aSopenharmony_ci 32cabdff1aSopenharmony_cistatic inline uint64_t avg2(uint64_t a, uint64_t b) 33cabdff1aSopenharmony_ci{ 34cabdff1aSopenharmony_ci return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1); 35cabdff1aSopenharmony_ci} 36cabdff1aSopenharmony_ci 37cabdff1aSopenharmony_ci#if 0 38cabdff1aSopenharmony_ci/* The XY2 routines basically utilize this scheme, but reuse parts in 39cabdff1aSopenharmony_ci each iteration. */ 40cabdff1aSopenharmony_cistatic inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4) 41cabdff1aSopenharmony_ci{ 42cabdff1aSopenharmony_ci uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2) 43cabdff1aSopenharmony_ci + ((l2 & ~BYTE_VEC(0x03)) >> 2) 44cabdff1aSopenharmony_ci + ((l3 & ~BYTE_VEC(0x03)) >> 2) 45cabdff1aSopenharmony_ci + ((l4 & ~BYTE_VEC(0x03)) >> 2); 46cabdff1aSopenharmony_ci uint64_t r2 = (( (l1 & BYTE_VEC(0x03)) 47cabdff1aSopenharmony_ci + (l2 & BYTE_VEC(0x03)) 48cabdff1aSopenharmony_ci + (l3 & BYTE_VEC(0x03)) 49cabdff1aSopenharmony_ci + (l4 & BYTE_VEC(0x03)) 50cabdff1aSopenharmony_ci + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03); 51cabdff1aSopenharmony_ci return r1 + r2; 52cabdff1aSopenharmony_ci} 53cabdff1aSopenharmony_ci#endif 54cabdff1aSopenharmony_ci 55cabdff1aSopenharmony_ci#define OP(LOAD, STORE) \ 56cabdff1aSopenharmony_ci do { \ 57cabdff1aSopenharmony_ci STORE(LOAD(pixels), block); \ 58cabdff1aSopenharmony_ci pixels += line_size; \ 59cabdff1aSopenharmony_ci block += line_size; \ 60cabdff1aSopenharmony_ci } while (--h) 61cabdff1aSopenharmony_ci 62cabdff1aSopenharmony_ci#define OP_X2(LOAD, STORE) \ 63cabdff1aSopenharmony_ci do { \ 64cabdff1aSopenharmony_ci uint64_t pix1, pix2; \ 65cabdff1aSopenharmony_ci \ 66cabdff1aSopenharmony_ci pix1 = LOAD(pixels); \ 67cabdff1aSopenharmony_ci pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56); \ 68cabdff1aSopenharmony_ci STORE(AVG2(pix1, pix2), block); \ 69cabdff1aSopenharmony_ci pixels += line_size; \ 70cabdff1aSopenharmony_ci block += line_size; \ 71cabdff1aSopenharmony_ci } while (--h) 72cabdff1aSopenharmony_ci 73cabdff1aSopenharmony_ci#define OP_Y2(LOAD, STORE) \ 74cabdff1aSopenharmony_ci do { \ 75cabdff1aSopenharmony_ci uint64_t pix = LOAD(pixels); \ 76cabdff1aSopenharmony_ci do { \ 77cabdff1aSopenharmony_ci uint64_t next_pix; \ 78cabdff1aSopenharmony_ci \ 79cabdff1aSopenharmony_ci pixels += line_size; \ 80cabdff1aSopenharmony_ci next_pix = LOAD(pixels); \ 81cabdff1aSopenharmony_ci STORE(AVG2(pix, next_pix), block); \ 82cabdff1aSopenharmony_ci block += line_size; \ 83cabdff1aSopenharmony_ci pix = next_pix; \ 84cabdff1aSopenharmony_ci } while (--h); \ 85cabdff1aSopenharmony_ci } while (0) 86cabdff1aSopenharmony_ci 87cabdff1aSopenharmony_ci#define OP_XY2(LOAD, STORE) \ 88cabdff1aSopenharmony_ci do { \ 89cabdff1aSopenharmony_ci uint64_t pix1 = LOAD(pixels); \ 90cabdff1aSopenharmony_ci uint64_t pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56); \ 91cabdff1aSopenharmony_ci uint64_t pix_l = (pix1 & BYTE_VEC(0x03)) \ 92cabdff1aSopenharmony_ci + (pix2 & BYTE_VEC(0x03)); \ 93cabdff1aSopenharmony_ci uint64_t pix_h = ((pix1 & ~BYTE_VEC(0x03)) >> 2) \ 94cabdff1aSopenharmony_ci + ((pix2 & ~BYTE_VEC(0x03)) >> 2); \ 95cabdff1aSopenharmony_ci \ 96cabdff1aSopenharmony_ci do { \ 97cabdff1aSopenharmony_ci uint64_t npix1, npix2; \ 98cabdff1aSopenharmony_ci uint64_t npix_l, npix_h; \ 99cabdff1aSopenharmony_ci uint64_t avg; \ 100cabdff1aSopenharmony_ci \ 101cabdff1aSopenharmony_ci pixels += line_size; \ 102cabdff1aSopenharmony_ci npix1 = LOAD(pixels); \ 103cabdff1aSopenharmony_ci npix2 = npix1 >> 8 | ((uint64_t) pixels[8] << 56); \ 104cabdff1aSopenharmony_ci npix_l = (npix1 & BYTE_VEC(0x03)) \ 105cabdff1aSopenharmony_ci + (npix2 & BYTE_VEC(0x03)); \ 106cabdff1aSopenharmony_ci npix_h = ((npix1 & ~BYTE_VEC(0x03)) >> 2) \ 107cabdff1aSopenharmony_ci + ((npix2 & ~BYTE_VEC(0x03)) >> 2); \ 108cabdff1aSopenharmony_ci avg = (((pix_l + npix_l + AVG4_ROUNDER) >> 2) & BYTE_VEC(0x03)) \ 109cabdff1aSopenharmony_ci + pix_h + npix_h; \ 110cabdff1aSopenharmony_ci STORE(avg, block); \ 111cabdff1aSopenharmony_ci \ 112cabdff1aSopenharmony_ci block += line_size; \ 113cabdff1aSopenharmony_ci pix_l = npix_l; \ 114cabdff1aSopenharmony_ci pix_h = npix_h; \ 115cabdff1aSopenharmony_ci } while (--h); \ 116cabdff1aSopenharmony_ci } while (0) 117cabdff1aSopenharmony_ci 118cabdff1aSopenharmony_ci#define MAKE_OP(OPNAME, SUFF, OPKIND, STORE) \ 119cabdff1aSopenharmony_cistatic void OPNAME ## _pixels ## SUFF ## _axp \ 120cabdff1aSopenharmony_ci (uint8_t *restrict block, const uint8_t *restrict pixels, \ 121cabdff1aSopenharmony_ci ptrdiff_t line_size, int h) \ 122cabdff1aSopenharmony_ci{ \ 123cabdff1aSopenharmony_ci if ((size_t) pixels & 0x7) { \ 124cabdff1aSopenharmony_ci OPKIND(uldq, STORE); \ 125cabdff1aSopenharmony_ci } else { \ 126cabdff1aSopenharmony_ci OPKIND(ldq, STORE); \ 127cabdff1aSopenharmony_ci } \ 128cabdff1aSopenharmony_ci} \ 129cabdff1aSopenharmony_ci \ 130cabdff1aSopenharmony_cistatic void OPNAME ## _pixels16 ## SUFF ## _axp \ 131cabdff1aSopenharmony_ci (uint8_t *restrict block, const uint8_t *restrict pixels, \ 132cabdff1aSopenharmony_ci ptrdiff_t line_size, int h) \ 133cabdff1aSopenharmony_ci{ \ 134cabdff1aSopenharmony_ci OPNAME ## _pixels ## SUFF ## _axp(block, pixels, line_size, h); \ 135cabdff1aSopenharmony_ci OPNAME ## _pixels ## SUFF ## _axp(block + 8, pixels + 8, line_size, h); \ 136cabdff1aSopenharmony_ci} 137cabdff1aSopenharmony_ci 138cabdff1aSopenharmony_ci#define PIXOP(OPNAME, STORE) \ 139cabdff1aSopenharmony_ci MAKE_OP(OPNAME, , OP, STORE) \ 140cabdff1aSopenharmony_ci MAKE_OP(OPNAME, _x2, OP_X2, STORE) \ 141cabdff1aSopenharmony_ci MAKE_OP(OPNAME, _y2, OP_Y2, STORE) \ 142cabdff1aSopenharmony_ci MAKE_OP(OPNAME, _xy2, OP_XY2, STORE) 143cabdff1aSopenharmony_ci 144cabdff1aSopenharmony_ci/* Rounding primitives. */ 145cabdff1aSopenharmony_ci#define AVG2 avg2 146cabdff1aSopenharmony_ci#define AVG4 avg4 147cabdff1aSopenharmony_ci#define AVG4_ROUNDER BYTE_VEC(0x02) 148cabdff1aSopenharmony_ci#define STORE(l, b) stq(l, b) 149cabdff1aSopenharmony_ciPIXOP(put, STORE); 150cabdff1aSopenharmony_ci 151cabdff1aSopenharmony_ci#undef STORE 152cabdff1aSopenharmony_ci#define STORE(l, b) stq(AVG2(l, ldq(b)), b); 153cabdff1aSopenharmony_ciPIXOP(avg, STORE); 154cabdff1aSopenharmony_ci 155cabdff1aSopenharmony_ci/* Not rounding primitives. */ 156cabdff1aSopenharmony_ci#undef AVG2 157cabdff1aSopenharmony_ci#undef AVG4 158cabdff1aSopenharmony_ci#undef AVG4_ROUNDER 159cabdff1aSopenharmony_ci#undef STORE 160cabdff1aSopenharmony_ci#define AVG2 avg2_no_rnd 161cabdff1aSopenharmony_ci#define AVG4 avg4_no_rnd 162cabdff1aSopenharmony_ci#define AVG4_ROUNDER BYTE_VEC(0x01) 163cabdff1aSopenharmony_ci#define STORE(l, b) stq(l, b) 164cabdff1aSopenharmony_ciPIXOP(put_no_rnd, STORE); 165cabdff1aSopenharmony_ci 166cabdff1aSopenharmony_ci#undef STORE 167cabdff1aSopenharmony_ci#define STORE(l, b) stq(AVG2(l, ldq(b)), b); 168cabdff1aSopenharmony_ciPIXOP(avg_no_rnd, STORE); 169cabdff1aSopenharmony_ci 170cabdff1aSopenharmony_cistatic void put_pixels16_axp_asm(uint8_t *block, const uint8_t *pixels, 171cabdff1aSopenharmony_ci ptrdiff_t line_size, int h) 172cabdff1aSopenharmony_ci{ 173cabdff1aSopenharmony_ci put_pixels_axp_asm(block, pixels, line_size, h); 174cabdff1aSopenharmony_ci put_pixels_axp_asm(block + 8, pixels + 8, line_size, h); 175cabdff1aSopenharmony_ci} 176cabdff1aSopenharmony_ci 177cabdff1aSopenharmony_ciav_cold void ff_hpeldsp_init_alpha(HpelDSPContext *c, int flags) 178cabdff1aSopenharmony_ci{ 179cabdff1aSopenharmony_ci c->put_pixels_tab[0][0] = put_pixels16_axp_asm; 180cabdff1aSopenharmony_ci c->put_pixels_tab[0][1] = put_pixels16_x2_axp; 181cabdff1aSopenharmony_ci c->put_pixels_tab[0][2] = put_pixels16_y2_axp; 182cabdff1aSopenharmony_ci c->put_pixels_tab[0][3] = put_pixels16_xy2_axp; 183cabdff1aSopenharmony_ci 184cabdff1aSopenharmony_ci c->put_no_rnd_pixels_tab[0][0] = put_pixels16_axp_asm; 185cabdff1aSopenharmony_ci c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_axp; 186cabdff1aSopenharmony_ci c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_axp; 187cabdff1aSopenharmony_ci c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_axp; 188cabdff1aSopenharmony_ci 189cabdff1aSopenharmony_ci c->avg_pixels_tab[0][0] = avg_pixels16_axp; 190cabdff1aSopenharmony_ci c->avg_pixels_tab[0][1] = avg_pixels16_x2_axp; 191cabdff1aSopenharmony_ci c->avg_pixels_tab[0][2] = avg_pixels16_y2_axp; 192cabdff1aSopenharmony_ci c->avg_pixels_tab[0][3] = avg_pixels16_xy2_axp; 193cabdff1aSopenharmony_ci 194cabdff1aSopenharmony_ci c->avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels16_axp; 195cabdff1aSopenharmony_ci c->avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels16_x2_axp; 196cabdff1aSopenharmony_ci c->avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels16_y2_axp; 197cabdff1aSopenharmony_ci c->avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels16_xy2_axp; 198cabdff1aSopenharmony_ci 199cabdff1aSopenharmony_ci c->put_pixels_tab[1][0] = put_pixels_axp_asm; 200cabdff1aSopenharmony_ci c->put_pixels_tab[1][1] = put_pixels_x2_axp; 201cabdff1aSopenharmony_ci c->put_pixels_tab[1][2] = put_pixels_y2_axp; 202cabdff1aSopenharmony_ci c->put_pixels_tab[1][3] = put_pixels_xy2_axp; 203cabdff1aSopenharmony_ci 204cabdff1aSopenharmony_ci c->put_no_rnd_pixels_tab[1][0] = put_pixels_axp_asm; 205cabdff1aSopenharmony_ci c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels_x2_axp; 206cabdff1aSopenharmony_ci c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels_y2_axp; 207cabdff1aSopenharmony_ci c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels_xy2_axp; 208cabdff1aSopenharmony_ci 209cabdff1aSopenharmony_ci c->avg_pixels_tab[1][0] = avg_pixels_axp; 210cabdff1aSopenharmony_ci c->avg_pixels_tab[1][1] = avg_pixels_x2_axp; 211cabdff1aSopenharmony_ci c->avg_pixels_tab[1][2] = avg_pixels_y2_axp; 212cabdff1aSopenharmony_ci c->avg_pixels_tab[1][3] = avg_pixels_xy2_axp; 213cabdff1aSopenharmony_ci} 214