1cabdff1aSopenharmony_ci/* 2cabdff1aSopenharmony_ci * Alpha optimized DSP utils 3cabdff1aSopenharmony_ci * Copyright (c) 2002 Falk Hueffner <falk@debian.org> 4cabdff1aSopenharmony_ci * 5cabdff1aSopenharmony_ci * This file is part of FFmpeg. 6cabdff1aSopenharmony_ci * 7cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 8cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 9cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 10cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 11cabdff1aSopenharmony_ci * 12cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 13cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 14cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15cabdff1aSopenharmony_ci * Lesser General Public License for more details. 16cabdff1aSopenharmony_ci * 17cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 18cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 19cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20cabdff1aSopenharmony_ci */ 21cabdff1aSopenharmony_ci 22cabdff1aSopenharmony_ci#include "libavutil/attributes.h" 23cabdff1aSopenharmony_ci#include "libavcodec/me_cmp.h" 24cabdff1aSopenharmony_ci#include "asm.h" 25cabdff1aSopenharmony_ci 26cabdff1aSopenharmony_ciint pix_abs16x16_mvi_asm(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h); 27cabdff1aSopenharmony_ci 28cabdff1aSopenharmony_cistatic inline uint64_t avg2(uint64_t a, uint64_t b) 29cabdff1aSopenharmony_ci{ 30cabdff1aSopenharmony_ci return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1); 31cabdff1aSopenharmony_ci} 32cabdff1aSopenharmony_ci 33cabdff1aSopenharmony_cistatic inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4) 34cabdff1aSopenharmony_ci{ 35cabdff1aSopenharmony_ci uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2) 36cabdff1aSopenharmony_ci + ((l2 & ~BYTE_VEC(0x03)) >> 2) 37cabdff1aSopenharmony_ci + ((l3 & ~BYTE_VEC(0x03)) >> 2) 38cabdff1aSopenharmony_ci + ((l4 & ~BYTE_VEC(0x03)) >> 2); 39cabdff1aSopenharmony_ci uint64_t r2 = (( (l1 & BYTE_VEC(0x03)) 40cabdff1aSopenharmony_ci + (l2 & BYTE_VEC(0x03)) 41cabdff1aSopenharmony_ci + (l3 & BYTE_VEC(0x03)) 42cabdff1aSopenharmony_ci + (l4 & BYTE_VEC(0x03)) 43cabdff1aSopenharmony_ci + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03); 44cabdff1aSopenharmony_ci return r1 + r2; 45cabdff1aSopenharmony_ci} 46cabdff1aSopenharmony_ci 47cabdff1aSopenharmony_cistatic int pix_abs8x8_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) 48cabdff1aSopenharmony_ci{ 49cabdff1aSopenharmony_ci int result = 0; 50cabdff1aSopenharmony_ci 51cabdff1aSopenharmony_ci if ((size_t) pix2 & 0x7) { 52cabdff1aSopenharmony_ci /* works only when pix2 is actually unaligned */ 53cabdff1aSopenharmony_ci do { /* do 8 pixel a time */ 54cabdff1aSopenharmony_ci uint64_t p1, p2; 55cabdff1aSopenharmony_ci 56cabdff1aSopenharmony_ci p1 = ldq(pix1); 57cabdff1aSopenharmony_ci p2 = uldq(pix2); 58cabdff1aSopenharmony_ci result += perr(p1, p2); 59cabdff1aSopenharmony_ci 60cabdff1aSopenharmony_ci pix1 += line_size; 61cabdff1aSopenharmony_ci pix2 += line_size; 62cabdff1aSopenharmony_ci } while (--h); 63cabdff1aSopenharmony_ci } else { 64cabdff1aSopenharmony_ci do { 65cabdff1aSopenharmony_ci uint64_t p1, p2; 66cabdff1aSopenharmony_ci 67cabdff1aSopenharmony_ci p1 = ldq(pix1); 68cabdff1aSopenharmony_ci p2 = ldq(pix2); 69cabdff1aSopenharmony_ci result += perr(p1, p2); 70cabdff1aSopenharmony_ci 71cabdff1aSopenharmony_ci pix1 += line_size; 72cabdff1aSopenharmony_ci pix2 += line_size; 73cabdff1aSopenharmony_ci } while (--h); 74cabdff1aSopenharmony_ci } 75cabdff1aSopenharmony_ci 76cabdff1aSopenharmony_ci return result; 77cabdff1aSopenharmony_ci} 78cabdff1aSopenharmony_ci 79cabdff1aSopenharmony_ci#if 0 /* now done in assembly */ 80cabdff1aSopenharmony_ciint pix_abs16x16_mvi(uint8_t *pix1, uint8_t *pix2, int line_size) 81cabdff1aSopenharmony_ci{ 82cabdff1aSopenharmony_ci int result = 0; 83cabdff1aSopenharmony_ci int h = 16; 84cabdff1aSopenharmony_ci 85cabdff1aSopenharmony_ci if ((size_t) pix2 & 0x7) { 86cabdff1aSopenharmony_ci /* works only when pix2 is actually unaligned */ 87cabdff1aSopenharmony_ci do { /* do 16 pixel a time */ 88cabdff1aSopenharmony_ci uint64_t p1_l, p1_r, p2_l, p2_r; 89cabdff1aSopenharmony_ci uint64_t t; 90cabdff1aSopenharmony_ci 91cabdff1aSopenharmony_ci p1_l = ldq(pix1); 92cabdff1aSopenharmony_ci p1_r = ldq(pix1 + 8); 93cabdff1aSopenharmony_ci t = ldq_u(pix2 + 8); 94cabdff1aSopenharmony_ci p2_l = extql(ldq_u(pix2), pix2) | extqh(t, pix2); 95cabdff1aSopenharmony_ci p2_r = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2); 96cabdff1aSopenharmony_ci pix1 += line_size; 97cabdff1aSopenharmony_ci pix2 += line_size; 98cabdff1aSopenharmony_ci 99cabdff1aSopenharmony_ci result += perr(p1_l, p2_l) 100cabdff1aSopenharmony_ci + perr(p1_r, p2_r); 101cabdff1aSopenharmony_ci } while (--h); 102cabdff1aSopenharmony_ci } else { 103cabdff1aSopenharmony_ci do { 104cabdff1aSopenharmony_ci uint64_t p1_l, p1_r, p2_l, p2_r; 105cabdff1aSopenharmony_ci 106cabdff1aSopenharmony_ci p1_l = ldq(pix1); 107cabdff1aSopenharmony_ci p1_r = ldq(pix1 + 8); 108cabdff1aSopenharmony_ci p2_l = ldq(pix2); 109cabdff1aSopenharmony_ci p2_r = ldq(pix2 + 8); 110cabdff1aSopenharmony_ci pix1 += line_size; 111cabdff1aSopenharmony_ci pix2 += line_size; 112cabdff1aSopenharmony_ci 113cabdff1aSopenharmony_ci result += perr(p1_l, p2_l) 114cabdff1aSopenharmony_ci + perr(p1_r, p2_r); 115cabdff1aSopenharmony_ci } while (--h); 116cabdff1aSopenharmony_ci } 117cabdff1aSopenharmony_ci 118cabdff1aSopenharmony_ci return result; 119cabdff1aSopenharmony_ci} 120cabdff1aSopenharmony_ci#endif 121cabdff1aSopenharmony_ci 122cabdff1aSopenharmony_cistatic int pix_abs16x16_x2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) 123cabdff1aSopenharmony_ci{ 124cabdff1aSopenharmony_ci int result = 0; 125cabdff1aSopenharmony_ci uint64_t disalign = (size_t) pix2 & 0x7; 126cabdff1aSopenharmony_ci 127cabdff1aSopenharmony_ci switch (disalign) { 128cabdff1aSopenharmony_ci case 0: 129cabdff1aSopenharmony_ci do { 130cabdff1aSopenharmony_ci uint64_t p1_l, p1_r, p2_l, p2_r; 131cabdff1aSopenharmony_ci uint64_t l, r; 132cabdff1aSopenharmony_ci 133cabdff1aSopenharmony_ci p1_l = ldq(pix1); 134cabdff1aSopenharmony_ci p1_r = ldq(pix1 + 8); 135cabdff1aSopenharmony_ci l = ldq(pix2); 136cabdff1aSopenharmony_ci r = ldq(pix2 + 8); 137cabdff1aSopenharmony_ci p2_l = avg2(l, (l >> 8) | ((uint64_t) r << 56)); 138cabdff1aSopenharmony_ci p2_r = avg2(r, (r >> 8) | ((uint64_t) pix2[16] << 56)); 139cabdff1aSopenharmony_ci pix1 += line_size; 140cabdff1aSopenharmony_ci pix2 += line_size; 141cabdff1aSopenharmony_ci 142cabdff1aSopenharmony_ci result += perr(p1_l, p2_l) 143cabdff1aSopenharmony_ci + perr(p1_r, p2_r); 144cabdff1aSopenharmony_ci } while (--h); 145cabdff1aSopenharmony_ci break; 146cabdff1aSopenharmony_ci case 7: 147cabdff1aSopenharmony_ci /* |.......l|lllllllr|rrrrrrr*| 148cabdff1aSopenharmony_ci This case is special because disalign1 would be 8, which 149cabdff1aSopenharmony_ci gets treated as 0 by extqh. At least it is a bit faster 150cabdff1aSopenharmony_ci that way :) */ 151cabdff1aSopenharmony_ci do { 152cabdff1aSopenharmony_ci uint64_t p1_l, p1_r, p2_l, p2_r; 153cabdff1aSopenharmony_ci uint64_t l, m, r; 154cabdff1aSopenharmony_ci 155cabdff1aSopenharmony_ci p1_l = ldq(pix1); 156cabdff1aSopenharmony_ci p1_r = ldq(pix1 + 8); 157cabdff1aSopenharmony_ci l = ldq_u(pix2); 158cabdff1aSopenharmony_ci m = ldq_u(pix2 + 8); 159cabdff1aSopenharmony_ci r = ldq_u(pix2 + 16); 160cabdff1aSopenharmony_ci p2_l = avg2(extql(l, disalign) | extqh(m, disalign), m); 161cabdff1aSopenharmony_ci p2_r = avg2(extql(m, disalign) | extqh(r, disalign), r); 162cabdff1aSopenharmony_ci pix1 += line_size; 163cabdff1aSopenharmony_ci pix2 += line_size; 164cabdff1aSopenharmony_ci 165cabdff1aSopenharmony_ci result += perr(p1_l, p2_l) 166cabdff1aSopenharmony_ci + perr(p1_r, p2_r); 167cabdff1aSopenharmony_ci } while (--h); 168cabdff1aSopenharmony_ci break; 169cabdff1aSopenharmony_ci default: 170cabdff1aSopenharmony_ci do { 171cabdff1aSopenharmony_ci uint64_t disalign1 = disalign + 1; 172cabdff1aSopenharmony_ci uint64_t p1_l, p1_r, p2_l, p2_r; 173cabdff1aSopenharmony_ci uint64_t l, m, r; 174cabdff1aSopenharmony_ci 175cabdff1aSopenharmony_ci p1_l = ldq(pix1); 176cabdff1aSopenharmony_ci p1_r = ldq(pix1 + 8); 177cabdff1aSopenharmony_ci l = ldq_u(pix2); 178cabdff1aSopenharmony_ci m = ldq_u(pix2 + 8); 179cabdff1aSopenharmony_ci r = ldq_u(pix2 + 16); 180cabdff1aSopenharmony_ci p2_l = avg2(extql(l, disalign) | extqh(m, disalign), 181cabdff1aSopenharmony_ci extql(l, disalign1) | extqh(m, disalign1)); 182cabdff1aSopenharmony_ci p2_r = avg2(extql(m, disalign) | extqh(r, disalign), 183cabdff1aSopenharmony_ci extql(m, disalign1) | extqh(r, disalign1)); 184cabdff1aSopenharmony_ci pix1 += line_size; 185cabdff1aSopenharmony_ci pix2 += line_size; 186cabdff1aSopenharmony_ci 187cabdff1aSopenharmony_ci result += perr(p1_l, p2_l) 188cabdff1aSopenharmony_ci + perr(p1_r, p2_r); 189cabdff1aSopenharmony_ci } while (--h); 190cabdff1aSopenharmony_ci break; 191cabdff1aSopenharmony_ci } 192cabdff1aSopenharmony_ci return result; 193cabdff1aSopenharmony_ci} 194cabdff1aSopenharmony_ci 195cabdff1aSopenharmony_cistatic int pix_abs16x16_y2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) 196cabdff1aSopenharmony_ci{ 197cabdff1aSopenharmony_ci int result = 0; 198cabdff1aSopenharmony_ci 199cabdff1aSopenharmony_ci if ((size_t) pix2 & 0x7) { 200cabdff1aSopenharmony_ci uint64_t t, p2_l, p2_r; 201cabdff1aSopenharmony_ci t = ldq_u(pix2 + 8); 202cabdff1aSopenharmony_ci p2_l = extql(ldq_u(pix2), pix2) | extqh(t, pix2); 203cabdff1aSopenharmony_ci p2_r = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2); 204cabdff1aSopenharmony_ci 205cabdff1aSopenharmony_ci do { 206cabdff1aSopenharmony_ci uint64_t p1_l, p1_r, np2_l, np2_r; 207cabdff1aSopenharmony_ci uint64_t t; 208cabdff1aSopenharmony_ci 209cabdff1aSopenharmony_ci p1_l = ldq(pix1); 210cabdff1aSopenharmony_ci p1_r = ldq(pix1 + 8); 211cabdff1aSopenharmony_ci pix2 += line_size; 212cabdff1aSopenharmony_ci t = ldq_u(pix2 + 8); 213cabdff1aSopenharmony_ci np2_l = extql(ldq_u(pix2), pix2) | extqh(t, pix2); 214cabdff1aSopenharmony_ci np2_r = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2); 215cabdff1aSopenharmony_ci 216cabdff1aSopenharmony_ci result += perr(p1_l, avg2(p2_l, np2_l)) 217cabdff1aSopenharmony_ci + perr(p1_r, avg2(p2_r, np2_r)); 218cabdff1aSopenharmony_ci 219cabdff1aSopenharmony_ci pix1 += line_size; 220cabdff1aSopenharmony_ci p2_l = np2_l; 221cabdff1aSopenharmony_ci p2_r = np2_r; 222cabdff1aSopenharmony_ci 223cabdff1aSopenharmony_ci } while (--h); 224cabdff1aSopenharmony_ci } else { 225cabdff1aSopenharmony_ci uint64_t p2_l, p2_r; 226cabdff1aSopenharmony_ci p2_l = ldq(pix2); 227cabdff1aSopenharmony_ci p2_r = ldq(pix2 + 8); 228cabdff1aSopenharmony_ci do { 229cabdff1aSopenharmony_ci uint64_t p1_l, p1_r, np2_l, np2_r; 230cabdff1aSopenharmony_ci 231cabdff1aSopenharmony_ci p1_l = ldq(pix1); 232cabdff1aSopenharmony_ci p1_r = ldq(pix1 + 8); 233cabdff1aSopenharmony_ci pix2 += line_size; 234cabdff1aSopenharmony_ci np2_l = ldq(pix2); 235cabdff1aSopenharmony_ci np2_r = ldq(pix2 + 8); 236cabdff1aSopenharmony_ci 237cabdff1aSopenharmony_ci result += perr(p1_l, avg2(p2_l, np2_l)) 238cabdff1aSopenharmony_ci + perr(p1_r, avg2(p2_r, np2_r)); 239cabdff1aSopenharmony_ci 240cabdff1aSopenharmony_ci pix1 += line_size; 241cabdff1aSopenharmony_ci p2_l = np2_l; 242cabdff1aSopenharmony_ci p2_r = np2_r; 243cabdff1aSopenharmony_ci } while (--h); 244cabdff1aSopenharmony_ci } 245cabdff1aSopenharmony_ci return result; 246cabdff1aSopenharmony_ci} 247cabdff1aSopenharmony_ci 248cabdff1aSopenharmony_cistatic int pix_abs16x16_xy2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) 249cabdff1aSopenharmony_ci{ 250cabdff1aSopenharmony_ci int result = 0; 251cabdff1aSopenharmony_ci 252cabdff1aSopenharmony_ci uint64_t p1_l, p1_r; 253cabdff1aSopenharmony_ci uint64_t p2_l, p2_r, p2_x; 254cabdff1aSopenharmony_ci 255cabdff1aSopenharmony_ci p1_l = ldq(pix1); 256cabdff1aSopenharmony_ci p1_r = ldq(pix1 + 8); 257cabdff1aSopenharmony_ci 258cabdff1aSopenharmony_ci if ((size_t) pix2 & 0x7) { /* could be optimized a lot */ 259cabdff1aSopenharmony_ci p2_l = uldq(pix2); 260cabdff1aSopenharmony_ci p2_r = uldq(pix2 + 8); 261cabdff1aSopenharmony_ci p2_x = (uint64_t) pix2[16] << 56; 262cabdff1aSopenharmony_ci } else { 263cabdff1aSopenharmony_ci p2_l = ldq(pix2); 264cabdff1aSopenharmony_ci p2_r = ldq(pix2 + 8); 265cabdff1aSopenharmony_ci p2_x = ldq(pix2 + 16) << 56; 266cabdff1aSopenharmony_ci } 267cabdff1aSopenharmony_ci 268cabdff1aSopenharmony_ci do { 269cabdff1aSopenharmony_ci uint64_t np1_l, np1_r; 270cabdff1aSopenharmony_ci uint64_t np2_l, np2_r, np2_x; 271cabdff1aSopenharmony_ci 272cabdff1aSopenharmony_ci pix1 += line_size; 273cabdff1aSopenharmony_ci pix2 += line_size; 274cabdff1aSopenharmony_ci 275cabdff1aSopenharmony_ci np1_l = ldq(pix1); 276cabdff1aSopenharmony_ci np1_r = ldq(pix1 + 8); 277cabdff1aSopenharmony_ci 278cabdff1aSopenharmony_ci if ((size_t) pix2 & 0x7) { /* could be optimized a lot */ 279cabdff1aSopenharmony_ci np2_l = uldq(pix2); 280cabdff1aSopenharmony_ci np2_r = uldq(pix2 + 8); 281cabdff1aSopenharmony_ci np2_x = (uint64_t) pix2[16] << 56; 282cabdff1aSopenharmony_ci } else { 283cabdff1aSopenharmony_ci np2_l = ldq(pix2); 284cabdff1aSopenharmony_ci np2_r = ldq(pix2 + 8); 285cabdff1aSopenharmony_ci np2_x = ldq(pix2 + 16) << 56; 286cabdff1aSopenharmony_ci } 287cabdff1aSopenharmony_ci 288cabdff1aSopenharmony_ci result += perr(p1_l, 289cabdff1aSopenharmony_ci avg4( p2_l, ( p2_l >> 8) | ((uint64_t) p2_r << 56), 290cabdff1aSopenharmony_ci np2_l, (np2_l >> 8) | ((uint64_t) np2_r << 56))) 291cabdff1aSopenharmony_ci + perr(p1_r, 292cabdff1aSopenharmony_ci avg4( p2_r, ( p2_r >> 8) | ((uint64_t) p2_x), 293cabdff1aSopenharmony_ci np2_r, (np2_r >> 8) | ((uint64_t) np2_x))); 294cabdff1aSopenharmony_ci 295cabdff1aSopenharmony_ci p1_l = np1_l; 296cabdff1aSopenharmony_ci p1_r = np1_r; 297cabdff1aSopenharmony_ci p2_l = np2_l; 298cabdff1aSopenharmony_ci p2_r = np2_r; 299cabdff1aSopenharmony_ci p2_x = np2_x; 300cabdff1aSopenharmony_ci } while (--h); 301cabdff1aSopenharmony_ci 302cabdff1aSopenharmony_ci return result; 303cabdff1aSopenharmony_ci} 304cabdff1aSopenharmony_ci 305cabdff1aSopenharmony_ciav_cold void ff_me_cmp_init_alpha(MECmpContext *c, AVCodecContext *avctx) 306cabdff1aSopenharmony_ci{ 307cabdff1aSopenharmony_ci /* amask clears all bits that correspond to present features. */ 308cabdff1aSopenharmony_ci if (amask(AMASK_MVI) == 0) { 309cabdff1aSopenharmony_ci c->sad[0] = pix_abs16x16_mvi_asm; 310cabdff1aSopenharmony_ci c->sad[1] = pix_abs8x8_mvi; 311cabdff1aSopenharmony_ci c->pix_abs[0][0] = pix_abs16x16_mvi_asm; 312cabdff1aSopenharmony_ci c->pix_abs[1][0] = pix_abs8x8_mvi; 313cabdff1aSopenharmony_ci c->pix_abs[0][1] = pix_abs16x16_x2_mvi; 314cabdff1aSopenharmony_ci c->pix_abs[0][2] = pix_abs16x16_y2_mvi; 315cabdff1aSopenharmony_ci c->pix_abs[0][3] = pix_abs16x16_xy2_mvi; 316cabdff1aSopenharmony_ci } 317cabdff1aSopenharmony_ci} 318