/*
 * Alpha optimized DSP utils
 * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavcodec/me_cmp.h"
#include "asm.h"

int pix_abs16x16_mvi_asm(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h);

static inline uint64_t avg2(uint64_t a, uint64_t b)
{
    return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}

static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
{
    uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
                + ((l2 & ~BYTE_VEC(0x03)) >> 2)
                + ((l3 & ~BYTE_VEC(0x03)) >> 2)
                + ((l4 & ~BYTE_VEC(0x03)) >> 2);
    uint64_t r2 = ((  (l1 & BYTE_VEC(0x03))
                    + (l2 & BYTE_VEC(0x03))
                    + (l3 & BYTE_VEC(0x03))
                    + (l4 & BYTE_VEC(0x03))
                    + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
    return r1 + r2;
}

static int pix_abs8x8_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0;

    if ((size_t) pix2 & 0x7) {
        /* works only when pix2 is actually unaligned */
        do {                    /* do 8 pixels at a time */
            uint64_t p1, p2;

            p1 = ldq(pix1);
            p2 = uldq(pix2);
            result += perr(p1, p2);

            pix1 += line_size;
            pix2 += line_size;
        } while (--h);
    } else {
        do {
            uint64_t p1, p2;

            p1 = ldq(pix1);
            p2 = ldq(pix2);
            result += perr(p1, p2);

            pix1 += line_size;
            pix2 += line_size;
        } while (--h);
    }

    return result;
}

#if 0 /* now done in assembly */
int pix_abs16x16_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int result = 0;
    int h = 16;

    if ((size_t) pix2 & 0x7) {
        /* works only when pix2 is actually unaligned */
        do {                    /* do 16 pixels at a time */
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t t;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            t    = ldq_u(pix2 + 8);
            p2_l = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
            p2_r = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
    } else {
        do {
            uint64_t p1_l, p1_r, p2_l, p2_r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            p2_l = ldq(pix2);
            p2_r = ldq(pix2 + 8);
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
    }

    return result;
}
#endif

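/* Illustration only, not from the original file: avg2() above is the
 * classic SWAR rounding average avg(a, b) = (a | b) - ((a ^ b) >> 1)
 * applied to eight bytes at once; the BYTE_VEC(0xfe) mask keeps the
 * shifted bits from bleeding across byte lanes. avg4() computes
 * (l1 + l2 + l3 + l4 + 2) >> 2 per byte by summing the high six bits
 * and the low two bits of each lane separately so no lane can overflow.
 * A per-byte reference sketch follows; avg2_ref is a hypothetical name,
 * kept out of the build like the disabled function above. */
#if 0
static uint64_t avg2_ref(uint64_t a, uint64_t b)
{
    uint64_t r = 0;
    int i;

    for (i = 0; i < 8; i++) {
        uint64_t x = (a >> (8 * i)) & 0xff;
        uint64_t y = (b >> (8 * i)) & 0xff;
        r |= ((x + y + 1) >> 1) << (8 * i);     /* rounding byte average */
    }
    return r;
}
#endif
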
static int pix_abs16x16_x2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0;
    uint64_t disalign = (size_t) pix2 & 0x7;

    switch (disalign) {
    case 0:
        do {
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t l, r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            l    = ldq(pix2);
            r    = ldq(pix2 + 8);
            p2_l = avg2(l, (l >> 8) | ((uint64_t) r << 56));
            p2_r = avg2(r, (r >> 8) | ((uint64_t) pix2[16] << 56));
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
        break;
    case 7:
        /* |.......l|lllllllr|rrrrrrr*|
           This case is special because disalign1 would be 8, which
           gets treated as 0 by extqh. At least it is a bit faster
           that way :) */
        do {
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t l, m, r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            l    = ldq_u(pix2);
            m    = ldq_u(pix2 + 8);
            r    = ldq_u(pix2 + 16);
            p2_l = avg2(extql(l, disalign) | extqh(m, disalign), m);
            p2_r = avg2(extql(m, disalign) | extqh(r, disalign), r);
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
        break;
    default:
        do {
            uint64_t disalign1 = disalign + 1;
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t l, m, r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            l    = ldq_u(pix2);
            m    = ldq_u(pix2 + 8);
            r    = ldq_u(pix2 + 16);
            p2_l = avg2(extql(l, disalign) | extqh(m, disalign),
                        extql(l, disalign1) | extqh(m, disalign1));
            p2_r = avg2(extql(m, disalign) | extqh(r, disalign),
                        extql(m, disalign1) | extqh(r, disalign1));
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
        break;
    }
    return result;
}

static int pix_abs16x16_y2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0;

    if ((size_t) pix2 & 0x7) {
        uint64_t t, p2_l, p2_r;

        t    = ldq_u(pix2 + 8);
        p2_l = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
        p2_r = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);

        do {
            uint64_t p1_l, p1_r, np2_l, np2_r;
            uint64_t t;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            pix2 += line_size;
            t     = ldq_u(pix2 + 8);
            np2_l = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
            np2_r = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);

            result += perr(p1_l, avg2(p2_l, np2_l))
                    + perr(p1_r, avg2(p2_r, np2_r));

            pix1 += line_size;
            p2_l  = np2_l;
            p2_r  = np2_r;
        } while (--h);
    } else {
        uint64_t p2_l, p2_r;

        p2_l = ldq(pix2);
        p2_r = ldq(pix2 + 8);
        do {
            uint64_t p1_l, p1_r, np2_l, np2_r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            pix2 += line_size;
            np2_l = ldq(pix2);
            np2_r = ldq(pix2 + 8);

            result += perr(p1_l, avg2(p2_l, np2_l))
                    + perr(p1_r, avg2(p2_r, np2_r));

            pix1 += line_size;
            p2_l  = np2_l;
            p2_r  = np2_r;
        } while (--h);
    }
    return result;
}

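/* Illustration only, not from the original file: the extql/extqh pairs
 * in the unaligned branches above merge two aligned ldq_u loads into one
 * unaligned quadword (the same job uldq() does); reusing the middle
 * quadword t makes a 16-byte row cost three ldq_u instead of four.
 * Because extqh takes its shift count mod 8 bytes (see the case 7
 * comment in pix_abs16x16_x2_mvi), an 8-byte-aligned pix2 would make it
 * contribute the whole second quadword instead of nothing, which is why
 * the aligned case gets a separate plain-ldq branch. A little-endian
 * per-byte reference sketch, uldq_ref being a hypothetical name: */
#if 0
static uint64_t uldq_ref(const uint8_t *p)
{
    uint64_t v = 0;
    int i;

    for (i = 0; i < 8; i++)
        v |= (uint64_t) p[i] << (8 * i);        /* gather bytes LSB first */
    return v;
}
#endif
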
static int pix_abs16x16_xy2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0;

    uint64_t p1_l, p1_r;
    uint64_t p2_l, p2_r, p2_x;

    p1_l = ldq(pix1);
    p1_r = ldq(pix1 + 8);

    if ((size_t) pix2 & 0x7) { /* could be optimized a lot */
        p2_l = uldq(pix2);
        p2_r = uldq(pix2 + 8);
        p2_x = (uint64_t) pix2[16] << 56;
    } else {
        p2_l = ldq(pix2);
        p2_r = ldq(pix2 + 8);
        p2_x = ldq(pix2 + 16) << 56;
    }

    do {
        uint64_t np1_l, np1_r;
        uint64_t np2_l, np2_r, np2_x;

        pix1 += line_size;
        pix2 += line_size;

        np1_l = ldq(pix1);
        np1_r = ldq(pix1 + 8);

        if ((size_t) pix2 & 0x7) { /* could be optimized a lot */
            np2_l = uldq(pix2);
            np2_r = uldq(pix2 + 8);
            np2_x = (uint64_t) pix2[16] << 56;
        } else {
            np2_l = ldq(pix2);
            np2_r = ldq(pix2 + 8);
            np2_x = ldq(pix2 + 16) << 56;
        }

        result += perr(p1_l,
                       avg4( p2_l, ( p2_l >> 8) | ((uint64_t)  p2_r << 56),
                            np2_l, (np2_l >> 8) | ((uint64_t) np2_r << 56)))
                + perr(p1_r,
                       avg4( p2_r, ( p2_r >> 8) | ((uint64_t)  p2_x),
                            np2_r, (np2_r >> 8) | ((uint64_t) np2_x)));

        p1_l = np1_l;
        p1_r = np1_r;
        p2_l = np2_l;
        p2_r = np2_r;
        p2_x = np2_x;
    } while (--h);

    return result;
}

av_cold void ff_me_cmp_init_alpha(MECmpContext *c, AVCodecContext *avctx)
{
    /* amask clears all bits that correspond to present features. */
    if (amask(AMASK_MVI) == 0) {
        c->sad[0]        = pix_abs16x16_mvi_asm;
        c->sad[1]        = pix_abs8x8_mvi;
        c->pix_abs[0][0] = pix_abs16x16_mvi_asm;
        c->pix_abs[1][0] = pix_abs8x8_mvi;
        c->pix_abs[0][1] = pix_abs16x16_x2_mvi;
        c->pix_abs[0][2] = pix_abs16x16_y2_mvi;
        c->pix_abs[0][3] = pix_abs16x16_xy2_mvi;
    }
}

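/* Illustration only, not from the original file: perr() maps to the MVI
 * PERR instruction, which sums the absolute differences of the eight
 * byte pairs in its operands, so each loop iteration above accumulates
 * the SAD of one 8- or 16-pixel row. A scalar reference sketch, perr_ref
 * being a hypothetical name: */
#if 0
static int perr_ref(uint64_t a, uint64_t b)
{
    int i, sum = 0;

    for (i = 0; i < 8; i++) {
        int x = (a >> (8 * i)) & 0xff;
        int y = (b >> (8 * i)) & 0xff;
        sum += x > y ? x - y : y - x;           /* per-byte absolute difference */
    }
    return sum;
}
#endif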