/*
 * Copyright (c) 2018 gxw <guxiwei-hf@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <string.h>

#include "vp3dsp_mips.h"
#include "libavutil/mips/generic_macros_msa.h"
#include "libavutil/intreadwrite.h"
#include "libavcodec/rnd_avg.h"

static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type)
{
    v8i16 r0, r1, r2, r3, r4, r5, r6, r7, sign;
    v4i32 r0_r, r0_l, r1_r, r1_l, r2_r, r2_l, r3_r, r3_l,
          r4_r, r4_l, r5_r, r5_l, r6_r, r6_l, r7_r, r7_l;
    v4i32 A, B, C, D, Ad, Bd, Cd, Dd, E, F, G, H;
    v4i32 Ed, Gd, Add, Bdd, Fd, Hd;
    v16u8 sign_l;
    v16i8 d0, d1, d2, d3, d4, d5, d6, d7;
    v4i32 c0, c1, c2, c3, c4, c5, c6, c7;
    v4i32 f0, f1, f2, f3, f4, f5, f6, f7;
    v4i32 sign_t;
    v16i8 zero = {0};
    /* Packs the low byte of each 32-bit lane from a vector pair into
     * the first eight bytes of the result. */
    v16i8 mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0};
    /* VP3 IDCT cosine constants, cos(n * pi / 16) in Q16 fixed point,
     * identical to those of the scalar transform in libavcodec/vp3dsp.c */
    v4i32 cnst64277w = {64277, 64277, 64277, 64277};
    v4i32 cnst60547w = {60547, 60547, 60547, 60547};
    v4i32 cnst54491w = {54491, 54491, 54491, 54491};
    v4i32 cnst46341w = {46341, 46341, 46341, 46341};
    v4i32 cnst36410w = {36410, 36410, 36410, 36410};
    v4i32 cnst25080w = {25080, 25080, 25080, 25080};
    v4i32 cnst12785w = {12785, 12785, 12785, 12785};
    v4i32 cnst8w = {8, 8, 8, 8};
    v4i32 cnst2048w = {2048, 2048, 2048, 2048};
    v4i32 cnst128w = {128, 128, 128, 128};

    /* Extended input data: sign-extend the 16-bit coefficients to 32 bits */
    LD_SH8(input, 8, r0, r1, r2, r3, r4, r5, r6, r7);
    sign = __msa_clti_s_h(r0, 0);
    r0_r = (v4i32) __msa_ilvr_h(sign, r0);
    r0_l = (v4i32) __msa_ilvl_h(sign, r0);
    sign = __msa_clti_s_h(r1, 0);
    r1_r = (v4i32) __msa_ilvr_h(sign, r1);
    r1_l = (v4i32) __msa_ilvl_h(sign, r1);
    sign = __msa_clti_s_h(r2, 0);
    r2_r = (v4i32) __msa_ilvr_h(sign, r2);
    r2_l = (v4i32) __msa_ilvl_h(sign, r2);
    sign = __msa_clti_s_h(r3, 0);
    r3_r = (v4i32) __msa_ilvr_h(sign, r3);
    r3_l = (v4i32) __msa_ilvl_h(sign, r3);
    sign = __msa_clti_s_h(r4, 0);
    r4_r = (v4i32) __msa_ilvr_h(sign, r4);
    r4_l = (v4i32) __msa_ilvl_h(sign, r4);
    sign = __msa_clti_s_h(r5, 0);
    r5_r = (v4i32) __msa_ilvr_h(sign, r5);
    r5_l = (v4i32) __msa_ilvl_h(sign, r5);
    sign = __msa_clti_s_h(r6, 0);
    r6_r = (v4i32) __msa_ilvr_h(sign, r6);
    r6_l = (v4i32) __msa_ilvl_h(sign, r6);
    sign = __msa_clti_s_h(r7, 0);
    r7_r = (v4i32) __msa_ilvr_h(sign, r7);
    r7_l = (v4i32) __msa_ilvl_h(sign, r7);
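
    /*
     * Each rN_r/rN_l pair now holds row N widened to 32 bits: the "_r"
     * vectors carry columns 0-3 and the "_l" vectors columns 4-7, so the
     * first 1-D pass below transforms four columns per vector, handling
     * the right and left halves of the block separately.
     */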
    /* Right part */
    A = ((r1_r * cnst64277w) >> 16) + ((r7_r * cnst12785w) >> 16);
    B = ((r1_r * cnst12785w) >> 16) - ((r7_r * cnst64277w) >> 16);
    C = ((r3_r * cnst54491w) >> 16) + ((r5_r * cnst36410w) >> 16);
    D = ((r5_r * cnst54491w) >> 16) - ((r3_r * cnst36410w) >> 16);
    Ad = ((A - C) * cnst46341w) >> 16;
    Bd = ((B - D) * cnst46341w) >> 16;
    Cd = A + C;
    Dd = B + D;
    E = ((r0_r + r4_r) * cnst46341w) >> 16;
    F = ((r0_r - r4_r) * cnst46341w) >> 16;
    G = ((r2_r * cnst60547w) >> 16) + ((r6_r * cnst25080w) >> 16);
    H = ((r2_r * cnst25080w) >> 16) - ((r6_r * cnst60547w) >> 16);
    Ed = E - G;
    Gd = E + G;
    Add = F + Ad;
    Bdd = Bd - H;
    Fd = F - Ad;
    Hd = Bd + H;
    r0_r = Gd + Cd;
    r7_r = Gd - Cd;
    r1_r = Add + Hd;
    r2_r = Add - Hd;
    r3_r = Ed + Dd;
    r4_r = Ed - Dd;
    r5_r = Fd + Bdd;
    r6_r = Fd - Bdd;

    /* Left part */
    A = ((r1_l * cnst64277w) >> 16) + ((r7_l * cnst12785w) >> 16);
    B = ((r1_l * cnst12785w) >> 16) - ((r7_l * cnst64277w) >> 16);
    C = ((r3_l * cnst54491w) >> 16) + ((r5_l * cnst36410w) >> 16);
    D = ((r5_l * cnst54491w) >> 16) - ((r3_l * cnst36410w) >> 16);
    Ad = ((A - C) * cnst46341w) >> 16;
    Bd = ((B - D) * cnst46341w) >> 16;
    Cd = A + C;
    Dd = B + D;
    E = ((r0_l + r4_l) * cnst46341w) >> 16;
    F = ((r0_l - r4_l) * cnst46341w) >> 16;
    G = ((r2_l * cnst60547w) >> 16) + ((r6_l * cnst25080w) >> 16);
    H = ((r2_l * cnst25080w) >> 16) - ((r6_l * cnst60547w) >> 16);
    Ed = E - G;
    Gd = E + G;
    Add = F + Ad;
    Bdd = Bd - H;
    Fd = F - Ad;
    Hd = Bd + H;
    r0_l = Gd + Cd;
    r7_l = Gd - Cd;
    r1_l = Add + Hd;
    r2_l = Add - Hd;
    r3_l = Ed + Dd;
    r4_l = Ed - Dd;
    r5_l = Fd + Bdd;
    r6_l = Fd - Bdd;

    /* Row 0 to 3 */
    TRANSPOSE4x4_SW_SW(r0_r, r1_r, r2_r, r3_r,
                       r0_r, r1_r, r2_r, r3_r);
    TRANSPOSE4x4_SW_SW(r0_l, r1_l, r2_l, r3_l,
                       r0_l, r1_l, r2_l, r3_l);
    A = ((r1_r * cnst64277w) >> 16) + ((r3_l * cnst12785w) >> 16);
    B = ((r1_r * cnst12785w) >> 16) - ((r3_l * cnst64277w) >> 16);
    C = ((r3_r * cnst54491w) >> 16) + ((r1_l * cnst36410w) >> 16);
    D = ((r1_l * cnst54491w) >> 16) - ((r3_r * cnst36410w) >> 16);
    Ad = ((A - C) * cnst46341w) >> 16;
    Bd = ((B - D) * cnst46341w) >> 16;
    Cd = A + C;
    Dd = B + D;
    E = ((r0_r + r0_l) * cnst46341w) >> 16;
    E += cnst8w;
    F = ((r0_r - r0_l) * cnst46341w) >> 16;
    F += cnst8w;
    if (type == 1) { // HACK
        E += cnst2048w;
        F += cnst2048w;
    }
    G = ((r2_r * cnst60547w) >> 16) + ((r2_l * cnst25080w) >> 16);
    H = ((r2_r * cnst25080w) >> 16) - ((r2_l * cnst60547w) >> 16);
    Ed = E - G;
    Gd = E + G;
    Add = F + Ad;
    Bdd = Bd - H;
    Fd = F - Ad;
    Hd = Bd + H;
    A = (Gd + Cd) >> 4;
    B = (Gd - Cd) >> 4;
    C = (Add + Hd) >> 4;
    D = (Add - Hd) >> 4;
    E = (Ed + Dd) >> 4;
    F = (Ed - Dd) >> 4;
    G = (Fd + Bdd) >> 4;
    H = (Fd - Bdd) >> 4;
    if (type != 1) {
        LD_SB8(dst, stride, d0, d1, d2, d3, d4, d5, d6, d7);
        ILVR_B4_SW(zero, d0, zero, d1, zero, d2, zero, d3,
                   f0, f1, f2, f3);
        ILVR_B4_SW(zero, d4, zero, d5, zero, d6, zero, d7,
                   f4, f5, f6, f7);
        ILVR_H4_SW(zero, f0, zero, f1, zero, f2, zero, f3,
                   c0, c1, c2, c3);
        ILVR_H4_SW(zero, f4, zero, f5, zero, f6, zero, f7,
                   c4, c5, c6, c7);
        A += c0;
        B += c7;
        C += c1;
        D += c2;
        E += c3;
        F += c4;
        G += c5;
        H += c6;
    }
    CLIP_SW8_0_255(A, B, C, D, E, F, G, H);
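
    /*
     * Zero-AC shortcut, as in the scalar second pass: after the transpose,
     * r0_r holds element 0 (DC) of rows 0-3 and the remaining vectors hold
     * elements 1-7. The OR-reduction below builds a lane mask (sign_t) that
     * is all-ones for rows whose AC coefficients are all zero; those rows
     * take only the rounded DC value, and the two masked result sets are
     * merged afterwards.
     */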
    sign_l = __msa_or_v((v16u8)r1_r, (v16u8)r2_r);
    sign_l = __msa_or_v(sign_l, (v16u8)r3_r);
    sign_l = __msa_or_v(sign_l, (v16u8)r0_l);
    sign_l = __msa_or_v(sign_l, (v16u8)r1_l);
    sign_l = __msa_or_v(sign_l, (v16u8)r2_l);
    sign_l = __msa_or_v(sign_l, (v16u8)r3_l);
    sign_t = __msa_ceqi_w((v4i32)sign_l, 0);
    Add = ((r0_r * cnst46341w) + (8 << 16)) >> 20;
    if (type == 1) {
        Bdd = Add + cnst128w;
        CLIP_SW_0_255(Bdd);
        Ad = Bdd;
        Bd = Bdd;
        Cd = Bdd;
        Dd = Bdd;
        Ed = Bdd;
        Fd = Bdd;
        Gd = Bdd;
        Hd = Bdd;
    } else {
        Ad = Add + c0;
        Bd = Add + c1;
        Cd = Add + c2;
        Dd = Add + c3;
        Ed = Add + c4;
        Fd = Add + c5;
        Gd = Add + c6;
        Hd = Add + c7;
        CLIP_SW8_0_255(Ad, Bd, Cd, Dd, Ed, Fd, Gd, Hd);
    }
    Ad = (v4i32)__msa_and_v((v16u8)Ad, (v16u8)sign_t);
    Bd = (v4i32)__msa_and_v((v16u8)Bd, (v16u8)sign_t);
    Cd = (v4i32)__msa_and_v((v16u8)Cd, (v16u8)sign_t);
    Dd = (v4i32)__msa_and_v((v16u8)Dd, (v16u8)sign_t);
    Ed = (v4i32)__msa_and_v((v16u8)Ed, (v16u8)sign_t);
    Fd = (v4i32)__msa_and_v((v16u8)Fd, (v16u8)sign_t);
    Gd = (v4i32)__msa_and_v((v16u8)Gd, (v16u8)sign_t);
    Hd = (v4i32)__msa_and_v((v16u8)Hd, (v16u8)sign_t);
    sign_t = __msa_ceqi_w(sign_t, 0);
    A = (v4i32)__msa_and_v((v16u8)A, (v16u8)sign_t);
    B = (v4i32)__msa_and_v((v16u8)B, (v16u8)sign_t);
    C = (v4i32)__msa_and_v((v16u8)C, (v16u8)sign_t);
    D = (v4i32)__msa_and_v((v16u8)D, (v16u8)sign_t);
    E = (v4i32)__msa_and_v((v16u8)E, (v16u8)sign_t);
    F = (v4i32)__msa_and_v((v16u8)F, (v16u8)sign_t);
    G = (v4i32)__msa_and_v((v16u8)G, (v16u8)sign_t);
    H = (v4i32)__msa_and_v((v16u8)H, (v16u8)sign_t);
    r0_r = Ad + A;
    r1_r = Bd + C;
    r2_r = Cd + D;
    r3_r = Dd + E;
    r0_l = Ed + F;
    r1_l = Fd + G;
    r2_l = Gd + H;
    r3_l = Hd + B;

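    /*
     * Rows 4 to 7 repeat the same second pass as rows 0 to 3 above. Note
     * the type == 1 "HACK": adding 2048 ahead of the >> 4 folds the +128
     * pixel bias into the transform (2048 >> 4 == 128), so the idct_put
     * path never has to read the destination.
     */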
    /* Row 4 to 7 */
    TRANSPOSE4x4_SW_SW(r4_r, r5_r, r6_r, r7_r,
                       r4_r, r5_r, r6_r, r7_r);
    TRANSPOSE4x4_SW_SW(r4_l, r5_l, r6_l, r7_l,
                       r4_l, r5_l, r6_l, r7_l);
    A = ((r5_r * cnst64277w) >> 16) + ((r7_l * cnst12785w) >> 16);
    B = ((r5_r * cnst12785w) >> 16) - ((r7_l * cnst64277w) >> 16);
    C = ((r7_r * cnst54491w) >> 16) + ((r5_l * cnst36410w) >> 16);
    D = ((r5_l * cnst54491w) >> 16) - ((r7_r * cnst36410w) >> 16);
    Ad = ((A - C) * cnst46341w) >> 16;
    Bd = ((B - D) * cnst46341w) >> 16;
    Cd = A + C;
    Dd = B + D;
    E = ((r4_r + r4_l) * cnst46341w) >> 16;
    E += cnst8w;
    F = ((r4_r - r4_l) * cnst46341w) >> 16;
    F += cnst8w;
    if (type == 1) { // HACK
        E += cnst2048w;
        F += cnst2048w;
    }
    G = ((r6_r * cnst60547w) >> 16) + ((r6_l * cnst25080w) >> 16);
    H = ((r6_r * cnst25080w) >> 16) - ((r6_l * cnst60547w) >> 16);
    Ed = E - G;
    Gd = E + G;
    Add = F + Ad;
    Bdd = Bd - H;
    Fd = F - Ad;
    Hd = Bd + H;
    A = (Gd + Cd) >> 4;
    B = (Gd - Cd) >> 4;
    C = (Add + Hd) >> 4;
    D = (Add - Hd) >> 4;
    E = (Ed + Dd) >> 4;
    F = (Ed - Dd) >> 4;
    G = (Fd + Bdd) >> 4;
    H = (Fd - Bdd) >> 4;
    if (type != 1) {
        ILVL_H4_SW(zero, f0, zero, f1, zero, f2, zero, f3,
                   c0, c1, c2, c3);
        ILVL_H4_SW(zero, f4, zero, f5, zero, f6, zero, f7,
                   c4, c5, c6, c7);
        A += c0;
        B += c7;
        C += c1;
        D += c2;
        E += c3;
        F += c4;
        G += c5;
        H += c6;
    }
    CLIP_SW8_0_255(A, B, C, D, E, F, G, H);
    sign_l = __msa_or_v((v16u8)r5_r, (v16u8)r6_r);
    sign_l = __msa_or_v(sign_l, (v16u8)r7_r);
    sign_l = __msa_or_v(sign_l, (v16u8)r4_l);
    sign_l = __msa_or_v(sign_l, (v16u8)r5_l);
    sign_l = __msa_or_v(sign_l, (v16u8)r6_l);
    sign_l = __msa_or_v(sign_l, (v16u8)r7_l);
    sign_t = __msa_ceqi_w((v4i32)sign_l, 0);
    Add = ((r4_r * cnst46341w) + (8 << 16)) >> 20;
    if (type == 1) {
        Bdd = Add + cnst128w;
        CLIP_SW_0_255(Bdd);
        Ad = Bdd;
        Bd = Bdd;
        Cd = Bdd;
        Dd = Bdd;
        Ed = Bdd;
        Fd = Bdd;
        Gd = Bdd;
        Hd = Bdd;
    } else {
        Ad = Add + c0;
        Bd = Add + c1;
        Cd = Add + c2;
        Dd = Add + c3;
        Ed = Add + c4;
        Fd = Add + c5;
        Gd = Add + c6;
        Hd = Add + c7;
        CLIP_SW8_0_255(Ad, Bd, Cd, Dd, Ed, Fd, Gd, Hd);
    }
    Ad = (v4i32)__msa_and_v((v16u8)Ad, (v16u8)sign_t);
    Bd = (v4i32)__msa_and_v((v16u8)Bd, (v16u8)sign_t);
    Cd = (v4i32)__msa_and_v((v16u8)Cd, (v16u8)sign_t);
    Dd = (v4i32)__msa_and_v((v16u8)Dd, (v16u8)sign_t);
    Ed = (v4i32)__msa_and_v((v16u8)Ed, (v16u8)sign_t);
    Fd = (v4i32)__msa_and_v((v16u8)Fd, (v16u8)sign_t);
    Gd = (v4i32)__msa_and_v((v16u8)Gd, (v16u8)sign_t);
    Hd = (v4i32)__msa_and_v((v16u8)Hd, (v16u8)sign_t);
    sign_t = __msa_ceqi_w(sign_t, 0);
    A = (v4i32)__msa_and_v((v16u8)A, (v16u8)sign_t);
    B = (v4i32)__msa_and_v((v16u8)B, (v16u8)sign_t);
    C = (v4i32)__msa_and_v((v16u8)C, (v16u8)sign_t);
    D = (v4i32)__msa_and_v((v16u8)D, (v16u8)sign_t);
    E = (v4i32)__msa_and_v((v16u8)E, (v16u8)sign_t);
    F = (v4i32)__msa_and_v((v16u8)F, (v16u8)sign_t);
    G = (v4i32)__msa_and_v((v16u8)G, (v16u8)sign_t);
    H = (v4i32)__msa_and_v((v16u8)H, (v16u8)sign_t);
    r4_r = Ad + A;
    r5_r = Bd + C;
    r6_r = Cd + D;
    r7_r = Dd + E;
    r4_l = Ed + F;
    r5_l = Fd + G;
    r6_l = Gd + H;
    r7_l = Hd + B;
    VSHF_B2_SB(r0_r, r4_r, r1_r, r5_r, mask, mask, d0, d1);
    VSHF_B2_SB(r2_r, r6_r, r3_r, r7_r, mask, mask, d2, d3);
    VSHF_B2_SB(r0_l, r4_l, r1_l, r5_l, mask, mask, d4, d5);
    VSHF_B2_SB(r2_l, r6_l, r3_l, r7_l, mask, mask, d6, d7);

    /* Final sequence of operations overwrites the original dst */
    ST_D1(d0, 0, dst);
    ST_D1(d1, 0, dst + stride);
    ST_D1(d2, 0, dst + 2 * stride);
    ST_D1(d3, 0, dst + 3 * stride);
    ST_D1(d4, 0, dst + 4 * stride);
    ST_D1(d5, 0, dst + 5 * stride);
    ST_D1(d6, 0, dst + 6 * stride);
    ST_D1(d7, 0, dst + 7 * stride);
}
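
/*
 * idct_msa() renders a block when type == 1 (put: the +128 bias is built
 * in and dst is never read) and accumulates into existing pixels when
 * type == 2 (add). Both wrappers clear the coefficient block afterwards,
 * as the VP3 decoder expects.
 */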
void ff_vp3_idct_put_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
{
    idct_msa(dest, line_size, block, 1);
    memset(block, 0, sizeof(*block) * 64);
}

void ff_vp3_idct_add_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
{
    idct_msa(dest, line_size, block, 2);
    memset(block, 0, sizeof(*block) * 64);
}

void ff_vp3_idct_dc_add_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
{
    /* Rounded DC term, matching the scalar vp3_idct_dc_add() */
    int i = (block[0] + 15) >> 5;
    v4i32 dc = {i, i, i, i};
    v16i8 d0, d1, d2, d3, d4, d5, d6, d7;
    v4i32 c0, c1, c2, c3, c4, c5, c6, c7;
    v4i32 e0, e1, e2, e3, e4, e5, e6, e7;
    v4i32 r0, r1, r2, r3, r4, r5, r6, r7;
    v16i8 mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0};
    v16i8 zero = {0};

    LD_SB8(dest, line_size, d0, d1, d2, d3, d4, d5, d6, d7);
    ILVR_B4_SW(zero, d0, zero, d1, zero, d2, zero, d3,
               c0, c1, c2, c3);
    ILVR_B4_SW(zero, d4, zero, d5, zero, d6, zero, d7,
               c4, c5, c6, c7);
    /* Right part */
    ILVR_H4_SW(zero, c0, zero, c1, zero, c2, zero, c3,
               e0, e1, e2, e3);
    ILVR_H4_SW(zero, c4, zero, c5, zero, c6, zero, c7,
               e4, e5, e6, e7);
    e0 += dc;
    e1 += dc;
    e2 += dc;
    e3 += dc;
    e4 += dc;
    e5 += dc;
    e6 += dc;
    e7 += dc;
    CLIP_SW8_0_255(e0, e1, e2, e3, e4, e5, e6, e7);

    /* Left part */
    ILVL_H4_SW(zero, c0, zero, c1, zero, c2, zero, c3,
               r0, r1, r2, r3);
    ILVL_H4_SW(zero, c4, zero, c5, zero, c6, zero, c7,
               r4, r5, r6, r7);
    r0 += dc;
    r1 += dc;
    r2 += dc;
    r3 += dc;
    r4 += dc;
    r5 += dc;
    r6 += dc;
    r7 += dc;
    CLIP_SW8_0_255(r0, r1, r2, r3, r4, r5, r6, r7);
    VSHF_B2_SB(e0, r0, e1, r1, mask, mask, d0, d1);
    VSHF_B2_SB(e2, r2, e3, r3, mask, mask, d2, d3);
    VSHF_B2_SB(e4, r4, e5, r5, mask, mask, d4, d5);
    VSHF_B2_SB(e6, r6, e7, r7, mask, mask, d6, d7);

    /* Final sequence of operations overwrites the original dst */
    ST_D1(d0, 0, dest);
    ST_D1(d1, 0, dest + line_size);
    ST_D1(d2, 0, dest + 2 * line_size);
    ST_D1(d3, 0, dest + 3 * line_size);
    ST_D1(d4, 0, dest + 4 * line_size);
    ST_D1(d5, 0, dest + 5 * line_size);
    ST_D1(d6, 0, dest + 6 * line_size);
    ST_D1(d7, 0, dest + 7 * line_size);

    block[0] = 0;
}

void ff_vp3_v_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride,
                              int *bounding_values)
{
    int nstride = -stride;
    v4i32 e0, e1, f0, f1, g0, g1;
    v16i8 zero = {0};
    v16i8 d0, d1, d2, d3;
    v8i16 c0, c1, c2, c3;
    v8i16 r0;
    v8i16 cnst3h = {3, 3, 3, 3, 3, 3, 3, 3},
          cnst4h = {4, 4, 4, 4, 4, 4, 4, 4};
    v16i8 mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0};
    int16_t temp_16[8];
    int temp_32[8];

    /* Load the rows at -2, -1, 0 and +1 around the horizontal edge */
    LD_SB4(first_pixel + nstride * 2, stride, d0, d1, d2, d3);
    ILVR_B4_SH(zero, d0, zero, d1, zero, d2, zero, d3,
               c0, c1, c2, c3);
    /* filter index = (p[-2] - p[+1]) + 3 * (p[0] - p[-1]), biased and >> 3 */
    r0 = (c0 - c3) + (c2 - c1) * cnst3h;
    r0 += cnst4h;
    r0 = r0 >> 3;
    /* Get filter_value from bounding_values one by one */
    ST_SH(r0, temp_16);
    for (int i = 0; i < 8; i++)
        temp_32[i] = bounding_values[temp_16[i]];
    LD_SW2(temp_32, 4, e0, e1);
    ILVR_H2_SW(zero, c1, zero, c2, f0, g0);
    ILVL_H2_SW(zero, c1, zero, c2, f1, g1);
    f0 += e0;
    f1 += e1;
    g0 -= e0;
    g1 -= e1;
    CLIP_SW4_0_255(f0, f1, g0, g1);
    VSHF_B2_SB(f0, f1, g0, g1, mask, mask, d1, d2);

    /* Final move to first_pixel */
    ST_D1(d1, 0, first_pixel + nstride);
    ST_D1(d2, 0, first_pixel);
}

void ff_vp3_h_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride,
                              int *bounding_values)
{
    v16i8 d0, d1, d2, d3, d4, d5, d6, d7;
    v8i16 c0, c1, c2, c3, c4, c5, c6, c7;
    v8i16 r0;
    v4i32 e0, e1, f0, f1, g0, g1;
    v16i8 zero = {0};
    v8i16 cnst3h = {3, 3, 3, 3, 3, 3, 3, 3},
          cnst4h = {4, 4, 4, 4, 4, 4, 4, 4};
    v16i8 mask = {0, 16, 4, 20, 8, 24, 12, 28, 0, 0, 0, 0, 0, 0, 0, 0};
    int16_t temp_16[8];
    int temp_32[8];

    /* Load the columns at -2, -1, 0 and +1 via a transpose */
    LD_SB8(first_pixel - 2, stride, d0, d1, d2, d3, d4, d5, d6, d7);
    ILVR_B4_SH(zero, d0, zero, d1, zero, d2, zero, d3,
               c0, c1, c2, c3);
    ILVR_B4_SH(zero, d4, zero, d5, zero, d6, zero, d7,
               c4, c5, c6, c7);
    TRANSPOSE8x8_SH_SH(c0, c1, c2, c3, c4, c5, c6, c7,
                       c0, c1, c2, c3, c4, c5, c6, c7);
    r0 = (c0 - c3) + (c2 - c1) * cnst3h;
    r0 += cnst4h;
    r0 = r0 >> 3;

    /* Get filter_value from bounding_values one by one */
    ST_SH(r0, temp_16);
    for (int i = 0; i < 8; i++)
        temp_32[i] = bounding_values[temp_16[i]];
    LD_SW2(temp_32, 4, e0, e1);
    ILVR_H2_SW(zero, c1, zero, c2, f0, g0);
    ILVL_H2_SW(zero, c1, zero, c2, f1, g1);
    f0 += e0;
    f1 += e1;
    g0 -= e0;
    g1 -= e1;
    CLIP_SW4_0_255(f0, f1, g0, g1);
    VSHF_B2_SB(f0, g0, f1, g1, mask, mask, d1, d2);
    /* Final move to first_pixel */
    ST_H4(d1, 0, 1, 2, 3, first_pixel - 1, stride);
    ST_H4(d2, 0, 1, 2, 3, first_pixel - 1 + 4 * stride, stride);
}
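
/*
 * The vectorized path below relies on the byte-wise no-rounding average
 * identity also used by no_rnd_avg32() in libavcodec/rnd_avg.h:
 *     avg(a, b) = (a & b) + (((a ^ b) & 0xfefefefe) >> 1)
 * fmask clears the low bit of every byte before the shift so that no bit
 * leaks into the neighbouring byte lane.
 */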
void ff_put_no_rnd_pixels_l2_msa(uint8_t *dst, const uint8_t *src1,
                                 const uint8_t *src2, ptrdiff_t stride, int h)
{
    if (h == 8) {
        v16i8 d0, d1, d2, d3, d4, d5, d6, d7;
        v16i8 c0, c1, c2, c3;
        v4i32 a0, a1, a2, a3, b0, b1, b2, b3;
        v4i32 e0, e1, e2;
        v4i32 f0, f1, f2;
        v4u32 t0, t1, t2, t3;
        v16i8 mask = {0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23};
        int32_t value = 0xfefefefe;
        v4i32 fmask = {value, value, value, value};

        LD_SB8(src1, stride, d0, d1, d2, d3, d4, d5, d6, d7);
        VSHF_B2_SB(d0, d1, d2, d3, mask, mask, c0, c1);
        VSHF_B2_SB(d4, d5, d6, d7, mask, mask, c2, c3);
        a0 = (v4i32) __msa_pckev_d((v2i64)c1, (v2i64)c0);
        a2 = (v4i32) __msa_pckod_d((v2i64)c1, (v2i64)c0);
        a1 = (v4i32) __msa_pckev_d((v2i64)c3, (v2i64)c2);
        a3 = (v4i32) __msa_pckod_d((v2i64)c3, (v2i64)c2);

        LD_SB8(src2, stride, d0, d1, d2, d3, d4, d5, d6, d7);
        VSHF_B2_SB(d0, d1, d2, d3, mask, mask, c0, c1);
        VSHF_B2_SB(d4, d5, d6, d7, mask, mask, c2, c3);
        b0 = (v4i32) __msa_pckev_d((v2i64)c1, (v2i64)c0);
        b2 = (v4i32) __msa_pckod_d((v2i64)c1, (v2i64)c0);
        b1 = (v4i32) __msa_pckev_d((v2i64)c3, (v2i64)c2);
        b3 = (v4i32) __msa_pckod_d((v2i64)c3, (v2i64)c2);

        e0 = (v4i32) __msa_xor_v((v16u8)a0, (v16u8)b0);
        e0 = (v4i32) __msa_and_v((v16u8)e0, (v16u8)fmask);
        t0 = ((v4u32)e0) >> 1;
        e2 = (v4i32) __msa_and_v((v16u8)a0, (v16u8)b0);
        t0 = t0 + (v4u32)e2;

        e1 = (v4i32) __msa_xor_v((v16u8)a1, (v16u8)b1);
        e1 = (v4i32) __msa_and_v((v16u8)e1, (v16u8)fmask);
        t1 = ((v4u32)e1) >> 1;
        e2 = (v4i32) __msa_and_v((v16u8)a1, (v16u8)b1);
        t1 = t1 + (v4u32)e2;

        f0 = (v4i32) __msa_xor_v((v16u8)a2, (v16u8)b2);
        f0 = (v4i32) __msa_and_v((v16u8)f0, (v16u8)fmask);
        t2 = ((v4u32)f0) >> 1;
        f2 = (v4i32) __msa_and_v((v16u8)a2, (v16u8)b2);
        t2 = t2 + (v4u32)f2;

        f1 = (v4i32) __msa_xor_v((v16u8)a3, (v16u8)b3);
        f1 = (v4i32) __msa_and_v((v16u8)f1, (v16u8)fmask);
        t3 = ((v4u32)f1) >> 1;
        f2 = (v4i32) __msa_and_v((v16u8)a3, (v16u8)b3);
        t3 = t3 + (v4u32)f2;

        ST_W8(t0, t1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
        ST_W8(t2, t3, 0, 1, 2, 3, 0, 1, 2, 3, dst + 4, stride);
    } else {
        int i;

        for (i = 0; i < h; i++) {
            uint32_t a, b;

            a = AV_RN32(&src1[i * stride]);
            b = AV_RN32(&src2[i * stride]);
            AV_WN32A(&dst[i * stride], no_rnd_avg32(a, b));
            a = AV_RN32(&src1[i * stride + 4]);
            b = AV_RN32(&src2[i * stride + 4]);
            AV_WN32A(&dst[i * stride + 4], no_rnd_avg32(a, b));
        }
    }
}
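
/*
 * These functions are MSA versions of the VP3DSPContext hooks. They are
 * expected to be registered by the MIPS init code, roughly as sketched
 * below (member names assumed from VP3DSPContext; see the corresponding
 * vp3dsp init file under libavcodec/mips/):
 *
 *     c->idct_put             = ff_vp3_idct_put_msa;
 *     c->idct_add             = ff_vp3_idct_add_msa;
 *     c->idct_dc_add          = ff_vp3_idct_dc_add_msa;
 *     c->v_loop_filter        = ff_vp3_v_loop_filter_msa;
 *     c->h_loop_filter        = ff_vp3_h_loop_filter_msa;
 *     c->put_no_rnd_pixels_l2 = ff_put_no_rnd_pixels_l2_msa;
 */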