/*
 * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/vp8dsp.h"
#include "libavutil/mips/generic_macros_msa.h"
#include "vp8dsp_mips.h"

#define VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask)               \
{                                                                    \
    v16u8 p1_a_sub_q1, p0_a_sub_q0;                                  \
                                                                     \
    p0_a_sub_q0 = __msa_asub_u_b(p0, q0);                            \
    p1_a_sub_q1 = __msa_asub_u_b(p1, q1);                            \
    p1_a_sub_q1 = (v16u8) __msa_srli_b((v16i8) p1_a_sub_q1, 1);      \
    p0_a_sub_q0 = __msa_adds_u_b(p0_a_sub_q0, p0_a_sub_q0);          \
    mask = __msa_adds_u_b(p0_a_sub_q0, p1_a_sub_q1);                 \
    mask = ((v16u8) mask <= b_limit);                                \
}

#define VP8_LPF_FILTER4_4W(p1_in_out, p0_in_out, q0_in_out, q1_in_out,  \
                           mask_in, hev_in)                             \
{                                                                       \
    v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign;                 \
    v16i8 filt, filt1, filt2, cnst4b, cnst3b;                           \
    v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h;             \
                                                                        \
    p1_m = (v16i8) __msa_xori_b(p1_in_out, 0x80);                       \
    p0_m = (v16i8) __msa_xori_b(p0_in_out, 0x80);                       \
    q0_m = (v16i8) __msa_xori_b(q0_in_out, 0x80);                       \
    q1_m = (v16i8) __msa_xori_b(q1_in_out, 0x80);                       \
                                                                        \
    filt = __msa_subs_s_b(p1_m, q1_m);                                  \
                                                                        \
    filt = filt & (v16i8) hev_in;                                       \
                                                                        \
    q0_sub_p0 = q0_m - p0_m;                                            \
    filt_sign = __msa_clti_s_b(filt, 0);                                \
                                                                        \
    cnst3h = __msa_ldi_h(3);                                            \
    q0_sub_p0_r = (v8i16) __msa_ilvr_b(q0_sub_p0, q0_sub_p0);           \
    q0_sub_p0_r = __msa_dotp_s_h((v16i8) q0_sub_p0_r, (v16i8) cnst3h);  \
    filt_r = (v8i16) __msa_ilvr_b(filt_sign, filt);                     \
    filt_r += q0_sub_p0_r;                                              \
    filt_r = __msa_sat_s_h(filt_r, 7);                                  \
                                                                        \
    q0_sub_p0_l = (v8i16) __msa_ilvl_b(q0_sub_p0, q0_sub_p0);           \
    q0_sub_p0_l = __msa_dotp_s_h((v16i8) q0_sub_p0_l, (v16i8) cnst3h);  \
    filt_l = (v8i16) __msa_ilvl_b(filt_sign, filt);                     \
    filt_l += q0_sub_p0_l;                                              \
    filt_l = __msa_sat_s_h(filt_l, 7);                                  \
                                                                        \
    filt = __msa_pckev_b((v16i8) filt_l, (v16i8) filt_r);               \
    filt = filt & (v16i8) mask_in;                                      \
                                                                        \
    cnst4b = __msa_ldi_b(4);                                            \
    filt1 = __msa_adds_s_b(filt, cnst4b);                               \
    filt1 >>= 3;                                                        \
                                                                        \
    cnst3b = __msa_ldi_b(3);                                            \
    filt2 = __msa_adds_s_b(filt, cnst3b);                               \
    filt2 >>= 3;                                                        \
                                                                        \
    q0_m = __msa_subs_s_b(q0_m, filt1);                                 \
    q0_in_out = __msa_xori_b((v16u8) q0_m, 0x80);                       \
    p0_m = __msa_adds_s_b(p0_m, filt2);                                 \
    p0_in_out = __msa_xori_b((v16u8) p0_m, 0x80);                       \
                                                                        \
    filt = __msa_srari_b(filt1, 1);                                     \
    hev_in = __msa_xori_b((v16u8) hev_in, 0xff);                        \
    filt = filt & (v16i8) hev_in;                                       \
                                                                        \
    q1_m = __msa_subs_s_b(q1_m, filt);                                  \
    q1_in_out = __msa_xori_b((v16u8) q1_m, 0x80);                       \
    p1_m = __msa_adds_s_b(p1_m, filt);                                  \
    p1_in_out = __msa_xori_b((v16u8) p1_m, 0x80);                       \
}

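/* A rough scalar equivalent of VP8_LPF_FILTER4_4W above, acting on samples
 * re-biased to signed range by the 0x80 xor, with saturating arithmetic
 * throughout; mask selects the edges to filter, hev flags high edge
 * variance:
 *
 *     a  = clamp(p1 - q1) & hev;
 *     a  = clamp(a + 3 * (q0 - p0)) & mask;
 *     F1 = clamp(a + 4) >> 3;
 *     F2 = clamp(a + 3) >> 3;
 *     q0 = clamp(q0 - F1);
 *     p0 = clamp(p0 + F2);
 *     a  = ((F1 + 1) >> 1) & ~hev;     outer taps only where hev is not set
 *     q1 = clamp(q1 - a);
 *     p1 = clamp(p1 + a);
 */
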
#define VP8_SIMPLE_FILT(p1_in, p0_in, q0_in, q1_in, mask)            \
{                                                                    \
    v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, q0_sub_p0_sign;         \
    v16i8 filt, filt1, filt2, cnst4b, cnst3b, filt_sign;             \
    v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h;          \
                                                                     \
    p1_m = (v16i8) __msa_xori_b(p1_in, 0x80);                        \
    p0_m = (v16i8) __msa_xori_b(p0_in, 0x80);                        \
    q0_m = (v16i8) __msa_xori_b(q0_in, 0x80);                        \
    q1_m = (v16i8) __msa_xori_b(q1_in, 0x80);                        \
                                                                     \
    filt = __msa_subs_s_b(p1_m, q1_m);                               \
                                                                     \
    q0_sub_p0 = q0_m - p0_m;                                         \
    filt_sign = __msa_clti_s_b(filt, 0);                             \
                                                                     \
    cnst3h = __msa_ldi_h(3);                                         \
    q0_sub_p0_sign = __msa_clti_s_b(q0_sub_p0, 0);                   \
    q0_sub_p0_r = (v8i16) __msa_ilvr_b(q0_sub_p0_sign, q0_sub_p0);   \
    q0_sub_p0_r *= cnst3h;                                           \
    filt_r = (v8i16) __msa_ilvr_b(filt_sign, filt);                  \
    filt_r += q0_sub_p0_r;                                           \
    filt_r = __msa_sat_s_h(filt_r, 7);                               \
                                                                     \
    q0_sub_p0_l = (v8i16) __msa_ilvl_b(q0_sub_p0_sign, q0_sub_p0);   \
    q0_sub_p0_l *= cnst3h;                                           \
    filt_l = (v8i16) __msa_ilvl_b(filt_sign, filt);                  \
    filt_l += q0_sub_p0_l;                                           \
    filt_l = __msa_sat_s_h(filt_l, 7);                               \
                                                                     \
    filt = __msa_pckev_b((v16i8) filt_l, (v16i8) filt_r);            \
    filt = filt & (v16i8) (mask);                                    \
                                                                     \
    cnst4b = __msa_ldi_b(4);                                         \
    filt1 = __msa_adds_s_b(filt, cnst4b);                            \
    filt1 >>= 3;                                                     \
                                                                     \
    cnst3b = __msa_ldi_b(3);                                         \
    filt2 = __msa_adds_s_b(filt, cnst3b);                            \
    filt2 >>= 3;                                                     \
                                                                     \
    q0_m = __msa_subs_s_b(q0_m, filt1);                              \
    p0_m = __msa_adds_s_b(p0_m, filt2);                              \
    q0_in = __msa_xori_b((v16u8) q0_m, 0x80);                        \
    p0_in = __msa_xori_b((v16u8) p0_m, 0x80);                        \
}

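/* VP8_SIMPLE_MASK and VP8_SIMPLE_FILT above implement the "simple" loop
 * filter: an edge position is filtered when
 *
 *     |p0 - q0| * 2 + |p1 - q1| / 2 <= b_limit
 *
 * and, on the 0x80-biased (signed) samples, only p0/q0 are adjusted,
 * roughly:
 *
 *     a  = clamp(clamp(p1 - q1) + 3 * (q0 - p0)) & mask;
 *     q0 = clamp(q0 - (clamp(a + 4) >> 3));
 *     p0 = clamp(p0 + (clamp(a + 3) >> 3));
 */
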
#define VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev)              \
{                                                                    \
    v16i8 p2_m, p1_m, p0_m, q2_m, q1_m, q0_m;                        \
    v16i8 filt, q0_sub_p0, cnst4b, cnst3b;                           \
    v16i8 u, filt1, filt2, filt_sign, q0_sub_p0_sign;                \
    v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_r, u_r, u_l, filt_l;        \
    v8i16 cnst3h, cnst27h, cnst18h, cnst63h;                         \
                                                                     \
    cnst3h = __msa_ldi_h(3);                                         \
                                                                     \
    p2_m = (v16i8) __msa_xori_b(p2, 0x80);                           \
    p1_m = (v16i8) __msa_xori_b(p1, 0x80);                           \
    p0_m = (v16i8) __msa_xori_b(p0, 0x80);                           \
    q0_m = (v16i8) __msa_xori_b(q0, 0x80);                           \
    q1_m = (v16i8) __msa_xori_b(q1, 0x80);                           \
    q2_m = (v16i8) __msa_xori_b(q2, 0x80);                           \
                                                                     \
    filt = __msa_subs_s_b(p1_m, q1_m);                               \
    q0_sub_p0 = q0_m - p0_m;                                         \
    q0_sub_p0_sign = __msa_clti_s_b(q0_sub_p0, 0);                   \
    filt_sign = __msa_clti_s_b(filt, 0);                             \
                                                                     \
    /* right part */                                                 \
    q0_sub_p0_r = (v8i16) __msa_ilvr_b(q0_sub_p0_sign, q0_sub_p0);   \
    q0_sub_p0_r *= cnst3h;                                           \
    filt_r = (v8i16) __msa_ilvr_b(filt_sign, filt);                  \
    filt_r = filt_r + q0_sub_p0_r;                                   \
    filt_r = __msa_sat_s_h(filt_r, 7);                               \
                                                                     \
    /* left part */                                                  \
    q0_sub_p0_l = (v8i16) __msa_ilvl_b(q0_sub_p0_sign, q0_sub_p0);   \
    q0_sub_p0_l *= cnst3h;                                           \
    filt_l = (v8i16) __msa_ilvl_b(filt_sign, filt);                  \
    filt_l = filt_l + q0_sub_p0_l;                                   \
    filt_l = __msa_sat_s_h(filt_l, 7);                               \
                                                                     \
    /* combine left and right part */                                \
    filt = __msa_pckev_b((v16i8) filt_l, (v16i8) filt_r);            \
    filt = filt & (v16i8) mask;                                      \
    filt2 = filt & (v16i8) hev;                                      \
                                                                     \
    /* filt_val &= ~hev */                                           \
    hev = __msa_xori_b(hev, 0xff);                                   \
    filt = filt & (v16i8) hev;                                       \
    cnst4b = __msa_ldi_b(4);                                         \
    filt1 = __msa_adds_s_b(filt2, cnst4b);                           \
    filt1 >>= 3;                                                     \
    cnst3b = __msa_ldi_b(3);                                         \
    filt2 = __msa_adds_s_b(filt2, cnst3b);                           \
    filt2 >>= 3;                                                     \
    q0_m = __msa_subs_s_b(q0_m, filt1);                              \
    p0_m = __msa_adds_s_b(p0_m, filt2);                              \
                                                                     \
    filt_sign = __msa_clti_s_b(filt, 0);                             \
    ILVRL_B2_SH(filt_sign, filt, filt_r, filt_l);                    \
                                                                     \
    cnst27h = __msa_ldi_h(27);                                       \
    cnst63h = __msa_ldi_h(63);                                       \
                                                                     \
    /* right part */                                                 \
    u_r = filt_r * cnst27h;                                          \
    u_r += cnst63h;                                                  \
    u_r >>= 7;                                                       \
    u_r = __msa_sat_s_h(u_r, 7);                                     \
    /* left part */                                                  \
    u_l = filt_l * cnst27h;                                          \
    u_l += cnst63h;                                                  \
    u_l >>= 7;                                                       \
    u_l = __msa_sat_s_h(u_l, 7);                                     \
    /* combine left and right part */                                \
    u = __msa_pckev_b((v16i8) u_l, (v16i8) u_r);                     \
    q0_m = __msa_subs_s_b(q0_m, u);                                  \
    q0 = __msa_xori_b((v16u8) q0_m, 0x80);                           \
    p0_m = __msa_adds_s_b(p0_m, u);                                  \
    p0 = __msa_xori_b((v16u8) p0_m, 0x80);                           \
    cnst18h = __msa_ldi_h(18);                                       \
    u_r = filt_r * cnst18h;                                          \
    u_r += cnst63h;                                                  \
    u_r >>= 7;                                                       \
    u_r = __msa_sat_s_h(u_r, 7);                                     \
                                                                     \
    /* left part */                                                  \
    u_l = filt_l * cnst18h;                                          \
    u_l += cnst63h;                                                  \
    u_l >>= 7;                                                       \
    u_l = __msa_sat_s_h(u_l, 7);                                     \
    /* combine left and right part */                                \
    u = __msa_pckev_b((v16i8) u_l, (v16i8) u_r);                     \
    q1_m = __msa_subs_s_b(q1_m, u);                                  \
    q1 = __msa_xori_b((v16u8) q1_m, 0x80);                           \
    p1_m = __msa_adds_s_b(p1_m, u);                                  \
    p1 = __msa_xori_b((v16u8) p1_m, 0x80);                           \
    u_r = filt_r << 3;                                               \
    u_r += filt_r + cnst63h;                                         \
    u_r >>= 7;                                                       \
    u_r = __msa_sat_s_h(u_r, 7);                                     \
                                                                     \
    /* left part */                                                  \
    u_l = filt_l << 3;                                               \
    u_l += filt_l + cnst63h;                                         \
    u_l >>= 7;                                                       \
    u_l = __msa_sat_s_h(u_l, 7);                                     \
    /* combine left and right part */                                \
    u = __msa_pckev_b((v16i8) u_l, (v16i8) u_r);                     \
    q2_m = __msa_subs_s_b(q2_m, u);                                  \
    q2 = __msa_xori_b((v16u8) q2_m, 0x80);                           \
    p2_m = __msa_adds_s_b(p2_m, u);                                  \
    p2 = __msa_xori_b((v16u8) p2_m, 0x80);                           \
}

#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in,                     \
                     q0_in, q1_in, q2_in, q3_in,                     \
                     limit_in, b_limit_in, thresh_in,                \
                     hev_out, mask_out, flat_out)                    \
{                                                                    \
    v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m;    \
    v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m;    \
                                                                     \
    /* absolute subtraction of pixel values */                       \
    p3_asub_p2_m = __msa_asub_u_b((p3_in), (p2_in));                 \
    p2_asub_p1_m = __msa_asub_u_b((p2_in), (p1_in));                 \
    p1_asub_p0_m = __msa_asub_u_b((p1_in), (p0_in));                 \
    q1_asub_q0_m = __msa_asub_u_b((q1_in), (q0_in));                 \
    q2_asub_q1_m = __msa_asub_u_b((q2_in), (q1_in));                 \
    q3_asub_q2_m = __msa_asub_u_b((q3_in), (q2_in));                 \
    p0_asub_q0_m = __msa_asub_u_b((p0_in), (q0_in));                 \
    p1_asub_q1_m = __msa_asub_u_b((p1_in), (q1_in));                 \
    /* calculation of hev */                                         \
    flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m);            \
    hev_out = (thresh_in) < (v16u8) flat_out;                        \
    /* calculation of mask */                                        \
    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m);       \
    p1_asub_q1_m >>= 1;                                              \
    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m);       \
    mask_out = (b_limit_in) < p0_asub_q0_m;                          \
    mask_out = __msa_max_u_b(flat_out, mask_out);                    \
    p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m);        \
    mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out);                \
    q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m);        \
    mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out);                \
    mask_out = (limit_in) < (v16u8) mask_out;                        \
    mask_out = __msa_xori_b(mask_out, 0xff);                         \
}

#define VP8_ST6x1_UB(in0, in0_idx, in1, in1_idx, pdst, stride)       \
{                                                                    \
    uint16_t tmp0_h;                                                 \
    uint32_t tmp0_w;                                                 \
                                                                     \
    tmp0_w = __msa_copy_u_w((v4i32) in0, in0_idx);                   \
    tmp0_h = __msa_copy_u_h((v8i16) in1, in1_idx);                   \
    SW(tmp0_w, pdst);                                                \
    SH(tmp0_h, pdst + stride);                                       \
}

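/* LPF_MASK_HEV above builds the normal-filter decision masks, roughly:
 *
 *     hev  = max(|p1 - p0|, |q1 - q0|) > thresh
 *     mask = every neighbouring difference up to p3/q3 <= limit
 *            and |p0 - q0| * 2 + |p1 - q1| / 2 <= b_limit
 *
 * VP8_MBFILTER is the macroblock-edge filter: pixels flagged hev get the
 * usual (a + 4) >> 3 / (a + 3) >> 3 adjustment of q0/p0, while the
 * remaining masked pixels get (sketch, with saturation):
 *
 *     w   = clamp(clamp(p1 - q1) + 3 * (q0 - p0)) & mask & ~hev;
 *     q0 -= (27 * w + 63) >> 7;    p0 += (27 * w + 63) >> 7;
 *     q1 -= (18 * w + 63) >> 7;    p1 += (18 * w + 63) >> 7;
 *     q2 -= ( 9 * w + 63) >> 7;    p2 += ( 9 * w + 63) >> 7;
 *
 * where the 9 * w term is computed as (w << 3) + w in the vector code.
 */
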
void ff_vp8_v_loop_filter16_msa(uint8_t *src, ptrdiff_t pitch, int b_limit_in,
                                int limit_in, int thresh_in)
{
    uint8_t *temp_src;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 mask, hev, flat, thresh, limit, b_limit;

    b_limit = (v16u8) __msa_fill_b(b_limit_in);
    limit = (v16u8) __msa_fill_b(limit_in);
    thresh = (v16u8) __msa_fill_b(thresh_in);
    /* load vector elements */
    temp_src = src - (pitch << 2);
    LD_UB8(temp_src, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
    /* store vector elements */
    temp_src = src - 3 * pitch;
    ST_UB4(p2, p1, p0, q0, temp_src, pitch);
    temp_src += (4 * pitch);
    ST_UB2(q1, q2, temp_src, pitch);
}

void ff_vp8_v_loop_filter8uv_msa(uint8_t *src_u, uint8_t *src_v,
                                 ptrdiff_t pitch, int b_limit_in, int limit_in,
                                 int thresh_in)
{
    uint8_t *temp_src;
    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 mask, hev, flat, thresh, limit, b_limit;
    v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
    v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;

    b_limit = (v16u8) __msa_fill_b(b_limit_in);
    limit = (v16u8) __msa_fill_b(limit_in);
    thresh = (v16u8) __msa_fill_b(thresh_in);

    temp_src = src_u - (pitch << 2);
    LD_UB8(temp_src, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
    temp_src = src_v - (pitch << 2);
    LD_UB8(temp_src, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);

    /* right 8 elements of p3 are u pixels, left 8 elements are v pixels */
    ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
    ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);

    p2_d = __msa_copy_u_d((v2i64) p2, 0);
    p1_d = __msa_copy_u_d((v2i64) p1, 0);
    p0_d = __msa_copy_u_d((v2i64) p0, 0);
    q0_d = __msa_copy_u_d((v2i64) q0, 0);
    q1_d = __msa_copy_u_d((v2i64) q1, 0);
    q2_d = __msa_copy_u_d((v2i64) q2, 0);
    src_u -= (pitch * 3);
    SD4(p2_d, p1_d, p0_d, q0_d, src_u, pitch);
    src_u += 4 * pitch;
    SD(q1_d, src_u);
    src_u += pitch;
    SD(q2_d, src_u);

    p2_d = __msa_copy_u_d((v2i64) p2, 1);
    p1_d = __msa_copy_u_d((v2i64) p1, 1);
    p0_d = __msa_copy_u_d((v2i64) p0, 1);
    q0_d = __msa_copy_u_d((v2i64) q0, 1);
    q1_d = __msa_copy_u_d((v2i64) q1, 1);
    q2_d = __msa_copy_u_d((v2i64) q2, 1);
    src_v -= (pitch * 3);
    SD4(p2_d, p1_d, p0_d, q0_d, src_v, pitch);
    src_v += 4 * pitch;
    SD(q1_d, src_v);
    src_v += pitch;
    SD(q2_d, src_v);
}

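/* The ff_vp8_h_* variants below filter across a vertical edge: 16 rows are
 * loaded starting 4 bytes to the left of the edge, transposed so that each
 * of p3 .. q3 holds one column of 16 pixels, filtered with the same macros,
 * and written back a row at a time; for the macroblock-edge variants,
 * VP8_ST6x1_UB stores the six changed pixels p2 .. q2 of one row as a
 * 4-byte word plus a 2-byte halfword.
 */
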
void ff_vp8_h_loop_filter16_msa(uint8_t *src, ptrdiff_t pitch, int b_limit_in,
                                int limit_in, int thresh_in)
{
    uint8_t *temp_src;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 mask, hev, flat, thresh, limit, b_limit;
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
    v16u8 row9, row10, row11, row12, row13, row14, row15;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

    b_limit = (v16u8) __msa_fill_b(b_limit_in);
    limit = (v16u8) __msa_fill_b(limit_in);
    thresh = (v16u8) __msa_fill_b(thresh_in);
    temp_src = src - 4;
    LD_UB8(temp_src, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
    temp_src += (8 * pitch);
    LD_UB8(temp_src, pitch,
           row8, row9, row10, row11, row12, row13, row14, row15);
    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
                        row8, row9, row10, row11, row12, row13, row14, row15,
                        p3, p2, p1, p0, q0, q1, q2, q3);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
    ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
    ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4);
    ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
    ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7);
    ILVRL_B2_SH(q2, q1, tmp2, tmp5);

    temp_src = src - 3;
    VP8_ST6x1_UB(tmp3, 0, tmp2, 0, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp3, 1, tmp2, 1, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp3, 2, tmp2, 2, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp3, 3, tmp2, 3, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp4, 0, tmp2, 4, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp4, 1, tmp2, 5, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp4, 2, tmp2, 6, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp4, 3, tmp2, 7, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp6, 0, tmp5, 0, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp6, 1, tmp5, 1, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp6, 2, tmp5, 2, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp6, 3, tmp5, 3, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp7, 0, tmp5, 4, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp7, 1, tmp5, 5, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp7, 2, tmp5, 6, temp_src, 4);
    temp_src += pitch;
    VP8_ST6x1_UB(tmp7, 3, tmp5, 7, temp_src, 4);
}

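/* In the chroma (8uv) variants the two 8-pixel-wide planes share one set of
 * vectors: u pixels occupy the lower 8 bytes and v pixels the upper 8 bytes
 * of p3 .. q3, so both planes are filtered in a single pass. In the
 * horizontal case below this falls out of feeding rows 0-7 from the u plane
 * and rows 8-15 from the v plane into the 16x8 transpose.
 */
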
void ff_vp8_h_loop_filter8uv_msa(uint8_t *src_u, uint8_t *src_v,
                                 ptrdiff_t pitch, int b_limit_in, int limit_in,
                                 int thresh_in)
{
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 mask, hev, flat, thresh, limit, b_limit;
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
    v16u8 row9, row10, row11, row12, row13, row14, row15;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

    b_limit = (v16u8) __msa_fill_b(b_limit_in);
    limit = (v16u8) __msa_fill_b(limit_in);
    thresh = (v16u8) __msa_fill_b(thresh_in);

    LD_UB8(src_u - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
    LD_UB8(src_v - 4, pitch,
           row8, row9, row10, row11, row12, row13, row14, row15);
    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
                        row8, row9, row10, row11, row12, row13, row14, row15,
                        p3, p2, p1, p0, q0, q1, q2, q3);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);

    ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
    ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4);
    ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
    ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7);
    ILVRL_B2_SH(q2, q1, tmp2, tmp5);

    src_u -= 3;
    VP8_ST6x1_UB(tmp3, 0, tmp2, 0, src_u, 4);
    src_u += pitch;
    VP8_ST6x1_UB(tmp3, 1, tmp2, 1, src_u, 4);
    src_u += pitch;
    VP8_ST6x1_UB(tmp3, 2, tmp2, 2, src_u, 4);
    src_u += pitch;
    VP8_ST6x1_UB(tmp3, 3, tmp2, 3, src_u, 4);
    src_u += pitch;
    VP8_ST6x1_UB(tmp4, 0, tmp2, 4, src_u, 4);
    src_u += pitch;
    VP8_ST6x1_UB(tmp4, 1, tmp2, 5, src_u, 4);
    src_u += pitch;
    VP8_ST6x1_UB(tmp4, 2, tmp2, 6, src_u, 4);
    src_u += pitch;
    VP8_ST6x1_UB(tmp4, 3, tmp2, 7, src_u, 4);

    src_v -= 3;
    VP8_ST6x1_UB(tmp6, 0, tmp5, 0, src_v, 4);
    src_v += pitch;
    VP8_ST6x1_UB(tmp6, 1, tmp5, 1, src_v, 4);
    src_v += pitch;
    VP8_ST6x1_UB(tmp6, 2, tmp5, 2, src_v, 4);
    src_v += pitch;
    VP8_ST6x1_UB(tmp6, 3, tmp5, 3, src_v, 4);
    src_v += pitch;
    VP8_ST6x1_UB(tmp7, 0, tmp5, 4, src_v, 4);
    src_v += pitch;
    VP8_ST6x1_UB(tmp7, 1, tmp5, 5, src_v, 4);
    src_v += pitch;
    VP8_ST6x1_UB(tmp7, 2, tmp5, 6, src_v, 4);
    src_v += pitch;
    VP8_ST6x1_UB(tmp7, 3, tmp5, 7, src_v, 4);
}

void ff_vp8_v_loop_filter_simple_msa(uint8_t *src, ptrdiff_t pitch,
                                     int b_limit_ptr)
{
    v16u8 p1, p0, q1, q0;
    v16u8 mask, b_limit;

    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    /* load vector elements */
    LD_UB4(src - (pitch << 1), pitch, p1, p0, q0, q1);
    VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
    VP8_SIMPLE_FILT(p1, p0, q0, q1, mask);
    ST_UB2(p0, q0, (src - pitch), pitch);
}

void ff_vp8_h_loop_filter_simple_msa(uint8_t *src, ptrdiff_t pitch,
                                     int b_limit_ptr)
{
    uint8_t *temp_src;
    v16u8 p1, p0, q1, q0;
    v16u8 mask, b_limit;
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
    v16u8 row9, row10, row11, row12, row13, row14, row15;
    v8i16 tmp0, tmp1;

    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    temp_src = src - 2;
    LD_UB8(temp_src, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
    temp_src += (8 * pitch);
    LD_UB8(temp_src, pitch,
           row8, row9, row10, row11, row12, row13, row14, row15);
    TRANSPOSE16x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
                        row8, row9, row10, row11, row12, row13, row14, row15,
                        p1, p0, q0, q1);
    VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
    VP8_SIMPLE_FILT(p1, p0, q0, q1, mask);
    ILVRL_B2_SH(q0, p0, tmp1, tmp0);

    src -= 1;
    ST_H8(tmp1, 0, 1, 2, 3, 4, 5, 6, 7, src, pitch)
    ST_H8(tmp0, 0, 1, 2, 3, 4, 5, 6, 7, src + 8 * pitch, pitch)
}

void ff_vp8_v_loop_filter8uv_inner_msa(uint8_t *src_u, uint8_t *src_v,
                                       ptrdiff_t pitch, int b_limit_in,
                                       int limit_in, int thresh_in)
{
    uint64_t p1_d, p0_d, q0_d, q1_d;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 mask, hev, flat, thresh, limit, b_limit;
    v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
    v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;

    thresh = (v16u8) __msa_fill_b(thresh_in);
    limit = (v16u8) __msa_fill_b(limit_in);
    b_limit = (v16u8) __msa_fill_b(b_limit_in);

    src_u = src_u - (pitch << 2);
    LD_UB8(src_u, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
    src_u += (5 * pitch);
    src_v = src_v - (pitch << 2);
    LD_UB8(src_v, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);
    src_v += (5 * pitch);

    /* right 8 elements of p3 are u pixels and
       left 8 elements of p3 are v pixels */
    ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
    ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);

    p1_d = __msa_copy_u_d((v2i64) p1, 0);
    p0_d = __msa_copy_u_d((v2i64) p0, 0);
    q0_d = __msa_copy_u_d((v2i64) q0, 0);
    q1_d = __msa_copy_u_d((v2i64) q1, 0);
    SD4(q1_d, q0_d, p0_d, p1_d, src_u, (- pitch));

    p1_d = __msa_copy_u_d((v2i64) p1, 1);
    p0_d = __msa_copy_u_d((v2i64) p0, 1);
    q0_d = __msa_copy_u_d((v2i64) q0, 1);
    q1_d = __msa_copy_u_d((v2i64) q1, 1);
    SD4(q1_d, q0_d, p0_d, p1_d, src_v, (- pitch));
}

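/* The *_inner variants handle the edges between the 4x4 subblocks inside a
 * macroblock: they use the lighter VP8_LPF_FILTER4_4W instead of
 * VP8_MBFILTER, so only p1, p0, q0 and q1 change and only those four pixels
 * per edge position are written back.
 */
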
void ff_vp8_h_loop_filter8uv_inner_msa(uint8_t *src_u, uint8_t *src_v,
                                       ptrdiff_t pitch, int b_limit_in,
                                       int limit_in, int thresh_in)
{
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 mask, hev, flat, thresh, limit, b_limit;
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
    v16u8 row9, row10, row11, row12, row13, row14, row15;
    v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;

    thresh = (v16u8) __msa_fill_b(thresh_in);
    limit = (v16u8) __msa_fill_b(limit_in);
    b_limit = (v16u8) __msa_fill_b(b_limit_in);

    LD_UB8(src_u - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
    LD_UB8(src_v - 4, pitch,
           row8, row9, row10, row11, row12, row13, row14, row15);
    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
                        row8, row9, row10, row11, row12, row13, row14, row15,
                        p3, p2, p1, p0, q0, q1, q2, q3);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
    ILVR_B2_SW(p0, p1, q1, q0, tmp0, tmp1);
    ILVRL_H2_SW(tmp1, tmp0, tmp2, tmp3);
    tmp0 = (v4i32) __msa_ilvl_b((v16i8) p0, (v16i8) p1);
    tmp1 = (v4i32) __msa_ilvl_b((v16i8) q1, (v16i8) q0);
    ILVRL_H2_SW(tmp1, tmp0, tmp4, tmp5);

    ST_W8(tmp2, tmp3, 0, 1, 2, 3, 0, 1, 2, 3, src_u - 2, pitch);
    ST_W8(tmp4, tmp5, 0, 1, 2, 3, 0, 1, 2, 3, src_v - 2, pitch);
}

void ff_vp8_v_loop_filter16_inner_msa(uint8_t *src, ptrdiff_t pitch,
                                      int32_t e, int32_t i, int32_t h)
{
    v16u8 mask, hev, flat;
    v16u8 thresh, b_limit, limit;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;

    /* load vector elements */
    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
    thresh = (v16u8) __msa_fill_b(h);
    b_limit = (v16u8) __msa_fill_b(e);
    limit = (v16u8) __msa_fill_b(i);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);

    ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
}

void ff_vp8_h_loop_filter16_inner_msa(uint8_t *src, ptrdiff_t pitch,
                                      int32_t e, int32_t i, int32_t h)
{
    v16u8 mask, hev, flat;
    v16u8 thresh, b_limit, limit;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
    v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;

    LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
    LD_UB8(src - 4 + (8 * pitch), pitch,
           row8, row9, row10, row11, row12, row13, row14, row15);
    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
                        row8, row9, row10, row11, row12, row13, row14, row15,
                        p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(h);
    b_limit = (v16u8) __msa_fill_b(e);
    limit = (v16u8) __msa_fill_b(i);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
    ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
    ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
    ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
    ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);

    src -= 2;
    ST_W8(tmp2, tmp3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch)
    ST_W8(tmp4, tmp5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch)
}