/*
 * Copyright (c) 2015 - 2017 Shivraj Patil (Shivraj.Patil@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/vp9dsp.h"
#include "libavutil/mips/generic_macros_msa.h"
#include "vp9dsp_mips.h"

/* VP9 filter4: adjusts p1, p0, q0, q1 where mask_in is set;
 * the outer taps p1/q1 are only updated for non-hev pixels. */
#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in,  \
                           p1_out, p0_out, q0_out, q1_out)               \
{                                                                        \
    v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt, filt1, filt2;         \
    const v16i8 cnst4b = __msa_ldi_b(4);                                 \
    const v16i8 cnst3b = __msa_ldi_b(3);                                 \
                                                                         \
    p1_m = (v16i8) __msa_xori_b(p1_in, 0x80);                            \
    p0_m = (v16i8) __msa_xori_b(p0_in, 0x80);                            \
    q0_m = (v16i8) __msa_xori_b(q0_in, 0x80);                            \
    q1_m = (v16i8) __msa_xori_b(q1_in, 0x80);                            \
                                                                         \
    filt = __msa_subs_s_b(p1_m, q1_m);                                   \
                                                                         \
    filt = filt & (v16i8) hev_in;                                        \
                                                                         \
    q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m);                              \
    filt = __msa_adds_s_b(filt, q0_sub_p0);                              \
    filt = __msa_adds_s_b(filt, q0_sub_p0);                              \
    filt = __msa_adds_s_b(filt, q0_sub_p0);                              \
    filt = filt & (v16i8) mask_in;                                       \
                                                                         \
    filt1 = __msa_adds_s_b(filt, cnst4b);                                \
    filt1 >>= 3;                                                         \
                                                                         \
    filt2 = __msa_adds_s_b(filt, cnst3b);                                \
    filt2 >>= 3;                                                         \
                                                                         \
    q0_m = __msa_subs_s_b(q0_m, filt1);                                  \
    q0_out = __msa_xori_b((v16u8) q0_m, 0x80);                           \
    p0_m = __msa_adds_s_b(p0_m, filt2);                                  \
    p0_out = __msa_xori_b((v16u8) p0_m, 0x80);                           \
                                                                         \
    filt = __msa_srari_b(filt1, 1);                                      \
    hev_in = __msa_xori_b((v16u8) hev_in, 0xff);                         \
    filt = filt & (v16i8) hev_in;                                        \
                                                                         \
    q1_m = __msa_subs_s_b(q1_m, filt);                                   \
    q1_out = __msa_xori_b((v16u8) q1_m, 0x80);                           \
    p1_m = __msa_adds_s_b(p1_m, filt);                                   \
    p1_out = __msa_xori_b((v16u8) p1_m, 0x80);                           \
}

/* Flatness mask for the 8-tap filter: set where p2, p3, q2, q3 are all
 * within 1 of p0/q0; relies on the caller's 'mask' vector being in scope. */
#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out)  \
{                                                                      \
    v16u8 tmp, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0;     \
    v16u8 zero_in = { 0 };                                             \
                                                                       \
    tmp = __msa_ori_b(zero_in, 1);                                     \
    p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in);                        \
    q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in);                        \
    p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in);                        \
    q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in);                        \
                                                                       \
    p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0);             \
    flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out);                   \
    p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0);             \
    flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out);                   \
                                                                       \
    flat_out = (tmp < (v16u8) flat_out);                               \
    flat_out = __msa_xori_b(flat_out, 0xff);                           \
    flat_out = flat_out & (mask);                                      \
}

/* Wider flatness mask (flat2) for the 16-wide filter: set where p4..p7 and
 * q4..q7 are all within 1 of p0/q0, ANDed with flat_in. */
#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in,  \
                  q5_in, q6_in, q7_in, flat_in, flat2_out)          \
{                                                                   \
    v16u8 tmp, zero_in = { 0 };                                     \
    v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0;       \
    v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0;       \
                                                                    \
    tmp = __msa_ori_b(zero_in, 1);                                  \
    p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in);                     \
    q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in);                     \
    p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in);                     \
    q5_a_sub_q0 = __msa_asub_u_b(q5_in, q0_in);                     \
    p6_a_sub_p0 = __msa_asub_u_b(p6_in, p0_in);                     \
    q6_a_sub_q0 = __msa_asub_u_b(q6_in, q0_in);                     \
    p7_a_sub_p0 = __msa_asub_u_b(p7_in, p0_in);                     \
    q7_a_sub_q0 = __msa_asub_u_b(q7_in, q0_in);                     \
                                                                    \
    p4_a_sub_p0 = __msa_max_u_b(p4_a_sub_p0, q4_a_sub_q0);          \
    flat2_out = __msa_max_u_b(p5_a_sub_p0, q5_a_sub_q0);            \
    flat2_out = __msa_max_u_b(p4_a_sub_p0, flat2_out);              \
    p6_a_sub_p0 = __msa_max_u_b(p6_a_sub_p0, q6_a_sub_q0);          \
    flat2_out = __msa_max_u_b(p6_a_sub_p0, flat2_out);              \
    p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0);          \
    flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out);              \
                                                                    \
    flat2_out = (tmp < (v16u8) flat2_out);                          \
    flat2_out = __msa_xori_b(flat2_out, 0xff);                      \
    flat2_out = flat2_out & flat_in;                                \
}

/* filter8: rounded averages producing the p2..q2 outputs of the VP9 7-tap
 * smooth filter, computed on 16-bit lanes to avoid overflow. */
#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in,                  \
                    q0_in, q1_in, q2_in, q3_in,                  \
                    p2_filt8_out, p1_filt8_out, p0_filt8_out,    \
                    q0_filt8_out, q1_filt8_out, q2_filt8_out)    \
{                                                                \
    v8u16 tmp0, tmp1, tmp2;                                      \
                                                                 \
    tmp2 = p2_in + p1_in + p0_in;                                \
    tmp0 = p3_in << 1;                                           \
                                                                 \
    tmp0 = tmp0 + tmp2 + q0_in;                                  \
    tmp1 = tmp0 + p3_in + p2_in;                                 \
    p2_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);       \
                                                                 \
    tmp1 = tmp0 + p1_in + q1_in;                                 \
    p1_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);       \
                                                                 \
    tmp1 = q2_in + q1_in + q0_in;                                \
    tmp2 = tmp2 + tmp1;                                          \
    tmp0 = tmp2 + (p0_in);                                       \
    tmp0 = tmp0 + (p3_in);                                       \
    p0_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp0, 3);       \
                                                                 \
    tmp0 = q2_in + q3_in;                                        \
    tmp0 = p0_in + tmp1 + tmp0;                                  \
    tmp1 = q3_in + q3_in;                                        \
    tmp1 = tmp1 + tmp0;                                          \
    q2_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);       \
                                                                 \
    tmp0 = tmp2 + q3_in;                                         \
    tmp1 = tmp0 + q0_in;                                         \
    q0_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);       \
                                                                 \
    tmp1 = tmp0 - p2_in;                                         \
    tmp0 = q1_in + q3_in;                                        \
    tmp1 = tmp0 + tmp1;                                          \
    q1_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);       \
}

/* Build the per-pixel loop-filter mask (limit/b_limit checks) and the
 * high edge variance (hev) flag. */
#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in,                   \
                     q0_in, q1_in, q2_in, q3_in,                   \
                     limit_in, b_limit_in, thresh_in,              \
                     hev_out, mask_out, flat_out)                  \
{                                                                  \
    v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m;  \
    v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m;  \
                                                                   \
    /* absolute subtraction of pixel values */                     \
    p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in);                   \
    p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in);                   \
    p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in);                   \
    q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in);                   \
    q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in);                   \
    q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in);                   \
    p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in);                   \
    p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in);                   \
                                                                   \
    /* calculation of hev */                                       \
    flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m);          \
    hev_out = thresh_in < (v16u8) flat_out;                        \
                                                                   \
    /* calculation of mask */                                      \
    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m);     \
    p1_asub_q1_m >>= 1;                                            \
    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m);     \
                                                                   \
    mask_out = b_limit_in < p0_asub_q0_m;                          \
    mask_out = __msa_max_u_b(flat_out, mask_out);                  \
    p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m);      \
    mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out);              \
    q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m);      \
    mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out);              \
                                                                   \
    mask_out = limit_in < (v16u8) mask_out;                        \
    mask_out = __msa_xori_b(mask_out, 0xff);                       \
}

void ff_loop_filter_v_4_8_msa(uint8_t *src, ptrdiff_t pitch,
                              int32_t b_limit_ptr,
                              int32_t limit_ptr,
                              int32_t thresh_ptr)
{
    uint64_t p1_d, p0_d, q0_d, q1_d;
    v16u8 mask, hev, flat, thresh, b_limit, limit;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out;

    /* load vector elements */
    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(thresh_ptr);
    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    limit = (v16u8) __msa_fill_b(limit_ptr);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
    p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
    q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
    q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
    SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
}


void ff_loop_filter_v_44_16_msa(uint8_t *src, ptrdiff_t pitch,
                                int32_t b_limit_ptr,
                                int32_t limit_ptr,
                                int32_t thresh_ptr)
{
    v16u8 mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;

    /* load vector elements */
    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh0 = (v16u8) __msa_fill_b(thresh_ptr);
    thresh1 = (v16u8) __msa_fill_b(thresh_ptr >> 8);
    thresh0 = (v16u8) __msa_ilvr_d((v2i64) thresh1, (v2i64) thresh0);

    b_limit0 = (v16u8) __msa_fill_b(b_limit_ptr);
    b_limit1 = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
    b_limit0 = (v16u8) __msa_ilvr_d((v2i64) b_limit1, (v2i64) b_limit0);

    limit0 = (v16u8) __msa_fill_b(limit_ptr);
    limit1 = (v16u8) __msa_fill_b(limit_ptr >> 8);
    limit0 = (v16u8) __msa_ilvr_d((v2i64) limit1, (v2i64) limit0);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
                 hev, mask, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);

    ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
}

void ff_loop_filter_v_8_8_msa(uint8_t *src, ptrdiff_t pitch,
                              int32_t b_limit_ptr,
                              int32_t limit_ptr,
                              int32_t thresh_ptr)
{
    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
    v16u8 mask, hev, flat, thresh, b_limit, limit;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    v8i16 p2_filter8, p1_filter8, p0_filter8;
    v8i16 q0_filter8, q1_filter8, q2_filter8;
    v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
    v16i8 zero = { 0 };

    /* load vector elements */
    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(thresh_ptr);
    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    limit = (v16u8) __msa_fill_b(limit_ptr);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);

    /* if flat is zero for all pixels, then no need to calculate other filter */
    if (__msa_test_bz_v(flat)) {
        p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
        p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
        q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
        q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
        SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
    } else {
        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
                   q2_r, q3_r);
        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
                    p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);

        /* convert 16 bit output data into 8 bit */
        PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
                    zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
                    q0_filter8);
        PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);

        /* store pixel values */
        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filter8, flat);
        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filter8, flat);
        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filter8, flat);
        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filter8, flat);
        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filter8, flat);
        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filter8, flat);

        p2_d = __msa_copy_u_d((v2i64) p2_out, 0);
        p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
        p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
        q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
        q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
        q2_d = __msa_copy_u_d((v2i64) q2_out, 0);

        src -= 3 * pitch;

        SD4(p2_d, p1_d, p0_d, q0_d, src, pitch);
        src += (4 * pitch);
        SD(q1_d, src);
        src += pitch;
        SD(q2_d, src);
    }
}

void ff_loop_filter_v_88_16_msa(uint8_t *src, ptrdiff_t pitch,
                                int32_t b_limit_ptr,
                                int32_t limit_ptr,
                                int32_t thresh_ptr)
{
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
    v16u8 zero = { 0 };

    /* load vector elements */
    LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(thresh_ptr);
    tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
    thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);

    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
    b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);

    limit = (v16u8) __msa_fill_b(limit_ptr);
    tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
    limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);

    /* mask and hev */
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    /* if flat is zero for all pixels, then no need to calculate other filter */
    if (__msa_test_bz_v(flat)) {
        ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
    } else {
        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
                   q2_r, q3_r);
        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
                   p0_l);
        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
                   q3_l);
        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

        /* convert 16 bit output data into 8 bit */
        PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                    p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                    p0_filt8_r, q0_filt8_r);
        PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r,
                    q1_filt8_r, q2_filt8_r);

        /* store pixel values */
        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);

        src -= 3 * pitch;

        ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
        src += (4 * pitch);
        ST_UB2(q1_out, q2_out, src, pitch);
        src += (2 * pitch);
    }
}

void ff_loop_filter_v_84_16_msa(uint8_t *src, ptrdiff_t pitch,
                                int32_t b_limit_ptr,
                                int32_t limit_ptr,
                                int32_t thresh_ptr)
{
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
    v16u8 zero = { 0 };

    /* load vector elements */
    LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(thresh_ptr);
    tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
    thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);

    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
    b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);

    limit = (v16u8) __msa_fill_b(limit_ptr);
    tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
    limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);

    /* mask and hev */
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);

    /* if flat is zero for all pixels, then no need to calculate other filter */
    if (__msa_test_bz_v(flat)) {
        ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
    } else {
        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
                   q2_r, q3_r);
        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

        /* convert 16 bit output data into 8 bit */
        PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r,
                    p0_filt8_r, p0_filt8_r, q0_filt8_r, q0_filt8_r,
                    p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r);
        PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r,
                    q1_filt8_r, q2_filt8_r);

        /* store pixel values */
        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);

        src -= 3 * pitch;

        ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
        src += (4 * pitch);
        ST_UB2(q1_out, q2_out, src, pitch);
        src += (2 * pitch);
    }
}

void ff_loop_filter_v_48_16_msa(uint8_t *src, ptrdiff_t pitch,
                                int32_t b_limit_ptr,
                                int32_t limit_ptr,
                                int32_t thresh_ptr)
{
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
    v16u8 zero = { 0 };

    /* load vector elements */
    LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(thresh_ptr);
    tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
    thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);

    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
    b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);

    limit = (v16u8) __msa_fill_b(limit_ptr);
    tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
    limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);

    /* mask and hev */
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    flat = (v16u8) __msa_insve_d((v2i64) flat, 0, (v2i64) zero);

    /* if flat is zero for all pixels, then no need to calculate other filter */
    if (__msa_test_bz_v(flat)) {
        ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
    } else {
        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
                   p0_l);
        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
                   q3_l);
        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

        /* convert 16 bit output data into 8 bit */
        PCKEV_B4_SH(p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l,
                    p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l,
                    p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
        PCKEV_B2_SH(q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l,
                    q1_filt8_l, q2_filt8_l);

        /* store pixel values */
        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_l, flat);
        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_l, flat);
        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_l, flat);
        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_l, flat);
        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_l, flat);
        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_l, flat);

        src -= 3 * pitch;

        ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
        src += (4 * pitch);
        ST_UB2(q1_out, q2_out, src, pitch);
        src += (2 * pitch);
    }
}

static int32_t vp9_hz_lpf_t4_and_t8_16w(uint8_t *src, ptrdiff_t pitch,
                                        uint8_t *filter48,
                                        int32_t b_limit_ptr,
                                        int32_t limit_ptr,
                                        int32_t thresh_ptr)
{
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    v16u8 flat, mask, hev, thresh, b_limit, limit;
    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
    v16u8 zero = { 0 };

    /* load vector elements */
    LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(thresh_ptr);
    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    limit = (v16u8) __msa_fill_b(limit_ptr);

    /* mask and hev */
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    /* if flat is zero for all pixels, then no need to calculate other filter */
    if (__msa_test_bz_v(flat)) {
        ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);

        return 1;
    } else {
        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
                   q2_r, q3_r);
        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
                   p0_l);
        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
                   q3_l);
        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

        /* convert 16 bit output data into 8 bit */
        PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                    p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                    p0_filt8_r, q0_filt8_r);
        PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                    q2_filt8_r);

        /* store pixel values */
        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);

        ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
        filter48 += (4 * 16);
        ST_UB2(q1_out, q2_out, filter48, 16);
        filter48 += (2 * 16);
        ST_UB(flat, filter48);

        return 0;
    }
}

static void vp9_hz_lpf_t16_16w(uint8_t *src, ptrdiff_t pitch, uint8_t *filter48)
{
    v16u8 flat, flat2, filter8;
    v16i8 zero = { 0 };
    v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
    v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
    v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
    v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
    v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
    v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
    v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
    v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
    v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
    v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
    v8i16 l_out, r_out;

    flat = LD_UB(filter48 + 96);

    LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0);
    LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7);
    VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);

    /* if flat2 is zero for all pixels, then no need to calculate other filter */
    if (__msa_test_bz_v(flat2)) {
        LD_UB4(filter48, 16, p2, p1, p0, q0);
        LD_UB2(filter48 + 4 * 16, 16, q1, q2);

        src -= 3 * pitch;
        ST_UB4(p2, p1, p0, q0, src, pitch);
        src += (4 * pitch);
        ST_UB2(q1, q2, src, pitch);
    } else {
        src -= 7 * pitch;

        ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
                   zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
                   p3_r_in, p2_r_in, p1_r_in, p0_r_in);

        q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0);

        tmp0_r = p7_r_in << 3;
        tmp0_r -= p7_r_in;
        tmp0_r += p6_r_in;
        tmp0_r += q0_r_in;
        tmp1_r = p6_r_in + p5_r_in;
        tmp1_r += p4_r_in;
        tmp1_r += p3_r_in;
        tmp1_r += p2_r_in;
        tmp1_r += p1_r_in;
        tmp1_r += p0_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);

        ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
                   p5_l_in, p4_l_in);
        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
                   p1_l_in, p0_l_in);
        q0_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q0);

        tmp0_l = p7_l_in << 3;
        tmp0_l -= p7_l_in;
        tmp0_l += p6_l_in;
        tmp0_l += q0_l_in;
        tmp1_l = p6_l_in + p5_l_in;
        tmp1_l += p4_l_in;
        tmp1_l += p3_l_in;
        tmp1_l += p2_l_in;
        tmp1_l += p1_l_in;
        tmp1_l += p0_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);

        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
        ST_UB(p6, src);
        src += pitch;

        /* p5 */
        q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1);
        tmp0_r = p5_r_in - p6_r_in;
        tmp0_r += q1_r_in;
        tmp0_r -= p7_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);

        q1_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q1);
        tmp0_l = p5_l_in - p6_l_in;
        tmp0_l += q1_l_in;
        tmp0_l -= p7_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);

        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
        ST_UB(p5, src);
        src += pitch;

        /* p4 */
        q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
        tmp0_r = p4_r_in - p5_r_in;
        tmp0_r += q2_r_in;
        tmp0_r -= p7_r_in;
        tmp1_r += tmp0_r;
        r_out = (v8i16) __msa_srari_h((v8i16) tmp1_r, 4);

        q2_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q2);
        tmp0_l = p4_l_in - p5_l_in;
        tmp0_l += q2_l_in;
        tmp0_l -= p7_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);

        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
        ST_UB(p4, src);
        src += pitch;

        /* p3 */
        q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
        tmp0_r = p3_r_in - p4_r_in;
        tmp0_r += q3_r_in;
        tmp0_r -= p7_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);

        q3_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q3);
        tmp0_l = p3_l_in - p4_l_in;
        tmp0_l += q3_l_in;
        tmp0_l -= p7_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);

        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
        ST_UB(p3, src);
        src += pitch;

        /* p2 */
        q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
        filter8 = LD_UB(filter48);
        tmp0_r = p2_r_in - p3_r_in;
        tmp0_r += q4_r_in;
        tmp0_r -= p7_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);

        q4_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q4);
        tmp0_l = p2_l_in - p3_l_in;
        tmp0_l += q4_l_in;
        tmp0_l -= p7_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);

        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
        ST_UB(filter8, src);
        src += pitch;

        /* p1 */
        q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
        filter8 = LD_UB(filter48 + 16);
        tmp0_r = p1_r_in - p2_r_in;
        tmp0_r += q5_r_in;
        tmp0_r -= p7_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);

        q5_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q5);
        tmp0_l = p1_l_in - p2_l_in;
        tmp0_l += q5_l_in;
        tmp0_l -= p7_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);

        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
        ST_UB(filter8, src);
        src += pitch;

        /* p0 */
        q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
        filter8 = LD_UB(filter48 + 32);
        tmp0_r = p0_r_in - p1_r_in;
        tmp0_r += q6_r_in;
        tmp0_r -= p7_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);

        q6_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q6);
        tmp0_l = p0_l_in - p1_l_in;
        tmp0_l += q6_l_in;
        tmp0_l -= p7_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);

        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
        ST_UB(filter8, src);
        src += pitch;

        /* q0 */
        q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
        filter8 = LD_UB(filter48 + 48);
        tmp0_r = q7_r_in - p0_r_in;
        tmp0_r += q0_r_in;
        tmp0_r -= p7_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);

        q7_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q7);
        tmp0_l = q7_l_in - p0_l_in;
        tmp0_l += q0_l_in;
        tmp0_l -= p7_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);

        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
        ST_UB(filter8, src);
        src += pitch;

        /* q1 */
        filter8 = LD_UB(filter48 + 64);
        tmp0_r = q7_r_in - q0_r_in;
        tmp0_r += q1_r_in;
        tmp0_r -= p6_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);

        tmp0_l = q7_l_in - q0_l_in;
        tmp0_l += q1_l_in;
        tmp0_l -= p6_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);

        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
        ST_UB(filter8, src);
        src += pitch;

        /* q2 */
        filter8 = LD_UB(filter48 + 80);
        tmp0_r = q7_r_in - q1_r_in;
        tmp0_r += q2_r_in;
        tmp0_r -= p5_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);

        tmp0_l = q7_l_in - q1_l_in;
        tmp0_l += q2_l_in;
        tmp0_l -= p5_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);

        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
        ST_UB(filter8, src);
        src += pitch;

        /* q3 */
        tmp0_r = q7_r_in - q2_r_in;
        tmp0_r += q3_r_in;
        tmp0_r -= p4_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);

        tmp0_l = q7_l_in - q2_l_in;
        tmp0_l += q3_l_in;
        tmp0_l -= p4_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);

        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
        ST_UB(q3, src);
        src += pitch;

        /* q4 */
        tmp0_r = q7_r_in - q3_r_in;
        tmp0_r += q4_r_in;
        tmp0_r -= p3_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);

        tmp0_l = q7_l_in - q3_l_in;
        tmp0_l += q4_l_in;
        tmp0_l -= p3_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);

        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
        ST_UB(q4, src);
        src += pitch;

        /* q5 */
        tmp0_r = q7_r_in - q4_r_in;
        tmp0_r += q5_r_in;
        tmp0_r -= p2_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);

        tmp0_l = q7_l_in - q4_l_in;
        tmp0_l += q5_l_in;
        tmp0_l -= p2_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);

        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
        ST_UB(q5, src);
        src += pitch;

        /* q6 */
        tmp0_r = q7_r_in - q5_r_in;
        tmp0_r += q6_r_in;
        tmp0_r -= p1_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);

        tmp0_l = q7_l_in - q5_l_in;
        tmp0_l += q6_l_in;
        tmp0_l -= p1_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);

        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
        ST_UB(q6, src);
    }
}

void ff_loop_filter_v_16_16_msa(uint8_t *src, ptrdiff_t pitch,
                                int32_t b_limit_ptr,
                                int32_t limit_ptr,
                                int32_t thresh_ptr)
{
    uint8_t filter48[16 * 8] ALLOC_ALIGNED(ALIGNMENT);
    uint8_t early_exit = 0;

    early_exit = vp9_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0],
                                          b_limit_ptr, limit_ptr, thresh_ptr);

    if (0 == early_exit) {
        vp9_hz_lpf_t16_16w(src, pitch, filter48);
    }
}

void ff_loop_filter_v_16_8_msa(uint8_t *src, ptrdiff_t pitch,
                               int32_t b_limit_ptr,
                               int32_t limit_ptr,
                               int32_t thresh_ptr)
{
    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
    uint64_t dword0, dword1;
    v16u8 flat2, mask, hev, flat, thresh, b_limit, limit;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    v16u8 p0_filter16, p1_filter16;
    v8i16 p2_filter8, p1_filter8, p0_filter8;
    v8i16 q0_filter8, q1_filter8, q2_filter8;
    v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r;
    v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
    v16i8 zero = { 0 };
    v8u16 tmp0, tmp1, tmp2;

    /* load vector elements */
    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(thresh_ptr);
    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    limit = (v16u8) __msa_fill_b(limit_ptr);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);

    /* if flat is zero for all pixels, then no need to calculate other filter */
    if (__msa_test_bz_v(flat)) {
        p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
        p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
        q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
        q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
        SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch);
    } else {
        /* convert 8 bit input data into 16 bit */
        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero,
                   q1, zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r,
                   q1_r, q2_r, q3_r);
        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r,
                    p2_filter8, p1_filter8, p0_filter8, q0_filter8,
                    q1_filter8, q2_filter8);

        /* convert 16 bit output data into 8 bit */
        PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
                    zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
                    q0_filter8);
        PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8,
                    q2_filter8);

        /* store pixel values */
        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filter8, flat);
        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filter8, flat);
        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filter8, flat);
        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filter8, flat);
        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filter8, flat);
        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filter8, flat);

        /* load 16 vector elements */
        LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4);
        LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7);

        VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);

        /* if flat2 is zero for all pixels, then no need to calculate other filter */
        if (__msa_test_bz_v(flat2)) {
            p2_d = __msa_copy_u_d((v2i64) p2_out, 0);
            p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
            p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
            q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
            q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
            q2_d = __msa_copy_u_d((v2i64) q2_out, 0);

            SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch);
            SD(q1_d, src + pitch);
            SD(q2_d, src + 2 * pitch);
        } else {
            /* LSB(right) 8 pixel operation */
            ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4,
                       zero, q5, zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r,
                       q4_r, q5_r, q6_r, q7_r);

            tmp0 = p7_r << 3;
            tmp0 -= p7_r;
            tmp0 += p6_r;
            tmp0 += q0_r;

            src -= 7 * pitch;

            /* calculation of p6 and p5 */
            tmp1 = p6_r + p5_r + p4_r + p3_r;
            tmp1 += (p2_r + p1_r + p0_r);
            tmp1 += tmp0;
            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            tmp0 = p5_r - p6_r + q1_r - p7_r;
            tmp1 += tmp0;
            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
                        p0_filter16, p1_filter16);
            p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2);
            p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2);
            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
            SD(dword0, src);
            src += pitch;
            SD(dword1, src);
            src += pitch;

            /* calculation of p4 and p3 */
            tmp0 = p4_r - p5_r + q2_r - p7_r;
            tmp2 = p3_r - p4_r + q3_r - p7_r;
            tmp1 += tmp0;
            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            tmp1 += tmp2;
            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
                        p0_filter16, p1_filter16);
            p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2);
            p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2);
            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
            SD(dword0, src);
            src += pitch;
            SD(dword1, src);
            src += pitch;

            /* calculation of p2 and p1 */
            tmp0 = p2_r - p3_r + q4_r - p7_r;
            tmp2 = p1_r - p2_r + q5_r - p7_r;
            tmp1 += tmp0;
            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            tmp1 += tmp2;
            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
                        p0_filter16, p1_filter16);
            p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2);
            p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2);
            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
            SD(dword0, src);
            src += pitch;
            SD(dword1, src);
            src += pitch;

            /* calculation of p0 and q0 */
            tmp0 = (p0_r - p1_r) + (q6_r - p7_r);
            tmp2 = (q7_r - p0_r) + (q0_r - p7_r);
            tmp1 += tmp0;
            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            tmp1 += tmp2;
            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
                        p0_filter16, p1_filter16);
            p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2);
            p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2);
            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
            SD(dword0, src);
            src += pitch;
            SD(dword1, src);
            src += pitch;

            /* calculation of q1 and q2 */
            tmp0 = q7_r - q0_r + q1_r - p6_r;
            tmp2 = q7_r - q1_r + q2_r - p5_r;
            tmp1 += tmp0;
            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            tmp1 += tmp2;
            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
                        p0_filter16, p1_filter16);
            p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2);
            p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2);
            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
            SD(dword0, src);
            src += pitch;
            SD(dword1, src);
            src += pitch;

            /* calculation of q3 and q4 */
            tmp0 = (q7_r - q2_r) + (q3_r - p4_r);
            tmp2 = (q7_r - q3_r) + (q4_r - p3_r);
            tmp1 += tmp0;
            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            tmp1 += tmp2;
            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
                        p0_filter16, p1_filter16);
            p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2);
            p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2);
            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
            SD(dword0, src);
            src += pitch;
            SD(dword1, src);
            src += pitch;

            /* calculation of q5 and q6 */
            tmp0 = (q7_r - q4_r) + (q5_r - p2_r);
            tmp2 = (q7_r - q5_r) + (q6_r - p1_r);
            tmp1 += tmp0;
            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            tmp1 += tmp2;
            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
                        p0_filter16, p1_filter16);
            p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2);
            p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2);
            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
            SD(dword0, src);
            src += pitch;
            SD(dword1, src);
        }
    }
}

void ff_loop_filter_h_4_8_msa(uint8_t *src, ptrdiff_t pitch,
                              int32_t b_limit_ptr,
                              int32_t limit_ptr,
                              int32_t thresh_ptr)
{
    v16u8 mask, hev, flat, limit, thresh, b_limit;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v8i16 vec0, vec1, vec2, vec3;

    LD_UB8((src - 4), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(thresh_ptr);
    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    limit = (v16u8) __msa_fill_b(limit_ptr);

    TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3,
                       p3, p2, p1, p0, q0, q1, q2, q3);
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
    ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);

    src -= 2;
    ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
}

void ff_loop_filter_h_44_16_msa(uint8_t *src, ptrdiff_t pitch,
                                int32_t b_limit_ptr,
                                int32_t limit_ptr,
                                int32_t thresh_ptr)
{
    v16u8 mask, hev, flat;
    v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
    v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;

    LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
    LD_UB8(src - 4 + (8 * pitch), pitch,
           row8, row9, row10, row11, row12, row13, row14, row15);

    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
                        row8, row9, row10, row11, row12, row13, row14, row15,
                        p3, p2, p1, p0, q0, q1, q2, q3);

    thresh0 = (v16u8) __msa_fill_b(thresh_ptr);
    thresh1 = (v16u8) __msa_fill_b(thresh_ptr >> 8);
    thresh0 = (v16u8) __msa_ilvr_d((v2i64) thresh1, (v2i64) thresh0);

    b_limit0 = (v16u8) __msa_fill_b(b_limit_ptr);
    b_limit1 = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
    b_limit0 = (v16u8) __msa_ilvr_d((v2i64) b_limit1, (v2i64) b_limit0);

    limit0 = (v16u8) __msa_fill_b(limit_ptr);
    limit1 = (v16u8) __msa_fill_b(limit_ptr >> 8);
    limit0 = (v16u8) __msa_ilvr_d((v2i64) limit1, (v2i64) limit0);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
                 hev, mask, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
    ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
    ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
    ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
    ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);

    src -= 2;

    ST_W8(tmp2, tmp3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
    ST_W8(tmp4, tmp5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
}

void ff_loop_filter_h_8_8_msa(uint8_t *src, ptrdiff_t pitch,
                              int32_t b_limit_ptr,
                              int32_t limit_ptr,
                              int32_t thresh_ptr)
{
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 p1_out, p0_out, q0_out, q1_out;
    v16u8 flat, mask, hev, thresh, b_limit, limit;
    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
    v16u8 zero = { 0 };
    v8i16 vec0, vec1, vec2, vec3, vec4;

    /* load vector elements */
    LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3,
                       p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(thresh_ptr);
    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    limit = (v16u8) __msa_fill_b(limit_ptr);

    /* mask and hev */
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    /* flat4 */
    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    /* filter4 */
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);

    /* if flat is zero for all pixels, then no need to calculate other filter */
    if (__msa_test_bz_v(flat)) {
        /* Store 4 pixels p1 - q1 */
        ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
        ILVRL_H2_SH(vec1, vec0, vec2, vec3);

        src -= 2;
        ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
    } else {
        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
                   q3_r);
        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
        /* convert 16 bit output data into 8 bit */
        PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, p0_filt8_r,
                    p0_filt8_r, q0_filt8_r, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                    p0_filt8_r, q0_filt8_r);
        PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, q1_filt8_r,
                    q2_filt8_r);

        /* store pixel values */
        p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
        p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
        p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
        q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
        q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
        q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);

        /* Store 6 pixels p2 - q2 */
        ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
        vec4 = (v8i16) __msa_ilvr_b((v16i8) q2, (v16i8) q1);

        src -= 3;
        ST_W4(vec2, 0, 1, 2, 3, src, pitch);
        ST_H4(vec4, 0, 1, 2, 3, src + 4, pitch);
        src += (4 * pitch);
        ST_W4(vec3, 0, 1, 2, 3, src, pitch);
        ST_H4(vec4, 4, 5, 6, 7, src + 4, pitch);
    }
}

void ff_loop_filter_h_88_16_msa(uint8_t *src, ptrdiff_t pitch,
                                int32_t b_limit_ptr,
                                int32_t limit_ptr,
                                int32_t thresh_ptr)
{
    uint8_t *temp_src;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 p1_out, p0_out, q0_out, q1_out;
    v16u8 flat, mask, hev, thresh, b_limit, limit;
    v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
    v16u8 zero = { 0 };
    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;

    temp_src = src - 4;

    LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
    temp_src += (8 * pitch);
    LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);

    /* transpose 16x8 matrix into 8x16 */
    TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
                        q3, q2, q1, q0, row12, row13, row14, row15,
                        p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(thresh_ptr);
    vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
    thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);

    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
    b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);

    limit = (v16u8) __msa_fill_b(limit_ptr);
    vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
    limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);

    /* mask and hev */
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    /* flat4 */
    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    /* filter4 */
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    /* if flat is zero for all pixels, then no need to calculate other filter */
    if (__msa_test_bz_v(flat)) {
        ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
        ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
        ILVRL_H2_SH(vec1, vec0, vec4, vec5);

        src -= 2;
        ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
        ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
    } else {
        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
                   q3_r);
        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
                   p0_l);
        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
                   q3_l);

        /* filter8 */
        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

        /* convert 16 bit output data into 8 bit */
        PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                    p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                    p0_filt8_r, q0_filt8_r);
        PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                    q2_filt8_r);

        /* store pixel values */
        p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
        p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
        p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
        q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
        q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
        q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);

        ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
        ILVRL_H2_SH(vec1, vec0, vec3, vec4);
        ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
        ILVRL_H2_SH(vec1, vec0, vec6, vec7);
        ILVRL_B2_SH(q2, q1, vec2, vec5);

        src -= 3;
        ST_W4(vec3, 0, 1, 2, 3, src, pitch);
        ST_H4(vec2, 0, 1, 2, 3, src + 4, pitch);
        src += (4 * pitch);
        ST_W4(vec4, 0, 1, 2, 3, src, pitch);
        ST_H4(vec2, 4, 5, 6, 7, src + 4, pitch);
        src += (4 * pitch);
        ST_W4(vec6, 0, 1, 2, 3, src, pitch);
        ST_H4(vec5, 0, 1, 2, 3, src + 4, pitch);
        src += (4 * pitch);
        ST_W4(vec7, 0, 1, 2, 3, src, pitch);
        ST_H4(vec5, 4, 5, 6, 7, src + 4, pitch);
    }
}

void ff_loop_filter_h_84_16_msa(uint8_t *src, ptrdiff_t pitch,
                                int32_t b_limit_ptr,
                                int32_t limit_ptr,
                                int32_t thresh_ptr)
{
    uint8_t *temp_src;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 p1_out, p0_out, q0_out, q1_out;
    v16u8 flat, mask, hev, thresh, b_limit, limit;
    v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
    v16u8 zero = { 0 };
    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;

    temp_src = src - 4;

    LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
    temp_src += (8 * pitch);
    LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);

    /* transpose 16x8 matrix into 8x16 */
    TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
                        q3, q2, q1, q0, row12, row13, row14, row15,
                        p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(thresh_ptr);
    vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
    thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);

    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
    b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);

    limit = (v16u8) __msa_fill_b(limit_ptr);
    vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
    limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);

    /* mask and hev */
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    /* flat4 */
    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    /* filter4 */
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);

    /* if flat is zero for all pixels, then no need to calculate other filter */
    if (__msa_test_bz_v(flat)) {
        ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
        ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
        ILVRL_H2_SH(vec1, vec0, vec4, vec5);

        src -= 2;
        ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
        ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
    } else {
        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
                   q3_r);
        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

        /* convert 16 bit output data into 8 bit */
        PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r,
                    p0_filt8_r, p0_filt8_r, q0_filt8_r, q0_filt8_r,
                    p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r);
        PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r,
                    q1_filt8_r, q2_filt8_r);

        /* store pixel values */
        p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
        p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
        p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
        q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
        q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
        q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);

        ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
        ILVRL_H2_SH(vec1, vec0, vec3, vec4);
        ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
        ILVRL_H2_SH(vec1, vec0, vec6, vec7);
        ILVRL_B2_SH(q2, q1, vec2, vec5);

        src -= 3;
        ST_W4(vec3, 0, 1, 2, 3, src, pitch);
        ST_H4(vec2, 0, 1, 2, 3, src + 4, pitch);
        src += (4 * pitch);
        ST_W4(vec4, 0, 1, 2, 3, src, pitch);
        ST_H4(vec2, 4, 5, 6, 7, src + 4, pitch);
        src += (4 * pitch);
        ST_W4(vec6, 0, 1, 2, 3, src, pitch);
        ST_H4(vec5, 0, 1, 2, 3, src + 4, pitch);
        src += (4 * pitch);
        ST_W4(vec7, 0, 1, 2, 3, src, pitch);
        ST_H4(vec5, 4, 5, 6, 7, src + 4, pitch);
    }
}

void ff_loop_filter_h_48_16_msa(uint8_t *src, ptrdiff_t pitch,
                                int32_t b_limit_ptr,
                                int32_t limit_ptr,
                                int32_t thresh_ptr)
{
    uint8_t *temp_src;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 p1_out, p0_out, q0_out, q1_out;
    v16u8 flat, mask, hev, thresh, b_limit, limit;
    v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
    v16u8 zero = { 0 };
    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;

    temp_src = src - 4;

    LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
    temp_src += (8 * pitch);
    LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);

    /* transpose 16x8 matrix into 8x16 */
    TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
                        q3, q2, q1, q0, row12, row13, row14, row15,
                        p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(thresh_ptr);
    vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
    thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);

    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
    b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);

    limit = (v16u8) __msa_fill_b(limit_ptr);
    vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
    limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);

    /* mask and hev */
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    /* flat4 */
    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    /* filter4 */
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    flat = (v16u8) __msa_insve_d((v2i64) flat, 0, (v2i64) zero);

    /* if flat is zero for all pixels, then no need to calculate other filter */
    if (__msa_test_bz_v(flat)) {
        ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
        ILVRL_H2_SH(vec1, vec0, vec2, vec3);
        ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
        ILVRL_H2_SH(vec1, vec0, vec4, vec5);

        src -= 2;
        ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
        ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
    } else {
        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
                   p0_l);
        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
                   q3_l);

        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

        /* convert 16 bit output data into 8 bit */
        PCKEV_B4_SH(p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l,
                    p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l,
                    p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
        PCKEV_B2_SH(q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l,
                    q1_filt8_l, q2_filt8_l);

        /* store pixel values */
        p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_l, flat);
        p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_l, flat);
        p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_l, flat);
        q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_l, flat);
        q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_l, flat);
        q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_l, flat);

        ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
        ILVRL_H2_SH(vec1, vec0, vec3, vec4);
        ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
        ILVRL_H2_SH(vec1, vec0, vec6, vec7);
        ILVRL_B2_SH(q2, q1, vec2, vec5);
1654 1655 src -= 3; 1656 ST_W4(vec3, 0, 1, 2, 3, src, pitch); 1657 ST_H4(vec2, 0, 1, 2, 3, src + 4, pitch); 1658 src += (4 * pitch); 1659 ST_W4(vec4, 0, 1, 2, 3, src, pitch); 1660 ST_H4(vec2, 4, 5, 6, 7, src + 4, pitch); 1661 src += (4 * pitch); 1662 ST_W4(vec6, 0, 1, 2, 3, src, pitch); 1663 ST_H4(vec5, 0, 1, 2, 3, src + 4, pitch); 1664 src += (4 * pitch); 1665 ST_W4(vec7, 0, 1, 2, 3, src, pitch); 1666 ST_H4(vec5, 4, 5, 6, 7, src + 4, pitch); 1667 } 1668} 1669 1670static void vp9_transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch, 1671 uint8_t *output, int32_t out_pitch) 1672{ 1673 v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org; 1674 v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 1675 v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; 1676 v16i8 zeros = { 0 }; 1677 1678 LD_UB8(input, in_pitch, 1679 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org); 1680 /* 8x8 transpose */ 1681 TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, 1682 p0_org, p7, p6, p5, p4, p3, p2, p1, p0); 1683 /* 8x8 transpose */ 1684 ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org, 1685 tmp0, tmp1, tmp2, tmp3); 1686 ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6); 1687 ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7); 1688 ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4); 1689 ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6); 1690 SLDI_B4_UB(zeros, q0, zeros, q2, zeros, q4, zeros, q6, 8, q1, q3, q5, q7); 1691 1692 ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch); 1693 output += (8 * out_pitch); 1694 ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch); 1695} 1696 1697static void vp9_transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch, 1698 uint8_t *output, int32_t out_pitch) 1699{ 1700 v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o; 1701 v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; 1702 1703 LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0); 1704 LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7); 1705 TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, 1706 q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o); 1707 ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch); 1708} 1709 1710static void vp9_transpose_16x16(uint8_t *input, int32_t in_pitch, 1711 uint8_t *output, int32_t out_pitch) 1712{ 1713 v16u8 row0, row1, row2, row3, row4, row5, row6, row7; 1714 v16u8 row8, row9, row10, row11, row12, row13, row14, row15; 1715 v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7; 1716 v4i32 tmp2, tmp3; 1717 v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; 1718 1719 LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7); 1720 input += (8 * in_pitch); 1721 LD_UB8(input, in_pitch, 1722 row8, row9, row10, row11, row12, row13, row14, row15); 1723 1724 TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, 1725 row8, row9, row10, row11, row12, row13, row14, row15, 1726 p7, p6, p5, p4, p3, p2, p1, p0); 1727 1728 /* transpose 16x8 matrix into 8x16 */ 1729 /* total 8 intermediate register and 32 instructions */ 1730 q7 = (v16u8) __msa_ilvod_d((v2i64) row8, (v2i64) row0); 1731 q6 = (v16u8) __msa_ilvod_d((v2i64) row9, (v2i64) row1); 1732 q5 = (v16u8) __msa_ilvod_d((v2i64) row10, (v2i64) row2); 1733 q4 = (v16u8) __msa_ilvod_d((v2i64) row11, (v2i64) row3); 1734 q3 = (v16u8) __msa_ilvod_d((v2i64) row12, (v2i64) row4); 1735 q2 = (v16u8) __msa_ilvod_d((v2i64) row13, (v2i64) row5); 
1736 q1 = (v16u8) __msa_ilvod_d((v2i64) row14, (v2i64) row6); 1737 q0 = (v16u8) __msa_ilvod_d((v2i64) row15, (v2i64) row7); 1738 1739 ILVEV_B2_SH(q7, q6, q5, q4, tmp0, tmp1); 1740 tmp4 = (v8i16) __msa_ilvod_b((v16i8) q6, (v16i8) q7); 1741 tmp5 = (v8i16) __msa_ilvod_b((v16i8) q4, (v16i8) q5); 1742 1743 ILVEV_B2_UB(q3, q2, q1, q0, q5, q7); 1744 tmp6 = (v8i16) __msa_ilvod_b((v16i8) q2, (v16i8) q3); 1745 tmp7 = (v8i16) __msa_ilvod_b((v16i8) q0, (v16i8) q1); 1746 1747 ILVEV_H2_SW(tmp0, tmp1, q5, q7, tmp2, tmp3); 1748 q0 = (v16u8) __msa_ilvev_w(tmp3, tmp2); 1749 q4 = (v16u8) __msa_ilvod_w(tmp3, tmp2); 1750 1751 tmp2 = (v4i32) __msa_ilvod_h(tmp1, tmp0); 1752 tmp3 = (v4i32) __msa_ilvod_h((v8i16) q7, (v8i16) q5); 1753 q2 = (v16u8) __msa_ilvev_w(tmp3, tmp2); 1754 q6 = (v16u8) __msa_ilvod_w(tmp3, tmp2); 1755 1756 ILVEV_H2_SW(tmp4, tmp5, tmp6, tmp7, tmp2, tmp3); 1757 q1 = (v16u8) __msa_ilvev_w(tmp3, tmp2); 1758 q5 = (v16u8) __msa_ilvod_w(tmp3, tmp2); 1759 1760 tmp2 = (v4i32) __msa_ilvod_h(tmp5, tmp4); 1761 tmp3 = (v4i32) __msa_ilvod_h(tmp7, tmp6); 1762 q3 = (v16u8) __msa_ilvev_w(tmp3, tmp2); 1763 q7 = (v16u8) __msa_ilvod_w(tmp3, tmp2); 1764 1765 ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch); 1766 output += (8 * out_pitch); 1767 ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch); 1768} 1769 1770static int32_t vp9_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48, 1771 uint8_t *src_org, int32_t pitch_org, 1772 int32_t b_limit_ptr, 1773 int32_t limit_ptr, 1774 int32_t thresh_ptr) 1775{ 1776 v16u8 p3, p2, p1, p0, q3, q2, q1, q0; 1777 v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; 1778 v16u8 flat, mask, hev, thresh, b_limit, limit; 1779 v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; 1780 v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r; 1781 v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r; 1782 v16i8 zero = { 0 }; 1783 v8i16 vec0, vec1, vec2, vec3; 1784 1785 /* load vector elements */ 1786 LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3); 1787 1788 thresh = (v16u8) __msa_fill_b(thresh_ptr); 1789 b_limit = (v16u8) __msa_fill_b(b_limit_ptr); 1790 limit = (v16u8) __msa_fill_b(limit_ptr); 1791 1792 /* mask and hev */ 1793 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, 1794 hev, mask, flat); 1795 /* flat4 */ 1796 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); 1797 /* filter4 */ 1798 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, 1799 q1_out); 1800 1801 flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat); 1802 1803 /* if flat is zero for all pixels, then no need to calculate other filter */ 1804 if (__msa_test_bz_v(flat)) { 1805 ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); 1806 ILVRL_H2_SH(vec1, vec0, vec2, vec3); 1807 ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, (src_org - 2), pitch_org); 1808 return 1; 1809 } else { 1810 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, 1811 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, 1812 q3_r); 1813 VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, 1814 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); 1815 1816 /* convert 16 bit output data into 8 bit */ 1817 p2_r = (v8u16) __msa_pckev_b((v16i8) p2_filt8_r, (v16i8) p2_filt8_r); 1818 p1_r = (v8u16) __msa_pckev_b((v16i8) p1_filt8_r, (v16i8) p1_filt8_r); 1819 p0_r = (v8u16) __msa_pckev_b((v16i8) p0_filt8_r, (v16i8) p0_filt8_r); 1820 q0_r = (v8u16) __msa_pckev_b((v16i8) q0_filt8_r, (v16i8) q0_filt8_r); 1821 q1_r = (v8u16) __msa_pckev_b((v16i8) q1_filt8_r, (v16i8) q1_filt8_r); 
1822 q2_r = (v8u16) __msa_pckev_b((v16i8) q2_filt8_r, (v16i8) q2_filt8_r); 1823 1824 /* store pixel values */ 1825 p2_out = __msa_bmnz_v(p2, (v16u8) p2_r, flat); 1826 p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_r, flat); 1827 p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_r, flat); 1828 q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_r, flat); 1829 q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_r, flat); 1830 q2_out = __msa_bmnz_v(q2, (v16u8) q2_r, flat); 1831 1832 ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16); 1833 filter48 += (4 * 16); 1834 ST_UB2(q1_out, q2_out, filter48, 16); 1835 filter48 += (2 * 16); 1836 ST_UB(flat, filter48); 1837 1838 return 0; 1839 } 1840} 1841 1842static int32_t vp9_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch, 1843 uint8_t *filter48) 1844{ 1845 v16i8 zero = { 0 }; 1846 v16u8 filter8, flat, flat2; 1847 v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; 1848 v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in; 1849 v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in; 1850 v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in; 1851 v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in; 1852 v8u16 tmp0_r, tmp1_r; 1853 v8i16 r_out; 1854 1855 flat = LD_UB(filter48 + 6 * 16); 1856 1857 LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0); 1858 LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7); 1859 1860 VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); 1861 1862 /* if flat2 is zero for all pixels, then no need to calculate other filter */ 1863 if (__msa_test_bz_v(flat2)) { 1864 v8i16 vec0, vec1, vec2, vec3, vec4; 1865 1866 LD_UB4(filter48, 16, p2, p1, p0, q0); 1867 LD_UB2(filter48 + 4 * 16, 16, q1, q2); 1868 1869 ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); 1870 ILVRL_H2_SH(vec1, vec0, vec3, vec4); 1871 vec2 = (v8i16) __msa_ilvr_b((v16i8) q2, (v16i8) q1); 1872 1873 src_org -= 3; 1874 ST_W4(vec3, 0, 1, 2, 3, src_org, pitch); 1875 ST_H4(vec2, 0, 1, 2, 3, (src_org + 4), pitch); 1876 src_org += (4 * pitch); 1877 ST_W4(vec4, 0, 1, 2, 3, src_org, pitch); 1878 ST_H4(vec2, 4, 5, 6, 7, (src_org + 4), pitch); 1879 1880 return 1; 1881 } else { 1882 src -= 7 * 16; 1883 1884 ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, 1885 zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, 1886 p3_r_in, p2_r_in, p1_r_in, p0_r_in); 1887 q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0); 1888 1889 tmp0_r = p7_r_in << 3; 1890 tmp0_r -= p7_r_in; 1891 tmp0_r += p6_r_in; 1892 tmp0_r += q0_r_in; 1893 tmp1_r = p6_r_in + p5_r_in; 1894 tmp1_r += p4_r_in; 1895 tmp1_r += p3_r_in; 1896 tmp1_r += p2_r_in; 1897 tmp1_r += p1_r_in; 1898 tmp1_r += p0_r_in; 1899 tmp1_r += tmp0_r; 1900 1901 r_out = __msa_srari_h((v8i16) tmp1_r, 4); 1902 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out); 1903 p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2); 1904 ST_D1(p6, 0, src); 1905 src += 16; 1906 1907 /* p5 */ 1908 q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1); 1909 tmp0_r = p5_r_in - p6_r_in; 1910 tmp0_r += q1_r_in; 1911 tmp0_r -= p7_r_in; 1912 tmp1_r += tmp0_r; 1913 r_out = __msa_srari_h((v8i16) tmp1_r, 4); 1914 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out); 1915 p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2); 1916 ST_D1(p5, 0, src); 1917 src += 16; 1918 1919 /* p4 */ 1920 q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2); 1921 tmp0_r = p4_r_in - p5_r_in; 1922 tmp0_r += q2_r_in; 1923 tmp0_r -= p7_r_in; 1924 tmp1_r += tmp0_r; 1925 r_out = __msa_srari_h((v8i16) tmp1_r, 4); 1926 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out); 1927 p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2); 1928 
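        /* p4, like the other outputs in this 8-column path, is valid only in the
         * lower 8 bytes (only the _r halves were computed), hence the 8-byte
         * ST_D1 stores into the 16-byte-pitch transposed buffer. */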
ST_D1(p4, 0, src); 1929 src += 16; 1930 1931 /* p3 */ 1932 q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3); 1933 tmp0_r = p3_r_in - p4_r_in; 1934 tmp0_r += q3_r_in; 1935 tmp0_r -= p7_r_in; 1936 tmp1_r += tmp0_r; 1937 r_out = __msa_srari_h((v8i16) tmp1_r, 4); 1938 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out); 1939 p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2); 1940 ST_D1(p3, 0, src); 1941 src += 16; 1942 1943 /* p2 */ 1944 q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4); 1945 filter8 = LD_UB(filter48); 1946 tmp0_r = p2_r_in - p3_r_in; 1947 tmp0_r += q4_r_in; 1948 tmp0_r -= p7_r_in; 1949 tmp1_r += tmp0_r; 1950 r_out = __msa_srari_h((v8i16) tmp1_r, 4); 1951 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out); 1952 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); 1953 ST_D1(filter8, 0, src); 1954 src += 16; 1955 1956 /* p1 */ 1957 q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5); 1958 filter8 = LD_UB(filter48 + 16); 1959 tmp0_r = p1_r_in - p2_r_in; 1960 tmp0_r += q5_r_in; 1961 tmp0_r -= p7_r_in; 1962 tmp1_r += tmp0_r; 1963 r_out = __msa_srari_h((v8i16) tmp1_r, 4); 1964 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out); 1965 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); 1966 ST_D1(filter8, 0, src); 1967 src += 16; 1968 1969 /* p0 */ 1970 q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6); 1971 filter8 = LD_UB(filter48 + 32); 1972 tmp0_r = p0_r_in - p1_r_in; 1973 tmp0_r += q6_r_in; 1974 tmp0_r -= p7_r_in; 1975 tmp1_r += tmp0_r; 1976 r_out = __msa_srari_h((v8i16) tmp1_r, 4); 1977 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out); 1978 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); 1979 ST_D1(filter8, 0, src); 1980 src += 16; 1981 1982 /* q0 */ 1983 q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7); 1984 filter8 = LD_UB(filter48 + 48); 1985 tmp0_r = q7_r_in - p0_r_in; 1986 tmp0_r += q0_r_in; 1987 tmp0_r -= p7_r_in; 1988 tmp1_r += tmp0_r; 1989 r_out = __msa_srari_h((v8i16) tmp1_r, 4); 1990 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out); 1991 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); 1992 ST_D1(filter8, 0, src); 1993 src += 16; 1994 1995 /* q1 */ 1996 filter8 = LD_UB(filter48 + 64); 1997 tmp0_r = q7_r_in - q0_r_in; 1998 tmp0_r += q1_r_in; 1999 tmp0_r -= p6_r_in; 2000 tmp1_r += tmp0_r; 2001 r_out = __msa_srari_h((v8i16) tmp1_r, 4); 2002 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out); 2003 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); 2004 ST_D1(filter8, 0, src); 2005 src += 16; 2006 2007 /* q2 */ 2008 filter8 = LD_UB(filter48 + 80); 2009 tmp0_r = q7_r_in - q1_r_in; 2010 tmp0_r += q2_r_in; 2011 tmp0_r -= p5_r_in; 2012 tmp1_r += tmp0_r; 2013 r_out = __msa_srari_h((v8i16) tmp1_r, 4); 2014 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out); 2015 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); 2016 ST_D1(filter8, 0, src); 2017 src += 16; 2018 2019 /* q3 */ 2020 tmp0_r = q7_r_in - q2_r_in; 2021 tmp0_r += q3_r_in; 2022 tmp0_r -= p4_r_in; 2023 tmp1_r += tmp0_r; 2024 r_out = __msa_srari_h((v8i16) tmp1_r, 4); 2025 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out); 2026 q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2); 2027 ST_D1(q3, 0, src); 2028 src += 16; 2029 2030 /* q4 */ 2031 tmp0_r = q7_r_in - q3_r_in; 2032 tmp0_r += q4_r_in; 2033 tmp0_r -= p3_r_in; 2034 tmp1_r += tmp0_r; 2035 r_out = __msa_srari_h((v8i16) tmp1_r, 4); 2036 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out); 2037 q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2); 2038 ST_D1(q4, 0, src); 2039 src += 16; 
2040 2041 /* q5 */ 2042 tmp0_r = q7_r_in - q4_r_in; 2043 tmp0_r += q5_r_in; 2044 tmp0_r -= p2_r_in; 2045 tmp1_r += tmp0_r; 2046 r_out = __msa_srari_h((v8i16) tmp1_r, 4); 2047 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out); 2048 q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2); 2049 ST_D1(q5, 0, src); 2050 src += 16; 2051 2052 /* q6 */ 2053 tmp0_r = q7_r_in - q5_r_in; 2054 tmp0_r += q6_r_in; 2055 tmp0_r -= p1_r_in; 2056 tmp1_r += tmp0_r; 2057 r_out = __msa_srari_h((v8i16) tmp1_r, 4); 2058 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out); 2059 q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2); 2060 ST_D1(q6, 0, src); 2061 2062 return 0; 2063 } 2064} 2065 2066void ff_loop_filter_h_16_8_msa(uint8_t *src, ptrdiff_t pitch, 2067 int32_t b_limit_ptr, 2068 int32_t limit_ptr, 2069 int32_t thresh_ptr) 2070{ 2071 uint8_t early_exit = 0; 2072 uint8_t transposed_input[16 * 24] ALLOC_ALIGNED(ALIGNMENT); 2073 uint8_t *filter48 = &transposed_input[16 * 16]; 2074 2075 vp9_transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16); 2076 2077 early_exit = vp9_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), 2078 &filter48[0], src, pitch, 2079 b_limit_ptr, limit_ptr, thresh_ptr); 2080 2081 if (0 == early_exit) { 2082 early_exit = vp9_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch, 2083 &filter48[0]); 2084 2085 if (0 == early_exit) { 2086 vp9_transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch); 2087 } 2088 } 2089} 2090 2091static int32_t vp9_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48, 2092 uint8_t *src_org, ptrdiff_t pitch, 2093 int32_t b_limit_ptr, 2094 int32_t limit_ptr, 2095 int32_t thresh_ptr) 2096{ 2097 v16u8 p3, p2, p1, p0, q3, q2, q1, q0; 2098 v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; 2099 v16u8 flat, mask, hev, thresh, b_limit, limit; 2100 v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; 2101 v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; 2102 v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r; 2103 v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r; 2104 v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l; 2105 v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l; 2106 v16i8 zero = { 0 }; 2107 v8i16 vec0, vec1, vec2, vec3, vec4, vec5; 2108 2109 /* load vector elements */ 2110 LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3); 2111 2112 thresh = (v16u8) __msa_fill_b(thresh_ptr); 2113 b_limit = (v16u8) __msa_fill_b(b_limit_ptr); 2114 limit = (v16u8) __msa_fill_b(limit_ptr); 2115 2116 /* mask and hev */ 2117 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, 2118 hev, mask, flat); 2119 /* flat4 */ 2120 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); 2121 /* filter4 */ 2122 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, 2123 q1_out); 2124 2125 /* if flat is zero for all pixels, then no need to calculate other filter */ 2126 if (__msa_test_bz_v(flat)) { 2127 ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); 2128 ILVRL_H2_SH(vec1, vec0, vec2, vec3); 2129 ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); 2130 ILVRL_H2_SH(vec1, vec0, vec4, vec5); 2131 2132 src_org -= 2; 2133 ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src_org, pitch); 2134 ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src_org + 8 * pitch, pitch); 2135 2136 return 1; 2137 } else { 2138 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, 2139 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, 2140 q3_r); 2141 VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, 2142 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); 
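        /* The right-half results above cover the first eight of the sixteen
         * edge-crossing lines; widen the left (upper) halves and run VP9_FILTER8
         * again for the remaining eight. */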
2143 ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, 2144 p0_l); 2145 ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, 2146 q3_l); 2147 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, 2148 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); 2149 2150 /* convert 16 bit output data into 8 bit */ 2151 PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l, 2152 p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r, 2153 p0_filt8_r, q0_filt8_r); 2154 PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r, 2155 q2_filt8_r); 2156 2157 /* store pixel values */ 2158 p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat); 2159 p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat); 2160 p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat); 2161 q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat); 2162 q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat); 2163 q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat); 2164 2165 ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16); 2166 filter48 += (4 * 16); 2167 ST_UB2(q1_out, q2_out, filter48, 16); 2168 filter48 += (2 * 16); 2169 ST_UB(flat, filter48); 2170 2171 return 0; 2172 } 2173} 2174 2175static int32_t vp9_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch, 2176 uint8_t *filter48) 2177{ 2178 v16u8 flat, flat2, filter8; 2179 v16i8 zero = { 0 }; 2180 v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; 2181 v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in; 2182 v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in; 2183 v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in; 2184 v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in; 2185 v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in; 2186 v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in; 2187 v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in; 2188 v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in; 2189 v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l; 2190 v8i16 l_out, r_out; 2191 2192 flat = LD_UB(filter48 + 6 * 16); 2193 2194 LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0); 2195 LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7); 2196 2197 VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); 2198 2199 /* if flat2 is zero for all pixels, then no need to calculate other filter */ 2200 if (__msa_test_bz_v(flat2)) { 2201 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 2202 2203 LD_UB4(filter48, 16, p2, p1, p0, q0); 2204 LD_UB2(filter48 + 4 * 16, 16, q1, q2); 2205 2206 ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); 2207 ILVRL_H2_SH(vec1, vec0, vec3, vec4); 2208 ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1); 2209 ILVRL_H2_SH(vec1, vec0, vec6, vec7); 2210 ILVRL_B2_SH(q2, q1, vec2, vec5); 2211 2212 src_org -= 3; 2213 ST_W4(vec3, 0, 1, 2, 3, src_org, pitch); 2214 ST_H4(vec2, 0, 1, 2, 3, (src_org + 4), pitch); 2215 src_org += (4 * pitch); 2216 ST_W4(vec4, 0, 1, 2, 3, src_org, pitch); 2217 ST_H4(vec2, 4, 5, 6, 7, (src_org + 4), pitch); 2218 src_org += (4 * pitch); 2219 ST_W4(vec6, 0, 1, 2, 3, src_org, pitch); 2220 ST_H4(vec5, 0, 1, 2, 3, (src_org + 4), pitch); 2221 src_org += (4 * pitch); 2222 ST_W4(vec7, 0, 1, 2, 3, src_org, pitch); 2223 ST_H4(vec5, 4, 5, 6, 7, (src_org + 4), pitch); 2224 2225 return 1; 2226 } else { 2227 src -= 7 * 16; 2228 2229 ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, 2230 zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, 2231 p3_r_in, p2_r_in, p1_r_in, p0_r_in); 2232 q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0); 2233 2234 tmp0_r = p7_r_in << 3; 2235 tmp0_r -= p7_r_in; 2236 tmp0_r 
+= p6_r_in; 2237 tmp0_r += q0_r_in; 2238 tmp1_r = p6_r_in + p5_r_in; 2239 tmp1_r += p4_r_in; 2240 tmp1_r += p3_r_in; 2241 tmp1_r += p2_r_in; 2242 tmp1_r += p1_r_in; 2243 tmp1_r += p0_r_in; 2244 tmp1_r += tmp0_r; 2245 r_out = __msa_srari_h((v8i16) tmp1_r, 4); 2246 2247 ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in, 2248 p5_l_in, p4_l_in); 2249 ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in, 2250 p1_l_in, p0_l_in); 2251 q0_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q0); 2252 2253 tmp0_l = p7_l_in << 3; 2254 tmp0_l -= p7_l_in; 2255 tmp0_l += p6_l_in; 2256 tmp0_l += q0_l_in; 2257 tmp1_l = p6_l_in + p5_l_in; 2258 tmp1_l += p4_l_in; 2259 tmp1_l += p3_l_in; 2260 tmp1_l += p2_l_in; 2261 tmp1_l += p1_l_in; 2262 tmp1_l += p0_l_in; 2263 tmp1_l += tmp0_l; 2264 l_out = __msa_srari_h((v8i16) tmp1_l, 4); 2265 2266 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); 2267 p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2); 2268 ST_UB(p6, src); 2269 src += 16; 2270 2271 /* p5 */ 2272 q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1); 2273 tmp0_r = p5_r_in - p6_r_in; 2274 tmp0_r += q1_r_in; 2275 tmp0_r -= p7_r_in; 2276 tmp1_r += tmp0_r; 2277 r_out = __msa_srari_h((v8i16) tmp1_r, 4); 2278 q1_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q1); 2279 tmp0_l = p5_l_in - p6_l_in; 2280 tmp0_l += q1_l_in; 2281 tmp0_l -= p7_l_in; 2282 tmp1_l += tmp0_l; 2283 l_out = __msa_srari_h((v8i16) tmp1_l, 4); 2284 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); 2285 p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2); 2286 ST_UB(p5, src); 2287 src += 16; 2288 2289 /* p4 */ 2290 q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2); 2291 tmp0_r = p4_r_in - p5_r_in; 2292 tmp0_r += q2_r_in; 2293 tmp0_r -= p7_r_in; 2294 tmp1_r += tmp0_r; 2295 r_out = __msa_srari_h((v8i16) tmp1_r, 4); 2296 q2_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q2); 2297 tmp0_l = p4_l_in - p5_l_in; 2298 tmp0_l += q2_l_in; 2299 tmp0_l -= p7_l_in; 2300 tmp1_l += tmp0_l; 2301 l_out = __msa_srari_h((v8i16) tmp1_l, 4); 2302 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); 2303 p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2); 2304 ST_UB(p4, src); 2305 src += 16; 2306 2307 /* p3 */ 2308 q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3); 2309 tmp0_r = p3_r_in - p4_r_in; 2310 tmp0_r += q3_r_in; 2311 tmp0_r -= p7_r_in; 2312 tmp1_r += tmp0_r; 2313 r_out = __msa_srari_h((v8i16) tmp1_r, 4); 2314 q3_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q3); 2315 tmp0_l = p3_l_in - p4_l_in; 2316 tmp0_l += q3_l_in; 2317 tmp0_l -= p7_l_in; 2318 tmp1_l += tmp0_l; 2319 l_out = __msa_srari_h((v8i16) tmp1_l, 4); 2320 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); 2321 p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2); 2322 ST_UB(p3, src); 2323 src += 16; 2324 2325 /* p2 */ 2326 q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4); 2327 filter8 = LD_UB(filter48); 2328 tmp0_r = p2_r_in - p3_r_in; 2329 tmp0_r += q4_r_in; 2330 tmp0_r -= p7_r_in; 2331 tmp1_r += tmp0_r; 2332 r_out = __msa_srari_h((v8i16) tmp1_r, 4); 2333 q4_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q4); 2334 tmp0_l = p2_l_in - p3_l_in; 2335 tmp0_l += q4_l_in; 2336 tmp0_l -= p7_l_in; 2337 tmp1_l += tmp0_l; 2338 l_out = __msa_srari_h((v8i16) tmp1_l, 4); 2339 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); 2340 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); 2341 ST_UB(filter8, src); 2342 src += 16; 2343 2344 /* p1 */ 2345 q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5); 2346 filter8 = LD_UB(filter48 + 16); 2347 tmp0_r = p1_r_in - p2_r_in; 2348 tmp0_r += q5_r_in; 
2349 tmp0_r -= p7_r_in; 2350 tmp1_r += tmp0_r; 2351 r_out = __msa_srari_h((v8i16) tmp1_r, 4); 2352 q5_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q5); 2353 tmp0_l = p1_l_in - p2_l_in; 2354 tmp0_l += q5_l_in; 2355 tmp0_l -= p7_l_in; 2356 tmp1_l += tmp0_l; 2357 l_out = __msa_srari_h((v8i16) (tmp1_l), 4); 2358 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); 2359 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); 2360 ST_UB(filter8, src); 2361 src += 16; 2362 2363 /* p0 */ 2364 q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6); 2365 filter8 = LD_UB(filter48 + 32); 2366 tmp0_r = p0_r_in - p1_r_in; 2367 tmp0_r += q6_r_in; 2368 tmp0_r -= p7_r_in; 2369 tmp1_r += tmp0_r; 2370 r_out = __msa_srari_h((v8i16) tmp1_r, 4); 2371 q6_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q6); 2372 tmp0_l = p0_l_in - p1_l_in; 2373 tmp0_l += q6_l_in; 2374 tmp0_l -= p7_l_in; 2375 tmp1_l += tmp0_l; 2376 l_out = __msa_srari_h((v8i16) tmp1_l, 4); 2377 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); 2378 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); 2379 ST_UB(filter8, src); 2380 src += 16; 2381 2382 /* q0 */ 2383 q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7); 2384 filter8 = LD_UB(filter48 + 48); 2385 tmp0_r = q7_r_in - p0_r_in; 2386 tmp0_r += q0_r_in; 2387 tmp0_r -= p7_r_in; 2388 tmp1_r += tmp0_r; 2389 r_out = __msa_srari_h((v8i16) tmp1_r, 4); 2390 q7_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q7); 2391 tmp0_l = q7_l_in - p0_l_in; 2392 tmp0_l += q0_l_in; 2393 tmp0_l -= p7_l_in; 2394 tmp1_l += tmp0_l; 2395 l_out = __msa_srari_h((v8i16) tmp1_l, 4); 2396 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); 2397 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); 2398 ST_UB(filter8, src); 2399 src += 16; 2400 2401 /* q1 */ 2402 filter8 = LD_UB(filter48 + 64); 2403 tmp0_r = q7_r_in - q0_r_in; 2404 tmp0_r += q1_r_in; 2405 tmp0_r -= p6_r_in; 2406 tmp1_r += tmp0_r; 2407 r_out = __msa_srari_h((v8i16) tmp1_r, 4); 2408 tmp0_l = q7_l_in - q0_l_in; 2409 tmp0_l += q1_l_in; 2410 tmp0_l -= p6_l_in; 2411 tmp1_l += tmp0_l; 2412 l_out = __msa_srari_h((v8i16) tmp1_l, 4); 2413 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); 2414 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); 2415 ST_UB(filter8, src); 2416 src += 16; 2417 2418 /* q2 */ 2419 filter8 = LD_UB(filter48 + 80); 2420 tmp0_r = q7_r_in - q1_r_in; 2421 tmp0_r += q2_r_in; 2422 tmp0_r -= p5_r_in; 2423 tmp1_r += tmp0_r; 2424 r_out = __msa_srari_h((v8i16) tmp1_r, 4); 2425 tmp0_l = q7_l_in - q1_l_in; 2426 tmp0_l += q2_l_in; 2427 tmp0_l -= p5_l_in; 2428 tmp1_l += tmp0_l; 2429 l_out = __msa_srari_h((v8i16) tmp1_l, 4); 2430 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); 2431 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2); 2432 ST_UB(filter8, src); 2433 src += 16; 2434 2435 /* q3 */ 2436 tmp0_r = q7_r_in - q2_r_in; 2437 tmp0_r += q3_r_in; 2438 tmp0_r -= p4_r_in; 2439 tmp1_r += tmp0_r; 2440 r_out = __msa_srari_h((v8i16) tmp1_r, 4); 2441 tmp0_l = q7_l_in - q2_l_in; 2442 tmp0_l += q3_l_in; 2443 tmp0_l -= p4_l_in; 2444 tmp1_l += tmp0_l; 2445 l_out = __msa_srari_h((v8i16) tmp1_l, 4); 2446 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); 2447 q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2); 2448 ST_UB(q3, src); 2449 src += 16; 2450 2451 /* q4 */ 2452 tmp0_r = q7_r_in - q3_r_in; 2453 tmp0_r += q4_r_in; 2454 tmp0_r -= p3_r_in; 2455 tmp1_r += tmp0_r; 2456 r_out = __msa_srari_h((v8i16) tmp1_r, 4); 2457 tmp0_l = q7_l_in - q3_l_in; 2458 tmp0_l += q4_l_in; 2459 tmp0_l -= p3_l_in; 2460 tmp1_l += tmp0_l; 2461 l_out = 
__msa_srari_h((v8i16) tmp1_l, 4); 2462 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); 2463 q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2); 2464 ST_UB(q4, src); 2465 src += 16; 2466 2467 /* q5 */ 2468 tmp0_r = q7_r_in - q4_r_in; 2469 tmp0_r += q5_r_in; 2470 tmp0_r -= p2_r_in; 2471 tmp1_r += tmp0_r; 2472 r_out = __msa_srari_h((v8i16) tmp1_r, 4); 2473 tmp0_l = q7_l_in - q4_l_in; 2474 tmp0_l += q5_l_in; 2475 tmp0_l -= p2_l_in; 2476 tmp1_l += tmp0_l; 2477 l_out = __msa_srari_h((v8i16) tmp1_l, 4); 2478 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); 2479 q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2); 2480 ST_UB(q5, src); 2481 src += 16; 2482 2483 /* q6 */ 2484 tmp0_r = q7_r_in - q5_r_in; 2485 tmp0_r += q6_r_in; 2486 tmp0_r -= p1_r_in; 2487 tmp1_r += tmp0_r; 2488 r_out = __msa_srari_h((v8i16) tmp1_r, 4); 2489 tmp0_l = q7_l_in - q5_l_in; 2490 tmp0_l += q6_l_in; 2491 tmp0_l -= p1_l_in; 2492 tmp1_l += tmp0_l; 2493 l_out = __msa_srari_h((v8i16) tmp1_l, 4); 2494 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out); 2495 q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2); 2496 ST_UB(q6, src); 2497 2498 return 0; 2499 } 2500} 2501 2502void ff_loop_filter_h_16_16_msa(uint8_t *src, ptrdiff_t pitch, 2503 int32_t b_limit_ptr, 2504 int32_t limit_ptr, 2505 int32_t thresh_ptr) 2506{ 2507 uint8_t early_exit = 0; 2508 uint8_t transposed_input[16 * 24] ALLOC_ALIGNED(ALIGNMENT); 2509 uint8_t *filter48 = &transposed_input[16 * 16]; 2510 2511 vp9_transpose_16x16((src - 8), pitch, &transposed_input[0], 16); 2512 2513 early_exit = vp9_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), 2514 &filter48[0], src, pitch, 2515 b_limit_ptr, limit_ptr, thresh_ptr); 2516 2517 if (0 == early_exit) { 2518 early_exit = vp9_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch, 2519 &filter48[0]); 2520 2521 if (0 == early_exit) { 2522 vp9_transpose_16x16(transposed_input, 16, (src - 8), pitch); 2523 } 2524 } 2525} 2526
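/* Usage note (illustrative only, not part of this file): the *_48_16, *_84_16
 * and *_88_16 entry points expect b_limit_ptr, limit_ptr and thresh_ptr to each
 * carry two packed per-edge 8-bit values, low byte first.  A caller filtering
 * two adjacent 8-pixel edges with limits (e0, i0, h0) and (e1, i1, h1) would
 * therefore pass something along the lines of:
 *
 *     ff_loop_filter_h_48_16_msa(dst, stride,
 *                                (e1 << 8) | e0,
 *                                (i1 << 8) | i0,
 *                                (h1 << 8) | h0);
 *
 * The names e/i/h and dst/stride are hypothetical; only the byte packing is
 * implied by the __msa_fill_b(x) / __msa_fill_b(x >> 8) splats in this file. */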