/*
 * Loongson LASX optimized h264dsp
 *
 * Copyright (c) 2021 Loongson Technology Corporation Limited
 * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
 *                Xiwei Gu <guxiwei-hf@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/loongarch/loongson_intrinsics.h"
#include "h264dsp_lasx.h"

#define AVC_LPF_P1_OR_Q1(p0_or_q0_org_in, q0_or_p0_org_in,          \
                         p1_or_q1_org_in, p2_or_q2_org_in,          \
                         neg_tc_in, tc_in, p1_or_q1_out)            \
{                                                                   \
    __m256i clip3, temp;                                            \
                                                                    \
    clip3 = __lasx_xvavgr_hu(p0_or_q0_org_in,                       \
                             q0_or_p0_org_in);                      \
    temp = __lasx_xvslli_h(p1_or_q1_org_in, 1);                     \
    clip3 = __lasx_xvsub_h(clip3, temp);                            \
    clip3 = __lasx_xvavg_h(p2_or_q2_org_in, clip3);                 \
    clip3 = __lasx_xvclip_h(clip3, neg_tc_in, tc_in);               \
    p1_or_q1_out = __lasx_xvadd_h(p1_or_q1_org_in, clip3);          \
}

#define AVC_LPF_P0Q0(q0_or_p0_org_in, p0_or_q0_org_in,              \
                     p1_or_q1_org_in, q1_or_p1_org_in,              \
                     neg_threshold_in, threshold_in,                \
                     p0_or_q0_out, q0_or_p0_out)                    \
{                                                                   \
    __m256i q0_sub_p0, p1_sub_q1, delta;                            \
                                                                    \
    q0_sub_p0 = __lasx_xvsub_h(q0_or_p0_org_in,                     \
                               p0_or_q0_org_in);                    \
    p1_sub_q1 = __lasx_xvsub_h(p1_or_q1_org_in,                     \
                               q1_or_p1_org_in);                    \
    q0_sub_p0 = __lasx_xvslli_h(q0_sub_p0, 2);                      \
    p1_sub_q1 = __lasx_xvaddi_hu(p1_sub_q1, 4);                     \
    delta = __lasx_xvadd_h(q0_sub_p0, p1_sub_q1);                   \
    delta = __lasx_xvsrai_h(delta, 3);                              \
    delta = __lasx_xvclip_h(delta, neg_threshold_in,                \
                            threshold_in);                          \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_org_in, delta);          \
    q0_or_p0_out = __lasx_xvsub_h(q0_or_p0_org_in, delta);          \
                                                                    \
    p0_or_q0_out = __lasx_xvclip255_h(p0_or_q0_out);                \
    q0_or_p0_out = __lasx_xvclip255_h(q0_or_p0_out);                \
}

void ff_h264_h_lpf_luma_8_lasx(uint8_t *data, ptrdiff_t img_width,
                               int alpha_in, int beta_in, int8_t *tc)
{
    ptrdiff_t img_width_2x = img_width << 1;
    ptrdiff_t img_width_4x = img_width << 2;
    ptrdiff_t img_width_8x = img_width << 3;
    ptrdiff_t img_width_3x = img_width_2x + img_width;
    __m256i tmp_vec0, bs_vec;
    __m256i tc_vec = {0x0101010100000000, 0x0303030302020202,
                      0x0101010100000000, 0x0303030302020202};

    tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0);
    tc_vec = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec);
    bs_vec = __lasx_xvslti_b(tc_vec, 0);
    bs_vec = __lasx_xvxori_b(bs_vec, 255);
    bs_vec = __lasx_xvandi_b(bs_vec, 1);

    if (__lasx_xbnz_v(bs_vec)) {
        uint8_t *src = data - 4;
        __m256i p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
        __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
        __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
        __m256i is_bs_greater_than0;
        __m256i zero = __lasx_xvldi(0);

        is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec);

        {
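            /* Load 16 rows of 8 bytes around the vertical edge and
             * transpose them so that p3..p0 and q0..q3 each hold one
             * column of samples. */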
            uint8_t *src_tmp = src + img_width_8x;
            __m256i row0, row1, row2, row3, row4, row5, row6, row7;
            __m256i row8, row9, row10, row11, row12, row13, row14, row15;

            DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                      src, img_width_3x, row0, row1, row2, row3);
            src += img_width_4x;
            DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                      src, img_width_3x, row4, row5, row6, row7);
            src -= img_width_4x;
            DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, img_width, src_tmp,
                      img_width_2x, src_tmp, img_width_3x,
                      row8, row9, row10, row11);
            src_tmp += img_width_4x;
            DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, img_width, src_tmp,
                      img_width_2x, src_tmp, img_width_3x,
                      row12, row13, row14, row15);
            src_tmp -= img_width_4x;

            LASX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6,
                                 row7, row8, row9, row10, row11,
                                 row12, row13, row14, row15,
                                 p3_org, p2_org, p1_org, p0_org,
                                 q0_org, q1_org, q2_org, q3_org);
        }

        p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
        p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
        q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

        alpha = __lasx_xvreplgr2vr_b(alpha_in);
        beta = __lasx_xvreplgr2vr_b(beta_in);

        is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
        is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta);
        is_less_than = is_less_than_alpha & is_less_than_beta;
        is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta);
        is_less_than = is_less_than_beta & is_less_than;
        is_less_than = is_less_than & is_bs_greater_than0;

        if (__lasx_xbnz_v(is_less_than)) {
            __m256i neg_tc_h, tc_h, p1_org_h, p0_org_h, q0_org_h, q1_org_h;
            __m256i p2_asub_p0, q2_asub_q0;

            neg_tc_h = __lasx_xvneg_b(tc_vec);
            neg_tc_h = __lasx_vext2xv_h_b(neg_tc_h);
            tc_h = __lasx_vext2xv_hu_bu(tc_vec);
            p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
            p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
            q0_org_h = __lasx_vext2xv_hu_bu(q0_org);

            p2_asub_p0 = __lasx_xvabsd_bu(p2_org, p0_org);
            is_less_than_beta = __lasx_xvslt_bu(p2_asub_p0, beta);
            is_less_than_beta = is_less_than_beta & is_less_than;

            if (__lasx_xbnz_v(is_less_than_beta)) {
                __m256i p2_org_h, p1_h;

                p2_org_h = __lasx_vext2xv_hu_bu(p2_org);
                AVC_LPF_P1_OR_Q1(p0_org_h, q0_org_h, p1_org_h, p2_org_h,
                                 neg_tc_h, tc_h, p1_h);
                p1_h = __lasx_xvpickev_b(p1_h, p1_h);
                p1_h = __lasx_xvpermi_d(p1_h, 0xd8);
                p1_org = __lasx_xvbitsel_v(p1_org, p1_h, is_less_than_beta);
                is_less_than_beta = __lasx_xvandi_b(is_less_than_beta, 1);
                tc_vec = __lasx_xvadd_b(tc_vec, is_less_than_beta);
            }

            q2_asub_q0 = __lasx_xvabsd_bu(q2_org, q0_org);
            is_less_than_beta = __lasx_xvslt_bu(q2_asub_q0, beta);
            is_less_than_beta = is_less_than_beta & is_less_than;

            q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

            if (__lasx_xbnz_v(is_less_than_beta)) {
                __m256i q2_org_h, q1_h;

                q2_org_h = __lasx_vext2xv_hu_bu(q2_org);
                AVC_LPF_P1_OR_Q1(p0_org_h, q0_org_h, q1_org_h, q2_org_h,
                                 neg_tc_h, tc_h, q1_h);
                q1_h = __lasx_xvpickev_b(q1_h, q1_h);
                q1_h = __lasx_xvpermi_d(q1_h, 0xd8);
                q1_org = __lasx_xvbitsel_v(q1_org, q1_h, is_less_than_beta);

                is_less_than_beta = __lasx_xvandi_b(is_less_than_beta, 1);
                tc_vec = __lasx_xvadd_b(tc_vec, is_less_than_beta);
            }

            {
                __m256i neg_thresh_h, p0_h, q0_h;

                neg_thresh_h = __lasx_xvneg_b(tc_vec);
                neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h);
                tc_h = __lasx_vext2xv_hu_bu(tc_vec);

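                /* p0/q0 update for bS < 4:
                 * delta = clip3(-tc, tc, ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3)
                 * p0'   = clip_uint8(p0 + delta), q0' = clip_uint8(q0 - delta) */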
                AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h,
                             neg_thresh_h, tc_h, p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h,
                          p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8,
                          p0_h, q0_h);
                p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
                q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
            }

            {
                __m256i row0, row1, row2, row3, row4, row5, row6, row7;
                __m256i control = {0x0000000400000000, 0x0000000500000001,
                                   0x0000000600000002, 0x0000000700000003};

                DUP4_ARG3(__lasx_xvpermi_q, p0_org, q3_org, 0x02, p1_org,
                          q2_org, 0x02, p2_org, q1_org, 0x02, p3_org,
                          q0_org, 0x02, p0_org, p1_org, p2_org, p3_org);
                DUP2_ARG2(__lasx_xvilvl_b, p1_org, p3_org, p0_org, p2_org,
                          row0, row2);
                DUP2_ARG2(__lasx_xvilvh_b, p1_org, p3_org, p0_org, p2_org,
                          row1, row3);
                DUP2_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row4, row6);
                DUP2_ARG2(__lasx_xvilvh_b, row2, row0, row3, row1, row5, row7);
                DUP4_ARG2(__lasx_xvperm_w, row4, control, row5, control, row6,
                          control, row7, control, row4, row5, row6, row7);
                __lasx_xvstelm_d(row4, src, 0, 0);
                __lasx_xvstelm_d(row4, src + img_width, 0, 1);
                src += img_width_2x;
                __lasx_xvstelm_d(row4, src, 0, 2);
                __lasx_xvstelm_d(row4, src + img_width, 0, 3);
                src += img_width_2x;
                __lasx_xvstelm_d(row5, src, 0, 0);
                __lasx_xvstelm_d(row5, src + img_width, 0, 1);
                src += img_width_2x;
                __lasx_xvstelm_d(row5, src, 0, 2);
                __lasx_xvstelm_d(row5, src + img_width, 0, 3);
                src += img_width_2x;
                __lasx_xvstelm_d(row6, src, 0, 0);
                __lasx_xvstelm_d(row6, src + img_width, 0, 1);
                src += img_width_2x;
                __lasx_xvstelm_d(row6, src, 0, 2);
                __lasx_xvstelm_d(row6, src + img_width, 0, 3);
                src += img_width_2x;
                __lasx_xvstelm_d(row7, src, 0, 0);
                __lasx_xvstelm_d(row7, src + img_width, 0, 1);
                src += img_width_2x;
                __lasx_xvstelm_d(row7, src, 0, 2);
                __lasx_xvstelm_d(row7, src + img_width, 0, 3);
            }
        }
    }
}

void ff_h264_v_lpf_luma_8_lasx(uint8_t *data, ptrdiff_t img_width,
                               int alpha_in, int beta_in, int8_t *tc)
{
    ptrdiff_t img_width_2x = img_width << 1;
    ptrdiff_t img_width_3x = img_width + img_width_2x;
    __m256i tmp_vec0, bs_vec;
    __m256i tc_vec = {0x0101010100000000, 0x0303030302020202,
                      0x0101010100000000, 0x0303030302020202};

    tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0);
    tc_vec = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec);
    bs_vec = __lasx_xvslti_b(tc_vec, 0);
    bs_vec = __lasx_xvxori_b(bs_vec, 255);
    bs_vec = __lasx_xvandi_b(bs_vec, 1);

    if (__lasx_xbnz_v(bs_vec)) {
        __m256i p2_org, p1_org, p0_org, q0_org, q1_org, q2_org;
        __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
        __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
        __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;
        __m256i is_bs_greater_than0;
        __m256i zero = __lasx_xvldi(0);

        alpha = __lasx_xvreplgr2vr_b(alpha_in);
        beta = __lasx_xvreplgr2vr_b(beta_in);

        DUP2_ARG2(__lasx_xvldx, data, -img_width_3x, data, -img_width_2x,
                  p2_org, p1_org);
        p0_org = __lasx_xvldx(data, -img_width);
        DUP2_ARG2(__lasx_xvldx, data, 0, data, img_width, q0_org, q1_org);

        is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec);
        p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
        p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
        q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

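        /* Filter this edge only where bS > 0 and |p0 - q0| < alpha,
         * |p1 - p0| < beta, |q1 - q0| < beta. */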
        is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
        is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta);
        is_less_than = is_less_than_alpha & is_less_than_beta;
        is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta);
        is_less_than = is_less_than_beta & is_less_than;
        is_less_than = is_less_than & is_bs_greater_than0;

        if (__lasx_xbnz_v(is_less_than)) {
            __m256i neg_tc_h, tc_h, p2_asub_p0, q2_asub_q0;

            q2_org = __lasx_xvldx(data, img_width_2x);

            neg_tc_h = __lasx_xvneg_b(tc_vec);
            neg_tc_h = __lasx_vext2xv_h_b(neg_tc_h);
            tc_h = __lasx_vext2xv_hu_bu(tc_vec);
            p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
            p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
            q0_org_h = __lasx_vext2xv_hu_bu(q0_org);

            p2_asub_p0 = __lasx_xvabsd_bu(p2_org, p0_org);
            is_less_than_beta = __lasx_xvslt_bu(p2_asub_p0, beta);
            is_less_than_beta = is_less_than_beta & is_less_than;

            if (__lasx_xbnz_v(is_less_than_beta)) {
                __m256i p1_h, p2_org_h;

                p2_org_h = __lasx_vext2xv_hu_bu(p2_org);
                AVC_LPF_P1_OR_Q1(p0_org_h, q0_org_h, p1_org_h, p2_org_h,
                                 neg_tc_h, tc_h, p1_h);
                p1_h = __lasx_xvpickev_b(p1_h, p1_h);
                p1_h = __lasx_xvpermi_d(p1_h, 0xd8);
                p1_h = __lasx_xvbitsel_v(p1_org, p1_h, is_less_than_beta);
                p1_org = __lasx_xvpermi_q(p1_org, p1_h, 0x30);
                __lasx_xvst(p1_org, data - img_width_2x, 0);

                is_less_than_beta = __lasx_xvandi_b(is_less_than_beta, 1);
                tc_vec = __lasx_xvadd_b(tc_vec, is_less_than_beta);
            }

            q2_asub_q0 = __lasx_xvabsd_bu(q2_org, q0_org);
            is_less_than_beta = __lasx_xvslt_bu(q2_asub_q0, beta);
            is_less_than_beta = is_less_than_beta & is_less_than;

            q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

            if (__lasx_xbnz_v(is_less_than_beta)) {
                __m256i q1_h, q2_org_h;

                q2_org_h = __lasx_vext2xv_hu_bu(q2_org);
                AVC_LPF_P1_OR_Q1(p0_org_h, q0_org_h, q1_org_h, q2_org_h,
                                 neg_tc_h, tc_h, q1_h);
                q1_h = __lasx_xvpickev_b(q1_h, q1_h);
                q1_h = __lasx_xvpermi_d(q1_h, 0xd8);
                q1_h = __lasx_xvbitsel_v(q1_org, q1_h, is_less_than_beta);
                q1_org = __lasx_xvpermi_q(q1_org, q1_h, 0x30);
                __lasx_xvst(q1_org, data + img_width, 0);

                is_less_than_beta = __lasx_xvandi_b(is_less_than_beta, 1);
                tc_vec = __lasx_xvadd_b(tc_vec, is_less_than_beta);

            }

            {
                __m256i neg_thresh_h, p0_h, q0_h;

                neg_thresh_h = __lasx_xvneg_b(tc_vec);
                neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h);
                tc_h = __lasx_vext2xv_hu_bu(tc_vec);

                AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h,
                             neg_thresh_h, tc_h, p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h,
                          p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8,
                          p0_h, q0_h);
                p0_h = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
                q0_h = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
                p0_org = __lasx_xvpermi_q(p0_org, p0_h, 0x30);
                q0_org = __lasx_xvpermi_q(q0_org, q0_h, 0x30);
                __lasx_xvst(p0_org, data - img_width, 0);
                __lasx_xvst(q0_org, data, 0);
            }
        }
    }
}

void ff_h264_h_lpf_chroma_8_lasx(uint8_t *data, ptrdiff_t img_width,
                                 int alpha_in, int beta_in, int8_t *tc)
{
    __m256i tmp_vec0, bs_vec;
    __m256i tc_vec = {0x0303020201010000, 0x0303020201010000, 0x0, 0x0};
    __m256i zero = __lasx_xvldi(0);
    ptrdiff_t img_width_2x = img_width << 1;
    ptrdiff_t img_width_4x = img_width << 2;
    ptrdiff_t img_width_3x = img_width_2x + img_width;

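    /* tc_vec starts out as the shuffle control that replicates each of the
     * four tc0 values to its two chroma pixels; a negative tc0 marks
     * bS == 0 and is folded into bs_vec below to disable filtering there. */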
    tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0);
    tc_vec = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec);
    bs_vec = __lasx_xvslti_b(tc_vec, 0);
    bs_vec = __lasx_xvxori_b(bs_vec, 255);
    bs_vec = __lasx_xvandi_b(bs_vec, 1);
    bs_vec = __lasx_xvpermi_q(zero, bs_vec, 0x30);

    if (__lasx_xbnz_v(bs_vec)) {
        uint8_t *src = data - 2;
        __m256i p1_org, p0_org, q0_org, q1_org;
        __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
        __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
        __m256i is_bs_greater_than0;

        is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec);

        {
            __m256i row0, row1, row2, row3, row4, row5, row6, row7;

            DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                      src, img_width_3x, row0, row1, row2, row3);
            src += img_width_4x;
            DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                      src, img_width_3x, row4, row5, row6, row7);
            src -= img_width_4x;
            /* LASX_TRANSPOSE8x4_B */
            DUP4_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row6, row4,
                      row7, row5, p1_org, p0_org, q0_org, q1_org);
            row0 = __lasx_xvilvl_b(p0_org, p1_org);
            row1 = __lasx_xvilvl_b(q1_org, q0_org);
            row3 = __lasx_xvilvh_w(row1, row0);
            row2 = __lasx_xvilvl_w(row1, row0);
            p1_org = __lasx_xvpermi_d(row2, 0x00);
            p0_org = __lasx_xvpermi_d(row2, 0x55);
            q0_org = __lasx_xvpermi_d(row3, 0x00);
            q1_org = __lasx_xvpermi_d(row3, 0x55);
        }

        p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
        p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
        q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

        alpha = __lasx_xvreplgr2vr_b(alpha_in);
        beta = __lasx_xvreplgr2vr_b(beta_in);

        is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
        is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta);
        is_less_than = is_less_than_alpha & is_less_than_beta;
        is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta);
        is_less_than = is_less_than_beta & is_less_than;
        is_less_than = is_less_than & is_bs_greater_than0;

        if (__lasx_xbnz_v(is_less_than)) {
            __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;

            p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
            p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
            q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
            q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

            {
                __m256i tc_h, neg_thresh_h, p0_h, q0_h;

                neg_thresh_h = __lasx_xvneg_b(tc_vec);
                neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h);
                tc_h = __lasx_vext2xv_hu_bu(tc_vec);

                AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h,
                             neg_thresh_h, tc_h, p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h,
                          p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8,
                          p0_h, q0_h);
                p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
                q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
            }

            p0_org = __lasx_xvilvl_b(q0_org, p0_org);
            src = data - 1;
            __lasx_xvstelm_h(p0_org, src, 0, 0);
            src += img_width;
            __lasx_xvstelm_h(p0_org, src, 0, 1);
            src += img_width;
            __lasx_xvstelm_h(p0_org, src, 0, 2);
            src += img_width;
            __lasx_xvstelm_h(p0_org, src, 0, 3);
            src += img_width;
            __lasx_xvstelm_h(p0_org, src, 0, 4);
            src += img_width;
            __lasx_xvstelm_h(p0_org, src, 0, 5);
            src += img_width;
            __lasx_xvstelm_h(p0_org, src, 0, 6);
            src += img_width;
            __lasx_xvstelm_h(p0_org, src, 0, 7);
        }
    }
}

void ff_h264_v_lpf_chroma_8_lasx(uint8_t *data, ptrdiff_t img_width,
                                 int alpha_in, int beta_in, int8_t *tc)
{
    int img_width_2x = img_width << 1;
    __m256i tmp_vec0, bs_vec;
    __m256i tc_vec = {0x0303020201010000, 0x0303020201010000, 0x0, 0x0};
    __m256i zero = __lasx_xvldi(0);

    tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0);
    tc_vec = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec);
    bs_vec = __lasx_xvslti_b(tc_vec, 0);
    bs_vec = __lasx_xvxori_b(bs_vec, 255);
    bs_vec = __lasx_xvandi_b(bs_vec, 1);
    bs_vec = __lasx_xvpermi_q(zero, bs_vec, 0x30);

    if (__lasx_xbnz_v(bs_vec)) {
        __m256i p1_org, p0_org, q0_org, q1_org;
        __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
        __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
        __m256i is_bs_greater_than0;

        alpha = __lasx_xvreplgr2vr_b(alpha_in);
        beta = __lasx_xvreplgr2vr_b(beta_in);

        DUP2_ARG2(__lasx_xvldx, data, -img_width_2x, data, -img_width,
                  p1_org, p0_org);
        DUP2_ARG2(__lasx_xvldx, data, 0, data, img_width, q0_org, q1_org);

        is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec);
        p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
        p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
        q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

        is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
        is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta);
        is_less_than = is_less_than_alpha & is_less_than_beta;
        is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta);
        is_less_than = is_less_than_beta & is_less_than;
        is_less_than = is_less_than & is_bs_greater_than0;

        if (__lasx_xbnz_v(is_less_than)) {
            __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;

            p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
            p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
            q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
            q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

            {
                __m256i neg_thresh_h, tc_h, p0_h, q0_h;

                neg_thresh_h = __lasx_xvneg_b(tc_vec);
                neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h);
                tc_h = __lasx_vext2xv_hu_bu(tc_vec);

                AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h,
                             neg_thresh_h, tc_h, p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h,
                          p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8,
                          p0_h, q0_h);
                p0_h = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
                q0_h = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
                __lasx_xvstelm_d(p0_h, data - img_width, 0, 0);
                __lasx_xvstelm_d(q0_h, data, 0, 0);
            }
        }
    }
}

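/* Strong (bS == 4) luma filtering of one side of the edge:
 *   p0' = (p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3
 *   p1' = (p2 + p1 + p0 + q0 + 2) >> 2
 *   p2' = (2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3
 * and symmetrically for q0/q1/q2. */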
#define AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_or_q3_org_in, p0_or_q0_org_in,          \
                                 q3_or_p3_org_in, p1_or_q1_org_in,          \
                                 p2_or_q2_org_in, q1_or_p1_org_in,          \
                                 p0_or_q0_out, p1_or_q1_out, p2_or_q2_out)  \
{                                                                           \
    __m256i threshold;                                                      \
    __m256i const2, const3 = __lasx_xvldi(0);                               \
                                                                            \
    const2 = __lasx_xvaddi_hu(const3, 2);                                   \
    const3 = __lasx_xvaddi_hu(const3, 3);                                   \
    threshold = __lasx_xvadd_h(p0_or_q0_org_in, q3_or_p3_org_in);           \
    threshold = __lasx_xvadd_h(p1_or_q1_org_in, threshold);                 \
                                                                            \
    p0_or_q0_out = __lasx_xvslli_h(threshold, 1);                           \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_out, p2_or_q2_org_in);           \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_out, q1_or_p1_org_in);           \
    p0_or_q0_out = __lasx_xvsrar_h(p0_or_q0_out, const3);                   \
                                                                            \
    p1_or_q1_out = __lasx_xvadd_h(p2_or_q2_org_in, threshold);              \
    p1_or_q1_out = __lasx_xvsrar_h(p1_or_q1_out, const2);                   \
                                                                            \
    p2_or_q2_out = __lasx_xvmul_h(p2_or_q2_org_in, const3);                 \
    p2_or_q2_out = __lasx_xvadd_h(p2_or_q2_out, p3_or_q3_org_in);           \
    p2_or_q2_out = __lasx_xvadd_h(p2_or_q2_out, p3_or_q3_org_in);           \
    p2_or_q2_out = __lasx_xvadd_h(p2_or_q2_out, threshold);                 \
    p2_or_q2_out = __lasx_xvsrar_h(p2_or_q2_out, const3);                   \
}

/* data[-u32_img_width] = (uint8_t)((2 * p1 + p0 + q1 + 2) >> 2); */
#define AVC_LPF_P0_OR_Q0(p0_or_q0_org_in, q1_or_p1_org_in,                  \
                         p1_or_q1_org_in, p0_or_q0_out)                     \
{                                                                           \
    __m256i const2 = __lasx_xvldi(0);                                       \
    const2 = __lasx_xvaddi_hu(const2, 2);                                   \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_org_in, q1_or_p1_org_in);        \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_out, p1_or_q1_org_in);           \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_out, p1_or_q1_org_in);           \
    p0_or_q0_out = __lasx_xvsrar_h(p0_or_q0_out, const2);                   \
}

void ff_h264_h_lpf_luma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
                                     int alpha_in, int beta_in)
{
    ptrdiff_t img_width_2x = img_width << 1;
    ptrdiff_t img_width_4x = img_width << 2;
    ptrdiff_t img_width_3x = img_width_2x + img_width;
    uint8_t *src = data - 4;
    __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
    __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
    __m256i p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
    __m256i zero = __lasx_xvldi(0);

    {
        __m256i row0, row1, row2, row3, row4, row5, row6, row7;
        __m256i row8, row9, row10, row11, row12, row13, row14, row15;

        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                  src, img_width_3x, row0, row1, row2, row3);
        src += img_width_4x;
        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                  src, img_width_3x, row4, row5, row6, row7);
        src += img_width_4x;
        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                  src, img_width_3x, row8, row9, row10, row11);
        src += img_width_4x;
        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                  src, img_width_3x, row12, row13, row14, row15);
        src += img_width_4x;

        LASX_TRANSPOSE16x8_B(row0, row1, row2, row3,
                             row4, row5, row6, row7,
                             row8, row9, row10, row11,
                             row12, row13, row14, row15,
                             p3_org, p2_org, p1_org, p0_org,
                             q0_org, q1_org, q2_org, q3_org);
    }

    alpha = __lasx_xvreplgr2vr_b(alpha_in);
    beta = __lasx_xvreplgr2vr_b(beta_in);
    p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
    p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
    q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

    is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
    is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta);
    is_less_than = is_less_than_beta & is_less_than_alpha;
    is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta);
    is_less_than = is_less_than_beta & is_less_than;
    is_less_than = __lasx_xvpermi_q(zero, is_less_than, 0x30);

    if (__lasx_xbnz_v(is_less_than)) {
        __m256i p2_asub_p0, q2_asub_q0, p0_h, q0_h, negate_is_less_than_beta;
        __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;
        __m256i less_alpha_shift2_add2 = __lasx_xvsrli_b(alpha, 2);

        less_alpha_shift2_add2 = __lasx_xvaddi_bu(less_alpha_shift2_add2, 2);
        less_alpha_shift2_add2 = __lasx_xvslt_bu(p0_asub_q0,
                                                 less_alpha_shift2_add2);

        p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
        p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
        q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
        q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

        p2_asub_p0 = __lasx_xvabsd_bu(p2_org, p0_org);
        is_less_than_beta = __lasx_xvslt_bu(p2_asub_p0, beta);
        is_less_than_beta = is_less_than_beta & less_alpha_shift2_add2;
        negate_is_less_than_beta = __lasx_xvxori_b(is_less_than_beta, 0xff);
        is_less_than_beta = is_less_than_beta & is_less_than;
        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;

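        /* is_less_than_beta now selects the pixels where |p2 - p0| < beta and
         * |p0 - q0| < (alpha >> 2) + 2: those get the full p0/p1/p2 strong
         * update below, the rest keep the 3-tap p0-only filter. */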
        /* combine and store */
        if (__lasx_xbnz_v(is_less_than_beta)) {
            __m256i p2_org_h, p3_org_h, p1_h, p2_h;

            p2_org_h = __lasx_vext2xv_hu_bu(p2_org);
            p3_org_h = __lasx_vext2xv_hu_bu(p3_org);

            AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_h, p0_org_h, q0_org_h, p1_org_h,
                                     p2_org_h, q1_org_h, p0_h, p1_h, p2_h);

            p0_h = __lasx_xvpickev_b(p0_h, p0_h);
            p0_h = __lasx_xvpermi_d(p0_h, 0xd8);
            DUP2_ARG2(__lasx_xvpickev_b, p1_h, p1_h, p2_h, p2_h, p1_h, p2_h);
            DUP2_ARG2(__lasx_xvpermi_d, p1_h, 0xd8, p2_h, 0xd8, p1_h, p2_h);
            p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than_beta);
            p1_org = __lasx_xvbitsel_v(p1_org, p1_h, is_less_than_beta);
            p2_org = __lasx_xvbitsel_v(p2_org, p2_h, is_less_than_beta);
        }

        AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h);
        /* combine */
        p0_h = __lasx_xvpickev_b(p0_h, p0_h);
        p0_h = __lasx_xvpermi_d(p0_h, 0xd8);
        p0_org = __lasx_xvbitsel_v(p0_org, p0_h, negate_is_less_than_beta);

        /* if (tmpFlag && (unsigned)ABS(q2-q0) < thresholds->beta_in) */
        q2_asub_q0 = __lasx_xvabsd_bu(q2_org, q0_org);
        is_less_than_beta = __lasx_xvslt_bu(q2_asub_q0, beta);
        is_less_than_beta = is_less_than_beta & less_alpha_shift2_add2;
        negate_is_less_than_beta = __lasx_xvxori_b(is_less_than_beta, 0xff);
        is_less_than_beta = is_less_than_beta & is_less_than;
        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;

        /* combine and store */
        if (__lasx_xbnz_v(is_less_than_beta)) {
            __m256i q2_org_h, q3_org_h, q1_h, q2_h;

            q2_org_h = __lasx_vext2xv_hu_bu(q2_org);
            q3_org_h = __lasx_vext2xv_hu_bu(q3_org);

            AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_h, q0_org_h, p0_org_h, q1_org_h,
                                     q2_org_h, p1_org_h, q0_h, q1_h, q2_h);

            q0_h = __lasx_xvpickev_b(q0_h, q0_h);
            q0_h = __lasx_xvpermi_d(q0_h, 0xd8);
            DUP2_ARG2(__lasx_xvpickev_b, q1_h, q1_h, q2_h, q2_h, q1_h, q2_h);
            DUP2_ARG2(__lasx_xvpermi_d, q1_h, 0xd8, q2_h, 0xd8, q1_h, q2_h);
            q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than_beta);
            q1_org = __lasx_xvbitsel_v(q1_org, q1_h, is_less_than_beta);
            q2_org = __lasx_xvbitsel_v(q2_org, q2_h, is_less_than_beta);

        }

        AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h);

        /* combine */
        q0_h = __lasx_xvpickev_b(q0_h, q0_h);
        q0_h = __lasx_xvpermi_d(q0_h, 0xd8);
        q0_org = __lasx_xvbitsel_v(q0_org, q0_h, negate_is_less_than_beta);

        /* transpose and store */
        {
            __m256i row0, row1, row2, row3, row4, row5, row6, row7;
            __m256i control = {0x0000000400000000, 0x0000000500000001,
                               0x0000000600000002, 0x0000000700000003};

            DUP4_ARG3(__lasx_xvpermi_q, p0_org, q3_org, 0x02, p1_org, q2_org,
                      0x02, p2_org, q1_org, 0x02, p3_org, q0_org, 0x02,
                      p0_org, p1_org, p2_org, p3_org);
            DUP2_ARG2(__lasx_xvilvl_b, p1_org, p3_org, p0_org, p2_org,
                      row0, row2);
            DUP2_ARG2(__lasx_xvilvh_b, p1_org, p3_org, p0_org, p2_org,
                      row1, row3);
            DUP2_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row4, row6);
            DUP2_ARG2(__lasx_xvilvh_b, row2, row0, row3, row1, row5, row7);
            DUP4_ARG2(__lasx_xvperm_w, row4, control, row5, control, row6,
                      control, row7, control, row4, row5, row6, row7);
            src = data - 4;
            __lasx_xvstelm_d(row4, src, 0, 0);
            __lasx_xvstelm_d(row4, src + img_width, 0, 1);
            src += img_width_2x;
            __lasx_xvstelm_d(row4, src, 0, 2);
            __lasx_xvstelm_d(row4, src + img_width, 0, 3);
            src += img_width_2x;
            __lasx_xvstelm_d(row5, src, 0, 0);
            __lasx_xvstelm_d(row5, src + img_width, 0, 1);
            src += img_width_2x;
            __lasx_xvstelm_d(row5, src, 0, 2);
            __lasx_xvstelm_d(row5, src + img_width, 0, 3);
            src += img_width_2x;
            __lasx_xvstelm_d(row6, src, 0, 0);
            __lasx_xvstelm_d(row6, src + img_width, 0, 1);
            src += img_width_2x;
            __lasx_xvstelm_d(row6, src, 0, 2);
            __lasx_xvstelm_d(row6, src + img_width, 0, 3);
            src += img_width_2x;
            __lasx_xvstelm_d(row7, src, 0, 0);
            __lasx_xvstelm_d(row7, src + img_width, 0, 1);
            src += img_width_2x;
            __lasx_xvstelm_d(row7, src, 0, 2);
            __lasx_xvstelm_d(row7, src + img_width, 0, 3);
        }
    }
}

void ff_h264_v_lpf_luma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
                                     int alpha_in, int beta_in)
{
    ptrdiff_t img_width_2x = img_width << 1;
    ptrdiff_t img_width_3x = img_width_2x + img_width;
    uint8_t *src = data - img_width_2x;
    __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
    __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
    __m256i p1_org, p0_org, q0_org, q1_org;
    __m256i zero = __lasx_xvldi(0);

    DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
              src, img_width_3x, p1_org, p0_org, q0_org, q1_org);
    alpha = __lasx_xvreplgr2vr_b(alpha_in);
    beta = __lasx_xvreplgr2vr_b(beta_in);
    p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
    p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
    q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

    is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
    is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta);
    is_less_than = is_less_than_beta & is_less_than_alpha;
    is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta);
    is_less_than = is_less_than_beta & is_less_than;
    is_less_than = __lasx_xvpermi_q(zero, is_less_than, 0x30);

    if (__lasx_xbnz_v(is_less_than)) {
        __m256i p2_asub_p0, q2_asub_q0, p0_h, q0_h, negate_is_less_than_beta;
        __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;
        __m256i p2_org = __lasx_xvldx(src, -img_width);
        __m256i q2_org = __lasx_xvldx(data, img_width_2x);
        __m256i less_alpha_shift2_add2 = __lasx_xvsrli_b(alpha, 2);
        less_alpha_shift2_add2 = __lasx_xvaddi_bu(less_alpha_shift2_add2, 2);
        less_alpha_shift2_add2 = __lasx_xvslt_bu(p0_asub_q0,
                                                 less_alpha_shift2_add2);

        p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
        p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
        q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
        q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

        p2_asub_p0 = __lasx_xvabsd_bu(p2_org, p0_org);
        is_less_than_beta = __lasx_xvslt_bu(p2_asub_p0, beta);
        is_less_than_beta = is_less_than_beta & less_alpha_shift2_add2;
        negate_is_less_than_beta = __lasx_xvxori_b(is_less_than_beta, 0xff);
        is_less_than_beta = is_less_than_beta & is_less_than;
        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;

        /* combine and store */
        if (__lasx_xbnz_v(is_less_than_beta)) {
            __m256i p2_org_h, p3_org_h, p1_h, p2_h;
            __m256i p3_org = __lasx_xvldx(src, -img_width_2x);

            p2_org_h = __lasx_vext2xv_hu_bu(p2_org);
            p3_org_h = __lasx_vext2xv_hu_bu(p3_org);

            AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_h, p0_org_h, q0_org_h, p1_org_h,
                                     p2_org_h, q1_org_h, p0_h, p1_h, p2_h);

            p0_h = __lasx_xvpickev_b(p0_h, p0_h);
            p0_h = __lasx_xvpermi_d(p0_h, 0xd8);
            DUP2_ARG2(__lasx_xvpickev_b, p1_h, p1_h, p2_h, p2_h, p1_h, p2_h);
            DUP2_ARG2(__lasx_xvpermi_d, p1_h, 0xd8, p2_h, 0xd8, p1_h, p2_h);
            p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than_beta);
            p1_org = __lasx_xvbitsel_v(p1_org, p1_h, is_less_than_beta);
            p2_org = __lasx_xvbitsel_v(p2_org, p2_h, is_less_than_beta);

            __lasx_xvst(p1_org, src, 0);
            __lasx_xvst(p2_org, src - img_width, 0);
        }

        AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h);
        /* combine */
        p0_h = __lasx_xvpickev_b(p0_h, p0_h);
        p0_h = __lasx_xvpermi_d(p0_h, 0xd8);
        p0_org = __lasx_xvbitsel_v(p0_org, p0_h, negate_is_less_than_beta);
        __lasx_xvst(p0_org, data - img_width, 0);

        /* if (tmpFlag && (unsigned)ABS(q2-q0) < thresholds->beta_in) */
        q2_asub_q0 = __lasx_xvabsd_bu(q2_org, q0_org);
        is_less_than_beta = __lasx_xvslt_bu(q2_asub_q0, beta);
        is_less_than_beta = is_less_than_beta & less_alpha_shift2_add2;
        negate_is_less_than_beta = __lasx_xvxori_b(is_less_than_beta, 0xff);
        is_less_than_beta = is_less_than_beta & is_less_than;
        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;

        /* combine and store */
        if (__lasx_xbnz_v(is_less_than_beta)) {
            __m256i q2_org_h, q3_org_h, q1_h, q2_h;
            __m256i q3_org = __lasx_xvldx(data, img_width_2x + img_width);

            q2_org_h = __lasx_vext2xv_hu_bu(q2_org);
            q3_org_h = __lasx_vext2xv_hu_bu(q3_org);

            AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_h, q0_org_h, p0_org_h, q1_org_h,
                                     q2_org_h, p1_org_h, q0_h, q1_h, q2_h);

            q0_h = __lasx_xvpickev_b(q0_h, q0_h);
            q0_h = __lasx_xvpermi_d(q0_h, 0xd8);
            DUP2_ARG2(__lasx_xvpickev_b, q1_h, q1_h, q2_h, q2_h, q1_h, q2_h);
            DUP2_ARG2(__lasx_xvpermi_d, q1_h, 0xd8, q2_h, 0xd8, q1_h, q2_h);
            q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than_beta);
            q1_org = __lasx_xvbitsel_v(q1_org, q1_h, is_less_than_beta);
            q2_org = __lasx_xvbitsel_v(q2_org, q2_h, is_less_than_beta);

            __lasx_xvst(q1_org, data + img_width, 0);
            __lasx_xvst(q2_org, data + img_width_2x, 0);
        }

        AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h);

        /* combine */
        q0_h = __lasx_xvpickev_b(q0_h, q0_h);
        q0_h = __lasx_xvpermi_d(q0_h, 0xd8);
        q0_org = __lasx_xvbitsel_v(q0_org, q0_h, negate_is_less_than_beta);

        __lasx_xvst(q0_org, data, 0);
    }
}

void ff_h264_h_lpf_chroma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
                                       int alpha_in, int beta_in)
{
    uint8_t *src = data - 2;
    ptrdiff_t img_width_2x = img_width << 1;
    ptrdiff_t img_width_4x = img_width << 2;
    ptrdiff_t img_width_3x = img_width_2x + img_width;
    __m256i p1_org, p0_org, q0_org, q1_org;
    __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
    __m256i is_less_than, is_less_than_beta, is_less_than_alpha;

    {
        __m256i row0, row1, row2, row3, row4, row5, row6, row7;

        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x, src,
                  img_width_3x, row0, row1, row2, row3);
        src += img_width_4x;
        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x, src,
                  img_width_3x, row4, row5, row6, row7);

        /* LASX_TRANSPOSE8x4_B */
        DUP4_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row6, row4, row7, row5,
                  p1_org, p0_org, q0_org, q1_org);
        row0 = __lasx_xvilvl_b(p0_org, p1_org);
        row1 = __lasx_xvilvl_b(q1_org, q0_org);
        row3 = __lasx_xvilvh_w(row1, row0);
        row2 = __lasx_xvilvl_w(row1, row0);
        p1_org = __lasx_xvpermi_d(row2, 0x00);
        p0_org = __lasx_xvpermi_d(row2, 0x55);
        q0_org = __lasx_xvpermi_d(row3, 0x00);
        q1_org = __lasx_xvpermi_d(row3, 0x55);
    }

    alpha = __lasx_xvreplgr2vr_b(alpha_in);
    beta = __lasx_xvreplgr2vr_b(beta_in);

    p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
    p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
    q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

    is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
    is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta);
    is_less_than = is_less_than_alpha & is_less_than_beta;
    is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta);
    is_less_than = is_less_than_beta & is_less_than;

    if (__lasx_xbnz_v(is_less_than)) {
        __m256i p0_h, q0_h, p1_org_h, p0_org_h, q0_org_h, q1_org_h;

        p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
        p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
        q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
        q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

        AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h);
        AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h);
        DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h, p0_h, q0_h);
        DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8, p0_h, q0_h);
        p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
        q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
    }
    p0_org = __lasx_xvilvl_b(q0_org, p0_org);
    src = data - 1;
    __lasx_xvstelm_h(p0_org, src, 0, 0);
    src += img_width;
    __lasx_xvstelm_h(p0_org, src, 0, 1);
    src += img_width;
    __lasx_xvstelm_h(p0_org, src, 0, 2);
    src += img_width;
    __lasx_xvstelm_h(p0_org, src, 0, 3);
    src += img_width;
    __lasx_xvstelm_h(p0_org, src, 0, 4);
    src += img_width;
    __lasx_xvstelm_h(p0_org, src, 0, 5);
    src += img_width;
    __lasx_xvstelm_h(p0_org, src, 0, 6);
    src += img_width;
    __lasx_xvstelm_h(p0_org, src, 0, 7);
}

void ff_h264_v_lpf_chroma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
                                       int alpha_in, int beta_in)
{
    ptrdiff_t img_width_2x = img_width << 1;
    __m256i p1_org, p0_org, q0_org, q1_org;
    __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
    __m256i is_less_than, is_less_than_beta, is_less_than_alpha;

    alpha = __lasx_xvreplgr2vr_b(alpha_in);
    beta = __lasx_xvreplgr2vr_b(beta_in);

    p1_org = __lasx_xvldx(data, -img_width_2x);
    p0_org = __lasx_xvldx(data, -img_width);
    DUP2_ARG2(__lasx_xvldx, data, 0, data, img_width, q0_org, q1_org);

    p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
    p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
    q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

    is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
    is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta);
    is_less_than = is_less_than_alpha & is_less_than_beta;
    is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta);
    is_less_than = is_less_than_beta & is_less_than;

    if (__lasx_xbnz_v(is_less_than)) {
        __m256i p0_h, q0_h, p1_org_h, p0_org_h, q0_org_h, q1_org_h;

        p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
        p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
        q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
        q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

        AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h);
        AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h);
        DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h, p0_h, q0_h);
        DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8, p0_h, q0_h);
        p0_h = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
        q0_h = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
        __lasx_xvstelm_d(p0_h, data - img_width, 0, 0);
        __lasx_xvstelm_d(q0_h, data, 0, 0);
    }
}

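/* Bi-weighted prediction: both inputs are converted to signed bytes by
 * XOR 0x80 so that a single __lasx_xvdp2add_h_b dot-product computes
 * src * weight_src + dst * weight_dst; the (weight_src + weight_dst) * 128
 * bias that this introduces is pre-added to the rounding offset, and
 * log2_denom is incremented for the (logWD + 1) shift of bipred weighting. */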
void ff_biweight_h264_pixels16_8_lasx(uint8_t *dst, uint8_t *src,
                                      ptrdiff_t stride, int height,
                                      int log2_denom, int weight_dst,
                                      int weight_src, int offset_in)
{
    __m256i wgt;
    __m256i src0, src1, src2, src3;
    __m256i dst0, dst1, dst2, dst3;
    __m256i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    __m256i denom, offset;
    int stride_2x = stride << 1;
    int stride_4x = stride << 2;
    int stride_3x = stride_2x + stride;

    offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
    offset_in += ((weight_src + weight_dst) << 7);
    log2_denom += 1;

    tmp0 = __lasx_xvreplgr2vr_b(weight_src);
    tmp1 = __lasx_xvreplgr2vr_b(weight_dst);
    wgt = __lasx_xvilvh_b(tmp1, tmp0);
    offset = __lasx_xvreplgr2vr_h(offset_in);
    denom = __lasx_xvreplgr2vr_h(log2_denom);

    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src += stride_4x;
    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp4, tmp5, tmp6, tmp7);
    src += stride_4x;
    DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5, tmp4,
              0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3);
    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
              dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
    dst += stride_4x;
    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
              dst, stride_3x, tmp4, tmp5, tmp6, tmp7);
    dst -= stride_4x;
    DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5, tmp4,
              0x20, tmp7, tmp6, 0x20, dst0, dst1, dst2, dst3);

    DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    DUP4_ARG2(__lasx_xvxori_b, dst0, 128, dst1, 128, dst2, 128, dst3, 128,
              dst0, dst1, dst2, dst3);
    DUP4_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, dst2, src2,
              dst3, src3, vec0, vec2, vec4, vec6);
    DUP4_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, dst2, src2,
              dst3, src3, vec1, vec3, vec5, vec7);

    DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
              offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3);
    DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec4, offset, wgt, vec5,
              offset, wgt, vec6, offset, wgt, vec7, tmp4, tmp5, tmp6, tmp7);

    tmp0 = __lasx_xvsra_h(tmp0, denom);
    tmp1 = __lasx_xvsra_h(tmp1, denom);
    tmp2 = __lasx_xvsra_h(tmp2, denom);
    tmp3 = __lasx_xvsra_h(tmp3, denom);
    tmp4 = __lasx_xvsra_h(tmp4, denom);
    tmp5 = __lasx_xvsra_h(tmp5, denom);
    tmp6 = __lasx_xvsra_h(tmp6, denom);
    tmp7 = __lasx_xvsra_h(tmp7, denom);

    DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3,
              tmp0, tmp1, tmp2, tmp3);
    DUP4_ARG1(__lasx_xvclip255_h, tmp4, tmp5, tmp6, tmp7,
              tmp4, tmp5, tmp6, tmp7);
    DUP4_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6,
              dst0, dst1, dst2, dst3);
    __lasx_xvstelm_d(dst0, dst, 0, 0);
    __lasx_xvstelm_d(dst0, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(dst0, dst, 0, 2);
    __lasx_xvstelm_d(dst0, dst, 8, 3);
    dst += stride;
    __lasx_xvstelm_d(dst1, dst, 0, 0);
    __lasx_xvstelm_d(dst1, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(dst1, dst, 0, 2);
    __lasx_xvstelm_d(dst1, dst, 8, 3);
    dst += stride;
    __lasx_xvstelm_d(dst2, dst, 0, 0);
    __lasx_xvstelm_d(dst2, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(dst2, dst, 0, 2);
    __lasx_xvstelm_d(dst2, dst, 8, 3);
    dst += stride;
    __lasx_xvstelm_d(dst3, dst, 0, 0);
    __lasx_xvstelm_d(dst3, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(dst3, dst, 0, 2);
    __lasx_xvstelm_d(dst3, dst, 8, 3);
    dst += stride;

    if (16 == height) {
        DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
                  src, stride_3x, tmp0, tmp1, tmp2, tmp3);
        src += stride_4x;
        DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
                  src, stride_3x, tmp4, tmp5, tmp6, tmp7);
        src += stride_4x;
        DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5,
                  tmp4, 0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3);
        DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
                  dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
        dst += stride_4x;
        DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
                  dst, stride_3x, tmp4, tmp5, tmp6, tmp7);
        dst -= stride_4x;
        DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5,
                  tmp4, 0x20, tmp7, tmp6, 0x20, dst0, dst1, dst2, dst3);

        DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        DUP4_ARG2(__lasx_xvxori_b, dst0, 128, dst1, 128, dst2, 128, dst3, 128,
                  dst0, dst1, dst2, dst3);
        DUP4_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, dst2, src2,
                  dst3, src3, vec0, vec2, vec4, vec6);
        DUP4_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, dst2, src2,
                  dst3, src3, vec1, vec3, vec5, vec7);

        DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
                  offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3);
        DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec4, offset, wgt, vec5,
                  offset, wgt, vec6, offset, wgt, vec7, tmp4, tmp5, tmp6, tmp7);

        tmp0 = __lasx_xvsra_h(tmp0, denom);
        tmp1 = __lasx_xvsra_h(tmp1, denom);
        tmp2 = __lasx_xvsra_h(tmp2, denom);
        tmp3 = __lasx_xvsra_h(tmp3, denom);
        tmp4 = __lasx_xvsra_h(tmp4, denom);
        tmp5 = __lasx_xvsra_h(tmp5, denom);
        tmp6 = __lasx_xvsra_h(tmp6, denom);
        tmp7 = __lasx_xvsra_h(tmp7, denom);

        DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3,
                  tmp0, tmp1, tmp2, tmp3);
        DUP4_ARG1(__lasx_xvclip255_h, tmp4, tmp5, tmp6, tmp7,
                  tmp4, tmp5, tmp6, tmp7);
        DUP4_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7,
                  tmp6, dst0, dst1, dst2, dst3);
        __lasx_xvstelm_d(dst0, dst, 0, 0);
        __lasx_xvstelm_d(dst0, dst, 8, 1);
        dst += stride;
        __lasx_xvstelm_d(dst0, dst, 0, 2);
        __lasx_xvstelm_d(dst0, dst, 8, 3);
        dst += stride;
        __lasx_xvstelm_d(dst1, dst, 0, 0);
        __lasx_xvstelm_d(dst1, dst, 8, 1);
        dst += stride;
        __lasx_xvstelm_d(dst1, dst, 0, 2);
        __lasx_xvstelm_d(dst1, dst, 8, 3);
        dst += stride;
        __lasx_xvstelm_d(dst2, dst, 0, 0);
        __lasx_xvstelm_d(dst2, dst, 8, 1);
        dst += stride;
        __lasx_xvstelm_d(dst2, dst, 0, 2);
        __lasx_xvstelm_d(dst2, dst, 8, 3);
        dst += stride;
        __lasx_xvstelm_d(dst3, dst, 0, 0);
        __lasx_xvstelm_d(dst3, dst, 8, 1);
        dst += stride;
        __lasx_xvstelm_d(dst3, dst, 0, 2);
        __lasx_xvstelm_d(dst3, dst, 8, 3);
    }
}

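/* The 8xN bi-weight helpers below pack four 8-byte rows into one 256-bit
 * register (__lasx_xvilvl_d pairs followed by __lasx_xvpermi_q), so a whole
 * 8x4 tile is weighted per vector pass. */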
static void avc_biwgt_8x4_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
                               int32_t log2_denom, int32_t weight_src,
                               int32_t weight_dst, int32_t offset_in)
{
    __m256i wgt, vec0, vec1;
    __m256i src0, dst0;
    __m256i tmp0, tmp1, tmp2, tmp3, denom, offset;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_3x = stride_2x + stride;

    offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
    offset_in += ((weight_src + weight_dst) << 7);
    log2_denom += 1;

    tmp0 = __lasx_xvreplgr2vr_b(weight_src);
    tmp1 = __lasx_xvreplgr2vr_b(weight_dst);
    wgt = __lasx_xvilvh_b(tmp1, tmp0);
    offset = __lasx_xvreplgr2vr_h(offset_in);
    denom = __lasx_xvreplgr2vr_h(log2_denom);

    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
              dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0);
    vec0 = __lasx_xvilvl_b(dst0, src0);
    vec1 = __lasx_xvilvh_b(dst0, src0);
    DUP2_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
              tmp0, tmp1);
    tmp0 = __lasx_xvsra_h(tmp0, denom);
    tmp1 = __lasx_xvsra_h(tmp1, denom);
    DUP2_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp0, tmp1);
    dst0 = __lasx_xvpickev_b(tmp1, tmp0);
    __lasx_xvstelm_d(dst0, dst, 0, 0);
    __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
    __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
    __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
}

static void avc_biwgt_8x8_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
                               int32_t log2_denom, int32_t weight_src,
                               int32_t weight_dst, int32_t offset_in)
{
    __m256i wgt, vec0, vec1, vec2, vec3;
    __m256i src0, src1, dst0, dst1;
    __m256i tmp0, tmp1, tmp2, tmp3, denom, offset;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_4x = stride << 2;
    ptrdiff_t stride_3x = stride_2x + stride;
    uint8_t* dst_tmp = dst;

    offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
    offset_in += ((weight_src + weight_dst) << 7);
    log2_denom += 1;

    tmp0 = __lasx_xvreplgr2vr_b(weight_src);
    tmp1 = __lasx_xvreplgr2vr_b(weight_dst);
    wgt = __lasx_xvilvh_b(tmp1, tmp0);
    offset = __lasx_xvreplgr2vr_h(offset_in);
    denom = __lasx_xvreplgr2vr_h(log2_denom);

    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src += stride_4x;
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    tmp0 = __lasx_xvld(dst_tmp, 0);
    DUP2_ARG2(__lasx_xvldx, dst_tmp, stride, dst_tmp, stride_2x, tmp1, tmp2);
    tmp3 = __lasx_xvldx(dst_tmp, stride_3x);
    dst_tmp += stride_4x;
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
              dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    dst1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);

    DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, dst0, 128, dst1, 128,
              src0, src1, dst0, dst1);
    DUP2_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, vec0, vec2);
    DUP2_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, vec1, vec3);
    DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
              offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3);
    tmp0 = __lasx_xvsra_h(tmp0, denom);
    tmp1 = __lasx_xvsra_h(tmp1, denom);
    tmp2 = __lasx_xvsra_h(tmp2, denom);
    tmp3 = __lasx_xvsra_h(tmp3, denom);
    DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, dst0, dst1);
    __lasx_xvstelm_d(dst0, dst, 0, 0);
    __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
    __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
    __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
    dst += stride_4x;
    __lasx_xvstelm_d(dst1, dst, 0, 0);
    __lasx_xvstelm_d(dst1, dst + stride, 0, 1);
    __lasx_xvstelm_d(dst1, dst + stride_2x, 0, 2);
    __lasx_xvstelm_d(dst1, dst + stride_3x, 0, 3);
}

static void avc_biwgt_8x16_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
                                int32_t log2_denom, int32_t weight_src,
                                int32_t weight_dst, int32_t offset_in)
{
    __m256i wgt, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    __m256i src0, src1, src2, src3, dst0, dst1, dst2, dst3;
    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_4x = stride << 2;
    ptrdiff_t stride_3x = stride_2x + stride;
    uint8_t* dst_tmp = dst;

    offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
    offset_in += ((weight_src + weight_dst) << 7);
    log2_denom += 1;

    tmp0 = __lasx_xvreplgr2vr_b(weight_src);
    tmp1 = __lasx_xvreplgr2vr_b(weight_dst);
    wgt = __lasx_xvilvh_b(tmp1, tmp0);
    offset = __lasx_xvreplgr2vr_h(offset_in);
    denom = __lasx_xvreplgr2vr_h(log2_denom);

    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src += stride_4x;
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src += stride_4x;
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src += stride_4x;
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src2 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src3 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);

    DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
              dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
    dst_tmp += stride_4x;
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
              dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
    dst_tmp += stride_4x;
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    dst1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
              dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
    dst_tmp += stride_4x;
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    dst2 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
              dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    dst3 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);

    DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    DUP4_ARG2(__lasx_xvxori_b, dst0, 128, dst1, 128, dst2, 128, dst3, 128,
              dst0, dst1, dst2, dst3);
    DUP4_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, dst2, src2,
              dst3, src3, vec0, vec2, vec4, vec6);
    DUP4_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, dst2, src2,
              dst3, src3, vec1, vec3, vec5, vec7);
    DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
              offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3);
    DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec4, offset, wgt, vec5,
              offset, wgt, vec6, offset, wgt, vec7, tmp4, tmp5, tmp6, tmp7);
    tmp0 = __lasx_xvsra_h(tmp0, denom);
    tmp1 = __lasx_xvsra_h(tmp1, denom);
    tmp2 = __lasx_xvsra_h(tmp2, denom);
    tmp3 = __lasx_xvsra_h(tmp3, denom);
    tmp4 = __lasx_xvsra_h(tmp4, denom);
    tmp5 = __lasx_xvsra_h(tmp5, denom);
    tmp6 = __lasx_xvsra_h(tmp6, denom);
    tmp7 = __lasx_xvsra_h(tmp7, denom);
    DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3,
              tmp0, tmp1, tmp2, tmp3);
    DUP4_ARG1(__lasx_xvclip255_h, tmp4, tmp5, tmp6, tmp7,
              tmp4, tmp5, tmp6, tmp7);
    DUP4_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6,
              dst0, dst1, dst2, dst3);
    __lasx_xvstelm_d(dst0, dst, 0, 0);
    __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
    __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
    __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
    dst += stride_4x;
    __lasx_xvstelm_d(dst1, dst, 0, 0);
    __lasx_xvstelm_d(dst1, dst + stride, 0, 1);
    __lasx_xvstelm_d(dst1, dst + stride_2x, 0, 2);
    __lasx_xvstelm_d(dst1, dst + stride_3x, 0, 3);
    dst += stride_4x;
    __lasx_xvstelm_d(dst2, dst, 0, 0);
    __lasx_xvstelm_d(dst2, dst + stride, 0, 1);
    __lasx_xvstelm_d(dst2, dst + stride_2x, 0, 2);
    __lasx_xvstelm_d(dst2, dst + stride_3x, 0, 3);
    dst += stride_4x;
    __lasx_xvstelm_d(dst3, dst, 0, 0);
    __lasx_xvstelm_d(dst3, dst + stride, 0, 1);
    __lasx_xvstelm_d(dst3, dst + stride_2x, 0, 2);
    __lasx_xvstelm_d(dst3, dst + stride_3x, 0, 3);
}

void ff_biweight_h264_pixels8_8_lasx(uint8_t *dst, uint8_t *src,
                                     ptrdiff_t stride, int height,
                                     int log2_denom, int weight_dst,
                                     int weight_src, int offset)
{
    if (4 == height) {
        avc_biwgt_8x4_lasx(src, dst, stride, log2_denom, weight_src, weight_dst,
                           offset);
    } else if (8 == height) {
        avc_biwgt_8x8_lasx(src, dst, stride, log2_denom, weight_src, weight_dst,
                           offset);
    } else {
        avc_biwgt_8x16_lasx(src, dst, stride, log2_denom, weight_src, weight_dst,
                            offset);
    }
}

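/* The 4xN bi-weight helpers below pack 4-byte rows with __lasx_xvilvl_w,
 * handling up to eight rows in a single weighted pass. */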
static void avc_biwgt_4x2_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
                               int32_t log2_denom, int32_t weight_src,
                               int32_t weight_dst, int32_t offset_in)
{
    __m256i wgt, vec0;
    __m256i src0, dst0;
    __m256i tmp0, tmp1, denom, offset;

    offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
    offset_in += ((weight_src + weight_dst) << 7);
    log2_denom += 1;

    tmp0 = __lasx_xvreplgr2vr_b(weight_src);
    tmp1 = __lasx_xvreplgr2vr_b(weight_dst);
    wgt = __lasx_xvilvh_b(tmp1, tmp0);
    offset = __lasx_xvreplgr2vr_h(offset_in);
    denom = __lasx_xvreplgr2vr_h(log2_denom);

    DUP2_ARG2(__lasx_xvldx, src, 0, src, stride, tmp0, tmp1);
    src0 = __lasx_xvilvl_w(tmp1, tmp0);
    DUP2_ARG2(__lasx_xvldx, dst, 0, dst, stride, tmp0, tmp1);
    dst0 = __lasx_xvilvl_w(tmp1, tmp0);
    DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0);
    vec0 = __lasx_xvilvl_b(dst0, src0);
    tmp0 = __lasx_xvdp2add_h_b(offset, wgt, vec0);
    tmp0 = __lasx_xvsra_h(tmp0, denom);
    tmp0 = __lasx_xvclip255_h(tmp0);
    tmp0 = __lasx_xvpickev_b(tmp0, tmp0);
    __lasx_xvstelm_w(tmp0, dst, 0, 0);
    __lasx_xvstelm_w(tmp0, dst + stride, 0, 1);
}

static void avc_biwgt_4x4_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
                               int32_t log2_denom, int32_t weight_src,
                               int32_t weight_dst, int32_t offset_in)
{
    __m256i wgt, vec0;
    __m256i src0, dst0;
    __m256i tmp0, tmp1, tmp2, tmp3, denom, offset;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_3x = stride_2x + stride;

    offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
    offset_in += ((weight_src + weight_dst) << 7);
    log2_denom += 1;

    tmp0 = __lasx_xvreplgr2vr_b(weight_src);
    tmp1 = __lasx_xvreplgr2vr_b(weight_dst);
    wgt = __lasx_xvilvh_b(tmp1, tmp0);
    offset = __lasx_xvreplgr2vr_h(offset_in);
    denom = __lasx_xvreplgr2vr_h(log2_denom);

    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
    src0 = __lasx_xvilvl_w(tmp1, tmp0);
    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
              dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
    dst0 = __lasx_xvilvl_w(tmp1, tmp0);
    DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0);
    vec0 = __lasx_xvilvl_b(dst0, src0);
    dst0 = __lasx_xvilvh_b(dst0, src0);
    vec0 = __lasx_xvpermi_q(vec0, dst0, 0x02);
    tmp0 = __lasx_xvdp2add_h_b(offset, wgt, vec0);
    tmp0 = __lasx_xvsra_h(tmp0, denom);
    tmp0 = __lasx_xvclip255_h(tmp0);
    tmp0 = __lasx_xvpickev_b(tmp0, tmp0);
    __lasx_xvstelm_w(tmp0, dst, 0, 0);
    __lasx_xvstelm_w(tmp0, dst + stride, 0, 1);
    __lasx_xvstelm_w(tmp0, dst + stride_2x, 0, 4);
    __lasx_xvstelm_w(tmp0, dst + stride_3x, 0, 5);
}

static void avc_biwgt_4x8_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
                               int32_t log2_denom, int32_t weight_src,
                               int32_t weight_dst, int32_t offset_in)
{
    __m256i wgt, vec0, vec1;
    __m256i src0, dst0;
    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_4x = stride << 2;
    ptrdiff_t stride_3x = stride_2x + stride;

    offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
    offset_in += ((weight_src + weight_dst) << 7);
    log2_denom += 1;

    tmp0 = __lasx_xvreplgr2vr_b(weight_src);
    tmp1 = __lasx_xvreplgr2vr_b(weight_dst);
    wgt = __lasx_xvilvh_b(tmp1, tmp0);
    offset = __lasx_xvreplgr2vr_h(offset_in);
    denom = __lasx_xvreplgr2vr_h(log2_denom);

    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src += stride_4x;
    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp4, tmp5, tmp6, tmp7);
    DUP4_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp6, tmp4, tmp7, tmp5,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_w, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
              dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
    dst += stride_4x;
    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
              dst, stride_3x, tmp4, tmp5, tmp6, tmp7);
    dst -= stride_4x;
    DUP4_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp6, tmp4, tmp7, tmp5,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_w, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0);
    vec0 = __lasx_xvilvl_b(dst0, src0);
    vec1 = __lasx_xvilvh_b(dst0, src0);
    DUP2_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
              tmp0, tmp1);
    tmp0 = __lasx_xvsra_h(tmp0, denom);
    tmp1 = __lasx_xvsra_h(tmp1, denom);
    DUP2_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp0, tmp1);
    tmp0 = __lasx_xvpickev_b(tmp1, tmp0);
    __lasx_xvstelm_w(tmp0, dst, 0, 0);
    __lasx_xvstelm_w(tmp0, dst + stride, 0, 1);
    __lasx_xvstelm_w(tmp0, dst + stride_2x, 0, 2);
    __lasx_xvstelm_w(tmp0, dst + stride_3x, 0, 3);
    dst += stride_4x;
    __lasx_xvstelm_w(tmp0, dst, 0, 4);
    __lasx_xvstelm_w(tmp0, dst + stride, 0, 5);
    __lasx_xvstelm_w(tmp0, dst + stride_2x, 0, 6);
    __lasx_xvstelm_w(tmp0, dst + stride_3x, 0, 7);
}

void ff_biweight_h264_pixels4_8_lasx(uint8_t *dst, uint8_t *src,
                                     ptrdiff_t stride, int height,
                                     int log2_denom, int weight_dst,
                                     int weight_src, int offset)
{
    if (2 == height) {
        avc_biwgt_4x2_lasx(src, dst, stride, log2_denom, weight_src,
                           weight_dst, offset);
    } else if (4 == height) {
        avc_biwgt_4x4_lasx(src, dst, stride, log2_denom, weight_src,
                           weight_dst, offset);
    } else {
        avc_biwgt_4x8_lasx(src, dst, stride, log2_denom, weight_src,
                           weight_dst, offset);
    }
}

void ff_weight_h264_pixels16_8_lasx(uint8_t *src, ptrdiff_t stride,
                                    int height, int log2_denom,
                                    int weight_src, int offset_in)
{
    uint32_t offset_val;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_4x = stride << 2;
    ptrdiff_t stride_3x = stride_2x + stride;
    __m256i zero = __lasx_xvldi(0);
    __m256i src0, src1, src2, src3;
    __m256i src0_l, src1_l, src2_l, src3_l, src0_h, src1_h, src2_h, src3_h;
    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    __m256i wgt, denom, offset;

    offset_val = (unsigned) offset_in << log2_denom;

    wgt = __lasx_xvreplgr2vr_h(weight_src);
    offset = __lasx_xvreplgr2vr_h(offset_val);
    denom = __lasx_xvreplgr2vr_h(log2_denom);

    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src += stride_4x;
    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp4, tmp5, tmp6, tmp7);
    src -= stride_4x;
    DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20,
              tmp5, tmp4, 0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3);
    DUP4_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, zero, src2,
              zero, src3, src0_l, src1_l, src2_l, src3_l);
    DUP4_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, zero, src2,
              zero, src3, src0_h, src1_h, src2_h, src3_h);
    src0_l = __lasx_xvmul_h(wgt, src0_l);
    src0_h = __lasx_xvmul_h(wgt, src0_h);
    src1_l = __lasx_xvmul_h(wgt, src1_l);
    src1_h = __lasx_xvmul_h(wgt, src1_h);
    src2_l = __lasx_xvmul_h(wgt, src2_l);
    src2_h = __lasx_xvmul_h(wgt, src2_h);
    src3_l = __lasx_xvmul_h(wgt, src3_l);
    src3_h = __lasx_xvmul_h(wgt, src3_h);
    DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l, offset,
              src1_h, offset, src0_l, src0_h, src1_l, src1_h);
    DUP4_ARG2(__lasx_xvsadd_h, src2_l, offset, src2_h, offset, src3_l, offset,
              src3_h, offset, src2_l, src2_h, src3_l, src3_h);
    src0_l = __lasx_xvmaxi_h(src0_l, 0);
    src0_h = __lasx_xvmaxi_h(src0_h, 0);
    src1_l = __lasx_xvmaxi_h(src1_l, 0);
    src1_h = __lasx_xvmaxi_h(src1_h, 0);
    src2_l = __lasx_xvmaxi_h(src2_l, 0);
    src2_h = __lasx_xvmaxi_h(src2_h, 0);
    src3_l = __lasx_xvmaxi_h(src3_l, 0);
    src3_h = __lasx_xvmaxi_h(src3_h, 0);
    src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
    src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
    src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom);
    src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom);
    src2_l = __lasx_xvssrlrn_bu_h(src2_l, denom);
    src2_h = __lasx_xvssrlrn_bu_h(src2_h, denom);
    src3_l = __lasx_xvssrlrn_bu_h(src3_l, denom);
    src3_h = __lasx_xvssrlrn_bu_h(src3_h, denom);
    __lasx_xvstelm_d(src0_l, src, 0, 0);
    __lasx_xvstelm_d(src0_h, src, 8, 0);
    src += stride;
    __lasx_xvstelm_d(src0_l, src, 0, 2);
    __lasx_xvstelm_d(src0_h, src, 8, 2);
    src += stride;
    __lasx_xvstelm_d(src1_l, src, 0, 0);
    __lasx_xvstelm_d(src1_h, src, 8, 0);
    src += stride;
    __lasx_xvstelm_d(src1_l, src, 0, 2);
    __lasx_xvstelm_d(src1_h, src, 8, 2);
    src += stride;
    __lasx_xvstelm_d(src2_l, src, 0, 0);
    __lasx_xvstelm_d(src2_h, src, 8, 0);
    src += stride;
    __lasx_xvstelm_d(src2_l, src, 0, 2);
    __lasx_xvstelm_d(src2_h, src, 8, 2);
    src += stride;
    __lasx_xvstelm_d(src3_l, src, 0, 0);
    __lasx_xvstelm_d(src3_h, src, 8, 0);
    src += stride;
    __lasx_xvstelm_d(src3_l, src, 0, 2);
    __lasx_xvstelm_d(src3_h, src, 8, 2);
    src += stride;

    if (16 == height) {
        DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
                  src, stride_3x, tmp0, tmp1, tmp2, tmp3);
        src += stride_4x;
        DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
                  src, stride_3x, tmp4, tmp5, tmp6, tmp7);
        src -= stride_4x;
        DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5,
                  tmp4, 0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3);
        DUP4_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, zero, src2,
                  zero, src3, src0_l, src1_l, src2_l, src3_l);
        DUP4_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, zero, src2,
                  zero, src3, src0_h, src1_h, src2_h, src3_h);
        src0_l = __lasx_xvmul_h(wgt, src0_l);
        src0_h = __lasx_xvmul_h(wgt, src0_h);
        src1_l = __lasx_xvmul_h(wgt, src1_l);
        src1_h = __lasx_xvmul_h(wgt, src1_h);
        src2_l = __lasx_xvmul_h(wgt, src2_l);
        src2_h = __lasx_xvmul_h(wgt, src2_h);
        src3_l = __lasx_xvmul_h(wgt, src3_l);
        src3_h = __lasx_xvmul_h(wgt, src3_h);
        DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l,
                  offset, src1_h, offset, src0_l, src0_h, src1_l, src1_h);
        DUP4_ARG2(__lasx_xvsadd_h, src2_l, offset, src2_h, offset, src3_l,
                  offset, src3_h, offset, src2_l, src2_h, src3_l, src3_h);
        src0_l = __lasx_xvmaxi_h(src0_l, 0);
        src0_h = __lasx_xvmaxi_h(src0_h, 0);
        src1_l = __lasx_xvmaxi_h(src1_l, 0);
        src1_h = __lasx_xvmaxi_h(src1_h, 0);
        src2_l = __lasx_xvmaxi_h(src2_l, 0);
        src2_h = __lasx_xvmaxi_h(src2_h, 0);
        src3_l = __lasx_xvmaxi_h(src3_l, 0);
        src3_h = __lasx_xvmaxi_h(src3_h, 0);
        src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
        src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
        src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom);
        src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom);
        src2_l = __lasx_xvssrlrn_bu_h(src2_l, denom);
        src2_h = __lasx_xvssrlrn_bu_h(src2_h, denom);
        src3_l = __lasx_xvssrlrn_bu_h(src3_l, denom);
        src3_h = __lasx_xvssrlrn_bu_h(src3_h, denom);
        __lasx_xvstelm_d(src0_l, src, 0, 0);
        __lasx_xvstelm_d(src0_h, src, 8, 0);
        src += stride;
        __lasx_xvstelm_d(src0_l, src, 0, 2);
        __lasx_xvstelm_d(src0_h, src, 8, 2);
        src += stride;
        __lasx_xvstelm_d(src1_l, src, 0, 0);
        __lasx_xvstelm_d(src1_h, src, 8, 0);
        src += stride;
        __lasx_xvstelm_d(src1_l, src, 0, 2);
        __lasx_xvstelm_d(src1_h, src, 8, 2);
        src += stride;
        __lasx_xvstelm_d(src2_l, src, 0, 0);
        __lasx_xvstelm_d(src2_h, src, 8, 0);
        src += stride;
        __lasx_xvstelm_d(src2_l, src, 0, 2);
        __lasx_xvstelm_d(src2_h, src, 8, 2);
        src += stride;
        __lasx_xvstelm_d(src3_l, src, 0, 0);
        __lasx_xvstelm_d(src3_h, src, 8, 0);
        src += stride;
        __lasx_xvstelm_d(src3_l, src, 0, 2);
        __lasx_xvstelm_d(src3_h, src, 8, 2);
    }
}

static void avc_wgt_8x4_lasx(uint8_t *src, ptrdiff_t stride,
                             int32_t log2_denom, int32_t weight_src,
                             int32_t offset_in)
{
    uint32_t offset_val;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_3x = stride_2x + stride;
    __m256i wgt, zero = __lasx_xvldi(0);
    __m256i src0, src0_h, src0_l;
    __m256i tmp0, tmp1, tmp2, tmp3, denom, offset;

    offset_val = (unsigned) offset_in << log2_denom;

    wgt = __lasx_xvreplgr2vr_h(weight_src);
    offset = __lasx_xvreplgr2vr_h(offset_val);
    denom = __lasx_xvreplgr2vr_h(log2_denom);

    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    src0_l = __lasx_xvilvl_b(zero, src0);
    src0_h = __lasx_xvilvh_b(zero, src0);
    src0_l = __lasx_xvmul_h(wgt, src0_l);
    src0_h = __lasx_xvmul_h(wgt, src0_h);
    src0_l = __lasx_xvsadd_h(src0_l, offset);
    src0_h = __lasx_xvsadd_h(src0_h, offset);
    src0_l = __lasx_xvmaxi_h(src0_l, 0);
    src0_h = __lasx_xvmaxi_h(src0_h, 0);
    src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
    src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);

    src0 = __lasx_xvpickev_d(src0_h, src0_l);
    __lasx_xvstelm_d(src0, src, 0, 0);
    __lasx_xvstelm_d(src0, src + stride, 0, 1);
    __lasx_xvstelm_d(src0, src + stride_2x, 0, 2);
    __lasx_xvstelm_d(src0, src + stride_3x, 0, 3);
}

static void avc_wgt_8x8_lasx(uint8_t *src, ptrdiff_t stride, int32_t log2_denom,
                             int32_t src_weight, int32_t offset_in)
{
    __m256i src0, src1, src0_h, src0_l, src1_h, src1_l, zero = __lasx_xvldi(0);
    __m256i tmp0, tmp1, tmp2, tmp3, denom, offset, wgt;
    uint32_t offset_val;
    uint8_t *src_tmp = src;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_4x = stride << 2;
    ptrdiff_t stride_3x = stride_2x + stride;

    offset_val = (unsigned) offset_in << log2_denom;

    wgt = __lasx_xvreplgr2vr_h(src_weight);
    offset = __lasx_xvreplgr2vr_h(offset_val);
    denom = __lasx_xvreplgr2vr_h(log2_denom);

    DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
              src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src_tmp += stride_4x;
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
              src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP2_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, src0_l, src1_l);
    DUP2_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, src0_h, src1_h);
    src0_l = __lasx_xvmul_h(wgt, src0_l);
    src0_h = __lasx_xvmul_h(wgt, src0_h);
    src1_l = __lasx_xvmul_h(wgt, src1_l);
    src1_h = __lasx_xvmul_h(wgt, src1_h);
    DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l, offset,
              src1_h, offset, src0_l, src0_h, src1_l, src1_h);
    src0_l = __lasx_xvmaxi_h(src0_l, 0);
    src0_h = __lasx_xvmaxi_h(src0_h, 0);
    src1_l = __lasx_xvmaxi_h(src1_l, 0);
    src1_h = __lasx_xvmaxi_h(src1_h, 0);
    src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
    src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
    src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom);
    src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom);

    DUP2_ARG2(__lasx_xvpickev_d, src0_h, src0_l, src1_h, src1_l, src0, src1);
    __lasx_xvstelm_d(src0, src, 0, 0);
    __lasx_xvstelm_d(src0, src + stride, 0, 1);
    __lasx_xvstelm_d(src0, src + stride_2x, 0, 2);
    __lasx_xvstelm_d(src0, src + stride_3x, 0, 3);
    src += stride_4x;
    __lasx_xvstelm_d(src1, src, 0, 0);
    __lasx_xvstelm_d(src1, src + stride, 0, 1);
    __lasx_xvstelm_d(src1, src + stride_2x, 0, 2);
    __lasx_xvstelm_d(src1, src + stride_3x, 0, 3);
}

static void avc_wgt_8x16_lasx(uint8_t *src, ptrdiff_t stride,
                              int32_t log2_denom, int32_t src_weight,
                              int32_t offset_in)
{
    __m256i src0, src1, src2, src3;
    __m256i src0_h, src0_l, src1_h, src1_l, src2_h, src2_l, src3_h, src3_l;
    __m256i tmp0, tmp1, tmp2, tmp3, denom, offset, wgt;
    __m256i zero = __lasx_xvldi(0);
    uint32_t offset_val;
    uint8_t *src_tmp = src;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_4x = stride << 2;
    ptrdiff_t stride_3x = stride_2x + stride;

    offset_val = (unsigned) offset_in << log2_denom;

    wgt = __lasx_xvreplgr2vr_h(src_weight);
    offset = __lasx_xvreplgr2vr_h(offset_val);
    denom = __lasx_xvreplgr2vr_h(log2_denom);

    DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
              src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src_tmp += stride_4x;
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
              src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src_tmp += stride_4x;
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
              src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src_tmp += stride_4x;
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src2 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
              src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src3 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);

    DUP4_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, zero, src2, zero, src3,
              src0_l, src1_l, src2_l, src3_l);
    DUP4_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, zero, src2, zero, src3,
              src0_h, src1_h, src2_h, src3_h);
    src0_l = __lasx_xvmul_h(wgt, src0_l);
    src0_h = __lasx_xvmul_h(wgt, src0_h);
    src1_l = __lasx_xvmul_h(wgt, src1_l);
    src1_h = __lasx_xvmul_h(wgt, src1_h);
    src2_l = __lasx_xvmul_h(wgt, src2_l);
    src2_h = __lasx_xvmul_h(wgt, src2_h);
    src3_l = __lasx_xvmul_h(wgt, src3_l);
    src3_h = __lasx_xvmul_h(wgt, src3_h);

    DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l, offset,
              src1_h, offset, src0_l, src0_h, src1_l, src1_h);
    DUP4_ARG2(__lasx_xvsadd_h, src2_l, offset, src2_h, offset, src3_l, offset,
              src3_h, offset, src2_l, src2_h, src3_l, src3_h);

    src0_l = __lasx_xvmaxi_h(src0_l, 0);
    src0_h = __lasx_xvmaxi_h(src0_h, 0);
    src1_l = __lasx_xvmaxi_h(src1_l, 0);
    src1_h = __lasx_xvmaxi_h(src1_h, 0);
    src2_l = __lasx_xvmaxi_h(src2_l, 0);
    src2_h = __lasx_xvmaxi_h(src2_h, 0);
    src3_l = __lasx_xvmaxi_h(src3_l, 0);
    src3_h = __lasx_xvmaxi_h(src3_h, 0);
    src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
    src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
    src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom);
    src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom);
    src2_l = __lasx_xvssrlrn_bu_h(src2_l, denom);
    src2_h = __lasx_xvssrlrn_bu_h(src2_h, denom);
    src3_l = __lasx_xvssrlrn_bu_h(src3_l, denom);
    src3_h = __lasx_xvssrlrn_bu_h(src3_h, denom);
    DUP4_ARG2(__lasx_xvpickev_d, src0_h, src0_l, src1_h, src1_l, src2_h,
              src2_l, src3_h, src3_l, src0, src1, src2, src3);

    __lasx_xvstelm_d(src0, src, 0, 0);
    __lasx_xvstelm_d(src0, src + stride, 0, 1);
    __lasx_xvstelm_d(src0, src + stride_2x, 0, 2);
    __lasx_xvstelm_d(src0, src + stride_3x, 0, 3);
    src += stride_4x;
    __lasx_xvstelm_d(src1, src, 0, 0);
    __lasx_xvstelm_d(src1, src + stride, 0, 1);
    __lasx_xvstelm_d(src1, src + stride_2x, 0, 2);
    __lasx_xvstelm_d(src1, src + stride_3x, 0, 3);
    src += stride_4x;
    __lasx_xvstelm_d(src2, src, 0, 0);
    __lasx_xvstelm_d(src2, src + stride, 0, 1);
    __lasx_xvstelm_d(src2, src + stride_2x, 0, 2);
    __lasx_xvstelm_d(src2, src + stride_3x, 0, 3);
    src += stride_4x;
    __lasx_xvstelm_d(src3, src, 0, 0);
    __lasx_xvstelm_d(src3, src + stride, 0, 1);
    __lasx_xvstelm_d(src3, src + stride_2x, 0, 2);
    __lasx_xvstelm_d(src3, src + stride_3x, 0, 3);
}

void ff_weight_h264_pixels8_8_lasx(uint8_t *src, ptrdiff_t stride,
                                   int height, int log2_denom,
                                   int weight_src, int offset)
{
    if (4 == height) {
        avc_wgt_8x4_lasx(src, stride, log2_denom, weight_src, offset);
    } else if (8 == height) {
        avc_wgt_8x8_lasx(src, stride, log2_denom, weight_src, offset);
    } else {
        avc_wgt_8x16_lasx(src, stride, log2_denom, weight_src, offset);
    }
}
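
/* The 4-pixel-wide helpers below apply the same unidirectional H.264
 * weighting as the 8- and 16-wide paths above: per pixel, roughly
 *     out = clip_to_u8(((src * weight_src + round) >> log2_denom) + offset)
 * with round = 1 << (log2_denom - 1), realised as a multiply, a saturating
 * add of offset << log2_denom, a clamp at zero and a rounded
 * shift-and-narrow (__lasx_xvssrlrn_bu_h).
 */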
static void avc_wgt_4x2_lasx(uint8_t *src, ptrdiff_t stride,
                             int32_t log2_denom, int32_t weight_src,
                             int32_t offset_in)
{
    uint32_t offset_val;
    __m256i wgt, zero = __lasx_xvldi(0);
    __m256i src0, tmp0, tmp1, denom, offset;

    offset_val = (unsigned) offset_in << log2_denom;

    wgt = __lasx_xvreplgr2vr_h(weight_src);
    offset = __lasx_xvreplgr2vr_h(offset_val);
    denom = __lasx_xvreplgr2vr_h(log2_denom);

    DUP2_ARG2(__lasx_xvldx, src, 0, src, stride, tmp0, tmp1);
    src0 = __lasx_xvilvl_w(tmp1, tmp0);
    src0 = __lasx_xvilvl_b(zero, src0);
    src0 = __lasx_xvmul_h(wgt, src0);
    src0 = __lasx_xvsadd_h(src0, offset);
    src0 = __lasx_xvmaxi_h(src0, 0);
    src0 = __lasx_xvssrlrn_bu_h(src0, denom);
    __lasx_xvstelm_w(src0, src, 0, 0);
    __lasx_xvstelm_w(src0, src + stride, 0, 1);
}

static void avc_wgt_4x4_lasx(uint8_t *src, ptrdiff_t stride,
                             int32_t log2_denom, int32_t weight_src,
                             int32_t offset_in)
{
    __m256i wgt;
    __m256i src0, tmp0, tmp1, tmp2, tmp3, denom, offset;
    uint32_t offset_val;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_3x = stride_2x + stride;

    offset_val = (unsigned) offset_in << log2_denom;

    wgt = __lasx_xvreplgr2vr_h(weight_src);
    offset = __lasx_xvreplgr2vr_h(offset_val);
    denom = __lasx_xvreplgr2vr_h(log2_denom);

    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
    src0 = __lasx_xvilvl_w(tmp1, tmp0);
    src0 = __lasx_vext2xv_hu_bu(src0);
    src0 = __lasx_xvmul_h(wgt, src0);
    src0 = __lasx_xvsadd_h(src0, offset);
    src0 = __lasx_xvmaxi_h(src0, 0);
    src0 = __lasx_xvssrlrn_bu_h(src0, denom);
    __lasx_xvstelm_w(src0, src, 0, 0);
    __lasx_xvstelm_w(src0, src + stride, 0, 1);
    __lasx_xvstelm_w(src0, src + stride_2x, 0, 4);
    __lasx_xvstelm_w(src0, src + stride_3x, 0, 5);
}

static void avc_wgt_4x8_lasx(uint8_t *src, ptrdiff_t stride,
                             int32_t log2_denom, int32_t weight_src,
                             int32_t offset_in)
{
    __m256i src0, src0_h, src0_l;
    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset;
    __m256i wgt, zero = __lasx_xvldi(0);
    uint32_t offset_val;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_4x = stride << 2;
    ptrdiff_t stride_3x = stride_2x + stride;

    offset_val = (unsigned) offset_in << log2_denom;

    wgt = __lasx_xvreplgr2vr_h(weight_src);
    offset = __lasx_xvreplgr2vr_h(offset_val);
    denom = __lasx_xvreplgr2vr_h(log2_denom);

    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src += stride_4x;
    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp4, tmp5, tmp6, tmp7);
    src -= stride_4x;
    DUP4_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp6, tmp4, tmp7,
              tmp5, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_w, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    src0_l = __lasx_xvilvl_b(zero, src0);
    src0_h = __lasx_xvilvh_b(zero, src0);
    src0_l = __lasx_xvmul_h(wgt, src0_l);
    src0_h = __lasx_xvmul_h(wgt, src0_h);
    src0_l = __lasx_xvsadd_h(src0_l, offset);
    src0_h = __lasx_xvsadd_h(src0_h, offset);
    src0_l = __lasx_xvmaxi_h(src0_l, 0);
    src0_h = __lasx_xvmaxi_h(src0_h, 0);
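    /* __lasx_xvssrlrn_bu_h() does a rounded logical right shift by log2_denom
     * and saturates to unsigned bytes; after the earlier interleaves, rows
     * 0-1 and 4-5 sit in src0_l while rows 2-3 and 6-7 sit in src0_h, which
     * the element stores below write back in row order.
     */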
    src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
    src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
    __lasx_xvstelm_w(src0_l, src, 0, 0);
    __lasx_xvstelm_w(src0_l, src + stride, 0, 1);
    __lasx_xvstelm_w(src0_h, src + stride_2x, 0, 0);
    __lasx_xvstelm_w(src0_h, src + stride_3x, 0, 1);
    src += stride_4x;
    __lasx_xvstelm_w(src0_l, src, 0, 4);
    __lasx_xvstelm_w(src0_l, src + stride, 0, 5);
    __lasx_xvstelm_w(src0_h, src + stride_2x, 0, 4);
    __lasx_xvstelm_w(src0_h, src + stride_3x, 0, 5);
}

void ff_weight_h264_pixels4_8_lasx(uint8_t *src, ptrdiff_t stride,
                                   int height, int log2_denom,
                                   int weight_src, int offset)
{
    if (2 == height) {
        avc_wgt_4x2_lasx(src, stride, log2_denom, weight_src, offset);
    } else if (4 == height) {
        avc_wgt_4x4_lasx(src, stride, log2_denom, weight_src, offset);
    } else {
        avc_wgt_4x8_lasx(src, stride, log2_denom, weight_src, offset);
    }
}

void ff_h264_add_pixels4_8_lasx(uint8_t *_dst, int16_t *_src, int stride)
{
    __m256i src0, dst0, dst1, dst2, dst3, zero;
    __m256i tmp0, tmp1;
    uint8_t *_dst1 = _dst + stride;
    uint8_t *_dst2 = _dst1 + stride;
    uint8_t *_dst3 = _dst2 + stride;

    src0 = __lasx_xvld(_src, 0);
    dst0 = __lasx_xvldrepl_w(_dst, 0);
    dst1 = __lasx_xvldrepl_w(_dst1, 0);
    dst2 = __lasx_xvldrepl_w(_dst2, 0);
    dst3 = __lasx_xvldrepl_w(_dst3, 0);
    tmp0 = __lasx_xvilvl_w(dst1, dst0);
    tmp1 = __lasx_xvilvl_w(dst3, dst2);
    dst0 = __lasx_xvilvl_d(tmp1, tmp0);
    tmp0 = __lasx_vext2xv_hu_bu(dst0);
    zero = __lasx_xvldi(0);
    tmp1 = __lasx_xvadd_h(src0, tmp0);
    dst0 = __lasx_xvpickev_b(tmp1, tmp1);
    __lasx_xvstelm_w(dst0, _dst, 0, 0);
    __lasx_xvstelm_w(dst0, _dst1, 0, 1);
    __lasx_xvstelm_w(dst0, _dst2, 0, 4);
    __lasx_xvstelm_w(dst0, _dst3, 0, 5);
    __lasx_xvst(zero, _src, 0);
}

void ff_h264_add_pixels8_8_lasx(uint8_t *_dst, int16_t *_src, int stride)
{
    __m256i src0, src1, src2, src3;
    __m256i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    __m256i tmp0, tmp1, tmp2, tmp3;
    __m256i zero = __lasx_xvldi(0);
    uint8_t *_dst1 = _dst + stride;
    uint8_t *_dst2 = _dst1 + stride;
    uint8_t *_dst3 = _dst2 + stride;
    uint8_t *_dst4 = _dst3 + stride;
    uint8_t *_dst5 = _dst4 + stride;
    uint8_t *_dst6 = _dst5 + stride;
    uint8_t *_dst7 = _dst6 + stride;

    src0 = __lasx_xvld(_src, 0);
    src1 = __lasx_xvld(_src, 32);
    src2 = __lasx_xvld(_src, 64);
    src3 = __lasx_xvld(_src, 96);
    dst0 = __lasx_xvldrepl_d(_dst, 0);
    dst1 = __lasx_xvldrepl_d(_dst1, 0);
    dst2 = __lasx_xvldrepl_d(_dst2, 0);
    dst3 = __lasx_xvldrepl_d(_dst3, 0);
    dst4 = __lasx_xvldrepl_d(_dst4, 0);
    dst5 = __lasx_xvldrepl_d(_dst5, 0);
    dst6 = __lasx_xvldrepl_d(_dst6, 0);
    dst7 = __lasx_xvldrepl_d(_dst7, 0);
    tmp0 = __lasx_xvilvl_d(dst1, dst0);
    tmp1 = __lasx_xvilvl_d(dst3, dst2);
    tmp2 = __lasx_xvilvl_d(dst5, dst4);
    tmp3 = __lasx_xvilvl_d(dst7, dst6);
    dst0 = __lasx_vext2xv_hu_bu(tmp0);
    dst1 = __lasx_vext2xv_hu_bu(tmp1);
    dst2 = __lasx_vext2xv_hu_bu(tmp2);
    dst3 = __lasx_vext2xv_hu_bu(tmp3);
    tmp0 = __lasx_xvadd_h(src0, dst0);
    tmp1 = __lasx_xvadd_h(src1, dst1);
    tmp2 = __lasx_xvadd_h(src2, dst2);
    tmp3 = __lasx_xvadd_h(src3, dst3);
    dst1 = __lasx_xvpickev_b(tmp1, tmp0);
    dst2 = __lasx_xvpickev_b(tmp3, tmp2);
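    /* As in ff_h264_add_pixels4_8_lasx() above, the residual buffer is
     * cleared once it has been consumed, then the eight reconstructed rows
     * are written back one 8-byte element at a time.
     */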
    __lasx_xvst(zero, _src, 0);
    __lasx_xvst(zero, _src, 32);
    __lasx_xvst(zero, _src, 64);
    __lasx_xvst(zero, _src, 96);
    __lasx_xvstelm_d(dst1, _dst, 0, 0);
    __lasx_xvstelm_d(dst1, _dst1, 0, 2);
    __lasx_xvstelm_d(dst1, _dst2, 0, 1);
    __lasx_xvstelm_d(dst1, _dst3, 0, 3);
    __lasx_xvstelm_d(dst2, _dst4, 0, 0);
    __lasx_xvstelm_d(dst2, _dst5, 0, 2);
    __lasx_xvstelm_d(dst2, _dst6, 0, 1);
    __lasx_xvstelm_d(dst2, _dst7, 0, 3);
}