1cabdff1aSopenharmony_ci/* 2cabdff1aSopenharmony_ci * Loongson LASX optimized h264dsp 3cabdff1aSopenharmony_ci * 4cabdff1aSopenharmony_ci * Copyright (c) 2021 Loongson Technology Corporation Limited 5cabdff1aSopenharmony_ci * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn> 6cabdff1aSopenharmony_ci * Xiwei Gu <guxiwei-hf@loongson.cn> 7cabdff1aSopenharmony_ci * 8cabdff1aSopenharmony_ci * This file is part of FFmpeg. 9cabdff1aSopenharmony_ci * 10cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 11cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 12cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 13cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 14cabdff1aSopenharmony_ci * 15cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 16cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 17cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 18cabdff1aSopenharmony_ci * Lesser General Public License for more details. 19cabdff1aSopenharmony_ci * 20cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 21cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 22cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 23cabdff1aSopenharmony_ci */ 24cabdff1aSopenharmony_ci 25cabdff1aSopenharmony_ci#include "libavutil/loongarch/loongson_intrinsics.h" 26cabdff1aSopenharmony_ci#include "h264dsp_lasx.h" 27cabdff1aSopenharmony_ci 28cabdff1aSopenharmony_ci#define AVC_LPF_P1_OR_Q1(p0_or_q0_org_in, q0_or_p0_org_in, \ 29cabdff1aSopenharmony_ci p1_or_q1_org_in, p2_or_q2_org_in, \ 30cabdff1aSopenharmony_ci neg_tc_in, tc_in, p1_or_q1_out) \ 31cabdff1aSopenharmony_ci{ \ 32cabdff1aSopenharmony_ci __m256i clip3, temp; \ 33cabdff1aSopenharmony_ci \ 34cabdff1aSopenharmony_ci clip3 = __lasx_xvavgr_hu(p0_or_q0_org_in, \ 35cabdff1aSopenharmony_ci q0_or_p0_org_in); \ 36cabdff1aSopenharmony_ci temp = __lasx_xvslli_h(p1_or_q1_org_in, 1); \ 37cabdff1aSopenharmony_ci clip3 = __lasx_xvsub_h(clip3, temp); \ 38cabdff1aSopenharmony_ci clip3 = __lasx_xvavg_h(p2_or_q2_org_in, clip3); \ 39cabdff1aSopenharmony_ci clip3 = __lasx_xvclip_h(clip3, neg_tc_in, tc_in); \ 40cabdff1aSopenharmony_ci p1_or_q1_out = __lasx_xvadd_h(p1_or_q1_org_in, clip3); \ 41cabdff1aSopenharmony_ci} 42cabdff1aSopenharmony_ci 43cabdff1aSopenharmony_ci#define AVC_LPF_P0Q0(q0_or_p0_org_in, p0_or_q0_org_in, \ 44cabdff1aSopenharmony_ci p1_or_q1_org_in, q1_or_p1_org_in, \ 45cabdff1aSopenharmony_ci neg_threshold_in, threshold_in, \ 46cabdff1aSopenharmony_ci p0_or_q0_out, q0_or_p0_out) \ 47cabdff1aSopenharmony_ci{ \ 48cabdff1aSopenharmony_ci __m256i q0_sub_p0, p1_sub_q1, delta; \ 49cabdff1aSopenharmony_ci \ 50cabdff1aSopenharmony_ci q0_sub_p0 = __lasx_xvsub_h(q0_or_p0_org_in, \ 51cabdff1aSopenharmony_ci p0_or_q0_org_in); \ 52cabdff1aSopenharmony_ci p1_sub_q1 = __lasx_xvsub_h(p1_or_q1_org_in, \ 53cabdff1aSopenharmony_ci q1_or_p1_org_in); \ 54cabdff1aSopenharmony_ci q0_sub_p0 = __lasx_xvslli_h(q0_sub_p0, 2); \ 55cabdff1aSopenharmony_ci p1_sub_q1 = __lasx_xvaddi_hu(p1_sub_q1, 4); \ 56cabdff1aSopenharmony_ci delta = __lasx_xvadd_h(q0_sub_p0, p1_sub_q1); \ 57cabdff1aSopenharmony_ci delta = __lasx_xvsrai_h(delta, 3); \ 58cabdff1aSopenharmony_ci delta = __lasx_xvclip_h(delta, neg_threshold_in, \ 59cabdff1aSopenharmony_ci threshold_in); \ 60cabdff1aSopenharmony_ci p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_org_in, delta); \ 61cabdff1aSopenharmony_ci q0_or_p0_out = __lasx_xvsub_h(q0_or_p0_org_in, delta); \ 62cabdff1aSopenharmony_ci \ 63cabdff1aSopenharmony_ci p0_or_q0_out = __lasx_xvclip255_h(p0_or_q0_out); \ 64cabdff1aSopenharmony_ci q0_or_p0_out = __lasx_xvclip255_h(q0_or_p0_out); \ 65cabdff1aSopenharmony_ci} 66cabdff1aSopenharmony_ci 67cabdff1aSopenharmony_civoid ff_h264_h_lpf_luma_8_lasx(uint8_t *data, ptrdiff_t img_width, 68cabdff1aSopenharmony_ci int alpha_in, int beta_in, int8_t *tc) 69cabdff1aSopenharmony_ci{ 70cabdff1aSopenharmony_ci ptrdiff_t img_width_2x = img_width << 1; 71cabdff1aSopenharmony_ci ptrdiff_t img_width_4x = img_width << 2; 72cabdff1aSopenharmony_ci ptrdiff_t img_width_8x = img_width << 3; 73cabdff1aSopenharmony_ci ptrdiff_t img_width_3x = img_width_2x + img_width; 74cabdff1aSopenharmony_ci __m256i tmp_vec0, bs_vec; 75cabdff1aSopenharmony_ci __m256i tc_vec = {0x0101010100000000, 0x0303030302020202, 76cabdff1aSopenharmony_ci 0x0101010100000000, 0x0303030302020202}; 77cabdff1aSopenharmony_ci 78cabdff1aSopenharmony_ci tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0); 79cabdff1aSopenharmony_ci tc_vec = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec); 80cabdff1aSopenharmony_ci bs_vec = __lasx_xvslti_b(tc_vec, 0); 81cabdff1aSopenharmony_ci bs_vec = __lasx_xvxori_b(bs_vec, 255); 82cabdff1aSopenharmony_ci bs_vec = __lasx_xvandi_b(bs_vec, 1); 83cabdff1aSopenharmony_ci 84cabdff1aSopenharmony_ci if (__lasx_xbnz_v(bs_vec)) { 85cabdff1aSopenharmony_ci uint8_t *src = data - 4; 86cabdff1aSopenharmony_ci __m256i p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org; 87cabdff1aSopenharmony_ci __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta; 88cabdff1aSopenharmony_ci __m256i is_less_than, is_less_than_beta, is_less_than_alpha; 89cabdff1aSopenharmony_ci __m256i is_bs_greater_than0; 90cabdff1aSopenharmony_ci __m256i zero = __lasx_xvldi(0); 91cabdff1aSopenharmony_ci 92cabdff1aSopenharmony_ci is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec); 93cabdff1aSopenharmony_ci 94cabdff1aSopenharmony_ci { 95cabdff1aSopenharmony_ci uint8_t *src_tmp = src + img_width_8x; 96cabdff1aSopenharmony_ci __m256i row0, row1, row2, row3, row4, row5, row6, row7; 97cabdff1aSopenharmony_ci __m256i row8, row9, row10, row11, row12, row13, row14, row15; 98cabdff1aSopenharmony_ci 99cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x, 100cabdff1aSopenharmony_ci src, img_width_3x, row0, row1, row2, row3); 101cabdff1aSopenharmony_ci src += img_width_4x; 102cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x, 103cabdff1aSopenharmony_ci src, img_width_3x, row4, row5, row6, row7); 104cabdff1aSopenharmony_ci src -= img_width_4x; 105cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, img_width, src_tmp, 106cabdff1aSopenharmony_ci img_width_2x, src_tmp, img_width_3x, 107cabdff1aSopenharmony_ci row8, row9, row10, row11); 108cabdff1aSopenharmony_ci src_tmp += img_width_4x; 109cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, img_width, src_tmp, 110cabdff1aSopenharmony_ci img_width_2x, src_tmp, img_width_3x, 111cabdff1aSopenharmony_ci row12, row13, row14, row15); 112cabdff1aSopenharmony_ci src_tmp -= img_width_4x; 113cabdff1aSopenharmony_ci 114cabdff1aSopenharmony_ci LASX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, 115cabdff1aSopenharmony_ci row7, row8, row9, row10, row11, 116cabdff1aSopenharmony_ci row12, row13, row14, row15, 117cabdff1aSopenharmony_ci p3_org, p2_org, p1_org, p0_org, 118cabdff1aSopenharmony_ci q0_org, q1_org, q2_org, q3_org); 119cabdff1aSopenharmony_ci } 120cabdff1aSopenharmony_ci 121cabdff1aSopenharmony_ci p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org); 122cabdff1aSopenharmony_ci p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org); 123cabdff1aSopenharmony_ci q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org); 124cabdff1aSopenharmony_ci 125cabdff1aSopenharmony_ci alpha = __lasx_xvreplgr2vr_b(alpha_in); 126cabdff1aSopenharmony_ci beta = __lasx_xvreplgr2vr_b(beta_in); 127cabdff1aSopenharmony_ci 128cabdff1aSopenharmony_ci is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha); 129cabdff1aSopenharmony_ci is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta); 130cabdff1aSopenharmony_ci is_less_than = is_less_than_alpha & is_less_than_beta; 131cabdff1aSopenharmony_ci is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta); 132cabdff1aSopenharmony_ci is_less_than = is_less_than_beta & is_less_than; 133cabdff1aSopenharmony_ci is_less_than = is_less_than & is_bs_greater_than0; 134cabdff1aSopenharmony_ci 135cabdff1aSopenharmony_ci if (__lasx_xbnz_v(is_less_than)) { 136cabdff1aSopenharmony_ci __m256i neg_tc_h, tc_h, p1_org_h, p0_org_h, q0_org_h, q1_org_h; 137cabdff1aSopenharmony_ci __m256i p2_asub_p0, q2_asub_q0; 138cabdff1aSopenharmony_ci 139cabdff1aSopenharmony_ci neg_tc_h = __lasx_xvneg_b(tc_vec); 140cabdff1aSopenharmony_ci neg_tc_h = __lasx_vext2xv_h_b(neg_tc_h); 141cabdff1aSopenharmony_ci tc_h = __lasx_vext2xv_hu_bu(tc_vec); 142cabdff1aSopenharmony_ci p1_org_h = __lasx_vext2xv_hu_bu(p1_org); 143cabdff1aSopenharmony_ci p0_org_h = __lasx_vext2xv_hu_bu(p0_org); 144cabdff1aSopenharmony_ci q0_org_h = __lasx_vext2xv_hu_bu(q0_org); 145cabdff1aSopenharmony_ci 146cabdff1aSopenharmony_ci p2_asub_p0 = __lasx_xvabsd_bu(p2_org, p0_org); 147cabdff1aSopenharmony_ci is_less_than_beta = __lasx_xvslt_bu(p2_asub_p0, beta); 148cabdff1aSopenharmony_ci is_less_than_beta = is_less_than_beta & is_less_than; 149cabdff1aSopenharmony_ci 150cabdff1aSopenharmony_ci if (__lasx_xbnz_v(is_less_than_beta)) { 151cabdff1aSopenharmony_ci __m256i p2_org_h, p1_h; 152cabdff1aSopenharmony_ci 153cabdff1aSopenharmony_ci p2_org_h = __lasx_vext2xv_hu_bu(p2_org); 154cabdff1aSopenharmony_ci AVC_LPF_P1_OR_Q1(p0_org_h, q0_org_h, p1_org_h, p2_org_h, 155cabdff1aSopenharmony_ci neg_tc_h, tc_h, p1_h); 156cabdff1aSopenharmony_ci p1_h = __lasx_xvpickev_b(p1_h, p1_h); 157cabdff1aSopenharmony_ci p1_h = __lasx_xvpermi_d(p1_h, 0xd8); 158cabdff1aSopenharmony_ci p1_org = __lasx_xvbitsel_v(p1_org, p1_h, is_less_than_beta); 159cabdff1aSopenharmony_ci is_less_than_beta = __lasx_xvandi_b(is_less_than_beta, 1); 160cabdff1aSopenharmony_ci tc_vec = __lasx_xvadd_b(tc_vec, is_less_than_beta); 161cabdff1aSopenharmony_ci } 162cabdff1aSopenharmony_ci 163cabdff1aSopenharmony_ci q2_asub_q0 = __lasx_xvabsd_bu(q2_org, q0_org); 164cabdff1aSopenharmony_ci is_less_than_beta = __lasx_xvslt_bu(q2_asub_q0, beta); 165cabdff1aSopenharmony_ci is_less_than_beta = is_less_than_beta & is_less_than; 166cabdff1aSopenharmony_ci 167cabdff1aSopenharmony_ci q1_org_h = __lasx_vext2xv_hu_bu(q1_org); 168cabdff1aSopenharmony_ci 169cabdff1aSopenharmony_ci if (__lasx_xbnz_v(is_less_than_beta)) { 170cabdff1aSopenharmony_ci __m256i q2_org_h, q1_h; 171cabdff1aSopenharmony_ci 172cabdff1aSopenharmony_ci q2_org_h = __lasx_vext2xv_hu_bu(q2_org); 173cabdff1aSopenharmony_ci AVC_LPF_P1_OR_Q1(p0_org_h, q0_org_h, q1_org_h, q2_org_h, 174cabdff1aSopenharmony_ci neg_tc_h, tc_h, q1_h); 175cabdff1aSopenharmony_ci q1_h = __lasx_xvpickev_b(q1_h, q1_h); 176cabdff1aSopenharmony_ci q1_h = __lasx_xvpermi_d(q1_h, 0xd8); 177cabdff1aSopenharmony_ci q1_org = __lasx_xvbitsel_v(q1_org, q1_h, is_less_than_beta); 178cabdff1aSopenharmony_ci 179cabdff1aSopenharmony_ci is_less_than_beta = __lasx_xvandi_b(is_less_than_beta, 1); 180cabdff1aSopenharmony_ci tc_vec = __lasx_xvadd_b(tc_vec, is_less_than_beta); 181cabdff1aSopenharmony_ci } 182cabdff1aSopenharmony_ci 183cabdff1aSopenharmony_ci { 184cabdff1aSopenharmony_ci __m256i neg_thresh_h, p0_h, q0_h; 185cabdff1aSopenharmony_ci 186cabdff1aSopenharmony_ci neg_thresh_h = __lasx_xvneg_b(tc_vec); 187cabdff1aSopenharmony_ci neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h); 188cabdff1aSopenharmony_ci tc_h = __lasx_vext2xv_hu_bu(tc_vec); 189cabdff1aSopenharmony_ci 190cabdff1aSopenharmony_ci AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h, 191cabdff1aSopenharmony_ci neg_thresh_h, tc_h, p0_h, q0_h); 192cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h, 193cabdff1aSopenharmony_ci p0_h, q0_h); 194cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8, 195cabdff1aSopenharmony_ci p0_h, q0_h); 196cabdff1aSopenharmony_ci p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than); 197cabdff1aSopenharmony_ci q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than); 198cabdff1aSopenharmony_ci } 199cabdff1aSopenharmony_ci 200cabdff1aSopenharmony_ci { 201cabdff1aSopenharmony_ci __m256i row0, row1, row2, row3, row4, row5, row6, row7; 202cabdff1aSopenharmony_ci __m256i control = {0x0000000400000000, 0x0000000500000001, 203cabdff1aSopenharmony_ci 0x0000000600000002, 0x0000000700000003}; 204cabdff1aSopenharmony_ci 205cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, p0_org, q3_org, 0x02, p1_org, 206cabdff1aSopenharmony_ci q2_org, 0x02, p2_org, q1_org, 0x02, p3_org, 207cabdff1aSopenharmony_ci q0_org, 0x02, p0_org, p1_org, p2_org, p3_org); 208cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_b, p1_org, p3_org, p0_org, p2_org, 209cabdff1aSopenharmony_ci row0, row2); 210cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvh_b, p1_org, p3_org, p0_org, p2_org, 211cabdff1aSopenharmony_ci row1, row3); 212cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row4, row6); 213cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvh_b, row2, row0, row3, row1, row5, row7); 214cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvperm_w, row4, control, row5, control, row6, 215cabdff1aSopenharmony_ci control, row7, control, row4, row5, row6, row7); 216cabdff1aSopenharmony_ci __lasx_xvstelm_d(row4, src, 0, 0); 217cabdff1aSopenharmony_ci __lasx_xvstelm_d(row4, src + img_width, 0, 1); 218cabdff1aSopenharmony_ci src += img_width_2x; 219cabdff1aSopenharmony_ci __lasx_xvstelm_d(row4, src, 0, 2); 220cabdff1aSopenharmony_ci __lasx_xvstelm_d(row4, src + img_width, 0, 3); 221cabdff1aSopenharmony_ci src += img_width_2x; 222cabdff1aSopenharmony_ci __lasx_xvstelm_d(row5, src, 0, 0); 223cabdff1aSopenharmony_ci __lasx_xvstelm_d(row5, src + img_width, 0, 1); 224cabdff1aSopenharmony_ci src += img_width_2x; 225cabdff1aSopenharmony_ci __lasx_xvstelm_d(row5, src, 0, 2); 226cabdff1aSopenharmony_ci __lasx_xvstelm_d(row5, src + img_width, 0, 3); 227cabdff1aSopenharmony_ci src += img_width_2x; 228cabdff1aSopenharmony_ci __lasx_xvstelm_d(row6, src, 0, 0); 229cabdff1aSopenharmony_ci __lasx_xvstelm_d(row6, src + img_width, 0, 1); 230cabdff1aSopenharmony_ci src += img_width_2x; 231cabdff1aSopenharmony_ci __lasx_xvstelm_d(row6, src, 0, 2); 232cabdff1aSopenharmony_ci __lasx_xvstelm_d(row6, src + img_width, 0, 3); 233cabdff1aSopenharmony_ci src += img_width_2x; 234cabdff1aSopenharmony_ci __lasx_xvstelm_d(row7, src, 0, 0); 235cabdff1aSopenharmony_ci __lasx_xvstelm_d(row7, src + img_width, 0, 1); 236cabdff1aSopenharmony_ci src += img_width_2x; 237cabdff1aSopenharmony_ci __lasx_xvstelm_d(row7, src, 0, 2); 238cabdff1aSopenharmony_ci __lasx_xvstelm_d(row7, src + img_width, 0, 3); 239cabdff1aSopenharmony_ci } 240cabdff1aSopenharmony_ci } 241cabdff1aSopenharmony_ci } 242cabdff1aSopenharmony_ci} 243cabdff1aSopenharmony_ci 244cabdff1aSopenharmony_civoid ff_h264_v_lpf_luma_8_lasx(uint8_t *data, ptrdiff_t img_width, 245cabdff1aSopenharmony_ci int alpha_in, int beta_in, int8_t *tc) 246cabdff1aSopenharmony_ci{ 247cabdff1aSopenharmony_ci ptrdiff_t img_width_2x = img_width << 1; 248cabdff1aSopenharmony_ci ptrdiff_t img_width_3x = img_width + img_width_2x; 249cabdff1aSopenharmony_ci __m256i tmp_vec0, bs_vec; 250cabdff1aSopenharmony_ci __m256i tc_vec = {0x0101010100000000, 0x0303030302020202, 251cabdff1aSopenharmony_ci 0x0101010100000000, 0x0303030302020202}; 252cabdff1aSopenharmony_ci 253cabdff1aSopenharmony_ci tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0); 254cabdff1aSopenharmony_ci tc_vec = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec); 255cabdff1aSopenharmony_ci bs_vec = __lasx_xvslti_b(tc_vec, 0); 256cabdff1aSopenharmony_ci bs_vec = __lasx_xvxori_b(bs_vec, 255); 257cabdff1aSopenharmony_ci bs_vec = __lasx_xvandi_b(bs_vec, 1); 258cabdff1aSopenharmony_ci 259cabdff1aSopenharmony_ci if (__lasx_xbnz_v(bs_vec)) { 260cabdff1aSopenharmony_ci __m256i p2_org, p1_org, p0_org, q0_org, q1_org, q2_org; 261cabdff1aSopenharmony_ci __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta; 262cabdff1aSopenharmony_ci __m256i is_less_than, is_less_than_beta, is_less_than_alpha; 263cabdff1aSopenharmony_ci __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h; 264cabdff1aSopenharmony_ci __m256i is_bs_greater_than0; 265cabdff1aSopenharmony_ci __m256i zero = __lasx_xvldi(0); 266cabdff1aSopenharmony_ci 267cabdff1aSopenharmony_ci alpha = __lasx_xvreplgr2vr_b(alpha_in); 268cabdff1aSopenharmony_ci beta = __lasx_xvreplgr2vr_b(beta_in); 269cabdff1aSopenharmony_ci 270cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, data, -img_width_3x, data, -img_width_2x, 271cabdff1aSopenharmony_ci p2_org, p1_org); 272cabdff1aSopenharmony_ci p0_org = __lasx_xvldx(data, -img_width); 273cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, data, 0, data, img_width, q0_org, q1_org); 274cabdff1aSopenharmony_ci 275cabdff1aSopenharmony_ci is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec); 276cabdff1aSopenharmony_ci p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org); 277cabdff1aSopenharmony_ci p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org); 278cabdff1aSopenharmony_ci q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org); 279cabdff1aSopenharmony_ci 280cabdff1aSopenharmony_ci is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha); 281cabdff1aSopenharmony_ci is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta); 282cabdff1aSopenharmony_ci is_less_than = is_less_than_alpha & is_less_than_beta; 283cabdff1aSopenharmony_ci is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta); 284cabdff1aSopenharmony_ci is_less_than = is_less_than_beta & is_less_than; 285cabdff1aSopenharmony_ci is_less_than = is_less_than & is_bs_greater_than0; 286cabdff1aSopenharmony_ci 287cabdff1aSopenharmony_ci if (__lasx_xbnz_v(is_less_than)) { 288cabdff1aSopenharmony_ci __m256i neg_tc_h, tc_h, p2_asub_p0, q2_asub_q0; 289cabdff1aSopenharmony_ci 290cabdff1aSopenharmony_ci q2_org = __lasx_xvldx(data, img_width_2x); 291cabdff1aSopenharmony_ci 292cabdff1aSopenharmony_ci neg_tc_h = __lasx_xvneg_b(tc_vec); 293cabdff1aSopenharmony_ci neg_tc_h = __lasx_vext2xv_h_b(neg_tc_h); 294cabdff1aSopenharmony_ci tc_h = __lasx_vext2xv_hu_bu(tc_vec); 295cabdff1aSopenharmony_ci p1_org_h = __lasx_vext2xv_hu_bu(p1_org); 296cabdff1aSopenharmony_ci p0_org_h = __lasx_vext2xv_hu_bu(p0_org); 297cabdff1aSopenharmony_ci q0_org_h = __lasx_vext2xv_hu_bu(q0_org); 298cabdff1aSopenharmony_ci 299cabdff1aSopenharmony_ci p2_asub_p0 = __lasx_xvabsd_bu(p2_org, p0_org); 300cabdff1aSopenharmony_ci is_less_than_beta = __lasx_xvslt_bu(p2_asub_p0, beta); 301cabdff1aSopenharmony_ci is_less_than_beta = is_less_than_beta & is_less_than; 302cabdff1aSopenharmony_ci 303cabdff1aSopenharmony_ci if (__lasx_xbnz_v(is_less_than_beta)) { 304cabdff1aSopenharmony_ci __m256i p1_h, p2_org_h; 305cabdff1aSopenharmony_ci 306cabdff1aSopenharmony_ci p2_org_h = __lasx_vext2xv_hu_bu(p2_org); 307cabdff1aSopenharmony_ci AVC_LPF_P1_OR_Q1(p0_org_h, q0_org_h, p1_org_h, p2_org_h, 308cabdff1aSopenharmony_ci neg_tc_h, tc_h, p1_h); 309cabdff1aSopenharmony_ci p1_h = __lasx_xvpickev_b(p1_h, p1_h); 310cabdff1aSopenharmony_ci p1_h = __lasx_xvpermi_d(p1_h, 0xd8); 311cabdff1aSopenharmony_ci p1_h = __lasx_xvbitsel_v(p1_org, p1_h, is_less_than_beta); 312cabdff1aSopenharmony_ci p1_org = __lasx_xvpermi_q(p1_org, p1_h, 0x30); 313cabdff1aSopenharmony_ci __lasx_xvst(p1_org, data - img_width_2x, 0); 314cabdff1aSopenharmony_ci 315cabdff1aSopenharmony_ci is_less_than_beta = __lasx_xvandi_b(is_less_than_beta, 1); 316cabdff1aSopenharmony_ci tc_vec = __lasx_xvadd_b(tc_vec, is_less_than_beta); 317cabdff1aSopenharmony_ci } 318cabdff1aSopenharmony_ci 319cabdff1aSopenharmony_ci q2_asub_q0 = __lasx_xvabsd_bu(q2_org, q0_org); 320cabdff1aSopenharmony_ci is_less_than_beta = __lasx_xvslt_bu(q2_asub_q0, beta); 321cabdff1aSopenharmony_ci is_less_than_beta = is_less_than_beta & is_less_than; 322cabdff1aSopenharmony_ci 323cabdff1aSopenharmony_ci q1_org_h = __lasx_vext2xv_hu_bu(q1_org); 324cabdff1aSopenharmony_ci 325cabdff1aSopenharmony_ci if (__lasx_xbnz_v(is_less_than_beta)) { 326cabdff1aSopenharmony_ci __m256i q1_h, q2_org_h; 327cabdff1aSopenharmony_ci 328cabdff1aSopenharmony_ci q2_org_h = __lasx_vext2xv_hu_bu(q2_org); 329cabdff1aSopenharmony_ci AVC_LPF_P1_OR_Q1(p0_org_h, q0_org_h, q1_org_h, q2_org_h, 330cabdff1aSopenharmony_ci neg_tc_h, tc_h, q1_h); 331cabdff1aSopenharmony_ci q1_h = __lasx_xvpickev_b(q1_h, q1_h); 332cabdff1aSopenharmony_ci q1_h = __lasx_xvpermi_d(q1_h, 0xd8); 333cabdff1aSopenharmony_ci q1_h = __lasx_xvbitsel_v(q1_org, q1_h, is_less_than_beta); 334cabdff1aSopenharmony_ci q1_org = __lasx_xvpermi_q(q1_org, q1_h, 0x30); 335cabdff1aSopenharmony_ci __lasx_xvst(q1_org, data + img_width, 0); 336cabdff1aSopenharmony_ci 337cabdff1aSopenharmony_ci is_less_than_beta = __lasx_xvandi_b(is_less_than_beta, 1); 338cabdff1aSopenharmony_ci tc_vec = __lasx_xvadd_b(tc_vec, is_less_than_beta); 339cabdff1aSopenharmony_ci 340cabdff1aSopenharmony_ci } 341cabdff1aSopenharmony_ci 342cabdff1aSopenharmony_ci { 343cabdff1aSopenharmony_ci __m256i neg_thresh_h, p0_h, q0_h; 344cabdff1aSopenharmony_ci 345cabdff1aSopenharmony_ci neg_thresh_h = __lasx_xvneg_b(tc_vec); 346cabdff1aSopenharmony_ci neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h); 347cabdff1aSopenharmony_ci tc_h = __lasx_vext2xv_hu_bu(tc_vec); 348cabdff1aSopenharmony_ci 349cabdff1aSopenharmony_ci AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h, 350cabdff1aSopenharmony_ci neg_thresh_h, tc_h, p0_h, q0_h); 351cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h, 352cabdff1aSopenharmony_ci p0_h, q0_h); 353cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0Xd8, q0_h, 0xd8, 354cabdff1aSopenharmony_ci p0_h, q0_h); 355cabdff1aSopenharmony_ci p0_h = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than); 356cabdff1aSopenharmony_ci q0_h = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than); 357cabdff1aSopenharmony_ci p0_org = __lasx_xvpermi_q(p0_org, p0_h, 0x30); 358cabdff1aSopenharmony_ci q0_org = __lasx_xvpermi_q(q0_org, q0_h, 0x30); 359cabdff1aSopenharmony_ci __lasx_xvst(p0_org, data - img_width, 0); 360cabdff1aSopenharmony_ci __lasx_xvst(q0_org, data, 0); 361cabdff1aSopenharmony_ci } 362cabdff1aSopenharmony_ci } 363cabdff1aSopenharmony_ci } 364cabdff1aSopenharmony_ci} 365cabdff1aSopenharmony_ci 366cabdff1aSopenharmony_civoid ff_h264_h_lpf_chroma_8_lasx(uint8_t *data, ptrdiff_t img_width, 367cabdff1aSopenharmony_ci int alpha_in, int beta_in, int8_t *tc) 368cabdff1aSopenharmony_ci{ 369cabdff1aSopenharmony_ci __m256i tmp_vec0, bs_vec; 370cabdff1aSopenharmony_ci __m256i tc_vec = {0x0303020201010000, 0x0303020201010000, 0x0, 0x0}; 371cabdff1aSopenharmony_ci __m256i zero = __lasx_xvldi(0); 372cabdff1aSopenharmony_ci ptrdiff_t img_width_2x = img_width << 1; 373cabdff1aSopenharmony_ci ptrdiff_t img_width_4x = img_width << 2; 374cabdff1aSopenharmony_ci ptrdiff_t img_width_3x = img_width_2x + img_width; 375cabdff1aSopenharmony_ci 376cabdff1aSopenharmony_ci tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0); 377cabdff1aSopenharmony_ci tc_vec = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec); 378cabdff1aSopenharmony_ci bs_vec = __lasx_xvslti_b(tc_vec, 0); 379cabdff1aSopenharmony_ci bs_vec = __lasx_xvxori_b(bs_vec, 255); 380cabdff1aSopenharmony_ci bs_vec = __lasx_xvandi_b(bs_vec, 1); 381cabdff1aSopenharmony_ci bs_vec = __lasx_xvpermi_q(zero, bs_vec, 0x30); 382cabdff1aSopenharmony_ci 383cabdff1aSopenharmony_ci if (__lasx_xbnz_v(bs_vec)) { 384cabdff1aSopenharmony_ci uint8_t *src = data - 2; 385cabdff1aSopenharmony_ci __m256i p1_org, p0_org, q0_org, q1_org; 386cabdff1aSopenharmony_ci __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta; 387cabdff1aSopenharmony_ci __m256i is_less_than, is_less_than_beta, is_less_than_alpha; 388cabdff1aSopenharmony_ci __m256i is_bs_greater_than0; 389cabdff1aSopenharmony_ci 390cabdff1aSopenharmony_ci is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec); 391cabdff1aSopenharmony_ci 392cabdff1aSopenharmony_ci { 393cabdff1aSopenharmony_ci __m256i row0, row1, row2, row3, row4, row5, row6, row7; 394cabdff1aSopenharmony_ci 395cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x, 396cabdff1aSopenharmony_ci src, img_width_3x, row0, row1, row2, row3); 397cabdff1aSopenharmony_ci src += img_width_4x; 398cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x, 399cabdff1aSopenharmony_ci src, img_width_3x, row4, row5, row6, row7); 400cabdff1aSopenharmony_ci src -= img_width_4x; 401cabdff1aSopenharmony_ci /* LASX_TRANSPOSE8x4_B */ 402cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row6, row4, 403cabdff1aSopenharmony_ci row7, row5, p1_org, p0_org, q0_org, q1_org); 404cabdff1aSopenharmony_ci row0 = __lasx_xvilvl_b(p0_org, p1_org); 405cabdff1aSopenharmony_ci row1 = __lasx_xvilvl_b(q1_org, q0_org); 406cabdff1aSopenharmony_ci row3 = __lasx_xvilvh_w(row1, row0); 407cabdff1aSopenharmony_ci row2 = __lasx_xvilvl_w(row1, row0); 408cabdff1aSopenharmony_ci p1_org = __lasx_xvpermi_d(row2, 0x00); 409cabdff1aSopenharmony_ci p0_org = __lasx_xvpermi_d(row2, 0x55); 410cabdff1aSopenharmony_ci q0_org = __lasx_xvpermi_d(row3, 0x00); 411cabdff1aSopenharmony_ci q1_org = __lasx_xvpermi_d(row3, 0x55); 412cabdff1aSopenharmony_ci } 413cabdff1aSopenharmony_ci 414cabdff1aSopenharmony_ci p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org); 415cabdff1aSopenharmony_ci p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org); 416cabdff1aSopenharmony_ci q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org); 417cabdff1aSopenharmony_ci 418cabdff1aSopenharmony_ci alpha = __lasx_xvreplgr2vr_b(alpha_in); 419cabdff1aSopenharmony_ci beta = __lasx_xvreplgr2vr_b(beta_in); 420cabdff1aSopenharmony_ci 421cabdff1aSopenharmony_ci is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha); 422cabdff1aSopenharmony_ci is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta); 423cabdff1aSopenharmony_ci is_less_than = is_less_than_alpha & is_less_than_beta; 424cabdff1aSopenharmony_ci is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta); 425cabdff1aSopenharmony_ci is_less_than = is_less_than_beta & is_less_than; 426cabdff1aSopenharmony_ci is_less_than = is_less_than & is_bs_greater_than0; 427cabdff1aSopenharmony_ci 428cabdff1aSopenharmony_ci if (__lasx_xbnz_v(is_less_than)) { 429cabdff1aSopenharmony_ci __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h; 430cabdff1aSopenharmony_ci 431cabdff1aSopenharmony_ci p1_org_h = __lasx_vext2xv_hu_bu(p1_org); 432cabdff1aSopenharmony_ci p0_org_h = __lasx_vext2xv_hu_bu(p0_org); 433cabdff1aSopenharmony_ci q0_org_h = __lasx_vext2xv_hu_bu(q0_org); 434cabdff1aSopenharmony_ci q1_org_h = __lasx_vext2xv_hu_bu(q1_org); 435cabdff1aSopenharmony_ci 436cabdff1aSopenharmony_ci { 437cabdff1aSopenharmony_ci __m256i tc_h, neg_thresh_h, p0_h, q0_h; 438cabdff1aSopenharmony_ci 439cabdff1aSopenharmony_ci neg_thresh_h = __lasx_xvneg_b(tc_vec); 440cabdff1aSopenharmony_ci neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h); 441cabdff1aSopenharmony_ci tc_h = __lasx_vext2xv_hu_bu(tc_vec); 442cabdff1aSopenharmony_ci 443cabdff1aSopenharmony_ci AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h, 444cabdff1aSopenharmony_ci neg_thresh_h, tc_h, p0_h, q0_h); 445cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h, 446cabdff1aSopenharmony_ci p0_h, q0_h); 447cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8, 448cabdff1aSopenharmony_ci p0_h, q0_h); 449cabdff1aSopenharmony_ci p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than); 450cabdff1aSopenharmony_ci q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than); 451cabdff1aSopenharmony_ci } 452cabdff1aSopenharmony_ci 453cabdff1aSopenharmony_ci p0_org = __lasx_xvilvl_b(q0_org, p0_org); 454cabdff1aSopenharmony_ci src = data - 1; 455cabdff1aSopenharmony_ci __lasx_xvstelm_h(p0_org, src, 0, 0); 456cabdff1aSopenharmony_ci src += img_width; 457cabdff1aSopenharmony_ci __lasx_xvstelm_h(p0_org, src, 0, 1); 458cabdff1aSopenharmony_ci src += img_width; 459cabdff1aSopenharmony_ci __lasx_xvstelm_h(p0_org, src, 0, 2); 460cabdff1aSopenharmony_ci src += img_width; 461cabdff1aSopenharmony_ci __lasx_xvstelm_h(p0_org, src, 0, 3); 462cabdff1aSopenharmony_ci src += img_width; 463cabdff1aSopenharmony_ci __lasx_xvstelm_h(p0_org, src, 0, 4); 464cabdff1aSopenharmony_ci src += img_width; 465cabdff1aSopenharmony_ci __lasx_xvstelm_h(p0_org, src, 0, 5); 466cabdff1aSopenharmony_ci src += img_width; 467cabdff1aSopenharmony_ci __lasx_xvstelm_h(p0_org, src, 0, 6); 468cabdff1aSopenharmony_ci src += img_width; 469cabdff1aSopenharmony_ci __lasx_xvstelm_h(p0_org, src, 0, 7); 470cabdff1aSopenharmony_ci } 471cabdff1aSopenharmony_ci } 472cabdff1aSopenharmony_ci} 473cabdff1aSopenharmony_ci 474cabdff1aSopenharmony_civoid ff_h264_v_lpf_chroma_8_lasx(uint8_t *data, ptrdiff_t img_width, 475cabdff1aSopenharmony_ci int alpha_in, int beta_in, int8_t *tc) 476cabdff1aSopenharmony_ci{ 477cabdff1aSopenharmony_ci int img_width_2x = img_width << 1; 478cabdff1aSopenharmony_ci __m256i tmp_vec0, bs_vec; 479cabdff1aSopenharmony_ci __m256i tc_vec = {0x0303020201010000, 0x0303020201010000, 0x0, 0x0}; 480cabdff1aSopenharmony_ci __m256i zero = __lasx_xvldi(0); 481cabdff1aSopenharmony_ci 482cabdff1aSopenharmony_ci tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0); 483cabdff1aSopenharmony_ci tc_vec = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec); 484cabdff1aSopenharmony_ci bs_vec = __lasx_xvslti_b(tc_vec, 0); 485cabdff1aSopenharmony_ci bs_vec = __lasx_xvxori_b(bs_vec, 255); 486cabdff1aSopenharmony_ci bs_vec = __lasx_xvandi_b(bs_vec, 1); 487cabdff1aSopenharmony_ci bs_vec = __lasx_xvpermi_q(zero, bs_vec, 0x30); 488cabdff1aSopenharmony_ci 489cabdff1aSopenharmony_ci if (__lasx_xbnz_v(bs_vec)) { 490cabdff1aSopenharmony_ci __m256i p1_org, p0_org, q0_org, q1_org; 491cabdff1aSopenharmony_ci __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta; 492cabdff1aSopenharmony_ci __m256i is_less_than, is_less_than_beta, is_less_than_alpha; 493cabdff1aSopenharmony_ci __m256i is_bs_greater_than0; 494cabdff1aSopenharmony_ci 495cabdff1aSopenharmony_ci alpha = __lasx_xvreplgr2vr_b(alpha_in); 496cabdff1aSopenharmony_ci beta = __lasx_xvreplgr2vr_b(beta_in); 497cabdff1aSopenharmony_ci 498cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, data, -img_width_2x, data, -img_width, 499cabdff1aSopenharmony_ci p1_org, p0_org); 500cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, data, 0, data, img_width, q0_org, q1_org); 501cabdff1aSopenharmony_ci 502cabdff1aSopenharmony_ci is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec); 503cabdff1aSopenharmony_ci p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org); 504cabdff1aSopenharmony_ci p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org); 505cabdff1aSopenharmony_ci q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org); 506cabdff1aSopenharmony_ci 507cabdff1aSopenharmony_ci is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha); 508cabdff1aSopenharmony_ci is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta); 509cabdff1aSopenharmony_ci is_less_than = is_less_than_alpha & is_less_than_beta; 510cabdff1aSopenharmony_ci is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta); 511cabdff1aSopenharmony_ci is_less_than = is_less_than_beta & is_less_than; 512cabdff1aSopenharmony_ci is_less_than = is_less_than & is_bs_greater_than0; 513cabdff1aSopenharmony_ci 514cabdff1aSopenharmony_ci if (__lasx_xbnz_v(is_less_than)) { 515cabdff1aSopenharmony_ci __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h; 516cabdff1aSopenharmony_ci 517cabdff1aSopenharmony_ci p1_org_h = __lasx_vext2xv_hu_bu(p1_org); 518cabdff1aSopenharmony_ci p0_org_h = __lasx_vext2xv_hu_bu(p0_org); 519cabdff1aSopenharmony_ci q0_org_h = __lasx_vext2xv_hu_bu(q0_org); 520cabdff1aSopenharmony_ci q1_org_h = __lasx_vext2xv_hu_bu(q1_org); 521cabdff1aSopenharmony_ci 522cabdff1aSopenharmony_ci { 523cabdff1aSopenharmony_ci __m256i neg_thresh_h, tc_h, p0_h, q0_h; 524cabdff1aSopenharmony_ci 525cabdff1aSopenharmony_ci neg_thresh_h = __lasx_xvneg_b(tc_vec); 526cabdff1aSopenharmony_ci neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h); 527cabdff1aSopenharmony_ci tc_h = __lasx_vext2xv_hu_bu(tc_vec); 528cabdff1aSopenharmony_ci 529cabdff1aSopenharmony_ci AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h, 530cabdff1aSopenharmony_ci neg_thresh_h, tc_h, p0_h, q0_h); 531cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h, 532cabdff1aSopenharmony_ci p0_h, q0_h); 533cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8, 534cabdff1aSopenharmony_ci p0_h, q0_h); 535cabdff1aSopenharmony_ci p0_h = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than); 536cabdff1aSopenharmony_ci q0_h = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than); 537cabdff1aSopenharmony_ci __lasx_xvstelm_d(p0_h, data - img_width, 0, 0); 538cabdff1aSopenharmony_ci __lasx_xvstelm_d(q0_h, data, 0, 0); 539cabdff1aSopenharmony_ci } 540cabdff1aSopenharmony_ci } 541cabdff1aSopenharmony_ci } 542cabdff1aSopenharmony_ci} 543cabdff1aSopenharmony_ci 544cabdff1aSopenharmony_ci#define AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_or_q3_org_in, p0_or_q0_org_in, \ 545cabdff1aSopenharmony_ci q3_or_p3_org_in, p1_or_q1_org_in, \ 546cabdff1aSopenharmony_ci p2_or_q2_org_in, q1_or_p1_org_in, \ 547cabdff1aSopenharmony_ci p0_or_q0_out, p1_or_q1_out, p2_or_q2_out) \ 548cabdff1aSopenharmony_ci{ \ 549cabdff1aSopenharmony_ci __m256i threshold; \ 550cabdff1aSopenharmony_ci __m256i const2, const3 = __lasx_xvldi(0); \ 551cabdff1aSopenharmony_ci \ 552cabdff1aSopenharmony_ci const2 = __lasx_xvaddi_hu(const3, 2); \ 553cabdff1aSopenharmony_ci const3 = __lasx_xvaddi_hu(const3, 3); \ 554cabdff1aSopenharmony_ci threshold = __lasx_xvadd_h(p0_or_q0_org_in, q3_or_p3_org_in); \ 555cabdff1aSopenharmony_ci threshold = __lasx_xvadd_h(p1_or_q1_org_in, threshold); \ 556cabdff1aSopenharmony_ci \ 557cabdff1aSopenharmony_ci p0_or_q0_out = __lasx_xvslli_h(threshold, 1); \ 558cabdff1aSopenharmony_ci p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_out, p2_or_q2_org_in); \ 559cabdff1aSopenharmony_ci p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_out, q1_or_p1_org_in); \ 560cabdff1aSopenharmony_ci p0_or_q0_out = __lasx_xvsrar_h(p0_or_q0_out, const3); \ 561cabdff1aSopenharmony_ci \ 562cabdff1aSopenharmony_ci p1_or_q1_out = __lasx_xvadd_h(p2_or_q2_org_in, threshold); \ 563cabdff1aSopenharmony_ci p1_or_q1_out = __lasx_xvsrar_h(p1_or_q1_out, const2); \ 564cabdff1aSopenharmony_ci \ 565cabdff1aSopenharmony_ci p2_or_q2_out = __lasx_xvmul_h(p2_or_q2_org_in, const3); \ 566cabdff1aSopenharmony_ci p2_or_q2_out = __lasx_xvadd_h(p2_or_q2_out, p3_or_q3_org_in); \ 567cabdff1aSopenharmony_ci p2_or_q2_out = __lasx_xvadd_h(p2_or_q2_out, p3_or_q3_org_in); \ 568cabdff1aSopenharmony_ci p2_or_q2_out = __lasx_xvadd_h(p2_or_q2_out, threshold); \ 569cabdff1aSopenharmony_ci p2_or_q2_out = __lasx_xvsrar_h(p2_or_q2_out, const3); \ 570cabdff1aSopenharmony_ci} 571cabdff1aSopenharmony_ci 572cabdff1aSopenharmony_ci/* data[-u32_img_width] = (uint8_t)((2 * p1 + p0 + q1 + 2) >> 2); */ 573cabdff1aSopenharmony_ci#define AVC_LPF_P0_OR_Q0(p0_or_q0_org_in, q1_or_p1_org_in, \ 574cabdff1aSopenharmony_ci p1_or_q1_org_in, p0_or_q0_out) \ 575cabdff1aSopenharmony_ci{ \ 576cabdff1aSopenharmony_ci __m256i const2 = __lasx_xvldi(0); \ 577cabdff1aSopenharmony_ci const2 = __lasx_xvaddi_hu(const2, 2); \ 578cabdff1aSopenharmony_ci p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_org_in, q1_or_p1_org_in); \ 579cabdff1aSopenharmony_ci p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_out, p1_or_q1_org_in); \ 580cabdff1aSopenharmony_ci p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_out, p1_or_q1_org_in); \ 581cabdff1aSopenharmony_ci p0_or_q0_out = __lasx_xvsrar_h(p0_or_q0_out, const2); \ 582cabdff1aSopenharmony_ci} 583cabdff1aSopenharmony_ci 584cabdff1aSopenharmony_civoid ff_h264_h_lpf_luma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width, 585cabdff1aSopenharmony_ci int alpha_in, int beta_in) 586cabdff1aSopenharmony_ci{ 587cabdff1aSopenharmony_ci ptrdiff_t img_width_2x = img_width << 1; 588cabdff1aSopenharmony_ci ptrdiff_t img_width_4x = img_width << 2; 589cabdff1aSopenharmony_ci ptrdiff_t img_width_3x = img_width_2x + img_width; 590cabdff1aSopenharmony_ci uint8_t *src = data - 4; 591cabdff1aSopenharmony_ci __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta; 592cabdff1aSopenharmony_ci __m256i is_less_than, is_less_than_beta, is_less_than_alpha; 593cabdff1aSopenharmony_ci __m256i p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org; 594cabdff1aSopenharmony_ci __m256i zero = __lasx_xvldi(0); 595cabdff1aSopenharmony_ci 596cabdff1aSopenharmony_ci { 597cabdff1aSopenharmony_ci __m256i row0, row1, row2, row3, row4, row5, row6, row7; 598cabdff1aSopenharmony_ci __m256i row8, row9, row10, row11, row12, row13, row14, row15; 599cabdff1aSopenharmony_ci 600cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x, 601cabdff1aSopenharmony_ci src, img_width_3x, row0, row1, row2, row3); 602cabdff1aSopenharmony_ci src += img_width_4x; 603cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x, 604cabdff1aSopenharmony_ci src, img_width_3x, row4, row5, row6, row7); 605cabdff1aSopenharmony_ci src += img_width_4x; 606cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x, 607cabdff1aSopenharmony_ci src, img_width_3x, row8, row9, row10, row11); 608cabdff1aSopenharmony_ci src += img_width_4x; 609cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x, 610cabdff1aSopenharmony_ci src, img_width_3x, row12, row13, row14, row15); 611cabdff1aSopenharmony_ci src += img_width_4x; 612cabdff1aSopenharmony_ci 613cabdff1aSopenharmony_ci LASX_TRANSPOSE16x8_B(row0, row1, row2, row3, 614cabdff1aSopenharmony_ci row4, row5, row6, row7, 615cabdff1aSopenharmony_ci row8, row9, row10, row11, 616cabdff1aSopenharmony_ci row12, row13, row14, row15, 617cabdff1aSopenharmony_ci p3_org, p2_org, p1_org, p0_org, 618cabdff1aSopenharmony_ci q0_org, q1_org, q2_org, q3_org); 619cabdff1aSopenharmony_ci } 620cabdff1aSopenharmony_ci 621cabdff1aSopenharmony_ci alpha = __lasx_xvreplgr2vr_b(alpha_in); 622cabdff1aSopenharmony_ci beta = __lasx_xvreplgr2vr_b(beta_in); 623cabdff1aSopenharmony_ci p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org); 624cabdff1aSopenharmony_ci p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org); 625cabdff1aSopenharmony_ci q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org); 626cabdff1aSopenharmony_ci 627cabdff1aSopenharmony_ci is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha); 628cabdff1aSopenharmony_ci is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta); 629cabdff1aSopenharmony_ci is_less_than = is_less_than_beta & is_less_than_alpha; 630cabdff1aSopenharmony_ci is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta); 631cabdff1aSopenharmony_ci is_less_than = is_less_than_beta & is_less_than; 632cabdff1aSopenharmony_ci is_less_than = __lasx_xvpermi_q(zero, is_less_than, 0x30); 633cabdff1aSopenharmony_ci 634cabdff1aSopenharmony_ci if (__lasx_xbnz_v(is_less_than)) { 635cabdff1aSopenharmony_ci __m256i p2_asub_p0, q2_asub_q0, p0_h, q0_h, negate_is_less_than_beta; 636cabdff1aSopenharmony_ci __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h; 637cabdff1aSopenharmony_ci __m256i less_alpha_shift2_add2 = __lasx_xvsrli_b(alpha, 2); 638cabdff1aSopenharmony_ci 639cabdff1aSopenharmony_ci less_alpha_shift2_add2 = __lasx_xvaddi_bu(less_alpha_shift2_add2, 2); 640cabdff1aSopenharmony_ci less_alpha_shift2_add2 = __lasx_xvslt_bu(p0_asub_q0, 641cabdff1aSopenharmony_ci less_alpha_shift2_add2); 642cabdff1aSopenharmony_ci 643cabdff1aSopenharmony_ci p1_org_h = __lasx_vext2xv_hu_bu(p1_org); 644cabdff1aSopenharmony_ci p0_org_h = __lasx_vext2xv_hu_bu(p0_org); 645cabdff1aSopenharmony_ci q0_org_h = __lasx_vext2xv_hu_bu(q0_org); 646cabdff1aSopenharmony_ci q1_org_h = __lasx_vext2xv_hu_bu(q1_org); 647cabdff1aSopenharmony_ci 648cabdff1aSopenharmony_ci p2_asub_p0 = __lasx_xvabsd_bu(p2_org, p0_org); 649cabdff1aSopenharmony_ci is_less_than_beta = __lasx_xvslt_bu(p2_asub_p0, beta); 650cabdff1aSopenharmony_ci is_less_than_beta = is_less_than_beta & less_alpha_shift2_add2; 651cabdff1aSopenharmony_ci negate_is_less_than_beta = __lasx_xvxori_b(is_less_than_beta, 0xff); 652cabdff1aSopenharmony_ci is_less_than_beta = is_less_than_beta & is_less_than; 653cabdff1aSopenharmony_ci negate_is_less_than_beta = negate_is_less_than_beta & is_less_than; 654cabdff1aSopenharmony_ci 655cabdff1aSopenharmony_ci /* combine and store */ 656cabdff1aSopenharmony_ci if (__lasx_xbnz_v(is_less_than_beta)) { 657cabdff1aSopenharmony_ci __m256i p2_org_h, p3_org_h, p1_h, p2_h; 658cabdff1aSopenharmony_ci 659cabdff1aSopenharmony_ci p2_org_h = __lasx_vext2xv_hu_bu(p2_org); 660cabdff1aSopenharmony_ci p3_org_h = __lasx_vext2xv_hu_bu(p3_org); 661cabdff1aSopenharmony_ci 662cabdff1aSopenharmony_ci AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_h, p0_org_h, q0_org_h, p1_org_h, 663cabdff1aSopenharmony_ci p2_org_h, q1_org_h, p0_h, p1_h, p2_h); 664cabdff1aSopenharmony_ci 665cabdff1aSopenharmony_ci p0_h = __lasx_xvpickev_b(p0_h, p0_h); 666cabdff1aSopenharmony_ci p0_h = __lasx_xvpermi_d(p0_h, 0xd8); 667cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvpickev_b, p1_h, p1_h, p2_h, p2_h, p1_h, p2_h); 668cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvpermi_d, p1_h, 0xd8, p2_h, 0xd8, p1_h, p2_h); 669cabdff1aSopenharmony_ci p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than_beta); 670cabdff1aSopenharmony_ci p1_org = __lasx_xvbitsel_v(p1_org, p1_h, is_less_than_beta); 671cabdff1aSopenharmony_ci p2_org = __lasx_xvbitsel_v(p2_org, p2_h, is_less_than_beta); 672cabdff1aSopenharmony_ci } 673cabdff1aSopenharmony_ci 674cabdff1aSopenharmony_ci AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h); 675cabdff1aSopenharmony_ci /* combine */ 676cabdff1aSopenharmony_ci p0_h = __lasx_xvpickev_b(p0_h, p0_h); 677cabdff1aSopenharmony_ci p0_h = __lasx_xvpermi_d(p0_h, 0xd8); 678cabdff1aSopenharmony_ci p0_org = __lasx_xvbitsel_v(p0_org, p0_h, negate_is_less_than_beta); 679cabdff1aSopenharmony_ci 680cabdff1aSopenharmony_ci /* if (tmpFlag && (unsigned)ABS(q2-q0) < thresholds->beta_in) */ 681cabdff1aSopenharmony_ci q2_asub_q0 = __lasx_xvabsd_bu(q2_org, q0_org); 682cabdff1aSopenharmony_ci is_less_than_beta = __lasx_xvslt_bu(q2_asub_q0, beta); 683cabdff1aSopenharmony_ci is_less_than_beta = is_less_than_beta & less_alpha_shift2_add2; 684cabdff1aSopenharmony_ci negate_is_less_than_beta = __lasx_xvxori_b(is_less_than_beta, 0xff); 685cabdff1aSopenharmony_ci is_less_than_beta = is_less_than_beta & is_less_than; 686cabdff1aSopenharmony_ci negate_is_less_than_beta = negate_is_less_than_beta & is_less_than; 687cabdff1aSopenharmony_ci 688cabdff1aSopenharmony_ci /* combine and store */ 689cabdff1aSopenharmony_ci if (__lasx_xbnz_v(is_less_than_beta)) { 690cabdff1aSopenharmony_ci __m256i q2_org_h, q3_org_h, q1_h, q2_h; 691cabdff1aSopenharmony_ci 692cabdff1aSopenharmony_ci q2_org_h = __lasx_vext2xv_hu_bu(q2_org); 693cabdff1aSopenharmony_ci q3_org_h = __lasx_vext2xv_hu_bu(q3_org); 694cabdff1aSopenharmony_ci 695cabdff1aSopenharmony_ci AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_h, q0_org_h, p0_org_h, q1_org_h, 696cabdff1aSopenharmony_ci q2_org_h, p1_org_h, q0_h, q1_h, q2_h); 697cabdff1aSopenharmony_ci 698cabdff1aSopenharmony_ci q0_h = __lasx_xvpickev_b(q0_h, q0_h); 699cabdff1aSopenharmony_ci q0_h = __lasx_xvpermi_d(q0_h, 0xd8); 700cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvpickev_b, q1_h, q1_h, q2_h, q2_h, q1_h, q2_h); 701cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvpermi_d, q1_h, 0xd8, q2_h, 0xd8, q1_h, q2_h); 702cabdff1aSopenharmony_ci q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than_beta); 703cabdff1aSopenharmony_ci q1_org = __lasx_xvbitsel_v(q1_org, q1_h, is_less_than_beta); 704cabdff1aSopenharmony_ci q2_org = __lasx_xvbitsel_v(q2_org, q2_h, is_less_than_beta); 705cabdff1aSopenharmony_ci 706cabdff1aSopenharmony_ci } 707cabdff1aSopenharmony_ci 708cabdff1aSopenharmony_ci AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h); 709cabdff1aSopenharmony_ci 710cabdff1aSopenharmony_ci /* combine */ 711cabdff1aSopenharmony_ci q0_h = __lasx_xvpickev_b(q0_h, q0_h); 712cabdff1aSopenharmony_ci q0_h = __lasx_xvpermi_d(q0_h, 0xd8); 713cabdff1aSopenharmony_ci q0_org = __lasx_xvbitsel_v(q0_org, q0_h, negate_is_less_than_beta); 714cabdff1aSopenharmony_ci 715cabdff1aSopenharmony_ci /* transpose and store */ 716cabdff1aSopenharmony_ci { 717cabdff1aSopenharmony_ci __m256i row0, row1, row2, row3, row4, row5, row6, row7; 718cabdff1aSopenharmony_ci __m256i control = {0x0000000400000000, 0x0000000500000001, 719cabdff1aSopenharmony_ci 0x0000000600000002, 0x0000000700000003}; 720cabdff1aSopenharmony_ci 721cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, p0_org, q3_org, 0x02, p1_org, q2_org, 722cabdff1aSopenharmony_ci 0x02, p2_org, q1_org, 0x02, p3_org, q0_org, 0x02, 723cabdff1aSopenharmony_ci p0_org, p1_org, p2_org, p3_org); 724cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_b, p1_org, p3_org, p0_org, p2_org, 725cabdff1aSopenharmony_ci row0, row2); 726cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvh_b, p1_org, p3_org, p0_org, p2_org, 727cabdff1aSopenharmony_ci row1, row3); 728cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row4, row6); 729cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvh_b, row2, row0, row3, row1, row5, row7); 730cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvperm_w, row4, control, row5, control, row6, 731cabdff1aSopenharmony_ci control, row7, control, row4, row5, row6, row7); 732cabdff1aSopenharmony_ci src = data - 4; 733cabdff1aSopenharmony_ci __lasx_xvstelm_d(row4, src, 0, 0); 734cabdff1aSopenharmony_ci __lasx_xvstelm_d(row4, src + img_width, 0, 1); 735cabdff1aSopenharmony_ci src += img_width_2x; 736cabdff1aSopenharmony_ci __lasx_xvstelm_d(row4, src, 0, 2); 737cabdff1aSopenharmony_ci __lasx_xvstelm_d(row4, src + img_width, 0, 3); 738cabdff1aSopenharmony_ci src += img_width_2x; 739cabdff1aSopenharmony_ci __lasx_xvstelm_d(row5, src, 0, 0); 740cabdff1aSopenharmony_ci __lasx_xvstelm_d(row5, src + img_width, 0, 1); 741cabdff1aSopenharmony_ci src += img_width_2x; 742cabdff1aSopenharmony_ci __lasx_xvstelm_d(row5, src, 0, 2); 743cabdff1aSopenharmony_ci __lasx_xvstelm_d(row5, src + img_width, 0, 3); 744cabdff1aSopenharmony_ci src += img_width_2x; 745cabdff1aSopenharmony_ci __lasx_xvstelm_d(row6, src, 0, 0); 746cabdff1aSopenharmony_ci __lasx_xvstelm_d(row6, src + img_width, 0, 1); 747cabdff1aSopenharmony_ci src += img_width_2x; 748cabdff1aSopenharmony_ci __lasx_xvstelm_d(row6, src, 0, 2); 749cabdff1aSopenharmony_ci __lasx_xvstelm_d(row6, src + img_width, 0, 3); 750cabdff1aSopenharmony_ci src += img_width_2x; 751cabdff1aSopenharmony_ci __lasx_xvstelm_d(row7, src, 0, 0); 752cabdff1aSopenharmony_ci __lasx_xvstelm_d(row7, src + img_width, 0, 1); 753cabdff1aSopenharmony_ci src += img_width_2x; 754cabdff1aSopenharmony_ci __lasx_xvstelm_d(row7, src, 0, 2); 755cabdff1aSopenharmony_ci __lasx_xvstelm_d(row7, src + img_width, 0, 3); 756cabdff1aSopenharmony_ci } 757cabdff1aSopenharmony_ci } 758cabdff1aSopenharmony_ci} 759cabdff1aSopenharmony_ci 760cabdff1aSopenharmony_civoid ff_h264_v_lpf_luma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width, 761cabdff1aSopenharmony_ci int alpha_in, int beta_in) 762cabdff1aSopenharmony_ci{ 763cabdff1aSopenharmony_ci ptrdiff_t img_width_2x = img_width << 1; 764cabdff1aSopenharmony_ci ptrdiff_t img_width_3x = img_width_2x + img_width; 765cabdff1aSopenharmony_ci uint8_t *src = data - img_width_2x; 766cabdff1aSopenharmony_ci __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta; 767cabdff1aSopenharmony_ci __m256i is_less_than, is_less_than_beta, is_less_than_alpha; 768cabdff1aSopenharmony_ci __m256i p1_org, p0_org, q0_org, q1_org; 769cabdff1aSopenharmony_ci __m256i zero = __lasx_xvldi(0); 770cabdff1aSopenharmony_ci 771cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x, 772cabdff1aSopenharmony_ci src, img_width_3x, p1_org, p0_org, q0_org, q1_org); 773cabdff1aSopenharmony_ci alpha = __lasx_xvreplgr2vr_b(alpha_in); 774cabdff1aSopenharmony_ci beta = __lasx_xvreplgr2vr_b(beta_in); 775cabdff1aSopenharmony_ci p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org); 776cabdff1aSopenharmony_ci p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org); 777cabdff1aSopenharmony_ci q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org); 778cabdff1aSopenharmony_ci 779cabdff1aSopenharmony_ci is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha); 780cabdff1aSopenharmony_ci is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta); 781cabdff1aSopenharmony_ci is_less_than = is_less_than_beta & is_less_than_alpha; 782cabdff1aSopenharmony_ci is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta); 783cabdff1aSopenharmony_ci is_less_than = is_less_than_beta & is_less_than; 784cabdff1aSopenharmony_ci is_less_than = __lasx_xvpermi_q(zero, is_less_than, 0x30); 785cabdff1aSopenharmony_ci 786cabdff1aSopenharmony_ci if (__lasx_xbnz_v(is_less_than)) { 787cabdff1aSopenharmony_ci __m256i p2_asub_p0, q2_asub_q0, p0_h, q0_h, negate_is_less_than_beta; 788cabdff1aSopenharmony_ci __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h; 789cabdff1aSopenharmony_ci __m256i p2_org = __lasx_xvldx(src, -img_width); 790cabdff1aSopenharmony_ci __m256i q2_org = __lasx_xvldx(data, img_width_2x); 791cabdff1aSopenharmony_ci __m256i less_alpha_shift2_add2 = __lasx_xvsrli_b(alpha, 2); 792cabdff1aSopenharmony_ci less_alpha_shift2_add2 = __lasx_xvaddi_bu(less_alpha_shift2_add2, 2); 793cabdff1aSopenharmony_ci less_alpha_shift2_add2 = __lasx_xvslt_bu(p0_asub_q0, 794cabdff1aSopenharmony_ci less_alpha_shift2_add2); 795cabdff1aSopenharmony_ci 796cabdff1aSopenharmony_ci p1_org_h = __lasx_vext2xv_hu_bu(p1_org); 797cabdff1aSopenharmony_ci p0_org_h = __lasx_vext2xv_hu_bu(p0_org); 798cabdff1aSopenharmony_ci q0_org_h = __lasx_vext2xv_hu_bu(q0_org); 799cabdff1aSopenharmony_ci q1_org_h = __lasx_vext2xv_hu_bu(q1_org); 800cabdff1aSopenharmony_ci 801cabdff1aSopenharmony_ci p2_asub_p0 = __lasx_xvabsd_bu(p2_org, p0_org); 802cabdff1aSopenharmony_ci is_less_than_beta = __lasx_xvslt_bu(p2_asub_p0, beta); 803cabdff1aSopenharmony_ci is_less_than_beta = is_less_than_beta & less_alpha_shift2_add2; 804cabdff1aSopenharmony_ci negate_is_less_than_beta = __lasx_xvxori_b(is_less_than_beta, 0xff); 805cabdff1aSopenharmony_ci is_less_than_beta = is_less_than_beta & is_less_than; 806cabdff1aSopenharmony_ci negate_is_less_than_beta = negate_is_less_than_beta & is_less_than; 807cabdff1aSopenharmony_ci 808cabdff1aSopenharmony_ci /* combine and store */ 809cabdff1aSopenharmony_ci if (__lasx_xbnz_v(is_less_than_beta)) { 810cabdff1aSopenharmony_ci __m256i p2_org_h, p3_org_h, p1_h, p2_h; 811cabdff1aSopenharmony_ci __m256i p3_org = __lasx_xvldx(src, -img_width_2x); 812cabdff1aSopenharmony_ci 813cabdff1aSopenharmony_ci p2_org_h = __lasx_vext2xv_hu_bu(p2_org); 814cabdff1aSopenharmony_ci p3_org_h = __lasx_vext2xv_hu_bu(p3_org); 815cabdff1aSopenharmony_ci 816cabdff1aSopenharmony_ci AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_h, p0_org_h, q0_org_h, p1_org_h, 817cabdff1aSopenharmony_ci p2_org_h, q1_org_h, p0_h, p1_h, p2_h); 818cabdff1aSopenharmony_ci 819cabdff1aSopenharmony_ci p0_h = __lasx_xvpickev_b(p0_h, p0_h); 820cabdff1aSopenharmony_ci p0_h = __lasx_xvpermi_d(p0_h, 0xd8); 821cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvpickev_b, p1_h, p1_h, p2_h, p2_h, p1_h, p2_h); 822cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvpermi_d, p1_h, 0xd8, p2_h, 0xd8, p1_h, p2_h); 823cabdff1aSopenharmony_ci p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than_beta); 824cabdff1aSopenharmony_ci p1_org = __lasx_xvbitsel_v(p1_org, p1_h, is_less_than_beta); 825cabdff1aSopenharmony_ci p2_org = __lasx_xvbitsel_v(p2_org, p2_h, is_less_than_beta); 826cabdff1aSopenharmony_ci 827cabdff1aSopenharmony_ci __lasx_xvst(p1_org, src, 0); 828cabdff1aSopenharmony_ci __lasx_xvst(p2_org, src - img_width, 0); 829cabdff1aSopenharmony_ci } 830cabdff1aSopenharmony_ci 831cabdff1aSopenharmony_ci AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h); 832cabdff1aSopenharmony_ci /* combine */ 833cabdff1aSopenharmony_ci p0_h = __lasx_xvpickev_b(p0_h, p0_h); 834cabdff1aSopenharmony_ci p0_h = __lasx_xvpermi_d(p0_h, 0xd8); 835cabdff1aSopenharmony_ci p0_org = __lasx_xvbitsel_v(p0_org, p0_h, negate_is_less_than_beta); 836cabdff1aSopenharmony_ci __lasx_xvst(p0_org, data - img_width, 0); 837cabdff1aSopenharmony_ci 838cabdff1aSopenharmony_ci /* if (tmpFlag && (unsigned)ABS(q2-q0) < thresholds->beta_in) */ 839cabdff1aSopenharmony_ci q2_asub_q0 = __lasx_xvabsd_bu(q2_org, q0_org); 840cabdff1aSopenharmony_ci is_less_than_beta = __lasx_xvslt_bu(q2_asub_q0, beta); 841cabdff1aSopenharmony_ci is_less_than_beta = is_less_than_beta & less_alpha_shift2_add2; 842cabdff1aSopenharmony_ci negate_is_less_than_beta = __lasx_xvxori_b(is_less_than_beta, 0xff); 843cabdff1aSopenharmony_ci is_less_than_beta = is_less_than_beta & is_less_than; 844cabdff1aSopenharmony_ci negate_is_less_than_beta = negate_is_less_than_beta & is_less_than; 845cabdff1aSopenharmony_ci 846cabdff1aSopenharmony_ci /* combine and store */ 847cabdff1aSopenharmony_ci if (__lasx_xbnz_v(is_less_than_beta)) { 848cabdff1aSopenharmony_ci __m256i q2_org_h, q3_org_h, q1_h, q2_h; 849cabdff1aSopenharmony_ci __m256i q3_org = __lasx_xvldx(data, img_width_2x + img_width); 850cabdff1aSopenharmony_ci 851cabdff1aSopenharmony_ci q2_org_h = __lasx_vext2xv_hu_bu(q2_org); 852cabdff1aSopenharmony_ci q3_org_h = __lasx_vext2xv_hu_bu(q3_org); 853cabdff1aSopenharmony_ci 854cabdff1aSopenharmony_ci AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_h, q0_org_h, p0_org_h, q1_org_h, 855cabdff1aSopenharmony_ci q2_org_h, p1_org_h, q0_h, q1_h, q2_h); 856cabdff1aSopenharmony_ci 857cabdff1aSopenharmony_ci q0_h = __lasx_xvpickev_b(q0_h, q0_h); 858cabdff1aSopenharmony_ci q0_h = __lasx_xvpermi_d(q0_h, 0xd8); 859cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvpickev_b, q1_h, q1_h, q2_h, q2_h, q1_h, q2_h); 860cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvpermi_d, q1_h, 0xd8, q2_h, 0xd8, q1_h, q2_h); 861cabdff1aSopenharmony_ci q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than_beta); 862cabdff1aSopenharmony_ci q1_org = __lasx_xvbitsel_v(q1_org, q1_h, is_less_than_beta); 863cabdff1aSopenharmony_ci q2_org = __lasx_xvbitsel_v(q2_org, q2_h, is_less_than_beta); 864cabdff1aSopenharmony_ci 865cabdff1aSopenharmony_ci __lasx_xvst(q1_org, data + img_width, 0); 866cabdff1aSopenharmony_ci __lasx_xvst(q2_org, data + img_width_2x, 0); 867cabdff1aSopenharmony_ci } 868cabdff1aSopenharmony_ci 869cabdff1aSopenharmony_ci AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h); 870cabdff1aSopenharmony_ci 871cabdff1aSopenharmony_ci /* combine */ 872cabdff1aSopenharmony_ci q0_h = __lasx_xvpickev_b(q0_h, q0_h); 873cabdff1aSopenharmony_ci q0_h = __lasx_xvpermi_d(q0_h, 0xd8); 874cabdff1aSopenharmony_ci q0_org = __lasx_xvbitsel_v(q0_org, q0_h, negate_is_less_than_beta); 875cabdff1aSopenharmony_ci 876cabdff1aSopenharmony_ci __lasx_xvst(q0_org, data, 0); 877cabdff1aSopenharmony_ci } 878cabdff1aSopenharmony_ci} 879cabdff1aSopenharmony_ci 880cabdff1aSopenharmony_civoid ff_h264_h_lpf_chroma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width, 881cabdff1aSopenharmony_ci int alpha_in, int beta_in) 882cabdff1aSopenharmony_ci{ 883cabdff1aSopenharmony_ci uint8_t *src = data - 2; 884cabdff1aSopenharmony_ci ptrdiff_t img_width_2x = img_width << 1; 885cabdff1aSopenharmony_ci ptrdiff_t img_width_4x = img_width << 2; 886cabdff1aSopenharmony_ci ptrdiff_t img_width_3x = img_width_2x + img_width; 887cabdff1aSopenharmony_ci __m256i p1_org, p0_org, q0_org, q1_org; 888cabdff1aSopenharmony_ci __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta; 889cabdff1aSopenharmony_ci __m256i is_less_than, is_less_than_beta, is_less_than_alpha; 890cabdff1aSopenharmony_ci 891cabdff1aSopenharmony_ci { 892cabdff1aSopenharmony_ci __m256i row0, row1, row2, row3, row4, row5, row6, row7; 893cabdff1aSopenharmony_ci 894cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x, src, 895cabdff1aSopenharmony_ci img_width_3x, row0, row1, row2, row3); 896cabdff1aSopenharmony_ci src += img_width_4x; 897cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x, src, 898cabdff1aSopenharmony_ci img_width_3x, row4, row5, row6, row7); 899cabdff1aSopenharmony_ci 900cabdff1aSopenharmony_ci /* LASX_TRANSPOSE8x4_B */ 901cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row6, row4, row7, row5, 902cabdff1aSopenharmony_ci p1_org, p0_org, q0_org, q1_org); 903cabdff1aSopenharmony_ci row0 = __lasx_xvilvl_b(p0_org, p1_org); 904cabdff1aSopenharmony_ci row1 = __lasx_xvilvl_b(q1_org, q0_org); 905cabdff1aSopenharmony_ci row3 = __lasx_xvilvh_w(row1, row0); 906cabdff1aSopenharmony_ci row2 = __lasx_xvilvl_w(row1, row0); 907cabdff1aSopenharmony_ci p1_org = __lasx_xvpermi_d(row2, 0x00); 908cabdff1aSopenharmony_ci p0_org = __lasx_xvpermi_d(row2, 0x55); 909cabdff1aSopenharmony_ci q0_org = __lasx_xvpermi_d(row3, 0x00); 910cabdff1aSopenharmony_ci q1_org = __lasx_xvpermi_d(row3, 0x55); 911cabdff1aSopenharmony_ci } 912cabdff1aSopenharmony_ci 913cabdff1aSopenharmony_ci alpha = __lasx_xvreplgr2vr_b(alpha_in); 914cabdff1aSopenharmony_ci beta = __lasx_xvreplgr2vr_b(beta_in); 915cabdff1aSopenharmony_ci 916cabdff1aSopenharmony_ci p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org); 917cabdff1aSopenharmony_ci p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org); 918cabdff1aSopenharmony_ci q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org); 919cabdff1aSopenharmony_ci 920cabdff1aSopenharmony_ci is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha); 921cabdff1aSopenharmony_ci is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta); 922cabdff1aSopenharmony_ci is_less_than = is_less_than_alpha & is_less_than_beta; 923cabdff1aSopenharmony_ci is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta); 924cabdff1aSopenharmony_ci is_less_than = is_less_than_beta & is_less_than; 925cabdff1aSopenharmony_ci 926cabdff1aSopenharmony_ci if (__lasx_xbnz_v(is_less_than)) { 927cabdff1aSopenharmony_ci __m256i p0_h, q0_h, p1_org_h, p0_org_h, q0_org_h, q1_org_h; 928cabdff1aSopenharmony_ci 929cabdff1aSopenharmony_ci p1_org_h = __lasx_vext2xv_hu_bu(p1_org); 930cabdff1aSopenharmony_ci p0_org_h = __lasx_vext2xv_hu_bu(p0_org); 931cabdff1aSopenharmony_ci q0_org_h = __lasx_vext2xv_hu_bu(q0_org); 932cabdff1aSopenharmony_ci q1_org_h = __lasx_vext2xv_hu_bu(q1_org); 933cabdff1aSopenharmony_ci 934cabdff1aSopenharmony_ci AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h); 935cabdff1aSopenharmony_ci AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h); 936cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h, p0_h, q0_h); 937cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8, p0_h, q0_h); 938cabdff1aSopenharmony_ci p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than); 939cabdff1aSopenharmony_ci q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than); 940cabdff1aSopenharmony_ci } 941cabdff1aSopenharmony_ci p0_org = __lasx_xvilvl_b(q0_org, p0_org); 942cabdff1aSopenharmony_ci src = data - 1; 943cabdff1aSopenharmony_ci __lasx_xvstelm_h(p0_org, src, 0, 0); 944cabdff1aSopenharmony_ci src += img_width; 945cabdff1aSopenharmony_ci __lasx_xvstelm_h(p0_org, src, 0, 1); 946cabdff1aSopenharmony_ci src += img_width; 947cabdff1aSopenharmony_ci __lasx_xvstelm_h(p0_org, src, 0, 2); 948cabdff1aSopenharmony_ci src += img_width; 949cabdff1aSopenharmony_ci __lasx_xvstelm_h(p0_org, src, 0, 3); 950cabdff1aSopenharmony_ci src += img_width; 951cabdff1aSopenharmony_ci __lasx_xvstelm_h(p0_org, src, 0, 4); 952cabdff1aSopenharmony_ci src += img_width; 953cabdff1aSopenharmony_ci __lasx_xvstelm_h(p0_org, src, 0, 5); 954cabdff1aSopenharmony_ci src += img_width; 955cabdff1aSopenharmony_ci __lasx_xvstelm_h(p0_org, src, 0, 6); 956cabdff1aSopenharmony_ci src += img_width; 957cabdff1aSopenharmony_ci __lasx_xvstelm_h(p0_org, src, 0, 7); 958cabdff1aSopenharmony_ci} 959cabdff1aSopenharmony_ci 960cabdff1aSopenharmony_civoid ff_h264_v_lpf_chroma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width, 961cabdff1aSopenharmony_ci int alpha_in, int beta_in) 962cabdff1aSopenharmony_ci{ 963cabdff1aSopenharmony_ci ptrdiff_t img_width_2x = img_width << 1; 964cabdff1aSopenharmony_ci __m256i p1_org, p0_org, q0_org, q1_org; 965cabdff1aSopenharmony_ci __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta; 966cabdff1aSopenharmony_ci __m256i is_less_than, is_less_than_beta, is_less_than_alpha; 967cabdff1aSopenharmony_ci 968cabdff1aSopenharmony_ci alpha = __lasx_xvreplgr2vr_b(alpha_in); 969cabdff1aSopenharmony_ci beta = __lasx_xvreplgr2vr_b(beta_in); 970cabdff1aSopenharmony_ci 971cabdff1aSopenharmony_ci p1_org = __lasx_xvldx(data, -img_width_2x); 972cabdff1aSopenharmony_ci p0_org = __lasx_xvldx(data, -img_width); 973cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, data, 0, data, img_width, q0_org, q1_org); 974cabdff1aSopenharmony_ci 975cabdff1aSopenharmony_ci p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org); 976cabdff1aSopenharmony_ci p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org); 977cabdff1aSopenharmony_ci q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org); 978cabdff1aSopenharmony_ci 979cabdff1aSopenharmony_ci is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha); 980cabdff1aSopenharmony_ci is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta); 981cabdff1aSopenharmony_ci is_less_than = is_less_than_alpha & is_less_than_beta; 982cabdff1aSopenharmony_ci is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta); 983cabdff1aSopenharmony_ci is_less_than = is_less_than_beta & is_less_than; 984cabdff1aSopenharmony_ci 985cabdff1aSopenharmony_ci if (__lasx_xbnz_v(is_less_than)) { 986cabdff1aSopenharmony_ci __m256i p0_h, q0_h, p1_org_h, p0_org_h, q0_org_h, q1_org_h; 987cabdff1aSopenharmony_ci 988cabdff1aSopenharmony_ci p1_org_h = __lasx_vext2xv_hu_bu(p1_org); 989cabdff1aSopenharmony_ci p0_org_h = __lasx_vext2xv_hu_bu(p0_org); 990cabdff1aSopenharmony_ci q0_org_h = __lasx_vext2xv_hu_bu(q0_org); 991cabdff1aSopenharmony_ci q1_org_h = __lasx_vext2xv_hu_bu(q1_org); 992cabdff1aSopenharmony_ci 993cabdff1aSopenharmony_ci AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h); 994cabdff1aSopenharmony_ci AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h); 995cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h, p0_h, q0_h); 996cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8, p0_h, q0_h); 997cabdff1aSopenharmony_ci p0_h = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than); 998cabdff1aSopenharmony_ci q0_h = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than); 999cabdff1aSopenharmony_ci __lasx_xvstelm_d(p0_h, data - img_width, 0, 0); 1000cabdff1aSopenharmony_ci __lasx_xvstelm_d(q0_h, data, 0, 0); 1001cabdff1aSopenharmony_ci } 1002cabdff1aSopenharmony_ci} 1003cabdff1aSopenharmony_ci 1004cabdff1aSopenharmony_civoid ff_biweight_h264_pixels16_8_lasx(uint8_t *dst, uint8_t *src, 1005cabdff1aSopenharmony_ci ptrdiff_t stride, int height, 1006cabdff1aSopenharmony_ci int log2_denom, int weight_dst, 1007cabdff1aSopenharmony_ci int weight_src, int offset_in) 1008cabdff1aSopenharmony_ci{ 1009cabdff1aSopenharmony_ci __m256i wgt; 1010cabdff1aSopenharmony_ci __m256i src0, src1, src2, src3; 1011cabdff1aSopenharmony_ci __m256i dst0, dst1, dst2, dst3; 1012cabdff1aSopenharmony_ci __m256i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 1013cabdff1aSopenharmony_ci __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 1014cabdff1aSopenharmony_ci __m256i denom, offset; 1015cabdff1aSopenharmony_ci int stride_2x = stride << 1; 1016cabdff1aSopenharmony_ci int stride_4x = stride << 2; 1017cabdff1aSopenharmony_ci int stride_3x = stride_2x + stride; 1018cabdff1aSopenharmony_ci 1019cabdff1aSopenharmony_ci offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom; 1020cabdff1aSopenharmony_ci offset_in += ((weight_src + weight_dst) << 7); 1021cabdff1aSopenharmony_ci log2_denom += 1; 1022cabdff1aSopenharmony_ci 1023cabdff1aSopenharmony_ci tmp0 = __lasx_xvreplgr2vr_b(weight_src); 1024cabdff1aSopenharmony_ci tmp1 = __lasx_xvreplgr2vr_b(weight_dst); 1025cabdff1aSopenharmony_ci wgt = __lasx_xvilvh_b(tmp1, tmp0); 1026cabdff1aSopenharmony_ci offset = __lasx_xvreplgr2vr_h(offset_in); 1027cabdff1aSopenharmony_ci denom = __lasx_xvreplgr2vr_h(log2_denom); 1028cabdff1aSopenharmony_ci 1029cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, 1030cabdff1aSopenharmony_ci src, stride_3x, tmp0, tmp1, tmp2, tmp3); 1031cabdff1aSopenharmony_ci src += stride_4x; 1032cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, 1033cabdff1aSopenharmony_ci src, stride_3x, tmp4, tmp5, tmp6, tmp7); 1034cabdff1aSopenharmony_ci src += stride_4x; 1035cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5, tmp4, 1036cabdff1aSopenharmony_ci 0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3); 1037cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, 1038cabdff1aSopenharmony_ci dst, stride_3x, tmp0, tmp1, tmp2, tmp3); 1039cabdff1aSopenharmony_ci dst += stride_4x; 1040cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, 1041cabdff1aSopenharmony_ci dst, stride_3x, tmp4, tmp5, tmp6, tmp7); 1042cabdff1aSopenharmony_ci dst -= stride_4x; 1043cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5, tmp4, 1044cabdff1aSopenharmony_ci 0x20, tmp7, tmp6, 0x20, dst0, dst1, dst2, dst3); 1045cabdff1aSopenharmony_ci 1046cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, src2, 128, src3, 128, 1047cabdff1aSopenharmony_ci src0, src1, src2, src3); 1048cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvxori_b, dst0, 128, dst1, 128, dst2, 128, dst3, 128, 1049cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 1050cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, dst2, src2, 1051cabdff1aSopenharmony_ci dst3, src3, vec0, vec2, vec4, vec6); 1052cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, dst2, src2, 1053cabdff1aSopenharmony_ci dst3, src3, vec1, vec3, vec5, vec7); 1054cabdff1aSopenharmony_ci 1055cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1, 1056cabdff1aSopenharmony_ci offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3); 1057cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec4, offset, wgt, vec5, 1058cabdff1aSopenharmony_ci offset, wgt, vec6, offset, wgt, vec7, tmp4, tmp5, tmp6, tmp7); 1059cabdff1aSopenharmony_ci 1060cabdff1aSopenharmony_ci tmp0 = __lasx_xvsra_h(tmp0, denom); 1061cabdff1aSopenharmony_ci tmp1 = __lasx_xvsra_h(tmp1, denom); 1062cabdff1aSopenharmony_ci tmp2 = __lasx_xvsra_h(tmp2, denom); 1063cabdff1aSopenharmony_ci tmp3 = __lasx_xvsra_h(tmp3, denom); 1064cabdff1aSopenharmony_ci tmp4 = __lasx_xvsra_h(tmp4, denom); 1065cabdff1aSopenharmony_ci tmp5 = __lasx_xvsra_h(tmp5, denom); 1066cabdff1aSopenharmony_ci tmp6 = __lasx_xvsra_h(tmp6, denom); 1067cabdff1aSopenharmony_ci tmp7 = __lasx_xvsra_h(tmp7, denom); 1068cabdff1aSopenharmony_ci 1069cabdff1aSopenharmony_ci DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3, 1070cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 1071cabdff1aSopenharmony_ci DUP4_ARG1(__lasx_xvclip255_h, tmp4, tmp5, tmp6, tmp7, 1072cabdff1aSopenharmony_ci tmp4, tmp5, tmp6, tmp7); 1073cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, 1074cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 1075cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst0, dst, 0, 0); 1076cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst0, dst, 8, 1); 1077cabdff1aSopenharmony_ci dst += stride; 1078cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst0, dst, 0, 2); 1079cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst0, dst, 8, 3); 1080cabdff1aSopenharmony_ci dst += stride; 1081cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst1, dst, 0, 0); 1082cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst1, dst, 8, 1); 1083cabdff1aSopenharmony_ci dst += stride; 1084cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst1, dst, 0, 2); 1085cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst1, dst, 8, 3); 1086cabdff1aSopenharmony_ci dst += stride; 1087cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst2, dst, 0, 0); 1088cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst2, dst, 8, 1); 1089cabdff1aSopenharmony_ci dst += stride; 1090cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst2, dst, 0, 2); 1091cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst2, dst, 8, 3); 1092cabdff1aSopenharmony_ci dst += stride; 1093cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst3, dst, 0, 0); 1094cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst3, dst, 8, 1); 1095cabdff1aSopenharmony_ci dst += stride; 1096cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst3, dst, 0, 2); 1097cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst3, dst, 8, 3); 1098cabdff1aSopenharmony_ci dst += stride; 1099cabdff1aSopenharmony_ci 1100cabdff1aSopenharmony_ci if (16 == height) { 1101cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, 1102cabdff1aSopenharmony_ci src, stride_3x, tmp0, tmp1, tmp2, tmp3); 1103cabdff1aSopenharmony_ci src += stride_4x; 1104cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, 1105cabdff1aSopenharmony_ci src, stride_3x, tmp4, tmp5, tmp6, tmp7); 1106cabdff1aSopenharmony_ci src += stride_4x; 1107cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5, 1108cabdff1aSopenharmony_ci tmp4, 0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3); 1109cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, 1110cabdff1aSopenharmony_ci dst, stride_3x, tmp0, tmp1, tmp2, tmp3); 1111cabdff1aSopenharmony_ci dst += stride_4x; 1112cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, 1113cabdff1aSopenharmony_ci dst, stride_3x, tmp4, tmp5, tmp6, tmp7); 1114cabdff1aSopenharmony_ci dst -= stride_4x; 1115cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5, 1116cabdff1aSopenharmony_ci tmp4, 0x20, tmp7, tmp6, 0x20, dst0, dst1, dst2, dst3); 1117cabdff1aSopenharmony_ci 1118cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, src2, 128, src3, 128, 1119cabdff1aSopenharmony_ci src0, src1, src2, src3); 1120cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvxori_b, dst0, 128, dst1, 128, dst2, 128, dst3, 128, 1121cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 1122cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, dst2, src2, 1123cabdff1aSopenharmony_ci dst3, src3, vec0, vec2, vec4, vec6); 1124cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, dst2, src2, 1125cabdff1aSopenharmony_ci dst3, src3, vec1, vec3, vec5, vec7); 1126cabdff1aSopenharmony_ci 1127cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1, 1128cabdff1aSopenharmony_ci offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3); 1129cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec4, offset, wgt, vec5, 1130cabdff1aSopenharmony_ci offset, wgt, vec6, offset, wgt, vec7, tmp4, tmp5, tmp6, tmp7); 1131cabdff1aSopenharmony_ci 1132cabdff1aSopenharmony_ci tmp0 = __lasx_xvsra_h(tmp0, denom); 1133cabdff1aSopenharmony_ci tmp1 = __lasx_xvsra_h(tmp1, denom); 1134cabdff1aSopenharmony_ci tmp2 = __lasx_xvsra_h(tmp2, denom); 1135cabdff1aSopenharmony_ci tmp3 = __lasx_xvsra_h(tmp3, denom); 1136cabdff1aSopenharmony_ci tmp4 = __lasx_xvsra_h(tmp4, denom); 1137cabdff1aSopenharmony_ci tmp5 = __lasx_xvsra_h(tmp5, denom); 1138cabdff1aSopenharmony_ci tmp6 = __lasx_xvsra_h(tmp6, denom); 1139cabdff1aSopenharmony_ci tmp7 = __lasx_xvsra_h(tmp7, denom); 1140cabdff1aSopenharmony_ci 1141cabdff1aSopenharmony_ci DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3, 1142cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 1143cabdff1aSopenharmony_ci DUP4_ARG1(__lasx_xvclip255_h, tmp4, tmp5, tmp6, tmp7, 1144cabdff1aSopenharmony_ci tmp4, tmp5, tmp6, tmp7); 1145cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, 1146cabdff1aSopenharmony_ci tmp6, dst0, dst1, dst2, dst3); 1147cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst0, dst, 0, 0); 1148cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst0, dst, 8, 1); 1149cabdff1aSopenharmony_ci dst += stride; 1150cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst0, dst, 0, 2); 1151cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst0, dst, 8, 3); 1152cabdff1aSopenharmony_ci dst += stride; 1153cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst1, dst, 0, 0); 1154cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst1, dst, 8, 1); 1155cabdff1aSopenharmony_ci dst += stride; 1156cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst1, dst, 0, 2); 1157cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst1, dst, 8, 3); 1158cabdff1aSopenharmony_ci dst += stride; 1159cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst2, dst, 0, 0); 1160cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst2, dst, 8, 1); 1161cabdff1aSopenharmony_ci dst += stride; 1162cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst2, dst, 0, 2); 1163cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst2, dst, 8, 3); 1164cabdff1aSopenharmony_ci dst += stride; 1165cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst3, dst, 0, 0); 1166cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst3, dst, 8, 1); 1167cabdff1aSopenharmony_ci dst += stride; 1168cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst3, dst, 0, 2); 1169cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst3, dst, 8, 3); 1170cabdff1aSopenharmony_ci } 1171cabdff1aSopenharmony_ci} 1172cabdff1aSopenharmony_ci 1173cabdff1aSopenharmony_cistatic void avc_biwgt_8x4_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 1174cabdff1aSopenharmony_ci int32_t log2_denom, int32_t weight_src, 1175cabdff1aSopenharmony_ci int32_t weight_dst, int32_t offset_in) 1176cabdff1aSopenharmony_ci{ 1177cabdff1aSopenharmony_ci __m256i wgt, vec0, vec1; 1178cabdff1aSopenharmony_ci __m256i src0, dst0; 1179cabdff1aSopenharmony_ci __m256i tmp0, tmp1, tmp2, tmp3, denom, offset; 1180cabdff1aSopenharmony_ci ptrdiff_t stride_2x = stride << 1; 1181cabdff1aSopenharmony_ci ptrdiff_t stride_3x = stride_2x + stride; 1182cabdff1aSopenharmony_ci 1183cabdff1aSopenharmony_ci offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom; 1184cabdff1aSopenharmony_ci offset_in += ((weight_src + weight_dst) << 7); 1185cabdff1aSopenharmony_ci log2_denom += 1; 1186cabdff1aSopenharmony_ci 1187cabdff1aSopenharmony_ci tmp0 = __lasx_xvreplgr2vr_b(weight_src); 1188cabdff1aSopenharmony_ci tmp1 = __lasx_xvreplgr2vr_b(weight_dst); 1189cabdff1aSopenharmony_ci wgt = __lasx_xvilvh_b(tmp1, tmp0); 1190cabdff1aSopenharmony_ci offset = __lasx_xvreplgr2vr_h(offset_in); 1191cabdff1aSopenharmony_ci denom = __lasx_xvreplgr2vr_h(log2_denom); 1192cabdff1aSopenharmony_ci 1193cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, 1194cabdff1aSopenharmony_ci src, stride_3x, tmp0, tmp1, tmp2, tmp3); 1195cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); 1196cabdff1aSopenharmony_ci src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20); 1197cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, 1198cabdff1aSopenharmony_ci dst, stride_3x, tmp0, tmp1, tmp2, tmp3); 1199cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); 1200cabdff1aSopenharmony_ci dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20); 1201cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0); 1202cabdff1aSopenharmony_ci vec0 = __lasx_xvilvl_b(dst0, src0); 1203cabdff1aSopenharmony_ci vec1 = __lasx_xvilvh_b(dst0, src0); 1204cabdff1aSopenharmony_ci DUP2_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1, 1205cabdff1aSopenharmony_ci tmp0, tmp1); 1206cabdff1aSopenharmony_ci tmp0 = __lasx_xvsra_h(tmp0, denom); 1207cabdff1aSopenharmony_ci tmp1 = __lasx_xvsra_h(tmp1, denom); 1208cabdff1aSopenharmony_ci DUP2_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp0, tmp1); 1209cabdff1aSopenharmony_ci dst0 = __lasx_xvpickev_b(tmp1, tmp0); 1210cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst0, dst, 0, 0); 1211cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst0, dst + stride, 0, 1); 1212cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2); 1213cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3); 1214cabdff1aSopenharmony_ci} 1215cabdff1aSopenharmony_ci 1216cabdff1aSopenharmony_cistatic void avc_biwgt_8x8_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 1217cabdff1aSopenharmony_ci int32_t log2_denom, int32_t weight_src, 1218cabdff1aSopenharmony_ci int32_t weight_dst, int32_t offset_in) 1219cabdff1aSopenharmony_ci{ 1220cabdff1aSopenharmony_ci __m256i wgt, vec0, vec1, vec2, vec3; 1221cabdff1aSopenharmony_ci __m256i src0, src1, dst0, dst1; 1222cabdff1aSopenharmony_ci __m256i tmp0, tmp1, tmp2, tmp3, denom, offset; 1223cabdff1aSopenharmony_ci ptrdiff_t stride_2x = stride << 1; 1224cabdff1aSopenharmony_ci ptrdiff_t stride_4x = stride << 2; 1225cabdff1aSopenharmony_ci ptrdiff_t stride_3x = stride_2x + stride; 1226cabdff1aSopenharmony_ci uint8_t* dst_tmp = dst; 1227cabdff1aSopenharmony_ci 1228cabdff1aSopenharmony_ci offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom; 1229cabdff1aSopenharmony_ci offset_in += ((weight_src + weight_dst) << 7); 1230cabdff1aSopenharmony_ci log2_denom += 1; 1231cabdff1aSopenharmony_ci 1232cabdff1aSopenharmony_ci tmp0 = __lasx_xvreplgr2vr_b(weight_src); 1233cabdff1aSopenharmony_ci tmp1 = __lasx_xvreplgr2vr_b(weight_dst); 1234cabdff1aSopenharmony_ci wgt = __lasx_xvilvh_b(tmp1, tmp0); 1235cabdff1aSopenharmony_ci offset = __lasx_xvreplgr2vr_h(offset_in); 1236cabdff1aSopenharmony_ci denom = __lasx_xvreplgr2vr_h(log2_denom); 1237cabdff1aSopenharmony_ci 1238cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, 1239cabdff1aSopenharmony_ci src, stride_3x, tmp0, tmp1, tmp2, tmp3); 1240cabdff1aSopenharmony_ci src += stride_4x; 1241cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); 1242cabdff1aSopenharmony_ci src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20); 1243cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, 1244cabdff1aSopenharmony_ci src, stride_3x, tmp0, tmp1, tmp2, tmp3); 1245cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); 1246cabdff1aSopenharmony_ci src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20); 1247cabdff1aSopenharmony_ci tmp0 = __lasx_xvld(dst_tmp, 0); 1248cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, dst_tmp, stride, dst_tmp, stride_2x, tmp1, tmp2); 1249cabdff1aSopenharmony_ci tmp3 = __lasx_xvldx(dst_tmp, stride_3x); 1250cabdff1aSopenharmony_ci dst_tmp += stride_4x; 1251cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); 1252cabdff1aSopenharmony_ci dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20); 1253cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x, 1254cabdff1aSopenharmony_ci dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3); 1255cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); 1256cabdff1aSopenharmony_ci dst1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20); 1257cabdff1aSopenharmony_ci 1258cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, dst0, 128, dst1, 128, 1259cabdff1aSopenharmony_ci src0, src1, dst0, dst1); 1260cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, vec0, vec2); 1261cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, vec1, vec3); 1262cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1, 1263cabdff1aSopenharmony_ci offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3); 1264cabdff1aSopenharmony_ci tmp0 = __lasx_xvsra_h(tmp0, denom); 1265cabdff1aSopenharmony_ci tmp1 = __lasx_xvsra_h(tmp1, denom); 1266cabdff1aSopenharmony_ci tmp2 = __lasx_xvsra_h(tmp2, denom); 1267cabdff1aSopenharmony_ci tmp3 = __lasx_xvsra_h(tmp3, denom); 1268cabdff1aSopenharmony_ci DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3, 1269cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 1270cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, dst0, dst1); 1271cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst0, dst, 0, 0); 1272cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst0, dst + stride, 0, 1); 1273cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2); 1274cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3); 1275cabdff1aSopenharmony_ci dst += stride_4x; 1276cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst1, dst, 0, 0); 1277cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst1, dst + stride, 0, 1); 1278cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst1, dst + stride_2x, 0, 2); 1279cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst1, dst + stride_3x, 0, 3); 1280cabdff1aSopenharmony_ci} 1281cabdff1aSopenharmony_ci 1282cabdff1aSopenharmony_cistatic void avc_biwgt_8x16_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 1283cabdff1aSopenharmony_ci int32_t log2_denom, int32_t weight_src, 1284cabdff1aSopenharmony_ci int32_t weight_dst, int32_t offset_in) 1285cabdff1aSopenharmony_ci{ 1286cabdff1aSopenharmony_ci __m256i wgt, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 1287cabdff1aSopenharmony_ci __m256i src0, src1, src2, src3, dst0, dst1, dst2, dst3; 1288cabdff1aSopenharmony_ci __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset; 1289cabdff1aSopenharmony_ci ptrdiff_t stride_2x = stride << 1; 1290cabdff1aSopenharmony_ci ptrdiff_t stride_4x = stride << 2; 1291cabdff1aSopenharmony_ci ptrdiff_t stride_3x = stride_2x + stride; 1292cabdff1aSopenharmony_ci uint8_t* dst_tmp = dst; 1293cabdff1aSopenharmony_ci 1294cabdff1aSopenharmony_ci offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom; 1295cabdff1aSopenharmony_ci offset_in += ((weight_src + weight_dst) << 7); 1296cabdff1aSopenharmony_ci log2_denom += 1; 1297cabdff1aSopenharmony_ci 1298cabdff1aSopenharmony_ci tmp0 = __lasx_xvreplgr2vr_b(weight_src); 1299cabdff1aSopenharmony_ci tmp1 = __lasx_xvreplgr2vr_b(weight_dst); 1300cabdff1aSopenharmony_ci wgt = __lasx_xvilvh_b(tmp1, tmp0); 1301cabdff1aSopenharmony_ci offset = __lasx_xvreplgr2vr_h(offset_in); 1302cabdff1aSopenharmony_ci denom = __lasx_xvreplgr2vr_h(log2_denom); 1303cabdff1aSopenharmony_ci 1304cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, 1305cabdff1aSopenharmony_ci src, stride_3x, tmp0, tmp1, tmp2, tmp3); 1306cabdff1aSopenharmony_ci src += stride_4x; 1307cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); 1308cabdff1aSopenharmony_ci src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20); 1309cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, 1310cabdff1aSopenharmony_ci src, stride_3x, tmp0, tmp1, tmp2, tmp3); 1311cabdff1aSopenharmony_ci src += stride_4x; 1312cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); 1313cabdff1aSopenharmony_ci src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20); 1314cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, 1315cabdff1aSopenharmony_ci src, stride_3x, tmp0, tmp1, tmp2, tmp3); 1316cabdff1aSopenharmony_ci src += stride_4x; 1317cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); 1318cabdff1aSopenharmony_ci src2 = __lasx_xvpermi_q(tmp1, tmp0, 0x20); 1319cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, 1320cabdff1aSopenharmony_ci src, stride_3x, tmp0, tmp1, tmp2, tmp3); 1321cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); 1322cabdff1aSopenharmony_ci src3 = __lasx_xvpermi_q(tmp1, tmp0, 0x20); 1323cabdff1aSopenharmony_ci 1324cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x, 1325cabdff1aSopenharmony_ci dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3); 1326cabdff1aSopenharmony_ci dst_tmp += stride_4x; 1327cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); 1328cabdff1aSopenharmony_ci dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20); 1329cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x, 1330cabdff1aSopenharmony_ci dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3); 1331cabdff1aSopenharmony_ci dst_tmp += stride_4x; 1332cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); 1333cabdff1aSopenharmony_ci dst1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20); 1334cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x, 1335cabdff1aSopenharmony_ci dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3); 1336cabdff1aSopenharmony_ci dst_tmp += stride_4x; 1337cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); 1338cabdff1aSopenharmony_ci dst2 = __lasx_xvpermi_q(tmp1, tmp0, 0x20); 1339cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x, 1340cabdff1aSopenharmony_ci dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3); 1341cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); 1342cabdff1aSopenharmony_ci dst3 = __lasx_xvpermi_q(tmp1, tmp0, 0x20); 1343cabdff1aSopenharmony_ci 1344cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, src2, 128, src3, 128, 1345cabdff1aSopenharmony_ci src0, src1, src2, src3); 1346cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvxori_b, dst0, 128, dst1, 128, dst2, 128, dst3, 128, 1347cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3); 1348cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, dst2, src2, 1349cabdff1aSopenharmony_ci dst3, src3, vec0, vec2, vec4, vec6); 1350cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, dst2, src2, 1351cabdff1aSopenharmony_ci dst3, src3, vec1, vec3, vec5, vec7); 1352cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1, 1353cabdff1aSopenharmony_ci offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3); 1354cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvdp2add_h_b,offset, wgt, vec4, offset, wgt, vec5, 1355cabdff1aSopenharmony_ci offset, wgt, vec6, offset, wgt, vec7, tmp4, tmp5, tmp6, tmp7); 1356cabdff1aSopenharmony_ci tmp0 = __lasx_xvsra_h(tmp0, denom); 1357cabdff1aSopenharmony_ci tmp1 = __lasx_xvsra_h(tmp1, denom); 1358cabdff1aSopenharmony_ci tmp2 = __lasx_xvsra_h(tmp2, denom); 1359cabdff1aSopenharmony_ci tmp3 = __lasx_xvsra_h(tmp3, denom); 1360cabdff1aSopenharmony_ci tmp4 = __lasx_xvsra_h(tmp4, denom); 1361cabdff1aSopenharmony_ci tmp5 = __lasx_xvsra_h(tmp5, denom); 1362cabdff1aSopenharmony_ci tmp6 = __lasx_xvsra_h(tmp6, denom); 1363cabdff1aSopenharmony_ci tmp7 = __lasx_xvsra_h(tmp7, denom); 1364cabdff1aSopenharmony_ci DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3, 1365cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 1366cabdff1aSopenharmony_ci DUP4_ARG1(__lasx_xvclip255_h, tmp4, tmp5, tmp6, tmp7, 1367cabdff1aSopenharmony_ci tmp4, tmp5, tmp6, tmp7); 1368cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, 1369cabdff1aSopenharmony_ci dst0, dst1, dst2, dst3) 1370cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst0, dst, 0, 0); 1371cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst0, dst + stride, 0, 1); 1372cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2); 1373cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3); 1374cabdff1aSopenharmony_ci dst += stride_4x; 1375cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst1, dst, 0, 0); 1376cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst1, dst + stride, 0, 1); 1377cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst1, dst + stride_2x, 0, 2); 1378cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst1, dst + stride_3x, 0, 3); 1379cabdff1aSopenharmony_ci dst += stride_4x; 1380cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst2, dst, 0, 0); 1381cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst2, dst + stride, 0, 1); 1382cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst2, dst + stride_2x, 0, 2); 1383cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst2, dst + stride_3x, 0, 3); 1384cabdff1aSopenharmony_ci dst += stride_4x; 1385cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst3, dst, 0, 0); 1386cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst3, dst + stride, 0, 1); 1387cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst3, dst + stride_2x, 0, 2); 1388cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst3, dst + stride_3x, 0, 3); 1389cabdff1aSopenharmony_ci} 1390cabdff1aSopenharmony_ci 1391cabdff1aSopenharmony_civoid ff_biweight_h264_pixels8_8_lasx(uint8_t *dst, uint8_t *src, 1392cabdff1aSopenharmony_ci ptrdiff_t stride, int height, 1393cabdff1aSopenharmony_ci int log2_denom, int weight_dst, 1394cabdff1aSopenharmony_ci int weight_src, int offset) 1395cabdff1aSopenharmony_ci{ 1396cabdff1aSopenharmony_ci if (4 == height) { 1397cabdff1aSopenharmony_ci avc_biwgt_8x4_lasx(src, dst, stride, log2_denom, weight_src, weight_dst, 1398cabdff1aSopenharmony_ci offset); 1399cabdff1aSopenharmony_ci } else if (8 == height) { 1400cabdff1aSopenharmony_ci avc_biwgt_8x8_lasx(src, dst, stride, log2_denom, weight_src, weight_dst, 1401cabdff1aSopenharmony_ci offset); 1402cabdff1aSopenharmony_ci } else { 1403cabdff1aSopenharmony_ci avc_biwgt_8x16_lasx(src, dst, stride, log2_denom, weight_src, weight_dst, 1404cabdff1aSopenharmony_ci offset); 1405cabdff1aSopenharmony_ci } 1406cabdff1aSopenharmony_ci} 1407cabdff1aSopenharmony_ci 1408cabdff1aSopenharmony_cistatic void avc_biwgt_4x2_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 1409cabdff1aSopenharmony_ci int32_t log2_denom, int32_t weight_src, 1410cabdff1aSopenharmony_ci int32_t weight_dst, int32_t offset_in) 1411cabdff1aSopenharmony_ci{ 1412cabdff1aSopenharmony_ci __m256i wgt, vec0; 1413cabdff1aSopenharmony_ci __m256i src0, dst0; 1414cabdff1aSopenharmony_ci __m256i tmp0, tmp1, denom, offset; 1415cabdff1aSopenharmony_ci 1416cabdff1aSopenharmony_ci offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom; 1417cabdff1aSopenharmony_ci offset_in += ((weight_src + weight_dst) << 7); 1418cabdff1aSopenharmony_ci log2_denom += 1; 1419cabdff1aSopenharmony_ci 1420cabdff1aSopenharmony_ci tmp0 = __lasx_xvreplgr2vr_b(weight_src); 1421cabdff1aSopenharmony_ci tmp1 = __lasx_xvreplgr2vr_b(weight_dst); 1422cabdff1aSopenharmony_ci wgt = __lasx_xvilvh_b(tmp1, tmp0); 1423cabdff1aSopenharmony_ci offset = __lasx_xvreplgr2vr_h(offset_in); 1424cabdff1aSopenharmony_ci denom = __lasx_xvreplgr2vr_h(log2_denom); 1425cabdff1aSopenharmony_ci 1426cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, src, 0, src, stride, tmp0, tmp1); 1427cabdff1aSopenharmony_ci src0 = __lasx_xvilvl_w(tmp1, tmp0); 1428cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, dst, 0, dst, stride, tmp0, tmp1); 1429cabdff1aSopenharmony_ci dst0 = __lasx_xvilvl_w(tmp1, tmp0); 1430cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0); 1431cabdff1aSopenharmony_ci vec0 = __lasx_xvilvl_b(dst0, src0); 1432cabdff1aSopenharmony_ci tmp0 = __lasx_xvdp2add_h_b(offset, wgt, vec0); 1433cabdff1aSopenharmony_ci tmp0 = __lasx_xvsra_h(tmp0, denom); 1434cabdff1aSopenharmony_ci tmp0 = __lasx_xvclip255_h(tmp0); 1435cabdff1aSopenharmony_ci tmp0 = __lasx_xvpickev_b(tmp0, tmp0); 1436cabdff1aSopenharmony_ci __lasx_xvstelm_w(tmp0, dst, 0, 0); 1437cabdff1aSopenharmony_ci __lasx_xvstelm_w(tmp0, dst + stride, 0, 1); 1438cabdff1aSopenharmony_ci} 1439cabdff1aSopenharmony_ci 1440cabdff1aSopenharmony_cistatic void avc_biwgt_4x4_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 1441cabdff1aSopenharmony_ci int32_t log2_denom, int32_t weight_src, 1442cabdff1aSopenharmony_ci int32_t weight_dst, int32_t offset_in) 1443cabdff1aSopenharmony_ci{ 1444cabdff1aSopenharmony_ci __m256i wgt, vec0; 1445cabdff1aSopenharmony_ci __m256i src0, dst0; 1446cabdff1aSopenharmony_ci __m256i tmp0, tmp1, tmp2, tmp3, denom, offset; 1447cabdff1aSopenharmony_ci ptrdiff_t stride_2x = stride << 1; 1448cabdff1aSopenharmony_ci ptrdiff_t stride_3x = stride_2x + stride; 1449cabdff1aSopenharmony_ci 1450cabdff1aSopenharmony_ci offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom; 1451cabdff1aSopenharmony_ci offset_in += ((weight_src + weight_dst) << 7); 1452cabdff1aSopenharmony_ci log2_denom += 1; 1453cabdff1aSopenharmony_ci 1454cabdff1aSopenharmony_ci tmp0 = __lasx_xvreplgr2vr_b(weight_src); 1455cabdff1aSopenharmony_ci tmp1 = __lasx_xvreplgr2vr_b(weight_dst); 1456cabdff1aSopenharmony_ci wgt = __lasx_xvilvh_b(tmp1, tmp0); 1457cabdff1aSopenharmony_ci offset = __lasx_xvreplgr2vr_h(offset_in); 1458cabdff1aSopenharmony_ci denom = __lasx_xvreplgr2vr_h(log2_denom); 1459cabdff1aSopenharmony_ci 1460cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, 1461cabdff1aSopenharmony_ci src, stride_3x, tmp0, tmp1, tmp2, tmp3); 1462cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp0, tmp1); 1463cabdff1aSopenharmony_ci src0 = __lasx_xvilvl_w(tmp1, tmp0); 1464cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, 1465cabdff1aSopenharmony_ci dst, stride_3x, tmp0, tmp1, tmp2, tmp3); 1466cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp0, tmp1); 1467cabdff1aSopenharmony_ci dst0 = __lasx_xvilvl_w(tmp1, tmp0); 1468cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0); 1469cabdff1aSopenharmony_ci vec0 = __lasx_xvilvl_b(dst0, src0); 1470cabdff1aSopenharmony_ci dst0 = __lasx_xvilvh_b(dst0, src0); 1471cabdff1aSopenharmony_ci vec0 = __lasx_xvpermi_q(vec0, dst0, 0x02); 1472cabdff1aSopenharmony_ci tmp0 = __lasx_xvdp2add_h_b(offset, wgt, vec0); 1473cabdff1aSopenharmony_ci tmp0 = __lasx_xvsra_h(tmp0, denom); 1474cabdff1aSopenharmony_ci tmp0 = __lasx_xvclip255_h(tmp0); 1475cabdff1aSopenharmony_ci tmp0 = __lasx_xvpickev_b(tmp0, tmp0); 1476cabdff1aSopenharmony_ci __lasx_xvstelm_w(tmp0, dst, 0, 0); 1477cabdff1aSopenharmony_ci __lasx_xvstelm_w(tmp0, dst + stride, 0, 1); 1478cabdff1aSopenharmony_ci __lasx_xvstelm_w(tmp0, dst + stride_2x, 0, 4); 1479cabdff1aSopenharmony_ci __lasx_xvstelm_w(tmp0, dst + stride_3x, 0, 5); 1480cabdff1aSopenharmony_ci} 1481cabdff1aSopenharmony_ci 1482cabdff1aSopenharmony_cistatic void avc_biwgt_4x8_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 1483cabdff1aSopenharmony_ci int32_t log2_denom, int32_t weight_src, 1484cabdff1aSopenharmony_ci int32_t weight_dst, int32_t offset_in) 1485cabdff1aSopenharmony_ci{ 1486cabdff1aSopenharmony_ci __m256i wgt, vec0, vec1; 1487cabdff1aSopenharmony_ci __m256i src0, dst0; 1488cabdff1aSopenharmony_ci __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset; 1489cabdff1aSopenharmony_ci ptrdiff_t stride_2x = stride << 1; 1490cabdff1aSopenharmony_ci ptrdiff_t stride_4x = stride << 2; 1491cabdff1aSopenharmony_ci ptrdiff_t stride_3x = stride_2x + stride; 1492cabdff1aSopenharmony_ci 1493cabdff1aSopenharmony_ci offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom; 1494cabdff1aSopenharmony_ci offset_in += ((weight_src + weight_dst) << 7); 1495cabdff1aSopenharmony_ci log2_denom += 1; 1496cabdff1aSopenharmony_ci 1497cabdff1aSopenharmony_ci tmp0 = __lasx_xvreplgr2vr_b(weight_src); 1498cabdff1aSopenharmony_ci tmp1 = __lasx_xvreplgr2vr_b(weight_dst); 1499cabdff1aSopenharmony_ci wgt = __lasx_xvilvh_b(tmp1, tmp0); 1500cabdff1aSopenharmony_ci offset = __lasx_xvreplgr2vr_h(offset_in); 1501cabdff1aSopenharmony_ci denom = __lasx_xvreplgr2vr_h(log2_denom); 1502cabdff1aSopenharmony_ci 1503cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, 1504cabdff1aSopenharmony_ci src, stride_3x, tmp0, tmp1, tmp2, tmp3); 1505cabdff1aSopenharmony_ci src += stride_4x; 1506cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, 1507cabdff1aSopenharmony_ci src, stride_3x, tmp4, tmp5, tmp6, tmp7); 1508cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp6, tmp4, tmp7, tmp5, 1509cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 1510cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_w, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); 1511cabdff1aSopenharmony_ci src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20); 1512cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, 1513cabdff1aSopenharmony_ci dst, stride_3x, tmp0, tmp1, tmp2, tmp3); 1514cabdff1aSopenharmony_ci dst += stride_4x; 1515cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, 1516cabdff1aSopenharmony_ci dst, stride_3x, tmp4, tmp5, tmp6, tmp7); 1517cabdff1aSopenharmony_ci dst -= stride_4x; 1518cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp6, tmp4, tmp7, tmp5, 1519cabdff1aSopenharmony_ci tmp0, tmp1, tmp2, tmp3); 1520cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_w, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); 1521cabdff1aSopenharmony_ci dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20); 1522cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0); 1523cabdff1aSopenharmony_ci vec0 = __lasx_xvilvl_b(dst0, src0); 1524cabdff1aSopenharmony_ci vec1 = __lasx_xvilvh_b(dst0, src0); 1525cabdff1aSopenharmony_ci DUP2_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1, 1526cabdff1aSopenharmony_ci tmp0, tmp1); 1527cabdff1aSopenharmony_ci tmp0 = __lasx_xvsra_h(tmp0, denom); 1528cabdff1aSopenharmony_ci tmp1 = __lasx_xvsra_h(tmp1, denom); 1529cabdff1aSopenharmony_ci DUP2_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp0, tmp1); 1530cabdff1aSopenharmony_ci tmp0 = __lasx_xvpickev_b(tmp1, tmp0); 1531cabdff1aSopenharmony_ci __lasx_xvstelm_w(tmp0, dst, 0, 0); 1532cabdff1aSopenharmony_ci __lasx_xvstelm_w(tmp0, dst + stride, 0, 1); 1533cabdff1aSopenharmony_ci __lasx_xvstelm_w(tmp0, dst + stride_2x, 0, 2); 1534cabdff1aSopenharmony_ci __lasx_xvstelm_w(tmp0, dst + stride_3x, 0, 3); 1535cabdff1aSopenharmony_ci dst += stride_4x; 1536cabdff1aSopenharmony_ci __lasx_xvstelm_w(tmp0, dst, 0, 4); 1537cabdff1aSopenharmony_ci __lasx_xvstelm_w(tmp0, dst + stride, 0, 5); 1538cabdff1aSopenharmony_ci __lasx_xvstelm_w(tmp0, dst + stride_2x, 0, 6); 1539cabdff1aSopenharmony_ci __lasx_xvstelm_w(tmp0, dst + stride_3x, 0, 7); 1540cabdff1aSopenharmony_ci} 1541cabdff1aSopenharmony_ci 1542cabdff1aSopenharmony_civoid ff_biweight_h264_pixels4_8_lasx(uint8_t *dst, uint8_t *src, 1543cabdff1aSopenharmony_ci ptrdiff_t stride, int height, 1544cabdff1aSopenharmony_ci int log2_denom, int weight_dst, 1545cabdff1aSopenharmony_ci int weight_src, int offset) 1546cabdff1aSopenharmony_ci{ 1547cabdff1aSopenharmony_ci if (2 == height) { 1548cabdff1aSopenharmony_ci avc_biwgt_4x2_lasx(src, dst, stride, log2_denom, weight_src, 1549cabdff1aSopenharmony_ci weight_dst, offset); 1550cabdff1aSopenharmony_ci } else if (4 == height) { 1551cabdff1aSopenharmony_ci avc_biwgt_4x4_lasx(src, dst, stride, log2_denom, weight_src, 1552cabdff1aSopenharmony_ci weight_dst, offset); 1553cabdff1aSopenharmony_ci } else { 1554cabdff1aSopenharmony_ci avc_biwgt_4x8_lasx(src, dst, stride, log2_denom, weight_src, 1555cabdff1aSopenharmony_ci weight_dst, offset); 1556cabdff1aSopenharmony_ci } 1557cabdff1aSopenharmony_ci} 1558cabdff1aSopenharmony_ci 1559cabdff1aSopenharmony_civoid ff_weight_h264_pixels16_8_lasx(uint8_t *src, ptrdiff_t stride, 1560cabdff1aSopenharmony_ci int height, int log2_denom, 1561cabdff1aSopenharmony_ci int weight_src, int offset_in) 1562cabdff1aSopenharmony_ci{ 1563cabdff1aSopenharmony_ci uint32_t offset_val; 1564cabdff1aSopenharmony_ci ptrdiff_t stride_2x = stride << 1; 1565cabdff1aSopenharmony_ci ptrdiff_t stride_4x = stride << 2; 1566cabdff1aSopenharmony_ci ptrdiff_t stride_3x = stride_2x + stride; 1567cabdff1aSopenharmony_ci __m256i zero = __lasx_xvldi(0); 1568cabdff1aSopenharmony_ci __m256i src0, src1, src2, src3; 1569cabdff1aSopenharmony_ci __m256i src0_l, src1_l, src2_l, src3_l, src0_h, src1_h, src2_h, src3_h; 1570cabdff1aSopenharmony_ci __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 1571cabdff1aSopenharmony_ci __m256i wgt, denom, offset; 1572cabdff1aSopenharmony_ci 1573cabdff1aSopenharmony_ci offset_val = (unsigned) offset_in << log2_denom; 1574cabdff1aSopenharmony_ci 1575cabdff1aSopenharmony_ci wgt = __lasx_xvreplgr2vr_h(weight_src); 1576cabdff1aSopenharmony_ci offset = __lasx_xvreplgr2vr_h(offset_val); 1577cabdff1aSopenharmony_ci denom = __lasx_xvreplgr2vr_h(log2_denom); 1578cabdff1aSopenharmony_ci 1579cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, 1580cabdff1aSopenharmony_ci src, stride_3x, tmp0, tmp1, tmp2, tmp3); 1581cabdff1aSopenharmony_ci src += stride_4x; 1582cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, 1583cabdff1aSopenharmony_ci src, stride_3x, tmp4, tmp5, tmp6, tmp7); 1584cabdff1aSopenharmony_ci src -= stride_4x; 1585cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5, tmp4, 1586cabdff1aSopenharmony_ci 0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3); 1587cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, zero, src2, 1588cabdff1aSopenharmony_ci zero, src3, src0_l, src1_l, src2_l, src3_l); 1589cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, zero, src2, 1590cabdff1aSopenharmony_ci zero, src3, src0_h, src1_h, src2_h, src3_h); 1591cabdff1aSopenharmony_ci src0_l = __lasx_xvmul_h(wgt, src0_l); 1592cabdff1aSopenharmony_ci src0_h = __lasx_xvmul_h(wgt, src0_h); 1593cabdff1aSopenharmony_ci src1_l = __lasx_xvmul_h(wgt, src1_l); 1594cabdff1aSopenharmony_ci src1_h = __lasx_xvmul_h(wgt, src1_h); 1595cabdff1aSopenharmony_ci src2_l = __lasx_xvmul_h(wgt, src2_l); 1596cabdff1aSopenharmony_ci src2_h = __lasx_xvmul_h(wgt, src2_h); 1597cabdff1aSopenharmony_ci src3_l = __lasx_xvmul_h(wgt, src3_l); 1598cabdff1aSopenharmony_ci src3_h = __lasx_xvmul_h(wgt, src3_h); 1599cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l, offset, 1600cabdff1aSopenharmony_ci src1_h, offset, src0_l, src0_h, src1_l, src1_h); 1601cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvsadd_h, src2_l, offset, src2_h, offset, src3_l, offset, 1602cabdff1aSopenharmony_ci src3_h, offset, src2_l, src2_h, src3_l, src3_h); 1603cabdff1aSopenharmony_ci src0_l = __lasx_xvmaxi_h(src0_l, 0); 1604cabdff1aSopenharmony_ci src0_h = __lasx_xvmaxi_h(src0_h, 0); 1605cabdff1aSopenharmony_ci src1_l = __lasx_xvmaxi_h(src1_l, 0); 1606cabdff1aSopenharmony_ci src1_h = __lasx_xvmaxi_h(src1_h, 0); 1607cabdff1aSopenharmony_ci src2_l = __lasx_xvmaxi_h(src2_l, 0); 1608cabdff1aSopenharmony_ci src2_h = __lasx_xvmaxi_h(src2_h, 0); 1609cabdff1aSopenharmony_ci src3_l = __lasx_xvmaxi_h(src3_l, 0); 1610cabdff1aSopenharmony_ci src3_h = __lasx_xvmaxi_h(src3_h, 0); 1611cabdff1aSopenharmony_ci src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom); 1612cabdff1aSopenharmony_ci src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom); 1613cabdff1aSopenharmony_ci src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom); 1614cabdff1aSopenharmony_ci src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom); 1615cabdff1aSopenharmony_ci src2_l = __lasx_xvssrlrn_bu_h(src2_l, denom); 1616cabdff1aSopenharmony_ci src2_h = __lasx_xvssrlrn_bu_h(src2_h, denom); 1617cabdff1aSopenharmony_ci src3_l = __lasx_xvssrlrn_bu_h(src3_l, denom); 1618cabdff1aSopenharmony_ci src3_h = __lasx_xvssrlrn_bu_h(src3_h, denom); 1619cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0_l, src, 0, 0); 1620cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0_h, src, 8, 0); 1621cabdff1aSopenharmony_ci src += stride; 1622cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0_l, src, 0, 2); 1623cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0_h, src, 8, 2); 1624cabdff1aSopenharmony_ci src += stride; 1625cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1_l, src, 0, 0); 1626cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1_h, src, 8, 0); 1627cabdff1aSopenharmony_ci src += stride; 1628cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1_l, src, 0, 2); 1629cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1_h, src, 8, 2); 1630cabdff1aSopenharmony_ci src += stride; 1631cabdff1aSopenharmony_ci __lasx_xvstelm_d(src2_l, src, 0, 0); 1632cabdff1aSopenharmony_ci __lasx_xvstelm_d(src2_h, src, 8, 0); 1633cabdff1aSopenharmony_ci src += stride; 1634cabdff1aSopenharmony_ci __lasx_xvstelm_d(src2_l, src, 0, 2); 1635cabdff1aSopenharmony_ci __lasx_xvstelm_d(src2_h, src, 8, 2); 1636cabdff1aSopenharmony_ci src += stride; 1637cabdff1aSopenharmony_ci __lasx_xvstelm_d(src3_l, src, 0, 0); 1638cabdff1aSopenharmony_ci __lasx_xvstelm_d(src3_h, src, 8, 0); 1639cabdff1aSopenharmony_ci src += stride; 1640cabdff1aSopenharmony_ci __lasx_xvstelm_d(src3_l, src, 0, 2); 1641cabdff1aSopenharmony_ci __lasx_xvstelm_d(src3_h, src, 8, 2); 1642cabdff1aSopenharmony_ci src += stride; 1643cabdff1aSopenharmony_ci 1644cabdff1aSopenharmony_ci if (16 == height) { 1645cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, 1646cabdff1aSopenharmony_ci src, stride_3x, tmp0, tmp1, tmp2, tmp3); 1647cabdff1aSopenharmony_ci src += stride_4x; 1648cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, 1649cabdff1aSopenharmony_ci src, stride_3x, tmp4, tmp5, tmp6, tmp7); 1650cabdff1aSopenharmony_ci src -= stride_4x; 1651cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5, 1652cabdff1aSopenharmony_ci tmp4, 0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3); 1653cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, zero, src2, 1654cabdff1aSopenharmony_ci zero, src3, src0_l, src1_l, src2_l, src3_l); 1655cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, zero, src2, 1656cabdff1aSopenharmony_ci zero, src3, src0_h, src1_h, src2_h, src3_h); 1657cabdff1aSopenharmony_ci src0_l = __lasx_xvmul_h(wgt, src0_l); 1658cabdff1aSopenharmony_ci src0_h = __lasx_xvmul_h(wgt, src0_h); 1659cabdff1aSopenharmony_ci src1_l = __lasx_xvmul_h(wgt, src1_l); 1660cabdff1aSopenharmony_ci src1_h = __lasx_xvmul_h(wgt, src1_h); 1661cabdff1aSopenharmony_ci src2_l = __lasx_xvmul_h(wgt, src2_l); 1662cabdff1aSopenharmony_ci src2_h = __lasx_xvmul_h(wgt, src2_h); 1663cabdff1aSopenharmony_ci src3_l = __lasx_xvmul_h(wgt, src3_l); 1664cabdff1aSopenharmony_ci src3_h = __lasx_xvmul_h(wgt, src3_h); 1665cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l, 1666cabdff1aSopenharmony_ci offset, src1_h, offset, src0_l, src0_h, src1_l, src1_h); 1667cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvsadd_h, src2_l, offset, src2_h, offset, src3_l, 1668cabdff1aSopenharmony_ci offset, src3_h, offset, src2_l, src2_h, src3_l, src3_h); 1669cabdff1aSopenharmony_ci src0_l = __lasx_xvmaxi_h(src0_l, 0); 1670cabdff1aSopenharmony_ci src0_h = __lasx_xvmaxi_h(src0_h, 0); 1671cabdff1aSopenharmony_ci src1_l = __lasx_xvmaxi_h(src1_l, 0); 1672cabdff1aSopenharmony_ci src1_h = __lasx_xvmaxi_h(src1_h, 0); 1673cabdff1aSopenharmony_ci src2_l = __lasx_xvmaxi_h(src2_l, 0); 1674cabdff1aSopenharmony_ci src2_h = __lasx_xvmaxi_h(src2_h, 0); 1675cabdff1aSopenharmony_ci src3_l = __lasx_xvmaxi_h(src3_l, 0); 1676cabdff1aSopenharmony_ci src3_h = __lasx_xvmaxi_h(src3_h, 0); 1677cabdff1aSopenharmony_ci src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom); 1678cabdff1aSopenharmony_ci src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom); 1679cabdff1aSopenharmony_ci src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom); 1680cabdff1aSopenharmony_ci src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom); 1681cabdff1aSopenharmony_ci src2_l = __lasx_xvssrlrn_bu_h(src2_l, denom); 1682cabdff1aSopenharmony_ci src2_h = __lasx_xvssrlrn_bu_h(src2_h, denom); 1683cabdff1aSopenharmony_ci src3_l = __lasx_xvssrlrn_bu_h(src3_l, denom); 1684cabdff1aSopenharmony_ci src3_h = __lasx_xvssrlrn_bu_h(src3_h, denom); 1685cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0_l, src, 0, 0); 1686cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0_h, src, 8, 0); 1687cabdff1aSopenharmony_ci src += stride; 1688cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0_l, src, 0, 2); 1689cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0_h, src, 8, 2); 1690cabdff1aSopenharmony_ci src += stride; 1691cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1_l, src, 0, 0); 1692cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1_h, src, 8, 0); 1693cabdff1aSopenharmony_ci src += stride; 1694cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1_l, src, 0, 2); 1695cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1_h, src, 8, 2); 1696cabdff1aSopenharmony_ci src += stride; 1697cabdff1aSopenharmony_ci __lasx_xvstelm_d(src2_l, src, 0, 0); 1698cabdff1aSopenharmony_ci __lasx_xvstelm_d(src2_h, src, 8, 0); 1699cabdff1aSopenharmony_ci src += stride; 1700cabdff1aSopenharmony_ci __lasx_xvstelm_d(src2_l, src, 0, 2); 1701cabdff1aSopenharmony_ci __lasx_xvstelm_d(src2_h, src, 8, 2); 1702cabdff1aSopenharmony_ci src += stride; 1703cabdff1aSopenharmony_ci __lasx_xvstelm_d(src3_l, src, 0, 0); 1704cabdff1aSopenharmony_ci __lasx_xvstelm_d(src3_h, src, 8, 0); 1705cabdff1aSopenharmony_ci src += stride; 1706cabdff1aSopenharmony_ci __lasx_xvstelm_d(src3_l, src, 0, 2); 1707cabdff1aSopenharmony_ci __lasx_xvstelm_d(src3_h, src, 8, 2); 1708cabdff1aSopenharmony_ci } 1709cabdff1aSopenharmony_ci} 1710cabdff1aSopenharmony_ci 1711cabdff1aSopenharmony_cistatic void avc_wgt_8x4_lasx(uint8_t *src, ptrdiff_t stride, 1712cabdff1aSopenharmony_ci int32_t log2_denom, int32_t weight_src, 1713cabdff1aSopenharmony_ci int32_t offset_in) 1714cabdff1aSopenharmony_ci{ 1715cabdff1aSopenharmony_ci uint32_t offset_val; 1716cabdff1aSopenharmony_ci ptrdiff_t stride_2x = stride << 1; 1717cabdff1aSopenharmony_ci ptrdiff_t stride_3x = stride_2x + stride; 1718cabdff1aSopenharmony_ci __m256i wgt, zero = __lasx_xvldi(0); 1719cabdff1aSopenharmony_ci __m256i src0, src0_h, src0_l; 1720cabdff1aSopenharmony_ci __m256i tmp0, tmp1, tmp2, tmp3, denom, offset; 1721cabdff1aSopenharmony_ci 1722cabdff1aSopenharmony_ci offset_val = (unsigned) offset_in << log2_denom; 1723cabdff1aSopenharmony_ci 1724cabdff1aSopenharmony_ci wgt = __lasx_xvreplgr2vr_h(weight_src); 1725cabdff1aSopenharmony_ci offset = __lasx_xvreplgr2vr_h(offset_val); 1726cabdff1aSopenharmony_ci denom = __lasx_xvreplgr2vr_h(log2_denom); 1727cabdff1aSopenharmony_ci 1728cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, 1729cabdff1aSopenharmony_ci src, stride_3x, tmp0, tmp1, tmp2, tmp3); 1730cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); 1731cabdff1aSopenharmony_ci src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20); 1732cabdff1aSopenharmony_ci src0_l = __lasx_xvilvl_b(zero, src0); 1733cabdff1aSopenharmony_ci src0_h = __lasx_xvilvh_b(zero, src0); 1734cabdff1aSopenharmony_ci src0_l = __lasx_xvmul_h(wgt, src0_l); 1735cabdff1aSopenharmony_ci src0_h = __lasx_xvmul_h(wgt, src0_h); 1736cabdff1aSopenharmony_ci src0_l = __lasx_xvsadd_h(src0_l, offset); 1737cabdff1aSopenharmony_ci src0_h = __lasx_xvsadd_h(src0_h, offset); 1738cabdff1aSopenharmony_ci src0_l = __lasx_xvmaxi_h(src0_l, 0); 1739cabdff1aSopenharmony_ci src0_h = __lasx_xvmaxi_h(src0_h, 0); 1740cabdff1aSopenharmony_ci src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom); 1741cabdff1aSopenharmony_ci src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom); 1742cabdff1aSopenharmony_ci 1743cabdff1aSopenharmony_ci src0 = __lasx_xvpickev_d(src0_h, src0_l); 1744cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, src, 0, 0); 1745cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, src + stride, 0, 1); 1746cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, src + stride_2x, 0, 2); 1747cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, src + stride_3x, 0, 3); 1748cabdff1aSopenharmony_ci} 1749cabdff1aSopenharmony_ci 1750cabdff1aSopenharmony_cistatic void avc_wgt_8x8_lasx(uint8_t *src, ptrdiff_t stride, int32_t log2_denom, 1751cabdff1aSopenharmony_ci int32_t src_weight, int32_t offset_in) 1752cabdff1aSopenharmony_ci{ 1753cabdff1aSopenharmony_ci __m256i src0, src1, src0_h, src0_l, src1_h, src1_l, zero = __lasx_xvldi(0); 1754cabdff1aSopenharmony_ci __m256i tmp0, tmp1, tmp2, tmp3, denom, offset, wgt; 1755cabdff1aSopenharmony_ci uint32_t offset_val; 1756cabdff1aSopenharmony_ci uint8_t* src_tmp = src; 1757cabdff1aSopenharmony_ci ptrdiff_t stride_2x = stride << 1; 1758cabdff1aSopenharmony_ci ptrdiff_t stride_4x = stride << 2; 1759cabdff1aSopenharmony_ci ptrdiff_t stride_3x = stride_2x + stride; 1760cabdff1aSopenharmony_ci 1761cabdff1aSopenharmony_ci offset_val = (unsigned) offset_in << log2_denom; 1762cabdff1aSopenharmony_ci 1763cabdff1aSopenharmony_ci wgt = __lasx_xvreplgr2vr_h(src_weight); 1764cabdff1aSopenharmony_ci offset = __lasx_xvreplgr2vr_h(offset_val); 1765cabdff1aSopenharmony_ci denom = __lasx_xvreplgr2vr_h(log2_denom); 1766cabdff1aSopenharmony_ci 1767cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x, 1768cabdff1aSopenharmony_ci src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3); 1769cabdff1aSopenharmony_ci src_tmp += stride_4x; 1770cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); 1771cabdff1aSopenharmony_ci src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20); 1772cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x, 1773cabdff1aSopenharmony_ci src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3); 1774cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); 1775cabdff1aSopenharmony_ci src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20); 1776cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, src0_l, src1_l); 1777cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, src0_h, src1_h); 1778cabdff1aSopenharmony_ci src0_l = __lasx_xvmul_h(wgt, src0_l); 1779cabdff1aSopenharmony_ci src0_h = __lasx_xvmul_h(wgt, src0_h); 1780cabdff1aSopenharmony_ci src1_l = __lasx_xvmul_h(wgt, src1_l); 1781cabdff1aSopenharmony_ci src1_h = __lasx_xvmul_h(wgt, src1_h); 1782cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l, offset, 1783cabdff1aSopenharmony_ci src1_h, offset, src0_l, src0_h, src1_l, src1_h); 1784cabdff1aSopenharmony_ci src0_l = __lasx_xvmaxi_h(src0_l, 0); 1785cabdff1aSopenharmony_ci src0_h = __lasx_xvmaxi_h(src0_h, 0); 1786cabdff1aSopenharmony_ci src1_l = __lasx_xvmaxi_h(src1_l, 0); 1787cabdff1aSopenharmony_ci src1_h = __lasx_xvmaxi_h(src1_h, 0); 1788cabdff1aSopenharmony_ci src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom); 1789cabdff1aSopenharmony_ci src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom); 1790cabdff1aSopenharmony_ci src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom); 1791cabdff1aSopenharmony_ci src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom); 1792cabdff1aSopenharmony_ci 1793cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvpickev_d, src0_h, src0_l, src1_h, src1_l, src0, src1); 1794cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, src, 0, 0); 1795cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, src + stride, 0, 1); 1796cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, src + stride_2x, 0, 2); 1797cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, src + stride_3x, 0, 3); 1798cabdff1aSopenharmony_ci src += stride_4x; 1799cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1, src, 0, 0); 1800cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1, src + stride, 0, 1); 1801cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1, src + stride_2x, 0, 2); 1802cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1, src + stride_3x, 0, 3); 1803cabdff1aSopenharmony_ci} 1804cabdff1aSopenharmony_ci 1805cabdff1aSopenharmony_cistatic void avc_wgt_8x16_lasx(uint8_t *src, ptrdiff_t stride, 1806cabdff1aSopenharmony_ci int32_t log2_denom, int32_t src_weight, 1807cabdff1aSopenharmony_ci int32_t offset_in) 1808cabdff1aSopenharmony_ci{ 1809cabdff1aSopenharmony_ci __m256i src0, src1, src2, src3; 1810cabdff1aSopenharmony_ci __m256i src0_h, src0_l, src1_h, src1_l, src2_h, src2_l, src3_h, src3_l; 1811cabdff1aSopenharmony_ci __m256i tmp0, tmp1, tmp2, tmp3, denom, offset, wgt; 1812cabdff1aSopenharmony_ci __m256i zero = __lasx_xvldi(0); 1813cabdff1aSopenharmony_ci uint32_t offset_val; 1814cabdff1aSopenharmony_ci uint8_t* src_tmp = src; 1815cabdff1aSopenharmony_ci ptrdiff_t stride_2x = stride << 1; 1816cabdff1aSopenharmony_ci ptrdiff_t stride_4x = stride << 2; 1817cabdff1aSopenharmony_ci ptrdiff_t stride_3x = stride_2x + stride; 1818cabdff1aSopenharmony_ci 1819cabdff1aSopenharmony_ci offset_val = (unsigned) offset_in << log2_denom; 1820cabdff1aSopenharmony_ci 1821cabdff1aSopenharmony_ci wgt = __lasx_xvreplgr2vr_h(src_weight); 1822cabdff1aSopenharmony_ci offset = __lasx_xvreplgr2vr_h(offset_val); 1823cabdff1aSopenharmony_ci denom = __lasx_xvreplgr2vr_h(log2_denom); 1824cabdff1aSopenharmony_ci 1825cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x, 1826cabdff1aSopenharmony_ci src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3); 1827cabdff1aSopenharmony_ci src_tmp += stride_4x; 1828cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); 1829cabdff1aSopenharmony_ci src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20); 1830cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x, 1831cabdff1aSopenharmony_ci src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3); 1832cabdff1aSopenharmony_ci src_tmp += stride_4x; 1833cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); 1834cabdff1aSopenharmony_ci src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20); 1835cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x, 1836cabdff1aSopenharmony_ci src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3); 1837cabdff1aSopenharmony_ci src_tmp += stride_4x; 1838cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); 1839cabdff1aSopenharmony_ci src2 = __lasx_xvpermi_q(tmp1, tmp0, 0x20); 1840cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x, 1841cabdff1aSopenharmony_ci src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3); 1842cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); 1843cabdff1aSopenharmony_ci src3 = __lasx_xvpermi_q(tmp1, tmp0, 0x20); 1844cabdff1aSopenharmony_ci 1845cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, zero, src2, zero, src3, 1846cabdff1aSopenharmony_ci src0_l, src1_l, src2_l, src3_l); 1847cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, zero, src2, zero, src3, 1848cabdff1aSopenharmony_ci src0_h, src1_h, src2_h, src3_h); 1849cabdff1aSopenharmony_ci src0_l = __lasx_xvmul_h(wgt, src0_l); 1850cabdff1aSopenharmony_ci src0_h = __lasx_xvmul_h(wgt, src0_h); 1851cabdff1aSopenharmony_ci src1_l = __lasx_xvmul_h(wgt, src1_l); 1852cabdff1aSopenharmony_ci src1_h = __lasx_xvmul_h(wgt, src1_h); 1853cabdff1aSopenharmony_ci src2_l = __lasx_xvmul_h(wgt, src2_l); 1854cabdff1aSopenharmony_ci src2_h = __lasx_xvmul_h(wgt, src2_h); 1855cabdff1aSopenharmony_ci src3_l = __lasx_xvmul_h(wgt, src3_l); 1856cabdff1aSopenharmony_ci src3_h = __lasx_xvmul_h(wgt, src3_h); 1857cabdff1aSopenharmony_ci 1858cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l, offset, 1859cabdff1aSopenharmony_ci src1_h, offset, src0_l, src0_h, src1_l, src1_h); 1860cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvsadd_h, src2_l, offset, src2_h, offset, src3_l, offset, 1861cabdff1aSopenharmony_ci src3_h, offset, src2_l, src2_h, src3_l, src3_h); 1862cabdff1aSopenharmony_ci 1863cabdff1aSopenharmony_ci src0_l = __lasx_xvmaxi_h(src0_l, 0); 1864cabdff1aSopenharmony_ci src0_h = __lasx_xvmaxi_h(src0_h, 0); 1865cabdff1aSopenharmony_ci src1_l = __lasx_xvmaxi_h(src1_l, 0); 1866cabdff1aSopenharmony_ci src1_h = __lasx_xvmaxi_h(src1_h, 0); 1867cabdff1aSopenharmony_ci src2_l = __lasx_xvmaxi_h(src2_l, 0); 1868cabdff1aSopenharmony_ci src2_h = __lasx_xvmaxi_h(src2_h, 0); 1869cabdff1aSopenharmony_ci src3_l = __lasx_xvmaxi_h(src3_l, 0); 1870cabdff1aSopenharmony_ci src3_h = __lasx_xvmaxi_h(src3_h, 0); 1871cabdff1aSopenharmony_ci src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom); 1872cabdff1aSopenharmony_ci src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom); 1873cabdff1aSopenharmony_ci src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom); 1874cabdff1aSopenharmony_ci src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom); 1875cabdff1aSopenharmony_ci src2_l = __lasx_xvssrlrn_bu_h(src2_l, denom); 1876cabdff1aSopenharmony_ci src2_h = __lasx_xvssrlrn_bu_h(src2_h, denom); 1877cabdff1aSopenharmony_ci src3_l = __lasx_xvssrlrn_bu_h(src3_l, denom); 1878cabdff1aSopenharmony_ci src3_h = __lasx_xvssrlrn_bu_h(src3_h, denom); 1879cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvpickev_d, src0_h, src0_l, src1_h, src1_l, src2_h, src2_l, 1880cabdff1aSopenharmony_ci src3_h, src3_l, src0, src1, src2, src3); 1881cabdff1aSopenharmony_ci 1882cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, src, 0, 0); 1883cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, src + stride, 0, 1); 1884cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, src + stride_2x, 0, 2); 1885cabdff1aSopenharmony_ci __lasx_xvstelm_d(src0, src + stride_3x, 0, 3); 1886cabdff1aSopenharmony_ci src += stride_4x; 1887cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1, src, 0, 0); 1888cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1, src + stride, 0, 1); 1889cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1, src + stride_2x, 0, 2); 1890cabdff1aSopenharmony_ci __lasx_xvstelm_d(src1, src + stride_3x, 0, 3); 1891cabdff1aSopenharmony_ci src += stride_4x; 1892cabdff1aSopenharmony_ci __lasx_xvstelm_d(src2, src, 0, 0); 1893cabdff1aSopenharmony_ci __lasx_xvstelm_d(src2, src + stride, 0, 1); 1894cabdff1aSopenharmony_ci __lasx_xvstelm_d(src2, src + stride_2x, 0, 2); 1895cabdff1aSopenharmony_ci __lasx_xvstelm_d(src2, src + stride_3x, 0, 3); 1896cabdff1aSopenharmony_ci src += stride_4x; 1897cabdff1aSopenharmony_ci __lasx_xvstelm_d(src3, src, 0, 0); 1898cabdff1aSopenharmony_ci __lasx_xvstelm_d(src3, src + stride, 0, 1); 1899cabdff1aSopenharmony_ci __lasx_xvstelm_d(src3, src + stride_2x, 0, 2); 1900cabdff1aSopenharmony_ci __lasx_xvstelm_d(src3, src + stride_3x, 0, 3); 1901cabdff1aSopenharmony_ci} 1902cabdff1aSopenharmony_ci 1903cabdff1aSopenharmony_civoid ff_weight_h264_pixels8_8_lasx(uint8_t *src, ptrdiff_t stride, 1904cabdff1aSopenharmony_ci int height, int log2_denom, 1905cabdff1aSopenharmony_ci int weight_src, int offset) 1906cabdff1aSopenharmony_ci{ 1907cabdff1aSopenharmony_ci if (4 == height) { 1908cabdff1aSopenharmony_ci avc_wgt_8x4_lasx(src, stride, log2_denom, weight_src, offset); 1909cabdff1aSopenharmony_ci } else if (8 == height) { 1910cabdff1aSopenharmony_ci avc_wgt_8x8_lasx(src, stride, log2_denom, weight_src, offset); 1911cabdff1aSopenharmony_ci } else { 1912cabdff1aSopenharmony_ci avc_wgt_8x16_lasx(src, stride, log2_denom, weight_src, offset); 1913cabdff1aSopenharmony_ci } 1914cabdff1aSopenharmony_ci} 1915cabdff1aSopenharmony_ci 1916cabdff1aSopenharmony_cistatic void avc_wgt_4x2_lasx(uint8_t *src, ptrdiff_t stride, 1917cabdff1aSopenharmony_ci int32_t log2_denom, int32_t weight_src, 1918cabdff1aSopenharmony_ci int32_t offset_in) 1919cabdff1aSopenharmony_ci{ 1920cabdff1aSopenharmony_ci uint32_t offset_val; 1921cabdff1aSopenharmony_ci __m256i wgt, zero = __lasx_xvldi(0); 1922cabdff1aSopenharmony_ci __m256i src0, tmp0, tmp1, denom, offset; 1923cabdff1aSopenharmony_ci 1924cabdff1aSopenharmony_ci offset_val = (unsigned) offset_in << log2_denom; 1925cabdff1aSopenharmony_ci 1926cabdff1aSopenharmony_ci wgt = __lasx_xvreplgr2vr_h(weight_src); 1927cabdff1aSopenharmony_ci offset = __lasx_xvreplgr2vr_h(offset_val); 1928cabdff1aSopenharmony_ci denom = __lasx_xvreplgr2vr_h(log2_denom); 1929cabdff1aSopenharmony_ci 1930cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvldx, src, 0, src, stride, tmp0, tmp1); 1931cabdff1aSopenharmony_ci src0 = __lasx_xvilvl_w(tmp1, tmp0); 1932cabdff1aSopenharmony_ci src0 = __lasx_xvilvl_b(zero, src0); 1933cabdff1aSopenharmony_ci src0 = __lasx_xvmul_h(wgt, src0); 1934cabdff1aSopenharmony_ci src0 = __lasx_xvsadd_h(src0, offset); 1935cabdff1aSopenharmony_ci src0 = __lasx_xvmaxi_h(src0, 0); 1936cabdff1aSopenharmony_ci src0 = __lasx_xvssrlrn_bu_h(src0, denom); 1937cabdff1aSopenharmony_ci __lasx_xvstelm_w(src0, src, 0, 0); 1938cabdff1aSopenharmony_ci __lasx_xvstelm_w(src0, src + stride, 0, 1); 1939cabdff1aSopenharmony_ci} 1940cabdff1aSopenharmony_ci 1941cabdff1aSopenharmony_cistatic void avc_wgt_4x4_lasx(uint8_t *src, ptrdiff_t stride, 1942cabdff1aSopenharmony_ci int32_t log2_denom, int32_t weight_src, 1943cabdff1aSopenharmony_ci int32_t offset_in) 1944cabdff1aSopenharmony_ci{ 1945cabdff1aSopenharmony_ci __m256i wgt; 1946cabdff1aSopenharmony_ci __m256i src0, tmp0, tmp1, tmp2, tmp3, denom, offset; 1947cabdff1aSopenharmony_ci uint32_t offset_val; 1948cabdff1aSopenharmony_ci ptrdiff_t stride_2x = stride << 1; 1949cabdff1aSopenharmony_ci ptrdiff_t stride_3x = stride_2x + stride; 1950cabdff1aSopenharmony_ci 1951cabdff1aSopenharmony_ci offset_val = (unsigned) offset_in << log2_denom; 1952cabdff1aSopenharmony_ci 1953cabdff1aSopenharmony_ci wgt = __lasx_xvreplgr2vr_h(weight_src); 1954cabdff1aSopenharmony_ci offset = __lasx_xvreplgr2vr_h(offset_val); 1955cabdff1aSopenharmony_ci denom = __lasx_xvreplgr2vr_h(log2_denom); 1956cabdff1aSopenharmony_ci 1957cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, 1958cabdff1aSopenharmony_ci src, stride_3x, tmp0, tmp1, tmp2, tmp3); 1959cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp0, tmp1); 1960cabdff1aSopenharmony_ci src0 = __lasx_xvilvl_w(tmp1, tmp0); 1961cabdff1aSopenharmony_ci src0 = __lasx_vext2xv_hu_bu(src0); 1962cabdff1aSopenharmony_ci src0 = __lasx_xvmul_h(wgt, src0); 1963cabdff1aSopenharmony_ci src0 = __lasx_xvsadd_h(src0, offset); 1964cabdff1aSopenharmony_ci src0 = __lasx_xvmaxi_h(src0, 0); 1965cabdff1aSopenharmony_ci src0 = __lasx_xvssrlrn_bu_h(src0, denom); 1966cabdff1aSopenharmony_ci __lasx_xvstelm_w(src0, src, 0, 0); 1967cabdff1aSopenharmony_ci __lasx_xvstelm_w(src0, src + stride, 0, 1); 1968cabdff1aSopenharmony_ci __lasx_xvstelm_w(src0, src + stride_2x, 0, 4); 1969cabdff1aSopenharmony_ci __lasx_xvstelm_w(src0, src + stride_3x, 0, 5); 1970cabdff1aSopenharmony_ci} 1971cabdff1aSopenharmony_ci 1972cabdff1aSopenharmony_cistatic void avc_wgt_4x8_lasx(uint8_t *src, ptrdiff_t stride, 1973cabdff1aSopenharmony_ci int32_t log2_denom, int32_t weight_src, 1974cabdff1aSopenharmony_ci int32_t offset_in) 1975cabdff1aSopenharmony_ci{ 1976cabdff1aSopenharmony_ci __m256i src0, src0_h, src0_l; 1977cabdff1aSopenharmony_ci __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset; 1978cabdff1aSopenharmony_ci __m256i wgt, zero = __lasx_xvldi(0); 1979cabdff1aSopenharmony_ci uint32_t offset_val; 1980cabdff1aSopenharmony_ci ptrdiff_t stride_2x = stride << 1; 1981cabdff1aSopenharmony_ci ptrdiff_t stride_4x = stride << 2; 1982cabdff1aSopenharmony_ci ptrdiff_t stride_3x = stride_2x + stride; 1983cabdff1aSopenharmony_ci 1984cabdff1aSopenharmony_ci offset_val = (unsigned) offset_in << log2_denom; 1985cabdff1aSopenharmony_ci 1986cabdff1aSopenharmony_ci wgt = __lasx_xvreplgr2vr_h(weight_src); 1987cabdff1aSopenharmony_ci offset = __lasx_xvreplgr2vr_h(offset_val); 1988cabdff1aSopenharmony_ci denom = __lasx_xvreplgr2vr_h(log2_denom); 1989cabdff1aSopenharmony_ci 1990cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, 1991cabdff1aSopenharmony_ci src, stride_3x, tmp0, tmp1, tmp2, tmp3); 1992cabdff1aSopenharmony_ci src += stride_4x; 1993cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, 1994cabdff1aSopenharmony_ci src, stride_3x, tmp4, tmp5, tmp6, tmp7); 1995cabdff1aSopenharmony_ci src -= stride_4x; 1996cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp6, tmp4, tmp7, 1997cabdff1aSopenharmony_ci tmp5, tmp0, tmp1, tmp2, tmp3); 1998cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_w, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); 1999cabdff1aSopenharmony_ci src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20); 2000cabdff1aSopenharmony_ci src0_l = __lasx_xvilvl_b(zero, src0); 2001cabdff1aSopenharmony_ci src0_h = __lasx_xvilvh_b(zero, src0); 2002cabdff1aSopenharmony_ci src0_l = __lasx_xvmul_h(wgt, src0_l); 2003cabdff1aSopenharmony_ci src0_h = __lasx_xvmul_h(wgt, src0_h); 2004cabdff1aSopenharmony_ci src0_l = __lasx_xvsadd_h(src0_l, offset); 2005cabdff1aSopenharmony_ci src0_h = __lasx_xvsadd_h(src0_h, offset); 2006cabdff1aSopenharmony_ci src0_l = __lasx_xvmaxi_h(src0_l, 0); 2007cabdff1aSopenharmony_ci src0_h = __lasx_xvmaxi_h(src0_h, 0); 2008cabdff1aSopenharmony_ci src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom); 2009cabdff1aSopenharmony_ci src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom); 2010cabdff1aSopenharmony_ci __lasx_xvstelm_w(src0_l, src, 0, 0); 2011cabdff1aSopenharmony_ci __lasx_xvstelm_w(src0_l, src + stride, 0, 1); 2012cabdff1aSopenharmony_ci __lasx_xvstelm_w(src0_h, src + stride_2x, 0, 0); 2013cabdff1aSopenharmony_ci __lasx_xvstelm_w(src0_h, src + stride_3x, 0, 1); 2014cabdff1aSopenharmony_ci src += stride_4x; 2015cabdff1aSopenharmony_ci __lasx_xvstelm_w(src0_l, src, 0, 4); 2016cabdff1aSopenharmony_ci __lasx_xvstelm_w(src0_l, src + stride, 0, 5); 2017cabdff1aSopenharmony_ci __lasx_xvstelm_w(src0_h, src + stride_2x, 0, 4); 2018cabdff1aSopenharmony_ci __lasx_xvstelm_w(src0_h, src + stride_3x, 0, 5); 2019cabdff1aSopenharmony_ci} 2020cabdff1aSopenharmony_ci 2021cabdff1aSopenharmony_civoid ff_weight_h264_pixels4_8_lasx(uint8_t *src, ptrdiff_t stride, 2022cabdff1aSopenharmony_ci int height, int log2_denom, 2023cabdff1aSopenharmony_ci int weight_src, int offset) 2024cabdff1aSopenharmony_ci{ 2025cabdff1aSopenharmony_ci if (2 == height) { 2026cabdff1aSopenharmony_ci avc_wgt_4x2_lasx(src, stride, log2_denom, weight_src, offset); 2027cabdff1aSopenharmony_ci } else if (4 == height) { 2028cabdff1aSopenharmony_ci avc_wgt_4x4_lasx(src, stride, log2_denom, weight_src, offset); 2029cabdff1aSopenharmony_ci } else { 2030cabdff1aSopenharmony_ci avc_wgt_4x8_lasx(src, stride, log2_denom, weight_src, offset); 2031cabdff1aSopenharmony_ci } 2032cabdff1aSopenharmony_ci} 2033cabdff1aSopenharmony_ci 2034cabdff1aSopenharmony_civoid ff_h264_add_pixels4_8_lasx(uint8_t *_dst, int16_t *_src, int stride) 2035cabdff1aSopenharmony_ci{ 2036cabdff1aSopenharmony_ci __m256i src0, dst0, dst1, dst2, dst3, zero; 2037cabdff1aSopenharmony_ci __m256i tmp0, tmp1; 2038cabdff1aSopenharmony_ci uint8_t* _dst1 = _dst + stride; 2039cabdff1aSopenharmony_ci uint8_t* _dst2 = _dst1 + stride; 2040cabdff1aSopenharmony_ci uint8_t* _dst3 = _dst2 + stride; 2041cabdff1aSopenharmony_ci 2042cabdff1aSopenharmony_ci src0 = __lasx_xvld(_src, 0); 2043cabdff1aSopenharmony_ci dst0 = __lasx_xvldrepl_w(_dst, 0); 2044cabdff1aSopenharmony_ci dst1 = __lasx_xvldrepl_w(_dst1, 0); 2045cabdff1aSopenharmony_ci dst2 = __lasx_xvldrepl_w(_dst2, 0); 2046cabdff1aSopenharmony_ci dst3 = __lasx_xvldrepl_w(_dst3, 0); 2047cabdff1aSopenharmony_ci tmp0 = __lasx_xvilvl_w(dst1, dst0); 2048cabdff1aSopenharmony_ci tmp1 = __lasx_xvilvl_w(dst3, dst2); 2049cabdff1aSopenharmony_ci dst0 = __lasx_xvilvl_d(tmp1, tmp0); 2050cabdff1aSopenharmony_ci tmp0 = __lasx_vext2xv_hu_bu(dst0); 2051cabdff1aSopenharmony_ci zero = __lasx_xvldi(0); 2052cabdff1aSopenharmony_ci tmp1 = __lasx_xvadd_h(src0, tmp0); 2053cabdff1aSopenharmony_ci dst0 = __lasx_xvpickev_b(tmp1, tmp1); 2054cabdff1aSopenharmony_ci __lasx_xvstelm_w(dst0, _dst, 0, 0); 2055cabdff1aSopenharmony_ci __lasx_xvstelm_w(dst0, _dst1, 0, 1); 2056cabdff1aSopenharmony_ci __lasx_xvstelm_w(dst0, _dst2, 0, 4); 2057cabdff1aSopenharmony_ci __lasx_xvstelm_w(dst0, _dst3, 0, 5); 2058cabdff1aSopenharmony_ci __lasx_xvst(zero, _src, 0); 2059cabdff1aSopenharmony_ci} 2060cabdff1aSopenharmony_ci 2061cabdff1aSopenharmony_civoid ff_h264_add_pixels8_8_lasx(uint8_t *_dst, int16_t *_src, int stride) 2062cabdff1aSopenharmony_ci{ 2063cabdff1aSopenharmony_ci __m256i src0, src1, src2, src3; 2064cabdff1aSopenharmony_ci __m256i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 2065cabdff1aSopenharmony_ci __m256i tmp0, tmp1, tmp2, tmp3; 2066cabdff1aSopenharmony_ci __m256i zero = __lasx_xvldi(0); 2067cabdff1aSopenharmony_ci uint8_t *_dst1 = _dst + stride; 2068cabdff1aSopenharmony_ci uint8_t *_dst2 = _dst1 + stride; 2069cabdff1aSopenharmony_ci uint8_t *_dst3 = _dst2 + stride; 2070cabdff1aSopenharmony_ci uint8_t *_dst4 = _dst3 + stride; 2071cabdff1aSopenharmony_ci uint8_t *_dst5 = _dst4 + stride; 2072cabdff1aSopenharmony_ci uint8_t *_dst6 = _dst5 + stride; 2073cabdff1aSopenharmony_ci uint8_t *_dst7 = _dst6 + stride; 2074cabdff1aSopenharmony_ci 2075cabdff1aSopenharmony_ci src0 = __lasx_xvld(_src, 0); 2076cabdff1aSopenharmony_ci src1 = __lasx_xvld(_src, 32); 2077cabdff1aSopenharmony_ci src2 = __lasx_xvld(_src, 64); 2078cabdff1aSopenharmony_ci src3 = __lasx_xvld(_src, 96); 2079cabdff1aSopenharmony_ci dst0 = __lasx_xvldrepl_d(_dst, 0); 2080cabdff1aSopenharmony_ci dst1 = __lasx_xvldrepl_d(_dst1, 0); 2081cabdff1aSopenharmony_ci dst2 = __lasx_xvldrepl_d(_dst2, 0); 2082cabdff1aSopenharmony_ci dst3 = __lasx_xvldrepl_d(_dst3, 0); 2083cabdff1aSopenharmony_ci dst4 = __lasx_xvldrepl_d(_dst4, 0); 2084cabdff1aSopenharmony_ci dst5 = __lasx_xvldrepl_d(_dst5, 0); 2085cabdff1aSopenharmony_ci dst6 = __lasx_xvldrepl_d(_dst6, 0); 2086cabdff1aSopenharmony_ci dst7 = __lasx_xvldrepl_d(_dst7, 0); 2087cabdff1aSopenharmony_ci tmp0 = __lasx_xvilvl_d(dst1, dst0); 2088cabdff1aSopenharmony_ci tmp1 = __lasx_xvilvl_d(dst3, dst2); 2089cabdff1aSopenharmony_ci tmp2 = __lasx_xvilvl_d(dst5, dst4); 2090cabdff1aSopenharmony_ci tmp3 = __lasx_xvilvl_d(dst7, dst6); 2091cabdff1aSopenharmony_ci dst0 = __lasx_vext2xv_hu_bu(tmp0); 2092cabdff1aSopenharmony_ci dst1 = __lasx_vext2xv_hu_bu(tmp1); 2093cabdff1aSopenharmony_ci dst1 = __lasx_vext2xv_hu_bu(tmp1); 2094cabdff1aSopenharmony_ci dst2 = __lasx_vext2xv_hu_bu(tmp2); 2095cabdff1aSopenharmony_ci dst3 = __lasx_vext2xv_hu_bu(tmp3); 2096cabdff1aSopenharmony_ci tmp0 = __lasx_xvadd_h(src0, dst0); 2097cabdff1aSopenharmony_ci tmp1 = __lasx_xvadd_h(src1, dst1); 2098cabdff1aSopenharmony_ci tmp2 = __lasx_xvadd_h(src2, dst2); 2099cabdff1aSopenharmony_ci tmp3 = __lasx_xvadd_h(src3, dst3); 2100cabdff1aSopenharmony_ci dst1 = __lasx_xvpickev_b(tmp1, tmp0); 2101cabdff1aSopenharmony_ci dst2 = __lasx_xvpickev_b(tmp3, tmp2); 2102cabdff1aSopenharmony_ci __lasx_xvst(zero, _src, 0); 2103cabdff1aSopenharmony_ci __lasx_xvst(zero, _src, 32); 2104cabdff1aSopenharmony_ci __lasx_xvst(zero, _src, 64); 2105cabdff1aSopenharmony_ci __lasx_xvst(zero, _src, 96); 2106cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst1, _dst, 0, 0); 2107cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst1, _dst1, 0, 2); 2108cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst1, _dst2, 0, 1); 2109cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst1, _dst3, 0, 3); 2110cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst2, _dst4, 0, 0); 2111cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst2, _dst5, 0, 2); 2112cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst2, _dst6, 0, 1); 2113cabdff1aSopenharmony_ci __lasx_xvstelm_d(dst2, _dst7, 0, 3); 2114cabdff1aSopenharmony_ci} 2115