1cabdff1aSopenharmony_ci/* 2cabdff1aSopenharmony_ci * Copyright (c) 2021 Loongson Technology Corporation Limited 3cabdff1aSopenharmony_ci * Contributed by Hao Chen <chenhao@loongson.cn> 4cabdff1aSopenharmony_ci * 5cabdff1aSopenharmony_ci * This file is part of FFmpeg. 6cabdff1aSopenharmony_ci * 7cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 8cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 9cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 10cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 11cabdff1aSopenharmony_ci * 12cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 13cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 14cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15cabdff1aSopenharmony_ci * Lesser General Public License for more details. 16cabdff1aSopenharmony_ci * 17cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 18cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 19cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20cabdff1aSopenharmony_ci */ 21cabdff1aSopenharmony_ci 22cabdff1aSopenharmony_ci#include "libavutil/loongarch/loongson_intrinsics.h" 23cabdff1aSopenharmony_ci#include "idctdsp_loongarch.h" 24cabdff1aSopenharmony_ci 25cabdff1aSopenharmony_ci#define LASX_TRANSPOSE4x16(in_0, in_1, in_2, in_3, out_0, out_1, out_2, out_3) \ 26cabdff1aSopenharmony_ci{ \ 27cabdff1aSopenharmony_ci __m256i temp_0, temp_1, temp_2, temp_3; \ 28cabdff1aSopenharmony_ci __m256i temp_4, temp_5, temp_6, temp_7; \ 29cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvpermi_q, in_2, in_0, 0x20, in_2, in_0, 0x31, in_3, in_1,\ 30cabdff1aSopenharmony_ci 0x20, in_3, in_1, 0x31, temp_0, temp_1, temp_2, temp_3); \ 31cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_h, temp_1, temp_0, temp_3, temp_2, temp_4, temp_6);\ 32cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvh_h, temp_1, temp_0, temp_3, temp_2, temp_5, temp_7);\ 33cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_w, temp_6, temp_4, temp_7, temp_5, out_0, out_2); \ 34cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvh_w, temp_6, temp_4, temp_7, temp_5, out_1, out_3); \ 35cabdff1aSopenharmony_ci} 36cabdff1aSopenharmony_ci 37cabdff1aSopenharmony_ci#define LASX_IDCTROWCONDDC \ 38cabdff1aSopenharmony_ci const_val = 16383 * ((1 << 19) / 16383); \ 39cabdff1aSopenharmony_ci const_val1 = __lasx_xvreplgr2vr_w(const_val); \ 40cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvld, block, 0, block, 32, block, 64, block, 96, \ 41cabdff1aSopenharmony_ci in0, in1, in2, in3); \ 42cabdff1aSopenharmony_ci LASX_TRANSPOSE4x16(in0, in1, in2, in3, in0, in1, in2, in3); \ 43cabdff1aSopenharmony_ci a0 = __lasx_xvpermi_d(in0, 0xD8); \ 44cabdff1aSopenharmony_ci a0 = __lasx_vext2xv_w_h(a0); \ 45cabdff1aSopenharmony_ci temp = __lasx_xvslli_w(a0, 3); \ 46cabdff1aSopenharmony_ci a1 = __lasx_xvpermi_d(in0, 0x8D); \ 47cabdff1aSopenharmony_ci a1 = __lasx_vext2xv_w_h(a1); \ 48cabdff1aSopenharmony_ci a2 = __lasx_xvpermi_d(in1, 0xD8); \ 49cabdff1aSopenharmony_ci a2 = __lasx_vext2xv_w_h(a2); \ 50cabdff1aSopenharmony_ci a3 = __lasx_xvpermi_d(in1, 0x8D); \ 51cabdff1aSopenharmony_ci a3 = __lasx_vext2xv_w_h(a3); \ 52cabdff1aSopenharmony_ci b0 = __lasx_xvpermi_d(in2, 0xD8); \ 53cabdff1aSopenharmony_ci b0 = __lasx_vext2xv_w_h(b0); \ 54cabdff1aSopenharmony_ci b1 = __lasx_xvpermi_d(in2, 0x8D); \ 55cabdff1aSopenharmony_ci b1 = __lasx_vext2xv_w_h(b1); \ 56cabdff1aSopenharmony_ci b2 = __lasx_xvpermi_d(in3, 0xD8); \ 57cabdff1aSopenharmony_ci b2 = __lasx_vext2xv_w_h(b2); \ 58cabdff1aSopenharmony_ci b3 = __lasx_xvpermi_d(in3, 0x8D); \ 59cabdff1aSopenharmony_ci b3 = __lasx_vext2xv_w_h(b3); \ 60cabdff1aSopenharmony_ci select_vec = a0 | a1 | a2 | a3 | b0 | b1 | b2 | b3; \ 61cabdff1aSopenharmony_ci select_vec = __lasx_xvslti_wu(select_vec, 1); \ 62cabdff1aSopenharmony_ci \ 63cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvrepl128vei_h, w1, 2, w1, 3, w1, 4, w1, 5, \ 64cabdff1aSopenharmony_ci w2, w3, w4, w5); \ 65cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvrepl128vei_h, w1, 6, w1, 7, w6, w7); \ 66cabdff1aSopenharmony_ci w1 = __lasx_xvrepl128vei_h(w1, 1); \ 67cabdff1aSopenharmony_ci \ 68cabdff1aSopenharmony_ci /* part of FUNC6(idctRowCondDC) */ \ 69cabdff1aSopenharmony_ci temp0 = __lasx_xvmaddwl_w_h(const_val0, in0, w4); \ 70cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvmulwl_w_h, in1, w2, in1, w6, temp1, temp2); \ 71cabdff1aSopenharmony_ci a0 = __lasx_xvadd_w(temp0, temp1); \ 72cabdff1aSopenharmony_ci a1 = __lasx_xvadd_w(temp0, temp2); \ 73cabdff1aSopenharmony_ci a2 = __lasx_xvsub_w(temp0, temp2); \ 74cabdff1aSopenharmony_ci a3 = __lasx_xvsub_w(temp0, temp1); \ 75cabdff1aSopenharmony_ci \ 76cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvh_h, in1, in0, w3, w1, temp0, temp1); \ 77cabdff1aSopenharmony_ci b0 = __lasx_xvdp2_w_h(temp0, temp1); \ 78cabdff1aSopenharmony_ci temp1 = __lasx_xvneg_h(w7); \ 79cabdff1aSopenharmony_ci temp2 = __lasx_xvilvl_h(temp1, w3); \ 80cabdff1aSopenharmony_ci b1 = __lasx_xvdp2_w_h(temp0, temp2); \ 81cabdff1aSopenharmony_ci temp1 = __lasx_xvneg_h(w1); \ 82cabdff1aSopenharmony_ci temp2 = __lasx_xvilvl_h(temp1, w5); \ 83cabdff1aSopenharmony_ci b2 = __lasx_xvdp2_w_h(temp0, temp2); \ 84cabdff1aSopenharmony_ci temp1 = __lasx_xvneg_h(w5); \ 85cabdff1aSopenharmony_ci temp2 = __lasx_xvilvl_h(temp1, w7); \ 86cabdff1aSopenharmony_ci b3 = __lasx_xvdp2_w_h(temp0, temp2); \ 87cabdff1aSopenharmony_ci \ 88cabdff1aSopenharmony_ci /* if (AV_RAN64A(row + 4)) */ \ 89cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_h, in3, in2, w6, w4, temp0, temp1); \ 90cabdff1aSopenharmony_ci a0 = __lasx_xvdp2add_w_h(a0, temp0, temp1); \ 91cabdff1aSopenharmony_ci temp1 = __lasx_xvilvl_h(w2, w4); \ 92cabdff1aSopenharmony_ci a1 = __lasx_xvdp2sub_w_h(a1, temp0, temp1); \ 93cabdff1aSopenharmony_ci temp1 = __lasx_xvneg_h(w4); \ 94cabdff1aSopenharmony_ci temp2 = __lasx_xvilvl_h(w2, temp1); \ 95cabdff1aSopenharmony_ci a2 = __lasx_xvdp2add_w_h(a2, temp0, temp2); \ 96cabdff1aSopenharmony_ci temp1 = __lasx_xvneg_h(w6); \ 97cabdff1aSopenharmony_ci temp2 = __lasx_xvilvl_h(temp1, w4); \ 98cabdff1aSopenharmony_ci a3 = __lasx_xvdp2add_w_h(a3, temp0, temp2); \ 99cabdff1aSopenharmony_ci \ 100cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvh_h, in3, in2, w7, w5, temp0, temp1); \ 101cabdff1aSopenharmony_ci b0 = __lasx_xvdp2add_w_h(b0, temp0, temp1); \ 102cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_h, w5, w1, w3, w7, temp1, temp2); \ 103cabdff1aSopenharmony_ci b1 = __lasx_xvdp2sub_w_h(b1, temp0, temp1); \ 104cabdff1aSopenharmony_ci b2 = __lasx_xvdp2add_w_h(b2, temp0, temp2); \ 105cabdff1aSopenharmony_ci temp1 = __lasx_xvneg_h(w1); \ 106cabdff1aSopenharmony_ci temp2 = __lasx_xvilvl_h(temp1, w3); \ 107cabdff1aSopenharmony_ci b3 = __lasx_xvdp2add_w_h(b3, temp0, temp2); \ 108cabdff1aSopenharmony_ci \ 109cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvadd_w, a0, b0, a1, b1, a2, b2, a3, b3, \ 110cabdff1aSopenharmony_ci temp0, temp1, temp2, temp3); \ 111cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvsub_w, a0, b0, a1, b1, a2, b2, a3, b3, \ 112cabdff1aSopenharmony_ci a0, a1, a2, a3); \ 113cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvsrai_w, temp0, 11, temp1, 11, temp2, 11, temp3, 11, \ 114cabdff1aSopenharmony_ci temp0, temp1, temp2, temp3); \ 115cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvsrai_w, a0, 11, a1, 11, a2, 11, a3, 11, a0, a1, a2, a3);\ 116cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvbitsel_v, temp0, temp, select_vec, temp1, temp, \ 117cabdff1aSopenharmony_ci select_vec, temp2, temp, select_vec, temp3, temp, select_vec, \ 118cabdff1aSopenharmony_ci in0, in1, in2, in3); \ 119cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvbitsel_v, a0, temp, select_vec, a1, temp, \ 120cabdff1aSopenharmony_ci select_vec, a2, temp, select_vec, a3, temp, select_vec, \ 121cabdff1aSopenharmony_ci a0, a1, a2, a3); \ 122cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvpickev_h, in1, in0, in3, in2, a2, a3, a0, a1, \ 123cabdff1aSopenharmony_ci in0, in1, in2, in3); \ 124cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8, \ 125cabdff1aSopenharmony_ci in0, in1, in2, in3); \ 126cabdff1aSopenharmony_ci 127cabdff1aSopenharmony_ci#define LASX_IDCTCOLS \ 128cabdff1aSopenharmony_ci /* part of FUNC6(idctSparaseCol) */ \ 129cabdff1aSopenharmony_ci LASX_TRANSPOSE4x16(in0, in1, in2, in3, in0, in1, in2, in3); \ 130cabdff1aSopenharmony_ci temp0 = __lasx_xvmaddwl_w_h(const_val1, in0, w4); \ 131cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvmulwl_w_h, in1, w2, in1, w6, temp1, temp2); \ 132cabdff1aSopenharmony_ci a0 = __lasx_xvadd_w(temp0, temp1); \ 133cabdff1aSopenharmony_ci a1 = __lasx_xvadd_w(temp0, temp2); \ 134cabdff1aSopenharmony_ci a2 = __lasx_xvsub_w(temp0, temp2); \ 135cabdff1aSopenharmony_ci a3 = __lasx_xvsub_w(temp0, temp1); \ 136cabdff1aSopenharmony_ci \ 137cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvh_h, in1, in0, w3, w1, temp0, temp1); \ 138cabdff1aSopenharmony_ci b0 = __lasx_xvdp2_w_h(temp0, temp1); \ 139cabdff1aSopenharmony_ci temp1 = __lasx_xvneg_h(w7); \ 140cabdff1aSopenharmony_ci temp2 = __lasx_xvilvl_h(temp1, w3); \ 141cabdff1aSopenharmony_ci b1 = __lasx_xvdp2_w_h(temp0, temp2); \ 142cabdff1aSopenharmony_ci temp1 = __lasx_xvneg_h(w1); \ 143cabdff1aSopenharmony_ci temp2 = __lasx_xvilvl_h(temp1, w5); \ 144cabdff1aSopenharmony_ci b2 = __lasx_xvdp2_w_h(temp0, temp2); \ 145cabdff1aSopenharmony_ci temp1 = __lasx_xvneg_h(w5); \ 146cabdff1aSopenharmony_ci temp2 = __lasx_xvilvl_h(temp1, w7); \ 147cabdff1aSopenharmony_ci b3 = __lasx_xvdp2_w_h(temp0, temp2); \ 148cabdff1aSopenharmony_ci \ 149cabdff1aSopenharmony_ci /* if (AV_RAN64A(row + 4)) */ \ 150cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_h, in3, in2, w6, w4, temp0, temp1); \ 151cabdff1aSopenharmony_ci a0 = __lasx_xvdp2add_w_h(a0, temp0, temp1); \ 152cabdff1aSopenharmony_ci temp1 = __lasx_xvilvl_h(w2, w4); \ 153cabdff1aSopenharmony_ci a1 = __lasx_xvdp2sub_w_h(a1, temp0, temp1); \ 154cabdff1aSopenharmony_ci temp1 = __lasx_xvneg_h(w4); \ 155cabdff1aSopenharmony_ci temp2 = __lasx_xvilvl_h(w2, temp1); \ 156cabdff1aSopenharmony_ci a2 = __lasx_xvdp2add_w_h(a2, temp0, temp2); \ 157cabdff1aSopenharmony_ci temp1 = __lasx_xvneg_h(w6); \ 158cabdff1aSopenharmony_ci temp2 = __lasx_xvilvl_h(temp1, w4); \ 159cabdff1aSopenharmony_ci a3 = __lasx_xvdp2add_w_h(a3, temp0, temp2); \ 160cabdff1aSopenharmony_ci \ 161cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvh_h, in3, in2, w7, w5, temp0, temp1); \ 162cabdff1aSopenharmony_ci b0 = __lasx_xvdp2add_w_h(b0, temp0, temp1); \ 163cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvilvl_h, w5, w1, w3, w7, temp1, temp2); \ 164cabdff1aSopenharmony_ci b1 = __lasx_xvdp2sub_w_h(b1, temp0, temp1); \ 165cabdff1aSopenharmony_ci b2 = __lasx_xvdp2add_w_h(b2, temp0, temp2); \ 166cabdff1aSopenharmony_ci temp1 = __lasx_xvneg_h(w1); \ 167cabdff1aSopenharmony_ci temp2 = __lasx_xvilvl_h(temp1, w3); \ 168cabdff1aSopenharmony_ci b3 = __lasx_xvdp2add_w_h(b3, temp0, temp2); \ 169cabdff1aSopenharmony_ci \ 170cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvadd_w, a0, b0, a1, b1, a2, b2, a3, b3, \ 171cabdff1aSopenharmony_ci temp0, temp1, temp2, temp3); \ 172cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvsub_w, a3, b3, a2, b2, a1, b1, a0, b0, \ 173cabdff1aSopenharmony_ci a3, a2, a1, a0); \ 174cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvsrani_h_w, temp1, temp0, 20, temp3, temp2, 20, a2, a3, \ 175cabdff1aSopenharmony_ci 20, a0, a1, 20, in0, in1, in2, in3); \ 176cabdff1aSopenharmony_ci 177cabdff1aSopenharmony_civoid ff_simple_idct_lasx(int16_t *block) 178cabdff1aSopenharmony_ci{ 179cabdff1aSopenharmony_ci int32_t const_val = 1 << 10; 180cabdff1aSopenharmony_ci __m256i w1 = {0x4B42539F58C50000, 0x11A822A332493FFF, 181cabdff1aSopenharmony_ci 0x4B42539F58C50000, 0x11A822A332493FFF}; 182cabdff1aSopenharmony_ci __m256i in0, in1, in2, in3; 183cabdff1aSopenharmony_ci __m256i w2, w3, w4, w5, w6, w7; 184cabdff1aSopenharmony_ci __m256i a0, a1, a2, a3; 185cabdff1aSopenharmony_ci __m256i b0, b1, b2, b3; 186cabdff1aSopenharmony_ci __m256i temp0, temp1, temp2, temp3; 187cabdff1aSopenharmony_ci __m256i const_val0 = __lasx_xvreplgr2vr_w(const_val); 188cabdff1aSopenharmony_ci __m256i const_val1, select_vec, temp; 189cabdff1aSopenharmony_ci 190cabdff1aSopenharmony_ci LASX_IDCTROWCONDDC 191cabdff1aSopenharmony_ci LASX_IDCTCOLS 192cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8, 193cabdff1aSopenharmony_ci in0, in1, in2, in3); 194cabdff1aSopenharmony_ci __lasx_xvst(in0, block, 0); 195cabdff1aSopenharmony_ci __lasx_xvst(in1, block, 32); 196cabdff1aSopenharmony_ci __lasx_xvst(in2, block, 64); 197cabdff1aSopenharmony_ci __lasx_xvst(in3, block, 96); 198cabdff1aSopenharmony_ci} 199cabdff1aSopenharmony_ci 200cabdff1aSopenharmony_civoid ff_simple_idct_put_lasx(uint8_t *dst, ptrdiff_t dst_stride, 201cabdff1aSopenharmony_ci int16_t *block) 202cabdff1aSopenharmony_ci{ 203cabdff1aSopenharmony_ci int32_t const_val = 1 << 10; 204cabdff1aSopenharmony_ci ptrdiff_t dst_stride_2x = dst_stride << 1; 205cabdff1aSopenharmony_ci ptrdiff_t dst_stride_4x = dst_stride << 2; 206cabdff1aSopenharmony_ci ptrdiff_t dst_stride_3x = dst_stride_2x + dst_stride; 207cabdff1aSopenharmony_ci __m256i w1 = {0x4B42539F58C50000, 0x11A822A332493FFF, 208cabdff1aSopenharmony_ci 0x4B42539F58C50000, 0x11A822A332493FFF}; 209cabdff1aSopenharmony_ci __m256i in0, in1, in2, in3; 210cabdff1aSopenharmony_ci __m256i w2, w3, w4, w5, w6, w7; 211cabdff1aSopenharmony_ci __m256i a0, a1, a2, a3; 212cabdff1aSopenharmony_ci __m256i b0, b1, b2, b3; 213cabdff1aSopenharmony_ci __m256i temp0, temp1, temp2, temp3; 214cabdff1aSopenharmony_ci __m256i const_val0 = __lasx_xvreplgr2vr_w(const_val); 215cabdff1aSopenharmony_ci __m256i const_val1, select_vec, temp; 216cabdff1aSopenharmony_ci 217cabdff1aSopenharmony_ci LASX_IDCTROWCONDDC 218cabdff1aSopenharmony_ci LASX_IDCTCOLS 219cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8, 220cabdff1aSopenharmony_ci in0, in1, in2, in3); 221cabdff1aSopenharmony_ci DUP4_ARG1(__lasx_xvclip255_h, in0, in1, in2, in3, in0, in1, in2, in3); 222cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvpickev_b, in1, in0, in3, in2, in0, in1); 223cabdff1aSopenharmony_ci __lasx_xvstelm_d(in0, dst, 0, 0); 224cabdff1aSopenharmony_ci __lasx_xvstelm_d(in0, dst + dst_stride, 0, 2); 225cabdff1aSopenharmony_ci __lasx_xvstelm_d(in0, dst + dst_stride_2x, 0, 1); 226cabdff1aSopenharmony_ci __lasx_xvstelm_d(in0, dst + dst_stride_3x, 0, 3); 227cabdff1aSopenharmony_ci dst += dst_stride_4x; 228cabdff1aSopenharmony_ci __lasx_xvstelm_d(in1, dst, 0, 0); 229cabdff1aSopenharmony_ci __lasx_xvstelm_d(in1, dst + dst_stride, 0, 2); 230cabdff1aSopenharmony_ci __lasx_xvstelm_d(in1, dst + dst_stride_2x, 0, 1); 231cabdff1aSopenharmony_ci __lasx_xvstelm_d(in1, dst + dst_stride_3x, 0, 3); 232cabdff1aSopenharmony_ci} 233cabdff1aSopenharmony_ci 234cabdff1aSopenharmony_civoid ff_simple_idct_add_lasx(uint8_t *dst, ptrdiff_t dst_stride, 235cabdff1aSopenharmony_ci int16_t *block) 236cabdff1aSopenharmony_ci{ 237cabdff1aSopenharmony_ci int32_t const_val = 1 << 10; 238cabdff1aSopenharmony_ci uint8_t *dst1 = dst; 239cabdff1aSopenharmony_ci ptrdiff_t dst_stride_2x = dst_stride << 1; 240cabdff1aSopenharmony_ci ptrdiff_t dst_stride_4x = dst_stride << 2; 241cabdff1aSopenharmony_ci ptrdiff_t dst_stride_3x = dst_stride_2x + dst_stride; 242cabdff1aSopenharmony_ci 243cabdff1aSopenharmony_ci __m256i w1 = {0x4B42539F58C50000, 0x11A822A332493FFF, 244cabdff1aSopenharmony_ci 0x4B42539F58C50000, 0x11A822A332493FFF}; 245cabdff1aSopenharmony_ci __m256i sh = {0x0003000200010000, 0x000B000A00090008, 246cabdff1aSopenharmony_ci 0x0007000600050004, 0x000F000E000D000C}; 247cabdff1aSopenharmony_ci __m256i in0, in1, in2, in3; 248cabdff1aSopenharmony_ci __m256i w2, w3, w4, w5, w6, w7; 249cabdff1aSopenharmony_ci __m256i a0, a1, a2, a3; 250cabdff1aSopenharmony_ci __m256i b0, b1, b2, b3; 251cabdff1aSopenharmony_ci __m256i temp0, temp1, temp2, temp3; 252cabdff1aSopenharmony_ci __m256i const_val0 = __lasx_xvreplgr2vr_w(const_val); 253cabdff1aSopenharmony_ci __m256i const_val1, select_vec, temp; 254cabdff1aSopenharmony_ci 255cabdff1aSopenharmony_ci LASX_IDCTROWCONDDC 256cabdff1aSopenharmony_ci LASX_IDCTCOLS 257cabdff1aSopenharmony_ci a0 = __lasx_xvldrepl_d(dst1, 0); 258cabdff1aSopenharmony_ci a0 = __lasx_vext2xv_hu_bu(a0); 259cabdff1aSopenharmony_ci dst1 += dst_stride; 260cabdff1aSopenharmony_ci a1 = __lasx_xvldrepl_d(dst1, 0); 261cabdff1aSopenharmony_ci a1 = __lasx_vext2xv_hu_bu(a1); 262cabdff1aSopenharmony_ci dst1 += dst_stride; 263cabdff1aSopenharmony_ci a2 = __lasx_xvldrepl_d(dst1, 0); 264cabdff1aSopenharmony_ci a2 = __lasx_vext2xv_hu_bu(a2); 265cabdff1aSopenharmony_ci dst1 += dst_stride; 266cabdff1aSopenharmony_ci a3 = __lasx_xvldrepl_d(dst1, 0); 267cabdff1aSopenharmony_ci a3 = __lasx_vext2xv_hu_bu(a3); 268cabdff1aSopenharmony_ci dst1 += dst_stride; 269cabdff1aSopenharmony_ci b0 = __lasx_xvldrepl_d(dst1, 0); 270cabdff1aSopenharmony_ci b0 = __lasx_vext2xv_hu_bu(b0); 271cabdff1aSopenharmony_ci dst1 += dst_stride; 272cabdff1aSopenharmony_ci b1 = __lasx_xvldrepl_d(dst1, 0); 273cabdff1aSopenharmony_ci b1 = __lasx_vext2xv_hu_bu(b1); 274cabdff1aSopenharmony_ci dst1 += dst_stride; 275cabdff1aSopenharmony_ci b2 = __lasx_xvldrepl_d(dst1, 0); 276cabdff1aSopenharmony_ci b2 = __lasx_vext2xv_hu_bu(b2); 277cabdff1aSopenharmony_ci dst1 += dst_stride; 278cabdff1aSopenharmony_ci b3 = __lasx_xvldrepl_d(dst1, 0); 279cabdff1aSopenharmony_ci b3 = __lasx_vext2xv_hu_bu(b3); 280cabdff1aSopenharmony_ci DUP4_ARG3(__lasx_xvshuf_h, sh, a1, a0, sh, a3, a2, sh, b1, b0, sh, b3, b2, 281cabdff1aSopenharmony_ci temp0, temp1, temp2, temp3); 282cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvadd_h, temp0, in0, temp1, in1, temp2, in2, temp3, in3, 283cabdff1aSopenharmony_ci in0, in1, in2, in3); 284cabdff1aSopenharmony_ci DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8, 285cabdff1aSopenharmony_ci in0, in1, in2, in3); 286cabdff1aSopenharmony_ci DUP4_ARG1(__lasx_xvclip255_h, in0, in1, in2, in3, in0, in1, in2, in3); 287cabdff1aSopenharmony_ci DUP2_ARG2(__lasx_xvpickev_b, in1, in0, in3, in2, in0, in1); 288cabdff1aSopenharmony_ci __lasx_xvstelm_d(in0, dst, 0, 0); 289cabdff1aSopenharmony_ci __lasx_xvstelm_d(in0, dst + dst_stride, 0, 2); 290cabdff1aSopenharmony_ci __lasx_xvstelm_d(in0, dst + dst_stride_2x, 0, 1); 291cabdff1aSopenharmony_ci __lasx_xvstelm_d(in0, dst + dst_stride_3x, 0, 3); 292cabdff1aSopenharmony_ci dst += dst_stride_4x; 293cabdff1aSopenharmony_ci __lasx_xvstelm_d(in1, dst, 0, 0); 294cabdff1aSopenharmony_ci __lasx_xvstelm_d(in1, dst + dst_stride, 0, 2); 295cabdff1aSopenharmony_ci __lasx_xvstelm_d(in1, dst + dst_stride_2x, 0, 1); 296cabdff1aSopenharmony_ci __lasx_xvstelm_d(in1, dst + dst_stride_3x, 0, 3); 297cabdff1aSopenharmony_ci} 298