1cabdff1aSopenharmony_ci/* 2cabdff1aSopenharmony_ci * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com) 3cabdff1aSopenharmony_ci * 4cabdff1aSopenharmony_ci * This file is part of FFmpeg. 5cabdff1aSopenharmony_ci * 6cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 7cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 8cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 9cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 10cabdff1aSopenharmony_ci * 11cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 12cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 13cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14cabdff1aSopenharmony_ci * Lesser General Public License for more details. 15cabdff1aSopenharmony_ci * 16cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 17cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 18cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 19cabdff1aSopenharmony_ci */ 20cabdff1aSopenharmony_ci 21cabdff1aSopenharmony_ci#include "libavutil/mips/generic_macros_msa.h" 22cabdff1aSopenharmony_ci#include "idctdsp_mips.h" 23cabdff1aSopenharmony_ci 24cabdff1aSopenharmony_cistatic void simple_idct_msa(int16_t *block) 25cabdff1aSopenharmony_ci{ 26cabdff1aSopenharmony_ci int32_t const_val; 27cabdff1aSopenharmony_ci v8i16 weights = { 0, 22725, 21407, 19266, 16383, 12873, 8867, 4520 }; 28cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 29cabdff1aSopenharmony_ci v8i16 w1, w3, w5, w7; 30cabdff1aSopenharmony_ci v8i16 const0, const1, const2, const3, const4, const5, const6, const7; 31cabdff1aSopenharmony_ci v4i32 temp0_r, temp1_r, temp2_r, temp3_r; 
32cabdff1aSopenharmony_ci v4i32 temp0_l, temp1_l, temp2_l, temp3_l; 33cabdff1aSopenharmony_ci v4i32 a0_r, a1_r, a2_r, a3_r, a0_l, a1_l, a2_l, a3_l; 34cabdff1aSopenharmony_ci v4i32 b0_r, b1_r, b2_r, b3_r, b0_l, b1_l, b2_l, b3_l; 35cabdff1aSopenharmony_ci v4i32 w2, w4, w6; 36cabdff1aSopenharmony_ci v8i16 select_vec, temp; 37cabdff1aSopenharmony_ci v8i16 zero = { 0 }; 38cabdff1aSopenharmony_ci v4i32 const_val0 = __msa_ldi_w(1); 39cabdff1aSopenharmony_ci v4i32 const_val1 = __msa_ldi_w(1); 40cabdff1aSopenharmony_ci 41cabdff1aSopenharmony_ci LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7); 42cabdff1aSopenharmony_ci const_val0 <<= 10; 43cabdff1aSopenharmony_ci const_val = 16383 * ((1 << 19) / 16383); 44cabdff1aSopenharmony_ci const_val1 = __msa_insert_w(const_val0, 0, const_val); 45cabdff1aSopenharmony_ci const_val1 = __msa_splati_w(const_val1, 0); 46cabdff1aSopenharmony_ci TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, 47cabdff1aSopenharmony_ci in0, in1, in2, in3, in4, in5, in6, in7); 48cabdff1aSopenharmony_ci select_vec = in1 | in2 | in3 | in4 | in5 | in6 | in7; 49cabdff1aSopenharmony_ci select_vec = __msa_clti_u_h((v8u16) select_vec, 1); 50cabdff1aSopenharmony_ci UNPCK_SH_SW(in0, a0_r, a0_l); 51cabdff1aSopenharmony_ci UNPCK_SH_SW(in2, temp3_r, temp3_l); 52cabdff1aSopenharmony_ci temp = in0 << 3; 53cabdff1aSopenharmony_ci w2 = (v4i32) __msa_splati_h(weights, 2); 54cabdff1aSopenharmony_ci w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2); 55cabdff1aSopenharmony_ci w4 = (v4i32) __msa_splati_h(weights, 4); 56cabdff1aSopenharmony_ci w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4); 57cabdff1aSopenharmony_ci w6 = (v4i32) __msa_splati_h(weights, 6); 58cabdff1aSopenharmony_ci w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6); 59cabdff1aSopenharmony_ci MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l); 60cabdff1aSopenharmony_ci ADD2(a0_r, const_val0, a0_l, const_val0, temp0_r, temp0_l); 61cabdff1aSopenharmony_ci MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l, 
62cabdff1aSopenharmony_ci temp1_r, temp1_l, temp2_r, temp2_l); 63cabdff1aSopenharmony_ci BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l, 64cabdff1aSopenharmony_ci temp2_l, temp2_r, temp1_l, temp1_r, 65cabdff1aSopenharmony_ci a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r); 66cabdff1aSopenharmony_ci UNPCK_SH_SW(in4, temp0_r, temp0_l); 67cabdff1aSopenharmony_ci UNPCK_SH_SW(in6, temp3_r, temp3_l); 68cabdff1aSopenharmony_ci MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l); 69cabdff1aSopenharmony_ci MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l, 70cabdff1aSopenharmony_ci temp2_r, temp2_l, temp1_r, temp1_l); 71cabdff1aSopenharmony_ci ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l); 72cabdff1aSopenharmony_ci SUB4(a1_r, temp0_r, a1_l, temp0_l, a2_r, temp0_r, a2_l, temp0_l, 73cabdff1aSopenharmony_ci a1_r, a1_l, a2_r, a2_l); 74cabdff1aSopenharmony_ci ADD4(a3_r, temp0_r, a3_l, temp0_l, a0_r, temp1_r, a0_l, temp1_l, 75cabdff1aSopenharmony_ci a3_r, a3_l, a0_r, a0_l); 76cabdff1aSopenharmony_ci SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l); 77cabdff1aSopenharmony_ci ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l); 78cabdff1aSopenharmony_ci SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l); 79cabdff1aSopenharmony_ci ILVRL_H2_SW(in1, in3, b3_r, b3_l); 80cabdff1aSopenharmony_ci SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7); 81cabdff1aSopenharmony_ci ILVRL_H2_SW(in5, in7, temp0_r, temp0_l); 82cabdff1aSopenharmony_ci ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5, 83cabdff1aSopenharmony_ci const0, const1, const2, const3); 84cabdff1aSopenharmony_ci ILVR_H2_SH(w5, w7, w7, w3, const4, const6); 85cabdff1aSopenharmony_ci const5 = __msa_ilvod_h(-w1, -w5); 86cabdff1aSopenharmony_ci const7 = __msa_ilvod_h(w3, -w1); 87cabdff1aSopenharmony_ci DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3, 88cabdff1aSopenharmony_ci b0_r, b1_r, b2_r, b3_r); 89cabdff1aSopenharmony_ci DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r, 90cabdff1aSopenharmony_ci const4, const5, const6, 
const7, b0_r, b1_r, b2_r, b3_r); 91cabdff1aSopenharmony_ci DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3, 92cabdff1aSopenharmony_ci b0_l, b1_l, b2_l, b3_l); 93cabdff1aSopenharmony_ci DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l, 94cabdff1aSopenharmony_ci const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l); 95cabdff1aSopenharmony_ci BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l, 96cabdff1aSopenharmony_ci b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r, 97cabdff1aSopenharmony_ci temp0_r, temp0_l, temp1_r, temp1_l, 98cabdff1aSopenharmony_ci temp2_r, temp2_l, temp3_r, temp3_l, 99cabdff1aSopenharmony_ci a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r); 100cabdff1aSopenharmony_ci SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 11); 101cabdff1aSopenharmony_ci SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 11); 102cabdff1aSopenharmony_ci PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r, 103cabdff1aSopenharmony_ci temp2_l, temp2_r, temp3_l, temp3_r, 104cabdff1aSopenharmony_ci temp0_r, temp1_r, temp2_r, temp3_r); 105cabdff1aSopenharmony_ci in0 = (v8i16) __msa_bmnz_v((v16u8) temp0_r, (v16u8) temp, 106cabdff1aSopenharmony_ci (v16u8) select_vec); 107cabdff1aSopenharmony_ci in1 = (v8i16) __msa_bmnz_v((v16u8) temp1_r, (v16u8) temp, 108cabdff1aSopenharmony_ci (v16u8) select_vec); 109cabdff1aSopenharmony_ci in2 = (v8i16) __msa_bmnz_v((v16u8) temp2_r, (v16u8) temp, 110cabdff1aSopenharmony_ci (v16u8) select_vec); 111cabdff1aSopenharmony_ci in3 = (v8i16) __msa_bmnz_v((v16u8) temp3_r, (v16u8) temp, 112cabdff1aSopenharmony_ci (v16u8) select_vec); 113cabdff1aSopenharmony_ci SRA_4V(a3_r, a3_l, a2_r, a2_l, 11); 114cabdff1aSopenharmony_ci SRA_4V(a1_r, a1_l, a0_r, a0_l, 11); 115cabdff1aSopenharmony_ci PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r, 116cabdff1aSopenharmony_ci a0_r, a1_r, a2_r, a3_r); 117cabdff1aSopenharmony_ci in4 = (v8i16) __msa_bmnz_v((v16u8) a3_r, (v16u8) temp, (v16u8) select_vec); 118cabdff1aSopenharmony_ci in5 = (v8i16) 
__msa_bmnz_v((v16u8) a2_r, (v16u8) temp, (v16u8) select_vec); 119cabdff1aSopenharmony_ci in6 = (v8i16) __msa_bmnz_v((v16u8) a1_r, (v16u8) temp, (v16u8) select_vec); 120cabdff1aSopenharmony_ci in7 = (v8i16) __msa_bmnz_v((v16u8) a0_r, (v16u8) temp, (v16u8) select_vec); 121cabdff1aSopenharmony_ci TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, 122cabdff1aSopenharmony_ci in0, in1, in2, in3, in4, in5, in6, in7); 123cabdff1aSopenharmony_ci 124cabdff1aSopenharmony_ci UNPCK_SH_SW(in0, a0_r, a0_l); 125cabdff1aSopenharmony_ci UNPCK_SH_SW(in2, temp3_r, temp3_l); 126cabdff1aSopenharmony_ci w2 = (v4i32) __msa_splati_h(weights, 2); 127cabdff1aSopenharmony_ci w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2); 128cabdff1aSopenharmony_ci w4 = (v4i32) __msa_splati_h(weights, 4); 129cabdff1aSopenharmony_ci w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4); 130cabdff1aSopenharmony_ci w6 = (v4i32) __msa_splati_h(weights, 6); 131cabdff1aSopenharmony_ci w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6); 132cabdff1aSopenharmony_ci MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l); 133cabdff1aSopenharmony_ci ADD2(a0_r, const_val1, a0_l, const_val1, temp0_r, temp0_l); 134cabdff1aSopenharmony_ci MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l, 135cabdff1aSopenharmony_ci temp1_r, temp1_l, temp2_r, temp2_l); 136cabdff1aSopenharmony_ci BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l, 137cabdff1aSopenharmony_ci temp2_l, temp2_r, temp1_l, temp1_r, 138cabdff1aSopenharmony_ci a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r); 139cabdff1aSopenharmony_ci UNPCK_SH_SW(in4, temp0_r, temp0_l); 140cabdff1aSopenharmony_ci UNPCK_SH_SW(in6, temp3_r, temp3_l); 141cabdff1aSopenharmony_ci MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l); 142cabdff1aSopenharmony_ci MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l, 143cabdff1aSopenharmony_ci temp2_r, temp2_l, temp1_r, temp1_l); 144cabdff1aSopenharmony_ci ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l); 145cabdff1aSopenharmony_ci SUB4(a1_r, temp0_r, a1_l, temp0_l, a2_r, 
temp0_r, a2_l, temp0_l, 146cabdff1aSopenharmony_ci a1_r, a1_l, a2_r, a2_l); 147cabdff1aSopenharmony_ci ADD4(a3_r, temp0_r, a3_l, temp0_l, a0_r, temp1_r, a0_l, temp1_l, 148cabdff1aSopenharmony_ci a3_r, a3_l, a0_r, a0_l); 149cabdff1aSopenharmony_ci SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l); 150cabdff1aSopenharmony_ci ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l); 151cabdff1aSopenharmony_ci SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l); 152cabdff1aSopenharmony_ci ILVRL_H2_SW(in1, in3, b3_r, b3_l); 153cabdff1aSopenharmony_ci SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7); 154cabdff1aSopenharmony_ci ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5, 155cabdff1aSopenharmony_ci const0, const1, const2, const3); 156cabdff1aSopenharmony_ci DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3, 157cabdff1aSopenharmony_ci b0_r, b1_r, b2_r, b3_r); 158cabdff1aSopenharmony_ci DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3, 159cabdff1aSopenharmony_ci b0_l, b1_l, b2_l, b3_l); 160cabdff1aSopenharmony_ci ILVRL_H2_SW(in5, in7, temp0_r, temp0_l); 161cabdff1aSopenharmony_ci ILVR_H2_SH(w5, w7, w7, w3, const4, const6); 162cabdff1aSopenharmony_ci const5 = __msa_ilvod_h(-w1, -w5); 163cabdff1aSopenharmony_ci const7 = __msa_ilvod_h(w3, -w1); 164cabdff1aSopenharmony_ci DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r, 165cabdff1aSopenharmony_ci const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r); 166cabdff1aSopenharmony_ci DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l, 167cabdff1aSopenharmony_ci const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l); 168cabdff1aSopenharmony_ci BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l, 169cabdff1aSopenharmony_ci b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r, 170cabdff1aSopenharmony_ci temp0_r, temp0_l, temp1_r, temp1_l, 171cabdff1aSopenharmony_ci temp2_r, temp2_l, temp3_r, temp3_l, 172cabdff1aSopenharmony_ci a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r); 173cabdff1aSopenharmony_ci SRA_4V(temp0_r, 
temp0_l, temp1_r, temp1_l, 20); 174cabdff1aSopenharmony_ci SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 20); 175cabdff1aSopenharmony_ci PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r, 176cabdff1aSopenharmony_ci temp3_l, temp3_r, temp0_r, temp1_r, temp2_r, temp3_r); 177cabdff1aSopenharmony_ci SRA_4V(a3_r, a3_l, a2_r, a2_l, 20); 178cabdff1aSopenharmony_ci SRA_4V(a1_r, a1_l, a0_r, a0_l, 20); 179cabdff1aSopenharmony_ci PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r, 180cabdff1aSopenharmony_ci a0_r, a1_r, a2_r, a3_r); 181cabdff1aSopenharmony_ci ST_SW8(temp0_r, temp1_r, temp2_r, temp3_r, a3_r, a2_r, a1_r, a0_r, 182cabdff1aSopenharmony_ci block, 8); 183cabdff1aSopenharmony_ci} 184cabdff1aSopenharmony_ci 185cabdff1aSopenharmony_cistatic void simple_idct_put_msa(uint8_t *dst, int32_t dst_stride, 186cabdff1aSopenharmony_ci int16_t *block) 187cabdff1aSopenharmony_ci{ 188cabdff1aSopenharmony_ci int32_t const_val; 189cabdff1aSopenharmony_ci uint64_t tmp0, tmp1, tmp2, tmp3; 190cabdff1aSopenharmony_ci v8i16 weights = { 0, 22725, 21407, 19266, 16383, 12873, 8867, 4520 }; 191cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 192cabdff1aSopenharmony_ci v8i16 w1, w3, w5, w7; 193cabdff1aSopenharmony_ci v8i16 const0, const1, const2, const3, const4, const5, const6, const7; 194cabdff1aSopenharmony_ci v4i32 temp0_r, temp1_r, temp2_r, temp3_r; 195cabdff1aSopenharmony_ci v4i32 temp0_l, temp1_l, temp2_l, temp3_l; 196cabdff1aSopenharmony_ci v4i32 a0_r, a1_r, a2_r, a3_r, a0_l, a1_l, a2_l, a3_l; 197cabdff1aSopenharmony_ci v4i32 b0_r, b1_r, b2_r, b3_r, b0_l, b1_l, b2_l, b3_l; 198cabdff1aSopenharmony_ci v4i32 w2, w4, w6; 199cabdff1aSopenharmony_ci v8i16 select_vec, temp; 200cabdff1aSopenharmony_ci v8i16 zero = { 0 }; 201cabdff1aSopenharmony_ci v4i32 const_val0 = __msa_ldi_w(1); 202cabdff1aSopenharmony_ci v4i32 const_val1 = __msa_ldi_w(1); 203cabdff1aSopenharmony_ci 204cabdff1aSopenharmony_ci LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7); 
205cabdff1aSopenharmony_ci const_val0 <<= 10; 206cabdff1aSopenharmony_ci const_val = 16383 * ((1 << 19) / 16383); 207cabdff1aSopenharmony_ci const_val1 = __msa_insert_w(const_val0, 0, const_val); 208cabdff1aSopenharmony_ci const_val1 = __msa_splati_w(const_val1, 0); 209cabdff1aSopenharmony_ci TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, 210cabdff1aSopenharmony_ci in0, in1, in2, in3, in4, in5, in6, in7); 211cabdff1aSopenharmony_ci select_vec = in1 | in2 | in3 | in4 | in5 | in6 | in7; 212cabdff1aSopenharmony_ci select_vec = __msa_clti_u_h((v8u16) select_vec, 1); 213cabdff1aSopenharmony_ci UNPCK_SH_SW(in0, a0_r, a0_l); 214cabdff1aSopenharmony_ci UNPCK_SH_SW(in2, temp3_r, temp3_l); 215cabdff1aSopenharmony_ci temp = in0 << 3; 216cabdff1aSopenharmony_ci w2 = (v4i32) __msa_splati_h(weights, 2); 217cabdff1aSopenharmony_ci w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2); 218cabdff1aSopenharmony_ci w4 = (v4i32) __msa_splati_h(weights, 4); 219cabdff1aSopenharmony_ci w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4); 220cabdff1aSopenharmony_ci w6 = (v4i32) __msa_splati_h(weights, 6); 221cabdff1aSopenharmony_ci w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6); 222cabdff1aSopenharmony_ci MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l); 223cabdff1aSopenharmony_ci ADD2(a0_r, const_val0, a0_l, const_val0, temp0_r, temp0_l); 224cabdff1aSopenharmony_ci MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l); 225cabdff1aSopenharmony_ci MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l); 226cabdff1aSopenharmony_ci BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l, 227cabdff1aSopenharmony_ci temp2_l, temp2_r, temp1_l, temp1_r, 228cabdff1aSopenharmony_ci a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r); 229cabdff1aSopenharmony_ci UNPCK_SH_SW(in4, temp0_r, temp0_l); 230cabdff1aSopenharmony_ci UNPCK_SH_SW(in6, temp3_r, temp3_l); 231cabdff1aSopenharmony_ci MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l); 232cabdff1aSopenharmony_ci MUL2(w2, temp3_r, w2, temp3_l, temp2_r, temp2_l); 233cabdff1aSopenharmony_ci 
MUL2(w6, temp3_r, w6, temp3_l, temp1_r, temp1_l); 234cabdff1aSopenharmony_ci ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l); 235cabdff1aSopenharmony_ci SUB2(a1_r, temp0_r, a1_l, temp0_l, a1_r, a1_l); 236cabdff1aSopenharmony_ci SUB2(a2_r, temp0_r, a2_l, temp0_l, a2_r, a2_l); 237cabdff1aSopenharmony_ci ADD2(a3_r, temp0_r, a3_l, temp0_l, a3_r, a3_l); 238cabdff1aSopenharmony_ci ADD2(a0_r, temp1_r, a0_l, temp1_l, a0_r, a0_l); 239cabdff1aSopenharmony_ci SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l); 240cabdff1aSopenharmony_ci ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l); 241cabdff1aSopenharmony_ci SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l); 242cabdff1aSopenharmony_ci ILVRL_H2_SW(in1, in3, b3_r, b3_l); 243cabdff1aSopenharmony_ci SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7); 244cabdff1aSopenharmony_ci ILVRL_H2_SW(in5, in7, temp0_r, temp0_l); 245cabdff1aSopenharmony_ci ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5, 246cabdff1aSopenharmony_ci const0, const1, const2, const3); 247cabdff1aSopenharmony_ci ILVR_H2_SH(w5, w7, w7, w3, const4, const6); 248cabdff1aSopenharmony_ci const5 = __msa_ilvod_h(-w1, -w5); 249cabdff1aSopenharmony_ci const7 = __msa_ilvod_h(w3, -w1); 250cabdff1aSopenharmony_ci DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3, 251cabdff1aSopenharmony_ci b0_r, b1_r, b2_r, b3_r); 252cabdff1aSopenharmony_ci DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r, 253cabdff1aSopenharmony_ci const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r); 254cabdff1aSopenharmony_ci DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3, 255cabdff1aSopenharmony_ci b0_l, b1_l, b2_l, b3_l); 256cabdff1aSopenharmony_ci DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l, 257cabdff1aSopenharmony_ci const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l); 258cabdff1aSopenharmony_ci BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l, 259cabdff1aSopenharmony_ci b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r, 260cabdff1aSopenharmony_ci temp0_r, 
temp0_l, temp1_r, temp1_l, 261cabdff1aSopenharmony_ci temp2_r, temp2_l, temp3_r, temp3_l, 262cabdff1aSopenharmony_ci a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r); 263cabdff1aSopenharmony_ci SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 11); 264cabdff1aSopenharmony_ci SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 11); 265cabdff1aSopenharmony_ci PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r, 266cabdff1aSopenharmony_ci temp2_l, temp2_r, temp3_l, temp3_r, 267cabdff1aSopenharmony_ci temp0_r, temp1_r, temp2_r, temp3_r); 268cabdff1aSopenharmony_ci in0 = (v8i16) __msa_bmnz_v((v16u8) temp0_r, (v16u8) temp, 269cabdff1aSopenharmony_ci (v16u8) select_vec); 270cabdff1aSopenharmony_ci in1 = (v8i16) __msa_bmnz_v((v16u8) temp1_r, (v16u8) temp, 271cabdff1aSopenharmony_ci (v16u8) select_vec); 272cabdff1aSopenharmony_ci in2 = (v8i16) __msa_bmnz_v((v16u8) temp2_r, (v16u8) temp, 273cabdff1aSopenharmony_ci (v16u8) select_vec); 274cabdff1aSopenharmony_ci in3 = (v8i16) __msa_bmnz_v((v16u8) temp3_r, (v16u8) temp, 275cabdff1aSopenharmony_ci (v16u8) select_vec); 276cabdff1aSopenharmony_ci SRA_4V(a3_r, a3_l, a2_r, a2_l, 11); 277cabdff1aSopenharmony_ci SRA_4V(a1_r, a1_l, a0_r, a0_l, 11); 278cabdff1aSopenharmony_ci PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r, 279cabdff1aSopenharmony_ci a0_r, a1_r, a2_r, a3_r); 280cabdff1aSopenharmony_ci in4 = (v8i16) __msa_bmnz_v((v16u8) a3_r, (v16u8) temp, (v16u8) select_vec); 281cabdff1aSopenharmony_ci in5 = (v8i16) __msa_bmnz_v((v16u8) a2_r, (v16u8) temp, (v16u8) select_vec); 282cabdff1aSopenharmony_ci in6 = (v8i16) __msa_bmnz_v((v16u8) a1_r, (v16u8) temp, (v16u8) select_vec); 283cabdff1aSopenharmony_ci in7 = (v8i16) __msa_bmnz_v((v16u8) a0_r, (v16u8) temp, (v16u8) select_vec); 284cabdff1aSopenharmony_ci TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, 285cabdff1aSopenharmony_ci in0, in1, in2, in3, in4, in5, in6, in7); 286cabdff1aSopenharmony_ci UNPCK_SH_SW(in0, a0_r, a0_l); 287cabdff1aSopenharmony_ci UNPCK_SH_SW(in2, temp3_r, 
temp3_l); 288cabdff1aSopenharmony_ci w2 = (v4i32) __msa_splati_h(weights, 2); 289cabdff1aSopenharmony_ci w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2); 290cabdff1aSopenharmony_ci w4 = (v4i32) __msa_splati_h(weights, 4); 291cabdff1aSopenharmony_ci w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4); 292cabdff1aSopenharmony_ci w6 = (v4i32) __msa_splati_h(weights, 6); 293cabdff1aSopenharmony_ci w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6); 294cabdff1aSopenharmony_ci MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l); 295cabdff1aSopenharmony_ci ADD2(a0_r, const_val1, a0_l, const_val1, temp0_r, temp0_l); 296cabdff1aSopenharmony_ci MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l); 297cabdff1aSopenharmony_ci MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l); 298cabdff1aSopenharmony_ci BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l, 299cabdff1aSopenharmony_ci temp2_l, temp2_r, temp1_l, temp1_r, 300cabdff1aSopenharmony_ci a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r); 301cabdff1aSopenharmony_ci UNPCK_SH_SW(in4, temp0_r, temp0_l); 302cabdff1aSopenharmony_ci UNPCK_SH_SW(in6, temp3_r, temp3_l); 303cabdff1aSopenharmony_ci MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l); 304cabdff1aSopenharmony_ci MUL2(w2, temp3_r, w2, temp3_l, temp2_r, temp2_l); 305cabdff1aSopenharmony_ci MUL2(w6, temp3_r, w6, temp3_l, temp1_r, temp1_l); 306cabdff1aSopenharmony_ci ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l); 307cabdff1aSopenharmony_ci SUB2(a1_r, temp0_r, a1_l, temp0_l, a1_r, a1_l); 308cabdff1aSopenharmony_ci SUB2(a2_r, temp0_r, a2_l, temp0_l, a2_r, a2_l); 309cabdff1aSopenharmony_ci ADD2(a3_r, temp0_r, a3_l, temp0_l, a3_r, a3_l); 310cabdff1aSopenharmony_ci ADD2(a0_r, temp1_r, a0_l, temp1_l, a0_r, a0_l); 311cabdff1aSopenharmony_ci SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l); 312cabdff1aSopenharmony_ci ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l); 313cabdff1aSopenharmony_ci SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l); 314cabdff1aSopenharmony_ci ILVRL_H2_SW(in1, in3, b3_r, b3_l); 315cabdff1aSopenharmony_ci 
SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7); 316cabdff1aSopenharmony_ci ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5, 317cabdff1aSopenharmony_ci const0, const1, const2, const3); 318cabdff1aSopenharmony_ci DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3, 319cabdff1aSopenharmony_ci b0_r, b1_r, b2_r, b3_r); 320cabdff1aSopenharmony_ci DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3, 321cabdff1aSopenharmony_ci b0_l, b1_l, b2_l, b3_l); 322cabdff1aSopenharmony_ci ILVRL_H2_SW(in5, in7, temp0_r, temp0_l); 323cabdff1aSopenharmony_ci ILVR_H2_SH(w5, w7, w7, w3, const4, const6); 324cabdff1aSopenharmony_ci const5 = __msa_ilvod_h(-w1, -w5); 325cabdff1aSopenharmony_ci const7 = __msa_ilvod_h(w3, -w1); 326cabdff1aSopenharmony_ci DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r, 327cabdff1aSopenharmony_ci const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r); 328cabdff1aSopenharmony_ci DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l, 329cabdff1aSopenharmony_ci const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l); 330cabdff1aSopenharmony_ci BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l, 331cabdff1aSopenharmony_ci b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r, 332cabdff1aSopenharmony_ci temp0_r, temp0_l, temp1_r, temp1_l, 333cabdff1aSopenharmony_ci temp2_r, temp2_l, temp3_r, temp3_l, 334cabdff1aSopenharmony_ci a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r); 335cabdff1aSopenharmony_ci SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 20); 336cabdff1aSopenharmony_ci SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 20); 337cabdff1aSopenharmony_ci SRA_4V(a3_r, a3_l, a2_r, a2_l, 20); 338cabdff1aSopenharmony_ci SRA_4V(a1_r, a1_l, a0_r, a0_l, 20); 339cabdff1aSopenharmony_ci PCKEV_H4_SH(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r, 340cabdff1aSopenharmony_ci temp3_l, temp3_r, in0, in1, in2, in3); 341cabdff1aSopenharmony_ci PCKEV_H4_SH(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r, 342cabdff1aSopenharmony_ci in4, in5, in6, in7); 
343cabdff1aSopenharmony_ci CLIP_SH4_0_255(in0, in1, in2, in3); 344cabdff1aSopenharmony_ci PCKEV_B4_SH(in0, in0, in1, in1, in2, in2, in3, in3, 345cabdff1aSopenharmony_ci in0, in1, in2, in3); 346cabdff1aSopenharmony_ci tmp0 = __msa_copy_u_d((v2i64) in0, 1); 347cabdff1aSopenharmony_ci tmp1 = __msa_copy_u_d((v2i64) in1, 1); 348cabdff1aSopenharmony_ci tmp2 = __msa_copy_u_d((v2i64) in2, 1); 349cabdff1aSopenharmony_ci tmp3 = __msa_copy_u_d((v2i64) in3, 1); 350cabdff1aSopenharmony_ci SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); 351cabdff1aSopenharmony_ci CLIP_SH4_0_255(in4, in5, in6, in7); 352cabdff1aSopenharmony_ci PCKEV_B4_SH(in4, in4, in5, in5, in6, in6, in7, in7, 353cabdff1aSopenharmony_ci in4, in5, in6, in7); 354cabdff1aSopenharmony_ci tmp3 = __msa_copy_u_d((v2i64) in4, 1); 355cabdff1aSopenharmony_ci tmp2 = __msa_copy_u_d((v2i64) in5, 1); 356cabdff1aSopenharmony_ci tmp1 = __msa_copy_u_d((v2i64) in6, 1); 357cabdff1aSopenharmony_ci tmp0 = __msa_copy_u_d((v2i64) in7, 1); 358cabdff1aSopenharmony_ci SD4(tmp0, tmp1, tmp2, tmp3, dst + 4 * dst_stride, dst_stride); 359cabdff1aSopenharmony_ci} 360cabdff1aSopenharmony_ci 361cabdff1aSopenharmony_cistatic void simple_idct_add_msa(uint8_t *dst, int32_t dst_stride, 362cabdff1aSopenharmony_ci int16_t *block) 363cabdff1aSopenharmony_ci{ 364cabdff1aSopenharmony_ci int32_t const_val; 365cabdff1aSopenharmony_ci uint64_t tmp0, tmp1, tmp2, tmp3; 366cabdff1aSopenharmony_ci v8i16 weights = { 0, 22725, 21407, 19266, 16383, 12873, 8867, 4520 }; 367cabdff1aSopenharmony_ci v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 368cabdff1aSopenharmony_ci v8i16 w1, w3, w5, w7; 369cabdff1aSopenharmony_ci v8i16 const0, const1, const2, const3, const4, const5, const6, const7; 370cabdff1aSopenharmony_ci v4i32 temp0_r, temp1_r, temp2_r, temp3_r; 371cabdff1aSopenharmony_ci v4i32 temp4_r, temp5_r, temp6_r, temp7_r, temp8_r; 372cabdff1aSopenharmony_ci v4i32 temp0_l, temp1_l, temp2_l, temp3_l; 373cabdff1aSopenharmony_ci v4i32 temp4_l, temp5_l, temp6_l, temp7_l, 
temp8_l; 374cabdff1aSopenharmony_ci v4i32 a0_r, a1_r, a2_r, a3_r, a0_l, a1_l, a2_l, a3_l; 375cabdff1aSopenharmony_ci v4i32 b0_r, b1_r, b2_r, b3_r, b0_l, b1_l, b2_l, b3_l; 376cabdff1aSopenharmony_ci v4i32 w2, w4, w6; 377cabdff1aSopenharmony_ci v8i16 select_vec, temp; 378cabdff1aSopenharmony_ci v8i16 zero = { 0 }; 379cabdff1aSopenharmony_ci v4i32 const_val0 = __msa_ldi_w(1); 380cabdff1aSopenharmony_ci v4i32 const_val1 = __msa_ldi_w(1); 381cabdff1aSopenharmony_ci 382cabdff1aSopenharmony_ci const_val0 <<= 10; 383cabdff1aSopenharmony_ci const_val = 16383 * ((1 << 19) / 16383); 384cabdff1aSopenharmony_ci const_val1 = __msa_insert_w(const_val0, 0, const_val); 385cabdff1aSopenharmony_ci const_val1 = __msa_splati_w(const_val1, 0); 386cabdff1aSopenharmony_ci LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7); 387cabdff1aSopenharmony_ci TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, 388cabdff1aSopenharmony_ci in0, in1, in2, in3, in4, in5, in6, in7); 389cabdff1aSopenharmony_ci 390cabdff1aSopenharmony_ci select_vec = in1 | in2 | in3 | in4 | in5 | in6 | in7; 391cabdff1aSopenharmony_ci select_vec = __msa_clti_u_h((v8u16) select_vec, 1); 392cabdff1aSopenharmony_ci UNPCK_SH_SW(in0, a0_r, a0_l); 393cabdff1aSopenharmony_ci UNPCK_SH_SW(in2, temp3_r, temp3_l); 394cabdff1aSopenharmony_ci ILVRL_H2_SW(in1, in3, b3_r, b3_l); 395cabdff1aSopenharmony_ci UNPCK_SH_SW(in4, temp4_r, temp4_l); 396cabdff1aSopenharmony_ci UNPCK_SH_SW(in6, temp7_r, temp7_l); 397cabdff1aSopenharmony_ci ILVRL_H2_SW(in5, in7, temp8_r, temp8_l); 398cabdff1aSopenharmony_ci temp = in0 << 3; 399cabdff1aSopenharmony_ci SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7); 400cabdff1aSopenharmony_ci ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5, 401cabdff1aSopenharmony_ci const0, const1, const2, const3); 402cabdff1aSopenharmony_ci ILVR_H2_SH(w5, w7, w7, w3, const4, const6); 403cabdff1aSopenharmony_ci const5 = __msa_ilvod_h(-w1, -w5); 404cabdff1aSopenharmony_ci const7 = __msa_ilvod_h(w3, -w1); 
405cabdff1aSopenharmony_ci DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3, 406cabdff1aSopenharmony_ci b0_r, b1_r, b2_r, b3_r); 407cabdff1aSopenharmony_ci DPADD_SH4_SW(temp8_r, temp8_r, temp8_r, temp8_r, 408cabdff1aSopenharmony_ci const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r); 409cabdff1aSopenharmony_ci DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3, 410cabdff1aSopenharmony_ci b0_l, b1_l, b2_l, b3_l); 411cabdff1aSopenharmony_ci DPADD_SH4_SW(temp8_l, temp8_l, temp8_l, temp8_l, 412cabdff1aSopenharmony_ci const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l); 413cabdff1aSopenharmony_ci w2 = (v4i32) __msa_splati_h(weights, 2); 414cabdff1aSopenharmony_ci w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2); 415cabdff1aSopenharmony_ci w4 = (v4i32) __msa_splati_h(weights, 4); 416cabdff1aSopenharmony_ci w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4); 417cabdff1aSopenharmony_ci w6 = (v4i32) __msa_splati_h(weights, 6); 418cabdff1aSopenharmony_ci w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6); 419cabdff1aSopenharmony_ci MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l); 420cabdff1aSopenharmony_ci ADD2(a0_r, const_val0, a0_l, const_val0, temp0_r, temp0_l); 421cabdff1aSopenharmony_ci MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l); 422cabdff1aSopenharmony_ci MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l); 423cabdff1aSopenharmony_ci BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l, 424cabdff1aSopenharmony_ci temp2_l, temp2_r, temp1_l, temp1_r, 425cabdff1aSopenharmony_ci a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r); 426cabdff1aSopenharmony_ci MUL2(temp4_r, w4, temp4_l, w4, temp4_r, temp4_l); 427cabdff1aSopenharmony_ci MUL2(temp7_r, w2, temp7_l, w2, temp6_r, temp6_l); 428cabdff1aSopenharmony_ci MUL2(temp7_r, w6, temp7_l, w6, temp5_r, temp5_l); 429cabdff1aSopenharmony_ci ADD2(a0_r, temp4_r, a0_l, temp4_l, a0_r, a0_l); 430cabdff1aSopenharmony_ci SUB2(a1_r, temp4_r, a1_l, temp4_l, a1_r, a1_l); 431cabdff1aSopenharmony_ci SUB2(a2_r, temp4_r, a2_l, temp4_l, 
a2_r, a2_l); 432cabdff1aSopenharmony_ci ADD2(a3_r, temp4_r, a3_l, temp4_l, a3_r, a3_l); 433cabdff1aSopenharmony_ci ADD2(a0_r, temp5_r, a0_l, temp5_l, a0_r, a0_l); 434cabdff1aSopenharmony_ci SUB2(a1_r, temp6_r, a1_l, temp6_l, a1_r, a1_l); 435cabdff1aSopenharmony_ci ADD2(a2_r, temp6_r, a2_l, temp6_l, a2_r, a2_l); 436cabdff1aSopenharmony_ci SUB2(a3_r, temp5_r, a3_l, temp5_l, a3_r, a3_l); 437cabdff1aSopenharmony_ci BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l, 438cabdff1aSopenharmony_ci b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r, 439cabdff1aSopenharmony_ci temp0_r, temp0_l, temp1_r, temp1_l, 440cabdff1aSopenharmony_ci temp2_r, temp2_l, temp3_r, temp3_l, 441cabdff1aSopenharmony_ci a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r); 442cabdff1aSopenharmony_ci SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 11); 443cabdff1aSopenharmony_ci SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 11); 444cabdff1aSopenharmony_ci PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r, 445cabdff1aSopenharmony_ci temp2_l, temp2_r, temp3_l, temp3_r, 446cabdff1aSopenharmony_ci temp0_r, temp1_r, temp2_r, temp3_r); 447cabdff1aSopenharmony_ci in0 = (v8i16) __msa_bmnz_v((v16u8) temp0_r, (v16u8) temp, 448cabdff1aSopenharmony_ci (v16u8) select_vec); 449cabdff1aSopenharmony_ci in1 = (v8i16) __msa_bmnz_v((v16u8) temp1_r, (v16u8) temp, 450cabdff1aSopenharmony_ci (v16u8) select_vec); 451cabdff1aSopenharmony_ci in2 = (v8i16) __msa_bmnz_v((v16u8) temp2_r, (v16u8) temp, 452cabdff1aSopenharmony_ci (v16u8) select_vec); 453cabdff1aSopenharmony_ci in3 = (v8i16) __msa_bmnz_v((v16u8) temp3_r, (v16u8) temp, 454cabdff1aSopenharmony_ci (v16u8) select_vec); 455cabdff1aSopenharmony_ci SRA_4V(a3_r, a3_l, a2_r, a2_l, 11); 456cabdff1aSopenharmony_ci SRA_4V(a1_r, a1_l, a0_r, a0_l, 11); 457cabdff1aSopenharmony_ci PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r, 458cabdff1aSopenharmony_ci a0_r, a1_r, a2_r, a3_r); 459cabdff1aSopenharmony_ci in4 = (v8i16) __msa_bmnz_v((v16u8) a3_r, (v16u8) temp, (v16u8) 
select_vec); 460cabdff1aSopenharmony_ci in5 = (v8i16) __msa_bmnz_v((v16u8) a2_r, (v16u8) temp, (v16u8) select_vec); 461cabdff1aSopenharmony_ci in6 = (v8i16) __msa_bmnz_v((v16u8) a1_r, (v16u8) temp, (v16u8) select_vec); 462cabdff1aSopenharmony_ci in7 = (v8i16) __msa_bmnz_v((v16u8) a0_r, (v16u8) temp, (v16u8) select_vec); 463cabdff1aSopenharmony_ci TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, 464cabdff1aSopenharmony_ci in0, in1, in2, in3, in4, in5, in6, in7); 465cabdff1aSopenharmony_ci 466cabdff1aSopenharmony_ci UNPCK_SH_SW(in0, a0_r, a0_l); 467cabdff1aSopenharmony_ci UNPCK_SH_SW(in2, temp3_r, temp3_l); 468cabdff1aSopenharmony_ci MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l); 469cabdff1aSopenharmony_ci ADD2(a0_r, const_val1, a0_l, const_val1, temp0_r, temp0_l); 470cabdff1aSopenharmony_ci MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l); 471cabdff1aSopenharmony_ci MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l); 472cabdff1aSopenharmony_ci BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l, 473cabdff1aSopenharmony_ci temp2_l, temp2_r, temp1_l, temp1_r, 474cabdff1aSopenharmony_ci a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r); 475cabdff1aSopenharmony_ci UNPCK_SH_SW(in4, temp0_r, temp0_l); 476cabdff1aSopenharmony_ci UNPCK_SH_SW(in6, temp3_r, temp3_l); 477cabdff1aSopenharmony_ci MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l); 478cabdff1aSopenharmony_ci MUL2(w2, temp3_r, w2, temp3_l, temp2_r, temp2_l); 479cabdff1aSopenharmony_ci MUL2(w6, temp3_r, w6, temp3_l, temp1_r, temp1_l); 480cabdff1aSopenharmony_ci ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l); 481cabdff1aSopenharmony_ci SUB2(a1_r, temp0_r, a1_l, temp0_l, a1_r, a1_l); 482cabdff1aSopenharmony_ci SUB2(a2_r, temp0_r, a2_l, temp0_l, a2_r, a2_l); 483cabdff1aSopenharmony_ci ADD2(a3_r, temp0_r, a3_l, temp0_l, a3_r, a3_l); 484cabdff1aSopenharmony_ci ADD2(a0_r, temp1_r, a0_l, temp1_l, a0_r, a0_l); 485cabdff1aSopenharmony_ci SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l); 486cabdff1aSopenharmony_ci ADD2(a2_r, temp2_r, 
a2_l, temp2_l, a2_r, a2_l); 487cabdff1aSopenharmony_ci SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l); 488cabdff1aSopenharmony_ci ILVRL_H2_SW(in1, in3, b3_r, b3_l); 489cabdff1aSopenharmony_ci ILVRL_H2_SW(in5, in7, temp0_r, temp0_l); 490cabdff1aSopenharmony_ci DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3, 491cabdff1aSopenharmony_ci b0_r, b1_r, b2_r, b3_r); 492cabdff1aSopenharmony_ci DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3, 493cabdff1aSopenharmony_ci b0_l, b1_l, b2_l, b3_l); 494cabdff1aSopenharmony_ci DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r, 495cabdff1aSopenharmony_ci const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r); 496cabdff1aSopenharmony_ci DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l, 497cabdff1aSopenharmony_ci const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l); 498cabdff1aSopenharmony_ci BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l, 499cabdff1aSopenharmony_ci b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r, 500cabdff1aSopenharmony_ci temp0_r, temp0_l, temp1_r, temp1_l, 501cabdff1aSopenharmony_ci temp2_r, temp2_l, temp3_r, temp3_l, 502cabdff1aSopenharmony_ci a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r); 503cabdff1aSopenharmony_ci SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 20); 504cabdff1aSopenharmony_ci SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 20); 505cabdff1aSopenharmony_ci LD_SH4(dst, dst_stride, in0, in1, in2, in3); 506cabdff1aSopenharmony_ci PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r, 507cabdff1aSopenharmony_ci temp3_l, temp3_r, temp0_r, temp1_r, temp2_r, temp3_r); 508cabdff1aSopenharmony_ci ILVR_B4_SW(zero, in0, zero, in1, zero, in2, zero, in3, 509cabdff1aSopenharmony_ci temp0_l, temp1_l, temp2_l, temp3_l); 510cabdff1aSopenharmony_ci in0 = (v8i16) (temp0_r) + (v8i16) (temp0_l); 511cabdff1aSopenharmony_ci in1 = (v8i16) (temp1_r) + (v8i16) (temp1_l); 512cabdff1aSopenharmony_ci in2 = (v8i16) (temp2_r) + (v8i16) (temp2_l); 513cabdff1aSopenharmony_ci 
in3 = (v8i16) (temp3_r) + (v8i16) (temp3_l); 514cabdff1aSopenharmony_ci CLIP_SH4_0_255(in0, in1, in2, in3); 515cabdff1aSopenharmony_ci PCKEV_B4_SH(in0, in0, in1, in1, in2, in2, in3, in3, 516cabdff1aSopenharmony_ci in0, in1, in2, in3); 517cabdff1aSopenharmony_ci tmp0 = __msa_copy_u_d((v2i64) in0, 1); 518cabdff1aSopenharmony_ci tmp1 = __msa_copy_u_d((v2i64) in1, 1); 519cabdff1aSopenharmony_ci tmp2 = __msa_copy_u_d((v2i64) in2, 1); 520cabdff1aSopenharmony_ci tmp3 = __msa_copy_u_d((v2i64) in3, 1); 521cabdff1aSopenharmony_ci SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); 522cabdff1aSopenharmony_ci 523cabdff1aSopenharmony_ci SRA_4V(a3_r, a3_l, a2_r, a2_l, 20); 524cabdff1aSopenharmony_ci SRA_4V(a1_r, a1_l, a0_r, a0_l, 20); 525cabdff1aSopenharmony_ci LD_SH4(dst + 4 * dst_stride, dst_stride, in4, in5, in6, in7); 526cabdff1aSopenharmony_ci PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r, 527cabdff1aSopenharmony_ci a0_r, a1_r, a2_r, a3_r); 528cabdff1aSopenharmony_ci ILVR_B4_SW(zero, in4, zero, in5, zero, in6, zero, in7, 529cabdff1aSopenharmony_ci a3_l, a2_l, a1_l, a0_l); 530cabdff1aSopenharmony_ci in4 = (v8i16) (a3_r) + (v8i16) (a3_l); 531cabdff1aSopenharmony_ci in5 = (v8i16) (a2_r) + (v8i16) (a2_l); 532cabdff1aSopenharmony_ci in6 = (v8i16) (a1_r) + (v8i16) (a1_l); 533cabdff1aSopenharmony_ci in7 = (v8i16) (a0_r) + (v8i16) (a0_l); 534cabdff1aSopenharmony_ci CLIP_SH4_0_255(in4, in5, in6, in7); 535cabdff1aSopenharmony_ci PCKEV_B4_SH(in4, in4, in5, in5, in6, in6, in7, in7, 536cabdff1aSopenharmony_ci in4, in5, in6, in7); 537cabdff1aSopenharmony_ci tmp0 = __msa_copy_u_d((v2i64) in4, 1); 538cabdff1aSopenharmony_ci tmp1 = __msa_copy_u_d((v2i64) in5, 1); 539cabdff1aSopenharmony_ci tmp2 = __msa_copy_u_d((v2i64) in6, 1); 540cabdff1aSopenharmony_ci tmp3 = __msa_copy_u_d((v2i64) in7, 1); 541cabdff1aSopenharmony_ci SD4(tmp0, tmp1, tmp2, tmp3, dst + 4 * dst_stride, dst_stride); 542cabdff1aSopenharmony_ci} 543cabdff1aSopenharmony_ci 544cabdff1aSopenharmony_civoid 
ff_simple_idct_msa(int16_t *block)
{
    /*
     * Exported entry point for the plain MSA simple IDCT.  All work is
     * delegated to the file-local simple_idct_msa(), which takes the
     * same coefficient pointer.
     * NOTE(review): presumably the result is written back into `block`;
     * confirm against the helper's body.
     */
    simple_idct_msa(block);
}

/**
 * Exported "put" variant of the MSA simple IDCT.
 *
 * Forwards its arguments unchanged to the file-local
 * simple_idct_put_msa().
 *
 * @param dst        destination pixel pointer, passed through as-is
 * @param dst_stride distance between destination rows, passed through as-is
 * @param block      coefficient block, passed through as-is
 *
 * NOTE(review): judging by the name, the helper overwrites dst with the
 * transform output; verify against the helper's body.
 */
void
ff_simple_idct_put_msa(uint8_t *dst, ptrdiff_t dst_stride, int16_t *block)
{
    simple_idct_put_msa(dst, dst_stride, block);
}

/**
 * Exported "add" variant of the MSA simple IDCT.
 *
 * Forwards its arguments unchanged to the file-local
 * simple_idct_add_msa().  Going by the macro names used at the end of that
 * helper, it loads the existing rows from dst, adds the transform output
 * to them, clips the sums to 0..255 and stores eight rows back
 * (rows 0-3 at dst, rows 4-7 at dst + 4 * dst_stride).
 *
 * @param dst        destination pixel pointer, read and written by the helper
 * @param dst_stride distance between destination rows
 * @param block      coefficient block, passed through as-is
 */
void
ff_simple_idct_add_msa(uint8_t *dst, ptrdiff_t dst_stride, int16_t *block)
{
    simple_idct_add_msa(dst, dst_stride, block);
}